문제

I ported a Java GC test program to C++ (see the code below) as well as Python. The Java and Python performance is much greater than C++ and I was thinking this was due to all the calls to new that have to be done to create the strings each time. I've tried using Boost's fast_pool_allocator but that actually worsened performance from 700ms to 1200ms. Am I using the allocator wrong, or is there something else I should be doing?

EDIT: Compiled with g++ -O3 -march=native --std=c++11 garbage.cpp -lboost_system. g++ is version 4.8.1 One iteration takes in Python is about 300ms and with Java about 50ms. std::allocator gives about 700ms and boost::fast_pool_allocator gives about 1200ms.

#include <string>
#include <vector>
#include <chrono>
#include <list>
#include <iostream>
#include <boost/pool/pool_alloc.hpp>
#include <memory>
//#include <gc/gc_allocator.h>


using namespace std;
#include <sstream>
typedef boost::fast_pool_allocator<char> c_allocator;
//typedef std::allocator<char> c_allocator;
typedef basic_string<char, char_traits<char>, c_allocator> pool_string;
namespace patch {
    template <typename T> pool_string to_string(const T& in) {
        std::basic_stringstream<char, char_traits<char>, c_allocator> stm;
        stm << in;
        return stm.str();
    }
}


#include "mytime.hpp"

class Garbage {
public:
    vector<pool_string> outer;
    vector<pool_string> old;
    const int nThreads = 1;
    //static auto time = chrono::high_resolution_clock();

    void go() {
//        outer.resize(1000000);
        //old.reserve(1000000);
        auto tt = mytime::msecs();
        for (int i = 0; i < 10; ++i) {
            if (i % 100 == 0) {
                cout << "DOING AN OLD" << endl;
                doOld();
                tt = mytime::msecs();
            }

            for (int j = 0; j < 1000000/nThreads; ++j)
                outer.push_back(patch::to_string(j));

            outer.clear();
            auto t = mytime::msecs();
            cout << (t - tt) << endl;
            tt = t;
        }
    }

    void doOld() {
        old.clear();
        for (int i = 0; i < 1000000/nThreads; ++i)
            old.push_back(patch::to_string(i));
    }
};

int main() {
    Garbage().go();
}
도움이 되었습니까?

해결책

The problem is you're using a new string stream each time to convert an integer.

Fix it:

namespace patch {
    template <typename T> pool_string to_string(const T& in) {
        return boost::lexical_cast<pool_string>(in);
    }
}

Now the timings are:

DOING AN OLD
0.175462
0.0670085
0.0669926
0.0687969
0.0692518
0.0669318
0.0669196
0.0669187
0.0668962
0.0669185

real    0m0.801s
user    0m0.784s
sys 0m0.016s

See it Live On Coliru

Full code for reference:

#include <boost/pool/pool_alloc.hpp>
#include <chrono>
#include <iostream>
#include <list>
#include <memory>
#include <sstream>
#include <string>
#include <vector>
#include <boost/lexical_cast.hpp>
//#include <gc/gc_allocator.h>

using string = std::string;

namespace patch {
    template <typename T> string to_string(const T& in) {
        return boost::lexical_cast<string>(in);
    }
}

class Timer
{
    typedef std::chrono::high_resolution_clock clock;
    clock::time_point _start;
  public:
    Timer() { reset(); }
    void reset() { _start = now(); }
    double elapsed()
    {
        using namespace std::chrono;
        auto e = now() - _start;
        return duration_cast<nanoseconds>(e).count()*1.0e-9;
    }
    clock::time_point now()
    {
        return clock::now();
    }
};


class Garbage {
    public:
        std::vector<string> outer;
        std::vector<string> old;
        const int nThreads = 1;

        void go() {
            outer.resize(1000000);
            //old.reserve(1000000);
            Timer timer;

            for (int i = 0; i < 10; ++i) {
                if (i % 100 == 0) {
                    std::cout << "DOING AN OLD" << std::endl;
                    doOld();
                }

                for (int j = 0; j < 1000000/nThreads; ++j)
                    outer.push_back(patch::to_string(j));

                outer.clear();
                std::cout << timer.elapsed() << std::endl;
                timer.reset();
            }
        }

        void doOld() {
            old.clear();
            for (int i = 0; i < 1000000/nThreads; ++i)
                old.push_back(patch::to_string(i));
        }
};

int main() {
    Garbage().go();
}

다른 팁

Since I don't use boost on my machine, I simplified the code to use standard C++11 to_string (thus accidentally "fixing" the problem sehe found), and got this:

#include <string>
#include <vector>
#include <chrono>
#include <list>
#include <iostream>
#include <memory>
//#include <gc/gc_allocator.h>
#include <sstream>
using namespace std;


class Timer
{
    typedef std::chrono::high_resolution_clock clock;
    clock::time_point _start;
    public:
    Timer() { reset(); }
    void reset() { _start = now(); }
    double elapsed()
    {
        using namespace std::chrono;
        auto e = now() - _start;
        return duration_cast<nanoseconds>(e).count()*1.0e-9;
    }
    clock::time_point now()
    {
        return clock::now();
    }
};


class Garbage {
public:
    vector<string> outer;
    vector<string> old;
    const int nThreads = 1;
Timer timer;

    void go() {
//        outer.resize(1000000);
        //old.reserve(1000000);
        for (int i = 0; i < 10; ++i) {
            if (i % 100 == 0) {
                cout << "DOING AN OLD" << endl;
                doOld();
            }

            for (int j = 0; j < 1000000/nThreads; ++j)
                outer.push_back(to_string(j));

            outer.clear();
            cout << timer.elapsed() << endl;
            timer.reset();
        }
    }

    void doOld() {
        old.clear();
        for (int i = 0; i < 1000000/nThreads; ++i)
            old.push_back(to_string(i));
    }
};

int main() {
    Garbage().go();
}

Compiling with:

$ g++ -O3 -std=c++11 gc.cpp
$ ./a.out
DOING AN OLD
0.414637
0.189082
0.189143
0.186336
0.184449
0.18504
0.186302
0.186055
0.183123
0.186835

clang 3.5 build with source from Friday 18th of April 2014 gives similar results with the same compiler options.

My processor is a AMD Phenom(tm) II X4 965, running at 3.6GHz (if I remember right).

라이센스 : CC-BY-SA ~와 함께 속성
제휴하지 않습니다 StackOverflow
scroll top