As I mentioned in the comments, your MATLAB code is missing a call to the sum
function (otherwise the two codes are computing different things!). So it ought to be:
MATLAB
A = rand(1000,40000);
B = rand(1000,1);
tic
count = sum(bsxfun(@ne, A, B));
toc
On my machine I get:
Elapsed time is 0.036931 seconds.
Remember that the above statement is vectorized (think SIMD parallelization). MATLAB might also automatically run this multithreaded if the size is large enough.
Here is my version of the code in C++. I'm using simple classes to create a vector/matrix interface. Note that the underlying data is basically stored as a 1D array with column-major order similar to MATLAB.
C++
#include <iostream>
#include <cstdlib> // rand
#include <ctime> // time
#include <sys/time.h> // gettimeofday
/**
 * Simple stopwatch built on gettimeofday().
 * Usage: start(), then stop(), then read elapsedTime() in milliseconds.
 */
class Timer
{
private:
    timeval t1, t2;  // timestamps captured by start() and stop()
public:
    Timer() {}
    ~Timer() {}
    void start() { gettimeofday(&t1, NULL); }
    void stop() { gettimeofday(&t2, NULL); }
    // Elapsed time between start() and stop(), in milliseconds.
    // BUG FIX: tv_usec is an integral type, so dividing by 1000 (int)
    // performed integer division and truncated the sub-millisecond part;
    // divide by 1000.0 to keep fractional milliseconds.
    double elapsedTime() { return (t2.tv_sec - t1.tv_sec)*1000.0 + (t2.tv_usec - t1.tv_usec)/1000.0; }
};
/**
 * Minimal fixed-size 1-D array that owns its heap storage.
 * Zero-based element access via operator()(i); size fixed at construction.
 */
template<typename T>
class Vector
{
private:
    T *data;           // owned heap buffer of length num
    const size_t num;  // element count, fixed at construction

    // Non-copyable (Rule of Three): the class owns a raw buffer, so the
    // compiler-generated copy would be shallow and cause a double delete[].
    // Declared private and left undefined (pre-C++11 idiom, matching the
    // file's C++98 style).
    Vector(const Vector&);            // intentionally not implemented
    Vector& operator=(const Vector&); // intentionally not implemented
public:
    // explicit: prevent accidental implicit conversion from an integer
    explicit Vector(const size_t num) : data(new T[num]), num(num) {}
    ~Vector() { delete[] data; }
    inline T& operator() (const size_t i) { return data[i]; }
    inline const T& operator() (const size_t i) const { return data[i]; }
    size_t size() const { return num; }
};
/**
 * Minimal fixed-size 2-D array that owns its heap storage.
 * Data is stored as one contiguous buffer in column-major order
 * (element (r,c) lives at data[c*nrows + r]), matching MATLAB layout.
 */
template<typename T>
class Matrix
{
private:
    T *data;                    // owned heap buffer of nrows*ncols elements
    const size_t nrows, ncols;  // dimensions, fixed at construction

    // Non-copyable (Rule of Three): the class owns a raw buffer, so the
    // compiler-generated copy would be shallow and cause a double delete[].
    // Declared private and left undefined (pre-C++11 idiom, matching the
    // file's C++98 style).
    Matrix(const Matrix&);            // intentionally not implemented
    Matrix& operator=(const Matrix&); // intentionally not implemented
public:
    Matrix(const size_t nr, const size_t nc) : data(new T[nr * nc]), nrows(nr), ncols(nc) {}
    ~Matrix() { delete[] data; }
    // column-major addressing: column stride is nrows
    inline T& operator() (const size_t r, const size_t c) { return data[c*nrows + r]; }
    inline const T& operator() (const size_t r, const size_t c) const { return data[c*nrows + r]; }
    size_t size1() const { return nrows; }  // number of rows
    size_t size2() const { return ncols; }  // number of columns
};
// Uniform random double in [min, max], derived from a single rand() draw.
inline double rand_double(double min=0.0, double max=1.0)
{
    // scale rand() into the unit interval [0, 1], then map onto [min, max]
    const double unit = static_cast<double>(rand()) / RAND_MAX;
    return (max - min) * unit + min;
}
int main() {
    // Seed the C RNG so each run operates on fresh data.
    srand( static_cast<unsigned int>(time(NULL)) );

    // Problem size matches the MATLAB example: A is m-by-n, B is m-by-1.
    const int m = 1000, n = 40000;
    Matrix<double> A(m,n);
    Vector<double> B(m);

    // Fill B and each row of A with uniform random values in [0, 1].
    for (size_t r = 0; r < A.size1(); ++r) {
        B(r) = rand_double();
        for (size_t c = 0; c < A.size2(); ++c) {
            A(r,c) = rand_double();
        }
    }

    // Time only the kernel; MATLAB equivalent: count = sum(bsxfun(@ne, A, B))
    Timer timer;
    timer.start();
    Vector<double> count(n);
    #pragma omp parallel for
    for (int c = 0; c < n; ++c) {
        // per-column sum of element-wise "not equal" comparisons against B
        double colSum = 0.0;
        for (int r = 0; r < m; ++r) {
            colSum += (A(r,c) != B(r));
        }
        count(c) = colSum;
    }
    timer.stop();

    // Report elapsed time in milliseconds.
    std::cout << "Elapsed time is " << timer.elapsedTime() << " milliseconds." << std::endl;
    return 0;
}
The result:
$ g++ -Wall -O3 test.cpp -o test
$ ./test
Elapsed time is 63 milliseconds.
If I compile and run it with OpenMP support enabled, I get:
$ g++ -Wall -O3 -fopenmp test.cpp -o test_omp
$ ./test_omp
Elapsed time is 16 milliseconds.
Not a bad improvement (almost x4 faster) just by adding a single line to the code (the #pragma omp
directive).
This last one beats the 37 ms I get in MATLAB (R2013b). The code was compiled using GCC 4.8.1 (MinGW-w64 running on Windows 8, Core i7 laptop).
If you really want to push the limits here for the C++ code, you'll have to add vectorization (SSE/AVX intrinsics) in addition to the multithreading achieved with OpenMP.
You might also want to consider using GPGPU programming (CUDA, OpenCL). In MATLAB that's very easy to do:
AA = gpuArray(A);
BB = gpuArray(B);
CC = sum(bsxfun(@ne, AA, BB));
C = gather(CC);
gpuArray(.)
will transfer the matrix to the GPU, after which all operations done on it are performed on the GPU device instead of the CPU. gather(.)
will transfer the array back to the MATLAB workspace. However, the problem here is largely memory-bound, so you will likely not see any improvement (it could possibly even be slower due to the overhead of the data transfer).