Question

It's been two days and I still can't figure out why my CUDA matrix multiplication implementation differs from the results produced in MATLAB.

CUDA kernel: A(200x60000) = W(200x784) * Data(784x60000)

__global__ void CalculateA(Matrix W, Matrix Data, Matrix A)
{
    // One thread per output element; (Row, Col) indexes into A
    int Row = blockIdx.y * blockDim.y + threadIdx.y;
    int Col = blockIdx.x * blockDim.x + threadIdx.x;
    if ((Row < W.row) && (Col < Data.col)) {
        float Cvalue = 0.0f;
        // Dot product of row Row of W with column Col of Data (both row-major)
        for (int i = 0; i < W.col; ++i) {
            Cvalue += W.elements[Row*W.col+i] * Data.elements[i*Data.col+Col];
        }
        A.elements[Row*A.col+Col] = Cvalue;
    }
}

And calling the kernel:

void myFunc(Matrix W1, Matrix data)
{
    Matrix d_W1, d_data, d_a2, a2;
    size_t size;

    // Output matrix: a2 on the host, d_a2 on the device
    a2.row = W1.row;    d_a2.row = a2.row;
    a2.col = data.col;  d_a2.col = a2.col;
    size = a2.col*a2.row*sizeof(float);
    cudaMalloc(&d_a2.elements,size);

    // Copy W1 to the device
    d_W1.row = W1.row;  d_W1.col = W1.col;
    size = W1.col*W1.row*sizeof(float);
    cudaMalloc(&d_W1.elements,size);
    cudaMemcpy(d_W1.elements,W1.elements,size,cudaMemcpyHostToDevice);

    // Copy data to the device
    d_data.col = data.col; d_data.row = data.row;
    size = data.row*data.col*sizeof(float);
    cudaMalloc(&d_data.elements,size);
    cudaMemcpy(d_data.elements,data.elements,size,cudaMemcpyHostToDevice);

    // Oversize the grid so every element of A is covered; the kernel bounds-checks
    dim3 dimGrid(data.col/32 + 1, W1.row/32 + 1, 1);
    dim3 dimBlock(32, 32, 1);

    CalculateA<<<dimGrid, dimBlock>>>(d_W1, d_data, d_a2);
    a2.elements = new float [a2.row*a2.col];
    cudaMemcpy(a2.elements,d_a2.elements,sizeof(float)*a2.row*a2.col,cudaMemcpyDeviceToHost);

    printf("\nA2 first and last member %f - %f\n",a2.elements[0],a2.elements[a2.row*a2.col-1]);
}

The difference in the results is not small: for example, the first and last elements from the CUDA code are 0.011322 and -0.179534, but the MATLAB multiplication gives 0.4280 and 0.0056.

This is how I do it in MATLAB:

>> size(W1)
ans =
   200   784

>> size(data)
ans =
   784   60000

>> z2 = W1*data;

>> size(z2)
ans =
   200   60000

>> z2 = z2(:);

>> z2(1)
ans =
    0.4280

>> z2(200*60000)
ans =
    0.0056
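
Note that the two values being compared are the same matrix entries in both storage orders: z2 = z2(:) flattens column-major, so z2(1) is A(1,1) and z2(200*60000) is A(200,60000), which in the CUDA code's row-major array are a2.elements[0] and a2.elements[200*60000-1]. A quick standalone check of the index mapping (plain C++, illustrative only):

#include <cstdio>

int main(void)
{
    const int R = 200, C = 60000;

    // MATLAB flattens column-major: 1-based z2(k) -> row (k-1)%R, col (k-1)/R.
    // The CUDA code stores A row-major: entry (r,c) lives at elements[r*C + c].
    int k = R*C;                           // last MATLAB linear index
    int r = (k - 1) % R, c = (k - 1) / R;  // -> (R-1, C-1)
    printf("z2(1) maps to elements[0]\n");
    printf("z2(%d) maps to elements[%d]\n", k, r*C + c);  // prints R*C - 1

    return 0;
}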

Solution

There is nothing wrong with the code you posted. If I expand your kernel and function into a complete running example like this:

#include <iostream>

struct Matrix
{
    int row;
    int col;
    float *elements;

    __device__ __host__
    float& operator()(int r, int c) { return elements[r*col + c]; };
};

__global__ void CalculateA(Matrix W, Matrix Data, Matrix A)
{
    int Row = blockIdx.y * blockDim.y + threadIdx.y;
    int Col = blockIdx.x * blockDim.x + threadIdx.x;
    if ((Row < W.row) && (Col < Data.col)) {
        float Cvalue = 0.0;
        for (int i = 0; i < W.col; ++i) {
            Cvalue += W.elements[Row*W.col+i] * Data.elements[i*Data.col+Col];
        }
        A.elements[Row*A.col+Col] = Cvalue;
    }
}

void myFunc(Matrix W1, Matrix data)
{
    Matrix d_W1, d_data, d_a2, a2;
    size_t size;

    a2.row = W1.row;    d_a2.row = a2.row;
    a2.col = data.col;  d_a2.col = a2.col;
    size = a2.col*a2.row*sizeof(float);
    cudaMalloc(&d_a2.elements,size);

    d_W1.row = W1.row;  d_W1.col = W1.col;
    size = W1.col*W1.row*sizeof(float);
    cudaMalloc(&d_W1.elements,size);
    cudaMemcpy(d_W1.elements,W1.elements,size,cudaMemcpyHostToDevice);

    d_data.col = data.col; d_data.row = data.row;
    size = data.row*data.col*sizeof(float);
    cudaMalloc(&d_data.elements,size);
    cudaMemcpy(d_data.elements,data.elements,size,cudaMemcpyHostToDevice);

    dim3 dimGrid(data.col/32 + 1, W1.row/32 + 1, 1);
    dim3 dimBlock(32, 32, 1);

    CalculateA<<<dimGrid, dimBlock>>>(d_W1, d_data, d_a2);
    a2.elements = new float [a2.row*a2.col];
    cudaMemcpy(a2.elements,d_a2.elements,sizeof(float)*a2.row*a2.col,cudaMemcpyDeviceToHost);

    for(int j=0; j<a2.col; ++j) {
        for(int i=0; i<a2.row; ++i) {
            std::cout << a2(i,j) << " ";
        }
        std::cout << std::endl;
    }
}

int main(void)
{
    float a[6] = { 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f };
    float b[6] = { 0.1f, 0.2f, 0.3f, 0.4f, 0.5f, 0.6f};

    Matrix W1; W1.row=2; W1.col=3; W1.elements = &a[0];
    Matrix Data; Data.row=3; Data.col=2; Data.elements = &b[0];

    myFunc(W1, Data);

    return 0;
}

and run it, I get this:

>nvcc -arch=sm_21 -Xptxas="-v" -m32 matrix.cu
matrix.cu
tmpxft_000014f4_00000000-5_matrix.cudafe1.gpu
tmpxft_000014f4_00000000-10_matrix.cudafe2.gpu
matrix.cu
ptxas : info : 132 bytes gmem, 28 bytes cmem[14]
ptxas : info : Compiling entry function '_Z10CalculateA6MatrixS_S_' for 'sm_21'
ptxas : info : Function properties for _Z10CalculateA6MatrixS_S_
    0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
ptxas : info : Used 14 registers, 68 bytes cmem[0]
tmpxft_000014f4_00000000-5_matrix.cudafe1.cpp
tmpxft_000014f4_00000000-15_matrix.ii

>cuda-memcheck a.exe
========= CUDA-MEMCHECK
2.2 4.9
2.8 6.4
========= ERROR SUMMARY: 0 errors

which is the correct answer for the matrix product, printed assuming column-major ordering (which is the MATLAB convention).
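
Working the small example by hand confirms it: A = W*Data = [1 2 3; 4 5 6] * [0.1 0.2; 0.3 0.4; 0.5 0.6] = [2.2 2.8; 4.9 6.4]. The print loop walks the result one column at a time, so each printed line is one column of A (2.2 4.9, then 2.8 6.4), which is the order in which MATLAB would list the elements of W1*data.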

So if your results don't agree, it is because of something you haven't shown us. One likelihood is that your test problem is so large (and the kernel so inefficient) that, if you are running on a display GPU, your program is hitting the display driver watchdog timer limit and being killed before the kernel finishes. Also note that you have no CUDA API error checking whatsoever, so runtime errors may be stopping your kernel from finishing, or from running at all, and you simply don't notice because nothing checks for them.
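
For completeness, a minimal error-checking sketch that would catch both failure modes (the gpuErrchk/gpuAssert names are a common convention, not part of the CUDA API):

#include <cstdio>
#include <cstdlib>
#include <cuda_runtime.h>

// Minimal CUDA runtime error-checking helper; names are illustrative.
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line)
{
    if (code != cudaSuccess) {
        fprintf(stderr, "GPUassert: %s %s %d\n",
                cudaGetErrorString(code), file, line);
        exit(static_cast<int>(code));
    }
}

// Usage around the launch in myFunc:
//   gpuErrchk(cudaMalloc(&d_a2.elements, size));
//   CalculateA<<<dimGrid, dimBlock>>>(d_W1, d_data, d_a2);
//   gpuErrchk(cudaPeekAtLastError());    // launch/configuration errors
//   gpuErrchk(cudaDeviceSynchronize());  // execution errors, including
//                                        // a watchdog timeout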
