Question

It's been two days and I still can't figure out why my CUDA matrix multiplication implementation differs from the results produced in MATLAB.

CUDA kernel: A(200x60000) = W(200x784) * Data(784x60000)

__global__ void CalculateA(Matrix W, Matrix Data, Matrix A)
{
    // One thread per output element; (Row, Col) indexes into A
    int Row = blockIdx.y * blockDim.y + threadIdx.y;
    int Col = blockIdx.x * blockDim.x + threadIdx.x;
    if ((Row < W.row) && (Col < Data.col)) {
        float Cvalue = 0.0f;
        // Dot product of row Row of W with column Col of Data (both row-major)
        for (int i = 0; i < W.col; ++i) {
            Cvalue += W.elements[Row*W.col+i] * Data.elements[i*Data.col+Col];
        }
        A.elements[Row*A.col+Col] = Cvalue;
    }
}

And calling the kernel:

void myFunc(Matrix W1, Matrix data)
{
    Matrix d_W1, d_data, d_a2, a2;
    size_t size;

    // Output matrix: a2 on the host, d_a2 on the device
    a2.row = W1.row;    d_a2.row = a2.row;
    a2.col = data.col;  d_a2.col = a2.col;
    size = a2.col*a2.row*sizeof(float);
    cudaMalloc(&d_a2.elements,size);

    // Copy W1 to the device
    d_W1.row = W1.row;  d_W1.col = W1.col;
    size = W1.col*W1.row*sizeof(float);
    cudaMalloc(&d_W1.elements,size);
    cudaMemcpy(d_W1.elements,W1.elements,size,cudaMemcpyHostToDevice);

    // Copy data to the device
    d_data.col = data.col; d_data.row = data.row;
    size = data.row*data.col*sizeof(float);
    cudaMalloc(&d_data.elements,size);
    cudaMemcpy(d_data.elements,data.elements,size,cudaMemcpyHostToDevice);

    // Oversize the grid so every element of A is covered; the kernel bounds-checks
    dim3 dimGrid(data.col/32 + 1, W1.row/32 + 1, 1);
    dim3 dimBlock(32, 32, 1);

    CalculateA<<<dimGrid, dimBlock>>>(d_W1, d_data, d_a2);
    a2.elements = new float [a2.row*a2.col];
    cudaMemcpy(a2.elements,d_a2.elements,sizeof(float)*a2.row*a2.col,cudaMemcpyDeviceToHost);

    printf("\nA2 first and last member %f - %f\n",a2.elements[0],a2.elements[a2.row*a2.col-1]);
}

The difference in the results is not small: for example, the first and last elements from the CUDA code are 0.011322 and -0.179534, but the MATLAB multiplication gives 0.4280 and 0.0056.

This is how I do it in MATLAB:

>> size(W1)
ans =
   200   784

>> size(data)
ans =
   784   60000

>> z2 = W1*data;

>> size(z2)
ans =
   200   60000

>> z2 = z2(:);

>> z2(1)
ans =
    0.4280

>> z2(200*60000)
ans =
    0.0056
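
Note that the two values being compared are the same matrix entries in both storage orders: z2 = z2(:) flattens column-major, so z2(1) is A(1,1) and z2(200*60000) is A(200,60000), which in the CUDA code's row-major array are a2.elements[0] and a2.elements[200*60000-1]. A quick standalone check of the index mapping (plain C++, illustrative only):

#include <cstdio>

int main(void)
{
    const int R = 200, C = 60000;

    // MATLAB flattens column-major: 1-based z2(k) -> row (k-1)%R, col (k-1)/R.
    // The CUDA code stores A row-major: entry (r,c) lives at elements[r*C + c].
    int k = R*C;                           // last MATLAB linear index
    int r = (k - 1) % R, c = (k - 1) / R;  // -> (R-1, C-1)
    printf("z2(1) maps to elements[0]\n");
    printf("z2(%d) maps to elements[%d]\n", k, r*C + c);  // prints R*C - 1

    return 0;
}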

Solution

There is nothing wrong with the code you posted. If I expand your kernel and function into a complete running example like this:

#include <iostream>

struct Matrix
{
    int row;
    int col;
    float *elements;

    __device__ __host__
    float& operator()(int r, int c) { return elements[r*col + c]; };
};

__global__ void CalculateA(Matrix W, Matrix Data, Matrix A)
{
    int Row = blockIdx.y * blockDim.y + threadIdx.y;
    int Col = blockIdx.x * blockDim.x + threadIdx.x;
    if ((Row < W.row) && (Col < Data.col)) {
        float Cvalue = 0.0;
        for (int i = 0; i < W.col; ++i) {
            Cvalue += W.elements[Row*W.col+i] * Data.elements[i*Data.col+Col];
        }
        A.elements[Row*A.col+Col] = Cvalue;
    }
}

void myFunc(Matrix W1, Matrix data)
{
    Matrix d_W1, d_data, d_a2, a2;
    size_t size;

    a2.row = W1.row;    d_a2.row = a2.row;
    a2.col = data.col;  d_a2.col = a2.col;
    size = a2.col*a2.row*sizeof(float);
    cudaMalloc(&d_a2.elements,size);

    d_W1.row = W1.row;  d_W1.col = W1.col;
    size = W1.col*W1.row*sizeof(float);
    cudaMalloc(&d_W1.elements,size);
    cudaMemcpy(d_W1.elements,W1.elements,size,cudaMemcpyHostToDevice);

    d_data.col = data.col; d_data.row = data.row;
    size = data.row*data.col*sizeof(float);
    cudaMalloc(&d_data.elements,size);
    cudaMemcpy(d_data.elements,data.elements,size,cudaMemcpyHostToDevice);

    dim3 dimGrid(data.col/32 + 1, W1.row/32 + 1, 1);
    dim3 dimBlock(32, 32, 1);

    CalculateA<<<dimGrid, dimBlock>>>(d_W1, d_data, d_a2);
    a2.elements = new float [a2.row*a2.col];
    cudaMemcpy(a2.elements,d_a2.elements,sizeof(float)*a2.row*a2.col,cudaMemcpyDeviceToHost);

    for(int j=0; j<a2.col; ++j) {
        for(int i=0; i<a2.row; ++i) {
            std::cout << a2(i,j) << " ";
        }
        std::cout << std::endl;
    }
}

int main(void)
{
    float a[6] = { 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f };
    float b[6] = { 0.1f, 0.2f, 0.3f, 0.4f, 0.5f, 0.6f};

    Matrix W1; W1.row=2; W1.col=3; W1.elements = &a[0];
    Matrix Data; Data.row=3; Data.col=2; Data.elements = &b[0];

    myFunc(W1, Data);

    return 0;
}

and run it, I get this:

>nvcc -arch=sm_21 -Xptxas="-v" -m32 matrix.cu
matrix.cu
tmpxft_000014f4_00000000-5_matrix.cudafe1.gpu
tmpxft_000014f4_00000000-10_matrix.cudafe2.gpu
matrix.cu
ptxas : info : 132 bytes gmem, 28 bytes cmem[14]
ptxas : info : Compiling entry function '_Z10CalculateA6MatrixS_S_' for 'sm_21'
ptxas : info : Function properties for _Z10CalculateA6MatrixS_S_
    0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
ptxas : info : Used 14 registers, 68 bytes cmem[0]
tmpxft_000014f4_00000000-5_matrix.cudafe1.cpp
tmpxft_000014f4_00000000-15_matrix.ii

>cuda-memcheck a.exe
========= CUDA-MEMCHECK
2.2 4.9
2.8 6.4
========= ERROR SUMMARY: 0 errors

which is the correct answer for the matrix product, printed assuming column-major ordering (which is the MATLAB convention).
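
Working the small example by hand confirms it: A = W*Data = [1 2 3; 4 5 6] * [0.1 0.2; 0.3 0.4; 0.5 0.6] = [2.2 2.8; 4.9 6.4]. The print loop walks the result one column at a time, so each printed line is one column of A (2.2 4.9, then 2.8 6.4), which is the order in which MATLAB would list the elements of W1*data.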

So if your results don't agree, it is because of something you haven't shown us. One likelihood is that your test problem is so large (and the kernel so inefficient) that, if you are running on a display GPU, your program is hitting the display driver watchdog timer limit and being killed before the kernel finishes. Also note that you have no CUDA API error checking whatsoever, so runtime errors may be stopping your kernel from finishing, or from running at all, and you simply don't notice because nothing checks for them.
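
For completeness, a minimal error-checking sketch that would catch both failure modes (the gpuErrchk/gpuAssert names are a common convention, not part of the CUDA API):

#include <cstdio>
#include <cstdlib>
#include <cuda_runtime.h>

// Minimal CUDA runtime error-checking helper; names are illustrative.
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line)
{
    if (code != cudaSuccess) {
        fprintf(stderr, "GPUassert: %s %s %d\n",
                cudaGetErrorString(code), file, line);
        exit(static_cast<int>(code));
    }
}

// Usage around the launch in myFunc:
//   gpuErrchk(cudaMalloc(&d_a2.elements, size));
//   CalculateA<<<dimGrid, dimBlock>>>(d_W1, d_data, d_a2);
//   gpuErrchk(cudaPeekAtLastError());    // launch/configuration errors
//   gpuErrchk(cudaDeviceSynchronize());  // execution errors, including
//                                        // a watchdog timeout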
