Integral Image or Summed Area Table of 2D matrix using CUDA C

Question

Your main problem is a wrong memory usage and storage. With your code you also corrupted the heap! I rechanged your code by using row-major ordering, as it's usually used in c/c++.

Your first error occurs when you write the inputs into host memory matrix_a[r*M+c]. Because r range is from 0..M(3) and c range is from 0..N(2) the maximum index is 2*3+1=7. But your matrix only have 6 elements - maximum index is 5! Therefore I rechanged all matrix accesses.

With that changes I have to fit your grid setup, too. Now it's dim3 grid(N,M);.

If it's uncertain for you what a variable represents or how to use it, use good representing names for them, as you did it in the c reference code!

With that changes your code work for me. Be aware, the way of input the matrix has changed, too!

Above the changed complete code: Kernel function:

__global__ void image_integral(int *a, int*b, int rowsTotal,int colsTotal)
{
    // Thread Ids equal to block Ids because the each blocks contains one thread only.
    int col = blockIdx.x;
    int row = blockIdx.y;
    int temp=0;

    if(col < colsTotal && row < rowsTotal)
    {
        // The first loop iterates from zero to the Y index of the thread which represents the corresponding element of the output/input array.  
        for(int r=0;r<=row;r++)
        {
            // The second loop iterates from zero to the X index of the thread which represents the corresponding element of the output/input array  
            for(int c=0; c<=col; c++)
            {
                temp = temp+a[r*colsTotal+c];
            }
        }
    }

    //Transfer the final result to the output array
    b[row*colsTotal+col]=temp;
}

The host implementation:

void main()
{
    //M is number of rows
    //N is number of columns

    int M=3,N=2, m_e=0;
    int total_e=M*N;
    int widthstep=total_e*sizeof(int);

    int * matrix_a= (int *)malloc(widthstep);
    int * matrix_b= (int *)malloc(widthstep);

    cout<<"Enter elements for "<< M<<"x"<<N<<" matrix";

    for(int r=0;r<M;r++)
    {
        for(int c=0; c<N;c++)
        {
            cout<<"Enter Matrix element [ "<<r<<","<<c<<"]";
            cin>>m_e;
            matrix_a[r*N+c]=m_e;
            matrix_b[r*N+c]=0;
        }
    }

    int * d_matrix_a, * d_matrix_b;

    cout<<"Input:"<<endl;

    for(int r=0;r<M;r++)
    {
        for(int c=0; c<N;c++)
        {
            cout << matrix_a[r*N+c]<<" ";
        }
        cout << endl;
    }

    cout<<endl;

    cudaMalloc(&d_matrix_a,widthstep);
    cudaMalloc(&d_matrix_b,widthstep);

    cudaMemcpy(d_matrix_a,matrix_a,widthstep,cudaMemcpyHostToDevice);
    cudaMemcpy(d_matrix_b,matrix_b,widthstep,cudaMemcpyHostToDevice);

    //Creating a grid where the number of blocks are equal to the number of pixels or input matrix elements.

    //Each block contains only one thread.

    dim3 grid(N,M);

    image_integral<<<grid,1>>>(d_matrix_a, d_matrix_b,M,N);

    cudaThreadSynchronize();

    cudaMemcpy(matrix_b,d_matrix_b,widthstep,cudaMemcpyDeviceToHost);

    cout<<"The Summed Area table is: "<<endl;

    for(int r=0;r<M;r++)
    {
        for(int c=0; c<N;c++)
        {
            cout << matrix_b[r*N+c]<<" ";
        }
        cout << endl;
    }

    system("pause");

    cudaFree(d_matrix_a);
    cudaFree(d_matrix_b);
    free(matrix_a);
    free(matrix_b);
}

Integral Image or Summed Area Table of 2D matrix using CUDA C

Basic Algorithm for Integral Image/Summed Area Table: