Another approach to 2D arrays is to treat them as nothing but an arrangement of elements in memory. The following code demonstrates this approach using row-major order, written in a more general way and with proper error checking.
#include<stdio.h>
#define NUM_ROWS 2
#define NUM_COLS 2
/*
 * add2 - adds 2, in place, to elements of a row-major 2D int array that is
 * stored in one flat device buffer.
 *
 * da      : device pointer to the flat buffer; element (row, col) lives at
 *           da[row * iNumCol + col].
 * iNumCol : number of columns per row (the row stride, in elements).
 *
 * Launch layout: the x dimension of the grid/block selects the row and the
 * y dimension selects the column. Each thread updates one element.
 *
 * Precondition: the kernel is not told how many rows the buffer holds, so
 * the launch must not create more threads along x than there are rows.
 * Surplus threads along y are filtered out by the column guard below.
 */
__global__ void add2(int* da, int iNumCol)
{
    // Include the block index so that a multi-block launch addresses
    // distinct elements; for a single-block launch blockIdx is 0 and this
    // reduces to the plain thread index.
    int idx = blockIdx.x * blockDim.x + threadIdx.x;   // row
    int idy = blockIdx.y * blockDim.y + threadIdx.y;   // column

    // A column index >= iNumCol would land on an element of a later row
    // (or past the end of the buffer on the last row), so skip it.
    if (idy < iNumCol)
    {
        // Widen before multiplying so the flat offset cannot overflow int.
        da[((size_t)idx * iNumCol) + idy] += 2;
    }
}
/*
 * Demonstrates a 2D array handled as one flat, row-major device buffer:
 * fills a NUM_ROWS x NUM_COLS host matrix with zeros, copies it to the
 * device, runs add2 with one thread per element, copies the result into a
 * second host matrix, and prints the matrix before and after the kernel.
 *
 * Returns 0 on success and 1 if any CUDA call fails. Once the device buffer
 * has been allocated it is released on every exit path.
 */
int main()
{
    int ha[NUM_ROWS][NUM_COLS] ;   // host input matrix
    int hb[NUM_ROWS][NUM_COLS] ;   // host copy of the kernel's result
    int *da = NULL ;               // flat device buffer
    size_t iSize = (size_t)NUM_ROWS * NUM_COLS * sizeof(int) ;
    cudaError_t cuError = cudaSuccess ;
    dim3 dimGrid (1,1,1) ;
    // x spans the rows and y spans the columns, matching add2's indexing.
    dim3 dimBlock (NUM_ROWS, NUM_COLS, 1) ;

    for (int i = 0; i < NUM_ROWS; i++)
    {
        for (int j = 0; j < NUM_COLS; j++)
        {
            ha[i][j] = 0;
        }
    }

    cuError = cudaMalloc((void**)&da, iSize) ;
    if (cudaSuccess != cuError)
    {
        printf ("Failed to allocate memory\n") ;
        return 1 ;
    }

    cuError = cudaMemcpy(da, ha, iSize, cudaMemcpyHostToDevice);
    if (cudaSuccess != cuError)
    {
        cudaFree (da) ;
        printf ("Failed in Memcpy 1\n") ;
        return 1 ;
    }

    printf("Before kernel\n");
    for (int i = 0; i < NUM_ROWS; i++)
    {
        for (int j = 0; j < NUM_COLS; j++)
        {
            printf("%d ",ha[i][j]);
        }
        printf("\n");
    }
    printf("\n");

    add2<<<dimGrid, dimBlock>>>(da, NUM_COLS);

    // Launch-configuration errors are reported here, not by the launch itself.
    cuError = cudaGetLastError () ;
    if (cudaSuccess != cuError)
    {
        cudaFree (da) ;   // do not leak the device buffer on this path
        printf ("Failed in kernel launch and reason is %s\n", cudaGetErrorString(cuError)) ;
        return 1 ;
    }

    // Wait for the kernel so that a fault during execution is reported as a
    // kernel failure instead of being blamed on the copy that follows.
    cuError = cudaDeviceSynchronize () ;
    if (cudaSuccess != cuError)
    {
        cudaFree (da) ;
        printf ("Failed in kernel execution and reason is %s\n", cudaGetErrorString(cuError)) ;
        return 1 ;
    }

    cuError = cudaMemcpy(hb, da, iSize, cudaMemcpyDeviceToHost);
    if (cudaSuccess != cuError)
    {
        cudaFree (da) ;
        printf ("Failed in Memcpy 2\n") ;
        return 1 ;
    }

    printf("After kernel\n");
    for (int i = 0; i < NUM_ROWS; i++)
    {
        for (int j = 0; j < NUM_COLS; j++)
        {
            printf("%d ",hb[i][j]);
        }
        printf("\n");
    }

    cuError = cudaFree (da) ;
    if (cudaSuccess != cuError)
    {
        printf ("Failed to free device memory and reason is %s\n", cudaGetErrorString(cuError)) ;
        return 1 ;
    }
    return 0;
}