2D array in the Kernel CUDA

Question

Another approach to 2D arrays is to treat them as nothing more than an arrangement of elements in linear memory. The following code demonstrates this approach using row-major order, written in a more general way and with proper error checking.

 #include<stdio.h>

 #define NUM_ROWS 2
 #define NUM_COLS 2

 /**
  * Adds 2 to one element of a flat, row-major int buffer per thread.
  *
  * Launch layout: threadIdx.x is used as the row index and threadIdx.y as the
  * column index; blockIdx is not consulted, so every block touches the same
  * elements. No bounds check is performed: the element written is
  * da[threadIdx.x * iNumCol + threadIdx.y].
  *
  * @param da       device pointer to the buffer, updated in place
  * @param iNumCol  number of columns per row (the row stride, in elements)
  */
 __global__ void add2(int* da, int iNumCol)
{
    const int row = threadIdx.x;
    const int col = threadIdx.y;

    // Row-major linearisation of (row, col).
    const int offset = row * iNumCol + col;

    da[offset] = da[offset] + 2;
}

// Prints a NUM_ROWS x NUM_COLS host matrix stored in row-major order:
// each value is followed by a space, each row by a newline.
static void printMatrix(const int* m)
{
    for (int i = 0; i < NUM_ROWS; i++)
    {
        for (int j = 0; j < NUM_COLS; j++)
        {
            printf("%d ", m[(i * NUM_COLS) + j]);
        }
        printf("\n");
    }
}

/**
 * Host driver: zero-fills a NUM_ROWS x NUM_COLS matrix, copies it to the
 * device, runs add2 with one thread per element, copies the result back and
 * prints the matrix before and after the kernel.
 *
 * Returns 0 on success and 1 on any CUDA failure. The device buffer is
 * released on every exit path once it has been allocated.
 */
int main()
{
    int             ha[NUM_ROWS][NUM_COLS] ;
    int             hb[NUM_ROWS][NUM_COLS] ;
    int             *da = NULL ;
    // Byte count kept as size_t: that is what cudaMalloc/cudaMemcpy take.
    const size_t    iSize = (size_t)NUM_ROWS * NUM_COLS * sizeof(int) ;
    cudaError_t     cuError = cudaSuccess ;
    dim3            dimGrid (1,1,1) ;
    // One thread per element: x spans rows, y spans columns (matches add2).
    dim3            dimBlock (NUM_ROWS, NUM_COLS, 1) ;

    for (int i = 0; i < NUM_ROWS; i++)
    {
        for (int j = 0; j < NUM_COLS; j++)
        {
            ha[i][j] = 0;
        }
    }

    cuError = cudaMalloc((void**)&da, iSize) ;
    if (cudaSuccess != cuError)
    {
        printf ("Failed to allocate memory\n") ;
        return 1 ;
    }

    cuError = cudaMemcpy(da, ha, iSize, cudaMemcpyHostToDevice);
    if (cudaSuccess != cuError)
    {
        cudaFree (da) ;
        printf ("Failed in Memcpy 1\n") ;
        return 1 ;
    }

    printf("Before kernel\n");
    printMatrix(&ha[0][0]);
    printf("\n");

    add2<<<dimGrid, dimBlock>>>(da, NUM_COLS);

    // Launch-configuration errors are reported here, not by the launch itself.
    cuError = cudaGetLastError () ;
    if (cudaSuccess != cuError)
    {
        cudaFree (da) ;   // previously leaked on this path
        printf ("Failed in kernel launch and reason is %s\n", cudaGetErrorString(cuError)) ;
        return 1 ;
    }

    // Faults raised while the kernel runs are asynchronous; wait for the
    // kernel so they are reported here with a reason rather than being
    // misattributed to the copy below.
    cuError = cudaDeviceSynchronize () ;
    if (cudaSuccess != cuError)
    {
        cudaFree (da) ;
        printf ("Failed in kernel execution and reason is %s\n", cudaGetErrorString(cuError)) ;
        return 1 ;
    }

    cuError = cudaMemcpy(hb, da, iSize, cudaMemcpyDeviceToHost);
    if (cudaSuccess != cuError)
    {
        cudaFree (da) ;
        printf ("Failed in Memcpy 2\n") ;
        return 1 ;
    }

    printf("After kernel\n");
    printMatrix(&hb[0][0]);

    cudaFree (da) ;

    return 0;
}