Question

Dynamic memory allocation with malloc()/calloc() does not seem to work properly when used with CUDA.

To check, I wrote the following code using calloc(). The array seems to be allocated with the required memory and I can also assign values to it. But I see only garbage values when I print the matrix elements from the kernel. I thought it could be a problem with cudaMemcpy(), but if I declare the array as A[5][5] instead of **A, the code works perfectly.

Also, using memset() leads to a 'core dumped' error.

Could anyone help me get malloc()/calloc() working here without errors?

#include<stdio.h>

__global__ void threads(int* dA)
{
 int gi=threadIdx.x+(blockIdx.x*blockDim.x);
 int gj=threadIdx.y+(blockIdx.y*blockDim.y);

 printf("global Id in X= %d, in Y =%d, E= %d\n", gi,gj,dA[gi*5+gj]);
}

int main(int argc, char** argv)
{
 int **A, *dA;
 int R=5, C=4;
 int size=R*C*sizeof(int);

 A=(int **)calloc(R, sizeof(int*));

 for(int i=0; i<R; i++)
    A[i]=(int *)calloc(C, sizeof(int));

// memset(A, 0, size);

 for(int i=0; i<R; i++)
   {
   for(int j=0; j<C; j++)
      A[i][j]=i*C+j;
   }

printf(" \n Before \n");
for(int i=0; i<R; i++)
   {
    for(int j=0; j<C; j++)
        printf("%d ",A[i][j]);
    printf("\n");
   }

cudaMalloc((int**) &dA, size);
cudaMemcpy(dA, A, size, cudaMemcpyHostToDevice);

dim3 nblocks(R,C);
dim3 nthreads(1);

threads<<<nblocks, nthreads>>>(dA);
cudaDeviceSynchronize();

cudaFree(dA);
free(A);
return 0;
}

Solution

The problem with your code isn't related to the use of malloc and calloc, which are host functions. The problem is that you are not correctly dealing with double pointers and how they are passed to a CUDA kernel: A is an array of host row pointers, so cudaMemcpy(dA, A, size, cudaMemcpyHostToDevice) copies pointer values (and whatever memory follows them) to the device rather than the matrix elements, which is why the kernel prints garbage. As pointed out by Robert Crovella, proper error checking would have given you better insight into what was wrong with your implementation.

Below is a working version of your program. It is essentially an application of the answer provided by talonmies in cuda 2D array problem.

#include<stdio.h>
#include<conio.h>

inline void GPUassert(cudaError_t code, const char *file, int line, bool Abort=true)
{
    if (code != cudaSuccess) {
        fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code),file,line);
        if (Abort) exit(code);
    }       
}

#define GPUerrchk(ans) { GPUassert((ans), __FILE__, __LINE__); }

__global__ void threads(int* dA[]) {

    int gi=blockIdx.x;
    int gj=blockIdx.y;

    printf("global Id in X= %i, in Y =%i, E= %i\n", gi, gj, dA[gi][gj]);

}

int main(int argc, char** argv)
{

    int **A;
    int R=5, C=4;

    A=(int**)calloc(R,sizeof(int*));
    for(int i=0; i<R; i++) A[i]=(int*)calloc(C,sizeof(int));
    for(int i=0; i<R; i++) for(int j=0; j<C; j++) A[i][j]=i*C+j;

    printf("Before transfer \n");
    for(int i=0; i<R; i++) { for(int j=0; j<C; j++) { printf("%d ",A[i][j]); } printf("\n"); }
    printf("\n");

    // --- Create an array of R pointers on the host
    int** h_A = (int**)malloc(R*sizeof(int*));
    for(int i=0; i<R;i++){
        // --- For each array pointer, allocate space for C ints on the device
        GPUerrchk(cudaMalloc((void**)&h_A[i], C*sizeof(int)));
        // --- Copy the rows of A from host to device at the address determined by h_A[i]
        GPUerrchk(cudaMemcpy(h_A[i], &A[i][0], C*sizeof(int), cudaMemcpyHostToDevice));
    }

    // --- Create an array of R pointers on the device
    int **d_A; GPUerrchk(cudaMalloc((void**)&d_A, R*sizeof(int*)));
    // --- Copy the addresses of the rows of the device matrix from host to device
    GPUerrchk(cudaMemcpy(d_A, h_A, R*sizeof(int*), cudaMemcpyHostToDevice));

    dim3 nblocks(R,C);
    dim3 nthreads(1);

    printf("After transfer \n");
    threads<<<nblocks, nthreads>>>(d_A);
    GPUerrchk(cudaPeekAtLastError());

    cudaDeviceSynchronize();

    getch();

    return 0;

}

As also pointed out in cuda 2D array problem, it is generally better to flatten the 2D array to a 1D array and avoid this cumbersome pointer handling altogether; a sketch of that approach is shown below.
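For reference, here is a minimal sketch of that flattened version. It reuses the GPUerrchk pattern from above; the kernel and variable names mirror the original program but are only illustrative, and the kernel now also receives the column count so it can compute the row-major index itself.

#include<stdio.h>
#include<stdlib.h>

inline void GPUassert(cudaError_t code, const char *file, int line, bool Abort=true)
{
    if (code != cudaSuccess) {
        fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
        if (Abort) exit(code);
    }
}

#define GPUerrchk(ans) { GPUassert((ans), __FILE__, __LINE__); }

// --- The matrix lives in one contiguous, row-major block,
// --- so the kernel simply indexes it as row*C + column.
__global__ void threads(const int* dA, int C) {

    int gi = blockIdx.x;
    int gj = blockIdx.y;

    printf("global Id in X= %i, in Y =%i, E= %i\n", gi, gj, dA[gi*C+gj]);

}

int main(int argc, char** argv)
{
    int R=5, C=4;
    int size=R*C*sizeof(int);

    // --- One contiguous host allocation replaces the R separate row allocations
    int *A = (int*)calloc(R*C, sizeof(int));
    for(int i=0; i<R; i++) for(int j=0; j<C; j++) A[i*C+j]=i*C+j;

    // --- One device allocation and a single cudaMemcpy move the whole matrix
    int *d_A; GPUerrchk(cudaMalloc((void**)&d_A, size));
    GPUerrchk(cudaMemcpy(d_A, A, size, cudaMemcpyHostToDevice));

    dim3 nblocks(R,C);
    dim3 nthreads(1);

    threads<<<nblocks, nthreads>>>(d_A, C);
    GPUerrchk(cudaPeekAtLastError());
    GPUerrchk(cudaDeviceSynchronize());

    GPUerrchk(cudaFree(d_A));
    free(A);

    return 0;
}

With this layout there is no per-row loop of cudaMalloc/cudaMemcpy calls and no array of device pointers to manage, which is exactly the bookkeeping the double-pointer version required.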

Licensed under: CC-BY-SA with attribution
Not affiliated with StackOverflow