Question

Dynamic memory allocation with malloc()/calloc() does not seem to work properly when used with CUDA.

To check, I wrote the following code using calloc(). The array seems to be allocated with the required memory and I can also assign values to it. But I see only garbage values when I print the matrix elements from the kernel. I thought it could be a problem with cudaMemcpy(), but if I declare the array as A[5][5] instead of **A, the code works perfectly.

Also, using memset() leads to a 'core dumped' error.

Could anyone help me get malloc()/calloc() working here without errors?

#include<stdio.h>

__global__ void threads(int* dA)
{
 int gi=threadIdx.x+(blockIdx.x*blockDim.x);
 int gj=threadIdx.y+(blockIdx.y*blockDim.y);

 printf("global Id in X= %d, in Y =%d, E= %d\n", gi,gj,dA[gi*5+gj]);
}

int main(int argc, char** argv)
{
 int **A, *dA;
 int R=5, C=4;
 int size=R*C*sizeof(int);

 A=(int **)calloc(R, sizeof(int*));

 for(int i=0; i<R; i++)
    A[i]=(int *)calloc(C, sizeof(int));

// memset(A, 0, size);

 for(int i=0; i<R; i++)
   {
   for(int j=0; j<C; j++)
      A[i][j]=i*C+j;
   }

printf(" \n Before \n");
for(int i=0; i<R; i++)
   {
    for(int j=0; j<C; j++)
        printf("%d ",A[i][j]);
    printf("\n");
   }

cudaMalloc((int**) &dA, size);
cudaMemcpy(dA, A, size, cudaMemcpyHostToDevice);

dim3 nblocks(R,C);
dim3 nthreads(1);

threads<<<nblocks, nthreads>>>(dA);
cudaDeviceSynchronize();

cudaFree(dA);
free(A);
return 0;
}

Solution

The problem with your code isn't related to the use of malloc and calloc, which are host functions. The problem is that you are not correctly dealing with double pointers and how they are passed to a CUDA kernel: A is an array of host row pointers, so cudaMemcpy(dA, A, size, cudaMemcpyHostToDevice) copies pointer values (and whatever memory follows them) to the device rather than the matrix elements, which is why the kernel prints garbage. As pointed out by Robert Crovella, proper error checking would have given you better insight into what was wrong with your implementation.

Below is a working version of your program. It is essentially an application of the answer provided by talonmies in cuda 2D array problem.

#include<stdio.h>
#include<conio.h>

inline void GPUassert(cudaError_t code, const char *file, int line, bool Abort=true)
{
    if (code != cudaSuccess) {
        fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code),file,line);
        if (Abort) exit(code);
    }       
}

#define GPUerrchk(ans) { GPUassert((ans), __FILE__, __LINE__); }

__global__ void threads(int* dA[]) {

    int gi=blockIdx.x;
    int gj=blockIdx.y;

    printf("global Id in X= %i, in Y =%i, E= %i\n", gi, gj, dA[gi][gj]);

}

int main(int argc, char** argv)
{

    int **A;
    int R=5, C=4;

    A=(int**)calloc(R,sizeof(int*));
    for(int i=0; i<R; i++) A[i]=(int*)calloc(C,sizeof(int));
    for(int i=0; i<R; i++) for(int j=0; j<C; j++) A[i][j]=i*C+j;

    printf("Before transfer \n");
    for(int i=0; i<R; i++) { for(int j=0; j<C; j++) { printf("%d ",A[i][j]); } printf("\n"); }
    printf("\n");

    // --- Create an array of R pointers on the host
    int** h_A = (int**)malloc(R*sizeof(int*));
    for(int i=0; i<R;i++){
        // --- For each array pointer, allocate space for C ints on the device
        GPUerrchk(cudaMalloc((void**)&h_A[i], C*sizeof(int)));
        // --- Copy the rows of A from host to device at the address determined by h_A[i]
        GPUerrchk(cudaMemcpy(h_A[i], &A[i][0], C*sizeof(int), cudaMemcpyHostToDevice));
    }

    // --- Create an array of R pointers on the device
    int **d_A; GPUerrchk(cudaMalloc((void**)&d_A, R*sizeof(int*)));
    // --- Copy the addresses of the rows of the device matrix from host to device
    GPUerrchk(cudaMemcpy(d_A, h_A, R*sizeof(int*), cudaMemcpyHostToDevice));

    dim3 nblocks(R,C);
    dim3 nthreads(1);

    printf("After transfer \n");
    threads<<<nblocks, nthreads>>>(d_A);
    GPUerrchk(cudaPeekAtLastError());

    cudaDeviceSynchronize();

    getch();

    return 0;

}

As also pointed out in cuda 2D array problem, it is generally better to flatten the 2D array to a 1D array and avoid this cumbersome pointer handling altogether; a sketch of that approach is shown below.
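For reference, here is a minimal sketch of that flattened version. It reuses the GPUerrchk pattern from above; the kernel and variable names mirror the original program but are only illustrative, and the kernel now also receives the column count so it can compute the row-major index itself.

#include<stdio.h>
#include<stdlib.h>

inline void GPUassert(cudaError_t code, const char *file, int line, bool Abort=true)
{
    if (code != cudaSuccess) {
        fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
        if (Abort) exit(code);
    }
}

#define GPUerrchk(ans) { GPUassert((ans), __FILE__, __LINE__); }

// --- The matrix lives in one contiguous, row-major block,
// --- so the kernel simply indexes it as row*C + column.
__global__ void threads(const int* dA, int C) {

    int gi = blockIdx.x;
    int gj = blockIdx.y;

    printf("global Id in X= %i, in Y =%i, E= %i\n", gi, gj, dA[gi*C+gj]);

}

int main(int argc, char** argv)
{
    int R=5, C=4;
    int size=R*C*sizeof(int);

    // --- One contiguous host allocation replaces the R separate row allocations
    int *A = (int*)calloc(R*C, sizeof(int));
    for(int i=0; i<R; i++) for(int j=0; j<C; j++) A[i*C+j]=i*C+j;

    // --- One device allocation and a single cudaMemcpy move the whole matrix
    int *d_A; GPUerrchk(cudaMalloc((void**)&d_A, size));
    GPUerrchk(cudaMemcpy(d_A, A, size, cudaMemcpyHostToDevice));

    dim3 nblocks(R,C);
    dim3 nthreads(1);

    threads<<<nblocks, nthreads>>>(d_A, C);
    GPUerrchk(cudaPeekAtLastError());
    GPUerrchk(cudaDeviceSynchronize());

    GPUerrchk(cudaFree(d_A));
    free(A);

    return 0;
}

With this layout there is no per-row loop of cudaMalloc/cudaMemcpy calls and no array of device pointers to manage, which is exactly the bookkeeping the double-pointer version required.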

Licensed under: CC-BY-SA with attribution
Not affiliated with StackOverflow