Using cudaMemcpy3D to transfer *** pointer

Question

Any time you are having trouble with a cuda code, it's a good idea to do proper cuda error checking. The code you have posted here, at least, does not run correctly for me - the cudaMemcpy3D line throws an error. This is due to item 2 below. (I suspect the code you used to generate the output was not identical to the code you have shown here, but that's just a guess.)
Your usage of make_cudaPitchedPtr is not correct:
```
cpy.srcPtr = make_cudaPitchedPtr(h_tensor[0][0], Nx * sizeof(int), Ny, Nz);
```
review the API documentation. Making a CUDA pitched pointer this way is no different between 2D and 3D. So it makes no sense to pass 3 different dimensions as you are doing. Instead do this:
```
cpy.srcPtr = make_cudaPitchedPtr(h_tensor[0][0], Nx * sizeof(int), Nx, Ny);
```
The remaining issues I found I attribute to incorrect understanding of 3 dimensions in C. The last subscript on a multiply-subscripted array is the rapidly varying dimension, i.e. it is the one where adjacent values in memory occupy adjacent index values. Your usage of Z in the 3rd dimension is confusing to me due to this. Your host allocation was using Nx in the first subscript place, but your device indexing didn't match. There are obviously multiple ways to handle this. If you don't like my arrangement, you can change it, but the host and device indexing must match.

Anyway, the following code modifications worked for me:

#include <stdio.h>

int ***alloc_tensor(int Nx, int Ny, int Nz) {
   int i, j;
   int ***tensor;

   tensor = (int ***) malloc((size_t) (Nx * sizeof(int **)));
   tensor[0] = (int **) malloc((size_t) (Nx * Ny * sizeof(int *)));
   tensor[0][0] = (int *) malloc((size_t) (Nx * Ny * Nz * sizeof(int)));

   for(j = 1; j < Ny; j++)
      tensor[0][j] = tensor[0][j-1] + Nz;
   for(i = 1; i < Nx; i++) {
      tensor[i] = tensor[i - 1] + Ny;
      tensor[i][0] = tensor[i - 1][0] + Ny * Nz;
      for(j = 1; j < Ny; j++)
         tensor[i][j] = tensor[i][j - 1] + Nz;
   }

   return tensor;
}

__global__ void kernel(cudaPitchedPtr tensor, int Nx, int Ny, int Nz) {
   int i, j, k;
   char *tensorslice;
   int *tensorrow;

   for (i = 0; i < Nx; i++) {
      for (j = 0; j < Ny; j++) {
         for (k = 0; k < Nz; k++) {
            tensorslice = ((char *)tensor.ptr) + k * tensor.pitch * Ny;
            tensorrow = (int *)(tensorslice + j * tensor.pitch);
            printf("d_tensor[%d][%d][%d] = %d\n", i, j, k, tensorrow[i]);
         }
      }
   }
}

int main() {
   int i, j, k, value = 0;
   int Nx = 2, Ny = 6, Nz = 4;

   int ***h_tensor;
   struct cudaPitchedPtr d_tensor;

   h_tensor = alloc_tensor(Nz, Ny, Nx);
   cudaMalloc3D(&d_tensor, make_cudaExtent(Nx * sizeof(int), Ny, Nz));

   for(i = 0; i < Nx; i++) {
      for(j = 0; j < Ny; j++) {
         for(k = 0; k < Nz; k++) {
            h_tensor[k][j][i] = value++;
            //printf("h_tensor[%d][%d][%d] = %d\n", i, j, k, h_tensor[i][j][k]);
         }
      }
   }
   for(i = 0; i < Nx; i++) {
      for(j = 0; j < Ny; j++) {
         for(k = 0; k < Nz; k++) {
            //h_tensor[i][j][k] = value++;
            printf("h_tensor[%d][%d][%d] = %d\n", i, j, k, h_tensor[k][j][i]);
         }
      }
   }

   cudaMemcpy3DParms cpy = { 0 };
   cpy.srcPtr = make_cudaPitchedPtr(h_tensor[0][0], Nx * sizeof(int), Nx, Ny);
   cpy.dstPtr = d_tensor;
   cpy.extent = make_cudaExtent(Nx * sizeof(int), Ny, Nz);
   cpy.kind = cudaMemcpyHostToDevice;

   cudaMemcpy3D(&cpy);

   kernel<<<1, 1>>>(d_tensor, Nx, Ny, Nz);
   cudaDeviceSynchronize();
   // ... clean-up
}