質問

I am trying to use cudaMemcpy3D to transfer dynamically allocated 3d matrix (tensor). Tensor is allocated as contiguous block of memory (see code below). I tried various combinations of cudaExtent and cudaMemcpy3DParms, however the order of elements gets mixed up. I created the following example to demonstrate the issue:

#include <stdio.h>

int ***alloc_tensor(int Nx, int Ny, int Nz) {
   int i, j;
   int ***tensor;

   tensor = (int ***) malloc((size_t) (Nx * sizeof(int **)));
   tensor[0] = (int **) malloc((size_t) (Nx * Ny * sizeof(int *)));
   tensor[0][0] = (int *) malloc((size_t) (Nx * Ny * Nz * sizeof(int)));

   for(j = 1; j < Ny; j++)
      tensor[0][j] = tensor[0][j-1] + Nz;
   for(i = 1; i < Nx; i++) {
      tensor[i] = tensor[i - 1] + Ny;
      tensor[i][0] = tensor[i - 1][0] + Ny * Nz;
      for(j = 1; j < Ny; j++)
         tensor[i][j] = tensor[i][j - 1] + Nz;
   }

   return tensor;
}

__global__ void kernel(cudaPitchedPtr tensor, int Nx, int Ny, int Nz) {
   int i, j, k;
   char *tensorslice;
   int *tensorrow;

   for (i = 0; i < Nx; i++) {
      for (j = 0; j < Ny; j++) {
         for (k = 0; k < Nz; k++) {
            tensorslice = ((char *)tensor.ptr) + k * tensor.pitch * Nx;
            tensorrow = (int *)(tensorslice + i * tensor.pitch);
            printf("d_tensor[%d][%d][%d] = %d\n", i, j, k, tensorrow[j]);
         }
      }
   }   
}

int main() {
   int i, j, k, value = 0;
   int Nx = 2, Ny = 6, Nz = 4;

   int ***h_tensor;
   struct cudaPitchedPtr d_tensor;

   h_tensor = alloc_tensor(Nx, Ny, Nz);
   cudaMalloc3D(&d_tensor, make_cudaExtent(Nx * sizeof(int), Ny, Nz));

   for(i = 0; i < Nx; i++) {
      for(j = 0; j < Ny; j++) {
         for(k = 0; k < Nz; k++) {
            h_tensor[i][j][k] = value++;
            printf("h_tensor[%d][%d][%d] = %d\n", i, j, k, h_tensor[i][j][k]);
         }
      }
   }

   cudaMemcpy3DParms cpy = { 0 };
   cpy.srcPtr = make_cudaPitchedPtr(h_tensor[0][0], Nx * sizeof(int), Ny, Nz);
   cpy.dstPtr = d_tensor;
   cpy.extent = make_cudaExtent(Nx * sizeof(int), Ny, Nz);
   cpy.kind = cudaMemcpyHostToDevice;

   cudaMemcpy3D(&cpy);

   kernel<<<1, 1>>>(d_tensor, Nx, Ny, Nz);

   // ... clean-up
}

Output for host variable (h_tensor) and device (d_tensor) differ, looking like

h_tensor[0][0][0] = 0
h_tensor[0][0][1] = 1
h_tensor[0][0][2] = 2
h_tensor[0][0][3] = 3
h_tensor[0][1][0] = 4
h_tensor[0][1][1] = 5
h_tensor[0][1][2] = 6
...

d_tensor[0][0][0] = 0
d_tensor[0][0][1] = 12
d_tensor[0][0][2] = 24
d_tensor[0][0][3] = 36
d_tensor[0][1][0] = 1
d_tensor[0][1][1] = 13
d_tensor[0][1][2] = 25
...

What am I doing wrong? What would be the correct way to use cudaMemcpy3D?

役に立ちましたか?

解決

  1. Any time you are having trouble with a cuda code, it's a good idea to do proper cuda error checking. The code you have posted here, at least, does not run correctly for me - the cudaMemcpy3D line throws an error. This is due to item 2 below. (I suspect the code you used to generate the output was not identical to the code you have shown here, but that's just a guess.)
  2. Your usage of make_cudaPitchedPtr is not correct:

    cpy.srcPtr = make_cudaPitchedPtr(h_tensor[0][0], Nx * sizeof(int), Ny, Nz);
    

    review the API documentation. Making a CUDA pitched pointer this way is no different between 2D and 3D. So it makes no sense to pass 3 different dimensions as you are doing. Instead do this:

    cpy.srcPtr = make_cudaPitchedPtr(h_tensor[0][0], Nx * sizeof(int), Nx, Ny);
    
  3. The remaining issues I found I attribute to incorrect understanding of 3 dimensions in C. The last subscript on a multiply-subscripted array is the rapidly varying dimension, i.e. it is the one where adjacent values in memory occupy adjacent index values. Your usage of Z in the 3rd dimension is confusing to me due to this. Your host allocation was using Nx in the first subscript place, but your device indexing didn't match. There are obviously multiple ways to handle this. If you don't like my arrangement, you can change it, but the host and device indexing must match.

Anyway, the following code modifications worked for me:

#include <stdio.h>

int ***alloc_tensor(int Nx, int Ny, int Nz) {
   int i, j;
   int ***tensor;

   tensor = (int ***) malloc((size_t) (Nx * sizeof(int **)));
   tensor[0] = (int **) malloc((size_t) (Nx * Ny * sizeof(int *)));
   tensor[0][0] = (int *) malloc((size_t) (Nx * Ny * Nz * sizeof(int)));

   for(j = 1; j < Ny; j++)
      tensor[0][j] = tensor[0][j-1] + Nz;
   for(i = 1; i < Nx; i++) {
      tensor[i] = tensor[i - 1] + Ny;
      tensor[i][0] = tensor[i - 1][0] + Ny * Nz;
      for(j = 1; j < Ny; j++)
         tensor[i][j] = tensor[i][j - 1] + Nz;
   }

   return tensor;
}

__global__ void kernel(cudaPitchedPtr tensor, int Nx, int Ny, int Nz) {
   int i, j, k;
   char *tensorslice;
   int *tensorrow;

   for (i = 0; i < Nx; i++) {
      for (j = 0; j < Ny; j++) {
         for (k = 0; k < Nz; k++) {
            tensorslice = ((char *)tensor.ptr) + k * tensor.pitch * Ny;
            tensorrow = (int *)(tensorslice + j * tensor.pitch);
            printf("d_tensor[%d][%d][%d] = %d\n", i, j, k, tensorrow[i]);
         }
      }
   }
}

int main() {
   int i, j, k, value = 0;
   int Nx = 2, Ny = 6, Nz = 4;

   int ***h_tensor;
   struct cudaPitchedPtr d_tensor;

   h_tensor = alloc_tensor(Nz, Ny, Nx);
   cudaMalloc3D(&d_tensor, make_cudaExtent(Nx * sizeof(int), Ny, Nz));

   for(i = 0; i < Nx; i++) {
      for(j = 0; j < Ny; j++) {
         for(k = 0; k < Nz; k++) {
            h_tensor[k][j][i] = value++;
            //printf("h_tensor[%d][%d][%d] = %d\n", i, j, k, h_tensor[i][j][k]);
         }
      }
   }
   for(i = 0; i < Nx; i++) {
      for(j = 0; j < Ny; j++) {
         for(k = 0; k < Nz; k++) {
            //h_tensor[i][j][k] = value++;
            printf("h_tensor[%d][%d][%d] = %d\n", i, j, k, h_tensor[k][j][i]);
         }
      }
   }

   cudaMemcpy3DParms cpy = { 0 };
   cpy.srcPtr = make_cudaPitchedPtr(h_tensor[0][0], Nx * sizeof(int), Nx, Ny);
   cpy.dstPtr = d_tensor;
   cpy.extent = make_cudaExtent(Nx * sizeof(int), Ny, Nz);
   cpy.kind = cudaMemcpyHostToDevice;

   cudaMemcpy3D(&cpy);

   kernel<<<1, 1>>>(d_tensor, Nx, Ny, Nz);
   cudaDeviceSynchronize();
   // ... clean-up
}
ライセンス: CC-BY-SA帰属
所属していません StackOverflow
scroll top