I want to copy a dynamically allocated 2D array from host to device to get its Discrete Fourier Transform.
I'm using below code to copy the array to the device
cudaMalloc((void**)&array_d, sizeof(cufftComplex)*NX*(NY/2+1));
cudaMemcpy(array_d, array_h, sizeof(float)*NX*NY, cudaMemcpyHostToDevice);
This works fine with static arrays, i get the intended output from my fft.
But it doesn't work with dynamic arrays. After little bit searching I learnt I can not copy dynamic arrays like this from host to device. So I found this solution.
cudaMalloc((void**)&array_d, sizeof(cufftComplex)*NX*(NY/2+1));
for(int i=0; i<NX; ++i){
cudaMemcpy(array_d+ i*NY, array_h[i], sizeof(float)*NY, cudaMemcpyHostToDevice);
}
But it's also not doing the task properly since I get wrong values from my fft.
Given below is my fft code.
cufftPlanMany(&plan, NRANK, n,NULL, 1, 0,NULL, 1, 0,CUFFT_R2C,BATCH);
cufftSetCompatibilityMode(plan, CUFFT_COMPATIBILITY_NATIVE);
cufftExecR2C(plan, (cufftReal*)data, data);
cudaThreadSynchronize();
cudaMemcpy(c, data, sizeof(float)*NX*NY, cudaMemcpyDeviceToHost);
How can I overcome this problem ?
EDIT
given below is the code
#define NX 4
#define NY 5
#define NRANK 2
#define BATCH 10
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <cufft.h>
#include <stdio.h>
#include <iostream>
int check();
int main()
{
// static array
float b[NX][NY] ={
{0.7943 , 0.6020 , 0.7482 , 0.9133 , 0.9961},
{0.3112 , 0.2630 , 0.4505 , 0.1524 , 0.0782},
{0.5285 , 0.6541 , 0.0838 , 0.8258 , 0.4427},
{0.1656 , 0.6892 , 0.2290 , 0.5383 , 0.1067}
};
// dynamic array
float **a = new float*[NX];
for (int r = 0; r < NX; ++r)
{
a[r] = new float[NY];
for (int c = 0; c < NY; ++c)
{
a[r][c] = b[r][c];
}
}
// arrray to store the results - host side
float c[NX][NY] = { 0 };
cufftHandle plan;
cufftComplex *data;
int n[NRANK] = {NX, NY};
cudaMalloc((void**)&data, sizeof(cufftComplex)*NX*(NY/2+1));
cudaMemcpy(data, b, sizeof(float)*NX*NY, cudaMemcpyHostToDevice);
/* Create a 2D FFT plan. */
cufftPlanMany(&plan, NRANK, n,NULL, 1, 0,NULL, 1, 0,CUFFT_R2C,BATCH);
cufftSetCompatibilityMode(plan, CUFFT_COMPATIBILITY_NATIVE);
cufftExecR2C(plan, (cufftReal*)data, data);
cudaThreadSynchronize();
cudaMemcpy(c, data, sizeof(float)*NX*NY, cudaMemcpyDeviceToHost);
cufftDestroy(plan);
cudaFree(data);
return 0;
}