My mistake was forgetting to multiply the number of items by their size in some of the cudaMemcpy
calls, so the end of the vectors fed to cuFFT was made up of NaNs. Fixing those calls solved the problem.
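I also replaced the cufftReal arrays with cufftComplex ones, as the C2C transformations seem to be more predictable, and added normalization of the values (cuFFT transforms are unnormalized, so the result of the inverse transform is divided by the number of elements).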
So the final working method is:
///////////////////////////////////////////////////////////////////////////////
// Host-side driver for the frequency-domain filter.
//
// For each of the first three components of every 4-component pixel
// (the r, g, b channels) it:
//   1. copies the channel into a complex host buffer (imaginary part = 0)
//      and uploads it to the device,
//   2. runs a forward 1D C2C cuFFT over all pixels,
//   3. launches the Filter kernel on the frequency data,
//   4. runs the inverse C2C cuFFT and downloads the result,
//   5. divides the real part by the pixel count (cuFFT transforms are
//      unnormalized) and stores it into the same channel of out_img.
// The fourth component (alpha) is copied from in_img to out_img unchanged.
//
// Parameters:
//   in_img        - source image; read through image_size(), pixels, x and y
//   out_img       - destination image; its pixels buffer is written in place
//                   NOTE(review): assumed to be already allocated with at
//                   least in_img.image_size() elements - confirm at callers
//   maskGenerator - forwarded unchanged to the Filter kernel
//   param1/param2 - forwarded unchanged to the Filter kernel
//
// Errors are reported through checkResult / getLastCudaError.
__host__
void Process(
    BitmapStruct& in_img,
    BitmapStruct& out_img,
    MaskGenerator maskGenerator,
    float param1,
    float param2)
{
    // Pixels are stored as 4 interleaved components
    int imgsize = in_img.image_size();
    int pixelcount = imgsize / 4;

    // Nothing to transform for an empty image; bail out before asking cuFFT
    // for a zero-length plan, which is not expected to be a valid plan size.
    if (pixelcount <= 0)
    {
        return;
    }

    // Declare and allocate variables
    const size_t bufferBytes =
        sizeof(cufftComplex) * static_cast<size_t>(pixelcount);
    cufftComplex* img = new cufftComplex[pixelcount];
    cufftComplex* dev_img = 0;
    cufftComplex* dev_freq_img = 0;
    checkResult(
        cudaMalloc(&dev_img, bufferBytes));
    checkResult(
        cudaMalloc(&dev_freq_img, bufferBytes));

    // Optimize execution
    cudaFuncAttributes attrs;
    checkResult(
        cudaFuncGetAttributes(&attrs, &Filter));
    std::pair<dim3, dim3> params =
        Optimizer::GetOptimalParameters(pixelcount, attrs);

    // The plan depends only on the transform size and type; the direction is
    // chosen per cufftExecC2C call. One plan is therefore created up front and
    // reused for every forward and inverse transform of all three channels.
    cufftHandle plan;
    checkResult(
        cufftPlan1d(&plan, pixelcount, CUFFT_C2C, 1));

    // Process r, g, b channels
    for (int chan = 0; chan <= 2; chan++)
    {
        // Init: real part = channel value, imaginary part = 0
        for (int i = 0; i < pixelcount; i++)
        {
            img[i].x = in_img.pixels[4 * i + chan];
            img[i].y = 0;
        }
        checkResult(
            cudaMemcpy(
                dev_img,
                img,
                bufferBytes,
                cudaMemcpyHostToDevice));

        // Create frequency image
        checkResult(
            cufftExecC2C(plan, dev_img, dev_freq_img, CUFFT_FORWARD));
        // Synchronize so that a failure in the transform is reported here
        // rather than being attributed to the kernel launch below.
        checkResult(
            cudaDeviceSynchronize());

        // Mask frequency image
        Filter<<<params.first, params.second>>>(
            dev_freq_img,
            in_img.x,
            in_img.y,
            maskGenerator,
            param1,
            param2);
        getLastCudaError("Filtering the image failed.");

        // Get result
        checkResult(
            cufftExecC2C(plan, dev_freq_img, dev_img, CUFFT_INVERSE));
        checkResult(
            cudaDeviceSynchronize());
        checkResult(
            cudaMemcpy(
                img,
                dev_img,
                bufferBytes,
                cudaMemcpyDeviceToHost));

        // Forward followed by inverse cuFFT scales the data by the number of
        // elements, so divide by pixelcount to get back to the input range.
        for (int i = 0; i < pixelcount; i++)
        {
            out_img.pixels[4 * i + chan] = img[i].x / pixelcount;
        }
    }

    // Copy alpha channel
    for (int i = 0; i < pixelcount; i++)
    {
        out_img.pixels[4 * i + 3] = in_img.pixels[4 * i + 3];
    }

    // Free resources
    checkResult(
        cufftDestroy(plan));
    checkResult(
        cudaFree(dev_freq_img));
    checkResult(
        cudaFree(dev_img));
    // img was allocated with new[], so it must be released with delete[]
    delete[] img;
    getLastCudaError("An error occured during processing the image.");
}
Thank you for the help.