NaN problems with cuFFT

Question 1

My mistake was forgetting to multiply the number of items with their size in some of the cudaMemcpy calls, thus the end of the vectors fed to cuFFT was made up of NaNs. Fixing those has solved the problem.

I also replaced the cufftReal arrays with cufftComplex ones as the C2C transformations seem to be more predictable and added normalization for the values.

So the final working method is:

///////////////////////////////////////////////////////////////////////////////
// Function to help invoking the kernel, creates the parameters and gets 
// the result
__host__
void Process(
        BitmapStruct& in_img, 
        BitmapStruct& out_img, 
        MaskGenerator maskGenerator, 
        float param1, 
        float param2)
{    
    // Declare and allocate variables
    cufftHandle plan;

    cufftComplex* img;
    cufftComplex* dev_img;
    cufftComplex* dev_freq_img;

    int imgsize = in_img.image_size();
    int pixelcount = imgsize / 4;

    img = new cufftComplex[pixelcount];
    checkResult(
        cudaMalloc(&dev_img, sizeof(cufftComplex) * pixelcount));
    checkResult(
        cudaMalloc(&dev_freq_img, sizeof(cufftComplex) * pixelcount));

    // Optimize execution
    cudaFuncAttributes attrs;
    checkResult(
        cudaFuncGetAttributes(&attrs, &Filter));
    std::pair<dim3, dim3> params = 
            Optimizer::GetOptimalParameters(pixelcount, attrs);

    // Process r, g, b channels
    for(int chan = 0; chan <= 2; chan++)
    {
        // Init
        for(int i = 0; i < pixelcount; i++)
        {
            img[i].x = in_img.pixels[4 * i + chan];
            img[i].y = 0;
        }

        checkResult(
            cudaMemcpy(
                dev_img, 
                img, 
                pixelcount * sizeof(cufftComplex), 
                cudaMemcpyHostToDevice));

        // Create frequency image
        checkResult(
            cufftPlan1d(&plan, pixelcount, CUFFT_C2C, 1));
        checkResult(
            cufftExecC2C(plan, dev_img, dev_freq_img, CUFFT_FORWARD));
        checkResult(
            cudaThreadSynchronize());
        checkResult(
            cufftDestroy(plan));

        // Mask frequency image
        Filter<<<params.first, params.second>>>(
            dev_freq_img, 
            in_img.x, 
            in_img.y, 
            maskGenerator, 
            param1, 
            param2);
        getLastCudaError("Filtering the image failed.");

        // Get result
        checkResult(
            cufftPlan1d(&plan, pixelcount, CUFFT_C2C, 1));
        checkResult(
            cufftExecC2C(plan, dev_freq_img, dev_img, CUFFT_INVERSE));
        checkResult(
            cudaThreadSynchronize());
        checkResult(
            cufftDestroy(plan));
        checkResult(
            cudaMemcpy(
                img, 
                dev_img, 
                pixelcount * sizeof(cufftComplex), 
                cudaMemcpyDeviceToHost));

        for(int i = 0; i < pixelcount; i++)
        {
            out_img.pixels[4 * i + chan] = img[i].x / pixelcount;
        }
    }

    // Copy alpha channel
    for(int i = 0; i < pixelcount; i++)
    {
        out_img.pixels[4 * i + 3] = in_img.pixels[4 * i + 3];
    }

    // Free memory
    checkResult(
        cudaFree(dev_freq_img));
    checkResult(
        cudaFree(dev_img));
    delete img;

    getLastCudaError("An error occured during processing the image.");
}

Thank you for the help.

Question 2

I haven't compiled and run your reduced version, but I think the problem is in the size of dev_img and dev_freq_imag.

Consider the example on Section 4.2 of the CUFFT Library User's Guide. It performs an in-place real-to-complex transform, which is the same step you are performing first.

#define NX 256

cufftHandle plan;
cufftComplex *data;
cudaMalloc((void**)&data, sizeof(cufftComplex)*(NX/2+1)*BATCH);

cufftPlan1d(&plan, NX, CUFFT_R2C, BATCH);
cufftExecR2C(plan, (cufftReal*)data, data);

Due to the symmetry properties of the transform, cufftExecR2C fills only NX/2+1 output elements, where NX is the size of the input array.

In your case, you are doing the following:

cufftHandle plan;

cufftReal* dev_img;
cufftComplex* dev_freq_img;

cudaMalloc(&dev_img, sizeof(cufftReal) * pixelcount);
cudaMalloc(&dev_freq_img, sizeof(cufftComplex) * pixelcount);

so you are allocating a cufftReal array and a cufftComplex array of the same size. When you use

cufftPlan1d(&plan, pixelcount, CUFFT_R2C, 1);
cufftExecR2C(plan, dev_img, dev_freq_img);

then only half of the dev_freq_img is filled by cufftExecR2C, the remaining part containing garbage. If you use the full extent of dev_freq_img in the Filter __global__ function, then this will be probably the cause of your NaNs.