CUDA debugging with NSight Visual Studio 2010

https://stackoverflow.com/questions/20245738
05-08-2022
|
Question
I am not able to debug for the "global" function lines for which I set breakpoints. I debug with "Start CUDA Debugging" option from NSight menu.
My NSight plugin is successfully installed for VS 2010, I am able to debug my other projects (sample projects came within NSight debugger)
My code is here (it is a bit long but generally repeats same functions) :
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>
#include "device_launch_parameters.h"
#include <cuda_runtime.h>
#include <cufft.h>
#include <helper_cuda.h>
#include "book.h"

#define N   (131072)

__global__ void conjugate( float2 *a ) {
    int idx = threadIdx.x + blockIdx.x * blockDim.x;
    if (idx < N) {
        a[idx] = cuConjf(a[idx]);
    }
}

__global__ void multWithReference( float2 *signal, float2 *reference ) {
    int idx = threadIdx.x + blockIdx.x * blockDim.x;
    if (idx < N) {
        signal[idx].x = signal[idx].x * reference[idx].x;
        signal[idx].y = signal[idx].y * reference[idx].y;
    }
}

__global__ void shift( float2 *signal, size_t shiftamount, float2* shifted ) {
    int idx = threadIdx.x + blockIdx.x * blockDim.x;
    *(shifted+((idx+shiftamount)%131072)) = *(signal+idx);
}

__global__ void fftshift(float2 *u_d)
{
    int i = blockDim.x * blockIdx.x + threadIdx.x;

    if(i < 131072)
    {
        double a = 1-2*(i&1);
        u_d[i].x *= a;
        u_d[i].y *= a;
    }
}


static inline cufftHandle createFFTPlan(cudaStream_t* stream)
{
    cufftHandle plan;

    if (cudaGetLastError() != cudaSuccess){
        fprintf(stderr, "Cuda error: Failed to allocate\n");
    }
    if (cufftPlan1d(&plan, 131072, CUFFT_C2C,1) != CUFFT_SUCCESS){
        fprintf(stderr, "CUFFT error: Plan creation failed");
    }
    if (cufftSetStream(plan, *stream) != CUFFT_SUCCESS){
        fprintf(stderr, "CUFFT error: Plan stream association failed");
    }

    return plan;
}

int main( void ) {

    cudaDeviceProp  prop;
    int whichDevice;
    HANDLE_ERROR( cudaGetDevice( &whichDevice ) );
    HANDLE_ERROR( cudaGetDeviceProperties( &prop, whichDevice ) );
    if (!prop.deviceOverlap) {
        printf( "Device will not handle overlaps, so no speed up from streams\n" );
        return 0;
    }

    cudaEvent_t     start, stop;
    float           elapsedTime;

    cudaStream_t    stream0, stream1, stream2, stream3, stream4, stream5, stream6, stream7;
    float2* host_ref, *host_0, *host_1, *host_2, *host_3, *host_4, *host_5, *host_6, *host_7;
    float2* dev_ref, *dev_0, *dev_1, *dev_2, *dev_3, *dev_4, *dev_5, *dev_6, *dev_7;

    // start the timers
    HANDLE_ERROR( cudaEventCreate( &start ) );
    HANDLE_ERROR( cudaEventCreate( &stop ) );

    // initialize the streams
    HANDLE_ERROR( cudaStreamCreate( &stream0 ) );
    HANDLE_ERROR( cudaStreamCreate( &stream1 ) );
    HANDLE_ERROR( cudaStreamCreate( &stream2 ) );
    HANDLE_ERROR( cudaStreamCreate( &stream3 ) );
    HANDLE_ERROR( cudaStreamCreate( &stream4 ) );
    HANDLE_ERROR( cudaStreamCreate( &stream5 ) );
    HANDLE_ERROR( cudaStreamCreate( &stream6 ) );
    HANDLE_ERROR( cudaStreamCreate( &stream7 ) );

    // allocate the memory on the GPU
    HANDLE_ERROR( cudaMalloc( (void**)&dev_ref,
                              N * sizeof(float2) ) );
    HANDLE_ERROR( cudaMalloc( (void**)&dev_0,
                              N * sizeof(float2) ) );
    HANDLE_ERROR( cudaMalloc( (void**)&dev_1,
                              N * sizeof(float2) ) );
    HANDLE_ERROR( cudaMalloc( (void**)&dev_2,
                              N * sizeof(float2) ) );
    HANDLE_ERROR( cudaMalloc( (void**)&dev_3,
                              N * sizeof(float2) ) );
    HANDLE_ERROR( cudaMalloc( (void**)&dev_4,
                              N * sizeof(float2) ) );
    HANDLE_ERROR( cudaMalloc( (void**)&dev_5,
                              N * sizeof(float2) ) );
    HANDLE_ERROR( cudaMalloc( (void**)&dev_6,
                              N * sizeof(float2) ) );
    HANDLE_ERROR( cudaMalloc( (void**)&dev_7,
                              N * sizeof(float2) ) );

    // allocate host locked memory, used to stream
    HANDLE_ERROR( cudaHostAlloc( (void**)&host_ref,
                              N * sizeof(float2),
                              cudaHostAllocDefault ) );
    HANDLE_ERROR( cudaHostAlloc( (void**)&host_0,
                              N * sizeof(float2),
                              cudaHostAllocDefault ) );
    HANDLE_ERROR( cudaHostAlloc( (void**)&host_1,
                              N * sizeof(float2),
                              cudaHostAllocDefault ) );
    HANDLE_ERROR( cudaHostAlloc( (void**)&host_2,
                              N * sizeof(float2),
                              cudaHostAllocDefault ) );
    HANDLE_ERROR( cudaHostAlloc( (void**)&host_3,
                              N * sizeof(float2),
                              cudaHostAllocDefault ) );
    HANDLE_ERROR( cudaHostAlloc( (void**)&host_4,
                              N * sizeof(float2),
                              cudaHostAllocDefault ) );
    HANDLE_ERROR( cudaHostAlloc( (void**)&host_5,
                              N * sizeof(float2),
                              cudaHostAllocDefault ) );
    HANDLE_ERROR( cudaHostAlloc( (void**)&host_6,
                              N * sizeof(float2),
                              cudaHostAllocDefault ) );
    HANDLE_ERROR( cudaHostAlloc( (void**)&host_7,
                              N * sizeof(float2),
                              cudaHostAllocDefault ) );

    // Open signal file
    FILE *fp;
    if(NULL == (fp = fopen("testSignal4.bin","r"))){
        printf("can not open file...");
        exit(1);
    }
    fread(host_ref, sizeof(float2), 131072, fp);
    fclose(fp);

    if(NULL == (fp = fopen("testSignal4.bin","r"))){
        printf("can not open file...");
        exit(1);
    }
    fread(host_0, sizeof(float2), 131072, fp);
    fclose(fp);

    if(NULL == (fp = fopen("testSignal4.bin","r"))){
        printf("can not open file...");
        exit(1);
    }
    fread(host_1, sizeof(float2), 131072, fp);
    fclose(fp);

    if(NULL == (fp = fopen("testSignal4.bin","r"))){
        printf("can not open file...");
        exit(1);
    }
    fread(host_2, sizeof(float2), 131072, fp);
    fclose(fp);

    if(NULL == (fp = fopen("testSignal4.bin","r"))){
        printf("can not open file...");
        exit(1);
    }
    fread(host_3, sizeof(float2), 131072, fp);
    fclose(fp);

    if(NULL == (fp = fopen("testSignal4.bin","r"))){
        printf("can not open file...");
        exit(1);
    }
    fread(host_4, sizeof(float2), 131072, fp);
    fclose(fp);

    if(NULL == (fp = fopen("testSignal4.bin","r"))){
        printf("can not open file...");
        exit(1);
    }
    fread(host_5, sizeof(float2), 131072, fp);
    fclose(fp);

    if(NULL == (fp = fopen("testSignal4.bin","r"))){
        printf("can not open file...");
        exit(1);
    }
    fread(host_6, sizeof(float2), 131072, fp);
    fclose(fp);

    if(NULL == (fp = fopen("testSignal4.bin","r"))){
        printf("can not open file...");
        exit(1);
    }
    fread(host_7, sizeof(float2), 131072, fp);
    fclose(fp);

    // create FFT plans
    cufftHandle plan0 = createFFTPlan(&stream0);
    cufftHandle plan1 = createFFTPlan(&stream1);
    cufftHandle plan2 = createFFTPlan(&stream2);
    cufftHandle plan3 = createFFTPlan(&stream3);
    cufftHandle plan4 = createFFTPlan(&stream4);
    cufftHandle plan5 = createFFTPlan(&stream5);
    cufftHandle plan6 = createFFTPlan(&stream6);
    cufftHandle plan7 = createFFTPlan(&stream7);

    float2* shifted0;
    HANDLE_ERROR( cudaMalloc( (void**)&shifted0,
                            N * sizeof(float2) ) );
    float2* shifted1;
    HANDLE_ERROR( cudaMalloc( (void**)&shifted1,
                            N * sizeof(float2) ) );
    float2* shifted2;
    HANDLE_ERROR( cudaMalloc( (void**)&shifted2,
                            N * sizeof(float2) ) );
    float2* shifted3;
    HANDLE_ERROR( cudaMalloc( (void**)&shifted3,
                            N * sizeof(float2) ) );
    float2* shifted4;
    HANDLE_ERROR( cudaMalloc( (void**)&shifted4,
                            N * sizeof(float2) ) );
    float2* shifted5;
    HANDLE_ERROR( cudaMalloc( (void**)&shifted5,
                            N * sizeof(float2) ) );
    float2* shifted6;
    HANDLE_ERROR( cudaMalloc( (void**)&shifted6,
                            N * sizeof(float2) ) );
    float2* shifted7;
    HANDLE_ERROR( cudaMalloc( (void**)&shifted7,
                            N * sizeof(float2) ) );

    HANDLE_ERROR( cudaEventRecord( start, 0 ) );

    // enqueue copies of a in stream0 and stream1
    HANDLE_ERROR( cudaMemcpyAsync( dev_ref, host_ref,
                                    sizeof(float2),
                                    cudaMemcpyHostToDevice,
                                    stream2 ) );

    HANDLE_ERROR( cudaMemcpyAsync( dev_0, host_0,
                                    sizeof(float2),
                                    cudaMemcpyHostToDevice,
                                    stream0 ) );
    HANDLE_ERROR( cudaMemcpyAsync( dev_1, host_1,
                                    sizeof(float2),
                                    cudaMemcpyHostToDevice,
                                    stream1 ) );
    HANDLE_ERROR( cudaMemcpyAsync( dev_2, host_2,
                                    sizeof(float2),
                                    cudaMemcpyHostToDevice,
                                    stream2 ) );
    HANDLE_ERROR( cudaMemcpyAsync( dev_3, host_3,
                                    sizeof(float2),
                                    cudaMemcpyHostToDevice,
                                    stream3 ) );
    HANDLE_ERROR( cudaMemcpyAsync( dev_4, host_4,
                                    sizeof(float2),
                                    cudaMemcpyHostToDevice,
                                    stream4 ) );
    HANDLE_ERROR( cudaMemcpyAsync( dev_5, host_5,
                                    sizeof(float2),
                                    cudaMemcpyHostToDevice,
                                    stream5 ) );
    HANDLE_ERROR( cudaMemcpyAsync( dev_6, host_6,
                                    sizeof(float2),
                                    cudaMemcpyHostToDevice,
                                    stream6 ) );
    HANDLE_ERROR( cudaMemcpyAsync( dev_7, host_7,
                                    sizeof(float2),
                                    cudaMemcpyHostToDevice,
                                    stream7 ) );

    for(int i = 0; i < 100; i++){

        shift<<<131072,131072,0>>>(dev_0, i, shifted0);
        shift<<<131072,131072,0,stream1>>>(dev_1, i, shifted1);
        shift<<<131072,131072,0,stream2>>>(dev_2, i, shifted2);
        shift<<<131072,131072,0,stream3>>>(dev_3, i, shifted3);
        shift<<<131072,131072,0,stream4>>>(dev_4, i, shifted4);
        shift<<<131072,131072,0,stream5>>>(dev_5, i, shifted5);
        shift<<<131072,131072,0,stream6>>>(dev_6, i, shifted6);
        shift<<<131072,131072,0,stream7>>>(dev_7, i, shifted7);

        conjugate<<<131072/256,131072,0,stream0>>>(shifted0);
        conjugate<<<131072/256,131072,0,stream1>>>(shifted1);
        conjugate<<<131072/256,131072,0,stream2>>>(shifted2);
        conjugate<<<131072/256,131072,0,stream3>>>(shifted3);
        conjugate<<<131072/256,131072,0,stream4>>>(shifted4);
        conjugate<<<131072/256,131072,0,stream5>>>(shifted5);
        conjugate<<<131072/256,131072,0,stream6>>>(shifted6);
        conjugate<<<131072/256,131072,0,stream7>>>(shifted7);

        multWithReference<<<131072/256,131072,0,stream0>>>(shifted0,dev_ref);
        multWithReference<<<131072/256,131072,0,stream1>>>(shifted1,dev_ref);
        multWithReference<<<131072/256,131072,0,stream2>>>(shifted2,dev_ref);
        multWithReference<<<131072/256,131072,0,stream3>>>(shifted3,dev_ref);
        multWithReference<<<131072/256,131072,0,stream4>>>(shifted4,dev_ref);
        multWithReference<<<131072/256,131072,0,stream5>>>(shifted5,dev_ref);
        multWithReference<<<131072/256,131072,0,stream6>>>(shifted6,dev_ref);
        multWithReference<<<131072/256,131072,0,stream7>>>(shifted7,dev_ref);

        if (cufftExecC2C(plan0, shifted0, shifted0, CUFFT_FORWARD) != CUFFT_SUCCESS){
            fprintf(stderr, "CUFFT error: ExecC2C Forward failed");
        }
        if (cufftExecC2C(plan1, shifted1, shifted1, CUFFT_FORWARD) != CUFFT_SUCCESS){
            fprintf(stderr, "CUFFT error: ExecC2C Forward failed");
        }
        if (cufftExecC2C(plan2, shifted2, shifted2, CUFFT_FORWARD) != CUFFT_SUCCESS){
            fprintf(stderr, "CUFFT error: ExecC2C Forward failed");
        }
        if (cufftExecC2C(plan3, shifted3, shifted3, CUFFT_FORWARD) != CUFFT_SUCCESS){
            fprintf(stderr, "CUFFT error: ExecC2C Forward failed");
        }
        if (cufftExecC2C(plan4, shifted4, shifted4, CUFFT_FORWARD) != CUFFT_SUCCESS){
            fprintf(stderr, "CUFFT error: ExecC2C Forward failed");
        }
        if (cufftExecC2C(plan5, shifted5, shifted5, CUFFT_FORWARD) != CUFFT_SUCCESS){
            fprintf(stderr, "CUFFT error: ExecC2C Forward failed");
        }
        if (cufftExecC2C(plan6, shifted6, shifted6, CUFFT_FORWARD) != CUFFT_SUCCESS){
            fprintf(stderr, "CUFFT error: ExecC2C Forward failed");
        }
        if (cufftExecC2C(plan7, shifted7, shifted7, CUFFT_FORWARD) != CUFFT_SUCCESS){
            fprintf(stderr, "CUFFT error: ExecC2C Forward failed");
        }

        fftshift<<<131072,131072,0,stream0>>>(shifted0);
        fftshift<<<131072,131072,0,stream1>>>(shifted1);
        fftshift<<<131072,131072,0,stream2>>>(shifted2);
        fftshift<<<131072,131072,0,stream3>>>(shifted3);
        fftshift<<<131072,131072,0,stream4>>>(shifted4);
        fftshift<<<131072,131072,0,stream5>>>(shifted5);
        fftshift<<<131072,131072,0,stream6>>>(shifted6);
        fftshift<<<131072,131072,0,stream7>>>(shifted7);

    }

    if (cudaThreadSynchronize() != cudaSuccess){
        fprintf(stderr, "Cuda error: Failed to synchronize\n");
    }

    float2 *host_last = (float2 *)malloc(8*131072);

    // enqueue copies of c from device to locked memory
    HANDLE_ERROR( cudaMemcpyAsync( host_last, shifted0,
                                    sizeof(float2),
                                    cudaMemcpyDeviceToHost,
                                    stream0 ) );

    // enqueue copies of c from device to locked memory
    HANDLE_ERROR( cudaMemcpyAsync( host_0, shifted0,
                                    sizeof(float2),
                                    cudaMemcpyDeviceToHost,
                                    stream0 ) );
    HANDLE_ERROR( cudaMemcpyAsync( host_1, shifted1,
                                    sizeof(float2),
                                    cudaMemcpyDeviceToHost,
                                    stream1 ) );
    HANDLE_ERROR( cudaMemcpyAsync( host_2, shifted2,
                                    sizeof(float2),
                                    cudaMemcpyDeviceToHost,
                                    stream2 ) );
    HANDLE_ERROR( cudaMemcpyAsync( host_3, shifted3,
                                    sizeof(float2),
                                    cudaMemcpyDeviceToHost,
                                    stream3 ) );
    HANDLE_ERROR( cudaMemcpyAsync( host_4, shifted4,
                                    sizeof(float2),
                                    cudaMemcpyDeviceToHost,
                                    stream4 ) );
    HANDLE_ERROR( cudaMemcpyAsync( host_5, shifted5,
                                    sizeof(float2),
                                    cudaMemcpyDeviceToHost,
                                    stream5 ) );
    HANDLE_ERROR( cudaMemcpyAsync( host_6, shifted6,
                                    sizeof(float2),
                                    cudaMemcpyDeviceToHost,
                                    stream6 ) );
    HANDLE_ERROR( cudaMemcpyAsync( host_7, shifted7,
                                    sizeof(float2),
                                    cudaMemcpyDeviceToHost,
                                    stream7 ) );
    // Streamleri senkronize et
    HANDLE_ERROR( cudaStreamSynchronize( stream0 ) );
    HANDLE_ERROR( cudaStreamSynchronize( stream1 ) );
    HANDLE_ERROR( cudaStreamSynchronize( stream2 ) );
    HANDLE_ERROR( cudaStreamSynchronize( stream3 ) );
    HANDLE_ERROR( cudaStreamSynchronize( stream4 ) );
    HANDLE_ERROR( cudaStreamSynchronize( stream5 ) );
    HANDLE_ERROR( cudaStreamSynchronize( stream6 ) );
    HANDLE_ERROR( cudaStreamSynchronize( stream7 ) );


    // Stop timer
    HANDLE_ERROR( cudaEventRecord( stop, 0 ) );
    HANDLE_ERROR( cudaEventSynchronize( stop ) );
    HANDLE_ERROR( cudaEventElapsedTime( &elapsedTime,
                                        start, stop ) );
    printf( "Time taken:  %3.1f ms\n", elapsedTime );

    FILE *fp2;
    if(NULL == (fp2 = fopen("result.bin","wb+"))){
        printf("can not open file...");
        exit(1);
    }

    fwrite(host_last, sizeof(float2), 131072, fp2);

    printf("signal written \n");
    fflush(stdout);
    fclose(fp2);

    // cleanup the streams and memory
    HANDLE_ERROR( cudaFreeHost( host_0 ) );
    HANDLE_ERROR( cudaFreeHost( host_1 ) );
    HANDLE_ERROR( cudaFreeHost( host_2 ) );
    HANDLE_ERROR( cudaFreeHost( host_3 ) );
    HANDLE_ERROR( cudaFreeHost( host_4 ) );
    HANDLE_ERROR( cudaFreeHost( host_5 ) );
    HANDLE_ERROR( cudaFreeHost( host_6 ) );
    HANDLE_ERROR( cudaFreeHost( host_7 ) );

    HANDLE_ERROR( cudaFree( dev_0 ) );
    HANDLE_ERROR( cudaFree( dev_1 ) );
    HANDLE_ERROR( cudaFree( dev_2 ) );
    HANDLE_ERROR( cudaFree( dev_3 ) );
    HANDLE_ERROR( cudaFree( dev_4 ) );
    HANDLE_ERROR( cudaFree( dev_5 ) );
    HANDLE_ERROR( cudaFree( dev_6 ) );
    HANDLE_ERROR( cudaFree( dev_7 ) );

    cufftDestroy(plan0);
    cufftDestroy(plan1);
    cufftDestroy(plan2);
    cufftDestroy(plan3);
    cufftDestroy(plan4);
    cufftDestroy(plan5);
    cufftDestroy(plan6);
    cufftDestroy(plan7);

    HANDLE_ERROR( cudaStreamDestroy( stream0 ) );
    HANDLE_ERROR( cudaStreamDestroy( stream1 ) );
    HANDLE_ERROR( cudaStreamDestroy( stream2 ) );
    HANDLE_ERROR( cudaStreamDestroy( stream3 ) );
    HANDLE_ERROR( cudaStreamDestroy( stream4 ) );
    HANDLE_ERROR( cudaStreamDestroy( stream5 ) );
    HANDLE_ERROR( cudaStreamDestroy( stream6 ) );
    HANDLE_ERROR( cudaStreamDestroy( stream7 ) );

    printf("hit [enter] to exit...");
    fflush(stdout);
    getchar();

    return 0;
}
Binary file needed to reproduce the problem is within this link :
Binary file
When I run "cuda-memcheck" on release exe file I get the following result:
memcheck result
Solution
When debugging GPU code in Nsight VSE you need to start debugging through the Nsight menu ("Start CUDA Debugging"). See this walkthrough for more information.
EDIT
Based on the additional information you provided, in particular the cuda-memcheck output, it seems like your kernel is not actually being launched. Error 9 is cudaErrorInvalidConfiguration indicating that the launch configuration (blocks, threads/block, smem/block) is incompatible with the device.
cudaErrorInvalidConfiguration = 9

This indicates that a kernel launch is requesting resources that can never be satisfied by the current device. Requesting more shared memory per block than the device supports will trigger this error, as will requesting too many threads or blocks. See cudaDeviceProp for more device limitations.
In fact, you're trying to launch 131072 threads/block which is way above the limits (see the Programming Guide for details and for the specific limits). You should launch smaller blocks and increase the number of blocks accordingly.
As Robert Crovella said, you should always ensure you have proper error checking.
Licensed under: CC-BY-SA with attribution
Not affiliated with StackOverflow