Question

Thank you very much for reading my threads.

I am doing CUDA work, but keep getting cudaDeviceSynchronize() error code 77: cudaErrorIllegalAddress, without any idea why. I did the search for both the code and the function, surprisingly , only a few records showed up. Very strange.

I basically sum up all pixels of images. To make my questions have as much reference as it can, I am showing all my CUDA code here:

#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include "thorcalgpu.h"
#include <stdio.h>
#include "math.h"
#include <vector>
#include <algorithm>
#include <stdlib.h>
#include <stdio.h>
#include <vector>
#include <numeric>
#include <iostream>

using namespace std;

float random_float(void)
{
  return static_cast<float>(rand()) / RAND_MAX;
}


__global__ void reduceSum(unsigned short *input,
                          unsigned long long *per_block_results,
                          const int n)
{
    extern __shared__ unsigned long long sdata[];

    unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;

    // load input into __shared__ memory
    unsigned short x = 0;
    if(i < n)
    {
        x = input[i];
    }
    sdata[threadIdx.x] = x;
    __syncthreads();

    // contiguous range pattern
    for(int offset = blockDim.x / 2; offset > 0; offset >>= 1)
    {
        if(threadIdx.x < offset)
        {
            // add a partial sum upstream to our own
            sdata[threadIdx.x] += sdata[threadIdx.x + offset];
        }

        // wait until all threads in the block have
        // updated their partial sums
        __syncthreads();
    }

    // thread 0 writes the final result
    if(threadIdx.x == 0)
    {
        per_block_results[blockIdx.x] = sdata[0];
    }
}

// Helper function for using CUDA to add vectors in parallel.
//template <class T>
cudaError_t gpuWrapper(float *mean,  int N,  vector<string> filelist)
{
    int size = N*N;
    unsigned long long* dev_sum = 0;
    unsigned short* dev_img = 0;
    cudaError_t cudaStatus;
    const int block_size = 512;
    const int num_blocks = (size/block_size) + ((size%block_size) ? 1 : 0);
    int L = filelist.size();

    // Choose which GPU to run on, change this on a multi-GPU system.

    double totalgpuinittime = 0;
    StartCounter(7);

    cudaStatus = cudaSetDevice(0);
    if (cudaStatus != cudaSuccess) 
    {
        fprintf(stderr, "cudaSetDevice failed!  Do you have a CUDA-capable GPU installed?");
        goto Error;
    }

    // Allocate GPU buffers for three vectors (two input, one output)    .
    cudaStatus = cudaMalloc((void**)&dev_img, size * sizeof(unsigned short));
    if (cudaStatus != cudaSuccess) 
    {
        fprintf(stderr, "cudaMalloc failed!");
        goto Error;
    }

    cudaStatus = cudaMalloc((void**)&dev_sum, num_blocks*sizeof(unsigned long long));
    if (cudaStatus != cudaSuccess) 
    {
        fprintf(stderr, "cudaMalloc failed!");
        goto Error;
    }

    totalgpuinittime = GetCounter(7);

    unsigned short* img;
    unsigned short* pimg;
    unsigned long long* sum = new unsigned long long[num_blocks];
    unsigned long long* psum = sum;

    cout<<endl;
    cout << "gpu looping starts, and in progress ..." << endl;
    StartCounter(6);

    double totalfileiotime = 0;
    double totalh2dcpytime = 0;
    double totalkerneltime = 0;
    double totald2hcpytime = 0;
    double totalcpusumtime = 0;
    double totalloopingtime = 0;

    for (int k = 0; k < L; k++)
    {
        StartCounter(1);
        img = (unsigned short*)LoadTIFF(filelist[k].c_str());
        totalfileiotime += GetCounter(1);

        psum = sum;
        pimg = img;

        float gpumean = 0;

        memset(psum, 0, sizeof(unsigned long long)*num_blocks);

        StartCounter(2);
        // Copy input vectors from host memory to GPU buffers.
        cudaStatus = cudaMemcpy(dev_img, pimg, size * sizeof(unsigned short), cudaMemcpyHostToDevice);
        if (cudaStatus != cudaSuccess) 
        {
            fprintf(stderr, "cudaMemcpy failed!");
            goto Error;
        }

        cudaStatus = cudaMemcpy(dev_sum, psum, num_blocks*sizeof(unsigned long long), cudaMemcpyHostToDevice);
        if (cudaStatus != cudaSuccess) 
        {
            fprintf(stderr, "cudaMemcpy failed!");
            goto Error;
        }   

        totalh2dcpytime += GetCounter(2);

        StartCounter(3);
        //reduceSum<<<num_blocks,block_size,num_blocks * sizeof(unsigned long long)>>>(dev_img, dev_sum, size);
         //reduceSum<<<num_blocks,block_size,block_size * sizeof(unsigned short)>>>(dev_img, dev_sum, size);
          reduceSum<<<num_blocks,block_size>>>(dev_img, dev_sum, size);
        totalkerneltime += GetCounter(3);

      // Check for any errors launching the kernel
        cudaStatus = cudaGetLastError();
        if (cudaStatus != cudaSuccess) 
        {
            fprintf(stderr, "reduction Kernel launch failed: %s\n", cudaGetErrorString(cudaStatus));
            goto Error;
        }

        // cudaDeviceSynchronize waits for the kernel to finish, and returns
        // any errors encountered during the launch.

                // !!!!!! following is where the code 77 error occurs!!!!!!!
        cudaStatus = cudaDeviceSynchronize();
        if (cudaStatus != cudaSuccess) 
        {
            fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching addKernel!\n", cudaStatus);
            goto Error;
        }

        // Copy output vector from GPU buffer to host memory.
        StartCounter(4);
        cudaStatus = cudaMemcpy(psum, dev_sum, num_blocks * sizeof(unsigned long long ), cudaMemcpyDeviceToHost);
        if (cudaStatus != cudaSuccess) 
        {
            fprintf(stderr, "cudaMemcpy failed!");
            goto Error;
        }
        totald2hcpytime += GetCounter(4);

        StartCounter(5);
        for (int i = 0; i < num_blocks; i++)
        {
            gpumean += *psum;
            psum++;
        }

        gpumean /= N*N;
        totalcpusumtime += GetCounter(5);

        delete img; 
        img = NULL;

        cout<<gpumean<<endl;

    }

    int S = 1e+6;
    int F = filelist.size();
    float R = S/F;

    totalloopingtime = GetCounter(6);
    cout<<"gpu looping ends."<<endl<<endl;
    cout<< "analysis:"<<endl;
    cout<<"gpu initialization time: "<<totalgpuinittime<<" sec"<<endl<<endl;
    cout<<"file I/O time: "<<endl;
    cout<<" total "<<totalfileiotime<<" sec | average "<<totalfileiotime*R<<" usec/frame"<<endl<<endl;
    cout<<"host-to-device copy time: "<<endl;
    cout<<" total "<<totalh2dcpytime<<" sec | average "<<totalh2dcpytime*R<<" usec/frame"<<endl<<endl;
    cout<<"pure gpu kerneling time: "<<endl;
    cout<<" total "<<totalkerneltime<<" sec | average "<<totalkerneltime*R<<" usec/frame"<<endl<<endl;
    cout<<"device-to-host copy time: "<<endl;
    cout<<" total "<<totald2hcpytime<<" sec | average "<<totald2hcpytime*R<<" usec/frame"<<endl<<endl;
    /*cout<<"cpu summing time: "<<endl;
    cout<<" total: "<<totalcpusumtime<<" sec | average: "<<totalcpusumtime*R<<" usec/frame"<<endl<<endl;;*/

    /*cout <<"gpu looping time: " << endl;
    cout<<" total: "<<totalloopingtime<<" sec | average: "<<totalloopingtime*R<<" usec/frame"<<endl;*/


Error:
    cudaFree(dev_sum);
    cudaFree(dev_img);

    delete sum;
    sum = NULL;

    return cudaStatus;
}

void kernel(float* &mean, int N, vector<string> filelist)
{
    // wrapper and kernel
    cudaError_t cudaStatus = gpuWrapper(mean, N,  filelist);

    if (cudaStatus != cudaSuccess) 
    {
        fprintf(stderr, "gpuWapper failed!");

    }

   // printf("mean is: %f\n", mean);

    // cudaDeviceReset must be called before exiting in order for profiling and
    // tracing tools such as Nsight and Visual Profiler to show complete traces.

    StartCounter(8);
    cudaStatus = cudaDeviceReset();
    if (cudaStatus != cudaSuccess) 
    {
        fprintf(stderr, "cudaDeviceReset failed!");

    }
    cout<<"gpu reset time: "<<GetCounter(8)<<" sec"<<endl<<endl;
    //return *mean;
}

I have assigned enough and equivalent memory space for both host and device memory. Any comments is appreciated.

Was it helpful?

Solution

While this may not be the only source of error in the code, you are not allocating any dynamic shared memory for the reduction kernel, leading to the illegal addressing error you see. The correct kernel launch should be something like

size_t shm_size = block_size * sizeof(unsigned long long);
reduceSum<<<num_blocks,block_size,shm_size>>>(dev_img, dev_sum, size);

This allocates the equivalent of one unsigned long long for each thread running in the reduction kernel, which (by my very cursory reading of your code) should make the shared memory array sdata the correct size for the kernel to run without out-of-bounds access to that array.

Licensed under: CC-BY-SA with attribution
Not affiliated with StackOverflow
scroll top