How to solve CUDA Thrust library - for_each synchronization error?

https://stackoverflow.com//questions/9645054

10-12-2019
|

Question

I'm trying to modify a simple dynamic vector in CUDA using the thrust library of CUDA. But I'm getting "launch_closure_by_value" error on the screen indicatiing that the error is related to some synchronization process.

A simple 1D dynamic array modification is not possible due to this error.

My code segment which is causing the error is as follows.

from a .cpp file I call setIndexedGrid, which is defined in System.cu

float* a= (float*)(malloc(8*sizeof(float))); 
a[0]= 0; a[1]= 1; a[2]= 2; a[3]= 3; a[4]= 4; a[5]= 5; a[6]= 6; a[7]= 7;
float* b = (float*)(malloc(8*sizeof(float)));
setIndexedGridInfo(a,b);

The code segment at System.cu:

void
setIndexedGridInfo(float* a, float*b)
{

    thrust::device_ptr<float> d_oldData(a);
    thrust::device_ptr<float> d_newData(b);

    float c = 0.0;

    thrust::for_each(
        thrust::make_zip_iterator(thrust::make_tuple(d_oldData,d_newData)),
        thrust::make_zip_iterator(thrust::make_tuple(d_oldData+8,d_newData+8)),
        grid_functor(c));
}

grid_functor is defined in _kernel.cu

struct grid_functor
{
    float a;

    __host__ __device__
    grid_functor(float grid_Info) : a(grid_Info) {}

    template <typename Tuple>
    __device__
    void operator()(Tuple t)
    {
        volatile float data = thrust::get<0>(t);
        float pos = data + 0.1;
        thrust::get<1>(t) = pos;
    }

};

I also get these on the Output window (I use Visual Studio):

First-chance exception at 0x000007fefdc7cacd in Particles.exe: Microsoft C++ exception: cudaError_enum at memory location 0x0029eb60.. First-chance exception at 0x000007fefdc7cacd in smokeParticles.exe: Microsoft C++ exception: thrust::system::system_error at memory location 0x0029ecf0.. Unhandled exception at 0x000007fefdc7cacd in Particles.exe: Microsoft C++ exception: thrust::system::system_error at memory location 0x0029ecf0..

What is causing the problem?

Solution

You are trying to use host memory pointers in functions expecting pointers in device memory. This code is the problem:

float* a= (float*)(malloc(8*sizeof(float))); 
a[0]= 0; a[1]= 1; a[2]= 2; a[3]= 3; a[4]= 4; a[5]= 5; a[6]= 6; a[7]= 7;
float* b = (float*)(malloc(8*sizeof(float)));
setIndexedGridInfo(a,b);

.....

thrust::device_ptr<float> d_oldData(a);
thrust::device_ptr<float> d_newData(b);

The thrust::device_ptr is intended for "wrapping" a device memory pointer allocated with the CUDA API so that thrust can use it. You are trying to treat a host pointer directly as a device pointer. That is illegal. You could modify your setIndexedGridInfo function like this:

void setIndexedGridInfo(float* a, float*b, const int n)
{

    thrust::device_vector<float> d_oldData(a,a+n);
    thrust::device_vector<float> d_newData(b,b+n);

    float c = 0.0;

    thrust::for_each(
        thrust::make_zip_iterator(thrust::make_tuple(d_oldData.begin(),d_newData.begin())),
        thrust::make_zip_iterator(thrust::make_tuple(d_oldData.end(),d_newData.end())),
        grid_functor(c));
}

The device_vector constructor will allocate device memory and then copy the contents of your host memory to the device. That should fix the error you are seeing, although I am not sure what you are trying to do with the for_each iterator and whether the functor you have wrttien is correct.

Edit:

Here is a complete, compilable, runnable version of your code:

#include <cstdlib>
#include <cstdio>
#include <thrust/device_vector.h>
#include <thrust/for_each.h>
#include <thrust/copy.h>

struct grid_functor
{
    float a;

    __host__ __device__
    grid_functor(float grid_Info) : a(grid_Info) {}

    template <typename Tuple>
    __device__
    void operator()(Tuple t)
    {
        volatile float data = thrust::get<0>(t);
        float pos = data + 0.1f;
        thrust::get<1>(t) = pos;
    }

};

void setIndexedGridInfo(float* a, float*b, const int n)
{

    thrust::device_vector<float> d_oldData(a,a+n);
    thrust::device_vector<float> d_newData(b,b+n);

    float c = 0.0;

    thrust::for_each(
        thrust::make_zip_iterator(thrust::make_tuple(d_oldData.begin(),d_newData.begin())),
        thrust::make_zip_iterator(thrust::make_tuple(d_oldData.end(),d_newData.end())),
        grid_functor(c));

    thrust::copy(d_newData.begin(), d_newData.end(), b);
}

int main(void)
{
    const int n = 8;
    float* a= (float*)(malloc(n*sizeof(float))); 
    a[0]= 0; a[1]= 1; a[2]= 2; a[3]= 3; a[4]= 4; a[5]= 5; a[6]= 6; a[7]= 7;
    float* b = (float*)(malloc(n*sizeof(float)));
    setIndexedGridInfo(a,b,n);

    for(int i=0; i<n; i++) {
        fprintf(stdout, "%d (%f,%f)\n", i, a[i], b[i]);
    }

    return 0;
}

I can compile and run this code on an OS 10.6.8 host with CUDA 4.1 like this:

$ nvcc -Xptxas="-v" -arch=sm_12 -g -G thrustforeach.cu 
./thrustforeach.cu(18): Warning: Cannot tell what pointer points to, assuming global memory space
./thrustforeach.cu(20): Warning: Cannot tell what pointer points to, assuming global memory space
./thrustforeach.cu(18): Warning: Cannot tell what pointer points to, assuming global memory space
./thrustforeach.cu(20): Warning: Cannot tell what pointer points to, assuming global memory space
ptxas info    : Compiling entry function '_ZN6thrust6detail7backend4cuda6detail23launch_closure_by_valueINS2_18for_each_n_closureINS_12zip_iteratorINS_5tupleINS0_15normal_iteratorINS_10device_ptrIfEEEESB_NS_9null_typeESC_SC_SC_SC_SC_SC_SC_EEEEi12grid_functorEEEEvT_' for 'sm_12'
ptxas info    : Used 14 registers, 160+0 bytes lmem, 16+16 bytes smem, 4 bytes cmem[1]
ptxas info    : Compiling entry function '_ZN6thrust6detail7backend4cuda6detail23launch_closure_by_valueINS2_18for_each_n_closureINS_12zip_iteratorINS_5tupleINS0_15normal_iteratorINS_10device_ptrIfEEEESB_NS_9null_typeESC_SC_SC_SC_SC_SC_SC_EEEEj12grid_functorEEEEvT_' for 'sm_12'
ptxas info    : Used 14 registers, 160+0 bytes lmem, 16+16 bytes smem, 4 bytes cmem[1]

$ ./a.out
0 (0.000000,0.100000)
1 (1.000000,1.100000)
2 (2.000000,2.100000)
3 (3.000000,3.100000)
4 (4.000000,4.100000)
5 (5.000000,5.100000)
6 (6.000000,6.100000)
7 (7.000000,7.100000)

Licensed under: CC-BY-SA with attribution

Not affiliated with StackOverflow