Segfault when using CUDA streams

https://stackoverflow.com/questions/21977306

15-10-2022
|

Question

I tried to use two CUDA streams, as shown in the book "CUDA by Example", and apply the technique to some of my own code. Unfortunately, when I run it I get a segmentation fault. I had previously written the code without streams and it worked, but I can't see the problem here.

#include <cuda.h>
#include <stdio.h>
#include <stdlib.h>


// Placeholder kernel: each thread only computes its flat 1D global index and
// does nothing else with it.
//
// tree, data, out: pointers supplied by the launch site; none of them is
// dereferenced in this body. The tree parameter uses the `nodes` type so the
// signature matches the `nodes *` buffers passed at the launch sites.
__global__
void GPU(nodes *tree, char *data, int *out){
    // Flat global thread index for a 1D grid of 1D blocks.
    int tid = blockIdx.x * blockDim.x + threadIdx.x;
}



// Stages input in pinned host memory, then walks the data in chunks of N
// bytes, issuing asynchronous host-to-device copies and one GPU kernel launch
// on each of two streams per loop iteration.
//
// wrap: supplies `size` (number of `nodes` elements) and the `nodes` array,
//       which is copied into a pinned host buffer and from there into both
//       device tree buffers.
// data: host input buffer.
//       NOTE(review): never read in this function -- see the self-assignment
//       in the first loop below.
//
// NOTE(review): no CUDA return code is checked, nothing is synchronized before
// returning, and none of the device or pinned-host allocations are freed.
void streamTest(wrapp * wrap, char *data){

    // NOTE(review): `size` and `tree` are not used again; the code below reads
    // wrap->size and wrap->nodes directly.
    int size = wrap->size;
    nodes *tree = wrap->nodes;

    // Device buffers used with stream0.
    char *data_d0;
    nodes *tree_d0;
    int *out_d0;

    // Device buffers used with stream1.
    char *data_d1;
    nodes *tree_d1;
    int *out_d1;

    // Host-side staging buffers (allocated with cudaHostAlloc below).
    char *data_h;
    nodes *tree_h;
    int *out_h;

    // Bytes of input copied per stream per loop iteration (1,000,000).
    const int N = 100000000/100;

    // NOTE(review): the stream handles are only declared here. No
    // cudaStreamCreate call initializes them before they are passed to
    // cudaMemcpyAsync and to the kernel launches below.
    cudaStream_t stream0, stream1;

     cudaMalloc((void **)&data_d0, N * sizeof(char));
     cudaMalloc((void **)&tree_d0, (wrap->size*sizeof(nodes)));
     cudaMalloc((void **)&out_d0,  sizeof(int));


     cudaMalloc((void **)&data_d1, N * sizeof(char));
     cudaMalloc((void **)&tree_d1, (wrap->size*sizeof(nodes)));
     cudaMalloc((void **)&out_d1,  sizeof(int));


     // Page-locked host allocations: the full 100,000,000-byte input, a copy
     // of the node array, and a single int.
     // NOTE(review): out_h is allocated but never used afterwards.
     cudaHostAlloc((void**)&data_h, 100000000*(sizeof(char)),cudaHostAllocDefault);
     cudaHostAlloc((void**)&tree_h, wrap->size*(sizeof(nodes)),cudaHostAllocDefault);
     cudaHostAlloc((void**)&out_h, (sizeof(int)),cudaHostAllocDefault);



    int x;
    // NOTE(review): this assigns data_h[x] to itself, so the `data` argument
    // is never copied into the pinned buffer; `data_h[x] = data[x]` was
    // presumably intended -- confirm.
    for(x=0; x<100000000; x++){
        data_h[x] = data_h[x];
    }

    int z;
    // Copy each node's `value` and entries 0..31 of its `array` into the
    // pinned node buffer.
    for(x=0; x<wrap->size;x++){
        tree_h[x].value = wrap->nodes[x].value;

        for(z=0; z<32; z++){
          tree_h[x].array[z] = wrap->nodes[x].array[z];  
        }
    }

    // Each iteration consumes 2*N bytes of data_h: stream0 receives
    // [x, x+N) and stream1 receives [x+N, x+2N).
    for(x=0; x<100000000; x+=N*2){

         cudaMemcpyAsync(data_d0, data_h+x, N*sizeof(char), cudaMemcpyHostToDevice, stream0);
         cudaMemcpyAsync(data_d1, data_h+x+N, N*sizeof(char), cudaMemcpyHostToDevice, stream1);


         // The same node buffer is re-copied to both devices buffers on every
         // iteration.
         cudaMemcpyAsync(tree_d0, tree_h, wrap->size*sizeof(nodes), cudaMemcpyHostToDevice, stream0);
         cudaMemcpyAsync(tree_d1, tree_h, wrap->size*sizeof(nodes), cudaMemcpyHostToDevice, stream1);

        // One launch per stream: 256 blocks of 256 threads, no dynamic
        // shared memory.
        // NOTE(review): out_d0 / out_d1 are never copied back to the host.
        GPU<<<256,256,0,stream0>>>(tree_d0, data_d0, out_d0 );
        GPU<<<256,256,0,stream1>>>(tree_d1, data_d1, out_d1);

    } 

}

// Entry point: allocates a zero-filled 100,000,000-byte host buffer and one
// `nodes` element, then calls streamTest().
//
// Returns 0 on success, 1 if a host allocation fails.
//
// NOTE(review): `wrap` is not declared anywhere in the posted snippet. It is
// presumably a `wrapp` whose `nodes` member points at `node` -- confirm
// against the full program.
int main(void){

    const size_t dataLen = 100000000;

    // calloc takes (element count, element size). Passing 'a' (97) as the
    // count would request 97 * 100,000,000 bytes instead of 100,000,000.
    // NOTE(review): if the buffer was meant to be filled with 'a', that needs
    // a separate fill after allocation -- confirm the intent.
    char *data = (char *)calloc(dataLen, sizeof(char));
    if (data == NULL) {
        fprintf(stderr, "failed to allocate the host data buffer\n");
        return 1;
    }

    nodes *node = (nodes *) malloc(sizeof(nodes));
    if (node == NULL) {
        fprintf(stderr, "failed to allocate a nodes element\n");
        free(data);
        return 1;
    }

    streamTest(wrap, data);

    // streamTest() stages everything it transfers in its own pinned buffers,
    // so the host allocations made here can be released once it returns.
    free(node);
    free(data);
    return 0;
}

When I try to run this, I get:

./a.out Segmentation fault (core dumped)

and when I check /var/log/kern.log, I see the following:

a.out[20204]: segfault at 4 ip 00007fd26303f92c sp 00007fff7694efb8 error 4 in libcuda.so.331.49[7fd262e09000+b6f000]

Solution

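In order to use streams, you have to create them first. Declaring a `cudaStream_t` variable only gives you an uninitialized handle; `cudaStreamCreate` must be called on it before the handle is passed to `cudaMemcpyAsync` or used in a kernel launch.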

When I modify your code like so:

 // Declaring the handles does not create the streams.
 cudaStream_t stream0, stream1;
 cudaStreamCreate(&stream0);    // add this line: creates stream0 before its first use
 cudaStreamCreate(&stream1);    // add this line: creates stream1 before its first use

With that change, the segfault goes away for me.

Licensed under: CC-BY-SA with attribution

Not affiliated with StackOverflow