I tried to use 2 different streams such as shown in the book "CUDA By Example" and apply it to some code, unfortunately when I run it I obtain a segmentation fault. I had previously written the code without streams, and it was working, but I can't see the problem here.
#include <cuda.h>
#include <stdio.h>
__global__
void GPU(node *tree ,char *data,int *out){
int tid = blockIdx.x * blockDim.x + threadIdx.x;
}
void streamTest(wrapp * wrap, char *data){
int size = wrap->size;
nodes *tree = wrap->nodes;
char *data_d0;
nodes *tree_d0;
int *out_d0;
char *data_d1;
nodes *tree_d1;
int *out_d1;
char *data_h;
nodes *tree_h;
int *out_h;
const int N = 100000000/100;
cudaStream_t stream0, stream1;
cudaMalloc((void **)&data_d0, N * sizeof(char));
cudaMalloc((void **)&tree_d0, (wrap->size*sizeof(nodes)));
cudaMalloc((void **)&out_d0, sizeof(int));
cudaMalloc((void **)&data_d1, N * sizeof(char));
cudaMalloc((void **)&tree_d1, (wrap->size*sizeof(nodes)));
cudaMalloc((void **)&out_d1, sizeof(int));
cudaHostAlloc((void**)&data_h, 100000000*(sizeof(char)),cudaHostAllocDefault);
cudaHostAlloc((void**)&tree_h, wrap->size*(sizeof(nodes)),cudaHostAllocDefault);
cudaHostAlloc((void**)&out_h, (sizeof(int)),cudaHostAllocDefault);
int x;
for(x=0; x<100000000; x++){
data_h[x] = data_h[x];
}
int z;
for(x=0; x<wrap->size;x++){
tree_h[x].value = wrap->nodes[x].value;
for(z=0; z<32; z++){
tree_h[x].array[z] = wrap->nodes[x].array[z];
}
}
for(x=0; x<100000000; x+=N*2){
cudaMemcpyAsync(data_d0, data_h+x, N*sizeof(char), cudaMemcpyHostToDevice, stream0);
cudaMemcpyAsync(data_d1, data_h+x+N, N*sizeof(char), cudaMemcpyHostToDevice, stream1);
cudaMemcpyAsync(tree_d0, tree_h, wrap->size*sizeof(nodes), cudaMemcpyHostToDevice, stream0);
cudaMemcpyAsync(tree_d1, tree_h, wrap->size*sizeof(nodes), cudaMemcpyHostToDevice, stream1);
GPU<<<256,256,0,stream0>>>(tree_d0, data_d0, out_d0 );
GPU<<<256,256,0,stream1>>>(tree_d1, data_d1, out_d1);
}
}
int main(void){
char *data = (char *)calloc('a', 100000000 *sizeof(char));
nodes *node = (nodes *) malloc(sizeof(nodes));
streamTest(wrap, data);
}
When I try to run this I obtain :
./a.out
Segmentation fault (core dumped)
and when I check var/log/kern.log
I can see the following :
a.out[20204]: segfault at 4 ip 00007fd26303f92c sp 00007fff7694efb8 error 4 in libcuda.so.331.49[7fd262e09000+b6f000]