I have been trying to use streams and 1D texture, but the texture seem to be empty each time i look inside it. In the beginning my plan was to use 2 streams, but I couldn't access the texture, so I reduced the number of streams to 1 (for debugging), reduced the kernel to 1 block of 1 thread, as you can see below.
#include <stdio.h>
#include <string.h>
#include <cuda.h>
texture <int,1,cudaReadModeElementType> tex1;
__global__
void textureTest(int *out){
int tid = blockIdx.x * blockDim.x + threadIdx.x;
float x;
int i;
for(i=0; i<30*8; i++){
x = tex1Dfetch(tex1, i);
printf("%d: %d \n ",i,x);
}
out[0]=x;
}
void testTextureCPU(){
const int N = 100/2;
int *array_d0;
int *array_d1;
int *array_h;
int x=0;
int *out_d0 =(int *)calloc(1, sizeof(int));
int *out_d1 =(int *)calloc(1, sizeof(int));
int *out_h =(int *)calloc(2, sizeof(int));
cudaStream_t stream0, stream1;
cudaStreamCreate(&stream0);
cudaStreamCreate(&stream1);
cudaHostAlloc((void**)&array_d0, (30 * 8*sizeof(int)),cudaHostAllocDefault);
cudaHostAlloc((void**)&array_d1, (30 * 8*sizeof(int)),cudaHostAllocDefault);
cudaHostAlloc((void**)&array_h, (30 * 8*sizeof(int)),cudaHostAllocDefault);
cudaMalloc((void **)&out_d0, 1 *sizeof(int));
cudaMalloc((void **)&out_d1, 1 *sizeof(int));
cudaHostAlloc((void**)&out_h, (2*sizeof(int)),cudaHostAllocDefault);
array_h[8 * 10 + 0] = 10;
array_h[8 * 11 + 1] = 11;
array_h[8 * 12 + 2] = 12;
array_h[8 * 13 + 3] = 13;
array_h[8 * 14 + 4] = 14;
array_h[8 * 15 + 5] = 15;
array_h[8 * 16 + 6] = 16;
array_h[8 * 17 + 7] = 17;
for(x=0; x<2; x++){
cudaMemcpyAsync(array_d0, array_h, (30 * 8*sizeof(int)), cudaMemcpyHostToDevice, stream0);
cudaMemcpyAsync(array_d1, array_h, (30 * 8*sizeof(int)), cudaMemcpyHostToDevice, stream1);
cudaBindTexture(NULL,tex1,array_d0, (30 * 8 *sizeof(int)));
textureTest<<<1,2,0,stream0>>>(out_d0);
cudaBindTexture(NULL,tex1,array_d0, (30 * 8 *sizeof(int)));
textureTest<<<1,2,0,stream1>>>(out_d1);
cudaMemcpyAsync(out_h+x, out_d0 , 1 * sizeof(int), cudaMemcpyDeviceToHost, stream0);
cudaMemcpyAsync(out_h+x+N, out_d1 ,1 * sizeof(int), cudaMemcpyDeviceToHost, stream1);
}
}
int main(void){
testTextureCPU();
return 0;
}
But I can't figure out what's wrong with this code, and how to make it work for one or more streams.