Question

I have been trying to use streams and 1D texture, but the texture seem to be empty each time i look inside it. In the beginning my plan was to use 2 streams, but I couldn't access the texture, so I reduced the number of streams to 1 (for debugging), reduced the kernel to 1 block of 1 thread, as you can see below.

#include <stdio.h>
#include <string.h>
#include <cuda.h>

texture <int,1,cudaReadModeElementType> tex1;

__global__
void textureTest(int *out){
    int  tid =  blockIdx.x * blockDim.x + threadIdx.x;
    float x;
    int i;
    for(i=0; i<30*8; i++){
        x = tex1Dfetch(tex1, i);
        printf("%d: %d \n ",i,x);
    }
    out[0]=x;
}

void testTextureCPU(){
    const int N = 100/2;
    int *array_d0;
    int *array_d1;
    int *array_h;
    int x=0;
    int *out_d0 =(int *)calloc(1, sizeof(int));
    int *out_d1 =(int *)calloc(1, sizeof(int));
    int *out_h =(int *)calloc(2, sizeof(int));

    cudaStream_t stream0, stream1;
    cudaStreamCreate(&stream0);
    cudaStreamCreate(&stream1);   

    cudaHostAlloc((void**)&array_d0, (30 * 8*sizeof(int)),cudaHostAllocDefault);
    cudaHostAlloc((void**)&array_d1, (30 * 8*sizeof(int)),cudaHostAllocDefault);
    cudaHostAlloc((void**)&array_h, (30 * 8*sizeof(int)),cudaHostAllocDefault);

    cudaMalloc((void **)&out_d0,  1 *sizeof(int));
    cudaMalloc((void **)&out_d1,  1 *sizeof(int));
    cudaHostAlloc((void**)&out_h, (2*sizeof(int)),cudaHostAllocDefault);


    array_h[8 * 10 + 0] = 10;
    array_h[8 * 11 + 1] = 11;
    array_h[8 * 12 + 2] = 12;
    array_h[8 * 13 + 3] = 13;
    array_h[8 * 14 + 4] = 14;
    array_h[8 * 15 + 5] = 15;
    array_h[8 * 16 + 6] = 16;
    array_h[8 * 17 + 7] = 17;

    for(x=0; x<2; x++){

        cudaMemcpyAsync(array_d0, array_h, (30 * 8*sizeof(int)), cudaMemcpyHostToDevice, stream0);
        cudaMemcpyAsync(array_d1, array_h, (30 * 8*sizeof(int)), cudaMemcpyHostToDevice, stream1);

        cudaBindTexture(NULL,tex1,array_d0, (30 * 8 *sizeof(int)));

        textureTest<<<1,2,0,stream0>>>(out_d0);

        cudaBindTexture(NULL,tex1,array_d0, (30 * 8 *sizeof(int)));

        textureTest<<<1,2,0,stream1>>>(out_d1);

        cudaMemcpyAsync(out_h+x, out_d0 , 1 * sizeof(int), cudaMemcpyDeviceToHost, stream0);
        cudaMemcpyAsync(out_h+x+N, out_d1 ,1 * sizeof(int), cudaMemcpyDeviceToHost, stream1);
    }
} 

int main(void){
    testTextureCPU();
    return 0;
}

But I can't figure out what's wrong with this code, and how to make it work for one or more streams.

Was it helpful?

Solution

Your edited code contains a number of absolutely elementary errors which have nothing to do with textures or their usage with streams:

  1. In the kernel, you have a broken printf statement which treats a floating point value as an integer
  2. In the host code, the host memory you use to populate the texture is mostly uninitialised
  3. Within the host loop, there is a terrible buffer overflow with the second cudaMemcpyAsync call

If you fix these three things, the code works as expected. I would suggest paying a little more attention to the quality of you code in future.

Licensed under: CC-BY-SA with attribution
Not affiliated with StackOverflow
scroll top