Vector addition with streams

https://stackoverflow.com/questions/22126624

18-10-2022
|

Question

I've already seen this question vector addition in CUDA using streams but that's not the problem with my code. Although I'm getting the same error but the root cause is different. When I compile, I get the following error.

Solution is not correct. The solution did not match the expected results at row 0. Expecting (1+0.5=1.5) but got 0.

I tried to print the values inside the kernel and found the calculations to be correct. But when I copy from device to host I see all zeroes being printed.

#include<wb.h>

#define wbCheck(stmt) do {                                                    \
        cudaError_t err = stmt;                                               \
        if (err != cudaSuccess) {                                             \
            wbLog(ERROR, "Failed to run stmt ", #stmt);                       \
            wbLog(ERROR, "Got CUDA error ...  ", cudaGetErrorString(err));    \
            return -1;                                                        \
        }                                                                     \
    } while(0)

#define NUM_STREAMS 2

__global__ void vecAdd(float * in1, float * in2, float * out, int len) {
    //@@ Insert code to implement vector addition here
    int i = blockIdx.x*blockDim.x + threadIdx.x;

    if(i< len)
    {

        out[i]= in1[i]+in2[i];
        printf("Thread %d %f  %f  out %f\n",i,in1[i],in2[i],out[i]);
    }
}

int main(int argc, char ** argv) {
    wbArg_t args;
    int inputLength;
    float * hostInput1;
    float * hostInput2;
    float * hostOutput;
    float * deviceInput1;
    float * deviceInput2;
    float * deviceOutput;

    args = wbArg_read(argc, argv);

    wbTime_start(Generic, "Importing data and creating memory on host");
    hostInput1 = (float *) wbImport(wbArg_getInputFile(args, 0), &inputLength);
    hostInput2 = (float *) wbImport(wbArg_getInputFile(args, 1), &inputLength);
    hostOutput = (float *) malloc(inputLength * sizeof(float));
    wbTime_stop(Generic, "Importing data and creating memory on host");

    float *h_A, *h_B, *h_C;
    float *d_A0, *d_B0, *d_C0; //Device memory for stream0
    float *d_A1, *d_B1, *d_C1; //Device memory for stream1

    cudaHostAlloc((void**)&h_A, inputLength*sizeof(float), cudaHostAllocDefault);
    cudaHostAlloc((void**)&h_B, inputLength*sizeof(float), cudaHostAllocDefault);
    cudaHostAlloc((void**)&h_C, inputLength*sizeof(float), cudaHostAllocDefault);

    memcpy(h_A, hostInput1,inputLength*sizeof(float));
    memcpy(h_B, hostInput2,inputLength*sizeof(float));
    printf("%f %f\n", h_A[0],hostInput1[0]);
    printf("%f %f \n",h_A[1],hostInput1[1]);

    printf("Input length is %d\n", inputLength);


    int nstreams = NUM_STREAMS;
    cudaStream_t *streams = (cudaStream_t*) malloc(nstreams * sizeof(cudaStream_t));
    for(int i = 0; i < nstreams; i++)
        cudaStreamCreate(&(streams[i]));


    long segSize = 1024;

    wbCheck(cudaMalloc((void **)&d_A0, segSize*sizeof(float)));
    wbCheck(cudaMalloc((void **)&d_A1, segSize*sizeof(float)));
    wbCheck(cudaMalloc((void **)&d_B0, segSize*sizeof(float)));
    wbCheck(cudaMalloc((void **)&d_B1, segSize*sizeof(float)));
    wbCheck(cudaMalloc((void **)&d_C0, segSize*sizeof(float)));
    wbCheck(cudaMalloc((void **)&d_C1, segSize*sizeof(float)));


    for(int i=0; i< inputLength; i+=segSize*2)
    {

        if(i+segSize <= inputLength)
        {
            cudaMemcpyAsync(d_A0,h_A+i,segSize*sizeof(float),cudaMemcpyHostToDevice,streams[0]);
            cudaMemcpyAsync(d_B0,h_B+i,segSize*sizeof(float),cudaMemcpyHostToDevice,streams[0]);

            if(i+2*segSize <= inputLength )
            {
                cudaMemcpyAsync(d_A1,h_A+i+segSize,segSize*sizeof(float),cudaMemcpyHostToDevice,streams[1]);
                cudaMemcpyAsync(d_B1,h_B+i+segSize,segSize*sizeof(float),cudaMemcpyHostToDevice,streams[1]);
            }
            else
            {
                cudaMemcpyAsync(d_A1,h_A+i+segSize,(inputLength-i-segSize)*sizeof(float),cudaMemcpyHostToDevice,streams[1]);
                cudaMemcpyAsync(d_B1,h_B+i+segSize,(inputLength-i-segSize)*sizeof(float),cudaMemcpyHostToDevice,streams[1]);

            }
        }
        else
        {
            cudaMemcpyAsync(d_A0,h_A+i,(inputLength-i)*sizeof(float),cudaMemcpyHostToDevice,streams[0]);
            cudaMemcpyAsync(d_B0,h_B+i,(inputLength-i)*sizeof(float),cudaMemcpyHostToDevice,streams[0]);
        }


        if(i+segSize <= inputLength)
        {

            vecAdd<<<segSize/256, 256, 1, streams[0]>>>(d_A0,d_B0,d_C0, segSize);
            if(i+2*segSize <= inputLength )
            {
                vecAdd<<<segSize/256, 256, 1, streams[1]>>>(d_A1,d_B1,d_C1, segSize);
            }
            else
            {
                vecAdd<<<segSize/256, 256, 1, streams[1]>>>(d_A1,d_B1,d_C1, inputLength-i-segSize);
            }

        }
        else
        {
            vecAdd<<<segSize/256, 256, 1, streams[0]>>>(d_A0,d_B0,d_C0, inputLength-i);
        }


        if(i+segSize <= inputLength)
        {
            cudaMemcpyAsync(h_C+i,d_C0,segSize*sizeof(float),cudaMemcpyDeviceToHost,streams[0]);

            if(i+2*segSize <= inputLength )
            {
                                    cudaMemcpyAsync(h_C+i+segSize,d_C1,segSize*sizeof(float),cudaMemcpyDeviceToHost,streams[1]);
                printf("hello %f\n", h_C[0]);
            }
            else
            {
                cudaMemcpyAsync(h_C+i+segSize,d_C1,(inputLength-i-segSize)*sizeof(float),cudaMemcpyDeviceToHost,streams[1]);
            }
        }
        else
        {
            cudaMemcpyAsync(h_C+i,d_C0,(inputLength-i)*sizeof(float),cudaMemcpyDeviceToHost,streams[0]);
        }
    }

    memcpy(hostOutput, h_C, inputLength*sizeof(float)); 

    wbSolution(args, hostOutput, inputLength); //hostOutput and h_C contains all zeroes 

    free(hostInput1);
    free(hostInput2);
    free(hostOutput);

    cudaFree(d_A0);
    cudaFree(d_A1);
    cudaFree(d_B0);
    cudaFree(d_B1);
    cudaFree(d_C0);
    cudaFree(d_C1);

    return 0;
}

Solution

As @hubs suggested in his comment below that I should be using cudaDeviceSynchronize(); before memcpy, the suggestion worked.

Licensed under: CC-BY-SA with attribution

Not affiliated with StackOverflow