Question

I'm trying very hard to make a code run in parallel (CUDA). The code, simplified, is:

float sum = ...         //sum = some number
for (i = 0; i < N; i++){
    f = ...             // f = a function that returns a float and puts it into f
    sum += f;
}

The issue I'm having is sum+=f because it requires sum to be shared between the threads. I tried using the __shared__ argument when declaring sum (__shared__ float sum), but that didn't work (it didn't give me the proper result). I've also heard about reduction (and know how to use it on OpenMP), but have no idea how to apply it here.

Any help would be greatly appreciated. Thanks!

Was it helpful?

Solution

Here is the code:

#include <stdio.h>
__global__ void para(float* f, int len) {
    // One thread per element: write each element's own global index into f,
    // i.e. f[i] = i (the int converts implicitly to float).
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx >= len) return;   // the grid may overshoot len; guard the tail
    f[idx] = idx;
}

int main(int argc, char ** argv) {
    // Driver: compute f[i] = i on the GPU with the `para` kernel, copy the
    // array back, and sum it serially on the host.  Returns 0 on success,
    // 1 on any allocation or CUDA error.
    int inputLength = 1024;
    float * h_f;
    float * d_f;
    int size = inputLength * sizeof(float);
    cudaError_t err;

    h_f = (float *) malloc(size);
    if (h_f == NULL) {
        fprintf(stderr, "host malloc failed\n");
        return 1;
    }
    err = cudaMalloc((void**)&d_f, size);
    if (err != cudaSuccess) {
        fprintf(stderr, "cudaMalloc: %s\n", cudaGetErrorString(err));
        free(h_f);
        return 1;
    }
    // NOTE: the original code copied h_f (uninitialized malloc'd memory) to
    // the device here.  That copy is removed: the kernel overwrites every
    // element of d_f, so nothing needs to be uploaded.

    // Ceiling division: enough blocks to cover inputLength, and no wasted
    // extra block when inputLength is an exact multiple of 256.
    dim3 DimGrid((inputLength + 255) / 256, 1, 1);
    dim3 DimBlock(256, 1, 1);

    para<<<DimGrid , DimBlock>>>(d_f , inputLength);
    err = cudaGetLastError();           // catches bad launch configuration
    if (err != cudaSuccess) {
        fprintf(stderr, "para launch: %s\n", cudaGetErrorString(err));
        cudaFree(d_f); free(h_f);
        return 1;
    }
    // cudaThreadSynchronize() is deprecated; cudaDeviceSynchronize() is the
    // modern equivalent, and its return value surfaces async kernel errors.
    err = cudaDeviceSynchronize();
    if (err != cudaSuccess) {
        fprintf(stderr, "para execution: %s\n", cudaGetErrorString(err));
        cudaFree(d_f); free(h_f);
        return 1;
    }

    err = cudaMemcpy(h_f, d_f, size , cudaMemcpyDeviceToHost);
    if (err != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy D2H: %s\n", cudaGetErrorString(err));
        cudaFree(d_f); free(h_f);
        return 1;
    }

    cudaFree(d_f);

    // Serial reduction on the host (NOT parallel -- the GPU version of this
    // step is shown in the EDIT further down the page).
    int i;
    float sum = 0;
    for (i = 0; i < inputLength; i++)
        sum += h_f[i];

    printf("%6.4f\n",sum);

    free(h_f);

    return 0;
}

Note that the section marked "parallel reduction" above is actually a serial sum performed on the host after copying the data back. It can be replaced by a true CUDA parallel sum reduction (for example this one); the EDIT below shows one way to do that on the device.

EDIT:

Here is the code for performing parallel reduction with CUDA:

#include <stdio.h>
__global__ void para(float* f, int len) {
    // Initialization kernel: each thread stores its own global index,
    // so after the launch f holds 0, 1, 2, ..., len-1.
    const int gid = threadIdx.x + blockDim.x * blockIdx.x;
    if (gid >= len) return;   // extra threads in the last block do nothing
    f[gid] = (float)gid;
}
__global__ void strideSum(float *f, int len, int strid){
    // One pairwise-reduction step: fold the upper part of f[0..len) onto the
    // lower part, f[i] += f[i + strid].
    int i = threadIdx.x + blockDim.x * blockIdx.x;
    // The added "i < strid" guard keeps writers out of the region other
    // threads read: with odd len and strid == len/2, the original condition
    // alone let thread i == strid write f[strid] while thread 0 was reading
    // it -- a data race.  For power-of-two lengths the behavior is unchanged.
    if (i < strid && i + strid < len) {
        f[i] = f[i] + f[i + strid];
    }
}

#define BLOCKSIZE 256
int main(int argc, char ** argv) {
    // Driver: fill d_f with f[i] = i on the GPU, then reduce it to a single
    // sum twice -- serially on the host (validity check) and with repeated
    // strideSum launches on the device -- and print both results.
    int inputLength = 4096;
    float * h_f;
    float * d_f;
    int size = inputLength * sizeof(float);
    cudaError_t err;

    h_f = (float *) malloc(size);
    if (h_f == NULL) {
        fprintf(stderr, "host malloc failed\n");
        return 1;
    }
    err = cudaMalloc((void**)&d_f , size);
    if (err != cudaSuccess) {
        fprintf(stderr, "cudaMalloc: %s\n", cudaGetErrorString(err));
        free(h_f);
        return 1;
    }
    // No host->device copy: h_f is uninitialized at this point and the
    // kernel overwrites every element of d_f anyway.

    // Ceiling division avoids launching a needless extra block when
    // inputLength divides evenly by BLOCKSIZE.
    dim3 DimGrid((inputLength + BLOCKSIZE - 1) / BLOCKSIZE, 1, 1);
    dim3 DimBlock(BLOCKSIZE , 1, 1);

    para<<<DimGrid , DimBlock>>>(d_f , inputLength);
    err = cudaGetLastError();           // bad launch configuration
    if (err != cudaSuccess) {
        fprintf(stderr, "para launch: %s\n", cudaGetErrorString(err));
        cudaFree(d_f); free(h_f);
        return 1;
    }
    // cudaThreadSynchronize() is deprecated; use cudaDeviceSynchronize().
    err = cudaDeviceSynchronize();
    if (err != cudaSuccess) {
        fprintf(stderr, "para execution: %s\n", cudaGetErrorString(err));
        cudaFree(d_f); free(h_f);
        return 1;
    }

    int i;
    float sum = 0, d_sum = 0;
    // Serial sum on host. YOU CAN SAFELY COMMENT FOLLOWING COPY AND LOOP.
    // Intended only as a validity check for the device result.
    err = cudaMemcpy(h_f, d_f, size , cudaMemcpyDeviceToHost);
    if (err != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy D2H: %s\n", cudaGetErrorString(err));
        cudaFree(d_f); free(h_f);
        return 1;
    }
    for(i=0; i<inputLength; i++)
        sum += h_f[i];

    // Parallel reduction on the GPU.  The live length is halved with
    // CEILING division ((i + 1) / 2) rather than floor: with floor halving,
    // an odd length drops its last element and lets one thread write a slot
    // another thread is reading.  Ceiling halving is correct and race-free
    // for ANY inputLength, not just powers of two, and is identical to the
    // original loop when inputLength is a power of two.
    for (i = inputLength; i > 1; i = (i + 1) / 2) {
        int half = (i + 1) / 2;
        strideSum<<<(half + BLOCKSIZE - 1) / BLOCKSIZE, BLOCKSIZE>>>(d_f, i, half);
        err = cudaGetLastError();
        if (err != cudaSuccess) {
            fprintf(stderr, "strideSum launch: %s\n", cudaGetErrorString(err));
            cudaFree(d_f); free(h_f);
            return 1;
        }
        err = cudaDeviceSynchronize();
        if (err != cudaSuccess) {
            fprintf(stderr, "strideSum execution: %s\n", cudaGetErrorString(err));
            cudaFree(d_f); free(h_f);
            return 1;
        }
    }
    // After the loop, f[0] holds the full sum.
    err = cudaMemcpy(&d_sum, d_f, 1*sizeof(float) , cudaMemcpyDeviceToHost);
    if (err != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy sum: %s\n", cudaGetErrorString(err));
        cudaFree(d_f); free(h_f);
        return 1;
    }

    printf("Host -> %6.4f, Device -> %6.4f\n",sum,d_sum);

    cudaFree(d_f);
    free(h_f);

    return 0;
}

OTHER TIPS

What you want is to map ranges of numbers to threads, have each thread add its range and then have a reduction phase. The reduction would be adding the totals from each thread.

Licensed under: CC-BY-SA with attribution
Not affiliated with StackOverflow
scroll top