CUDA C/C++: Calculate the average of inverse of distance per point (interaction energy, perhaps?)

StackOverflow https://stackoverflow.com/questions/15371123

23-03-2022

Question

I've been trying to write a kernel in CUDA that calculates the sum of the inverse of the distance between N given points, divided by N. A serial code in C would be like

average = 0;
for(int i = 0; i < Np; i++){
    for(int j = i + 1; j < Np; j++){
        average += 1.0e0f/sqrtf((rx[i]-rx[j])*(rx[i]-rx[j]) + (ry[i]-ry[j])*(ry[i]-ry[j]));
    }
}
average = average/(float)Np;

Where rx and ry are the x and y coordinates, respectively.
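In other words, if I write the loops as a formula, the quantity I'm computing is the sum of the inverse distance over all unordered pairs, divided by the number of points:

$$ \text{average} \;=\; \frac{1}{N_p} \sum_{i<j} \frac{1}{\sqrt{(r_{x,i}-r_{x,j})^2 + (r_{y,i}-r_{y,j})^2}} $$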

I generate the points via a kernel that uses a random number generator. For that kernel I used 128 (256) threads per block for 4k (8k) points. In the main kernel every thread performs the inner loop above, and then the per-thread results are passed to a reduce-sum function, as follows.

Generate points:

__global__ void InitRNG ( curandState * state, const int seed ){
    int tIdx = blockIdx.x*blockDim.x + threadIdx.x;
    curand_init (seed, tIdx, 0, &state[tIdx]);
}

__global__
void SortPoints(float* X, float* Y,const int N, curandState *state){

    float rdmn1, rdmn2;

    unsigned int tIdx = blockIdx.x*blockDim.x + threadIdx.x;
    float range;
    if(tIdx < N){
        rdmn1 = curand_uniform(&state[tIdx]);
        rdmn2 = curand_uniform(&state[tIdx]);
        range = sqrtf(0.25e0f*N*rdmn1);
        X[tIdx] = range*cosf(2.0e0f*pi*rdmn2);
        Y[tIdx] = range*sinf(2.0e0f*pi*rdmn2);
    }
}
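For what it's worth, drawing the radius as sqrt(0.25*N*u) with u uniform should spread the points with uniform density over a disk whose area grows with N:

$$ r = \frac{\sqrt{N}}{2}\sqrt{u}, \quad u \sim U(0,1] \;\Longrightarrow\; \text{uniform density on a disk of radius } \frac{\sqrt{N}}{2} $$

so the typical spacing between points stays roughly constant as N grows.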

Reduction:

__device__
float ReduceSum2(float In){

    __shared__ float data[BlockSize];

    unsigned int tIdx = threadIdx.x;

    data[tIdx] = In;
    __syncthreads();

    for(unsigned int i = blockDim.x/2; i > 0; i >>= 1){

        if(tIdx < i){

            data[tIdx] += data[tIdx + i];   
        }

        __syncthreads();
    }

    return data[0];

}
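Note that this reduction assumes blockDim.x is a power of two (which it is here) and that every thread of the block calls it. A sketch of a variant that tolerates an arbitrary block size, with the same shared-memory layout (untested, just for illustration):

__device__
float ReduceSum2AnySize(float In){

    __shared__ float data[BlockSize];
    unsigned int tIdx = threadIdx.x;

    data[tIdx] = In;
    __syncthreads();

    // n is the number of valid partial sums left; halve it (rounding up)
    // until only data[0] remains. Works for any blockDim.x <= BlockSize.
    for(unsigned int n = blockDim.x; n > 1; ){
        unsigned int half = (n + 1)/2;
        if(tIdx < n/2)
            data[tIdx] += data[tIdx + half];
        __syncthreads();
        n = half;
    }
    return data[0];
}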

Kernel:

__global__ 
void AvgDistance(float *X, float *Y, float *Avg, const int N){

    int tIdx = blockIdx.x*blockDim.x + threadIdx.x;
    int bIdx = blockIdx.x;

    float x , y;
    float d = 0.0f;
    if(tIdx < N){

        for(int i = tIdx + 1; i < N ; i++){

            x = X[tIdx] - X[i];
            y = Y[tIdx] - Y[i];

            d += 1.0e0f/(sqrtf(x*x + y*y));     
        }
        __syncthreads();
        Avg[bIdx] = ReduceSum2(d);

    }
}
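One thing I'm not sure about (it shouldn't matter here, since N is a multiple of the block size): __syncthreads() and ReduceSum2 are only reached by threads with tIdx < N, so in a partially filled block some threads would skip the barrier and leave their shared-memory slot uninitialized. A rough sketch with the reduction hoisted out of the branch would be:

__global__
void AvgDistanceSafe(float *X, float *Y, float *Avg, const int N){

    int tIdx = blockIdx.x*blockDim.x + threadIdx.x;

    float d = 0.0f;
    if(tIdx < N){
        for(int i = tIdx + 1; i < N; i++){
            float x = X[tIdx] - X[i];
            float y = Y[tIdx] - Y[i];
            d += 1.0e0f/sqrtf(x*x + y*y);
        }
    }
    // Every thread of the block reaches the reduction (and its barriers);
    // threads with tIdx >= N simply contribute 0.
    float blockSum = ReduceSum2(d);
    if(threadIdx.x == 0) Avg[blockIdx.x] = blockSum;
}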

The kernel is configured and launched as follows:

dim3 threads(BlockSize,BlockSize);
dim3 blocks(ceil(Np/threads.x),ceil(Np/threads.y));

InitRNG<<<blocks.x,threads.x>>>(d_state,seed);
SortPoints<<<blocks.x,threads.x>>>(d_rx,d_ry,Np,d_state);
AvgDistance<<<blocks.x,threads.x,threads.x*sizeof(float)>>>(d_rx,d_ry,d_Avg,Np);
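Side note on this configuration: ceil(Np/threads.x) performs integer division before the ceil, so it never actually rounds anything up; it only works because Np here is an exact multiple of the block size. The usual round-up idiom would be:

// round-up integer division for the grid size (works for any Np > 0)
dim3 blocks((Np + threads.x - 1)/threads.x);

Also, the third launch parameter (the dynamic shared memory size) is never used, because ReduceSum2 declares its shared array statically, so it is harmless but unnecessary.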

Finally, I copy the data back to host and then perform the remaining sum:

Avg = new float[blocks.x];

CHECK(cudaMemcpy(Avg,d_Avg,blocks.x*sizeof(float),cudaMemcpyDeviceToHost),ERROR_CPY_DEVTOH);
float average = 0;

for(int i = 0; i < blocks.x; i++){
    average += Avg[i];
}
average = average/(float)Np;

For 4k points the results are more or less OK:

Average distance between points (via Kernel) = 108.615
Average distance between points (via CPU) = 110.191

In this case the sums may be performed in a different order, causing the two results to diverge slightly from each other, I suppose...

But when it comes to 8k points, the results are quite different:

Average distance between points (via Kernel) = 153.63
Average distance between points (via CPU) = 131.471

To me it seems that the kernel and the serial code are written the same way, which leads me to distrust the precision of CUDA's floating-point calculations. Does this make sense? Or are the accesses to global memory causing conflicts when several threads load the same data from X and Y at the same time? Or is the way I wrote the kernel somehow 'wrong' (i.e., am I doing something that causes the two results to diverge from each other)?


Solution

Actually, from what I can tell, the problem seems to be on the CPU side. I created a sample program based on your code.

I was able to reproduce your results.

First I switched all instances of sinf, cosf, and sqrtf to their corresponding double versions. This made no difference in the results.

Next I included a typedef so I could easily switch the precision from float to double and back, replacing every relevant instance of float in the code with mytype which is my typedef.

When I run the code with typedef of float and a data size of 4096 I get these results:

GPU average = 108.294922
CPU average = 109.925285

When I run the code with typedef of double and a data size of 4096 I get these results:

GPU average = 108.294903
CPU average = 108.294903

When I run the code with typedef of float and a data size of 8192 I get these results:

GPU average = 153.447327
CPU average = 131.473526

When I run the code with typedef of double and a data size of 8192 I get these results:

GPU average = 153.447380
CPU average = 153.447380

There are at least 2 observations:

  1. The GPU results don't vary between float and double, except in the 5th decimal place
  2. The CPU results vary by 1-20% or so between float and double, but when double is selected, they line up exactly (to the 6th decimal place, anyway) with the GPU results.

Based on this, I believe the CPU is providing the variable, questionable behavior.
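A rough back-of-the-envelope estimate for the 8192-point case makes this plausible:

$$ \#\text{pairs} = \frac{8192 \cdot 8191}{2} \approx 3.4 \times 10^{7}, \qquad \text{final sum} \approx 153.4 \times 8192 \approx 1.26 \times 10^{6} $$

$$ \text{float spacing near } 1.26 \times 10^{6} \text{ is } 2^{20-23} = 0.125, \qquad \text{typical term} \approx \frac{1.26 \times 10^{6}}{3.4 \times 10^{7}} \approx 0.04 $$

Once the running total is in the millions, each new term of order 0.04 is smaller than the spacing between adjacent float values, so most of it is rounded away; a single float accumulator simply cannot hold this sum accurately, whereas per-block partial sums (or a double accumulator) can.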

Here's my code for reference:

#include <stdio.h>
#include <curand.h>
#include <curand_kernel.h>
#define DSIZE 8192
#define BlockSize 32
#define pi 3.14159f


#define cudaCheckErrors(msg) \
    do { \
        cudaError_t __err = cudaGetLastError(); \
        if (__err != cudaSuccess) { \
            fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
                msg, cudaGetErrorString(__err), \
                __FILE__, __LINE__); \
            fprintf(stderr, "*** FAILED - ABORTING\n"); \
            exit(1); \
        } \
    } while (0)

typedef double mytype;

__global__ void InitRNG ( curandState * state, const int seed ){
    int tIdx = blockIdx.x*blockDim.x + threadIdx.x;
    curand_init (seed, tIdx, 0, &state[tIdx]);
}

__global__
void SortPoints(mytype* X, mytype* Y,const int N, curandState *state){

    mytype rdmn1, rdmn2;

    unsigned int tIdx = blockIdx.x*blockDim.x + threadIdx.x;
    mytype range;
    if(tIdx < N){
        rdmn1 = curand_uniform(&state[tIdx]);
        rdmn2 = curand_uniform(&state[tIdx]);
        range = sqrt(0.25e0f*N*rdmn1);
        X[tIdx] = range*cos(2.0e0f*pi*rdmn2);
        Y[tIdx] = range*sin(2.0e0f*pi*rdmn2);
    }
}

__device__
mytype ReduceSum2(mytype In){

    __shared__ mytype data[BlockSize];

    unsigned int tIdx = threadIdx.x;

    data[tIdx] = In;
    __syncthreads();

    for(unsigned int i = blockDim.x/2; i > 0; i >>= 1){

        if(tIdx < i){

            data[tIdx] += data[tIdx + i];
        }

        __syncthreads();
    }

    return data[0];

}

__global__
void AvgDistance(mytype *X, mytype *Y, mytype *Avg, const int N){

    int tIdx = blockIdx.x*blockDim.x + threadIdx.x;
    int bIdx = blockIdx.x;

    mytype x , y;
    mytype d = 0.0f;
    if(tIdx < N){

        for(int i = tIdx + 1; i < N ; i++){

            x = X[tIdx] - X[i];
            y = Y[tIdx] - Y[i];

            d += 1.0e0f/(sqrt(x*x + y*y));
        }
        __syncthreads();
        Avg[bIdx] = ReduceSum2(d);

    }
}

mytype cpu_avg(const mytype *rx, const mytype *ry, const int size){
  mytype average = 0.0f;
  for(int i = 0; i < size; i++){
    for(int j = i + 1; j < size; j++){
        average += 1.0e0f/sqrt((rx[i]-rx[j])*(rx[i]-rx[j]) + (ry[i]-ry[j])*(ry[i]-ry[j]));
    }
  }
  average = average/(mytype)size;
  return average;
}

int main() {

  int Np = DSIZE;
  mytype *rx, *ry, *d_rx, *d_ry, *d_Avg, *Avg;
  curandState *d_state;
  int seed = 1;

  dim3 threads(BlockSize,BlockSize);
  dim3 blocks((int)ceilf(Np/(float)threads.x),(int)ceilf(Np/(float)threads.y));
  printf("number of blocks = %d\n", blocks.x);
  printf("number of threads= %d\n", threads.x);
  rx = (mytype *)malloc(DSIZE*sizeof(mytype));
  if (rx == 0) {printf("malloc fail\n"); return 1;}
  ry = (mytype *)malloc(DSIZE*sizeof(mytype));
  if (ry == 0) {printf("malloc fail\n"); return 1;}

  cudaMalloc((void**)&d_rx, DSIZE * sizeof(mytype));
  cudaMalloc((void**)&d_ry, DSIZE * sizeof(mytype));
  cudaMalloc((void**)&d_Avg, blocks.x * sizeof(mytype));
  cudaMalloc((void**)&d_state, DSIZE * sizeof(curandState));
  cudaCheckErrors("cudamalloc");



  InitRNG<<<blocks.x,threads.x>>>(d_state,seed);
  SortPoints<<<blocks.x,threads.x>>>(d_rx,d_ry,Np,d_state);
  AvgDistance<<<blocks.x,threads.x,threads.x*sizeof(mytype)>>>(d_rx,d_ry,d_Avg,Np);
  cudaCheckErrors("kernels");


  Avg = new mytype[blocks.x];

  cudaMemcpy(Avg,d_Avg,blocks.x*sizeof(mytype),cudaMemcpyDeviceToHost);
  cudaMemcpy(rx, d_rx, DSIZE*sizeof(mytype),cudaMemcpyDeviceToHost);
  cudaMemcpy(ry, d_ry, DSIZE*sizeof(mytype),cudaMemcpyDeviceToHost);
  cudaCheckErrors("cudamemcpy");
  mytype average = 0;

  for(int i = 0; i < blocks.x; i++){
    average += Avg[i];
  }
  average = average/(mytype)Np;

  printf("GPU average = %f\n", average);
  average = cpu_avg(rx, ry, DSIZE);
  printf("CPU average = %f\n", average);

  return 0;
}

I am running on RHEL 5.5, CUDA 5.0, and an Intel Xeon X5560,

compiled with:

nvcc -O3 -arch=sm_20 -lcurand -lm -o t93 t93.cu

EDIT: After observing that the variability was on the CPU side, I found that I could eliminate most of the CPU variability by modifying your CPU averaging code like this:

mytype cpu_avg(const mytype *rx, const mytype *ry, const int size){
  mytype average = 0.0f;
  mytype temp = 0.0f;
  for(int i = 0; i < size; i++){
    for(int j = i + 1; j < size; j++){
        temp += 1.0e0f/sqrt((rx[i]-rx[j])*(rx[i]-rx[j]) + (ry[i]-ry[j])*(ry[i]-ry[j]));
    }
    average += temp/(mytype)size;
    temp = 0.0f;
  }
  return average;
}

So I would say there's a problem with intermediate results on the CPU side. It's interesting that it doesn't show up on the GPU result. I suspect the reason for this is that the final summation of GPU averages is done on the CPU (therefore each individual GPU block result is scaled down by the size, e.g. 8192), and these may have an intermediate precision that is sufficient to survive until the final division. If you inlined the CPU average calculation, you may observe something different again.
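Another way to tame a single float accumulator, without restructuring the loops, is compensated (Kahan) summation; a sketch against the same cpu_avg signature (not something I benchmarked here) would be:

mytype cpu_avg_kahan(const mytype *rx, const mytype *ry, const int size){
  mytype sum = 0.0f;
  mytype c   = 0.0f;       // running compensation for lost low-order bits
  for(int i = 0; i < size; i++){
    for(int j = i + 1; j < size; j++){
      mytype term = 1.0e0f/sqrt((rx[i]-rx[j])*(rx[i]-rx[j]) + (ry[i]-ry[j])*(ry[i]-ry[j]));
      mytype y = term - c;   // correct the new term by the error carried so far
      mytype t = sum + y;    // low-order bits of y may be lost here
      c = (t - sum) - y;     // recover what was actually lost (algebraically zero, numerically not)
      sum = t;
    }
  }
  return sum/(mytype)size;
}

Be aware that aggressive floating-point optimization flags on the host compiler (e.g. -ffast-math) can optimize the compensation away, so this relies on default IEEE semantics.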

Licensed under: CC-BY-SA with attribution
Not affiliated with StackOverflow