As was pointed out by Robert Crovella in a comment above, what was probably happening is that while the thread with tIdx == n
was computing Xn[n]
and Yn[n]
, other threads were already using these values, which may not have been computed yet. In that case, the only reason the later runs (other than the first one) always get the same (correct) value is that the memory pointed to by Xn and Yn is already occupied by the right values, so even with that synchronization problem the application returns the right result.
In any case, I avoided the synchronization problem by splitting the kernel into two, just as I was advised by Robert Crovella via comment:
__global__
void DeltaE1(float *X, float *Y,float *Xn, float *Yn, float *step,float *DELTA, curandState *state, const int N,const int n){
// Phase 1 of the energy-difference computation for moving particle n.
// Thread n draws a trial position (Xn[n], Yn[n]) and stores its own-site
// energy change in DELTA[n]; every other thread i stores the negated old
// pair term -1/r(i, n). The cross terms against the NEW position are added
// in DeltaE2, launched after this kernel completes, so no thread here ever
// reads Xn[n]/Yn[n] before they are written.
//
// Expects a 1D launch with at least N threads total; out-of-range threads
// exit via the bounds guard.
int tIdx = blockIdx.x*blockDim.x + threadIdx.x;
float x , y;
float rdmn1, rdmn2;
if(tIdx < N){
DELTA[tIdx] = 0.0e0f;
if(tIdx == n){
// Fixed trial step size for the proposed move.
step[tIdx] = 0.2e0f;
rdmn1 = curand_uniform(&state[tIdx]);
rdmn2 = curand_uniform(&state[tIdx]);
// Propose uniformly in (old - step, old + step].
Xn[tIdx] = X[tIdx] + step[tIdx]*(2.0e0f*rdmn1 - 1.0e0f);
Yn[tIdx] = Y[tIdx] + step[tIdx]*(2.0e0f*rdmn2 - 1.0e0f);
// Own-site term: new |r|^2 minus old |r|^2.
DELTA[tIdx] = - (X[tIdx]*X[tIdx] + Y[tIdx]*Y[tIdx]);
DELTA[tIdx] += Xn[tIdx]*Xn[tIdx] + Yn[tIdx]*Yn[tIdx];
}
else{
x = X[tIdx] - X[n];
y = Y[tIdx] - Y[n];
// sqrtf keeps the whole expression in single precision; the original
// plain sqrt silently promoted the operand to double on every thread.
DELTA[tIdx] += -1.0e0f/sqrtf(x*x + y*y);
}
}
}
__global__
void DeltaE2(float *X, float *Y,float *Xn, float *Yn,float *DELTA,const int N,const int n){
// Phase 2: add the +1/r(i, n_new) cross terms against the trial position
// written by DeltaE1, then reduce each block's DELTA entries and store the
// partial sum in DELTA[blockIdx.x].
//
// Expects a 1D launch with at least N threads total, run strictly after
// DeltaE1 has completed (so Xn[n]/Yn[n] are final).
int tIdx = blockIdx.x*blockDim.x + threadIdx.x;
int bIdx = blockIdx.x;
float x , y;
// Identity value (0) for out-of-range threads so they can still take part
// in the block reduction without changing the sum.
float dTotE = 0.0e0f;
if(tIdx < N){
if(tIdx != n){
x = X[tIdx] - Xn[n];
y = Y[tIdx] - Yn[n];
// sqrtf avoids the implicit double-precision promotion of plain sqrt.
DELTA[tIdx] += 1.0e0f/sqrtf(x*x + y*y);
}
dTotE = DELTA[tIdx];
}
// The reduction must be reached by ALL threads of the block: if ReduceSum2
// uses __syncthreads() or warp-synchronous intrinsics, calling it inside
// the divergent (tIdx < N) guard — as the original did — is undefined
// behavior whenever N is not a multiple of blockDim.x. Tail threads
// contribute the 0 identity, so the sum is unchanged.
dTotE = ReduceSum2(dTotE);
// NOTE(review): storing the block sum back into DELTA[bIdx] overwrites an
// entry that another still-running block may read above — presumably safe
// only if gridDim is small or a separate output array is used; confirm.
if(threadIdx.x == 0 && tIdx < N)DELTA[bIdx] = dTotE;
}