I would suggest using atomic operations to avoid the race condition whose effect you are observing. As atomic operation, you can use atomicCAS
. The approach below will work for any grid size. As an alternative approach, you can define a new, temporary array (as you mentioned in your post) and avoid atomic operations.
Here is the code using CUDA atomics.
#include <stdio.h>
#define N 10
__global__ void swap_kernel(int* data) {
int tid = blockIdx.x*blockDim.x + threadIdx.x;
if (tid < N) { atomicCAS(&data[tid], data[tid], data[(tid+1) % N]); }
}
void main() {
int* h_vec = (int*) malloc(N*sizeof(int));
int* d_vec; cudaMalloc((void**)&d_vec,N*sizeof(int));
for (int i=0; i<N; i++) h_vec[i] = i;
cudaMemcpy(d_vec,h_vec,N*sizeof(int),cudaMemcpyHostToDevice);
swap_kernel<<<2,8>>>(d_vec);
cudaMemcpy(h_vec,d_vec,N*sizeof(int),cudaMemcpyDeviceToHost);
for (int i=0; i<N; i++) printf("%i %i\n",i,h_vec[i]);
getchar();
}