On the host, the staging buffer is allocated and the memcpy starts by having
the source GPU copy source data into the staging buffer and recording an event.
But: Unlike the host2device memcpy, the CPU doesn't need to
synchronize as all synchronization will be done by the GPUs. Because memcpy
and the event-record are asynchronous, directly after the initial memcpy,
the CPU can request the destination-GPU to wait on that initial event and start a memcpy of the same buffer.
To let the two GPUs use the staging buffers concurrently, two staging buffers and two CUDA events are needed. The CPU loops over the input and output buffers, issuing memcpy and event-record
commands, until it has requested copies for all bytes; it then waits for both GPUs to finish processing.
// Peer-to-peer memcpy staged through pinned host buffers: the source GPU
// streams each chunk device->host into a staging buffer, and the destination
// GPU pulls it host->device, with CUDA events enforcing producer/consumer
// ordering. Double-buffering (two staging buffers + two events per device)
// lets both GPUs work concurrently. All commands are asynchronous; the CPU
// only blocks at the final device synchronizations.
//
// _dst:      device pointer on dstDevice (copy destination).
// dstDevice: CUDA device ordinal owning _dst.
// _src:      device pointer on srcDevice (copy source).
// srcDevice: CUDA device ordinal owning _src.
// N:         number of bytes to copy.
// Returns cudaSuccess, or the first CUDA error encountered.
//
// NOTE(review): assumes g_hostBuffers[0..1] are pinned host buffers of at
// least STAGING_BUFFER_SIZE bytes and g_events[device][0..1] were created
// beforehand; also assumes srcDevice != dstDevice (the event pairs would
// alias otherwise) -- confirm against the setup code.
cudaError_t chCpyP2P(void *_dst,int dstDevice,const void *_src,int srcDevice,size_t N)
{
    cudaError_t status = cudaSuccess;  // BUG FIX: was returned uninitialized
    char *dst = (char *) _dst;
    const char *src = (const char *) _src;
    int stg_idx = 0; // staging-index (ping-pongs between the two buffers)
    while (N) {
        size_t sz_cpy = min(N,STAGING_BUFFER_SIZE);

        // Producer side: before overwriting this staging buffer, wait until
        // the destination GPU has finished draining it. (On the first pass
        // the event has never been recorded, so the wait is a no-op.)
        status = cudaSetDevice( srcDevice );
        if ( cudaSuccess != status ) goto Error;
        status = cudaStreamWaitEvent(0,g_events[dstDevice][stg_idx],0);
        if ( cudaSuccess != status ) goto Error;
        status = cudaMemcpyAsync(g_hostBuffers[stg_idx],src,sz_cpy,cudaMemcpyDeviceToHost,NULL);
        if ( cudaSuccess != status ) goto Error;
        status = cudaEventRecord(g_events[srcDevice][stg_idx]);
        if ( cudaSuccess != status ) goto Error;

        // Consumer side: the destination GPU waits for the chunk to land in
        // the staging buffer, copies it out, then signals the buffer free.
        status = cudaSetDevice(dstDevice);
        if ( cudaSuccess != status ) goto Error;
        status = cudaStreamWaitEvent(0,g_events[srcDevice][stg_idx],0);
        if ( cudaSuccess != status ) goto Error;
        status = cudaMemcpyAsync(dst,g_hostBuffers[stg_idx],sz_cpy,cudaMemcpyHostToDevice,NULL);
        if ( cudaSuccess != status ) goto Error;
        status = cudaEventRecord(g_events[dstDevice][stg_idx]);
        if ( cudaSuccess != status ) goto Error;

        dst += sz_cpy;
        src += sz_cpy;
        N -= sz_cpy;
        stg_idx = 1 - stg_idx;  // switch to the other staging buffer
    }
    // Drain both GPUs so the full copy has completed when we return.
    status = cudaSetDevice(srcDevice);
    if ( cudaSuccess != status ) goto Error;
    status = cudaDeviceSynchronize();
    if ( cudaSuccess != status ) goto Error;
    status = cudaSetDevice(dstDevice);
    if ( cudaSuccess != status ) goto Error;
    status = cudaDeviceSynchronize();
Error:
    return status;
}
Note: when compiling as pre-C99 C, `size_t sz_cpy` must be declared outside the loop; in C++/CUDA the in-loop declaration shown here is fine.