If I'm reading the following example from the CUDA webinar on using multiple GPUs correctly, it is an error to launch a kernel into a stream that was not created on the currently selected device.
Example 2
// Multi-GPU stream/event example (from the CUDA webinar).
// Streams and events are bound to whichever device is current at the
// time they are created; using a stream on a non-current device fails.
cudaStream_t streamA, streamB;
cudaEvent_t eventA, eventB;

cudaSetDevice(0);
cudaStreamCreate(&streamA);   // streamA and eventA belong to device-0
cudaEventCreate(&eventA);     // fixed typo: original read "cudaEventCreaet"

cudaSetDevice(1);
cudaStreamCreate(&streamB);   // streamB and eventB belong to device-1
cudaEventCreate(&eventB);

// ERROR (intentional — this is the point of the example): device-1 is
// current here, but streamA was created on device-0, so launching a
// kernel into streamA is invalid.
kernel<<<..., streamA>>>(...);

// These calls are fine: eventB and streamB both belong to device-1,
// which is the current device.
cudaEventRecord(eventB, streamB);
cudaEventSynchronize(eventB);
ERROR — the kernel launch into streamA is the invalid call, because:
- device-1 is the current device (set by the preceding cudaSetDevice(1))
- streamA belongs to device-0, and a kernel cannot be launched into a stream created on a non-current device