Yes, you can allocate the memory yourself. In your example it should be possible to achieve this using the deviceptr
clause on an OpenACC data directive, so something like:
// Allocate device buffers for the two inputs (a, b) and the result (c).
cudaMalloc((void**)&a, num_bytes);
cudaMalloc((void**)&b, num_bytes);
cudaMalloc((void**)&c, num_bytes);

// cudaMemcpy takes (dst, src, count, kind): for a host-to-device copy the
// device pointer is the destination and the host pointer is the source.
cudaMemcpy(a, hostA, num_bytes, cudaMemcpyHostToDevice);
cudaMemcpy(b, hostB, num_bytes, cudaMemcpyHostToDevice);
// The loop below accumulates into c with +=, so c must hold defined values
// on the device before it runs. Upload the host contents here; if hostC is
// not initialised, use cudaMemset(c, 0, num_bytes) instead.
cudaMemcpy(c, hostC, num_bytes, cudaMemcpyHostToDevice);

// deviceptr tells the OpenACC compiler that a, b and c already hold device
// addresses, so it must not allocate or transfer them itself.
#pragma acc data deviceptr(a, b, c)
#pragma acc kernels
for (i = 0; i < SIZE; ++i)
for (j = 0; j < SIZE; ++j)
for (k = 0; k < SIZE; ++k)
c[i][j] += a[i][k] * b[k][j];

// Device-to-host: the host buffer is the destination, the device buffer the source.
cudaMemcpy(hostC, c, num_bytes, cudaMemcpyDeviceToHost);

// Release the device allocations.
cudaFree(a);
cudaFree(b);
cudaFree(c);
[disclaimer: written in browser, never compiled or tested, use at own risk]
This should tell the compiler that a, b and c are pre-existing device allocations, so it will not
allocate or copy them itself. You should also be able to use the OpenACC acc_malloc routine to
allocate the memory in place of cudaMalloc, if you so wish.
Thanks to @user2054656 for pointing out my incorrect use of device_resident
in the first version of this answer.