I would suggest that you not reinvent the wheel, but use the cublas fortran bindings that are provided for this purpose.
The "thunking" wrapper is not what you want. It does implicit copy operations as needed, any time you use a cublas call in fortran.
You want the "non-thunking" wrapper, so you have explicit control over the copying going on. You can use fortran equivalents of Get/SetMatrix
and Get/SetVector
to copy data back and forth.
There is a sample code (example B.2) showing how to use the non-thunking wrapper included in the cublas documentation.
Even if you do want to re-invent the wheel, the wrapper will show you how to make the necessary syntax work to move between C and Fortran.
On a standard linux CUDA install, the wrappers are in /usr/local/cuda/src
The non-thunking wrapper is /usr/local/cuda/src/fortran.c
Here's a fully worked example:
cublasf.f:
program cublas_fortran_example
implicit none
integer i, j
c helper functions
integer cublas_init
integer cublas_shutdown
integer cublas_alloc
integer cublas_free
integer cublas_set_vector
integer cublas_get_vector
c selected blas functions
double precision cublas_ddot
external cublas_daxpy
external cublas_dscal
external cublas_dcopy
double precision cublas_dnrm2
c cublas variables
integer cublas_status
real*8 x(30), y(30)
double precision alpha, beta
double precision nrm
integer*8 d_x, d_y, d_alpha, d_beta, d_nrm
integer*8 dsize1, dlength1, dlength2
double precision dresult
write(*,*) "testing cublas fortran example"
c initialize cublas library
c CUBLAS_STATUS_SUCCESS=0
cublas_status = cublas_init()
if (cublas_status /= 0) then
write(*,*) "CUBLAS Library initialization failed"
write(*,*) "cublas_status=",cublas_status
stop
endif
c initialize data
do j=1,30
x(j) = 1.0
y(j) = 2.0
enddo
dsize1 = 8
dlength1 = 30
dlength2 = 1
alpha = 2.0
beta = 3.0
c allocate device storage
cublas_status = cublas_alloc(dlength1, dsize1, d_x)
if (cublas_status /= 0) then
write(*,*) "CUBLAS device malloc failed"
stop
endif
cublas_status = cublas_alloc(dlength1, dsize1, d_y)
if (cublas_status /= 0) then
write(*,*) "CUBLAS device malloc failed"
stop
endif
cublas_status = cublas_alloc(dlength2, dsize1, d_alpha)
if (cublas_status /= 0) then
write(*,*) "CUBLAS device malloc failed"
stop
endif
cublas_status = cublas_alloc(dlength2, dsize1, d_beta)
if (cublas_status /= 0) then
write(*,*) "CUBLAS device malloc failed"
stop
endif
cublas_status = cublas_alloc(dlength2, dsize1, d_nrm)
if (cublas_status /= 0) then
write(*,*) "CUBLAS device malloc failed"
stop
endif
c copy data from host to device
cublas_status = cublas_set_vector(dlength1, dsize1, x, dlength2,
> d_x, dlength2)
if (cublas_status /= 0) then
write(*,*) "CUBLAS copy to device failed"
write(*,*) "cublas_status=",cublas_status
stop
endif
cublas_status = cublas_set_vector(dlength1, dsize1, y, dlength2,
> d_y, dlength2)
if (cublas_status /= 0) then
write(*,*) "CUBLAS copy to device failed"
write(*,*) "cublas_status=",cublas_status
stop
endif
dresult = cublas_ddot(dlength1, d_x, dlength2, d_y, dlength2)
write(*,*) "dot product result=",dresult
dresult = cublas_dnrm2(dlength1, d_x, dlength2)
write(*,*) "nrm2 of x result=",dresult
dresult = cublas_dnrm2(dlength1, d_y, dlength2)
write(*,*) "nrm2 of y result=",dresult
call cublas_daxpy(dlength1, alpha, d_x, dlength2, d_y, dlength2)
cublas_status = cublas_get_vector(dlength1, dsize1, d_y, dlength2,
> y, dlength2)
if (cublas_status /= 0) then
write(*,*) "CUBLAS copy to host failed"
write(*,*) "cublas_status=",cublas_status
stop
endif
write(*,*) "daxpy y(1) =", y(1)
write(*,*) "daxpy y(30) =", y(30)
call cublas_dscal(dlength1, beta, d_x, dlength2)
cublas_status = cublas_get_vector(dlength1, dsize1, d_x, dlength2,
> x, dlength2)
if (cublas_status /= 0) then
write(*,*) "CUBLAS copy to host failed"
write(*,*) "cublas_status=",cublas_status
stop
endif
write(*,*) "dscal x(1) =", x(1)
write(*,*) "dscal x(30) =", x(30)
call cublas_dcopy(dlength1, d_x, dlength2, d_y, dlength2)
cublas_status = cublas_get_vector(dlength1, dsize1, d_y, dlength2,
> y, dlength2)
if (cublas_status /= 0) then
write(*,*) "CUBLAS copy to host failed"
write(*,*) "cublas_status=",cublas_status
stop
endif
write(*,*) "dcopy y(1) =", y(1)
write(*,*) "dcopy y(30) =", y(30)
c deallocate GPU memory and exit
cublas_status = cublas_free(d_x)
cublas_status = cublas_free(d_y)
cublas_status = cublas_free(d_alpha)
cublas_status = cublas_free(d_beta)
cublas_status = cublas_free(d_nrm)
cublas_status = cublas_shutdown()
stop
end
compile/run:
$ gfortran -c -o cublasf.o cublasf.f
$ gcc -c -DCUBLAS_GFORTRAN -I/usr/local/cuda/include -I/usr/local/cuda/src -o fortran.o /usr/local/cuda/src/fortran.c
$ gfortran -L/usr/local/cuda/lib64 -lcublas -o cublasf cublasf.o fortran.o
$ ./cublasf
testing cublas fortran example
dot product result= 60.0000000000000
nrm2 of x result= 5.47722557505166
nrm2 of y result= 10.9544511501033
daxpy y(1) = 4.00000000000000
daxpy y(30) = 4.00000000000000
dscal x(1) = 3.00000000000000
dscal x(30) = 3.00000000000000
dcopy y(1) = 3.00000000000000
dcopy y(30) = 3.00000000000000
$
CUDA 5.0, RHEL 5.5