Returning a pointer to a device allocated matrix from C to Fortran

Question

I would suggest that you not reinvent the wheel, but use the cublas fortran bindings that are provided for this purpose.

The "thunking" wrapper is not what you want. It does implicit copy operations as needed, any time you use a cublas call in fortran.

You want the "non-thunking" wrapper, so you have explicit control over the copying going on. You can use fortran equivalents of Get/SetMatrix and Get/SetVector to copy data back and forth.

There is a sample code (example B.2) showing how to use the non-thunking wrapper included in the cublas documentation.

Even if you do want to re-invent the wheel, the wrapper will show you how to make the necessary syntax work to move between C and Fortran.

On a standard linux CUDA install, the wrappers are in /usr/local/cuda/src The non-thunking wrapper is /usr/local/cuda/src/fortran.c

Here's a fully worked example:

cublasf.f:

      program cublas_fortran_example
      implicit none
      integer i, j

c     helper functions
      integer cublas_init
      integer cublas_shutdown
      integer cublas_alloc
      integer cublas_free
      integer cublas_set_vector
      integer cublas_get_vector
c     selected blas functions
      double precision cublas_ddot
      external cublas_daxpy
      external cublas_dscal
      external cublas_dcopy
      double precision cublas_dnrm2
c     cublas variables
      integer cublas_status
      real*8 x(30), y(30)
      double precision alpha, beta
      double precision nrm
      integer*8 d_x, d_y, d_alpha, d_beta, d_nrm
      integer*8 dsize1, dlength1, dlength2
      double precision dresult



      write(*,*) "testing cublas fortran example"

c     initialize cublas library
c     CUBLAS_STATUS_SUCCESS=0
      cublas_status = cublas_init()
      if (cublas_status /= 0) then
         write(*,*) "CUBLAS Library initialization failed"
         write(*,*) "cublas_status=",cublas_status
         stop
      endif
c     initialize data
      do j=1,30
        x(j) = 1.0
        y(j) = 2.0
      enddo
      dsize1 = 8
      dlength1 = 30
      dlength2 = 1
      alpha = 2.0
      beta = 3.0
c     allocate device storage
      cublas_status = cublas_alloc(dlength1, dsize1, d_x)
      if (cublas_status /= 0) then
         write(*,*) "CUBLAS device malloc failed"
         stop
      endif
      cublas_status = cublas_alloc(dlength1, dsize1, d_y)
      if (cublas_status /= 0) then
         write(*,*) "CUBLAS device malloc failed"
         stop
      endif
      cublas_status = cublas_alloc(dlength2, dsize1, d_alpha)
      if (cublas_status /= 0) then
         write(*,*) "CUBLAS device malloc failed"
         stop
      endif
      cublas_status = cublas_alloc(dlength2, dsize1, d_beta)
      if (cublas_status /= 0) then
         write(*,*) "CUBLAS device malloc failed"
         stop
      endif
      cublas_status = cublas_alloc(dlength2, dsize1, d_nrm)
      if (cublas_status /= 0) then
         write(*,*) "CUBLAS device malloc failed"
         stop
      endif

c     copy data from host to device

      cublas_status = cublas_set_vector(dlength1, dsize1, x, dlength2,
     >     d_x, dlength2)
      if (cublas_status /= 0) then
         write(*,*) "CUBLAS copy to device failed"
         write(*,*) "cublas_status=",cublas_status
         stop
      endif
      cublas_status = cublas_set_vector(dlength1, dsize1, y, dlength2,
     >     d_y, dlength2)
      if (cublas_status /= 0) then
         write(*,*) "CUBLAS copy to device failed"
         write(*,*) "cublas_status=",cublas_status
         stop
      endif

      dresult = cublas_ddot(dlength1, d_x, dlength2, d_y, dlength2)
      write(*,*) "dot product result=",dresult

      dresult = cublas_dnrm2(dlength1, d_x, dlength2)
      write(*,*) "nrm2 of x result=",dresult

      dresult = cublas_dnrm2(dlength1, d_y, dlength2)
      write(*,*) "nrm2 of y result=",dresult

      call cublas_daxpy(dlength1, alpha, d_x, dlength2, d_y, dlength2)
      cublas_status = cublas_get_vector(dlength1, dsize1, d_y, dlength2,
     >     y, dlength2)
      if (cublas_status /= 0) then
         write(*,*) "CUBLAS copy to host failed"
         write(*,*) "cublas_status=",cublas_status
         stop
      endif
      write(*,*) "daxpy y(1)  =", y(1)
      write(*,*) "daxpy y(30) =", y(30)


      call cublas_dscal(dlength1, beta, d_x, dlength2)
      cublas_status = cublas_get_vector(dlength1, dsize1, d_x, dlength2,
     >     x, dlength2)
      if (cublas_status /= 0) then
         write(*,*) "CUBLAS copy to host failed"
         write(*,*) "cublas_status=",cublas_status
         stop
      endif
      write(*,*) "dscal x(1)  =", x(1)
      write(*,*) "dscal x(30) =", x(30)


      call cublas_dcopy(dlength1, d_x, dlength2, d_y, dlength2)
      cublas_status = cublas_get_vector(dlength1, dsize1, d_y, dlength2,
     >     y, dlength2)
      if (cublas_status /= 0) then
         write(*,*) "CUBLAS copy to host failed"
         write(*,*) "cublas_status=",cublas_status
         stop
      endif
      write(*,*) "dcopy y(1)  =", y(1)
      write(*,*) "dcopy y(30) =", y(30)

c     deallocate GPU memory and exit
      cublas_status = cublas_free(d_x)
      cublas_status = cublas_free(d_y)
      cublas_status = cublas_free(d_alpha)
      cublas_status = cublas_free(d_beta)
      cublas_status = cublas_free(d_nrm)
      cublas_status = cublas_shutdown()
      stop
      end

compile/run:

$ gfortran -c -o cublasf.o cublasf.f
$ gcc -c -DCUBLAS_GFORTRAN -I/usr/local/cuda/include -I/usr/local/cuda/src -o fortran.o /usr/local/cuda/src/fortran.c
$ gfortran -L/usr/local/cuda/lib64 -lcublas -o cublasf cublasf.o fortran.o
$ ./cublasf
 testing cublas fortran example
 dot product result=   60.0000000000000
 nrm2 of x result=   5.47722557505166
 nrm2 of y result=   10.9544511501033
 daxpy y(1)  =   4.00000000000000
 daxpy y(30) =   4.00000000000000
 dscal x(1)  =   3.00000000000000
 dscal x(30) =   3.00000000000000
 dcopy y(1)  =   3.00000000000000
 dcopy y(30) =   3.00000000000000
$

CUDA 5.0, RHEL 5.5