I'm writing a simple matrix vector multiplication program in Fortran. In this program I am making use of cuBLAS's Fortran Binding routines explained here. The program works like a charm when my array contains less than 90 elements. I've read a few other posts suggesting to use ulimit -s unlimited
or -heap-array [size]
, but these didn't work in my case. Is there another method to solving this issue that I'm not aware of? I am compiling with ifort and gfortran, using nvcc to compile the cublas routines.
ifort
$ifort -heap-arrays 1 -names uppercase -assume nounderscore -c -o cublasf.o main.f90
$nvcc -c -DCUBLAS_INTEL_FORTRAN -I/usr/local/cuda/include -I/usr/local/cuda/src -o fortran.o /usr/local/cuda/src/fortran.c
$ifort -heap-arrays 1 -o cublasf fortran.o cublasf.o -L/usr/local/cuda/lib64 -lcublas
gfortran
$gfortran -c -o cublasf.o main.f90
$nvcc -c -DCUBLAS_GFORTRAN -I/usr/local/cuda/include -I/usr/local/cuda/src -o fortran.o /usr/local/cuda/src/fortran.c
$gfortran -o cublasf cublasf.o fortran.o -L/usr/local/cuda/lib64 -lcublas
Any assistance would be greatly appreciated. Thanks in advance!
UPDATE
Here is the code that I am trying to run. I know it's not the prettiest code and there are some optimizations I can do, but it is only for testing at the moment. There are some elements commented out that are only used for testing purposes (no need to worry about those).
main.f90
program main
implicit none
INTEGER CUBLAS_INIT
INTEGER CUBLAS_ALLOC
INTEGER CUBLAS_FREE
INTEGER CUBLAS_SET_VECTOR
INTEGER CUBLAS_SET_MATRIX
INTEGER CUBLAS_GET_VECTOR
INTEGER CUBLAS_SGEMV
EXTERNAL CUBLAS_SGEMV
INTEGER CUBLAS_SHUTDOWN
integer :: countNumCuda, countNumCuda2,countNumFort,countNumFort2
integer :: clockRate = 4
integer*4, parameter :: N=80
real*4, dimension(N,N) :: mat
real*8 :: devPtr_mat, devPtr_vect, devPtr_outVect
real*4, dimension(N) :: vect,outVect,outVect2, anotherVect
integer :: i,j
integer :: size_of_real = 8
character :: trans = "N"
real*4 :: alpha = 1.0
real*4 :: beta = 0.0
integer :: cublas_status
! Initialize matrix and array
do i = 1, N
do j = 1, N
mat(i,j) = i
end do
vect(i) = i
outVect(i) = 0
outVect2(i) = 0
end do
!call SYSTEM_CLOCK(countNumFort, clockRate)
!call MATRIXVECTORMULT(mat, vect, outVect, N)
!call SYSTEM_CLOCK(countNumFort2, clockRate)
!do i = 1, N
! anotherVect(i) = outVect(i)
!enddo
call SYSTEM_CLOCK(countNumCuda, clockRate)
print*,"Entering GPU"
cublas_status = CUBLAS_INIT()
if (cublas_status .NE. 0) then
print *, "initialization error"
end if
cublas_status = CUBLAS_ALLOC(N*N,size_of_real, devPtr_mat)
if (cublas_status .NE. 0) then
print *, "Allocation error devMat"
end if
cublas_status = CUBLAS_ALLOC(N, size_of_real, devPtr_vect)
if (cublas_status .NE. 0) then
print *, "Allcoation error devVect"
end if
cublas_status = CUBLAS_ALLOC(N, size_of_real, devPtr_outVect)
if (cublas_status .NE. 0) then
print *, "Allocation error devPtrOut"
end if
cublas_status = CUBLAS_SET_VECTOR(N, size_of_real, vect, 1, devPtr_vect, 1)
if (cublas_status .NE. 0) then
print *, "Set vector error", cublas_status
end if
cublas_status = CUBLAS_SET_MATRIX(N,N,size_of_real, mat, N, devPtr_mat, N)
if (cublas_status .NE. 0) then
print *, "Set matrix error", cublas_status
end if
cublas_status = CUBLAS_SGEMV(trans,N,N,alpha, devPtr_mat, N, devPtr_vect,1,beta,devPtr_outVect,1)
if (cublas_status .NE. 0) then
print *, "SGEMV error", cublas_status
end if
cublas_status = CUBLAS_GET_VECTOR(N,size_of_real, devPtr_outVect, 1, outVect2, 1)
if (cublas_status .NE. 0) then
print *, "Get vector error", cublas_status
end if
cublas_status = CUBLAS_FREE(devPtr_mat)
if (cublas_status .NE. 0) then
print *, "free mat error", cublas_status
end if
cublas_status = CUBLAS_FREE(devPtr_vect)
if (cublas_status .NE. 0) then
print *, "free vect error", cublas_status
end if
cublas_status = CUBLAS_FREE(devPtr_outVect)
if (cublas_status .NE. 0) then
print *, "Free outVect error", cublas_status
end if
cublas_status = CUBLAS_SHUTDOWN()
if (cublas_status .NE. 0) then
print *, "shutdown error", cublas_status
end if
call SYSTEM_CLOCK(countNumCuda2, clockRate)
!Results from CUDA
!do i = 1, N
! print *, anotherVect(i), outVect(i), outVect2(i)
!end do
do i = 1, N
print*, outVect2(i)
enddo
!print *, countNumCuda2 - countNumCuda, "Cuda Rate"
!print *, countNumFort2 - countNumFort, "Fortran Rate"
end program main
Thanks again!