I am trying to use cuBLAS to perform a simple matrix multiplication. I am using the following function:
#ifdef CUBLAS_API_H_
// Translate a cuBLAS status code into the spelling of its enumerator.
// Codes that are not listed below are reported as "<unknown>".
static const char *_cudaGetErrorEnum(cublasStatus_t error)
{
    // Start from the fallback text; a matching case overwrites it.
    const char *name = "<unknown>";

    switch (error)
    {
        case CUBLAS_STATUS_SUCCESS:
            name = "CUBLAS_STATUS_SUCCESS";
            break;
        case CUBLAS_STATUS_NOT_INITIALIZED:
            name = "CUBLAS_STATUS_NOT_INITIALIZED";
            break;
        case CUBLAS_STATUS_ALLOC_FAILED:
            name = "CUBLAS_STATUS_ALLOC_FAILED";
            break;
        case CUBLAS_STATUS_INVALID_VALUE:
            name = "CUBLAS_STATUS_INVALID_VALUE";
            break;
        case CUBLAS_STATUS_ARCH_MISMATCH:
            name = "CUBLAS_STATUS_ARCH_MISMATCH";
            break;
        case CUBLAS_STATUS_MAPPING_ERROR:
            name = "CUBLAS_STATUS_MAPPING_ERROR";
            break;
        case CUBLAS_STATUS_EXECUTION_FAILED:
            name = "CUBLAS_STATUS_EXECUTION_FAILED";
            break;
        case CUBLAS_STATUS_INTERNAL_ERROR:
            name = "CUBLAS_STATUS_INTERNAL_ERROR";
            break;
    }

    return name;
}
#endif
/**
 * Multiply two matrices through cuBLAS: C = A * B (alpha = 1, beta = 0).
 *
 * The call uses CUBLAS_OP_N for both operands with GEMM dimensions (m, n, k)
 * and leading dimensions lda = m, ldb = k, ldc = m, i.e. densely packed
 * column-major storage with A as m x k, B as k x n and C as m x n.
 *
 * @param handle  cuBLAS handle used for the call.
 * @param stream  CUDA stream bound to the handle before GEMM is issued.
 * @param A       Left operand (expected to be a device pointer).
 * @param B       Right operand (expected to be a device pointer).
 * @param C       Output matrix (expected to be a device pointer).
 * @param m       Rows of A and C.
 * @param k       Columns of A / rows of B.
 * @param n       Columns of B and C.
 *
 * Failures are reported on std::cout. If the stream cannot be bound to the
 * handle, GEMM is not issued, because it would otherwise run on whatever
 * stream the handle was previously associated with.
 */
void gpu_blas_mmul(cublasHandle_t &handle, cudaStream_t &stream, const real_t *A, const real_t *B, real_t *C, const int m, const int k, const int n) {
    const int lda = m, ldb = k, ldc = m;

    // Scalars are passed by address from host memory.
    // NOTE(review): this assumes the handle uses host pointer mode - confirm.
    const real_t alf = 1;
    const real_t bet = 0;

    // Bind the stream and check the result: an invalid handle shows up here
    // rather than as a less specific failure from GEMM.
    cublasStatus_t err = cublasSetStream(handle, stream);
    if (err != CUBLAS_STATUS_SUCCESS)
    {
        std::cout<<"CUBLAS err : "<<_cudaGetErrorEnum(err)<<"\n";
        return;
    }

    // Do the actual multiplication
    err = GEMM(handle, CUBLAS_OP_N, CUBLAS_OP_N, m, n, k, &alf, A, lda, B, ldb, &bet, C, ldc);
    if (err != CUBLAS_STATUS_SUCCESS)
    {
        std::cout<<"CUBLAS err : "<<_cudaGetErrorEnum(err)<<"\n";
    }
}
In a header file, GEMM and real_t are defined as:
// GEMM expands to the cuBLAS double-precision matrix-multiply routine.
#define GEMM cublasDgemm
// real_t is the scalar type used for matrix elements; it must stay in step
// with the precision of the routine selected by GEMM.
#define real_t double
The function is called like this:
// Handle i is paired with stream P/2-i-1; all three matrix dimensions are N.
gpu_blas_mmul(cublas[i], streams[P/2-i-1], A, B, C, N, N, N);
A, B and C are device memory locations and I am trying to multiply two NxN matrices (both stored in column-major format).
streams is an array of P/2 CUDA streams and cublas is an array of cuBLAS handles; i counts up from 0 to P/2-1. Both arrays contain valid streams and handles respectively (no errors were reported when creating them). I am compiling the code for sm_20 (compute capability 2.0), so double precision shouldn't be a problem.
The code works fine when called from one file; that section has its own cublasCreate and cublasDestroy calls. The same function, when called from another location, returns the error "CUBLAS_STATUS_ARCH_MISMATCH".
What could be wrong?
Thank you,
Thomas