Question

I can't get cublasStrsmBatched (the call near the end of TestCublasStrsmBatched below) to run without it returning CUBLAS_STATUS_EXECUTION_FAILED (13). To simplify, all matrix values and alpha are 1.0, all matrices are square, and lda, ldb, m and n are equal. I can run cublasSgemmBatched and cublasStrsm the same way with no error, so cublasStrsmBatched should work too, but it doesn't for me. Does anyone have an idea what I am doing wrong in this code?

#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>
#include <cublas_v2.h>

cublasHandle_t handle;

void CheckCublasCreate(cublasStatus_t status);
void CheckAllocateHost(void* h_pointer);
void CheckCudaMalloc(cudaError_t d_allocStatus);
void CheckCudaMemcpy( cudaError_t error );
void CheckCublasSetGetMatrix(cublasStatus_t status);
void CheckKernelExecution(cublasStatus_t status);
void CheckCublasDestroy(cublasStatus_t status);

void TestCublasStrsmBatched(int size, int numOfLinSys);

int main()
{
    cublasStatus_t status = cublasCreate(&handle);
    CheckCublasCreate(status);

    /*arguments are size of square matrix 
    and number of linear systems*/
    TestCublasStrsmBatched(2,2);

    status = cublasDestroy(handle);
    CheckCublasDestroy(status);
}

void TestCublasStrsmBatched(int size, int numOfLinSys)
{
    cublasStatus_t status;
    cudaError_t error;
    float **h_A;
    float **d_A;
    float **h_B;
    float **d_B;
    float **hd_A;
    float **hd_B;
    float *alpha;

    const int n = size;
    const int m = size;
    const int lda=m;
    const int ldb=m;
    const int matA_numOfElem = m*m;
    const int matB_numOfElem = m*n;

    int i,j;

    h_A = (float **)malloc(numOfLinSys * sizeof(float*));
    CheckAllocateHost(h_A);

    h_B = (float **)malloc(numOfLinSys * sizeof(float*));
    CheckAllocateHost(h_B);

    alpha = (float *)malloc(sizeof(float));
    CheckAllocateHost(alpha);
    *alpha = 1.0;

    for (j=0; j<numOfLinSys; j++){
        h_A[j] = (float *)malloc(matA_numOfElem * sizeof(float));
        CheckAllocateHost(h_A[j]);
        for (i=0; i < matA_numOfElem; i++) 
            h_A[j][i] = 1.0;

        h_B[j] = (float *)malloc(matB_numOfElem * sizeof(float));
        CheckAllocateHost(h_B[j]);
        for (i=0; i < matB_numOfElem; i++)
            h_B[j][i] = 1.0;
        }

    hd_A = (float **)malloc(numOfLinSys * sizeof(float*));
    CheckAllocateHost(hd_A);

    hd_B = (float **)malloc(numOfLinSys * sizeof(float*));
    CheckAllocateHost(hd_B);

    for (j=0; j<numOfLinSys; j++){
        error = cudaMalloc((void **)&hd_A[j], 
                           matA_numOfElem * sizeof(float));
        CheckCudaMalloc(error);

        error = cudaMalloc((void **)&hd_B[j], 
                           matB_numOfElem * sizeof(float));
        CheckCudaMalloc(error);

        status = cublasSetMatrix(m, m, sizeof(float), 
                                 h_A[j], lda, hd_A[j], lda);
        CheckCublasSetGetMatrix(status);

        status = cublasSetMatrix(m, n, sizeof(float), 
                                 h_B[j], ldb, hd_B[j], ldb);
        CheckCublasSetGetMatrix(status);
        }

    error = cudaMalloc((void **)&d_A, numOfLinSys * sizeof(float*));
    CheckCudaMalloc(error);

    error = cudaMalloc((void **)&d_B, numOfLinSys * sizeof(float*));
    CheckCudaMalloc(error);

    error = cudaMemcpy(d_A, hd_A, numOfLinSys * sizeof(float*), 
                       cudaMemcpyHostToDevice);
    CheckCudaMemcpy(error);

    error = cudaMemcpy(d_B, hd_B, numOfLinSys * sizeof(float*), 
                       cudaMemcpyHostToDevice);
    CheckCudaMemcpy(error);

    /*After cublasStrsmBatched call 
    status changes to CUBLAS_STATUS_EXECUTION_FAILED (13)*/
    status = cublasStrsmBatched(handle,
                                CUBLAS_SIDE_LEFT, CUBLAS_FILL_MODE_LOWER,
                                CUBLAS_OP_N, CUBLAS_DIAG_NON_UNIT,
                                m, n, alpha, d_A, lda, d_B, ldb, numOfLinSys);
    CheckKernelExecution(status);
}


void CheckCublasCreate( cublasStatus_t status )
{
    if (status != CUBLAS_STATUS_SUCCESS){
        fprintf(stderr, 
                "!!!! CUBLAS initialization error \n");
        exit(EXIT_FAILURE);
        }
}

void CheckAllocateHost( void* h_pointer )
{
    if (h_pointer == 0){
        fprintf(stderr, 
                "!!!! host memory allocation error \n");
        exit(EXIT_FAILURE);
        }
}

void CheckCudaMalloc( cudaError_t error )
{
    if (error != cudaSuccess){
        fprintf(stderr, 
                "!!!! device memory allocation error (error code %s)\n", 
                cudaGetErrorString(error));
        exit(EXIT_FAILURE);
        }
}

void CheckCudaMemcpy( cudaError_t error )
{
    if (error != cudaSuccess){
        fprintf(stderr, "!!!! data copy error (error code %s)\n", 
                cudaGetErrorString(error));
        exit(EXIT_FAILURE);
        }
}

void CheckCublasSetGetMatrix( cublasStatus_t status )
{
    if (status != CUBLAS_STATUS_SUCCESS){
        fprintf(stderr, "!!!! device access error \n");
        exit(EXIT_FAILURE);
        }
}

void CheckKernelExecution( cublasStatus_t status )
{
    if (status != CUBLAS_STATUS_SUCCESS){
        fprintf(stderr, "!!!! kernel execution error.\n");
        exit(EXIT_FAILURE);
        }
}

void CheckCublasDestroy( cublasStatus_t status )
{
    if (status != CUBLAS_STATUS_SUCCESS){
        fprintf(stderr, "!!!! shutdown error \n");
        exit(EXIT_FAILURE);
        }
}

Tested on Linux (CUDA 5.5, Tesla T10) and on Windows (CUDA 5.5, GTX 285).

Thanks!


Solution

The batched triangular backsolver is something I hadn't tried before in CUBLAS, so I was interested to take a look and see what might be going on. Your code is rather complex, so I didn't bother trying to understand it, but when I ran it, it appeared to be failing with an internal CUBLAS launch failure:

$ cuda-memcheck ./a.out
========= CUDA-MEMCHECK
!!!! kernel execution error.
========= Program hit error 8 on CUDA API call to cudaLaunch 
=========     Saved host backtrace up to driver entry point at error
=========     Host Frame:/Library/Frameworks/CUDA.framework/Versions/A/Libraries/libcuda_256.00.35.dylib (cudbgGetAPIVersion + 0x27bd7) [0x4538e7]
=========     Host Frame:/usr/local/cuda/lib/libcudart.dylib (cudaLaunch + 0x26c) [0x45c8c]
=========     Host Frame:/usr/local/cuda/lib/libcublas.dylib (cublasZgetrfBatched + 0x1e34) [0x196ae4]
=========     Host Frame:/usr/local/cuda/lib/libcublas.dylib (cublasCtrsmBatched + 0x64d) [0x1974cd]
=========     Host Frame:/usr/local/cuda/lib/libcublas.dylib (cublasCtrsmBatched + 0xacb) [0x19794b]
=========     Host Frame:/Users/talonmies/./a.out (_Z22TestCublasStrsmBatchedii + 0x3c1) [0x1b28]
=========     Host Frame:/Users/talonmies/./a.out (main + 0x3d) [0x1b7d]
=========     Host Frame:/Users/talonmies/./a.out (start + 0x35) [0x14e9]
=========     Host Frame:[0x1]

(This is an OS X machine with a compute 1.2 GPU and CUDA 5.0.) Error 8 is cudaErrorInvalidDeviceFunction, which usually only comes up when a library or fatbinary doesn't contain an architecture which matches your GPU, or one which can be JIT recompiled into something your GPU can run.
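If you want to confirm what your own GPU reports, a minimal standalone check (assuming device 0 is the card in question) would be something like:

#include <stdio.h>
#include <cuda_runtime.h>

int main(void)
{
    /* Query device 0 and print its compute capability, which can be
       compared against the architectures a library ships code for. */
    cudaDeviceProp prop;
    cudaError_t error = cudaGetDeviceProperties(&prop, 0);
    if (error != cudaSuccess){
        fprintf(stderr, "cudaGetDeviceProperties failed: %s\n",
                cudaGetErrorString(error));
        return 1;
    }
    printf("%s: compute capability %d.%d\n", prop.name, prop.major, prop.minor);
    return 0;
}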

Intrigued, I wrote my own much simpler repro case from scratch:

#include <iostream>
#include <cuda_runtime.h>
#include <cublas_v2.h>

int main(void)
{
    const int Neq = 5, Nrhs = 2, Nsys = 4;

    /* Stored row-major in C, this memory reads as a lower triangular
       matrix when CUBLAS interprets it column-major with lda = Neq. */
    float Atri[Neq][Neq] = 
        { { 1,  6, 11, 16, 21},
          { 0,  7, 12, 17, 22},
          { 0,  0, 13, 18, 23},
          { 0,  0,  0, 19, 24},
          { 0,  0,  0,  0, 25} };

    /* Each C row is one right-hand-side column (ldb = Neq). */
    float B[Nrhs][Neq] = 
        { {  1,  27, 112, 290, 595},
          {  2,  40, 148, 360, 710} };


    float *syslhs[Nsys], *sysrhs[Nsys];
    float *A_, *B_, **syslhs_, **sysrhs_;

    size_t Asz = sizeof(float) * (size_t)(Neq * Neq);
    size_t Bsz = sizeof(float) * (size_t)(Neq * Nrhs);

    cudaMalloc((void **)(&A_), Asz);
    cudaMalloc((void **)(&B_), Bsz * size_t(Nsys));

    cudaMemcpy(A_, Atri, Asz, cudaMemcpyHostToDevice);
    for(int i=0; i<Nsys; i++) {
        syslhs[i] = A_;
        sysrhs[i] = (float*)((char *)B_ + i*Bsz);
        cudaMemcpy(sysrhs[i], B, Bsz, cudaMemcpyHostToDevice);
    }

    size_t syssz = sizeof(float *) * (size_t)Nsys;
    cudaMalloc((void **)&syslhs_, syssz);
    cudaMalloc((void **)&sysrhs_, syssz);
    cudaMemcpy(syslhs_, syslhs, syssz, cudaMemcpyHostToDevice);
    cudaMemcpy(sysrhs_, sysrhs, syssz, cudaMemcpyHostToDevice);

    const cublasSideMode_t side = CUBLAS_SIDE_LEFT;
    const cublasDiagType_t diag = CUBLAS_DIAG_NON_UNIT;
    const cublasFillMode_t ulo = CUBLAS_FILL_MODE_LOWER;
    const cublasOperation_t trans = CUBLAS_OP_N;
    float alpha = 1.f;

    cublasHandle_t handle;
    cublasCreate(&handle);

    cublasStrsmBatched(
                handle,
                side, ulo, trans, diag,
                Neq, Nrhs,
                &alpha, 
                syslhs_, Neq,
                sysrhs_, Neq,
                Nsys
                );


    for(int k=0; k<Nsys; k++) {
        cudaMemcpy(B, sysrhs[k], Bsz, cudaMemcpyDeviceToHost);
        for(int i=0; i<Nrhs; i++) {
            for(int j=0; j<Neq; j++) {
                std::cout << B[i][j] << ",";
            }
            std::cout << std::endl;
        }
        std::cout << std::endl;
    }

    return 0;
} 
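For reference, forward substitution on the values above gives {1, 3, 5, 7, 9} for the first right-hand-side column and {2, 4, 6, 8, 10} for the second, so that is what each of the four systems should print when the call succeeds; when it fails, the copy-back just returns the unmodified input.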

This also fails in the same way as your code. At first inspection it really does seem to be a CUBLAS internal problem, although it is very difficult to say what. About the only thing I can think of is that these solvers are not supported on compute 1.x devices, but the documentation fails to mention any such restriction. Between us we have tested compute 1.2, compute 1.3, and compute 3.0 [error on my part, I read K10, not T10, in your question] devices, so there isn't much else left...

All I can suggest is running your code with cuda-memcheck to see whether it reports the same error. If it does, I see a bug report to NVIDIA in your future.
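In the meantime, if you need results on these GPUs, one workaround sketch (untested on your hardware, reusing the hd_A/hd_B pointer arrays and the check helpers from your code) is to fall back to the non-batched cublasStrsm you already have working, looping over the batch:

    for (j = 0; j < numOfLinSys; j++){
        /* Solve each system individually with the non-batched routine;
           hd_A[j] and hd_B[j] are the per-system device pointers. */
        status = cublasStrsm(handle,
                             CUBLAS_SIDE_LEFT, CUBLAS_FILL_MODE_LOWER,
                             CUBLAS_OP_N, CUBLAS_DIAG_NON_UNIT,
                             m, n, alpha, hd_A[j], lda, hd_B[j], ldb);
        CheckKernelExecution(status);
        }

This gives up the kernel-launch batching, so it will be slower for many small systems, but it should at least produce correct answers on the affected devices.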


EDIT: I flagrantly disregarded the EULA and used cuobjdump to explore the cubin payloads in the CUDA 5 CUBLAS library. For the single-precision batched trsm routines I found cubins for:

  • 32 bit sm_20
  • 32 bit sm_30
  • 32 bit sm_35
  • 64 bit sm_20
  • 64 bit sm_30
  • 64 bit sm_35

There are clearly no sm_1x cubins in the library, so my compute_12 device should produce the runtime library error I see. It also explains your error with the GTX 285 and Tesla T10, which are both compute_13.
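If you want to repeat the check against your own installation, something along these lines should list the embedded device ELF images and their sm targets (the library path is an assumption; it varies by platform and CUDA version):

$ cuobjdump --list-elf /usr/local/cuda/lib64/libcublas.so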


EDIT2:

As suspected, my repro code runs perfectly on a Linux system with a compute_30 device, under both the CUDA 5.0 and CUDA 5.5 release libraries.
