Question

I noticed that for each host-side call to the cublasSgemm function there are three kernel invocations: memset, scal_kernel, and the gemm kernel itself (e.g. sgemm_large). This happens even if I use constant alpha/beta allocated in device memory. While the overhead of memset and scal_kernel is relatively small, the problem is that memset is always launched in the default stream, which causes unnecessary synchronization.

The code:

#include <cublas_v2.h>
#include <thrust/device_vector.h>

__constant__ __device__ float alpha = 1;
__constant__ __device__ float beta = 1;

int main()
{
    // ... cuBLAS handle (cbh) creation and allocation of x, y, mat skipped ...
    float* px = thrust::raw_pointer_cast(x.data());
    float* py = thrust::raw_pointer_cast(y.data());
    float* pmat = thrust::raw_pointer_cast(mat.data());
    for (int iter = 0; iter < 3; ++iter)
    {
        cbstatus = cublasSgemm(cbh, CUBLAS_OP_N, CUBLAS_OP_N, crow, ccol, cshared, &alpha, px, crow, py, cshared, &beta, pmat, crow);
        assert(0 == cbstatus);
    }
}

This is what I see in profiler:

[Profiler timeline: the memset is launched in the default stream]

The question: is there a way to avoid the memset, or to make it run in the stream assigned to the cuBLAS handle? One idea is to use dynamic parallelism and call the device version of the gemm function, but that works only on compute capability 3.5 and higher (see the sketch below).
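
For what it's worth, a minimal sketch of that dynamic-parallelism route, assuming the device-side cuBLAS library (libcublas_device) that ships alongside CUDA 5.5/6.x; the kernel name and buffer arguments are placeholders, and the file would need to be built with -arch=sm_35 -rdc=true and linked with -lcublas_device -lcudadevrt:

#include <cublas_v2.h>

// Hypothetical sketch: one thread creates a device-side handle and issues
// the gemm from the GPU, so no host-side setup work lands in the default stream.
__global__ void gemmFromDevice(const float *d_A, const float *d_B, float *d_C, int n)
{
    if (threadIdx.x == 0 && blockIdx.x == 0) {
        cublasHandle_t h;
        cublasCreate(&h);                       // device-side handle
        float alpha = 1.f, beta = 1.f;          // scalars already reside in device memory here
        cublasSgemm(h, CUBLAS_OP_N, CUBLAS_OP_N, n, n, n,
                    &alpha, d_A, n, d_B, n, &beta, d_C, n);
        cublasDestroy(h);
    }
}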

Solution

There was a bug in cuBLAS 5.5 where a cudaMemset was used instead of cudaMemsetAsync in the specialized path taken when k >> m,n.

It is fixed in the cuBLAS 6.0 RC, which you can access if you are a registered developer.
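
The distinction matters because cudaMemset issues its work on the legacy default stream, which synchronizes against every other stream, while cudaMemsetAsync targets an explicit stream. A minimal illustration (buffer and function names are placeholders):

#include <cuda_runtime.h>

// Clearing a buffer two ways: the first call serializes with work in other
// streams; the second is enqueued in 'stream' and can overlap concurrent work.
void clearBuffer(float *d_C, size_t bytes, cudaStream_t stream)
{
    cudaMemset(d_C, 0, bytes);               // legacy default stream: blocks concurrency
    cudaMemsetAsync(d_C, 0, bytes, stream);  // runs in 'stream'
}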

By the way, I wonder why you use __constant__ __device__ for alpha and beta. Are you using pointerMode = DEVICE?

If not, you could simply keep alpha and beta on the host.
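
If device pointer mode is what you are after, note that taking &alpha on a __constant__ variable in host code does not yield a usable device address; you would set the pointer mode on the handle and fetch the symbols' device addresses first. A sketch under those assumptions (the helper function and its parameters mirror the question's names):

#include <cublas_v2.h>
#include <cuda_runtime.h>

__constant__ __device__ float alpha = 1;
__constant__ __device__ float beta  = 1;

// Hypothetical sketch: tell cuBLAS the scalars live in device memory and
// obtain real device addresses for the __constant__ symbols.
void gemmDevicePointers(cublasHandle_t handle, const float *px, const float *py,
                        float *pmat, int crow, int ccol, int cshared)
{
    float *d_alpha, *d_beta;
    cudaGetSymbolAddress((void**)&d_alpha, alpha);
    cudaGetSymbolAddress((void**)&d_beta,  beta);
    cublasSetPointerMode(handle, CUBLAS_POINTER_MODE_DEVICE);
    cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, crow, ccol, cshared,
                d_alpha, px, crow, py, cshared, d_beta, pmat, crow);
}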

OTHER TIPS

Try the code below. It is written so that the only cuBLAS operation is a cublasSgemm call, apart from unavoidable memory allocations and copies. You will see that

  1. You have only one kernel launched (gemm_kernel1x1_core);
  2. The two calls to cublasSgemm run concurrently in two different streams.

In the picture, the Visual Profiler timeline is shown.

My system: GeForce 540M, Windows 7, CUDA 5.5.

[Visual Profiler timeline: the two cublasSgemm calls run concurrently in stream1 and stream2]

#include <conio.h>
#include <stdio.h>
#include <assert.h>

#include <cublas_v2.h> 

/********************/
/* CUDA ERROR CHECK */
/********************/
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
    if (code != cudaSuccess) 
    {
        fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
        if (abort) { getchar(); exit(code); }
    }
}

/**********************/
/* cuBLAS ERROR CHECK */
/**********************/
#ifndef cublasSafeCall
#define cublasSafeCall(err)     __cublasSafeCall(err, __FILE__, __LINE__)
#endif

inline void __cublasSafeCall(cublasStatus_t err, const char *file, const int line)
{
    if( CUBLAS_STATUS_SUCCESS != err) {
        fprintf(stderr, "CUBLAS error in file '%s', line %d\n \nerror %d \nterminating!\n",__FILE__, __LINE__,err); 
        getch(); cudaDeviceReset(); assert(0); 
    }
}

/********/
/* MAIN */
/********/
int main()
{
    int N = 5;

    float *A1, *A2, *B1, *B2, *C1, *C2;
    float *d_A1, *d_A2, *d_B1, *d_B2, *d_C1, *d_C2;

    A1 = (float*)malloc(N*N*sizeof(float));
    B1 = (float*)malloc(N*N*sizeof(float));
    C1 = (float*)malloc(N*N*sizeof(float));

    A2 = (float*)malloc(N*N*sizeof(float));
    B2 = (float*)malloc(N*N*sizeof(float));
    C2 = (float*)malloc(N*N*sizeof(float));

    gpuErrchk(cudaMalloc((void**)&d_A1,N*N*sizeof(float)));
    gpuErrchk(cudaMalloc((void**)&d_B1,N*N*sizeof(float)));
    gpuErrchk(cudaMalloc((void**)&d_C1,N*N*sizeof(float)));
    gpuErrchk(cudaMalloc((void**)&d_A2,N*N*sizeof(float)));
    gpuErrchk(cudaMalloc((void**)&d_B2,N*N*sizeof(float)));
    gpuErrchk(cudaMalloc((void**)&d_C2,N*N*sizeof(float)));

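    // Fill the host inputs with random values in [0, 1]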
    for (int i=0; i<N*N; i++) {
        A1[i] = ((float)rand()/(float)RAND_MAX);
        A2[i] = ((float)rand()/(float)RAND_MAX);
        B1[i] = ((float)rand()/(float)RAND_MAX);
        B2[i] = ((float)rand()/(float)RAND_MAX);
    }
    gpuErrchk(cudaMemcpy(d_A1, A1, N*N*sizeof(float), cudaMemcpyHostToDevice));
    gpuErrchk(cudaMemcpy(d_B1, B1, N*N*sizeof(float), cudaMemcpyHostToDevice));
    gpuErrchk(cudaMemcpy(d_A2, A2, N*N*sizeof(float), cudaMemcpyHostToDevice));
    gpuErrchk(cudaMemcpy(d_B2, B2, N*N*sizeof(float), cudaMemcpyHostToDevice));

    cublasHandle_t handle;
    cublasSafeCall(cublasCreate(&handle));

    cudaStream_t stream1, stream2;
    gpuErrchk(cudaStreamCreate(&stream1));
    gpuErrchk(cudaStreamCreate(&stream2));

    float alpha = 1.f;
    float beta = 1.f;

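    // Bind the handle to stream1 for the first gemm, then to stream2 for the
    // second, so the two calls can execute concurrently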
    cublasSafeCall(cublasSetStream(handle,stream1));
    cublasSafeCall(cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, N, N, N, &alpha, d_A1, N, d_B1, N, &beta, d_C1, N));
    cublasSafeCall(cublasSetStream(handle,stream2));
    cublasSafeCall(cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, N, N, N, &alpha, d_A2, N, d_B2, N, &beta, d_C2, N));

    // Wait for both streams to finish, then clean up before resetting the device
    gpuErrchk(cudaDeviceSynchronize());

    cublasSafeCall(cublasDestroy(handle));
    gpuErrchk(cudaStreamDestroy(stream1));
    gpuErrchk(cudaStreamDestroy(stream2));

    gpuErrchk(cudaDeviceReset());

    return 0;

}
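
For reference, on a compute capability 2.1 card such as the GeForce 540M above, the example builds with something like the following (filename hypothetical):

nvcc -arch=sm_21 gemm_streams.cu -lcublas
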
Licensed under: CC-BY-SA with attribution
Not affiliated with StackOverflow