Question

I am using the following code to take the transpose of the matrix but it gives me again the initial matrix.

---EDITED--------

Now , I am getting zeros for result matrix.

    int  main(int argc,char**argv)
{

    const int  rows=2,cols=2;

    //size in bytes
    const int ARRAY_BYTES = ( rows * cols ) *sizeof(int);

    float *A , *A_Copy;
    A = (float *) malloc(ARRAY_BYTES);
    A_Copy = (float *) malloc(ARRAY_BYTES);

    //initialize
    for (int i=0; i<rows; ++i)
        for (int  j=0; j<cols; ++j)
            A_Copy[i+rows*j] = 0;

            A[0]=0;
            A[1]=1;
            A[2]=2;
            A[3]=3;



    // print matrix
    printf("\nA matrix");
    for (int  i=0; i<rows; ++i) {
        for (int  j=0; j<cols; ++j)
        printf("\nA = %f",A[i+rows*j]);
    printf("\n");
        }

    float *A_dev , *A_Copy_dev;
    cudaMalloc((void **) &A_dev, ARRAY_BYTES);
    cudaMalloc((void **) &A_Copy_dev, ARRAY_BYTES);


    cudaMemcpy(A_dev, A, ARRAY_BYTES, cudaMemcpyHostToDevice);
    cudaMemcpy(A_Copy_dev, A_Copy, ARRAY_BYTES, cudaMemcpyHostToDevice);


    float const alpha(1.0);
    float const beta(0.0);

    cublasHandle_t handle;

    cublasStatus_t status;

    status = cublasCreate(&handle);

    cublasSetPointerMode(handle, CUBLAS_POINTER_MODE_HOST);
    //status = cublasSgeam(handle,CUBLAS_OP_T, CUBLAS_OP_N,  rows, cols , &alpha ,A_Copy ,cols , &beta , A_Copy , rows , A ,rows);
    status = cublasSgeam(handle,CUBLAS_OP_T, CUBLAS_OP_N,  rows, cols , &alpha ,A_Copy_dev ,rows , &beta ,A_Copy_dev ,rows, A_dev , rows);


    cudaMemcpy(A,A_dev, ARRAY_BYTES, cudaMemcpyDeviceToHost);

    printf("\nA transposed ");
    for (int  i=0; i<rows; ++i) {
        for (int  j=0; j<cols; ++j)
        printf("\nA = %f",A[i+rows*j]);
    printf("\n");
        }

    cudaFree(A_dev);
    cudaFree(A_Copy_dev);

    free(A);
    free(A_Copy);

    return 0;
}
Was it helpful?

Solution

CUBLAS expects input and output matrices to be allocated on the device. So in your case you should create device copies of A and A_Copy using cudaMalloc, and pass them to the function cublasSgeam.

Also, by default, alpha and beta should also be allocated on the device but cublas provides an option to use host pointers for these variables. All you have to do is to change the pointer mode of cublas handle before calling cublasSgeam.

cublasSetPointerMode(handle, CUBLAS_POINTER_MODE_HOST);

Update:

You are getting zeros, because you are initializing A_Copy with zeros and copying it to A_Copy_dev which is used as the A input matrix to the cublas function. So basically, you provide zeros input and get zeros output.

In the second cudaMemcpy call, instead of A_Copy, you should copy A to A_Copy_dev like this:

cudaMemcpy(A_Copy_dev, A, ARRAY_BYTES, cudaMemcpyHostToDevice);

There is no need of A_Copy in this code.

Licensed under: CC-BY-SA with attribution
Not affiliated with StackOverflow
scroll top