Matrix inverse usng linear system solver through cublas, cublasCreate exception or something else

https://stackoverflow.com/questions/17131415

31-05-2022
|

Question

I'm trying to inverse a matrix using linear equation solver through cublas CUDA library.

The original equation is in form of:

Ax  = B  = I 

I - identity matrix 
A - The matrix I'm trying to inverse 
x - (hopefully) the inverse(A) matrix

I'd like to perform LU decomposition, receive the following:

LUx = B 

L - is a lower triangle matrix 
U - is a upper triangle matrix

Here is a good example who can show what I'm trying to do:

http://www.youtube.com/watch?v=dza5JTvMpzk

For the sake of discussion, let's assume I already have LU decomposition of A. (A = LU). I'm trying to find the inverse in a two steps series:

Ax = B = I 

LUx = B = I 

1st step:  Declare y: 

**Ly  = I** 

solve 1st linear equation 

2nd step, Solve the following linear equation

**Ux = y** 

find X = inverse(A) - *Bingo!@!*

For now I have no idea what am I doing wrong here. There might be two assumptions, either I'm not using cublas properly or cublas throws an exception for no reason.

See my code attached, there is Matlab code at the end:

     #include "cuda_runtime.h" 
     #include "device_launch_parameters.h"

     #include <stdio.h>


     //#include "cublas.h"
     #include "cublas_v2.h"


 int main()
 {

cudaError_t cudaStat;
cublasStatus_t stat;
cublasHandle_t handle;

//cublasInit();

stat = cublasCreate(&handle);
if (stat != CUBLAS_STATUS_SUCCESS)
{
    printf ("CUBLAS initialization failed\n");
    return -1;
}

unsigned int size = 3; 
// Allowcate device matrixes 
float* p_h_LowerTriangle; 
float* p_d_LowerTriangle; 

p_h_LowerTriangle = new float[size*size];
p_h_LowerTriangle[0] = 1.f;  
p_h_LowerTriangle[1] = 0.f;
p_h_LowerTriangle[2] = 0.f;
p_h_LowerTriangle[3] = 2.56f;
p_h_LowerTriangle[4] = 1.f; 
p_h_LowerTriangle[5] = 0.f; 
p_h_LowerTriangle[6] = 5.76f; 
p_h_LowerTriangle[7] = 3.5f;
p_h_LowerTriangle[8] = 1.f;

float* p_h_UpperTriangle; 
float* p_d_UpperTriangle; 

p_h_UpperTriangle = new float[size*size];
p_h_UpperTriangle[0] = 25.f;  
p_h_UpperTriangle[1] = 5.f;
p_h_UpperTriangle[2] = 1.f;
p_h_UpperTriangle[3] = 0.f;
p_h_UpperTriangle[4] = -4.8f; 
p_h_UpperTriangle[5] = -1.56f; 
p_h_UpperTriangle[6] = 0.f; 
p_h_UpperTriangle[7] = 0.f;
p_h_UpperTriangle[8] = 0.7f;

float* p_h_IdentityMatrix; 
float* p_d_IdentityMatrix; 

p_h_IdentityMatrix = new float[size*size];
p_h_IdentityMatrix[0] = 1.f;  
p_h_IdentityMatrix[1] = 0.f;
p_h_IdentityMatrix[2] = 0.f;
p_h_IdentityMatrix[3] = 0.f;
p_h_IdentityMatrix[4] = 1.f; 
p_h_IdentityMatrix[5] = 0.f; 
p_h_IdentityMatrix[6] = 0.f; 
p_h_IdentityMatrix[7] = 0.f;
p_h_IdentityMatrix[8] = 1.f;

cudaMalloc ((void**)&p_d_LowerTriangle, size*size*sizeof(float));
cudaMalloc ((void**)&p_d_UpperTriangle, size*size*sizeof(float));
cudaMalloc ((void**)&p_d_IdentityMatrix, size*size*sizeof(float));


float* p_d_tempMatrix; 
cudaMalloc ((void**)&p_d_tempMatrix, size*size*sizeof(float));


stat = cublasSetMatrix(size,size,sizeof(float),p_h_LowerTriangle,size,p_d_LowerTriangle,size);
stat = cublasSetMatrix(size,size,sizeof(float),p_h_UpperTriangle,size,p_d_UpperTriangle,size);
stat = cublasSetMatrix(size,size,sizeof(float),p_h_IdentityMatrix,size,p_d_IdentityMatrix,size);

cudaDeviceSynchronize();

// 1st phase - solve Ly = b 
const float alpha  = 1.f;

// Function solves the triangulatr linear system with multiple right hand sides, function overrides b as a result 
stat =  cublasStrsm(handle,
    cublasSideMode_t::CUBLAS_SIDE_LEFT, 
    cublasFillMode_t::CUBLAS_FILL_MODE_LOWER,
    cublasOperation_t::CUBLAS_OP_N,
    CUBLAS_DIAG_UNIT,
    size,
    size,
    &alpha,
    p_d_LowerTriangle,
    size,
    p_d_IdentityMatrix,
    size);

////////////////////////////////////
// TODO: printf 1st phase the results 
cudaDeviceSynchronize();

cudaMemcpy(p_h_IdentityMatrix,p_d_IdentityMatrix,size*size*sizeof(float),cudaMemcpyDeviceToHost);
cudaMemcpy(p_h_UpperTriangle,p_d_UpperTriangle,size*size*sizeof(float),cudaMemcpyDeviceToHost);
cudaMemcpy(p_h_LowerTriangle,p_d_LowerTriangle,size*size*sizeof(float),cudaMemcpyDeviceToHost);

stat =cublasGetMatrix(size,size,sizeof(float),p_d_IdentityMatrix,size,p_h_IdentityMatrix,size);
stat =cublasGetMatrix(size,size,sizeof(float),p_d_UpperTriangle,size,p_h_UpperTriangle,size);
stat =cublasGetMatrix(size,size,sizeof(float),p_d_LowerTriangle,size,p_h_LowerTriangle,size);

////////////////////////////////////

// 2nd phase - solve Ux = b
stat =  cublasStrsm(handle,
    cublasSideMode_t::CUBLAS_SIDE_LEFT, 
    cublasFillMode_t::CUBLAS_FILL_MODE_UPPER,
    cublasOperation_t::CUBLAS_OP_N,
    CUBLAS_DIAG_NON_UNIT,
    size,
    size,
    &alpha,
    p_d_UpperTriangle,
    size,
    p_d_IdentityMatrix,
    size);

// TODO print the results 
cudaDeviceSynchronize();

////////////////////////////////////
cudaMemcpy(p_h_IdentityMatrix,p_d_IdentityMatrix,size*size*sizeof(float),cudaMemcpyDeviceToHost);
cudaMemcpy(p_h_UpperTriangle,p_d_UpperTriangle,size*size*sizeof(float),cudaMemcpyDeviceToHost);
cudaMemcpy(p_h_LowerTriangle,p_d_LowerTriangle,size*size*sizeof(float),cudaMemcpyDeviceToHost);
////////////////////////////////////

cublasDestroy(handle);
//cublasShutdown();

cudaFree(p_d_LowerTriangle);
cudaFree(p_d_UpperTriangle);
cudaFree(p_d_IdentityMatrix);

free(p_h_LowerTriangle);
free(p_h_UpperTriangle);
free(p_h_IdentityMatrix);


return 0;
}

Matlab code - works perfect:

  clear all

   UpperMatrix  = [ 25 5 1  ;  0 -4.8  -1.56  ;  0 0 0.7 ]

   LowerMatrix  = [1 0 0 ; 2.56 1 0 ; 5.76 3.5 1  ]

   IdentityMatrix = eye(3)

    % 1st phase solution 
    otps1.LT = true;
    y = linsolve(LowerMatrix,IdentityMatrix,otps1)

    %2nd phase solution 
    otps2.UT = true;
    y = linsolve(UpperMatrix,y,otps2)

MATLAB output

UpperMatrix =

25.0000    5.0000    1.0000
      0   -4.8000   -1.5600
      0         0    0.7000

LowerMatrix =

1.0000         0         0
2.5600    1.0000         0
5.7600    3.5000    1.0000

IdentityMatrix =

 1     0     0
 0     1     0
 0     0     1

y =

 1.0000         0         0
-2.5600    1.0000         0
 3.2000   -3.5000    1.0000

y =

 0.0476   -0.0833    0.0357
-0.9524    1.4167   -0.4643
 4.5714   -5.0000    1.4286

UpperMatrix =

25.0000    5.0000    1.0000
      0   -4.8000   -1.5600
      0         0    0.7000

LowerMatrix =

1.0000         0         0
2.5600    1.0000         0
5.7600    3.5000    1.0000

IdentityMatrix =

 1     0     0
 0     1     0
 0     0     1

y =

 1.0000         0         0
-2.5600    1.0000         0
 3.2000   -3.5000    1.0000

>> 
>> 
>> 
>> Inverse_LU_UT

UpperMatrix =

25.0000    5.0000    1.0000
      0   -4.8000   -1.5600
      0         0    0.7000

LowerMatrix =

1.0000         0         0
2.5600    1.0000         0
5.7600    3.5000    1.0000

IdentityMatrix =

 1     0     0
 0     1     0
 0     0     1

y =

 1.0000         0         0
-2.5600    1.0000         0
 3.2000   -3.5000    1.0000

y =

 0.0476   -0.0833    0.0357
-0.9524    1.4167   -0.4643
 4.5714   -5.0000    1.4286

I have no idea what am I doing wrong here, I suspect the cublasCreate operation. Every time I hit the command:

  cublasStatus_t stat;
  cublasHandle_t handle;
  stat = cublasCreate(&handle);

stat and handle variables are both valid.

But VS10 output several error messages specified the following:

First chance exception at 0x... microsoft C++ exception cudaError_enum at memory location 0x...

Some may say it's an internal cublas error message, handled by the library itself.

Solution

You know what, cuBlas stores matrix in column-major way, but Matlab and C use row-major way.

Rearrange your initialization and run. Result should be good.

Licensed under: CC-BY-SA with attribution

Not affiliated with StackOverflow