You know what, cuBlas stores matrix in column-major way, but Matlab and C use row-major way.
Rearrange your initialization and run. Result should be good.
Question
I'm trying to inverse a matrix using linear equation solver through cublas CUDA library.
The original equation is in form of:
Ax = B = I
I - identity matrix
A - The matrix I'm trying to inverse
x - (hopefully) the inverse(A) matrix
I'd like to perform LU decomposition, receive the following:
LUx = B
L - is a lower triangle matrix
U - is a upper triangle matrix
Here is a good example who can show what I'm trying to do:
http://www.youtube.com/watch?v=dza5JTvMpzk
For the sake of discussion, let's assume I already have LU decomposition of A. (A = LU). I'm trying to find the inverse in a two steps series:
Ax = B = I
LUx = B = I
1st step: Declare y:
**Ly = I**
solve 1st linear equation
2nd step, Solve the following linear equation
**Ux = y**
find X = inverse(A) - *Bingo!@!*
For now I have no idea what am I doing wrong here. There might be two assumptions, either I'm not using cublas properly or cublas throws an exception for no reason.
See my code attached, there is Matlab code at the end:
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
//#include "cublas.h"
#include "cublas_v2.h"
int main()
{
cudaError_t cudaStat;
cublasStatus_t stat;
cublasHandle_t handle;
//cublasInit();
stat = cublasCreate(&handle);
if (stat != CUBLAS_STATUS_SUCCESS)
{
printf ("CUBLAS initialization failed\n");
return -1;
}
unsigned int size = 3;
// Allowcate device matrixes
float* p_h_LowerTriangle;
float* p_d_LowerTriangle;
p_h_LowerTriangle = new float[size*size];
p_h_LowerTriangle[0] = 1.f;
p_h_LowerTriangle[1] = 0.f;
p_h_LowerTriangle[2] = 0.f;
p_h_LowerTriangle[3] = 2.56f;
p_h_LowerTriangle[4] = 1.f;
p_h_LowerTriangle[5] = 0.f;
p_h_LowerTriangle[6] = 5.76f;
p_h_LowerTriangle[7] = 3.5f;
p_h_LowerTriangle[8] = 1.f;
float* p_h_UpperTriangle;
float* p_d_UpperTriangle;
p_h_UpperTriangle = new float[size*size];
p_h_UpperTriangle[0] = 25.f;
p_h_UpperTriangle[1] = 5.f;
p_h_UpperTriangle[2] = 1.f;
p_h_UpperTriangle[3] = 0.f;
p_h_UpperTriangle[4] = -4.8f;
p_h_UpperTriangle[5] = -1.56f;
p_h_UpperTriangle[6] = 0.f;
p_h_UpperTriangle[7] = 0.f;
p_h_UpperTriangle[8] = 0.7f;
float* p_h_IdentityMatrix;
float* p_d_IdentityMatrix;
p_h_IdentityMatrix = new float[size*size];
p_h_IdentityMatrix[0] = 1.f;
p_h_IdentityMatrix[1] = 0.f;
p_h_IdentityMatrix[2] = 0.f;
p_h_IdentityMatrix[3] = 0.f;
p_h_IdentityMatrix[4] = 1.f;
p_h_IdentityMatrix[5] = 0.f;
p_h_IdentityMatrix[6] = 0.f;
p_h_IdentityMatrix[7] = 0.f;
p_h_IdentityMatrix[8] = 1.f;
cudaMalloc ((void**)&p_d_LowerTriangle, size*size*sizeof(float));
cudaMalloc ((void**)&p_d_UpperTriangle, size*size*sizeof(float));
cudaMalloc ((void**)&p_d_IdentityMatrix, size*size*sizeof(float));
float* p_d_tempMatrix;
cudaMalloc ((void**)&p_d_tempMatrix, size*size*sizeof(float));
stat = cublasSetMatrix(size,size,sizeof(float),p_h_LowerTriangle,size,p_d_LowerTriangle,size);
stat = cublasSetMatrix(size,size,sizeof(float),p_h_UpperTriangle,size,p_d_UpperTriangle,size);
stat = cublasSetMatrix(size,size,sizeof(float),p_h_IdentityMatrix,size,p_d_IdentityMatrix,size);
cudaDeviceSynchronize();
// 1st phase - solve Ly = b
const float alpha = 1.f;
// Function solves the triangulatr linear system with multiple right hand sides, function overrides b as a result
stat = cublasStrsm(handle,
cublasSideMode_t::CUBLAS_SIDE_LEFT,
cublasFillMode_t::CUBLAS_FILL_MODE_LOWER,
cublasOperation_t::CUBLAS_OP_N,
CUBLAS_DIAG_UNIT,
size,
size,
&alpha,
p_d_LowerTriangle,
size,
p_d_IdentityMatrix,
size);
////////////////////////////////////
// TODO: printf 1st phase the results
cudaDeviceSynchronize();
cudaMemcpy(p_h_IdentityMatrix,p_d_IdentityMatrix,size*size*sizeof(float),cudaMemcpyDeviceToHost);
cudaMemcpy(p_h_UpperTriangle,p_d_UpperTriangle,size*size*sizeof(float),cudaMemcpyDeviceToHost);
cudaMemcpy(p_h_LowerTriangle,p_d_LowerTriangle,size*size*sizeof(float),cudaMemcpyDeviceToHost);
stat =cublasGetMatrix(size,size,sizeof(float),p_d_IdentityMatrix,size,p_h_IdentityMatrix,size);
stat =cublasGetMatrix(size,size,sizeof(float),p_d_UpperTriangle,size,p_h_UpperTriangle,size);
stat =cublasGetMatrix(size,size,sizeof(float),p_d_LowerTriangle,size,p_h_LowerTriangle,size);
////////////////////////////////////
// 2nd phase - solve Ux = b
stat = cublasStrsm(handle,
cublasSideMode_t::CUBLAS_SIDE_LEFT,
cublasFillMode_t::CUBLAS_FILL_MODE_UPPER,
cublasOperation_t::CUBLAS_OP_N,
CUBLAS_DIAG_NON_UNIT,
size,
size,
&alpha,
p_d_UpperTriangle,
size,
p_d_IdentityMatrix,
size);
// TODO print the results
cudaDeviceSynchronize();
////////////////////////////////////
cudaMemcpy(p_h_IdentityMatrix,p_d_IdentityMatrix,size*size*sizeof(float),cudaMemcpyDeviceToHost);
cudaMemcpy(p_h_UpperTriangle,p_d_UpperTriangle,size*size*sizeof(float),cudaMemcpyDeviceToHost);
cudaMemcpy(p_h_LowerTriangle,p_d_LowerTriangle,size*size*sizeof(float),cudaMemcpyDeviceToHost);
////////////////////////////////////
cublasDestroy(handle);
//cublasShutdown();
cudaFree(p_d_LowerTriangle);
cudaFree(p_d_UpperTriangle);
cudaFree(p_d_IdentityMatrix);
free(p_h_LowerTriangle);
free(p_h_UpperTriangle);
free(p_h_IdentityMatrix);
return 0;
}
Matlab code - works perfect:
clear all
UpperMatrix = [ 25 5 1 ; 0 -4.8 -1.56 ; 0 0 0.7 ]
LowerMatrix = [1 0 0 ; 2.56 1 0 ; 5.76 3.5 1 ]
IdentityMatrix = eye(3)
% 1st phase solution
otps1.LT = true;
y = linsolve(LowerMatrix,IdentityMatrix,otps1)
%2nd phase solution
otps2.UT = true;
y = linsolve(UpperMatrix,y,otps2)
UpperMatrix =
25.0000 5.0000 1.0000
0 -4.8000 -1.5600
0 0 0.7000
LowerMatrix =
1.0000 0 0
2.5600 1.0000 0
5.7600 3.5000 1.0000
IdentityMatrix =
1 0 0
0 1 0
0 0 1
y =
1.0000 0 0
-2.5600 1.0000 0
3.2000 -3.5000 1.0000
y =
0.0476 -0.0833 0.0357
-0.9524 1.4167 -0.4643
4.5714 -5.0000 1.4286
UpperMatrix =
25.0000 5.0000 1.0000
0 -4.8000 -1.5600
0 0 0.7000
LowerMatrix =
1.0000 0 0
2.5600 1.0000 0
5.7600 3.5000 1.0000
IdentityMatrix =
1 0 0
0 1 0
0 0 1
y =
1.0000 0 0
-2.5600 1.0000 0
3.2000 -3.5000 1.0000
>>
>>
>>
>> Inverse_LU_UT
UpperMatrix =
25.0000 5.0000 1.0000
0 -4.8000 -1.5600
0 0 0.7000
LowerMatrix =
1.0000 0 0
2.5600 1.0000 0
5.7600 3.5000 1.0000
IdentityMatrix =
1 0 0
0 1 0
0 0 1
y =
1.0000 0 0
-2.5600 1.0000 0
3.2000 -3.5000 1.0000
y =
0.0476 -0.0833 0.0357
-0.9524 1.4167 -0.4643
4.5714 -5.0000 1.4286
I have no idea what am I doing wrong here, I suspect the cublasCreate operation. Every time I hit the command:
cublasStatus_t stat;
cublasHandle_t handle;
stat = cublasCreate(&handle);
stat and handle variables are both valid.
But VS10 output several error messages specified the following:
First chance exception at 0x... microsoft C++ exception cudaError_enum at memory location 0x...
Some may say it's an internal cublas error message, handled by the library itself.
La solution
You know what, cuBlas stores matrix in column-major way, but Matlab and C use row-major way.
Rearrange your initialization and run. Result should be good.