Question

I'm writing an image restoration algorithm on the GPU; details are in

Cuda: least square solving , poor in speed

The QR decomposition method to solve the linear system

Ax=b  

works as follows

min||Ax-b|| ---> ||QRx-b||  ---> ||(Q^T)QRx-(Q^T)b|| ---> ||Rx-(Q^T)b||

where R is an upper triangular matrix. The resulting upper triangular linear system is easy to solve.
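For completeness, here is the same chain with the justification made explicit (assuming A is m-by-n with m >= n and full column rank): Q is orthogonal, so Q^T Q = I and multiplication by Q^T leaves the 2-norm unchanged,

\min_x \|Ax - b\|_2 = \min_x \|QRx - b\|_2 = \min_x \|Q^\top (QRx - b)\|_2 = \min_x \|Rx - Q^\top b\|_2 ,

and the minimizer follows by back-substitution on the n-by-n triangular system formed by the first n rows, R_{1:n,1:n} x = (Q^\top b)_{1:n}.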

I want to use CULA tools to implement this method. The CULA routine GEQRF computes a QR factorization. The manual says:

On exit, the elements on and above the diagonal of the array contain the min(M,N)-by-N upper trapezoidal matrix R (R is upper triangular if m >= n); the elements below the diagonal, with the array TAU, represent the orthogonal/unitary matrix Q as a product of min(m,n) elementary reflectors.

I cannot figure out where Q is stored, and the algorithm seems too complex for me. Could you give any advice?

Thanks!

Solution 2

void GEQRF(int M, int N, T* A, int LDA, T* TAU, T* WORK, int LWORK, int &INFO)

After GEQRF, R is stored in the upper triangular portion of A. Q can then be generated using xORGQR with A and TAU as the inputs.

more explanation: http://www.culatools.com/forums/viewtopic.php?f=15&t=684
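For a concrete illustration of the GEQRF -> ORGQR sequence, below is a minimal sketch using cuSOLVER's cusolverDnDgeqrf / cusolverDnDorgqr, which follow the same LAPACK convention as the corresponding CULA routines (the CULA GEQRF/xORGQR calls are analogous); error checking is omitted for brevity and this is only a sketch, not the CULA API itself.

// Minimal sketch (cuSOLVER naming; the CULA GEQRF/ORGQR calls are analogous).
// Factors the column-major m x n matrix d_A (m >= n) in place, then overwrites
// it with the first n columns of Q. Error checking omitted.
void qr_and_form_Q(cusolverDnHandle_t handle, double *d_A, int m, int n)
{
    double *d_tau;  cudaMalloc(&d_tau,  n * sizeof(double));
    int    *d_info; cudaMalloc(&d_info, sizeof(int));

    // Workspace query for both steps; allocate the larger buffer once.
    int lwork_geqrf = 0, lwork_orgqr = 0;
    cusolverDnDgeqrf_bufferSize(handle, m, n, d_A, m, &lwork_geqrf);
    cusolverDnDorgqr_bufferSize(handle, m, n, n, d_A, m, d_tau, &lwork_orgqr);
    int lwork = lwork_geqrf > lwork_orgqr ? lwork_geqrf : lwork_orgqr;
    double *d_work; cudaMalloc(&d_work, lwork * sizeof(double));

    // GEQRF: R ends up in the upper triangle of d_A; the Householder vectors are
    // stored below the diagonal of d_A, with their scaling factors in d_tau.
    cusolverDnDgeqrf(handle, m, n, d_A, m, d_tau, d_work, lwork, d_info);

    // ORGQR: accumulate the reflectors into an explicit Q (first n columns),
    // overwriting d_A.
    cusolverDnDorgqr(handle, m, n, n, d_A, m, d_tau, d_work, lwork, d_info);

    cudaFree(d_work); cudaFree(d_info); cudaFree(d_tau);
}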

OTHER TIPS

As of February 2015, CUDA 7.0 (now in Release Candidate) offers the new cuSOLVER library, which includes the possibility of computing the QR decomposition of a matrix. In conjunction with the cuBLAS library, this makes it possible to solve a linear system following the guidelines laid out in Appendix C of the cuSOLVER User's Guide.

There are three steps to follow (a skeleton of the corresponding calls is shown right after this list):

1) geqrf: computes the QR decomposition of the matrix, returning the upper triangular matrix R in the upper triangular part of A and the matrix Q in the form of Householder vectors stored in the lower triangular part of A, while the scaling factors of the Householder vectors are returned in the TAU parameter;

2) ormqr: computes the product of Q (or Q^T) and a matrix C, overwriting C;

3) trsm: solves an upper triangular linear system.
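In skeleton form, using the variable names of the full example that follows (allocations, error checking, the workspace query, and the extraction of the Ncols x Ncols block of R are omitted here), the three calls are:

// 1) geqrf: QR factorization of the Nrows x Ncols matrix d_A (R in the upper triangle,
//    Householder vectors in the lower triangle, scaling factors in d_TAU)
cusolverDnDgeqrf(solver_handle, Nrows, Ncols, d_A, Nrows, d_TAU, work, work_size, devInfo);

// 2) ormqr: overwrite the right-hand side d_C with Q^T * C
cusolverDnDormqr(solver_handle, CUBLAS_SIDE_LEFT, CUBLAS_OP_T, Nrows, Ncols, min(Nrows, Ncols),
                 d_A, Nrows, d_TAU, d_C, Nrows, work, work_size, devInfo);

// 3) trsm: back-substitution on the Ncols x Ncols upper triangular system, x = R \ (Q^T * C)
cublasDtrsm(cublas_handle, CUBLAS_SIDE_LEFT, CUBLAS_FILL_MODE_UPPER, CUBLAS_OP_N, CUBLAS_DIAG_NON_UNIT,
            Ncols, 1, &alpha, d_R, Ncols, d_B, Ncols);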

Below, I'm providing a full example of the usage of those routines.

#include "cuda_runtime.h"
#include "device_launch_paraMeters.h"

#include<iostream>
#include<fstream>
#include<iomanip>
#include<stdlib.h>
#include<stdio.h>
#include<assert.h>

#include <cusolverDn.h>
#include <cublas_v2.h>
#include <cuda_runtime_api.h>

#include "Utilities.cuh"
#include "TimingGPU.cuh"

#define BLOCK_SIZE 32

#define prec_save 10

/***************/
/* COPY KERNEL */
/***************/
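// --- Copies the leading N x N block of a column-major M x N matrix (leading dimension M)
//     into a compact N x N matrix; used below to extract the triangular factor R.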
__global__ void copy_kernel(const double * __restrict d_in, double * __restrict d_out, const int M, const int N) {

    const int i = blockIdx.x * blockDim.x + threadIdx.x;
    const int j = blockIdx.y * blockDim.y + threadIdx.y;

    if ((i < N) && (j < N)) d_out[j * N + i] = d_in[j * M + i];
}

/****************************************************/
/* LOAD INDIVIDUAL REAL MATRIX FROM txt FILE TO CPU */
/****************************************************/
// --- Load individual real matrix from txt file
template <class T>
void loadCPUrealtxt(T * __restrict h_out, const char *filename, const int M) {

    std::ifstream infile;
    infile.open(filename);
    for (int i = 0; i < M; i++) {
        double temp;
        infile >> temp;
        h_out[i] = (T)temp;
    }

    infile.close();

}

/************************************/
/* SAVE REAL ARRAY FROM GPU TO FILE */
/************************************/
template <class T>
void saveGPUrealtxt(const T * d_in, const char *filename, const int M) {

    T *h_in = (T *)malloc(M * sizeof(T));

    gpuErrchk(cudaMemcpy(h_in, d_in, M * sizeof(T), cudaMemcpyDeviceToHost));

    std::ofstream outfile;
    outfile.open(filename);
    for (int i = 0; i < M; i++) outfile << std::setprecision(prec_save) << h_in[i] << "\n";
    outfile.close();

}

/********/
/* MAIN */
/********/
int main(){

    // --- Extension of Appendix C.1 of cuSOLVER library User's Guide
    // --- See also http://www.netlib.org/lapack/lug/node40.html

    // --- ASSUMPTION Nrows >= Ncols
    const int Nrows = 500;
    const int Ncols = 500;

    TimingGPU timerGPU;
    double timingQR, timingSolve;

    // --- cuSOLVE input/output parameters/arrays
    int work_size = 0;
    int *devInfo;           gpuErrchk(cudaMalloc(&devInfo, sizeof(int)));

    // --- CUDA solver initialization
    cusolverDnHandle_t solver_handle;
    cusolveSafeCall(cusolverDnCreate(&solver_handle));

    // --- CUBLAS initialization
    cublasHandle_t cublas_handle;
    cublasSafeCall(cublasCreate(&cublas_handle));

    /***********************/
    /* SETTING THE PROBLEM */
    /***********************/
    // --- Setting the host, Nrows x Ncols matrix
    double *h_A = (double *)malloc(Nrows * Ncols * sizeof(double));
    loadCPUrealtxt(h_A, "D:\\Project\\solveNonSquareLinearSystemQRCUDA\\solveNonSquareLinearSystemQRCUDA\\testMatrix.txt", Nrows * Ncols);

    // --- Setting the device matrix and moving the host matrix to the device
    double *d_A;            gpuErrchk(cudaMalloc(&d_A, Nrows * Ncols * sizeof(double)));
    gpuErrchk(cudaMemcpy(d_A, h_A, Nrows * Ncols * sizeof(double), cudaMemcpyHostToDevice));

    // --- Initializing the data matrix C (Of course, this step could be done by a kernel function directly on the device).
    // --- Notice that, in this case, only the first column of C contains actual data, the others being empty (zeroed). However, cuBLAS trsm
    //     has the capability of solving triangular linear systems with multiple right hand sides.
    double *h_C = (double *)calloc(Nrows * Nrows, sizeof(double));
    loadCPUrealtxt(h_C, "D:\\Project\\solveNonSquareLinearSystemQRCUDA\\solveNonSquareLinearSystemQRCUDA\\testVector.txt", Nrows);

    double *d_C;            gpuErrchk(cudaMalloc(&d_C, Nrows * Nrows * sizeof(double)));
    gpuErrchk(cudaMemcpy(d_C, h_C, Nrows * Nrows * sizeof(double), cudaMemcpyHostToDevice));

    /**********************************/
    /* COMPUTING THE QR DECOMPOSITION */
    /**********************************/
    timerGPU.StartCounter();

    // --- CUDA QR GEQRF preliminary operations
    double *d_TAU;      gpuErrchk(cudaMalloc((void**)&d_TAU, min(Nrows, Ncols) * sizeof(double)));
    cusolveSafeCall(cusolverDnDgeqrf_bufferSize(solver_handle, Nrows, Ncols, d_A, Nrows, &work_size));
    double *work;   gpuErrchk(cudaMalloc(&work, work_size * sizeof(double)));

    // --- CUDA GEQRF execution: The matrix R is overwritten in upper triangular part of A, including diagonal 
    //     elements. The matrix Q is not formed explicitly, instead, a sequence of householder vectors are
    //     stored in lower triangular part of A.
    cusolveSafeCall(cusolverDnDgeqrf(solver_handle, Nrows, Ncols, d_A, Nrows, d_TAU, work, work_size, devInfo));
    int devInfo_h = 0;  gpuErrchk(cudaMemcpy(&devInfo_h, devInfo, sizeof(int), cudaMemcpyDeviceToHost));
    if (devInfo_h != 0) std::cout << "Unsuccessful geqrf execution\n\n";

    timingQR = timerGPU.GetCounter();
    printf("Timing for QR calculation = %f [ms]\n", timingQR);

    /*****************************/
    /* SOLVING THE LINEAR SYSTEM */
    /*****************************/
    timerGPU.StartCounter();

    // --- CUDA ORMQR execution: Computes the multiplication Q^T * C and stores it in d_C
    cusolveSafeCall(cusolverDnDormqr(solver_handle, CUBLAS_SIDE_LEFT, CUBLAS_OP_T, Nrows, Ncols, min(Nrows, Ncols), d_A, Nrows, d_TAU, d_C, Nrows, work, work_size, devInfo));

    // --- Reducing the linear system size
    double *d_R; gpuErrchk(cudaMalloc(&d_R, Ncols * Ncols * sizeof(double)));
    double *d_B; gpuErrchk(cudaMalloc(&d_B, Ncols * sizeof(double)));
    dim3 Grid(iDivUp(Ncols, BLOCK_SIZE), iDivUp(Ncols, BLOCK_SIZE));
    dim3 Block(BLOCK_SIZE, BLOCK_SIZE);
    copy_kernel<<<Grid, Block>>>(d_A, d_R, Nrows, Ncols);
    gpuErrchk(cudaMemcpy(d_B, d_C, Ncols * sizeof(double), cudaMemcpyDeviceToDevice));

    // --- Solving an upper triangular linear system - compute x = R \ Q^T * B
    const double alpha = 1.;
    cublasSafeCall(cublasDtrsm(cublas_handle, CUBLAS_SIDE_LEFT, CUBLAS_FILL_MODE_UPPER, CUBLAS_OP_N,
        CUBLAS_DIAG_NON_UNIT, Ncols, 1, &alpha, d_R, Ncols, d_B, Ncols));

    timingSolve = timerGPU.GetCounter();
    printf("Timing for solution of the linear system = %f [ms]\n", timingSolve);
    printf("Overall timing = %f [ms]\n", timingQR + timingSolve);

    /************************/
    /* CHECKING THE RESULTS */
    /************************/
    // --- The upper triangular part of A contains the elements of R. Showing this.
    saveGPUrealtxt(d_A, "D:\\Project\\solveNonSquareLinearSystemQRCUDA\\solveNonSquareLinearSystemQRCUDA\\d_R.txt", Nrows * Ncols);

    // --- The first Nrows elements of d_C contain the result of Q^T * C
    saveGPUrealtxt(d_C, "D:\\Project\\solveNonSquareLinearSystemQRCUDA\\solveNonSquareLinearSystemQRCUDA\\d_QTC.txt", Nrows);

    // --- Initializing the output Q matrix (Of course, this step could be done by a kernel function directly on the device)
    double *h_Q = (double *)malloc(Nrows * Nrows * sizeof(double));
    for (int j = 0; j < Nrows; j++)
        for (int i = 0; i < Nrows; i++)
            if (j == i) h_Q[j + i*Nrows] = 1.;
            else        h_Q[j + i*Nrows] = 0.;

    double *d_Q;            gpuErrchk(cudaMalloc(&d_Q, Nrows * Nrows * sizeof(double)));
    gpuErrchk(cudaMemcpy(d_Q, h_Q, Nrows * Nrows * sizeof(double), cudaMemcpyHostToDevice));

    // --- Calculation of the Q matrix
    cusolveSafeCall(cusolverDnDormqr(solver_handle, CUBLAS_SIDE_LEFT, CUBLAS_OP_N, Nrows, Ncols, min(Nrows, Ncols), d_A, Nrows, d_TAU, d_Q, Nrows, work, work_size, devInfo));

    // --- d_Q contains the elements of Q. Showing this.
    saveGPUrealtxt(d_Q, "D:\\Project\\solveNonSquareLinearSystemQRCUDA\\solveNonSquareLinearSystemQRCUDA\\d_Q.txt", Nrows * Nrows);

    // --- At this point, d_C contains the elements of Q^T * C, where C is the data vector. Showing this.
    // --- According to the above, only the first column of d_C makes sense.
    //gpuErrchk(cudaMemcpy(h_C, d_C, Nrows * Nrows * sizeof(double), cudaMemcpyDeviceToHost));
    //printf("\n\n");
    //for (int j = 0; j < Nrows; j++)
    //  for (int i = 0; i < Nrows; i++)
    //      printf("C[%i, %i] = %f\n", j, i, h_C[j + i*Nrows]);

    // --- Check final result
    saveGPUrealtxt(d_B, "D:\\Project\\solveNonSquareLinearSystemQRCUDA\\solveNonSquareLinearSystemQRCUDA\\d_B.txt", Ncols);

    cusolveSafeCall(cusolverDnDestroy(solver_handle));

    return 0;
}

The Utilities.cu and Utilities.cuh files needed to run such an example are maintained at this github page. The TimingGPU.cu and TimingGPU.cuh files are maintained at this github page.
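If you prefer not to pull in those files, minimal stand-ins for the helpers used above (gpuErrchk, cusolveSafeCall, cublasSafeCall, iDivUp) could look as follows; this is only a sketch, not the versions from those repositories, and the TimingGPU calls would still need the linked class (or can simply be removed):

// Minimal stand-ins for the Utilities.cuh helpers (sketch only; the repository
// versions are more complete). Assumes <stdio.h> and <stdlib.h> are included,
// as in the example above.
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line) {
    if (code != cudaSuccess) {
        fprintf(stderr, "CUDA error %s at %s:%d\n", cudaGetErrorString(code), file, line);
        exit(code);
    }
}

// cuSOLVER / cuBLAS status checks: print the numeric status code and abort.
#define cusolveSafeCall(call) { cusolverStatus_t _s = (call); if (_s != CUSOLVER_STATUS_SUCCESS) { \
    fprintf(stderr, "cuSOLVER error %d at %s:%d\n", (int)_s, __FILE__, __LINE__); exit(1); } }
#define cublasSafeCall(call)  { cublasStatus_t  _s = (call); if (_s != CUBLAS_STATUS_SUCCESS)  { \
    fprintf(stderr, "cuBLAS error %d at %s:%d\n",  (int)_s, __FILE__, __LINE__); exit(1); } }

// Integer division rounded up, used for grid sizing.
inline int iDivUp(int a, int b) { return (a + b - 1) / b; }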

Data can be generated and results can be checked by the following Matlab code:

clear all
close all
clc

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% GENERATE RANDOM NON-SQUARE MATRIX WITH DESIRED CONDITION NUMBER %
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% --- Credit to https://math.stackexchange.com/questions/198515/can-we-generate-random-singular-matrices-with-desired-condition-number-using-mat
Nrows = 500;                % --- Number of rows
Ncols = 500;                % --- Number of columns
% condNumber = 10 * sqrt(2);  % --- Desired condition number
% A = randn(Nrows, Ncols);
% [U, S, V] = svd(A);
% S(S~=0) = linspace(condNumber, 1, min(Nrows, Ncols));
% A = U * S * V';

% --- Setting the problem solution
x = ones(Ncols, 1);

% y = A * x;
% 
% Asave = reshape(A, Nrows * Ncols, 1);
% save testMatrix.txt Asave -ascii -double
% save testVector.txt y -ascii -double

load testMatrix.txt
load testVector.txt
A = reshape(testMatrix, Nrows, Ncols);
y = testVector;

[Q, R] = qr(A);

xMatlab = R \ (Q.' * y);

fprintf('Percentage rms of solution in Matlab %f\n', 100 * sqrt(sum(sum(abs(xMatlab - x).^2)) / sum(sum(abs(x).^2))));

fprintf('Percentage rms of Q * R - A %f\n', 100 * sqrt(sum(sum(abs(Q * R - A).^2)) / sum(sum(abs(A).^2))));

load d_R.txt
d_R = reshape(d_R, Nrows, Ncols);

d_R = d_R(1 : Ncols, :);
R   = R(1 : Ncols, :);

fprintf('Percentage rms of matrix R between Matlab and CUDA %f\n', 100 * sqrt(sum(sum(abs(triu(R) - triu(d_R)).^2)) / sum(sum(abs(triu(d_R)).^2))));

load d_QTC.txt
fprintf('Percentage rms of Q^T * y - d_QTC %f\n', 100 * sqrt(sum(sum(abs(Q.' * y - d_QTC).^2)) / sum(sum(abs(d_QTC).^2))));

load d_B.txt
fprintf('Percentage rms of solution in CUDA %f\n', 100 * sqrt(sum(sum(abs(d_B - x).^2)) / sum(sum(abs(x).^2))));

Please comment/uncomment the lines as required.

Timing

Timings (in ms) (tests performed on a GTX960 card, cc. 5.2):

Size         QR decomposition       Solving system       Overall
100x100      0.89                   1.41                 2.30
200x200      5.97                   3.23                 9.20
500x500      17.08                  21.6                 38.7

The following code is a slight expansion of JackOLantern's answer, generalized to an M-by-K right-hand-side matrix b. Basically, you need to explicitly copy out the upper submatrix of R and of the intermediate Q^T * b so that the matrices passed to the triangular solve have the right stride (leading dimension).

#include <stdio.h>
#include <stdlib.h>
#include <assert.h>
#include <iostream>
#include "cuda_runtime.h"
#include "cublas_v2.h"
#include "cusolverDn.h"
#include "cublas_test.h"
#include "Eigen/Dense"
#include "gpu_util.h"
//##############################################################################
template<typename T>
void PrintEMatrix(const T &mat, const char *name) {
    std::cout << name << " =\n";
    std::cout << mat << std::endl;
}
//##############################################################################
template<typename T>
__global__
void Ker_CopyUpperSubmatrix(const T *__restrict d_in,
                                  T *__restrict d_ou,
                            const int M, const int N, const int subM) {
    const int i = threadIdx.x + blockIdx.x*blockDim.x;
    const int j = threadIdx.y + blockIdx.y*blockDim.y;
    if (i>=subM || j>=N)
        return;
    d_ou[j*subM+i] = d_in[j*M+i];
}
//##############################################################################
int TestQR() {
    typedef double T; // NOTE: keep this as double; the cuSOLVER/cuBLAS routines used below are the double-precision (D) variants
    typedef Eigen::Matrix<T,Eigen::Dynamic,Eigen::Dynamic> MatrixXd;
    typedef Eigen::Matrix<T,Eigen::Dynamic,1> VectorXd;

    // define handles
    cusolverDnHandle_t cusolverH = NULL;
    cublasHandle_t cublasH = NULL;

    const int M = 3;
    const int N = 2;
    const int K = 5;

    MatrixXd A;
    A = MatrixXd::Random(M,N);
    MatrixXd x_ref, x_sol;
    x_sol.resize(N,K);
    x_ref = MatrixXd::Random(N,K);
    MatrixXd b = A*x_ref;

    PrintEMatrix(A, "A");
    PrintEMatrix(b, "b");
    PrintEMatrix(x_ref, "x_ref");

#define CUSOLVER_ERRCHK(x) \
    assert(x == CUSOLVER_STATUS_SUCCESS && "cusolver failed");
#define CUBLAS_ERRCHK(x) \
    assert(x == CUBLAS_STATUS_SUCCESS && "cublas failed");

    CUSOLVER_ERRCHK(cusolverDnCreate(&cusolverH));
    CUBLAS_ERRCHK(cublasCreate(&cublasH));

    T *d_A, *d_b, *d_work, *d_work2, *d_tau;
    int *d_devInfo, devInfo;
    gpuErrchk(cudaMalloc((void**)&d_A, sizeof(T)*M*N));
    gpuErrchk(cudaMalloc((void**)&d_b, sizeof(T)*M*K));
    gpuErrchk(cudaMalloc((void**)&d_tau, sizeof(T)*M));
    gpuErrchk(cudaMalloc((void**)&d_devInfo, sizeof(int)));
    gpuErrchk(cudaMemcpy(d_A, A.data(), sizeof(T)*M*N, cudaMemcpyHostToDevice));
    gpuErrchk(cudaMemcpy(d_b, b.data(), sizeof(T)*M*K, cudaMemcpyHostToDevice));
    int bufSize,bufSize2;

    // in-place A = QR
    CUSOLVER_ERRCHK(
        cusolverDnDgeqrf_bufferSize(
            cusolverH,
            M,
            N,
            d_A,
            M,
            &bufSize
        )
    );
    gpuErrchk(cudaMalloc((void**)&d_work, sizeof(T)*bufSize));
    CUSOLVER_ERRCHK(
        cusolverDnDgeqrf(
            cusolverH,
            M,
            N,
            d_A,
            M,
            d_tau,
            d_work,
            bufSize,
            d_devInfo
        )
    );
    gpuErrchk(cudaMemcpy(&devInfo, d_devInfo, sizeof(int),
        cudaMemcpyDeviceToHost));
    assert(0 == devInfo && "QR factorization failed");

    // Q^T*b
    CUSOLVER_ERRCHK(                                                                                                                                                                                                                                                                  
        cusolverDnDormqr_bufferSize(                                        
            cusolverH,                                                      
            CUBLAS_SIDE_LEFT,                                               
            CUBLAS_OP_T,                                                    
            M,                                                              
            K,                                                              
            N,                                                              
            d_A,                                                            
            M,                                                              
            d_tau,                                                          
            d_b,                                                            
            M,                                                              
            &bufSize2                                                       
        )                                                                   
    );                                                                      
    gpuErrchk(cudaMalloc((void**)&d_work2, sizeof(T)*bufSize2));            
    CUSOLVER_ERRCHK(                                                        
        cusolverDnDormqr(                                                   
            cusolverH,                                                      
            CUBLAS_SIDE_LEFT,                                               
            CUBLAS_OP_T,                                                    
            M,                                                              
            K,                                                              
            min(M,N),                                                       
            d_A,                                                            
            M,                                                              
            d_tau,                                                          
            d_b,                                                            
            M,                                                              
            d_work2,                                                        
            bufSize2,                                                       
            d_devInfo                                                       
        )                                                                   
    );
    gpuErrchk(cudaDeviceSynchronize());
    gpuErrchk(cudaMemcpy(&devInfo, d_devInfo, sizeof(int),
        cudaMemcpyDeviceToHost));
    assert(0 == devInfo && "Q^T b failed");

    // need to explicitly copy submatrix for the triangular solve
    T *d_R, *d_b_;
    gpuErrchk(cudaMalloc((void**)&d_R, sizeof(T)*N*N));
    gpuErrchk(cudaMalloc((void**)&d_b_,sizeof(T)*N*K));
    dim3 thd_size(32,32);
    dim3 blk_size((N+thd_size.x-1)/thd_size.x,(N+thd_size.y-1)/thd_size.y);
    Ker_CopyUpperSubmatrix<T><<<blk_size,thd_size>>>(d_A, d_R, M, N, N);
    blk_size = dim3((N+thd_size.x-1)/thd_size.x,(K+thd_size.y-1)/thd_size.y);
    Ker_CopyUpperSubmatrix<T><<<blk_size,thd_size>>>(d_b, d_b_, M, K, N);

    // solve x = R \ (Q^T*B)
    const double one = 1.0;
    CUBLAS_ERRCHK(
        cublasDtrsm(
            cublasH,
            CUBLAS_SIDE_LEFT,
            CUBLAS_FILL_MODE_UPPER,
            CUBLAS_OP_N,
            CUBLAS_DIAG_NON_UNIT,
            N,
            K,
            &one,
            d_R,
            N,
            d_b_,
            N
        )
    );
    gpuErrchk(cudaDeviceSynchronize());

    gpuErrchk(cudaMemcpy(x_sol.data(), d_b_, sizeof(T)*N*K,
        cudaMemcpyDeviceToHost));

    PrintEMatrix(x_ref, "x_ref");
    PrintEMatrix(x_sol, "x_sol");
    std::cout << "solution l2 error = " << (x_ref-x_sol).norm()
              << std::endl;

    exit(0);
    return 0;
}
//##############################################################################
Licensed under: CC-BY-SA with attribution