Pergunta

I have a mixed mex and cuda code to evaluate phi = 1/2*x'*A*x - b'*x, where x and b are both m by 1 vectors and A is an m by m matrix. The code can be compiled and executed and it also gives me the correct answer.

However, when I exited MATLAB, I kept getting the error Segmentation fault (core dumped). What I did in the code is that, I generate A, b and x in MATLAB, use a mex function to pass them to cuda. Then I evaluate phi = 1/2*x'*A*x - b'*x on GPU (use the cuda linear algebra library cublas) and use mex to transfer phi back to MATLAB.

Can anyone help me to see where the problem is? Thanks in advance.

Btw, here is how I compile it:

nvcc -arch=sm_20 -c test.cu -Xcompiler -fPIC -I/site/local/matlab-r2012a/extern/include/
mex -L/usr/local/cuda/lib64 -lcudart -lcublas test.o

To open the MATLAB, you need to link the libstdc++ library:

LD_PRELOAD=/usr/lib/x86_64-linux-gnu/libstdc++.so.6 matlab

In MATLAB, I did the following to test the code:

N = 500; x = randn(N,1);B = randn(N);A = B'*B; b = randn(N,1);    
tic, 1/2*x'*A*x-b'*x, toc    
tic, phi = cublas_mex_test(x,A,b),toc

Below is my code test.cu:

#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <cuda_runtime.h>
#include "cublas_v2.h"
#include "cuda.h"
#include "mex.h"
#include <time.h>
#define IDX2C(i,j,ld) (((j)*(ld))+(i))

// this is the actual function that evaluates phi = 1/2*x'*A*x - b'*x
int PhiEval(double *x, double *A, double *b, double &phi, size_t m)
{ 
    cudaError_t cudaStat; 
    cublasStatus_t stat;
    cublasHandle_t handle;
    clock_t start, end;

    start = clock();
    //host data
    double *bx, *xAx;
    bx = (double*)malloc(1*sizeof(double)); 
    xAx = (double*)malloc(1*sizeof(double)); 

    //device data
    double* A_d, * x_d, *b_d, *Ax_d, *bx_d, *xAx_d;
    cudaStat = cudaMalloc ((void**)&A_d, m*m*sizeof(double));
    cudaStat = cudaMalloc ((void**)&x_d, m*sizeof(double));
    cudaStat = cudaMalloc ((void**)&b_d, m*sizeof(double));
    cudaStat = cudaMalloc ((void**)&Ax_d, m*sizeof(double));
    cudaStat = cudaMalloc ((void**)&bx_d, 1*sizeof(double));
    cudaStat = cudaMalloc ((void**)&xAx_d, 1*sizeof(double));
    if (cudaStat != cudaSuccess) {
        printf ("device memory allocation failed");
        return EXIT_FAILURE;
    }
    end = clock();
    printf ("It takes: %d clicks (%.7f seconds) to allocate the memory.\n",end-start,(double)(end-start)/CLOCKS_PER_SEC);

    start = clock();
    stat = cublasCreate(&handle);
    if (stat != CUBLAS_STATUS_SUCCESS) {
        printf ("CUBLAS initialization failed\n");
        return EXIT_FAILURE;
    }

    // copy host data to device
    stat = cublasSetMatrix (m,m, sizeof(double), A, m, A_d, m);
    stat = cublasSetVector (m, sizeof(double), x, 1, x_d, 1);
    stat = cublasSetVector (m, sizeof(double), b, 1, b_d, 1);
    if (stat != CUBLAS_STATUS_SUCCESS) {
        printf ("data download failed");
        cudaFree (A_d);
        cudaFree (x_d);
        cudaFree (b_d);
        cudaFree (Ax_d);
        cudaFree(xAx_d);
        cudaFree(bx_d);
        cublasDestroy(handle);
        free(bx); free(xAx);
        return EXIT_FAILURE;
    }
    end = clock();
    printf ("It takes: %d clicks (%.7f seconds) to copy the data to GPU.\n",end-start,(double)(end-start)/CLOCKS_PER_SEC);


    start = clock();
    //calculate A*x and store the result in Ax_d
    double alpha = 1;
    double beta = 0;
    stat = cublasDgemv(handle, CUBLAS_OP_N, m,m, &alpha, A_d, m, x_d, 1, &beta, Ax_d, 1);
    if (stat != CUBLAS_STATUS_SUCCESS) {
        printf ("data download failed");
        cudaFree (A_d);
        cudaFree (x_d);
        cudaFree (b_d);
        cudaFree (Ax_d);
        cudaFree(xAx_d);
        cudaFree(bx_d);
        cublasDestroy(handle);
        free(bx); free(xAx);
        return EXIT_FAILURE;
    }

    //calculate x'*A*x and store the result in xAx_d
    stat = cublasDgemv(handle, CUBLAS_OP_T, m,1, &alpha, x_d, m, Ax_d, 1, &beta, xAx_d, 1);
    if (stat != CUBLAS_STATUS_SUCCESS) {
        printf ("inner product failed");
        cudaFree (A_d);
        cudaFree (x_d);
        cudaFree (b_d);
        cudaFree (Ax_d);
        cudaFree(xAx_d);
        cudaFree(bx_d);
        cublasDestroy(handle);
        free(bx); free(xAx);
        return EXIT_FAILURE;
    }
    stat = cublasGetVector (1, sizeof(double),xAx_d, 1, xAx, 1); // copy the result x'*A*x to host

    //calculate b'*x and store the result in bx_d
    stat = cublasDgemv(handle, CUBLAS_OP_T, m,1, &alpha, b_d, m, x_d, 1, &beta, bx_d, 1);
    if (stat != CUBLAS_STATUS_SUCCESS) {
        printf ("inner product failed");
        cudaFree (A_d);
        cudaFree (x_d);
        cudaFree (b_d);
        cudaFree (Ax_d);
        cudaFree(xAx_d);
        cudaFree(bx_d);
        cublasDestroy(handle);
        free(bx); free(xAx);
        return EXIT_FAILURE;
    }
    stat = cublasGetVector (1, sizeof(double),bx_d, 1, bx, 1);
    end = clock();
    printf ("It takes: %d clicks (%.7f seconds) to call functions in cublas.\n",end-start,(double)(end-start)/CLOCKS_PER_SEC);

    //calculate phi = 1/2*x'*A*x - b'*x
    phi = .5*xAx[0]-bx[0];

    start = clock();
    //free the memory
    cudaFree (A_d);
    cudaFree (x_d);
    cudaFree (b_d);
    cudaFree (Ax_d);
    cudaFree(xAx_d);
    cudaFree(bx_d);
    cublasDestroy(handle);
    free(bx); free(xAx);

    end = clock();
    printf ("It takes: %d clicks (%.7f seconds) to free the memory.\n",end-start,(double)(end-start)/CLOCKS_PER_SEC);

    return EXIT_SUCCESS;
}

/* the gateway function */
void mexFunction( int nlhs, mxArray *plhs[],
                  int nrhs, const mxArray *prhs[])
{
    double phi;
    double *A, *b, *x;

    size_t mrows,ncols;

    /*  check for proper number of arguments */
    if(nrhs!=3) 
        mexErrMsgIdAndTxt( "MATLAB:MinTest:invalidNumInputs",
            "Three inputs required.");
    if(nlhs!=1) 
        mexErrMsgIdAndTxt( "MATLAB:MinTest:invalidNumOutputs",
            "One output required.");


    /*  create a pointer to the input vector x */
    x = mxGetPr(prhs[0]);

    /*  create a pointer to the input matrix A */
    A = mxGetPr(prhs[1]);

    /*  create a pointer to the input vector b */
    b = mxGetPr(prhs[2]);

    /*  get the dimensions of the matrix input A */
    mrows = mxGetM(prhs[1]);
    ncols = mxGetN(prhs[1]);

    if(mrows!=ncols) 
        mexErrMsgIdAndTxt( "MATLAB:MinTest:invalidMatrixInput",
            "A has to be a square matrix");

    /*  call the cpp subroutine */
    PhiEval(x,A,b,phi,mrows);   

    plhs[0] = mxCreateDoubleScalar(phi);
}
Foi útil?

Solução

Apparently a number of folks have observed that matlab mex may crash (on matlab exit or mex clear) in some environments. This may be due to having an installed cuda version different than what matlab PCT (if installed) is using, as suggested here

A workaround in this case is to issue a gpu function, such as gpuDevice() in matlab, before executing any mex function.

Licenciado em: CC-BY-SA com atribuição
Não afiliado a StackOverflow
scroll top