Question

The following program inverts a matrix over GF(2^8) (addition is XOR; multiplication uses a table look-up) with Gauss-Jordan elimination, on an NVIDIA GeForce 310 GPU under CUDA v4.2.

typedef unsigned char BYTE;
#define BLOCK_SIZE 16

// addition
__inline __device__ BYTE  
add_GF(BYTE a,BYTE b)
{
    return a^b;
}

// subtraction
__inline __device__ BYTE  
sub_GF(BYTE a,BYTE b)   
{
    return a^b;
}

// multiplication
__inline __device__ BYTE  
mul_GF(BYTE a,BYTE b,BYTE *d_numOf, BYTE *d_indexOf )    
{
    if(a==0 || b == 0) return 0;
    return d_numOf[(d_indexOf[a] + d_indexOf[b])%255];
}

// division
__inline __device__ BYTE
div_GF(BYTE a,BYTE b, BYTE *d_numOf,BYTE *d_indexOf, BYTE *d_inv)
{
    if(b == 0) return 0;
    return mul_GF(a,d_inv[b],d_numOf,d_indexOf);
}
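
// The tables numOf (antilog), indexOf (log) and inv (inverse), and the
// constant TABLE_SIZE, are assumed to be defined elsewhere. Below is a
// minimal sketch of one common way to build them, assuming the AES
// reduction polynomial 0x11B and generator 0x03 -- an assumption only;
// this code's use of 128 as the multiplicative identity suggests its
// actual tables may use a different element representation.

#define TABLE_SIZE 256

BYTE numOf[TABLE_SIZE];   // antilog table: numOf[i] = g^i
BYTE indexOf[TABLE_SIZE]; // log table:     indexOf[g^i] = i
BYTE inv[TABLE_SIZE];     // multiplicative inverses

void BuildTablesGF(void)
{
    unsigned int x = 1;
    for (int i = 0; i < 255; i++)
    {
        numOf[i] = (BYTE)x;
        indexOf[x] = (BYTE)i;
        // multiply x by the generator 0x03, reducing modulo 0x11B
        x ^= x << 1;
        if (x & 0x100) x ^= 0x11B;
    }
    inv[0] = 0; // 0 has no inverse; mul_GF/div_GF special-case it
    for (int a = 1; a < 256; a++)
        inv[a] = numOf[(255 - indexOf[a]) % 255];
}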

// swap two lines
__global__ void
LineSwap(BYTE *M, int *n,int *a, int *b)
{
    BYTE temp;
    const unsigned int tid = blockIdx.x*blockDim.x+threadIdx.x;

    temp = M[(*a)*(*n+*n)+tid];
    M[*a*(*n+*n)+tid] = M[(*b)*(*n+*n)+tid];
    M[*b*(*n+*n)+tid] = temp;

}

// multiply a line by the factor k = 128/pivot so the pivot becomes 128 (the identity here)
__global__ void
LineMul(BYTE *M, int *n,int *a, BYTE *d_numOf, BYTE *d_indexOf, BYTE *d_inv)
{
    BYTE k =  div_GF(128, M[*a*(*n+*n)+*a], d_numOf, d_indexOf, d_inv);
    const unsigned int tid = blockIdx.x*blockDim.x+threadIdx.x;

    M[*a*(*n+*n)+tid] = mul_GF( k , M[*a*(*n+*n)+tid], d_numOf, d_indexOf );
}

// subtract k times line a from line b
__global__ void
LineMulSub(BYTE *M, int *n,int *a, BYTE *k, int *b, BYTE *d_numOf, BYTE *d_indexOf)
{
    const unsigned int tid = blockIdx.x*blockDim.x+threadIdx.x;

    M[*b*(*n+*n)+tid] = sub_GF( M[*b*(*n+*n)+tid] , mul_GF(*k ,M[*a*(*n+*n)+tid], d_numOf, d_indexOf));
}

// compute the inverse matrix 
bool InvMatGF(BYTE* h_A, BYTE* &h_Inv, int n)
{
    // h_M is the n-by-(n+n) augmented matrix [A | I]
    BYTE *h_M = new BYTE [n*(n+n)];
    for(int i=0; i < n*(n+n); i++)
    {
        h_M[i] = 0;
    }

    for( int i=0; i<n; i++ )
    {
        for( int j=0; j<n; j++ )
        {
            h_M[i*(n+n)+j] = h_A[i*n+j];
            h_M[i*(n+n)+(n+j)] = 0;
        }
    }

    for( int i=0; i<n; i++ )
    {
        h_M[i*(n+n)+(n+i)] = 128;
    }

    BYTE *d_A = NULL;
    BYTE *d_M = NULL;
    int *d_n = NULL;
    int *d_i = NULL;
    int *d_j = NULL;
    BYTE *d_numOf = NULL;
    BYTE *d_indexOf = NULL;
    BYTE *d_inv = NULL;

    int size_A = n*n*sizeof(BYTE);
    int size_M = n*(n+n)*sizeof(BYTE);
    int size_Lookup_Table = TABLE_SIZE*sizeof(BYTE);
    int size_INTEGER = sizeof(int);

    checkCudaErrors( cudaMalloc((void**) &d_A, size_A) );
    checkCudaErrors( cudaMalloc((void**) &d_M, size_M));
    checkCudaErrors( cudaMalloc((void**) &d_n, size_INTEGER) );
    checkCudaErrors( cudaMalloc((void**) &d_i, size_INTEGER) );
    checkCudaErrors( cudaMalloc((void**) &d_j, size_INTEGER) );
    checkCudaErrors( cudaMalloc((void**) &d_numOf, size_Lookup_Table) );
    checkCudaErrors( cudaMalloc((void**) &d_indexOf, size_Lookup_Table) );
    checkCudaErrors( cudaMalloc((void**) &d_inv, size_Lookup_Table) );

    checkCudaErrors( cudaMemcpy(d_A,h_A,size_A,cudaMemcpyHostToDevice) );
    checkCudaErrors( cudaMemcpy(d_n,&n,size_INTEGER,cudaMemcpyHostToDevice) );
    checkCudaErrors( cudaMemcpy(d_numOf,&numOf,size_Lookup_Table,cudaMemcpyHostToDevice) );
    checkCudaErrors( cudaMemcpy(d_indexOf,&indexOf,size_Lookup_Table,cudaMemcpyHostToDevice) );
    checkCudaErrors( cudaMemcpy(d_inv,&inv,size_Lookup_Table,cudaMemcpyHostToDevice) );

    dim3 blockDim(BLOCK_SIZE,BLOCK_SIZE,1);
    dim3 gridDim(((n+n)+blockDim.x-1)/blockDim.x,1,1);

    cudaEvent_t start,stop;
    cudaEventCreate( &start );
    cudaEventCreate( &stop );
    cudaEventRecord( start, 0 );

    for(int i = 0; i < n; i++)
    {
        if(h_M[i*(n+n)+i] != 0)
        {
            checkCudaErrors(cudaMemcpy(d_i, &i, sizeof(int), cudaMemcpyHostToDevice));
            checkCudaErrors( cudaMemcpy(d_M,h_M,size_M,cudaMemcpyHostToDevice) );
            LineMul<<<gridDim,blockDim,0>>>(d_M,d_n,d_i,d_numOf,d_indexOf,d_inv); // on GPU
            checkCudaErrors( cudaMemcpy(h_M,d_M,size_M,cudaMemcpyDeviceToHost) );

            for(int j = 0; j < n; j++)
            {
                if(j != i)
                {
                    BYTE *d_MElem = 0;

                    checkCudaErrors( cudaMalloc((void**) &d_MElem,sizeof(BYTE)) );
                    checkCudaErrors( cudaMemcpy(d_j, &j, sizeof(int), cudaMemcpyHostToDevice) );
                    checkCudaErrors( cudaMemcpy(d_MElem,&h_M[j*(n+n)+i],sizeof(BYTE),cudaMemcpyHostToDevice) );
                    LineMulSub<<<gridDim,blockDim,0>>>(d_M,d_n,d_i,d_MElem,d_j,d_numOf,d_indexOf);// on GPU
                    checkCudaErrors( cudaMemcpy(h_M,d_M,size_M,cudaMemcpyDeviceToHost) );
                    checkCudaErrors( cudaFree(d_MElem) );
                }
            }
        }
        else
        {
            for(int j = i+1; j < n; j++)
            {
                if(h_M[j*(n+n)+i] != 0)
                {
                    checkCudaErrors(cudaMemcpy(d_i, &i, sizeof(int), cudaMemcpyHostToDevice));
                    checkCudaErrors(cudaMemcpy(d_j, &j, sizeof(int), cudaMemcpyHostToDevice));
                    checkCudaErrors( cudaMemcpy(d_M,h_M,size_M,cudaMemcpyHostToDevice) );
                    LineSwap<<<gridDim,blockDim,0>>>(d_M,d_n,d_i,d_j);//on GPU
                    checkCudaErrors( cudaMemcpy(h_M,d_M,size_M,cudaMemcpyDeviceToHost) );
                    i--;
                    break;
                }
                if(j == n-1)
                {
                    printf("(1)No inverse matrix!\n");
                    return false;
                }
            }
        }
    }

    for (int i = 0; i < n; i++)
    {
        if(h_M[i*(n+n)+i] != 128)
        {
            printf("(2)No inverse matrix: not full rank!\n");
            return false;
        }
    }

    for (int i = 0; i < n; i++)
    {
        for (int j = 0; j < n; j++)
        {
            h_Inv[i*n+j] =  h_M[i*(n+n)+n+j];
        }
    }

    cudaEventRecord( stop, 0 ); // elapsed time is measured in ms
    cudaEventSynchronize( stop );
    float elapsedTime;
    cudaEventElapsedTime( &elapsedTime, start, stop );
    cudaEventDestroy( start );
    cudaEventDestroy( stop );

    float throughputInverse = (float) n/(elapsedTime*0.001) *0.000001;
    printf("%d\t%f\t%f\t",n,elapsedTime*0.001,throughputInverse);

    checkCudaErrors( cudaFree(d_i) );
    checkCudaErrors( cudaFree(d_j) );
    checkCudaErrors( cudaFree(d_A) );
    checkCudaErrors( cudaFree(d_M) );
    checkCudaErrors( cudaFree(d_n) );
    checkCudaErrors( cudaFree(d_numOf) );
    checkCudaErrors( cudaFree(d_indexOf) );
    checkCudaErrors( cudaFree(d_inv) );
    delete[] h_M;

    return true;
}
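
For completeness, a minimal hypothetical driver (assuming the BuildTablesGF sketch above, the SDK's checkCudaErrors helper, and the usual <cstdio>/<cstdlib> includes) might look like this:

int main(void)
{
    const int n = 128;
    BYTE *h_A   = new BYTE[n*n];
    BYTE *h_Inv = new BYTE[n*n];

    BuildTablesGF(); // fill numOf / indexOf / inv before any kernel runs

    for (int i = 0; i < n*n; i++)
        h_A[i] = (BYTE)(rand() & 0xFF); // random test matrix; may be singular

    if (InvMatGF(h_A, h_Inv, n))
        printf("inverse computed for n = %d\n", n);

    delete[] h_A;
    delete[] h_Inv;
    return 0;
}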

My problem: when I compile it with

nvcc -g -G INVonGPUv1.1.cu -o INVonGPUv1.1 -I../../NVIDIA_GPU_Computing_SDK/C/common/inc -I../../NVIDIA_GPU_Computing_SDK/shared/inc  -arch=compute_12

the output is correct:

################### Inversing start ####################
#n  timeInverse(s)  throughputInverse(MB/s) errorRate(0~1)  isInverse
#=================== INVERSE on GPU v1.0 ====================
128 1.565791    0.000082    1
256 14.190008   0.000018    1
512 154.687016  0.000003    1
################ Inversing stop ####################

But when I remove "-g -G" and compile with:

nvcc INVonGPUv1.1.cu -o INVonGPUv1.1 -I../../NVIDIA_GPU_Computing_SDK/C/common/inc -I../../NVIDIA_GPU_Computing_SDK/shared/inc  -arch=compute_12

I no longer get the inverse matrix. Why, and how do "-g -G" actually work?

################### Inversing start ####################
#n  timeInverse(s)  throughputInverse(MB/s) errorRate(0~1)  isInverse
#=================== INVERSE on GPU v1.0 ====================
(1)No inverse matrix!
0
(1)No inverse matrix!
0
(1)No inverse matrix!
0
################ Inversing stop ####################

Solution

-g is similar to that option in gcc: it produces debug information for host code.

-G produces debug information for device code.
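
Importantly, -G also disables most device-code optimizations, which changes both the generated code and its timing; that alone can hide or expose latent bugs. For example (the SDK include paths from the question are omitted here):

nvcc -g -G INVonGPUv1.1.cu -o INVonGPUv1.1 -arch=compute_12        # unoptimized, debuggable device code
nvcc -lineinfo INVonGPUv1.1.cu -o INVonGPUv1.1 -arch=compute_12    # optimized, but keeps source-line info (if your nvcc version supports -lineinfo)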

For more information on nvcc's command-line options, see the NVCC manual; the PDF is installed along with the CUDA Toolkit.

Your code sample is a bit too long to dissect line by line, so go over it carefully. Bugs that appear in a release build but not in a debug build (or vice versa) are quite common. They are usually caused by some memory error, such as an out-of-bounds access or a race, that manifests in one mode and not the other.
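
One plausible candidate in the posted code (an assumption, not a confirmed diagnosis): the kernels are launched with a 16x16 block, but tid is computed from threadIdx.x alone, so the 16 threads that differ only in threadIdx.y all perform the same read-modify-write on M, and nothing prevents one thread from reading the pivot element after another thread has already rescaled it. The unoptimized code produced under -G can easily mask such races. Here is a sketch of a more defensive LineMul, where the factor k is computed once on the host (h_M is current on the host at that point in the loop) and passed in, the redundant threadIdx.y threads exit early, and tid is bounds-checked:

__global__ void
LineMulSafe(BYTE *M, int *n, int *a, BYTE *k,
            BYTE *d_numOf, BYTE *d_indexOf)
{
    const unsigned int tid = blockIdx.x*blockDim.x + threadIdx.x;

    // keep one thread per column and stay inside the row
    if (threadIdx.y != 0 || tid >= (unsigned int)(*n + *n)) return;

    M[*a*(*n+*n)+tid] = mul_GF(*k, M[*a*(*n+*n)+tid], d_numOf, d_indexOf);
}

Running the release binary under cuda-memcheck (shipped with the toolkit) is also worth a try; it flags out-of-bounds and misaligned accesses that a debug build may happen to tolerate.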

Licensed under: CC-BY-SA with attribution
Not affiliated with StackOverflow