How to configure cublas{t}symm() function arguments

Question

EDIT: Based on questions posed below, I elected to re-work my answer and example code. You can handle row-major storage without transpose at least for these operations. And this observation is further facilitated by the fact that the symm function does not used the packed storage.

So to answer the additional questions:

the cublasCsymm function does not use a packed storage format (like some other functions such as cublasCspmv for example), because the cublasCsymm function is intended to duplicate the functionality of the corresponding netlib function, which also does not use a packed storage format. Based on my review of the cublas API, I don't see a symmetric-packed-storage matrix-matrix multiply function available.
You can use row-major storage (e.g. C-style) with cublas, without transposing, at least for these operations (matrix-matrix multiply, without packed storage) by following the advice given here.

What follows is a re-worked version of my previous example, that incorporates the information in item 2 above.

// Matrix multiplication: C = A * B.
// Host code.
//

// Utilities and system includes
#include <assert.h>
#include <helper_string.h>  // helper for shared functions common to CUDA SDK sa
mples

// CUDA runtime
#include <cuda_runtime.h>
#include <cublas_v2.h>

// error check macros
#define cudaCheckErrors(msg) \
    do { \
        cudaError_t __err = cudaGetLastError(); \
        if (__err != cudaSuccess) { \
            fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
                msg, cudaGetErrorString(__err), \
                __FILE__, __LINE__); \
            fprintf(stderr, "*** FAILED - ABORTING\n"); \
            exit(1); \
        } \
    } while (0)

// for CUBLAS V2 API
#define cublasCheckErrors(fn) \
    do { \
        cublasStatus_t __err = fn; \
        if (__err != CUBLAS_STATUS_SUCCESS) { \
            fprintf(stderr, "Fatal cublas error: %d (at %s:%d)\n", \
                (int)(__err), \
                __FILE__, __LINE__); \
            fprintf(stderr, "*** FAILED - ABORTING\n"); \
            exit(1); \
        } \
    } while (0)



#ifndef min
#define min(a,b) ((a < b) ? a : b)
#endif
#ifndef max
#define max(a,b) ((a > b) ? a : b)
#endif

////////////////////////////////////////////////////////////////////////////////
// These are CUDA Helper functions (in addition to helper_cuda.h)

void inline checkError(cublasStatus_t status, const char *msg)
{
    if (status != CUBLAS_STATUS_SUCCESS)
    {
        printf("%s", msg);
        exit(EXIT_FAILURE);
    }
}
// end of CUDA Helper Functions

// Allocates a matrix with random float entries.
void randomCmplxInit(cuComplex *data, int size)
{
    for (int i = 0; i < size; ++i)
        data[i] = make_cuComplex( rand() / (float)RAND_MAX, rand() / (float)RAND
_MAX);
}


//void initializeCUDA(int argc, char **argv, int &devID, int &iSizeMultiple, sMa
trixSize &matrix_size)
void initializeCUDA(int argc, char **argv, int &devID)
{
    // By default, we use device 0, otherwise we override the device ID based on
 what is provided at the command line
    cudaError_t error;
    devID = 0;

    if (checkCmdLineFlag(argc, (const char **)argv, "device"))
    {
        devID = getCmdLineArgumentInt(argc, (const char **)argv, "device");
        error = cudaSetDevice(devID);

        if (error != cudaSuccess)
        {
            printf("cudaSetDevice returned error code %d, line(%d)\n", error, __
LINE__);
            exit(EXIT_FAILURE);
        }
    }

    // get number of SMs on this GPU
    error = cudaGetDevice(&devID);
    cudaDeviceProp deviceProp;
    error = cudaGetDeviceProperties(&deviceProp, devID);

    printf("GPU Device %d: \"%s\" with compute capability %d.%d\n\n", devID, dev
iceProp.name, deviceProp.major, deviceProp.minor);

}


////////////////////////////////////////////////////////////////////////////////
//! Run a simple test matrix multiply using CUBLAS
////////////////////////////////////////////////////////////////////////////////
int matrixMultiply(int argc, char **argv, int devID)
{
    int i,j;
    unsigned int m,n,k;
    cudaDeviceProp deviceProp;
    cudaError_t error;

    error = cudaGetDeviceProperties(&deviceProp, devID);

    if (error != cudaSuccess)
    {
        printf("cudaGetDeviceProperties returned error code %d, line(%d)\n", error, __LINE__);
        exit(EXIT_FAILURE);
    }

    // use a larger block size for Fermi and above

    m=3; //number of rows of matrix op(A) and C.  A--> (m x k)
    n=2; //number of columns of matrix op(B) and C. B--> (k x n)
    k=m; //number of columns of op(A) and rows of op(B). C--> (m x n)

    // I want to compute C = A*B in row-major format,
    //so I must find C(T)=B(T)A(T) = C(T)A in column-major format

    // allocate host memory for matrices A and B
    unsigned int size_A = m*m; //size of a symmetric matrix
    printf("size_A = %d\n", size_A);
    unsigned int mem_size_A = sizeof(cuComplex) * size_A;
    cuComplex *h_A = (cuComplex *)malloc(mem_size_A);

    unsigned int size_B = m*n;
    unsigned int mem_size_B = sizeof(cuComplex) * size_B;
    cuComplex *h_B = (cuComplex *)malloc(mem_size_B);

    // initialize host memory
//    for (i = 0; i < size_A; ++i)
//    h_A[i] = make_cuComplex( (float)(i+1),(float)0);
    h_A[0] = make_cuComplex((float)1, (float)0);
    h_A[1] = make_cuComplex((float)2, (float)0);
    h_A[2] = make_cuComplex((float)4, (float)0);
    h_A[3] = make_cuComplex((float)0, (float)0);
    h_A[4] = make_cuComplex((float)3, (float)0);
    h_A[5] = make_cuComplex((float)5, (float)0);
    h_A[6] = make_cuComplex((float)0, (float)0);
    h_A[7] = make_cuComplex((float)0, (float)0);
    h_A[8] = make_cuComplex((float)6, (float)0);

//    for (i = 0; i < size_B; ++i)
//    h_B[i] = make_cuComplex((float)(i+2), (float)0);
    h_B[0] = make_cuComplex((float)2, (float)0);
    h_B[1] = make_cuComplex((float)3, (float)0);
    h_B[2] = make_cuComplex((float)4, (float)0);
    h_B[3] = make_cuComplex((float)5, (float)0);
    h_B[4] = make_cuComplex((float)6, (float)0);
    h_B[5] = make_cuComplex((float)7, (float)0);

    // allocate device memory
    cuComplex *d_A, *d_B, *d_C;
    unsigned int size_C = m*n;
    unsigned int mem_size_C = sizeof(cuComplex) * size_C;

    // allocate host memory for the result
    cuComplex *h_C      = (cuComplex *) malloc(mem_size_C);
    cuComplex *h_CUBLAS = (cuComplex *) malloc(mem_size_C);

    error = cudaMalloc((void **) &d_A, mem_size_A);
    error = cudaMalloc((void **) &d_B, mem_size_B);

    // copy host memory to device
    error = cudaMemcpy(d_A, h_A, mem_size_A, cudaMemcpyHostToDevice);
    error = cudaMemcpy(d_B, h_B, mem_size_B, cudaMemcpyHostToDevice);
    error = cudaMalloc((void **) &d_C, mem_size_C);


    // create and start timer
    printf("Computing result using CUBLAS...");

    // CUBLAS version 2.0
    {
        cublasHandle_t handle;
        cublasStatus_t ret;

        ret = cublasCreate(&handle);

        if (ret != CUBLAS_STATUS_SUCCESS)
        {
            printf("cublasCreate returned error code %d, line(%d)\n", ret, __LINE__);
            exit(EXIT_FAILURE);
        }

        const cuComplex alpha = make_cuComplex(1.0f,0.0f);
        const cuComplex beta  = make_cuComplex(0.0f,0.0f);
        //Perform operation with cublas
        ret = cublasCsymm(handle, CUBLAS_SIDE_RIGHT, CUBLAS_FILL_MODE_LOWER, n,m,&alpha,d_A,m,d_B,n,&beta,d_C,n);

        if (ret != CUBLAS_STATUS_SUCCESS)
        {
            printf("cublasCsymm returned error code %d, line(%d)\n", ret, __LINE__);
            exit(EXIT_FAILURE);
        }

        // copy result from device to host
        error = cudaMemcpy(h_CUBLAS, d_C, mem_size_C, cudaMemcpyDeviceToHost);

        checkError(cublasDestroy(handle), "cublasDestroy() error!\n");
    }


    printf ("\nComputations completed.\n\n");

    printf (" symm matrix A: \n");
//    int s=0;
    for (i=0; i<min(m,4); i++) {
      for (j=0; j<min(m,4); j++) {
        //printf ("%7.5G + j(%7.5G)", h_A[j+i*k].x,h_A[j+i*k].y);
//        printf ("%7.5G", h_A[s].x);
        printf ("%7.5G", h_A[j+(i*m)].x);
//        s++;
      }
      printf ("\n");
    }

    printf ("\n matrix B: \n");
    for (i=0; i<min(k,4); i++) {
      for (j=0; j<min(n,4); j++) {
        //printf ("%7.5G + j(%7.5G)", h_B[j+i*n].x,h_B[j+i*n].y);
          printf ("%7.5G", h_B[j+(i*n)].x);
      }
      printf ("\n");
    }

    printf ("\n matrix C=A*B: \n");
    for (i=0; i<min(m,4); i++) {
      for (j=0; j<min(n,4); j++) {
        //printf ("%7.5G + j(%7.5G)", h_CUBLAS[j+i*n].x,h_CUBLAS[j+i*n].y);
          printf ("%7.5G", h_CUBLAS[j+(i*n)].x);
      }
      printf ("\n");
    }

    // clean up memory
    free(h_A);
    free(h_B);
    free(h_C);
    //free(reference);
    cudaFree(d_A);
    cudaFree(d_B);
    cudaFree(d_C);

    cudaDeviceReset();
    return 0;
}

////////////////////////////////////////////////////////////////////////////////
// Program main
////////////////////////////////////////////////////////////////////////////////
int main(int argc, char **argv)
{
    printf("[Matrix Multiply CUBLAS] - Starting...\n");
    int devID = 0;
    initializeCUDA(argc, argv, devID);
    int matrix_result = matrixMultiply(argc, argv, devID);
    cudaCheckErrors("some error");
    return 0;
}
$ ./t213
[Matrix Multiply CUBLAS] - Starting...
GPU Device 0: "Tesla M2070" with compute capability 2.0

size_A = 9
Computing result using CUBLAS...
Computations completed.

 symm matrix A:
      1      2      4
      0      3      5
      0      0      6

 matrix B:
      2      3
      4      5
      6      7

 matrix C=A*B:
     34     41
     46     56
     64     79
$

ORIGINAL RESPONSE:

Several problems:

When I run your code as you have it posted right now, I don't get the results that you show. Here's what I get:

[Matrix Multiply CUBLAS] - Starting...
GPU Device 0: "Tesla M2070" with compute capability 2.0

Computing result using CUBLAS...
Computations completed.

symm matrix A:
  1
  2      3
  4      5      6

matrix B:
  2      3
  4      5
  6      7

matrix C=A*B:
  -131   -128
   260   -122
  -115    266

The code compiles with a number of warnings and also you're not doing proper error checking (for example you're not checking the return value from cublasCsymm

You are wanting to multiply C = A*B This means A is on the LEFT, but you are passing CUBLAS_SIDE_RIGHT to cublasCsymm Several other cublasCsymm parameters were wrong as well. I think maybe you thought you could do A*B as (B(T)*A(T)) but that only works for square matrices. Not sure what you were thinking, exactly.
You having row-major storage on your matrices and passing them to cublas which interprets them in column-major order. For the following matrix:
```
1 2
3 4 
```

row-major storage looks like this:

    1 2 3 4

column-major storage looks like this:

    1 3 2 4

You can transpose these matrices if you wish, using cublasCgeam or you can manually modify your storage.

You're making some sort of assumption about some kind of compressed storage format for the symmetric matrix A which is not correct. Read carefully the defintion of the storage type. It doesn't say the portion of the matrix that is "supplied" or "present" it says the portion of the matrix that is filled.

Here is a complete code that has the above problems fixed:

// Matrix multiplication: C = A * B.
// Host code.
//

// Utilities and system includes
#include <assert.h>
#include <helper_string.h>  // helper for shared functions common to CUDA SDK sa
mples

// CUDA runtime
#include <cuda_runtime.h>
#include <cublas_v2.h>

// error check macros
#define cudaCheckErrors(msg) \
    do { \
        cudaError_t __err = cudaGetLastError(); \
        if (__err != cudaSuccess) { \
            fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
                msg, cudaGetErrorString(__err), \
                __FILE__, __LINE__); \
            fprintf(stderr, "*** FAILED - ABORTING\n"); \
            exit(1); \
        } \
    } while (0)

// for CUBLAS V2 API
#define cublasCheckErrors(fn) \
    do { \
        cublasStatus_t __err = fn; \
        if (__err != CUBLAS_STATUS_SUCCESS) { \
            fprintf(stderr, "Fatal cublas error: %d (at %s:%d)\n", \
                (int)(__err), \
                __FILE__, __LINE__); \
            fprintf(stderr, "*** FAILED - ABORTING\n"); \
            exit(1); \
        } \
    } while (0)



#ifndef min
#define min(a,b) ((a < b) ? a : b)
#endif
#ifndef max
#define max(a,b) ((a > b) ? a : b)
#endif

////////////////////////////////////////////////////////////////////////////////
// These are CUDA Helper functions (in addition to helper_cuda.h)


void inline checkError(cublasStatus_t status, const char *msg)
{
    if (status != CUBLAS_STATUS_SUCCESS)
    {
        printf("%s", msg);
        exit(EXIT_FAILURE);
    }
}
// end of CUDA Helper Functions

// Allocates a matrix with random float entries.
void randomCmplxInit(cuComplex *data, int size)
{
    for (int i = 0; i < size; ++i)
        data[i] = make_cuComplex( rand() / (float)RAND_MAX, rand() / (float)RAND_MAX);
}


//void initializeCUDA(int argc, char **argv, int &devID, int &iSizeMultiple, sMatrixSize &matrix_size)
void initializeCUDA(int argc, char **argv, int &devID)
{
    // By default, we use device 0, otherwise we override the device ID based on what is provided at the command line
    cudaError_t error;
    devID = 0;

    if (checkCmdLineFlag(argc, (const char **)argv, "device"))
    {
        devID = getCmdLineArgumentInt(argc, (const char **)argv, "device");
        error = cudaSetDevice(devID);

        if (error != cudaSuccess)
        {
            printf("cudaSetDevice returned error code %d, line(%d)\n", error, __LINE__);
            exit(EXIT_FAILURE);
        }
    }

    // get number of SMs on this GPU
    error = cudaGetDevice(&devID);
    cudaDeviceProp deviceProp;
    error = cudaGetDeviceProperties(&deviceProp, devID);

    printf("GPU Device %d: \"%s\" with compute capability %d.%d\n\n", devID, deviceProp.name, deviceProp.major, deviceProp.minor);

}

////////////////////////////////////////////////////////////////////////////////
//! Run a simple test matrix multiply using CUBLAS
////////////////////////////////////////////////////////////////////////////////
int matrixMultiply(int argc, char **argv, int devID)
{
    int i,j;
    unsigned int m,n,k;
    cudaDeviceProp deviceProp;
    cudaError_t error;

    error = cudaGetDeviceProperties(&deviceProp, devID);

    if (error != cudaSuccess)
    {
        printf("cudaGetDeviceProperties returned error code %d, line(%d)\n", error, __LINE__);
        exit(EXIT_FAILURE);
    }

    // use a larger block size for Fermi and above

    m=3; //number of rows of matrix op(A) and C.  A--> (m x k)
    n=2; //number of columns of matrix op(B) and C. B--> (k x n)
    k=m; //number of columns of op(A) and rows of op(B). C--> (m x n)

    // I want to compute C = A*B in row-major format,
    //so I must find C(T)=B(T)A(T) = C(T)A in column-major format

    // allocate host memory for matrices A and B
    unsigned int size_A = m*m; //size of a symmetric matrix
    printf("size_A = %d\n", size_A);
    unsigned int mem_size_A = sizeof(cuComplex) * size_A;
    cuComplex *h_A = (cuComplex *)malloc(mem_size_A);

    unsigned int size_B = m*n;
    unsigned int mem_size_B = sizeof(cuComplex) * size_B;
    cuComplex *h_B = (cuComplex *)malloc(mem_size_B);

    // initialize host memory
//    for (i = 0; i < size_A; ++i)
//    h_A[i] = make_cuComplex( (float)(i+1),(float)0);
    h_A[0] = make_cuComplex((float)1, (float)0);
    h_A[1] = make_cuComplex((float)2, (float)0);
    h_A[2] = make_cuComplex((float)4, (float)0);
    h_A[3] = make_cuComplex((float)0, (float)0);
    h_A[4] = make_cuComplex((float)3, (float)0);
    h_A[5] = make_cuComplex((float)5, (float)0);
    h_A[6] = make_cuComplex((float)0, (float)0);
    h_A[7] = make_cuComplex((float)0, (float)0);
    h_A[8] = make_cuComplex((float)6, (float)0);

//    for (i = 0; i < size_B; ++i)
//    h_B[i] = make_cuComplex((float)(i+2), (float)0);
    h_B[0] = make_cuComplex((float)2, (float)0);
    h_B[1] = make_cuComplex((float)4, (float)0);
    h_B[2] = make_cuComplex((float)6, (float)0);
    h_B[3] = make_cuComplex((float)3, (float)0);
    h_B[4] = make_cuComplex((float)5, (float)0);
    h_B[5] = make_cuComplex((float)7, (float)0);

    // allocate device memory
    cuComplex *d_A, *d_B, *d_C;
    unsigned int size_C = m*n;
    unsigned int mem_size_C = sizeof(cuComplex) * size_C;

    // allocate host memory for the result
    cuComplex *h_C      = (cuComplex *) malloc(mem_size_C);
    cuComplex *h_CUBLAS = (cuComplex *) malloc(mem_size_C);

    error = cudaMalloc((void **) &d_A, mem_size_A);
    error = cudaMalloc((void **) &d_B, mem_size_B);

    // copy host memory to device
    error = cudaMemcpy(d_A, h_A, mem_size_A, cudaMemcpyHostToDevice);
    error = cudaMemcpy(d_B, h_B, mem_size_B, cudaMemcpyHostToDevice);
    error = cudaMalloc((void **) &d_C, mem_size_C);


    // create and start timer
    printf("Computing result using CUBLAS...");

    // CUBLAS version 2.0
    {
        cublasHandle_t handle;
        cublasStatus_t ret;

        ret = cublasCreate(&handle);

        if (ret != CUBLAS_STATUS_SUCCESS)
        {
            printf("cublasCreate returned error code %d, line(%d)\n", ret, __LINE__);
            exit(EXIT_FAILURE);
        }

        const cuComplex alpha = make_cuComplex(1.0f,0.0f);
        const cuComplex beta  = make_cuComplex(0.0f,0.0f);
        //Perform operation with cublas
        ret = cublasCsymm(handle, CUBLAS_SIDE_LEFT, CUBLAS_FILL_MODE_LOWER, m,n,&alpha,d_A,m,d_B,m,&beta,d_C,m);
        if (ret != CUBLAS_STATUS_SUCCESS)
        {
            printf("cublasCsymm returned error code %d, line(%d)\n", ret, __LINE__);
            exit(EXIT_FAILURE);
        }

Here is the output:

[Matrix Multiply CUBLAS] - Starting...
GPU Device 0: "Tesla M2070" with compute capability 2.0

size_A = 9
Computing result using CUBLAS...
Computations completed.

 symm matrix A:
      1      0      0
      2      3      0
      4      5      6

 matrix B:
      2      3
      4      5
      6      7

 matrix C=A*B:
     34     41
     46     56
     64     79