Question

I am trying to use the vector::data() pointer when using cudaMalloc, cudaMemcpy, and cublasSgemm but I can't seem to get it to work. If I am not mistaken, vector::data() should return a pointer to the actual array stored in memory for that vector so it should be the same as having a T* aArray pointer to an array of type T stored in memory. Using the latter does work, but not the data() pointer.

Here is the code I am working on:

Matrix<T> Matrix<T>::cudaProd(Matrix<T>&A,Matrix<T>&B, Matrix<T>&C)
{
C = Matrix<T>(A.height, B.width); //resizing of the vector of elements for Matrix C
//A[m][n]*B[n][k]=C[m][k]
int m = A.height;
int n = B.height;
int k = B.width;
float alpha = 1.0f;
float beta = 0.0f;

T* d_a = A.GetPointer();
T* d_b = B.GetPointer();
T* d_c = C.GetPointer();

cudaMalloc(&d_a,A.size);
cudaMalloc(&d_b,B.size);
cudaMalloc(&d_c,C.size);

cudaMemcpy(d_a,A.GetPointer(),A.size,cudaMemcpyHostToDevice);
cudaMemcpy(d_b,B.GetPointer(),B.size,cudaMemcpyHostToDevice);

cublasHandle_t handle;

cublasStatus_t status = cublasCreate(&handle);

if (status != CUBLAS_STATUS_SUCCESS) 
{
    std::cerr << "!!!! CUBLAS initialization error\n";
}

status = cublasSgemm(handle,CUBLAS_OP_N,CUBLAS_OP_N,k,m,n,&alpha,d_b,k,d_a,n,&beta,d_c,k);

if (status != CUBLAS_STATUS_SUCCESS) 
{
    std::cerr << "!!!! kernel execution error.\n";
}

status = cublasDestroy(handle);
if (status != CUBLAS_STATUS_SUCCESS) 
{
    std::cerr << "!!!! shutdown error (A)\n";
}

cudaMemcpy(C.GetPointer(), d_c, C.size,cudaMemcpyDeviceToHost);

cudaFree(d_a);
cudaFree(d_b);
cudaFree(d_c);

The GetPointer() member function returns vector::data() of the vector of elements for that Matrix object. Size is the vector element's size in memory.

The vector of Matrix C returns all zeros when using the data() pointer, and returns the product of Matrix A and B when using T* aArray pointers without vectors.

Is it actually possible to use vectors to store the array of elements and then the data() pointer to initialize the device copy of the array, or am I forced to use the C style array storage on the host? Also, I have tried using thrust::device_vector and that works but I would like to stay away from creating raw_pointer_casts.

Thanks for your help!

Edit: For those having trouble with copy and pasting, here is the complete example:

#include <cuda.h>
#include <cuda_runtime.h>
#include <cuda_device_runtime_api.h>
#include <cublas_v2.h>
#include <vector>
#include <iostream>

using namespace std;

template<typename T> class Matrix
{
public:
~Matrix();
Matrix();
Matrix(int rows, int columns);
int width;
int height;
int stride;
size_t size;

T &GetElement(int row, int column);
void SetElement(int row, int column, T value);
void SetElements(vector<T> value);
vector<T>& GetElements();
T* GetPointer();
Matrix<T> cudaProd(Matrix<T>&A,Matrix<T>&B, Matrix<T>&C);
private:
vector<T> elements;
T* firstElement;
};

template<typename T>
Matrix<T>::~Matrix()
{
}

template<typename T>
Matrix<T>::Matrix()
{
}

template<typename T>
Matrix<T>::Matrix(int rows, int columns)
{
height = rows;
width = columns;
stride = columns; //in row major order this is equal to the # of columns
elements.resize(rows*columns);
firstElement = elements.data();
size = height*width*sizeof(T);
}

template<typename T>
T &Matrix<T>::GetElement(int row, int column)
{
return elements[row*width + column]; //row major order return
}

template<typename T>
vector<T>& Matrix<T>::GetElements()
{
return elements; //row major order return
}

template<typename T>
void Matrix<T>::SetElement(int row, int column, T value)
{
elements[row*width + column] = value; //row major order return
}

template<typename T>
void Matrix<T>::SetElements(vector<T> value)
{
elements = value;
}

template<typename T>
T* Matrix<T>::GetPointer()
{
return firstElement;
}


template<typename T>
//Matrix Multiplication using CUDA
Matrix<T> Matrix<T>::cudaProd(Matrix<T>&A,Matrix<T>&B, Matrix<T>&C)
{
C = Matrix<T>(A.height, B.width);
//A[m][n]*B[n][k]=C[m][k]
int m = A.height;
int n = B.height;
int k = B.width;
float alpha = 1.0f;
float beta = 0.0f;


//Thrust usage

/*thrust::device_vector<T> d_A = A.GetElements();
T* d_a = thrust::raw_pointer_cast(&d_A[0]);
thrust::device_vector<T> d_B = B.GetElements();
T* d_b = thrust::raw_pointer_cast(&d_B[0]);
thrust::device_vector<T> d_C = C.GetElements();
T* d_c = thrust::raw_pointer_cast(&d_C[0]);*/

T* d_a = A.GetPointer();
T* d_b = B.GetPointer();
T* d_c = C.GetPointer();

cudaMalloc(&d_a,A.size);
cudaMalloc(&d_b,B.size);
cudaMalloc(&d_c,C.size);

cudaMemcpy(d_a,A.GetPointer(),A.size,cudaMemcpyHostToDevice);
cudaMemcpy(d_b,B.GetPointer(),B.size,cudaMemcpyHostToDevice);
cudaMemcpy(d_c,C.GetPointer(),C.size,cudaMemcpyHostToDevice);

cublasHandle_t handle;

cublasStatus_t status = cublasCreate(&handle);

if (status != CUBLAS_STATUS_SUCCESS) 
{
    std::cerr << "!!!! CUBLAS initialization error\n";
}

status = cublasSgemm(handle,CUBLAS_OP_N,CUBLAS_OP_N,k,m,n,&alpha,d_b,k,d_a,n,&beta,d_c,k);

if (status != CUBLAS_STATUS_SUCCESS) 
{
    std::cerr << "!!!! kernel execution error.\n";
}

status = cublasDestroy(handle);
if (status != CUBLAS_STATUS_SUCCESS) 
{
    std::cerr << "!!!! shutdown error (A)\n";
}

//thrust::copy(d_C.begin(), d_C.end(), C.GetElements().begin());

cudaMemcpy(C.GetPointer(), d_c, C.size,cudaMemcpyDeviceToHost);

cudaFree(d_a);
cudaFree(d_b);
cudaFree(d_c);

return C;

}

int main()
{
    Matrix<float> A(2,2);
Matrix<float> B(2,2);
Matrix<float> C;

vector<float> aE(4,2);
vector<float> bE(4,4);
A.SetElements(aE);
B.SetElements(bE);

C = C.cudaProd(A, B, C);  //function call to cudaProd()

for(int row = 0; row < A.height; ++row)
{
    for(int col = 0; col < A.width; ++col)
    {       
        cout<<A.GetElement(row, col)<<" "; //h_c is stored on device in column major order, need to switch to row major order
     }
     printf("\n");
}
printf("\n");

for(int row = 0; row < B.height; ++row)
{
    for(int col = 0; col < B.width; ++col)
    {       
        cout<<B.GetElement(row, col)<<" "; //h_c is stored on device in column major order, need to switch to row major order
     }
     printf("\n");
}
printf("\n");

for(int row = 0; row < C.height; ++row)
{
    for(int col = 0; col < C.width; ++col)
    {       
        cout<<C.GetElement(row, col)<<" "; //h_c is stored on device in column major order, need to switch to row major order
     }
     printf("\n");
}
printf("\n");
}
Was it helpful?

Solution 2

From the std::vector::data documentation, data() returns both const and non-const qualified pointers, depending on the fact that the vector is qualified as const or not. Quoting the documentation

If the vector object is const-qualified, the function returns a pointer to const value_type. Otherwise, it returns a pointer to value_type.

Accordingly, using

firstElement = elements.data();

in the Matrix constructor is fine to read/write the data.

The main problem with your code is that you are declaring C in the main, passing a reference to C to the cudaProd method and then internally using

C = Matrix<T>(A.height, B.width);

which will redeclare the Matrix.

If you change the definition of the cudaProd method to

template<typename T>
void cudaProd(Matrix<T>&A,Matrix<T>&B, Matrix<T>&C)

remove the

return C;

statement and allocate space for C in the main as

Matrix<float> C(2,2);
vector<float> cE(4,10);
C.SetElements(cE);

your code should work correctly.

OTHER TIPS

If I am not mistaken, vector::data() should return a pointer to the actual array stored in memory for that vector so it should be the same as having a T* aArray pointer to an array of type T stored in memory.

The std::vector class is an owning resource class. It means that trying to manage the underlying resource yourself with the data pointer will make you enter a world of pain.

For this very same reason:

cudaMalloc(&d_a,A.size);
cudaMalloc(&d_b,B.size);
cudaMalloc(&d_c,C.size);

and:

cudaMemcpy(C.GetPointer(), d_c, C.size,cudaMemcpyDeviceToHost);

and:

cudaFree(d_a);
cudaFree(d_b);
cudaFree(d_c);

cannot possibly work.

Licensed under: CC-BY-SA with attribution
Not affiliated with StackOverflow
scroll top