- Allocate memory for `B` with `cudaMalloc()`
- Copy it from host to device with `cudaMemcpy()`
- Pass the device pointer in the kernel argument list
Finally you use it from the kernel with the argument you have passed! Example:
// Kernel definition, see also section 4.2.3 of Nvidia Cuda Programming Guide.
//
// Every thread works on the single element selected by its threadIdx.x:
// it stores 0 in A, its own index in B, and the sum of the two in C.
// The index is derived from threadIdx.x alone and is not bounds-checked, so
// each array must hold at least as many elements as there are threads per block.
__global__ void vecAdd(float* A, float* B, float* C)
{
    // threadIdx.x is a built-in variable provided by CUDA at runtime
    const int tid = threadIdx.x;

    A[tid] = 0;
    B[tid] = tid;
    // Read the operands back from the arrays that were just written.
    C[tid] = A[tid] + B[tid];
}
// Host side of the example: allocate device buffers, copy the inputs over,
// launch vecAdd on them and print the result copied back from the device.
#include <stdio.h>
#define SIZE 10

// Reports a failed CUDA runtime call on stderr.
// Returns true when `status` is cudaSuccess, false otherwise.
static bool cudaOk(cudaError_t status, const char* what)
{
    if (status == cudaSuccess)
        return true;
    fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(status));
    return false;
}

// Returns 0 on success and 1 if any CUDA call failed.
int main()
{
    int N = SIZE;
    // Zero-initialised so the host-to-device copies never read indeterminate values.
    float A[SIZE] = {0}, B[SIZE] = {0}, C[SIZE] = {0};
    // Start as NULL: cudaFree(NULL) performs no operation, so the cleanup at
    // the end is safe even when an allocation failed.
    float *devPtrA = NULL;
    float *devPtrB = NULL;
    float *devPtrC = NULL;
    size_t memsize = SIZE * sizeof(float);

    // ** Step 1: allocate device memory for every array the kernel uses **
    bool ok = cudaOk(cudaMalloc((void**)&devPtrA, memsize), "cudaMalloc(devPtrA)")
           && cudaOk(cudaMalloc((void**)&devPtrB, memsize), "cudaMalloc(devPtrB)")
           && cudaOk(cudaMalloc((void**)&devPtrC, memsize), "cudaMalloc(devPtrC)");

    // ** Step 2: copy the input arrays from host to device **
    ok = ok
      && cudaOk(cudaMemcpy(devPtrA, A, memsize, cudaMemcpyHostToDevice), "cudaMemcpy(A to device)")
      && cudaOk(cudaMemcpy(devPtrB, B, memsize, cudaMemcpyHostToDevice), "cudaMemcpy(B to device)");

    if (ok) {
        // ** Step 3: pass the device pointers in the kernel argument list **
        // __global__ functions are called: Func<<< Dg, Db, Ns >>>(parameter);
        vecAdd<<<1, N>>>(devPtrA, devPtrB, devPtrC);
        // A launch returns no status of its own; launch errors are fetched here.
        ok = cudaOk(cudaGetLastError(), "vecAdd launch");
    }

    // Blocking copy back to the host; errors raised while the kernel was
    // executing are reported by this call.
    ok = ok && cudaOk(cudaMemcpy(C, devPtrC, memsize, cudaMemcpyDeviceToHost),
                      "cudaMemcpy(C to host)");

    if (ok) {
        for (int i = 0; i < SIZE; i++)
            printf("C[%d]=%f\n", i, C[i]);
    }

    // Release each buffer exactly once (the original freed devPtrA three times
    // and never released devPtrB / devPtrC).
    cudaFree(devPtrA);
    cudaFree(devPtrB);
    cudaFree(devPtrC);

    return ok ? 0 : 1;
}
The ** areas are the important part for you. Example taken from here. You may want to look at this question.
EDIT#1:
First of all, to declare a kernel function you need to place the keyword `__global__` before the return type, e.g. `__global__ void copyFNFVecs_(double **FNFVecs, int numpulsars, int numcoeff)`.
Moreover, I would use just one pointer to the first element of the matrix you have: `double *devPtr`. Allocate it with `cudaMalloc((void**)&devPtr, size)` and then copy the data with `cudaMemcpy(devPtr, hostPtr, size, cudaMemcpyHostToDevice)`.
Note that to calculate the size of your structure you need its dimensions (say X and Y) and the size of the underlying element type (say double): `size_t size = X*Y*sizeof(double)`. Using `sizeof(double *)` instead is incorrect, because it is the size of a pointer to a double rather than the size of a double (on 32-bit machines a pointer is 4 bytes, whereas a double is 8 bytes).