Pregunta

I'm using CUDA and attempting to use a function pointer to pass a CUDA function to a library that later uses this function in its device kernel, similar to the CUDA function pointer example.

The important sections of the code are:

/** Type definition for the execution function in #qsched_run. */
typedef void (*qsched_funtype)( int , void * );

/* Device-side function with the qsched_funtype signature; its body is elided in this excerpt. */
__device__ void gpuTest(int type , void *data)
{
  ....
}
/* Device-resident variable initialised with the address of gpuTest, so the host can read that address back by symbol. */
__device__ qsched_funtype function = gpuTest;

void main(...)
{
//Various initialization setup.

// Copy the value of the __device__ variable `function` into the host variable
// `func`. NOTE(review): `func` and `s` are not declared in this excerpt;
// presumably they come from the elided setup above.
if( cudaMemcpyFromSymbol( &func , function , sizeof(qsched_funtype) ) != cudaSuccess)
    error("Failed to copy function pointer from device");

// Hand the copied function pointer to the library entry point.
qsched_run_CUDA( &s , func );
}

The qsched_run_CUDA function is a library function that does some initialization, copies the function pointer to the device (to a variable it can see) and then runs a kernel that at some points calls the gpuTest function using that function pointer.

The code compiles correctly provided I use -G with the following nvcc call:

nvcc -g -G -m64 -I../src ../src/.libs/libquicksched_cuda.a -L/home/aidan/cuda_6.0/lib -L/home/aidan/cuda_6.0/lib64 -lcudart -lcuda -DWITH_CUDA -gencode arch=compute_30,code=sm_30 -lgomp test_gpu_simple.cu -o out.out

where

../src/.libs/libquicksched_cuda.a

is the library containing the qsched_run_CUDA function.

As soon as I remove the -G flag from my nvcc call, it all breaks: the kernel run in qsched_run_CUDA crashes with an invalid program counter error, and the function pointer (including in my own .cu file) is set to 0x4.

Presumably I need to use separate compilation in CUDA ( http://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html#using-separate-compilation-in-cuda ), as explained vaguely in "Cuda function pointer consistency". However, I'm not sure how to do this when using library functions; neither nvcc's guide nor the Stack Overflow link makes it obvious.

Does anyone have any experience with this? I briefly tried to work out how to use nvlink for this, but I didn't get far (it didn't seem happy with my passing it a library).

¿Fue útil?

Solución

Yes, you will need to use separate compilation. I put together a simple test case based on what you have shown so far, and using the nvcc separate compilation library example from the documentation. Here is the code:

kernel_lib.cu:

#include <stdio.h>

/*
 * Abort the process if the CUDA runtime has a pending error.
 *
 * Reads (and thereby clears) the most recent runtime error via
 * cudaGetLastError(). If it is anything other than cudaSuccess, prints
 * `msg`, the CUDA error string and the call site's file and line to stderr,
 * then terminates with exit status 1. Expands to a single statement.
 */
#define cudaCheckErrors(msg) \
    do { \
        const cudaError_t status_ = cudaGetLastError(); \
        if (status_ == cudaSuccess) break; \
        fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
                (msg), cudaGetErrorString(status_), \
                __FILE__, __LINE__); \
        fprintf(stderr, "*** FAILED - ABORTING\n"); \
        exit(1); \
    } while (0)

/** Signature of the execution function used by #qsched_run: (int, void *). */
typedef void (*qsched_funtype)( int , void * );

/**
 * Kernel that calls a function through an untyped pointer.
 *
 * @param type  Integer forwarded as the first argument of the call.
 * @param data  Opaque pointer forwarded as the second argument of the call.
 * @param func  Pointer reinterpreted as a qsched_funtype and invoked.
 *
 * There is no thread-index guard, so every thread of the launch makes the call.
 */
__global__ void mykernel(int type, void *data, void *func){
  // Recover the typed function pointer before calling through it.
  qsched_funtype callback = (qsched_funtype)func;
  callback(type, data);
}

/**
 * Launch mykernel with one block of one thread and wait for it to finish.
 *
 * @param val     Integer forwarded to the kernel as its `type` argument.
 * @param d_data  Pointer forwarded to the kernel as its `data` argument.
 * @param func    Function pointer forwarded to the kernel, which calls it.
 * @return        Always 0; any CUDA error reported after the synchronisation
 *                makes cudaCheckErrors terminate the process instead.
 */
int qsched_run_CUDA(int val, void *d_data, void *func)
{
  const dim3 gridDims(1);
  const dim3 blockDims(1);

  mykernel<<<gridDims, blockDims>>>(val, d_data, func);

  // Block until the kernel has completed so that execution errors are visible
  // to the error check below.
  cudaDeviceSynchronize();
  cudaCheckErrors("kernel fail");

  return 0;
}

main.cu:

#include <stdio.h>
#define DATA_VAL 5

int qsched_run_CUDA(int, void*, void*);

/*
 * Abort the process if the CUDA runtime has a pending error.
 *
 * Reads (and thereby clears) the most recent runtime error via
 * cudaGetLastError(). If it is anything other than cudaSuccess, prints
 * `msg`, the CUDA error string and the call site's file and line to stderr,
 * then terminates with exit status 1. Expands to a single statement.
 */
#define cudaCheckErrors(msg) \
    do { \
        const cudaError_t status_ = cudaGetLastError(); \
        if (status_ == cudaSuccess) break; \
        fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
                (msg), cudaGetErrorString(status_), \
                __FILE__, __LINE__); \
        fprintf(stderr, "*** FAILED - ABORTING\n"); \
        exit(1); \
    } while (0)

/** Signature of the execution function used by #qsched_run: (int, void *). */
typedef void (*qsched_funtype)( int , void * );

/**
 * Device function that stores `type` into the first int that `data` points to.
 *
 * @param type  Value to store.
 * @param data  Pointer treated as an int array; element 0 is overwritten.
 */
__device__ void gpuTest(int type , void *data)
{
  int *out = (int *)data;
  *out = type;
}

/** Device variable holding the address of gpuTest, read back by the host by symbol. */
__device__ qsched_funtype function = gpuTest;


/**
 * Test driver: fetches the device address of gpuTest, passes it to
 * qsched_run_CUDA, and checks that the callback wrote DATA_VAL into a
 * device-side int.
 *
 * @return 0 when the value read back equals DATA_VAL, 1 otherwise. Any CUDA
 *         runtime error terminates the process through cudaCheckErrors.
 */
int main()
{
  // Read the value of the __device__ variable `function` (the address of
  // gpuTest) into a host-side pointer that can be passed as a kernel argument.
  void *func = 0;
  cudaMemcpyFromSymbol( &func , function , sizeof(qsched_funtype));
  cudaCheckErrors("Failed to copy function pointer from device");

  // One zero-initialised int on the device for the callback to overwrite.
  int h_data = 0;
  int *d_data = 0;
  cudaMalloc((void **)&d_data, sizeof(int));
  cudaCheckErrors("cudaMalloc fail");
  cudaMemset(d_data, 0, sizeof(int));
  cudaCheckErrors("cudaMemset fail");

  int return_val = qsched_run_CUDA(DATA_VAL, (void *)d_data, func);
  if (return_val != 0) printf("return code error\n");

  cudaMemcpy(&h_data, d_data, sizeof(int), cudaMemcpyDeviceToHost);
  cudaCheckErrors("cudaMemcpy fail");

  // Release the device buffer before reporting, so that neither the success
  // path nor the early-return failure path leaks it.
  cudaFree(d_data);
  cudaCheckErrors("cudaFree fail");

  if (h_data != DATA_VAL) {printf("Fail! %d\n", h_data); return 1;}
  printf("Success!\n");
  return 0;
}

compile commands and result:

$ nvcc -arch=sm_20 -dc kernel_lib.cu
$ nvcc -lib kernel_lib.o -o test.a
$ nvcc -arch=sm_20 -dc main.cu
$ nvcc -arch=sm_20 main.o test.a -o test
$ ./test
Success!
$

I used CUDA 5.0 for this test.

Licenciado bajo: CC-BY-SA con atribución
No afiliado a StackOverflow
scroll top