Yes, you will need to use separate compilation. I put together a simple test case based on what you have shown so far and on the nvcc separate-compilation library example from the documentation. Here is the code:
kernel_lib.cu:
#include <stdio.h>
/**
 * Report and abort on a pending CUDA runtime error.
 *
 * Fetches the runtime's last-error state with cudaGetLastError() (which also
 * clears it). If that state is not cudaSuccess, prints `msg`, the CUDA error
 * string, and the file/line of the macro invocation to stderr, then ends the
 * process with exit(1). Use it directly after the CUDA call or kernel launch
 * that should be checked.
 *
 * The local variable deliberately avoids a leading double underscore, because
 * such identifiers are reserved for the implementation.
 */
#define cudaCheckErrors(msg) \
    do { \
        cudaError_t cuda_check_err_ = cudaGetLastError(); \
        if (cuda_check_err_ != cudaSuccess) { \
            fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
                    msg, cudaGetErrorString(cuda_check_err_), \
                    __FILE__, __LINE__); \
            fprintf(stderr, "*** FAILED - ABORTING\n"); \
            exit(1); \
        } \
    } while (0)
/**
 * Type definition for the execution function in #qsched_run.
 *
 * Pointer to a function that takes an int and a void pointer and returns
 * nothing. mykernel casts its opaque `func` argument to this type and
 * invokes it with its own `type` and `data` arguments.
 */
typedef void (*qsched_funtype)( int , void * );
/**
 * Kernel that forwards its arguments to a caller-supplied function.
 *
 * @param type  Integer passed through unchanged as the callee's first argument.
 * @param data  Opaque pointer passed through unchanged as the second argument.
 * @param func  Opaque pointer that is cast to qsched_funtype and called.
 *
 * Every thread of the launch performs the call; no indexing or bounds logic
 * is applied here.
 */
__global__ void mykernel(int type, void *data, void *func)
{
    /* Recover the typed function pointer from the opaque argument. */
    qsched_funtype callee = (qsched_funtype)func;
    callee(type, data);
}
/**
 * Host wrapper: launch mykernel with a single thread and wait for it to finish.
 *
 * @param val     Integer forwarded to the kernel as its `type` argument.
 * @param d_data  Pointer forwarded to the kernel as its `data` argument.
 * @param func    Pointer forwarded to the kernel, which casts it to
 *                qsched_funtype and calls it.
 * @return 0 once the kernel has completed. Any CUDA error is handled by
 *         cudaCheckErrors, which prints a diagnostic and exits the process.
 */
int qsched_run_CUDA(int val, void *d_data, void *func)
{
    mykernel<<<1,1>>>(val, d_data, func);
    /* Check right after the launch so a bad launch is reported as such,
       rather than being mixed up with a fault raised during execution. */
    cudaCheckErrors("kernel launch fail");

    /* Block until the kernel finishes; execution faults surface here. */
    cudaDeviceSynchronize();
    cudaCheckErrors("kernel fail");
    return 0;
}
main.cu:
#include <stdio.h>
/* Value main() passes to qsched_run_CUDA and then expects to read back from the device buffer. */
#define DATA_VAL 5
/* Host wrapper implemented in kernel_lib.cu; main() calls it with (int value, device data pointer, function pointer). */
int qsched_run_CUDA(int, void*, void*);
/**
 * Report and abort on a pending CUDA runtime error.
 *
 * Fetches the runtime's last-error state with cudaGetLastError() (which also
 * clears it). If that state is not cudaSuccess, prints `msg`, the CUDA error
 * string, and the file/line of the macro invocation to stderr, then ends the
 * process with exit(1). Use it directly after the CUDA call that should be
 * checked.
 *
 * The local variable deliberately avoids a leading double underscore, because
 * such identifiers are reserved for the implementation.
 */
#define cudaCheckErrors(msg) \
    do { \
        cudaError_t cuda_check_err_ = cudaGetLastError(); \
        if (cuda_check_err_ != cudaSuccess) { \
            fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
                    msg, cudaGetErrorString(cuda_check_err_), \
                    __FILE__, __LINE__); \
            fprintf(stderr, "*** FAILED - ABORTING\n"); \
            exit(1); \
        } \
    } while (0)
/**
 * Type definition for the execution function in #qsched_run.
 *
 * Pointer to a function that takes an int and a void pointer and returns
 * nothing. gpuTest has this signature, and the `function` device variable
 * stores a pointer of this type.
 */
typedef void (*qsched_funtype)( int , void * );
/**
 * Device-side test function matching qsched_funtype.
 *
 * Treats `data` as a pointer to int and stores `type` into its first element.
 *
 * @param type  Value to store.
 * @param data  Destination, interpreted as int*.
 */
__device__ void gpuTest(int type, void *data)
{
    int *out = (int *)data;
    *out = type;
}
/* Device variable initialized with the address of gpuTest; main() reads it
   back with cudaMemcpyFromSymbol to get a pointer it can hand to the kernel. */
__device__ qsched_funtype function = gpuTest;
/**
 * Test driver: fetch the function pointer stored in the `function` device
 * variable, run it through qsched_run_CUDA on a one-int device buffer, and
 * verify that the buffer then holds DATA_VAL.
 *
 * @return 0 on success, 1 if the value read back differs from DATA_VAL.
 *         Any CUDA error ends the process via cudaCheckErrors.
 */
int main()
{
    /* Copy the pointer value held in the `function` device variable to the host. */
    void *func = NULL;
    cudaMemcpyFromSymbol(&func, function, sizeof(qsched_funtype));
    cudaCheckErrors("Failed to copy function pointer from device");

    /* One-int device buffer, zeroed so a stale value cannot fake a pass. */
    int h_data = 0;
    int *d_data = NULL;
    cudaMalloc((void **)&d_data, sizeof(int));
    cudaCheckErrors("cudaMalloc fail");
    cudaMemset(d_data, 0, sizeof(int));
    cudaCheckErrors("cudaMemset fail");

    int return_val = qsched_run_CUDA(DATA_VAL, (void *)d_data, func);
    if (return_val != 0) printf("return code error\n");

    cudaMemcpy(&h_data, d_data, sizeof(int), cudaMemcpyDeviceToHost);
    cudaCheckErrors("cudaMemcpy fail");

    /* Release the device buffer before either return path below. */
    cudaFree(d_data);
    cudaCheckErrors("cudaFree fail");

    if (h_data != DATA_VAL) {printf("Fail! %d\n", h_data); return 1;}
    printf("Success!\n");
    return 0;
}
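Compile commands and result (`-dc` compiles each file to an object containing relocatable device code, so the two can be device-linked in the final step):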
$ nvcc -arch=sm_20 -dc kernel_lib.cu
$ nvcc -lib kernel_lib.o -o test.a
$ nvcc -arch=sm_20 -dc main.cu
$ nvcc -arch=sm_20 main.o test.a -o test
$ ./test
Success!
$
I used CUDA 5.0 for this test.