Simple programme OpenCL compile et fonctionne, mais la sortie est incorrect
Question
J'ai écrit un simple programme OpenCL basé sur l'SDK et il compile et fonctionne, mais la sortie est erroné. Y at-il quelque chose que je fais mal?
Toutes les suggestions pour apprendre à debug C et OpenCL est très apprécié. Je suis tout à fait nouvelle à la plate-forme.
Le code est ci-dessous.
La sortie en c du tableau est zéro.
Merci.
test_opencl.h
#ifndef _TEST_OPENCL_H_
#define _TEST_OPENCL_H_
int main( int argc, const char** argv);
int runTest( int argc, const char** argv);
#endif
test_opencl.cl
// simple test of adding a[i] to b[i] to get c[i]
__kernel void add_array(__global float *a, __global float *b, __global float *c)
{
int xid = get_global_id(0);
c[xid] = a[xid] + b[xid];
}
test_opencl.cpp
// standard utility and system includes
#include <oclUtils.h>
#include "test_opencl.h"
// OpenCL error catcher
cl_int err = 0;
// Main Program
// *********************************************************************
int main( int argc, const char** argv)
{
// set logfile name and start logs
shrSetLogFileName ("test_opencl.txt");
shrLog(LOGBOTH, 0, "%s Starting...\n\n", argv[0]);
// run the main test
int result = runTest(argc, argv);
shrCheckError(result, 0);
// finish
shrEXIT(argc, argv);
}
//! Run a simple test for OPENCL
// *********************************************************************
int runTest( int argc, const char** argv)
{
cl_context gpu_context;
cl_command_queue cmd_queue;
cl_program program;
cl_kernel test_kernel;
const size_t szGlobalWorkSize = 10;
const size_t szLocalWorkSize = 10;
// size of memory required to store the array
const unsigned int mem_size = sizeof(int) * 10;
// create the OpenCL context on a GPU device
gpu_context = clCreateContextFromType(0, CL_DEVICE_TYPE_GPU, NULL, NULL, &err);
shrCheckError(err, CL_SUCCESS);
// get devices
cl_device_id device;
if( shrCheckCmdLineFlag(argc, argv, "device") ) {
int device_nr = 0;
shrGetCmdLineArgumenti(argc, argv, "device", &device_nr);
device = oclGetDev(gpu_context, device_nr);
} else {
device = oclGetMaxFlopsDev(gpu_context);
}
// create a command-queue
cmd_queue = clCreateCommandQueue(gpu_context, device, 0, &err);
shrCheckError(err, CL_SUCCESS);
// allocate and initalize host memory
int a[10], b[10], c[10];
for (int i = 0; i < 10; i++) {
a[i] = i;
b[i] = i * i;
}
// create buffers on device
cl_mem vol_a = clCreateBuffer(gpu_context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, mem_size, a, &err);
shrCheckError(err, CL_SUCCESS);
cl_mem vol_b = clCreateBuffer(gpu_context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, mem_size, b, &err);
shrCheckError(err, CL_SUCCESS);
cl_mem vol_c = clCreateBuffer(gpu_context, CL_MEM_WRITE_ONLY | CL_MEM_COPY_HOST_PTR, mem_size, c, &err);
shrCheckError(err, CL_SUCCESS);
// copy data from host to device
err = clEnqueueWriteBuffer(cmd_queue, vol_a, CL_TRUE, 0, mem_size, a, 0, NULL, NULL);
err |= clEnqueueWriteBuffer(cmd_queue, vol_b, CL_TRUE, 0, mem_size, b, 0, NULL, NULL);
shrCheckError(err, CL_SUCCESS);
// Program Setup
size_t program_length;
char* source_path = shrFindFilePath("test_opencl.cl", argv[0]);
shrCheckError(source_path != NULL, shrTRUE);
char *source = oclLoadProgSource(source_path, "", &program_length);
shrCheckError(source != NULL, shrTRUE);
// create the program
program = clCreateProgramWithSource(gpu_context, 1, (const char **)&source, &program_length, &err);
shrCheckError(err, CL_SUCCESS);
// build the program
err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL);
if (err != CL_SUCCESS)
{
// write out standard error, Build Log and PTX, then return error
shrLog(LOGBOTH | ERRORMSG, err, STDERROR);
return(EXIT_FAILURE);
}
clFinish(cmd_queue);
shrLog(LOGBOTH, 0, "%s Starting kernel operation...\n\n", argv[0]);
// create the test kernel
test_kernel = clCreateKernel(program, "add_array", &err);
shrCheckError(err, CL_SUCCESS);
// set the args values for the kernel
err = clSetKernelArg(test_kernel, 0, sizeof(cl_mem), (void *) &vol_a);
err |= clSetKernelArg(test_kernel, 1, sizeof(cl_mem), (void *) &vol_b);
err |= clSetKernelArg(test_kernel, 2, sizeof(cl_mem), (void *) &vol_c);
shrCheckError(err, CL_SUCCESS);
err = clEnqueueNDRangeKernel(cmd_queue, test_kernel, 1, NULL, &szGlobalWorkSize, NULL, 0, NULL, NULL);
shrCheckError(err, CL_SUCCESS);
clFinish(cmd_queue);
// copy result from device to host
err = clEnqueueReadBuffer(cmd_queue, vol_c, CL_TRUE, 0, mem_size, c, 0, NULL, NULL);
shrCheckError(err, CL_SUCCESS);
int d[10];
err = clEnqueueReadBuffer(cmd_queue, vol_a, CL_TRUE, 0, mem_size, d, 0, NULL, NULL);
shrCheckError(err, CL_SUCCESS);
clFinish(cmd_queue);
shrLog(LOGBOTH, 0, "%s Finished kernel operation...\n\n", argv[0]);
bool passed = true;
for (int i = 0; i < 10; i++) {
if (c[i] != i + i * i)
passed = false;
shrLog(LOGBOTH, 0, "c = %d d = %d\n", c[i], d[i]);
}
if (passed)
shrLog(LOGBOTH, 0, "%s Test Passed\n\n", argv[0]);
else
shrLog(LOGBOTH, 0, "%s Test Failed\n\n", argv[0]);
// cleanup OpenCL
clReleaseMemObject(vol_a);
clReleaseMemObject(vol_b);
clReleaseMemObject(vol_c);
clReleaseKernel(test_kernel);
clReleaseProgram(program);
clReleaseCommandQueue(cmd_queue);
clReleaseContext(gpu_context);
return 0;
}
Licencié sous: CC-BY-SA avec attribution
Non affilié à StackOverflow