Question

This is a sample problem to help me understand how OpenCL works.

Problem: I have three arrays: array1, array2, and array3. The element-wise sum of array2 and array3 is stored in array1.

example: array1[1] = array2[1] + array3[1];

array1, array2, and array3 are global double arrays in a C program. These arrays are initialized with the value 1.0.

These arrays are then passed to the GPU in OpenCL buffers. In the OpenCL code I am using 10 work-items, so each work-item handles one element of these arrays by calling the desired function and updating the buffer. The updated buffers are then read back into array1, array2, and array3.

The updated value of array1 is passed to array2 and array3, and the kernel is called again to re-evaluate.

Hence the expected output is:

loading kernel..
kernel loaded..
Step 0..
array1[0] = 2.000000
array1[1] = 2.000000
array1[2] = 2.000000
array1[3] = 2.000000
array1[4] = 2.000000
array1[5] = 2.000000
array1[6] = 2.000000
array1[7] = 2.000000
array1[8] = 2.000000
array1[9] = 2.000000
....
....
....
Step 10..
array1[0] = 18.000000
array1[1] = 18.000000
array1[2] = 18.000000
array1[3] = 18.000000
array1[4] = 18.000000
array1[5] = 18.000000
array1[6] = 18.000000
array1[7] = 18.000000
array1[8] = 18.000000
array1[9] = 18.000000

This works correctly on an nVidia Corporation GeForce GT 630 (rev a1) graphics card with driver version 331.49.

But if I run the same code on an nVidia Corporation G96 [GeForce 9500 GT] (rev a1) graphics card with driver version 260.19.26, the results are wrong. In fact, the values of array1 do not change at all:

loading kernel..
kernel loaded..
Step 0..
array1[0] = 1.000000
array1[1] = 1.000000
array1[2] = 1.000000
array1[3] = 1.000000
array1[4] = 1.000000
array1[5] = 1.000000
array1[6] = 1.000000
array1[7] = 1.000000
array1[8] = 1.000000
array1[9] = 1.000000
....
....
....
Step 10..
array1[0] = 1.000000
array1[1] = 1.000000
array1[2] = 1.000000
array1[3] = 1.000000
array1[4] = 1.000000
array1[5] = 1.000000
array1[6] = 1.000000
array1[7] = 1.000000
array1[8] = 1.000000
array1[9] = 1.000000

Why do I get different results on different graphics cards?

running.c code:

#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#ifdef __APPLE__
#include <OpenCL/opencl.h>
#else
#include <CL/cl.h>
#endif

#define MAX_SOURCE_SIZE (0x100000)

/* OpenCL handles and status values shared by create() and eval_eq(). */
cl_device_id device_id = NULL;
cl_context context = NULL;
cl_command_queue command_queue = NULL;
cl_program program = NULL;
cl_kernel kernel = NULL;
cl_platform_id platform_id = NULL;
cl_uint ret_num_devices;
cl_uint ret_num_platforms;
cl_int ret;
cl_event event;
size_t source_size;
cl_mem array1Buffer, array2Buffer, array3Buffer;

/* Number of elements in each host array. */
#define size 10

/*
 * One work-item per array element.  This used to be hard-coded to 5, which
 * would leave the upper half of the arrays untouched by the kernel.
 */
size_t global_work_size[1] = {size};

/* Host-side copies of the data exchanged with the device. */
double array1[size];
double array2[size];
double array3[size];


void create () {
    FILE *fp;
    char *source_str;
    fp = fopen("calc.cl", "r");
    if (!fp) {
        fprintf(stderr, "Failed to load kernel.\n");
        exit(1);
    }
    source_str = (char*)malloc(MAX_SOURCE_SIZE);
    source_size = fread(source_str, 1, MAX_SOURCE_SIZE, fp);
    fclose(fp);


    /*Initialization*/
    /* Get Platform and Device Info */
    ret = clGetPlatformIDs(1, &platform_id, &ret_num_platforms);
    ret = clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_DEFAULT, 1, &device_id, &ret_num_devices);

    /* Create OpenCL context */
    context = clCreateContext(NULL, 1, &device_id, NULL, NULL, &ret);

    /* Create Command Queue */
    command_queue = clCreateCommandQueue(context, device_id, 0, &ret);

    /*Initialization complete*/

    int i;
    for (i = 0; i< size ; i++) {
        array1[i] = 1.0;
        array2[i] = 1.0;
        array3[i] = 1.0;
    }

    /* Create Kernel Program from the source */
    program = clCreateProgramWithSource(context, 1, (const char **)&source_str,(const size_t *)&source_size, &ret);

    /* Build Kernel Program */
    ret = clBuildProgram(program, 1, &device_id, NULL, NULL, NULL);

    /* Create OpenCL Kernel */
    kernel = clCreateKernel(program, "eval", &ret);
}

void eval_eq () {
    ret = clReleaseMemObject(array1Buffer);
    ret = clReleaseMemObject(array2Buffer);
    ret = clReleaseMemObject(array3Buffer);
    array1Buffer = clCreateBuffer(context, CL_MEM_READ_WRITE|CL_MEM_COPY_HOST_PTR, size * sizeof(double),(void *) array1, NULL);
    ret = clEnqueueWriteBuffer(command_queue,
                                  array1Buffer,
                                  CL_FALSE,
                                  0,
                                  size * sizeof(double),
                                  array1,
                                  0,
                                  NULL,
                                  &event);
    ret = clWaitForEvents(1, &event);
    clReleaseEvent(event);
    array2Buffer = clCreateBuffer(context, CL_MEM_READ_WRITE|CL_MEM_COPY_HOST_PTR, size * sizeof(double),(void *) array2, NULL);
    ret = clEnqueueWriteBuffer(command_queue,
                                  array2Buffer,
                                  CL_FALSE,
                                  0,
                                  size * sizeof(double),
                                  array2,
                                  0,
                                  NULL,
                                  &event);
    ret = clWaitForEvents(1, &event);
    clReleaseEvent(event);
    array3Buffer = clCreateBuffer(context, CL_MEM_READ_WRITE|CL_MEM_COPY_HOST_PTR, size * sizeof(double),(void *) array3, NULL);
    ret = clEnqueueWriteBuffer(command_queue,
                                  array3Buffer,
                                  CL_FALSE,
                                  0,
                                  size * sizeof(double),
                                  array3,
                                  0,
                                  NULL,
                                  &event);
    ret = clWaitForEvents(1, &event);
    clReleaseEvent(event);
    ret = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&array1Buffer);
    ret = clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&array2Buffer);
    ret = clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *)&array3Buffer);
    ret = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, global_work_size, NULL, 0, NULL, NULL);
    ret = clEnqueueReadBuffer(command_queue, array1Buffer, CL_TRUE, 0, size * sizeof(double), array1, 0, NULL, NULL);
    ret = clEnqueueReadBuffer(command_queue, array2Buffer, CL_TRUE, 0, size * sizeof(double), array2, 0, NULL, NULL);
    ret = clEnqueueReadBuffer(command_queue, array3Buffer, CL_TRUE, 0, size * sizeof(double), array3, 0, NULL, NULL);
}


/*
 * Program entry point: set up OpenCL once, then run size + 1 evaluation
 * steps.  After each step the contents of array1 are printed and array2 and
 * array3 are overwritten with the current step number for the next step.
 */
int main () {
    printf("loading kernel..\n");
    create();
    printf("kernel loaded..\n");

    int step = 0;
    while (step <= size) {
        printf("Step %d..\n", step);
        eval_eq();

        /* Report the result of this step. */
        for (int k = 0; k < size; ++k) {
            printf("array1[%d] = %lf\n", k, array1[k]);
        }

        /* Seed the inputs of the next step with the step index. */
        const double seed = (double) step;
        for (int k = 0; k < size; ++k) {
            array2[k] = seed;
            array3[k] = seed;
        }

        ++step;
    }
    return 0;
}

calc.cl code:

/*
 * double is an optional type in OpenCL 1.0/1.1 and must be enabled through
 * the cl_khr_fp64 extension before it is used.  Implementations that support
 * the extension define a macro of the same name; on devices without
 * double-precision support the build stops here with an explicit message.
 */
#if defined(cl_khr_fp64)
#pragma OPENCL EXTENSION cl_khr_fp64 : enable
#elif defined(cl_amd_fp64)
#pragma OPENCL EXTENSION cl_amd_fp64 : enable
#else
#error "This device does not support double precision (cl_khr_fp64)."
#endif

/* Store the sum ar2[gpno] + ar3[gpno] into ar1[gpno]. */
void sub_gp (__global double* ar1, __global double* ar2, __global double* ar3, int gpno) {
    ar1[gpno] = ar2[gpno] + ar3[gpno];
}

/* Forward the element index gpno and all three arrays unchanged to sub_gp(). */
void gp (__global double* ar1, __global double* ar2, __global double* ar3, int gpno) {
    const int element = gpno;

    sub_gp(ar1, ar2, ar3, element);
}


/*
 * Kernel entry point: each work-item passes its global id in dimension 0 to
 * gp() as the array index to process.
 */
__kernel void eval(__global double* ar1, __global double* ar2, __global double* ar3)
{
    /* get_global_id() returns size_t; gp() takes the index as int. */
    const int work_item = get_global_id(0);

    gp(ar1, ar2, ar3, work_item);
}

No correct solution

Licensed under: CC-BY-SA with attribution
Not affiliated with StackOverflow
scroll top