Question

I'm a beginner in OpenCL. And I've been trying to write a matrix multiplication code. It works fine only it gives garbage value as the output for C array. I'm unable to fix the error. Any help will be much appreciated.

Here's is the host and kernel code.

#include <CL/cl.h>
#include <iostream>
#include <cstdio>
#include <fstream>
#include <stdlib.h>
#include <assert.h>
#include <string.h>

using namespace std;
#define SUCCESS 0
#define FAILURE 1

// Function to convert file name into a string
int convertToString(const char *filename, std::string &s)
{
    size_t size;
    char *str;
    std::fstream f(filename, (std::fstream::in | std::fstream::binary));

    if (f.is_open())
    {
        size_t fileSize;
        f.seekg(0, std::fstream::end);
        size = fileSize = (size_t)f.tellg();
        f.seekg(0, std::fstream::beg);
        str = new char[size + 1];
        if (!str)
        {
            f.close();
            return 0;
        }

        f.read(str, fileSize);
        f.close();
        str[size] = '\0';
        s = str;
        delete[] str;
        return 0;
    }
    cout << "Error: failed to open file\n:" << filename << endl;
    return FAILURE;
}

int main()
{
    cl_uint status;
    cl_int *error;
    int A[9] = {1, 1, 1, 1, 1, 1, 1, 1, 1};
    int B[9] = {2, 2, 2, 2, 2, 2, 2, 2, 2};
    int C[9] = {0, 0, 0, 0, 0, 0, 0, 0, 0};
    // Setting up platforms
    cl_platform_id platform = NULL;
    cl_uint numPlatforms = 0;
    // Getting no of platforms
    status = clGetPlatformIDs(0, NULL, &numPlatforms);
    if (status != CL_SUCCESS)
    {
        cout << "\nUnable to query platforms";
        return 0;
    }

    // Get the platform
    if (numPlatforms > 0)
    {
            cl_platform_id*platforms=
                  cl_platform_id*)malloc(numPlatforms*sizeof(cl_platform_id));
            status = clGetPlatformIDs(numPlatforms, platforms, NULL);
            platform = platforms[0];
            free(platforms);
    }

    cl_uint numDevices = 0;
    cl_device_id *devices = NULL;
    status =
        clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 0, devices, &numDevices);

    if (numDevices == 0)
    {
        cout << "No GPU device available! Choosing CPU.\n";
        status = clGetDeviceIDs(platform, CL_DEVICE_TYPE_CPU, 0, devices,
                                &numDevices);
        devices = (cl_device_id *)malloc(numDevices * sizeof(cl_device_id));
        status = clGetDeviceIDs(platform, CL_DEVICE_TYPE_CPU, numDevices,
                                devices, NULL);
    }

    else
    {
        devices = (cl_device_id *)malloc(numDevices * sizeof(cl_device_id));
        status = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, numDevices,
                                devices, NULL);
        if (status == 0)
        {
            cout << "Device error!";
            return 0;
        }
    }

    // Creating contexts

    cl_context context =
        clCreateContext(NULL, 1, devices, NULL, NULL, (cl_int *)status);

    if (status != CL_SUCCESS)
    {
        cout << status;
    }

    // Creating command queues
    cl_command_queue command =
        clCreateCommandQueue(context, devices[0], 0, NULL);
    //  if(error!=CL_SUCCESS)
    //{
    //  cout<<error;
    //}

    // Creating buffers
    cl_mem bufferA = clCreateBuffer(context, CL_MEM_READ_ONLY,
                                    3 * 3 * sizeof(int), NULL, NULL);
    cl_mem bufferB = clCreateBuffer(context, CL_MEM_READ_ONLY,
                                    3 * 3 * sizeof(int), NULL, NULL);
    cl_mem bufferC = clCreateBuffer(context, CL_MEM_WRITE_ONLY,
                                    3 * 3 * sizeof(int), NULL, NULL);

    status = clEnqueueWriteBuffer(command, bufferA, CL_TRUE, 0, 9 * sizeof(int),
                                  (void *)A, 0, NULL, NULL);
    status = clEnqueueWriteBuffer(command, bufferB, CL_TRUE, 0, 9 * sizeof(int),
                                  (void *)B, 0, NULL, NULL);
    // status=clEnqueueReadBuffer(command,bufferA,CL_TRUE,0,9*sizeof(int),(void*)C,0,NULL,NULL);

    const char *filename = "kernel.cl";
    string sourceStr;
    status = convertToString(filename, sourceStr);
    const char *source = sourceStr.c_str();
    size_t sourceSize[] = {strlen(source)};
    cl_program program =
        clCreateProgramWithSource(context, 1, &source, sourceSize, NULL);

    status = clBuildProgram(program, numDevices, 0, NULL, NULL, NULL);
    cl_kernel myKernel = clCreateKernel(program, "multiply", NULL);

    // Setting kernel arguments
    clSetKernelArg(myKernel, 0, sizeof(cl_mem), &bufferC);
    clSetKernelArg(myKernel, 1, sizeof(cl_mem), &bufferA);
    clSetKernelArg(myKernel, 2, sizeof(cl_mem), &bufferB);

    size_t localws[2] = {9, 9};
    size_t globalws[2] = {3, 3};

    status = clEnqueueNDRangeKernel(command, myKernel, 2, NULL, globalws,
                                    localws, 0, NULL, NULL);
    status = clEnqueueReadBuffer(command, bufferC, CL_TRUE, 0, 9 * sizeof(int),
                                 (void *)C, 0, NULL, NULL);

    for (int i = 0; i < 9; i++) cout << C[i] << " ";
    status = clReleaseKernel(myKernel);  // Release kernel.
    status = clReleaseProgram(program);  // Release program object.
    status = clReleaseMemObject(bufferA);  // Release mem object.
    status = clReleaseMemObject(bufferB);
    status = clReleaseMemObject(bufferC);
    status = clReleaseCommandQueue(command);  // Release  Command queue.
    status = clReleaseContext(context);  // Release context.
}

Kernel code:

__kernel void multiply(_global int outputC, _global int inputA,
                       _global int inputB)
{
    int row = get_global_id(0);
    int col = get_global_id(1);

    int sum = 0;
    for (int i = 0; i < 3; i++)
        sum += inputA[row * 3 + 1] * inputB[i * 3 + col];

    outputC[row + 3 + col] = sum;
}
Was it helpful?

Solution

As already pointed out by @Marco13 the kernel suffers from quite a few issues.

When running this kernel through a tool like clcc you can see that there are a number of compilation errors to begin with:

> clcc matmul.cl 
"/tmp/OCLu7FyFF.cl", line 1: error: identifier "_global" is undefined
  __kernel void multiply(_global int outputC, _global int inputA,
                         ^

"/tmp/OCLu7FyFF.cl", line 1: error: invalid combination of type specifiers
  __kernel void multiply(_global int outputC, _global int inputA,
                                 ^

"/tmp/OCLu7FyFF.cl", line 1: error: identifier "_global" is undefined
  __kernel void multiply(_global int outputC, _global int inputA,
                                              ^

"/tmp/OCLu7FyFF.cl", line 1: error: invalid combination of type specifiers
  __kernel void multiply(_global int outputC, _global int inputA,
                                                      ^

"/tmp/OCLu7FyFF.cl", line 2: error: identifier "_global" is undefined
                         _global int inputB)
                         ^

"/tmp/OCLu7FyFF.cl", line 2: error: invalid combination of type specifiers
                         _global int inputB)
                                 ^

6 errors detected in the compilation of "/tmp/OCLu7FyFF.cl".

A tool like clcc is very useful for catching errors early on. Most vendors also have their own version of a standalone kernel compiler/checker: e.g. Intel has its Kernel Builder, AMD's CodeXL contains a static kernel analyzer. Another option is to retrieve kernel compilation errors right from your host code, by calling clGetProgramBuildInfo to retrieve the compiler output, after clBuildProgram returned CL_BUILD_PROGRAM_FAILURE.

Once these compilation errors are fixed, it looks like your kernel is still not doing what you expect: as noted, the inputs and outputs should be pointers, as you will be passing buffers to the kernel. Also, the indexing of your input and output arrays is incorrect: In the for-loop inputA[row * 3 + 1] should be inputA[row * 3 + i] (i instead of 1). When saving the result to outputC, I would expect outputC[row * 3 + col] (row * 3) instead of row + 3).

I haven't looked in detail at the host code, but I would at least make sure, especially when just starting out with OpenCL, to always check every return code and error. This will save you a lot of time and frustration.

Finally, if you want a quick jump-start to learning OpenCL with a hands-on approach, I would strongly recommend going through the open source Hands-on OpenCL training by Simon McIntosh-Smith and Tom Deakin. It doesn't take very long, is quite pragmatic and provides lots of useful insights. Optimizing matrix multiplication is one of the use cases that is shown step-by-step.

Licensed under: CC-BY-SA with attribution
Not affiliated with StackOverflow
scroll top