I'm using PyOpenCL to let my GPU do some regression on a large data set. Right now the GPU is slower than the CPU, probably because there is a loop that requires access to the global memory during each increment (I think...). The data set is too large to store into the local memory, but each loop does not require the entire data set, so I want to copy a portion of this array to the local memory. My question is: how do I do this? In Python one can easily slice a portion, but I don't think that's possible in OpenCL.
Here's the OpenCL code I'm using, if you spot any more potential optimisations, please shout:
__kernel void gpu_slope(__global double * data, __global double * time, __global int * win_results, const unsigned int N, const unsigned int Nmax, const double e, __global double * result) {
__local unsigned int n, length, leftlim, rightlim, i;
__local double sumx, sumy, x, y, xx, xy, invlen, a, b;
n = get_global_id(0);
leftlim = win_results[n*2];
rightlim = win_results[n*2+1];
sumx = 0;
sumy = 0;
xy = 0;
xx = 0;
length = rightlim - leftlim;
for(i = leftlim; i <= rightlim; i++) {
x = time[i]; /* I think this is fetched from global memory */
y = data[i];
sumx += x;
sumy += y;
xy += x*y;
xx += x*x;
}
invlen = 1.0/length;
a = xy-(sumx*sumy)*invlen;
b = xx-(sumx*sumx)*invlen;
result[n] = a/b;
}
I'm new to OpenCL, so please bear with me. Thanks!