سؤال

OpenCL Newbie question :)

I'm trying to write an opencl kernel like

__kernel void NLLSQ
(
    __global double* image,
    __global double* nllsqResult
)
{
    //Do some stuff
}

Which works fine until I try to put a loop in. ie:

__kernel void NLLSQ
(
    __global double* image,
    __global double* nllsqResult
)
{
    for (int i = 0; i < 2; i++)
    {
        //Do some stuff
    }
}

Which causes my computer to crash an monitor to go black. I think the problem is that I've sent too much work to the graphics card.

My full code looks like this

#ifdef cl_khr_fp64
    #pragma OPENCL EXTENSION cl_khr_fp64 : enable
#elif defined(cl_amd_fp64)
    #pragma OPENCL EXTENSION cl_amd_fp64 : enable
#else
    #error "Double precision doubleing point not supported by OpenCL implementation."
#endif

int2 clipPixel(int2 coordinate, int width, int height)
{
    coordinate.x = max(0, coordinate.x);
    coordinate.y = max(0, coordinate.y);
    coordinate.x = min(width, coordinate.x); //1911
    coordinate.y = min(height, coordinate.y); //1071
    return coordinate;
}

int Coord2Index(int X, int Y, int width)
{
    return (width * Y) + X;
}



//2D Gaussian 'bubble' Function
double f(int x, int y, double a, double b, double s)
{
    return a + b*exp(-(x*x+y*y)/(s*s));
}

// (∂f/∂b)
double dfdb(int x, int y, double s)
{
    return exp(-(x*x+y*y)/(s*s));
}

// (∂f/∂σ)
double dfds(int x, int y, double b, double s)
{
    double v = -(x*x + y*y);
    return b * exp(v/(s*s))*-2*v/(s*s*s);
}

//Non-Linear Least Squares 
__kernel void NLLSQ
(
    __global double* image,
    __global double* nllsqResult
)
{
    const int x = get_global_id(0);
    const int y = get_global_id(1);

    int index = Coord2Index( x, y, 1912 );
    int jacIndex = 0;
    int dyIndex = 0;
    int indexRslt = Coord2Index( x, y, 1904 );

    double dY[81];
    double J[81][3];
    double JTJ[3][3];
    double3 B = (double3)(0, 1, 1); //initial guess
    double JTdY[3]; 

    //Creates the dY vector
    for (int j = -4; j <= 4; j++)
    {
        for (int i = -4; i <= 4; i++)
        {
            dY[dyIndex] = image[index] - f( i, j, B.x, B.y, B.z);
            dyIndex = dyIndex + 1;
        }
    }

    //Creates the Jacobian 
    for (int j = -4; j <= 4; j++)
    {
        for (int i = -4; i <= 4; i++)
        {
            index = Coord2Index( x + i + 4, y + j + 4, 1912 );

            J[jacIndex][0] = 1;
            J[jacIndex][1] = dfdb(i, j, B.z);
            J[jacIndex][2] = dfds(i, j, B.y, B.z);

            jacIndex = jacIndex + 1;
        }
    }

    //Now to solve (JT * J) * ΔB = JT * ΔY   for ΔB  ....

    JTdY[0] = 0;
    JTdY[1] = 0;
    JTdY[2] = 0;
    //Create JTJ
    for (int i = 0; i < 81; i++)
    {
        JTJ[0][0] = J[i][0] * J[i][0];
        JTJ[0][1] = J[i][0] * J[i][1];
        JTJ[0][2] = J[i][0] * J[i][2];

        JTJ[1][0] = J[i][1] * J[i][0];
        JTJ[1][1] = J[i][1] * J[i][1];
        JTJ[1][2] = J[i][1] * J[i][2];

        JTJ[2][0] = J[i][2] * J[i][0];
        JTJ[2][1] = J[i][2] * J[i][1];
        JTJ[2][2] = J[i][2] * J[i][2];

        //JT * ΔY
        JTdY[0] = J[i][0] * dY[i];
        JTdY[1] = J[i][1] * dY[i];
        JTdY[2] = J[i][2] * dY[i];
    }

    //TO DO: might have to make this next part more general if I decide not to use a 9x9 bubble template size
    // Also not sure what to do when det(A) = 0  (is that even possible?)

    // (JT * J) * ΔB = JT * ΔY is a system of the form Ax = b
    // A = (JT * J), ΔB = x, JT * ΔY = b
    //Solve using cramer's rule   http://en.wikipedia.org/wiki/Cramer%27s_rule
    // xi = det(Ai)/det(A)

    //determinant of A
    double detA = 
    JTJ[0][0] * (JTJ[1][1] * JTJ[2][2] - JTJ[1][2] * JTJ[2][1]) -
    JTJ[0][1] * (JTJ[1][0] * JTJ[2][2] - JTJ[1][2] * JTJ[2][0]) + 
    JTJ[0][2] * (JTJ[1][0] * JTJ[2][1] - JTJ[1][1] * JTJ[2][0]) ;

    double detA1 =
      JTdY[0] * (JTJ[1][1] * JTJ[2][2] - JTJ[1][2] * JTJ[2][1]) -
    JTJ[0][1] * (  JTdY[1] * JTJ[2][2] - JTJ[1][2] * JTdY[2]  ) + 
    JTJ[0][2] * (  JTdY[1] * JTJ[2][1] - JTJ[1][1] * JTdY[2]  ) ;

    double detA2 = 
    JTJ[0][0] * (JTdY[1]   * JTJ[2][2] - JTJ[1][2] * JTdY[2]  ) -
    JTdY[0]   * (JTJ[1][0] * JTJ[2][2] - JTJ[1][2] * JTJ[2][0]) + 
    JTJ[0][2] * (JTJ[1][0] * JTdY[2]   - JTdY[1]   * JTJ[2][0]) ;

    double detA3 = 
    JTJ[0][0] * (JTJ[1][1] * JTdY[2]   - JTdY[1]   * JTJ[2][1]) -
    JTJ[0][1] * (JTJ[1][0] * JTdY[2]   - JTdY[1]   * JTJ[2][0]) + 
    JTdY[0]   * (JTJ[1][0] * JTJ[2][1] - JTJ[1][1] * JTJ[2][0]) ;

    // B(k+1) = B(k) + ΔB
    B.x = B.x + (detA1/detA);
    B.y = B.y + (detA2/detA);
    B.z = B.z + (detA3/detA);

    nllsqResult[indexRslt] = B.z;
}

I would like to use a for loop as such

//Non-Linear Least Squares 
__kernel void NLLSQ
(
    __global double* image,
    __global double* nllsqResult
)
{
    const int x = get_global_id(0);
    const int y = get_global_id(1);

    int index = Coord2Index( x, y, 1912 );
    int jacIndex = 0;
    int dyIndex = 0;
    int indexRslt = Coord2Index( x, y, 1904 );

    double dY[81];
    double J[81][3];
    double JTJ[3][3];
    double3 B = (double3)(0, 1, 1); //initial guess
    double JTdY[3]; 

    //Creates the dY vector
    for (int j = -4; j <= 4; j++)
    {
        for (int i = -4; i <= 4; i++)
        {
            dY[dyIndex] = image[index] - f( i, j, B.x, B.y, B.z);
            dyIndex = dyIndex + 1;
        }
    }

      for (int iters = 0; iters < 10; iters++) //FOR LOOP ADDED HERE
      {
      jacIndex = 0;
    //Creates the Jacobian 
    for (int j = -4; j <= 4; j++)
    {
        for (int i = -4; i <= 4; i++)
        {
            index = Coord2Index( x + i + 4, y + j + 4, 1912 );

            J[jacIndex][0] = 1;
            J[jacIndex][1] = dfdb(i, j, B.z);
            J[jacIndex][2] = dfds(i, j, B.y, B.z);

            jacIndex = jacIndex + 1;
        }
    }

    //Now to solve (JT * J) * ΔB = JT * ΔY   for ΔB  ....

    JTdY[0] = 0;
    JTdY[1] = 0;
    JTdY[2] = 0;
    //Create JTJ
    for (int i = 0; i < 81; i++)
    {
        JTJ[0][0] = J[i][0] * J[i][0];
        JTJ[0][1] = J[i][0] * J[i][1];
        JTJ[0][2] = J[i][0] * J[i][2];

        JTJ[1][0] = J[i][1] * J[i][0];
        JTJ[1][1] = J[i][1] * J[i][1];
        JTJ[1][2] = J[i][1] * J[i][2];

        JTJ[2][0] = J[i][2] * J[i][0];
        JTJ[2][1] = J[i][2] * J[i][1];
        JTJ[2][2] = J[i][2] * J[i][2];

        //JT * ΔY
        JTdY[0] = J[i][0] * dY[i];
        JTdY[1] = J[i][1] * dY[i];
        JTdY[2] = J[i][2] * dY[i];
    }

    //TO DO: might have to make this next part more general if I decide not to use a 9x9 bubble template size
    // Also not sure what to do when det(A) = 0  (is that even possible?)

    // (JT * J) * ΔB = JT * ΔY is a system of the form Ax = b
    // A = (JT * J), ΔB = x, JT * ΔY = b
    //Solve using cramer's rule   http://en.wikipedia.org/wiki/Cramer%27s_rule
    // xi = det(Ai)/det(A)

    //determinant of A
    double detA = 
    JTJ[0][0] * (JTJ[1][1] * JTJ[2][2] - JTJ[1][2] * JTJ[2][1]) -
    JTJ[0][1] * (JTJ[1][0] * JTJ[2][2] - JTJ[1][2] * JTJ[2][0]) + 
    JTJ[0][2] * (JTJ[1][0] * JTJ[2][1] - JTJ[1][1] * JTJ[2][0]) ;

    double detA1 =
      JTdY[0] * (JTJ[1][1] * JTJ[2][2] - JTJ[1][2] * JTJ[2][1]) -
    JTJ[0][1] * (  JTdY[1] * JTJ[2][2] - JTJ[1][2] * JTdY[2]  ) + 
    JTJ[0][2] * (  JTdY[1] * JTJ[2][1] - JTJ[1][1] * JTdY[2]  ) ;

    double detA2 = 
    JTJ[0][0] * (JTdY[1]   * JTJ[2][2] - JTJ[1][2] * JTdY[2]  ) -
    JTdY[0]   * (JTJ[1][0] * JTJ[2][2] - JTJ[1][2] * JTJ[2][0]) + 
    JTJ[0][2] * (JTJ[1][0] * JTdY[2]   - JTdY[1]   * JTJ[2][0]) ;

    double detA3 = 
    JTJ[0][0] * (JTJ[1][1] * JTdY[2]   - JTdY[1]   * JTJ[2][1]) -
    JTJ[0][1] * (JTJ[1][0] * JTdY[2]   - JTdY[1]   * JTJ[2][0]) + 
    JTdY[0]   * (JTJ[1][0] * JTJ[2][1] - JTJ[1][1] * JTJ[2][0]) ;

    // B(k+1) = B(k) + ΔB
    B.x = B.x + (detA1/detA);
    B.y = B.y + (detA2/detA);
    B.z = B.z + (detA3/detA);
      }

    nllsqResult[indexRslt] = B.z;
}
هل كانت مفيدة؟

المحلول

It seams that your kernel takes to long and the Timeout Detection and Recovery mechanisms from Windows kicks in. You can disable TDR by changing the registriy values as described here: MSDN However, if you dsiable TDR your screen may hang until the computation of your kernel is finished. If you have an infinit loop in your kernel nothing will stop it, and since you haven't any response of your computer killing the task would be very difficult. Good that there are power and reset buttons.

مرخصة بموجب: CC-BY-SA مع الإسناد
لا تنتمي إلى StackOverflow
scroll top