Question

This is part of a C++ code for solving a problem in computational mathematics of large dimension, say more than 100000 variables. I'd like to parallelise it using OpenMP. What is the best way of parallelising the following nested loop with OpenMP?

e = 0;
// m and n are big numbers, 200000 - 10000000
int i,k,r,s,t;
// hpk, hqk, pk_x0, n2pk_x0, dk, sk are doubles declared earlier.
for (k=0; k<m; k++) 
{
  hpk     = 0;     
  hqk     = 0;
  n2pk_x0 = 0;
  dk      = 0;
  sk      = 0; 

  for (int i=0; i<n; i++) 
  {
     if (lamb[i] <= lam[k]) 
     {
          if (h[i]<0)
          {
             pk[i] = xu[i];
          }
          else if (h[i]>0)
          {
             pk[i] = xl[i];
          }
          qk[i] = 0;
     }
     else
     {
          pk[i] = x0[i];
          qk[i] = -h[i];
     }

     hpk     += h[i]*pk[i];
     hqk     += h[i]*qk[i];
     pk_x0    = pk[i]-x0[i];
     n2pk_x0 += pk_x0*pk_x0;
     dk      += pk_x0*qk[i];
     sk      += qk[i]*qk[i];
  }

  /* ------- Compute ak, bk, ck, dk and sk to construct e(lam) -------- */
  ak = - (gamma + hpk);
  bk = - hqk;
  ck = q0 + 0.5 * n2pk_x0;
  sk = 0.5 * sk;
  // some calculation based on index k
} // end of first for

I followed some of the advice and made the local variables in the nested loop private. The CPU time dropped by a factor of two, but the output is no longer correct! Is there any way to improve the code so that it produces the correct result in less CPU time? (In the nested loop, if we set m = 1 the output is correct, but for m > 1 it is incorrect.)

This is the whole code:

static void subboconcpp(
               double u[],
               double *Egh,
               double h[],
               double gamma,
               double x0[],
               double q0,
               double xl[],
               double xu[],
               int    dim
               )
{

int    n,m,infinity = INT_MAX,i,k,r,s,t;
double e; 
double hpk, hqk, dk1, sk1, n2pk_x0;
double ak, bk, ck, dk, sk; 
double lam_hat, phik, ek1, ek2;
double *pk    = new double[dim];
double *qk    = new double[dim]; 
double *lamb  = new double[dim];
double *lamb1 = new double[dim];
double *lam   = new double[dim];

/* ------------------ Computing lambl(i) and lambu(i) ------------------ */
/* n is the length of x0 */   
n = dim;  

#pragma omp parallel for shared(n,h,x0,xl,xu)//num_threads(8)
for (int i=0; i<n; i++) 
{
  double lamb_flag;
  if (h[i] > 0) 
  {
            lamb_flag = (x0[i] - xl[i])/h[i];
            lamb[i]  = lamb_flag;
            lamb1[i] = lamb_flag;
  } 
  else if (h[i] < 0) 
  {
            lamb_flag = (x0[i] - xu[i])/h[i];
            lamb[i]  = lamb_flag;
            lamb1[i] = lamb_flag;
  } 
  //cout << "lamb:" << lamb[i];
}
/* --------------------------------------------------------------------- */

/* ----------------- Sorting lamb and constructing lam ----------------- */

/* lamb = sort(lamb,1); */
sort(lamb1, lamb1+n);

int    q        = 0;
double lam_flag = 0;
#pragma omp parallel for shared(n) firstprivate(q) lastprivate(m)
for (int j=0; j<n; j++) 
{
  if (lamb1[j] > lam_flag) 
  {
     lam_flag = lamb1[j];
     q      = q + 1;
     lam[q] = lam_flag;
     //cout << "lam: \n" << lam[q];
  }

  if (j == n-1)
  {
     if (lam_flag < infinity)
     {
        m = q+1;
        lam[m] = + infinity;
     }
     else
     {
         m = q;
     }
  }
  //cout << "q: \n" << q;
}

/* --------------------------------------------------------------------- */

/* -- Finding the global maximizer of e(lam)  for lam in[-inf, + inf] -- */
e = 0;  

#pragma omp parallel shared(m,n,h,x0,xl,xu,lamb,lam) \
private(i,r,s,t,hpk, hqk, dk1, sk1, n2pk_x0,ak, bk, ck, dk, sk,lam_hat, phik, ek1, ek2) 
{
#pragma omp for nowait

for (k=0; k<1; k++) 
{
  /*double hpk=0, hqk=0, dk1=0, sk1=0, n2pk_x0=0;
  double ak, bk, ck, dk, sk; 
  double lam_hat, phik, ek1, ek2; 
  double *pk = new double[dim];
  double *qk = new double[dim];*/    

  hpk     = 0;     
  hqk     = 0;
  n2pk_x0 = 0;
  dk1     = 0;
  sk1     = 0; 

  for (int i=0; i<n; i++) 
  {
     double pk_x0;
     if (lamb[i] <= lam[k]) 
     {
          if (h[i]<0)
          {
             pk[i] = xu[i];
          }
          else if (h[i]>0)
          {
             pk[i] = xl[i];
          }
          qk[i] = 0;
     }
     else
     {
          pk[i] = x0[i];
          qk[i] = -h[i];
     }

     hpk     += h[i]*pk[i];
     hqk     += h[i]*qk[i];
     pk_x0    = pk[i]-x0[i];
     n2pk_x0 += pk_x0*pk_x0;
     dk1     += pk_x0*qk[i];
     sk1     += qk[i]*qk[i];
  }

  /* ------- Compute ak, bk, ck, dk and sk to construct e(lam) -------- */
  ak = - (gamma + hpk);
  bk = - hqk;
  ck = q0 + 0.5 * n2pk_x0;
  dk = dk1;
  sk = 0.5 * sk1;
  /* ----------------------------------------------------------------- */ 

  /* - Finding the global maximizer of e(lam) for [lam(k), lam(k+1)] - */
  /* --------------------- using Proposition 4 ----------------------- */
  if (bk != 0) 
  {
         double w = ak*ak - bk*(ak*dk - bk*ck)/sk;
         if (w == 0) 
         {
                lam_hat = -ak / bk;
                phik    = 0;
         } 
         else 
         {
                double w = ak*ak - bk*(ak*dk - bk*ck)/sk;
                lam_hat = (-ak + sqrt(w))/bk;
                phik    = bk / (2*sk*lam_hat + dk);  
         }
  } 
  else 
  {
         if (ak > 0) 
         {
                   lam_hat = -dk / (2 * sk);
                   phik    = 4*ak*sk / (4*ck*sk + (sk - 2)*(dk*dk));
         } 
         else 
         {
                   lam_hat = + infinity; 
                   phik    = 0;
         }
  }
  /* ----------------------------------------------------------------- */

  /* --- Checking the feasibility of the solution of Proposition 4 --- */
  if (lam[k] <= lam_hat && lam_hat <= lam[k + 1]) 
  {
         if (phik > e) 
         {
            for (r=0; r<n; r++)
            {
               u[r] = pk[r] + lam_hat * qk[r];
            }

            e = phik;
         }
  } 
  else 
  {
         ek1 = (ak+bk*lam[k])/(ck+(dk+sk*lam[k])*lam[k]);
         ek2 = (ak+bk*lam[k+1])/(ck+(dk+sk*lam[k+1])*lam[k+1]);      
         if (ek1 >= ek2) 
         {
                lam_hat = lam[k];
                if (ek1 > e) 
                {
                   for (s=0; s<n;s++)
                   {
                      u[s] = pk[s] + lam_hat * qk[s];
                   }

                   e = ek1;
                }
         } 
         else 
         { 
                lam_hat = lam[k + 1];
                if (ek2 > e) 
                {
                   for (t=0; t<n;t++)
                   {
                      u[t] = pk[t] + lam_hat * qk[t];
                   } 

                   e = ek2;
                }
         }
  }
  /* ------------------------------------------------------------------ */

}/* ------------------------- End of for (k) --------------------------- */
}//p
/* --------- The global maximizer by searching all m intervals --------- */
*Egh = e;
delete[] pk;
delete[] qk;
delete[] lamb1;
delete[] lamb;
delete[] lam; 

return;
/* --------------------------------------------------------------------- */

}

Please note that the first two parallel loops work well; it is only the output of the nested loop that is incorrect.

Any suggestion or comment is appreciated.


Solution

The outermost loop: I do not know all of the code, but it looks like the variables hpk, hqk, n2pk_x0, dk, and sk should be private. If you do not declare them private, correctness will be broken.
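For illustration, a minimal sketch of that privatisation on the outermost loop of the question (bodies elided; dk1 and sk1 follow the naming of the full code). Privatising the scalars alone is not enough here, though: the shared arrays pk and qk and the running maximum e are also written on every iteration, which the advice list below covers.

// Minimal sketch, not a complete fix: every accumulator that is reset
// at the top of a k-iteration must be private to the thread running it.
#pragma omp parallel for private(hpk, hqk, n2pk_x0, dk1, sk1, \
                                 ak, bk, ck, dk, sk, lam_hat, phik, ek1, ek2)
for (int k = 0; k < m; k++)
{
  hpk = 0; hqk = 0; n2pk_x0 = 0; dk1 = 0; sk1 = 0;
  /* ... inner i-loop and the Proposition 4 logic as in the question ... */
}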

OpenMP is not always very good at nested parallelism. Depending on the OpenMP settings, a nested loop can create p*p threads, where p is the default concurrency of your machine, and such heavy oversubscription may lead to significant performance degradation. In most cases it is OK to parallelise the outermost loop and leave the nested loops serial.
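To make the p*p figure concrete, here is a fragment; omp_set_nested is the classic switch for this (deprecated since OpenMP 5.0 in favour of omp_set_max_active_levels):

#include <omp.h>

// With nested parallelism enabled, each of the p threads of the outer
// team becomes the master of its own inner team, so up to p*p threads
// may be live at once.
omp_set_nested(1);            // OpenMP 5.0+: omp_set_max_active_levels(2)
#pragma omp parallel for      // team of p threads
for (int k = 0; k < m; k++)
{
  #pragma omp parallel for    // p more threads per outer thread
  for (int i = 0; i < n; i++)
  {
     /* ... */
  }
}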

One of the reasons for parallelising nested loops is to achieve better work balancing. But your case seems to have balanced work, so you should not face a work-balancing problem if you parallelise only the outermost loop.

But if you still want to parallelise both loops, may I suggest using Intel TBB instead of OpenMP? You can use tbb::parallel_for for the outermost loop and tbb::parallel_reduce for the nested one. Intel TBB uses one thread pool for all its algorithms, so it will not lead your application into oversubscription.
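A hedged sketch of the TBB reduction for one of the inner sums (the other four sums would travel in a small struct joined the same way; the function name reduce_hpk is mine, not from the question):

#include <functional>
#include <tbb/blocked_range.h>
#include <tbb/parallel_reduce.h>

// Sketch: hpk = sum over i of h[i]*pk[i] via tbb::parallel_reduce.
// TBB runs this and any enclosing tbb::parallel_for on one shared
// thread pool, so nesting them does not oversubscribe the machine.
double reduce_hpk(const double* h, const double* pk, int n)
{
  return tbb::parallel_reduce(
      tbb::blocked_range<int>(0, n),
      0.0,                                        // identity of +
      [=](const tbb::blocked_range<int>& r, double sum) {
        for (int i = r.begin(); i != r.end(); ++i)
           sum += h[i] * pk[i];
        return sum;                               // partial sum of this range
      },
      std::plus<double>());                       // join partial sums
}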

[updated] Some parallelisation advice:

  1. Until you achieve correctness, the execution time does not mean anything, since a correctness fix can change it significantly (sometimes even for the better);
  2. Do not try to parallelise everything at once: go loop by loop. It will be easier to see where correctness breaks;
  3. Do not modify shared variables concurrently. If you really need to, rethink your algorithm and use special constructions such as reductions, atomic operations, or locks/mutexes/semaphores (see the sketch after this list);
  4. Be careful when writing to shared arrays at indices computed in private variables, since different threads may end up with the same indices.
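As promised in item 3, a minimal sketch of a reduction applied to the question's inner loop, if one chose to parallelise that loop: only the five running sums change, the rest of the body is as posted. Writing pk[i] and qk[i] stays safe because each thread owns its own range of i, which is exactly the point of item 4.

hpk = 0; hqk = 0; n2pk_x0 = 0; dk1 = 0; sk1 = 0;
#pragma omp parallel for reduction(+:hpk, hqk, n2pk_x0, dk1, sk1)
for (int i = 0; i < n; i++)
{
   if (lamb[i] <= lam[k])
   {
      if (h[i] < 0)      pk[i] = xu[i];
      else if (h[i] > 0) pk[i] = xl[i];
      qk[i] = 0;
   }
   else
   {
      pk[i] = x0[i];
      qk[i] = -h[i];
   }
   double pk_x0 = pk[i] - x0[i];   // each thread accumulates private
   hpk     += h[i]*pk[i];          // partial sums that OpenMP combines
   hqk     += h[i]*qk[i];          // at the end of the loop
   n2pk_x0 += pk_x0*pk_x0;
   dk1     += pk_x0*qk[i];
   sk1     += qk[i]*qk[i];
}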

Other tips

I think your idea of nested parallelisation does not fit the OpenMP mindset very well. Although nested parallelism can be achieved in OpenMP, it brings more complications than necessary. Typically in OpenMP you only parallelise a single loop at a time.

Parallelisation should be done at the level with the fewest interleaving dependencies. Often this turns out to be the top level. In your particular case this is true as well, since the steps of the outer loop are not strongly coupled.

I don't know what the rest of your code does, especially what happens to the values of hpk, hqk, n2pk_x0, dk and sk. All you have to do is add #pragma omp parallel for to your code, as sketched below.
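In sketch form, with the caveats from the accepted answer folded in (per-thread scratch arrays for pk/qk and a guarded update of the running maximum, neither of which the question's version has):

#include <vector>

// Hedged sketch of parallelising only the outer loop; the elided parts
// are exactly the inner i-loop and Proposition 4 logic from the question.
double e = 0;
#pragma omp parallel
{
  std::vector<double> pk(n), qk(n);      // per-thread scratch (advice 4)
  #pragma omp for
  for (int k = 0; k < m; k++)
  {
     double hpk = 0, hqk = 0, n2pk_x0 = 0, dk1 = 0, sk1 = 0;
     double lam_hat = 0, phik = 0;
     /* ... fill pk, qk and the sums; compute lam_hat, phik ... */
     #pragma omp critical                // advice 3: e and u[] are shared
     {
        if (phik > e)
        {
           for (int r = 0; r < n; r++)
              u[r] = pk[r] + lam_hat * qk[r];
           e = phik;
        }
     }
  }
}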

Licensed under: CC-BY-SA with attribution