This is a part of c++ code for solving a problem in computational mathematics of large dimension, say more than 100000 variables. I'd like to parallelise it using OpenMP. What is the best way of paralleling the following nested loop by OpenMP?
e = 0;
// m and n are are big numbers 200000 - 10000000
int i,k,r,s,t;
// hpk,hqk,pk_x0,n2pk_x0,dk,sk are double and declared before.
for (k=0; k<m; k++)
{
hpk = 0;
hqk = 0;
n2pk_x0 = 0;
dk = 0;
sk = 0;
for (int i=0; i<n; i++)
{
if (lamb[i] <= lam[k])
{
if (h[i]<0)
{
pk[i] = xu[i];
}
else if (h[i]>0)
{
pk[i] = xl[i];
}
qk[i] = 0;
}
else
{
pk[i] = x0[i];
qk[i] = -h[i];
}
hpk += h[i]*pk[i];
hqk += h[i]*qk[i];
pk_x0 = pk[i]-x0[i];
n2pk_x0 += pk_x0*pk_x0;
dk += pk_x0*qk[i];
sk += qk[i]*qk[i];
}
//}//p
/* ------- Compute ak, bk, ck, dk and sk to construct e(lam) -------- */
ak = - (gamma + hpk);
bk = - hqk;
ck = q0 + 0.5 * n2pk_x0;
sk = 0.5 * sk;
// some calculation based on index k
} // end of first for
I did some of the advises to private the local variables in the nested loop.The CPU time decreased by factor 1/2, but the output is not correct! Is there any way to improve the code in such a way that get correct result with less CPU time? (In the nested loop, if we set m=1, the output will be correct, but for m>1 the output is incorrect.)
This is the whole code:
static void subboconcpp(
double u[],
double *Egh,
double h[],
double gamma,
double x0[],
double q0,
double xl[],
double xu[],
int dim
)
{
int n,m,infinity = INT_MAX,i,k,r,s,t;;
double e;
double hpk, hqk, dk1, sk1, n2pk_x0;
double ak, bk, ck, dk, sk;
double lam_hat, phik, ek1, ek2;
double *pk = new double[dim];
double *qk = new double[dim];
double *lamb = new double[dim];
double *lamb1 = new double[dim];
double *lam = new double[dim];
/* ------------------ Computing lambl(i) and lambu(i) ------------------ */
/* n is the length of x0 */
n = dim;
#pragma omp parallel for shared(n,h,x0,xl,xu)//num_threads(8)
for (int i=0; i<n; i++)
{
double lamb_flag;
if (h[i] > 0)
{
lamb_flag = (x0[i] - xl[i])/h[i];
lamb[i] = lamb_flag;
lamb1[i] = lamb_flag;
}
else if (h[i] < 0)
{
lamb_flag = (x0[i] - xu[i])/h[i];
lamb[i] = lamb_flag;
lamb1[i] = lamb_flag;
}
//cout << "lamb:" << lamb[i];
}
/* --------------------------------------------------------------------- */
/* ----------------- Sorting lamb and constructing lam ----------------- */
/* lamb = sort(lamb,1); */
sort(lamb1, lamb1+n);
int q = 0;
double lam_flag = 0;
#pragma omp parallel for shared(n) firstprivate(q) lastprivate(m)
for (int j=0; j<n; j++)
{
if (lamb1[j] > lam_flag)
{
lam_flag = lamb1[j];
q = q + 1;
lam[q] = lam_flag;
//cout << "lam: \n" << lam[q];
}
if (j == n-1)
{
if (lam_flag < infinity)
{
m = q+1;
lam[m] = + infinity;
}
else
{
m = q;
}
}
//cout << "q: \n" << q;
}
/* --------------------------------------------------------------------- */
/* -- Finding the global maximizer of e(lam) for lam in[-inf, + inf] -- */
e = 0;
#pragma omp parallel shared(m,n,h,x0,xl,xu,lamb,lam) \
private(i,r,s,t,hpk, hqk, dk1, sk1, n2pk_x0,ak, bk, ck, dk, sk,lam_hat, phik, ek1, ek2)
{
#pragma omp for nowait
for (k=0; k<1; k++)
{
/*double hpk=0, hqk=0, dk1=0, sk1=0, n2pk_x0=0;
double ak, bk, ck, dk, sk;
double lam_hat, phik, ek1, ek2;
double *pk = new double[dim];
double *qk = new double[dim];*/
hpk = 0;
hqk = 0;
n2pk_x0 = 0;
dk1 = 0;
sk1 = 0;
for (int i=0; i<n; i++)
{
double pk_x0;
if (lamb[i] <= lam[k])
{
if (h[i]<0)
{
pk[i] = xu[i];
}
else if (h[i]>0)
{
pk[i] = xl[i];
}
qk[i] = 0;
}
else
{
pk[i] = x0[i];
qk[i] = -h[i];
}
hpk += h[i]*pk[i];
hqk += h[i]*qk[i];
pk_x0 = pk[i]-x0[i];
n2pk_x0 += pk_x0*pk_x0;
dk1 += pk_x0*qk[i];
sk1 += qk[i]*qk[i];
}
/* ------- Compute ak, bk, ck, dk and sk to construct e(lam) -------- */
ak = - (gamma + hpk);
bk = - hqk;
ck = q0 + 0.5 * n2pk_x0;
dk = dk1;
sk = 0.5 * sk1;
/* ----------------------------------------------------------------- */
/* - Finding the global maximizer of e(lam) for [lam(k), lam(k+1)] - */
/* --------------------- using Proposition 4 ----------------------- */
if (bk != 0)
{
double w = ak*ak - bk*(ak*dk - bk*ck)/sk;
if (w == 0)
{
lam_hat = -ak / bk;
phik = 0;
}
else
{
double w = ak*ak - bk*(ak*dk - bk*ck)/sk;
lam_hat = (-ak + sqrt(w))/bk;
phik = bk / (2*sk*lam_hat + dk);
}
}
else
{
if (ak > 0)
{
lam_hat = -dk / (2 * sk);
phik = 4*ak*sk / (4*ck*sk + (sk - 2)*(dk*dk));
}
else
{
lam_hat = + infinity;
phik = 0;
}
}
/* ----------------------------------------------------------------- */
/* --- Checking the feasibility of the solution of Proposition 4 --- */
if (lam[k] <= lam_hat && lam_hat <= lam[k + 1])
{
if (phik > e)
{
for (r=0; r<n; r++)
{
u[r] = pk[r] + lam_hat * qk[r];
}
e = phik;
}
}
else
{
ek1 = (ak+bk*lam[k])/(ck+(dk+sk*lam[k])*lam[k]);
ek2 = (ak+bk*lam[k+1])/(ck+(dk+sk*lam[k+1])*lam[k+1]);
if (ek1 >= ek2)
{
lam_hat = lam[k];
if (ek1 > e)
{
for (s=0; s<n;s++)
{
u[s] = pk[s] + lam_hat * qk[s];
}
e = ek1;
}
}
else
{
lam_hat = lam[k + 1];
if (ek2 > e)
{
for (t=0; t<n;t++)
{
u[t] = pk[t] + lam_hat * qk[t];
}
e = ek2;
}
}
}
/* ------------------------------------------------------------------ */
}/* ------------------------- End of for (k) --------------------------- */
}//p
/* --------- The global maximizer by searching all m intervals --------- */
*Egh = e;
delete[] pk;
delete[] qk;
delete[] lamb1;
delete[] lamb;
delete[] lam;
return;
/* --------------------------------------------------------------------- */
}
Please note that the first two parallel code working well, but just the output of the nested loop is in correct.
Any suggestion or comment is appreciated.