Question

I am adapting originally serial code in the KSVD package to support OpenMP. The original code, which serves a purpose like im2col in MATLAB and extracts patches from an image, is shown as follows:

/* n stands for the size of an image, sz stands for the patch size to extract */
int blocknum = 0;
for (k=0; k<=n[2]-sz[2]; k+=1) {
    for (j=0; j<=n[1]-sz[1]; j+=1) {
        for (i=0; i<=n[0]-sz[0]; i+=1) {

            /* copy single block */
            for (m=0; m<sz[2]; m++) {
                for (l=0; l<sz[1]; l++) {
                    memcpy(b + blocknum*sz[0]*sz[1]*sz[2] + m*sz[0]*sz[1] + l*sz[0], x+(k+m)*n[0]*n[1]+(j+l)*n[0]+i, sz[0]*sizeof(double));
                }
            }
            blocknum ++;
        }
    }
}

Now I would like to make it parallel by replacing the incrementally updated blocknum with a computed indexing variable blockid:

/* n stands for the size of an image, sz stands for the patch size to extract */
int blockid3, blockid2, blockid;
for (k=0; k<=n[2]-sz[2]; k+=1) {
    blockid3 = k * (n[1]-sz[1]+1) * (n[0]-sz[0]+1);
#pragma omp parallel for
    for (j=0; j<=n[1]-sz[1]; j+=1) {
        blockid2 = j * (n[0]-sz[0]+1);
        for (i=0; i<=n[0]-sz[0]; i+=1) {
            blockid = i + blockid2 + blockid3;

            /* copy single block */
            for (m=0; m<sz[2]; m++) {
                for (l=0; l<sz[1]; l++) {
                    memcpy(b + blockid*sz[0]*sz[1]*sz[2] + m*sz[0]*sz[1] + l*sz[0], x+(k+m)*n[0]*n[1]+(j+l)*n[0]+i, sz[0]*sizeof(double));
                }
            }

        }
    }
}

Running it then leads to a fatal segmentation fault. I do not know why (according to the stack trace, it seems related to thread safety), because I thought the parallel threads should never visit the same address more than once. Am I supposed to set some properties on the variables, e.g. static, shared, or private? Here is the stack trace:

Stack Trace (from fault):
[  0] 0x00007f9bcaa695de    /usr/local/MATLAB/R2011b/bin/glnxa64/libmwfl.so+00210398 _ZN2fl4diag15stacktrace_base7capt
ureERKNS0_14thread_contextEm+000158
[  1] 0x00007f9bcaa6b62d    /usr/local/MATLAB/R2011b/bin/glnxa64/libmwfl.so+00218669
[  2] 0x00007f9bcaa6b8f5    /usr/local/MATLAB/R2011b/bin/glnxa64/libmwfl.so+00219381 _ZN2fl4diag13terminate_logEPKcRKN
S0_14thread_contextEb+000165
[  3] 0x00007f9bc9a714f5   /usr/local/MATLAB/R2011b/bin/glnxa64/libmwmcr.so+00447733 _ZN2fl4diag13terminate_logEPKcPK8
ucontextb+000085
[  4] 0x00007f9bc9a6e5b4   /usr/local/MATLAB/R2011b/bin/glnxa64/libmwmcr.so+00435636
[  5] 0x00007f9bc9a6f333   /usr/local/MATLAB/R2011b/bin/glnxa64/libmwmcr.so+00439091
[  6] 0x00007f9bc9a6f4c7   /usr/local/MATLAB/R2011b/bin/glnxa64/libmwmcr.so+00439495
[  7] 0x00007f9bc9a7085f   /usr/local/MATLAB/R2011b/bin/glnxa64/libmwmcr.so+00444511
[  8] 0x00007f9bc9a70a15   /usr/local/MATLAB/R2011b/bin/glnxa64/libmwmcr.so+00444949
[  9] 0x00007f9bc89f0cb0              /lib/x86_64-linux-gnu/libpthread.so.0+00064688
[ 10] 0x00007f9bc876cb8e                    /lib/x86_64-linux-gnu/libc.so.6+01346446
[ 11] 0x00007f9b88238bb8 /home/peiyun/schmax3.0/test_im2col/mex_im2colstep.mexa64+00003000
[ 12] 0x00007f9bcb004eea    /usr/lib/gcc/x86_64-linux-gnu/4.6.3//libgomp.so+00032490
[ 13] 0x00007f9bc89e8e9a              /lib/x86_64-linux-gnu/libpthread.so.0+00032410
[ 14] 0x00007f9bc87164bd                    /lib/x86_64-linux-gnu/libc.so.6+00992445 clone+000109

By the way, even if the threads are writing to different addresses, are there any race conditions regarding the memcpy calls inside the omp for-loop?

Was this helpful?

Solution

There are multiple data races in your code, namely:

/* n stands for the size of an image, sz stands for the patch size to extract */
int blockid3, blockid2, blockid;
for (k=0; k<=n[2]-sz[2]; k+=1) {
    blockid3 = k * (n[1]-sz[1]+1) * (n[0]-sz[0]+1);
#pragma omp parallel for
    for (j=0; j<=n[1]-sz[1]; j+=1) {
        blockid2 = j * (n[0]-sz[0]+1);          // <--- here
        for (i=0; i<=n[0]-sz[0]; i+=1) {        // <--- here
            blockid = i + blockid2 + blockid3;  // <--- here

            /* copy single block */
            for (m=0; m<sz[2]; m++) {           // <--- here
                for (l=0; l<sz[1]; l++) {       // <--- and here
                    memcpy(b + blockid*sz[0]*sz[1]*sz[2] + m*sz[0]*sz[1] + l*sz[0], x+(k+m)*n[0]*n[1]+(j+l)*n[0]+i, sz[0]*sizeof(double));
                }
            }

        }
    }
}

By the rules of OpenMP, blockid2, i, blockid, m, and l are all implicitly shared, which is not what you want. You should either make them private, or better, declare them inside the parallel region and thus make them implicitly private:

#pragma omp parallel for private(i,m,l,blockid,blockid2)
...

or

int blockid3;
for (k=0; k<=n[2]-sz[2]; k+=1) {
    blockid3 = k * (n[1]-sz[1]+1) * (n[0]-sz[0]+1);
#pragma omp parallel for
    for (j=0; j<=n[1]-sz[1]; j+=1) {
        int blockid2 = j * (n[0]-sz[0]+1);
        for (int i=0; i<=n[0]-sz[0]; i+=1) {
            int blockid = i + blockid2 + blockid3;

            /* copy single block */
            for (int m=0; m<sz[2]; m++) {
                for (int l=0; l<sz[1]; l++) {
                    memcpy(b + blockid*sz[0]*sz[1]*sz[2] + m*sz[0]*sz[1] + l*sz[0], x+(k+m)*n[0]*n[1]+(j+l)*n[0]+i, sz[0]*sizeof(double));
                }
            }

        }
    }
}

The latter requires a C99-compliant compiler (because of the way loop variables are declared). Your GCC 4.6.3 requires the -std=c99 option to enable C99 compliance. If no such compiler is available (are there still non-C99 compilers in general use?), you should add the private(i,l,m) clause. You might also want to move the parallelisation to the outermost loop instead in order to minimise the OpenMP overhead.

License: CC-BY-SA with attribution
Not affiliated with StackOverflow
scroll top