How to safely parallel the for-loop with memcpy inside

Question

There are multiple data races in your code, namely:

/* n stands for the size of an image, sz stands for the patch size to extract */
int blockid3, blockid2, blockid;
for (k=0; k<=n[2]-sz[2]; k+=1) {
    blockid3 = k * (n[1]-sz[1]+1) * (n[0]-sz[0]+1);
#pragma omp parallel for
    for (j=0; j<=n[1]-sz[1]; j+=1) {
        blockid2 = j * (n[0]-sz[0]+1);          // <--- here
        for (i=0; i<=n[0]-sz[0]; i+=1) {        // <--- here
            blockid = i + blockid2 + blockid3;  // <--- here

            /* copy single block */
            for (m=0; m<sz[2]; m++) {           // <--- here
                for (l=0; l<sz[1]; l++) {       // <--- and here
                    memcpy(b + blockid*sz[0]*sz[1]*sz[2] + m*sz[0]*sz[1] + l*sz[0], x+(k+m)*n[0]*n[1]+(j+l)*n[0]+i, sz[0]*sizeof(double));
                }
            }

        }
    }
}

By the rules of OpenMP blockid2, i, blockid, m, and l are all implicitly shared which is not what you want. You should either make them private, or better declare them inside the parallel region and thus make them implicitly private:

#pragma omp parallel for private(i,m,l,blockid,blockid2)
...

or

int blockid3;
for (k=0; k<=n[2]-sz[2]; k+=1) {
    blockid3 = k * (n[1]-sz[1]+1) * (n[0]-sz[0]+1);
#pragma omp parallel for
    for (j=0; j<=n[1]-sz[1]; j+=1) {
        int blockid2 = j * (n[0]-sz[0]+1);
        for (int i=0; i<=n[0]-sz[0]; i+=1) {
            int blockid = i + blockid2 + blockid3;

            /* copy single block */
            for (int m=0; m<sz[2]; m++) {
                for (int l=0; l<sz[1]; l++) {
                    memcpy(b + blockid*sz[0]*sz[1]*sz[2] + m*sz[0]*sz[1] + l*sz[0], x+(k+m)*n[0]*n[1]+(j+l)*n[0]+i, sz[0]*sizeof(double));
                }
            }

        }
    }
}

The latter requires a C99-compliant compiler (because of the way loop variables are declared). Your GCC 4.6.3 requires the -std=c99 option to enable C99 compliance. If no such compiler is available (are there still non-C99 compilers in general use?), you should add the private(i,l,m) clause. You might also want to move the parallelisation to the outermost loop instead in order to minimise the OpenMP overhead.