Question

I have a project where we compute the inverse of large (over 3000x3000) positive definite dense matrices using Cholesky decomposition. The project is in Java and we are using the CERN Colt BLAS library. Profiling the code shows that the Cholesky decomposition is the bottleneck.

I decided to try and parallelize the Cholesky decomposition using OpenMP and use it as a DLL in Java (with JNA). I started with the Cholesky decomposition code in C from Rosetta Code.

What I noticed is that the values in a column, except for the diagonal element, are independent of each other. So I decided to calculate the diagonal element in serial and the rest of the column in parallel. I also swapped the order of the loops so that the inner loop runs over the rows and the outer loop over the columns. The serial version is slightly slower than the one from Rosetta Code, but the parallel version is six times faster than the Rosetta Code version on my 4-core (8 HT) system. Using the DLL in Java speeds up our results by six times as well. Here is the code:

/* Requires <stdlib.h>, <math.h> and OpenMP (compile with -fopenmp). */
double *cholesky(double *A, int n) {
    double *L = (double*)calloc(n * n, sizeof(double));
    if (L == NULL)
        exit(EXIT_FAILURE);

    for (int j = 0; j < n; j++) {
        /* Diagonal element of column j (serial). */
        double s = 0;
        for (int k = 0; k < j; k++) {
            s += L[j * n + k] * L[j * n + k];
        }
        L[j * n + j] = sqrt(A[j * n + j] - s);
        /* The remaining elements of column j are independent of each other,
           so they can be computed in parallel. */
        #pragma omp parallel for
        for (int i = j + 1; i < n; i++) {
            double s = 0;
            for (int k = 0; k < j; k++) {
                s += L[i * n + k] * L[j * n + k];
            }
            L[i * n + j] = (1.0 / L[j * n + j] * (A[i * n + j] - s));
        }
    }
    return L;
}

You can find the full code for testing this at http://coliru.stacked-crooked.com/a/6f5750c20d456da9
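On the Java side the DLL is loaded through JNA roughly like this (a minimal sketch; the interface and library names are illustrative, not the exact binding from our project):

import com.sun.jna.Library;
import com.sun.jna.Native;
import com.sun.jna.Pointer;

public interface CholeskyLib extends Library {
    CholeskyLib INSTANCE = Native.loadLibrary("cholesky", CholeskyLib.class);

    // Maps to: double *cholesky(double *A, int n)
    Pointer cholesky(double[] A, int n);
}

// Usage: double[] L = CholeskyLib.INSTANCE.cholesky(A, n).getDoubleArray(0, n * n);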

I initially thought that false sharing would be a problem when the number of remaining elements in a column was small compared to the number of threads, but that does not seem to be the case. I tried

#pragma omp parallel for schedule(static, 8) // a cache line is 8 doubles

I have not found clear examples of how to parallelize Cholesky decomposition. I don't know if what I have done is ideal. For example, will it work well on a NUMA system?

Perhaps a task-based approach is better in general? In slides 7-9 at http://courses.engr.illinois.edu/cs554/fa2013/notes/07_cholesky.pdf there is an example of parallel Cholesky decomposition using "fine grained tasks". It's not clear to me how to implement this yet.

I have two questions, one specific and one general. Do you have any suggestions on how to improve my implementation of Cholesky decomposition with OpenMP? Can you suggest a different implementation of Cholesky decomposition with OpenMP, e.g. with tasks?
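To make the second question concrete, this is the kind of structure I imagine a task-based tiled version would have, with the dependencies from the slides expressed through OpenMP depend clauses. It is only an untested sketch: chol_tile, trsm_tile and gemm_tile are placeholder bs x bs kernels that I have not written, and the first element of each tile is used as the dependence sentinel.

void chol_tile(double *A, int bs);                         // placeholder: unblocked Cholesky of a tile
void trsm_tile(double *A, const double *D, int bs);        // placeholder: triangular solve against D
void gemm_tile(const double *A, const double *B, double *C, int bs); // placeholder: C -= A * B^T

static inline double *tile(double *A, int nb, int bs, int i, int j) {
    return &A[(size_t)bs * bs * ((size_t)i * nb + j)];     // contiguous tile storage
}

void cholesky_tasks(double *A, int nb, int bs) {
    #pragma omp parallel
    #pragma omp single
    for (int k = 0; k < nb; k++) {
        double *Akk = tile(A, nb, bs, k, k);
        #pragma omp task depend(inout: Akk[0])
        chol_tile(Akk, bs);                                // factor the diagonal tile

        for (int i = k + 1; i < nb; i++) {
            double *Aik = tile(A, nb, bs, i, k);
            #pragma omp task depend(in: Akk[0]) depend(inout: Aik[0])
            trsm_tile(Aik, Akk, bs);                       // panel: solve tiles below the diagonal
        }

        for (int i = k + 1; i < nb; i++)
            for (int j = k + 1; j <= i; j++) {
                double *Aik = tile(A, nb, bs, i, k);
                double *Ajk = tile(A, nb, bs, j, k);
                double *Aij = tile(A, nb, bs, i, j);
                #pragma omp task depend(in: Aik[0], Ajk[0]) depend(inout: Aij[0])
                gemm_tile(Aik, Ajk, Aij, bs);              // trailing update of tile (i,j)
            }
    }
}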

Edit: As requested, here is the AVX function I used to compute s. It did not help.

/* Requires <immintrin.h> and AVX (compile with -mavx). */
double inner_sum_AVX(double *li, double *lj, int n) {
    __m256d s4;
    int i;
    double s;

    s4 = _mm256_set1_pd(0.0);
    /* Main loop: four doubles per iteration. */
    for (i = 0; i < (n & (-4)); i += 4) {
        __m256d li4, lj4;
        li4 = _mm256_loadu_pd(&li[i]);
        lj4 = _mm256_loadu_pd(&lj[i]);
        s4 = _mm256_add_pd(_mm256_mul_pd(li4, lj4), s4);
    }
    /* Horizontal sum of the vector accumulator. */
    double out[4];
    _mm256_storeu_pd(out, s4);
    s = out[0] + out[1] + out[2] + out[3];
    /* Remaining 0-3 elements. */
    for (; i < n; i++) {
        s += li[i] * lj[i];
    }
    return s;
}
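It slots into the column loop of the first code block like this (reconstructed here rather than copied from the test code): at column j, rows i and j of L already hold the first j computed entries, so the dot products over k < j become calls with length j.

L[j * n + j] = sqrt(A[j * n + j] - inner_sum_AVX(&L[j * n], &L[j * n], j)); // sum of squares of row j
#pragma omp parallel for
for (int i = j + 1; i < n; i++) {
    double s = inner_sum_AVX(&L[i * n], &L[j * n], j);     // off-diagonal dot product
    L[i * n + j] = (1.0 / L[j * n + j] * (A[i * n + j] - s));
}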

Solution

I managed to get SIMD working with the Cholesky decomposition. I did this using loop tiling, as I have used before in matrix multiplication. The solution was not trivial. Here are the times for a 5790x5790 matrix on my 4-core / 8 HT Ivy Bridge system (eff = GFLOPS/(peak GFLOPS)):

double floating point peak GFLOPS 118.1
1 thread       time 36.32 s, GFLOPS  1.78, eff  1.5%
8 threads      time  7.99 s, GFLOPS  8.10, eff  6.9%
4 threads+AVX  time  1.36 s, GFLOPS 47.64, eff 40.3%
4 threads MKL  time  0.68 s, GFLOPS 95.14, eff 80.6% // from LAPACKE_dpotrf

single floating point peak GFLOPS 236.2
1 thread       time 33.88 s, GFLOPS  1.91, eff  0.8%
8 threads      time  4.74 s, GFLOPS 13.64, eff  5.8%
4 threads+AVX  time  0.78 s, GFLOPS 82.61, eff 35.0%

The new method is 25 times faster for double and 40 times faster for single. The efficiency is now about 35-40% of peak FLOPS. With matrix multiplication I get up to 70% of peak with AVX in my own code. I don't know what to expect from Cholesky decomposition. The algorithm is partially serial (when calculating the diagonal block, called triangle in my code below), unlike matrix multiplication.

Update: I'm within a factor of 2 of MKL. I don't know if I should be proud of that or embarrassed by it, but apparently my code can still be improved significantly. I found a PhD thesis on this which shows that my block algorithm is a common solution, so I managed to reinvent the wheel.

I use 32x32 tiles for double and 64x64 tiles for float. I also reorder the memory so that each tile is contiguous and stored as its transpose. I defined a new matrix product function. Matrix multiplication is defined as:

C_i,j = A_i,k * B_k,j //sum over k

I realized that in the Cholesky algorithm there is something very similar:

C_j,i = A_i,k * B_j,k //sum over k
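Written out as plain scalar loops (just to show the index pattern, not my optimized kernel), the two only differ in how the output and the second operand are indexed; in the tiled code the product is actually subtracted from C, as in product32x32_avx below.

// Scalar illustration of the two index patterns.
// Standard matrix product: C[i][j] += A[i][k] * B[k][j]
void gemm_ref(const double *A, const double *B, double *C, int n) {
    for (int i = 0; i < n; i++)
        for (int j = 0; j < n; j++)
            for (int k = 0; k < n; k++)
                C[i * n + j] += A[i * n + k] * B[k * n + j];
}

// Cholesky-style product: C[j][i] += A[i][k] * B[j][k]
void chol_prod_ref(const double *A, const double *B, double *C, int n) {
    for (int i = 0; i < n; i++)
        for (int j = 0; j < n; j++)
            for (int k = 0; k < n; k++)
                C[j * n + i] += A[i * n + k] * B[j * n + k];
}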

By storing the transpose of the tiles I was able to use my optimized matrix multiplication function here almost exactly (I only had to change one line of code). Here is the main function:

reorder(tmp, B, n2, bs);                       // copy to the tiled (transposed) layout
for(int j=0; j<nb; j++) {
    // Update block column j with the already factored block columns k < j.
    #pragma omp parallel for schedule(static) num_threads(ncores)
    for(int i=j; i<nb; i++) {
        for(int k=0; k<j; k++) {
            product(&B[stride*(nb*j+k)],&B[stride*(nb*i+k)],&B[stride*(nb*i+j)],bs);
        }
    }
    // Factor the diagonal tile (serial).
    triangle(&B[stride*(nb*j+j)], bs);
    // Solve the tiles below the diagonal against the factored diagonal tile.
    #pragma omp parallel for schedule(static)
    for(int i=j+1; i<nb; i++) {
        block(&B[stride*(nb*i+j)],&B[stride*(nb*j+j)],bs);
    }
}
reorder_inverse(B, tmp, n2, bs);               // copy back to the row-major layout
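For reference, the variables in the snippet are set up along these lines (a minimal sketch of the assumed setup, not copied verbatim from my test code):

const int bs     = 32;                         // tile size: 32x32 for double, 64x64 for float
const int n2     = ((n + bs - 1) / bs) * bs;   // matrix dimension padded to a multiple of bs
const int nb     = n2 / bs;                    // number of tiles per row/column
const int stride = bs * bs;                    // elements per tile
const int ncores = 4;                          // physical cores
double *tmp = new double[(size_t)n2 * n2];     // row-major input/output matrix
double *B   = new double[(size_t)n2 * n2];     // tiled, transposed working buffer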

Here are the other functions. I have six product functions: SSE2, AVX, and FMA, each with a double and a float version. I only show the one for AVX and double here:

// Unblocked Cholesky factorization of one n x n diagonal tile, done in place.
// Tiles are stored transposed, so element (row i, col j) lives at A[j*n+i].
template <typename Type>
void triangle(Type *A, int n) {
    for (int j = 0; j < n; j++) {
        Type s = 0;
        for(int k=0; k<j; k++) s+= A[k*n+j]*A[k*n+j];
        //if((A[j * n + j] - s)<0) printf("asdf3 j %d, %f %f\n", j, A[j * n + j] - s, sqrt(A[j * n + j] - s));
        A[j*n+j] = sqrt(A[j*n+j] - s);
        Type fact = 1.0/A[j*n+j];
        for (int i = j+1; i<n; i++) {
            Type s = 0;
            for(int k=0; k<j; k++) s+=A[k*n+i]*A[k*n+j];
            A[j*n+i] = fact * (A[j*n+i] - s);
        }
    }
}

// Forward substitution of an off-diagonal tile A against the already factored
// diagonal tile B: overwrites A with A * B^-T (tiles stored transposed).
template <typename Type>
void block(Type *A, Type *B, int n) {
    for (int j = 0; j <n; j++) {
        Type fact = 1.0/B[j*n+j];   
        for (int i = 0; i<n; i++) {
            Type s = 0;
            for(int k=0; k<j; k++) {
                s += A[k*n+i]*B[k*n+j];
            }
            A[j*n+i] = fact * (A[j*n+i] - s);
        }
    }
}

// Copy the row-major n x n matrix A into B using a tiled layout in which each
// bs x bs tile is contiguous in memory and stored as its transpose.
template <typename Type>
void reorder(Type *A, Type *B, int n, int bs) {
    int nb = n/bs;
    int stride = bs*bs;
    //printf("%d %d %d\n", bs, nb, stride);
    #pragma omp parallel for schedule(static)
    for(int i=0; i<nb; i++) {
        for(int j=0; j<nb; j++) {
            for(int i2=0; i2<bs; i2++) {
                for(int j2=0; j2<bs; j2++) {
                    B[stride*(nb*i+j) + bs*j2+i2] = A[n*bs*i + j*bs + n*i2 + j2];
                }
            }
        }
    }
}

// Inverse of reorder(): copy the tiled, transposed layout in A back into the
// row-major matrix B.
template <typename Type>
void reorder_inverse(Type *A, Type *B, int n, int bs) {
    int nb = n/bs;
    int stride = bs*bs;
    //printf("%d %d %d\n", bs, nb, stride);
    #pragma omp parallel for schedule(static)
    for(int i=0; i<nb; i++) {
        for(int j=0; j<nb; j++) {
            for(int i2=0; i2<bs; i2++) {
                for(int j2=0; j2<bs; j2++) {
                    B[n*bs*i + j*bs + n*i2 + j2] = A[stride*(nb*i+j) + bs*j2+i2];
                }
            }
        }
    }
}

extern "C" void product32x32_avx(double *a, double *b, double *c, int n) 
{
    for(int i=0; i<n; i++) {    
        __m256d t1 = _mm256_loadu_pd(&c[i*n +  0]);
        __m256d t2 = _mm256_loadu_pd(&c[i*n +  4]);
        __m256d t3 = _mm256_loadu_pd(&c[i*n +  8]);
        __m256d t4 = _mm256_loadu_pd(&c[i*n + 12]);
        __m256d t5 = _mm256_loadu_pd(&c[i*n + 16]);
        __m256d t6 = _mm256_loadu_pd(&c[i*n + 20]);
        __m256d t7 = _mm256_loadu_pd(&c[i*n + 24]);
        __m256d t8 = _mm256_loadu_pd(&c[i*n + 28]);
        for(int k=0; k<n; k++) {
            __m256d a1 = _mm256_set1_pd(a[k*n+i]);

            __m256d b1 = _mm256_loadu_pd(&b[k*n+0]);
            t1 = _mm256_sub_pd(t1,_mm256_mul_pd(a1,b1));

            __m256d b2 = _mm256_loadu_pd(&b[k*n+4]);
            t2 = _mm256_sub_pd(t2,_mm256_mul_pd(a1,b2));

            __m256d b3 = _mm256_loadu_pd(&b[k*n+8]);
            t3 = _mm256_sub_pd(t3,_mm256_mul_pd(a1,b3));

            __m256d b4 = _mm256_loadu_pd(&b[k*n+12]);
            t4 = _mm256_sub_pd(t4,_mm256_mul_pd(a1,b4));

            __m256d b5 = _mm256_loadu_pd(&b[k*n+16]);
            t5 = _mm256_sub_pd(t5,_mm256_mul_pd(a1,b5));

            __m256d b6 = _mm256_loadu_pd(&b[k*n+20]);
            t6 = _mm256_sub_pd(t6,_mm256_mul_pd(a1,b6));

            __m256d b7 = _mm256_loadu_pd(&b[k*n+24]);
            t7 = _mm256_sub_pd(t7,_mm256_mul_pd(a1,b7));

            __m256d b8 = _mm256_loadu_pd(&b[k*n+28]);
            t8 = _mm256_sub_pd(t8,_mm256_mul_pd(a1,b8));
        }
        _mm256_storeu_pd(&c[i*n +  0], t1);
        _mm256_storeu_pd(&c[i*n +  4], t2);
        _mm256_storeu_pd(&c[i*n +  8], t3);
        _mm256_storeu_pd(&c[i*n + 12], t4);
        _mm256_storeu_pd(&c[i*n + 16], t5);
        _mm256_storeu_pd(&c[i*n + 20], t6);
        _mm256_storeu_pd(&c[i*n + 24], t7);
        _mm256_storeu_pd(&c[i*n + 28], t8);
    }
}
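The FMA version differs essentially only in the inner update, where the separate multiply and subtract become a single fused negative multiply-add, roughly like this (sketched here, not copied from my kernels; requires -mfma):

t1 = _mm256_fnmadd_pd(a1, b1, t1);   // t1 -= a1 * b1 in one instruction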
Licensed under: CC-BY-SA with attribution