Question

Firstly, I will prefix this by saying I don't think it is necessary to understand the functioning of the code below to make a sensible attempt to solve my problem. This is primarily an optimisation problem. The code is to understand what is being done.

I have the following somewhat optimised convolution main loop (which works):

for(int i=0; i<length-kernel_length; i+=4){

    acc = _mm_setzero_ps();

    for(int k=0; k<KERNEL_LENGTH; k+=4){

        int data_offset = i + k;

        for (int l = 0; l < 4; l++){

            data_block = _mm_load_ps(in_aligned[l] + data_offset);
            prod = _mm_mul_ps(kernel_reverse[k+l], data_block);

            acc = _mm_add_ps(acc, prod);
        }
    }
    _mm_storeu_ps(out+i, acc);

}

KERNEL_LENGTH is 4. in_aligned is the input array (upon which the convolution is performed) repeated 4 times, with each repeat being shifted one sample to the left from the others. This is so every sample can be found on a 16-byte aligned location. kernel_reverse is the reversed kernel, with every sample repeated 4 times to fill a 4-vector and is declared and defined as:

float kernel_block[4] __attribute__ ((aligned (16)));
__m128 kernel_reverse[KERNEL_LENGTH] __attribute__ ((aligned (16)));    

// Repeat the kernel across the vector
for(int i=0; i<KERNEL_LENGTH; i++){
    kernel_block[0] = kernel[kernel_length - i - 1];
    kernel_block[1] = kernel[kernel_length - i - 1];
    kernel_block[2] = kernel[kernel_length - i - 1];
    kernel_block[3] = kernel[kernel_length - i - 1];

    kernel_reverse[i] = _mm_load_ps(kernel_block);
}

The code computes the algorithm correctly and pretty quickly too.

I compile the code with gcc -std=c99 -Wall -O3 -msse3 -mtune=core2

My question is this: The loop is compiled to the machine code below. Inside this loop, a not-insignificant number of instructions are spent loading the kernel every time. The kernel does not change on each iteration of the loop and so can, in principle, be kept in SSE registers. As I understand it, there are sufficient registers to easily store the kernel (and indeed, the machine code doesn't suggest too much register pressure).

How do I persuade the compiler to not load the kernel on every loop?

I was expecting the compiler to do this automatically when the kernel length was set to be constant.

    testl   %edx, %edx
    jle .L79
    leaq    (%rcx,%rcx,2), %rsi
    movaps  -144(%rbp), %xmm6
    xorps   %xmm2, %xmm2
    leal    -1(%rdx), %ecx
    movaps  -128(%rbp), %xmm5
    xorl    %eax, %eax
    movaps  -112(%rbp), %xmm4
    leaq    0(%r13,%rsi,4), %rsi
    shrl    $2, %ecx
    addq    $1, %rcx
    movaps  -96(%rbp), %xmm3
    salq    $4, %rcx
    .p2align 4,,10
    .p2align 3
.L80:
    movaps  0(%r13,%rax), %xmm0
    movaps  (%r14,%rax), %xmm1
    mulps   %xmm6, %xmm0
    mulps   %xmm5, %xmm1
    addps   %xmm2, %xmm0
    addps   %xmm1, %xmm0
    movaps  (%r9,%rax), %xmm1
    mulps   %xmm4, %xmm1
    addps   %xmm1, %xmm0
    movaps  (%rsi,%rax), %xmm1
    mulps   %xmm3, %xmm1
    addps   %xmm1, %xmm0
    movups  %xmm0, (%rbx,%rax)
    addq    $16, %rax
    cmpq    %rcx, %rax
    jne .L80
.L79:

Edit: the full code listing is as follows:

#define KERNEL_LENGTH 4
int convolve_sse_in_aligned_fixed_kernel(float* in, float* out, int length,
        float* kernel, int kernel_length)
{
    float kernel_block[4] __attribute__ ((aligned (16)));
    float in_aligned[4][length] __attribute__ ((aligned (16)));

    __m128 kernel_reverse[KERNEL_LENGTH] __attribute__ ((aligned (16)));    
    __m128 data_block __attribute__ ((aligned (16)));

    __m128 prod __attribute__ ((aligned (16)));
    __m128 acc __attribute__ ((aligned (16)));

    // Repeat the kernel across the vector
    for(int i=0; i<KERNEL_LENGTH; i++){
        int index = kernel_length - i - 1;
        kernel_block[0] = kernel[index];
        kernel_block[1] = kernel[index];
        kernel_block[2] = kernel[index];
        kernel_block[3] = kernel[index];

        kernel_reverse[i] = _mm_load_ps(kernel_block);
    }

    /* Create a set of 4 aligned arrays
     * Each array is offset by one sample from the one before
     */
    for(int i=0; i<4; i++){
        memcpy(in_aligned[i], (in+i), (length-i)*sizeof(float));
    }

    for(int i=0; i<length-kernel_length; i+=4){

        acc = _mm_setzero_ps();

        for(int k=0; k<KERNEL_LENGTH; k+=4){

            int data_offset = i + k;

            for (int l = 0; l < 4; l++){

                data_block = _mm_load_ps(in_aligned[l] + data_offset);
                prod = _mm_mul_ps(kernel_reverse[k+l], data_block);

                acc = _mm_add_ps(acc, prod);
            }
        }
        _mm_storeu_ps(out+i, acc);

    }

    // Need to do the last value as a special case
    int i = length - kernel_length;
    out[i] = 0.0;
    for(int k=0; k<kernel_length; k++){
        out[i] += in_aligned[0][i+k] * kernel[kernel_length - k - 1];
    }

    return 0;
}
Was it helpful?

Solution

The answer is, it is doing exactly what I wanted. The problem, it seems, was down to me being inept at reading the output from objdump -d. In modifying the question to use the output from gcc -S as suggested by @PascalCuoq, the loop is notably easier to understand.

I left the question because somebody may value that point! (and indeed the code).

Licensed under: CC-BY-SA with attribution
Not affiliated with StackOverflow
scroll top