Persuading the compiler to set registers outside a loop
-
25-07-2021 - |
Question
Firstly, I will prefix this by saying I don't think it is necessary to understand the functioning of the code below to make a sensible attempt to solve my problem. This is primarily an optimisation problem. The code is to understand what is being done.
I have the following somewhat optimised convolution main loop (which works):
for(int i=0; i<length-kernel_length; i+=4){
acc = _mm_setzero_ps();
for(int k=0; k<KERNEL_LENGTH; k+=4){
int data_offset = i + k;
for (int l = 0; l < 4; l++){
data_block = _mm_load_ps(in_aligned[l] + data_offset);
prod = _mm_mul_ps(kernel_reverse[k+l], data_block);
acc = _mm_add_ps(acc, prod);
}
}
_mm_storeu_ps(out+i, acc);
}
KERNEL_LENGTH
is 4.
in_aligned
is the input array (upon which the convolution is performed) repeated 4 times, with each repeat being shifted one sample to the left from the others. This is so every sample can be found on a 16-byte aligned location.
kernel_reverse
is the reversed kernel, with every sample repeated 4 times to fill a 4-vector and is declared and defined as:
float kernel_block[4] __attribute__ ((aligned (16)));
__m128 kernel_reverse[KERNEL_LENGTH] __attribute__ ((aligned (16)));
// Repeat the kernel across the vector
for(int i=0; i<KERNEL_LENGTH; i++){
kernel_block[0] = kernel[kernel_length - i - 1];
kernel_block[1] = kernel[kernel_length - i - 1];
kernel_block[2] = kernel[kernel_length - i - 1];
kernel_block[3] = kernel[kernel_length - i - 1];
kernel_reverse[i] = _mm_load_ps(kernel_block);
}
The code computes the algorithm correctly and pretty quickly too.
I compile the code with gcc -std=c99 -Wall -O3 -msse3 -mtune=core2
My question is this: The loop is compiled to the machine code below. Inside this loop, a not-insignificant number of instructions are spent loading the kernel every time. The kernel does not change on each iteration of the loop and so can, in principle, be kept in SSE registers. As I understand it, there are sufficient registers to easily store the kernel (and indeed, the machine code doesn't suggest too much register pressure).
How do I persuade the compiler to not load the kernel on every loop?
I was expecting the compiler to do this automatically when the kernel length was set to be constant.
testl %edx, %edx
jle .L79
leaq (%rcx,%rcx,2), %rsi
movaps -144(%rbp), %xmm6
xorps %xmm2, %xmm2
leal -1(%rdx), %ecx
movaps -128(%rbp), %xmm5
xorl %eax, %eax
movaps -112(%rbp), %xmm4
leaq 0(%r13,%rsi,4), %rsi
shrl $2, %ecx
addq $1, %rcx
movaps -96(%rbp), %xmm3
salq $4, %rcx
.p2align 4,,10
.p2align 3
.L80:
movaps 0(%r13,%rax), %xmm0
movaps (%r14,%rax), %xmm1
mulps %xmm6, %xmm0
mulps %xmm5, %xmm1
addps %xmm2, %xmm0
addps %xmm1, %xmm0
movaps (%r9,%rax), %xmm1
mulps %xmm4, %xmm1
addps %xmm1, %xmm0
movaps (%rsi,%rax), %xmm1
mulps %xmm3, %xmm1
addps %xmm1, %xmm0
movups %xmm0, (%rbx,%rax)
addq $16, %rax
cmpq %rcx, %rax
jne .L80
.L79:
Edit: the full code listing is as follows:
#define KERNEL_LENGTH 4
int convolve_sse_in_aligned_fixed_kernel(float* in, float* out, int length,
float* kernel, int kernel_length)
{
float kernel_block[4] __attribute__ ((aligned (16)));
float in_aligned[4][length] __attribute__ ((aligned (16)));
__m128 kernel_reverse[KERNEL_LENGTH] __attribute__ ((aligned (16)));
__m128 data_block __attribute__ ((aligned (16)));
__m128 prod __attribute__ ((aligned (16)));
__m128 acc __attribute__ ((aligned (16)));
// Repeat the kernel across the vector
for(int i=0; i<KERNEL_LENGTH; i++){
int index = kernel_length - i - 1;
kernel_block[0] = kernel[index];
kernel_block[1] = kernel[index];
kernel_block[2] = kernel[index];
kernel_block[3] = kernel[index];
kernel_reverse[i] = _mm_load_ps(kernel_block);
}
/* Create a set of 4 aligned arrays
* Each array is offset by one sample from the one before
*/
for(int i=0; i<4; i++){
memcpy(in_aligned[i], (in+i), (length-i)*sizeof(float));
}
for(int i=0; i<length-kernel_length; i+=4){
acc = _mm_setzero_ps();
for(int k=0; k<KERNEL_LENGTH; k+=4){
int data_offset = i + k;
for (int l = 0; l < 4; l++){
data_block = _mm_load_ps(in_aligned[l] + data_offset);
prod = _mm_mul_ps(kernel_reverse[k+l], data_block);
acc = _mm_add_ps(acc, prod);
}
}
_mm_storeu_ps(out+i, acc);
}
// Need to do the last value as a special case
int i = length - kernel_length;
out[i] = 0.0;
for(int k=0; k<kernel_length; k++){
out[i] += in_aligned[0][i+k] * kernel[kernel_length - k - 1];
}
return 0;
}
Solution
The answer is, it is doing exactly what I wanted. The problem, it seems, was down to me being inept at reading the output from objdump -d
. In modifying the question to use the output from gcc -S
as suggested by @PascalCuoq, the loop is notably easier to understand.
I left the question because somebody may value that point! (and indeed the code).