I have two designs of code iterating over vectors of size 500. One of the designs contains arrays of 64-bit doubles and the second design uses arrays containing 32-bit integers. I was expecting the 32-bit design to be quicker because more useful data can be packed in the cache.
Compiler MSVC, CPU Ivy Bridge, compiling 64-bit mode.
This is code 1, using the 32-bit ints (runs in 2600 CPU cycles):
#include <vector>
#include <iostream>
int main(){
std::vector<unsigned int> x1;
std::vector<unsigned int> x2;
std::vector<unsigned int> x3;
x1.resize(500);
x2.resize(500);
x3.resize(500);
for(int i =0; i<500; i++){
x1[i] = i;
x2[i] = 2*i;
x3[i] = 4*i;
}
int counter = 0;
while(counter < 1000){
unsigned long long start = 0;
unsigned long long end = 0;
double m = 0;
double n = 0;
start = __rdtsc();
for(int i=0; i < 500; i++){
unsigned int a = x1[i];
unsigned int b = x2[i];
unsigned int g = x3[i];
m = m + (a * g);
n = n + (b * g);
}
end = __rdtscp();
std::cout << (end-start) << "\t\t"<<m << n << std::endl;
counter++;
}
}
producing this asm (-Os):
start = __rdtscp(&p);
rdtscp
lea r8,[rbp+6Fh]
mov dword ptr [r8],ecx
shl rdx,20h
or rax,rdx
mov r10,rax
unsigned int p;
unsigned int q;
unsigned long long start = 0;
unsigned long long end = 0;
double m = 0;
mov r8,rbx
mov r9d,1F4h
unsigned int a = x1[i];
unsigned int b = x2[i];
unsigned int g = x3[i];
mov edx,dword ptr [r8+r15]
m = m + (a * g);
mov ecx,edx
imul ecx,dword ptr [r8+r14]
xorps xmm0,xmm0
cvtsi2sd xmm0,rcx
addsd xmm7,xmm0
n = n + (b * g);
imul edx,dword ptr [r8]
mov eax,edx
xorps xmm0,xmm0
cvtsi2sd xmm0,rax
addsd xmm8,xmm0
for(int i=0; i < 500; i++){
add r8,4
dec r9
jne main+0E5h (013F681261h)
}
end = __rdtscp(&q);
rdtscp
}
end = __rdtscp(&q);
lea r8,[rbp+6Fh]
mov dword ptr [r8],ecx
shl rdx,20h
or rdx,rax
This is code 2, using the 64-bit doubles (code runs in 2000 CPU cycles):
#include <vector>
#include <iostream>
int main(){
std::vector<double> x1;
std::vector<double> x2;
std::vector<unsigned long long> x3;
x1.resize(500);
x2.resize(500);
x3.resize(500);
for(int i =0; i<500; i++){
x1[i] = i;
x2[i] = 2*i;
x3[i] = 4*i;
}
int counter = 0;
while(counter < 1000){
unsigned int p;
unsigned int q;
unsigned long long start = 0;
unsigned long long end = 0;
double m = 0;
double n = 0;
start = __rdtscp(&p);
for(int i=0; i < 500; i++){
double a = x1[i];
double b = x2[i];
unsigned long long g = x3[i];
m = m + (a * g);
n = n + (b * g);
}
end = __rdtscp(&q);
std::cout << (end-start) << "\t\t"<<m << n << std::endl;
counter++;
}
}
and here is the asm (-Os) produced:
start = __rdtscp(&p);
rdtscp
lea r8,[rbp+6Fh]
mov dword ptr [r8],ecx
shl rdx,20h
or rax,rdx
mov r9,rax
unsigned int p;
unsigned int q;
unsigned long long start = 0;
unsigned long long end = 0;
double m = 0;
mov rdx,rbx
mov r8d,1F4h
double a = x1[i];
double b = x2[i];
unsigned long long g = x3[i];
mov rcx,qword ptr [rdx+r15]
xorps xmm1,xmm1
m = m + (a * g);
cvtsi2sd xmm1,rcx
test rcx,rcx
jns main+120h (013F32129Ch)
addsd xmm1,xmm9
movaps xmm0,xmm1
mulsd xmm0,mmword ptr [rdx+r14]
addsd xmm6,xmm0
n = n + (b * g);
mulsd xmm1,mmword ptr [rdx]
addsd xmm7,xmm1
for(int i=0; i < 500; i++){
add rdx,8
dec r8
jne main+10Ah (013F321286h)
}
end = __rdtscp(&q);
rdtscp
}
end = __rdtscp(&q);
lea r8,[rbp+6Fh]
mov dword ptr [r8],ecx
shl rdx,20h
or rdx,rax