A test might be flawed.
First Test:
#include <iostream>
#include <chrono>
template <unsigned Mask>
inline bool function1(const unsigned n, const unsigned m)
{
const unsigned diff = m-n;
const unsigned msk = Mask & diff;
return (n <= m) && ((!msk && !diff) || (msk && msk <= diff));
}
template <unsigned Mask>
inline bool function2(const unsigned n, const unsigned m)
{
return (n <= m) && ((!(Mask & (m-n)) && !(m-n)) || ((Mask & (m-n)) && (Mask & (m-n)) <= (m-n)));
}
template <unsigned Mask>
inline bool function3(const unsigned n, const unsigned m)
{
if(m < n) return false;
else if(m == n) return true;
else return Mask & (m-n);
}
template <unsigned Mask>
inline bool function4(const unsigned n, const unsigned m)
{
return (n < m && (Mask & (m-n))) || (n == m);
}
volatile unsigned a = std::rand();
volatile unsigned b = std::rand();
volatile bool result;
inline double duration(
std::chrono::system_clock::time_point start,
std::chrono::system_clock::time_point end)
{
return double((end - start).count())
/ std::chrono::system_clock::period::den;
}
int main() {
typedef bool (*Function)(const unsigned, const unsigned);
const unsigned N = 4;
std::chrono::system_clock::duration timing[N] = {};
Function fn[] = {
function1<0x1234>,
function2<0x1234>,
function3<0x1234>,
function4<0x1234>,
};
for(unsigned i = 0; i < 10000; ++i) {
for(unsigned j = 0; j < 100; ++j) {
unsigned Loops = 100;
for(unsigned f = 0; f < N; ++f) {
auto start = std::chrono::system_clock::now();
for(unsigned loop = 0; loop < Loops; ++loop) {
result = fn[f](a, b);
}
auto end = std::chrono::system_clock::now();
timing[f] += (end-start);
}
}
}
for(unsigned i = 0; i < 4; ++i) {
std::cout
<< "Timing " << i+1 << ": "
<< double(timing[i].count()) / std::chrono::system_clock::period::den
<< "\n";
}
}
compiled with g++ -std=c++11 -O3
shows:
Timing 1: 0.435909
Timing 2: 0.435438
Timing 3: 0.435435
Timing 4: 0.435523
Second Test:
#include <iostream>
#include <chrono>
inline bool function1(const unsigned Mask, const unsigned n, const unsigned m)
{
const unsigned diff = m-n;
const unsigned msk = Mask & diff;
return (n <= m) && ((!msk && !diff) || (msk && msk <= diff));
}
inline bool function2(const unsigned Mask, const unsigned n, const unsigned m)
{
return (n <= m) && ((!(Mask & (m-n)) && !(m-n)) || ((Mask & (m-n)) && (Mask & (m-n)) <= (m-n)));
}
inline bool function3(const unsigned Mask, const unsigned n, const unsigned m)
{
if(m < n) return false;
else if(m == n) return true;
else return Mask & (m-n);
}
inline bool function4(const unsigned Mask, const unsigned n, const unsigned m)
{
return (n < m && (Mask & (m-n))) || (n == m);
}
inline double duration(
std::chrono::system_clock::time_point start,
std::chrono::system_clock::time_point end)
{
return double((end - start).count())
/ std::chrono::system_clock::period::den;
}
int main() {
typedef bool (*Function)(const unsigned, const unsigned, const unsigned);
const unsigned N = 4;
std::chrono::system_clock::duration timing[N] = {};
Function fn[] = {
function1,
function2,
function3,
function4,
};
const unsigned OuterLoops = 1000000;
const unsigned InnerLoops = 100;
const unsigned Samples = OuterLoops * InnerLoops;
unsigned* M = new unsigned[Samples];
unsigned* A = new unsigned[Samples];
unsigned* B = new unsigned[Samples];
for(unsigned i = 0; i < Samples; ++i) {
M[i] = std::rand();
A[i] = std::rand();
B[i] = std::rand();
}
unsigned result[N];
for(unsigned i = 0; i < OuterLoops; ++i) {
for(unsigned f = 0; f < N; ++f) {
auto start = std::chrono::system_clock::now();
for(unsigned j = 0; j < InnerLoops; ++j) {
unsigned index = i + j;
unsigned mask = M[index];
unsigned a = A[index];
unsigned b = B[index];
result[f] = fn[f](mask, a, b);
}
auto end = std::chrono::system_clock::now();
timing[f] += (end-start);
}
for(unsigned f = 1; f < N; ++f) {
if(result[0] != result[f]) {
std::cerr << "Different Results\n";
exit(-1);
}
}
}
for(unsigned i = 0; i < 4; ++i) {
std::cout
<< "Timing " << i+1 << ": "
<< double(timing[i].count()) / std::chrono::system_clock::period::den
<< "\n";
}
}
compiled with g++ -std=c++11 -O3
shows:
Timing 1: 0.763875
Timing 2: 0.738105
Timing 3: 0.518714
Timing 4: 0.785299
Disassembly of the second functions (compiled without inline):
0000000000000000 <_Z9function1jjj>:
0: 31 c0 xor %eax,%eax
2: 39 f2 cmp %esi,%edx
4: 72 10 jb 16 <_Z9function1jjj+0x16>
6: 29 f2 sub %esi,%edx
8: 21 d7 and %edx,%edi
a: 89 f8 mov %edi,%eax
c: 09 d0 or %edx,%eax
e: 74 18 je 28 <_Z9function1jjj+0x28>
10: 39 d7 cmp %edx,%edi
12: 76 0c jbe 20 <_Z9function1jjj+0x20>
14: 31 c0 xor %eax,%eax
16: f3 c3 repz retq
18: 0f 1f 84 00 00 00 00 nopl 0x0(%rax,%rax,1)
1f: 00
20: 85 ff test %edi,%edi
22: 74 f0 je 14 <_Z9function1jjj+0x14>
24: 0f 1f 40 00 nopl 0x0(%rax)
28: b8 01 00 00 00 mov $0x1,%eax
2d: c3 retq
2e: 66 90 xchg %ax,%ax
0000000000000030 <_Z9function2jjj>:
30: 31 c0 xor %eax,%eax
32: 39 d6 cmp %edx,%esi
34: 77 0c ja 42 <_Z9function2jjj+0x12>
36: 89 d1 mov %edx,%ecx
38: 29 f1 sub %esi,%ecx
3a: 21 cf and %ecx,%edi
3c: 75 0a jne 48 <_Z9function2jjj+0x18>
3e: 39 f2 cmp %esi,%edx
40: 74 0a je 4c <_Z9function2jjj+0x1c>
42: f3 c3 repz retq
44: 0f 1f 40 00 nopl 0x0(%rax)
48: 39 f9 cmp %edi,%ecx
4a: 72 f6 jb 42 <_Z9function2jjj+0x12>
4c: b8 01 00 00 00 mov $0x1,%eax
51: c3 retq
52: 66 66 66 66 66 2e 0f data32 data32 data32 data32 nopw %cs:0x0(%rax,%rax,1)
59: 1f 84 00 00 00 00 00
0000000000000060 <_Z9function3jjj>:
60: 31 c0 xor %eax,%eax
62: 39 f2 cmp %esi,%edx
64: 72 0f jb 75 <_Z9function3jjj+0x15>
66: 74 08 je 70 <_Z9function3jjj+0x10>
68: 29 f2 sub %esi,%edx
6a: 85 fa test %edi,%edx
6c: 0f 95 c0 setne %al
6f: c3 retq
70: b8 01 00 00 00 mov $0x1,%eax
75: f3 c3 repz retq
77: 66 0f 1f 84 00 00 00 nopw 0x0(%rax,%rax,1)
7e: 00 00
0000000000000080 <_Z9function4jjj>:
80: 39 d6 cmp %edx,%esi
82: 73 0d jae 91 <_Z9function4jjj+0x11>
84: 89 d1 mov %edx,%ecx
86: b8 01 00 00 00 mov $0x1,%eax
8b: 29 f1 sub %esi,%ecx
8d: 85 f9 test %edi,%ecx
8f: 75 05 jne 96 <_Z9function4jjj+0x16>
91: 39 d6 cmp %edx,%esi
93: 0f 94 c0 sete %al
96: f3 c3 repz retq
Hardware:
Intel® Core™ i3-2310M CPU @ 2.10GHz × 4
7.7 GiB
My conclusion:
- Analyze the algorithm properly (See @George answer)
- Express the optimized algorithm in simple code and leave fine tuning optimizations to the compiler.
- Write a proper test case (measurement), but the kind of measurement will impact the result. (Here: The first and second show different results) -