Question

I'm getting different results from my analysis and from my measurements. The code is two loops, run on a machine with a 512-byte data cache and a 32-byte block size:

int SumByColRow (int matrix[M][M], int size)
{
  int i, j, Sum = 0;

  for (j = 0; j < size; j ++) {
    for (i = 0; i < size; i ++) {
      Sum += matrix[i][j];
    }
  }
  return Sum;
}

int SumByRowCol (int matrix[M][M], int size)
{
  int i, j, Sum = 0;

  for (i = 0; i < size; i ++) {
    for (j = 0; j < size; j ++) {
      Sum += matrix[i][j];
    }
  }
  return Sum;
}

Since C stores matrices row by row, I think the inner loop should not switch rows, so SumByRowCol should be faster: by the principle of spatial locality, the cache should speed up the inner loop because it reads consecutive elements. Yet my measurements show the opposite. Why is it that, when measured, SumByColRow is actually faster?

SumByColRow: Result: 31744
6415.29 us(641529 ticks)
SumByRowCol: Result: 31744
7336.47 us(733647 ticks)
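
For reference, C's row-major storage puts matrix[i][j] at base + (i*M + j)*sizeof(int): stepping j moves one int, stepping i moves a whole row. A tiny sketch (not part of the original program) makes the strides visible:

#include <stdio.h>

enum { M = 4 };

int main(void)
{
    int matrix[M][M];
    printf("&matrix[0][0] = %p\n", (void *)&matrix[0][0]);
    printf("&matrix[0][1] = %p (j step: +%zu bytes)\n",
           (void *)&matrix[0][1], sizeof(int));
    printf("&matrix[1][0] = %p (i step: +%zu bytes)\n",
           (void *)&matrix[1][0], sizeof(matrix[0]));
    return 0;
}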

Update

I ran the program again, making sure that I was actually using the data cache, and this time the result was as expected, so the result above might be a coincidence and the following is more representative:

SumByColRow: Result: 31744
5961.13 us(596113 ticks)
SumByRowCol: Result: 31744
2328.89 us(232889 ticks)

Solution

I can offer a counter-example, closely based on your code.

Code

#include "timer.h"
#include <stdio.h>

enum { M = 128 };

extern int SumByColRow (int matrix[M][M], int size);
extern int SumByRowCol (int matrix[M][M], int size);

int SumByColRow (int matrix[M][M], int size)
{
    int Sum = 0;

    for (int j = 0; j < size; j ++)
    {
        for (int i = 0; i < size; i ++)
            Sum += matrix[i][j];
    }
    return Sum;
}

int SumByRowCol (int matrix[M][M], int size)
{
    int Sum = 0;

    for (int i = 0; i < size; i ++)
    {
        for (int j = 0; j < size; j ++)
            Sum += matrix[i][j];
    }
    return Sum;
}

static inline int max(int i, int j) { return (i > j) ? i : j; }

int main(void)
{
    int matrix[M][M];
    /* Initializing the matrix also warms the cache, to the extent it can be warmed. */
    for (int i = 0; i < M; i++)
        for (int j = 0; j < M; j++)
            matrix[i][j] = 1000*i + j;

    Clock clk;
    unsigned long long x[M];
    char buffer[32];
    unsigned long long sum;

    clk_init(&clk);

    clk_start(&clk);
    for (int i = 0; i < M; i++)
        x[i] = SumByColRow(matrix, max(M - i, 10));  /* keep each result so the calls can't be optimized away */
    clk_stop(&clk);
    sum = 0;
    for (int i = 0; i < M; i++)
        sum += x[i];
    printf("SumByColRow: value = %llu, time = %s\n", sum, clk_elapsed_us(&clk, buffer, sizeof(buffer)));

    clk_start(&clk);
    for (int i = 0; i < M; i++)
        x[i] = SumByRowCol(matrix, max(M - i, 10));
    clk_stop(&clk);
    sum = 0;
    for (int i = 0; i < M; i++)
        sum += x[i];
    printf("SumByRowCol: value = %llu, time = %s\n", sum, clk_elapsed_us(&clk, buffer, sizeof(buffer)));

    return 0;
}

The two SumBy functions are substantially unchanged (minor notational tweaks, but nothing more). The timing harness stores a start time and a stop time in the Clock structure, and the clk_elapsed_us() function formats the elapsed time in microseconds into the string it is passed.
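
Since timer.h itself isn't shown, here is a minimal sketch of what it might contain, assuming POSIX clock_gettime(); only the names Clock, clk_init(), clk_start(), clk_stop() and clk_elapsed_us() are taken from the code above, the rest is an assumption:

#include <stdio.h>
#include <time.h>

typedef struct Clock
{
    struct timespec start;
    struct timespec stop;
} Clock;

static void clk_init(Clock *clk)  { clk->start = clk->stop = (struct timespec){ 0, 0 }; }
static void clk_start(Clock *clk) { clock_gettime(CLOCK_MONOTONIC, &clk->start); }
static void clk_stop(Clock *clk)  { clock_gettime(CLOCK_MONOTONIC, &clk->stop); }

/* Format the elapsed time as seconds.microseconds into the caller's buffer. */
static char *clk_elapsed_us(Clock *clk, char *buffer, size_t buflen)
{
    long sec  = (long)(clk->stop.tv_sec - clk->start.tv_sec);
    long nsec = clk->stop.tv_nsec - clk->start.tv_nsec;
    if (nsec < 0)
    {
        nsec += 1000000000L;
        sec  -= 1;
    }
    snprintf(buffer, buflen, "%ld.%06ld", sec, nsec / 1000);
    return buffer;
}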

The messing around with x[i] and so on is to (try to) ensure that the compiler doesn't optimize everything away.
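
An alternative way to stop dead-code elimination, offered as a suggestion rather than something from the original harness, is to accumulate into a volatile sink; the compiler must then perform every store:

#include <stdio.h>

static volatile unsigned long long sink;   /* every store to this must happen */

static int work(int n) { return n * n; }   /* stand-in for the SumBy functions */

int main(void)
{
    for (int i = 0; i < 1000; i++)
        sink += (unsigned long long)work(i);
    printf("sink = %llu\n", sink);
    return 0;
}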

Output

Machine: Mac OS X 10.8.5, GCC (i686-apple-darwin11-llvm-gcc-4.2 (GCC) 4.2.1 (Based on Apple Inc. build 5658) (LLVM build 2336.11.00)), Intel Core 2 Duo at 2 GHz, 4 GB 1067 MHz DDR3 RAM (an 'Early 2009' Mac Mini).

SumByColRow: value = 33764046316, time = 0.002411
SumByRowCol: value = 33764046316, time = 0.000677

This shows the expected result: the columns-by-rows computation is slower, because at 64 KiB the matrix is far bigger than the data cache (and spans multiple pages besides), so the column-major traversal touches a new cache line on almost every access. It is not yet clear from the question what size M is, nor what size is passed to the SumBy functions, but with a 'big enough' array and varying sizes you can get the expected performance pattern.
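
The back-of-the-envelope arithmetic, using the 512-byte cache and 32-byte blocks from the question (my numbers, assuming a 4-byte int and M = 128), is easy to check:

#include <stdio.h>

int main(void)
{
    const int cache_bytes = 512, block_bytes = 32, elem_bytes = 4, m = 128;

    /* Row order: a miss loads a block of 8 ints; the next 7 reads hit. */
    printf("ints per block   : %d\n", block_bytes / elem_bytes);
    printf("blocks in cache  : %d\n", cache_bytes / block_bytes);

    /* Column order: consecutive reads are a full row (512 bytes) apart,
     * which is the entire cache, so nothing loaded for one element
     * survives until its block is needed again. */
    printf("bytes per row    : %d\n", m * elem_bytes);
    printf("matrix size (KiB): %d\n", m * m * elem_bytes / 1024);
    return 0;
}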

Those times aren't big enough for comfort; I'd rather the lower time were on the order of a second or two. Adding a for (int j = 0; j < 1600; j++) repetition loop in front of each of the timed loops in the main program (sketched below) yields:

SumByColRow: value = 33764046316, time = 2.529205
SumByRowCol: value = 33764046316, time = 1.022970

The ratio is smaller than before (2.47 versus 3.56), but still decidedly tilted in favour of SumByRowCol().
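
For concreteness, the repetition wrapper mentioned above looks like this for the first timed loop (the second is analogous); it is a fragment of the main() shown earlier:

    clk_start(&clk);
    for (int j = 0; j < 1600; j++)      /* repeat so the timed region lasts seconds, not microseconds */
    {
        for (int i = 0; i < M; i++)
            x[i] = SumByColRow(matrix, max(M - i, 10));
    }
    clk_stop(&clk);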

Initializing the matrix 'warms the cache' to the extent it can be warmed. Reversing the order of computation (SumByRowCol before SumByColRow) does not make a significant difference to the timings. The results are pretty consistent across multiple runs.

Assembler output

Compiled with gcc -O3 -std=c99 -S. The crucial difference is in the inner loops: _SumByColRow advances its pointer by 512 bytes, one whole 128-int row, per addition (addq $512, %r8), while _SumByRowCol advances by just 4 bytes (addq $4, %r8):

    .section        __TEXT,__text,regular,pure_instructions
    .globl  _SumByColRow
    .align  4, 0x90
_SumByColRow:
Leh_func_begin1:
    pushq   %rbp
Ltmp0:
    movq    %rsp, %rbp
Ltmp1:
    testl   %esi, %esi
    jg      LBB1_5
    xorl    %eax, %eax
LBB1_2:
    popq    %rbp
    ret
LBB1_5:
    movl    %esi, %ecx
    xorl    %eax, %eax
    movq    %rcx, %rdx
    jmp     LBB1_6
    .align  4, 0x90
LBB1_3:
    addl    (%r8), %eax
    addq    $512, %r8
    decq    %rsi
    jne     LBB1_3
    addq    $4, %rdi
    decq    %rdx
    je      LBB1_2
LBB1_6:
    movq    %rcx, %rsi
    movq    %rdi, %r8
    jmp     LBB1_3
Leh_func_end1:

    .globl  _SumByRowCol
    .align  4, 0x90
_SumByRowCol:
Leh_func_begin2:
    pushq   %rbp
Ltmp2:
    movq    %rsp, %rbp
Ltmp3:
    testl   %esi, %esi
    jg      LBB2_5
    xorl    %eax, %eax
LBB2_2:
    popq    %rbp
    ret
LBB2_5:
    movl    %esi, %ecx
    xorl    %eax, %eax
    movq    %rcx, %rdx
    jmp     LBB2_6
    .align  4, 0x90
LBB2_3:
    addl    (%r8), %eax
    addq    $4, %r8
    decq    %rsi
    jne     LBB2_3
    addq    $512, %rdi
    decq    %rdx
    je      LBB2_2
LBB2_6:
    movq    %rcx, %rsi
    movq    %rdi, %r8
    jmp     LBB2_3
Leh_func_end2: