GCC generates SSE instructions instead of AVX

https://stackoverflow.com/questions/21840673

12-10-2022
|

Question

I called GCC like this:

$ gcc -I/usr/include/SDL2 -D_REENTRANT -Ibuild -I. -S -fverbose-asm -O2 -m64 -mpc64 -mfpmath=both -fipa-pta -ftree-loop-linear -floop-interchange -floop-strip-mine -floop-block -ftree-loop-distribution -ftree-loop-distribute-patterns -funswitch-loops -ftree-vectorize -march=core-avx-i -c algo/collision.c -o build/collision.s

the important options being:

-S                      : output assembly
-ftree-vectorize        : vectorize loops
-march=core-avx-i       : enable "MMX, SSE, SSE2, SSE3, SSSE3, SSE4.1, SSE4.2,
                        : AVX, AES, PCLMUL, FSGSBASE, RDRND and F16C
                        : instruction set support."

Here is the C source before compilation:

#include "collision.h"

int8_t currentField[FIELD_W][FIELD_H];

// Clear and rebuild the field based on the objects with a gravity well.
//
// Zeroes the whole global currentField grid, then, for every object in
// body->stuff[0 .. body->object_count - 1], marks a rectangle derived from
// that object's pos.x / pos.y / pos.w / pos.h with the byte value 0x01.
//
// body: read-only; only object_count and stuff[i].pos are accessed here.
// Side effect: overwrites the global currentField array.
void buildField (const gravityWell *body) {
    // x: row index while clearing, then object index while rebuilding.
    // y: index into the FIRST dimension of currentField in the fill loop.
    int x, y;
    int w, h, Cx, Cy;
    // Vx / Vy: [0] is the lower bound and [1] the upper bound of the
    // rectangle along each axis.
    int Vx[2], Vy[2];

    // Clear the field: one memset per row; the elements are int8_t, so
    // FIELD_H bytes cover exactly one row of currentField.
    for (x = 0; x < FIELD_W; x++) {
        memset (currentField[x], 0x00, FIELD_H);
    }

    // Rebuild the field
    for (x = 0; x < body->object_count; x++) {
        // Fetch the position and dimensions of the object and convert
        // them to ints.
        // NOTE(review): the type of pos is not visible here; if pos.x and
        // pos.y are floating-point, this assignment truncates toward zero
        // rather than rounding -- confirm that is intended.
        Cx =    body->stuff[x].pos.x;
        Cy =    body->stuff[x].pos.y;
        w = body->stuff[x].pos.w;
        h = body->stuff[x].pos.h;

        // Calculate the lower left and upper right edges of a
        // rectangle encompassing the object. w and h become half-extents
        // (integer division, so odd sizes lose their last unit).
        w = w / 2;
        h = h / 2;
        Vx[0] = Cx - w;
        Vx[1] = Cx + w;
        Vy[0] = Cy - h;
        Vy[1] = Cy + h;

        // Add in the offset for array accesses: coordinate 0 maps to the
        // middle index of each array dimension.
        Vx[0] += FIELD_W / 2;
        Vx[1] += FIELD_W / 2;
        Vy[0] += FIELD_H / 2;
        Vy[1] += FIELD_H / 2;

        // Turn the inclusive upper bounds into exclusive ones so that the
        // loop condition and the byte count below include the upper edge.
        Vx[1]++;
        Vy[1]++;

        // Set the area occupied by the object to ones: for each row index
        // in [Vx[0], Vx[1]), write (Vy[1] - Vy[0]) bytes of 0x01.
        // NOTE(review): the write starts at the beginning of row y
        // (currentField[y]), not at column Vy[0], so only the extent along
        // the second axis is used, not its position -- confirm whether
        // &currentField[y][Vy[0]] was intended.
        // NOTE(review): neither the row index nor the byte count is checked
        // against FIELD_W / FIELD_H, so an object near or beyond the field
        // edge writes outside currentField -- verify callers guarantee
        // in-range positions.
        for (y = Vx[0]; y < Vx[1]; y++) {
            memset (currentField[y], 0x01, (Vy[1] - Vy[0]));
        }
    }

    return;
}

and here is the generated assembly (GAS syntax):

    .file   "collision.c"
# GNU C (Ubuntu/Linaro 4.8.1-10ubuntu9) version 4.8.1 (x86_64-linux-gnu)
#   compiled by GNU C version 4.8.1, GMP version 5.1.2, MPFR version 3.1.1-p2, MPC version 1.0.1
# GGC heuristics: --param ggc-min-expand=100 --param ggc-min-heapsize=131072
# options passed:  -I /usr/include/SDL2 -I build -I .
# -imultiarch x86_64-linux-gnu -D _REENTRANT algo/collision.c -m64 -mpc64
# -mfpmath=both -march=core-avx-i -auxbase-strip build/collision.s -O2
# -fverbose-asm -fipa-pta -floop-interchange -floop-strip-mine -floop-block
# -ftree-loop-distribution -ftree-loop-distribute-patterns -funswitch-loops
# -ftree-vectorize -fstack-protector -Wformat -Wformat-security
# options enabled:  -faggressive-loop-optimizations
# -fasynchronous-unwind-tables -fauto-inc-dec -fbranch-count-reg
# -fcaller-saves -fcombine-stack-adjustments -fcommon -fcompare-elim
# -fcprop-registers -fcrossjumping -fcse-follow-jumps -fdefer-pop
# -fdelete-null-pointer-checks -fdevirtualize -fdwarf2-cfi-asm
# -fearly-inlining -feliminate-unused-debug-types -fexpensive-optimizations
# -fforward-propagate -ffunction-cse -fgcse -fgcse-lm -fgnu-runtime
# -fguess-branch-probability -fhoist-adjacent-loads -fident -fif-conversion
# -fif-conversion2 -findirect-inlining -finline -finline-atomics
# -finline-functions-called-once -finline-small-functions -fipa-cp
# -fipa-profile -fipa-pta -fipa-pure-const -fipa-reference -fipa-sra
# -fira-hoist-pressure -fira-share-save-slots -fira-share-spill-slots
# -fivopts -fkeep-static-consts -fleading-underscore -floop-block
# -floop-interchange -floop-strip-mine -fmath-errno -fmerge-constants
# -fmerge-debug-strings -fmove-loop-invariants -fomit-frame-pointer
# -foptimize-register-move -foptimize-sibling-calls -foptimize-strlen
# -fpartial-inlining -fpeephole -fpeephole2 -fprefetch-loop-arrays -free
# -freg-struct-return -fregmove -freorder-blocks -freorder-functions
# -frerun-cse-after-loop -fsched-critical-path-heuristic
# -fsched-dep-count-heuristic -fsched-group-heuristic -fsched-interblock
# -fsched-last-insn-heuristic -fsched-rank-heuristic -fsched-spec
# -fsched-spec-insn-heuristic -fsched-stalled-insns-dep -fschedule-insns2
# -fshow-column -fshrink-wrap -fsigned-zeros -fsplit-ivs-in-unroller
# -fsplit-wide-types -fstack-protector -fstrict-aliasing -fstrict-overflow
# -fstrict-volatile-bitfields -fsync-libcalls -fthread-jumps
# -ftoplevel-reorder -ftrapping-math -ftree-bit-ccp -ftree-builtin-call-dce
# -ftree-ccp -ftree-ch -ftree-coalesce-vars -ftree-copy-prop
# -ftree-copyrename -ftree-cselim -ftree-dce -ftree-dominator-opts
# -ftree-dse -ftree-forwprop -ftree-fre -ftree-loop-distribute-patterns
# -ftree-loop-distribution -ftree-loop-if-convert -ftree-loop-im
# -ftree-loop-ivcanon -ftree-loop-optimize -ftree-parallelize-loops=
# -ftree-phiprop -ftree-pre -ftree-pta -ftree-reassoc -ftree-scev-cprop
# -ftree-sink -ftree-slp-vectorize -ftree-slsr -ftree-sra
# -ftree-switch-conversion -ftree-tail-merge -ftree-ter
# -ftree-vect-loop-version -ftree-vectorize -ftree-vrp -funit-at-a-time
# -funswitch-loops -funwind-tables -fverbose-asm -fzero-initialized-in-bss
# -m128bit-long-double -m64 -m80387 -maccumulate-outgoing-args -maes
# -malign-stringops -mavx -mavx256-split-unaligned-load
# -mavx256-split-unaligned-store -mcx16 -mf16c -mfancy-math-387
# -mfp-ret-in-387 -mfsgsbase -mfxsr -mglibc -mieee-fp -mlong-double-80
# -mmmx -mpc64 -mpclmul -mpopcnt -mpush-args -mrdrnd -mred-zone -msahf
# -msse -msse2 -msse3 -msse4 -msse4.1 -msse4.2 -mssse3
# -mtls-direct-seg-refs -mvzeroupper -mxsave -mxsaveopt

    .text
    .p2align 4,,15
    .globl  buildField
    .type   buildField, @function
buildField:
.LFB24:
    .cfi_startproc
    pushq   %r14    #
    .cfi_def_cfa_offset 16
    .cfi_offset 14, -16
    pushq   %r13    #
    .cfi_def_cfa_offset 24
    .cfi_offset 13, -24
    movq    %rdi, %r13  # body, body
    pushq   %r12    #
    .cfi_def_cfa_offset 32
    .cfi_offset 12, -32
    pushq   %rbp    #
    .cfi_def_cfa_offset 40
    .cfi_offset 6, -40
    pushq   %rbx    #
    .cfi_def_cfa_offset 48
    .cfi_offset 3, -48
    movl    $currentField, %ebx #, ivtmp.26
    .p2align 4,,10
    .p2align 3
.L3:
    xorl    %esi, %esi  #
    movq    %rbx, %rdi  # ivtmp.26,
    movl    $4000, %edx #,
    call    memset  #
    addq    $4000, %rbx #, ivtmp.26
    cmpq    $currentField+16000000, %rbx    #, ivtmp.26
    jne .L3 #,
    movl    8(%r13), %eax   # body_11(D)->object_count,
    xorl    %r14d, %r14d    # ivtmp.19
    xorl    %r12d, %r12d    # x
    testl   %eax, %eax  #
    jle .L12    #,
    .p2align 4,,10
    .p2align 3
.L11:
    movq    %r14, %rax  # ivtmp.19, D.2657
    addq    0(%r13), %rax   # body_11(D)->stuff, D.2657
    movl    96(%rax), %edx  # _16->pos.w, w
    vmovss  88(%rax), %xmm0 # _16->pos.x,
    vmovss  92(%rax), %xmm1 # _16->pos.y,
    movl    100(%rax), %eax # _16->pos.h, h
    vcvttss2si  %xmm0, %esi #, Cx
    movl    %edx, %edi  # w, tmp125
    vcvttss2si  %xmm1, %ecx #, Cy
    shrl    $31, %edi   #, tmp125
    addl    %edi, %edx  # tmp125, tmp127
    movl    %eax, %edi  # h, tmp128
    sarl    %edx    # tmp127
    shrl    $31, %edi   #, tmp128
    movl    %ecx, %r8d  # Cy, D.2655
    addl    %edi, %eax  # tmp128, tmp130
    movl    %esi, %edi  # Cx, D.2655
    sarl    %eax    # tmp130
    subl    %edx, %edi  # tmp127, D.2655
    addl    %esi, %edx  # Cx, D.2655
    leal    2001(%rcx,%rax), %ebp   #, D.2655
    subl    %eax, %r8d  # tmp130, D.2655
    leal    2000(%rdi), %esi    #, y
    addl    $2000, %r8d #, D.2655
    leal    2001(%rdx), %eax    #, D.2655
    cmpl    %eax, %esi  # D.2655, y
    jge .L8 #,
    movslq  %esi, %rax  # y, D.2660
    subl    %edi, %edx  # D.2655, D.2654
    subl    %r8d, %ebp  # D.2655, D.2655
    leaq    (%rdx,%rax), %rbx   #, D.2654
    movslq  %ebp, %rbp  # D.2655, D.2661
    imulq   $4000, %rax, %rcx   #, D.2660, D.2660
    imulq   $4000, %rbx, %rbx   #, D.2654, D.2654
    addq    $currentField, %rcx #, ivtmp.12
    addq    $currentField+4000, %rbx    #, D.2654
    .p2align 4,,10
    .p2align 3
.L9:
    movq    %rcx, %rdi  # ivtmp.12,
    movq    %rbp, %rdx  # D.2661,
    movl    $1, %esi    #,
    call    memset  #
    movq    %rax, %rcx  #, ivtmp.12
    addq    $4000, %rcx #, ivtmp.12
    cmpq    %rbx, %rcx  # D.2654, ivtmp.12
    jne .L9 #,
.L8:
    addl    $1, %r12d   #, x
    subq    $-128, %r14 #, ivtmp.19
    cmpl    %r12d, 8(%r13)  # x, body_11(D)->object_count
    jg  .L11    #,
.L12:
    popq    %rbx    #
    .cfi_def_cfa_offset 40
    popq    %rbp    #
    .cfi_def_cfa_offset 32
    popq    %r12    #
    .cfi_def_cfa_offset 24
    popq    %r13    #
    .cfi_def_cfa_offset 16
    popq    %r14    #
    .cfi_def_cfa_offset 8
    ret
    .cfi_endproc
.LFE24:
    .size   buildField, .-buildField
    .comm   currentField,16000000,32
    .ident  "GCC: (Ubuntu/Linaro 4.8.1-10ubuntu9) 4.8.1"
    .section    .note.GNU-stack,"",@progbits

GCC appears to use SSE instructions instead of AVX instructions; in particular, it uses SSE's 128-bit %xmm registers rather than AVX's 256-bit %ymm registers.

Why is this, and more importantly, how can I force gcc to use AVX over SSE?

Solution

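Your code does only integer arithmetic; the AVX extension has no 256-bit integer operations. Those were added in AVX2, which you haven’t enabled. (Note that the `v`-prefixed scalar instructions in your listing, such as `vmovss` and `vcvttss2si`, are already the VEX-encoded AVX forms; they simply operate on %xmm registers.)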

Before you go and rewrite all your code to use float or buy a processor with AVX2, I should point out that the array-of-structures memory layout you appear to be using defeats many auto-vectorizers, so it isn’t totally obvious that your code would take advantage of AVX if integer ops were available. You may want to consider using a structure-of-arrays layout instead, though that may also prove to be a relatively invasive change to make.

Licensed under: CC-BY-SA with attribution

Not affiliated with StackOverflow