Вопрос

Tried the following vector subtraction code with a console project generated by VS2012 Update 1. I didn't really touch the default options other than disabling global optimizations and enabling assembler listings.

Compiled with x64 release configuration on Windows 7 x64 SP1.

#include <stdio.h>
#include <tchar.h>

#include <emmintrin.h>

typedef unsigned short ushort;
typedef unsigned int uint;

// Prints the eight unsigned 16-bit lanes of the vector `i` on one line,
// lane 0 first, enclosed in square brackets.
void print(__m128i i)
{
    // View the vector through its m128i_u16 member so lanes can be indexed 0..7.
    auto& arr = i.m128i_u16;
    // Each 16-bit lane is promoted to int when passed through the variadic call,
    // so %d matches the argument type.
    printf("[%d %d  %d  %d  %d  %d  %d  %d]\n", arr[0], arr[1], arr[2], arr[3], arr[4], arr[5], arr[6], arr[7]);
}

// Entry point of the reproduction: walks two ushort arrays eight elements at a
// time and prints the saturating unsigned difference of each pair of chunks.
// argc/argv are unused. Always returns 0.
int _tmain(int argc, _TCHAR* argv[])
{
    // 912 elements processed 8 per iteration gives 114 loop iterations.
    const int lineSize = 912;
    // NOTE: neither array is written anywhere before the loop below reads it,
    // so every load in the loop operates on indeterminate values.
    ushort input[lineSize];
    ushort vals[lineSize];
//  printf("%X %X\n", input, vals); // note this one

    for (uint i=0; i<lineSize; i+=8)
    {
        // Unaligned 128-bit loads of eight consecutive ushort elements from each array.
        __m128i vecinput = _mm_loadu_si128((__m128i*) &input[i]);
        __m128i vecvals = _mm_loadu_si128((__m128i*) &vals[i]);

        // Per-lane unsigned 16-bit subtraction with saturation (input - vals);
        // this is the psubusw instruction in the generated listing.
        __m128i output = _mm_subs_epu16(vecinput, vecvals);
        print(output);
        printf("===\n");
    }

    return 0;
}

Generated assembly in release mode:

; 20   :    const int lineSize = 912;
; 21   :    ushort input[lineSize];
; 22   :    ushort vals[lineSize];

; without printf
; 23   : // printf("%X %X\n", input, vals);
; with printf
; 23   :    printf("%X %X\n", input, vals);

    lea r8, QWORD PTR vals$[rsp]
    lea rdx, QWORD PTR input$[rsp]
    lea rcx, OFFSET FLAT:??_C@_06NBKGFLKK@?$CFX?5?$CFX?6?$AA@
    call    QWORD PTR __imp_printf

; 24   : 
; 25   :    for (uint i=0; i<lineSize; i+=8)

    xor esi, esi
    lea ebp, QWORD PTR [rsi+114]
    npad    2
$LL3@wmain:

; 26   :    {
; 27   :        __m128i vecinput = _mm_loadu_si128((__m128i*) &input[i]);

    movdqu  xmm1, XMMWORD PTR input$[rsp+rsi]

; 28   :        __m128i vecvals = _mm_loadu_si128((__m128i*) &vals[i]);

; without printf
    movdqu  xmm0, xmm1
; with printf
    movdqu  xmm0, XMMWORD PTR vals$[rsp+rsi]

; 29   : 
; 30   :        __m128i output = _mm_subs_epu16(vecinput, vecvals);

; without printf
    psubusw xmm1, xmm1
; with printf
    psubusw xmm1, xmm0

; 15   :    printf("[%d %d  %d  %d  %d  %d  %d  %d]\n", arr[0], arr[1], arr[2], arr[3], arr[4], arr[5], arr[6], arr[7]);

    pextrw  ax, xmm1, 7
    movzx   edi, ax
    pextrw  ax, xmm1, 6
    movzx   ebx, ax
    pextrw  ax, xmm1, 5
    mov DWORD PTR [rsp+64], edi
    movzx   r11d, ax
    pextrw  ax, xmm1, 4
    mov DWORD PTR [rsp+56], ebx
    movzx   r10d, ax
    pextrw  ax, xmm1, 3
    mov DWORD PTR [rsp+48], r11d
    movzx   ecx, ax
    pextrw  ax, xmm1, 2
    mov DWORD PTR [rsp+40], r10d
    movzx   r9d, ax
    pextrw  ax, xmm1, 1
    mov DWORD PTR [rsp+32], ecx
    movzx   r8d, ax
    lea rcx, OFFSET FLAT:??_C@_0BL@ONEMJFJK@?$FL?$CFd?7?$CFd?7?$CFd?7?$CFd?7?$CFd?7?$CFd?7?$CFd?7?$CFd?$FN?6?$AA@
    movd    eax, xmm1
    movzx   edx, ax
    call    QWORD PTR __imp_printf

; 31   :        print(output);
; 32   :        printf("===\n");

    lea rcx, OFFSET FLAT:??_C@_04LEHBMKOA@?$DN?$DN?$DN?6?$AA@
    call    QWORD PTR __imp_printf
    lea rsi, QWORD PTR [rsi+16]
    dec rbp
    jne $LL3@wmain

; 33   :    }
; 34   : 
; 35   :    return 0;

    xor eax, eax

; 95   : }

    mov rcx, QWORD PTR __$ArrayPad$[rsp]
    xor rcx, rsp
    call    __security_check_cookie
    lea r11, QWORD PTR [rsp+1920]
    mov rbx, QWORD PTR [r11+16]
    mov rbp, QWORD PTR [r11+24]
    mov rsi, QWORD PTR [r11+32]
    mov rsp, r11
    pop rdi
    ret 0
wmain   ENDP

So vals is incorrectly treated as if it were the same as input, and the result will always be 0. It's also interesting that, because of this apparently wrong optimization, xmm0 is never used afterwards, yet the instruction that loads it is still not eliminated. If you uncomment that printf, the generated code is correct.

So the question is, is there anything wrong with my code? To me it totally looks like a bug in the optimizer.

Это было полезно?

Решение

You never initialize the arrays ushort input[lineSize] and ushort vals[lineSize], so reading them is undefined behavior. The optimizer happens to treat them as identical, which it is allowed to do: once the behavior is undefined, any result is permitted.

When you have the printf("%X %X\n", input, vals) call in there, you're passing the address of the arrays to an external function, so the optimizer has reason to believe that the memory they point to may be updated by that external function.

Лицензировано под: CC-BY-SA с атрибуцией
Не связан с StackOverflow
scroll top