I tried three compilers: MS Visual Studio 2012, gcc 4.8.1, and Intel icl 13.1. They all warn, as you point out. I found that both gcc and MS automatically generate initialization code for tmp1, even as they warn about the lack of initialization. The MS compiler generates an undesirable memory access: `movaps xmm0,xmmword ptr [rsp]`. Gcc generates a more efficient `xorps xmm0,xmm0`. So in the case of gcc, adding `tmp1=_mm_setzero_ps()` eliminates the warning and produces exactly the same code as without it. In the case of MS, adding `tmp1=_mm_setzero_ps()` makes the code shorter and probably faster. Only the Intel compiler is smart enough to avoid the unneeded initialization. Here is a possible workaround for the MS and gcc compilers:
__m128 tmp1 = _mm_loadh_pi(_mm_load_ps (src), (__m64*)(src + 4));
Code generation is:
movaps xmm0,xmmword ptr [rcx]
movhps xmm0,qword ptr [rcx+10h]
It looks shorter, but should be benchmarked to make sure it is faster. Note that, unlike the original, _mm_load_ps requires src to be 16-byte aligned.
09/12/2013: test code for different warning suppression ideas:
#include <xmmintrin.h>
#include <stdint.h>
#include <stdio.h>
//---------------------------------------------------------------------------
// Original code from http://download.intel.com/design/PentiumIII/sml/24504301.pdf
//
// Builds a vector from two 64-bit halves: src[0..1] fill the low half and
// src[4..5] fill the high half. tmp1 is deliberately left uninitialized, as in
// the Intel document: it is handed to _mm_loadl_pi only as the source of the
// upper half, which _mm_loadh_pi then replaces. This is the baseline form that
// the compilers warn about.
__m128 func1 (float *src)
{
    __m128 tmp1;
    __m128 low_loaded = _mm_loadl_pi (tmp1, (__m64 *) src);

    return _mm_loadh_pi (low_loaded, (__m64 *) (src + 4));
}
//---------------------------------------------------------------------------
// Original code plus tmp1 initialization.
//
// Same two-halves load as the original, but the vector passed to _mm_loadl_pi
// is an explicit zero vector, so no uninitialized value is read. The result is
// { src[0], src[1], src[4], src[5] }.
__m128 func2 (float *src)
{
    __m64 *low_half = (__m64 *) src;
    __m64 *high_half = (__m64 *) (src + 4);
    __m128 tmp1 = _mm_setzero_ps ();

    tmp1 = _mm_loadl_pi (tmp1, low_half);
    tmp1 = _mm_loadh_pi (tmp1, high_half);
    return tmp1;
}
//---------------------------------------------------------------------------
// Use a redundant load to eliminate the warning.
//
// _mm_load_ps fetches src[0..3]; _mm_loadh_pi then overwrites the upper two
// lanes with src[4..5], so the result is { src[0], src[1], src[4], src[5] }.
// NOTE: _mm_load_ps is the aligned load, so src must be 16-byte aligned.
__m128 func3 (float *src)
{
    __m128 first_four = _mm_load_ps (src);

    return _mm_loadh_pi (first_four, (__m64 *) (src + 4));
}
//---------------------------------------------------------------------------
// Prints the first four floats stored at data, each formatted with "%g " and
// followed by a single newline.
static void dump (void *data)
{
    float *lane = data;
    float *end = lane + 4;

    while (lane != end) {
        printf ("%g ", *lane);
        lane++;
    }
    printf ("\n");
}
//---------------------------------------------------------------------------
// Runs each load variant on the same eight-float input and prints the
// resulting vector, one line per variant, so the outputs can be compared.
int main (void)
{
    // func3 uses _mm_load_ps, which requires a 16-byte aligned address, while a
    // plain float array only guarantees float alignment. Overlaying the array
    // with __m128 forces the needed alignment without relying on
    // compiler-specific attributes.
    union {
        float f [8];
        __m128 force_align [2];
    } input = {{1, 2, 3, 4, 5, 6, 7, 8}};
    __m128 tmp;

    tmp = func1 (input.f);
    dump (&tmp);
    tmp = func2 (input.f);
    dump (&tmp);
    tmp = func3 (input.f);
    dump (&tmp);
    return 0;
}
build commands:
gcc -O3 -Wall -Wfatal-errors sample.c -osample.exe
objdump -Mintel --disassemble sample.exe > disasm.txt
cl -Ox -Zi -W4 sample.c
dumpbin -disasm -symbols sample.exe > disasm.txt
icl -Ox -Zi sample.c
dumpbin -disasm -symbols sample.exe > disasm.txt