A pair of phaddd instructions (SSSE3) might be the best solution for summing the four 32-bit lanes of an XMM register. Here is an example:
#define __USE_MINGW_ANSI_STDIO 1
#include <stdio.h>
#include <intrin.h>
//---------------------------------------------------------------------------
// Horizontally sums the four 32-bit integer lanes of `value`.
//
// Each _mm_hadd_epi32 (phaddd) adds adjacent lane pairs. Feeding the same
// register as both operands halves the number of distinct partial sums per
// pass, so after two passes every lane of the returned vector holds
// value[0] + value[1] + value[2] + value[3] (with 32-bit wraparound).
// Callers typically read the total from the low lane.
__m128i add_32X4 (__m128i value)
{
    // Pass 1: lanes become { v0+v1, v2+v3, v0+v1, v2+v3 }.
    const __m128i pair_sums = _mm_hadd_epi32 (value, value);

    // Pass 2: adjacent pair sums are added, giving the full total in each lane.
    return _mm_hadd_epi32 (pair_sums, pair_sums);
}
//---------------------------------------------------------------------------
int main (void)
{
__m128i input = _mm_set_epi32 (1, 10, 100, 1000);
__m128i result = add_32X4 (input);
printf ("%d\n", _mm_cvtsi128_si32 (result));
return 0;
}
//---------------------------------------------------------------------------
program output:
1111
sample code generation (gcc 4.8.1, Windows x64 target, where the __m128i argument is passed by pointer in rcx):
<add_32X4>:
30: 66 0f 6f 01 movdqa xmm0,XMMWORD PTR [rcx]
34: 66 0f 38 02 c0 phaddd xmm0,xmm0
39: 66 0f 38 02 c0 phaddd xmm0,xmm0
3e: c3 ret