If your goal is to take the horizontal sum of eight 16-bit values held in a single `__m128i` (called `a` below), you can do this with SSE2 like this:
// Horizontal add of the eight 16-bit lanes of `a`. Each step folds the upper
// half of the remaining partial sums onto the lower half (8 -> 4 -> 2 -> 1).
// All additions are _mm_add_epi16, i.e. they wrap modulo 2^16.

// Step 1: copy the upper 64 bits (lanes 4..7) down to lanes 0..3 and add.
__m128i upper_half = _mm_shuffle_epi32(a, _MM_SHUFFLE(0, 0, 3, 2));
__m128i quad_sums = _mm_add_epi16(a, upper_half); // lanes 0..3: a[i] + a[i+4]

// Step 2: copy 32-bit element 1 (lanes 2..3) down to lanes 0..1 and add.
__m128i upper_pair = _mm_shuffle_epi32(quad_sums, _MM_SHUFFLE(0, 0, 0, 1));
__m128i pair_sums = _mm_add_epi16(quad_sums, upper_pair); // lanes 0..1: two partial sums

// Step 3: copy lane 1 down to lane 0 and add; lane 0 now holds the total.
__m128i upper_lane = _mm_shufflelo_epi16(pair_sums, _MM_SHUFFLE(0, 0, 0, 1));
__m128i total = _mm_add_epi16(pair_sums, upper_lane);

// Fetch the low 32 bits and narrow to 16 bits; only lane 0 is the result.
int16_t sum7 = (int16_t)_mm_cvtsi128_si32(total);