I suggest two changes:
- Replace `((int8_t*) ((void *) &buffer))[0]` with `_mm_cvtsi128_si32`.
- Replace `_mm_shuffle_epi8` with `_mm_shuffle_epi32` / `_mm_shufflelo_epi16`, which have lower latency on recent AMD processors and Intel Atom, and will save you memory load operations:

```c
static inline int16_t hMin(__m128i buffer)
{
    buffer = _mm_min_epi8(buffer, _mm_shuffle_epi32(buffer, _MM_SHUFFLE(3, 2, 3, 2)));
    buffer = _mm_min_epi8(buffer, _mm_shuffle_epi32(buffer, _MM_SHUFFLE(1, 1, 1, 1)));
    buffer = _mm_min_epi8(buffer, _mm_shufflelo_epi16(buffer, _MM_SHUFFLE(1, 1, 1, 1)));
    buffer = _mm_min_epi8(buffer, _mm_srli_epi16(buffer, 8));
    return (int8_t)_mm_cvtsi128_si32(buffer);
}
```