If you are limited to AVX instructions, you can still use the conditional blend instruction (VBLENDVPD) to select the correct rotation without using a switch. This is probably faster, especially if the condition cannot be easily predicted.
The full implementation of the right rotation (tested):
// Rotate the four packed doubles in x right by n positions, using AVX only.
//
// Only the two lowest bits of n are used, so the count is effectively taken
// modulo 4 (a negative n therefore rotates left). Rotating by k places source
// element (i + k) mod 4 into result element i. All four possible rotations
// are built and the requested one is picked with variable blends (VBLENDVPD)
// instead of a switch on n.
//
// Element diagrams list source indices from the highest element down to the
// lowest, so x itself reads [3 2 1 0].
__m256d rotate_pd_right(__m256d x, int n) {
    // Candidate rotations, built from an in-lane pair swap and a 128-bit lane swap.
    __m256d pairs    = _mm256_permute_pd(x, 0x05);                  // [2 3 0 1]
    __m256d reversed = _mm256_permute2f128_pd(pairs, pairs, 0x01);  // [0 1 2 3]
    __m256d rot1 = _mm256_blend_pd(pairs, reversed, 0x0a);          // [0 3 2 1]
    __m256d rot2 = _mm256_permute2f128_pd(x, x, 0x01);              // [1 0 3 2]
    __m256d rot3 = _mm256_blend_pd(pairs, reversed, 0x05);          // [2 1 0 3]

    // Copy n into both 64-bit elements of a 128-bit register.
    __m128i count = _mm_cvtsi32_si128(n);
    count = _mm_unpacklo_epi64(count, count);

    // The variable blend selects on the highest bit of each 64-bit mask
    // element, so shift bit 0 (odd count) and bit 1 (count >= 2) of n up there.
    __m128d odd128 = _mm_castsi128_pd(_mm_slli_epi64(count, 63));
    __m128d two128 = _mm_castsi128_pd(_mm_slli_epi64(count, 62));

    // Duplicate each 128-bit mask into both halves of a 256-bit register.
    __m256d odd_mask = _mm256_insertf128_pd(_mm256_castpd128_pd256(odd128), odd128, 1);
    __m256d two_mask = _mm256_insertf128_pd(_mm256_castpd128_pd256(two128), two128, 1);

    // Bit 0 picks within {rot0, rot1} and {rot2, rot3}; bit 1 picks between the pairs.
    __m256d rot0_or_1 = _mm256_blendv_pd(x, rot1, odd_mask);
    __m256d rot2_or_3 = _mm256_blendv_pd(rot2, rot3, odd_mask);
    return _mm256_blendv_pd(rot0_or_1, rot2_or_3, two_mask);
}
Left rotation is then simply a right rotation by the negated count:
// Rotate the four packed doubles in x left by n positions.
//
// Implemented as a right rotation by the negated count. rotate_pd_right only
// looks at the two lowest bits of its count, so n is reduced to 0..3 before
// negating: the result is the same as negating n directly, but the negation
// can no longer overflow (plain -n is undefined behaviour for n == INT_MIN).
__m256d rotate_pd_left(__m256d x, int n) {
    return rotate_pd_right(x, -(n & 3));
}