/* NOTE: this variant is probably faster. */
/* byte_result_vec 000H 000G 000F 000E 000D 000C 000B 000A */
/* Pull the low byte of each 32-bit element to the front of its 128-bit lane:
 * in the low lane the bytes land at offsets 0..3, in the high lane at 4..7;
 * every -1 position is zeroed by vpshufb. */
const __m256i pack_lo_bytes = _mm256_setr_epi8(0, 4, 8, 12, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 4, 8, 12, -1, -1, -1, -1, -1, -1, -1, -1);
/* packed: 0000 0000 HGFE 0000 0000 0000 0000 DCBA */
const __m256i packed = _mm256_shuffle_epi8(byte_result_vec, pack_lo_bytes);
/* Low 128 bits: 0000 0000 0000 DCBA (cast is free, no instruction emitted). */
const __m128i lo_half = _mm256_castsi256_si128(packed);
/* High 128 bits: 0000 HGFE 0000 0000 */
const __m128i hi_half = _mm256_extracti128_si256(packed, 1);
/* The two non-zero dwords don't overlap, so OR merges them into the low
 * 64 bits (bytes A..H); store those 8 bytes with a single movq. */
_mm_storel_epi64((__m128i*)&result_array[j], _mm_or_si128(lo_half, hi_half));