What you need here is 4 loads followed by a 4x4 transpose:
#include "emmintrin.h" // SSE2
v0 = _mm_load_si128((__m128i *)&a[0]); // v0 = a0 b0 c0 d0
v1 = _mm_load_si128((__m128i *)&a[16]); // v1 = a1 b1 c1 d1
v2 = _mm_load_si128((__m128i *)&a[32]); // v2 = a2 b2 c2 d2
v3 = _mm_load_si128((__m128i *)&a[48]); // v3 = a3 b3 c3 d3
// 4x4 transpose
w0 = _mm_unpacklo_epi32(v0, v1); // w0 = a0 a1 b0 b1
w1 = _mm_unpackhi_epi32(v0, v1); // w1 = c0 c1 d0 d1
w2 = _mm_unpacklo_epi32(v2, v3); // w2 = a2 a3 b2 b3
w3 = _mm_unpackhi_epi32(v2, v3); // w3 = c2 c3 d2 d3
v0 = _mm_unpacklo_epi64(w0, w2); // v0 = a0 a1 a2 a3
v1 = _mm_unpackhi_epi64(w0, w2); // v1 = b0 b1 b2 b3
v2 = _mm_unpacklo_epi64(w1, w3); // v2 = c0 c1 c2 c3
v3 = _mm_unpackhi_epi64(w1, w3); // v3 = d0 d1 d2 d3
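In case it's useful, here is a minimal self-contained test of the above. One assumption on my part: the 0/16/32/48 offsets suggest that a is indexed in bytes (e.g. a uint8_t pointer over interleaved 32-bit elements a0 b0 c0 d0 a1 b1 c1 d1 ...), so that's how the sketch declares it; buf and out are just illustrative names.

#include <stdio.h>
#include <stdint.h>
#include <emmintrin.h>

int main(void)
{
    // 16 interleaved 32-bit elements: a0 b0 c0 d0 a1 b1 c1 d1 ...
    _Alignas(16) int32_t buf[16];
    for (int i = 0; i < 16; i++)
        buf[i] = i;

    const uint8_t *a = (const uint8_t *)buf; // byte-indexed, as in the snippet

    __m128i v0 = _mm_load_si128((const __m128i *)&a[0]);
    __m128i v1 = _mm_load_si128((const __m128i *)&a[16]);
    __m128i v2 = _mm_load_si128((const __m128i *)&a[32]);
    __m128i v3 = _mm_load_si128((const __m128i *)&a[48]);

    __m128i w0 = _mm_unpacklo_epi32(v0, v1);
    __m128i w1 = _mm_unpackhi_epi32(v0, v1);
    __m128i w2 = _mm_unpacklo_epi32(v2, v3);
    __m128i w3 = _mm_unpackhi_epi32(v2, v3);

    v0 = _mm_unpacklo_epi64(w0, w2);
    v1 = _mm_unpackhi_epi64(w0, w2);
    v2 = _mm_unpacklo_epi64(w1, w3);
    v3 = _mm_unpackhi_epi64(w1, w3);

    int32_t out[16];
    _mm_storeu_si128((__m128i *)&out[0],  v0);
    _mm_storeu_si128((__m128i *)&out[4],  v1);
    _mm_storeu_si128((__m128i *)&out[8],  v2);
    _mm_storeu_si128((__m128i *)&out[12], v3);

    // prints 0 4 8 12 / 1 5 9 13 / 2 6 10 14 / 3 7 11 15
    for (int i = 0; i < 16; i++)
        printf("%d%c", out[i], (i % 4 == 3) ? '\n' : ' ');
    return 0;
}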
Note: this is probably more efficient than using AVX2 gathered loads, since a gather is implemented as one memory access per element; gathers are really only useful when the access pattern is unknown or too irregular to handle with full-width loads and shuffles.
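For comparison, a gathered-load version of the same de-interleave might look like the sketch below (using AVX2's _mm_i32gather_epi32, with the same byte-layout assumption as above). It produces the four transposed vectors directly, but each gather still performs one element-sized read per lane, whereas the code above does the same work with four full-width loads plus cheap register shuffles.

#include <immintrin.h> // AVX2

// Sketch only: idx is in units of the scale argument (4 bytes here), so
// indices {0,4,8,12} read elements one 4-element struct apart.
const int *p  = (const int *)a;
__m128i idx = _mm_setr_epi32(0, 4, 8, 12);
__m128i va  = _mm_i32gather_epi32(p + 0, idx, 4); // va = a0 a1 a2 a3
__m128i vb  = _mm_i32gather_epi32(p + 1, idx, 4); // vb = b0 b1 b2 b3
__m128i vc  = _mm_i32gather_epi32(p + 2, idx, 4); // vc = c0 c1 c2 c3
__m128i vd  = _mm_i32gather_epi32(p + 3, idx, 4); // vd = d0 d1 d2 d3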