linear search through uint64[] with SSE

Question 1

I suggest NOT using the built in intrinsics and implicit vectors. This only makes sense if you don't use the non GCC intrinsics (e.g. _mm_cmpeq_epi32) and only want to stick to GCC. You can do what you want like this

__m128i key2 = _mm_set1_epi64x(key);
__m128i v1 = _mm_loadu_si128((const __m128i *)&data[start + i + 0]);
__m128i v2 = _mm_loadu_si128((const __m128i *)&data[start + i + 2]);

__m128i cmp0 = _mm_cmpeq_epi64(key2, v1);
__m128i cmp1 = _mm_cmpeq_epi64(key2, v2);

__m128i low2  = _mm_shuffle_epi32(cmp0,0xD8);  
__m128i high2 = _mm_shuffle_epi32(cmp1,0xD8);      
__m128i pack = _mm_unpacklo_epi64(low2,high2);

__m128i pack01 = _mm_packs_epi32(pack, _mm_setzero_si128());
__m128i pack0123 = _mm_packs_epi16(pack01, _mm_setzero_si128());

int res =  _mm_movemask_epi8(pack0123);

You can probably find a more efficient version that avoids the packing but then you would have to use a different function than __builtin_ctz.

For 32-bit ints I suggest

__m128i key4 = _mm_set1_epi32(key);
__m128i v1 = _mm_loadu_si128((const __m128i *)&data[start + i + 0]);
__m128i v2 = _mm_loadu_si128((const __m128i *)&data[start + i + 4]);
__m128i v3 = _mm_loadu_si128((const __m128i *)&data[start + i + 8]);
__m128i v4 = _mm_loadu_si128((const __m128i *)&data[start + i + 12]);

__m128i cmp0 = _mm_cmpeq_epi32(key4, v1);
__m128i cmp1 = _mm_cmpeq_epi32(key4, v2);
__m128i cmp2 = _mm_cmpeq_epi32(key4, v3);
__m128i cmp3 = _mm_cmpeq_epi32(key4, v4);

__m128i pack01 = _mm_packs_epi32(cmp0, cmp1);
__m128i pack23 = _mm_packs_epi32(cmp2, cmp3);
__m128i pack0123 = _mm_packs_epi16(pack01, pack23);

int res = _mm_movemask_epi8(pack0123);

Question 2

This is a complete solution for searching 64 bit values for a match. In this case, the value (namehash) is a struct member. This routine compares 8 64 bit values on each iteration and provides the matching struct index.

//ptr is a struct array
__m128i key2 = _mm_set1_epi64x(k); //k is the 64 bit search key
for(;;)
   {
   if(!ptr[i].namehash)return NULL;
   __m128i v1 = _mm_set_epi64x (ptr[i+1].namehash,ptr[i].namehash);
   __m128i v2 = _mm_set_epi64x (ptr[i+3].namehash,ptr[i+2].namehash);
   __m128i v3 = _mm_set_epi64x (ptr[i+5].namehash,ptr[i+4].namehash);
   __m128i v4 = _mm_set_epi64x (ptr[i+7].namehash,ptr[i+6].namehash);

   __m128i cmp0 = _mm_cmpeq_epi64(key2, v1);
   __m128i cmp1 = _mm_cmpeq_epi64(key2, v2);
   __m128i cmp2 = _mm_cmpeq_epi64(key2, v3);
   __m128i cmp3 = _mm_cmpeq_epi64(key2, v4);

   __m128i L0  = _mm_shuffle_epi32(cmp0,0xD8);
   __m128i H1 = _mm_shuffle_epi32(cmp1,0xD8);
   __m128i L2  = _mm_shuffle_epi32(cmp2,0xD8);
   __m128i H3 = _mm_shuffle_epi32(cmp3,0xD8);

   __m128i pack0 = _mm_unpacklo_epi64(L0,H1);
   __m128i pack1 = _mm_unpacklo_epi64(L2,H3);

   __m128i pack01 = _mm_packs_epi32(pack0,pack1);
   __m128i pack0123 = _mm_packs_epi16(pack01, _mm_setzero_si128());

   res =  _mm_movemask_epi8(pack0123);
   if(res > 0)break;
   i+=8;
   }

int index = i + __builtin_ctz(res);  //The struct table index to the matching struct.

IMPORTANT: The struct array length needs to be a multiple of 8, with at least 1 NULL trailing member

Alternatively, if only 2 64 bit comparisons are wanted per iteration, the above can be greatly simplified to:

for(;;)
   {
   if(!ptr[i].namehash)return NULL;
   __m128i v1 = _mm_set_epi64x (ptr[i+1].namehash,ptr[i].namehash);
   __m128i cmp0 = _mm_cmpeq_epi64(key2, v1);
   res =  _mm_movemask_epi8(cmp0);
   if(res > 0)break;
   i+=2;
   }
int ctz = __builtin_ctz(res);
int index = i + (ctz>>5);  //The struct table index to the matching struct.

Question 3

I can't find any instructions to convert 64-bit integer to a 32-bit integer, which is what you need to then use packssdw, etc. It gets quite long and messy, but should work:

So, I think the solution is to use a bitmask (bits 0, 1, 2, 3:

These go before the loop:

vec2uint64 mask0 = { 2, 1 };
vec2uint64 mask1 = { 8, 4 };
vec2uint64 zero  = { 0, 0 };

Inside loop:

vec2uint64 res0 = _mm_and_si128(cmp0, mask0);
vec2uint64 res1 = _mm_and_si128(cmp1, mask1);

vec2uint64 res2 = _mm_or_si128(res0, res1);

Then we need to shuffle the high part to the low part of a new variable:

vec2uint64 hi0 = _mm_unpackhi_epi64(res0, zero);
vec2uint64 hi1 = _mm_unpackhi_epi64(res1, zero);

vec2uint64 hi2 = _mm_or_si128(hi0, hi1);
vec2uint64 res3 = _mm_or_si128(res2, hi2);

Now the low 64 bits [well, the low 4 bits, the rest are zero] of res is a bitmask of which one of the values that matches.

int res = _mm_cvtsi128_si32(res3);

(Now you can count trailing zeros like before).