Question

I am representing a bitfield with __m128i and need a fast way to check whether or not a specific bit is set, and also a way to set a specific bit. Do I have to set up another __m128i as a mask and OR them, or is there an instruction I am missing that is faster? I am using the Intel compilers.

Was it helpful?

Solution

You could try something like this. I don't believe there is a quicker way. You will likely want to pull some of the constant values and the table out of the performance-critical part of the code.

  __m128i v; // todo: fill in the 128-bit bitfield to operate on

  // --- test a single bit ---
  int n; // todo: zero-indexed position of the bit to test

  // One bit per byte in the 16-bit result of _mm_movemask_epi8; pick the
  // entry for the byte that contains bit n.
  int     movemask = 1 << (n >> 3);
  // Bit (n & 0xF) of every 16-bit lane, which includes bit (n & 7) of byte (n >> 3).
  __m128i chkmask  = _mm_slli_epi16(_mm_set1_epi16(1), n & 0xF);
  // Bytes of v sharing no bit with chkmask become 0 and then compare equal to zero.
  __m128i probed   = _mm_and_si128(chkmask, v);
  __m128i isZero   = _mm_cmpeq_epi8(probed, _mm_setzero_si128());
  // Non-zero (equal to movemask) when bit n is set, 0 when it is clear.
  int     isSet    = movemask & ~_mm_movemask_epi8(isZero);

  // --- set a single bit ---
  int m; // todo: zero-indexed position of the bit to set

  // Build a shuffle control whose byte i is (i - (m >> 3)) mod 16, so that
  // destination byte (m >> 3) reads source byte 0 and every other destination
  // byte reads one of the (all-zero) source bytes 1..15.
  const __m128i byteIdx = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
  const __m128i rotate  = _mm_set1_epi8(16 - (m >> 3));
  const __m128i lowNib  = _mm_set1_epi8(0x0F);
  const __m128i shuf    = _mm_and_si128(_mm_add_epi8(byteIdx, rotate), lowNib);

  // Source register: bit (m & 7) in byte 0, everything else zero.
  const __m128i seed    = _mm_cvtsi32_si128(1 << (m & 0x7));

  // Move the seed byte into position and merge it into the bitfield.
  const __m128i setmask = _mm_shuffle_epi8(seed, shuf);
  v = _mm_or_si128(v, setmask);

  // Alternative: look-up table approach for both checking and setting a bit.
  //
  // lut holds 128 single-bit masks, one per bit position of an __m128i.
  // Entry i has exactly one bit set. _mm_set_epi32 takes its arguments from
  // the highest 32-bit element to the lowest, so entries 0..31 place the bit
  // in the last argument, 32..63 in the third, 64..95 in the second and
  // 96..127 in the first; within each group the bit moves up one position
  // per entry (0x00000001 ... 0x80000000).
  //
  // NOTE(review): __declspec(align(16)) is compiler-specific syntax; the
  // entries are read below with _mm_load_si128 -- confirm alignment handling
  // if this is built with a different compiler.
  __declspec(align(16)) __m128i lut[] = {
    // entries 0..31: bit in the lowest 32-bit element
    _mm_set_epi32(0x00000000, 0x00000000, 0x00000000, 0x00000001),
    _mm_set_epi32(0x00000000, 0x00000000, 0x00000000, 0x00000002),
    _mm_set_epi32(0x00000000, 0x00000000, 0x00000000, 0x00000004),
    _mm_set_epi32(0x00000000, 0x00000000, 0x00000000, 0x00000008),
    _mm_set_epi32(0x00000000, 0x00000000, 0x00000000, 0x00000010),
    _mm_set_epi32(0x00000000, 0x00000000, 0x00000000, 0x00000020),
    _mm_set_epi32(0x00000000, 0x00000000, 0x00000000, 0x00000040),
    _mm_set_epi32(0x00000000, 0x00000000, 0x00000000, 0x00000080),
    _mm_set_epi32(0x00000000, 0x00000000, 0x00000000, 0x00000100),
    _mm_set_epi32(0x00000000, 0x00000000, 0x00000000, 0x00000200),
    _mm_set_epi32(0x00000000, 0x00000000, 0x00000000, 0x00000400),
    _mm_set_epi32(0x00000000, 0x00000000, 0x00000000, 0x00000800),
    _mm_set_epi32(0x00000000, 0x00000000, 0x00000000, 0x00001000),
    _mm_set_epi32(0x00000000, 0x00000000, 0x00000000, 0x00002000),
    _mm_set_epi32(0x00000000, 0x00000000, 0x00000000, 0x00004000),
    _mm_set_epi32(0x00000000, 0x00000000, 0x00000000, 0x00008000),
    _mm_set_epi32(0x00000000, 0x00000000, 0x00000000, 0x00010000),
    _mm_set_epi32(0x00000000, 0x00000000, 0x00000000, 0x00020000),
    _mm_set_epi32(0x00000000, 0x00000000, 0x00000000, 0x00040000),
    _mm_set_epi32(0x00000000, 0x00000000, 0x00000000, 0x00080000),
    _mm_set_epi32(0x00000000, 0x00000000, 0x00000000, 0x00100000),
    _mm_set_epi32(0x00000000, 0x00000000, 0x00000000, 0x00200000),
    _mm_set_epi32(0x00000000, 0x00000000, 0x00000000, 0x00400000),
    _mm_set_epi32(0x00000000, 0x00000000, 0x00000000, 0x00800000),
    _mm_set_epi32(0x00000000, 0x00000000, 0x00000000, 0x01000000),
    _mm_set_epi32(0x00000000, 0x00000000, 0x00000000, 0x02000000),
    _mm_set_epi32(0x00000000, 0x00000000, 0x00000000, 0x04000000),
    _mm_set_epi32(0x00000000, 0x00000000, 0x00000000, 0x08000000),
    _mm_set_epi32(0x00000000, 0x00000000, 0x00000000, 0x10000000),
    _mm_set_epi32(0x00000000, 0x00000000, 0x00000000, 0x20000000),
    _mm_set_epi32(0x00000000, 0x00000000, 0x00000000, 0x40000000),
    _mm_set_epi32(0x00000000, 0x00000000, 0x00000000, 0x80000000),
    // entries 32..63: bit in the second-lowest 32-bit element
    _mm_set_epi32(0x00000000, 0x00000000, 0x00000001, 0x00000000),
    _mm_set_epi32(0x00000000, 0x00000000, 0x00000002, 0x00000000),
    _mm_set_epi32(0x00000000, 0x00000000, 0x00000004, 0x00000000),
    _mm_set_epi32(0x00000000, 0x00000000, 0x00000008, 0x00000000),
    _mm_set_epi32(0x00000000, 0x00000000, 0x00000010, 0x00000000),
    _mm_set_epi32(0x00000000, 0x00000000, 0x00000020, 0x00000000),
    _mm_set_epi32(0x00000000, 0x00000000, 0x00000040, 0x00000000),
    _mm_set_epi32(0x00000000, 0x00000000, 0x00000080, 0x00000000),
    _mm_set_epi32(0x00000000, 0x00000000, 0x00000100, 0x00000000),
    _mm_set_epi32(0x00000000, 0x00000000, 0x00000200, 0x00000000),
    _mm_set_epi32(0x00000000, 0x00000000, 0x00000400, 0x00000000),
    _mm_set_epi32(0x00000000, 0x00000000, 0x00000800, 0x00000000),
    _mm_set_epi32(0x00000000, 0x00000000, 0x00001000, 0x00000000),
    _mm_set_epi32(0x00000000, 0x00000000, 0x00002000, 0x00000000),
    _mm_set_epi32(0x00000000, 0x00000000, 0x00004000, 0x00000000),
    _mm_set_epi32(0x00000000, 0x00000000, 0x00008000, 0x00000000),
    _mm_set_epi32(0x00000000, 0x00000000, 0x00010000, 0x00000000),
    _mm_set_epi32(0x00000000, 0x00000000, 0x00020000, 0x00000000),
    _mm_set_epi32(0x00000000, 0x00000000, 0x00040000, 0x00000000),
    _mm_set_epi32(0x00000000, 0x00000000, 0x00080000, 0x00000000),
    _mm_set_epi32(0x00000000, 0x00000000, 0x00100000, 0x00000000),
    _mm_set_epi32(0x00000000, 0x00000000, 0x00200000, 0x00000000),
    _mm_set_epi32(0x00000000, 0x00000000, 0x00400000, 0x00000000),
    _mm_set_epi32(0x00000000, 0x00000000, 0x00800000, 0x00000000),
    _mm_set_epi32(0x00000000, 0x00000000, 0x01000000, 0x00000000),
    _mm_set_epi32(0x00000000, 0x00000000, 0x02000000, 0x00000000),
    _mm_set_epi32(0x00000000, 0x00000000, 0x04000000, 0x00000000),
    _mm_set_epi32(0x00000000, 0x00000000, 0x08000000, 0x00000000),
    _mm_set_epi32(0x00000000, 0x00000000, 0x10000000, 0x00000000),
    _mm_set_epi32(0x00000000, 0x00000000, 0x20000000, 0x00000000),
    _mm_set_epi32(0x00000000, 0x00000000, 0x40000000, 0x00000000),
    _mm_set_epi32(0x00000000, 0x00000000, 0x80000000, 0x00000000),
    // entries 64..95: bit in the second-highest 32-bit element
    _mm_set_epi32(0x00000000, 0x00000001, 0x00000000, 0x00000000),
    _mm_set_epi32(0x00000000, 0x00000002, 0x00000000, 0x00000000),
    _mm_set_epi32(0x00000000, 0x00000004, 0x00000000, 0x00000000),
    _mm_set_epi32(0x00000000, 0x00000008, 0x00000000, 0x00000000),
    _mm_set_epi32(0x00000000, 0x00000010, 0x00000000, 0x00000000),
    _mm_set_epi32(0x00000000, 0x00000020, 0x00000000, 0x00000000),
    _mm_set_epi32(0x00000000, 0x00000040, 0x00000000, 0x00000000),
    _mm_set_epi32(0x00000000, 0x00000080, 0x00000000, 0x00000000),
    _mm_set_epi32(0x00000000, 0x00000100, 0x00000000, 0x00000000),
    _mm_set_epi32(0x00000000, 0x00000200, 0x00000000, 0x00000000),
    _mm_set_epi32(0x00000000, 0x00000400, 0x00000000, 0x00000000),
    _mm_set_epi32(0x00000000, 0x00000800, 0x00000000, 0x00000000),
    _mm_set_epi32(0x00000000, 0x00001000, 0x00000000, 0x00000000),
    _mm_set_epi32(0x00000000, 0x00002000, 0x00000000, 0x00000000),
    _mm_set_epi32(0x00000000, 0x00004000, 0x00000000, 0x00000000),
    _mm_set_epi32(0x00000000, 0x00008000, 0x00000000, 0x00000000),
    _mm_set_epi32(0x00000000, 0x00010000, 0x00000000, 0x00000000),
    _mm_set_epi32(0x00000000, 0x00020000, 0x00000000, 0x00000000),
    _mm_set_epi32(0x00000000, 0x00040000, 0x00000000, 0x00000000),
    _mm_set_epi32(0x00000000, 0x00080000, 0x00000000, 0x00000000),
    _mm_set_epi32(0x00000000, 0x00100000, 0x00000000, 0x00000000),
    _mm_set_epi32(0x00000000, 0x00200000, 0x00000000, 0x00000000),
    _mm_set_epi32(0x00000000, 0x00400000, 0x00000000, 0x00000000),
    _mm_set_epi32(0x00000000, 0x00800000, 0x00000000, 0x00000000),
    _mm_set_epi32(0x00000000, 0x01000000, 0x00000000, 0x00000000),
    _mm_set_epi32(0x00000000, 0x02000000, 0x00000000, 0x00000000),
    _mm_set_epi32(0x00000000, 0x04000000, 0x00000000, 0x00000000),
    _mm_set_epi32(0x00000000, 0x08000000, 0x00000000, 0x00000000),
    _mm_set_epi32(0x00000000, 0x10000000, 0x00000000, 0x00000000),
    _mm_set_epi32(0x00000000, 0x20000000, 0x00000000, 0x00000000),
    _mm_set_epi32(0x00000000, 0x40000000, 0x00000000, 0x00000000),
    _mm_set_epi32(0x00000000, 0x80000000, 0x00000000, 0x00000000),
    // entries 96..127: bit in the highest 32-bit element
    _mm_set_epi32(0x00000001, 0x00000000, 0x00000000, 0x00000000),
    _mm_set_epi32(0x00000002, 0x00000000, 0x00000000, 0x00000000),
    _mm_set_epi32(0x00000004, 0x00000000, 0x00000000, 0x00000000),
    _mm_set_epi32(0x00000008, 0x00000000, 0x00000000, 0x00000000),
    _mm_set_epi32(0x00000010, 0x00000000, 0x00000000, 0x00000000),
    _mm_set_epi32(0x00000020, 0x00000000, 0x00000000, 0x00000000),
    _mm_set_epi32(0x00000040, 0x00000000, 0x00000000, 0x00000000),
    _mm_set_epi32(0x00000080, 0x00000000, 0x00000000, 0x00000000),
    _mm_set_epi32(0x00000100, 0x00000000, 0x00000000, 0x00000000),
    _mm_set_epi32(0x00000200, 0x00000000, 0x00000000, 0x00000000),
    _mm_set_epi32(0x00000400, 0x00000000, 0x00000000, 0x00000000),
    _mm_set_epi32(0x00000800, 0x00000000, 0x00000000, 0x00000000),
    _mm_set_epi32(0x00001000, 0x00000000, 0x00000000, 0x00000000),
    _mm_set_epi32(0x00002000, 0x00000000, 0x00000000, 0x00000000),
    _mm_set_epi32(0x00004000, 0x00000000, 0x00000000, 0x00000000),
    _mm_set_epi32(0x00008000, 0x00000000, 0x00000000, 0x00000000),
    _mm_set_epi32(0x00010000, 0x00000000, 0x00000000, 0x00000000),
    _mm_set_epi32(0x00020000, 0x00000000, 0x00000000, 0x00000000),
    _mm_set_epi32(0x00040000, 0x00000000, 0x00000000, 0x00000000),
    _mm_set_epi32(0x00080000, 0x00000000, 0x00000000, 0x00000000),
    _mm_set_epi32(0x00100000, 0x00000000, 0x00000000, 0x00000000),
    _mm_set_epi32(0x00200000, 0x00000000, 0x00000000, 0x00000000),
    _mm_set_epi32(0x00400000, 0x00000000, 0x00000000, 0x00000000),
    _mm_set_epi32(0x00800000, 0x00000000, 0x00000000, 0x00000000),
    _mm_set_epi32(0x01000000, 0x00000000, 0x00000000, 0x00000000),
    _mm_set_epi32(0x02000000, 0x00000000, 0x00000000, 0x00000000),
    _mm_set_epi32(0x04000000, 0x00000000, 0x00000000, 0x00000000),
    _mm_set_epi32(0x08000000, 0x00000000, 0x00000000, 0x00000000),
    _mm_set_epi32(0x10000000, 0x00000000, 0x00000000, 0x00000000),
    _mm_set_epi32(0x20000000, 0x00000000, 0x00000000, 0x00000000),
    _mm_set_epi32(0x40000000, 0x00000000, 0x00000000, 0x00000000),
    _mm_set_epi32(0x80000000, 0x00000000, 0x00000000, 0x00000000)
  };

   // to check with look-up table
   // The table entry must be indexed by n, the bit being tested, so that it
   // agrees with movemask (also derived from n). ANDing v with the single-bit
   // entry leaves at most byte (n >> 3) non-zero; that byte compares "not
   // equal to zero" exactly when bit n is set, so isSet ends up non-zero
   // (equal to movemask) for a set bit and 0 for a clear bit.
   movemask = (1 << (n >> 3));
   isSet    = (_mm_movemask_epi8(_mm_cmpeq_epi8(_mm_and_si128(v, _mm_load_si128(lut + n)), _mm_setzero_si128())) & movemask) ^ movemask;

   // to set with look-up table
   // OR the single-bit entry for bit m into the bitfield.
   v = _mm_or_si128(v, _mm_load_si128(lut + m));

OTHER TIPS

For what it's worth, here is a variation I came up with for testing a bit. If the mask and the `one` register can be precomputed, then this needs only three intrinsics.

For setting single bits I don't think there is an efficient way. Here is a discussion on going from a movemask back to an SSE register: "How to perform the inverse of _mm256_movemask_epi8 (VPMOVMSKB)?"

#include <emmintrin.h>
#include <stdio.h>
/* Demonstrates an SSE2 single-bit test: after masking, subtracting 1 from
 * every byte turns each zero byte into 0xFF (sign bit set) and the one
 * single-bit byte into a value with its sign bit clear, so the byte sign
 * mask differs from 0xffff exactly when the tested bit is set. */
int main() {
    const __m128i bits  = _mm_setr_epi32(0, 0, 0, 1);
    const __m128i probe = _mm_setr_epi32(0, 0, 0, 1);
    const __m128i ones  = _mm_set1_epi8(1);

    const __m128i masked = _mm_and_si128(bits, probe);
    const int     signs  = _mm_movemask_epi8(_mm_sub_epi8(masked, ones));

    int isSet = (signs != 0xffff);
    printf("%X\n", isSet);
}

Edit: actually, there is a faster way to check a bit with SSE4.1, using _mm_testz_si128.

#include <smmintrin.h>
#include <stdio.h>

/* Demonstrates an SSE4.1 single-bit test with _mm_testz_si128. */
int main() {
    __m128i x = _mm_setr_epi32(0,0,0,1);
    __m128i mask = _mm_setr_epi32(0,0,0,1);

    /* _mm_testz_si128(a, b) returns 1 when (a & b) is all zero, so x and
     * mask can be passed directly; a separate _mm_and_si128 is redundant. */
    int isSet = !_mm_testz_si128(x, mask);

    printf("%d\n", isSet);
    return 0;
}

There are no instructions for setting individual bits in __m128i.

You can try using the general-purpose BTS instruction, but it will probably be slower than making a mask, because it can only write to memory (or to 32-bit registers, which doesn't help).

Licensed under: CC-BY-SA with attribution
Not affiliated with StackOverflow
scroll top