Fast popcount on Intel Xeon Phi

Question 1

Since posting yesterday, I have been able to run your code and my suggestion on my own card. I do not get exactly the same timings as you, probably due to the stepping of my hardware, and perhaps related to the versions of my compiler. But the trend holds up, and my suggestion seemed to attain about a fifteen percent performance boost.

I got an additional small performance boost, between five and ten percent, with a little more tweaking, as shown in the code below. Please note that in the following code snippet, B6 has each element set to 0x000000FF. At this point, I think the algorithm might be pressing up pretty close to the maximum sustainable bandwidth deliverable from GDDR to the L2 cache.

(ADDED NOTE: A testament to this assertion is that if I wrap the body of the popcount5 function with a for loop that repeats it ten times -- and note that this is ten rapid repetitions of the "chunk_size" of input data, so nine of the times it will be sizzling hot in L2 -- the total time for the test increases by only a factor of about five, rather than ten. I bring this up because I THINK your goal is to tune the speed of the bit count logic, but perhaps the application in which you hope to deploy it actually has a smaller and/or hotter working set. If that is so, throttling introduced by DRAM-->L2 bandwidth is fogging the picture. But note that ratcheting back the size of your test input so that it tends to stay hotter in L2 seems to cause other overhead -- probably the openmp overhead -- to become relatively more significant.)

inline uint64_t vpu_popcount5(uint64_t* buf, size_t n)  {
    _mm_prefetch((const char *)&buf[0], _MM_HINT_T0); // vprefetch0
    _mm_prefetch((const char *)&buf[8], _MM_HINT_T0); // vprefetch0
    _mm_prefetch((const char *)&buf[16], _MM_HINT_T1); // vprefetch1
    _mm_prefetch((const char *)&buf[24], _MM_HINT_T1); // vprefetch1
    _mm_prefetch((const char *)&buf[32], _MM_HINT_T1); // vprefetch1
    _mm_prefetch((const char *)&buf[40], _MM_HINT_T1); // vprefetch1
    _mm_prefetch((const char *)&buf[48], _MM_HINT_T1); // vprefetch1
    _mm_prefetch((const char *)&buf[56], _MM_HINT_T1); // vprefetch1
    _mm_prefetch((const char *)&buf[64], _MM_HINT_T1); // vprefetch1
    _mm_prefetch((const char *)&buf[72], _MM_HINT_T1); // vprefetch1
    _mm_prefetch((const char *)&buf[80], _MM_HINT_T1); // vprefetch1
    _mm_prefetch((const char *)&buf[88], _MM_HINT_T1); // vprefetch1
    _mm_prefetch((const char *)&buf[96], _MM_HINT_T1); // vprefetch1
    _mm_prefetch((const char *)&buf[104], _MM_HINT_T1); // vprefetch1
    _mm_prefetch((const char *)&buf[112], _MM_HINT_T1); // vprefetch1
    _mm_prefetch((const char *)&buf[120], _MM_HINT_T1); // vprefetch1
    _mm_prefetch((const char *)&buf[128], _MM_HINT_T1); // vprefetch1
    _mm_prefetch((const char *)&buf[136], _MM_HINT_T1); // vprefetch1
    _mm_prefetch((const char *)&buf[144], _MM_HINT_T1); // vprefetch1
    _mm_prefetch((const char *)&buf[152], _MM_HINT_T1); // vprefetch1
    _mm_prefetch((const char *)&buf[160], _MM_HINT_T1); // vprefetch1
    _mm_prefetch((const char *)&buf[168], _MM_HINT_T1); // vprefetch1
    _mm_prefetch((const char *)&buf[176], _MM_HINT_T1); // vprefetch1
    _mm_prefetch((const char *)&buf[184], _MM_HINT_T1); // vprefetch1
    register size_t result;
    size_t i;

    register const __m512i B0 = _mm512_load_epi32((void*)(magic+0));
    register const __m512i B1 = _mm512_load_epi32((void*)(magic+16));
    register const __m512i B2 = _mm512_load_epi32((void*)(magic+32));
    register const __m512i B6 = _mm512_load_epi32((void*)(magic+80));
    register __m512i total0;
    register __m512i total1;
    register __m512i total2;
    register __m512i total3;
    register __m512i shuf0;
    register __m512i shuf1;
    register __m512i shuf2;
    register __m512i shuf3;
    register __m512i result0;
    register __m512i result1;

    result0 = _mm512_setzero_epi32();
    result1 = _mm512_setzero_epi32();

    for (i = 0; i < n; i+=32) {
        shuf0 = _mm512_load_epi32(&buf[i   ]);
        shuf1 = _mm512_load_epi32(&buf[i+ 8]);
        shuf2 = _mm512_load_epi32(&buf[i+16]);
        shuf3 = _mm512_load_epi32(&buf[i+24]);
        _mm_prefetch((const char *)&buf[i+192], _MM_HINT_T1); // vprefetch1
        _mm_prefetch((const char *)&buf[i+200], _MM_HINT_T1); // vprefetch1
        _mm_prefetch((const char *)&buf[i+208], _MM_HINT_T1); // vprefetch1
        _mm_prefetch((const char *)&buf[i+216], _MM_HINT_T1); // vprefetch1
        _mm_prefetch((const char *)&buf[i+32], _MM_HINT_T0); // vprefetch0
        _mm_prefetch((const char *)&buf[i+40], _MM_HINT_T0); // vprefetch0
        _mm_prefetch((const char *)&buf[i+48], _MM_HINT_T0); // vprefetch0
        _mm_prefetch((const char *)&buf[i+56], _MM_HINT_T0); // vprefetch0
        total0 = _mm512_sub_epi32(shuf0, _mm512_and_epi32(B0, _mm512_srli_epi32(shuf0,1)));                        //  max value in nn is 10
        total1 = _mm512_sub_epi32(shuf1, _mm512_and_epi32(B0, _mm512_srli_epi32(shuf1,1)));
        total2 = _mm512_sub_epi32(shuf2, _mm512_and_epi32(B0, _mm512_srli_epi32(shuf2,1)));
        total3 = _mm512_sub_epi32(shuf3, _mm512_and_epi32(B0, _mm512_srli_epi32(shuf3,1)));
        total0 = _mm512_add_epi32(_mm512_and_epi32(B1, total0), _mm512_and_epi32(B1,_mm512_srli_epi32(total0,2))); //  max value in nnnn is 0100
        total1 = _mm512_add_epi32(_mm512_and_epi32(B1, total1), _mm512_and_epi32(B1,_mm512_srli_epi32(total1,2)));
        total2 = _mm512_add_epi32(_mm512_and_epi32(B1, total2), _mm512_and_epi32(B1,_mm512_srli_epi32(total2,2)));
        total3 = _mm512_add_epi32(_mm512_and_epi32(B1, total3), _mm512_and_epi32(B1,_mm512_srli_epi32(total3,2)));
        total0 = _mm512_and_epi32(B2, _mm512_add_epi32(total0, _mm512_srli_epi32(total0,4)));                      //  max value in 0000nnnn is 00001000
        total1 = _mm512_and_epi32(B2, _mm512_add_epi32(total1, _mm512_srli_epi32(total1,4)));
        total2 = _mm512_and_epi32(B2, _mm512_add_epi32(total2, _mm512_srli_epi32(total2,4)));
        total3 = _mm512_and_epi32(B2, _mm512_add_epi32(total3, _mm512_srli_epi32(total3,4)));
        total0 = _mm512_add_epi32(total0, total1);                                                                 //  max value in 000nnnnn is 00010000
        total1 = _mm512_add_epi32(total2, total3);
        total0 = _mm512_add_epi32(total0, _mm512_srli_epi32(total0,8));                                            //  max value in xxxxxxxx00nnnnnn is 00100000
        total1 = _mm512_add_epi32(total1, _mm512_srli_epi32(total1,8));
        total0 = _mm512_and_epi32(B6, _mm512_add_epi32(total0, _mm512_srli_epi32(total0,16)));                     //  max value in each element is 01000000, i.e. 64
        total1 = _mm512_and_epi32(B6, _mm512_add_epi32(total1, _mm512_srli_epi32(total1,16)));
        result0 = _mm512_add_epi32(result0,total0);
        result1 = _mm512_add_epi32(result1,total1);

        /* Reduce add, which is analogous to SSSE3's PSADBW instruction,
           is not implementated as a single instruction in VPUv1, thus
           emulated by multiple instructions*/
    }

    result0 = _mm512_add_epi32(result0,result1);
    result  = _mm512_reduce_add_epi32(result0);

    return result;
}

Question 2

Will you please try the following variant, and report back whether this improves performance for you? I am addressing several points that I think aren't quite optimal in your coding:

I don't think your prefetching distances were quite right. It looked to me like you might have been thinking of byte offset distances when the indexing was really in terms of uint64's.
I see no reason to do the reduction operation every iteration of the loop. You can do partial accumulations of bit counts in the 16 SIMD elements, and then do a single reduction outside the loop.
I do not think that doing scalar-side popcount instructions is as advantageous as really getting the best of VPU scheduling. Focusing on an excellent VPU schedule is of highest importance. I also do not think that the scalar popcount instruction actually pairs with a vector operation; i.e. I think it is supported only in the U-pipe.

inline uint64_t vpu_popcount3_revised(uint64_t* buf, size_t n) {
    _mm_prefetch((const char *)&buf[0], _MM_HINT_T0); // vprefetch0
    _mm_prefetch((const char *)&buf[8], _MM_HINT_T0); // vprefetch0
    _mm_prefetch((const char *)&buf[16], _MM_HINT_T1); // vprefetch1
    _mm_prefetch((const char *)&buf[24], _MM_HINT_T1); // vprefetch1
    _mm_prefetch((const char *)&buf[32], _MM_HINT_T1); // vprefetch1
    _mm_prefetch((const char *)&buf[40], _MM_HINT_T1); // vprefetch1
    _mm_prefetch((const char *)&buf[48], _MM_HINT_T1); // vprefetch1
    _mm_prefetch((const char *)&buf[56], _MM_HINT_T1); // vprefetch1
    _mm_prefetch((const char *)&buf[64], _MM_HINT_T1); // vprefetch1
    _mm_prefetch((const char *)&buf[72], _MM_HINT_T1); // vprefetch1
    _mm_prefetch((const char *)&buf[80], _MM_HINT_T1); // vprefetch1
    _mm_prefetch((const char *)&buf[88], _MM_HINT_T1); // vprefetch1
    _mm_prefetch((const char *)&buf[96], _MM_HINT_T1); // vprefetch1
    _mm_prefetch((const char *)&buf[104], _MM_HINT_T1); // vprefetch1
    _mm_prefetch((const char *)&buf[112], _MM_HINT_T1); // vprefetch1
    _mm_prefetch((const char *)&buf[120], _MM_HINT_T1); // vprefetch1
    register size_t result;
    size_t i;

    register const __m512i B0 = _mm512_load_epi32((void*)(magic+0));
    register const __m512i B1 = _mm512_load_epi32((void*)(magic+16));
    register const __m512i B2 = _mm512_load_epi32((void*)(magic+32));
    register const __m512i B3 = _mm512_load_epi32((void*)(magic+48));
    register const __m512i B4 = _mm512_load_epi32((void*)(magic+64));
    register __m512i total0;
    register __m512i total1;
    register __m512i shuf0;
    register __m512i shuf1;
    register __m512i result0;
    register __m512i result1;

    result0 = _mm512_setzero_epi32();
    result1 = _mm512_setzero_epi32();

    for (i = 0; i < n; i+=16) {
        shuf0 = _mm512_load_epi32(&buf[i  ]);
        shuf1 = _mm512_load_epi32(&buf[i+8]);
        _mm_prefetch((const char *)&buf[i+128], _MM_HINT_T1); // vprefetch1
        _mm_prefetch((const char *)&buf[i+136], _MM_HINT_T1); // vprefetch1
        _mm_prefetch((const char *)&buf[i+16], _MM_HINT_T0); // vprefetch0
        _mm_prefetch((const char *)&buf[i+24], _MM_HINT_T0); // vprefetch0
        total0 = _mm512_sub_epi32(shuf0, _mm512_and_epi32(B0, _mm512_srli_epi32(shuf0,1)));
        total1 = _mm512_sub_epi32(shuf1, _mm512_and_epi32(B0, _mm512_srli_epi32(shuf1,1)));
        total0 = _mm512_add_epi32(_mm512_and_epi32(B1, total0), _mm512_and_epi32(B1,_mm512_srli_epi32(total0,2)));
        total1 = _mm512_add_epi32(_mm512_and_epi32(B1, total1), _mm512_and_epi32(B1,_mm512_srli_epi32(total1,2)));
        total0 = _mm512_and_epi32(B2, _mm512_add_epi32(total0, _mm512_srli_epi32(total0,4)));
        total1 = _mm512_and_epi32(B2, _mm512_add_epi32(total1, _mm512_srli_epi32(total1,4)));
        total0 = _mm512_and_epi32(B3, _mm512_add_epi32(total0, _mm512_srli_epi32(total0,8)));
        total1 = _mm512_and_epi32(B3, _mm512_add_epi32(total1, _mm512_srli_epi32(total1,8)));
        total0 = _mm512_and_epi32(B4, _mm512_add_epi32(total0, _mm512_srli_epi32(total0,16)));
        total1 = _mm512_and_epi32(B4, _mm512_add_epi32(total1, _mm512_srli_epi32(total1,16)));
        result0 = _mm512_add_epi32(result0,total0);
        result1 = _mm512_add_epi32(result1,total1);

    }

    /* Reduce add, which is analogous to SSSE3's PSADBW instruction,
       is not implementated as a single instruction in VPUv1, thus
       emulated by multiple instructions*/

    result0 = _mm512_add_epi32(result0,result1);
    result  = _mm512_reduce_add_epi32(result0);

    return result;
}