Question

Continuing on from my first question, I am trying to optimize a memory hotspot found via VTune profiling a 64-bit C program.

In particular, I'd like to find the fastest way to test if a 128-byte block of memory contains all zeros. You may assume any desired memory alignment for the memory block; I used 64-byte alignment.

I am using a PC with an Intel Ivy Bridge Core i7 3770 processor with 32 GB of memory and the free version of the Microsoft VS2010 C compiler.

My first attempt was:

const char* bytevecM;    // 4 GB block of memory, 64-byte aligned
size_t* psz;             // size_t is 64-bits
// ...
// "m7 & 0xffffff80" selects the 128 byte block to test for all zeros
psz = (size_t*)&bytevecM[(unsigned int)m7 & 0xffffff80];
if (psz[0]  == 0 && psz[1]  == 0
&&  psz[2]  == 0 && psz[3]  == 0
&&  psz[4]  == 0 && psz[5]  == 0
&&  psz[6]  == 0 && psz[7]  == 0
&&  psz[8]  == 0 && psz[9]  == 0
&&  psz[10] == 0 && psz[11] == 0
&&  psz[12] == 0 && psz[13] == 0
&&  psz[14] == 0 && psz[15] == 0) continue;
// ...

VTune profiling of the corresponding assembly follows:

cmp    qword ptr [rax],      0x0       0.171s
jnz    0x14000222                     42.426s
cmp    qword ptr [rax+0x8],  0x0       0.498s
jnz    0x14000222                      0.358s
cmp    qword ptr [rax+0x10], 0x0       0.124s
jnz    0x14000222                      0.031s
cmp    qword ptr [rax+0x18], 0x0       0.171s
jnz    0x14000222                      0.031s
cmp    qword ptr [rax+0x20], 0x0       0.233s
jnz    0x14000222                      0.560s
cmp    qword ptr [rax+0x28], 0x0       0.498s
jnz    0x14000222                      0.358s
cmp    qword ptr [rax+0x30], 0x0       0.140s
jnz    0x14000222
cmp    qword ptr [rax+0x38], 0x0       0.124s
jnz    0x14000222
cmp    qword ptr [rax+0x40], 0x0       0.156s
jnz    0x14000222                      2.550s
cmp    qword ptr [rax+0x48], 0x0       0.109s
jnz    0x14000222                      0.124s
cmp    qword ptr [rax+0x50], 0x0       0.078s
jnz    0x14000222                      0.016s
cmp    qword ptr [rax+0x58], 0x0       0.078s
jnz    0x14000222                      0.062s
cmp    qword ptr [rax+0x60], 0x0       0.093s
jnz    0x14000222                      0.467s
cmp    qword ptr [rax+0x68], 0x0       0.047s
jnz    0x14000222                      0.016s
cmp    qword ptr [rax+0x70], 0x0       0.109s
jnz    0x14000222                      0.047s
cmp    qword ptr [rax+0x78], 0x0       0.093s
jnz    0x14000222                      0.016s

I was able to improve on that via Intel intrinsics:

const char* bytevecM;                        // 4 GB block of memory
__m128i* psz;                                // __m128i is 128-bits
__m128i one = _mm_set1_epi32(0xffffffff);    // all bits one
// ...
psz = (__m128i*)&bytevecM[(unsigned int)m7 & 0xffffff80];
if (_mm_testz_si128(psz[0], one) && _mm_testz_si128(psz[1], one)
&&  _mm_testz_si128(psz[2], one) && _mm_testz_si128(psz[3], one)
&&  _mm_testz_si128(psz[4], one) && _mm_testz_si128(psz[5], one)
&&  _mm_testz_si128(psz[6], one) && _mm_testz_si128(psz[7], one)) continue;
// ...

VTune profiling of the corresponding assembly follows:

movdqa xmm0, xmmword ptr [rax]         0.218s
ptest  xmm0, xmm2                     35.425s
jnz    0x14000ddd                      0.700s
movdqa xmm0, xmmword ptr [rax+0x10]    0.124s
ptest  xmm0, xmm2                      0.078s
jnz    0x14000ddd                      0.218s
movdqa xmm0, xmmword ptr [rax+0x20]    0.155s
ptest  xmm0, xmm2                      0.498s
jnz    0x14000ddd                      0.296s
movdqa xmm0, xmmword ptr [rax+0x30]    0.187s
ptest  xmm0, xmm2                      0.031s
jnz    0x14000ddd
movdqa xmm0, xmmword ptr [rax+0x40]    0.093s
ptest  xmm0, xmm2                      2.162s
jnz    0x14000ddd                      0.280s
movdqa xmm0, xmmword ptr [rax+0x50]    0.109s
ptest  xmm0, xmm2                      0.031s
jnz    0x14000ddd                      0.124s
movdqa xmm0, xmmword ptr [rax+0x60]    0.109s
ptest  xmm0, xmm2                      0.404s
jnz    0x14000ddd                      0.124s
movdqa xmm0, xmmword ptr [rax+0x70]    0.093s
ptest  xmm0, xmm2                      0.078s
jnz    0x14000ddd                      0.016s

As you can see, there are fewer assembly instructions, and this version also proved faster in timing tests.

Since I am quite weak in the area of Intel SSE/AVX instructions, I welcome advice on how they might be better employed to speed up this code.

Though I scoured the hundreds of available intrinsics, I may have missed the ideal ones. In particular, I was unable to effectively employ _mm_cmpeq_epi64(); I looked for a "not equal" version of this intrinsic (which seems better suited to this problem) but came up dry. Though the code below "works":

if (_mm_testz_si128(_mm_andnot_si128(_mm_cmpeq_epi64(psz[7], _mm_andnot_si128(_mm_cmpeq_epi64(psz[6], _mm_andnot_si128(_mm_cmpeq_epi64(psz[5], _mm_andnot_si128(_mm_cmpeq_epi64(psz[4], _mm_andnot_si128(_mm_cmpeq_epi64(psz[3], _mm_andnot_si128(_mm_cmpeq_epi64(psz[2], _mm_andnot_si128(_mm_cmpeq_epi64(psz[1], _mm_andnot_si128(_mm_cmpeq_epi64(psz[0], zero), one)), one)), one)), one)), one)), one)), one)), one), one)) continue;

it is borderline unreadable and (unsurprisingly) proved to be way slower than the two versions given above. I feel sure there must be a more elegant way to employ _mm_cmpeq_epi64() and welcome advice on how that might be achieved.
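
(For the record, the nearest I can get to an inverted test is to compare each vector against zero and extract a byte mask, as in the untimed sketch below; it may well still lose to _mm_testz_si128().)

const __m128i zero128 = _mm_setzero_si128();
// _mm_cmpeq_epi64() sets a 64-bit lane to all-ones when that lane equals zero,
// so a byte mask of 0xffff means the whole 16-byte vector is zero
if (_mm_movemask_epi8(_mm_cmpeq_epi64(psz[0], zero128)) == 0xffff
&&  _mm_movemask_epi8(_mm_cmpeq_epi64(psz[1], zero128)) == 0xffff
&&  _mm_movemask_epi8(_mm_cmpeq_epi64(psz[2], zero128)) == 0xffff
&&  _mm_movemask_epi8(_mm_cmpeq_epi64(psz[3], zero128)) == 0xffff
&&  _mm_movemask_epi8(_mm_cmpeq_epi64(psz[4], zero128)) == 0xffff
&&  _mm_movemask_epi8(_mm_cmpeq_epi64(psz[5], zero128)) == 0xffff
&&  _mm_movemask_epi8(_mm_cmpeq_epi64(psz[6], zero128)) == 0xffff
&&  _mm_movemask_epi8(_mm_cmpeq_epi64(psz[7], zero128)) == 0xffff) continue;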

In addition to using intrinsics from C, raw Intel assembly language solutions to this problem are also welcome.

Solution

The main problem, as others have pointed out, is that the 128 bytes of data you are checking are missing the data cache and/or the TLB and going to DRAM, which is slow. VTune is telling you this:

cmp    qword ptr [rax],      0x0       0.171s
jnz    0x14000222                     42.426s

You have another, smaller hotspot half-way down:

cmp    qword ptr [rax+0x40], 0x0       0.156s
jnz    0x14000222                      2.550s

The 42.4 + 2.5 seconds attributed to the JNZ instructions are really a stall caused by the preceding load from memory: over the time you profiled the program, the processor sat around doing nothing for about 45 seconds in total, waiting on DRAM.

You might ask what the 2nd hotspot half-way down is all about. Well, you are accessing 128 bytes while cache lines are 64 bytes, so the CPU started prefetching for you as soon as it read the first 64 bytes... but you didn't do enough work on the first 64 bytes to completely overlap the latency of going to memory.

The memory bandwidth of Ivy Bridge is very high (it depends on your system, but I'm guessing over 10 GB/sec). Your block of memory is 4 GB; you should be able to zip through it in less than 1 second if you access it sequentially and let the CPU prefetch data ahead for you.

My guess is you are thwarting the CPU data prefetcher by accessing the 128-byte blocks in a non-contiguous fashion.

Change your access pattern to be sequential and you'll be surprised how much faster it runs. You can then worry about the next level of optimization, which will be making sure the branch prediction works well.
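
For example, a sequential sweep over the whole buffer, testing each 128-byte block in address order, looks roughly like the sketch below (it assumes the surrounding algorithm can be reorganized to visit the blocks sequentially):

// sketch: visit the 4 GB buffer in address order so the hardware
// prefetchers can stay ahead of the loads
const __m128i *p = (const __m128i *)bytevecM;
size_t nblocks = (size_t)4 * 1024 * 1024 * 1024 / 128;   // number of 128-byte blocks
size_t i;
for (i = 0; i < nblocks; i++, p += 8) {
    __m128i acc = _mm_or_si128(_mm_or_si128(p[0], p[1]),
                               _mm_or_si128(p[2], p[3]));
    acc = _mm_or_si128(acc, _mm_or_si128(_mm_or_si128(p[4], p[5]),
                                         _mm_or_si128(p[6], p[7])));
    if (_mm_testz_si128(acc, acc))
        continue;                 // block i is all zeros
    // ... handle the non-zero block ...
}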

Another thing to consider is TLB misses. Those are costly, especially in a 64-bit system. Rather than using 4 KB pages, consider using 2 MB huge pages. See this link for Windows support: Large-Page Support (Windows)
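
On Windows that comes down to VirtualAlloc with MEM_LARGE_PAGES, roughly as sketched below (the allocation size must be a multiple of the large-page size, and the account needs the "Lock pages in memory" / SeLockMemoryPrivilege right, otherwise the call fails):

#include <windows.h>

SIZE_T large  = GetLargePageMinimum();   // typically 2 MB; 0 if large pages are unsupported
SIZE_T fourGB = (SIZE_T)4 << 30;
char *bytevecM = NULL;

if (large != 0) {
    // round the request up to a multiple of the large-page size
    SIZE_T size = (fourGB + large - 1) & ~(large - 1);
    bytevecM = (char *)VirtualAlloc(NULL, size,
                                    MEM_RESERVE | MEM_COMMIT | MEM_LARGE_PAGES,
                                    PAGE_READWRITE);
}
if (bytevecM == NULL)   // fall back to normal 4 KB pages
    bytevecM = (char *)VirtualAlloc(NULL, fourGB,
                                    MEM_RESERVE | MEM_COMMIT, PAGE_READWRITE);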

If you must access the 4 GB of data in a somewhat random fashion, but you know the sequence of m7 values (your index) ahead of time, then you can pipeline the memory fetching explicitly ahead of its use with software prefetch (the prefetch needs to be issued several hundred CPU cycles before the data is used to be effective).
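
With SSE that is just the _mm_prefetch() intrinsic. A rough sketch, where m7_ahead is a hypothetical name for the index you already know you will test a few iterations from now, and where the prefetch distance is something you would have to tune:

#include <xmmintrin.h>   // _mm_prefetch(), _MM_HINT_T0

// m7_ahead is hypothetical: the index whose block you will test several
// hundred cycles (a few loop iterations) from now
_mm_prefetch(&bytevecM[(unsigned int)m7_ahead & 0xffffff80],      _MM_HINT_T0);
_mm_prefetch(&bytevecM[(unsigned int)m7_ahead & 0xffffff80] + 64, _MM_HINT_T0);  // second cache line of the 128B block
// ... some iterations later the existing test runs on this block,
//     hopefully finding both of its cache lines already in L1 ...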

Here are some links that might be helpful in general on the subject of memory optimization:

What Every Programmer Should Know About Memory by Ulrich Drepper

http://www.akkadia.org/drepper/cpumemory.pdf

Machine Architecture: Things Your Programming Language Never Told You, by Herb Sutter

http://www.gotw.ca/publications/concurrency-ddj.htm

http://nwcpp.org/static/talks/2007/Machine_Architecture_-_NWCPP.pdf

http://video.google.com/videoplay?docid=-4714369049736584770#

OTHER TIPS

Sorry for posting this as an answer; I don't have enough reputation to comment.
What happens if you use the following as a test?

if( (psz[0]  | psz[1]  | psz[2]  | psz[3]  |
     psz[4]  | psz[5]  | psz[6]  | psz[7]  |
     psz[8]  | psz[9]  | psz[10] | psz[11] |
     psz[12] | psz[13] | psz[14] | psz[15] ) == 0) continue;

Unfortunately, I don't have a 64-bit system on which to compile it, and I am unfamiliar with what exactly the compiler does with C code, but it would seem to me that a chain of bitwise ORs would be faster than the individual == comparisons. I also don't know what Intel intrinsics are, but it may be possible to optimize the above code in a similar manner to what you have already done.
I hope my answer helps.
Mmarss

With 98% of the 128-byte blocks being all zero, you're averaging less than one nonzero byte per 4 KB page. With an array that sparse, have you tried storing it as a sparse array? You'll save huge swathes of memory and the attendant cache-miss delays; I wouldn't be surprised if a plain std::map turned out faster.
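
(As a rough C-flavoured illustration of the same idea, swapping the map for a simple occupancy bitmap: one bit per 128-byte block costs only 4 MB and turns the test into a single, cache-friendly bit check. This assumes you can update the bitmap wherever the array is written.)

unsigned int *blockmap;   // calloc((size_t)1 << 20, sizeof(unsigned int)): 4 MB, one bit per 128-byte block
// ...
unsigned int blk = ((unsigned int)m7 & 0xffffff80) >> 7;        // block number, 0 .. 2^25-1
if ((blockmap[blk >> 5] & (1u << (blk & 31))) == 0) continue;   // block is all zeros
// ...
// wherever the program stores a non-zero byte at offset ofs:
//     blockmap[(ofs >> 7) >> 5] |= 1u << ((ofs >> 7) & 31);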

Have you considered the Intel string scan instructions? These tend to have very high data rates, and the processor knows the data access is sequential.

     mov      rdi, <blockaddress>
     cld
     xor      rax, rax
     mov      rcx, 128/8
     repe     scasq
     jne      ...

This won't help the problem of your data not being in cache. You might fix that by using Intel's prefetch instruction if you know which chunk you want to consider well in advance. See http://software.intel.com/en-us/articles/use-software-data-prefetch-on-32-bit-intel-architecture

[EDIT: code updated to integrate minor hiccups pointed out in the comments]

Thanks for the excellent tips received so far.

I felt confident that the Mmarss "mega or" approach would improve performance because it generated fewer assembly language instructions. However, when I ran my benchmark program, it took 163 seconds versus 150 seconds for my original clunky && solution and 145 seconds for my original clunky Intel intrinsics solution (these two are described in my original post).

For completeness, here is the C code I used for the "mega or" approach:

if ((psz[0]  | psz[1]  | psz[2]  | psz[3]
|    psz[4]  | psz[5]  | psz[6]  | psz[7]
|    psz[8]  | psz[9]  | psz[10] | psz[11]
|    psz[12] | psz[13] | psz[14] | psz[15]) == 0) continue;

The VTune assembly was:

mov    rax, qword ptr [rcx+0x78]    0.155s
or     rax, qword ptr [rcx+0x70]   80.972s
or     rax, qword ptr [rcx+0x68]    1.292s
or     rax, qword ptr [rcx+0x60]    0.311s
or     rax, qword ptr [rcx+0x58]    0.249s
or     rax, qword ptr [rcx+0x50]    1.229s
or     rax, qword ptr [rcx+0x48]    0.187s
or     rax, qword ptr [rcx+0x40]    0.233s
or     rax, qword ptr [rcx+0x38]    0.218s
or     rax, qword ptr [rcx+0x30]    1.742s
or     rax, qword ptr [rcx+0x28]    0.529s
or     rax, qword ptr [rcx+0x20]    0.233s
or     rax, qword ptr [rcx+0x18]    0.187s
or     rax, qword ptr [rcx+0x10]    1.244s
or     rax, qword ptr [rcx+0x8]     0.155s
or     rax, qword ptr [rcx]         0.124s
jz     0x1400070b9                  0.342s

I then tried translating the "mega or" idea to Intel intrinsics via:

__m128i tt7;
// ...
tt7 = _mm_or_si128(_mm_or_si128(_mm_or_si128(psz[0], psz[1]),
      _mm_or_si128(psz[2], psz[3])),
      _mm_or_si128(_mm_or_si128(psz[4], psz[5]),
      _mm_or_si128(psz[6], psz[7])));
if ( (tt7.m128i_i64[0] | tt7.m128i_i64[1]) == 0) continue;

though that also turned out to be slower, taking 155 seconds. Its VTune assembly was:

movdqa xmm2, xmmword ptr [rax]         0.047s
movdqa xmm0, xmmword ptr [rax+0x20]   75.461s
movdqa xmm1, xmmword ptr [rax+0x40]    2.567s
por    xmm0, xmmword ptr [rax+0x30]    1.867s
por    xmm2, xmmword ptr [rax+0x10]    0.078s
por    xmm1, xmmword ptr [rax+0x50]    0.047s
por    xmm2, xmm0                      0.684s
movdqa xmm0, xmmword ptr [rax+0x60]    0.093s
por    xmm0, xmmword ptr [rax+0x70]    1.214s
por    xmm1, xmm0                      0.420s
por    xmm2, xmm1                      0.109s
movdqa xmmword ptr tt7$[rsp], xmm2     0.140s
mov    rax, qword ptr [rsp+0x28]       0.233s
or     rax, qword ptr [rsp+0x20]       1.027s
jz     0x1400070e2                     0.498s

The Intel intrinsics approach above is pretty crude. Suggestions for improving it are welcome.
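
(One cleanup, which the next answer also arrives at, is to test the ORed vector directly with _mm_testz_si128() instead of storing it and reloading the two halves; an untimed sketch:)

tt7 = _mm_or_si128(_mm_or_si128(_mm_or_si128(psz[0], psz[1]),
      _mm_or_si128(psz[2], psz[3])),
      _mm_or_si128(_mm_or_si128(psz[4], psz[5]),
      _mm_or_si128(psz[6], psz[7])));
// _mm_testz_si128(v, v) is non-zero exactly when v is all zeros, so the store
// to tt7$[rsp] and the two scalar loads disappear
if (_mm_testz_si128(tt7, tt7)) continue;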

This shows yet again how important it is to measure: almost every time I've guessed which approach would be faster, I've been wrong. That said, so long as you carefully measure each change, you can't get any worse, you can only improve. Though I've gone backwards (as above) more often than forwards, over the past week I've been able to reduce the running time of the little test program from 221 seconds down to 145. Given that the real program will be running for months, that will save days.

Suggestion: align your array to 128B, so the spatial prefetcher will always be pulling in the cache line that completes the 128B pair you are about to read. From the Intel optimization manual, page 2-30 (page 60 of the PDF), describing Sandybridge/Ivybridge:

Spatial Prefetcher: This prefetcher strives to complete every cache line fetched to the L2 cache with the pair line that completes it to a 128-byte aligned chunk.

With your array only aligned to 64B, reading 128B can touch two pairs of cache lines, leading the L2 spatial prefetcher to issue more loads for data you'll probably never use.
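
(With MSVC that can be as simple as _aligned_malloc from <malloc.h>; a VirtualAlloc allocation is page-aligned and therefore already 128B-aligned. A minimal sketch:)

#include <malloc.h>   // _aligned_malloc / _aligned_free (MSVC)

// 128-byte alignment keeps every 128B block inside one aligned pair of cache lines
char *bytevecM = (char *)_aligned_malloc((size_t)4 << 30, 128);
// ...
_aligned_free(bytevecM);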


Your answer has the right idea: OR the block together with vectors, then test that for all-zero. Using a single branch is probably better than branching separately on every 8 bytes.

But your strategy for testing a vector sucks: don't store it and then scalar load+OR both halves. This is a perfect use-case for SSE4 PTEST, which lets us avoid the usual pcmpeqb / pmovmskb:

ptest   xmm0,xmm0      ; 2 uops, and Agner Fog lists it as 1c latency for SnB/IvB, but this is probably bogus.  2c is more likely
jz    vector_is_all_zero
; 3 uops, but shorter latency and smaller code-size than pmovmskb

Normally branches predict well, and latency to generate their flag inputs isn't important. But in this case, the main bottleneck is branch mispredicts. So it's probably worth it to spend more uops (if necessary) to reduce latency.


I'm not sure whether it's better to test the first cache line before loading the second cache line, in case you find a non-zero byte without suffering the second cache miss. The spatial-prefetcher can't get the second cache line loaded instantly, so probably try an early-out before loading the second 64B cache line, unless that leads to a lot of extra branch mispredicts.

So I might do:

int allzero_128B(const char *buf)
{
    const __m128i *buf128 = (const __m128i*)buf;  // dereferencing produces 128b aligned-load instructions

    __m128i or0 = _mm_or_si128(buf128[0], buf128[2]);
    __m128i or2 = _mm_or_si128(buf128[1], buf128[3]);
    __m128i first64 = _mm_or_si128(or0, or2);
    // A chain of load + 3 OR instructions would be fewer fused-domain uops
    //  than load+or, load+or, or(xmm,xmm).  But resolving the branch faster is probably the most important thing.

    if (!_mm_testz_si128(first64, first64))   // non-zero bits in the first cache line: bail out early
        return 0;

    __m128i or4 = _mm_or_si128(buf128[4], buf128[6]);
    __m128i or6 = _mm_or_si128(buf128[5], buf128[7]);
    __m128i second64 = _mm_or_si128(or4, or6);

    return _mm_testz_si128(second64, second64);   // 1 if the whole 128B block is all zero
}

On IvyBridge, there's not much, if anything, to gain from using 256b AVX ops. Vector-FP 256b VORPS ymm does twice as much work per uop, but it only runs on port 5 (POR xmm runs on p015). 256b loads are done as two 128b halves, but they are still only 1 uop.

I don't see a way to use a single CMPEQPS to check a 256b vector for all-zero. +0.0 compares equal to -0.0, so a 1-bit in the sign-bit position would go undetected in a compare against zero. I don't think any of the CMPPS predicates help, either, since none of them implement compares that treat -0.0 different from +0.0. (See SIMD instructions for floating point equality comparison (with NaN == NaN) for more about FP-equality).

; First 32B arrives in L1D (and load buffers) on cycle n
vmovaps  ymm0,   [rdi+64]              ; ready on cycle n+1  (256b loads take 2 cycles)
vorps    ymm0,   ymm0, [rdi+96]        ; ready on cycle n+3  (the load uop is executing on cycles n+1 and n+2)
vextractf128 xmm1, ymm0, 1           ; 2c latency on IvB, 3c on Haswell
                                     ; xmm1 ready on cycle n+5
vpor     xmm0,   xmm0, xmm1          ; ready on n+6 (should be no bypass delay for a shuffle (vextractf128) -> integer booleans)
vptest   xmm0,   xmm0
jz   second_cacheline_all_zero

No, that's not better than

; First 32B of the cache-line arrives in L1D on cycle n (IvB has a 32B data path from L2->L1)
vmovaps  xmm0,   [rdi+64]              ; result ready on cycle n
vmovaps  xmm1,   [rdi+64 + 16]         ; result ready on cycle n  (data should be forwarded to outstanding load buffers, I think?)
vpor     xmm0,   xmm0, [rdi+64 + 32]   ; ready on cycle n+1
vpor     xmm1,   xmm1, [rdi+64 + 48]   ; ready on cycle n+1, assuming the load uops get their data the cycle after the first pair.
vpor     xmm0,   xmm1                  ; ready on cycle n+2
vptest   xmm0,   xmm0
jz   second_cacheline_all_zero

With AVX2, 256b ops would make sense, including VPTEST ymm,ymm.
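
The OP's i7-3770 doesn't have AVX2, but for reference the whole 128-byte test then collapses to something like this (untested sketch):

#include <immintrin.h>

// requires AVX2 (Haswell or later); buf must be 32-byte aligned for the aligned loads
int allzero_128B_avx2(const char *buf)
{
    const __m256i *v = (const __m256i *)buf;
    __m256i acc = _mm256_or_si256(_mm256_or_si256(v[0], v[1]),
                                  _mm256_or_si256(v[2], v[3]));
    return _mm256_testz_si256(acc, acc);   // VPTEST ymm,ymm: 1 if all 128 bytes are zero
}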
