Stream intrinsic degrades performance

Question 1

As ScottD pointed out, the answer to the question lies in the generated assembly code. Apparently the Intel compiler is smart enough to detect the access pattern and automatically generates non-temporal loads even for the temporal version.

Here is a the compiler-generated assembly code for the temporal version:

..___tag_value___Z13copy_temporalPfS_.35:                       #
        xor       edx, edx                                      #22.4
        xor       eax, eax                                      #
..B2.2:                         # Preds ..B2.2 ..B2.1
        vmovups   xmm0, XMMWORD PTR [rax+rdi]                   #23.34
        inc       rdx                                           #22.4
        vmovntps  XMMWORD PTR [rax+rsi], xmm0                   #23.20
        vmovups   xmm1, XMMWORD PTR [16+rax+rdi]                #24.36
        vmovntps  XMMWORD PTR [16+rax+rsi], xmm1                #24.20
        vmovups   xmm2, XMMWORD PTR [32+rax+rdi]                #23.34
        vmovntps  XMMWORD PTR [32+rax+rsi], xmm2                #23.20
        vmovups   xmm3, XMMWORD PTR [48+rax+rdi]                #24.36
        vmovntps  XMMWORD PTR [48+rax+rsi], xmm3                #24.20
        add       rax, 64                                       #22.4
        cmp       rdx, 5000000                                  #22.4
        jb        ..B2.2        # Prob 99%                      #22.4

The question which still remains is the following:

Why does the non-aligned, temporal version perform better than the non-temporal version for the CPU E5-2650 (see above). I've already looked at the generated assembly code and the compiler really generates vmovups instructions (due to the non existing alignment).

Question 2

The stream variation creates pipelined burst writes directly to DRAM. The speed should roughly match the speed of your DRAM. The standard store writes to cache (but if the data is not already in the cache, it reads it into cache first). If the data is already in cache, the standard store runs at the speed of cache writes. In general, writes with size much larger than the last level cache size are much faster using the stream method. Small writes are often faster using standard stores. Try running the test using a buffer size of a couple of GB. The stream method should be faster.

Here is a benchmark to demonstrate:

#define __USE_MINGW_ANSI_STDIO 1
#include <stdlib.h>
#include <intrin.h>
#include <windows.h>
#include <stdio.h>
#include <stdint.h>

//-----------------------------------------------------------------------------
//
//  queryPerformanceCounter - similar to QueryPerformanceCounter, but returns
//                            count directly.

uint64_t queryPerformanceCounter (void)
    {
    LARGE_INTEGER int64;
    QueryPerformanceCounter (&int64);
    return int64.QuadPart;
    }

//-----------------------------------------------------------------------------
//
// queryPerformanceFrequency - same as QueryPerformanceFrequency, but returns  count direcly.

uint64_t queryPerformanceFrequency (void)
    {
    LARGE_INTEGER int64;

    QueryPerformanceFrequency (&int64);
    return int64.QuadPart;
    }

//---------------------------------------------------------------------------

static void testNontemporal (float *x, float *y, uint64_t numberOfVectors)
    {
    uint64_t i;
    for(i = 0; i < numberOfVectors / 2; ++i)
        {
        _mm_stream_ps(y,_mm_load_ps(x));
        _mm_stream_ps(y+4,_mm_load_ps(x+4));
        y+=8; x+=8;
        }
    }

//---------------------------------------------------------------------------

static void testTemporal (float *x, float *y, uint64_t numberOfVectors)
    {
    uint64_t i;
    for(i = 0; i < numberOfVectors / 2; ++i)
        {
        _mm_store_ps(y,_mm_load_ps(x));
        _mm_store_ps(y+4,_mm_load_ps(x+4));
        y+=8; x+=8;
        }
    }

//---------------------------------------------------------------------------

static void runtests (int nonTemporal)
    {
    uint64_t startCount, elapsed, index;
    float *x, *y;
    uint64_t numberOfBytes = 400 * 0x100000ull;
    uint64_t numberOfFloats = numberOfBytes / sizeof *x;
    uint64_t numberOfVectors = numberOfFloats / 4;
    double gbPerSecond;

    x = _mm_malloc (numberOfBytes, 32);
    y = _mm_malloc (numberOfBytes, 32);
    if (x == NULL || y == NULL) exit (1);

    // put valid floating point data into the source buffer
    // to avoid performance penalty
    for (index = 0; index < numberOfFloats; index++)
        x [index] = (float) index, y [index] = 0;

    startCount = queryPerformanceCounter ();
    if (nonTemporal)
        testNontemporal (x, y, numberOfVectors);
    else
        testTemporal (x, y, numberOfVectors);
    elapsed = queryPerformanceCounter () - startCount;
    gbPerSecond = (double) numberOfBytes / 0x40000000 * queryPerformanceFrequency () / elapsed;
    printf ("%.2f GB/s\n", gbPerSecond);
    _mm_free (x);
    _mm_free (y);
    }

//---------------------------------------------------------------------------

int main (void)
    {
    // raise our priority to increase measurement accuracy
    SetPriorityClass (GetCurrentProcess (), REALTIME_PRIORITY_CLASS);

    printf ("using temporal stores\n");
    runtests (0);
    printf ("using non-temporal stores\n");
    runtests (1);
    return 0;
    }

//---------------------------------------------------------------------------

Output from Intel Core i7-2600K:

using temporal stores
5.57 GB/s
using non-temporal stores
8.35 GB/s

Question 3

AFAIK, non-temporal stores drop the target cacheline from all caches. If the line is touched again before it would have been dropped naturally, you've lost pretty hard.