You have a problem here:
float *p1 = new float[4];
_mm_store_ps(p1,mm_sum);
`p1` is not guaranteed to be 16-byte aligned, so `_mm_store_ps` may fail. Either ensure that `p1` is 16-byte aligned, or use `_mm_storeu_ps` instead of `_mm_store_ps`.
You have another problem here:
delete[] array1;
`array1` was not allocated via `new[]`, so calling `delete[]` on it results in undefined behaviour.
Also, `mm_sum` needs to be initialised before it is used as the accumulator in `_mm_add_ps`.
Here is a complete working version of your code:
#include <iostream>
#include <xmmintrin.h>
using namespace std;
// Loads four floats into an SSE register, adds them to a zeroed accumulator,
// stores the result to heap memory and prints the sum of the four lanes.
int main()
{
    // Input values live on the stack and are explicitly 16-byte aligned,
    // which is what the aligned load below requires.
    float array1[4] __attribute__ ((aligned(16))) = { 1.1, 2.1, 3.1, 4.1 };

    // The accumulator has to start from a defined value (all lanes zero).
    __m128 mm_sum = _mm_setzero_ps();
    const __m128 mm_buf = _mm_load_ps(array1);
    mm_sum = _mm_add_ps(mm_sum, mm_buf);

    // p1 is not guaranteed to be 16-byte aligned, so the unaligned store is used.
    float *p1 = new float[4];
    _mm_storeu_ps(p1, mm_sum);

    // Add the four lanes in order, left to right.
    float Sum = p1[0];
    for (int lane = 1; lane < 4; ++lane)
        Sum += p1[lane];
    std::cout << "Sum= " << Sum << std::endl;

    // Only p1 came from new[]; array1 is an automatic array and must NOT be deleted.
    delete[] p1;
    return 0;
}
Test it:
$ g++ -Wall -msse3 so_storeu_ps.cpp && ./a.out
Sum= 10.4
$