Question

I have an array with room for two floats, and I have a __m128 variable. I want to store only the first two floats of the __m128 variable into the array.

What I'm doing now is

_mm_storeu_ps((float*)a, m0); //a is the array, m0 is the __m128 variable

This puts the first two floats of m0 into a, but it also stores the last two floats of m0 past the end of a.

Was it helpful?

Solution

You can use the _mm_storel_pi intrinsic. This intrinsic generates a single movlps instruction. Here is an example. Functions sample1 through sample4 demonstrate the suggestions made so far; sample5 demonstrates the _mm_storel_pi method.

#include <stdio.h>
#include <intrin.h>

//-----------------------------------------

// Baseline: one unaligned 128-bit store that writes all four lanes of m0
// starting at a. The destination must have room for four floats; with a
// two-float array the upper two lanes are written past its end.
void sample1 (float *a, __m128 m0)
    {
    _mm_storeu_ps (a, m0);
    }

//-----------------------------------------

// Copies lanes 0 and 1 of m0 into a[0] and a[1] by viewing the vector's
// storage through a float pointer. Nothing beyond a[1] is written.
void sample2 (float *a, __m128 m0)
    {
    const float *lanes = (const float *)&m0;

    a[0] = lanes[0];
    a[1] = lanes[1];
    }

//-----------------------------------------

// Stores lanes 0 and 1 of m0 into a[0] and a[1] using only SSE intrinsics:
// two scalar stores, with a shuffle in between to bring lane 1 down to lane 0.
void sample3 (float *a, __m128 m0)
    {
    // _mm_store_ss writes only lane 0, so replicate lane 1 into every lane
    // before storing it.
    const __m128 lane1 = _mm_shuffle_ps (m0, m0, _MM_SHUFFLE (1, 1, 1, 1));

    _mm_store_ss (a, m0);
    _mm_store_ss (a + 1, lane1);
    }

//-----------------------------------------

// Copies lanes 0 and 1 of m0 into a[0] and a[1] by reading the vector back
// through a union that overlays it with an array of four floats.
void sample4 (float *a, __m128 m0)
    {
    union { __m128 vec; float lane [4]; } view;

    view.vec = m0;
    a[0] = view.lane[0];
    a[1] = view.lane[1];
    }

//-----------------------------------------

// Stores only the low 64 bits of m0 (lanes 0 and 1) at a with a single
// _mm_storel_pi; a needs room for just two floats.
void sample5 (float *a, __m128 m0)
    {
    __m64 *low_half = (__m64 *)a;

    _mm_storel_pi (low_half, m0);
    }

//-----------------------------------------

// Prints a[0] through a[3] on one line, separated by spaces.
// a must point to at least four readable floats.
void printa (float *a)
    {
    printf ("%g %g %g %g\n",
            a[0], a[1], a[2], a[3]);
    }

//-----------------------------------------

// Runs each store variant against a freshly zeroed four-float buffer and
// prints the buffer afterwards, so any lane written beyond the first two
// shows up as a non-zero value in columns three and four.
int main (void)
    {
    // _mm_set_ps lists lanes from highest to lowest, so lane 0 holds 4.0.
    const __m128 m0 = _mm_set_ps (1.0f, 2.0f, 3.0f, 4.0f);
    const __m128 zero = _mm_setzero_ps ();
    float a [4];

    // a holds exactly four floats, so one unaligned 128-bit store of zero
    // clears it completely. This avoids memset, whose header <string.h>
    // is not included by this file.
    _mm_storeu_ps (a, zero);
    sample1 (a, m0);
    printa (a);

    _mm_storeu_ps (a, zero);
    sample2 (a, m0);
    printa (a);

    _mm_storeu_ps (a, zero);
    sample3 (a, m0);
    printa (a);

    _mm_storeu_ps (a, zero);
    sample4 (a, m0);
    printa (a);

    _mm_storeu_ps (a, zero);
    sample5 (a, m0);
    printa (a);

    return 0;
    }

//-------------------------------------

output:

4 3 2 1
4 3 0 0
4 3 0 0
4 3 0 0
4 3 0 0

Here is gcc 4.8.1 x64 code generation for the functions:

0000000000401510 <sample1>:
 401510:    0f 28 02                movaps xmm0,XMMWORD PTR [rdx]
 401513:    0f 11 01                movups XMMWORD PTR [rcx],xmm0
 401516:    c3                      ret    

0000000000401520 <sample2>:
 401520:    0f 28 02                movaps xmm0,XMMWORD PTR [rdx]
 401523:    f3 0f 11 01             movss  DWORD PTR [rcx],xmm0
 401527:    0f c6 c0 55             shufps xmm0,xmm0,0x55
 40152b:    f3 0f 11 41 04          movss  DWORD PTR [rcx+0x4],xmm0
 401530:    c3                      ret    

0000000000401540 <sample3>:
 401540:    0f 28 02                movaps xmm0,XMMWORD PTR [rdx]
 401543:    f3 0f 11 01             movss  DWORD PTR [rcx],xmm0
 401547:    0f c6 c0 55             shufps xmm0,xmm0,0x55
 40154b:    f3 0f 11 41 04          movss  DWORD PTR [rcx+0x4],xmm0
 401550:    c3                      ret    

0000000000401560 <sample4>:
 401560:    48 8b 02                mov    rax,QWORD PTR [rdx]
 401563:    89 01                   mov    DWORD PTR [rcx],eax
 401565:    48 c1 e8 20             shr    rax,0x20
 401569:    89 41 04                mov    DWORD PTR [rcx+0x4],eax
 40156c:    c3                      ret    

0000000000401570 <sample5>:
 401570:    0f 28 02                movaps xmm0,XMMWORD PTR [rdx]
 401573:    0f 13 01                movlps QWORD PTR [rcx],xmm0
 401576:    c3                      ret    

OTHER TIPS

You have a couple of options:

Option 1

You can cast a pointer to the __m128 to a float* and index it accordingly:

float *p = (float *)&m0;
a[0] = p[0];
a[1] = p[1];

Some people prefer to create a union of an array of 4 floats and a __m128, which would perform very similarly.

Option 2

If you only want to use the SSE intrinsics, you can use _mm_store_ss and _mm_shuffle_ps:

_mm_store_ss(&a[0], m0);
_mm_store_ss(&a[1], _mm_shuffle_ps(m0, m0, _MM_SHUFFLE(1,1,1,1)));

The shuffle instructions in SSE are extremely useful, read more about them here.

Licensed under: CC-BY-SA with attribution
Not affiliated with StackOverflow
scroll top