Question

I have an array with room for two floats, and I have a __m128 variable. I want to store only the first two floats of the __m128 variable into the array.

What I'm doing now is

_mm_storeu_ps((float*)a, m0); //a is the array, m0 is the __m128 variable

This puts the first two floats of m0 into a, but it also writes the last two floats of m0 past the end of a.

Était-ce utile?

La solution

You can use the _mm_storel_pi intrinsic, which compiles to a single movlps instruction. Here is an example: sample1 is the code from the question, sample2 through sample4 demonstrate the suggestions made so far, and sample5 demonstrates the _mm_storel_pi method.

#include <stdio.h>
#include <string.h>
#include <intrin.h>

//-----------------------------------------

/*
 * Baseline from the question: an unaligned store of the whole vector.
 * All four lanes of m0 are written, so `a` must have room for four floats.
 */
void sample1 (float *a, __m128 m0)
{
    float *dst = a;

    _mm_storeu_ps(dst, m0);
}

//-----------------------------------------

/*
 * Copies the two lowest lanes of m0 into a[0] and a[1] by viewing the
 * vector's storage through a float pointer.
 *
 * NOTE(review): reading a __m128 object through a float pointer depends on
 * the compiler tolerating this aliasing -- confirm for your toolchain.
 */
void sample2 (float *a, __m128 m0)
{
    const float *lanes = (const float *)&m0;

    for (int i = 0; i < 2; ++i) {
        a[i] = lanes[i];
    }
}

//-----------------------------------------

/*
 * Stores the two lowest lanes of m0 into a[0] and a[1] using only SSE
 * intrinsics: two scalar stores, with a shuffle to bring lane 1 down to
 * lane 0 for the second one.
 */
void sample3 (float *a, __m128 m0)
{
    /* Every lane of the shuffled vector holds lane 1 of m0. */
    const __m128 lane1_everywhere = _mm_shuffle_ps(m0, m0, _MM_SHUFFLE(1, 1, 1, 1));

    _mm_store_ss(a, m0);
    _mm_store_ss(a + 1, lane1_everywhere);
}

//-----------------------------------------

/*
 * Copies the two lowest lanes of m0 into a[0] and a[1] by writing the vector
 * into a union and reading it back through the union's float-array member.
 */
void sample4 (float *a, __m128 m0)
{
    union { __m128 vec; float lane[4]; } view = { .vec = m0 };

    a[0] = view.lane[0];
    a[1] = view.lane[1];
}

//-----------------------------------------

/*
 * Stores the low 64 bits of m0 (its two lowest float lanes) to a[0] and a[1]
 * with a single _mm_storel_pi; nothing beyond a[1] is written.
 */
void sample5 (float *a, __m128 m0)
{
    /* _mm_storel_pi takes its destination as a pointer to a 64-bit __m64. */
    __m64 *low_half = (__m64 *)a;

    _mm_storel_pi(low_half, m0);
}

//-----------------------------------------

/*
 * Prints a[0] through a[3] on one line, separated by spaces.
 * `a` must point to at least four floats.
 */
void printa (float *a)
{
    const float first = a[0];
    const float second = a[1];
    const float third = a[2];
    const float fourth = a[3];

    printf("%g %g %g %g\n", first, second, third, fourth);
}

//-----------------------------------------

/*
 * Runs each sampleN variant against a freshly zeroed four-element array and
 * prints the array afterwards, so any lane a variant left untouched shows up
 * as 0 in the output.
 *
 * Returns 0.
 */
int main (void)
{
    /* _mm_set_ps lists lanes from highest to lowest, so lane 0 holds 4.0f and
       lane 3 holds 1.0f. Float literals avoid a double-to-float narrowing. */
    const __m128 m0 = _mm_set_ps(1.0f, 2.0f, 3.0f, 4.0f);
    float a[4];

    /* Question's original code: writes all four lanes. */
    memset(a, 0, sizeof a);
    sample1(a, m0);
    printa(a);

    /* Pointer-cast variant. */
    memset(a, 0, sizeof a);
    sample2(a, m0);
    printa(a);

    /* _mm_store_ss + _mm_shuffle_ps variant. */
    memset(a, 0, sizeof a);
    sample3(a, m0);
    printa(a);

    /* Union variant. */
    memset(a, 0, sizeof a);
    sample4(a, m0);
    printa(a);

    /* _mm_storel_pi variant. */
    memset(a, 0, sizeof a);
    sample5(a, m0);
    printa(a);

    return 0;
}

//-------------------------------------

output:

4 3 2 1
4 3 0 0
4 3 0 0
4 3 0 0
4 3 0 0

Here is gcc 4.8.1 x64 code generation for the functions:

0000000000401510 <sample1>:
 401510:    0f 28 02                movaps xmm0,XMMWORD PTR [rdx]
 401513:    0f 11 01                movups XMMWORD PTR [rcx],xmm0
 401516:    c3                      ret    

0000000000401520 <sample2>:
 401520:    0f 28 02                movaps xmm0,XMMWORD PTR [rdx]
 401523:    f3 0f 11 01             movss  DWORD PTR [rcx],xmm0
 401527:    0f c6 c0 55             shufps xmm0,xmm0,0x55
 40152b:    f3 0f 11 41 04          movss  DWORD PTR [rcx+0x4],xmm0
 401530:    c3                      ret    

0000000000401540 <sample3>:
 401540:    0f 28 02                movaps xmm0,XMMWORD PTR [rdx]
 401543:    f3 0f 11 01             movss  DWORD PTR [rcx],xmm0
 401547:    0f c6 c0 55             shufps xmm0,xmm0,0x55
 40154b:    f3 0f 11 41 04          movss  DWORD PTR [rcx+0x4],xmm0
 401550:    c3                      ret    

0000000000401560 <sample4>:
 401560:    48 8b 02                mov    rax,QWORD PTR [rdx]
 401563:    89 01                   mov    DWORD PTR [rcx],eax
 401565:    48 c1 e8 20             shr    rax,0x20
 401569:    89 41 04                mov    DWORD PTR [rcx+0x4],eax
 40156c:    c3                      ret    

0000000000401570 <sample5>:
 401570:    0f 28 02                movaps xmm0,XMMWORD PTR [rdx]
 401573:    0f 13 01                movlps QWORD PTR [rcx],xmm0
 401576:    c3                      ret    

Autres conseils

You have a couple of options:

Option 1

You can cast a pointer to the __m128 to a float* and index it accordingly:

// View m0's storage through a float pointer, then copy only its first two lanes into a.
float *p = (float *)&m0;
a[0] = p[0];
a[1] = p[1];

Some people prefer to create a union of an array of 4 floats and a __m128; performance-wise, that would be very similar.

Option 2

If you only want to use the SSE intrinsics, you can use _mm_store_ss and _mm_shuffle_ps:

// _mm_store_ss writes only the lowest lane, so this stores m0's first float.
_mm_store_ss(&a[0], m0);
// The shuffle copies lane 1 of m0 into every lane, so the low-lane store writes m0's second float.
_mm_store_ss(&a[1], _mm_shuffle_ps(m0, m0, _MM_SHUFFLE(1,1,1,1)));

The shuffle instructions in SSE are extremely useful, read more about them here.

Licencié sous: CC-BY-SA avec attribution
Non affilié à StackOverflow
scroll top