some intersectRaySphere c procedure optymization through rewriting it to x86 asm (How?)

https://stackoverflow.com/questions/11793883

24-06-2021
|

سؤال

Hullo, I've got not to much knowledge in assembly, and i am thinking how to optimize it by rewriting this in x86 (32-bit fpu or sse2) assembly, thing should be optimized - rewritten in correct assembly, then I will test if I've got some speed up (length() and dot() should be written in asm here also) This code is used by my simple real time ray-tracer and it works - but I am not much good in asm optimizations)

    inline float intersectRaySphere(float3* rO, float3* rV, float3* sO, float sR)
   {
    static float3 Q;

    Q = sub(sO,rO);
    float c = length(&Q);
    float v = dot(&Q,rV);
    float d = sR*sR - (c*c - v*v);

    // If there was no intersection, return -1
    if (d < 0.0) return (-1.0f);

    // Return the distance to the [first] intersecting point
    return (v - sqrt(d));
    }

Thank You in Advance

//edits

    struct float3
    {
     float x;
     float y;
     float z;
    };


    inline float length(float3* v) {
     return sqrt( (v->x)*(v->x) + (v->y)*(v->y) + (v->z)*(v->z) );
    }

   inline float dot(float3* a, float3* b) {
     return (*a).x * (*b).x + (*a).y * (*b).y + (*a).z * (*b).z;
   }

and demo exe (unoptymized in not even so much optymized c):

dl.dropbox.com/u/42887985/re29.zip

Maybe someone could give me a somewhat good fpu asm routines for length dot (or normalize not shown here) ?? Though whole procedure for intersect procedure would be the best ;-)

المحلول

This isn't a "nice" function to convert to SSE. Almost nothing is actually parallel. So let's change the function to intersect 4 rays at once. And it would help if the rays were stores in SOA (struct of arrays) instead of AOS (array of structs) as well.

With those changes, it might become something like this (not tested in any way):

inline void intersect4RaysSphere(
 float* rOx, float* rOy, float* rOz,
 float* rVx, float* rVy, float* rVz,
 float sOx, float sOy, float sOz,
 float sR)
{
    // calculate Q
    movss xmm0, sOx
    movss xmm1, sOy
    movss xmm2, sOz
    shufps xmm0, xmm0, 0
    shufps xmm1, xmm1, 0
    shufps xmm2, xmm2, 0
    subps xmm0, [rOx]
    subps xmm1, [rOy]
    subps xmm2, [rOz]
    // calculate pow(dot(Q, rV), 2) in xmm3
    movaps xmm3, [rVx]
    movaps xmm4, [rVy]
    movaps xmm5, [rVz]
    mulps xmm3, xmm0
    mulps xmm4, xmm1
    mulps xmm5, xmm2
    addps xmm3, xmm4
    addps xmm3, xmm5
    movaps xmm4, xmm3
    mulps xmm3, xmm3
    // calculate pow(length(Q), 2)
    // there's no point in taking the square root only to then square it
    mulps xmm0, xmm0
    mulps xmm1, xmm1
    mulps xmm2, xmm2
    addps xmm0, xmm1
    addps xmm0, xmm2
    // calculate d
    movss xmm1, sR
    mulss xmm1, xmm1
    shufps xmm1, xmm1, 0
    subps xmm0, xmm3
    subps xmm1, xmm0
    sqrtps xmm1, xmm1
    // test for intersection
    // at this point:
    // xmm3 = v * v
    // xmm4 = v
    // xmm1 = sqrt(d)
    movaps xmm0, [minus1]  // memory location with { -1.0, -1.0, -1.0, -1.0 }
    subps xmm4, xmm1
    // get a mask of d's smaller than 0.0
    psrad xmm1, 31
    // select -1 if less than zero or v*v - d if >= 0
    andps xmm0, xmm1
    andnps xmm1, xmm4
    orps xmm0, xmm1
    ret
}

Version with intrinsics (only slightly tested - it's compilable and seems to generate OK assembly):

__m128 intersect4RaysSphere(
     float* rOx, float* rOy, float* rOz,
     float* rVx, float* rVy, float* rVz,
     float sOx, float sOy, float sOz,
     float sR)
{
    __m128 Qx = _mm_sub_ps(_mm_set1_ps(sOx), _mm_load_ps(rOx));
    __m128 Qy = _mm_sub_ps(_mm_set1_ps(sOy), _mm_load_ps(rOy));
    __m128 Qz = _mm_sub_ps(_mm_set1_ps(sOz), _mm_load_ps(rOz));
    __m128 v = _mm_add_ps(_mm_mul_ps(Qx, _mm_load_ps(rVx)),
               _mm_add_ps(_mm_mul_ps(Qy, _mm_load_ps(rVy)),
                          _mm_mul_ps(Qz, _mm_load_ps(rVz))));
    __m128 vsquared = _mm_mul_ps(v, v);
    __m128 lengthQsquared = _mm_add_ps(_mm_mul_ps(Qx, Qx),
                            _mm_add_ps(_mm_mul_ps(Qy, Qy),
                                       _mm_mul_ps(Qz, Qz)));
    __m128 sr = _mm_set1_ps(sR);
    __m128 d = _mm_sub_ps(_mm_mul_ps(sr, sr), _mm_sub_ps(lengthQsquared, vsquared));
    __m128 mask = _mm_castsi128_ps(_mm_srai_epi32(_mm_castps_si128(d), 31));
    //__m128 result = _mm_or_ps(_mm_and_ps(_mm_set1_ps(-1.0f), mask),
                              _mm_andnot_ps(mask, _mm_sub_ps(vsquared, d)));
    __m128 result = _mm_or_ps(_mm_and_ps(_mm_set1_ps(-1.0f), mask),
                              _mm_andnot_ps(mask, _mm_sub_ps(v, _mm_sqrt_ps(d))));
    return result;
}

نصائح أخرى

__asm
    {
    movaps xmm0,[float3] //this is vector of yours into xmm0
    mulps xmm0,xmm0       //this is each term squared
    pxor xmm1,xmm1       //clean xmm1 first
    movlhps xmm1,xmm0    //lower 2 terms to the higher 2 parts of xmm1
    addps xmm0,xmm1      //higher 2 terms of xmm0 now has x_square+z_square and  y_square + zero_square
    shufps xmm2,xmm0,0 //we copy y_square to all 4 elements of xmm2
    addps xmm0,xmm2     //now we have sum of all squares in highest of xmm0
    shufps xmm0,xmm0,11111111b // copy result to all 4 parts
    sqrtss xmm0,xmm0           //scalar square-root
    movaps [result],xmm0
    }

this may be slower than fully optimized but should be fast enough for vector length calculation. Needs the vector to be aligned-16 byte. Change movaps to movups if you dont want alignment. If you can get this code work, then you can furhter increase performance by putting

align 16

in the beginning of movaps xmm0,[float3] to make code also aligned. Then you can check how many bytes each instuction has. Try to reach optimal code-length(multiple of 16 bytes). After sse2(sse3,sse4,avx) there are vertical-horizontal vector instructions which make only 1 instruction to get result.

edited mm0,xmm0 to xmm0,xmm0 at second instruction

here is some list:

http://softpixel.com/~cwright/programming/simd/sse2.php

مرخصة بموجب: CC-BY-SA مع الإسناد

لا تنتمي إلى StackOverflow