some intersectRaySphere c procedure optymization through rewriting it to x86 asm (How?)
-
24-06-2021 - |
题
Hullo, I've got not to much knowledge in assembly, and i am thinking how to optimize it by rewriting this in x86 (32-bit fpu or sse2) assembly, thing should be optimized - rewritten in correct assembly, then I will test if I've got some speed up (length() and dot() should be written in asm here also) This code is used by my simple real time ray-tracer and it works - but I am not much good in asm optimizations)
inline float intersectRaySphere(float3* rO, float3* rV, float3* sO, float sR)
{
static float3 Q;
Q = sub(sO,rO);
float c = length(&Q);
float v = dot(&Q,rV);
float d = sR*sR - (c*c - v*v);
// If there was no intersection, return -1
if (d < 0.0) return (-1.0f);
// Return the distance to the [first] intersecting point
return (v - sqrt(d));
}
Thank You in Advance
//edits
struct float3
{
float x;
float y;
float z;
};
inline float length(float3* v) {
return sqrt( (v->x)*(v->x) + (v->y)*(v->y) + (v->z)*(v->z) );
}
inline float dot(float3* a, float3* b) {
return (*a).x * (*b).x + (*a).y * (*b).y + (*a).z * (*b).z;
}
and demo exe (unoptymized in not even so much optymized c):
dl.dropbox.com/u/42887985/re29.zip
Maybe someone could give me a somewhat good fpu asm routines for length dot (or normalize not shown here) ?? Though whole procedure for intersect procedure would be the best ;-)
解决方案
This isn't a "nice" function to convert to SSE. Almost nothing is actually parallel. So let's change the function to intersect 4 rays at once. And it would help if the rays were stores in SOA (struct of arrays) instead of AOS (array of structs) as well.
With those changes, it might become something like this (not tested in any way):
inline void intersect4RaysSphere(
float* rOx, float* rOy, float* rOz,
float* rVx, float* rVy, float* rVz,
float sOx, float sOy, float sOz,
float sR)
{
// calculate Q
movss xmm0, sOx
movss xmm1, sOy
movss xmm2, sOz
shufps xmm0, xmm0, 0
shufps xmm1, xmm1, 0
shufps xmm2, xmm2, 0
subps xmm0, [rOx]
subps xmm1, [rOy]
subps xmm2, [rOz]
// calculate pow(dot(Q, rV), 2) in xmm3
movaps xmm3, [rVx]
movaps xmm4, [rVy]
movaps xmm5, [rVz]
mulps xmm3, xmm0
mulps xmm4, xmm1
mulps xmm5, xmm2
addps xmm3, xmm4
addps xmm3, xmm5
movaps xmm4, xmm3
mulps xmm3, xmm3
// calculate pow(length(Q), 2)
// there's no point in taking the square root only to then square it
mulps xmm0, xmm0
mulps xmm1, xmm1
mulps xmm2, xmm2
addps xmm0, xmm1
addps xmm0, xmm2
// calculate d
movss xmm1, sR
mulss xmm1, xmm1
shufps xmm1, xmm1, 0
subps xmm0, xmm3
subps xmm1, xmm0
sqrtps xmm1, xmm1
// test for intersection
// at this point:
// xmm3 = v * v
// xmm4 = v
// xmm1 = sqrt(d)
movaps xmm0, [minus1] // memory location with { -1.0, -1.0, -1.0, -1.0 }
subps xmm4, xmm1
// get a mask of d's smaller than 0.0
psrad xmm1, 31
// select -1 if less than zero or v*v - d if >= 0
andps xmm0, xmm1
andnps xmm1, xmm4
orps xmm0, xmm1
ret
}
Version with intrinsics (only slightly tested - it's compilable and seems to generate OK assembly):
__m128 intersect4RaysSphere(
float* rOx, float* rOy, float* rOz,
float* rVx, float* rVy, float* rVz,
float sOx, float sOy, float sOz,
float sR)
{
__m128 Qx = _mm_sub_ps(_mm_set1_ps(sOx), _mm_load_ps(rOx));
__m128 Qy = _mm_sub_ps(_mm_set1_ps(sOy), _mm_load_ps(rOy));
__m128 Qz = _mm_sub_ps(_mm_set1_ps(sOz), _mm_load_ps(rOz));
__m128 v = _mm_add_ps(_mm_mul_ps(Qx, _mm_load_ps(rVx)),
_mm_add_ps(_mm_mul_ps(Qy, _mm_load_ps(rVy)),
_mm_mul_ps(Qz, _mm_load_ps(rVz))));
__m128 vsquared = _mm_mul_ps(v, v);
__m128 lengthQsquared = _mm_add_ps(_mm_mul_ps(Qx, Qx),
_mm_add_ps(_mm_mul_ps(Qy, Qy),
_mm_mul_ps(Qz, Qz)));
__m128 sr = _mm_set1_ps(sR);
__m128 d = _mm_sub_ps(_mm_mul_ps(sr, sr), _mm_sub_ps(lengthQsquared, vsquared));
__m128 mask = _mm_castsi128_ps(_mm_srai_epi32(_mm_castps_si128(d), 31));
//__m128 result = _mm_or_ps(_mm_and_ps(_mm_set1_ps(-1.0f), mask),
_mm_andnot_ps(mask, _mm_sub_ps(vsquared, d)));
__m128 result = _mm_or_ps(_mm_and_ps(_mm_set1_ps(-1.0f), mask),
_mm_andnot_ps(mask, _mm_sub_ps(v, _mm_sqrt_ps(d))));
return result;
}
其他提示
__asm
{
movaps xmm0,[float3] //this is vector of yours into xmm0
mulps xmm0,xmm0 //this is each term squared
pxor xmm1,xmm1 //clean xmm1 first
movlhps xmm1,xmm0 //lower 2 terms to the higher 2 parts of xmm1
addps xmm0,xmm1 //higher 2 terms of xmm0 now has x_square+z_square and y_square + zero_square
shufps xmm2,xmm0,0 //we copy y_square to all 4 elements of xmm2
addps xmm0,xmm2 //now we have sum of all squares in highest of xmm0
shufps xmm0,xmm0,11111111b // copy result to all 4 parts
sqrtss xmm0,xmm0 //scalar square-root
movaps [result],xmm0
}
this may be slower than fully optimized but should be fast enough for vector length calculation. Needs the vector to be aligned-16 byte. Change movaps to movups if you dont want alignment. If you can get this code work, then you can furhter increase performance by putting
align 16
in the beginning of movaps xmm0,[float3] to make code also aligned. Then you can check how many bytes each instuction has. Try to reach optimal code-length(multiple of 16 bytes). After sse2(sse3,sse4,avx) there are vertical-horizontal vector instructions which make only 1 instruction to get result.
edited mm0,xmm0 to xmm0,xmm0 at second instruction
here is some list: