1621 lines
60 KiB
C++
1621 lines
60 KiB
C++
//========= Copyright © 1996-2005, Valve Corporation, All rights reserved. ============//
|
|
//
|
|
// Purpose:
|
|
//
|
|
// $NoKeywords: $
|
|
//
|
|
//=============================================================================//
|
|
|
|
#ifndef COMPRESSED_VECTOR_H
|
|
#define COMPRESSED_VECTOR_H
|
|
|
|
#ifdef _WIN32
|
|
#pragma once
|
|
#endif
|
|
|
|
#include <math.h>
|
|
#include <float.h>
|
|
|
|
// For vec_t, put this somewhere else?
|
|
#include "basetypes.h"
|
|
|
|
// For rand(). We really need a library!
|
|
#include <stdlib.h>
|
|
|
|
#include "tier0/dbg.h"
|
|
#include "mathlib/vector.h"
|
|
|
|
#include "mathlib/mathlib.h"
|
|
#include "mathlib/ssemath.h"
|
|
#ifdef _PS3
|
|
#if defined(__SPU__)
|
|
#include <spu_intrinsics.h>
|
|
#include <vmx2spu.h>
|
|
#endif
|
|
#include <vectormath/cpp/vectormath_aos.h>
|
|
#endif
|
|
|
|
#if defined( _X360 )
|
|
#pragma bitfield_order( push, lsb_to_msb )
|
|
#elif defined( _PS3 )
|
|
#pragma ms_struct on
|
|
#pragma reverse_bitfields on
|
|
#endif
|
|
|
|
#ifdef OSX
|
|
#pragma GCC diagnostic ignored "-Wtautological-compare"
|
|
#endif
|
|
|
|
class Quaternion48;
|
|
|
|
|
|
FORCEINLINE fltx4 UnpackQuaternion48SIMD( const Quaternion48 * RESTRICT pVec );
|
|
|
|
|
|
//=========================================================
|
|
// fit a 3D vector into 32 bits
|
|
//=========================================================
|
|
|
|
// Lossy 3-component vector compressed to ~32 bits: a shared 2-bit exponent
// selects one of four ranges and each component keeps 10 bits of biased
// fixed point within that range (see expScale in the conversion operators).
class Vector32
{
public:
	// Construction/destruction:
	Vector32(void);
	Vector32(vec_t X, vec_t Y, vec_t Z);

	// assignment
	Vector32& operator=(const Vector &vOther);
	operator Vector ();

private:
	// 10-bit biased mantissa per component (512 encodes zero) plus the
	// shared 2-bit range exponent.
	// NOTE(review): bit-fields are declared on unsigned short; packing all
	// 32 bits into one word is compiler/ABI dependent — confirm layout on
	// each target before relying on the serialized size.
	unsigned short x:10;
	unsigned short y:10;
	unsigned short z:10;
	unsigned short exp:2;
};
|
|
|
|
// Compress a full-precision vector: pick the smallest range that holds the
// largest component, then quantize each component to 10 biased bits.
inline Vector32& Vector32::operator=(const Vector &vOther)
{
	CHECK_VALID(vOther);

	// The four representable half-ranges selected by the 2-bit exponent.
	static float expScale[4] = { 4.0f, 16.0f, 32.f, 64.f };

	// Largest absolute component decides which range bucket is required.
	float flMax = MAX( fabs( vOther.x ), fabs( vOther.y ) );
	flMax = fpmax( flMax, fabs( vOther.z ) );

	// Walk the buckets smallest-first; "exp" is the stored bit-field itself.
	for ( exp = 0; exp < 3; exp++ )
	{
		if ( flMax < expScale[exp] )
			break;
	}
	Assert( flMax < expScale[exp] );

	// Scale so the chosen range maps onto [0, 1023] with 512 as zero.
	float flQuant = 512.0f / expScale[exp];

	x = clamp( (int)(vOther.x * flQuant) + 512, 0, 1023 );
	y = clamp( (int)(vOther.y * flQuant) + 512, 0, 1023 );
	z = clamp( (int)(vOther.z * flQuant) + 512, 0, 1023 );
	return *this;
}
|
|
|
|
|
|
// Decompress back to a full-precision vector: undo the bias and rescale by
// the range that the stored 2-bit exponent selects.
inline Vector32::operator Vector ()
{
	// Same range table the compressor used; must stay in sync with operator=.
	static float expScale[4] = { 4.0f, 16.0f, 32.f, 64.f };

	const float flQuant = expScale[exp] / 512.0f;

	Vector vOut;
	vOut.x = (((int)x) - 512) * flQuant;
	vOut.y = (((int)y) - 512) * flQuant;
	vOut.z = (((int)z) - 512) * flQuant;
	return vOut;
}
|
|
|
|
|
|
//=========================================================
|
|
// Fit a unit vector into 32 bits
|
|
//=========================================================
|
|
|
|
// Unit vector compressed to ~32 bits: x and y keep 15 bits of biased fixed
// point each, and z stores only its sign — its magnitude is reconstructed
// from the unit-length constraint on decompression.
class Normal32
{
public:
	// Construction/destruction:
	Normal32(void);
	Normal32(vec_t X, vec_t Y, vec_t Z);

	// assignment
	Normal32& operator=(const Vector &vOther);
	operator Vector ();

private:
	// 15-bit biased fixed point on [-1,1] (16384 encodes zero), plus the
	// sign of z. Assumes the input is (near) unit length — not enforced here.
	unsigned short x:15;
	unsigned short y:15;
	unsigned short zneg:1;
};
|
|
|
|
|
|
// Compress a unit vector: quantize x and y to 15 biased bits; z contributes
// only its sign and is rebuilt from unit length when decompressing.
inline Normal32& Normal32::operator=(const Vector &vOther)
{
	CHECK_VALID(vOther);

	const int nBiasedX = (int)(vOther.x * 16384) + 16384;
	const int nBiasedY = (int)(vOther.y * 16384) + 16384;

	x = clamp( nBiasedX, 0, 32767 );
	y = clamp( nBiasedY, 0, 32767 );
	zneg = (vOther.z < 0);
	return *this;
}
|
|
|
|
|
|
// Decompress: undo the bias on x and y, then rebuild z's magnitude from the
// unit-length constraint and reapply its stored sign.
// NOTE(review): if the stored x/y were not from a unit vector, the sqrt
// argument can go negative and produce NaN — confirm inputs upstream.
inline Normal32::operator Vector ()
{
	Vector vOut;

	vOut.x = ((int)x - 16384) * (1 / 16384.0);
	vOut.y = ((int)y - 16384) * (1 / 16384.0);
	vOut.z = sqrt( 1 - vOut.x * vOut.x - vOut.y * vOut.y );
	if (zneg)
		vOut.z = -vOut.z;
	return vOut;
}
|
|
|
|
|
|
//=========================================================
|
|
// 64 bit Quaternion
|
|
//=========================================================
|
|
|
|
// Quaternion compressed to 64 bits: x, y, z keep 21 bits of biased fixed
// point each; w stores only its sign and is reconstructed from the unit
// quaternion constraint on decompression.
class Quaternion64
{
public:
	// Construction/destruction:
	Quaternion64(void);
	Quaternion64(vec_t X, vec_t Y, vec_t Z);

	// assignment
	// Quaternion& operator=(const Quaternion64 &vOther);
	Quaternion64& operator=(const Quaternion &vOther);
	operator Quaternion () const;
	inline fltx4 LoadUnalignedSIMD() const; // load onto a SIMD register without assumptions of being on a 16byte boundary

private:
	Quaternion64( uint64 xx, uint64 yy, uint64 zz, uint64 ww ) : x(xx), y(yy), z(zz), wneg(ww) {}; // stricly for static construction
	// 21-bit biased fixed point per component (1048576 encodes zero),
	// plus the sign of w; 64 bits total.
	uint64 x:21;
	uint64 y:21;
	uint64 z:21;
	uint64 wneg:1;
};
|
|
|
|
|
|
// Decompress to a full Quaternion. On SPU this routes through the SIMD
// decoder and spills to an aligned temporary; elsewhere it decodes scalar.
inline Quaternion64::operator Quaternion () const
{
#if defined(__SPU__)
	fltx4 tmpV;
	QuaternionAligned tmpQ;

	tmpV = LoadUnalignedSIMD();
	StoreAlignedSIMD( (float *)&tmpQ, tmpV );

	return tmpQ;
#else
	Quaternion tmp;

	// shift to -1048576, + 1048575, then round down slightly to -1.0 < x < 1.0
	tmp.x = ((int)x - 1048576) * (1 / 1048576.5f);
	tmp.y = ((int)y - 1048576) * (1 / 1048576.5f);
	tmp.z = ((int)z - 1048576) * (1 / 1048576.5f);

	// w's magnitude is rebuilt from the unit-quaternion constraint;
	// only its sign was stored.
	tmp.w = sqrt( 1 - tmp.x * tmp.x - tmp.y * tmp.y - tmp.z * tmp.z );
	if (wneg)
		tmp.w = -tmp.w;
	return tmp;
#endif
}
|
|
|
|
// Compress a quaternion: bias x, y, z into 21 unsigned bits each
// ([-1,1] maps onto [0, 2097151]); keep only the sign of w.
inline Quaternion64& Quaternion64::operator=(const Quaternion &vOther)
{
	CHECK_VALID(vOther);

	const int nBiasedX = (int)(vOther.x * 1048576) + 1048576;
	const int nBiasedY = (int)(vOther.y * 1048576) + 1048576;
	const int nBiasedZ = (int)(vOther.z * 1048576) + 1048576;

	x = clamp( nBiasedX, 0, 2097151 );
	y = clamp( nBiasedY, 0, 2097151 );
	z = clamp( nBiasedZ, 0, 2097151 );
	// w is reconstructed from unit length on decode; store only its sign.
	wneg = (vOther.w < 0);
	return *this;
}
|
|
|
|
// Decode the packed 64-bit quaternion straight onto a SIMD register without
// assuming 16-byte alignment. On PS3 this unpacks the bit-fields with vector
// masks/shifts and rebuilds w = sqrt(1 - x² - y² - z²); elsewhere it falls
// back to the scalar conversion operator.
inline fltx4 Quaternion64::LoadUnalignedSIMD() const
{
#ifdef _PS3 // assume little endian packing

#if 1

	const static u32x4 xmask = { 0x00000000, 0x001fffff, 0, 0 }; // bottom 21 bits ( 0 .. 20 ) true
	const static u32x4 ymask = { 0x000003ff, 0xffe00000, 0, 0 }; // bits 21 .. 41 true
	const static u32x4 zmask = { 0x7ffffC00, 0x00000000, 0, 0 }; // bits 42 .. 62 true
	const static u32x4 wmask = { 0x80000000, 0x00000000, 0, 0 }; // only bit 63 is true

	const u32x4 qbits = (u32x4)( ::LoadUnalignedSIMD( this ) ) ;
	// fish x, y, and z and put them into the the first words of their respective vec registers
	// the end type for these registers must be signed for the following subtract, BUT!
	// the shift has to happen as an UNSIGNED type so that it doesn't sign-extend.
	// the code as present assumes that the fused multiply-add operation has an intermediate
	// precision higher than 32 bits -- otherwise, we'll need to perform the initial subtract as an
	// int op because of course 21 bits is right at the limit of floating point precision.
	i32x4 ix = (i32x4) (ShiftLeftByBits<32>(vec_and( qbits, xmask ))); // shift x so its 21 bits of precision are sitting at the low end of the first word
	i32x4 iy = (i32x4) (ShiftLeftByBits<11>(vec_and( qbits, ymask ))); // shift y, which straddles the first two words, so its 21 bits of precision are sitting at the low end of the first word
	i32x4 iz = (i32x4) (ShiftRightByBits<10>(vec_and( qbits, zmask ))); // shift z so its 21 bits of precision are sitting at the low end of the first word

	/* // this is how to put them into their respective words instead (but we don't want to do that because we need a dot product)
	i32x4 iy = (i32x4) (ShiftRightByBits<22>(vec_and( qbits, ymask ))); // shift y, which straddles the first two words, right by 22 bits so its 21 bits of precision are sitting at the low end of the second word
	i32x4 iz = (i32x4) (ShiftRightByBits<33>(vec_and( qbits, zmask ))); // shift z right by 33 bits so its 21 bits of precision are sitting at the low end of the third word
	*/
	i32x4 wsignbit = (i32x4) (ShiftRightByBits<96>(vec_and( qbits, wmask ))); // shift the w bit RIGHT so that it sits at the sign bit of the LAST word.

	// convert each of the vectors from int to float. (because of the way the pipeline is organized,
	// it's as fast to do this as it would have been to do by combining them into one register above
	// and converting all at once.) Also, we can do the fixed point conversion in the vcfsx op. It'll
	// map us onto [0,2] which we'll shift to [-1,1] -- it includes the endpoints of unlike the float-
	// by-float conversion above, but the better stability of the vector quaternion ops makes that okay.
	const fltx4 ONE = LoadOneSIMD();
#if defined(__SPU__)
	fltx4 fx = SubSIMD( vec_ctf( ix, 20 ), ONE);
	fltx4 fy = SubSIMD( vec_ctf( iy, 20 ), ONE);
	fltx4 fz = SubSIMD( vec_ctf( iz, 20 ), ONE);
#else
	fltx4 fx = SubSIMD( vec_vcfsx( ix, 20 ), ONE);
	fltx4 fy = SubSIMD( vec_vcfsx( iy, 20 ), ONE);
	fltx4 fz = SubSIMD( vec_vcfsx( iz, 20 ), ONE);
#endif

	// compute the dot product
	fltx4 fw = MsubSIMD( fz, fz, ONE ); // 1 - z*z
	fltx4 fxsqysq = MaddSIMD( fy, fy, MulSIMD( fx,fx ) ); // x*x + y*y
	fw = SubSIMD( fw, fxsqysq ); // 1 - x*x - y*y - z*z
	fw = SqrtSIMD( fw ); // unfortunately we really do need full precision here

	fltx4 result = Compress4SIMD( fx, fy, fz, fw );
	// and for the coup de grace, set the sign bit of fw appropriately
	result = OrSIMD( result, (fltx4)wsignbit );

	return result;

#else
	// original version

	/*
	union Qmask {
		struct qq {
			Quaternion64 mask;
			uint64 padding;
		} asQ ;
		u32x4 asVec;

		Qmask( const Quaternion64 &m ) : mask(m) {}
	};
	*/
	const static u32x4 xmask = { 0xfffff800, 0x00000000, 0, 0 }; // top 21 bits ( 0 .. 20 ) true
	const static u32x4 ymask = { 0x000007ff, 0xffc00000, 0, 0 }; // bits 21 .. 41 true
	const static u32x4 zmask = { 0x00000000, 0x003ffffe, 0, 0 }; // bits 42 .. 62 true
	const static u32x4 wmask = { 0x00000000, 0x00000001, 0, 0 }; // only bit 63 is true

	const u32x4 qbits = (u32x4)( ::LoadUnalignedSIMD( this ) ) ;
	// fish x, y, and z and put them into the the first words of their respective vec registers
	// the end type for these registers must be signed for the following subtract, BUT!
	// the shift has to happen as an UNSIGNED type so that it doesn't sign-extend.
	// the code as present assumes that the fused multiply-add operation has an intermediate
	// precision higher than 32 bits -- otherwise, we'll need to perform the initial subtract as an
	// int op because of course 21 bits is right at the limit of floating point precision.
	i32x4 ix = (i32x4) (ShiftRightByBits<11>(vec_and( qbits, xmask ))); // shift x by eleven bits so its 21 bits of precision are sitting at the low end of the first word
	i32x4 iy = (i32x4) (ShiftLeftByBits<10>(vec_and( qbits, ymask ))); // shift y, which straddles the first two words, left by 10 bits so its 21 bits of precision are sitting at the low end of the first word
	i32x4 iz = (i32x4) (ShiftLeftByBits<31>(vec_and( qbits, zmask ))); // shift z, which straddles the first two words, left by 31 bits so its 21 bits of precision are sitting at the low end of the first word
	/* // this is how to put them into their respective words instead (but we don't want to do that because we need a dot product)
	i32x4 iy = (i32x4) (ShiftRightByBits<22>(vec_and( qbits, ymask ))); // shift y, which straddles the first two words, right by 22 bits so its 21 bits of precision are sitting at the low end of the second word
	i32x4 iz = (i32x4) (ShiftRightByBits<33>(vec_and( qbits, zmask ))); // shift z right by 33 bits so its 21 bits of precision are sitting at the low end of the third word
	*/
	i32x4 wsignbit = (i32x4) (ShiftRightByBits<33>(vec_and( qbits, wmask ))); // shift the w bit RIGHT so that it sits at the sign bit of the LAST word.

	// convert each of the vectors from int to float. (because of the way the pipeline is organized,
	// it's as fast to do this as it would have been to do by combining them into one register above
	// and converting all at once.) Also, we can do the fixed point conversion in the vcfsx op. It'll
	// map us onto [0,2] which we'll shift to [-1,1] -- it includes the endpoints of unlike the float-
	// by-float conversion above, but the better stability of the vector quaternion ops makes that okay.
	const fltx4 ONE = LoadOneSIMD();
#if defined(__SPU__)
	fltx4 fx = SubSIMD( vec_ctf( ix, 20 ), ONE);
	fltx4 fy = SubSIMD( vec_ctf( iy, 20 ), ONE);
	fltx4 fz = SubSIMD( vec_ctf( iz, 20 ), ONE);
#else
	fltx4 fx = SubSIMD( vec_vcfsx( ix, 20 ), ONE);
	fltx4 fy = SubSIMD( vec_vcfsx( iy, 20 ), ONE);
	fltx4 fz = SubSIMD( vec_vcfsx( iz, 20 ), ONE);
#endif

	// compute the dot product
	// BUGFIX: MsubSIMD( a, b, c ) computes c - a*b. This previously read
	// MsubSIMD( ONE, fz, fz ), which evaluates to fz - ONE*fz == 0 — matching
	// neither the "1 - z*z" comment nor the operand order used by the other
	// decoders in this file.
	fltx4 fw = MsubSIMD( fz, fz, ONE ); // 1 - z*z
	fltx4 fxsqysq = MaddSIMD( fy, fy, MulSIMD( fx,fx ) ); // x*x + y*y
	fw = SubSIMD( fw, fxsqysq ); // 1 - x*x - y*y - z*z
	fw = SqrtSIMD( fw ); // unfortunately we really do need full precision here

	fltx4 result = Compress4SIMD( fx, fy, fz, fw );
	// and for the coup de grace, set the sign bit of fw appropriately
	result = OrSIMD( result, (fltx4)wsignbit );

	return result;

#endif

#elif 0 // basic C implementation (which ends up being slower than writing the whole Q onto the stack and then reading it back at once)
	struct { float x; float y; float z; float w; } tmp;

	tmp.x = ((int)x - 1048576) * (1 / 1048576.5f);
	tmp.y = ((int)y - 1048576) * (1 / 1048576.5f);
	tmp.z = ((int)z - 1048576) * (1 / 1048576.5f);
	tmp.w = sqrt( 1 - tmp.x * tmp.x - tmp.y * tmp.y - tmp.z * tmp.z );
	if (wneg)
		tmp.w = -tmp.w;

	fltx4 ret = { tmp.x, tmp.y, tmp.z, tmp.w };
	return ret;
#else // naive implementation (which ends up being faster than the explicit c imp above)
	const QuaternionAligned q(Quaternion(*this)) ;
	return LoadAlignedSIMD( &q );
#endif
}
|
|
|
|
//=========================================================
|
|
// 48 bit Quaternion
|
|
//=========================================================
|
|
|
|
// Quaternion compressed to 48 bits: x and y keep 16 bits each, z keeps 15,
// and w stores only its sign — its magnitude is rebuilt from the unit
// quaternion constraint on decompression.
class Quaternion48
{
public:
	// Construction/destruction:
	Quaternion48(void);
	Quaternion48(vec_t X, vec_t Y, vec_t Z);

	// assignment
	// Quaternion& operator=(const Quaternion48 &vOther);
	Quaternion48& operator=(const Quaternion &vOther);
	operator Quaternion () const;
	inline fltx4 LoadUnalignedSIMD() const; // load onto a SIMD register without assumptions of being on a 16byte boundary

	//private:
	// Biased fixed-point fields (32768 / 32768 / 16384 encode zero) plus the
	// sign of w. Left public — external code (e.g. the SIMD unpacker) reads
	// these fields directly.
	unsigned short x:16;
	unsigned short y:16;
	unsigned short z:15;
	unsigned short wneg:1;
};
|
|
|
|
|
|
// Decompress to a full Quaternion. On SPU this routes through the SIMD
// unpacker and spills to an aligned temporary; elsewhere it decodes scalar.
inline Quaternion48::operator Quaternion () const
{
#if defined(__SPU__)
	fltx4 tmpV;
	QuaternionAligned tmpQ;

	// FIX: removed a redundant LoadUnalignedSIMD() + StoreAlignedSIMD pair
	// whose result was immediately overwritten by this unpack + store.
	tmpV = UnpackQuaternion48SIMD( this );
	StoreAlignedSIMD( (float *)&tmpQ, tmpV );

	return tmpQ;

#else

	Quaternion tmp;

	// Undo the bias; the .5 in the divisor keeps results strictly inside (-1, 1).
	tmp.x = ((int)x - 32768) * (1 / 32768.5);
	tmp.y = ((int)y - 32768) * (1 / 32768.5);
	tmp.z = ((int)z - 16384) * (1 / 16384.5);
	// w's magnitude is rebuilt from the unit-quaternion constraint;
	// only its sign was stored.
	tmp.w = sqrt( 1 - tmp.x * tmp.x - tmp.y * tmp.y - tmp.z * tmp.z );
	if (wneg)
		tmp.w = -tmp.w;
	return tmp;

#endif
}
|
|
|
|
|
|
// Compress a quaternion: x/y get 16 biased bits, z gets 15; keep only w's sign.
inline Quaternion48& Quaternion48::operator=(const Quaternion &vOther)
{
	CHECK_VALID(vOther);

	const int nBiasedX = (int)(vOther.x * 32768) + 32768;
	const int nBiasedY = (int)(vOther.y * 32768) + 32768;
	const int nBiasedZ = (int)(vOther.z * 16384) + 16384;

	x = clamp( nBiasedX, 0, 65535 );
	y = clamp( nBiasedY, 0, 65535 );
	z = clamp( nBiasedZ, 0, 32767 );
	// w is reconstructed from unit length on decode; store only its sign.
	wneg = (vOther.w < 0);
	return *this;
}
|
|
|
|
// Decode the packed 48-bit quaternion straight onto a SIMD register without
// assuming 16-byte alignment. On PS3 this unpacks the bit-fields with vector
// masks/shifts and rebuilds w = sqrt(1 - x² - y² - z²); elsewhere it falls
// back to the scalar conversion operator.
inline fltx4 Quaternion48::LoadUnalignedSIMD() const
{
#ifdef _PS3 // assume little endian packing

	// Per-field masks over the packed layout (x:16, y:16, z:15, wneg:1).
	const static u32x4 xmask = { 0x00000000, 0xffff0000, 0, 0 };
	const static u32x4 ymask = { 0x0000ffff, 0x00000000, 0, 0 };
	const static u32x4 zmask = { 0x7fff0000, 0x00000000, 0, 0 };
	const static u32x4 wmask = { 0x80000000, 0x00000000, 0, 0 };

	const u32x4 qbits = (u32x4)( ::LoadUnalignedSIMD( this ) ) ;
	// fish x, y, and z and put them into the the first words of their respective vec registers
	i32x4 ix = (i32x4) (ShiftLeftByBits<16>(vec_and( qbits, xmask )));
	i32x4 iy = (i32x4) ((vec_and( qbits, ymask )));
	i32x4 iz = (i32x4) (ShiftRightByBits<16>(vec_and( qbits, zmask )));

	// shift the w bit RIGHT so that it sits at the sign bit of the LAST word.
	i32x4 wsignbit = (i32x4) (ShiftRightByBits<96>(vec_and( qbits, wmask )));

	// convert each of the vectors from int to float. (because of the way the pipeline is organized,
	// it's as fast to do this as it would have been to do by combining them into one register above
	// and converting all at once.) Also, we can do the fixed point conversion in the vcfsx op. It'll
	// map us onto [0,2] which we'll shift to [-1,1] -- it includes the endpoints of unlike the float-
	// by-float conversion above, but the better stability of the vector quaternion ops makes that okay.
	const fltx4 ONE = LoadOneSIMD();
#if defined(__SPU__)
	fltx4 fx = SubSIMD( vec_ctf( ix, 15 ), ONE);
	fltx4 fy = SubSIMD( vec_ctf( iy, 15 ), ONE);
	fltx4 fz = SubSIMD( vec_ctf( iz, 14 ), ONE);
#else
	fltx4 fx = SubSIMD( vec_vcfsx( ix, 15 ), ONE);
	fltx4 fy = SubSIMD( vec_vcfsx( iy, 15 ), ONE);
	fltx4 fz = SubSIMD( vec_vcfsx( iz, 14 ), ONE);
#endif

	// compute the dot product
	fltx4 fw = MsubSIMD( fz, fz, ONE ); // 1 - z*z
	fltx4 fxsqysq = MaddSIMD( fy, fy, MulSIMD( fx,fx ) ); // x*x + y*y
	fw = SubSIMD( fw, fxsqysq ); // 1 - x*x - y*y - z*z
	fw = SqrtSIMD( fw ); // unfortunately we really do need full precision here

	fltx4 result = Compress4SIMD( fx, fy, fz, fw );
	// and for the coup de grace, set the sign bit of fw appropriately
	result = OrSIMD( result, (fltx4)wsignbit );

	return result;

#elif 0 // basic C implementation (which ends up being slower than writing the whole Q onto the stack and then reading it back at once)
	struct { float x; float y; float z; float w; } tmp;

	tmp.x = ((int)x - 32768) * (1 / 32768.5);
	tmp.y = ((int)y - 32768) * (1 / 32768.5);
	tmp.z = ((int)z - 16384) * (1 / 16384.5);
	tmp.w = sqrt( 1 - tmp.x * tmp.x - tmp.y * tmp.y - tmp.z * tmp.z );
	if (wneg)
		tmp.w = -tmp.w;

	fltx4 ret = { tmp.x, tmp.y, tmp.z, tmp.w };
	return ret;
#else // naive implementation (which ends up being faster than the explicit c imp above)
	const QuaternionAligned q(Quaternion(*this)) ;
	return LoadAlignedSIMD( &q );
#endif
}
|
|
|
|
|
|
//=========================================================
|
|
// 48 bit sorted Quaternion
|
|
//=========================================================
|
|
|
|
|
|
// "Sorted" 48-bit quaternion: the largest-magnitude component is dropped and
// rebuilt by sqrt() on decode (minimizing quantization error), and the three
// remaining components are stored consecutively starting at a rotation
// "offset" into xyzw.
class Quaternion48S
{
public:
	// Construction/destruction:
	Quaternion48S(void);
	Quaternion48S(vec_t X, vec_t Y, vec_t Z);

	// assignment
	// Quaternion& operator=(const Quaternion48 &vOther);
	Quaternion48S& operator=(const Quaternion &vOther);
	operator Quaternion () const;
	operator fltx4 () const RESTRICT ;

	//private:
	// shift the quaternion so that the largest value is recreated by the sqrt()
	// abcd maps modulo into quaternion xyzw starting at "offset"
	// "offset" is split into two 1 bit fields so that the data packs into 6 bytes (3 shorts)
	unsigned short a:15; // first of the 3 consecutive smallest quaternion elements
	unsigned short offsetH:1; // high bit of "offset"
	unsigned short b:15;
	unsigned short offsetL:1; // low bit of "offset"
	unsigned short c:15;
	unsigned short dneg:1; // sign of the largest quaternion element
};
|
|
|
|
#define SCALE48S 23168.0f // needs to fit 2*sqrt(0.5) into 15 bits.
|
|
#define SHIFT48S 16384 // half of 2^15 bits.
|
|
|
|
// Decompress to a full Quaternion: undo the bias/scale on the three stored
// components, scatter them to their xyzw slots via "offset", and rebuild the
// dropped (largest) component from the unit-quaternion constraint.
inline Quaternion48S::operator Quaternion () const
{
#if defined(__SPU__)

	fltx4 tmpV;
	QuaternionAligned tmpQ;

	tmpV = *this;
	StoreAlignedSIMD( (float *)&tmpQ, tmpV );

	return tmpQ;

#else

	Quaternion tmp;

	COMPILE_TIME_ASSERT( sizeof( Quaternion48S ) == 6 );

	// Treat the quaternion as an indexable float[4] so abcd can be mapped
	// modulo-4 onto xyzw starting at the stored offset.
	float *ptmp = &tmp.x;
	int ia = offsetL + offsetH * 2;
	int ib = ( ia + 1 ) % 4;
	int ic = ( ia + 2 ) % 4;
	int id = ( ia + 3 ) % 4;
	ptmp[ia] = ( (int)a - SHIFT48S ) * ( 1.0f / SCALE48S );
	ptmp[ib] = ( (int)b - SHIFT48S ) * ( 1.0f / SCALE48S );
	ptmp[ic] = ( (int)c - SHIFT48S ) * ( 1.0f / SCALE48S );
	// The largest component was dropped on compress; rebuild its magnitude
	// from unit length and reapply the stored sign.
	ptmp[id] = sqrt( 1.0f - ptmp[ia] * ptmp[ia] - ptmp[ib] * ptmp[ib] - ptmp[ic] * ptmp[ic] );
	if (dneg)
		ptmp[id] = -ptmp[id];

	return tmp;

#endif
}
|
|
|
|
// Compress a quaternion: drop the largest-magnitude component (rebuilt by
// sqrt() on decode), rotate the remaining three into abc via "offset", and
// quantize each to 15 biased bits.
inline Quaternion48S& Quaternion48S::operator=(const Quaternion &vOther)
{
	CHECK_VALID(vOther);

	const float *pSrc = &vOther.x;

	// Locate the largest-magnitude component so the sqrt() recreates it,
	// minimizing quantization error. Later equal elements win, matching the
	// original comparison order.
	int nLargest = 0;
	for ( int i = 1; i < 4; i++ )
	{
		if ( fabs( pSrc[nLargest] ) < fabs( pSrc[i] ) )
		{
			nLargest = i;
		}
	}

	int offset = ( nLargest + 1 ) % 4; // make "a" so that "d" is the largest element
	offsetL = offset & 1;
	offsetH = offset > 1;
	a = clamp( (int)(pSrc[ offset ] * SCALE48S) + SHIFT48S, 0, (int)(SCALE48S * 2) );
	b = clamp( (int)(pSrc[ ( offset + 1 ) % 4 ] * SCALE48S) + SHIFT48S, 0, (int)(SCALE48S * 2) );
	c = clamp( (int)(pSrc[ ( offset + 2 ) % 4 ] * SCALE48S) + SHIFT48S, 0, (int)(SCALE48S * 2) );
	dneg = ( pSrc[ ( offset + 3 ) % 4 ] < 0.0f );

	return *this;
}
|
|
|
|
|
|
// decode onto a SIMD register
|
|
// Decode onto a SIMD register. The fast path relies on the Altivec vperm
// (permute) op and is therefore PPC-only; other platforms fall back to the
// scalar decode plus an aligned load (and assert, since that is slow).
inline Quaternion48S::operator fltx4 () const RESTRICT
{
	AssertMsg1( (((uintp) this) & 1) == 0, "Quaternion48S is unaligned at %p\n", this );
#ifdef PLATFORM_PPC // this algorithm depends heavily on the Altivec permute op, for which there is no analogue in SSE. This function should not be used on PC.
	// define some vector constants. the shift-scale will be done as a fused multiply-add,
	// with the scale already distributed onto the shift (the part subtracted)
	const static fltx4 vrSCALE48S = { (1.0f / SCALE48S), (1.0f / SCALE48S), (1.0f / SCALE48S), (1.0f / SCALE48S) };
	const static fltx4 vrSHIFT48S = { ((float) -SHIFT48S) / SCALE48S, ((float) -SHIFT48S) / SCALE48S, ((float) -SHIFT48S) / SCALE48S, ((float) -SHIFT48S) / SCALE48S };

	// start by hoisting the q48 onto a SIMD word.
	u32x4 source = (u32x4) LoadUnalignedSIMD( this );
	const u32x4 ZERO = (u32x4) LoadZeroSIMD();
	// also hoist the offset into an int word. Hopefully this executes in parallel with the vector ops thanks to SUPERSCALAR!
	const unsigned int offset = offsetL | ( offsetH << 1 );
	const bi32x4 vDMask = (bi32x4) LoadAlignedSIMD( g_SIMD_ComponentMask[(offset+3)%4] ); // lets vsel poke D into the right word

#if 0 // This code can be used to deal with a situation where LoadUnalignedSIMD() fails to properly load
	// vectors starting on halfword boundaries (rather than 32-bit aligned). Because this is a 48-bit
	// structure, sometimes it'll only be 16-bit aligned. I expected that lvlx would always load from
	// a word boundary, requiring me to shift misaligned vectors over by 16 bits, but evidently,
	// lvlx actually works even on halfword boundaries. Who knew!
	// Anyway, this code is still here in case the problem crops up, as a hint to both cause and solution.
	if ( ((unsigned int) this) & 2 )
	{
		source = ShiftLeftByBits<16>(source);
	}
#endif

	// mask out the offset and dneg bits. Because of the packing #pragmas, the one-bit fields are actually at the MSB
	// of the halfwords, not the LSB as you might expect.
	ALIGN16 const static uint32 vMaskTopBits[4] = { 0x80008000, 0x80000000, 0, 0 }; // just the LSB of each the first three halfwords
	u32x4 abc = AndNotSIMD( (u32x4) LoadAlignedSIMD(vMaskTopBits), source ); // now this is just the A, B, C halfwords.
	// Next, unpack abc as unsigned numbers. We can do this with a permute op. In fact, we can exploit
	// the integer pipe and load the offset while we're loading the SIMD numbers, then use the integer offset to select
	// the permute, which will therefore also perform the rotate that maps abc to their rightful destinations.
	// the masks below are for the vperm instruction, which is a byte-by-byte mapping from source to destination.
	// it's assumed that the FIRST parameter to vperm will be ZERO, and the second the data. (that makes the masks a little clearer)
	// in the simplest case -- imagine each letter below represents one byte; the source vector looks like
	// AABB CCxx xxxx xxxx. We're going to permute it onto the work register like
	// 00AA 00BB 00CC 0000
	ALIGN16 const static uint32 vPermutations[4][4] = {
		// offset = 0 means a->x, b->y, c->z, d->w
		{ 0x00001011, 0x00001213, 0x00001415, 0x00000000 },
		// offset = 1 means a->y, b->z, c->w, d->a
		{ 0x00000000, 0x00001011, 0x00001213, 0x00001415 },
		{ 0x00001415, 0x00000000, 0x00001011, 0x00001213 },
		{ 0x00001213, 0x00001415, 0x00000000, 0x00001011 }
	};
	// compute two permutations on the input data: one where the zero-word is always in the w component,
	// which lets us do a 3-way rather than 4-way dot product; and another where the zero-word corresponds to
	// wherever D is supposed to go.
	// Even though this seems redundant, the duplicated work ends up fitting into the pipeline bubbles,
	// and the savings between a 4-way and 3-way dot seem to be about 3ns.
	u32x4 abcfordot = PermuteVMX( ZERO, abc, LoadAlignedSIMD( vPermutations[0] ) );
	abc = PermuteVMX( ZERO, abc, LoadAlignedSIMD( vPermutations[offset] ) );

	// turn each of the ints into floats. Because we masked out the one-bit field at the top,
	// We can think of this as a conversion from fixed-point where there's no fractional bit.
	// This is done in line with the shift-scale operation, which is itself fused.
	// we do this twice: once for the vector with the guaranteed zero w-word, and
	// once for the vector rotated by the offset.
	fltx4 vfDest = AndNotSIMD( vDMask, MaddSIMD( UnsignedFixedIntConvertToFltSIMD( abc, 0 ), vrSCALE48S, vrSHIFT48S ) );
	fltx4 vfDestForDot = MaddSIMD( UnsignedFixedIntConvertToFltSIMD( abcfordot, 0 ), vrSCALE48S, vrSHIFT48S ) ;
	// compute magnitude of the vector we know to have a 0 in the w word.
	const fltx4 vDot = Dot3SIMD( vfDestForDot, vfDestForDot );
	// recover the "D" word
	const fltx4 vD = SqrtSIMD( SubSIMD( LoadOneSIMD(), vDot ) );
	// mask D into the converted-and-offset vector, then return.
	return MaskedAssign( vDMask, dneg ? NegSIMD(vD) : vD, vfDest );
#else
	AssertMsg( false, "Quaternion48S::operator fltx4 is slow on this platform and should not be used.\n" );
	QuaternionAligned q( (Quaternion) *this );
	return LoadAlignedSIMD( &q );
#endif
}
|
|
|
|
|
|
//=========================================================
|
|
// 32 bit Quaternion
|
|
//=========================================================
|
|
|
|
// Quaternion compressed to 32 bits: x keeps 11 bits, y and z keep 10 each,
// and w stores only its sign — its magnitude is rebuilt from the unit
// quaternion constraint on decompression.
class Quaternion32
{
public:
	// Construction/destruction:
	Quaternion32(void);
	Quaternion32(vec_t X, vec_t Y, vec_t Z);

	// assignment
	// Quaternion& operator=(const Quaternion48 &vOther);
	Quaternion32& operator=(const Quaternion &vOther);
	operator Quaternion ();
	inline fltx4 LoadUnalignedSIMD() const; // load onto a SIMD register without assumptions of being on a 16byte boundary

private:
	// Biased fixed-point fields (1024 / 512 / 512 encode zero) plus the
	// sign of w; 32 bits total.
	unsigned int x:11;
	unsigned int y:10;
	unsigned int z:10;
	unsigned int wneg:1;
};
|
|
|
|
|
|
// Decompress to a full Quaternion. On SPU this routes through the SIMD
// decoder and spills to an aligned temporary; elsewhere it decodes scalar.
inline Quaternion32::operator Quaternion ()
{
#if defined(__SPU__)

	fltx4 tmpV;
	QuaternionAligned tmpQ;

	tmpV = LoadUnalignedSIMD();
	StoreAlignedSIMD( (float *)&tmpQ, tmpV );

	return tmpQ;

#else

	Quaternion tmp;

	// Undo the per-field bias (x: 11 bits / 1024, y and z: 10 bits / 512).
	tmp.x = ((int)x - 1024) * (1 / 1024.0);
	tmp.y = ((int)y - 512) * (1 / 512.0);
	tmp.z = ((int)z - 512) * (1 / 512.0);
	// w's magnitude is rebuilt from the unit-quaternion constraint;
	// only its sign was stored.
	tmp.w = sqrt( 1 - tmp.x * tmp.x - tmp.y * tmp.y - tmp.z * tmp.z );
	if (wneg)
		tmp.w = -tmp.w;
	return tmp;

#endif
}
|
|
|
|
// Compress a quaternion: x gets 11 biased bits, y and z get 10 each;
// keep only the sign of w.
inline Quaternion32& Quaternion32::operator=(const Quaternion &vOther)
{
	CHECK_VALID(vOther);

	const int nBiasedX = (int)(vOther.x * 1024) + 1024;
	const int nBiasedY = (int)(vOther.y * 512) + 512;
	const int nBiasedZ = (int)(vOther.z * 512) + 512;

	x = clamp( nBiasedX, 0, 2047 );
	y = clamp( nBiasedY, 0, 1023 );
	z = clamp( nBiasedZ, 0, 1023 );
	// w is reconstructed from unit length on decode; store only its sign.
	wneg = (vOther.w < 0);
	return *this;
}
|
|
|
|
|
|
|
|
// Decode the packed 32-bit quaternion straight onto a SIMD register without
// assuming 16-byte alignment. On PS3 this unpacks the bit-fields with vector
// masks/shifts and rebuilds w = sqrt(1 - x² - y² - z²); elsewhere it decodes
// scalar and builds the register from the stack.
inline fltx4 Quaternion32::LoadUnalignedSIMD() const
{
#ifdef _PS3 // assume little endian packing

	// Per-field masks over the packed layout (x:11, y:10, z:10, wneg:1).
	const static u32x4 xmask = { 0x000007ff, 0, 0, 0 };
	const static u32x4 ymask = { 0x001ff800, 0, 0, 0 };
	const static u32x4 zmask = { 0x7fe00000, 0, 0, 0 };
	const static u32x4 wmask = { 0x80000000, 0, 0, 0 };

	const u32x4 qbits = (u32x4)( ::LoadUnalignedSIMD( this ) ) ;
	// fish x, y, and z and put them into the the first words of their respective vec registers
	i32x4 ix = (i32x4) ((vec_and( qbits, xmask )));
	i32x4 iy = (i32x4) (ShiftRightByBits<11>(vec_and( qbits, ymask )));
	i32x4 iz = (i32x4) (ShiftRightByBits<21>(vec_and( qbits, zmask )));

	// shift the w bit RIGHT so that it sits at the sign bit of the LAST word.
	i32x4 wsignbit = (i32x4) (ShiftRightByBits<96>(vec_and( qbits, wmask )));

	// convert each of the vectors from int to float. (because of the way the pipeline is organized,
	// it's as fast to do this as it would have been to do by combining them into one register above
	// and converting all at once.) Also, we can do the fixed point conversion in the vcfsx op. It'll
	// map us onto [0,2] which we'll shift to [-1,1] -- it includes the endpoints of unlike the float-
	// by-float conversion above, but the better stability of the vector quaternion ops makes that okay.
	const fltx4 ONE = LoadOneSIMD();
#if defined(__SPU__)
	fltx4 fx = SubSIMD( vec_ctf( ix, 10 ), ONE);
	fltx4 fy = SubSIMD( vec_ctf( iy, 9 ), ONE);
	fltx4 fz = SubSIMD( vec_ctf( iz, 9 ), ONE);
#else
	fltx4 fx = SubSIMD( vec_vcfsx( ix, 10 ), ONE);
	// BUGFIX: y is a 10-bit field decoded with scale 512 = 2^9, so the
	// fixed-point shift must be 9 — matching the SPU branch above and the
	// scalar fallback below. This previously read vec_vcfsx( iy, 10 ).
	fltx4 fy = SubSIMD( vec_vcfsx( iy, 9 ), ONE);
	fltx4 fz = SubSIMD( vec_vcfsx( iz, 9 ), ONE);
#endif

	// compute the dot product
	fltx4 fw = MsubSIMD( fz, fz, ONE ); // 1 - z*z
	fltx4 fxsqysq = MaddSIMD( fy, fy, MulSIMD( fx,fx ) ); // x*x + y*y
	fw = SubSIMD( fw, fxsqysq ); // 1 - x*x - y*y - z*z
	fw = SqrtSIMD( fw ); // unfortunately we really do need full precision here

	fltx4 result = Compress4SIMD( fx, fy, fz, fw );
	// and for the coup de grace, set the sign bit of fw appropriately
	result = OrSIMD( result, (fltx4)wsignbit );

	return result;

#else

	struct { float x; float y; float z; float w; } tmp;

	tmp.x = ((int)x - 1024) * (1 / 1024.0);
	tmp.y = ((int)y - 512) * (1 / 512.0);
	tmp.z = ((int)z - 512) * (1 / 512.0);
	tmp.w = sqrt( 1 - tmp.x * tmp.x - tmp.y * tmp.y - tmp.z * tmp.z );
	if (wneg)
		tmp.w = -tmp.w;

	fltx4 ret = { tmp.x, tmp.y, tmp.z, tmp.w };
	return ret;

#endif
}
|
|
|
|
|
|
//=========================================================
|
|
// 16 bit float
|
|
//=========================================================
|
|
|
|
|
|
// IEEE 754 exponent biases: 127 for 32-bit single precision, 15 for the
// 16-bit half-precision format implemented below.
const int float32bias = 127;
const int float16bias = 15;

// Largest finite value representable as a 16-bit half-float.
const float maxfloat16bits = 65504.0f;
|
|
|
|
// 16-bit half-precision float: 1 sign bit, 5 exponent bits (bias 15), 10 mantissa bits.
// Pure storage class -- deliberately has no constructors or implicit conversions so it
// stays a POD and can live inside packed vector types (Vector48, Vector2d32).
// Use float16_with_assign when you want convenient float conversion semantics.
class float16
{
public:
//	float16() {};
	//float16( float f ) { m_storage.rawWord = ConvertFloatTo16bits(f); }

	// Assign the raw 16-bit pattern directly (no conversion).
	float16& operator=(const unsigned short &other) { m_storage.rawWord = other; return *this; };

	void Init() { m_storage.rawWord = 0; }
//	float16& operator=(const float16 &other) { m_storage.rawWord = other.m_storage.rawWord; return *this; }
//	float16& operator=(const float &other) { m_storage.rawWord = ConvertFloatTo16bits(other); return *this; }
//	operator unsigned short () { return m_storage.rawWord; }
//	operator float () { return Convert16bitFloatTo32bits( m_storage.rawWord ); }

	// Raw bit pattern accessor (no conversion).
	unsigned short GetBits() const
	{
		return m_storage.rawWord;
	}
	// Expand to a 32-bit float.
	float GetFloat() const
	{
		return Convert16bitFloatTo32bits( m_storage.rawWord );
	}
	// Compress a 32-bit float into the 16-bit storage (saturating; see ConvertFloatTo16bits).
	void SetFloat( float in )
	{
		m_storage.rawWord = ConvertFloatTo16bits( in );
	}

	// biased_exponent 31 with a zero mantissa is the half-precision infinity encoding.
	bool IsInfinity() const
	{
		return m_storage.bits.biased_exponent == 31 && m_storage.bits.mantissa == 0;
	}
	// biased_exponent 31 with a nonzero mantissa is the half-precision NaN encoding.
	bool IsNaN() const
	{
		return m_storage.bits.biased_exponent == 31 && m_storage.bits.mantissa != 0;
	}

	// Bitwise equality -- note this is NOT float equality (e.g. +0 != -0 here).
	bool operator==(const float16 other) const { return m_storage.rawWord == other.m_storage.rawWord; }
	bool operator!=(const float16 other) const { return m_storage.rawWord != other.m_storage.rawWord; }

//	bool operator< (const float other) const	   { return GetFloat() < other; }
//	bool operator> (const float other) const	   { return GetFloat() > other; }

	template< bool BRANCHLESS > // allows you to force branchy/branchless implementation regardless of the current platform
	static unsigned short ConvertFloatTo16bitsNonDefault( float input );
	static float Convert16bitFloatTo32bits( unsigned short input );

	// a special case useful for the pixel writer: take four input float values, which are already in memory (not on registers),
	// convert them all at once and write them sequentially through the output pointer.
	// NOTE: the definition is currently compiled out (#if 0) because its isel() usage is malformed.
	static void ConvertFourFloatsTo16BitsAtOnce( float16 * RESTRICT pOut,
		const float *a, const float *b, const float *c, const float *d );

	// unfortunately, function templates can't have default template parameters in 2010-era C++
	inline static unsigned short ConvertFloatTo16bits( float input )
	{ // default to branchless on ppc and branchy on x86
#ifdef PLATFORM_PPC
		return ConvertFloatTo16bitsNonDefault<true>(input);
#else
		return ConvertFloatTo16bitsNonDefault<false>(input);
#endif
	}

protected:
	// Overlay of an IEEE 754 single-precision float and its bit fields.
	union float32bits
	{
		float rawFloat;
		uint32 rawAsInt;
		struct
		{
			unsigned int mantissa : 23;
			unsigned int biased_exponent : 8;
			unsigned int sign : 1;
		} bits;
	};

	// Overlay of the half-precision word and its bit fields.
	union float16bits
	{
		unsigned short rawWord;
		struct
		{
			unsigned short mantissa : 10;
			unsigned short biased_exponent : 5;
			unsigned short sign : 1;
		} bits;
	};

	static bool IsNaN( float16bits in )
	{
		return in.bits.biased_exponent == 31 && in.bits.mantissa != 0;
	}
	static bool IsInfinity( float16bits in )
	{
		return in.bits.biased_exponent == 31 && in.bits.mantissa == 0;
	}

	// 0x0001 - 0x03ff
	float16bits m_storage;
};
|
|
|
|
class float16_with_assign : public float16
|
|
{
|
|
public:
|
|
float16_with_assign() {}
|
|
float16_with_assign( float f ) { m_storage.rawWord = ConvertFloatTo16bits(f); }
|
|
|
|
float16& operator=(const float16 &other) { m_storage.rawWord = ((float16_with_assign &)other).m_storage.rawWord; return *this; }
|
|
float16& operator=(const float &other) { m_storage.rawWord = ConvertFloatTo16bits(other); return *this; }
|
|
// operator unsigned short () const { return m_storage.rawWord; }
|
|
operator float () const { return Convert16bitFloatTo32bits( m_storage.rawWord ); }
|
|
};
|
|
|
|
//=========================================================
|
|
// Fit a 3D vector in 48 bits
|
|
//=========================================================
|
|
|
|
// Packed 3D vector: three half-precision floats, 48 bits (6 bytes) total.
// Lossy: each component carries ~10 bits of mantissa and saturates at +/-65504.
class Vector48
{
public:
	// Construction/destruction:
	Vector48(void) {}
	Vector48(vec_t X, vec_t Y, vec_t Z) { x.SetFloat( X ); y.SetFloat( Y ); z.SetFloat( Z ); }

	// assignment
	Vector48& operator=(const Vector &vOther);
	operator Vector ();

	// Component access by index (0=x, 1=y, 2=z); relies on x,y,z being
	// laid out contiguously as three float16s. No bounds checking.
	const float operator[]( int i ) const { return (((float16 *)this)[i]).GetFloat(); }

	float16 x;
	float16 y;
	float16 z;
};
|
|
|
|
// The uses of isel below are malformed because the first expression is unsigned and thus always >= 0,
// so this whole expression maps to a simple assignment. This was found through a noisy clang
// warning. I am preprocessing this out until it is needed.
// NOTE: the #error directives inside are deliberate -- they fire if anyone re-enables
// this block without first fixing the isel comparisons.
#if 0
inline void float16::ConvertFourFloatsTo16BitsAtOnce( float16 * RESTRICT pOut,
	const float *a, const float *b, const float *c, const float *d )
{
	COMPILE_TIME_ASSERT( sizeof(float) == 4 );
	// being meant for use on the PPC, this is tuned for that.
	// it is mostly branchless, except for the large outer for loop,
	// since there's enough instructions inside that unrolling is
	// a bad idea. This function is four-at-once to simplify SIMDifying in the
	// future should a convenient SIMD way to decimate emerge
	// Also, because this is only used for the special case of converting
	// float arrays into float16 GPU textures, this turns denorms into zeroes
	// and infinities into MAXFLTs, since the shader can't deal with nonfinite
	// numbers anyway.

	// alias the input floats onto a union giving their mantissa etc
	const float32bits * const inFloat[4] = {
		reinterpret_cast<const float32bits *>(a),
		reinterpret_cast<const float32bits *>(b),
		reinterpret_cast<const float32bits *>(c),
		reinterpret_cast<const float32bits *>(d) };

	const static unsigned int maxfloat16bitsAsInt = 0x477FE000; // 65504.0f

	const static unsigned int SIGNBIT = 0x80000000;

	for ( int i = 0 ; i < 4 ; ++i ) // performs better not unrolled (less stack spilling)
	{
		unsigned int onGPR = inFloat[i]->rawAsInt;

		// make a mask for each word; will be all 1's if the float is
		// negative, all 0s if it is positive. Can do this just by
		// using arithmetic shift to smear out the sign bit.
		int isNegative = ((int) onGPR) >> 31;

		// clamp to be within -maxfloat16bits, maxfloat16bits
		// can't just use isel because IEEE754 floats are sign-magnitude, not two's comp. However,
		// positive IEEE754s can be compared as if they were ints. So, we need to do a little extra
		// work to test the negative case efficiently.
		// clamp to -maxfloat16
#error See above for explanation of why this and other uses of isel in this file are broken.
		int clampedNeg = isel( ((int)(onGPR & ~SIGNBIT)) - maxfloat16bitsAsInt,  // -in >= maxfloatbits  so  in <= -maxfloat
							maxfloat16bitsAsInt | SIGNBIT,  // -65504.0f
							onGPR );
		// clamp to +maxfloat16
		int clampedPos = isel( ((int)(onGPR)) - maxfloat16bitsAsInt,  // in >= maxfloatbits
							maxfloat16bitsAsInt,  // -65504.0f
							onGPR );

		// take advantage of PPC's andc operator to effectively do a masked-move
		onGPR = ( clampedNeg & isNegative ) | ( clampedPos & ~isNegative );

		// fish out the input exponent and mantis fields directly (using the union induces an LHS)
		int inExponent = (onGPR & 0x7f800000) >> 23;
		unsigned int inMantissa = (onGPR & 0x007FFFFF);

		int exponent = inExponent - 127 + 15; // rebias the exponent
		unsigned int mantissa = isel( exponent, inMantissa >> 13, (unsigned) 0 ); // squash the mantissa to zero if the number is too small to represent (no denorms)

		float16bits output;
		// saturate the mantissa if rebiased exponent >= 31 (too big to store)
		output.bits.mantissa = isel( exponent - 31, (unsigned) 0x3ff, mantissa );
		// clamp the exponent to 0..30
		output.bits.biased_exponent = isel( exponent, isel( exponent - 31, 30, exponent ), 0 );
		output.bits.sign = isNegative; // this doesn't lhs, but instead issues the insrdi op to a word on GPR
		pOut[i].m_storage.rawWord = output.rawWord;
	}
}
#endif
|
|
|
|
#ifdef _X360
|
|
#define __cntlzw _CountLeadingZeros
|
|
#endif
|
|
|
|
template< bool BRANCHLESS >
|
|
inline unsigned short float16::ConvertFloatTo16bitsNonDefault( float input )
|
|
{
|
|
float16bits output;
|
|
float32bits inFloat;
|
|
//if ( !BRANCHLESS ) // x86 code
|
|
{
|
|
if ( input > maxfloat16bits )
|
|
input = maxfloat16bits;
|
|
else if ( input < -maxfloat16bits )
|
|
input = -maxfloat16bits;
|
|
|
|
|
|
inFloat.rawFloat = input;
|
|
|
|
}
|
|
/*
|
|
// The use of isel is incorrect because the first expression is unsigned and therefore always passes
|
|
// the test.
|
|
else // PPC code
|
|
{
|
|
// force the float onto the stack and then a GPR so we eat the LHS only once.
|
|
// you can't just write to one union member and then read back another;
|
|
// the compiler is inconsistent about supporting that kind of type-punning.
|
|
// (ie, it will work in one file, but not another.)
|
|
memcpy(&inFloat.rawFloat, &input, sizeof(inFloat.rawFloat));
|
|
// inFloat.rawFloat = input;
|
|
// clamp using the GPR
|
|
{
|
|
const unsigned int maxfloat16bitsAsInt = 0x477FE000; // 65504.0f
|
|
// clamp to be <= maxfloat16bits
|
|
uint32 &rawint = inFloat.rawAsInt; // <--- lhs
|
|
if ( rawint & 0x80000000 ) // negative
|
|
{
|
|
// because floats are sign-magnitude, not two's comp, need to
|
|
// flip the int positive briefly to do the isel comparison
|
|
#error See above for explanation of why this and other uses of isel in this file are broken.
|
|
rawint = isel( ((int)(rawint & ~0x80000000)) - maxfloat16bitsAsInt, // -in >= maxfloatbits so in <= -maxfloat
|
|
maxfloat16bitsAsInt | 0x80000000, // -65504.0f
|
|
rawint );
|
|
}
|
|
else // positive
|
|
{
|
|
rawint = isel( ((int)(rawint)) - maxfloat16bitsAsInt, // in >= maxfloatbits
|
|
maxfloat16bitsAsInt , // -65504.0f
|
|
rawint );
|
|
}
|
|
}
|
|
}
|
|
*/
|
|
output.bits.sign = inFloat.bits.sign;
|
|
|
|
if ( (inFloat.bits.biased_exponent==0) )
|
|
{
|
|
// zero and denorm both map to zero
|
|
output.bits.mantissa = 0;
|
|
output.bits.biased_exponent = 0;
|
|
}
|
|
else if ( inFloat.bits.biased_exponent==0xff )
|
|
{
|
|
if ( !BRANCHLESS )
|
|
{
|
|
if ( (inFloat.bits.mantissa==0) )
|
|
{
|
|
/*
|
|
// infinity
|
|
output.bits.mantissa = 0;
|
|
output.bits.biased_exponent = 31;
|
|
*/
|
|
|
|
// infinity maps to maxfloat
|
|
output.bits.mantissa = 0x3ff;
|
|
output.bits.biased_exponent = 0x1e;
|
|
}
|
|
else if ( (inFloat.bits.mantissa!=0) )
|
|
{
|
|
/*
|
|
// NaN
|
|
output.bits.mantissa = 1;
|
|
output.bits.biased_exponent = 31;
|
|
*/
|
|
|
|
// NaN maps to zero
|
|
output.bits.mantissa = 0;
|
|
output.bits.biased_exponent = 0;
|
|
}
|
|
}
|
|
else // branchless, only meant for PPC really bc needing the cntlzw op.
|
|
{
|
|
// else if ( inFloat.bits.biased_exponent==0xff ) // either infinity (biased_exponent is 0xff) or NaN.
|
|
{
|
|
#ifdef PLATFORM_PPC
|
|
#if defined(__SPU__)
|
|
int mantissamask = __builtin_clz( output.bits.mantissa ) - 32; // this is 0 if the mantissa is zero, and negative otherwise
|
|
#else
|
|
int mantissamask = __cntlzw( output.bits.mantissa ) - 32; // this is 0 if the mantissa is zero, and negative otherwise
|
|
#endif
|
|
#else
|
|
int mantissamask = output.bits.mantissa ? -1 : 0;
|
|
#endif
|
|
output.bits.mantissa = isel( mantissamask, 0x3ff, 0 ); //infinity maps to maxfloat, NaN to zero
|
|
output.bits.biased_exponent = isel( mantissamask, 0x1e, 0 );
|
|
output.bits.sign = inFloat.bits.sign;
|
|
}
|
|
}
|
|
}
|
|
else
|
|
{
|
|
// regular number
|
|
int new_exp = inFloat.bits.biased_exponent-float32bias;
|
|
// it's actually better to branch in these cases on PPC,
|
|
// because the variable bit shift is such a massive penalty
|
|
// that it's worth a branch penalty to avoid it.
|
|
if (new_exp<-24)
|
|
{
|
|
// this maps to 0
|
|
output.bits.mantissa = 0;
|
|
output.bits.biased_exponent = 0;
|
|
}
|
|
|
|
if (new_exp<-14)
|
|
{
|
|
// this maps to a denorm
|
|
output.bits.biased_exponent = 0;
|
|
unsigned int exp_val = ( unsigned int )( -14 - new_exp );
|
|
if( exp_val > 0 && exp_val < 11 )
|
|
{
|
|
output.bits.mantissa = ( 1 << ( 10 - exp_val ) ) + ( inFloat.bits.mantissa >> ( 13 + exp_val ) );
|
|
}
|
|
}
|
|
else if (new_exp>15)
|
|
{
|
|
#if 0
|
|
// map this value to infinity
|
|
output.bits.mantissa = 0;
|
|
output.bits.biased_exponent = 31;
|
|
#else
|
|
// to big. . . maps to maxfloat
|
|
output.bits.mantissa = 0x3ff;
|
|
output.bits.biased_exponent = 0x1e;
|
|
#endif
|
|
}
|
|
else
|
|
{
|
|
output.bits.biased_exponent = new_exp+15;
|
|
output.bits.mantissa = (inFloat.bits.mantissa >> 13);
|
|
}
|
|
|
|
|
|
}
|
|
return output.rawWord;
|
|
}
|
|
|
|
inline float float16::Convert16bitFloatTo32bits( unsigned short input )
|
|
{
|
|
float32bits output;
|
|
const float16bits &inFloat = *((float16bits *)&input);
|
|
|
|
if( IsInfinity( inFloat ) )
|
|
{
|
|
return maxfloat16bits * ( ( inFloat.bits.sign == 1 ) ? -1.0f : 1.0f );
|
|
}
|
|
if( IsNaN( inFloat ) )
|
|
{
|
|
return 0.0;
|
|
}
|
|
if( inFloat.bits.biased_exponent == 0 && inFloat.bits.mantissa != 0 )
|
|
{
|
|
// denorm
|
|
const float half_denorm = (1.0f/16384.0f); // 2^-14
|
|
float mantissa = ((float)(inFloat.bits.mantissa)) / 1024.0f;
|
|
float sgn = (inFloat.bits.sign)? -1.0f :1.0f;
|
|
output.rawFloat = sgn*mantissa*half_denorm;
|
|
}
|
|
else
|
|
{
|
|
// regular number
|
|
unsigned mantissa = inFloat.bits.mantissa;
|
|
unsigned biased_exponent = inFloat.bits.biased_exponent;
|
|
unsigned sign = ((unsigned)inFloat.bits.sign) << 31;
|
|
biased_exponent = ( (biased_exponent - float16bias + float32bias) * (biased_exponent != 0) ) << 23;
|
|
mantissa <<= (23-10);
|
|
|
|
*((unsigned *)&output) = ( mantissa | biased_exponent | sign );
|
|
}
|
|
|
|
return output.rawFloat;
|
|
}
|
|
|
|
|
|
|
|
|
|
// Compress a full-precision Vector into the packed 48-bit representation.
inline Vector48& Vector48::operator=(const Vector &vOther)
{
	CHECK_VALID(vOther);

	// each component independently squeezed down to a half-precision float
	z.SetFloat( vOther.z );
	y.SetFloat( vOther.y );
	x.SetFloat( vOther.x );

	return *this;
}
|
|
|
|
|
|
// Widen the packed 16-bit components back into a full-precision Vector.
inline Vector48::operator Vector ()
{
	Vector expanded;

	expanded.x = x.GetFloat();
	expanded.y = y.GetFloat();
	expanded.z = z.GetFloat();

	return expanded;
}
|
|
|
|
//=========================================================
|
|
// Fit a 2D vector in 32 bits
|
|
//=========================================================
|
|
|
|
// Packed 2D vector: two half-precision floats, 32 bits total.
// Uses float16_with_assign so components support direct float assignment/conversion.
class Vector2d32
{
public:
	// Construction/destruction:
	Vector2d32(void) {}
	Vector2d32(vec_t X, vec_t Y) { x.SetFloat( X ); y.SetFloat( Y ); }

	// assignment (the Vector overload takes only the x,y components; defined elsewhere)
	Vector2d32& operator=(const Vector &vOther);
	Vector2d32& operator=(const Vector2D &vOther);

	operator Vector2D ();

	void Init( vec_t ix = 0.f, vec_t iy = 0.f);

	float16_with_assign x;
	float16_with_assign y;
};
|
|
|
|
// Compress a Vector2D into the packed 32-bit representation.
inline Vector2d32& Vector2d32::operator=(const Vector2D &vOther)
{
	// Init performs the half-precision conversion for both components
	Init( vOther.x, vOther.y );
	return *this;
}
|
|
|
|
// Widen the packed halves back into a full-precision Vector2D.
inline Vector2d32::operator Vector2D ()
{
	Vector2D expanded;

	expanded.x = x.GetFloat();
	expanded.y = y.GetFloat();

	return expanded;
}
|
|
|
|
// Set both components, compressing each to half precision.
inline void Vector2d32::Init( vec_t ix, vec_t iy )
{
	// float16_with_assign::operator=(float) performs the 16-bit conversion
	x = ix;
	y = iy;
}
|
|
|
|
|
|
|
|
|
|
|
|
//=========================================================
|
|
// FAST SIMD BATCH OPERATIONS
|
|
//=========================================================
|
|
|
|
#ifdef _X360
|
|
//// Compressed vector formats: unpack Vector48 and Quaternion48 onto SIMD registers.
|
|
// Only available on 360 for now because SSE1 lacks the necessary operations. SSE2 could
|
|
// do it but we can't count on that yet.
|
|
// If you have many v48's or q48's to stream, please note the functions designed to
|
|
// work on them many at a time.
|
|
|
|
extern const uint16 ALIGN16 g_SIMD_Quat48_Unpack_Shift[]; //< Shuffles the z component of the quat48 left by one bit.
|
|
extern const uint8 ALIGN16 g_SIMD_Quat48_Unpack_Permute0[16];
|
|
extern const fltx4 g_SIMD_Quat48_Unpack_Magic_Constants;
|
|
extern const uint8 ALIGN16 g_SIMD_Quat48_Unpack_Permute1[16];
|
|
extern const uint8 ALIGN16 g_SIMD_Quat48_Unpack_Permute2[16];
|
|
extern const uint8 ALIGN16 g_SIMD_Quat48_Unpack_Permute3[16];
|
|
extern const uint8 ALIGN16 g_SIMD_Quat48_Unpack_Permute4[16];
|
|
extern const uint8 ALIGN16 g_SIMD_Quat48_Unpack_Permute5[16];
|
|
extern const uint8 ALIGN16 g_SIMD_Quat48_Unpack_Permute6[16];
|
|
extern const uint8 ALIGN16 g_SIMD_Quat48_Unpack_Permute7[16];
|
|
|
|
// unpack a single vector48 at the pointer into the x,y,z components of a fltx4.
// the w is total garbage -- callers must not read it.
FORCEINLINE fltx4 UnpackVector48SIMD( const Vector48 *pVec )
{
	// load the three 16-bit floats into the first 48 bits of ret:
	fltx4 ret = XMLoadVector4((const void *)&pVec->x);
	// shuffle the top 64 bits of ret down to the least significant (the z,w) -- 16 of those bits are garbage.
	ret = __vrlimi( ret, ret, 2 | 1, 2 ); // rotate left by 2 words and insert into z,w components
	// now unpack the 16-bit floats into 32-bit floats. This is a hardware op, woohoo!
	ret = __vupkd3d( ret , VPACK_FLOAT16_4 );

	return ret;
}
|
|
|
|
// unpack a single Quaternion48 at the pointer into the x,y,z,w components of a fltx4
// FIXME!!! If we need a version of this that runs on 360, there is a work-in-progress version that hasn't been debugged lower in the file.
FORCEINLINE fltx4 UnpackQuaternion48SIMD( const Quaternion48 * RESTRICT pVec )
{
	// A quaternion 48 stores the x and y components as 0..65535 , which is almost mapped onto -1.0..1.0 via (x - 32768) / 32768.5 .
	// z is stored as 0..32767, which is almost mapped onto -1..1 via (z - 16384) / 16384.5 .
	// w is inferred from 1 - the dot product of the other three components, i.e. w = sqrt(1 - x*x - y*y - z*z).
	// The top bit of what would otherwise be the 16-bit z is w's sign bit.
	fltx4 q16s = XMLoadVector3((const void *)pVec);
	fltx4 shift = __lvx(&g_SIMD_Quat48_Unpack_Shift, 0); // load the aligned shift mask that we use to shuffle z.
	fltx4 permute = __lvx(&g_SIMD_Quat48_Unpack_Permute0, 0); // load the permute word that shuffles x,y,z into their own words
	bool wneg = pVec->wneg; // loading pVec into two different kinds of registers -- but not shuffling between (I hope!) so no LHS.

	q16s = __vperm( q16s, Four_Threes, permute ); // permute so that x, y, and z are now each in their own words. The top half is the floating point rep of 3.0f
	q16s = __vslh(q16s, shift); // shift the z component left by one bit, tossing out the wneg sign bit and mapping z from [0..2^15) to [0..2^16)

	// each word of q16s contains 3.0 + n * 2^-22 -- convert this so that we get numbers on the range -1..1
	const fltx4 vUpkMul = SplatXSIMD(g_SIMD_Quat48_Unpack_Magic_Constants); // { UnpackMul16s, UnpackMul16s, UnpackMul16s, UnpackMul16s };
	const fltx4 vUpkAdd = SplatYSIMD(g_SIMD_Quat48_Unpack_Magic_Constants);

	/*
	fltx4 ret = __vcfux( q16s, 0 ); // convert from uint16 to floats.

	// scale from 0..65535 to -1..1 : tmp.x = ((int)x - 32768) * (1 / 32768.0);
	ret = __vmaddfp( ret, g_SIMD_Quat48_DivByU15, Four_NegativeOnes );
	*/
	fltx4 ret = __vmaddfp( q16s, vUpkMul, vUpkAdd );

	// now, work out what w must be.
	fltx4 dotxyz = Dot3SIMD( ret, ret ); // all components are dot product of ret w/ self.
	// clamp so the 1 - dotxyz below can't go negative from quantization error
	dotxyz = ClampVectorSIMD( dotxyz, Four_Zeros, Four_Ones );

	fltx4 ww = SubSIMD( Four_Ones, dotxyz ); // all components are 1 - dotxyz
	ww = SqrtSIMD(ww); // all components are sqrt(1-dotxyz)
	if (wneg)
	{
		ret = __vrlimi( ret, NegSIMD(ww), 1, 0 ); // insert one element from the ww vector into the w component of ret
	}
	else
	{
		ret = __vrlimi( ret, ww, 1, 0 ); // insert one element from the ww vector into the w component of ret
	}
	return ret;
}
|
|
|
|
// Many-at-a-time unpackers.
|
|
|
|
|
|
/// Unpack eight consecutive Vector48's in memory onto eight SIMD registers.
/// The Vector48 pointer must be 16-byte aligned. Eight Vector48s add up
/// to 48 bytes long. You should maybe think about prefetching.
/// The w component of each output register is garbage and must be ignored.
FORCEINLINE void UnpackEightVector48SIMD( fltx4 &out1, fltx4 &out2, fltx4 &out3, fltx4 &out4,
										  fltx4 &out5, fltx4 &out6, fltx4 &out7, fltx4 &out8,
										  Vector48 * RESTRICT pVecs )
{
	AssertMsg((reinterpret_cast<unsigned int>(pVecs) & 0x0F) == 0, "Input to UnpackEightVector48SIMD is not 16-byte aligned." );

	// first load the data onto three packed SIMD vectors, which contain eight Vector48s between them.
	// I've named them very explicitly so you can follow the movement of the input data.
	fltx4 x0y0z0x1y1z1x2y2, z2x3y3z3x4y4z4x5, y5z5x6y6z6x7y7z7;
	x0y0z0x1y1z1x2y2 = __lvx( pVecs, 0  ); // load reintrepret_cast<fltx 4 *>(pVecs) + 0
	z2x3y3z3x4y4z4x5 = __lvx( pVecs, 16 ); // load reintrepret_cast<fltx 4 *>(pVecs) + 1
	y5z5x6y6z6x7y7z7 = __lvx( pVecs, 32 ); // load reintrepret_cast<fltx 4 *>(pVecs) + 2

	// Now, start unpacking. The __vupkd3d operation can turn 16-bit floats into 32-bit floats in a single op!
	// It converts the contents of the z and w words of the input fltx4 , so we need to process a word to do
	// one half, then rotate it to do the other half.
	fltx4 y1z1x2y2 = __vupkd3d( x0y0z0x1y1z1x2y2 , VPACK_FLOAT16_4 );
	x0y0z0x1y1z1x2y2 = __vrlimi( x0y0z0x1y1z1x2y2, x0y0z0x1y1z1x2y2, 0xf, 2 ); // actually y1z1x2y2x0y0z0x1 now. For perf it's important that the first param to vrlimi also be the assignee.
	fltx4 x4y4z4x5 = __vupkd3d( z2x3y3z3x4y4z4x5 , VPACK_FLOAT16_4 );
	z2x3y3z3x4y4z4x5 = __vrlimi( z2x3y3z3x4y4z4x5, z2x3y3z3x4y4z4x5, 0xf, 2 );
	fltx4 z6x7y7z7 = __vupkd3d( y5z5x6y6z6x7y7z7 , VPACK_FLOAT16_4 );
	y5z5x6y6z6x7y7z7 = __vrlimi( y5z5x6y6z6x7y7z7, y5z5x6y6z6x7y7z7, 0xf, 2 );
	fltx4 x0y0z0x1 = __vupkd3d( x0y0z0x1y1z1x2y2 , VPACK_FLOAT16_4 );
	fltx4 z2x3y3z3 = __vupkd3d( z2x3y3z3x4y4z4x5 , VPACK_FLOAT16_4 );
	fltx4 y5z5x6y6 = __vupkd3d( y5z5x6y6z6x7y7z7 , VPACK_FLOAT16_4 );

	// permute to populate the out-registers with part of their vectors:
	out1 = x0y0z0x1; // DONE
	out2 = __vpermwi( y1z1x2y2, VPERMWI_CONST(0, 0, 1, 0) ); // __y1z1__
	out3 = __vpermwi( y1z1x2y2, VPERMWI_CONST(2, 3, 0, 0) ); // x2y2____
	out4 = __vpermwi( z2x3y3z3, VPERMWI_CONST(1, 2, 3, 0) ); // x3y3z3__ // DONE
	out5 = x4y4z4x5; // DONE
	out6 = __vpermwi( y5z5x6y6, VPERMWI_CONST(0, 0, 1, 0) ); // __y5z5__
	out7 = __vpermwi( y5z5x6y6, VPERMWI_CONST(2, 3, 0, 0) ); // x6y6____
	out8 = __vpermwi( z6x7y7z7, VPERMWI_CONST(1, 2, 3, 0) ); // x7y7z7__ // DONE

	// there are four more to finish, which we do with a masked insert
	out2 = __vrlimi( out2, x0y0z0x1, 8, 3 ); // x1y1z1__
	out3 = __vrlimi( out3, z2x3y3z3, 2, 2 ); // x2y2x2__
	out6 = __vrlimi( out6, x4y4z4x5, 8, 3 ); // x5y5z5__
	out7 = __vrlimi( out7, z6x7y7z7, 2, 2 ); // x6y6z6__

	// and we're done!
}
|
|
|
|
|
|
|
|
/// Unpack eight consecutive Quaternion48's in memory onto eight SIMD registers.
/// The Quaternion48 pointer must be 16-byte aligned. Eight Quaternion48s add up
/// to 48 bytes long. You should maybe think about prefetching.
//
// This could be improved with verticalization, so that the W sqrts happen
// on two rather than eight vectors, and then transposing. This would make
// the initial permuatation even more complicated.
FORCEINLINE void UnpackEightQuaternion48SIMD( fltx4 &out0, fltx4 &out1, fltx4 &out2, fltx4 &out3,
											  fltx4 &out4, fltx4 &out5, fltx4 &out6, fltx4 &out7,
											  Quaternion48 * RESTRICT pVecs )
{
	AssertMsg((reinterpret_cast<unsigned int>(pVecs) & 0x0F) == 0, "Input to UnpackEightQuaternion48SIMD is not 16-byte aligned." );
	// each word of q16s contains 3.0 + n * 2^-22 -- convert this so that we get numbers on the range -1..1
	const fltx4 vUpkMul = SplatXSIMD(g_SIMD_Quat48_Unpack_Magic_Constants); // { UnpackMul16s, UnpackMul16s, UnpackMul16s, UnpackMul16s };
	const fltx4 vUpkAdd = SplatYSIMD(g_SIMD_Quat48_Unpack_Magic_Constants);
	const fltx4 shift = __lvx(&g_SIMD_Quat48_Unpack_Shift, 0); // load the aligned shift mask that we use to shuffle z left by one bit.

	// first load the data onto three packed SIMD vectors, which contain eight Quaternion48s between them.
	// I've named them very explicitly so you can follow the movement of the input data.
	fltx4 x0y0z0x1y1z1x2y2, z2x3y3z3x4y4z4x5, y5z5x6y6z6x7y7z7;
	x0y0z0x1y1z1x2y2 = __lvx( pVecs, 0  ); // load reintrepret_cast<fltx 4 *>(pVecs) + 0
	z2x3y3z3x4y4z4x5 = __lvx( pVecs, 16 ); // load reintrepret_cast<fltx 4 *>(pVecs) + 1
	y5z5x6y6z6x7y7z7 = __lvx( pVecs, 32 ); // load reintrepret_cast<fltx 4 *>(pVecs) + 2

	// shove each quat onto its own fltx4, by using the permute operation
	// each halfword argument goes into the bottom 16 bits of the floating
	// point rep of 3.0f, then we use a magic constant to scale them.
	out0 = __vperm( x0y0z0x1y1z1x2y2, Four_Threes, *reinterpret_cast<const fltx4 *>(&g_SIMD_Quat48_Unpack_Permute0) ); // __x0__y0__z0____
	out1 = __vperm( x0y0z0x1y1z1x2y2, Four_Threes, *reinterpret_cast<const fltx4 *>(&g_SIMD_Quat48_Unpack_Permute1) ); // __x1__y1__z1____
	// postpone 2 since it straddles two words, we'll get back to it
	out3 = __vperm( z2x3y3z3x4y4z4x5, Four_Threes, *reinterpret_cast<const fltx4 *>(&g_SIMD_Quat48_Unpack_Permute3) ); // __x3__y3__z3__z2 // z2 is important, goes into out2
	out4 = __vperm( z2x3y3z3x4y4z4x5, Four_Threes, *reinterpret_cast<const fltx4 *>(&g_SIMD_Quat48_Unpack_Permute4) ); // __x4__y4__z4__x5 // x5 is important, goes into out5
	// 5 straddles two words
	out6 = __vperm( y5z5x6y6z6x7y7z7, Four_Threes, *reinterpret_cast<const fltx4 *>(&g_SIMD_Quat48_Unpack_Permute6) ); // __x6__y6__z6____
	out7 = __vperm( y5z5x6y6z6x7y7z7, Four_Threes, *reinterpret_cast<const fltx4 *>(&g_SIMD_Quat48_Unpack_Permute7) ); // __x7__y7__z7____
	// now get back to the straddlers, which we make by blending together a prior output and the other source word
	out2 = __vperm( x0y0z0x1y1z1x2y2, out3, *reinterpret_cast<const fltx4 *>(&g_SIMD_Quat48_Unpack_Permute2) ); // __x2__y2__z2____
	out5 = __vperm( y5z5x6y6z6x7y7z7, out4, *reinterpret_cast<const fltx4 *>(&g_SIMD_Quat48_Unpack_Permute5) ); // __x5__y5__z5____

	// the top bit of the z component in each word isn't part of the number; it's
	// a flag indicating whether the eventual w component should be negative.
	// so, we need to move the 0x00008000 bit of the z word onto the top bit
	// of the w word, which is a rotation two bytes right, or 14 bytes left.
	fltx4 wneg[8];
	// juggle all the z halfwords left one bit (toss the wneg sign bit, multiply by two)
	wneg[0] = __vsldoi( out0, out0, 14 );
	out0 = __vslh(out0, shift); // shift the z component left by one bit, tossing out the wneg sign bit and mapping z from [0..2^15) to [0..2^16)
	wneg[1] = __vsldoi( out1, out1, 14 );
	out1 = __vslh(out1, shift); // shift the z component left by one bit, tossing out the wneg sign bit and mapping z from [0..2^15) to [0..2^16)
	wneg[2] = __vsldoi( out2, out2, 14 );
	out2 = __vslh(out2, shift); // shift the z component left by one bit, tossing out the wneg sign bit and mapping z from [0..2^15) to [0..2^16)
	wneg[3] = __vsldoi( out3, out3, 14 );
	out3 = __vslh(out3, shift); // shift the z component left by one bit, tossing out the wneg sign bit and mapping z from [0..2^15) to [0..2^16)
	wneg[4] = __vsldoi( out4, out4, 14 );
	out4 = __vslh(out4, shift); // shift the z component left by one bit, tossing out the wneg sign bit and mapping z from [0..2^15) to [0..2^16)
	wneg[5] = __vsldoi( out5, out5, 14 );
	out5 = __vslh(out5, shift); // shift the z component left by one bit, tossing out the wneg sign bit and mapping z from [0..2^15) to [0..2^16)
	wneg[6] = __vsldoi( out6, out6, 14 );
	out6 = __vslh(out6, shift); // shift the z component left by one bit, tossing out the wneg sign bit and mapping z from [0..2^15) to [0..2^16)
	wneg[7] = __vsldoi( out7, out7, 14 );
	out7 = __vslh(out7, shift); // shift the z component left by one bit, tossing out the wneg sign bit and mapping z from [0..2^15) to [0..2^16)

	// create a mask that is just the sign bit of the w word.
	fltx4 vAllOneBits = __vspltisw(-1); // Shift 31
	fltx4 signMask = __vslw(vAllOneBits, vAllOneBits); // all the sign bits
	signMask = __vrlimi( signMask, Four_Zeros, 14, 0 ); // zero out x,y,z words

	// this macro defines the operations that will be performed on each of the eight words:
	// * scale from 0..65535 to -1..1 : tmp.x = ((int)x - 32768) * (1 / 32768.0);
	// * take the xyz dot product to get 1 - w^2
	// * subtract from one to get w^2
	// * square root to get w
	// * OR in the wneg sign mask to get the sign for w.
	// though the macro makes it look like these are being done in serial,
	// in fact the compiler will reorder them to minimize stalls.
	fltx4 ONE = Four_Ones;
	fltx4 dotxyz[8];
	fltx4 ww[8];
	// out0 = __vmaddfp( out0, vUpkMul, vUpkAdd );
	// dotxyz[0] = Dot3SIMD( out0, out0 );
	// clamp dotxyz if it's more than 1.0
	// all components are 1 - dotxyz
	// clear all but w's sign bit in wneg
	// all components are sqrt(1-dotxyz)
	// toggle w's sign where necessary
	// insert one element from the ww vector into the w component of ret
#define COMPUTE( target, number ) \
	target ## number = __vmaddfp( target ## number, vUpkMul, vUpkAdd ); \
	dotxyz[number] = Dot3SIMD( target ## number, target ## number ); \
	dotxyz[number] = __vminfp( dotxyz[number], ONE ); \
	ww[number] = SubSIMD( ONE, dotxyz[number] ); \
	wneg[number] = AndSIMD( wneg[number], signMask ) ; \
	ww[number] = SqrtSIMD(ww[number]); \
	ww[number] = OrSIMD( ww[number], wneg[number] ); \
	target ## number = __vrlimi( target ## number, ww[number], 1, 0 );

	COMPUTE(out, 0);
	COMPUTE(out, 1);
	COMPUTE(out, 2);
	COMPUTE(out, 3);
	COMPUTE(out, 4);
	COMPUTE(out, 5);
	COMPUTE(out, 6);
	COMPUTE(out, 7);

#undef COMPUTE
}
|
|
|
|
#elif defined(_PS3)
|
|
|
|
|
|
// unpack a single vector48 at the pointer into the x,y,z components of a fltx4.
// the w is total garbage -- callers must not read it.
FORCEINLINE fltx4 UnpackVector48SIMD( const Vector48 *pVec )
{
	// PS3 libs just give us this: loadHalfFloats widens three packed
	// half-precision values starting at &pVec->x into a Vector3.
	Vectormath::Aos::Vector3 ret;
	Vectormath::Aos::loadHalfFloats( ret, reinterpret_cast<const uint16_t *>(&pVec->x) );
	return ret.get128();

}
|
|
|
|
|
|
extern const uint16 ALIGN16 g_SIMD_Quat48_Unpack_Shift[]; //< Shuffles the z component of the quat48 left by one bit.
|
|
extern const uint8 ALIGN16 g_SIMD_Quat48_Unpack_Permute0[16];
|
|
extern const fltx4 g_SIMD_Quat48_Unpack_Magic_Constants;
|
|
extern const uint8 ALIGN16 g_SIMD_Quat48_Unpack_Permute1[16];
|
|
extern const uint8 ALIGN16 g_SIMD_Quat48_Unpack_Permute2[16];
|
|
extern const uint8 ALIGN16 g_SIMD_Quat48_Unpack_Permute3[16];
|
|
extern const uint8 ALIGN16 g_SIMD_Quat48_Unpack_Permute4[16];
|
|
extern const uint8 ALIGN16 g_SIMD_Quat48_Unpack_Permute5[16];
|
|
extern const uint8 ALIGN16 g_SIMD_Quat48_Unpack_Permute6[16];
|
|
extern const uint8 ALIGN16 g_SIMD_Quat48_Unpack_Permute7[16];
|
|
|
|
// unpack a single Quaternion48 at the pointer into the x,y,z,w components of a fltx4
|
|
FORCEINLINE fltx4 UnpackQuaternion48SIMD( const Quaternion48 * RESTRICT pVec )
|
|
{
|
|
// A quaternion 48 stores the x and y components as 0..65535 , which is almost mapped onto -1.0..1.0 via (x - 32768) / 32768.5 .
|
|
// z is stored as 0..32767, which is almost mapped onto -1..1 via (z - 16384) / 16384.5 .
|
|
// w is inferred from 1 - the dot product of the other tree components. the top bit of what would otherwise be the 16-bit z is
|
|
// w's sign bit.
|
|
fltx4 q16s = LoadUnaligned3SIMD((const void *)pVec);
|
|
#if defined(__SPU__)
|
|
vec_ushort8 shift = vec_ld( 0, (short unsigned int *)g_SIMD_Quat48_Unpack_Shift ); // load the aligned shift mask that we use to shuffle z.
|
|
vec_uchar16 permute = vec_ld(0, (unsigned char *)g_SIMD_Quat48_Unpack_Permute0 ); // load the permute word that shuffles x,y,z into their own words
|
|
#else
|
|
vec_ushort8 shift = vec_ld( 0, g_SIMD_Quat48_Unpack_Shift ); // load the aligned shift mask that we use to shuffle z.
|
|
vec_uchar16 permute = vec_ld(0, g_SIMD_Quat48_Unpack_Permute0 ); // load the permute word that shuffles x,y,z into their own words
|
|
#endif
|
|
bool wneg = pVec->wneg; // loading pVec into two different kinds of registers -- but not shuffling between (I hope!) so no LHS.
|
|
|
|
q16s = vec_perm( q16s, Four_Threes, permute ); // permute so that x, y, and z are now each in their own words. The top half is the floating point rep of 3.0f
|
|
#if defined(__SPU__)
|
|
q16s = (fltx4) vec_sl( (vec_ushort8) q16s, shift); // shift the z component left by one bit, tossing out the wneg sign bit and mapping z from [0..2^15) to [0..2^16)
|
|
#else
|
|
q16s = (fltx4) vec_vslh( (vec_ushort8) q16s, shift); // shift the z component left by one bit, tossing out the wneg sign bit and mapping z from [0..2^15) to [0..2^16)
|
|
#endif
|
|
|
|
// each word of q16s contains 3.0 + n * 2^-22 -- convert this so that we get numbers on the range -1..1
|
|
const fltx4 vUpkMul = SplatXSIMD(g_SIMD_Quat48_Unpack_Magic_Constants); // { UnpackMul16s, UnpackMul16s, UnpackMul16s, UnpackMul16s };
|
|
const fltx4 vUpkAdd = SplatYSIMD(g_SIMD_Quat48_Unpack_Magic_Constants);
|
|
|
|
fltx4 ret = vec_madd( q16s, vUpkMul, vUpkAdd );
|
|
|
|
// now, work out what w must be.
|
|
fltx4 dotxyz = Dot3SIMD( ret, ret ); // all components are dot product of ret w/ self.
|
|
dotxyz = ClampVectorSIMD( dotxyz, Four_Zeros, Four_Ones );
|
|
|
|
fltx4 ww = SubSIMD( Four_Ones, dotxyz ); // all components are 1 - dotxyz
|
|
ww = SqrtSIMD(ww); // all components are sqrt(1-dotxyz)
|
|
|
|
// insert one element from the ww vector into the w component of ret
|
|
ret = MaskedAssign( LoadAlignedSIMD(g_SIMD_ComponentMask[3]), wneg ? NegSIMD(ww) : ww, ret );
|
|
|
|
return ret;
|
|
}
|
|
|
|
#endif
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
#if defined( _X360 )
|
|
#pragma bitfield_order( pop )
|
|
#elif defined( _PS3 )
|
|
#pragma ms_struct off
|
|
#pragma reverse_bitfields off
|
|
#endif
|
|
|
|
#endif
|
|
|