csgo-2018-source/public/mathlib/compressed_vector.h
2021-07-24 21:11:47 -07:00

1621 lines
60 KiB
C++

//========= Copyright © 1996-2005, Valve Corporation, All rights reserved. ============//
//
// Purpose:
//
// $NoKeywords: $
//
//=============================================================================//
#ifndef COMPRESSED_VECTOR_H
#define COMPRESSED_VECTOR_H
#ifdef _WIN32
#pragma once
#endif
#include <math.h>
#include <float.h>
// For vec_t, put this somewhere else?
#include "basetypes.h"
// For rand(). We really need a library!
#include <stdlib.h>
#include "tier0/dbg.h"
#include "mathlib/vector.h"
#include "mathlib/mathlib.h"
#include "mathlib/ssemath.h"
#ifdef _PS3
#if defined(__SPU__)
#include <spu_intrinsics.h>
#include <vmx2spu.h>
#endif
#include <vectormath/cpp/vectormath_aos.h>
#endif
#if defined( _X360 )
#pragma bitfield_order( push, lsb_to_msb )
#elif defined( _PS3 )
#pragma ms_struct on
#pragma reverse_bitfields on
#endif
#ifdef OSX
#pragma GCC diagnostic ignored "-Wtautological-compare"
#endif
class Quaternion48;
FORCEINLINE fltx4 UnpackQuaternion48SIMD( const Quaternion48 * RESTRICT pVec );
//=========================================================
// fit a 3D vector into 32 bits
//=========================================================
class Vector32
{
public:
// Construction/destruction:
Vector32(void);
Vector32(vec_t X, vec_t Y, vec_t Z);
// assignment
Vector32& operator=(const Vector &vOther);
operator Vector ();
private:
unsigned short x:10;
unsigned short y:10;
unsigned short z:10;
unsigned short exp:2;
};
inline Vector32& Vector32::operator=(const Vector &vOther)
{
CHECK_VALID(vOther);
static float expScale[4] = { 4.0f, 16.0f, 32.f, 64.f };
float fmax = MAX( fabs( vOther.x ), fabs( vOther.y ) );
fmax = fpmax( fmax, fabs( vOther.z ) );
for (exp = 0; exp < 3; exp++)
{
if (fmax < expScale[exp])
break;
}
Assert( fmax < expScale[exp] );
float fexp = 512.0f / expScale[exp];
x = clamp( (int)(vOther.x * fexp) + 512, 0, 1023 );
y = clamp( (int)(vOther.y * fexp) + 512, 0, 1023 );
z = clamp( (int)(vOther.z * fexp) + 512, 0, 1023 );
return *this;
}
inline Vector32::operator Vector ()
{
Vector tmp;
static float expScale[4] = { 4.0f, 16.0f, 32.f, 64.f };
float fexp = expScale[exp] / 512.0f;
tmp.x = (((int)x) - 512) * fexp;
tmp.y = (((int)y) - 512) * fexp;
tmp.z = (((int)z) - 512) * fexp;
return tmp;
}
//=========================================================
// Fit a unit vector into 32 bits
//=========================================================
class Normal32
{
public:
// Construction/destruction:
Normal32(void);
Normal32(vec_t X, vec_t Y, vec_t Z);
// assignment
Normal32& operator=(const Vector &vOther);
operator Vector ();
private:
unsigned short x:15;
unsigned short y:15;
unsigned short zneg:1;
};
inline Normal32& Normal32::operator=(const Vector &vOther)
{
CHECK_VALID(vOther);
x = clamp( (int)(vOther.x * 16384) + 16384, 0, 32767 );
y = clamp( (int)(vOther.y * 16384) + 16384, 0, 32767 );
zneg = (vOther.z < 0);
//x = vOther.x;
//y = vOther.y;
//z = vOther.z;
return *this;
}
inline Normal32::operator Vector ()
{
Vector tmp;
tmp.x = ((int)x - 16384) * (1 / 16384.0);
tmp.y = ((int)y - 16384) * (1 / 16384.0);
tmp.z = sqrt( 1 - tmp.x * tmp.x - tmp.y * tmp.y );
if (zneg)
tmp.z = -tmp.z;
return tmp;
}
//=========================================================
// 64 bit Quaternion
//=========================================================
class Quaternion64
{
public:
// Construction/destruction:
Quaternion64(void);
Quaternion64(vec_t X, vec_t Y, vec_t Z);
// assignment
// Quaternion& operator=(const Quaternion64 &vOther);
Quaternion64& operator=(const Quaternion &vOther);
operator Quaternion () const;
inline fltx4 LoadUnalignedSIMD() const; // load onto a SIMD register without assumptions of being on a 16byte boundary
private:
Quaternion64( uint64 xx, uint64 yy, uint64 zz, uint64 ww ) : x(xx), y(yy), z(zz), wneg(ww) {}; // stricly for static construction
uint64 x:21;
uint64 y:21;
uint64 z:21;
uint64 wneg:1;
};
inline Quaternion64::operator Quaternion () const
{
#if defined(__SPU__)
fltx4 tmpV;
QuaternionAligned tmpQ;
tmpV = LoadUnalignedSIMD();
StoreAlignedSIMD( (float *)&tmpQ, tmpV );
return tmpQ;
#else
Quaternion tmp;
// shift to -1048576, + 1048575, then round down slightly to -1.0 < x < 1.0
tmp.x = ((int)x - 1048576) * (1 / 1048576.5f);
tmp.y = ((int)y - 1048576) * (1 / 1048576.5f);
tmp.z = ((int)z - 1048576) * (1 / 1048576.5f);
tmp.w = sqrt( 1 - tmp.x * tmp.x - tmp.y * tmp.y - tmp.z * tmp.z );
if (wneg)
tmp.w = -tmp.w;
return tmp;
#endif
}
inline Quaternion64& Quaternion64::operator=(const Quaternion &vOther)
{
CHECK_VALID(vOther);
x = clamp( (int)(vOther.x * 1048576) + 1048576, 0, 2097151 );
y = clamp( (int)(vOther.y * 1048576) + 1048576, 0, 2097151 );
z = clamp( (int)(vOther.z * 1048576) + 1048576, 0, 2097151 );
wneg = (vOther.w < 0);
return *this;
}
inline fltx4 Quaternion64::LoadUnalignedSIMD() const
{
#ifdef _PS3 // assume little endian packing
#if 1
const static u32x4 xmask = { 0x00000000, 0x001fffff, 0, 0 }; // bottom 21 bits ( 0 .. 20 ) true
const static u32x4 ymask = { 0x000003ff, 0xffe00000, 0, 0 }; // bits 21 .. 41 true
const static u32x4 zmask = { 0x7ffffC00, 0x00000000, 0, 0 }; // bits 42 .. 62 true
const static u32x4 wmask = { 0x80000000, 0x00000000, 0, 0 }; // only bit 63 is true
const u32x4 qbits = (u32x4)( ::LoadUnalignedSIMD( this ) ) ;
// fish x, y, and z and put them into the the first words of their respective vec registers
// the end type for these registers must be signed for the following subtract, BUT!
// the shift has to happen as an UNSIGNED type so that it doesn't sign-extend.
// the code as present assumes that the fused multiply-add operation has an intermediate
// precision higher than 32 bits -- otherwise, we'll need to perform the initial subtract as an
// int op because of course 21 bits is right at the limit of floating point precision.
i32x4 ix = (i32x4) (ShiftLeftByBits<32>(vec_and( qbits, xmask ))); // shift x by eleven bits so its 21 bits of precision are sitting at the low end of the first word
i32x4 iy = (i32x4) (ShiftLeftByBits<11>(vec_and( qbits, ymask ))); // shift y, which straddles the first two words, left by 10 bits so its 21 bits of precision are sitting at the low end of the first word
i32x4 iz = (i32x4) (ShiftRightByBits<10>(vec_and( qbits, zmask ))); // shift z, which straddles the first two words, left by 31 bits so its 21 bits of precision are sitting at the low end of the first word
/* // this is how to put them into their respective words instead (but we don't want to do that because we need a dot product)
i32x4 iy = (i32x4) (ShiftRightByBits<22>(vec_and( qbits, ymask ))); // shift y, which straddles the first two words, right by 22 bits so its 21 bits of precision are sitting at the low end of the second word
i32x4 iz = (i32x4) (ShiftRightByBits<33>(vec_and( qbits, zmask ))); // shift z right by 33 bits so its 21 bits of precision are sitting at the low end of the third word
*/
i32x4 wsignbit = (i32x4) (ShiftRightByBits<96>(vec_and( qbits, wmask ))); // shift the w bit RIGHT so that it sits at the sign bit of the LAST word.
// convert each of the vectors from int to float. (because of the way the pipeline is organized,
// it's as fast to do this as it would have been to do by combining them into one register above
// and converting all at once.) Also, we can do the fixed point conversion in the vcfsx op. It'll
// map us onto [0,2] which we'll shift to [-1,1] -- it includes the endpoints of unlike the float-
// by-float conversion above, but the better stability of the vector quaternion ops makes that okay.
const fltx4 ONE = LoadOneSIMD();
#if defined(__SPU__)
fltx4 fx = SubSIMD( vec_ctf( ix, 20 ), ONE);
fltx4 fy = SubSIMD( vec_ctf( iy, 20 ), ONE);
fltx4 fz = SubSIMD( vec_ctf( iz, 20 ), ONE);
#else
fltx4 fx = SubSIMD( vec_vcfsx( ix, 20 ), ONE);
fltx4 fy = SubSIMD( vec_vcfsx( iy, 20 ), ONE);
fltx4 fz = SubSIMD( vec_vcfsx( iz, 20 ), ONE);
#endif
// compute the dot product
fltx4 fw = MsubSIMD( fz, fz, ONE ); // 1 - z*z
fltx4 fxsqysq = MaddSIMD( fy, fy, MulSIMD( fx,fx ) ); // x*x + y*y
fw = SubSIMD( fw, fxsqysq ); // 1 - x*x - y*y - z*z
fw = SqrtSIMD( fw ); // unfortunately we really do need full precision here
fltx4 result = Compress4SIMD( fx, fy, fz, fw );
// and for the coup de grace, set the sign bit of fw appropriately
result = OrSIMD( result, (fltx4)wsignbit );
return result;
#else
// original version
/*
union Qmask {
struct qq {
Quaternion64 mask;
uint64 padding;
} asQ ;
u32x4 asVec;
Qmask( const Quaternion64 &m ) : mask(m) {}
};
*/
const static u32x4 xmask = { 0xfffff800, 0x00000000, 0, 0 }; // top 21 bits ( 0 .. 20 ) true
const static u32x4 ymask = { 0x000007ff, 0xffc00000, 0, 0 }; // bits 21 .. 41 true
const static u32x4 zmask = { 0x00000000, 0x003ffffe, 0, 0 }; // bits 42 .. 62 true
const static u32x4 wmask = { 0x00000000, 0x00000001, 0, 0 }; // only bit 63 is true
const u32x4 qbits = (u32x4)( ::LoadUnalignedSIMD( this ) ) ;
// fish x, y, and z and put them into the the first words of their respective vec registers
// the end type for these registers must be signed for the following subtract, BUT!
// the shift has to happen as an UNSIGNED type so that it doesn't sign-extend.
// the code as present assumes that the fused multiply-add operation has an intermediate
// precision higher than 32 bits -- otherwise, we'll need to perform the initial subtract as an
// int op because of course 21 bits is right at the limit of floating point precision.
i32x4 ix = (i32x4) (ShiftRightByBits<11>(vec_and( qbits, xmask ))); // shift x by eleven bits so its 21 bits of precision are sitting at the low end of the first word
i32x4 iy = (i32x4) (ShiftLeftByBits<10>(vec_and( qbits, ymask ))); // shift y, which straddles the first two words, left by 10 bits so its 21 bits of precision are sitting at the low end of the first word
i32x4 iz = (i32x4) (ShiftLeftByBits<31>(vec_and( qbits, zmask ))); // shift z, which straddles the first two words, left by 31 bits so its 21 bits of precision are sitting at the low end of the first word
/* // this is how to put them into their respective words instead (but we don't want to do that because we need a dot product)
i32x4 iy = (i32x4) (ShiftRightByBits<22>(vec_and( qbits, ymask ))); // shift y, which straddles the first two words, right by 22 bits so its 21 bits of precision are sitting at the low end of the second word
i32x4 iz = (i32x4) (ShiftRightByBits<33>(vec_and( qbits, zmask ))); // shift z right by 33 bits so its 21 bits of precision are sitting at the low end of the third word
*/
i32x4 wsignbit = (i32x4) (ShiftRightByBits<33>(vec_and( qbits, wmask ))); // shift the w bit RIGHT so that it sits at the sign bit of the LAST word.
// convert each of the vectors from int to float. (because of the way the pipeline is organized,
// it's as fast to do this as it would have been to do by combining them into one register above
// and converting all at once.) Also, we can do the fixed point conversion in the vcfsx op. It'll
// map us onto [0,2] which we'll shift to [-1,1] -- it includes the endpoints of unlike the float-
// by-float conversion above, but the better stability of the vector quaternion ops makes that okay.
const fltx4 ONE = LoadOneSIMD();
#if defined(__SPU__)
fltx4 fx = SubSIMD( vec_ctf( ix, 20 ), ONE);
fltx4 fy = SubSIMD( vec_ctf( iy, 20 ), ONE);
fltx4 fz = SubSIMD( vec_ctf( iz, 20 ), ONE);
#else
fltx4 fx = SubSIMD( vec_vcfsx( ix, 20 ), ONE);
fltx4 fy = SubSIMD( vec_vcfsx( iy, 20 ), ONE);
fltx4 fz = SubSIMD( vec_vcfsx( iz, 20 ), ONE);
#endif
// compute the dot product
fltx4 fw = MsubSIMD( ONE, fz, fz ); // 1 - z*z
fltx4 fxsqysq = MaddSIMD( fy, fy, MulSIMD( fx,fx ) ); // x*x + y*y
fw = SubSIMD( fw, fxsqysq ); // 1 - x*x - y*y - z*z
fw = SqrtSIMD( fw ); // unfortunately we really do need full precision here
fltx4 result = Compress4SIMD( fx, fy, fz, fw );
// and for the coup de grace, set the sign bit of fw appropriately
result = OrSIMD( result, (fltx4)wsignbit );
return result;
#endif
#elif 0 // basic C implementation (which ends up being slower than writing the whole Q onto the stack and then reading it back at once)
struct { float x; float y; float z; float w; } tmp;
tmp.x = ((int)x - 1048576) * (1 / 1048576.5f);
tmp.y = ((int)y - 1048576) * (1 / 1048576.5f);
tmp.z = ((int)z - 1048576) * (1 / 1048576.5f);
tmp.w = sqrt( 1 - tmp.x * tmp.x - tmp.y * tmp.y - tmp.z * tmp.z );
if (wneg)
tmp.w = -tmp.w;
fltx4 ret = { tmp.x, tmp.y, tmp.z, tmp.w };
return ret;
#else // naive implementation (which ends up being faster than the explicit c imp above)
const QuaternionAligned q(Quaternion(*this)) ;
return LoadAlignedSIMD( &q );
#endif
}
//=========================================================
// 48 bit Quaternion
//=========================================================
class Quaternion48
{
public:
// Construction/destruction:
Quaternion48(void);
Quaternion48(vec_t X, vec_t Y, vec_t Z);
// assignment
// Quaternion& operator=(const Quaternion48 &vOther);
Quaternion48& operator=(const Quaternion &vOther);
operator Quaternion () const;
inline fltx4 LoadUnalignedSIMD() const; // load onto a SIMD register without assumptions of being on a 16byte boundary
//private:
unsigned short x:16;
unsigned short y:16;
unsigned short z:15;
unsigned short wneg:1;
};
inline Quaternion48::operator Quaternion () const
{
#if defined(__SPU__)
fltx4 tmpV;
QuaternionAligned tmpQ;
tmpV = LoadUnalignedSIMD();
StoreAlignedSIMD( (float *)&tmpQ, tmpV );
tmpV = UnpackQuaternion48SIMD( this );
StoreAlignedSIMD( (float *)&tmpQ, tmpV );
return tmpQ;
#else
Quaternion tmp;
tmp.x = ((int)x - 32768) * (1 / 32768.5);
tmp.y = ((int)y - 32768) * (1 / 32768.5);
tmp.z = ((int)z - 16384) * (1 / 16384.5);
tmp.w = sqrt( 1 - tmp.x * tmp.x - tmp.y * tmp.y - tmp.z * tmp.z );
if (wneg)
tmp.w = -tmp.w;
return tmp;
#endif
}
inline Quaternion48& Quaternion48::operator=(const Quaternion &vOther)
{
CHECK_VALID(vOther);
x = clamp( (int)(vOther.x * 32768) + 32768, 0, 65535 );
y = clamp( (int)(vOther.y * 32768) + 32768, 0, 65535 );
z = clamp( (int)(vOther.z * 16384) + 16384, 0, 32767 );
wneg = (vOther.w < 0);
return *this;
}
inline fltx4 Quaternion48::LoadUnalignedSIMD() const
{
#ifdef _PS3 // assume little endian packing
const static u32x4 xmask = { 0x00000000, 0xffff0000, 0, 0 };
const static u32x4 ymask = { 0x0000ffff, 0x00000000, 0, 0 };
const static u32x4 zmask = { 0x7fff0000, 0x00000000, 0, 0 };
const static u32x4 wmask = { 0x80000000, 0x00000000, 0, 0 };
const u32x4 qbits = (u32x4)( ::LoadUnalignedSIMD( this ) ) ;
// fish x, y, and z and put them into the the first words of their respective vec registers
i32x4 ix = (i32x4) (ShiftLeftByBits<16>(vec_and( qbits, xmask )));
i32x4 iy = (i32x4) ((vec_and( qbits, ymask )));
i32x4 iz = (i32x4) (ShiftRightByBits<16>(vec_and( qbits, zmask )));
// shift the w bit RIGHT so that it sits at the sign bit of the LAST word.
i32x4 wsignbit = (i32x4) (ShiftRightByBits<96>(vec_and( qbits, wmask )));
// convert each of the vectors from int to float. (because of the way the pipeline is organized,
// it's as fast to do this as it would have been to do by combining them into one register above
// and converting all at once.) Also, we can do the fixed point conversion in the vcfsx op. It'll
// map us onto [0,2] which we'll shift to [-1,1] -- it includes the endpoints of unlike the float-
// by-float conversion above, but the better stability of the vector quaternion ops makes that okay.
const fltx4 ONE = LoadOneSIMD();
#if defined(__SPU__)
fltx4 fx = SubSIMD( vec_ctf( ix, 15 ), ONE);
fltx4 fy = SubSIMD( vec_ctf( iy, 15 ), ONE);
fltx4 fz = SubSIMD( vec_ctf( iz, 14 ), ONE);
#else
fltx4 fx = SubSIMD( vec_vcfsx( ix, 15 ), ONE);
fltx4 fy = SubSIMD( vec_vcfsx( iy, 15 ), ONE);
fltx4 fz = SubSIMD( vec_vcfsx( iz, 14 ), ONE);
#endif
// compute the dot product
fltx4 fw = MsubSIMD( fz, fz, ONE ); // 1 - z*z
fltx4 fxsqysq = MaddSIMD( fy, fy, MulSIMD( fx,fx ) ); // x*x + y*y
fw = SubSIMD( fw, fxsqysq ); // 1 - x*x - y*y - z*z
fw = SqrtSIMD( fw ); // unfortunately we really do need full precision here
fltx4 result = Compress4SIMD( fx, fy, fz, fw );
// and for the coup de grace, set the sign bit of fw appropriately
result = OrSIMD( result, (fltx4)wsignbit );
return result;
#elif 0 // basic C implementation (which ends up being slower than writing the whole Q onto the stack and then reading it back at once)
struct { float x; float y; float z; float w; } tmp;
tmp.x = ((int)x - 32768) * (1 / 32768.5);
tmp.y = ((int)y - 32768) * (1 / 32768.5);
tmp.z = ((int)z - 16384) * (1 / 16384.5);
tmp.w = sqrt( 1 - tmp.x * tmp.x - tmp.y * tmp.y - tmp.z * tmp.z );
if (wneg)
tmp.w = -tmp.w;
fltx4 ret = { tmp.x, tmp.y, tmp.z, tmp.w };
return ret;
#else // naive implementation (which ends up being faster than the explicit c imp above)
const QuaternionAligned q(Quaternion(*this)) ;
return LoadAlignedSIMD( &q );
#endif
}
//=========================================================
// 48 bit sorted Quaternion
//=========================================================
class Quaternion48S
{
public:
// Construction/destruction:
Quaternion48S(void);
Quaternion48S(vec_t X, vec_t Y, vec_t Z);
// assignment
// Quaternion& operator=(const Quaternion48 &vOther);
Quaternion48S& operator=(const Quaternion &vOther);
operator Quaternion () const;
operator fltx4 () const RESTRICT ;
//private:
// shift the quaternion so that the largest value is recreated by the sqrt()
// abcd maps modulo into quaternion xyzw starting at "offset"
// "offset" is split into two 1 bit fields so that the data packs into 6 bytes (3 shorts)
unsigned short a:15; // first of the 3 consecutive smallest quaternion elements
unsigned short offsetH:1; // high bit of "offset"
unsigned short b:15;
unsigned short offsetL:1; // low bit of "offset"
unsigned short c:15;
unsigned short dneg:1; // sign of the largest quaternion element
};
#define SCALE48S 23168.0f // needs to fit 2*sqrt(0.5) into 15 bits.
#define SHIFT48S 16384 // half of 2^15 bits.
inline Quaternion48S::operator Quaternion () const
{
#if defined(__SPU__)
fltx4 tmpV;
QuaternionAligned tmpQ;
tmpV = *this;
StoreAlignedSIMD( (float *)&tmpQ, tmpV );
return tmpQ;
#else
Quaternion tmp;
COMPILE_TIME_ASSERT( sizeof( Quaternion48S ) == 6 );
float *ptmp = &tmp.x;
int ia = offsetL + offsetH * 2;
int ib = ( ia + 1 ) % 4;
int ic = ( ia + 2 ) % 4;
int id = ( ia + 3 ) % 4;
ptmp[ia] = ( (int)a - SHIFT48S ) * ( 1.0f / SCALE48S );
ptmp[ib] = ( (int)b - SHIFT48S ) * ( 1.0f / SCALE48S );
ptmp[ic] = ( (int)c - SHIFT48S ) * ( 1.0f / SCALE48S );
ptmp[id] = sqrt( 1.0f - ptmp[ia] * ptmp[ia] - ptmp[ib] * ptmp[ib] - ptmp[ic] * ptmp[ic] );
if (dneg)
ptmp[id] = -ptmp[id];
return tmp;
#endif
}
inline Quaternion48S& Quaternion48S::operator=(const Quaternion &vOther)
{
CHECK_VALID(vOther);
const float *ptmp = &vOther.x;
// find largest field, make sure that one is recreated by the sqrt to minimize error
int i = 0;
if ( fabs( ptmp[i] ) < fabs( ptmp[1] ) )
{
i = 1;
}
if ( fabs( ptmp[i] ) < fabs( ptmp[2] ) )
{
i = 2;
}
if ( fabs( ptmp[i] ) < fabs( ptmp[3] ) )
{
i = 3;
}
int offset = ( i + 1 ) % 4; // make "a" so that "d" is the largest element
offsetL = offset & 1;
offsetH = offset > 1;
a = clamp( (int)(ptmp[ offset ] * SCALE48S) + SHIFT48S, 0, (int)(SCALE48S * 2) );
b = clamp( (int)(ptmp[ ( offset + 1 ) % 4 ] * SCALE48S) + SHIFT48S, 0, (int)(SCALE48S * 2) );
c = clamp( (int)(ptmp[ ( offset + 2 ) % 4 ] * SCALE48S) + SHIFT48S, 0, (int)(SCALE48S * 2) );
dneg = ( ptmp[ ( offset + 3 ) % 4 ] < 0.0f );
return *this;
}
// decode onto a SIMD register
inline Quaternion48S::operator fltx4 () const RESTRICT
{
AssertMsg1( (((uintp) this) & 1) == 0, "Quaternion48S is unaligned at %p\n", this );
#ifdef PLATFORM_PPC // this algorithm depends heavily on the Altivec permute op, for which there is no analogue in SSE. This function should not be used on PC.
// define some vector constants. the shift-scale will be done as a fused multiply-add,
// with the scale already distributed onto the shift (the part subtracted)
const static fltx4 vrSCALE48S = { (1.0f / SCALE48S), (1.0f / SCALE48S), (1.0f / SCALE48S), (1.0f / SCALE48S) };
const static fltx4 vrSHIFT48S = { ((float) -SHIFT48S) / SCALE48S, ((float) -SHIFT48S) / SCALE48S, ((float) -SHIFT48S) / SCALE48S, ((float) -SHIFT48S) / SCALE48S };
// start by hoisting the q48 onto a SIMD word.
u32x4 source = (u32x4) LoadUnalignedSIMD( this );
const u32x4 ZERO = (u32x4) LoadZeroSIMD();
// also hoist the offset into an int word. Hopefully this executes in parallel with the vector ops thanks to SUPERSCALAR!
const unsigned int offset = offsetL | ( offsetH << 1 );
const bi32x4 vDMask = (bi32x4) LoadAlignedSIMD( g_SIMD_ComponentMask[(offset+3)%4] ); // lets vsel poke D into the right word
#if 0 // This code can be used to deal with a situation where LoadUnalignedSIMD() fails to properly load
// vectors starting on halfword boundaries (rather than 32-bit aligned). Because this is a 48-bit
// structure, sometimes it'll only be 16-bit aligned. I expected that lvlx would always load from
// a word boundary, requiring me to shift misaligned vectors over by 16 bits, but evidently,
// lvlx actually works even on halfword boundaries. Who knew!
// Anyway, this code is still here in case the problem crops up, as a hint to both cause and solution.
if ( ((unsigned int) this) & 2 )
{
source = ShiftLeftByBits<16>(source);
}
#endif
// mask out the offset and dneg bits. Because of the packing #pragmas, the one-bit fields are actually at the MSB
// of the halfwords, not the LSB as you might expect.
ALIGN16 const static uint32 vMaskTopBits[4] = { 0x80008000, 0x80000000, 0, 0 }; // just the LSB of each the first three halfwords
u32x4 abc = AndNotSIMD( (u32x4) LoadAlignedSIMD(vMaskTopBits), source ); // now this is just the A, B, C halfwords.
// Next, unpack abc as unsigned numbers. We can do this with a permute op. In fact, we can exploit
// the integer pipe and load the offset while we're loading the SIMD numbers, then use the integer offset to select
// the permute, which will therefore also perform the rotate that maps abc to their rightful destinations.
// the masks below are for the vperm instruction, which is a byte-by-byte mapping from source to destination.
// it's assumed that the FIRST parameter to vperm will be ZERO, and the second the data. (that makes the masks a little clearer)
// in the simplest case -- imagine each letter below represents one byte; the source vector looks like
// AABB CCxx xxxx xxxx. We're going to permute it onto the work register like
// 00AA 00BB 00CC 0000
ALIGN16 const static uint32 vPermutations[4][4] = {
// offset = 0 means a->x, b->y, c->z, d->w
{ 0x00001011, 0x00001213, 0x00001415, 0x00000000 },
// offset = 1 means a->y, b->z, c->w, d->a
{ 0x00000000, 0x00001011, 0x00001213, 0x00001415 },
{ 0x00001415, 0x00000000, 0x00001011, 0x00001213 },
{ 0x00001213, 0x00001415, 0x00000000, 0x00001011 }
};
// compute two permutations on the input data: one where the zero-word is always in the w component,
// which lets us do a 3-way rather than 4-way dot product; and another where the zero-word corresponds to
// wherever D is supposed to go.
// Even though this seems redundant, the duplicated work ends up fitting into the pipeline bubbles,
// and the savings between a 4-way and 3-way dot seem to be about 3ns.
u32x4 abcfordot = PermuteVMX( ZERO, abc, LoadAlignedSIMD( vPermutations[0] ) );
abc = PermuteVMX( ZERO, abc, LoadAlignedSIMD( vPermutations[offset] ) );
// turn each of the ints into floats. Because we masked out the one-bit field at the top,
// We can think of this as a conversion from fixed-point where there's no fractional bit.
// This is done in line with the shift-scale operation, which is itself fused.
// we do this twice: once for the vector with the guaranteed zero w-word, and
// once for the vector rotated by the offset.
fltx4 vfDest = AndNotSIMD( vDMask, MaddSIMD( UnsignedFixedIntConvertToFltSIMD( abc, 0 ), vrSCALE48S, vrSHIFT48S ) );
fltx4 vfDestForDot = MaddSIMD( UnsignedFixedIntConvertToFltSIMD( abcfordot, 0 ), vrSCALE48S, vrSHIFT48S ) ;
// compute magnitude of the vector we know to have a 0 in the w word.
const fltx4 vDot = Dot3SIMD( vfDestForDot, vfDestForDot );
// recover the "D" word
const fltx4 vD = SqrtSIMD( SubSIMD( LoadOneSIMD(), vDot ) );
// mask D into the converted-and-offset vector, then return.
return MaskedAssign( vDMask, dneg ? NegSIMD(vD) : vD, vfDest );
#else
AssertMsg( false, "Quaternion48S::operator fltx4 is slow on this platform and should not be used.\n" );
QuaternionAligned q( (Quaternion) *this );
return LoadAlignedSIMD( &q );
#endif
}
//=========================================================
// 32 bit Quaternion
//=========================================================
class Quaternion32
{
public:
// Construction/destruction:
Quaternion32(void);
Quaternion32(vec_t X, vec_t Y, vec_t Z);
// assignment
// Quaternion& operator=(const Quaternion48 &vOther);
Quaternion32& operator=(const Quaternion &vOther);
operator Quaternion ();
inline fltx4 LoadUnalignedSIMD() const; // load onto a SIMD register without assumptions of being on a 16byte boundary
private:
unsigned int x:11;
unsigned int y:10;
unsigned int z:10;
unsigned int wneg:1;
};
inline Quaternion32::operator Quaternion ()
{
#if defined(__SPU__)
fltx4 tmpV;
QuaternionAligned tmpQ;
tmpV = LoadUnalignedSIMD();
StoreAlignedSIMD( (float *)&tmpQ, tmpV );
return tmpQ;
#else
Quaternion tmp;
tmp.x = ((int)x - 1024) * (1 / 1024.0);
tmp.y = ((int)y - 512) * (1 / 512.0);
tmp.z = ((int)z - 512) * (1 / 512.0);
tmp.w = sqrt( 1 - tmp.x * tmp.x - tmp.y * tmp.y - tmp.z * tmp.z );
if (wneg)
tmp.w = -tmp.w;
return tmp;
#endif
}
inline Quaternion32& Quaternion32::operator=(const Quaternion &vOther)
{
CHECK_VALID(vOther);
x = clamp( (int)(vOther.x * 1024) + 1024, 0, 2047 );
y = clamp( (int)(vOther.y * 512) + 512, 0, 1023 );
z = clamp( (int)(vOther.z * 512) + 512, 0, 1023 );
wneg = (vOther.w < 0);
return *this;
}
inline fltx4 Quaternion32::LoadUnalignedSIMD() const
{
#ifdef _PS3 // assume little endian packing
const static u32x4 xmask = { 0x000007ff, 0, 0, 0 };
const static u32x4 ymask = { 0x001ff800, 0, 0, 0 };
const static u32x4 zmask = { 0x7fe00000, 0, 0, 0 };
const static u32x4 wmask = { 0x80000000, 0, 0, 0 };
const u32x4 qbits = (u32x4)( ::LoadUnalignedSIMD( this ) ) ;
// fish x, y, and z and put them into the the first words of their respective vec registers
i32x4 ix = (i32x4) ((vec_and( qbits, xmask )));
i32x4 iy = (i32x4) (ShiftRightByBits<11>(vec_and( qbits, ymask )));
i32x4 iz = (i32x4) (ShiftRightByBits<21>(vec_and( qbits, zmask )));
// shift the w bit RIGHT so that it sits at the sign bit of the LAST word.
i32x4 wsignbit = (i32x4) (ShiftRightByBits<96>(vec_and( qbits, wmask )));
// convert each of the vectors from int to float. (because of the way the pipeline is organized,
// it's as fast to do this as it would have been to do by combining them into one register above
// and converting all at once.) Also, we can do the fixed point conversion in the vcfsx op. It'll
// map us onto [0,2] which we'll shift to [-1,1] -- it includes the endpoints of unlike the float-
// by-float conversion above, but the better stability of the vector quaternion ops makes that okay.
const fltx4 ONE = LoadOneSIMD();
#if defined(__SPU__)
fltx4 fx = SubSIMD( vec_ctf( ix, 10 ), ONE);
fltx4 fy = SubSIMD( vec_ctf( iy, 9 ), ONE);
fltx4 fz = SubSIMD( vec_ctf( iz, 9 ), ONE);
#else
fltx4 fx = SubSIMD( vec_vcfsx( ix, 10 ), ONE);
fltx4 fy = SubSIMD( vec_vcfsx( iy, 10 ), ONE);
fltx4 fz = SubSIMD( vec_vcfsx( iz, 9 ), ONE);
#endif
// compute the dot product
fltx4 fw = MsubSIMD( fz, fz, ONE ); // 1 - z*z
fltx4 fxsqysq = MaddSIMD( fy, fy, MulSIMD( fx,fx ) ); // x*x + y*y
fw = SubSIMD( fw, fxsqysq ); // 1 - x*x - y*y - z*z
fw = SqrtSIMD( fw ); // unfortunately we really do need full precision here
fltx4 result = Compress4SIMD( fx, fy, fz, fw );
// and for the coup de grace, set the sign bit of fw appropriately
result = OrSIMD( result, (fltx4)wsignbit );
return result;
#else
struct { float x; float y; float z; float w; } tmp;
tmp.x = ((int)x - 1024) * (1 / 1024.0);
tmp.y = ((int)y - 512) * (1 / 512.0);
tmp.z = ((int)z - 512) * (1 / 512.0);
tmp.w = sqrt( 1 - tmp.x * tmp.x - tmp.y * tmp.y - tmp.z * tmp.z );
if (wneg)
tmp.w = -tmp.w;
fltx4 ret = { tmp.x, tmp.y, tmp.z, tmp.w };
return ret;
#endif
}
//=========================================================
// 16 bit float
//=========================================================
const int float32bias = 127;
const int float16bias = 15;
const float maxfloat16bits = 65504.0f;
class float16
{
public:
// float16() {};
//float16( float f ) { m_storage.rawWord = ConvertFloatTo16bits(f); }
float16& operator=(const unsigned short &other) { m_storage.rawWord = other; return *this; };
void Init() { m_storage.rawWord = 0; }
// float16& operator=(const float16 &other) { m_storage.rawWord = other.m_storage.rawWord; return *this; }
// float16& operator=(const float &other) { m_storage.rawWord = ConvertFloatTo16bits(other); return *this; }
// operator unsigned short () { return m_storage.rawWord; }
// operator float () { return Convert16bitFloatTo32bits( m_storage.rawWord ); }
unsigned short GetBits() const
{
return m_storage.rawWord;
}
float GetFloat() const
{
return Convert16bitFloatTo32bits( m_storage.rawWord );
}
void SetFloat( float in )
{
m_storage.rawWord = ConvertFloatTo16bits( in );
}
bool IsInfinity() const
{
return m_storage.bits.biased_exponent == 31 && m_storage.bits.mantissa == 0;
}
bool IsNaN() const
{
return m_storage.bits.biased_exponent == 31 && m_storage.bits.mantissa != 0;
}
bool operator==(const float16 other) const { return m_storage.rawWord == other.m_storage.rawWord; }
bool operator!=(const float16 other) const { return m_storage.rawWord != other.m_storage.rawWord; }
// bool operator< (const float other) const { return GetFloat() < other; }
// bool operator> (const float other) const { return GetFloat() > other; }
template< bool BRANCHLESS > // allows you to force branchy/branchless implementation regardless of the current platform
static unsigned short ConvertFloatTo16bitsNonDefault( float input );
static float Convert16bitFloatTo32bits( unsigned short input );
// a special case useful for the pixel writer: take four input float values, which are already in memory (not on registers),
// convert them all at once and write them sequentially through the output pointer.
static void ConvertFourFloatsTo16BitsAtOnce( float16 * RESTRICT pOut,
const float *a, const float *b, const float *c, const float *d );
// unfortunately, function templates can't have default template parameters in 2010-era C++
inline static unsigned short ConvertFloatTo16bits( float input )
{ // default to branchless on ppc and branchy on x86
#ifdef PLATFORM_PPC
return ConvertFloatTo16bitsNonDefault<true>(input);
#else
return ConvertFloatTo16bitsNonDefault<false>(input);
#endif
}
protected:
union float32bits
{
float rawFloat;
uint32 rawAsInt;
struct
{
unsigned int mantissa : 23;
unsigned int biased_exponent : 8;
unsigned int sign : 1;
} bits;
};
union float16bits
{
unsigned short rawWord;
struct
{
unsigned short mantissa : 10;
unsigned short biased_exponent : 5;
unsigned short sign : 1;
} bits;
};
static bool IsNaN( float16bits in )
{
return in.bits.biased_exponent == 31 && in.bits.mantissa != 0;
}
static bool IsInfinity( float16bits in )
{
return in.bits.biased_exponent == 31 && in.bits.mantissa == 0;
}
// 0x0001 - 0x03ff
float16bits m_storage;
};
class float16_with_assign : public float16
{
public:
float16_with_assign() {}
float16_with_assign( float f ) { m_storage.rawWord = ConvertFloatTo16bits(f); }
float16& operator=(const float16 &other) { m_storage.rawWord = ((float16_with_assign &)other).m_storage.rawWord; return *this; }
float16& operator=(const float &other) { m_storage.rawWord = ConvertFloatTo16bits(other); return *this; }
// operator unsigned short () const { return m_storage.rawWord; }
operator float () const { return Convert16bitFloatTo32bits( m_storage.rawWord ); }
};
//=========================================================
// Fit a 3D vector in 48 bits
//=========================================================
class Vector48
{
public:
// Construction/destruction:
Vector48(void) {}
Vector48(vec_t X, vec_t Y, vec_t Z) { x.SetFloat( X ); y.SetFloat( Y ); z.SetFloat( Z ); }
// assignment
Vector48& operator=(const Vector &vOther);
operator Vector ();
const float operator[]( int i ) const { return (((float16 *)this)[i]).GetFloat(); }
float16 x;
float16 y;
float16 z;
};
// The uses of isel below are malformed because the first expression is unsigned and thus always >= 0,
// so this whole expression maps to a simple assignment. This was found through a noisy clang
// warning. I am preprocessing this out until it is needed.
#if 0
inline void float16::ConvertFourFloatsTo16BitsAtOnce( float16 * RESTRICT pOut,
const float *a, const float *b, const float *c, const float *d )
{
COMPILE_TIME_ASSERT( sizeof(float) == 4 );
// being meant for use on the PPC, this is tuned for that.
// it is mostly branchless, except for the large outer for loop,
// since there's enough instructions inside that unrolling is
// a bad idea. This fucntion is four-at-once to simplify SIMDifying in the
// future should a convenient SIMD way to decimate emerge
// Also, because this is only used for the special case of converting
// float arrays into float16 GPU textures, this turns denorms into zeroes
// and infinities into MAXFLTs, since the shader can't deal with nonfinite
// numbers anyway.
// alias the input floats onto a union giving their mantissa etc
const float32bits * const inFloat[4] = {
reinterpret_cast<const float32bits *>(a),
reinterpret_cast<const float32bits *>(b),
reinterpret_cast<const float32bits *>(c),
reinterpret_cast<const float32bits *>(d) };
const static unsigned int maxfloat16bitsAsInt = 0x477FE000; // 65504.0f
const static unsigned int SIGNBIT = 0x80000000;
for ( int i = 0 ; i < 4 ; ++i ) // performs better not unrolled (less stack spilling)
{
unsigned int onGPR = inFloat[i]->rawAsInt;
// make a mask for each word; will be all 1's if the float is
// negative, all 0s if it is positive. Can do this just by
// using arithmetic shift to smear out the sign bit.
int isNegative = ((int) onGPR) >> 31;
// clamp to be within -maxfloat16bits, maxfloat16bits
// can't just use isel because IEEE754 floats are sign-magnitude, not two's comp. However,
// positive IEEE754s can be compared as if they were ints. So, we need to do a little extra
// work to test the negative case efficiently.
// clamp to -maxfloat16
#error See above for explanation of why this and other uses of isel in this file are broken.
int clampedNeg = isel( ((int)(onGPR & ~SIGNBIT)) - maxfloat16bitsAsInt, // -in >= maxfloatbits so in <= -maxfloat
maxfloat16bitsAsInt | SIGNBIT, // -65504.0f
onGPR );
// clamp to +maxfloat16
int clampedPos = isel( ((int)(onGPR)) - maxfloat16bitsAsInt, // in >= maxfloatbits
maxfloat16bitsAsInt , // -65504.0f
onGPR );
// take advantage of PPC's andc operator to effectively do a masked-move
onGPR = ( clampedNeg & isNegative ) | ( clampedPos & ~isNegative );
// fish out the input exponent and mantis fields directly (using the union induces an LHS)
int inExponent = (onGPR & 0x7f800000) >> 23;
unsigned int inMantissa = (onGPR & 0x007FFFFF);
int exponent = inExponent - 127 + 15; // rebias the exponent
unsigned int mantissa = isel( exponent, inMantissa >> 13, (unsigned) 0 ); // squash the mantissa to zero if the number is too small to represent (no denorms)
float16bits output;
// saturate the mantissa if rebiased exponent >= 31 (too big to store)
output.bits.mantissa = isel( exponent - 31, (unsigned) 0x3ff, mantissa );
// clamp the exponent to 0..30
output.bits.biased_exponent = isel( exponent, isel( exponent - 31, 30, exponent ), 0 );
output.bits.sign = isNegative; // this doesn't lhs, but instead issues the insrdi op to a word on GPR
pOut[i].m_storage.rawWord = output.rawWord;
}
}
#endif
#ifdef _X360
#define __cntlzw _CountLeadingZeros
#endif
template< bool BRANCHLESS >
inline unsigned short float16::ConvertFloatTo16bitsNonDefault( float input )
{
float16bits output;
float32bits inFloat;
//if ( !BRANCHLESS ) // x86 code
{
if ( input > maxfloat16bits )
input = maxfloat16bits;
else if ( input < -maxfloat16bits )
input = -maxfloat16bits;
inFloat.rawFloat = input;
}
/*
// The use of isel is incorrect because the first expression is unsigned and therefore always passes
// the test.
else // PPC code
{
// force the float onto the stack and then a GPR so we eat the LHS only once.
// you can't just write to one union member and then read back another;
// the compiler is inconsistent about supporting that kind of type-punning.
// (ie, it will work in one file, but not another.)
memcpy(&inFloat.rawFloat, &input, sizeof(inFloat.rawFloat));
// inFloat.rawFloat = input;
// clamp using the GPR
{
const unsigned int maxfloat16bitsAsInt = 0x477FE000; // 65504.0f
// clamp to be <= maxfloat16bits
uint32 &rawint = inFloat.rawAsInt; // <--- lhs
if ( rawint & 0x80000000 ) // negative
{
// because floats are sign-magnitude, not two's comp, need to
// flip the int positive briefly to do the isel comparison
#error See above for explanation of why this and other uses of isel in this file are broken.
rawint = isel( ((int)(rawint & ~0x80000000)) - maxfloat16bitsAsInt, // -in >= maxfloatbits so in <= -maxfloat
maxfloat16bitsAsInt | 0x80000000, // -65504.0f
rawint );
}
else // positive
{
rawint = isel( ((int)(rawint)) - maxfloat16bitsAsInt, // in >= maxfloatbits
maxfloat16bitsAsInt , // -65504.0f
rawint );
}
}
}
*/
output.bits.sign = inFloat.bits.sign;
if ( (inFloat.bits.biased_exponent==0) )
{
// zero and denorm both map to zero
output.bits.mantissa = 0;
output.bits.biased_exponent = 0;
}
else if ( inFloat.bits.biased_exponent==0xff )
{
if ( !BRANCHLESS )
{
if ( (inFloat.bits.mantissa==0) )
{
/*
// infinity
output.bits.mantissa = 0;
output.bits.biased_exponent = 31;
*/
// infinity maps to maxfloat
output.bits.mantissa = 0x3ff;
output.bits.biased_exponent = 0x1e;
}
else if ( (inFloat.bits.mantissa!=0) )
{
/*
// NaN
output.bits.mantissa = 1;
output.bits.biased_exponent = 31;
*/
// NaN maps to zero
output.bits.mantissa = 0;
output.bits.biased_exponent = 0;
}
}
else // branchless, only meant for PPC really bc needing the cntlzw op.
{
// else if ( inFloat.bits.biased_exponent==0xff ) // either infinity (biased_exponent is 0xff) or NaN.
{
#ifdef PLATFORM_PPC
#if defined(__SPU__)
int mantissamask = __builtin_clz( output.bits.mantissa ) - 32; // this is 0 if the mantissa is zero, and negative otherwise
#else
int mantissamask = __cntlzw( output.bits.mantissa ) - 32; // this is 0 if the mantissa is zero, and negative otherwise
#endif
#else
int mantissamask = output.bits.mantissa ? -1 : 0;
#endif
output.bits.mantissa = isel( mantissamask, 0x3ff, 0 ); //infinity maps to maxfloat, NaN to zero
output.bits.biased_exponent = isel( mantissamask, 0x1e, 0 );
output.bits.sign = inFloat.bits.sign;
}
}
}
else
{
// regular number
int new_exp = inFloat.bits.biased_exponent-float32bias;
// it's actually better to branch in these cases on PPC,
// because the variable bit shift is such a massive penalty
// that it's worth a branch penalty to avoid it.
if (new_exp<-24)
{
// this maps to 0
output.bits.mantissa = 0;
output.bits.biased_exponent = 0;
}
if (new_exp<-14)
{
// this maps to a denorm
output.bits.biased_exponent = 0;
unsigned int exp_val = ( unsigned int )( -14 - new_exp );
if( exp_val > 0 && exp_val < 11 )
{
output.bits.mantissa = ( 1 << ( 10 - exp_val ) ) + ( inFloat.bits.mantissa >> ( 13 + exp_val ) );
}
}
else if (new_exp>15)
{
#if 0
// map this value to infinity
output.bits.mantissa = 0;
output.bits.biased_exponent = 31;
#else
// to big. . . maps to maxfloat
output.bits.mantissa = 0x3ff;
output.bits.biased_exponent = 0x1e;
#endif
}
else
{
output.bits.biased_exponent = new_exp+15;
output.bits.mantissa = (inFloat.bits.mantissa >> 13);
}
}
return output.rawWord;
}
inline float float16::Convert16bitFloatTo32bits( unsigned short input )
{
float32bits output;
const float16bits &inFloat = *((float16bits *)&input);
if( IsInfinity( inFloat ) )
{
return maxfloat16bits * ( ( inFloat.bits.sign == 1 ) ? -1.0f : 1.0f );
}
if( IsNaN( inFloat ) )
{
return 0.0;
}
if( inFloat.bits.biased_exponent == 0 && inFloat.bits.mantissa != 0 )
{
// denorm
const float half_denorm = (1.0f/16384.0f); // 2^-14
float mantissa = ((float)(inFloat.bits.mantissa)) / 1024.0f;
float sgn = (inFloat.bits.sign)? -1.0f :1.0f;
output.rawFloat = sgn*mantissa*half_denorm;
}
else
{
// regular number
unsigned mantissa = inFloat.bits.mantissa;
unsigned biased_exponent = inFloat.bits.biased_exponent;
unsigned sign = ((unsigned)inFloat.bits.sign) << 31;
biased_exponent = ( (biased_exponent - float16bias + float32bias) * (biased_exponent != 0) ) << 23;
mantissa <<= (23-10);
*((unsigned *)&output) = ( mantissa | biased_exponent | sign );
}
return output.rawFloat;
}
inline Vector48& Vector48::operator=(const Vector &vOther)
{
CHECK_VALID(vOther);
x.SetFloat( vOther.x );
y.SetFloat( vOther.y );
z.SetFloat( vOther.z );
return *this;
}
inline Vector48::operator Vector ()
{
Vector tmp;
tmp.x = x.GetFloat();
tmp.y = y.GetFloat();
tmp.z = z.GetFloat();
return tmp;
}
//=========================================================
// Fit a 2D vector in 32 bits
//=========================================================
class Vector2d32
{
public:
// Construction/destruction:
Vector2d32(void) {}
Vector2d32(vec_t X, vec_t Y) { x.SetFloat( X ); y.SetFloat( Y ); }
// assignment
Vector2d32& operator=(const Vector &vOther);
Vector2d32& operator=(const Vector2D &vOther);
operator Vector2D ();
void Init( vec_t ix = 0.f, vec_t iy = 0.f);
float16_with_assign x;
float16_with_assign y;
};
inline Vector2d32& Vector2d32::operator=(const Vector2D &vOther)
{
x.SetFloat( vOther.x );
y.SetFloat( vOther.y );
return *this;
}
inline Vector2d32::operator Vector2D ()
{
Vector2D tmp;
tmp.x = x.GetFloat();
tmp.y = y.GetFloat();
return tmp;
}
inline void Vector2d32::Init( vec_t ix, vec_t iy )
{
x.SetFloat(ix);
y.SetFloat(iy);
}
//=========================================================
// FAST SIMD BATCH OPERATIONS
//=========================================================
#ifdef _X360
//// Compressed vector formats: unpack Vector48 and Quaternion48 onto SIMD registers.
// Only available on 360 for now because SSE1 lacks the necessary operations. SSE2 could
// do it but we can't count on that yet.
// If you have many v48's or q48's to stream, please note the functions designed to
// work on them many at a time.
extern const uint16 ALIGN16 g_SIMD_Quat48_Unpack_Shift[]; //< Shuffles the z component of the quat48 left by one bit.
extern const uint8 ALIGN16 g_SIMD_Quat48_Unpack_Permute0[16];
extern const fltx4 g_SIMD_Quat48_Unpack_Magic_Constants;
extern const uint8 ALIGN16 g_SIMD_Quat48_Unpack_Permute1[16];
extern const uint8 ALIGN16 g_SIMD_Quat48_Unpack_Permute2[16];
extern const uint8 ALIGN16 g_SIMD_Quat48_Unpack_Permute3[16];
extern const uint8 ALIGN16 g_SIMD_Quat48_Unpack_Permute4[16];
extern const uint8 ALIGN16 g_SIMD_Quat48_Unpack_Permute5[16];
extern const uint8 ALIGN16 g_SIMD_Quat48_Unpack_Permute6[16];
extern const uint8 ALIGN16 g_SIMD_Quat48_Unpack_Permute7[16];
// unpack a single vector48 at the pointer into the x,y,z components of a fltx4.
// the w is total garbage.
FORCEINLINE fltx4 UnpackVector48SIMD( const Vector48 *pVec )
{
// load the three 16-bit floats into the first 48 bits of ret:
fltx4 ret = XMLoadVector4((const void *)&pVec->x);
// shuffle the top 64 bits of ret down to the least significant (the z,w) -- 16 of those bits are garbage.
ret = __vrlimi( ret, ret, 2 | 1, 2 ); // rotate left by 2 words and insert into z,w components
// now unpack the 16-bit floats into 32-bit floats. This is a hardware op, woohoo!
ret = __vupkd3d( ret , VPACK_FLOAT16_4 );
return ret;
}
// unpack a single Quaternion48 at the pointer into the x,y,z,w components of a fltx4
// FIXME!!! If we need a version of this that runs on 360, there is a work-in-progress version that hasn't been debugged lower in the file.
FORCEINLINE fltx4 UnpackQuaternion48SIMD( const Quaternion48 * RESTRICT pVec )
{
// A quaternion 48 stores the x and y components as 0..65535 , which is almost mapped onto -1.0..1.0 via (x - 32768) / 32768.5 .
// z is stored as 0..32767, which is almost mapped onto -1..1 via (z - 16384) / 16384.5 .
// w is inferred from 1 - the dot product of the other tree components. the top bit of what would otherwise be the 16-bit z is
// w's sign bit.
fltx4 q16s = XMLoadVector3((const void *)pVec);
fltx4 shift = __lvx(&g_SIMD_Quat48_Unpack_Shift, 0); // load the aligned shift mask that we use to shuffle z.
fltx4 permute = __lvx(&g_SIMD_Quat48_Unpack_Permute0, 0); // load the permute word that shuffles x,y,z into their own words
bool wneg = pVec->wneg; // loading pVec into two different kinds of registers -- but not shuffling between (I hope!) so no LHS.
q16s = __vperm( q16s, Four_Threes, permute ); // permute so that x, y, and z are now each in their own words. The top half is the floating point rep of 3.0f
q16s = __vslh(q16s, shift); // shift the z component left by one bit, tossing out the wneg sign bit and mapping z from [0..2^15) to [0..2^16)
// each word of q16s contains 3.0 + n * 2^-22 -- convert this so that we get numbers on the range -1..1
const fltx4 vUpkMul = SplatXSIMD(g_SIMD_Quat48_Unpack_Magic_Constants); // { UnpackMul16s, UnpackMul16s, UnpackMul16s, UnpackMul16s };
const fltx4 vUpkAdd = SplatYSIMD(g_SIMD_Quat48_Unpack_Magic_Constants);
/*
fltx4 ret = __vcfux( q16s, 0 ); // convert from uint16 to floats.
// scale from 0..65535 to -1..1 : tmp.x = ((int)x - 32768) * (1 / 32768.0);
ret = __vmaddfp( ret, g_SIMD_Quat48_DivByU15, Four_NegativeOnes );
*/
fltx4 ret = __vmaddfp( q16s, vUpkMul, vUpkAdd );
// now, work out what w must be.
fltx4 dotxyz = Dot3SIMD( ret, ret ); // all components are dot product of ret w/ self.
dotxyz = ClampVectorSIMD( dotxyz, Four_Zeros, Four_Ones );
fltx4 ww = SubSIMD( Four_Ones, dotxyz ); // all components are 1 - dotxyz
ww = SqrtSIMD(ww); // all components are sqrt(1-dotxyz)
if (wneg)
{
ret = __vrlimi( ret, NegSIMD(ww), 1, 0 ); // insert one element from the ww vector into the w component of ret
}
else
{
ret = __vrlimi( ret, ww, 1, 0 ); // insert one element from the ww vector into the w component of ret
}
return ret;
}
// Many-at-a-time unpackers.
/// Unpack eight consecutive Vector48's in memory onto eight SIMD registers.
/// The Vector48 pointer must be 16-byte aligned. Eight Vector48s add up
/// to 48 bytes long. You should maybe think about prefetching.
FORCEINLINE void UnpackEightVector48SIMD( fltx4 &out1, fltx4 &out2, fltx4 &out3, fltx4 &out4,
fltx4 &out5, fltx4 &out6, fltx4 &out7, fltx4 &out8,
Vector48 * RESTRICT pVecs )
{
AssertMsg((reinterpret_cast<unsigned int>(pVecs) & 0x0F) == 0, "Input to UnpackEightVector48SIMD is not 16-byte aligned." );
// first load the data onto three packed SIMD vectors, which contain eight Vector48s between them.
// I've named them very explicitly so you can follow the movement of the input data.
fltx4 x0y0z0x1y1z1x2y2, z2x3y3z3x4y4z4x5, y5z5x6y6z6x7y7z7;
x0y0z0x1y1z1x2y2 = __lvx( pVecs, 0 ); // load reintrepret_cast<fltx 4 *>(pVecs) + 0
z2x3y3z3x4y4z4x5 = __lvx( pVecs, 16 ); // load reintrepret_cast<fltx 4 *>(pVecs) + 1
y5z5x6y6z6x7y7z7 = __lvx( pVecs, 32 ); // load reintrepret_cast<fltx 4 *>(pVecs) + 2
// Now, start unpacking. The __vupkd3d operation can turn 16-bit floats into 32-bit floats in a single op!
// It converts the contents of the z and w words of the input fltx4 , so we need to process a word to do
// one half, then rotate it to do the other half.
fltx4 y1z1x2y2 = __vupkd3d( x0y0z0x1y1z1x2y2 , VPACK_FLOAT16_4 );
x0y0z0x1y1z1x2y2 = __vrlimi( x0y0z0x1y1z1x2y2, x0y0z0x1y1z1x2y2, 0xf, 2 ); // actually y1z1x2y2x0y0z0x1 now. For perf it's important that the first param to vrlimi also be the assignee.
fltx4 x4y4z4x5 = __vupkd3d( z2x3y3z3x4y4z4x5 , VPACK_FLOAT16_4 );
z2x3y3z3x4y4z4x5 = __vrlimi( z2x3y3z3x4y4z4x5, z2x3y3z3x4y4z4x5, 0xf, 2 );
fltx4 z6x7y7z7 = __vupkd3d( y5z5x6y6z6x7y7z7 , VPACK_FLOAT16_4 );
y5z5x6y6z6x7y7z7 = __vrlimi( y5z5x6y6z6x7y7z7, y5z5x6y6z6x7y7z7, 0xf, 2 );
fltx4 x0y0z0x1 = __vupkd3d( x0y0z0x1y1z1x2y2 , VPACK_FLOAT16_4 );
fltx4 z2x3y3z3 = __vupkd3d( z2x3y3z3x4y4z4x5 , VPACK_FLOAT16_4 );
fltx4 y5z5x6y6 = __vupkd3d( y5z5x6y6z6x7y7z7 , VPACK_FLOAT16_4 );
// permute to populate the out-registers with part of their vectors:
out1 = x0y0z0x1; // DONE
out2 = __vpermwi( y1z1x2y2, VPERMWI_CONST(0, 0, 1, 0) ); // __y1z1__
out3 = __vpermwi( y1z1x2y2, VPERMWI_CONST(2, 3, 0, 0) ); // x2y2____
out4 = __vpermwi( z2x3y3z3, VPERMWI_CONST(1, 2, 3, 0) ); // x3y3z3__ // DONE
out5 = x4y4z4x5; // DONE
out6 = __vpermwi( y5z5x6y6, VPERMWI_CONST(0, 0, 1, 0) ); // __y5z5__
out7 = __vpermwi( y5z5x6y6, VPERMWI_CONST(2, 3, 0, 0) ); // x6y6____
out8 = __vpermwi( z6x7y7z7, VPERMWI_CONST(1, 2, 3, 0) ); // x7y7z7__ // DONE
// there are four more to finish, which we do with a masked insert
out2 = __vrlimi( out2, x0y0z0x1, 8, 3 ); // x1y1z1__
out3 = __vrlimi( out3, z2x3y3z3, 2, 2 ); // x2y2x2__
out6 = __vrlimi( out6, x4y4z4x5, 8, 3 ); // x5y5z5__
out7 = __vrlimi( out7, z6x7y7z7, 2, 2 ); // x6y6z6__
// and we're done!
}
/// Unpack eight consecutive Quaternion48's in memory onto eight SIMD registers.
/// The Quaternion48 pointer must be 16-byte aligned. Eight Quaternion48s add up
/// to 48 bytes long. You should maybe think about prefetching.
//
// This could be improved with verticalization, so that the W sqrts happen
// on two rather than eight vectors, and then transposing. This would make
// the initial permuatation even more complicated.
FORCEINLINE void UnpackEightQuaternion48SIMD( fltx4 &out0, fltx4 &out1, fltx4 &out2, fltx4 &out3,
fltx4 &out4, fltx4 &out5, fltx4 &out6, fltx4 &out7,
Quaternion48 * RESTRICT pVecs )
{
AssertMsg((reinterpret_cast<unsigned int>(pVecs) & 0x0F) == 0, "Input to UnpackEightQuaternion48SIMD is not 16-byte aligned." );
// each word of q16s contains 3.0 + n * 2^-22 -- convert this so that we get numbers on the range -1..1
const fltx4 vUpkMul = SplatXSIMD(g_SIMD_Quat48_Unpack_Magic_Constants); // { UnpackMul16s, UnpackMul16s, UnpackMul16s, UnpackMul16s };
const fltx4 vUpkAdd = SplatYSIMD(g_SIMD_Quat48_Unpack_Magic_Constants);
const fltx4 shift = __lvx(&g_SIMD_Quat48_Unpack_Shift, 0); // load the aligned shift mask that we use to shuffle z left by one bit.
// first load the data onto three packed SIMD vectors, which contain eight Quaternion48s between them.
// I've named them very explicitly so you can follow the movement of the input data.
fltx4 x0y0z0x1y1z1x2y2, z2x3y3z3x4y4z4x5, y5z5x6y6z6x7y7z7;
x0y0z0x1y1z1x2y2 = __lvx( pVecs, 0 ); // load reintrepret_cast<fltx 4 *>(pVecs) + 0
z2x3y3z3x4y4z4x5 = __lvx( pVecs, 16 ); // load reintrepret_cast<fltx 4 *>(pVecs) + 1
y5z5x6y6z6x7y7z7 = __lvx( pVecs, 32 ); // load reintrepret_cast<fltx 4 *>(pVecs) + 2
// shove each quat onto its own fltx4, by using the permute operation
// each halfword argument goes into the bottom 16 bits of the floating
// point rep of 3.0f, then we use a magic constant to scale them.
out0 = __vperm( x0y0z0x1y1z1x2y2, Four_Threes, *reinterpret_cast<const fltx4 *>(&g_SIMD_Quat48_Unpack_Permute0) ); // __x0__y0__z0____
out1 = __vperm( x0y0z0x1y1z1x2y2, Four_Threes, *reinterpret_cast<const fltx4 *>(&g_SIMD_Quat48_Unpack_Permute1) ); // __x1__y1__z1____
// postpone 2 since it straddles two words, we'll get back to it
out3 = __vperm( z2x3y3z3x4y4z4x5, Four_Threes, *reinterpret_cast<const fltx4 *>(&g_SIMD_Quat48_Unpack_Permute3) ); // __x3__y3__z3__z2 // z2 is important, goes into out2
out4 = __vperm( z2x3y3z3x4y4z4x5, Four_Threes, *reinterpret_cast<const fltx4 *>(&g_SIMD_Quat48_Unpack_Permute4) ); // __x4__y4__z4__x5 // x5 is important, goes into out5
// 5 straddles two words
out6 = __vperm( y5z5x6y6z6x7y7z7, Four_Threes, *reinterpret_cast<const fltx4 *>(&g_SIMD_Quat48_Unpack_Permute6) ); // __x6__y6__z6____
out7 = __vperm( y5z5x6y6z6x7y7z7, Four_Threes, *reinterpret_cast<const fltx4 *>(&g_SIMD_Quat48_Unpack_Permute7) ); // __x7__y7__z7____
// now get back to the straddlers, which we make by blending together a prior output and the other source word
out2 = __vperm( x0y0z0x1y1z1x2y2, out3, *reinterpret_cast<const fltx4 *>(&g_SIMD_Quat48_Unpack_Permute2) ); // __x2__y2__z2____
out5 = __vperm( y5z5x6y6z6x7y7z7, out4, *reinterpret_cast<const fltx4 *>(&g_SIMD_Quat48_Unpack_Permute5) ); // __x5__y5__z5____
// the top bit of the z component in each word isn't part of the number; it's
// a flag indicating whether the eventual w component should be negative.
// so, we need to move the 0x00008000 bit of the z word onto the top bit
// of the w word, which is a rotation two bytes right, or 14 bytes left.
fltx4 wneg[8];
// juggle all the z halfwords left one bit (toss the wneg sign bit, multiply by two)
wneg[0] = __vsldoi( out0, out0, 14 );
out0 = __vslh(out0, shift); // shift the z component left by one bit, tossing out the wneg sign bit and mapping z from [0..2^15) to [0..2^16)
wneg[1] = __vsldoi( out1, out1, 14 );
out1 = __vslh(out1, shift); // shift the z component left by one bit, tossing out the wneg sign bit and mapping z from [0..2^15) to [0..2^16)
wneg[2] = __vsldoi( out2, out2, 14 );
out2 = __vslh(out2, shift); // shift the z component left by one bit, tossing out the wneg sign bit and mapping z from [0..2^15) to [0..2^16)
wneg[3] = __vsldoi( out3, out3, 14 );
out3 = __vslh(out3, shift); // shift the z component left by one bit, tossing out the wneg sign bit and mapping z from [0..2^15) to [0..2^16)
wneg[4] = __vsldoi( out4, out4, 14 );
out4 = __vslh(out4, shift); // shift the z component left by one bit, tossing out the wneg sign bit and mapping z from [0..2^15) to [0..2^16)
wneg[5] = __vsldoi( out5, out5, 14 );
out5 = __vslh(out5, shift); // shift the z component left by one bit, tossing out the wneg sign bit and mapping z from [0..2^15) to [0..2^16)
wneg[6] = __vsldoi( out6, out6, 14 );
out6 = __vslh(out6, shift); // shift the z component left by one bit, tossing out the wneg sign bit and mapping z from [0..2^15) to [0..2^16)
wneg[7] = __vsldoi( out7, out7, 14 );
out7 = __vslh(out7, shift); // shift the z component left by one bit, tossing out the wneg sign bit and mapping z from [0..2^15) to [0..2^16)
// create a mask that is just the sign bit of the w word.
fltx4 vAllOneBits = __vspltisw(-1); // Shift 31
fltx4 signMask = __vslw(vAllOneBits, vAllOneBits); // all the sign bits
signMask = __vrlimi( signMask, Four_Zeros, 14, 0 ); // zero out x,y,z words
// this macro defines the operations that will be performed on each of the eight words:
// * scale from 0..65535 to -1..1 : tmp.x = ((int)x - 32768) * (1 / 32768.0);
// * take the xyz dot product to get 1 - w^2
// * subtract from one to get w^2
// * square root to get zero
// * OR in the wneg sign mask to get sign for zero.
// though the macro makes it look like these are being done in serial,
// in fact the compiler will reorder them to minimize stalls.
fltx4 ONE = Four_Ones;
fltx4 dotxyz[8];
fltx4 ww[8];
// out0 = __vmaddfp( out0, vUpkMul, vUpkAdd );
// dotxyz[0] = Dot3SIMD( out0, out0 );
// clamnp dotxyz if it's more than 1.0
// all components are 1 - dotxyz
// clear all but w's sign bit in wneg
// all components are sqrt(1-dotxyz)
// toggle w's sign where necessary
// insert one element from the ww vector into the w component of ret
#define COMPUTE( target, number ) \
target ## number = __vmaddfp( target ## number, vUpkMul, vUpkAdd ); \
dotxyz[number] = Dot3SIMD( target ## number, target ## number ); \
dotxyz[number] = __vminfp( dotxyz[number], ONE ); \
ww[number] = SubSIMD( ONE, dotxyz[number] ); \
wneg[number] = AndSIMD( wneg[number], signMask ) ; \
ww[number] = SqrtSIMD(ww[number]); \
ww[number] = OrSIMD( ww[number], wneg[number] ); \
target ## number = __vrlimi( target ## number, ww[number], 1, 0 );
COMPUTE(out, 0);
COMPUTE(out, 1);
COMPUTE(out, 2);
COMPUTE(out, 3);
COMPUTE(out, 4);
COMPUTE(out, 5);
COMPUTE(out, 6);
COMPUTE(out, 7);
#undef COMPUTE
}
#elif defined(_PS3)
// unpack a single vector48 at the pointer into the x,y,z components of a fltx4.
// the w is total garbage.
FORCEINLINE fltx4 UnpackVector48SIMD( const Vector48 *pVec )
{
// PS3 libs just give us this
Vectormath::Aos::Vector3 ret;
Vectormath::Aos::loadHalfFloats( ret, reinterpret_cast<const uint16_t *>(&pVec->x) );
return ret.get128();
}
extern const uint16 ALIGN16 g_SIMD_Quat48_Unpack_Shift[]; //< Shuffles the z component of the quat48 left by one bit.
extern const uint8 ALIGN16 g_SIMD_Quat48_Unpack_Permute0[16];
extern const fltx4 g_SIMD_Quat48_Unpack_Magic_Constants;
extern const uint8 ALIGN16 g_SIMD_Quat48_Unpack_Permute1[16];
extern const uint8 ALIGN16 g_SIMD_Quat48_Unpack_Permute2[16];
extern const uint8 ALIGN16 g_SIMD_Quat48_Unpack_Permute3[16];
extern const uint8 ALIGN16 g_SIMD_Quat48_Unpack_Permute4[16];
extern const uint8 ALIGN16 g_SIMD_Quat48_Unpack_Permute5[16];
extern const uint8 ALIGN16 g_SIMD_Quat48_Unpack_Permute6[16];
extern const uint8 ALIGN16 g_SIMD_Quat48_Unpack_Permute7[16];
// unpack a single Quaternion48 at the pointer into the x,y,z,w components of a fltx4
FORCEINLINE fltx4 UnpackQuaternion48SIMD( const Quaternion48 * RESTRICT pVec )
{
// A quaternion 48 stores the x and y components as 0..65535 , which is almost mapped onto -1.0..1.0 via (x - 32768) / 32768.5 .
// z is stored as 0..32767, which is almost mapped onto -1..1 via (z - 16384) / 16384.5 .
// w is inferred from 1 - the dot product of the other tree components. the top bit of what would otherwise be the 16-bit z is
// w's sign bit.
fltx4 q16s = LoadUnaligned3SIMD((const void *)pVec);
#if defined(__SPU__)
vec_ushort8 shift = vec_ld( 0, (short unsigned int *)g_SIMD_Quat48_Unpack_Shift ); // load the aligned shift mask that we use to shuffle z.
vec_uchar16 permute = vec_ld(0, (unsigned char *)g_SIMD_Quat48_Unpack_Permute0 ); // load the permute word that shuffles x,y,z into their own words
#else
vec_ushort8 shift = vec_ld( 0, g_SIMD_Quat48_Unpack_Shift ); // load the aligned shift mask that we use to shuffle z.
vec_uchar16 permute = vec_ld(0, g_SIMD_Quat48_Unpack_Permute0 ); // load the permute word that shuffles x,y,z into their own words
#endif
bool wneg = pVec->wneg; // loading pVec into two different kinds of registers -- but not shuffling between (I hope!) so no LHS.
q16s = vec_perm( q16s, Four_Threes, permute ); // permute so that x, y, and z are now each in their own words. The top half is the floating point rep of 3.0f
#if defined(__SPU__)
q16s = (fltx4) vec_sl( (vec_ushort8) q16s, shift); // shift the z component left by one bit, tossing out the wneg sign bit and mapping z from [0..2^15) to [0..2^16)
#else
q16s = (fltx4) vec_vslh( (vec_ushort8) q16s, shift); // shift the z component left by one bit, tossing out the wneg sign bit and mapping z from [0..2^15) to [0..2^16)
#endif
// each word of q16s contains 3.0 + n * 2^-22 -- convert this so that we get numbers on the range -1..1
const fltx4 vUpkMul = SplatXSIMD(g_SIMD_Quat48_Unpack_Magic_Constants); // { UnpackMul16s, UnpackMul16s, UnpackMul16s, UnpackMul16s };
const fltx4 vUpkAdd = SplatYSIMD(g_SIMD_Quat48_Unpack_Magic_Constants);
fltx4 ret = vec_madd( q16s, vUpkMul, vUpkAdd );
// now, work out what w must be.
fltx4 dotxyz = Dot3SIMD( ret, ret ); // all components are dot product of ret w/ self.
dotxyz = ClampVectorSIMD( dotxyz, Four_Zeros, Four_Ones );
fltx4 ww = SubSIMD( Four_Ones, dotxyz ); // all components are 1 - dotxyz
ww = SqrtSIMD(ww); // all components are sqrt(1-dotxyz)
// insert one element from the ww vector into the w component of ret
ret = MaskedAssign( LoadAlignedSIMD(g_SIMD_ComponentMask[3]), wneg ? NegSIMD(ww) : ww, ret );
return ret;
}
#endif
#if defined( _X360 )
#pragma bitfield_order( pop )
#elif defined( _PS3 )
#pragma ms_struct off
#pragma reverse_bitfields off
#endif
#endif