//===== Copyright © 1996-2005, Valve Corporation, All rights reserved. ======//
//
// Purpose: - defines SIMD "structure of arrays" classes and functions.
//
//===========================================================================//
#ifndef SSEQUATMATH_H
#define SSEQUATMATH_H

#ifdef _WIN32
#pragma once
#endif


#include "mathlib/ssemath.h"

// Use this #define to allow SSE versions of Quaternion math
// to exist on PC.
// On PC, certain horizontal vector operations are not supported.
// This causes the SSE implementation of quaternion math to mix the
// vector and scalar floating point units, which is extremely 
// performance negative if you don't compile to native SSE2 (which 
// we don't as of Sept 1, 2007). So, it's best not to allow these
// functions to exist at all. It's not good enough to simply replace
// the contents of the functions with scalar math, because each call
// to LoadAligned and StoreAligned will result in an unnecssary copy
// of the quaternion, and several moves to and from the XMM registers.
//
// Basically, the problem you run into is that for efficient SIMD code,
// you need to load the quaternions and vectors into SIMD registers and
// keep them there as long as possible while doing only SIMD math,
// whereas for efficient scalar code, each time you copy onto or ever
// use a fltx4, it hoses your pipeline. So the difference has to be
// in the management of temporary variables in the calling function,
// not inside the math functions.
//
// If you compile assuming the presence of SSE2, the MSVC will abandon
// the traditional x87 FPU operations altogether and make everything use
// the SSE2 registers, which lessens this problem a little.

// permitted only on 360, as we've done careful tuning on its Altivec math.
// FourQuaternions, however, are always allowed, because vertical ops are
// fine on SSE.
#ifdef _X360
#define ALLOW_SIMD_QUATERNION_MATH 1  // not on PC!
#endif



//---------------------------------------------------------------------
// Load/store quaternions
//---------------------------------------------------------------------
#ifndef _X360
// Using STDC or SSE
FORCEINLINE fltx4 LoadAlignedSIMD( const QuaternionAligned & pSIMD )
{
	fltx4 retval = LoadAlignedSIMD( pSIMD.Base() );
	return retval;
}

FORCEINLINE fltx4 LoadAlignedSIMD( const QuaternionAligned * RESTRICT pSIMD )
{
	fltx4 retval = LoadAlignedSIMD( pSIMD->Base() );
	return retval;
}

FORCEINLINE void StoreAlignedSIMD( QuaternionAligned * RESTRICT pSIMD, const fltx4 & a )
{
	StoreAlignedSIMD( pSIMD->Base(), a );
}
#else

// for the transitional class -- load a QuaternionAligned
FORCEINLINE fltx4 LoadAlignedSIMD( const QuaternionAligned & pSIMD )
{
	fltx4 retval = XMLoadVector4A( pSIMD.Base() );
	return retval;
}

FORCEINLINE fltx4 LoadAlignedSIMD( const QuaternionAligned * RESTRICT pSIMD )
{
	fltx4 retval = XMLoadVector4A( pSIMD );
	return retval;
}

FORCEINLINE void StoreAlignedSIMD( QuaternionAligned * RESTRICT pSIMD, const fltx4 & a )
{
	XMStoreVector4A( pSIMD->Base(), a );
}

// From a RadianEuler packed onto a fltx4, to a quaternion
fltx4 AngleQuaternionSIMD( FLTX4 vAngles );

#endif


#if ALLOW_SIMD_QUATERNION_MATH
//---------------------------------------------------------------------
// Make sure quaternions are within 180 degrees of one another, if not, reverse q
//---------------------------------------------------------------------
FORCEINLINE fltx4 QuaternionAlignSIMD( const fltx4 &p, const fltx4 &q )
{
	// decide if one of the quaternions is backwards
	fltx4 a = SubSIMD( p, q );
	fltx4 b = AddSIMD( p, q );
	a = Dot4SIMD( a, a );
	b = Dot4SIMD( b, b );
	fltx4 cmp = CmpGtSIMD( a, b );
	fltx4 result = MaskedAssign( cmp, NegSIMD(q), q );
	return result;
}

//---------------------------------------------------------------------
// Normalize Quaternion
//---------------------------------------------------------------------
#if USE_STDC_FOR_SIMD

FORCEINLINE fltx4 QuaternionNormalizeSIMD( const fltx4 &q )
{
	fltx4 radius, result;
	radius = Dot4SIMD( q, q );

	if ( SubFloat( radius, 0 ) ) // > FLT_EPSILON && ((radius < 1.0f - 4*FLT_EPSILON) || (radius > 1.0f + 4*FLT_EPSILON))
	{
		float iradius = 1.0f / sqrt( SubFloat( radius, 0 ) );
		result = ReplicateX4( iradius );
		result = MulSIMD( result, q );
		return result;
	}
	return q;
}

#else

// SSE + X360 implementation
FORCEINLINE fltx4 QuaternionNormalizeSIMD( const fltx4 &q )
{
	fltx4 radius, result, mask;
	radius = Dot4SIMD( q, q );
	mask = CmpEqSIMD( radius, Four_Zeros ); // all ones iff radius = 0
	result = ReciprocalSqrtSIMD( radius );
	result = MulSIMD( result, q );
	return MaskedAssign( mask, q, result );	// if radius was 0, just return q
}

#endif


//---------------------------------------------------------------------
// 0.0 returns p, 1.0 return q.
//---------------------------------------------------------------------
FORCEINLINE fltx4 QuaternionBlendNoAlignSIMD( const fltx4 &p, const fltx4 &q, float t )
{
	fltx4 sclp, sclq, result;
	sclq = ReplicateX4( t );
	sclp = SubSIMD( Four_Ones, sclq );
	result = MulSIMD( sclp, p );
	result = MaddSIMD( sclq, q, result );
	return QuaternionNormalizeSIMD( result );
}


//---------------------------------------------------------------------
// Blend Quaternions
//---------------------------------------------------------------------
FORCEINLINE fltx4 QuaternionBlendSIMD( const fltx4 &p, const fltx4 &q, float t )
{
	// decide if one of the quaternions is backwards
	fltx4 q2, result;
	q2 = QuaternionAlignSIMD( p, q );
	result = QuaternionBlendNoAlignSIMD( p, q2, t );
	return result;
}


//---------------------------------------------------------------------
// Multiply Quaternions
//---------------------------------------------------------------------
#ifndef _X360

// SSE and STDC
FORCEINLINE fltx4 QuaternionMultSIMD( const fltx4 &p, const fltx4 &q )
{
	// decide if one of the quaternions is backwards
	fltx4 q2, result;
	q2 = QuaternionAlignSIMD( p, q );
	SubFloat( result, 0 ) =  SubFloat( p, 0 ) * SubFloat( q2, 3 ) + SubFloat( p, 1 ) * SubFloat( q2, 2 ) - SubFloat( p, 2 ) * SubFloat( q2, 1 ) + SubFloat( p, 3 ) * SubFloat( q2, 0 );
	SubFloat( result, 1 ) = -SubFloat( p, 0 ) * SubFloat( q2, 2 ) + SubFloat( p, 1 ) * SubFloat( q2, 3 ) + SubFloat( p, 2 ) * SubFloat( q2, 0 ) + SubFloat( p, 3 ) * SubFloat( q2, 1 );
	SubFloat( result, 2 ) =  SubFloat( p, 0 ) * SubFloat( q2, 1 ) - SubFloat( p, 1 ) * SubFloat( q2, 0 ) + SubFloat( p, 2 ) * SubFloat( q2, 3 ) + SubFloat( p, 3 ) * SubFloat( q2, 2 );
	SubFloat( result, 3 ) = -SubFloat( p, 0 ) * SubFloat( q2, 0 ) - SubFloat( p, 1 ) * SubFloat( q2, 1 ) - SubFloat( p, 2 ) * SubFloat( q2, 2 ) + SubFloat( p, 3 ) * SubFloat( q2, 3 );
	return result;
}

#else 

// X360
extern const fltx4 g_QuatMultRowSign[4];
FORCEINLINE fltx4 QuaternionMultSIMD( const fltx4 &p, const fltx4 &q )
{
	fltx4 q2, row, result;
	q2 = QuaternionAlignSIMD( p, q );

	row = XMVectorSwizzle( q2, 3, 2, 1, 0 );
	row = MulSIMD( row, g_QuatMultRowSign[0] );
	result = Dot4SIMD( row, p );

	row = XMVectorSwizzle( q2, 2, 3, 0, 1 );
	row = MulSIMD( row, g_QuatMultRowSign[1] );
	row = Dot4SIMD( row, p );
	result = __vrlimi( result, row, 4, 0 );
	
	row = XMVectorSwizzle( q2, 1, 0, 3, 2 );
	row = MulSIMD( row, g_QuatMultRowSign[2] );
	row = Dot4SIMD( row, p );
	result = __vrlimi( result, row, 2, 0 );
	
	row = MulSIMD( q2, g_QuatMultRowSign[3] );
	row = Dot4SIMD( row, p );
	result = __vrlimi( result, row, 1, 0 );
	return result;
}

#endif


//---------------------------------------------------------------------
// Quaternion scale
//---------------------------------------------------------------------
#ifndef _X360

// SSE and STDC
FORCEINLINE fltx4 QuaternionScaleSIMD( const fltx4 &p, float t )
{
	float r;
	fltx4 q;

	// FIXME: nick, this isn't overly sensitive to accuracy, and it may be faster to 
	// use the cos part (w) of the quaternion (sin(omega)*N,cos(omega)) to figure the new scale.
	float sinom = sqrt( SubFloat( p, 0 ) * SubFloat( p, 0 ) + SubFloat( p, 1 ) * SubFloat( p, 1 ) + SubFloat( p, 2 ) * SubFloat( p, 2 ) );
	sinom = min( sinom, 1.f );

	float sinsom = sin( asin( sinom ) * t );

	t = sinsom / (sinom + FLT_EPSILON);
	SubFloat( q, 0 ) = t * SubFloat( p, 0 );
	SubFloat( q, 1 ) = t * SubFloat( p, 1 );
	SubFloat( q, 2 ) = t * SubFloat( p, 2 );

	// rescale rotation
	r = 1.0f - sinsom * sinsom;

	// Assert( r >= 0 );
	if (r < 0.0f) 
		r = 0.0f;
	r = sqrt( r );

	// keep sign of rotation
	SubFloat( q, 3 ) = fsel( SubFloat( p, 3 ), r, -r );
	return q;
}

#else

// X360
FORCEINLINE fltx4 QuaternionScaleSIMD( const fltx4 &p, float t )
{
	fltx4 sinom = Dot3SIMD( p, p );
	sinom = SqrtSIMD( sinom );
	sinom = MinSIMD( sinom, Four_Ones );
	fltx4 sinsom = ArcSinSIMD( sinom );
	fltx4 t4 = ReplicateX4( t );
	sinsom = MulSIMD( sinsom, t4 );
	sinsom = SinSIMD( sinsom );
	sinom = AddSIMD( sinom, Four_Epsilons );
	sinom = ReciprocalSIMD( sinom );
	t4 = MulSIMD( sinsom, sinom );
	fltx4 result = MulSIMD( p, t4 );

	// rescale rotation
	sinsom = MulSIMD( sinsom, sinsom );
	fltx4 r = SubSIMD( Four_Ones, sinsom );
	r = MaxSIMD( r, Four_Zeros );
	r = SqrtSIMD( r );

	// keep sign of rotation
	fltx4 cmp = CmpGeSIMD( p, Four_Zeros );
	r = MaskedAssign( cmp, r, NegSIMD( r ) );

	result = __vrlimi(result, r, 1, 0);
	return result;
}

// X360
// assumes t4 contains a float replicated to each slot
FORCEINLINE fltx4 QuaternionScaleSIMD( const fltx4 &p, const fltx4 &t4 )
{
	fltx4 sinom = Dot3SIMD( p, p );
	sinom = SqrtSIMD( sinom );
	sinom = MinSIMD( sinom, Four_Ones );
	fltx4 sinsom = ArcSinSIMD( sinom );
	sinsom = MulSIMD( sinsom, t4 );
	sinsom = SinSIMD( sinsom );
	sinom = AddSIMD( sinom, Four_Epsilons );
	sinom = ReciprocalSIMD( sinom );
	fltx4 result = MulSIMD( p, MulSIMD( sinsom, sinom ) );

	// rescale rotation
	sinsom = MulSIMD( sinsom, sinsom );
	fltx4 r = SubSIMD( Four_Ones, sinsom );
	r = MaxSIMD( r, Four_Zeros );
	r = SqrtSIMD( r );

	// keep sign of rotation
	fltx4 cmp = CmpGeSIMD( p, Four_Zeros );
	r = MaskedAssign( cmp, r, NegSIMD( r ) );

	result = __vrlimi(result, r, 1, 0);
	return result;
}

#endif


//-----------------------------------------------------------------------------
// Quaternion sphereical linear interpolation
//-----------------------------------------------------------------------------
#ifndef _X360

// SSE and STDC
FORCEINLINE fltx4 QuaternionSlerpNoAlignSIMD( const fltx4 &p, const fltx4 &q, float t )
{
	float omega, cosom, sinom, sclp, sclq;

	fltx4 result;

	// 0.0 returns p, 1.0 return q.
	cosom = SubFloat( p, 0 ) * SubFloat( q, 0 ) + SubFloat( p, 1 ) * SubFloat( q, 1 ) + 
		SubFloat( p, 2 ) * SubFloat( q, 2 ) + SubFloat( p, 3 ) * SubFloat( q, 3 );

	if ( (1.0f + cosom ) > 0.000001f ) 
	{
		if ( (1.0f - cosom ) > 0.000001f ) 
		{
			omega = acos( cosom );
			sinom = sin( omega );
			sclp = sin( (1.0f - t)*omega) / sinom;
			sclq = sin( t*omega ) / sinom;
		}
		else 
		{
			// TODO: add short circuit for cosom == 1.0f?
			sclp = 1.0f - t;
			sclq = t;
		}
		SubFloat( result, 0 ) = sclp * SubFloat( p, 0 ) + sclq * SubFloat( q, 0 );
		SubFloat( result, 1 ) = sclp * SubFloat( p, 1 ) + sclq * SubFloat( q, 1 );
		SubFloat( result, 2 ) = sclp * SubFloat( p, 2 ) + sclq * SubFloat( q, 2 );
		SubFloat( result, 3 ) = sclp * SubFloat( p, 3 ) + sclq * SubFloat( q, 3 );
	}
	else 
	{
		SubFloat( result, 0 ) = -SubFloat( q, 1 );
		SubFloat( result, 1 ) =  SubFloat( q, 0 );
		SubFloat( result, 2 ) = -SubFloat( q, 3 );
		SubFloat( result, 3 ) =  SubFloat( q, 2 );
		sclp = sin( (1.0f - t) * (0.5f * M_PI));
		sclq = sin( t * (0.5f * M_PI));
		SubFloat( result, 0 ) = sclp * SubFloat( p, 0 ) + sclq * SubFloat( result, 0 );
		SubFloat( result, 1 ) = sclp * SubFloat( p, 1 ) + sclq * SubFloat( result, 1 );
		SubFloat( result, 2 ) = sclp * SubFloat( p, 2 ) + sclq * SubFloat( result, 2 );
	}

	return result;
}

#else

// X360
FORCEINLINE fltx4 QuaternionSlerpNoAlignSIMD( const fltx4 &p, const fltx4 &q, float t )
{
	return XMQuaternionSlerp( p, q, t );
}

#endif


FORCEINLINE fltx4 QuaternionSlerpSIMD( const fltx4 &p, const fltx4 &q, float t )
{
	fltx4 q2, result;
	q2 = QuaternionAlignSIMD( p, q );
	result = QuaternionSlerpNoAlignSIMD( p, q2, t );
	return result;
}


#endif // ALLOW_SIMD_QUATERNION_MATH


/// class FourVectors stores 4 independent vectors for use in SIMD processing. These vectors are
/// stored in the format x x x x y y y y z z z z so that they can be efficiently SIMD-accelerated.
class ALIGN16 FourQuaternions
{
public:
	fltx4 x,y,z,w;

	FourQuaternions(void)
	{
	}

	FourQuaternions( const fltx4 &_x, 
					 const fltx4 &_y, 
					 const fltx4 &_z, 
					 const fltx4 &_w )
					 : x(_x), y(_y), z(_z), w(_w)
	{}

	FourQuaternions( FourQuaternions const &src )
	{
		x=src.x;
		y=src.y;
		z=src.z;
		w=src.w;
	}

	FORCEINLINE void operator=( FourQuaternions const &src )
	{
		x=src.x;
		y=src.y;
		z=src.z;
		w=src.w;
	}

	/// this = this * q;
	FORCEINLINE FourQuaternions Mul( FourQuaternions const &q ) const;

	/// negate the vector part
	FORCEINLINE FourQuaternions Conjugate() const;

	/// for a quaternion representing a rotation of angle theta, return
	/// one of angle s*theta
	/// scale is four floats -- one for each quat
	FORCEINLINE FourQuaternions ScaleAngle( const fltx4 &scale ) const;

	/// ret = this * ( s * q ) 
	/// In other words, for a quaternion representing a rotation of angle theta, return
	/// one of angle s*theta
	/// s is four floats in a fltx4 -- one for each quaternion
	FORCEINLINE FourQuaternions MulAc( const fltx4 &s, const FourQuaternions &q ) const;

	/// ret = ( s * this ) * q
	FORCEINLINE FourQuaternions ScaleMul( const fltx4 &s, const FourQuaternions &q ) const;

	/// Slerp four quaternions at once, FROM me TO the specified out.
	FORCEINLINE FourQuaternions Slerp( const FourQuaternions &to, const fltx4 &t );

	FORCEINLINE FourQuaternions SlerpNoAlign( const FourQuaternions &originalto, const fltx4 &t );

	/// LoadAndSwizzleAligned - load 4 QuaternionAligneds into a FourQuaternions, performing transpose op.
	/// all 4 vectors must be 128 bit boundary
	FORCEINLINE void LoadAndSwizzleAligned(const float *RESTRICT a, const float *RESTRICT b, const float *RESTRICT c, const float *RESTRICT d)
	{
#if _X360
		fltx4 tx = LoadAlignedSIMD(a);
		fltx4 ty = LoadAlignedSIMD(b);
		fltx4 tz = LoadAlignedSIMD(c);
		fltx4 tw = LoadAlignedSIMD(d);
		fltx4 r0 = __vmrghw(tx, tz);
		fltx4 r1 = __vmrghw(ty, tw);
		fltx4 r2 = __vmrglw(tx, tz);
		fltx4 r3 = __vmrglw(ty, tw);

		x = __vmrghw(r0, r1);
		y = __vmrglw(r0, r1);
		z = __vmrghw(r2, r3);
		w = __vmrglw(r2, r3);
#else
		x		= LoadAlignedSIMD(a);
		y		= LoadAlignedSIMD(b);
		z		= LoadAlignedSIMD(c);
		w		= LoadAlignedSIMD(d);
		// now, matrix is:
		// x y z w
		// x y z w
		// x y z w
		// x y z w
		TransposeSIMD(x, y, z, w);
#endif
	}

	FORCEINLINE void LoadAndSwizzleAligned(const QuaternionAligned * RESTRICT a, 
		const QuaternionAligned * RESTRICT b, 
		const QuaternionAligned * RESTRICT c, 
		const QuaternionAligned * RESTRICT d)
	{
		LoadAndSwizzleAligned(a->Base(), b->Base(), c->Base(), d->Base() );
	}


	/// LoadAndSwizzleAligned - load 4 consecutive QuaternionAligneds into a FourQuaternions, 
	/// performing transpose op.
	/// all 4 vectors must be 128 bit boundary
	FORCEINLINE void LoadAndSwizzleAligned(const QuaternionAligned *qs)
	{
#if _X360
		fltx4 tx = LoadAlignedSIMD(qs++);
		fltx4 ty = LoadAlignedSIMD(qs++);
		fltx4 tz = LoadAlignedSIMD(qs++);
		fltx4 tw = LoadAlignedSIMD(qs);
		fltx4 r0 = __vmrghw(tx, tz);
		fltx4 r1 = __vmrghw(ty, tw);
		fltx4 r2 = __vmrglw(tx, tz);
		fltx4 r3 = __vmrglw(ty, tw);

		x = __vmrghw(r0, r1);
		y = __vmrglw(r0, r1);
		z = __vmrghw(r2, r3);
		w = __vmrglw(r2, r3);
#else
		x		= LoadAlignedSIMD(qs++);
		y		= LoadAlignedSIMD(qs++);
		z		= LoadAlignedSIMD(qs++);
		w		= LoadAlignedSIMD(qs++);
		// now, matrix is:
		// x y z w
		// x y z w
		// x y z w
		// x y z w
		TransposeSIMD(x, y, z, w);
#endif
	}

	// Store the FourQuaternions out to four nonconsecutive ordinary quaternions in memory.
	FORCEINLINE void SwizzleAndStoreAligned(QuaternionAligned *a, QuaternionAligned *b, QuaternionAligned *c, QuaternionAligned *d)
	{
#if _X360
		fltx4 r0 = __vmrghw(x, z);
		fltx4 r1 = __vmrghw(y, w);
		fltx4 r2 = __vmrglw(x, z);
		fltx4 r3 = __vmrglw(y, w);

		fltx4 rx = __vmrghw(r0, r1);
		fltx4 ry = __vmrglw(r0, r1);
		fltx4 rz = __vmrghw(r2, r3);
		fltx4 rw = __vmrglw(r2, r3);

		StoreAlignedSIMD(a, rx);
		StoreAlignedSIMD(b, ry);
		StoreAlignedSIMD(c, rz);
		StoreAlignedSIMD(d, rw);
#else
		fltx4 dupes[4] = { x, y, z, w };
		TransposeSIMD(dupes[0], dupes[1], dupes[2], dupes[3]);
		StoreAlignedSIMD(a, dupes[0]);
		StoreAlignedSIMD(b, dupes[1]);
		StoreAlignedSIMD(c, dupes[2]);
		StoreAlignedSIMD(d, dupes[3]);
#endif
	}

	// Store the FourQuaternions out to four consecutive ordinary quaternions in memory.
	FORCEINLINE void SwizzleAndStoreAligned(QuaternionAligned *qs)
	{
#if _X360
		fltx4 r0 = __vmrghw(x, z);
		fltx4 r1 = __vmrghw(y, w);
		fltx4 r2 = __vmrglw(x, z);
		fltx4 r3 = __vmrglw(y, w);

		fltx4 rx = __vmrghw(r0, r1);
		fltx4 ry = __vmrglw(r0, r1);
		fltx4 rz = __vmrghw(r2, r3);
		fltx4 rw = __vmrglw(r2, r3);

		StoreAlignedSIMD(qs, rx);
		StoreAlignedSIMD(++qs, ry);
		StoreAlignedSIMD(++qs, rz);
		StoreAlignedSIMD(++qs, rw);
#else
		SwizzleAndStoreAligned(qs, qs+1, qs+2, qs+3);
#endif
	}

	// Store the FourQuaternions out to four consecutive ordinary quaternions in memory.
	// The mask specifies which of the quaternions are actually written out -- each	
	// word in the fltx4 should be all binary ones or zeros. Ones means the corresponding
	// quat will be written.
	FORCEINLINE void SwizzleAndStoreAlignedMasked(QuaternionAligned * RESTRICT qs, const fltx4 &controlMask)
	{
		fltx4 originals[4];
		originals[0] = LoadAlignedSIMD(qs);
		originals[1] = LoadAlignedSIMD(qs+1);
		originals[2] = LoadAlignedSIMD(qs+2);
		originals[3] = LoadAlignedSIMD(qs+3);

		fltx4 masks[4] = { SplatXSIMD(controlMask),
			SplatYSIMD(controlMask),
			SplatZSIMD(controlMask),
			SplatWSIMD(controlMask)	};

#if _X360
		fltx4 r0 = __vmrghw(x, z);
		fltx4 r1 = __vmrghw(y, w);
		fltx4 r2 = __vmrglw(x, z);
		fltx4 r3 = __vmrglw(y, w);

		fltx4 rx = __vmrghw(r0, r1);
		fltx4 ry = __vmrglw(r0, r1);
		fltx4 rz = __vmrghw(r2, r3);
		fltx4 rw = __vmrglw(r2, r3);
#else
		fltx4 rx = x;
		fltx4 ry = y;
		fltx4 rz = z;
		fltx4 rw = w;
		TransposeSIMD( rx, ry, rz, rw );
#endif

		StoreAlignedSIMD( qs+0, MaskedAssign(masks[0], rx, originals[0]));
		StoreAlignedSIMD( qs+1, MaskedAssign(masks[1], ry, originals[1]));
		StoreAlignedSIMD( qs+2, MaskedAssign(masks[2], rz, originals[2]));
		StoreAlignedSIMD( qs+3, MaskedAssign(masks[3], rw, originals[3]));
	}
};

FORCEINLINE FourQuaternions FourQuaternions::Conjugate(  ) const
{
	return FourQuaternions( NegSIMD(x),  NegSIMD(y),  NegSIMD(z),  w );
}




FORCEINLINE const fltx4 Dot(const FourQuaternions &a, const FourQuaternions &b)
{
	return
		MaddSIMD(a.x, b.x,
			MaddSIMD(a.y, b.y,
				MaddSIMD(a.z,b.z, MulSIMD(a.w,b.w))
			)
		);
}


FORCEINLINE const FourQuaternions Madd(const FourQuaternions &a, const fltx4 &scale, const FourQuaternions &c)
{
	FourQuaternions ret;
	ret.x = MaddSIMD(a.x,scale,c.x);
	ret.y = MaddSIMD(a.y,scale,c.y);
	ret.z = MaddSIMD(a.z,scale,c.z);
	ret.w = MaddSIMD(a.w,scale,c.w);
	return ret;
}

FORCEINLINE const FourQuaternions Mul(const FourQuaternions &a, const fltx4 &scale)
{
	FourQuaternions ret;
	ret.x = MulSIMD(a.x,scale);
	ret.y = MulSIMD(a.y,scale);
	ret.z = MulSIMD(a.z,scale);
	ret.w = MulSIMD(a.w,scale);
	return ret;
}

FORCEINLINE const FourQuaternions Add(const FourQuaternions &a,const FourQuaternions &b)
{
	FourQuaternions ret;
	ret.x = AddSIMD(a.x,b.x);
	ret.y = AddSIMD(a.y,b.y);
	ret.z = AddSIMD(a.z,b.z);
	ret.w = AddSIMD(a.w,b.w);
	return ret;
}

FORCEINLINE const FourQuaternions Sub(const FourQuaternions &a,const FourQuaternions &b)
{
	FourQuaternions ret;
	ret.x = SubSIMD(a.x,b.x);
	ret.y = SubSIMD(a.y,b.y);
	ret.z = SubSIMD(a.z,b.z);
	ret.w = SubSIMD(a.w,b.w);
	return ret;
}

FORCEINLINE const FourQuaternions Neg(const FourQuaternions &q)
{
	FourQuaternions ret;
	ret.x = NegSIMD(q.x);
	ret.y = NegSIMD(q.y);
	ret.z = NegSIMD(q.z);
	ret.w = NegSIMD(q.w);
	return ret;
}

FORCEINLINE const FourQuaternions MaskedAssign(const fltx4 &mask, const FourQuaternions &a, const FourQuaternions &b)
{
	FourQuaternions ret;
	ret.x = MaskedAssign(mask,a.x,b.x);
	ret.y = MaskedAssign(mask,a.y,b.y);
	ret.z = MaskedAssign(mask,a.z,b.z);
	ret.w = MaskedAssign(mask,a.w,b.w);
	return ret;
}


FORCEINLINE FourQuaternions QuaternionAlign( const FourQuaternions &p, const FourQuaternions &q )
{
	// decide if one of the quaternions is backwards
	fltx4 cmp = CmpLtSIMD( Dot(p,q), Four_Zeros );
	return MaskedAssign( cmp, Neg(q), q );
}


FORCEINLINE const FourQuaternions QuaternionNormalize( const FourQuaternions &q )
{
	fltx4 radius = Dot( q, q );
	fltx4 mask = CmpEqSIMD( radius, Four_Zeros ); // all ones iff radius = 0
	fltx4 invRadius = ReciprocalSqrtSIMD( radius );
	
	FourQuaternions ret = MaskedAssign(mask, q, Mul(q, invRadius));
	return ret;
}





/// this = this * q;
FORCEINLINE FourQuaternions FourQuaternions::Mul( FourQuaternions const &q ) const
{
	// W = w1w2 - x1x2 - y1y2 - z1z2
	FourQuaternions ret;
	fltx4 signMask = LoadAlignedSIMD( (float *) g_SIMD_signmask ); 
	// as we do the multiplication, also do a dot product, so we know whether
	// one of the quats is backwards and if we therefore have to negate at the end
	fltx4 dotProduct = MulSIMD( w, q.w );

	ret.w = MulSIMD( w, q.w ); // W = w1w2
	ret.x = MulSIMD( w, q.x ); // X = w1x2
	ret.y = MulSIMD( w, q.y ); // Y = w1y2
	ret.z = MulSIMD( w, q.z ); // Z = w1z2

	dotProduct = MaddSIMD( x, q.x, dotProduct );
	ret.w = MsubSIMD( x, q.x, ret.w ); // W = w1w2 - x1x2
	ret.x = MaddSIMD( x, q.w, ret.x ); // X = w1x2 + x1w2
	ret.y = MsubSIMD( x, q.z, ret.y ); // Y = w1y2 - x1z2
	ret.z = MaddSIMD( x, q.y, ret.z ); // Z = w1z2 + x1y2

	dotProduct = MaddSIMD( y, q.y, dotProduct );
	ret.w = MsubSIMD( y, q.y, ret.w ); // W = w1w2 - x1x2 - y1y2
	ret.x = MaddSIMD( y, q.z, ret.x ); // X = w1x2 + x1w2 + y1z2
	ret.y = MaddSIMD( y, q.w, ret.y ); // Y = w1y2 - x1z2 + y1w2
	ret.z = MsubSIMD( y, q.x, ret.z ); // Z = w1z2 + x1y2 - y1x2

	dotProduct = MaddSIMD( z, q.z, dotProduct );
	ret.w = MsubSIMD( z, q.z, ret.w ); // W = w1w2 - x1x2 - y1y2 - z1z2
	ret.x = MsubSIMD( z, q.y, ret.x ); // X = w1x2 + x1w2 + y1z2 - z1y2
	ret.y = MaddSIMD( z, q.x, ret.y ); // Y = w1y2 - x1z2 + y1w2 + z1x2
	ret.z = MaddSIMD( z, q.w, ret.z ); // Z = w1z2 + x1y2 - y1x2 + z1w2

	fltx4 Zero = Four_Zeros;
	fltx4 control = CmpLtSIMD( dotProduct, Four_Zeros );
	signMask = MaskedAssign(control, signMask, Zero); // negate quats where q1.q2 < 0
	ret.w = XorSIMD( signMask, ret.w );
	ret.x = XorSIMD( signMask, ret.x );
	ret.y = XorSIMD( signMask, ret.y );
	ret.z = XorSIMD( signMask, ret.z );

	return ret;
}

/*

void QuaternionScale( const Quaternion &p, float t, Quaternion &q )
{
	Assert( s_bMathlibInitialized );

	
	float r;

	// FIXME: nick, this isn't overly sensitive to accuracy, and it may be faster to 
	// use the cos part (w) of the quaternion (sin(omega)*N,cos(omega)) to figure the new scale.
	float sinom = sqrt( DotProduct( &p.x, &p.x ) );
	sinom = min( sinom, 1.f );

	float sinsom = sin( asin( sinom ) * t );

	t = sinsom / (sinom + FLT_EPSILON);
	VectorScale( &p.x, t, &q.x );

	// rescale rotation
	r = 1.0f - sinsom * sinsom;

	// Assert( r >= 0 );
	if (r < 0.0f) 
	r = 0.0f;
	r = sqrt( r );

	// keep sign of rotation
	if (p.w < 0)
	q.w = -r;
	else
	q.w = r;

	Assert( q.IsValid() );

	return;
}

*/

FORCEINLINE FourQuaternions FourQuaternions::ScaleAngle( const fltx4 &scale ) const
{
	FourQuaternions ret;
	static const fltx4 OneMinusEpsilon = {1.0f -  0.000001f, 1.0f -  0.000001f, 1.0f -  0.000001f, 1.0f -  0.000001f };
	//static const fltx4 tiny = { 0.00001f, 0.00001f, 0.00001f, 0.00001f };
	const fltx4 Zero = Four_Zeros;
	fltx4 signMask = LoadAlignedSIMD( (float *) g_SIMD_signmask ); 
	// work out if there are any tiny scales or angles, which are unstable
	fltx4 tinyAngles = CmpGtSIMD(w,OneMinusEpsilon);
	fltx4 negativeRotations = CmpLtSIMD(w, Zero); // if any w's are <0, we will need to negate later down

	// figure out the theta
	fltx4 angles = ArcCosSIMD(w);

	// test also if w > -1
	fltx4 negativeWs = XorSIMD(signMask, w);
	tinyAngles = OrSIMD( CmpGtSIMD(negativeWs, OneMinusEpsilon ), tinyAngles );

	// meanwhile start working on computing the dot product of the
	// vector component, and trust in the scheduler to interleave them
	fltx4 vLenSq = MulSIMD( x, x );
	vLenSq = MaddSIMD( y, y, vLenSq );
	vLenSq = MaddSIMD( z, z, vLenSq );

	// scale the angles
	angles = MulSIMD( angles, scale );

	// clear out the sign mask where w>=0
	signMask = MaskedAssign( negativeRotations, signMask, Zero);

	// work out the new w component and vector length
	fltx4 vLenRecip = ReciprocalSqrtSIMD(vLenSq); // interleave with Cos to hide latencies
	fltx4 sine;
	SinCosSIMD( sine, ret.w, angles );
	ret.x = MulSIMD( x, vLenRecip ); // renormalize so the vector length + w = 1
	ret.y = MulSIMD( y, vLenRecip ); // renormalize so the vector length + w = 1
	ret.z = MulSIMD( z, vLenRecip ); // renormalize so the vector length + w = 1
	ret.x = MulSIMD( ret.x, sine );
	ret.y = MulSIMD( ret.y, sine );
	ret.z = MulSIMD( ret.z, sine );

	// negate where necessary
	ret.x = XorSIMD(ret.x, signMask);
	ret.y = XorSIMD(ret.y, signMask);
	ret.z = XorSIMD(ret.z, signMask);
	ret.w = XorSIMD(ret.w, signMask);

	// finally, toss results from where cos(theta) is close to 1 -- these are non rotations.
	ret.x = MaskedAssign(tinyAngles, x, ret.x);
	ret.y = MaskedAssign(tinyAngles, y, ret.y);
	ret.z = MaskedAssign(tinyAngles, z, ret.z);
	ret.w = MaskedAssign(tinyAngles, w, ret.w);

	return ret;
}

//-----------------------------------------------------------------------------
// Purpose: return = this * ( s * q )
// In other words, for a quaternion representing a rotation of angle theta, return
// one of angle s*theta
// s is four floats in a fltx4 -- one for each quaternion
//-----------------------------------------------------------------------------
 
FORCEINLINE FourQuaternions FourQuaternions::MulAc( const fltx4 &s, const FourQuaternions &q ) const
{
	/*
	void QuaternionMA( const Quaternion &p, float s, const Quaternion &q, Quaternion &qt )
	{
		Quaternion p1, q1;

		QuaternionScale( q, s, q1 );
		QuaternionMult( p, q1, p1 );
		QuaternionNormalize( p1 );
		qt[0] = p1[0];
		qt[1] = p1[1];
		qt[2] = p1[2];
		qt[3] = p1[3];
	}
	*/

	return Mul(q.ScaleAngle(s));
}


FORCEINLINE FourQuaternions FourQuaternions::ScaleMul( const fltx4 &s, const FourQuaternions &q ) const
{
	return ScaleAngle(s).Mul(q);
}


FORCEINLINE FourQuaternions FourQuaternions::Slerp( const FourQuaternions &originalto, const fltx4 &t )
{
	FourQuaternions ret;
	static const fltx4 OneMinusEpsilon = {1.0f -  0.000001f, 1.0f -  0.000001f, 1.0f -  0.000001f, 1.0f -  0.000001f };

	// align if necessary.

	// actually, before we even do that, start by computing the dot product of 
	// the quaternions. it has lots of dependent ops and we can sneak it into
	// the pipeline bubbles as we figure out alignment. Of course we don't know
	// yet if we need to realign, so compute them both -- there's plenty of
	// space in the bubbles. They're roomy, those bubbles.
	fltx4 cosineOmega;
#if 0 // Maybe I don't need to do alignment seperately, using the xb360 technique...
	FourQuaternions to;
	{
		fltx4 diffs[4], sums[4], originalToNeg[4];
		fltx4 dotIfAligned, dotIfNotAligned;

		// compute negations of the TO quaternion.
		originalToNeg[0] = NegSIMD(originalto.x);
		originalToNeg[1] = NegSIMD(originalto.y);
		originalToNeg[2] = NegSIMD(originalto.z);
		originalToNeg[3] = NegSIMD(originalto.w);

			dotIfAligned    = MulSIMD(x, originalto.x);
			dotIfNotAligned = MulSIMD(x, originalToNeg[0]);

		diffs[0] = SubSIMD(x, originalto.x);
		diffs[1] = SubSIMD(y, originalto.y);
		diffs[2] = SubSIMD(z, originalto.z);
		diffs[3] = SubSIMD(w, originalto.w);

		sums[0] = AddSIMD(x, originalto.x);
		sums[1] = AddSIMD(y, originalto.y);
		sums[2] = AddSIMD(z, originalto.z);
		sums[3] = AddSIMD(w, originalto.w);

			dotIfAligned    = MaddSIMD(y, originalto.y,     dotIfAligned);
			dotIfNotAligned = MaddSIMD(y, originalToNeg[1], dotIfNotAligned);

		fltx4 diffsDot, sumsDot;

		diffsDot = MulSIMD(diffs[0], diffs[0]); // x^2
		sumsDot  = MulSIMD(sums[0],  sums[0] ); // x^2
			// do some work on the dot products while letting the multiplies cook
			dotIfAligned    = MaddSIMD(z, originalto.z,     dotIfAligned);
			dotIfNotAligned = MaddSIMD(z, originalToNeg[2], dotIfNotAligned);

		diffsDot = MaddSIMD(diffs[1], diffs[1], diffsDot); // x^2 + y^2 
		sumsDot  = MaddSIMD(sums[1],  sums[1],  sumsDot ); 
		diffsDot = MaddSIMD(diffs[2], diffs[2], diffsDot); // x^2 + y^2 + z^2
		sumsDot  = MaddSIMD(sums[2],  sums[2],  sumsDot ); 
		diffsDot = MaddSIMD(diffs[3], diffs[3], diffsDot); // x^2 + y^2 + z^2 + w^2
		sumsDot  = MaddSIMD(sums[3],  sums[3],  sumsDot ); 
			// do some work on the dot products while letting the multiplies cook
			dotIfAligned    = MaddSIMD(w, originalto.w,     dotIfAligned);
			dotIfNotAligned = MaddSIMD(w, originalToNeg[3], dotIfNotAligned);

		// are the differences greater than the sums?
		// if so, we need to negate that quaternion
		fltx4 mask = CmpGtSIMD(diffsDot, sumsDot); // 1 for diffs>0 and 0 elsewhere
		to.x = MaskedAssign(mask, originalToNeg[0], originalto.x);
		to.y = MaskedAssign(mask, originalToNeg[1], originalto.y);
		to.z = MaskedAssign(mask, originalToNeg[2], originalto.z);
		to.w = MaskedAssign(mask, originalToNeg[3], originalto.w);

		cosineOmega = MaskedAssign(mask, dotIfNotAligned, dotIfAligned);
	}

	// right, now to is aligned to be the short way round, and we computed
	// the dot product while we were figuring all that out.
#else
	const FourQuaternions &to = originalto;
	cosineOmega = MulSIMD(x, to.x);
	cosineOmega = MaddSIMD(y, to.y, cosineOmega);
	cosineOmega = MaddSIMD(z, to.z, cosineOmega);
	cosineOmega = MaddSIMD(w, to.w, cosineOmega);
#endif
	
	fltx4 Zero = Four_Zeros;
	fltx4 cosOmegaLessThanZero = CmpLtSIMD(cosineOmega, Zero);
	// fltx4 shouldNegate = MaskedAssign(cosOmegaLessThanZero, Four_NegativeOnes , Four_Ones );
	fltx4 signMask = LoadAlignedSIMD( (float *) g_SIMD_signmask ); // contains a one in the sign bit -- xor against a number to negate it
	fltx4 sinOmega = Four_Ones;

	// negate cosineOmega where necessary
	cosineOmega = MaskedAssign( cosOmegaLessThanZero, XorSIMD(cosineOmega, signMask), cosineOmega );
	fltx4 oneMinusT = SubSIMD(Four_Ones,t);
	fltx4 bCosOmegaLessThanOne = CmpLtSIMD(cosineOmega, OneMinusEpsilon); // we'll use this to mask out null slerps

	// figure out the sin component of the diff quaternion.
	// since sin^2(t) + cos^2(t) = 1...
	sinOmega = MsubSIMD( cosineOmega, cosineOmega, sinOmega ); // = 1 - cos^2(t) = sin^2(t)
	fltx4 invSinOmega = ReciprocalSqrtSIMD( sinOmega );  // 1/sin(t)
	sinOmega = MulSIMD( sinOmega, invSinOmega ); // = sin^2(t) / sin(t) = sin(t)

	// use the arctangent technique to work out omega from  tan^-1(sin/cos)
	fltx4 omega = ArcTan2SIMD(sinOmega, cosineOmega);

	// alpha = sin(omega * (1-T))/sin(omega)
	// beta  = sin(omega * T)/sin(omega)
	fltx4 alpha = MulSIMD(omega, oneMinusT);  // w(1-T)
	fltx4 beta  = MulSIMD(omega, t);		  // w(T)
	signMask = MaskedAssign(cosOmegaLessThanZero, signMask, Zero);

	alpha = SinSIMD(alpha);  // sin(w(1-T))
	beta  = SinSIMD(beta);   // sin(wT)
	
	alpha = MulSIMD(alpha, invSinOmega);
	beta  = MulSIMD(beta, invSinOmega);

	// depending on whether the dot product was less than zero, negate beta, or not
	beta  = XorSIMD(beta, signMask);

	// mask out singularities (where omega = 1)
	alpha = MaskedAssign( bCosOmegaLessThanOne, alpha, oneMinusT );
	beta  = MaskedAssign( bCosOmegaLessThanOne, beta , t );

 	ret.x = MulSIMD(x, alpha);
	ret.y = MulSIMD(y, alpha);
	ret.z = MulSIMD(z, alpha);
	ret.w = MulSIMD(w, alpha);

	ret.x = MaddSIMD(to.x, beta, ret.x);
	ret.y = MaddSIMD(to.y, beta, ret.y);
	ret.z = MaddSIMD(to.z, beta, ret.z);
	ret.w = MaddSIMD(to.w, beta, ret.w);

	return ret;
}



FORCEINLINE FourQuaternions FourQuaternions::SlerpNoAlign( const FourQuaternions &originalto, const fltx4 &t )
{
	FourQuaternions ret;
	static const fltx4 OneMinusEpsilon = {1.0f -  0.000001f, 1.0f -  0.000001f, 1.0f -  0.000001f, 1.0f -  0.000001f };

	// align if necessary.

	// actually, before we even do that, start by computing the dot product of 
	// the quaternions. it has lots of dependent ops and we can sneak it into
	// the pipeline bubbles as we figure out alignment. Of course we don't know
	// yet if we need to realign, so compute them both -- there's plenty of
	// space in the bubbles. They're roomy, those bubbles.
	fltx4 cosineOmega;

	const FourQuaternions &to = originalto;
	cosineOmega = MulSIMD(x, to.x);
	cosineOmega = MaddSIMD(y, to.y, cosineOmega);
	cosineOmega = MaddSIMD(z, to.z, cosineOmega);
	cosineOmega = MaddSIMD(w, to.w, cosineOmega);

	fltx4 sinOmega = Four_Ones;

	fltx4 oneMinusT = SubSIMD(Four_Ones,t);
	fltx4 bCosOmegaLessThanOne = CmpLtSIMD(cosineOmega, OneMinusEpsilon); // we'll use this to mask out null slerps

	// figure out the sin component of the diff quaternion.
	// since sin^2(t) + cos^2(t) = 1...
	sinOmega = MsubSIMD( cosineOmega, cosineOmega, sinOmega ); // = 1 - cos^2(t) = sin^2(t)
	fltx4 invSinOmega = ReciprocalSqrtSIMD( sinOmega );  // 1/sin(t)
	sinOmega = MulSIMD( sinOmega, invSinOmega ); // = sin^2(t) / sin(t) = sin(t)

	// use the arctangent technique to work out omega from  tan^-1(sin/cos)
	fltx4 omega = ArcTan2SIMD(sinOmega, cosineOmega);

	// alpha = sin(omega * (1-T))/sin(omega)
	// beta  = sin(omega * T)/sin(omega)
	fltx4 alpha = MulSIMD(omega, oneMinusT);  // w(1-T)
	fltx4 beta  = MulSIMD(omega, t);		  // w(T)
	alpha = SinSIMD(alpha);  // sin(w(1-T))
	beta  = SinSIMD(beta);   // sin(wT)
	alpha = MulSIMD(alpha, invSinOmega);
	beta  = MulSIMD(beta, invSinOmega);

	// mask out singularities (where omega = 1)
	alpha = MaskedAssign( bCosOmegaLessThanOne, alpha, oneMinusT );
	beta  = MaskedAssign( bCosOmegaLessThanOne, beta , t );

	ret.x = MulSIMD(x, alpha);
	ret.y = MulSIMD(y, alpha);
	ret.z = MulSIMD(z, alpha);
	ret.w = MulSIMD(w, alpha);

	ret.x = MaddSIMD(to.x, beta, ret.x);
	ret.y = MaddSIMD(to.y, beta, ret.y);
	ret.z = MaddSIMD(to.z, beta, ret.z);
	ret.w = MaddSIMD(to.w, beta, ret.w);

	return ret;
}


#endif // SSEQUATMATH_H