Port GetCPUInformation and mathlib from sdk2013

2024-12-23 01:59:43 +08:00 · 2024-04-20 13:34:13 -04:00 · 2024-04-20 13:34:13 -04:00 · 0d247b9566
commit 0d247b9566
parent b099570391
35 changed files with 3188 additions and 693 deletions
--- a/mathlib/AMBuilder
+++ b/mathlib/AMBuilder
@ -11,6 +11,7 @@ builder.SetBuildFolder('/')

 project = builder.StaticLibraryProject('mathlib')
 project.sources = [
+	'almostequal.cpp',
 	'anorms.cpp',
 	'bumpvects.cpp',
 	'color_conversion.cpp',
@ -25,6 +26,7 @@ project.sources = [
 	'randsse.cpp',
 	'simdvectormatrix.cpp',
 	'sparse_convolution_noise.cpp',
+	'spherical.cpp',
 	'sse.cpp',
 	'sseconst.cpp',
 	'ssenoise.cpp',
--- a/mathlib/IceKey.cpp
+++ b/mathlib/IceKey.cpp
@ -7,6 +7,8 @@
 #include "mathlib/IceKey.H"
 #include <cstdint>

+#include "tier0/memdbgon.h"
+
 #ifdef _MSC_VER
 #pragma warning(disable: 4244)
 #endif
--- a/mathlib/almostequal.cpp
+++ b/mathlib/almostequal.cpp
@ -0,0 +1,97 @@
+//========= Copyright Valve Corporation, All rights reserved. ============//
+//
+// Purpose: Fast ways to compare equality of two floats.  Assumes 
+// sizeof(float) == sizeof(int) and we are using IEEE format.
+//
+// Source:  http://www.cygnus-software.com/papers/comparingfloats/comparingfloats.htm
+//=====================================================================================//
+
+#include <float.h>
+#include <math.h>
+
+#include "mathlib/mathlib.h"
+
+static inline bool AE_IsInfinite(float a) // true when a is +infinity or -infinity (bit-pattern test)
+{
+    const int kInfAsInt = 0x7F800000;
+
+    // An infinity has an exponent of 255 (shifted left 23 positions) and
+    // a zero mantissa. There are two infinities - positive and negative.
+    if ((*(int*)&a & 0x7FFFFFFF) == kInfAsInt) // the mask drops the sign bit, so both infinities match
+        return true;
+    return false;
+}
+
+static inline bool AE_IsNan(float a) // true for any NaN bit pattern; the sign bit is ignored
+{
+    // A NaN has an exponent of 255 (shifted left 23 positions) and
+    // a non-zero mantissa.
+    int exp = *(int*)&a & 0x7F800000;      // exponent field, left in place (not shifted down)
+    int mantissa = *(int*)&a & 0x007FFFFF; // low 23 bits
+    if (exp == 0x7F800000 && mantissa != 0)
+        return true;
+    return false;
+}
+
+static inline int AE_Sign(float a) // returns the masked sign bit: zero when clear, non-zero when set
+{
+    // The sign bit of a number is the high bit.
+    return (*(int*)&a) & 0x80000000;
+}
+
+// Returns true when a and b are no more than maxUlps representable floats apart.
+// This is the 'final' version of the AlmostEqualUlps function; the optional checks are included
+// for completeness, but in many cases they are not necessary, or even not desirable.
+bool AlmostEqual(float a, float b, int maxUlps)
+{
+    // There are several optional checks that you can do, depending
+    // on what behavior you want from your floating point comparisons.
+    // These checks should not be necessary and they are included
+    // mainly for completeness.
+
+    // If a or b are infinity (positive or negative) then
+    // only return true if they are exactly equal to each other -
+    // that is, if they are both infinities of the same sign.
+    // This check is only needed if you will be generating
+    // infinities and you don't want them 'close' to numbers
+    // near FLT_MAX.
+    if (AE_IsInfinite(a) || AE_IsInfinite(b))
+        return a == b;
+
+    // If a or b are a NAN, return false. NANs are equal to nothing,
+    // not even themselves.
+    // This check is only needed if you will be generating NANs
+    // and you use a maxUlps greater than 4 million or you want to
+    // ensure that a NAN does not equal itself.
+    if (AE_IsNan(a) || AE_IsNan(b))
+        return false;
+
+    // After adjusting floats so their representations are lexicographically
+    // ordered as twos-complement integers a very small positive number
+    // will compare as 'close' to a very small negative number. If this is
+    // not desirable, and if you are on a platform that supports
+    // subnormals (which is the only place the problem can show up) then
+    // you need this check.
+    // The check for a == b is because zero and negative zero have different
+    // signs but are equal to each other.
+    if (AE_Sign(a) != AE_Sign(b))
+        return a == b;
+
+    int aInt = *(int*)&a; // raw bit pattern of a
+    // Make aInt lexicographically ordered as a twos-complement int
+    if (aInt < 0)
+        aInt = 0x80000000 - aInt; // a negative float becomes minus its magnitude bits
+    // Make bInt lexicographically ordered as a twos-complement int
+    int bInt = *(int*)&b;
+    if (bInt < 0)
+        bInt = 0x80000000 - bInt;
+
+    // Now we can compare aInt and bInt to find out how far apart a and b
+    // are.
+    int intDiff = abs(aInt - bInt); // cannot overflow: a and b are finite and share a sign here
+    if (intDiff <= maxUlps)
+        return true;
+    return false;
+}
+
+
--- a/mathlib/color_conversion.cpp
+++ b/mathlib/color_conversion.cpp
@ -106,27 +106,23 @@ ALIGN128 float	power2_n[256] = 			// 2**(index - 128) / 255
 // You can use this to double check the exponent table and assert that 
 // the precomputation is correct.
 #ifdef DBGFLAG_ASSERT
-#ifdef _MSC_VER
+#ifdef _WIN32
 #pragma warning(push)
 #pragma warning( disable : 4189 ) // disable unused local variable warning
 #endif
-#ifdef __GNUC__
-__attribute__((unused)) static void CheckExponentTable()
-#else
 static void CheckExponentTable()
-#endif
 {
 	for( int i = 0; i < 256; i++ )
 	{
 		float testAgainst = pow( 2.0f, i - 128 ) / 255.0f;
 		float diff = testAgainst - power2_n[i] ;
 		float relativeDiff = diff / testAgainst;
-		Assert( sizeof(relativeDiff) > 0 && testAgainst == 0 ? 
-								power2_n[i] < 1.16E-041 :
-								power2_n[i] == testAgainst );
+		Assert( testAgainst == 0 ? 
+				power2_n[i] < 1.16E-041 :
+				power2_n[i] == testAgainst );
 	}
 }
-#ifdef _MSC_VER
+#ifdef _WIN32
 #pragma warning(pop)
 #endif
 #endif
@ -617,10 +613,10 @@ void VectorToColorRGBExp32( const Vector& vin, ColorRGBExp32 &c )
 		scalar = *reinterpret_cast<float *>(&fbits);
 	}

-	// we should never need to clamp:
-	Assert(vin.x * scalar <= 255.0f && 
-		   vin.y * scalar <= 255.0f && 
-		   vin.z * scalar <= 255.0f);
+	// We can totally wind up above 255 and that's okay--but above 256 would be right out.
+	Assert(vin.x * scalar < 256.0f && 
+		   vin.y * scalar < 256.0f && 
+		   vin.z * scalar < 256.0f);

 	// This awful construction is necessary to prevent VC2005 from using the 
 	// fldcw/fnstcw control words around every float-to-unsigned-char operation.
--- a/mathlib/imagequant.cpp
+++ b/mathlib/imagequant.cpp
@ -6,7 +6,7 @@
 //
 //=============================================================================//
 #include <quantize.h>
-#include <tier0/basetypes.h>
+#include <minmax.h>

 #define N_EXTRAVALUES 1
 #define N_DIMENSIONS (3+N_EXTRAVALUES)
@ -46,7 +46,7 @@ void ColorQuantize(uint8 const *Image,
 					val1+=PIXEL(x,y,c)*ExtraValueXForms[i*3+c];
 				val1>>=8;
 				NthSample(s,y*Width+x,N_DIMENSIONS)->Value[c]=(uint8)
-					(MIN(255,MAX(0,val1)));
+					(V_min(255,V_max(0,val1)));
 			}
 		}
 	struct QuantizedValue *q=Quantize(s,Width*Height,N_DIMENSIONS,
@ -76,7 +76,7 @@ void ColorQuantize(uint8 const *Image,
 					tryc+=Error[x][c][ErrorUse];
 					Error[x][c][ErrorUse]=0;
 				}
-				samp[c]=(uint8) MIN(255,MAX(0,tryc));
+				samp[c]=(uint8) V_min(255,V_max(0,tryc));
 			}
 			struct QuantizedValue *f=FindMatch(samp,3,Weights,q);
 			out_pixels[Width*y+x]=(uint8) (f->value);
--- a/mathlib/lightdesc.cpp
+++ b/mathlib/lightdesc.cpp
@ -10,7 +10,7 @@

 void LightDesc_t::RecalculateDerivedValues(void)
 {
-	m_Flags=0;
+	m_Flags = LIGHTTYPE_OPTIMIZATIONFLAGS_DERIVED_VALUES_CALCED;
 	if (m_Attenuation0)
 		m_Flags|=LIGHTTYPE_OPTIMIZATIONFLAGS_HAS_ATTENUATION0;
 	if (m_Attenuation1)
--- a/mathlib/mathlib_base.cpp
+++ b/mathlib/mathlib_base.cpp
@ -1,4 +1,4 @@
-//===== Copyright © 1996-2005, Valve Corporation, All rights reserved. ======//
+//===== Copyright © 1996-2005, Valve Corporation, All rights reserved. ======//
 //
 // Purpose: Math primitives.
 //
@ -17,7 +17,7 @@
 #include "tier0/vprof.h"
 //#define _VPROF_MATHLIB

-#ifdef _MSC_VER
+#ifdef _WIN32
 #pragma warning(disable:4244)   // "conversion from 'const int' to 'float', possible loss of data"
 #pragma warning(disable:4730)	// "mixing _m64 and floating point expressions may result in incorrect code"
 #endif
@ -25,6 +25,7 @@
 #include "mathlib/mathlib.h"
 #include "mathlib/vector.h"
 #if !defined( _X360 )
+#include "mathlib/amd3dx.h"
 #include "sse.h"
 #endif

@ -426,6 +427,33 @@ void MatrixSetColumn( const Vector &in, int column, matrix3x4_t& out )
 	out[2][column] = in.z;
 }

+void MatrixScaleBy ( const float flScale, matrix3x4_t &out ) // multiplies the 3x3 part of out (rows 0-2, columns 0-2) by flScale in place
+{
+	out[0][0] *= flScale;
+	out[1][0] *= flScale;
+	out[2][0] *= flScale;
+	out[0][1] *= flScale;
+	out[1][1] *= flScale;
+	out[2][1] *= flScale;
+	out[0][2] *= flScale;
+	out[1][2] *= flScale;
+	out[2][2] *= flScale; // column 3 of every row is deliberately left untouched
+}
+
+void MatrixScaleByZero ( matrix3x4_t &out ) // zeroes the 3x3 part of out (rows 0-2, columns 0-2) in place
+{
+	out[0][0] = 0.0f;
+	out[1][0] = 0.0f;
+	out[2][0] = 0.0f;
+	out[0][1] = 0.0f;
+	out[1][1] = 0.0f;
+	out[2][1] = 0.0f;
+	out[0][2] = 0.0f;
+	out[1][2] = 0.0f;
+	out[2][2] = 0.0f; // column 3 of every row keeps its previous value
+}
+
+

 int VectorCompare (const float *v1, const float *v2)
 {
@ -565,53 +593,128 @@ void ConcatRotations (const float in1[3][3], const float in2[3][3], float out[3]
 				in1[2][2] * in2[2][2];
 }

+void ConcatTransforms_Aligned( const matrix3x4_t &m0, const matrix3x4_t &m1, matrix3x4_t &out ) // out = m0 concatenated with m1; all three matrices must be 16-byte aligned
+{
+	Assert( (((size_t)&m0) % 16) == 0 );
+	Assert( (((size_t)&m1) % 16) == 0 );
+	Assert( (((size_t)&out) % 16) == 0 );
+
+	fltx4 lastMask = *(fltx4 *)(&g_SIMD_ComponentMask[3]); // presumably keeps only the 4th lane -- confirm against g_SIMD_ComponentMask
+	fltx4 rowA0 = LoadAlignedSIMD( m0.m_flMatVal[0] );
+	fltx4 rowA1 = LoadAlignedSIMD( m0.m_flMatVal[1] );
+	fltx4 rowA2 = LoadAlignedSIMD( m0.m_flMatVal[2] );
+
+	fltx4 rowB0 = LoadAlignedSIMD( m1.m_flMatVal[0] );
+	fltx4 rowB1 = LoadAlignedSIMD( m1.m_flMatVal[1] );
+	fltx4 rowB2 = LoadAlignedSIMD( m1.m_flMatVal[2] );
+
+	// all rows of m0 and m1 are now in registers; output row i = sum over k of m0[i][k] * (row k of m1)
+	// first output row
+	fltx4 A0 = SplatXSIMD(rowA0);
+	fltx4 A1 = SplatYSIMD(rowA0);
+	fltx4 A2 = SplatZSIMD(rowA0);
+	fltx4 mul00 = MulSIMD( A0, rowB0 );
+	fltx4 mul01 = MulSIMD( A1, rowB1 );
+	fltx4 mul02 = MulSIMD( A2, rowB2 );
+	fltx4 out0 = AddSIMD( mul00, AddSIMD(mul01,mul02) );
+
+	// second output row
+	A0 = SplatXSIMD(rowA1);
+	A1 = SplatYSIMD(rowA1);
+	A2 = SplatZSIMD(rowA1);
+	fltx4 mul10 = MulSIMD( A0, rowB0 );
+	fltx4 mul11 = MulSIMD( A1, rowB1 );
+	fltx4 mul12 = MulSIMD( A2, rowB2 );
+	fltx4 out1 = AddSIMD( mul10, AddSIMD(mul11,mul12) );
+
+	// third output row
+	A0 = SplatXSIMD(rowA2);
+	A1 = SplatYSIMD(rowA2);
+	A2 = SplatZSIMD(rowA2);
+	fltx4 mul20 = MulSIMD( A0, rowB0 );
+	fltx4 mul21 = MulSIMD( A1, rowB1 );
+	fltx4 mul22 = MulSIMD( A2, rowB2 );
+	fltx4 out2 = AddSIMD( mul20, AddSIMD(mul21,mul22) );
+
+	// add in translation vector (the masked 4th component of each m0 row)
+	A0 = AndSIMD(rowA0,lastMask);
+	A1 = AndSIMD(rowA1,lastMask);
+	A2 = AndSIMD(rowA2,lastMask);
+	out0 = AddSIMD(out0, A0);
+	out1 = AddSIMD(out1, A1);
+	out2 = AddSIMD(out2, A2);
+
+	StoreAlignedSIMD( out.m_flMatVal[0], out0 ); // stores happen only after every load, so out may alias m0 or m1
+	StoreAlignedSIMD( out.m_flMatVal[1], out1 );
+	StoreAlignedSIMD( out.m_flMatVal[2], out2 );
+}

 /*
 ================
 R_ConcatTransforms
 ================
 */
+
 void ConcatTransforms (const matrix3x4_t& in1, const matrix3x4_t& in2, matrix3x4_t& out)
 {
-	Assert( s_bMathlibInitialized );
-	if ( &in1 == &out )
+#if 0
+	// test for ones that'll be 2x faster
+	if ( (((size_t)&in1) % 16) == 0 && (((size_t)&in2) % 16) == 0 && (((size_t)&out) % 16) == 0 )
 	{
-		matrix3x4_t in1b;
-		MatrixCopy( in1, in1b );
-		ConcatTransforms( in1b, in2, out );
+		ConcatTransforms_Aligned( in1, in2, out );
 		return;
 	}
-	if ( &in2 == &out )
-	{
-		matrix3x4_t in2b;
-		MatrixCopy( in2, in2b );
-		ConcatTransforms( in1, in2b, out );
-		return;
-	}
-	out[0][0] = in1[0][0] * in2[0][0] + in1[0][1] * in2[1][0] +
-				in1[0][2] * in2[2][0];
-	out[0][1] = in1[0][0] * in2[0][1] + in1[0][1] * in2[1][1] +
-				in1[0][2] * in2[2][1];
-	out[0][2] = in1[0][0] * in2[0][2] + in1[0][1] * in2[1][2] +
-				in1[0][2] * in2[2][2];
-	out[0][3] = in1[0][0] * in2[0][3] + in1[0][1] * in2[1][3] +
-				in1[0][2] * in2[2][3] + in1[0][3];
-	out[1][0] = in1[1][0] * in2[0][0] + in1[1][1] * in2[1][0] +
-				in1[1][2] * in2[2][0];
-	out[1][1] = in1[1][0] * in2[0][1] + in1[1][1] * in2[1][1] +
-				in1[1][2] * in2[2][1];
-	out[1][2] = in1[1][0] * in2[0][2] + in1[1][1] * in2[1][2] +
-				in1[1][2] * in2[2][2];
-	out[1][3] = in1[1][0] * in2[0][3] + in1[1][1] * in2[1][3] +
-				in1[1][2] * in2[2][3] + in1[1][3];
-	out[2][0] = in1[2][0] * in2[0][0] + in1[2][1] * in2[1][0] +
-				in1[2][2] * in2[2][0];
-	out[2][1] = in1[2][0] * in2[0][1] + in1[2][1] * in2[1][1] +
-				in1[2][2] * in2[2][1];
-	out[2][2] = in1[2][0] * in2[0][2] + in1[2][1] * in2[1][2] +
-				in1[2][2] * in2[2][2];
-	out[2][3] = in1[2][0] * in2[0][3] + in1[2][1] * in2[1][3] +
-				in1[2][2] * in2[2][3] + in1[2][3];
+#endif
+
+	fltx4 lastMask = *(fltx4 *)(&g_SIMD_ComponentMask[3]);
+	fltx4 rowA0 = LoadUnalignedSIMD( in1.m_flMatVal[0] );
+	fltx4 rowA1 = LoadUnalignedSIMD( in1.m_flMatVal[1] );
+	fltx4 rowA2 = LoadUnalignedSIMD( in1.m_flMatVal[2] );
+
+	fltx4 rowB0 = LoadUnalignedSIMD( in2.m_flMatVal[0] );
+	fltx4 rowB1 = LoadUnalignedSIMD( in2.m_flMatVal[1] );
+	fltx4 rowB2 = LoadUnalignedSIMD( in2.m_flMatVal[2] );
+
+	// all rows of in1 and in2 are now loaded; output row i = sum over k of in1[i][k] * (row k of in2)
+	// first output row
+	fltx4 A0 = SplatXSIMD(rowA0);
+	fltx4 A1 = SplatYSIMD(rowA0);
+	fltx4 A2 = SplatZSIMD(rowA0);
+	fltx4 mul00 = MulSIMD( A0, rowB0 );
+	fltx4 mul01 = MulSIMD( A1, rowB1 );
+	fltx4 mul02 = MulSIMD( A2, rowB2 );
+	fltx4 out0 = AddSIMD( mul00, AddSIMD(mul01,mul02) );
+
+	// second output row
+	A0 = SplatXSIMD(rowA1);
+	A1 = SplatYSIMD(rowA1);
+	A2 = SplatZSIMD(rowA1);
+	fltx4 mul10 = MulSIMD( A0, rowB0 );
+	fltx4 mul11 = MulSIMD( A1, rowB1 );
+	fltx4 mul12 = MulSIMD( A2, rowB2 );
+	fltx4 out1 = AddSIMD( mul10, AddSIMD(mul11,mul12) );
+
+	// third output row
+	A0 = SplatXSIMD(rowA2);
+	A1 = SplatYSIMD(rowA2);
+	A2 = SplatZSIMD(rowA2);
+	fltx4 mul20 = MulSIMD( A0, rowB0 );
+	fltx4 mul21 = MulSIMD( A1, rowB1 );
+	fltx4 mul22 = MulSIMD( A2, rowB2 );
+	fltx4 out2 = AddSIMD( mul20, AddSIMD(mul21,mul22) );
+
+	// add in translation vector
+	A0 = AndSIMD(rowA0,lastMask);
+	A1 = AndSIMD(rowA1,lastMask);
+	A2 = AndSIMD(rowA2,lastMask);
+	out0 = AddSIMD(out0, A0);
+	out1 = AddSIMD(out1, A1);
+	out2 = AddSIMD(out2, A2);
+
+	// write to output
+	StoreUnalignedSIMD( out.m_flMatVal[0], out0 );
+	StoreUnalignedSIMD( out.m_flMatVal[1], out1 );
+	StoreUnalignedSIMD( out.m_flMatVal[2], out2 );
 }


@ -1358,7 +1461,9 @@ float Bias( float x, float biasAmt )
 	{
 		lastExponent = log( biasAmt ) * -1.4427f; // (-1.4427 = 1 / log(0.5))
 	}
-	return pow( x, lastExponent );
+	float fRet = pow( x, lastExponent );
+	Assert ( !IS_NAN( fRet ) );
+	return fRet;
 }


@ -1374,7 +1479,9 @@ float Gain( float x, float biasAmt )

 float SmoothCurve( float x )
 {
-	return (1 - cos( x * M_PI )) * 0.5f;
+	// Actual smooth curve. Visualization:
+	// http://www.wolframalpha.com/input/?i=plot%5B+0.5+*+%281+-+cos%5B2+*+pi+*+x%5D%29+for+x+%3D+%280%2C+1%29+%5D
+	return 0.5f * (1 - cos( 2.0f * M_PI * x ) );
 }


@ -1566,7 +1673,9 @@ float QuaternionAngleDiff( const Quaternion &p, const Quaternion &q )
 	QuaternionConjugate( q, qInv );
 	QuaternionMult( p, qInv, diff );

-	float sinang = sqrt( diff.x * diff.x + diff.y * diff.y + diff.z * diff.z );
+	// Note if the quaternion is slightly non-normalized the square root below may be more than 1,
+	// the value is clamped to one otherwise it may result in asin() returning an undefined result.
+	float sinang = MIN( 1.0f, sqrt( diff.x * diff.x + diff.y * diff.y + diff.z * diff.z ) );
 	float angle = RAD2DEG( 2 * asin( sinang ) );
 	return angle;
 #else
@ -1666,7 +1775,7 @@ void QuaternionScale( const Quaternion &p, float t, Quaternion &q )
 	// FIXME: nick, this isn't overly sensitive to accuracy, and it may be faster to 
 	// use the cos part (w) of the quaternion (sin(omega)*N,cos(omega)) to figure the new scale.
 	float sinom = sqrt( DotProduct( &p.x, &p.x ) );
-	sinom = MIN( sinom, 1.f );
+	sinom = V_min( sinom, 1.f );

 	float sinsom = sin( asin( sinom ) * t );

@ -1751,7 +1860,13 @@ void QuaternionMult( const Quaternion &p, const Quaternion &q, Quaternion &qt )

 void QuaternionMatrix( const Quaternion &q, const Vector &pos, matrix3x4_t& matrix )
 {
-	Assert( pos.IsValid() );
+#ifdef DBGFLAG_ASSERT
+	static bool s_bHushAsserts = !!CommandLine()->FindParm("-hushasserts");
+	if (!s_bHushAsserts)
+	{
+		Assert( pos.IsValid() );
+	}
+#endif

 	QuaternionMatrix( q, matrix );

@ -1763,7 +1878,13 @@ void QuaternionMatrix( const Quaternion &q, const Vector &pos, matrix3x4_t& matr
 void QuaternionMatrix( const Quaternion &q, matrix3x4_t& matrix )
 {
 	Assert( s_bMathlibInitialized );
-	Assert( q.IsValid() );
+#ifdef DBGFLAG_ASSERT
+	static bool s_bHushAsserts = !!CommandLine()->FindParm("-hushasserts");
+	if ( !s_bHushAsserts )
+	{
+		Assert( q.IsValid() );
+	}
+#endif

 #ifdef _VPROF_MATHLIB
 	VPROF_BUDGET( "QuaternionMatrix", "Mathlib" );
@ -3211,7 +3332,7 @@ void MathLib_Init( float gamma, float texGamma, float brightness, int overbright

 #if !defined( _X360 )
 	// Grab the processor information:
-	const CPUInformation& pi = GetCPUInformation();
+	const CPUInformation& pi = *GetCPUInformation();

 	// Select the default generic routines.
 	pfSqrt = _sqrtf;
@ -3240,6 +3361,8 @@ void MathLib_Init( float gamma, float texGamma, float brightness, int overbright
 	{
 		s_bSSEEnabled = true;

+#ifndef PLATFORM_WINDOWS_PC64
+		// These are not yet available.
 		// Select the SSE specific routines if available
 		pfVectorNormalize = _VectorNormalize;
 		pfVectorNormalizeFast = _SSE_VectorNormalizeFast;
@ -3247,7 +3370,8 @@ void MathLib_Init( float gamma, float texGamma, float brightness, int overbright
 		pfSqrt = _SSE_Sqrt;
 		pfRSqrt = _SSE_RSqrtAccurate;
 		pfRSqrtFast = _SSE_RSqrtFast;
-#ifdef _WIN32
+#endif
+#ifdef PLATFORM_WINDOWS_PC32
 		pfFastSinCos = _SSE_SinCos;
 		pfFastCos = _SSE_cos;
 #endif
@ -3260,7 +3384,7 @@ void MathLib_Init( float gamma, float texGamma, float brightness, int overbright
 	if ( bAllowSSE2 && pi.m_bSSE2 )
 	{
 		s_bSSE2Enabled = true;
-#ifdef _WIN32
+#ifdef PLATFORM_WINDOWS_PC32
 		pfFastSinCos = _SSE2_SinCos;
 		pfFastCos = _SSE2_cos;
 #endif
@ -3269,7 +3393,7 @@ void MathLib_Init( float gamma, float texGamma, float brightness, int overbright
 	{
 		s_bSSE2Enabled = false;
 	}
-#endif
+#endif // !_X360

 	s_bMathlibInitialized = true;

@ -3920,10 +4044,10 @@ void CalcTriangleTangentSpace( const Vector &p0, const Vector &p1, const Vector
 //-----------------------------------------------------------------------------
 void RGBtoHSV( const Vector &rgb, Vector &hsv )
 {
-	float flMax = MAX( rgb.x, rgb.y );
-	flMax = MAX( flMax, rgb.z );
-	float flMin = MIN( rgb.x, rgb.y );
-	flMin = MIN( flMin, rgb.z );
+	float flMax = V_max( rgb.x, rgb.y );
+	flMax = V_max( flMax, rgb.z );
+	float flMin = V_min( rgb.x, rgb.y );
+	flMin = V_min( flMin, rgb.z );

 	// hsv.z is the value
 	hsv.z = flMax;
@ -4070,3 +4194,44 @@ void GetInterpolationData( float const *pKnotPositions,
 	*pInterpolationValue = FLerp( 0, 1, 0, flSizeOfGap, flOffsetFromStartOfGap );
 	return;
 }
+
+float RandomVectorInUnitSphere( Vector *pVector ) // writes a random point to *pVector and returns its distance from the origin
+{
+	// Guarantee uniform random distribution within a sphere
+	// Graphics gems III contains this algorithm ("Nonuniform random point sets via warping")
+	float u = ((float)rand() / VALVE_RAND_MAX); // presumably in [0,1] -- depends on VALVE_RAND_MAX matching rand()'s range
+	float v = ((float)rand() / VALVE_RAND_MAX);
+	float w = ((float)rand() / VALVE_RAND_MAX);
+
+	float flPhi = acos( 1 - 2 * u );          // polar angle, chosen so that its cosine is uniform
+	float flTheta = 2 * M_PI * v;             // azimuth
+	float flRadius = powf( w, 1.0f / 3.0f );  // cube root spreads points evenly through the volume
+
+	float flSinPhi, flCosPhi;
+	float flSinTheta, flCosTheta;
+	SinCos( flPhi, &flSinPhi, &flCosPhi );
+	SinCos( flTheta, &flSinTheta, &flCosTheta );
+
+	pVector->x = flRadius * flSinPhi * flCosTheta;
+	pVector->y = flRadius * flSinPhi * flSinTheta;
+	pVector->z = flRadius * flCosPhi;
+	return flRadius;
+}
+
+float RandomVectorInUnitCircle( Vector2D *pVector ) // writes a random point to *pVector and returns its distance from the origin
+{
+	// Guarantee uniform random distribution within a circle (the 2D analogue of the sphere routine)
+	// Graphics gems III contains this algorithm ("Nonuniform random point sets via warping")
+	float u = ((float)rand() / VALVE_RAND_MAX); // presumably in [0,1] -- depends on VALVE_RAND_MAX matching rand()'s range
+	float v = ((float)rand() / VALVE_RAND_MAX);
+
+	float flTheta = 2 * M_PI * v;             // angle around the circle
+	float flRadius = powf( u, 1.0f / 2.0f );  // square root spreads points evenly over the area
+
+	float flSinTheta, flCosTheta;
+	SinCos( flTheta, &flSinTheta, &flCosTheta );
+
+	pVector->x = flRadius * flCosTheta;
+	pVector->y = flRadius * flSinTheta;
+	return flRadius;
+}
--- a/mathlib/polyhedron.cpp
+++ b/mathlib/polyhedron.cpp
@ -34,7 +34,6 @@ CPolyhedron *ConvertLinkedGeometryToPolyhedron( GeneratePolyhedronFromPlanes_Uno
 //#define DEBUG_DUMP_POLYHEDRONS_TO_NUMBERED_GLVIEWS //dumps successfully generated polyhedrons

 #ifdef _DEBUG
-#include "filesystem.h"
 void DumpPolyhedronToGLView( const CPolyhedron *pPolyhedron, const char *pFilename, const VMatrix *pTransform );
 void DumpPlaneToGlView( const float *pPlane, float fGrayScale, const char *pszFileName, const VMatrix *pTransform );
 void DumpLineToGLView( const Vector &vPoint1, const Vector &vColor1, const Vector &vPoint2, const Vector &vColor2, float fThickness, FILE *pFile );
@ -103,19 +102,19 @@ CPolyhedron_AllocByNew *CPolyhedron_AllocByNew::Allocate( unsigned short iVertic
 class CPolyhedron_TempMemory : public CPolyhedron
 {
 public:
-#ifdef _DEBUG
+#ifdef DBGFLAG_ASSERT
 	int iReferenceCount;
 #endif

 	virtual void Release( void )
 	{
-#ifdef _DEBUG
+#ifdef DBGFLAG_ASSERT
 		--iReferenceCount;
 #endif
 	}

 	CPolyhedron_TempMemory( void )
-#ifdef _DEBUG
+#ifdef DBGFLAG_ASSERT
 		: iReferenceCount( 0 )
 #endif
 	{ };
@ -128,7 +127,7 @@ static CPolyhedron_TempMemory s_TempMemoryPolyhedron;
 CPolyhedron *GetTempPolyhedron( unsigned short iVertices, unsigned short iLines, unsigned short iIndices, unsigned short iPolygons ) //grab the temporary polyhedron. Avoids new/delete for quick work. Can only be in use by one chunk of code at a time
 {
 	AssertMsg( s_TempMemoryPolyhedron.iReferenceCount == 0, "Temporary polyhedron memory being rewritten before released" );
-#ifdef _DEBUG
+#ifdef DBGFLAG_ASSERT
 	++s_TempMemoryPolyhedron.iReferenceCount;
 #endif
 	s_TempMemoryPolyhedron_Buffer.SetCount( (sizeof( Vector ) * iVertices) +
@ -857,8 +856,8 @@ const char * DumpPolyhedronCutHistory( const CUtlVector<CPolyhedron *> &DumpedHi

 #else

-#define AssertMsg_DumpPolyhedron(condition, message)
-#define Assert_DumpPolyhedron(condition)
+#define AssertMsg_DumpPolyhedron(condition, message) NULL;
+#define Assert_DumpPolyhedron(condition) NULL;

 #endif

--- a/mathlib/powsse.cpp
+++ b/mathlib/powsse.cpp
@ -6,6 +6,10 @@

 #include "mathlib/ssemath.h"

+// NOTE: This has to be the last file included!
+#include "tier0/memdbgon.h"
+
+
 fltx4 Pow_FixedPoint_Exponent_SIMD( const fltx4 & x, int exponent)
 {
 	fltx4 rslt=Four_Ones;									// x^0=1.0
@ -32,8 +36,61 @@ fltx4 Pow_FixedPoint_Exponent_SIMD( const fltx4 & x, int exponent)
 			break;
 	}
 	if (exponent<0)
-		return ReciprocalEstSIMD(rslt);							// pow(x,-b)=1/pow(x,b)
+		return ReciprocalEstSaturateSIMD(rslt);				// pow(x,-b)=1/pow(x,b)
 	else
 		return rslt;
 }

+
+
+
+/*
+ * (c) Ian Stephenson
+ *
+ * ian@dctsystems.co.uk
+ *
+ * Fast pow() reference implementation
+ */
+
+
+static float shift23=(1<<23); // 2^23 as a float
+static float OOshift23=1.0/(1<<23); // 2^-23, the reciprocal of shift23
+
+float FastLog2(float i) // approximate log2(i), derived from the bit pattern of i (assumes 32-bit IEEE floats)
+{
+	float LogBodge=0.346607f; // weight of the quadratic correction applied below
+	float x;
+	float y;
+	x=*(int *)&i; // the raw bits of i read as an int, then converted to float
+	x*= OOshift23; //1/pow(2,23);
+	x=x-127; // remove the exponent bias: integer part is the exponent, fraction is mantissa / 2^23
+
+	y=x-floorf(x); // fractional part of the estimate
+	y=(y-y*y)*LogBodge; // correction term; it is zero when the fraction is 0
+	return x+y;
+}
+float FastPow2(float i) // approximate 2^i by assembling the float's bit pattern directly (assumes 32-bit IEEE floats)
+{
+	float PowBodge=0.33971f; // weight of the quadratic correction applied below
+	float x;
+	float y=i-floorf(i); // fractional part of the exponent
+	y=(y-y*y)*PowBodge;
+
+	x=i+127-y; // add the exponent bias and subtract the correction
+	x*= shift23; //pow(2,23);
+	*(int*)&x=(int)x; // truncate to int and store that integer as the bit pattern of x
+	return x;
+}
+float FastPow(float a, float b) // approximate a^b computed as 2^(b * log2(a))
+{
+	if (a <= OOshift23) // any base at or below 2^-23 (including zero and negatives) yields 0
+	{
+		return 0.0f;
+	}
+	return FastPow2(b*FastLog2(a));
+}
+float FastPow10( float i ) // approximate 10^i
+{
+	return FastPow2( i * 3.321928f ); // 3.321928 ~= log2(10), so this is 2^(i * log2(10))
+}
+
--- a/mathlib/quantize.cpp
+++ b/mathlib/quantize.cpp
@ -18,11 +18,10 @@
 #endif

 #include <stdlib.h>
+#include <minmax.h>

 #include <math.h>

-#include <tier0/basetypes.h>
-
 static int current_ndims;
 static struct QuantizedValue *current_root;
 static int current_ssize;
@ -412,8 +411,8 @@ static void Label(struct QuantizedValue *q, int updatecolor)
 		else
 			for(int i=0;i<current_ndims;i++)
 			{
-				q->Mins[i]=MIN(q->Children[0]->Mins[i],q->Children[1]->Mins[i]);
-				q->Maxs[i]=MAX(q->Children[0]->Maxs[i],q->Children[1]->Maxs[i]);
+				q->Mins[i]=V_min(q->Children[0]->Mins[i],q->Children[1]->Mins[i]);
+				q->Maxs[i]=V_max(q->Children[0]->Maxs[i],q->Children[1]->Maxs[i]);
 			}
 	}
 }    
--- a/mathlib/simdvectormatrix.cpp
+++ b/mathlib/simdvectormatrix.cpp
@ -48,7 +48,7 @@ void CSIMDVectorMatrix::CreateFromRGBA_FloatImageData(int srcwidth, int srcheigh
 			{
 				for(int cp=0;cp<4; cp++)
 				{
-					int real_cp=MIN( cp, ntrailing_pixels_per_source_line-1 );
+					int real_cp=V_min( cp, ntrailing_pixels_per_source_line-1 );
 					data_out[4*c+cp]= data_in[c+4*real_cp];
 				}
 			}
--- a/mathlib/spherical.cpp
+++ b/mathlib/spherical.cpp
@ -0,0 +1,124 @@
+//========= Copyright Valve Corporation, All rights reserved. ============//
+//
+// Purpose: spherical math routines
+//
+//=====================================================================================//
+
+#include <math.h>
+#include <float.h>	// Needed for FLT_EPSILON
+#include "basetypes.h"
+#include <memory.h>
+#include "tier0/dbg.h"
+#include "mathlib/mathlib.h"
+#include "mathlib/vector.h"
+#include "mathlib/spherical_geometry.h"
+
+// memdbgon must be the last include file in a .cpp file!!!
+#include "tier0/memdbgon.h"
+
+float s_flFactorials[]={ // s_flFactorials[n] == n! for n = 0..33 (34 entries), stored as float
+	1.,
+	1.,
+	2.,
+	6.,
+	24.,
+	120.,
+	720.,
+	5040.,
+	40320.,
+	362880.,
+	3628800.,
+	39916800.,
+	479001600.,
+	6227020800.,
+	87178291200.,
+	1307674368000.,
+	20922789888000.,
+	355687428096000.,
+	6402373705728000.,
+	121645100408832000.,
+	2432902008176640000.,
+	51090942171709440000.,
+	1124000727777607680000.,
+	25852016738884976640000.,
+	620448401733239439360000.,
+	15511210043330985984000000.,
+	403291461126605635584000000.,
+	10888869450418352160768000000.,
+	304888344611713860501504000000.,
+	8841761993739701954543616000000.,
+	265252859812191058636308480000000.,
+	8222838654177922817725562880000000.,
+	263130836933693530167218012160000000.,
+	8683317618811886495518194401280000000.
+};
+
+float AssociatedLegendrePolynomial( int nL, int nM, float flX ) // returns P(nL, nM) evaluated at flX; nL and nM are not validated
+{
+	// evaluate associated legendre polynomial at flX, using recurrence relation
+	float flPmm = 1.; // becomes P(nM, nM), the starting value of the recurrence
+	if ( nM > 0 )
+	{
+		float flSomX2 = sqrt( ( 1 - flX ) * ( 1 + flX ) ); // sqrt( 1 - x^2 ); NaN if |flX| > 1
+		float flFact = 1.;
+		for( int i = 0 ; i < nM; i++ )
+		{
+			flPmm *= -flFact * flSomX2; // flFact runs through the odd numbers 1, 3, 5, ...
+			flFact += 2.0;
+		}
+	}
+	if ( nL == nM )
+		return flPmm;
+	float flPmmp1 = flX * ( 2.0 * nM + 1.0 ) * flPmm; // P(nM + 1, nM)
+	if ( nL == nM + 1 ) 
+		return flPmmp1;
+	float flPll = 0.; // stays 0 (and is returned) when the loop below never runs, i.e. nL < nM
+	for( int nLL = nM + 2 ; nLL <= nL; nLL++ )
+	{
+		flPll = ( ( 2.0 * nLL - 1.0 ) * flX * flPmmp1 - ( nLL + nM - 1.0 ) * flPmm ) * ( 1.0 / ( nLL - nM ) );
+		flPmm = flPmmp1; // slide the two-term window up one degree
+		flPmmp1 = flPll;
+	}
+	return flPll;
+}
+
+static float SHNormalizationFactor( int nL, int nM ) // sqrt( (2*nL + 1) * (nL - nM)! / ( 4*pi * (nL + nM)! ) )
+{
+	double flTemp = ( ( 2. * nL + 1.0 ) * s_flFactorials[ nL - nM ] )/ ( 4. * M_PI * s_flFactorials[ nL + nM ] ); // no bounds check: nL - nM and nL + nM must be valid indices into s_flFactorials
+	return sqrt( flTemp );
+}
+
+#define SQRT_2 1.414213562373095 
+
+FORCEINLINE float SphericalHarmonic( int nL, int nM, float flTheta, float flPhi, float flCosTheta ) // shared worker: flTheta itself is never read, only flCosTheta and flPhi are used
+{
+	if ( nM == 0 ) // no dependence on flPhi in this case
+		return SHNormalizationFactor( nL, 0 ) * AssociatedLegendrePolynomial( nL, nM, flCosTheta );
+
+	if ( nM > 0 ) // positive orders use the cosine of nM * flPhi
+		return SQRT_2 * SHNormalizationFactor( nL, nM ) * cos ( nM * flPhi ) *
+			AssociatedLegendrePolynomial( nL, nM, flCosTheta );
+
+	return // negative orders use the sine of |nM| * flPhi with the polynomial of order |nM|
+		SQRT_2 * SHNormalizationFactor( nL, -nM ) * sin( -nM * flPhi ) * AssociatedLegendrePolynomial( nL, -nM, flCosTheta );
+
+}
+
+float SphericalHarmonic( int nL, int nM, float flTheta, float flPhi ) // angle form: cos( flTheta ) feeds the Legendre polynomial, flPhi feeds the cos/sin term
+{
+	return SphericalHarmonic( nL, nM, flTheta, flPhi, cos( flTheta ) );
+}
+
+float SphericalHarmonic( int nL, int nM, Vector const &vecDirection )
+{
+	// Evaluates the real spherical harmonic (nL, nM) along a unit direction, using the same convention
+	// as the angle overloads: theta is the polar angle from +z, phi the azimuth in the xy plane.
+	Assert( fabs( VectorLength( vecDirection ) - 1.0 ) < 0.0001 );
+	// cos(theta) of a unit vector is its z component; clamp so rounding cannot leave acos()'s domain.
+	float flCosTheta = fminf( 1.0f, fmaxf( -1.0f, vecDirection.z ) );
+	// The azimuth is undefined on the z axis (x == y == 0); use 0 there.
+	float S = Square( vecDirection.x ) + Square( vecDirection.y );
+	float flPhi = ( S > 0 ) ? atan2( vecDirection.y, vecDirection.x ) : 0.0f;
+	return SphericalHarmonic( nL, nM, acos( flCosTheta ), flPhi, flCosTheta );
+}
+
--- a/mathlib/sse.cpp
+++ b/mathlib/sse.cpp
@ -1,4 +1,4 @@
-//========= Copyright © 1996-2005, Valve Corporation, All rights reserved. ============//
+//========= Copyright © 1996-2005, Valve Corporation, All rights reserved. ============//
 //
 // Purpose: SSE Math primitives.
 //
@ -16,7 +16,10 @@
 // memdbgon must be the last include file in a .cpp file!!!
 #include "tier0/memdbgon.h"

-#if defined ( _WIN32 ) && !defined ( _WIN64 )
+#ifndef COMPILER_MSVC64
+// Implement for 64-bit Windows if needed.
+
+#ifdef _WIN32
 static const uint32 _sincos_masks[]	  = { (uint32)0x0,  (uint32)~0x0 };
 static const uint32 _sincos_inv_masks[] = { (uint32)~0x0, (uint32)0x0 };
 #endif
@ -37,21 +40,21 @@ static const uint32 _sincos_inv_masks[] = { (uint32)~0x0, (uint32)0x0 };

 	#define _PS_CONST(Name, Val) \
 		static const __declspec(align(16)) float _ps_##Name[4] = { Val, Val, Val, Val }
-#elif defined _LINUX || defined __APPLE__
+#elif POSIX
 	#define _PS_EXTERN_CONST(Name, Val) \
-		const __attribute__((aligned(16))) float _ps_##Name[4] = { Val, Val, Val, Val }
+		const float _ps_##Name[4] __attribute__((aligned(16))) = { Val, Val, Val, Val }

 	#define _PS_EXTERN_CONST_TYPE(Name, Type, Val) \
-		const __attribute__((aligned(16))) Type _ps_##Name[4] = { Val, Val, Val, Val }; \
+		const Type _ps_##Name[4]  __attribute__((aligned(16))) = { Val, Val, Val, Val }; \

 	#define _EPI32_CONST(Name, Val) \
-		static const __attribute__((aligned(16))) int32 _epi32_##Name[4] = { Val, Val, Val, Val }
+		static const int32 _epi32_##Name[4]  __attribute__((aligned(16))) = { Val, Val, Val, Val }

 	#define _PS_CONST(Name, Val) \
-		static const __attribute__((aligned(16))) float _ps_##Name[4] = { Val, Val, Val, Val }
+		static const float _ps_##Name[4]  __attribute__((aligned(16))) = { Val, Val, Val, Val }
 #endif

-#if defined ( _WIN32 ) && !defined ( _WIN64 )
+#ifdef _WIN32
 _PS_EXTERN_CONST(am_0, 0.0f);
 _PS_EXTERN_CONST(am_1, 1.0f);
 _PS_EXTERN_CONST(am_m1, -1.0f);
@ -62,8 +65,8 @@ _PS_EXTERN_CONST(am_pi_o_2, (float)(M_PI / 2.0));
 _PS_EXTERN_CONST(am_2_o_pi, (float)(2.0 / M_PI));
 _PS_EXTERN_CONST(am_pi_o_4, (float)(M_PI / 4.0));
 _PS_EXTERN_CONST(am_4_o_pi, (float)(4.0 / M_PI));
-_PS_EXTERN_CONST_TYPE(am_sign_mask, int32, (int32)0x80000000);
-_PS_EXTERN_CONST_TYPE(am_inv_sign_mask, int32, ~0x80000000);
+_PS_EXTERN_CONST_TYPE(am_sign_mask, int32, static_cast<int32>(0x80000000));
+_PS_EXTERN_CONST_TYPE(am_inv_sign_mask, int32, static_cast<int32>(~0x80000000));
 _PS_EXTERN_CONST_TYPE(am_min_norm_pos,int32, 0x00800000);
 _PS_EXTERN_CONST_TYPE(am_mant_mask, int32, 0x7f800000);
 _PS_EXTERN_CONST_TYPE(am_inv_mant_mask, int32, ~0x7f800000);
@ -86,9 +89,6 @@ void  __cdecl _SSE_VectorMA( const float *start, float scale, const float *direc
 //-----------------------------------------------------------------------------
 float _SSE_Sqrt(float x)
 {
-#if defined( _WIN64 )
-	return std::sqrt(x);
-#else
 	Assert( s_bMathlibInitialized );
 	float	root = 0.f;
 #ifdef _WIN32
@ -97,17 +97,10 @@ float _SSE_Sqrt(float x)
 		sqrtss		xmm0, x
 		movss		root, xmm0
 	}
-#elif defined _LINUX || defined __APPLE__
-	__asm__ __volatile__(
-		"movss %1,%%xmm2\n"
-		"sqrtss %%xmm2,%%xmm1\n"
-		"movss %%xmm1,%0"
-       	: "=m" (root)
-		: "m" (x)
-	);
+#elif POSIX
+	_mm_store_ss( &root, _mm_sqrt_ss( _mm_load_ss( &x ) ) );
 #endif
 	return root;
-#endif // _WIN64
 }

 // Single iteration NewtonRaphson reciprocal square root:
@ -128,17 +121,21 @@ float _SSE_RSqrtAccurate(float x)
 	return (0.5f * rroot) * (3.f - (x * rroot) * rroot);
 }
 #else
+
+#ifdef POSIX
+const __m128  f3  = _mm_set_ss(3.0f);  // 3 as SSE value
+const __m128  f05 = _mm_set_ss(0.5f);  // 0.5 as SSE value
+#endif
+
 // Intel / Kipps SSE RSqrt.  Significantly faster than above.
 float _SSE_RSqrtAccurate(float a)
 {
-#if defined( _WIN64 )
-	return std::sqrt(a);
-#else
+
+#ifdef _WIN32
 	float x;
 	float half = 0.5f;
 	float three = 3.f;

-#ifdef _WIN32
 	__asm
 	{
 		movss   xmm3, a;
@ -154,27 +151,25 @@ float _SSE_RSqrtAccurate(float a)

 		movss   x,    xmm1;
 	}
-#elif defined _LINUX || defined __APPLE__
-	__asm__ __volatile__(
-		"movss   %1, %%xmm3 \n\t"
-        "movss   %2, %%xmm1 \n\t"
-        "movss   %3, %%xmm2 \n\t"
-        "rsqrtss %%xmm3, %%xmm0 \n\t"
-        "mulss   %%xmm0, %%xmm3 \n\t"
-        "mulss   %%xmm0, %%xmm1 \n\t"
-        "mulss   %%xmm0, %%xmm3 \n\t"
-        "subss   %%xmm3, %%xmm2 \n\t"
-        "mulss   %%xmm2, %%xmm1 \n\t"
-        "movss   %%xmm1, %0 \n\t"
-		: "=m" (x)
-		: "m" (a), "m" (half), "m" (three)
-);
+
+	return x;
+#elif POSIX	
+	__m128  xx = _mm_load_ss( &a );
+    __m128  xr = _mm_rsqrt_ss( xx );
+    __m128  xt;
+	
+    xt = _mm_mul_ss( xr, xr );
+    xt = _mm_mul_ss( xt, xx );
+    xt = _mm_sub_ss( f3, xt );
+    xt = _mm_mul_ss( xt, f05 );
+    xr = _mm_mul_ss( xr, xt );
+	
+    _mm_store_ss( &a, xr );
+    return a;
 #else
 	#error "Not Implemented"
 #endif

-	return x;
-#endif // _WIN64
 }
 #endif

@ -182,54 +177,40 @@ float _SSE_RSqrtAccurate(float a)
 // or so, so ok for closed transforms.  (ie, computing lighting normals)
 float _SSE_RSqrtFast(float x)
 {
-#if defined( _WIN64 )
-	return std::sqrt(x);
-#else
 	Assert( s_bMathlibInitialized );

-	float rroot = 0.0f;
+	float rroot;	// written by the platform-specific rsqrtss sequence below
 #ifdef _WIN32
 	_asm
 	{
 		rsqrtss	xmm0, x
 		movss	rroot, xmm0
 	}
-#elif defined _LINUX || defined __APPLE__
-	 __asm__ __volatile__(
-		"rsqrtss %1, %%xmm0 \n\t"
-		"movss %%xmm0, %0 \n\t"
-		: "=m" (x)
-		: "m" (rroot)
-		: "%xmm0"
-	);
+#elif POSIX
+	__asm__ __volatile__( "rsqrtss %1, %0" : "=x" (rroot) : "x" (x) );	// AT&T order is "src, dst": read x (%1), write the estimate into rroot (%0)
 #else
 #error
 #endif

 	return rroot;
-#endif // _WIN64
 }

 float FASTCALL _SSE_VectorNormalize (Vector& vec)
 {
-#if defined( _WIN64 )
-	float l = std::sqrt(vec.x * vec.x + vec.y * vec.y + vec.z * vec.z);
-	vec.x /= l;
-	vec.y /= l;
-	vec.z /= l;
-	return l;
-#else
 	Assert( s_bMathlibInitialized );

 	// NOTE: This is necessary to prevent an memory overwrite...
 	// sice vec only has 3 floats, we can't "movaps" directly into it.
 #ifdef _WIN32
 	__declspec(align(16)) float result[4];
-#elif defined _LINUX || defined __APPLE__
-	__attribute__((aligned(16))) float result[4];
+#elif POSIX
+	 float result[4] __attribute__((aligned(16)));
 #endif

 	float *v = &vec[0];
+#ifdef _WIN32
+	float *r = &result[0];
+#endif

 	float	radius = 0.f;
 	// Blah, get rid of these comparisons ... in reality, if you have all 3 as zero, it shouldn't 
@ -237,7 +218,6 @@ float FASTCALL _SSE_VectorNormalize (Vector& vec)
 	if ( v[0] || v[1] || v[2] )
 	{
 #ifdef _WIN32
-	float *r = &result[0];
 	_asm
 		{
 			mov			eax, v
@ -262,7 +242,7 @@ float FASTCALL _SSE_VectorNormalize (Vector& vec)
 			mulps		xmm4, xmm1			// r4 = vx * 1/radius, vy * 1/radius, vz * 1/radius, X
 			movaps		[edx], xmm4			// v = vx * 1/radius, vy * 1/radius, vz * 1/radius, X
 		}
-#elif defined _LINUX || defined __APPLE__
+#elif POSIX
 		__asm__ __volatile__(
 #ifdef ALIGNED_VECTOR
            "movaps          %2, %%xmm4 \n\t"
@ -285,6 +265,7 @@ float FASTCALL _SSE_VectorNormalize (Vector& vec)
            "movaps          %%xmm4, %1 \n\t"
            : "=m" (radius), "=m" (result)
            : "m" (*v)
+            : "xmm1", "xmm2", "xmm3", "xmm4"
 		);
 #else
 	#error "Not Implemented"
@ -296,7 +277,6 @@ float FASTCALL _SSE_VectorNormalize (Vector& vec)
 	}

 	return radius;
-#endif // _WIN64
 }

 void FASTCALL _SSE_VectorNormalizeFast (Vector& vec)
@ -310,10 +290,6 @@ void FASTCALL _SSE_VectorNormalizeFast (Vector& vec)

 float _SSE_InvRSquared(const float* v)
 {
-#if defined( _WIN64 )
-	float	r2 = DotProduct(v, v);
-	return r2 < 1.f ? 1.f : 1/r2;
-#else
 	float	inv_r2 = 1.f;
 #ifdef _WIN32
 	_asm { // Intel SSE only routine
@ -331,12 +307,13 @@ float _SSE_InvRSquared(const float* v)
 		shufps		xmm2, xmm2, 1		// x2 = vy * vy, X, X, X
 		addss		xmm1, xmm2			// x1 = (vx * vx) + (vy * vy), X, X, X
 		addss		xmm1, xmm3			// x1 = (vx * vx) + (vy * vy) + (vz * vz), X, X, X
-		maxss		xmm1, xmm5			// x1 = MAX( 1.0, x1 )
-		rcpss		xmm0, xmm1			// x0 = 1 / MAX( 1.0, x1 )
+		maxss		xmm1, xmm5			// x1 = max( 1.0, x1 )
+		rcpss		xmm0, xmm1			// x0 = 1 / max( 1.0, x1 )
 		movss		inv_r2, xmm0		// inv_r2 = x0
 	}
-#elif defined _LINUX || defined __APPLE__
+#elif POSIX
 		__asm__ __volatile__(
+		"movss			 %0, %%xmm5 \n\t"
 #ifdef ALIGNED_VECTOR
 		"movaps          %1, %%xmm4 \n\t"
 #else
@ -352,23 +329,64 @@ float _SSE_InvRSquared(const float* v)
 		"maxss           %%xmm5, %%xmm1 \n\t"
        "rcpss           %%xmm1, %%xmm0 \n\t"
 		"movss           %%xmm0, %0 \n\t" 
-        : "=m" (inv_r2)
-        : "m" (*v), "m" (inv_r2)
+        : "+m" (inv_r2)
+        : "m" (*v)
+        : "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
 		);
 #else
 	#error "Not Implemented"
 #endif

 	return inv_r2;
-#endif // _WIN64
 }

+
+#ifdef POSIX	// constants and helper types used by the POSIX intrinsic branches of _SSE_SinCos / _SSE_cos
+// #define _PS_CONST(Name, Val) static const ALIGN16 float _ps_##Name[4] ALIGN16_POST = { Val, Val, Val, Val }
+#define _PS_CONST_TYPE(Name, Type, Val) static const ALIGN16 Type _ps_##Name[4] ALIGN16_POST = { static_cast<Type>(Val), static_cast<Type>(Val), static_cast<Type>(Val), static_cast<Type>(Val) }
+
+_PS_CONST_TYPE(sign_mask, int, 0x80000000);	// only the sign bit set in each lane; ANDed in to extract the sign of x
+_PS_CONST_TYPE(inv_sign_mask, int, ~0x80000000);	// every bit except the sign bit; ANDed in to take the absolute value
+
+
+#define _PI32_CONST(Name, Val)  static const ALIGN16 int _pi32_##Name[4]  ALIGN16_POST = { Val, Val, Val, Val }
+
+_PI32_CONST(1, 1);	// with inv1: j = (j+1) & (~1)
+_PI32_CONST(inv1, ~1);
+_PI32_CONST(2, 2);	// masked against j to build the polynomial selection mask
+_PI32_CONST(4, 4);	// masked against j to derive the sign-swap flag
+#ifdef _WIN32	// NOTE(review): nested inside #ifdef POSIX, so this is emitted only if _WIN32 is defined as well
+_PI32_CONST(0x7f, 0x7f);
+#endif
+_PS_CONST(1  , 1.0f);
+_PS_CONST(0p5, 0.5f);
+
+_PS_CONST(minus_cephes_DP1, -0.78515625);	// DP1..DP3: negated terms for x = ((x - y * DP1) - y * DP2) - y * DP3
+_PS_CONST(minus_cephes_DP2, -2.4187564849853515625e-4);
+_PS_CONST(minus_cephes_DP3, -3.77489497744594108e-8);
+_PS_CONST(sincof_p0, -1.9515295891E-4);	// sincof_p0..p2: coefficients of the sine polynomial
+_PS_CONST(sincof_p1,  8.3321608736E-3);
+_PS_CONST(sincof_p2, -1.6666654611E-1);
+_PS_CONST(coscof_p0,  2.443315711809948E-005);	// coscof_p0..p2: coefficients of the cosine polynomial
+_PS_CONST(coscof_p1, -1.388731625493765E-003);
+_PS_CONST(coscof_p2,  4.166664568298827E-002);
+_PS_CONST(cephes_FOPI, 1.27323954473516); // 4 / M_PI
+
+typedef union xmm_mm_union {	// overlays one __m128 with two __m64 halves
+	__m128 xmm;
+	__m64 mm[2];
+} xmm_mm_union;
+// COPY_MM_TO_XMM: stores mm0_ and mm1_ into the union's two halves, then reads the result back as one __m128.
+#define COPY_MM_TO_XMM(mm0_, mm1_, xmm_) { xmm_mm_union u; u.mm[0]=mm0_; u.mm[1]=mm1_; xmm_ = u.xmm; }
+
+typedef __m128 v4sf;  // vector of 4 float (sse1)
+typedef __m64 v2si;   // vector of 2 int (mmx)
+
+#endif
+
 void _SSE_SinCos(float x, float* s, float* c)
 {
-#if defined( _WIN64 )
-	*s = std::sin(x);
-	*c = std::cos(x);
-#elif defined( _WIN32 )
+#ifdef _WIN32
 	float t4, t8, t12;

 	__asm
@ -453,8 +471,121 @@ void _SSE_SinCos(float x, float* s, float* c)
 		movss	[eax], xmm0
 		movss	[edx], xmm4
 	}
-#elif defined _LINUX || defined __APPLE__
-//	#warning "_SSE_sincos NOT implemented!"
+#elif POSIX
+	
+	Assert( "Needs testing, verify impl!\n" );
+	
+	v4sf  xx = _mm_load_ss( &x );
+	
+	v4sf xmm1, xmm2, xmm3 = _mm_setzero_ps(), sign_bit_sin, y;
+	v2si mm0, mm1, mm2, mm3, mm4, mm5;
+	sign_bit_sin = xx;
+	/* take the absolute value */
+	xx = _mm_and_ps(xx, *(v4sf*)_ps_inv_sign_mask);
+	/* extract the sign bit (upper one) */
+	sign_bit_sin = _mm_and_ps(sign_bit_sin, *(v4sf*)_ps_sign_mask);
+	
+	/* scale by 4/Pi */
+	y = _mm_mul_ps(xx, *(v4sf*)_ps_cephes_FOPI);
+	
+	/* store the integer part of y in mm2:mm3 */
+	xmm3 = _mm_movehl_ps(xmm3, y);
+	mm2 = _mm_cvttps_pi32(y);
+	mm3 = _mm_cvttps_pi32(xmm3);
+	
+	/* j=(j+1) & (~1) (see the cephes sources) */
+	mm2 = _mm_add_pi32(mm2, *(v2si*)_pi32_1);
+	mm3 = _mm_add_pi32(mm3, *(v2si*)_pi32_1);
+	mm2 = _mm_and_si64(mm2, *(v2si*)_pi32_inv1);
+	mm3 = _mm_and_si64(mm3, *(v2si*)_pi32_inv1);
+	
+	y = _mm_cvtpi32x2_ps(mm2, mm3);
+	
+	mm4 = mm2;
+	mm5 = mm3;
+	
+	/* get the swap sign flag for the sine */
+	mm0 = _mm_and_si64(mm2, *(v2si*)_pi32_4);
+	mm1 = _mm_and_si64(mm3, *(v2si*)_pi32_4);
+	mm0 = _mm_slli_pi32(mm0, 29);
+	mm1 = _mm_slli_pi32(mm1, 29);
+	v4sf swap_sign_bit_sin;
+	COPY_MM_TO_XMM(mm0, mm1, swap_sign_bit_sin);
+	
+	/* get the polynom selection mask for the sine */
+	
+	mm2 = _mm_and_si64(mm2, *(v2si*)_pi32_2);
+	mm3 = _mm_and_si64(mm3, *(v2si*)_pi32_2);
+	mm2 = _mm_cmpeq_pi32(mm2, _mm_setzero_si64());
+	mm3 = _mm_cmpeq_pi32(mm3, _mm_setzero_si64());
+	v4sf poly_mask;
+	COPY_MM_TO_XMM(mm2, mm3, poly_mask);
+	
+	/* The magic pass: "Extended precision modular arithmetic" 
+	 x = ((x - y * DP1) - y * DP2) - y * DP3; */
+	xmm1 = *(v4sf*)_ps_minus_cephes_DP1;
+	xmm2 = *(v4sf*)_ps_minus_cephes_DP2;
+	xmm3 = *(v4sf*)_ps_minus_cephes_DP3;
+	xmm1 = _mm_mul_ps(y, xmm1);
+	xmm2 = _mm_mul_ps(y, xmm2);
+	xmm3 = _mm_mul_ps(y, xmm3);
+	xx = _mm_add_ps(xx, xmm1);
+	xx = _mm_add_ps(xx, xmm2);
+	xx = _mm_add_ps(xx, xmm3);
+	
+	/* get the sign flag for the cosine */
+	mm4 = _mm_sub_pi32(mm4, *(v2si*)_pi32_2);
+	mm5 = _mm_sub_pi32(mm5, *(v2si*)_pi32_2);
+	mm4 = _mm_andnot_si64(mm4, *(v2si*)_pi32_4);
+	mm5 = _mm_andnot_si64(mm5, *(v2si*)_pi32_4);
+	mm4 = _mm_slli_pi32(mm4, 29);
+	mm5 = _mm_slli_pi32(mm5, 29);
+	v4sf sign_bit_cos;
+	COPY_MM_TO_XMM(mm4, mm5, sign_bit_cos);
+	_mm_empty(); /* good-bye mmx */
+	
+	sign_bit_sin = _mm_xor_ps(sign_bit_sin, swap_sign_bit_sin);
+	
+	
+	/* Evaluate the first polynom  (0 <= x <= Pi/4) */
+	v4sf z = _mm_mul_ps(xx,xx);
+	y = *(v4sf*)_ps_coscof_p0;
+	
+	y = _mm_mul_ps(y, z);
+	y = _mm_add_ps(y, *(v4sf*)_ps_coscof_p1);
+	y = _mm_mul_ps(y, z);
+	y = _mm_add_ps(y, *(v4sf*)_ps_coscof_p2);
+	y = _mm_mul_ps(y, z);
+	y = _mm_mul_ps(y, z);
+	v4sf tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
+	y = _mm_sub_ps(y, tmp);
+	y = _mm_add_ps(y, *(v4sf*)_ps_1);
+	
+	/* Evaluate the second polynom  (Pi/4 <= x <= 0) */
+	
+	v4sf y2 = *(v4sf*)_ps_sincof_p0;
+	y2 = _mm_mul_ps(y2, z);
+	y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1);
+	y2 = _mm_mul_ps(y2, z);
+	y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2);
+	y2 = _mm_mul_ps(y2, z);
+	y2 = _mm_mul_ps(y2, xx);
+	y2 = _mm_add_ps(y2, xx);
+	
+	/* select the correct result from the two polynoms */  
+	xmm3 = poly_mask;
+	v4sf ysin2 = _mm_and_ps(xmm3, y2);
+	v4sf ysin1 = _mm_andnot_ps(xmm3, y);
+	y2 = _mm_sub_ps(y2,ysin2);
+	y = _mm_sub_ps(y, ysin1);
+	
+	xmm1 = _mm_add_ps(ysin1,ysin2);
+	xmm2 = _mm_add_ps(y,y2);
+	
+	/* update the sign */
+	_mm_store_ss( s, _mm_xor_ps(xmm1, sign_bit_sin) );
+	_mm_store_ss( c, _mm_xor_ps(xmm2, sign_bit_cos) );
+
 #else
 	#error "Not Implemented"
 #endif
@ -462,9 +593,7 @@ void _SSE_SinCos(float x, float* s, float* c)

 float _SSE_cos( float x )
 {
-#if defined ( _WIN64 )
-	return std::cos(x);
-#elif defined( _WIN32 )
+#ifdef _WIN32
 	float temp;
 	__asm
 	{
@ -513,8 +642,102 @@ float _SSE_cos( float x )
 		movss   x,    xmm0

 	}
-#elif defined _LINUX || defined __APPLE__
-//	#warning "_SSE_cos NOT implemented!"
+#elif POSIX
+
+	Assert( "Needs testing, verify impl!\n" );
+
+	v4sf xmm1, xmm2 = _mm_setzero_ps(), xmm3, y;
+	v2si mm0, mm1, mm2, mm3;
+	/* take the absolute value */
+	v4sf  xx = _mm_load_ss( &x );
+
+	xx = _mm_and_ps(xx, *(v4sf*)_ps_inv_sign_mask);
+		
+	/* scale by 4/Pi */
+	y = _mm_mul_ps(xx, *(v4sf*)_ps_cephes_FOPI);
+	
+	/* store the integer part of y in mm0:mm1 */
+	xmm2 = _mm_movehl_ps(xmm2, y);
+	mm2 = _mm_cvttps_pi32(y);
+	mm3 = _mm_cvttps_pi32(xmm2);
+	
+	/* j=(j+1) & (~1) (see the cephes sources) */
+	mm2 = _mm_add_pi32(mm2, *(v2si*)_pi32_1);
+	mm3 = _mm_add_pi32(mm3, *(v2si*)_pi32_1);
+	mm2 = _mm_and_si64(mm2, *(v2si*)_pi32_inv1);
+	mm3 = _mm_and_si64(mm3, *(v2si*)_pi32_inv1);
+	
+	y = _mm_cvtpi32x2_ps(mm2, mm3);
+	
+	
+	mm2 = _mm_sub_pi32(mm2, *(v2si*)_pi32_2);
+	mm3 = _mm_sub_pi32(mm3, *(v2si*)_pi32_2);
+	
+	/* get the swap sign flag in mm0:mm1 and the 
+	 polynom selection mask in mm2:mm3 */
+	
+	mm0 = _mm_andnot_si64(mm2, *(v2si*)_pi32_4);
+	mm1 = _mm_andnot_si64(mm3, *(v2si*)_pi32_4);
+	mm0 = _mm_slli_pi32(mm0, 29);
+	mm1 = _mm_slli_pi32(mm1, 29);
+	
+	mm2 = _mm_and_si64(mm2, *(v2si*)_pi32_2);
+	mm3 = _mm_and_si64(mm3, *(v2si*)_pi32_2);
+	
+	mm2 = _mm_cmpeq_pi32(mm2, _mm_setzero_si64());
+	mm3 = _mm_cmpeq_pi32(mm3, _mm_setzero_si64());
+	
+	v4sf sign_bit, poly_mask;
+	COPY_MM_TO_XMM(mm0, mm1, sign_bit);
+	COPY_MM_TO_XMM(mm2, mm3, poly_mask);
+	_mm_empty(); /* good-bye mmx */
+
+	/* The magic pass: "Extended precision modular arithmetic" 
+	 x = ((x - y * DP1) - y * DP2) - y * DP3; */
+	xmm1 = *(v4sf*)_ps_minus_cephes_DP1;
+	xmm2 = *(v4sf*)_ps_minus_cephes_DP2;
+	xmm3 = *(v4sf*)_ps_minus_cephes_DP3;
+	xmm1 = _mm_mul_ps(y, xmm1);
+	xmm2 = _mm_mul_ps(y, xmm2);
+	xmm3 = _mm_mul_ps(y, xmm3);
+	xx = _mm_add_ps(xx, xmm1);
+	xx = _mm_add_ps(xx, xmm2);
+	xx = _mm_add_ps(xx, xmm3);
+	
+	/* Evaluate the first polynom  (0 <= x <= Pi/4) */
+	y = *(v4sf*)_ps_coscof_p0;
+	v4sf z = _mm_mul_ps(xx,xx);
+	
+	y = _mm_mul_ps(y, z);
+	y = _mm_add_ps(y, *(v4sf*)_ps_coscof_p1);
+	y = _mm_mul_ps(y, z);
+	y = _mm_add_ps(y, *(v4sf*)_ps_coscof_p2);
+	y = _mm_mul_ps(y, z);
+	y = _mm_mul_ps(y, z);
+	v4sf tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
+	y = _mm_sub_ps(y, tmp);
+	y = _mm_add_ps(y, *(v4sf*)_ps_1);
+	
+	/* Evaluate the second polynom  (Pi/4 <= x <= 0) */
+	
+	v4sf y2 = *(v4sf*)_ps_sincof_p0;
+	y2 = _mm_mul_ps(y2, z);
+	y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1);
+	y2 = _mm_mul_ps(y2, z);
+	y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2);
+	y2 = _mm_mul_ps(y2, z);
+	y2 = _mm_mul_ps(y2, xx);
+	y2 = _mm_add_ps(y2, xx);
+	
+	/* select the correct result from the two polynoms */  
+	xmm3 = poly_mask;
+	y2 = _mm_and_ps(xmm3, y2); //, xmm3);
+	y = _mm_andnot_ps(xmm3, y);
+	y = _mm_add_ps(y,y2);
+	/* update the sign */
+
+	_mm_store_ss( &x, _mm_xor_ps(y, sign_bit) );
+
 #else
 	#error "Not Implemented"
 #endif
@ -525,12 +748,10 @@ float _SSE_cos( float x )
 //-----------------------------------------------------------------------------
 // SSE2 implementations of optimized routines:
 //-----------------------------------------------------------------------------
+#ifdef PLATFORM_WINDOWS_PC32
 void _SSE2_SinCos(float x, float* s, float* c)  // any x
 {
-#if defined( _WIN64 )
-	*s = std::sin(x);
-	*c = std::cos(x);
-#elif defined( _WIN32 )
+#ifdef _WIN32
 	__asm
 	{
 		movss	xmm0, x
@ -606,18 +827,19 @@ void _SSE2_SinCos(float x, float* s, float* c)  // any x
 		movss	[eax], xmm0
 		movss	[edx], xmm6
 	}
-#elif defined _LINUX || defined __APPLE__
-//	#warning "_SSE2_SinCos NOT implemented!"
+#elif POSIX
+	#warning "_SSE2_SinCos NOT implemented!"
+	Assert( 0 );
 #else
 	#error "Not Implemented"
 #endif
 }
+#endif // PLATFORM_WINDOWS_PC32

+#ifdef PLATFORM_WINDOWS_PC32
 float _SSE2_cos(float x)  
 {
-#if defined ( _WIN64 )
-	return std::cos(x);
-#elif defined( _WIN32 )
+#ifdef _WIN32
 	__asm
 	{
 		movss	xmm0, x
@ -663,25 +885,25 @@ float _SSE2_cos(float x)
 		mulss	xmm0, xmm1
 		movss   x,    xmm0
 	}
-#elif defined _LINUX || defined __APPLE__
-//	#warning "_SSE2_cos NOT implemented!"
+#elif POSIX
+	#warning "_SSE2_cos NOT implemented!"
+	Assert( 0 );
 #else
 	#error "Not Implemented"
 #endif

 	return x;
 }
+#endif // PLATFORM_WINDOWS_PC32

+#if 0
 // SSE Version of VectorTransform
 void VectorTransformSSE(const float *in1, const matrix3x4_t& in2, float *out1)
 {
 	Assert( s_bMathlibInitialized );
 	Assert( in1 != out1 );
-#if defined ( _WIN64 )
-	out1[0] = DotProduct(in1, in2[0]) + in2[0][3];
-	out1[1] = DotProduct(in1, in2[1]) + in2[1][3];
-	out1[2] = DotProduct(in1, in2[2]) + in2[2][3];
-#elif defined( _WIN32 )
+
+#ifdef _WIN32
 	__asm
 	{
 		mov eax, in1;
@ -723,8 +945,8 @@ void VectorTransformSSE(const float *in1, const matrix3x4_t& in2, float *out1)
 		addss xmm0, [ecx+12]
 		movss [edx+8], xmm0;
 	}
-#elif defined _LINUX || defined __APPLE__
-//	#warning "VectorTransformSSE C implementation only"
+#elif POSIX
+	#warning "VectorTransformSSE C implementation only"
 		out1[0] = DotProduct(in1, in2[0]) + in2[0][3];
 		out1[1] = DotProduct(in1, in2[1]) + in2[1][3];
 		out1[2] = DotProduct(in1, in2[2]) + in2[2][3];
@ -732,16 +954,15 @@ void VectorTransformSSE(const float *in1, const matrix3x4_t& in2, float *out1)
 	#error "Not Implemented"
 #endif
 }
+#endif

+#if 0
 void VectorRotateSSE( const float *in1, const matrix3x4_t& in2, float *out1 )
 {
 	Assert( s_bMathlibInitialized );
 	Assert( in1 != out1 );
-#if defined ( _WIN64 )
-	out1[0] = DotProduct( in1, in2[0] );
-	out1[1] = DotProduct( in1, in2[1] );
-	out1[2] = DotProduct( in1, in2[2] );
-#elif defined( _WIN32 )
+
+#ifdef _WIN32
 	__asm
 	{
 		mov eax, in1;
@ -780,8 +1001,8 @@ void VectorRotateSSE( const float *in1, const matrix3x4_t& in2, float *out1 )
 		addss xmm0, xmm2;
 		movss [edx+8], xmm0;
 	}
-#elif defined _LINUX || defined __APPLE__
-//	#warning "VectorRotateSSE C implementation only"
+#elif POSIX
+	#warning "VectorRotateSSE C implementation only"
 		out1[0] = DotProduct( in1, in2[0] );
 		out1[1] = DotProduct( in1, in2[1] );
 		out1[2] = DotProduct( in1, in2[2] );
@ -789,8 +1010,9 @@ void VectorRotateSSE( const float *in1, const matrix3x4_t& in2, float *out1 )
 	#error "Not Implemented"
 #endif
 }
+#endif

-#if defined( _WIN32 ) && !defined( _WIN64 )
+#ifdef _WIN32
 void _declspec(naked) _SSE_VectorMA( const float *start, float scale, const float *direction, float *dest )
 {
 	// FIXME: This don't work!! It will overwrite memory in the write to dest
@ -821,7 +1043,7 @@ void _declspec(naked) _SSE_VectorMA( const float *start, float scale, const floa
 }
 #endif

-#if defined( _WIN32 ) && !defined( _WIN64 )
+#ifdef _WIN32
 #ifdef PFN_VECTORMA
 void _declspec(naked) __cdecl _SSE_VectorMA( const Vector &start, float scale, const Vector &direction, Vector &dest )
 {
@ -886,4 +1108,6 @@ vec_t DotProduct (const vec_t *a, const vec_t *c)
 		ret
 	}
 }
-*/
+*/
+
+#endif // COMPILER_MSVC64 
--- a/mathlib/sse.h
+++ b/mathlib/sse.h
@ -15,9 +15,13 @@ void FASTCALL _SSE_VectorNormalizeFast(Vector& vec);
 float _SSE_InvRSquared(const float* v);
 void _SSE_SinCos(float x, float* s, float* c);
 float _SSE_cos( float x);
+#ifdef PLATFORM_WINDOWS_PC32
 void _SSE2_SinCos(float x, float* s, float* c);
 float _SSE2_cos(float x); 
+#endif
+#if 0
 void VectorTransformSSE(const float *in1, const matrix3x4_t& in2, float *out1);
 void VectorRotateSSE( const float *in1, const matrix3x4_t& in2, float *out1 );
+#endif

 #endif // _SSE_H
--- a/mathlib/sseconst.cpp
+++ b/mathlib/sseconst.cpp
@ -1,4 +1,4 @@
-//===== Copyright © 1996-2005, Valve Corporation, All rights reserved. ======//
+//===== Copyright © 1996-2005, Valve Corporation, All rights reserved. ======//
 //
 // Purpose: 
 //
@ -30,24 +30,33 @@ const fltx4 Four_FLT_MAX={FLT_MAX,FLT_MAX,FLT_MAX,FLT_MAX};
 const fltx4 Four_Negative_FLT_MAX={-FLT_MAX,-FLT_MAX,-FLT_MAX,-FLT_MAX};
 const fltx4 g_SIMD_0123 = { 0., 1., 2., 3. };

-const int32 ALIGN16 g_SIMD_clear_signmask[4]= {(int32)0x7fffffff,(int32)0x7fffffff,(int32)0x7fffffff,(int32)0x7fffffff};
-const int32 ALIGN16 g_SIMD_signmask[4]= { (int32)0x80000000, (int32)0x80000000, (int32)0x80000000, (int32)0x80000000 };
-const int32 ALIGN16 g_SIMD_lsbmask[4]= { (int32)0xfffffffe, (int32)0xfffffffe, (int32)0xfffffffe, (int32)0xfffffffe };
-const int32 ALIGN16 g_SIMD_clear_wmask[4]= { (int32)0xffffffff, (int32)0xffffffff, (int32)0xffffffff, 0 };
-const int32 ALIGN16 g_SIMD_AllOnesMask[4]= { (int32)0xffffffff, (int32)0xffffffff, (int32)0xffffffff, (int32)0xffffffff }; // ~0,~0,~0,~0
-const int32 ALIGN16 g_SIMD_Low16BitsMask[4]= { (int32)0xffff, (int32)0xffff,(int32) 0xffff, (int32)0xffff }; // 0xffff x 4
-
-const int32 ALIGN16 g_SIMD_ComponentMask[4][4] =
+extern const fltx4 g_QuatMultRowSign[4];
+const fltx4 g_QuatMultRowSign[4] =
 {
-	{ (int32)0xFFFFFFFF, 0, 0, 0 }, { 0, (int32)0xFFFFFFFF, 0, 0 }, { 0, 0, (int32)0xFFFFFFFF, 0 }, { 0, 0, 0, (int32)0xFFFFFFFF }
+	{  1.0f,  1.0f, -1.0f, 1.0f },
+	{ -1.0f,  1.0f,  1.0f, 1.0f },
+	{  1.0f, -1.0f,  1.0f, 1.0f },
+	{ -1.0f, -1.0f, -1.0f, 1.0f }
 };

-const int32 ALIGN16 g_SIMD_SkipTailMask[4][4] =
+const int32 ALIGN16 g_SIMD_clear_signmask[4] ALIGN16_POST = {static_cast<int32>(0x7fffffff), static_cast<int32>(0x7fffffff), static_cast<int32>(0x7fffffff), static_cast<int32>(0x7fffffff)};
+const int32 ALIGN16 g_SIMD_signmask[4] ALIGN16_POST = { static_cast<int32>(0x80000000), static_cast<int32>(0x80000000), static_cast<int32>(0x80000000), static_cast<int32>(0x80000000) };
+const int32 ALIGN16 g_SIMD_lsbmask[4] ALIGN16_POST = { static_cast<int32>(0xfffffffe), static_cast<int32>(0xfffffffe), static_cast<int32>(0xfffffffe), static_cast<int32>(0xfffffffe) };
+const int32 ALIGN16 g_SIMD_clear_wmask[4] ALIGN16_POST = { static_cast<int32>(0xffffffff), static_cast<int32>(0xffffffff), static_cast<int32>(0xffffffff), 0 };
+const int32 ALIGN16 g_SIMD_AllOnesMask[4] ALIGN16_POST = { static_cast<int32>(0xffffffff), static_cast<int32>(0xffffffff), static_cast<int32>(0xffffffff), static_cast<int32>(0xffffffff) }; // ~0,~0,~0,~0
+const int32 ALIGN16 g_SIMD_Low16BitsMask[4] ALIGN16_POST = { 0xffff, 0xffff, 0xffff, 0xffff }; // 0xffff x 4
+
+const int32 ALIGN16 g_SIMD_ComponentMask[4][4] ALIGN16_POST =
 {
-	{ (int32)0xffffffff, (int32)0xffffffff, (int32)0xffffffff, (int32)0xffffffff },
-	{ (int32)0xffffffff, (int32)0x00000000, (int32)0x00000000, (int32)0x00000000 },
-	{ (int32)0xffffffff, (int32)0xffffffff, (int32)0x00000000, (int32)0x00000000 },
-	{ (int32)0xffffffff, (int32)0xffffffff, (int32)0xffffffff, (int32)0x00000000 },
+	{ static_cast<int32>(0xFFFFFFFF), 0, 0, 0 }, { 0, static_cast<int32>(0xFFFFFFFF), 0, 0 }, { 0, 0, static_cast<int32>(0xFFFFFFFF), 0 }, { 0, 0, 0, static_cast<int32>(0xFFFFFFFF) }
+};
+
+const int32 ALIGN16 g_SIMD_SkipTailMask[4][4] ALIGN16_POST =
+{
+	{ static_cast<int32>(0xffffffff), static_cast<int32>(0xffffffff), static_cast<int32>(0xffffffff), static_cast<int32>(0xffffffff) },
+	{ static_cast<int32>(0xffffffff), static_cast<int32>(0x00000000), static_cast<int32>(0x00000000), static_cast<int32>(0x00000000) },
+	{ static_cast<int32>(0xffffffff), static_cast<int32>(0xffffffff), static_cast<int32>(0x00000000), static_cast<int32>(0x00000000) },
+	{ static_cast<int32>(0xffffffff), static_cast<int32>(0xffffffff), static_cast<int32>(0xffffffff), static_cast<int32>(0x00000000) },
 };


--- a/mathlib/ssenoise.cpp
+++ b/mathlib/ssenoise.cpp
@ -30,6 +30,10 @@ static ALIGN16 int32 idx_mask[4]= {0xffff, 0xffff, 0xffff, 0xffff};
 // returns 0..1
 static inline float GetLatticePointValue( int idx_x, int idx_y, int idx_z )
 {
+	NOTE_UNUSED(perm_d);
+	NOTE_UNUSED(impulse_ycoords);
+	NOTE_UNUSED(impulse_zcoords);
+
 	int ret_idx = perm_a[idx_x & 0xff];
 	ret_idx = perm_b[( idx_y + ret_idx ) & 0xff];
 	ret_idx = perm_c[( idx_z + ret_idx ) & 0xff];
--- a/mathlib/vmatrix.cpp
+++ b/mathlib/vmatrix.cpp
@ -306,7 +306,7 @@ bool MatrixInverseGeneral(const VMatrix& src, VMatrix& dst)
 	for(iRow=0; iRow < 4; iRow++)
 	{
 		// Find the row with the largest element in this column.
-		fLargest = 0.001f;
+		fLargest = 0.00001f;
 		iLargest = -1;
 		for(iTest=iRow; iTest < 4; iTest++)
 		{
@ -509,7 +509,7 @@ bool VMatrix::IsRotationMatrix() const
 		FloatMakePositive( v2.Dot(v3) ) < 0.01f;
 }

-void VMatrix::SetupMatrixOrgAngles( const Vector &origin, const QAngle &vAngles )
+static void SetupMatrixAnglesInternal( vec_t m[4][4], const QAngle & vAngles )
 {
 	float		sr, sp, sy, cr, cp, cy;

@ -530,6 +530,11 @@ void VMatrix::SetupMatrixOrgAngles( const Vector &origin, const QAngle &vAngles
 	m[0][3] = 0.f;
 	m[1][3] = 0.f;
 	m[2][3] = 0.f;
+}
+
+void VMatrix::SetupMatrixOrgAngles( const Vector &origin, const QAngle &vAngles )
+{
+	SetupMatrixAnglesInternal( m, vAngles );
 	
 	// Add translation
 	m[0][3] = origin.x;
@ -542,6 +547,21 @@ void VMatrix::SetupMatrixOrgAngles( const Vector &origin, const QAngle &vAngles
 }


+void	VMatrix::SetupMatrixAngles( const QAngle &vAngles )	// angles-only variant: takes no origin, so the fourth column of rows 0-2 stays zero
+{
+	SetupMatrixAnglesInternal( m, vAngles );	// same helper that SetupMatrixOrgAngles calls
+
+	// Zero the fourth column of rows 0-2 and set the bottom row to (0, 0, 0, 1).
+	m[0][3] = 0.0f;	// the helper already writes 0 to m[0..2][3]; repeated here explicitly
+	m[1][3] = 0.0f;
+	m[2][3] = 0.0f;
+	m[3][0] = 0.0f;
+	m[3][1] = 0.0f;
+	m[3][2] = 0.0f;
+	m[3][3] = 1.0f;
+}
+
+
 //-----------------------------------------------------------------------------
 // Sets matrix to identity
 //-----------------------------------------------------------------------------
@ -728,7 +748,7 @@ void Vector4DMultiplyPosition( const VMatrix& src1, Vector const& src2, Vector4D
 {
 	// Make sure it works if src2 == dst
 	Vector tmp;
-	Vector const&v = ( &src2 == &dst.AsVector3D() ) ? tmp : src2;
+	Vector const&v = ( &src2 == &dst.AsVector3D() ) ? static_cast<const Vector&>(tmp) : src2;

 	if (&src2 == &dst.AsVector3D())
 	{
@ -751,7 +771,7 @@ void Vector3DMultiply( const VMatrix &src1, const Vector &src2, Vector &dst )
 {
 	// Make sure it works if src2 == dst
 	Vector tmp;
-	const Vector &v = (&src2 == &dst) ?  tmp : src2;
+	const Vector &v = (&src2 == &dst) ?  static_cast<const Vector&>(tmp) : src2;

 	if( &src2 == &dst )
 	{
@ -772,7 +792,7 @@ void Vector3DMultiplyPositionProjective( const VMatrix& src1, const Vector &src2
 {
 	// Make sure it works if src2 == dst
 	Vector tmp;
-	const Vector &v = (&src2 == &dst) ? tmp: src2;
+	const Vector &v = (&src2 == &dst) ? static_cast<const Vector&>(tmp): src2;
 	if( &src2 == &dst )
 	{
 		VectorCopy( src2, tmp );
@ -799,7 +819,7 @@ void Vector3DMultiplyProjective( const VMatrix& src1, const Vector &src2, Vector
 {
 	// Make sure it works if src2 == dst
 	Vector tmp;
-	const Vector &v = (&src2 == &dst) ? tmp : src2;
+	const Vector &v = (&src2 == &dst) ? static_cast<const Vector&>(tmp) : src2;
 	if( &src2 == &dst )
 	{
 		VectorCopy( src2, tmp );
@ -852,7 +872,7 @@ void Vector3DMultiplyTranspose( const VMatrix& src1, const Vector& src2, Vector&
 	bool srcEqualsDst = (&src2 == &dst);

 	Vector tmp;
-	const Vector&v = srcEqualsDst ? tmp : src2;
+	const Vector&v = srcEqualsDst ? static_cast<const Vector&>(tmp) : src2;

 	if (srcEqualsDst)
 	{
@ -937,7 +957,7 @@ void MatrixBuildTranslation( VMatrix& dst, const Vector &translation )
 //-----------------------------------------------------------------------------
 void MatrixBuildRotationAboutAxis( VMatrix &dst, const Vector &vAxisOfRot, float angleDegrees )
 {
-	MatrixBuildRotationAboutAxis( vAxisOfRot, angleDegrees, dst.As3x4() );
+	MatrixBuildRotationAboutAxis( vAxisOfRot, angleDegrees, const_cast< matrix3x4_t &> ( dst.As3x4() ) );
 	dst[3][0] = 0;
 	dst[3][1] = 0;
 	dst[3][2] = 0;
@ -1233,19 +1253,29 @@ void MatrixBuildOrtho( VMatrix& dst, double left, double top, double right, doub
 				0.0f,						0.0f,						0.0f,								1.0f );
 }

+void MatrixBuildPerspectiveZRange( VMatrix& dst, double flZNear, double flZFar )	// overwrites only row 2 of dst with the depth-range terms; other rows are untouched
+{
+	dst.m[2][0] = 0.0f;
+	dst.m[2][1] = 0.0f;
+	dst.m[2][2] = flZFar / ( flZNear - flZFar );	// NOTE(review): divides by zero when flZNear == flZFar; no guard here
+	dst.m[2][3] = flZNear * flZFar / ( flZNear - flZFar );	// same denominator as the line above
+}
+
 void MatrixBuildPerspectiveX( VMatrix& dst, double flFovX, double flAspect, double flZNear, double flZFar )
 {
-	float flWidth = 2.0f * flZNear * tanf( flFovX * M_PI / 360.0f );
-	float flHeight = flWidth / flAspect;
-	dst.Init(   2.0f * flZNear / flWidth,						0.0f,							0.0f,										0.0f,
-				0.0f,  2.0f  * flZNear/ flHeight,							0.0f,										0.0f,
-				0.0f,						0.0f,  flZFar / ( flZNear - flZFar ),	 flZNear * flZFar / ( flZNear - flZFar ),
+	float flWidthScale = 1.0f / tanf( flFovX * M_PI / 360.0f );	// flFovX * M_PI / 360 is half of flFovX in radians, assuming flFovX is in degrees
+	float flHeightScale = flAspect * flWidthScale;	// vertical scale is the horizontal scale multiplied by the aspect ratio
+	dst.Init(   flWidthScale,				0.0f,							0.0f,										0.0f,
+				0.0f,						flHeightScale,					0.0f,										0.0f,
+				0.0f,						0.0f,							0.0f,										0.0f,
+				0.0f,						0.0f,						   -1.0f,										0.0f );
+	// Row 2 is passed to Init as zeros; the call below fills it from the near/far planes.
+	MatrixBuildPerspectiveZRange ( dst, flZNear, flZFar );
 }

 void MatrixBuildPerspectiveOffCenterX( VMatrix& dst, double flFovX, double flAspect, double flZNear, double flZFar, double bottom, double top, double left, double right )
 {
-	float flWidth = 2.0f * flZNear * tanf( flFovX * M_PI / 360.0f );
+	float flWidth = tanf( flFovX * M_PI / 360.0f );
 	float flHeight = flWidth / flAspect;

 	// bottom, top, left, right are 0..1 so convert to -<val>/2..<val>/2
@ -1254,10 +1284,12 @@ void MatrixBuildPerspectiveOffCenterX( VMatrix& dst, double flFovX, double flAsp
 	float flBottom = -(flHeight/2.0f) * (1.0f - bottom) + bottom * (flHeight/2.0f);
 	float flTop    = -(flHeight/2.0f) * (1.0f - top)    + top    * (flHeight/2.0f);

-	dst.Init(   (2.0f * flZNear) / (flRight-flLeft),                           0.0f, (flLeft+flRight)/(flRight-flLeft),                            0.0f,
-				0.0f,  2.0f*flZNear/(flTop-flBottom), (flTop+flBottom)/(flTop-flBottom),                            0.0f,
-				0.0f,                           0.0f,           flZFar/(flZNear-flZFar),  flZNear*flZFar/(flZNear-flZFar),
-				0.0f,                           0.0f,                             -1.0f,                            0.0f );
+	dst.Init(   1.0f / (flRight-flLeft),        0.0f,                           (flLeft+flRight)/(flRight-flLeft),  0.0f,
+				0.0f,                           1.0f /(flTop-flBottom),         (flTop+flBottom)/(flTop-flBottom),  0.0f,
+				0.0f,                           0.0f,							0.0f,								0.0f,
+				0.0f,                           0.0f,                           -1.0f,								0.0f );
+
+	MatrixBuildPerspectiveZRange ( dst, flZNear, flZFar );
 }
 #endif // !_STATIC_LINKED || _SHARED_LIB

--- a/public/mathlib/amd3dx.h
+++ b/public/mathlib/amd3dx.h
--- a/public/mathlib/compressed_vector.h
+++ b/public/mathlib/compressed_vector.h
@ -58,8 +58,8 @@ inline Vector32& Vector32::operator=(const Vector &vOther)

 	static float expScale[4] = { 4.0f, 16.0f, 32.f, 64.f };

-	float fmax = MAX( fabs( vOther.x ), fabs( vOther.y ) );
-	fmax = fpmax( fmax, fabs( vOther.z ) );
+	float fmax = Max( fabs( vOther.x ), fabs( vOther.y ) );
+	fmax = Max( fmax, (float)fabs( vOther.z ) );

 	for (exp = 0; exp < 3; exp++)
 	{
@ -70,9 +70,9 @@ inline Vector32& Vector32::operator=(const Vector &vOther)

 	float fexp = 512.0f / expScale[exp];

-	x = clamp( (int)(vOther.x * fexp) + 512, 0, 1023 );
-	y = clamp( (int)(vOther.y * fexp) + 512, 0, 1023 );
-	z = clamp( (int)(vOther.z * fexp) + 512, 0, 1023 );
+	x = Clamp( (int)(vOther.x * fexp) + 512, 0, 1023 );
+	y = Clamp( (int)(vOther.y * fexp) + 512, 0, 1023 );
+	z = Clamp( (int)(vOther.z * fexp) + 512, 0, 1023 );
 	return *this; 
 }

@ -118,8 +118,8 @@ inline Normal32& Normal32::operator=(const Vector &vOther)
 {
 	CHECK_VALID(vOther);

-	x = clamp( (int)(vOther.x * 16384) + 16384, 0, 32767 );
-	y = clamp( (int)(vOther.y * 16384) + 16384, 0, 32767 );
+	x = Clamp( (int)(vOther.x * 16384) + 16384, 0, 32767 );
+	y = Clamp( (int)(vOther.y * 16384) + 16384, 0, 32767 );
 	zneg = (vOther.z < 0);
 	//x = vOther.x; 
 	//y = vOther.y; 
@ -182,9 +182,9 @@ inline Quaternion64& Quaternion64::operator=(const Quaternion &vOther)
 {
 	CHECK_VALID(vOther);

-	x = clamp( (int)(vOther.x * 1048576) + 1048576, 0, 2097151 );
-	y = clamp( (int)(vOther.y * 1048576) + 1048576, 0, 2097151 );
-	z = clamp( (int)(vOther.z * 1048576) + 1048576, 0, 2097151 );
+	x = Clamp( (int)(vOther.x * 1048576) + 1048576, 0, 2097151 );
+	y = Clamp( (int)(vOther.y * 1048576) + 1048576, 0, 2097151 );
+	z = Clamp( (int)(vOther.z * 1048576) + 1048576, 0, 2097151 );
 	wneg = (vOther.w < 0);
 	return *this; 
 }
@ -229,9 +229,9 @@ inline Quaternion48& Quaternion48::operator=(const Quaternion &vOther)
 {
 	CHECK_VALID(vOther);

-	x = clamp( (int)(vOther.x * 32768) + 32768, 0, 65535 );
-	y = clamp( (int)(vOther.y * 32768) + 32768, 0, 65535 );
-	z = clamp( (int)(vOther.z * 16384) + 16384, 0, 32767 );
+	x = Clamp( (int)(vOther.x * 32768) + 32768, 0, 65535 );
+	y = Clamp( (int)(vOther.y * 32768) + 32768, 0, 65535 );
+	z = Clamp( (int)(vOther.z * 16384) + 16384, 0, 32767 );
 	wneg = (vOther.w < 0);
 	return *this; 
 }
@ -276,9 +276,9 @@ inline Quaternion32& Quaternion32::operator=(const Quaternion &vOther)
 {
 	CHECK_VALID(vOther);

-	x = clamp( (int)(vOther.x * 1024) + 1024, 0, 2047 );
-	y = clamp( (int)(vOther.y * 512) + 512, 0, 1023 );
-	z = clamp( (int)(vOther.z * 512) + 512, 0, 1023 );
+	x = Clamp( (int)(vOther.x * 1024) + 1024, 0, 2047 );
+	y = Clamp( (int)(vOther.y * 512) + 512, 0, 1023 );
+	z = Clamp( (int)(vOther.z * 512) + 512, 0, 1023 );
 	wneg = (vOther.w < 0);
 	return *this; 
 }
--- a/public/mathlib/lightdesc.h
+++ b/public/mathlib/lightdesc.h
@ -28,6 +28,7 @@ enum LightType_OptimizationFlags_t
 	LIGHTTYPE_OPTIMIZATIONFLAGS_HAS_ATTENUATION0 = 1,
 	LIGHTTYPE_OPTIMIZATIONFLAGS_HAS_ATTENUATION1 = 2,
 	LIGHTTYPE_OPTIMIZATIONFLAGS_HAS_ATTENUATION2 = 4,
+	LIGHTTYPE_OPTIMIZATIONFLAGS_DERIVED_VALUES_CALCED = 8,
 };

 struct LightDesc_t 
@ -102,6 +103,11 @@ public:
 	{
 		return ((m_Type!=MATERIAL_LIGHT_SPOT) || (rdir.Dot(m_Direction)>=m_PhiDot));
 	}
+
+	float OneOverThetaDotMinusPhiDot() const
+	{
+		return OneOver_ThetaDot_Minus_PhiDot;
+	}
 };


--- a/public/mathlib/math_pfns.h
+++ b/public/mathlib/math_pfns.h
@ -28,6 +28,14 @@ extern float (*pfFastCos)(float x);
 #define FastSinCos(x,s,c)   (*pfFastSinCos)(x,s,c)
 #define FastCos(x)			(*pfFastCos)(x)

+#if defined(__i386__) || defined(_M_IX86)
+// On x86, the inline FPU or SSE sqrt instruction is faster than
+// the overhead of setting up a function call and saving/restoring
+// the FPU or SSE register state and can be scheduled better, too.
+#undef FastSqrt
+#define FastSqrt(x)			::sqrtf(x)
+#endif
+
 #endif // !_X360

 #if defined( _X360 )
--- a/public/mathlib/mathlib.h
+++ b/public/mathlib/mathlib.h
@ -16,6 +16,61 @@

 #include "mathlib/math_pfns.h"

+#if defined(__i386__) || defined(_M_IX86) || defined(__x86_64__) || defined(_M_X64)
+// For SSE scalar intrinsics (_mm_load_ss, _mm_cvtss_si32, ...); RoundFloatToInt uses them on x86-64 as well
+#include <xmmintrin.h>
+#endif
+
+// XXX remove me
+#undef clamp
+
+#ifdef DEBUG  // stop crashing edit-and-continue
+FORCEINLINE float clamp( float val, float minVal, float maxVal )
+{
+	if ( maxVal < minVal )
+		return maxVal;
+	else if( val < minVal )
+		return minVal;
+	else if( val > maxVal )
+		return maxVal;
+	else
+		return val;
+}
+#else // DEBUG
+FORCEINLINE float clamp( float val, float minVal, float maxVal )
+{
+#if defined(__i386__) || defined(_M_IX86)
+	_mm_store_ss( &val,
+		_mm_min_ss(
+			_mm_max_ss(
+				_mm_load_ss(&val),
+				_mm_load_ss(&minVal) ),
+			_mm_load_ss(&maxVal) ) );
+#else
+	val = fpmax(minVal, val);
+	val = fpmin(maxVal, val);
+#endif
+	return val;
+}
+#endif // DEBUG
+
+//
+// Returns val clamped to the range [minVal, maxVal]; if maxVal < minVal, maxVal is returned.
+//
+template< class T >
+inline T clamp( T const &val, T const &minVal, T const &maxVal )
+{
+	if ( maxVal < minVal )
+		return maxVal;
+	else if( val < minVal )
+		return minVal;
+	else if( val > maxVal )
+		return maxVal;
+	else
+		return val;
+}
+
+
 // plane_t structure
 // !!! if this is changed, it must be changed in asm code too !!!
 // FIXME: does the asm code even exist anymore?
@ -225,12 +280,12 @@ FORCEINLINE void VectorClear(vec_t *a)

 FORCEINLINE float VectorMaximum(const vec_t *v)
 {
-	return MAX( v[0], MAX( v[1], v[2] ) );
+	return V_max( v[0], V_max( v[1], v[2] ) );
 }

 FORCEINLINE float VectorMaximum(const Vector& v)
 {
-	return MAX( v.x, MAX( v.y, v.z ) );
+	return V_max( v.x, V_max( v.y, v.z ) );
 }

 FORCEINLINE void VectorScale (const float* in, vec_t scale, float* out)
@ -255,7 +310,7 @@ inline void VectorNegate(vec_t *a)
 }


-//#define VectorMaximum(a)		( MAX( (a)[0], MAX( (a)[1], (a)[2] ) ) )
+//#define VectorMaximum(a)		( V_max( (a)[0], V_max( (a)[1], (a)[2] ) ) )
 #define Vector2Clear(x)			{(x)[0]=(x)[1]=0;}
 #define Vector2Negate(x)		{(x)[0]=-((x)[0]);(x)[1]=-((x)[1]);}
 #define Vector2Copy(a,b)		{(b)[0]=(a)[0];(b)[1]=(a)[1];}
@ -282,10 +337,10 @@ FORCEINLINE void VectorMAInline( const Vector& start, float scale, const Vector&
 	dest.z=start.z+direction.z*scale;
 }

-//FORCEINLINE void VectorMA( const Vector& start, float scale, const Vector& direction, Vector& dest )
-//{
-//	VectorMAInline(start, scale, direction, dest);
-//}
+FORCEINLINE void VectorMA( const Vector& start, float scale, const Vector& direction, Vector& dest )
+{
+	VectorMAInline(start, scale, direction, dest);
+}

 FORCEINLINE void VectorMA( const float * start, float scale, const float *direction, float *dest )
 {
@ -314,12 +369,9 @@ int Q_log2(int val);
 // Math routines done in optimized assembly math package routines
 void inline SinCos( float radians, float *sine, float *cosine )
 {
-#if defined( _WIN64 )
-	*sine = sinf(radians);
-	*cosine = cosf(radians);
-#elif defined( _X360 )
+#if defined( _X360 )
 	XMScalarSinCos( sine, cosine, radians );
-#elif defined( _WIN32 )
+#elif defined( PLATFORM_WINDOWS_PC32 )
 	_asm
 	{
 		fld		DWORD PTR [radians]
@ -331,11 +383,12 @@ void inline SinCos( float radians, float *sine, float *cosine )
 		fstp DWORD PTR [edx]
 		fstp DWORD PTR [eax]
 	}
-#elif defined( _LINUX ) || defined ( __APPLE__ )
+#elif defined( PLATFORM_WINDOWS_PC64 )
+	*sine = sin( radians );
+	*cosine = cos( radians );
+#elif defined( POSIX )
 	double __cosr, __sinr;
- 	__asm __volatile__
-    		("fsincos"
-     	: "=t" (__cosr), "=u" (__sinr) : "0" (radians));
+	__asm ("fsincos" : "=t" (__cosr), "=u" (__sinr) : "0" (radians));

  	*sine = __sinr;
  	*cosine = __cosr;
@ -379,11 +432,6 @@ FORCEINLINE T Square( T const &a )
 }


-FORCEINLINE bool IsPowerOfTwo( uint x )
-{
-	return ( x & ( x - 1 ) ) == 0;
-}
-
 // return the smallest power of two >= x.
 // returns 0 if x == 0 or x > 0x80000000 (ie numbers that would be negative if x was signed)
 // NOTE: the old code took an int, and if you pass in an int of 0x80000000 casted to a uint,
@ -450,6 +498,19 @@ bool MatricesAreEqual( const matrix3x4_t &src1, const matrix3x4_t &src2, float f
 void MatrixGetColumn( const matrix3x4_t &in, int column, Vector &out );
 void MatrixSetColumn( const Vector &in, int column, matrix3x4_t &out );

+inline void MatrixGetTranslation( const matrix3x4_t &in, Vector &out )
+{
+	MatrixGetColumn ( in, 3, out );
+}
+
+inline void MatrixSetTranslation( const Vector &in, matrix3x4_t &out )
+{
+	MatrixSetColumn ( in, 3, out );
+}
+
+void MatrixScaleBy ( const float flScale, matrix3x4_t &out );
+void MatrixScaleByZero ( matrix3x4_t &out );
+
 //void DecomposeRotation( const matrix3x4_t &mat, float *out );
 void ConcatRotations (const matrix3x4_t &in1, const matrix3x4_t &in2, matrix3x4_t &out);
 void ConcatTransforms (const matrix3x4_t &in1, const matrix3x4_t &in2, matrix3x4_t &out);
@ -625,15 +686,11 @@ template <class T> FORCEINLINE T AVG(T a, T b)
 }

 // number of elements in an array of static size
-#define NELEMS(x) ((sizeof(x))/sizeof(x[0]))
+#define NELEMS(x) ARRAYSIZE(x)

 // XYZ macro, for printf type functions - ex printf("%f %f %f",XYZ(myvector));
 #define XYZ(v) (v).x,(v).y,(v).z

-//
-// Returns a clamped value in the range [min, max].
-//
-#define V_clamp(val, min, max) (((val) > (max)) ? (max) : (((val) < (min)) ? (min) : (val)))

 inline float Sign( float x )
 {
@ -1070,14 +1127,14 @@ inline float SimpleSplineRemapValClamped( float val, float A, float B, float C,
 	if ( A == B )
 		return val >= B ? D : C;
 	float cVal = (val - A) / (B - A);
-	cVal = V_clamp( cVal, 0.0f, 1.0f );
+	cVal = clamp( cVal, 0.0f, 1.0f );
 	return C + (D - C) * SimpleSpline( cVal );
 }

 FORCEINLINE int RoundFloatToInt(float f)
 {
-#if defined( _WIN64 )
-	return std::round(f);
+#if defined(__i386__) || defined(_M_IX86) || defined( PLATFORM_WINDOWS_PC64 ) || defined(__x86_64__)
+	return _mm_cvtss_si32(_mm_load_ss(&f));
 #elif defined( _X360 )
 #ifdef Assert
 	Assert( IsFPUControlWordSet() );
@ -1089,72 +1146,23 @@ FORCEINLINE int RoundFloatToInt(float f)
 	};
 	flResult = __fctiw( f );
 	return pResult[1];
-#else // !X360
-	int nResult;
-#if defined( _WIN32 )
-	__asm
-	{
-		fld f
-		fistp nResult
-	}
-#elif defined( _LINUX ) || defined( __APPLE__ )
-	__asm __volatile__ (
-		"fistpl %0;": "=m" (nResult): "t" (f) : "st"
-	);
-#endif
-	return nResult;
+#else
+#error Unknown architecture
 #endif
 }

 FORCEINLINE unsigned char RoundFloatToByte(float f)
 {
-#if defined( _WIN64 )
-	return std::round(f);
-#elif defined( _X360 )
+	int nResult = RoundFloatToInt(f);
 #ifdef Assert
-	Assert( IsFPUControlWordSet() );
-#endif
-	union
-	{
-		double flResult;
-		int pIntResult[2];
-		unsigned char pResult[8];
-	};
-	flResult = __fctiw( f );
-#ifdef Assert
-	Assert( pIntResult[1] >= 0 && pIntResult[1] <= 255 );
-#endif
-	return pResult[8];
-
-#else // !X360
-	
-	int nResult;
-
-#if defined( _WIN32 )
-	__asm
-	{
-		fld f
-		fistp nResult
-	}
-#elif defined( _LINUX ) || defined( __APPLE__ )
-	__asm __volatile__ (
-		"fistpl %0;": "=m" (nResult): "t" (f) : "st"
-	);
-#endif
-
-#ifdef Assert
-	Assert( nResult >= 0 && nResult <= 255 );
-#endif 
-	return nResult;
-
+	Assert( (nResult & ~0xFF) == 0 );
 #endif
+	return (unsigned char) nResult;
 }

-FORCEINLINE uint32_t RoundFloatToUnsignedLong(float f)
+FORCEINLINE unsigned long RoundFloatToUnsignedLong(float f)
 {
-#if defined( _WIN64 )
-	return std::round(f);
-#elif defined( _X360 )
+#if defined( _X360 )
 #ifdef Assert
 	Assert( IsFPUControlWordSet() );
 #endif
@ -1162,29 +1170,48 @@ FORCEINLINE uint32_t RoundFloatToUnsignedLong(float f)
 	{
 		double flResult;
 		int pIntResult[2];
-		uint32_t pResult[2];
+		unsigned long pResult[2];
 	};
 	flResult = __fctiw( f );
 	Assert( pIntResult[1] >= 0 );
 	return pResult[1];
 #else  // !X360
 	
+#if defined( PLATFORM_WINDOWS_PC64 )
+	uint nRet = ( uint ) f;
+	if ( nRet & 1 )
+	{
+		if ( ( f - floor( f ) >= 0.5 ) )
+		{
+			nRet++;
+		}
+	}
+	else
+	{
+		if ( ( f - floor( f ) > 0.5 ) )
+		{
+			nRet++;
+		}
+	}
+	return nRet;
+#else // PLATFORM_WINDOWS_PC64
 	unsigned char nResult[8];

-#if defined( _WIN32 )
-	__asm
-	{
-		fld f
-		fistp       qword ptr nResult
-	}
-#elif defined( _LINUX ) || defined( __APPLE__ )
-	__asm __volatile__ (
-		"fistpl %0;": "=m" (nResult): "t" (f) : "st"
-	);
-#endif
+	#if defined( _WIN32 )
+		__asm
+		{
+			fld f
+			fistp       qword ptr nResult
+		}
+	#elif POSIX
+		__asm __volatile__ (
+			"fistpl %0;": "=m" (nResult): "t" (f) : "st"
+		);
+	#endif

-	return *((uint32_t*)nResult);
-#endif
+		return *((unsigned long*)nResult);
+#endif // PLATFORM_WINDOWS_PC64
+#endif // !X360
 }

 FORCEINLINE bool IsIntegralValue( float flValue, float flTolerance = 0.001f )
@ -1195,9 +1222,7 @@ FORCEINLINE bool IsIntegralValue( float flValue, float flTolerance = 0.001f )
 // Fast, accurate ftol:
 FORCEINLINE int Float2Int( float a )
 {
-#if defined ( _WIN64 )
-	return a;
-#elif defined( _X360 )
+#if defined( _X360 )
 	union
 	{
 		double flResult;
@ -1206,78 +1231,54 @@ FORCEINLINE int Float2Int( float a )
 	flResult = __fctiwz( a );
 	return pResult[1];
 #else  // !X360
-	
-	int RetVal;
-
-#if defined( _WIN32 )
-	int CtrlwdHolder;
-	int CtrlwdSetter;
-	__asm 
-	{
-		fld    a					// push 'a' onto the FP stack
-		fnstcw CtrlwdHolder		// store FPU control word
-		movzx  eax, CtrlwdHolder	// move and zero extend word into eax
-		and    eax, 0xFFFFF3FF	// set all bits except rounding bits to 1
-		or     eax, 0x00000C00	// set rounding mode bits to round towards zero
-		mov    CtrlwdSetter, eax	// Prepare to set the rounding mode -- prepare to enter plaid!
-		fldcw  CtrlwdSetter		// Entering plaid!
-		fistp  RetVal				// Store and converted (to int) result
-		fldcw  CtrlwdHolder		// Restore control word
-	}
-#elif defined( _LINUX ) || defined ( __APPLE__ ) 
-	RetVal = static_cast<int>( a );
-#endif
-
-	return RetVal;
+	// Rely on compiler to generate CVTTSS2SI on x86
+	return (int) a;
 #endif
 }

 // Over 15x faster than: (int)floor(value)
 inline int Floor2Int( float a )
 {
-#if defined ( _WIN64 )
-	return std::floor(a);
-#else
 	int RetVal;
-#if defined( _X360 )
-	RetVal = (int)floor( a );
-#elif defined( _WIN32 )
-   int CtrlwdHolder;
-   int CtrlwdSetter;
-   __asm 
-   {
-      fld    a					// push 'a' onto the FP stack
-      fnstcw CtrlwdHolder		// store FPU control word
-      movzx  eax, CtrlwdHolder	// move and zero extend word into eax
-      and    eax, 0xFFFFF3FF	// set all bits except rounding bits to 1
-      or     eax, 0x00000400	// set rounding mode bits to round down
-      mov    CtrlwdSetter, eax	// Prepare to set the rounding mode -- prepare to enter plaid!
-      fldcw  CtrlwdSetter		// Entering plaid!
-      fistp  RetVal				// Store floored and converted (to int) result
-      fldcw  CtrlwdHolder		// Restore control word
-   }
-#elif defined( _LINUX ) || defined( __APPLE__ )
+#if defined( __i386__ )
+	// Convert to int and back, compare, subtract one if too big
+	__m128 a128 = _mm_set_ss(a);
+	RetVal = _mm_cvtss_si32(a128);
+    __m128 rounded128 = _mm_cvt_si2ss(_mm_setzero_ps(), RetVal);
+	RetVal -= _mm_comigt_ss( rounded128, a128 );
+#else
 	RetVal = static_cast<int>( floor(a) );
 #endif
 	return RetVal;
-#endif // _WIN64
 }

 //-----------------------------------------------------------------------------
 // Fast color conversion from float to unsigned char
 //-----------------------------------------------------------------------------
-FORCEINLINE unsigned char FastFToC( float c )
+FORCEINLINE unsigned int FastFToC( float c )
 {
-	volatile float dc;
-
-	// ieee trick
-	dc = c * 255.0f + (float)(1 << 23);
-	
-	// return the lsb
-#if defined( _X360 )
-	return ((unsigned char*)&dc)[3];
+#if defined( __i386__ )
+	// IEEE float bit manipulation works for values between [0, 1<<23)
+	union { float f; int i; } convert = { c*255.0f + (float)(1<<23) };
+	return convert.i & 255;
 #else
-	return *(unsigned char*)&dc;
+	// console CPUs suffer from a load-hit-store penalty
+	return Float2Int( c * 255.0f );
+#endif
+}
+
+//-----------------------------------------------------------------------------
+// Fast conversion from float to integer with magnitude less than 2**22
+//-----------------------------------------------------------------------------
+FORCEINLINE int FastFloatToSmallInt( float c )
+{
+#if defined( __i386__ )
+	// IEEE float bit manipulation works for values between [-1<<22, 1<<22)
+	union { float f; int i; } convert = { c + (float)(3<<22) };
+	return (convert.i & ((1<<23)-1)) - (1<<22);
+#else
+	// console CPUs suffer from a load-hit-store penalty
+	return Float2Int( c );
 #endif
 }

@ -1289,39 +1290,23 @@ FORCEINLINE unsigned char FastFToC( float c )
 inline float ClampToMsec( float in )
 {
 	int msec = Floor2Int( in * 1000.0f + 0.5f );
-	return msec / 1000.0f;
+	return 0.001f * msec;
 }

 // Over 15x faster than: (int)ceil(value)
 inline int Ceil2Int( float a )
 {
-#if defined ( _WIN64 )
-	return std::ceil(a);
-#else
   int RetVal;
-
-#if defined( _X360 )
-	RetVal = (int)ceil( a );
-#elif defined( _WIN32 )
-   int CtrlwdHolder;
-   int CtrlwdSetter;
-   __asm 
-   {
-      fld    a					// push 'a' onto the FP stack
-      fnstcw CtrlwdHolder		// store FPU control word
-      movzx  eax, CtrlwdHolder	// move and zero extend word into eax
-      and    eax, 0xFFFFF3FF	// set all bits except rounding bits to 1
-      or     eax, 0x00000800	// set rounding mode bits to round down
-      mov    CtrlwdSetter, eax	// Prepare to set the rounding mode -- prepare to enter plaid!
-      fldcw  CtrlwdSetter		// Entering plaid!
-      fistp  RetVal				// Store floored and converted (to int) result
-      fldcw  CtrlwdHolder		// Restore control word
-   }
-#elif defined( _LINUX ) || defined( __APPLE__ )
-	RetVal = static_cast<int>( ceil(a) );
+#if defined( __i386__ )
+   // Convert to int and back, compare, add one if too small
+   __m128 a128 = _mm_load_ss(&a);
+   RetVal = _mm_cvtss_si32(a128);
+   __m128 rounded128 = _mm_cvt_si2ss(_mm_setzero_ps(), RetVal);
+   RetVal += _mm_comilt_ss( rounded128, a128 );
+#else
+   RetVal = static_cast<int>( ceil(a) );
 #endif
 	return RetVal;
-#endif // _WIN64
 }


@ -1436,7 +1421,7 @@ FORCEINLINE unsigned char LinearToLightmap( float f )

 FORCEINLINE void ColorClamp( Vector& color )
 {
-	float maxc = MAX( color.x, MAX( color.y, color.z ) );
+	float maxc = V_max( color.x, V_max( color.y, color.z ) );
 	if ( maxc > 1.0f )
 	{
 		float ooMax = 1.0f / maxc;
@ -1565,7 +1550,7 @@ float Hermite_Spline(
 	float t );


-void Hermite_SplineBasis( float t, float basis[4] );
+void Hermite_SplineBasis( float t, float basis[] );

 void Hermite_Spline( 
 	const Quaternion &q0, 
@ -1932,10 +1917,10 @@ FORCEINLINE unsigned int * PackNormal_SHORT2( float nx, float ny, float nz, unsi
 	ny *= 16384.0f;

 	// '0' and '32768' values are invalid encodings
-	nx = MAX( nx, 1.0f );		// Make sure there are no zero values
-	ny = MAX( ny, 1.0f );
-	nx = MIN( nx, 32767.0f );	// Make sure there are no 32768 values
-	ny = MIN( ny, 32767.0f );
+	nx = V_max( nx, 1.0f );		// Make sure there are no zero values
+	ny = V_max( ny, 1.0f );
+	nx = V_min( nx, 32767.0f );	// Make sure there are no 32768 values
+	ny = V_min( ny, 32767.0f );

 	if ( nz < 0.0f )
 		nx = -nx;				// Set the sign bit for z
@ -2085,6 +2070,46 @@ void RGBtoHSV( const Vector &rgb, Vector &hsv );
 void HSVtoRGB( const Vector &hsv, Vector &rgb );


+//-----------------------------------------------------------------------------
+// Fast version of pow and log
+//-----------------------------------------------------------------------------
+
+float FastLog2(float i);			// log2( i )
+float FastPow2(float i);			// 2^i
+float FastPow(float a, float b);	// a^b
+float FastPow10( float i );			// 10^i
+
+//-----------------------------------------------------------------------------
+// For testing float equality
+//-----------------------------------------------------------------------------
+
+inline bool CloseEnough( float a, float b, float epsilon = EQUAL_EPSILON )
+{
+	return fabs( a - b ) <= epsilon;
+}
+
+inline bool CloseEnough( const Vector &a, const Vector &b, float epsilon = EQUAL_EPSILON )
+{
+	return fabs( a.x - b.x ) <= epsilon &&
+		fabs( a.y - b.y ) <= epsilon &&
+		fabs( a.z - b.z ) <= epsilon;
+}
+
+// Fast compare
+// maxUlps is the maximum error in terms of Units in the Last Place. This 
+// specifies how big an error we are willing to accept in terms of the value
+// of the least significant digit of the floating point number's
+// representation. maxUlps can also be interpreted in terms of how many 
+// representable floats we are willing to accept between A and B. 
+// This function will allow maxUlps-1 floats between A and B.
+bool AlmostEqual(float a, float b, int maxUlps = 10);
+
+inline bool AlmostEqual( const Vector &a, const Vector &b, int maxUlps = 10)
+{
+	return AlmostEqual( a.x, b.x, maxUlps ) &&
+		AlmostEqual( a.y, b.y, maxUlps ) &&
+		AlmostEqual( a.z, b.z, maxUlps );
+}

 #endif	// MATH_BASE_H

--- a/public/mathlib/matrixmath.h
+++ b/public/mathlib/matrixmath.h
@ -0,0 +1,385 @@
+//========= Copyright Valve Corporation, All rights reserved. ============//
+//
+// Purpose: 
+//
+//  A set of generic, template-based matrix functions.
+//===========================================================================//
+
+#ifndef MATRIXMATH_H
+#define MATRIXMATH_H
+
+#include <stdarg.h>
+
+// The operations in this file can perform basic matrix operations on matrices represented
+// using any class that supports the necessary operations:
+//
+//  .Element( row, col )  - return the element at a given matrix position
+//  .SetElement( row, col, val ) - modify an element
+//  .Width(), .Height() - get dimensions
+//  .SetDimensions( nrows, ncols) - set a matrix to be un-initted and the appropriate size
+//
+// Generally, vectors can be used with these functions by using N x 1 matrices to represent them.
+//  Matrices are addressed as row, column, and indices are 0-based
+//
+//
+// Note that the template versions of these routines are defined for generality - it is expected
+// that template specialization is used for common high performance cases.
+
+namespace MatrixMath
+{
+	/// M *= flScaleValue
+	template<class MATRIXCLASS>
+	void ScaleMatrix( MATRIXCLASS &matrix, float flScaleValue )
+	{
+		for( int i = 0; i < matrix.Height(); i++ )
+		{
+			for( int j = 0; j < matrix.Width(); j++ )
+			{
+				matrix.SetElement( i, j, flScaleValue * matrix.Element( i, j ) );
+			}
+		}
+	}
+
+	/// AppendElement - same as setting the element, except it only works when all calls
+	/// happen in top-to-bottom, left-to-right order, and you have to call FinishedAppending when
+	/// done. For normal matrix classes this is no different from SetElement, but for
+	/// CSparseMatrix, it is an accelerated way to fill a matrix from scratch.
+	template<class MATRIXCLASS>
+	FORCEINLINE void AppendElement( MATRIXCLASS &matrix, int nRow, int nCol, float flValue )
+	{
+		matrix.SetElement( nRow, nCol, flValue );			// default implementation
+	}
+
+	template<class MATRIXCLASS>
+	FORCEINLINE void FinishedAppending( MATRIXCLASS &matrix ) {} // default implementation
+
+	/// M += fl
+	template<class MATRIXCLASS>
+	void AddToMatrix( MATRIXCLASS &matrix, float flAddend )
+	{
+		for( int i = 0; i < matrix.Height(); i++ )
+		{
+			for( int j = 0; j < matrix.Width(); j++ )
+			{
+				matrix.SetElement( i, j, flAddend + matrix.Element( i, j ) );
+			}
+		}
+	}
+
+	/// transpose
+	template<class MATRIXCLASSIN, class MATRIXCLASSOUT>
+	void TransposeMatrix( MATRIXCLASSIN const &matrixIn, MATRIXCLASSOUT *pMatrixOut )
+	{
+		pMatrixOut->SetDimensions( matrixIn.Width(), matrixIn.Height() );
+		for( int i = 0; i < pMatrixOut->Height(); i++ )
+		{
+			for( int j = 0; j < pMatrixOut->Width(); j++ )
+			{
+				AppendElement( *pMatrixOut, i, j, matrixIn.Element( j, i ) );
+			}
+		}
+		FinishedAppending( *pMatrixOut );
+	}
+
+	/// copy
+	template<class MATRIXCLASSIN, class MATRIXCLASSOUT>
+	void CopyMatrix( MATRIXCLASSIN const &matrixIn, MATRIXCLASSOUT *pMatrixOut )
+	{
+		pMatrixOut->SetDimensions( matrixIn.Height(), matrixIn.Width() );
+		for( int i = 0; i < matrixIn.Height(); i++ )
+		{
+			for( int j = 0; j < matrixIn.Width(); j++ )
+			{
+				AppendElement( *pMatrixOut, i, j, matrixIn.Element( i, j ) );
+			}
+		}
+		FinishedAppending( *pMatrixOut );
+	}
+
+
+
+	/// M+=M
+	template<class MATRIXCLASSIN, class MATRIXCLASSOUT>
+	void AddMatrixToMatrix( MATRIXCLASSIN const &matrixIn, MATRIXCLASSOUT *pMatrixOut )
+	{
+		for( int i = 0; i < matrixIn.Height(); i++ )
+		{
+			for( int j = 0; j < matrixIn.Width(); j++ )
+			{
+				pMatrixOut->SetElement( i, j, pMatrixOut->Element( i, j ) + matrixIn.Element( i, j ) );
+			}
+		}
+	}
+
+	// M += scale * M
+	template<class MATRIXCLASSIN, class MATRIXCLASSOUT>
+	void AddScaledMatrixToMatrix( float flScale, MATRIXCLASSIN const &matrixIn, MATRIXCLASSOUT *pMatrixOut )
+	{
+		for( int i = 0; i < matrixIn.Height(); i++ )
+		{
+			for( int j = 0; j < matrixIn.Width(); j++ )
+			{
+				pMatrixOut->SetElement( i, j, pMatrixOut->Element( i, j ) + flScale * matrixIn.Element( i, j ) );
+			}
+		}
+	}
+
+
+	// Fill an already-sized matrix with flDiagonalValue on the diagonal and 0 elsewhere (identity by default).
+	template<class MATRIXCLASSOUT> 
+	void SetMatrixToIdentity( MATRIXCLASSOUT *pMatrixOut, float flDiagonalValue = 1.0 )
+	{
+		for( int i = 0; i < pMatrixOut->Height(); i++ )
+		{
+			for( int j = 0; j < pMatrixOut->Width(); j++ )
+			{
+				AppendElement( *pMatrixOut, i, j, ( i == j ) ? flDiagonalValue : 0 );
+			}
+		}
+		FinishedAppending( *pMatrixOut );
+	}
+
+	//// simple way to initialize a matrix with constants from code
+	template<class MATRIXCLASSOUT> 
+	void SetMatrixValues( MATRIXCLASSOUT *pMatrix, int nRows, int nCols, ... )
+	{
+		va_list argPtr;
+		va_start( argPtr, nCols );
+
+		pMatrix->SetDimensions( nRows, nCols );
+		for( int nRow = 0; nRow < nRows; nRow++ )
+		{
+			for( int nCol = 0; nCol < nCols; nCol++ )
+			{
+				double flNewValue = va_arg( argPtr, double );
+				pMatrix->SetElement( nRow, nCol, flNewValue );
+			}
+		}
+		va_end( argPtr );
+	}
+
+
+	/// Row and column accessors: treat a row or a column as a column vector.
+	template<class MATRIXTYPE> class MatrixRowAccessor
+	{
+	public:
+		FORCEINLINE MatrixRowAccessor( MATRIXTYPE const &matrix, int nRow )
+		{
+			m_pMatrix = &matrix;
+			m_nRow = nRow;
+		}
+
+		FORCEINLINE float Element( int nRow, int nCol ) const
+		{
+			Assert( nCol == 0 );
+			return m_pMatrix->Element( m_nRow, nRow );
+		}
+
+		FORCEINLINE int Width( void ) const { return 1; };
+		FORCEINLINE int Height( void ) const { return m_pMatrix->Width(); }
+
+	private:
+		MATRIXTYPE const *m_pMatrix;
+		int m_nRow;
+	};
+
+	template<class MATRIXTYPE> class MatrixColumnAccessor
+	{
+	public:
+		FORCEINLINE MatrixColumnAccessor( MATRIXTYPE const &matrix, int nColumn )
+		{
+			m_pMatrix = &matrix;
+			m_nColumn = nColumn;
+		}
+
+		FORCEINLINE float Element( int nRow, int nColumn ) const
+		{
+			Assert( nColumn == 0 );
+			return m_pMatrix->Element( nRow, m_nColumn );
+		}
+
+		FORCEINLINE int Width( void ) const { return 1; }
+		FORCEINLINE int Height( void ) const { return m_pMatrix->Height(); }
+	private:
+		MATRIXTYPE const *m_pMatrix;
+		int m_nColumn;
+	};
+
+	/// this translator acts as a proxy for the transposed matrix
+	template<class MATRIXTYPE> class MatrixTransposeAccessor
+	{
+	public:
+		FORCEINLINE MatrixTransposeAccessor( MATRIXTYPE const & matrix )
+		{
+			m_pMatrix = &matrix;
+		}
+
+		FORCEINLINE float Element( int nRow, int nColumn ) const
+		{
+			return m_pMatrix->Element( nColumn, nRow );
+		}
+
+		FORCEINLINE int Width( void ) const { return m_pMatrix->Height(); }
+		FORCEINLINE int Height( void ) const { return m_pMatrix->Width(); }
+	private:
+		MATRIXTYPE const *m_pMatrix;
+	};
+
+	/// this transpose returns a wrapper around its argument, allowing things like AddMatrixToMatrix( TransposeMatrix( matA ), &matB ) without an extra copy
+	template<class MATRIXCLASSIN>
+	MatrixTransposeAccessor<MATRIXCLASSIN> TransposeMatrix( MATRIXCLASSIN const &matrixIn )
+	{
+		return MatrixTransposeAccessor<MATRIXCLASSIN>( matrixIn );
+	}
+
+
+	/// retrieve rows and columns
+	template<class MATRIXTYPE>
+	FORCEINLINE MatrixColumnAccessor<MATRIXTYPE> MatrixColumn( MATRIXTYPE const &matrix, int nColumn )
+	{
+		return MatrixColumnAccessor<MATRIXTYPE>( matrix, nColumn );
+	}
+
+	template<class MATRIXTYPE>
+	FORCEINLINE MatrixRowAccessor<MATRIXTYPE> MatrixRow( MATRIXTYPE const &matrix, int nRow )
+	{
+		return MatrixRowAccessor<MATRIXTYPE>( matrix, nRow );
+	}
+
+	//// dot product between vectors (or rows and/or columns via accessors)
+	template<class MATRIXACCESSORATYPE, class MATRIXACCESSORBTYPE >
+	float InnerProduct( MATRIXACCESSORATYPE const &vecA, MATRIXACCESSORBTYPE const &vecB )
+	{
+		Assert( vecA.Width() == 1 );
+		Assert( vecB.Width() == 1 );
+		Assert( vecA.Height() == vecB.Height() );
+		double flResult = 0;
+		for( int i = 0; i < vecA.Height(); i++ )
+		{
+			flResult += vecA.Element( i, 0 ) * vecB.Element( i, 0 );
+		}
+		return flResult;
+	}
+
+
+
+	/// matrix x matrix multiplication
+	template<class MATRIXATYPE, class MATRIXBTYPE, class MATRIXOUTTYPE>
+	void MatrixMultiply( MATRIXATYPE const &matA, MATRIXBTYPE const &matB, MATRIXOUTTYPE *pMatrixOut )
+	{
+		Assert( matA.Width() == matB.Height() );
+		pMatrixOut->SetDimensions( matA.Height(), matB.Width() );
+		for( int i = 0; i < matA.Height(); i++ )
+		{
+			for( int j = 0; j < matB.Width(); j++ )
+			{
+				pMatrixOut->SetElement( i, j, InnerProduct( MatrixRow( matA, i ), MatrixColumn( matB, j ) ) );
+			}
+		}
+	}
+
+	/// solve Ax=B via the conjugate gradient method. Code and naming conventions based on the
+	/// wikipedia article. vecX supplies the initial guess and is refined in place (at most 100 iterations).
+	template<class ATYPE, class XTYPE, class BTYPE>
+	void ConjugateGradient( ATYPE const &matA, BTYPE const &vecB, XTYPE &vecX, float flTolerance = 1.0e-20 )
+	{
+		XTYPE vecR;
+		vecR.SetDimensions( vecX.Height(), 1 );
+		MatrixMultiply( matA, vecX, &vecR );
+		ScaleMatrix( vecR, -1 );
+		AddMatrixToMatrix( vecB, &vecR );
+		XTYPE vecP;
+		CopyMatrix( vecR, &vecP );
+		float flRsOld = InnerProduct( vecR, vecR );
+		for( int nIter = 0; nIter < 100; nIter++ )
+		{
+			XTYPE vecAp;
+			MatrixMultiply( matA, vecP, &vecAp );
+			float flDivisor = InnerProduct( vecAp, vecP );
+			float flAlpha = ( flDivisor != 0.0f ) ? ( flRsOld / flDivisor ) : 0.0f;	// take a zero step instead of 0/0 = NaN when vecP is zero (guess already exact)
+			AddScaledMatrixToMatrix( flAlpha, vecP, &vecX );
+			AddScaledMatrixToMatrix( -flAlpha, vecAp, &vecR );
+			float flRsNew = InnerProduct( vecR, vecR );
+			if ( flRsNew < flTolerance )
+			{
+				break;
+			}
+			ScaleMatrix( vecP, flRsNew / flRsOld );
+			AddMatrixToMatrix( vecR, &vecP );
+			flRsOld = flRsNew;
+		}
+	}
+
+	/// solve (A'*A) x=B via the conjugate gradient method. Code and naming conventions based on
+	/// the wikipedia article. Same as Conjugate gradient but allows passing in two matrices whose
+	/// product is used as the A matrix (in order to preserve sparsity)
+	template<class ATYPE, class APRIMETYPE, class XTYPE, class BTYPE>
+	void ConjugateGradient( ATYPE const &matA, APRIMETYPE const &matAPrime, BTYPE const &vecB, XTYPE &vecX, float flTolerance = 1.0e-20 )
+	{
+		XTYPE vecR1;
+		vecR1.SetDimensions( vecX.Height(), 1 );
+		MatrixMultiply( matA, vecX, &vecR1 );
+		XTYPE vecR;
+		vecR.SetDimensions( vecR1.Height(), 1 );
+		MatrixMultiply( matAPrime, vecR1, &vecR );
+		ScaleMatrix( vecR, -1 );
+		AddMatrixToMatrix( vecB, &vecR );
+		XTYPE vecP;
+		CopyMatrix( vecR, &vecP );
+		float flRsOld = InnerProduct( vecR, vecR );
+		for( int nIter = 0; nIter < 100; nIter++ )
+		{
+			XTYPE vecAp1;
+			MatrixMultiply( matA, vecP, &vecAp1 );
+			XTYPE vecAp;
+			MatrixMultiply( matAPrime, vecAp1, &vecAp );
+			float flDivisor = InnerProduct( vecAp, vecP );
+			float flAlpha = ( flDivisor != 0.0f ) ? ( flRsOld / flDivisor ) : 0.0f;	// take a zero step instead of 0/0 = NaN when vecP is zero (guess already exact)
+			AddScaledMatrixToMatrix( flAlpha, vecP, &vecX );
+			AddScaledMatrixToMatrix( -flAlpha, vecAp, &vecR );
+			float flRsNew = InnerProduct( vecR, vecR );
+			if ( flRsNew < flTolerance )
+			{
+				break;
+			}
+			ScaleMatrix( vecP, flRsNew / flRsOld );
+			AddMatrixToMatrix( vecR, &vecP );
+			flRsOld = flRsNew;
+		}
+	}
+
+	
+	template<class ATYPE,  class XTYPE, class BTYPE>
+	void LeastSquaresFit( ATYPE const &matA, BTYPE const &vecB, XTYPE &vecX )
+	{
+		// now, generate the normal equations
+		BTYPE vecBeta;
+		MatrixMath::MatrixMultiply( MatrixMath::TransposeMatrix( matA ), vecB, &vecBeta );
+
+		vecX.SetDimensions( matA.Width(), 1 );
+		MatrixMath::SetMatrixToIdentity( &vecX );
+
+		ATYPE matATransposed;
+		TransposeMatrix( matA, &matATransposed );
+		ConjugateGradient( matA, matATransposed, vecBeta, vecX, 1.0e-20 );
+	}
+
+};
+
+/// a simple fixed-size matrix class
+template<int NUMROWS, int NUMCOLS> class CFixedMatrix
+{
+public:
+	FORCEINLINE int Width( void ) const { return NUMCOLS; }
+	FORCEINLINE int Height( void ) const { return NUMROWS; }
+	FORCEINLINE float Element( int nRow, int nCol ) const { return m_flValues[nRow][nCol]; }
+	FORCEINLINE void SetElement( int nRow, int nCol, float flValue ) { m_flValues[nRow][nCol] = flValue; }
+	FORCEINLINE void SetDimensions( int nNumRows, int nNumCols ) { Assert( ( nNumRows == NUMROWS ) && ( nNumCols == NUMCOLS ) ); }
+
+private:
+	float m_flValues[NUMROWS][NUMCOLS];
+};
+
+
+
+#endif //matrixmath_h
--- a/public/mathlib/ssemath.h
+++ b/public/mathlib/ssemath.h
@ -1,4 +1,4 @@
-//===== Copyright © 1996-2005, Valve Corporation, All rights reserved. ======//
+//===== Copyright © 1996-2005, Valve Corporation, All rights reserved. ======//
 //
 // Purpose: - defines SIMD "structure of arrays" classes and functions.
 //
@ -15,7 +15,7 @@
 #include <mathlib/vector.h>
 #include <mathlib/mathlib.h>

-#if defined(_LINUX) || defined(__APPLE__)
+#if defined(GNUC)
 #define USE_STDC_FOR_SIMD 0
 #else
 #define USE_STDC_FOR_SIMD 0
@ -108,7 +108,7 @@ struct ALIGN16 intx4
 			m_i32[2] == other.m_i32[2] &&
 			m_i32[3] == other.m_i32[3] 	;
 	}
-};
+} ALIGN16_POST;


 #if defined( _DEBUG ) && defined( _X360 )
@ -136,13 +136,13 @@ FORCEINLINE void TestVPUFlags() {}
 // miss.)
 #ifndef _X360
 extern const fltx4 Four_Zeros;									// 0 0 0 0
-extern const fltx4 Four_Ones;										// 1 1 1 1
-extern const fltx4 Four_Twos;										// 2 2 2 2
+extern const fltx4 Four_Ones;									// 1 1 1 1
+extern const fltx4 Four_Twos;									// 2 2 2 2
 extern const fltx4 Four_Threes;									// 3 3 3 3
 extern const fltx4 Four_Fours;									// guess.
 extern const fltx4 Four_Point225s;								// .225 .225 .225 .225
 extern const fltx4 Four_PointFives;								// .5 .5 .5 .5
-extern const fltx4 Four_Epsilons;									// FLT_EPSILON FLT_EPSILON FLT_EPSILON FLT_EPSILON
+extern const fltx4 Four_Epsilons;								// FLT_EPSILON FLT_EPSILON FLT_EPSILON FLT_EPSILON
 extern const fltx4 Four_2ToThe21s;								// (1<<21)..
 extern const fltx4 Four_2ToThe22s;								// (1<<22)..
 extern const fltx4 Four_2ToThe23s;								// (1<<23)..
@ -157,7 +157,7 @@ extern const fltx4 Four_Threes;									// 3 3 3 3
 extern const fltx4 Four_Fours;									// guess.
 extern const fltx4 Four_Point225s;								// .225 .225 .225 .225
 extern const fltx4 Four_PointFives;								// .5 .5 .5 .5
-extern const fltx4 Four_Epsilons;									// FLT_EPSILON FLT_EPSILON FLT_EPSILON FLT_EPSILON
+extern const fltx4 Four_Epsilons;								// FLT_EPSILON FLT_EPSILON FLT_EPSILON FLT_EPSILON
 extern const fltx4 Four_2ToThe21s;								// (1<<21)..
 extern const fltx4 Four_2ToThe22s;								// (1<<22)..
 extern const fltx4 Four_2ToThe23s;								// (1<<23)..
@ -167,20 +167,20 @@ extern const fltx4 Four_NegativeOnes;							// -1 -1 -1 -1
 #endif
 extern const fltx4 Four_FLT_MAX;								// FLT_MAX, FLT_MAX, FLT_MAX, FLT_MAX
 extern const fltx4 Four_Negative_FLT_MAX;						// -FLT_MAX, -FLT_MAX, -FLT_MAX, -FLT_MAX
-extern const fltx4 g_SIMD_0123;								// 0 1 2 3 as float
+extern const fltx4 g_SIMD_0123;									// 0 1 2 3 as float

 // external aligned integer constants
-extern const ALIGN16 int32 g_SIMD_clear_signmask[];			// 0x7fffffff x 4
-extern const ALIGN16 int32 g_SIMD_signmask[];				// 0x80000000 x 4
-extern const ALIGN16 int32 g_SIMD_lsbmask[];				// 0xfffffffe x 4
-extern const ALIGN16 int32 g_SIMD_clear_wmask[];			// -1 -1 -1 0
-extern const ALIGN16 int32 g_SIMD_ComponentMask[4][4];		// [0xFFFFFFFF 0 0 0], [0 0xFFFFFFFF 0 0], [0 0 0xFFFFFFFF 0], [0 0 0 0xFFFFFFFF]
-extern const ALIGN16 int32 g_SIMD_AllOnesMask[];			// ~0,~0,~0,~0
-extern const ALIGN16 int32 g_SIMD_Low16BitsMask[];			// 0xffff x 4
+extern const ALIGN16 int32 g_SIMD_clear_signmask[] ALIGN16_POST;			// 0x7fffffff x 4
+extern const ALIGN16 int32 g_SIMD_signmask[] ALIGN16_POST;				// 0x80000000 x 4
+extern const ALIGN16 int32 g_SIMD_lsbmask[] ALIGN16_POST;				// 0xfffffffe x 4
+extern const ALIGN16 int32 g_SIMD_clear_wmask[] ALIGN16_POST;			// -1 -1 -1 0
+extern const ALIGN16 int32 g_SIMD_ComponentMask[4][4] ALIGN16_POST;		// [0xFFFFFFFF 0 0 0], [0 0xFFFFFFFF 0 0], [0 0 0xFFFFFFFF 0], [0 0 0 0xFFFFFFFF]
+extern const ALIGN16 int32 g_SIMD_AllOnesMask[] ALIGN16_POST;			// ~0,~0,~0,~0
+extern const ALIGN16 int32 g_SIMD_Low16BitsMask[] ALIGN16_POST;			// 0xffff x 4

 // this mask is used for skipping the tail of things. If you have N elements in an array, and wish
 // to mask out the tail, g_SIMD_SkipTailMask[N & 3] what you want to use for the last iteration.
-extern const int32 ALIGN16 g_SIMD_SkipTailMask[4][4];
+extern const int32 ALIGN16 g_SIMD_SkipTailMask[4][4] ALIGN16_POST;

 // Define prefetch macros.
 // The characteristics of cache and prefetch are completely 
@ -436,23 +436,23 @@ FORCEINLINE fltx4 ArcTan2SIMD( const fltx4 &a, const fltx4 &b )
 	return result;
 }

-FORCEINLINE fltx4 MaxSIMD( const fltx4 & a, const fltx4 & b )				// MAX(a,b)
+FORCEINLINE fltx4 MaxSIMD( const fltx4 & a, const fltx4 & b )				// max(a,b)
 {
 	fltx4 retVal;
-	SubFloat( retVal, 0 ) = MAX( SubFloat( a, 0 ), SubFloat( b, 0 ) );
-	SubFloat( retVal, 1 ) = MAX( SubFloat( a, 1 ), SubFloat( b, 1 ) );
-	SubFloat( retVal, 2 ) = MAX( SubFloat( a, 2 ), SubFloat( b, 2 ) );
-	SubFloat( retVal, 3 ) = MAX( SubFloat( a, 3 ), SubFloat( b, 3 ) );
+	SubFloat( retVal, 0 ) = max( SubFloat( a, 0 ), SubFloat( b, 0 ) );
+	SubFloat( retVal, 1 ) = max( SubFloat( a, 1 ), SubFloat( b, 1 ) );
+	SubFloat( retVal, 2 ) = max( SubFloat( a, 2 ), SubFloat( b, 2 ) );
+	SubFloat( retVal, 3 ) = max( SubFloat( a, 3 ), SubFloat( b, 3 ) );
 	return retVal;
 }

-FORCEINLINE fltx4 MinSIMD( const fltx4 & a, const fltx4 & b )				// MIN(a,b)
+FORCEINLINE fltx4 MinSIMD( const fltx4 & a, const fltx4 & b )				// min(a,b)
 {
 	fltx4 retVal;
-	SubFloat( retVal, 0 ) = MIN( SubFloat( a, 0 ), SubFloat( b, 0 ) );
-	SubFloat( retVal, 1 ) = MIN( SubFloat( a, 1 ), SubFloat( b, 1 ) );
-	SubFloat( retVal, 2 ) = MIN( SubFloat( a, 2 ), SubFloat( b, 2 ) );
-	SubFloat( retVal, 3 ) = MIN( SubFloat( a, 3 ), SubFloat( b, 3 ) );
+	SubFloat( retVal, 0 ) = min( SubFloat( a, 0 ), SubFloat( b, 0 ) );
+	SubFloat( retVal, 1 ) = min( SubFloat( a, 1 ), SubFloat( b, 1 ) );
+	SubFloat( retVal, 2 ) = min( SubFloat( a, 2 ), SubFloat( b, 2 ) );
+	SubFloat( retVal, 3 ) = min( SubFloat( a, 3 ), SubFloat( b, 3 ) );
 	return retVal;
 }

@ -858,7 +858,7 @@ FORCEINLINE void TransposeSIMD( fltx4 & x, fltx4 & y, fltx4 & z, fltx4 & w )
 // and replicate it to the whole return value.
 FORCEINLINE fltx4 FindLowestSIMD3( const fltx4 & a )
 {
-	float lowest = MIN( MIN( SubFloat(a, 0), SubFloat(a, 1) ), SubFloat(a, 2));
+	float lowest = min( min( SubFloat(a, 0), SubFloat(a, 1) ), SubFloat(a, 2));
 	return ReplicateX4(lowest);
 }

@ -866,7 +866,7 @@ FORCEINLINE fltx4 FindLowestSIMD3( const fltx4 & a )
 // and replicate it to the whole return value.
 FORCEINLINE fltx4 FindHighestSIMD3( const fltx4 & a )
 {
-	float highest = MAX( MAX( SubFloat(a, 0), SubFloat(a, 1) ), SubFloat(a, 2));
+	float highest = max( max( SubFloat(a, 0), SubFloat(a, 1) ), SubFloat(a, 2));
 	return ReplicateX4(highest);
 }

@ -1067,12 +1067,12 @@ FORCEINLINE fltx4 ArcTan2SIMD( const fltx4 &a, const fltx4 &b )

 // DivSIMD defined further down, since it uses ReciprocalSIMD

-FORCEINLINE fltx4 MaxSIMD( const fltx4 & a, const fltx4 & b )				// MAX(a,b)
+FORCEINLINE fltx4 MaxSIMD( const fltx4 & a, const fltx4 & b )				// max(a,b)
 {
 	return __vmaxfp( a, b );
 }

-FORCEINLINE fltx4 MinSIMD( const fltx4 & a, const fltx4 & b )				// MIN(a,b)
+FORCEINLINE fltx4 MinSIMD( const fltx4 & a, const fltx4 & b )				// min(a,b)
 {
 	return __vminfp( a, b );
 }
@ -1520,11 +1520,11 @@ FORCEINLINE fltx4 FindLowestSIMD3( const fltx4 & a )
 	compareOne = __vrlimi( compareOne, a, 8 | 4 , 1 );
 	// compareOne is [y,z,G,G]
 	fltx4 retval = MinSIMD( a, compareOne );
-	// retVal is [MIN(x,y), MIN(y,z), G, G]
+	// retVal is [min(x,y), min(y,z), G, G]
 	compareOne = __vrlimi( compareOne, a, 8 , 2);
 	// compareOne is [z, G, G, G]
 	retval = MinSIMD( retval, compareOne );
-	// retVal = [ MIN(MIN(x,y),z), G, G, G ]
+	// retVal = [ min(min(x,y),z), G, G, G ]
 	
 	// splat the x component out to the whole vector and return
 	return SplatXSIMD( retval );
@ -1544,11 +1544,11 @@ FORCEINLINE fltx4 FindHighestSIMD3( const fltx4 & a )
 	compareOne = __vrlimi( compareOne, a, 8 | 4 , 1 );
 	// compareOne is [y,z,G,G]
 	fltx4 retval = MaxSIMD( a, compareOne );
-	// retVal is [MAX(x,y), MAX(y,z), G, G]
+	// retVal is [max(x,y), max(y,z), G, G]
 	compareOne = __vrlimi( compareOne, a, 8 , 2);
 	// compareOne is [z, G, G, G]
 	retval = MaxSIMD( retval, compareOne );
-	// retVal = [ MAX(MAX(x,y),z), G, G, G ]
+	// retVal = [ max(max(x,y),z), G, G, G ]

 	// splat the x component out to the whole vector and return
 	return SplatXSIMD( retval );
@ -1757,7 +1757,7 @@ FORCEINLINE fltx4 AndSIMD( const fltx4 & a, const fltx4 & b )				// a & b
 	return _mm_and_ps( a, b );
 }

-FORCEINLINE fltx4 AndNotSIMD( const fltx4 & a, const fltx4 & b )			// a & ~b
+FORCEINLINE fltx4 AndNotSIMD( const fltx4 & a, const fltx4 & b )			// ~a & b
 {
 	return _mm_andnot_ps( a, b );
 }
@ -1813,7 +1813,7 @@ FORCEINLINE fltx4 ReplicateX4( float flValue )
 FORCEINLINE float SubFloat( const fltx4 & a, int idx )
 {
 	// NOTE: if the output goes into a register, this causes a Load-Hit-Store stall (don't mix fpu/vpu math!)
-#if !defined _LINUX && !defined __APPLE__
+#ifndef POSIX
 	return a.m128_f32[ idx ];
 #else
 	return (reinterpret_cast<float const *>(&a))[idx];
@ -1822,7 +1822,7 @@ FORCEINLINE float SubFloat( const fltx4 & a, int idx )

 FORCEINLINE float & SubFloat( fltx4 & a, int idx )
 {
-#if !defined _LINUX && !defined __APPLE__
+#ifndef POSIX
 	return a.m128_f32[ idx ];
 #else
 	return (reinterpret_cast<float *>(&a))[idx];
@ -1836,7 +1836,7 @@ FORCEINLINE uint32 SubFloatConvertToInt( const fltx4 & a, int idx )

 FORCEINLINE uint32 SubInt( const fltx4 & a, int idx )
 {
-#if !defined _LINUX && !defined __APPLE__
+#ifndef POSIX
 	return a.m128_u32[idx];
 #else
 	return (reinterpret_cast<uint32 const *>(&a))[idx];
@ -1845,7 +1845,7 @@ FORCEINLINE uint32 SubInt( const fltx4 & a, int idx )

 FORCEINLINE uint32 & SubInt( fltx4 & a, int idx )
 {
-#if !defined _LINUX && !defined __APPLE__
+#ifndef POSIX
 	return a.m128_u32[idx];
 #else
 	return (reinterpret_cast<uint32 *>(&a))[idx];
@ -2120,12 +2120,12 @@ FORCEINLINE fltx4 CmpInBoundsSIMD( const fltx4 & a, const fltx4 & b )		// (a <=
 	return AndSIMD( CmpLeSIMD(a,b), CmpGeSIMD(a, NegSIMD(b)) );
 }

-FORCEINLINE fltx4 MinSIMD( const fltx4 & a, const fltx4 & b )				// MIN(a,b)
+FORCEINLINE fltx4 MinSIMD( const fltx4 & a, const fltx4 & b )				// min(a,b)
 {
 	return _mm_min_ps( a, b );
 }

-FORCEINLINE fltx4 MaxSIMD( const fltx4 & a, const fltx4 & b )				// MAX(a,b)
+FORCEINLINE fltx4 MaxSIMD( const fltx4 & a, const fltx4 & b )				// max(a,b)
 {
 	return _mm_max_ps( a, b );
 }
@ -2271,11 +2271,11 @@ FORCEINLINE fltx4 FindLowestSIMD3( const fltx4 &a )
 	fltx4 compareOne = RotateLeft( a );
 	// compareOne is [y,z,G,x]
 	fltx4 retval = MinSIMD( a, compareOne );
-	// retVal is [MIN(x,y), ... ]
+	// retVal is [min(x,y), ... ]
 	compareOne = RotateLeft2( a );
 	// compareOne is [z, G, x, y]
 	retval = MinSIMD( retval, compareOne );
-	// retVal = [ MIN(MIN(x,y),z)..]
+	// retVal = [ min(min(x,y),z)..]
 	// splat the x component out to the whole vector and return
 	return SplatXSIMD( retval );
 	
@ -2288,11 +2288,11 @@ FORCEINLINE fltx4 FindHighestSIMD3( const fltx4 &a )
 	fltx4 compareOne = RotateLeft( a );
 	// compareOne is [y,z,G,x]
 	fltx4 retval = MaxSIMD( a, compareOne );
-	// retVal is [MAX(x,y), ... ]
+	// retVal is [max(x,y), ... ]
 	compareOne = RotateLeft2( a );
 	// compareOne is [z, G, x, y]
 	retval = MaxSIMD( retval, compareOne );
-	// retVal = [ MAX(MAX(x,y),z)..]
+	// retVal = [ max(max(x,y),z)..]
 	// splat the x component out to the whole vector and return
 	return SplatXSIMD( retval );
 	
--- a/public/mathlib/ssequaternion.h
+++ b/public/mathlib/ssequaternion.h
@ -233,7 +233,7 @@ FORCEINLINE fltx4 QuaternionScaleSIMD( const fltx4 &p, float t )
 	// FIXME: nick, this isn't overly sensitive to accuracy, and it may be faster to 
 	// use the cos part (w) of the quaternion (sin(omega)*N,cos(omega)) to figure the new scale.
 	float sinom = sqrt( SubFloat( p, 0 ) * SubFloat( p, 0 ) + SubFloat( p, 1 ) * SubFloat( p, 1 ) + SubFloat( p, 2 ) * SubFloat( p, 2 ) );
-	sinom = MIN( sinom, 1.f );
+	sinom = min( sinom, 1.f );

 	float sinsom = sin( asin( sinom ) * t );

--- a/public/mathlib/vector.h
+++ b/public/mathlib/vector.h
@ -31,6 +31,7 @@
 #include "tier0/threadtools.h"
 #include "mathlib/vector2d.h"
 #include "mathlib/math_pfns.h"
+#include "minmax.h"

 // Uncomment this to add extra Asserts to check for NANs, uninitialized vecs, etc.
 //#define VECTOR_PARANOIA	1
@ -48,7 +49,11 @@
 #ifdef VECTOR_PARANOIA
 #define CHECK_VALID( _v)	Assert( (_v).IsValid() )
 #else
+#ifdef GNUC
 #define CHECK_VALID( _v)
+#else
+#define CHECK_VALID( _v)	0
+#endif
 #endif

 #define VecToString(v)	(static_cast<const char *>(CFmtStr("(%f, %f, %f)", (v).x, (v).y, (v).z))) // ** Note: this generates a temporary, don't hold reference!
@ -129,6 +134,7 @@ public:
 	}

 	vec_t	NormalizeInPlace();
+	Vector	Normalized() const;
 	bool	IsLengthGreaterThan( float val ) const;
 	bool	IsLengthLessThan( float val ) const;

@ -202,6 +208,7 @@ private:
 #endif
 };

+FORCEINLINE void NetworkVarConstruct( Vector &v ) { v.Zero(); }	// Initializes v by calling its Zero() method.

 #if ( ( !defined( _X360 ) ) && ( ! defined( _LINUX) ) )
    #define USE_M64S 1
@ -260,7 +267,7 @@ private:
 	// No assignment operators either...
 //	ShortVector& operator=( ShortVector const& src );

-};
+} ALIGN8_POST;



@ -396,7 +403,7 @@ public:
 	
 #endif
 	float w;	// this space is used anyway
-};
+} ALIGN16_POST;

 //-----------------------------------------------------------------------------
 // Vector related operations
@ -416,7 +423,9 @@ FORCEINLINE void VectorMultiply( const Vector& a, const Vector& b, Vector& resul
 FORCEINLINE void VectorDivide( const Vector& a, vec_t b, Vector& result );
 FORCEINLINE void VectorDivide( const Vector& a, const Vector& b, Vector& result );
 inline void VectorScale ( const Vector& in, vec_t scale, Vector& result );
-inline void VectorMA( const Vector& start, float scale, const Vector& direction, Vector& dest );
+// Don't mark this as inline in its function declaration. That's only necessary on its
+// definition, and 'inline' here leads to gcc warnings.
+void VectorMA( const Vector& start, float scale, const Vector& direction, Vector& dest );

 // Vector equality with tolerance
 bool VectorsAreEqual( const Vector& src1, const Vector& src2, float tolerance = 0.0f );
@ -443,6 +452,31 @@ void VectorMax( const Vector &a, const Vector &b, Vector &result );

 // Linearly interpolate between two vectors
 void VectorLerp(const Vector& src1, const Vector& src2, vec_t t, Vector& dest );
+Vector VectorLerp(const Vector& src1, const Vector& src2, vec_t t );
+
+FORCEINLINE Vector ReplicateToVector( float x )	// Builds a Vector whose three components all equal x.
+{
+	return Vector( x, x, x );
+}
+
+// Check whether vecTargetPosition lies inside the field of view of an observer at vecSrcPosition. Supports up to a 180 degree FOV.
+FORCEINLINE bool PointWithinViewAngle( Vector const &vecSrcPosition, 
+									   Vector const &vecTargetPosition, 
+									   Vector const &vecLookDirection, float flCosHalfFOV )	// assumes vecLookDirection is unit length and flCosHalfFOV = cos( FOV / 2 ) >= 0 -- TODO confirm
+{
+	Vector vecDelta = vecTargetPosition - vecSrcPosition;
+	float cosDiff = DotProduct( vecLookDirection, vecDelta );	// equals |vecDelta| * cos( angle ) when the look direction is normalized
+
+	if ( cosDiff < 0 ) 	// target is more than 90 degrees off the look direction
+		return false;
+
+	float flLen2 = vecDelta.LengthSqr();
+
+	// For a >= 0 (guaranteed by the early-out above) and c >= 0:  a/sqrt(b) > c  ==  a^2 > b * c^2, so no square root is needed.
+	return ( cosDiff * cosDiff > flLen2 * flCosHalfFOV * flCosHalfFOV );
+	
+}
+

 #ifndef VECTOR_NO_SLOW_OPERATIONS

@ -454,6 +488,10 @@ Vector RandomVector( vec_t minVal, vec_t maxVal );

 #endif

+float RandomVectorInUnitSphere( Vector *pVector );
+float RandomVectorInUnitCircle( Vector2D *pVector );
+
+
 //-----------------------------------------------------------------------------
 //
 // Inlined Vector methods
@ -517,9 +555,9 @@ inline void Vector::Init( vec_t ix, vec_t iy, vec_t iz )

 inline void Vector::Random( vec_t minVal, vec_t maxVal )
 {
-	x = minVal + ((float)rand() / (float)RAND_MAX) * (maxVal - minVal);
-	y = minVal + ((float)rand() / (float)RAND_MAX) * (maxVal - minVal);
-	z = minVal + ((float)rand() / (float)RAND_MAX) * (maxVal - minVal);
+	x = minVal + ((float)rand() / VALVE_RAND_MAX) * (maxVal - minVal);	// NOTE(review): VALVE_RAND_MAX is defined outside this diff; if it is smaller than the C library's RAND_MAX, rand() / VALVE_RAND_MAX can exceed 1 and the component can overshoot maxVal -- confirm
+	y = minVal + ((float)rand() / VALVE_RAND_MAX) * (maxVal - minVal);
+	z = minVal + ((float)rand() / VALVE_RAND_MAX) * (maxVal - minVal);
 	CHECK_VALID(*this);
 }

@ -1082,14 +1120,6 @@ inline void VectorScale ( const Vector& in, vec_t scale, Vector& result )
 	VectorMultiply( in, scale, result );
 }

-inline void VectorMA( const Vector& start, float scale, const Vector& direction, Vector& dest )
-{
-	CHECK_VALID(start);
-	CHECK_VALID(direction);
-	dest.x = start.x + scale * direction.x;
-	dest.y = start.y + scale * direction.y;
-	dest.z = start.z + scale * direction.z;
-}

 FORCEINLINE void VectorDivide( const Vector& a, vec_t b, Vector& c )
 {
@ -1131,6 +1161,12 @@ inline void VectorLerp(const Vector& src1, const Vector& src2, vec_t t, Vector&
 	dest.z = src1.z + (src2.z - src1.z) * t;
 }

+inline Vector VectorLerp(const Vector& src1, const Vector& src2, vec_t t )	// By-value convenience wrapper around the out-parameter VectorLerp overload.
+{
+	Vector result;
+	VectorLerp( src1, src2, t, result );	// the four-argument overload fills in result
+	return result;
+}

 //-----------------------------------------------------------------------------
 // Temporary storage for vector results so const Vector& results can be returned
@ -1431,6 +1467,13 @@ inline void VectorMax( const Vector &a, const Vector &b, Vector &result )
 	result.z = fpmax(a.z, b.z);
 }

+inline float ComputeVolume( const Vector &vecMins, const Vector &vecMaxs )	// Volume of the axis-aligned box spanned by vecMins..vecMaxs.
+{
+	Vector vecDelta;
+	VectorSubtract( vecMaxs, vecMins, vecDelta );	// per-axis extents of the box
+	return vecDelta.x * vecDelta.y * vecDelta.z;	// product of the extents; DotProduct( vecDelta, vecDelta ) would be the squared diagonal length, not a volume
+}
+
 // Get a random vector.
 inline Vector RandomVector( float minVal, float maxVal )
 {
@ -1610,7 +1653,7 @@ public:
 	}

 #endif
-};
+} ALIGN16_POST;


 //-----------------------------------------------------------------------------
@ -1643,6 +1686,9 @@ public:

 extern void AngleQuaternion( RadianEuler const &angles, Quaternion &qt );
 extern void QuaternionAngles( Quaternion const &q, RadianEuler &angles );
+
+FORCEINLINE void NetworkVarConstruct( Quaternion &q ) { q.x = q.y = q.z = q.w = 0.0f; }	// Sets all four components to zero (note: w is 0 as well, so this is not the identity rotation).
+
 inline Quaternion::Quaternion(RadianEuler const &angle)
 {
 	AngleQuaternion( angle, *this );
@ -1790,6 +1836,8 @@ private:
 #endif
 };

+FORCEINLINE void NetworkVarConstruct( QAngle &q ) { q.x = q.y = q.z = 0.0f; }	// Sets all three angle components to zero.
+
 //-----------------------------------------------------------------------------
 // Allows us to specifically pass the vector by value when we need to
 //-----------------------------------------------------------------------------
@ -1853,9 +1901,9 @@ inline void QAngle::Init( vec_t ix, vec_t iy, vec_t iz )

 inline void QAngle::Random( vec_t minVal, vec_t maxVal )
 {
-	x = minVal + ((float)rand() / (float)RAND_MAX) * (maxVal - minVal);
-	y = minVal + ((float)rand() / (float)RAND_MAX) * (maxVal - minVal);
-	z = minVal + ((float)rand() / (float)RAND_MAX) * (maxVal - minVal);
+	x = minVal + ((float)rand() / VALVE_RAND_MAX) * (maxVal - minVal);	// NOTE(review): VALVE_RAND_MAX is defined outside this diff; if it is smaller than the C library's RAND_MAX, rand() / VALVE_RAND_MAX can exceed 1 and the component can overshoot maxVal -- confirm
+	y = minVal + ((float)rand() / VALVE_RAND_MAX) * (maxVal - minVal);
+	z = minVal + ((float)rand() / VALVE_RAND_MAX) * (maxVal - minVal);
 	CHECK_VALID(*this);
 }

@ -2128,11 +2176,16 @@ inline void AngularImpulseToQAngle( const AngularImpulse &impulse, QAngle &angle
 }

 #if !defined( _X360 )
-extern float (*pfInvRSquared)( const float *v );

 FORCEINLINE vec_t InvRSquared( float const *v )
 {
-	return (*pfInvRSquared)(v);
+#if defined(__i386__) || defined(_M_IX86)	// 32-bit x86 builds use SSE scalar intrinsics
+	float sqrlen = v[0]*v[0]+v[1]*v[1]+v[2]*v[2] + 1.0e-10f, result;	// squared length of v[0..2] plus a tiny epsilon
+	_mm_store_ss(&result, _mm_rcp_ss( _mm_max_ss( _mm_set_ss(1.0f), _mm_load_ss(&sqrlen) ) ));	// reciprocal estimate of max( 1, sqrlen ); _mm_rcp_ss is an approximation, not an exact divide
+	return result;
+#else	// every other target: plain scalar math
+	return 1.f/fpmax(1.f, v[0]*v[0]+v[1]*v[1]+v[2]*v[2]);	// 1 / max( 1, |v|^2 ), so the result never exceeds 1
+#endif
 }

 FORCEINLINE vec_t InvRSquared( const Vector &v )
@ -2140,36 +2193,63 @@ FORCEINLINE vec_t InvRSquared( const Vector &v )
 	return InvRSquared(&v.x);
 }

-#else
-
-// call directly
-FORCEINLINE float _VMX_InvRSquared( const Vector &v )
+#if defined(__i386__) || defined(_M_IX86)
+inline void _SSE_RSqrtInline( float a, float* out )	// Writes a refined approximation of 1 / sqrt( a ) to *out.
 {
-	XMVECTOR xmV = XMVector3ReciprocalLength( XMLoadVector3( v.Base() ) );
-	xmV = XMVector3Dot( xmV, xmV );
-	return xmV.x;
+	__m128  xx = _mm_load_ss( &a );
+	__m128  xr = _mm_rsqrt_ss( xx );	// initial reciprocal-square-root estimate
+	__m128  xt;
+	xt = _mm_mul_ss( xr, xr );	// the next five lines apply one Newton-Raphson step: xr = xr * 0.5 * ( 3 - a * xr * xr )
+	xt = _mm_mul_ss( xt, xx );
+	xt = _mm_sub_ss( _mm_set_ss(3.f), xt );
+	xt = _mm_mul_ss( xt, _mm_set_ss(0.5f) );
+	xr = _mm_mul_ss( xr, xt );
+	_mm_store_ss( out, xr );
 }
-
-#define InvRSquared(x) _VMX_InvRSquared(x)
-
-#endif // _X360
-
-#if !defined( _X360 )
-extern float (FASTCALL *pfVectorNormalize)(Vector& v);
+#endif

 // FIXME: Change this back to a #define once we get rid of the vec_t version
-FORCEINLINE float VectorNormalize( Vector& v )
+FORCEINLINE float VectorNormalize( Vector& vec )	// Normalizes vec in place; the SSE path returns (approximately) the original length.
 {
-	return (*pfVectorNormalize)(v);
+#ifndef DEBUG // stop crashing my edit-and-continue!
+	#if defined(__i386__) || defined(_M_IX86)
+		#define DO_SSE_OPTIMIZATION	// NOTE: not #undef'd in this function, so it stays defined for the rest of the including translation unit
+	#endif
+#endif
+
+#if defined( DO_SSE_OPTIMIZATION )
+	float sqrlen = vec.LengthSqr() + 1.0e-10f, invlen;	// epsilon presumably keeps the reciprocal sqrt finite for a zero-length vector
+	_SSE_RSqrtInline(sqrlen, &invlen);
+	vec.x *= invlen;
+	vec.y *= invlen;
+	vec.z *= invlen;
+	return sqrlen * invlen;	// sqrlen / sqrt( sqrlen ) == sqrt( sqrlen ), i.e. the pre-normalization length
+#else
+	extern float (FASTCALL *pfVectorNormalize)(Vector& v);	// function pointer defined elsewhere; its return value is passed through unchanged
+	return (*pfVectorNormalize)(vec);
+#endif
 }
+
 // FIXME: Obsolete version of VectorNormalize, once we remove all the friggin float*s
 FORCEINLINE float VectorNormalize( float * v )
 {
 	return VectorNormalize(*(reinterpret_cast<Vector *>(v)));
 }

+FORCEINLINE void VectorNormalizeFast( Vector &vec )	// Normalizes vec in place; this variant simply forwards to VectorNormalize.
+{
+	VectorNormalize(vec);	// the returned length is discarded
+}
+
 #else

+FORCEINLINE float _VMX_InvRSquared( const Vector &v )	// Xbox 360 branch implementation that the InvRSquared macro expands to.
+{
+	XMVECTOR xmV = XMVector3ReciprocalLength( XMLoadVector3( v.Base() ) );
+	xmV = XMVector3Dot( xmV, xmV );	// NOTE(review): if XMVector3ReciprocalLength replicates 1/|v| into every lane, this 3-component dot yields 3/|v|^2 rather than 1/|v|^2 -- confirm against the XNAMath/DirectXMath docs
+	return xmV.x;
+}
+
 // call directly
 FORCEINLINE float _VMX_VectorNormalize( Vector &vec )
 {
@ -2180,6 +2260,9 @@ FORCEINLINE float _VMX_VectorNormalize( Vector &vec )
 	vec.z *= den;
 	return mag;
 }
+
+#define InvRSquared(x) _VMX_InvRSquared(x)
+
 // FIXME: Change this back to a #define once we get rid of the vec_t version
 FORCEINLINE float VectorNormalize( Vector& v )
 {
@ -2191,18 +2274,6 @@ FORCEINLINE float VectorNormalize( float *pV )
 	return _VMX_VectorNormalize(*(reinterpret_cast<Vector*>(pV)));
 }

-#endif // _X360
-
-#if !defined( _X360 )
-extern void (FASTCALL *pfVectorNormalizeFast)(Vector& v);
-
-FORCEINLINE void VectorNormalizeFast( Vector& v )
-{
-	(*pfVectorNormalizeFast)(v);
-}
-
-#else
-
 // call directly
 FORCEINLINE void VectorNormalizeFast( Vector &vec )
 {
@ -2215,11 +2286,19 @@ FORCEINLINE void VectorNormalizeFast( Vector &vec )

 #endif // _X360

+
 inline vec_t Vector::NormalizeInPlace()
 {
 	return VectorNormalize( *this );
 }

+inline Vector Vector::Normalized() const	// Returns a normalized copy; *this is left unmodified.
+{
+	Vector norm = *this;
+	VectorNormalize( norm );	// normalizes the copy in place; the returned length is ignored
+	return norm;
+}
+
 inline bool Vector::IsLengthGreaterThan( float val ) const
 {
 	return LengthSqr() > val*val;
--- a/public/mathlib/vector2d.h
+++ b/public/mathlib/vector2d.h
@ -239,8 +239,8 @@ inline void Vector2D::Init( vec_t ix, vec_t iy )

 inline void Vector2D::Random( float minVal, float maxVal )
 {
-	x = minVal + ((float)rand() / (float)RAND_MAX) * (maxVal - minVal);
-	y = minVal + ((float)rand() / (float)RAND_MAX) * (maxVal - minVal);
+	x = minVal + ((float)rand() / VALVE_RAND_MAX) * (maxVal - minVal);	// NOTE(review): VALVE_RAND_MAX is defined outside this diff; if it is smaller than the C library's RAND_MAX, rand() / VALVE_RAND_MAX can exceed 1 and the component can overshoot maxVal -- confirm
+	y = minVal + ((float)rand() / VALVE_RAND_MAX) * (maxVal - minVal);
 }

 inline void Vector2DClear( Vector2D& a )
--- a/public/mathlib/vector4d.h
+++ b/public/mathlib/vector4d.h
@ -132,11 +132,7 @@ const Vector4D vec4_invalid( FLT_MAX, FLT_MAX, FLT_MAX, FLT_MAX );
 // SSE optimized routines
 //-----------------------------------------------------------------------------

-#ifdef _WIN32
-class __declspec(align(16)) Vector4DAligned : public Vector4D
-#elif defined _LINUX || defined __APPLE__
-class __attribute__((aligned(16))) Vector4DAligned : public Vector4D
-#endif
+class ALIGN16 Vector4DAligned : public Vector4D
 {
 public:
 	Vector4DAligned(void) {}
@ -154,7 +150,7 @@ private:

 	// No assignment operators either...
 	Vector4DAligned& operator=( Vector4DAligned const& src );
-};
+} ALIGN16_POST;

 //-----------------------------------------------------------------------------
 // Vector4D related operations
@ -249,10 +245,10 @@ inline void Vector4D::Init( vec_t ix, vec_t iy, vec_t iz, vec_t iw )

 inline void Vector4D::Random( vec_t minVal, vec_t maxVal )
 {
-	x = minVal + ((vec_t)rand() / (float)RAND_MAX) * (maxVal - minVal);
-	y = minVal + ((vec_t)rand() / (float)RAND_MAX) * (maxVal - minVal);
-	z = minVal + ((vec_t)rand() / (float)RAND_MAX) * (maxVal - minVal);
-	w = minVal + ((vec_t)rand() / (float)RAND_MAX) * (maxVal - minVal);
+	x = minVal + ((vec_t)rand() / VALVE_RAND_MAX) * (maxVal - minVal);	// NOTE(review): VALVE_RAND_MAX is defined outside this diff; if it is smaller than the C library's RAND_MAX, rand() / VALVE_RAND_MAX can exceed 1 and the component can overshoot maxVal -- confirm
+	y = minVal + ((vec_t)rand() / VALVE_RAND_MAX) * (maxVal - minVal);
+	z = minVal + ((vec_t)rand() / VALVE_RAND_MAX) * (maxVal - minVal);
+	w = minVal + ((vec_t)rand() / VALVE_RAND_MAX) * (maxVal - minVal);
 }

 inline void Vector4DClear( Vector4D& a )
--- a/public/mathlib/vmatrix.h
+++ b/public/mathlib/vmatrix.h
@ -54,6 +54,7 @@ public:
 	// Creates a matrix where the X axis = forward
 	// the Y axis = left, and the Z axis = up
 	VMatrix( const Vector& forward, const Vector& left, const Vector& up );
+	VMatrix( const Vector& forward, const Vector& left, const Vector& up, const Vector& translation );
 	
 	// Construct from a 3x4 matrix
 	VMatrix( const matrix3x4_t& matrix3x4 );
@ -106,7 +107,6 @@ public:
 	void		PreTranslate(const Vector &vTrans);
 	void		PostTranslate(const Vector &vTrans);

-	matrix3x4_t& As3x4();
 	const matrix3x4_t& As3x4() const;
 	void		CopyFrom3x4( const matrix3x4_t &m3x4 );
 	void		Set3x4( matrix3x4_t& matrix3x4 ) const;
@ -199,6 +199,9 @@ public:
 	// Setup a matrix for origin and angles.
 	void		SetupMatrixOrgAngles( const Vector &origin, const QAngle &vAngles );
 	
+	// Setup a matrix for angles and no translation.
+	void		SetupMatrixAngles( const QAngle &vAngles );
+
 	// General inverse. This may fail so check the return!
 	bool		InverseGeneral(VMatrix &vInverse) const;
 	
@ -457,6 +460,16 @@ inline VMatrix::VMatrix( const Vector& xAxis, const Vector& yAxis, const Vector&
 		);
 }

+inline VMatrix::VMatrix( const Vector& xAxis, const Vector& yAxis, const Vector& zAxis, const Vector& translation )	// Builds a matrix whose first three columns are the axes and whose fourth column is the translation.
+{
+	Init(
+		xAxis.x, yAxis.x, zAxis.x, translation.x,
+		xAxis.y, yAxis.y, zAxis.y, translation.y,
+		xAxis.z, yAxis.z, zAxis.z, translation.z,
+		0.0f, 0.0f, 0.0f, 1.0f	// bottom row is fixed at ( 0, 0, 0, 1 )
+		);
+}
+

 inline void VMatrix::Init(
 	vec_t m00, vec_t m01, vec_t m02, vec_t m03,
@ -616,11 +629,6 @@ inline const matrix3x4_t& VMatrix::As3x4() const
 	return *((const matrix3x4_t*)this);
 }

-inline matrix3x4_t& VMatrix::As3x4()
-{
-	return *((matrix3x4_t*)this);
-}
-
 inline void VMatrix::CopyFrom3x4( const matrix3x4_t &m3x4 )
 {
 	memcpy( m, m3x4.Base(), sizeof( matrix3x4_t ) );
--- a/public/minmax.h
+++ b/public/minmax.h
@ -0,0 +1,18 @@
+//========= Copyright Valve Corporation, All rights reserved. ============//
+//
+// Purpose: Defines the V_min and V_max comparison macros.
+//
+// $NoKeywords: $
+//=============================================================================//
+
+#ifndef MINMAX_H
+#define MINMAX_H
+
+#ifndef V_min
+#define V_min(a,b)  (((a) < (b)) ? (a) : (b))	// NOTE: the selected argument is evaluated twice, so avoid arguments with side effects
+#endif
+#ifndef V_max
+#define V_max(a,b)  (((a) > (b)) ? (a) : (b))	// NOTE: the selected argument is evaluated twice, so avoid arguments with side effects
+#endif
+
+#endif // MINMAX_H
--- a/public/tier0/basetypes.h
+++ b/public/tier0/basetypes.h
@ -109,16 +109,38 @@ FORCEINLINE float fpmax( float a, float b )
 #endif

 #ifdef __cplusplus
-	template< class T >
-	inline T clamp( T const &val, T const &minVal, T const &maxVal )
-	{
-		if( val < minVal )
-			return minVal;
-		else if( val > maxVal )
-			return maxVal;
-		else
-			return val;
-	}
+
+// This is the preferred clamp operator. Using the clamp macro can lead to
+// unexpected side-effects or more expensive code. Even the clamp (all
+// lower-case) function can generate more expensive code because of the
+// mixed types involved.
+template< class T >
+T Clamp( T const &val, T const &minVal, T const &maxVal )	// Returns val limited to [minVal, maxVal]; all three arguments share the single type T and the result is returned by value.
+{
+	if( val < minVal )	// the lower bound is tested before the upper bound
+		return minVal;
+	else if( val > maxVal )
+		return maxVal;
+	else
+		return val;
+}
+
+// This is the preferred Min operator. Using the MIN macro can lead to unexpected
+// side-effects or more expensive code.
+template< class T >
+T Min( T const &val1, T const &val2 )	// Returns the smaller argument by value; both arguments must have the same type T.
+{
+	return val1 < val2 ? val1 : val2;	// val2 is returned when the two compare equal
+}
+
+// This is the preferred Max operator. Using the MAX macro can lead to unexpected
+// side-effects or more expensive code.
+template< class T >
+T Max( T const &val1, T const &val2 )	// Returns the larger argument by value; both arguments must have the same type T.
+{
+	return val1 > val2 ? val1 : val2;	// val2 is returned when the two compare equal
+}
+
 #endif

 #ifndef FALSE
@ -247,7 +269,7 @@ struct colorVec


 #ifndef NOTE_UNUSED
-#define NOTE_UNUSED(x)	(x = x)	// for pesky compiler / lint warnings
+#define NOTE_UNUSED(x)	(void)(x)	// for pesky compiler / lint warnings
 #endif
 #ifdef __cplusplus

--- a/public/tier0/commonmacros.h
+++ b/public/tier0/commonmacros.h
@ -32,11 +32,12 @@

 #define SETBITS(iBitVector, bits)		((iBitVector) |= (bits))
 #define CLEARBITS(iBitVector, bits)		((iBitVector) &= ~(bits))
-#define FBitSet(iBitVector, bit)		((iBitVector) & (bit))
+#define FBitSet(iBitVector, bits)		((iBitVector) & (bits))

-inline bool IsPowerOfTwo( int value )
+template <typename T>
+inline bool IsPowerOfTwo( T value )	// Tests whether clearing the lowest set bit of value leaves zero.
 {
-	return (value & ( value - 1 )) == 0;
+	return (value & ( value - (T)1 )) == (T)0;	// NOTE: also returns true for value == 0, which is not a power of two
 }

 #define CONST_INTEGER_AS_STRING(x) #x //Wraps the integer in quotes, allowing us to form constant strings with it
--- a/public/tier0/fasttimer.h
+++ b/public/tier0/fasttimer.h
@ -1,4 +1,4 @@
-//========= Copyright © 1996-2005, Valve Corporation, All rights reserved. ============//
+//========= Copyright © 1996-2005, Valve Corporation, All rights reserved. ============//
 //
 // Purpose: 
 //
@ -11,11 +11,14 @@
 #pragma once
 #endif

+#ifdef _WIN32
+#include <intrin.h>
+#endif
+
 #include <assert.h>
 #include "tier0/platform.h"

-PLATFORM_INTERFACE int64 g_ClockSpeed;
-PLATFORM_INTERFACE uint32_t g_dwClockSpeed;
+PLATFORM_INTERFACE uint64 g_ClockSpeed;
 #if defined( _X360 ) && defined( _CERT )
 PLATFORM_INTERFACE uint32_t g_dwFakeFastCounter;
 #endif
@ -30,20 +33,20 @@ friend class CFastTimer;

 public:
 					CCycleCount();
-					CCycleCount( int64 cycles );
+					CCycleCount( uint64 cycles );

 	void			Sample();	// Sample the clock. This takes about 34 clocks to execute (or 26,000 calls per millisecond on a P900).

 	void			Init();		// Set to zero.
 	void			Init( float initTimeMsec );
 	void			Init( double initTimeMsec )		{ Init( (float)initTimeMsec ); }
-	void			Init( int64 cycles );
+	void			Init( uint64 cycles );
 	bool			IsLessThan( CCycleCount const &other ) const;					// Compare two counts.

 	// Convert to other time representations. These functions are slow, so it's preferable to call them
 	// during display rather than inside a timing block.
 	uint32_t	GetCycles()  const;
-	int64			GetLongCycles() const;
+	uint64			GetLongCycles() const;

 	uint32_t	GetMicroseconds() const;
 	uint64			GetUlMicroseconds() const;
@ -63,12 +66,12 @@ public:
 	// dest = rSrc1 - rSrc2
 	static void		Sub( CCycleCount const &rSrc1, CCycleCount const &rSrc2, CCycleCount &dest );	// Add two samples together.

-	static int64	GetTimestamp();
+	static uint64	GetTimestamp();

-	int64			m_Int64;
+	uint64			m_Int64;
 };

-class CClockSpeedInit
+class PLATFORM_CLASS CClockSpeedInit
 {
 public:
 	CClockSpeedInit()
@ -76,21 +79,7 @@ public:
 		Init();
 	}

-	static void Init()
-	{
-#if defined( _X360 ) && !defined( _CERT )
-		PMCStart();
-		PMCInitIntervalTimer( 0 );
-#endif
-		const CPUInformation& pi = GetCPUInformation();
-
-		g_ClockSpeed = pi.m_Speed;
-		g_dwClockSpeed = (uint32_t)g_ClockSpeed;
-
-		g_ClockSpeedMicrosecondsMultiplier = 1000000.0 / (double)g_ClockSpeed;
-		g_ClockSpeedMillisecondsMultiplier = 1000.0 / (double)g_ClockSpeed;
-		g_ClockSpeedSecondsMultiplier = 1.0f / (double)g_ClockSpeed;
-	}
+	static void Init();
 };

 class CFastTimer
@ -104,7 +93,7 @@ public:
 	CCycleCount 		GetDurationInProgress() const; // Call without ending. Not that cheap.

 	// Return number of cycles per second on this processor.
-	static inline uint32_t GetClockSpeed();
+	static inline int64	GetClockSpeed();

 private:
 	CCycleCount	m_Duration;
@ -233,8 +222,6 @@ private:
 	unsigned	m_nIters;
 	CCycleCount m_Total;
 	CCycleCount	m_Peak;
-//	bool		m_fReport;
-//	const tchar *m_pszName;
 };

 // -------------------------------------------------------------------------- // 
@ -257,87 +244,37 @@ private:

 inline CCycleCount::CCycleCount()
 {
-	Init( (int64)0 );
+	Init( (uint64)0 );
 }

-inline CCycleCount::CCycleCount( int64 cycles )
+inline CCycleCount::CCycleCount( uint64 cycles )
 {
 	Init( cycles );
 }

 inline void CCycleCount::Init()
 {
-	Init( (int64)0 );
+	Init( (uint64)0 );
 }

 inline void CCycleCount::Init( float initTimeMsec )
 {
 	if ( g_ClockSpeedMillisecondsMultiplier > 0 )
-		Init( (int64)(initTimeMsec / g_ClockSpeedMillisecondsMultiplier) );
+		Init( (uint64)(initTimeMsec / g_ClockSpeedMillisecondsMultiplier) );	// NOTE(review): with the switch to uint64, a negative initTimeMsec converts a negative floating value to an unsigned type, which is undefined behavior in C++ -- confirm callers never pass negative times
 	else
-		Init( (int64)0 );
+		Init( (uint64)0 );	// multiplier not initialized yet (or not positive): fall back to a zero cycle count
 }

-inline void CCycleCount::Init( int64 cycles )
+inline void CCycleCount::Init( uint64 cycles )
 {
 	m_Int64 = cycles;
 }

-#ifdef _MSC_VER
-#pragma warning(push)
-#pragma warning(disable : 4189) // warning C4189: local variable is initialized but not referenced
-#endif
-
 inline void CCycleCount::Sample()
 {
-#if defined( _X360 )
-#if !defined( _CERT )
-	// read the highest resolution timer directly (ticks at native 3.2GHz), bypassing any calls into PMC
-	// can only resolve 32 bits, rollover is ~1.32 secs
-	// based on PMCGetIntervalTimer() from the April 2007 XDK
-	int64 temp;
-	__asm 
-	{
-		lis		r11,08FFFh
-		ld		r11,011E0h(r11)
-		rldicl	r11,r11,32,32
-		// unforunate can't get the inline assembler to write directly into desired target
-		std		r11,temp
-	}
-	m_Int64 = temp;
-#else
-	m_Int64 = ++g_dwFakeFastCounter;
-#endif
-#elif defined( _WIN32 ) && !defined( _WIN64 )
-	uint32_t* pSample = (uint32_t *)&m_Int64;
-	__asm
-	{
-		// force the cpu to synchronize the instruction queue
-		// NJS: CPUID can really impact performance in tight loops.
-		//cpuid
-		//cpuid
-		//cpuid
-		mov		ecx, pSample
-		rdtsc
-		mov		[ecx], eax
-		mov		[ecx+4], edx
-	}
-#elif defined( _LINUX )
-	uint32_t* pSample = (uint32_t *)&m_Int64;
-    __asm__ __volatile__ (  
-		"rdtsc\n\t"
-		"movl %%eax,  (%0)\n\t"
-		"movl %%edx, 4(%0)\n\t"
-		: /* no output regs */
-		: "D" (pSample)
-		: "%eax", "%edx" );
-#endif
+	m_Int64 = Plat_Rdtsc();
 }

-#ifdef _MSC_VER
-#pragma warning(pop)
-#endif
-
 inline CCycleCount& CCycleCount::operator+=( CCycleCount const &other )
 {
 	m_Int64 += other.m_Int64;
@ -355,7 +292,7 @@ inline void CCycleCount::Sub( CCycleCount const &rSrc1, CCycleCount const &rSrc2
 	dest.m_Int64 = rSrc1.m_Int64 - rSrc2.m_Int64;
 }

-inline int64 CCycleCount::GetTimestamp()
+inline uint64 CCycleCount::GetTimestamp()
 {
 	CCycleCount c;
 	c.Sample();
@ -373,7 +310,7 @@ inline uint32_t CCycleCount::GetCycles() const
 	return (uint32_t)m_Int64;
 }

-inline int64 CCycleCount::GetLongCycles() const
+inline uint64 CCycleCount::GetLongCycles() const	// returns the stored count (m_Int64) without truncation
 {
 	return m_Int64;
 }
@ -397,7 +334,7 @@ inline double CCycleCount::GetMicrosecondsF() const

 inline void	CCycleCount::SetMicroseconds( uint32_t nMicroseconds )
 {
-	m_Int64 = ((int64)nMicroseconds * g_ClockSpeed) / 1000000;
+	m_Int64 = ((uint64)nMicroseconds * g_ClockSpeed) / 1000000;	// widen before multiplying; treats g_ClockSpeed as cycles per second (TODO confirm)
 }


@ -438,10 +375,10 @@ inline void CFastTimer::End()
 	if ( IsX360() )
 	{
 		// have to handle rollover, hires timer is only accurate to 32 bits
-		// more than one overflow should not have occured, otherwise caller should use a slower timer
+		// more than one overflow should not have occurred, otherwise caller should use a slower timer
 		if ( (uint64)cnt.m_Int64 <= (uint64)m_Duration.m_Int64 )
 		{
-			// rollover occured	
+			// rollover occurred	
 			cnt.m_Int64 += 0x100000000LL;	
 		}
 	}
@ -460,10 +397,10 @@ inline CCycleCount CFastTimer::GetDurationInProgress() const
 	if ( IsX360() )
 	{
 		// have to handle rollover, hires timer is only accurate to 32 bits
-		// more than one overflow should not have occured, otherwise caller should use a slower timer
+		// more than one overflow should not have occurred, otherwise caller should use a slower timer
 		if ( (uint64)cnt.m_Int64 <= (uint64)m_Duration.m_Int64 )
 		{
-			// rollover occured	
+			// rollover occurred	
 			cnt.m_Int64 += 0x100000000LL;	
 		}
 	}
@ -475,9 +412,9 @@ inline CCycleCount CFastTimer::GetDurationInProgress() const
 }


-inline uint32_t CFastTimer::GetClockSpeed()
+inline int64 CFastTimer::GetClockSpeed()	// now returns g_ClockSpeed as int64 instead of g_dwClockSpeed as uint32_t
 {
-	return g_dwClockSpeed;
+	return g_ClockSpeed;
 }


@ -553,15 +490,20 @@ inline CAverageTimeMarker::~CAverageTimeMarker()

 // CLimitTimer
 // Use this to time whether a desired interval of time has passed.  It's extremely fast
-// to check while running.
+// to check while running.  NOTE: CMicroSecOverage() and CMicroSecLeft() are not as fast to check.
 class CLimitTimer
 {
 public:
+	CLimitTimer() : m_lCycleLimit( 0 ) {}	// defined starting limit, so the const queries never read an indeterminate value before SetLimit()
+	CLimitTimer( uint64 cMicroSecDuration ) { SetLimit( cMicroSecDuration ); }	// arms the timer at construction
 	void SetLimit( uint64 m_cMicroSecDuration );
-	bool BLimitReached( void );
+	bool BLimitReached() const;
+
+	int CMicroSecOverage() const;	// microseconds past the limit, 0 if it has not been reached
+	uint64 CMicroSecLeft() const;	// microseconds until the limit, 0 if it has passed

 private:
-	int64 m_lCycleLimit;
+	uint64 m_lCycleLimit;	// cycle count at which the limit expires (sampled count + duration, set by SetLimit)
 };


@ -569,9 +511,9 @@ private:
 // Purpose: Initializes the limit timer with a period of time to measure.
 // Input  : cMicroSecDuration -		How long a time period to measure
 //-----------------------------------------------------------------------------
-inline void CLimitTimer::SetLimit( uint64 m_cMicroSecDuration )
+inline void CLimitTimer::SetLimit( uint64 cMicroSecDuration )
 {
-	int64 dlCycles = ( ( uint64 ) m_cMicroSecDuration * ( int64 ) g_dwClockSpeed ) / ( int64 ) 1000000L;
+	uint64 dlCycles = ( ( uint64 ) cMicroSecDuration * g_ClockSpeed ) / ( uint64 ) 1000000L;
 	CCycleCount cycleCount;
 	cycleCount.Sample( );
 	m_lCycleLimit = cycleCount.GetLongCycles( ) + dlCycles;
@ -582,7 +524,7 @@ inline void CLimitTimer::SetLimit( uint64 m_cMicroSecDuration )
 // Purpose: Determines whether our specified time period has passed
 // Output:	true if at least the specified time period has passed
 //-----------------------------------------------------------------------------
-inline bool CLimitTimer::BLimitReached( )
+inline bool CLimitTimer::BLimitReached() const
 {
 	CCycleCount cycleCount;
 	cycleCount.Sample( );
@ -590,5 +532,38 @@ inline bool CLimitTimer::BLimitReached( )
 }


+//-----------------------------------------------------------------------------
+// Purpose: If we're over our specified time period, return the amount of the overage.
+// Output:	# of microseconds since we reached our specified time period, 0 if we have
+//			not reached it yet.  Saturates at 0x7FFFFFFF instead of wrapping negative.
+//-----------------------------------------------------------------------------
+inline int CLimitTimer::CMicroSecOverage() const
+{
+	CCycleCount cycleCount;
+	cycleCount.Sample();
+	uint64 lcCycles = cycleCount.GetLongCycles();
+
+	if ( lcCycles < m_lCycleLimit )
+		return 0;
+
+	// Convert whole seconds and the sub-second remainder separately; multiplying the raw
+	// cycle delta by 1000000 overflows 64 bits once the delta exceeds 2^64 / 10^6 cycles.
+	uint64 lcCyclesOver = lcCycles - m_lCycleLimit;
+	uint64 lcCyclesPerSec = ( uint64 ) g_ClockSpeed;
+	uint64 cMicroSecOver = ( lcCyclesOver / lcCyclesPerSec ) * ( uint64 ) 1000000L
+		+ ( lcCyclesOver % lcCyclesPerSec ) * ( uint64 ) 1000000L / lcCyclesPerSec;
+
+	// The return type is int, so clamp rather than truncate.
+	if ( cMicroSecOver > ( uint64 ) 0x7FFFFFFF )
+		return 0x7FFFFFFF;
+
+	return ( int ) cMicroSecOver;
+}
+
+
+//-----------------------------------------------------------------------------
+// Purpose: If we're under our specified time period, return the amount under.
+// Output:	# of microseconds until we reach our specified time period, 0 if we've passed it
+//-----------------------------------------------------------------------------
+inline uint64 CLimitTimer::CMicroSecLeft() const
+{
+	CCycleCount cycleCount;
+	cycleCount.Sample();
+	uint64 lcCycles = cycleCount.GetLongCycles();
+
+	if ( lcCycles >= m_lCycleLimit )
+		return 0;
+
+	// Convert whole seconds and the sub-second remainder separately; multiplying the raw
+	// cycle delta by 1000000 overflows 64 bits once the delta exceeds 2^64 / 10^6 cycles.
+	uint64 lcCyclesLeft = m_lCycleLimit - lcCycles;
+	uint64 lcCyclesPerSec = ( uint64 ) g_ClockSpeed;
+	return( ( lcCyclesLeft / lcCyclesPerSec ) * ( uint64 ) 1000000L
+		+ ( lcCyclesLeft % lcCyclesPerSec ) * ( uint64 ) 1000000L / lcCyclesPerSec );
+}
+

 #endif // FASTTIMER_H
--- a/public/tier0/memalloc.h
+++ b/public/tier0/memalloc.h
@ -382,7 +382,7 @@ public:

 	#pragma warning(disable:4290)
 	#pragma warning(push)
-	#include <typeinfo.h>
+	#include <typeinfo>

 	// MEM_DEBUG_CLASSNAME is opt-in.
 	// Note: typeid().name() is not threadsafe, so if the project needs to access it in multiple threads
--- a/public/tier0/platform.h
+++ b/public/tier0/platform.h
@ -44,10 +44,11 @@ typedef uint32_t ThreadId_t;
 // feature enables
 #define NEW_SOFTWARE_LIGHTING

-#if defined(_LINUX) || defined(__APPLE__)
+#ifdef POSIX
 // need this for _alloca
 #include <alloca.h>
-#endif // _LINUX
+#include <time.h>
+#endif

 #if defined __APPLE__
 #include <stdlib.h>
@ -234,6 +235,14 @@ typedef unsigned int uint;
 #define abstract_class class NO_VTABLE
 #endif

+
+// MSVC's CRT defines RAND_MAX as 0x7fff while glibc defines it as INT_MAX, leading to mismatches between platforms
+// As a result, we pick the least common denominator here.  This should be used anywhere
+// you might typically want to use RAND_MAX
+#define VALVE_RAND_MAX 0x7fff
+
+
+
 /*
 FIXME: Enable this when we no longer fear change =)

@ -242,32 +251,32 @@ FIXME: Enable this when we no longer fear change =)
 #include <float.h>

 // Maximum and minimum representable values
-#define  INT8_MAX    SCHAR_MAX
-#define  INT16_MAX   SHRT_MAX
-#define  INT32_MAX   LONG_MAX
-#define  INT64_MAX   (((int64)~0) >> 1)
+#define  INT8_MAX			SCHAR_MAX
+#define  INT16_MAX			SHRT_MAX
+#define  INT32_MAX			LONG_MAX
+#define  INT64_MAX			(((int64)~0) >> 1)

-#define  INT8_MIN    SCHAR_MIN
-#define  INT16_MIN   SHRT_MIN
-#define  INT32_MIN   LONG_MIN
-#define  INT64_MIN   (((int64)1) << 63)
+#define  INT8_MIN			SCHAR_MIN
+#define  INT16_MIN			SHRT_MIN
+#define  INT32_MIN			LONG_MIN
+#define  INT64_MIN			(((int64)1) << 63)

-#define  UINT8_MAX   ((uint8)~0)
-#define  UINT16_MAX  ((uint16)~0)
-#define  UINT32_MAX  ((uint32)~0)
-#define  UINT64_MAX  ((uint64)~0)
+#define  UINT8_MAX			((uint8)~0)
+#define  UINT16_MAX			((uint16)~0)
+#define  UINT32_MAX			((uint32)~0)
+#define  UINT64_MAX			((uint64)~0)

-#define  UINT8_MIN   0
-#define  UINT16_MIN  0
-#define  UINT32_MIN  0
-#define  UINT64_MIN  0
+#define  UINT8_MIN			0
+#define  UINT16_MIN			0
+#define  UINT32_MIN			0
+#define  UINT64_MIN			0

 #ifndef  UINT_MIN
-#define  UINT_MIN    UINT32_MIN
+#define  UINT_MIN			UINT32_MIN
 #endif

-#define  FLOAT32_MAX FLT_MAX
-#define  FLOAT64_MAX DBL_MAX
+#define  FLOAT32_MAX		FLT_MAX
+#define  FLOAT64_MAX		DBL_MAX

 #define  FLOAT32_MIN FLT_MIN
 #define  FLOAT64_MIN DBL_MIN
@ -332,11 +341,35 @@ typedef void * HINSTANCE;
        #define DECL_ALIGN(x) /* */
 #endif

+#ifdef _MSC_VER
+// MSVC has the align at the start of the struct, so the prefix macros carry DECL_ALIGN and the _POST macros are empty
+#define ALIGN4 DECL_ALIGN(4)
 #define ALIGN8 DECL_ALIGN(8)
 #define ALIGN16 DECL_ALIGN(16)
 #define ALIGN32 DECL_ALIGN(32)
 #define ALIGN128 DECL_ALIGN(128)

+#define ALIGN4_POST
+#define ALIGN8_POST
+#define ALIGN16_POST
+#define ALIGN32_POST
+#define ALIGN128_POST
+#elif defined( GNUC )
+// gnuc has the align decoration at the end, so the prefix macros are empty and the _POST macros carry DECL_ALIGN
+#define ALIGN4
+#define ALIGN8
+#define ALIGN16
+#define ALIGN32
+#define ALIGN128
+
+#define ALIGN4_POST DECL_ALIGN(4)
+#define ALIGN8_POST DECL_ALIGN(8)
+#define ALIGN16_POST DECL_ALIGN(16)
+#define ALIGN32_POST DECL_ALIGN(32)
+#define ALIGN128_POST DECL_ALIGN(128)
+#else
+#error "ALIGNn / ALIGNn_POST: unsupported compiler (expected _MSC_VER or GNUC to be defined)"
+#endif

 // Pull in the /analyze code annotations.
 #include "annotations.h"
@ -829,17 +862,20 @@ inline void StoreLittleDWord( uint32_t *base, unsigned int dwordIndex, uint32_t
 #ifndef STATIC_TIER0

 #ifdef TIER0_DLL_EXPORT
-	#define PLATFORM_INTERFACE	DLL_EXPORT
-	#define PLATFORM_OVERLOAD	DLL_GLOBAL_EXPORT
+#define PLATFORM_INTERFACE	DLL_EXPORT
+#define PLATFORM_OVERLOAD	DLL_GLOBAL_EXPORT
+#define PLATFORM_CLASS		DLL_CLASS_EXPORT	// new: presumably the decoration for whole classes exported by tier0 - confirm against DLL_CLASS_EXPORT
 #else
-	#define PLATFORM_INTERFACE	DLL_IMPORT
-	#define PLATFORM_OVERLOAD	DLL_GLOBAL_IMPORT
+#define PLATFORM_INTERFACE	DLL_IMPORT
+#define PLATFORM_OVERLOAD	DLL_GLOBAL_IMPORT
+#define PLATFORM_CLASS		DLL_CLASS_IMPORT	// import-side counterpart used when TIER0_DLL_EXPORT is not defined
 #endif

 #else	// BUILD_AS_DLL

 #define PLATFORM_INTERFACE	extern
 #define PLATFORM_OVERLOAD
+#define PLATFORM_CLASS	// STATIC_TIER0 build: expands to nothing

 #endif	// BUILD_AS_DLL

@ -854,6 +890,41 @@ PLATFORM_INTERFACE bool				Plat_IsInBenchmarkMode();

 PLATFORM_INTERFACE double			Plat_FloatTime();		// Returns time in seconds since the module was loaded.
 PLATFORM_INTERFACE uint32_t			Plat_MSTime();			// Time in milliseconds.
+PLATFORM_INTERFACE char *			Plat_ctime( const time_t *timep, char *buf, size_t bufsize );
+PLATFORM_INTERFACE struct tm *		Plat_gmtime( const time_t *timep, struct tm *result );
+PLATFORM_INTERFACE time_t			Plat_timegm( struct tm *timeptr );
+PLATFORM_INTERFACE struct tm *		Plat_localtime( const time_t *timep, struct tm *result );
+
+#if defined( _WIN32 ) && defined( _MSC_VER ) && ( _MSC_VER >= 1400 )
+	extern "C" unsigned __int64 __rdtsc();
+	#pragma intrinsic(__rdtsc)
+#endif
+
+// Returns the current CPU counter reading for the build target:
+//   _X360                      - __mftb32(), cast to uint64
+//   _WIN64, or _WIN32 MSVC8+   - the __rdtsc() intrinsic
+//   older _WIN32 compilers     - inline rdtsc
+//   __i386__ / __x86_64__      - rdtsc through GCC-style extended asm
+// Any other target fails the build with an explicit message.
+inline uint64 Plat_Rdtsc()
+{
+#if defined( _X360 )
+	return ( uint64 )__mftb32();
+#elif defined( _WIN64 )
+	return ( uint64 )__rdtsc();
+#elif defined( _WIN32 )
+  #if defined( _MSC_VER ) && ( _MSC_VER >= 1400 )
+	return ( uint64 )__rdtsc();
+  #else
+    __asm rdtsc;	// leaves the 64-bit result in edx:eax
+	__asm ret;	// NOTE(review): explicit ret skips any compiler-generated epilogue - confirm this legacy path is still built anywhere
+  #endif
+#elif defined( __i386__ )
+	uint64 val;
+	__asm__ __volatile__ ( "rdtsc" : "=A" (val) );	// "=A" binds the edx:eax pair on 32-bit x86
+	return val;
+#elif defined( __x86_64__ )
+	uint32 lo, hi;
+	__asm__ __volatile__ ( "rdtsc" : "=a" (lo), "=d" (hi));	// the halves arrive separately on x86-64
+	return ( ( ( uint64 )hi ) << 32 ) | lo;
+#else
+	#error "Plat_Rdtsc: no cycle-counter implementation for this platform"
+#endif
+}

 // b/w compatibility
 #define Sys_FloatTime Plat_FloatTime
@ -901,13 +972,10 @@ struct CPUInformation // Size: Win32=64, Win64=72
 	CPUInformation(): m_Size(0){}
 };

-#ifdef __clang__
-#pragma clang diagnostic push
-#pragma clang diagnostic ignored "-Wunknown-pragmas"
-#pragma clang diagnostic ignored "-Wreturn-type-c-linkage"
-#endif

-PLATFORM_INTERFACE const CPUInformation& GetCPUInformation();
+// Have to return a pointer, not a reference, because references are not compatible with the
+// extern "C" implied by PLATFORM_INTERFACE.
+PLATFORM_INTERFACE const CPUInformation* GetCPUInformation();


 PLATFORM_INTERFACE void GetCurrentDate( int *pDay, int *pMonth, int *pYear );