csgo-2018-source/studiorender/r_studiosubd_patches.cpp

#include "r_studiosubd_patches.h"
#include "tier1/convar.h"
#include <stdio.h>

#define PI 3.14159265

#ifdef _DEBUG
CUtlVector<Vector4D> g_DebugCornerPositions;
CUtlVector<Vector4D> g_DebugEdgePositions;
CUtlVector<Vector4D> g_DebugInteriorPositions;
#endif

//----------------------------------------------------------------------------------------------
// static stencil buffers
//----------------------------------------------------------------------------------------------

#if !defined( USE_OPT )

static float sPosCornerStencil[MAX_VALENCE+1][(MAX_VALENCE+1)*2];
static float sPosEdge1Stencil[MAX_VALENCE+1][6];
static float sPosEdge2Stencil[MAX_VALENCE+1][6];
static float sPosInteriorStencil[MAX_VALENCE+1][4];

static float sCCLimitTanStencil1[MAX_VALENCE+1][(MAX_VALENCE+1)*2+1];
static float sCCLimitTanStencil2[MAX_VALENCE+1][(MAX_VALENCE+1)*2+1];
static float sCCLimitTanBndStencil1[MAX_VALENCE+1][(MAX_VALENCE+1)*2+1];
static float sCCLimitTanBndStencil2[MAX_VALENCE+1][(MAX_VALENCE+1)*2+1];
static float sCCLimitTanCornerStencil1[MAX_VALENCE+1][(MAX_VALENCE+1)*2+1];
static float sCCLimitTanCornerStencil2[MAX_VALENCE+1][(MAX_VALENCE+1)*2+1];

static float sPosGregoryInterior1Stencil[6];
static float sPosGregoryInterior2Stencil[6];

static float sPosCornerBndStencil[MAX_VALENCE+1][(MAX_VALENCE+1)*2+1];
static float sPosEdge1BndStencil[MAX_VALENCE+1][6];
static float sPosEdge2BndStencil[MAX_VALENCE+1][6];
static float sPosInteriorBndStencil[MAX_VALENCE+1][4];

static float sPosEdge1CornerStencil[MAX_VALENCE+1][6];
static float sPosEdge2CornerStencil[MAX_VALENCE+1][6];

#endif

static bool sTableInited = false;
static bool sCornerCorrection = false;
static bool sShowACCGeometryTangents = false;
static bool sUseCornerTangents = true;

void set_ShowACCGeometryTangents(bool v)
{
	sShowACCGeometryTangents = v;
}

void set_CornerCorrection(bool v)
{
	sCornerCorrection = v;
}

void set_UseCornerTangents(bool v)
{
	sUseCornerTangents = v;
}

// averaging function over geometry patch tangents.
static float tangentAveraging( int n, int j)
{
	return sin( PI * j / (float) n );
}

//--------------------------------------------------------------------------------------
// Subdiv Stencils
//--------------------------------------------------------------------------------------
#if !defined( USE_OPT )

static void ComputeCatmullClarkLimitPosStencil(byte boundary, int n, float *stencilBuffer)
{
	VPROF_BUDGET( "ComputeCatmullClarkLimitPosStencil", _T("SubD Rendering") );

	memset(stencilBuffer, 0, 2*n*sizeof(float));

	if (!boundary)
	{
		float scale = 1.0f / (n*n + 5.0f*n);

		stencilBuffer[0] = n*n * scale;

		for (int i=0; i<n; i++)
		{
			stencilBuffer[2*i+1] = 4.0f * scale;
			stencilBuffer[2*i+2] = 1.0f * scale;
		}
	}
	else
	{
		int k = n-1;

		float s = 1.0f / 6.0f;
		stencilBuffer[0]     = s * 4.0f;
		stencilBuffer[1]     = s * 1.0f;
		stencilBuffer[2*k+1] = s * 1.0f;
	}
}

static void ComputeCatmullClarkLimitTanStencil(bool bndVtx, bool cornerVtx, const int n, float *stencilBuffer1, float *stencilBuffer2)
{
	VPROF_BUDGET( "ComputeCatmullClarkLimitTanStencil", _T("SubD Rendering") );

	memset( stencilBuffer1, 0, sizeof(float) * 2*n );
	memset( stencilBuffer2, 0, sizeof(float) * 2*n );

	if ( !bndVtx )
	{
		float scale_beta  = 1.0f / (n * sqrtf( 4.0f + cos( PI / n ) * cos( PI / n ) ) );
		float scale_alpha = 1.0f / n + cos( PI / n ) * scale_beta;

		for ( int i=0; i<n; i++ )
		{
			stencilBuffer1[2*i+1] = cos( 2*PI*i/n ) * scale_alpha;
			stencilBuffer1[2*i+2] = cos((2*PI*i+PI)/n ) * scale_beta;

			int j = (i - 1)%n;
			stencilBuffer2[2*i+1] = cos( 2*PI*j/n ) * scale_alpha;
			stencilBuffer2[2*i+2] = cos((2*PI*j+PI)/n ) * scale_beta;
		}
	}
	else
	{
		// boundary vertex cases

		if ( cornerVtx )
		{
			if ( n<=2 )
				return;

			float sectorScale = 0, w;
			// treat first and last tangent (crease edges) separately
			w = tangentAveraging( n-1, 0 ); sectorScale += w;
			stencilBuffer1[ 1] +=  0.5 * w;
			stencilBuffer1[ 0] += -0.5 * w;

			w = tangentAveraging( n-1, n-1 ); sectorScale += w;
			stencilBuffer1[ 2*(n-1)+1] +=  0.5 * w;
			stencilBuffer1[ 0 ]        += -0.5 * w;

			// inner tangents are computed using the 6 weights from the geometery edge construction.
			for (int k=1; k<(n-1); k++)
			{
				w = tangentAveraging( n-1, k ); sectorScale += w;
				float scale = 1.0f / (2.0f*n + 10.0f);

				stencilBuffer1[        0] += w * (2.0f*n * scale - 1.0f);
				stencilBuffer1[2*(k-1)+1] += w *  2.0f   * scale;
				stencilBuffer1[2*(k-1)+2] += w *  1.0f   * scale;
				stencilBuffer1[2*(k-1)+3] += w *  4.0f   * scale;
				stencilBuffer1[2*(k-1)+4] += w *  1.0f   * scale;
				stencilBuffer1[2*(k-1)+5] += w *  2.0f   * scale;
			}

			// rescale weights
			for (int k = 0; k<2*n; k++)
			{
				stencilBuffer1[k] /= sectorScale;
			}

		}
		else
		{
			// special case to avoid colinear tangents
			if ( n==2 )
			{
				float s = 1.0f / 2.0f;
				stencilBuffer1[1] = 1.0 * s;
				stencilBuffer1[3] =-1.0 * s;

				stencilBuffer2[1] =-1.0 * s;
				stencilBuffer2[3] = 1.0 * s;


				// regularization term to avoid collinearity and preserve limit normal at the boundary
				float eps = 1e-4;
				stencilBuffer1[0] += eps * (-4.0/3.0);
				stencilBuffer1[1] += eps * (1.0/2.0);
				stencilBuffer1[2] += eps * (1.0/3.0);
				stencilBuffer1[3] += eps * (1.0/2.0);

				stencilBuffer2[0] += eps * (-4.0/3.0);
				stencilBuffer2[1] += eps * (1.0/2.0);
				stencilBuffer2[2] += eps * (1.0/3.0);
				stencilBuffer2[3] += eps * (1.0/2.0);

			}
			else
			{
				int k = n-1;
				float c = cos( PI / k ), s=sin( PI / k );

				stencilBuffer1[2*0+1] =  0.5f;
				stencilBuffer1[2*k+1] = -0.5f;

				stencilBuffer2[0] = -4.0f*s / (3.0f*k + c);  // gamma

				for (int i=0; i<k; ++i)
				{
					stencilBuffer2[2*i+1] = 4*sin(PI*i/k)/(3*k+c);                   // alpha_i
					stencilBuffer2[2*i+2] = (sin(PI*i/k)+sin(PI*(i+1)/k)) / (3.0f*k+c);   // beta_i
				}

				stencilBuffer2[2*0+1] = stencilBuffer2[2*k+1] = -( (1+2*c)*sqrt(1+c) ) / ( (3*k+c)*sqrt(1-c) );  // alpha_0, alpha_k
			}
		}

	}

}


static void computeACCEdgePosStencils(byte boundary, byte corner, int n, float *stencilBuffer1, float *stencilBuffer2)
{
	VPROF_BUDGET( "ComputeACCEdgePosStencils", _T("SubD Rendering") );

	memset(stencilBuffer1, 0, 6*sizeof(float));
	memset(stencilBuffer2, 0, 6*sizeof(float));

	if ( !boundary )
	{
		float scale = 1.0f / (2.0f*n + 10.0f);

		stencilBuffer1[0] = 2.0f*n * scale; stencilBuffer2[0] = 4.0f * scale;
		stencilBuffer1[1] = 2.0f * scale;   stencilBuffer2[1] = 1.0f * scale;
		stencilBuffer1[2] = 1.0f * scale;   stencilBuffer2[2] = 2.0f * scale;
		stencilBuffer1[3] = 4.0f * scale;   stencilBuffer2[3] = 2.0f*n* scale;
		stencilBuffer1[4] = 1.0f * scale;   stencilBuffer2[4] = 2.0f * scale;
		stencilBuffer1[5] = 2.0f * scale;   stencilBuffer2[5] = 1.0f * scale;
	}
	else
	{ // boundary stencil
		if ( corner )
		{
			float scale = 1.0f / (3.0f);

			stencilBuffer1[0] = 2.0f * scale; stencilBuffer2[0] = 1.0f * scale;
			stencilBuffer1[3] = 1.0f * scale; stencilBuffer2[3] = 2.0f * scale;
		}
		else
		{
			float scale = 1.0f / 3.0f;

			stencilBuffer1[0] = 2.0f * scale; stencilBuffer2[0] = 1.0f * scale;
			stencilBuffer1[3] = 1.0f * scale; stencilBuffer2[3] = 2.0f * scale;
		}
	}
}

static void computeACCInteriorPosStencil(byte boundary, int n, float *stencilBuffer)
{
	VPROF_BUDGET( "ComputeACCInteriorPosStencil", _T("SubD Rendering") );

	float scale = 1.0f / (n + 5.0f);

	stencilBuffer[0] = n * scale;
	stencilBuffer[1] = 2.0f * scale;
	stencilBuffer[2] = 1.0f * scale;
	stencilBuffer[3] = 2.0f * scale;
}


void FillTables()
{
	if ( sTableInited )	return;

	for ( int val=0; val<=MAX_VALENCE; val++ )
	{
		// interior stencils
		computeCatmullClarkLimitPosStencil(false, val, sPosCornerStencil[val]);
		computeACCEdgePosStencils(false, false, val, sPosEdge1Stencil[val], sPosEdge2Stencil[val]);
		computeACCInteriorPosStencil(false, val, sPosInteriorStencil[val]);

		// boundary stencils
		computeCatmullClarkLimitPosStencil(true, val, sPosCornerBndStencil[val]);
		computeACCEdgePosStencils(true, false, val, sPosEdge1BndStencil[val], sPosEdge2BndStencil[val]);
		computeACCEdgePosStencils(true, true, val, sPosEdge1CornerStencil[val], sPosEdge2CornerStencil[val]);
		computeACCInteriorPosStencil(true, val, sPosInteriorBndStencil[val]);

		computeCatmullClarkLimitTanStencil(false, false, val, sCCLimitTanStencil1[val], sCCLimitTanStencil2[val]);
		computeCatmullClarkLimitTanStencil(true, false, val, sCCLimitTanBndStencil1[val], sCCLimitTanBndStencil2[val]);
		computeCatmullClarkLimitTanStencil(true, true, val, sCCLimitTanCornerStencil1[val], sCCLimitTanCornerStencil2[val]);
	}

	sTableInited = true;
}


//--------------------------------------------------------------------------------------
// Runtime
//--------------------------------------------------------------------------------------

#ifdef _DEBUG
static ConVar mat_tess_dump( "mat_tess_dump", "0", FCVAR_CHEAT );
#endif

// Compute corner control points for each patch
inline void ComputeCatmullClarkLimitPosition( Vector4D *pPos, unsigned short *oneRing,
											 unsigned short vtx1RingSize, unsigned short minOneRingIndex, unsigned short bndVtx,
											 unsigned short cornerVtx, unsigned short valence, unsigned short nbCorners, Vector4D &limitPos )
{
	VPROF_BUDGET( "ComputeCatmullClarkLimitPosition", _T("SubD Rendering") );

	if ( cornerVtx > 0 )
	{
		limitPos = pPos[ oneRing[0] ];
	}
	else
	{
		assert( valence <= MAX_VALENCE );

		float *pStencil = bndVtx ? sPosCornerBndStencil[ valence ] : sPosCornerStencil[ valence ];

		// pStencil[0] is always the largest value (see Figures 4 and 5 in Loop and Schaefer)
		limitPos = pStencil[0] * pPos[ oneRing[0] ];
		for ( int k = 0; k < vtx1RingSize; k++ )
		{
			int idx = ( k + minOneRingIndex ) % vtx1RingSize;	// Shuffle to get the minimum index consistently first in order
			if ( idx != 0 )										// Don't do pStencil[0] again
			{
				limitPos += pStencil[idx] * pPos[ oneRing[idx] ];
			}
		}
	}
#ifdef _DEBUG
	g_DebugCornerPositions.AddToTail( limitPos );
#endif
}

inline Vector4D CrossProduct(const Vector4D& a, const Vector4D& b)
{
	return Vector4D( a.y*b.z - a.z*b.y, a.z*b.x - a.x*b.z, a.x*b.y - a.y*b.x, 0.0f );
}

inline float VectorNormalize(Vector4D& vec)
{
	float radius = sqrtf(vec.x*vec.x + vec.y*vec.y + vec.z*vec.z);

	// FLT_EPSILON is added to the radius to eliminate the possibility of divide by zero.
	float iradius = 1.f / ( radius + FLT_EPSILON );

	vec.x *= iradius;
	vec.y *= iradius;
	vec.z *= iradius;

	return radius;
}

FORCEINLINE float DotProduct(const Vector4D& a, const Vector4D& b)
{
	return ( a.x*b.x + a.y*b.y + a.z*b.z );
}

inline void ComputeCatmullClarkLimitTangents( int idx, Vector4D *pPos, unsigned short *oneRing, unsigned short vtx1RingSize,
											 unsigned short centerOffset, unsigned short bndVtx, unsigned short cornerVtx,
											 unsigned short valence, unsigned short &loopGapAngle,
											 Vector4D &limitTanU, Vector4D &limitTanV )
{
	// for valence=1, no need to have separate tangents

	float tanUSign[] = {1,-1,-1,1};
	float tanVSign[] = {1,1,-1,-1};

	VPROF_BUDGET( "ComputeCatmullClarkLimitTangents", _T("SubD Rendering") );

	if ( !sUseCornerTangents )
		cornerVtx = 0;

	if ( !bndVtx )			// interior vertices
	{
		float *stencil1 = sCCLimitTanStencil1[ valence ];
		float *stencil2 = sCCLimitTanStencil2[ valence ];

		limitTanU = Vector4D(0,0,0,0);
		limitTanV = Vector4D(0,0,0,0);

		for (int k = 0; k < vtx1RingSize; ++k)
		{
			limitTanU += stencil1[k] * pPos[ oneRing[k] ];
			limitTanV += stencil2[k] * pPos[ oneRing[k] ];
		}

	}
	else if ( (!cornerVtx) || (cornerVtx == CORNER_WITH_SMOOTHBNDTANGENTS) )	// smooth boundary vertices
	{

		float *stencil1 = sCCLimitTanBndStencil1[ valence ];
		float *stencil2 = sCCLimitTanBndStencil2[ valence ];

		Vector4D r0 = Vector4D(0,0,0,0);
		Vector4D r1 = Vector4D(0,0,0,0);

		for (int k = 0; k < vtx1RingSize; ++k)
		{
			r0 += stencil1[k] * pPos[ oneRing[k] ];
			r1 += stencil2[k] * pPos[ oneRing[k] ];
		}

		int j1 = (centerOffset - 1) / 2;
		int j2 = j1+1;
		int K = (valence - 1);

		if (valence == 2)
		{
			limitTanU = r0;
			limitTanV = r1;
		}
		else
		{
			limitTanU = cos(PI*j1 / K) * r0 + sin(PI*j1 / K) * r1;
			limitTanV = cos(PI*j2 / K) * r0 + sin(PI*j2 / K) * r1;
		}
	}
	else // corner vertices
	{
		if ( valence == 2 )
			return;

		float *pEdgeStencil = sPosEdge1Stencil[ valence ];
//		float *avgStencil  = sCCLimitTanCornerStencil1[ valence ];

		// compute tangents
		Vector4D c0 = pPos[ oneRing[1] ] - pPos[ oneRing[0] ];  c0.w = 0;
		Vector4D c1 = pPos[ oneRing[vtx1RingSize - 1] ] - pPos[ oneRing[0] ]; c1.w = 0;

		Vector4D e0 = (pEdgeStencil[0] - 1.0f ) * pPos[ oneRing[0] ];
		Vector4D e1 = (pEdgeStencil[0] - 1.0f ) * pPos[ oneRing[0] ];
		for (int k = 1; k < 6; k++ )
		{
			e0 += pEdgeStencil[k] * pPos[ oneRing[ k ] ];
			e1 += pEdgeStencil[k] * pPos[ oneRing[ vtx1RingSize - 6 + k ] ];
		}
		e0.w = 0; e1.w = 0;

		// compute average tangent plane normal
		Vector4D n0 = CrossProduct( c0, e0 ); VectorNormalize( n0 );
		Vector4D n1 = CrossProduct( e1, c1 ); VectorNormalize( n1 );
		Vector4D N = n0 + n1;
//		N = N - ( DotProduct( N, tAvg )/ DotProduct(tAvg, tAvg) ) * tAvg;
		VectorNormalize( N );

		// project into tangent plane

		c0 = c0 - DotProduct(c0, N) * N;
		c1 = c1 - DotProduct(c1, N) * N;

		float c0l = Vector4DLength( c0 ); c0 = c0 / c0l;
		float c1l = Vector4DLength( c1 ); c1 = c1 / c1l;
		float cAvg = (c0l + c1l) / 2;

		// compute angle
		Vector4D c0p = CrossProduct(N, c0);
		float angle = PI - atan2( DotProduct(c0p, c1), -DotProduct(c0, c1) );

		loopGapAngle = (unsigned int) ((65535.0 * angle) / (2*PI));

		// compute final tangent vector
		int j1 = (centerOffset - 1) / 2;
		int j2 = j1+1;
		int K = (valence - 1);

		limitTanU = cAvg * ( cos(angle*j1 / K) * c0 + sin(angle*j1 / K) * c0p );
		limitTanV = cAvg * ( cos(angle*j2 / K) * c0 + sin(angle*j2 / K) * c0p );
	}

	// flip tangents so they point in u/v direction
	if ( idx & 1 )
	{
		swap(limitTanU, limitTanV);
	}
	limitTanU *= tanUSign[idx];
	limitTanV *= tanVSign[idx];
}


inline void ComputeACCEdgePositions( Vector4D *pPos, unsigned short *oneRing, unsigned short centerOffset,
									unsigned short bndEdge, unsigned short bndVtx0, unsigned short bndVtx1,
									unsigned short cornerVtx0, unsigned short cornerVtx1, unsigned short loopGapAngle0, unsigned short loopGapAngle1,
									unsigned short edgeBias0, unsigned short edgeBias1, unsigned short val0, unsigned short val1,
									unsigned short minOneRingOffset, unsigned short vtx1RingSize,
									Vector4D &edgePos0, Vector4D &edgePos1)
{
	VPROF_BUDGET( "ComputeACCEdgePositions", _T("SubD Rendering") );

	if ( bndVtx0 )
	{
		val0 = 2*(val0 - 1);
	}

	if ( bndVtx1 )
	{
		val1 = 2*(val1 - 1);
	}

	Assert( val0 <= MAX_VALENCE );
	Assert( val1 <= MAX_VALENCE );

	float* pStencil0 = (bndEdge) ? (cornerVtx0) ? sPosEdge1CornerStencil[ val0 ] : sPosEdge1BndStencil[ val0 ] : sPosEdge1Stencil[ val0 ];
	float* pStencil1 = (bndEdge) ? (cornerVtx1) ? sPosEdge2CornerStencil[ val1 ] : sPosEdge2BndStencil[ val1 ] : sPosEdge2Stencil[ val1 ];

	int kEnd = (bndEdge) ? 4 : 6;

	if ( ( edgeBias0 == 16384 ) && ( edgeBias1 == 16384 ) )
	{
		int oneRingIndex[6] = { 0, 0, 0, 0, 0, 0 };
		for ( int i = 1; i < kEnd; i++ )
		{
			oneRingIndex[i] = centerOffset + i - 1;
		}

		edgePos0 = edgePos1 = Vector4D(0,0,0,0);
		for ( int k = 0; k < kEnd; k++ )
		{
			int idx = ( k + minOneRingOffset ) % kEnd;	// Offset to min index to enforce evaluation order between neighboring patches
			edgePos0 += pStencil0[idx] * pPos[ oneRing[ oneRingIndex[idx] ] ];
			edgePos1 += pStencil1[idx] * pPos[ oneRing[ oneRingIndex[idx] ] ];
		}
	}
	else
	{
		float b0, b1;
		b1 = edgeBias0 / 32768.0, b0 = 1.0f-b1;
		edgePos0 = (val0 * pPos[ oneRing[0] ] + 2*b0*pPos[ oneRing[centerOffset + 0] ] + 1*b0*pPos[ oneRing[centerOffset + 1] ] +    2*pPos[ oneRing[centerOffset + 2] ] + 1*b1*pPos[ oneRing[centerOffset + 3] ] + 2*b1*pPos[ oneRing[centerOffset + 4] ] ) / (val0 + 5.0f);
		b1 = edgeBias1 / 32768.0, b0 = 1.0f-b1;
		edgePos1 = (   2 * pPos[ oneRing[0] ] + 1*b0*pPos[ oneRing[centerOffset + 0] ] + 2*b0*pPos[ oneRing[centerOffset + 1] ] + val1*pPos[ oneRing[centerOffset + 2] ] + 2*b1*pPos[ oneRing[centerOffset + 3] ] + 1*b1*pPos[ oneRing[centerOffset + 4] ] ) / (val1 + 5.0f);
	}

#ifdef _DEBUG
	g_DebugEdgePositions.AddToTail( edgePos0 );
	g_DebugEdgePositions.AddToTail( edgePos1 );
#endif
}


inline void ComputeACCInteriorPosition( Vector4D *pPos, unsigned short *oneRing, unsigned short centerOffset, unsigned short bndVtx, unsigned short valence, Vector4D &interiorPos )
{
	VPROF_BUDGET( "ComputeACCInteriorPosition", _T("SubD Rendering") );

	if ( bndVtx )
	{
		valence = valence>2 ?  2*(valence - 1) : 4*(valence - 1);
	}

	Assert( valence<=MAX_VALENCE );

	float *stencil = sPosInteriorStencil[ valence ];

	interiorPos = stencil[0] * pPos[ oneRing[0] ];
	for ( int k = 1; k < 4; ++k )
	{
		interiorPos += stencil[k] * pPos[ oneRing[ centerOffset + k - 1 ] ];
	}

#ifdef _DEBUG
	g_DebugInteriorPositions.AddToTail( interiorPos );
#endif

}

inline void ComputeACCGeometryPatchTangents( Vector4D *Pos, Vector4D *TanU, Vector4D *TanV )
{
	VPROF_BUDGET( "ComputeACCGeometryPatchTangents", _T("SubD Rendering") );

	for ( int j=0; j<3; j++ )
	{
		for ( int i=0; i<4; i++ )
		{
			TanU[i*3+j] = 3*( Pos[i*4+j+1]   - Pos[i*4+j] );
			TanV[j*4+i] = 3*( Pos[(j+1)*4+i] - Pos[j*4+i] );
		}
	}
}

void ComputeACCGeometryPatch( Vector4D* pPos, TopologyIndexStruct *quad, Vector4D* Pos)
{
	VPROF_BUDGET( "ComputeACCGeometryPatch", _T("SubD Rendering") );

	int MOD4[8] = {0,1,2,3,0,1,2,3};

	int accCorner[]   = {0,3,15,12};
	int accEdge1[]    = {4,2,11,13};
	int accEdge2[]    = {8,1,7,14};
	int accInterior[] = {5,6,10,9};

	int vtx1RingStart = 0;

	unsigned short *oneRing = quad->oneRing;

	for ( int i=0; i<4; i++ ) // 4 corner vertices
	{
		ComputeCatmullClarkLimitPosition( pPos, &oneRing[vtx1RingStart], quad->vtx1RingSize[i], quad->minOneRingOffset[i], quad->bndVtx[i], quad->cornerVtx[i], quad->valences[i], quad->nbCornerVtx[i], Pos[ accCorner[i] ] );

		ComputeACCEdgePositions( pPos, &oneRing[vtx1RingStart], quad->vtx1RingCenterQuadOffset[i],
			quad->bndEdge[ MOD4[i+3] ],
			quad->bndVtx[i], quad->bndVtx[MOD4[i+3]],
			quad->cornerVtx[i],    quad->cornerVtx[MOD4[i+3]],
			quad->loopGapAngle[i], quad->loopGapAngle[MOD4[i+3]],
			quad->edgeBias[ 2*MOD4[i+3] ], quad->edgeBias[ 2*MOD4[i+3] + 1 ],
			quad->valences[i], quad->valences[MOD4[i+3]],
			quad->minOneRingOffset[i], quad->vtx1RingSize[i],
			Pos[accEdge1[i]], Pos[accEdge2[i]] );

		ComputeACCInteriorPosition( pPos, &oneRing[vtx1RingStart], quad->vtx1RingCenterQuadOffset[i], quad->bndVtx[i], quad->cornerVtx[i], quad->loopGapAngle[i], quad->valences[i], Pos[ accInterior[i] ] );

		vtx1RingStart += quad->vtx1RingSize[i];
	}
}


void ComputeACCTangentPatches( Vector4D* pPos, TopologyIndexStruct* quad, Vector4D* Pos, Vector4D* TanU, Vector4D* TanV )
{
	VPROF_BUDGET( "ComputeACCTangentPatches", _T("SubD Rendering") );

	int MOD4[8] = {0,1,2,3,0,1,2,3};

	int accTanCornerU[] = {0,2,11,9};  // counterclockwise orders!
	int accTanCornerV[] = {0,3,11,8};

	unsigned short *oneRing = quad->oneRing;

	ComputeACCGeometryPatchTangents(Pos, TanU, TanV);

#if !defined( NO_TANGENTS )
	if ( !sShowACCGeometryTangents )
	{
		// compute corner tangents ( = subdivision surface limit tangents)
		int vtx1RingStart = 0;
		for ( int i=0; i<4; i++ )
		{
			int vtx1RingSize = quad->vtx1RingSize[i];

			Vector4D &accTanU = TanU[ accTanCornerU[i] ];
			Vector4D &accTanV = TanV[ accTanCornerV[i] ];

			ComputeCatmullClarkLimitTangents(i, pPos, &oneRing[vtx1RingStart], vtx1RingSize, quad->vtx1RingCenterQuadOffset[i], quad->bndVtx[i], quad->cornerVtx[i], quad->valences[i], quad->loopGapAngle[i], accTanU, accTanV );

			vtx1RingStart += vtx1RingSize;
		}

		// compute correction component to boundary tangents for tangent plane continuity
		//                             /TanV/ /TanU/ / TanV / /TanU/
		static int   CB_CornerIdx[]   = {0,1,2, 3,7,11, 11,10,9, 8,4,0 };
		static int   CB_InteriorIdx[] = {1,2,   5,8,    10,9,    6,3 };
		static float CB_sign[]        = {1,-1,1,-1};

		for ( int i=0; i<4; i++ ) // for all quad edges
		{
			if ( !quad->bndEdge[i] )
			{
				Vector4D *CBTanV = (i&1) ? TanU : TanV;
				Vector4D *CBTanU = (i&1) ? TanV : TanU;

				Vector4D u00 = CBTanU[CB_CornerIdx[3*i + 0]];
				Vector4D u10 = CBTanU[CB_CornerIdx[3*i + 1]];
				Vector4D u20 = CBTanU[CB_CornerIdx[3*i + 2]];

				int val0 = quad->valences[i];
				int val1 = quad->valences[MOD4[i+1]];

				if ( quad->bndVtx[i] )
					val0--;
				if ( quad->bndVtx[MOD4[i+1]] )
					val1--;

				float c0 = cos( (2*PI * quad->loopGapAngle[     i   ] / 65535.0f) / val0 );
				float c1 = cos( (2*PI * quad->loopGapAngle[MOD4[i+1]] / 65535.0f) / val1 );

				CBTanV[ CB_InteriorIdx[2*i + 0] ] += CB_sign[i]*( 2*c0*u10 -   c1*u00 )/3.0f;
				CBTanV[ CB_InteriorIdx[2*i + 1] ] += CB_sign[i]*(   c0*u20 - 2*c1*u10 )/3.0f;
			}
		}

	}
#endif

}
#endif  // !defined( USE_OPT )

#if defined( USE_OPT )

#define M_PI2			6.28318530717958647692f

static fltx4 Four_NegativeThirds;
static fltx4 Four_Fives;
static fltx4 Four_Tens;
static fltx4 Four_N[32];
static fltx4 Four_TwoPI;
static fltx4 Four_Valence[MAX_VALENCE];
static fltx4 Four_ValencePlus5[MAX_VALENCE];

static fltx4 sPosCornerStencil[MAX_VALENCE+1][(MAX_VALENCE+1)*2];
static fltx4 sPosEdge1Stencil[MAX_VALENCE+1][6];
static fltx4 sPosEdge2Stencil[MAX_VALENCE+1][6];
static fltx4 sPosInteriorStencil[MAX_VALENCE+1][4];

static fltx4 sCCLimitTanStencil1[MAX_VALENCE+1][(MAX_VALENCE+1)*2+1];
static fltx4 sCCLimitTanStencil2[MAX_VALENCE+1][(MAX_VALENCE+1)*2+1];
static fltx4 sCCLimitTanBndStencil1[MAX_VALENCE+1][(MAX_VALENCE+1)*2+1];
static fltx4 sCCLimitTanBndStencil2[MAX_VALENCE+1][(MAX_VALENCE+1)*2+1];
static fltx4 sCCLimitTanCornerStencil1[MAX_VALENCE+1][(MAX_VALENCE+1)*2+1];
static fltx4 sCCLimitTanCornerStencil2[MAX_VALENCE+1][(MAX_VALENCE+1)*2+1];

static fltx4 sPosCornerBndStencil[MAX_VALENCE+1][(MAX_VALENCE+1)*2+1];
static fltx4 sPosEdge1BndStencil[MAX_VALENCE+1][6];
static fltx4 sPosEdge2BndStencil[MAX_VALENCE+1][6];
static fltx4 sPosInteriorBndStencil[MAX_VALENCE+1][4];

static fltx4 sPosEdge1CornerStencil[MAX_VALENCE+1][6];
static fltx4 sPosEdge2CornerStencil[MAX_VALENCE+1][6];

static fltx4 sCCSinPI[MAX_VALENCE*2][MAX_VALENCE];
static fltx4 sCCCosPI[MAX_VALENCE*2][MAX_VALENCE];

static float Valence_MinusOne[MAX_VALENCE];


static void ComputeCatmullClarkLimitPosStencil(byte boundary, int n, fltx4 *stencilBuffer)
{
	VPROF_BUDGET( "ComputeCatmullClarkLimitPosStencil", _T("SubD Rendering") );

	for ( int i=0; i<2*n; ++i )
	{
		stencilBuffer[i] = Four_Zeros;
	}

	if ( !boundary )
	{
		float scale = 1.0f / (n*n + 5.0f*n);

		stencilBuffer[0] = ReplicateX4( n*n * scale );

		for ( int i=0; i<n; i++ )
		{
			stencilBuffer[2*i+1] = ReplicateX4( 4.0f * scale );
			stencilBuffer[2*i+2] = ReplicateX4( 1.0f * scale );
		}
	}
	else
	{
		int k = n-1;

		float s = 1.0f / 6.0f;
		stencilBuffer[0]     = ReplicateX4( s * 4.0f );
		stencilBuffer[1]     = ReplicateX4( s * 1.0f );
		stencilBuffer[2*k+1] = ReplicateX4( s * 1.0f );
	}
}

static void ComputeCatmullClarkLimitTanStencil(bool bndVtx, bool cornerVtx, const int n, fltx4 *stencilBuffer1, fltx4 *stencilBuffer2)
{
	VPROF_BUDGET( "ComputeCatmullClarkLimitTanStencil", _T("SubD Rendering") );

	for ( int i=0; i<2*n; ++i )
	{
		stencilBuffer1[i] = Four_Zeros;
		stencilBuffer2[i] = Four_Zeros;
	}

	if ( !bndVtx )
	{
		float scale_beta  = 1.0f / (n * sqrtf(4.0f + cos(PI/n)*cos(PI/n)));
		float scale_alpha = 1.0f/n + cos(PI/n) * scale_beta;

		for ( int i=0; i<n; i++ )
		{
			stencilBuffer1[2*i+1] = ReplicateX4( cos( 2*PI*i/n ) * scale_alpha );
			stencilBuffer1[2*i+2] = ReplicateX4( cos((2*PI*i+PI)/n ) * scale_beta );

			int j = (i - 1)%n;
			stencilBuffer2[2*i+1] = ReplicateX4( cos( 2*PI*j/n ) * scale_alpha );
			stencilBuffer2[2*i+2] = ReplicateX4( cos((2*PI*j+PI)/n ) * scale_beta );
		}
	}
	else
	{
		// boundary vertex cases
		if ( cornerVtx )
		{
			if ( n<=2 )
				return;

			float sectorScale = 0, w;
			// treat first and last tangent (crease edges) separately
			w = tangentAveraging( n-1, 0 ); sectorScale += w;
			stencilBuffer1[ 1] = stencilBuffer1[ 1] + ReplicateX4( 0.5 * w );
			stencilBuffer1[ 0] = stencilBuffer1[ 0] + ReplicateX4( -0.5 * w );

			w = tangentAveraging( n-1, n-1 ); sectorScale += w;
			stencilBuffer1[ 2*(n-1)+1] = stencilBuffer1[ 2*(n-1)+1] + ReplicateX4( 0.5 * w );
			stencilBuffer1[ 0 ]        = stencilBuffer1[ 0 ]        + ReplicateX4( -0.5 * w );

			// inner tangents are computed using the 6 weights from the geometery edge construction.
			for (int k=1; k<(n-1); k++)
			{
				w = tangentAveraging( n-1, k ); sectorScale += w;
				float scale = 1.0f / (2.0f*n + 10.0f);

				stencilBuffer1[        0] = stencilBuffer1[        0] + ReplicateX4( w * (2.0f*n * scale - 1.0f) );
				stencilBuffer1[2*(k-1)+1] = stencilBuffer1[2*(k-1)+1] + ReplicateX4( w *  2.0f   * scale );
				stencilBuffer1[2*(k-1)+2] = stencilBuffer1[2*(k-1)+2] + ReplicateX4( w *  1.0f   * scale );
				stencilBuffer1[2*(k-1)+3] = stencilBuffer1[2*(k-1)+3] + ReplicateX4( w *  4.0f   * scale );
				stencilBuffer1[2*(k-1)+4] = stencilBuffer1[2*(k-1)+4] + ReplicateX4( w *  1.0f   * scale );
				stencilBuffer1[2*(k-1)+5] = stencilBuffer1[2*(k-1)+5] + ReplicateX4( w *  2.0f   * scale );
			}

			// rescale weights
			fltx4 fltx4Scale = ReplicateX4( sectorScale );
			for ( int k = 0; k<2*n; ++k )
			{
				stencilBuffer1[k] = DivSIMD( stencilBuffer1[k], fltx4Scale );
			}

		}
		else
		{
			// special case to avoid colinear tangents
			if ( n==2 )
			{
				float s = 1.0f / 2.0f;
				stencilBuffer1[1] = ReplicateX4(  1.0 * s );
				stencilBuffer1[3] = ReplicateX4( -1.0 * s );

				stencilBuffer2[1] = ReplicateX4( -1.0 * s );
				stencilBuffer2[3] = ReplicateX4(  1.0 * s );


				// regularization term to avoid collinearity and preserve limit normal at the boundary
				float eps = 1e-4;
				stencilBuffer1[0] = AddSIMD( stencilBuffer1[0], ReplicateX4( eps * (-4.0/3.0) ) );
				stencilBuffer1[1] = AddSIMD( stencilBuffer1[1], ReplicateX4( eps * (1.0/2.0) ) );
				stencilBuffer1[2] = AddSIMD( stencilBuffer1[2], ReplicateX4( eps * (1.0/3.0) ) );
				stencilBuffer1[3] = AddSIMD( stencilBuffer1[3], ReplicateX4( eps * (1.0/2.0) ) );

				stencilBuffer2[0] = AddSIMD( stencilBuffer2[0], ReplicateX4( eps * (-4.0/3.0) ) );
				stencilBuffer2[1] = AddSIMD( stencilBuffer2[1], ReplicateX4( eps * (1.0/2.0) ) );
				stencilBuffer2[2] = AddSIMD( stencilBuffer2[2], ReplicateX4( eps * (1.0/3.0) ) );
				stencilBuffer2[3] = AddSIMD( stencilBuffer2[3], ReplicateX4( eps * (1.0/2.0) ) );
			}
			else
			{
				int k = n-1;
				float c = cos( PI / k ), s=sin( PI / k );

				stencilBuffer1[2*0+1] = ReplicateX4(  0.5f );
				stencilBuffer1[2*k+1] = ReplicateX4( -0.5f );

				stencilBuffer2[0] = ReplicateX4( -4.0f*s / (3.0f*k + c) );  // gamma

				for ( int i=0; i<k; ++i )
				{
					stencilBuffer2[2*i+1] = ReplicateX4( 4*sin(PI*i/k)/(3*k+c) );                   // alpha_i
					stencilBuffer2[2*i+2] = ReplicateX4( (sin(PI*i/k)+sin(PI*(i+1)/k)) / (3.0f*k+c) );   // beta_i
				}

				stencilBuffer2[2*0+1] = stencilBuffer2[2*k+1] = ReplicateX4( -( (1+2*c)*sqrt(1+c) ) / ( (3*k+c)*sqrt(1-c) ) );  // alpha_0, alpha_k
			}
		}

	}

}

static void ComputeACCEdgePosStencils(byte boundary, byte corner, int n, fltx4 *stencilBuffer1, fltx4 *stencilBuffer2)
{
	VPROF_BUDGET( "ComputeACCEdgePosStencils", _T("SubD Rendering") );

	for ( int i=0; i<6; ++i )
	{
		stencilBuffer1[i] = Four_Zeros;
		stencilBuffer2[i] = Four_Zeros;
	}

	if ( !boundary )
	{
		float scale = 1.0f / (2.0f*n + 10.0f);

		stencilBuffer1[0] = ReplicateX4( 2.0f*n * scale ); stencilBuffer2[0] = ReplicateX4( 4.0f * scale );
		stencilBuffer1[1] = ReplicateX4( 2.0f * scale );   stencilBuffer2[1] = ReplicateX4( 1.0f * scale );
		stencilBuffer1[2] = ReplicateX4( 1.0f * scale );   stencilBuffer2[2] = ReplicateX4( 2.0f * scale );
		stencilBuffer1[3] = ReplicateX4( 4.0f * scale );   stencilBuffer2[3] = ReplicateX4( 2.0f*n* scale );
		stencilBuffer1[4] = ReplicateX4( 1.0f * scale );   stencilBuffer2[4] = ReplicateX4( 2.0f * scale );
		stencilBuffer1[5] = ReplicateX4( 2.0f * scale );   stencilBuffer2[5] = ReplicateX4( 1.0f * scale );
	}
	else
	{
		// boundary stencil
		if ( corner )
		{
			float scale = 1.0f / (3.0f);

			stencilBuffer1[0] = ReplicateX4( 2.0f * scale ); stencilBuffer2[0] = ReplicateX4( 1.0f * scale );
			stencilBuffer1[3] = ReplicateX4( 1.0f * scale ); stencilBuffer2[3] = ReplicateX4( 2.0f * scale );
		}
		else
		{
			float scale = 1.0f / 3.0f;

			stencilBuffer1[0] = ReplicateX4( 2.0f * scale ); stencilBuffer2[0] = ReplicateX4( 1.0f * scale );
			stencilBuffer1[3] = ReplicateX4( 1.0f * scale ); stencilBuffer2[3] = ReplicateX4( 2.0f * scale );
		}
	}
}

static void ComputeACCInteriorPosStencil(byte boundary, int n, fltx4 *stencilBuffer)
{
	VPROF_BUDGET( "ComputeACCInteriorPosStencil", _T("SubD Rendering") );

	float scale = 1.0f / (n + 5.0f);

	stencilBuffer[0] = ReplicateX4( n * scale );
	stencilBuffer[1] = ReplicateX4( 2.0f * scale );
	stencilBuffer[2] = ReplicateX4( 1.0f * scale );
	stencilBuffer[3] = ReplicateX4( 2.0f * scale );
}

static void ComputeACCSinCosPITables()
{
	fltx4 PI4 = ReplicateX4( M_PI );

	for ( int j=0; j<MAX_VALENCE*2; ++j )
	{
		fltx4 j4 = ReplicateX4( (float)j );

		for ( int k=0; k<MAX_VALENCE; ++k )
		{
			fltx4 k4 = ReplicateX4( (float)k );
			fltx4 radians = DivSIMD( MulSIMD( PI4, j4 ), k4 );

			// not really simd
			SinCosSIMD( sCCSinPI[j][k], sCCCosPI[j][k], radians );
		}
	}
}

void FillTables()
{
	if ( sTableInited )
		return;

	// Some simd stuff
	Four_TwoPI = ReplicateX4( 2*M_PI );
	Four_Tens = ReplicateX4( 10.0f );
	Four_Fives = ReplicateX4( 5 );
	Four_NegativeThirds = ReplicateX4( -0.333333333333333f );
	for ( int i=0; i<32; ++i )
	{
		Four_N[i] = ReplicateX4( (float)i );
	}
	for ( int i=0; i<MAX_VALENCE; ++i )
	{
		Four_Valence[i] = ReplicateX4( (float)i );
		Four_ValencePlus5[i] = ReplicateX4( (float)i + 5.0f );
		Valence_MinusOne[i] = (float)(i-1);
	}

	for ( int val=0; val<=MAX_VALENCE; val++ )
	{
		// interior stencils
		ComputeCatmullClarkLimitPosStencil( false, val, sPosCornerStencil[val] );
		ComputeACCEdgePosStencils( false, false, val, sPosEdge1Stencil[val], sPosEdge2Stencil[val] );
		ComputeACCInteriorPosStencil( false, val, sPosInteriorStencil[val] );

		// boundary stencils
		ComputeCatmullClarkLimitPosStencil( true, val, sPosCornerBndStencil[val] );
		ComputeACCEdgePosStencils( true, false, val, sPosEdge1BndStencil[val], sPosEdge2BndStencil[val] );
		ComputeACCEdgePosStencils( true, true, val, sPosEdge1CornerStencil[val], sPosEdge2CornerStencil[val] );
		ComputeACCInteriorPosStencil( true, val, sPosInteriorBndStencil[val] );

		ComputeCatmullClarkLimitTanStencil( false, false, val, sCCLimitTanStencil1[val], sCCLimitTanStencil2[val] );
		ComputeCatmullClarkLimitTanStencil( true, false, val, sCCLimitTanBndStencil1[val], sCCLimitTanBndStencil2[val] );
		ComputeCatmullClarkLimitTanStencil( true, true, val, sCCLimitTanCornerStencil1[val], sCCLimitTanCornerStencil2[val] );
	}

	// sincos tables
	ComputeACCSinCosPITables();

	sTableInited = true;
}

//--------------------------------------------------------------------------------------
// Runtime
//--------------------------------------------------------------------------------------
FORCEINLINE void ComputeCatmullClarkLimitPosition( fltx4 *pPos, unsigned short *pOneRing,
												   unsigned short vtx1RingSize, unsigned short minOneRingIndex, unsigned short bndVtx,
												   unsigned short cornerVtx, unsigned short valence, fltx4 &limitPos )
{
	VPROF_BUDGET( "ComputeCatmullClarkLimitPosition (SIMD)", _T( "SubD Rendering" ) );

	assert( pPos );
	assert( pOneRing );

	if ( cornerVtx > 0 )
	{
		limitPos = pPos[ pOneRing[0]  ];
	}
	else
	{
		assert( valence <= MAX_VALENCE );

		fltx4 *pStencil = bndVtx ? sPosCornerBndStencil[ valence ] : sPosCornerStencil[ valence ];

		// pStencil[0] is always the largest value (see Figures 4 and 5 in Loop and Schaefer)
		limitPos = MulSIMD( pStencil[0], pPos[ pOneRing[0] ] );
		for ( int k = 0; k < vtx1RingSize; k++ )
		{
			int idx = ( k + minOneRingIndex ) % vtx1RingSize;	// Shuffle to get the minimum index consistently first in order
			if ( idx != 0 )										// Don't do pStencil[0] again
			{
				limitPos = MaddSIMD( pStencil[idx], pPos[ pOneRing[idx] ], limitPos );
			}
		}
	}
}

FORCEINLINE fltx4 VectorNormalize( fltx4 &A )
{
	fltx4 mag_sq = Dot3SIMD( A, A );						// length^2
	fltx4 invSqrt = ReciprocalSqrtEstSIMD(mag_sq);
	return MulSIMD( A, invSqrt );
}

FORCEINLINE fltx4 VectorLength( fltx4 &A )
{
	fltx4 mag_sq = Dot3SIMD( A, A );						// length^2
	fltx4 invSqrt = ReciprocalSqrtEstSIMD(mag_sq);
	return invSqrt;
}

FORCEINLINE fltx4 CrossProduct( const fltx4 &A, const fltx4 &B )
{
#if defined( _X360 )
	return XMVector3Cross( A, B );
#elif defined( _WIN32 )
	fltx4 A1 = _mm_shuffle_ps( A, A, MM_SHUFFLE_REV( 1, 2, 0, 3 ) );
	fltx4 B1 = _mm_shuffle_ps( B, B, MM_SHUFFLE_REV( 2, 0, 1, 3 ) );
	fltx4 Result1 = MulSIMD( A1, B1 );
	fltx4 A2 = _mm_shuffle_ps( A, A, MM_SHUFFLE_REV( 2, 0, 1, 3 ) );
	fltx4 B2 = _mm_shuffle_ps( B, B, MM_SHUFFLE_REV( 1, 2, 0, 3 ) );
	fltx4 Result2 = MulSIMD( A2, B2 );
	return SubSIMD( Result1, Result2 );
#else
	fltx4 CrossVal;
	SubFloat( CrossVal, 0 ) = SubFloat( A, 1 )*SubFloat( B, 2 ) - SubFloat( A, 2 )*SubFloat( B, 1 );
	SubFloat( CrossVal, 1 ) = SubFloat( A, 2 )*SubFloat( B, 0 ) - SubFloat( A, 0 )*SubFloat( B, 2 );
	SubFloat( CrossVal, 2 ) = SubFloat( A, 0 )*SubFloat( B, 1 ) - SubFloat( A, 1 )*SubFloat( B, 0 );
	SubFloat( CrossVal, 3 ) = 0;
	return CrossVal;
#endif
}

FORCEINLINE void ComputeCatmullClarkLimitTangents( int idx, fltx4 *pPos, unsigned short *pOneRing, unsigned short vtx1RingSize,
											 unsigned short centerOffset, unsigned short bndVtx, unsigned short cornerVtx,
											 unsigned short valence, float &loopGapAngle, fltx4 &limitTanU, fltx4 &limitTanV )
{
	VPROF_BUDGET( "ComputeCatmullClarkLimitTangents (SIMD)", _T( "SubD Rendering" ) );

	// for valence=1, no need to have separate tangents
	static const fltx4 tanUSign[4] = { Four_Ones, Four_NegativeOnes, Four_NegativeOnes, Four_Ones };
	static const fltx4 tanVSign[4] = { Four_Ones, Four_Ones, Four_NegativeOnes, Four_NegativeOnes };

	if (!sUseCornerTangents) cornerVtx = 0;

	// interior vertices
	if ( !bndVtx )
	{
		fltx4 *pStencil0 = sCCLimitTanStencil1[ valence ];
		fltx4 *pStencil1 = sCCLimitTanStencil2[ valence ];

		limitTanU = limitTanV = Four_Zeros;

		for ( int k = 0; k < vtx1RingSize; k++ )
		{
			limitTanU = MaddSIMD( pStencil0[k], pPos[ pOneRing[ k ] ], limitTanU );
			limitTanV = MaddSIMD( pStencil1[k], pPos[ pOneRing[ k ] ], limitTanV );
		}

	}
	else if ( (!cornerVtx) || (cornerVtx == CORNER_WITH_SMOOTHBNDTANGENTS) )
	{
		// smooth boundary vertices
		fltx4 *pStencil0 = sCCLimitTanBndStencil1[ valence ];
		fltx4 *pStencil1 = sCCLimitTanBndStencil2[ valence ];

		fltx4 r0 = Four_Zeros;
		fltx4 r1 = Four_Zeros;

		for (int k = 0; k < vtx1RingSize; ++k)
		{
			r0 = MaddSIMD( pStencil0[k], pPos[ pOneRing[ k ] ], r0 );
			r1 = MaddSIMD( pStencil1[k], pPos[ pOneRing[ k ] ], r1 );
		}

		int j1 = ( centerOffset - 1 ) / 2;
		int j2 = j1 + 1;
		int k = valence - 1;

		if ( valence == 2 )
		{
			limitTanU = r0;
			limitTanV = r1;
		}
		else
		{
			limitTanU = AddSIMD( MulSIMD( sCCCosPI[j1][k], r0 ), MulSIMD( sCCSinPI[j1][k], r1 ) );
			limitTanV = AddSIMD( MulSIMD( sCCCosPI[j2][k], r0 ), MulSIMD( sCCSinPI[j2][k], r1 ) );
		}
	}
	else
	{
		// Corner vertices
		if ( valence == 2 )
			return;

		fltx4 *pEdgeStencil = sPosEdge1Stencil[ valence ];

		// Compute tangents
		fltx4 c0 = SubSIMD( pPos[ pOneRing[ 1 ] ], pPos[ pOneRing[ 0 ] ] );
		fltx4 c1 = SubSIMD( pPos[ pOneRing[ vtx1RingSize - 1 ] ], pPos[ pOneRing[ 0 ] ] );

		fltx4 e0 = MulSIMD( SubSIMD( pEdgeStencil[0], Four_Ones ), pPos[ pOneRing[ 0 ] ] );
		fltx4 e1 = e0;
		for ( int k = 1; k < 6; k++ )
		{
			e0 = MaddSIMD( pEdgeStencil[k], pPos[ pOneRing[ k ] ], e0 );
			e1 = MaddSIMD( pEdgeStencil[k], pPos[ pOneRing[ vtx1RingSize - 6 + k ] ], e1 );
		}

		// Compute average tangent plane normal
		fltx4 n0 = CrossProduct( c0, e0 );
		n0 = VectorNormalize( n0 );
		fltx4 n1 = CrossProduct( e1, c1 );
		n1 = VectorNormalize( n1 );
		fltx4 N = AddSIMD( n0, n1 );
		N = VectorNormalize( N );

		// Project into tangent plane
		fltx4 DotC0N = Dot3SIMD( c0, N );
		fltx4 DotC1N = Dot3SIMD( c1, N );

		c0 = SubSIMD( c0, MulSIMD( DotC0N, N ) );
		c1 = SubSIMD( c1, MulSIMD( DotC1N, N ) );

		fltx4 c0l = VectorLength( c0 );
		c0 = DivSIMD( c0, c0l );
		fltx4 c1l = VectorLength( c1 );
		c1 = DivSIMD( c1, c1l );
		fltx4 cAvg = MulSIMD( AddSIMD(c0l,c1l), Four_PointFives );

		// Compute angle
		fltx4 c0p = CrossProduct(N, c0);
		fltx4 dot1 = Dot3SIMD(c0p, c1);
		fltx4 dot2 = Dot3SIMD(c0, c1);

		float angle = PI - atan2( SubFloat( dot1, 0 ), -SubFloat( dot2, 0 ) );

		loopGapAngle = angle;

		// Compute final tangent vector
		int j1 = ( centerOffset - 1 ) / 2;
		int j2 = j1 + 1;
		int K = (valence - 1);

		static float fK[MAX_VALENCE] = { 0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f,
										 9.0f, 10.0f, 11.0f, 12.0f, 13.0f, 14.0f, 15.0f, 16.0f,
										 17.0f, 18.0f };
		// Compute final tangent vector
		float flK = fK[K];

		fltx4 Cos0 = ReplicateX4( cos( angle*j1 / flK ) );
		fltx4 Sin0 = ReplicateX4( sin( angle*j1 / flK ) );
		fltx4 Cos1 = ReplicateX4( cos( angle*j2 / flK ) );
		fltx4 Sin1 = ReplicateX4( sin( angle*j2 / flK ) );

		limitTanU = cAvg * ( Cos0 * c0 + Sin0 * c0p );
		limitTanV = cAvg * ( Cos1 * c0 + Sin1 * c0p );
	}

	// Flip tangents so they point in u/v direction
	if ( idx & 1 )
	{
		V_swap( limitTanU, limitTanV );
	}

	limitTanU = MulSIMD( limitTanU, tanUSign[idx] );
	limitTanV = MulSIMD( limitTanV, tanVSign[idx] );
}

FORCEINLINE void ComputeACCEdgePositions( fltx4 *pPos, unsigned short *oneRing, unsigned short centerOffset,
									unsigned short bndEdge, unsigned short bndVtx0, unsigned short bndVtx1,
									unsigned short cornerVtx0, unsigned short cornerVtx1,
									unsigned short edgeBias0, unsigned short edgeBias1,
									unsigned short val0, unsigned short val1,
									unsigned short minOneRingOffset, unsigned short vtx1RingSize,
									fltx4 &edgePos0, fltx4 &edgePos1)
{
	VPROF_BUDGET( "ComputeACCEdgePositions (SIMD)", _T("SubD Rendering") );

	if ( bndVtx0 )
	{
		val0 = 2*(val0 - 1);
	}

	if ( bndVtx1 )
	{
		val1 = 2*(val1 - 1);
	}

	Assert( val0 <= MAX_VALENCE );
	Assert( val1 <= MAX_VALENCE );

	fltx4 *pStencil0 = (bndEdge) ? (cornerVtx0) ? sPosEdge1CornerStencil[ val0 ] : sPosEdge1BndStencil[ val0 ] : sPosEdge1Stencil[ val0 ];
	fltx4 *pStencil1 = (bndEdge) ? (cornerVtx1) ? sPosEdge2CornerStencil[ val1 ] : sPosEdge2BndStencil[ val1 ] : sPosEdge2Stencil[ val1 ];

	int kEnd = (bndEdge) ? 4 : 6;

	if ( ( edgeBias0 == 16384 ) && ( edgeBias1 == 16384 ) )
	{
		int oneRingIndex[6] = { 0, 0, 0, 0, 0, 0 };
		for ( int i = 1; i < kEnd; i++ )
		{
			oneRingIndex[i] = centerOffset + i - 1;
		}

		edgePos0 = edgePos1 = Four_Zeros;
		for ( int k = 0; k < kEnd; k++ )
		{
			int idx = ( k + minOneRingOffset ) % kEnd;	// Offset to min index to enforce evaluation order between neighboring patches
			edgePos0 = MaddSIMD( pStencil0[idx], pPos[ oneRing[ oneRingIndex[idx] ] ], edgePos0 );
			edgePos1 = MaddSIMD( pStencil1[idx], pPos[ oneRing[ oneRingIndex[idx] ] ], edgePos1 );
		}
	}
	else
	{
		fltx4 b0, b1;
		b1 = ReplicateX4( edgeBias0 / 32768.0f );
		b0 = SubSIMD( Four_Ones, b1 );
		edgePos0 = DivSIMD( ( Four_Valence[val0]*pPos[ oneRing[0] ] +
					    Four_Twos*b0*pPos[ oneRing[ centerOffset] ] +
							   b0*pPos[ oneRing[centerOffset + 1] ] +
						Four_Twos*pPos[ oneRing[centerOffset + 2] ] +
							   b1*pPos[ oneRing[centerOffset + 3] ] +
					 Four_Twos*b1*pPos[ oneRing[centerOffset + 4] ] ), Four_ValencePlus5[val0] );

		b1 = ReplicateX4( edgeBias1 / 32768.0f );
		b0 = SubSIMD( Four_Ones, b1 );
		edgePos1 = DivSIMD( ( Four_Twos*pPos[ oneRing[0] ] +
					  b0*pPos[ oneRing[centerOffset + 0] ] +
			Four_Twos*b0*pPos[ oneRing[centerOffset + 1] ] +
	  Four_Valence[val1]*pPos[ oneRing[centerOffset + 2] ] +
			Four_Twos*b1*pPos[ oneRing[centerOffset + 3] ] +
					  b1*pPos[ oneRing[centerOffset + 4] ] ), Four_ValencePlus5[val0] );
	}
}

FORCEINLINE void ComputeACCInteriorPosition( fltx4 *pPos, unsigned short *oneRing, unsigned short centerOffset, unsigned short bndVtx, unsigned short valence, fltx4 &interiorPos )
{
	VPROF_BUDGET( "ComputeACCInteriorPosition (SIMD)", _T( "SubD Rendering" ) );

	if ( bndVtx )
	{
		valence = valence > 2 ?  2 * (valence - 1) : 4 * (valence - 1);
	}

	Assert( valence <= MAX_VALENCE );

	fltx4 *pStencil = sPosInteriorStencil[ valence ];

	interiorPos = MulSIMD( pStencil[0], pPos[ oneRing[0] ] );
	for ( int k = 1; k < 4; k++ )
	{
		interiorPos = MaddSIMD( pStencil[k], pPos[ oneRing[ centerOffset + k - 1 ] ], interiorPos );
	}
}

FORCEINLINE void ComputeACCGeometryPatchTangents( fltx4 *Pos, fltx4 *TanU, fltx4 *TanV )
{
	//VPROF_BUDGET( "ComputeACCGeometryPatchTangents", _T("SubD Rendering") );
	TanU[0] = MulSIMD( Four_Threes, SubSIMD( Pos[1], Pos[0] ) );
	TanV[0] = MulSIMD( Four_Threes, SubSIMD( Pos[4], Pos[0] ) );
	TanU[3] = MulSIMD( Four_Threes, SubSIMD( Pos[5], Pos[4] ) );
	TanV[1] = MulSIMD( Four_Threes, SubSIMD( Pos[5], Pos[1] ) );
	TanU[6] = MulSIMD( Four_Threes, SubSIMD( Pos[9], Pos[8] ) );
	TanV[2] = MulSIMD( Four_Threes, SubSIMD( Pos[6], Pos[2] ) );
	TanU[9] = MulSIMD( Four_Threes, SubSIMD( Pos[13], Pos[12] ) );
	TanV[3] = MulSIMD( Four_Threes, SubSIMD( Pos[7], Pos[3] ) );
	TanU[1] = MulSIMD( Four_Threes, SubSIMD( Pos[2], Pos[1] ) );
	TanV[4] = MulSIMD( Four_Threes, SubSIMD( Pos[8], Pos[4] ) );
	TanU[4] = MulSIMD( Four_Threes, SubSIMD( Pos[6], Pos[5] ) );
	TanV[5] = MulSIMD( Four_Threes, SubSIMD( Pos[9], Pos[5] ) );
	TanU[7] = MulSIMD( Four_Threes, SubSIMD( Pos[10], Pos[9] ) );
	TanV[6] = MulSIMD( Four_Threes, SubSIMD( Pos[10], Pos[6] ) );
	TanU[10] = MulSIMD( Four_Threes, SubSIMD( Pos[14], Pos[13] ) );
	TanV[7] = MulSIMD( Four_Threes, SubSIMD( Pos[11], Pos[7] ) );
	TanU[2] = MulSIMD( Four_Threes, SubSIMD( Pos[3], Pos[2] ) );
	TanV[8] = MulSIMD( Four_Threes, SubSIMD( Pos[12], Pos[8] ) );
	TanU[5] = MulSIMD( Four_Threes, SubSIMD( Pos[7], Pos[6] ) );
	TanV[9] = MulSIMD( Four_Threes, SubSIMD( Pos[13], Pos[9] ) );
	TanU[8] = MulSIMD( Four_Threes, SubSIMD( Pos[11], Pos[10] ) );
	TanV[10] = MulSIMD( Four_Threes, SubSIMD( Pos[14], Pos[10] ) );
	TanU[11] = MulSIMD( Four_Threes, SubSIMD( Pos[15], Pos[14] ) );
	TanV[11] = MulSIMD( Four_Threes, SubSIMD( Pos[15], Pos[11] ) );
}

void ComputeACCAllPatches( fltx4* pPos, TopologyIndexStruct* quad, Vector4D* Pos, Vector4D* TanU, Vector4D* TanV, bool bRegularPatch )
{
	VPROF_BUDGET( "ComputeACCAllPatches (SIMD)", _T( "SubD Rendering" ) );
	int accCorner[]     = { 0, 3, 15, 12 };
	int accEdge1[]      = { 4, 2, 11, 13 };
	int accEdge2[]      = { 8, 1, 7,  14 };
	int accInterior[]   = { 5, 6, 10, 9  };
	int accTanCornerU[] = { 0, 2, 11, 9  };  // counterclockwise orders!
	int accTanCornerV[] = { 0, 3, 11, 8  };

	fltx4 OutPos[16], OutTanU[16], OutTanV[16];

	// Point to four one-rings
	int vtx1RingStart = 0;
	unsigned short* pOneRing[4];
	for ( int i = 0; i < 4; i++ )
	{
		unsigned short vtx1RingSize = quad->vtx1RingSize[i];
		pOneRing[i] = &(quad->oneRing[vtx1RingStart]);
		vtx1RingStart += vtx1RingSize;
	}

	{
		VPROF_BUDGET( "ComputeACCAllPatches - Geometry Control Points (SIMD)", _T( "SubD Rendering" ) );

		ComputeCatmullClarkLimitPosition( pPos, pOneRing[0], quad->vtx1RingSize[0], quad->minOneRingOffset[0], quad->bndVtx[0], quad->cornerVtx[0], quad->valences[0], OutPos[ accCorner[0] ] );
		ComputeCatmullClarkLimitPosition( pPos, pOneRing[1], quad->vtx1RingSize[1], quad->minOneRingOffset[1], quad->bndVtx[1], quad->cornerVtx[1], quad->valences[1], OutPos[ accCorner[1] ] );
		ComputeCatmullClarkLimitPosition( pPos, pOneRing[2], quad->vtx1RingSize[2], quad->minOneRingOffset[2], quad->bndVtx[2], quad->cornerVtx[2], quad->valences[2], OutPos[ accCorner[2] ] );
		ComputeCatmullClarkLimitPosition( pPos, pOneRing[3], quad->vtx1RingSize[3], quad->minOneRingOffset[3], quad->bndVtx[3], quad->cornerVtx[3], quad->valences[3], OutPos[ accCorner[3] ] );

		ComputeACCEdgePositions( pPos, pOneRing[0], quad->vtx1RingCenterQuadOffset[0],
								 quad->bndEdge[3], quad->bndVtx[0], quad->bndVtx[3],
								 quad->cornerVtx[0], quad->cornerVtx[3],
								 quad->edgeBias[6], quad->edgeBias[7],
								 quad->valences[0], quad->valences[3],
								 quad->minOneRingOffset[0], quad->vtx1RingSize[0],
								 OutPos[accEdge1[0]], OutPos[accEdge2[0]] );
		ComputeACCEdgePositions( pPos, pOneRing[1], quad->vtx1RingCenterQuadOffset[1],
								 quad->bndEdge[0], quad->bndVtx[1], quad->bndVtx[0],
								 quad->cornerVtx[1], quad->cornerVtx[0],
								 quad->edgeBias[0], quad->edgeBias[1],
								 quad->valences[1], quad->valences[0],
								 quad->minOneRingOffset[1], quad->vtx1RingSize[1],
								 OutPos[accEdge1[1]], OutPos[accEdge2[1]] );
		ComputeACCEdgePositions( pPos, pOneRing[2], quad->vtx1RingCenterQuadOffset[2],
								 quad->bndEdge[1], quad->bndVtx[2], quad->bndVtx[1],
								 quad->cornerVtx[2], quad->cornerVtx[1],
								 quad->edgeBias[2], quad->edgeBias[3],
								 quad->valences[2], quad->valences[1],
								 quad->minOneRingOffset[2], quad->vtx1RingSize[2],
								 OutPos[accEdge1[2]], OutPos[accEdge2[2]] );
		ComputeACCEdgePositions( pPos, pOneRing[3], quad->vtx1RingCenterQuadOffset[3],
								 quad->bndEdge[2], quad->bndVtx[3], quad->bndVtx[2],
								 quad->cornerVtx[3], quad->cornerVtx[2],
								 quad->edgeBias[4], quad->edgeBias[5],
								 quad->valences[3], quad->valences[2],
								 quad->minOneRingOffset[3], quad->vtx1RingSize[3],
								 OutPos[accEdge1[3]], OutPos[accEdge2[3]] );

		ComputeACCInteriorPosition( pPos, pOneRing[0], quad->vtx1RingCenterQuadOffset[0], quad->bndVtx[0], quad->valences[0], OutPos[ accInterior[0] ] );
		ComputeACCInteriorPosition( pPos, pOneRing[1], quad->vtx1RingCenterQuadOffset[1], quad->bndVtx[1], quad->valences[1], OutPos[ accInterior[1] ] );
		ComputeACCInteriorPosition( pPos, pOneRing[2], quad->vtx1RingCenterQuadOffset[2], quad->bndVtx[2], quad->valences[2], OutPos[ accInterior[2] ] );
		ComputeACCInteriorPosition( pPos, pOneRing[3], quad->vtx1RingCenterQuadOffset[3], quad->bndVtx[3], quad->valences[3], OutPos[ accInterior[3] ] );
	}

#if !defined( NO_TANGENTS )
	// Don't compute tangents for regular patches
#if defined( SEPARATE_REGULAR_AND_EXTRA )
	if ( !bRegularPatch )
#endif
	{
		VPROF_BUDGET( "ComputeACCAllPatches - Tangents (SIMD)", _T( "SubD Rendering" ) );

		ComputeACCGeometryPatchTangents( OutPos, OutTanU, OutTanV );

		float flLoopGap[4];
		flLoopGap[0] = ( M_PI2 * quad->loopGapAngle[0] ) / 65535.0f;
		flLoopGap[1] = ( M_PI2 * quad->loopGapAngle[1] ) / 65535.0f;
		flLoopGap[2] = ( M_PI2 * quad->loopGapAngle[2] ) / 65535.0f;
		flLoopGap[3] = ( M_PI2 * quad->loopGapAngle[3] ) / 65535.0f;
		if ( !sShowACCGeometryTangents )
		{
			{
				ComputeCatmullClarkLimitTangents( 0, pPos, pOneRing[0], quad->vtx1RingSize[0], quad->vtx1RingCenterQuadOffset[0],
					quad->bndVtx[0], quad->cornerVtx[0], quad->valences[0], flLoopGap[0], OutTanU[ accTanCornerU[0] ], OutTanV[ accTanCornerV[0] ] );
				ComputeCatmullClarkLimitTangents( 1, pPos, pOneRing[1], quad->vtx1RingSize[1], quad->vtx1RingCenterQuadOffset[1],
					quad->bndVtx[1], quad->cornerVtx[1], quad->valences[1], flLoopGap[1], OutTanU[ accTanCornerU[1] ], OutTanV[ accTanCornerV[1] ] );
				ComputeCatmullClarkLimitTangents( 2, pPos, pOneRing[2], quad->vtx1RingSize[2], quad->vtx1RingCenterQuadOffset[2],
					quad->bndVtx[2], quad->cornerVtx[2], quad->valences[2], flLoopGap[2], OutTanU[ accTanCornerU[2] ], OutTanV[ accTanCornerV[2] ] );
				ComputeCatmullClarkLimitTangents( 3, pPos, pOneRing[3], quad->vtx1RingSize[3], quad->vtx1RingCenterQuadOffset[3],
					quad->bndVtx[3], quad->cornerVtx[3], quad->valences[3], flLoopGap[3], OutTanU[ accTanCornerU[3] ], OutTanV[ accTanCornerV[3] ] );
			}

			// compute correction component to boundary tangents for tangent plane continuity
			//                             /TanV/ /TanU/ / TanV / /TanU/
			static int   CB_CornerIdx[]   = {0,1,2, 3,7,11, 11,10,9, 8,4,0 };
			static int   CB_InteriorIdx[] = {1,2,   5,8,    10,9,    6,3 };
			static fltx4 CB_sign[4] = {Four_Ones,Four_NegativeOnes,Four_Ones,Four_NegativeOnes};

			{
				// Unroll, since the compiler wants to keep it rolled, and we get better perf unrolled
				{
					fltx4 u00 = OutTanU[CB_CornerIdx[0]];
					fltx4 u10 = MulSIMD( OutTanU[CB_CornerIdx[1]], Four_Twos );
					fltx4 u20 = OutTanU[CB_CornerIdx[2]];

					int val0 = quad->valences[0];	int val1 = quad->valences[1];
					if ( quad->bndVtx[0] ) val0--;
					if ( quad->bndVtx[1] ) val1--;

					fltx4 c0 = ReplicateX4( cosf( (flLoopGap[0]) / val0 ) );
					fltx4 c1 = ReplicateX4( cosf( (flLoopGap[1]) / val1 ) );

					fltx4 A = MulSIMD( c0, u10 ); fltx4 B = MulSIMD( c1, u00 );	fltx4 C = MulSIMD( c0, u20 ); fltx4 D = MulSIMD( c1, u10 );
					fltx4 E = DivSIMD( SubSIMD( A, B ), Four_Threes ); fltx4 F = DivSIMD( SubSIMD( C, D ), Four_Threes );

					OutTanV[CB_InteriorIdx[0] ] = AddSIMD( OutTanV[CB_InteriorIdx[0] ], E );
					OutTanV[CB_InteriorIdx[1] ] = AddSIMD( OutTanV[CB_InteriorIdx[1] ], F );
				}

				{
					fltx4 u00 = OutTanV[CB_CornerIdx[3]];
					fltx4 u10 = MulSIMD( OutTanV[CB_CornerIdx[4]], Four_Twos );
					fltx4 u20 = OutTanV[CB_CornerIdx[5]];

					int val0 = quad->valences[1];	int val1 = quad->valences[2];
					if ( quad->bndVtx[1] ) val0--;
					if ( quad->bndVtx[2] ) val1--;

					fltx4 c0 = ReplicateX4( cosf( (flLoopGap[1]) / val0 ) );
					fltx4 c1 = ReplicateX4( cosf( (flLoopGap[2]) / val1 ) );

					fltx4 A = MulSIMD( c0, u10 ); fltx4 B = MulSIMD( c1, u00 );	fltx4 C = MulSIMD( c0, u20 ); fltx4 D = MulSIMD( c1, u10 );
					fltx4 E = DivSIMD( SubSIMD( A, B ), Four_Threes ); fltx4 F = DivSIMD( SubSIMD( C, D ), Four_Threes );

					OutTanU[CB_InteriorIdx[2] ] = SubSIMD( OutTanU[CB_InteriorIdx[2] ], E );
					OutTanU[CB_InteriorIdx[3] ] = SubSIMD( OutTanU[CB_InteriorIdx[3] ], F );
				}

				{
					fltx4 u00 = OutTanU[CB_CornerIdx[6]];
					fltx4 u10 = MulSIMD( OutTanU[CB_CornerIdx[7]], Four_Twos );
					fltx4 u20 = OutTanU[CB_CornerIdx[8]];

					int val0 = quad->valences[2];	int val1 = quad->valences[3];
					if ( quad->bndVtx[2] ) val0--;
					if ( quad->bndVtx[3] ) val1--;

					fltx4 c0 = ReplicateX4( cosf( (flLoopGap[2]) / val0 ) );
					fltx4 c1 = ReplicateX4( cosf( (flLoopGap[3]) / val1 ) );

					fltx4 A = MulSIMD( c0, u10 ); fltx4 B = MulSIMD( c1, u00 );	fltx4 C = MulSIMD( c0, u20 ); fltx4 D = MulSIMD( c1, u10 );
					fltx4 E = DivSIMD( SubSIMD( A, B ), Four_Threes ); fltx4 F = DivSIMD( SubSIMD( C, D ), Four_Threes );

					OutTanV[CB_InteriorIdx[4] ] = AddSIMD( OutTanV[CB_InteriorIdx[4] ], E );
					OutTanV[CB_InteriorIdx[5] ] = AddSIMD( OutTanV[CB_InteriorIdx[5] ], F );
				}

				{
					fltx4 u00 = OutTanV[CB_CornerIdx[9]];
					fltx4 u10 = MulSIMD( OutTanV[CB_CornerIdx[10]], Four_Twos );
					fltx4 u20 = OutTanV[CB_CornerIdx[11]];

					int val0 = quad->valences[3];	int val1 = quad->valences[0];
					if ( quad->bndVtx[3] ) val0--;
					if ( quad->bndVtx[0] ) val1--;

					fltx4 c0 = ReplicateX4( cosf( (flLoopGap[3]) / val0 ) );
					fltx4 c1 = ReplicateX4( cosf( (flLoopGap[0]) / val1 ) );

					fltx4 A = MulSIMD( c0, u10 ); fltx4 B = MulSIMD( c1, u00 );	fltx4 C = MulSIMD( c0, u20 ); fltx4 D = MulSIMD( c1, u10 );
					fltx4 E = DivSIMD( SubSIMD( A, B ), Four_Threes ); fltx4 F = DivSIMD( SubSIMD( C, D ), Four_Threes );

					OutTanU[CB_InteriorIdx[6] ] = SubSIMD( OutTanU[CB_InteriorIdx[6] ], E );
					OutTanU[CB_InteriorIdx[7] ] = SubSIMD( OutTanU[CB_InteriorIdx[7] ], F );
				}
			}
		}

		StoreAlignedSIMD( (float*)&TanU[0], OutTanU[0] );
		StoreAlignedSIMD( (float*)&TanU[1], OutTanU[1] );
		StoreAlignedSIMD( (float*)&TanU[2], OutTanU[2] );
		StoreAlignedSIMD( (float*)&TanU[3], OutTanU[3] );
		StoreAlignedSIMD( (float*)&TanU[4], OutTanU[4] );
		StoreAlignedSIMD( (float*)&TanU[5], OutTanU[5] );
		StoreAlignedSIMD( (float*)&TanU[6], OutTanU[6] );
		StoreAlignedSIMD( (float*)&TanU[7], OutTanU[7] );
		StoreAlignedSIMD( (float*)&TanU[8], OutTanU[8] );
		StoreAlignedSIMD( (float*)&TanU[9], OutTanU[9] );
		StoreAlignedSIMD( (float*)&TanU[10], OutTanU[10] );
		StoreAlignedSIMD( (float*)&TanU[11], OutTanU[11] );

		StoreAlignedSIMD( (float*)&TanV[0], OutTanV[0] );
		StoreAlignedSIMD( (float*)&TanV[1], OutTanV[1] );
		StoreAlignedSIMD( (float*)&TanV[2], OutTanV[2] );
		StoreAlignedSIMD( (float*)&TanV[3], OutTanV[3] );
		StoreAlignedSIMD( (float*)&TanV[4], OutTanV[4] );
		StoreAlignedSIMD( (float*)&TanV[5], OutTanV[5] );
		StoreAlignedSIMD( (float*)&TanV[6], OutTanV[6] );
		StoreAlignedSIMD( (float*)&TanV[7], OutTanV[7] );
		StoreAlignedSIMD( (float*)&TanV[8], OutTanV[8] );
		StoreAlignedSIMD( (float*)&TanV[9], OutTanV[9] );
		StoreAlignedSIMD( (float*)&TanV[10], OutTanV[10] );
		StoreAlignedSIMD( (float*)&TanV[11], OutTanV[11] );
	}

#endif

	StoreAlignedSIMD( (float*)&Pos[0], OutPos[0] );
	StoreAlignedSIMD( (float*)&Pos[1], OutPos[1] );
	StoreAlignedSIMD( (float*)&Pos[2], OutPos[2] );
	StoreAlignedSIMD( (float*)&Pos[3], OutPos[3] );
	StoreAlignedSIMD( (float*)&Pos[4], OutPos[4] );
	StoreAlignedSIMD( (float*)&Pos[5], OutPos[5] );
	StoreAlignedSIMD( (float*)&Pos[6], OutPos[6] );
	StoreAlignedSIMD( (float*)&Pos[7], OutPos[7] );
	StoreAlignedSIMD( (float*)&Pos[8], OutPos[8] );
	StoreAlignedSIMD( (float*)&Pos[9], OutPos[9] );
	StoreAlignedSIMD( (float*)&Pos[10], OutPos[10] );
	StoreAlignedSIMD( (float*)&Pos[11], OutPos[11] );
	StoreAlignedSIMD( (float*)&Pos[12], OutPos[12] );
	StoreAlignedSIMD( (float*)&Pos[13], OutPos[13] );
	StoreAlignedSIMD( (float*)&Pos[14], OutPos[14] );
	StoreAlignedSIMD( (float*)&Pos[15], OutPos[15] );
}

#endif