csgo-2018-source/studiorender/r_studiosubd_patches.cpp
2021-07-24 21:11:47 -07:00

1535 lines
53 KiB
C++

#include "r_studiosubd_patches.h"
#include "tier1/convar.h"
#include <stdio.h>
#define PI 3.14159265
#ifdef _DEBUG
// Debug-build logs of the control points produced by the scalar patch builders
// in this file. The routines named below append to them; nothing in the
// visible part of this file clears or reads them.
// Appended once per call by ComputeCatmullClarkLimitPosition.
CUtlVector<Vector4D> g_DebugCornerPositions;
// Appended twice per call (edgePos0, edgePos1) by ComputeACCEdgePositions.
CUtlVector<Vector4D> g_DebugEdgePositions;
// Appended once per call by ComputeACCInteriorPosition.
CUtlVector<Vector4D> g_DebugInteriorPositions;
#endif
//----------------------------------------------------------------------------------------------
// static stencil buffers
//----------------------------------------------------------------------------------------------
#if !defined( USE_OPT )
// Scalar (non-SIMD) weight tables. For every two-dimensional table the first
// index is the valence; FillTables() fills rows 0..MAX_VALENCE.
// Limit-position weights, interior rule (read by ComputeCatmullClarkLimitPosition).
static float sPosCornerStencil[MAX_VALENCE+1][(MAX_VALENCE+1)*2];
// Six-entry edge control-point weights, interior rule (read by ComputeACCEdgePositions).
static float sPosEdge1Stencil[MAX_VALENCE+1][6];
static float sPosEdge2Stencil[MAX_VALENCE+1][6];
// Four-entry interior control-point weights (read by ComputeACCInteriorPosition).
static float sPosInteriorStencil[MAX_VALENCE+1][4];
// Limit-tangent weights for interior vertices.
static float sCCLimitTanStencil1[MAX_VALENCE+1][(MAX_VALENCE+1)*2+1];
static float sCCLimitTanStencil2[MAX_VALENCE+1][(MAX_VALENCE+1)*2+1];
// Limit-tangent weights for smooth boundary vertices.
static float sCCLimitTanBndStencil1[MAX_VALENCE+1][(MAX_VALENCE+1)*2+1];
static float sCCLimitTanBndStencil2[MAX_VALENCE+1][(MAX_VALENCE+1)*2+1];
// Limit-tangent weights for corner vertices. Filled by FillTables(); the only
// runtime reference visible in this file is commented out.
static float sCCLimitTanCornerStencil1[MAX_VALENCE+1][(MAX_VALENCE+1)*2+1];
static float sCCLimitTanCornerStencil2[MAX_VALENCE+1][(MAX_VALENCE+1)*2+1];
// NOTE(review): not referenced anywhere in the visible part of this file.
static float sPosGregoryInterior1Stencil[6];
static float sPosGregoryInterior2Stencil[6];
// Limit-position weights, boundary rule (read by ComputeCatmullClarkLimitPosition).
static float sPosCornerBndStencil[MAX_VALENCE+1][(MAX_VALENCE+1)*2+1];
// Edge control-point weights for boundary edges (read by ComputeACCEdgePositions).
static float sPosEdge1BndStencil[MAX_VALENCE+1][6];
static float sPosEdge2BndStencil[MAX_VALENCE+1][6];
// Filled by FillTables(); ComputeACCInteriorPosition reads sPosInteriorStencil
// instead, so no reader of this table is visible here.
static float sPosInteriorBndStencil[MAX_VALENCE+1][4];
// Edge control-point weights for boundary edges at corner vertices.
static float sPosEdge1CornerStencil[MAX_VALENCE+1][6];
static float sPosEdge2CornerStencil[MAX_VALENCE+1][6];
#endif
// Set to true at the end of FillTables(); makes later FillTables() calls return immediately.
static bool sTableInited = false;
// Written by set_CornerCorrection(). No reader is visible in this part of the file.
static bool sCornerCorrection = false;
// When true, ComputeACCTangentPatches keeps the tangents derived from the
// geometry patch and skips the limit-tangent / correction pass.
static bool sShowACCGeometryTangents = false;
// When false, ComputeCatmullClarkLimitTangents forces cornerVtx to 0 before
// choosing which tangent rule to apply.
static bool sUseCornerTangents = true;
// Stores v in sShowACCGeometryTangents, the flag ComputeACCTangentPatches
// tests before overwriting the geometry-patch tangents.
void set_ShowACCGeometryTangents( bool v )
{
	sShowACCGeometryTangents = v;
}
// Stores v in sCornerCorrection. No reader of that flag appears in the
// visible part of this file.
void set_CornerCorrection( bool v )
{
	sCornerCorrection = v;
}
// Stores v in sUseCornerTangents; when false the limit-tangent routine
// treats every vertex as a non-corner (cornerVtx forced to 0).
void set_UseCornerTangents( bool v )
{
	sUseCornerTangents = v;
}
// Weight used when averaging geometry-patch tangents: sin( PI * j / n ).
// The product and quotient are evaluated in double precision (PI is a double
// literal) and the result is narrowed to float on return.
static float tangentAveraging( int n, int j)
{
	const float flDivisor = (float) n;
	return sin( PI * j / flDivisor );
}
//--------------------------------------------------------------------------------------
// Subdiv Stencils
//--------------------------------------------------------------------------------------
#if !defined( USE_OPT )
// Builds the limit-position weights for a vertex of valence n.
//   boundary      - nonzero selects the boundary rule, zero the interior rule
//   n             - valence; values below 1 are rejected and the buffer is
//                   left untouched
//   stencilBuffer - receives the weights. Slot 0 is the centre vertex.
//                   Interior rule: slots 2*i+1 get 4*scale and slots 2*i+2 get
//                   1*scale for i in [0,n), with scale = 1/(n*n + 5n), so the
//                   buffer must hold at least 2*n+1 floats.
//                   Boundary rule: 4/6 in slot 0, 1/6 in slots 1 and 2*(n-1)+1.
static void ComputeCatmullClarkLimitPosStencil(byte boundary, int n, float *stencilBuffer)
{
	VPROF_BUDGET( "ComputeCatmullClarkLimitPosStencil", _T("SubD Rendering") );

	// FillTables() asks for every valence starting at 0. With n == 0 the
	// boundary rule would store to slot 2*(n-1)+1 == -1, in front of the
	// caller's buffer, and the interior rule would divide by zero. A vertex
	// with no incident edge has no stencil, so leave the buffer alone.
	if ( n < 1 )
		return;

	memset(stencilBuffer, 0, 2*n*sizeof(float));
	if (!boundary)
	{
		float scale = 1.0f / (n*n + 5.0f*n);
		stencilBuffer[0] = n*n * scale;
		for (int i=0; i<n; i++)
		{
			stencilBuffer[2*i+1] = 4.0f * scale;
			stencilBuffer[2*i+2] = 1.0f * scale;
		}
	}
	else
	{
		// Only the centre and the first and last odd slots carry weight.
		int k = n-1;
		float s = 1.0f / 6.0f;
		stencilBuffer[0] = s * 4.0f;
		stencilBuffer[1] = s * 1.0f;
		stencilBuffer[2*k+1] = s * 1.0f;
	}
}
// Builds the two limit-tangent stencils for a vertex of valence n.
//   bndVtx         - false: interior rule; true: one of the boundary rules
//   cornerVtx      - with bndVtx set, selects the corner rule (only
//                    stencilBuffer1 is filled, and only for n > 2) instead of
//                    the smooth boundary rule
//   n              - valence; values below 1 are rejected and both buffers
//                    are left untouched
//   stencilBuffer1 - first tangent weights, slot 0 is the centre vertex
//   stencilBuffer2 - second tangent weights, same layout
// The first 2*n slots of both buffers are cleared before any weight is
// written; the interior rule additionally writes slot 2*n.
static void ComputeCatmullClarkLimitTanStencil(bool bndVtx, bool cornerVtx, const int n, float *stencilBuffer1, float *stencilBuffer2)
{
	VPROF_BUDGET( "ComputeCatmullClarkLimitTanStencil", _T("SubD Rendering") );

	// FillTables() asks for every valence starting at 0. With n == 0 the
	// smooth boundary rule below would store to slot 2*(n-1)+1 == -1 of both
	// buffers, i.e. in front of the caller's storage. The other rules write
	// nothing for n == 0, so returning here changes no valid table entry.
	if ( n < 1 )
		return;

	memset( stencilBuffer1, 0, sizeof(float) * 2*n );
	memset( stencilBuffer2, 0, sizeof(float) * 2*n );
	if ( !bndVtx )
	{
		float scale_beta = 1.0f / (n * sqrtf( 4.0f + cos( PI / n ) * cos( PI / n ) ) );
		float scale_alpha = 1.0f / n + cos( PI / n ) * scale_beta;
		for ( int i=0; i<n; i++ )
		{
			stencilBuffer1[2*i+1] = cos( 2*PI*i/n ) * scale_alpha;
			stencilBuffer1[2*i+2] = cos((2*PI*i+PI)/n ) * scale_beta;
			// Second tangent uses the same weights shifted by one slot pair.
			// For i == 0, j is -1; the cosines below then equal the ones for
			// j == n-1 up to rounding.
			int j = (i - 1)%n;
			stencilBuffer2[2*i+1] = cos( 2*PI*j/n ) * scale_alpha;
			stencilBuffer2[2*i+2] = cos((2*PI*j+PI)/n ) * scale_beta;
		}
	}
	else
	{
		// boundary vertex cases
		if ( cornerVtx )
		{
			if ( n<=2 )
				return;
			float sectorScale = 0, w;
			// treat first and last tangent (crease edges) separately
			w = tangentAveraging( n-1, 0 ); sectorScale += w;
			stencilBuffer1[ 1] += 0.5 * w;
			stencilBuffer1[ 0] += -0.5 * w;
			w = tangentAveraging( n-1, n-1 ); sectorScale += w;
			stencilBuffer1[ 2*(n-1)+1] += 0.5 * w;
			stencilBuffer1[ 0 ] += -0.5 * w;
			// inner tangents are computed using the 6 weights from the geometry edge construction.
			const float scale = 1.0f / (2.0f*n + 10.0f);	// same for every k
			for (int k=1; k<(n-1); k++)
			{
				w = tangentAveraging( n-1, k ); sectorScale += w;
				stencilBuffer1[ 0] += w * (2.0f*n * scale - 1.0f);
				stencilBuffer1[2*(k-1)+1] += w * 2.0f * scale;
				stencilBuffer1[2*(k-1)+2] += w * 1.0f * scale;
				stencilBuffer1[2*(k-1)+3] += w * 4.0f * scale;
				stencilBuffer1[2*(k-1)+4] += w * 1.0f * scale;
				stencilBuffer1[2*(k-1)+5] += w * 2.0f * scale;
			}
			// rescale weights by the summed averaging weights
			for (int k = 0; k<2*n; k++)
			{
				stencilBuffer1[k] /= sectorScale;
			}
		}
		else
		{
			// special case to avoid colinear tangents
			if ( n==2 )
			{
				float s = 1.0f / 2.0f;
				stencilBuffer1[1] = 1.0 * s;
				stencilBuffer1[3] =-1.0 * s;
				stencilBuffer2[1] =-1.0 * s;
				stencilBuffer2[3] = 1.0 * s;
				// regularization term to avoid collinearity and preserve limit normal at the boundary
				float eps = 1e-4;
				stencilBuffer1[0] += eps * (-4.0/3.0);
				stencilBuffer1[1] += eps * (1.0/2.0);
				stencilBuffer1[2] += eps * (1.0/3.0);
				stencilBuffer1[3] += eps * (1.0/2.0);
				stencilBuffer2[0] += eps * (-4.0/3.0);
				stencilBuffer2[1] += eps * (1.0/2.0);
				stencilBuffer2[2] += eps * (1.0/3.0);
				stencilBuffer2[3] += eps * (1.0/2.0);
			}
			else
			{
				int k = n-1;
				float c = cos( PI / k ), s=sin( PI / k );
				// First tangent: half the difference of the first and last odd slots.
				stencilBuffer1[2*0+1] = 0.5f;
				stencilBuffer1[2*k+1] = -0.5f;
				stencilBuffer2[0] = -4.0f*s / (3.0f*k + c); // gamma
				for (int i=0; i<k; ++i)
				{
					stencilBuffer2[2*i+1] = 4*sin(PI*i/k)/(3*k+c); // alpha_i
					stencilBuffer2[2*i+2] = (sin(PI*i/k)+sin(PI*(i+1)/k)) / (3.0f*k+c); // beta_i
				}
				// Overrides the alpha_0 written by the loop above.
				stencilBuffer2[2*0+1] = stencilBuffer2[2*k+1] = -( (1+2*c)*sqrt(1+c) ) / ( (3*k+c)*sqrt(1-c) ); // alpha_0, alpha_k
			}
		}
	}
}
// Fills the two six-entry edge control-point stencils for valence n.
//   boundary - nonzero: only slots 0 and 3 are used, weighted 2/3 : 1/3 for
//              stencilBuffer1 and 1/3 : 2/3 for stencilBuffer2
//   corner   - accepted for symmetry; corner and non-corner boundary edges
//              currently receive exactly the same weights
//   n        - valence, used by the interior rule only
// Interior rule: weights {2n,2,1,4,1,2} and {4,1,2,2n,2,1}, each divided by (2n + 10).
static void computeACCEdgePosStencils(byte boundary, byte corner, int n, float *stencilBuffer1, float *stencilBuffer2)
{
	VPROF_BUDGET( "ComputeACCEdgePosStencils", _T("SubD Rendering") );

	for ( int slot = 0; slot < 6; ++slot )
	{
		stencilBuffer1[slot] = 0.0f;
		stencilBuffer2[slot] = 0.0f;
	}

	if ( boundary )
	{
		// boundary stencil (identical for corner and smooth boundary)
		const float third = 1.0f / 3.0f;
		stencilBuffer1[0] = 2.0f * third;
		stencilBuffer1[3] = 1.0f * third;
		stencilBuffer2[0] = 1.0f * third;
		stencilBuffer2[3] = 2.0f * third;
		return;
	}

	const float scale = 1.0f / (2.0f*n + 10.0f);
	const float weights1[6] = { 2.0f*n * scale, 2.0f * scale, 1.0f * scale, 4.0f * scale, 1.0f * scale, 2.0f * scale };
	const float weights2[6] = { 4.0f * scale, 1.0f * scale, 2.0f * scale, 2.0f*n* scale, 2.0f * scale, 1.0f * scale };
	for ( int slot = 0; slot < 6; ++slot )
	{
		stencilBuffer1[slot] = weights1[slot];
		stencilBuffer2[slot] = weights2[slot];
	}
}
// Fills the four-entry interior control-point stencil for valence n:
// {n, 2, 1, 2} / (n + 5). The boundary argument is not consulted.
static void computeACCInteriorPosStencil(byte boundary, int n, float *stencilBuffer)
{
	VPROF_BUDGET( "ComputeACCInteriorPosStencil", _T("SubD Rendering") );

	const float scale = 1.0f / (n + 5.0f);
	const float doubled = 2.0f * scale;

	stencilBuffer[0] = n * scale;
	stencilBuffer[1] = doubled;
	stencilBuffer[2] = 1.0f * scale;
	stencilBuffer[3] = doubled;
}
void FillTables()
{
if ( sTableInited ) return;
for ( int val=0; val<=MAX_VALENCE; val++ )
{
// interior stencils
computeCatmullClarkLimitPosStencil(false, val, sPosCornerStencil[val]);
computeACCEdgePosStencils(false, false, val, sPosEdge1Stencil[val], sPosEdge2Stencil[val]);
computeACCInteriorPosStencil(false, val, sPosInteriorStencil[val]);
// boundary stencils
computeCatmullClarkLimitPosStencil(true, val, sPosCornerBndStencil[val]);
computeACCEdgePosStencils(true, false, val, sPosEdge1BndStencil[val], sPosEdge2BndStencil[val]);
computeACCEdgePosStencils(true, true, val, sPosEdge1CornerStencil[val], sPosEdge2CornerStencil[val]);
computeACCInteriorPosStencil(true, val, sPosInteriorBndStencil[val]);
computeCatmullClarkLimitTanStencil(false, false, val, sCCLimitTanStencil1[val], sCCLimitTanStencil2[val]);
computeCatmullClarkLimitTanStencil(true, false, val, sCCLimitTanBndStencil1[val], sCCLimitTanBndStencil2[val]);
computeCatmullClarkLimitTanStencil(true, true, val, sCCLimitTanCornerStencil1[val], sCCLimitTanCornerStencil2[val]);
}
sTableInited = true;
}
//--------------------------------------------------------------------------------------
// Runtime
//--------------------------------------------------------------------------------------
#ifdef _DEBUG
// Debug-build console variable "mat_tess_dump", default "0", flagged FCVAR_CHEAT.
// NOTE(review): no read of this ConVar is visible in this part of the file;
// presumably it gates a dump of tessellation data -- confirm against its users.
static ConVar mat_tess_dump( "mat_tess_dump", "0", FCVAR_CHEAT );
#endif
// Computes the corner control point of a patch: the limit position of the
// vertex whose one-ring is given.
//   pPos            - control-mesh positions, indexed through oneRing
//   oneRing         - vertex indices; entry 0 is the vertex itself
//   vtx1RingSize    - number of entries of oneRing that are summed
//   minOneRingIndex - rotation applied to the summation order
//   bndVtx          - nonzero selects the boundary stencil table
//   cornerVtx       - nonzero: the vertex position is returned unchanged
//   valence         - row of the stencil table to use
//   nbCorners       - not used by this routine
//   limitPos        - receives the result (also logged in _DEBUG builds)
inline void ComputeCatmullClarkLimitPosition( Vector4D *pPos, unsigned short *oneRing,
	unsigned short vtx1RingSize, unsigned short minOneRingIndex, unsigned short bndVtx,
	unsigned short cornerVtx, unsigned short valence, unsigned short nbCorners, Vector4D &limitPos )
{
	VPROF_BUDGET( "ComputeCatmullClarkLimitPosition", _T("SubD Rendering") );

	if ( cornerVtx > 0 )
	{
		limitPos = pPos[ oneRing[0] ];
	}
	else
	{
		assert( valence <= MAX_VALENCE );
		float *pWeights = bndVtx ? sPosCornerBndStencil[ valence ] : sPosCornerStencil[ valence ];

		// pWeights[0] is always the largest value (see Figures 4 and 5 in Loop and Schaefer),
		// so the centre term seeds the sum.
		limitPos = pWeights[0] * pPos[ oneRing[0] ];

		for ( int step = 0; step < vtx1RingSize; step++ )
		{
			// Rotate the walk so the minimum index consistently comes first.
			const int slot = ( step + minOneRingIndex ) % vtx1RingSize;
			if ( slot == 0 )
				continue;	// centre term already accumulated

			limitPos += pWeights[slot] * pPos[ oneRing[slot] ];
		}
	}

#ifdef _DEBUG
	g_DebugCornerPositions.AddToTail( limitPos );
#endif
}
// Three-component cross product of a and b; the w of the result is 0 and
// the w components of the inputs are ignored.
inline Vector4D CrossProduct(const Vector4D& a, const Vector4D& b)
{
	const float cx = a.y*b.z - a.z*b.y;
	const float cy = a.z*b.x - a.x*b.z;
	const float cz = a.x*b.y - a.y*b.x;
	return Vector4D( cx, cy, cz, 0.0f );
}
// Scales the x, y and z of vec by 1 / (length + FLT_EPSILON) in place and
// returns the length measured before scaling. w is neither read nor written.
inline float VectorNormalize(Vector4D& vec)
{
	const float length = sqrtf(vec.x*vec.x + vec.y*vec.y + vec.z*vec.z);

	// Adding FLT_EPSILON keeps the reciprocal finite for a zero-length input.
	const float invLength = 1.f / ( length + FLT_EPSILON );

	vec.x *= invLength;
	vec.y *= invLength;
	vec.z *= invLength;

	return length;
}
// Three-component dot product; the w components are ignored.
FORCEINLINE float DotProduct(const Vector4D& a, const Vector4D& b)
{
	const float xy = a.x*b.x + a.y*b.y;
	return ( xy + a.z*b.z );
}
// Computes the two limit tangents at patch corner idx (0..3) from the one-ring
// of that corner's vertex.
//   idx           - corner number; selects the U/V swap and the sign pair
//   pPos          - control-mesh positions, indexed through oneRing
//   oneRing       - vertex indices, entry 0 is the vertex itself
//   vtx1RingSize  - number of one-ring entries summed by the stencil rules
//   centerOffset  - offset of the current quad inside the one-ring
//   bndVtx        - nonzero for a boundary vertex
//   cornerVtx     - corner classification; forced to 0 when sUseCornerTangents is false
//   valence       - row of the stencil tables
//   loopGapAngle  - written only by the corner rule (16-bit quantised angle)
//   limitTanU/V   - results
// Three rules are used: interior stencil, smooth-boundary stencil followed by a
// rotation, and a geometric construction for corners.
// NOTE(review): a corner vertex with valence == 2 returns early, leaving
// limitTanU, limitTanV and loopGapAngle untouched and skipping the swap/sign
// step at the end.
inline void ComputeCatmullClarkLimitTangents( int idx, Vector4D *pPos, unsigned short *oneRing, unsigned short vtx1RingSize,
unsigned short centerOffset, unsigned short bndVtx, unsigned short cornerVtx,
unsigned short valence, unsigned short &loopGapAngle,
Vector4D &limitTanU, Vector4D &limitTanV )
{
// for valence=1, no need to have separate tangents
// Per-corner signs applied at the very end, indexed by idx.
float tanUSign[] = {1,-1,-1,1};
float tanVSign[] = {1,1,-1,-1};
VPROF_BUDGET( "ComputeCatmullClarkLimitTangents", _T("SubD Rendering") );
// With corner tangents disabled, corners fall through to the smooth boundary rule.
if ( !sUseCornerTangents )
cornerVtx = 0;
if ( !bndVtx ) // interior vertices
{
// Plain weighted sums of the one-ring with the two interior tangent stencils.
float *stencil1 = sCCLimitTanStencil1[ valence ];
float *stencil2 = sCCLimitTanStencil2[ valence ];
limitTanU = Vector4D(0,0,0,0);
limitTanV = Vector4D(0,0,0,0);
for (int k = 0; k < vtx1RingSize; ++k)
{
limitTanU += stencil1[k] * pPos[ oneRing[k] ];
limitTanV += stencil2[k] * pPos[ oneRing[k] ];
}
}
else if ( (!cornerVtx) || (cornerVtx == CORNER_WITH_SMOOTHBNDTANGENTS) ) // smooth boundary vertices
{
// r0 and r1 are the weighted sums for the two boundary stencils.
float *stencil1 = sCCLimitTanBndStencil1[ valence ];
float *stencil2 = sCCLimitTanBndStencil2[ valence ];
Vector4D r0 = Vector4D(0,0,0,0);
Vector4D r1 = Vector4D(0,0,0,0);
for (int k = 0; k < vtx1RingSize; ++k)
{
r0 += stencil1[k] * pPos[ oneRing[k] ];
r1 += stencil2[k] * pPos[ oneRing[k] ];
}
// j1/j2: consecutive sector indices derived from the quad's ring offset;
// K: number of sectors (valence - 1).
int j1 = (centerOffset - 1) / 2;
int j2 = j1+1;
int K = (valence - 1);
if (valence == 2)
{
limitTanU = r0;
limitTanV = r1;
}
else
{
// Rotate within the plane spanned by r0 and r1 by PI*j/K.
limitTanU = cos(PI*j1 / K) * r0 + sin(PI*j1 / K) * r1;
limitTanV = cos(PI*j2 / K) * r0 + sin(PI*j2 / K) * r1;
}
}
else // corner vertices
{
if ( valence == 2 )
return;
float *pEdgeStencil = sPosEdge1Stencil[ valence ];
// float *avgStencil = sCCLimitTanCornerStencil1[ valence ];
// compute tangents
// c0 / c1: vectors from the vertex to the first and last one-ring entries.
Vector4D c0 = pPos[ oneRing[1] ] - pPos[ oneRing[0] ]; c0.w = 0;
Vector4D c1 = pPos[ oneRing[vtx1RingSize - 1] ] - pPos[ oneRing[0] ]; c1.w = 0;
// e0 / e1: edge-stencil sums minus the vertex position, taken over the
// first five and the last five one-ring entries respectively.
Vector4D e0 = (pEdgeStencil[0] - 1.0f ) * pPos[ oneRing[0] ];
Vector4D e1 = (pEdgeStencil[0] - 1.0f ) * pPos[ oneRing[0] ];
// NOTE(review): reads oneRing[vtx1RingSize - 5 .. vtx1RingSize - 1]; this
// assumes the ring has at least 6 entries -- confirm against callers.
for (int k = 1; k < 6; k++ )
{
e0 += pEdgeStencil[k] * pPos[ oneRing[ k ] ];
e1 += pEdgeStencil[k] * pPos[ oneRing[ vtx1RingSize - 6 + k ] ];
}
e0.w = 0; e1.w = 0;
// compute average tangent plane normal
Vector4D n0 = CrossProduct( c0, e0 ); VectorNormalize( n0 );
Vector4D n1 = CrossProduct( e1, c1 ); VectorNormalize( n1 );
Vector4D N = n0 + n1;
// N = N - ( DotProduct( N, tAvg )/ DotProduct(tAvg, tAvg) ) * tAvg;
VectorNormalize( N );
// project into tangent plane
c0 = c0 - DotProduct(c0, N) * N;
c1 = c1 - DotProduct(c1, N) * N;
// NOTE(review): the divisions below have no guard against a zero length.
float c0l = Vector4DLength( c0 ); c0 = c0 / c0l;
float c1l = Vector4DLength( c1 ); c1 = c1 / c1l;
float cAvg = (c0l + c1l) / 2;
// compute angle
// c0p is c0 turned a quarter turn about N; (c0, c0p) is the in-plane basis.
Vector4D c0p = CrossProduct(N, c0);
float angle = PI - atan2( DotProduct(c0p, c1), -DotProduct(c0, c1) );
// Quantise to 16 bits: 65535 corresponds to 2*PI.
loopGapAngle = (unsigned int) ((65535.0 * angle) / (2*PI));
// compute final tangent vector
int j1 = (centerOffset - 1) / 2;
int j2 = j1+1;
int K = (valence - 1);
// Sweep from c0 by angle*j/K, scaled by the mean projected edge length.
limitTanU = cAvg * ( cos(angle*j1 / K) * c0 + sin(angle*j1 / K) * c0p );
limitTanV = cAvg * ( cos(angle*j2 / K) * c0 + sin(angle*j2 / K) * c0p );
}
// flip tangents so they point in u/v direction
if ( idx & 1 )
{
swap(limitTanU, limitTanV);
}
limitTanU *= tanUSign[idx];
limitTanV *= tanVSign[idx];
}
// Computes the two edge control points adjacent to one patch corner.
//   pPos / oneRing      - positions and the corner vertex's one-ring (entry 0 is the vertex)
//   centerOffset        - offset of the current quad inside the one-ring
//   bndEdge             - nonzero: use the four-term boundary stencils
//   bndVtx0/1           - nonzero: the matching valence is remapped to 2*(val - 1)
//   cornerVtx0/1        - with bndEdge set, select the corner stencil tables
//   loopGapAngle0/1     - not used by this routine
//   edgeBias0/1         - 16384 for both selects the stencil path; any other
//                         value selects the explicit biased formula (bias = value / 32768)
//   val0 / val1         - valences of the two edge end points
//   minOneRingOffset    - rotation applied to the summation order
//   vtx1RingSize        - not used by this routine
//   edgePos0 / edgePos1 - results (also logged in _DEBUG builds)
inline void ComputeACCEdgePositions( Vector4D *pPos, unsigned short *oneRing, unsigned short centerOffset,
	unsigned short bndEdge, unsigned short bndVtx0, unsigned short bndVtx1,
	unsigned short cornerVtx0, unsigned short cornerVtx1, unsigned short loopGapAngle0, unsigned short loopGapAngle1,
	unsigned short edgeBias0, unsigned short edgeBias1, unsigned short val0, unsigned short val1,
	unsigned short minOneRingOffset, unsigned short vtx1RingSize,
	Vector4D &edgePos0, Vector4D &edgePos1)
{
	VPROF_BUDGET( "ComputeACCEdgePositions", _T("SubD Rendering") );

	if ( bndVtx0 )
		val0 = 2*(val0 - 1);
	if ( bndVtx1 )
		val1 = 2*(val1 - 1);

	Assert( val0 <= MAX_VALENCE );
	Assert( val1 <= MAX_VALENCE );

	// Pick the stencil rows: interior, boundary, or boundary-at-corner.
	float *pWeights0;
	float *pWeights1;
	int termCount;
	if ( !bndEdge )
	{
		pWeights0 = sPosEdge1Stencil[ val0 ];
		pWeights1 = sPosEdge2Stencil[ val1 ];
		termCount = 6;
	}
	else
	{
		if ( cornerVtx0 )
			pWeights0 = sPosEdge1CornerStencil[ val0 ];
		else
			pWeights0 = sPosEdge1BndStencil[ val0 ];

		if ( cornerVtx1 )
			pWeights1 = sPosEdge2CornerStencil[ val1 ];
		else
			pWeights1 = sPosEdge2BndStencil[ val1 ];

		termCount = 4;
	}

	const bool bUnbiased = ( edgeBias0 == 16384 ) && ( edgeBias1 == 16384 );
	if ( bUnbiased )
	{
		// Slot 0 is the vertex itself; slots 1.. walk the ring from centerOffset.
		int ringSlot[6] = { 0, 0, 0, 0, 0, 0 };
		for ( int s = 1; s < termCount; s++ )
		{
			ringSlot[s] = centerOffset + s - 1;
		}

		edgePos0 = edgePos1 = Vector4D(0,0,0,0);
		for ( int step = 0; step < termCount; step++ )
		{
			// Offset to min index to enforce evaluation order between neighboring patches
			const int slot = ( step + minOneRingOffset ) % termCount;
			const Vector4D &p = pPos[ oneRing[ ringSlot[slot] ] ];
			edgePos0 += pStencilTerm( pWeights0, slot ) * p;
			edgePos1 += pStencilTerm( pWeights1, slot ) * p;
		}
	}
	else
	{
		{
			float hi = edgeBias0 / 32768.0;
			float lo = 1.0f - hi;
			edgePos0 = (val0 * pPos[ oneRing[0] ] + 2*lo*pPos[ oneRing[centerOffset + 0] ] + 1*lo*pPos[ oneRing[centerOffset + 1] ] + 2*pPos[ oneRing[centerOffset + 2] ] + 1*hi*pPos[ oneRing[centerOffset + 3] ] + 2*hi*pPos[ oneRing[centerOffset + 4] ] ) / (val0 + 5.0f);
		}
		{
			float hi = edgeBias1 / 32768.0;
			float lo = 1.0f - hi;
			edgePos1 = ( 2 * pPos[ oneRing[0] ] + 1*lo*pPos[ oneRing[centerOffset + 0] ] + 2*lo*pPos[ oneRing[centerOffset + 1] ] + val1*pPos[ oneRing[centerOffset + 2] ] + 2*hi*pPos[ oneRing[centerOffset + 3] ] + 1*hi*pPos[ oneRing[centerOffset + 4] ] ) / (val1 + 5.0f);
		}
	}

#ifdef _DEBUG
	g_DebugEdgePositions.AddToTail( edgePos0 );
	g_DebugEdgePositions.AddToTail( edgePos1 );
#endif
}
// Computes the interior control point next to one patch corner as a four-term
// weighted sum: the corner vertex plus three one-ring entries starting at
// centerOffset. For a boundary vertex the valence is first remapped to
// 2*(valence - 1), or 4*(valence - 1) when valence <= 2. The weights always
// come from sPosInteriorStencil. The result is also logged in _DEBUG builds.
inline void ComputeACCInteriorPosition( Vector4D *pPos, unsigned short *oneRing, unsigned short centerOffset, unsigned short bndVtx, unsigned short valence, Vector4D &interiorPos )
{
	VPROF_BUDGET( "ComputeACCInteriorPosition", _T("SubD Rendering") );

	if ( bndVtx )
	{
		if ( valence > 2 )
			valence = 2*(valence - 1);
		else
			valence = 4*(valence - 1);
	}
	Assert( valence<=MAX_VALENCE );

	float *pWeights = sPosInteriorStencil[ valence ];
	interiorPos = pWeights[0] * pPos[ oneRing[0] ];
	for ( int ringStep = 0; ringStep < 3; ++ringStep )
	{
		interiorPos += pWeights[ringStep + 1] * pPos[ oneRing[ centerOffset + ringStep ] ];
	}

#ifdef _DEBUG
	g_DebugInteriorPositions.AddToTail( interiorPos );
#endif
}
// Derives tangent control points from the 4x4 position grid Pos (row stride 4):
//   TanU (4 rows x 3 columns, stride 3) = 3 * difference of horizontal neighbours
//   TanV (3 rows x 4 columns, stride 4) = 3 * difference of vertical neighbours
inline void ComputeACCGeometryPatchTangents( Vector4D *Pos, Vector4D *TanU, Vector4D *TanV )
{
	VPROF_BUDGET( "ComputeACCGeometryPatchTangents", _T("SubD Rendering") );

	for ( int minor = 0; minor < 3; minor++ )
	{
		for ( int major = 0; major < 4; major++ )
		{
			const int uBase = major*4 + minor;	// left element of the horizontal pair
			TanU[major*3 + minor] = 3*( Pos[uBase + 1] - Pos[uBase] );

			const int vBase = minor*4 + major;	// upper element of the vertical pair
			TanV[vBase] = 3*( Pos[vBase + 4] - Pos[vBase] );
		}
	}
}
// Fills the 16 control points of the bicubic geometry patch for one quad.
//   pPos - control-mesh positions
//   quad - topology record: per-corner one-ring sizes, offsets, boundary and
//          corner flags, valences and edge biases, plus the concatenated
//          one-ring index list
//   Pos  - receives the 4x4 control grid (row stride 4)
// For each of the four corners this computes the corner point, the two
// adjacent edge points and the adjacent interior point. The one-rings of the
// corners are stored back to back in quad->oneRing.
void ComputeACCGeometryPatch( Vector4D* pPos, TopologyIndexStruct *quad, Vector4D* Pos)
{
	VPROF_BUDGET( "ComputeACCGeometryPatch", _T("SubD Rendering") );

	// MOD4[i+3] is the previous corner, (i - 1) mod 4, without a negative operand.
	int MOD4[8] = {0,1,2,3,0,1,2,3};

	// Destination slots in Pos for corner i.
	int accCorner[] = {0,3,15,12};
	int accEdge1[] = {4,2,11,13};
	int accEdge2[] = {8,1,7,14};
	int accInterior[] = {5,6,10,9};

	int vtx1RingStart = 0;
	unsigned short *oneRing = quad->oneRing;
	for ( int i=0; i<4; i++ ) // 4 corner vertices
	{
		ComputeCatmullClarkLimitPosition( pPos, &oneRing[vtx1RingStart], quad->vtx1RingSize[i], quad->minOneRingOffset[i], quad->bndVtx[i], quad->cornerVtx[i], quad->valences[i], quad->nbCornerVtx[i], Pos[ accCorner[i] ] );

		ComputeACCEdgePositions( pPos, &oneRing[vtx1RingStart], quad->vtx1RingCenterQuadOffset[i],
			quad->bndEdge[ MOD4[i+3] ],
			quad->bndVtx[i], quad->bndVtx[MOD4[i+3]],
			quad->cornerVtx[i], quad->cornerVtx[MOD4[i+3]],
			quad->loopGapAngle[i], quad->loopGapAngle[MOD4[i+3]],
			quad->edgeBias[ 2*MOD4[i+3] ], quad->edgeBias[ 2*MOD4[i+3] + 1 ],
			quad->valences[i], quad->valences[MOD4[i+3]],
			quad->minOneRingOffset[i], quad->vtx1RingSize[i],
			Pos[accEdge1[i]], Pos[accEdge2[i]] );

		// ComputeACCInteriorPosition in this file takes six parameters
		// (positions, ring, centre offset, boundary flag, valence, output);
		// it has no corner-flag or gap-angle parameter.
		ComputeACCInteriorPosition( pPos, &oneRing[vtx1RingStart], quad->vtx1RingCenterQuadOffset[i], quad->bndVtx[i], quad->valences[i], Pos[ accInterior[i] ] );

		vtx1RingStart += quad->vtx1RingSize[i];
	}
}
// Builds the two tangent patches for one quad.
//   pPos - control-mesh positions
//   quad - topology record for the quad
//   Pos  - the 4x4 geometry patch (input)
//   TanU - receives the 4x3 U tangent patch (row stride 3)
//   TanV - receives the 3x4 V tangent patch (row stride 4)
// Step 1 derives both tangent patches from the geometry patch. Unless
// NO_TANGENTS is defined or sShowACCGeometryTangents is set, step 2 replaces
// the four corner tangents with limit tangents and step 3 adds a correction to
// the two inner entries along every non-boundary edge.
void ComputeACCTangentPatches( Vector4D* pPos, TopologyIndexStruct* quad, Vector4D* Pos, Vector4D* TanU, Vector4D* TanV )
{
VPROF_BUDGET( "ComputeACCTangentPatches", _T("SubD Rendering") );
// MOD4[i+1] is the next corner, (i + 1) mod 4.
int MOD4[8] = {0,1,2,3,0,1,2,3};
// Slots of the four patch corners inside TanU (stride 3) and TanV (stride 4).
int accTanCornerU[] = {0,2,11,9}; // counterclockwise orders!
int accTanCornerV[] = {0,3,11,8};
unsigned short *oneRing = quad->oneRing;
ComputeACCGeometryPatchTangents(Pos, TanU, TanV);
#if !defined( NO_TANGENTS )
if ( !sShowACCGeometryTangents )
{
// compute corner tangents ( = subdivision surface limit tangents)
// The one-rings of the four corners are stored back to back in quad->oneRing.
int vtx1RingStart = 0;
for ( int i=0; i<4; i++ )
{
int vtx1RingSize = quad->vtx1RingSize[i];
Vector4D &accTanU = TanU[ accTanCornerU[i] ];
Vector4D &accTanV = TanV[ accTanCornerV[i] ];
ComputeCatmullClarkLimitTangents(i, pPos, &oneRing[vtx1RingStart], vtx1RingSize, quad->vtx1RingCenterQuadOffset[i], quad->bndVtx[i], quad->cornerVtx[i], quad->valences[i], quad->loopGapAngle[i], accTanU, accTanV );
vtx1RingStart += vtx1RingSize;
}
// compute correction component to boundary tangents for tangent plane continuity
// Three entries per edge: the tangents read along that edge. Edges 0 and 2
// read TanU, edges 1 and 3 read TanV.
// /TanV/ /TanU/ / TanV / /TanU/
static int CB_CornerIdx[] = {0,1,2, 3,7,11, 11,10,9, 8,4,0 };
// Two entries per edge: the inner cross-tangents that receive the correction.
static int CB_InteriorIdx[] = {1,2, 5,8, 10,9, 6,3 };
static float CB_sign[] = {1,-1,1,-1};
for ( int i=0; i<4; i++ ) // for all quad edges
{
if ( !quad->bndEdge[i] )
{
// Odd edges swap the roles of the two tangent patches.
Vector4D *CBTanV = (i&1) ? TanU : TanV;
Vector4D *CBTanU = (i&1) ? TanV : TanU;
// Copies taken before any entry is modified.
Vector4D u00 = CBTanU[CB_CornerIdx[3*i + 0]];
Vector4D u10 = CBTanU[CB_CornerIdx[3*i + 1]];
Vector4D u20 = CBTanU[CB_CornerIdx[3*i + 2]];
int val0 = quad->valences[i];
int val1 = quad->valences[MOD4[i+1]];
// Boundary vertices use one sector fewer.
if ( quad->bndVtx[i] )
val0--;
if ( quad->bndVtx[MOD4[i+1]] )
val1--;
// loopGapAngle is a 16-bit fraction of a full turn (65535 == 2*PI);
// c0/c1 are the cosines of that angle divided by the sector count.
// NOTE(review): val0 / val1 can reach 0 for a boundary vertex of
// valence 1; the division is not guarded.
float c0 = cos( (2*PI * quad->loopGapAngle[ i ] / 65535.0f) / val0 );
float c1 = cos( (2*PI * quad->loopGapAngle[MOD4[i+1]] / 65535.0f) / val1 );
CBTanV[ CB_InteriorIdx[2*i + 0] ] += CB_sign[i]*( 2*c0*u10 - c1*u00 )/3.0f;
CBTanV[ CB_InteriorIdx[2*i + 1] ] += CB_sign[i]*( c0*u20 - 2*c1*u10 )/3.0f;
}
}
}
#endif
}
#endif // !defined( USE_OPT )
#if defined( USE_OPT )
// Numerically this is 2*PI (not PI/2). Not referenced in the visible part of this file.
#define M_PI2 6.28318530717958647692f
// Broadcast constants, assigned in FillTables().
static fltx4 Four_NegativeThirds;
static fltx4 Four_Fives;
static fltx4 Four_Tens;
// Four_N[i] holds the value i in all four lanes, for i in 0..31.
static fltx4 Four_N[32];
static fltx4 Four_TwoPI;
// Four_Valence[i] = i and Four_ValencePlus5[i] = i + 5, for i in 0..MAX_VALENCE-1.
// NOTE(review): these have MAX_VALENCE entries while the stencil tables below
// have MAX_VALENCE+1 rows -- confirm nothing indexes them with MAX_VALENCE.
static fltx4 Four_Valence[MAX_VALENCE];
static fltx4 Four_ValencePlus5[MAX_VALENCE];
// SIMD stencil tables: every weight is replicated across the four lanes. The
// first index is the valence; FillTables() fills rows 0..MAX_VALENCE.
static fltx4 sPosCornerStencil[MAX_VALENCE+1][(MAX_VALENCE+1)*2];
static fltx4 sPosEdge1Stencil[MAX_VALENCE+1][6];
static fltx4 sPosEdge2Stencil[MAX_VALENCE+1][6];
static fltx4 sPosInteriorStencil[MAX_VALENCE+1][4];
static fltx4 sCCLimitTanStencil1[MAX_VALENCE+1][(MAX_VALENCE+1)*2+1];
static fltx4 sCCLimitTanStencil2[MAX_VALENCE+1][(MAX_VALENCE+1)*2+1];
static fltx4 sCCLimitTanBndStencil1[MAX_VALENCE+1][(MAX_VALENCE+1)*2+1];
static fltx4 sCCLimitTanBndStencil2[MAX_VALENCE+1][(MAX_VALENCE+1)*2+1];
static fltx4 sCCLimitTanCornerStencil1[MAX_VALENCE+1][(MAX_VALENCE+1)*2+1];
static fltx4 sCCLimitTanCornerStencil2[MAX_VALENCE+1][(MAX_VALENCE+1)*2+1];
static fltx4 sPosCornerBndStencil[MAX_VALENCE+1][(MAX_VALENCE+1)*2+1];
static fltx4 sPosEdge1BndStencil[MAX_VALENCE+1][6];
static fltx4 sPosEdge2BndStencil[MAX_VALENCE+1][6];
static fltx4 sPosInteriorBndStencil[MAX_VALENCE+1][4];
static fltx4 sPosEdge1CornerStencil[MAX_VALENCE+1][6];
static fltx4 sPosEdge2CornerStencil[MAX_VALENCE+1][6];
// sCCSinPI[j][k] / sCCCosPI[j][k] = sin / cos of PI*j/k, filled by
// ComputeACCSinCosPITables(). Column k == 0 comes from a division by zero.
static fltx4 sCCSinPI[MAX_VALENCE*2][MAX_VALENCE];
static fltx4 sCCCosPI[MAX_VALENCE*2][MAX_VALENCE];
// Valence_MinusOne[i] = i - 1 as a float, assigned in FillTables().
static float Valence_MinusOne[MAX_VALENCE];
// SIMD variant: builds the limit-position weights for a vertex of valence n,
// each weight replicated across the four lanes.
//   boundary      - nonzero selects the boundary rule, zero the interior rule
//   n             - valence; values below 1 are rejected and the buffer is
//                   left untouched
//   stencilBuffer - receives the weights. Slot 0 is the centre vertex.
//                   Interior rule: slots 2*i+1 get 4*scale and slots 2*i+2 get
//                   1*scale for i in [0,n), with scale = 1/(n*n + 5n), so the
//                   buffer must hold at least 2*n+1 entries.
//                   Boundary rule: 4/6 in slot 0, 1/6 in slots 1 and 2*(n-1)+1.
static void ComputeCatmullClarkLimitPosStencil(byte boundary, int n, fltx4 *stencilBuffer)
{
	VPROF_BUDGET( "ComputeCatmullClarkLimitPosStencil", _T("SubD Rendering") );

	// FillTables() asks for every valence starting at 0. With n == 0 the
	// boundary rule would store to slot 2*(n-1)+1 == -1, in front of the
	// caller's buffer, and the interior rule would divide by zero.
	if ( n < 1 )
		return;

	for ( int i=0; i<2*n; ++i )
	{
		stencilBuffer[i] = Four_Zeros;
	}
	if ( !boundary )
	{
		float scale = 1.0f / (n*n + 5.0f*n);
		stencilBuffer[0] = ReplicateX4( n*n * scale );
		for ( int i=0; i<n; i++ )
		{
			stencilBuffer[2*i+1] = ReplicateX4( 4.0f * scale );
			stencilBuffer[2*i+2] = ReplicateX4( 1.0f * scale );
		}
	}
	else
	{
		// Only the centre and the first and last odd slots carry weight.
		int k = n-1;
		float s = 1.0f / 6.0f;
		stencilBuffer[0] = ReplicateX4( s * 4.0f );
		stencilBuffer[1] = ReplicateX4( s * 1.0f );
		stencilBuffer[2*k+1] = ReplicateX4( s * 1.0f );
	}
}
// SIMD variant: builds the two limit-tangent stencils for a vertex of valence
// n, each weight replicated across the four lanes.
//   bndVtx         - false: interior rule; true: one of the boundary rules
//   cornerVtx      - with bndVtx set, selects the corner rule (only
//                    stencilBuffer1 is filled, and only for n > 2) instead of
//                    the smooth boundary rule
//   n              - valence; values below 1 are rejected and both buffers
//                    are left untouched
//   stencilBuffer1 - first tangent weights, slot 0 is the centre vertex
//   stencilBuffer2 - second tangent weights, same layout
static void ComputeCatmullClarkLimitTanStencil(bool bndVtx, bool cornerVtx, const int n, fltx4 *stencilBuffer1, fltx4 *stencilBuffer2)
{
	VPROF_BUDGET( "ComputeCatmullClarkLimitTanStencil", _T("SubD Rendering") );

	// FillTables() asks for every valence starting at 0. With n == 0 the
	// smooth boundary rule below would store to slot 2*(n-1)+1 == -1 of both
	// buffers, i.e. in front of the caller's storage. The other rules write
	// nothing for n == 0, so returning here changes no valid table entry.
	if ( n < 1 )
		return;

	for ( int i=0; i<2*n; ++i )
	{
		stencilBuffer1[i] = Four_Zeros;
		stencilBuffer2[i] = Four_Zeros;
	}
	if ( !bndVtx )
	{
		float scale_beta = 1.0f / (n * sqrtf(4.0f + cos(PI/n)*cos(PI/n)));
		float scale_alpha = 1.0f/n + cos(PI/n) * scale_beta;
		for ( int i=0; i<n; i++ )
		{
			stencilBuffer1[2*i+1] = ReplicateX4( cos( 2*PI*i/n ) * scale_alpha );
			stencilBuffer1[2*i+2] = ReplicateX4( cos((2*PI*i+PI)/n ) * scale_beta );
			// Second tangent uses the same weights shifted by one slot pair.
			// For i == 0, j is -1; the cosines below then equal the ones for
			// j == n-1 up to rounding.
			int j = (i - 1)%n;
			stencilBuffer2[2*i+1] = ReplicateX4( cos( 2*PI*j/n ) * scale_alpha );
			stencilBuffer2[2*i+2] = ReplicateX4( cos((2*PI*j+PI)/n ) * scale_beta );
		}
	}
	else
	{
		// boundary vertex cases
		if ( cornerVtx )
		{
			if ( n<=2 )
				return;
			float sectorScale = 0, w;
			// treat first and last tangent (crease edges) separately
			w = tangentAveraging( n-1, 0 ); sectorScale += w;
			stencilBuffer1[ 1] = stencilBuffer1[ 1] + ReplicateX4( 0.5 * w );
			stencilBuffer1[ 0] = stencilBuffer1[ 0] + ReplicateX4( -0.5 * w );
			w = tangentAveraging( n-1, n-1 ); sectorScale += w;
			stencilBuffer1[ 2*(n-1)+1] = stencilBuffer1[ 2*(n-1)+1] + ReplicateX4( 0.5 * w );
			stencilBuffer1[ 0 ] = stencilBuffer1[ 0 ] + ReplicateX4( -0.5 * w );
			// inner tangents are computed using the 6 weights from the geometry edge construction.
			const float scale = 1.0f / (2.0f*n + 10.0f);	// same for every k
			for (int k=1; k<(n-1); k++)
			{
				w = tangentAveraging( n-1, k ); sectorScale += w;
				stencilBuffer1[ 0] = stencilBuffer1[ 0] + ReplicateX4( w * (2.0f*n * scale - 1.0f) );
				stencilBuffer1[2*(k-1)+1] = stencilBuffer1[2*(k-1)+1] + ReplicateX4( w * 2.0f * scale );
				stencilBuffer1[2*(k-1)+2] = stencilBuffer1[2*(k-1)+2] + ReplicateX4( w * 1.0f * scale );
				stencilBuffer1[2*(k-1)+3] = stencilBuffer1[2*(k-1)+3] + ReplicateX4( w * 4.0f * scale );
				stencilBuffer1[2*(k-1)+4] = stencilBuffer1[2*(k-1)+4] + ReplicateX4( w * 1.0f * scale );
				stencilBuffer1[2*(k-1)+5] = stencilBuffer1[2*(k-1)+5] + ReplicateX4( w * 2.0f * scale );
			}
			// rescale weights by the summed averaging weights
			fltx4 fltx4Scale = ReplicateX4( sectorScale );
			for ( int k = 0; k<2*n; ++k )
			{
				stencilBuffer1[k] = DivSIMD( stencilBuffer1[k], fltx4Scale );
			}
		}
		else
		{
			// special case to avoid colinear tangents
			if ( n==2 )
			{
				float s = 1.0f / 2.0f;
				stencilBuffer1[1] = ReplicateX4( 1.0 * s );
				stencilBuffer1[3] = ReplicateX4( -1.0 * s );
				stencilBuffer2[1] = ReplicateX4( -1.0 * s );
				stencilBuffer2[3] = ReplicateX4( 1.0 * s );
				// regularization term to avoid collinearity and preserve limit normal at the boundary
				float eps = 1e-4;
				stencilBuffer1[0] = AddSIMD( stencilBuffer1[0], ReplicateX4( eps * (-4.0/3.0) ) );
				stencilBuffer1[1] = AddSIMD( stencilBuffer1[1], ReplicateX4( eps * (1.0/2.0) ) );
				stencilBuffer1[2] = AddSIMD( stencilBuffer1[2], ReplicateX4( eps * (1.0/3.0) ) );
				stencilBuffer1[3] = AddSIMD( stencilBuffer1[3], ReplicateX4( eps * (1.0/2.0) ) );
				stencilBuffer2[0] = AddSIMD( stencilBuffer2[0], ReplicateX4( eps * (-4.0/3.0) ) );
				stencilBuffer2[1] = AddSIMD( stencilBuffer2[1], ReplicateX4( eps * (1.0/2.0) ) );
				stencilBuffer2[2] = AddSIMD( stencilBuffer2[2], ReplicateX4( eps * (1.0/3.0) ) );
				stencilBuffer2[3] = AddSIMD( stencilBuffer2[3], ReplicateX4( eps * (1.0/2.0) ) );
			}
			else
			{
				int k = n-1;
				float c = cos( PI / k ), s=sin( PI / k );
				// First tangent: half the difference of the first and last odd slots.
				stencilBuffer1[2*0+1] = ReplicateX4( 0.5f );
				stencilBuffer1[2*k+1] = ReplicateX4( -0.5f );
				stencilBuffer2[0] = ReplicateX4( -4.0f*s / (3.0f*k + c) ); // gamma
				for ( int i=0; i<k; ++i )
				{
					stencilBuffer2[2*i+1] = ReplicateX4( 4*sin(PI*i/k)/(3*k+c) ); // alpha_i
					stencilBuffer2[2*i+2] = ReplicateX4( (sin(PI*i/k)+sin(PI*(i+1)/k)) / (3.0f*k+c) ); // beta_i
				}
				// Overrides the alpha_0 written by the loop above.
				stencilBuffer2[2*0+1] = stencilBuffer2[2*k+1] = ReplicateX4( -( (1+2*c)*sqrt(1+c) ) / ( (3*k+c)*sqrt(1-c) ) ); // alpha_0, alpha_k
			}
		}
	}
}
// SIMD variant: fills the two six-entry edge control-point stencils for
// valence n, each weight replicated across the four lanes.
//   boundary - nonzero: only slots 0 and 3 are used, weighted 2/3 : 1/3 for
//              stencilBuffer1 and 1/3 : 2/3 for stencilBuffer2
//   corner   - corner and non-corner boundary edges currently receive
//              exactly the same weights
//   n        - valence, used by the interior rule only
// Interior rule: weights {2n,2,1,4,1,2} and {4,1,2,2n,2,1}, each divided by (2n + 10).
static void ComputeACCEdgePosStencils(byte boundary, byte corner, int n, fltx4 *stencilBuffer1, fltx4 *stencilBuffer2)
{
	VPROF_BUDGET( "ComputeACCEdgePosStencils", _T("SubD Rendering") );

	for ( int slot = 0; slot < 6; ++slot )
	{
		stencilBuffer1[slot] = Four_Zeros;
		stencilBuffer2[slot] = Four_Zeros;
	}

	if ( boundary )
	{
		// boundary stencil (identical for corner and smooth boundary)
		const float third = 1.0f / 3.0f;
		stencilBuffer1[0] = ReplicateX4( 2.0f * third );
		stencilBuffer1[3] = ReplicateX4( 1.0f * third );
		stencilBuffer2[0] = ReplicateX4( 1.0f * third );
		stencilBuffer2[3] = ReplicateX4( 2.0f * third );
		return;
	}

	const float scale = 1.0f / (2.0f*n + 10.0f);
	const float weights1[6] = { 2.0f*n * scale, 2.0f * scale, 1.0f * scale, 4.0f * scale, 1.0f * scale, 2.0f * scale };
	const float weights2[6] = { 4.0f * scale, 1.0f * scale, 2.0f * scale, 2.0f*n* scale, 2.0f * scale, 1.0f * scale };
	for ( int slot = 0; slot < 6; ++slot )
	{
		stencilBuffer1[slot] = ReplicateX4( weights1[slot] );
		stencilBuffer2[slot] = ReplicateX4( weights2[slot] );
	}
}
// SIMD variant: fills the four-entry interior control-point stencil for
// valence n with {n, 2, 1, 2} / (n + 5), each weight replicated across the
// four lanes. The boundary argument is not consulted.
static void ComputeACCInteriorPosStencil(byte boundary, int n, fltx4 *stencilBuffer)
{
	VPROF_BUDGET( "ComputeACCInteriorPosStencil", _T("SubD Rendering") );

	const float scale = 1.0f / (n + 5.0f);
	const float weights[4] = { n * scale, 2.0f * scale, 1.0f * scale, 2.0f * scale };

	for ( int slot = 0; slot < 4; ++slot )
	{
		stencilBuffer[slot] = ReplicateX4( weights[slot] );
	}
}
static void ComputeACCSinCosPITables()
{
fltx4 PI4 = ReplicateX4( M_PI );
for ( int j=0; j<MAX_VALENCE*2; ++j )
{
fltx4 j4 = ReplicateX4( (float)j );
for ( int k=0; k<MAX_VALENCE; ++k )
{
fltx4 k4 = ReplicateX4( (float)k );
fltx4 radians = DivSIMD( MulSIMD( PI4, j4 ), k4 );
// not really simd
SinCosSIMD( sCCSinPI[j][k], sCCCosPI[j][k], radians );
}
}
}
// One-time initialisation of the SIMD constants, every stencil table (for
// valences 0..MAX_VALENCE) and the sine/cosine lookup tables. Later calls
// return immediately because sTableInited is set.
void FillTables()
{
	if ( sTableInited )
	{
		return;
	}

	// Broadcast constants
	Four_TwoPI = ReplicateX4( 2*M_PI );
	Four_Tens = ReplicateX4( 10.0f );
	Four_Fives = ReplicateX4( 5 );
	Four_NegativeThirds = ReplicateX4( -0.333333333333333f );

	for ( int value = 0; value < 32; ++value )
	{
		Four_N[value] = ReplicateX4( (float)value );
	}

	// Per-valence helper constants
	for ( int valence = 0; valence < MAX_VALENCE; ++valence )
	{
		Four_Valence[valence] = ReplicateX4( (float)valence );
		Four_ValencePlus5[valence] = ReplicateX4( (float)valence + 5.0f );
		Valence_MinusOne[valence] = (float)(valence-1);
	}

	for ( int valence = 0; valence <= MAX_VALENCE; valence++ )
	{
		// interior stencils
		ComputeCatmullClarkLimitPosStencil( false, valence, sPosCornerStencil[valence] );
		ComputeACCEdgePosStencils( false, false, valence, sPosEdge1Stencil[valence], sPosEdge2Stencil[valence] );
		ComputeACCInteriorPosStencil( false, valence, sPosInteriorStencil[valence] );

		// boundary stencils
		ComputeCatmullClarkLimitPosStencil( true, valence, sPosCornerBndStencil[valence] );
		ComputeACCEdgePosStencils( true, false, valence, sPosEdge1BndStencil[valence], sPosEdge2BndStencil[valence] );
		ComputeACCEdgePosStencils( true, true, valence, sPosEdge1CornerStencil[valence], sPosEdge2CornerStencil[valence] );
		ComputeACCInteriorPosStencil( true, valence, sPosInteriorBndStencil[valence] );

		// tangent stencils: interior, smooth boundary, corner
		ComputeCatmullClarkLimitTanStencil( false, false, valence, sCCLimitTanStencil1[valence], sCCLimitTanStencil2[valence] );
		ComputeCatmullClarkLimitTanStencil( true, false, valence, sCCLimitTanBndStencil1[valence], sCCLimitTanBndStencil2[valence] );
		ComputeCatmullClarkLimitTanStencil( true, true, valence, sCCLimitTanCornerStencil1[valence], sCCLimitTanCornerStencil2[valence] );
	}

	// sincos tables
	ComputeACCSinCosPITables();

	sTableInited = true;
}
//--------------------------------------------------------------------------------------
// Runtime
//--------------------------------------------------------------------------------------
// SIMD variant: computes the corner control point of a patch, i.e. the limit
// position of the vertex whose one-ring is given.
//   pPos            - control-mesh positions, indexed through pOneRing
//   pOneRing        - vertex indices; entry 0 is the vertex itself
//   vtx1RingSize    - number of entries of pOneRing that are summed
//   minOneRingIndex - rotation applied to the summation order
//   bndVtx          - nonzero selects the boundary stencil table
//   cornerVtx       - nonzero: the vertex position is returned unchanged
//   valence         - row of the stencil table to use
//   limitPos        - receives the result
FORCEINLINE void ComputeCatmullClarkLimitPosition( fltx4 *pPos, unsigned short *pOneRing,
	unsigned short vtx1RingSize, unsigned short minOneRingIndex, unsigned short bndVtx,
	unsigned short cornerVtx, unsigned short valence, fltx4 &limitPos )
{
	VPROF_BUDGET( "ComputeCatmullClarkLimitPosition (SIMD)", _T( "SubD Rendering" ) );
	assert( pPos );
	assert( pOneRing );

	if ( cornerVtx > 0 )
	{
		limitPos = pPos[ pOneRing[0] ];
		return;
	}

	assert( valence <= MAX_VALENCE );
	fltx4 *pWeights = bndVtx ? sPosCornerBndStencil[ valence ] : sPosCornerStencil[ valence ];

	// pWeights[0] is always the largest value (see Figures 4 and 5 in Loop and Schaefer),
	// so the centre term seeds the sum.
	limitPos = MulSIMD( pWeights[0], pPos[ pOneRing[0] ] );

	for ( int step = 0; step < vtx1RingSize; step++ )
	{
		// Rotate the walk so the minimum index consistently comes first.
		const int slot = ( step + minOneRingIndex ) % vtx1RingSize;
		if ( slot == 0 )
			continue;	// centre term already accumulated

		limitPos = MaddSIMD( pWeights[slot], pPos[ pOneRing[slot] ], limitPos );
	}
}
// Returns A scaled by the reciprocal-square-root estimate of its
// three-component squared length. A itself is not modified, unlike the
// scalar VectorNormalize above.
FORCEINLINE fltx4 VectorNormalize( fltx4 &A )
{
	const fltx4 lengthSq = Dot3SIMD( A, A );
	return MulSIMD( A, ReciprocalSqrtEstSIMD( lengthSq ) );
}
// Returns the reciprocal-square-root estimate of the three-component squared
// length of A.
// NOTE(review): despite the name, this is an estimate of 1/length, not of the
// length. Callers are outside the visible part of the file; confirm what they
// expect before changing it.
FORCEINLINE fltx4 VectorLength( fltx4 &A )
{
	const fltx4 lengthSq = Dot3SIMD( A, A );
	return ReciprocalSqrtEstSIMD( lengthSq );
}
// Three-component cross product of A and B. Xbox 360 uses XMVector3Cross,
// Win32 uses two shuffled products and a subtraction, and every other
// platform computes the components one by one with lane 3 set to 0.
FORCEINLINE fltx4 CrossProduct( const fltx4 &A, const fltx4 &B )
{
#if defined( _X360 )
	return XMVector3Cross( A, B );
#elif defined( _WIN32 )
	// (A.y, A.z, A.x) * (B.z, B.x, B.y)
	fltx4 aYZX = _mm_shuffle_ps( A, A, MM_SHUFFLE_REV( 1, 2, 0, 3 ) );
	fltx4 bZXY = _mm_shuffle_ps( B, B, MM_SHUFFLE_REV( 2, 0, 1, 3 ) );
	fltx4 positive = MulSIMD( aYZX, bZXY );

	// (A.z, A.x, A.y) * (B.y, B.z, B.x)
	fltx4 aZXY = _mm_shuffle_ps( A, A, MM_SHUFFLE_REV( 2, 0, 1, 3 ) );
	fltx4 bYZX = _mm_shuffle_ps( B, B, MM_SHUFFLE_REV( 1, 2, 0, 3 ) );
	fltx4 negative = MulSIMD( aZXY, bYZX );

	return SubSIMD( positive, negative );
#else
	fltx4 result;
	SubFloat( result, 0 ) = SubFloat( A, 1 )*SubFloat( B, 2 ) - SubFloat( A, 2 )*SubFloat( B, 1 );
	SubFloat( result, 1 ) = SubFloat( A, 2 )*SubFloat( B, 0 ) - SubFloat( A, 0 )*SubFloat( B, 2 );
	SubFloat( result, 2 ) = SubFloat( A, 0 )*SubFloat( B, 1 ) - SubFloat( A, 1 )*SubFloat( B, 0 );
	SubFloat( result, 3 ) = 0;
	return result;
#endif
}
//----------------------------------------------------------------------------------------------
// Computes the two limit tangents (limitTanU, limitTanV) at one corner of a patch.
//
//   idx           - corner index; used to index the 4-entry sign tables and (when odd) to
//                   exchange the two outputs
//   pPos          - position array indexed through pOneRing
//   pOneRing      - indices of the vertex's one-ring; entry 0 is read as the vertex itself
//   vtx1RingSize  - number of entries in pOneRing
//   centerOffset  - used to derive the rotation indices j1/j2 on the boundary and corner paths
//   bndVtx        - zero selects the interior path; non-zero selects boundary or corner handling
//   cornerVtx     - forced to 0 when sUseCornerTangents is false; 0 or
//                   CORNER_WITH_SMOOTHBNDTANGENTS selects the smooth-boundary path
//   valence       - selects the stencil row
//   loopGapAngle  - written ONLY on the corner path (valence != 2)
//   limitTanU/V   - outputs; left completely untouched when a corner vertex has valence 2
//----------------------------------------------------------------------------------------------
FORCEINLINE void ComputeCatmullClarkLimitTangents( int idx, fltx4 *pPos, unsigned short *pOneRing, unsigned short vtx1RingSize,
unsigned short centerOffset, unsigned short bndVtx, unsigned short cornerVtx,
unsigned short valence, float &loopGapAngle, fltx4 &limitTanU, fltx4 &limitTanV )
{
VPROF_BUDGET( "ComputeCatmullClarkLimitTangents (SIMD)", _T( "SubD Rendering" ) );
// for valence=1, no need to have separate tangents
// Per-corner sign tables, indexed by idx, applied to the outputs at the end of the function.
static const fltx4 tanUSign[4] = { Four_Ones, Four_NegativeOnes, Four_NegativeOnes, Four_Ones };
static const fltx4 tanVSign[4] = { Four_Ones, Four_Ones, Four_NegativeOnes, Four_NegativeOnes };
// Global switch: when corner tangents are disabled, treat every vertex as a non-corner.
if (!sUseCornerTangents) cornerVtx = 0;
// interior vertices
if ( !bndVtx )
{
// Each tangent is a weighted sum of all one-ring positions using the per-valence stencils.
fltx4 *pStencil0 = sCCLimitTanStencil1[ valence ];
fltx4 *pStencil1 = sCCLimitTanStencil2[ valence ];
limitTanU = limitTanV = Four_Zeros;
for ( int k = 0; k < vtx1RingSize; k++ )
{
limitTanU = MaddSIMD( pStencil0[k], pPos[ pOneRing[ k ] ], limitTanU );
limitTanV = MaddSIMD( pStencil1[k], pPos[ pOneRing[ k ] ], limitTanV );
}
}
else if ( (!cornerVtx) || (cornerVtx == CORNER_WITH_SMOOTHBNDTANGENTS) )
{
// smooth boundary vertices
// r0 / r1 are the two stencil-weighted sums over the one-ring for the boundary stencils.
fltx4 *pStencil0 = sCCLimitTanBndStencil1[ valence ];
fltx4 *pStencil1 = sCCLimitTanBndStencil2[ valence ];
fltx4 r0 = Four_Zeros;
fltx4 r1 = Four_Zeros;
for (int k = 0; k < vtx1RingSize; ++k)
{
r0 = MaddSIMD( pStencil0[k], pPos[ pOneRing[ k ] ], r0 );
r1 = MaddSIMD( pStencil1[k], pPos[ pOneRing[ k ] ], r1 );
}
// j1 / j2 are two consecutive indices derived from centerOffset; k indexes by valence - 1.
int j1 = ( centerOffset - 1 ) / 2;
int j2 = j1 + 1;
int k = valence - 1;
if ( valence == 2 )
{
// Valence 2: the two sums are used directly, no cos/sin blend.
limitTanU = r0;
limitTanV = r1;
}
else
{
// Blend r0 and r1 with the table entries sCCCosPI / sCCSinPI at [j][k].
limitTanU = AddSIMD( MulSIMD( sCCCosPI[j1][k], r0 ), MulSIMD( sCCSinPI[j1][k], r1 ) );
limitTanV = AddSIMD( MulSIMD( sCCCosPI[j2][k], r0 ), MulSIMD( sCCSinPI[j2][k], r1 ) );
}
}
else
{
// Corner vertices
// NOTE: this early return leaves limitTanU / limitTanV / loopGapAngle exactly as the caller
// passed them in, and also skips the swap and sign flip at the bottom of the function.
if ( valence == 2 )
return;
fltx4 *pEdgeStencil = sPosEdge1Stencil[ valence ];
// Compute tangents
// c0 / c1: vectors from the vertex to the first and to the last one-ring entry.
fltx4 c0 = SubSIMD( pPos[ pOneRing[ 1 ] ], pPos[ pOneRing[ 0 ] ] );
fltx4 c1 = SubSIMD( pPos[ pOneRing[ vtx1RingSize - 1 ] ], pPos[ pOneRing[ 0 ] ] );
// e0 / e1 both start from (pEdgeStencil[0] - 1) * vertex position, then accumulate
// stencil entries 1..5 over ring entries 1..5 (e0) and over the last five ring entries (e1).
// NOTE(review): the e1 index assumes vtx1RingSize >= 6 on this path - confirm.
fltx4 e0 = MulSIMD( SubSIMD( pEdgeStencil[0], Four_Ones ), pPos[ pOneRing[ 0 ] ] );
fltx4 e1 = e0;
for ( int k = 1; k < 6; k++ )
{
e0 = MaddSIMD( pEdgeStencil[k], pPos[ pOneRing[ k ] ], e0 );
e1 = MaddSIMD( pEdgeStencil[k], pPos[ pOneRing[ vtx1RingSize - 6 + k ] ], e1 );
}
// Compute average tangent plane normal
fltx4 n0 = CrossProduct( c0, e0 );
n0 = VectorNormalize( n0 );
fltx4 n1 = CrossProduct( e1, c1 );
n1 = VectorNormalize( n1 );
fltx4 N = AddSIMD( n0, n1 );
N = VectorNormalize( N );
// Project into tangent plane
fltx4 DotC0N = Dot3SIMD( c0, N );
fltx4 DotC1N = Dot3SIMD( c1, N );
c0 = SubSIMD( c0, MulSIMD( DotC0N, N ) );
c1 = SubSIMD( c1, MulSIMD( DotC1N, N ) );
// Divide each projected vector by the value VectorLength returns for it, and average the
// two returned values into cAvg. This normalizes c0 / c1 only if VectorLength returns the
// true length - verify against VectorLength's definition.
fltx4 c0l = VectorLength( c0 );
c0 = DivSIMD( c0, c0l );
fltx4 c1l = VectorLength( c1 );
c1 = DivSIMD( c1, c1l );
fltx4 cAvg = MulSIMD( AddSIMD(c0l,c1l), Four_PointFives );
// Compute angle
// c0p is N x c0; the angle is derived from the components of c1 along c0p and c0.
fltx4 c0p = CrossProduct(N, c0);
fltx4 dot1 = Dot3SIMD(c0p, c1);
fltx4 dot2 = Dot3SIMD(c0, c1);
float angle = PI - atan2( SubFloat( dot1, 0 ), -SubFloat( dot2, 0 ) );
// The measured angle is reported back to the caller through loopGapAngle.
loopGapAngle = angle;
// Compute final tangent vector
int j1 = ( centerOffset - 1 ) / 2;
int j2 = j1 + 1;
int K = (valence - 1);
// Lookup table mapping an integer K to the same value as a float
// (presumably to avoid an int-to-float conversion at runtime - TODO confirm).
static float fK[MAX_VALENCE] = { 0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f,
9.0f, 10.0f, 11.0f, 12.0f, 13.0f, 14.0f, 15.0f, 16.0f,
17.0f, 18.0f };
// Rotate c0 towards c0p by angle*j1/K and angle*j2/K, scaled by cAvg.
float flK = fK[K];
fltx4 Cos0 = ReplicateX4( cos( angle*j1 / flK ) );
fltx4 Sin0 = ReplicateX4( sin( angle*j1 / flK ) );
fltx4 Cos1 = ReplicateX4( cos( angle*j2 / flK ) );
fltx4 Sin1 = ReplicateX4( sin( angle*j2 / flK ) );
limitTanU = cAvg * ( Cos0 * c0 + Sin0 * c0p );
limitTanV = cAvg * ( Cos1 * c0 + Sin1 * c0p );
}
// Flip tangents so they point in u/v direction
// Odd corner indices exchange the two outputs before the per-corner signs are applied.
if ( idx & 1 )
{
V_swap( limitTanU, limitTanV );
}
limitTanU = MulSIMD( limitTanU, tanUSign[idx] );
limitTanV = MulSIMD( limitTanV, tanVSign[idx] );
}
//----------------------------------------------------------------------------------------------
// Computes the two edge control points (edgePos0, edgePos1) that lie along one patch edge.
//
//   pPos              - position array indexed through oneRing
//   oneRing           - one-ring indices of the edge's first vertex; entry 0 is that vertex
//   centerOffset      - offset into oneRing of the entries that belong to this patch
//   bndEdge           - non-zero if the edge is a boundary edge (4-entry stencils instead of 6)
//   bndVtx0/1         - non-zero if the respective end vertex is a boundary vertex
//   cornerVtx0/1      - non-zero selects the corner stencil on boundary edges
//   edgeBias0/1       - fixed-point bias, divided by 32768 to get a weight; 16384 is 0.5
//   val0/val1         - valences of the two end vertices (remapped to 2*(val-1) on boundaries)
//   minOneRingOffset  - rotates the accumulation order so neighboring patches sum identically
//   vtx1RingSize      - not used by this function
//   edgePos0/1        - outputs
//
// When both biases are 16384 the precomputed stencil tables are used. Otherwise the points
// are blended explicitly; the weights of each point sum to (valence + 5) because b0 + b1 = 1,
// so each point is divided by Four_ValencePlus5 of ITS OWN vertex valence.
//----------------------------------------------------------------------------------------------
FORCEINLINE void ComputeACCEdgePositions( fltx4 *pPos, unsigned short *oneRing, unsigned short centerOffset,
										  unsigned short bndEdge, unsigned short bndVtx0, unsigned short bndVtx1,
										  unsigned short cornerVtx0, unsigned short cornerVtx1,
										  unsigned short edgeBias0, unsigned short edgeBias1,
										  unsigned short val0, unsigned short val1,
										  unsigned short minOneRingOffset, unsigned short vtx1RingSize,
										  fltx4 &edgePos0, fltx4 &edgePos1)
{
	VPROF_BUDGET( "ComputeACCEdgePositions (SIMD)", _T("SubD Rendering") );

	// Boundary vertices index the tables with a remapped valence.
	if ( bndVtx0 )
	{
		val0 = 2*(val0 - 1);
	}
	if ( bndVtx1 )
	{
		val1 = 2*(val1 - 1);
	}

	Assert( val0 <= MAX_VALENCE );
	Assert( val1 <= MAX_VALENCE );

	// Stencil choice: interior edge, smooth boundary edge, or boundary edge at a corner vertex.
	fltx4 *pStencil0 = (bndEdge) ? (cornerVtx0) ? sPosEdge1CornerStencil[ val0 ] : sPosEdge1BndStencil[ val0 ] : sPosEdge1Stencil[ val0 ];
	fltx4 *pStencil1 = (bndEdge) ? (cornerVtx1) ? sPosEdge2CornerStencil[ val1 ] : sPosEdge2BndStencil[ val1 ] : sPosEdge2Stencil[ val1 ];

	int kEnd = (bndEdge) ? 4 : 6;

	if ( ( edgeBias0 == 16384 ) && ( edgeBias1 == 16384 ) )
	{
		// Unbiased edge: weighted sum using the stencil tables.
		// Stencil slot 0 maps to the vertex itself, slots 1.. map to consecutive ring entries
		// starting at centerOffset.
		int oneRingIndex[6] = { 0, 0, 0, 0, 0, 0 };
		for ( int i = 1; i < kEnd; i++ )
		{
			oneRingIndex[i] = centerOffset + i - 1;
		}

		edgePos0 = edgePos1 = Four_Zeros;
		for ( int k = 0; k < kEnd; k++ )
		{
			int idx = ( k + minOneRingOffset ) % kEnd;	// Offset to min index to enforce evaluation order between neighboring patches
			edgePos0 = MaddSIMD( pStencil0[idx], pPos[ oneRing[ oneRingIndex[idx] ] ], edgePos0 );
			edgePos1 = MaddSIMD( pStencil1[idx], pPos[ oneRing[ oneRingIndex[idx] ] ], edgePos1 );
		}
	}
	else
	{
		// Biased edge: explicit blend, b1 = bias / 32768 and b0 = 1 - b1.
		fltx4 b0, b1;

		b1 = ReplicateX4( edgeBias0 / 32768.0f );
		b0 = SubSIMD( Four_Ones, b1 );
		// Weights: val0 + 2*b0 + b0 + 2 + b1 + 2*b1 = val0 + 5
		edgePos0 = DivSIMD( ( Four_Valence[val0]*pPos[ oneRing[0] ] +
							  Four_Twos*b0*pPos[ oneRing[ centerOffset] ] +
							  b0*pPos[ oneRing[centerOffset + 1] ] +
							  Four_Twos*pPos[ oneRing[centerOffset + 2] ] +
							  b1*pPos[ oneRing[centerOffset + 3] ] +
							  Four_Twos*b1*pPos[ oneRing[centerOffset + 4] ] ), Four_ValencePlus5[val0] );

		b1 = ReplicateX4( edgeBias1 / 32768.0f );
		b0 = SubSIMD( Four_Ones, b1 );
		// Weights: 2 + b0 + 2*b0 + val1 + 2*b1 + b1 = val1 + 5, so normalize by val1 + 5.
		// (This previously divided by Four_ValencePlus5[val0], which is only correct when
		// both end vertices have the same valence.)
		edgePos1 = DivSIMD( ( Four_Twos*pPos[ oneRing[0] ] +
							  b0*pPos[ oneRing[centerOffset + 0] ] +
							  Four_Twos*b0*pPos[ oneRing[centerOffset + 1] ] +
							  Four_Valence[val1]*pPos[ oneRing[centerOffset + 2] ] +
							  Four_Twos*b1*pPos[ oneRing[centerOffset + 3] ] +
							  b1*pPos[ oneRing[centerOffset + 4] ] ), Four_ValencePlus5[val1] );
	}
}
//----------------------------------------------------------------------------------------------
// Computes one interior control point as a 4-term weighted sum: the vertex itself
// (oneRing[0]) plus the three ring entries starting at centerOffset, weighted by the
// sPosInteriorStencil row for the (possibly remapped) valence.
//
// For boundary vertices the valence is remapped before the table lookup:
//   valence > 2  ->  2 * (valence - 1)
//   otherwise    ->  4 * (valence - 1)
//----------------------------------------------------------------------------------------------
FORCEINLINE void ComputeACCInteriorPosition( fltx4 *pPos, unsigned short *oneRing, unsigned short centerOffset, unsigned short bndVtx, unsigned short valence, fltx4 &interiorPos )
{
	VPROF_BUDGET( "ComputeACCInteriorPosition (SIMD)", _T( "SubD Rendering" ) );

	if ( bndVtx )
	{
		if ( valence > 2 )
		{
			valence = 2 * (valence - 1);
		}
		else
		{
			valence = 4 * (valence - 1);
		}
	}

	Assert( valence <= MAX_VALENCE );

	fltx4 *pWeights = sPosInteriorStencil[ valence ];

	// Weight 0 applies to the vertex itself.
	interiorPos = MulSIMD( pWeights[0], pPos[ oneRing[0] ] );

	// Weights 1..3 apply to the three consecutive ring entries starting at centerOffset.
	for ( int i = 0; i < 3; i++ )
	{
		interiorPos = MaddSIMD( pWeights[i + 1], pPos[ oneRing[ centerOffset + i ] ], interiorPos );
	}
}
//----------------------------------------------------------------------------------------------
// Derives tangent control points from the 16 position control points by differencing
// neighbors and scaling by 3.
//
//   Pos  - 16 entries, read as a 4x4 grid with index = row*4 + col
//   TanU - 12 outputs, index = row*3 + col : 3 * ( Pos[row][col+1] - Pos[row][col] )
//   TanV - 12 outputs, index = row*4 + col : 3 * ( Pos[row+1][col] - Pos[row][col] )
//
// The three arrays are assumed not to overlap.
//----------------------------------------------------------------------------------------------
FORCEINLINE void ComputeACCGeometryPatchTangents( fltx4 *Pos, fltx4 *TanU, fltx4 *TanV )
{
	//VPROF_BUDGET( "ComputeACCGeometryPatchTangents", _T("SubD Rendering") );

	// Differences along a row: 4 rows, 3 differences per row.
	for ( int row = 0; row < 4; row++ )
	{
		for ( int col = 0; col < 3; col++ )
		{
			const int nPos = row * 4 + col;
			TanU[ row * 3 + col ] = MulSIMD( Four_Threes, SubSIMD( Pos[ nPos + 1 ], Pos[ nPos ] ) );
		}
	}

	// Differences down a column: entry i pairs with the entry one row (4 slots) below it.
	for ( int i = 0; i < 12; i++ )
	{
		TanV[i] = MulSIMD( Four_Threes, SubSIMD( Pos[ i + 4 ], Pos[i] ) );
	}
}
//----------------------------------------------------------------------------------------------
// Builds all control points for one quad patch and stores them to the output arrays.
//
//   pPos          - input positions, indexed through the quad's one-ring index lists
//   quad          - topology record for the patch (one-rings, valences, boundary flags, ...)
//   Pos           - output: 16 position control points
//   TanU / TanV   - output: 12 tangent control points each; NOT written when NO_TANGENTS is
//                   defined, nor when SEPARATE_REGULAR_AND_EXTRA is defined and bRegularPatch
//                   is true
//   bRegularPatch - only consulted when SEPARATE_REGULAR_AND_EXTRA is defined
//
// NOTE(review): results are written with StoreAlignedSIMD, so the output arrays presumably
// need to satisfy that function's alignment requirement - confirm with callers.
//----------------------------------------------------------------------------------------------
void ComputeACCAllPatches( fltx4* pPos, TopologyIndexStruct* quad, Vector4D* Pos, Vector4D* TanU, Vector4D* TanV, bool bRegularPatch )
{
VPROF_BUDGET( "ComputeACCAllPatches (SIMD)", _T( "SubD Rendering" ) );
// Destination slots in the 16-entry OutPos array for the four corners, the two edge points
// produced per corner, and the four interior points.
int accCorner[] = { 0, 3, 15, 12 };
int accEdge1[] = { 4, 2, 11, 13 };
int accEdge2[] = { 8, 1, 7, 14 };
int accInterior[] = { 5, 6, 10, 9 };
// Destination slots in OutTanU / OutTanV for the per-corner limit tangents.
int accTanCornerU[] = { 0, 2, 11, 9 }; // counterclockwise orders!
int accTanCornerV[] = { 0, 3, 11, 8 };
fltx4 OutPos[16], OutTanU[16], OutTanV[16];
// Point to four one-rings
// The four one-rings are stored back to back in quad->oneRing; vtx1RingSize[i] gives the
// length of ring i and therefore the start of ring i+1.
int vtx1RingStart = 0;
unsigned short* pOneRing[4];
for ( int i = 0; i < 4; i++ )
{
unsigned short vtx1RingSize = quad->vtx1RingSize[i];
pOneRing[i] = &(quad->oneRing[vtx1RingStart]);
vtx1RingStart += vtx1RingSize;
}
{
VPROF_BUDGET( "ComputeACCAllPatches - Geometry Control Points (SIMD)", _T( "SubD Rendering" ) );
// Four corner control points, one per patch vertex.
ComputeCatmullClarkLimitPosition( pPos, pOneRing[0], quad->vtx1RingSize[0], quad->minOneRingOffset[0], quad->bndVtx[0], quad->cornerVtx[0], quad->valences[0], OutPos[ accCorner[0] ] );
ComputeCatmullClarkLimitPosition( pPos, pOneRing[1], quad->vtx1RingSize[1], quad->minOneRingOffset[1], quad->bndVtx[1], quad->cornerVtx[1], quad->valences[1], OutPos[ accCorner[1] ] );
ComputeCatmullClarkLimitPosition( pPos, pOneRing[2], quad->vtx1RingSize[2], quad->minOneRingOffset[2], quad->bndVtx[2], quad->cornerVtx[2], quad->valences[2], OutPos[ accCorner[2] ] );
ComputeCatmullClarkLimitPosition( pPos, pOneRing[3], quad->vtx1RingSize[3], quad->minOneRingOffset[3], quad->bndVtx[3], quad->cornerVtx[3], quad->valences[3], OutPos[ accCorner[3] ] );
// Eight edge control points: each call works from vertex i's one-ring and pairs vertex i with
// vertex (i+3)%4, using boundary flag bndEdge[(i+3)%4] and edge biases [2*((i+3)%4)] / [+1].
ComputeACCEdgePositions( pPos, pOneRing[0], quad->vtx1RingCenterQuadOffset[0],
quad->bndEdge[3], quad->bndVtx[0], quad->bndVtx[3],
quad->cornerVtx[0], quad->cornerVtx[3],
quad->edgeBias[6], quad->edgeBias[7],
quad->valences[0], quad->valences[3],
quad->minOneRingOffset[0], quad->vtx1RingSize[0],
OutPos[accEdge1[0]], OutPos[accEdge2[0]] );
ComputeACCEdgePositions( pPos, pOneRing[1], quad->vtx1RingCenterQuadOffset[1],
quad->bndEdge[0], quad->bndVtx[1], quad->bndVtx[0],
quad->cornerVtx[1], quad->cornerVtx[0],
quad->edgeBias[0], quad->edgeBias[1],
quad->valences[1], quad->valences[0],
quad->minOneRingOffset[1], quad->vtx1RingSize[1],
OutPos[accEdge1[1]], OutPos[accEdge2[1]] );
ComputeACCEdgePositions( pPos, pOneRing[2], quad->vtx1RingCenterQuadOffset[2],
quad->bndEdge[1], quad->bndVtx[2], quad->bndVtx[1],
quad->cornerVtx[2], quad->cornerVtx[1],
quad->edgeBias[2], quad->edgeBias[3],
quad->valences[2], quad->valences[1],
quad->minOneRingOffset[2], quad->vtx1RingSize[2],
OutPos[accEdge1[2]], OutPos[accEdge2[2]] );
ComputeACCEdgePositions( pPos, pOneRing[3], quad->vtx1RingCenterQuadOffset[3],
quad->bndEdge[2], quad->bndVtx[3], quad->bndVtx[2],
quad->cornerVtx[3], quad->cornerVtx[2],
quad->edgeBias[4], quad->edgeBias[5],
quad->valences[3], quad->valences[2],
quad->minOneRingOffset[3], quad->vtx1RingSize[3],
OutPos[accEdge1[3]], OutPos[accEdge2[3]] );
// Four interior control points, one per patch vertex.
ComputeACCInteriorPosition( pPos, pOneRing[0], quad->vtx1RingCenterQuadOffset[0], quad->bndVtx[0], quad->valences[0], OutPos[ accInterior[0] ] );
ComputeACCInteriorPosition( pPos, pOneRing[1], quad->vtx1RingCenterQuadOffset[1], quad->bndVtx[1], quad->valences[1], OutPos[ accInterior[1] ] );
ComputeACCInteriorPosition( pPos, pOneRing[2], quad->vtx1RingCenterQuadOffset[2], quad->bndVtx[2], quad->valences[2], OutPos[ accInterior[2] ] );
ComputeACCInteriorPosition( pPos, pOneRing[3], quad->vtx1RingCenterQuadOffset[3], quad->bndVtx[3], quad->valences[3], OutPos[ accInterior[3] ] );
}
#if !defined( NO_TANGENTS )
// Don't compute tangents for regular patches
#if defined( SEPARATE_REGULAR_AND_EXTRA )
if ( !bRegularPatch )
#endif
{
VPROF_BUDGET( "ComputeACCAllPatches - Tangents (SIMD)", _T( "SubD Rendering" ) );
// Start from tangents derived directly from the position control points; the steps below
// overwrite / adjust some of them unless sShowACCGeometryTangents is set.
ComputeACCGeometryPatchTangents( OutPos, OutTanU, OutTanV );
// Convert the stored per-vertex loop gap value to a float by scaling with M_PI2 / 65535
// (presumably a 16-bit quantized angle - TODO confirm against where loopGapAngle is written).
float flLoopGap[4];
flLoopGap[0] = ( M_PI2 * quad->loopGapAngle[0] ) / 65535.0f;
flLoopGap[1] = ( M_PI2 * quad->loopGapAngle[1] ) / 65535.0f;
flLoopGap[2] = ( M_PI2 * quad->loopGapAngle[2] ) / 65535.0f;
flLoopGap[3] = ( M_PI2 * quad->loopGapAngle[3] ) / 65535.0f;
if ( !sShowACCGeometryTangents )
{
{
// Replace the corner tangents with the limit tangents. flLoopGap[i] is passed by reference
// and may be overwritten by the callee on its corner-vertex path.
ComputeCatmullClarkLimitTangents( 0, pPos, pOneRing[0], quad->vtx1RingSize[0], quad->vtx1RingCenterQuadOffset[0],
quad->bndVtx[0], quad->cornerVtx[0], quad->valences[0], flLoopGap[0], OutTanU[ accTanCornerU[0] ], OutTanV[ accTanCornerV[0] ] );
ComputeCatmullClarkLimitTangents( 1, pPos, pOneRing[1], quad->vtx1RingSize[1], quad->vtx1RingCenterQuadOffset[1],
quad->bndVtx[1], quad->cornerVtx[1], quad->valences[1], flLoopGap[1], OutTanU[ accTanCornerU[1] ], OutTanV[ accTanCornerV[1] ] );
ComputeCatmullClarkLimitTangents( 2, pPos, pOneRing[2], quad->vtx1RingSize[2], quad->vtx1RingCenterQuadOffset[2],
quad->bndVtx[2], quad->cornerVtx[2], quad->valences[2], flLoopGap[2], OutTanU[ accTanCornerU[2] ], OutTanV[ accTanCornerV[2] ] );
ComputeCatmullClarkLimitTangents( 3, pPos, pOneRing[3], quad->vtx1RingSize[3], quad->vtx1RingCenterQuadOffset[3],
quad->bndVtx[3], quad->cornerVtx[3], quad->valences[3], flLoopGap[3], OutTanU[ accTanCornerU[3] ], OutTanV[ accTanCornerV[3] ] );
}
// compute correction component to boundary tangents for tangent plane continuity
// /TanV/ /TanU/ / TanV / /TanU/
// CB_CornerIdx: three tangent slots read along each of the four patch edges.
// CB_InteriorIdx: the two tangent slots adjusted for each edge.
// CB_sign is declared but not referenced anywhere in this function.
static int CB_CornerIdx[] = {0,1,2, 3,7,11, 11,10,9, 8,4,0 };
static int CB_InteriorIdx[] = {1,2, 5,8, 10,9, 6,3 };
static fltx4 CB_sign[4] = {Four_Ones,Four_NegativeOnes,Four_Ones,Four_NegativeOnes};
{
// Unroll, since the compiler wants to keep it rolled, and we get better perf unrolled
// Each of the four blocks below handles one edge with the same recipe:
//   c0 / c1 = cos( loopGap / valence ) for the edge's two vertices (valence reduced by one
//             for boundary vertices)
//   E = ( c0 * 2*u1 - c1 * u0 ) / 3,   F = ( c0 * u2 - c1 * 2*u1 ) / 3
// E and F are then added to (edges 0-1 and 2-3) or subtracted from (edges 1-2 and 3-0)
// the two cross-direction tangents listed in CB_InteriorIdx.
{
// Edge between vertices 0 and 1: reads OutTanU[0,1,2], adjusts OutTanV[1] and OutTanV[2].
fltx4 u00 = OutTanU[CB_CornerIdx[0]];
fltx4 u10 = MulSIMD( OutTanU[CB_CornerIdx[1]], Four_Twos );
fltx4 u20 = OutTanU[CB_CornerIdx[2]];
int val0 = quad->valences[0]; int val1 = quad->valences[1];
if ( quad->bndVtx[0] ) val0--;
if ( quad->bndVtx[1] ) val1--;
fltx4 c0 = ReplicateX4( cosf( (flLoopGap[0]) / val0 ) );
fltx4 c1 = ReplicateX4( cosf( (flLoopGap[1]) / val1 ) );
fltx4 A = MulSIMD( c0, u10 ); fltx4 B = MulSIMD( c1, u00 ); fltx4 C = MulSIMD( c0, u20 ); fltx4 D = MulSIMD( c1, u10 );
fltx4 E = DivSIMD( SubSIMD( A, B ), Four_Threes ); fltx4 F = DivSIMD( SubSIMD( C, D ), Four_Threes );
OutTanV[CB_InteriorIdx[0] ] = AddSIMD( OutTanV[CB_InteriorIdx[0] ], E );
OutTanV[CB_InteriorIdx[1] ] = AddSIMD( OutTanV[CB_InteriorIdx[1] ], F );
}
{
// Edge between vertices 1 and 2: reads OutTanV[3,7,11], adjusts OutTanU[5] and OutTanU[8].
fltx4 u00 = OutTanV[CB_CornerIdx[3]];
fltx4 u10 = MulSIMD( OutTanV[CB_CornerIdx[4]], Four_Twos );
fltx4 u20 = OutTanV[CB_CornerIdx[5]];
int val0 = quad->valences[1]; int val1 = quad->valences[2];
if ( quad->bndVtx[1] ) val0--;
if ( quad->bndVtx[2] ) val1--;
fltx4 c0 = ReplicateX4( cosf( (flLoopGap[1]) / val0 ) );
fltx4 c1 = ReplicateX4( cosf( (flLoopGap[2]) / val1 ) );
fltx4 A = MulSIMD( c0, u10 ); fltx4 B = MulSIMD( c1, u00 ); fltx4 C = MulSIMD( c0, u20 ); fltx4 D = MulSIMD( c1, u10 );
fltx4 E = DivSIMD( SubSIMD( A, B ), Four_Threes ); fltx4 F = DivSIMD( SubSIMD( C, D ), Four_Threes );
OutTanU[CB_InteriorIdx[2] ] = SubSIMD( OutTanU[CB_InteriorIdx[2] ], E );
OutTanU[CB_InteriorIdx[3] ] = SubSIMD( OutTanU[CB_InteriorIdx[3] ], F );
}
{
// Edge between vertices 2 and 3: reads OutTanU[11,10,9], adjusts OutTanV[10] and OutTanV[9].
fltx4 u00 = OutTanU[CB_CornerIdx[6]];
fltx4 u10 = MulSIMD( OutTanU[CB_CornerIdx[7]], Four_Twos );
fltx4 u20 = OutTanU[CB_CornerIdx[8]];
int val0 = quad->valences[2]; int val1 = quad->valences[3];
if ( quad->bndVtx[2] ) val0--;
if ( quad->bndVtx[3] ) val1--;
fltx4 c0 = ReplicateX4( cosf( (flLoopGap[2]) / val0 ) );
fltx4 c1 = ReplicateX4( cosf( (flLoopGap[3]) / val1 ) );
fltx4 A = MulSIMD( c0, u10 ); fltx4 B = MulSIMD( c1, u00 ); fltx4 C = MulSIMD( c0, u20 ); fltx4 D = MulSIMD( c1, u10 );
fltx4 E = DivSIMD( SubSIMD( A, B ), Four_Threes ); fltx4 F = DivSIMD( SubSIMD( C, D ), Four_Threes );
OutTanV[CB_InteriorIdx[4] ] = AddSIMD( OutTanV[CB_InteriorIdx[4] ], E );
OutTanV[CB_InteriorIdx[5] ] = AddSIMD( OutTanV[CB_InteriorIdx[5] ], F );
}
{
// Edge between vertices 3 and 0: reads OutTanV[8,4,0], adjusts OutTanU[6] and OutTanU[3].
fltx4 u00 = OutTanV[CB_CornerIdx[9]];
fltx4 u10 = MulSIMD( OutTanV[CB_CornerIdx[10]], Four_Twos );
fltx4 u20 = OutTanV[CB_CornerIdx[11]];
int val0 = quad->valences[3]; int val1 = quad->valences[0];
if ( quad->bndVtx[3] ) val0--;
if ( quad->bndVtx[0] ) val1--;
fltx4 c0 = ReplicateX4( cosf( (flLoopGap[3]) / val0 ) );
fltx4 c1 = ReplicateX4( cosf( (flLoopGap[0]) / val1 ) );
fltx4 A = MulSIMD( c0, u10 ); fltx4 B = MulSIMD( c1, u00 ); fltx4 C = MulSIMD( c0, u20 ); fltx4 D = MulSIMD( c1, u10 );
fltx4 E = DivSIMD( SubSIMD( A, B ), Four_Threes ); fltx4 F = DivSIMD( SubSIMD( C, D ), Four_Threes );
OutTanU[CB_InteriorIdx[6] ] = SubSIMD( OutTanU[CB_InteriorIdx[6] ], E );
OutTanU[CB_InteriorIdx[7] ] = SubSIMD( OutTanU[CB_InteriorIdx[7] ], F );
}
}
}
// Write the 12 U tangents and 12 V tangents to the caller's arrays.
StoreAlignedSIMD( (float*)&TanU[0], OutTanU[0] );
StoreAlignedSIMD( (float*)&TanU[1], OutTanU[1] );
StoreAlignedSIMD( (float*)&TanU[2], OutTanU[2] );
StoreAlignedSIMD( (float*)&TanU[3], OutTanU[3] );
StoreAlignedSIMD( (float*)&TanU[4], OutTanU[4] );
StoreAlignedSIMD( (float*)&TanU[5], OutTanU[5] );
StoreAlignedSIMD( (float*)&TanU[6], OutTanU[6] );
StoreAlignedSIMD( (float*)&TanU[7], OutTanU[7] );
StoreAlignedSIMD( (float*)&TanU[8], OutTanU[8] );
StoreAlignedSIMD( (float*)&TanU[9], OutTanU[9] );
StoreAlignedSIMD( (float*)&TanU[10], OutTanU[10] );
StoreAlignedSIMD( (float*)&TanU[11], OutTanU[11] );
StoreAlignedSIMD( (float*)&TanV[0], OutTanV[0] );
StoreAlignedSIMD( (float*)&TanV[1], OutTanV[1] );
StoreAlignedSIMD( (float*)&TanV[2], OutTanV[2] );
StoreAlignedSIMD( (float*)&TanV[3], OutTanV[3] );
StoreAlignedSIMD( (float*)&TanV[4], OutTanV[4] );
StoreAlignedSIMD( (float*)&TanV[5], OutTanV[5] );
StoreAlignedSIMD( (float*)&TanV[6], OutTanV[6] );
StoreAlignedSIMD( (float*)&TanV[7], OutTanV[7] );
StoreAlignedSIMD( (float*)&TanV[8], OutTanV[8] );
StoreAlignedSIMD( (float*)&TanV[9], OutTanV[9] );
StoreAlignedSIMD( (float*)&TanV[10], OutTanV[10] );
StoreAlignedSIMD( (float*)&TanV[11], OutTanV[11] );
}
#endif
// Write all 16 position control points to the caller's array (always done, regardless of
// the tangent-related compile-time and runtime switches above).
StoreAlignedSIMD( (float*)&Pos[0], OutPos[0] );
StoreAlignedSIMD( (float*)&Pos[1], OutPos[1] );
StoreAlignedSIMD( (float*)&Pos[2], OutPos[2] );
StoreAlignedSIMD( (float*)&Pos[3], OutPos[3] );
StoreAlignedSIMD( (float*)&Pos[4], OutPos[4] );
StoreAlignedSIMD( (float*)&Pos[5], OutPos[5] );
StoreAlignedSIMD( (float*)&Pos[6], OutPos[6] );
StoreAlignedSIMD( (float*)&Pos[7], OutPos[7] );
StoreAlignedSIMD( (float*)&Pos[8], OutPos[8] );
StoreAlignedSIMD( (float*)&Pos[9], OutPos[9] );
StoreAlignedSIMD( (float*)&Pos[10], OutPos[10] );
StoreAlignedSIMD( (float*)&Pos[11], OutPos[11] );
StoreAlignedSIMD( (float*)&Pos[12], OutPos[12] );
StoreAlignedSIMD( (float*)&Pos[13], OutPos[13] );
StoreAlignedSIMD( (float*)&Pos[14], OutPos[14] );
StoreAlignedSIMD( (float*)&Pos[15], OutPos[15] );
}
#endif