csgo-2018-source/common/ps3/spu_job_shared.h
2021-07-24 21:11:47 -07:00

754 lines
19 KiB
C++

//========= Copyright © 1996-2004, Valve LLC, All rights reserved. ============
//
// This is the common include file to be included in SPU jobs.
// It takes care to remap/emulate some SPU-specific functionality on PPU
//
#ifndef PS3_SPU_JOB_SHARED_HDR
#define PS3_SPU_JOB_SHARED_HDR
#ifdef _PS3
#include <ps3/ps3_platform.h>
#include <cell/spurs/job_chain.h>
#include <cell/spurs/job_queue.h>
#include <cell/spurs/job_queue_port2.h>
#include <cell/dma/types.h>
//
// NOTE: Enable the following block for debugging GCM on SPU; works as of SDK 350
//
#if 0 && defined( __SPU__ )
#include <cell/gcm/gcm_macros.h>
#undef CELL_GCM_ASSERT
#undef CELL_GCM_ASSERTS
#define CELL_GCM_ASSERT(condition) Assert( condition )
#define CELL_GCM_ASSERTS(condition, description) AssertSpuMsg( condition, description )
#define CELL_GCM_ASSERT_ENABLE
#endif
// DMA tag ids used by the jobs that include this header.
// The per-tag comments describe how each tag is meant to be waited on.
enum DmaTagEnum_t
{
DMATAG_SYNC = 2, // used for synchronous transfers, where we need the transfer to finish very soon/immediately after issuing
DMATAG_TEXTURES = 3,
DMATAG_SHADERS = 4,
DMATAG_SCRATCH = 5, // used for DMA PUTs from Scratch memory, so we need to wait for this to finish before job finishes
// each jobchain needs 2 dma tags, up to tag 30
// DMATAG_EDGE_JOBCHAIN = 8,
// DMATAG_FPCP_JOBCHAIN = 10,
// DMATAG_GCM_JOBCHAIN = 12,
// NOTE(review): the three tags below all share the value 8 (the same value as the
// commented-out DMATAG_EDGE_JOBCHAIN above) - presumably intentional; confirm.
DMATAG_ANIM = 8, // non immediate dma's
DMATAG_BUILDINDICES = 8,
DMATAG_BUILDRENDERABLES = 8,
}; // shouldn't overlap with the tags used by the workload
// Enable this define to disable assert. This may be necessary to detect timing issues in DEBUG and RELEASE,
// or incorrectly generated code from compiler. When LSGUARD is enabled, we disable asserts to force potential issues.
#ifdef USE_LSGUARD
# define DISABLE_ASSERT
#endif
// Offsets a typed pointer by a signed number of BYTES (not elements).
//   p      - base pointer
//   nBytes - byte offset to add; may be negative
// Returns the offset address, typed as T*.
// The arithmetic is done on a char* rather than by round-tripping the pointer
// through int, so it neither truncates nor fails to compile where pointers
// are wider than int.
template <typename T>
inline T* AddBytes( T* p, int nBytes )
{
	return ( T* )( ( char* )p + nBytes );
}
// Returns the smaller of the two arguments; when neither compares less than
// the other, b is returned.
template <typename T>
inline T Min( T a, T b )
{
	if( a < b )
	{
		return a;
	}
	return b;
}
// Returns the larger of the two arguments; when neither compares greater than
// the other, b is returned.
template <typename T>
inline T Max( T a, T b )
{
	if( a > b )
	{
		return a;
	}
	return b;
}
// Exchanges the values of a and b through a temporary copy.
template <typename T>
inline void Swap( T& a , T & b )
{
	T held = a;	// keep the old value of a while it is overwritten
	a = b;
	b = held;
}
// <sergiy> should I port platform.h to SPU?
#ifdef SPU
#include <cell/spurs/job_context.h>
#include "cell/spurs/common.h"
#include <cell/atomic.h>
#include <spu_intrinsics.h>
#include <vmx2spu.h>
#define PPU_ONLY(X)
#define SPU_ONLY(X) X
#define vector __vector
void CheckBufferOverflow_Impl();
void CheckDmaGet_Impl( const void * pBuffer, size_t nSize );
#if defined(_CERT) || defined(DISABLE_ASSERT)
# define VjobSpuLog(...)
# define DebuggerBreak()
# define Warning(...)
# define CheckBufferOverflow()
# define CheckDmaGet(p, size)
#else
# include <spu_printf.h>
# define VjobSpuLog( MSG, ... ) spu_printf( "[%d]" MSG, cellSpursGetCurrentSpuId(), ##__VA_ARGS__ )
# define Msg( MSG, ... ) spu_printf( "[%d]" MSG, cellSpursGetCurrentSpuId(), ##__VA_ARGS__ )
#ifndef BASETYPES_H
#define DebuggerBreak() __asm volatile ("stopd $0,$0,$0")
#endif
# define Warning( MSG, ... ) spu_printf( "[%d] Warning: " MSG, cellSpursGetCurrentSpuId(), ##__VA_ARGS__ )
# define CELL_DMA_ASSERT_VERBOSE
# define CheckBufferOverflow() CheckBufferOverflow_Impl()
# define CheckDmaGet(p, size) CheckDmaGet_Impl( p, size )
#endif
#define LWSYNC_PPU_ONLY()
#define VJOB_IOBUFFER_DMATAG g_stInfo->dmaTag // fake DMA tag
#include <cell/spurs/common.h>
#define VjobDmaPut cellDmaPut
#define VjobDmaGet cellDmaGet
#define VjobDmaGetf cellDmaGetf
#define VjobDmaListGet cellDmaListGet
#define VjobDmaLargePut cellDmaLargePut
#define VjobDmaLargePutf cellDmaLargePutf
//#define VjobDmaLargePutb cellDmaLargePutb
#define VjobDmaPutf cellDmaPutf
#define VjobDmaSmallPut cellDmaSmallPut
#define VjobDmaSmallPutf cellDmaSmallPutf
//#define VjobDmaSmallPutb cellDmaSmallPutb
#define VjobDmaSmallGet cellDmaSmallGet
#define VjobWaitTagStatusAll cellDmaWaitTagStatusAll
#define VjobWaitTagStatusImmediate cellDmaWaitTagStatusImmediate
#define VjobDmaGetUint32 cellDmaGetUint32
#define VjobDmaPutUint32 cellDmaPutUint32
#define VjobDmaGetUint64 cellDmaGetUint64
#define VjobDmaPutUint64 cellDmaPutUint64
#define VjobDmaUnalignedPutf cellDmaUnalignedPutf
#define VjobDmaUnalignedPut cellDmaUnalignedPut
// Fenced scalar PUT: splats 'value' across a quadword, then issues a blocking
// MFC_PUTF_CMD transfer of sizeof(uint<SIZE>_t) bytes to effective address 'ea'
// on DMA tag 'tag'.
#define VjobDmaPutfUintTemplate(SIZE, value, ea, tag, tid, rid) \
do { \
uint64_t __cellDma_ea = ea; \
uint32_t __cellDma_tag = tag; \
qword _buf = (qword)spu_splats(value); \
cellDmaDataAssert(__cellDma_ea,sizeof(uint##SIZE##_t),__cellDma_tag); \
cellDmaAndWait(cellDmaEa2Ls(__cellDma_ea,&_buf),__cellDma_ea,sizeof(uint##SIZE##_t),__cellDma_tag,MFC_CMD_WORD(tid,rid,MFC_PUTF_CMD)); \
} while(0)
// The "Putf" wrappers must go through the fenced template above. They used to
// expand to cellDmaPutUintTemplate, which left VjobDmaPutfUintTemplate unused and
// dropped the fence that the names promise.
// NOTE(review): assumes cellDmaPutUintTemplate is the SDK's plain (unfenced) PUT - confirm against the SDK header.
// 'value' is parenthesized so that the cast applies to the whole argument expression.
#define VjobDmaPutfUint8(value, ea, tag) VjobDmaPutfUintTemplate(8, ((uint8_t)(value)), ea, tag, 0, 0)
#define VjobDmaPutfUint16(value, ea, tag) VjobDmaPutfUintTemplate(16, ((uint16_t)(value)), ea, tag, 0, 0)
#define VjobDmaPutfUint32(value, ea, tag) VjobDmaPutfUintTemplate(32, ((uint32_t)(value)), ea, tag, 0, 0)
#define VjobDmaPutfUint64(value, ea, tag) VjobDmaPutfUintTemplate(64, ((uint64_t)(value)), ea, tag, 0, 0)
#define VjobSpuId() int( cellSpursGetCurrentSpuId() )
#define V_memset __builtin_memset
#define V_memcpy __builtin_memcpy
#if !defined ARRAYSIZE
#define ARRAYSIZE( ARRAY ) ( sizeof( ARRAY ) / sizeof( ( ARRAY )[0] ) )
#endif
typedef signed int int32;
typedef unsigned int uint;
typedef signed char int8;
typedef unsigned char uint8;
typedef signed short int16;
typedef unsigned short uint16;
typedef signed int int32;
typedef unsigned int uint32;
typedef signed long long int64;
typedef unsigned long long uint64;
typedef unsigned int uintp;
typedef vector float fltx4 ;
#define INT_MAX 0x7fffffff
#define DECL_ALIGN(x) __attribute__( ( aligned( x ) ) )
#ifndef BASETYPES_H
#define ALIGN16 DECL_ALIGN(16)
#define ALIGN16_POST
#define ALIGN128 DECL_ALIGN(128)
#define ALIGN128_POST
template <typename T>
inline T AlignValue( T val, uintp alignment )
{
return ( T )( ( ( uintp )val + alignment - 1 ) & ~( alignment - 1 ) );
}
// Macro form of AlignValue: rounds val up to the next multiple of alignment
// (alignment must be a power of two). Both arguments are fully parenthesized so
// that expression arguments such as "cond ? a : b" or "1 << n" expand correctly.
// Note that alignment is evaluated twice.
#define ALIGN_VALUE( val, alignment ) ( ( ( val ) + ( alignment ) - 1 ) & ~( ( alignment ) - 1 ) )
// Returns true when x has at most one bit set.
// Note that this reports true for x == 0 as well.
inline bool IsPowerOfTwo( uint x )
{
	// clearing the lowest set bit leaves nothing behind only if there was at most one bit
	const uint nWithoutLowestBit = x & ( x - 1 );
	return nWithoutLowestBit == 0;
}
#endif
#define FORCEINLINE inline /* __attribute__ ((always_inline)) */
#define IsPlatformPS3() 1
#define IsPlatformPS3_PPU() 0
#define IsPlatformPS3_SPU() 1
#define IsPlatformX360() 0
#define IsPlatformOSX() 0
#if !defined RESTRICT
#define RESTRICT
#endif
#define V_memset __builtin_memset
#define V_memcpy __builtin_memcpy
// SPU build: intentionally empty stub. The PPU branch of this header declares an
// extern function of the same name, so shared job code can call it unconditionally.
inline void VjobPpuRereadEA( uintp ea ){}
#if defined(_CERT) || defined(DISABLE_ASSERT)
#define Assert(x) ((void)(0))
#define AssertSpuMsg(x,MSG,...)((void)0)
#ifndef DBG_H
#define COMPILE_TIME_ASSERT( pred ) // to avoid any unpredictable affects in the optimizer
#endif
#else
#define DBGFLAG_ASSERT
#ifndef DBG_H
#define Assert(x) do{if( !( x ) ) { spu_printf( "Assert on SPU[%d](" #x ")\n", cellSpursGetCurrentSpuId() ); DebuggerBreak(); } }while(0)
#endif
#define AssertSpuMsg(x,MSG,...) do{if( !( x ) ) { spu_printf( "Assert on SPU[%d](" #x "), " MSG, cellSpursGetCurrentSpuId(), ## __VA_ARGS__ ); DebuggerBreak(); } }while(0)
#ifndef DBG_H
#define COMPILE_TIME_ASSERT( pred ) switch(0){case 0:case pred:;}
#endif
#endif
// mimic the PPU class on SPU
// template< int bytesAlignment, class T >
// class CAlignedNewDelete : public T
// {public:
// }
// WARNING: SLOWNESS. DO NOT USE IN PRODUCTION.
// Debug helper (SPU): copies nSize bytes from eaSrc to eaDest by bouncing the data
// through lsScratch in chunks of at most 16 KB, waiting on DMATAG_SYNC after every
// GET and every PUT.
//   eaDest, eaSrc, nSize - asserted to all be multiples of 16
//   lsScratch            - staging buffer; at most 16 KB of it is used per chunk
inline void DebugMemcpyEa( uint eaDest, uint eaSrc, uint nSize, void *lsScratch )
{
	Assert( ! ( 0xF & ( eaSrc | eaDest | nSize ) ) );
	for( uint nCopied = 0; nCopied < nSize; )
	{
		const uint nChunk = Min<uint>( 16 * 1024, nSize - nCopied );

		// pull the chunk in and wait for it to land before pushing it back out
		VjobDmaGet( lsScratch, eaSrc + nCopied, nChunk, DMATAG_SYNC, 0, 0 );
		VjobWaitTagStatusAll( 1 << DMATAG_SYNC );

		// the scratch buffer is reused next iteration, so wait for the PUT as well
		VjobDmaPut( lsScratch, eaDest + nCopied, nChunk, DMATAG_SYNC, 0, 0 );
		VjobWaitTagStatusAll( 1 << DMATAG_SYNC );

		nCopied += nChunk;
	}
}
#define vec_to_uint32(X) si_to_uint( ( qword )( X ) )
#define VjobQueuePort2PushJob( eaPort, eaJob, sizeDesc, tag, dmaTag, flag ) cellSpursJobQueuePort2PushJob( (uintp)( eaPort ), (uintp)( eaJob ), ( sizeDesc ), ( tag ), ( dmaTag ), ( flag ) )
#define VjobQueuePort2PushSync( eaPort2, tagMask, dmaTag, flag ) cellSpursJobQueuePort2PushSync( ( uintp ) ( eaPort2), ( tagMask ), ( dmaTag ), ( flag ) )
// SPU build: pushes a job to the job queue port, retrying the non-blocking push
// for as long as it reports CELL_SPURS_JOB_ERROR_AGAIN. Any other result except
// CELL_OK is logged and followed by a debugger break.
//   eaPort2   - job queue port (passed to the SDK call as an address)
//   eaJob     - job descriptor to push
//   sizeDesc  - size of the job descriptor
//   nQueueTag - job queue tag
//   nDmaTag   - DMA tag handed to the push call
inline void VjobQueuePort2PushJobBlocking( CellSpursJobQueuePort2 *eaPort2, CellSpursJobHeader *eaJob, size_t sizeDesc, uint nQueueTag, uint nDmaTag )
{
	int nError;
	do
	{
		nError = cellSpursJobQueuePort2PushJob( uintp( eaPort2 ), uintp( eaJob ) , sizeDesc, nQueueTag, nDmaTag, CELL_SPURS_JOBQUEUE_FLAG_NON_BLOCKING );
	}
	while( nError == CELL_SPURS_JOB_ERROR_AGAIN );

	if( nError == CELL_OK )
	{
		return;
	}

	VjobSpuLog( "Cannot push job, error %d. RSX is going to hang, then SPUs, then PPU.\n", nError );
	DebuggerBreak();
}
// SPU build: pushes a sync command to the job queue port, retrying the non-blocking
// push for as long as it reports CELL_SPURS_JOB_ERROR_AGAIN. Any other result except
// CELL_OK is logged and followed by a debugger break.
//   eaPort2 - job queue port (passed to the SDK call as an address)
//   tagMask - mask of job queue tags to synchronize on
//   nDmaTag - DMA tag handed to the push call
inline void VjobQueuePort2PushSyncBlocking( CellSpursJobQueuePort2 *eaPort2, unsigned tagMask, uint nDmaTag )
{
	int nError;
	for(;;)
	{
		nError = cellSpursJobQueuePort2PushSync( uintp( eaPort2 ), tagMask, nDmaTag, CELL_SPURS_JOBQUEUE_FLAG_NON_BLOCKING );
		if( nError != CELL_SPURS_JOB_ERROR_AGAIN )
		{
			break;
		}
	}
	if ( nError != CELL_OK )
	{
		// The message names the sync push explicitly so it can be told apart from a failed job push in the log
		VjobSpuLog( "Cannot push sync, error %d. RSX is going to hang, then SPUs, then PPU.\n", nError );
		DebuggerBreak();
	}
}
#else
#include "tier0/platform.h"
#include "tier1/strtools.h"
#include "mathlib/ssemath.h"
#include <altivec.h>
#include <cell/spurs/job_context_types.h>
// PPU build: there is no current SPU, so an all-ones id is reported.
inline uint32_t GetCurrentSpuId()
{
	const uint32_t nNoSpuId = 0xFFFFFFFF;
	return nNoSpuId;
}
using namespace ::cell::Spurs;
extern void VjobSpuLog( const char * p, ... );
#define VJOB_IOBUFFER_DMATAG 0 // fake DMA tag
#define PPU_ONLY(X) X
#define SPU_ONLY(X)
#ifdef _DEBUG
#define AssertSpuMsg(x,MSG,...) do { if( !( x ) ) { Warning( "Assert(" #x "), " MSG, ## __VA_ARGS__ ); DebuggerBreak(); } }while( 0 )
#else
#define AssertSpuMsg(x,MSG,...)
#endif
#define VjobQueuePort2PushJob( eaPort, eaJob, sizeDesc, tag, dmaTag, flag ) cellSpursJobQueuePort2PushJob( (CellSpursJobQueuePort2 *)( eaPort ), (CellSpursJobHeader *)( eaJob ), ( sizeDesc ), ( tag ), ( flag ) )
#define VjobQueuePort2PushSync( eaPort2, tagMask, dmaTag, flag ) cellSpursJobQueuePort2PushSync( (CellSpursJobQueuePort2 *) ( eaPort2), ( tagMask ), ( flag ) )
// PPU build: pushes a job to the job queue port with flags == 0 and asserts that
// the push succeeded. nDmaTag is accepted for signature parity with the SPU
// version and is not used here.
inline void VjobQueuePort2PushJobBlocking( CellSpursJobQueuePort2 *eaPort2, CellSpursJobHeader *eaJob, size_t sizeDesc, uint nQueueTag, uint nDmaTag )
{
	// synchronous call
	const int nError = cellSpursJobQueuePort2PushJob( eaPort2, eaJob, sizeDesc, nQueueTag, 0 );
	Assert( nError == CELL_OK );
	(void) nError;	// keeps the variable referenced when Assert compiles to nothing
}
// PPU build: pushes a sync command to the job queue port with flags == 0 and
// asserts that the push succeeded. nDmaTag is accepted for signature parity with
// the SPU version and is not used here.
inline void VjobQueuePort2PushSyncBlocking( CellSpursJobQueuePort2 *eaPort2, unsigned tagMask, uint nDmaTag )
{
	// synchronous call
	const int nError = cellSpursJobQueuePort2PushSync( eaPort2, tagMask, 0 );
	Assert( nError == CELL_OK );
	(void) nError;	// keeps the variable referenced when Assert compiles to nothing
}
#define VjobSpuId() -1
#define LWSYNC_PPU_ONLY() __lwsync()
// PPU-side counterparts of the DMA calls that the SPU build maps straight onto
// cellDma*. These are declarations only; the definitions are not in this header.
// All of them share the same parameter layout:
//   ls   - local buffer (source for the puts, destination for the gets)
//   ea   - effective address on the other side of the transfer
//   size - transfer size in bytes
//   tag  - DMA tag
//   tid, rid - forwarded as the last two arguments, like in the cellDma* calls
extern void VjobDmaPut( const void * ls, uint64_t ea, uint32_t size, uint32_t tag, uint32_t tid, uint32_t rid );
extern void VjobDmaGet( void * ls, uint64_t ea, uint32_t size, uint32_t tag, uint32_t tid, uint32_t rid );
extern void VjobDmaGetf( void * ls, uint64_t ea, uint32_t size, uint32_t tag, uint32_t tid, uint32_t rid );
// List form: 'list' points to 'listSize' worth of CellDmaListElement entries
extern void VjobDmaListGet( void *ls, uint64_t ea, const CellDmaListElement *list, uint32_t listSize, uint32_t tag, uint32_t tid, uint32_t rid );
extern void VjobDmaLargePut( const void * ls, uint64_t ea, uint32_t size, uint32_t tag, uint32_t tid, uint32_t rid );
extern void VjobDmaLargePutf( const void * ls, uint64_t ea, uint32_t size, uint32_t tag, uint32_t tid, uint32_t rid );
extern void VjobDmaLargePutb( const void * ls, uint64_t ea, uint32_t size, uint32_t tag, uint32_t tid, uint32_t rid );
extern void VjobDmaPutf( const void * ls, uint64_t ea, uint32_t size, uint32_t tag, uint32_t tid, uint32_t rid );
extern void VjobDmaSmallPut( const void * ls, uint64_t ea, uint32_t size, uint32_t tag, uint32_t tid, uint32_t rid );
extern void VjobDmaSmallGet( void * ls, uint64_t ea, uint32_t size, uint32_t tag, uint32_t tid, uint32_t rid );
extern void VjobDmaSmallPutb( const void * ls, uint64_t ea, uint32_t size, uint32_t tag, uint32_t tid, uint32_t rid );
extern void VjobDmaSmallPutf( const void * ls, uint64_t ea, uint32_t size, uint32_t tag, uint32_t tid, uint32_t rid );
// Scalar and unaligned DMA helpers for the PPU build (declarations only).
// NOTE: implementation must wait for tag
uint32_t VjobDmaGetUint32( uint64_t ea, uint32_t tag, uint32_t tid, uint32_t rid );
void VjobDmaPutUint32( uint32_t value, uint64_t ea, uint32_t tag, uint32_t tid, uint32_t rid );
uint64_t VjobDmaGetUint64( uint64_t ea, uint32_t tag, uint32_t tid, uint32_t rid );
void VjobDmaPutUint64( uint64_t value, uint64_t ea, uint32_t tag, uint32_t tid, uint32_t rid );
void VjobDmaUnalignedPutf( const void *ls, uint64_t ea, uint32_t size, uint32_t tag, uint32_t tid, uint32_t rid );
void VjobDmaUnalignedPut( const void *ls, uint64_t ea, uint32_t size, uint32_t tag, uint32_t tid, uint32_t rid );
// PPU build: deferred DMA is not emulated, so the tag-wait calls have nothing to
// wait for and simply hand the requested mask back to the caller.
inline uint VjobWaitTagStatusAll( uint nTagMask )
{
	return nTagMask;
}
inline uint VjobWaitTagStatusImmediate( uint nTagMask )
{
	return nTagMask;
}
// PPU build: the "fenced scalar put" is a plain store through the effective address.
// 'tag' is accepted for parity with the SPU macros and ignored.
// Both 'ea' and 'value' are parenthesized so that expression arguments
// (e.g. "base + 4" or "x >> 8") are cast as a whole, and the expansion itself is
// parenthesized so it behaves as a single expression.
#define VjobDmaPutfUint8(value, ea, tag) ( *(uint8*)(ea) = (uint8)(value) )
#define VjobDmaPutfUint16(value, ea, tag) ( *(uint16*)(ea) = (uint16)(value) )
#define VjobDmaPutfUint32(value, ea, tag) ( *(uint32*)(ea) = (uint32)(value) )
#define VjobDmaPutfUint64(value, ea, tag) ( *(uint64*)(ea) = (uint64)(value) )
void VjobPushJob( void ( *pfnMain )( CellSpursJobContext2 * stInfo, CellSpursJob256 * job ), CellSpursJob128 * job );
extern void VjobSpuLog( const char * p, ... );
extern void VjobPpuRereadEA( uintp ea );
// PPU build of the debug copy: effective addresses are ordinary pointers here, so
// this is a direct memcpy. lsScratch is only needed by the SPU version and is
// ignored.
//   eaDest, eaSrc, nSize - asserted to all be multiples of 16
inline void DebugMemcpyEa( uint eaDest, uint eaSrc, uint nSize, void *lsScratch )
{
	Assert( ! ( 0xF & ( eaSrc | eaDest | nSize ) ) );
	void * const pTo = (void*)eaDest;
	void * const pFrom = (void*)eaSrc;
	memcpy( pTo, pFrom, nSize );
}
extern void TestAlignBuffer();
#define vec_to_uint32(X) (*(uint32*)&(X))
#endif // SPU
#define VjobDmaEa2Ls16(ea, ls) ((uintptr_t)(ls)+((uint32_t)(ea)&15))
#define VjobDmaEa2Ls128(ea, ls) ((uintptr_t)(ls)+((uint32_t)(ea)&127))
// Picks the 32-bit slot inside the quadword at lsAligned that has the same offset
// within 16 bytes as eaUnaligned (see VjobDmaEa2Ls16), stores nInitialValue there
// and returns a pointer to that slot.
//   lsAligned     - local staging quadword (asserted to be at least 4-byte aligned)
//   eaUnaligned   - destination address whose low 4 bits select the slot
//   nInitialValue - value written into the slot
inline uint32* PrepareSmallPut32( vector unsigned int * lsAligned, volatile uint32 * eaUnaligned, uint32 nInitialValue )
{
	Assert( !( 3 & uint( lsAligned ) ) );
	uint32 * const pSlot = ( uint32* )VjobDmaEa2Ls16( eaUnaligned, lsAligned );
	pSlot[0] = nInitialValue;
	return pSlot;
}
// Picks the 64-bit slot inside the quadword at lsAligned that has the same offset
// within 16 bytes as eaUnaligned (see VjobDmaEa2Ls16), stores nInitialValue there
// and returns a pointer to that slot.
//   lsAligned     - local staging quadword (asserted to be at least 8-byte aligned)
//   eaUnaligned   - destination address whose low 4 bits select the slot
//   nInitialValue - value written into the slot
inline uint64* PrepareSmallPut64( vector unsigned int * lsAligned, volatile uint64 * eaUnaligned, uint64 nInitialValue )
{
	Assert( !( 7 & uint( lsAligned ) ) );
	uint64 * const pSlot = ( uint64* )VjobDmaEa2Ls16( eaUnaligned, lsAligned );
	pSlot[0] = nInitialValue;
	return pSlot;
}
extern CellSpursJobContext2* g_stInfo;
#ifndef IsDebug
# ifdef _DEBUG
# define IsDebug() true
# else
# define IsDebug() false
# endif
#endif
#ifndef IsCert
# ifdef _CERT
# define IsCert() true
# else
# define IsCert() false
# endif
#endif
extern uint g_nBreakMask ;
// BreakOn( nId ): breaks into the debugger when bit nId of g_nBreakMask is set.
// Compiles to nothing in _CERT builds.
// nId is parenthesized so that expression arguments (e.g. "a | b") select the
// intended bit, and the shift is done on an unsigned literal so bit 31 is usable.
#ifdef _CERT
#	define BreakOn( nId )
#else
#	define BreakOn( nId ) do \
	{ \
		if( g_nBreakMask & ( 1u << ( nId ) ) ) \
			DebuggerBreak(); \
	}while( 0 )
#endif
// Debug-only delay of roughly nCycles; does nothing when IsCert() is true.
//   SPU: busy-waits until the decrementer has advanced by nCycles / 40 ticks
//        (presumably the cycles-per-decrementer-tick ratio - confirm).
//   PPU: sleeps for nCycles / 3200 microseconds.
inline void VjobDebugSpinCycles( uint nCycles )
{
	if( IsCert() )
	{
		return;
	}
#ifdef SPU
	// the decrementer counts down, hence start minus current gives elapsed ticks
	uint nStart = spu_read_decrementer();
	for( ;; )
	{
		if( !( nStart - spu_read_decrementer() < nCycles / 40 ) )
		{
			break;
		}
	}
#else
	sys_timer_usleep( nCycles / 3200 );
#endif
}
// this is the DMA list element without notify or reserved fields, so that it's easy to fill it in
// and be sure there is no garbage left (in notify and reserved fields) and there are no bit field operations (to store size, which is effectively only 14-bit value)
// Layout: two 32-bit words, size first.
struct BasicDmaListElement_t
{
uint32 size; // transfer size; per the note above only the low 14 bits are meaningful
uint32 eal; // presumably the low 32 bits of the effective address - TODO confirm against CellDmaListElement
};
// shifts unaligned pBuffer of given size left by 0..15 bytes to make it aligned
// returns the aligned pointer, pBuffer & -16
extern void* AlignBuffer( void * pBuffer, uint nBytes);
//
// Adds constant nAdd to the given unaligned buffer of uint16's
//
extern void UnalignedBufferAddU16( uint16 * pBuffer, uint nCount, uint16 nAdd );
// SpursJob_t must be one of CellSpursJob64, CellSpursJob128, CellSpursJob256,...
// JobParam_t is the parameter structure passed to the job
template < typename JobParam_t , typename SpursJob_t >
inline JobParam_t * VjobGetJobParams( void * pJob )
{
Assert( sizeof( JobParam_t ) + sizeof( CellSpursJobHeader ) <= sizeof( SpursJob_t ) );
JobParam_t * pJobParams = ( JobParam_t* ) ( uintp( pJob ) + ( sizeof( SpursJob_t ) - sizeof( JobParam_t ) ) );
Assert( uintp( pJobParams + 1 ) == uintp( pJob ) + sizeof( SpursJob_t ) );
return pJobParams;
}
extern void UnalignedBufferAddU16( );
// Compile-time base-2 logarithm for powers of two.
// Log2<n>::VALUE is defined for every power of two n >= 1 (previously only 8, 16,
// 32 and 256 were available). For any other n the recursion is steered into
// Log2<0>, which has no VALUE, so using it is a compile error - exactly like an
// unsupported value was before.
template <unsigned int n> struct Log2
{
	enum { VALUE = Log2< ( ( ( n & ( n - 1 ) ) != 0 ) ? 0 : ( n >> 1 ) ) >::VALUE + 1 };
};
template<> struct Log2<0> {};						// deliberately no VALUE: rejects non-powers of two
template<> struct Log2<1> { enum { VALUE = 0 }; };	// recursion anchor
#define COMPILE_TIME_LOG2(VAL) ( Log2<VAL>::VALUE )
inline void ZeroMemAligned( void * p, uint nSize )
{
Assert( !( ( uintp( p ) | nSize ) & 15 ) );
for( uint i = 0; i < nSize; i += 16 )
{
*( vec_uint4* )( uintp( p ) + i ) = (vec_uint4){0,0,0,0};
}
}
inline void CopyMemAligned( void * pDst, const void * pSrc, uint nSize )
{
Assert( !( ( uintp( pDst ) | uintp( pSrc ) | nSize ) & 15 ) );
for( uint i = 0; i < nSize; i += 16 )
{
*( vec_uint4* )( uintp( pDst ) + i ) = *( vec_uint4* )( uintp( pSrc ) + i );
}
}
///////////////////////////////////////////////////////////////////////////
//
// Reference implementation
//
// Fixed-size bit array of nBitCount bits.
// Bits are stored MSB-first inside 32-bit words: bit n lives in m_u32[ n >> 5 ]
// under the mask ( 0x80000000 >> ( n & 31 ) ). The same storage is also viewed as
// an array of quadwords so that it can be cleared with vector stores.
template <uint nBitCount>
class CBitArray
{
public:
	// Resets every bit to 0.
	void Clear()
	{
		// Covers the whole quadword array, including the last partially used
		// quadword when nBitCount is not a multiple of 128.
		for( uint i = 0; i < ( nBitCount + 127 ) / 128; ++i )
		{
			m_qword[i] = ( vec_uint4 ){0,0,0,0};
		}
	}

	// Sets the bits in the half-open range [nStart, nEnd).
	// nEnd is clamped to nBitCount; nothing happens when nStart > nEnd.
	void SetRange( uint nStart, uint nEnd )
	{
		nEnd = Min( nEnd, nBitCount );
		if( nStart > nEnd )
			return;

		// first word: only the bits from nStart to the end of the word
		uint nMask = uint( -1 ) >> ( nStart & 0x1F );
		for( uint i = ( nStart >> 5 ); i < ( nEnd >> 5 ); ++i )
		{
			m_u32[i] |= nMask;
			nMask = uint( -1 );	// every following word is taken from its first bit
		}

		// last word: keep only the bits before nEnd
		nMask &= ~( uint( -1 ) >> ( nEnd & 0x1F ) );
		if( nMask )
		{
			// Skipped when nothing is left to set: with nEnd on a word boundary the
			// index nEnd >> 5 can be one past the end of m_u32.
			m_u32[ nEnd >> 5 ] |= nMask;
		}
	}

	// Returns the index of the first set bit at or after nFrom, or nBitCount if there is none.
	uint GetFirst1( uint nFrom )const
	{
		for( uint i = nFrom; i < nBitCount; ++i )
			if( GetBit( i ) )
				return i;
		return nBitCount;
	}

	// Returns the index of the first clear bit at or after nFrom, or nBitCount if there is none.
	uint GetFirst0( uint nFrom )const
	{
		for( uint i = nFrom; i < nBitCount; ++i )
			if( !GetBit( i ) )
				return i;
		return nBitCount;
	}

	// Returns non-zero (the bit's mask within its word, not necessarily 1) when bit n is set.
	// n is not range-checked.
	uint GetBit( uint n )const
	{
		return m_u32[ n >> 5 ] & ( 0x80000000 >> ( n & 0x1F ) );
	}

protected:
	union
	{
		vec_uint4 m_qword[ ( nBitCount + 127 ) / 128 ];	// vector view, used by Clear()
		uint32 m_u32[ ( nBitCount + 31 ) / 32 ];		// word view, used by the bit accessors
	};
};
#endif // _PS3
#endif