612 lines
23 KiB
C++
Raw Permalink Normal View History

2021-07-24 21:11:47 -07:00
//========= Copyright © Valve Corporation, All rights reserved. ====//
#include "tier0/platform.h"
#ifdef _PS3
#include "dxabstract.h"
#include <sys/memory.h>
#include "ps3/spugcm_shared.h"
#include "fpcpatcher_spu.h"
#include "cg/cg.h"
#include "cg/cgBinary.h"
#include "vjobs_interface.h"
#include "tier0/hardware_clock_fast.h"
#include "vjobs/fpcpatch_shared.h"
#include "vjobs/root.h"
#include "ps3/vjobutils.h"
#include "tier0/microprofiler.h"
#include "ps3/ps3_gcm_config.h"
#include "spugcm.h"
// Numeric profile identifiers for RSX vertex/fragment programs.
// PROFILE_SCE_FP_RSX is compared against CgBinaryProgram::profile in
// ValidatePatchedProgram(); PROFILE_SCE_VP_RSX is not referenced in this file.
enum
{
PROFILE_SCE_VP_RSX = 7003,
PROFILE_SCE_FP_RSX = 7004
};
// Calls FUNC( args... ) and reports through Error() (with the returned code, file and line)
// when the result is not CELL_OK. Note that it declares a local named nError inside its own
// do/while scope, so an outer variable called nError is shadowed within the call arguments.
#define GCM_MUST_SUCCEED( FUNC, ... ) do { int nError = FUNC(__VA_ARGS__); if( nError != CELL_OK ) { Error( "Error 0x%X in " #FUNC ", %s:%d\n", nError, __FILE__, __LINE__ ); } } while( 0 )
DEFINE_LOGGING_CHANNEL_NO_TAGS( LOG_VJOBS, "VJOBS" );
CFragmentProgramConstantPatcher_SPU g_pixelShaderPatcher; // Patches pixel shader constants
// Micro-profilers scoped by MICRO_PROFILE() in BindProgram()/FpcPatch2() and printed from EndScene()
CMicroProfiler g_mpBindProgram, g_mpFpcPatch2;
// debug only
// Constructs a patcher that is not yet bound to any buffer; InitLocal() does the real setup.
//
// Every member that the destructor reads is given a defined value here, so that destroying
// an instance on which InitLocal() was never called does not act on indeterminate data
// (previously this only worked because the single instance is a zero-initialized global).
CFragmentProgramConstantPatcher_SPU::CFragmentProgramConstantPatcher_SPU()
{
    m_pBuffer = m_pBufferEnd = NULL;
    m_nIoOffsetDelta = 0; // m_pBuffer + m_nIoOffsetDelta == IO offset usable by RSX
    m_pPutFragmentProgram = NULL;
    m_isBufferPassedIn = false;     // selects which resource the destructor releases
    m_state.m_pSharedState = NULL;  // allocated in InitLocal(), freed in the destructor
#ifdef DEBUG_FPC_PATCHER
    m_pSyncState = NULL;            // allocated in InitLocal(), freed in the destructor
    m_bSync = ( CommandLine()->FindParm( "-fpcpsync" ) != 0 );
#endif
}
void CFragmentProgramConstantPatcher_SPU::InitLocal( void *pBuffer, uint nSize )
{
m_nFpcPatchCounter = 0;
m_nFpcPatchCounterOfLastSyncJob = 0;
//cellGcmSetDebugOutputLevel( CELL_GCM_DEBUG_LEVEL2 );
const uint nOverfetchGuard = 1024; // RSX front end prefetches up to 4k, but 1k is ( should be ) enough to avoid overfetch crashes
const uint nStateBufferQwords = 1 << 12; // make space for at least 8 full batches of constants...
uint nPatchStateBufferSize = ( sizeof( job_fpcpatch::FpcPatchState_t ) + sizeof( fltx4 ) * nStateBufferQwords );
uint32 nBufferIoOffset;
m_bFpcPatchOnPpu = ( 0 != CommandLine()->FindParm( "-fpcpatchonppu" ) );
#ifdef DEBUG_FPC_PATCHER
m_bTestAlwaysStateSync = ( 0 != CommandLine()->FindParm( "-fpcpstatesync" ) );
#endif
m_bEnableSPU = true;
m_nFpcPatchSyncMask = 0;
// use this passed buffer (probably from local memory) for the patched stuff
m_pBuffer = ( uint32* ) pBuffer;
m_pBufferEnd = ( uint32* ) ( uintp( pBuffer ) + nSize );
m_nBufferLocation = CELL_GCM_LOCATION_LOCAL;
m_isBufferPassedIn = true;
m_state.Init( ( job_fpcpatch::FpcPatchState_t* )MemAlloc_AllocAligned( nPatchStateBufferSize, 128 ), nStateBufferQwords );
GCM_MUST_SUCCEED( cellGcmAddressToOffset, m_pBuffer, &nBufferIoOffset );
#ifdef DBGFLAG_ASSERT
uint32 nBufferIoOffsetCheck;
GCM_MUST_SUCCEED( cellGcmAddressToOffset, m_pBuffer, &nBufferIoOffsetCheck );
Assert( nBufferIoOffsetCheck == nBufferIoOffset );
Assert( !( nBufferIoOffsetCheck & 0x7F ) );
for( uint nOffset = 0; nOffset < nSize; nOffset += 128 )
{
GCM_MUST_SUCCEED( cellGcmAddressToOffset, ((uint8*)m_pBuffer) + nOffset, &nBufferIoOffsetCheck );
Assert( nBufferIoOffsetCheck == nBufferIoOffset + nOffset );
}
#endif
m_nIoOffsetDelta = nBufferIoOffset - uintp( m_pBuffer );
#ifdef DEBUG_FPC_PATCHER
m_pSyncState = ( fltx4* ) MemAlloc_AllocAligned( sizeof( fltx4 ) * job_fpcpatch::MAX_VIRTUAL_CONST_COUNT, 16 );
V_memset( m_pSyncState, 0xCD, sizeof( fltx4 ) * job_fpcpatch::MAX_VIRTUAL_CONST_COUNT );
V_memset( m_state.m_pSharedState->m_reg, 0xCD, sizeof( fltx4 ) * job_fpcpatch::MAX_VIRTUAL_CONST_COUNT );
#endif
ResetPut();
//cellGcmSetDebugOutputLevel( CELL_GCM_DEBUG_LEVEL0 );
}
// Intentionally empty: nothing is released here; the resources acquired in InitLocal()
// are freed by the destructor.
void CFragmentProgramConstantPatcher_SPU::Shutdown()
{
}
// Resets the fragment-program put pointer to the end of the patch buffer.
// NOTE(review): the trailing comment below appears to predate the current ring-based
// allocation in BindProgram(), which overwrites m_pPutFragmentProgram on every bind - confirm.
void CFragmentProgramConstantPatcher_SPU::ResetPut()
{
m_pPutFragmentProgram = m_pBufferEnd; // reserved word for the count of constants to set
}
// Releases whatever this patcher owns.
//  - buffer NOT passed in: the patch buffer itself is returned with sys_memory_free()
//  - buffer passed in (InitLocal): only the aligned shared patch state is freed
// The debug register mirror is freed in DEBUG_FPC_PATCHER builds in either case.
CFragmentProgramConstantPatcher_SPU::~CFragmentProgramConstantPatcher_SPU()
{
    if( !m_isBufferPassedIn )
    {
        sys_memory_free( ( sys_addr_t )m_pBuffer );
    }
    else
    {
        MemAlloc_FreeAligned( m_state.m_pSharedState );
    }

#ifdef DEBUG_FPC_PATCHER
    MemAlloc_FreeAligned( m_pSyncState );
#endif
}
// Snapshots the patch counter at the start of a scene; EndScene() uses the snapshot to
// decide whether any fragment programs were bound during the scene.
void CFragmentProgramConstantPatcher_SPU::BeginScene()
{
m_nFpcPatchCounterAtBeginScene = m_nFpcPatchCounter;
// we shouldn't have in-flight SPU jobs by now.. should we?
// Sanity check (assert builds only): the predicted start of ranges must be within one
// buffer length (m_nBufferMask + 1 qwords) of the start recorded in the shared state.
Assert( uint( g_spuGcmShared.m_nFpcpStartRangesAfterLastSync - m_state.m_pSharedState->m_nStartRanges ) <= m_state.m_pSharedState->m_nBufferMask + 1 );
}
// End-of-scene hook. When the micro profiler is compiled in and enabled at run time,
// prints and resets the BindProgram / FpcPatch2 timers, but only for scenes in which at
// least one fragment program was bound since BeginScene().
void CFragmentProgramConstantPatcher_SPU::EndScene()
{
#if ENABLE_MICRO_PROFILER > 0
    extern bool g_bDxMicroProfile;
    if( !g_bDxMicroProfile )
    {
        return;
    }

    const uint nPatchesThisScene = m_nFpcPatchCounter - m_nFpcPatchCounterAtBeginScene;
    if( nPatchesThisScene == 0 )
    {
        return;
    }

    g_mpBindProgram.PrintAndReset( "[BindProgram] " );
    g_mpFpcPatch2.PrintAndReset( "[FpcPatch2] " );
#endif
}
// All-zero fragment program header; passed to FpcPatch2() together with a NULL target
// when a job is issued without any program to patch (see SetFragmentRegisterBlock()).
job_fpcpatch2::FpHeader_t g_nullFpHeader = {0,0,0,0};
// Appends a block of pixel shader constants to the constant journal that the SPU patch
// jobs consume. Semantics should match cgGLSetFragmentRegisterBlock().
//
//   nStartRegister - index of the first virtual constant register to set
//   nVector4fCount - number of 4-float vectors to set
//   pConstantData  - source data, nVector4fCount vectors of 4 floats
//
// May stall inside FpcPatchState::AddRange() until the journal has room; a warning is
// printed afterwards when stall warnings are enabled.
void CFragmentProgramConstantPatcher_SPU::SetFragmentRegisterBlock( uint nStartRegister, uint nVector4fCount, const float * pConstantData )
{
#ifndef _CERT
    // The count is compared against the remaining room rather than computing start + count,
    // so a huge count cannot wrap around 32 bits and slip through the check.
    if ( nStartRegister >= job_fpcpatch::MAX_VIRTUAL_CONST_COUNT || nVector4fCount > uint( job_fpcpatch::MAX_VIRTUAL_CONST_COUNT ) - nStartRegister )
        Error( "Invalid Fragment Register Block Range %u..%u\n", nStartRegister, nStartRegister + nVector4fCount );
#endif
#ifdef DEBUG_FPC_PATCHER
    if( m_bSync )
    {
        // the PPU mirror must agree with the shared state + journal before we add anything
        fltx4 reg[job_fpcpatch::MAX_VIRTUAL_CONST_COUNT];
        m_state.GetSyncState( reg );
        Assert( !V_memcmp( m_pSyncState, reg, sizeof( fltx4 ) * job_fpcpatch::MAX_VIRTUAL_CONST_COUNT ) );
    }
#endif
    ///////////////////////////////////////////////////////////////////////////
    //
    // PREPATCH MUST BE DONE IN (CTXFLUSH OR) DRAW JOB FROM NOW ON!!! g_spuGcmShared.m_nFpcpStartRangesAfterLastSync IS SYNCHRONOUS AND CORRECT THERE
    //
    //////////////////////////////////////////////////////////////////////////
    /*
    // (disabled PPU-side prepatch, kept for reference together with the locals it used)
    // we have 4 DMA elements ( 2..6 ) to fit the constant buffer; the 1st element may have to be as small as 16 bytes.
    // this leaves the max constant buffer size 4 * 16kb + 16 bytes
    const uint nMaxUploadRangeBeforeStateSync = ( 32 * 1024 ) / sizeof( fltx4 );
    uint numUploadRangeQwords = m_state.m_nEndOfJournalIdx - g_spuGcmShared.m_nFpcpStartRangesAfterLastSync;
    bool bPrePatch = nVector4fCount + 1 + numUploadRangeQwords > nMaxUploadRangeBeforeStateSync;
    if( bPrePatch )
    {
        // force state sync now
        if( g_spuGcmShared.m_enableStallWarnings )
        {
            Warning( "PPU-SPU Wait for RSX. SetFragmentRegisterBlock: Forced to set state on PPU, %u vectors, %u qwords in history. This is slow fallback path.\n", nVector4fCount, numUploadRangeQwords );
        }
        FpcPatch2( &g_nullFpHeader, sizeof( g_nullFpHeader ), NULL, NULL );
    }
    */
    if( uint nAttempts = m_state.AddRange( nStartRegister, nVector4fCount, pConstantData ) )
    {
        // AddRange() returns the number of wait iterations it needed; non-zero means we stalled
        if( g_spuGcmShared.m_enableStallWarnings )
        {
            Warning( "PPU-SPU Wait for RSX. SetFragmentRegisterBlock: Stall, %d spins. Waiting for more memory; %d qwords, %d jobs buffered up\n", nAttempts, m_state.m_nEndOfJournalIdx - m_state.m_pSharedState->m_nStartRanges, g_spuGcmShared.m_nFpcPatchCounter - m_state.m_pSharedState->m_nThisStatePatchCounter );
        }
    }
#ifdef DEBUG_FPC_PATCHER
    // The prepatch above is disabled, so no state sync has been issued in this call yet;
    // the former "&& !bPrePatch" term referred to a variable that no longer exists.
    if( m_bTestAlwaysStateSync )
    {
        FpcPatch2( &g_nullFpHeader, sizeof( g_nullFpHeader ), NULL, NULL );
    }
    V_memcpy( m_pSyncState + nStartRegister, pConstantData, nVector4fCount * sizeof( fltx4 ) );
    if( m_bSync )
    {
        fltx4 reg[job_fpcpatch::MAX_VIRTUAL_CONST_COUNT];
        m_state.GetSyncState( reg );
        Assert( !V_memcmp( m_pSyncState, reg, sizeof( fltx4 ) * job_fpcpatch::MAX_VIRTUAL_CONST_COUNT ) );
    }
#endif
}
//volatile int g_nDebugStage = 0;
//
// Match the semantics of cgGLBindProgram()
// There are 2 formats of fragment shaders, see SDK docs "2. 2 Cg Compiler Options" and
// in Cg Compiler User's Guide:
// "7. 2 NV Binary Shader Format (VPO and FPO)"
// "7. 4 Cgb File Format Specification"
//
void CFragmentProgramConstantPatcher_SPU::BindProgram( const struct IDirect3DPixelShader9 * psh )
{
MICRO_PROFILE( g_mpBindProgram );
const job_fpcpatch2::FpHeader_t * prog = psh->m_data.m_eaFp;
uint32 nFragmentProgramOffset = uintp( m_pPutFragmentProgram ) + m_nIoOffsetDelta;
g_spuGcmShared.m_fpcpRing.UnlockRsxMemoryForSpu();
m_pPutFragmentProgram = ( uint32* )g_spuGcmShared.m_fpcpRing.LockRsxMemoryForSpu( &g_spuGcmShared.m_fpcpJobChain, prog->m_nUcodeSize );
nFragmentProgramOffset = uintp( m_pPutFragmentProgram ) - uintp( g_ps3gcmGlobalState.m_pLocalBaseAddress );
if( !IsCert() && nFragmentProgramOffset >= g_ps3gcmGlobalState.m_nLocalSize )
{
Error( "Fragment program Ucode buffer offset 0x%X is at unexpected address not in local memory\n", nFragmentProgramOffset );
}
if ( !IsCert() && ( m_pPutFragmentProgram < m_pBuffer || m_pPutFragmentProgram >= m_pBufferEnd ) )
{
Error( "Fragment Program UCode buffer overflow.\n" );
}
#ifdef DEBUG_FPC_PATCHER
if( m_bSync )
{
fltx4 reg[job_fpcpatch::MAX_VIRTUAL_CONST_COUNT];
m_state.GetSyncState( reg );
Assert( !V_memcmp( m_pSyncState, reg, sizeof( fltx4 ) * job_fpcpatch::MAX_VIRTUAL_CONST_COUNT ) );
}
#endif
uint nTexControls = prog->m_nTexControls;
// set jump to self
GCM_CTX_RESERVE( 7 + 2 * nTexControls );
uint32 * pJts = NULL;
FpcPatch2( prog, psh->m_data.m_nFpDmaSize, m_pPutFragmentProgram, pJts );
CELL_GCM_METHOD_SET_SHADER_CONTROL( GCM_CTX->current, prog->m_nShaderControl0 ); // +2
CELL_GCM_METHOD_SET_SHADER_PROGRAM( GCM_CTX->current, m_nBufferLocation + 1, ( nFragmentProgramOffset & 0x1fffffff ) ); // +2
CELL_GCM_METHOD_SET_VERTEX_ATTRIB_OUTPUT_MASK( GCM_CTX->current, psh->m_data.m_attributeInputMask | 0x20 ); // +2
V_memcpy( GCM_CTX->current, prog->GetTexControls(), nTexControls * sizeof( uint32 ) * 2 );
GCM_CTX->current += 2 * nTexControls;
#ifdef DEBUG_FPC_PATCHER
if( m_bSync )
{
g_ps3gcmGlobalState.CmdBufferFlush( CPs3gcmGlobalState::kFlushForcefully );
while ( *( volatile uint32* )pJts )
{
sys_timer_usleep( 50 );// wait for nop
}
#ifdef DEBUG_FPC_PATCHER
{
fltx4 reg[job_fpcpatch::MAX_VIRTUAL_CONST_COUNT];
m_state.GetSyncState( reg );
Assert( !V_memcmp( m_pSyncState, reg, sizeof( fltx4 ) * job_fpcpatch::MAX_VIRTUAL_CONST_COUNT ) );
}
ValidatePatchedProgram( psh->m_pCgProg, m_pPutFragmentProgram );
uint32 nFragmentProgramOffsetCheck;
GCM_MUST_SUCCEED( cellGcmAddressToOffset, m_pPutFragmentProgram, &nFragmentProgramOffsetCheck );
Assert( nFragmentProgramOffsetCheck == nFragmentProgramOffset );
#endif
g_ps3gcmGlobalState.CmdBufferFinish();
}
#endif
m_nFpcPatchCounter++;
}
// Extra bits OR-ed into the job parameter word that carries the patched-program address.
uint g_nFpcPatch2JobExtraFlags = 0; // set this to 2 and SPU will break
// Number of FpcPatch2() calls that reached the end of the VJOBS_ON_SPURS path.
static int s_nFpcPatch2Calls = 0;
// Builds and pushes one job_fpcpatch2 SPURS job.
//
//   prog            - fragment program header + payload to DMA in (g_nullFpHeader when there is nothing to patch)
//   nFpDmaSize      - number of bytes to DMA starting at prog
//   pPatchedProgram - destination of the patched microcode; NULL forces a synchronous (state-put) job
//   pJts            - word handed to the job alongside the target; may be NULL
//
// The job's input DMA list is: the program, the shared patch state, then the not-yet-synced
// part of the constant journal (one or two spans, depending on whether it wraps the ring).
// Scalar parameters travel in dmaConstructor[7..9], which is why the DMA list proper must
// not exceed 7 entries.
// Without VJOBS_ON_SPURS the body is empty apart from the profiler scope.
void CFragmentProgramConstantPatcher_SPU::FpcPatch2( const job_fpcpatch2::FpHeader_t * prog, uint nFpDmaSize, void *pPatchedProgram, uint32 * pJts )
{
MICRO_PROFILE( g_mpFpcPatch2 );
#ifdef VJOBS_ON_SPURS
VjobChain3 &jobChain = g_spuGcm.m_jobSink;
// NOTE(review): nUCodeSize is not used anywhere below
uint32 nUCodeSize = prog->m_nUcodeSize;
CellSpursJob128 * pJob = g_spuGcm.m_jobPool128.Alloc( *g_spuGcm.m_pRoot->m_pFpcPatch2 );
Assert( pJob->header.sizeDmaList == 0 && pJob->header.sizeInOrInOut == 0 ); // the default MUST always be 1
pJob->header.useInOutBuffer = 1;
CDmaListConstructor dmaConstructor( pJob->workArea.dmaList );
dmaConstructor.AddInputDma( nFpDmaSize, prog );
dmaConstructor.AddInputDma( sizeof( *m_state.m_pSharedState ), ( void* )m_state.m_pSharedState );
// the g_spuGcmShared.m_nFpcpStartRangesAfterLastSync runs ahead of m_state.m_pSharedState->m_nStartRanges , because it's a PREDICTED
// start of range. It'll be absolutely in-sync with m_state.m_pSharedState->m_nStartRanges if we run SPUs synchronously
#ifdef DBGFLAG_ASSERT
uint nSharedStateStartRanges = m_state.m_pSharedState->m_nStartRanges;
#endif
// NOTE: if the asserts below fire, it may be due to invalid value in nSharedStateStartRanges because SPU DMAs stuff right down to m_state.m_pSharedState and it's changing while this code executes
Assert( uint( m_state.m_nEndOfJournalIdx - nSharedStateStartRanges ) <= m_state.m_pSharedState->m_nBufferMask + 1 );
Assert( uint( g_spuGcmShared.m_nFpcpStartRangesAfterLastSync - nSharedStateStartRanges ) <= uint( m_state.m_nEndOfJournalIdx - nSharedStateStartRanges ) );
uint nStartOfJournal = /*nSharedStateStartRanges*/g_spuGcmShared.m_nFpcpStartRangesAfterLastSync, nBufferMask = m_state.m_pSharedState->m_nBufferMask;
// we have 4 DMA elements ( 2..6 ) to fit the constant buffer; the 1st element may have to be as small as 16 bytes.
// this leaves the max constant buffer size 4 * 16kb + 16 bytes
const uint numRangeQwords = ( m_state.m_nEndOfJournalIdx - nStartOfJournal );
Assert( numRangeQwords <= nBufferMask + 1 );
if ( numRangeQwords != 0 )
{
// first journal index past nStartOfJournal that lands on the start of the ring
uint nEndOfSpan0 = ( nStartOfJournal + nBufferMask + 1 ) & ~nBufferMask;
if ( ( signed int )( nEndOfSpan0 - m_state.m_nEndOfJournalIdx ) >= 0 )
{
// the pending journal does not cross the end of the ring: a single span
//numRangeQwords = ( m_state.m_nEndOfJournalIdx - nStartOfJournal );
dmaConstructor.AddInputDmaLarge( ( numRangeQwords ) * sizeof( fltx4 ), m_state.m_pSharedState->GetBufferStart() + ( nStartOfJournal & nBufferMask ) );
}
else
{
// the pending journal wraps: tail of the ring first, then the part at its beginning
//numRangeQwords = nFirstRange + nSecondRange ;
dmaConstructor.AddInputDmaLarge( ( nEndOfSpan0 - nStartOfJournal ) * sizeof( fltx4 ), m_state.m_pSharedState->GetBufferStart() + ( nStartOfJournal & nBufferMask ) );
dmaConstructor.AddInputDmaLarge( ( m_state.m_nEndOfJournalIdx - nEndOfSpan0 ) * sizeof( fltx4 ), m_state.m_pSharedState->GetBufferStart() );
}
}
else
{
dmaConstructor.AddSizeInOrInOut( 16 ); // we need at least 16 bytes in the ranges area for temporary storage
}
dmaConstructor.FinishIoBuffer( &pJob->header );
if( pJob->header.sizeDmaList > 7 * sizeof( uint64 ) )
{
Error( "FpcPatch2: DMA list size out of range (%d). job_fpcpatch2 parameters won't fit. numRangeQwords = %d\n", pJob->header.sizeDmaList, numRangeQwords );
}
// IMPORTANT: make it always synchronous , in case we don't have the target to patch. The only reason for this job to exist is to make it synchronous
// Also, if the range is large, still make it synchronous, to avoid subsequent jobs doing a lot of computations in vain
uint nAsync = !pPatchedProgram || numRangeQwords >= 1024 ? 0 : ( ( m_nFpcPatchCounter ) & m_nFpcPatchSyncMask ) ;
// scalar job parameters, stored after the DMA list entries
dmaConstructor[7][0] = m_nFpcPatchCounterOfLastSyncJob;
dmaConstructor[7][1] = m_nFpcPatchCounter;
dmaConstructor[8][0] = ( uint32 ) pPatchedProgram;
dmaConstructor[8][1] = uintp( pJts ); // the SPU->RSX dma element; may be NULL
dmaConstructor[9][0] = m_state.m_nEndOfJournalIdx;
dmaConstructor[9][1] = ( uint32 ) nStartOfJournal;
if( !IsCert() )
{
pJob->header.jobType |= CELL_SPURS_JOB_TYPE_MEMORY_CHECK;
}
dmaConstructor[8][0] |= g_nFpcPatch2JobExtraFlags;
if ( !nAsync )
{
// synchronous job: ask it to put the state back, make successors wait for it, and
// advance the predicted start of the journal to the current end
dmaConstructor[8][0] |= job_fpcpatch::FLAG_PUT_STATE;
m_nFpcPatchCounterOfLastSyncJob = m_nFpcPatchCounter;
pJob->header.jobType |= CELL_SPURS_JOB_TYPE_STALL_SUCCESSOR;
g_spuGcmShared.m_nFpcpStartRangesAfterLastSync = m_state.m_nEndOfJournalIdx;
}
#ifdef DBGFLAG_ASSERT
// NOTE(review): a failed job check is only counted in s_nJobErrors, never reported
int nError = cellSpursCheckJob( ( const CellSpursJob256* )pJob, sizeof( *pJob ), 256 );
static int s_nJobErrors = 0;
if( CELL_OK != nError )
{
++s_nJobErrors;
}
#endif
if ( !nAsync )
{
jobChain.PushSyncJobSync( CELL_SPURS_JOB_COMMAND_JOB( pJob ) );
}
else
{
jobChain.Push( CELL_SPURS_JOB_COMMAND_JOB( pJob ) );
}
#ifdef DEBUG_FPC_PATCHER
if( m_bSync )
{
// debug sync mode: busy-wait until the JTS word (if any) and the job's eaBinary read as zero
if( pJts )
{
volatile uint32 * pJts2 = pJts;
while( *pJts2 )
continue;
}
volatile uint64_t * pEaJob = &pJob->header.eaBinary;
while( * pEaJob )
continue;
}
#endif
s_nFpcPatch2Calls++;
#endif
}
#ifdef DEBUG_FPC_PATCHER
extern void PatchUcodeConstSwap( uint32 * pDestination, const uint32 * pSource, int nLength );
extern uint fspatchGetLength( CGtype nType );
// Histogram of embedded-constant lengths seen by ValidatePatchedProgram(), indexed by length.
uint32 g_nConstLengthCounter[5] = { 0, 0, 0, 0, 0 };
// Debug-only cross-check of an SPU-patched fragment program.
//
// Copies the program's original microcode to the stack, patches every embedded constant
// on the PPU from the m_pSyncState register mirror, and asserts that the result is
// byte-identical to pPatchedUcode.
//
//   prog          - Cg binary program (expected profile PROFILE_SCE_FP_RSX)
//   pPatchedUcode - microcode as patched for RSX; prog->ucodeSize bytes are compared
void CFragmentProgramConstantPatcher_SPU::ValidatePatchedProgram( const CgBinaryProgram * prog, void * pPatchedUcode )
{
Assert( prog->profile == PROFILE_SCE_FP_RSX && prog->binaryFormatRevision == CG_BINARY_FORMAT_REVISION );
uint32 nUCodeSize = prog->ucodeSize;
// reference copy of the unpatched microcode, patched below
void * pUcode = stackalloc( nUCodeSize );
void * pSourceUcode = ( ( uint8* ) prog ) + prog->ucode;
V_memcpy( pUcode, ( ( uint8* ) prog ) + prog->ucode, nUCodeSize );
// offsets stored in the binary (parameterArray, name, defaultValue, embeddedConst) are relative to prog
CgBinaryParameter * pParameters = ( CgBinaryParameter * )( uintp( prog ) + prog->parameterArray ) ;
uint32 * pPatchDestination = NULL;
Assert( cellGcmCgGetCountParameter( ( CGprogram ) prog ) == prog->parameterCount );
for ( int nPar = 0; nPar < prog->parameterCount; ++nPar )
{
CgBinaryParameter * pPar = pParameters + nPar;
Assert( pPar == ( CgBinaryParameter * ) cellGcmCgGetIndexParameter( ( CGprogram ) prog, nPar ) );
#ifdef DBGFLAG_ASSERT
const char * pLeafName = ( const char * )( uintp( prog ) + pPar->name );
( void )pLeafName;
uint32 * pDefault = pPar->defaultValue ? ( uint32* )( uintp( prog ) + pPar->defaultValue ) : NULL ;
#endif
if ( pPar->embeddedConst )
{
Assert( pPar->res == CG_C && pPar->var == CG_UNIFORM ); // this MUST be a uniform constant.. at least I think that's the only kind we need to patch
const CgBinaryEmbeddedConstant * pEmbedded = ( const CgBinaryEmbeddedConstant* )( uintp( prog ) + pPar->embeddedConst );
int nLength = fspatchGetLength( pPar->type );
// NOTE(review): nLength indexes a 5-element array without a range check - presumably
// fspatchGetLength() only returns 0..4; confirm
g_nConstLengthCounter[nLength] ++;
for ( uint nEm = 0; nEm < pEmbedded->ucodeCount; ++ nEm )
{
uint ucodeOffset = pEmbedded->ucodeOffset[nEm]; // is this the offset from prog structure start?
Assert( ucodeOffset < nUCodeSize - 4 );
#ifdef DBGFLAG_ASSERT
// cross-check the hand-parsed binary against the libgcm Cg accessors
Assert( cellGcmCgGetEmbeddedConstantOffset( ( CGprogram ) prog, ( CGparameter ) pPar, nEm ) == ucodeOffset );
const float * pDefaultCheck = cellGcmCgGetParameterValues( ( CGprogram ) prog, ( CGparameter ) pPar );
Assert( pDefault == ( uint32* ) pDefaultCheck );
uint32 * pUcodeEmConst = ( uint32* )( uintp( pSourceUcode ) + ucodeOffset );
Assert( !pDefault || !V_memcmp( pDefault, pUcodeEmConst, nLength * 4 ) );
#endif
// patch the reference copy from the PPU register mirror and compare this constant
// against the same location in the externally patched microcode
pPatchDestination = ( uint32* )( uintp( pUcode ) + ucodeOffset );
uint32 * pPatchedCheck = ( uint32* )( uintp( pPatchedUcode ) + ucodeOffset );
PatchUcodeConstSwap( pPatchDestination, ( uint32* ) & ( m_pSyncState[pPar->resIndex] ), nLength );
Assert( !V_memcmp( pPatchDestination, pPatchedCheck, nLength * 4 ) );
}
}
}
// finally the whole program must match, not just the patched constants
Assert( !V_memcmp( pPatchedUcode, pUcode, nUCodeSize ) );
}
#endif
// Attaches this PPU-side tracker to a freshly allocated shared state block and resets
// both sides to an empty journal.
//
//   pSharedState  - shared state block to initialize and remember
//   nBufferQwords - journal capacity in qwords; the mask is derived as nBufferQwords - 1
void FpcPatchState::Init( job_fpcpatch::FpcPatchState_t * pSharedState, uint32 nBufferQwords )
{
    //m_nRangesAdded = 0; (was inside an otherwise empty _DEBUG section)

    // ring mask: set locally, then mirrored into the shared block
    m_nBufferMask = nBufferQwords - 1;
    pSharedState->m_nBufferMask = m_nBufferMask;

    // empty journal: start == end. Non-cert builds begin 128 qwords before the end of
    // the ring instead of at index 0.
    if( IsCert() )
    {
        m_nEndOfJournalIdx = 0;
    }
    else
    {
        m_nEndOfJournalIdx = nBufferQwords - 128;
    }
    pSharedState->m_nStartRanges = m_nEndOfJournalIdx;

    // remember the block and store its own address inside it
    m_pSharedState = pSharedState;
    pSharedState->m_eaThis = m_pSharedState;

    pSharedState->m_nThisStatePatchCounter = 0;
    pSharedState->m_nDebuggerBreak = 0;
}
// Reconstructs the current value of every virtual constant register on the PPU.
//
// Starts from the register file stored in the shared state and replays every journal
// entry between the shared state's m_nStartRanges and m_nEndOfJournalIdx on top of it.
//
//   pRegisters - receives job_fpcpatch::MAX_VIRTUAL_CONST_COUNT registers
//
// Journal indices are free-running 32-bit counters that are only masked when the ring is
// addressed, so the loop bound is tested with a wrap-safe signed difference (the same
// technique FpcPatch2() and AddRange() use) rather than a plain '<', which would end the
// replay early once the end index has wrapped past zero and the start has not.
void FpcPatchState::GetSyncState( fltx4 * pRegisters )
{
    V_memcpy( pRegisters, m_pSharedState->m_reg, job_fpcpatch:: MAX_VIRTUAL_CONST_COUNT * sizeof( fltx4 ) );
    for( uint nJournalIdx = m_pSharedState->m_nStartRanges; int( m_nEndOfJournalIdx - nJournalIdx ) > 0 ; )
    {
        // each entry is one header qword ( start register, count ) followed by 'count' constants
        job_fpcpatch:: ConstRangeHeader_t & range = ((job_fpcpatch::ConstRangeHeader_t*)m_pSharedState->GetBufferStart())[ nJournalIdx & m_pSharedState->m_nBufferMask ];
        nJournalIdx++;
        for( uint nConstIdx = 0 ; nConstIdx < range.m_u32.m_nCount; ++nConstIdx, ++nJournalIdx )
        {
            pRegisters[ range.m_u32.m_nStart + nConstIdx ] = m_pSharedState->GetBufferStart()[nJournalIdx & m_pSharedState->m_nBufferMask ];
        }
    }
}
/*
void FpcPatchState::Reset()
{
m_nEndOfJournalIdx = 0;
m_pSharedState->m_nStartRanges = 0;
}
*/
#ifdef _DEBUG
// Debugger aids: set s_nDebugSetConst to a register index to break in AddRange() whenever
// that register is written. s_nDebugRangeAdd is only referenced by commented-out code.
static int s_nDebugRangeAdd = -1, s_nDebugSetConst = -1;
#endif
// Appends one range of constants ( a header qword plus nCount constant qwords ) to the
// journal, waiting for the consumer to free space if the ring is too full.
//
//   nStart - first virtual constant register of the range
//   nCount - number of registers ( qwords ) in the range
//   pData  - source constants, 4 floats per register
//
// Returns the number of wait iterations that were needed; 0 means there was room right away.
uint FpcPatchState::AddRange( uint32 nStart, uint32 nCount, const float * pData )
{
#ifndef _CERT
    // Compare the count against the remaining room instead of computing nStart + nCount,
    // which could wrap around 32 bits and pass the check.
    if( nStart > job_fpcpatch::MAX_VIRTUAL_CONST_COUNT || nCount > uint32( job_fpcpatch::MAX_VIRTUAL_CONST_COUNT ) - nStart )
    {
        // the arguments are ( start, count ); the message used to present the count as an end bound
        Error( "AddRange(%u,%u) out of range <%d\n", nStart, nCount, int( job_fpcpatch::MAX_VIRTUAL_CONST_COUNT ) );
    }
#endif
#ifdef _DEBUG
    //Assert( s_nDebugRangeAdd != m_nRangesAdded );
    if( int( s_nDebugSetConst - nStart ) >= 0 && int( s_nDebugSetConst - nStart ) < int( nCount ) )
    {
        // loaded only so that the value is visible in the debugger at the break
        fltx4 flDebugRegister = LoadUnalignedSIMD( pData + 4 * int( s_nDebugSetConst - nStart ) );
        DebuggerBreak();
    }
    //++m_nRangesAdded;
#endif
    // spin-wait, then V_memcpy range
    COMPILE_TIME_ASSERT( sizeof( job_fpcpatch::ConstRangeHeader_t ) == 16 );
    const uint nSpins = 0x1FF;
    Assert( !( nSpins & ( nSpins + 1 ) ) );
    //
    // We need space for nCount + 1 QWords (1 Qword for the ConstRangeHeader_t)
    // And we need m_nEndOfJournalIdx != m_nStartRanges to distinguish between
    // the all-empty and all-full buffers
    //
    uint nAttempts = 0;
    for ( ; ; ++nAttempts )
    {
        // re-read every iteration: the consumer advances m_nStartRanges as it retires ranges
        uint32 nStartRanges = m_pSharedState->m_nStartRanges;
        Assert( uint32( m_nEndOfJournalIdx - nStartRanges ) <= m_nBufferMask + 1 );
        // compute the new end - start; is it running further than buffer size away?
        if ( ( m_nEndOfJournalIdx + nCount - ( nStartRanges + m_nBufferMask + 1 ) ) & 0x80000000 )
        { // no, the comparison is negative, therefore it's safe to fill it in
            break;
        }
        // if ( ( nAttempts & nSpins ) == nSpins )
        {
            // the caller prints warning about this stall.
            sys_timer_usleep( 60 ); // TODO: proper spinwait; proper OS synchronization
            if( nAttempts == ( 1000000 / 60 ) )
            {
                // waiting for a second already ...
                // dump the patcher / ring state once ( at this exact attempt count ) to help diagnose the hang
                Warning(
                    "***************************************************************************************************************\n"
                    "* SPU hang in FpcPatchState::AddRange(). Please send this log (including a couple of screens above) to Sergiy *\n"
                    );
                Msg( "AddRange(%d,%d,%p), ", nStart, nCount, pData );
                Msg( "SharedState @%p {start=0x%X&0x%X,patch=%X,job=%X},", m_pSharedState, m_pSharedState->m_nStartRanges, m_pSharedState->m_nBufferMask, m_pSharedState->m_nThisStatePatchCounter, m_pSharedState->m_eaThisStateJobDescriptor );
                Msg( "FpcpState @%p {end=0x%X},", this, this->m_nEndOfJournalIdx );
                Msg( "SpuGcmShared trace {0x%X,0x%X,0x%X}\n", g_spuGcmShared.m_nFpcPatchCounterOfLastSyncJob, g_spuGcmShared.m_nFpcPatchCounter, g_spuGcmShared.m_nFpcpStartRangesAfterLastSync );
                Msg( "RSX put=%X, get=%X sysring{put=%X,end=%X}\n", g_spuGcmShared.m_eaGcmControlRegister->put, g_spuGcmShared.m_eaGcmControlRegister->get,
                    g_spuGcmShared.m_sysring.m_nPut, g_spuGcmShared.m_sysring.m_nEnd );
                Msg( "last JTS ret guard patched @%X, ", *cellGcmGetLabelAddress( GCM_LABEL_DEBUG_FPCP_RING ) );
                Msg( "ringRsx[%d]:", g_spuGcmShared.m_fpcpRing.m_ringRsx.Count() );
                for( int i = 0; i < g_spuGcmShared.m_fpcpRing.m_ringRsx.Count(); ++i )
                {
                    RsxSpuDoubleRing::Segment_t & segment = g_spuGcmShared.m_fpcpRing.m_ringRsx[i];
                    Msg(" {%X,%p,%s}", segment.m_eaBase, segment.m_pSpuJts, *(segment.m_pSpuJts) == CELL_SPURS_JOB_COMMAND_LWSYNC ? "LWSYNC" : *(segment.m_pSpuJts) == CELL_SPURS_JOB_COMMAND_JTS ? "JTS" : "ERROR" );
                }
                Msg( "\nringSpu[%d]:", g_spuGcmShared.m_fpcpRing.m_ringSpu.Count() );
                for( int i = 0; i < g_spuGcmShared.m_fpcpRing.m_ringSpu.Count(); ++i )
                {
                    RsxSpuDoubleRing::Segment_t & segment = g_spuGcmShared.m_fpcpRing.m_ringSpu[i];
                    Msg(" {%X,%p,%s}", segment.m_eaBase, segment.m_pSpuJts, *(segment.m_pSpuJts) == CELL_SPURS_JOB_COMMAND_LWSYNC ? "LWSYNC" : *(segment.m_pSpuJts) == CELL_SPURS_JOB_COMMAND_JTS ? "JTS" : "ERROR" );
                }
                Msg( "***************************************************************************************************************\n" );
            }
        }
    }
    // we have enough free buffer to insert stuff
    job_fpcpatch::ConstRangeHeader_t *hdr = (job_fpcpatch::ConstRangeHeader_t *)AddInternalPtr();
    hdr->m_u32.m_nStart = nStart;
    hdr->m_u32.m_nCount = nCount;
    // add constants block
    AddInternalBlock( pData, nCount );
    return nAttempts;
}
#endif