//========= Copyright © Valve Corporation, All rights reserved. ====//
|
|
#include "tier0/platform.h"
|
|
#ifdef _PS3
|
|
#include "dxabstract.h"
|
|
|
|
#include <sys/memory.h>
|
|
#include "ps3/spugcm_shared.h"
|
|
#include "fpcpatcher_spu.h"
|
|
#include "cg/cg.h"
|
|
#include "cg/cgBinary.h"
|
|
#include "vjobs_interface.h"
|
|
#include "tier0/hardware_clock_fast.h"
|
|
#include "vjobs/fpcpatch_shared.h"
|
|
#include "vjobs/root.h"
|
|
#include "ps3/vjobutils.h"
|
|
#include "tier0/microprofiler.h"
|
|
#include "ps3/ps3_gcm_config.h"
|
|
#include "spugcm.h"
|
|
|
|
// Cg profile identifiers for RSX programs. PROFILE_SCE_FP_RSX is compared
// against CgBinaryProgram::profile when validating a patched fragment program.
enum
{
	PROFILE_SCE_VP_RSX = 7003,	// presumably the vertex program profile
	PROFILE_SCE_FP_RSX = 7004	// fragment program profile
};
|
|
|
|
// Calls FUNC( args... ), stores its int result and reports through Error()
// (with the stringized function name, file and line) when the result is not CELL_OK.
#define GCM_MUST_SUCCEED( FUNC, ... ) \
	do \
	{ \
		int nError = FUNC( __VA_ARGS__ ); \
		if( nError != CELL_OK ) \
		{ \
			Error( "Error 0x%X in " #FUNC ", %s:%d\n", nError, __FILE__, __LINE__ ); \
		} \
	} while( 0 )
|
|
DEFINE_LOGGING_CHANNEL_NO_TAGS( LOG_VJOBS, "VJOBS" );
|
|
|
|
CFragmentProgramConstantPatcher_SPU g_pixelShaderPatcher; // Patches pixel shader constants
|
|
|
|
|
|
CMicroProfiler g_mpBindProgram, g_mpFpcPatch2;
|
|
// debug only
|
|
// Leaves the patcher without a ucode buffer: all buffer pointers are NULL and
// the address->IO-offset delta is zero. InitLocal() supplies the real buffer.
CFragmentProgramConstantPatcher_SPU::CFragmentProgramConstantPatcher_SPU()
{
	// m_pBuffer + m_nIoOffsetDelta == IO offset usable by RSX
	m_nIoOffsetDelta = 0;

	m_pBufferEnd = NULL;
	m_pBuffer = m_pBufferEnd;
	m_pPutFragmentProgram = NULL;

#ifdef DEBUG_FPC_PATCHER
	// "-fpcpsync" turns on the synchronous self-checking paths
	const bool bSyncRequested = ( CommandLine()->FindParm( "-fpcpsync" ) != 0 );
	m_bSync = bSyncRequested;
#endif
}
|
|
|
|
|
|
void CFragmentProgramConstantPatcher_SPU::InitLocal( void *pBuffer, uint nSize )
|
|
{
|
|
m_nFpcPatchCounter = 0;
|
|
m_nFpcPatchCounterOfLastSyncJob = 0;
|
|
|
|
//cellGcmSetDebugOutputLevel( CELL_GCM_DEBUG_LEVEL2 );
|
|
const uint nOverfetchGuard = 1024; // RSX front end prefetches up to 4k, but 1k is ( should be ) enough to avoid overfetch crashes
|
|
const uint nStateBufferQwords = 1 << 12; // make space for at least 8 full batches of constants...
|
|
uint nPatchStateBufferSize = ( sizeof( job_fpcpatch::FpcPatchState_t ) + sizeof( fltx4 ) * nStateBufferQwords );
|
|
uint32 nBufferIoOffset;
|
|
|
|
m_bFpcPatchOnPpu = ( 0 != CommandLine()->FindParm( "-fpcpatchonppu" ) );
|
|
#ifdef DEBUG_FPC_PATCHER
|
|
m_bTestAlwaysStateSync = ( 0 != CommandLine()->FindParm( "-fpcpstatesync" ) );
|
|
#endif
|
|
m_bEnableSPU = true;
|
|
m_nFpcPatchSyncMask = 0;
|
|
|
|
// use this passed buffer (probably from local memory) for the patched stuff
|
|
m_pBuffer = ( uint32* ) pBuffer;
|
|
m_pBufferEnd = ( uint32* ) ( uintp( pBuffer ) + nSize );
|
|
m_nBufferLocation = CELL_GCM_LOCATION_LOCAL;
|
|
m_isBufferPassedIn = true;
|
|
m_state.Init( ( job_fpcpatch::FpcPatchState_t* )MemAlloc_AllocAligned( nPatchStateBufferSize, 128 ), nStateBufferQwords );
|
|
GCM_MUST_SUCCEED( cellGcmAddressToOffset, m_pBuffer, &nBufferIoOffset );
|
|
|
|
#ifdef DBGFLAG_ASSERT
|
|
uint32 nBufferIoOffsetCheck;
|
|
GCM_MUST_SUCCEED( cellGcmAddressToOffset, m_pBuffer, &nBufferIoOffsetCheck );
|
|
Assert( nBufferIoOffsetCheck == nBufferIoOffset );
|
|
|
|
Assert( !( nBufferIoOffsetCheck & 0x7F ) );
|
|
|
|
for( uint nOffset = 0; nOffset < nSize; nOffset += 128 )
|
|
{
|
|
GCM_MUST_SUCCEED( cellGcmAddressToOffset, ((uint8*)m_pBuffer) + nOffset, &nBufferIoOffsetCheck );
|
|
Assert( nBufferIoOffsetCheck == nBufferIoOffset + nOffset );
|
|
}
|
|
#endif
|
|
|
|
m_nIoOffsetDelta = nBufferIoOffset - uintp( m_pBuffer );
|
|
|
|
#ifdef DEBUG_FPC_PATCHER
|
|
m_pSyncState = ( fltx4* ) MemAlloc_AllocAligned( sizeof( fltx4 ) * job_fpcpatch::MAX_VIRTUAL_CONST_COUNT, 16 );
|
|
V_memset( m_pSyncState, 0xCD, sizeof( fltx4 ) * job_fpcpatch::MAX_VIRTUAL_CONST_COUNT );
|
|
V_memset( m_state.m_pSharedState->m_reg, 0xCD, sizeof( fltx4 ) * job_fpcpatch::MAX_VIRTUAL_CONST_COUNT );
|
|
#endif
|
|
ResetPut();
|
|
//cellGcmSetDebugOutputLevel( CELL_GCM_DEBUG_LEVEL0 );
|
|
}
|
|
|
|
|
|
// Intentionally empty: this function releases nothing; the buffers are freed in the destructor.
void CFragmentProgramConstantPatcher_SPU::Shutdown()
{
	// nothing to do
}
|
|
|
|
|
|
|
|
void CFragmentProgramConstantPatcher_SPU::ResetPut()
|
|
{
|
|
m_pPutFragmentProgram = m_pBufferEnd; // reserved word for the count of constants to set
|
|
}
|
|
|
|
// Releases what the patcher owns. When the ucode buffer was handed in through
// InitLocal() only the shared patch state (allocated there with
// MemAlloc_AllocAligned) is freed; otherwise the ucode buffer itself is
// returned with sys_memory_free().
CFragmentProgramConstantPatcher_SPU::~CFragmentProgramConstantPatcher_SPU()
{
	if( !m_isBufferPassedIn )
	{
		sys_memory_free( ( sys_addr_t )m_pBuffer );
	}
	else
	{
		MemAlloc_FreeAligned( m_state.m_pSharedState );
	}

#ifdef DEBUG_FPC_PATCHER
	// PPU-side mirror of the constant registers
	MemAlloc_FreeAligned( m_pSyncState );
#endif
}
|
|
|
|
|
|
void CFragmentProgramConstantPatcher_SPU::BeginScene()
|
|
{
|
|
m_nFpcPatchCounterAtBeginScene = m_nFpcPatchCounter;
|
|
// we shouldn't have in-flight SPU jobs by now.. should we?
|
|
Assert( uint( g_spuGcmShared.m_nFpcpStartRangesAfterLastSync - m_state.m_pSharedState->m_nStartRanges ) <= m_state.m_pSharedState->m_nBufferMask + 1 );
|
|
}
|
|
|
|
|
|
// With the micro profiler compiled in, prints and resets the BindProgram /
// FpcPatch2 timers, but only when profiling is switched on and at least one
// patch was issued since BeginScene().
void CFragmentProgramConstantPatcher_SPU::EndScene()
{
#if ENABLE_MICRO_PROFILER > 0
	extern bool g_bDxMicroProfile;

	const uint nPatchesThisScene = m_nFpcPatchCounter - m_nFpcPatchCounterAtBeginScene;
	if( !g_bDxMicroProfile || nPatchesThisScene == 0 )
	{
		return;
	}

	g_mpBindProgram.PrintAndReset( "[BindProgram] " );
	g_mpFpcPatch2.PrintAndReset( "[FpcPatch2] " );
#endif
}
|
|
|
|
// All-zero fragment program header, passed to FpcPatch2() when there is no program to patch
job_fpcpatch2::FpHeader_t g_nullFpHeader = { 0, 0, 0, 0 };
|
|
|
|
// semantics should match cgGLSetFragmentRegisterBlock()
|
|
void CFragmentProgramConstantPatcher_SPU::SetFragmentRegisterBlock( uint nStartRegister, uint nVector4fCount, const float * pConstantData )
|
|
{
|
|
#ifndef _CERT
|
|
if ( nStartRegister >= job_fpcpatch::MAX_VIRTUAL_CONST_COUNT || nStartRegister + nVector4fCount > job_fpcpatch::MAX_VIRTUAL_CONST_COUNT )
|
|
Error( "Invalid Fragment Register Block Range %u..%u\n", nStartRegister, nStartRegister + nVector4fCount );
|
|
#endif
|
|
|
|
#ifdef DEBUG_FPC_PATCHER
|
|
if( m_bSync )
|
|
{
|
|
fltx4 reg[job_fpcpatch::MAX_VIRTUAL_CONST_COUNT];
|
|
m_state.GetSyncState( reg );
|
|
Assert( !V_memcmp( m_pSyncState, reg, sizeof( fltx4 ) * job_fpcpatch::MAX_VIRTUAL_CONST_COUNT ) );
|
|
}
|
|
uint nEnd = m_state.m_nEndOfJournalIdx;
|
|
#endif
|
|
|
|
// we have 4 DMA elements ( 2..6 ) to fit the constant buffer; the 1st element may have to be as small as 16 bytes.
|
|
// this leaves the max constant buffer size 4 * 16kb + 16 bytes
|
|
const uint nMaxUploadRangeBeforeStateSync = ( 32 * 1024 ) / sizeof( fltx4 );
|
|
uint numUploadRangeQwords = m_state.m_nEndOfJournalIdx - g_spuGcmShared.m_nFpcpStartRangesAfterLastSync;
|
|
///////////////////////////////////////////////////////////////////////////
|
|
//
|
|
// PREPATCH MUST BE DONE IN (CTXFLUSH OR) DRAW JOB FROM NOW ON!!! g_spuGcmShared.m_nFpcpStartRangesAfterLastSync IS SYNCHRONOUS AND CORRECT THERE
|
|
//
|
|
//////////////////////////////////////////////////////////////////////////
|
|
|
|
|
|
/*
|
|
bool bPrePatch = nVector4fCount + 1 + numUploadRangeQwords > nMaxUploadRangeBeforeStateSync;
|
|
if( bPrePatch )
|
|
{
|
|
// force state sync now
|
|
if( g_spuGcmShared.m_enableStallWarnings )
|
|
{
|
|
Warning( "PPU-SPU Wait for RSX. SetFragmentRegisterBlock: Forced to set state on PPU, %u vectors, %u qwords in history. This is slow fallback path.\n", nVector4fCount, numUploadRangeQwords );
|
|
}
|
|
FpcPatch2( &g_nullFpHeader, sizeof( g_nullFpHeader ), NULL, NULL );
|
|
}
|
|
|
|
*/
|
|
if( uint nAttempts = m_state.AddRange( nStartRegister, nVector4fCount, pConstantData ) )
|
|
{
|
|
if( g_spuGcmShared.m_enableStallWarnings )
|
|
{
|
|
Warning( "PPU-SPU Wait for RSX. SetFragmentRegisterBlock: Stall, %d spins. Waiting for more memory; %d qwords, %d jobs buffered up\n", nAttempts, m_state.m_nEndOfJournalIdx - m_state.m_pSharedState->m_nStartRanges, g_spuGcmShared.m_nFpcPatchCounter - m_state.m_pSharedState->m_nThisStatePatchCounter );
|
|
}
|
|
}
|
|
|
|
#ifdef DEBUG_FPC_PATCHER
|
|
if( m_bTestAlwaysStateSync && !bPrePatch )
|
|
{
|
|
FpcPatch2( &g_nullFpHeader, sizeof( g_nullFpHeader ), NULL, NULL );
|
|
}
|
|
|
|
V_memcpy( m_pSyncState + nStartRegister, pConstantData, nVector4fCount * sizeof( fltx4 ) );
|
|
if( m_bSync )
|
|
{
|
|
fltx4 reg[job_fpcpatch::MAX_VIRTUAL_CONST_COUNT];
|
|
m_state.GetSyncState( reg );
|
|
Assert( !V_memcmp( m_pSyncState, reg, sizeof( fltx4 ) * job_fpcpatch::MAX_VIRTUAL_CONST_COUNT ) );
|
|
}
|
|
#endif
|
|
}
|
|
|
|
//volatile int g_nDebugStage = 0;
|
|
|
|
//
|
|
// Match the semantics of cgGLBindProgram()
|
|
// There are 2 formats of fragment shaders, see SDK docs "2. 2 Cg Compiler Options" and
|
|
// in Cg Compiler User's Guide:
|
|
// "7. 2 NV Binary Shader Format (VPO and FPO)"
|
|
// "7. 4 Cgb File Format Specification"
|
|
//
|
|
|
|
void CFragmentProgramConstantPatcher_SPU::BindProgram( const struct IDirect3DPixelShader9 * psh )
|
|
{
|
|
MICRO_PROFILE( g_mpBindProgram );
|
|
|
|
const job_fpcpatch2::FpHeader_t * prog = psh->m_data.m_eaFp;
|
|
uint32 nFragmentProgramOffset = uintp( m_pPutFragmentProgram ) + m_nIoOffsetDelta;
|
|
|
|
g_spuGcmShared.m_fpcpRing.UnlockRsxMemoryForSpu();
|
|
m_pPutFragmentProgram = ( uint32* )g_spuGcmShared.m_fpcpRing.LockRsxMemoryForSpu( &g_spuGcmShared.m_fpcpJobChain, prog->m_nUcodeSize );
|
|
nFragmentProgramOffset = uintp( m_pPutFragmentProgram ) - uintp( g_ps3gcmGlobalState.m_pLocalBaseAddress );
|
|
if( !IsCert() && nFragmentProgramOffset >= g_ps3gcmGlobalState.m_nLocalSize )
|
|
{
|
|
Error( "Fragment program Ucode buffer offset 0x%X is at unexpected address not in local memory\n", nFragmentProgramOffset );
|
|
}
|
|
|
|
if ( !IsCert() && ( m_pPutFragmentProgram < m_pBuffer || m_pPutFragmentProgram >= m_pBufferEnd ) )
|
|
{
|
|
Error( "Fragment Program UCode buffer overflow.\n" );
|
|
}
|
|
|
|
#ifdef DEBUG_FPC_PATCHER
|
|
if( m_bSync )
|
|
{
|
|
fltx4 reg[job_fpcpatch::MAX_VIRTUAL_CONST_COUNT];
|
|
m_state.GetSyncState( reg );
|
|
Assert( !V_memcmp( m_pSyncState, reg, sizeof( fltx4 ) * job_fpcpatch::MAX_VIRTUAL_CONST_COUNT ) );
|
|
}
|
|
#endif
|
|
|
|
uint nTexControls = prog->m_nTexControls;
|
|
|
|
// set jump to self
|
|
GCM_CTX_RESERVE( 7 + 2 * nTexControls );
|
|
uint32 * pJts = NULL;
|
|
|
|
FpcPatch2( prog, psh->m_data.m_nFpDmaSize, m_pPutFragmentProgram, pJts );
|
|
|
|
CELL_GCM_METHOD_SET_SHADER_CONTROL( GCM_CTX->current, prog->m_nShaderControl0 ); // +2
|
|
CELL_GCM_METHOD_SET_SHADER_PROGRAM( GCM_CTX->current, m_nBufferLocation + 1, ( nFragmentProgramOffset & 0x1fffffff ) ); // +2
|
|
CELL_GCM_METHOD_SET_VERTEX_ATTRIB_OUTPUT_MASK( GCM_CTX->current, psh->m_data.m_attributeInputMask | 0x20 ); // +2
|
|
V_memcpy( GCM_CTX->current, prog->GetTexControls(), nTexControls * sizeof( uint32 ) * 2 );
|
|
GCM_CTX->current += 2 * nTexControls;
|
|
|
|
#ifdef DEBUG_FPC_PATCHER
|
|
if( m_bSync )
|
|
{
|
|
g_ps3gcmGlobalState.CmdBufferFlush( CPs3gcmGlobalState::kFlushForcefully );
|
|
while ( *( volatile uint32* )pJts )
|
|
{
|
|
sys_timer_usleep( 50 );// wait for nop
|
|
}
|
|
#ifdef DEBUG_FPC_PATCHER
|
|
{
|
|
fltx4 reg[job_fpcpatch::MAX_VIRTUAL_CONST_COUNT];
|
|
m_state.GetSyncState( reg );
|
|
Assert( !V_memcmp( m_pSyncState, reg, sizeof( fltx4 ) * job_fpcpatch::MAX_VIRTUAL_CONST_COUNT ) );
|
|
}
|
|
ValidatePatchedProgram( psh->m_pCgProg, m_pPutFragmentProgram );
|
|
uint32 nFragmentProgramOffsetCheck;
|
|
GCM_MUST_SUCCEED( cellGcmAddressToOffset, m_pPutFragmentProgram, &nFragmentProgramOffsetCheck );
|
|
Assert( nFragmentProgramOffsetCheck == nFragmentProgramOffset );
|
|
#endif
|
|
|
|
g_ps3gcmGlobalState.CmdBufferFinish();
|
|
}
|
|
#endif
|
|
m_nFpcPatchCounter++;
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
// Extra bits OR-ed into the flags word of every job built by FpcPatch2();
// set this to 2 and SPU will break
uint g_nFpcPatch2JobExtraFlags = 0;

// Number of jobs FpcPatch2() has submitted so far
static int s_nFpcPatch2Calls = 0;
|
|
|
|
// Builds one job_fpcpatch2 SPURS job descriptor and pushes it onto the job chain.
// The whole body is compiled only when VJOBS_ON_SPURS is defined.
//   prog            - fragment program header to DMA in ( &g_nullFpHeader when there is nothing to patch )
//   nFpDmaSize      - number of bytes to DMA starting at prog
//   pPatchedProgram - destination of the patched ucode; NULL makes the job synchronous ( state update only )
//   pJts            - the SPU->RSX dma element; may be NULL
void CFragmentProgramConstantPatcher_SPU::FpcPatch2( const job_fpcpatch2::FpHeader_t * prog, uint nFpDmaSize, void *pPatchedProgram, uint32 * pJts )
{
	MICRO_PROFILE( g_mpFpcPatch2 );

#ifdef VJOBS_ON_SPURS
	VjobChain3 &jobChain = g_spuGcm.m_jobSink;
	CellSpursJob128 * pJob = g_spuGcm.m_jobPool128.Alloc( *g_spuGcm.m_pRoot->m_pFpcPatch2 );
	// a freshly allocated descriptor must not carry any DMA list yet
	Assert( pJob->header.sizeDmaList == 0 && pJob->header.sizeInOrInOut == 0 );
	// original note: "the default MUST always be 1" ( presumably about this field - confirm )
	pJob->header.useInOutBuffer = 1;

	CDmaListConstructor dmaConstructor( pJob->workArea.dmaList );

	// input DMA #0: the program; input DMA #1: the shared patch state header
	dmaConstructor.AddInputDma( nFpDmaSize, prog );
	dmaConstructor.AddInputDma( sizeof( *m_state.m_pSharedState ), ( void* )m_state.m_pSharedState );

	// the g_spuGcmShared.m_nFpcpStartRangesAfterLastSync runs ahead of m_state.m_pSharedState->m_nStartRanges , because it's a PREDICTED
	// start of range. It'll be absolutely in-sync with m_state.m_pSharedState->m_nStartRanges if we run SPUs synchronously
#ifdef DBGFLAG_ASSERT
	uint nSharedStateStartRanges = m_state.m_pSharedState->m_nStartRanges;
#endif
	// NOTE: if the asserts below fire, it may be due to invalid value in nSharedStateStartRanges because SPU DMAs stuff right down to m_state.m_pSharedState and it's changing while this code executes
	Assert( uint( m_state.m_nEndOfJournalIdx - nSharedStateStartRanges ) <= m_state.m_pSharedState->m_nBufferMask + 1 );
	Assert( uint( g_spuGcmShared.m_nFpcpStartRangesAfterLastSync - nSharedStateStartRanges ) <= uint( m_state.m_nEndOfJournalIdx - nSharedStateStartRanges ) );

	uint nStartOfJournal = /*nSharedStateStartRanges*/g_spuGcmShared.m_nFpcpStartRangesAfterLastSync, nBufferMask = m_state.m_pSharedState->m_nBufferMask;

	// we have 4 DMA elements ( 2..6 ) to fit the constant buffer; the 1st element may have to be as small as 16 bytes.
	// this leaves the max constant buffer size 4 * 16kb + 16 bytes

	const uint numRangeQwords = ( m_state.m_nEndOfJournalIdx - nStartOfJournal );
	Assert( numRangeQwords <= nBufferMask + 1 );
	if ( numRangeQwords != 0 )
	{
		// first journal index past the end of the ring buffer, counting from the start of the journal
		uint nEndOfSpan0 = ( nStartOfJournal + nBufferMask + 1 ) & ~nBufferMask;
		if ( ( signed int )( nEndOfSpan0 - m_state.m_nEndOfJournalIdx ) >= 0 )
		{
			// the journal does not wrap: one contiguous span
			//numRangeQwords = ( m_state.m_nEndOfJournalIdx - nStartOfJournal );
			dmaConstructor.AddInputDmaLarge( ( numRangeQwords ) * sizeof( fltx4 ), m_state.m_pSharedState->GetBufferStart() + ( nStartOfJournal & nBufferMask ) );
		}
		else
		{
			// the journal wraps: tail of the ring buffer first, then the part at its start
			//numRangeQwords = nFirstRange + nSecondRange ;
			dmaConstructor.AddInputDmaLarge( ( nEndOfSpan0 - nStartOfJournal ) * sizeof( fltx4 ), m_state.m_pSharedState->GetBufferStart() + ( nStartOfJournal & nBufferMask ) );
			dmaConstructor.AddInputDmaLarge( ( m_state.m_nEndOfJournalIdx - nEndOfSpan0 ) * sizeof( fltx4 ), m_state.m_pSharedState->GetBufferStart() );
		}
	}
	else
	{
		dmaConstructor.AddSizeInOrInOut( 16 ); // we need at least 16 bytes in the ranges area for temporary storage
	}

	dmaConstructor.FinishIoBuffer( &pJob->header );
	if( pJob->header.sizeDmaList > 7 * sizeof( uint64 ) )
	{
		// elements 7..9 of the list are used for the job parameters below
		Error( "FpcPatch2: DMA list size out of range (%d). job_fpcpatch2 parameters won't fit. numRangeQwords = %d\n", pJob->header.sizeDmaList, numRangeQwords );
	}

	// IMPORTANT: make it always synchronous , in case we don't have the target to patch. The only reason for this job to exist is to make it synchronous
	// Also, if the range is large, still make it synchronous, to avoid subsequent jobs doing a lot of computations in vain
	uint nAsync = !pPatchedProgram || numRangeQwords >= 1024 ? 0 : ( ( m_nFpcPatchCounter ) & m_nFpcPatchSyncMask ) ;

	// job parameters, stored right in the DMA list area
	dmaConstructor[7][0] = m_nFpcPatchCounterOfLastSyncJob;
	dmaConstructor[7][1] = m_nFpcPatchCounter;
	dmaConstructor[8][0] = ( uint32 ) pPatchedProgram;
	dmaConstructor[8][1] = uintp( pJts ); // the SPU->RSX dma element; may be NULL
	dmaConstructor[9][0] = m_state.m_nEndOfJournalIdx;
	dmaConstructor[9][1] = ( uint32 ) nStartOfJournal;
	if( !IsCert() )
	{
		pJob->header.jobType |= CELL_SPURS_JOB_TYPE_MEMORY_CHECK;
	}

	// flag bits share the word with the destination address
	dmaConstructor[8][0] |= g_nFpcPatch2JobExtraFlags;
	if ( !nAsync )
	{
		// a synchronous job writes the state back, so the next journal starts where this one ends
		dmaConstructor[8][0] |= job_fpcpatch::FLAG_PUT_STATE;
		m_nFpcPatchCounterOfLastSyncJob = m_nFpcPatchCounter;
		pJob->header.jobType |= CELL_SPURS_JOB_TYPE_STALL_SUCCESSOR;
		g_spuGcmShared.m_nFpcpStartRangesAfterLastSync = m_state.m_nEndOfJournalIdx;
	}

#ifdef DBGFLAG_ASSERT
	// failures are only counted, for inspection in the debugger
	int nError = cellSpursCheckJob( ( const CellSpursJob256* )pJob, sizeof( *pJob ), 256 );
	static int s_nJobErrors = 0;
	if( CELL_OK != nError )
	{
		++s_nJobErrors;
	}
#endif

	if ( !nAsync )
	{
		jobChain.PushSyncJobSync( CELL_SPURS_JOB_COMMAND_JOB( pJob ) );
	}
	else
	{
		jobChain.Push( CELL_SPURS_JOB_COMMAND_JOB( pJob ) );
	}

#ifdef DEBUG_FPC_PATCHER
	if( m_bSync )
	{
		// busy-wait until the JTS word ( if any ) and the descriptor's eaBinary both read zero
		if( pJts )
		{
			volatile uint32 * pJts2 = pJts;
			while( *pJts2 )
				continue;
		}

		volatile uint64_t * pEaJob = &pJob->header.eaBinary;
		while( * pEaJob )
			continue;
	}
#endif
	s_nFpcPatch2Calls++;

#endif
}
|
|
|
|
|
|
#ifdef DEBUG_FPC_PATCHER
|
|
extern void PatchUcodeConstSwap( uint32 * pDestination, const uint32 * pSource, int nLength );
|
|
extern uint fspatchGetLength( CGtype nType );
|
|
|
|
uint32 g_nConstLengthCounter[5] = { 0, 0, 0, 0, 0 };
|
|
|
|
void CFragmentProgramConstantPatcher_SPU::ValidatePatchedProgram( const CgBinaryProgram * prog, void * pPatchedUcode )
|
|
{
|
|
Assert( prog->profile == PROFILE_SCE_FP_RSX && prog->binaryFormatRevision == CG_BINARY_FORMAT_REVISION );
|
|
uint32 nUCodeSize = prog->ucodeSize;
|
|
void * pUcode = stackalloc( nUCodeSize );
|
|
void * pSourceUcode = ( ( uint8* ) prog ) + prog->ucode;
|
|
V_memcpy( pUcode, ( ( uint8* ) prog ) + prog->ucode, nUCodeSize );
|
|
|
|
CgBinaryParameter * pParameters = ( CgBinaryParameter * )( uintp( prog ) + prog->parameterArray ) ;
|
|
|
|
uint32 * pPatchDestination = NULL;
|
|
Assert( cellGcmCgGetCountParameter( ( CGprogram ) prog ) == prog->parameterCount );
|
|
for ( int nPar = 0; nPar < prog->parameterCount; ++nPar )
|
|
{
|
|
CgBinaryParameter * pPar = pParameters + nPar;
|
|
Assert( pPar == ( CgBinaryParameter * ) cellGcmCgGetIndexParameter( ( CGprogram ) prog, nPar ) );
|
|
|
|
#ifdef DBGFLAG_ASSERT
|
|
const char * pLeafName = ( const char * )( uintp( prog ) + pPar->name );
|
|
( void )pLeafName;
|
|
uint32 * pDefault = pPar->defaultValue ? ( uint32* )( uintp( prog ) + pPar->defaultValue ) : NULL ;
|
|
#endif
|
|
|
|
if ( pPar->embeddedConst )
|
|
{
|
|
Assert( pPar->res == CG_C && pPar->var == CG_UNIFORM ); // this MUST be a uniform constant.. at least I think that's the only kind we need to patch
|
|
const CgBinaryEmbeddedConstant * pEmbedded = ( const CgBinaryEmbeddedConstant* )( uintp( prog ) + pPar->embeddedConst );
|
|
int nLength = fspatchGetLength( pPar->type );
|
|
g_nConstLengthCounter[nLength] ++;
|
|
for ( uint nEm = 0; nEm < pEmbedded->ucodeCount; ++ nEm )
|
|
{
|
|
uint ucodeOffset = pEmbedded->ucodeOffset[nEm]; // is this the offset from prog structure start?
|
|
Assert( ucodeOffset < nUCodeSize - 4 );
|
|
#ifdef DBGFLAG_ASSERT
|
|
Assert( cellGcmCgGetEmbeddedConstantOffset( ( CGprogram ) prog, ( CGparameter ) pPar, nEm ) == ucodeOffset );
|
|
const float * pDefaultCheck = cellGcmCgGetParameterValues( ( CGprogram ) prog, ( CGparameter ) pPar );
|
|
Assert( pDefault == ( uint32* ) pDefaultCheck );
|
|
uint32 * pUcodeEmConst = ( uint32* )( uintp( pSourceUcode ) + ucodeOffset );
|
|
Assert( !pDefault || !V_memcmp( pDefault, pUcodeEmConst, nLength * 4 ) );
|
|
#endif
|
|
|
|
pPatchDestination = ( uint32* )( uintp( pUcode ) + ucodeOffset );
|
|
uint32 * pPatchedCheck = ( uint32* )( uintp( pPatchedUcode ) + ucodeOffset );
|
|
PatchUcodeConstSwap( pPatchDestination, ( uint32* ) & ( m_pSyncState[pPar->resIndex] ), nLength );
|
|
Assert( !V_memcmp( pPatchDestination, pPatchedCheck, nLength * 4 ) );
|
|
}
|
|
}
|
|
}
|
|
|
|
Assert( !V_memcmp( pPatchedUcode, pUcode, nUCodeSize ) );
|
|
}
|
|
#endif
|
|
|
|
|
|
// Attaches this PPU-side state to the shared state block and resets the journal.
//   pSharedState  - shared state block ( followed by the journal ring buffer )
//   nBufferQwords - journal capacity in qwords; it is turned into an index mask
//                   by subtracting 1, so presumably it must be a power of two
void FpcPatchState::Init( job_fpcpatch::FpcPatchState_t * pSharedState, uint32 nBufferQwords )
{
#ifdef _DEBUG
	//m_nRangesAdded = 0;
#endif

	m_pSharedState = pSharedState;
	pSharedState->m_eaThis = m_pSharedState;

	m_nBufferMask = nBufferQwords - 1;
	pSharedState->m_nBufferMask = m_nBufferMask;

	// empty journal: start == end. Non-cert builds begin 128 qwords before the
	// end of the ring buffer instead of at 0.
	if( IsCert() )
	{
		m_nEndOfJournalIdx = 0;
	}
	else
	{
		m_nEndOfJournalIdx = nBufferQwords - 128;
	}
	pSharedState->m_nStartRanges = m_nEndOfJournalIdx;

	pSharedState->m_nThisStatePatchCounter = 0;
	pSharedState->m_nDebuggerBreak = 0;
}
|
|
|
|
|
|
|
|
|
|
// Reconstructs the current value of every virtual constant register: starts
// from the register file in the shared state and replays the journal entries
// between the shared start index and the PPU-side end index on top of it.
//   pRegisters - receives job_fpcpatch::MAX_VIRTUAL_CONST_COUNT registers
void FpcPatchState::GetSyncState( fltx4 * pRegisters )
{
	V_memcpy( pRegisters, m_pSharedState->m_reg, job_fpcpatch:: MAX_VIRTUAL_CONST_COUNT * sizeof( fltx4 ) );

	// Journal indices are free-running counters that are only masked when the
	// ring buffer is addressed, so "before the end" is decided by the signed
	// distance to the end. A plain '<' stops working once the end index wraps.
	for( uint nJournalIdx = m_pSharedState->m_nStartRanges; int( m_nEndOfJournalIdx - nJournalIdx ) > 0 ; )
	{
		// each entry is a one-qword header followed by m_nCount qwords of constants
		job_fpcpatch:: ConstRangeHeader_t & range = ((job_fpcpatch::ConstRangeHeader_t*)m_pSharedState->GetBufferStart())[ nJournalIdx & m_pSharedState->m_nBufferMask ];
		nJournalIdx++;
		for( uint nConstIdx = 0 ; nConstIdx < range.m_u32.m_nCount; ++nConstIdx, ++nJournalIdx )
		{
			pRegisters[ range.m_u32.m_nStart + nConstIdx ] = m_pSharedState->GetBufferStart()[nJournalIdx & m_pSharedState->m_nBufferMask ];
		}
	}
}
|
|
|
|
/*
|
|
void FpcPatchState::Reset()
|
|
{
|
|
m_nEndOfJournalIdx = 0;
|
|
m_pSharedState->m_nStartRanges = 0;
|
|
}
|
|
*/
|
|
#ifdef _DEBUG
|
|
// Debugger aids, -1 == disabled. AddRange() breaks into the debugger when the
// register number in s_nDebugSetConst is covered by the range being added.
static int s_nDebugRangeAdd = -1;
static int s_nDebugSetConst = -1;
|
|
#endif
|
|
|
|
// Appends one range of constants ( a one-qword header plus nCount qwords of
// data ) to the journal, sleeping in 60us steps while the ring buffer has no
// room for it.
//   nStart - first virtual constant register of the range
//   nCount - number of registers ( qwords ) in the range
//   pData  - source data for the range
// Returns the number of waits that were needed; 0 means no stall.
uint FpcPatchState::AddRange( uint32 nStart, uint32 nCount, const float * pData )
{
#ifndef _CERT
	if( nStart + nCount > job_fpcpatch::MAX_VIRTUAL_CONST_COUNT )
	{
		Error( "AddRange(%d..%d) out of range <%d\n", nStart, nCount, int( job_fpcpatch::MAX_VIRTUAL_CONST_COUNT ) );
	}
#endif

#ifdef _DEBUG
	//Assert( s_nDebugRangeAdd != m_nRangesAdded );
	const int nDebugOffset = int( s_nDebugSetConst - nStart );
	if( nDebugOffset >= 0 && nDebugOffset < int( nCount ) )
	{
		// the watched register is about to be set: load its new value for inspection and stop
		fltx4 flDebugRegister = LoadUnalignedSIMD( pData + 4 * nDebugOffset );
		DebuggerBreak();
	}
	//++m_nRangesAdded;
#endif

	// spin-wait, then V_memcpy range
	COMPILE_TIME_ASSERT( sizeof( job_fpcpatch::ConstRangeHeader_t ) == 16 );
	const uint nSpins = 0x1FF;
	Assert( !( nSpins & ( nSpins + 1 ) ) );

	//
	// We need space for nCount + 1 QWords (1 Qword for the ConstRangeHeader_t)
	// And we need m_nEndOfJournalIdx != m_nStartRanges to distinguish between
	// the all-empty and all-full buffers
	//

	const uint nHangReportAttempt = 1000000 / 60;	// about a second's worth of 60us sleeps
	uint nAttempts = 0;
	while ( true )
	{
		uint32 nStartRanges = m_pSharedState->m_nStartRanges;
		Assert( uint32( m_nEndOfJournalIdx - nStartRanges ) <= m_nBufferMask + 1 );

		// compute the new end - start; is it running further than buffer size away?
		const uint32 nPastCapacity = m_nEndOfJournalIdx + nCount - ( nStartRanges + m_nBufferMask + 1 );
		if ( nPastCapacity & 0x80000000 )
		{
			// no, the comparison is negative, therefore it's safe to fill it in
			break;
		}

		// if ( ( nAttempts & nSpins ) == nSpins )
		{
			// the caller prints warning about this stall.
			sys_timer_usleep( 60 ); // TODO: proper spinwait; proper OS syncronization

			if( nAttempts == nHangReportAttempt )
			{
				// waiting for a second already ...
				Warning(
					"***************************************************************************************************************\n"
					"* SPU hang in FpcPatchState::AddRange(). Please send this log (including a couple of screens above) to Sergiy *\n"
				);
				Msg( "AddRange(%d,%d,%p), ", nStart, nCount, pData );
				Msg( "SharedState @%p {start=0x%X&0x%X,patch=%X,job=%X},", m_pSharedState, m_pSharedState->m_nStartRanges, m_pSharedState->m_nBufferMask, m_pSharedState->m_nThisStatePatchCounter, m_pSharedState->m_eaThisStateJobDescriptor );
				Msg( "FpcpState @%p {end=0x%X},", this, this->m_nEndOfJournalIdx );
				Msg( "SpuGcmShared trace {0x%X,0x%X,0x%X}\n", g_spuGcmShared.m_nFpcPatchCounterOfLastSyncJob, g_spuGcmShared.m_nFpcPatchCounter, g_spuGcmShared.m_nFpcpStartRangesAfterLastSync );

				Msg( "RSX put=%X, get=%X sysring{put=%X,end=%X}\n", g_spuGcmShared.m_eaGcmControlRegister->put, g_spuGcmShared.m_eaGcmControlRegister->get,
					g_spuGcmShared.m_sysring.m_nPut, g_spuGcmShared.m_sysring.m_nEnd );

				Msg( "last JTS ret guard patched @%X, ", *cellGcmGetLabelAddress( GCM_LABEL_DEBUG_FPCP_RING ) );

				// dump both halves of the fragment program ring, one segment at a time
				Msg( "ringRsx[%d]:", g_spuGcmShared.m_fpcpRing.m_ringRsx.Count() );
				for( int iSegment = 0; iSegment < g_spuGcmShared.m_fpcpRing.m_ringRsx.Count(); ++iSegment )
				{
					RsxSpuDoubleRing::Segment_t & segment = g_spuGcmShared.m_fpcpRing.m_ringRsx[iSegment];
					const char * pszJtsState = "ERROR";
					if( *( segment.m_pSpuJts ) == CELL_SPURS_JOB_COMMAND_LWSYNC )
					{
						pszJtsState = "LWSYNC";
					}
					else if( *( segment.m_pSpuJts ) == CELL_SPURS_JOB_COMMAND_JTS )
					{
						pszJtsState = "JTS";
					}
					Msg(" {%X,%p,%s}", segment.m_eaBase, segment.m_pSpuJts, pszJtsState );
				}

				Msg( "\nringSpu[%d]:", g_spuGcmShared.m_fpcpRing.m_ringSpu.Count() );
				for( int iSegment = 0; iSegment < g_spuGcmShared.m_fpcpRing.m_ringSpu.Count(); ++iSegment )
				{
					RsxSpuDoubleRing::Segment_t & segment = g_spuGcmShared.m_fpcpRing.m_ringSpu[iSegment];
					const char * pszJtsState = "ERROR";
					if( *( segment.m_pSpuJts ) == CELL_SPURS_JOB_COMMAND_LWSYNC )
					{
						pszJtsState = "LWSYNC";
					}
					else if( *( segment.m_pSpuJts ) == CELL_SPURS_JOB_COMMAND_JTS )
					{
						pszJtsState = "JTS";
					}
					Msg(" {%X,%p,%s}", segment.m_eaBase, segment.m_pSpuJts, pszJtsState );
				}
				Msg( "***************************************************************************************************************\n" );
			}
		}

		++nAttempts;
	}

	// we have enough free buffer to insert stuff
	job_fpcpatch::ConstRangeHeader_t * pHeader = ( job_fpcpatch::ConstRangeHeader_t * )AddInternalPtr();
	pHeader->m_u32.m_nStart = nStart;
	pHeader->m_u32.m_nCount = nCount;

	// add constants block
	AddInternalBlock( pData, nCount );

	return nAttempts;
}
|
|
|
|
#endif
|