1956 lines
72 KiB
C++
1956 lines
72 KiB
C++
//========== Copyright © Valve Corporation, All rights reserved. ========
|
|
// This is the central hub for controlling SPU activities relating to
|
|
// RSX/graphics processing/rendering
|
|
//
|
|
#include "spugcm.h"
|
|
#include "ps3/ps3gcmmemory.h"
|
|
#include "fpcpatcher_spu.h"
|
|
#include "ps3gcmstate.h"
|
|
#include "vjobs/root.h"
|
|
#include "ps3/ps3gcmlabels.h"
|
|
#include "ps3/vjobutils_shared.h"
|
|
#include "vjobs/jobparams_shared.h"
|
|
#include "vjobs/ibmarkup_shared.h"
|
|
#include "inputsystem/iinputsystem.h"
|
|
#include <sysutil/sysutil_common.h>
|
|
#include <sysutil/sysutil_sysparam.h>
|
|
#include <cell/pad.h>
|
|
#include <materialsystem/imaterialsystem.h>
|
|
#include "fpcpatcher_spu.h"
|
|
#include "dxabstract.h"
|
|
#include "rsxflip.h"
|
|
|
|
// Job system interface, defined elsewhere; OnGcmInit() registers g_spuGcm with it.
extern IVJobs * g_pVJobs;
|
|
// State block shared with the SPU jobs built in this file: it is attached as the first input DMA of those jobs.
CSpuGcmSharedState g_spuGcmShared;
|
|
|
|
// The CSpuGcm instance that the free-function and static callbacks in this file forward to.
CSpuGcm g_spuGcm;
|
|
// s_nFinishLabelValue is the value OnGcmInit() stores into the GCM_LABEL_SPUGCM_FINISH label.
// NOTE(review): s_nStopAtFinishLabelValue is not referenced in the visible part of this file.
static int s_nFinishLabelValue = 0, s_nStopAtFinishLabelValue = -1;
|
|
// Edge geometry ring; initialized in OnGcmInit() and published through g_spuGcmShared.m_eaEdgeGeomRing.
CEdgeGeomRing g_edgeGeomRing;
|
|
// NOTE(review): g_aici and g_edgePostWorkload are not used in the visible part of this file.
ApplicationInstantCountersInfo_t g_aici;
|
|
CEdgePostWorkload g_edgePostWorkload;
|
|
|
|
// Shorthand for the GCM context that gCellGcmCurrentContext currently points to.
#define PCB_RING_CTX ( *gCellGcmCurrentContext )
|
|
|
|
void FillNops( struct CellGcmContextData *context )
|
|
{
|
|
while( context->current < context->end )
|
|
*( context->current++ ) = CELL_GCM_METHOD_NOP;
|
|
}
|
|
|
|
// Free-function trampoline: forwards a GCM context reserve request to
// g_spuGcm.OnGcmCommandBufferReserveCallback() and returns its result.
// OnGcmInit() installs this function as PCB_RING_CTX.callback.
int32_t SpuGcmCommandBufferReserveCallback( struct CellGcmContextData *context, uint32_t nCount )
{
	const int32_t nResult = g_spuGcm.OnGcmCommandBufferReserveCallback( context, nCount );
	return nResult;
}
|
|
|
|
// Debug hook (assigned to gCellGcmDebugCallback in OnGcmInit() when
// CELL_GCM_DEBUG is defined). Asserts that it was invoked for the current
// GCM context, then calls g_spuGcm.CmdBufferFinish().
void SpuGcmDebugFinish( CellGcmContextData *thisContext )
{
	Assert( &PCB_RING_CTX == thisContext );

	g_spuGcm.CmdBufferFinish();
}
|
|
|
|
|
|
void StallAndWarning( const char * pWarning )
|
|
{
|
|
sys_timer_usleep( 30 );
|
|
if( g_spuGcmShared.m_enableStallWarnings )
|
|
{
|
|
Warning( "Stall: %s\n", pWarning );
|
|
}
|
|
}
|
|
|
|
|
|
//#endif
|
|
|
|
void CSpuGcm::CreateRsxBuffers()
|
|
{
|
|
//////////////////////////////////////////////////////////////////////////
|
|
// Create Fragment program patch buffers
|
|
//
|
|
uint nFpcpRingBufferSize = CalculateMemorySizeFromCmdLineParam( "-fpcpRingSize", 512 * 1024, 32 * 1024 );
|
|
Msg("Fpcp ring size: %d bytes \n", nFpcpRingBufferSize );
|
|
m_fpcpRingBuffer.Alloc( kAllocPs3GcmShader, nFpcpRingBufferSize );
|
|
g_spuGcmShared.m_fpcpRing.SetRsxBuffer( m_fpcpRingBuffer.DataInLocalMemory(), nFpcpRingBufferSize, nFpcpRingBufferSize / 4, nFpcpRingBufferSize / 4096 );
|
|
uint nEdgeRingBufferSize = CalculateMemorySizeFromCmdLineParam( "-edgeRingSize", 2 * 1024 * 1024, 1536 * 1024 );
|
|
Msg("Edge ring size: %d bytes\n", nEdgeRingBufferSize );
|
|
m_edgeGeomRingBuffer.Alloc( kAllocPs3GcmEdgeGeomBuffer, nEdgeRingBufferSize );
|
|
if( nEdgeRingBufferSize < 8 * EDGEGEOMRING_MAX_ALLOCATION )
|
|
{
|
|
Error( "EdgeGeom has ring buffer that won't fit 8 jobs, which is a minimum. %u ( %u ) < 8 * %u\n", nEdgeRingBufferSize, m_edgeGeomRingBuffer.Size(), EDGEGEOMRING_MAX_ALLOCATION );
|
|
}
|
|
if( nEdgeRingBufferSize < 6 * 8 * EDGEGEOMRING_MAX_ALLOCATION )
|
|
{
|
|
Warning( "EdgeGeom has ring buffer that may block job_edgegeom performance. %u ( %u ) < 6 SPUs * 8 segments * %u\n", nEdgeRingBufferSize, m_edgeGeomRingBuffer.Size(), EDGEGEOMRING_MAX_ALLOCATION );
|
|
}
|
|
}
|
|
|
|
|
|
const vec_uint4 g_vuSpuGcmCookie = (vec_uint4){0x04291978,0xC00CC1EE,0x04291978,0xC00CC1EE};
|
|
void CSpuGcm::CreateIoBuffers()
|
|
{
|
|
const uint nCmdBufferOverfetchSlack = 1024;
|
|
uint nFpRingIoBufferSize = 16 * 1024;
|
|
uint nFpRingBufferSize = Max( nFpRingIoBufferSize, nCmdBufferOverfetchSlack ); // this buffer is RSX-write-only, at the end of mapped memory, it acts as an overfetch slack, too, so it must be at least the size of the slack
|
|
g_spuGcmShared.m_fpcpRing.SetIoBuffer( g_ps3gcmGlobalState.IoMemoryPrealloc( nFpRingIoBufferSize, nFpRingBufferSize ), nFpRingIoBufferSize );
|
|
|
|
m_pMlaaBufferCookie = NULL;
|
|
m_pMlaaBuffer = NULL;
|
|
m_pMlaaBufferOut = NULL;
|
|
m_pEdgePostRsxLock = NULL;
|
|
|
|
if( !CommandLine()->FindParm( "-noMlaa" ) )
|
|
//if( CommandLine()->FindParm( "-edgeMlaa" ) )
|
|
{
|
|
uint nSizeofEdgePostBuffer = g_ps3gcmGlobalState.GetRenderSurfaceBytes( 128 );
|
|
m_pMlaaBuffer = g_ps3gcmGlobalState.IoMemoryPrealloc( 128, nSizeofEdgePostBuffer + sizeof( g_vuSpuGcmCookie ) + sizeof( uint32 ) * CPs3gcmDisplay::SURFACE_COUNT );
|
|
if( m_pMlaaBuffer )
|
|
{
|
|
m_pMlaaBufferOut = m_pMlaaBuffer;//( void* )( uintp( m_pMlaaBuffer ) + nSizeofEdgePostBuffer );
|
|
m_pMlaaBufferCookie = ( vec_uint4* ) ( uintp( m_pMlaaBufferOut ) + nSizeofEdgePostBuffer );
|
|
*m_pMlaaBufferCookie = g_vuSpuGcmCookie;
|
|
m_pEdgePostRsxLock = ( uint32* )( m_pMlaaBufferCookie + 1 );
|
|
}
|
|
else
|
|
{
|
|
// if MlaaBuffer is NULL, it just means we're in the pass of computing the IO memory requirements
|
|
}
|
|
}
|
|
}
|
|
|
|
|
|
|
|
//
|
|
// memory optimization: IO memory has slack, use it if it's big enough
|
|
//
|
|
void CSpuGcm::UseIoBufferSlack( uint nIoBufferSlac )
|
|
{
|
|
uint nSpuDrawQueueSize = CalculateMemorySizeFromCmdLineParam( "-spuDrawRingSize", 512 * 1024, 32 * 1024 );
|
|
Msg( "SPU draw queue size: %d Kb\n" , nSpuDrawQueueSize / 1024 );
|
|
uint nSpuDrawQueueDeferredSize = CalculateMemorySizeFromCmdLineParam( "-spuDrawDeferredRingSize", 210 * 1024, 32 * 1024 );
|
|
Msg( "SPU draw deferred queue size: %d Kb\n" , nSpuDrawQueueDeferredSize / 1024 );
|
|
|
|
m_nSpuDrawQueueSelector = 0;
|
|
m_spuDrawQueues[0].Init( nSpuDrawQueueSize, &g_spuGcmShared.m_nSpuDrawGet[0], OnSpuDrawQueueFlush, OnSpuDrawQueueStall );
|
|
m_spuDrawQueues[1].Init( nSpuDrawQueueDeferredSize, &g_spuGcmShared.m_nSpuDrawGet[1], OnSpuDrawQueueFlushDeferred, OnSpuDrawQueueStallDeferredDelegator );
|
|
|
|
for( uint i = 0; i < ARRAYSIZE( m_pDeferredStates ); ++i )
|
|
m_pDeferredStates[i] = ( DeferredState_t * ) g_ps3gcmGlobalState.IoSlackAlloc( 128, sizeof( DeferredState_t ) );
|
|
|
|
for( uint i = 0; i < ARRAYSIZE( m_pDeferredQueueCursors ); ++i )
|
|
m_pDeferredQueueCursors[i] = m_spuDrawQueues[1].GetCursor();
|
|
m_pDeferredQueueSegment = m_pDeferredQueueCursors[0];
|
|
m_pDeferredChunkSubmittedTill[1] = m_spuDrawQueues[1].GetCursor();
|
|
for( uint i = 0; i < ARRAYSIZE( m_spuDrawQueues ); ++i )
|
|
g_spuGcmShared.m_nSpuDrawGet[i] = m_spuDrawQueues[i].GetSignal();
|
|
}
|
|
|
|
|
|
// Storage whose address OnGcmInit() publishes as g_spuGcmShared.m_eaVertexProgramConstants.
static fltx4 g_vertexProgramConstants[CELL_GCM_VTXPRG_MAX_CONST];
|
|
// static uint s_nLastCtxBufferCookie = 0;
|
|
// static uint s_nCtxBufferSegmentSubmitTime = 0; // divide by 2 and it'll be the weighted average of 79.8MHz ticks between segment submissions
|
|
|
|
// One-time initialization of the SPU GCM layer. In order, this function:
//  - raises Error() if the local base address or the IO offset delta is not 128-byte aligned
//  - zeroes the dx/cached render state in g_spuGcmShared and resets member bookkeeping
//  - initializes the edge geometry feeder/ring and the fragment program patch ring
//  - registers this object with g_pVJobs and begins the first GCM state transaction
//  - sets up the sysring over the IO command buffer that follows SYSTEM_CMD_BUFFER_RESERVED_AREA
//  - writes the RSX put register and polls get (at most ~1000 times, 1ms apart) until it matches
//  - clears begin/current/end of the current GCM context and installs SpuGcmCommandBufferReserveCallback
// The statement order below matters; do not reorder without understanding the RSX hand-over.
void CSpuGcm::OnGcmInit()
|
|
{
|
|
// DMA assumptions: both address mappings must preserve 128-byte alignment
if( 127 & uintp( g_ps3gcmGlobalState.m_pLocalBaseAddress ) )
|
|
{
|
|
Error( "Local addresses map to main memory without proper 128-byte alignment! Some DMA assumptions are broken!!\n" );
|
|
}
|
|
if( 127 & uintp( g_ps3gcmGlobalState.m_nIoOffsetDelta ) )
|
|
{
|
|
Error( "IO addresses map to local memory without proper 128-byte alignment! Some DMA assumptions are broken!!\n" );
|
|
}
|
|
V_memset( &g_spuGcmShared.m_dxGcmState, 0, sizeof( g_spuGcmShared.m_dxGcmState ) );
|
|
V_memset( &g_spuGcmShared.m_cachedRenderState, 0, sizeof( g_spuGcmShared.m_cachedRenderState ) );
|
|
|
|
m_nPcbringWaitSpins = 0;
|
|
m_pPcbringBuffer = NULL;
|
|
m_eaLastJobThatUpdatesSharedState = 0;
|
|
g_spuGcmShared.m_enableStallWarnings = ( CommandLine()->FindParm( "-enableStallWarnings" ) != 0 );
|
|
|
|
// edge geometry feeder and ring both use the size of the buffer allocated in CreateRsxBuffers()
g_spuGcmShared.m_edgeGeomFeeder.Init( m_edgeGeomRingBuffer.Size() );
|
|
g_edgeGeomRing.Init( m_edgeGeomRingBuffer.DataInLocalMemory(), m_edgeGeomRingBuffer.Size(), g_ps3gcmGlobalState.m_nIoOffsetDelta, g_ps3gcmGlobalState.m_pLocalBaseAddress, GCM_LABEL_EDGEGEOMRING );
|
|
g_spuGcmShared.m_eaEdgeGeomRing = &g_edgeGeomRing;
|
|
|
|
g_spuGcmShared.m_fpcpRing.OnGcmInit( g_ps3gcmGlobalState.m_nIoOffsetDelta );
|
|
g_spuGcmShared.m_nDrawLayerBits = g_spuGcmShared.LAYER_RENDER;
|
|
g_spuGcmShared.m_nDrawLayerPredicates = g_spuGcmShared.LAYER_RENDER_AND_Z;
|
|
|
|
g_spuGcmShared.m_nLastRsxInterruptValue = 0;
|
|
|
|
// m_pEdgePostRsxLock is only non-NULL when CreateIoBuffers() allocated the MLAA buffer
if( m_pEdgePostRsxLock )
|
|
{
|
|
for( uint i = 0; i < CPs3gcmDisplay::SURFACE_COUNT; ++i )
|
|
{
|
|
m_pEdgePostRsxLock[i] = CELL_GCM_RETURN(); // assume previous flips already happened
|
|
}
|
|
}
|
|
|
|
g_pVJobs->Register( this );
|
|
|
|
m_zPass.Init();
|
|
m_bUseDeferredDrawQueue = true;
|
|
BeginGcmStateTransaction();
|
|
// mirror the pixel shader patcher state into the shared block
g_pixelShaderPatcher.InitLocal( g_spuGcmShared.m_fpcpRing.GetRsxBuffer(), g_spuGcmShared.m_fpcpRing.GetRsxBufferSize() );
|
|
g_spuGcmShared.m_eaFpcpSharedState = g_pixelShaderPatcher.m_state.m_pSharedState;
|
|
g_spuGcmShared.m_nFpcpBufferMask = g_spuGcmShared.m_eaFpcpSharedState->m_nBufferMask;
|
|
g_spuGcmShared.m_eaLocalBaseAddress = (uint32)g_ps3gcmGlobalState.m_pLocalBaseAddress;
|
|
g_spuGcmShared.m_cachedRenderState.m_nDisabledSamplers = 0;
|
|
g_spuGcmShared.m_cachedRenderState.m_nSetTransformBranchBits = 0;
|
|
g_spuGcmShared.m_nDebuggerRunMask = SPUGCM_DEBUG_MODE ? 2 : 0;
|
|
g_spuGcmShared.m_eaLastJobThatUpdatedMe = 0;
|
|
g_spuGcmShared.m_nFpcPatchCounterOfLastSyncJob = g_pixelShaderPatcher.m_nFpcPatchCounterOfLastSyncJob;
|
|
g_spuGcmShared.m_nFpcPatchCounter = g_pixelShaderPatcher.m_nFpcPatchCounter;
|
|
g_spuGcmShared.m_nFpcpStartRangesAfterLastSync = g_spuGcmShared.m_eaFpcpSharedState->m_nStartRanges;
|
|
g_spuGcmShared.m_eaZPassSavedState = NULL;
|
|
|
|
g_spuGcmShared.m_nIoLocalOffsetEmptyFragmentProgramSetupRoutine = g_ps3gcmGlobalState.m_nIoLocalOffsetEmptyFragmentProgramSetupRoutine;
|
|
g_spuGcmShared.m_eaPs3texFormats = g_ps3texFormats;
|
|
|
|
g_spuGcmShared.m_eaVertexProgramConstants = g_vertexProgramConstants;
|
|
|
|
m_nGcmFlushJobScratchSize = 0;
|
|
m_nFrame = 0;
|
|
|
|
// we shouldn't have used this format yet
|
|
Assert( g_ps3texFormats[PS3_TEX_CANONICAL_FORMAT_COUNT].m_gcmPitchPer4X == 0 );
|
|
Assert( g_ps3texFormats[PS3_TEX_CANONICAL_FORMAT_COUNT-1].m_gcmPitchPer4X != 0 );
|
|
Assert( !( 0xF & uintp( g_spuGcmShared.m_eaPs3texFormats ) ) );
|
|
|
|
Assert( g_spuGcmShared.m_nIoLocalOffsetEmptyFragmentProgramSetupRoutine );
|
|
COMPILE_TIME_ASSERT( !GCM_CTX_UNSAFE_MODE );
|
|
{
|
|
m_pFinishLabel = cellGcmGetLabelAddress( GCM_LABEL_SPUGCM_FINISH );
|
|
*m_pFinishLabel = s_nFinishLabelValue;
|
|
|
|
// the sysring occupies the command buffer after the reserved system area, minus room for the wrap-around tail
uint nSysringBytes = g_ps3gcmGlobalState.m_nCmdSize - SYSTEM_CMD_BUFFER_RESERVED_AREA - 16 - sizeof( SysringWrapSequence::Tail_t ); // 16 bytes for the JTN to wrap the buffer around, and to be able to DMA it in 16-byte chunks
|
|
nSysringBytes &= -16; // make it 16-byte aligned
|
|
uint eaSysringBuffer = uintp( g_ps3gcmGlobalState.m_pIoAddress ) + SYSTEM_CMD_BUFFER_RESERVED_AREA;
|
|
uint32 * pSysringBufferEnd = ( uint32* )( eaSysringBuffer + nSysringBytes );
|
|
// chained assignment: the same jump-to-begin command is stored in the wrap tail and at the end of the ring
*pSysringBufferEnd = // this is not strictly needed...
|
|
g_spuGcmShared.m_sysringWrap.m_tail.m_nJumpToBegin = CELL_GCM_JUMP( SYSTEM_CMD_BUFFER_RESERVED_AREA );
|
|
V_memset( g_spuGcmShared.m_sysringWrap.m_tail.m_nNops, 0, sizeof( g_spuGcmShared.m_sysringWrap.m_tail.m_nNops ) );
|
|
Assert( !( 0xF & uint( &g_spuGcmShared.m_sysringWrap ) ) );
|
|
|
|
|
|
//COMPILE_TIME_ASSERT( SPUGCM_USE_SET_REFERENCE_FOR_SYSRING_SIGNAL );
|
|
//g_spuGcmShared.m_pEaSysringEndLabel = ( uint32* ) cellGcmGetLabelAddress( GCM_LABEL_SYSRING_END );
|
|
//*g_spuGcmShared.m_pEaSysringEndLabel = g_spuGcmShared.m_sysring.m_nEnd; // pretend we finished all processing
|
|
|
|
//g_spuGcmShared.m_nSysringSegmentWords = ( g_ps3gcmGlobalState.m_nCmdSize - nSysringCmdBufferSystemArea ) / sizeof( uint32 ) / g_spuGcmShared.NUM_SYSTEM_SEGMENTS;
|
|
//g_spuGcmShared.m_nSysringSegmentWords &= -16; // make it aligned, at least -4 words but may be more for easier debugging (more round numbers)
|
|
g_spuGcmShared.m_nIoOffsetDelta = g_ps3gcmGlobalState.m_nIoOffsetDelta;
|
|
g_spuGcmShared.m_nSysringWaitSpins = 0;
|
|
g_spuGcmShared.m_nSysringPuts = 0;
|
|
g_spuGcmShared.m_nSysringSegmentSizeLog2 = 29 - __cntlzw( g_ps3gcmGlobalState.m_nCmdSize ); // make 4 subsegments; guarantee segment switch whenever the ring wraps around
|
|
// we need AT LEAST 2 segments and each segment must be AT LEAST 1kb - for performant and reliable operation;
|
|
Assert( ( g_ps3gcmGlobalState.m_nCmdSize >> g_spuGcmShared.m_nSysringSegmentSizeLog2 ) > 2 && ( g_ps3gcmGlobalState.m_nCmdSize >> g_spuGcmShared.m_nSysringSegmentSizeLog2 ) < 8 && g_spuGcmShared.m_nSysringSegmentSizeLog2 >= 10 );
|
|
//g_spuGcmShared.m_nSysringPut = 0;
|
|
//g_spuGcmShared.m_nSysringEnd = g_spuGcmShared.NUM_SYSTEM_SEGMENTS; // pretend we got the whole buffer already
|
|
g_spuGcmShared.m_nDebuggerBreakMask = 0x00000000;
|
|
g_spuGcmShared.m_nDebugLastSeenGet = 0xFEFEFEFE;
|
|
|
|
// PCB ring bookkeeping: the size is a compile-time power of two; no buffer is allocated here (allocation is commented out)
uint nPcbringSize = SPUGCM_DEFAULT_PCBRING_SIZE;
|
|
COMPILE_TIME_ASSERT( !( SPUGCM_DEFAULT_PCBRING_SIZE & ( SPUGCM_DEFAULT_PCBRING_SIZE - 1 ) ) );
|
|
g_spuGcmShared.m_nPcbringSize = nPcbringSize ;
|
|
// 12 extra bytes are allocated for buffer alignment code to avoid writing past end of the buffer ; 4 more bytes are for the cookie
|
|
//m_pPcbringBuffer = ( uint32 * )MemAlloc_AllocAligned( nPcbringSize + 12 + 4, 0x10 );
|
|
//*AddBytes( m_pPcbringBuffer, g_spuGcmShared.m_nPcbringSize + 12 ) = 0x1234ABCD;
|
|
m_nPcbringBegin = 0;
|
|
g_spuGcmShared.m_nPcbringEnd = g_spuGcmShared.m_nPcbringSize; // consider the full ring buffer already processed on SPU and free: this End is the end of "free to use" area
|
|
// these is the max count of words needed to align the cmd buffer and insert any write-labels/set-reference-values
|
|
// we need to add at least 3 to the count, in case we align current pointer in the process ( because we may need to submit )
|
|
// also, we want this segment size to fit inside the between-segment signal
|
|
m_nMaxPcbringSegmentBytes = Min<uint>( ( ( nPcbringSize - 32 - SPUGCM_SIZEOF_SYSRING_ENDOFSEGMENT_SIGNAL_COMMAND ) / 4 ) & -16, ( 1 << g_spuGcmShared.m_nSysringSegmentSizeLog2 ) - SPUGCM_SIZEOF_SYSRING_ENDOFSEGMENT_SIGNAL_COMMAND - 12 ); //
|
|
// we definitely need PCBring segment to fit well into local store
|
|
m_nMaxPcbringSegmentBytes = Min<uint>( m_nMaxPcbringSegmentBytes, SPUGCM_LSRING_SIZE / 2 );
|
|
m_nMaxPcbringSegmentBytes = Min<uint>( m_nMaxPcbringSegmentBytes, SPUGCM_MAX_PCBRING_SEGMENT_SIZE );
|
|
m_nMaxPcbringSegmentBytes &= -16; // make it 16-byte aligned..
|
|
|
|
cellGcmReserveMethodSize( gCellGcmCurrentContext, 3 ); // we need at most ( 2 words for reference command + ) 3 words for alignment
|
|
|
|
// align the buffer on 16-byte boundary, because we manage it in 16-byte increments
|
|
while( 0xF & uintp( gCellGcmCurrentContext->current ) )
|
|
{
|
|
*( gCellGcmCurrentContext->current++ ) = CELL_GCM_METHOD_NOP;
|
|
}
|
|
|
|
// the sysring starts out positioned at the current (now 16-byte aligned) write pointer of the GCM context
g_spuGcmShared.m_sysring.Init( eaSysringBuffer, nSysringBytes, uint( gCellGcmCurrentContext->current ) - eaSysringBuffer );
|
|
g_spuGcmShared.m_sysringRo.Init( GCM_LABEL_SYSRING_SIGNAL );
|
|
g_spuGcmShared.m_nSysringWrapCounter = 0;
|
|
g_spuGcmShared.m_eaGcmControlRegister = cellGcmGetControlRegister();
|
|
g_spuGcmShared.m_eaSysringLabel = cellGcmGetLabelAddress( GCM_LABEL_SYSRING_SIGNAL );
|
|
g_spuGcmShared.m_eaDebugLabel[0] = cellGcmGetLabelAddress( GCM_LABEL_DEBUG0 );
|
|
g_spuGcmShared.m_eaDebugLabel[1] = cellGcmGetLabelAddress( GCM_LABEL_DEBUG1 );
|
|
g_spuGcmShared.m_eaDebugLabel[2] = cellGcmGetLabelAddress( GCM_LABEL_DEBUG2 );
|
|
*g_spuGcmShared.m_eaSysringLabel = g_spuGcmShared.m_sysring.GetSignal(); // pretend we executed WriteLabel
|
|
g_spuGcmShared.m_nLastSignal = g_spuGcmShared.m_sysring.GetInvalidSignal();
|
|
#if SPU_GCM_DEBUG_TRACE
|
|
g_spuGcmShared.m_nDebugTraceBufferNext = 0;
|
|
g_spuGcmShared.m_eaDebugTraceBuffer = ( SpuGcmDebugTrace_t* )MemAlloc_AllocAligned( g_spuGcmShared.DEBUG_BUFFER_COUNT * sizeof( SpuGcmDebugTrace_t ), 16 );
|
|
#endif
|
|
if( SPUGCM_USE_SET_REFERENCE_FOR_SYSRING_SIGNAL )
|
|
{
|
|
g_spuGcmShared.m_eaGcmControlRegister->ref = g_spuGcmShared.m_sysring.m_nEnd;// pretend we finished all processing
|
|
}
|
|
|
|
#ifdef _DEBUG
|
|
m_nJobsPushed = 0;
|
|
// fill in JTS in the rest of the buffer
|
|
// (each slack word is a jump to its own offset from the IO base address)
for( uint32 * pSlack = gCellGcmCurrentContext->current; pSlack < pSysringBufferEnd; ++pSlack )
|
|
*pSlack = CELL_GCM_JUMP( uintp( pSlack ) - uintp( g_ps3gcmGlobalState.m_pIoAddress ) );
|
|
#endif
|
|
// set reference BEFORE we switch to sysring
|
|
uint nGcmPut = uintp( gCellGcmCurrentContext->current ) + g_spuGcmShared.m_nIoOffsetDelta;
|
|
Assert( !( 0xF & nGcmPut ) );
|
|
__sync();
|
|
g_spuGcmShared.m_eaGcmControlRegister->put = nGcmPut;
|
|
// wait for RSX to reach this point, then switch to the new command buffer scheme
|
|
int nAttempts = 0;
|
|
while( g_spuGcmShared.m_eaGcmControlRegister->get != nGcmPut )
|
|
{
|
|
sys_timer_usleep(1000);
|
|
if( ++nAttempts > 1000 )
|
|
{
|
|
Warning( "Cannot properly wait for RSX in OnGcmInit(%X!=%X); assuming everything's all right anyway.\n", g_spuGcmShared.m_eaGcmControlRegister->get, nGcmPut );
|
|
break; // don't wait forever..
|
|
}
|
|
}
|
|
|
|
//////////////////////////////////////////////////////////////////////////
|
|
// Switch to PPU Command Buffer RING
|
|
//
|
|
// set reference BEFORE we switch to sysring; wait for all RSX initialization to go through before switching
|
|
// begin/current/end are all NULL from here on, so the next reserve goes through the callback installed below
PCB_RING_CTX.begin = PCB_RING_CTX.current = NULL;//m_pPcbringBuffer;
|
|
// we need to at least double-buffer to avoid deadlocks while waiting to submit a Pcbring segment
|
|
// Each segment ends with a reference value update, and we need that update to unblock a piece of memory for use by subsequent submits
|
|
Assert( GetMaxPcbringSegmentBytes() <= nPcbringSize / 2 );
|
|
PCB_RING_CTX.end = NULL;//AddBytes( m_pPcbringBuffer, GetMaxPcbringSegmentBytes() );
|
|
PCB_RING_CTX.callback = SpuGcmCommandBufferReserveCallback;
|
|
|
|
#ifdef CELL_GCM_DEBUG // [
|
|
gCellGcmDebugCallback = SpuGcmDebugFinish;
|
|
cellGcmDebugCheckEnable( CELL_GCM_TRUE );
|
|
#endif // ]
|
|
}
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
inline signed int CSpuGcm::GetPcbringAvailableBytes()const
|
|
{
|
|
int nReallyAvailable = int32( *(volatile uint32*)&g_spuGcmShared.m_nPcbringEnd ) - int32( m_nPcbringBegin );
|
|
#ifdef DBGFLAG_ASSERT
|
|
Assert( uint( nReallyAvailable ) <= g_spuGcmShared.m_nPcbringSize );
|
|
static int s_nLastPcbringAvailableBytes = -1;
|
|
s_nLastPcbringAvailableBytes = nReallyAvailable;
|
|
#endif
|
|
Assert( nReallyAvailable >= 0 );
|
|
return nReallyAvailable;
|
|
}
|
|
|
|
|
|
|
|
int CSpuGcm::OnGcmCommandBufferReserveCallback( struct CellGcmContextData *context, uint32_t nReserveCount )
|
|
{
|
|
FillNops(context);
|
|
// IMPORTANT: we only allocate the necessary number of words here, no more no less
|
|
// if we over-allocate, we may end up reordering commands in SPU draw queue following after GCM_FUNC commands
|
|
uint nReserve = nReserveCount;
|
|
uint32 * pDrawQueueCommand = GetDrawQueue()->AllocWords( nReserve + 1 );
|
|
*pDrawQueueCommand = SPUDRAWQUEUE_GCMCOMMANDS_METHOD | nReserve;
|
|
context->begin = context->current = pDrawQueueCommand + 1;
|
|
context->end = context->begin + nReserve;
|
|
if( IsDebug() )
|
|
V_memset( context->current, 0xFE, nReserve * 4 );
|
|
return CELL_OK;
|
|
}
|
|
|
|
|
|
|
|
void CSpuGcm::BeginGcmStateTransaction()
|
|
{
|
|
m_nCurrentBatch = BATCH_GCMSTATE;
|
|
SetCurrentBatchCursor( GetDrawQueue()->GetCursor() );
|
|
}
|
|
|
|
|
|
|
|
// Builds a gcmstateflush SPURS job for the draw queue range
// [pCursorBegin, pCursorEnd) and pushes it via PushSpuGcmJob().
// The job is built and pushed unconditionally; callers decide whether there
// is anything to flush.
//   pDrawQueue                   - queue whose commands are collected into the job's DMA list
//   nResultantSpuDrawQueueSignal - signal handed to the job; its low 2 bits are used as the
//                                  queue index and the remaining bits as a cursor address
//   pCursorBegin, pCursorEnd     - range of the draw queue to upload
void CSpuGcm::PushStateFlushJob( SpuDrawQueue * pDrawQueue, uint nResultantSpuDrawQueueSignal, uint32 *pCursorBegin, uint32 * pCursorEnd )
{
	CellSpursJob128 * pFlushJob = m_jobPool128.Alloc( *m_pRoot->m_pGcmStateFlush );
	job_gcmstateflush::JobParams_t * pParams = job_gcmstateflush::GetJobParams( pFlushJob );
	pFlushJob->header.useInOutBuffer = 1;

	// --- DMA list -----------------------------------------------------------
	CDmaListConstructor dma( pFlushJob->workArea.dmaList );
	dma.AddInputDma( sizeof( g_spuGcmShared ), &g_spuGcmShared ); // dma[0]; must be the first to be 128-byte aligned for atomics
	uint nUploadBytes = pDrawQueue->Collect( pCursorBegin, pCursorEnd, dma );
	Assert( !( nUploadBytes & 3 ) );

	dma.AddSizeInOrInOut( 48 + SPUGCM_LSRING_SIZE ); // 16 bytes for alignment; 16 for lsZero; 16 for lsTemp;
	COMPILE_TIME_ASSERT( sizeof( CPs3gcmTextureLayout::Format_t ) == 16 );
	dma.AddCacheDma( g_nPs3texFormatCount * sizeof( CPs3gcmTextureLayout::Format_t ), g_ps3texFormats ) ;
	dma.FinishIoBuffer( &pFlushJob->header, pParams );

	// --- job parameters -----------------------------------------------------
	pParams->m_nSkipDrawQueueWords = ( uintp( pCursorBegin ) / sizeof( uint32 ) ) & 3;
	pParams->m_nSizeofDrawQueueUploadWords = nUploadBytes / sizeof( uint32 ) ;
	Assert( uint( pParams->m_nSizeofDrawQueueUploadWords ) == nUploadBytes / sizeof( uint32 ) ); // make sure it fits into uint16
	pParams->m_nSpuDrawQueueSignal = nResultantSpuDrawQueueSignal;

	// --- signal bookkeeping -------------------------------------------------
	uint nSignalQueueIndex = nResultantSpuDrawQueueSignal & 3;
	uint32 * pSignalCursor = ( uint32* )( nResultantSpuDrawQueueSignal & ~3 );

#ifdef DBGFLAG_ASSERT
	SpuDrawQueue * pSignalDrawQueue = &m_spuDrawQueues[ nSignalQueueIndex ? 1 : 0 ]; (void)pSignalDrawQueue;
	Assert( pSignalDrawQueue->IsValidCursor( pSignalCursor ) );
#endif

	m_pDeferredChunkSubmittedTill[ nSignalQueueIndex ] = pSignalCursor;

	// --- submit -------------------------------------------------------------
	Assert( CELL_OK == cellSpursCheckJob( (const CellSpursJob256 *)pFlushJob, sizeof( *pFlushJob ), 256 ) );
	m_eaLastJobThatUpdatesSharedState = ( uintp )pFlushJob;

	// hand over the accumulated scratch size request and reset it for the next job
	pFlushJob->header.sizeScratch = m_nGcmFlushJobScratchSize;
	m_nGcmFlushJobScratchSize = 0;
	PushSpuGcmJob( pFlushJob );

	if( SPUGCM_DEBUG_MODE )
	{
		// in SPUGCM_DEBUG_MODE, we execute all jobs and wait for them to complete. So, the GET pointer should always trail our pNext pointer
		Assert( g_spuGcmShared.m_nSpuDrawGet[nSignalQueueIndex] == ( nResultantSpuDrawQueueSignal & ~3 ) );
	}
}
|
|
|
|
|
|
void CSpuGcm::GcmStateFlush( )
|
|
{
|
|
Assert( m_nCurrentBatch == BATCH_GCMSTATE );
|
|
if( IsDeferredDrawQueue() )
|
|
{
|
|
Warning( "Unexpected Flush in deferred spu draw queue\n" );
|
|
OpenDeferredChunk();
|
|
}
|
|
else
|
|
{
|
|
if( GetCurrentBatchCursor() != GetDrawQueue()->GetCursor() )
|
|
{
|
|
FillNops( &PCB_RING_CTX );
|
|
Assert( GetDrawQueue() == &m_spuDrawQueues[0] );
|
|
PushStateFlushJob( &m_spuDrawQueues[0], m_spuDrawQueues[0].GetSignal(), GetCurrentBatchCursor(), GetDrawQueue()->GetCursor() );
|
|
|
|
BeginGcmStateTransaction();
|
|
ZPassCheckpoint( 6 );
|
|
}
|
|
}
|
|
}
|
|
|
|
|
|
// Pushes a job command for pJob through PushSpuGcmJobCommand().
// When SPUGCM_DEBUG_MODE is set, additionally spins (sleeping 60us per poll)
// until pJob->header.eaBinary reads zero (skipped while m_zPass is active)
// and until g_spuGcmShared.m_eaLastJobThatUpdatedMe equals pJob.
void CSpuGcm::PushSpuGcmJob( CellSpursJob128 * pJob )
{
#ifdef _DEBUG
	++m_nJobsPushed;
#endif

	PushSpuGcmJobCommand( CELL_SPURS_JOB_COMMAND_JOB( pJob ) );

	if( !( SPUGCM_DEBUG_MODE ) )
	{
		return;
	}

	if( !m_zPass )
	{
		// in ZPass_Z the job doesn't free its descriptor
		// in ZPass_Render, we don't start the jobs through here
		// so we can't use this spin-wait to wait for the job to complete
		volatile uint64 * pEaBinary = ( volatile uint64* )&pJob->header.eaBinary;
		while( *pEaBinary )
		{
			sys_timer_usleep( 60 );
		}
	}

	const uintp eaThisJob = uintp( pJob );
	while( g_spuGcmShared.m_eaLastJobThatUpdatedMe != eaThisJob )
	{
		sys_timer_usleep( 60 );
	}
}
|
|
|
|
|
|
|
|
// Routes a job command: to m_zPass while m_zPass is active, otherwise to
// m_jobSink via PushSyncJobSync().
void CSpuGcm::PushSpuGcmJobCommand( uint64 nCommand )
{
	if( m_zPass )
	{
		m_zPass.PushCommand( nCommand );
		return;
	}

	m_jobSink.PushSyncJobSync( nCommand );
}
|
|
|
|
|
|
|
|
// While m_zPass is active, verifies that enough resources remain to continue
// and aborts the Z prepass (AbortZPass()) otherwise:
//  - the sub-job-chain must have at least 2 * nReserveSlots free slots
//  - m_jobPool128 must have at least nReserveSlots free job descriptors
// Each warning is emitted at most once (ExecuteOnce). Does nothing when
// m_zPass is not active.
//   nReserveSlots - number of slots the caller wants to keep available
void CSpuGcm::ZPassCheckpoint( uint nReserveSlots )
{
	if( m_zPass )
	{
		uint nFreeSubchainSlots = m_zPass.GetSubchainCapacity();
		if( nFreeSubchainSlots < 2 * nReserveSlots )
		{
			// counters are unsigned: print them with %u rather than %d
			ExecuteOnce( Warning("Aborting Z prepass: not enough room for commands in zpass sub-job-chain (%u left).\n", nFreeSubchainSlots ) );
			AbortZPass(); // initiate Abort sequence of ZPass; reentrant
		}

		// NOTE(review): this second check also runs when the first one has already aborted
		uint nFreeJobDescriptors = m_jobPool128.GetReserve( m_zPass.m_nJobPoolMarker );
		if( nFreeJobDescriptors < nReserveSlots )
		{
			ExecuteOnce( Warning("Aborting Z prepass: not enough room for job descriptors in m_jobPool128 (%u left)\n", nFreeJobDescriptors ) );
			AbortZPass();
		}
	}
}
|
|
|
|
|
|
|
|
void CSpuGcm::OnSetPixelShaderConstant()
|
|
{
|
|
Assert( !IsDeferredDrawQueue() );
|
|
if( m_zPass )
|
|
{
|
|
if( !m_zPass.m_isInEndZPass )
|
|
{
|
|
if( g_pixelShaderPatcher.GetJournalSpaceLeftSince( m_zPass.m_nFpcpStateEndOfJournalIdxAtZPassBegin ) < 512 )
|
|
{
|
|
ExecuteOnce( Warning( "Performance Warning: Too many pixel shader constants set inside ZPass; aborting ZPass\n" ) );
|
|
AbortZPass();
|
|
}
|
|
}
|
|
}
|
|
else
|
|
{
|
|
// we have space for 48kB (3k of constants) in FPCP;
|
|
// every SetPixelShaderConstant may add 97 constants (96 values, 1 header)
|
|
if( g_pixelShaderPatcher.GetJournalSpaceUsedSince( m_nFpcpStateEndOfJournalIdxAtSpuGcmJob ) > ( 32*1024 / 16 ) || g_pixelShaderPatcher.GetJournalSpaceLeftSince( m_nFpcpStateEndOfJournalIdxAtSpuGcmJob ) < 512 )
|
|
{
|
|
ExecuteOnce( Warning("Performance Warning: SetPixelShaderConstantF called for %d constants, but no draw calls were issued. Flushing FPCP state.\n", g_pixelShaderPatcher.GetJournalSpaceUsedSince( m_nFpcpStateEndOfJournalIdxAtSpuGcmJob ) ) );
|
|
// flush GCM with only one purpose: make it flush the patcher
|
|
GetDrawQueue()->Push2( SPUDRAWQUEUE_FLUSH_FPCP_JOURNAL, g_pixelShaderPatcher.GetStateEndOfJournalIdx() );
|
|
GcmStateFlush();
|
|
}
|
|
}
|
|
}
|
|
|
|
// Stall callback registered for the deferred draw queue in UseIoBufferSlack();
// forwards all arguments to g_spuGcm.OnSpuDrawQueueStallDeferred().
void CSpuGcm::OnSpuDrawQueueStallDeferredDelegator( SpuDrawQueue *pDrawQueue, uint32 * pGet, uint nWords )
{
	CSpuGcm & spuGcm = g_spuGcm;
	spuGcm.OnSpuDrawQueueStallDeferred( pDrawQueue, pGet, nWords );
}
|
|
|
|
// Handles a stall of the deferred draw queue.
// Does nothing unless pGet equals m_pDeferredChunkSubmittedTill[1]. In that
// case it temporarily selects the normal queue, executes the deferred
// commands (ExecuteDeferredDrawQueue( 1 ) then ( 0 )), sets
// m_nFramesToDisableDeferredQueue to 1, restarts the deferred segment at
// m_pDeferredQueueCursors[0] and re-selects the deferred queue.
//   pDrawQueue, nWords - not used by this implementation
//   pGet               - get pointer reported by the stalled queue
void CSpuGcm::OnSpuDrawQueueStallDeferred( SpuDrawQueue *pDrawQueue, uint32 * pGet, uint nWords )
{
	// we need to try to wait for the previous deferred batch to finish
	// in any case we should be prepared for "out of space" condition
	// in which case we'll just execute all deferred commands right now
	if( pGet != m_pDeferredChunkSubmittedTill[1] )
	{
		return;
	}

	// we have nothing else to wait for, we need to free the space by executing deferred commands now
	// full flush (this frame only, since previous frame was flushed the first time we called DrawQueueDeferred()
	FillNops( &PCB_RING_CTX ); // switching draw queues, preallocated gcm context no longer usable

	// the only deferred chunk that can resize is GCMFLUSH
	// and handling it is pretty easy: we can either execute whatever it collected so far
	if( m_pDeferredChunkHead )
	{
		// sanity check: we shouldn't have chunks as big as 64KB
		Assert( m_spuDrawQueues[1].Length( m_pCurrentBatchCursor[1], m_pDeferredChunkHead ) <= 64*1024 );
		Assert( *m_pDeferredChunkHead == SPUDRAWQUEUE_DEFERRED_GCMFLUSH_METHOD && m_pDeferredChunkHead == m_pDeferredQueueCursors[0] );
	}

	// temporarily switch to normal queue state in order to replay the deferred queue commands and purge them
	uint32 * pSegmentBeforePurge = m_pDeferredQueueSegment;
	m_nSpuDrawQueueSelector = 0;
	Assert( m_pCurrentBatchCursor[0] == m_spuDrawQueues[0].GetCursor() );
	BeginGcmStateTransaction(); // this transaction is beginning in Normal draw queue; Deferred queue is currently in "frozen" state (almost out of memory)

	g_flipHandler.QmsAdviceBeforeDrawPrevFramebuffer();

	// flush previous frame first, and if it doesn't change Get , flush this frame
	ExecuteDeferredDrawQueue( 1 );
	extern void DxDeviceForceUpdateRenderTarget( );
	DxDeviceForceUpdateRenderTarget( ); // recover main render target, as it was screwed up by execution of previous frame's commands
	ExecuteDeferredDrawQueue( 0 );
	m_nFramesToDisableDeferredQueue = 1;

	// return to the deferred state after purging the queue. During purging the deferred queue, DrawQueue(Normal|Deferred) could not have been called
	// this "unfreezes" the deferred queue, which should by now be almost-all-free( or pending, depending on how fast SPUs will chew through it)
	Assert( m_pDeferredQueueSegment == pSegmentBeforePurge );

	// we executed up to this point (last opened chunk), we discard everything before it.
	// the last opened chunk is perfectly fine to begin the queue segment, so we pretend we began deferred queue there
	m_pDeferredQueueSegment = m_pDeferredQueueCursors[0];

	m_nSpuDrawQueueSelector = 1;
}
|
|
|
|
|
|
|
|
// Flush callback registered for the deferred draw queue: resets the queue's
// flush watermark from its current cursor, then opens a new deferred chunk
// on g_spuGcm (breaking up long GCM chunks).
void CSpuGcm::OnSpuDrawQueueFlushDeferred( SpuDrawQueue *pDrawQueue )
{
	CSpuGcm & spuGcm = g_spuGcm;

	Assert( pDrawQueue == spuGcm.GetDrawQueue() );
	// this is the only chunk we allocate incrementally
	Assert( !spuGcm.m_pDeferredChunkHead || ( *spuGcm.m_pDeferredChunkHead & ~SPUDRAWQUEUE_DEFERRED_GCMFLUSH_MASK ) == SPUDRAWQUEUE_DEFERRED_GCMFLUSH_METHOD );

	// prevent this from being called recursively: reset flush watermark before doing anything else
	uint32 * pQueueCursor = pDrawQueue->GetCursor();
	pDrawQueue->SetFlushWatermarkFrom( pQueueCursor );

	spuGcm.OpenDeferredChunk();
}
|
|
|
|
// Stall callback registered for the normal draw queue (queue 0): sleeps
// briefly and optionally logs a warning through StallAndWarning().
//   pGet, nWords - not used by this implementation
void CSpuGcm::OnSpuDrawQueueStall( SpuDrawQueue *pDrawQueue, uint32 * pGet, uint32 nWords )
{
	Assert( &g_spuGcm.m_spuDrawQueues[0] == pDrawQueue );

	StallAndWarning( "SpuDrawQueue stall: PPU is waiting for SPU, and SPU is probably waiting for RSX\n" );
}
|
|
|
|
// Flush callback registered for the normal draw queue: flushes the GCM state
// batch on g_spuGcm. The queue passed in is expected to be the current one.
void CSpuGcm::OnSpuDrawQueueFlush( SpuDrawQueue *pDrawQueue )
{
	CSpuGcm & spuGcm = g_spuGcm;

	Assert( pDrawQueue == spuGcm.GetDrawQueue() );
	spuGcm.GcmStateFlush();
}
|
|
|
|
|
|
|
|
void CSpuGcm::OnSpuDrawQueueFlushInZPass()
|
|
{
|
|
//
|
|
// flush watermark has changed now (it changes on every collect())
|
|
// override flush watermark to flush before we reach ZPass cursor,
|
|
// and if it's impossible, then Abort ZPass - we don't have enough space
|
|
// in SPU GCM buffer
|
|
//
|
|
// Take care not to flush excessively when pusing the last few commands into
|
|
// SPUGCM draw buffer because we can be doing that right around flush watermark
|
|
// frequently
|
|
//
|
|
|
|
uint32 * pOldFlushWatermark = GetDrawQueue()->GetFlushWatermark();
|
|
|
|
GcmStateFlush();
|
|
|
|
uint32 * pNewFlushWatermark = GetDrawQueue()->GetFlushWatermark();
|
|
|
|
if( pNewFlushWatermark < pOldFlushWatermark ? pNewFlushWatermark >= m_zPass.m_pCursor || pOldFlushWatermark <= m_zPass.m_pCursor : pOldFlushWatermark <= m_zPass.m_pCursor && m_zPass.m_pCursor <= pNewFlushWatermark )
|
|
{
|
|
// the next flush will be too late;
|
|
// NOTE: we can recover up to 32KB by adjusting the flush watermark here, but I have bigger fish to fry, so we'll just abort ZPass right now and here
|
|
AbortZPass();
|
|
}
|
|
}
|
|
|
|
// Callback-shaped overload: checks that the queue is the current one and
// forwards to g_spuGcm.OnSpuDrawQueueFlushInZPass().
void CSpuGcm::OnSpuDrawQueueFlushInZPass( SpuDrawQueue *pDrawQueue )
{
	CSpuGcm & spuGcm = g_spuGcm;

	// TODO: check if cursor is intersected and potentially EndZPass()
	Assert( pDrawQueue == spuGcm.GetDrawQueue() );
	spuGcm.OnSpuDrawQueueFlushInZPass();
}
|
|
|
|
void SpuGcmCommandBufferFlush()
|
|
{
|
|
g_spuGcm.CmdBufferFlush();
|
|
}
|
|
|
|
|
|
|
|
// Starts a draw batch and returns the header the caller fills in.
//  - Deferred queue: opens a SPUDRAWQUEUE_DEFERRED_DRAW_METHOD chunk with room
//    for the header, a vertex declaration pointer and 3 extra words, and
//    aligns the header to 16 bytes inside it.
//  - Normal queue: flushes pending GCM state first, then allocates an aligned
//    header from the draw queue.
// In both cases the current batch becomes BATCH_DRAW and the batch cursor is
// set to the returned header.
SpuDrawHeader_t * CSpuGcm::BeginDrawBatch()
{
	SpuDrawHeader_t * pHeader = NULL;

	if( !IsDeferredDrawQueue() )
	{
		GcmStateFlush();
		// we must be in the default batch transaction, and it must be empty so that we can switch the transaction type
		Assert( m_nCurrentBatch == BATCH_GCMSTATE && GetCurrentBatchCursor() == GetDrawQueue()->GetCursor() );
		pHeader = GetDrawQueue()->AllocAligned<SpuDrawHeader_t>();
	}
	else
	{
		uintp eaChunk = ( uintp ) OpenDeferredChunk( SPUDRAWQUEUE_DEFERRED_DRAW_METHOD, 3 + ( sizeof( SpuDrawHeader_t ) + sizeof( IDirect3DVertexDeclaration9 * /*pVertDecl*/ ) ) / sizeof( uint32 ) );
		pHeader = ( SpuDrawHeader_t * ) AlignValue( eaChunk, 16 );
	}

	m_nCurrentBatch = BATCH_DRAW;
	Assert( GetDrawQueue()->IsValidCursor( (uint32*)( pHeader + 1 ) ) );
	SetCurrentBatchCursor( ( uint32* ) pHeader );

	return pHeader;
}
|
|
|
|
|
|
|
|
// Allocates a 128-byte SPURS job descriptor from m_jobPool128 (initialized from the DrawIndexedPrimitive
// job header), builds its DMA list and pushes it with PushSpuGcmJob().
//   nResultantSpuDrawQueueSignal - cursor address with the draw queue slot index packed into its low 2 bits
//   pDrawHeader                  - draw header DMA'd into the job; bookkeeping fields are filled in here
//   pVertDecl                    - vertex declaration DMA'd into the job
//   pIbMarkup                    - optional index buffer markup; no markup DMA is added when NULL
// Returns the job descriptor that was pushed.
CellSpursJob128 * CSpuGcm::PushDrawBatchJob( uint nResultantSpuDrawQueueSignal, SpuDrawHeader_t * pDrawHeader, IDirect3DVertexDeclaration9 *pVertDecl, OptimizedModel::OptimizedIndexBufferMarkupPs3_t *pIbMarkup )
{
	// split the signal into its two packed components
	const uint nSignalQueueIndex = nResultantSpuDrawQueueSignal & 3;
	const uint nSignalCursor = nResultantSpuDrawQueueSignal & ~3;

	CellSpursJob128 * pJob = m_jobPool128.Alloc( *m_pRoot->m_pDrawIndexedPrimitive );
	pJob->header.useInOutBuffer = 1;
	// we'll DMA get textures and layouts inside the job; we'll need space for DMA elements to do so
	pJob->header.sizeScratch = AlignValue( sizeof( JobDrawIndexedPrimitiveScratch_t ), 128 ) / 16;

	CDmaListConstructor dmaList( pJob->workArea.dmaList );
	dmaList.AddInputDma( sizeof( g_spuGcmShared ), &g_spuGcmShared ); // dma[0]; must be the first to be 128-byte aligned for atomics
	dmaList.AddInputDma( sizeof( *pVertDecl ), pVertDecl ); // dma[1]
	dmaList.AddInputDma( sizeof( *pDrawHeader ), pDrawHeader ); // dma[2]

	COMPILE_TIME_ASSERT( sizeof( g_spuGcmShared ) < 16 * 1024 && sizeof( *pVertDecl ) < 16 * 1024 && sizeof( *pDrawHeader ) < 16 * 1024 );

	// pIbMarkup = pDrawHeader->m_eaIbMarkup;
	if ( pIbMarkup )
	{
		// header plus all partitions; the DMA starts at the 16-byte boundary at or below pIbMarkup,
		// so the size is padded to still cover the whole markup
		uint nMarkupBytes = ( pIbMarkup->m_numPartitions * sizeof( OptimizedModel::OptimizedIndexBufferMarkupPs3_t::Partition_t ) + sizeof( OptimizedModel::OptimizedIndexBufferMarkupPs3_t ) );
		dmaList.AddInputDma( ( nMarkupBytes + 31 ) & -16, ( const void* )( uintp( pIbMarkup ) & -16 ) ); // dma[3]
	}

	//dmaList.AddInputDmaLarge( SPUGCM_LSRING_SIZE, nUsefulBytesAligned, PCB_RING_CTX.begin ); // dma[4,5,6,7]
	dmaList.AddSizeInOrInOut( SPUGCM_LSRING_SIZE );
	COMPILE_TIME_ASSERT( SPUGCM_LSRING_SIZE / (16*1024) <= 4 );

	// usage of the IO buffer slack:
	// alignment, sync signal, wrap sequence, alignment, RSX PUT control register output, SPURS job command output
	dmaList.AddSizeInOrInOut(
		128 // potential misalignment of command buffer, for double-bandwidth DMA to command buffer (not used now)
		+ sizeof( SysringWrapSequence ) // is it accounted for in the LSRING_SLACK?
		+ 16 // lsResetDrawBatch
		+ 16 // lsTempRsxPut
		+ 16 // g_lsDummyRead
		);

	COMPILE_TIME_ASSERT( sizeof( CPs3gcmTextureLayout::Format_t ) == 16 );
	dmaList.AddCacheDma( g_nPs3texFormatCount * sizeof( CPs3gcmTextureLayout::Format_t ), g_ps3texFormats ) ; // dma[8]
	dmaList.FinishIoBuffer( &pJob->header );
	pJob->header.sizeStack = 16 * 1024 / 16;

	pDrawHeader->m_nPs3texFormatCount = g_nPs3texFormatCount; // for reference; is not strictly needed here
	pDrawHeader->m_nUsefulCmdBytes = 0;//nUsefulBytes;
	pDrawHeader->m_nPcbringBegin = 0;//m_nPcbringBegin; // note: this is the post-updated buffer counter!
	pDrawHeader->m_nResultantSpuDrawGet = nResultantSpuDrawQueueSignal;

#ifdef DBGFLAG_ASSERT
	SpuDrawQueue * pSignalDrawQueue = &m_spuDrawQueues[ nSignalQueueIndex ? 1 : 0 ];(void)pSignalDrawQueue;
	Assert( pSignalDrawQueue->IsValidCursor( (uint32*)( nSignalCursor ) ) );
#endif
	// remember how far the deferred chunks have been submitted for this slot
	m_pDeferredChunkSubmittedTill[ nSignalQueueIndex ] = ( uint32* )( nSignalCursor );

	Assert( CELL_OK == cellSpursCheckJob( (const CellSpursJob256 *)pJob, sizeof( *pJob ), 256 ) );
	m_eaLastJobThatUpdatesSharedState = ( uintp )pJob;
	//PCB_RING_CTX.begin = PCB_RING_CTX.current = pSkipTo; // submitted; now when needed, we'll wait for SPU to reply through shared state
	//Assert( PCB_RING_CTX.begin <= PCB_RING_CTX.end );

	PushSpuGcmJob( pJob );

	// after this job runs, it spawns FPCP job, which will advance the FPCP state
	m_nFpcpStateEndOfJournalIdxAtSpuGcmJob = g_pixelShaderPatcher.GetStateEndOfJournalIdx();

	if( SPUGCM_DEBUG_MODE )
	{
		// in SPUGCM_DEBUG_MODE, we execute all jobs and wait for them to complete. So, the GET pointer should always trail our pNext pointer
		Assert( g_spuGcmShared.m_nSpuDrawGet[ nSignalQueueIndex ] == ( nSignalCursor ) );
	}

	return pJob;
}
|
|
|
|
|
|
// BUG: pVertDecl may be released right after this call, we need to copy it somewhere or addref
|
|
void CSpuGcm::SubmitDrawBatch( IDirect3DVertexDeclaration9 *pVertDecl, OptimizedModel::OptimizedIndexBufferMarkupPs3_t *pIbMarkup )
|
|
{
|
|
Assert( m_nCurrentBatch == BATCH_DRAW );
|
|
SpuDrawHeader_t * pDrawHeader = ( SpuDrawHeader_t * )GetCurrentBatchCursor();
|
|
|
|
if ( pIbMarkup )
|
|
{
|
|
Assert( pIbMarkup->kHeaderCookie == pIbMarkup->m_uiHeaderCookie );
|
|
// real markup exists in this index buffer
|
|
pDrawHeader->m_eaIbMarkup = pIbMarkup;
|
|
pDrawHeader->m_nIbMarkupPartitions = pIbMarkup->m_numPartitions;
|
|
}
|
|
else
|
|
{
|
|
pDrawHeader->m_eaIbMarkup = NULL;
|
|
pDrawHeader->m_nIbMarkupPartitions = 0;
|
|
}
|
|
|
|
if( IsDeferredDrawQueue() )
|
|
{
|
|
*( ( IDirect3DVertexDeclaration9 ** )( pDrawHeader + 1 ) ) = pVertDecl;
|
|
OpenDeferredChunk();
|
|
m_nCurrentBatch = BATCH_GCMSTATE;
|
|
ValidateDeferredQueue();
|
|
}
|
|
else
|
|
{
|
|
PushDrawBatchJob( GetDrawQueue()->GetSignal(), pDrawHeader, pVertDecl, pIbMarkup );
|
|
|
|
BeginGcmStateTransaction();
|
|
ZPassCheckpoint( 8 );
|
|
|
|
if ( SPUGCM_DEBUG_MODE )
|
|
{
|
|
GCM_FUNC( cellGcmSetWriteBackEndLabel, GCM_LABEL_DEBUG0, (uint)pDrawHeader );
|
|
CmdBufferFinish();
|
|
volatile uint32 * pDebugLabel = cellGcmGetLabelAddress( GCM_LABEL_DEBUG0 );
|
|
while( *pDebugLabel != ( uint ) pDrawHeader )
|
|
{
|
|
// this may happen due to latency , but it won't be an infinite loop
|
|
//Msg( "Hmmmm... WriteLabel; Finish(); but label isn't set yet! 0x%X != 0x%X\n", *pDebugLabel, (uint)pDrawHeader );
|
|
continue;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
|
|
bool ZPass::CanBegin( )
|
|
{
|
|
if( m_pCursor )
|
|
{
|
|
return false; // already begun
|
|
}
|
|
|
|
// we need at least some memory to store the job descriptor pointers
|
|
if( GetSubchainCapacity( ) < 32 )
|
|
{
|
|
Warning( "Cannot begin ZPass: zpass job subchain buffer is full\n" );
|
|
return false;
|
|
}
|
|
|
|
// we need a buffer in spuDrawQueue to store "ZPass begin, switch, end" commands
|
|
// we may potentially need the space to store the whole state before ZPass, too
|
|
|
|
return true;
|
|
}
|
|
|
|
|
|
// Marks the Z pass as started at pCursor and opens a new job subchain at the current put position.
// The first subchain slot is set to JTS; PushCommand() later replaces it with LWSYNC.
void ZPass::Begin( uint32 * pCursor )
{
	m_pCursor = pCursor;

	// the subchain starts at the current put position
	m_nDrawPassSubchain = m_nPut;
	m_pSubchain = GetCurrentCommandPtr();
	*m_pSubchain = CELL_SPURS_JOB_COMMAND_JTS;

	// snapshot of the pixel shader patcher journal position at the time the Z pass begins
	m_nFpcpStateEndOfJournalIdxAtZPassBegin = g_pixelShaderPatcher.GetStateEndOfJournalIdx();
}
|
|
|
|
// Appends nCommand to the job subchain. Three consecutive slots are involved:
//   [put]   currently holds the JTS written earlier; it becomes LWSYNC as the very last step
//   [put+1] receives nCommand
//   [put+2] receives a fresh JTS that terminates the chain again
// The new JTS and the command are written first and ordered with __lwsync() before the old JTS is replaced.
void ZPass::PushCommand( uint64 nCommand )
{
	Validate();
	Assert( GetSubchainCapacity() > 2 );

	uint64 * pOldJtsSlot = GetCurrentCommandPtr();
	m_nPut++;
	uint64 * pCommandSlot = GetCurrentCommandPtr();
	m_nPut++;
	uint64 * pNewJtsSlot = GetCurrentCommandPtr();
	Validate();

	*pNewJtsSlot = CELL_SPURS_JOB_COMMAND_JTS;
	*pCommandSlot = nCommand;
	__lwsync();
	*pOldJtsSlot = CELL_SPURS_JOB_COMMAND_LWSYNC; // release the previous JTS
}
|
|
|
|
|
|
|
|
bool CSpuGcm::BeginZPass( )
|
|
{
|
|
if( !IsDeferredDrawQueue() && m_zPass.CanBegin() )
|
|
{
|
|
// debug - do not checkin
|
|
// while( g_pixelShaderPatcher.GetJournalSpaceLeftSince( g_spuGcmShared.m_nFpcpStartRangesAfterLastSync ) > 20 )
|
|
// {
|
|
// g_pixelShaderPatcher.SetFragmentRegisterBlock(95, 1, (const float*)&g_spuGcmShared.m_eaFpcpSharedState->m_reg[95] );
|
|
// }
|
|
if( m_nFpcpStateEndOfJournalIdxAtSpuGcmJob != g_pixelShaderPatcher.GetStateEndOfJournalIdx() )
|
|
{
|
|
GetDrawQueue()->Push2( SPUDRAWQUEUE_FLUSH_FPCP_JOURNAL, g_pixelShaderPatcher.GetStateEndOfJournalIdx() );
|
|
}
|
|
|
|
|
|
// this is where we start commands that we'll need to replay
|
|
uint32 * pCursorBegin = GetDrawQueue()->GetCursor();
|
|
uint nSafetyBufferWords = 4 ; // buffer so that when we come around, we can insert EndZPostPass method command (at least 3 words)
|
|
uint nCommandWords = 2 // command : the command and EA of ZPassSavedState_t
|
|
+ nSafetyBufferWords
|
|
+ 4 // alignment buffer for ZPassSavedState_t
|
|
+ sizeof( ZPassSavedState_t );
|
|
m_zPass.m_nJobPoolMarker = m_jobPool128.GetMarker();
|
|
|
|
uint32 * pCmdBeginZPrepass = GetDrawQueue()->AllocWords( nCommandWords );
|
|
pCmdBeginZPrepass[0] = SPUDRAWQUEUE_BEGINZPREPASS_METHOD | ( SPUDRAWQUEUE_BEGINZPREPASS_MASK & nCommandWords );
|
|
ZPassSavedState_t * pSavedState = ( ZPassSavedState_t * )AlignValue( uintp( pCmdBeginZPrepass + 2 + nSafetyBufferWords ), 16 );
|
|
pCmdBeginZPrepass[1] = ( uintp )pSavedState;
|
|
m_zPass.m_pSavedState = pSavedState;
|
|
|
|
//
|
|
// WARNING.
|
|
//
|
|
// SPUDRAWQUEUE_BEGINZPREPASS_METHOD must be the last method that modifies g_spuGcmShared.m_dxGcmState in a job_gcmflush SpuDrawQueue.
|
|
// This is because its implementation doesn't wait for DMA put to finish.
|
|
//
|
|
|
|
GCM_PERF_PUSH_MARKER( "ZPass_Z" );
|
|
CmdBufferFlush();
|
|
|
|
// actually begin; don't let anyone overwrite the commands after cursor
|
|
m_zPass.Begin( pCursorBegin );
|
|
GetDrawQueue()->PushFlushCallback( OnSpuDrawQueueFlushInZPass );
|
|
PushSpuGcmCallSubchain( m_zPass.m_pSubchain ); // call all those SPUGCM jobs for the first time
|
|
return true;
|
|
}
|
|
else
|
|
return false;
|
|
}
|
|
|
|
// Queues a single-word predication command; the mask bits are limited to SPUDRAWQUEUE_PREDICATION_MASK.
void CSpuGcm::SetPredication( uint nPredicationMask ) // D3DPRED_* mask
{
	uint32 * pWord = GetDrawQueue()->AllocWords( 1 );
	pWord[0] = SPUDRAWQUEUE_PREDICATION_METHOD | ( SPUDRAWQUEUE_PREDICATION_MASK & nPredicationMask );
}
|
|
|
|
|
|
|
|
|
|
void CSpuGcm::EndZPass( bool bPopMarker )
|
|
{
|
|
if( m_zPass && !m_zPass.m_isInEndZPass )
|
|
{
|
|
m_zPass.m_isInEndZPass = 1;
|
|
GetDrawQueue()->PopFlushCallback();
|
|
|
|
// as a precaution, since we don't need watermark-flush callbacks for the duration of this function, we'll disable it to avoid recursive flushes
|
|
GetDrawQueue()->PushFlushCallback( OnSpuDrawQueueFlushDoNothing );
|
|
|
|
// flush whatever state we may have.. it's not really needed to replay it twice, but whatever. we do need to replay it the 2nd time, and we can't just skip on it easily now in the 1st pass
|
|
CmdBufferFlush();
|
|
m_zPass.PushCommand( CELL_SPURS_JOB_COMMAND_RET );
|
|
m_zPass.End(); // at this point, there's no more "Z prepass". There's just a bunch of SPUGCM commands waiting to be executed
|
|
|
|
// replay from cursor
|
|
uint32 * pCmdEndZPrepass = GetDrawQueue()->AllocWords( 2 );
|
|
//m_nGcmFlushJobScratchSize = MAX( m_nGcmFlushJobScratchSize, CELL_GCM_VTXPRG_MAX_CONST );
|
|
pCmdEndZPrepass[0] = SPUDRAWQUEUE_ENDZPREPASS_METHOD;
|
|
pCmdEndZPrepass[1] = ( uintp )m_zPass.m_pSavedState;
|
|
if( bPopMarker )
|
|
{
|
|
GCM_PERF_POP_MARKER( /*"ZPass_Z"*/ );
|
|
GCM_PERF_MARKER( "ZPass_ZEnd" );
|
|
}
|
|
else
|
|
{
|
|
GCM_PERF_MARKER( "ZPass_Abort" );
|
|
}
|
|
|
|
CmdBufferFlush(); // commit the "End Z Prepass" command. NOTE: we don't want to commit it twice, so we End ZPass BEFORE we commit this command
|
|
|
|
// even though Z Prepass is ended now, all those commands and their memory are still intact
|
|
// re-execute them here now
|
|
PushSpuGcmCallSubchain( m_zPass.m_pSubchain ); // call all those SPUGCM jobs again!
|
|
|
|
GetDrawQueue()->PopFlushCallback();
|
|
// SPUGCM ring release point: after this point, we can simply wait for more space to become available in SPUGCM draw command ring
|
|
|
|
// Do we need to really end the render pass?
|
|
// Hopefully not, because hopefully it'll just organically be indistinguishable from the non-Z-prepassed rendering
|
|
|
|
uint32 * pCmdEndZPostPass = GetDrawQueue()->AllocWords( 3 );
|
|
pCmdEndZPostPass[0] = SPUDRAWQUEUE_ENDZPOSTPASS_METHOD;
|
|
pCmdEndZPostPass[1] = m_zPass.m_nPut;
|
|
pCmdEndZPostPass[2] = (uintp)&m_zPass.m_nGet;
|
|
GCM_PERF_MARKER( bPopMarker ? "ZPass_RenderEnd" : "AbortedZPass_RenderEnd" );
|
|
CmdBufferFlush();
|
|
|
|
m_zPass.m_isInEndZPass = 0;
|
|
|
|
}
|
|
else
|
|
{
|
|
if( bPopMarker )
|
|
{
|
|
GCM_PERF_POP_MARKER( );
|
|
}
|
|
}
|
|
}
|
|
|
|
|
|
|
|
void ZPass::Init()
|
|
{
|
|
m_nDummy = 0;
|
|
m_pCursor = NULL;
|
|
m_nJobs = 2048;
|
|
m_pJobs = (uint64*)MemAlloc_AllocAligned( ( m_nJobs + 1 )* sizeof( uint64 ), 16 );
|
|
m_pJobs[m_nJobs] = CELL_SPURS_JOB_COMMAND_NEXT( m_pJobs );
|
|
m_nGet = 0;
|
|
m_nPut = 0;
|
|
m_isInEndZPass = 0;
|
|
}
|
|
|
|
void ZPass::Shutdown()
|
|
{
|
|
MemAlloc_FreeAligned( m_pJobs );
|
|
}
|
|
|
|
|
|
//#endif
|
|
|
|
// Passed as the second argument to g_spuGcmShared.m_edgeJobChain.Init() in CSpuGcm::OnVjobsInit();
// presumably the max SPU contention of the edge job chain - confirm against the job chain Init() signature.
uint g_nEdgeJobChainMaxContention = 5;
|
|
|
|
void CSpuGcm::OnVjobsInit()
|
|
{
|
|
int nJobPoolCount = Max<uint>( 256, g_spuGcmShared.m_fpcpRing.GetMaxJobsPerSegment() * 4 );
|
|
int nCmdLineJobPoolCount = CommandLine()->ParmValue( "-spugcmJobPool", nJobPoolCount );
|
|
if( nCmdLineJobPoolCount > nJobPoolCount && !( nCmdLineJobPoolCount & ( nCmdLineJobPoolCount - 1 ) ) )
|
|
{
|
|
Msg("Increasing spugcm cjob pool count from %d to %d\n", nJobPoolCount, nCmdLineJobPoolCount );
|
|
nJobPoolCount = nCmdLineJobPoolCount;
|
|
}
|
|
// priority lower than the main job queue, in order to yield
|
|
if( int nError = m_jobSink.Init( m_pRoot, 1, nJobPoolCount, ( uint8_t* )&m_pRoot->m_nSpugcmChainPriority, "spugcm", DMATAG_GCM_JOBCHAIN ) )
|
|
{
|
|
Error( "Cannot init SpuGcm, cell error %d\n", nError );
|
|
}
|
|
|
|
COMPILE_TIME_ASSERT( sizeof( job_edgegeom::JobDescriptor_t ) == 512 );
|
|
if( int nError = g_spuGcmShared.m_edgeJobChain.Init( m_pRoot, g_nEdgeJobChainMaxContention, 128, ( uint8_t* )&m_pRoot->m_nEdgeChainPriority, sizeof( job_edgegeom::JobDescriptor_t ), CELL_SPURS_JOBQUEUE_DEFAULT_MAX_GRAB, "edge", DMATAG_EDGE_JOBCHAIN ) )
|
|
{
|
|
Error(" Cannot init SpuGcm, edge jobchain, error %d\n", nError );
|
|
}
|
|
|
|
if( int nError = g_spuGcmShared.m_fpcpJobChain.Init( m_pRoot, 1, 512, ( uint8_t* )&m_pRoot->m_nFpcpChainPriority, 128, CELL_SPURS_JOBQUEUE_DEFAULT_MAX_GRAB, "fpcp", DMATAG_FPCP_JOBCHAIN ) )
|
|
{
|
|
Error(" Cannot init SpuGcm, fpcp jobchain, error %d\n", nError );
|
|
}
|
|
|
|
if( nJobPoolCount < g_spuGcmShared.m_fpcpRing.GetMaxJobsPerSegment() * 4 ) // we need at least this much to avoid at least most stalls
|
|
{
|
|
Error( "Job pool count %d is too small! With %d jobs per segment, make it at least %d\n", nJobPoolCount, g_spuGcmShared.m_fpcpRing.GetMaxJobsPerSegment(), g_spuGcmShared.m_fpcpRing.GetMaxJobsPerSegment() * 4 );
|
|
}
|
|
|
|
|
|
|
|
m_jobPool128.Init( nJobPoolCount );
|
|
g_spuGcmShared.m_jobPoolEdgeGeom.Init( 128 );
|
|
|
|
g_spuGcmShared.m_jobFpcPatch2 = *( m_pRoot->m_pFpcPatch2 );
|
|
g_spuGcmShared.m_jobEdgeGeom = *( m_pRoot->m_pEdgeGeom );
|
|
|
|
if( m_pMlaaBuffer )
|
|
{
|
|
g_edgePostWorkload.OnVjobsInit( m_pRoot );
|
|
}
|
|
}
|
|
|
|
|
|
#if 0 // priorities test
// Runs the job chain, pushes the test job onto it and spins until m_notify.m_nCopyTo becomes non-zero.
// Prints the tick deltas on success; prints HANG and returns false when the wait exceeds 79800000 * 5 ticks.
bool PriorityTest_t::Test( class VjobChain4 *pJobChain )
{
	m_notify.m_nCopyFrom = 1;
	m_notify.m_nCopyTo = 0;

	uint nTickBeforeRun = __mftb();
	pJobChain->Run();
	uint nTickAfterRun = __mftb();
	*( pJobChain->Push() ) = CELL_SPURS_JOB_COMMAND_JOB( &m_job );
	uint nTickPushed = __mftb();

	uint nTickNow;
	do
	{
		nTickNow = __mftb();
		if( nTickNow - nTickPushed > 79800000 * 5 )
		{
			Msg("%s:HANG\n", pJobChain->GetName());
			return false;
		}
	}
	while( !*(volatile uint32*)&m_notify.m_nCopyTo );

	Msg("%s[%d]:%5.0f+%5.0f(run=%5.0f)\n", pJobChain->GetName(), m_notify.m_nSpuId, (nTickPushed-nTickAfterRun)*40.1f, (nTickNow-nTickPushed)*40.1f, (nTickAfterRun - nTickBeforeRun) * 40.1f );
	return true;
}


// Runs PriorityTest_t::Test() 50 times on the edge and fpcp job chains, stopping at the first failure.
void CSpuGcm::TestPriorities()
{
	PriorityTest_t * pTest = (PriorityTest_t*)MemAlloc_AllocAligned( sizeof( PriorityTest_t ), 128 );

	V_memset( &pTest->m_job, 0, sizeof( pTest->m_job ) );
	pTest->m_job.header = *(m_pRoot->m_pJobNotify);
	pTest->m_job.header.useInOutBuffer = 1;
	AddInputDma( &pTest->m_job, sizeof( pTest->m_notify ), &pTest->m_notify );
	pTest->m_job.workArea.userData[1] = 0; // function: default

	for( uint nIteration = 0; nIteration < 50; ++nIteration )
	{
		// NOTE(review): on failure pTest is not freed; a hung job may still reference it - confirm before "fixing" the leak
		if( !pTest->Test( &g_spuGcmShared.m_edgeJobChain ) )
			return ; // leak
		if( ! pTest->Test( &g_spuGcmShared.m_fpcpJobChain ) )
			return ; // leak
	}

	MemAlloc_FreeAligned( pTest );
}
#endif
|
|
|
|
void CSpuGcm::OnVjobsShutdown() // gets called before m_pRoot is about to be destructed and NULL'ed
|
|
{
|
|
CmdBufferFinish();
|
|
g_edgePostWorkload.OnVjobsShutdown( m_pRoot );
|
|
|
|
// in case of priority issues with job chains (when experimenting with reload_vjobs), let's first end and then join all workloads
|
|
m_jobSink.End();
|
|
g_spuGcmShared.m_fpcpJobChain.End();
|
|
g_spuGcmShared.m_edgeJobChain.End();
|
|
|
|
m_jobSink.Join();
|
|
g_spuGcmShared.m_fpcpJobChain.Join();
|
|
g_spuGcmShared.m_edgeJobChain.Join();
|
|
|
|
m_jobPool128.Shutdown();
|
|
|
|
g_spuGcmShared.m_jobPoolEdgeGeom.Shutdown();
|
|
}
|
|
|
|
void CSpuGcm::Shutdown()
|
|
{
|
|
g_pVJobs->Unregister( this ); // note: this will also call VjobsShutdown, which will join all SPU workloads and effectively call CmdBufferFinish();
|
|
g_edgeGeomRing.Shutdown();
|
|
if( m_pPcbringBuffer )
|
|
{
|
|
MemAlloc_FreeAligned( m_pPcbringBuffer );
|
|
}
|
|
m_spuDrawQueues[1].Shutdown();
|
|
m_spuDrawQueues[0].Shutdown();
|
|
#if SPU_GCM_DEBUG_TRACE
|
|
MemAlloc_FreeAligned( g_spuGcmShared.m_eaDebugTraceBuffer );
|
|
#endif
|
|
m_zPass.Shutdown();
|
|
|
|
for( uint i = 0; i < ARRAYSIZE( m_pDeferredStates ); ++i )
|
|
{
|
|
g_ps3gcmGlobalState.IoSlackFree( m_pDeferredStates[i] );
|
|
}
|
|
}
|
|
|
|
|
|
void CSpuGcm::BeginScene()
|
|
{
|
|
DrawQueueNormal();
|
|
if( m_nFramesToDisableDeferredQueue > 0 )
|
|
{
|
|
m_nFramesToDisableDeferredQueue-- ;
|
|
}
|
|
}
|
|
|
|
|
|
void CSpuGcm::EndScene()
|
|
{
|
|
g_aici.m_nCpuActivityMask = g_edgeGeomRing.m_nUsedSpus;
|
|
g_edgeGeomRing.m_nUsedSpus = 0;
|
|
|
|
g_aici.m_nDeferredWordsAllocated = m_spuDrawQueues[1].m_nAllocWords - m_nDeferredQueueWords;
|
|
m_nDeferredQueueWords = m_spuDrawQueues[1].m_nAllocWords;
|
|
|
|
|
|
if( m_zPass )
|
|
{
|
|
ExecuteNTimes( 100, Warning( "SpuGcm:EndScene must Abort ZPass; mismatched BeginZPass/EndZPass\n" ) );
|
|
AbortZPass();
|
|
}
|
|
|
|
if( g_spuGcmShared.m_enableStallWarnings )
|
|
{
|
|
|
|
if( m_jobPool128.m_nWaitSpins > 100 )
|
|
{
|
|
if( g_spuGcmShared.m_enableStallWarnings )
|
|
{
|
|
Warning( "SpuGcm: %d spins in job pool, PPU is really ahead of SPU and (probably) RSX.\n", m_jobPool128.m_nWaitSpins );
|
|
}
|
|
}
|
|
m_jobPool128.m_nWaitSpins = 0;
|
|
|
|
/*
|
|
if( m_jobPool256.m_nWaitSpins )
|
|
{
|
|
if( g_spuGcmShared.m_enableStallWarnings )
|
|
{
|
|
Warning( "SpuGcm: %d spins in job pool 256, PPU is really ahead of SPU and (probably) RSX.\n", m_jobPool256.m_nWaitSpins );
|
|
}
|
|
m_jobPool256.m_nWaitSpins = 0;
|
|
}
|
|
*/
|
|
|
|
if( m_nPcbringWaitSpins > 100 )
|
|
{
|
|
if( g_spuGcmShared.m_enableStallWarnings )
|
|
{
|
|
Warning( "SpuGcm: %d spins in PcbRing, PPU is waiting for SPU (possibly) waiting for RSX\n", m_nPcbringWaitSpins );
|
|
}
|
|
}
|
|
m_nPcbringWaitSpins = 0;
|
|
}
|
|
m_nFrame++;
|
|
|
|
COMPILE_TIME_ASSERT( ARRAYSIZE( m_pDeferredStates ) == 2 ); // we need to rotate the array if it's not 2-element
|
|
Swap( m_pDeferredStates[0], m_pDeferredStates[1] );
|
|
|
|
extern ConVar r_ps3_mlaa;
|
|
m_bUseDeferredDrawQueue = m_pMlaaBuffer && !( r_ps3_mlaa.GetInt() & 16 );
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
void CSpuGcm::CmdBufferFinish()
|
|
{
|
|
#ifdef CELL_GCM_DEBUG // [
|
|
extern void (*fnSaveCellGcmDebugCallback)(struct CellGcmContextData*) = gCellGcmDebugCallback;
|
|
gCellGcmDebugCallback = NULL; // disable recursive callback
|
|
#endif // ]
|
|
|
|
s_nFinishLabelValue++;
|
|
GCM_FUNC( cellGcmSetWriteBackEndLabel, GCM_LABEL_SPUGCM_FINISH, s_nFinishLabelValue );
|
|
CmdBufferFlush();
|
|
Assert( s_nStopAtFinishLabelValue != s_nFinishLabelValue );
|
|
|
|
// now wait for RSX to reach
|
|
uint nSpins = 0;
|
|
uint nTbStart = __mftb();
|
|
volatile uint32 * pLastJobUpdate = &g_spuGcmShared.m_eaLastJobThatUpdatedMe;
|
|
while( ( s_nFinishLabelValue != *m_pFinishLabel ) ||
|
|
( *pLastJobUpdate != m_eaLastJobThatUpdatesSharedState ) )
|
|
{
|
|
sys_timer_usleep( 30 ); // don't hog the PPU
|
|
++nSpins;
|
|
#ifndef _CERT
|
|
if( nSpins && ( nSpins % 100000 == 0 ) )
|
|
{
|
|
Warning(
|
|
"** SpuGcm detected an SPU/RSX hang. **\n"
|
|
);
|
|
}
|
|
#endif
|
|
}
|
|
|
|
uint nTbEnd = __mftb();
|
|
|
|
if( nSpins > 1000 )
|
|
{
|
|
Warning( "Long wait (%d us / %d spins) in CmdBufferFinish()\n", ( nTbEnd - nTbStart ) / 80, nSpins );
|
|
}
|
|
|
|
#ifdef CELL_GCM_DEBUG // [
|
|
gCellGcmDebugCallback = fnSaveCellGcmDebugCallback;
|
|
#endif // ]
|
|
}
|
|
|
|
|
|
|
|
void CSpuGcm::SyncMlaa( void * pLocalSurface )
|
|
{
|
|
uint nInSurfaceOffset = ( g_ps3gcmGlobalState.m_nRenderSize[1]/2 * g_ps3gcmGlobalState.m_nSurfaceRenderPitch ) & -16;
|
|
vec_int4 * pIn = ( vec_int4 * )( ( uintp( m_pMlaaBuffer ) + nInSurfaceOffset ) ), *pOut = ( vec_int4 * ) ( uintp( pLocalSurface ) + nInSurfaceOffset );
|
|
|
|
|
|
uint nRowWidth = g_ps3gcmGlobalState.m_nSurfaceRenderPitch/64, nExclude = ( m_nFrame % ( nRowWidth - 2 ) ) + 1;
|
|
for( uint nRow = 0; nRow < 4; ++nRow )
|
|
{
|
|
vec_int4 * pRowIn = AddBytes( pIn, g_ps3gcmGlobalState.m_nSurfaceRenderPitch * nRow );
|
|
vec_int4 * pRowOut = AddBytes( pOut, g_ps3gcmGlobalState.m_nSurfaceRenderPitch * nRow );
|
|
for( uint i = 0; i < nExclude; i ++ )
|
|
{
|
|
vec_int4 *input = pRowIn + i * 4, *output = pRowOut + i * 4;
|
|
output[0] = vec_nor( input[0], input[0] );
|
|
output[1] = vec_nor( input[1], input[1] );
|
|
output[2] = vec_nor( input[2], input[2] );
|
|
output[3] = vec_nor( input[3], input[3] );
|
|
}
|
|
|
|
for( uint i = nExclude + 1; i < nRowWidth ; ++i )
|
|
{
|
|
vec_int4 *input = pRowIn + i*4, *output = pRowOut + i*4;
|
|
output[0] = vec_nor( input[0], input[0] );
|
|
output[1] = vec_nor( input[1], input[1] );
|
|
output[2] = vec_nor( input[2], input[2] );
|
|
output[3] = vec_nor( input[3], input[3] );
|
|
}
|
|
}
|
|
}
|
|
|
|
|
|
void CSpuGcm::CloseDeferredChunk()
|
|
{
|
|
Assert( m_nSpuDrawQueueSelector == 1 );
|
|
uint32 * pDeferredQueueCursor = m_spuDrawQueues[1].GetCursor();
|
|
if( m_pDeferredChunkHead )
|
|
{
|
|
#ifdef _DEBUG
|
|
m_nChunksClosedInSegment++;
|
|
#endif
|
|
// mark the previous chunk with its end
|
|
m_pDeferredChunkHead[1] = ( uint32 )pDeferredQueueCursor;
|
|
m_pDeferredChunkHead = NULL;
|
|
}
|
|
m_pDeferredQueueCursors[0] = pDeferredQueueCursor;
|
|
ValidateDeferredQueue();
|
|
}
|
|
|
|
|
|
|
|
#if SPUGCM_DEBUG_MODE
// Debug-only history of opened deferred chunks, written by CSpuGcm::OpenDeferredChunk():
// each entry is { header, chunk head address, extra words allocated, draw queue alloc count }.
uint g_nDeferredChunks[0x800][4];
// Running count of chunks recorded; the slot index is this count modulo the array size.
uint g_nDeferredChunkCount = 0;
#endif
|
|
|
|
|
|
|
|
|
|
// Opens a deferred chunk with the given header in the deferred draw queue.
// The current chunk is reused when it exists, is still empty and no extra words are requested; otherwise
// the current chunk is closed and a new one (header words + nAllocExtra) is allocated.
// Returns the address right after the chunk header words.
uint32* CSpuGcm::OpenDeferredChunk( uint nHeader, uint nAllocExtra )
{
	Assert( IsValidDeferredHeader( nHeader ) );
	Assert( m_nSpuDrawQueueSelector == 1 );

	// skip allocation of the new chunk if the current chunk is empty
	const bool bReuseEmptyChunk = m_pDeferredChunkHead
		&& ( m_pDeferredChunkHead + SPUDRAWQUEUE_DEFERRED_HEADER_WORDS == GetDrawQueue()->GetCursor() )
		&& ( nAllocExtra == 0 );
	if( !bReuseEmptyChunk )
	{
		// we don't have an empty chunk already; allocate more
		CloseDeferredChunk();
		m_pDeferredChunkHead = GetDrawQueue()->AllocWords( SPUDRAWQUEUE_DEFERRED_HEADER_WORDS + nAllocExtra );
	}

	m_pDeferredChunkHead[0] = nHeader; // just flush state by default
	m_nDeferredChunkHead = nHeader;
	// provisional end of chunk; CloseDeferredChunk() overwrites it with the final cursor
	m_pDeferredChunkHead[1] = ( uintp )GetDrawQueue()->GetCursor();

	ValidateDeferredQueue();

#ifdef _DEBUG
	if( SPUDRAWQUEUE_DEFERRED_HEADER_WORDS > 2 )
	{
		m_pDeferredChunkHead[2] = GetDrawQueue()->m_nAllocCount;
	}
#endif
#if SPUGCM_DEBUG_MODE
	// record the chunk in the debug history ring
	uint nHistoryIdx = (g_nDeferredChunkCount++)%(ARRAYSIZE(g_nDeferredChunks));
	Assert( nHistoryIdx < ARRAYSIZE(g_nDeferredChunks) );
	uint * pHistoryEntry = g_nDeferredChunks[nHistoryIdx];
	pHistoryEntry[0] = nHeader;
	pHistoryEntry[1] = (uint32)m_pDeferredChunkHead;
	pHistoryEntry[2] = nAllocExtra;
	pHistoryEntry[3] = GetDrawQueue()->m_nAllocCount;
#endif
	GetDrawQueue()->SetFlushWatermarkFrom( m_pDeferredChunkHead );
	return m_pDeferredChunkHead + SPUDRAWQUEUE_DEFERRED_HEADER_WORDS;
}
|
|
|
|
|
|
void CSpuGcm::DrawQueueNormal( bool bExecuteDeferredQueueSegment )
|
|
{
|
|
if( m_nSpuDrawQueueSelector != 0 )
|
|
{
|
|
FillNops( &PCB_RING_CTX ); // switching draw queues, preallocated gcm context no longer usable
|
|
Assert( *m_pDeferredChunkHead == SPUDRAWQUEUE_DEFERRED_GCMFLUSH_METHOD );
|
|
GetDrawQueue()->Push1( SPUDRAWQUEUE_PERF_MARKER_DrawNormal );
|
|
CloseDeferredChunk();
|
|
m_pDeferredQueueCursors[0] = m_spuDrawQueues[1].GetCursor();
|
|
/*uint nBytesInSegment = m_spuDrawQueues[1].Length( m_pDeferredQueueSegment, m_pDeferredQueueCursors[0] );
|
|
Msg( "DrawQueueNormal %p..%p=%.1fKB (%p,%p)\n", m_pDeferredQueueSegment, m_pDeferredQueueCursors[0],
|
|
nBytesInSegment / 1024.0f,
|
|
m_pDeferredQueueCursors[1], m_pDeferredQueueCursors[2] );*/
|
|
m_nSpuDrawQueueSelector = 0;
|
|
if( m_pDeferredQueueSegment && bExecuteDeferredQueueSegment )
|
|
{
|
|
ExecuteDeferredDrawQueueSegment( m_pDeferredQueueSegment, m_pDeferredQueueCursors[0], false );
|
|
m_pDeferredQueueSegment = NULL;
|
|
}
|
|
|
|
Assert( m_pCurrentBatchCursor[0] == m_spuDrawQueues[0].GetCursor() );
|
|
m_pDeferredChunkHead = NULL;
|
|
BeginGcmStateTransaction();
|
|
}
|
|
if( m_nFramesToDisableDeferredQueue > 0 )
|
|
{
|
|
ExecuteDeferredDrawQueue( 0 );
|
|
}
|
|
}
|
|
|
|
|
|
/*
|
|
void CSpuGcm::DisableMlaaForTwoFrames()
|
|
{
|
|
g_flipHandler.DisableMlaaForTwoFrames();
|
|
m_nFramesToDisableDeferredQueue = 2; // this frame and next will have disabled deferred queue
|
|
DrawQueueNormal();
|
|
}
|
|
*/
|
|
|
|
void CSpuGcm::DisableMlaa()
|
|
{
|
|
DrawQueueNormal( false );
|
|
// we could, but we don't have to flush the previous frame:
|
|
// we'll do that at Flip, the same way we do it every time
|
|
g_flipHandler.DisableMlaa();
|
|
}
|
|
|
|
void CSpuGcm::DisableMlaaPermanently()
|
|
{
|
|
DrawQueueNormal( false );
|
|
|
|
g_flipHandler.QmsAdviceBeforeDrawPrevFramebuffer();
|
|
// flush previous frame first
|
|
ExecuteDeferredDrawQueue( 1 );
|
|
|
|
g_flipHandler.DisableMlaaPermannetly();
|
|
g_flipHandler.DisableMlaa();
|
|
|
|
extern void DxDeviceForceUpdateRenderTarget( );
|
|
DxDeviceForceUpdateRenderTarget( ); // recover main render target, as it was screwed up by execution of previous frame's commands
|
|
ExecuteDeferredDrawQueue( 0 );
|
|
}
|
|
|
|
|
|
|
|
// Switches from the normal draw queue to the deferred one, if the deferred queue is in use, not temporarily
// disabled and not already selected. result.isFirstInFrame is true only when the switch happened and the
// two newest deferred queue cursors were equal at that time; in that case DEFER_STATE is pushed to the
// normal queue and UNDEFER_STATE to the deferred queue.
CSpuGcm::DrawQueueDeferred_Result CSpuGcm::DrawQueueDeferred() // may flush previous frame deferred queue the first time
{
	DrawQueueDeferred_Result result;

	const bool bSwitchToDeferred = m_bUseDeferredDrawQueue && ( m_nFramesToDisableDeferredQueue == 0 ) && ( m_nSpuDrawQueueSelector != 1 );
	if( !bSwitchToDeferred )
	{
		result.isFirstInFrame = false;
		return result;
	}

	FillNops( &PCB_RING_CTX ); // switching draw queues, preallocated gcm context no longer usable
	// do we have anything in the deferred queue?
	const bool bFirstInFrame = ( m_pDeferredQueueCursors[0] == m_pDeferredQueueCursors[1] );
	result.isFirstInFrame = bFirstInFrame;
	GetDrawQueue()->Push1( SPUDRAWQUEUE_PERF_MARKER_DrawDeferred );
	if( bFirstInFrame )
	{
		GetDrawQueue()->Push2( SPUDRAWQUEUE_DEFER_STATE, uintp( m_pDeferredStates[0] ) );
	}

	// before we dive into deferred queue, we flush the current queue, because we'll have to restart current queue when we dive out of deferred queue
	// this will also make sure that any state dump required for deferred queue to execute will be dumped before deferred queue will try to execute
	GcmStateFlush();
	Assert( m_pCurrentBatchCursor[0] == m_spuDrawQueues[0].GetCursor() );
	//ExecuteDeferredDrawQueue( 1 ); // dubious: we might want to execute this in the end of the frame to avoid undesirable state changes

	// from here on the deferred queue is the current one
	m_nSpuDrawQueueSelector = 1;
	BeginGcmStateTransaction();
	m_pDeferredQueueSegment = m_spuDrawQueues[1].GetCursor();
#ifdef _DEBUG
	m_nChunksClosedInSegment = 0;
#endif
	//Msg( "DrawQueueDeferred %p / %.1f KB free...", m_pDeferredQueueSegment, m_spuDrawQueues[1].Length( m_pDeferredQueueSegment, m_spuDrawQueues[1].m_pGet ) );
	OpenDeferredChunk( SPUDRAWQUEUE_DEFERRED_GCMFLUSH_METHOD );
	if( bFirstInFrame ) // we defer the "UNDEFER" command in here
	{
		GetDrawQueue()->Push2( SPUDRAWQUEUE_UNDEFER_STATE, uintp( m_pDeferredStates[0] ) );
	}

	return result;
}
|
|
|
|
|
|
|
|
// returns: true if some memory will be freed up by SPU by poking into corresponding GET pointer later
|
|
bool CSpuGcm::ExecuteDeferredDrawQueue( uint nPrevious )
|
|
{
|
|
Assert( !IsDeferredDrawQueue() );
|
|
|
|
// just copy the commands to the main spugcm buffer
|
|
Assert( m_pDeferredQueueCursors[0] == m_spuDrawQueues[1].GetCursor() || m_pDeferredQueueCursors[0] == m_pDeferredChunkHead );
|
|
|
|
uint32 * pCmdEnd = m_pDeferredQueueCursors[nPrevious];//, *pCmdEnd = ( ( nPrevious == 0 ) ? m_spuDrawQueues[1].GetCursor() : m_pDeferredQueueCursors[ nPrevious - 1 ] );
|
|
uint32 * pCmdBegin = m_pDeferredQueueCursors[ARRAYSIZE(m_pDeferredQueueCursors)-1];
|
|
if( pCmdEnd == pCmdBegin )
|
|
return false;
|
|
//Msg( "ExecuteDeferredDrawQueue(%d) %p..%p=%.1fKB\n", nPrevious, pCmdBegin, pCmdEnd, m_spuDrawQueues[1].Length( pCmdBegin, pCmdEnd ) );
|
|
|
|
FillNops( &PCB_RING_CTX );
|
|
#if defined( _DEBUG ) && !defined( _CERT )
|
|
m_spuDrawQueues[0].Push1( SPUDRAWQUEUE_PERF_MARKER_AAReplay );
|
|
#endif
|
|
|
|
GcmStateFlush();
|
|
Assert( m_pCurrentBatchCursor[0] == m_spuDrawQueues[0].GetCursor() );// we're not deferred; so, GcmStateFlush calls BeginGcmStateTransaction that will reset the current batch cursor
|
|
|
|
bool bMoveGet = ExecuteDeferredDrawQueueSegment( pCmdBegin, pCmdEnd, true );
|
|
|
|
#if defined( _DEBUG ) && !defined( _CERT )
|
|
m_spuDrawQueues[0].Push1( SPUDRAWQUEUE_PERF_MARKER_AAReplayEnd );
|
|
SetCurrentBatchCursor( GetDrawQueue()->GetCursor() );
|
|
#endif
|
|
|
|
// forget about previously executed frames/chunks
|
|
for( uint i = nPrevious + 1; i < ARRAYSIZE( m_pDeferredQueueCursors ); ++i )
|
|
m_pDeferredQueueCursors[i] = pCmdEnd;
|
|
|
|
return bMoveGet;
|
|
}
|
|
|
|
|
|
// Walks the deferred draw queue from pCmdBegin to pCmdEnd chunk by chunk and re-issues the chunks into the
// normal queue:
//   - SET_FP_CONST chunks update the pixel shader patcher registers
//   - GCMFLUSH chunks push a state flush job (always for a plain GCMFLUSH header, otherwise only when bExecuteDraws)
//   - DRAW chunks push a draw batch job, only when bExecuteDraws
// Zero words and NOPCOUNT commands are skipped. Returns true when at least one job was pushed.
bool CSpuGcm::ExecuteDeferredDrawQueueSegment( uint32 * pCmdBegin, uint32 * pCmdEnd, bool bExecuteDraws )
{
	Assert( m_nCurrentBatch == BATCH_GCMSTATE );
	// if we're in deferred queue, we should switch to normal queue before drawing from deferred to normal queue
	Assert( !IsDeferredDrawQueue() );
	bool bMoveGet = false;
	uint nResultantSpuDrawQueueIndex = bExecuteDraws ? 1 : 2; // [2] is a dummy write-only resultant "GET" register..

#if SPUGCM_DEBUG_MODE
	// find the newest debug history entry that matches the beginning of the segment
	uint nDeferredChunkDebugIdx = 0xFFFFFFFF;
	for( uint nBack = 1; nBack <= ARRAYSIZE( g_nDeferredChunks ); ++nBack )
	{
		uint nSlot = ( g_nDeferredChunkCount - nBack ) & ( ARRAYSIZE( g_nDeferredChunks ) - 1 );
		if( g_nDeferredChunks[nSlot][1] == uintp( pCmdBegin ) )
		{
			nDeferredChunkDebugIdx = nSlot;
			break;
		}
	}
	Assert( nDeferredChunkDebugIdx < ARRAYSIZE( g_nDeferredChunks ) );
#endif

	SpuDrawQueue *pDeferredQueue = &m_spuDrawQueues[1];
	uint32 * pCmd = pDeferredQueue->NormalizeCursor( pCmdBegin );
	uint32 * pCmdNormalizedEnd = pDeferredQueue->NormalizeCursor( pCmdEnd );
	uint32 * pPrev = pCmd; // last chunk processed; written below but not read in this function
	while( pCmd != pCmdNormalizedEnd )
	{
		if( !IsCert() && !pDeferredQueue->IsValidCursor( pCmd ) )
			DebuggerBreakIfDebugging();

		uint nCmd = *pCmd;
		if( nCmd == 0 )
		{
			// single padding word
			pCmd++;
		}
		else if( ( nCmd & SPUDRAWQUEUE_METHOD_MASK ) == SPUDRAWQUEUE_NOPCOUNT_METHOD )
		{
			// run of nops: the count is encoded in the command
			pCmd += 1 + ( nCmd & SPUDRAWQUEUE_NOPCOUNT_MASK );
		}
		else
		{
			// deferred chunk: word [1] holds the address of the chunk end, the payload follows the header words
			uint32 * pNext = (uint32*)pCmd[1];
			uint32 * pCmdHeaderEnd = pCmd + SPUDRAWQUEUE_DEFERRED_HEADER_WORDS;
			Assert( pDeferredQueue->IsValidCursor( pNext ) );

#if SPUGCM_DEBUG_MODE
			// advance the debug history index to the entry of this chunk
			for( uint nAhead = 0; ; ++nAhead )
			{
				uint nSlot = ( nDeferredChunkDebugIdx + nAhead ) & ( ARRAYSIZE( g_nDeferredChunks ) - 1 );
				if( g_nDeferredChunks[nSlot][1] == uintp( pCmd ) )
				{
					nDeferredChunkDebugIdx = nSlot;
					break;
				}
				if( nAhead >= ARRAYSIZE( g_nDeferredChunks ) ) // stop if we don't find the debug idx
				{
					DebuggerBreak();
					break;
				}
			}
#endif

			switch ( nCmd & SPUDRAWQUEUE_DEFERRED_METHOD_MASK )
			{
			case SPUDRAWQUEUE_DEFERRED_SET_FP_CONST_METHOD:
				{
					// start register in bits 12..23, register count in bits 0..11
					uint nStartRegister = ( nCmd >> 12 ) & 0xFFF;
					uint nRegisterCount = nCmd & 0xFFF;
					Assert( nStartRegister < 96 && nRegisterCount <= 96 );
					OnSetPixelShaderConstant();
					g_pixelShaderPatcher.SetFragmentRegisterBlock( nStartRegister, nRegisterCount, ( const float* )pCmdHeaderEnd );
					//m_dirtyCachesMask |= DxAbstractGcmState_t::kDirtyPxConstants;
				}
				break;

			case SPUDRAWQUEUE_DEFERRED_GCMFLUSH_METHOD:
				if( nCmd == SPUDRAWQUEUE_DEFERRED_GCMFLUSH_METHOD || bExecuteDraws )
				{
					PushStateFlushJob( pDeferredQueue, uint( pNext ) | nResultantSpuDrawQueueIndex, pCmdHeaderEnd, pNext );
					Assert( m_pDeferredChunkSubmittedTill[nResultantSpuDrawQueueIndex] == pNext );
					bMoveGet = true;
				}
				break;

			case SPUDRAWQUEUE_DEFERRED_DRAW_METHOD:
				if( bExecuteDraws )
				{
					Assert( nCmd == SPUDRAWQUEUE_DEFERRED_DRAW_METHOD );
					SpuDrawHeader_t * pDrawHeader = ( SpuDrawHeader_t * )AlignValue( uintp( pCmdHeaderEnd ), 16 );
					// at the time we set up these deferred calls, we don't track the FPCP journal, so we need to refresh the indices referring into it here
					pDrawHeader->m_nFpcpEndOfJournalIdx = g_pixelShaderPatcher.GetStateEndOfJournalIdx();
					// the vertex declaration pointer was stored right after the draw header
					PushDrawBatchJob( uint( pNext ) | nResultantSpuDrawQueueIndex, pDrawHeader, *( IDirect3DVertexDeclaration9** )( pDrawHeader + 1 ), pDrawHeader->m_eaIbMarkup );
					Assert( m_pDeferredChunkSubmittedTill[nResultantSpuDrawQueueIndex] == pNext );
					bMoveGet = true;
				}
				break;
			}

			pPrev = pCmd;
			pCmd = pNext;
		}
		pCmd = pDeferredQueue->NormalizeCursor( pCmd );
	}

	return bMoveGet;
}
|
|
|
|
|
|
void CSpuGcm::FlipDeferredDrawQueue()
|
|
{
|
|
//Msg( "FlipDeferredDrawQueue {%p,%p,%p} Frame=%d\n", m_pDeferredQueueCursors[0], m_pDeferredQueueCursors[1], m_pDeferredQueueCursors[2], m_nFrame );
|
|
Assert( !IsDeferredDrawQueue() );
|
|
m_pDeferredQueueCursors[0] = m_spuDrawQueues[1].GetCursor();
|
|
for( uint i = ARRAYSIZE( m_pDeferredQueueCursors ); i-- > 1; )
|
|
{
|
|
m_pDeferredQueueCursors[ i ] = m_pDeferredQueueCursors[ i - 1 ];
|
|
}
|
|
}
|
|
|
|
|
|
|
|
// Sets up the Edge Post MLAA context for this workload.
//
// Allocates an aligned scratch buffer (three times
// EDGE_POST_MLAA_HANDLER_SPU_BUFFER_SIZE for 5 SPUs) and passes it, together with
// the SPURS instance and workload priority taken from pRoot, to
// edgePostMlaaInitializeContext(). On success m_isInitialized becomes true.
//
// On failure a warning is printed, the context is destroyed and the scratch buffer
// is released. The failure path also clears m_pMlaaScratch and m_isInitialized so
// that the object never keeps a dangling scratch pointer or a stale "initialized"
// flag (which would make OnVjobsShutdown destroy/free a second time).
//
// pRoot - VJobs root; its m_spurs and m_nEdgePostWorkloadPriority members are used.
void CEdgePostWorkload::OnVjobsInit( VJobsRoot* pRoot )
{
	uint numSpus = 5, nScratchSize = EDGE_POST_MLAA_HANDLER_SPU_BUFFER_SIZE( numSpus ) * 3;
	m_pMlaaScratch = MemAlloc_AllocAligned( nScratchSize, EDGE_POST_MLAA_HANDLER_BUFFER_ALIGN );
	int nOk = edgePostMlaaInitializeContext( &m_mlaaContext, numSpus, &pRoot->m_spurs, ( uint8_t* )&pRoot->m_nEdgePostWorkloadPriority, GCM_LABEL_EDGEPOSTMLAA, m_pMlaaScratch, nScratchSize );
	if( nOk != CELL_OK )
	{
		Warning("Cannot initialize MLAA, error %d\n", nOk );
		edgePostMlaaDestroyContext( &m_mlaaContext );
		MemAlloc_FreeAligned( m_pMlaaScratch );
		m_pMlaaScratch = NULL;		// the buffer is gone; don't leave a dangling pointer behind
		m_isInitialized = false;	// context was just destroyed, shutdown must not touch it again
		return;
	}
	m_isInitialized = true;
}
|
|
|
|
// Tears down the MLAA context created by OnVjobsInit.
// Does nothing unless m_isInitialized is set; otherwise destroys the context,
// releases the aligned scratch buffer and clears the flag.
// pRoot is not used by this function.
void CEdgePostWorkload::OnVjobsShutdown( VJobsRoot* pRoot )
{
	if( !m_isInitialized )
	{
		return; // nothing was set up, so there is nothing to release
	}

	edgePostMlaaDestroyContext( &m_mlaaContext );
	MemAlloc_FreeAligned( m_pMlaaScratch );
	m_isInitialized = false;
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// GCM context callback installed on the "ghost" context that TruePause() builds.
// It never provides more space: it reports the request through Error() and
// returns CELL_ERROR_ERROR_FLAG.
//
// pContext - the context asking for space (unused here)
// nCount   - amount requested; reported as a number of words in the message
int32_t GhostGcmCtxCallback( struct CellGcmContextData *pContext, uint32_t nCount )
{
	// nCount is unsigned, so it is printed with %u rather than %d
	Error("Trying to allocate %u more words in the ghost context\n", nCount );
	return CELL_ERROR_ERROR_FLAG;
}
|
|
|
|
// States of the pause/replay state machine driven by CSpuGcm::TruePause().
// Numeric values are spelled out; they match the implicit numbering.
enum TruePauseStateEnum_t
{
	TRUE_PAUSE_NONE        = 0,	// not paused
	TRUE_PAUSE_SPINNING    = 1,	// replaying; entered from TRUE_PAUSE_NONE when TruePause() starts
	TRUE_PAUSE_LOCKED0     = 2,	// locked, Shoulder and X buttons down
	TRUE_PAUSE_LOCKED1     = 3,	// locked, Shoulder button up
	TRUE_PAUSE_SINGLE_STEP = 4	// TruePause() returns true in this state so that one frame can be rendered
};

// Current state of the true-pause state machine; starts out unpaused.
TruePauseStateEnum_t g_nTruePauseState = TRUE_PAUSE_NONE;
|
|
|
|
|
|
bool CSpuGcm::TruePause()
|
|
{
|
|
switch( g_nTruePauseState )
|
|
{
|
|
case TRUE_PAUSE_NONE:
|
|
g_nTruePauseState = TRUE_PAUSE_SPINNING;
|
|
case TRUE_PAUSE_SINGLE_STEP:
|
|
break; // re-entering after single step
|
|
default:
|
|
g_nTruePauseState = TRUE_PAUSE_NONE;
|
|
return false; // inconsistent state, don't try to continue
|
|
}
|
|
|
|
CmdBufferFinish(); // this'll put the end marker to the last frame.
|
|
g_spuGcmShared.m_sysring.NotifyRsxGet( g_spuGcmShared.m_eaGcmControlRegister->get );
|
|
|
|
//Assert( g_spuGcmShared.m_sysring.m_nPut == g_spuGcmShared.m_sysring.m_nEnd );
|
|
const uint nReserve = 0x1000;
|
|
if( !g_spuGcmShared.m_sysring.CanPutNoWrap( nReserve ) )
|
|
{
|
|
if( !g_spuGcmShared.m_sysring.CanWrapAndPut( nReserve ) )
|
|
{
|
|
Msg( "Cannot replay because sysring wraps around right here and you got unlucky. If you get this a lot, ask Sergiy to implement/fix wrap-around replay\n" );
|
|
return false;
|
|
}
|
|
g_spuGcmShared.WrapSequence();
|
|
}
|
|
|
|
int nReplayFrames = 2;
|
|
|
|
if( !g_spuGcmShared.CanReplayPastFrames( nReplayFrames, nReserve ) )
|
|
{
|
|
uint nSysringBytesNeeded = 0;
|
|
Warning( "Cannot replay frames: %d frames didn't fit into command buffer of %d bytes and was generated and executed in multiple passes/segments\n", nReplayFrames, g_ps3gcmGlobalState.m_nCmdSize );
|
|
return false;
|
|
}
|
|
|
|
// all relevant SPU, RSX activity ceased at this point
|
|
uintp eaEnd = g_spuGcmShared.m_sysring.EaPut();
|
|
uint32 * pEnd = (uint32*)eaEnd;
|
|
uint nIoOffsetEnd = eaEnd + g_spuGcmShared.m_nIoOffsetDelta;
|
|
|
|
//nOffsetBeginFrame = g_spuGcmShared.m_sysring.PutToEa( g_spuGcmShared.GetPastFrame(2).m_nSysringBegin ) + g_spuGcmShared.m_nIoOffsetDelta;
|
|
|
|
//uint nSurfaceFlipIndex = g_ps3gcmGlobalState.m_display.surfaceFlipIdx, nSurfaceFlipAltIndex = g_ps3gcmGlobalState.m_display.PrevSurfaceIndex();
|
|
|
|
//CPs3gcmLocalMemoryBlock &altSurface = g_ps3gcmGlobalState.m_display.surfaceColor[nSurfaceFlipAltIndex];
|
|
//V_memset( altSurface.DataInAnyMemory(), 0, altSurface.Size() );
|
|
|
|
int nCurrentReplayFrame = 1;
|
|
// Note: we probably shouldn't start with the frame rendering in the same surface as the last frame flipped..
|
|
|
|
uint32 * pReplayLabelReset = (uint32*)g_spuGcmShared.m_sysring.EaPut();
|
|
uint nReplayLabelResetIoOffset = uintp( pReplayLabelReset ) + g_spuGcmShared.m_nIoOffsetDelta;
|
|
CellGcmContextData ghostCtx;
|
|
ghostCtx.current = ghostCtx.begin = pReplayLabelReset;
|
|
uint32 * pGhostAreaEnd = ghostCtx.end = ghostCtx.begin + ( nReserve / sizeof( uint32 ) );
|
|
ghostCtx.callback = GhostGcmCtxCallback;
|
|
|
|
cellGcmSetWriteBackEndLabel( &ghostCtx, GCM_LABEL_REPLAY, 0 );
|
|
|
|
uint32 * pReplayGhostArea = ghostCtx.current;
|
|
uint nReplayGhostAreaIoOffset = uintp( pReplayGhostArea ) + g_spuGcmShared.m_nIoOffsetDelta;
|
|
|
|
g_spuGcmShared.m_sysring.Put( uintp( pReplayGhostArea ) - uintp( pReplayLabelReset ) );
|
|
Assert( g_spuGcmShared.m_sysring.EaPut() == uintp( pReplayGhostArea ) );
|
|
|
|
volatile uint32 * pLabelReplay = cellGcmGetLabelAddress( GCM_LABEL_REPLAY );
|
|
*pLabelReplay = 0xFFFFFFFF;
|
|
__sync();
|
|
|
|
bool isFirstIteration = true;
|
|
|
|
do
|
|
{
|
|
g_spuGcmShared.m_eaGcmControlRegister->put = nReplayGhostAreaIoOffset;
|
|
|
|
while( *pLabelReplay != 0 )
|
|
continue;
|
|
// we're now synchronized at the beginning of ghost area
|
|
switch( g_nTruePauseState )
|
|
{
|
|
case TRUE_PAUSE_NONE:
|
|
return false;
|
|
case TRUE_PAUSE_SINGLE_STEP:
|
|
if( !isFirstIteration )
|
|
{
|
|
return true;
|
|
}
|
|
break;
|
|
}
|
|
|
|
const BeginFrameRecord_t &pastFrame = g_spuGcmShared.GetPastFrame( nCurrentReplayFrame );
|
|
int nOffsetBeginFrame = uintp( pastFrame.m_eaBegin ) + g_spuGcmShared.m_nIoOffsetDelta, nOffsetEndFrame = uintp( pastFrame.m_eaEnd ) + g_spuGcmShared.m_nIoOffsetDelta;
|
|
|
|
Msg("frame@ %X..%X ", nOffsetBeginFrame , nOffsetEndFrame );
|
|
|
|
ghostCtx.current = ghostCtx.begin = pReplayGhostArea;
|
|
ghostCtx.end = pGhostAreaEnd;
|
|
|
|
*( ghostCtx.current++ ) = CELL_GCM_JUMP( nOffsetBeginFrame ); // jump to the beginning of the frame we want to replay
|
|
uint32 nOffsetReturnFromFrame = uintp( ghostCtx.current ) + g_spuGcmShared.m_nIoOffsetDelta;
|
|
|
|
Assert( pastFrame.m_eaEnd[0] == 0 && pastFrame.m_eaEnd[1] == 0 && pastFrame.m_eaEnd[2] == 0 && pastFrame.m_eaEnd[3] == 0 ); // we expect 4 NOPs at the end of the frame
|
|
Assert( pastFrame.m_eaBegin[0] == 0 && pastFrame.m_eaBegin[1] == 0 && pastFrame.m_eaBegin[2] == 0 && pastFrame.m_eaBegin[3] == 0 ); // we expect 4 NOPs at the beginning of the frame
|
|
pastFrame.m_eaEnd[0] = CELL_GCM_JUMP( nOffsetReturnFromFrame ); // return to replay area after rendering the whole frame
|
|
cellGcmSetWriteBackEndLabel( &ghostCtx, GCM_LABEL_REPLAY, 1 );
|
|
|
|
__sync();
|
|
uint32 nTickStart = __mftb(); // let's start rendering (replaying) the captured GCM frame
|
|
g_spuGcmShared.m_eaGcmControlRegister->put = uintp( ghostCtx.current ) + g_spuGcmShared.m_nIoOffsetDelta;
|
|
while( *pLabelReplay != 1 )
|
|
continue;
|
|
|
|
int nSurfaceFlipIndex = g_ps3gcmGlobalState.m_display.PrevSurfaceIndex( nCurrentReplayFrame );
|
|
Assert( nSurfaceFlipIndex >= 0 );
|
|
|
|
while ( cellGcmGetFlipStatus() != CELL_GCM_DISPLAY_FLIP_STATUS_DONE )
|
|
{
|
|
// Wait for the previous flip to completely finish
|
|
ThreadSleep( 1 );
|
|
}
|
|
|
|
cellGcmResetFlipStatus(); // Need to reset GCM flip status
|
|
|
|
// start flipping
|
|
cellGcmSetFlip( &ghostCtx, nSurfaceFlipIndex );
|
|
|
|
cellGcmSetWriteBackEndLabel( &ghostCtx, GCM_LABEL_REPLAY, 2 );
|
|
|
|
int nOffsetEndOfFlip = uintp( ghostCtx.current ) + g_spuGcmShared.m_nIoOffsetDelta;
|
|
|
|
cellGcmSetWriteBackEndLabel( &ghostCtx, GCM_LABEL_REPLAY, 3 ); // reset label
|
|
|
|
*( ghostCtx.current++ ) = CELL_GCM_JUMP( nReplayLabelResetIoOffset );
|
|
|
|
__sync();
|
|
|
|
g_spuGcmShared.m_eaGcmControlRegister->put = nOffsetEndOfFlip;
|
|
Msg( "[%d.%d] flip@ %X..%X. ", nCurrentReplayFrame, nSurfaceFlipIndex, nReplayGhostAreaIoOffset, nOffsetEndOfFlip );
|
|
|
|
while( *pLabelReplay != 2 )
|
|
continue;
|
|
|
|
uint32 nFrameEnd = __mftb(); Msg( "%.2f ..ms.\n", ( nFrameEnd - nTickStart ) / 79800.0f );
|
|
|
|
while ( cellGcmGetFlipStatus() != CELL_GCM_DISPLAY_FLIP_STATUS_DONE )
|
|
{
|
|
// Wait for the previous flip to completely finish
|
|
ThreadSleep( 1 );
|
|
}
|
|
|
|
uint32 nFlipEnd = __mftb(); Msg( "%.2f ms.\n", ( nFlipEnd - nTickStart ) / 79800.0f );
|
|
|
|
|
|
pastFrame.m_eaEnd[0] = CELL_GCM_METHOD_NOP;
|
|
__sync();
|
|
nCurrentReplayFrame = ( nCurrentReplayFrame + nReplayFrames - 1 ) % nReplayFrames;
|
|
|
|
int bContinueProcessing = 0;
|
|
CellPadData padData;
|
|
do
|
|
{
|
|
int nError = cellPadGetData( 0, &padData );
|
|
if( nError )
|
|
{
|
|
Msg( "Error 0x%X trying to get pad data, aborting true pause\n", nError );
|
|
g_nTruePauseState = TRUE_PAUSE_NONE;
|
|
return false;
|
|
}
|
|
else
|
|
{
|
|
if( padData.len >= 3 )
|
|
{
|
|
int isL1Down = padData.button[CELL_PAD_BTN_OFFSET_DIGITAL2] & CELL_PAD_CTRL_R1;
|
|
int isTriangleDown = padData.button[CELL_PAD_BTN_OFFSET_DIGITAL1] & CELL_PAD_CTRL_UP;
|
|
int isCrossDown = padData.button[CELL_PAD_BTN_OFFSET_DIGITAL1] & CELL_PAD_CTRL_DOWN;
|
|
int isCircleDown = padData.button[CELL_PAD_BTN_OFFSET_DIGITAL1] & CELL_PAD_CTRL_RIGHT;
|
|
|
|
bContinueProcessing = isTriangleDown; // go into infinite loop here if the triangle is down
|
|
|
|
int isLockDown = isCrossDown, isSingleStepDown = isCircleDown, isPauseDown = isL1Down;
|
|
|
|
if( g_nTruePauseState != TRUE_PAUSE_SINGLE_STEP && isSingleStepDown )
|
|
{
|
|
g_nTruePauseState = TRUE_PAUSE_SINGLE_STEP;
|
|
bContinueProcessing = false; // return to render a single step
|
|
}
|
|
|
|
switch( g_nTruePauseState )
|
|
{
|
|
case TRUE_PAUSE_LOCKED1:
|
|
case TRUE_PAUSE_LOCKED0:
|
|
if( isPauseDown )
|
|
{
|
|
if( g_nTruePauseState == TRUE_PAUSE_LOCKED1 )
|
|
{
|
|
g_nTruePauseState = TRUE_PAUSE_NONE; // second press on the shoulder releases the lock
|
|
bContinueProcessing = false;
|
|
}
|
|
}
|
|
else
|
|
{
|
|
if( g_nTruePauseState == TRUE_PAUSE_LOCKED0 )
|
|
{
|
|
g_nTruePauseState = TRUE_PAUSE_LOCKED1; // promote: shoulder isn't pressed any more
|
|
}
|
|
}
|
|
|
|
break;
|
|
|
|
case TRUE_PAUSE_SPINNING:
|
|
|
|
if( isLockDown )
|
|
{
|
|
g_nTruePauseState = TRUE_PAUSE_LOCKED0;
|
|
}
|
|
else if( isSingleStepDown )
|
|
{
|
|
g_nTruePauseState = TRUE_PAUSE_SINGLE_STEP;
|
|
bContinueProcessing = false; // do the single step
|
|
}
|
|
else if( !isPauseDown )
|
|
{
|
|
if( isFirstIteration )
|
|
{
|
|
g_nTruePauseState = TRUE_PAUSE_LOCKED1; // assume we go into locked state if L1 wasn't pressed the very first frame
|
|
}
|
|
else
|
|
{
|
|
g_nTruePauseState = TRUE_PAUSE_NONE;
|
|
bContinueProcessing = false;
|
|
}
|
|
}
|
|
break;
|
|
|
|
case TRUE_PAUSE_SINGLE_STEP:
|
|
// we skipped one render frame; go into normal spinning state as soon as the user depresses circle
|
|
if( !isSingleStepDown )
|
|
{
|
|
if( isPauseDown )
|
|
{
|
|
g_nTruePauseState = TRUE_PAUSE_SPINNING; // the shoulder is still down, so the user didn't decide yet if they want to let the game go
|
|
}
|
|
else
|
|
{
|
|
g_nTruePauseState = TRUE_PAUSE_LOCKED1; // we let the shoulder go, so it must be a locked state
|
|
}
|
|
}
|
|
|
|
break;
|
|
}
|
|
}
|
|
}
|
|
isFirstIteration = false;
|
|
}
|
|
while( bContinueProcessing );
|
|
}
|
|
while( true );
|
|
|
|
return false;
|
|
}
|
|
|
|
static ConVar spugcm_validatedeferredqueue( "spugcm_validatedeferredqueue", "0" );
|
|
|
|
void CSpuGcm::ValidateDeferredQueue()
|
|
{
|
|
#ifdef _DEBUG
|
|
if( !spugcm_validatedeferredqueue.GetBool() )
|
|
return;
|
|
uint32 * pCmdEnd = m_pDeferredChunkHead;
|
|
if( !pCmdEnd )
|
|
pCmdEnd = m_pDeferredQueueCursors[0];
|
|
pCmdEnd = m_spuDrawQueues[1].NormalizeCursor( pCmdEnd );
|
|
Assert( m_spuDrawQueues[1].IsValidCursor( pCmdEnd ) );
|
|
uint32 * pCmdBegin = m_pDeferredQueueCursors[ARRAYSIZE(m_pDeferredQueueCursors)-1];
|
|
uint nWraps = 0;
|
|
for( uint32 * pCmd = pCmdBegin; pCmd != pCmdEnd; )
|
|
{
|
|
uint nCmd = *pCmd;
|
|
if( nCmd == 0 )
|
|
{
|
|
pCmd++;
|
|
}
|
|
else if( ( nCmd & SPUDRAWQUEUE_METHOD_MASK ) == SPUDRAWQUEUE_NOPCOUNT_METHOD )
|
|
{
|
|
pCmd += 1 + ( nCmd & SPUDRAWQUEUE_NOPCOUNT_MASK );
|
|
}
|
|
else
|
|
{
|
|
Assert( IsValidDeferredHeader( nCmd ) );
|
|
Assert( nWraps == 0 || pCmd < pCmdBegin );
|
|
Assert( m_spuDrawQueues[ 1 ].IsValidCursor( pCmd ) );
|
|
uint32 * pNext = ( uint32* )pCmd[ 1 ];
|
|
Assert( m_spuDrawQueues[ 1 ].IsValidCursor( pNext ) );
|
|
if( pNext < pCmd )
|
|
{
|
|
Assert( nWraps == 0 );
|
|
nWraps++;
|
|
}
|
|
pCmd = pNext;
|
|
}
|
|
pCmd = m_spuDrawQueues[1].NormalizeCursor( pCmd );
|
|
}
|
|
#endif
|
|
}
|