1122 lines
41 KiB
C++
1122 lines
41 KiB
C++
#include "basetypes.h"
|
|
#include "mathlib/ssemath.h"
|
|
#include "soundsystem/lowlevel.h"
|
|
#include "mix.h"
|
|
#include "tier0/vprof.h"
|
|
|
|
// Returns true when nAlign is a multiple of 4 (its two low bits are clear).
inline bool IsAlign4( uint nAlign )
{
	const uint nLowBits = nAlign & 3;
	return nLowBits == 0;
}
|
|
|
|
inline bool IsAligned16Bytes( void *p )
|
|
{
|
|
return ( uintp( p ) & 0xF ) ? false : true;
|
|
}
|
|
|
|
// this processes the low-level mix command list and produces pResults
|
|
void ProcessAudioMix( CAudioMixResults *pResults, const CAudioMixState &mixState, CAudioMixDescription &mixSetup )
|
|
{
|
|
// set up with current counts
|
|
pResults->m_pOutput.RemoveAll();
|
|
|
|
pResults->m_pOutput.SetCount( mixSetup.m_nMixBufferMax );
|
|
pResults->m_debugOutputs.SetCount( mixSetup.m_nDebugOutputCount );
|
|
pResults->m_flOutputLevels.SetCount( mixSetup.m_nOutputLevelCount );
|
|
|
|
// now run the commands
|
|
VPROF("IAudioMix::Process");
|
|
for ( int i = 0; i < mixSetup.m_commands.Count(); i++ )
|
|
{
|
|
audio_mix_command_t &cmd = mixSetup.m_commands[i];
|
|
switch( cmd.m_nCommandId )
|
|
{
|
|
case AUDIO_MIX_CLEAR:
|
|
SilenceBuffer( pResults->m_pOutput[ cmd.m_nOutput ].m_flData );
|
|
break;
|
|
|
|
case AUDIO_MIX_EXTRACT_SOURCE:
|
|
ConvertSourceToFloat( *mixState.GetInput( cmd.m_nInput0 ), cmd.m_flParam1, pResults->m_pOutput[cmd.m_nOutput].m_flData, mixState.GetOutput( cmd.m_nInput0 ) );
|
|
break;
|
|
|
|
case AUDIO_MIX_ADVANCE_SOURCE:
|
|
AdvanceSource( *mixState.GetInput( cmd.m_nInput0 ), cmd.m_flParam1, mixState.GetOutput( cmd.m_nInput0 ) );
|
|
break;
|
|
|
|
case AUDIO_MIX_MULTIPLY:
|
|
ScaleBuffer( pResults->m_pOutput[cmd.m_nOutput].m_flData, pResults->m_pOutput[cmd.m_nInput0].m_flData, cmd.m_flParam0 );
|
|
break;
|
|
|
|
case AUDIO_MIX_PROCESS:
|
|
{
|
|
CAudioProcessor *pProc = mixSetup.m_processors[cmd.m_nInput1];
|
|
pProc->Process( &pResults->m_pOutput[cmd.m_nInput0], &pResults->m_pOutput[cmd.m_nOutput], int(cmd.m_flParam0), mixState.DSPGlobals() );
|
|
}
|
|
break;
|
|
|
|
case AUDIO_MIX_ACCUMULATE:
|
|
MixBuffer( pResults->m_pOutput[cmd.m_nOutput].m_flData, pResults->m_pOutput[cmd.m_nInput0].m_flData, cmd.m_flParam0 );
|
|
break;
|
|
case AUDIO_MIX_ACCUMULATE_RAMP:
|
|
MixBufferRamp( pResults->m_pOutput[cmd.m_nOutput].m_flData, pResults->m_pOutput[cmd.m_nInput0].m_flData, cmd.m_flParam0, cmd.m_flParam1 );
|
|
break;
|
|
|
|
case AUDIO_MIX_SUM:
|
|
SumBuffer2x1( pResults->m_pOutput[cmd.m_nOutput].m_flData, pResults->m_pOutput[cmd.m_nInput0].m_flData, cmd.m_flParam0, pResults->m_pOutput[cmd.m_nInput1].m_flData, cmd.m_flParam1 );
|
|
break;
|
|
case AUDIO_MIX_SWAP:
|
|
SwapBuffersInPlace( pResults->m_pOutput[cmd.m_nOutput].m_flData, pResults->m_pOutput[cmd.m_nInput0].m_flData );
|
|
break;
|
|
case AUDIO_MIX_MEASURE_DEBUG_LEVEL:
|
|
{
|
|
int nChannelCount = cmd.m_nInput1;
|
|
mix_debug_outputs_t &debugOut = pResults->m_debugOutputs[cmd.m_nOutput];
|
|
debugOut.m_flLevel = 0.0f;
|
|
const float flScale = 1.0f / 32768.0f;
|
|
for ( int nChan = 0; nChan < nChannelCount; nChan++ )
|
|
{
|
|
debugOut.m_flChannelLevels[nChan] = flScale * BufferLevel( pResults->m_pOutput[cmd.m_nInput0 + nChan].m_flData );
|
|
debugOut.m_flLevel = Max( debugOut.m_flLevel, debugOut.m_flChannelLevels[nChan] );
|
|
}
|
|
debugOut.m_nChannelCount = nChannelCount;
|
|
}
|
|
break;
|
|
case AUDIO_MIX_OUTPUT_LEVEL:
|
|
{
|
|
int nChannelCount = cmd.m_nInput1;
|
|
float flLevel = 0.0f;
|
|
const float flScale = 1.0f / 32768.0f;
|
|
|
|
for ( int nChan = 0; nChan < nChannelCount; nChan++ )
|
|
{
|
|
float flOut = flScale * AvergeBufferAmplitude( pResults->m_pOutput[cmd.m_nInput0 + nChan].m_flData );
|
|
flLevel = Max( flLevel, flOut );
|
|
}
|
|
pResults->m_flOutputLevels[cmd.m_nOutput] = clamp( flLevel, 0.0f, 1.0f );
|
|
}
|
|
break;
|
|
default:
|
|
Assert( 0 );
|
|
//AssertMsg( 0, "Unknown mix command %d\n", int(cmd.m_nCommandId) );
|
|
break;
|
|
}
|
|
}
|
|
}
|
|
|
|
// Appends one AUDIO_MIX_CLEAR command for each of nCount consecutive buffers starting at nTarget.
void CAudioMixCommandList::ClearMultichannel( uint16 nTarget, int nCount )
{
	int nChannel = 0;
	while ( nChannel < nCount )
	{
		audio_mix_command_t clearCommand;
		clearCommand.Init( AUDIO_MIX_CLEAR, nTarget + nChannel );
		m_commands.AddToTail( clearCommand );
		nChannel++;
	}
}
|
|
|
|
// Appends one AUDIO_MIX_MULTIPLY command per channel: buffer nInput + k is scaled by
// flVolume into buffer nOutput + k, for k in [0, nCount).
void CAudioMixCommandList::ScaleMultichannel( uint16 nOutput, uint16 nInput, int nCount, float flVolume )
{
	int nChannel = 0;
	while ( nChannel < nCount )
	{
		audio_mix_command_t scaleCommand;
		scaleCommand.Init( AUDIO_MIX_MULTIPLY, nOutput + nChannel, nInput + nChannel, flVolume );
		m_commands.AddToTail( scaleCommand );
		nChannel++;
	}
}
|
|
|
|
|
|
// Queues the accumulate commands that add a multichannel input (buffers nInput..) into a
// multichannel output (buffers nOutput..) at flInputVolume.
// Handled layouts: equal channel counts (1:1 copy), 6 -> 2 downmix, 2 -> 6 expansion and
// 2 or 6 -> 8 (1:1 copy of the input channels). Anything else asserts.
void CAudioMixCommandList::AccumulateMultichannel( uint16 nOutput, int nOutputChannels, uint16 nInput, int nInputChannels, float flInputVolume )
{
	// right now the 8 channel output is just used for solo/debug, so it is a plain copy too
	const bool bCopyInto8 = ( nOutputChannels == 8 ) && ( nInputChannels == 2 || nInputChannels == 6 );
	if ( nOutputChannels == nInputChannels || bCopyInto8 )
	{
		for ( int nChannel = 0; nChannel < nInputChannels; nChannel++ )
		{
			AccumulateToBuffer( nOutput + nChannel, nInput + nChannel, flInputVolume );
		}
		return;
	}

	// need to downmix or expand channels
	switch ( nOutputChannels )
	{
	case 2:
		{
			// downmix 6 ch to 2 ch
			Assert( nInputChannels == 6 ); // other cases should have been handled above or there's more code to write
			const float flHalf = flInputVolume * 0.5f;
			const float flQuarter = flInputVolume * 0.25f;
			// out.left += 0.5 * (in.left + in.center*0.5) + 0.5 * in.rear_left
			AccumulateToBuffer( nOutput + 0, nInput + 0, flHalf );
			AccumulateToBuffer( nOutput + 0, nInput + 2, flQuarter );
			AccumulateToBuffer( nOutput + 0, nInput + 4, flHalf );
			// out.right += 0.5 * (in.right + in.center*0.5) + 0.5 * in.rear_right
			AccumulateToBuffer( nOutput + 1, nInput + 1, flHalf );
			AccumulateToBuffer( nOutput + 1, nInput + 2, flQuarter );
			AccumulateToBuffer( nOutput + 1, nInput + 5, flHalf );
		}
		break;

	case 6:
		{
			// expand 2ch to 6 ch (output channel 3 receives nothing)
			Assert( nInputChannels == 2 );
			const float flHalf = flInputVolume * 0.5f;
			// out.left += in.left
			AccumulateToBuffer( nOutput + 0, nInput + 0, flInputVolume );
			// out.right += in.right
			AccumulateToBuffer( nOutput + 1, nInput + 1, flInputVolume );
			// out.center += 0.5f * (in.left + in.right)
			AccumulateToBuffer( nOutput + 2, nInput + 0, flHalf );
			AccumulateToBuffer( nOutput + 2, nInput + 1, flHalf );
			// out.rear_left += in.left
			AccumulateToBuffer( nOutput + 4, nInput + 0, flInputVolume );
			// out.rear_right += in.right
			AccumulateToBuffer( nOutput + 5, nInput + 1, flInputVolume );
		}
		break;

	default:
		// some other case we haven't implemented
		Assert(0);
		break;
	}
}
|
|
|
|
// Thin wrapper over _mm_srl_epi16: logical right shift of each 16-bit lane of inputValue,
// with the shift count taken from shiftBitCount.
FORCEINLINE shortx8 ShiftRightShortSIMD( const shortx8 &inputValue, const shortx8 &shiftBitCount )
{
	const shortx8 shifted = _mm_srl_epi16( inputValue, shiftBitCount );
	return shifted;
}
|
|
|
|
// Sign-extends the low four 16-bit lanes of a into four 32-bit integers.
FORCEINLINE shortx8 SignedExtractLowAsInt32( const shortx8 &a )
{
	// lanes that are negative become 0xFFFF, others 0 - exactly the upper half of the widened value
	const shortx8 zero = _mm_setzero_si128();
	const shortx8 upperHalves = _mm_cmplt_epi16( a, zero );
	// interleave value / upper-half pairs from the low half of the register
	return _mm_unpacklo_epi16( a, upperHalves );
}
|
|
|
|
// Sign-extends the high four 16-bit lanes of a into four 32-bit integers.
FORCEINLINE shortx8 SignedExtractHighAsInt32( const shortx8 &a )
{
	// lanes that are negative become 0xFFFF, others 0 - exactly the upper half of the widened value
	const shortx8 zero = _mm_setzero_si128();
	const shortx8 upperHalves = _mm_cmplt_epi16( a, zero );
	// interleave value / upper-half pairs from the high half of the register
	return _mm_unpackhi_epi16( a, upperHalves );
}
|
|
|
|
// Converts four floats to four 32-bit integers with _mm_cvtps_epi32
// (uses the current SSE rounding mode rather than truncating).
FORCEINLINE shortx8 RoundtFloatToInt32( const fltx4 &input )
{
	const shortx8 rounded = _mm_cvtps_epi32( input );
	return rounded;
}
|
|
|
|
// Packs two registers of four 32-bit integers into one register of eight 16-bit integers
// using _mm_packs_epi32 (signed saturation). input0 fills the low four lanes, input1 the high four.
FORCEINLINE shortx8 PackInt32x2ToShortx8( const shortx8 &input0, const shortx8 &input1 )
{
	const shortx8 packed = _mm_packs_epi32( input0, input1 );
	return packed;
}
|
|
|
|
// Load 4 aligned words into a SIMD register
|
|
FORCEINLINE shortx8 LoadAlignedShortx8SIMD( const void * RESTRICT pSIMD )
|
|
{
|
|
return _mm_load_si128( reinterpret_cast<const __m128i *>( pSIMD ) );
|
|
}
|
|
|
|
// Load 4 unaligned words into a SIMD register
|
|
FORCEINLINE shortx8 LoadUnalignedShortx8SIMD( const void * RESTRICT pSIMD )
|
|
{
|
|
return _mm_loadu_si128( reinterpret_cast<const __m128i *>( pSIMD ) );
|
|
}
|
|
|
|
// create a stereo interleaved signed-16 buffer from two float-32 buffers
|
|
void ConvertFloat32Int16_Clamp_Interleave2_Unaligned( short *pOut, float *pflInputLeft, float *pflInputRight, int nSampleCount )
|
|
{
|
|
if ( nSampleCount >= 8 )
|
|
{
|
|
int nSampleQuads = nSampleCount >> 2;
|
|
// truncate sample count to remainder after 4-bundles
|
|
nSampleCount &= 3;
|
|
|
|
short *pWrite = pOut;
|
|
for ( int i = 0; i < nSampleQuads; i++ )
|
|
{
|
|
// load 4 samples from left and four from right
|
|
fltx4 leftSamples = LoadAlignedSIMD( pflInputLeft );
|
|
pflInputLeft += 4;
|
|
fltx4 rightSamples = LoadAlignedSIMD( pflInputRight );
|
|
pflInputRight += 4;
|
|
shortx8 nLeft = RoundtFloatToInt32( leftSamples );
|
|
shortx8 nRight = RoundtFloatToInt32( rightSamples );
|
|
// interleave into L/R pairs
|
|
shortx8 nInterleavedLow = _mm_unpacklo_epi32( nLeft, nRight );
|
|
shortx8 nInterleavedHigh = _mm_unpackhi_epi32( nLeft, nRight );
|
|
// pack
|
|
shortx8 nOut = PackInt32x2ToShortx8( nInterleavedLow, nInterleavedHigh );
|
|
StoreUnalignedSIMD( pWrite, nOut );
|
|
pWrite += 8;
|
|
}
|
|
}
|
|
|
|
// now convert and clamp any remaining samples (not in SIMD 4-bundles)
|
|
for ( int i = 0; i < nSampleCount; i++ )
|
|
{
|
|
int l = (int)pflInputLeft[i];
|
|
if ( l < -32768 ) l = -32768;
|
|
if ( l > 32767 ) l = 32767;
|
|
int r = (int)pflInputRight[i];
|
|
if ( r < -32768 ) r = -32768;
|
|
if ( r > 32767 ) r = 32767;
|
|
pOut[0] = l;
|
|
pOut[1] = r;
|
|
pOut += 2;
|
|
}
|
|
}
|
|
|
|
void ConvertFloat32Int16_Clamp_Interleave2( short *pOut, float *pflInputLeft, float *pflInputRight, int nSampleCount )
|
|
{
|
|
if ( !IsAligned16Bytes(pOut) )
|
|
{
|
|
ConvertFloat32Int16_Clamp_Interleave2_Unaligned( pOut, pflInputLeft, pflInputRight, nSampleCount );
|
|
return;
|
|
}
|
|
if ( nSampleCount >= 8 )
|
|
{
|
|
int nSampleQuads = nSampleCount >> 2;
|
|
|
|
// truncate sample count to remainder after 4-bundles
|
|
nSampleCount &= 3;
|
|
|
|
short *pWrite = pOut;
|
|
for ( int i = 0; i < nSampleQuads; i++ )
|
|
{
|
|
// load 4 samples from left and four from right
|
|
fltx4 leftSamples = LoadAlignedSIMD( pflInputLeft );
|
|
pflInputLeft += 4;
|
|
fltx4 rightSamples = LoadAlignedSIMD( pflInputRight );
|
|
pflInputRight += 4;
|
|
shortx8 nLeft = RoundtFloatToInt32( leftSamples );
|
|
shortx8 nRight = RoundtFloatToInt32( rightSamples );
|
|
shortx8 nInterleavedLow = _mm_unpacklo_epi32( nLeft, nRight );
|
|
shortx8 nInterleavedHigh = _mm_unpackhi_epi32( nLeft, nRight );
|
|
shortx8 nOut = PackInt32x2ToShortx8( nInterleavedLow, nInterleavedHigh );
|
|
StoreAlignedSIMD( pWrite, nOut );
|
|
|
|
pWrite += 8;
|
|
}
|
|
}
|
|
|
|
// now convert and clamp any remaining samples (not in SIMD 4-bundles)
|
|
for ( int i = 0; i < nSampleCount; i++ )
|
|
{
|
|
int l = (int)pflInputLeft[i];
|
|
if ( l < -32768 ) l = -32768;
|
|
if ( l > 32767 ) l = 32767;
|
|
int r = (int)pflInputRight[i];
|
|
if ( r < -32768 ) r = -32768;
|
|
if ( r > 32767 ) r = 32767;
|
|
pOut[0] = l;
|
|
pOut[1] = r;
|
|
pOut += 2;
|
|
}
|
|
}
|
|
|
|
// Faster SIMD version for 6-in, 6-out
|
|
void ConvertFloat32Int16_Clamp_Interleave6( short *pOut, int nOutputChannelCount, int nChannelStrideFloats, float *pflChannel0, int nInputChannelCount, int nSampleCount )
|
|
{
|
|
Assert( nOutputChannelCount == 6 && nInputChannelCount == 6 && IsAligned16Bytes( pflChannel0 ) );
|
|
const float *pInput0 = pflChannel0;
|
|
const float *pInput1 = pflChannel0 + nChannelStrideFloats;
|
|
const float *pInput2 = pflChannel0 + 2*nChannelStrideFloats;
|
|
const float *pInput3 = pflChannel0 + 3*nChannelStrideFloats;
|
|
const float *pInput4 = pflChannel0 + 4*nChannelStrideFloats;
|
|
const float *pInput5 = pflChannel0 + 5*nChannelStrideFloats;
|
|
short *pWrite = pOut;
|
|
// process 24 samples per loop, grab 6 bundles of 4, write out 3 bundles of 8
|
|
for ( int i = 0; i < nSampleCount; i += 4 )
|
|
{
|
|
// grab 6 bundles of 4 samples
|
|
fltx4 fl4Samples0 = LoadAlignedSIMD( pInput0 + i ); // 0 6 12 18
|
|
fltx4 fl4Samples1 = LoadAlignedSIMD( pInput1 + i ); // 1 7 13 19
|
|
fltx4 fl4Samples2 = LoadAlignedSIMD( pInput2 + i ); // 2 8 14 20
|
|
fltx4 fl4Samples3 = LoadAlignedSIMD( pInput3 + i ); // 3 9 15 21
|
|
fltx4 fl4Samples4 = LoadAlignedSIMD( pInput4 + i ); // 4 10 16 22
|
|
fltx4 fl4Samples5 = LoadAlignedSIMD( pInput5 + i ); // 5 11 17 23
|
|
|
|
// interleave into pairs
|
|
fltx4 fl4Pair0 = _mm_shuffle_ps( fl4Samples0, fl4Samples1, MM_SHUFFLE_REV( 0, 1, 0, 1 ) ); // 0 6 1 7
|
|
fltx4 fl4Pair1 = _mm_shuffle_ps( fl4Samples0, fl4Samples1, MM_SHUFFLE_REV( 2, 3, 2, 3 ) ); // 12 18 13 19
|
|
fltx4 fl4Pair2 = _mm_shuffle_ps( fl4Samples2, fl4Samples3, MM_SHUFFLE_REV( 0, 1, 0, 1 ) ); // 2 8 3 9
|
|
fltx4 fl4Pair3 = _mm_shuffle_ps( fl4Samples2, fl4Samples3, MM_SHUFFLE_REV( 2, 3, 2, 3 ) ); // 14 20 15 21
|
|
fltx4 fl4Pair4 = _mm_shuffle_ps( fl4Samples4, fl4Samples5, MM_SHUFFLE_REV( 0, 1, 0, 1 ) ); // 4 10 5 11
|
|
fltx4 fl4Pair5 = _mm_shuffle_ps( fl4Samples4, fl4Samples5, MM_SHUFFLE_REV( 2, 3, 2, 3 ) ); // 16 22 17 23
|
|
|
|
// now put in final order
|
|
fltx4 fl4Out0 = _mm_shuffle_ps( fl4Pair0, fl4Pair2, MM_SHUFFLE_REV( 0, 2, 0, 2 ) ); // 0 1 2 3
|
|
fltx4 fl4Out1 = _mm_shuffle_ps( fl4Pair4, fl4Pair0, MM_SHUFFLE_REV( 0, 2, 1, 3 ) ); // 4 5 6 7
|
|
fltx4 fl4Out2 = _mm_shuffle_ps( fl4Pair2, fl4Pair4, MM_SHUFFLE_REV( 1, 3, 1, 3 ) ); // 8 9 10 11
|
|
fltx4 fl4Out3 = _mm_shuffle_ps( fl4Pair1, fl4Pair3, MM_SHUFFLE_REV( 0, 2, 0, 2 ) ); // 12 13 14 15
|
|
fltx4 fl4Out4 = _mm_shuffle_ps( fl4Pair5, fl4Pair1, MM_SHUFFLE_REV( 0, 2, 1, 3 ) ); // 16 17 18 19
|
|
fltx4 fl4Out5 = _mm_shuffle_ps( fl4Pair3, fl4Pair5, MM_SHUFFLE_REV( 1, 3, 1, 3 ) ); // 20 21 22 23
|
|
|
|
// pack into 3 bundles of 8
|
|
shortx8 nOut0 = PackInt32x2ToShortx8( RoundtFloatToInt32( fl4Out0 ), RoundtFloatToInt32( fl4Out1 ) );
|
|
shortx8 nOut1 = PackInt32x2ToShortx8( RoundtFloatToInt32( fl4Out2 ), RoundtFloatToInt32( fl4Out3 ) );
|
|
shortx8 nOut2 = PackInt32x2ToShortx8( RoundtFloatToInt32( fl4Out4 ), RoundtFloatToInt32( fl4Out5 ) );
|
|
// NOTE: Optimize alignment?
|
|
StoreUnalignedSIMD( pWrite, nOut0 );
|
|
StoreUnalignedSIMD( pWrite + 8, nOut1 );
|
|
StoreUnalignedSIMD( pWrite + 16, nOut2 );
|
|
pWrite += 24;
|
|
}
|
|
}
|
|
|
|
|
|
// Faster SIMD version for 8-in, 8-out
|
|
void ConvertFloat32Int16_Clamp_Interleave8( short *pOut, int nOutputChannelCount, int nChannelStrideFloats, float *pflChannel0, int nInputChannelCount, int nSampleCount )
|
|
{
|
|
Assert( nOutputChannelCount == 8 && nInputChannelCount == 8 && IsAligned16Bytes( pflChannel0 ) );
|
|
const float *pInput0 = pflChannel0;
|
|
const float *pInput1 = pflChannel0 + nChannelStrideFloats;
|
|
const float *pInput2 = pflChannel0 + 2 * nChannelStrideFloats;
|
|
const float *pInput3 = pflChannel0 + 3 * nChannelStrideFloats;
|
|
const float *pInput4 = pflChannel0 + 4 * nChannelStrideFloats;
|
|
const float *pInput5 = pflChannel0 + 5 * nChannelStrideFloats;
|
|
const float *pInput6 = pflChannel0 + 6 * nChannelStrideFloats;
|
|
const float *pInput7 = pflChannel0 + 7 * nChannelStrideFloats;
|
|
short *pWrite = pOut;
|
|
// process 32 samples per loop, grab 6 bundles of 4, write out 4 bundles of 8
|
|
for ( int i = 0; i < nSampleCount; i += 4 )
|
|
{
|
|
// grab 8 bundles of 4 samples
|
|
fltx4 fl4Samples0 = LoadAlignedSIMD( pInput0 + i ); // 0 8 16 24
|
|
fltx4 fl4Samples1 = LoadAlignedSIMD( pInput1 + i ); // 1 9 17 25
|
|
fltx4 fl4Samples2 = LoadAlignedSIMD( pInput2 + i ); // 2 10 18 26
|
|
fltx4 fl4Samples3 = LoadAlignedSIMD( pInput3 + i ); // 3 11 19 27
|
|
fltx4 fl4Samples4 = LoadAlignedSIMD( pInput4 + i ); // 4 12 20 28
|
|
fltx4 fl4Samples5 = LoadAlignedSIMD( pInput5 + i ); // 5 13 21 29
|
|
fltx4 fl4Samples6 = LoadAlignedSIMD( pInput6 + i ); // 6 14 22 30
|
|
fltx4 fl4Samples7 = LoadAlignedSIMD( pInput7 + i ); // 7 15 23 31
|
|
|
|
// interleave into pairs
|
|
fltx4 fl4Pair0 = _mm_shuffle_ps( fl4Samples0, fl4Samples1, MM_SHUFFLE_REV( 0, 1, 0, 1 ) ); // 0 8 1 9
|
|
fltx4 fl4Pair1 = _mm_shuffle_ps( fl4Samples0, fl4Samples1, MM_SHUFFLE_REV( 2, 3, 2, 3 ) ); // 16 24 17 25
|
|
fltx4 fl4Pair2 = _mm_shuffle_ps( fl4Samples2, fl4Samples3, MM_SHUFFLE_REV( 0, 1, 0, 1 ) ); // 2 10 3 11
|
|
fltx4 fl4Pair3 = _mm_shuffle_ps( fl4Samples2, fl4Samples3, MM_SHUFFLE_REV( 2, 3, 2, 3 ) ); // 18 26 19 27
|
|
fltx4 fl4Pair4 = _mm_shuffle_ps( fl4Samples4, fl4Samples5, MM_SHUFFLE_REV( 0, 1, 0, 1 ) ); // 4 12 5 13
|
|
fltx4 fl4Pair5 = _mm_shuffle_ps( fl4Samples4, fl4Samples5, MM_SHUFFLE_REV( 2, 3, 2, 3 ) ); // 20 28 21 29
|
|
fltx4 fl4Pair6 = _mm_shuffle_ps( fl4Samples6, fl4Samples7, MM_SHUFFLE_REV( 0, 1, 0, 1 ) ); // 6 14 7 15
|
|
fltx4 fl4Pair7 = _mm_shuffle_ps( fl4Samples6, fl4Samples7, MM_SHUFFLE_REV( 2, 3, 2, 3 ) ); // 22 30 23 31
|
|
|
|
// now put in final order
|
|
fltx4 fl4Out0 = _mm_shuffle_ps( fl4Pair0, fl4Pair2, MM_SHUFFLE_REV( 0, 2, 0, 2 ) ); // 0 1 2 3
|
|
fltx4 fl4Out1 = _mm_shuffle_ps( fl4Pair4, fl4Pair6, MM_SHUFFLE_REV( 0, 2, 0, 2 ) ); // 4 5 6 7
|
|
fltx4 fl4Out2 = _mm_shuffle_ps( fl4Pair0, fl4Pair2, MM_SHUFFLE_REV( 1, 3, 1, 3 ) ); // 8 9 10 11
|
|
fltx4 fl4Out3 = _mm_shuffle_ps( fl4Pair4, fl4Pair6, MM_SHUFFLE_REV( 1, 3, 1, 3 ) ); // 12 13 14 15
|
|
fltx4 fl4Out4 = _mm_shuffle_ps( fl4Pair1, fl4Pair3, MM_SHUFFLE_REV( 0, 2, 0, 2 ) ); // 16 17 18 19
|
|
fltx4 fl4Out5 = _mm_shuffle_ps( fl4Pair5, fl4Pair7, MM_SHUFFLE_REV( 0, 2, 0, 2 ) ); // 20 21 22 23
|
|
fltx4 fl4Out6 = _mm_shuffle_ps( fl4Pair1, fl4Pair3, MM_SHUFFLE_REV( 1, 3, 1, 3 ) ); // 24 25 26 27
|
|
fltx4 fl4Out7 = _mm_shuffle_ps( fl4Pair5, fl4Pair7, MM_SHUFFLE_REV( 1, 3, 1, 3 ) ); // 28 29 30 31
|
|
|
|
// pack into 4 bundles of 8
|
|
shortx8 nOut0 = PackInt32x2ToShortx8( RoundtFloatToInt32( fl4Out0 ), RoundtFloatToInt32( fl4Out1 ) );
|
|
shortx8 nOut1 = PackInt32x2ToShortx8( RoundtFloatToInt32( fl4Out2 ), RoundtFloatToInt32( fl4Out3 ) );
|
|
shortx8 nOut2 = PackInt32x2ToShortx8( RoundtFloatToInt32( fl4Out4 ), RoundtFloatToInt32( fl4Out5 ) );
|
|
shortx8 nOut3 = PackInt32x2ToShortx8( RoundtFloatToInt32( fl4Out6 ), RoundtFloatToInt32( fl4Out7 ) );
|
|
// NOTE: Optimize alignment?
|
|
StoreUnalignedSIMD( pWrite, nOut0 );
|
|
StoreUnalignedSIMD( pWrite + 8, nOut1 );
|
|
StoreUnalignedSIMD( pWrite + 16, nOut2 );
|
|
StoreUnalignedSIMD( pWrite + 24, nOut3 );
|
|
pWrite += 32;
|
|
}
|
|
}
|
|
|
|
// slow version to support 4/6/8 channel devices
|
|
void ConvertFloat32Int16_Clamp_InterleaveStride( short *pOut, int nOutputChannelCount, int nChannelStrideFloats, float *pflChannel0, int nInputChannelCount, int nSampleCount )
|
|
{
|
|
// detect optimizable cases and call fast code
|
|
if ( nInputChannelCount == 6 && nOutputChannelCount == 6 && IsAlign4( nSampleCount ) )
|
|
{
|
|
ConvertFloat32Int16_Clamp_Interleave6( pOut, nOutputChannelCount, nChannelStrideFloats, pflChannel0, nInputChannelCount, nSampleCount );
|
|
return;
|
|
}
|
|
if ( nInputChannelCount == 8 && nOutputChannelCount == 8 && IsAlign4( nSampleCount ) )
|
|
{
|
|
ConvertFloat32Int16_Clamp_Interleave8( pOut, nOutputChannelCount, nChannelStrideFloats, pflChannel0, nInputChannelCount, nSampleCount );
|
|
return;
|
|
}
|
|
|
|
// run the slower code in this case
|
|
if ( nOutputChannelCount > nInputChannelCount )
|
|
{
|
|
for ( int i = 0; i < nSampleCount; i++ )
|
|
{
|
|
float *pIn = pflChannel0 + i;
|
|
for ( int j = 0; j < nInputChannelCount; j++ )
|
|
{
|
|
int nOut = int( pIn[0] );
|
|
nOut = clamp( nOut, -32768, 32767 );
|
|
*pOut++ = nOut;
|
|
pIn += nChannelStrideFloats;
|
|
}
|
|
for ( int j = nInputChannelCount; j < nOutputChannelCount; j++ )
|
|
{
|
|
*pOut++ = 0;
|
|
}
|
|
}
|
|
}
|
|
else
|
|
{
|
|
int nCopyChannels = MIN(nOutputChannelCount, nInputChannelCount);
|
|
for ( int i = 0; i < nSampleCount; i++ )
|
|
{
|
|
float *pIn = pflChannel0 + i;
|
|
for ( int j = 0; j < nCopyChannels; j++ )
|
|
{
|
|
int nOut = int( pIn[0] );
|
|
nOut = clamp( nOut, -32768, 32767 );
|
|
*pOut++ = nOut;
|
|
pIn += nChannelStrideFloats;
|
|
}
|
|
}
|
|
}
|
|
Assert( nOutputChannelCount >= nInputChannelCount );
|
|
}
|
|
|
|
// Converts MIX_BUFFER_SIZE signed 16-bit samples from pIn to floats in flOutput, 8 at a time.
// pIn is read with unaligned loads; flOutput is written with StoreAlignedSIMD.
static void ConvertShortToFloatx8( float flOutput[MIX_BUFFER_SIZE], const short *pIn )
{
	fltx4 *pDest = reinterpret_cast<fltx4 *>(&flOutput[0]);
	const shortx8 *pSrc = reinterpret_cast<const shortx8 *>(pIn);
	for ( int nBundle = 0; nBundle < (MIX_BUFFER_SIZE/8); nBundle++ )
	{
		shortx8 n8Samples = LoadUnalignedShortSIMD( pSrc + nBundle );
		// widen each half to int32, then convert to float
		fltx4 fl4First = SignedIntConvertToFltSIMD( SignedExtractLowAsInt32( n8Samples ) );
		fltx4 fl4Second = SignedIntConvertToFltSIMD( SignedExtractHighAsInt32( n8Samples ) );
		// two output registers per input register
		StoreAlignedSIMD( (float *)( pDest + 0 ), fl4First );
		StoreAlignedSIMD( (float *)( pDest + 1 ), fl4Second );
		pDest += 2;
	}
}
|
|
|
|
// use 15-bit fixed point fractions for resampling
|
|
#define FIX_BITS 15
|
|
#define FIX_MASK ((1ul<<FIX_BITS)-1)
|
|
|
|
FORCEINLINE int FLOAT_TO_FIXED( float flVal )
|
|
{
|
|
return int( flVal * float( 1ul << FIX_BITS ) );
|
|
}
|
|
|
|
// UNDONE: This can be trivially optimized to not loop
|
|
static int CalcAdvanceSamples( int nOutCount, float sampleRatio, uint *pInputOffsetFrac )
|
|
{
|
|
uint nRateScaleFix = FLOAT_TO_FIXED( sampleRatio );
|
|
uint nSampleFrac = *pInputOffsetFrac;
|
|
uint nSampleIndex = 0;
|
|
|
|
for ( int i = 0; i < nOutCount; i++ )
|
|
{
|
|
nSampleFrac += nRateScaleFix;
|
|
nSampleIndex += nSampleFrac >> FIX_BITS;
|
|
nSampleFrac = nSampleFrac & FIX_MASK;
|
|
}
|
|
*pInputOffsetFrac = nSampleFrac;
|
|
return nSampleIndex;
|
|
}
|
|
|
|
// resample 16-bit audio data at the given ratio using linear interpolation
|
|
// output is 32-bits per sample float
|
|
static uint Resample16to32( float *pOut, const short *pWaveData, float sampleRatio, uint *pInputOffsetFrac )
|
|
{
|
|
uint nRateScaleFix = FLOAT_TO_FIXED( sampleRatio );
|
|
uint nSampleFrac = *pInputOffsetFrac;
|
|
Assert( nSampleFrac < ( 1ul << FIX_BITS ) );
|
|
uint nSampleIndex = 0;
|
|
|
|
int nFirst, nSecond, nInterp;
|
|
for ( int i = 0; i < MIX_BUFFER_SIZE; i++ )
|
|
{
|
|
nFirst = (int)( pWaveData[nSampleIndex] );
|
|
nSecond = (int)( pWaveData[nSampleIndex + 1] );
|
|
#if 0
|
|
// this expression doesn't truncate the value to 16-bits and preserves fractional samples in the float
|
|
// output. It is a bit slower and the improved precision won't be audible unless the sample is amplified
|
|
// or processed in some way because the output stage will simply round these back to 16-bit values
|
|
// so disable this until we find a reason that we need it
|
|
nInterp = ( nFirst << FIX_BITS ) + ( ( ( nSecond - nFirst ) * int( nSampleFrac ) ) );
|
|
pOut[i] = float( nInterp ) * ( 1.0f / float( 1ul << FIX_BITS ) );
|
|
#else
|
|
nInterp = nFirst + ( ( ( nSecond - nFirst ) * int( nSampleFrac ) ) >> FIX_BITS );
|
|
pOut[i] = float( nInterp );
|
|
#endif
|
|
|
|
nSampleFrac += nRateScaleFix;
|
|
nSampleIndex += nSampleFrac >> FIX_BITS;
|
|
nSampleFrac = nSampleFrac & FIX_MASK;
|
|
}
|
|
|
|
*pInputOffsetFrac = nSampleFrac;
|
|
return nSampleIndex;
|
|
}
|
|
|
|
const fltx4 g_fl4LinerInterp2x_lo={1.0,0.5,1.0,0.5};
|
|
const fltx4 g_fl4LinerInterp2x_hi={0.0,0.5,0.0,0.5};
|
|
|
|
static uint Resample16to32_2x( float flOutput[MIX_BUFFER_SIZE], const short *pWaveData, uint *pInputOffsetFrac )
|
|
{
|
|
fltx4 *pOutput = reinterpret_cast<fltx4 *>(&flOutput[0]);
|
|
const shortx8 *pInput = reinterpret_cast<const shortx8 *>(pWaveData);
|
|
fltx4 flAllOne = LoadAlignedSIMD( (float *)g_SIMD_AllOnesMask );
|
|
fltx4 fl4FirstTwo = LoadAlignedSIMD( (float *)&g_SIMD_SkipTailMask[2] );
|
|
fltx4 fl4LastTwo = AndNotSIMD( fl4FirstTwo, flAllOne );
|
|
for ( int i = 0; i < (MIX_BUFFER_SIZE/16); i++ )
|
|
{
|
|
shortx8 samples = LoadUnalignedShortSIMD( pInput );
|
|
pInput++;
|
|
fltx4 lo = SignedIntConvertToFltSIMD( SignedExtractLowAsInt32( samples ) );
|
|
fltx4 hi = SignedIntConvertToFltSIMD( SignedExtractHighAsInt32( samples ) );
|
|
shortx8 samplesNext = LoadUnalignedShortSIMD( pInput );
|
|
// LAME: Only need one value for this but I can't be bothered to unroll this yet
|
|
fltx4 hi4 = SplatXSIMD( SignedIntConvertToFltSIMD( SignedExtractLowAsInt32( samplesNext ) ) );
|
|
|
|
fltx4 samp0 = SplatXSIMD( lo );
|
|
fltx4 samp1 = SplatYSIMD( lo );
|
|
fltx4 samp0011 = OrSIMD( AndSIMD( fl4FirstTwo, samp0 ), AndSIMD( fl4LastTwo, samp1 ) );
|
|
fltx4 samp2 = SplatZSIMD( lo );
|
|
fltx4 samp1122 = OrSIMD( AndSIMD( fl4FirstTwo, samp1 ), AndSIMD( fl4LastTwo, samp2 ) );
|
|
StoreAlignedSIMD( (float *)pOutput, MaddSIMD( g_fl4LinerInterp2x_lo, samp0011, MulSIMD( g_fl4LinerInterp2x_hi, samp1122 ) ) ); // 4
|
|
pOutput++;
|
|
|
|
fltx4 samp3 = SplatWSIMD( lo );
|
|
fltx4 samp2233 = OrSIMD( AndSIMD( fl4FirstTwo, samp2 ), AndSIMD( fl4LastTwo, samp3 ) );
|
|
fltx4 samp4 = SplatXSIMD( hi );
|
|
fltx4 samp3344 = OrSIMD( AndSIMD( fl4FirstTwo, samp3 ), AndSIMD( fl4LastTwo, samp4 ) );
|
|
StoreAlignedSIMD( (float *)pOutput, MaddSIMD( g_fl4LinerInterp2x_lo, samp2233, MulSIMD( g_fl4LinerInterp2x_hi, samp3344 ) ) ); // 8
|
|
pOutput++;
|
|
|
|
fltx4 samp5 = SplatYSIMD( hi );
|
|
fltx4 samp4455 = OrSIMD( AndSIMD( fl4FirstTwo, samp4 ), AndSIMD( fl4LastTwo, samp5 ) );
|
|
fltx4 samp6 = SplatZSIMD( hi );
|
|
fltx4 samp5566 = OrSIMD( AndSIMD( fl4FirstTwo, samp5 ), AndSIMD( fl4LastTwo, samp6 ) );
|
|
StoreAlignedSIMD( (float *)pOutput, MaddSIMD( g_fl4LinerInterp2x_lo, samp4455, MulSIMD( g_fl4LinerInterp2x_hi, samp5566 ) ) ); // 12
|
|
pOutput++;
|
|
|
|
fltx4 samp7 = SplatWSIMD( hi );
|
|
fltx4 samp6677 = OrSIMD( AndSIMD( fl4FirstTwo, samp6 ), AndSIMD( fl4LastTwo, samp7 ) );
|
|
fltx4 samp8 = SplatXSIMD( hi4 );
|
|
fltx4 samp7788 = OrSIMD( AndSIMD( fl4FirstTwo, samp7 ), AndSIMD( fl4LastTwo, samp8 ) );
|
|
StoreAlignedSIMD( (float *)pOutput, MaddSIMD( g_fl4LinerInterp2x_lo, samp6677, MulSIMD( g_fl4LinerInterp2x_hi, samp7788 ) ) ); // 16
|
|
pOutput++;
|
|
}
|
|
return MIX_BUFFER_SIZE / 2;
|
|
}
|
|
|
|
const fltx4 g_fl4LinerInterp4x_lo={1.0,0.75,0.5,0.25};
|
|
const fltx4 g_fl4LinerInterp4x_hi={0.0,0.25,0.5,0.75};
|
|
|
|
static uint Resample16to32_4x( float flOutput[MIX_BUFFER_SIZE], const short *pWaveData, uint *pInputOffsetFrac )
|
|
{
|
|
fltx4 *pOutput = reinterpret_cast<fltx4 *>(&flOutput[0]);
|
|
const shortx8 *pInput = reinterpret_cast<const shortx8 *>(pWaveData);
|
|
for ( int i = 0; i < (MIX_BUFFER_SIZE/32); i++ )
|
|
{
|
|
shortx8 samples = LoadUnalignedShortSIMD( pInput );
|
|
pInput++;
|
|
fltx4 lo = SignedIntConvertToFltSIMD( SignedExtractLowAsInt32( samples ) );
|
|
fltx4 hi = SignedIntConvertToFltSIMD( SignedExtractHighAsInt32( samples ) );
|
|
shortx8 samplesNext = LoadUnalignedShortSIMD( pInput );
|
|
// LAME: Only need one value for this but I can't be bothered to unroll this yet
|
|
fltx4 hi4 = SplatXSIMD( SignedIntConvertToFltSIMD( SignedExtractLowAsInt32( samplesNext ) ) );
|
|
|
|
fltx4 samp0 = SplatXSIMD( lo );
|
|
fltx4 samp1 = SplatYSIMD( lo );
|
|
StoreAlignedSIMD( (float *)pOutput, MaddSIMD( g_fl4LinerInterp4x_lo, samp0, MulSIMD( g_fl4LinerInterp4x_hi, samp1 ) ) ); // 4
|
|
pOutput++;
|
|
|
|
fltx4 samp2 = SplatZSIMD( lo );
|
|
StoreAlignedSIMD( (float *)pOutput, MaddSIMD( g_fl4LinerInterp4x_lo, samp1, MulSIMD( g_fl4LinerInterp4x_hi, samp2 ) ) ); // 8
|
|
pOutput++;
|
|
|
|
fltx4 samp3 = SplatWSIMD( lo );
|
|
StoreAlignedSIMD( (float *)pOutput, MaddSIMD( g_fl4LinerInterp4x_lo, samp2, MulSIMD( g_fl4LinerInterp4x_hi, samp3 ) ) ); // 12
|
|
pOutput++;
|
|
|
|
fltx4 samp4 = SplatXSIMD( hi );
|
|
StoreAlignedSIMD( (float *)pOutput, MaddSIMD( g_fl4LinerInterp4x_lo, samp3, MulSIMD( g_fl4LinerInterp4x_hi, samp4 ) ) ); // 16
|
|
pOutput++;
|
|
|
|
fltx4 samp5 = SplatYSIMD( hi );
|
|
StoreAlignedSIMD( (float *)pOutput, MaddSIMD( g_fl4LinerInterp4x_lo, samp4, MulSIMD( g_fl4LinerInterp4x_hi, samp5 ) ) ); // 20
|
|
pOutput++;
|
|
|
|
fltx4 samp6 = SplatZSIMD( hi );
|
|
StoreAlignedSIMD( (float *)pOutput, MaddSIMD( g_fl4LinerInterp4x_lo, samp5, MulSIMD( g_fl4LinerInterp4x_hi, samp6 ) ) ); // 24
|
|
pOutput++;
|
|
|
|
fltx4 samp7 = SplatWSIMD( hi );
|
|
StoreAlignedSIMD( (float *)pOutput, MaddSIMD( g_fl4LinerInterp4x_lo, samp6, MulSIMD( g_fl4LinerInterp4x_hi, samp7 ) ) ); // 28
|
|
pOutput++;
|
|
|
|
fltx4 samp8 = SplatXSIMD( hi4 );
|
|
StoreAlignedSIMD( (float *)pOutput, MaddSIMD( g_fl4LinerInterp4x_lo, samp7, MulSIMD( g_fl4LinerInterp4x_hi, samp8 ) ) ); // 32
|
|
pOutput++;
|
|
}
|
|
return MIX_BUFFER_SIZE / 4;
|
|
}
|
|
|
|
|
|
// Converts MIX_BUFFER_SIZE 32-bit integer samples from pIn to floats in flOutput, 4 at a time.
// Both buffers are accessed with aligned loads/stores.
static void Convert32ToFloatx4( float flOutput[MIX_BUFFER_SIZE], int *pIn )
{
	fltx4 *pDest = reinterpret_cast<fltx4 *>(&flOutput[0]);
	// shortx8 is used here only as a 128-bit integer register holding four int32 values
	const shortx8 *pSrc = reinterpret_cast<const shortx8 *>(pIn);

	for ( int nQuad = 0; nQuad < (MIX_BUFFER_SIZE/4); nQuad++ )
	{
		shortx8 n4Ints = LoadAlignedShortx8SIMD( pSrc + nQuad );
		fltx4 fl4Floats = SignedIntConvertToFltSIMD( n4Ints );
		StoreAlignedSIMD( (float *)( pDest + nQuad ), fl4Floats );
	}
}
|
|
|
|
// Writes zero to the first nCount samples of pBuffer; does nothing when nCount <= 0.
inline void ZeroFill( short *pBuffer, int nCount )
{
	for ( int nIndex = 0; nIndex < nCount; nIndex++ )
	{
		pBuffer[nIndex] = 0;
	}
}
|
|
|
|
// Join buffer list into a contiguous sample list
|
|
const short *GetContiguousSamples_8Mono( const audio_source_input_t &source, const audio_source_indexstate_t *pState, int nSamplesNeeded, short *pTemp, int nTempSampleCount )
|
|
{
|
|
Assert( nSamplesNeeded < nTempSampleCount );
|
|
|
|
int nSampleIndex = pState->m_nBufferSampleOffset;
|
|
uint nPacketIndex = pState->m_nPacketIndex;
|
|
int nOutIndex = 0;
|
|
for ( ; nPacketIndex < source.m_nPacketCount; nPacketIndex++ )
|
|
{
|
|
const uint8 *pSourceData = (uint8 *)(source.m_pPackets[nPacketIndex].m_pSamples) + nSampleIndex;
|
|
int nSamplesAvailable = source.m_pPackets[nPacketIndex].m_nSampleCount - nSampleIndex;
|
|
Assert( nSamplesAvailable > 0 );
|
|
int nCopy = Min(nSamplesAvailable, nSamplesNeeded);
|
|
for ( int i = 0; i < nCopy; i++ )
|
|
{
|
|
// 8-bit PCM is unsigned, but we assume it has been converted to signed on load
|
|
uint32 nSample = (uint8)((int32) pSourceData[i]);
|
|
pTemp[nOutIndex+i] = (nSample<<8) | nSample;
|
|
}
|
|
nSamplesNeeded -= nCopy;
|
|
nOutIndex += nCopy;
|
|
Assert(nSamplesNeeded >= 0);
|
|
if ( nSamplesNeeded <= 0 )
|
|
break;
|
|
nSampleIndex = 0;
|
|
}
|
|
if ( nSamplesNeeded )
|
|
{
|
|
ZeroFill( &pTemp[nOutIndex], nSamplesNeeded );
|
|
}
|
|
return pTemp;
|
|
}
|
|
|
|
const short *GetContiguousSamples_8Stereo( const audio_source_input_t &source, const audio_source_indexstate_t *pState, int nSamplesNeeded, short *pTemp, int nTempSampleCount, int nChannel )
|
|
{
|
|
Assert( nSamplesNeeded < nTempSampleCount );
|
|
|
|
uint nSampleIndex = pState->m_nBufferSampleOffset;
|
|
uint nPacketIndex = pState->m_nPacketIndex;
|
|
int nOutIndex = 0;
|
|
for ( ; nPacketIndex < source.m_nPacketCount; nPacketIndex++ )
|
|
{
|
|
const uint8 *pSourceData = (uint8 *)(source.m_pPackets[nPacketIndex].m_pSamples) + (nSampleIndex<<1) + nChannel;
|
|
int nSamplesAvailable = source.m_pPackets[nPacketIndex].m_nSampleCount - nSampleIndex;
|
|
Assert( nSamplesAvailable > 0 );
|
|
int nCopy = Min(nSamplesAvailable, nSamplesNeeded);
|
|
for ( int i = 0; i < nCopy; i++ )
|
|
{
|
|
// 8-bit PCM is unsigned, but we assume it has been converted to signed on load
|
|
uint32 nSample = (uint8)( (int32)pSourceData[i << 1] );
|
|
pTemp[nOutIndex+i] = (nSample<<8) | nSample;
|
|
}
|
|
nSamplesNeeded -= nCopy;
|
|
nOutIndex += nCopy;
|
|
Assert(nSamplesNeeded >= 0);
|
|
if ( nSamplesNeeded <= 0 )
|
|
break;
|
|
nSampleIndex = 0;
|
|
}
|
|
if ( nSamplesNeeded )
|
|
{
|
|
ZeroFill( &pTemp[nOutIndex], nSamplesNeeded );
|
|
}
|
|
return pTemp;
|
|
}
|
|
|
|
const short *GetContiguousSamples_16Mono( const audio_source_input_t &source, const audio_source_indexstate_t *pState, int nSamplesNeeded, short *pTemp, int nTempSampleCount )
|
|
{
|
|
Assert( nSamplesNeeded <= nTempSampleCount );
|
|
|
|
uint nSampleIndex = pState->m_nBufferSampleOffset;
|
|
uint nPacketIndex = pState->m_nPacketIndex;
|
|
|
|
if ( nPacketIndex < source.m_nPacketCount )
|
|
{
|
|
int nSamplesAvailable = source.m_pPackets[nPacketIndex].m_nSampleCount - nSampleIndex;
|
|
|
|
// optimization: if the entire request can be satisfied by the current packet, just point to that (don't copy)
|
|
if ( nSamplesAvailable >= nSamplesNeeded )
|
|
{
|
|
Assert( source.m_pPackets[nPacketIndex].m_pSamples != NULL );
|
|
return source.m_pPackets[nPacketIndex].m_pSamples + nSampleIndex;
|
|
}
|
|
|
|
int nOutIndex = 0;
|
|
for ( ; nPacketIndex < source.m_nPacketCount; nPacketIndex++ )
|
|
{
|
|
const short *pSourceData = source.m_pPackets[nPacketIndex].m_pSamples + nSampleIndex;
|
|
nSamplesAvailable = source.m_pPackets[nPacketIndex].m_nSampleCount - nSampleIndex;
|
|
Assert( nSamplesAvailable > 0 );
|
|
int nCopy = Min(nSamplesAvailable, nSamplesNeeded);
|
|
V_memcpy( &pTemp[nOutIndex], pSourceData, nCopy * sizeof(short) );
|
|
nSamplesNeeded -= nCopy;
|
|
nOutIndex += nCopy;
|
|
Assert(nSamplesNeeded >= 0);
|
|
if ( nSamplesNeeded <= 0 )
|
|
break;
|
|
nSampleIndex = 0;
|
|
}
|
|
if ( nSamplesNeeded )
|
|
{
|
|
// pad with zeros
|
|
ZeroFill( &pTemp[nOutIndex], nSamplesNeeded );
|
|
}
|
|
return pTemp;
|
|
}
|
|
return NULL;
|
|
}
|
|
|
|
const short *GetContiguousSamples_16Stereo( const audio_source_input_t &source, const audio_source_indexstate_t *pState, int nSamplesNeeded, short *pTemp, int nTempSampleCount, int nChannel )
|
|
{
|
|
Assert( nSamplesNeeded < nTempSampleCount );
|
|
|
|
uint nSampleIndex = pState->m_nBufferSampleOffset;
|
|
uint nPacketIndex = pState->m_nPacketIndex;
|
|
int nOutIndex = 0;
|
|
for ( ; nPacketIndex < source.m_nPacketCount; nPacketIndex++ )
|
|
{
|
|
const short *pSourceData = source.m_pPackets[nPacketIndex].m_pSamples + (nSampleIndex<<1) + nChannel;
|
|
int nSamplesAvailable = source.m_pPackets[nPacketIndex].m_nSampleCount - nSampleIndex;
|
|
Assert( nSamplesAvailable > 0 );
|
|
int nCopy = MIN(nSamplesAvailable, nSamplesNeeded);
|
|
for ( int i = 0; i < nCopy; i++ )
|
|
{
|
|
// copy every other sample to drop one channel. Note that pSourceData is already offset to the appropriate channel
|
|
pTemp[nOutIndex + i] = pSourceData[ i<<1 ];
|
|
}
|
|
nSamplesNeeded -= nCopy;
|
|
nOutIndex += nCopy;
|
|
Assert(nSamplesNeeded >= 0);
|
|
if ( nSamplesNeeded <= 0 )
|
|
break;
|
|
nSampleIndex = 0;
|
|
}
|
|
if ( nSamplesNeeded )
|
|
{
|
|
// pad with zeros
|
|
ZeroFill( &pTemp[nOutIndex], nSamplesNeeded );
|
|
}
|
|
return pTemp;
|
|
}
|
|
|
|
// has this source finished playing its sample data
|
|
bool IsFinished( const audio_source_input_t &source, const audio_source_indexstate_t *pCurrentState )
|
|
{
|
|
return pCurrentState->m_nPacketIndex >= source.m_nPacketCount ? true : false;
|
|
}
|
|
|
|
// Move the source offset by some number of samples
|
|
// If necessary also advance the packet index
|
|
uint AdvanceSourceIndex( audio_source_indexstate_t *pOut, const audio_source_input_t &source, uint nAdvance )
|
|
{
|
|
for ( ; pOut->m_nPacketIndex < source.m_nPacketCount; pOut->m_nPacketIndex++ )
|
|
{
|
|
nAdvance += pOut->m_nBufferSampleOffset;
|
|
pOut->m_nBufferSampleOffset = nAdvance;
|
|
// We can skip entirely within this packet by adjusting the offset, so return
|
|
if ( nAdvance < source.m_pPackets[pOut->m_nPacketIndex].m_nSampleCount )
|
|
return 0;
|
|
|
|
nAdvance -= source.m_pPackets[pOut->m_nPacketIndex].m_nSampleCount;
|
|
pOut->m_nBufferSampleOffset = 0;
|
|
}
|
|
return nAdvance;
|
|
}
|
|
|
|
|
|
// Pulls the next block of samples from source, converts it to float in flOutput (resampling when
// the pitch-adjusted rate differs from MIX_DEFAULT_SAMPLING_RATE), and advances the read state in pOut.
//
// Steps:
//   1. work out how many source samples are required for one mix buffer
//   2. obtain those samples as contiguous 16-bit single-channel data (8-bit data is widened,
//      stereo data has one channel extracted, multiple packets are joined)
//   3. resample/convert to float
//   4. advance the source position by the number of samples reported as consumed
//
// Returns 1 when data was produced, 0 when no sample pointer could be obtained
// (unhandled sample format, or the 16-bit mono path returning NULL).
int ConvertSourceToFloat( const audio_source_input_t &source, float flPitch, float flOutput[MIX_BUFFER_SIZE], audio_source_indexstate_t *pOut )
{
	VPROF("ConvertSourceToFloat");

	// scratch space for joined samples: sized for the maximum ratio (2.0) plus rounding/alignment slack
	short nJoinedData[MIX_BUFFER_SIZE*2 + 8];

	const float flSampleRate = float(source.m_nSamplingRate) * flPitch;
	const bool bResample = ( flSampleRate != MIX_DEFAULT_SAMPLING_RATE );

	float flSampleRatio = 1.0f;
	int nSamplesNeeded = MIX_BUFFER_SIZE;
	if ( bResample )
	{
		flSampleRatio = flSampleRate * (1.0f / MIX_DEFAULT_SAMPLING_RATE);
		flSampleRatio = clamp(flSampleRatio, 0.125f, 2.0f);

		// +2: one for rounding, one so interpolation can reach the next neighbor
		nSamplesNeeded = int( (MIX_BUFFER_SIZE * flSampleRatio) + 0.5f ) + 2;

		// some of the resampling code processes in blocks of 8 samples with SSE2 instructions, so align to nearest 8
		nSamplesNeeded = AlignValue( nSamplesNeeded, 8 );
#if _DEBUG
		uint64 nSampleRefCount = ( ( ( MIX_BUFFER_SIZE * FLOAT_TO_FIXED( flSampleRatio ) ) + pOut->m_nSampleFracOffset ) >> FIX_BITS ) + 1;
		Assert( nSampleRefCount <= nSamplesNeeded );
#endif
	}

	// Grab a pointer to a joined set of sample data at the right length
	const short *pSourceData = NULL;
	switch ( source.m_nSampleFormat )
	{
	case SAMPLE_INT8_MONO:
		pSourceData = GetContiguousSamples_8Mono( source, pOut, nSamplesNeeded, nJoinedData, Q_ARRAYSIZE(nJoinedData) );
		break;

	case SAMPLE_INT16_MONO:
		pSourceData = GetContiguousSamples_16Mono( source, pOut, nSamplesNeeded, nJoinedData, Q_ARRAYSIZE(nJoinedData) );
		break;

	case SAMPLE_INT16_STEREO_L:
		pSourceData = GetContiguousSamples_16Stereo( source, pOut, nSamplesNeeded, nJoinedData, Q_ARRAYSIZE(nJoinedData), 0 );
		break;

	case SAMPLE_INT16_STEREO_R:
		pSourceData = GetContiguousSamples_16Stereo( source, pOut, nSamplesNeeded, nJoinedData, Q_ARRAYSIZE(nJoinedData), 1 );
		break;

	case SAMPLE_INT8_STEREO_L:
		pSourceData = GetContiguousSamples_8Stereo( source, pOut, nSamplesNeeded, nJoinedData, Q_ARRAYSIZE(nJoinedData), 0 );
		break;

	case SAMPLE_INT8_STEREO_R:
		pSourceData = GetContiguousSamples_8Stereo( source, pOut, nSamplesNeeded, nJoinedData, Q_ARRAYSIZE(nJoinedData), 1 );
		break;

	default:
		// any other format is left without data
		break;
	}

	if ( !pSourceData )
		return 0;

	if ( !bResample )
	{
		// rate already matches: straight short -> float conversion, nSamplesNeeded stays MIX_BUFFER_SIZE
		ConvertShortToFloatx8( flOutput, pSourceData );
	}
	else if ( flSampleRate == 11025.0f )
	{
		// dedicated path for this exact rate; the return value replaces nSamplesNeeded as the advance amount
		nSamplesNeeded = Resample16to32_4x( flOutput, pSourceData, &pOut->m_nSampleFracOffset );
	}
	else if ( flSampleRate == 22050.0f )
	{
		nSamplesNeeded = Resample16to32_2x( flOutput, pSourceData, &pOut->m_nSampleFracOffset );
	}
	else
	{
		// slow path, resample arbitrary ratio
		VPROF("Resample_Ratio");
		nSamplesNeeded = Resample16to32( flOutput, pSourceData, flSampleRatio, &pOut->m_nSampleFracOffset );
	}

	// update the index state
	AdvanceSourceIndex( pOut, source, nSamplesNeeded );
	return 1;
}
|
|
|
|
int AdvanceSource( const audio_source_input_t &source, float flPitch, audio_source_indexstate_t *pOut )
|
|
{
|
|
float flSampleRatio = 1.0f;
|
|
int nSamplesNeeded = MIX_BUFFER_SIZE;
|
|
float flSampleRate = float(source.m_nSamplingRate) * flPitch;
|
|
if ( flSampleRate != MIX_DEFAULT_SAMPLING_RATE )
|
|
{
|
|
flSampleRatio = flSampleRate * (1.0f / MIX_DEFAULT_SAMPLING_RATE);
|
|
flSampleRatio = clamp(flSampleRatio, 0.125f, 2.0f);
|
|
nSamplesNeeded = CalcAdvanceSamples( nSamplesNeeded, flSampleRatio, &pOut->m_nSampleFracOffset );
|
|
}
|
|
|
|
// update the index state
|
|
AdvanceSourceIndex( pOut, source, nSamplesNeeded );
|
|
return nSamplesNeeded;
|
|
}
|
|
|
|
// constants for linear ramping
|
|
const float flMixBufferSizeInv = 1.0f / MIX_BUFFER_SIZE;
|
|
const fltx4 g_fl4_MixBufferSizeInv = { flMixBufferSizeInv, flMixBufferSizeInv, flMixBufferSizeInv, flMixBufferSizeInv };
|
|
const fltx4 g_fl4_Sequence1234 = { 1.0, 2.0, 3.0, 4.0 };
|
|
|
|
|
|
// Writes input * scale into flOutput, four samples at a time.
// Both buffers are accessed with the aligned SIMD load/store helpers, so they must meet
// whatever alignment those helpers require.
void ScaleBuffer( float flOutput[MIX_BUFFER_SIZE], const float input[MIX_BUFFER_SIZE], float scale )
{
	const fltx4 fl4Gain = ReplicateX4(scale);
	fltx4 * RESTRICT pDst = (fltx4 *)&flOutput[0];
	fltx4 * RESTRICT pSrc = (fltx4 *)&input[0];

	for ( int nBlock = 0; nBlock < MIX_BUFFER_SIZE/4; nBlock++ )
	{
		fltx4 fl4Block = LoadAlignedSIMD( &pSrc[nBlock] );
		StoreAlignedSIMD( (float *)&pDst[nBlock], MulSIMD( fl4Gain, fl4Block ) );
	}
}
|
|
|
|
// Writes flInput multiplied by a linearly ramping gain into flOutput.
// With step = (flScaleEnd - flScaleStart) / MIX_BUFFER_SIZE, sample i (0-based) is scaled by
// flScaleStart + (i + 1) * step, so the last sample is scaled by flScaleEnd (up to rounding).
void ScaleBufferRamp( float flOutput[MIX_BUFFER_SIZE], const float flInput[MIX_BUFFER_SIZE], float flScaleStart, float flScaleEnd )
{
	fltx4 fl4Gain = ReplicateX4( flScaleStart );

	// gain change per single sample
	const fltx4 fl4PerSample = MulSIMD( g_fl4_MixBufferSizeInv, SubSIMD( ReplicateX4( flScaleEnd ), fl4Gain ) );

	// lanes start 1, 2, 3 and 4 steps into the ramp
	fl4Gain = AddSIMD( fl4Gain, MulSIMD( fl4PerSample, g_fl4_Sequence1234 ) );

	// each iteration handles four samples, so every lane moves on by four steps
	const fltx4 fl4PerBlock = MulSIMD( fl4PerSample, Four_Fours );

	fltx4 * RESTRICT pDst = (fltx4 *)&flOutput[0];
	fltx4 * RESTRICT pSrc = (fltx4 *)&flInput[0];
	for ( int nBlock = 0; nBlock < MIX_BUFFER_SIZE / 4; nBlock++ )
	{
		fltx4 fl4Block = LoadAlignedSIMD( &pSrc[nBlock] );
		StoreAlignedSIMD( (float *)&pDst[nBlock], MulSIMD( fl4Gain, fl4Block ) );
		fl4Gain = AddSIMD( fl4PerBlock, fl4Gain );
	}
}
|
|
|
|
void SilenceBuffer( float flBuffer[MIX_BUFFER_SIZE] )
|
|
{
|
|
fltx4 * RESTRICT pOut = (fltx4 *)&flBuffer[0];
|
|
fltx4 fl4Zero = LoadZeroSIMD();
|
|
for ( int i = 0; i < MIX_BUFFER_SIZE/4; i++ )
|
|
{
|
|
StoreAlignedSIMD( (float *)pOut, fl4Zero );
|
|
pOut++;
|
|
}
|
|
}
|
|
|
|
// Silences the first nBufferCount buffers of the pBuffers array, one SilenceBuffer call each.
// A count of zero or less does nothing.
void SilenceBuffers( CAudioMixBuffer *pBuffers, int nBufferCount )
{
	for ( int nRemaining = nBufferCount; nRemaining > 0; --nRemaining, ++pBuffers )
	{
		SilenceBuffer( pBuffers->m_flData );
	}
}
|
|
|
|
// Accumulates flInput * scale on top of what is already in flOutput
// (per block: MaddSIMD( scale, input, output ) is stored back to output).
void MixBuffer( float flOutput[MIX_BUFFER_SIZE], const float flInput[MIX_BUFFER_SIZE], float scale )
{
	const fltx4 fl4Gain = ReplicateX4(scale);
	fltx4 * RESTRICT pDst = (fltx4 *)&flOutput[0];
	fltx4 * RESTRICT pSrc = (fltx4 *)&flInput[0];

	for ( int nBlock = 0; nBlock < MIX_BUFFER_SIZE/4; nBlock++ )
	{
		fltx4 fl4Incoming = LoadAlignedSIMD( &pSrc[nBlock] );
		fltx4 fl4Existing = LoadAlignedSIMD( &pDst[nBlock] );
		StoreAlignedSIMD( (float *)&pDst[nBlock], MaddSIMD( fl4Gain, fl4Incoming, fl4Existing ) );
	}
}
|
|
|
|
// Accumulates flInput, multiplied by a linearly ramping gain, on top of what is already in flOutput.
// With step = (flScaleEnd - flScaleStart) / MIX_BUFFER_SIZE, sample i (0-based) uses the gain
// flScaleStart + (i + 1) * step, so the last sample uses flScaleEnd (up to rounding).
void MixBufferRamp( float flOutput[MIX_BUFFER_SIZE], const float flInput[MIX_BUFFER_SIZE], float flScaleStart, float flScaleEnd )
{
	fltx4 fl4Gain = ReplicateX4( flScaleStart );

	// gain change per single sample
	const fltx4 fl4PerSample = MulSIMD( g_fl4_MixBufferSizeInv, SubSIMD( ReplicateX4( flScaleEnd ), fl4Gain ) );

	// lanes start 1, 2, 3 and 4 steps into the ramp
	fl4Gain = AddSIMD( fl4Gain, MulSIMD( fl4PerSample, g_fl4_Sequence1234 ) );

	// each iteration handles four samples, so every lane moves on by four steps
	const fltx4 fl4PerBlock = MulSIMD( fl4PerSample, Four_Fours );

	fltx4 * RESTRICT pDst = (fltx4 *)&flOutput[0];
	fltx4 * RESTRICT pSrc = (fltx4 *)&flInput[0];
	for ( int nBlock = 0; nBlock < MIX_BUFFER_SIZE / 4; nBlock++ )
	{
		fltx4 fl4Incoming = LoadAlignedSIMD( &pSrc[nBlock] );
		fltx4 fl4Existing = LoadAlignedSIMD( &pDst[nBlock] );
		StoreAlignedSIMD( (float *)&pDst[nBlock], MaddSIMD( fl4Gain, fl4Incoming, fl4Existing ) );
		fl4Gain = AddSIMD( fl4PerBlock, fl4Gain );
	}
}
|
|
|
|
// Writes the weighted sum of two buffers into flOutput:
// per block, MaddSIMD( scale0, input0, scale1 * input1 ) is stored to the output.
// Whatever flOutput held before is overwritten.
void SumBuffer2x1( float flOutput[MIX_BUFFER_SIZE], float flInput0[MIX_BUFFER_SIZE], float flScale0, float flInput1[MIX_BUFFER_SIZE], float flScale1 )
{
	const fltx4 fl4GainA = ReplicateX4(flScale0);
	const fltx4 fl4GainB = ReplicateX4(flScale1);
	fltx4 * RESTRICT pDst = (fltx4 *)&flOutput[0];
	fltx4 * RESTRICT pSrcA = (fltx4 *)&flInput0[0];
	fltx4 * RESTRICT pSrcB = (fltx4 *)&flInput1[0];

	for ( int nBlock = 0; nBlock < MIX_BUFFER_SIZE/4; nBlock++ )
	{
		fltx4 fl4A = LoadAlignedSIMD( &pSrcA[nBlock] );
		fltx4 fl4B = LoadAlignedSIMD( &pSrcB[nBlock] );
		StoreAlignedSIMD( (float *)&pDst[nBlock], MaddSIMD( fl4GainA, fl4A, MulSIMD( fl4GainB, fl4B ) ) );
	}
}
|
|
|
|
|
|
// Exchanges the contents of two mix buffers, four samples at a time, without a temporary buffer.
void SwapBuffersInPlace( float flInput0[MIX_BUFFER_SIZE], float flInput1[MIX_BUFFER_SIZE] )
{
	fltx4 * RESTRICT pA = (fltx4 *)&flInput0[0];
	fltx4 * RESTRICT pB = (fltx4 *)&flInput1[0];

	for ( int nBlock = 0; nBlock < MIX_BUFFER_SIZE/4; nBlock++ )
	{
		// read both blocks before writing either one
		fltx4 fl4FromA = LoadAlignedSIMD( &pA[nBlock] );
		fltx4 fl4FromB = LoadAlignedSIMD( &pB[nBlock] );
		StoreAlignedSIMD( (float *)&pA[nBlock], fl4FromB );
		StoreAlignedSIMD( (float *)&pB[nBlock], fl4FromA );
	}
}
|
|
|
|
// UNDONE: OPTIMIZE: SIMD implementation
|
|
float BufferLevel( float flInput0[MIX_BUFFER_SIZE] )
|
|
{
|
|
float flAbsMax = 0.0f;
|
|
for ( int i = 0; i < MIX_BUFFER_SIZE; i++ )
|
|
{
|
|
flAbsMax = Max( flAbsMax, (float)fabs(flInput0[i]) );
|
|
}
|
|
return flAbsMax;
|
|
}
|
|
|
|
float AvergeBufferAmplitude( float flInput0[MIX_BUFFER_SIZE] )
|
|
{
|
|
float flTotal = 0;
|
|
for ( int i = 0; i < MIX_BUFFER_SIZE; i++ )
|
|
{
|
|
flTotal += fabs( flInput0[i] );
|
|
}
|
|
return flTotal * ( 1.0f / MIX_BUFFER_SIZE );
|
|
}
|