csgo-2018-source/utils/phonemeextractor/phonemeextractor.cpp

//========= Copyright © 1996-2005, Valve Corporation, All rights reserved. ============//
//
// Purpose:
//
// $NoKeywords: $
//
//=============================================================================//
// extracephonemes.cpp : Defines the entry point for the console application.
//
#define PROTECTED_THINGS_DISABLE

#include "tier0/wchartypes.h"
#include <stdio.h>
#include <windows.h>
#include <tchar.h>
#include "sphelper.h"
#include "spddkhlp.h"
// ATL Header Files
#include <atlbase.h>
// Face poser and util includes
#include "utlvector.h"
#include "phonemeextractor/PhonemeExtractor.h"
#include "PhonemeConverter.h"
#include "sentence.h"
#include "tier0/dbg.h"
#include "tier0/icommandline.h"
#include "FileSystem.h"

// Extract phoneme grammar id
#define EP_GRAM_ID			101
// First rule of dynamic sentence rule set
#define DYN_SENTENCERULE	102
// # of milliseconds to allow for processing before timeout
#define SR_WAVTIMEOUT		4000
// Weight tag for rule to rule word/rule transitions
#define CONFIDENCE_WEIGHT	0.0f

//#define LOGGING		1
#define LOGFILE		"c:\\fp.log"

void LogReset( void )
{
#if LOGGING
	FILE *fp = fopen( LOGFILE, "w" );
	if ( fp )
		fclose( fp );
#endif
}

char *va( const char *fmt, ... );

DEFINE_LOGGING_CHANNEL_NO_TAGS( LOG_PhonemeExtractor, "PhonemeExtractor" );

//-----------------------------------------------------------------------------
// Purpose:
// Input  : *words -
//-----------------------------------------------------------------------------
void LogWords( CSentence& sentence )
{
	Log_Msg( LOG_PhonemeExtractor, "Wordcount == %i\n", sentence.m_Words.Count() );

	for ( int i = 0; i < sentence.m_Words.Count(); i++ )
	{
		const CWordTag *w = sentence.m_Words[ i ];
		Log_Msg( LOG_PhonemeExtractor, "Word %s %u to %u\n", w->GetWord(), w->m_uiStartByte, w->m_uiEndByte );
	}
}

//-----------------------------------------------------------------------------
// Purpose:
// Input  : *phonemes -
//-----------------------------------------------------------------------------
void LogPhonemes( CSentence& sentence )
{
	return;

	Log_Msg( LOG_PhonemeExtractor, "Phonemecount == %i\n", sentence.CountPhonemes() );

	for ( int i = 0; i < sentence.m_Words.Count(); i++ )
	{
		const CWordTag *w = sentence.m_Words[ i ];

		for ( int j = 0; j < w->m_Phonemes.Count(); j++ )
		{
			const CPhonemeTag *p = w->m_Phonemes[ j ];
			Log_Msg( LOG_PhonemeExtractor, "Phoneme %s %u to %u\n", p->GetTag(), p->m_uiStartByte, p->m_uiEndByte );
		}
	}
}

#define NANO_CONVERT 10000000.0f;

//-----------------------------------------------------------------------------
// Purpose: Walk list of words and phonemes and create phoneme tags in CSentence object
//  FIXME:  Right now, phonemes are assumed to evenly space out across a word.
// Input  : *converter -
//			result -
//			sentence -
//-----------------------------------------------------------------------------
void EnumeratePhonemes( ISpPhoneConverter *converter, const ISpRecoResult* result, CSentence& sentence )
{
	USES_CONVERSION;

	// Grab access to element container
	ISpPhrase *phrase = ( ISpPhrase * )result;
	if ( !phrase )
		return;

    SPPHRASE *pElements;
	if ( !SUCCEEDED( phrase->GetPhrase( &pElements ) ) )
		return;

	// Only use it if it's better/same size as what we already had on-hand
	if ( pElements->Rule.ulCountOfElements > 0 )
		//(unsigned int)( sentence.m_Words.Size() - sentence.GetWordBase() ) )
	{
		sentence.ResetToBase();

		// Walk list of words
		for ( ULONG i = 0; i < pElements->Rule.ulCountOfElements; i++ )
		{
			unsigned int wordstart, wordend;

			// Get start/end sample index
			wordstart	= pElements->pElements[i].ulAudioStreamOffset + (unsigned int)pElements->ullAudioStreamPosition;
			wordend		= wordstart + pElements->pElements[i].ulAudioSizeBytes;

			// Create word tag
			CWordTag *w = new CWordTag( W2T( pElements->pElements[i].pszDisplayText ) );
			Assert( w );
			w->m_uiStartByte = wordstart;
			w->m_uiEndByte   = wordend;

			sentence.AddWordTag( w );

			// Count # of phonemes in this word
			SPPHONEID pstr[ 2 ];
			pstr[ 1 ] = 0;
			WCHAR wszPhoneme[ SP_MAX_PRON_LENGTH ];

			const SPPHONEID *current;
			SPPHONEID phoneme;
			current = pElements->pElements[i].pszPronunciation;
			float total_weight = 0.0f;
			while ( 1 )
			{
				phoneme = *current++;
				if ( !phoneme )
					break;

				pstr[ 0 ] = phoneme;
				wszPhoneme[ 0 ] = L'\0';

				converter->IdToPhone( pstr, wszPhoneme );

				total_weight += WeightForPhoneme( W2A( wszPhoneme ) );
			}

			current = pElements->pElements[i].pszPronunciation;

			// Decide # of bytes/phoneme weight
			float psize = 0;
			if ( total_weight )
			{
				psize = ( wordend - wordstart ) / total_weight;
			}

			int number = 0;

			// Re-walk the phoneme list and create true phoneme tags
			float startWeight = 0.0f;
			while ( 1 )
			{
				phoneme = *current++;
				if ( !phoneme )
					break;

				pstr[ 0 ] = phoneme;
				wszPhoneme[ 0 ] = L'\0';

				converter->IdToPhone( pstr, wszPhoneme );

				CPhonemeTag *p = new CPhonemeTag( W2A( wszPhoneme ) );
				Assert( p );

				float weight = WeightForPhoneme( W2A( wszPhoneme ) );

				p->m_uiStartByte = wordstart + (int)( startWeight * psize );
				p->m_uiEndByte	 = p->m_uiStartByte + (int)( psize * weight );

				startWeight += weight;

				// Convert to IPA phoneme code
				p->SetPhonemeCode( TextToPhoneme( p->GetTag() ) );

				sentence.AddPhonemeTag( w, p );

				number++;
			}
		}
	}

	// Free memory
    ::CoTaskMemFree(pElements);
}

//-----------------------------------------------------------------------------
// Purpose: Create rules for each word in the reference sentence
//-----------------------------------------------------------------------------
typedef struct
{
	int					ruleId;
	SPSTATEHANDLE		hRule;
	CSpDynamicString	word;
	char				plaintext[ 256 ];
} WORDRULETYPE;

//-----------------------------------------------------------------------------
// Purpose: Creates start for word of sentence
// Input  : cpRecoGrammar -
//			*root -
//			*rules -
//			word -
//-----------------------------------------------------------------------------
void AddWordRule( ISpRecoGrammar* cpRecoGrammar, SPSTATEHANDLE *root, CUtlVector< WORDRULETYPE > *rules, CSpDynamicString& word )
{
	USES_CONVERSION;
	HRESULT hr;
	WORDRULETYPE *newrule;

	int idx = (*rules).AddToTail();

	newrule = &(*rules)[ idx ];

	newrule->ruleId = DYN_SENTENCERULE + idx + 1;
	newrule->word = word;

	strcpy( newrule->plaintext, W2T( word ) );

	// Create empty rule
	hr = cpRecoGrammar->CreateNewState( *root, &newrule->hRule );
	Assert( !FAILED( hr ) );
}

//-----------------------------------------------------------------------------
// Purpose:
// Input  : cpRecoGrammar -
//			*from -
//			*to -
//-----------------------------------------------------------------------------
void AddWordTransitionRule( ISpRecoGrammar* cpRecoGrammar, WORDRULETYPE *from, WORDRULETYPE *to )
{
	USES_CONVERSION;

	HRESULT hr;
	Assert( from );

	if ( from && !to )
	{
		OutputDebugString( va( "Transition from %s to TERM\r\n", from->plaintext ) );
	}
	else
	{
		OutputDebugString( va( "Transition from %s to %s\r\n", from->plaintext, to->plaintext ) );
	}

	hr = cpRecoGrammar->AddWordTransition( from->hRule, to ? to->hRule : NULL, (WCHAR *)from->word, NULL, SPWT_LEXICAL, CONFIDENCE_WEIGHT, NULL );
	Assert( !FAILED( hr ) );
}

//-----------------------------------------------------------------------------
// Purpose:
// Input  : cpRecoGrammar -
//			*from -
//			*to -
//-----------------------------------------------------------------------------
void AddOptionalTransitionRule( ISpRecoGrammar* cpRecoGrammar, WORDRULETYPE *from, WORDRULETYPE *to )
{
	USES_CONVERSION;

	HRESULT hr;
	Assert( from );

	if ( from && !to )
	{
		OutputDebugString( va( "Opt transition from %s to TERM\r\n", from->plaintext ) );
	}
	else
	{
		OutputDebugString( va( "Opt transition from %s to %s\r\n", from->plaintext, to->plaintext ) );
	}

	hr = cpRecoGrammar->AddWordTransition( from->hRule, to ? to->hRule : NULL, NULL, NULL, SPWT_LEXICAL, CONFIDENCE_WEIGHT, NULL );
	Assert( !FAILED( hr ) );
}

#define MAX_WORD_SKIP 1
//-----------------------------------------------------------------------------
// Purpose: Links together all word rule states into a sentence rule CFG
// Input  : singleword -
//			cpRecoGrammar -
//			*root -
//			*rules -
//-----------------------------------------------------------------------------
bool BuildRules( ISpRecoGrammar* cpRecoGrammar, SPSTATEHANDLE *root, CUtlVector< WORDRULETYPE > *rules )
{
	HRESULT hr;
	WORDRULETYPE *rule, *next;

	int numrules = (*rules).Count();

	rule = &(*rules)[ 0 ];

	// Add transition
	hr = cpRecoGrammar->AddWordTransition( *root, rule->hRule, NULL, NULL, SPWT_LEXICAL, CONFIDENCE_WEIGHT, NULL );
	Assert( !FAILED( hr ) );

	for ( int i = 0; i < numrules; i++ )
	{
		rule = &(*rules)[ i ];
		if ( i < numrules - 1 )
		{
			next = &(*rules)[ i + 1 ];
		}
		else
		{
			next = NULL;
		}

		AddWordTransitionRule( cpRecoGrammar, rule, next );
	}

	if ( numrules > 1 )
	{
		for ( int skip = 1; skip <= min( MAX_WORD_SKIP, numrules ); skip++ )
		{
			OutputDebugString( va( "Opt transition from Root to %s\r\n", (*rules)[ 0 ].plaintext ) );

			hr = cpRecoGrammar->AddWordTransition( *root, (*rules)[ 0 ].hRule, NULL, NULL, SPWT_LEXICAL, CONFIDENCE_WEIGHT, NULL );

			// Now build rules where you can skip 1 to N intervening words
			for ( int i = 1; i < numrules; i++ )
			{
				// Start at the beginning?
				rule = &(*rules)[ i ];
				if ( i < numrules - skip )
				{
					next = &(*rules)[ i + skip ];
				}
				else
				{
					continue;
				}

				// Add transition
				AddOptionalTransitionRule( cpRecoGrammar, rule, next );
			}

			// Go from final rule to end point
			AddOptionalTransitionRule( cpRecoGrammar, rule, NULL );
		}
	}

	// Store it
	hr = cpRecoGrammar->Commit(NULL);
	if ( FAILED( hr ) )
		return false;

	return true;
}

//-----------------------------------------------------------------------------
// Purpose: Debugging, prints alternate list if one is created
// Input  : cpResult -
//			(*pfnPrint -
//-----------------------------------------------------------------------------
void PrintAlternates( ISpRecoResult* cpResult, void (*pfnPrint)( const char *fmt, ... ) )
{
	ISpPhraseAlt *rgPhraseAlt[ 32 ];
	memset( rgPhraseAlt, 0, sizeof( rgPhraseAlt ) );

	ULONG ulCount;

	ISpPhrase *phrase = ( ISpPhrase * )cpResult;
	if ( phrase )
	{
		SPPHRASE *pElements;
		if ( SUCCEEDED( phrase->GetPhrase( &pElements ) ) )
		{
			if ( pElements->Rule.ulCountOfElements > 0 )
			{
				HRESULT hr = cpResult->GetAlternates(
					pElements->Rule.ulFirstElement,
					pElements->Rule.ulCountOfElements,
					32,
					rgPhraseAlt,
					&ulCount);

				Assert( !FAILED( hr ) );

				for ( ULONG r = 0 ; r < ulCount; r++ )
				{
					CSpDynamicString dstrText;
					hr = rgPhraseAlt[ r ]->GetText( (ULONG)SP_GETWHOLEPHRASE, (ULONG)SP_GETWHOLEPHRASE, TRUE, &dstrText, NULL);
					Assert( !FAILED( hr ) );

					pfnPrint( "[ ALT ]" );
					pfnPrint( dstrText.CopyToChar() );
					pfnPrint( "\r\n" );
				}
			}
		}

	}

	for ( int i = 0; i < 32; i++ )
	{
		if ( rgPhraseAlt[ i ] )
		{
			rgPhraseAlt[ i ]->Release();
			rgPhraseAlt[ i ] = NULL;
		}
	}
}

void PrintWordsAndPhonemes( CSentence& sentence, void (*pfnPrint)( const char *fmt, ... ) )
{
	char sz[ 256 ];
	int i;

	pfnPrint( "WORDS\r\n\r\n" );

	for ( i = 0 ; i < sentence.m_Words.Count(); i++ )
	{
		CWordTag *word = sentence.m_Words[ i ];
		if ( !word )
			continue;

		sprintf( sz, "<%u - %u> %s\r\n",
			word->m_uiStartByte, word->m_uiEndByte, word->GetWord() );

		pfnPrint( sz );

		for ( int j = 0 ; j < word->m_Phonemes.Count(); j++ )
		{
			CPhonemeTag *phoneme = word->m_Phonemes[ j ];
			if ( !phoneme )
				continue;

			sprintf( sz, "  <%u - %u> %s\r\n",
				phoneme->m_uiStartByte, phoneme->m_uiEndByte, phoneme->GetTag() );

			pfnPrint( sz );
		}
	}

	pfnPrint( "\r\n" );
}

//-----------------------------------------------------------------------------
// Purpose: Given a wave file and a string of words "text", creates a CFG from the
//  sentence and stores the resulting words/phonemes in CSentence
// Input  : *wavname -
//			text -
//			sentence -
//			(*pfnPrint -
// Output : SR_RESULT
//-----------------------------------------------------------------------------
SR_RESULT ExtractPhonemes( const char *wavname, CSpDynamicString& text, CSentence& sentence, void (*pfnPrint)( const char *fmt, ...) )
{
	// Assume failure
	SR_RESULT result = SR_RESULT_ERROR;

	if ( text.Length() <= 0 )
	{
		pfnPrint( "Error:  no rule / text specified\n" );
		return result;
	}

	USES_CONVERSION;
	HRESULT hr;

	CUtlVector < WORDRULETYPE > wordRules;

	CComPtr<ISpStream> cpInputStream;
	CComPtr<ISpRecognizer> cpRecognizer;
	CComPtr<ISpRecoContext> cpRecoContext;
	CComPtr<ISpRecoGrammar> cpRecoGrammar;
	CComPtr<ISpPhoneConverter>  cpPhoneConv;

	// Create basic SAPI stream object
	// NOTE: The helper SpBindToFile can be used to perform the following operations
	hr = cpInputStream.CoCreateInstance(CLSID_SpStream);
	if ( FAILED( hr ) )
	{
		pfnPrint( "Error:  SAPI 5.1 Stream object not installed?\n" );
		return result;
	}

	CSpStreamFormat sInputFormat;

	// setup stream object with wav file MY_WAVE_AUDIO_FILENAME
	//   for read-only access, since it will only be access by the SR engine
	hr = cpInputStream->BindToFile(
		T2W(wavname),
		SPFM_OPEN_READONLY,
		NULL,
		sInputFormat.WaveFormatExPtr(),
		SPFEI_ALL_EVENTS );

	if ( FAILED( hr ) )
	{
		pfnPrint( "Error: couldn't open wav file %s\n", wavname );
		return result;
	}

	// Create in-process speech recognition engine
	hr = cpRecognizer.CoCreateInstance(CLSID_SpInprocRecognizer);
	if ( FAILED( hr ) )
	{
		pfnPrint( "Error:  SAPI 5.1 In process recognizer object not installed?\n" );
		return result;
	}

	// Create recognition context to receive events
	hr = cpRecognizer->CreateRecoContext(&cpRecoContext);
	if ( FAILED( hr ) )
	{
		pfnPrint( "Error:  SAPI 5.1 Unable to create recognizer context\n" );
		return result;
	}

	// Create a grammar
	hr = cpRecoContext->CreateGrammar( EP_GRAM_ID, &cpRecoGrammar );
	if ( FAILED( hr ) )
	{
		pfnPrint( "Error:  SAPI 5.1 Unable to create recognizer grammar\n" );
		return result;
	}

	LANGID englishID = 0x409; // 1033 decimal

	bool userSpecified = false;
	LANGID langID = SpGetUserDefaultUILanguage();

	// Allow commandline override
	if ( CommandLine()->FindParm( "-languageid" ) != 0 )
	{
		userSpecified = true;
		langID = CommandLine()->ParmValue( "-languageid", langID );
	}

	// Create a phoneme converter ( so we can convert to IPA codes )
	hr = SpCreatePhoneConverter( langID, NULL, NULL, &cpPhoneConv );
	if ( FAILED( hr ) )
	{
		if ( langID != englishID )
		{
			if ( userSpecified )
			{
				pfnPrint( "Warning:  SAPI 5.1 Unable to create phoneme converter for command line override -languageid %i\n", langID );
			}
			else
			{
				pfnPrint( "Warning:  SAPI 5.1 Unable to create phoneme converter for default UI language %i\n",langID );
			}

			// Try english!!!
			langID = englishID;
			hr = SpCreatePhoneConverter( langID, NULL, NULL, &cpPhoneConv );
		}

		if ( FAILED( hr ) )
		{
			pfnPrint( "Error:  SAPI 5.1 Unable to create phoneme converter for English language id %i\n", langID );
			return result;
		}
		else
		{
			pfnPrint( "Note:  SAPI 5.1 Falling back to use english -languageid %i\n", langID );
		}
	}
	else if ( userSpecified )
	{
		pfnPrint( "Note:  SAPI 5.1 Using user specified -languageid %i\n",langID );
	}

	SPSTATEHANDLE hStateRoot;
	// create/re-create Root level rule of grammar
	hr = cpRecoGrammar->GetRule(L"Root", 0, SPRAF_TopLevel | SPRAF_Active, TRUE, &hStateRoot);
	if ( FAILED( hr ) )
	{
		pfnPrint( "Error:  SAPI 5.1 Unable to create root rule\n" );
		return result;
	}

	// Inactivate it so we can alter it
	hr = cpRecoGrammar->SetRuleState( NULL, NULL, SPRS_INACTIVE );
	if ( FAILED( hr ) )
	{
		pfnPrint( "Error:  SAPI 5.1 Unable to deactivate grammar rules\n" );
		return result;
	}

	// Create the rule set from the words in text
	{
		CSpDynamicString currentWord;
		WCHAR *pos = ( WCHAR * )text;
		WCHAR str[ 2 ];
		str[1]= 0;

		while ( *pos )
		{
			if ( *pos == L' ' /*|| *pos == L'.' || *pos == L'-'*/ )
			{
				// Add word to rule set
				if ( currentWord.Length() > 0 )
				{
					AddWordRule( cpRecoGrammar, &hStateRoot, &wordRules, currentWord );
					currentWord.Clear();
				}
				pos++;
				continue;
			}

			// Skip anything that's inside a [ xxx ] pair.
			if ( *pos == L'[' )
			{
				while ( *pos && *pos != L']' )
				{
					pos++;
				}

				if ( *pos )
				{
					pos++;
				}
				continue;
			}

			str[ 0 ] = *pos;

			currentWord.Append( str );
			pos++;
		}

		if ( currentWord.Length() > 0 )
		{
			AddWordRule( cpRecoGrammar, &hStateRoot, &wordRules, currentWord );
		}

		if ( wordRules.Count() <= 0 )
		{
			pfnPrint( "Error:  Text %s contained no usable words\n", text );
			return result;
		}

		// Build all word to word transitions in the grammar
		if ( !BuildRules( cpRecoGrammar, &hStateRoot, &wordRules ) )
		{
			pfnPrint( "Error:  Rule set for %s could not be generated\n", text );
			return result;
		}
	}

	// check for recognitions and end of stream event
	const ULONGLONG ullInterest =
		SPFEI(SPEI_RECOGNITION) | SPFEI(SPEI_END_SR_STREAM) | SPFEI(SPEI_FALSE_RECOGNITION) |
		SPFEI(SPEI_PHRASE_START ) | SPFEI(SPEI_HYPOTHESIS ) | SPFEI(SPEI_INTERFERENCE) ;
	hr = cpRecoContext->SetInterest( ullInterest, ullInterest );
	if ( FAILED( hr ) )
	{
		pfnPrint( "Error:  SAPI 5.1 Unable to set interest level\n" );
		return result;
	}
	// use Win32 events for command-line style application
	hr = cpRecoContext->SetNotifyWin32Event();
	if ( FAILED( hr ) )
	{
		pfnPrint( "Error:  SAPI 5.1 Unable to set win32 notify event\n" );
		return result;
	}
	// connect wav input to recognizer
	// SAPI will negotiate mismatched engine/input audio formats using system audio codecs, so second parameter is not important - use default of TRUE
	hr = cpRecognizer->SetInput(cpInputStream, TRUE);
	if ( FAILED( hr ) )
	{
		pfnPrint( "Error:  SAPI 5.1 Unable to associate input stream\n" );
		return result;
	}

	// Activate the CFG ( rather than using dictation )
	hr = cpRecoGrammar->SetRuleState( NULL, NULL, SPRS_ACTIVE );
	if ( FAILED( hr ) )
	{
		switch ( hr )
		{
		case E_INVALIDARG:
			pfnPrint( "pszName is invalid or bad. Alternatively, pReserved is non-NULL\n" );
			break;
		case SP_STREAM_UNINITIALIZED:
			pfnPrint( "ISpRecognizer::SetInput has not been called with the InProc recognizer\n" );
			break;
		case SPERR_UNINITIALIZED:
			pfnPrint( "The object has not been properly initialized.\n");
			break;
		case SPERR_UNSUPPORTED_FORMAT:
			pfnPrint( "Audio format is bad or is not recognized. Alternatively, the device driver may be busy by another application and cannot be accessed.\n" );
			break;
		case SPERR_NOT_TOPLEVEL_RULE:
			pfnPrint( "The rule pszName exists, but is not a top-level rule.\n" );
			break;
		default:
			pfnPrint( "Unknown error\n" );
			break;
		}
		pfnPrint( "Error:  SAPI 5.1 Unable to activate rule set\n" );
		return result;
	}

	// while events occur, continue processing
	// timeout should be greater than the audio stream length, or a reasonable amount of time expected to pass before no more recognitions are expected in an audio stream
	BOOL fEndStreamReached = FALSE;
	while (!fEndStreamReached && S_OK == cpRecoContext->WaitForNotifyEvent( SR_WAVTIMEOUT ))
	{
		CSpEvent spEvent;
		// pull all queued events from the reco context's event queue

		while (!fEndStreamReached && S_OK == spEvent.GetFrom(cpRecoContext))
		{
			// Check event type
			switch (spEvent.eEventId)
			{
			case SPEI_INTERFERENCE:
				{
					SPINTERFERENCE interference = spEvent.Interference();

					switch ( interference )
					{
					case SPINTERFERENCE_NONE:
						pfnPrint( "[ I None ]\r\n" );
						break;
					case SPINTERFERENCE_NOISE:
						pfnPrint( "[ I Noise ]\r\n" );
						break;
					case SPINTERFERENCE_NOSIGNAL:
						pfnPrint( "[ I No Signal ]\r\n" );
						break;
					case SPINTERFERENCE_TOOLOUD:
						pfnPrint( "[ I Too Loud ]\r\n" );
						break;
					case SPINTERFERENCE_TOOQUIET:
						pfnPrint( "[ I Too Quiet ]\r\n" );
						break;
					case SPINTERFERENCE_TOOFAST:
						pfnPrint( "[ I Too Fast ]\r\n" );
						break;
					case SPINTERFERENCE_TOOSLOW:
						pfnPrint( "[ I Too Slow ]\r\n" );
						break;
					default:
						break;
					}
				}
				break;
			case SPEI_PHRASE_START:
				pfnPrint( "Phrase Start\r\n" );
				sentence.MarkNewPhraseBase();
				break;

			case SPEI_HYPOTHESIS:
			case SPEI_RECOGNITION:
			case SPEI_FALSE_RECOGNITION:
				{
                    CComPtr<ISpRecoResult> cpResult;
                    cpResult = spEvent.RecoResult();

                    CSpDynamicString dstrText;
                    if (spEvent.eEventId == SPEI_FALSE_RECOGNITION)
                    {
                        dstrText = L"(Unrecognized)";

						result = SR_RESULT_FAILED;

						// It's possible that the failed recog might have more words, so see if that's the case
						EnumeratePhonemes( cpPhoneConv, cpResult, sentence );
					}
                    else
                    {
						// Hypothesis or recognition success
                        cpResult->GetText( (ULONG)SP_GETWHOLEPHRASE, (ULONG)SP_GETWHOLEPHRASE, TRUE, &dstrText, NULL);

						EnumeratePhonemes( cpPhoneConv, cpResult, sentence );

						if ( spEvent.eEventId == SPEI_RECOGNITION )
						{
							result = SR_RESULT_SUCCESS;
						}

						pfnPrint( va( "%s%s\r\n", spEvent.eEventId == SPEI_HYPOTHESIS ? "[ Hypothesis ] " : "", dstrText.CopyToChar() ) );
					}

                    cpResult.Release();
				}
				break;
				// end of the wav file was reached by the speech recognition engine
            case SPEI_END_SR_STREAM:
				fEndStreamReached = TRUE;
				break;
			}

			// clear any event data/object references
			spEvent.Clear();
		}// END event pulling loop - break on empty event queue OR end stream
	}// END event polling loop - break on event timeout OR end stream

	// Deactivate rule
	hr = cpRecoGrammar->SetRuleState( NULL, NULL, SPRS_INACTIVE );
	if ( FAILED( hr ) )
	{
		pfnPrint( "Error:  SAPI 5.1 Unable to deactivate rule set\n" );
		return result;
	}

	// close the input stream, since we're done with it
	// NOTE: smart pointer will call SpStream's destructor, and consequently ::Close, but code may want to check for errors on ::Close operation
	hr = cpInputStream->Close();
	if ( FAILED( hr ) )
	{
		pfnPrint( "Error:  SAPI 5.1 Unable to close input stream\n" );
		return result;
	}

	return result;
}

//-----------------------------------------------------------------------------
// Purpose: HACK HACK:  We have to delete the RecoContext key or sapi starts to train
//  itself on each iteration which was causing some problems.
// Input  : hKey -
//-----------------------------------------------------------------------------
void RecursiveRegDelKey(HKEY hKey)
{
	char keyname[256]={0};
	DWORD namesize=256;

	//base case: no subkeys when RegEnumKeyEx returns error on index 0
	LONG lResult=RegEnumKeyEx(hKey,0,keyname,&namesize,NULL,NULL,NULL,NULL);
	if (lResult!=ERROR_SUCCESS)
	{
		return;
	}

	do
	{
		HKEY subkey;
		LONG lResult2;
		LONG lDelResult;
		lResult2=RegOpenKeyEx(hKey,keyname,0,KEY_ALL_ACCESS,&subkey);

		if (lResult2==ERROR_SUCCESS)
		{
			RecursiveRegDelKey(subkey);

			RegCloseKey(subkey);
			lDelResult=RegDeleteKey(hKey,keyname);
			namesize=256;
			//use 0 in the next function call because when you delete one, the rest shift down!
			lResult=RegEnumKeyEx(hKey,0,keyname,&namesize,NULL,NULL,NULL,NULL);
		}

		else
		{
			break;
		}

	} while (lResult!=ERROR_NO_MORE_ITEMS);
}

bool IsUseable( CWordTag *word )
{
	if ( word->m_uiStartByte || word->m_uiEndByte )
		return true;

	return false;
}

int FindLastUsableWord( CSentence& outwords )
{
	int numwords = outwords.m_Words.Count();
	if ( numwords < 1 )
	{
		Assert( 0 );
		return -1;
	}

	for ( int i = numwords-1; i >= 0; i-- )
	{
		CWordTag *check = outwords.m_Words[ i ];
		if ( IsUseable( check ) )
		{
			return i;
		}
	}

	return -1;
}


int FindFirstUsableWord( CSentence& outwords )
{
	int numwords = outwords.m_Words.Count();
	if ( numwords < 1 )
	{
		Assert( 0 );
		return -1;
	}

	for ( int i = 0; i < numwords; i++ )
	{
		CWordTag *check = outwords.m_Words[ i ];
		if ( IsUseable( check ) )
		{
			return i;
		}
	}

	return -1;
}

//-----------------------------------------------------------------------------
// Purpose: Counts words which have either a valid start or end byte
// Input  : *outwords -
// Output : int
//-----------------------------------------------------------------------------
int CountUsableWords( CSentence& outwords )
{
	int count = 0;
	int numwords = outwords.m_Words.Count();
	// Nothing to do
	if ( numwords <= 0 )
		return count;

	for ( int i = 0; i < numwords; i++ )
	{
		CWordTag *word = outwords.m_Words[ i ];
		if ( !IsUseable( word ) )
			continue;

		count++;
	}

	return count;
}


//-----------------------------------------------------------------------------
// Purpose: Counts words which have either a valid start or end byte
// Input  : *outwords -
// Output : int
//-----------------------------------------------------------------------------
int CountUnuseableWords( CSentence& outwords )
{
	int count = 0;
	int numwords = outwords.m_Words.Count();
	// Nothing to do
	if ( numwords <= 0 )
		return count;

	for ( int i = 0; i < numwords; i++ )
	{
		CWordTag *word = outwords.m_Words[ i ];
		if ( IsUseable( word ) )
			continue;

		count++;
	}

	return count;
}

// Keeps same relative spacing, but rebases list
void RepartitionPhonemes( CWordTag *word, unsigned int oldStart, unsigned int oldEnd )
{
	// Repartition phonemes based on old range
	float oldRange = ( float )( oldEnd - oldStart );
	float newRange = ( float )( word->m_uiEndByte - word->m_uiStartByte );

	for ( int i = 0; i < word->m_Phonemes.Count(); i++ )
	{
		CPhonemeTag *tag = word->m_Phonemes[ i ];
		Assert( tag );

		float frac1 = 0.0f, frac2 = 0.0f;
		float delta1, delta2;

		delta1 = ( float ) ( tag->m_uiStartByte - oldStart );
		delta2 = ( float ) ( tag->m_uiEndByte - oldStart );
		if ( oldRange > 0.0f )
		{
			frac1 = delta1 / oldRange;
			frac2 = delta2 / oldRange;
		}

		tag->m_uiStartByte = word->m_uiStartByte + ( unsigned int ) ( frac1 * newRange );
		tag->m_uiEndByte = word->m_uiStartByte +  ( unsigned int ) ( frac2 * newRange );
	}
}

void PartitionWords( CSentence& outwords, int start, int end, int sampleStart, int sampleEnd )
{
	int wordCount = end - start + 1;
	Assert( wordCount >= 1 );
	int stepSize  = ( sampleEnd - sampleStart ) / wordCount;

	int currentStart = sampleStart;

	for ( int i = start; i <= end; i++ )
	{
		CWordTag *word = outwords.m_Words[ i ];
		Assert( word );

		unsigned int oldStart = word->m_uiStartByte;
		unsigned int oldEnd = word->m_uiEndByte;

		word->m_uiStartByte = currentStart;
		word->m_uiEndByte = currentStart + stepSize;

		RepartitionPhonemes( word, oldStart, oldEnd );

		currentStart += stepSize;
	}
}

void MergeWords( CWordTag *w1, CWordTag *w2 )
{
	unsigned int start, end;

	start = min( w1->m_uiStartByte, w2->m_uiStartByte );
	end = max( w1->m_uiEndByte, w2->m_uiEndByte );

	unsigned int mid = ( start + end ) / 2;

	unsigned int oldw1start, oldw2start, oldw1end, oldw2end;

	oldw1start = w1->m_uiStartByte;
	oldw2start = w2->m_uiStartByte;
	oldw1end = w1->m_uiEndByte;
	oldw2end = w2->m_uiEndByte;

	w1->m_uiStartByte = start;
	w1->m_uiEndByte = mid;
	w2->m_uiStartByte = mid;
	w2->m_uiEndByte = end;

	RepartitionPhonemes( w1, oldw1start, oldw1end );
	RepartitionPhonemes( w2, oldw2start, oldw2end );
}

void FixupZeroLengthWords( CSentence& outwords )
{
	while ( 1 )
	{
		int i;
		for ( i = 0 ; i < outwords.m_Words.Count() - 1; i++ )
		{
			CWordTag *current, *next;

			current = outwords.m_Words[ i ];
			next = outwords.m_Words[ i + 1 ];

			if ( current->m_uiEndByte - current->m_uiStartByte <= 0 )
			{
				MergeWords( current, next );
				break;
			}

			if ( next->m_uiEndByte - next->m_uiStartByte <= 0 )
			{
				MergeWords( current, next );
				break;
			}
		}

		if ( i >= outwords.m_Words.Count() - 1 )
		{
			break;
		}
	}
}

void ComputeMissingByteSpans( int numsamples, CSentence& outwords )
{
	int numwords = outwords.m_Words.Count();
	// Nothing to do
	if ( numwords <= 0 )
		return;

	int interationcount = 1;

	while( 1 )
	{
		Log_Msg( LOG_PhonemeExtractor, "\nCompute %i\n", interationcount++ );
		LogWords( outwords );

		int wordNumber;

		// Done!
		if ( !CountUnuseableWords( outwords ) )
		{
			FixupZeroLengthWords( outwords );
			break;
		}

		if ( !CountUsableWords( outwords ) )
		{
			// Evenly space words across full sample time
			PartitionWords( outwords, 0, numwords - 1, 0, numsamples );
			break;
		}

		wordNumber = FindFirstUsableWord( outwords );
		// Not the first word
		if ( wordNumber > 0 )
		{
			// Repartition all of the unusables and the first one starting at zero over the range
			CWordTag *firstUsable = outwords.m_Words[ wordNumber ];
			Assert( firstUsable );

			if ( firstUsable->m_uiStartByte != 0 )
			{
				PartitionWords( outwords, 0, wordNumber - 1, 0, firstUsable->m_uiStartByte );
			}
			else
			{
				PartitionWords( outwords, 0, wordNumber, 0, firstUsable->m_uiEndByte );
			}

			// Start over
			continue;
		}

		wordNumber = FindLastUsableWord( outwords );
		// Not the last word
		if ( wordNumber >= 0 && wordNumber < numwords - 1 )
		{
			// Repartition all of the unusables and the first one starting at zero over the range
			CWordTag *lastUsable = outwords.m_Words[ wordNumber ];
			Assert( lastUsable );

			if ( lastUsable->m_uiEndByte != (unsigned int)numsamples )
			{
				PartitionWords( outwords, wordNumber + 1, numwords-1, lastUsable->m_uiEndByte, numsamples );
			}
			else
			{
				PartitionWords( outwords, wordNumber, numwords-1, lastUsable->m_uiStartByte, numsamples );
			}

			// Start over
			continue;
		}

		// If we get here it means that the start and end of the list are okay and we just have to
		//  iterate across the list and fix things in the middle
		int startByte = 0;
		int endByte = 0;
		for ( int i = 0; i < numwords ; i++ )
		{
			CWordTag *word = outwords.m_Words[ i ];
			if ( IsUseable( word ) )
			{
				startByte = word->m_uiEndByte;
				continue;
			}

			// Found the start of a chain of 1 or more unusable words
			// Find the startbyte of the next usable word and count how many words we check
			int wordCount = 1;
			for ( int j = i + 1; j < numwords; j++ )
			{
				CWordTag *next = outwords.m_Words[ j ];
				if ( IsUseable( next ) )
				{
					endByte = next->m_uiStartByte;
					break;
				}

				wordCount++;
			}

			// Now partition words across the gap and go to start again
			PartitionWords( outwords, i, i + wordCount - 1, startByte, endByte );
			break;
		}
	}
}

//-----------------------------------------------------------------------------
// Purpose: Given a wavfile and a list of inwords, determines the word/phonene
//  sample counts for the sentce
// Input  : *wavfile -
//			*inwords -
//			*outphonemes{	text.Clear( -
// Output : SR_RESULT
//-----------------------------------------------------------------------------
static SR_RESULT SAPI_ExtractPhonemes(
	const char *wavfile,
	int numsamples,
	void (*pfnPrint)( const char *fmt, ... ),
	CSentence& inwords,
	CSentence& outwords )
{
	LogReset();

	USES_CONVERSION;

	CSpDynamicString text;
	text.Clear();

	HKEY hkwipe;
	LONG lResult = RegOpenKeyEx( HKEY_CURRENT_USER, "Software\\Microsoft\\Speech\\RecoProfiles", 0, KEY_ALL_ACCESS, &hkwipe );
	if ( lResult == ERROR_SUCCESS )
	{
		RecursiveRegDelKey( hkwipe );
		RegCloseKey( hkwipe );
	}

	if ( strlen( inwords.GetText() ) <= 0 )
	{
		inwords.SetTextFromWords();
	}

	// Construct a string from the inwords array
	text.Append( T2W( inwords.GetText() ) );

	// Assume failure
	SR_RESULT result = SR_RESULT_ERROR;

	if ( text.Length() > 0 )
	{
		CSentence sentence;

		pfnPrint( "Processing...\r\n" );

		// Give it a try
		result = ExtractPhonemes( wavfile, text, sentence, pfnPrint );

		pfnPrint( "Finished.\r\n" );
		// PrintWordsAndPhonemes( sentence, pfnPrint );

		// Copy results to outputs
		outwords.Reset();

		outwords.SetText( inwords.GetText() );

		Log_Msg( LOG_PhonemeExtractor, "Starting\n" );
		LogWords( inwords );

		if ( SR_RESULT_ERROR != result )
		{
			int i;

			Log_Msg( LOG_PhonemeExtractor, "Hypothesized\n" );
			LogWords( sentence );

			for( i = 0 ; i < sentence.m_Words.Count(); i++ )
			{
				CWordTag *tag = sentence.m_Words[ i ];
				if ( tag )
				{
					// Skip '...' tag
					if ( stricmp( tag->GetWord(), "..." ) )
					{
						CWordTag *newTag = new CWordTag( *tag );

						outwords.m_Words.AddToTail( newTag );
					}
				}
			}

			// Now insert unrecognized/skipped words from original list
			//
			int frompos = 0, topos = 0;

			while( 1 )
			{
				// End of source list
				if ( frompos >= inwords.m_Words.Count() )
					break;

				const CWordTag *fromTag = inwords.m_Words[ frompos ];

				// Reached end of destination list, just copy words over from from source list until
				//  we run out of source words
				if ( topos >= outwords.m_Words.Count() )
				{
					// Just copy words over
					CWordTag *newWord = new CWordTag( *fromTag );

					// Remove phonemes
					while ( newWord->m_Phonemes.Count() > 0 )
					{
						CPhonemeTag *kill = newWord->m_Phonemes[ 0 ];
						newWord->m_Phonemes.Remove( 0 );
						delete kill;
					}

					outwords.m_Words.AddToTail( newWord );
					frompos++;
					topos++;
					continue;
				}

				// Destination word
				const CWordTag *toTag = outwords.m_Words[ topos ];

				// Words match, just skip ahead
				if ( !stricmp( fromTag->GetWord(), toTag->GetWord() ) )
				{
					frompos++;
					topos++;
					continue;
				}

				// The only case we handle is that something in the source wasn't in the destination

				// Find the next source word that appears in the destination
				int skipAhead = frompos + 1;
				bool found = false;
				while ( skipAhead < inwords.m_Words.Count() )
				{
					const CWordTag *sourceWord = inwords.m_Words[ skipAhead ];
					if ( !stricmp( sourceWord->GetWord(), toTag->GetWord() ) )
					{
						found = true;
						break;
					}

					skipAhead++;
				}

				// Uh oh destination has words that are not in source, just skip to next destination word?
				if ( !found )
				{
					topos++;
				}
				else
				{
					// Copy words from from source list into destination
					//
					int skipCount = skipAhead - frompos;

					while ( --skipCount>= 0 )
					{
						const CWordTag *sourceWord = inwords.m_Words[ frompos++ ];
						CWordTag *newWord = new CWordTag( *sourceWord );

						// Remove phonemes
						while ( newWord->m_Phonemes.Count() > 0 )
						{
							CPhonemeTag *kill = newWord->m_Phonemes[ 0 ];
							newWord->m_Phonemes.Remove( 0 );
							delete kill;
						}

						outwords.m_Words.InsertBefore( topos, newWord );
						topos++;
					}

					frompos++;
					topos++;
				}
			}

			Log_Msg( LOG_PhonemeExtractor, "\nDone simple check\n" );

			LogWords( outwords );
			LogPhonemes( outwords );

			ComputeMissingByteSpans( numsamples, outwords );

			Log_Msg( LOG_PhonemeExtractor, "\nFinal check\n" );

			LogWords( outwords );
			LogPhonemes( outwords );
		}
	}
	else
	{
		pfnPrint( "Input sentence is empty!\n" );
	}

	// Return results
	return result;
}


//-----------------------------------------------------------------------------
// Purpose: Expose the interface
//-----------------------------------------------------------------------------
class CPhonemeExtractorSAPI : public IPhonemeExtractor
{
public:
	virtual PE_APITYPE	GetAPIType() const
	{
		return SPEECH_API_SAPI;
	}

	// Used for menus, etc
	virtual char const *GetName() const
	{
		return "MS SAPI 5.1";
	}

	SR_RESULT Extract(
		const char *wavfile,
		int numsamples,
		void (*pfnPrint)( const char *fmt, ... ),
		CSentence& inwords,
		CSentence& outwords )
	{
		return SAPI_ExtractPhonemes( wavfile, numsamples, pfnPrint, inwords, outwords );
	}
};

EXPOSE_SINGLE_INTERFACE( CPhonemeExtractorSAPI, IPhonemeExtractor, VPHONEME_EXTRACTOR_INTERFACE );