mirror of
https://github.com/alliedmodders/hl2sdk.git
synced 2025-01-05 17:13:36 +08:00
1425 lines
37 KiB
C++
1425 lines
37 KiB
C++
|
//========= Copyright Valve Corporation, All rights reserved. ============//
|
||
|
//
|
||
|
// Purpose:
|
||
|
//
|
||
|
// $NoKeywords: $
|
||
|
//
|
||
|
//=============================================================================//
|
||
|
// extracephonemes.cpp : Defines the entry point for the console application.
|
||
|
//
|
||
|
#define PROTECTED_THINGS_DISABLE
|
||
|
|
||
|
#include "tier0/wchartypes.h"
|
||
|
#include <stdio.h>
|
||
|
#include <windows.h>
|
||
|
#include <tchar.h>
|
||
|
#include "sphelper.h"
|
||
|
#include "spddkhlp.h"
|
||
|
// ATL Header Files
|
||
|
#include <atlbase.h>
|
||
|
// Face poser and util includes
|
||
|
#include "utlvector.h"
|
||
|
#include "phonemeextractor/PhonemeExtractor.h"
|
||
|
#include "PhonemeConverter.h"
|
||
|
#include "sentence.h"
|
||
|
#include "tier0/dbg.h"
|
||
|
#include "tier0/icommandline.h"
|
||
|
#include "filesystem.h"
|
||
|
|
||
|
// Extract phoneme grammar id
|
||
|
#define EP_GRAM_ID 101
|
||
|
// First rule of dynamic sentence rule set
|
||
|
#define DYN_SENTENCERULE 102
|
||
|
// # of milliseconds to allow for processing before timeout
|
||
|
#define SR_WAVTIMEOUT 4000
|
||
|
// Weight tag for rule to rule word/rule transitions
|
||
|
#define CONFIDENCE_WEIGHT 0.0f
|
||
|
|
||
|
//#define LOGGING 1
|
||
|
#define LOGFILE "c:\\fp.log"
|
||
|
|
||
|
//-----------------------------------------------------------------------------
// Purpose: Empties the log file when LOGGING is enabled; compiles to a no-op
//			otherwise
//-----------------------------------------------------------------------------
void LogReset( void )
{
#if LOGGING
	// Opening in "w" mode truncates any previous contents
	if ( FILE *logFile = fopen( LOGFILE, "w" ) )
	{
		fclose( logFile );
	}
#endif
}
|
||
|
|
||
|
char *va( const char *fmt, ... );
|
||
|
|
||
|
//-----------------------------------------------------------------------------
|
||
|
// Purpose:
|
||
|
// Input : *words -
|
||
|
//-----------------------------------------------------------------------------
|
||
|
void LogWords( CSentence& sentence )
|
||
|
{
|
||
|
Log( "Wordcount == %i\n", sentence.m_Words.Size() );
|
||
|
|
||
|
for ( int i = 0; i < sentence.m_Words.Size(); i++ )
|
||
|
{
|
||
|
const CWordTag *w = sentence.m_Words[ i ];
|
||
|
Log( "Word %s %u to %u\n", w->GetWord(), w->m_uiStartByte, w->m_uiEndByte );
|
||
|
}
|
||
|
}
|
||
|
|
||
|
//-----------------------------------------------------------------------------
|
||
|
// Purpose:
|
||
|
// Input : *phonemes -
|
||
|
//-----------------------------------------------------------------------------
|
||
|
void LogPhonemes( CSentence& sentence )
|
||
|
{
|
||
|
return;
|
||
|
|
||
|
Log( "Phonemecount == %i\n", sentence.CountPhonemes() );
|
||
|
|
||
|
for ( int i = 0; i < sentence.m_Words.Size(); i++ )
|
||
|
{
|
||
|
const CWordTag *w = sentence.m_Words[ i ];
|
||
|
|
||
|
for ( int j = 0; j < w->m_Phonemes.Size(); j++ )
|
||
|
{
|
||
|
const CPhonemeTag *p = w->m_Phonemes[ j ];
|
||
|
Log( "Phoneme %s %u to %u\n", p->GetTag(), p->m_uiStartByte, p->m_uiEndByte );
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
|
||
|
// Conversion factor of 1e7.
// NOTE(review): presumably the number of 100ns stream-time units per second -- confirm against users.
// No trailing ';' so the macro can be used in the middle of an expression.
#define NANO_CONVERT	10000000.0f
|
||
|
|
||
|
//-----------------------------------------------------------------------------
|
||
|
// Purpose: Walk list of words and phonemes and create phoneme tags in CSentence object
|
||
|
// FIXME: Right now, phonemes are assumed to evenly space out across a word.
|
||
|
// Input : *converter -
|
||
|
// result -
|
||
|
// sentence -
|
||
|
//-----------------------------------------------------------------------------
|
||
|
void EnumeratePhonemes( ISpPhoneConverter *converter, const ISpRecoResult* result, CSentence& sentence )
|
||
|
{
|
||
|
USES_CONVERSION;
|
||
|
|
||
|
// Grab access to element container
|
||
|
ISpPhrase *phrase = ( ISpPhrase * )result;
|
||
|
if ( !phrase )
|
||
|
return;
|
||
|
|
||
|
SPPHRASE *pElements;
|
||
|
if ( !SUCCEEDED( phrase->GetPhrase( &pElements ) ) )
|
||
|
return;
|
||
|
|
||
|
// Only use it if it's better/same size as what we already had on-hand
|
||
|
if ( pElements->Rule.ulCountOfElements > 0 )
|
||
|
//(unsigned int)( sentence.m_Words.Size() - sentence.GetWordBase() ) )
|
||
|
{
|
||
|
sentence.ResetToBase();
|
||
|
|
||
|
// Walk list of words
|
||
|
for ( ULONG i = 0; i < pElements->Rule.ulCountOfElements; i++ )
|
||
|
{
|
||
|
unsigned int wordstart, wordend;
|
||
|
|
||
|
// Get start/end sample index
|
||
|
wordstart = pElements->pElements[i].ulAudioStreamOffset + (unsigned int)pElements->ullAudioStreamPosition;
|
||
|
wordend = wordstart + pElements->pElements[i].ulAudioSizeBytes;
|
||
|
|
||
|
// Create word tag
|
||
|
CWordTag *w = new CWordTag( W2T( pElements->pElements[i].pszDisplayText ) );
|
||
|
Assert( w );
|
||
|
w->m_uiStartByte = wordstart;
|
||
|
w->m_uiEndByte = wordend;
|
||
|
|
||
|
sentence.AddWordTag( w );
|
||
|
|
||
|
// Count # of phonemes in this word
|
||
|
SPPHONEID pstr[ 2 ];
|
||
|
pstr[ 1 ] = 0;
|
||
|
WCHAR wszPhoneme[ SP_MAX_PRON_LENGTH ];
|
||
|
|
||
|
const SPPHONEID *current;
|
||
|
SPPHONEID phoneme;
|
||
|
current = pElements->pElements[i].pszPronunciation;
|
||
|
float total_weight = 0.0f;
|
||
|
while ( 1 )
|
||
|
{
|
||
|
phoneme = *current++;
|
||
|
if ( !phoneme )
|
||
|
break;
|
||
|
|
||
|
pstr[ 0 ] = phoneme;
|
||
|
wszPhoneme[ 0 ] = L'\0';
|
||
|
|
||
|
converter->IdToPhone( pstr, wszPhoneme );
|
||
|
|
||
|
total_weight += WeightForPhoneme( W2A( wszPhoneme ) );
|
||
|
}
|
||
|
|
||
|
current = pElements->pElements[i].pszPronunciation;
|
||
|
|
||
|
// Decide # of bytes/phoneme weight
|
||
|
float psize = 0;
|
||
|
if ( total_weight )
|
||
|
{
|
||
|
psize = ( wordend - wordstart ) / total_weight;
|
||
|
}
|
||
|
|
||
|
int number = 0;
|
||
|
|
||
|
// Re-walk the phoneme list and create true phoneme tags
|
||
|
float startWeight = 0.0f;
|
||
|
while ( 1 )
|
||
|
{
|
||
|
phoneme = *current++;
|
||
|
if ( !phoneme )
|
||
|
break;
|
||
|
|
||
|
pstr[ 0 ] = phoneme;
|
||
|
wszPhoneme[ 0 ] = L'\0';
|
||
|
|
||
|
converter->IdToPhone( pstr, wszPhoneme );
|
||
|
|
||
|
CPhonemeTag *p = new CPhonemeTag( W2A( wszPhoneme ) );
|
||
|
Assert( p );
|
||
|
|
||
|
float weight = WeightForPhoneme( W2A( wszPhoneme ) );
|
||
|
|
||
|
p->m_uiStartByte = wordstart + (int)( startWeight * psize );
|
||
|
p->m_uiEndByte = p->m_uiStartByte + (int)( psize * weight );
|
||
|
|
||
|
startWeight += weight;
|
||
|
|
||
|
// Convert to IPA phoneme code
|
||
|
p->SetPhonemeCode( TextToPhoneme( p->GetTag() ) );
|
||
|
|
||
|
sentence.AddPhonemeTag( w, p );
|
||
|
|
||
|
number++;
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
|
||
|
// Free memory
|
||
|
::CoTaskMemFree(pElements);
|
||
|
}
|
||
|
|
||
|
//-----------------------------------------------------------------------------
|
||
|
// Purpose: Create rules for each word in the reference sentence
|
||
|
//-----------------------------------------------------------------------------
|
||
|
typedef struct
|
||
|
{
|
||
|
int ruleId;
|
||
|
SPSTATEHANDLE hRule;
|
||
|
CSpDynamicString word;
|
||
|
char plaintext[ 256 ];
|
||
|
} WORDRULETYPE;
|
||
|
|
||
|
//-----------------------------------------------------------------------------
|
||
|
// Purpose: Creates start for word of sentence
|
||
|
// Input : cpRecoGrammar -
|
||
|
// *root -
|
||
|
// *rules -
|
||
|
// word -
|
||
|
//-----------------------------------------------------------------------------
|
||
|
void AddWordRule( ISpRecoGrammar* cpRecoGrammar, SPSTATEHANDLE *root, CUtlVector< WORDRULETYPE > *rules, CSpDynamicString& word )
|
||
|
{
|
||
|
USES_CONVERSION;
|
||
|
HRESULT hr;
|
||
|
WORDRULETYPE *newrule;
|
||
|
|
||
|
int idx = (*rules).AddToTail();
|
||
|
|
||
|
newrule = &(*rules)[ idx ];
|
||
|
|
||
|
newrule->ruleId = DYN_SENTENCERULE + idx + 1;
|
||
|
newrule->word = word;
|
||
|
|
||
|
strcpy( newrule->plaintext, W2T( word ) );
|
||
|
|
||
|
// Create empty rule
|
||
|
hr = cpRecoGrammar->CreateNewState( *root, &newrule->hRule );
|
||
|
Assert( !FAILED( hr ) );
|
||
|
}
|
||
|
|
||
|
//-----------------------------------------------------------------------------
|
||
|
// Purpose:
|
||
|
// Input : cpRecoGrammar -
|
||
|
// *from -
|
||
|
// *to -
|
||
|
//-----------------------------------------------------------------------------
|
||
|
void AddWordTransitionRule( ISpRecoGrammar* cpRecoGrammar, WORDRULETYPE *from, WORDRULETYPE *to )
|
||
|
{
|
||
|
USES_CONVERSION;
|
||
|
|
||
|
HRESULT hr;
|
||
|
Assert( from );
|
||
|
|
||
|
if ( from && !to )
|
||
|
{
|
||
|
OutputDebugString( va( "Transition from %s to TERM\r\n", from->plaintext ) );
|
||
|
}
|
||
|
else
|
||
|
{
|
||
|
OutputDebugString( va( "Transition from %s to %s\r\n", from->plaintext, to->plaintext ) );
|
||
|
}
|
||
|
|
||
|
hr = cpRecoGrammar->AddWordTransition( from->hRule, to ? to->hRule : NULL, (WCHAR *)from->word, NULL, SPWT_LEXICAL, CONFIDENCE_WEIGHT, NULL );
|
||
|
Assert( !FAILED( hr ) );
|
||
|
}
|
||
|
|
||
|
//-----------------------------------------------------------------------------
|
||
|
// Purpose:
|
||
|
// Input : cpRecoGrammar -
|
||
|
// *from -
|
||
|
// *to -
|
||
|
//-----------------------------------------------------------------------------
|
||
|
void AddOptionalTransitionRule( ISpRecoGrammar* cpRecoGrammar, WORDRULETYPE *from, WORDRULETYPE *to )
|
||
|
{
|
||
|
USES_CONVERSION;
|
||
|
|
||
|
HRESULT hr;
|
||
|
Assert( from );
|
||
|
|
||
|
if ( from && !to )
|
||
|
{
|
||
|
OutputDebugString( va( "Opt transition from %s to TERM\r\n", from->plaintext ) );
|
||
|
}
|
||
|
else
|
||
|
{
|
||
|
OutputDebugString( va( "Opt transition from %s to %s\r\n", from->plaintext, to->plaintext ) );
|
||
|
}
|
||
|
|
||
|
hr = cpRecoGrammar->AddWordTransition( from->hRule, to ? to->hRule : NULL, NULL, NULL, SPWT_LEXICAL, CONFIDENCE_WEIGHT, NULL );
|
||
|
Assert( !FAILED( hr ) );
|
||
|
}
|
||
|
|
||
|
#define MAX_WORD_SKIP 1
|
||
|
//-----------------------------------------------------------------------------
|
||
|
// Purpose: Links together all word rule states into a sentence rule CFG
|
||
|
// Input : singleword -
|
||
|
// cpRecoGrammar -
|
||
|
// *root -
|
||
|
// *rules -
|
||
|
//-----------------------------------------------------------------------------
|
||
|
bool BuildRules( ISpRecoGrammar* cpRecoGrammar, SPSTATEHANDLE *root, CUtlVector< WORDRULETYPE > *rules )
|
||
|
{
|
||
|
HRESULT hr;
|
||
|
WORDRULETYPE *rule, *next;
|
||
|
|
||
|
int numrules = (*rules).Size();
|
||
|
|
||
|
rule = &(*rules)[ 0 ];
|
||
|
|
||
|
// Add transition
|
||
|
hr = cpRecoGrammar->AddWordTransition( *root, rule->hRule, NULL, NULL, SPWT_LEXICAL, CONFIDENCE_WEIGHT, NULL );
|
||
|
Assert( !FAILED( hr ) );
|
||
|
|
||
|
for ( int i = 0; i < numrules; i++ )
|
||
|
{
|
||
|
rule = &(*rules)[ i ];
|
||
|
if ( i < numrules - 1 )
|
||
|
{
|
||
|
next = &(*rules)[ i + 1 ];
|
||
|
}
|
||
|
else
|
||
|
{
|
||
|
next = NULL;
|
||
|
}
|
||
|
|
||
|
AddWordTransitionRule( cpRecoGrammar, rule, next );
|
||
|
}
|
||
|
|
||
|
if ( numrules > 1 )
|
||
|
{
|
||
|
for ( int skip = 1; skip <= min( MAX_WORD_SKIP, numrules ); skip++ )
|
||
|
{
|
||
|
OutputDebugString( va( "Opt transition from Root to %s\r\n", (*rules)[ 0 ].plaintext ) );
|
||
|
|
||
|
hr = cpRecoGrammar->AddWordTransition( *root, (*rules)[ 0 ].hRule, NULL, NULL, SPWT_LEXICAL, CONFIDENCE_WEIGHT, NULL );
|
||
|
|
||
|
// Now build rules where you can skip 1 to N intervening words
|
||
|
for ( int i = 1; i < numrules; i++ )
|
||
|
{
|
||
|
// Start at the beginning?
|
||
|
rule = &(*rules)[ i ];
|
||
|
if ( i < numrules - skip )
|
||
|
{
|
||
|
next = &(*rules)[ i + skip ];
|
||
|
}
|
||
|
else
|
||
|
{
|
||
|
continue;
|
||
|
}
|
||
|
|
||
|
// Add transition
|
||
|
AddOptionalTransitionRule( cpRecoGrammar, rule, next );
|
||
|
}
|
||
|
|
||
|
// Go from final rule to end point
|
||
|
AddOptionalTransitionRule( cpRecoGrammar, rule, NULL );
|
||
|
}
|
||
|
}
|
||
|
|
||
|
// Store it
|
||
|
hr = cpRecoGrammar->Commit(NULL);
|
||
|
if ( FAILED( hr ) )
|
||
|
return false;
|
||
|
|
||
|
return true;
|
||
|
}
|
||
|
|
||
|
//-----------------------------------------------------------------------------
|
||
|
// Purpose: Debugging, prints alternate list if one is created
|
||
|
// Input : cpResult -
|
||
|
// (*pfnPrint -
|
||
|
//-----------------------------------------------------------------------------
|
||
|
void PrintAlternates( ISpRecoResult* cpResult, void (*pfnPrint)( const char *fmt, ... ) )
|
||
|
{
|
||
|
ISpPhraseAlt *rgPhraseAlt[ 32 ];
|
||
|
memset( rgPhraseAlt, 0, sizeof( rgPhraseAlt ) );
|
||
|
|
||
|
ULONG ulCount;
|
||
|
|
||
|
ISpPhrase *phrase = ( ISpPhrase * )cpResult;
|
||
|
if ( phrase )
|
||
|
{
|
||
|
SPPHRASE *pElements;
|
||
|
if ( SUCCEEDED( phrase->GetPhrase( &pElements ) ) )
|
||
|
{
|
||
|
if ( pElements->Rule.ulCountOfElements > 0 )
|
||
|
{
|
||
|
HRESULT hr = cpResult->GetAlternates(
|
||
|
pElements->Rule.ulFirstElement,
|
||
|
pElements->Rule.ulCountOfElements,
|
||
|
32,
|
||
|
rgPhraseAlt,
|
||
|
&ulCount);
|
||
|
|
||
|
Assert( !FAILED( hr ) );
|
||
|
|
||
|
for ( ULONG r = 0 ; r < ulCount; r++ )
|
||
|
{
|
||
|
CSpDynamicString dstrText;
|
||
|
hr = rgPhraseAlt[ r ]->GetText( (ULONG)SP_GETWHOLEPHRASE, (ULONG)SP_GETWHOLEPHRASE, TRUE, &dstrText, NULL);
|
||
|
Assert( !FAILED( hr ) );
|
||
|
|
||
|
pfnPrint( "[ ALT ]" );
|
||
|
pfnPrint( dstrText.CopyToChar() );
|
||
|
pfnPrint( "\r\n" );
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
|
||
|
}
|
||
|
|
||
|
for ( int i = 0; i < 32; i++ )
|
||
|
{
|
||
|
if ( rgPhraseAlt[ i ] )
|
||
|
{
|
||
|
rgPhraseAlt[ i ]->Release();
|
||
|
rgPhraseAlt[ i ] = NULL;
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
|
||
|
//-----------------------------------------------------------------------------
// Purpose: Prints every word of the sentence with its byte range, each
//			followed by an indented list of its phonemes
// Input  : sentence - sentence to dump
//			(*pfnPrint - printf style output callback
//-----------------------------------------------------------------------------
void PrintWordsAndPhonemes( CSentence& sentence, void (*pfnPrint)( const char *fmt, ... ) )
{
	pfnPrint( "WORDS\r\n\r\n" );

	for ( int i = 0 ; i < sentence.m_Words.Size(); i++ )
	{
		CWordTag *word = sentence.m_Words[ i ];
		if ( !word )
			continue;

		// Format through the callback directly: no fixed size intermediate
		// buffer to overflow, and word text is never treated as a format string.
		pfnPrint( "<%u - %u> %s\r\n",
			word->m_uiStartByte, word->m_uiEndByte, word->GetWord() );

		for ( int j = 0 ; j < word->m_Phonemes.Size(); j++ )
		{
			CPhonemeTag *phoneme = word->m_Phonemes[ j ];
			if ( !phoneme )
				continue;

			pfnPrint( " <%u - %u> %s\r\n",
				phoneme->m_uiStartByte, phoneme->m_uiEndByte, phoneme->GetTag() );
		}
	}

	pfnPrint( "\r\n" );
}
|
||
|
|
||
|
//-----------------------------------------------------------------------------
|
||
|
// Purpose: Given a wave file and a string of words "text", creates a CFG from the
|
||
|
// sentence and stores the resulting words/phonemes in CSentence
|
||
|
// Input : *wavname -
|
||
|
// text -
|
||
|
// sentence -
|
||
|
// (*pfnPrint -
|
||
|
// Output : SR_RESULT
|
||
|
//-----------------------------------------------------------------------------
|
||
|
SR_RESULT ExtractPhonemes( const char *wavname, CSpDynamicString& text, CSentence& sentence, void (*pfnPrint)( const char *fmt, ...) )
|
||
|
{
|
||
|
// Assume failure
|
||
|
SR_RESULT result = SR_RESULT_ERROR;
|
||
|
|
||
|
if ( text.Length() <= 0 )
|
||
|
{
|
||
|
pfnPrint( "Error: no rule / text specified\n" );
|
||
|
return result;
|
||
|
}
|
||
|
|
||
|
USES_CONVERSION;
|
||
|
HRESULT hr;
|
||
|
|
||
|
CUtlVector < WORDRULETYPE > wordRules;
|
||
|
|
||
|
CComPtr<ISpStream> cpInputStream;
|
||
|
CComPtr<ISpRecognizer> cpRecognizer;
|
||
|
CComPtr<ISpRecoContext> cpRecoContext;
|
||
|
CComPtr<ISpRecoGrammar> cpRecoGrammar;
|
||
|
CComPtr<ISpPhoneConverter> cpPhoneConv;
|
||
|
|
||
|
// Create basic SAPI stream object
|
||
|
// NOTE: The helper SpBindToFile can be used to perform the following operations
|
||
|
hr = cpInputStream.CoCreateInstance(CLSID_SpStream);
|
||
|
if ( FAILED( hr ) )
|
||
|
{
|
||
|
pfnPrint( "Error: SAPI 5.1 Stream object not installed?\n" );
|
||
|
return result;
|
||
|
}
|
||
|
|
||
|
CSpStreamFormat sInputFormat;
|
||
|
|
||
|
// setup stream object with wav file MY_WAVE_AUDIO_FILENAME
|
||
|
// for read-only access, since it will only be access by the SR engine
|
||
|
hr = cpInputStream->BindToFile(
|
||
|
T2W(wavname),
|
||
|
SPFM_OPEN_READONLY,
|
||
|
NULL,
|
||
|
sInputFormat.WaveFormatExPtr(),
|
||
|
SPFEI_ALL_EVENTS );
|
||
|
|
||
|
if ( FAILED( hr ) )
|
||
|
{
|
||
|
pfnPrint( "Error: couldn't open wav file %s\n", wavname );
|
||
|
return result;
|
||
|
}
|
||
|
|
||
|
// Create in-process speech recognition engine
|
||
|
hr = cpRecognizer.CoCreateInstance(CLSID_SpInprocRecognizer);
|
||
|
if ( FAILED( hr ) )
|
||
|
{
|
||
|
pfnPrint( "Error: SAPI 5.1 In process recognizer object not installed?\n" );
|
||
|
return result;
|
||
|
}
|
||
|
|
||
|
// Create recognition context to receive events
|
||
|
hr = cpRecognizer->CreateRecoContext(&cpRecoContext);
|
||
|
if ( FAILED( hr ) )
|
||
|
{
|
||
|
pfnPrint( "Error: SAPI 5.1 Unable to create recognizer context\n" );
|
||
|
return result;
|
||
|
}
|
||
|
|
||
|
// Create a grammar
|
||
|
hr = cpRecoContext->CreateGrammar( EP_GRAM_ID, &cpRecoGrammar );
|
||
|
if ( FAILED( hr ) )
|
||
|
{
|
||
|
pfnPrint( "Error: SAPI 5.1 Unable to create recognizer grammar\n" );
|
||
|
return result;
|
||
|
}
|
||
|
|
||
|
LANGID englishID = 0x409; // 1033 decimal
|
||
|
|
||
|
bool userSpecified = false;
|
||
|
LANGID langID = SpGetUserDefaultUILanguage();
|
||
|
|
||
|
// Allow commandline override
|
||
|
if ( CommandLine()->FindParm( "-languageid" ) != 0 )
|
||
|
{
|
||
|
userSpecified = true;
|
||
|
langID = CommandLine()->ParmValue( "-languageid", langID );
|
||
|
}
|
||
|
|
||
|
// Create a phoneme converter ( so we can convert to IPA codes )
|
||
|
hr = SpCreatePhoneConverter( langID, NULL, NULL, &cpPhoneConv );
|
||
|
if ( FAILED( hr ) )
|
||
|
{
|
||
|
if ( langID != englishID )
|
||
|
{
|
||
|
if ( userSpecified )
|
||
|
{
|
||
|
pfnPrint( "Warning: SAPI 5.1 Unable to create phoneme converter for command line override -languageid %i\n", langID );
|
||
|
}
|
||
|
else
|
||
|
{
|
||
|
pfnPrint( "Warning: SAPI 5.1 Unable to create phoneme converter for default UI language %i\n",langID );
|
||
|
}
|
||
|
|
||
|
// Try english!!!
|
||
|
langID = englishID;
|
||
|
hr = SpCreatePhoneConverter( langID, NULL, NULL, &cpPhoneConv );
|
||
|
}
|
||
|
|
||
|
if ( FAILED( hr ) )
|
||
|
{
|
||
|
pfnPrint( "Error: SAPI 5.1 Unable to create phoneme converter for English language id %i\n", langID );
|
||
|
return result;
|
||
|
}
|
||
|
else
|
||
|
{
|
||
|
pfnPrint( "Note: SAPI 5.1 Falling back to use english -languageid %i\n", langID );
|
||
|
}
|
||
|
}
|
||
|
else if ( userSpecified )
|
||
|
{
|
||
|
pfnPrint( "Note: SAPI 5.1 Using user specified -languageid %i\n",langID );
|
||
|
}
|
||
|
|
||
|
SPSTATEHANDLE hStateRoot;
|
||
|
// create/re-create Root level rule of grammar
|
||
|
hr = cpRecoGrammar->GetRule(L"Root", 0, SPRAF_TopLevel | SPRAF_Active, TRUE, &hStateRoot);
|
||
|
if ( FAILED( hr ) )
|
||
|
{
|
||
|
pfnPrint( "Error: SAPI 5.1 Unable to create root rule\n" );
|
||
|
return result;
|
||
|
}
|
||
|
|
||
|
// Inactivate it so we can alter it
|
||
|
hr = cpRecoGrammar->SetRuleState( NULL, NULL, SPRS_INACTIVE );
|
||
|
if ( FAILED( hr ) )
|
||
|
{
|
||
|
pfnPrint( "Error: SAPI 5.1 Unable to deactivate grammar rules\n" );
|
||
|
return result;
|
||
|
}
|
||
|
|
||
|
// Create the rule set from the words in text
|
||
|
{
|
||
|
CSpDynamicString currentWord;
|
||
|
WCHAR *pos = ( WCHAR * )text;
|
||
|
WCHAR str[ 2 ];
|
||
|
str[1]= 0;
|
||
|
|
||
|
while ( *pos )
|
||
|
{
|
||
|
if ( *pos == L' ' /*|| *pos == L'.' || *pos == L'-'*/ )
|
||
|
{
|
||
|
// Add word to rule set
|
||
|
if ( currentWord.Length() > 0 )
|
||
|
{
|
||
|
AddWordRule( cpRecoGrammar, &hStateRoot, &wordRules, currentWord );
|
||
|
currentWord.Clear();
|
||
|
}
|
||
|
pos++;
|
||
|
continue;
|
||
|
}
|
||
|
|
||
|
// Skip anything that's inside a [ xxx ] pair.
|
||
|
if ( *pos == L'[' )
|
||
|
{
|
||
|
while ( *pos && *pos != L']' )
|
||
|
{
|
||
|
pos++;
|
||
|
}
|
||
|
|
||
|
if ( *pos )
|
||
|
{
|
||
|
pos++;
|
||
|
}
|
||
|
continue;
|
||
|
}
|
||
|
|
||
|
str[ 0 ] = *pos;
|
||
|
|
||
|
currentWord.Append( str );
|
||
|
pos++;
|
||
|
}
|
||
|
|
||
|
if ( currentWord.Length() > 0 )
|
||
|
{
|
||
|
AddWordRule( cpRecoGrammar, &hStateRoot, &wordRules, currentWord );
|
||
|
}
|
||
|
|
||
|
if ( wordRules.Size() <= 0 )
|
||
|
{
|
||
|
pfnPrint( "Error: Text %s contained no usable words\n", text );
|
||
|
return result;
|
||
|
}
|
||
|
|
||
|
// Build all word to word transitions in the grammar
|
||
|
if ( !BuildRules( cpRecoGrammar, &hStateRoot, &wordRules ) )
|
||
|
{
|
||
|
pfnPrint( "Error: Rule set for %s could not be generated\n", text );
|
||
|
return result;
|
||
|
}
|
||
|
}
|
||
|
|
||
|
// check for recognitions and end of stream event
|
||
|
const ULONGLONG ullInterest =
|
||
|
SPFEI(SPEI_RECOGNITION) | SPFEI(SPEI_END_SR_STREAM) | SPFEI(SPEI_FALSE_RECOGNITION) |
|
||
|
SPFEI(SPEI_PHRASE_START ) | SPFEI(SPEI_HYPOTHESIS ) | SPFEI(SPEI_INTERFERENCE) ;
|
||
|
hr = cpRecoContext->SetInterest( ullInterest, ullInterest );
|
||
|
if ( FAILED( hr ) )
|
||
|
{
|
||
|
pfnPrint( "Error: SAPI 5.1 Unable to set interest level\n" );
|
||
|
return result;
|
||
|
}
|
||
|
// use Win32 events for command-line style application
|
||
|
hr = cpRecoContext->SetNotifyWin32Event();
|
||
|
if ( FAILED( hr ) )
|
||
|
{
|
||
|
pfnPrint( "Error: SAPI 5.1 Unable to set win32 notify event\n" );
|
||
|
return result;
|
||
|
}
|
||
|
// connect wav input to recognizer
|
||
|
// SAPI will negotiate mismatched engine/input audio formats using system audio codecs, so second parameter is not important - use default of TRUE
|
||
|
hr = cpRecognizer->SetInput(cpInputStream, TRUE);
|
||
|
if ( FAILED( hr ) )
|
||
|
{
|
||
|
pfnPrint( "Error: SAPI 5.1 Unable to associate input stream\n" );
|
||
|
return result;
|
||
|
}
|
||
|
|
||
|
// Activate the CFG ( rather than using dictation )
|
||
|
hr = cpRecoGrammar->SetRuleState( NULL, NULL, SPRS_ACTIVE );
|
||
|
if ( FAILED( hr ) )
|
||
|
{
|
||
|
switch ( hr )
|
||
|
{
|
||
|
case E_INVALIDARG:
|
||
|
pfnPrint( "pszName is invalid or bad. Alternatively, pReserved is non-NULL\n" );
|
||
|
break;
|
||
|
case SP_STREAM_UNINITIALIZED:
|
||
|
pfnPrint( "ISpRecognizer::SetInput has not been called with the InProc recognizer\n" );
|
||
|
break;
|
||
|
case SPERR_UNINITIALIZED:
|
||
|
pfnPrint( "The object has not been properly initialized.\n");
|
||
|
break;
|
||
|
case SPERR_UNSUPPORTED_FORMAT:
|
||
|
pfnPrint( "Audio format is bad or is not recognized. Alternatively, the device driver may be busy by another application and cannot be accessed.\n" );
|
||
|
break;
|
||
|
case SPERR_NOT_TOPLEVEL_RULE:
|
||
|
pfnPrint( "The rule pszName exists, but is not a top-level rule.\n" );
|
||
|
break;
|
||
|
default:
|
||
|
pfnPrint( "Unknown error\n" );
|
||
|
break;
|
||
|
}
|
||
|
pfnPrint( "Error: SAPI 5.1 Unable to activate rule set\n" );
|
||
|
return result;
|
||
|
}
|
||
|
|
||
|
// while events occur, continue processing
|
||
|
// timeout should be greater than the audio stream length, or a reasonable amount of time expected to pass before no more recognitions are expected in an audio stream
|
||
|
BOOL fEndStreamReached = FALSE;
|
||
|
while (!fEndStreamReached && S_OK == cpRecoContext->WaitForNotifyEvent( SR_WAVTIMEOUT ))
|
||
|
{
|
||
|
CSpEvent spEvent;
|
||
|
// pull all queued events from the reco context's event queue
|
||
|
|
||
|
while (!fEndStreamReached && S_OK == spEvent.GetFrom(cpRecoContext))
|
||
|
{
|
||
|
// Check event type
|
||
|
switch (spEvent.eEventId)
|
||
|
{
|
||
|
case SPEI_INTERFERENCE:
|
||
|
{
|
||
|
SPINTERFERENCE interference = spEvent.Interference();
|
||
|
|
||
|
switch ( interference )
|
||
|
{
|
||
|
case SPINTERFERENCE_NONE:
|
||
|
pfnPrint( "[ I None ]\r\n" );
|
||
|
break;
|
||
|
case SPINTERFERENCE_NOISE:
|
||
|
pfnPrint( "[ I Noise ]\r\n" );
|
||
|
break;
|
||
|
case SPINTERFERENCE_NOSIGNAL:
|
||
|
pfnPrint( "[ I No Signal ]\r\n" );
|
||
|
break;
|
||
|
case SPINTERFERENCE_TOOLOUD:
|
||
|
pfnPrint( "[ I Too Loud ]\r\n" );
|
||
|
break;
|
||
|
case SPINTERFERENCE_TOOQUIET:
|
||
|
pfnPrint( "[ I Too Quiet ]\r\n" );
|
||
|
break;
|
||
|
case SPINTERFERENCE_TOOFAST:
|
||
|
pfnPrint( "[ I Too Fast ]\r\n" );
|
||
|
break;
|
||
|
case SPINTERFERENCE_TOOSLOW:
|
||
|
pfnPrint( "[ I Too Slow ]\r\n" );
|
||
|
break;
|
||
|
default:
|
||
|
break;
|
||
|
}
|
||
|
}
|
||
|
break;
|
||
|
case SPEI_PHRASE_START:
|
||
|
pfnPrint( "Phrase Start\r\n" );
|
||
|
sentence.MarkNewPhraseBase();
|
||
|
break;
|
||
|
|
||
|
case SPEI_HYPOTHESIS:
|
||
|
case SPEI_RECOGNITION:
|
||
|
case SPEI_FALSE_RECOGNITION:
|
||
|
{
|
||
|
CComPtr<ISpRecoResult> cpResult;
|
||
|
cpResult = spEvent.RecoResult();
|
||
|
|
||
|
CSpDynamicString dstrText;
|
||
|
if (spEvent.eEventId == SPEI_FALSE_RECOGNITION)
|
||
|
{
|
||
|
dstrText = L"(Unrecognized)";
|
||
|
|
||
|
result = SR_RESULT_FAILED;
|
||
|
|
||
|
// It's possible that the failed recog might have more words, so see if that's the case
|
||
|
EnumeratePhonemes( cpPhoneConv, cpResult, sentence );
|
||
|
}
|
||
|
else
|
||
|
{
|
||
|
// Hypothesis or recognition success
|
||
|
cpResult->GetText( (ULONG)SP_GETWHOLEPHRASE, (ULONG)SP_GETWHOLEPHRASE, TRUE, &dstrText, NULL);
|
||
|
|
||
|
EnumeratePhonemes( cpPhoneConv, cpResult, sentence );
|
||
|
|
||
|
if ( spEvent.eEventId == SPEI_RECOGNITION )
|
||
|
{
|
||
|
result = SR_RESULT_SUCCESS;
|
||
|
}
|
||
|
|
||
|
pfnPrint( va( "%s%s\r\n", spEvent.eEventId == SPEI_HYPOTHESIS ? "[ Hypothesis ] " : "", dstrText.CopyToChar() ) );
|
||
|
}
|
||
|
|
||
|
cpResult.Release();
|
||
|
}
|
||
|
break;
|
||
|
// end of the wav file was reached by the speech recognition engine
|
||
|
case SPEI_END_SR_STREAM:
|
||
|
fEndStreamReached = TRUE;
|
||
|
break;
|
||
|
}
|
||
|
|
||
|
// clear any event data/object references
|
||
|
spEvent.Clear();
|
||
|
}// END event pulling loop - break on empty event queue OR end stream
|
||
|
}// END event polling loop - break on event timeout OR end stream
|
||
|
|
||
|
// Deactivate rule
|
||
|
hr = cpRecoGrammar->SetRuleState( NULL, NULL, SPRS_INACTIVE );
|
||
|
if ( FAILED( hr ) )
|
||
|
{
|
||
|
pfnPrint( "Error: SAPI 5.1 Unable to deactivate rule set\n" );
|
||
|
return result;
|
||
|
}
|
||
|
|
||
|
// close the input stream, since we're done with it
|
||
|
// NOTE: smart pointer will call SpStream's destructor, and consequently ::Close, but code may want to check for errors on ::Close operation
|
||
|
hr = cpInputStream->Close();
|
||
|
if ( FAILED( hr ) )
|
||
|
{
|
||
|
pfnPrint( "Error: SAPI 5.1 Unable to close input stream\n" );
|
||
|
return result;
|
||
|
}
|
||
|
|
||
|
return result;
|
||
|
}
|
||
|
|
||
|
//-----------------------------------------------------------------------------
|
||
|
// Purpose: HACK HACK: We have to delete the RecoContext key or sapi starts to train
|
||
|
// itself on each iteration which was causing some problems.
|
||
|
// Input : hKey -
|
||
|
//-----------------------------------------------------------------------------
|
||
|
void RecursiveRegDelKey(HKEY hKey)
|
||
|
{
|
||
|
char keyname[256]={0};
|
||
|
DWORD namesize=256;
|
||
|
|
||
|
//base case: no subkeys when RegEnumKeyEx returns error on index 0
|
||
|
LONG lResult=RegEnumKeyEx(hKey,0,keyname,&namesize,NULL,NULL,NULL,NULL);
|
||
|
if (lResult!=ERROR_SUCCESS)
|
||
|
{
|
||
|
return;
|
||
|
}
|
||
|
|
||
|
do
|
||
|
{
|
||
|
HKEY subkey;
|
||
|
LONG lResult2;
|
||
|
LONG lDelResult;
|
||
|
lResult2=RegOpenKeyEx(hKey,keyname,0,KEY_ALL_ACCESS,&subkey);
|
||
|
|
||
|
if (lResult2==ERROR_SUCCESS)
|
||
|
{
|
||
|
RecursiveRegDelKey(subkey);
|
||
|
|
||
|
RegCloseKey(subkey);
|
||
|
lDelResult=RegDeleteKey(hKey,keyname);
|
||
|
namesize=256;
|
||
|
//use 0 in the next function call because when you delete one, the rest shift down!
|
||
|
lResult=RegEnumKeyEx(hKey,0,keyname,&namesize,NULL,NULL,NULL,NULL);
|
||
|
}
|
||
|
|
||
|
else
|
||
|
{
|
||
|
break;
|
||
|
}
|
||
|
|
||
|
} while (lResult!=ERROR_NO_MORE_ITEMS);
|
||
|
}
|
||
|
|
||
|
//-----------------------------------------------------------------------------
// Purpose: A word counts as usable when its start byte or its end byte is non-zero
// Input  : *word - word to test
// Output : true if the word is usable
//-----------------------------------------------------------------------------
bool IsUseable( CWordTag *word )
{
	const bool hasStart = ( word->m_uiStartByte != 0 );
	const bool hasEnd = ( word->m_uiEndByte != 0 );
	return hasStart || hasEnd;
}
|
||
|
|
||
|
//-----------------------------------------------------------------------------
// Purpose: Finds the last word in the list for which IsUseable() is true
// Input  : outwords - sentence to search (asserts if it has no words)
// Output : index of the word, or -1 if there is none
//-----------------------------------------------------------------------------
int FindLastUsableWord( CSentence& outwords )
{
	const int total = outwords.m_Words.Size();
	if ( total < 1 )
	{
		Assert( 0 );
		return -1;
	}

	// Scan backwards from the end
	int idx = total;
	while ( idx-- > 0 )
	{
		if ( IsUseable( outwords.m_Words[ idx ] ) )
			return idx;
	}

	return -1;
}
|
||
|
|
||
|
|
||
|
//-----------------------------------------------------------------------------
// Purpose: Finds the first word in the list for which IsUseable() is true
// Input  : outwords - sentence to search (asserts if it has no words)
// Output : index of the word, or -1 if there is none
//-----------------------------------------------------------------------------
int FindFirstUsableWord( CSentence& outwords )
{
	const int total = outwords.m_Words.Size();
	if ( total < 1 )
	{
		Assert( 0 );
		return -1;
	}

	int found = -1;
	for ( int idx = 0; idx < total && found < 0; ++idx )
	{
		if ( IsUseable( outwords.m_Words[ idx ] ) )
		{
			found = idx;
		}
	}

	return found;
}
|
||
|
|
||
|
//-----------------------------------------------------------------------------
|
||
|
// Purpose: Counts words which have either a valid start or end byte
|
||
|
// Input : *outwords -
|
||
|
// Output : int
|
||
|
//-----------------------------------------------------------------------------
|
||
|
int CountUsableWords( CSentence& outwords )
|
||
|
{
|
||
|
int count = 0;
|
||
|
int numwords = outwords.m_Words.Size();
|
||
|
// Nothing to do
|
||
|
if ( numwords <= 0 )
|
||
|
return count;
|
||
|
|
||
|
for ( int i = 0; i < numwords; i++ )
|
||
|
{
|
||
|
CWordTag *word = outwords.m_Words[ i ];
|
||
|
if ( !IsUseable( word ) )
|
||
|
continue;
|
||
|
|
||
|
count++;
|
||
|
}
|
||
|
|
||
|
return count;
|
||
|
}
|
||
|
|
||
|
|
||
|
//-----------------------------------------------------------------------------
|
||
|
// Purpose: Counts words which have either a valid start or end byte
|
||
|
// Input : *outwords -
|
||
|
// Output : int
|
||
|
//-----------------------------------------------------------------------------
|
||
|
int CountUnuseableWords( CSentence& outwords )
|
||
|
{
|
||
|
int count = 0;
|
||
|
int numwords = outwords.m_Words.Size();
|
||
|
// Nothing to do
|
||
|
if ( numwords <= 0 )
|
||
|
return count;
|
||
|
|
||
|
for ( int i = 0; i < numwords; i++ )
|
||
|
{
|
||
|
CWordTag *word = outwords.m_Words[ i ];
|
||
|
if ( IsUseable( word ) )
|
||
|
continue;
|
||
|
|
||
|
count++;
|
||
|
}
|
||
|
|
||
|
return count;
|
||
|
}
|
||
|
|
||
|
// Keeps same relative spacing, but rebases list
|
||
|
void RepartitionPhonemes( CWordTag *word, unsigned int oldStart, unsigned int oldEnd )
|
||
|
{
|
||
|
// Repartition phonemes based on old range
|
||
|
float oldRange = ( float )( oldEnd - oldStart );
|
||
|
float newRange = ( float )( word->m_uiEndByte - word->m_uiStartByte );
|
||
|
|
||
|
for ( int i = 0; i < word->m_Phonemes.Size(); i++ )
|
||
|
{
|
||
|
CPhonemeTag *tag = word->m_Phonemes[ i ];
|
||
|
Assert( tag );
|
||
|
|
||
|
float frac1 = 0.0f, frac2 = 0.0f;
|
||
|
float delta1, delta2;
|
||
|
|
||
|
delta1 = ( float ) ( tag->m_uiStartByte - oldStart );
|
||
|
delta2 = ( float ) ( tag->m_uiEndByte - oldStart );
|
||
|
if ( oldRange > 0.0f )
|
||
|
{
|
||
|
frac1 = delta1 / oldRange;
|
||
|
frac2 = delta2 / oldRange;
|
||
|
}
|
||
|
|
||
|
tag->m_uiStartByte = word->m_uiStartByte + ( unsigned int ) ( frac1 * newRange );
|
||
|
tag->m_uiEndByte = word->m_uiStartByte + ( unsigned int ) ( frac2 * newRange );
|
||
|
}
|
||
|
}
|
||
|
|
||
|
//-----------------------------------------------------------------------------
// Purpose: Spreads words [start, end] evenly over [sampleStart, sampleEnd] and
//			rescales the phonemes of each word to match.
//			The step is an integer division, so any remainder is not distributed.
// Input  : outwords - sentence containing the words
//			start - index of first word to place
//			end - index of last word to place (inclusive)
//			sampleStart - first byte of the range to fill
//			sampleEnd - end byte of the range to fill
//-----------------------------------------------------------------------------
void PartitionWords( CSentence& outwords, int start, int end, int sampleStart, int sampleEnd )
{
	const int wordCount = end - start + 1;
	Assert( wordCount >= 1 );

	const int stepSize = ( sampleEnd - sampleStart ) / wordCount;

	int cursor = sampleStart;
	for ( int idx = start; idx <= end; ++idx, cursor += stepSize )
	{
		CWordTag *word = outwords.m_Words[ idx ];
		Assert( word );

		// Remember the old span so the phonemes can be rescaled
		const unsigned int previousStart = word->m_uiStartByte;
		const unsigned int previousEnd = word->m_uiEndByte;

		word->m_uiStartByte = cursor;
		word->m_uiEndByte = cursor + stepSize;

		RepartitionPhonemes( word, previousStart, previousEnd );
	}
}
|
||
|
|
||
|
//-----------------------------------------------------------------------------
// Purpose: Pools the byte spans of two words and hands each of them one half
//  of the combined span, rescaling their phonemes to fit
// Input  : *w1 - receives the first half (combined start up to the midpoint)
//			*w2 - receives the second half (midpoint up to the combined end)
//-----------------------------------------------------------------------------
void MergeWords( CWordTag *w1, CWordTag *w2 )
{
	// Snapshot the spans the phonemes are currently laid out against
	const unsigned int firstOldStart = w1->m_uiStartByte;
	const unsigned int firstOldEnd = w1->m_uiEndByte;
	const unsigned int secondOldStart = w2->m_uiStartByte;
	const unsigned int secondOldEnd = w2->m_uiEndByte;

	// Combined span and the point at which it gets split
	const unsigned int spanStart = min( firstOldStart, secondOldStart );
	const unsigned int spanEnd = max( firstOldEnd, secondOldEnd );
	const unsigned int split = ( spanStart + spanEnd ) / 2;

	w1->m_uiStartByte = spanStart;
	w1->m_uiEndByte = split;

	w2->m_uiStartByte = split;
	w2->m_uiEndByte = spanEnd;

	RepartitionPhonemes( w1, firstOldStart, firstOldEnd );
	RepartitionPhonemes( w2, secondOldStart, secondOldEnd );
}
|
||
|
|
||
|
//-----------------------------------------------------------------------------
// Purpose: Walks adjacent word pairs and, whenever either word of a pair has
//  an empty byte span, merges the pair so the two words share their combined
//  span.  After a merge that changed something the scan starts over, since the
//  merge may have altered an earlier pair.
// Input  : outwords - sentence whose words are fixed up in place
//-----------------------------------------------------------------------------
void FixupZeroLengthWords( CSentence& outwords )
{
	int i = 0;
	while ( i < outwords.m_Words.Size() - 1 )
	{
		CWordTag *current = outwords.m_Words[ i ];
		CWordTag *next = outwords.m_Words[ i + 1 ];

		// The byte offsets are unsigned, so "end - start <= 0" can only ever
		//  detect end == start.  Compare the endpoints directly so an inverted
		//  span is caught as well.
		const bool currentEmpty = ( current->m_uiEndByte <= current->m_uiStartByte );
		const bool nextEmpty = ( next->m_uiEndByte <= next->m_uiStartByte );

		if ( !currentEmpty && !nextEmpty )
		{
			i++;
			continue;
		}

		const unsigned int oldCurrentStart = current->m_uiStartByte;
		const unsigned int oldCurrentEnd = current->m_uiEndByte;
		const unsigned int oldNextStart = next->m_uiStartByte;
		const unsigned int oldNextEnd = next->m_uiEndByte;

		MergeWords( current, next );

		const bool changed =
			current->m_uiStartByte != oldCurrentStart ||
			current->m_uiEndByte != oldCurrentEnd ||
			next->m_uiStartByte != oldNextStart ||
			next->m_uiEndByte != oldNextEnd;

		if ( changed )
		{
			// Start over, the merge may have affected the preceding pair
			i = 0;
		}
		else
		{
			// Merging could not split this pair any further (combined span is
			//  too small).  Re-merging it would spin forever, so move on.
			i++;
		}
	}
}
|
||
|
|
||
|
void ComputeMissingByteSpans( int numsamples, CSentence& outwords )
|
||
|
{
|
||
|
int numwords = outwords.m_Words.Size();
|
||
|
// Nothing to do
|
||
|
if ( numwords <= 0 )
|
||
|
return;
|
||
|
|
||
|
int interationcount = 1;
|
||
|
|
||
|
while( 1 )
|
||
|
{
|
||
|
Log( "\nCompute %i\n", interationcount++ );
|
||
|
LogWords( outwords );
|
||
|
|
||
|
int wordNumber;
|
||
|
|
||
|
// Done!
|
||
|
if ( !CountUnuseableWords( outwords ) )
|
||
|
{
|
||
|
FixupZeroLengthWords( outwords );
|
||
|
break;
|
||
|
}
|
||
|
|
||
|
if ( !CountUsableWords( outwords ) )
|
||
|
{
|
||
|
// Evenly space words across full sample time
|
||
|
PartitionWords( outwords, 0, numwords - 1, 0, numsamples );
|
||
|
break;
|
||
|
}
|
||
|
|
||
|
wordNumber = FindFirstUsableWord( outwords );
|
||
|
// Not the first word
|
||
|
if ( wordNumber > 0 )
|
||
|
{
|
||
|
// Repartition all of the unusables and the first one starting at zero over the range
|
||
|
CWordTag *firstUsable = outwords.m_Words[ wordNumber ];
|
||
|
Assert( firstUsable );
|
||
|
|
||
|
if ( firstUsable->m_uiStartByte != 0 )
|
||
|
{
|
||
|
PartitionWords( outwords, 0, wordNumber - 1, 0, firstUsable->m_uiStartByte );
|
||
|
}
|
||
|
else
|
||
|
{
|
||
|
PartitionWords( outwords, 0, wordNumber, 0, firstUsable->m_uiEndByte );
|
||
|
}
|
||
|
|
||
|
// Start over
|
||
|
continue;
|
||
|
}
|
||
|
|
||
|
wordNumber = FindLastUsableWord( outwords );
|
||
|
// Not the last word
|
||
|
if ( wordNumber >= 0 && wordNumber < numwords - 1 )
|
||
|
{
|
||
|
// Repartition all of the unusables and the first one starting at zero over the range
|
||
|
CWordTag *lastUsable = outwords.m_Words[ wordNumber ];
|
||
|
Assert( lastUsable );
|
||
|
|
||
|
if ( lastUsable->m_uiEndByte != (unsigned int)numsamples )
|
||
|
{
|
||
|
PartitionWords( outwords, wordNumber + 1, numwords-1, lastUsable->m_uiEndByte, numsamples );
|
||
|
}
|
||
|
else
|
||
|
{
|
||
|
PartitionWords( outwords, wordNumber, numwords-1, lastUsable->m_uiStartByte, numsamples );
|
||
|
}
|
||
|
|
||
|
// Start over
|
||
|
continue;
|
||
|
}
|
||
|
|
||
|
// If we get here it means that the start and end of the list are okay and we just have to
|
||
|
// iterate across the list and fix things in the middle
|
||
|
int startByte = 0;
|
||
|
int endByte = 0;
|
||
|
for ( int i = 0; i < numwords ; i++ )
|
||
|
{
|
||
|
CWordTag *word = outwords.m_Words[ i ];
|
||
|
if ( IsUseable( word ) )
|
||
|
{
|
||
|
startByte = word->m_uiEndByte;
|
||
|
continue;
|
||
|
}
|
||
|
|
||
|
// Found the start of a chain of 1 or more unusable words
|
||
|
// Find the startbyte of the next usable word and count how many words we check
|
||
|
int wordCount = 1;
|
||
|
for ( int j = i + 1; j < numwords; j++ )
|
||
|
{
|
||
|
CWordTag *next = outwords.m_Words[ j ];
|
||
|
if ( IsUseable( next ) )
|
||
|
{
|
||
|
endByte = next->m_uiStartByte;
|
||
|
break;
|
||
|
}
|
||
|
|
||
|
wordCount++;
|
||
|
}
|
||
|
|
||
|
// Now partition words across the gap and go to start again
|
||
|
PartitionWords( outwords, i, i + wordCount - 1, startByte, endByte );
|
||
|
break;
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
|
||
|
//-----------------------------------------------------------------------------
|
||
|
// Purpose: Given a wavfile and a list of inwords, determines the word/phonene
|
||
|
// sample counts for the sentce
|
||
|
// Input : *wavfile -
|
||
|
// *inwords -
|
||
|
// *outphonemes{ text.Clear( -
|
||
|
// Output : SR_RESULT
|
||
|
//-----------------------------------------------------------------------------
|
||
|
static SR_RESULT SAPI_ExtractPhonemes(
|
||
|
const char *wavfile,
|
||
|
int numsamples,
|
||
|
void (*pfnPrint)( const char *fmt, ... ),
|
||
|
CSentence& inwords,
|
||
|
CSentence& outwords )
|
||
|
{
|
||
|
LogReset();
|
||
|
|
||
|
USES_CONVERSION;
|
||
|
|
||
|
CSpDynamicString text;
|
||
|
text.Clear();
|
||
|
|
||
|
HKEY hkwipe;
|
||
|
LONG lResult = RegOpenKeyEx( HKEY_CURRENT_USER, "Software\\Microsoft\\Speech\\RecoProfiles", 0, KEY_ALL_ACCESS, &hkwipe );
|
||
|
if ( lResult == ERROR_SUCCESS )
|
||
|
{
|
||
|
RecursiveRegDelKey( hkwipe );
|
||
|
RegCloseKey( hkwipe );
|
||
|
}
|
||
|
|
||
|
if ( strlen( inwords.GetText() ) <= 0 )
|
||
|
{
|
||
|
inwords.SetTextFromWords();
|
||
|
}
|
||
|
|
||
|
// Construct a string from the inwords array
|
||
|
text.Append( T2W( inwords.GetText() ) );
|
||
|
|
||
|
// Assume failure
|
||
|
SR_RESULT result = SR_RESULT_ERROR;
|
||
|
|
||
|
if ( text.Length() > 0 )
|
||
|
{
|
||
|
CSentence sentence;
|
||
|
|
||
|
pfnPrint( "Processing...\r\n" );
|
||
|
|
||
|
// Give it a try
|
||
|
result = ExtractPhonemes( wavfile, text, sentence, pfnPrint );
|
||
|
|
||
|
pfnPrint( "Finished.\r\n" );
|
||
|
// PrintWordsAndPhonemes( sentence, pfnPrint );
|
||
|
|
||
|
// Copy results to outputs
|
||
|
outwords.Reset();
|
||
|
|
||
|
outwords.SetText( inwords.GetText() );
|
||
|
|
||
|
Log( "Starting\n" );
|
||
|
LogWords( inwords );
|
||
|
|
||
|
if ( SR_RESULT_ERROR != result )
|
||
|
{
|
||
|
int i;
|
||
|
|
||
|
Log( "Hypothesized\n" );
|
||
|
LogWords( sentence );
|
||
|
|
||
|
for( i = 0 ; i < sentence.m_Words.Size(); i++ )
|
||
|
{
|
||
|
CWordTag *tag = sentence.m_Words[ i ];
|
||
|
if ( tag )
|
||
|
{
|
||
|
// Skip '...' tag
|
||
|
if ( stricmp( tag->GetWord(), "..." ) )
|
||
|
{
|
||
|
CWordTag *newTag = new CWordTag( *tag );
|
||
|
|
||
|
outwords.m_Words.AddToTail( newTag );
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
|
||
|
// Now insert unrecognized/skipped words from original list
|
||
|
//
|
||
|
int frompos = 0, topos = 0;
|
||
|
|
||
|
while( 1 )
|
||
|
{
|
||
|
// End of source list
|
||
|
if ( frompos >= inwords.m_Words.Size() )
|
||
|
break;
|
||
|
|
||
|
const CWordTag *fromTag = inwords.m_Words[ frompos ];
|
||
|
|
||
|
// Reached end of destination list, just copy words over from from source list until
|
||
|
// we run out of source words
|
||
|
if ( topos >= outwords.m_Words.Size() )
|
||
|
{
|
||
|
// Just copy words over
|
||
|
CWordTag *newWord = new CWordTag( *fromTag );
|
||
|
|
||
|
// Remove phonemes
|
||
|
while ( newWord->m_Phonemes.Size() > 0 )
|
||
|
{
|
||
|
CPhonemeTag *kill = newWord->m_Phonemes[ 0 ];
|
||
|
newWord->m_Phonemes.Remove( 0 );
|
||
|
delete kill;
|
||
|
}
|
||
|
|
||
|
outwords.m_Words.AddToTail( newWord );
|
||
|
frompos++;
|
||
|
topos++;
|
||
|
continue;
|
||
|
}
|
||
|
|
||
|
// Destination word
|
||
|
const CWordTag *toTag = outwords.m_Words[ topos ];
|
||
|
|
||
|
// Words match, just skip ahead
|
||
|
if ( !stricmp( fromTag->GetWord(), toTag->GetWord() ) )
|
||
|
{
|
||
|
frompos++;
|
||
|
topos++;
|
||
|
continue;
|
||
|
}
|
||
|
|
||
|
// The only case we handle is that something in the source wasn't in the destination
|
||
|
|
||
|
// Find the next source word that appears in the destination
|
||
|
int skipAhead = frompos + 1;
|
||
|
bool found = false;
|
||
|
while ( skipAhead < inwords.m_Words.Size() )
|
||
|
{
|
||
|
const CWordTag *sourceWord = inwords.m_Words[ skipAhead ];
|
||
|
if ( !stricmp( sourceWord->GetWord(), toTag->GetWord() ) )
|
||
|
{
|
||
|
found = true;
|
||
|
break;
|
||
|
}
|
||
|
|
||
|
skipAhead++;
|
||
|
}
|
||
|
|
||
|
// Uh oh destination has words that are not in source, just skip to next destination word?
|
||
|
if ( !found )
|
||
|
{
|
||
|
topos++;
|
||
|
}
|
||
|
else
|
||
|
{
|
||
|
// Copy words from from source list into destination
|
||
|
//
|
||
|
int skipCount = skipAhead - frompos;
|
||
|
|
||
|
while ( --skipCount>= 0 )
|
||
|
{
|
||
|
const CWordTag *sourceWord = inwords.m_Words[ frompos++ ];
|
||
|
CWordTag *newWord = new CWordTag( *sourceWord );
|
||
|
|
||
|
// Remove phonemes
|
||
|
while ( newWord->m_Phonemes.Size() > 0 )
|
||
|
{
|
||
|
CPhonemeTag *kill = newWord->m_Phonemes[ 0 ];
|
||
|
newWord->m_Phonemes.Remove( 0 );
|
||
|
delete kill;
|
||
|
}
|
||
|
|
||
|
outwords.m_Words.InsertBefore( topos, newWord );
|
||
|
topos++;
|
||
|
}
|
||
|
|
||
|
frompos++;
|
||
|
topos++;
|
||
|
}
|
||
|
}
|
||
|
|
||
|
Log( "\nDone simple check\n" );
|
||
|
|
||
|
LogWords( outwords );
|
||
|
LogPhonemes( outwords );
|
||
|
|
||
|
ComputeMissingByteSpans( numsamples, outwords );
|
||
|
|
||
|
Log( "\nFinal check\n" );
|
||
|
|
||
|
LogWords( outwords );
|
||
|
LogPhonemes( outwords );
|
||
|
}
|
||
|
}
|
||
|
else
|
||
|
{
|
||
|
pfnPrint( "Input sentence is empty!\n" );
|
||
|
}
|
||
|
|
||
|
// Return results
|
||
|
return result;
|
||
|
}
|
||
|
|
||
|
|
||
|
//-----------------------------------------------------------------------------
|
||
|
// Purpose: Expose the interface
|
||
|
//-----------------------------------------------------------------------------
|
||
|
class CPhonemeExtractorSAPI : public IPhonemeExtractor
|
||
|
{
|
||
|
public:
|
||
|
virtual PE_APITYPE GetAPIType() const
|
||
|
{
|
||
|
return SPEECH_API_SAPI;
|
||
|
}
|
||
|
|
||
|
// Used for menus, etc
|
||
|
virtual char const *GetName() const
|
||
|
{
|
||
|
return "MS SAPI 5.1";
|
||
|
}
|
||
|
|
||
|
SR_RESULT Extract(
|
||
|
const char *wavfile,
|
||
|
int numsamples,
|
||
|
void (*pfnPrint)( const char *fmt, ... ),
|
||
|
CSentence& inwords,
|
||
|
CSentence& outwords )
|
||
|
{
|
||
|
return SAPI_ExtractPhonemes( wavfile, numsamples, pfnPrint, inwords, outwords );
|
||
|
}
|
||
|
};
|
||
|
|
||
|
// Ties CPhonemeExtractorSAPI to the IPhonemeExtractor interface under the
//  VPHONEME_EXTRACTOR_INTERFACE name.  The macro is defined outside this file;
//  presumably it creates the single instance handed out by the module's
//  interface factory -- TODO confirm.
EXPOSE_SINGLE_INTERFACE( CPhonemeExtractorSAPI, IPhonemeExtractor, VPHONEME_EXTRACTOR_INTERFACE );
|