Hello,
Welcome to Microsoft Q&A.
From your description, you need to determine which parts of the text have already been read, based on the playback progress of the audio stream generated by SpeechSynthesizer.
Here is a simple example:
SpeechPage.xaml
<Grid>
<StackPanel HorizontalAlignment="Center" VerticalAlignment="Center">
<!-- Shows the portion of the text that has been read so far (updated from code-behind). -->
<TextBlock x:Name="ReadTextBlock" HorizontalAlignment="Center" TextAlignment="Center"
FontSize="20" TextWrapping="Wrap" MaxWidth="300"/>
<!-- Starts synthesis and playback via Button_Click in SpeechPage.xaml.cs. -->
<Button Content="Start" Click="Button_Click" HorizontalAlignment="Center" Margin="0,15,0,0"/>
</StackPanel>
</Grid>
SpeechPage.xaml.cs
public sealed partial class SpeechPage : Page
{
    // Plays the synthesized audio stream.
    private readonly MediaPlayer _player = new MediaPlayer();
    // Produces the speech stream along with word/sentence boundary metadata.
    private readonly SpeechSynthesizer _synth = new SpeechSynthesizer();
    // Accumulates the words that have been spoken so far.
    private string readText = "";
    // The full text that will be synthesized and read aloud.
    private string totalText = "This is a simple example to test the progress of text reading";

    public SpeechPage()
    {
        this.InitializeComponent();
        // Emit boundary cues into the synthesized stream so reading progress
        // can be tracked through TimedMetadataTrack events during playback.
        _synth.Options.IncludeWordBoundaryMetadata = true;
        _synth.Options.IncludeSentenceBoundaryMetadata = true;
    }

    /// <summary>
    /// Synthesizes the text to an audio stream, wires up word-boundary cue
    /// handlers, and starts playback.
    /// </summary>
    private async void Button_Click(object sender, RoutedEventArgs e)
    {
        // Reset progress for a fresh playback run.
        readText = "";
        SpeechSynthesisStream synthesisStream = await _synth.SynthesizeTextToStreamAsync(totalText);
        // Create a media source from the stream:
        var mediaSource = MediaSource.CreateFromStream(synthesisStream, synthesisStream.ContentType);
        // Create a Media Playback Item
        var mediaPlaybackItem = new MediaPlaybackItem(mediaSource);
        RegisterForWordBoundaryEvents(mediaPlaybackItem);
        _player.Source = mediaPlaybackItem;
        _player.Play();
    }

    /// <summary>
    /// This function executes when a SpeechCue is hit and calls the functions to update the UI.
    /// </summary>
    /// <param name="timedMetadataTrack">The timedMetadataTrack associated with the event.</param>
    /// <param name="args">The arguments associated with the event.</param>
    private async void metadata_SpeechCueEntered(TimedMetadataTrack timedMetadataTrack, MediaCueEventArgs args)
    {
        // Check in case there are different tracks and the handler was used for more tracks.
        if (timedMetadataTrack.TimedMetadataKind == TimedMetadataKind.Speech)
        {
            // Pattern match instead of 'as' + null check.
            if (args.Cue is SpeechCue cue)
            {
                System.Diagnostics.Debug.WriteLine($"Hit Cue with start:{cue.StartPositionInInput} and end:{cue.EndPositionInInput}");
                System.Diagnostics.Debug.WriteLine($"Cue text:[{cue.Text}]");
                // Cue events arrive on a background thread; marshal UI updates
                // to the dispatcher thread.
                await Dispatcher.RunAsync(CoreDispatcherPriority.Normal,
                    () =>
                    {
                        // Skip a cue that spans the whole input (not an individual word).
                        if (cue.StartPositionInInput == 0 && cue.EndPositionInInput == totalText.Length)
                            return;
                        readText += cue.Text + " ";
                        ReadTextBlock.Text = readText.Trim();
                    });
            }
        }
    }

    /// <summary>
    /// Register for all boundary events and register a function to add any new events if they arise.
    /// </summary>
    /// <param name="mediaPlaybackItem">The Media Playback Item to add handlers to.</param>
    private void RegisterForWordBoundaryEvents(MediaPlaybackItem mediaPlaybackItem)
    {
        // If tracks were available at source resolution time, iterate through and register:
        for (int index = 0; index < mediaPlaybackItem.TimedMetadataTracks.Count; index++)
        {
            RegisterMetadataHandlerForWords(mediaPlaybackItem, index);
        }
        // Since the tracks are added later we will
        // monitor the tracks being added and subscribe to the ones of interest.
        mediaPlaybackItem.TimedMetadataTracksChanged += (MediaPlaybackItem sender, IVectorChangedEventArgs args) =>
        {
            if (args.CollectionChange == CollectionChange.ItemInserted)
            {
                RegisterMetadataHandlerForWords(sender, (int)args.Index);
            }
            else if (args.CollectionChange == CollectionChange.Reset)
            {
                for (int index = 0; index < sender.TimedMetadataTracks.Count; index++)
                {
                    RegisterMetadataHandlerForWords(sender, index);
                }
            }
        };
    }

    /// <summary>
    /// Register for just word boundary events.
    /// </summary>
    /// <param name="mediaPlaybackItem">The Media Playback Item to register handlers for.</param>
    /// <param name="index">Index of the timedMetadataTrack within the mediaPlaybackItem.</param>
    private void RegisterMetadataHandlerForWords(MediaPlaybackItem mediaPlaybackItem, int index)
    {
        var timedTrack = mediaPlaybackItem.TimedMetadataTracks[index];
        // Register for only word cues (the "SpeechWord" track id is produced by
        // IncludeWordBoundaryMetadata = true).
        if (timedTrack.Id == "SpeechWord")
        {
            timedTrack.CueEntered += metadata_SpeechCueEntered;
            // ApplicationPresented delivers cues to our handler without the
            // platform rendering them itself.
            mediaPlaybackItem.TimedMetadataTracks.SetPresentationMode((uint)index, TimedMetadataTrackPresentationMode.ApplicationPresented);
        }
    }
}
By setting SpeechSynthesizerOptions.IncludeWordBoundaryMetadata = true, the stream produced by SpeechSynthesizer.SynthesizeTextToStreamAsync() contains metadata describing the text being read. By handling the cues on the resulting TimedMetadataTrack, we can then determine which text is currently being read.
Microsoft provides a more complete sample, which you can refer to here.
Thanks.