Skip to content

Instantly share code, notes, and snippets.

@kpprt
Last active September 18, 2020 19:59
Show Gist options
  • Save kpprt/bba2b187e2cc3ac70014a61c72f3c83a to your computer and use it in GitHub Desktop.
Save kpprt/bba2b187e2cc3ac70014a61c72f3c83a to your computer and use it in GitHub Desktop.
A simple MonoBehaviour for testing the TextToSpeech service of IBM Watson via Websocket connection. Tested with IBM unity-sdk 4.8.0 and IBM unity-sdk-core 1.2.1. Currently there seems to be no way to determine the final length of the AudioClip while streaming the data. So we use an _estimatedAudioLength that defines the length of our Playback AudioClip in advance.
using System.Collections;
using System.Collections.Generic;
using IBM.Cloud.SDK.Authentication;
using IBM.Cloud.SDK.Authentication.Iam;
using IBM.Cloud.SDK.Utilities;
using IBM.Watson.TextToSpeech.V1;
using UnityEngine;
/// <summary>
/// A simple MonoBehaviour for testing the TextToSpeech service of IBM Watson via a
/// WebSocket connection (tested with IBM unity-sdk 4.8.0 and unity-sdk-core 1.2.1).
/// There is currently no way to determine the final length of the AudioClip while
/// streaming, so <see cref="_estimatedAudioLength"/> defines the length of the
/// playback AudioClip up front.
/// </summary>
[RequireComponent(typeof(AudioSource))]
public class SimpleIBMTextToSpeechWS : MonoBehaviour {

    // Sample rate (Hz) of the audio delivered by the service; also used to convert
    // the estimated length in seconds into a sample count.
    private const int SampleRate = 22050;

    [Header("Service Data")]
    [SerializeField] private string _iamApiKey = default;
    [SerializeField] private string _serviceUrl = default;
    [SerializeField] private string _voice = "en-US_MichaelV3Voice";

    [Header("Test Settings")]
    [SerializeField] private float _prebufferSeconds = 1f;
    [SerializeField] private string _outputText = "Hello Mamoon, thanks for helping out and giving support!";
    [SerializeField] private float _estimatedAudioLength = 5f;

    private AudioSource _audioSource;
    private TextToSpeechService _textToSpeechWS;   // WebSocket-based service instance
    private List<byte> _byteStream;                // raw WAV bytes received from the WebSocket
    private float[] _floatStream;                  // PCM samples converted from _byteStream so far
    private int _convertedBytes;                   // number of bytes of _byteStream already converted
    private int _playheadPosition;                 // read position (in samples) within _floatStream
    private AudioClip _audioClip;                  // streaming clip handed to the AudioSource
    private int _estimatedClipLength;              // estimated clip length in samples

    private void Awake() {
        _audioSource = GetComponent<AudioSource>();
    }

    /// <summary>
    /// Initializes the service, then loops forever: request audio for
    /// <see cref="_outputText"/>, prebuffer, play, wait for the estimated length.
    /// </summary>
    private IEnumerator Start() {
        Debug.Log("Initializing Service.");
        yield return InitializeService();
        while(true) {
            float start = Time.realtimeSinceStartup;
            Debug.Log("Requesting Audio.");
            yield return SayWS(_outputText, Mathf.RoundToInt(_estimatedAudioLength * SampleRate));
            Debug.LogFormat("Prebuffering {0} seconds.", _prebufferSeconds);
            yield return new WaitForSeconds(_prebufferSeconds);
            Debug.Log("Playing the Audio.");
            _audioSource.PlayOneShot(_audioClip);
            yield return new WaitForSeconds(_audioClip.length);
            float duration = Time.realtimeSinceStartup - start;
            Debug.LogFormat("Stopping Playback after {0} seconds.", duration.ToString("0.0"));
            Debug.Log("----------------------------------------------------------------");
        }
    }

    /// <summary>
    /// Converts any newly received bytes into PCM samples and lazily creates the
    /// streaming AudioClip once the first chunk has arrived.
    /// </summary>
    private void Update() {
        bool isByteStreamAvailable = _byteStream != null && _byteStream.Count > 0;
        if(isByteStreamAvailable && _convertedBytes != _byteStream.Count) {
            // we have received audio data that has not yet been converted / parsed
            AudioClip clip = WaveFile.ParseWAV("chunk", _byteStream.ToArray());
            // ParseWAV converts the whole byte stream into an AudioClip each time we
            // received a new chunk. It would be more efficient to only convert the data
            // of the new chunk, but WaveFile has no such function, yet.
            // get the data from the converted AudioClip
            _floatStream = new float[clip.samples * clip.channels];
            clip.GetData(_floatStream, 0);
            _convertedBytes = _byteStream.Count;
            // TODO: find out how long the clip will be before streaming the data
            if(_audioClip == null) {
                _audioClip = AudioClip.Create("stream", _estimatedClipLength, 1, SampleRate, true, OnAudioRead, OnSetPosition);
            }
        }
    }

    private void OnDestroy() {
        // unsubscribe so the service instance cannot call back into a destroyed component
        if(_textToSpeechWS != null) {
            _textToSpeechWS.OnError -= OnError;
        }
    }

    /// <summary>
    /// Authenticates against IBM Cloud and creates the TextToSpeech service instance.
    /// Must complete before <see cref="SayWS"/> is called.
    /// </summary>
    public IEnumerator InitializeService() {
        Authenticator authenticator = new IamAuthenticator(_iamApiKey);
        yield return new WaitUntil(() => authenticator.CanAuthenticate());
        _textToSpeechWS = new TextToSpeechService(authenticator);
        _textToSpeechWS.SetServiceUrl(_serviceUrl);
        _textToSpeechWS.Voice = _voice;
        _textToSpeechWS.OnError += OnError;
    }

    /// <summary>
    /// Requests synthesis of <paramref name="outputText"/> over the WebSocket and
    /// waits until the streaming AudioClip has been created (in <see cref="Update"/>).
    /// </summary>
    /// <param name="outputText">The text to synthesize.</param>
    /// <param name="estimatedClipLength">Estimated clip length in samples.</param>
    public IEnumerator SayWS(string outputText, int estimatedClipLength) {
        if(_textToSpeechWS == null) {
            OnError("Service not initialized.");
            // BUGFIX: without this the next statement dereferences null
            yield break;
        }
        if(_textToSpeechWS.IsListening) {
            // we only start a new output when we have stopped listening
            OnError("Already listening to WebSocket.");
            yield break;
        }
        _estimatedClipLength = estimatedClipLength;
        // initialize fields
        _byteStream = new List<byte>();
        _floatStream = null;
        _convertedBytes = 0;
        _playheadPosition = 0;
        _audioClip = null;
        // CAUTION: bool only checks for isListening, but not for a wrong service url
        if(!_textToSpeechWS.StartListening(OnSynthesize)) {
            OnError("Could not connect to WebSocket.");
            yield break;
        }
        // CAUTION: bool only checks for isListening, but not for a wrong service url
        if(!_textToSpeechWS.SynthesizeUsingWebsockets(outputText)) {
            OnError("Could not synthesize text.");
            yield break;
        }
        // TODO: handle case of an error, where _audioClip is never set
        // _audioClip will be set in Update
        yield return new WaitWhile(() => _audioClip == null);
    }

    /// <summary>Callback for each audio chunk received over the WebSocket.</summary>
    private void OnSynthesize(byte[] result) {
        // append the data to the stream; conversion happens in Update
        _byteStream.AddRange(result);
    }

    /// <summary>
    /// PCMReaderCallback for the streaming AudioClip: copies converted samples into
    /// <paramref name="data"/>, padding with silence when the stream runs dry.
    /// </summary>
    private void OnAudioRead(float[] data) {
        if(_floatStream == null || _byteStream == null || _byteStream.Count == 0 || _playheadPosition == _floatStream.Length) {
            // we have not received audio data yet (or the stream is exhausted)
            // return empty audio
            for(int i = 0; i < data.Length; i++) {
                data[i] = 0;
            }
            return;
        }
        int increment = data.Length;
        for(int i = 0; i < data.Length; i++) {
            int streamIndex = _playheadPosition + i;
            if(streamIndex < _floatStream.Length) {
                data[i] = _floatStream[streamIndex];
            } else {
                // not enough data in the stream; only advance the playhead by the
                // number of real samples written, so later chunks resume correctly
                if(i < increment) {
                    Debug.LogWarning("End of streamed data");
                    increment = i;// + 1;
                }
                data[i] = 0;
            }
        }
        _playheadPosition += increment;
    }

    /// <summary>PCMSetPositionCallback for the streaming AudioClip.</summary>
    private void OnSetPosition(int position) {
        _playheadPosition = position;
    }

    /// <summary>Central error sink for both local guards and service errors.</summary>
    private void OnError(string errorMessage) {
        Debug.LogError(errorMessage);
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment