Last active
September 18, 2020 19:59
-
-
Save kpprt/bba2b187e2cc3ac70014a61c72f3c83a to your computer and use it in GitHub Desktop.
A simple MonoBehaviour for testing the TextToSpeech service of IBM Watson via a WebSocket connection. Tested with IBM unity-sdk 4.8.0 and IBM unity-sdk-core 1.2.1. Currently there seems to be no way to determine the final length of the AudioClip while the data is still streaming, so we use an _estimatedAudioLength that defines the length of our playback AudioClip.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
using System.Collections; | |
using System.Collections.Generic; | |
using IBM.Cloud.SDK.Authentication; | |
using IBM.Cloud.SDK.Authentication.Iam; | |
using IBM.Cloud.SDK.Utilities; | |
using IBM.Watson.TextToSpeech.V1; | |
using UnityEngine; | |
/// <summary>
/// Test component that requests speech for <c>_outputText</c> from the IBM Watson
/// Text to Speech service over a WebSocket connection and plays the audio back
/// through the attached <see cref="AudioSource"/> while it is still being received.
/// </summary>
/// <remarks>
/// The final length of the audio is not known while streaming, so the playback clip
/// is created with a length derived from <c>_estimatedAudioLength</c>.
/// </remarks>
[RequireComponent(typeof(AudioSource))]
public class SimpleIBMTextToSpeechWS : MonoBehaviour {
    // Sample rate and channel count used for the streaming playback clip.
    // NOTE(review): assumes the service delivers 22050 Hz mono audio - confirm for the selected voice.
    private const int SampleRate = 22050;
    private const int ChannelCount = 1;

    [Header("Service Data")]
    [SerializeField] private string _iamApiKey = default;
    [SerializeField] private string _serviceUrl = default;
    [SerializeField] private string _voice = "en-US_MichaelV3Voice";

    [Header("Test Settings")]
    [SerializeField] private float _prebufferSeconds = 1f;
    [SerializeField] private string _outputText = "Hello Mamoon, thanks for helping out and giving support!";
    [SerializeField] private float _estimatedAudioLength = 5f;
    // Maximum time SayWS waits for the first audio data before giving up.
    [SerializeField] private float _responseTimeoutSeconds = 30f;

    private AudioSource _audioSource;
    private TextToSpeechService _textToSpeechWS;

    // Raw bytes received from the service so far.
    private List<byte> _byteStream;
    // Samples decoded from _byteStream; read by OnAudioRead.
    private float[] _floatStream;
    // Number of bytes of _byteStream that have already been decoded into _floatStream.
    private int _convertedBytes;
    // Index of the next sample in _floatStream that OnAudioRead hands out.
    private int _playheadPosition;
    // Streaming clip that is played back; created in Update once the first data was decoded.
    private AudioClip _audioClip;
    // Length (in samples) used when creating _audioClip.
    private int _estimatedClipLength;

    private void Awake() {
        _audioSource = GetComponent<AudioSource>();
    }

    /// <summary>
    /// Initializes the service and then repeatedly requests, prebuffers and plays the
    /// configured text. The loop stops if a request does not produce an audio clip.
    /// </summary>
    private IEnumerator Start() {
        Debug.Log("Initializing Service.");
        yield return InitializeService();

        while(true) {
            float start = Time.realtimeSinceStartup;

            Debug.Log("Requesting Audio.");
            yield return SayWS(_outputText, Mathf.RoundToInt(_estimatedAudioLength * SampleRate));

            if(_audioClip == null) {
                // SayWS failed or timed out; playing would throw, so end the test loop instead.
                OnError("No audio clip available. Stopping the test loop.");
                yield break;
            }

            Debug.LogFormat("Prebuffering {0} seconds.", _prebufferSeconds);
            yield return new WaitForSeconds(_prebufferSeconds);

            Debug.Log("Playing the Audio.");
            _audioSource.PlayOneShot(_audioClip);
            yield return new WaitForSeconds(_audioClip.length);

            float duration = Time.realtimeSinceStartup - start;
            Debug.LogFormat("Stopping Playback after {0} seconds.", duration.ToString("0.0"));
            Debug.Log("----------------------------------------------------------------");
        }
    }

    /// <summary>
    /// Decodes newly received bytes into samples and creates the streaming playback
    /// clip once the first data has been decoded.
    /// </summary>
    private void Update() {
        if(_byteStream == null || _byteStream.Count == 0 || _convertedBytes == _byteStream.Count) {
            // nothing received yet, or everything received has already been converted
            return;
        }

        // Take one snapshot and derive the converted count from it, so the count always
        // matches the bytes that were actually parsed.
        byte[] received = _byteStream.ToArray();
        _convertedBytes = received.Length;

        // ParseWAV converts the whole byte stream into an AudioClip each time we received a new chunk.
        // It would be more efficient to only convert the data of the new chunk,
        // but WaveFile has no such function, yet.
        AudioClip parsed = WaveFile.ParseWAV("chunk", received);
        if(parsed == null) {
            // NOTE(review): presumably ParseWAV can return null for incomplete data - confirm.
            // We try again as soon as more bytes have arrived.
            return;
        }

        // Fill the buffer completely before publishing it, so OnAudioRead never sees a
        // freshly allocated (all zero) array.
        float[] samples = new float[parsed.samples * parsed.channels];
        parsed.GetData(samples, 0);
        _floatStream = samples;

        // The parsed clip was only needed to extract the samples; release it so that we
        // do not accumulate one AudioClip per received chunk.
        Destroy(parsed);

        // TODO: find out how long the clip will be before streaming the data
        if(_audioClip == null) {
            _audioClip = AudioClip.Create("stream", _estimatedClipLength, ChannelCount, SampleRate, true, OnAudioRead, OnSetPosition);
        }
    }

    /// <summary>
    /// Creates the Text to Speech service with IAM authentication and applies the
    /// configured service URL and voice. Waits until the authenticator reports that it
    /// can authenticate.
    /// </summary>
    public IEnumerator InitializeService() {
        Authenticator authenticator = new IamAuthenticator(_iamApiKey);
        yield return new WaitUntil(() => authenticator.CanAuthenticate());

        _textToSpeechWS = new TextToSpeechService(authenticator);
        _textToSpeechWS.SetServiceUrl(_serviceUrl);
        _textToSpeechWS.Voice = _voice;
        _textToSpeechWS.OnError += OnError;
    }

    /// <summary>
    /// Requests speech for <paramref name="outputText"/> over the WebSocket connection and
    /// waits until the streaming playback clip has been created, or until
    /// <c>_responseTimeoutSeconds</c> have passed. On failure an error is logged and
    /// <c>_audioClip</c> is not set by this call.
    /// </summary>
    /// <param name="outputText">The text to synthesize.</param>
    /// <param name="estimatedClipLength">Length in samples used for the playback clip.</param>
    public IEnumerator SayWS(string outputText, int estimatedClipLength) {
        if(_textToSpeechWS == null) {
            OnError("Service not initialized.");
            // without a service there is nothing to talk to; continuing would dereference null
            yield break;
        }

        if(_textToSpeechWS.IsListening) {
            // we only start a new output when we have stopped listening
            OnError("Already listening to WebSocket.");
            yield break;
        }

        _estimatedClipLength = estimatedClipLength;

        // initialize fields
        _byteStream = new List<byte>();
        _floatStream = null;
        _convertedBytes = 0;
        _playheadPosition = 0;
        _audioClip = null;

        // CAUTION: bool only checks for isListening, but not for a wrong service url
        if(!_textToSpeechWS.StartListening(OnSynthesize)) {
            OnError("Could not connect to WebSocket.");
            yield break;
        }

        // CAUTION: bool only checks for isListening, but not for a wrong service url
        if(!_textToSpeechWS.SynthesizeUsingWebsockets(outputText)) {
            OnError("Could not synthesize text.");
            yield break;
        }

        // _audioClip will be set in Update once the first data has been decoded.
        // The deadline keeps us from waiting forever when no data ever arrives.
        float deadline = Time.realtimeSinceStartup + _responseTimeoutSeconds;
        yield return new WaitWhile(() => _audioClip == null && Time.realtimeSinceStartup < deadline);

        if(_audioClip == null) {
            OnError("Timed out while waiting for audio data.");
        }
    }

    /// <summary>Appends a received chunk of audio bytes to the byte stream.</summary>
    private void OnSynthesize(byte[] result) {
        // Debug.Log("Synthesizing");
        if(result == null || _byteStream == null) {
            return;
        }

        // append the data to the stream
        _byteStream.AddRange(result);
    }

    /// <summary>
    /// Reader callback of the streaming clip: fills <paramref name="data"/> with the next
    /// decoded samples and pads with silence when not enough samples are available.
    /// </summary>
    private void OnAudioRead(float[] data) {
        // Work on local snapshots: Update replaces _floatStream whenever new data was decoded.
        // NOTE(review): this callback may be invoked off the main thread - confirm.
        float[] stream = _floatStream;
        List<byte> bytes = _byteStream;
        int playhead = _playheadPosition;

        if(stream == null || bytes == null || bytes.Count == 0 || playhead == stream.Length) {
            // we have not received audio data yet
            // return empty audio
            System.Array.Clear(data, 0, data.Length);
            return;
        }

        // number of samples we can actually deliver in this call
        int available = Mathf.Clamp(stream.Length - playhead, 0, data.Length);
        if(available > 0) {
            System.Array.Copy(stream, playhead, data, 0, available);
        }

        if(available < data.Length) {
            // not enough data in the stream: fill the rest with silence
            Debug.LogWarning("End of streamed data");
            System.Array.Clear(data, available, data.Length - available);
        }

        // advance only by the samples that were really handed out
        _playheadPosition = playhead + available;
    }

    /// <summary>Position callback of the streaming clip: moves the playhead.</summary>
    private void OnSetPosition(int position) {
        _playheadPosition = position;
    }

    /// <summary>Logs an error message; also subscribed to the service's error event.</summary>
    private void OnError(string errorMessage) {
        Debug.LogError(errorMessage);
    }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment