Skip to content

Instantly share code, notes, and snippets.

@kpprt
Last active September 18, 2020 19:59
Show Gist options
  • Save kpprt/bba2b187e2cc3ac70014a61c72f3c83a to your computer and use it in GitHub Desktop.
Save kpprt/bba2b187e2cc3ac70014a61c72f3c83a to your computer and use it in GitHub Desktop.
A simple MonoBehaviour for testing the TextToSpeech service of IBM Watson via Websocket connection. Tested with IBM unity-sdk 4.8.0 and IBM unity-sdk-core 1.2.1. Currently there seems to be no way to determine the final length of the AudioClip while streaming the data. So we use an _estimatedAudioLength that defines the length of our Playback AudioClip in advance.
using System.Collections;
using System.Collections.Generic;
using IBM.Cloud.SDK.Authentication;
using IBM.Cloud.SDK.Authentication.Iam;
using IBM.Cloud.SDK.Utilities;
using IBM.Watson.TextToSpeech.V1;
using UnityEngine;
/// <summary>
/// A simple MonoBehaviour for testing the TextToSpeech service of IBM Watson via a
/// WebSocket connection (tested with IBM unity-sdk 4.8.0 and unity-sdk-core 1.2.1).
/// There is currently no way to determine the final length of the AudioClip while
/// streaming, so <see cref="_estimatedAudioLength"/> defines the length of the
/// playback AudioClip up front.
/// </summary>
[RequireComponent(typeof(AudioSource))]
public class SimpleIBMTextToSpeechWS : MonoBehaviour {

    // Sample rate (Hz) of the audio delivered by the service; also used to convert
    // the estimated length in seconds into a sample count.
    private const int SampleRate = 22050;

    [Header("Service Data")]
    [SerializeField] private string _iamApiKey = default;
    [SerializeField] private string _serviceUrl = default;
    [SerializeField] private string _voice = "en-US_MichaelV3Voice";

    [Header("Test Settings")]
    [SerializeField] private float _prebufferSeconds = 1f;
    [SerializeField] private string _outputText = "Hello Mamoon, thanks for helping out and giving support!";
    [SerializeField] private float _estimatedAudioLength = 5f;

    private AudioSource _audioSource;
    private TextToSpeechService _textToSpeechWS;   // WebSocket-based service instance
    private List<byte> _byteStream;                // raw WAV bytes received from the WebSocket
    private float[] _floatStream;                  // PCM samples converted from _byteStream so far
    private int _convertedBytes;                   // number of bytes of _byteStream already converted
    private int _playheadPosition;                 // read position (in samples) within _floatStream
    private AudioClip _audioClip;                  // streaming clip handed to the AudioSource
    private int _estimatedClipLength;              // estimated clip length in samples

    private void Awake() {
        _audioSource = GetComponent<AudioSource>();
    }

    /// <summary>
    /// Initializes the service, then loops forever: request audio for
    /// <see cref="_outputText"/>, prebuffer, play, wait for the estimated length.
    /// </summary>
    private IEnumerator Start() {
        Debug.Log("Initializing Service.");
        yield return InitializeService();
        while(true) {
            float start = Time.realtimeSinceStartup;
            Debug.Log("Requesting Audio.");
            yield return SayWS(_outputText, Mathf.RoundToInt(_estimatedAudioLength * SampleRate));
            Debug.LogFormat("Prebuffering {0} seconds.", _prebufferSeconds);
            yield return new WaitForSeconds(_prebufferSeconds);
            Debug.Log("Playing the Audio.");
            _audioSource.PlayOneShot(_audioClip);
            yield return new WaitForSeconds(_audioClip.length);
            float duration = Time.realtimeSinceStartup - start;
            Debug.LogFormat("Stopping Playback after {0} seconds.", duration.ToString("0.0"));
            Debug.Log("----------------------------------------------------------------");
        }
    }

    /// <summary>
    /// Converts any newly received bytes into PCM samples and lazily creates the
    /// streaming AudioClip once the first chunk has arrived.
    /// </summary>
    private void Update() {
        bool isByteStreamAvailable = _byteStream != null && _byteStream.Count > 0;
        if(isByteStreamAvailable && _convertedBytes != _byteStream.Count) {
            // we have received audio data that has not yet been converted / parsed
            AudioClip clip = WaveFile.ParseWAV("chunk", _byteStream.ToArray());
            // ParseWAV converts the whole byte stream into an AudioClip each time we
            // received a new chunk. It would be more efficient to only convert the data
            // of the new chunk, but WaveFile has no such function, yet.
            // get the data from the converted AudioClip
            _floatStream = new float[clip.samples * clip.channels];
            clip.GetData(_floatStream, 0);
            _convertedBytes = _byteStream.Count;
            // TODO: find out how long the clip will be before streaming the data
            if(_audioClip == null) {
                _audioClip = AudioClip.Create("stream", _estimatedClipLength, 1, SampleRate, true, OnAudioRead, OnSetPosition);
            }
        }
    }

    private void OnDestroy() {
        // unsubscribe so the service instance cannot call back into a destroyed component
        if(_textToSpeechWS != null) {
            _textToSpeechWS.OnError -= OnError;
        }
    }

    /// <summary>
    /// Authenticates against IBM Cloud and creates the TextToSpeech service instance.
    /// Must complete before <see cref="SayWS"/> is called.
    /// </summary>
    public IEnumerator InitializeService() {
        Authenticator authenticator = new IamAuthenticator(_iamApiKey);
        yield return new WaitUntil(() => authenticator.CanAuthenticate());
        _textToSpeechWS = new TextToSpeechService(authenticator);
        _textToSpeechWS.SetServiceUrl(_serviceUrl);
        _textToSpeechWS.Voice = _voice;
        _textToSpeechWS.OnError += OnError;
    }

    /// <summary>
    /// Requests synthesis of <paramref name="outputText"/> over the WebSocket and
    /// waits until the streaming AudioClip has been created (in <see cref="Update"/>).
    /// </summary>
    /// <param name="outputText">The text to synthesize.</param>
    /// <param name="estimatedClipLength">Estimated clip length in samples.</param>
    public IEnumerator SayWS(string outputText, int estimatedClipLength) {
        if(_textToSpeechWS == null) {
            OnError("Service not initialized.");
            // BUGFIX: without this the next statement dereferences null
            yield break;
        }
        if(_textToSpeechWS.IsListening) {
            // we only start a new output when we have stopped listening
            OnError("Already listening to WebSocket.");
            yield break;
        }
        _estimatedClipLength = estimatedClipLength;
        // initialize fields
        _byteStream = new List<byte>();
        _floatStream = null;
        _convertedBytes = 0;
        _playheadPosition = 0;
        _audioClip = null;
        // CAUTION: bool only checks for isListening, but not for a wrong service url
        if(!_textToSpeechWS.StartListening(OnSynthesize)) {
            OnError("Could not connect to WebSocket.");
            yield break;
        }
        // CAUTION: bool only checks for isListening, but not for a wrong service url
        if(!_textToSpeechWS.SynthesizeUsingWebsockets(outputText)) {
            OnError("Could not synthesize text.");
            yield break;
        }
        // TODO: handle case of an error, where _audioClip is never set
        // _audioClip will be set in Update
        yield return new WaitWhile(() => _audioClip == null);
    }

    /// <summary>Callback for each audio chunk received over the WebSocket.</summary>
    private void OnSynthesize(byte[] result) {
        // append the data to the stream; conversion happens in Update
        _byteStream.AddRange(result);
    }

    /// <summary>
    /// PCMReaderCallback for the streaming AudioClip: copies converted samples into
    /// <paramref name="data"/>, padding with silence when the stream runs dry.
    /// </summary>
    private void OnAudioRead(float[] data) {
        if(_floatStream == null || _byteStream == null || _byteStream.Count == 0 || _playheadPosition == _floatStream.Length) {
            // we have not received audio data yet (or the stream is exhausted)
            // return empty audio
            for(int i = 0; i < data.Length; i++) {
                data[i] = 0;
            }
            return;
        }
        int increment = data.Length;
        for(int i = 0; i < data.Length; i++) {
            int streamIndex = _playheadPosition + i;
            if(streamIndex < _floatStream.Length) {
                data[i] = _floatStream[streamIndex];
            } else {
                // not enough data in the stream; only advance the playhead by the
                // number of real samples written, so later chunks resume correctly
                if(i < increment) {
                    Debug.LogWarning("End of streamed data");
                    increment = i;// + 1;
                }
                data[i] = 0;
            }
        }
        _playheadPosition += increment;
    }

    /// <summary>PCMSetPositionCallback for the streaming AudioClip.</summary>
    private void OnSetPosition(int position) {
        _playheadPosition = position;
    }

    /// <summary>Central error sink for both local guards and service errors.</summary>
    private void OnError(string errorMessage) {
        Debug.LogError(errorMessage);
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment