Skip to content

Instantly share code, notes, and snippets.

@qtangs
Created August 27, 2024 01:56

Revisions

  1. qtangs created this gist Aug 27, 2024.
    85 changes: 85 additions & 0 deletions streaming_tts_using_google.ts
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,85 @@
    const textToSpeech = require('@google-cloud/text-to-speech');

    const fs = require('fs');

    /**
     * Streams the given texts through Google Cloud streaming TTS and writes the
     * result to `output.wav` as 16-bit mono PCM at 24 kHz.
     *
     * Fixes over the naive version:
     *  - The returned promise settles only after the output file is fully written
     *    (previously the function resolved before any audio arrived).
     *  - The WAV header is back-patched with the real data size once streaming
     *    ends (previously the size fields stayed 0, which strict players reject).
     *  - A stream error rejects the promise instead of being logged and swallowed.
     *
     * @param texts chunks of text to synthesize, sent in order
     * @returns resolves when `output.wav` is complete; rejects on a TTS stream error
     */
    async function streamTextToSpeech(texts: string[]): Promise<void> {
        const client = new textToSpeech.TextToSpeechClient();

        const ttsStream = client.streamingSynthesize();

        // Write the response to a file, replace with your desired output stream
        const outputPath = 'output.wav';
        const writeStream = fs.createWriteStream(outputPath);

        // The audio data is headerless LINEAR16 audio with a sample rate of 24000.
        // https://github.com/googleapis/google-cloud-node/blob/main/packages/google-cloud-texttospeech/protos/google/cloud/texttospeech/v1/cloud_tts.proto#L352
        const sampleRate = 24000;
        const numChannels = 1; // Mono audio
        const byteRate = sampleRate * numChannels * 2;

        // Placeholder header: the size fields (offsets 4 and 40) are 0 for now
        // and are patched below once the total audio length is known.
        writeStream.write(createWavHeader(sampleRate, numChannels, byteRate, 0));

        let audioBytes = 0;

        await new Promise<void>((resolve, reject) => {
            // Handle the TTS response stream
            ttsStream.on('data', (response: any) => {
                if (response.audioContent) {
                    audioBytes += response.audioContent.length;
                    writeStream.write(response.audioContent);
                }
            });

            ttsStream.on('error', (err: any) => {
                writeStream.end();
                reject(err);
            });

            ttsStream.on('end', () => {
                // Resolve only after the file stream has flushed everything.
                writeStream.end(() => resolve());
            });

            // Note: Only Journey voices support streaming for now.
            ttsStream.write({streamingConfig: {voice: { name: 'en-us-Journey-O', languageCode: 'en-US', ssmlGender: "NEUTRAL" }}});

            // Stream the texts to TTS stream, replace with actual streaming texts
            for (const text of texts) {
                ttsStream.write({input: {text: text}});
            }

            ttsStream.end();
        });

        // Back-patch the RIFF chunk size (offset 4, fileSize - 8) and the data
        // chunk size (offset 40) so the WAV header reflects the real payload.
        const fd = fs.openSync(outputPath, 'r+');
        try {
            const riffSize = Buffer.alloc(4);
            riffSize.writeUInt32LE(36 + audioBytes, 0);
            fs.writeSync(fd, riffSize, 0, 4, 4);

            const dataSize = Buffer.alloc(4);
            dataSize.writeUInt32LE(audioBytes, 0);
            fs.writeSync(fd, dataSize, 0, 4, 40);
        } finally {
            fs.closeSync(fd);
        }
    }

    /**
     * Writes `str` into `view` starting at `offset`, one byte per UTF-16 code
     * unit (intended for 4-char ASCII chunk tags such as 'RIFF' or 'data').
     */
    function writeString(view: DataView, offset: number, str: string) {
        str.split('').forEach((ch, i) => {
            view.setUint8(offset + i, ch.charCodeAt(0));
        });
    }

    /**
     * Builds the canonical 44-byte RIFF/WAVE header for 16-bit PCM audio.
     * All multi-byte fields are little-endian, as the WAV format requires.
     *
     * @param sampleRate  samples per second (e.g. 24000)
     * @param numChannels channel count (1 = mono)
     * @param byteRate    sampleRate * numChannels * bytesPerSample
     * @param dataSize    payload size in bytes (0 if not yet known)
     * @returns the header as a 44-byte Uint8Array
     */
    function createWavHeader(sampleRate: number, numChannels: number, byteRate: number, dataSize: number): Uint8Array {
        const bytes = new Uint8Array(44);
        const view = new DataView(bytes.buffer);

        // Four-character ASCII chunk tag, one byte per character.
        const putTag = (offset: number, tag: string) => {
            for (let i = 0; i < tag.length; i++) {
                bytes[offset + i] = tag.charCodeAt(i);
            }
        };

        // RIFF chunk descriptor
        putTag(0, 'RIFF');
        view.setUint32(4, 36 + dataSize, true); // File size - 8
        putTag(8, 'WAVE');

        // fmt sub-chunk (16 = PCM subchunk size, 1 = PCM audio format)
        putTag(12, 'fmt ');
        view.setUint32(16, 16, true);
        view.setUint16(20, 1, true);
        view.setUint16(22, numChannels, true);
        view.setUint32(24, sampleRate, true);
        view.setUint32(28, byteRate, true);
        view.setUint16(32, numChannels * 2, true); // BlockAlign (16-bit samples)
        view.setUint16(34, 16, true);              // BitsPerSample

        // data sub-chunk
        putTag(36, 'data');
        view.setUint32(40, dataSize, true);

        return bytes;
    }

    // Example usage.
    // Fix: attach a .catch so a TTS/stream failure is reported instead of
    // becoming an unhandled promise rejection; drop the unused `data` param.
    streamTextToSpeech(['Hello,', ' world!'])
        .then(() => {
            console.log("Success");
        })
        .catch((err) => {
            console.error('Text-to-Speech failed:', err);
        });