Created
August 27, 2024 01:56
Revisions
-
qtangs created this gist
Aug 27, 2024
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -0,0 +1,85 @@ const textToSpeech = require('@google-cloud/text-to-speech'); const fs = require('fs'); async function streamTextToSpeech(texts: string[]) { const client = new textToSpeech.TextToSpeechClient(); const ttsStream = client.streamingSynthesize(); // Write the response to a file, replace with your desired output stream const writeStream = fs.createWriteStream('output.wav'); // The audio data is headerless LINEAR16 audio with a sample rate of 24000. // https://github.com/googleapis/google-cloud-node/blob/main/packages/google-cloud-texttospeech/protos/google/cloud/texttospeech/v1/cloud_tts.proto#L352 const sampleRate = 24000; const numChannels = 1; // Mono audio const byteRate = sampleRate * numChannels * 2; const header = createWavHeader(sampleRate, numChannels, byteRate, 0); writeStream.write(header); // Handle the TTS response stream ttsStream.on('data', (response: any) => { if (response.audioContent) { writeStream.write(response.audioContent); } }); ttsStream.on('error', (err: any) => { console.error('Error during Text-to-Speech:', err); writeStream.end(); }); ttsStream.on('end', () => { console.log('Finished streaming Text-to-Speech'); writeStream.end(); }); // Note: Only Journey voices support streaming for now. 
ttsStream.write({streamingConfig: {voice: { name: 'en-us-Journey-O', languageCode: 'en-US', ssmlGender: "NEUTRAL" }}}); // Stream the texts to TTS stream, replace with actual streaming texts for (const text of texts) { ttsStream.write({input: {text: text}}); } ttsStream.end(); } function writeString(view: DataView, offset: number, str: string) { for (let i = 0; i < str.length; i++) { view.setUint8(offset + i, str.charCodeAt(i)); } } function createWavHeader(sampleRate: number, numChannels: number, byteRate: number, dataSize: number): Uint8Array { const header = new ArrayBuffer(44); const view = new DataView(header); // RIFF chunk descriptor writeString(view, 0, 'RIFF'); view.setUint32(4, 36 + dataSize, true); // File size - 8 writeString(view, 8, 'WAVE'); // fmt sub-chunk writeString(view, 12, 'fmt '); view.setUint32(16, 16, true); // Subchunk1Size (16 for PCM) view.setUint16(20, 1, true); // AudioFormat (1 for PCM) view.setUint16(22, numChannels, true); // NumChannels view.setUint32(24, sampleRate, true); // SampleRate view.setUint32(28, byteRate, true); // ByteRate (SampleRate * NumChannels * BitsPerSample/8) view.setUint16(32, numChannels * 2, true); // BlockAlign (NumChannels * BitsPerSample/8) view.setUint16(34, 16, true); // BitsPerSample // data sub-chunk writeString(view, 36, 'data'); view.setUint32(40, dataSize, true); // Subchunk2Size return new Uint8Array(header); } // Example usage streamTextToSpeech(['Hello,', ' world!']).then((data) => { console.log("Success"); } );