Skip to content

Instantly share code, notes, and snippets.

@qtangs
Created August 27, 2024 01:56

Revisions

  1. qtangs created this gist Aug 27, 2024.
    85 changes: 85 additions & 0 deletions streaming_tts_using_google.ts
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,85 @@
    const textToSpeech = require('@google-cloud/text-to-speech');

    const fs = require('fs');

    /**
     * Streams the given texts through Google Cloud streaming TTS and writes the
     * result to `output.wav` as 16-bit mono PCM at 24 kHz.
     *
     * Fixes over the naive version:
     *  - The returned promise settles only after the output file is fully written
     *    (previously the function resolved before any audio arrived).
     *  - The WAV header is back-patched with the real data size once streaming
     *    ends (previously the size fields stayed 0, which strict players reject).
     *  - A stream error rejects the promise instead of being logged and swallowed.
     *
     * @param texts chunks of text to synthesize, sent in order
     * @returns resolves when `output.wav` is complete; rejects on a TTS stream error
     */
    async function streamTextToSpeech(texts: string[]): Promise<void> {
        const client = new textToSpeech.TextToSpeechClient();

        const ttsStream = client.streamingSynthesize();

        // Write the response to a file, replace with your desired output stream
        const outputPath = 'output.wav';
        const writeStream = fs.createWriteStream(outputPath);

        // The audio data is headerless LINEAR16 audio with a sample rate of 24000.
        // https://github.com/googleapis/google-cloud-node/blob/main/packages/google-cloud-texttospeech/protos/google/cloud/texttospeech/v1/cloud_tts.proto#L352
        const sampleRate = 24000;
        const numChannels = 1; // Mono audio
        const byteRate = sampleRate * numChannels * 2;

        // Placeholder header: the size fields (offsets 4 and 40) are 0 for now
        // and are patched below once the total audio length is known.
        writeStream.write(createWavHeader(sampleRate, numChannels, byteRate, 0));

        let audioBytes = 0;

        await new Promise<void>((resolve, reject) => {
            // Handle the TTS response stream
            ttsStream.on('data', (response: any) => {
                if (response.audioContent) {
                    audioBytes += response.audioContent.length;
                    writeStream.write(response.audioContent);
                }
            });

            ttsStream.on('error', (err: any) => {
                writeStream.end();
                reject(err);
            });

            ttsStream.on('end', () => {
                // Resolve only after the file stream has flushed everything.
                writeStream.end(() => resolve());
            });

            // Note: Only Journey voices support streaming for now.
            ttsStream.write({streamingConfig: {voice: { name: 'en-us-Journey-O', languageCode: 'en-US', ssmlGender: "NEUTRAL" }}});

            // Stream the texts to TTS stream, replace with actual streaming texts
            for (const text of texts) {
                ttsStream.write({input: {text: text}});
            }

            ttsStream.end();
        });

        // Back-patch the RIFF chunk size (offset 4, fileSize - 8) and the data
        // chunk size (offset 40) so the WAV header reflects the real payload.
        const fd = fs.openSync(outputPath, 'r+');
        try {
            const riffSize = Buffer.alloc(4);
            riffSize.writeUInt32LE(36 + audioBytes, 0);
            fs.writeSync(fd, riffSize, 0, 4, 4);

            const dataSize = Buffer.alloc(4);
            dataSize.writeUInt32LE(audioBytes, 0);
            fs.writeSync(fd, dataSize, 0, 4, 40);
        } finally {
            fs.closeSync(fd);
        }
    }

    /**
     * Writes `str` into `view` starting at `offset`, one byte per UTF-16 code
     * unit (intended for 4-char ASCII chunk tags such as 'RIFF' or 'data').
     */
    function writeString(view: DataView, offset: number, str: string) {
        str.split('').forEach((ch, i) => {
            view.setUint8(offset + i, ch.charCodeAt(0));
        });
    }

    /**
     * Builds the canonical 44-byte RIFF/WAVE header for 16-bit PCM audio.
     * All multi-byte fields are little-endian, as the WAV format requires.
     *
     * @param sampleRate  samples per second (e.g. 24000)
     * @param numChannels channel count (1 = mono)
     * @param byteRate    sampleRate * numChannels * bytesPerSample
     * @param dataSize    payload size in bytes (0 if not yet known)
     * @returns the header as a 44-byte Uint8Array
     */
    function createWavHeader(sampleRate: number, numChannels: number, byteRate: number, dataSize: number): Uint8Array {
        const bytes = new Uint8Array(44);
        const view = new DataView(bytes.buffer);

        // Four-character ASCII chunk tag, one byte per character.
        const putTag = (offset: number, tag: string) => {
            for (let i = 0; i < tag.length; i++) {
                bytes[offset + i] = tag.charCodeAt(i);
            }
        };

        // RIFF chunk descriptor
        putTag(0, 'RIFF');
        view.setUint32(4, 36 + dataSize, true); // File size - 8
        putTag(8, 'WAVE');

        // fmt sub-chunk (16 = PCM subchunk size, 1 = PCM audio format)
        putTag(12, 'fmt ');
        view.setUint32(16, 16, true);
        view.setUint16(20, 1, true);
        view.setUint16(22, numChannels, true);
        view.setUint32(24, sampleRate, true);
        view.setUint32(28, byteRate, true);
        view.setUint16(32, numChannels * 2, true); // BlockAlign (16-bit samples)
        view.setUint16(34, 16, true);              // BitsPerSample

        // data sub-chunk
        putTag(36, 'data');
        view.setUint32(40, dataSize, true);

        return bytes;
    }

    // Example usage.
    // Fix: attach a .catch so a TTS/stream failure is reported instead of
    // becoming an unhandled promise rejection; drop the unused `data` param.
    streamTextToSpeech(['Hello,', ' world!'])
        .then(() => {
            console.log("Success");
        })
        .catch((err) => {
            console.error('Text-to-Speech failed:', err);
        });