Skip to content

Instantly share code, notes, and snippets.

@mbrock
Created November 29, 2024 23:44

Revisions

  1. mbrock created this gist Nov 29, 2024.
    528 changes: 528 additions & 0 deletions voice-writer.js
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,528 @@
    /**
    * Copyright (c) 2024 Mikael Brockman <https://github.com/mbrock>
    *
    * Permission is hereby granted, free of charge, to any person obtaining a copy
    * of this software and associated documentation files (the "Software"), to deal
    * in the Software without restriction, including without limitation the rights
    * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
    * copies of the Software, and to permit persons to whom the Software is
    * furnished to do so, subject to the following conditions:
    *
    * The above copyright notice and this permission notice shall be included in all
    * copies or substantial portions of the Software.
    *
    * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
    * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
    * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
    * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
    * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
    * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
    * SOFTWARE.
    */

    /**
    * TypeWriter reveals text gradually using the CSS Highlight API,
    * creating a smooth typing animation.
    *
    * The typing speed varies based on punctuation and position in the text,
    * with natural pauses at punctuation marks and acceleration as it progresses.
    */
    class TypeWriter extends HTMLElement {
      // Typing speed delays for different punctuation marks (in relative units).
      // The base speed is divided by the delay of the next hidden character.
      static delays = {
        " ": 3,
        ",": 8,
        ";": 8,
        ":": 9,
        ".": 10,
        "—": 12,
        "–": 7,
        "!": 15,
        "?": 15,
        "\n": 20,
      }

      // Mutable backing store for the speed configuration. The public getter
      // hands out copies, so only setSpeed() can change these values.
      static #speed = {
        min: 30, // Minimum characters per second
        max: 80, // Maximum characters per second
        curve: 2, // Acceleration curve power
      }

      // Stylesheet shared by every instance; created lazily on first connect.
      static #styleSheet = null

      /**
       * Base typing speed configuration.
       * @returns {{min: number, max: number, curve: number}} a copy of the
       *   current configuration; mutating it has no effect.
       */
      static get speedConfig() {
        return { ...TypeWriter.#speed }
      }

      constructor() {
        super()
        // Counter for how many characters to reveal
        this.limit = 0
        // Range object used to hide unrevealed text
        this.blind = new Range()
        // Handle of the pending typing step; undefined while idle
        this.timer = undefined
        // Observer that watches for content changes and triggers updates
        this.scout = new MutationObserver(() => {
          this.update()
          if (!this.timer) this.proceed()
        })
        this.lastTranscriptTime = Date.now()
      }

      /**
       * Installs the highlight style, registers the blind range, starts
       * observing content changes and kicks off the typing loop.
       */
      connectedCallback() {
        // Set up CSS highlight API for revealing text gradually. The sheet is
        // created once and adopted once, no matter how many elements connect.
        if (!TypeWriter.#styleSheet) {
          const css = new CSSStyleSheet()
          css.replaceSync(`::highlight(transparent) { color: transparent }`)
          TypeWriter.#styleSheet = css
        }
        if (!document.adoptedStyleSheets.includes(TypeWriter.#styleSheet)) {
          document.adoptedStyleSheets = [
            ...document.adoptedStyleSheets,
            TypeWriter.#styleSheet,
          ]
        }

        // Initialize the blind range to cover all content
        this.blind.selectNodeContents(this)

        // Create or get the highlight for unrevealed text
        const highlight = CSS.highlights.get("transparent") ?? new Highlight()
        highlight.add(this.blind)
        CSS.highlights.set("transparent", highlight)

        // Start observing content changes
        this.scout.observe(this, {
          childList: true,
          subtree: true,
          characterData: true,
        })

        this.proceed()
      }

      /**
       * Stops observing, unregisters the blind range and cancels the pending
       * typing step.
       */
      disconnectedCallback() {
        this.scout.disconnect()
        CSS.highlights.get("transparent")?.delete(this.blind)
        clearTimeout(this.timer)
        // Forget the stale id so the observer can restart typing later
        this.timer = undefined
      }

      /**
       * Repositions the blind range so that it starts after the first
       * `this.limit` characters of text-node data and ends after the element.
       * If the limit exceeds the available text (for example because content
       * was removed), the limit is clamped and the range is collapsed at the
       * end of the content so nothing stays hidden.
       */
      update() {
        // Walk through text nodes to find where to place the blind range
        const walk = document.createTreeWalker(this, NodeFilter.SHOW_TEXT)
        let node = null
        let remaining = this.limit

        while ((node = walk.nextNode())) {
          const taken = Math.min(remaining, node.data.length)
          remaining -= taken
          if (remaining <= 0) {
            // Found the node where the reveal cutoff should be
            this.blind.setStart(node, taken)
            break
          }
        }

        if (remaining > 0) {
          // The limit overshoots the text: everything is revealed. Clamp the
          // counter to the real amount of text and move the blind start to the
          // end of the content instead of back to the beginning.
          this.limit -= remaining
          this.blind.selectNodeContents(this)
          this.blind.collapse(false)
        }

        // Always set blind to end after all content
        this.blind.setEndAfter(this)
      }

      /**
       * Reveals one more character and schedules the next step. When only
       * whitespace is left hidden, stops and dispatches "typingComplete".
       */
      proceed() {
        if (this.blind.toString().trim() === "") {
          this.timer = undefined
          this.dispatchEvent(new CustomEvent("typingComplete"))
          return
        }

        // textContent is measured in the same text-node data that update()
        // and the blind range count, unlike the rendered innerText.
        const totalLength = this.textContent.length
        this.limit = Math.min(this.limit + 1, totalLength)
        this.update()

        const remainingText = this.blind.toString()
        const speed = this.calculateSpeed(totalLength, remainingText)

        this.timer = setTimeout(() => this.proceed(), 1000 / speed)
      }

      /**
       * Computes the current typing speed.
       * @param {number} totalLength - total number of characters in the element
       * @param {string} remainingText - text that is still hidden
       * @returns {number} characters per second for the next step
       */
      calculateSpeed(totalLength, remainingText) {
        const { min, max, curve } = TypeWriter.speedConfig
        const speedRange = max - min
        // Treat an empty element as fully revealed and keep progress in [0, 1]
        const fraction =
          totalLength > 0 ? 1 - remainingText.length / totalLength : 1
        const progress = Math.min(1, Math.max(0, fraction))
        const baseSpeed = min + speedRange * progress ** curve
        const nextChar = remainingText[0]
        return baseSpeed / (TypeWriter.delays[nextChar] ?? 1)
      }

      /**
       * Scales the shared minimum and maximum typing speed. The change applies
       * to every TypeWriter and compounds with earlier calls.
       * @param {number} multiplier - positive, finite scale factor; other
       *   values are ignored with a warning
       */
      setSpeed(multiplier) {
        if (!Number.isFinite(multiplier) || multiplier <= 0) {
          console.warn("TypeWriter.setSpeed: ignoring invalid multiplier", multiplier)
          return
        }
        const { min, max } = TypeWriter.#speed
        TypeWriter.#speed.min = min * multiplier
        TypeWriter.#speed.max = max * multiplier
      }
    }

    // Register the element so <type-writer> tags are upgraded to TypeWriter
    customElements.define("type-writer", TypeWriter)

    /**
    * VoiceWriter is a custom element that combines speech recognition with typewriter effects.
    * It captures audio input, streams it to a WebSocket server for real-time transcription,
    * and displays the results with a typewriter effect. It handles both interim results
    * (shown faded) and final transcriptions (shown solid), creating a natural voice-to-text
    * experience with visual feedback.
    */
    class VoiceWriter extends HTMLElement {
      constructor() {
        super()
        this.handleTranscript = this.handleTranscript.bind(this)
        this.processAudio = this.processAudio.bind(this)
        this.reconnectAttempts = 0
        this.maxReconnectAttempts = 5
        this.reconnectDelay = 1000
        // Handle of the pending reconnect timeout, if any
        this.reconnectTimer = undefined
        // True while the element is detached; suppresses reconnects
        this.isTornDown = false
        // Interim <ins> that is waiting to be swapped for its final <span>
        this.finalizingElement = null

        // Track both instant rate and moving average
        this.bytesSent = 0
        this.uploadRate = 0
        this.movingAverage = 0
        this.alpha = 0.2 // Smoothing factor: weight given to the newest sample
        this.uploadRateInterval = null
        this.isRecording = true
        this.lastTranscriptTime = Date.now()
      }

      static get observedAttributes() {
        return ["language", "server"]
      }

      /** Language tag sent to the server; defaults to "en-US". */
      get language() {
        return this.getAttribute("language") ?? "en-US"
      }

      /** WebSocket origin of the transcription server. */
      get server() {
        return this.getAttribute("server") ?? "wss://swa.sh"
      }

      /** Full WebSocket URL, with the language safely encoded as a query value. */
      get wsUrl() {
        return `${this.server}/transcribe?language=${encodeURIComponent(this.language)}`
      }

      /**
       * Builds the UI (once), then asks for microphone access and starts
       * listening when it is granted.
       */
      connectedCallback() {
        this.isTornDown = false
        this.reconnectAttempts = 0
        this.reconnectDelay = 1000

        // Re-attaching the element must not duplicate the status bar or writer
        if (!this.statusBar) this.buildInterface()

        this.updateStatus("Initializing...")

        navigator.mediaDevices
          .getUserMedia({ audio: true })
          .then((stream) => this.beginListening(stream))
          .catch((error) => {
            console.error(error)
            // Surface the failure instead of staying on "Initializing..."
            this.updateStatus(
              "Microphone unavailable",
              "text-red-600 dark:text-red-400"
            )
          })
      }

      /**
       * Creates the status bar, recording toggle, optional debug button and the
       * <type-writer> output element, and appends them to this element.
       */
      buildInterface() {
        this.writer = document.createElement("type-writer")
        this.writer.className = "block p-4"

        // Create status bar
        this.statusBar = document.createElement("div")
        this.statusBar.className =
          "flex items-center font-sans py-1 justify-between gap-2 px-1 text-sm bg-gray-100 dark:bg-gray-800/30 border-b border-gray-300 dark:border-gray-700"

        // Create status section
        this.statusSection = document.createElement("div")
        this.statusSection.className = "flex items-center gap-2"
        this.statusBar.appendChild(this.statusSection)

        // Create recording toggle
        const toggleContainer = document.createElement("div")
        toggleContainer.className = "flex items-center gap-2"

        const toggle = document.createElement("input")
        toggle.type = "checkbox"
        toggle.id = "recordingToggle"
        toggle.checked = this.isRecording
        toggle.className =
          "form-checkbox h-4 w-4 text-blue-600 transition duration-150 ease-in-out"
        toggle.addEventListener("change", (e) => {
          this.isRecording = e.target.checked
          this.updateStatus(
            this.isRecording ? "Listening..." : "Paused",
            this.isRecording
              ? "text-emerald-600 dark:text-emerald-400"
              : "text-yellow-600 dark:text-yellow-400"
          )
        })

        const label = document.createElement("label")
        label.htmlFor = "recordingToggle"
        label.className = "text-sm text-gray-600 dark:text-gray-400"
        label.textContent = ""

        toggleContainer.appendChild(toggle)
        toggleContainer.appendChild(label)
        this.statusBar.prepend(toggleContainer)

        // Add debug button to status bar if debug mode is enabled
        if (this.hasAttribute("debug")) {
          const debugButton = document.createElement("button")
          debugButton.className =
            "px-2 hover:bg-slate-200 dark:hover:bg-slate-800 transition-colors"
          debugButton.textContent = "🔌"
          debugButton.addEventListener("click", () => {
            console.log("🎤 Debug: Simulating WebSocket disconnect")
            if (this.ws?.readyState === WebSocket.OPEN) {
              this.ws.close()
            }
          })
          this.statusBar.appendChild(debugButton)
        }

        this.appendChild(this.statusBar)
        this.appendChild(this.writer)
      }

      /**
       * Releases every resource acquired while listening: timers, the encoder,
       * audio nodes, the audio context, the microphone stream and the socket.
       */
      disconnectedCallback() {
        // Set first so the socket's "close" handler does not schedule a reconnect
        this.isTornDown = true

        clearInterval(this.uploadRateInterval)
        this.uploadRateInterval = null
        clearTimeout(this.reconnectTimer)
        this.reconnectTimer = undefined

        this.processor?.removeEventListener("audioprocess", this.processAudio)
        // close() throws on an encoder that is already closed
        if (this.encoder && this.encoder.state !== "closed") this.encoder.close()
        this.processor?.disconnect()
        this.source?.disconnect()
        // close() rejects on a context that is already closed
        if (this.context && this.context.state !== "closed") this.context.close()
        // Stop the tracks so the browser releases the microphone
        this.stream?.getTracks().forEach((track) => track.stop())
        this.ws?.close()

        this.encoder = undefined
        this.processor = undefined
        this.source = undefined
        this.context = undefined
        this.stream = undefined
      }

      /**
       * Records the socket state in the `data-ws-state` attribute and shows the
       * matching status message.
       * @param {string} state - "connecting", "connected" or "disconnected"
       */
      setWebSocketState(state) {
        this.setAttribute("data-ws-state", state)

        const statusMessages = {
          connecting: "Connecting...",
          connected: "Listening...",
          disconnected: "Reconnecting...",
        }

        const statusColors = {
          connecting: "text-yellow-600 dark:text-yellow-400",
          connected: "text-emerald-600 dark:text-emerald-400",
          disconnected: "text-red-600 dark:text-red-400",
        }

        this.updateStatus(
          statusMessages[state] || state,
          statusColors[state] || "text-gray-600 dark:text-gray-400"
        )
      }

      /**
       * Replaces the status text.
       * @param {string} message - text to display
       * @param {string} [colorClass] - CSS classes applied to the text
       */
      updateStatus(message, colorClass = "text-gray-600 dark:text-gray-400") {
        const text = document.createElement("span")
        text.className = colorClass
        text.textContent = message
        // Swap the old content for the new message in one step
        this.statusSection.replaceChildren(text)
      }

      /**
       * Opens the WebSocket and wires up its listeners. Resolves as soon as the
       * socket object exists; it does not wait for the connection to open.
       */
      async connectWebSocket() {
        try {
          console.log("🎤 Connecting to transcription service...")
          this.setWebSocketState("connecting")
          this.ws = new WebSocket(this.wsUrl)
          this.ws.binaryType = "arraybuffer"

          this.ws.addEventListener("message", this.handleTranscript)
          this.ws.addEventListener("open", () => {
            console.log("🎤 Connected to transcription service")
            this.setWebSocketState("connected")
            this.reconnectAttempts = 0
            this.reconnectDelay = 1000
          })

          this.ws.addEventListener("close", () => {
            console.log("🎤 Disconnected from transcription service")
            // A close caused by removing the element is final
            if (this.isTornDown) return
            this.setWebSocketState("disconnected")
            this.attemptReconnect()
          })

          this.ws.addEventListener("error", (error) => {
            console.error("🎤 WebSocket error:", error)
            if (this.isTornDown) return
            this.setWebSocketState("disconnected")
          })
        } catch (error) {
          console.error("🎤 Failed to connect:", error)
          this.setWebSocketState("disconnected")
        }
      }

      /**
       * Schedules a reconnect with exponential backoff (capped at 10 s), up to
       * `maxReconnectAttempts` times.
       */
      attemptReconnect() {
        if (this.isTornDown) return

        if (this.reconnectAttempts >= this.maxReconnectAttempts) {
          console.error("🎤 Max reconnection attempts reached, giving up")
          // Do not leave the status stuck on "Reconnecting..."
          this.updateStatus("Disconnected", "text-red-600 dark:text-red-400")
          return
        }

        const attempt = this.reconnectAttempts + 1
        const delay = this.reconnectDelay / 1000
        console.log(
          `🎤 Attempting to reconnect (attempt ${attempt}/${this.maxReconnectAttempts}) in ${delay}s...`
        )

        clearTimeout(this.reconnectTimer)
        this.reconnectTimer = setTimeout(() => {
          this.reconnectTimer = undefined
          if (this.isTornDown) return
          this.reconnectAttempts++
          this.connectWebSocket()
          // Exponential backoff
          this.reconnectDelay = Math.min(this.reconnectDelay * 2, 10000)
        }, this.reconnectDelay)
      }

      /**
       * Connects to the server and builds the audio pipeline:
       * microphone -> script processor -> Opus encoder -> WebSocket.
       * @param {MediaStream} stream - microphone stream from getUserMedia
       */
      async beginListening(stream) {
        if (this.isTornDown) {
          // The element was removed while the permission prompt was open
          stream.getTracks().forEach((track) => track.stop())
          return
        }
        this.stream = stream

        await this.connectWebSocket()

        // Set up audio context and nodes
        this.context = new AudioContext()
        this.source = this.context.createMediaStreamSource(stream)

        const channels = stream.getAudioTracks()[0].getSettings().channelCount ?? 1
        this.processor = this.context.createScriptProcessor(16384, channels, 1)

        // Set up encoder
        this.encoder = new AudioEncoder({
          output: (packet) => {
            if (this.ws?.readyState === WebSocket.OPEN) {
              const buffer = new ArrayBuffer(packet.byteLength)
              packet.copyTo(buffer)
              this.ws.send(buffer)
              this.bytesSent += buffer.byteLength
            }
          },
          error: console.error,
        })

        // Sample the byte counter twice a second; doubling the half-second
        // count gives bytes per second, which feeds the moving average.
        clearInterval(this.uploadRateInterval)
        this.uploadRateInterval = setInterval(() => {
          this.uploadRate = this.bytesSent * 2
          this.movingAverage =
            this.alpha * this.uploadRate + (1 - this.alpha) * this.movingAverage
          this.bytesSent = 0 // reset counter
          this.updateUploadRate()
        }, 500)

        // Configure the encoder with the rate the context actually runs at, so
        // it matches the samples handed over in processAudio().
        this.encoder.configure({
          codec: "opus",
          sampleRate: this.context.sampleRate,
          numberOfChannels: 1,
          opus: {
            application: "lowdelay",
            signal: "voice",
          },
        })

        // Wire up audio pipeline
        this.source.connect(this.processor)
        this.processor.connect(this.context.destination)
        this.processor.addEventListener("audioprocess", this.processAudio)
      }

      /**
       * Feeds one buffer of microphone samples (first channel only) to the
       * encoder. Skipped while paused, disconnected or before the encoder is
       * configured.
       * @param {AudioProcessingEvent} event
       */
      processAudio(event) {
        if (this.ws?.readyState !== WebSocket.OPEN || !this.isRecording) return
        // encode() throws unless the encoder is in the "configured" state
        if (this.encoder?.state !== "configured") return

        const samples = event.inputBuffer.getChannelData(0)
        const frame = new AudioData({
          data: samples,
          timestamp: Math.round(event.playbackTime * 1000000),
          format: "f32",
          numberOfChannels: 1,
          numberOfFrames: samples.length,
          // Label the samples with the rate they were captured at
          sampleRate: event.inputBuffer.sampleRate,
        })

        this.encoder.encode(frame)
        // Close the frame explicitly now that it has been handed to the encoder
        frame.close()
      }

      /** Shows the smoothed upload rate in the status bar. */
      updateUploadRate() {
        const avgRate = this.movingAverage

        const formatRate = (rate) => {
          if (rate > 1024) {
            return `${(rate / 1024).toFixed(1)} KB/s`
          } else {
            return `${rate.toFixed(1)} B/s`
          }
        }

        // Update or create upload rate element
        if (!this.uploadRateElement) {
          this.uploadRateElement = document.createElement("span")
          this.uploadRateElement.className =
            "font-mono text-blue-600 dark:text-blue-400 ml-auto mr-2"
          const debugButton = this.statusBar.querySelector("button")
          if (debugButton) {
            this.statusBar.insertBefore(this.uploadRateElement, debugButton)
          } else {
            this.statusBar.appendChild(this.uploadRateElement)
          }
        }

        // Show moving average rate
        this.uploadRateElement.textContent = `${formatRate(avgRate)}`
      }

      /**
       * Handles a server message. Interim results are written into a trailing
       * <ins>; a final result replaces that <ins> with a <span>.
       * @param {MessageEvent} event - WebSocket message; non-string data is ignored
       */
      handleTranscript(event) {
        if (typeof event.data !== "string") return

        try {
          const result = JSON.parse(event.data)
          if (
            result.type !== "Results" ||
            !result.channel?.alternatives?.[0]?.transcript
          )
            return

          let text = result.channel.alternatives[0].transcript

          const currentTime = Date.now()
          const timeSinceLastTranscript = currentTime - this.lastTranscriptTime
          console.log(`🎤 Time since last transcript: ${timeSinceLastTranscript}ms`)

          // Add line break if more than 5 seconds have passed
          if (timeSinceLastTranscript > 5000 && this.writer.lastElementChild) {
            const lineBreak = document.createElement("br")
            this.writer.appendChild(lineBreak)
          }

          this.lastTranscriptTime = currentTime

          let element = this.writer.lastElementChild

          // Use <ins> for interim results that may change. An <ins> that is
          // still waiting to be swapped for its final <span> must not be reused,
          // or the new text would be lost when the swap happens.
          if (
            !element ||
            !element.matches("ins") ||
            element === this.finalizingElement
          ) {
            element = document.createElement("ins")
            this.writer.appendChild(element)
          }

          element.textContent = text

          if (result.is_final) {
            // Convert interim <ins> to final <span> when transcription is confirmed
            const span = document.createElement("span")
            if (!text.match(/[.!?]$/)) text += "—"

            span.textContent = `${text} `

            this.finalizingElement = element
            const swap = () => {
              element.replaceWith(span)
              if (this.finalizingElement === element) this.finalizingElement = null
            }

            // Use view transitions API for smooth visual update where it
            // exists; otherwise commit the final text immediately.
            if (typeof document.startViewTransition === "function") {
              document.startViewTransition(swap)
            } else {
              swap()
            }
          }
        } catch (error) {
          console.error("Error parsing transcript:", error)
        }
      }
    }

    // Document-level style for interim transcripts: <ins> elements inside a
    // <voice-writer> lose their underline and are drawn at reduced opacity.
    const sheet = new CSSStyleSheet()
    sheet.replaceSync(`
    voice-writer ins {
    text-decoration: none;
    opacity: 0.6;
    }
    `)

    // Adopt the sheet after whatever the document has already adopted
    document.adoptedStyleSheets = document.adoptedStyleSheets.concat(sheet)

    // Register the element so <voice-writer> tags are upgraded to VoiceWriter
    customElements.define("voice-writer", VoiceWriter)

    // Usage:
    // <voice-writer language="en-US" server="wss://swa.sh"></voice-writer>