mbrock · November 29, 2024 23:44 · Nov 29, 2024
diff --git a/voice-writer.js b/voice-writer.js
@@ -0,0 +1,528 @@
+/**
+ * Copyright (c) 2024 Mikael Brockman <https://github.com/mbrock>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+/**
+ * TypeWriter reveals text gradually using the CSS Highlight API,
+ * creating a smooth typing animation.
+ *
+ * The typing speed varies based on punctuation and position in the text,
+ * with natural pauses at punctuation marks and acceleration as it progresses.
+ */
+class TypeWriter extends HTMLElement {
+  constructor() {
+    super()
+    // Counter for how many characters to reveal
+    this.limit = 0
+    // Range object used to hide unrevealed text
+    this.blind = new Range()
+    // Observer that watches for content changes and triggers updates
+    this.scout = new MutationObserver(() => {
+      this.update()
+      if (!this.timer) this.proceed()
+    })
+    this.lastTranscriptTime = Date.now()
+  }
+
+  // Typing speed delays for different punctuation marks (in relative units)
+  static delays = {
+    " ": 3,
+    ",": 8,
+    ";": 8,
+    ":": 9,
+    ".": 10,
+    "—": 12,
+    "–": 7,
+    "!": 15,
+    "?": 15,
+    "\n": 20,
+  }
+
+  // Base typing speed configuration
+  static get speedConfig() {
+    return {
+      min: 30, // Minimum characters per second
+      max: 80, // Maximum characters per second
+      curve: 2, // Acceleration curve power
+    }
+  }
+
+  connectedCallback() {
+    // Set up CSS highlight API for revealing text gradually
+    const css = new CSSStyleSheet()
+    css.replaceSync(`::highlight(transparent) { color: transparent }`)
+    document.adoptedStyleSheets = [...document.adoptedStyleSheets, css]
+
+    // Initialize the blind range to cover all content
+    this.blind.selectNodeContents(this)
+
+    // Create or get the highlight for unrevealed text
+    const highlight = CSS.highlights.get("transparent") ?? new Highlight()
+    highlight.add(this.blind)
+    CSS.highlights.set("transparent", highlight)
+
+    // Start observing content changes
+    this.scout.observe(this, {
+      childList: true,
+      subtree: true,
+      characterData: true,
+    })
+
+    this.proceed()
+  }
+
+  disconnectedCallback() {
+    this.scout.disconnect()
+    CSS.highlights.get("transparent")?.delete(this.blind)
+    clearTimeout(this.timer)
+  }
+
+  update() {
+    // Walk through text nodes to find where to place the blind range
+    const walk = document.createTreeWalker(this, NodeFilter.SHOW_TEXT)
+    let node = null
+    let limit = this.limit
+
+    while ((node = walk.nextNode())) {
+      const length = node.data.slice(0, limit).length
+      limit -= length
+      if (limit <= 0) {
+        // Found the node where the reveal cutoff should be
+        this.blind.setStart(node, length)
+        break
+      }
+    }
+
+    if (limit > 0) {
+      // If we've revealed all text, reset blind to start
+      this.blind.setStart(this, 0)
+    }
+
+    // Always set blind to end after all content
+    this.blind.setEndAfter(this)
+  }
+
+  proceed() {
+    if (this.blind.toString().trim() === "") {
+      this.timer = undefined
+      this.dispatchEvent(new CustomEvent("typingComplete"))
+      return
+    }
+
+    this.limit = Math.min(this.limit + 1, this.innerText.length)
+    this.update()
+
+    const remainingText = this.blind.toString()
+    const totalLength = this.innerText.length
+    const speed = this.calculateSpeed(totalLength, remainingText)
+
+    this.timer = setTimeout(() => this.proceed(), 1000 / speed)
+  }
+
+  calculateSpeed(totalLength, remainingText) {
+    const { min, max, curve } = TypeWriter.speedConfig
+    const speedRange = max - min
+    const progress = 1 - remainingText.length / totalLength
+    const baseSpeed = min + speedRange * progress ** curve
+    const nextChar = remainingText[0]
+    return baseSpeed / (TypeWriter.delays[nextChar] ?? 1)
+  }
+
+  setSpeed(multiplier) {
+    const { min, max } = TypeWriter.speedConfig
+    TypeWriter.speedConfig.min = min * multiplier
+    TypeWriter.speedConfig.max = max * multiplier
+  }
+}
+
+customElements.define("type-writer", TypeWriter)
+
+/**
+ * VoiceWriter is a custom element that combines speech recognition with typewriter effects.
+ * It captures audio input, streams it to a WebSocket server for real-time transcription,
+ * and displays the results with a typewriter effect. It handles both interim results
+ * (shown faded) and final transcriptions (shown solid), creating a natural voice-to-text
+ * experience with visual feedback.
+ */
+class VoiceWriter extends HTMLElement {
+  constructor() {
+    super()
+    this.handleTranscript = this.handleTranscript.bind(this)
+    this.processAudio = this.processAudio.bind(this)
+    this.reconnectAttempts = 0
+    this.maxReconnectAttempts = 5
+    this.reconnectDelay = 1000
+
+    // Track both instant rate and moving average
+    this.bytesSent = 0
+    this.uploadRate = 0
+    this.movingAverage = 0
+    this.alpha = 0.2 // Smoothing factor (0.2 gives more weight to recent values)
+    this.uploadRateInterval = null
+    this.isRecording = true
+    this.lastTranscriptTime = Date.now()
+  }
+
+  static get observedAttributes() {
+    return ["language", "server"]
+  }
+
+  get language() {
+    return this.getAttribute("language") ?? "en-US"
+  }
+
+  get server() {
+    return this.getAttribute("server") ?? "wss://swa.sh"
+  }
+
+  get wsUrl() {
+    return `${this.server}/transcribe?language=${this.language}`
+  }
+
+  connectedCallback() {
+    this.writer = document.createElement("type-writer")
+    this.writer.className = "block p-4"
+
+    // Create status bar
+    this.statusBar = document.createElement("div")
+    this.statusBar.className =
+      "flex items-center font-sans py-1 justify-between gap-2 px-1 text-sm bg-gray-100 dark:bg-gray-800/30 border-b border-gray-300 dark:border-gray-700"
+
+    // Create status section
+    this.statusSection = document.createElement("div")
+    this.statusSection.className = "flex items-center gap-2"
+    this.statusBar.appendChild(this.statusSection)
+
+    // Create recording toggle
+    const toggleContainer = document.createElement("div")
+    toggleContainer.className = "flex items-center gap-2"
+
+    const toggle = document.createElement("input")
+    toggle.type = "checkbox"
+    toggle.id = "recordingToggle"
+    toggle.checked = true
+    toggle.className =
+      "form-checkbox h-4 w-4 text-blue-600 transition duration-150 ease-in-out"
+    toggle.addEventListener("change", (e) => {
+      this.isRecording = e.target.checked
+      this.updateStatus(
+        this.isRecording ? "Listening..." : "Paused",
+        this.isRecording
+          ? "text-emerald-600 dark:text-emerald-400"
+          : "text-yellow-600 dark:text-yellow-400"
+      )
+    })
+
+    const label = document.createElement("label")
+    label.htmlFor = "recordingToggle"
+    label.className = "text-sm text-gray-600 dark:text-gray-400"
+    label.textContent = ""
+
+    toggleContainer.appendChild(toggle)
+    toggleContainer.appendChild(label)
+    this.statusBar.prepend(toggleContainer)
+
+    // Add debug button to status bar if debug mode is enabled
+    if (this.hasAttribute("debug")) {
+      const debugButton = document.createElement("button")
+      debugButton.className =
+        "px-2 hover:bg-slate-200 dark:hover:bg-slate-800 transition-colors"
+      debugButton.textContent = "🔌"
+      debugButton.addEventListener("click", () => {
+        console.log("🎤 Debug: Simulating WebSocket disconnect")
+        if (this.ws?.readyState === WebSocket.OPEN) {
+          this.ws.close()
+        }
+      })
+      this.statusBar.appendChild(debugButton)
+    }
+
+    this.updateStatus("Initializing...")
+    this.appendChild(this.statusBar)
+    this.appendChild(this.writer)
+
+    navigator.mediaDevices
+      .getUserMedia({ audio: true })
+      .then((stream) => this.beginListening(stream))
+      .catch(console.error)
+  }
+
+  disconnectedCallback() {
+    clearInterval(this.uploadRateInterval)
+    this.encoder?.close()
+    this.processor?.disconnect()
+    this.source?.disconnect()
+    this.context?.close()
+    this.ws?.close()
+  }
+
+  setWebSocketState(state) {
+    this.setAttribute("data-ws-state", state)
+
+    const statusMessages = {
+      connecting: "Connecting...",
+      connected: "Listening...",
+      disconnected: "Reconnecting...",
+    }
+
+    const statusColors = {
+      connecting: "text-yellow-600 dark:text-yellow-400",
+      connected: "text-emerald-600 dark:text-emerald-400",
+      disconnected: "text-red-600 dark:text-red-400",
+    }
+
+    this.updateStatus(
+      statusMessages[state] || state,
+      statusColors[state] || "text-gray-600 dark:text-gray-400"
+    )
+  }
+
+  updateStatus(message, colorClass = "text-gray-600 dark:text-gray-400") {
+    // Clear existing content
+    this.statusSection.innerHTML = ""
+
+    // Add status message
+    const text = document.createElement("span")
+    text.className = colorClass
+    text.textContent = message
+    this.statusSection.appendChild(text)
+  }
+
+  async connectWebSocket() {
+    try {
+      console.log("🎤 Connecting to transcription service...")
+      this.setWebSocketState("connecting")
+      this.ws = new WebSocket(this.wsUrl)
+      this.ws.binaryType = "arraybuffer"
+
+      this.ws.addEventListener("message", this.handleTranscript)
+      this.ws.addEventListener("open", () => {
+        console.log("🎤 Connected to transcription service")
+        this.setWebSocketState("connected")
+        this.reconnectAttempts = 0
+        this.reconnectDelay = 1000
+      })
+
+      this.ws.addEventListener("close", () => {
+        console.log("🎤 Disconnected from transcription service")
+        this.setWebSocketState("disconnected")
+        this.attemptReconnect()
+      })
+
+      this.ws.addEventListener("error", (error) => {
+        console.error("🎤 WebSocket error:", error)
+        this.setWebSocketState("disconnected")
+      })
+    } catch (error) {
+      console.error("🎤 Failed to connect:", error)
+      this.setWebSocketState("disconnected")
+    }
+  }
+
+  attemptReconnect() {
+    if (this.reconnectAttempts >= this.maxReconnectAttempts) {
+      console.error("🎤 Max reconnection attempts reached, giving up")
+      return
+    }
+
+    const attempt = this.reconnectAttempts + 1
+    const delay = this.reconnectDelay / 1000
+    console.log(
+      `🎤 Attempting to reconnect (attempt ${attempt}/${this.maxReconnectAttempts}) in ${delay}s...`
+    )
+
+    setTimeout(() => {
+      this.reconnectAttempts++
+      this.connectWebSocket()
+      // Exponential backoff
+      this.reconnectDelay = Math.min(this.reconnectDelay * 2, 10000)
+    }, this.reconnectDelay)
+  }
+
+  async beginListening(stream) {
+    await this.connectWebSocket()
+
+    // Set up audio context and nodes
+    this.context = new AudioContext()
+    this.source = this.context.createMediaStreamSource(stream)
+
+    const channels = stream.getAudioTracks()[0].getSettings().channelCount ?? 1
+    this.processor = this.context.createScriptProcessor(16384, channels, 1)
+
+    // Set up encoder
+    this.encoder = new AudioEncoder({
+      output: (packet) => {
+        if (this.ws?.readyState === WebSocket.OPEN) {
+          const buffer = new ArrayBuffer(packet.byteLength)
+          packet.copyTo(buffer)
+          this.ws.send(buffer)
+          this.bytesSent += buffer.byteLength
+        }
+      },
+      error: console.error,
+    })
+
+    // Sample rate and update moving average every second
+    this.uploadRateInterval = setInterval(() => {
+      this.uploadRate = this.bytesSent * 2
+      this.movingAverage =
+        this.alpha * this.uploadRate + (1 - this.alpha) * this.movingAverage
+      this.bytesSent = 0 // reset counter
+      this.updateUploadRate()
+    }, 500)
+
+    // Set up encoder
+    await this.encoder.configure({
+      codec: "opus",
+      sampleRate: 48000,
+      numberOfChannels: 1,
+      opus: {
+        application: "lowdelay",
+        signal: "voice",
+      },
+    })
+
+    // Wire up audio pipeline
+    this.source.connect(this.processor)
+    this.processor.connect(this.context.destination)
+    this.processor.addEventListener("audioprocess", this.processAudio)
+  }
+
+  processAudio(event) {
+    if (this.ws?.readyState !== WebSocket.OPEN || !this.isRecording) return
+
+    const inputData = event.inputBuffer.getChannelData(0)
+    const buffer = new ArrayBuffer(inputData.length * 4)
+    const view = new DataView(buffer)
+
+    for (let i = 0; i < inputData.length; i++) {
+      view.setFloat32(i * 4, inputData[i], true)
+    }
+
+    this.encoder?.encode(
+      new AudioData({
+        data: buffer,
+        timestamp: event.playbackTime * 1000000,
+        format: "f32",
+        numberOfChannels: 1,
+        numberOfFrames: inputData.length,
+        sampleRate: 48000,
+      })
+    )
+  }
+
+  updateUploadRate() {
+    const instantRate = this.uploadRate
+    const avgRate = this.movingAverage
+
+    const formatRate = (rate) => {
+      if (rate > 1024) {
+        return `${(rate / 1024).toFixed(1)} KB/s`
+      } else {
+        return `${rate.toFixed(1)} B/s`
+      }
+    }
+
+    // Update or create upload rate element
+    if (!this.uploadRateElement) {
+      this.uploadRateElement = document.createElement("span")
+      this.uploadRateElement.className =
+        "font-mono text-blue-600 dark:text-blue-400 ml-auto mr-2"
+      const debugButton = this.statusBar.querySelector("button")
+      if (debugButton) {
+        this.statusBar.insertBefore(this.uploadRateElement, debugButton)
+      } else {
+        this.statusBar.appendChild(this.uploadRateElement)
+      }
+    }
+
+    // Show moving average rate
+    this.uploadRateElement.textContent = `${formatRate(avgRate)}`
+  }
+
+  handleTranscript(event) {
+    if (typeof event.data !== "string") return
+
+    try {
+      const result = JSON.parse(event.data)
+      if (
+        result.type !== "Results" ||
+        !result.channel?.alternatives?.[0]?.transcript
+      )
+        return
+
+      let text = result.channel.alternatives[0].transcript
+      if (!text) return
+
+      const currentTime = Date.now()
+      const timeSinceLastTranscript = currentTime - this.lastTranscriptTime
+      console.log(`🎤 Time since last transcript: ${timeSinceLastTranscript}ms`)
+
+      // Add line break if more than 5 seconds have passed
+      if (timeSinceLastTranscript > 5000 && this.writer.lastElementChild) {
+        const lineBreak = document.createElement("br")
+        this.writer.appendChild(lineBreak)
+      }
+
+      this.lastTranscriptTime = currentTime
+
+      let element = this.writer.lastElementChild
+
+      // Use <ins> for interim results that may change
+      if (!element || !element.matches("ins")) {
+        element = document.createElement("ins")
+        this.writer.appendChild(element)
+      }
+
+      element.textContent = text
+
+      if (result.is_final) {
+        // Convert interim <ins> to final <span> when transcription is confirmed
+        const span = document.createElement("span")
+        if (!text.match(/[.!?]$/)) text += "—"
+
+        span.textContent = text + " "
+
+        // Use view transitions API for smooth visual update
+        document.startViewTransition(() => {
+          element.replaceWith(span)
+        })
+      }
+    } catch (error) {
+      console.error("Error parsing transcript:", error)
+    }
+  }
+}
+
+const sheet = new CSSStyleSheet()
+sheet.replaceSync(`
+voice-writer ins {
+    text-decoration: none;
+    opacity: 0.6;
+}
+`)
+
+document.adoptedStyleSheets = [...document.adoptedStyleSheets, sheet]
+
+customElements.define("voice-writer", VoiceWriter)
+
+// Usage:
+// <voice-writer language="en-US" server="wss://swa.sh"></voice-writer>