josejuan · April 9, 2025 08:55 · Apr 9, 2025
diff --git a/transcript.cpp b/transcript.cpp
@@ -0,0 +1,179 @@
+#include <sys/inotify.h>
+
+#include <chrono>
+#include <csignal>
+#include <cstddef>
+#include <cstdint>
+#include <cstdlib>
+#include <filesystem>
+#include <fstream>
+#include <iostream>
+#include <map>
+#include <regex>
+#include <string>
+#include <system_error>
+#include <thread>
+#include <vector>
+
+#include <inotifytools/inotifytools.h>
+
+#include "whisper.h"
+
+// Shorthand for the standard filesystem namespace.
+namespace fs = std::filesystem;
+
+// Sample rate, in Hz, that load_wav_mono() requires of its input files.
+#define WHISPER_SAMPLE_RATE 16000
+
+// Run flag polled by the main loop: non-zero while the program should keep
+// going. volatile std::sig_atomic_t is the object type a signal handler may
+// portably write to; a plain bool carries no such guarantee.
+volatile std::sig_atomic_t running = 1;
+
+// Signal handler: clears the run flag. The signal number is not inspected.
+void signal_handler(int) {
+    running = 0;
+}
+
+// Polls the size of `path` until two consecutive measurements, taken 500 ms
+// apart, agree.
+//
+// path    - file to observe.
+// retries - maximum number of 500 ms polling intervals to wait.
+//
+// Returns true as soon as the size is unchanged across one interval. Returns
+// false if the size is still changing after `retries` intervals, or if the
+// size cannot be queried (for example because the file no longer exists).
+bool wait_for_file_complete(const fs::path &path, int retries = 10) {
+    using namespace std::chrono_literals;
+
+    // Use the non-throwing overload: a file that disappears while it is being
+    // polled is reported as "not complete" instead of raising an exception.
+    std::error_code ec;
+    auto last_size = fs::file_size(path, ec);
+    if (ec)
+        return false;
+
+    for (int i = 0; i < retries; ++i) {
+        // Always wait one interval before comparing, so that an empty file is
+        // not declared complete without having been measured twice.
+        std::this_thread::sleep_for(500ms);
+
+        const auto cur_size = fs::file_size(path, ec);
+        if (ec)
+            return false;
+        if (cur_size == last_size)
+            return true;
+        last_size = cur_size;
+    }
+
+    return false;
+}
+
+// Reads a WAV file into normalized float samples.
+//
+// The header is read from fixed offsets of a 44-byte layout, and every byte
+// after offset 44 is treated as little-endian 16-bit sample data. The file
+// must be mono, 16 bits per sample, sampled at WHISPER_SAMPLE_RATE.
+//
+// path   - file to read.
+// pcmf32 - receives the samples scaled to [-1, 1); only modified on success.
+//
+// Returns true on success. Returns false if the file cannot be opened or
+// fully read, is shorter than the header, or has an unexpected format (in
+// which case a message is also written to std::cerr).
+bool load_wav_mono(const std::string &path, std::vector<float> &pcmf32) {
+    constexpr std::size_t header_size = 44;
+
+    // Open positioned at the end so that tellg() reports the file size. The
+    // stream closes itself on every return path.
+    std::ifstream in(path, std::ios::binary | std::ios::ate);
+    if (!in) return false;
+
+    const std::streamoff len = in.tellg();
+    // Also rejects -1, which tellg() yields on failure.
+    if (len < static_cast<std::streamoff>(header_size)) return false;
+
+    std::vector<std::uint8_t> buf(static_cast<std::size_t>(len));
+    in.seekg(0, std::ios::beg);
+    // A short read would leave zero bytes behind that decode as silence.
+    if (!in.read(reinterpret_cast<char *>(buf.data()), len)) return false;
+
+    // Little-endian field readers. Bytes are widened to an unsigned 32-bit
+    // type before shifting so that no shift reaches a sign bit.
+    const auto read_u16 = [&buf](std::size_t off) {
+        return static_cast<std::uint32_t>(buf[off]) |
+               (static_cast<std::uint32_t>(buf[off + 1]) << 8);
+    };
+    const auto read_u32 = [&read_u16](std::size_t off) {
+        return read_u16(off) | (read_u16(off + 2) << 16);
+    };
+
+    const std::uint32_t channels = read_u16(22);
+    const std::uint32_t sample_rate = read_u32(24);
+    const std::uint32_t bits_per_sample = read_u16(34);
+
+    if (channels != 1 ||
+        sample_rate != static_cast<std::uint32_t>(WHISPER_SAMPLE_RATE) ||
+        bits_per_sample != 16) {
+        std::cerr << "Expected mono 16-bit PCM 16kHz WAV\n";
+        return false;
+    }
+
+    // A trailing odd byte, if any, is ignored by the integer division.
+    const std::size_t num_samples = (buf.size() - header_size) / 2;
+    pcmf32.resize(num_samples);
+
+    for (std::size_t i = 0; i < num_samples; ++i) {
+        const auto raw = static_cast<std::uint16_t>(read_u16(header_size + 2 * i));
+        // Reinterpret the 16 bits as signed, then scale by 2^15.
+        pcmf32[i] = static_cast<std::int16_t>(raw) / 32768.0f;
+    }
+
+    return true;
+}
+
+// Transcribes one audio file with whisper and types the text via xdotool.
+//
+// The file name must match `<en|es>-<anything>.wav`; any other name is
+// reported on std::cerr and skipped. The file is loaded with load_wav_mono()
+// once wait_for_file_complete() reports a stable size. After a successful
+// `xdotool type`, the file is deleted; on any failure it is left in place.
+//
+// path - audio file to process.
+// ctx  - initialized whisper context used for inference.
+//
+// NOTE(review): the language is always set to "es" and the `en` prefix only
+// switches on params.translate. Presumably the prefix names the desired
+// output language rather than the spoken one; confirm with the author.
+void transcribe_file(const fs::path &path, struct whisper_context *ctx) {
+    // Compiled once instead of on every call.
+    static const std::regex rx(R"(^(en|es)-(.+)\.wav$)");
+
+    const std::string name = path.filename().string();
+    std::smatch m;
+
+    if (!std::regex_match(name, m, rx)) {
+        std::cerr << "Ignoring invalid file: " << name << "\n";
+        return;
+    }
+
+    const std::string lang = m[1];
+
+    if (!wait_for_file_complete(path)) {
+        std::cerr << "File incomplete: " << name << "\n";
+        return;
+    }
+
+    std::vector<float> pcmf32;
+    if (!load_wav_mono(path.string(), pcmf32)) {
+        std::cerr << "Failed to load: " << name << "\n";
+        return;
+    }
+
+    whisper_full_params params = whisper_full_default_params(WHISPER_SAMPLING_GREEDY);
+    params.print_progress = false;
+    params.print_special  = false;
+    params.print_realtime = false;
+    params.print_timestamps = false;
+    params.translate = (lang == "en");
+    params.language = "es";
+
+    // The sample count is passed as int; make the narrowing explicit.
+    if (whisper_full(ctx, params, pcmf32.data(), static_cast<int>(pcmf32.size())) != 0) {
+        std::cerr << "whisper_full() failed\n";
+        return;
+    }
+
+    std::string transcription;
+    const int n_segments = whisper_full_n_segments(ctx);
+    for (int i = 0; i < n_segments; ++i) {
+        transcription += whisper_full_get_segment_text(ctx, i);
+    }
+
+    // The text is placed inside a double-quoted shell word, so backslash-escape
+    // the characters that stay special there: " $ ` and backslash.
+    std::string escaped;
+    escaped.reserve(transcription.size());
+    for (const char c : transcription) {
+        if (c == '"' || c == '$' || c == '`' || c == '\\')
+            escaped += '\\';
+        escaped += c;
+    }
+
+    const std::string cmd = "xdotool type \"" + escaped + "\"";
+    const int ret = std::system(cmd.c_str());
+
+    if (ret != 0) {
+        std::cerr << "xdotool failed\n";
+        return;
+    }
+
+    std::cout << "Typed: " << name << "\n";
+
+    // Non-throwing overload: a failed delete is reported rather than raising
+    // an exception out of this function.
+    std::error_code ec;
+    fs::remove(path, ec);
+    if (ec)
+        std::cerr << "Failed to remove " << name << ": " << ec.message() << "\n";
+}
+
+// Entry point: watches a directory tree and transcribes every `.wav` file
+// that is closed after being written.
+//
+// Usage: <program> <watch_dir>
+//
+// The whisper model is loaded from the fixed path ./models/ggml-large-v3.bin.
+// The loop runs until SIGINT clears `running`.
+//
+// Exit codes: 0 normal exit, 1 bad arguments or directory, 2 model failed to
+// load, 3 inotify could not be set up.
+int main(int argc, char **argv) {
+    if (argc != 2) {
+        std::cerr << "usage: " << argv[0] << " <watch_dir>\n";
+        return 1;
+    }
+
+    const fs::path watch_dir = argv[1];
+    const std::string model_path = "./models/ggml-large-v3.bin";
+
+    if (!fs::exists(watch_dir) || !fs::is_directory(watch_dir)) {
+        std::cerr << "Invalid directory\n";
+        return 1;
+    }
+
+    struct whisper_context *ctx = whisper_init_from_file(model_path.c_str());
+    if (!ctx) {
+        std::cerr << "failed to load model\n";
+        return 2;
+    }
+
+    signal(SIGINT, signal_handler);
+
+    if (!inotifytools_initialize() || !inotifytools_watch_recursively(watch_dir.c_str(), IN_CLOSE_WRITE)) {
+        std::cerr << "inotify init failed\n";
+        // Release the model on this failure path as well.
+        whisper_free(ctx);
+        return 3;
+    }
+
+    std::cout << "Watching " << watch_dir << " ... Ctrl+C to exit\n";
+
+    while (running) {
+        const struct inotify_event *evt = inotifytools_next_event(-1);
+        // No event, or an event that carries no file name.
+        if (!evt || evt->len == 0) continue;
+
+        const fs::path fname = evt->name;
+        if (fname.extension() != ".wav") continue;
+
+        // The watch is recursive, so the event may come from a subdirectory:
+        // resolve the directory that owns this watch descriptor rather than
+        // assuming the file sits directly in watch_dir.
+        const char *dir = inotifytools_filename_from_wd(evt->wd);
+        const fs::path parent = dir ? fs::path(dir) : watch_dir;
+
+        transcribe_file(parent / fname, ctx);
+    }
+
+    whisper_free(ctx);
+    return 0;
+}