Skip to content

Instantly share code, notes, and snippets.

@josejuan
Created April 9, 2025 08:55

Revisions

  1. josejuan created this gist Apr 9, 2025.
    179 changes: 179 additions & 0 deletions transcript.cpp
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,179 @@
    #include "whisper.h"

    #include <chrono>
    #include <csignal>
    #include <cstddef>
    #include <cstdint>
    #include <cstdlib>
    #include <exception>
    #include <filesystem>
    #include <fstream>
    #include <iostream>
    #include <map>
    #include <regex>
    #include <string>
    #include <system_error>
    #include <thread>
    #include <vector>

    #include <inotifytools/inotifytools.h>
    #include <sys/inotify.h>

    // Short alias for the standard filesystem namespace, used throughout this file.
    namespace fs = std::filesystem;

    // Sample rate in Hz that load_wav_mono() requires of its input files.
    #define WHISPER_SAMPLE_RATE 16000

    // Shutdown flag: non-zero while the main loop should keep running.
    // volatile std::sig_atomic_t is the type the C++ standard guarantees may be
    // written from a signal handler; a plain volatile bool has no such guarantee.
    volatile std::sig_atomic_t running = 1;

    /// SIGINT handler: requests shutdown by clearing `running`.
    /// The signal number is ignored.
    void signal_handler(int) {
        running = 0;
    }

    /// Polls the size of `path` until two consecutive readings agree.
    ///
    /// The first reading is compared against 0, so an empty file is reported
    /// complete at once, while a non-empty file needs at least one 500 ms pause.
    ///
    /// @param path     file whose size is polled
    /// @param retries  maximum number of size readings
    /// @return true once the size stopped changing; false if it was still
    ///         changing after `retries` readings or the size could not be read
    ///         (for example because the file no longer exists).
    bool wait_for_file_complete(const fs::path &path, int retries = 10) {
        using namespace std::chrono_literals;
        std::uintmax_t last_size = 0;

        for (int i = 0; i < retries; ++i) {
            // Non-throwing overload: a file that disappears mid-poll must not
            // raise fs::filesystem_error into the caller.
            std::error_code ec;
            const std::uintmax_t cur_size = fs::file_size(path, ec);
            if (ec)
                return false;
            if (cur_size == last_size)
                return true;
            last_size = cur_size;

            // Sleeping after the final reading would only delay the failure.
            if (i + 1 < retries)
                std::this_thread::sleep_for(500ms);
        }

        return false;
    }

    /// Loads a mono, 16-bit, 16 kHz WAV file as float samples.
    ///
    /// The RIFF chunk list is walked to locate "fmt " and "data", so files that
    /// carry extra chunks (for example "LIST") before or after the audio are
    /// decoded correctly instead of assuming the samples start at byte 44.
    ///
    /// @param path    file to read
    /// @param pcmf32  receives one float per sample (int16 value / 32768);
    ///                left untouched when the function returns false
    /// @return true on success; false if the file cannot be read completely,
    ///         is not a RIFF/WAVE file with "fmt " and "data" chunks, or is not
    ///         mono / 16-bit / WHISPER_SAMPLE_RATE.
    bool load_wav_mono(const std::string &path, std::vector<float> &pcmf32) {
        // Open positioned at the end so tellg() yields the file size.
        std::ifstream in(path, std::ios::binary | std::ios::ate);
        if (!in) return false;

        const std::streamoff len = in.tellg();
        if (len < 44) return false;  // also covers tellg() failure (-1)
        in.seekg(0, std::ios::beg);

        std::vector<std::uint8_t> buf(static_cast<std::size_t>(len));
        in.read(reinterpret_cast<char *>(buf.data()), len);
        if (in.gcount() != len) return false;  // short read

        // Little-endian field readers; unsigned arithmetic avoids overflow in << 24.
        const auto u16 = [&buf](std::size_t at) {
            return static_cast<std::uint32_t>(buf[at] | (buf[at + 1] << 8));
        };
        const auto u32 = [&buf](std::size_t at) {
            return static_cast<std::uint32_t>(buf[at]) |
                   (static_cast<std::uint32_t>(buf[at + 1]) << 8) |
                   (static_cast<std::uint32_t>(buf[at + 2]) << 16) |
                   (static_cast<std::uint32_t>(buf[at + 3]) << 24);
        };
        const auto tag_is = [&buf](std::size_t at, const char *tag) {
            for (std::size_t k = 0; k < 4; ++k) {
                if (buf[at + k] != static_cast<std::uint8_t>(tag[k])) return false;
            }
            return true;
        };

        if (!tag_is(0, "RIFF") || !tag_is(8, "WAVE")) return false;

        bool have_fmt = false;
        bool have_data = false;
        std::uint32_t channels = 0;
        std::uint32_t sample_rate = 0;
        std::uint32_t bits_per_sample = 0;
        std::size_t data_offset = 0;
        std::size_t data_size = 0;

        // Each chunk is: 4-byte tag, 4-byte little-endian length, payload.
        std::size_t pos = 12;
        while (pos + 8 <= buf.size()) {
            const std::size_t body = pos + 8;
            const std::size_t remaining = buf.size() - body;
            const std::size_t declared = u32(pos + 4);

            if (tag_is(pos, "data")) {
                data_offset = body;
                // A length of 0 or one that runs past the end of the file is
                // treated as "until end of file", matching the previous
                // fixed-offset reader for headers that were never finalized.
                data_size = (declared == 0 || declared > remaining) ? remaining : declared;
                have_data = true;
                break;
            }

            if (tag_is(pos, "fmt ") && declared >= 16 && declared <= remaining) {
                channels = u16(body + 2);
                sample_rate = u32(body + 4);
                bits_per_sample = u16(body + 14);
                have_fmt = true;
            }

            if (declared > remaining) break;          // truncated chunk
            pos = body + declared + (declared & 1);   // payloads are padded to even length
        }

        if (!have_fmt || !have_data) return false;

        if (channels != 1 || sample_rate != WHISPER_SAMPLE_RATE || bits_per_sample != 16) {
            std::cerr << "Expected mono 16-bit PCM 16kHz WAV\n";
            return false;
        }

        const std::size_t num_samples = data_size / 2;
        pcmf32.resize(num_samples);

        for (std::size_t i = 0; i < num_samples; ++i) {
            const std::size_t at = data_offset + 2 * i;
            const auto s = static_cast<std::int16_t>(buf[at] | (buf[at + 1] << 8));
            pcmf32[i] = s / 32768.0f;
        }

        return true;
    }

    /// Transcribes one recording with whisper and types the text via xdotool.
    ///
    /// Only files named `(en|es)-<something>.wav` are processed; anything else
    /// is reported and skipped. Audio is always decoded with language "es"; an
    /// "en" prefix additionally switches on whisper's translate option.
    /// NOTE(review): presumably "en" means "English output from Spanish
    /// speech" — confirm with the author.
    ///
    /// The file is removed only after xdotool exits with status 0, so a failed
    /// run leaves the recording in place.
    ///
    /// @param path  full path of the WAV file
    /// @param ctx   whisper context used for inference
    void transcribe_file(const fs::path &path, struct whisper_context *ctx) {
        // Built once: constructing a std::regex is costly and the pattern is fixed.
        static const std::regex rx(R"(^(en|es)-(.+)\.wav$)");

        const std::string name = path.filename().string();
        std::smatch m;

        if (!std::regex_match(name, m, rx)) {
            std::cerr << "Ignoring invalid file: " << name << "\n";
            return;
        }

        const std::string lang = m[1];

        if (!wait_for_file_complete(path)) {
            std::cerr << "File incomplete: " << name << "\n";
            return;
        }

        std::vector<float> pcmf32;
        if (!load_wav_mono(path.string(), pcmf32)) {
            std::cerr << "Failed to load: " << name << "\n";
            return;
        }

        whisper_full_params params = whisper_full_default_params(WHISPER_SAMPLING_GREEDY);
        params.print_progress = false;
        params.print_special = false;
        params.print_realtime = false;
        params.print_timestamps = false;
        params.translate = (lang == "en");
        params.language = "es";

        if (whisper_full(ctx, params, pcmf32.data(), static_cast<int>(pcmf32.size())) != 0) {
            std::cerr << "whisper_full() failed\n";
            return;
        }

        std::string transcription;
        const int n_segments = whisper_full_n_segments(ctx);
        for (int i = 0; i < n_segments; ++i) {
            transcription += whisper_full_get_segment_text(ctx, i);
        }

        // Escape the characters that stay special inside a double-quoted
        // shell word so the text cannot break out of the quotes.
        std::string escaped;
        escaped.reserve(transcription.size());
        for (const char c : transcription) {
            if (c == '"' || c == '$' || c == '`' || c == '\\')
                escaped += '\\';
            escaped += c;
        }

        // "--" ends option parsing, so text that begins with '-' is typed
        // rather than being interpreted as an xdotool option.
        const std::string cmd = "xdotool type -- \"" + escaped + "\"";
        const int ret = std::system(cmd.c_str());

        if (ret != 0) {
            std::cerr << "xdotool failed\n";
            return;
        }

        std::cout << "Typed: " << name << "\n";

        // Non-throwing overload: failing to delete must not abort the watcher.
        std::error_code ec;
        fs::remove(path, ec);
        if (ec) {
            std::cerr << "Could not remove " << name << ": " << ec.message() << "\n";
        }
    }

    int main(int argc, char **argv) {
    if (argc != 2) {
    std::cerr << "usage: " << argv[0] << " <watch_dir>\n";
    return 1;
    }

    const fs::path watch_dir = argv[1];
    const std::string model_path = "./models/ggml-large-v3.bin";

    if (!fs::exists(watch_dir) || !fs::is_directory(watch_dir)) {
    std::cerr << "Invalid directory\n";
    return 1;
    }

    struct whisper_context *ctx = whisper_init_from_file(model_path.c_str());
    if (!ctx) {
    std::cerr << "failed to load model\n";
    return 2;
    }

    signal(SIGINT, signal_handler);

    if (!inotifytools_initialize() || !inotifytools_watch_recursively(watch_dir.c_str(), IN_CLOSE_WRITE)) {
    std::cerr << "inotify init failed\n";
    return 3;
    }

    std::cout << "Watching " << watch_dir << " ... Ctrl+C to exit\n";

    while (running) {
    const struct inotify_event *evt = inotifytools_next_event(-1);
    if (!evt) continue;

    fs::path fname = evt->name;
    if (fname.extension() == ".wav") {
    fs::path fullpath = watch_dir / fname;
    transcribe_file(fullpath, ctx);
    }
    }

    whisper_free(ctx);
    return 0;
    }