Created
April 9, 2025 08:55
Revisions
-
josejuan created this gist
Apr 9, 2025. There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -0,0 +1,179 @@ #include "whisper.h" #include <iostream> #include <vector> #include <fstream> #include <string> #include <filesystem> #include <thread> #include <chrono> #include <regex> #include <map> #include <csignal> #include <inotifytools/inotifytools.h> #include <sys/inotify.h> namespace fs = std::filesystem; #define WHISPER_SAMPLE_RATE 16000 volatile bool running = true; void signal_handler(int) { running = false; } bool wait_for_file_complete(const fs::path &path, int retries = 10) { using namespace std::chrono_literals; uintmax_t last_size = 0; for (int i = 0; i < retries; ++i) { auto cur_size = fs::file_size(path); if (cur_size == last_size) return true; last_size = cur_size; std::this_thread::sleep_for(500ms); } return false; } bool load_wav_mono(const std::string &path, std::vector<float> &pcmf32) { FILE *fp = fopen(path.c_str(), "rb"); if (!fp) return false; fseek(fp, 0, SEEK_END); long len = ftell(fp); fseek(fp, 0, SEEK_SET); std::vector<uint8_t> buf(len); fread(buf.data(), 1, len, fp); fclose(fp); if (buf.size() < 44) return false; int channels = buf[22] | (buf[23] << 8); int sample_rate = buf[24] | (buf[25] << 8) | (buf[26] << 16) | (buf[27] << 24); int bits_per_sample = buf[34] | (buf[35] << 8); if (channels != 1 || sample_rate != WHISPER_SAMPLE_RATE || bits_per_sample != 16) { std::cerr << "Expected mono 16-bit PCM 16kHz WAV\n"; return false; } int data_offset = 44; int num_samples = (buf.size() - data_offset) / 2; pcmf32.resize(num_samples); for (int i = 0; i < num_samples; ++i) { int16_t s = buf[data_offset + 2*i] | (buf[data_offset + 2*i + 1] << 8); pcmf32[i] = s / 32768.0f; } return true; } void transcribe_file(const fs::path 
&path, struct whisper_context *ctx) { std::string name = path.filename(); std::smatch m; std::regex rx(R"(^(en|es)-(.+)\.wav$)"); if (!std::regex_match(name, m, rx)) { std::cerr << "Ignoring invalid file: " << name << "\n"; return; } std::string lang = m[1]; std::string base = m[2]; std::vector<float> pcmf32; if (!wait_for_file_complete(path)) { std::cerr << "File incomplete: " << name << "\n"; return; } if (!load_wav_mono(path, pcmf32)) { std::cerr << "Failed to load: " << name << "\n"; return; } whisper_full_params params = whisper_full_default_params(WHISPER_SAMPLING_GREEDY); params.print_progress = false; params.print_special = false; params.print_realtime = false; params.print_timestamps = false; params.translate = (lang == "en"); params.language = "es"; if (whisper_full(ctx, params, pcmf32.data(), pcmf32.size()) != 0) { std::cerr << "whisper_full() failed\n"; return; } std::string transcription; int n_segments = whisper_full_n_segments(ctx); for (int i = 0; i < n_segments; ++i) { transcription += whisper_full_get_segment_text(ctx, i); } std::string escaped; for (char c : transcription) { if (c == '"' || c == '$' || c == '`' || c == '\\') escaped += '\\'; escaped += c; } std::string cmd = "xdotool type \"" + escaped + "\""; int ret = std::system(cmd.c_str()); if (ret != 0) { std::cerr << "xdotool failed\n"; } else { std::cout << "Typed: " << name << "\n"; fs::remove(path); } } int main(int argc, char **argv) { if (argc != 2) { std::cerr << "usage: " << argv[0] << " <watch_dir>\n"; return 1; } const fs::path watch_dir = argv[1]; const std::string model_path = "./models/ggml-large-v3.bin"; if (!fs::exists(watch_dir) || !fs::is_directory(watch_dir)) { std::cerr << "Invalid directory\n"; return 1; } struct whisper_context *ctx = whisper_init_from_file(model_path.c_str()); if (!ctx) { std::cerr << "failed to load model\n"; return 2; } signal(SIGINT, signal_handler); if (!inotifytools_initialize() || !inotifytools_watch_recursively(watch_dir.c_str(), 
IN_CLOSE_WRITE)) { std::cerr << "inotify init failed\n"; return 3; } std::cout << "Watching " << watch_dir << " ... Ctrl+C to exit\n"; while (running) { const struct inotify_event *evt = inotifytools_next_event(-1); if (!evt) continue; fs::path fname = evt->name; if (fname.extension() == ".wav") { fs::path fullpath = watch_dir / fname; transcribe_file(fullpath, ctx); } } whisper_free(ctx); return 0; }