Skip to content

Instantly share code, notes, and snippets.

@garyachy
Last active June 25, 2025 14:42
Show Gist options
  • Save garyachy/1ef8367784bedb3959d86e7a45db907e to your computer and use it in GitHub Desktop.
Save garyachy/1ef8367784bedb3959d86e7a45db907e to your computer and use it in GitHub Desktop.
Transcriber
class AudioTranscriber
{
public:
AudioTranscriber(
std::string name,
const std::string &model_path,
uint32_t input_sample_rate,
uint32_t output_sample_rate,
uint32_t window_size_seconds,
G711ToPcmTranscoder &transcoder,
int transcription_interval_seconds
):
name_( std::move( name ) ),
model_path_( model_path ),
input_sample_rate_( input_sample_rate ),
output_sample_rate_( output_sample_rate ),
window_size_seconds_( window_size_seconds ),
max_pcm_samples_( output_sample_rate * window_size_seconds ),
transcoder_( transcoder ),
transcription_interval_seconds_( transcription_interval_seconds )
{
whisper_context_params cparams = whisper_context_default_params();
ctx_ = whisper_init_from_file_with_params( model_path_.c_str(), cparams );
if( !ctx_ )
{
throw std::runtime_error(
"Failed to initialize whisper context. Check model path "
"and whisper.cpp setup."
);
}
wparams_ = whisper_full_default_params( WHISPER_SAMPLING_GREEDY );
wparams_.print_progress = false;
wparams_.print_special = false;
wparams_.print_timestamps = false;
wparams_.print_realtime = false;
wparams_.n_threads = 8;
}
~AudioTranscriber()
{
whisper_free( ctx_ );
}
void process_rtp_payload( const u_char *rtp_payload, int rtp_payload_size )
{
auto pcm_chunk = transcoder_.transcode( rtp_payload, rtp_payload_size );
add_pcm_data( pcm_chunk );
add_samples( rtp_payload_size ); // For G.711, 1 byte = 1 sample
if( get_accumulated_samples() >=
get_input_sample_rate() * transcription_interval_seconds_ )
{
transcribe();
reset_accumulated_samples();
}
}
void finalize()
{
add_pcm_data( transcoder_.flush() );
transcribe();
}
void add_pcm_data( const std::vector<float> &pcm_chunk )
{
pcm_data_.insert( pcm_data_.end(), pcm_chunk.begin(), pcm_chunk.end() );
// Keep only the last window_size_seconds_ seconds of audio
if( pcm_data_.size() > max_pcm_samples_ )
{
pcm_data_.erase(
pcm_data_.begin(),
pcm_data_.begin() + ( pcm_data_.size() - max_pcm_samples_ )
);
}
}
void set_input_sample_rate( uint32_t rate )
{
input_sample_rate_ = rate;
}
uint32_t get_input_sample_rate() const
{
return input_sample_rate_;
}
void add_samples( size_t count )
{
accumulated_samples_ += count;
}
size_t get_accumulated_samples() const
{
return accumulated_samples_;
}
void reset_accumulated_samples()
{
accumulated_samples_ = 0;
}
void transcribe() const
{
if( pcm_data_.empty() )
{
return;
}
if( whisper_full( ctx_, wparams_, pcm_data_.data(), pcm_data_.size() ) ==
0 )
{
const int n_segments = whisper_full_n_segments( ctx_ );
std::string full_transcription;
for( int i = 0; i < n_segments; ++i )
{
const char *text = whisper_full_get_segment_text( ctx_, i );
if( text )
{
full_transcription += text;
}
}
}
}
size_t pcm_data_size() const
{
return pcm_data_.size();
}
private:
std::string name_;
std::string model_path_;
std::vector<float> pcm_data_;
whisper_context *ctx_ = nullptr;
uint32_t input_sample_rate_;
uint32_t output_sample_rate_;
size_t accumulated_samples_ = 0;
const size_t max_pcm_samples_;
uint32_t window_size_seconds_;
whisper_full_params wparams_;
G711ToPcmTranscoder &transcoder_;
int transcription_interval_seconds_;
};
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment