Skip to content

Instantly share code, notes, and snippets.

@Emveez
Created June 27, 2025 16:11
Show Gist options
  • Save Emveez/9af86131400f3577b29c0953060ea2e3 to your computer and use it in GitHub Desktop.
Save Emveez/9af86131400f3577b29c0953060ea2e3 to your computer and use it in GitHub Desktop.
whisper benchmark
from faster_whisper import WhisperModel
import time
import soundfile
import requests
import io
import numpy as np
# Benchmark configuration: inference device and the HuggingFace model id to load.
DEVICE = "cpu"
# Larger checkpoint kept commented out for easy switching:
# MODEL = "KBLab/kb-whisper-large"
MODEL = "KBLab/kb-whisper-tiny"
def resample_audio(audio_data, original_sr, desired_sr):
    """Resample a 1-D audio signal by linear interpolation.

    Args:
        audio_data: 1-D sequence/array of samples.
        original_sr: sample rate of ``audio_data`` in Hz.
        desired_sr: target sample rate in Hz.

    Returns:
        np.ndarray of the signal resampled to ``desired_sr``.
    """
    audio_data = np.asarray(audio_data)
    # Build the time axes from sample indices rather than
    # np.arange(0, duration, 1/sr): a float step can yield one sample too
    # many/few, so the original time axis would not match len(audio_data)
    # and np.interp would raise a length-mismatch ValueError.
    duration = audio_data.shape[0] / original_sr
    original_times = np.arange(audio_data.shape[0]) / original_sr
    n_out = int(round(duration * desired_sr))
    desired_times = np.arange(n_out) / desired_sr
    return np.interp(desired_times, original_times, audio_data)
def inference(a, m):
    """Measure the wall-clock time of one transcription.

    Args:
        a: audio samples to transcribe.
        m: model exposing ``transcribe(audio, beam_size=...)``.

    Returns:
        Elapsed seconds for the full transcription.
    """
    start = time.perf_counter()
    seg_iter, _info = m.transcribe(a, beam_size=5)
    # transcribe() returns a lazy generator; exhausting it forces the actual
    # decoding work so the measurement covers the whole transcription.
    list(seg_iter)
    return time.perf_counter() - start
# Fetch a public-domain 8 kHz speech sample; the browser User-Agent avoids
# the host rejecting non-browser clients.
response = requests.get(
    "https://www.voiptroubleshooter.com/open_speech/american/OSR_us_000_0010_8k.wav",
    headers={
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36"
    },
)
audio, sampling_rate = soundfile.read(io.BytesIO(response.content))
# Whisper expects 16 kHz input.
# BUG FIX: the resampled signal used to be stored in a never-read
# `audio_data` variable, so the benchmark silently ran on the raw 8 kHz
# audio. Assign back to `audio` and update `sampling_rate` to match.
if sampling_rate != 16000:
    audio = resample_audio(
        audio_data=audio, original_sr=sampling_rate, desired_sr=16000
    )
    sampling_rate = 16000
model = WhisperModel(
    MODEL, device=DEVICE, compute_type="float32" if DEVICE == "cpu" else "float16"
)
# Number of repetitions for each benchmark point.
n = 25
res = {}
for j in range(n):
    for i in range(1, 11):
        end = int(audio.shape[0] * (i / 10.0))  # grow the clip in 10% steps
        audio_length_seconds = end // sampling_rate
        res.setdefault(audio_length_seconds, [])
        audio_chunk = audio[:end]
        print(f"running iter {j} and length: {audio_length_seconds}")
        inference_time = inference(audio_chunk, model)
        res[audio_length_seconds].append(inference_time)
for audio_length, times in res.items():
    avg_time = sum(times) / len(times)
    variance = sum((t - avg_time) ** 2 for t in times) / len(times)
    print(
        f"audio length: {audio_length}s, avg transcribe time: {avg_time:.3f}s, variance: {variance:.3f}s²"
    )
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment