Created
June 27, 2025 16:11
-
-
Save Emveez/9af86131400f3577b29c0953060ea2e3 to your computer and use it in GitHub Desktop.
Whisper benchmark
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import io
import statistics
import time

import numpy as np
import requests
import soundfile
from faster_whisper import WhisperModel
DEVICE = "cpu" | |
# MODEL = "KBLab/kb-whisper-large" | |
MODEL = "KBLab/kb-whisper-tiny" | |
def resample_audio(audio_data, original_sr, desired_sr): | |
# Calculate the duration of the audio in seconds | |
duration = len(audio_data) / original_sr | |
# Create an array of time points for the original and desired sample rates | |
original_times = np.arange(0, duration, 1 / original_sr) | |
desired_times = np.arange(0, duration, 1 / desired_sr) | |
# Use NumPy's interpolation function to interpolate the audio data | |
resampled_audio = np.interp(desired_times, original_times, audio_data) | |
return resampled_audio | |
def inference(a, m): | |
t = time.perf_counter() | |
segments, info = m.transcribe(a, beam_size=5) | |
segments = list(segments) | |
total_time = time.perf_counter() - t | |
return total_time | |
response = requests.get( | |
"https://www.voiptroubleshooter.com/open_speech/american/OSR_us_000_0010_8k.wav", | |
headers={ | |
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36" | |
}, | |
) | |
audio, sampling_rate = soundfile.read(io.BytesIO(response.content)) | |
if sampling_rate != 16000: | |
audio_data = resample_audio( | |
audio_data=audio, original_sr=sampling_rate, desired_sr=16000 | |
) | |
model = WhisperModel( | |
MODEL, device=DEVICE, compute_type="float32" if DEVICE == "cpu" else "float16" | |
) | |
# number of sim for each benchmark | |
n = 25 | |
res = {} | |
for j in range(n): | |
for i in range(1, 11): | |
end = int(audio.shape[0] * (i / 10.0)) # increase in 10 % | |
audio_length_seconds = end // sampling_rate | |
if audio_length_seconds not in res.keys(): | |
res[audio_length_seconds] = [] | |
audio_chunk = audio[:end] | |
print(f"running iter {j} and length: {audio_length_seconds}") | |
inference_time = inference(audio_chunk, model) | |
res[audio_length_seconds].append(inference_time) | |
for audio_length, times in res.items(): | |
avg_time = sum(times) / len(times) | |
variance = sum((t - avg_time) ** 2 for t in times) / len(times) | |
print( | |
f"audio length: {audio_length}s, avg transcribe time: {avg_time:.3f}s, variance: {variance:.3f}s²" | |
) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment