Created
June 27, 2025 16:11
-
-
Save Emveez/9af86131400f3577b29c0953060ea2e3 to your computer and use it in GitHub Desktop.
Whisper benchmark
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import io
import statistics
import time

import numpy as np
import requests
import soundfile
from faster_whisper import WhisperModel
DEVICE = "cpu" | |
# MODEL = "KBLab/kb-whisper-large" | |
MODEL = "KBLab/kb-whisper-tiny" | |
def resample_audio(audio_data, original_sr, desired_sr): | |
# Calculate the duration of the audio in seconds | |
duration = len(audio_data) / original_sr | |
# Create an array of time points for the original and desired sample rates | |
original_times = np.arange(0, duration, 1 / original_sr) | |
desired_times = np.arange(0, duration, 1 / desired_sr) | |
# Use NumPy's interpolation function to interpolate the audio data | |
resampled_audio = np.interp(desired_times, original_times, audio_data) | |
return resampled_audio | |
def inference(a, m): | |
t = time.perf_counter() | |
segments, info = m.transcribe(a, beam_size=5) | |
segments = list(segments) | |
total_time = time.perf_counter() - t | |
return total_time | |
response = requests.get( | |
"https://www.voiptroubleshooter.com/open_speech/american/OSR_us_000_0010_8k.wav", | |
headers={ | |
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36" | |
}, | |
) | |
audio, sampling_rate = soundfile.read(io.BytesIO(response.content)) | |
if sampling_rate != 16000: | |
audio_data = resample_audio( | |
audio_data=audio, original_sr=sampling_rate, desired_sr=16000 | |
) | |
model = WhisperModel( | |
MODEL, device=DEVICE, compute_type="float32" if DEVICE == "cpu" else "float16" | |
) | |
# number of sim for each benchmark | |
n = 25 | |
res = {} | |
for j in range(n): | |
for i in range(1, 11): | |
end = int(audio.shape[0] * (i / 10.0)) # increase in 10 % | |
audio_length_seconds = end // sampling_rate | |
if audio_length_seconds not in res.keys(): | |
res[audio_length_seconds] = [] | |
audio_chunk = audio[:end] | |
print(f"running iter {j} and length: {audio_length_seconds}") | |
inference_time = inference(audio_chunk, model) | |
res[audio_length_seconds].append(inference_time) | |
for audio_length, times in res.items(): | |
avg_time = sum(times) / len(times) | |
variance = sum((t - avg_time) ** 2 for t in times) / len(times) | |
print( | |
f"audio length: {audio_length}s, avg transcribe time: {avg_time:.3f}s, variance: {variance:.3f}s²" | |
) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment