Skip to content

Instantly share code, notes, and snippets.

@weihanchen
Created May 9, 2023 22:54
Show Gist options
  • Save weihanchen/471f33e73345cf34d19b5577cd1b1e8a to your computer and use it in GitHub Desktop.
Save weihanchen/471f33e73345cf34d19b5577cd1b1e8a to your computer and use it in GitHub Desktop.
pydub音訊處理: 分離左右聲道並轉換成Numpy Array結構,以進行whisper語音辨識
import io
from typing import BinaryIO, Tuple, Union
import numpy as np
from pydub.utils import get_array_type
def decode_audio(
    input_file: Union[str, BinaryIO],
    sampling_rate: int = 16000,
) -> Tuple[np.ndarray, np.ndarray]:
    """Decode an audio file into separate left and right channel arrays.

    The audio is normalised to 16-bit samples, a two-channel layout and
    ``sampling_rate`` Hz before being converted to NumPy arrays.

    Args:
        input_file: Path to the input file or a file-like object.
        sampling_rate: Resample the audio to this sample rate (Hz).

    Returns:
        A 2-tuple ``(left_channel, right_channel)`` of one-dimensional
        float32 NumPy arrays, scaled by 1/32768 so that 16-bit samples
        fall in the range [-1.0, 1.0).
    """
    # Imported locally: the module's import block only pulls in
    # ``pydub.utils.get_array_type``, so ``AudioSegment`` would otherwise be
    # undefined here and the first call would raise NameError.
    from pydub import AudioSegment

    raw_audio = AudioSegment.from_file(input_file)
    # Force 16-bit samples (2 bytes) so the int16 scaling below is valid.
    raw_audio = raw_audio.set_sample_width(2)
    # Always use a two-channel (stereo) layout so there are two channels
    # to split, whatever the layout of the input was.
    raw_audio = raw_audio.set_channels(2)
    # Resample to the requested rate.
    raw_audio = raw_audio.set_frame_rate(sampling_rate)

    raw_data = raw_audio.raw_data
    # Map the sample width in bits to the matching array type code.
    dtype = get_array_type(raw_audio.sample_width * 8)
    audio = np.frombuffer(raw_data, dtype=dtype)

    # Convert s16 back to f32.
    audio = audio.astype(np.float32) / 32768.0

    # The two channels are read as interleaved samples (L, R, L, R, ...):
    # even indices are taken as the left channel, odd indices as the right.
    left_channel = audio[0::2]
    right_channel = audio[1::2]
    return left_channel, right_channel
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment