Created
May 9, 2023 22:54
-
-
Save weihanchen/471f33e73345cf34d19b5577cd1b1e8a to your computer and use it in GitHub Desktop.
pydub音訊處理: 分離左右聲道並轉換成Numpy Array結構,以進行whisper語音辨識
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import io
from typing import BinaryIO, Tuple, Union

import numpy as np
from pydub import AudioSegment
from pydub.utils import get_array_type


def decode_audio(
    input_file: Union[str, BinaryIO],
    sampling_rate: int = 16000,
) -> Tuple[np.ndarray, np.ndarray]:
    """Decode an audio file into separate left and right channel arrays.

    The audio is converted to 16-bit samples, forced to a two-channel
    layout, and resampled to ``sampling_rate`` before the channels are
    split apart.

    Args:
        input_file: Path to the input file or a file-like object.
        sampling_rate: Resample the audio to this sample rate.

    Returns:
        A 2-tuple ``(left_channel, right_channel)`` of float32 NumPy
        arrays, each holding the 16-bit samples divided by 32768.0.
    """
    raw_audio = AudioSegment.from_file(input_file)
    # 16-bit (2 bytes), so the 32768.0 divisor below matches the sample range.
    raw_audio = raw_audio.set_sample_width(2)
    # Always convert to a two-channel layout so both channels can be sliced.
    raw_audio = raw_audio.set_channels(2)
    # Resampling.
    raw_audio = raw_audio.set_frame_rate(sampling_rate)

    raw_data = raw_audio.raw_data
    dtype = get_array_type(raw_audio.sample_width * 8)
    audio = np.frombuffer(raw_data, dtype=dtype)

    # Convert s16 back to f32.
    audio = audio.astype(np.float32) / 32768.0

    # The two-channel buffer is treated as interleaved samples:
    # even indices go to the left channel, odd indices to the right.
    left_channel = audio[0::2]
    right_channel = audio[1::2]
    return left_channel, right_channel
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment