OpenAI で出力したものを手直ししたものなので、粗はあるかもしれません
Created
November 20, 2024 07:32
-
-
Save potato4d/5c9369bd470b59d78fc360df3193d974 to your computer and use it in GitHub Desktop.
Whisper + Resemblyzer を使った音声データの書き起こし
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import whisper | |
from resemblyzer import VoiceEncoder, preprocess_wav | |
from sklearn.cluster import DBSCAN | |
from pydub import AudioSegment | |
import numpy as np | |
import torch | |
# Input MP3, the intermediate WAV that both Whisper and Resemblyzer read, and
# the text file the labelled transcript is written to.
AUDIO_PATH = "original.mp3"
WAV_PATH = "temp_audio.wav"
TRANSCRIPT_PATH = "transcript.txt"

# Samples per second used to turn Whisper's timestamps into waveform indices.
# NOTE(review): 16000 is carried over from the original script; presumably it
# is the rate preprocess_wav resamples to -- confirm against Resemblyzer.
SAMPLE_RATE = 16000

# Label used for segments that belong to no speaker cluster (DBSCAN noise, or
# segments that had no audio to embed); rendered as "Unknown" in the output.
NOISE_LABEL = -1


def format_timestamp(seconds: float) -> str:
    """Return a non-negative offset in seconds as ``M:SS``.

    Minutes are not wrapped into hours, so 3605 seconds becomes ``60:05``.
    """
    minutes, secs = divmod(int(seconds), 60)
    return f"{minutes}:{secs:02}"


def embed_segments(encoder, wav, segments):
    """Compute one speaker embedding per segment that contains audio.

    Args:
        encoder: Object exposing ``embed_utterance(waveform)``.
        wav: Waveform sampled at ``SAMPLE_RATE``, indexable by sample.
        segments: Whisper segments; each has ``"start"`` and ``"end"`` in
            seconds.

    Returns:
        A pair ``(embeddings, indices)`` where ``indices[k]`` is the position
        in ``segments`` that ``embeddings[k]`` was computed from.
    """
    embeddings = []
    indices = []
    for index, segment in enumerate(segments):
        first = int(segment["start"] * SAMPLE_RATE)
        last = int(segment["end"] * SAMPLE_RATE)
        clip = wav[first:last]
        # A zero-length slice (start == end, or a segment past the end of the
        # waveform) has no audio to embed; leave that segment unlabelled.
        if len(clip) == 0:
            continue
        embeddings.append(encoder.embed_utterance(clip))
        indices.append(index)
    return embeddings, indices


def cluster_speakers(embeddings, indices, n_segments, eps=0.5, min_samples=2):
    """Cluster embeddings with DBSCAN and return one label per segment.

    Args:
        embeddings: Speaker embeddings, as returned by ``embed_segments``.
        indices: Segment index each embedding belongs to.
        n_segments: Total number of segments, including those not embedded.
        eps: DBSCAN neighbourhood radius.
        min_samples: DBSCAN minimum neighbourhood size.

    Returns:
        Integer array of length ``n_segments``. Segments without an embedding
        and points DBSCAN marks as noise are both ``NOISE_LABEL``.
    """
    labels = np.full(n_segments, NOISE_LABEL, dtype=int)
    # DBSCAN cannot be fitted on an empty array, so skip it when there is
    # nothing to cluster and keep every segment unlabelled.
    if embeddings:
        clustering = DBSCAN(eps=eps, min_samples=min_samples).fit(
            np.array(embeddings)
        )
        labels[indices] = clustering.labels_
    return labels


def format_transcript(segments, labels) -> str:
    """Render the segments as speaker-tagged text.

    Each segment becomes ``@Speaker_<label> <M:SS>`` followed by its stripped
    text on the next line; entries are joined with newlines.

    Raises:
        ValueError: If ``segments`` and ``labels`` differ in length.
    """
    if len(segments) != len(labels):
        raise ValueError("segments and labels must have the same length")
    entries = []
    for segment, label in zip(segments, labels):
        speaker = "Unknown" if label == NOISE_LABEL else label
        stamp = format_timestamp(segment["start"])
        text = segment["text"].strip()
        entries.append(f"@Speaker_{speaker} {stamp}\n{text}")
    return "\n".join(entries)


def main() -> None:
    """Transcribe ``AUDIO_PATH`` with speaker labels into ``TRANSCRIPT_PATH``."""
    # Use the GPU when one is available.
    device = "cuda" if torch.cuda.is_available() else "cpu"

    # Load the Whisper model and the Resemblyzer voice encoder.
    whisper_model = whisper.load_model("large", device=device)
    encoder = VoiceEncoder(device=device)

    # Convert the MP3 to WAV. export() hands back the output file object;
    # close it so the handle is not left open.
    audio = AudioSegment.from_mp3(AUDIO_PATH)
    audio.export(WAV_PATH, format="wav").close()

    # Transcribe with Whisper.
    result = whisper_model.transcribe(WAV_PATH)
    segments = result["segments"]

    # Extract a speaker embedding for each transcribed segment.
    # NOTE(review): if preprocess_wav changes the waveform's timeline (e.g. by
    # trimming silence), Whisper's timestamps no longer index the same audio
    # -- confirm against Resemblyzer's preprocessing.
    wav = preprocess_wav(WAV_PATH)
    embeddings, indices = embed_segments(encoder, wav, segments)

    # Group the embeddings into speakers.
    labels = cluster_speakers(embeddings, indices, len(segments))

    # Write the result. The encoding is explicit because the transcript may
    # contain non-ASCII text and the platform default is not always UTF-8.
    with open(TRANSCRIPT_PATH, "w", encoding="utf-8") as f:
        f.write(format_transcript(segments, labels))


if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment