Skip to content

Instantly share code, notes, and snippets.

@potato4d
Created November 20, 2024 07:32
Show Gist options
  • Save potato4d/5c9369bd470b59d78fc360df3193d974 to your computer and use it in GitHub Desktop.
Save potato4d/5c9369bd470b59d78fc360df3193d974 to your computer and use it in GitHub Desktop.
Whisper + Resemblyzer を使った音声データの書き起こし

OpenAI で出力したものを手直ししたものなので粗はあるかも

import whisper
from resemblyzer import VoiceEncoder, preprocess_wav
from sklearn.cluster import DBSCAN
from pydub import AudioSegment
import numpy as np
import torch
# Transcribe a recording with Whisper, embed every transcribed segment with
# Resemblyzer, cluster the embeddings with DBSCAN to tell speakers apart, and
# write a speaker-labelled transcript to transcript.txt.

# Sample rate (Hz) used for the decoded waveform. Segment times reported by
# Whisper are converted to sample offsets with this value.
SAMPLE_RATE = 16000

# Run on the GPU when one is available, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Load the Whisper speech-to-text model.
whisper_model = whisper.load_model("large", device=device)

# Load the Resemblyzer speaker encoder.
encoder = VoiceEncoder(device=device)

# Path of the recording to transcribe.
audio_path = "original.mp3"

# Convert the MP3 to WAV.
audio = AudioSegment.from_mp3(audio_path)
wav_path = "temp_audio.wav"
audio.export(wav_path, format="wav")

# Decode the audio once and use the very same samples for transcription and
# for speaker embedding. Resemblyzer's preprocess_wav() shortens long
# silences, which shifts the timeline away from the timestamps Whisper
# reports; slicing its output by those timestamps would embed the wrong audio.
# NOTE(review): unlike preprocess_wav(), this path applies no volume
# normalisation before embedding - confirm that is acceptable for quiet input.
wav = whisper.load_audio(wav_path, sr=SAMPLE_RATE)

# Transcribe with Whisper.
result = whisper_model.transcribe(wav)
segments = result["segments"]

# Extract one speaker embedding per transcribed segment. Segments whose
# sample range is empty (zero length, or beyond the end of the audio) are
# skipped, so remember which segment every embedding belongs to.
speaker_embeddings = []
embedded_indices = []
for index, segment in enumerate(segments):
    first_sample = max(int(segment["start"] * SAMPLE_RATE), 0)
    last_sample = min(int(segment["end"] * SAMPLE_RATE), len(wav))
    segment_wav = wav[first_sample:last_sample]
    if len(segment_wav) == 0:
        continue
    speaker_embeddings.append(encoder.embed_utterance(segment_wav))
    embedded_indices.append(index)

# Cluster the embeddings with DBSCAN. Every segment starts out as noise (-1);
# only segments that produced an embedding can receive a cluster label.
# Clustering is skipped when there is nothing to cluster, because DBSCAN
# cannot be fitted on an empty array.
labels = np.full(len(segments), -1, dtype=int)
if speaker_embeddings:
    embeddings = np.array(speaker_embeddings)
    clustering = DBSCAN(eps=0.5, min_samples=2).fit(embeddings)
    labels[embedded_indices] = clustering.labels_

# Format the transcript: a "@Speaker_<label> m:ss" header line followed by
# the segment text. Segments DBSCAN treated as noise are labelled "Unknown".
output = []
for i, segment in enumerate(segments):
    start = segment["start"]
    text = segment["text"].strip()
    speaker = labels[i] if labels[i] != -1 else "Unknown"
    timestamp = f"{int(start // 60)}:{int(start % 60):02}"
    output.append(f"@Speaker_{speaker} {timestamp}\n{text}")

# Write the result to disk. UTF-8 is requested explicitly so non-ASCII
# transcript text does not depend on the platform's default encoding, and the
# context manager closes the file even if the write fails.
formatted_transcription = "\n".join(output)
with open('transcript.txt', 'w', encoding='utf-8') as f:
    f.write(formatted_transcription)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment