Created
December 12, 2022 04:03
-
-
Save Jackiexiao/b2193b2390eeab2366cfe7488884a74d to your computer and use it in GitHub Desktop.
使用ModelScope生成带CMU音素标注的美式英文数据1男1女
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" | |
使用Modelscope开源美式英文TTS生成语音数据,仅供学习分享交流之用,如有侵权,请联系我删除 | |
Use Modelscope's OpenSource TTS to generate English(en-us) speech data, for learning and sharing only, if there is any infringement, please contact me to delete it | |
下载地址 Download link | |
https://pan.baidu.com/s/1qUjBhCVknOTV-xm4VBEuDQ?pwd=uqbd | |
数据示例data example: | |
annie|annie_LJ001-0002_0.wav|IH0 N #1 B IY1 IH0 NG #1 K AH0 M P EH1 R AH0 T IH0 V L IY0 #1 M AA1 D ER0 N #4|in being comparatively modern. | |
约40小时,16khz单声道,29912句,一男一女,美式英语 | |
About 40 hours, 16khz mono, 29912 sentences | |
优点是带了精确的CMU标注,缺点是阿里开源的TTS生成的语音音质不太好,有杂音 | |
The advantage is that it has accurate CMU annotations, but the sound quality of the TTS generated by Ali open source is not very good, with noise | |
来源Source: | |
- text: ljspeech | |
- https://modelscope.cn/models/damo/speech_sambert-hifigan_tts_andy_en-us_16k/summary | |
- https://modelscope.cn/models/damo/speech_sambert-hifigan_tts_annie_en-us_16k/summary | |
Usage: | |
pip install "modelscope[audio]" -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html | |
pip install --find-links https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html ttsfrd | |
CUDA_VISIBLE_DEVICES=0 python local/generate_ali_en_tts_data.py --outdir ${your_dir} --file LJSpeech-1.1-metadata.csv --spk annie | |
CUDA_VISIBLE_DEVICES=0 python local/generate_ali_en_tts_data.py --outdir ${your_dir} --file LJSpeech-1.1-metadata.csv --spk andy | |
""" | |
import re
import json
from pathlib import Path
from scipy.io.wavfile import write
from tqdm import tqdm
from modelscope.outputs import OutputKeys
from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks
# from modelscope.models.audio.tts.sambert_hifi import SambertHifigan
# tts_engine = SambertHifigan(model_dir, am='xx', vocoder = 'xx', lang_type='xx')
def format_ttsfrd_output(res, text):
    """Flatten ttsfrd linguistic output into a space-separated phone string.

    ``res`` is the multi-line string produced by ``gen_tacotron_symbols``:
    one sentence per line, ``<index>\t<token token ...>``.  Prosody markers
    (``#1``..) are kept as-is; Chinese phones get a 0-5 tone digit appended,
    English vowels a 0-2 stress digit.  ``text`` is only used in warning
    messages.
    """
    collected = []
    for row in res.split("\n"):
        if not row:
            continue
        _row_idx, token_field = row.split("\t")
        for raw_token in token_field.split():
            # raw_token example: '{h_c$tone3$s_begin$word_middle$emotion_neutral$F7}'
            fields = raw_token[1:-1].split("$")
            phone = fields[0]
            tone = fields[1].replace("tone", "")
            if phone.startswith("#"):
                # prosody break marker: keep untouched
                pass
            elif "_c" in phone:  # 中文拼音,以便与英文 cmudict 相区分
                if re.search(r"[aeiouvr]", phone):
                    if not re.match(r"[0-5]", tone):
                        print(f"非法中文声调 {text} error: {phone} {tone}")
                    phone += tone
            elif re.search(r"g[a-z]", phone):
                # 单元音的辅音不加声调,例如:
                # 比如: 嗯: 有 ge$tone_5 en_c$tone_5,还有很多类似的 go ga 等
                pass
            elif re.search(r"[a-z]", phone):  # 英文音素
                if re.search(r"[aeiou]", phone):  # 元音加声调
                    if not re.match(r"[0-2]", tone):
                        print(f"非法英文声调 {text} error: {phone} {tone}")
                    phone += tone
            else:
                print(f"{text} 含有未知音素 error: {raw_token}, {phone}, {tone}")
            collected.append(phone)
    return " ".join(collected)
# CMU/ARPAbet phone inventory used below to sanity-check generated phones.
_CMU_CONSONANTS = [
    "B", "CH", "D", "DH", "F", "G", "HH", "JH", "K", "L", "M", "N",
    "NG", "P", "R", "S", "SH", "T", "TH", "V", "W", "Y", "Z", "ZH",
]
_CMU_VOWELS = [
    "AA", "AE", "AH", "AO", "AW", "AY", "EH", "ER", "EY", "IH", "IY",
    "OW", "OY", "UH", "UW",
]
# Every vowel carries a stress digit 0/1/2; a bare "UW" is additionally
# accepted (it occurs in the ttsfrd output without a stress mark).
CMU_PHONEMES = sorted(
    _CMU_CONSONANTS
    + [f"{vowel}{stress}" for vowel in _CMU_VOWELS for stress in "012"]
    + ["UW"]
)
SET_CMU_PHONEMES = set(CMU_PHONEMES)
# ModelScope model ids for the two en-us voices.
spk_model_id = {
    "annie": "damo/speech_sambert-hifigan_tts_annie_en-us_16k",
    "andy": "damo/speech_sambert-hifigan_tts_andy_en-us_16k",
}
def clean_text(text: str):
    """Remove periods that do not mark a pause (titles and abbreviations).

    Applied before punctuation-based sentence splitting so that e.g.
    "Mr. Smith" is not split into two synthesis chunks.
    """
    # (pattern, replacement) pairs, applied in order.
    _NON_PAUSE_PERIODS = (
        ("i.e. ", "IE "),
        ("Mr. ", "Mr "),
        ("Mrs. ", "Mrs "),
        ("Ms. ", "Ms "),
        ("Dr. ", "Dr "),
        ("Prof. ", "Prof "),
        ("St. ", "St "),
        ("Sr. ", "Sr "),
        ("Jr. ", "Jr "),
        ("Maj. ", "Maj "),
        ("Gen. ", "Gen "),
        ("Col. ", "Col "),
        ("Lt. ", "Lt "),
        ("Capt. ", "Capt "),
        ("Hon. ", "Hon "),
        ("Sen. ", "Sen "),
        ("Rep. ", "Rep "),
        ("Gov. ", "Gov "),
        ("U.S.", "US"),
    )
    for pattern, replacement in _NON_PAUSE_PERIODS:
        text = text.replace(pattern, replacement)
    return text
def split_by_punc(text: str, puncs: str = ".!?;"):
    """Split *text* after each major punctuation mark, keeping the mark.

    A trailing punctuation mark yields a final empty string, which the
    caller filters out.
    """
    # Insert a sentinel after every punctuation character, then cut on it.
    marked = "".join(ch + "▁" if ch in puncs else ch for ch in text)
    return marked.split("▁")
def main(args):
    """Synthesize every sub-sentence of the metadata file with the chosen
    ModelScope voice, writing wav files, a ``spk|wav|phones|text`` csv and
    the raw linguistic info as json.

    args.file   -- LJSpeech-style metadata csv (``id|raw text|normalized text``)
    args.outdir -- output directory (``Path``); ``wavs_16k/`` is created inside
    args.spk    -- key into ``spk_model_id`` ("annie" or "andy")
    """
    wav_dir = args.outdir / "wavs_16k"
    wav_dir.mkdir(parents=True, exist_ok=True)
    model_id = spk_model_id[args.spk]
    spk = args.spk
    # Build the full work list up front so tqdm can show overall progress.
    total_lines = []
    with open(args.file, "r", encoding="utf-8") as f:
        for line in f:
            # BUG FIX: the original unpacked `wav_id, ori_text, ori_text`,
            # silently using the third (normalized) column. Keep that
            # behavior but name the fields honestly.
            wav_id, _raw_text, ori_text = line.strip().split("|")
            # 按大标点符号分割,因为这里的 tts_engine 拼接句子时中间没有加入停顿
            # (split on major punctuation: the engine inserts no pause when
            # synthesizing one long concatenated sentence)
            sub_texts = split_by_punc(clean_text(ori_text))
            sub_texts = [x.strip() for x in sub_texts if x.strip()]
            for j, sub_text in enumerate(sub_texts):
                wav_path = str(wav_dir / f"{spk}_{wav_id}_{j}.wav")
                total_lines.append([wav_path, sub_text])
    tts_engine = pipeline(task=Tasks.text_to_speech, model=model_id)
    # Reach into the pipeline's private frontend to get the ttsfrd
    # linguistic symbols alongside the audio.
    frontend = tts_engine.model._SambertHifigan__frontend
    ling_info = {}
    data = []
    with open(args.outdir / f"{spk}_ali_en_us_16k.csv", "w", encoding="utf8") as wf:
        for wav_path, text in tqdm(total_lines):
            result = tts_engine(input=text)
            wav = result[OutputKeys.OUTPUT_PCM]
            write(wav_path, 16000, wav)
            linguistic_info = frontend.gen_tacotron_symbols(text)
            ling_info[wav_path] = {"info": linguistic_info, "text": text}
            phonemes = format_ttsfrd_output(linguistic_info, text).upper()
            item = [spk, wav_path, phonemes, text]
            data.append(item)
            wf.write("|".join(item) + "\n")
    with open(args.outdir / f"{spk}_ling_info.json", "w", encoding="utf-8") as wf:
        json.dump(ling_info, wf, indent=4, ensure_ascii=False)
    # Sanity check: every emitted phone must be a CMU phone or a "#" prosody
    # marker. BUG FIX: the original unpacked six values from the
    # four-element item, which raised ValueError before any check ran.
    for _spk, _wav_path, phonemes, text in data:
        for phone in phonemes.split():
            if phone.startswith("#"):
                continue
            if phone not in SET_CMU_PHONEMES:
                print(f"{phone} not in CMU_PHONEMES: {text=}")
                break
if __name__ == "__main__": | |
import argparse | |
parser = argparse.ArgumentParser() | |
parser.add_argument("--file", default=None) | |
parser.add_argument("--outdir", default="./temp", type=Path) | |
parser.add_argument("--spk", default="annie", type=str) | |
args = parser.parse_args() | |
main(args) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
阿里开源的前端也带有 TN、G2P、Prosody 预测功能,也可以用于生成 TTS前端数据