Forked from thomwolf/fast_speech_text_speech.py
Created
February 16, 2024 04:13
Revisions
thomwolf revised this gist
Feb 15, 2024. 1 changed file with 0 additions and 3 deletions.

Removed the three commented-out demo-speaker paths, leaving the configuration block as:

SYSTEM_MESSAGE = "You are Bob an AI assistant. KEEP YOUR RESPONSES VERY SHORT AND CONVERSATIONAL."
SPEAKER_WAV = None

llm_client = OpenAI(base_url="http://localhost:1234/v1", api_key="not-needed")
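The llm_client line above is the only LM Studio-specific piece: port 1234 is LM Studio's default local server. Since the docstring also suggests Ollama, here is a sketch of the swap, assuming a recent Ollama serving its OpenAI-compatible API on its default port (the model name is hypothetical; use whatever you have pulled locally):

from openai import OpenAI

# Assumption: Ollama's OpenAI-compatible endpoint on its default port.
llm_client = OpenAI(base_url="http://localhost:11434/v1", api_key="not-needed")

response = llm_client.chat.completions.create(
    model="mistral",  # hypothetical local model
    messages=[{"role": "user", "content": "Say hello in five words."}],
)
print(response.choices[0].message.content)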
thomwolf revised this gist
Feb 15, 2024. 1 changed file with 1 addition and 1 deletion.

Swapped the "Aura" persona for "Bob" in the system prompt:

SYSTEM_MESSAGE = "You are Bob an AI assistant. KEEP YOUR RESPONSES VERY SHORT AND CONVERSATIONAL."
SPEAKER_WAV = None
# SPEAKER_WAV = [f"{os.path.dirname(__file__)}/resources/demo_speaker0.mp3",
#                f"{os.path.dirname(__file__)}/resources/demo_speaker1.mp3",
#                f"{os.path.dirname(__file__)}/resources/demo_speaker2.mp3"]
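To re-enable voice cloning, point SPEAKER_WAV at a short reference clip of the target voice. A minimal sketch with a hypothetical path (any clip of a few seconds should do):

SPEAKER_WAV = "./resources/demo_speaker0.mp3"  # hypothetical reference clip

With this set, the embedding extraction lower in the script runs instead of being skipped, and play_audio applies the tone-color conversion toward that voice.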
thomwolf revised this gist
Feb 15, 2024. No changes.
thomwolf revised this gist
Feb 14, 2024. 1 changed file with 12 additions and 6 deletions.

Expanded the setup instructions, switched the persona to "Aura", and fixed two bugs from the previous revision: the operator-precedence mistake in the target_se assignment and the truthiness test on the speaker embedding.

The docstring now covers fetching the checkpoints from the Hugging Face Hub:

"""
To use: install LLM studio (or Ollama), clone OpenVoice, run this script in the OpenVoice directory
git clone https://github.com/myshell-ai/OpenVoice
cd OpenVoice
git clone https://huggingface.co/myshell-ai/OpenVoice
cp -r OpenVoice/* .
pip install whisper pynput pyaudio
"""

The configuration block becomes:

SYSTEM_MESSAGE = "You are Aura a very emotional AI assistant provided with a body, preferences and opinions. KEEP YOUR RESPONSES VERY SHORT AND CONVERSATIONAL."
SPEAKER_WAV = None
# SPEAKER_WAV = [f"{os.path.dirname(__file__)}/resources/demo_speaker0.mp3",
#                f"{os.path.dirname(__file__)}/resources/demo_speaker1.mp3",
#                f"{os.path.dirname(__file__)}/resources/demo_speaker2.mp3"]

The fallback tuple in the speaker-embedding extraction is now parenthesized, so the conditional expression unpacks correctly:

target_se, _ = se_extractor.get_se(SPEAKER_WAV, tone_color_converter, target_dir='processed', vad=True) if SPEAKER_WAV else (None, None)

and the guard in play_audio compares against None instead of relying on truthiness, which is ambiguous for a multi-element tensor:

if target_se is not None:
    audio = tone_color_converter.convert_from_tensor(audio=audio, src_se=en_source_default_se, tgt_se=target_se)

The conversation loop now logs the history after the assistant reply is appended and keeps the trim to the last 20 messages:

while True:
    user_input = record_and_transcribe_audio()
    conversation_history.append({'role': 'user', 'content': user_input})
    response = llm_client.chat.completions.create(model="local-model", messages=conversation_history)
    chatbot_response = response.choices[0].message.content
    conversation_history.append({'role': 'assistant', 'content': chatbot_response})
    print(conversation_history)
    play_audio(chatbot_response)
    if len(conversation_history) > 20:
        conversation_history = conversation_history[-20:]
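The precedence fix deserves a closer look: a conditional expression binds tighter than the comma, so the old right-hand side built a 2-tuple around the ternary instead of choosing between two 2-tuples. A standalone repro with a hypothetical stub (not the real se_extractor):

def get_se_stub(path):
    # Stands in for se_extractor.get_se, which returns (embedding, audio_name).
    return "embedding", "audio_name"

speaker_wav = "voice.mp3"

# Old form: parses as ((get_se_stub(...) if speaker_wav else None), None)
target_se, _ = get_se_stub(speaker_wav) if speaker_wav else None, None
print(target_se)  # ('embedding', 'audio_name') -- the whole tuple, not the embedding

# Fixed form: the ternary picks one of two 2-tuples, which then unpacks cleanly
target_se, _ = get_se_stub(speaker_wav) if speaker_wav else (None, None)
print(target_se)  # embedding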
thomwolf revised this gist
Feb 14, 2024. 1 changed file with 9 additions and 11 deletions.

Made voice cloning optional and cleaned up the recording path.

New configuration, with the reference clip disabled by default:

SYSTEM_MESSAGE = "You are an AI assistant trapped in a computer. KEEP YOUR RESPONSES SHORT AND CONVERSATIONAL."
SPEAKER_WAV = None # "./resources/demo_speaker0.mp3"

The speaker-embedding extraction only runs when a clip is configured (note the unparenthesized fallback, which the next revision fixes):

target_se, _ = se_extractor.get_se(SPEAKER_WAV, tone_color_converter, target_dir='processed', vad=True) if SPEAKER_WAV else None, None

and play_audio only applies the tone-color conversion when a target embedding exists:

if target_se:
    audio = tone_color_converter.convert_from_tensor(audio=audio, src_se=en_source_default_se, tgt_se=target_se)
audio_list.append(audio)
data = tts_model.audio_numpy_concat(audio_list, sr=sampling_rate).tobytes()
stream.write(data)

In record_and_transcribe_audio, the stray print in on_release is gone:

def on_release(key):
    nonlocal recording
    if key == keyboard.Key.shift:
        recording = False
        return False

and the deprecated np.fromstring call in the read loop is replaced with np.frombuffer:

while recording:
    data = stream.read(1024, exception_on_overflow=False)
    frames.append(np.frombuffer(data, dtype=np.int16))
print('Finished recording')

Finally, the conversation loop prints the history right after the user turn is appended:

while True:
    user_input = record_and_transcribe_audio()
    conversation_history.append({'role': 'user', 'content': user_input})
    print(conversation_history)
    response = llm_client.chat.completions.create(model="local-model", messages=conversation_history)
    chatbot_response = response.choices[0].message.content
    play_audio(chatbot_response)
    conversation_history.append({'role': 'assistant', 'content': chatbot_response})
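The frombuffer change matters beyond style: np.fromstring has been deprecated for binary input since NumPy 1.14, while np.frombuffer gives a zero-copy view over the bytes PyAudio returns. A minimal sketch:

import numpy as np

raw = b"\x00\x00\xff\x7f"  # stand-in for one chunk from stream.read()
samples = np.frombuffer(raw, dtype=np.int16)  # read-only view, no copy
print(samples)  # [    0 32767]

floats = samples.astype(np.float32) / 32768.0  # the same normalization the script applies before Whisper
print(floats)  # values now in [-1.0, 1.0)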
thomwolf created this gist
Feb 14, 2024.

The initial 119-line script: record push-to-talk audio with PyAudio while shift is held, transcribe it with Whisper, send the conversation to a local OpenAI-compatible server, and speak the reply through OpenVoice with tone-color conversion toward a reference voice.

"""
To use: install LLM studio (or Ollama), clone OpenVoice, run this script in the OpenVoice directory
git clone https://github.com/myshell-ai/OpenVoice
pip install whisper pynput pyaudio
"""
from openai import OpenAI
import time
import pyaudio
import numpy as np
import torch
import os
import re
import se_extractor
import whisper
from pynput import keyboard
from api import BaseSpeakerTTS, ToneColorConverter
from utils import split_sentences_latin

SYSTEM_MESSAGE = "You are an AI assistant acting like a Minion. Keep your responses short and conversational."
SPEAKER_WAV = "/Users/thomwolf/Documents/voice-chat-with-mistral/OpenVoice/resources/demo_speaker1.mp3"

llm_client = OpenAI(base_url="http://localhost:1234/v1", api_key="not-needed")

tts_en_ckpt_base = os.path.join(os.path.dirname(__file__), "checkpoints/base_speakers/EN")
tts_ckpt_converter = os.path.join(os.path.dirname(__file__), "checkpoints/converter")
device = "cuda" if torch.cuda.is_available() else "mps" if torch.backends.mps.is_available() else "cpu"
tts_model = BaseSpeakerTTS(f'{tts_en_ckpt_base}/config.json', device=device)
tts_model.load_ckpt(f'{tts_en_ckpt_base}/checkpoint.pth')
tone_color_converter = ToneColorConverter(f'{tts_ckpt_converter}/config.json', device=device)
tone_color_converter.load_ckpt(f'{tts_ckpt_converter}/checkpoint.pth')
en_source_default_se = torch.load(f"{tts_en_ckpt_base}/en_default_se.pth").to(device)
target_se, _ = se_extractor.get_se(SPEAKER_WAV, tone_color_converter, target_dir='processed', vad=True)
sampling_rate = tts_model.hps.data.sampling_rate
mark = tts_model.language_marks.get("english", None)
asr_model = whisper.load_model("base.en")

def play_audio(text):
    p = pyaudio.PyAudio()
    stream = p.open(format=pyaudio.paFloat32, channels=1, rate=sampling_rate, output=True)
    texts = split_sentences_latin(text)
    for t in texts:
        audio_list = []
        t = re.sub(r'([a-z])([A-Z])', r'\1 \2', t)
        t = f'[{mark}]{t}[{mark}]'
        stn_tst = tts_model.get_text(t, tts_model.hps, False)
        with torch.no_grad():
            x_tst = stn_tst.unsqueeze(0).to(tts_model.device)
            x_tst_lengths = torch.LongTensor([stn_tst.size(0)]).to(tts_model.device)
            sid = torch.LongTensor([tts_model.hps.speakers["default"]]).to(tts_model.device)
            audio = tts_model.model.infer(x_tst, x_tst_lengths, sid=sid, noise_scale=0.667, noise_scale_w=0.6)[0][0, 0].data.cpu().float().numpy()
            audio = tone_color_converter.convert_from_tensor(audio=audio, src_se=en_source_default_se, tgt_se=target_se)
        audio_list.append(audio)
        data = tts_model.audio_numpy_concat(audio_list, sr=sampling_rate).tobytes()
        stream.write(data)
    stream.stop_stream()
    stream.close()
    p.terminate()

def record_and_transcribe_audio():
    recording = False

    def on_press(key):
        nonlocal recording
        if key == keyboard.Key.shift:
            recording = True

    def on_release(key):
        nonlocal recording
        if key == keyboard.Key.shift:
            print('Stop recording...')
            recording = False
            return False

    listener = keyboard.Listener(on_press=on_press, on_release=on_release)
    listener.start()
    print('Press shift to record...')
    while not recording:
        time.sleep(0.1)
    print('Start recording...')
    p = pyaudio.PyAudio()
    stream = p.open(format=pyaudio.paInt16, channels=1, rate=16000, frames_per_buffer=1024, input=True)
    frames = []
    while recording:
        data = stream.read(1024, exception_on_overflow=False)
        frames.append(np.fromstring(data, dtype=np.int16))
    print('Finished recording')

    data = np.hstack(frames, dtype=np.float32) / 32768.0
    result = asr_model.transcribe(data)['text']
    stream.stop_stream()
    stream.close()
    p.terminate()
    return result

def conversation():
    conversation_history = [{'role': 'system', 'content': SYSTEM_MESSAGE}]
    while True:
        user_input = record_and_transcribe_audio()
        conversation_history.append({'role': 'user', 'content': user_input})
        response = llm_client.chat.completions.create(model="local-model", messages=conversation_history)
        chatbot_response = response.choices[0].message.content
        play_audio(chatbot_response)
        conversation_history.append({'role': 'assistant', 'content': chatbot_response})
        if len(conversation_history) > 20:
            conversation_history = conversation_history[-20:]

conversation()
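One subtlety in conversation(): the history is trimmed with conversation_history[-20:] after each append, so once the list exceeds 20 entries the system message at index 0 falls off and the persona instructions silently stop being sent. A sketch of a variant that pins the system turn (a hypothetical helper, not part of the gist):

def trim_history(history, max_messages=20):
    # Keep the system message plus the most recent turns.
    system, rest = history[0], history[1:]
    return [system] + rest[-(max_messages - 1):]

# In the loop, instead of slicing in place:
# conversation_history = trim_history(conversation_history)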