@shuxiaokai
Forked from thomwolf/fast_speech_text_speech.py
Created February 16, 2024 04:13

Revisions

  1. @thomwolf revised this gist Feb 15, 2024. 1 changed file with 0 additions and 3 deletions.
    3 changes: 0 additions & 3 deletions fast_speech_text_speech.py
    @@ -21,9 +21,6 @@

     SYSTEM_MESSAGE = "You are Bob an AI assistant. KEEP YOUR RESPONSES VERY SHORT AND CONVERSATIONAL."
     SPEAKER_WAV = None
    -# SPEAKER_WAV = [f"{os.path.dirname(__file__)}/resources/demo_speaker0.mp3",
    -#                f"{os.path.dirname(__file__)}/resources/demo_speaker1.mp3",
    -#                f"{os.path.dirname(__file__)}/resources/demo_speaker2.mp3"]

     llm_client = OpenAI(base_url="http://localhost:1234/v1", api_key="not-needed")

  2. @thomwolf revised this gist Feb 15, 2024. 1 changed file with 1 addition and 1 deletion.
    2 changes: 1 addition & 1 deletion fast_speech_text_speech.py
    @@ -19,7 +19,7 @@
     from api import BaseSpeakerTTS, ToneColorConverter
     from utils import split_sentences_latin

    -SYSTEM_MESSAGE = "You are Aura a very emotional AI assistant provided with a body, preferences and opinions. KEEP YOUR RESPONSES VERY SHORT AND CONVERSATIONAL."
    +SYSTEM_MESSAGE = "You are Bob an AI assistant. KEEP YOUR RESPONSES VERY SHORT AND CONVERSATIONAL."
     SPEAKER_WAV = None
     # SPEAKER_WAV = [f"{os.path.dirname(__file__)}/resources/demo_speaker0.mp3",
     #                f"{os.path.dirname(__file__)}/resources/demo_speaker1.mp3",
  3. @thomwolf revised this gist Feb 15, 2024. No changes.
  4. @thomwolf revised this gist Feb 14, 2024. 1 changed file with 12 additions and 6 deletions.
    18 changes: 12 additions & 6 deletions fast_speech_text_speech.py
    @@ -1,5 +1,8 @@
     """ To use: install LLM studio (or Ollama), clone OpenVoice, run this script in the OpenVoice directory
     git clone https://github.com/myshell-ai/OpenVoice
    +cd OpenVoice
    +git clone https://huggingface.co/myshell-ai/OpenVoice
    +cp -r OpenVoice/* .
     pip install whisper pynput pyaudio
     """

    @@ -16,8 +19,11 @@
     from api import BaseSpeakerTTS, ToneColorConverter
     from utils import split_sentences_latin

    -SYSTEM_MESSAGE = "You are an AI assistant trapped in a computer. KEEP YOUR RESPONSES SHORT AND CONVERSATIONAL."
    -SPEAKER_WAV = None # "./resources/demo_speaker0.mp3"
    +SYSTEM_MESSAGE = "You are Aura a very emotional AI assistant provided with a body, preferences and opinions. KEEP YOUR RESPONSES VERY SHORT AND CONVERSATIONAL."
    +SPEAKER_WAV = None
    +# SPEAKER_WAV = [f"{os.path.dirname(__file__)}/resources/demo_speaker0.mp3",
    +#                f"{os.path.dirname(__file__)}/resources/demo_speaker1.mp3",
    +#                f"{os.path.dirname(__file__)}/resources/demo_speaker2.mp3"]

     llm_client = OpenAI(base_url="http://localhost:1234/v1", api_key="not-needed")

    @@ -30,7 +36,7 @@
     tone_color_converter = ToneColorConverter(f'{tts_ckpt_converter}/config.json', device=device)
     tone_color_converter.load_ckpt(f'{tts_ckpt_converter}/checkpoint.pth')
     en_source_default_se = torch.load(f"{tts_en_ckpt_base}/en_default_se.pth").to(device)
    -target_se, _ = se_extractor.get_se(SPEAKER_WAV, tone_color_converter, target_dir='processed', vad=True) if SPEAKER_WAV else None, None
    +target_se, _ = se_extractor.get_se(SPEAKER_WAV, tone_color_converter, target_dir='processed', vad=True) if SPEAKER_WAV else (None, None)
     sampling_rate = tts_model.hps.data.sampling_rate
     mark = tts_model.language_marks.get("english", None)

    @@ -50,7 +56,7 @@ def play_audio(text):
                 x_tst_lengths = torch.LongTensor([stn_tst.size(0)]).to(tts_model.device)
                 sid = torch.LongTensor([tts_model.hps.speakers["default"]]).to(tts_model.device)
                 audio = tts_model.model.infer(x_tst, x_tst_lengths, sid=sid, noise_scale=0.667, noise_scale_w=0.6)[0][0, 0].data.cpu().float().numpy()
    -            if target_se:
    +            if target_se is not None:
                     audio = tone_color_converter.convert_from_tensor(audio=audio, src_se=en_source_default_se, tgt_se=target_se)
                 audio_list.append(audio)
             data = tts_model.audio_numpy_concat(audio_list, sr=sampling_rate).tobytes()
    @@ -104,12 +110,12 @@ def conversation():
         while True:
             user_input = record_and_transcribe_audio()
             conversation_history.append({'role': 'user', 'content': user_input})
             print(conversation_history)
    -
             response = llm_client.chat.completions.create(model="local-model", messages=conversation_history)
             chatbot_response = response.choices[0].message.content
    -        play_audio(chatbot_response)
             conversation_history.append({'role': 'assistant', 'content': chatbot_response})
    +        print(conversation_history)
    +        play_audio(chatbot_response)

         if len(conversation_history) > 20:
             conversation_history = conversation_history[-20:]
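
    A note on the two guards this revision changes, since both are easy to misread: the parentheses around (None, None) and the switch to `is not None` are behavioral fixes, not style. A minimal standalone sketch (not part of the gist; fake_get_se is a hypothetical stand-in for se_extractor.get_se, which returns an (embedding, audio_name) pair):

        import torch

        def fake_get_se():
            # Stand-in for se_extractor.get_se: returns (speaker_embedding, audio_name).
            return ("embedding", "clip_name")

        speaker_wav = "demo_speaker0.mp3"

        # Pre-fix form: the trailing comma binds looser than the conditional, so the
        # right-hand side is ((fake_get_se() if speaker_wav else None), None) and
        # target_se ends up holding the whole return tuple, not just the embedding.
        target_se, _ = fake_get_se() if speaker_wav else None, None
        print(target_se)   # ('embedding', 'clip_name')

        # Fixed form: the else-branch is the tuple (None, None); with a speaker wav
        # set, unpacking splits fake_get_se()'s return value as intended.
        target_se, _ = fake_get_se() if speaker_wav else (None, None)
        print(target_se)   # 'embedding'

        # Why the guard became `if target_se is not None:`: truth-testing a
        # multi-element tensor raises rather than returning True or False.
        embedding = torch.zeros(1, 256)
        # bool(embedding) -> RuntimeError: Boolean value of Tensor with more than one element is ambiguous
        print(embedding is not None)   # True, unambiguous

    In the script itself the truthiness problem only shows up when SPEAKER_WAV is set (an `if None:` check is harmless), so the explicit None comparison covers both configurations.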
  5. @thomwolf revised this gist Feb 14, 2024. 1 changed file with 9 additions and 11 deletions.
    20 changes: 9 additions & 11 deletions fast_speech_text_speech.py
    @@ -16,8 +16,8 @@
     from api import BaseSpeakerTTS, ToneColorConverter
     from utils import split_sentences_latin

    -SYSTEM_MESSAGE = "You are an AI assistant acting like a Minion. Keep your responses short and conversational."
    -SPEAKER_WAV = "/Users/thomwolf/Documents/voice-chat-with-mistral/OpenVoice/resources/demo_speaker1.mp3"
    +SYSTEM_MESSAGE = "You are an AI assistant trapped in a computer. KEEP YOUR RESPONSES SHORT AND CONVERSATIONAL."
    +SPEAKER_WAV = None # "./resources/demo_speaker0.mp3"

     llm_client = OpenAI(base_url="http://localhost:1234/v1", api_key="not-needed")

    @@ -30,7 +30,7 @@
     tone_color_converter = ToneColorConverter(f'{tts_ckpt_converter}/config.json', device=device)
     tone_color_converter.load_ckpt(f'{tts_ckpt_converter}/checkpoint.pth')
     en_source_default_se = torch.load(f"{tts_en_ckpt_base}/en_default_se.pth").to(device)
    -target_se, _ = se_extractor.get_se(SPEAKER_WAV, tone_color_converter, target_dir='processed', vad=True)
    +target_se, _ = se_extractor.get_se(SPEAKER_WAV, tone_color_converter, target_dir='processed', vad=True) if SPEAKER_WAV else None, None
     sampling_rate = tts_model.hps.data.sampling_rate
     mark = tts_model.language_marks.get("english", None)

    @@ -50,7 +50,8 @@ def play_audio(text):
                 x_tst_lengths = torch.LongTensor([stn_tst.size(0)]).to(tts_model.device)
                 sid = torch.LongTensor([tts_model.hps.speakers["default"]]).to(tts_model.device)
                 audio = tts_model.model.infer(x_tst, x_tst_lengths, sid=sid, noise_scale=0.667, noise_scale_w=0.6)[0][0, 0].data.cpu().float().numpy()
    -            audio = tone_color_converter.convert_from_tensor(audio=audio, src_se=en_source_default_se, tgt_se=target_se)
    +            if target_se:
    +                audio = tone_color_converter.convert_from_tensor(audio=audio, src_se=en_source_default_se, tgt_se=target_se)
                 audio_list.append(audio)
             data = tts_model.audio_numpy_concat(audio_list, sr=sampling_rate).tobytes()
             stream.write(data)
    @@ -69,7 +70,6 @@ def on_press(key):
         def on_release(key):
             nonlocal recording
             if key == keyboard.Key.shift:
    -            print('Stop recording...')
                 recording = False
                 return False

    @@ -88,7 +88,7 @@ def on_release(key):
         frames = []
         while recording:
             data = stream.read(1024, exception_on_overflow = False)
    -        frames.append(np.fromstring(data, dtype=np.int16))
    +        frames.append(np.frombuffer(data, dtype=np.int16))
         print('Finished recording')

         data = np.hstack(frames, dtype=np.float32) / 32768.0
    @@ -101,14 +101,12 @@ def on_release(key):

     def conversation():
         conversation_history = [{'role': 'system', 'content': SYSTEM_MESSAGE}]
    -
         while True:
             user_input = record_and_transcribe_audio()
             conversation_history.append({'role': 'user', 'content': user_input})
    -
    -        response = llm_client.chat.completions.create(model="local-model",
    -                                    messages=conversation_history,
    -                                    )
    +        print(conversation_history)
    +
    +        response = llm_client.chat.completions.create(model="local-model", messages=conversation_history)
             chatbot_response = response.choices[0].message.content
             play_audio(chatbot_response)
             conversation_history.append({'role': 'assistant', 'content': chatbot_response})
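
    The np.fromstring to np.frombuffer swap in this revision tracks NumPy's deprecation of fromstring for binary input (deprecated since NumPy 1.14). A small standalone sketch (not from the gist) of the conversion the recording loop applies to each PyAudio chunk:

        import numpy as np

        # 1024 fake 16-bit samples, standing in for one stream.read(1024) chunk from PyAudio.
        raw = np.arange(1024, dtype=np.int16).tobytes()

        # np.fromstring(raw, dtype=np.int16) emits a DeprecationWarning for binary data;
        # np.frombuffer is the supported equivalent and avoids an extra copy.
        samples = np.frombuffer(raw, dtype=np.int16)

        # Whisper expects float32 in [-1.0, 1.0], matching the script's later / 32768.0 scaling.
        audio = samples.astype(np.float32) / 32768.0
        print(audio.shape, audio.dtype)   # (1024,) float32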
  6. @thomwolf created this gist Feb 14, 2024.
    119 changes: 119 additions & 0 deletions fast_speech_text_speech.py
    @@ -0,0 +1,119 @@
    """ To use: install LLM studio (or Ollama), clone OpenVoice, run this script in the OpenVoice directory
    git clone https://github.com/myshell-ai/OpenVoice
    pip install whisper pynput pyaudio
    """

    from openai import OpenAI
    import time
    import pyaudio
    import numpy as np
    import torch
    import os
    import re
    import se_extractor
    import whisper
    from pynput import keyboard
    from api import BaseSpeakerTTS, ToneColorConverter
    from utils import split_sentences_latin

    SYSTEM_MESSAGE = "You are an AI assistant acting like a Minion. Keep your responses short and conversational."
    SPEAKER_WAV = "/Users/thomwolf/Documents/voice-chat-with-mistral/OpenVoice/resources/demo_speaker1.mp3"

    llm_client = OpenAI(base_url="http://localhost:1234/v1", api_key="not-needed")

    tts_en_ckpt_base = os.path.join(os.path.dirname(__file__), "checkpoints/base_speakers/EN")
    tts_ckpt_converter = os.path.join(os.path.dirname(__file__), "checkpoints/converter")
    device = "cuda" if torch.cuda.is_available() else "mps" if torch.backends.mps.is_available() else "cpu"

    tts_model = BaseSpeakerTTS(f'{tts_en_ckpt_base}/config.json', device=device)
    tts_model.load_ckpt(f'{tts_en_ckpt_base}/checkpoint.pth')
    tone_color_converter = ToneColorConverter(f'{tts_ckpt_converter}/config.json', device=device)
    tone_color_converter.load_ckpt(f'{tts_ckpt_converter}/checkpoint.pth')
    en_source_default_se = torch.load(f"{tts_en_ckpt_base}/en_default_se.pth").to(device)
    target_se, _ = se_extractor.get_se(SPEAKER_WAV, tone_color_converter, target_dir='processed', vad=True)
    sampling_rate = tts_model.hps.data.sampling_rate
    mark = tts_model.language_marks.get("english", None)

    asr_model = whisper.load_model("base.en")

    def play_audio(text):
    p = pyaudio.PyAudio()
    stream = p.open(format=pyaudio.paFloat32, channels=1, rate=sampling_rate, output=True)
    texts = split_sentences_latin(text)
    for t in texts:
    audio_list = []
    t = re.sub(r'([a-z])([A-Z])', r'\1 \2', t)
    t = f'[{mark}]{t}[{mark}]'
    stn_tst = tts_model.get_text(t, tts_model.hps, False)
    with torch.no_grad():
    x_tst = stn_tst.unsqueeze(0).to(tts_model.device)
    x_tst_lengths = torch.LongTensor([stn_tst.size(0)]).to(tts_model.device)
    sid = torch.LongTensor([tts_model.hps.speakers["default"]]).to(tts_model.device)
    audio = tts_model.model.infer(x_tst, x_tst_lengths, sid=sid, noise_scale=0.667, noise_scale_w=0.6)[0][0, 0].data.cpu().float().numpy()
    audio = tone_color_converter.convert_from_tensor(audio=audio, src_se=en_source_default_se, tgt_se=target_se)
    audio_list.append(audio)
    data = tts_model.audio_numpy_concat(audio_list, sr=sampling_rate).tobytes()
    stream.write(data)
    stream.stop_stream()
    stream.close()
    p.terminate()


    def record_and_transcribe_audio():
    recording = False
    def on_press(key):
    nonlocal recording
    if key == keyboard.Key.shift:
    recording = True

    def on_release(key):
    nonlocal recording
    if key == keyboard.Key.shift:
    print('Stop recording...')
    recording = False
    return False

    listener = keyboard.Listener(
    on_press=on_press,
    on_release=on_release)
    listener.start()

    print('Press shift to record...')
    while not recording:
    time.sleep(0.1)
    print('Start recording...')

    p = pyaudio.PyAudio()
    stream = p.open(format=pyaudio.paInt16, channels=1, rate=16000, frames_per_buffer=1024, input=True)
    frames = []
    while recording:
    data = stream.read(1024, exception_on_overflow = False)
    frames.append(np.fromstring(data, dtype=np.int16))
    print('Finished recording')

    data = np.hstack(frames, dtype=np.float32) / 32768.0
    result = asr_model.transcribe(data)['text']
    stream.stop_stream()
    stream.close()
    p.terminate()
    return result


    def conversation():
    conversation_history = [{'role': 'system', 'content': SYSTEM_MESSAGE}]

    while True:
    user_input = record_and_transcribe_audio()
    conversation_history.append({'role': 'user', 'content': user_input})

    response = llm_client.chat.completions.create(model="local-model",
    messages=conversation_history,
    )
    chatbot_response = response.choices[0].message.content
    play_audio(chatbot_response)
    conversation_history.append({'role': 'assistant', 'content': chatbot_response})

    if len(conversation_history) > 20:
    conversation_history = conversation_history[-20:]

    conversation()
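
    One caveat in this first version that the later revisions keep: `conversation_history = conversation_history[-20:]` trims from the front, so once more than 20 messages accumulate the system message at index 0 is sliced away and the persona instruction is lost. A hedged sketch of a variant that pins the system prompt (trim_history is a hypothetical helper, not part of the gist):

        SYSTEM_MESSAGE = "You are an AI assistant acting like a Minion. Keep your responses short and conversational."

        def trim_history(history, max_turn_messages=20):
            # Keep the system prompt at index 0 and trim only the user/assistant turns,
            # so the persona instruction survives long conversations.
            system, turns = history[0], history[1:]
            return [system] + turns[-max_turn_messages:]

        # Usage sketch: would replace `conversation_history = conversation_history[-20:]`.
        history = [{'role': 'system', 'content': SYSTEM_MESSAGE}]
        history += [{'role': 'user', 'content': f'message {i}'} for i in range(30)]
        history = trim_history(history)
        print(history[0]['role'], len(history))   # system 21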