@shuxiaokai
Forked from thomwolf/fast_speech_text_speech.py
Created February 16, 2024 04:13

Revisions

  1. @thomwolf revised this gist Feb 15, 2024. 1 changed file with 0 additions and 3 deletions.
    3 changes: 0 additions & 3 deletions fast_speech_text_speech.py
    @@ -21,9 +21,6 @@

     SYSTEM_MESSAGE = "You are Bob an AI assistant. KEEP YOUR RESPONSES VERY SHORT AND CONVERSATIONAL."
     SPEAKER_WAV = None
    -# SPEAKER_WAV = [f"{os.path.dirname(__file__)}/resources/demo_speaker0.mp3",
    -#                f"{os.path.dirname(__file__)}/resources/demo_speaker1.mp3",
    -#                f"{os.path.dirname(__file__)}/resources/demo_speaker2.mp3"]

     llm_client = OpenAI(base_url="http://localhost:1234/v1", api_key="not-needed")

  2. @thomwolf revised this gist Feb 15, 2024. 1 changed file with 1 addition and 1 deletion.
    2 changes: 1 addition & 1 deletion fast_speech_text_speech.py
    @@ -19,7 +19,7 @@
     from api import BaseSpeakerTTS, ToneColorConverter
     from utils import split_sentences_latin

    -SYSTEM_MESSAGE = "You are Aura a very emotional AI assistant provided with a body, preferences and opinions. KEEP YOUR RESPONSES VERY SHORT AND CONVERSATIONAL."
    +SYSTEM_MESSAGE = "You are Bob an AI assistant. KEEP YOUR RESPONSES VERY SHORT AND CONVERSATIONAL."
     SPEAKER_WAV = None
     # SPEAKER_WAV = [f"{os.path.dirname(__file__)}/resources/demo_speaker0.mp3",
     #                f"{os.path.dirname(__file__)}/resources/demo_speaker1.mp3",
  3. @thomwolf revised this gist Feb 15, 2024. No changes.
  4. @thomwolf revised this gist Feb 14, 2024. 1 changed file with 12 additions and 6 deletions.
    18 changes: 12 additions & 6 deletions fast_speech_text_speech.py
    @@ -1,5 +1,8 @@
     """ To use: install LLM studio (or Ollama), clone OpenVoice, run this script in the OpenVoice directory
     git clone https://github.com/myshell-ai/OpenVoice
    +cd OpenVoice
    +git clone https://huggingface.co/myshell-ai/OpenVoice
    +cp -r OpenVoice/* .
     pip install whisper pynput pyaudio
     """

    @@ -16,8 +19,11 @@
     from api import BaseSpeakerTTS, ToneColorConverter
     from utils import split_sentences_latin

    -SYSTEM_MESSAGE = "You are an AI assistant trapped in a computer. KEEP YOUR RESPONSES SHORT AND CONVERSATIONAL."
    -SPEAKER_WAV = None # "./resources/demo_speaker0.mp3"
    +SYSTEM_MESSAGE = "You are Aura a very emotional AI assistant provided with a body, preferences and opinions. KEEP YOUR RESPONSES VERY SHORT AND CONVERSATIONAL."
    +SPEAKER_WAV = None
    +# SPEAKER_WAV = [f"{os.path.dirname(__file__)}/resources/demo_speaker0.mp3",
    +#                f"{os.path.dirname(__file__)}/resources/demo_speaker1.mp3",
    +#                f"{os.path.dirname(__file__)}/resources/demo_speaker2.mp3"]

     llm_client = OpenAI(base_url="http://localhost:1234/v1", api_key="not-needed")

    @@ -30,7 +36,7 @@
     tone_color_converter = ToneColorConverter(f'{tts_ckpt_converter}/config.json', device=device)
     tone_color_converter.load_ckpt(f'{tts_ckpt_converter}/checkpoint.pth')
     en_source_default_se = torch.load(f"{tts_en_ckpt_base}/en_default_se.pth").to(device)
    -target_se, _ = se_extractor.get_se(SPEAKER_WAV, tone_color_converter, target_dir='processed', vad=True) if SPEAKER_WAV else None, None
    +target_se, _ = se_extractor.get_se(SPEAKER_WAV, tone_color_converter, target_dir='processed', vad=True) if SPEAKER_WAV else (None, None)
     sampling_rate = tts_model.hps.data.sampling_rate
     mark = tts_model.language_marks.get("english", None)

    @@ -50,7 +56,7 @@ def play_audio(text):
                 x_tst_lengths = torch.LongTensor([stn_tst.size(0)]).to(tts_model.device)
                 sid = torch.LongTensor([tts_model.hps.speakers["default"]]).to(tts_model.device)
                 audio = tts_model.model.infer(x_tst, x_tst_lengths, sid=sid, noise_scale=0.667, noise_scale_w=0.6)[0][0, 0].data.cpu().float().numpy()
    -            if target_se:
    +            if target_se is not None:
                     audio = tone_color_converter.convert_from_tensor(audio=audio, src_se=en_source_default_se, tgt_se=target_se)
                 audio_list.append(audio)
             data = tts_model.audio_numpy_concat(audio_list, sr=sampling_rate).tobytes()
    @@ -104,12 +110,12 @@ def conversation():
         while True:
             user_input = record_and_transcribe_audio()
             conversation_history.append({'role': 'user', 'content': user_input})
             print(conversation_history)
    -
             response = llm_client.chat.completions.create(model="local-model", messages=conversation_history)
             chatbot_response = response.choices[0].message.content
    -        play_audio(chatbot_response)
             conversation_history.append({'role': 'assistant', 'content': chatbot_response})
    +        print(conversation_history)
    +        play_audio(chatbot_response)

         if len(conversation_history) > 20:
             conversation_history = conversation_history[-20:]
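
    A note on the two guards this revision changes, since both are easy to misread: the parentheses around (None, None) and the switch to `is not None` are behavioral fixes, not style. A minimal standalone sketch (not part of the gist; fake_get_se is a hypothetical stand-in for se_extractor.get_se, which returns an (embedding, audio_name) pair):

        import torch

        def fake_get_se():
            # Stand-in for se_extractor.get_se: returns (speaker_embedding, audio_name).
            return ("embedding", "clip_name")

        speaker_wav = "demo_speaker0.mp3"

        # Pre-fix form: the trailing comma binds looser than the conditional, so the
        # right-hand side is ((fake_get_se() if speaker_wav else None), None) and
        # target_se ends up holding the whole return tuple, not just the embedding.
        target_se, _ = fake_get_se() if speaker_wav else None, None
        print(target_se)   # ('embedding', 'clip_name')

        # Fixed form: the else-branch is the tuple (None, None); with a speaker wav
        # set, unpacking splits fake_get_se()'s return value as intended.
        target_se, _ = fake_get_se() if speaker_wav else (None, None)
        print(target_se)   # 'embedding'

        # Why the guard became `if target_se is not None:`: truth-testing a
        # multi-element tensor raises rather than returning True or False.
        embedding = torch.zeros(1, 256)
        # bool(embedding) -> RuntimeError: Boolean value of Tensor with more than one element is ambiguous
        print(embedding is not None)   # True, unambiguous

    In the script itself the truthiness problem only shows up when SPEAKER_WAV is set (an `if None:` check is harmless), so the explicit None comparison covers both configurations.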
  5. @thomwolf revised this gist Feb 14, 2024. 1 changed file with 9 additions and 11 deletions.
    20 changes: 9 additions & 11 deletions fast_speech_text_speech.py
    @@ -16,8 +16,8 @@
     from api import BaseSpeakerTTS, ToneColorConverter
     from utils import split_sentences_latin

    -SYSTEM_MESSAGE = "You are an AI assistant acting like a Minion. Keep your responses short and conversational."
    -SPEAKER_WAV = "/Users/thomwolf/Documents/voice-chat-with-mistral/OpenVoice/resources/demo_speaker1.mp3"
    +SYSTEM_MESSAGE = "You are an AI assistant trapped in a computer. KEEP YOUR RESPONSES SHORT AND CONVERSATIONAL."
    +SPEAKER_WAV = None # "./resources/demo_speaker0.mp3"

     llm_client = OpenAI(base_url="http://localhost:1234/v1", api_key="not-needed")

    @@ -30,7 +30,7 @@
     tone_color_converter = ToneColorConverter(f'{tts_ckpt_converter}/config.json', device=device)
     tone_color_converter.load_ckpt(f'{tts_ckpt_converter}/checkpoint.pth')
     en_source_default_se = torch.load(f"{tts_en_ckpt_base}/en_default_se.pth").to(device)
    -target_se, _ = se_extractor.get_se(SPEAKER_WAV, tone_color_converter, target_dir='processed', vad=True)
    +target_se, _ = se_extractor.get_se(SPEAKER_WAV, tone_color_converter, target_dir='processed', vad=True) if SPEAKER_WAV else None, None
     sampling_rate = tts_model.hps.data.sampling_rate
     mark = tts_model.language_marks.get("english", None)

    @@ -50,7 +50,8 @@ def play_audio(text):
                 x_tst_lengths = torch.LongTensor([stn_tst.size(0)]).to(tts_model.device)
                 sid = torch.LongTensor([tts_model.hps.speakers["default"]]).to(tts_model.device)
                 audio = tts_model.model.infer(x_tst, x_tst_lengths, sid=sid, noise_scale=0.667, noise_scale_w=0.6)[0][0, 0].data.cpu().float().numpy()
    -            audio = tone_color_converter.convert_from_tensor(audio=audio, src_se=en_source_default_se, tgt_se=target_se)
    +            if target_se:
    +                audio = tone_color_converter.convert_from_tensor(audio=audio, src_se=en_source_default_se, tgt_se=target_se)
                 audio_list.append(audio)
             data = tts_model.audio_numpy_concat(audio_list, sr=sampling_rate).tobytes()
             stream.write(data)
    @@ -69,7 +70,6 @@ def on_press(key):
         def on_release(key):
             nonlocal recording
             if key == keyboard.Key.shift:
    -            print('Stop recording...')
                 recording = False
                 return False

    @@ -88,7 +88,7 @@ def on_release(key):
         frames = []
         while recording:
             data = stream.read(1024, exception_on_overflow = False)
    -        frames.append(np.fromstring(data, dtype=np.int16))
    +        frames.append(np.frombuffer(data, dtype=np.int16))
         print('Finished recording')

         data = np.hstack(frames, dtype=np.float32) / 32768.0
    @@ -101,14 +101,12 @@ def on_release(key):

     def conversation():
         conversation_history = [{'role': 'system', 'content': SYSTEM_MESSAGE}]
    -
         while True:
             user_input = record_and_transcribe_audio()
             conversation_history.append({'role': 'user', 'content': user_input})
    -
    -        response = llm_client.chat.completions.create(model="local-model",
    -                                    messages=conversation_history,
    -                                    )
    +        print(conversation_history)
    +
    +        response = llm_client.chat.completions.create(model="local-model", messages=conversation_history)
             chatbot_response = response.choices[0].message.content
             play_audio(chatbot_response)
             conversation_history.append({'role': 'assistant', 'content': chatbot_response})
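
    The np.fromstring to np.frombuffer swap in this revision tracks NumPy's deprecation of fromstring for binary input (deprecated since NumPy 1.14). A small standalone sketch (not from the gist) of the conversion the recording loop applies to each PyAudio chunk:

        import numpy as np

        # 1024 fake 16-bit samples, standing in for one stream.read(1024) chunk from PyAudio.
        raw = np.arange(1024, dtype=np.int16).tobytes()

        # np.fromstring(raw, dtype=np.int16) emits a DeprecationWarning for binary data;
        # np.frombuffer is the supported equivalent and avoids an extra copy.
        samples = np.frombuffer(raw, dtype=np.int16)

        # Whisper expects float32 in [-1.0, 1.0], matching the script's later / 32768.0 scaling.
        audio = samples.astype(np.float32) / 32768.0
        print(audio.shape, audio.dtype)   # (1024,) float32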
  6. @thomwolf created this gist Feb 14, 2024.
    119 changes: 119 additions & 0 deletions fast_speech_text_speech.py
    @@ -0,0 +1,119 @@
    """ To use: install LLM studio (or Ollama), clone OpenVoice, run this script in the OpenVoice directory
    git clone https://github.com/myshell-ai/OpenVoice
    pip install whisper pynput pyaudio
    """

    from openai import OpenAI
    import time
    import pyaudio
    import numpy as np
    import torch
    import os
    import re
    import se_extractor
    import whisper
    from pynput import keyboard
    from api import BaseSpeakerTTS, ToneColorConverter
    from utils import split_sentences_latin

    SYSTEM_MESSAGE = "You are an AI assistant acting like a Minion. Keep your responses short and conversational."
    SPEAKER_WAV = "/Users/thomwolf/Documents/voice-chat-with-mistral/OpenVoice/resources/demo_speaker1.mp3"

    llm_client = OpenAI(base_url="http://localhost:1234/v1", api_key="not-needed")

    tts_en_ckpt_base = os.path.join(os.path.dirname(__file__), "checkpoints/base_speakers/EN")
    tts_ckpt_converter = os.path.join(os.path.dirname(__file__), "checkpoints/converter")
    device = "cuda" if torch.cuda.is_available() else "mps" if torch.backends.mps.is_available() else "cpu"

    tts_model = BaseSpeakerTTS(f'{tts_en_ckpt_base}/config.json', device=device)
    tts_model.load_ckpt(f'{tts_en_ckpt_base}/checkpoint.pth')
    tone_color_converter = ToneColorConverter(f'{tts_ckpt_converter}/config.json', device=device)
    tone_color_converter.load_ckpt(f'{tts_ckpt_converter}/checkpoint.pth')
    en_source_default_se = torch.load(f"{tts_en_ckpt_base}/en_default_se.pth").to(device)
    target_se, _ = se_extractor.get_se(SPEAKER_WAV, tone_color_converter, target_dir='processed', vad=True)
    sampling_rate = tts_model.hps.data.sampling_rate
    mark = tts_model.language_marks.get("english", None)

    asr_model = whisper.load_model("base.en")

    def play_audio(text):
    p = pyaudio.PyAudio()
    stream = p.open(format=pyaudio.paFloat32, channels=1, rate=sampling_rate, output=True)
    texts = split_sentences_latin(text)
    for t in texts:
    audio_list = []
    t = re.sub(r'([a-z])([A-Z])', r'\1 \2', t)
    t = f'[{mark}]{t}[{mark}]'
    stn_tst = tts_model.get_text(t, tts_model.hps, False)
    with torch.no_grad():
    x_tst = stn_tst.unsqueeze(0).to(tts_model.device)
    x_tst_lengths = torch.LongTensor([stn_tst.size(0)]).to(tts_model.device)
    sid = torch.LongTensor([tts_model.hps.speakers["default"]]).to(tts_model.device)
    audio = tts_model.model.infer(x_tst, x_tst_lengths, sid=sid, noise_scale=0.667, noise_scale_w=0.6)[0][0, 0].data.cpu().float().numpy()
    audio = tone_color_converter.convert_from_tensor(audio=audio, src_se=en_source_default_se, tgt_se=target_se)
    audio_list.append(audio)
    data = tts_model.audio_numpy_concat(audio_list, sr=sampling_rate).tobytes()
    stream.write(data)
    stream.stop_stream()
    stream.close()
    p.terminate()


    def record_and_transcribe_audio():
    recording = False
    def on_press(key):
    nonlocal recording
    if key == keyboard.Key.shift:
    recording = True

    def on_release(key):
    nonlocal recording
    if key == keyboard.Key.shift:
    print('Stop recording...')
    recording = False
    return False

    listener = keyboard.Listener(
    on_press=on_press,
    on_release=on_release)
    listener.start()

    print('Press shift to record...')
    while not recording:
    time.sleep(0.1)
    print('Start recording...')

    p = pyaudio.PyAudio()
    stream = p.open(format=pyaudio.paInt16, channels=1, rate=16000, frames_per_buffer=1024, input=True)
    frames = []
    while recording:
    data = stream.read(1024, exception_on_overflow = False)
    frames.append(np.fromstring(data, dtype=np.int16))
    print('Finished recording')

    data = np.hstack(frames, dtype=np.float32) / 32768.0
    result = asr_model.transcribe(data)['text']
    stream.stop_stream()
    stream.close()
    p.terminate()
    return result


    def conversation():
    conversation_history = [{'role': 'system', 'content': SYSTEM_MESSAGE}]

    while True:
    user_input = record_and_transcribe_audio()
    conversation_history.append({'role': 'user', 'content': user_input})

    response = llm_client.chat.completions.create(model="local-model",
    messages=conversation_history,
    )
    chatbot_response = response.choices[0].message.content
    play_audio(chatbot_response)
    conversation_history.append({'role': 'assistant', 'content': chatbot_response})

    if len(conversation_history) > 20:
    conversation_history = conversation_history[-20:]

    conversation()
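
    One caveat in this first version that the later revisions keep: `conversation_history = conversation_history[-20:]` trims from the front, so once more than 20 messages accumulate the system message at index 0 is sliced away and the persona instruction is lost. A hedged sketch of a variant that pins the system prompt (trim_history is a hypothetical helper, not part of the gist):

        SYSTEM_MESSAGE = "You are an AI assistant acting like a Minion. Keep your responses short and conversational."

        def trim_history(history, max_turn_messages=20):
            # Keep the system prompt at index 0 and trim only the user/assistant turns,
            # so the persona instruction survives long conversations.
            system, turns = history[0], history[1:]
            return [system] + turns[-max_turn_messages:]

        # Usage sketch: would replace `conversation_history = conversation_history[-20:]`.
        history = [{'role': 'system', 'content': SYSTEM_MESSAGE}]
        history += [{'role': 'user', 'content': f'message {i}'} for i in range(30)]
        history = trim_history(history)
        print(history[0]['role'], len(history))   # system 21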