Created
November 17, 2024 22:59
-
-
Save harrisonrw/6ecc30570e4475213706bc1331c6b998 to your computer and use it in GitHub Desktop.
Voice Cloning using Tortoise-TTS on Apple Silicon
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Voice cloning using Tortoise-TTS on Apple Silicon.
# A detailed walkthrough lives here:
# https://robertharrison.ca/blog/voice-cloning-tortoise-tts-apple-silicon/

# PyTorch and Tortoise imports.
import torch
import torchaudio
import torch.nn as nn
import torch.nn.functional as F

from tortoise.api import TextToSpeech
from tortoise.utils.audio import load_audio, load_voice, load_voices

# Create the Tortoise TextToSpeech engine. Instantiating it downloads all
# the models Tortoise uses from HuggingFace.
tts = TextToSpeech()

# The sentence to synthesize. Swap in something more interesting.
text = "Hello. This is your clone speaking."

# Quality "preset mode".
# Options: {"ultra_fast", "fast" (default), "standard", "high_quality"}.
preset = "fast"

# Name of the voice to clone. It must match the directory you created
# inside the `tortoise/voices` directory.
voice = "robert"

# Load the reference data for the chosen voice.
voice_samples, conditioning_latents = load_voice(voice)

# Synthesize the text in the custom voice.
gen = tts.tts_with_preset(
    text,
    voice_samples=voice_samples,
    conditioning_latents=conditioning_latents,
    preset=preset,
)

# Drop the leading dimension, move the result to the CPU, and write it to a
# WAV file named after the voice, passing 24000 as the sample rate.
output_path = f'generated-{voice}.wav'
waveform = gen.squeeze(0).cpu()
torchaudio.save(output_path, waveform, 24000)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment