{'af': 'Afrikaans', 'am': 'Amharic', 'an': 'Aragonese', 'ar': 'Arabic', 'as': 'Assamese', 'az': 'Azerbaijani', 'ba': 'Bashkir', 'be': 'Belarusian', 'bg': 'Bulgarian', 'bn': 'Bengali', 'bpy': 'Bishnupriya Manipuri', 'bs': 'Bosnian', 'ca': 'Catalan', 'ca-ba': 'Catalan (Balearic)', 'ca-nw': 'Catalan (North-western)', 'ca-va': 'Catalan (Valencian)', 'chr-US-Qaaa-x-west': 'Cherokee ', 'cmn': 'Chinese (Mandarin, latin as English)', 'cmn-latn-pinyin': 'Chinese (Mandarin, latin as Pinyin)', 'cs': 'Czech', 'cv': 'Chuvash', 'cy': 'Welsh', 'da': 'Danish', 'de': 'German', 'el': 'Greek', 'en-029': 'English (Caribbean)', 'en-gb': 'English (Great Britain)', 'en-gb-scotland': 'English (Scotland)', 'en-gb-x-gbclan': 'English (Lancaster)', 'en-gb-x-gbcwmd': 'English (West Midlands)', 'en-gb-x-rp': 'English (Received Pronunciation)', 'en-shaw': 'English (Shavian alphabet)', 'en-us': 'English (America)', 'en-us-nyc': 'English (America, New York City)', 'eo': 'Esperanto', 'es': 'Spanish (Spain)', 'es-419': 'Spanish (Latin America
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| """ | |
| uv pip install transformers tqdm | |
| """ | |
| from transformers import AutoTokenizer, AutoModel | |
| from transformers.models.bert.tokenization_bert_fast import BertTokenizerFast | |
| from tqdm import tqdm | |
| in_path = 'input.txt' | |
| out_path = 'output.txt' |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import pandas as pd | |
| import re | |
| HEBREW_PHONEMES = r'ˈaeioubvdhzχtjklmnsfpʃwʔɡʁʒ' | |
| HEBREW_LETTERS = r"אבגדהוזחטיכךלמםנןסעפףצץקרשת" | |
| HEBREW_WORD_PATTERN = rf'[{HEBREW_LETTERS}]+' | |
| HEBREW_PHONEME_WORD_PATTERN = rf'[{HEBREW_PHONEMES}]+' |
We can make this file beautiful and searchable if this error is corrected: It looks like row 2 should actually have 1 column, instead of 3 in line 1.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| gold_000_line_000|ʃalˈom tslˈil ʔavʁahˈam. | |
| gold_000_line_001|leɡˈamʁe, madhˈim, lˈo? | |
| gold_000_line_002|vedavkˈa biɡlˈal ʃekulˈanu ʔajˈinu ʔasukˈim bamilχamˈa, hapˈodkast hazˈe ʃelˈanu hˈu hizdamnˈut lehaʃlˈim peʔaʁˈim ʃˈel mˈa ʃekaʁˈa bamiʃpˈat baʃvuʔˈajim haʔaχʁonˈim. | |
| gold_000_line_003|ʔˈaz hajˈom ʔanˈaχnu nedabˈeʁ ʔˈal haχakiʁˈa haneɡdˈit ʃˈel ʔˈilan jeʃuˈa, χakiʁˈa ʃehˈi mamˈaʃ | |
| gold_000_line_004|haʁamˈat masˈaχ meʔˈal svˈaχ ʃˈel ʔinteʁˈesim beʔolˈam hatikʃˈoʁet hajisʁaʔelˈit. | |
| gold_000_line_005|ʃenatχˈil | |
| gold_000_line_006|ʔaɡˈav, kˈol ʔoʁˈeχ dˈin ʔosˈe ʔˈet zˈe benifʁˈad. | |
| gold_000_line_008|vebasˈof jaʔasˈe ʔˈet zˈe ʔoʁˈeχ hadˈin bˈoʔaz bˈentsuʁ ʃehˈu ʔoʁˈeχ hadˈin ʃˈel netanjˈa. | |
| gold_000_line_009|naχˈon. | |
| gold_000_line_011|ʃeʔitonaʔˈim bewˈala lˈo jaχlˈu liχtˈov beʔˈetsem jediʔˈot ʔˈal bˈezek. |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| """ | |
| uv run prepare_ljspeech.py --input_path saspeech_automatic/metadata.csv --output_path saspeech_automatic/metadata1.csv | |
| """ | |
| import argparse | |
| import pandas as pd | |
| parser = argparse.ArgumentParser() | |
| parser.add_argument("--input_path", type=str, required=True) | |
| parser.add_argument("--output_path", type=str, required=True) |
This file has been truncated, but you can view the full file.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Lorem ipsum dolor sit amet consectetur adipiscing elit. Quisque faucibus ex sapien vitae pellentesque sem placerat. In id cursus mi pretium tellus duis convallis. Tempus leo eu aenean sed diam urna tempor. Pulvinar vivamus fringilla lacus nec metus bibendum egestas. Iaculis massa nisl malesuada lacinia integer nunc posuere. Ut hendrerit semper vel class aptent taciti sociosqu. Ad litora torquent per conubia nostra inceptos himenaeos. | |
| Lorem ipsum dolor sit amet consectetur adipiscing elit. Quisque faucibus ex sapien vitae pellentesque sem placerat. In id cursus mi pretium tellus duis convallis. Tempus leo eu aenean sed diam urna tempor. Pulvinar vivamus fringilla lacus nec metus bibendum egestas. Iaculis massa nisl malesuada lacinia integer nunc posuere. Ut hendrerit semper vel class aptent taciti sociosqu. Ad litora torquent per conubia nostra inceptos himenaeos. | |
| Lorem ipsum dolor sit amet consectetur adipiscing elit. Quisque faucibus ex sapien vitae pellentesque sem placerat. In id cursus mi pretium tellus |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| """ | |
| uv venv -p3.10 | |
| uv pip install numpy==1.26.4 soundfile transformers | |
| uv pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu | |
| """ | |
| from transformers import VitsModel, AutoTokenizer | |
| import torch | |
| import soundfile as sf |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| """ | |
| Hebrew diacritics encoding and decoding | |
| """ | |
| import unicodedata | |
| import re | |
| # Deduplicate duplicate phonetic diacritics | |
| NIQQUD_DEDUPLICATE = { |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| """ | |
| wget https://huggingface.co/datasets/thewh1teagle/hebright/resolve/main/knesset.txt.zip | |
| unzip knesset.txt.zip | |
| uv run main.py | |
| """ | |
| from pathlib import Path | |
| import time | |
| import requests |
Cross compile espeak-ng for aarch64 Linux
# Cross compile for aarch64 Linux
# Alsa
git clone https://github.com/alsa-project/alsa-lib
cd alsa-lib