Created
October 26, 2023 10:53
-
-
Save arturmartins/1c78de3e8c21ffce81a17dc2f2181de4 to your computer and use it in GitHub Desktop.
Converts WEBVTT subtitles (vtt) to plain text.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
""" | |
Converts WEBVTT subtitles (vtt) to plain text. | |
It removes all time related info as well as duplicated and empty lines. | |
""" | |
# Author: Artur Martins <[email protected]> | |
# Version: 1.0 | |
# Date: 2023-Oct-25 | |
import re | |
import argparse | |
import os | |
ENCODING_TYPE = "utf-8" | |
HEADER = "WEBVTT" | |
def clean_line(line: str) -> str:
    """
    Remove all WebVTT tags and time codes from the given line.

    Time codes are recognised in both forms the WebVTT format allows:
    ``hh:mm:ss.ttt`` (with two or more hour digits) and the short
    ``mm:ss.ttt`` form that omits the hours field.

    Args:
        line (str): The line of text to clean.

    Returns:
        str: The cleaned line with all tags and time codes removed and
            leading/trailing whitespace stripped.
    """
    # Drop everything enclosed in angle brackets first; this covers styling
    # tags such as <c> as well as inline cue timestamps like <00:00:01.000>.
    without_tags = re.sub(r"<.*?>", "", line)
    # The hours field is optional and may be longer than two digits, so it is
    # matched as an optional, open-ended group. Matching it greedily avoids
    # leaving stray leading digits behind (e.g. "100:00:00.000" -> "1").
    without_time_codes = re.sub(
        r"(?:\d{2,}:)?\d{2}:\d{2}\.\d{3}", "", without_tags
    )
    return without_time_codes.strip()
def convert_webvtt_to_text(input_path: str, output_path: str, verbose: bool) -> None:
    """
    Convert a WebVTT file to plain text.

    Timing lines (those containing ``-->``), the ``WEBVTT`` header line and
    empty lines are skipped. Every remaining line is passed through
    ``clean_line`` and written to the output, unless it is empty after
    cleaning or identical to the line written immediately before it.

    Args:
        input_path (str): The path to the WebVTT input file.
        output_path (str): The path to the output text file.
        verbose (bool): If True, print the cleaned lines as they are written.

    Returns:
        None

    Raises:
        ValueError: If ``input_path`` and ``output_path`` resolve to the same
            file; opening it for writing would truncate the input before it
            could be read.
    """
    if os.path.realpath(input_path) == os.path.realpath(output_path):
        raise ValueError("input_path and output_path must refer to different files")

    # A header line may carry extra text after "WEBVTT", separated from it
    # by a space or a tab.
    header_prefixes = (HEADER + " ", HEADER + "\t")
    last_written_line = ""
    with open(input_path, "r", encoding=ENCODING_TYPE) as infile, open(
        output_path, "w", encoding=ENCODING_TYPE
    ) as outfile:
        # Iterate the file lazily instead of loading every line into memory.
        for line_number, raw_line in enumerate(infile):
            line = raw_line.strip()
            if line_number == 0:
                # A byte order mark is not whitespace, so strip() leaves it in
                # place and the header comparison below would fail.
                line = line.lstrip("\ufeff")
                if line.startswith(header_prefixes):
                    continue
            # Skip time lines or WebVTT header
            if "-->" in line or line == HEADER:
                continue
            # Skip empty lines
            if not line:
                continue
            cleaned_line = clean_line(line)
            # Only consecutive repeats are suppressed; a line that reappears
            # later in the file is written again.
            if cleaned_line and cleaned_line != last_written_line:
                if verbose:
                    print(f"Writing: {cleaned_line}")
                outfile.write(cleaned_line + "\n")
                last_written_line = cleaned_line
def main(argv: "list[str] | None" = None) -> None:
    """
    Parse command-line arguments and run the WebVTT to text conversion.

    When no output path is given, it defaults to the input path with its
    extension replaced by ``.txt``.

    Args:
        argv: Argument list to parse. When None, argparse reads the
            arguments from ``sys.argv``.

    Returns:
        None

    Raises:
        SystemExit: Raised by argparse on invalid arguments, or when the
            output path resolves to the same file as the input path.
    """
    parser = argparse.ArgumentParser(description="Convert WebVTT to text.")
    parser.add_argument(
        "-i", "--input", required=True, help="Path to the input WebVTT file."
    )
    parser.add_argument(
        "-o",
        "--output",
        help="Path to the output text file. Defaults to input file name with .txt extension.",
    )
    parser.add_argument(
        "-v", "--verbose", action="store_true", help="Enable verbose output."
    )
    args = parser.parse_args(argv)

    if args.output is None:
        base_name = os.path.splitext(args.input)[0]
        args.output = f"{base_name}.txt"

    # An input that already ends in ".txt" makes the default output path equal
    # to the input path; writing to it would truncate the source before it is
    # read, so refuse instead of destroying the file.
    if os.path.realpath(args.input) == os.path.realpath(args.output):
        parser.error(
            "output file would overwrite the input file; "
            "choose a different path with -o/--output"
        )

    if args.verbose:
        print(f"Converting {args.input} to {args.output}...")
    convert_webvtt_to_text(args.input, args.output, args.verbose)


if __name__ == "__main__":
    main()
Thank you for this.
I wanted to use it as a filter so that I could:
yt-dlp --skip-download --sub-langs en --convert-subs vtt -o ~/.tmp.subtitle <video_url> | vtt2txt; cat ~/.tmp.subtitle.en.vtt | vtt2txt | fabric -ps extract_wisdom
So I made this into this filter
You're welcome, @mholtzhausen! Your filter is pretty cool! I like it! I will use it myself as well :)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Sorry @epogrebnyak, I completely missed your message.
Sure, no problem. Feel free to leave my name and email in the script as is. Good luck!