Last active
June 17, 2025 12:35
-
-
Save mholtzhausen/beafda24a3bc9e4799b102bdc3df348d to your computer and use it in GitHub Desktop.
Converts WebVTT subtitles (.vtt) to plain text; reads from stdin and writes to stdout, so it can be used in a pipeline.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
""" | |
Convert YouTube subtitles(vtt) to human readable text. | |
This script is designed to be used as a command-line tool, | |
reading from stdin and writing to stdout. | |
Example Usage: | |
yt-dlp --skip-download --sub-langs en --convert-subs vtt -o ~/.tmp.subtitle <video_url> | vtt2txt; cat ~/.tmp.subtitle.en.vtt | vtt2txt | fabric -ps extract_wisdom | |
""" | |
import sys | |
import re | |
def remove_tags(text):
    """
    Remove vtt markup tags and shorten cue timing lines.

    Strips opening and closing WebVTT cue-text tags (``c``, ``i``, ``b``,
    ``u``, ``v``, ``lang``, ``ruby``, ``rt``), including any ``.class``
    suffix or annotation, as well as inline ``<HH:MM:SS.mmm>`` timestamps.
    Each ``HH:MM:SS.mmm --> ...`` timing line is then reduced to its first
    two fields (``HH:MM``), and lines left containing only whitespace are
    emptied.

    Args:
        text: The VTT content as a single string.

    Returns:
        The text with markup removed and timing lines shortened.
    """
    tags = [
        # Any cue-text tag, e.g. </c>, <c.colorE5E5E5>, <i>, <v Speaker>.
        # The [.\s] guard keeps look-alikes such as <code> from matching.
        r'</?(?:c|i|b|u|v|lang|ruby|rt)(?:[.\s][^>]*)?>',
        # Inline word-level timestamps, e.g. <00:00:01.500>.
        r'<\d{2}:\d{2}:\d{2}\.\d{3}>',
    ]

    for pat in tags:
        text = re.sub(pat, '', text)

    # Extract timestamp, only keep HH:MM; the trailing ".*" also drops the
    # end time and any cue settings that follow it on the same line.
    text = re.sub(
        r'(\d{2}:\d{2}):\d{2}\.\d{3} --> .*',
        r'\g<1>',
        text
    )

    # Clean up lines that might be empty after tag removal
    text = re.sub(r'^\s+$', '', text, flags=re.MULTILINE)
    return text
def remove_header(lines):
    """
    Remove vtt file header lines.

    Everything before the first cue timing line is treated as header. A
    timing line is recognised either in its raw form (it contains ``-->``)
    or in the shortened ``HH:MM`` form, so the header is found whether or
    not the timing lines have already been rewritten.

    Args:
        lines: List of lines of the VTT content.

    Returns:
        A new list starting at the first timing line, or a copy of all
        lines when no timing line is present.
    """
    # Find the position of the first timestamp to reliably skip the header
    start_pos = 0
    for i, line in enumerate(lines):
        if '-->' in line or re.match(r'^\d{2}:\d{2}$', line.strip()):
            start_pos = i
            break

    # Return lines from the first subtitle entry onwards
    return lines[start_pos:]
def merge_duplicates(lines):
    """
    Remove duplicated subtitles. Duplicates are always adjacent.

    Blank lines are dropped. A timestamp line (``HH:MM``) is emitted only
    when it differs from the previously emitted timestamp, and a caption
    line only when it differs from the previously emitted caption. All
    comparisons use the stripped text, so surrounding whitespace does not
    affect classification or duplicate detection.

    Args:
        lines: Iterable of timestamp and caption lines.

    Yields:
        The lines that survive de-duplication, unmodified.
    """
    last_timestamp = ''
    last_cap = ''

    for line in lines:
        stripped = line.strip()
        if not stripped:  # Skip empty or whitespace-only lines
            continue

        # Classify on the stripped text so a padded timestamp is not
        # mistaken for caption text (which would reset last_cap).
        if re.match(r'^\d{2}:\d{2}$', stripped):
            if stripped != last_timestamp:
                yield line
                last_timestamp = stripped
        elif stripped != last_cap:
            yield line
            last_cap = stripped
def merge_short_lines(lines, width=80):
    """
    Merge consecutive short subtitle lines into a single line up to a certain width.

    Caption lines are accumulated in a buffer until adding the next one
    would reach ``width``; the buffer is then emitted and restarted with
    that line. A timestamp line (``HH:MM``) flushes the buffer and is
    emitted on its own, prefixed with a newline. Empty buffers are never
    emitted, so a caption longer than ``width`` does not produce a blank
    entry ahead of it.

    Args:
        lines: Iterable of timestamp and caption lines.
        width: Length limit for a merged line (default 80).

    Yields:
        Merged caption lines and newline-prefixed timestamp lines.
    """
    buffer = ''

    for line in lines:
        text = line.strip()

        # If it's a timestamp, emit the buffer and then the timestamp
        if re.match(r'^\d{2}:\d{2}$', text):
            pending = buffer.strip()
            if pending:
                yield pending
            buffer = ''
            yield '\n' + text
            continue

        # If it's a text line, add it to the buffer while it still fits
        if len(buffer) + len(line) < width:
            buffer += ' ' + text
        else:
            pending = buffer.strip()
            if pending:  # nothing to flush when the very first line is long
                yield pending
            buffer = text

    # Yield any remaining text in the buffer
    pending = buffer.strip()
    if pending:
        yield pending
def process_vtt(text):
    """
    Main processing pipeline for the VTT content.

    The header is stripped before tags are removed: header detection keys
    on the ``-->`` marker of the first timing line, and tag removal
    rewrites those timing lines to a bare ``HH:MM``.

    Args:
        text: The full VTT content as a single string.

    Returns:
        The caption text, one merged line per row, with header, markup,
        adjacent duplicates and timestamp lines removed.
    """
    # 1. Remove header while the raw timing lines are still present
    lines = remove_header(text.splitlines())

    # 2. Tag and metadata removal
    lines = remove_tags('\n'.join(lines)).splitlines()

    # 3. Remove duplicate lines
    lines = merge_duplicates(lines)

    # 4. Merge short lines for better readability
    lines = merge_short_lines(lines)

    # 5. Final cleanup: drop empty entries and timestamp-only lines
    processed_lines = [
        line for line in lines
        if line and not re.match(r'^\s*\d{2}:\d{2}\s*$', line)
    ]

    return '\n'.join(processed_lines).strip()
def main():
    """
    Command-line entry point: VTT in on stdin, plain text out on stdout.

    When stdin is a terminal (nothing was piped in), a usage hint is
    printed to stderr and the process exits with status 1.
    """
    source = sys.stdin

    # An interactive stdin means no file was piped to us.
    if source.isatty():
        print("Usage: cat your_file.vtt | vtt2txt", file=sys.stderr)
        sys.exit(1)

    # Slurp the whole input, convert it, and emit the result.
    print(process_vtt(source.read()))
# Run the converter only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Thanks to arturmartins for the original code, from which this script was adapted.