Skip to content

Instantly share code, notes, and snippets.

@mholtzhausen
Last active June 17, 2025 12:35
Show Gist options
  • Save mholtzhausen/beafda24a3bc9e4799b102bdc3df348d to your computer and use it in GitHub Desktop.
Save mholtzhausen/beafda24a3bc9e4799b102bdc3df348d to your computer and use it in GitHub Desktop.
Converts WebVTT subtitles (.vtt) to plain text; reads from stdin and writes to stdout so it can be used in a pipeline.
#!/usr/bin/env python3
"""
Convert YouTube subtitles (vtt) to human-readable text.
This script is designed to be used as a command-line tool,
reading from stdin and writing to stdout.
Example Usage:
yt-dlp --skip-download --write-subs --write-auto-subs --sub-langs en --convert-subs vtt -o ~/.tmp.subtitle <video_url> && cat ~/.tmp.subtitle.en.vtt | vtt2txt | fabric -ps extract_wisdom
"""
import sys
import re
def remove_tags(text):
    """
    Remove vtt markup tags and shorten cue timing lines.

    Strips every WebVTT cue-text tag (``<c>``, ``<i>``, ``<b>``, ``<u>``,
    ``<v Speaker>``, ``<lang xx>``, ``<ruby>``, ``<rt>``, with or without
    ``.class`` suffixes or annotations) as well as inline ``<hh:mm:ss.ttt>``
    timestamp tags. Each cue timing line (``start --> end settings``) is
    replaced by the ``HH:MM`` part of its start time; when the optional hours
    field is absent the result is ``00:MM``.

    :param text: the raw vtt content as a single string.
    :return: the text with tags removed and timing lines shortened.
    """
    tags = [
        # Opening/closing cue span tags. Only the tag names defined by WebVTT
        # are matched so that ordinary "<" / ">" in captions is left alone.
        r'</?(?:c|i|b|u|v|lang|ruby|rt)(?:[.\s][^>]*)?>',
        # Inline (karaoke-style) timestamps; the hours field is optional.
        r'<(?:\d{2,}:)?\d{2}:\d{2}\.\d{3}>',
    ]
    for pat in tags:
        text = re.sub(pat, '', text)
    # Extract timestamp, only keep HH:MM (hours default to 00 when omitted).
    text = re.sub(
        r'(?:(\d{2}):)?(\d{2}):\d{2}\.\d{3} --> .*',
        lambda m: (m.group(1) or '00') + ':' + m.group(2),
        text
    )
    # Clean up lines that might be empty after tag removal
    text = re.sub(r'^\s+$', '', text, flags=re.MULTILINE)
    return text
def remove_header(lines):
    """
    Remove vtt file header lines.

    Everything before the first cue timing line is dropped. A timing line is
    recognised either in its raw form (it contains ``-->``) or in the
    shortened ``HH:MM`` form that ``remove_tags`` leaves behind, so the header
    is removed regardless of whether tags were stripped first.

    :param lines: list of lines of the vtt content.
    :return: a new list starting at the first timing line, or a copy of all
        lines when no timing line is found.
    """
    for i, line in enumerate(lines):
        # The second test matters because remove_tags() rewrites
        # "HH:MM:SS.mmm --> ..." to "HH:MM", which no longer contains '-->'.
        if '-->' in line or re.match(r'^\d{2}:\d{2}$', line.strip()):
            # Return lines from the first subtitle entry onwards
            return lines[i:]
    return lines[:]
def merge_duplicates(lines):
    """
    Remove duplicated subtitles. Duplicates are always adjacent.

    Blank lines are skipped. A shortened ``HH:MM`` timestamp line is yielded
    only when it differs from the previously yielded timestamp; a caption
    line is yielded only when it differs from the previously yielded caption.
    All comparisons ignore surrounding whitespace, but lines are yielded
    exactly as received.

    :param lines: iterable of lines (timestamps and caption text).
    :return: generator over the de-duplicated lines.
    """
    timestamp_re = re.compile(r'^\d{2}:\d{2}$')
    last_timestamp = ''
    last_cap = ''
    for line in lines:
        stripped = line.strip()
        if not stripped:  # Skip empty or whitespace-only lines
            continue
        # Match on the stripped text so padded timestamps are not mistaken
        # for captions (the later pipeline stages strip before matching too).
        if timestamp_re.match(stripped):
            if stripped != last_timestamp:
                yield line
                last_timestamp = stripped
        elif stripped != last_cap:
            yield line
            last_cap = stripped
def merge_short_lines(lines, width=80):
    """
    Merge consecutive short subtitle lines into a single line up to a certain width.

    Caption lines are accumulated in a buffer until adding the next one would
    reach ``width``; the buffer is then yielded and restarted. A shortened
    ``HH:MM`` timestamp line flushes the buffer and is yielded on its own,
    prefixed with a newline. Blank lines are ignored and an empty buffer is
    never yielded.

    :param lines: iterable of lines (timestamps and caption text).
    :param width: length threshold for a merged line (default 80).
    :return: generator over merged caption lines and timestamp lines.
    """
    timestamp_re = re.compile(r'^\d{2}:\d{2}$')
    buffer = ''
    for line in lines:
        text = line.strip()
        if not text:
            # Nothing to merge; avoids stray double spaces in the buffer.
            continue
        # If it's a timestamp, flush the buffer and then emit the timestamp
        if timestamp_re.match(text):
            if buffer:
                yield buffer.strip()
                buffer = ''
            yield '\n' + text
            continue
        # If it's a text line, add it to the buffer while it still fits
        if len(buffer) + len(text) < width:
            buffer += ' ' + text
        else:
            # The buffer is empty when the very first caption is already too
            # long; do not emit an empty string in that case.
            if buffer:
                yield buffer.strip()
            buffer = text
    # Yield any remaining text in the buffer
    if buffer:
        yield buffer.strip()
def process_vtt(text):
    """
    Main processing pipeline for the VTT content.

    :param text: the complete vtt content as a single string.
    :return: the caption text as plain text, one merged line per row, with
        header, markup, duplicate captions and timestamps removed.
    """
    # 1. Remove the header first: remove_header() locates the first cue by its
    #    '-->' marker, which remove_tags() would otherwise have rewritten.
    lines = remove_header(text.splitlines())
    # 2. Tag removal and timestamp shortening
    text = remove_tags('\n'.join(lines))
    # 3. Remove duplicate lines, 4. merge short lines for better readability
    merged = merge_short_lines(merge_duplicates(text.splitlines()))
    # 5. Final cleanup of any remaining timestamp-only lines
    processed_lines = [
        line for line in merged
        if line and not re.match(r'^\s*\d{2}:\d{2}\s*$', line)
    ]
    return '\n'.join(processed_lines).strip()
def main():
    """
    Command-line entry point: convert VTT from stdin to plain text on stdout.

    When stdin is a terminal (nothing was piped in), a usage hint is written
    to stderr and the process exits with status 1.
    """
    if not sys.stdin.isatty():
        # Input was piped in: read all of it, convert, and emit the result.
        print(process_vtt(sys.stdin.read()))
        return
    print("Usage: cat your_file.vtt | vtt2txt", file=sys.stderr)
    sys.exit(1)


if __name__ == "__main__":
    main()
@mholtzhausen
Copy link
Author

Thank you to arturmartins for his code - this code was modified from that.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment