mholtzhausen · June 17, 2025 12:35 · mholtzhausen · Jun 16, 2025
diff --git a/vtt2txt.py b/vtt2txt.py
 #!/usr/bin/env python3

 """
 Convert YouTube subtitles (VTT) to human-readable text.
 This script is designed to be used as a command-line tool,
 reading from stdin and writing to stdout.

 Example Usage:
 yt-dlp --skip-download --sub-langs en --convert-subs vtt -o ~/.tmp.subtitle <video_url> | vtt2txt; cat ~/.tmp.subtitle.en.vtt | vtt2txt | fabric -ps extract_wisdom
 """

 import sys
 import re


 def remove_tags(text):
    """
    Strip vtt inline markup and shorten cue timing lines.

    Closing ``</c>`` tags, opening ``<c>`` / ``<c.colorXXXX>`` tags and
    inline ``<HH:MM:SS.mmm>`` timestamps are deleted. Each cue timing line
    (``HH:MM:SS.mmm --> ...``) is replaced by its leading ``HH:MM`` part,
    and lines left with only whitespace are emptied.

    Returns the cleaned text as one string.
    """
    markup_patterns = (
        r'</c>',
        r'<c(\.color\w+)?>',
        r'<\d{2}:\d{2}:\d{2}\.\d{3}>',
    )

    # Patterns are applied one after another, in this order.
    cleaned = text
    for pattern in markup_patterns:
        cleaned = re.compile(pattern).sub('', cleaned)

    # Keep only the HH:MM prefix of every "start --> end ..." timing line.
    cue_timing = re.compile(r'(\d{2}:\d{2}):\d{2}\.\d{3} --> .*')
    cleaned = cue_timing.sub(lambda found: found.group(1), cleaned)

    # Lines that now hold nothing but whitespace become truly empty.
    whitespace_only = re.compile(r'^\s+$', re.MULTILINE)
    return whitespace_only.sub('', cleaned)

 def remove_header(lines):
    """
    Drop the vtt header that precedes the first cue.

    The first line containing ``-->`` is treated as the start of the
    subtitle entries; that line and everything after it is returned.
    When no line contains ``-->`` all lines are returned.
    """
    first_cue = next(
        (index for index, entry in enumerate(lines) if '-->' in entry),
        0,  # no cue timing found: keep everything
    )
    return lines[first_cue:]


 def merge_duplicates(lines):
    """
    Yield lines with repeated timestamps and repeated captions removed.

    Blank lines are skipped. A line that is exactly ``HH:MM`` is a
    timestamp and is yielded only when it differs from the previously
    yielded timestamp. Any other line is a caption and is yielded only
    when its stripped text differs from the previously yielded caption.
    """
    timestamp_re = re.compile(r'^\d{2}:\d{2}$')
    previous_stamp = ''
    previous_caption = ''

    for entry in lines:
        caption = entry.strip()
        if not caption:
            continue

        if timestamp_re.match(entry):
            if entry == previous_stamp:
                continue
            previous_stamp = entry
            yield entry
        elif caption != previous_caption:
            previous_caption = caption
            yield entry


 def merge_short_lines(lines, width=80):
    """
    Merge consecutive short subtitle lines into longer lines.

    Text lines are collected in a buffer. A line is appended while
    ``len(buffer) + len(line)`` stays below ``width``; otherwise the buffer
    is emitted and a new one is started with that line. A timestamp line
    (``HH:MM`` after stripping) emits the pending buffer and is then
    yielded itself, prefixed with a newline.

    Args:
        lines: iterable of timestamp and caption strings.
        width: length limit used when deciding whether to keep merging
            (defaults to 80, the previously hard-coded value).

    Yields:
        Merged caption strings and ``'\\n' + timestamp`` strings. Empty
        strings are never yielded.
    """
    timestamp_re = re.compile(r'^\d{2}:\d{2}$')
    buffer = ''
    for line in lines:
        text = line.strip()

        # A timestamp closes the current paragraph.
        if timestamp_re.match(text):
            pending = buffer.strip()
            if pending:
                yield pending
            buffer = ''
            yield '\n' + text
            continue

        if len(buffer) + len(line) < width:
            buffer += ' ' + text
        else:
            # Only flush real content: with an empty buffer and an over-long
            # line there is nothing to emit yet.
            pending = buffer.strip()
            if pending:
                yield pending
            buffer = text

    # Emit whatever is left once the input is exhausted.
    pending = buffer.strip()
    if pending:
        yield pending

 def process_vtt(text):
    """
    Main processing pipeline for the VTT content.

    Steps, in order: drop the file header, strip markup and shorten cue
    timings, remove adjacent duplicate lines, merge short lines, and
    finally drop the timestamp lines themselves. The timestamps therefore
    only act as paragraph boundaries during merging and do not appear in
    the result.

    Args:
        text: full contents of a .vtt file.

    Returns:
        The subtitle text as a newline-joined string, stripped of
        leading and trailing whitespace.
    """
    # 1. Remove the header first. remove_header() looks for the raw '-->'
    #    cue timing marker, which remove_tags() rewrites to 'HH:MM'; running
    #    the tag pass first would leave the header in the output.
    lines = remove_header(text.splitlines())

    # 2. Tag and cue-timing cleanup on the header-less text
    text = remove_tags('\n'.join(lines))

    # 3. Remove duplicate lines
    lines = merge_duplicates(text.splitlines())

    # 4. Merge short lines for better readability
    lines = merge_short_lines(lines)

    # 5. Final cleanup: drop empty entries and timestamp-only lines
    timestamp_only = re.compile(r'^\s*\d{2}:\d{2}\s*$')
    kept = [line for line in lines if line and not timestamp_only.match(line)]

    return '\n'.join(kept).strip()


 def main():
    """
    Command-line entry point: convert VTT from stdin to text on stdout.

    When stdin is a terminal (nothing piped in), a usage hint is written
    to stderr and the process exits with status 1.
    """
    if not sys.stdin.isatty():
        # Input was piped in: read it all, convert it, print the result.
        print(process_vtt(sys.stdin.read()))
        return

    print("Usage: cat your_file.vtt | vtt2txt", file=sys.stderr)
    sys.exit(1)

 # Run the converter only when executed as a script, not when imported.
 if __name__ == "__main__":
    main()
	#!/usr/bin/env python3

	"""
	Convert YouTube subtitles (VTT) to human-readable text.
	This script is designed to be used as a command-line tool,
	reading from stdin and writing to stdout.

	Example Usage:
	yt-dlp --skip-download --sub-langs en --convert-subs vtt -o ~/.tmp.subtitle <video_url> | vtt2txt; cat ~/.tmp.subtitle.en.vtt | vtt2txt | fabric -ps extract_wisdom
	"""

	import sys
	import re


	def remove_tags(text):
	    """
	    Remove vtt markup tags.

	    Deletes ``</c>``, ``<c>`` / ``<c.colorXXXX>`` and inline
	    ``<HH:MM:SS.mmm>`` tags, reduces each cue timing line
	    (``HH:MM:SS.mmm --> ...``) to its leading ``HH:MM`` part, and empties
	    lines that contain only whitespace. Returns the cleaned text.
	    """
	    tags = [
	        r'</c>',
	        r'<c(\.color\w+)?>',
	        r'<\d{2}:\d{2}:\d{2}\.\d{3}>',
	    ]

	    for pat in tags:
	        text = re.sub(pat, '', text)

	    # Extract timestamp, only keep HH:MM
	    text = re.sub(
	        r'(\d{2}:\d{2}):\d{2}\.\d{3} --> .*',
	        r'\g<1>',
	        text
	    )

	    # Clean up lines that might be empty after tag removal
	    text = re.sub(r'^\s+$', '', text, flags=re.MULTILINE)
	    return text

	def remove_header(lines):
	    """
	    Remove vtt file header lines.

	    Returns the lines from the first one containing ``-->`` onwards; when
	    no such line exists, all lines are returned.
	    """
	    # Find the position of the first timestamp to reliably skip the header
	    start_pos = 0
	    for i, line in enumerate(lines):
	        if '-->' in line:
	            start_pos = i
	            break

	    # Return lines from the first subtitle entry onwards
	    return lines[start_pos:]


	def merge_duplicates(lines):
	    """
	    Remove duplicated subtitles. Duplicates are always adjacent.

	    Blank lines are skipped. A line that is exactly ``HH:MM`` is yielded
	    only if it differs from the last yielded timestamp; any other line is
	    yielded only if its stripped text differs from the last yielded
	    caption.
	    """
	    last_timestamp = ''
	    last_cap = ''
	    for line in lines:
	        if not line.strip():  # Skip empty or whitespace-only lines
	            continue

	        # Check if the line is a timestamp
	        if re.match(r'^\d{2}:\d{2}$', line):
	            if line != last_timestamp:
	                yield line
	                last_timestamp = line
	        else:
	            # Check if the line is a subtitle text
	            if line.strip() != last_cap:
	                yield line
	                last_cap = line.strip()


	def merge_short_lines(lines, width=80):
	    """
	    Merge consecutive short subtitle lines into a single line up to a certain width.

	    A text line is appended to the buffer while
	    ``len(buffer) + len(line)`` stays below ``width`` (default 80);
	    otherwise the buffer is emitted and restarted with that line. A
	    timestamp line (``HH:MM`` after stripping) emits the buffer and is
	    then yielded prefixed with a newline. Empty strings are never yielded.
	    """
	    buffer = ''
	    for line in lines:
	        # If it's a timestamp, emit the buffer and then the timestamp
	        if re.match(r'^\d{2}:\d{2}$', line.strip()):
	            if buffer.strip():
	                yield buffer.strip()
	            buffer = ''
	            yield '\n' + line.strip()
	            continue

	        # If it's a text line, add it to the buffer
	        if len(buffer) + len(line) < width:
	            buffer += ' ' + line.strip()
	        else:
	            # Nothing to flush when an over-long line arrives with an
	            # empty buffer; skipping avoids yielding an empty string.
	            if buffer.strip():
	                yield buffer.strip()
	            buffer = line.strip()

	    # Yield any remaining text in the buffer
	    if buffer.strip():
	        yield buffer.strip()

	def process_vtt(text):
	    """
	    Main processing pipeline for the VTT content.

	    Drops the file header, strips markup and shortens cue timings,
	    removes adjacent duplicates, merges short lines, and finally removes
	    the timestamp lines, returning the remaining text joined by newlines
	    and stripped of surrounding whitespace.
	    """
	    # 1. Remove header while the raw '-->' cue timings are still present;
	    #    remove_tags() rewrites them to 'HH:MM', after which the header
	    #    could no longer be located.
	    lines = remove_header(text.splitlines())

	    # 2. Tag and metadata removal
	    text = remove_tags('\n'.join(lines))
	    lines = text.splitlines()

	    # 3. Remove duplicate lines
	    lines = list(merge_duplicates(lines))

	    # 4. Merge short lines for better readability
	    lines = list(merge_short_lines(lines))

	    # 5. Final cleanup of any remaining timestamp-only lines.
	    #    merge_short_lines() yields timestamps as '\n' + 'HH:MM', so the
	    #    surrounding whitespace must be optional (\s*), not exactly one
	    #    character on each side.
	    processed_lines = []
	    for line in lines:
	        if line and not re.match(r'^\s*\d{2}:\d{2}\s*$', line):
	            processed_lines.append(line)

	    return '\n'.join(processed_lines).strip()


	def main():
	    """
	    Reads from stdin, processes the VTT content, and prints to stdout.

	    If stdin is a terminal (nothing piped in), prints a usage hint to
	    stderr and exits with status 1.
	    """
	    # Check if there is any input from stdin
	    if sys.stdin.isatty():
	        # Plain '|' here: a backslash-escaped pipe would be printed
	        # literally and is an invalid string escape.
	        print("Usage: cat your_file.vtt | vtt2txt", file=sys.stderr)
	        sys.exit(1)

	    # Read the entire VTT content from stdin
	    vtt_content = sys.stdin.read()

	    # Process the content
	    plain_text = process_vtt(vtt_content)

	    # Print the final, clean text to stdout
	    print(plain_text)


	# Run the converter only when executed as a script, not when imported.
	if __name__ == "__main__":
	    main()