""" Convert YouTube subtitles(vtt) to human readable text. Download only subtitles from YouTube with youtube-dl: youtube-dl --skip-download --convert-subs vtt <video_url> Note that default subtitle format provided by YouTube is ass, which is hard to process with simple regex. Luckily youtube-dl can convert ass to vtt, which is easier to process. To conver all vtt files inside a directory: find . -name "*.vtt" -exec python vtt2text.py {} \; """ import sys import re def remove_tags(text): """ Remove vtt markup tags """ tags = [ r'</c>', r'<c(\.color\w+)?>', r'<\d{2}:\d{2}:\d{2}\.\d{3}>', ] for pat in tags: text = re.sub(pat, '', text) # extract timestamp, only kep HH:MM text = re.sub( r'(\d{2}:\d{2}):\d{2}\.\d{3} --> .* align:start position:0%', r'\g<1>', text ) text = re.sub(r'^\s+$', '', text, flags=re.MULTILINE) return text def remove_header(lines): """ Remove vtt file header """ pos = -1 for mark in ('##', 'Language: en',): if mark in lines: pos = lines.index(mark) lines = lines[pos+1:] return lines def merge_duplicates(lines): """ Remove duplicated subtitles. Duplacates are always adjacent. """ last_timestamp = '' last_cap = '' for line in lines: if line == "": continue if re.match('^\d{2}:\d{2}$', line): if line != last_timestamp: yield line last_timestamp = line else: if line != last_cap: yield line last_cap = line def merge_short_lines(lines): buffer = '' for line in lines: if line == "" or re.match('^\d{2}:\d{2}$', line): yield '\n' + line continue if len(line+buffer) < 80: buffer += ' ' + line else: yield buffer.strip() buffer = line yield buffer def remove_remaining_timestamp_lines(lines): regex_remaining_timestamp_lines = re.compile('^\\n[0-9]{2}:[d0-9]{2}$') lines = [line for line in lines if not regex_remaining_timestamp_lines.match(line)] return lines def remove_webvtt_header(lines): lines[0] = re.sub('WEBVTT Kind: captions Language: [a-z]{2} ', '', lines[0]) return lines def main(): vtt_file_name = sys.argv[1] txt_name = re.sub(r'.vtt$', '.txt', vtt_file_name) with open(vtt_file_name) as f: text = f.read() text = remove_tags(text) lines = text.splitlines() lines = remove_header(lines) lines = merge_duplicates(lines) lines = list(lines) lines = merge_short_lines(lines) lines = list(lines) lines = remove_remaining_timestamp_lines(lines) lines = remove_webvtt_header(lines) with open(txt_name, 'w') as f: for line in lines: f.write(line) f.write("\n") if __name__ == "__main__": main()