Last active
November 5, 2020 21:37
-
-
Save larshb/8ec6657729662425dbc89232029eddfc to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3
"""Remove annotation lines (e.g. "[door slams]", "(sighs)") from an SRT file.

A line of a cue is dropped when its first character is one of the selected
"strip" characters. Cues left without any text are removed entirely and the
remaining cues are renumbered from 1.
"""
import re
from argparse import ArgumentParser
from pathlib import Path
import logging as log

# One cue: numeric index line, timing line, then text up to the next blank line.
SRT_REGEX = r"(?P<index>\d+)\n(?P<time>[\d\:\ \-\>\,]+)\n(?P<text>(?:.|\n)*?)\n\n"

EPILOG = f'''Example:
python3 {__file__} -i subtitle.srt -bcap'''

# Leading character that marks a line for removal, keyed by argparse dest.
STRIP_FLAGS = {
    'parantheses': '(',
    'curlies': '{',
    'angles': '<',
    'brackets': '[',
    'asterisks': '*',
    'notes': '♪',
}


def _build_parser():
    """Return the command-line parser for the script."""
    parser = ArgumentParser(epilog=EPILOG)
    parser.add_argument('-i', '--input', required=True)
    parser.add_argument('-o', '--output')
    parser.add_argument('-f', '--force', action='store_true',
                        help="Force write")
    parser.add_argument('-v', '--verbose', action='store_true',
                        help="Verbose logging")
    parser.add_argument('-p', '--parantheses', action='store_true',
                        help="Strip text between parentheses ( )")
    parser.add_argument('-c', '--curlies', action='store_true',
                        help="Strip text between curly brackets { }")
    parser.add_argument('-a', '--angles', action='store_true',
                        help="Strip text between angle brackets < >")
    parser.add_argument('-b', '--brackets', action='store_true',
                        help="Strip text between square brackets [ ]")
    parser.add_argument('-*', '--asterisks', action='store_true',
                        help="Strip text between asterisks * ... *")
    # NOTE(review): the help text says "any cue containing", but the
    # implementation only drops lines that *start* with the note character.
    parser.add_argument('-n', '--notes', action='store_true',
                        help="Strip any cue containing a music note (♪)")
    parser.add_argument('-e', '--extra', type=str, default="",
                        help="Extra characters to strip when matched")
    return parser


def _strip_chars(args):
    """Return the set of leading characters that mark a line for removal."""
    selected = ''.join(
        char for flag, char in STRIP_FLAGS.items() if getattr(args, flag)
    )
    return set(args.extra + selected)


def _confirm(prompt):
    """Ask a yes/no question on stdin; only an answer starting with 'y' is yes."""
    return input(prompt).lower().startswith('y')


def _parse_captions(srt):
    """Return one dict (index, time, text) per cue found in *srt*."""
    # SRT_REGEX needs a blank line after every cue; pad the input so the final
    # cue is not lost when the file ends without one.
    return [match.groupdict() for match in
            re.finditer(SRT_REGEX, srt + "\n\n", re.DOTALL | re.MULTILINE)]


def _check_indices(captions, force):
    """Convert cue indices to int and warn about gaps in the numbering.

    Unless *force* is set, the user is asked whether to cancel on each gap;
    answering yes exits the program with status 0.
    """
    expected = 1
    for data in captions:
        data['index'] = int(data['index'])
        log.info(data)
        if data['index'] != expected:
            log.warning(f"Index {expected} not found, currently at {data['index']}")
            # Cancel only on an explicit "y"; the default answer (N) continues.
            if not force and _confirm("Cancel stripping? [y/N]"):
                log.info('Cancelled')
                raise SystemExit(0)
        # Resynchronise on whatever index was actually seen.
        expected = data['index'] + 1


def _kept_lines(text, strip):
    """Return the lines of *text* that do not start with a character in *strip*."""
    kept = []
    for line in text.strip().splitlines():
        line = line.strip()
        if not line:
            # A whitespace-only line has no first character to inspect and
            # would break the cue if written back out, so drop it.
            continue
        # Only the first character is checked; a matching closing character
        # at the end of the line is not required.
        if line[0] in strip:
            log.info(f"Stripping: {line}")
            continue
        kept.append(line)
    return kept


def _strip_captions(captions, strip):
    """Return the cues that still have text after stripping, renumbered from 1."""
    clean = []
    for caption in captions:
        lines = _kept_lines(caption['text'], strip)
        if not lines:
            continue
        log.debug(f"Keeping: {caption}")
        caption['index'] = len(clean) + 1
        caption['text'] = '\n'.join(lines)
        clean.append(caption)
    return clean


def main(argv=None):
    """Run the SRT cleaner.

    Args:
        argv: Argument list to parse; ``None`` means ``sys.argv[1:]``.

    Raises:
        FileNotFoundError: If the input path is not an existing file.
        SystemExit: With status 0 when the user declines to continue, or as
            raised by argparse for invalid arguments.
    """
    args = _build_parser().parse_args(argv)
    if args.verbose:
        log.basicConfig(level=log.INFO)
        log.info(args)
        log.info("Running in verbose mode")

    strip = _strip_chars(args)
    log.info(f"Stripping: {strip}")

    input_path = Path(args.input)
    if not input_path.is_file():
        raise FileNotFoundError(f"File {input_path} not found")
    log.info(f"{input_path} found")

    # Default output: "name.srt" -> "name.clean.srt"
    output_path = args.output if args.output else \
        str(input_path.with_suffix('.clean')) + input_path.suffix
    log.info(f"Using output file: {output_path}")
    if not args.force and Path(output_path).exists():
        print("Use -f to ignore")
        if not _confirm(f"{output_path} exists, overwrite? [y/N] "):
            log.info("Cancelled")
            raise SystemExit(0)

    # Interpret SRT (read_text closes the file handle for us)
    captions = _parse_captions(input_path.read_text())
    log.info(f"Found {len(captions)} entries")

    # Sanity check
    _check_indices(captions, args.force)

    # Strip
    clean = _strip_captions(captions, strip)

    with open(output_path, 'w') as f:
        for caption in clean:
            print("{index}\n{time}\n{text}\n".format(**caption), file=f)


if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment