Skip to content

Instantly share code, notes, and snippets.

@larshb
Last active November 5, 2020 21:37
Show Gist options
  • Save larshb/8ec6657729662425dbc89232029eddfc to your computer and use it in GitHub Desktop.
Save larshb/8ec6657729662425dbc89232029eddfc to your computer and use it in GitHub Desktop.
#!/usr/bin/env python3
import re
from argparse import ArgumentParser
from pathlib import Path
import logging as log
# One SRT cue: a numeric index line, a timing line, then the text lines.
# A cue normally ends at a blank line; the final cue may instead end at the
# end of the input (with or without one trailing newline), so that a file
# lacking a closing blank line does not lose its last cue.
SRT_REGEX = (
    r"(?P<index>\d+)\n"
    r"(?P<time>[\d\:\ \-\>\,]+)\n"
    r"(?P<text>(?:.|\n)*?)"
    r"(?:\n\n|\n?\Z)"
)
# Command-line interface. The epilog shows an invocation that enables the
# bracket, curly, angle and parenthesis strippers with combined short flags.
EPILOG = f'''Example:
python3 {__file__} -i subtitle.srt -bcap'''
parser = ArgumentParser(epilog=EPILOG)
parser.add_argument('-i', '--input', required=True)
parser.add_argument('-o', '--output')
parser.add_argument('-f', '--force', action='store_true',
                    help="Force write")
parser.add_argument('-v', '--verbose', action='store_true',
                    help="Verbose logging")
# '--parantheses' is the original (misspelled) long flag. It is still
# accepted and still names the parsed attribute, so existing invocations and
# code reading `args.parantheses` keep working; '--parentheses' is the
# correctly spelled alternative.
parser.add_argument('-p', '--parentheses', '--parantheses',
                    dest='parantheses', action='store_true',
                    help="Strip text between parentheses ( )")
parser.add_argument('-c', '--curlies', action='store_true',
                    help="Strip text between curly brackets { }")
parser.add_argument('-a', '--angles', action='store_true',
                    help="Strip text between angle brackets < >")
parser.add_argument('-b', '--brackets', action='store_true',
                    help="Strip text between square brackets [ ]")
parser.add_argument('-*', '--asterisks', action='store_true',
                    help="Strip text between asterisks * ... *")
parser.add_argument('-n', '--notes', action='store_true',
                    help="Strip any cue containing a music note (♪)")
parser.add_argument('-e', '--extra', type=str, default="",
                    help="Extra characters to strip when matched")
# Parse the command line. With -v/--verbose, logging is configured at INFO
# level and the parsed options are echoed.
args = parser.parse_args()
if args.verbose:
    log.basicConfig(level=log.INFO)
    for message in (args, "Running in verbose mode"):
        log.info(message)
# Character that must open a line for it to be stripped, keyed by the name of
# the command-line flag that enables it.
openers = {
    'parantheses': '(',
    'curlies': '{',
    'angles': '<',
    'brackets': '[',
    'asterisks': '*',
    'notes': '♪',
}
options = vars(args)
enabled = ''.join(
    opener for flag, opener in openers.items() if options.get(flag)
)
# Every character passed via -e/--extra is added to the set individually.
strip = set(args.extra + enabled)
log.info(f"Stripping: {strip}")
# The input must be an existing regular file.
input_path = Path(args.input)
if input_path.is_file():
    log.info(f"{input_path} found")
else:
    raise FileNotFoundError(f"File {input_path} not found")

# Without -o, the output name is the input name with its extension replaced
# by ".clean" and the original extension appended again.
if args.output:
    output_path = args.output
else:
    output_path = str(input_path.with_suffix('.clean')) + input_path.suffix
log.info(f"Using output file: {output_path}")

# Ask before overwriting an existing output file unless -f/--force is set.
if not args.force and Path(output_path).exists():
    print("Use -f to ignore")
    answer = input(f"{output_path} exists, overwrite? [y/N] ")
    if answer.lower()[:1] != 'y':
        log.info("Cancelled")
        exit(0)
# Interpret SRT
# read_text() opens the file, reads it and closes it again, so no file handle
# is left open; the encoding is still the platform default.
srt = input_path.read_text()
# Each match becomes a dict with the keys 'index', 'time' and 'text'.
captions = [
    match.groupdict()
    for match in re.finditer(SRT_REGEX, srt, re.DOTALL | re.MULTILINE)
]
log.info(f"Found {len(captions)} entries")
# Sanity check
# Walk the parsed cues, convert each index to int and warn whenever the
# numbering does not continue from the previous cue (the first cue is
# expected to be number 1).
index = 1
for data in captions:
    data['index'] = int(data['index'])
    log.info(data)
    if data['index'] != index:
        log.warning(f"Index {index} not found, currently at {data['index']}")
        if not args.force:
            c = input("Cancel stripping? [y/N]")
            # The prompt defaults to "N" (keep going): only an explicit
            # "y..." answer cancels the run.
            if c.lower().startswith('y'):
                log.info('Cancelled')
                exit(0)
    # Continue counting from the index actually seen, so that one gap
    # produces one warning instead of one per remaining cue.
    index = data['index'] + 1
# Strip
# Build `clean`: the cues that still have text after removing every line
# whose first character is in `strip`, renumbered consecutively from 1.
clean = []
for caption in captions:
    # Check for encapsulations line by line
    kept = []
    for line in caption['text'].strip().splitlines():
        line = line.strip()
        if not line:
            # A whitespace-only line inside a cue has no first character to
            # test (indexing it would raise IndexError) and nothing to keep.
            continue
        # NOTE: only the opening character is tested; the line is dropped
        # whether or not it ends with the matching closing character.
        if line[0] in strip:
            log.info(f"Stripping: {line}")
            continue
        kept.append(line)
    if not kept:
        # Every line of this cue was stripped: drop the cue entirely.
        continue
    log.debug(f"Keeping: {caption}")
    caption['index'] = len(clean) + 1
    caption['text'] = '\n'.join(kept)
    clean.append(caption)
# Write the remaining cues: index line, timing line, text, then a blank line
# separating each cue from the next.
with open(output_path, 'w') as f:
    f.writelines(
        "{index}\n{time}\n{text}\n".format(**caption) + "\n"
        for caption in clean
    )
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment