A script to organize and clean HTML files generated by the SingleFile browser extension.
#!/usr/bin/env python
import argparse
import datetime
import re
import os

import yaml

from pathlib import Path

# TODO: Filter out malformed pages
# TODO: Whitelist titles to never remove
# TODO: better blacklist pattern matching to not accidentally delete pages
BLACKLIST = [
    'Youtube',
    'DDoS-Guard',
    'No Title',
    'Sign In',
    'Element _ _',
    'GitHub',
    'Discord',
    'Download',
    'Google Search',
    "'s gists",
    'Gmail',
]

def argparser():
    parser = argparse.ArgumentParser()
    parser.add_argument('-dlp', '--html_download_path', type=Path, default=os.getcwd())
    parser.add_argument('-bl', '--blacklist', nargs='+', default=BLACKLIST)
    parser.add_argument('-rf', '--results_file', type=Path, default=Path('single_page_results.yml'))
    parser.add_argument('-r', '--RUN', action='store_true', default=False)
    parser.add_argument('-urf', '--USE_RESULTS_FILE', action='store_true', default=False)
    return parser
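
# Example invocations (illustrative; the script name below is hypothetical,
# since a gist file's real name is its hash):
#   python singlefile_cleaner.py -dlp ~/Downloads            # dry run, writes the results file
#   python singlefile_cleaner.py -dlp ~/Downloads -urf -r    # load the results file and delete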
def parse_html_folder(path, blacklist=None):
    # glob all html files
    html_files = path.glob('*.html')
    # regex to extract title and time
    title = r"(.*)"
    time = r"(\(\d{1,2}_\d{1,2}_\d{4} \d{1,2}_\d{1,2}_\d{1,2} [AP]M\))"
    optional_duplicate = r"(\(\d\))?"
    pattern = rf'{title} {time}{optional_duplicate}\.html'
    html_name_regex = re.compile(pattern)
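    # A matching SingleFile name looks like this (illustrative example):
    #   "Some Page Title (3_26_2023 10_04_59 PM)(1).html"
    # capturing the title, the parenthesized timestamp, and the optional
    # "(1)"-style duplicate suffix.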
    # clean up files
    duplicates = {}
    to_remove = set()
    seen = set()
    not_yet_duplicated = {}
    for html_file in html_files:
        matched = html_name_regex.search(html_file.name)
        if matched is None:
            continue
        title, time, duplicate = matched.groups()
        if duplicate is not None:
            duplicate = int(duplicate[1:-1])
        else:
            duplicate = 0
        time = datetime.datetime.strptime(time[1:-1], '%m_%d_%Y %I_%M_%S %p')
        # blacklist
        if blacklist is not None:
            for blacklisted in blacklist:
                if blacklisted.lower() in title.lower():
                    to_remove.add(html_file)
                    break
        # duplicates
        if title in seen:
            if title not in duplicates:
                duplicates[title] = {}
            if time not in duplicates[title]:
                duplicates[title][time] = []
            if title in not_yet_duplicated:
                t, d, hf = not_yet_duplicated.pop(title)
                if t not in duplicates[title]:
                    duplicates[title][t] = []
                duplicates[title][t].append((d, hf))
            duplicates[title][time].append((duplicate, html_file))
        else:
            not_yet_duplicated[title] = time, duplicate, html_file
            seen.add(title)
    # flatten the nested duplicates and keep only the earliest copy of each title
    flattened = flatten_duplicates(duplicates)
    for title, files in flattened.items():
        files.sort(key=lambda x: x[0])
        to_remove.update([f for _, f in files[1:]])
    return to_remove

def flatten_duplicates(d):
    """Duplicates are indexed by title then time. This function flattens the
    dictionary so that it is indexed by title only.
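
    Example (illustrative shapes only):
        {"T": {t1: [(0, f1)], t2: [(1, f2)]}}  ->  {"T": [(t1, f1), (t2, f2)]}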
""" | |
    flattened = {}
    for title, times in d.items():
        for time, duplicates in times.items():
            for _, html_file in duplicates:
                if title not in flattened:
                    flattened[title] = []
                flattened[title].append((time, html_file))
    return flattened

def init_yaml():
    # yaml.dump serializes pathlib.Path objects with python/object/apply tags,
    # so register constructors that let full_load rebuild them as Paths
    def path_constructor(loader, node):
        return Path(*loader.construct_sequence(node))
    yaml.add_constructor('tag:yaml.org,2002:python/object/apply:pathlib.WindowsPath', path_constructor)
    yaml.add_constructor('tag:yaml.org,2002:python/object/apply:pathlib.PosixPath', path_constructor)

def main():
    init_yaml()
    parser = argparser()
    args = parser.parse_args()
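    # Assumed two-phase workflow: a run without -r only records what would be
    # removed (dumped to the results file below), and a later run with -urf -r
    # replays that plan and actually deletes the files.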
    if args.results_file.exists() and args.USE_RESULTS_FILE:
        print(f'Loading results from {args.results_file} \U0001F98C')
        with open(args.results_file, 'r') as f:
            to_remove = yaml.full_load(f)
        LOADED_FROM_FILE = True
    else:
        to_remove = parse_html_folder(args.html_download_path, blacklist=args.blacklist)
        LOADED_FROM_FILE = False
    count = len(to_remove)
    if count == 0:
        print('No files to remove. Exiting... \U0001F98B')
        return
    else:
        print(f'Found {count} files to remove. \U0001F98E')
    if not args.RUN:
        run = input('Run? [y/N] ')
    if args.RUN or run.lower() == 'y':
        print('Removing files now...')
        # remove files
        for html_file in to_remove:
            html_file.unlink()
        if args.results_file.exists():
            args.results_file.unlink()
        print('Done!\tGoodbye.\t\t\t\U0001F98B')
    else:
        # dump results to file
        if not LOADED_FROM_FILE:
            with open(args.results_file, 'w') as f:
                yaml.dump(to_remove, f)
            print(f'Wrote results to {args.results_file} \U0001F98B')
        print('Dry run complete!\tGoodbye.')

if __name__ == '__main__':
    main()