Last active
June 27, 2025 05:01
-
-
Save DerfJagged/7e581470fed0f54bc3f706045dc20068 to your computer and use it in GitHub Desktop.
Finds duplicate files on your MediaWiki instance, deletes the newer one, and adds a redirect on the new page to the first uploaded version.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/python3
# By Derf Jagged
# Update the two EXAMPLE entries, BOT_USERNAME_HERE, and PASSWORD_HERE
import json
import sys
from datetime import datetime

import mwclient
import requests
def parse_iso_timestamp(ts_str: str) -> datetime:
    """Parse a ``YYYY-MM-DDTHH:MM:SSZ`` timestamp string into a ``datetime``.

    The trailing ``Z`` is matched as a literal character, so the returned
    value is naive (it carries no ``tzinfo``). That keeps it directly
    comparable with other naive ``datetime`` objects.

    Args:
        ts_str: Timestamp text such as ``"2025-06-27T05:01:00Z"``.

    Returns:
        The parsed naive ``datetime``.

    Raises:
        ValueError: If ``ts_str`` does not match the expected format.
    """
    return datetime.strptime(ts_str, "%Y-%m-%dT%H:%M:%SZ")
# Duplicate-file cleanup driver.
#
# Repeatedly asks the wiki for its list of duplicated files, keeps the PDF
# titles, and for every group of duplicates deletes the newer page(s) and
# leaves a redirect to the oldest one. The script exits when no PDF duplicates
# are listed any more, or when a whole pass could not change anything.

# Set up your MediaWiki API connection. Connecting and logging in once, before
# the loop, avoids a fresh login on every pass.
site = mwclient.Site('EXAMPLE.org', path='/wiki/')  # Replace with your wiki's URL
# Authenticate if required
site.login(username='BOT_USERNAME_HERE', password='PASSWORD_HERE')  # Replace with your creds

S = requests.Session()
URL = "https://EXAMPLE.org/wiki/api.php"  # Replace with your wiki API link

while True:
    # Step 1: fetch one batch of titles that have duplicates.
    PARAMS = {
        "action": "query",
        "list": "querypage",
        "qppage": "ListDuplicatedFiles",
        "qplimit": "50",  # 500
        "format": "json",
    }
    R = S.get(url=URL, params=PARAMS)
    # Fail loudly on an HTTP error instead of a confusing JSON decode error.
    R.raise_for_status()
    pages_with_duplicates_json = R.json()

    # Only PDF files are processed; every other listed title is ignored.
    title_list = "|".join(
        result['title']
        for result in pages_with_duplicates_json['query']['querypage']['results']
        if ".pdf" in result['title']
    )
    if not title_list:
        print("No pages with duplicates found.")
        sys.exit()

    # Step 2: ask which files duplicate each of those titles.
    PARAMS = {
        "action": "query",
        "titles": title_list,
        "prop": "duplicatefiles",
        # The default limit is small and shared across all titles in the
        # request, which would leave most pages without results.
        "dflimit": "max",
        "format": "json",
    }
    R = S.get(url=URL, params=PARAMS)
    R.raise_for_status()
    data = R.json()

    # Tracks whether this pass deleted/redirected anything. A pass that changes
    # nothing would be repeated identically forever, so it ends the script.
    made_changes = False

    # Step 3: loop through the JSON data and process duplicate files.
    for page_data in data['query']['pages'].values():
        original_title = page_data['title']
        original_page = site.pages[original_title]
        if not original_page.exists:
            print(f"Original file {original_title} does not exist.")
            continue

        # A page without duplicate data must not abort the rest of the batch;
        # skip it and carry on with the remaining pages.
        duplicate_files = page_data.get('duplicatefiles')
        if not duplicate_files:
            continue

        # Timestamp of the first revision of the original file page.
        original_info = next(original_page.revisions(dir='newer', api_chunk_size=1))
        original_timestamp = datetime(*original_info['timestamp'][:6])

        for duplicate_file in duplicate_files:
            duplicate_name = "File:" + duplicate_file['name']
            duplicate_page = site.pages[duplicate_name]

            if not duplicate_page.exists:
                print(f"{duplicate_name} doesn't exist.")
                continue

            duplicate_timestamp = parse_iso_timestamp(duplicate_file['timestamp'])
            if duplicate_timestamp < original_timestamp:
                print(f"{duplicate_name} is older than {original_title}.")
                # The "original" is actually the newer upload: delete it and
                # turn its page into a redirect to the older duplicate.
                original_page.delete(reason='Duplicate file - redirecting to original')
                redirect_text = f"#REDIRECT [[{duplicate_name}]]"
                original_page.save(redirect_text, summary='Redirecting duplicate file to original')
                print(f"Redirected {original_title} to {duplicate_name} and deleted the duplicate.")
                made_changes = True
                # The original page is gone, so its remaining duplicates are
                # left for a later pass.
                break

            # The duplicate is the newer upload: delete it and turn its page
            # into a redirect to the original file.
            duplicate_page.delete(reason='Duplicate file - redirecting to original')
            redirect_text = f"#REDIRECT [[{original_title}]]"
            duplicate_page.save(redirect_text, summary='Redirecting duplicate file to original')
            print(f"Redirected {duplicate_name} to {original_title} and deleted the duplicate.")
            made_changes = True

    if not made_changes:
        # Nothing in this batch could be processed (e.g. the listing is stale),
        # so another pass would only repeat the same requests.
        print("No more duplicate files found")
        sys.exit()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment