
@DerfJagged
Last active June 27, 2025 05:01
Finds duplicate files on your MediaWiki instance, deletes the newer upload, and recreates its page as a redirect to the first uploaded version. A read-only sketch of the two API calls involved follows, then the script itself.
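Before running the script, it can help to see what it would touch. The following is a minimal dry-run sketch of the same two API calls the script makes (list pages from Special:ListDuplicatedFiles, then fetch each page's duplicatefiles entries) without deleting or editing anything. EXAMPLE.org is a placeholder for your own wiki, exactly as in the script below.

#!/usr/bin/python3
# Dry-run sketch: report duplicate files without deleting or redirecting anything.
# EXAMPLE.org is a placeholder; point it at your own wiki's api.php.
import requests

URL = "https://EXAMPLE.org/wiki/api.php"
S = requests.Session()

# Step 1: pages listed on Special:ListDuplicatedFiles
listing = S.get(url=URL, params={
    "action": "query",
    "list": "querypage",
    "qppage": "ListDuplicatedFiles",
    "qplimit": "50",
    "format": "json",
}).json()

titles = [r["title"] for r in listing["query"]["querypage"]["results"]]
if not titles:
    print("No pages with duplicates found.")
else:
    # Step 2: for each listed file, fetch the files that share its content
    info = S.get(url=URL, params={
        "action": "query",
        "titles": "|".join(titles),
        "prop": "duplicatefiles",
        "format": "json",
    }).json()
    for page in info["query"]["pages"].values():
        for dup in page.get("duplicatefiles", []):
            print(f"{page['title']} duplicates File:{dup['name']} (uploaded {dup['timestamp']})")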
#!/usr/bin/python3
# By Derf Jagged
# Update the two EXAMPLE entries, BOT_USERNAME_HERE, and PASSWORD_HERE

import requests
import json
import mwclient
from datetime import datetime


def parse_iso_timestamp(ts_str):
    return datetime.strptime(ts_str, "%Y-%m-%dT%H:%M:%SZ")


while True:
    # Set up your MediaWiki API connection
    site = mwclient.Site('EXAMPLE.org', path='/wiki/')  # Replace with your wiki's URL

    # Authenticate if required
    site.login(username='BOT_USERNAME_HERE', password='PASSWORD_HERE')  # Replace with your creds

    S = requests.Session()
    URL = "https://EXAMPLE.org/wiki/api.php"  # Replace with your wiki API link

    # Ask the wiki for pages listed on Special:ListDuplicatedFiles
    PARAMS = {
        "action": "query",
        "list": "querypage",
        "qppage": "ListDuplicatedFiles",
        "qplimit": "50",  # 500
        "format": "json"
    }

    R = S.get(url=URL, params=PARAMS)
    pages_with_duplicates_json = R.json()

    # Debug:
    # print(pages_with_duplicates_json)
    # input("Press Enter to continue...")

    # Build a pipe-separated list of titles, keeping only PDF files
    title_list = ""
    for result in pages_with_duplicates_json['query']['querypage']['results']:
        title = result['title']
        if ".pdf" not in title:
            continue
        title_list += result['title'] + "|"
    title_list = title_list[:-1]

    if title_list == "":
        print("No pages with duplicates found.")
        quit()

    # Debug:
    # print("\n" + title_list + "\n")

    # Fetch the duplicate-file info for each of those titles
    PARAMS = {
        "action": "query",
        "titles": title_list,
        "prop": "duplicatefiles",
        "format": "json"
    }

    R = S.get(url=URL, params=PARAMS)
    data = R.json()

    # Debug:
    # print(data)
    # input("Press Enter to continue...")

    # Loop through the JSON data and process duplicate files
    for page_id, page_data in data['query']['pages'].items():
        original_title = page_data['title']
        original_page = site.pages[original_title]

        if not original_page.exists:
            print(f"Original file {original_title} does not exist.")
            continue

        # Get timestamp of the original file page's first revision
        original_info = next(original_page.revisions(dir='newer', api_chunk_size=1))
        original_timestamp = datetime(*original_info['timestamp'][:6])

        try:
            duplicate_files = page_data['duplicatefiles']
        except KeyError:
            print("No more duplicate files found")
            break

        for duplicate_file in duplicate_files:
            duplicate_name = "File:" + duplicate_file['name']

            # Get information about the duplicate file
            duplicate_page = site.pages[duplicate_name]

            # Check if the duplicate file exists
            if duplicate_page.exists:
                duplicate_timestamp = parse_iso_timestamp(duplicate_file['timestamp'])
                if duplicate_timestamp < original_timestamp:
                    print(f"{duplicate_name} is older than {original_title}.")
                    # The duplicate is the older upload, so delete the newer page...
                    original_page.delete(reason='Duplicate file - redirecting to original')
                    # ...and recreate it as a redirect to the older upload
                    redirect_text = f"#REDIRECT [[{duplicate_name}]]"
                    original_page.save(redirect_text, summary='Redirecting duplicate file to original')
                    print(f"Redirected {original_title} to {duplicate_name} and deleted the duplicate.")
                    break
                else:
                    # The duplicate is the newer upload, so delete it...
                    duplicate_page.delete(reason='Duplicate file - redirecting to original')
                    # ...and recreate it as a redirect to the original file
                    redirect_text = f"#REDIRECT [[{original_title}]]"
                    duplicate_page.save(redirect_text, summary='Redirecting duplicate file to original')
                    print(f"Redirected {duplicate_name} to {original_title} and deleted the duplicate.")
            else:
                print(f"{duplicate_name} doesn't exist.")

    # input("Press Enter to continue...")