
@DerfJagged
Last active June 27, 2025 05:01
Finds duplicate files on your MediaWiki instance, deletes the newer upload, and recreates its page as a redirect to the first uploaded version. A read-only sketch of the two API calls involved follows, then the script itself.
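Before running the script, it can help to see what it would touch. The following is a minimal dry-run sketch of the same two API calls the script makes (list pages from Special:ListDuplicatedFiles, then fetch each page's duplicatefiles entries) without deleting or editing anything. EXAMPLE.org is a placeholder for your own wiki, exactly as in the script below.

#!/usr/bin/python3
# Dry-run sketch: report duplicate files without deleting or redirecting anything.
# EXAMPLE.org is a placeholder; point it at your own wiki's api.php.
import requests

URL = "https://EXAMPLE.org/wiki/api.php"
S = requests.Session()

# Step 1: pages listed on Special:ListDuplicatedFiles
listing = S.get(url=URL, params={
    "action": "query",
    "list": "querypage",
    "qppage": "ListDuplicatedFiles",
    "qplimit": "50",
    "format": "json",
}).json()

titles = [r["title"] for r in listing["query"]["querypage"]["results"]]
if not titles:
    print("No pages with duplicates found.")
else:
    # Step 2: for each listed file, fetch the files that share its content
    info = S.get(url=URL, params={
        "action": "query",
        "titles": "|".join(titles),
        "prop": "duplicatefiles",
        "format": "json",
    }).json()
    for page in info["query"]["pages"].values():
        for dup in page.get("duplicatefiles", []):
            print(f"{page['title']} duplicates File:{dup['name']} (uploaded {dup['timestamp']})")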
#!/usr/bin/python3
# By Derf Jagged
# Update the two EXAMPLE entries, BOT_USERNAME_HERE, and PASSWORD_HERE

import requests
import json
import mwclient
from datetime import datetime


def parse_iso_timestamp(ts_str):
    return datetime.strptime(ts_str, "%Y-%m-%dT%H:%M:%SZ")


while True:
    # Set up your MediaWiki API connection
    site = mwclient.Site('EXAMPLE.org', path='/wiki/')  # Replace with your wiki's URL

    # Authenticate if required
    site.login(username='BOT_USERNAME_HERE', password='PASSWORD_HERE')  # Replace with your creds

    S = requests.Session()
    URL = "https://EXAMPLE.org/wiki/api.php"  # Replace with your wiki API link

    # Ask the wiki for pages listed on Special:ListDuplicatedFiles
    PARAMS = {
        "action": "query",
        "list": "querypage",
        "qppage": "ListDuplicatedFiles",
        "qplimit": "50",  # 500
        "format": "json"
    }

    R = S.get(url=URL, params=PARAMS)
    pages_with_duplicates_json = R.json()

    # Debug:
    # print(pages_with_duplicates_json)
    # input("Press Enter to continue...")

    # Build a pipe-separated list of titles, keeping only PDF files
    title_list = ""
    for result in pages_with_duplicates_json['query']['querypage']['results']:
        title = result['title']
        if ".pdf" not in title:
            continue
        title_list += result['title'] + "|"
    title_list = title_list[:-1]

    if title_list == "":
        print("No pages with duplicates found.")
        quit()

    # Debug:
    # print("\n" + title_list + "\n")

    # Fetch the duplicate-file info for each of those titles
    PARAMS = {
        "action": "query",
        "titles": title_list,
        "prop": "duplicatefiles",
        "format": "json"
    }

    R = S.get(url=URL, params=PARAMS)
    data = R.json()

    # Debug:
    # print(data)
    # input("Press Enter to continue...")

    # Loop through the JSON data and process duplicate files
    for page_id, page_data in data['query']['pages'].items():
        original_title = page_data['title']
        original_page = site.pages[original_title]

        if not original_page.exists:
            print(f"Original file {original_title} does not exist.")
            continue

        # Get timestamp of the original file page's first revision
        original_info = next(original_page.revisions(dir='newer', api_chunk_size=1))
        original_timestamp = datetime(*original_info['timestamp'][:6])

        try:
            duplicate_files = page_data['duplicatefiles']
        except KeyError:
            print("No more duplicate files found")
            break

        for duplicate_file in duplicate_files:
            duplicate_name = "File:" + duplicate_file['name']

            # Get information about the duplicate file
            duplicate_page = site.pages[duplicate_name]

            # Check if the duplicate file exists
            if duplicate_page.exists:
                duplicate_timestamp = parse_iso_timestamp(duplicate_file['timestamp'])
                if duplicate_timestamp < original_timestamp:
                    print(f"{duplicate_name} is older than {original_title}.")
                    # The duplicate is the older upload, so delete the newer page...
                    original_page.delete(reason='Duplicate file - redirecting to original')
                    # ...and recreate it as a redirect to the older upload
                    redirect_text = f"#REDIRECT [[{duplicate_name}]]"
                    original_page.save(redirect_text, summary='Redirecting duplicate file to original')
                    print(f"Redirected {original_title} to {duplicate_name} and deleted the duplicate.")
                    break
                else:
                    # The duplicate is the newer upload, so delete it...
                    duplicate_page.delete(reason='Duplicate file - redirecting to original')
                    # ...and recreate it as a redirect to the original file
                    redirect_text = f"#REDIRECT [[{original_title}]]"
                    duplicate_page.save(redirect_text, summary='Redirecting duplicate file to original')
                    print(f"Redirected {duplicate_name} to {original_title} and deleted the duplicate.")
            else:
                print(f"{duplicate_name} doesn't exist.")

    # input("Press Enter to continue...")