Last active
January 24, 2025 19:20
-
-
Save gabrielfeo/10e2f819e8b5c98fe3cf3534a451f6c6 to your computer and use it in GitHub Desktop.
A script to download assets from an unintuitive University Of Coimbra file-hosting website.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
''' | |
Downloads requested assets based on a JSON file that lists a directory tree. | |
USAGE: | |
./download-deec-assets.py --json-file deec-assets.json --dir-name '[E2]Eletronica_2' | |
# or | |
./download-deec-assets.py --json-file deec-assets.json --dir-id '0fx80' | |
To obtain the JSON file, watch requests with the browser web inspector when first opening | |
https://my.deec.uc.pt/media/material. **Make sure to open it right before running the script** | |
(otherwise the server responds to API requests with HTTP 504 for whatever reason). | |
The JSON file is formatted as in this example (beginning of the file):
```json | |
{ | |
"fileMap": { | |
"55964": { | |
"id": "55964", | |
"name": "Aulas", | |
"modDate": "2024-02-27T15:35:56.898Z", | |
"isDir": true, | |
"childrenCount": 2, | |
"childrenIds": [ | |
"55al4", | |
"55al5" | |
], | |
"parentId": "f9qs" | |
}, | |
"55974": { | |
"id": "55974", | |
"name": "testes", | |
"modDate": "2024-02-27T15:39:59.170Z", | |
"isDir": true, | |
"childrenCount": 1, | |
"childrenIds": [ | |
"55aog" | |
], | |
"parentId": "fcpv" | |
}, | |
"55975": { | |
"id": "55975", | |
"name": "aulas", | |
"modDate": "2024-02-27T15:40:10.882Z", | |
"isDir": true, | |
"childrenCount": 5, | |
"childrenIds": [ | |
"55aob", | |
"55aof", | |
"55aoc", | |
"55aod", | |
"55aoe" | |
], | |
"parentId": "fcyg" | |
}, | |
``` | |
Prompt this script was based on (for future reference): | |
The script requires a `dir` arg, which is the name of a directory. It then parses | |
for a single element in `fileMap` that has the same `name` value as the `dir` arg. | |
Then, it recurses all the children of that element, based on the `childrenIds` property, | |
and downloads each file that is not a directory, as indicated by `isDir` of each element. | |
Files are downloaded with a `curl -LO https://my.deec.uc.pt/api/assets/explorer/<id>` command,
where `<id>` is the `id` property of each element, started via `subprocess`. Files are | |
downloaded concurrently. | |
''' | |
import json | |
import sys | |
import subprocess | |
import argparse | |
from pathlib import Path | |
# Base URL of the asset endpoint; a file's ID is appended to it to form the
# download URL.
ASSETS_URL = 'https://my.deec.uc.pt/api/assets/explorer'

# Handles of the `curl` processes started so far. `download_assets` appends to
# this list and the script entry point waits on every entry before exiting.
downloads = []
def download_file(element, output_dir):
    '''Start a background `curl` process that downloads a single file.

    Args:
        element: A `fileMap` entry. Its `id` selects the remote asset and its
            `name` determines the local file name.
        output_dir: `pathlib.Path` of the directory the file is saved in.

    Returns:
        The `subprocess.Popen` handle of the started `curl` process. The
        download runs in the background; the caller must wait on the handle.
    '''
    url = f"{ASSETS_URL}/{element['id']}"

    # The name comes from server-provided JSON. Keep only its final path
    # component so that a name such as '../x' or '/etc/x' cannot make curl
    # write outside of output_dir.
    file_name = Path(element['name']).name
    if file_name in ('', '.', '..'):
        # Nothing usable is left of the name; fall back to the asset ID.
        file_name = str(element['id'])

    print(f"Downloading file: {element['name']} (ID: {element['id']})...")
    # --fail makes curl exit non-zero on an HTTP error instead of saving the
    # error page under the asset's file name.
    return subprocess.Popen(
        ["curl", "-sSL", "--fail", url, "-o", str(output_dir / file_name)]
    )
def require_single_directory(file_map, dir_name):
    '''Return the only directory entry named `dir_name`, or exit the script.

    Args:
        file_map: The `fileMap` dict of the JSON file, mapping IDs to entries.
        dir_name: Exact name of the directory to look for.

    Returns:
        The single `fileMap` entry that is a directory and has that name.

    Raises:
        SystemExit: With status 1, after printing an error message, when no
            directory or more than one directory has that name.
    '''
    matching_dirs = [
        element for element in file_map.values()
        if element['name'] == dir_name and is_dir(element)
    ]
    if len(matching_dirs) == 1:
        return matching_dirs[0]

    if not matching_dirs:
        print(f"Error: Found no directory named '{dir_name}' in the file tree. Pass the exact name of the directory, or pass `--dir-id` instead. See --help.")
    else:
        print(f"Error: Found more than one directory named '{dir_name}' in the file tree. You may pass `--dir-id` instead. See --help.")
    sys.exit(1)
def is_dir(element):
    '''Return whether a `fileMap` entry describes a directory.

    Args:
        element: A `fileMap` entry (a dict).

    Returns:
        `True` when the entry has a truthy `isDir` value; `False` when the key
        is missing or its value is falsy (including `None`).
    '''
    return bool(element.get('isDir'))
def download_assets(file_map, dir_id, output_dir):
    '''Recursively start downloads for every file below a directory.

    Files found in nested directories are all saved directly in `output_dir`.
    Each started `curl` process handle is appended to the module-level
    `downloads` list; this function does not wait for the downloads.

    Args:
        file_map: The `fileMap` dict of the JSON file, mapping IDs to entries.
        dir_id: ID of the directory whose contents are downloaded.
        output_dir: `pathlib.Path` of the directory the files are saved in.
    '''
    # First entry that has the requested ID and is a directory, if any.
    directory = next(
        (
            element for element in file_map.values()
            if element['id'] == dir_id and is_dir(element)
        ),
        None,
    )
    if directory is None:
        # Previously this case did nothing at all, which hid a mistyped ID.
        print(f"Warning: Found no directory with ID '{dir_id}' in the file tree. Nothing was downloaded for it.")
        return

    print(f"Traversing directory: {directory['name']} (ID: {directory['id']})")
    # Tolerate a directory entry without a children list.
    for child_id in directory.get('childrenIds') or []:
        child = file_map.get(child_id)
        if child is None:
            # Skip instead of raising KeyError, which would abort the script
            # while already-started downloads are still running.
            print(f"Warning: Skipping child with ID '{child_id}' of directory '{directory['name']}': it is not listed in the file tree.")
            continue
        if is_dir(child):
            download_assets(file_map, child['id'], output_dir)
        else:
            downloads.append(download_file(child, output_dir))
if __name__ == "__main__":
    # Command-line entry point: parse the arguments, resolve the directory to
    # download, start all downloads and wait for them to finish.
    parser = argparse.ArgumentParser(description='Download requested assets based on a JSON file that lists a directory tree.')
    parser.add_argument('--output-dir', type=Path, default=Path('downloads'), help='Output directory to save the files')
    parser.add_argument('--json-file', type=str, required=True, help='Path to the JSON file')
    parser.add_argument('--dir-name', type=str, help='Pass either this or --dir-id. Name of the directory to download')
    parser.add_argument('--dir-id', type=str, help="Pass either this or --dir-name. ID of the directory to download. To obtain the ID of the directory, open the browser developer tools, focus on the directory's icon and copy the value of its `data-chonky-file-id` attribute from the HTML. Note that the directory may consist of multiple nested divs, so make sure to look for the ID in all of them.")
    args = parser.parse_args()

    # Exactly one of the two selectors must be given. Check this before
    # reading the JSON file or creating the output directory.
    if bool(args.dir_name) == bool(args.dir_id):
        print("Error: Pass exactly one of `--dir-name` or `--dir-id`. See --help.")
        sys.exit(1)

    # JSON is UTF-8 by specification; do not depend on the platform default
    # encoding (directory names may contain non-ASCII characters).
    with open(args.json_file, 'r', encoding='utf-8') as f:
        data = json.load(f)

    # parents=True so that a nested --output-dir such as 'a/b' also works.
    args.output_dir.mkdir(parents=True, exist_ok=True)
    dir_id = args.dir_id or require_single_directory(data['fileMap'], args.dir_name)['id']
    download_assets(data['fileMap'], dir_id, args.output_dir)

    # Wait for every curl process, then report failures through the exit
    # status instead of silently ignoring them.
    exit_codes = [download.wait() for download in downloads]
    failed_count = sum(1 for code in exit_codes if code != 0)
    if failed_count:
        print(f"Error: {failed_count} of {len(exit_codes)} download(s) failed.")
        sys.exit(1)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment