Skip to content

Instantly share code, notes, and snippets.

@nklamann
Last active November 18, 2024 09:53
Show Gist options
  • Save nklamann/02dbf4d4a6add63ea67ec838aa9f7e84 to your computer and use it in GitHub Desktop.
Save nklamann/02dbf4d4a6add63ea67ec838aa9f7e84 to your computer and use it in GitHub Desktop.
Use the paperless ngx JSON-API to save data locally

This script is run at a client computer, developed under windows , but should run cross-platform. The local JSON files can be further analysed with tools like 'jq' and all kind of metainformation becomes accessible.

The script downloads data for every api endpoint.

If the variable GET_ALL_DATA in the source code is set to True, the script follows the next URL of the date recursively and collects the resutof all pages results into one result .

Speed and ressource utilization depend on a lot of foctars, subjectively I would call the thing fast.

The screen output of a test run looks like this

correspondents
--> .\data\correspondents.json (23 results)
document_types
--> .\data\document_types.json (10 results)
documents
--> .\data\documents.json date not complete without recursion
--> .\data\documents.json (25 results)
logs
--> .\data\logs.json (JSON has no element for results)
tags
--> .\data\tags.json (14 results)
saved_views
--> .\data\saved_views.json (2 results)
storage_paths
--> .\data\storage_paths.json (9 results)
tasks
--> .\data\tasks.json (JSON has no element for results)
users
--> .\data\users.json (2 results)
groups
--> .\data\groups.json (4 results)
mail_accounts
--> .\data\mail_accounts.json (2 results)
mail_rules
--> .\data\mail_rules.json (4 results)
share_links
--> .\data\share_links.json (0 results)
workflow_triggers
--> .\data\workflow_triggers.json (8 results)
workflow_actions
--> .\data\workflow_actions.json (8 results)
workflows
--> .\data\workflows.json (7 results)
custom_fields
--> .\data\custom_fields.json (2 results)
config
--> .\data\config.json (JSON has no element for results)
"""Write all JSON data from a paperless instance to local JSON files"""
import os
import json
import requests # pylint: disable=import-error
from pl_secrets import TK, URL
## pl_secrets.py example:
# TK="superuser-token"
# URL="<BASE_URL>/api/?format=json"
HEADERS = {"Authorization": "Token " + TK}
FILEROOT_DATA = os.path.join(".", "data")
GET_ALL_DATA = False
def get_api_dict() -> dict:
"""Get a dictionary of all api endpints
Returns:
api_d: the result of the XXX/api/?format=json call
"""
r = requests.get(URL, headers=HEADERS)
api_d = r.json()
return api_d
def concat_results(next_url: str, level: int = 1) -> dict:
"""Recursively amend the list of 'results' with the 'concat_results' from the following page
Args:
level (int): The level of recursion, for diagnostic purposes
next_url (str): the url in 'next'
Returns:
dict: the 'results' of this page and the 'concat_result' of all the follwing pages
"""
next_level = level + 1
r = requests.get(next_url, headers=HEADERS)
d = r.json()
concat_result = d["results"]
if d["next"]:
concat_result = concat_result + concat_results(
next_url=d["next"], level=next_level
)
# print (f"{next_level * '-' } {next_level} {len(concat_result )}")
return concat_result
def write_data(request_data: dict, api_name: str, get_all_data: bool = True) -> None:
"""Write the data from the api endpoint
Write out , if data is not complete (next url is not None)
Args:
request_data (dict): the result of the request to the api endpoint
api_name (str): the api name
get_all_data (bool): get all data revcursively
"""
try:
os.mkdir(path=FILEROOT_DATA)
except FileExistsError:
pass
rqd = request_data # we might need to manipulate the data, so we copy
fname_data = os.path.join(FILEROOT_DATA, f"{api_name}.json")
try:
if rqd["next"] and get_all_data: # follow the next url
rqd["results"] = rqd["results"] + concat_results(next_url=rqd["next"])
elif rqd["next"] and (not get_all_data): # results are NOT complete
print(f"--> {fname_data} date not complete without recursion")
elif not rqd["next"]: # results are
pass
except (KeyError, TypeError):
print(f"--> {fname_data} (JSON has no element for results)")
else:
print(f"--> {fname_data} ({len(rqd['results'])} results)")
with open(fname_data, mode="w", encoding="utf-8") as f:
json.dump(obj=rqd, fp=f, indent=2)
def process_api(api_name: str, api_url: str, get_all_data: bool = True):
"""Save the data for the api as json
Args:
api_name (str): name of the api according tpo top-level api call
api_url (str): URL of the api
get_all_data (bool): get all data revcursively
"""
r = requests.get(api_url, headers=HEADERS)
data_d = r.json()
print(api_name)
write_data(request_data=data_d, api_name=api_name, get_all_data=get_all_data)
if __name__ == "__main__":
apis = get_api_dict()
for name, url in apis.items():
process_api(api_name=name, api_url=url, get_all_data=GET_ALL_DATA)
TK="superuser-token"
URL="<BASE_URL>/api/?format=json"
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment