Analyse @nf-core AWS s3 storage
aws s3 ls s3://nf-core-awsmegatests --recursive > files.txt
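Each line of files.txt from aws s3 ls --recursive has the form: date, time, size in bytes, then the object key. The script below relies on this layout, slicing the first 19 characters as the timestamp and matching the size column with a regex. An illustrative line (the key is a made-up example, not a real bucket path):

2022-02-14 17:45:22  205718877 rnaseq/results-<sha>/multiqc/multiqc_report.html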
#!/usr/bin/env python

import re
from collections import defaultdict

import requests
from rich import print
from rich.table import Table
from rich.console import Console

# record=True so that the console output can be saved to a file at the end
console = Console(record=True)
# Get pipelines
pipelines_r = requests.get("https://nf-co.re/pipelines.json")
pipelines = {}
for pipeline in pipelines_r.json()["remote_workflows"]:
    pipelines[pipeline["name"]] = pipeline
pipeline_inputs = defaultdict(int)
pipeline_results_expected = defaultdict(int)
pipeline_results_expected_per_release = defaultdict(lambda: defaultdict(int))
pipeline_results_unexpected = defaultdict(int)
pipeline_results_unexpected_paths = set()
work_dir = defaultdict(int)
pipeline_other = defaultdict(int)
pipeline_other_paths_sizes = defaultdict(lambda: defaultdict(int))
other_toplevel = defaultdict(int)
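# Assumed bucket layout, inferred from the checks below (not documented elsewhere):
#   <pipeline>/input*/...         test input data
#   <pipeline>/results-<sha>/...  results for one pipeline run, keyed by commit SHA
#   work/<subdir>/...             Nextflow work directory
#   anything else                 counted as "other"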
with open("files.txt") as fh:
    for line in fh:
        timestamp = line[:19]  # 2022-02-14 17:45:22
        filesize_col = re.search(r"\s+\d+", line[19:]).group()  # 205718877
        filesize = int(filesize_col.strip())  # 205718877
        path = line[len(filesize_col) + 20 :]
        if filesize == 0:
            continue  # skip zero-byte keys (directory placeholders)
        root, dir1, *_ = path.split("/")
        if root in pipelines:
            if "input" in dir1:
                pipeline_inputs[root] += filesize
            elif "results-" in dir1:
                # Known release if the SHA suffix matches a release tag commit
                if dir1[8:] in [
                    release["tag_sha"] for release in pipelines[root]["releases"]
                ]:
                    pipeline_results_expected[root] += filesize
                    pipeline_results_expected_per_release[root][dir1] += filesize
                else:
                    pipeline_results_unexpected[root] += filesize
                    pipeline_results_unexpected_paths.add(f"{root}/{dir1}")
            else:
                pipeline_other[root] += filesize
                pipeline_other_paths_sizes[root][dir1] += filesize
        else:
            if root == "work":
                work_dir[dir1] += filesize
            else:
                other_toplevel[root] += filesize
# Average results size per release, plus release counts, for the summary table
pipeline_results_expected_avg_size = {}
pipeline_results_releases = {}
pipeline_results_releases_total = 0
for pipeline, releases in pipeline_results_expected_per_release.items():
    avg_size = sum(releases.values()) / len(releases)
    pipeline_results_expected_avg_size[pipeline] = f"{avg_size/1000000000:.2f}GB"
    pipeline_results_releases[pipeline] = len(releases)
    pipeline_results_releases_total += len(releases)
def print_table(
    title,
    data,
    extra_col=None,
    extra_col_title=None,
    extra_col_footer="",
    extra_extra_col=None,
    extra_extra_col_title=None,
    extra_extra_col_footer="",
):
    """Print a rich table of per-pipeline file sizes, sorted largest first.

    Returns the total size in bytes so that callers can build a grand total.
    """
    table = Table(title=title)
    table.add_column("Pipeline", style="magenta")
    table.add_column("File size", justify="right", style="green")
    if extra_col:
        table.add_column(extra_col_title, justify="right", style="yellow")
    if extra_extra_col:
        table.add_column(extra_extra_col_title, justify="right", style="cyan")
    total = 0
    for k, v in sorted(data.items(), key=lambda x: x[1], reverse=True):
        row = [k, f"{(v/1000000000):.2f}GB"]
        if extra_col:
            row.append(extra_col.get(k))
        if extra_extra_col:
            row.append(str(extra_extra_col.get(k)))
        table.add_row(*row)
        total += v
    table.add_section()
    footer_row = [f"{len(data)} Pipelines", f"{(total/1000000000):.2f}GB"]
    if extra_col:
        footer_row.append(str(extra_col_footer))
    if extra_extra_col:
        footer_row.append(str(extra_extra_col_footer))
    table.add_row(*footer_row, style="bold")
    console.print(table)
    console.print("\n\n")
    return total
# Print the summary tables, accumulating a grand total as we go
grand_total = 0
grand_total += print_table("Pipeline Input Data", pipeline_inputs)
grand_total += print_table(
    "Results from Release Commits",
    pipeline_results_expected,
    pipeline_results_expected_avg_size,
    "Avg size per release",
    "",
    pipeline_results_releases,
    "Number of releases",
    pipeline_results_releases_total,
)
grand_total += print_table(
    "Results from unexpected commits", pipeline_results_unexpected
)
grand_total += print_table("Unexpected stuff in pipeline directories", pipeline_other)
for pipeline, size in pipeline_other.items():
    if size > 10000000000:  # break down pipelines with >10GB of unexpected data
        print_table(
            f"Unexpected stuff in {pipeline}",
            pipeline_other_paths_sizes[pipeline],
        )
grand_total += print_table("Work directory", work_dir)
grand_total += print_table("Other random pipeline data", other_toplevel)
console.print(f"GRAND TOTAL: {grand_total/1000000000:.2f}GB")

with open("unexpected_results.txt", "w") as fh:
    fh.write("\n".join(sorted(pipeline_results_unexpected_paths)))

# console.save_html("report.html")
console.save_text("report.txt")
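To reproduce: run the aws s3 ls command above to write files.txt, then run this script in the same directory (any filename works, e.g. analyse_s3.py). The tables print to the terminal; a plain-text copy is saved to report.txt and the list of unexpected results directories to unexpected_results.txt.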