Analyse @nf-core AWS s3 storage
aws s3 ls s3://nf-core-awsmegatests --recursive > files.txt
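Each line of files.txt from aws s3 ls --recursive has the form: date, time, size in bytes, then the object key. The script below relies on this layout, slicing the first 19 characters as the timestamp and matching the size column with a regex. An illustrative line (the key is a made-up example, not a real bucket path):

2022-02-14 17:45:22  205718877 rnaseq/results-<sha>/multiqc/multiqc_report.html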
#!/usr/bin/env python

import re
from collections import defaultdict

import requests
from rich import print
from rich.table import Table
from rich.console import Console

# record=True so that the console output can be saved to a file at the end
console = Console(record=True)
# Get pipelines
pipelines_r = requests.get("https://nf-co.re/pipelines.json")
pipelines = {}
for pipeline in pipelines_r.json()["remote_workflows"]:
    pipelines[pipeline["name"]] = pipeline
pipeline_inputs = defaultdict(int)
pipeline_results_expected = defaultdict(int)
pipeline_results_expected_per_release = defaultdict(lambda: defaultdict(int))
pipeline_results_unexpected = defaultdict(int)
pipeline_results_unexpected_paths = set()
work_dir = defaultdict(int)
pipeline_other = defaultdict(int)
pipeline_other_paths_sizes = defaultdict(lambda: defaultdict(int))
other_toplevel = defaultdict(int)
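# Assumed bucket layout, inferred from the checks below (not documented elsewhere):
#   <pipeline>/input*/...         test input data
#   <pipeline>/results-<sha>/...  results for one pipeline run, keyed by commit SHA
#   work/<subdir>/...             Nextflow work directory
#   anything else                 counted as "other"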
with open("files.txt") as fh:
    for line in fh:
        timestamp = line[:19]  # 2022-02-14 17:45:22
        filesize_col = re.search(r"\s+\d+", line[19:]).group()  # 205718877
        filesize = int(filesize_col.strip())  # 205718877
        path = line[len(filesize_col) + 20 :]
        if filesize == 0:
            continue  # skip zero-byte keys (directory placeholders)
        root, dir1, *_ = path.split("/")
        if root in pipelines:
            if "input" in dir1:
                pipeline_inputs[root] += filesize
            elif "results-" in dir1:
                # Known release if the SHA suffix matches a release tag commit
                if dir1[8:] in [
                    release["tag_sha"] for release in pipelines[root]["releases"]
                ]:
                    pipeline_results_expected[root] += filesize
                    pipeline_results_expected_per_release[root][dir1] += filesize
                else:
                    pipeline_results_unexpected[root] += filesize
                    pipeline_results_unexpected_paths.add(f"{root}/{dir1}")
            else:
                pipeline_other[root] += filesize
                pipeline_other_paths_sizes[root][dir1] += filesize
        else:
            if root == "work":
                work_dir[dir1] += filesize
            else:
                other_toplevel[root] += filesize
# Average results size per release, plus release counts, for the summary table
pipeline_results_expected_avg_size = {}
pipeline_results_releases = {}
pipeline_results_releases_total = 0
for pipeline, releases in pipeline_results_expected_per_release.items():
    avg_size = sum(releases.values()) / len(releases)
    pipeline_results_expected_avg_size[pipeline] = f"{avg_size/1000000000:.2f}GB"
    pipeline_results_releases[pipeline] = len(releases)
    pipeline_results_releases_total += len(releases)
def print_table(
    title,
    data,
    extra_col=None,
    extra_col_title=None,
    extra_col_footer="",
    extra_extra_col=None,
    extra_extra_col_title=None,
    extra_extra_col_footer="",
):
    """Print a rich table of per-pipeline file sizes, sorted largest first.

    Returns the total size in bytes so that callers can build a grand total.
    """
    table = Table(title=title)
    table.add_column("Pipeline", style="magenta")
    table.add_column("File size", justify="right", style="green")
    if extra_col:
        table.add_column(extra_col_title, justify="right", style="yellow")
    if extra_extra_col:
        table.add_column(extra_extra_col_title, justify="right", style="cyan")
    total = 0
    for k, v in sorted(data.items(), key=lambda x: x[1], reverse=True):
        row = [k, f"{(v/1000000000):.2f}GB"]
        if extra_col:
            row.append(extra_col.get(k))
        if extra_extra_col:
            row.append(str(extra_extra_col.get(k)))
        table.add_row(*row)
        total += v
    table.add_section()
    footer_row = [f"{len(data)} Pipelines", f"{(total/1000000000):.2f}GB"]
    if extra_col:
        footer_row.append(str(extra_col_footer))
    if extra_extra_col:
        footer_row.append(str(extra_extra_col_footer))
    table.add_row(*footer_row, style="bold")
    console.print(table)
    console.print("\n\n")
    return total
# Print the summary tables, accumulating a grand total as we go
grand_total = 0
grand_total += print_table("Pipeline Input Data", pipeline_inputs)
grand_total += print_table(
    "Results from Release Commits",
    pipeline_results_expected,
    pipeline_results_expected_avg_size,
    "Avg size per release",
    "",
    pipeline_results_releases,
    "Number of releases",
    pipeline_results_releases_total,
)
grand_total += print_table(
    "Results from unexpected commits", pipeline_results_unexpected
)
grand_total += print_table("Unexpected stuff in pipeline directories", pipeline_other)
for pipeline, size in pipeline_other.items():
    if size > 10000000000:  # break down pipelines with >10GB of unexpected data
        print_table(
            f"Unexpected stuff in {pipeline}",
            pipeline_other_paths_sizes[pipeline],
        )
grand_total += print_table("Work directory", work_dir)
grand_total += print_table("Other random pipeline data", other_toplevel)
console.print(f"GRAND TOTAL: {grand_total/1000000000:.2f}GB")

with open("unexpected_results.txt", "w") as fh:
    fh.write("\n".join(sorted(pipeline_results_unexpected_paths)))

# console.save_html("report.html")
console.save_text("report.txt")
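To reproduce: run the aws s3 ls command above to write files.txt, then run this script in the same directory (any filename works, e.g. analyse_s3.py). The tables print to the terminal; a plain-text copy is saved to report.txt and the list of unexpected results directories to unexpected_results.txt.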