Skip to content

Instantly share code, notes, and snippets.

@beccajcarlson
Last active June 8, 2026 02:44
Show Gist options
  • Select an option

  • Save beccajcarlson/f9319af882e372194bd30640c0cbf2ed to your computer and use it in GitHub Desktop.

Select an option

Save beccajcarlson/f9319af882e372194bd30640c0cbf2ed to your computer and use it in GitHub Desktop.
Per-DB cutoff trade-off: universe size vs bench accuracy (accessible-surfaceome)

DB cutoff tradeoff — universe size vs benchmark accuracy per source

For each of the 5 M1 surface DBs (UniProt, GO CC, HPA, SURFY, CSPA), a small panel plotting how the choice of "surface-vote" cutoff trades universe size (proteins this filter admits, log-scale x) for benchmark accuracy on the 147-gene bench (y, %). Lower-x = stricter.

Markers:

  • Circle — alternative cutoff option, not currently used in the M1 merge rules.
  • Diamond — canonical baseline currently configured in the loaders (cross-checked against candidate_universe.tsv flag counts).
  • Star — recommended cutoff after the trade-off audit, shown only when it differs from canonical (currently: UniProt "TM+signal" and CSPA "HC-only").

Per-point annotation: variant short label, universe size, +pos% / -neg% recall on the bench.

Run:

uv run make_db_cutoff_tradeoff.py

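Source (fetched live from the public repo): data/processed/triage_bench/db_cutoff_tradeoff_points.tsv on the main branch of Deliverome-Project/accessible-surfaceome.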

Canonical in-repo generator: scripts/triage_bench_db_barplot.py::make_db_tradeoff_plot. The accuracy math runs over the raw DB source dumps; this gist script reads the precomputed points table instead so it stays small and self-contained.

# /// script
# requires-python = ">=3.11"
# dependencies = [
# "matplotlib>=3.9",
# "pandas>=2.2",
# "seaborn>=0.13",
# "httpx>=0.27",
# ]
# ///
"""Reproduce ``db_cutoff_tradeoff.{pdf,png}`` from the public repo.
Five per-source subplots showing the universe-size / accuracy
trade-off across cutoff variants for each surface DB. Reads a tiny
precomputed-points TSV (raw-fetched), so the gist stays small.
Visual styling matches the in-repo `_plotting_config` (Deliverome
categorical palette + Manrope-when-available, per-DB sequential ramps
from BRAND_SEQUENTIAL). Inlined so the gist runs standalone.
Standalone — ``uv run make_db_cutoff_tradeoff.py``.
"""
from __future__ import annotations
import io
from pathlib import Path
import httpx
import matplotlib.font_manager as fm
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from matplotlib.ticker import FixedLocator, FuncFormatter, NullLocator
def _log_tick_label(x: float, _pos: int) -> str:
    """Human-readable label for log-scale x-axis ticks.

    Examples: 1.5 → "1.5", 100 → "100", 500 → "500", 1000 → "1k",
    1500 → "1.5k", 2000 → "2k", 5000 → "5k", 10000 → "10k". One decimal
    of precision in the "k" range so 1k / 1.5k / 2k are visually distinct.

    Args:
        x: Tick position in data units.
        _pos: Tick index passed by ``FuncFormatter``; unused.

    Returns:
        The label text for the tick.
    """
    if x < 1000:
        # ``g`` prints integral values bare ("100") but keeps the fraction
        # of sub-decade ticks ("1.5"); truncating with int() would collapse
        # 1 and 1.5 onto the same label.
        return f"{x:g}"
    k = x / 1000
    # Within 0.05 of a whole number of thousands: drop the decimal.
    if abs(k - round(k)) < 0.05:
        return f"{int(round(k))}k"
    return f"{k:.1f}k"
def _nice_log_ticks(lo: float, hi: float, *, min_ticks: int = 3) -> list[float]:
    """Pick "round" log-spaced tick positions within [lo, hi].

    Tier 1: 1×, 2×, 5× per decade (the standard log-axis choice). This
    covers wide ranges nicely (60→4400 gets 100/200/500/1k/2k).
    Tier 2: 1×, 1.5×, 2×, 3×, 5×, 7× per decade. Used instead when tier 1
    lands fewer than ``min_ticks`` positions inside the range (600→2700
    gets 700/1k/1.5k/2k).

    ``min_ticks`` is a target, not a guarantee: a very narrow range can
    still come back with fewer ticks after the tier-2 fallback
    (1100→1900 gets only 1.5k).

    Author's rationale for not using matplotlib's ``LogLocator``: its
    automatic fallback on narrow ranges produced a crowded tick set, so
    this helper bounds the output deterministically.

    Args:
        lo: One end of the axis range; must be positive.
        hi: The other end of the axis range; must be positive and finite.
        min_ticks: Number of tier-1 ticks below which tier 2 is used.

    Returns:
        Tick positions in ascending order.

    Raises:
        ValueError: If either bound is non-positive, NaN or infinite.
    """
    import math

    # Accept an inverted axis range instead of silently returning no ticks.
    if lo > hi:
        lo, hi = hi, lo
    if not (lo > 0 and math.isfinite(hi)):
        raise ValueError(
            f"log-axis tick range must be positive and finite, got [{lo}, {hi}]"
        )

    lo_dec = math.floor(math.log10(lo))
    hi_dec = math.ceil(math.log10(hi))

    def gen(subs):
        out = []
        # Start one decade below lo as a safety margin against log10 rounding.
        for d in range(lo_dec - 1, hi_dec + 1):
            for s in subs:
                t = s * 10 ** d
                if lo <= t <= hi and t not in out:
                    out.append(t)
        return out

    ticks = gen((1, 2, 5))
    if len(ticks) < min_ticks:
        ticks = gen((1, 1.5, 2, 3, 5, 7))
    return ticks
# GitHub coordinates of the public repository hosting the points table.
REPO = "Deliverome-Project/accessible-surfaceome"
BRANCH = "main"

# Raw-content URL of the precomputed cutoff-variant points table (TSV).
POINTS_URL = "/".join(
    [
        "https://raw.githubusercontent.com",
        REPO,
        BRANCH,
        "data/processed/triage_bench/db_cutoff_tradeoff_points.tsv",
    ]
)

# Published reproduction gist (embedded into output PNG Source / PDF
# Subject metadata — mirrors save_figure in _plotting_config.py).
GIST_URL = "https://gist.github.com/beccajcarlson/f9319af882e372194bd30640c0cbf2ed"
# ──── Inline brand styling — sentinel: brand-style-v3 ────
# Mirrors src/accessible_surfaceome/audit/_plotting_config.py so the gist
# stays self-contained. Kept in sync via tests/test_figure_gists_styling.py.

# Categorical palette, in assignment order.
BRAND_PALETTE = [
    "#BC3C4C",  # maroon-light
    "#3D6B60",  # teal-mid
    "#F4AA28",  # amber-bright
    "#8878C8",  # lavender-bright
    "#6E1428",  # maroon-dark
    "#7AAB9F",  # teal-light
]

# Six-step sequential ramps per colour family; index 0 is the darkest
# shade and index 5 the lightest.
BRAND_SEQUENTIAL = {
    "maroon": [
        "#3E0A18", "#6E1428", "#922038",
        "#BC3C4C", "#F0A098", "#FDE8E6",
    ],
    "teal": [
        "#152E28", "#244840", "#3D6B60",
        "#4D8A80", "#7AAB9F", "#CCE8E4",
    ],
    "amber": [
        "#5A2608", "#8C4210", "#C07830",
        "#F4AA28", "#F4C070", "#FAECD4",
    ],
    "lavender": [
        "#1E1450", "#3A2888", "#5848A8",
        "#8878C8", "#A090D4", "#E4E0F8",
    ],
}

# Single-purpose accent and neutral colours.
BRAND_CLAUDE_ORANGE = "#d87851"
BRAND_INK = "#1F1718"
BRAND_NEUTRAL = "#6F5D5A"
BRAND_GRID = "#E6DAD4"
def _register_brand_fonts() -> None:
    """Register bundled brand fonts with matplotlib when a fonts folder exists.

    Looks for an ``assets/fonts`` directory, first at the fourth ancestor
    of this script (presumably the repo root when the script runs from its
    in-repo location) and then under the current working directory. Every
    ``*.ttf`` / ``*.otf`` file in the first directory that exists is passed
    to ``fontManager.addfont``; later candidates are not consulted.

    Best effort by design: a missing directory or an unloadable font file
    is skipped silently, leaving matplotlib to fall back through the
    ``font.sans-serif`` list.
    """
    candidates: list[Path] = []
    # A standalone copy may live too close to the filesystem root to have a
    # fourth ancestor; indexing parents[3] unconditionally raises IndexError.
    ancestors = Path(__file__).resolve().parents
    if len(ancestors) > 3:
        candidates.append(ancestors[3] / "assets" / "fonts")
    candidates.append(Path.cwd() / "assets" / "fonts")

    for fonts_dir in candidates:
        if not fonts_dir.is_dir():
            continue
        font_files = sorted([*fonts_dir.glob("*.ttf"), *fonts_dir.glob("*.otf")])
        for path in font_files:
            try:
                fm.fontManager.addfont(str(path))
            except Exception:  # noqa: BLE001 — one bad font must not abort styling
                continue
        # Only the first existing directory is used.
        return
def _apply_brand_style() -> None:
    """Install the brand look on seaborn/matplotlib. Sentinel: brand-style-v3.

    Inline equivalent of `setup_plotting_style`. Registers the brand fonts,
    applies seaborn's "whitegrid" style and "notebook" context, then
    overrides matplotlib rcParams with the brand settings below.

    History (from the original notes) — v2: bumped sizes ~25% + explicit
    medium weight (avoids ExtraLight default that matplotlib picks from the
    Manrope variable file). Companion to the static
    Manrope-{regular,medium,semibold,bold}.otf files in assets/fonts/.
    """
    _register_brand_fonts()
    sns.set_style("whitegrid")
    sns.set_context("notebook", font_scale=1.0)

    # Export settings: high-DPI, tightly cropped, transparent backgrounds.
    export = {
        "savefig.dpi": 300,
        "savefig.bbox": "tight",
        "figure.facecolor": "none",
        "savefig.facecolor": "none",
    }
    # Typeface preference list and base text sizes.
    typography = {
        "font.family": "sans-serif",
        "font.sans-serif": ["Manrope", "Outfit", "DejaVu Sans", "Liberation Sans", "Arial"],
        "font.weight": "medium",
        "font.size": 21,
        "text.color": BRAND_INK,
    }
    # Axes: no titles, left/bottom spines only, grid drawn beneath the data.
    axes_rc = {
        "axes.labelsize": 25,
        "axes.labelweight": "medium",
        "axes.titlesize": 0,
        "axes.titlepad": 0,
        "axes.spines.top": False,
        "axes.spines.right": False,
        "axes.grid": True,
        "axes.axisbelow": True,
        "axes.edgecolor": BRAND_GRID,
        "axes.labelcolor": BRAND_INK,
        "axes.facecolor": "none",
    }
    grid_rc = {
        "grid.alpha": 0.35,
        "grid.linestyle": "-",
        "grid.linewidth": 0.7,
        "grid.color": BRAND_GRID,
    }
    tick_rc = {
        "xtick.labelsize": 20,
        "ytick.labelsize": 20,
        "xtick.color": BRAND_INK,
        "ytick.color": BRAND_INK,
    }
    legend_and_patch_rc = {
        "legend.frameon": False,
        "legend.fontsize": 20,
        "patch.edgecolor": "none",
        "patch.linewidth": 0.0,
    }

    plt.rcParams.update(
        export | typography | axes_rc | grid_rc | tick_rc | legend_and_patch_rc
    )
# Panel order, one subplot per surface DB.
GROUP_ORDER = ["UniProt", "GO", "HPA", "SURFY", "CSPA"]

# Per-DB colour ramps pulled from BRAND_SEQUENTIAL so the visual identity
# matches the rest of the figure set. Each ramp runs dark → light; points
# are coloured in ascending universe-size order, so the strictest cutoff
# gets the darkest shade and the most permissive the lightest.
# CSPA is maroon-family too — distinct from UniProt by darkness pattern.
GROUP_RAMP = {
    group: [BRAND_SEQUENTIAL[family][i] for i in shades]
    for group, family, shades in (
        ("UniProt", "maroon", (1, 2, 3, 4)),
        ("GO", "teal", (1, 2, 4)),
        ("HPA", "amber", (1, 3, 4)),
        ("SURFY", "lavender", (1, 2, 3, 4)),
        ("CSPA", "maroon", (1, 3, 4)),
    )
}

# Marker outline colours for the two highlighted cutoff kinds.
RECOMMENDED_EDGE = BRAND_SEQUENTIAL["teal"][1]  # teal-deep
CANONICAL_EDGE = BRAND_INK

# Long variant label (as it appears in the points table) → short caption
# label. Short labels match the canonical generator.
SHORT_LABEL = {
    # UniProt
    "UniProt strict-only\n(4 subcell terms)": "strict-4",
    "UniProt baseline": "canonical",
    "UniProt TM-or-signal-or-surface\n(topology proxy)": "TM+signal",
    "UniProt permissive\n(incl. plain Cell membrane)": "permissive",
    # GO CC
    "GO experimental+curated": "exp+curated",
    "GO baseline": "canonical",
    "GO permissive\n(incl. IEA-only)": "+ IEA",
    # HPA
    "HPA Enhanced-only\n(strictest tier)": "Enhanced",
    "HPA baseline": "canonical",
    "HPA + Uncertain tier": "+ Uncertain",
    # SURFY
    "SURFY score>0.9": ">0.9",
    "SURFY score>0.7": ">0.7",
    "SURFY score>0.5": ">0.5",
    "SURFY baseline": "canonical",
    # CSPA
    "CSPA high-conf only": "HC-only",
    "CSPA baseline": "canonical",
    "CSPA + unspecific": "+ unspecific",
}
def _fetch_tsv(url: str) -> pd.DataFrame:
    """Load a tab-separated table, preferring a local checkout over HTTP.

    When ``url`` points into this repository's raw-content tree and the
    same relative path exists under the fourth ancestor of this script
    (presumably the repo root when run in-repo), that local file is read.
    Otherwise the table is downloaded from ``url``.

    Args:
        url: Address of the TSV file.

    Returns:
        The parsed table.

    Raises:
        httpx.HTTPStatusError: If the download returns an error status.
    """
    raw_prefix = f"https://raw.githubusercontent.com/{REPO}/{BRANCH}/"
    # Only map URLs that really start with the repo prefix; slicing any
    # other URL by the prefix length would yield a meaningless path.
    if url.startswith(raw_prefix):
        # A standalone copy may sit too close to the filesystem root to
        # have a fourth ancestor; parents[3] would raise IndexError.
        ancestors = Path(__file__).resolve().parents
        if len(ancestors) > 3:
            local = ancestors[3] / url[len(raw_prefix):]
            if local.is_file():
                return pd.read_csv(local, sep="\t")
    r = httpx.get(url, timeout=30)
    r.raise_for_status()
    return pd.read_csv(io.StringIO(r.text), sep="\t")
def _flag_is_set(value: object) -> bool:
    """Return True only for a present, truthy flag cell.

    A blank TSV cell is read by pandas as NaN, which is truthy in plain
    Python; treating it as "set" would promote unflagged rows.
    """
    if value is None or pd.isna(value):
        return False
    return bool(value)


def _draw_group_panel(ax: plt.Axes, group: str, pts: list[dict]) -> None:
    """Draw one DB panel: ladder line, cutoff markers, ticks and caption block.

    Args:
        ax: Axes to draw on.
        group: DB name; used for the panel heading and the colour ramp.
        pts: Cutoff-variant records for this DB, sorted by ascending
            ``size``. Each record is read for ``size``, ``acc``, ``pos``,
            ``neg``, ``label`` and the optional ``recommended`` /
            ``canonical`` flags.
    """
    ramp = GROUP_RAMP.get(group, [BRAND_NEUTRAL])
    xs = [p["size"] for p in pts]

    # Strictness-ladder line.
    ax.plot(xs, [p["acc"] * 100 for p in pts],
            color=ramp[0], linewidth=1.6, alpha=0.5, zorder=2)

    caption_lines = []
    for idx, p in enumerate(pts):
        # Variants beyond the ramp length reuse its last colour.
        color = ramp[min(idx, len(ramp) - 1)]
        if _flag_is_set(p.get("recommended")):
            marker, msize, edge = "*", 480, RECOMMENDED_EDGE
            marker_char = "★"
        elif _flag_is_set(p.get("canonical")):
            marker, msize, edge = "D", 200, CANONICAL_EDGE
            marker_char = "◆"
        else:
            marker, msize, edge = "o", 130, "white"
            marker_char = "●"
        ax.scatter(p["size"], p["acc"] * 100,
                   marker=marker, s=msize, color=color,
                   edgecolor=edge, linewidth=1.6, zorder=4)
        short = SHORT_LABEL.get(p["label"], p["label"]).replace("\n", " ")
        caption_lines.append(
            f"{marker_char} {short:<14} n={int(p['size']):>5,} +{p['pos']*100:.0f}/-{p['neg']*100:.0f}"
        )

    ax.set_xscale("log")
    ax.text(0.02, 0.97, group, transform=ax.transAxes,
            ha="left", va="top", fontsize=24, fontweight="bold",
            color=ramp[0])
    ax.set_ylim(25, 102)
    if xs:
        ax.set_xlim(max(40, min(xs) / 1.8), max(xs) * 1.8)
    # Tick set must be computed AFTER set_xlim so the tick helper sees the
    # actual panel range — narrow ranges fall back to a finer
    # 1×/1.5×/2×/3×/5×/7× per-decade grid.
    lo, hi = ax.get_xlim()
    tick_positions = _nice_log_ticks(lo, hi, min_ticks=3)
    ax.xaxis.set_major_locator(FixedLocator(tick_positions))
    ax.xaxis.set_major_formatter(FuncFormatter(_log_tick_label))
    ax.xaxis.set_minor_locator(NullLocator())
    sns.despine(ax=ax, top=True, right=True)

    # Per-panel caption block — one line per cutoff variant, marker char
    # doubles as the visual key for matching to on-plot points. Monospace
    # so the columns align.
    ax.text(
        0.0, -0.30, "\n".join(caption_lines),
        transform=ax.transAxes,
        ha="left", va="top",
        fontsize=15, color=BRAND_INK,
        family="monospace",
    )


def _draw_marker_key(legend_ax: plt.Axes) -> None:
    """Turn an empty grid cell into the marker-shape legend."""
    legend_ax.axis("off")
    handles = [
        plt.Line2D([], [], marker="o", linestyle="", color=BRAND_NEUTRAL,
                   markersize=11, markeredgecolor="white",
                   markeredgewidth=1.4, label="Alternative cutoff (not used)"),
        plt.Line2D([], [], marker="D", linestyle="", color=BRAND_NEUTRAL,
                   markersize=12, markeredgecolor=CANONICAL_EDGE,
                   markeredgewidth=1.4, label="Canonical (current merge rule)"),
        plt.Line2D([], [], marker="*", linestyle="", color=BRAND_NEUTRAL,
                   markersize=20, markeredgecolor=RECOMMENDED_EDGE,
                   markeredgewidth=1.4, label="Recommended after trade-off audit"),
    ]
    legend_ax.legend(handles=handles, loc="upper center", fontsize=20,
                     frameon=False, title="Marker shape",
                     title_fontsize=21)


def main() -> None:
    """Fetch the points table, draw the five DB panels and write PDF + PNG.

    Writes ``db_cutoff_tradeoff.pdf`` and ``db_cutoff_tradeoff.png`` to the
    current working directory, each carrying the gist URL in its metadata.
    """
    _apply_brand_style()
    df = _fetch_tsv(POINTS_URL)

    # Wider + taller than v2's (13, 11): v2's panels were noticeably taller
    # than wide (≈4.3"w × 5.5"h each). (19, 13) gives ≈6.3"w × 6.5"h —
    # near-square. Extra height also makes room for the per-panel caption
    # block to breathe under v2's larger caption fontsize.
    fig, axes = plt.subplots(2, 3, figsize=(19, 13))
    axes_flat = axes.flatten()

    # Five DB panels fill the first five cells of the 2×3 grid.
    for ax, group in zip(axes_flat, GROUP_ORDER):
        pts = df[df["group"] == group].sort_values("size").to_dict("records")
        _draw_group_panel(ax, group, pts)

    # 6th cell: marker-shape key.
    _draw_marker_key(axes_flat[5])

    fig.supxlabel("Universe size — proteins this filter would admit "
                  "(log scale; lower = stricter)", fontsize=20, y=0.02,
                  color=BRAND_INK)
    # supylabel y defaults to 0.5 (figure midline) which is what reads as
    # "centered" against the supxlabel's default x=0.5. Earlier overrides
    # to 0.54 (axes-content midpoint) and the dynamic equivalent were
    # technically correct but visually off — keep the matplotlib default.
    # x=0.02 just keeps the rotated label off the canvas-left edge under
    # brand-style-v3's larger axes.labelsize.
    fig.supylabel("Accuracy on\n147-gene benchmark (%)", fontsize=20,
                  x=0.02, color=BRAND_INK)
    plt.tight_layout(rect=[0.04, 0.03, 1, 0.985])
    # Vertical gap between the row-1 panels (whose caption blocks hang
    # below them from axes-y=-0.30) and the row-2 panels. Without this,
    # row-1 captions crash into row-2 axes / row-2 captions extend into
    # the supxlabel. Bumped 0.95 → 1.40 to give room for the now-larger
    # (19, 13) figure + the recommended-cutoff line (★) below the
    # canonical line (◆) in each panel.
    plt.subplots_adjust(hspace=1.40)

    out_pdf = Path("db_cutoff_tradeoff.pdf")
    out_png = Path("db_cutoff_tradeoff.png")
    fig.savefig(out_pdf, bbox_inches="tight", metadata={"Subject": GIST_URL})
    fig.savefig(out_png, bbox_inches="tight", dpi=300, metadata={"Source": GIST_URL})
    print(f"Wrote {out_pdf} + {out_png} ({len(df)} cutoff variants across {df['group'].nunique()} DBs)")


if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment