beccajcarlson · June 8, 2026 02:44
diff --git a/01_db_cutoff_tradeoff.md b/01_db_cutoff_tradeoff.md
diff --git a/make_db_cutoff_tradeoff.py b/make_db_cutoff_tradeoff.py
 # /// script
 # requires-python = ">=3.11"
 # dependencies = [
 #   "matplotlib>=3.9",
 #   "pandas>=2.2",
 #   "seaborn>=0.13",
 #   "httpx>=0.27",
 # ]
 # ///
 """Reproduce ``db_cutoff_tradeoff.{pdf,png}`` from the public repo.

 Five per-source subplots showing the universe-size / accuracy
 trade-off across cutoff variants for each surface DB. Reads a tiny
 precomputed-points TSV (raw-fetched), so the gist stays small.

 Visual styling matches the in-repo `_plotting_config` (Deliverome
 categorical palette + Manrope-when-available, per-DB sequential ramps
 from BRAND_SEQUENTIAL). Inlined so the gist runs standalone.

 Standalone — ``uv run make_db_cutoff_tradeoff.py``.
 """
 from __future__ import annotations

 import io
 from pathlib import Path

 import httpx
 import matplotlib.font_manager as fm
 import matplotlib.pyplot as plt
 import pandas as pd
 import seaborn as sns
 from matplotlib.ticker import FixedLocator, FuncFormatter, NullLocator


 def _log_tick_label(x: float, _pos: int) -> str:
    """Format a log-axis tick value as a compact label.

    Values below 1000 are shown as truncated integers ("100", "500").
    From 1000 upwards the value is expressed in thousands with a "k"
    suffix: as a whole number when it lies within 0.05 of one ("1k",
    "10k"), otherwise with one decimal place ("1.5k").

    ``_pos`` is the tick position argument that ``FuncFormatter`` passes
    to its callback; it is not used.
    """
    if x < 1000:
        return str(int(x))
    thousands = x / 1000
    nearest = round(thousands)
    if abs(thousands - nearest) >= 0.05:
        return f"{thousands:.1f}k"
    return f"{int(nearest)}k"


 def _nice_log_ticks(lo: float, hi: float, *, min_ticks: int = 3) -> list[float]:
    """Return "round" log-axis tick positions lying inside ``[lo, hi]``.

    Tier 1 tries 1x, 2x, 5x per decade (the standard log-axis choice);
    e.g. ``(60, 4400)`` yields 100/200/500/1k/2k.

    Tier 2 (1x, 1.5x, 2x, 3x, 5x, 7x per decade) replaces tier 1 when
    tier 1 lands fewer than ``min_ticks`` positions in the range; e.g.
    ``(1100, 2900)`` yields 1.5k/2k.

    ``min_ticks`` only selects the tier: a very narrow range can still
    produce fewer than ``min_ticks`` ticks (possibly none) from tier 2.

    Matplotlib's LogLocator(subs=(1,2,5), numticks=N) auto-falls-back to
    half-decade positions when the range is narrow, but does so without
    bound; this helper keeps the output deterministic and bounded.

    Args:
        lo: Lower axis bound; must be > 0.
        hi: Upper axis bound; must be > 0.
        min_ticks: Tick count below which the finer tier is used.

    Returns:
        Ascending tick positions, all as floats.

    Raises:
        ValueError: If ``lo`` or ``hi`` is not strictly positive (a log
            axis has no representation for such bounds).
    """
    import math

    if not (lo > 0 and hi > 0):
        raise ValueError(
            f"log-axis tick bounds must be positive, got lo={lo!r}, hi={hi!r}"
        )

    lo_dec = math.floor(math.log10(lo))
    hi_dec = math.ceil(math.log10(hi))

    def gen(subs):
        # One decade below floor(log10(lo)) is scanned as a safety margin
        # against log10 rounding at exact decade boundaries.
        return [
            float(sub * 10 ** decade)
            for decade in range(lo_dec - 1, hi_dec + 1)
            for sub in subs
            if lo <= sub * 10 ** decade <= hi
        ]

    ticks = gen((1, 2, 5))
    if len(ticks) < min_ticks:
        ticks = gen((1, 1.5, 2, 3, 5, 7))
    return ticks

 # GitHub repository ("owner/name") and branch that the raw-data URL below
 # is built from. _fetch_tsv() also uses both to recognise the URL prefix
 # when looking for a local copy of the file.
 REPO = "Deliverome-Project/accessible-surfaceome"
 BRANCH = "main"
 # Raw-content URL of the precomputed points table that main() loads.
 POINTS_URL = (
    f"https://raw.githubusercontent.com/{REPO}/{BRANCH}"
    f"/data/processed/triage_bench/db_cutoff_tradeoff_points.tsv"
 )

 # Published reproduction gist (embedded into output PNG Source / PDF
 # Subject metadata — mirrors save_figure in _plotting_config.py).
 GIST_URL = "https://gist.github.com/beccajcarlson/f9319af882e372194bd30640c0cbf2ed"

 # ──── Inline brand styling — sentinel: brand-style-v3 ────
 # Mirrors src/accessible_surfaceome/audit/_plotting_config.py so the gist
 # stays self-contained. Kept in sync via tests/test_figure_gists_styling.py.
 # Categorical palette. Not referenced in the visible part of this script;
 # presumably kept so the inlined block mirrors _plotting_config — confirm.
 BRAND_PALETTE = [
    "#BC3C4C",  # maroon-light
    "#3D6B60",  # teal-mid
    "#F4AA28",  # amber-bright
    "#8878C8",  # lavender-bright
    "#6E1428",  # maroon-dark
    "#7AAB9F",  # teal-light
 ]
 # Six-step ramps per hue, ordered darkest (index 0) to lightest (index 5).
 # GROUP_RAMP picks subsets of these by index.
 BRAND_SEQUENTIAL = {
    "maroon":   ["#3E0A18", "#6E1428", "#922038", "#BC3C4C", "#F0A098", "#FDE8E6"],
    "teal":     ["#152E28", "#244840", "#3D6B60", "#4D8A80", "#7AAB9F", "#CCE8E4"],
    "amber":    ["#5A2608", "#8C4210", "#C07830", "#F4AA28", "#F4C070", "#FAECD4"],
    "lavender": ["#1E1450", "#3A2888", "#5848A8", "#8878C8", "#A090D4", "#E4E0F8"],
 }
 # Single named colours. BRAND_INK: text, tick and label colour.
 # BRAND_NEUTRAL: legend handles and the fallback ramp in main().
 # BRAND_GRID: grid lines and axes edges. BRAND_CLAUDE_ORANGE is not
 # referenced in the visible part of this script.
 BRAND_CLAUDE_ORANGE = "#d87851"
 BRAND_INK = "#1F1718"
 BRAND_NEUTRAL = "#6F5D5A"
 BRAND_GRID = "#E6DAD4"


 def _register_brand_fonts() -> None:
    """Register brand font files found on disk with matplotlib (best effort).

    Two directories are probed in order: ``assets/fonts`` under the
    fourth ancestor of this script (``parents[3]`` — presumably the repo
    root when the script sits at its in-repo location; confirm), then
    ``assets/fonts`` under the current working directory. Only the first
    directory that exists is scanned; its ``*.ttf`` and ``*.otf`` files are
    added in sorted order. Files matplotlib cannot load are skipped, and
    finding no directory at all is not an error.
    """
    ancestors = Path(__file__).resolve().parents
    candidates = []
    # A standalone copy (e.g. /tmp/make_db_cutoff_tradeoff.py) has fewer
    # than four ancestors; indexing parents[3] unguarded raises IndexError
    # before the working-directory fallback is ever tried.
    if len(ancestors) > 3:
        candidates.append(ancestors[3] / "assets" / "fonts")
    candidates.append(Path.cwd() / "assets" / "fonts")

    for fonts_dir in candidates:
        if not fonts_dir.is_dir():
            continue
        font_files = sorted([*fonts_dir.glob("*.ttf"), *fonts_dir.glob("*.otf")])
        for path in font_files:
            try:
                fm.fontManager.addfont(str(path))
            except Exception:  # noqa: BLE001
                # Deliberate best effort: an unreadable font must not
                # stop the figure from rendering with a fallback face.
                continue
        return


 def _apply_brand_style() -> None:
    """Inline equivalent of `setup_plotting_style`. Sentinel: brand-style-v3.

    Registers the brand fonts, selects seaborn's "whitegrid" style and
    "notebook" context, then overrides matplotlib rcParams for output,
    fonts, axes, grid, ticks, legend and patches.

    History kept from the original note — v2: bumped sizes ~25% + explicit
    medium weight (avoids ExtraLight default that matplotlib picks from the
    Manrope variable file). Companion to the static
    Manrope-{regular,medium,semibold,bold}.otf files in assets/fonts/.
    """
    _register_brand_fonts()
    sns.set_style("whitegrid")
    sns.set_context("notebook", font_scale=1.0)

    # Transparent canvas, tight bounding box, print-quality raster output.
    canvas_rc = {
        "savefig.dpi": 300,
        "savefig.bbox": "tight",
        "figure.facecolor": "none",
        "savefig.facecolor": "none",
    }
    # Brand typeface first, then progressively more generic fallbacks.
    font_rc = {
        "font.family": "sans-serif",
        "font.sans-serif": ["Manrope", "Outfit", "DejaVu Sans", "Liberation Sans", "Arial"],
        "font.weight": "medium",
        "font.size": 21,
    }
    axes_rc = {
        "axes.labelsize": 25,
        "axes.labelweight": "medium",
        "axes.titlesize": 0,
        "axes.titlepad": 0,
        "axes.spines.top": False,
        "axes.spines.right": False,
        "axes.grid": True,
        "axes.axisbelow": True,
        "axes.edgecolor": BRAND_GRID,
        "axes.labelcolor": BRAND_INK,
        "axes.facecolor": "none",
        "text.color": BRAND_INK,
    }
    grid_rc = {
        "grid.alpha": 0.35,
        "grid.linestyle": "-",
        "grid.linewidth": 0.7,
        "grid.color": BRAND_GRID,
    }
    tick_rc = {
        "xtick.labelsize": 20,
        "ytick.labelsize": 20,
        "xtick.color": BRAND_INK,
        "ytick.color": BRAND_INK,
    }
    legend_and_patch_rc = {
        "legend.frameon": False,
        "legend.fontsize": 20,
        "patch.edgecolor": "none",
        "patch.linewidth": 0.0,
    }
    plt.rcParams.update(
        canvas_rc | font_rc | axes_rc | grid_rc | tick_rc | legend_and_patch_rc
    )


 # Panel order: one subplot per surface DB, filled row by row in main().
 GROUP_ORDER = ["UniProt", "GO", "HPA", "SURFY", "CSPA"]
 # Per-DB sequential ramps, dark → light = strict → permissive: main()
 # sorts each DB's points by ascending size and colours the i-th point
 # with ramp[i] (clamped to the last entry), and BRAND_SEQUENTIAL ramps
 # run darkest-first. Pulled from BRAND_SEQUENTIAL so the visual identity
 # matches the rest of the figure set.
 GROUP_RAMP = {
    "UniProt": [BRAND_SEQUENTIAL["maroon"][i]   for i in (1, 2, 3, 4)],
    "GO":      [BRAND_SEQUENTIAL["teal"][i]     for i in (1, 2, 4)],
    "HPA":     [BRAND_SEQUENTIAL["amber"][i]    for i in (1, 3, 4)],
    "SURFY":   [BRAND_SEQUENTIAL["lavender"][i] for i in (1, 2, 3, 4)],
    "CSPA":    [BRAND_SEQUENTIAL["maroon"][i]   for i in (1, 3, 4)],  # CSPA is maroon-family too — distinct from UniProt by darkness pattern
 }
 # Marker outline colours for the "recommended" (star) and "canonical"
 # (diamond) points drawn in main().
 RECOMMENDED_EDGE = BRAND_SEQUENTIAL["teal"][1]  # teal-deep
 CANONICAL_EDGE = BRAND_INK

 # Short labels match the canonical generator.
 # Keys are the full `label` values looked up in main() (some contain an
 # embedded newline); values are the compact names shown in each panel's
 # caption block. A label missing from this map is shown as-is, with its
 # newlines replaced by spaces.
 SHORT_LABEL = {
    "UniProt strict-only\n(4 subcell terms)":              "strict-4",
    "UniProt baseline":                                    "canonical",
    "UniProt TM-or-signal-or-surface\n(topology proxy)":   "TM+signal",
    "UniProt permissive\n(incl. plain Cell membrane)":     "permissive",
    "GO experimental+curated":                             "exp+curated",
    "GO baseline":                                         "canonical",
    "GO permissive\n(incl. IEA-only)":                     "+ IEA",
    "HPA Enhanced-only\n(strictest tier)":                 "Enhanced",
    "HPA baseline":                                        "canonical",
    "HPA + Uncertain tier":                                "+ Uncertain",
    "SURFY score>0.9":                                     ">0.9",
    "SURFY score>0.7":                                     ">0.7",
    "SURFY score>0.5":                                     ">0.5",
    "SURFY baseline":                                      "canonical",
    "CSPA high-conf only":                                 "HC-only",
    "CSPA baseline":                                       "canonical",
    "CSPA + unspecific":                                   "+ unspecific",
 }


 def _fetch_tsv(url: str) -> pd.DataFrame:
    """Load a tab-separated table, preferring a local checkout over the network.

    When ``url`` is a raw-content URL for ``REPO``/``BRANCH`` and this
    script has a fourth ancestor directory (``parents[3]`` — presumably
    the repo root at the in-repo location; confirm), the URL's repo-relative
    path is looked up under that directory and read from disk if the file
    exists. Otherwise the table is downloaded with a 30-second timeout.

    Args:
        url: Address of the TSV file.

    Returns:
        The parsed table.

    Raises:
        httpx.HTTPStatusError: Via ``raise_for_status()`` when the download
            returns an error status.
    """
    raw_prefix = f"https://raw.githubusercontent.com/{REPO}/{BRANCH}/"
    ancestors = Path(__file__).resolve().parents
    # Guard both assumptions of the local shortcut: a standalone copy may
    # sit fewer than four directories deep (parents[3] would raise
    # IndexError), and a URL outside this repo/branch has no meaningful
    # repo-relative path to slice out.
    if url.startswith(raw_prefix) and len(ancestors) > 3:
        local = ancestors[3] / url.removeprefix(raw_prefix)
        if local.is_file():
            return pd.read_csv(local, sep="\t")

    response = httpx.get(url, timeout=30)
    response.raise_for_status()
    return pd.read_csv(io.StringIO(response.text), sep="\t")


 def main() -> None:
    """Render the per-database cutoff trade-off figure and save it as PDF and PNG.

    Applies the brand style, loads the points table from ``POINTS_URL`` and
    draws one panel per entry of ``GROUP_ORDER`` on a 2×3 grid; the sixth
    cell holds the marker-shape legend. Each panel plots ``acc`` × 100
    against ``size`` on a log x-axis and carries a monospace caption block
    below the axes listing every cutoff variant.

    Columns read from the table: ``group``, ``size``, ``acc``, ``label``,
    ``pos``, ``neg`` and the ``recommended`` / ``canonical`` flags.
    ``acc`` / ``pos`` / ``neg`` are multiplied by 100 for display, so they
    are presumably fractions in [0, 1] — confirm against the TSV.

    Writes ``db_cutoff_tradeoff.pdf`` and ``db_cutoff_tradeoff.png``
    relative to the current working directory, embedding ``GIST_URL`` in
    the file metadata, and prints a one-line summary.
    """
    _apply_brand_style()
    df = _fetch_tsv(POINTS_URL)

    # Wider + taller than v2's (13, 11): v2's panels were noticeably taller
    # than wide (≈4.3"w × 5.5"h each). (19, 13) gives ≈6.3"w × 6.5"h —
    # near-square. Extra height also makes room for the per-panel caption
    # block to breathe under v2's larger caption fontsize.
    fig, axes = plt.subplots(2, 3, figsize=(19, 13))
    axes_flat = axes.flatten()

    for gi, group in enumerate(GROUP_ORDER):
        ax = axes_flat[gi]
        # Ascending size = strictest cutoff first; this order drives both
        # the connecting line and the ramp colour assigned to each point.
        pts = df[df["group"] == group].sort_values("size").to_dict("records")
        ramp = GROUP_RAMP.get(group, [BRAND_NEUTRAL])

        # Strictness-ladder line.
        ax.plot([p["size"] for p in pts], [p["acc"] * 100 for p in pts],
                color=ramp[0], linewidth=1.6, alpha=0.5, zorder=2)

        caption_lines = []
        for idx, p in enumerate(pts):
            # Clamp so a group with more points than ramp colours reuses
            # the last colour instead of raising IndexError.
            color = ramp[min(idx, len(ramp) - 1)]
            # "recommended" wins over "canonical" when both are set.
            # NOTE(review): these are plain truthiness tests; a blank TSV
            # cell would load as NaN, which is truthy — assumes the flag
            # columns are fully populated, confirm.
            if p.get("recommended"):
                marker, msize, edge = "*", 480, RECOMMENDED_EDGE
                marker_char = "★"
            elif p.get("canonical"):
                marker, msize, edge = "D", 200, CANONICAL_EDGE
                marker_char = "◆"
            else:
                marker, msize, edge = "o", 130, "white"
                marker_char = "●"
            ax.scatter(p["size"], p["acc"] * 100,
                       marker=marker, s=msize, color=color,
                       edgecolor=edge, linewidth=1.6, zorder=4)
            # Unmapped labels fall back to the raw label, flattened to one line.
            short = SHORT_LABEL.get(p["label"], p["label"]).replace("\n", " ")
            caption_lines.append(
                f"{marker_char}  {short:<14}  n={int(p['size']):>5,}   +{p['pos']*100:.0f}/-{p['neg']*100:.0f}"
            )

        ax.set_xscale("log")
        # The DB name is drawn inside the axes instead of as a title.
        ax.text(0.02, 0.97, group, transform=ax.transAxes,
                ha="left", va="top", fontsize=24, fontweight="bold",
                color=ramp[0])
        ax.set_ylim(25, 102)
        xs = [p["size"] for p in pts]
        if xs:
            # Pad by a factor of 1.8 on both sides; never start below 40.
            ax.set_xlim(max(40, min(xs) / 1.8), max(xs) * 1.8)
        # Tick set must be computed AFTER set_xlim so the adaptive locator
        # sees the actual panel range — narrow ranges fall back to a finer
        # 1×/1.5×/2×/3×/5×/7× per-decade grid, aiming for ≥3 labels.
        lo, hi = ax.get_xlim()
        tick_positions = _nice_log_ticks(lo, hi, min_ticks=3)
        ax.xaxis.set_major_locator(FixedLocator(tick_positions))
        ax.xaxis.set_major_formatter(FuncFormatter(_log_tick_label))
        ax.xaxis.set_minor_locator(NullLocator())
        sns.despine(ax=ax, top=True, right=True)

        # Per-panel caption block — one line per cutoff variant, marker
        # char doubles as the visual key for matching to on-plot points.
        # Monospace so columns align; tiny enough that even DejaVu Sans
        # Mono fallback reads cleanly on systems without a brand mono.
        ax.text(
            0.0, -0.30, "\n".join(caption_lines),
            transform=ax.transAxes,
            ha="left", va="top",
            fontsize=15, color=BRAND_INK,
            family="monospace",
        )

    # 6th cell: marker-shape key only; the axes themselves are hidden.
    legend_ax = axes_flat[5]
    legend_ax.axis("off")
    handles = [
        plt.Line2D([], [], marker="o", linestyle="", color=BRAND_NEUTRAL,
                   markersize=11, markeredgecolor="white",
                   markeredgewidth=1.4, label="Alternative cutoff (not used)"),
        plt.Line2D([], [], marker="D", linestyle="", color=BRAND_NEUTRAL,
                   markersize=12, markeredgecolor=CANONICAL_EDGE,
                   markeredgewidth=1.4, label="Canonical (current merge rule)"),
        plt.Line2D([], [], marker="*", linestyle="", color=BRAND_NEUTRAL,
                   markersize=20, markeredgecolor=RECOMMENDED_EDGE,
                   markeredgewidth=1.4, label="Recommended after trade-off audit"),
    ]
    legend_ax.legend(handles=handles, loc="upper center", fontsize=20,
                     frameon=False, title="Marker shape",
                     title_fontsize=21)

    fig.supxlabel("Universe size — proteins this filter would admit "
                  "(log scale; lower = stricter)", fontsize=20, y=0.02,
                  color=BRAND_INK)
    # supylabel y defaults to 0.5 (figure midline) which is what reads as
    # "centered" against the supxlabel's default x=0.5. Earlier overrides
    # to 0.54 (axes-content midpoint) and the dynamic equivalent were
    # technically correct but visually off — keep the matplotlib default.
    # x=0.02 just keeps the rotated label off the canvas-left edge under
    # brand-style-v3's larger axes.labelsize.
    fig.supylabel("Accuracy on\n147-gene benchmark (%)", fontsize=20,
                  x=0.02, color=BRAND_INK)
    # Order matters: tight_layout first, then widen hspace on top of it.
    plt.tight_layout(rect=[0.04, 0.03, 1, 0.985])
    # Vertical gap between row-1 panels (with caption blocks BELOW them at
    # axes-y=-0.30, extending further for the longest panel — 6 variants
    # × ~22pt line height) and row-2 panels. Without this, row-1 captions
    # crash into row-2 axes / row-2 captions extend into the supxlabel.
    # Bumped 0.95 → 1.40 to give room for the now-larger (19, 13) figure +
    # the recommended-cutoff line (★) below the canonical line (◆) in
    # each panel.
    plt.subplots_adjust(hspace=1.40)

    # Outputs land in the current working directory; the gist URL is
    # stored under the metadata key each format uses here.
    out_pdf = Path("db_cutoff_tradeoff.pdf")
    out_png = Path("db_cutoff_tradeoff.png")
    fig.savefig(out_pdf, bbox_inches="tight", metadata={"Subject": GIST_URL})
    fig.savefig(out_png, bbox_inches="tight", dpi=300, metadata={"Source": GIST_URL})
    print(f"Wrote {out_pdf} + {out_png}  ({len(df)} cutoff variants across {df['group'].nunique()} DBs)")


 if __name__ == "__main__":
    main()
	# /// script
	# requires-python = ">=3.11"
	# dependencies = [
	# "matplotlib>=3.9",
	# "pandas>=2.2",
	# "seaborn>=0.13",
	# "httpx>=0.27",
	# ]
	# ///
	"""Reproduce ``db_cutoff_tradeoff.{pdf,png}`` from the public repo.

	Five per-source subplots showing the universe-size / accuracy
	trade-off across cutoff variants for each surface DB. Reads a tiny
	precomputed-points TSV (raw-fetched), so the gist stays small.

	Visual styling matches the in-repo `_plotting_config` (Deliverome
	categorical palette + Manrope-when-available, per-DB sequential ramps
	from BRAND_SEQUENTIAL). Inlined so the gist runs standalone.

	Standalone — ``uv run make_db_cutoff_tradeoff.py``.
	"""
	from __future__ import annotations

	import io
	from pathlib import Path

	import httpx
	import matplotlib.font_manager as fm
	import matplotlib.pyplot as plt
	import pandas as pd
	import seaborn as sns
	from matplotlib.ticker import FixedLocator, FuncFormatter, NullLocator


	def _log_tick_label(x: float, _pos: int) -> str:
	    """Format a log-axis tick value as a compact label.

	    Values below 1000 are shown as truncated integers ("100", "500").
	    From 1000 upwards the value is expressed in thousands with a "k"
	    suffix: as a whole number when it lies within 0.05 of one ("1k",
	    "10k"), otherwise with one decimal place ("1.5k").

	    ``_pos`` is the tick position argument that ``FuncFormatter`` passes
	    to its callback; it is not used.
	    """
	    if x < 1000:
	        return f"{int(x)}"
	    k = x / 1000
	    if abs(k - round(k)) < 0.05:
	        return f"{int(round(k))}k"
	    return f"{k:.1f}k"


	def _nice_log_ticks(lo: float, hi: float, *, min_ticks: int = 3) -> list[float]:
	    """Return "round" log-axis tick positions lying inside ``[lo, hi]``.

	    Tier 1 tries 1x, 2x, 5x per decade (the standard log-axis choice);
	    e.g. ``(60, 4400)`` yields 100/200/500/1k/2k.

	    Tier 2 (1x, 1.5x, 2x, 3x, 5x, 7x per decade) replaces tier 1 when
	    tier 1 lands fewer than ``min_ticks`` positions in the range; e.g.
	    ``(1100, 2900)`` yields 1.5k/2k.

	    ``min_ticks`` only selects the tier: a very narrow range can still
	    produce fewer than ``min_ticks`` ticks (possibly none) from tier 2.

	    Args:
	        lo: Lower axis bound; must be > 0.
	        hi: Upper axis bound; must be > 0.
	        min_ticks: Tick count below which the finer tier is used.

	    Returns:
	        Ascending tick positions, all as floats.

	    Raises:
	        ValueError: If ``lo`` or ``hi`` is not strictly positive.
	    """
	    import math

	    if not (lo > 0 and hi > 0):
	        raise ValueError(
	            f"log-axis tick bounds must be positive, got lo={lo!r}, hi={hi!r}"
	        )

	    lo_dec = math.floor(math.log10(lo))
	    hi_dec = math.ceil(math.log10(hi))

	    def gen(subs):
	        # One decade below floor(log10(lo)) is scanned as a safety margin
	        # against log10 rounding at exact decade boundaries.
	        return [
	            float(sub * 10 ** decade)
	            for decade in range(lo_dec - 1, hi_dec + 1)
	            for sub in subs
	            if lo <= sub * 10 ** decade <= hi
	        ]

	    ticks = gen((1, 2, 5))
	    if len(ticks) < min_ticks:
	        ticks = gen((1, 1.5, 2, 3, 5, 7))
	    return ticks

	REPO = "Deliverome-Project/accessible-surfaceome"
	BRANCH = "main"
	POINTS_URL = (
	f"https://raw.githubusercontent.com/{REPO}/{BRANCH}"
	f"/data/processed/triage_bench/db_cutoff_tradeoff_points.tsv"
	)

	# Published reproduction gist (embedded into output PNG Source / PDF
	# Subject metadata — mirrors save_figure in _plotting_config.py).
	GIST_URL = "https://gist.github.com/beccajcarlson/f9319af882e372194bd30640c0cbf2ed"

	# ──── Inline brand styling — sentinel: brand-style-v3 ────
	# Mirrors src/accessible_surfaceome/audit/_plotting_config.py so the gist
	# stays self-contained. Kept in sync via tests/test_figure_gists_styling.py.
	BRAND_PALETTE = [
	"#BC3C4C", # maroon-light
	"#3D6B60", # teal-mid
	"#F4AA28", # amber-bright
	"#8878C8", # lavender-bright
	"#6E1428", # maroon-dark
	"#7AAB9F", # teal-light
	]
	BRAND_SEQUENTIAL = {
	"maroon": ["#3E0A18", "#6E1428", "#922038", "#BC3C4C", "#F0A098", "#FDE8E6"],
	"teal": ["#152E28", "#244840", "#3D6B60", "#4D8A80", "#7AAB9F", "#CCE8E4"],
	"amber": ["#5A2608", "#8C4210", "#C07830", "#F4AA28", "#F4C070", "#FAECD4"],
	"lavender": ["#1E1450", "#3A2888", "#5848A8", "#8878C8", "#A090D4", "#E4E0F8"],
	}
	BRAND_CLAUDE_ORANGE = "#d87851"
	BRAND_INK = "#1F1718"
	BRAND_NEUTRAL = "#6F5D5A"
	BRAND_GRID = "#E6DAD4"


	def _register_brand_fonts() -> None:
	    """Register brand font files found on disk with matplotlib (best effort).

	    Two directories are probed in order: ``assets/fonts`` under the
	    fourth ancestor of this script (``parents[3]`` — presumably the repo
	    root when the script sits at its in-repo location; confirm), then
	    ``assets/fonts`` under the current working directory. Only the first
	    directory that exists is scanned; its ``*.ttf`` and ``*.otf`` files are
	    added in sorted order. Files matplotlib cannot load are skipped, and
	    finding no directory at all is not an error.
	    """
	    ancestors = Path(__file__).resolve().parents
	    candidates = []
	    # A standalone copy may sit fewer than four directories deep;
	    # indexing parents[3] unguarded raises IndexError in that case.
	    if len(ancestors) > 3:
	        candidates.append(ancestors[3] / "assets" / "fonts")
	    candidates.append(Path.cwd() / "assets" / "fonts")

	    for fonts_dir in candidates:
	        if not fonts_dir.is_dir():
	            continue
	        # Wildcard patterns are required: a bare ".ttf" pattern only
	        # matches a file literally named ".ttf".
	        font_files = sorted([*fonts_dir.glob("*.ttf"), *fonts_dir.glob("*.otf")])
	        for path in font_files:
	            try:
	                fm.fontManager.addfont(str(path))
	            except Exception:  # noqa: BLE001
	                # Deliberate best effort: an unreadable font must not
	                # stop the figure from rendering with a fallback face.
	                continue
	        return


	def _apply_brand_style() -> None:
	    """Inline equivalent of `setup_plotting_style`. Sentinel: brand-style-v3.

	    Registers the brand fonts, selects seaborn's "whitegrid" style and
	    "notebook" context, then overrides matplotlib rcParams for output,
	    fonts, axes, grid, ticks, legend and patches.

	    v2: bumped sizes ~25% + explicit medium weight (avoids ExtraLight default
	    that matplotlib picks from the Manrope variable file). Companion to the
	    static Manrope-{regular,medium,semibold,bold}.otf files in assets/fonts/.
	    """
	    _register_brand_fonts()
	    sns.set_style("whitegrid")
	    sns.set_context("notebook", font_scale=1.0)
	    plt.rcParams.update({
	        # Output: print-quality raster, tight box, transparent canvas.
	        "savefig.dpi": 300,
	        "savefig.bbox": "tight",
	        "figure.facecolor": "none",
	        "savefig.facecolor": "none",
	        # Fonts: brand typeface first, then generic fallbacks.
	        "font.family": "sans-serif",
	        "font.sans-serif": ["Manrope", "Outfit", "DejaVu Sans", "Liberation Sans", "Arial"],
	        "font.weight": "medium",
	        "font.size": 21,
	        # Axes and text.
	        "axes.labelsize": 25,
	        "axes.labelweight": "medium",
	        "axes.titlesize": 0,
	        "axes.titlepad": 0,
	        "axes.spines.top": False,
	        "axes.spines.right": False,
	        "axes.grid": True,
	        "axes.axisbelow": True,
	        "axes.edgecolor": BRAND_GRID,
	        "axes.labelcolor": BRAND_INK,
	        "axes.facecolor": "none",
	        "text.color": BRAND_INK,
	        # Grid.
	        "grid.alpha": 0.35,
	        "grid.linestyle": "-",
	        "grid.linewidth": 0.7,
	        "grid.color": BRAND_GRID,
	        # Ticks.
	        "xtick.labelsize": 20,
	        "ytick.labelsize": 20,
	        "xtick.color": BRAND_INK,
	        "ytick.color": BRAND_INK,
	        # Legend and patches.
	        "legend.frameon": False,
	        "legend.fontsize": 20,
	        "patch.edgecolor": "none",
	        "patch.linewidth": 0.0,
	    })


	GROUP_ORDER = ["UniProt", "GO", "HPA", "SURFY", "CSPA"]
	# Per-DB sequential ramps, dark → light = strict → permissive (points
	# are coloured in ascending-size order and the BRAND_SEQUENTIAL ramps
	# run darkest-first). Pulled from BRAND_SEQUENTIAL so the visual
	# identity matches the rest of the figure set.
	GROUP_RAMP = {
	"UniProt": [BRAND_SEQUENTIAL["maroon"][i] for i in (1, 2, 3, 4)],
	"GO": [BRAND_SEQUENTIAL["teal"][i] for i in (1, 2, 4)],
	"HPA": [BRAND_SEQUENTIAL["amber"][i] for i in (1, 3, 4)],
	"SURFY": [BRAND_SEQUENTIAL["lavender"][i] for i in (1, 2, 3, 4)],
	"CSPA": [BRAND_SEQUENTIAL["maroon"][i] for i in (1, 3, 4)], # CSPA is maroon-family too — distinct from UniProt by darkness pattern
	}
	RECOMMENDED_EDGE = BRAND_SEQUENTIAL["teal"][1] # teal-deep
	CANONICAL_EDGE = BRAND_INK

	# Short labels match the canonical generator.
	SHORT_LABEL = {
	"UniProt strict-only\n(4 subcell terms)": "strict-4",
	"UniProt baseline": "canonical",
	"UniProt TM-or-signal-or-surface\n(topology proxy)": "TM+signal",
	"UniProt permissive\n(incl. plain Cell membrane)": "permissive",
	"GO experimental+curated": "exp+curated",
	"GO baseline": "canonical",
	"GO permissive\n(incl. IEA-only)": "+ IEA",
	"HPA Enhanced-only\n(strictest tier)": "Enhanced",
	"HPA baseline": "canonical",
	"HPA + Uncertain tier": "+ Uncertain",
	"SURFY score>0.9": ">0.9",
	"SURFY score>0.7": ">0.7",
	"SURFY score>0.5": ">0.5",
	"SURFY baseline": "canonical",
	"CSPA high-conf only": "HC-only",
	"CSPA baseline": "canonical",
	"CSPA + unspecific": "+ unspecific",
	}


	def _fetch_tsv(url: str) -> pd.DataFrame:
	    """Load a tab-separated table, preferring a local checkout over the network.

	    When ``url`` is a raw-content URL for ``REPO``/``BRANCH`` and this
	    script has a fourth ancestor directory (``parents[3]`` — presumably
	    the repo root at the in-repo location; confirm), the URL's repo-relative
	    path is looked up under that directory and read from disk if the file
	    exists. Otherwise the table is downloaded with a 30-second timeout.

	    Args:
	        url: Address of the TSV file.

	    Returns:
	        The parsed table.

	    Raises:
	        httpx.HTTPStatusError: Via ``raise_for_status()`` when the download
	            returns an error status.
	    """
	    raw_prefix = f"https://raw.githubusercontent.com/{REPO}/{BRANCH}/"
	    ancestors = Path(__file__).resolve().parents
	    # Guard both assumptions of the local shortcut: a standalone copy may
	    # sit fewer than four directories deep (parents[3] would raise
	    # IndexError), and a URL outside this repo/branch has no meaningful
	    # repo-relative path to slice out.
	    if url.startswith(raw_prefix) and len(ancestors) > 3:
	        local = ancestors[3] / url.removeprefix(raw_prefix)
	        if local.is_file():
	            return pd.read_csv(local, sep="\t")

	    response = httpx.get(url, timeout=30)
	    response.raise_for_status()
	    return pd.read_csv(io.StringIO(response.text), sep="\t")


	def main() -> None:
	    """Render the per-database cutoff trade-off figure and save it as PDF and PNG.

	    Applies the brand style, loads the points table from ``POINTS_URL`` and
	    draws one panel per entry of ``GROUP_ORDER`` on a 2×3 grid; the sixth
	    cell holds the marker-shape legend. Each panel plots ``acc`` × 100
	    against ``size`` on a log x-axis and carries a monospace caption block
	    below the axes listing every cutoff variant.

	    Columns read from the table: ``group``, ``size``, ``acc``, ``label``,
	    ``pos``, ``neg`` and the ``recommended`` / ``canonical`` flags.
	    ``acc`` / ``pos`` / ``neg`` are multiplied by 100 for display, so they
	    are presumably fractions in [0, 1] — confirm against the TSV.

	    Writes ``db_cutoff_tradeoff.pdf`` and ``db_cutoff_tradeoff.png``
	    relative to the current working directory, embedding ``GIST_URL`` in
	    the file metadata, and prints a one-line summary.
	    """
	    _apply_brand_style()
	    df = _fetch_tsv(POINTS_URL)

	    # Wider + taller than v2's (13, 11): v2's panels were noticeably taller
	    # than wide (≈4.3"w × 5.5"h each). (19, 13) gives ≈6.3"w × 6.5"h —
	    # near-square. Extra height also makes room for the per-panel caption
	    # block to breathe under v2's larger caption fontsize.
	    fig, axes = plt.subplots(2, 3, figsize=(19, 13))
	    axes_flat = axes.flatten()

	    for gi, group in enumerate(GROUP_ORDER):
	        ax = axes_flat[gi]
	        # Ascending size = strictest cutoff first; this order drives both
	        # the connecting line and the ramp colour assigned to each point.
	        pts = df[df["group"] == group].sort_values("size").to_dict("records")
	        ramp = GROUP_RAMP.get(group, [BRAND_NEUTRAL])

	        # Strictness-ladder line.
	        ax.plot([p["size"] for p in pts], [p["acc"] * 100 for p in pts],
	                color=ramp[0], linewidth=1.6, alpha=0.5, zorder=2)

	        caption_lines = []
	        for idx, p in enumerate(pts):
	            # Clamp so a group with more points than ramp colours reuses
	            # the last colour instead of raising IndexError.
	            color = ramp[min(idx, len(ramp) - 1)]
	            # "recommended" wins over "canonical" when both are set.
	            if p.get("recommended"):
	                marker, msize, edge = "*", 480, RECOMMENDED_EDGE
	                marker_char = "★"
	            elif p.get("canonical"):
	                marker, msize, edge = "D", 200, CANONICAL_EDGE
	                marker_char = "◆"
	            else:
	                marker, msize, edge = "o", 130, "white"
	                marker_char = "●"
	            ax.scatter(p["size"], p["acc"] * 100,
	                       marker=marker, s=msize, color=color,
	                       edgecolor=edge, linewidth=1.6, zorder=4)
	            # Unmapped labels fall back to the raw label, flattened to one line.
	            short = SHORT_LABEL.get(p["label"], p["label"]).replace("\n", " ")
	            # pos/neg are scaled to percentages; the "*100" factors are
	            # required for the f-string to parse at all.
	            caption_lines.append(
	                f"{marker_char}  {short:<14}  n={int(p['size']):>5,}   +{p['pos']*100:.0f}/-{p['neg']*100:.0f}"
	            )

	        ax.set_xscale("log")
	        # The DB name is drawn inside the axes instead of as a title.
	        ax.text(0.02, 0.97, group, transform=ax.transAxes,
	                ha="left", va="top", fontsize=24, fontweight="bold",
	                color=ramp[0])
	        ax.set_ylim(25, 102)
	        xs = [p["size"] for p in pts]
	        if xs:
	            # Pad by a factor of 1.8 on both sides; never start below 40.
	            ax.set_xlim(max(40, min(xs) / 1.8), max(xs) * 1.8)
	        # Tick set must be computed AFTER set_xlim so the adaptive locator
	        # sees the actual panel range — narrow ranges fall back to a finer
	        # 1×/1.5×/2×/3×/5×/7× per-decade grid, aiming for ≥3 labels.
	        lo, hi = ax.get_xlim()
	        tick_positions = _nice_log_ticks(lo, hi, min_ticks=3)
	        ax.xaxis.set_major_locator(FixedLocator(tick_positions))
	        ax.xaxis.set_major_formatter(FuncFormatter(_log_tick_label))
	        ax.xaxis.set_minor_locator(NullLocator())
	        sns.despine(ax=ax, top=True, right=True)

	        # Per-panel caption block — one line per cutoff variant, marker
	        # char doubles as the visual key for matching to on-plot points.
	        # Monospace so columns align; tiny enough that even DejaVu Sans
	        # Mono fallback reads cleanly on systems without a brand mono.
	        ax.text(
	            0.0, -0.30, "\n".join(caption_lines),
	            transform=ax.transAxes,
	            ha="left", va="top",
	            fontsize=15, color=BRAND_INK,
	            family="monospace",
	        )

	    # 6th cell: marker-shape key only; the axes themselves are hidden.
	    legend_ax = axes_flat[5]
	    legend_ax.axis("off")
	    handles = [
	        plt.Line2D([], [], marker="o", linestyle="", color=BRAND_NEUTRAL,
	                   markersize=11, markeredgecolor="white",
	                   markeredgewidth=1.4, label="Alternative cutoff (not used)"),
	        plt.Line2D([], [], marker="D", linestyle="", color=BRAND_NEUTRAL,
	                   markersize=12, markeredgecolor=CANONICAL_EDGE,
	                   markeredgewidth=1.4, label="Canonical (current merge rule)"),
	        plt.Line2D([], [], marker="*", linestyle="", color=BRAND_NEUTRAL,
	                   markersize=20, markeredgecolor=RECOMMENDED_EDGE,
	                   markeredgewidth=1.4, label="Recommended after trade-off audit"),
	    ]
	    legend_ax.legend(handles=handles, loc="upper center", fontsize=20,
	                     frameon=False, title="Marker shape",
	                     title_fontsize=21)

	    fig.supxlabel("Universe size — proteins this filter would admit "
	                  "(log scale; lower = stricter)", fontsize=20, y=0.02,
	                  color=BRAND_INK)
	    # supylabel y defaults to 0.5 (figure midline) which is what reads as
	    # "centered" against the supxlabel's default x=0.5. Earlier overrides
	    # to 0.54 (axes-content midpoint) and the dynamic equivalent were
	    # technically correct but visually off — keep the matplotlib default.
	    # x=0.02 just keeps the rotated label off the canvas-left edge under
	    # brand-style-v3's larger axes.labelsize.
	    fig.supylabel("Accuracy on\n147-gene benchmark (%)", fontsize=20,
	                  x=0.02, color=BRAND_INK)
	    # Order matters: tight_layout first, then widen hspace on top of it.
	    plt.tight_layout(rect=[0.04, 0.03, 1, 0.985])
	    # Vertical gap between row-1 panels (with caption blocks BELOW them at
	    # axes-y=-0.30) and row-2 panels. Without this, row-1 captions
	    # crash into row-2 axes / row-2 captions extend into the supxlabel.
	    # Bumped 0.95 → 1.40 to give room for the now-larger (19, 13) figure +
	    # the recommended-cutoff line (★) below the canonical line (◆) in
	    # each panel.
	    plt.subplots_adjust(hspace=1.40)

	    # Outputs land in the current working directory; the gist URL is
	    # stored under the metadata key each format uses here.
	    out_pdf = Path("db_cutoff_tradeoff.pdf")
	    out_png = Path("db_cutoff_tradeoff.png")
	    fig.savefig(out_pdf, bbox_inches="tight", metadata={"Subject": GIST_URL})
	    fig.savefig(out_png, bbox_inches="tight", dpi=300, metadata={"Source": GIST_URL})
	    print(f"Wrote {out_pdf} + {out_png}  ({len(df)} cutoff variants across {df['group'].nunique()} DBs)")


	if __name__ == "__main__":
	main()
No results found