Skip to content

Instantly share code, notes, and snippets.

@beccajcarlson
Last active June 8, 2026 03:11
Show Gist options
  • Select an option

  • Save beccajcarlson/a4526c9e6de5e958826bf1d764744c1b to your computer and use it in GitHub Desktop.

Select an option

Save beccajcarlson/a4526c9e6de5e958826bf1d764744c1b to your computer and use it in GitHub Desktop.
Zero-DB rescues by triage — what the Sonnet+NCBI agent catches that classical surface DBs miss

Zero-DB rescues by triage — what the agent catches that classical surface DBs miss

Whole-genome view of the genes the Sonnet (+ NCBI) triage agent flags as surface-accessible despite none of the five classical surface DBs (UniProt / GO CC / HPA / SURFY / CSPA) voting "yes." Two grouped bar panels on a shared y-axis show per-reason counts within each verdict bucket:

  • yes — definite surface, by triage agent's verdict=yes
  • contextual — state / lineage / partner-dependent surface display, by triage agent's verdict=contextual

Beneath each panel, select gene callouts illustrate the kind of biology the triage agent surfaces in each verdict bucket. Each callout's reason is verified at runtime against the catalog's sonnet_reason column for that gene within the rescue slice — any mismatch raises a RuntimeError.

Run:

uv run make_zero_db_rescues_by_triage.py

Source: the whole-proteome catalog TSV — data/processed/catalog/whole_proteome_catalog.tsv. 19,324 protein-coding human genes; each row carries the five *_surface_flag fields + n_sources_surface + the canonical Sonnet sonnet_verdict and sonnet_reason. The rescue slice = n_sources_surface == 0 AND sonnet_verdict ∈ {yes, contextual}. Sourced from D1 by scripts/export_whole_proteome_catalog_to_tsv.py.

Canonical in-repo generator: scripts/zero_db_rescues_by_triage.py.

# /// script
# requires-python = ">=3.11"
# dependencies = [
# "matplotlib>=3.9",
# "seaborn>=0.13",
# "pandas>=2.0",
# ]
# ///
"""Reproduce ``zero_db_rescues_by_triage.{pdf,png}`` from the public catalog.

Whole-genome view of zero-DB rescues: genes where no classical surface
DB (UniProt / GO CC / HPA / SURFY / CSPA) voted yes, yet the Sonnet+NCBI
triage agent voted `yes` (definite surface) or `contextual` (state/lineage
dependent). Two grouped bar panels on a shared y-axis: per-reason counts
within each verdict bucket. Beneath each panel, hand-picked select gene
callouts illustrate the kind of biology the triage agent surfaces.

Data: read from the in-repo whole-proteome catalog TSV
(``data/processed/catalog/whole_proteome_catalog.tsv``, un-LFS so
raw.githubusercontent.com serves it as plain text). The on-disk copy is
used when the script runs inside a checkout; otherwise the TSV is fetched
from raw.githubusercontent.com. Source of truth:
D1 ``candidate_universe_public`` + ``triage_run_public`` (canonical
sweep ``genome_full_sonnet_ncbi_v2``); the TSV is regenerated by
``scripts/export_whole_proteome_catalog_to_tsv.py`` whenever the
universe / triage state changes.

Visual styling matches the in-repo ``_plotting_config`` (Deliverome
categorical palette + Manrope-when-available). Inlined so the gist runs
standalone.

Standalone — ``uv run make_zero_db_rescues_by_triage.py``.
"""
from __future__ import annotations
import csv
import io
import urllib.request
from collections import Counter, defaultdict
from pathlib import Path
import matplotlib.font_manager as fm
import matplotlib.gridspec as gridspec
import matplotlib.pyplot as plt
import seaborn as sns
# GitHub coordinates of the repository that hosts the catalog TSV.
REPO = "Deliverome-Project/accessible-surfaceome"
BRANCH = "main"

# Raw-content URL of the whole-proteome catalog on that repo/branch; used
# when no on-disk copy of the TSV is available.
CATALOG_TSV_URL = (
    "https://raw.githubusercontent.com"
    f"/{REPO}/{BRANCH}"
    "/data/processed/catalog/whole_proteome_catalog.tsv"
)

# Published reproduction gist (embedded into output PNG Source / PDF
# Subject metadata — mirrors save_figure in _plotting_config.py).
GIST_URL = "https://gist.github.com/beccajcarlson/a4526c9e6de5e958826bf1d764744c1b"
# ──── Inline brand styling — sentinel: brand-style-v3 ────
# Mirrors src/accessible_surfaceome/audit/_plotting_config.py so the gist
# stays self-contained. Kept in sync via tests/test_figure_gists_styling.py.

# Categorical brand colours, one hex value per named swatch.
# NOTE(review): not referenced by the code visible in this script —
# presumably kept for parity with _plotting_config; confirm before pruning.
BRAND_PALETTE = [
    "#BC3C4C",  # maroon-light
    "#3D6B60",  # teal-mid
    "#F4AA28",  # amber-bright
    "#8878C8",  # lavender-bright
    "#6E1428",  # maroon-dark
    "#7AAB9F",  # teal-light
]

# Six-step ramp per brand hue, listed from darkest to lightest.
BRAND_SEQUENTIAL = {
    "maroon": ["#3E0A18", "#6E1428", "#922038", "#BC3C4C", "#F0A098", "#FDE8E6"],
    "teal": ["#152E28", "#244840", "#3D6B60", "#4D8A80", "#7AAB9F", "#CCE8E4"],
    "amber": ["#5A2608", "#8C4210", "#C07830", "#F4AA28", "#F4C070", "#FAECD4"],
    "lavender": ["#1E1450", "#3A2888", "#5848A8", "#8878C8", "#A090D4", "#E4E0F8"],
}

# Single-purpose colours: BRAND_INK is the default text / tick / label
# colour, BRAND_NEUTRAL the muted text colour (callout titles and
# descriptions), BRAND_GRID the grid-line and axes-edge colour.
BRAND_CLAUDE_ORANGE = "#d87851"
BRAND_INK = "#1F1718"
BRAND_NEUTRAL = "#6F5D5A"
BRAND_GRID = "#E6DAD4"
def _register_brand_fonts() -> None:
    """Register bundled brand fonts with matplotlib, if a fonts folder exists.

    Candidate folders, in order: ``assets/fonts`` three directories above
    this script (presumably the repository root when run from a checkout),
    then ``assets/fonts`` under the current working directory. Every
    ``*.ttf`` / ``*.otf`` file in the first folder that exists is handed to
    matplotlib's font manager; later candidates are then not consulted.

    Best-effort by design: a font file that fails to load is skipped, and
    when no candidate folder exists nothing is registered.
    """
    candidates: list[Path] = []

    # A standalone copy of the script may sit fewer than four levels below
    # the filesystem root (e.g. ``/tmp/script.py``); indexing ``parents[3]``
    # unconditionally raises IndexError there, so only add that candidate
    # when the ancestor actually exists.
    script_parents = Path(__file__).resolve().parents
    if len(script_parents) > 3:
        candidates.append(script_parents[3] / "assets" / "fonts")
    candidates.append(Path.cwd() / "assets" / "fonts")

    for fonts_dir in candidates:
        if not fonts_dir.is_dir():
            continue
        font_files = sorted([*fonts_dir.glob("*.ttf"), *fonts_dir.glob("*.otf")])
        for font_path in font_files:
            try:
                fm.fontManager.addfont(str(font_path))
            except Exception:  # noqa: BLE001
                # Deliberate best-effort: an unreadable font must not stop
                # the figure from rendering with a fallback family.
                continue
        # First existing folder wins.
        return
def _apply_brand_style() -> None:
    """Apply the brand look to seaborn / matplotlib. Sentinel: brand-style-v3.

    Inline equivalent of `setup_plotting_style`: registers the brand fonts,
    selects seaborn's ``whitegrid`` style and ``notebook`` context, then
    layers the brand rcParams on top.

    v2: bumped sizes ~25% + explicit medium weight (avoids the ExtraLight
    default that matplotlib picks from the Manrope variable file). Companion
    to the static Manrope-{regular,medium,semibold,bold}.otf files in
    assets/fonts/.
    """
    _register_brand_fonts()
    sns.set_style("whitegrid")
    sns.set_context("notebook", font_scale=1.0)

    # Output resolution and transparent backgrounds.
    canvas = {
        "savefig.dpi": 300,
        "savefig.bbox": "tight",
        "figure.facecolor": "none",
        "savefig.facecolor": "none",
    }
    # Font family fallback chain, weight and base size.
    typography = {
        "font.family": "sans-serif",
        "font.sans-serif": ["Manrope", "Outfit", "DejaVu Sans", "Liberation Sans", "Arial"],
        "font.weight": "medium",
        "font.size": 21,
    }
    # Axes labels, titles, spines and colours.
    axes = {
        "axes.labelsize": 25,
        "axes.labelweight": "medium",
        "axes.titlesize": 0,
        "axes.titlepad": 0,
        "axes.spines.top": False,
        "axes.spines.right": False,
        "axes.grid": True,
        "axes.axisbelow": True,
        "axes.edgecolor": BRAND_GRID,
        "axes.labelcolor": BRAND_INK,
        "axes.facecolor": "none",
        "text.color": BRAND_INK,
    }
    grid = {
        "grid.alpha": 0.35,
        "grid.linestyle": "-",
        "grid.linewidth": 0.7,
        "grid.color": BRAND_GRID,
    }
    ticks = {
        "xtick.labelsize": 20,
        "ytick.labelsize": 20,
        "xtick.color": BRAND_INK,
        "ytick.color": BRAND_INK,
    }
    legend_and_patches = {
        "legend.frameon": False,
        "legend.fontsize": 20,
        "patch.edgecolor": "none",
        "patch.linewidth": 0.0,
    }

    # Merged in the original key order and applied in a single update call.
    plt.rcParams.update(
        canvas | typography | axes | grid | ticks | legend_and_patches
    )
# Reason taxonomy per verdict bucket (matches the triage agent's closed enum).
# List order is the left-to-right bar order of the corresponding panel.
# ``other`` is also the fallback main() assigns when a row's
# ``sonnet_reason`` is empty.
YES_REASONS = [
    "classical_surface_receptor",
    "multipass_with_exposed_loops",
    "gpi_anchored",
    "stable_complex_partner",
    "other",
]
CONTEXTUAL_REASONS = [
    "dual_localization",
    "tissue_restricted_surface",
    "stable_surface_attachment",
    "cell_state_induced",
    "lysosomal_exocytosis",
    "other",
]

# Human-readable label for every reason in either bucket. The labels are
# pre-wrapped with newlines; _draw_reason_bars flattens them to one line
# because it draws the tick labels rotated.
REASON_LABEL = {
    "classical_surface_receptor": "classical\nsurface\nreceptor",
    "multipass_with_exposed_loops": "multipass\nw/ exposed\nloops",
    "gpi_anchored": "GPI-\nanchored",
    "stable_complex_partner": "stable\ncomplex\npartner",
    "dual_localization": "dual\nlocalization",
    "tissue_restricted_surface": "tissue-\nrestricted\nsurface",
    "stable_surface_attachment": "stable\nsurface\nattachment",
    "cell_state_induced": "cell-state\ninduced",
    "lysosomal_exocytosis": "lysosomal\nexocytosis",
    "other": "other",
}
# Sequential green ramp for yes (definite surface); amber ramp for contextual
# (state / lineage / partner dependent). `other` neutral grey.
# Keys must cover every reason in YES_REASONS / CONTEXTUAL_REASONS:
# _draw_reason_bars indexes the palette directly for each drawn bar.
YES_PALETTE = {
    "classical_surface_receptor": "#2E7A55",
    "multipass_with_exposed_loops": "#4D8A80",
    "gpi_anchored": "#7AAB9F",
    "stable_complex_partner": "#A8C8C0",
    "other": "#6F5D5A",
}
# The five named reasons reuse the amber BRAND_SEQUENTIAL steps 2-6
# (dark → light); `other` reuses the BRAND_NEUTRAL hex value.
CONTEXTUAL_PALETTE = {
    "dual_localization": "#8C4210",
    "tissue_restricted_surface": "#C07830",
    "stable_surface_attachment": "#F4AA28",
    "cell_state_induced": "#F4C070",
    "lysosomal_exocytosis": "#FAECD4",
    "other": "#6F5D5A",
}

# Panel title / count-label colour per bucket: the darkest swatch of the
# matching ramp above.
YES_HEADER_COLOR = "#2E7A55"
CONTEXTUAL_HEADER_COLOR = "#8C4210"
# Ordered to match the per-reason bar order in the panel above
# (YES_REASONS / CONTEXTUAL_REASONS), so callouts read top-to-bottom in
# the same sequence as the bars left-to-right.
#
# Each entry is (gene symbol, one-line description, reason). main() checks
# that every symbol appears in the rescue slice under exactly this reason
# and raises otherwise; the reason also selects the swatch colour drawn
# next to the callout.
YES_CALLOUTS = [
    ("PVRIG", "NK/T checkpoint; COM701 = anti-PVRIG surface mAb", "classical_surface_receptor"),
    ("ECEL1", "Type-II TM; neprilysin/M13 family", "classical_surface_receptor"),
    ("STEAP1", "Prostate ADC + BiTE target", "multipass_with_exposed_loops"),
    ("ORAI2", "Store-operated Ca2+ channel", "multipass_with_exposed_loops"),
    ("CRIPTO", "GPI-anchored oncofetal antigen", "gpi_anchored"),
    ("LY96", "MD-2 — TLR4 co-receptor", "stable_complex_partner"),
]
CONTEXTUAL_CALLOUTS = [
    ("IL15", "Surface trans-presentation via IL-15Rα", "dual_localization"),
    ("TIMP2", "MT1-MMP ternary complex", "stable_surface_attachment"),
    ("GSDMD", "Gasdermin D/E/C — pyroptosis pores", "cell_state_induced"),
    ("HSPA1A", "Surface Hsp70; cmHsp70.1 mAb", "cell_state_induced"),
    ("HSP90B1", "Surface GRP94 in tumor cells", "cell_state_induced"),
    ("HPSE", "Surface heparanase on activated platelets / tumor cells",
     "lysosomal_exocytosis"),
]
def _local_catalog_path(script_path: Path) -> Path | None:
    """Return the checkout-relative catalog path for *script_path*, or None when the script is too shallow to have a third-level ancestor."""
    ancestors = script_path.resolve().parents
    if len(ancestors) <= 3:
        return None
    return ancestors[3] / "data/processed/catalog/whole_proteome_catalog.tsv"


def _load_catalog() -> list[dict]:
    """Return the whole-proteome catalog rows from the in-repo TSV.

    Each row has the v1-style expanded columns: ``hgnc_symbol``,
    ``uniprot_acc``, the five ``*_surface_flag`` fields,
    ``n_sources_surface``, ``sonnet_verdict``, ``sonnet_reason``,
    plus stable IDs. Sourced from D1 via
    ``scripts/export_whole_proteome_catalog_to_tsv.py``.

    Returns:
        One dict per TSV row, keyed by column header. All values are the raw
        strings from the file except ``n_sources_surface``, which is
        converted to ``int`` (a missing or empty value becomes 0).

    Raises:
        ValueError: if a non-empty ``n_sources_surface`` is not an integer.
        urllib.error.URLError: if no local copy exists and the download fails.
    """
    # Local-first: inside a repo checkout, prefer the on-disk TSV so the
    # script renders against the working tree (matches the local-fallback
    # pattern used by the other figure scripts in this folder).
    # A standalone copy may sit too close to the filesystem root for the
    # checkout-relative path to exist at all; treat that as "no local copy"
    # instead of letting ``parents[3]`` raise IndexError before the remote
    # fallback can run.
    local = _local_catalog_path(Path(__file__))
    if local is not None and local.is_file():
        print(f"Reading {local} ...")
        text = local.read_text(encoding="utf-8")
    else:
        print(f"Fetching {CATALOG_TSV_URL} ...")
        with urllib.request.urlopen(CATALOG_TSV_URL, timeout=60) as resp:  # noqa: S310
            text = resp.read().decode("utf-8")

    rows = list(csv.DictReader(io.StringIO(text), delimiter="\t"))
    for row in rows:
        # ``or 0`` covers both an absent column and an empty cell.
        row["n_sources_surface"] = int(row.get("n_sources_surface", 0) or 0)
    return rows
def _draw_reason_bars(ax, counts, reasons, palette, header_label, header_color, y_max):
    """Draw one verdict bucket's per-reason bar panel on *ax*.

    Args:
        ax: Matplotlib axes to draw on.
        counts: Mapping of reason -> gene count for this bucket.
        reasons: Reason keys in display order. Reasons whose count is zero
            or missing get no bar; reasons present in *counts* but absent
            from this list are not drawn either.
        palette: Mapping of reason -> bar colour; must contain every drawn
            reason.
        header_label: Panel title, left-aligned above the bars.
        header_color: Colour of the title and of the count labels.
        y_max: Upper y-limit of the panel; only used here to scale the gap
            between a bar top and its count label.
    """
    visible = [reason for reason in reasons if counts.get(reason, 0) > 0]
    heights = [counts[reason] for reason in visible]
    colors = [palette[reason] for reason in visible]

    # Bars sit on a stretched integer grid so the rotated tick labels below
    # them have room.
    bar_spacing = 1.4
    x_positions = [index * bar_spacing for index in range(len(visible))]

    ax.bar(
        x_positions, heights,
        color=colors, edgecolor="white",
        linewidth=1.2, width=0.55 * bar_spacing,
    )
    # Count label just above each bar.
    for x, height in zip(x_positions, heights, strict=True):
        ax.text(
            x, height + y_max * 0.015,
            f"{height}",
            ha="center", va="bottom",
            fontsize=25, fontweight="bold", color=header_color,
        )
    ax.set_title(
        header_label,
        fontsize=25, color=header_color, fontweight="bold",
        loc="left", pad=16,
    )
    ax.set_xticks(x_positions)
    # Flatten the pre-wrapped REASON_LABEL strings (\n → space) so the
    # rotated labels read on one line; multi-line + rotation looks broken.
    ax.set_xticklabels(
        [REASON_LABEL[reason].replace("\n", " ") for reason in visible],
        fontsize=18, color=BRAND_INK,
        rotation=30, ha="right", rotation_mode="anchor",
    )
    ax.tick_params(axis="y", labelsize=20)
    # A bucket with no positive counts has no bars; fall back to a window
    # around x = 0 instead of indexing an empty list (IndexError).
    last_x = x_positions[-1] if x_positions else 0.0
    ax.set_xlim(-0.9, last_x + 0.9)
    sns.despine(ax=ax, top=True, right=True)
def _draw_callouts(ax, callouts, palette, title):
    """Render a titled, vertically stacked list of gene callouts on *ax*.

    Each callout is a ``(symbol, description, reason)`` tuple drawn as one
    row: a square swatch coloured by ``palette[reason]`` (neutral when the
    reason has no palette entry), the bold symbol, then the description
    prefixed with an em dash. The axes frame is hidden; everything is
    positioned in axes coordinates.
    """
    ax.set_axis_off()
    in_axes = {"transform": ax.transAxes, "ha": "left"}

    ax.text(
        0.0, 1.0, title,
        va="top", fontsize=24, color=BRAND_NEUTRAL, fontweight="bold",
        **in_axes,
    )

    # Rows start at ``top_y`` and are spread down towards y = 0.05, with the
    # step capped so short lists stay compact. A single row needs no step.
    top_y = 0.82
    n_rows = len(callouts)
    if n_rows > 1:
        step = min((top_y - 0.05) / (n_rows - 1), 0.155)
    else:
        step = 0.0

    for row_index, (symbol, desc, reason) in enumerate(callouts):
        row_y = top_y - row_index * step
        swatch_color = palette.get(reason, BRAND_NEUTRAL)
        ax.scatter(
            [0.025], [row_y],
            marker="s", s=260,
            color=swatch_color,
            edgecolor="none", transform=ax.transAxes, zorder=10,
        )
        ax.text(
            0.07, row_y, symbol,
            va="center", fontsize=21, fontweight="bold", color=BRAND_INK,
            **in_axes,
        )
        ax.text(
            0.26, row_y, f"— {desc}",
            va="center", fontsize=15, color=BRAND_NEUTRAL,
            **in_axes,
        )
def _tally_rescues(zero_db_rows):
    """Count zero-DB rows per triage reason and collect their symbols, split into the ``yes`` and ``contextual`` verdict buckets."""
    yes_counts: Counter = Counter()
    ctx_counts: Counter = Counter()
    yes_symbols: dict[str, list[str]] = defaultdict(list)
    ctx_symbols: dict[str, list[str]] = defaultdict(list)
    for row in zero_db_rows:
        # Empty / missing verdicts become "unknown" (never plotted); empty /
        # missing reasons fall into the "other" bar.
        verdict = (row.get("sonnet_verdict") or "").strip() or "unknown"
        reason = (row.get("sonnet_reason") or "").strip() or "other"
        symbol = row.get("hgnc_symbol", "")
        if verdict == "yes":
            yes_counts[reason] += 1
            yes_symbols[reason].append(symbol)
        elif verdict == "contextual":
            ctx_counts[reason] += 1
            ctx_symbols[reason].append(symbol)
    return yes_counts, ctx_counts, yes_symbols, ctx_symbols


def _missing_callouts(bucket, callouts, symbols_by_reason):
    """Return ``(bucket, symbol, reason)`` for each callout whose symbol is not listed under its stated reason."""
    return [
        (bucket, symbol, reason)
        for symbol, _, reason in callouts
        if symbol not in symbols_by_reason.get(reason, [])
    ]


def main() -> None:
    """Build the two-panel rescue figure and write it as PDF and PNG.

    Loads the catalog, keeps rows no classical surface DB voted for
    (``n_sources_surface == 0``), tallies them per triage reason within the
    ``yes`` and ``contextual`` verdicts, verifies the hand-picked callouts
    against that tally, then draws the bar panels with the callout lists
    beneath and saves ``zero_db_rescues_by_triage.{pdf,png}`` in the current
    working directory.

    Raises:
        RuntimeError: if the catalog has no rows, or if any callout's symbol
            is not in the rescue slice under its stated reason.
    """
    _apply_brand_style()
    rows = _load_catalog()
    print(f" loaded {len(rows):,} rows; sonnet = claude-sonnet-4-6")
    # An empty catalog would otherwise surface as a bare ZeroDivisionError
    # in the percentage below.
    if not rows:
        raise RuntimeError("Catalog contains no rows; nothing to plot.")

    zero_db = [r for r in rows if r["n_sources_surface"] == 0]
    print(f"\nZero-DB universe: {len(zero_db):,} / {len(rows):,} "
          f"({100*len(zero_db)/len(rows):.1f}%)")

    yes_counts, ctx_counts, yes_symbols, ctx_symbols = _tally_rescues(zero_db)
    n_yes = sum(yes_counts.values())
    n_ctx = sum(ctx_counts.values())
    print(f"\nRescues: yes={n_yes}, contextual={n_ctx}")

    # The panel titles report n over every reason, but bars are only drawn
    # for reasons in the plotted taxonomy. Flag any reason that would make
    # the bars sum to less than the title's n.
    for bucket, counts, plotted in (
        ("yes", yes_counts, YES_REASONS),
        ("contextual", ctx_counts, CONTEXTUAL_REASONS),
    ):
        stray = {reason: n for reason, n in counts.items() if reason not in plotted}
        if stray:
            print(f"WARNING: {bucket} reasons outside the plotted taxonomy "
                  f"(counted in n, no bar drawn): {stray}")

    # Validate callouts.
    bad = (
        _missing_callouts("yes", YES_CALLOUTS, yes_symbols)
        + _missing_callouts("contextual", CONTEXTUAL_CALLOUTS, ctx_symbols)
    )
    if bad:
        raise RuntimeError(f"Callouts not found in rescue slice: {bad}")

    fig = plt.figure(figsize=(19, 13))
    gs = gridspec.GridSpec(
        2, 2, figure=fig,
        # v2: bumped callout-row weight (1.05 → 1.55) + figure height
        # (11 → 13) so the per-row callout text (larger under brand-style-v3)
        # has room to breathe instead of crashing into the bar panels.
        height_ratios=[2.2, 1.55],
        width_ratios=[1.0, 1.5],
        hspace=0.55, wspace=0.28,
        top=0.93, bottom=0.04, left=0.06, right=0.97,
    )
    ax_yes = fig.add_subplot(gs[0, 0])
    ax_ctx = fig.add_subplot(gs[0, 1], sharey=ax_yes)
    ax_callouts_yes = fig.add_subplot(gs[1, 0])
    ax_callouts_ctx = fig.add_subplot(gs[1, 1])

    # Subpanel labels — Manrope ExtraBold (weight 800), upper-left of each
    # bar panel. Convention: lowercase letters, axis coordinates so they
    # track the panel through resize.
    for ax, letter in ((ax_yes, "a"), (ax_ctx, "b")):
        ax.text(
            -0.06, 1.08, letter,
            transform=ax.transAxes, ha="left", va="bottom",
            fontsize=32, fontweight=800, color=BRAND_INK,
        )

    # Shared y-range: tallest bar in either bucket plus headroom for the
    # count labels drawn above the bars.
    max_count = max(
        max(yes_counts.values(), default=0),
        max(ctx_counts.values(), default=0),
    )
    y_max = max_count * 1.18
    ax_yes.set_ylim(0, y_max)

    _draw_reason_bars(
        ax_yes, yes_counts, YES_REASONS, YES_PALETTE,
        header_label=f"yes — definite surface (n = {n_yes})",
        header_color=YES_HEADER_COLOR, y_max=y_max,
    )
    _draw_reason_bars(
        ax_ctx, ctx_counts, CONTEXTUAL_REASONS, CONTEXTUAL_PALETTE,
        header_label=f"contextual — state / lineage dependent (n = {n_ctx})",
        header_color=CONTEXTUAL_HEADER_COLOR, y_max=y_max,
    )
    ax_yes.set_ylabel("Genes rescued from\nzero-DB universe", fontsize=25)
    ax_yes.tick_params(axis="y", labelsize=20)
    # The y-axis is shared, so only the left panel shows tick labels.
    plt.setp(ax_ctx.get_yticklabels(), visible=False)

    _draw_callouts(ax_callouts_yes, YES_CALLOUTS, YES_PALETTE, title="Select yes rescues")
    _draw_callouts(
        ax_callouts_ctx, CONTEXTUAL_CALLOUTS, CONTEXTUAL_PALETTE,
        title="Select contextual rescues",
    )

    out_pdf = Path("zero_db_rescues_by_triage.pdf")
    out_png = Path("zero_db_rescues_by_triage.png")
    fig.savefig(out_pdf, bbox_inches="tight", metadata={"Subject": GIST_URL})
    fig.savefig(out_png, bbox_inches="tight", dpi=300, metadata={"Source": GIST_URL})
    # Release the figure once both files are written.
    plt.close(fig)
    print(f"Wrote {out_pdf} + {out_png}")


if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment