|
# /// script
# requires-python = ">=3.11"
# dependencies = [
#   "matplotlib>=3.9",
#   "seaborn>=0.13",
#   "pandas>=2.0",
# ]
# ///
"""Reproduce ``zero_db_rescues_by_triage.{pdf,png}`` from the public catalog.

Whole-genome view of zero-DB rescues: genes where no classical surface
DB (UniProt / GO CC / HPA / SURFY / CSPA) voted yes, yet the Sonnet+NCBI
triage agent voted ``yes`` (definite surface) or ``contextual`` (state/lineage
dependent). Two grouped bar panels on a shared y-axis show per-reason counts
within each verdict bucket. Beneath each panel, hand-picked gene
callouts illustrate the kind of biology the triage agent surfaces.

Data: read from the in-repo whole-proteome catalog TSV
(``data/processed/catalog/whole_proteome_catalog.tsv``, kept out of Git LFS so
raw.githubusercontent.com serves it as plain text). Source of truth:
D1 ``candidate_universe_public`` + ``triage_run_public`` (canonical
sweep ``genome_full_sonnet_ncbi_v2``); the TSV is regenerated by
``scripts/export_whole_proteome_catalog_to_tsv.py`` whenever the
universe / triage state changes.

Visual styling matches the in-repo ``_plotting_config`` (Deliverome
categorical palette + Manrope-when-available). Inlined so the gist runs
standalone.

Standalone — ``uv run make_zero_db_rescues_by_triage.py``.
"""
|
from __future__ import annotations |
|
|
|
import csv |
|
import io |
|
import urllib.request |
|
from collections import Counter, defaultdict |
|
from pathlib import Path |
|
|
|
import matplotlib.font_manager as fm |
|
import matplotlib.gridspec as gridspec |
|
import matplotlib.pyplot as plt |
|
import seaborn as sns |
|
|
|
# GitHub coordinates of the public catalog; the raw-content URL is assembled
# from them so a fork or branch switch only needs one edit.
REPO = "Deliverome-Project/accessible-surfaceome"
BRANCH = "main"
CATALOG_TSV_URL = "/".join(
    (
        "https://raw.githubusercontent.com",
        REPO,
        BRANCH,
        "data/processed/catalog/whole_proteome_catalog.tsv",
    )
)

# Published reproduction gist. Written into the output files' metadata
# (PNG "Source", PDF "Subject"), mirroring save_figure in _plotting_config.py.
GIST_URL = "https://gist.github.com/beccajcarlson/a4526c9e6de5e958826bf1d764744c1b"
|
|
|
# ──── Inline brand styling — sentinel: brand-style-v3 ────
# Copy of src/accessible_surfaceome/audit/_plotting_config.py so the gist
# stays self-contained. Kept in sync via tests/test_figure_gists_styling.py,
# so the literal values below must not drift.

# Categorical palette, in assignment order.
BRAND_PALETTE = [
    "#BC3C4C",  # maroon-light
    "#3D6B60",  # teal-mid
    "#F4AA28",  # amber-bright
    "#8878C8",  # lavender-bright
    "#6E1428",  # maroon-dark
    "#7AAB9F",  # teal-light
]

# Six-step ramps per hue, listed darkest first.
BRAND_SEQUENTIAL = {
    "maroon": [
        "#3E0A18", "#6E1428", "#922038",
        "#BC3C4C", "#F0A098", "#FDE8E6",
    ],
    "teal": [
        "#152E28", "#244840", "#3D6B60",
        "#4D8A80", "#7AAB9F", "#CCE8E4",
    ],
    "amber": [
        "#5A2608", "#8C4210", "#C07830",
        "#F4AA28", "#F4C070", "#FAECD4",
    ],
    "lavender": [
        "#1E1450", "#3A2888", "#5848A8",
        "#8878C8", "#A090D4", "#E4E0F8",
    ],
}

# Single-purpose colours: accent, text ink, secondary text, grid/spine lines.
BRAND_CLAUDE_ORANGE = "#d87851"
BRAND_INK = "#1F1718"
BRAND_NEUTRAL = "#6F5D5A"
BRAND_GRID = "#E6DAD4"
|
|
|
|
|
def _register_brand_fonts() -> None:
    """Register bundled brand fonts with matplotlib when a fonts folder exists.

    Two locations are probed, in order: ``assets/fonts`` three directory
    levels above this file (the in-repo layout), then ``assets/fonts`` under
    the current working directory. Every ``*.ttf`` / ``*.otf`` in the first
    folder that exists is passed to ``fm.fontManager.addfont``; the second
    location is not consulted once one has been found. If neither exists,
    nothing is registered.
    """
    ancestors = Path(__file__).resolve().parents
    candidates = []
    # A standalone copy (e.g. the gist saved to ~/script.py) has fewer than
    # four ancestors, and ``parents[3]`` raises IndexError there. Only probe
    # the repo-root location when that ancestor actually exists.
    if len(ancestors) > 3:
        candidates.append(ancestors[3] / "assets" / "fonts")
    candidates.append(Path.cwd() / "assets" / "fonts")

    for fonts_dir in candidates:
        if not fonts_dir.is_dir():
            continue
        font_files = sorted([*fonts_dir.glob("*.ttf"), *fonts_dir.glob("*.otf")])
        for path in font_files:
            try:
                fm.fontManager.addfont(str(path))
            except Exception:  # noqa: BLE001
                # Best effort: one unreadable font file must not abort the figure.
                continue
        return
|
|
|
|
|
def _apply_brand_style() -> None:
    """Apply the figure style inline. Sentinel: brand-style-v3.

    Inline equivalent of ``setup_plotting_style``: registers the brand fonts,
    applies seaborn's ``whitegrid`` style and ``notebook`` context, then
    overrides matplotlib rcParams with the values below.

    v2: bumped sizes ~25% + explicit medium weight (avoids the ExtraLight
    default that matplotlib picks from the Manrope variable file). Companion
    to the static Manrope-{regular,medium,semibold,bold}.otf files in
    assets/fonts/.
    """
    _register_brand_fonts()
    sns.set_style("whitegrid")
    sns.set_context("notebook", font_scale=1.0)

    # Transparent canvas, tight 300-dpi output.
    canvas = {
        "savefig.dpi": 300,
        "savefig.bbox": "tight",
        "figure.facecolor": "none",
        "savefig.facecolor": "none",
    }
    # Font stack with fallbacks for machines without the brand fonts.
    fonts = {
        "font.family": "sans-serif",
        "font.sans-serif": ["Manrope", "Outfit", "DejaVu Sans", "Liberation Sans", "Arial"],
        "font.weight": "medium",
        "font.size": 21,
    }
    # Axes: default title size zeroed, top/right spines off, grid below data.
    axes = {
        "axes.labelsize": 25,
        "axes.labelweight": "medium",
        "axes.titlesize": 0,
        "axes.titlepad": 0,
        "axes.spines.top": False,
        "axes.spines.right": False,
        "axes.grid": True,
        "axes.axisbelow": True,
        "axes.edgecolor": BRAND_GRID,
        "axes.labelcolor": BRAND_INK,
        "axes.facecolor": "none",
    }
    text_and_grid = {
        "text.color": BRAND_INK,
        "grid.alpha": 0.35,
        "grid.linestyle": "-",
        "grid.linewidth": 0.7,
        "grid.color": BRAND_GRID,
    }
    ticks = {
        "xtick.labelsize": 20,
        "ytick.labelsize": 20,
        "xtick.color": BRAND_INK,
        "ytick.color": BRAND_INK,
    }
    legend_and_patches = {
        "legend.frameon": False,
        "legend.fontsize": 20,
        "patch.edgecolor": "none",
        "patch.linewidth": 0.0,
    }

    plt.rcParams.update(
        {**canvas, **fonts, **axes, **text_and_grid, **ticks, **legend_and_patches}
    )
|
|
|
|
|
# Reason taxonomy per verdict bucket (matches the triage agent's closed enum).
# List order is the left-to-right bar order in each panel.
YES_REASONS = [
    "classical_surface_receptor",
    "multipass_with_exposed_loops",
    "gpi_anchored",
    "stable_complex_partner",
    "other",
]
CONTEXTUAL_REASONS = [
    "dual_localization",
    "tissue_restricted_surface",
    "stable_surface_attachment",
    "cell_state_induced",
    "lysosomal_exocytosis",
    "other",
]

# Display text per reason key, pre-wrapped with "\n"; "other" is shared by
# both buckets.
REASON_LABEL = dict(
    classical_surface_receptor="classical\nsurface\nreceptor",
    multipass_with_exposed_loops="multipass\nw/ exposed\nloops",
    gpi_anchored="GPI-\nanchored",
    stable_complex_partner="stable\ncomplex\npartner",
    dual_localization="dual\nlocalization",
    tissue_restricted_surface="tissue-\nrestricted\nsurface",
    stable_surface_attachment="stable\nsurface\nattachment",
    cell_state_induced="cell-state\ninduced",
    lysosomal_exocytosis="lysosomal\nexocytosis",
    other="other",
)
|
|
|
# Bar colours per reason. `yes` (definite surface) uses a green ramp,
# `contextual` (state / lineage / partner dependent) an amber ramp, and
# `other` is neutral grey in both.
YES_PALETTE = dict(
    classical_surface_receptor="#2E7A55",
    multipass_with_exposed_loops="#4D8A80",
    gpi_anchored="#7AAB9F",
    stable_complex_partner="#A8C8C0",
    other="#6F5D5A",
)
CONTEXTUAL_PALETTE = dict(
    dual_localization="#8C4210",
    tissue_restricted_surface="#C07830",
    stable_surface_attachment="#F4AA28",
    cell_state_induced="#F4C070",
    lysosomal_exocytosis="#FAECD4",
    other="#6F5D5A",
)

# Panel title / count-label colours: the darkest step of each ramp.
YES_HEADER_COLOR = "#2E7A55"
CONTEXTUAL_HEADER_COLOR = "#8C4210"
|
|
|
# Hand-picked example genes as (HGNC symbol, blurb, reason key). Entries
# follow the per-reason bar order of the panel above (YES_REASONS /
# CONTEXTUAL_REASONS), so callouts read top-to-bottom in the same sequence
# as the bars read left-to-right.
YES_CALLOUTS = [
    # classical_surface_receptor
    ("PVRIG", "NK/T checkpoint; COM701 = anti-PVRIG surface mAb", "classical_surface_receptor"),
    ("ECEL1", "Type-II TM; neprilysin/M13 family", "classical_surface_receptor"),
    # multipass_with_exposed_loops
    ("STEAP1", "Prostate ADC + BiTE target", "multipass_with_exposed_loops"),
    ("ORAI2", "Store-operated Ca2+ channel", "multipass_with_exposed_loops"),
    # gpi_anchored
    ("CRIPTO", "GPI-anchored oncofetal antigen", "gpi_anchored"),
    # stable_complex_partner
    ("LY96", "MD-2 — TLR4 co-receptor", "stable_complex_partner"),
]
CONTEXTUAL_CALLOUTS = [
    # dual_localization
    ("IL15", "Surface trans-presentation via IL-15Rα", "dual_localization"),
    # stable_surface_attachment
    ("TIMP2", "MT1-MMP ternary complex", "stable_surface_attachment"),
    # cell_state_induced
    ("GSDMD", "Gasdermin D/E/C — pyroptosis pores", "cell_state_induced"),
    ("HSPA1A", "Surface Hsp70; cmHsp70.1 mAb", "cell_state_induced"),
    ("HSP90B1", "Surface GRP94 in tumor cells", "cell_state_induced"),
    # lysosomal_exocytosis
    (
        "HPSE",
        "Surface heparanase on activated platelets / tumor cells",
        "lysosomal_exocytosis",
    ),
]
|
|
|
|
|
def _load_catalog() -> list[dict]:
    """Return the whole-proteome catalog rows from the in-repo TSV.

    Each row has the v1-style expanded columns: ``hgnc_symbol``,
    ``uniprot_acc``, the five ``*_surface_flag`` fields,
    ``n_sources_surface``, ``sonnet_verdict``, ``sonnet_reason``,
    plus stable IDs. Sourced from D1 via
    ``scripts/export_whole_proteome_catalog_to_tsv.py``.

    Lookup order: the on-disk TSV three directory levels above this file
    (a repo checkout), otherwise an HTTP fetch of ``CATALOG_TSV_URL``.

    Returns:
        One dict per TSV row, keyed by the header line. All values are the
        raw strings from the file except ``n_sources_surface``, which is
        converted to ``int`` (missing or blank becomes 0).
    """
    relative_tsv = "data/processed/catalog/whole_proteome_catalog.tsv"

    # Local-first: inside a repo checkout, prefer the on-disk TSV so the
    # script renders against the working tree (matches the local-fallback
    # pattern used by the other figure scripts in this folder).
    # A standalone copy may have fewer than four ancestors; ``parents[3]``
    # would raise IndexError there, so fall through to the download instead.
    ancestors = Path(__file__).resolve().parents
    local = ancestors[3] / relative_tsv if len(ancestors) > 3 else None

    if local is not None and local.is_file():
        print(f"Reading {local} ...")
        text = local.read_text(encoding="utf-8")
    else:
        print(f"Fetching {CATALOG_TSV_URL} ...")
        with urllib.request.urlopen(CATALOG_TSV_URL, timeout=60) as resp:  # noqa: S310
            text = resp.read().decode("utf-8")

    rows = list(csv.DictReader(io.StringIO(text), delimiter="\t"))
    for r in rows:
        # ``or 0`` also covers an empty cell and a short row (value None).
        r["n_sources_surface"] = int(r.get("n_sources_surface", 0) or 0)
    return rows
|
|
|
|
|
def _draw_reason_bars(ax, counts, reasons, palette, header_label, header_color, y_max):
    """Draw one verdict bucket's per-reason bar panel on ``ax``.

    Args:
        ax: Matplotlib axes to draw into.
        counts: Mapping of reason key to gene count.
        reasons: Reason keys in left-to-right display order. Reasons whose
            count is missing or zero are skipped; keys present in ``counts``
            but absent from this list are not drawn.
        palette: Mapping of reason key to bar colour.
        header_label: Panel title, left-aligned above the bars.
        header_color: Colour of the title and the count labels.
        y_max: Upper y-limit of the panel; used here only to offset each
            count label 1.5 % of the axis height above its bar.

    With no positive counts the panel is left empty (title only) rather
    than failing on the x-limit computation.
    """
    bar_spacing = 1.4

    visible = [r for r in reasons if counts.get(r, 0) > 0]
    x_positions = [i * bar_spacing for i in range(len(visible))]
    heights = [counts[r] for r in visible]
    colors = [palette[r] for r in visible]

    if visible:
        ax.bar(
            x_positions, heights,
            color=colors, edgecolor="white",
            linewidth=1.2, width=0.55 * bar_spacing,
        )

    for x, h in zip(x_positions, heights, strict=True):
        ax.text(
            x, h + y_max * 0.015,
            f"{h}",
            ha="center", va="bottom",
            fontsize=25, fontweight="bold", color=header_color,
        )

    ax.set_title(
        header_label,
        fontsize=25, color=header_color, fontweight="bold",
        loc="left", pad=16,
    )
    ax.set_xticks(x_positions)
    # Flatten the pre-wrapped REASON_LABEL strings (\n → space) so the
    # rotated labels read on one line; multi-line + rotation looks broken.
    ax.set_xticklabels(
        [REASON_LABEL[r].replace("\n", " ") for r in visible],
        fontsize=18, color=BRAND_INK,
        rotation=30, ha="right", rotation_mode="anchor",
    )
    ax.tick_params(axis="y", labelsize=20)
    # x_positions[-1] raises IndexError on an empty bucket; fall back to a
    # window centred on 0 in that case.
    last_x = x_positions[-1] if x_positions else 0.0
    ax.set_xlim(-0.9, last_x + 0.9)
    sns.despine(ax=ax, top=True, right=True)
|
|
|
|
|
def _draw_callouts(ax, callouts, palette, title):
    """Render a titled list of example genes in an axis-less panel.

    Args:
        ax: Matplotlib axes used as a blank canvas (its axis is turned off).
        callouts: Sequence of ``(symbol, description, reason)`` tuples, drawn
            top to bottom in the given order.
        palette: Mapping of reason key to swatch colour; reasons not in the
            mapping get ``BRAND_NEUTRAL``.
        title: Heading placed at the top-left of the panel.

    All positions are in axes coordinates. Rows start at y = 0.82 and are
    spread down towards y = 0.05, with the row pitch capped at 0.155.
    """
    ax.set_axis_off()
    in_axes = ax.transAxes

    ax.text(
        0.0, 1.0, title,
        transform=in_axes, ha="left", va="top",
        fontsize=24, color=BRAND_NEUTRAL, fontweight="bold",
    )

    first_row_y = 0.82
    n_rows = len(callouts)
    if n_rows > 1:
        pitch = min((first_row_y - 0.05) / (n_rows - 1), 0.155)
    else:
        pitch = 0.0

    for row_idx, (symbol, desc, reason) in enumerate(callouts):
        row_y = first_row_y - row_idx * pitch
        swatch_color = palette.get(reason, BRAND_NEUTRAL)

        # Colour swatch matching the reason's bar in the panel above.
        ax.scatter(
            [0.025], [row_y],
            marker="s", s=260,
            color=swatch_color,
            edgecolor="none", transform=in_axes, zorder=10,
        )
        # Gene symbol, bold.
        ax.text(
            0.07, row_y, symbol,
            transform=in_axes, ha="left", va="center",
            fontsize=21, fontweight="bold", color=BRAND_INK,
        )
        # One-line description in a fixed second column.
        ax.text(
            0.26, row_y, f"— {desc}",
            transform=in_axes, ha="left", va="center",
            fontsize=15, color=BRAND_NEUTRAL,
        )
|
|
|
|
|
def main() -> None:
    """Render the zero-DB rescue figure and write it as PDF and PNG.

    Steps: apply the brand style, load the catalog, keep the rows with
    ``n_sources_surface == 0``, tally ``sonnet_reason`` per ``yes`` /
    ``contextual`` verdict, check every hand-picked callout against that
    tally, then draw two bar panels (shared y-axis) above two callout
    panels. Output goes to ``zero_db_rescues_by_triage.pdf`` / ``.png`` in
    the current working directory, with ``GIST_URL`` embedded in the file
    metadata.

    Raises:
        RuntimeError: If the catalog has no rows, or if a callout gene is
            not in the rescue slice under its listed verdict and reason.
    """
    _apply_brand_style()

    rows = _load_catalog()
    print(f" loaded {len(rows):,} rows; sonnet = claude-sonnet-4-6")
    if not rows:
        # Without this the percentage below dies with a bare ZeroDivisionError.
        raise RuntimeError("Catalog TSV contained no rows; nothing to plot.")

    zero_db = [r for r in rows if r["n_sources_surface"] == 0]
    print(f"\nZero-DB universe: {len(zero_db):,} / {len(rows):,} "
          f"({100*len(zero_db)/len(rows):.1f}%)")

    def verdict_reason(row):
        # Blank or missing cells fall back to "unknown" / "other".
        v = (row.get("sonnet_verdict") or "").strip() or "unknown"
        reason = (row.get("sonnet_reason") or "").strip() or "other"
        return v, reason

    yes_counts: Counter = Counter()
    ctx_counts: Counter = Counter()
    yes_symbols: dict[str, list[str]] = defaultdict(list)
    ctx_symbols: dict[str, list[str]] = defaultdict(list)
    for r in zero_db:
        v, reason = verdict_reason(r)
        sym = r.get("hgnc_symbol", "")
        if v == "yes":
            yes_counts[reason] += 1
            yes_symbols[reason].append(sym)
        elif v == "contextual":
            ctx_counts[reason] += 1
            ctx_symbols[reason].append(sym)

    n_yes = sum(yes_counts.values())
    n_ctx = sum(ctx_counts.values())
    print(f"\nRescues: yes={n_yes}, contextual={n_ctx}")

    # Reasons outside the plotted taxonomy are included in the header totals
    # but get no bar; report them so the mismatch is not silent.
    for verdict, counts, plotted in (
        ("yes", yes_counts, YES_REASONS),
        ("contextual", ctx_counts, CONTEXTUAL_REASONS),
    ):
        unplotted = {reason: n for reason, n in counts.items() if reason not in plotted}
        if unplotted:
            print(f" warning: {verdict} reasons counted in n but not drawn: {unplotted}")

    # Validate callouts.
    bad = []
    for verdict, callouts, symbols in (
        ("yes", YES_CALLOUTS, yes_symbols),
        ("contextual", CONTEXTUAL_CALLOUTS, ctx_symbols),
    ):
        for sym, _, reason in callouts:
            if sym not in symbols.get(reason, []):
                bad.append((verdict, sym, reason))
    if bad:
        raise RuntimeError(f"Callouts not found in rescue slice: {bad}")

    fig = plt.figure(figsize=(19, 13))
    gs = gridspec.GridSpec(
        2, 2, figure=fig,
        # v2: bumped callout-row weight (1.05 → 1.55) + figure height
        # (11 → 13) so the per-row callout text (larger under brand-style-v3)
        # has room to breathe instead of crashing into the bar panels.
        height_ratios=[2.2, 1.55],
        width_ratios=[1.0, 1.5],
        hspace=0.55, wspace=0.28,
        top=0.93, bottom=0.04, left=0.06, right=0.97,
    )
    ax_yes = fig.add_subplot(gs[0, 0])
    ax_ctx = fig.add_subplot(gs[0, 1], sharey=ax_yes)
    ax_callouts_yes = fig.add_subplot(gs[1, 0])
    ax_callouts_ctx = fig.add_subplot(gs[1, 1])

    # Subpanel labels — Manrope ExtraBold (weight 800), upper-left of each
    # bar panel. Convention: lowercase letters, axis coordinates so they
    # track the panel through resize.
    for ax, letter in ((ax_yes, "a"), (ax_ctx, "b")):
        ax.text(
            -0.06, 1.08, letter,
            transform=ax.transAxes, ha="left", va="bottom",
            fontsize=32, fontweight=800, color=BRAND_INK,
        )

    # Shared y-range: tallest bar in either panel plus 18 % headroom for the
    # count labels. Set once on ax_yes; ax_ctx follows through sharey.
    max_count = max(
        max(yes_counts.values(), default=0),
        max(ctx_counts.values(), default=0),
    )
    y_max = max_count * 1.18
    ax_yes.set_ylim(0, y_max)

    _draw_reason_bars(
        ax_yes, yes_counts, YES_REASONS, YES_PALETTE,
        header_label=f"yes — definite surface (n = {n_yes})",
        header_color=YES_HEADER_COLOR, y_max=y_max,
    )
    _draw_reason_bars(
        ax_ctx, ctx_counts, CONTEXTUAL_REASONS, CONTEXTUAL_PALETTE,
        header_label=f"contextual — state / lineage dependent (n = {n_ctx})",
        header_color=CONTEXTUAL_HEADER_COLOR, y_max=y_max,
    )

    ax_yes.set_ylabel("Genes rescued from\nzero-DB universe", fontsize=25)
    ax_yes.tick_params(axis="y", labelsize=20)
    # Only the left panel shows y tick labels.
    plt.setp(ax_ctx.get_yticklabels(), visible=False)

    _draw_callouts(ax_callouts_yes, YES_CALLOUTS, YES_PALETTE, title="Select yes rescues")
    _draw_callouts(ax_callouts_ctx, CONTEXTUAL_CALLOUTS, CONTEXTUAL_PALETTE, title="Select contextual rescues")

    out_pdf = Path("zero_db_rescues_by_triage.pdf")
    out_png = Path("zero_db_rescues_by_triage.png")
    fig.savefig(out_pdf, bbox_inches="tight", metadata={"Subject": GIST_URL})
    fig.savefig(out_png, bbox_inches="tight", dpi=300, metadata={"Source": GIST_URL})
    print(f"Wrote {out_pdf} + {out_png}")
|
|
|
|
|
# Script entry point: render the figure only when executed directly
# (e.g. ``uv run``), never as a side effect of importing this module.
if __name__ == "__main__":
    main()