Skip to content

Instantly share code, notes, and snippets.

@beccajcarlson
Last active June 8, 2026 02:44
Show Gist options
  • Select an option

  • Save beccajcarlson/1265c867a3bbb08efd81262789e1f013 to your computer and use it in GitHub Desktop.

Select an option

Save beccajcarlson/1265c867a3bbb08efd81262789e1f013 to your computer and use it in GitHub Desktop.
DB vs Sonnet agreement on the whole proteome — bench-optimized cutoffs

DB ↔ Sonnet agreement on the whole proteome — bench-optimized cutoffs

Whole-genome analog of the 147-gene bench plot db_correctness_by_class. On the bench, ground truth is hand-curated; on the whole proteome (~19,324 protein-coding genes) it doesn't exist, so we use the Sonnet (+ NCBI) triage verdict as the reference and ask: for each surface DB (under its bench-optimized cutoff) and each ≥k-DB ensemble (k = 1..5), what fraction of genes does it agree with Sonnet on, split by Sonnet's verdict bucket.

Soft-credit rule: DB "yes" matches Sonnet "yes" or "contextual"; DB "no" matches Sonnet "no" only.

Four buckets per caller:

  • overall — across all genes with a rated Sonnet verdict
  • Sonnet = yes — sensitivity-like
  • Sonnet = contextual — DB must yes-vote for a match
  • Sonnet = no — specificity-like (dominates overall by population)

10 callers total: 5 ensembles ≥1..≥5 (teal ramp, light = permissive) + 5 individual DBs under their optimized cutoffs (sorted descending by overall agreement). Sonnet is the reference and does not appear as a bar — it would be 100% by construction.

Run:

uv run make_db_vs_sonnet_whole_proteome.py

Sources (fetched live from the public repo, no API calls):

Canonical in-repo generator: scripts/db_vs_sonnet_whole_proteome.py.

# /// script
# requires-python = ">=3.11"
# dependencies = [
# "matplotlib>=3.9",
# "pandas>=2.2",
# "seaborn>=0.13",
# "httpx>=0.27",
# ]
# ///
"""Reproduce ``db_vs_sonnet_whole_proteome.{pdf,png}`` from the public repo.
Whole-genome DB ↔ Sonnet agreement plot. The 147-gene bench has hand-
curated ground truth; on the whole proteome it doesn't exist, so we use
the Sonnet (+ NCBI) triage verdict as the **reference** and ask how
often each surface DB (under its bench-optimized cutoff) and each
≥k-DB ensemble agrees with Sonnet, split by Sonnet's verdict bucket.
Soft-credit rule (same as the bench plot): DB "yes" matches Sonnet
"yes" or "contextual"; DB "no" matches Sonnet "no" only.
Four buckets per caller:
* **overall** — across all genes with a rated Sonnet verdict
* **Sonnet = yes** — sensitivity-like
* **Sonnet = contextual** — DB must yes-vote for a match
* **Sonnet = no** — specificity-like
10 callers: 5 ensembles (≥1..≥5 of the 5 surface DBs vote yes, teal
ramp) + 5 individual DBs under their optimized cutoffs (sorted desc by
overall agreement). Sonnet is the reference and does not appear as a
bar (would be 100% by construction).
Visual styling matches the in-repo ``_plotting_config`` (Deliverome
categorical palette + Manrope-when-available). Inlined so the gist
runs standalone.
Data: catalog fetched live from
``https://api.deliverome.org/surfaceome/v1/catalog`` (~19,324 genes
with per-gene Sonnet+NCBI verdict); per-DB votes from
candidate_universe.tsv + db_optimized_cutoffs.tsv via
raw.githubusercontent.com.
Standalone — ``uv run make_db_vs_sonnet_whole_proteome.py``.
"""
from __future__ import annotations
import io
from pathlib import Path
import httpx
import matplotlib.font_manager as fm
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
REPO = "Deliverome-Project/accessible-surfaceome"
BRANCH = "main"
BASE = f"https://raw.githubusercontent.com/{REPO}/{BRANCH}"
CATALOG_TSV = f"{BASE}/data/processed/catalog/whole_proteome_catalog.tsv"
OPT_CUTOFFS_TSV = f"{BASE}/data/processed/triage_bench/db_optimized_cutoffs.tsv"
# Published reproduction gist (embedded into output PNG Source / PDF
# Subject metadata — mirrors save_figure in _plotting_config.py).
GIST_URL = "https://gist.github.com/beccajcarlson/1265c867a3bbb08efd81262789e1f013"
# ──── Inline brand styling — sentinel: brand-style-v3 ────
# Mirrors src/accessible_surfaceome/audit/_plotting_config.py so the gist
# stays self-contained. Kept in sync via tests/test_figure_gists_styling.py.
BRAND_PALETTE = [
"#BC3C4C", # maroon-light
"#3D6B60", # teal-mid
"#F4AA28", # amber-bright
"#8878C8", # lavender-bright
"#6E1428", # maroon-dark
"#7AAB9F", # teal-light
]
BRAND_SEQUENTIAL = {
"maroon": ["#3E0A18", "#6E1428", "#922038", "#BC3C4C", "#F0A098", "#FDE8E6"],
"teal": ["#152E28", "#244840", "#3D6B60", "#4D8A80", "#7AAB9F", "#CCE8E4"],
"amber": ["#5A2608", "#8C4210", "#C07830", "#F4AA28", "#F4C070", "#FAECD4"],
"lavender": ["#1E1450", "#3A2888", "#5848A8", "#8878C8", "#A090D4", "#E4E0F8"],
}
BRAND_CLAUDE_ORANGE = "#d87851"
BRAND_INK = "#1F1718"
BRAND_NEUTRAL = "#6F5D5A"
BRAND_GRID = "#E6DAD4"
def _register_brand_fonts() -> None:
candidates = [
Path(__file__).resolve().parents[3] / "assets" / "fonts",
Path.cwd() / "assets" / "fonts",
]
for fonts_dir in candidates:
if fonts_dir.is_dir():
for path in sorted(list(fonts_dir.glob("*.ttf")) + list(fonts_dir.glob("*.otf"))):
try:
fm.fontManager.addfont(str(path))
except Exception: # noqa: BLE001
continue
return
def _apply_brand_style() -> None:
"""Inline equivalent of `setup_plotting_style`. Sentinel: brand-style-v3.
v2: bumped sizes ~25% + explicit medium weight (avoids ExtraLight default
that matplotlib picks from the Manrope variable file). Companion to the
static Manrope-{regular,medium,semibold,bold}.otf files in assets/fonts/."""
_register_brand_fonts()
sns.set_style("whitegrid")
sns.set_context("notebook", font_scale=1.0)
plt.rcParams.update({
"savefig.dpi": 300,
"savefig.bbox": "tight",
"figure.facecolor": "none",
"savefig.facecolor": "none",
"font.family": "sans-serif",
"font.sans-serif": ["Manrope", "Outfit", "DejaVu Sans", "Liberation Sans", "Arial"],
"font.weight": "medium",
"font.size": 21,
"axes.labelsize": 25,
"axes.labelweight": "medium",
"axes.titlesize": 0,
"axes.titlepad": 0,
"axes.spines.top": False,
"axes.spines.right": False,
"axes.grid": True,
"axes.axisbelow": True,
"axes.edgecolor": BRAND_GRID,
"axes.labelcolor": BRAND_INK,
"axes.facecolor": "none",
"text.color": BRAND_INK,
"grid.alpha": 0.35,
"grid.linestyle": "-",
"grid.linewidth": 0.7,
"grid.color": BRAND_GRID,
"xtick.labelsize": 20,
"ytick.labelsize": 20,
"xtick.color": BRAND_INK,
"ytick.color": BRAND_INK,
"legend.frameon": False,
"legend.fontsize": 20,
"patch.edgecolor": "none",
"patch.linewidth": 0.0,
})
DB_LABELS = ["UniProt", "GO CC", "HPA", "SURFY", "CSPA"]
ENSEMBLE_KS = [1, 2, 3, 4, 5]
DB_PALETTE = {label: BRAND_PALETTE[i] for i, label in enumerate(DB_LABELS)}
# Sequential teal ramp for the ≥k ensembles (light → dark = permissive → strict).
ENSEMBLE_PALETTE = {
1: "#7AAB9F", # teal-light (≥1 = permissive union)
2: "#4D8A80",
3: "#3D6B60",
4: "#244840",
5: "#152E28", # teal-deep (≥5 = strict intersection)
}
BUCKETS = ["overall", "yes", "contextual", "no"]
BUCKET_LABEL = {
"overall": "overall",
"yes": "Sonnet = yes",
"contextual": "Sonnet = contextual\n(yes-vote = match)",
"no": "Sonnet = no\n(no-vote = match)",
}
def _fetch_tsv(url: str) -> pd.DataFrame:
local = Path(__file__).resolve().parents[3] / url[len(BASE) + 1:]
if local.is_file():
return pd.read_csv(local, sep="\t")
r = httpx.get(url, timeout=30)
r.raise_for_status()
return pd.read_csv(io.StringIO(r.text), sep="\t")
def _vote_match(db_vote: str, sonnet: str) -> bool:
"""DB calls yes/no; Sonnet calls yes/contextual/no. Soft credit on positive side."""
if db_vote == sonnet:
return True
return db_vote in ("yes", "contextual") and sonnet in ("yes", "contextual")
def main() -> None:
_apply_brand_style()
# Catalog now comes from a static TSV (sourced from D1 by
# scripts/export_whole_proteome_catalog_to_tsv.py) instead of the
# live Worker — keeps the gist self-contained and snapshot-pinned
# to whatever branch/SHA this script is fetched from. Each row
# carries the v1-style ``*_surface_flag`` columns AND the
# canonical Sonnet+NCBI verdict, so this single TSV replaces the
# CATALOG_URL + CAND_TSV pair the figure used previously.
catalog = _fetch_tsv(CATALOG_TSV)
print(f" loaded {len(catalog):,} catalog rows; sonnet = claude-sonnet-4-6")
opt = _fetch_tsv(OPT_CUTOFFS_TSV)
uniprot_opt = set(opt.loc[opt["uniprot_optimized"] == 1, "accession"].astype(str))
cspa_opt = set(opt.loc[opt["cspa_optimized"] == 1, "accession"].astype(str))
records = []
for row in catalog.itertuples(index=False):
v = (str(getattr(row, "sonnet_verdict", "") or "")).strip()
if v not in ("yes", "contextual", "no"):
continue
acc = str(getattr(row, "uniprot_acc", "") or "")
rec = {
"symbol": str(getattr(row, "hgnc_symbol", "") or ""),
"acc": acc,
"sonnet": v,
"UniProt": acc in uniprot_opt,
"CSPA": acc in cspa_opt,
"GO CC": int(getattr(row, "go_surface_flag", 0) or 0) == 1,
"HPA": int(getattr(row, "hpa_surface_flag", 0) or 0) == 1,
"SURFY": int(getattr(row, "surfy_surface_flag", 0) or 0) == 1,
}
records.append(rec)
df = pd.DataFrame(records)
print(f"\nGenes with a rated Sonnet verdict: {len(df):,}")
print(df["sonnet"].value_counts())
# Sort individual DBs by overall agreement.
def _db_overall(label: str) -> float:
n_match = sum(_vote_match("yes" if row[label] else "no", row["sonnet"])
for _, row in df.iterrows())
return n_match / len(df)
db_overall_acc = {label: _db_overall(label) for label in DB_LABELS}
db_sorted = sorted(DB_LABELS, key=lambda d: -db_overall_acc[d])
callers: list[tuple[str, str]] = []
for k in ENSEMBLE_KS:
callers.append((f"≥{k} DB", "ensemble"))
for label in db_sorted:
callers.append((label, "single"))
def caller_vote(caller: str, kind: str, row: pd.Series) -> str:
if kind == "single":
return "yes" if row[caller] else "no"
k = int(caller.lstrip("≥").rstrip(" DB"))
return "yes" if sum(bool(row[d]) for d in DB_LABELS) >= k else "no"
rows_long: list[dict] = []
overall_acc: dict[str, float] = {}
for caller_label, kind in callers:
for bucket in BUCKETS:
sub = df if bucket == "overall" else df[df["sonnet"] == bucket]
if sub.empty:
continue
n_match = sum(
_vote_match(caller_vote(caller_label, kind, r), r["sonnet"])
for _, r in sub.iterrows()
)
frac = n_match / len(sub)
if bucket == "overall":
overall_acc[caller_label] = frac
rows_long.append({
"caller": caller_label,
"bucket": bucket,
"bucket_label": BUCKET_LABEL[bucket],
"fraction": frac,
"n_total": len(sub),
})
plot_df = pd.DataFrame(rows_long)
print("\nOverall agreement with Sonnet (across whole proteome):")
for caller_label, _ in callers:
print(f" {caller_label:10s} {overall_acc[caller_label]*100:5.1f}%")
caller_order = [c[0] for c in callers]
palette = [
(ENSEMBLE_PALETTE[int(c[0].lstrip("≥").rstrip(" DB"))] if c[1] == "ensemble"
else DB_PALETTE[c[0]])
for c in callers
]
fig, ax = plt.subplots(figsize=(17, 6.5))
sns.barplot(
data=plot_df,
x="bucket_label", y="fraction",
hue="caller",
order=[BUCKET_LABEL[b] for b in BUCKETS],
hue_order=caller_order,
palette=palette,
edgecolor="none", saturation=1.0,
width=0.92,
ax=ax,
)
n_callers = len(caller_order)
n_buckets = len(BUCKETS)
bar_width = ax.patches[0].get_width()
gap = bar_width * 0.4
for caller_idx in range(n_callers):
shift = gap if caller_idx >= 5 else 0
for j in range(n_buckets):
patch = ax.patches[caller_idx * n_buckets + j]
patch.set_x(patch.get_x() + shift)
for i, caller_label in enumerate(caller_order):
for j, bucket in enumerate(BUCKETS):
patch = ax.patches[i * n_buckets + j]
cell = plot_df[(plot_df["caller"] == caller_label) & (plot_df["bucket"] == bucket)]
if cell.empty:
continue
frac = cell.iloc[0]["fraction"]
ax.text(
patch.get_x() + patch.get_width() / 2,
patch.get_height() + 0.01,
f"{frac:.0%}",
ha="center", va="bottom",
fontsize=11, color=BRAND_INK,
)
ax.set_xlabel("")
ax.set_ylabel("Fraction agreeing with\nSonnet (+ NCBI) verdict")
ax.set_ylim(0, 1.14)
handles, _ = ax.get_legend_handles_labels()
legend_labels = [f"{lbl} ({overall_acc[lbl]:.0%})" for lbl in caller_order]
ax.legend(
handles, legend_labels,
title="Caller (overall agreement)",
loc="upper left", bbox_to_anchor=(1.02, 1.0),
frameon=False, fontsize=14,
)
totals = {
row["bucket"]: row["n_total"]
for _, row in plot_df.iterrows()
if row["caller"] == caller_order[0]
}
subtitle = (
f"n(overall) = {totals.get('overall', 0):,} · "
f"n(yes) = {totals.get('yes', 0):,} · "
f"n(contextual) = {totals.get('contextual', 0):,} · "
f"n(no) = {totals.get('no', 0):,} "
f"(whole protein-coding proteome with a rated Sonnet verdict)"
)
ax.text(
0.5, -0.18, subtitle,
transform=ax.transAxes, ha="center", va="top",
fontsize=14, color=BRAND_NEUTRAL,
)
sns.despine(ax=ax, top=True, right=True)
fig.tight_layout()
out_pdf = Path("db_vs_sonnet_whole_proteome.pdf")
out_png = Path("db_vs_sonnet_whole_proteome.png")
fig.savefig(out_pdf, bbox_inches="tight", metadata={"Subject": GIST_URL})
fig.savefig(out_png, bbox_inches="tight", dpi=300, metadata={"Source": GIST_URL})
print(f"Wrote {out_pdf} + {out_png}")
if __name__ == "__main__":
main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment