beccajcarlson · June 8, 2026 02:44
diff --git a/01_db_vs_sonnet_whole_proteome.md b/01_db_vs_sonnet_whole_proteome.md
diff --git a/make_db_vs_sonnet_whole_proteome.py b/make_db_vs_sonnet_whole_proteome.py
 # /// script
 # requires-python = ">=3.11"
 # dependencies = [
 #   "matplotlib>=3.9",
 #   "pandas>=2.2",
 #   "seaborn>=0.13",
 #   "httpx>=0.27",
 # ]
 # ///
 """Reproduce ``db_vs_sonnet_whole_proteome.{pdf,png}`` from the public repo.

 Whole-genome DB ↔ Sonnet agreement plot. The 147-gene bench has hand-
 curated ground truth; on the whole proteome it doesn't exist, so we use
 the Sonnet (+ NCBI) triage verdict as the **reference** and ask how
 often each surface DB (under its bench-optimized cutoff) and each
 ≥k-DB ensemble agrees with Sonnet, split by Sonnet's verdict bucket.

 Soft-credit rule (same as the bench plot): DB "yes" matches Sonnet
 "yes" or "contextual"; DB "no" matches Sonnet "no" only.

 Four buckets per caller:
 * **overall** — across all genes with a rated Sonnet verdict
 * **Sonnet = yes** — sensitivity-like
 * **Sonnet = contextual** — DB must yes-vote for a match
 * **Sonnet = no** — specificity-like

 10 callers: 5 ensembles (≥1..≥5 of the 5 surface DBs vote yes, teal
 ramp) + 5 individual DBs under their optimized cutoffs (sorted desc by
 overall agreement). Sonnet is the reference and does not appear as a
 bar (would be 100% by construction).

 Visual styling matches the in-repo ``_plotting_config`` (Deliverome
 categorical palette + Manrope-when-available). Inlined so the gist
 runs standalone.

 Data: catalog fetched live from
 ``https://api.deliverome.org/surfaceome/v1/catalog`` (~19,324 genes
 with per-gene Sonnet+NCBI verdict); per-DB votes from
 candidate_universe.tsv + db_optimized_cutoffs.tsv via
 raw.githubusercontent.com.

 Standalone — ``uv run make_db_vs_sonnet_whole_proteome.py``.
 """
 from __future__ import annotations

 import io
 from pathlib import Path

 import httpx
 import matplotlib.font_manager as fm
 import matplotlib.pyplot as plt
 import pandas as pd
 import seaborn as sns

 REPO = "Deliverome-Project/accessible-surfaceome"
 BRANCH = "main"
 BASE = f"https://raw.githubusercontent.com/{REPO}/{BRANCH}"

 CATALOG_TSV = f"{BASE}/data/processed/catalog/whole_proteome_catalog.tsv"
 OPT_CUTOFFS_TSV = f"{BASE}/data/processed/triage_bench/db_optimized_cutoffs.tsv"

 # Published reproduction gist (embedded into output PNG Source / PDF
 # Subject metadata — mirrors save_figure in _plotting_config.py).
 GIST_URL = "https://gist.github.com/beccajcarlson/1265c867a3bbb08efd81262789e1f013"

 # ──── Inline brand styling — sentinel: brand-style-v3 ────
 # Mirrors src/accessible_surfaceome/audit/_plotting_config.py so the gist
 # stays self-contained. Kept in sync via tests/test_figure_gists_styling.py.
 BRAND_PALETTE = [
    "#BC3C4C",  # maroon-light
    "#3D6B60",  # teal-mid
    "#F4AA28",  # amber-bright
    "#8878C8",  # lavender-bright
    "#6E1428",  # maroon-dark
    "#7AAB9F",  # teal-light
 ]
 BRAND_SEQUENTIAL = {
    "maroon":   ["#3E0A18", "#6E1428", "#922038", "#BC3C4C", "#F0A098", "#FDE8E6"],
    "teal":     ["#152E28", "#244840", "#3D6B60", "#4D8A80", "#7AAB9F", "#CCE8E4"],
    "amber":    ["#5A2608", "#8C4210", "#C07830", "#F4AA28", "#F4C070", "#FAECD4"],
    "lavender": ["#1E1450", "#3A2888", "#5848A8", "#8878C8", "#A090D4", "#E4E0F8"],
 }
 BRAND_CLAUDE_ORANGE = "#d87851"
 BRAND_INK = "#1F1718"
 BRAND_NEUTRAL = "#6F5D5A"
 BRAND_GRID = "#E6DAD4"


 def _register_brand_fonts() -> None:
    candidates = [
        Path(__file__).resolve().parents[3] / "assets" / "fonts",
        Path.cwd() / "assets" / "fonts",
    ]
    for fonts_dir in candidates:
        if fonts_dir.is_dir():
            for path in sorted(list(fonts_dir.glob("*.ttf")) + list(fonts_dir.glob("*.otf"))):
                try:
                    fm.fontManager.addfont(str(path))
                except Exception:  # noqa: BLE001
                    continue
            return


 def _apply_brand_style() -> None:
    """Inline equivalent of `setup_plotting_style`. Sentinel: brand-style-v3.
    v2: bumped sizes ~25% + explicit medium weight (avoids ExtraLight default
    that matplotlib picks from the Manrope variable file). Companion to the
    static Manrope-{regular,medium,semibold,bold}.otf files in assets/fonts/."""
    _register_brand_fonts()
    sns.set_style("whitegrid")
    sns.set_context("notebook", font_scale=1.0)
    plt.rcParams.update({
        "savefig.dpi": 300,
        "savefig.bbox": "tight",
        "figure.facecolor": "none",
        "savefig.facecolor": "none",
        "font.family": "sans-serif",
        "font.sans-serif": ["Manrope", "Outfit", "DejaVu Sans", "Liberation Sans", "Arial"],
        "font.weight": "medium",
        "font.size": 21,
        "axes.labelsize": 25,
        "axes.labelweight": "medium",
        "axes.titlesize": 0,
        "axes.titlepad": 0,
        "axes.spines.top": False,
        "axes.spines.right": False,
        "axes.grid": True,
        "axes.axisbelow": True,
        "axes.edgecolor": BRAND_GRID,
        "axes.labelcolor": BRAND_INK,
        "axes.facecolor": "none",
        "text.color": BRAND_INK,
        "grid.alpha": 0.35,
        "grid.linestyle": "-",
        "grid.linewidth": 0.7,
        "grid.color": BRAND_GRID,
        "xtick.labelsize": 20,
        "ytick.labelsize": 20,
        "xtick.color": BRAND_INK,
        "ytick.color": BRAND_INK,
        "legend.frameon": False,
        "legend.fontsize": 20,
        "patch.edgecolor": "none",
        "patch.linewidth": 0.0,
    })


 DB_LABELS = ["UniProt", "GO CC", "HPA", "SURFY", "CSPA"]
 ENSEMBLE_KS = [1, 2, 3, 4, 5]
 DB_PALETTE = {label: BRAND_PALETTE[i] for i, label in enumerate(DB_LABELS)}
 # Sequential teal ramp for the ≥k ensembles (light → dark = permissive → strict).
 ENSEMBLE_PALETTE = {
    1: "#7AAB9F",  # teal-light  (≥1 = permissive union)
    2: "#4D8A80",
    3: "#3D6B60",
    4: "#244840",
    5: "#152E28",  # teal-deep   (≥5 = strict intersection)
 }

 BUCKETS = ["overall", "yes", "contextual", "no"]
 BUCKET_LABEL = {
    "overall":    "overall",
    "yes":        "Sonnet = yes",
    "contextual": "Sonnet = contextual\n(yes-vote = match)",
    "no":         "Sonnet = no\n(no-vote = match)",
 }


 def _fetch_tsv(url: str) -> pd.DataFrame:
    local = Path(__file__).resolve().parents[3] / url[len(BASE) + 1:]
    if local.is_file():
        return pd.read_csv(local, sep="\t")
    r = httpx.get(url, timeout=30)
    r.raise_for_status()
    return pd.read_csv(io.StringIO(r.text), sep="\t")


 def _vote_match(db_vote: str, sonnet: str) -> bool:
    """DB calls yes/no; Sonnet calls yes/contextual/no. Soft credit on positive side."""
    if db_vote == sonnet:
        return True
    return db_vote in ("yes", "contextual") and sonnet in ("yes", "contextual")


 def main() -> None:
    _apply_brand_style()

    # Catalog now comes from a static TSV (sourced from D1 by
    # scripts/export_whole_proteome_catalog_to_tsv.py) instead of the
    # live Worker — keeps the gist self-contained and snapshot-pinned
    # to whatever branch/SHA this script is fetched from. Each row
    # carries the v1-style ``*_surface_flag`` columns AND the
    # canonical Sonnet+NCBI verdict, so this single TSV replaces the
    # CATALOG_URL + CAND_TSV pair the figure used previously.
    catalog = _fetch_tsv(CATALOG_TSV)
    print(f"  loaded {len(catalog):,} catalog rows; sonnet = claude-sonnet-4-6")

    opt = _fetch_tsv(OPT_CUTOFFS_TSV)
    uniprot_opt = set(opt.loc[opt["uniprot_optimized"] == 1, "accession"].astype(str))
    cspa_opt = set(opt.loc[opt["cspa_optimized"] == 1, "accession"].astype(str))

    records = []
    for row in catalog.itertuples(index=False):
        v = (str(getattr(row, "sonnet_verdict", "") or "")).strip()
        if v not in ("yes", "contextual", "no"):
            continue
        acc = str(getattr(row, "uniprot_acc", "") or "")
        rec = {
            "symbol": str(getattr(row, "hgnc_symbol", "") or ""),
            "acc": acc,
            "sonnet": v,
            "UniProt": acc in uniprot_opt,
            "CSPA": acc in cspa_opt,
            "GO CC": int(getattr(row, "go_surface_flag", 0) or 0) == 1,
            "HPA": int(getattr(row, "hpa_surface_flag", 0) or 0) == 1,
            "SURFY": int(getattr(row, "surfy_surface_flag", 0) or 0) == 1,
        }
        records.append(rec)

    df = pd.DataFrame(records)
    print(f"\nGenes with a rated Sonnet verdict: {len(df):,}")
    print(df["sonnet"].value_counts())

    # Sort individual DBs by overall agreement.
    def _db_overall(label: str) -> float:
        n_match = sum(_vote_match("yes" if row[label] else "no", row["sonnet"])
                      for _, row in df.iterrows())
        return n_match / len(df)

    db_overall_acc = {label: _db_overall(label) for label in DB_LABELS}
    db_sorted = sorted(DB_LABELS, key=lambda d: -db_overall_acc[d])

    callers: list[tuple[str, str]] = []
    for k in ENSEMBLE_KS:
        callers.append((f"≥{k} DB", "ensemble"))
    for label in db_sorted:
        callers.append((label, "single"))

    def caller_vote(caller: str, kind: str, row: pd.Series) -> str:
        if kind == "single":
            return "yes" if row[caller] else "no"
        k = int(caller.lstrip("≥").rstrip(" DB"))
        return "yes" if sum(bool(row[d]) for d in DB_LABELS) >= k else "no"

    rows_long: list[dict] = []
    overall_acc: dict[str, float] = {}
    for caller_label, kind in callers:
        for bucket in BUCKETS:
            sub = df if bucket == "overall" else df[df["sonnet"] == bucket]
            if sub.empty:
                continue
            n_match = sum(
                _vote_match(caller_vote(caller_label, kind, r), r["sonnet"])
                for _, r in sub.iterrows()
            )
            frac = n_match / len(sub)
            if bucket == "overall":
                overall_acc[caller_label] = frac
            rows_long.append({
                "caller": caller_label,
                "bucket": bucket,
                "bucket_label": BUCKET_LABEL[bucket],
                "fraction": frac,
                "n_total": len(sub),
            })

    plot_df = pd.DataFrame(rows_long)

    print("\nOverall agreement with Sonnet (across whole proteome):")
    for caller_label, _ in callers:
        print(f"  {caller_label:10s}  {overall_acc[caller_label]*100:5.1f}%")

    caller_order = [c[0] for c in callers]
    palette = [
        (ENSEMBLE_PALETTE[int(c[0].lstrip("≥").rstrip(" DB"))] if c[1] == "ensemble"
         else DB_PALETTE[c[0]])
        for c in callers
    ]

    fig, ax = plt.subplots(figsize=(17, 6.5))
    sns.barplot(
        data=plot_df,
        x="bucket_label", y="fraction",
        hue="caller",
        order=[BUCKET_LABEL[b] for b in BUCKETS],
        hue_order=caller_order,
        palette=palette,
        edgecolor="none", saturation=1.0,
        width=0.92,
        ax=ax,
    )

    n_callers = len(caller_order)
    n_buckets = len(BUCKETS)
    bar_width = ax.patches[0].get_width()
    gap = bar_width * 0.4
    for caller_idx in range(n_callers):
        shift = gap if caller_idx >= 5 else 0
        for j in range(n_buckets):
            patch = ax.patches[caller_idx * n_buckets + j]
            patch.set_x(patch.get_x() + shift)

    for i, caller_label in enumerate(caller_order):
        for j, bucket in enumerate(BUCKETS):
            patch = ax.patches[i * n_buckets + j]
            cell = plot_df[(plot_df["caller"] == caller_label) & (plot_df["bucket"] == bucket)]
            if cell.empty:
                continue
            frac = cell.iloc[0]["fraction"]
            ax.text(
                patch.get_x() + patch.get_width() / 2,
                patch.get_height() + 0.01,
                f"{frac:.0%}",
                ha="center", va="bottom",
                fontsize=11, color=BRAND_INK,
            )

    ax.set_xlabel("")
    ax.set_ylabel("Fraction agreeing with\nSonnet (+ NCBI) verdict")
    ax.set_ylim(0, 1.14)

    handles, _ = ax.get_legend_handles_labels()
    legend_labels = [f"{lbl}  ({overall_acc[lbl]:.0%})" for lbl in caller_order]
    ax.legend(
        handles, legend_labels,
        title="Caller (overall agreement)",
        loc="upper left", bbox_to_anchor=(1.02, 1.0),
        frameon=False, fontsize=14,
    )

    totals = {
        row["bucket"]: row["n_total"]
        for _, row in plot_df.iterrows()
        if row["caller"] == caller_order[0]
    }
    subtitle = (
        f"n(overall) = {totals.get('overall', 0):,}  ·  "
        f"n(yes) = {totals.get('yes', 0):,}  ·  "
        f"n(contextual) = {totals.get('contextual', 0):,}  ·  "
        f"n(no) = {totals.get('no', 0):,}  "
        f"(whole protein-coding proteome with a rated Sonnet verdict)"
    )
    ax.text(
        0.5, -0.18, subtitle,
        transform=ax.transAxes, ha="center", va="top",
        fontsize=14, color=BRAND_NEUTRAL,
    )
    sns.despine(ax=ax, top=True, right=True)

    fig.tight_layout()
    out_pdf = Path("db_vs_sonnet_whole_proteome.pdf")
    out_png = Path("db_vs_sonnet_whole_proteome.png")
    fig.savefig(out_pdf, bbox_inches="tight", metadata={"Subject": GIST_URL})
    fig.savefig(out_png, bbox_inches="tight", dpi=300, metadata={"Source": GIST_URL})
    print(f"Wrote {out_pdf} + {out_png}")


 if __name__ == "__main__":
    main()
	# /// script
	# requires-python = ">=3.11"
	# dependencies = [
	# "matplotlib>=3.9",
	# "pandas>=2.2",
	# "seaborn>=0.13",
	# "httpx>=0.27",
	# ]
	# ///
	"""Reproduce ``db_vs_sonnet_whole_proteome.{pdf,png}`` from the public repo.

	Whole-genome DB ↔ Sonnet agreement plot. The 147-gene bench has hand-
	curated ground truth; on the whole proteome it doesn't exist, so we use
	the Sonnet (+ NCBI) triage verdict as the reference and ask how
	often each surface DB (under its bench-optimized cutoff) and each
	≥k-DB ensemble agrees with Sonnet, split by Sonnet's verdict bucket.

	Soft-credit rule (same as the bench plot): DB "yes" matches Sonnet
	"yes" or "contextual"; DB "no" matches Sonnet "no" only.

	Four buckets per caller:
	* overall — across all genes with a rated Sonnet verdict
	* Sonnet = yes — sensitivity-like
	* Sonnet = contextual — DB must yes-vote for a match
	* Sonnet = no — specificity-like

	10 callers: 5 ensembles (≥1..≥5 of the 5 surface DBs vote yes, teal
	ramp) + 5 individual DBs under their optimized cutoffs (sorted desc by
	overall agreement). Sonnet is the reference and does not appear as a
	bar (would be 100% by construction).

	Visual styling matches the in-repo ``_plotting_config`` (Deliverome
	categorical palette + Manrope-when-available). Inlined so the gist
	runs standalone.

	Data: catalog fetched live from
	``https://api.deliverome.org/surfaceome/v1/catalog`` (~19,324 genes
	with per-gene Sonnet+NCBI verdict); per-DB votes from
	candidate_universe.tsv + db_optimized_cutoffs.tsv via
	raw.githubusercontent.com.

	Standalone — ``uv run make_db_vs_sonnet_whole_proteome.py``.
	"""
	from __future__ import annotations

	import io
	from pathlib import Path

	import httpx
	import matplotlib.font_manager as fm
	import matplotlib.pyplot as plt
	import pandas as pd
	import seaborn as sns

	REPO = "Deliverome-Project/accessible-surfaceome"
	BRANCH = "main"
	BASE = f"https://raw.githubusercontent.com/{REPO}/{BRANCH}"

	CATALOG_TSV = f"{BASE}/data/processed/catalog/whole_proteome_catalog.tsv"
	OPT_CUTOFFS_TSV = f"{BASE}/data/processed/triage_bench/db_optimized_cutoffs.tsv"

	# Published reproduction gist (embedded into output PNG Source / PDF
	# Subject metadata — mirrors save_figure in _plotting_config.py).
	GIST_URL = "https://gist.github.com/beccajcarlson/1265c867a3bbb08efd81262789e1f013"

	# ──── Inline brand styling — sentinel: brand-style-v3 ────
	# Mirrors src/accessible_surfaceome/audit/_plotting_config.py so the gist
	# stays self-contained. Kept in sync via tests/test_figure_gists_styling.py.
	BRAND_PALETTE = [
	"#BC3C4C", # maroon-light
	"#3D6B60", # teal-mid
	"#F4AA28", # amber-bright
	"#8878C8", # lavender-bright
	"#6E1428", # maroon-dark
	"#7AAB9F", # teal-light
	]
	BRAND_SEQUENTIAL = {
	"maroon": ["#3E0A18", "#6E1428", "#922038", "#BC3C4C", "#F0A098", "#FDE8E6"],
	"teal": ["#152E28", "#244840", "#3D6B60", "#4D8A80", "#7AAB9F", "#CCE8E4"],
	"amber": ["#5A2608", "#8C4210", "#C07830", "#F4AA28", "#F4C070", "#FAECD4"],
	"lavender": ["#1E1450", "#3A2888", "#5848A8", "#8878C8", "#A090D4", "#E4E0F8"],
	}
	BRAND_CLAUDE_ORANGE = "#d87851"
	BRAND_INK = "#1F1718"
	BRAND_NEUTRAL = "#6F5D5A"
	BRAND_GRID = "#E6DAD4"


	def _register_brand_fonts() -> None:
	candidates = [
	Path(__file__).resolve().parents[3] / "assets" / "fonts",
	Path.cwd() / "assets" / "fonts",
	]
	for fonts_dir in candidates:
	if fonts_dir.is_dir():
	for path in sorted(list(fonts_dir.glob(".ttf")) + list(fonts_dir.glob(".otf"))):
	try:
	fm.fontManager.addfont(str(path))
	except Exception: # noqa: BLE001
	continue
	return


	def _apply_brand_style() -> None:
	"""Inline equivalent of `setup_plotting_style`. Sentinel: brand-style-v3.
	v2: bumped sizes ~25% + explicit medium weight (avoids ExtraLight default
	that matplotlib picks from the Manrope variable file). Companion to the
	static Manrope-{regular,medium,semibold,bold}.otf files in assets/fonts/."""
	_register_brand_fonts()
	sns.set_style("whitegrid")
	sns.set_context("notebook", font_scale=1.0)
	plt.rcParams.update({
	"savefig.dpi": 300,
	"savefig.bbox": "tight",
	"figure.facecolor": "none",
	"savefig.facecolor": "none",
	"font.family": "sans-serif",
	"font.sans-serif": ["Manrope", "Outfit", "DejaVu Sans", "Liberation Sans", "Arial"],
	"font.weight": "medium",
	"font.size": 21,
	"axes.labelsize": 25,
	"axes.labelweight": "medium",
	"axes.titlesize": 0,
	"axes.titlepad": 0,
	"axes.spines.top": False,
	"axes.spines.right": False,
	"axes.grid": True,
	"axes.axisbelow": True,
	"axes.edgecolor": BRAND_GRID,
	"axes.labelcolor": BRAND_INK,
	"axes.facecolor": "none",
	"text.color": BRAND_INK,
	"grid.alpha": 0.35,
	"grid.linestyle": "-",
	"grid.linewidth": 0.7,
	"grid.color": BRAND_GRID,
	"xtick.labelsize": 20,
	"ytick.labelsize": 20,
	"xtick.color": BRAND_INK,
	"ytick.color": BRAND_INK,
	"legend.frameon": False,
	"legend.fontsize": 20,
	"patch.edgecolor": "none",
	"patch.linewidth": 0.0,
	})


	DB_LABELS = ["UniProt", "GO CC", "HPA", "SURFY", "CSPA"]
	ENSEMBLE_KS = [1, 2, 3, 4, 5]
	DB_PALETTE = {label: BRAND_PALETTE[i] for i, label in enumerate(DB_LABELS)}
	# Sequential teal ramp for the ≥k ensembles (light → dark = permissive → strict).
	ENSEMBLE_PALETTE = {
	1: "#7AAB9F", # teal-light (≥1 = permissive union)
	2: "#4D8A80",
	3: "#3D6B60",
	4: "#244840",
	5: "#152E28", # teal-deep (≥5 = strict intersection)
	}

	BUCKETS = ["overall", "yes", "contextual", "no"]
	BUCKET_LABEL = {
	"overall": "overall",
	"yes": "Sonnet = yes",
	"contextual": "Sonnet = contextual\n(yes-vote = match)",
	"no": "Sonnet = no\n(no-vote = match)",
	}


	def _fetch_tsv(url: str) -> pd.DataFrame:
	local = Path(__file__).resolve().parents[3] / url[len(BASE) + 1:]
	if local.is_file():
	return pd.read_csv(local, sep="\t")
	r = httpx.get(url, timeout=30)
	r.raise_for_status()
	return pd.read_csv(io.StringIO(r.text), sep="\t")


	def _vote_match(db_vote: str, sonnet: str) -> bool:
	"""DB calls yes/no; Sonnet calls yes/contextual/no. Soft credit on positive side."""
	if db_vote == sonnet:
	return True
	return db_vote in ("yes", "contextual") and sonnet in ("yes", "contextual")


	def main() -> None:
	_apply_brand_style()

	# Catalog now comes from a static TSV (sourced from D1 by
	# scripts/export_whole_proteome_catalog_to_tsv.py) instead of the
	# live Worker — keeps the gist self-contained and snapshot-pinned
	# to whatever branch/SHA this script is fetched from. Each row
	# carries the v1-style ``*_surface_flag`` columns AND the
	# canonical Sonnet+NCBI verdict, so this single TSV replaces the
	# CATALOG_URL + CAND_TSV pair the figure used previously.
	catalog = _fetch_tsv(CATALOG_TSV)
	print(f" loaded {len(catalog):,} catalog rows; sonnet = claude-sonnet-4-6")

	opt = _fetch_tsv(OPT_CUTOFFS_TSV)
	uniprot_opt = set(opt.loc[opt["uniprot_optimized"] == 1, "accession"].astype(str))
	cspa_opt = set(opt.loc[opt["cspa_optimized"] == 1, "accession"].astype(str))

	records = []
	for row in catalog.itertuples(index=False):
	v = (str(getattr(row, "sonnet_verdict", "") or "")).strip()
	if v not in ("yes", "contextual", "no"):
	continue
	acc = str(getattr(row, "uniprot_acc", "") or "")
	rec = {
	"symbol": str(getattr(row, "hgnc_symbol", "") or ""),
	"acc": acc,
	"sonnet": v,
	"UniProt": acc in uniprot_opt,
	"CSPA": acc in cspa_opt,
	"GO CC": int(getattr(row, "go_surface_flag", 0) or 0) == 1,
	"HPA": int(getattr(row, "hpa_surface_flag", 0) or 0) == 1,
	"SURFY": int(getattr(row, "surfy_surface_flag", 0) or 0) == 1,
	}
	records.append(rec)

	df = pd.DataFrame(records)
	print(f"\nGenes with a rated Sonnet verdict: {len(df):,}")
	print(df["sonnet"].value_counts())

	# Sort individual DBs by overall agreement.
	def _db_overall(label: str) -> float:
	n_match = sum(_vote_match("yes" if row[label] else "no", row["sonnet"])
	for _, row in df.iterrows())
	return n_match / len(df)

	db_overall_acc = {label: _db_overall(label) for label in DB_LABELS}
	db_sorted = sorted(DB_LABELS, key=lambda d: -db_overall_acc[d])

	callers: list[tuple[str, str]] = []
	for k in ENSEMBLE_KS:
	callers.append((f"≥{k} DB", "ensemble"))
	for label in db_sorted:
	callers.append((label, "single"))

	def caller_vote(caller: str, kind: str, row: pd.Series) -> str:
	if kind == "single":
	return "yes" if row[caller] else "no"
	k = int(caller.lstrip("≥").rstrip(" DB"))
	return "yes" if sum(bool(row[d]) for d in DB_LABELS) >= k else "no"

	rows_long: list[dict] = []
	overall_acc: dict[str, float] = {}
	for caller_label, kind in callers:
	for bucket in BUCKETS:
	sub = df if bucket == "overall" else df[df["sonnet"] == bucket]
	if sub.empty:
	continue
	n_match = sum(
	_vote_match(caller_vote(caller_label, kind, r), r["sonnet"])
	for _, r in sub.iterrows()
	)
	frac = n_match / len(sub)
	if bucket == "overall":
	overall_acc[caller_label] = frac
	rows_long.append({
	"caller": caller_label,
	"bucket": bucket,
	"bucket_label": BUCKET_LABEL[bucket],
	"fraction": frac,
	"n_total": len(sub),
	})

	plot_df = pd.DataFrame(rows_long)

	print("\nOverall agreement with Sonnet (across whole proteome):")
	for caller_label, _ in callers:
	print(f" {caller_label:10s} {overall_acc[caller_label]*100:5.1f}%")

	caller_order = [c[0] for c in callers]
	palette = [
	(ENSEMBLE_PALETTE[int(c[0].lstrip("≥").rstrip(" DB"))] if c[1] == "ensemble"
	else DB_PALETTE[c[0]])
	for c in callers
	]

	fig, ax = plt.subplots(figsize=(17, 6.5))
	sns.barplot(
	data=plot_df,
	x="bucket_label", y="fraction",
	hue="caller",
	order=[BUCKET_LABEL[b] for b in BUCKETS],
	hue_order=caller_order,
	palette=palette,
	edgecolor="none", saturation=1.0,
	width=0.92,
	ax=ax,
	)

	n_callers = len(caller_order)
	n_buckets = len(BUCKETS)
	bar_width = ax.patches[0].get_width()
	gap = bar_width * 0.4
	for caller_idx in range(n_callers):
	shift = gap if caller_idx >= 5 else 0
	for j in range(n_buckets):
	patch = ax.patches[caller_idx * n_buckets + j]
	patch.set_x(patch.get_x() + shift)

	for i, caller_label in enumerate(caller_order):
	for j, bucket in enumerate(BUCKETS):
	patch = ax.patches[i * n_buckets + j]
	cell = plot_df[(plot_df["caller"] == caller_label) & (plot_df["bucket"] == bucket)]
	if cell.empty:
	continue
	frac = cell.iloc[0]["fraction"]
	ax.text(
	patch.get_x() + patch.get_width() / 2,
	patch.get_height() + 0.01,
	f"{frac:.0%}",
	ha="center", va="bottom",
	fontsize=11, color=BRAND_INK,
	)

	ax.set_xlabel("")
	ax.set_ylabel("Fraction agreeing with\nSonnet (+ NCBI) verdict")
	ax.set_ylim(0, 1.14)

	handles, _ = ax.get_legend_handles_labels()
	legend_labels = [f"{lbl} ({overall_acc[lbl]:.0%})" for lbl in caller_order]
	ax.legend(
	handles, legend_labels,
	title="Caller (overall agreement)",
	loc="upper left", bbox_to_anchor=(1.02, 1.0),
	frameon=False, fontsize=14,
	)

	totals = {
	row["bucket"]: row["n_total"]
	for _, row in plot_df.iterrows()
	if row["caller"] == caller_order[0]
	}
	subtitle = (
	f"n(overall) = {totals.get('overall', 0):,} · "
	f"n(yes) = {totals.get('yes', 0):,} · "
	f"n(contextual) = {totals.get('contextual', 0):,} · "
	f"n(no) = {totals.get('no', 0):,} "
	f"(whole protein-coding proteome with a rated Sonnet verdict)"
	)
	ax.text(
	0.5, -0.18, subtitle,
	transform=ax.transAxes, ha="center", va="top",
	fontsize=14, color=BRAND_NEUTRAL,
	)
	sns.despine(ax=ax, top=True, right=True)

	fig.tight_layout()
	out_pdf = Path("db_vs_sonnet_whole_proteome.pdf")
	out_png = Path("db_vs_sonnet_whole_proteome.png")
	fig.savefig(out_pdf, bbox_inches="tight", metadata={"Subject": GIST_URL})
	fig.savefig(out_png, bbox_inches="tight", dpi=300, metadata={"Source": GIST_URL})
	print(f"Wrote {out_pdf} + {out_png}")


	if __name__ == "__main__":
	main()
No results found