"""Grading using a council of LLMs.

Source: GitHub gist ipeirotis/4dd5ae1b70f0c99a3b322dc952b31cf4 by @ipeirotis,
created February 24, 2026.
"""
import json
import os
import re
import time
from tqdm import tqdm
from dataclasses import dataclass
from datetime import datetime
from typing import Any, Dict, List, Optional, Tuple
import pandas as pd
from dotenv import load_dotenv
load_dotenv()
# --- Providers (current SDKs) ---
from openai import OpenAI
from anthropic import Anthropic
from google import genai
# -------------------------------
# Config
# -------------------------------
# Keys of the five rubric dimensions. This list drives the "scores" and
# "rubric_breakdown" JSON schemas, the per-dimension lines of the student
# feedback file, and the score_* columns of grades.csv, in this order.
RUBRIC_DIMENSIONS = [
"problem_framing",
"metrics_economics",
"risk_ethics",
"experimentation",
"communication",
]
# Shared grading instructions. This text is placed verbatim at the top of the
# scoring, revision, and chair prompts, so every model call sees the same
# rubric and the same leniency rules for the flawed exam proctor.
RUBRIC_TEXT = """
You are grading an oral exam for AI/ML Product Management. The students are undergraduate students at NYU Stern. For most, this was their first technical product course.
CRITICAL CONTEXT ON EXAM CONDITIONS:
The AI proctor for this exam had significant design flaws that negatively impacted student performance. Specifically:
1. Stacked Questions: The agent often asked 3-4 distinct questions in a single turn.
2. Moving Targets: When students asked for clarification, the agent often changed the question entirely rather than repeating it.
3. Audio-Only Menus: The agent read long lists of complex options verbally, causing cognitive overload.
Because of this, you must apply the following "Interference Protocols" when grading:
- The "Pick One" Rule: If the agent asked multiple questions at once and the student only answered one or two, grade them ONLY on what they answered. Do not penalize for missing parts of a compound question.
- The "Benefit of Doubt" Rule: If the agent rephrased a question during clarification, credit the student for answering *any* version of the question presented in that sequence.
- Ignore "Stalling": Disregard phrases like "Can you repeat that?" or hesitation. These are valid coping strategies for a poor interface, not signs of ignorance.
- Jargon Leniency: Focus on conceptual understanding over perfect industry terminology (e.g., if they describe "churn" correctly but call it "usage drop," accept it).
Grade on these five dimensions (0-4 each; 0=missing, 4=excellent), using evidence from the transcript:
1) Problem framing: Translating business problems into ML specs. (Did they understand the core user problem?)
2) Metrics & economics: Trade-offs, costs, and counter-metrics. (Focus on their logic regarding trade-offs, even if they struggled to pick a specific metric from a verbal list).
3) Risk & ethics: FAT-P, security risks, failure modes, governance. (Did they identify the harm, even if they needed the options repeated?)
4) Experimentation: A/B testing, hypotheses, validation, controls.
5) Communication: Concise, structured, and handles pushback. *CRITICAL:* Do not penalize the student for confusion caused by the agent's shifting questions. Grade their ability to synthesize the information they *did* hear.
Return JSON that matches the requested schema exactly.
"""
# (c) Cutoff: index rows whose date_ymd is earlier than this date are dropped
# before grading; the cutoff day itself is kept. The export's date_ymd is
# expected to already be a New York local date.
CUTOFF_DATE = "2025-12-11" # inclusive
# (d) Rows for this NetID are excluded from grading (compared case-insensitively).
IGNORE_NETID = "kr888"
# Input directory (holds index.csv and the transcript files it lists) and the
# output root; both can be overridden through environment variables.
TRANSCRIPTS_DIR = os.getenv("TRANSCRIPTS_DIR", "elevenlabs_transcripts")
OUTDIR = os.getenv("GRADING_OUTDIR", "grading_out")
# Model identifiers per provider, overridable through environment variables.
# NOTE(review): Anthropic API model ids are normally hyphenated
# (e.g. "claude-sonnet-4-5"); confirm that the dotted default below resolves.
OPENAI_MODEL = os.getenv("OPENAI_MODEL", "gpt-5")
ANTHROPIC_MODEL = os.getenv("ANTHROPIC_MODEL", "claude-sonnet-4.5")
GEMINI_MODEL = os.getenv("GEMINI_MODEL", "gemini-2.5-pro")
SLEEP_BETWEEN_CALLS = 0.15 # seconds slept after each model call (gentle pacing)
# -------------------------------
# Utilities
# -------------------------------
def sanitize_filename(s: str, max_len: int = 120) -> str:
    """Turn arbitrary text into a conservative file-name fragment.

    Surrounding whitespace is trimmed, spaces become underscores, every run of
    characters other than ASCII letters, digits, ``.``, ``_`` and ``-`` becomes
    a single hyphen, repeated hyphens are collapsed, and leading/trailing
    ``-``, ``.`` and ``_`` are removed. A falsy input (``None`` or ``""``)
    yields ``""``.

    Args:
        s: Text to sanitize.
        max_len: Maximum length of the returned string.

    Returns:
        The sanitized text, cut to at most ``max_len`` characters.
    """
    cleaned = (s or "").strip()
    cleaned = cleaned.replace(" ", "_")
    cleaned = re.sub(r"[^A-Za-z0-9._-]+", "-", cleaned)
    cleaned = re.sub(r"-{2,}", "-", cleaned)
    cleaned = cleaned.strip("-._")
    if len(cleaned) > max_len:
        return cleaned[:max_len]
    return cleaned
def status(msg: str) -> None:
    """Print one progress line and flush stdout so it appears immediately."""
    print(msg, end="\n", flush=True)
def parse_date_ymd(s: str) -> datetime:
    """Parse a ``YYYY-MM-DD`` string into a naive ``datetime`` at midnight.

    Raises:
        ValueError: If ``s`` does not match the ``%Y-%m-%d`` format.
    """
    ymd_format = "%Y-%m-%d"
    parsed = datetime.strptime(s, ymd_format)
    return parsed
def ensure_dir(p: str) -> None:
    """Create directory ``p`` with any missing parents; do nothing if it exists."""
    os.makedirs(name=p, exist_ok=True)
def read_text(path: str) -> str:
    """Return the full contents of the file at ``path``, decoded as UTF-8."""
    with open(path, mode="r", encoding="utf-8") as handle:
        contents = handle.read()
    return contents
def write_text(path: str, text: str) -> None:
    """Write ``text`` to ``path`` as UTF-8, replacing any existing file."""
    with open(path, mode="w", encoding="utf-8") as handle:
        handle.write(text)
def write_json(path: str, obj: Any) -> None:
    """Serialize ``obj`` to ``path`` as UTF-8 JSON.

    The output uses a two-space indent and keeps non-ASCII characters
    unescaped. An existing file at ``path`` is replaced.
    """
    dump_options = {"ensure_ascii": False, "indent": 2}
    with open(path, mode="w", encoding="utf-8") as handle:
        json.dump(obj, handle, **dump_options)
def extract_json_block(txt: str) -> str:
    """
    Pull the first balanced ``{...}`` object out of raw model output.

    A markdown fence at the very start or very end of the text is removed
    first. Scanning then begins at the first ``{`` and tracks brace depth,
    ignoring braces that appear inside double-quoted strings (backslash
    escapes are honoured). The returned substring is not validated as JSON.

    Raises:
        ValueError: If the text has no ``{``, or the object starting at the
            first ``{`` is never closed.
    """
    cleaned = (txt or "").strip()
    cleaned = re.sub(r"^```(?:json)?\s*", "", cleaned)
    cleaned = re.sub(r"\s*```$", "", cleaned)

    begin = cleaned.find("{")
    if begin < 0:
        raise ValueError("No '{' found in output")

    nesting = 0
    inside_string = False
    escaped = False
    for pos in range(begin, len(cleaned)):
        char = cleaned[pos]
        if inside_string:
            # Inside a string only escapes and the closing quote matter.
            if escaped:
                escaped = False
            elif char == "\\":
                escaped = True
            elif char == '"':
                inside_string = False
            continue
        if char == '"':
            inside_string = True
        elif char == "{":
            nesting += 1
        elif char == "}":
            nesting -= 1
            if nesting == 0:
                return cleaned[begin : pos + 1]
    raise ValueError("Unclosed JSON object in output")
def repair_json_with_model(bad_text: str) -> Dict[str, Any]:
    """
    Ask the OpenAI model to rewrite malformed JSON text as valid JSON.

    This is a guardrail, not a new grade: the prompt tells the model to keep
    keys and meaning and only fix syntax. The request goes to OPENAI_MODEL
    (the same model used for grading) with no sampling parameters set.

    Raises:
        ValueError: If the reply contains no balanced JSON object.
        json.JSONDecodeError: If the reply is still not valid JSON.
    """
    instructions = (
        "You will be given text that should be a JSON object but is invalid.\n"
        "Return ONLY a valid JSON object. Do not change keys or meaning.\n"
        "If there are unescaped quotes or trailing commas, fix them.\n\n"
        f"TEXT:\n{bad_text}"
    )
    openai_client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
    response = openai_client.responses.create(model=OPENAI_MODEL, input=instructions)
    repaired_text = response.output_text.strip()
    return json.loads(extract_json_block(repaired_text))
def strict_json_from_text(txt: str) -> Dict[str, Any]:
    """
    Parse model output into JSON, with a single model-assisted repair attempt.

    The candidate object is extracted first; if that extraction fails, the
    ValueError propagates and no repair is tried. If the extracted text does
    not parse, it is sent once to repair_json_with_model, whose own errors
    propagate unchanged.
    """
    candidate = extract_json_block(txt)
    try:
        parsed = json.loads(candidate)
    except json.JSONDecodeError:
        # Exactly one repair pass; a second failure surfaces to the caller.
        return repair_json_with_model(candidate)
    return parsed
# -------------------------------
# Consolidate transcripts
# -------------------------------
@dataclass
class StudentBundle:
    """All transcript attempts of one (netid, projectid) pair, merged for grading."""

    # Identity, taken from index.csv.
    netid: str
    student: str
    projectid: str
    # Number of index rows (attempts) merged into this bundle.
    attempts_count: int
    # Smallest and largest date_ymd values among the merged attempts.
    date_first: str
    date_last: str
    # Sums of call_duration_secs and message_count over the merged attempts.
    duration_total: int
    messages_total: int
    # Every attempt's transcript, each preceded by an "ATTEMPT" header line.
    transcript_concat: str
def load_and_bundle() -> List[StudentBundle]:
    """
    Read index.csv from TRANSCRIPTS_DIR and build one bundle per (netid, projectid).

    Rows dated before CUTOFF_DATE and rows belonging to IGNORE_NETID are
    dropped. Within each group the attempts are ordered by attempt number and
    then date, and their transcript files are concatenated, each preceded by
    an "ATTEMPT" header line.

    Raises:
        ValueError: If index.csv lacks any expected column.
    """
    index = pd.read_csv(os.path.join(TRANSCRIPTS_DIR, "index.csv"))

    # Columns the exporter is expected to provide.
    required_columns = {"student", "netid", "projectid", "attempt", "date_ymd", "call_duration_secs", "message_count", "transcript_file"}
    absent = required_columns - set(index.columns)
    if absent:
        raise ValueError(f"index.csv missing columns: {sorted(absent)}")

    # (c) keep rows dated on or after the cutoff
    cutoff = parse_date_ymd(CUTOFF_DATE)
    on_or_after_cutoff = index["date_ymd"].apply(lambda value: parse_date_ymd(str(value)) >= cutoff)
    index = index[on_or_after_cutoff]

    # (d) drop the ignored NetID, case-insensitively
    not_ignored = index["netid"].astype(str).str.lower() != IGNORE_NETID.lower()
    index = index[not_ignored]

    # stable ordering
    index = index.sort_values(["netid", "projectid", "attempt", "date_ymd"])

    result: List[StudentBundle] = []
    for (netid, projectid), group in index.groupby(["netid", "projectid"], dropna=False):
        group = group.sort_values(["attempt", "date_ymd"])
        student_name = str(group.iloc[0]["student"])

        pieces = []
        for _, record in group.iterrows():
            attempt = int(record["attempt"])
            date_ymd = str(record["date_ymd"])
            tf = str(record["transcript_file"])
            header = f"\n\n===== ATTEMPT {attempt:02d} | {date_ymd} | {tf} =====\n\n"
            pieces.append(header)
            pieces.append(read_text(os.path.join(TRANSCRIPTS_DIR, tf)))

        dates = group["date_ymd"]
        bundle = StudentBundle(
            netid=str(netid),
            student=student_name,
            projectid=str(projectid),
            attempts_count=len(group),
            date_first=str(dates.min()),
            date_last=str(dates.max()),
            duration_total=int(group["call_duration_secs"].sum()),
            messages_total=int(group["message_count"].sum()),
            transcript_concat="".join(pieces).strip() + "\n",
        )
        result.append(bundle)
    return result
# -------------------------------
# Prompting + schemas
# -------------------------------
# JSON Schema for one grader's output. It is serialized into the scoring,
# revision, and chair prompts as text; nothing in this file validates model
# output against it.
GRADE_SCHEMA = {
"type": "object",
"properties": {
# One integer score (0-4) for every rubric dimension; no extra keys allowed.
"scores": {
"type": "object",
"properties": {d: {"type": "integer", "minimum": 0, "maximum": 4} for d in RUBRIC_DIMENSIONS},
"required": RUBRIC_DIMENSIONS,
"additionalProperties": False,
},
"overall": {"type": "integer", "minimum": 0, "maximum": 20},
"letter": {"type": "string"},
"confidence": {"type": "integer", "minimum": 1, "maximum": 5},
# Transcript quotes backing the scores, each tied to a dimension.
"evidence": {
"type": "array",
"items": {
"type": "object",
"properties": {
"dimension": {"type": "string"},
"quote": {"type": "string"},
"comment": {"type": "string"},
},
"required": ["dimension", "quote", "comment"],
"additionalProperties": False,
},
},
"strengths": {"type": "array", "items": {"type": "string"}},
"improvements": {"type": "array", "items": {"type": "string"}},
"notes_for_chair": {"type": "string"},
},
"required": ["scores", "overall", "letter", "confidence", "evidence", "strengths", "improvements", "notes_for_chair"],
"additionalProperties": False,
}
# JSON Schema for the student-facing feedback object the chair must return.
# main() reads exactly these four keys when it writes the feedback text file.
STUDENT_FEEDBACK_SCHEMA = {
"type": "object",
"properties": {
# One prose string per rubric dimension.
"rubric_breakdown": {
"type": "object",
"properties": {d: {"type": "string"} for d in RUBRIC_DIMENSIONS},
"required": RUBRIC_DIMENSIONS,
"additionalProperties": False,
},
"top_strengths": {"type": "array", "items": {"type": "string"}},
"top_actions": {"type": "array", "items": {"type": "string"}},
"closing": {"type": "string"},
},
"required": ["rubric_breakdown", "top_strengths", "top_actions", "closing"],
"additionalProperties": False,
}
def scoring_prompt(student: str, netid: str, projectid: str, transcript: str) -> str:
    """Build the round-1 grading prompt.

    The prompt is the rubric text, the student's identifying fields, the
    transcript, and the serialized GRADE_SCHEMA the reply must match.
    """
    schema_json = json.dumps(GRADE_SCHEMA)
    return f"""{RUBRIC_TEXT}
Student: {student}
NetID: {netid}
Project ID: {projectid}
Transcript (may include multiple attempts, in order):
{transcript}
Return ONLY valid JSON matching this schema:
{schema_json}
"""
def revise_prompt(original_json: Dict[str, Any], peer_jsons: List[Dict[str, Any]]) -> str:
    """Build the round-2 prompt asking a grader to reconsider its own grade.

    The prompt contains the rubric text, the grader's earlier JSON, the peer
    graders' JSON, and the serialized GRADE_SCHEMA. The transcript itself is
    not included.
    """
    own_grade = json.dumps(original_json, ensure_ascii=False)
    peer_grades = json.dumps(peer_jsons, ensure_ascii=False)
    schema_json = json.dumps(GRADE_SCHEMA)
    return f"""{RUBRIC_TEXT}
You previously graded the exam as:
{own_grade}
Two other graders produced:
{peer_grades}
Revise your grade if warranted. If you disagree, explain why in notes_for_chair.
Return ONLY valid JSON matching:
{schema_json}
"""
def chair_prompt(student: str, netid: str, projectid: str, transcript: str,
                 revised: Dict[str, Dict[str, Any]]) -> str:
    """Build the chair prompt that produces the final grade and feedback.

    The prompt contains the rubric text, the chair's instructions, the
    student's identifying fields, the transcript, the revised reviews, and a
    schema wrapping GRADE_SCHEMA and STUDENT_FEEDBACK_SCHEMA together with an
    internal summary string.
    """
    reviews_json = json.dumps(revised, ensure_ascii=False)
    output_schema = {
        "type": "object",
        "properties": {
            "final_grade": GRADE_SCHEMA,
            "student_feedback": STUDENT_FEEDBACK_SCHEMA,
            "chair_internal_summary": {"type": "string"},
        },
        "required": ["final_grade", "student_feedback", "chair_internal_summary"],
        "additionalProperties": False,
    }
    schema_json = json.dumps(output_schema)
    return f"""{RUBRIC_TEXT}
You are the chair. You must produce:
1) Final rubric scores (0-4 each) and overall (0-20), plus letter grade.
2) A student-facing feedback object (no numeric grades shown in the prose).
3) A short internal summary justifying any overrides.
Student: {student}
NetID: {netid}
Project ID: {projectid}
Transcript:
{transcript}
Revised reviews:
{reviews_json}
Output JSON with this schema ONLY:
{schema_json}
IMPORTANT: Output only valid JSON. No markdown. No commentary. No trailing commas.
"""
# -------------------------------
# Model calls
# -------------------------------
def call_openai(prompt: str) -> Dict[str, Any]:
    """Send ``prompt`` to OPENAI_MODEL and return its reply parsed as JSON.

    A new client is created on every call from OPENAI_API_KEY. Parsing and
    the single repair attempt are delegated to strict_json_from_text.
    """
    openai_client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
    response = openai_client.responses.create(model=OPENAI_MODEL, input=prompt)
    # .output_text is the SDK's flattened view of the reply's text blocks.
    reply = response.output_text
    return strict_json_from_text(reply)
def call_anthropic(prompt: str) -> Dict[str, Any]:
    """Send ``prompt`` to ANTHROPIC_MODEL and return its reply parsed as JSON.

    The request uses temperature 0.0 and caps the reply at 3000 tokens. Only
    content blocks whose type is "text" contribute to the parsed reply.
    NOTE(review): a reply cut off at the token cap would presumably fail
    parsing as an unclosed object; confirm the cap is large enough for the
    chair output.
    """
    anthropic_client = Anthropic(api_key=os.getenv("ANTHROPIC_API_KEY"))
    message = anthropic_client.messages.create(
        model=ANTHROPIC_MODEL,
        max_tokens=3000,
        temperature=0.0,
        messages=[{"role": "user", "content": prompt}],
    )
    text_parts = []
    for block in message.content:
        if getattr(block, "type", "") == "text":
            text_parts.append(block.text)
    return strict_json_from_text("".join(text_parts))
def call_gemini(prompt: str) -> Dict[str, Any]:
    """Send ``prompt`` to GEMINI_MODEL and return its reply parsed as JSON.

    A falsy reply text is treated as an empty string, which then fails in
    strict_json_from_text with a ValueError.
    """
    gemini_client = genai.Client(api_key=os.getenv("GEMINI_API_KEY"))
    response = gemini_client.models.generate_content(model=GEMINI_MODEL, contents=prompt)
    reply = response.text
    if not reply:
        reply = ""
    return strict_json_from_text(reply)
# -------------------------------
# Main council pipeline
# -------------------------------
def grade_one(bundle: StudentBundle, run_dir: str) -> Tuple[Dict[str, Any], Dict[str, Any]]:
    """
    Run the three-model council on one student bundle.

    Round 1 has each model grade the transcript independently. Round 2 shows
    each model its own grade plus the two peer grades and asks for a revision.
    The chair (Claude) then produces the final grade and student feedback
    from the transcript and the revised reviews.

    Each model reply is written to ``<run_dir>/audit/<netid>__<projectid>/``
    as soon as it is received, so the results obtained so far survive a
    failure in a later call.

    Returns: (final_grade_json, student_feedback_json) from chair output

    Raises:
        KeyError: If the chair output lacks "final_grade" or "student_feedback".
        Any exception raised by the model calls or by JSON parsing propagates.
    """
    sid = f"{sanitize_filename(bundle.netid)}__{sanitize_filename(bundle.projectid)}"
    audit_dir = os.path.join(run_dir, "audit", sid)
    ensure_dir(audit_dir)

    def save(filename: str, payload: Dict[str, Any]) -> None:
        # Persist immediately: a later call may fail and abort this student.
        write_json(os.path.join(audit_dir, filename), payload)

    # Round 1
    p1 = scoring_prompt(bundle.student, bundle.netid, bundle.projectid, bundle.transcript_concat)
    status(" → Round 1: ChatGPT")
    r1_openai = call_openai(p1)
    save("r1_openai.json", r1_openai)
    status(" ✓ Round 1: ChatGPT done")
    time.sleep(SLEEP_BETWEEN_CALLS)
    status(" → Round 1: Claude")
    r1_anth = call_anthropic(p1)
    save("r1_claude.json", r1_anth)
    status(" ✓ Round 1: Claude done")
    time.sleep(SLEEP_BETWEEN_CALLS)
    status(" → Round 1: Gemini")
    r1_gem = call_gemini(p1)
    save("r1_gemini.json", r1_gem)
    status(" ✓ Round 1: Gemini done")
    time.sleep(SLEEP_BETWEEN_CALLS)

    # Round 2 (share peers; each revises)
    status(" → Round 2 revisions")
    r2_openai = call_openai(revise_prompt(r1_openai, [r1_anth, r1_gem]))
    save("r2_openai.json", r2_openai)
    status(" ✓ Round 2: ChatGPT revised")
    time.sleep(SLEEP_BETWEEN_CALLS)
    r2_anth = call_anthropic(revise_prompt(r1_anth, [r1_openai, r1_gem]))
    save("r2_claude.json", r2_anth)
    status(" ✓ Round 2: Claude revised")
    time.sleep(SLEEP_BETWEEN_CALLS)
    r2_gem = call_gemini(revise_prompt(r1_gem, [r1_openai, r1_anth]))
    save("r2_gemini.json", r2_gem)
    status(" ✓ Round 2: Gemini revised")
    time.sleep(SLEEP_BETWEEN_CALLS)
    revised = {"chatgpt": r2_openai, "claude": r2_anth, "gemini": r2_gem}

    # Chair (Claude)
    status(" → Chair summary (Claude)")
    chair_out = call_anthropic(
        chair_prompt(bundle.student, bundle.netid, bundle.projectid, bundle.transcript_concat, revised)
    )
    # Saved before the key lookups below so a malformed chair reply is still auditable.
    save("chair.json", chair_out)
    status(" ✓ Chair summary done")
    return chair_out["final_grade"], chair_out["student_feedback"]
def _render_feedback(bundle: StudentBundle, fb: Dict[str, Any]) -> str:
    """Format the chair's student_feedback object as the plain-text feedback file."""
    # Student-facing prose (no numeric grade in prose)
    lines = [
        f"Feedback for {bundle.student} ({bundle.netid}) | Project {bundle.projectid}\n",
        "Rubric breakdown:\n",
    ]
    for d in RUBRIC_DIMENSIONS:
        lines.append(f"- {d.replace('_',' ').title()}: {fb['rubric_breakdown'][d]}")
    lines.append("\nTop strengths:")
    lines.extend(f"- {s}" for s in fb["top_strengths"])
    lines.append("\nTop actions for improvement:")
    lines.extend(f"- {a}" for a in fb["top_actions"])
    lines.append(f"\n{fb['closing']}\n")
    return "\n".join(lines)


def _write_grades(rows: List[Dict[str, Any]], run_dir: str) -> str:
    """Write grades.csv for the given rows and return its path."""
    # Explicit columns keep the header (and the sort keys) present even when
    # no student was graded; the order matches the row dicts built in main().
    columns = [
        "netid",
        "student",
        "projectid",
        "attempts_count",
        "date_first",
        "date_last",
        "duration_total_secs",
        "messages_total",
        *[f"score_{d}" for d in RUBRIC_DIMENSIONS],
        "overall_0_20",
        "letter",
        "confidence_1_5",
        "feedback_file",
    ]
    grades_path = os.path.join(run_dir, "grades.csv")
    frame = pd.DataFrame(rows, columns=columns).sort_values(["netid", "projectid"])
    frame.to_csv(grades_path, index=False)
    return grades_path


def main() -> None:
    """
    Grade every bundle and write the run's outputs.

    Creates ``<OUTDIR>/run_<timestamp>/`` with a ``feedback/`` text file per
    graded bundle, the ``audit/`` artifacts written by grade_one, and
    ``grades.csv``. If grading a student raises, the exception still
    propagates, but grades.csv is written first with the rows completed so far.
    """
    ensure_dir(OUTDIR)
    run_dir = os.path.join(OUTDIR, datetime.now().strftime("run_%Y%m%d_%H%M%S"))
    ensure_dir(run_dir)
    ensure_dir(os.path.join(run_dir, "feedback"))
    ensure_dir(os.path.join(run_dir, "audit"))
    bundles = load_and_bundle()
    rows: List[Dict[str, Any]] = []
    used_feedback_names = set()
    try:
        for i, b in enumerate(tqdm(bundles, desc="Grading students", unit="student"), start=1):
            status(f"[{i}/{len(bundles)}] netid={b.netid} student={b.student} project={b.projectid} attempts={b.attempts_count}")
            final_grade, student_feedback = grade_one(b, run_dir)

            # Write student feedback file. Bundles are per (netid, projectid),
            # so a NetID can occur more than once; keep the plain <netid>.txt
            # name when it is free and add the project id only on a collision,
            # so an earlier feedback file is not overwritten.
            fname = f"{sanitize_filename(b.netid)}.txt"
            if fname in used_feedback_names:
                fname = f"{sanitize_filename(b.netid)}__{sanitize_filename(b.projectid)}.txt"
            used_feedback_names.add(fname)
            fpath = os.path.join(run_dir, "feedback", fname)
            write_text(fpath, _render_feedback(b, student_feedback))

            # CSV row
            scores = final_grade["scores"]
            rows.append({
                "netid": b.netid,
                "student": b.student,
                "projectid": b.projectid,
                "attempts_count": b.attempts_count,
                "date_first": b.date_first,
                "date_last": b.date_last,
                "duration_total_secs": b.duration_total,
                "messages_total": b.messages_total,
                **{f"score_{d}": int(scores[d]) for d in RUBRIC_DIMENSIONS},
                "overall_0_20": int(final_grade["overall"]),
                "letter": final_grade["letter"],
                "confidence_1_5": int(final_grade["confidence"]),
                "feedback_file": os.path.relpath(fpath, start=run_dir),
            })
    finally:
        # Runs on success and on failure, so completed grades are never lost.
        grades_path = _write_grades(rows, run_dir)
        print(f"Wrote: {grades_path}")
    print(f"Feedback dir: {os.path.join(run_dir, 'feedback')}")


if __name__ == "__main__":
    main()
# (GitHub page footer, not part of the script: "Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment")