ipeirotis · February 24, 2026 14:16
diff --git a/council_grade.py b/council_grade.py
 import json
 import os
 import re
 import time
 from tqdm import tqdm
 from dataclasses import dataclass
 from datetime import datetime
 from typing import Any, Dict, List, Optional, Tuple

 import pandas as pd
 from dotenv import load_dotenv

 load_dotenv()

 # --- Providers (current SDKs) ---
 from openai import OpenAI
 from anthropic import Anthropic
 from google import genai

 # -------------------------------
 # Config
 # -------------------------------
 # Machine-readable keys for the five rubric dimensions. This list supplies the
 # per-dimension properties of both JSON schemas defined later in the file and the
 # score_<dimension> columns written to grades.csv.
 RUBRIC_DIMENSIONS = [
    "problem_framing",
    "metrics_economics",
    "risk_ethics",
    "experimentation",
    "communication",
 ]

 # Grading instructions shared by every prompt: the scoring, revision and chair
 # prompt builders each start their text with this block.
 RUBRIC_TEXT = """
 You are grading an oral exam for AI/ML Product Management. The students are undergraduate students at NYU Stern. For most, this was their first technical product course.

 CRITICAL CONTEXT ON EXAM CONDITIONS:
 The AI proctor for this exam had significant design flaws that negatively impacted student performance. Specifically:
 1. Stacked Questions: The agent often asked 3-4 distinct questions in a single turn.
 2. Moving Targets: When students asked for clarification, the agent often changed the question entirely rather than repeating it.
 3. Audio-Only Menus: The agent read long lists of complex options verbally, causing cognitive overload.

 Because of this, you must apply the following "Interference Protocols" when grading:
 - The "Pick One" Rule: If the agent asked multiple questions at once and the student only answered one or two, grade them ONLY on what they answered. Do not penalize for missing parts of a compound question.
 - The "Benefit of Doubt" Rule: If the agent rephrased a question during clarification, credit the student for answering *any* version of the question presented in that sequence.
 - Ignore "Stalling": Disregard phrases like "Can you repeat that?" or hesitation. These are valid coping strategies for a poor interface, not signs of ignorance.
 - Jargon Leniency: Focus on conceptual understanding over perfect industry terminology (e.g., if they describe "churn" correctly but call it "usage drop," accept it).

 Grade on these five dimensions (0-4 each; 0=missing, 4=excellent), using evidence from the transcript:

 1) Problem framing: Translating business problems into ML specs. (Did they understand the core user problem?)
 2) Metrics & economics: Trade-offs, costs, and counter-metrics. (Focus on their logic regarding trade-offs, even if they struggled to pick a specific metric from a verbal list).
 3) Risk & ethics: FAT-P, security risks, failure modes, governance. (Did they identify the harm, even if they needed the options repeated?)
 4) Experimentation: A/B testing, hypotheses, validation, controls.
 5) Communication: Concise, structured, and handles pushback. *CRITICAL:* Do not penalize the student for confusion caused by the agent's shifting questions. Grade their ability to synthesize the information they *did* hear.

 Return JSON that matches the requested schema exactly.
 """

 # (c) cutoff: do not grade anything before Dec 11, 2025 (NY local date already in your export)
 # Compared against each row's date_ymd with >=, so the cutoff day itself is graded.
 CUTOFF_DATE = "2025-12-11"  # inclusive
 # (d) ignore this student
 # Matched case-insensitively against the netid column.
 IGNORE_NETID = "kr888"

 # Directory that holds index.csv and the transcript files it references.
 TRANSCRIPTS_DIR = os.getenv("TRANSCRIPTS_DIR", "elevenlabs_transcripts")
 # Parent directory; each run creates its own run_<timestamp> folder inside it.
 OUTDIR = os.getenv("GRADING_OUTDIR", "grading_out")

 # Model names, each overridable through an environment variable of the same name.
 OPENAI_MODEL = os.getenv("OPENAI_MODEL", "gpt-5")
 # NOTE(review): Anthropic API model ids are usually hyphenated (e.g. "claude-sonnet-4-5");
 # confirm this dotted default resolves, or set ANTHROPIC_MODEL explicitly.
 ANTHROPIC_MODEL = os.getenv("ANTHROPIC_MODEL", "claude-sonnet-4.5")
 GEMINI_MODEL = os.getenv("GEMINI_MODEL", "gemini-2.5-pro")

 # Seconds passed to time.sleep() after each model call.
 SLEEP_BETWEEN_CALLS = 0.15  # gentle pacing


 # -------------------------------
 # Utilities
 # -------------------------------
 def sanitize_filename(s: str, max_len: int = 120) -> str:
    """
    Turn arbitrary text into a conservative file-name fragment.

    Spaces become underscores, every run of characters outside
    ``A-Za-z0-9._-`` becomes a single dash, repeated dashes collapse, and
    leading/trailing ``-``, ``.`` and ``_`` are removed. The result is cut to
    at most ``max_len`` characters. A falsy input yields an empty string.
    """
    underscored = (s or "").strip().replace(" ", "_")
    dashed = re.sub(r"[^A-Za-z0-9._-]+", "-", underscored)
    collapsed = re.sub(r"-{2,}", "-", dashed)
    trimmed = collapsed.strip("-._")
    if len(trimmed) > max_len:
        return trimmed[:max_len]
    return trimmed

 def status(msg: str) -> None:
    """Print a progress line and flush immediately so it shows up in real time."""
    print(msg, flush=True)


 def parse_date_ymd(s: str) -> datetime:
    """Parse a ``YYYY-MM-DD`` string into a naive datetime at midnight.

    Raises ValueError (from ``datetime.strptime``) when the text does not match.
    """
    ymd_format = "%Y-%m-%d"
    return datetime.strptime(s, ymd_format)


 def ensure_dir(p: str) -> None:
    """Create directory ``p`` (including parents); do nothing if it already exists."""
    os.makedirs(name=p, exist_ok=True)


 def read_text(path: str) -> str:
    """Return the full contents of the UTF-8 text file at ``path``."""
    with open(path, mode="r", encoding="utf-8") as handle:
        contents = handle.read()
    return contents


 def write_text(path: str, text: str) -> None:
    """Write ``text`` to ``path`` as UTF-8, replacing any existing file."""
    with open(path, mode="w", encoding="utf-8") as handle:
        handle.write(text)


 def write_json(path: str, obj: Any) -> None:
    """Serialize ``obj`` to ``path`` as UTF-8 JSON, 2-space indented, non-ASCII kept as-is."""
    with open(path, mode="w", encoding="utf-8") as handle:
        json.dump(obj, handle, indent=2, ensure_ascii=False)


 def extract_json_block(txt: str) -> str:
    """
    Extract the most likely JSON object from model output.

    A leading/trailing markdown fence is stripped, then the text is scanned for
    top-level brace-balanced ``{...}`` spans (braces inside double-quoted
    strings are ignored). The first span that parses as JSON is returned, so a
    stray ``{...}`` in surrounding prose is skipped. When no span parses, the
    first balanced span is returned unchanged so the caller can try a repair.

    Raises:
        ValueError: if the text contains no '{', or the first object is never
            closed.
    """
    txt = (txt or "").strip()
    txt = re.sub(r"^```(?:json)?\s*", "", txt)
    txt = re.sub(r"\s*```$", "", txt)

    def balanced_end(start: int) -> int:
        """Return the index just past the '}' matching the '{' at start, or -1."""
        depth = 0
        in_str = False
        esc = False
        for i in range(start, len(txt)):
            ch = txt[i]
            if in_str:
                # Inside a string only escapes and the closing quote matter.
                if esc:
                    esc = False
                elif ch == "\\":
                    esc = True
                elif ch == '"':
                    in_str = False
            elif ch == '"':
                in_str = True
            elif ch == "{":
                depth += 1
            elif ch == "}":
                depth -= 1
                if depth == 0:
                    return i + 1
        return -1

    first_start = txt.find("{")
    if first_start == -1:
        raise ValueError("No '{' found in output")

    first_end = balanced_end(first_start)
    if first_end == -1:
        raise ValueError("Unclosed JSON object in output")

    # Walk successive top-level spans; never descend into a span, otherwise a
    # valid nested object would be picked out of a broken outer one.
    start, end = first_start, first_end
    while True:
        candidate = txt[start:end]
        try:
            json.loads(candidate)
        except ValueError:
            pass
        else:
            return candidate
        start = txt.find("{", end)
        if start == -1:
            break
        end = balanced_end(start)
        if end == -1:
            break

    # Nothing parsed: hand back the first span so a repair pass can fix it.
    return txt[first_start:first_end]


 def repair_json_with_model(bad_text: str) -> Dict[str, Any]:
    """
    Ask the configured OpenAI model to rewrite malformed JSON text as valid JSON.

    The model is told to keep keys and meaning and only fix syntax, so this is
    a formatting guardrail rather than a new grade. The reply goes through
    extract_json_block() and json.loads(); errors from either propagate.
    """
    instruction_lines = (
        "You will be given text that should be a JSON object but is invalid.",
        "Return ONLY a valid JSON object. Do not change keys or meaning.",
        "If there are unescaped quotes or trailing commas, fix them.",
    )
    prompt_text = "\n".join(instruction_lines) + "\n\n" + f"TEXT:\n{bad_text}"

    api = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
    response = api.responses.create(model=OPENAI_MODEL, input=prompt_text)

    repaired = extract_json_block(response.output_text.strip())
    return json.loads(repaired)


 def strict_json_from_text(txt: str) -> Dict[str, Any]:
    """
    Parse the JSON object embedded in ``txt``, with a single repair attempt.

    The object is located with extract_json_block(). If json.loads() rejects
    it, the extracted text is sent once to repair_json_with_model(); any error
    from that call propagates to the caller.
    """
    candidate = extract_json_block(txt)
    try:
        parsed = json.loads(candidate)
    except json.JSONDecodeError:
        # Exactly one repair pass; a second failure is not caught here.
        parsed = repair_json_with_model(candidate)
    return parsed


 # -------------------------------
 # Consolidate transcripts
 # -------------------------------
 @dataclass
 class StudentBundle:
    """All attempts of one (netid, projectid) pair, as built by load_and_bundle()."""

    netid: str  # group key, stringified
    student: str  # student column of the group's first row
    projectid: str  # group key, stringified
    attempts_count: int  # number of index rows in the group
    date_first: str  # min of date_ymd over the group
    date_last: str  # max of date_ymd over the group
    duration_total: int  # sum of call_duration_secs
    messages_total: int  # sum of message_count
    transcript_concat: str  # attempt banners + transcript texts, in attempt order


 def load_and_bundle() -> List[StudentBundle]:
    """
    Read TRANSCRIPTS_DIR/index.csv and build one StudentBundle per (netid, projectid).

    Rows dated before CUTOFF_DATE and rows whose netid equals IGNORE_NETID
    (case-insensitive) are dropped. Inside each group the attempts are ordered
    by (attempt, date_ymd) and their transcript files are concatenated, each
    preceded by an "ATTEMPT" banner line. An index with no remaining rows
    yields an empty list.

    Raises:
        ValueError: if index.csv lacks a required column, or a date_ymd value
            is not in YYYY-MM-DD form.
    """
    index_path = os.path.join(TRANSCRIPTS_DIR, "index.csv")
    df = pd.read_csv(index_path)

    # Expect at least these columns from your exporter
    # student, netid, projectid, attempt, date_ymd, call_duration_secs, message_count, transcript_file
    needed = {"student", "netid", "projectid", "attempt", "date_ymd", "call_duration_secs", "message_count", "transcript_file"}
    missing = needed - set(df.columns)
    if missing:
        raise ValueError(f"index.csv missing columns: {sorted(missing)}")

    # (c) filter date
    cutoff_dt = parse_date_ymd(CUTOFF_DATE)
    on_or_after = df["date_ymd"].apply(lambda s: parse_date_ymd(str(s)) >= cutoff_dt)
    # With zero rows .apply() keeps the column's (object) dtype; cast so the
    # Series is always used as a boolean row mask and never as a column list.
    df = df[on_or_after.astype(bool)]

    # (d) ignore netid
    df = df[df["netid"].astype(str).str.lower() != IGNORE_NETID.lower()]

    # stable ordering
    df = df.sort_values(["netid", "projectid", "attempt", "date_ymd"])

    bundles: List[StudentBundle] = []

    # dropna=False keeps groups whose key is missing; str() below turns such keys into text.
    for (netid, projectid), g in df.groupby(["netid", "projectid"], dropna=False):
        g = g.sort_values(["attempt", "date_ymd"])
        student = str(g.iloc[0]["student"])
        netid = str(netid)
        projectid = str(projectid)

        parts = []
        for _, row in g.iterrows():
            attempt = int(row["attempt"])
            date_ymd = str(row["date_ymd"])
            tf = str(row["transcript_file"])
            tpath = os.path.join(TRANSCRIPTS_DIR, tf)
            parts.append(f"\n\n===== ATTEMPT {attempt:02d} | {date_ymd} | {tf} =====\n\n")
            parts.append(read_text(tpath))

        bundles.append(
            StudentBundle(
                netid=netid,
                student=student,
                projectid=projectid,
                attempts_count=len(g),
                date_first=str(g["date_ymd"].min()),
                date_last=str(g["date_ymd"].max()),
                duration_total=int(g["call_duration_secs"].sum()),
                messages_total=int(g["message_count"].sum()),
                transcript_concat="".join(parts).strip() + "\n",
            )
        )

    return bundles


 # -------------------------------
 # Prompting + schemas
 # -------------------------------
 # JSON-Schema-style description of one grader's output. It is serialized with
 # json.dumps() into the scoring, revision and chair prompts; nothing in this file
 # validates model output against it.
 GRADE_SCHEMA = {
    "type": "object",
    "properties": {
        # One integer 0-4 per rubric dimension.
        "scores": {
            "type": "object",
            "properties": {d: {"type": "integer", "minimum": 0, "maximum": 4} for d in RUBRIC_DIMENSIONS},
            "required": RUBRIC_DIMENSIONS,
            "additionalProperties": False,
        },
        "overall": {"type": "integer", "minimum": 0, "maximum": 20},
        "letter": {"type": "string"},
        "confidence": {"type": "integer", "minimum": 1, "maximum": 5},
        # Transcript quotes backing the scores.
        "evidence": {
            "type": "array",
            "items": {
                "type": "object",
                "properties": {
                    "dimension": {"type": "string"},
                    "quote": {"type": "string"},
                    "comment": {"type": "string"},
                },
                "required": ["dimension", "quote", "comment"],
                "additionalProperties": False,
            },
        },
        "strengths": {"type": "array", "items": {"type": "string"}},
        "improvements": {"type": "array", "items": {"type": "string"}},
        "notes_for_chair": {"type": "string"},
    },
    "required": ["scores", "overall", "letter", "confidence", "evidence", "strengths", "improvements", "notes_for_chair"],
    "additionalProperties": False,
 }

 # Shape of the student-facing feedback object requested from the chair. main()
 # reads rubric_breakdown, top_strengths, top_actions and closing from it.
 STUDENT_FEEDBACK_SCHEMA = {
    "type": "object",
    "properties": {
        # One prose string per rubric dimension.
        "rubric_breakdown": {
            "type": "object",
            "properties": {d: {"type": "string"} for d in RUBRIC_DIMENSIONS},
            "required": RUBRIC_DIMENSIONS,
            "additionalProperties": False,
        },
        "top_strengths": {"type": "array", "items": {"type": "string"}},
        "top_actions": {"type": "array", "items": {"type": "string"}},
        "closing": {"type": "string"},
    },
    "required": ["rubric_breakdown", "top_strengths", "top_actions", "closing"],
    "additionalProperties": False,
 }


 def scoring_prompt(student: str, netid: str, projectid: str, transcript: str) -> str:
    """Build the round-1 grading prompt.

    The text is RUBRIC_TEXT, the student identifiers, the transcript, and
    finally GRADE_SCHEMA serialized with json.dumps().
    """
    return f"""{RUBRIC_TEXT}

 Student: {student}
 NetID: {netid}
 Project ID: {projectid}

 Transcript (may include multiple attempts, in order):
 {transcript}

 Return ONLY valid JSON matching this schema:
 {json.dumps(GRADE_SCHEMA)}
 """


 def revise_prompt(original_json: Dict[str, Any], peer_jsons: List[Dict[str, Any]]) -> str:
    """Build the round-2 prompt asking a grader to reconsider its own grade.

    The prompt embeds the grader's earlier result and the peers' results as
    JSON. The transcript is not included. The wording says "Two other
    graders" regardless of how many entries ``peer_jsons`` holds.
    """
    return f"""{RUBRIC_TEXT}

 You previously graded the exam as:
 {json.dumps(original_json, ensure_ascii=False)}

 Two other graders produced:
 {json.dumps(peer_jsons, ensure_ascii=False)}

 Revise your grade if warranted. If you disagree, explain why in notes_for_chair.
 Return ONLY valid JSON matching:
 {json.dumps(GRADE_SCHEMA)}
 """


 def chair_prompt(student: str, netid: str, projectid: str, transcript: str,
                 revised: Dict[str, Dict[str, Any]]) -> str:
    """Build the chair prompt that produces the final grade and feedback.

    ``revised`` maps a grader label to that grader's revised review. The
    requested output wraps GRADE_SCHEMA and STUDENT_FEEDBACK_SCHEMA under
    "final_grade" and "student_feedback", plus "chair_internal_summary".
    """
    return f"""{RUBRIC_TEXT}

 You are the chair. You must produce:
 1) Final rubric scores (0-4 each) and overall (0-20), plus letter grade.
 2) A student-facing feedback object (no numeric grades shown in the prose).
 3) A short internal summary justifying any overrides.

 Student: {student}
 NetID: {netid}
 Project ID: {projectid}

 Transcript:
 {transcript}

 Revised reviews:
 {json.dumps(revised, ensure_ascii=False)}

 Output JSON with this schema ONLY:
 {json.dumps({
  "type": "object",
  "properties": {
    "final_grade": GRADE_SCHEMA,
    "student_feedback": STUDENT_FEEDBACK_SCHEMA,
    "chair_internal_summary": {"type": "string"}
  },
  "required": ["final_grade","student_feedback","chair_internal_summary"],
  "additionalProperties": False
 })}

 IMPORTANT: Output only valid JSON. No markdown. No commentary. No trailing commas.
 """


 # -------------------------------
 # Model calls
 # -------------------------------
 def call_openai(prompt: str) -> Dict[str, Any]:
    """Send ``prompt`` to OPENAI_MODEL via the Responses API and parse the reply as JSON.

    A new client is created per call from the OPENAI_API_KEY environment variable.
    """
    api_key = os.getenv("OPENAI_API_KEY")
    response = OpenAI(api_key=api_key).responses.create(
        model=OPENAI_MODEL,
        input=prompt,
    )
    # output_text is the SDK's convenience view of the returned text.
    return strict_json_from_text(response.output_text)


 def call_anthropic(prompt: str) -> Dict[str, Any]:
    """Send ``prompt`` to ANTHROPIC_MODEL as one user message and parse the reply as JSON.

    The request uses temperature 0.0 and a 3000-token output cap. Only content
    blocks whose ``type`` is "text" contribute to the parsed text.
    """
    api = Anthropic(api_key=os.getenv("ANTHROPIC_API_KEY"))
    reply = api.messages.create(
        model=ANTHROPIC_MODEL,
        max_tokens=3000,
        temperature=0.0,
        messages=[{"role": "user", "content": prompt}],
    )
    text_chunks = []
    for block in reply.content:
        if getattr(block, "type", "") == "text":
            text_chunks.append(block.text)
    return strict_json_from_text("".join(text_chunks))


 def call_gemini(prompt: str) -> Dict[str, Any]:
    """Send ``prompt`` to GEMINI_MODEL and parse the reply as JSON.

    A missing or empty response text is treated as an empty string, which makes
    the JSON extraction raise ValueError.
    """
    api = genai.Client(api_key=os.getenv("GEMINI_API_KEY"))
    response = api.models.generate_content(model=GEMINI_MODEL, contents=prompt)
    raw = response.text
    if not raw:
        raw = ""
    return strict_json_from_text(raw)


 # -------------------------------
 # Main council pipeline
 # -------------------------------
 def grade_one(bundle: StudentBundle, run_dir: str) -> Tuple[Dict[str, Any], Dict[str, Any]]:
    """
    Run the three-grader council for one student bundle.

    Round 1 asks OpenAI, Anthropic and Gemini to grade the transcript
    independently. Round 2 shows each grader the other two round-1 results and
    asks for a revision. Finally Anthropic acts as chair over the revised
    reviews.

    Each model response is written to <run_dir>/audit/<netid>__<projectid>/ as
    soon as it arrives, so a failure in a later call still leaves the earlier
    responses on disk for audit.

    Returns: (final_grade_json, student_feedback_json) from chair output.

    Raises:
        KeyError: if the chair output lacks "final_grade" or "student_feedback".
        Any exception raised by a model call or by JSON parsing propagates.
    """
    sid = f"{sanitize_filename(bundle.netid)}__{sanitize_filename(bundle.projectid)}"
    audit_dir = os.path.join(run_dir, "audit", sid)
    ensure_dir(audit_dir)

    def save(filename: str, payload: Any) -> None:
        """Persist one audit artifact right away."""
        write_json(os.path.join(audit_dir, filename), payload)

    # Round 1
    p1 = scoring_prompt(bundle.student, bundle.netid, bundle.projectid, bundle.transcript_concat)

    status("  → Round 1: ChatGPT")
    r1_openai = call_openai(p1)
    save("r1_openai.json", r1_openai)
    status("  ✓ Round 1: ChatGPT done")
    time.sleep(SLEEP_BETWEEN_CALLS)

    status("  → Round 1: Claude")
    r1_anth = call_anthropic(p1)
    save("r1_claude.json", r1_anth)
    status("  ✓ Round 1: Claude done")
    time.sleep(SLEEP_BETWEEN_CALLS)

    status("  → Round 1: Gemini")
    r1_gem = call_gemini(p1)
    save("r1_gemini.json", r1_gem)
    status("  ✓ Round 1: Gemini done")
    time.sleep(SLEEP_BETWEEN_CALLS)

    # Round 2 (share peers; each revises)
    status("  → Round 2 revisions")
    r2_openai = call_openai(revise_prompt(r1_openai, [r1_anth, r1_gem]))
    save("r2_openai.json", r2_openai)
    status("  ✓ Round 2: ChatGPT revised")
    time.sleep(SLEEP_BETWEEN_CALLS)

    r2_anth = call_anthropic(revise_prompt(r1_anth, [r1_openai, r1_gem]))
    save("r2_claude.json", r2_anth)
    status("  ✓ Round 2: Claude revised")
    time.sleep(SLEEP_BETWEEN_CALLS)

    r2_gem = call_gemini(revise_prompt(r1_gem, [r1_openai, r1_anth]))
    save("r2_gemini.json", r2_gem)
    status("  ✓ Round 2: Gemini revised")
    time.sleep(SLEEP_BETWEEN_CALLS)

    revised = {"chatgpt": r2_openai, "claude": r2_anth, "gemini": r2_gem}

    # Chair (Claude)
    status("  → Chair summary (Claude)")
    chair_out = call_anthropic(chair_prompt(bundle.student, bundle.netid, bundle.projectid, bundle.transcript_concat, revised))
    # Saved before the key lookups below so a malformed chair reply is still auditable.
    save("chair.json", chair_out)
    status("  ✓ Chair summary done")

    return chair_out["final_grade"], chair_out["student_feedback"]


 def main() -> None:
    """
    Grade every bundle and write the run outputs.

    Creates <OUTDIR>/run_<timestamp>/ with "feedback" and "audit" subfolders,
    calls grade_one() for each bundle, writes one student-facing feedback text
    file per bundle, and finally writes grades.csv sorted by (netid, projectid).
    A run with no bundles still produces a grades.csv containing only headers.
    """
    ensure_dir(OUTDIR)
    run_dir = os.path.join(OUTDIR, datetime.now().strftime("run_%Y%m%d_%H%M%S"))
    ensure_dir(run_dir)
    ensure_dir(os.path.join(run_dir, "feedback"))
    ensure_dir(os.path.join(run_dir, "audit"))

    bundles = load_and_bundle()

    # Explicit column order: identical to the row dicts below, and it lets an
    # empty run be sorted and written without a KeyError.
    csv_columns = [
        "netid",
        "student",
        "projectid",
        "attempts_count",
        "date_first",
        "date_last",
        "duration_total_secs",
        "messages_total",
        *[f"score_{d}" for d in RUBRIC_DIMENSIONS],
        "overall_0_20",
        "letter",
        "confidence_1_5",
        "feedback_file",
    ]

    rows = []
    used_feedback_names = set()
    for i, b in enumerate(tqdm(bundles, desc="Grading students", unit="student"), start=1):
        status(f"[{i}/{len(bundles)}] netid={b.netid} student={b.student} project={b.projectid} attempts={b.attempts_count}")
        final_grade, student_feedback = grade_one(b, run_dir)

        # Write student feedback file
        fname = f"{sanitize_filename(b.netid)}.txt"
        if fname in used_feedback_names:
            # Bundles are per (netid, projectid): a further project of the same
            # student gets its own file instead of overwriting the first one.
            fname = f"{sanitize_filename(b.netid)}__{sanitize_filename(b.projectid)}.txt"
        used_feedback_names.add(fname)
        fpath = os.path.join(run_dir, "feedback", fname)

        # Student-facing prose (no numeric grade in prose)
        fb = student_feedback
        text = []
        text.append(f"Feedback for {b.student} ({b.netid}) | Project {b.projectid}\n")
        text.append("Rubric breakdown:\n")
        for d in RUBRIC_DIMENSIONS:
            text.append(f"- {d.replace('_',' ').title()}: {fb['rubric_breakdown'][d]}")
        text.append("\nTop strengths:")
        for s in fb["top_strengths"]:
            text.append(f"- {s}")
        text.append("\nTop actions for improvement:")
        for a in fb["top_actions"]:
            text.append(f"- {a}")
        text.append(f"\n{fb['closing']}\n")
        write_text(fpath, "\n".join(text))

        # CSV row
        scores = final_grade["scores"]
        rows.append({
            "netid": b.netid,
            "student": b.student,
            "projectid": b.projectid,
            "attempts_count": b.attempts_count,
            "date_first": b.date_first,
            "date_last": b.date_last,
            "duration_total_secs": b.duration_total,
            "messages_total": b.messages_total,
            **{f"score_{d}": int(scores[d]) for d in RUBRIC_DIMENSIONS},
            "overall_0_20": int(final_grade["overall"]),
            "letter": final_grade["letter"],
            "confidence_1_5": int(final_grade["confidence"]),
            "feedback_file": os.path.relpath(fpath, start=run_dir),
        })

    grades_path = os.path.join(run_dir, "grades.csv")
    grades = pd.DataFrame(rows, columns=csv_columns)
    grades.sort_values(["netid", "projectid"]).to_csv(grades_path, index=False)
    print(f"Wrote: {grades_path}")
    print(f"Feedback dir: {os.path.join(run_dir, 'feedback')}")

 # Run the grading pipeline only when executed as a script, not on import.
 if __name__ == "__main__":
    main()
	import json
	import os
	import re
	import time
	from tqdm import tqdm
	from dataclasses import dataclass
	from datetime import datetime
	from typing import Any, Dict, List, Optional, Tuple

	import pandas as pd
	from dotenv import load_dotenv

	load_dotenv()

	# --- Providers (current SDKs) ---
	from openai import OpenAI
	from anthropic import Anthropic
	from google import genai

	# -------------------------------
	# Config
	# -------------------------------
	# Machine-readable keys for the five rubric dimensions. This list supplies the
	# per-dimension properties of both JSON schemas defined later in the file and the
	# score_<dimension> columns written to grades.csv.
	RUBRIC_DIMENSIONS = [
	"problem_framing",
	"metrics_economics",
	"risk_ethics",
	"experimentation",
	"communication",
	]

	# Grading instructions shared by every prompt: the scoring, revision and chair
	# prompt builders each start their text with this block.
	RUBRIC_TEXT = """
	You are grading an oral exam for AI/ML Product Management. The students are undergraduate students at NYU Stern. For most, this was their first technical product course.

	CRITICAL CONTEXT ON EXAM CONDITIONS:
	The AI proctor for this exam had significant design flaws that negatively impacted student performance. Specifically:
	1. Stacked Questions: The agent often asked 3-4 distinct questions in a single turn.
	2. Moving Targets: When students asked for clarification, the agent often changed the question entirely rather than repeating it.
	3. Audio-Only Menus: The agent read long lists of complex options verbally, causing cognitive overload.

	Because of this, you must apply the following "Interference Protocols" when grading:
	- The "Pick One" Rule: If the agent asked multiple questions at once and the student only answered one or two, grade them ONLY on what they answered. Do not penalize for missing parts of a compound question.
	- The "Benefit of Doubt" Rule: If the agent rephrased a question during clarification, credit the student for answering any version of the question presented in that sequence.
	- Ignore "Stalling": Disregard phrases like "Can you repeat that?" or hesitation. These are valid coping strategies for a poor interface, not signs of ignorance.
	- Jargon Leniency: Focus on conceptual understanding over perfect industry terminology (e.g., if they describe "churn" correctly but call it "usage drop," accept it).

	Grade on these five dimensions (0-4 each; 0=missing, 4=excellent), using evidence from the transcript:

	1) Problem framing: Translating business problems into ML specs. (Did they understand the core user problem?)
	2) Metrics & economics: Trade-offs, costs, and counter-metrics. (Focus on their logic regarding trade-offs, even if they struggled to pick a specific metric from a verbal list).
	3) Risk & ethics: FAT-P, security risks, failure modes, governance. (Did they identify the harm, even if they needed the options repeated?)
	4) Experimentation: A/B testing, hypotheses, validation, controls.
	5) Communication: Concise, structured, and handles pushback. CRITICAL: Do not penalize the student for confusion caused by the agent's shifting questions. Grade their ability to synthesize the information they did hear.

	Return JSON that matches the requested schema exactly.
	"""

	# (c) cutoff: do not grade anything before Dec 11, 2025 (NY local date already in your export)
	# Compared against each row's date_ymd with >=, so the cutoff day itself is graded.
	CUTOFF_DATE = "2025-12-11" # inclusive
	# (d) ignore this student
	# Matched case-insensitively against the netid column.
	IGNORE_NETID = "kr888"

	# Directory that holds index.csv and the transcript files it references.
	TRANSCRIPTS_DIR = os.getenv("TRANSCRIPTS_DIR", "elevenlabs_transcripts")
	# Parent directory; each run creates its own run_<timestamp> folder inside it.
	OUTDIR = os.getenv("GRADING_OUTDIR", "grading_out")

	# Model names, each overridable through an environment variable of the same name.
	OPENAI_MODEL = os.getenv("OPENAI_MODEL", "gpt-5")
	# NOTE(review): Anthropic API model ids are usually hyphenated (e.g. "claude-sonnet-4-5");
	# confirm this dotted default resolves, or set ANTHROPIC_MODEL explicitly.
	ANTHROPIC_MODEL = os.getenv("ANTHROPIC_MODEL", "claude-sonnet-4.5")
	GEMINI_MODEL = os.getenv("GEMINI_MODEL", "gemini-2.5-pro")

	# Seconds passed to time.sleep() after each model call.
	SLEEP_BETWEEN_CALLS = 0.15 # gentle pacing


	# -------------------------------
	# Utilities
	# -------------------------------
	def sanitize_filename(s: str, max_len: int = 120) -> str:
	    """
	    Turn arbitrary text into a conservative file-name fragment.

	    Spaces become underscores, every run of characters outside
	    ``A-Za-z0-9._-`` becomes a single dash, repeated dashes collapse, and
	    leading/trailing ``-``, ``.`` and ``_`` are removed. The result is cut to
	    at most ``max_len`` characters. A falsy input yields an empty string.
	    """
	    s = (s or "").strip().replace(" ", "_")
	    s = re.sub(r"[^A-Za-z0-9._-]+", "-", s)
	    s = re.sub(r"-{2,}", "-", s).strip("-._")
	    return s[:max_len] if len(s) > max_len else s

	def status(msg: str) -> None:
	    """Print a progress line and flush immediately so it shows up in real time."""
	    print(msg, flush=True)


	def parse_date_ymd(s: str) -> datetime:
	    """Parse a ``YYYY-MM-DD`` string into a naive datetime at midnight.

	    Raises ValueError (from ``datetime.strptime``) when the text does not match.
	    """
	    return datetime.strptime(s, "%Y-%m-%d")


	def ensure_dir(p: str) -> None:
	    """Create directory ``p`` (including parents); do nothing if it already exists."""
	    os.makedirs(p, exist_ok=True)


	def read_text(path: str) -> str:
	    """Return the full contents of the UTF-8 text file at ``path``."""
	    with open(path, "r", encoding="utf-8") as f:
	        return f.read()


	def write_text(path: str, text: str) -> None:
	    """Write ``text`` to ``path`` as UTF-8, replacing any existing file."""
	    with open(path, "w", encoding="utf-8") as f:
	        f.write(text)


	def write_json(path: str, obj: Any) -> None:
	    """Serialize ``obj`` to ``path`` as UTF-8 JSON, 2-space indented, non-ASCII kept as-is."""
	    with open(path, "w", encoding="utf-8") as f:
	        json.dump(obj, f, ensure_ascii=False, indent=2)


	def extract_json_block(txt: str) -> str:
	    """
	    Extract the most likely JSON object from model output.

	    A leading/trailing markdown fence is stripped, then the text is scanned for
	    top-level brace-balanced ``{...}`` spans (braces inside double-quoted
	    strings are ignored). The first span that parses as JSON is returned, so a
	    stray ``{...}`` in surrounding prose is skipped. When no span parses, the
	    first balanced span is returned unchanged so the caller can try a repair.

	    Raises:
	        ValueError: if the text contains no '{', or the first object is never
	            closed.
	    """
	    txt = (txt or "").strip()
	    txt = re.sub(r"^```(?:json)?\s*", "", txt)
	    txt = re.sub(r"\s*```$", "", txt)

	    def balanced_end(start: int) -> int:
	        """Return the index just past the '}' matching the '{' at start, or -1."""
	        depth = 0
	        in_str = False
	        esc = False
	        for i in range(start, len(txt)):
	            ch = txt[i]
	            if in_str:
	                # Inside a string only escapes and the closing quote matter.
	                if esc:
	                    esc = False
	                elif ch == "\\":
	                    esc = True
	                elif ch == '"':
	                    in_str = False
	            elif ch == '"':
	                in_str = True
	            elif ch == "{":
	                depth += 1
	            elif ch == "}":
	                depth -= 1
	                if depth == 0:
	                    return i + 1
	        return -1

	    first_start = txt.find("{")
	    if first_start == -1:
	        raise ValueError("No '{' found in output")

	    first_end = balanced_end(first_start)
	    if first_end == -1:
	        raise ValueError("Unclosed JSON object in output")

	    # Walk successive top-level spans; never descend into a span, otherwise a
	    # valid nested object would be picked out of a broken outer one.
	    start, end = first_start, first_end
	    while True:
	        candidate = txt[start:end]
	        try:
	            json.loads(candidate)
	        except ValueError:
	            pass
	        else:
	            return candidate
	        start = txt.find("{", end)
	        if start == -1:
	            break
	        end = balanced_end(start)
	        if end == -1:
	            break

	    # Nothing parsed: hand back the first span so a repair pass can fix it.
	    return txt[first_start:first_end]


	def repair_json_with_model(bad_text: str) -> Dict[str, Any]:
	    """
	    Ask the configured OpenAI model to rewrite malformed JSON text as valid JSON.

	    The model is told to keep keys and meaning and only fix syntax, so this is
	    a formatting guardrail rather than a new grade. The reply goes through
	    extract_json_block() and json.loads(); errors from either propagate.
	    """
	    client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
	    repair_prompt = (
	        "You will be given text that should be a JSON object but is invalid.\n"
	        "Return ONLY a valid JSON object. Do not change keys or meaning.\n"
	        "If there are unescaped quotes or trailing commas, fix them.\n\n"
	        f"TEXT:\n{bad_text}"
	    )
	    resp = client.responses.create(
	        model=OPENAI_MODEL,
	        input=repair_prompt,
	    )
	    fixed = resp.output_text.strip()
	    json_str = extract_json_block(fixed)
	    return json.loads(json_str)


	def strict_json_from_text(txt: str) -> Dict[str, Any]:
	    """
	    Parse the JSON object embedded in ``txt``, with a single repair attempt.

	    The object is located with extract_json_block(). If json.loads() rejects
	    it, the extracted text is sent once to repair_json_with_model(); any error
	    from that call propagates to the caller.
	    """
	    json_str = extract_json_block(txt)
	    try:
	        return json.loads(json_str)
	    except json.JSONDecodeError:
	        # Exactly one repair pass; a second failure is not caught here.
	        return repair_json_with_model(json_str)


	# -------------------------------
	# Consolidate transcripts
	# -------------------------------
	@dataclass
	class StudentBundle:
	    """All attempts of one (netid, projectid) pair, as built by load_and_bundle()."""

	    netid: str  # group key, stringified
	    student: str  # student column of the group's first row
	    projectid: str  # group key, stringified
	    attempts_count: int  # number of index rows in the group
	    date_first: str  # min of date_ymd over the group
	    date_last: str  # max of date_ymd over the group
	    duration_total: int  # sum of call_duration_secs
	    messages_total: int  # sum of message_count
	    transcript_concat: str  # attempt banners + transcript texts, in attempt order


	def load_and_bundle() -> List[StudentBundle]:
	    """
	    Read TRANSCRIPTS_DIR/index.csv and build one StudentBundle per (netid, projectid).

	    Rows dated before CUTOFF_DATE and rows whose netid equals IGNORE_NETID
	    (case-insensitive) are dropped. Inside each group the attempts are ordered
	    by (attempt, date_ymd) and their transcript files are concatenated, each
	    preceded by an "ATTEMPT" banner line. An index with no remaining rows
	    yields an empty list.

	    Raises:
	        ValueError: if index.csv lacks a required column, or a date_ymd value
	            is not in YYYY-MM-DD form.
	    """
	    index_path = os.path.join(TRANSCRIPTS_DIR, "index.csv")
	    df = pd.read_csv(index_path)

	    # Expect at least these columns from your exporter
	    # student, netid, projectid, attempt, date_ymd, call_duration_secs, message_count, transcript_file
	    needed = {"student", "netid", "projectid", "attempt", "date_ymd", "call_duration_secs", "message_count", "transcript_file"}
	    missing = needed - set(df.columns)
	    if missing:
	        raise ValueError(f"index.csv missing columns: {sorted(missing)}")

	    # (c) filter date
	    cutoff_dt = parse_date_ymd(CUTOFF_DATE)
	    on_or_after = df["date_ymd"].apply(lambda s: parse_date_ymd(str(s)) >= cutoff_dt)
	    # With zero rows .apply() keeps the column's (object) dtype; cast so the
	    # Series is always used as a boolean row mask and never as a column list.
	    df = df[on_or_after.astype(bool)]

	    # (d) ignore netid
	    df = df[df["netid"].astype(str).str.lower() != IGNORE_NETID.lower()]

	    # stable ordering
	    df = df.sort_values(["netid", "projectid", "attempt", "date_ymd"])

	    bundles: List[StudentBundle] = []

	    # dropna=False keeps groups whose key is missing; str() below turns such keys into text.
	    for (netid, projectid), g in df.groupby(["netid", "projectid"], dropna=False):
	        g = g.sort_values(["attempt", "date_ymd"])
	        student = str(g.iloc[0]["student"])
	        netid = str(netid)
	        projectid = str(projectid)

	        parts = []
	        for _, row in g.iterrows():
	            attempt = int(row["attempt"])
	            date_ymd = str(row["date_ymd"])
	            tf = str(row["transcript_file"])
	            tpath = os.path.join(TRANSCRIPTS_DIR, tf)
	            # Plain "|" separators: a backslash before them is an invalid
	            # escape and would leak into the banner text.
	            parts.append(f"\n\n===== ATTEMPT {attempt:02d} | {date_ymd} | {tf} =====\n\n")
	            parts.append(read_text(tpath))

	        bundles.append(
	            StudentBundle(
	                netid=netid,
	                student=student,
	                projectid=projectid,
	                attempts_count=len(g),
	                date_first=str(g["date_ymd"].min()),
	                date_last=str(g["date_ymd"].max()),
	                duration_total=int(g["call_duration_secs"].sum()),
	                messages_total=int(g["message_count"].sum()),
	                transcript_concat="".join(parts).strip() + "\n",
	            )
	        )

	    return bundles


	# -------------------------------
	# Prompting + schemas
	# -------------------------------
	# JSON-Schema-style description of one grader's output. It is serialized with
	# json.dumps() into the scoring, revision and chair prompts; nothing in this file
	# validates model output against it.
	GRADE_SCHEMA = {
	"type": "object",
	"properties": {
	# One integer 0-4 per rubric dimension.
	"scores": {
	"type": "object",
	"properties": {d: {"type": "integer", "minimum": 0, "maximum": 4} for d in RUBRIC_DIMENSIONS},
	"required": RUBRIC_DIMENSIONS,
	"additionalProperties": False,
	},
	"overall": {"type": "integer", "minimum": 0, "maximum": 20},
	"letter": {"type": "string"},
	"confidence": {"type": "integer", "minimum": 1, "maximum": 5},
	# Transcript quotes backing the scores.
	"evidence": {
	"type": "array",
	"items": {
	"type": "object",
	"properties": {
	"dimension": {"type": "string"},
	"quote": {"type": "string"},
	"comment": {"type": "string"},
	},
	"required": ["dimension", "quote", "comment"],
	"additionalProperties": False,
	},
	},
	"strengths": {"type": "array", "items": {"type": "string"}},
	"improvements": {"type": "array", "items": {"type": "string"}},
	"notes_for_chair": {"type": "string"},
	},
	"required": ["scores", "overall", "letter", "confidence", "evidence", "strengths", "improvements", "notes_for_chair"],
	"additionalProperties": False,
	}

	# Shape of the student-facing feedback object requested from the chair. main()
	# reads rubric_breakdown, top_strengths, top_actions and closing from it.
	STUDENT_FEEDBACK_SCHEMA = {
	"type": "object",
	"properties": {
	# One prose string per rubric dimension.
	"rubric_breakdown": {
	"type": "object",
	"properties": {d: {"type": "string"} for d in RUBRIC_DIMENSIONS},
	"required": RUBRIC_DIMENSIONS,
	"additionalProperties": False,
	},
	"top_strengths": {"type": "array", "items": {"type": "string"}},
	"top_actions": {"type": "array", "items": {"type": "string"}},
	"closing": {"type": "string"},
	},
	"required": ["rubric_breakdown", "top_strengths", "top_actions", "closing"],
	"additionalProperties": False,
	}


	def scoring_prompt(student: str, netid: str, projectid: str, transcript: str) -> str:
	    """Build the round-1 grading prompt.

	    The text is RUBRIC_TEXT, the student identifiers, the transcript, and
	    finally GRADE_SCHEMA serialized with json.dumps().
	    """
	    return f"""{RUBRIC_TEXT}

	Student: {student}
	NetID: {netid}
	Project ID: {projectid}

	Transcript (may include multiple attempts, in order):
	{transcript}

	Return ONLY valid JSON matching this schema:
	{json.dumps(GRADE_SCHEMA)}
	"""


	def revise_prompt(original_json: Dict[str, Any], peer_jsons: List[Dict[str, Any]]) -> str:
	    """Build the round-2 prompt asking a grader to reconsider its own grade.

	    The prompt embeds the grader's earlier result and the peers' results as
	    JSON. The transcript is not included. The wording says "Two other
	    graders" regardless of how many entries ``peer_jsons`` holds.
	    """
	    return f"""{RUBRIC_TEXT}

	You previously graded the exam as:
	{json.dumps(original_json, ensure_ascii=False)}

	Two other graders produced:
	{json.dumps(peer_jsons, ensure_ascii=False)}

	Revise your grade if warranted. If you disagree, explain why in notes_for_chair.
	Return ONLY valid JSON matching:
	{json.dumps(GRADE_SCHEMA)}
	"""


	def chair_prompt(student: str, netid: str, projectid: str, transcript: str,
	                 revised: Dict[str, Dict[str, Any]]) -> str:
	    """Build the chair prompt that produces the final grade and feedback.

	    ``revised`` maps a grader label to that grader's revised review. The
	    requested output wraps GRADE_SCHEMA and STUDENT_FEEDBACK_SCHEMA under
	    "final_grade" and "student_feedback", plus "chair_internal_summary".
	    """
	    return f"""{RUBRIC_TEXT}

	You are the chair. You must produce:
	1) Final rubric scores (0-4 each) and overall (0-20), plus letter grade.
	2) A student-facing feedback object (no numeric grades shown in the prose).
	3) A short internal summary justifying any overrides.

	Student: {student}
	NetID: {netid}
	Project ID: {projectid}

	Transcript:
	{transcript}

	Revised reviews:
	{json.dumps(revised, ensure_ascii=False)}

	Output JSON with this schema ONLY:
	{json.dumps({
	"type": "object",
	"properties": {
	"final_grade": GRADE_SCHEMA,
	"student_feedback": STUDENT_FEEDBACK_SCHEMA,
	"chair_internal_summary": {"type": "string"}
	},
	"required": ["final_grade","student_feedback","chair_internal_summary"],
	"additionalProperties": False
	})}

	IMPORTANT: Output only valid JSON. No markdown. No commentary. No trailing commas.
	"""


	# -------------------------------
	# Model calls
	# -------------------------------
	def call_openai(prompt: str) -> Dict[str, Any]:
	    """Send ``prompt`` to OPENAI_MODEL via the Responses API and parse the reply as JSON.

	    A new client is created per call from the OPENAI_API_KEY environment variable.
	    """
	    client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
	    resp = client.responses.create(
	        model=OPENAI_MODEL,
	        input=prompt,
	    )
	    # SDK returns output text blocks; simplest: use .output_text
	    txt = resp.output_text
	    return strict_json_from_text(txt)


	def call_anthropic(prompt: str) -> Dict[str, Any]:
	    """Send ``prompt`` to ANTHROPIC_MODEL as one user message and parse the reply as JSON.

	    The request uses temperature 0.0 and a 3000-token output cap. Only content
	    blocks whose ``type`` is "text" contribute to the parsed text.
	    """
	    client = Anthropic(api_key=os.getenv("ANTHROPIC_API_KEY"))
	    msg = client.messages.create(
	        model=ANTHROPIC_MODEL,
	        max_tokens=3000,
	        temperature=0.0,
	        messages=[{"role": "user", "content": prompt}],
	    )
	    txt = "".join([b.text for b in msg.content if getattr(b, "type", "") == "text"])
	    return strict_json_from_text(txt)


	def call_gemini(prompt: str) -> Dict[str, Any]:
	    """Send ``prompt`` to GEMINI_MODEL and parse the reply as JSON.

	    A missing or empty response text is treated as an empty string, which makes
	    the JSON extraction raise ValueError.
	    """
	    client = genai.Client(api_key=os.getenv("GEMINI_API_KEY"))
	    resp = client.models.generate_content(
	        model=GEMINI_MODEL,
	        contents=prompt,
	    )
	    txt = resp.text or ""
	    return strict_json_from_text(txt)


	# -------------------------------
	# Main council pipeline
	# -------------------------------
	def grade_one(bundle: StudentBundle, run_dir: str) -> Tuple[Dict[str, Any], Dict[str, Any]]:
	    """
	    Run the three-grader council for one student bundle.

	    Round 1 asks OpenAI, Anthropic and Gemini to grade the transcript
	    independently. Round 2 shows each grader the other two round-1 results and
	    asks for a revision. Finally Anthropic acts as chair over the revised
	    reviews.

	    Each model response is written to <run_dir>/audit/<netid>__<projectid>/ as
	    soon as it arrives, so a failure in a later call still leaves the earlier
	    responses on disk for audit.

	    Returns: (final_grade_json, student_feedback_json) from chair output.

	    Raises:
	        KeyError: if the chair output lacks "final_grade" or "student_feedback".
	        Any exception raised by a model call or by JSON parsing propagates.
	    """
	    sid = f"{sanitize_filename(bundle.netid)}__{sanitize_filename(bundle.projectid)}"
	    audit_dir = os.path.join(run_dir, "audit", sid)
	    ensure_dir(audit_dir)

	    def save(filename: str, payload: Any) -> None:
	        """Persist one audit artifact right away."""
	        write_json(os.path.join(audit_dir, filename), payload)

	    # Round 1
	    p1 = scoring_prompt(bundle.student, bundle.netid, bundle.projectid, bundle.transcript_concat)

	    status(" → Round 1: ChatGPT")
	    r1_openai = call_openai(p1)
	    save("r1_openai.json", r1_openai)
	    status(" ✓ Round 1: ChatGPT done")
	    time.sleep(SLEEP_BETWEEN_CALLS)

	    status(" → Round 1: Claude")
	    r1_anth = call_anthropic(p1)
	    save("r1_claude.json", r1_anth)
	    status(" ✓ Round 1: Claude done")
	    time.sleep(SLEEP_BETWEEN_CALLS)

	    status(" → Round 1: Gemini")
	    r1_gem = call_gemini(p1)
	    save("r1_gemini.json", r1_gem)
	    status(" ✓ Round 1: Gemini done")
	    time.sleep(SLEEP_BETWEEN_CALLS)

	    # Round 2 (share peers; each revises)
	    status(" → Round 2 revisions")
	    r2_openai = call_openai(revise_prompt(r1_openai, [r1_anth, r1_gem]))
	    save("r2_openai.json", r2_openai)
	    status(" ✓ Round 2: ChatGPT revised")
	    time.sleep(SLEEP_BETWEEN_CALLS)

	    r2_anth = call_anthropic(revise_prompt(r1_anth, [r1_openai, r1_gem]))
	    save("r2_claude.json", r2_anth)
	    status(" ✓ Round 2: Claude revised")
	    time.sleep(SLEEP_BETWEEN_CALLS)

	    r2_gem = call_gemini(revise_prompt(r1_gem, [r1_openai, r1_anth]))
	    save("r2_gemini.json", r2_gem)
	    status(" ✓ Round 2: Gemini revised")
	    time.sleep(SLEEP_BETWEEN_CALLS)

	    revised = {"chatgpt": r2_openai, "claude": r2_anth, "gemini": r2_gem}

	    # Chair (Claude)
	    status(" → Chair summary (Claude)")
	    chair_out = call_anthropic(chair_prompt(bundle.student, bundle.netid, bundle.projectid, bundle.transcript_concat, revised))
	    # Saved before the key lookups below so a malformed chair reply is still auditable.
	    save("chair.json", chair_out)
	    status(" ✓ Chair summary done")

	    return chair_out["final_grade"], chair_out["student_feedback"]


	def main() -> None:
	    """
	    Grade every bundle and write the run outputs.

	    Creates <OUTDIR>/run_<timestamp>/ with "feedback" and "audit" subfolders,
	    calls grade_one() for each bundle, writes one student-facing feedback text
	    file per bundle, and finally writes grades.csv sorted by (netid, projectid).
	    A run with no bundles still produces a grades.csv containing only headers.
	    """
	    ensure_dir(OUTDIR)
	    run_dir = os.path.join(OUTDIR, datetime.now().strftime("run_%Y%m%d_%H%M%S"))
	    ensure_dir(run_dir)
	    ensure_dir(os.path.join(run_dir, "feedback"))
	    ensure_dir(os.path.join(run_dir, "audit"))

	    bundles = load_and_bundle()

	    # Explicit column order: identical to the row dicts below, and it lets an
	    # empty run be sorted and written without a KeyError.
	    csv_columns = [
	        "netid",
	        "student",
	        "projectid",
	        "attempts_count",
	        "date_first",
	        "date_last",
	        "duration_total_secs",
	        "messages_total",
	        *[f"score_{d}" for d in RUBRIC_DIMENSIONS],
	        "overall_0_20",
	        "letter",
	        "confidence_1_5",
	        "feedback_file",
	    ]

	    rows = []
	    used_feedback_names = set()
	    for i, b in enumerate(tqdm(bundles, desc="Grading students", unit="student"), start=1):
	        status(f"[{i}/{len(bundles)}] netid={b.netid} student={b.student} project={b.projectid} attempts={b.attempts_count}")
	        final_grade, student_feedback = grade_one(b, run_dir)

	        # Write student feedback file
	        fname = f"{sanitize_filename(b.netid)}.txt"
	        if fname in used_feedback_names:
	            # Bundles are per (netid, projectid): a further project of the same
	            # student gets its own file instead of overwriting the first one.
	            fname = f"{sanitize_filename(b.netid)}__{sanitize_filename(b.projectid)}.txt"
	        used_feedback_names.add(fname)
	        fpath = os.path.join(run_dir, "feedback", fname)

	        # Student-facing prose (no numeric grade in prose)
	        fb = student_feedback
	        text = []
	        # Plain "|" separator: a backslash before it is an invalid escape and
	        # would show up in the student's feedback header.
	        text.append(f"Feedback for {b.student} ({b.netid}) | Project {b.projectid}\n")
	        text.append("Rubric breakdown:\n")
	        for d in RUBRIC_DIMENSIONS:
	            text.append(f"- {d.replace('_',' ').title()}: {fb['rubric_breakdown'][d]}")
	        text.append("\nTop strengths:")
	        for s in fb["top_strengths"]:
	            text.append(f"- {s}")
	        text.append("\nTop actions for improvement:")
	        for a in fb["top_actions"]:
	            text.append(f"- {a}")
	        text.append(f"\n{fb['closing']}\n")
	        write_text(fpath, "\n".join(text))

	        # CSV row
	        scores = final_grade["scores"]
	        rows.append({
	            "netid": b.netid,
	            "student": b.student,
	            "projectid": b.projectid,
	            "attempts_count": b.attempts_count,
	            "date_first": b.date_first,
	            "date_last": b.date_last,
	            "duration_total_secs": b.duration_total,
	            "messages_total": b.messages_total,
	            **{f"score_{d}": int(scores[d]) for d in RUBRIC_DIMENSIONS},
	            "overall_0_20": int(final_grade["overall"]),
	            "letter": final_grade["letter"],
	            "confidence_1_5": int(final_grade["confidence"]),
	            "feedback_file": os.path.relpath(fpath, start=run_dir),
	        })

	    grades_path = os.path.join(run_dir, "grades.csv")
	    grades = pd.DataFrame(rows, columns=csv_columns)
	    grades.sort_values(["netid", "projectid"]).to_csv(grades_path, index=False)
	    print(f"Wrote: {grades_path}")
	    print(f"Feedback dir: {os.path.join(run_dir, 'feedback')}")


	# Run the grading pipeline only when executed as a script, not on import.
	if __name__ == "__main__":
	    main()
No results found