@wassname
Last active April 23, 2026 05:40
Guided CoT Evaluation: Hybrid Teacher-Forced + On-Policy Reasoning
"""Reusable guided-rollout primitive: think → forced-close-think → JSON choice.
One rollout, three numbers. The same primitive backs:
- calibrate()'s coherence + format + rep measurement
- probe replay at edit time
- post-keep probe regeneration
- (future) DD eval (once _measure_logratios is ported to this)
Substrate:
<user_prompt + schema_hint>
<think>
... model thinks up to max_think_tokens greedy ...
</think>
{"choice": ← model emits a schema value here
Critical: on gemma-3-4b, `</think>` is multi-token so early-stop-on-eos doesn't
fire. The model runs the full budget and often emits `</think>\\n{"choice": v}`
inline. We detect that string-wise AFTER generation and score at the natural
position (right after `{"choice": `), rather than blindly splicing a 2nd
suffix (which would score garbage).
Verbose mode dumps the raw decoded string with special tokens for format
debugging, and shows top-3 tokens at the scoring position whenever
pmass_format < 0.5 — i.e. whenever the model is not following the schema.
"""
from __future__ import annotations
import contextlib
from dataclasses import dataclass
import torch
import torch.nn.functional as F
from loguru import logger
from ssteer.core import _input_device
from ssteer.hooks import installed_svd, steer
_CLOSE_MARKER: str = "</think>"
_PREFILL: str = '\n{"choice": '
# Used only when the model never emitted `</think>` on its own.
_FORCE_SUFFIX: str = "\nI should answer now." + _CLOSE_MARKER + _PREFILL
@dataclass
class GuidedResult:
    user_prompt: str
    think_text: str        # text before </think>
    answer_text: str       # model's continuation after `{"choice": ` (3-4 tok)
    raw_full_text: str     # whole decoded trace incl. specials (verbose debug)
    pmass_format: float    # P(all choice_token_ids) at answer position
    logratio_ab: float     # log P(a_ids) - log P(b_ids); NaN if no b_ids
    rep_ratio_think: float # 4-gram distinct over think_text; NaN if <32 words
    think_tokens: int      # count of think tokens (pre-</think>)
    emitted_close: bool    # True if model emitted </think> string itself
    emitted_prefill: bool  # True if model emitted `{"choice": ` itself
_REP_MIN_TOKENS: int = 32
def _ngram_rep_ratio(text: str, n: int = 4) -> float:
    tokens = text.split()
    if len(tokens) < _REP_MIN_TOKENS:
        return float("nan")
    ngrams = [tuple(tokens[i:i + n]) for i in range(len(tokens) - n + 1)]
    return len(set(ngrams)) / len(ngrams)
_DEFAULT_SCHEMA_HINT: str = (
    "Think briefly, then answer immediately and only with: "
    '{"choice": true} or {"choice": false}.'
)
@torch.no_grad()
def guided_rollout(
    model, tok,
    user_prompt: str,
    choice_token_ids: list,
    cvec=None,
    coeff: float = 0.0,
    max_think_tokens: int = 128,
    answer_tokens: int = 4,
    schema_hint: str = _DEFAULT_SCHEMA_HINT,
    verbose: bool = False,
) -> GuidedResult:
    """Think → forced-close → JSON choice, all under one steering context.

    `choice_token_ids` is either a flat list (pmass only, no logratio) or
    `[a_variants, b_variants]` (pmass + logratio = logsumexp(a) - logsumexp(b)).

    verbose=True prints the raw decoded trace (with special tokens) and, when
    pmass_format < 0.5, the top-3 next-token candidates at the scoring
    position — so you can see WHAT the model thought it should emit.
    """
    device = _input_device(model)
    full_user = f"{user_prompt}\n\n{schema_hint}" if schema_hint else user_prompt
    messages = [{"role": "user", "content": full_user}]
    prompt = tok.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    prompt = prompt + "<think>\n"
    enc = tok(prompt, return_tensors="pt").to(device)
    prompt_len = enc.input_ids.shape[1]

    # Multi-token </think> on gemma-3 means this eos rarely fires; we handle
    # the string-level case below.
    think_end_id = tok.convert_tokens_to_ids("</think>")
    if think_end_id in (None, getattr(tok, "unk_token_id", None)):
        think_end_id = tok.eos_token_id
    pad_id = tok.pad_token_id if tok.pad_token_id is not None else tok.eos_token_id

    svd_ctx = installed_svd(model, cvec) if cvec is not None else contextlib.nullcontext()
    steer_ctx = steer(model, cvec, coeff) if cvec is not None else contextlib.nullcontext()
    with svd_ctx, steer_ctx:
        # Phase 1: think, greedy.
        phase1 = model.generate(
            **enc,
            max_new_tokens=max_think_tokens,
            do_sample=False,
            eos_token_id=think_end_id,
            pad_token_id=pad_id,
        )
        gen_ids = phase1[0, prompt_len:]
        keep = gen_ids != pad_id
        gen_ids = gen_ids[keep] if keep.any() else gen_ids[:0]
        gen_text = tok.decode(gen_ids, skip_special_tokens=True)

        # String-level split. Handles both (a) model naturally emitted </think>,
        # and (b) budget exhausted without closing.
        emitted_close = _CLOSE_MARKER in gen_text
        if emitted_close:
            think_text, after = gen_text.split(_CLOSE_MARKER, 1)
            if _PREFILL.lstrip() in after:
                # Model also wrote the prefill. Align scoring prefix to its position.
                emitted_prefill = True
                before_value = after.split(_PREFILL.lstrip(), 1)[0]
                scoring_text = prompt + think_text + _CLOSE_MARKER + before_value + _PREFILL.lstrip()
            else:
                emitted_prefill = False
                scoring_text = prompt + think_text + _CLOSE_MARKER + _PREFILL
        else:
            think_text = gen_text
            emitted_prefill = False
            scoring_text = prompt + gen_text + _FORCE_SUFFIX

        # Re-tokenize the scoring prefix — next-token at position -1 is the value.
        score_ids = tok(scoring_text, return_tensors="pt",
                        add_special_tokens=False).input_ids.to(device)

        # Phase 2a: score at scoring position.
        logits = model(score_ids).logits[0, -1].float()
        logp = F.log_softmax(logits, dim=-1)
        if (len(choice_token_ids) == 2
                and all(isinstance(x, (list, tuple)) for x in choice_token_ids)):
            a_ids, b_ids = list(choice_token_ids[0]), list(choice_token_ids[1])
        else:
            a_ids, b_ids = list(choice_token_ids), []
        all_ids = torch.tensor(a_ids + b_ids, device=device, dtype=torch.long)
        pmass_format = float(logp[all_ids].exp().sum().item())
        if a_ids and b_ids:
            a_t = torch.tensor(a_ids, device=device, dtype=torch.long)
            b_t = torch.tensor(b_ids, device=device, dtype=torch.long)
            logratio = float(torch.logsumexp(logp[a_t], dim=0).item()
                             - torch.logsumexp(logp[b_t], dim=0).item())
        else:
            logratio = float("nan")

        # Phase 2b: continue a few tokens for a readable answer_text.
        cont = model.generate(
            score_ids,
            max_new_tokens=answer_tokens,
            do_sample=False,
            pad_token_id=pad_id,
        )
        answer_ids = cont[0, score_ids.shape[1]:]
        answer_text = tok.decode(answer_ids, skip_special_tokens=True)
        raw_full_text = tok.decode(cont[0], skip_special_tokens=False)
    if verbose:
        logger.info(
            f"[guided_rollout verbose]\n"
            f"  scoring_text[-120:]: {scoring_text[-120:]!r}\n"
            f"  emitted_close={emitted_close} emitted_prefill={emitted_prefill}\n"
            f"  pmass_format={pmass_format:.3f} logratio={logratio:+.3f}\n"
            f"  answer_text={answer_text!r}\n"
            f"  === RAW (incl. specials) ===\n{raw_full_text}\n"
            f"  === END RAW ==="
        )
        if pmass_format < 0.5:
            top_logp, top_idx = torch.topk(logp, 3)
            tops = [(tok.decode([int(i)]), float(p.exp().item()))
                    for p, i in zip(top_logp, top_idx)]
            logger.warning(
                f"pmass<0.5 at scoring position. Top-3 tokens: {tops} "
                f"→ schema broken. Adjust schema_hint or add variants "
                f"to choice_token_ids."
            )
    return GuidedResult(
        user_prompt=user_prompt,
        think_text=think_text,
        answer_text=answer_text,
        raw_full_text=raw_full_text,
        pmass_format=pmass_format,
        logratio_ab=logratio,
        rep_ratio_think=_ngram_rep_ratio(think_text, n=4),
        # Count only the pre-</think> tokens; score_ids also includes the
        # close marker and prefill, so score_ids.shape[1] - prompt_len overcounts.
        think_tokens=len(tok(think_text, add_special_tokens=False).input_ids),
        emitted_close=emitted_close,
        emitted_prefill=emitted_prefill,
    )
def choice_token_ids_tf(tok) -> list[list[int]]:
    """[[true_variants...], [false_variants...]] — last token of each surface
    form, i.e. the value token expected right after `{"choice": `."""
    def _variants(words):
        seen = []
        for s in words:
            tid = tok.encode(s, add_special_tokens=False)[-1]
            if tid not in seen:
                seen.append(tid)
        return seen
    return [_variants(["true", " true", "\ntrue", "True", " True", "\nTrue"]),
            _variants(["false", " false", "\nfalse", "False", " False", "\nFalse"])]

Guided CoT eval

A trick for getting better eval signal from thinking models, with a fixed token budget.

The problem

Standard eval for concept ablation is teacher-forced: you feed the model a prefix like "My choice: **" and read the logprobs for Yes vs No. That's fast, but you only measure the effect on one token. The model never gets to reason under ablation, so you miss whether ablation actually changes the chain of thought.

Full on-policy generation (let the model write freely, parse the answer) captures everything but is slow, and parsing Yes/No from free text is fragile.
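For comparison, the teacher-forced read itself is just a log-softmax and two lookups. A minimal sketch with toy logits (values and token ids are made up, standing in for a real model's output at the scoring position):

```python
import torch
import torch.nn.functional as F

# Toy next-token logits standing in for model(ids).logits[0, -1]
# at the position right after "My choice: **".
logits = torch.tensor([2.0, 0.5, -1.0, -3.0])  # toy 4-token vocab
yes_id, no_id = 0, 1                           # hypothetical token ids

logp = F.log_softmax(logits, dim=-1)
logratio = (logp[yes_id] - logp[no_id]).item()  # positive means "Yes" wins
```

Note that the difference of log-softmax values equals the difference of raw logits, so here the logratio is exactly 2.0 - 0.5 = 1.5.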

The trick

Let the model think for a bit, then force it to answer. Three steps:

  1. Generate a short reasoning trace (32 tokens) under ablation, greedy decoding
  2. Append a fixed suffix: \nI should answer now.\n</think>\nMy choice: **
  3. One forward pass on the whole sequence, read logprobs at the final position

The output is a logratio: log P(Yes) - log P(No), summed over tokenizer variants of "Yes"/"No" via logsumexp. Because this is a continuous logratio rather than a hard label, you can also compute calibrated uncertainties.
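To turn the logratio into a calibrated probability over the two options, a sigmoid suffices, since log P(Yes) - log P(No) is exactly the log-odds of Yes restricted to {Yes, No}. A small sketch:

```python
import math

def logratio_to_p_yes(logratio: float) -> float:
    """P(Yes | answer in {Yes, No}) from log P(Yes) - log P(No)."""
    return 1.0 / (1.0 + math.exp(-logratio))

logratio_to_p_yes(0.0)   # 0.5: model is indifferent
logratio_to_p_yes(2.0)   # ~0.88: leans Yes
```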

Here's what the token sequence looks like:

<|im_start|>assistant
<think>                              <-- chat template adds this
Thinking Process:
1. Analyze the Request:
   * Role: Main person...            <-- 32 tokens of on-policy reasoning
I should answer now.                  <-- appended
</think>
My choice: **                         <-- appended, score here
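The sequence above is assembled by plain string concatenation before the final forward pass. A sketch (the template prefix and trace text are illustrative, not real model output):

```python
# Illustrative chat-template prefix; in practice it comes from
# tok.apply_chat_template(..., add_generation_prompt=True).
prompt = "<|im_start|>assistant\n<think>\n"
think_trace = "Thinking Process:\n1. Analyze the Request: ..."  # on-policy tokens
suffix = "\nI should answer now.\n</think>\nMy choice: **"      # fixed, appended

scoring_text = prompt + think_trace + suffix
# The next-token distribution at the end of scoring_text is where Yes/No is read.
```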

Pseudocode

def guided_eval(model, prompt, n_think=32):
    # prompt ends at "<think>\n" (from chat template)

    # ── 1. On-policy thinking (ablated) ──
    ids = model.generate(prompt, max_new=n_think, greedy=True)

    # ── 2. Force transition to answer ──
    suffix = tokenize("\nI should answer now.\n</think>\nMy choice: **")
    ids = cat([ids, suffix])

    # ── 3. Score final position ──
    logits = model(ids).logits[:, -1, :]   # ℓ ∈ ℝ^V
    p = log_softmax(logits)

    p_yes = logsumexp(p[yes_ids])
    p_no = logsumexp(p[no_ids])

    # pmass = exp(p_yes) + exp(p_no) should be > 0.5
    # if not, the model isn't predicting Yes/No

    return p_yes - p_no                 # logratio

When to use it

Teacher-forced is fine if you just want "does ablation flip the answer?" Guided CoT is better when you care about how ablation changes the reasoning path, because 32 tokens is enough for the chain of thought to diverge before scoring.

In practice, guided logratios correlate with teacher-forced (same sign, similar magnitude) but with more variance. That variance is from the reasoning trace, and it's informative.

I've used this across several projects and it gives better uncertainty estimates than teacher-forced, since you get proper logratios from a model that actually reasoned about the question.

Cost: 32 think tokens + 13 suffix tokens + 1 forward pass per item. For a 1360-item sweep at 8 prompts each, that's ~10K short generations instead of ~10K long ones for full on-policy.

Troubleshooting

pmass < 0.5

pmass is exp(p_yes) + exp(p_no), the total probability on Yes/No. If it's below 0.5, the model isn't confidently choosing either option after "My choice: **".
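The pmass computation is easy to reproduce in isolation. A toy sketch with made-up logits and token ids (in the real eval, `logp` comes from the scoring position):

```python
import torch
import torch.nn.functional as F

logits = torch.tensor([3.0, 2.5, -2.0, -2.0, -2.0])  # toy vocab
yes_ids, no_ids = [0], [1]                           # hypothetical variant ids

logp = F.log_softmax(logits, dim=-1)
pmass = logp[yes_ids + no_ids].exp().sum().item()
if pmass < 0.5:
    print("model is not predicting Yes/No here")
```

Here nearly all the mass sits on the two choice tokens, so pmass is close to 1; when the model is emitting something else entirely, it drops well below 0.5.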

Things to check:

  • The </think> token must be the special token (ID 248069 for Qwen3.5), not the raw string. Tokenizers handle this differently.
  • If think_tokens is too low, the model hasn't finished a coherent thought and the forced suffix confuses it. Try bumping to 64.
  • The model needs to support <think> blocks in its chat template.

Incoherent thinking trace

  • Make sure the prompt ends at the chat template's generation point. For Qwen3.5 that's <|im_start|>assistant\n<think>\n. Don't add your own <think> tag on top.
  • Large ablation coefficients (|c| > 2) can make generation incoherent. Look at the thinking trace first when debugging.

OOM

model.generate() allocates KV cache, which uses more memory than a single forward pass. I use bs=4 for guided mode vs bs=16 for teacher-forced.

Logratios identical to teacher-forced

The trace is too short for reasoning to diverge. At 4 tokens there's barely any thinking, so the score converges to teacher-forced. Start at 32, try 64-128 for more divergence.

Design notes

Greedy decoding (do_sample=False) for the thinking trace so measurements are deterministic across runs.

The "I should answer now" suffix gives the model a natural transition into answering. Without it, a bare </think> appears abruptly and the model doesn't handle the context switch as cleanly.

We don't wait for the model to generate its own </think> because it might never produce one, and a fixed token budget keeps runs comparable.

logsumexp over multiple Yes/No token IDs because tokenizers can encode "Yes" as "Yes", "yes", " Yes", etc. Summing captures the full decision mass.
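As a toy example of why the variant sum matters: if the tokenizer splits "Yes" mass across two surface forms, logsumexp over both recovers the full decision mass (probabilities and ids below are invented):

```python
import torch

logp = torch.log(torch.tensor([0.30, 0.25, 0.35, 0.10]))  # toy next-token probs
yes_ids = [0, 1]   # e.g. "Yes" and " Yes" (hypothetical ids)
no_ids = [2, 3]    # e.g. "No" and " No"

log_p_yes = torch.logsumexp(logp[yes_ids], dim=0)  # log(0.30 + 0.25) = log(0.55)
log_p_no = torch.logsumexp(logp[no_ids], dim=0)    # log(0.35 + 0.10) = log(0.45)
logratio = (log_p_yes - log_p_no).item()           # log(0.55/0.45), about +0.20
```

Scoring only token 0 against token 2 would flip the sign (0.30 vs 0.35); summing the variants gives the correct Yes-leaning answer.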


wassname commented Apr 5, 2026

I'll note

  • this works well and leads to much faster research
  • you get much less variance because you read log probs directly, so you can eval with fewer rollouts
  • this seems to correlate well with a full eval, so I treat it as a quick proxy for dev, and also for more informative error bars.
