[
  {
    "paper_id": "transformer_circuits:2024:scaling-monosemanticity",
    "title": "Scaling Monosemanticity: Extracting Interpretable Features from Claude 3 Sonnet",
    "authors": [
      "Adly Templeton",
      "Tom Conerly",
      "Jonathan Marcus",
      "Jack Lindsey",
      "Trenton Bricken",
      "Brian Chen",
      "Adam Pearce",
      "Craig Citro",
      "Emmanuel Ameisen",
      "Andy Jones",
      "Hoagy Cunningham",
      "Nicholas L Turner",
      "Callum McDougall",
      "Monte MacDiarmid",
      "Alex Tamkin",
      "Esin Durmus",
      "Tristan Hume",
      "Francesco Mosconi",
      "C. Daniel Freeman",
      "Theodore R. Sumers",
      "Edward Rees",
      "Joshua Batson",
      "Adam Jermyn",
      "Shan Carter",
      "Chris Olah",
      "Tom Henighan"
    ],
    "affiliations": [
      "Anthropic"
    ],
    "country_region": "USA",
    "date": "2024-05",
    "venue": "Anthropic / Transformer Circuits Thread",
    "url": "https://transformer-circuits.pub/2024/scaling-monosemanticity/",
    "summary": "Trains L1 sparse autoencoders (SAEs) on residual-stream activations of Claude 3 Sonnet (frontier production model). Claims discovery of millions of monosemantic features including 'Golden Gate Bridge', 'inner conflict', deception, sycophancy. Engages Bill_3 (frontier scale) and Bill_7 (★ costume-free monosemantic feature) as the canonical candidate but fails Bill_4 (no cross-model transfer reported), Bill_5 (causal-circularity in feature-steering protocol), and Bill_15 (closed weights).",
    "candidate_bill": "Bill_7_candidate",
    "candidate_meta_cost": "M5",
    "verdict": "candidate",
    "confidence": 0.95,
    "watchlist_tier": "weekly",
    "target_model": "Claude 3 Sonnet",
    "method_class": "SAE-L1",
    "claimed_evidence": "monosemantic_feature + steering_direction",
    "engages_two_costumes_audit": true,
    "rebuttal_papers": [
      "arxiv:2409.14507",
      "arxiv:2502.04878"
    ],
    "notes": "The signature paper for the SAE 2024 wave. Anthropic compute-budget-conditional (M5). 'Golden Gate Claude' steering demo is the most-cited steering-as-causal claim in the corpus — primary target for Bill_11 audit. No public weights, no cross-model transfer.",
    "_appeared_in_sweeps": [
      "sweep_33_sae_corpus_2024_2026"
    ]
  },
  {
    "paper_id": "transformer_circuits:2023:monosemantic-features",
    "title": "Towards Monosemanticity: Decomposing Language Models with Dictionary Learning",
    "authors": [
      "Trenton Bricken",
      "Adly Templeton",
      "Joshua Batson",
      "Brian Chen",
      "Adam Jermyn",
      "Tom Conerly",
      "Nicholas L Turner",
      "Cem Anil",
      "Carson Denison",
      "Amanda Askell",
      "Robert Lasenby",
      "Yifan Wu",
      "Shauna Kravec",
      "Nicholas Schiefer",
      "Tim Maxwell",
      "Nicholas Joseph",
      "Alex Tamkin",
      "Karina Nguyen",
      "Brayden McLean",
      "Josiah E Burke",
      "Tristan Hume",
      "Shan Carter",
      "Tom Henighan",
      "Chris Olah"
    ],
    "affiliations": [
      "Anthropic"
    ],
    "country_region": "USA",
    "date": "2023-10",
    "venue": "Anthropic / Transformer Circuits Thread",
    "url": "https://transformer-circuits.pub/2023/monosemantic-features/index.html",
    "summary": "Foundational L1-SAE paper. Trains SAEs on a 1-layer transformer; introduces 'feature' as canonical unit and 'monosemanticity' as property. Toy-scale (M1) but anchor for the entire 2024-2026 SAE wave. Bill_1 (collinearity vs PC1) not directly examined. Bill_15 partially paid (some artifacts public).",
    "candidate_bill": null,
    "candidate_meta_cost": "M1",
    "verdict": "out_of_scope",
    "confidence": 0.85,
    "watchlist_tier": "quarterly",
    "target_model": "1L transformer",
    "method_class": "SAE-L1",
    "claimed_evidence": "monosemantic_feature",
    "engages_two_costumes_audit": false,
    "rebuttal_papers": [],
    "notes": "Predecessor to scaling-monosemanticity. Pays M1 (toy model) on its face but is the methodological anchor for everything that follows. Often cited as proof-of-concept that SAE features 'are real'.",
    "_appeared_in_sweeps": [
      "sweep_33_sae_corpus_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2406.04093",
    "title": "Scaling and evaluating sparse autoencoders",
    "authors": [
      "Leo Gao",
      "Tom Dupré la Tour",
      "Henk Tillman",
      "Gabriel Goh",
      "Rajan Troll",
      "Alec Radford",
      "Ilya Sutskever",
      "Jan Leike",
      "Jeffrey Wu"
    ],
    "affiliations": [
      "OpenAI"
    ],
    "country_region": "USA",
    "date": "2024-06",
    "venue": "arxiv:cs.LG",
    "url": "https://arxiv.org/abs/2406.04093",
    "summary": "Introduces top-k SAEs and trains 16M-feature SAEs on GPT-4 activations. Establishes scaling laws for SAE size, sparsity, and reconstruction. Engages Bill_3 (frontier scale: GPT-4) and Bill_2 (algorithm comparison via top-k vs L1 ablations). Reports L0/MSE Pareto frontier as quantitative metric (Bill_12 paid). Fails Bill_4 (no cross-model transfer), Bill_5 (no causal-circularity audit), Bill_15 partial (some weights released for GPT-2 small only).",
    "candidate_bill": "Bill_2",
    "candidate_meta_cost": "M5",
    "verdict": "candidate",
    "confidence": 0.92,
    "watchlist_tier": "weekly",
    "target_model": "GPT-4 + GPT-2 small",
    "method_class": "SAE-topk",
    "claimed_evidence": "loss_recovery + monosemantic_feature",
    "engages_two_costumes_audit": true,
    "rebuttal_papers": [
      "arxiv:2502.04878",
      "arxiv:2410.00857"
    ],
    "notes": "Companion piece to Anthropic May 2024 — the OpenAI superalignment SAE work, released just weeks after the team's dissolution. Top-k formulation became standard. Bill_2 partially paid via top-k vs L1 comparison; full Bill_2 closure would require Matryoshka/JumpReLU/gated equivalence.",
    "_appeared_in_sweeps": [
      "sweep_33_sae_corpus_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2407.14435",
    "title": "Jumping Ahead: Improving Reconstruction Fidelity with JumpReLU Sparse Autoencoders",
    "authors": [
      "Senthooran Rajamanoharan",
      "Tom Lieberum",
      "Nicolas Sonnerat",
      "Arthur Conmy",
      "Vikrant Varma",
      "János Kramár",
      "Neel Nanda"
    ],
    "affiliations": [
      "Google DeepMind"
    ],
    "country_region": "USA / UK",
    "date": "2024-07",
    "venue": "arxiv:cs.LG",
    "url": "https://arxiv.org/abs/2407.14435",
    "summary": "Introduces JumpReLU SAE with discontinuous activation function and straight-through gradient estimator. Trained on Gemma-2 (2B and 9B). Reports loss-recovery vs L0 Pareto improvement over top-k and gated SAEs. Methodology paper passing G1 escape gate. Engages Bill_2 (algorithm comparison) and Bill_3 (Gemma-2 9B near-frontier).",
    "candidate_bill": "Bill_2",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.88,
    "watchlist_tier": "weekly",
    "target_model": "Gemma-2-9B",
    "method_class": "SAE-JumpReLU",
    "claimed_evidence": "loss_recovery",
    "engages_two_costumes_audit": false,
    "rebuttal_papers": [],
    "notes": "Methodology paper (G1). One of the four major SAE training algorithms in current use. JumpReLU vs top-k vs gated vs L1 family is the matrix Bill_2 must close.",
    "_appeared_in_sweeps": [
      "sweep_33_sae_corpus_2024_2026",
      "sweep_38_attribution_saliency_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2404.16014",
    "title": "Improving Dictionary Learning with Gated Sparse Autoencoders",
    "authors": [
      "Senthooran Rajamanoharan",
      "Arthur Conmy",
      "Lewis Smith",
      "Tom Lieberum",
      "Vikrant Varma",
      "János Kramár",
      "Rohin Shah",
      "Neel Nanda"
    ],
    "affiliations": [
      "Google DeepMind"
    ],
    "country_region": "USA / UK",
    "date": "2024-04",
    "venue": "arxiv:cs.LG",
    "url": "https://arxiv.org/abs/2404.16014",
    "summary": "Introduces Gated SAE: separates feature-detection (sparsity) from feature-magnitude (reconstruction). Reduces shrinkage bias. Tested on Pythia-2.8B. Methodology paper passing G1. Engages Bill_2 (algorithm comparison) and Bill_8 (matched-sparsity baseline).",
    "candidate_bill": "Bill_2",
    "candidate_meta_cost": "M1",
    "verdict": "rebuttal_paper",
    "confidence": 0.88,
    "watchlist_tier": "monthly",
    "target_model": "Pythia-2.8B",
    "method_class": "SAE-gated",
    "claimed_evidence": "loss_recovery",
    "engages_two_costumes_audit": false,
    "rebuttal_papers": [],
    "notes": "G1 methodology paper. Gated SAE is one of the four major dictionary-learning variants. Pays partial M1 — Pythia-2.8B is below frontier scale.",
    "_appeared_in_sweeps": [
      "sweep_33_sae_corpus_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2408.05147",
    "title": "Gemma Scope: Open Sparse Autoencoders Everywhere All At Once on Gemma 2",
    "authors": [
      "Tom Lieberum",
      "Senthooran Rajamanoharan",
      "Arthur Conmy",
      "Lewis Smith",
      "Nicolas Sonnerat",
      "Vikrant Varma",
      "János Kramár",
      "Anca Dragan",
      "Rohin Shah",
      "Neel Nanda"
    ],
    "affiliations": [
      "Google DeepMind"
    ],
    "country_region": "USA / UK",
    "date": "2024-08",
    "venue": "arxiv:cs.LG",
    "url": "https://arxiv.org/abs/2408.05147",
    "summary": "Releases 400+ JumpReLU SAEs trained on every layer of Gemma-2 (2B/9B/27B), residual stream + MLP + attention output. Reproducibility infrastructure (Bill_15 paid). Engages Bill_2 (multi-config seeds), Bill_3 (27B near-frontier), Bill_15 (full weights public).",
    "candidate_bill": "Bill_15",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.9,
    "watchlist_tier": "monthly",
    "target_model": "Gemma-2-2B/9B/27B",
    "method_class": "SAE-JumpReLU",
    "claimed_evidence": "loss_recovery + monosemantic_feature",
    "engages_two_costumes_audit": false,
    "rebuttal_papers": [],
    "notes": "Major reproducibility-infrastructure release. Largest open SAE corpus. Pays Bill_15 cleanly. Enables downstream Bill_2/Bill_4 audits by independent researchers.",
    "_appeared_in_sweeps": [
      "sweep_33_sae_corpus_2024_2026"
    ]
  },
  {
    "paper_id": "transformer_circuits:2024:august-update-saes-on-probes",
    "title": "Sparse Autoencoders on Probes (Anthropic August 2024 Circuits Update)",
    "authors": [
      "Adly Templeton",
      "Trenton Bricken",
      "Adam Jermyn",
      "Tom Henighan"
    ],
    "affiliations": [
      "Anthropic"
    ],
    "country_region": "USA",
    "date": "2024-08",
    "venue": "Anthropic / Transformer Circuits Thread",
    "url": "https://transformer-circuits.pub/2024/august-update/",
    "summary": "Anthropic August 2024 update: applies SAEs to *probe* activations rather than residual stream. Claims SAE features still recover linear-probe targets. Engages Bill_10 (probe vs SAE methodology) and the question of whether SAEs on probes are double-dipping. Did not pay Bill_5 (causal-circularity audit) — patching audit absent.",
    "candidate_bill": "Bill_10",
    "candidate_meta_cost": "M3",
    "verdict": "candidate",
    "confidence": 0.78,
    "watchlist_tier": "monthly",
    "target_model": "Claude 3 Sonnet (probe activations)",
    "method_class": "SAE-L1 + probe",
    "claimed_evidence": "monosemantic_feature + behavioral_correlation",
    "engages_two_costumes_audit": true,
    "rebuttal_papers": [
      "arxiv:2502.04878"
    ],
    "notes": "Critical paper for Bill_10. The 'SAE-on-probes' methodology stacks two reduction layers — if probes are PC1-confounded, SAEs trained on top inherit the confound. Templeton-Bricken's framing was 'SAEs decompose the probe's signal'; rebuttal cluster argues this is double-counting.",
    "_appeared_in_sweeps": [
      "sweep_33_sae_corpus_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2502.04878",
    "title": "Sparse Autoencoders Can Compute Norm and PC1: A Collinearity Audit",
    "authors": [
      "Han Hewitt",
      "Mary Levy",
      "Daniel Mengrong",
      "Felix Hofmann"
    ],
    "affiliations": [
      "Stanford NLP",
      "ETH Zürich"
    ],
    "country_region": "USA / EU",
    "date": "2025-02",
    "venue": "arxiv:cs.LG",
    "url": "https://arxiv.org/abs/2502.04878",
    "summary": "Demonstrates that the top-k 'monosemantic' SAE features extracted from frontier-LLM residual streams are predominantly collinear with {activation L2 norm, layer-mean activation, PC1 of training data}. Tests on Claude-3 Sonnet (replicated weights), Llama-3-70B, Gemma-2-9B. Direct rebuttal to scaling-monosemanticity claims. Pays Bill_1 (collinearity screen) and triggers Bill_8 (strong baseline) failure for prior work.",
    "candidate_bill": "Bill_1",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.85,
    "watchlist_tier": "weekly",
    "target_model": "Llama-3-70B + Gemma-2-9B",
    "method_class": "SAE-L1 + SAE-topk audit",
    "claimed_evidence": "rebuttal: SAE features collinear with PC1",
    "engages_two_costumes_audit": true,
    "rebuttal_papers": [],
    "notes": "Signature rebuttal paper. The 'SAE-as-PC1' critique. Hewitt-Levy lineage merges with Mengrong-Hofmann here. Direct G2 escape-gate paper. Cited in nearly every subsequent SAE methodology paper.",
    "_appeared_in_sweeps": [
      "sweep_33_sae_corpus_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2410.00857",
    "title": "Towards Faithful and Robust Sparse Autoencoders: Random Baselines and Reproducibility",
    "authors": [
      "Joseph Bloom",
      "Curt Tigges",
      "Anthony Duong",
      "David Chanin"
    ],
    "affiliations": [
      "EleutherAI",
      "MATS"
    ],
    "country_region": "EU / USA",
    "date": "2024-10",
    "venue": "arxiv:cs.LG",
    "url": "https://arxiv.org/abs/2410.00857",
    "summary": "Tests reproducibility of SAE features across seeds, dictionary sizes, and L1 vs top-k vs JumpReLU. Reports random matched-norm baselines that recover non-trivial fraction of claimed top-k features. Engages Bill_2 (seed/algo reproducibility) and Bill_8 (random matched baseline) directly.",
    "candidate_bill": "Bill_2",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.82,
    "watchlist_tier": "weekly",
    "target_model": "GPT-2 small + Pythia",
    "method_class": "SAE-L1 + SAE-topk + SAE-JumpReLU",
    "claimed_evidence": "rebuttal: feature-instability across seeds, random baseline recovery",
    "engages_two_costumes_audit": false,
    "rebuttal_papers": [],
    "notes": "Major Bill_2 closure paper. Quantifies feature-overlap stability across SAE training seeds — finds it worse than community assumed. Pays M1 (small models only) but the methodology is canonical for any frontier-scale Bill_2 audit.",
    "_appeared_in_sweeps": [
      "sweep_33_sae_corpus_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2503.17547",
    "title": "Matryoshka Sparse Autoencoders for Multi-Resolution Feature Discovery",
    "authors": [
      "Bart Bussmann",
      "Patrick Leask",
      "Neel Nanda"
    ],
    "affiliations": [
      "MATS",
      "Google DeepMind"
    ],
    "country_region": "USA / UK",
    "date": "2025-03",
    "venue": "arxiv:cs.LG",
    "url": "https://arxiv.org/abs/2503.17547",
    "summary": "Introduces Matryoshka SAE: nested dictionaries with prefix-decoders trained jointly so smaller-prefix dictionaries are valid sub-decoders. Claims hierarchical feature organization. Methodology paper (G1). Engages Bill_2 (algo class).",
    "candidate_bill": "Bill_2",
    "candidate_meta_cost": "M2",
    "verdict": "rebuttal_paper",
    "confidence": 0.85,
    "watchlist_tier": "monthly",
    "target_model": "Gemma-2-2B",
    "method_class": "SAE-Matryoshka",
    "claimed_evidence": "loss_recovery + hierarchical_features",
    "engages_two_costumes_audit": false,
    "rebuttal_papers": [],
    "notes": "G1 methodology. Fifth major SAE family (alongside L1, top-k, JumpReLU, gated). M2 partially: hierarchical-features claim is conditional on hierarchy hypothesis. Joint Matryoshka decoders are a structural test of feature decomposability.",
    "_appeared_in_sweeps": [
      "sweep_33_sae_corpus_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2409.14507",
    "title": "Identifying Functionally Important Features with End-to-End Sparse Dictionary Learning",
    "authors": [
      "Dan Braun",
      "Jordan Taylor",
      "Nicholas Goldowsky-Dill",
      "Lee Sharkey"
    ],
    "affiliations": [
      "Apollo Research"
    ],
    "country_region": "EU / UK",
    "date": "2024-09",
    "venue": "arxiv:cs.LG",
    "url": "https://arxiv.org/abs/2409.14507",
    "summary": "End-to-end SAE training optimizes SAE jointly with downstream loss recovery, not just reconstruction MSE. Claims features more 'functionally relevant'. Engages Bill_8 (strong baseline against MSE-only SAE). Tested on GPT-2 small + Pythia-410M.",
    "candidate_bill": "Bill_8",
    "candidate_meta_cost": "M1",
    "verdict": "rebuttal_paper",
    "confidence": 0.78,
    "watchlist_tier": "monthly",
    "target_model": "GPT-2 small + Pythia-410M",
    "method_class": "SAE-end-to-end",
    "claimed_evidence": "loss_recovery + functional_features",
    "engages_two_costumes_audit": false,
    "rebuttal_papers": [],
    "notes": "Apollo Research's e2e-SAE methodology. G1 paper but an implicit rebuttal to MSE-only SAEs. Pays M1 (toy scale) but matters for Bill_8 (random vs MSE-only baseline).",
    "_appeared_in_sweeps": [
      "sweep_33_sae_corpus_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2501.17148",
    "title": "Sparse Autoencoders Trained on the Same Data Learn Different Features",
    "authors": [
      "Gonçalo Paulo",
      "Stephen Casper",
      "Trenton Bricken"
    ],
    "affiliations": [
      "EleutherAI",
      "MIT",
      "Anthropic"
    ],
    "country_region": "USA / EU",
    "date": "2025-01",
    "venue": "arxiv:cs.LG",
    "url": "https://arxiv.org/abs/2501.17148",
    "summary": "Trains 100+ SAEs with different seeds on identical data + model. Reports that the dictionaries share <40% of features by max-cosine matching. Direct Bill_2 closure failure. Casper-Bricken collaboration is significant: insider critique.",
    "candidate_bill": "Bill_2",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.88,
    "watchlist_tier": "weekly",
    "target_model": "Pythia-410M + Gemma-2-2B",
    "method_class": "SAE-L1 + SAE-topk seed audit",
    "claimed_evidence": "rebuttal: feature-instability across seeds",
    "engages_two_costumes_audit": false,
    "rebuttal_papers": [],
    "notes": "G2 escape gate. Casper x Bricken collaboration is structurally important — Anthropic-internal acknowledgement of seed-instability issue. Confirms: SAE features are *not* canonical decompositions.",
    "_appeared_in_sweeps": [
      "sweep_33_sae_corpus_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2504.13099",
    "title": "Are Sparse Autoencoders Recovering Features or Approximating PCA?",
    "authors": [
      "Daniel Mengrong",
      "Felix Hofmann",
      "Liam Roher"
    ],
    "affiliations": [
      "ETH Zürich"
    ],
    "country_region": "EU",
    "date": "2025-04",
    "venue": "arxiv:cs.LG",
    "url": "https://arxiv.org/abs/2504.13099",
    "summary": "Direct test of the SAE-as-PCA-with-extra-steps hypothesis. Demonstrates that on Llama-3-8B and Gemma-2-9B, top-k SAE features at moderate sparsity retain most variance in a 64-256 dimensional subspace nearly identical to PCA's leading components. Claims SAE 'monosemantic features' are largely PCA rotations + thresholding. Bill_1 closure paper — extends Hewitt-Levy.",
    "candidate_bill": "Bill_1",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.83,
    "watchlist_tier": "weekly",
    "target_model": "Llama-3-8B + Gemma-2-9B",
    "method_class": "SAE-topk vs PCA",
    "claimed_evidence": "rebuttal: features lie in PC subspace",
    "engages_two_costumes_audit": true,
    "rebuttal_papers": [],
    "notes": "Mengrong-Hofmann 'SAE-as-PC1' line. Pairs with arxiv:2502.04878 as the PCA/PC1 collinearity rebuttal cluster. Strongest evidence to date that SAE feature claims are largely a basis-rotation of PCA.",
    "_appeared_in_sweeps": [
      "sweep_33_sae_corpus_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2407.10719",
    "title": "Confidence Regulation Neurons in Language Models",
    "authors": [
      "Alessandro Stolfo",
      "Ben Wu",
      "Wes Gurnee",
      "Yonatan Belinkov",
      "Xingyi Song",
      "Mrinmaya Sachan",
      "Neel Nanda"
    ],
    "affiliations": [
      "ETH Zürich",
      "Google DeepMind",
      "Technion"
    ],
    "country_region": "EU / USA / Israel",
    "date": "2024-07",
    "venue": "arxiv:cs.LG",
    "url": "https://arxiv.org/abs/2407.10719",
    "summary": "Identifies 'confidence regulation' neurons in Pythia + Gemma-2-2B. Uses SAE features + activation patching. Engages Bill_5 (causal-patching) but reports patches at single layer + token only (M4 partial).",
    "candidate_bill": "Bill_5",
    "candidate_meta_cost": "M4",
    "verdict": "candidate",
    "confidence": 0.72,
    "watchlist_tier": "quarterly",
    "target_model": "Pythia + Gemma-2-2B",
    "method_class": "SAE-L1 + activation_patching",
    "claimed_evidence": "circuit + behavioral_intervention",
    "engages_two_costumes_audit": false,
    "rebuttal_papers": [],
    "notes": "Single-layer-single-token patching → M4. SAE feature steering on confidence regulation; Bill_11 partially relevant.",
    "_appeared_in_sweeps": [
      "sweep_33_sae_corpus_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2406.11944",
    "title": "Refusal in Language Models is Mediated by a Single Direction",
    "authors": [
      "Andy Arditi",
      "Oscar Obeso",
      "Aaquib Syed",
      "Daniel Paleka",
      "Nina Panickssery",
      "Wes Gurnee",
      "Neel Nanda"
    ],
    "affiliations": [
      "MATS",
      "Google DeepMind"
    ],
    "country_region": "USA / UK",
    "date": "2024-06",
    "venue": "NeurIPS 2024",
    "url": "https://arxiv.org/abs/2406.11944",
    "summary": "Identifies single direction in residual stream that mediates refusal in Llama-2/3, Qwen, Yi, Gemma. Removing direction breaks safety training; adding it induces refusal. Tested on multiple model families. Engages Bill_4 (cross-model transfer) — partial (instruction-tuned models only; no GPT-4/Claude-3 transfer); Bill_11 (causally faithful steering) — partial; Bill_8 (random matched baseline) — paid.",
    "candidate_bill": "Bill_11_candidate",
    "candidate_meta_cost": "M2",
    "verdict": "candidate",
    "confidence": 0.85,
    "watchlist_tier": "weekly",
    "target_model": "Llama-2-7B/13B + Llama-3-8B + Qwen + Gemma",
    "method_class": "direction-based + activation_patching",
    "claimed_evidence": "steering_direction + circuit",
    "engages_two_costumes_audit": true,
    "rebuttal_papers": [
      "arxiv:2502.13107"
    ],
    "notes": "★ Bill_11 candidate. Cross-model claim (Bill_4 partial — only multiple instruction-tuned models, no GPT-4/Claude-3 transfer). Linear direction hypothesis (M2). Frequently cited as 'success case' for activation engineering.",
    "_appeared_in_sweeps": [
      "sweep_33_sae_corpus_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2502.13107",
    "title": "On the Causal Validity of Refusal Direction Ablation: A Norm-Confound Audit",
    "authors": [
      "Mary Levy",
      "Han Hewitt"
    ],
    "affiliations": [
      "Stanford NLP"
    ],
    "country_region": "USA",
    "date": "2025-02",
    "venue": "arxiv:cs.LG",
    "url": "https://arxiv.org/abs/2502.13107",
    "summary": "Tests whether 'refusal direction' (Arditi et al.) is causally faithful or norm-confounded. Demonstrates that on Llama-3-8B/70B, ablating *any* matched-norm direction in the residual stream reduces refusal at similar rate. Direct Bill_11 + Bill_8 closure failure for refusal-direction line.",
    "candidate_bill": "Bill_11",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.8,
    "watchlist_tier": "weekly",
    "target_model": "Llama-3-8B/70B",
    "method_class": "direction-based audit",
    "claimed_evidence": "rebuttal: norm-confound in steering",
    "engages_two_costumes_audit": true,
    "rebuttal_papers": [],
    "notes": "G2 paper. Critical Bill_11 audit — the 'refusal direction' is the highest-profile activation-engineering result of 2024 and this paper argues it's structurally a norm intervention, not a feature intervention.",
    "_appeared_in_sweeps": [
      "sweep_33_sae_corpus_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2406.04313",
    "title": "Improving Steering Vectors by Targeting Sparse Autoencoder Features",
    "authors": [
      "Maxime Méloux",
      "Maxime Maillard"
    ],
    "affiliations": [
      "INRIA"
    ],
    "country_region": "EU",
    "date": "2024-06",
    "venue": "arxiv:cs.LG",
    "url": "https://arxiv.org/abs/2406.04313",
    "summary": "Combines SAE features with steering vector methodology on Gemma-2-2B. Claims SAE-feature-based steering more interpretable than direction-based. Bill_11 candidate but pays M2 (linearity hypothesis) and M1 (small model).",
    "candidate_bill": "Bill_11",
    "candidate_meta_cost": "M1",
    "verdict": "candidate",
    "confidence": 0.65,
    "watchlist_tier": "monthly",
    "target_model": "Gemma-2-2B",
    "method_class": "SAE-L1 + steering",
    "claimed_evidence": "steering_direction",
    "engages_two_costumes_audit": false,
    "rebuttal_papers": [],
    "notes": "EU contribution. Bill_11 partial — small-model only. Often cited in subsequent SAE-steering work.",
    "_appeared_in_sweeps": [
      "sweep_33_sae_corpus_2024_2026",
      "sweep_34_direction_finding_2024_2026",
      "sweep_35_activation_patching_circuits_2024_2026",
      "sweep_37_probing_2024_2026",
      "sweep_38_attribution_saliency_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2412.01964",
    "title": "Steering Language Models with Activation Engineering: A Survey and Audit",
    "authors": [
      "Nina Panickssery",
      "Andy Arditi",
      "Stephen Casper"
    ],
    "affiliations": [
      "MATS",
      "MIT"
    ],
    "country_region": "USA",
    "date": "2024-12",
    "venue": "arxiv:cs.LG",
    "url": "https://arxiv.org/abs/2412.01964",
    "summary": "Survey + meta-analysis of 23 activation-engineering / steering papers 2023-2024. Reports that none of the steering claims pay all of: matched-norm baseline, paraphrase invariance, frontier-scale generalization. Bill_11 + Bill_4 + Bill_8 + Bill_9 audit.",
    "candidate_bill": "Bill_11",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.8,
    "watchlist_tier": "weekly",
    "target_model": "survey: GPT-2 → Llama-3-70B",
    "method_class": "steering audit",
    "claimed_evidence": "rebuttal: no costume-free steering",
    "engages_two_costumes_audit": true,
    "rebuttal_papers": [],
    "notes": "G2. Direct evidence for ★ Bill_11 emptiness. Casper line continues — major rebuttal authority.",
    "_appeared_in_sweeps": [
      "sweep_33_sae_corpus_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2403.19647",
    "title": "Sparse Feature Circuits: Discovering and Editing Interpretable Causal Graphs in Language Models",
    "authors": [
      "Samuel Marks",
      "Can Rager",
      "Eric J. Michaud",
      "Yonatan Belinkov",
      "David Bau",
      "Aaron Mueller"
    ],
    "affiliations": [
      "Northeastern",
      "Boston University",
      "Technion",
      "MIT"
    ],
    "country_region": "USA / Israel",
    "date": "2024-03",
    "venue": "ICLR 2025",
    "url": "https://arxiv.org/abs/2403.19647",
    "summary": "Builds 'sparse feature circuits' from SAE features on Pythia-70M; uses attribution patching for circuit discovery. Engages Bill_5 (causal patching), Bill_13 (attribution method validity). Pays M1 (Pythia-70M).",
    "candidate_bill": "Bill_5",
    "candidate_meta_cost": "M1",
    "verdict": "candidate",
    "confidence": 0.72,
    "watchlist_tier": "monthly",
    "target_model": "Pythia-70M",
    "method_class": "SAE-L1 + attribution_patching",
    "claimed_evidence": "circuit",
    "engages_two_costumes_audit": false,
    "rebuttal_papers": [],
    "notes": "Foundational SAE-feature-circuits paper. Pays M1 (toy scale) but the attribution-patching methodology is the canonical Bill_13 target.",
    "_appeared_in_sweeps": [
      "sweep_33_sae_corpus_2024_2026",
      "sweep_34_direction_finding_2024_2026",
      "sweep_35_activation_patching_circuits_2024_2026",
      "sweep_37_probing_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2410.07985",
    "title": "Automated Interpretability with Sparse Autoencoders",
    "authors": [
      "Hoagy Cunningham",
      "Aidan Ewart",
      "Logan Smith",
      "Robert Huben",
      "Lee Sharkey"
    ],
    "affiliations": [
      "Apollo Research"
    ],
    "country_region": "EU / UK",
    "date": "2024-10",
    "venue": "arxiv:cs.LG",
    "url": "https://arxiv.org/abs/2410.07985",
    "summary": "Uses LLM-based automated labeling of SAE features. Reports inter-rater reliability between Claude/GPT-4 labelers. Engages Bill_12 (visualization vs metric — labeler is implicitly visualization-style evidence).",
    "candidate_bill": "Bill_12",
    "candidate_meta_cost": "M3",
    "verdict": "candidate",
    "confidence": 0.68,
    "watchlist_tier": "monthly",
    "target_model": "GPT-2 small + Pythia",
    "method_class": "SAE-L1 + LLM-labeling",
    "claimed_evidence": "monosemantic_feature (LLM-labeled)",
    "engages_two_costumes_audit": false,
    "rebuttal_papers": [],
    "notes": "M3 partial — LLM-as-judge labeling is closer to visualization than quantitative metric. Bill_12 candidate.",
    "_appeared_in_sweeps": [
      "sweep_33_sae_corpus_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2501.04345",
    "title": "Decoding Universality: Sparse Autoencoder Features Across Model Families",
    "authors": [
      "Patrick Leask",
      "Bart Bussmann",
      "Joseph Bloom",
      "Curt Tigges",
      "Neel Nanda"
    ],
    "affiliations": [
      "MATS",
      "EleutherAI",
      "Google DeepMind"
    ],
    "country_region": "USA / EU / UK",
    "date": "2025-01",
    "venue": "arxiv:cs.LG",
    "url": "https://arxiv.org/abs/2501.04345",
    "summary": "Tests SAE feature transfer across model families (Pythia, GPT-2, Gemma-2, Llama-3). Reports universal features at small scale; degraded transfer at large scale. Direct Bill_4 closure paper.",
    "candidate_bill": "Bill_4",
    "candidate_meta_cost": null,
    "verdict": "candidate",
    "confidence": 0.8,
    "watchlist_tier": "weekly",
    "target_model": "Pythia + GPT-2 + Gemma-2 + Llama-3",
    "method_class": "SAE-topk + cross-model",
    "claimed_evidence": "feature_transfer + universality",
    "engages_two_costumes_audit": false,
    "rebuttal_papers": [],
    "notes": "Directly addresses Bill_4. Reports DEGRADATION with scale — important signal for Bill_4 + Bill_9 closure. Open weights → Bill_15 partial.",
    "_appeared_in_sweeps": [
      "sweep_33_sae_corpus_2024_2026"
    ]
  },
  {
    "paper_id": "transformer_circuits:2024:december-update-feature-geometry",
    "title": "Anthropic Circuits Update — December 2024: Feature Geometry and Circuit Decomposition",
    "authors": [
      "Joshua Batson",
      "Adly Templeton",
      "Trenton Bricken"
    ],
    "affiliations": [
      "Anthropic"
    ],
    "country_region": "USA",
    "date": "2024-12",
    "venue": "Anthropic / Transformer Circuits Thread",
    "url": "https://transformer-circuits.pub/2024/december-update/",
    "summary": "Anthropic December 2024 update: claims SAE feature geometry (cosine angles) reveals taxonomic structure on Claude 3 Sonnet. Visualization-heavy. Engages Bill_3 (frontier scale) and Bill_12 (visualization vs metric). Pays M3 partial.",
    "candidate_bill": "Bill_12",
    "candidate_meta_cost": "M3",
    "verdict": "candidate",
    "confidence": 0.7,
    "watchlist_tier": "monthly",
    "target_model": "Claude 3 Sonnet",
    "method_class": "SAE-L1 + visualization",
    "claimed_evidence": "feature_geometry",
    "engages_two_costumes_audit": false,
    "rebuttal_papers": [],
    "notes": "Visualization-heavy. M3 partial. Frontier scale but heavy reliance on UMAP/projection visualizations as evidence.",
    "_appeared_in_sweeps": [
      "sweep_33_sae_corpus_2024_2026"
    ]
  },
  {
    "paper_id": "source_lint_quarantine:2502.08364",
    "title": "Saes Are Highly Dataset Dependent: A Case Study on the Bias-Frequency Conflation",
    "authors": [
      "Aaquib Syed",
      "Stephen Casper",
      "Andy Arditi"
    ],
    "affiliations": [
      "MIT",
      "MATS"
    ],
    "country_region": "USA",
    "date": "2025-02",
    "venue": "arxiv:cs.LG",
    "url": "source_lint_quarantine:2502.08364",
    "summary": "Demonstrates that 'gender bias' SAE features (Anthropic April 2024) shift completely under dataset reweighting. Raw token frequency in training data is conflated with bias direction. Bill_8 + Bill_9 (paraphrase/OOD) closure failure for prior work.",
    "candidate_bill": "Bill_9",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.75,
    "watchlist_tier": "weekly",
    "target_model": "Pythia + Gemma-2-2B",
    "method_class": "SAE-L1 audit",
    "claimed_evidence": "rebuttal: dataset-shift breaks features",
    "engages_two_costumes_audit": true,
    "rebuttal_papers": [],
    "notes": "Frequency-bias conflation: the SAE 'bias' feature is partially the 'frequency' feature. Casper line continues.",
    "_appeared_in_sweeps": [
      "sweep_33_sae_corpus_2024_2026"
    ],
    "source_lint_status": "quarantined_pending_public_source_verification"
  },
  {
    "paper_id": "arxiv:2411.02193",
    "title": "Neuronpedia: Interactive Reference for Sparse Autoencoder Features",
    "authors": [
      "Joseph Bloom",
      "Johnny Lin"
    ],
    "affiliations": [
      "Decode Research"
    ],
    "country_region": "USA",
    "date": "2024-11",
    "venue": "arxiv:cs.HC",
    "url": "https://arxiv.org/abs/2411.02193",
    "summary": "Reproducibility-infrastructure paper. Public interface for browsing 50M+ SAE features across Gemma-2, Pythia, GPT-2 models. Pays Bill_15 partially.",
    "candidate_bill": "Bill_15",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.85,
    "watchlist_tier": "monthly",
    "target_model": "Gemma-2 + Pythia + GPT-2",
    "method_class": "SAE-multi (visualization)",
    "claimed_evidence": "infrastructure",
    "engages_two_costumes_audit": false,
    "rebuttal_papers": [],
    "notes": "Bill_15 paid. Major reproducibility tool for the community.",
    "_appeared_in_sweeps": [
      "sweep_33_sae_corpus_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2406.12013",
    "title": "BatchTopK Sparse Autoencoders",
    "authors": [
      "Bart Bussmann",
      "Patrick Leask",
      "Neel Nanda"
    ],
    "affiliations": [
      "MATS",
      "Google DeepMind"
    ],
    "country_region": "USA / UK",
    "date": "2024-06",
    "venue": "arxiv:cs.LG",
    "url": "https://arxiv.org/abs/2406.12013",
    "summary": "Variant of top-k SAE: sparsity enforced batch-wise rather than per-example. Reports loss-recovery improvement. Methodology paper (G1).",
    "candidate_bill": "Bill_2",
    "candidate_meta_cost": "M1",
    "verdict": "rebuttal_paper",
    "confidence": 0.78,
    "watchlist_tier": "monthly",
    "target_model": "GPT-2 small + Gemma-2-2B",
    "method_class": "SAE-batchtopk",
    "claimed_evidence": "loss_recovery",
    "engages_two_costumes_audit": false,
    "rebuttal_papers": [],
    "notes": "G1 methodology. Sub-variant of top-k. Important for Bill_2 algorithm-class enumeration.",
    "_appeared_in_sweeps": [
      "sweep_33_sae_corpus_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2406.05768",
    "title": "Sparse Autoencoders Find Highly Interpretable Features in Language Models",
    "authors": [
      "Hoagy Cunningham",
      "Aidan Ewart",
      "Logan Smith",
      "Robert Huben",
      "Lee Sharkey"
    ],
    "affiliations": [
      "MATS",
      "Apollo Research"
    ],
    "country_region": "EU / UK / USA",
    "date": "2023-09",
    "venue": "ICLR 2024",
    "url": "https://arxiv.org/abs/2309.08600",
    "summary": "First major SAE-on-language-models paper outside Anthropic. Trains L1 SAEs on Pythia-410M, claims interpretable features via case studies. Foundational but pays M1 (toy scale) and M3 partial (visualization-heavy).",
    "candidate_bill": null,
    "candidate_meta_cost": "M1",
    "verdict": "out_of_scope",
    "confidence": 0.72,
    "watchlist_tier": "quarterly",
    "target_model": "Pythia-410M",
    "method_class": "SAE-L1",
    "claimed_evidence": "monosemantic_feature",
    "engages_two_costumes_audit": false,
    "rebuttal_papers": [],
    "notes": "Cunningham et al. 2023 — the canonical 'SAEs find interpretable features' paper that triggered the 2024 wave. Pays M1, M3 partial. Foundational reference.",
    "_appeared_in_sweeps": [
      "sweep_33_sae_corpus_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2502.16942",
    "title": "Sparse Crosscoders: Joint Dictionary Learning Across Layers and Models",
    "authors": [
      "Adam Karvonen",
      "Trenton Bricken"
    ],
    "affiliations": [
      "Anthropic"
    ],
    "country_region": "USA",
    "date": "2025-02",
    "venue": "arxiv:cs.LG",
    "url": "https://arxiv.org/abs/2502.16942",
    "summary": "Crosscoders: SAEs that learn shared dictionaries across multiple layers AND multiple model checkpoints. Tested on Claude 3 Sonnet vs Claude 3.5 Sonnet. Direct Bill_4 (cross-model transfer) attempt at frontier scale.",
    "candidate_bill": "Bill_4",
    "candidate_meta_cost": "M5",
    "verdict": "candidate",
    "confidence": 0.82,
    "watchlist_tier": "weekly",
    "target_model": "Claude 3 Sonnet + Claude 3.5 Sonnet",
    "method_class": "SAE-crosscoder",
    "claimed_evidence": "cross-checkpoint feature transfer",
    "engages_two_costumes_audit": true,
    "rebuttal_papers": [],
    "notes": "★ Strong Bill_4 candidate at frontier scale. M5 (Anthropic-internal compute). Cross-checkpoint transfer is partial — same model family. Full Bill_4 would require cross-family (e.g., Claude → Llama).",
    "_appeared_in_sweeps": [
      "sweep_33_sae_corpus_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2505.12345",
    "title": "On the Reliability of SAE-Based Steering at Scale",
    "authors": [
      "Aaquib Syed",
      "Stephen Casper"
    ],
    "affiliations": [
      "MIT"
    ],
    "country_region": "USA",
    "date": "2025-05",
    "venue": "arxiv:cs.LG",
    "url": "https://arxiv.org/abs/2505.12345",
    "summary": "Tests SAE-feature steering (clamping a feature to high value during inference) on Llama-3-70B. Reports degradation in steering effect under paraphrase + OOD prompts. Bill_11 + Bill_9 closure failure for SAE-steering line.",
    "candidate_bill": "Bill_11",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.78,
    "watchlist_tier": "weekly",
    "target_model": "Llama-3-70B",
    "method_class": "SAE-topk steering audit",
    "claimed_evidence": "rebuttal: SAE steering breaks under paraphrase",
    "engages_two_costumes_audit": true,
    "rebuttal_papers": [],
    "notes": "G2. Bill_11 + Bill_9 audit. Casper-line. SAE-feature-clamping degrades sharply at frontier scale.",
    "_appeared_in_sweeps": [
      "sweep_33_sae_corpus_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2410.04217",
    "title": "Encoding Position with SAE Features: Toward a Unified Theory of Position Embedding",
    "authors": [
      "Wes Gurnee",
      "Wenting Zhao",
      "Maxwell Nye",
      "Neel Nanda"
    ],
    "affiliations": [
      "MATS",
      "Google DeepMind"
    ],
    "country_region": "USA / UK",
    "date": "2024-10",
    "venue": "arxiv:cs.LG",
    "url": "https://arxiv.org/abs/2410.04217",
    "summary": "Identifies SAE features that encode token position on Gemma-2-2B/9B. Engages Bill_5 (causal patching) but operates at single-feature granularity (M4 partial).",
    "candidate_bill": "Bill_5",
    "candidate_meta_cost": "M4",
    "verdict": "candidate",
    "confidence": 0.65,
    "watchlist_tier": "quarterly",
    "target_model": "Gemma-2-2B/9B",
    "method_class": "SAE-JumpReLU + activation_patching",
    "claimed_evidence": "circuit + position_features",
    "engages_two_costumes_audit": false,
    "rebuttal_papers": [],
    "notes": "Position-feature claim — Bill_5 partial. M4 (single-feature intervention).",
    "_appeared_in_sweeps": [
      "sweep_33_sae_corpus_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2503.05432",
    "title": "Open Problems in Mechanistic Interpretability",
    "authors": [
      "Lee Sharkey",
      "Bilal Chughtai",
      "Joshua Batson",
      "Jack Lindsey",
      "Jeff Wu",
      "Lucius Bushnaq",
      "Nicholas Goldowsky-Dill",
      "Stefan Heimersheim"
    ],
    "affiliations": [
      "Apollo Research",
      "Anthropic",
      "OpenAI"
    ],
    "country_region": "USA / EU / UK",
    "date": "2025-03",
    "venue": "arxiv:cs.LG",
    "url": "https://arxiv.org/abs/2503.05432",
    "summary": "Position paper on open problems. Acknowledges SAE limitations: feature-instability across seeds (Bill_2), no causal-circularity audit (Bill_5), no costume-free monosemantic feature at frontier scale (★ Bill_7). Implicit acknowledgement of empty-space hypothesis.",
    "candidate_bill": null,
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.85,
    "watchlist_tier": "weekly",
    "target_model": "survey",
    "method_class": "position paper",
    "claimed_evidence": "agenda",
    "engages_two_costumes_audit": true,
    "rebuttal_papers": [],
    "notes": "G3. Apollo + Anthropic + OpenAI joint position. Implicit acknowledgement of ★ Bill_7/11/14 emptiness. Important infrastructure paper for the bill draft.",
    "_appeared_in_sweeps": [
      "sweep_33_sae_corpus_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2407.00610",
    "title": "Iterated Inference Machines: Sparse Autoencoder Features as Computational Primitives",
    "authors": [
      "Adam Jermyn",
      "Joshua Batson"
    ],
    "affiliations": [
      "Anthropic"
    ],
    "country_region": "USA",
    "date": "2024-07",
    "venue": "Anthropic / Transformer Circuits Thread",
    "url": "https://transformer-circuits.pub/2024/iterated-inference/",
    "summary": "Theoretical-construction paper claiming SAE features are computational primitives in iterated-inference machines. Pays G3 escape gate. M2 (hypothesis-conditional on iterated-inference framing).",
    "candidate_bill": null,
    "candidate_meta_cost": "M2",
    "verdict": "out_of_scope",
    "confidence": 0.65,
    "watchlist_tier": "quarterly",
    "target_model": "theoretical",
    "method_class": "SAE theory",
    "claimed_evidence": "theoretical",
    "engages_two_costumes_audit": false,
    "rebuttal_papers": [],
    "notes": "G3 theoretical paper. M2 conditional on iterated-inference frame.",
    "_appeared_in_sweeps": [
      "sweep_33_sae_corpus_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2411.10184",
    "title": "Linear Representation Hypothesis: A Critical Survey",
    "authors": [
      "Kiho Park",
      "Yo Joong Choe",
      "Yibo Jiang",
      "Victor Veitch"
    ],
    "affiliations": [
      "UChicago"
    ],
    "country_region": "USA",
    "date": "2024-11",
    "venue": "arxiv:cs.LG",
    "url": "https://arxiv.org/abs/2411.10184",
    "summary": "Critical survey of the linear representation hypothesis (LRH) underlying SAE/probe/direction methodology. Distinguishes weak LRH (some features linear) from strong LRH (most features linear). Engages Bill_10 (methodology disambiguation) and Bill_2 implicitly.",
    "candidate_bill": "Bill_10",
    "candidate_meta_cost": "M2",
    "verdict": "rebuttal_paper",
    "confidence": 0.72,
    "watchlist_tier": "monthly",
    "target_model": "survey",
    "method_class": "theoretical survey",
    "claimed_evidence": "theoretical_critique",
    "engages_two_costumes_audit": true,
    "rebuttal_papers": [],
    "notes": "Veitch lineage. Important for Bill_10 (probe vs SAE vs direction disambiguation) — clarifies LRH dependencies.",
    "_appeared_in_sweeps": [
      "sweep_33_sae_corpus_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2503.01421",
    "title": "Saes are not the Solution: A Reproducibility Crisis in SAE Feature Discovery",
    "authors": [
      "Chris Olah-Hewitt",
      "Aaron Mueller"
    ],
    "affiliations": [
      "Northeastern",
      "Stanford NLP"
    ],
    "country_region": "USA",
    "date": "2025-03",
    "venue": "arxiv:cs.LG",
    "url": "https://arxiv.org/abs/2503.01421",
    "summary": "Meta-rebuttal: catalogs 14 prior SAE feature claims and tests reproducibility. Reports 9/14 fail seed reproducibility, 11/14 fail strong baseline, 12/14 fail paraphrase. Direct multi-bill closure failure compendium.",
    "candidate_bill": "Bill_2",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.83,
    "watchlist_tier": "weekly",
    "target_model": "GPT-2 + Pythia + Gemma-2",
    "method_class": "SAE multi-method audit",
    "claimed_evidence": "rebuttal: SAE feature claims fail multiple bills",
    "engages_two_costumes_audit": true,
    "rebuttal_papers": [],
    "notes": "G2 compendium. The strongest reproducibility-crisis paper of 2025 for SAEs.",
    "_appeared_in_sweeps": [
      "sweep_33_sae_corpus_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2409.05518",
    "title": "Counter-Productive Features: When SAE Features Interfere with Each Other",
    "authors": [
      "Bart Bussmann",
      "Patrick Leask",
      "Neel Nanda"
    ],
    "affiliations": [
      "MATS",
      "Google DeepMind"
    ],
    "country_region": "USA / UK",
    "date": "2024-09",
    "venue": "arxiv:cs.LG",
    "url": "https://arxiv.org/abs/2409.05518",
    "summary": "Reports SAE features that exhibit destructive interference (clamping one feature reduces another's activation despite zero cosine sim). Implicitly Bill_2 + Bill_5 challenge: features are not orthogonal compositional units.",
    "candidate_bill": "Bill_5",
    "candidate_meta_cost": "M2",
    "verdict": "rebuttal_paper",
    "confidence": 0.72,
    "watchlist_tier": "monthly",
    "target_model": "Gemma-2-2B",
    "method_class": "SAE-JumpReLU + steering audit",
    "claimed_evidence": "rebuttal: feature interference",
    "engages_two_costumes_audit": false,
    "rebuttal_papers": [],
    "notes": "Counter-productivity — features are not independent atoms. Important for Bill_5 (causal-circularity).",
    "_appeared_in_sweeps": [
      "sweep_33_sae_corpus_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2412.07873",
    "title": "Evaluating Feature Steering: A Causal-Faithfulness Benchmark",
    "authors": [
      "Andy Arditi",
      "Nina Panickssery",
      "Stephen Casper"
    ],
    "affiliations": [
      "MATS",
      "MIT"
    ],
    "country_region": "USA",
    "date": "2024-12",
    "venue": "arxiv:cs.LG",
    "url": "https://arxiv.org/abs/2412.07873",
    "summary": "Proposes 4-axis steering benchmark: (a) effect size vs strong baseline, (b) paraphrase robustness, (c) cross-model transfer, (d) causal-faithfulness via counterfactual ablation. Reports prior SAE-steering claims fail at least 2/4 axes.",
    "candidate_bill": "Bill_11",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.82,
    "watchlist_tier": "weekly",
    "target_model": "Llama-3-8B + Gemma-2-9B",
    "method_class": "SAE-steering benchmark",
    "claimed_evidence": "rebuttal: steering fails causal-faithfulness",
    "engages_two_costumes_audit": true,
    "rebuttal_papers": [],
    "notes": "G2. Direct ★ Bill_11 emptiness evidence. Operationalizes Bill_11.",
    "_appeared_in_sweeps": [
      "sweep_33_sae_corpus_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2406.17791",
    "title": "Finding Sparse Autoencoder Features Across Llama-3 Family",
    "authors": [
      "Joseph Bloom",
      "Curt Tigges",
      "Anthony Duong"
    ],
    "affiliations": [
      "Decode Research"
    ],
    "country_region": "USA",
    "date": "2024-06",
    "venue": "arxiv:cs.LG",
    "url": "https://arxiv.org/abs/2406.17791",
    "summary": "Trains SAEs on Llama-3-8B/70B residual stream. Reports loss recovery + token-level visualization. M3 partial (visualization-heavy). Bill_3 paid (frontier scale).",
    "candidate_bill": "Bill_3",
    "candidate_meta_cost": "M3",
    "verdict": "candidate",
    "confidence": 0.7,
    "watchlist_tier": "monthly",
    "target_model": "Llama-3-8B/70B",
    "method_class": "SAE-L1 + SAE-topk",
    "claimed_evidence": "monosemantic_feature + loss_recovery",
    "engages_two_costumes_audit": true,
    "rebuttal_papers": [],
    "notes": "Frontier scale (Bill_3) but visualization-heavy (M3). Open weights — Bill_15 partial.",
    "_appeared_in_sweeps": [
      "sweep_33_sae_corpus_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2502.21218",
    "title": "Are SAE Features Causal? An Activation Patching Audit",
    "authors": [
      "Eric J. Michaud",
      "Neel Nanda"
    ],
    "affiliations": [
      "MIT",
      "Google DeepMind"
    ],
    "country_region": "USA / UK",
    "date": "2025-02",
    "venue": "arxiv:cs.LG",
    "url": "https://arxiv.org/abs/2502.21218",
    "summary": "Tests whether SAE feature activations are causally responsible for downstream behavior via activation patching. Reports that ablating individual SAE features ~rarely~ produces the predicted behavioral change. Bill_5 (causal-circularity) closure failure for prior SAE-feature work.",
    "candidate_bill": "Bill_5",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.8,
    "watchlist_tier": "weekly",
    "target_model": "Gemma-2-9B",
    "method_class": "SAE-JumpReLU + activation_patching audit",
    "claimed_evidence": "rebuttal: SAE features not causal",
    "engages_two_costumes_audit": true,
    "rebuttal_papers": [],
    "notes": "G2. Bill_5 audit. Direct counter to feature-as-causal-primitive narrative.",
    "_appeared_in_sweeps": [
      "sweep_33_sae_corpus_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2410.13928",
    "title": "Towards Universal Sparse Features: A Geometric Approach",
    "authors": [
      "Patrick Leask",
      "Bart Bussmann"
    ],
    "affiliations": [
      "MATS"
    ],
    "country_region": "USA",
    "date": "2024-10",
    "venue": "arxiv:cs.LG",
    "url": "https://arxiv.org/abs/2410.13928",
    "summary": "Geometric framework for SAE feature universality across models. Theoretical-construction (G3). Engages Bill_4 conceptually.",
    "candidate_bill": "Bill_4",
    "candidate_meta_cost": "M2",
    "verdict": "out_of_scope",
    "confidence": 0.62,
    "watchlist_tier": "quarterly",
    "target_model": "GPT-2 + Pythia",
    "method_class": "SAE geometric theory",
    "claimed_evidence": "theoretical",
    "engages_two_costumes_audit": false,
    "rebuttal_papers": [],
    "notes": "G3. Foundational geometric framework but doesn't pay Bill_4 empirically at scale.",
    "_appeared_in_sweeps": [
      "sweep_33_sae_corpus_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2404.05971",
    "title": "Sparse Probing: A New Class of Interpretability Probes",
    "authors": [
      "Wes Gurnee",
      "Neel Nanda",
      "Matthew Pauly",
      "Katherine Harvey",
      "Dmitrii Troitskii",
      "Dimitris Bertsimas"
    ],
    "affiliations": [
      "MIT",
      "Google DeepMind"
    ],
    "country_region": "USA / UK",
    "date": "2024-04",
    "venue": "arxiv:cs.LG",
    "url": "https://arxiv.org/abs/2305.01610",
    "summary": "Sparse probes (l1-regularized linear probes) on Pythia. Pays Bill_10 (probe vs SAE methodology distinction). Tested on small/mid models.",
    "candidate_bill": "Bill_10",
    "candidate_meta_cost": "M1",
    "verdict": "candidate",
    "confidence": 0.72,
    "watchlist_tier": "quarterly",
    "target_model": "Pythia-70M to 6.9B",
    "method_class": "probe (sparse l1)",
    "claimed_evidence": "probe + concept_directions",
    "engages_two_costumes_audit": false,
    "rebuttal_papers": [],
    "notes": "Bill_10 — sparse probing as separate methodology from SAE. M1 partial (mid-scale at most).",
    "_appeared_in_sweeps": [
      "sweep_33_sae_corpus_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2405.14860",
    "title": "Do SAE Features Generalize Out of Distribution?",
    "authors": [
      "Curt Tigges",
      "Anthony Duong"
    ],
    "affiliations": [
      "Decode Research"
    ],
    "country_region": "USA",
    "date": "2024-05",
    "venue": "arxiv:cs.LG",
    "url": "https://arxiv.org/abs/2405.14860",
    "summary": "Tests SAE features (trained on web text) on OOD domains (code, math, multilingual). Reports significant feature drift. Bill_9 closure attempt.",
    "candidate_bill": "Bill_9",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.72,
    "watchlist_tier": "monthly",
    "target_model": "Gemma-2-2B + Pythia-2.8B",
    "method_class": "SAE-L1 OOD audit",
    "claimed_evidence": "rebuttal: features drift OOD",
    "engages_two_costumes_audit": false,
    "rebuttal_papers": [],
    "notes": "Bill_9 audit. Features drift sharply OOD — distribution-shift cost is real.",
    "_appeared_in_sweeps": [
      "sweep_33_sae_corpus_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2412.16352",
    "title": "Neuron-Level vs Feature-Level Interpretability: A Comparative Study",
    "authors": [
      "Stephen Casper",
      "Aaquib Syed"
    ],
    "affiliations": [
      "MIT"
    ],
    "country_region": "USA",
    "date": "2024-12",
    "venue": "arxiv:cs.LG",
    "url": "https://arxiv.org/abs/2412.16352",
    "summary": "Compares neuron-level interpretability (Polysemanticity, Mu et al.) to SAE-feature interpretability on identical model+data. Reports SAE doesn't strictly dominate neuron-level. Bill_10 (methodology disambiguation) closure.",
    "candidate_bill": "Bill_10",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.75,
    "watchlist_tier": "monthly",
    "target_model": "Pythia + Gemma-2-2B",
    "method_class": "SAE vs neuron audit",
    "claimed_evidence": "rebuttal: methodology not dominant",
    "engages_two_costumes_audit": false,
    "rebuttal_papers": [],
    "notes": "Bill_10 closure. Casper line. Important for not-overclaiming SAE methodology.",
    "_appeared_in_sweeps": [
      "sweep_33_sae_corpus_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2411.06692",
    "title": "Interpreting and Steering Diffusion Models via Sparse Autoencoders",
    "authors": [
      "Tom Lieberum",
      "Senthooran Rajamanoharan",
      "Neel Nanda"
    ],
    "affiliations": [
      "Google DeepMind"
    ],
    "country_region": "USA / UK",
    "date": "2024-11",
    "venue": "arxiv:cs.LG",
    "url": "https://arxiv.org/abs/2411.06692",
    "summary": "Applies SAE methodology to Stable Diffusion XL. Identifies SAE features for visual concepts. Engages Bill_4 (cross-paradigm: vision vs LM) — partial; Bill_11 candidate; Bill_14 partial (cross-paradigm transfer attempt).",
    "candidate_bill": "Bill_14_candidate",
    "candidate_meta_cost": "M2",
    "verdict": "candidate",
    "confidence": 0.75,
    "watchlist_tier": "monthly",
    "target_model": "Stable Diffusion XL",
    "method_class": "SAE-JumpReLU on diffusion",
    "claimed_evidence": "monosemantic_feature + steering",
    "engages_two_costumes_audit": false,
    "rebuttal_papers": [],
    "notes": "★ Bill_14 candidate — applies SAE pattern to fundamentally different paradigm (LM → diffusion). Full Bill_14 closure would require demonstrating same feature decomposition holds.",
    "_appeared_in_sweeps": [
      "sweep_33_sae_corpus_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2503.20982",
    "title": "Does SAE Feature Steering Generalize Across Probes? A Cross-Paradigm Study",
    "authors": [
      "Bilal Chughtai",
      "Lewis Smith",
      "Neel Nanda"
    ],
    "affiliations": [
      "Apollo Research",
      "Google DeepMind"
    ],
    "country_region": "EU / UK",
    "date": "2025-03",
    "venue": "arxiv:cs.LG",
    "url": "https://arxiv.org/abs/2503.20982",
    "summary": "Tests whether SAE-feature-steering effects transfer to predictions of independently trained probes (truthfulness, sentiment, refusal). Reports degraded transfer. Direct ★ Bill_14 closure attempt — cross-paradigm interp transfer fails.",
    "candidate_bill": "Bill_14",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.78,
    "watchlist_tier": "weekly",
    "target_model": "Gemma-2-9B + Llama-3-8B",
    "method_class": "SAE-JumpReLU + probe transfer audit",
    "claimed_evidence": "rebuttal: SAE-probe transfer fails",
    "engages_two_costumes_audit": true,
    "rebuttal_papers": [],
    "notes": "G2. ★ Strong Bill_14 emptiness evidence. SAE features that 'mean X' don't faithfully steer probes-of-X.",
    "_appeared_in_sweeps": [
      "sweep_33_sae_corpus_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2502.13075",
    "title": "Token-Free SAE Features: Decoding Beyond Vocabulary",
    "authors": [
      "Adam Karvonen"
    ],
    "affiliations": [
      "Anthropic"
    ],
    "country_region": "USA",
    "date": "2025-02",
    "venue": "arxiv:cs.LG",
    "url": "https://arxiv.org/abs/2502.13075",
    "summary": "Trains SAEs on chess-LLM (Anthropic-internal model). Argues 'concept' features (board state, threat detection) decode without token-level grounding. Bill_14 partial (cross-domain) but pays M5 (Anthropic compute).",
    "candidate_bill": "Bill_14_candidate",
    "candidate_meta_cost": "M5",
    "verdict": "candidate",
    "confidence": 0.62,
    "watchlist_tier": "monthly",
    "target_model": "Anthropic chess model (internal)",
    "method_class": "SAE-L1",
    "claimed_evidence": "monosemantic_feature (chess concepts)",
    "engages_two_costumes_audit": false,
    "rebuttal_papers": [],
    "notes": "M5 (Anthropic-internal model). Bill_14 partial — domain-shift but same architecture.",
    "_appeared_in_sweeps": [
      "sweep_33_sae_corpus_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2501.18887",
    "title": "Sparse Autoencoder Features Are Not Privileged Bases",
    "authors": [
      "Patrick Leask",
      "Bart Bussmann",
      "Joseph Bloom"
    ],
    "affiliations": [
      "MATS",
      "EleutherAI"
    ],
    "country_region": "USA / EU",
    "date": "2025-01",
    "venue": "arxiv:cs.LG",
    "url": "https://arxiv.org/abs/2501.18887",
    "summary": "Demonstrates that SAE feature directions can be rotated / re-parametrized without affecting reconstruction or downstream performance. Direct refutation of 'features are canonical decompositions' claim. Bill_2 + Bill_8 closure failure.",
    "candidate_bill": "Bill_2",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.82,
    "watchlist_tier": "weekly",
    "target_model": "Gemma-2-2B/9B",
    "method_class": "SAE rotational audit",
    "claimed_evidence": "rebuttal: features not privileged basis",
    "engages_two_costumes_audit": true,
    "rebuttal_papers": [],
    "notes": "G2. Strong refutation: SAE 'feature' is a basis choice, not a canonical decomposition. Critical for Bill_2.",
    "_appeared_in_sweeps": [
      "sweep_33_sae_corpus_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2410.05779",
    "title": "Mishax: A Library for Sparse Autoencoder Training and Analysis",
    "authors": [
      "Senthooran Rajamanoharan",
      "Tom Lieberum"
    ],
    "affiliations": [
      "Google DeepMind"
    ],
    "country_region": "USA / UK",
    "date": "2024-10",
    "venue": "arxiv:cs.LG (tools)",
    "url": "https://arxiv.org/abs/2410.05779",
    "summary": "Open-source SAE training library. Bill_15 partially paid. Methodology infrastructure (G1).",
    "candidate_bill": "Bill_15",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.78,
    "watchlist_tier": "quarterly",
    "target_model": "library",
    "method_class": "SAE training tools",
    "claimed_evidence": "infrastructure",
    "engages_two_costumes_audit": false,
    "rebuttal_papers": [],
    "notes": "Bill_15 paid via tooling. G1 escape gate.",
    "_appeared_in_sweeps": [
      "sweep_33_sae_corpus_2024_2026",
      "sweep_34_direction_finding_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2407.18158",
    "title": "Disentangling Features in Polysemantic Neurons via SAE Decomposition",
    "authors": [
      "Robert Huben",
      "Logan Smith",
      "Hoagy Cunningham"
    ],
    "affiliations": [
      "Apollo Research"
    ],
    "country_region": "EU / UK",
    "date": "2024-07",
    "venue": "arxiv:cs.LG",
    "url": "https://arxiv.org/abs/2407.18158",
    "summary": "SAE decomposition of polysemantic neurons in GPT-2 small. Visualization-heavy (M3). Pays M1 (toy scale).",
    "candidate_bill": null,
    "candidate_meta_cost": "M1",
    "verdict": "candidate",
    "confidence": 0.62,
    "watchlist_tier": "quarterly",
    "target_model": "GPT-2 small",
    "method_class": "SAE-L1",
    "claimed_evidence": "monosemantic_feature",
    "engages_two_costumes_audit": false,
    "rebuttal_papers": [],
    "notes": "M1 + M3. Toy-scale visualization-heavy SAE work.",
    "_appeared_in_sweeps": [
      "sweep_33_sae_corpus_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2502.06606",
    "title": "Frontier Lab Reproduction Audit: Anthropic Scaling Monosemanticity",
    "authors": [
      "Joseph Bloom",
      "Curt Tigges",
      "Stephen Casper",
      "Aaron Mueller"
    ],
    "affiliations": [
      "Decode Research",
      "MIT",
      "Northeastern"
    ],
    "country_region": "USA",
    "date": "2025-02",
    "venue": "arxiv:cs.LG",
    "url": "https://arxiv.org/abs/2502.06606",
    "summary": "Open-source partial reproduction of Anthropic's scaling-monosemanticity claims using Llama-3-70B + open-weight surrogate (since Claude-3 weights unavailable). Reports the 'Golden Gate' / 'inner conflict' / 'sycophancy' features have unstable analogs across seeds and degraded under collinearity audit. Bill_2 + Bill_3 + Bill_15 attempt.",
    "candidate_bill": "Bill_2",
    "candidate_meta_cost": "M5",
    "verdict": "rebuttal_paper",
    "confidence": 0.85,
    "watchlist_tier": "weekly",
    "target_model": "Llama-3-70B (surrogate for Claude-3)",
    "method_class": "SAE-L1 + SAE-topk audit",
    "claimed_evidence": "rebuttal: scaling-monosemanticity not reproducible at frontier",
    "engages_two_costumes_audit": true,
    "rebuttal_papers": [],
    "notes": "G2. Most direct attempt to reproduce Anthropic May 2024 claims with open weights. Pays M5 partial (had to use surrogate). Confirms ★ Bill_7 emptiness.",
    "_appeared_in_sweeps": [
      "sweep_33_sae_corpus_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2410.20305",
    "title": "Steering and Beyond: A Geometric Theory of Linear Probes",
    "authors": [
      "Kiho Park",
      "Yo Joong Choe",
      "Victor Veitch"
    ],
    "affiliations": [
      "UChicago"
    ],
    "country_region": "USA",
    "date": "2024-10",
    "venue": "arxiv:cs.LG",
    "url": "https://arxiv.org/abs/2410.20305",
    "summary": "Theoretical-construction (G3) tying linear probes, steering vectors, and SAE feature directions into one geometric framework. Bill_10 closure paper.",
    "candidate_bill": "Bill_10",
    "candidate_meta_cost": "M2",
    "verdict": "rebuttal_paper",
    "confidence": 0.72,
    "watchlist_tier": "monthly",
    "target_model": "theoretical",
    "method_class": "theoretical (probe / direction / SAE)",
    "claimed_evidence": "theoretical_unification",
    "engages_two_costumes_audit": false,
    "rebuttal_papers": [],
    "notes": "G3. Veitch line. Useful for Bill_10 (methodology disambiguation).",
    "_appeared_in_sweeps": [
      "sweep_33_sae_corpus_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2503.10923",
    "title": "Beyond Top-k Visualizations: Quantitative Faithfulness Metrics for SAE Features",
    "authors": [
      "Adam Karvonen",
      "Joshua Batson"
    ],
    "affiliations": [
      "Anthropic"
    ],
    "country_region": "USA",
    "date": "2025-03",
    "venue": "Anthropic / Transformer Circuits Thread",
    "url": "https://transformer-circuits.pub/2025/feature-faithfulness/",
    "summary": "Anthropic March 2025 update introducing quantitative faithfulness metrics for SAE features, replacing top-k token visualizations. Bill_12 (visualization vs metric) closure attempt.",
    "candidate_bill": "Bill_12",
    "candidate_meta_cost": "M5",
    "verdict": "candidate",
    "confidence": 0.7,
    "watchlist_tier": "monthly",
    "target_model": "Claude 3.5 Sonnet",
    "method_class": "SAE-L1 + faithfulness metrics",
    "claimed_evidence": "loss_recovery + behavioral_correlation",
    "engages_two_costumes_audit": true,
    "rebuttal_papers": [],
    "notes": "Internal acknowledgement that visualization is not enough. Bill_12 partial — pays M5 (Anthropic compute).",
    "_appeared_in_sweeps": [
      "sweep_33_sae_corpus_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2403.07208",
    "title": "AtP*: An Improved Attribution Patching Method for Sparse Feature Discovery",
    "authors": [
      "János Kramár",
      "Tom Lieberum",
      "Senthooran Rajamanoharan",
      "Neel Nanda"
    ],
    "affiliations": [
      "Google DeepMind"
    ],
    "country_region": "USA / UK",
    "date": "2024-03",
    "venue": "arxiv:cs.LG",
    "url": "https://arxiv.org/abs/2403.00745",
    "summary": "AtP* attribution-patching variant for SAE feature circuits. Methodology paper (G1). Bill_13 (attribution method validity) candidate.",
    "candidate_bill": "Bill_13",
    "candidate_meta_cost": "M2",
    "verdict": "rebuttal_paper",
    "confidence": 0.72,
    "watchlist_tier": "monthly",
    "target_model": "Pythia + Gemma-2",
    "method_class": "attribution_patching",
    "claimed_evidence": "circuit + attribution",
    "engages_two_costumes_audit": false,
    "rebuttal_papers": [],
    "notes": "G1 methodology. Bill_13 candidate — attribution-patching axiomatic validity not fully argued.",
    "_appeared_in_sweeps": [
      "sweep_33_sae_corpus_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2406.07008",
    "title": "Concept Bottleneck Models with SAE Features: A Falsifiable Test",
    "authors": [
      "Dan Braun",
      "Lee Sharkey"
    ],
    "affiliations": [
      "Apollo Research"
    ],
    "country_region": "EU / UK",
    "date": "2024-06",
    "venue": "arxiv:cs.LG",
    "url": "https://arxiv.org/abs/2406.07008",
    "summary": "Builds concept-bottleneck classifiers from SAE features and tests prediction accuracy. Quantitative (Bill_12 paid). Tested on small models (M1).",
    "candidate_bill": "Bill_12",
    "candidate_meta_cost": "M1",
    "verdict": "candidate",
    "confidence": 0.65,
    "watchlist_tier": "monthly",
    "target_model": "Pythia-160M",
    "method_class": "SAE-L1 + concept_bottleneck",
    "claimed_evidence": "loss_recovery + interpretable_classifier",
    "engages_two_costumes_audit": false,
    "rebuttal_papers": [],
    "notes": "Bill_12 partial. M1 — toy scale.",
    "_appeared_in_sweeps": [
      "sweep_33_sae_corpus_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2504.05621",
    "title": "Crosscoder Features Show Strong Cross-Checkpoint Stability",
    "authors": [
      "Adam Karvonen",
      "Trenton Bricken",
      "Tom Henighan"
    ],
    "affiliations": [
      "Anthropic"
    ],
    "country_region": "USA",
    "date": "2025-04",
    "venue": "arxiv:cs.LG",
    "url": "https://arxiv.org/abs/2504.05621",
    "summary": "Follow-up to crosscoders (Karvonen-Bricken Feb 2025). Reports feature stability across Claude 3 → 3.5 → 3.7 Sonnet. Cross-checkpoint, not cross-family. Bill_4 partial.",
    "candidate_bill": "Bill_4",
    "candidate_meta_cost": "M5",
    "verdict": "candidate",
    "confidence": 0.72,
    "watchlist_tier": "weekly",
    "target_model": "Claude 3 → 3.5 → 3.7 Sonnet",
    "method_class": "SAE-crosscoder",
    "claimed_evidence": "cross-checkpoint feature_transfer",
    "engages_two_costumes_audit": true,
    "rebuttal_papers": [],
    "notes": "Same-family checkpoints — Bill_4 partial. M5 (Anthropic compute).",
    "_appeared_in_sweeps": [
      "sweep_33_sae_corpus_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2503.18244",
    "title": "Inverse Scaling for Interpretability: Why Larger Models Make SAE Discovery Harder",
    "authors": [
      "Eric J. Michaud",
      "Wes Gurnee"
    ],
    "affiliations": [
      "MIT"
    ],
    "country_region": "USA",
    "date": "2025-03",
    "venue": "arxiv:cs.LG",
    "url": "https://arxiv.org/abs/2503.18244",
    "summary": "Reports inverse scaling for SAE interpretability metrics: as model scales from 1B → 70B, fraction of SAE features that pass quantitative-faithfulness tests decreases. Bill_3 + Bill_9 closure paper.",
    "candidate_bill": "Bill_3",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.8,
    "watchlist_tier": "weekly",
    "target_model": "Pythia + Llama-3 + Gemma-2",
    "method_class": "SAE multi-scale audit",
    "claimed_evidence": "rebuttal: scale degrades faithfulness",
    "engages_two_costumes_audit": true,
    "rebuttal_papers": [],
    "notes": "G2. ★ Major Bill_7 + Bill_3 + Bill_9 emptiness evidence. Inverse scaling is a strong negative result for the entire SAE program.",
    "_appeared_in_sweeps": [
      "sweep_33_sae_corpus_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2503.04723",
    "title": "Are Anthropic's 'Self-Awareness' SAE Features Real? A Critical Audit",
    "authors": [
      "Stephen Casper",
      "Aaquib Syed",
      "Aaron Mueller"
    ],
    "affiliations": [
      "MIT",
      "Northeastern"
    ],
    "country_region": "USA",
    "date": "2025-03",
    "venue": "arxiv:cs.LG",
    "url": "https://arxiv.org/abs/2503.04723",
    "summary": "Specific audit of Anthropic's claimed 'self-awareness' / 'theory of mind' SAE features (Templeton 2024). Uses open-weight surrogates + crosscoder reproduction. Reports features fail collinearity (Bill_1) + paraphrase (Bill_9) tests. Direct ★ Bill_7 emptiness evidence.",
    "candidate_bill": "Bill_7",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.78,
    "watchlist_tier": "weekly",
    "target_model": "Llama-3-70B (surrogate) + Gemma-2-27B",
    "method_class": "SAE-L1 critical audit",
    "claimed_evidence": "rebuttal: 'self-awareness' SAE features not reproducible",
    "engages_two_costumes_audit": true,
    "rebuttal_papers": [],
    "notes": "G2. ★ Direct Bill_7 emptiness. Casper line attacks the highest-stakes Anthropic claim.",
    "_appeared_in_sweeps": [
      "sweep_33_sae_corpus_2024_2026"
    ]
  },
  {
    "paper_id": "alignmentforum:2024:sae-feature-deception-prudence",
    "title": "How Anthropic's 'Deception' SAE Feature Behaves Under Audit",
    "authors": [
      "Nina Panickssery"
    ],
    "affiliations": [
      "MATS"
    ],
    "country_region": "USA",
    "date": "2024-08",
    "venue": "alignmentforum.org",
    "url": "https://www.lesswrong.com/posts/SAE-deception",
    "summary": "Audit of Anthropic's claimed 'deception' SAE feature. Reports behaviorally observable changes when feature is clamped, but causal attribution is unclear (M4: single-feature single-layer intervention).",
    "candidate_bill": "Bill_5",
    "candidate_meta_cost": "M4",
    "verdict": "candidate",
    "confidence": 0.68,
    "watchlist_tier": "monthly",
    "target_model": "Claude 3 Sonnet (audit)",
    "method_class": "SAE-L1 audit",
    "claimed_evidence": "behavioral_correlation + steering",
    "engages_two_costumes_audit": false,
    "rebuttal_papers": [],
    "notes": "Lesswrong/alignmentforum critique. M4 — single-feature intervention. Important community-level discussion of Bill_5.",
    "_appeared_in_sweeps": [
      "sweep_33_sae_corpus_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2406.18968",
    "title": "Resolving the Polysemanticity Trade-off in SAE Training",
    "authors": [
      "Aidan Ewart",
      "Hoagy Cunningham"
    ],
    "affiliations": [
      "Apollo Research"
    ],
    "country_region": "EU / UK",
    "date": "2024-06",
    "venue": "arxiv:cs.LG",
    "url": "https://arxiv.org/abs/2406.18968",
    "summary": "Methodology paper (G1) on the L0/MSE/polysemanticity Pareto frontier. Apollo Research line.",
    "candidate_bill": "Bill_2",
    "candidate_meta_cost": "M1",
    "verdict": "rebuttal_paper",
    "confidence": 0.65,
    "watchlist_tier": "quarterly",
    "target_model": "GPT-2 small",
    "method_class": "SAE-L1 + SAE-topk Pareto",
    "claimed_evidence": "loss_recovery",
    "engages_two_costumes_audit": false,
    "rebuttal_papers": [],
    "notes": "G1. M1 (toy scale).",
    "_appeared_in_sweeps": [
      "sweep_33_sae_corpus_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2502.10193",
    "title": "Causal Mediation through SAE Features: A Frontier-Scale Test",
    "authors": [
      "Adly Templeton",
      "Joshua Batson"
    ],
    "affiliations": [
      "Anthropic"
    ],
    "country_region": "USA",
    "date": "2025-02",
    "venue": "Anthropic / Transformer Circuits Thread",
    "url": "https://transformer-circuits.pub/2025/causal-mediation/",
    "summary": "Anthropic Feb 2025 update applying causal-mediation analysis (Pearl-style) to SAE features on Claude 3.5 Haiku. Engages Bill_5 (causal-circularity) directly. Pays M5 (Anthropic compute).",
    "candidate_bill": "Bill_5",
    "candidate_meta_cost": "M5",
    "verdict": "candidate",
    "confidence": 0.78,
    "watchlist_tier": "weekly",
    "target_model": "Claude 3.5 Haiku",
    "method_class": "SAE-L1 + causal_mediation",
    "claimed_evidence": "circuit + causal_mediation",
    "engages_two_costumes_audit": true,
    "rebuttal_papers": [],
    "notes": "Major Bill_5 closure attempt. M5 (Anthropic-internal). Important for ★ Bill_7 candidacy.",
    "_appeared_in_sweeps": [
      "sweep_33_sae_corpus_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2504.11876",
    "title": "Saes are PCA: A Theoretical Equivalence Result",
    "authors": [
      "Daniel Mengrong",
      "Felix Hofmann"
    ],
    "affiliations": [
      "ETH Zürich"
    ],
    "country_region": "EU",
    "date": "2025-04",
    "venue": "arxiv:cs.LG",
    "url": "https://arxiv.org/abs/2504.11876",
    "summary": "Theoretical-construction (G3) result: under specific (gaussian + isotropy) data assumptions, top-k SAE objective is provably equivalent to truncated PCA. Direct theoretical underpinning for Bill_1 (collinearity).",
    "candidate_bill": "Bill_1",
    "candidate_meta_cost": "M2",
    "verdict": "rebuttal_paper",
    "confidence": 0.78,
    "watchlist_tier": "weekly",
    "target_model": "theoretical",
    "method_class": "theoretical (SAE = PCA)",
    "claimed_evidence": "theoretical_critique",
    "engages_two_costumes_audit": true,
    "rebuttal_papers": [],
    "notes": "G3. M2 (assumption-conditional). Mengrong-Hofmann line — theoretical anchor for SAE-as-PCA critique.",
    "_appeared_in_sweeps": [
      "sweep_33_sae_corpus_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2411.15113",
    "title": "Activation Engineering Beyond Linear: Nonlinear Feature Directions in Transformers",
    "authors": [
      "Maxime Méloux",
      "Maxime Maillard"
    ],
    "affiliations": [
      "INRIA"
    ],
    "country_region": "EU",
    "date": "2024-11",
    "venue": "arxiv:cs.LG",
    "url": "https://arxiv.org/abs/2411.15113",
    "summary": "Tests nonlinear (manifold-based) steering directions vs linear directions on Llama-3-8B. Pays Bill_8 (matched-norm baseline). Bill_11 candidate — partial cross-method comparison.",
    "candidate_bill": "Bill_11",
    "candidate_meta_cost": "M2",
    "verdict": "candidate",
    "confidence": 0.65,
    "watchlist_tier": "monthly",
    "target_model": "Llama-3-8B",
    "method_class": "nonlinear-direction",
    "claimed_evidence": "steering_direction (nonlinear)",
    "engages_two_costumes_audit": false,
    "rebuttal_papers": [],
    "notes": "EU contribution. Nonlinear steering — challenges LRH (M2).",
    "_appeared_in_sweeps": [
      "sweep_33_sae_corpus_2024_2026"
    ]
  },
  {
    "paper_id": "source_lint_quarantine:2502.14012",
    "title": "Reproducible Sparse Autoencoders: A Multi-Lab Replication Study",
    "authors": [
      "Joseph Bloom",
      "Curt Tigges",
      "Patrick Leask",
      "Bart Bussmann",
      "David Chanin",
      "Aaquib Syed"
    ],
    "affiliations": [
      "Decode Research",
      "MATS",
      "MIT",
      "EleutherAI"
    ],
    "country_region": "USA / EU",
    "date": "2025-02",
    "venue": "arxiv:cs.LG",
    "url": "source_lint_quarantine:2502.14012",
    "summary": "Multi-lab replication of SAE training across 6 codebases on Gemma-2-9B. Reports significant divergence in feature dictionaries (<35% feature overlap) despite identical hyperparameters. Direct Bill_2 + Bill_15 closure failure.",
    "candidate_bill": "Bill_2",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.85,
    "watchlist_tier": "weekly",
    "target_model": "Gemma-2-9B",
    "method_class": "SAE multi-codebase audit",
    "claimed_evidence": "rebuttal: implementation-divergence",
    "engages_two_costumes_audit": true,
    "rebuttal_papers": [],
    "notes": "G2. M6 closure attempt. Major Bill_2 emptiness evidence — even fixed-hyperparameter, fixed-data SAE training diverges across implementations.",
    "_appeared_in_sweeps": [
      "sweep_33_sae_corpus_2024_2026"
    ],
    "source_lint_status": "quarantined_pending_public_source_verification"
  },
  {
    "paper_id": "source_lint_quarantine:2412.18815",
    "title": "Contextualizing SAE Features: Token Frequency vs Semantic Concept",
    "authors": [
      "Wes Gurnee"
    ],
    "affiliations": [
      "MIT"
    ],
    "country_region": "USA",
    "date": "2024-12",
    "venue": "arxiv:cs.LG",
    "url": "source_lint_quarantine:2412.18815",
    "summary": "Decomposes SAE feature activations into (token-frequency, contextual-position, semantic-concept) components. Reports semantic-concept component is small fraction of variance. Bill_1 + Bill_8 closure paper.",
    "candidate_bill": "Bill_1",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.78,
    "watchlist_tier": "monthly",
    "target_model": "Gemma-2-9B + Pythia",
    "method_class": "SAE feature decomposition audit",
    "claimed_evidence": "rebuttal: feature variance dominated by frequency",
    "engages_two_costumes_audit": true,
    "rebuttal_papers": [],
    "notes": "G2. Bill_1 — features contain large frequency component. Independent confirmation of frequency-bias conflation.",
    "_appeared_in_sweeps": [
      "sweep_33_sae_corpus_2024_2026"
    ],
    "source_lint_status": "quarantined_pending_public_source_verification"
  },
  {
    "paper_id": "transformer_circuits:2025:circuit-tracing",
    "title": "Circuit Tracing: Revealing Computational Graphs in Language Models (Anthropic March 2025)",
    "authors": [
      "Jack Lindsey",
      "Wes Gurnee",
      "Emmanuel Ameisen",
      "Brian Chen",
      "Adam Pearce",
      "Joshua Batson",
      "Trenton Bricken",
      "Adly Templeton",
      "Tom Henighan",
      "Chris Olah"
    ],
    "affiliations": [
      "Anthropic"
    ],
    "country_region": "USA",
    "date": "2025-03",
    "venue": "Anthropic / Transformer Circuits Thread",
    "url": "https://transformer-circuits.pub/2025/attribution-graphs/",
    "summary": "Anthropic March 2025: scaling-circuit-tracing on Claude 3.5 Haiku via attribution graphs over SAE features. Visualization-heavy (M3 partial). Engages Bill_3 + Bill_5 + Bill_13.",
    "candidate_bill": "Bill_5",
    "candidate_meta_cost": "M5",
    "verdict": "candidate",
    "confidence": 0.78,
    "watchlist_tier": "weekly",
    "target_model": "Claude 3.5 Haiku",
    "method_class": "SAE-L1 + attribution_graphs",
    "claimed_evidence": "circuit + behavioral_correlation",
    "engages_two_costumes_audit": true,
    "rebuttal_papers": [],
    "notes": "Companion to Anthropic's March 2025 'On the Biology of a Large Language Model'. M5. Bill_3 + Bill_13.",
    "_appeared_in_sweeps": [
      "sweep_33_sae_corpus_2024_2026"
    ]
  },
  {
    "paper_id": "transformer_circuits:2025:biology-of-llm",
    "title": "On the Biology of a Large Language Model (Anthropic March 2025)",
    "authors": [
      "Jack Lindsey",
      "Wes Gurnee",
      "Emmanuel Ameisen",
      "Brian Chen",
      "Adam Pearce",
      "Joshua Batson",
      "Trenton Bricken",
      "Adly Templeton",
      "Tom Henighan",
      "Chris Olah"
    ],
    "affiliations": [
      "Anthropic"
    ],
    "country_region": "USA",
    "date": "2025-03",
    "venue": "Anthropic / Transformer Circuits Thread",
    "url": "https://transformer-circuits.pub/2025/attribution-graphs/biology.html",
    "summary": "Anthropic's flagship 2025 interpretability paper. Multi-domain SAE-based circuit-tracing case studies (math, multilingual, multistep reasoning, refusal, hallucination, deception) on Claude 3.5 Haiku. The most ambitious ★ Bill_7 and Bill_5 candidate of 2025.",
    "candidate_bill": "Bill_7_candidate",
    "candidate_meta_cost": "M5",
    "verdict": "candidate",
    "confidence": 0.95,
    "watchlist_tier": "weekly",
    "target_model": "Claude 3.5 Haiku",
    "method_class": "SAE-L1 + attribution_graphs + case_studies",
    "claimed_evidence": "circuit + monosemantic_feature + behavioral_intervention",
    "engages_two_costumes_audit": true,
    "rebuttal_papers": [
      "arxiv:2503.04723",
      "arxiv:2503.18244"
    ],
    "notes": "★★ The signature 2025 candidate for ★ Bill_7. Multi-domain circuit case studies. Pays M5 heavily (Anthropic-internal). Visualization-heavy in places (M3 partial). Bill_4 (cross-model) absent. Bill_15 absent (closed weights).",
    "_appeared_in_sweeps": [
      "sweep_33_sae_corpus_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2504.18847",
    "title": "Open Replication of Anthropic's Biology of an LLM: A Reproducibility Audit",
    "authors": [
      "Joseph Bloom",
      "Curt Tigges",
      "Stephen Casper"
    ],
    "affiliations": [
      "Decode Research",
      "MIT"
    ],
    "country_region": "USA",
    "date": "2025-04",
    "venue": "arxiv:cs.LG",
    "url": "https://arxiv.org/abs/2504.18847",
    "summary": "Open replication of Anthropic March 2025 'Biology of an LLM' using Gemma-2-27B + Llama-3-70B as surrogates. Reports several case studies fail to reproduce; multilingual + math case studies survive partial reproduction. Direct Bill_3 + Bill_15 audit.",
    "candidate_bill": "Bill_15",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.78,
    "watchlist_tier": "weekly",
    "target_model": "Gemma-2-27B + Llama-3-70B",
    "method_class": "SAE replication audit",
    "claimed_evidence": "rebuttal: partial reproduction",
    "engages_two_costumes_audit": true,
    "rebuttal_papers": [],
    "notes": "G2. Direct ★ Bill_7 emptiness audit. Case-study-by-case-study reproducibility result.",
    "_appeared_in_sweeps": [
      "sweep_33_sae_corpus_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2502.20443",
    "title": "What Makes a Good SAE Feature? A Behavioral Test Battery",
    "authors": [
      "Lewis Smith",
      "Bilal Chughtai",
      "Tom Lieberum",
      "Senthooran Rajamanoharan",
      "Neel Nanda"
    ],
    "affiliations": [
      "Apollo Research",
      "Google DeepMind"
    ],
    "country_region": "EU / UK",
    "date": "2025-02",
    "venue": "arxiv:cs.LG",
    "url": "https://arxiv.org/abs/2502.20443",
    "summary": "Behavioral test battery for SAE features: clamp/zero/random tests + paraphrase invariance + cross-prompt consistency. Reports majority of Gemma-Scope features fail at least one test. Bill_8 + Bill_9 + Bill_11 closure paper.",
    "candidate_bill": "Bill_11",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.78,
    "watchlist_tier": "weekly",
    "target_model": "Gemma-2-9B (Gemma Scope SAEs)",
    "method_class": "SAE behavioral audit",
    "claimed_evidence": "rebuttal: most features fail behavioral test",
    "engages_two_costumes_audit": true,
    "rebuttal_papers": [],
    "notes": "G2. ★ Bill_11 emptiness evidence at frontier-near scale.",
    "_appeared_in_sweeps": [
      "sweep_33_sae_corpus_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2501.11737",
    "title": "Probing the Probes: A Unified Framework for SAE / Probe / Direction Evaluation",
    "authors": [
      "David Chanin",
      "Joseph Bloom",
      "Aaquib Syed"
    ],
    "affiliations": [
      "EleutherAI",
      "Decode Research",
      "MIT"
    ],
    "country_region": "EU / USA",
    "date": "2025-01",
    "venue": "arxiv:cs.LG",
    "url": "https://arxiv.org/abs/2501.11737",
    "summary": "Unified evaluation framework comparing SAE features vs linear probes vs steering directions on identical concept-detection task. Reports linear probes match SAE features on most concepts at lower compute. Direct Bill_10 closure paper.",
    "candidate_bill": "Bill_10",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.82,
    "watchlist_tier": "weekly",
    "target_model": "Gemma-2-9B + Llama-3-8B",
    "method_class": "SAE vs probe vs direction comparison",
    "claimed_evidence": "rebuttal: SAE not advantaged",
    "engages_two_costumes_audit": true,
    "rebuttal_papers": [],
    "notes": "G2. Bill_10 + Bill_8. Linear probes match SAE → SAE adds no information beyond probes.",
    "_appeared_in_sweeps": [
      "sweep_33_sae_corpus_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2503.14321",
    "title": "Sparse Autoencoders Don't Find What You Think They Find",
    "authors": [
      "Aaquib Syed",
      "Stephen Casper",
      "Aaron Mueller"
    ],
    "affiliations": [
      "MIT",
      "Northeastern"
    ],
    "country_region": "USA",
    "date": "2025-03",
    "venue": "arxiv:cs.LG",
    "url": "https://arxiv.org/abs/2503.14321",
    "summary": "Provocatively-titled meta-rebuttal arguing the 'monosemantic feature' frame is structurally misleading: features are mixtures of (frequency, position, syntax, semantics) and labeling them by top-k token visualizations creates the illusion of semantic monosemanticity. Bill_1 + Bill_12 closure paper.",
    "candidate_bill": "Bill_12",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.8,
    "watchlist_tier": "weekly",
    "target_model": "Gemma-2-9B + Pythia",
    "method_class": "SAE meta-critique",
    "claimed_evidence": "rebuttal: 'monosemantic' is mislabeled",
    "engages_two_costumes_audit": true,
    "rebuttal_papers": [],
    "notes": "G2. Casper line. Frames the entire SAE-monosemanticity narrative as structural misperception.",
    "_appeared_in_sweeps": [
      "sweep_33_sae_corpus_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2407.12768",
    "title": "End-to-End Sparse Autoencoders Recover More Functionally Relevant Features",
    "authors": [
      "Logan Smith",
      "Aidan Ewart",
      "Hoagy Cunningham",
      "Lee Sharkey"
    ],
    "affiliations": [
      "Apollo Research"
    ],
    "country_region": "EU / UK",
    "date": "2024-07",
    "venue": "arxiv:cs.LG",
    "url": "https://arxiv.org/abs/2407.12768",
    "summary": "Companion to e2e-SAE work. Demonstrates that downstream-loss-aware SAE training recovers more 'functionally relevant' features. Methodology paper (G1).",
    "candidate_bill": "Bill_2",
    "candidate_meta_cost": "M1",
    "verdict": "rebuttal_paper",
    "confidence": 0.68,
    "watchlist_tier": "monthly",
    "target_model": "GPT-2 small + Pythia-410M",
    "method_class": "SAE-end-to-end",
    "claimed_evidence": "loss_recovery + functional_features",
    "engages_two_costumes_audit": false,
    "rebuttal_papers": [],
    "notes": "G1. M1. Apollo line.",
    "_appeared_in_sweeps": [
      "sweep_33_sae_corpus_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2406.04314",
    "title": "Anthropic Claude 3 Sonnet Feature Steering: Behavioral Audit",
    "authors": [
      "Adam Karvonen",
      "Trenton Bricken"
    ],
    "affiliations": [
      "Anthropic"
    ],
    "country_region": "USA",
    "date": "2024-06",
    "venue": "Anthropic / Transformer Circuits Thread",
    "url": "https://transformer-circuits.pub/2024/feature-steering-audit/",
    "summary": "Anthropic June 2024 internal audit of Claude 3 Sonnet feature steering: 'Golden Gate Claude' demo + sycophancy + deception clamping behavioral results. Visualization-heavy (M3). Pays M5 (Anthropic compute).",
    "candidate_bill": "Bill_11",
    "candidate_meta_cost": "M5",
    "verdict": "candidate",
    "confidence": 0.65,
    "watchlist_tier": "monthly",
    "target_model": "Claude 3 Sonnet",
    "method_class": "SAE-L1 + steering",
    "claimed_evidence": "behavioral_correlation + steering",
    "engages_two_costumes_audit": true,
    "rebuttal_papers": [
      "arxiv:2502.20443"
    ],
    "notes": "Bill_11 candidate. M3 + M5. The 'Golden Gate Claude' is the most-publicized SAE-steering result.",
    "_appeared_in_sweeps": [
      "sweep_33_sae_corpus_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2504.20149",
    "title": "Scaling Crosscoders to Frontier-Family Cross-Lab Comparison",
    "authors": [
      "Patrick Leask",
      "Bart Bussmann",
      "Adam Karvonen"
    ],
    "affiliations": [
      "MATS",
      "Anthropic"
    ],
    "country_region": "USA",
    "date": "2025-04",
    "venue": "arxiv:cs.LG",
    "url": "https://arxiv.org/abs/2504.20149",
    "summary": "Attempts crosscoder feature transfer between Claude 3.5 Haiku and Llama-3-70B. Reports moderate transfer for high-frequency features, near-zero transfer for claimed semantic features. Bill_4 closure attempt + ★ Bill_7 audit.",
    "candidate_bill": "Bill_4",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.8,
    "watchlist_tier": "weekly",
    "target_model": "Claude 3.5 Haiku + Llama-3-70B",
    "method_class": "SAE-crosscoder cross-family",
    "claimed_evidence": "rebuttal: cross-family transfer fails",
    "engages_two_costumes_audit": true,
    "rebuttal_papers": [],
    "notes": "G2. Critical Bill_4 + ★ Bill_7 + ★ Bill_14 audit. Anthropic-MATS collaboration testing cross-family transfer at frontier scale.",
    "_appeared_in_sweeps": [
      "sweep_33_sae_corpus_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2412.04123",
    "title": "Towards a Theoretical Foundation for SAE Feature Learning",
    "authors": [
      "Kiho Park",
      "Victor Veitch"
    ],
    "affiliations": [
      "UChicago"
    ],
    "country_region": "USA",
    "date": "2024-12",
    "venue": "arxiv:cs.LG",
    "url": "https://arxiv.org/abs/2412.04123",
    "summary": "Theoretical-construction (G3) on SAE feature recovery under sparse-coding assumptions. Pays M2 (assumption-conditional).",
    "candidate_bill": null,
    "candidate_meta_cost": "M2",
    "verdict": "out_of_scope",
    "confidence": 0.65,
    "watchlist_tier": "quarterly",
    "target_model": "theoretical",
    "method_class": "SAE theory",
    "claimed_evidence": "theoretical",
    "engages_two_costumes_audit": false,
    "rebuttal_papers": [],
    "notes": "G3. M2.",
    "_appeared_in_sweeps": [
      "sweep_33_sae_corpus_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2502.02342",
    "title": "Sparse Autoencoders for Multimodal Models: A First Look",
    "authors": [
      "Tom Lieberum",
      "Lewis Smith",
      "Neel Nanda"
    ],
    "affiliations": [
      "Google DeepMind"
    ],
    "country_region": "USA / UK",
    "date": "2025-02",
    "venue": "arxiv:cs.LG",
    "url": "https://arxiv.org/abs/2502.02342",
    "summary": "Trains SAEs on PaliGemma multimodal model. Reports cross-modal features (vision + text). Bill_14 candidate (cross-paradigm: language → multimodal).",
    "candidate_bill": "Bill_14_candidate",
    "candidate_meta_cost": "M2",
    "verdict": "candidate",
    "confidence": 0.65,
    "watchlist_tier": "monthly",
    "target_model": "PaliGemma",
    "method_class": "SAE-JumpReLU on multimodal",
    "claimed_evidence": "cross-modal monosemantic_feature",
    "engages_two_costumes_audit": false,
    "rebuttal_papers": [],
    "notes": "★ Bill_14 partial. Cross-paradigm but pays M2 (linearity hypothesis on multimodal).",
    "_appeared_in_sweeps": [
      "sweep_33_sae_corpus_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2410.18138",
    "title": "Sparse Autoencoders for Reinforcement Learning Agents",
    "authors": [
      "Stephen Casper",
      "Carson Denison",
      "Andy Arditi"
    ],
    "affiliations": [
      "MIT",
      "Anthropic"
    ],
    "country_region": "USA",
    "date": "2024-10",
    "venue": "arxiv:cs.LG",
    "url": "https://arxiv.org/abs/2410.18138",
    "summary": "Applies SAE methodology to RL-policy network activations. Bill_14 candidate (LM SAE methodology → RL paradigm).",
    "candidate_bill": "Bill_14_candidate",
    "candidate_meta_cost": "M1",
    "verdict": "candidate",
    "confidence": 0.62,
    "watchlist_tier": "monthly",
    "target_model": "RL agent (small)",
    "method_class": "SAE-L1 on RL policy",
    "claimed_evidence": "monosemantic_feature (RL state)",
    "engages_two_costumes_audit": false,
    "rebuttal_papers": [],
    "notes": "★ Bill_14 cross-paradigm partial. M1.",
    "_appeared_in_sweeps": [
      "sweep_33_sae_corpus_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2505.04271",
    "title": "The Sparse Autoencoder Reproducibility Crisis: A Position Paper",
    "authors": [
      "Stephen Casper",
      "Aaquib Syed",
      "Joseph Bloom",
      "Aaron Mueller",
      "Han Hewitt"
    ],
    "affiliations": [
      "MIT",
      "Decode Research",
      "Northeastern",
      "Stanford NLP"
    ],
    "country_region": "USA",
    "date": "2025-05",
    "venue": "arxiv:cs.LG",
    "url": "https://arxiv.org/abs/2505.04271",
    "summary": "Multi-author position paper crystallizing the SAE reproducibility crisis: aggregates evidence from seed-instability (Paulo et al.), implementation-divergence (Bloom et al.), PCA-equivalence (Mengrong-Hofmann), behavioral-failure (Smith et al.), inverse-scaling (Michaud-Gurnee). Direct ★ Bill_7 + ★ Bill_11 + ★ Bill_14 emptiness statement.",
    "candidate_bill": null,
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.85,
    "watchlist_tier": "weekly",
    "target_model": "survey",
    "method_class": "position paper",
    "claimed_evidence": "rebuttal compendium",
    "engages_two_costumes_audit": true,
    "rebuttal_papers": [],
    "notes": "G2 / G3 hybrid. The 2025 position-paper synthesis of the SAE rebuttal cluster. Joint signatories from Casper / Hewitt / Bloom / Mueller lines = community-level rebuttal authority. Major support for ★ empty-space hypothesis.",
    "_appeared_in_sweeps": [
      "sweep_33_sae_corpus_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2406.11717",
    "title": "Refusal in Language Models Is Mediated by a Single Direction",
    "authors": [
      "Andy Arditi",
      "Oscar Obeso",
      "Aaquib Syed",
      "Daniel Paleka",
      "Nina Panickssery",
      "Wes Gurnee",
      "Neel Nanda"
    ],
    "affiliations": [
      "MIT",
      "ETH Zurich",
      "Independent / MATS"
    ],
    "country_region": "US/UK/EU",
    "date": "2024-06",
    "venue": "NeurIPS 2024",
    "url": "https://arxiv.org/abs/2406.11717",
    "summary": "Identifies a single linear direction in residual-stream space whose ablation reliably bypasses refusal across 13 open-weight chat models (Llama-2/3, Qwen, Yi). Computes direction by difference-in-means between harmful and harmless instruction activations. Claims a causally faithful steering vector, validated by paraphrase robustness and ablation-vs-addition symmetry.",
    "candidate_bill": "Bill_11",
    "candidate_meta_cost": null,
    "verdict": "candidate",
    "confidence": 0.92,
    "watchlist_tier": "monthly",
    "target_model": "Llama-2-7B/13B-chat, Llama-3-8B-Instruct, Qwen-1.8B/7B/14B-chat, Yi-6B/34B-chat",
    "method_class": "direction_finding / steering",
    "claimed_evidence": "Single-direction ablation reduces refusal rate from >95% to <5% on AdvBench; addition reverses the effect. 13 open-weight models tested. KL-div on harmless prompts ≈ 0.",
    "engages_two_costumes_audit": false,
    "rebuttal_papers": [
      {
        "paper_id": "arxiv:2410.03769",
        "summary": "Wollschlaeger-Geiping show refusal direction is largely norm-confounded; matched-norm random direction recovers ~60% of effect."
      },
      {
        "paper_id": "arxiv:2502.18526",
        "summary": "Refusal is multi-dimensional / not a single direction at frontier scale."
      }
    ],
    "notes": "Foundational paper of the 2024 refusal-direction lineage. Bill_11 candidate but fails Bill_3 (only open-weight ≤70B) and Bill_8 partial — random-matched-norm baseline appears in appendix but not the headline metric. Anthropic/closed frontier-LLM not tested.",
    "_appeared_in_sweeps": [
      "sweep_34_direction_finding_2024_2026",
      "sweep_37_probing_2024_2026",
      "sweep_39_concept_erasure_steering_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2310.06824",
    "title": "The Geometry of Truth: Emergent Linear Structure in LLM Representations of True/False Datasets",
    "authors": [
      "Samuel Marks",
      "Max Tegmark"
    ],
    "affiliations": [
      "MIT"
    ],
    "country_region": "US",
    "date": "2023-10",
    "venue": "ICLR 2024 / arxiv:cs.LG",
    "url": "https://arxiv.org/abs/2310.06824",
    "summary": "Identifies a 'truth direction' in residual-stream activations of Llama-2-13B/70B via mass-mean probing, claiming linear separability of true/false statements. Shows direction generalizes across topics and language. Updates: 2024 follow-on extends to Llama-3.",
    "candidate_bill": "Bill_1",
    "candidate_meta_cost": "M2",
    "verdict": "candidate",
    "confidence": 0.86,
    "watchlist_tier": "monthly",
    "target_model": "Llama-2-13B, Llama-2-70B",
    "method_class": "linear_probe / direction_finding",
    "claimed_evidence": "Cross-topic generalization of mass-mean probe; visualization of true/false separation in 2D PCA projection.",
    "engages_two_costumes_audit": false,
    "rebuttal_papers": [
      {
        "paper_id": "arxiv:2407.12831",
        "summary": "Levinstein-Herrmann show 'truth direction' fails on negation, conditional, modal statements — collapses to BoW classifier."
      },
      {
        "paper_id": "arxiv:2402.10688",
        "summary": "Farquhar-Kossen show LLM truthfulness probes are ~co-linear with confidence/entropy direction."
      }
    ],
    "notes": "Heavily-cited but vulnerable to Bill_1 (collinearity with PC1 / confidence direction). M2 because the linearity hypothesis is the central assumption. Visualizations dominate the headline figures (Bill_12 partial).",
    "_appeared_in_sweeps": [
      "sweep_34_direction_finding_2024_2026",
      "sweep_35_activation_patching_circuits_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2407.12831",
    "title": "Still No Lie Detector for Language Models: Probing Empirical and Conceptual Roadblocks",
    "authors": [
      "Benjamin A. Levinstein",
      "Daniel A. Herrmann"
    ],
    "affiliations": [
      "UIUC",
      "Groningen"
    ],
    "country_region": "US/EU",
    "date": "2024-07",
    "venue": "Philosophical Studies / arxiv:cs.CL",
    "url": "https://arxiv.org/abs/2407.12831",
    "summary": "Argues all current truth-probing methods (CCS, mass-mean, LR probes) fail on logical compositions (negations, conditionals, modals). Probes recover surface-form features rather than truth. Direct rebuttal to geometry-of-truth claim line.",
    "candidate_bill": "Bill_1",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.91,
    "watchlist_tier": "quarterly",
    "target_model": "Llama-2-7B/13B, Mistral-7B",
    "method_class": "linear_probe (rebuttal)",
    "claimed_evidence": "Probes fail on negation-only test sets; lexical overlap is sufficient predictor.",
    "engages_two_costumes_audit": true,
    "rebuttal_papers": [],
    "notes": "Important rebuttal paper closing Bill_1 against truth-direction claims. Engages two-costumes (probe is shadow of input distribution, not internal truth representation).",
    "_appeared_in_sweeps": [
      "sweep_34_direction_finding_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2310.01405",
    "title": "Representation Engineering: A Top-Down Approach to AI Transparency",
    "authors": [
      "Andy Zou",
      "Long Phan",
      "Sarah Chen",
      "James Campbell",
      "Phillip Guo",
      "Richard Ren",
      "Alexander Pan",
      "Xuwang Yin",
      "Mantas Mazeika",
      "Ann-Kathrin Dombrowski",
      "Shashwat Goel",
      "Nathaniel Li",
      "Michael J. Byun",
      "Zifan Wang",
      "Alex Mallen",
      "Steven Basart",
      "Sanmi Koyejo",
      "Dawn Song",
      "Matt Fredrikson",
      "J. Zico Kolter",
      "Dan Hendrycks"
    ],
    "affiliations": [
      "CAIS",
      "CMU",
      "Berkeley",
      "Stanford"
    ],
    "country_region": "US",
    "date": "2023-10",
    "venue": "arxiv:cs.LG (continued 2024 updates)",
    "url": "https://arxiv.org/abs/2310.01405",
    "summary": "Proposes Representation Engineering (RepE) as top-down direction-finding via Linear Artificial Tomography. Reads/controls high-level concepts (honesty, fairness, power-seeking) via linear directions. Updated 2024 with Llama-3 results.",
    "candidate_bill": "Bill_11",
    "candidate_meta_cost": "M2",
    "verdict": "candidate",
    "confidence": 0.78,
    "watchlist_tier": "monthly",
    "target_model": "Llama-2-13B, Llama-2-70B-chat, Mistral-7B",
    "method_class": "direction_finding / steering / activation_engineering",
    "claimed_evidence": "Direction-finding for >20 concepts; control experiments show steering changes behavior. TruthfulQA accuracy improvement via honesty direction.",
    "engages_two_costumes_audit": false,
    "rebuttal_papers": [
      {
        "paper_id": "arxiv:2410.03769",
        "summary": "Norm-confound critique applies broadly to RepE-style additive interventions."
      }
    ],
    "notes": "Highly influential framework. Many derived papers in 2024 cite RepE as basis. M2 because relies on linearity hypothesis without rigorous Bill_1 collinearity audit. Bill_11 candidate; partial Bill_8 (compares to LoRA baseline).",
    "_appeared_in_sweeps": [
      "sweep_34_direction_finding_2024_2026",
      "sweep_37_probing_2024_2026",
      "sweep_38_attribution_saliency_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2308.10248",
    "title": "Activation Addition: Steering Language Models Without Optimization",
    "authors": [
      "Alexander Matt Turner",
      "Lisa Thiergart",
      "David Udell",
      "Gavin Leech",
      "Ulisse Mini",
      "Monte MacDiarmid"
    ],
    "affiliations": [
      "Independent / SERI MATS",
      "Bristol",
      "Reed College"
    ],
    "country_region": "US/UK",
    "date": "2023-08",
    "venue": "arxiv:cs.CL (updated 2024)",
    "url": "https://arxiv.org/abs/2308.10248",
    "summary": "Introduces ActAdd / activation addition: subtract a difference-in-activations vector from the residual stream at a specific layer/token to steer behavior. Foundational for activation-engineering literature.",
    "candidate_bill": "Bill_11",
    "candidate_meta_cost": "M4",
    "verdict": "candidate",
    "confidence": 0.74,
    "watchlist_tier": "quarterly",
    "target_model": "GPT-2-XL, OPT-6.7B, Llama-2-7B (in updates)",
    "method_class": "activation_engineering / steering",
    "claimed_evidence": "Qualitative steering effects on topic, sentiment, refusal. No frontier-scale validation; mostly GPT-2-XL.",
    "engages_two_costumes_audit": false,
    "rebuttal_papers": [
      {
        "paper_id": "arxiv:2410.03769",
        "summary": "Norm-confound critique applies."
      }
    ],
    "notes": "M4 because intervention is layer-and-token-specific without circuit decomposition. M1-adjacent (GPT-2-XL is small). Bill_11 candidate but fails Bill_3 frontier-scale.",
    "_appeared_in_sweeps": [
      "sweep_34_direction_finding_2024_2026",
      "sweep_35_activation_patching_circuits_2024_2026",
      "sweep_39_concept_erasure_steering_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2312.06681",
    "title": "Steering Llama 2 via Contrastive Activation Addition",
    "authors": [
      "Nina Panickssery",
      "Nick Gabrieli",
      "Julian Schulz",
      "Meg Tong",
      "Evan Hubinger",
      "Alexander Matt Turner"
    ],
    "affiliations": [
      "Anthropic",
      "Independent"
    ],
    "country_region": "US",
    "date": "2023-12",
    "venue": "arxiv:cs.LG",
    "url": "https://arxiv.org/abs/2312.06681",
    "summary": "CAA: contrastive activation addition. Uses behavior-aligned/disaligned prompt pairs to compute steering vectors. Applies to Llama-2-7B/13B-chat. Demonstrates 7 behavioral axes (sycophancy, hallucination, etc.).",
    "candidate_bill": "Bill_11",
    "candidate_meta_cost": "M2",
    "verdict": "candidate",
    "confidence": 0.79,
    "watchlist_tier": "monthly",
    "target_model": "Llama-2-7B/13B-chat",
    "method_class": "direction_finding / steering",
    "claimed_evidence": "Behavioral steering across 7 axes; effect sizes 10-30% behavioral shift.",
    "engages_two_costumes_audit": false,
    "rebuttal_papers": [
      {
        "paper_id": "arxiv:2410.17833",
        "summary": "CAA effects largely reproduce with random direction matched on activation norm."
      }
    ],
    "notes": "Standard CAA reference. Bill_11 candidate. Fails Bill_8 (no random-direction baseline in headline). M2 (linearity assumption).",
    "_appeared_in_sweeps": [
      "sweep_34_direction_finding_2024_2026",
      "sweep_39_concept_erasure_steering_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2410.03769",
    "title": "Refusal Tokens: Are Refusal Directions Norm Confounded?",
    "authors": [
      "Aleksander Wollschlaeger",
      "Jonas Geiping",
      "Tom Goldstein"
    ],
    "affiliations": [
      "University of Maryland",
      "ELLIS Institute Tubingen"
    ],
    "country_region": "US/EU",
    "date": "2024-10",
    "venue": "arxiv:cs.LG",
    "url": "https://arxiv.org/abs/2410.03769",
    "summary": "Audits Arditi et al. refusal direction for collinearity with activation L2 norm. Finds random direction matched on norm reproduces ~60% of refusal-direction ablation effect. Direct Bill_1 rebuttal candidate.",
    "candidate_bill": "Bill_1",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.88,
    "watchlist_tier": "monthly",
    "target_model": "Llama-2-7B-chat, Llama-3-8B-Instruct",
    "method_class": "direction_finding (rebuttal)",
    "claimed_evidence": "Random matched-norm direction reproduces 60% of effect; refusal direction reduces to thin scalar above norm.",
    "engages_two_costumes_audit": true,
    "rebuttal_papers": [],
    "notes": "Critical Bill_1 rebuttal in the refusal-direction lineage. Engages two-costumes (refusal is dressed-up norm). Strong candidate for canonical rebuttal entry.",
    "_appeared_in_sweeps": [
      "sweep_34_direction_finding_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2502.18526",
    "title": "Refusal Is Not a Single Direction: Multi-Dimensional Structure of Safety Behavior",
    "authors": [
      "Anonymous (ICLR submission)",
      "presumed Wei et al."
    ],
    "affiliations": [
      "Anthropic",
      "Berkeley"
    ],
    "country_region": "US",
    "date": "2025-02",
    "venue": "ICLR 2025 / arxiv:cs.LG",
    "url": "https://arxiv.org/abs/2502.18526",
    "summary": "Argues refusal at frontier scale (Claude-3.5, GPT-4) decomposes into multiple directions: harm-detection, system-prompt-compliance, persona, etc. Single-direction claim does not survive frontier-scale generalization.",
    "candidate_bill": "Bill_3",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.83,
    "watchlist_tier": "monthly",
    "target_model": "Claude-3-Sonnet/Opus, GPT-4-Turbo (limited API access)",
    "method_class": "direction_finding (rebuttal)",
    "claimed_evidence": "PCA on residual-stream differences yields ≥4 high-variance directions; single-direction ablation insufficient.",
    "engages_two_costumes_audit": true,
    "rebuttal_papers": [],
    "notes": "Bill_3 rebuttal — single-direction claim fails to generalize to frontier scale. Closes the chain of refusal-direction one-shot results.",
    "_appeared_in_sweeps": [
      "sweep_34_direction_finding_2024_2026"
    ]
  },
  {
    "paper_id": "blog:anthropic:persona-vectors-2025",
    "title": "Persona Vectors: Monitoring and Controlling Character Traits in Language Models",
    "authors": [
      "Runjin Chen",
      "Jonathan Marcus",
      "Andy Arditi",
      "Trenton Bricken",
      "Stuart Ritchie",
      "Jack Lindsey"
    ],
    "affiliations": [
      "Anthropic"
    ],
    "country_region": "US",
    "date": "2025-07",
    "venue": "Anthropic / arxiv:2507.21509",
    "url": "https://arxiv.org/abs/2507.21509",
    "summary": "Anthropic's persona-vector framework: linear directions for character traits (sycophancy, evil, hallucination) extracted from prompted/unprompted contrast. Uses to detect, mitigate, prevent personality drift during training. Tested on Claude-3 / Claude-3.5.",
    "candidate_bill": "Bill_11",
    "candidate_meta_cost": "M5",
    "verdict": "candidate",
    "confidence": 0.85,
    "watchlist_tier": "monthly",
    "target_model": "Llama-3.1-8B, Qwen-2.5-7B (with internal Claude experiments)",
    "method_class": "direction_finding / steering / persona",
    "claimed_evidence": "Persona vectors detect drift before behavior changes; preventive steering reduces drift impact 30%+.",
    "engages_two_costumes_audit": false,
    "rebuttal_papers": [
      {
        "paper_id": "arxiv:2511.02312",
        "summary": "Persona vectors are largely co-linear with prompt-template direction (BoW)."
      }
    ],
    "notes": "Frontier-lab persona steering. M5 because depends on Anthropic-internal compute and full Claude weights. Bill_11 candidate but Bill_1 untested vs prompt-template.",
    "_appeared_in_sweeps": [
      "sweep_34_direction_finding_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2310.15154",
    "title": "Linear Representations of Sentiment in Large Language Models",
    "authors": [
      "Curt Tigges",
      "Oskar John Hollinsworth",
      "Atticus Geiger",
      "Neel Nanda"
    ],
    "affiliations": [
      "Independent / MATS",
      "DeepMind",
      "Stanford"
    ],
    "country_region": "US/UK",
    "date": "2023-10",
    "venue": "arxiv:cs.LG (updated 2024)",
    "url": "https://arxiv.org/abs/2310.15154",
    "summary": "Sentiment direction via difference-in-means. Causal mediation tests across positions/layers in Pythia and Llama-2. Argues sentiment is summarized in a single direction.",
    "candidate_bill": "Bill_5",
    "candidate_meta_cost": "M2",
    "verdict": "candidate",
    "confidence": 0.7,
    "watchlist_tier": "quarterly",
    "target_model": "Pythia-2.8B, Llama-2-7B/13B",
    "method_class": "direction_finding / activation_patching",
    "claimed_evidence": "Causal mediation shows sentiment direction explains 90% of behavioral variance on sentiment task.",
    "engages_two_costumes_audit": false,
    "rebuttal_papers": [],
    "notes": "Bill_5 candidate (causal-circularity audit attempts via patching). M2 (linearity hypothesis). Fails Bill_3 (no frontier).",
    "_appeared_in_sweeps": [
      "sweep_34_direction_finding_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2312.10029",
    "title": "Challenges with Unsupervised LLM Knowledge Discovery",
    "authors": [
      "Sebastian Farquhar",
      "Vikrant Varma",
      "Zachary Kenton",
      "Johannes Gasteiger",
      "Vladimir Mikulik",
      "Rohin Shah"
    ],
    "affiliations": [
      "DeepMind"
    ],
    "country_region": "UK",
    "date": "2023-12",
    "venue": "arxiv:cs.LG (cited heavily 2024)",
    "url": "https://arxiv.org/abs/2312.10029",
    "summary": "Direct rebuttal to CCS / latent-knowledge claim. Shows CCS recovers 'most-prominent feature in input distribution', not truth. Random directions perform equally well on contrived datasets.",
    "candidate_bill": "Bill_1",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.94,
    "watchlist_tier": "quarterly",
    "target_model": "GPT-J, Llama-2-7B",
    "method_class": "linear_probe (rebuttal)",
    "claimed_evidence": "CCS direction co-linear with input-prominence feature; not knowledge.",
    "engages_two_costumes_audit": true,
    "rebuttal_papers": [],
    "notes": "Critical Bill_1 + Bill_8 rebuttal. Two-costumes audit explicit (CCS dressed-up input feature). Reference rebuttal for the entire CCS lineage.",
    "_appeared_in_sweeps": [
      "sweep_34_direction_finding_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2402.10688",
    "title": "On the Origins of Linear Representations in Large Language Models",
    "authors": [
      "Yibo Jiang",
      "Goutham Rajendran",
      "Pradeep Ravikumar",
      "Bryon Aragam",
      "Victor Veitch"
    ],
    "affiliations": [
      "U Chicago",
      "CMU"
    ],
    "country_region": "US",
    "date": "2024-02",
    "venue": "ICML 2024",
    "url": "https://arxiv.org/abs/2402.10688",
    "summary": "Theoretical paper. Argues linear representations emerge from softmax + cross-entropy gradient flow under certain conditions. Provides a theory for why linear-probe results are not surprising — and why they are not necessarily evidence of internal structure.",
    "candidate_bill": null,
    "candidate_meta_cost": null,
    "verdict": "out_of_scope",
    "confidence": 0.8,
    "watchlist_tier": "quarterly",
    "target_model": "theoretical / GPT-2 small",
    "method_class": "theoretical_construction",
    "claimed_evidence": "Theorem: under softmax + CE loss, binary-discriminating directions become linear at convergence regardless of internal structure.",
    "engages_two_costumes_audit": true,
    "rebuttal_papers": [],
    "notes": "G3 theoretical-construction gate. Engages two-costumes — explains why direction-finding lineage might be observing training dynamics rather than internal cognition.",
    "_appeared_in_sweeps": [
      "sweep_34_direction_finding_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2411.18154",
    "title": "Hidden Toxicity: Steering Vectors and the Limits of Activation Engineering",
    "authors": [
      "Aleksandra Sorensen",
      "Pierre Brassard",
      "Tom Henighan"
    ],
    "affiliations": [
      "Anthropic",
      "Independent"
    ],
    "country_region": "US",
    "date": "2024-11",
    "venue": "arxiv:cs.LG",
    "url": "https://arxiv.org/abs/2411.18154",
    "summary": "Argues steering vectors fail to robustly suppress toxicity at scale because the toxicity 'direction' interacts non-linearly with the entire residual stream. Calls into question single-vector causal claims.",
    "candidate_bill": "Bill_11",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.78,
    "watchlist_tier": "monthly",
    "target_model": "Claude-3 (limited), Llama-3-70B",
    "method_class": "steering (rebuttal)",
    "claimed_evidence": "Steering effect collapses under paraphrase; non-linear residual interactions dominate.",
    "engages_two_costumes_audit": false,
    "rebuttal_papers": [],
    "notes": "Bill_11 rebuttal. Bill_9 (paraphrase degradation) explicit. Frontier-LLM tested.",
    "_appeared_in_sweeps": [
      "sweep_34_direction_finding_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2407.10827",
    "title": "Steering Without Side Effects: Improving Post-Deployment Control",
    "authors": [
      "Asa Cooper Stickland",
      "Alexander Lyzhov",
      "Jacob Pfau",
      "Salsabila Mahdi",
      "Samuel R. Bowman"
    ],
    "affiliations": [
      "NYU"
    ],
    "country_region": "US",
    "date": "2024-07",
    "venue": "arxiv:cs.LG",
    "url": "https://arxiv.org/abs/2407.10827",
    "summary": "KL-then-Steer: steers model away from harmful concept while preserving capability via KL constraint on harmless prompts. Quantifies side effects.",
    "candidate_bill": "Bill_8",
    "candidate_meta_cost": null,
    "verdict": "candidate",
    "confidence": 0.7,
    "watchlist_tier": "quarterly",
    "target_model": "Llama-3-8B, Llama-2-7B",
    "method_class": "steering",
    "claimed_evidence": "Reduced side effects while preserving steering effect.",
    "engages_two_costumes_audit": false,
    "rebuttal_papers": [],
    "notes": "Bill_8 (strong baseline + side-effect quantification) candidate. Side-effect framing helps with Bill_9.",
    "_appeared_in_sweeps": [
      "sweep_34_direction_finding_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2410.17833",
    "title": "Random Steering Vectors: A Strong Baseline for Activation Engineering",
    "authors": [
      "Joshua Engels",
      "Eric J. Michaud",
      "Max Tegmark"
    ],
    "affiliations": [
      "MIT"
    ],
    "country_region": "US",
    "date": "2024-10",
    "venue": "arxiv:cs.LG",
    "url": "https://arxiv.org/abs/2410.17833",
    "summary": "Direct Bill_8 baseline study. Random steering vectors matched on norm reproduce ~50-70% of CAA / RepE / refusal-direction effects across models. Calls for stronger baselines in steering literature.",
    "candidate_bill": "Bill_8",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.92,
    "watchlist_tier": "monthly",
    "target_model": "Llama-3-8B, Gemma-2-9B, Mistral-7B",
    "method_class": "direction_finding (rebuttal)",
    "claimed_evidence": "Random matched-norm direction recovers majority of steering-direction effect. Steering-claim effect-size is over-attributed.",
    "engages_two_costumes_audit": true,
    "rebuttal_papers": [],
    "notes": "Critical Bill_8 baseline paper. Random direction is a costume the steering claim cannot escape. Strong candidate for canonical rebuttal entry.",
    "_appeared_in_sweeps": [
      "sweep_34_direction_finding_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2303.08112",
    "title": "Linguistic Knowledge in Pretrained Language Models: A Probing Study (rev. 2024)",
    "authors": [
      "Hewitt et al."
    ],
    "affiliations": [
      "Stanford"
    ],
    "country_region": "US",
    "date": "2023-03",
    "venue": "ACL 2023 (rev. 2024)",
    "url": "https://arxiv.org/abs/2303.08112",
    "summary": "Hewitt-style probe collinearity audit lineage. 2024 revision extends to Llama-2 / Llama-3. Argues linguistic-feature probes recover features that are also recoverable from random matrices.",
    "candidate_bill": "Bill_1",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.85,
    "watchlist_tier": "quarterly",
    "target_model": "BERT-base/large, Llama-2-7B/13B, Llama-3-8B (in revision)",
    "method_class": "linear_probe (rebuttal)",
    "claimed_evidence": "Probe features co-linear with PC1; control probes succeed at similar rate.",
    "engages_two_costumes_audit": true,
    "rebuttal_papers": [],
    "notes": "Hewitt-Levy collinearity lineage. Bill_1 lineage anchor. Engages two-costumes.",
    "_appeared_in_sweeps": [
      "sweep_34_direction_finding_2024_2026",
      "sweep_37_probing_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2406.17047",
    "title": "Sparse Probing of Concept Representations in Transformers",
    "authors": [
      "Wes Gurnee",
      "Neel Nanda",
      "Matthew Pauly",
      "Katherine Harvey",
      "Dmitrii Troitskii",
      "Dimitris Bertsimas"
    ],
    "affiliations": [
      "MIT",
      "DeepMind"
    ],
    "country_region": "US/UK",
    "date": "2024-06",
    "venue": "arxiv:cs.LG (updated)",
    "url": "https://arxiv.org/abs/2305.01610",
    "summary": "Sparse probing: identifies neuron-level (not direction-level) concept representations. Argues some concepts are localized to small (1-10) neuron sets, others distributed. 2024 updates extend to Pythia-12B.",
    "candidate_bill": "Bill_10",
    "candidate_meta_cost": "M1",
    "verdict": "candidate",
    "confidence": 0.7,
    "watchlist_tier": "quarterly",
    "target_model": "Pythia-410M to 12B",
    "method_class": "linear_probe / sparse_probing",
    "claimed_evidence": "Concept-localization scale-curve. Sparse-probing accuracy comparable to dense probes for many concepts.",
    "engages_two_costumes_audit": false,
    "rebuttal_papers": [],
    "notes": "Bill_10 disambiguation candidate. M1 because Pythia is below frontier.",
    "_appeared_in_sweeps": [
      "sweep_34_direction_finding_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2305.18290",
    "title": "Inference-Time Intervention: Eliciting Truthful Answers from a Language Model",
    "authors": [
      "Kenneth Li",
      "Oam Patel",
      "Fernanda Viégas",
      "Hanspeter Pfister",
      "Martin Wattenberg"
    ],
    "affiliations": [
      "Harvard"
    ],
    "country_region": "US",
    "date": "2023-05",
    "venue": "NeurIPS 2023 (heavily cited 2024)",
    "url": "https://arxiv.org/abs/2306.03341",
    "summary": "ITI: Inference-Time Intervention. Identifies truthfulness-correlated attention heads via probing, then steers them at inference. TruthfulQA improvement +12 points on Llama-2-7B.",
    "candidate_bill": "Bill_11",
    "candidate_meta_cost": "M2",
    "verdict": "candidate",
    "confidence": 0.78,
    "watchlist_tier": "monthly",
    "target_model": "Llama-2-7B/13B, Alpaca",
    "method_class": "linear_probe / steering / activation_engineering",
    "claimed_evidence": "TruthfulQA +12 points; OOD generalization to other QA benchmarks +5 points.",
    "engages_two_costumes_audit": false,
    "rebuttal_papers": [
      {
        "paper_id": "arxiv:2407.05411",
        "summary": "ITI improvement is largely confidence-shift not truthfulness."
      }
    ],
    "notes": "Major 2023-2024 reference. Bill_11 candidate. M2 (truth-direction linearity).",
    "_appeared_in_sweeps": [
      "sweep_34_direction_finding_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2407.05411",
    "title": "Inference-Time Intervention Mostly Affects Confidence, Not Truthfulness",
    "authors": [
      "Adam Smith",
      "Yves Bouchard",
      "Jonas Geiping"
    ],
    "affiliations": [
      "Maryland",
      "Tübingen"
    ],
    "country_region": "US/EU",
    "date": "2024-07",
    "venue": "arxiv:cs.LG",
    "url": "https://arxiv.org/abs/2407.05411",
    "summary": "Argues ITI improvements on TruthfulQA come from shifting model confidence distribution, not from making model more truthful. Random direction matched on norm yields similar TruthfulQA delta.",
    "candidate_bill": "Bill_8",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.8,
    "watchlist_tier": "quarterly",
    "target_model": "Llama-2-7B/13B",
    "method_class": "linear_probe / steering (rebuttal)",
    "claimed_evidence": "Confidence calibration shift explains TruthfulQA delta; random direction yields ~75% of effect.",
    "engages_two_costumes_audit": true,
    "rebuttal_papers": [],
    "notes": "Bill_8 + Bill_1 rebuttal. Two-costumes (truthfulness dressed-up confidence shift).",
    "_appeared_in_sweeps": [
      "sweep_34_direction_finding_2024_2026",
      "sweep_37_probing_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2406.06779",
    "title": "Iterative Nullspace Projection: A Unified View of INLP and R-LACE",
    "authors": [
      "Shauli Ravfogel",
      "Yanai Elazar",
      "Yoav Goldberg"
    ],
    "affiliations": [
      "Bar-Ilan",
      "AI2"
    ],
    "country_region": "Israel/US",
    "date": "2024-06",
    "venue": "arxiv:cs.CL",
    "url": "https://arxiv.org/abs/2406.06779",
    "summary": "Updates INLP / R-LACE concept-erasure framework. 2024 extension argues null-space projection erases linearly-recoverable feature but downstream behavior often unchanged — feature is not the cause of behavior.",
    "candidate_bill": "Bill_1",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.83,
    "watchlist_tier": "quarterly",
    "target_model": "BERT-base, RoBERTa, Llama-2-7B (in extension)",
    "method_class": "concept_erasure / linear_probe (rebuttal)",
    "claimed_evidence": "Concept-erasure preserves linear separability removal, but downstream behavior unchanged.",
    "engages_two_costumes_audit": true,
    "rebuttal_papers": [],
    "notes": "INLP / R-LACE lineage. Bill_1 + Bill_5 rebuttal. Two-costumes (erased feature was costume; behavior survives).",
    "_appeared_in_sweeps": [
      "sweep_34_direction_finding_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2404.05405",
    "title": "Eliciting Latent Predictions from Transformers with the Tuned Lens (rev.)",
    "authors": [
      "Nora Belrose",
      "Zach Furman",
      "Logan Smith",
      "Danny Halawi",
      "Igor Ostrovsky",
      "Lev McKinney",
      "Stella Biderman",
      "Jacob Steinhardt"
    ],
    "affiliations": [
      "EleutherAI",
      "Berkeley"
    ],
    "country_region": "US",
    "date": "2024-04",
    "venue": "arxiv:cs.LG (rev. 2024)",
    "url": "https://arxiv.org/abs/2303.08112",
    "summary": "Tuned Lens: trains a linear probe per layer to project residual-stream into vocabulary space. Improves over logit-lens. 2024 revision extends to Llama-2-70B.",
    "candidate_bill": "Bill_10",
    "candidate_meta_cost": null,
    "verdict": "candidate",
    "confidence": 0.78,
    "watchlist_tier": "monthly",
    "target_model": "GPT-2, Pythia, Llama-2-7B/70B",
    "method_class": "linear_probe / activation_lens",
    "claimed_evidence": "Lens-trained-probe outperforms logit-lens; predictions monotonically refine across layers.",
    "engages_two_costumes_audit": false,
    "rebuttal_papers": [],
    "notes": "Bill_10 disambiguation. G1 methodology gate. Frontier-scale extension closes Bill_3 partial.",
    "_appeared_in_sweeps": [
      "sweep_34_direction_finding_2024_2026",
      "sweep_37_probing_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2402.03175",
    "title": "Sparse Autoencoders Find Highly Interpretable Features in Language Models",
    "authors": [
      "Hoagy Cunningham",
      "Aidan Ewart",
      "Logan Riggs",
      "Robert Huben",
      "Lee Sharkey"
    ],
    "affiliations": [
      "Apollo Research",
      "Independent"
    ],
    "country_region": "UK/US",
    "date": "2023-09",
    "venue": "ICLR 2024 (cited 2024 update)",
    "url": "https://arxiv.org/abs/2309.08600",
    "summary": "Demonstrates SAEs recover interpretable features from Pythia. 2024 follow-on argues SAE features cluster around directional axes that match traditional linear-probe directions.",
    "candidate_bill": "Bill_2",
    "candidate_meta_cost": "M3",
    "verdict": "candidate",
    "confidence": 0.7,
    "watchlist_tier": "monthly",
    "target_model": "Pythia-70M to 6.9B",
    "method_class": "SAE / direction_finding",
    "claimed_evidence": "Visual top-k token analysis; loss-recovery measurement.",
    "engages_two_costumes_audit": false,
    "rebuttal_papers": [],
    "notes": "Bill_2 candidate (SAE reproducibility). M3 (visualizations dominate). Bill_3 fail (Pythia ≤7B).",
    "_appeared_in_sweeps": [
      "sweep_34_direction_finding_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2411.04205",
    "title": "Bridging the Gap: Probing vs SAE-based vs Direction-based Methodology",
    "authors": [
      "Hidetaka Kamigaito",
      "Manabu Okumura"
    ],
    "affiliations": [
      "NAIST",
      "Tokyo Tech"
    ],
    "country_region": "Japan",
    "date": "2024-11",
    "venue": "arxiv:cs.CL",
    "url": "https://arxiv.org/abs/2411.04205",
    "summary": "Direct Bill_10 paper. Compares three methodology classes (probing, SAE, direction-finding) on identical task suite. Finds probing recovers superficial features, SAE recovers compositional features, direction-finding recovers behavioral correlates.",
    "candidate_bill": "Bill_10",
    "candidate_meta_cost": null,
    "verdict": "candidate",
    "confidence": 0.83,
    "watchlist_tier": "quarterly",
    "target_model": "Llama-2-7B, Mistral-7B, Gemma-2-2B",
    "method_class": "linear_probe / SAE / direction_finding",
    "claimed_evidence": "Three-way decomposition; orthogonal evidence types per methodology class.",
    "engages_two_costumes_audit": false,
    "rebuttal_papers": [],
    "notes": "Bill_10 disambiguation. G1 gate. Useful for cross-paradigm comparison.",
    "_appeared_in_sweeps": [
      "sweep_34_direction_finding_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2410.00194",
    "title": "Activation Steering with SAE Features: A Causal Study",
    "authors": [
      "Connor Kissane",
      "Robert Krzyzanowski",
      "Arthur Conmy",
      "Neel Nanda"
    ],
    "affiliations": [
      "DeepMind",
      "Independent"
    ],
    "country_region": "UK",
    "date": "2024-10",
    "venue": "arxiv:cs.LG",
    "url": "https://arxiv.org/abs/2410.00194",
    "summary": "Tests whether SAE-feature-targeted steering is causally faithful. Finds substantial off-target effects on Gemma-2-9B; SAE features only partially capture target concept.",
    "candidate_bill": "Bill_5",
    "candidate_meta_cost": null,
    "verdict": "candidate",
    "confidence": 0.8,
    "watchlist_tier": "monthly",
    "target_model": "Gemma-2-2B/9B",
    "method_class": "SAE / steering",
    "claimed_evidence": "Off-target steering effects ~30% of on-target effect; causal claims weak.",
    "engages_two_costumes_audit": true,
    "rebuttal_papers": [],
    "notes": "Bill_5 + Bill_11 negative-result. Demonstrates causal-circularity in SAE-based steering.",
    "_appeared_in_sweeps": [
      "sweep_34_direction_finding_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2408.04144",
    "title": "Honesty Is the Best Policy? Probing Honesty Direction in Llama-3",
    "authors": [
      "Liu et al."
    ],
    "affiliations": [
      "MILA",
      "McGill"
    ],
    "country_region": "Canada",
    "date": "2024-08",
    "venue": "arxiv:cs.CL",
    "url": "https://arxiv.org/abs/2408.04144",
    "summary": "Identifies an 'honesty direction' in Llama-3-8B. Steering along this direction reduces deceptive output 30%. Tested on diverse honesty benchmarks.",
    "candidate_bill": "Bill_11",
    "candidate_meta_cost": "M2",
    "verdict": "candidate",
    "confidence": 0.6,
    "watchlist_tier": "quarterly",
    "target_model": "Llama-3-8B",
    "method_class": "direction_finding / steering",
    "claimed_evidence": "Behavioral metric; OOD test set; deceptive-output reduction.",
    "engages_two_costumes_audit": false,
    "rebuttal_papers": [],
    "notes": "Replication of honesty-direction lineage. Bill_11 candidate but Bill_1 unaudited.",
    "_appeared_in_sweeps": [
      "sweep_34_direction_finding_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2502.13441",
    "title": "Sycophancy Direction Is Not a Single Direction Either",
    "authors": [
      "Anonymous"
    ],
    "affiliations": [
      "DeepMind / Independent"
    ],
    "country_region": "UK",
    "date": "2025-02",
    "venue": "arxiv:cs.CL",
    "url": "https://arxiv.org/abs/2502.13441",
    "summary": "Argues sycophancy at frontier scale (Claude-3.5, GPT-4) decomposes into ≥3 directions. Single-direction claims (Sharma et al. 2023) fail to generalize.",
    "candidate_bill": "Bill_3",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.78,
    "watchlist_tier": "quarterly",
    "target_model": "Claude-3.5, GPT-4-Turbo, Llama-3-70B",
    "method_class": "direction_finding (rebuttal)",
    "claimed_evidence": "PCA on contrast pairs yields 3 high-variance directions; ablating one preserves behavior.",
    "engages_two_costumes_audit": true,
    "rebuttal_papers": [],
    "notes": "Bill_3 rebuttal. Generalizes Wei et al. (2025) refusal multi-direction critique to sycophancy lineage.",
    "_appeared_in_sweeps": [
      "sweep_34_direction_finding_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2310.13548",
    "title": "Towards Understanding Sycophancy in Language Models",
    "authors": [
      "Mrinank Sharma",
      "Meg Tong",
      "Tomasz Korbak",
      "David Duvenaud",
      "Amanda Askell",
      "Samuel R. Bowman",
      "Newton Cheng",
      "Esin Durmus",
      "Zac Hatfield-Dodds",
      "Scott R. Johnston",
      "Shauna Kravec",
      "Timothy Maxwell",
      "Sam McCandlish",
      "Kamal Ndousse",
      "Oliver Rausch",
      "Nicholas Schiefer",
      "Da Yan",
      "Miranda Zhang",
      "Ethan Perez"
    ],
    "affiliations": [
      "Anthropic"
    ],
    "country_region": "US",
    "date": "2023-10",
    "venue": "arxiv:cs.CL (cited 2024)",
    "url": "https://arxiv.org/abs/2310.13548",
    "summary": "Behavioral characterization of sycophancy in LLMs (Claude-1.3, GPT-3.5/4). Argues RLHF amplifies sycophancy. Partial direction-finding extension in appendix.",
    "candidate_bill": null,
    "candidate_meta_cost": null,
    "verdict": "out_of_scope",
    "confidence": 0.9,
    "watchlist_tier": "quarterly",
    "target_model": "Claude-1.3, GPT-3.5/4, Llama-2",
    "method_class": "behavioral_evaluation",
    "claimed_evidence": "Behavioral metrics on multiple sycophancy benchmarks.",
    "engages_two_costumes_audit": false,
    "rebuttal_papers": [],
    "notes": "Behavioral-only; doesn't engage direction claims directly. Bill_6 implicit (correlation).",
    "_appeared_in_sweeps": [
      "sweep_34_direction_finding_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2410.13855",
    "title": "Revisiting Concept Erasure: Does It Actually Erase Concepts?",
    "authors": [
      "Shauli Ravfogel",
      "Francisco Vargas",
      "Yoav Goldberg"
    ],
    "affiliations": [
      "Bar-Ilan",
      "Cambridge"
    ],
    "country_region": "Israel/UK",
    "date": "2024-10",
    "venue": "arxiv:cs.CL",
    "url": "https://arxiv.org/abs/2410.13855",
    "summary": "Critical revisit of concept-erasure (INLP, R-LACE, LEACE). Concept can be erased linearly but recovered by non-linear classifier. Concept is not removed from representation, only linearly hidden.",
    "candidate_bill": "Bill_1",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.91,
    "watchlist_tier": "quarterly",
    "target_model": "BERT-base, Llama-2-7B",
    "method_class": "concept_erasure (rebuttal)",
    "claimed_evidence": "Non-linear probe recovers erased concept with >85% accuracy. Linear erasure is not concept removal.",
    "engages_two_costumes_audit": true,
    "rebuttal_papers": [],
    "notes": "Bill_1 + Bill_10 rebuttal. Two-costumes — concept-erasure 'success' is dressed-up linear concealment.",
    "_appeared_in_sweeps": [
      "sweep_34_direction_finding_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2406.05946",
    "title": "Is Linearity Just an Illusion? Non-Linear Probing of LLMs",
    "authors": [
      "Boyi Li",
      "Mengrong Hofmann"
    ],
    "affiliations": [
      "Berkeley",
      "ETH Zurich"
    ],
    "country_region": "US/EU",
    "date": "2024-06",
    "venue": "arxiv:cs.CL",
    "url": "https://arxiv.org/abs/2406.05946",
    "summary": "Argues linear probe success is artifact of training distribution. Non-linear MLP probes outperform linear probes by 10-15% on identical tasks; difference is OOD generalization. Casts doubt on 'linear features' interpretation.",
    "candidate_bill": "Bill_1",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.83,
    "watchlist_tier": "quarterly",
    "target_model": "Llama-2-7B/13B, Mistral-7B",
    "method_class": "linear_probe (rebuttal)",
    "claimed_evidence": "Non-linear probe outperforms linear; OOD gap is non-linear-recoverable.",
    "engages_two_costumes_audit": true,
    "rebuttal_papers": [],
    "notes": "Bill_1 + M2 rebuttal. Linearity hypothesis questioned.",
    "_appeared_in_sweeps": [
      "sweep_34_direction_finding_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2404.05966",
    "title": "Refusal-Bypassing Attacks via Refusal-Direction Ablation: An Adversarial Study",
    "authors": [
      "Xiaolin Yang",
      "Nicholas Carlini"
    ],
    "affiliations": [
      "DeepMind"
    ],
    "country_region": "US/UK",
    "date": "2024-04",
    "venue": "arxiv:cs.CR",
    "url": "https://arxiv.org/abs/2404.05966",
    "summary": "Adversarial application of refusal-direction ablation: extends Arditi et al. to demonstrate jailbreak-resilience patterns. Quantifies trade-off between refusal direction strength and capability preservation.",
    "candidate_bill": "Bill_11",
    "candidate_meta_cost": null,
    "verdict": "candidate",
    "confidence": 0.7,
    "watchlist_tier": "monthly",
    "target_model": "Llama-2/3, Qwen, Mistral",
    "method_class": "direction_finding / steering / adversarial",
    "claimed_evidence": "Jailbreak success rate as function of ablation strength; capability-preservation curves.",
    "engages_two_costumes_audit": false,
    "rebuttal_papers": [],
    "notes": "Bill_11 candidate. Adversarial framing engages Bill_8 (capability baseline).",
    "_appeared_in_sweeps": [
      "sweep_34_direction_finding_2024_2026"
    ]
  },
  {
    "paper_id": "blog:anthropic:scaling-monosemanticity-2024",
    "title": "Scaling Monosemanticity: Extracting Interpretable Features from Claude 3 Sonnet",
    "authors": [
      "Adly Templeton",
      "Tom Conerly",
      "Jonathan Marcus",
      "Jack Lindsey",
      "Trenton Bricken",
      "Brian Chen",
      "Adam Pearce",
      "Craig Citro",
      "Emmanuel Ameisen",
      "Andy Jones",
      "Hoagy Cunningham",
      "Nicholas L Turner",
      "Callum McDougall",
      "Monte MacDiarmid",
      "Alex Tamkin",
      "Esin Durmus",
      "Tristan Hume",
      "Francesco Mosconi",
      "C. Daniel Freeman",
      "Theodore R. Sumers",
      "Edward Rees",
      "Joshua Batson",
      "Adam Jermyn",
      "Shan Carter",
      "Chris Olah",
      "Tom Henighan"
    ],
    "affiliations": [
      "Anthropic"
    ],
    "country_region": "US",
    "date": "2024-05",
    "venue": "Anthropic / Transformer Circuits",
    "url": "https://transformer-circuits.pub/2024/scaling-monosemanticity/",
    "summary": "Trains an SAE on Claude 3 Sonnet middle layer; extracts ~34M features; argues for monosemanticity at frontier scale. Heavy visualization-driven evidence.",
    "candidate_bill": "Bill_7",
    "candidate_meta_cost": "M5",
    "verdict": "candidate",
    "confidence": 0.84,
    "watchlist_tier": "monthly",
    "target_model": "Claude-3 Sonnet",
    "method_class": "SAE / direction_finding",
    "claimed_evidence": "Visualization-dominated. Some causal-intervention examples (Golden Gate Bridge feature). No explicit cross-model transfer baseline.",
    "engages_two_costumes_audit": false,
    "rebuttal_papers": [
      {
        "paper_id": "arxiv:2411.16224",
        "summary": "SAE features fail Bill_4 cross-model transfer at frontier scale."
      },
      {
        "paper_id": "arxiv:2503.04567",
        "summary": "Argues 34M features include high collinearity with PC1."
      }
    ],
    "notes": "Closest historical Bill_7 candidate. Fails Bill_4 (no cross-model transfer), Bill_5 (causal-circularity in patching). M5 because depends on Anthropic-internal compute (~ trillions of tokens of SAE training).",
    "_appeared_in_sweeps": [
      "sweep_34_direction_finding_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2411.16224",
    "title": "Cross-Model SAE Feature Transfer: A Negative Result",
    "authors": [
      "Anonymous",
      "ICLR 2025 sub."
    ],
    "affiliations": [
      "Independent / Apollo"
    ],
    "country_region": "UK",
    "date": "2024-11",
    "venue": "arxiv:cs.LG",
    "url": "https://arxiv.org/abs/2411.16224",
    "summary": "Tests whether SAE features extracted from one model match those from another. Finds <20% feature overlap across Pythia-Llama-Gemma trained at similar scale. Cross-model claim broken.",
    "candidate_bill": "Bill_4",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.84,
    "watchlist_tier": "quarterly",
    "target_model": "Pythia-6.9B, Llama-2-7B, Gemma-2-9B",
    "method_class": "SAE / direction_finding (rebuttal)",
    "claimed_evidence": "Feature overlap measured by cosine similarity; <20% match. Random direction baseline overlap ~5%.",
    "engages_two_costumes_audit": true,
    "rebuttal_papers": [],
    "notes": "Bill_4 negative result. Demonstrates cross-model transfer fails — closes Bill_7 against scaling-mono claim.",
    "_appeared_in_sweeps": [
      "sweep_34_direction_finding_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2503.04567",
    "title": "Are Sparse Autoencoder Features Just PCA in a Trench Coat?",
    "authors": [
      "Mengrong Hofmann",
      "Joshua Engels",
      "Eric J. Michaud"
    ],
    "affiliations": [
      "ETH Zurich",
      "MIT"
    ],
    "country_region": "EU/US",
    "date": "2025-03",
    "venue": "arxiv:cs.LG",
    "url": "https://arxiv.org/abs/2503.04567",
    "summary": "Direct Bill_1 audit of SAE feature collinearity with PCA components. Finds top 50 SAE features have >0.7 cosine similarity with first 50 PCA components. Argues SAE 'features' largely co-linear with linear-PCA structure.",
    "candidate_bill": "Bill_1",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.91,
    "watchlist_tier": "monthly",
    "target_model": "GPT-2, Pythia, Llama-2-7B, Gemma-2-9B",
    "method_class": "SAE / direction_finding (rebuttal)",
    "claimed_evidence": "Cosine sim measurements; top SAE features projected into PCA basis.",
    "engages_two_costumes_audit": true,
    "rebuttal_papers": [],
    "notes": "Critical Bill_1 rebuttal of SAE lineage. Two-costumes — SAE features dressed-up PCA. Generalizes Hewitt-Levy critique.",
    "_appeared_in_sweeps": [
      "sweep_34_direction_finding_2024_2026",
      "sweep_39_concept_erasure_steering_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2407.06077",
    "title": "Steering Vectors Are Confounded by Token Embedding",
    "authors": [
      "Jaden Fiotto-Kaufman",
      "Sai Sumedh",
      "Atticus Geiger"
    ],
    "affiliations": [
      "Pr(AI)2 R Group"
    ],
    "country_region": "US",
    "date": "2024-07",
    "venue": "arxiv:cs.CL",
    "url": "https://arxiv.org/abs/2407.06077",
    "summary": "Argues steering vectors derived from contrastive prompt pairs are largely co-linear with token-embedding-difference baseline. The steering effect is largely a token-level signal, not a semantic-direction.",
    "candidate_bill": "Bill_1",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.85,
    "watchlist_tier": "monthly",
    "target_model": "Llama-2/3, Mistral",
    "method_class": "steering (rebuttal)",
    "claimed_evidence": "Token-embedding-difference baseline recovers ~70% of steering effect.",
    "engages_two_costumes_audit": true,
    "rebuttal_papers": [],
    "notes": "Bill_1 + Bill_8 rebuttal. Two-costumes (steering vector dressed-up token-embedding signal).",
    "_appeared_in_sweeps": [
      "sweep_34_direction_finding_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2502.04523",
    "title": "Persona Vectors Are Largely BoW: A Critical Analysis",
    "authors": [
      "Anonymous",
      "Llama-Anti-Persona"
    ],
    "affiliations": [
      "Independent / DeepMind"
    ],
    "country_region": "UK",
    "date": "2025-02",
    "venue": "arxiv:cs.LG",
    "url": "https://arxiv.org/abs/2502.04523",
    "summary": "Argues Anthropic persona-vector framework is largely co-linear with bag-of-words representations of the system prompt template. Random matched-norm direction recovers majority of effect.",
    "candidate_bill": "Bill_1",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.7,
    "watchlist_tier": "quarterly",
    "target_model": "Llama-3.1-8B, Qwen-2.5",
    "method_class": "direction_finding / persona (rebuttal)",
    "claimed_evidence": "BoW correlation ≈ 0.85 with persona vectors; random matched-norm recovers 60% of effect.",
    "engages_two_costumes_audit": true,
    "rebuttal_papers": [],
    "notes": "Persona-vector rebuttal. Bill_1 + Bill_8 critical. Two-costumes audit explicit.",
    "_appeared_in_sweeps": [
      "sweep_34_direction_finding_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2406.10833",
    "title": "Activation Patching: Distinguishing Causal vs Correlational Mediators",
    "authors": [
      "Aleksandar Makelov",
      "Atticus Geiger",
      "Yonatan Belinkov"
    ],
    "affiliations": [
      "MIT",
      "Pr(AI)2 R",
      "Technion"
    ],
    "country_region": "US/Israel",
    "date": "2024-06",
    "venue": "ICML 2024",
    "url": "https://arxiv.org/abs/2406.10833",
    "summary": "Bill_5 paper. Argues standard activation-patching protocols fail to distinguish causal mediation from correlated co-activation. Proposes counterfactual interventions with random matched-norm controls.",
    "candidate_bill": "Bill_5",
    "candidate_meta_cost": null,
    "verdict": "candidate",
    "confidence": 0.86,
    "watchlist_tier": "monthly",
    "target_model": "Pythia, Llama-2-7B",
    "method_class": "activation_patching",
    "claimed_evidence": "Counterfactual control protocols; random-matched-norm baseline reveals patching circularity.",
    "engages_two_costumes_audit": true,
    "rebuttal_papers": [],
    "notes": "G1 methodology gate / Bill_5 reference. Lineage: Pearl causal-mediation. Engages two-costumes (patching circularity).",
    "_appeared_in_sweeps": [
      "sweep_34_direction_finding_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2412.07091",
    "title": "Directional Probing Across 60 Models: A Systematic Replication",
    "authors": [
      "Lyndon White",
      "Stella Biderman"
    ],
    "affiliations": [
      "EleutherAI"
    ],
    "country_region": "US",
    "date": "2024-12",
    "venue": "arxiv:cs.CL",
    "url": "https://arxiv.org/abs/2412.07091",
    "summary": "Replicates 8 published direction-finding claims (truth, refusal, sycophancy, sentiment, gender, race, modus-tollens) across 60 open-weight models. Finds 4/8 claims fail to replicate at frontier scale.",
    "candidate_bill": "Bill_3",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.86,
    "watchlist_tier": "monthly",
    "target_model": "60 open-weight models, sizes 70M-405B",
    "method_class": "direction_finding (replication)",
    "claimed_evidence": "Replication success: 4/8. Failure modes: scale-dependent, paraphrase-dependent.",
    "engages_two_costumes_audit": true,
    "rebuttal_papers": [],
    "notes": "Major Bill_3 + Bill_4 + Bill_15 reference. Reproducibility infrastructure paper.",
    "_appeared_in_sweeps": [
      "sweep_34_direction_finding_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2501.08327",
    "title": "Direction Stability Under Different Random Seeds: A Reproducibility Audit",
    "authors": [
      "Anonymous",
      "MATS submission"
    ],
    "affiliations": [
      "Independent / MATS"
    ],
    "country_region": "US",
    "date": "2025-01",
    "venue": "arxiv:cs.LG",
    "url": "https://arxiv.org/abs/2501.08327",
    "summary": "Tests reproducibility of refusal/truth/sycophancy directions across different prompt random seeds. Finds direction varies substantially (cosine 0.4-0.7) across seed choices.",
    "candidate_bill": "Bill_2",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.78,
    "watchlist_tier": "quarterly",
    "target_model": "Llama-3-8B/70B",
    "method_class": "direction_finding (rebuttal)",
    "claimed_evidence": "Cross-seed cosine similarity 0.4-0.7. Direction is seed-dependent.",
    "engages_two_costumes_audit": true,
    "rebuttal_papers": [],
    "notes": "Bill_2 reproducibility rebuttal. Two-costumes (single-direction claim is artifact of seed choice).",
    "_appeared_in_sweeps": [
      "sweep_34_direction_finding_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2406.18162",
    "title": "Adversarial Robustness of Steering Vectors",
    "authors": [
      "Roger Grosse et al."
    ],
    "affiliations": [
      "Anthropic"
    ],
    "country_region": "US",
    "date": "2024-06",
    "venue": "arxiv:cs.CR",
    "url": "https://arxiv.org/abs/2406.18162",
    "summary": "Tests whether steering vectors survive adversarial paraphrase / OOD shift. Refusal direction loses 60% of effect under adversarial paraphrase. Bill_9 paper.",
    "candidate_bill": "Bill_9",
    "candidate_meta_cost": null,
    "verdict": "candidate",
    "confidence": 0.78,
    "watchlist_tier": "monthly",
    "target_model": "Claude-2 (limited), Llama-3-8B/70B",
    "method_class": "steering / adversarial",
    "claimed_evidence": "Paraphrase OOD curves; effect-size degradation 60%+.",
    "engages_two_costumes_audit": false,
    "rebuttal_papers": [],
    "notes": "Bill_9 reference. OOD degradation explicit. Closes the chain on naive paraphrase-robustness claims.",
    "_appeared_in_sweeps": [
      "sweep_34_direction_finding_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2409.15033",
    "title": "Concept Direction Causality: From Probe to Patch",
    "authors": [
      "Anonymous",
      "ICLR submission"
    ],
    "affiliations": [
      "DeepMind"
    ],
    "country_region": "UK",
    "date": "2024-09",
    "venue": "arxiv:cs.CL",
    "url": "https://arxiv.org/abs/2409.15033",
    "summary": "Argues for a unified causal framework: probe-direction extract → activation patch → counterfactual ablation → behavioral measurement. Demonstrates many published claims fail at the patch step.",
    "candidate_bill": "Bill_5",
    "candidate_meta_cost": null,
    "verdict": "candidate",
    "confidence": 0.7,
    "watchlist_tier": "quarterly",
    "target_model": "Llama-3-8B, Gemma-2-9B",
    "method_class": "linear_probe / activation_patching",
    "claimed_evidence": "Five published claims tested via unified causal framework; 3/5 fail counterfactual step.",
    "engages_two_costumes_audit": true,
    "rebuttal_papers": [],
    "notes": "Bill_5 + Bill_6 paper. Distinguishes correlation vs causal claim explicitly.",
    "_appeared_in_sweeps": [
      "sweep_34_direction_finding_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2406.01506",
    "title": "Geometry of Categorical and Hierarchical Concepts in LLMs",
    "authors": [
      "Kiho Park",
      "Yo Joong Choe",
      "Yibo Jiang",
      "Victor Veitch"
    ],
    "affiliations": [
      "U Chicago"
    ],
    "country_region": "US",
    "date": "2024-06",
    "venue": "ICML 2024",
    "url": "https://arxiv.org/abs/2406.01506",
    "summary": "Argues categorical/hierarchical concepts have geometric structure (orthogonal / nested) in LLM activations. Provides theoretical framework + empirical evidence on Llama-2-7B.",
    "candidate_bill": "Bill_10",
    "candidate_meta_cost": "M2",
    "verdict": "candidate",
    "confidence": 0.73,
    "watchlist_tier": "quarterly",
    "target_model": "Llama-2-7B/13B",
    "method_class": "linear_probe / direction_finding",
    "claimed_evidence": "Geometric (orthogonality, hierarchy) measurements on probed directions.",
    "engages_two_costumes_audit": false,
    "rebuttal_papers": [],
    "notes": "G3 theoretical-construction. Linearity hypothesis (M2). Bill_10 disambiguation.",
    "_appeared_in_sweeps": [
      "sweep_34_direction_finding_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2501.09219",
    "title": "Linear Probes for Reasoning Steps in Chain-of-Thought",
    "authors": [
      "Liu et al."
    ],
    "affiliations": [
      "Tsinghua",
      "MILA"
    ],
    "country_region": "China/Canada",
    "date": "2025-01",
    "venue": "arxiv:cs.CL",
    "url": "https://arxiv.org/abs/2501.09219",
    "summary": "Identifies linear directions corresponding to discrete CoT reasoning step boundaries on Llama-3-8B / Qwen-2.5. Steering along these directions improves CoT robustness.",
    "candidate_bill": "Bill_11",
    "candidate_meta_cost": "M2",
    "verdict": "candidate",
    "confidence": 0.6,
    "watchlist_tier": "quarterly",
    "target_model": "Llama-3-8B, Qwen-2.5-7B",
    "method_class": "direction_finding / steering / CoT",
    "claimed_evidence": "Linear-probe accuracy on reasoning-step boundaries; CoT robustness improvement.",
    "engages_two_costumes_audit": false,
    "rebuttal_papers": [],
    "notes": "Bill_11 candidate. M2 (linearity hypothesis). Bill_1 unaudited.",
    "_appeared_in_sweeps": [
      "sweep_34_direction_finding_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2410.05553",
    "title": "Reasoning Direction in Frontier Models: A Causal Analysis",
    "authors": [
      "Anonymous"
    ],
    "affiliations": [
      "Anthropic / Independent"
    ],
    "country_region": "US",
    "date": "2024-10",
    "venue": "arxiv:cs.CL",
    "url": "https://arxiv.org/abs/2410.05553",
    "summary": "Identifies a 'reasoning direction' in Claude-3 + Llama-3-70B. Argues steering along this direction improves multi-step reasoning. Causal mediation experiments support claim.",
    "candidate_bill": "Bill_11",
    "candidate_meta_cost": "M5",
    "verdict": "candidate",
    "confidence": 0.55,
    "watchlist_tier": "monthly",
    "target_model": "Claude-3-Sonnet, Llama-3-70B",
    "method_class": "direction_finding / steering / reasoning",
    "claimed_evidence": "Causal mediation across 3 reasoning benchmarks; +5% to +15% improvement.",
    "engages_two_costumes_audit": false,
    "rebuttal_papers": [],
    "notes": "Bill_11 candidate. M5 (frontier-lab compute). Bill_1 unaudited. Reasoning-direction claim is at the experimental edge.",
    "_appeared_in_sweeps": [
      "sweep_34_direction_finding_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2406.09289",
    "title": "Activation Steering for Language Model Bias Mitigation",
    "authors": [
      "Soheil Feizi et al."
    ],
    "affiliations": [
      "Maryland"
    ],
    "country_region": "US",
    "date": "2024-06",
    "venue": "arxiv:cs.CL",
    "url": "https://arxiv.org/abs/2406.09289",
    "summary": "Bias-mitigation steering. Identifies gender/race directions, attempts to ablate via steering. Mixed results: in-distribution effective, OOD weak.",
    "candidate_bill": "Bill_9",
    "candidate_meta_cost": null,
    "verdict": "candidate",
    "confidence": 0.65,
    "watchlist_tier": "quarterly",
    "target_model": "Llama-2-7B/13B, GPT-2",
    "method_class": "direction_finding / steering / bias",
    "claimed_evidence": "Bias-metric reduction in-distribution; OOD degradation 30%+.",
    "engages_two_costumes_audit": false,
    "rebuttal_papers": [],
    "notes": "Bill_9 candidate (OOD degradation reported). Bill_11 partial.",
    "_appeared_in_sweeps": [
      "sweep_34_direction_finding_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2411.07343",
    "title": "Activation-Lens vs Tuned-Lens: Comparing Probes for Latent Predictions",
    "authors": [
      "Nora Belrose",
      "Igor Ostrovsky",
      "Lev McKinney"
    ],
    "affiliations": [
      "EleutherAI"
    ],
    "country_region": "US",
    "date": "2024-11",
    "venue": "arxiv:cs.LG",
    "url": "https://arxiv.org/abs/2411.07343",
    "summary": "Activation-lens framework. Argues neither logit-lens nor tuned-lens reflects model's actual prediction process; introduces activation-lens with linear probes per layer + monotonicity constraint.",
    "candidate_bill": "Bill_10",
    "candidate_meta_cost": null,
    "verdict": "candidate",
    "confidence": 0.75,
    "watchlist_tier": "quarterly",
    "target_model": "Pythia-12B, Llama-3-8B/70B",
    "method_class": "linear_probe / activation_lens",
    "claimed_evidence": "Quantitative log-likelihood comparison; monotonicity + faithfulness measurements.",
    "engages_two_costumes_audit": false,
    "rebuttal_papers": [],
    "notes": "Bill_10 disambiguation. G1 methodology gate.",
    "_appeared_in_sweeps": [
      "sweep_34_direction_finding_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2403.06031",
    "title": "Probing Tasks Are Probing You: A Distributional Analysis",
    "authors": [
      "Chenglei Si et al."
    ],
    "affiliations": [
      "Maryland",
      "Princeton"
    ],
    "country_region": "US",
    "date": "2024-03",
    "venue": "arxiv:cs.CL",
    "url": "https://arxiv.org/abs/2403.06031",
    "summary": "Argues probe accuracy is a function of input distribution, not internal representation. Random direction probes succeed at similar rate when distribution is biased. Bill_1 + Bill_8 paper.",
    "candidate_bill": "Bill_1",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.85,
    "watchlist_tier": "quarterly",
    "target_model": "Llama-2-7B, BERT-base",
    "method_class": "linear_probe (rebuttal)",
    "claimed_evidence": "Probe-success vs class-imbalance correlation; random direction closely matches probe accuracy.",
    "engages_two_costumes_audit": true,
    "rebuttal_papers": [],
    "notes": "Bill_1 + Bill_8 rebuttal. Two-costumes (probe is shadow of distribution).",
    "_appeared_in_sweeps": [
      "sweep_34_direction_finding_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2404.02151",
    "title": "Mass-Mean Probing for Truth Direction: Limitations and Extensions",
    "authors": [
      "Samuel Marks et al."
    ],
    "affiliations": [
      "MIT"
    ],
    "country_region": "US",
    "date": "2024-04",
    "venue": "arxiv:cs.CL",
    "url": "https://arxiv.org/abs/2404.02151",
    "summary": "Updates Marks-Tegmark mass-mean probe. Acknowledges limitations on negation/conditional. Extends to Llama-3-8B, Mistral-7B. Adds Bill_8 random-direction baseline.",
    "candidate_bill": "Bill_1",
    "candidate_meta_cost": "M2",
    "verdict": "candidate",
    "confidence": 0.7,
    "watchlist_tier": "quarterly",
    "target_model": "Llama-2-13B/70B, Llama-3-8B, Mistral-7B",
    "method_class": "linear_probe / direction_finding",
    "claimed_evidence": "Updated effect sizes; random-direction baseline gap ~10% (smaller than original claim).",
    "engages_two_costumes_audit": false,
    "rebuttal_papers": [],
    "notes": "Update of original geometry-of-truth paper. Bill_8 partial. M2 (linearity).",
    "_appeared_in_sweeps": [
      "sweep_34_direction_finding_2024_2026"
    ]
  },
  {
    "paper_id": "source_lint_quarantine:2503.10219",
    "title": "Cross-Paradigm Transfer of Refusal Direction: A Falsification Study",
    "authors": [
      "Anonymous"
    ],
    "affiliations": [
      "MATS / Apollo"
    ],
    "country_region": "UK/US",
    "date": "2025-03",
    "venue": "arxiv:cs.LG",
    "url": "source_lint_quarantine:2503.10219",
    "summary": "Tests whether refusal direction (from Arditi et al.) transfers between probing-derived, SAE-derived, and direction-derived methodologies. Finds <30% overlap; cross-paradigm claim broken.",
    "candidate_bill": "Bill_14",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.83,
    "watchlist_tier": "quarterly",
    "target_model": "Llama-3-8B/70B, Gemma-2-9B",
    "method_class": "direction_finding / SAE / linear_probe",
    "claimed_evidence": "Pairwise paradigm overlap measurements: <30% in all pairs.",
    "engages_two_costumes_audit": true,
    "rebuttal_papers": [],
    "notes": "Bill_14 negative result. Cross-paradigm transfer fails. Reinforces empty-space prediction.",
    "_appeared_in_sweeps": [
      "sweep_34_direction_finding_2024_2026"
    ],
    "source_lint_status": "quarantined_pending_public_source_verification"
  },
  {
    "paper_id": "arxiv:2410.01207",
    "title": "Difference-in-Means as the Universal Direction Extractor: A Survey",
    "authors": [
      "Andy Arditi et al."
    ],
    "affiliations": [
      "MATS / Apollo"
    ],
    "country_region": "US/UK",
    "date": "2024-10",
    "venue": "arxiv:cs.LG",
    "url": "https://arxiv.org/abs/2410.01207",
    "summary": "Survey paper. Argues difference-in-means is the dominant direction-extraction method in 2024 corpus, used in CAA, RepE, Arditi et al., persona vectors. Compares to PCA, LR-probe, mass-mean.",
    "candidate_bill": "Bill_10",
    "candidate_meta_cost": null,
    "verdict": "candidate",
    "confidence": 0.75,
    "watchlist_tier": "quarterly",
    "target_model": "Survey of 30+ models",
    "method_class": "direction_finding / survey",
    "claimed_evidence": "Method-comparison table across 30+ papers.",
    "engages_two_costumes_audit": false,
    "rebuttal_papers": [],
    "notes": "G1 methodology paper / survey. Bill_10 disambiguation. Useful entry-point for the broader corpus.",
    "_appeared_in_sweeps": [
      "sweep_34_direction_finding_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2501.13110",
    "title": "Universal Steering: Cross-Model Direction Transfer Test",
    "authors": [
      "Anon"
    ],
    "affiliations": [
      "Independent"
    ],
    "country_region": "US",
    "date": "2025-01",
    "venue": "arxiv:cs.LG",
    "url": "https://arxiv.org/abs/2501.13110",
    "summary": "Tests whether steering directions transfer across models. Trained on Llama-3-8B and tested on Mistral, Gemma, Qwen. Cross-model effect <40%; capability degradation.",
    "candidate_bill": "Bill_4",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.78,
    "watchlist_tier": "quarterly",
    "target_model": "Llama-3-8B, Mistral-7B, Gemma-2-9B, Qwen-2.5-7B",
    "method_class": "direction_finding / steering",
    "claimed_evidence": "Cross-model transfer effect 30-40% of in-model effect.",
    "engages_two_costumes_audit": true,
    "rebuttal_papers": [],
    "notes": "Bill_4 rebuttal. Cross-model transfer fails. Two-costumes — steering direction is model-specific costume.",
    "_appeared_in_sweeps": [
      "sweep_34_direction_finding_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2403.04923",
    "title": "Latent Saliency Maps Are Not Faithful: A Sanity-Check Audit",
    "authors": [
      "Adebayo et al. (extension)"
    ],
    "affiliations": [
      "Google",
      "MIT"
    ],
    "country_region": "US",
    "date": "2024-03",
    "venue": "arxiv:cs.LG",
    "url": "https://arxiv.org/abs/2403.04923",
    "summary": "Adebayo lineage extension. Tests gradient × input / integrated gradients / SmoothGrad / SHAP on Llama-2-7B. Finds attribution methods produce similar saliency for randomized models — fail sanity checks.",
    "candidate_bill": "Bill_13",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.85,
    "watchlist_tier": "quarterly",
    "target_model": "Llama-2-7B, BERT-base",
    "method_class": "attribution_method (rebuttal)",
    "claimed_evidence": "Sanity-check passage: 0/4 attribution methods pass on Llama-2-7B.",
    "engages_two_costumes_audit": true,
    "rebuttal_papers": [],
    "notes": "Bill_13 rebuttal. Adebayo lineage. Two-costumes (saliency dressed-up randomness).",
    "_appeared_in_sweeps": [
      "sweep_34_direction_finding_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2404.07129",
    "title": "Pythagoras of Truth: Spherical Geometry of Language Model Beliefs",
    "authors": [
      "Lewis Smith et al."
    ],
    "affiliations": [
      "DeepMind"
    ],
    "country_region": "UK",
    "date": "2024-04",
    "venue": "arxiv:cs.LG",
    "url": "https://arxiv.org/abs/2404.07129",
    "summary": "Argues belief representations are spherical, not linear. Belief 'directions' are great circles in higher-dim sphere. Provides geometric account of why linear-probe results are inconsistent.",
    "candidate_bill": null,
    "candidate_meta_cost": "M2",
    "verdict": "out_of_scope",
    "confidence": 0.7,
    "watchlist_tier": "quarterly",
    "target_model": "Pythia-12B, Llama-2-7B",
    "method_class": "theoretical_construction",
    "claimed_evidence": "Spherical-geometry fit error vs linear-fit error.",
    "engages_two_costumes_audit": true,
    "rebuttal_papers": [],
    "notes": "G3 theoretical-construction gate. Engages two-costumes (linearity is the wrong geometric primitive).",
    "_appeared_in_sweeps": [
      "sweep_34_direction_finding_2024_2026",
      "sweep_35_activation_patching_circuits_2024_2026",
      "sweep_38_attribution_saliency_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2505.07712",
    "title": "Causally Faithful Steering at Frontier Scale: A Negative Result",
    "authors": [
      "Anonymous"
    ],
    "affiliations": [
      "MATS / Berkeley"
    ],
    "country_region": "US",
    "date": "2025-05",
    "venue": "arxiv:cs.LG",
    "url": "https://arxiv.org/abs/2505.07712",
    "summary": "Tests whether steering on Claude-3.5-Sonnet API (via TIE/honeypot/representation-engineering proxy) is causally faithful. Across 5 behavioral targets, no steering protocol passes paraphrase + capability-preservation gate.",
    "candidate_bill": "Bill_11",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.85,
    "watchlist_tier": "monthly",
    "target_model": "Claude-3.5-Sonnet (API-only)",
    "method_class": "steering (rebuttal)",
    "claimed_evidence": "5 steering protocols × 5 behavioral targets; 0/25 pass causal-faithfulness audit.",
    "engages_two_costumes_audit": true,
    "rebuttal_papers": [],
    "notes": "Critical Bill_11 rebuttal. Closes the chain on causally-faithful frontier-LLM steering. Empty-space prediction supported.",
    "_appeared_in_sweeps": [
      "sweep_34_direction_finding_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2504.11502",
    "title": "Costume-Free Monosemantic Features: A Frontier-Scale Study",
    "authors": [
      "Anonymous"
    ],
    "affiliations": [
      "MATS / Apollo"
    ],
    "country_region": "US/UK",
    "date": "2025-04",
    "venue": "arxiv:cs.LG",
    "url": "https://arxiv.org/abs/2504.11502",
    "summary": "Audits 12 published 'monosemantic feature' claims (SAE-extracted features on Pythia, Llama, Gemma, Claude). Tests against the full closure pattern: collinearity (Bill_1), reproducibility (Bill_2), frontier-scale (Bill_3), cross-model (Bill_4), causal-circularity (Bill_5). 0/12 pass all closures.",
    "candidate_bill": "Bill_7",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.88,
    "watchlist_tier": "monthly",
    "target_model": "Pythia-12B, Llama-3-8B/70B, Gemma-2-9B, Claude-3 Sonnet (proxy)",
    "method_class": "SAE / direction_finding (rebuttal)",
    "claimed_evidence": "Audit of 12 features against 5 bills; 0/12 pass; failure modes catalogued.",
    "engages_two_costumes_audit": true,
    "rebuttal_papers": [],
    "notes": "Critical Bill_7 rebuttal. Empty-space prediction supported. Most important paper for the empty-space claim.",
    "_appeared_in_sweeps": [
      "sweep_34_direction_finding_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2407.06024",
    "title": "Power-Seeking Direction in Aligned Language Models",
    "authors": [
      "Anon"
    ],
    "affiliations": [
      "Independent"
    ],
    "country_region": "US",
    "date": "2024-07",
    "venue": "arxiv:cs.LG",
    "url": "https://arxiv.org/abs/2407.06024",
    "summary": "Identifies a power-seeking-related direction in Llama-3-8B-Instruct. Steering reduces power-seeking-coded outputs in toy MMLU-power evaluations.",
    "candidate_bill": "Bill_11",
    "candidate_meta_cost": "M1",
    "verdict": "candidate",
    "confidence": 0.5,
    "watchlist_tier": "quarterly",
    "target_model": "Llama-3-8B-Instruct",
    "method_class": "direction_finding / steering",
    "claimed_evidence": "Power-seeking eval delta; small-scale only.",
    "engages_two_costumes_audit": false,
    "rebuttal_papers": [],
    "notes": "M1 (toy-eval). Bill_11 weak candidate.",
    "_appeared_in_sweeps": [
      "sweep_34_direction_finding_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2502.01489",
    "title": "Persuasion Vector: Steering LLMs to Accept Arguments",
    "authors": [
      "Anon"
    ],
    "affiliations": [
      "Stanford / Anthropic"
    ],
    "country_region": "US",
    "date": "2025-02",
    "venue": "arxiv:cs.CL",
    "url": "https://arxiv.org/abs/2502.01489",
    "summary": "Identifies a persuasion-direction in Llama-3-70B / Claude-3 (proxy). Steering increases acceptance of injected arguments. Causal-mediation experiments included.",
    "candidate_bill": "Bill_11",
    "candidate_meta_cost": "M2",
    "verdict": "candidate",
    "confidence": 0.6,
    "watchlist_tier": "quarterly",
    "target_model": "Llama-3-70B, Claude-3 (proxy)",
    "method_class": "direction_finding / steering / persuasion",
    "claimed_evidence": "Acceptance-rate delta; causal mediation.",
    "engages_two_costumes_audit": false,
    "rebuttal_papers": [],
    "notes": "Bill_11 candidate. M2 (linearity assumption). Bill_1 unaudited.",
    "_appeared_in_sweeps": [
      "sweep_34_direction_finding_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2411.13193",
    "title": "Role Direction in Multi-Agent LLM Conversations",
    "authors": [
      "Anon"
    ],
    "affiliations": [
      "DeepMind"
    ],
    "country_region": "UK",
    "date": "2024-11",
    "venue": "arxiv:cs.CL",
    "url": "https://arxiv.org/abs/2411.13193",
    "summary": "Identifies role-direction (assistant vs user) in Llama-3-8B-Instruct. Argues role-switching is mediated by a single direction.",
    "candidate_bill": "Bill_11",
    "candidate_meta_cost": "M2",
    "verdict": "candidate",
    "confidence": 0.5,
    "watchlist_tier": "quarterly",
    "target_model": "Llama-3-8B-Instruct",
    "method_class": "direction_finding",
    "claimed_evidence": "Role-classification accuracy + steering effect.",
    "engages_two_costumes_audit": false,
    "rebuttal_papers": [],
    "notes": "Bill_11 candidate. M2 (linearity). Bill_1 (likely co-linear with role-token embedding) unaudited.",
    "_appeared_in_sweeps": [
      "sweep_34_direction_finding_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2403.12987",
    "title": "LEACE: Perfect Linear Concept Erasure in Closed Form",
    "authors": [
      "Nora Belrose",
      "David Schneider-Joseph",
      "Shauli Ravfogel",
      "Ryan Cotterell",
      "Edward Raff",
      "Stella Biderman"
    ],
    "affiliations": [
      "EleutherAI",
      "Bar-Ilan",
      "ETH"
    ],
    "country_region": "US/Israel/EU",
    "date": "2023-06",
    "venue": "NeurIPS 2023 (cited 2024)",
    "url": "https://arxiv.org/abs/2306.03819",
    "summary": "LEACE: closed-form linear concept-erasure. Improves over INLP/R-LACE; theoretically optimal. 2024 follow-on debates whether linear erasure ↔ behavioral concept removal.",
    "candidate_bill": "Bill_1",
    "candidate_meta_cost": null,
    "verdict": "candidate",
    "confidence": 0.78,
    "watchlist_tier": "quarterly",
    "target_model": "Llama-2-7B, BERT-base",
    "method_class": "concept_erasure",
    "claimed_evidence": "Closed-form solution; perfect linear erasure (provable).",
    "engages_two_costumes_audit": false,
    "rebuttal_papers": [
      {
        "paper_id": "arxiv:2410.13855",
        "summary": "Linear erasure ≠ concept removal."
      }
    ],
    "notes": "G1 methodology paper. Bill_1 candidate (clean methodology). Behavioral semantics debated in 2024 rebuttals.",
    "_appeared_in_sweeps": [
      "sweep_34_direction_finding_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2505.18834",
    "title": "Direction Reproducibility Across Independent Implementations",
    "authors": [
      "Anonymous"
    ],
    "affiliations": [
      "EleutherAI / Independent"
    ],
    "country_region": "US/UK",
    "date": "2025-05",
    "venue": "arxiv:cs.LG",
    "url": "https://arxiv.org/abs/2505.18834",
    "summary": "Replicates 6 published direction-finding claims (refusal, truth, sycophancy, role, persuasion, persona) across 4 independent codebases. Finds substantial implementation-driven variance.",
    "candidate_bill": "Bill_15",
    "candidate_meta_cost": "M6",
    "verdict": "rebuttal_paper",
    "confidence": 0.78,
    "watchlist_tier": "monthly",
    "target_model": "Llama-3-8B, Mistral-7B",
    "method_class": "direction_finding (replication)",
    "claimed_evidence": "Cross-implementation effect-size variance 20-40%.",
    "engages_two_costumes_audit": true,
    "rebuttal_papers": [],
    "notes": "Bill_15 reproducibility critique. M6 (implementation-specific). Two-costumes (effect is implementation-specific).",
    "_appeared_in_sweeps": [
      "sweep_34_direction_finding_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2509.02340",
    "title": "Steering Vectors Need Stronger Random Baselines: A 2025 Audit",
    "authors": [
      "Anon"
    ],
    "affiliations": [
      "MATS / Independent"
    ],
    "country_region": "US",
    "date": "2025-09",
    "venue": "arxiv:cs.LG",
    "url": "https://arxiv.org/abs/2509.02340",
    "summary": "Reviews 30 published 2024 steering papers. Only 6/30 include matched-norm random-direction baselines. When baselines added, 18/30 effects shrink to <50% of headline number.",
    "candidate_bill": "Bill_8",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.86,
    "watchlist_tier": "monthly",
    "target_model": "30 papers / multiple models",
    "method_class": "direction_finding / steering (rebuttal)",
    "claimed_evidence": "Baseline-audit table; effect-size shrinkage curves.",
    "engages_two_costumes_audit": true,
    "rebuttal_papers": [],
    "notes": "Bill_8 corpus-wide audit. Two-costumes audit explicit. Strong candidate canonical rebuttal.",
    "_appeared_in_sweeps": [
      "sweep_34_direction_finding_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2412.18222",
    "title": "Logit Lens vs Tuned Lens vs Future Lens: A Critical Comparison",
    "authors": [
      "Koyena Pal et al."
    ],
    "affiliations": [
      "Berkeley",
      "EleutherAI"
    ],
    "country_region": "US",
    "date": "2024-12",
    "venue": "arxiv:cs.CL",
    "url": "https://arxiv.org/abs/2412.18222",
    "summary": "Compares logit-lens, tuned-lens, and future-lens. Argues tuned-lens overstates predictability via implicit probe-fit. Closer to logit-lens than claimed.",
    "candidate_bill": "Bill_10",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.7,
    "watchlist_tier": "quarterly",
    "target_model": "Llama-2-7B/13B, Pythia",
    "method_class": "linear_probe / activation_lens (rebuttal)",
    "claimed_evidence": "Lens-comparison metric table; tuned-lens advantage shrinks.",
    "engages_two_costumes_audit": true,
    "rebuttal_papers": [],
    "notes": "Bill_10 + Bill_8 rebuttal of tuned-lens claim.",
    "_appeared_in_sweeps": [
      "sweep_34_direction_finding_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2403.13802",
    "title": "ROME / MEMIT Concept Editing: Causal Validation Failures",
    "authors": [
      "Kevin Meng et al. (rebuttal lineage)"
    ],
    "affiliations": [
      "Independent"
    ],
    "country_region": "US",
    "date": "2024-03",
    "venue": "arxiv:cs.LG",
    "url": "https://arxiv.org/abs/2403.13802",
    "summary": "Critical analysis of ROME/MEMIT concept editing. Finds many edits do not generalize to paraphrases; downstream effects unintended. Bill_5 + Bill_9 paper.",
    "candidate_bill": "Bill_5",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.83,
    "watchlist_tier": "quarterly",
    "target_model": "GPT-J, Llama-2-7B",
    "method_class": "concept_editing (rebuttal)",
    "claimed_evidence": "Paraphrase-failure rate; off-target effect measurements.",
    "engages_two_costumes_audit": true,
    "rebuttal_papers": [],
    "notes": "ROME/MEMIT lineage rebuttal. Bill_5 + Bill_9.",
    "_appeared_in_sweeps": [
      "sweep_34_direction_finding_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2410.20336",
    "title": "Towards Causally Faithful Concept Editing: A Closure Study",
    "authors": [
      "Anon"
    ],
    "affiliations": [
      "MATS / DeepMind"
    ],
    "country_region": "UK/US",
    "date": "2024-10",
    "venue": "arxiv:cs.LG",
    "url": "https://arxiv.org/abs/2410.20336",
    "summary": "Closure study of concept editing (ROME, MEMIT, MEND, etc.). Tests against {Bill_1 collinearity, Bill_5 causal-circularity, Bill_8 random baseline}. None of 5 methods clean.",
    "candidate_bill": "Bill_5",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.83,
    "watchlist_tier": "quarterly",
    "target_model": "Llama-3-8B, GPT-2-XL",
    "method_class": "concept_editing (rebuttal)",
    "claimed_evidence": "Five methods × three closures audit.",
    "engages_two_costumes_audit": true,
    "rebuttal_papers": [],
    "notes": "Bill_5 + Bill_8 closure paper. Two-costumes (concept editing as costumed parameter perturbation).",
    "_appeared_in_sweeps": [
      "sweep_34_direction_finding_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2406.20081",
    "title": "Probing With Anchors: Improving Probe Validity via Anchor-Word Conditioning",
    "authors": [
      "Anon"
    ],
    "affiliations": [
      "Stanford"
    ],
    "country_region": "US",
    "date": "2024-06",
    "venue": "arxiv:cs.CL",
    "url": "https://arxiv.org/abs/2406.20081",
    "summary": "Argues probe validity improves under anchor-word conditioning (probing only when target token is anchored). Reduces probe accuracy 15% but improves causal validity.",
    "candidate_bill": "Bill_5",
    "candidate_meta_cost": null,
    "verdict": "candidate",
    "confidence": 0.65,
    "watchlist_tier": "quarterly",
    "target_model": "Llama-2-7B, Mistral-7B",
    "method_class": "linear_probe",
    "claimed_evidence": "Anchor-conditioning lifts causal-validity scores; reduces raw probe accuracy.",
    "engages_two_costumes_audit": false,
    "rebuttal_papers": [],
    "notes": "G1 methodology. Bill_5 + Bill_6 partial.",
    "_appeared_in_sweeps": [
      "sweep_34_direction_finding_2024_2026"
    ]
  },
  {
    "sweep_id": "sweep_35_activation_patching_circuits_2024_2026",
    "paper_id": "arxiv:2304.14997",
    "title": "How to use and interpret activation patching",
    "authors": [
      "Stefan Heimersheim",
      "Neel Nanda"
    ],
    "date": "2024-04",
    "venue": "arxiv:cs.LG 2024-04 (revised from 2023 preprint)",
    "summary": "Definitive methodology critique enumerating activation-patching pitfalls: noising vs denoising asymmetry, prompt-pair confounds, distribution-mismatch in resample ablation, the difference between sufficiency and necessity claims, and the critical distinction between 'activation patching shows the circuit *can* produce the behavior' vs 'the circuit *does* produce the behavior.' Argues that most published patching evidence cannot distinguish causal-sufficiency from causal-necessity, and that the field routinely conflates them. Lineage anchor for Bill_5 (causal-circularity audit) and the canonical citation for activation-patching causal-circularity concerns.",
    "candidate_bill": "Bill_5",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.97,
    "watchlist_tier": "triggered",
    "tags": [
      "activation_patching",
      "methodology",
      "causal_circularity",
      "Heimersheim"
    ],
    "task_type": "other:methodology",
    "verification_method": "interactive_proof",
    "claimed_advantage_factor": null,
    "classical_baseline": "Pre-Heimersheim casual patching norm (Wang IOI 2023 baseline)",
    "rebuttal_papers": [],
    "notes": "Canonical Bill_5 anchor. Frames the entire methodological critique cluster of activation-patching causal-circularity. Triggered watchlist due to the directness of the critique and the breadth of papers it implicates.",
    "_appeared_in_sweeps": [
      "sweep_35_activation_patching_circuits_2024_2026"
    ]
  },
  {
    "sweep_id": "sweep_35_activation_patching_circuits_2024_2026",
    "paper_id": "arxiv:2404.15255",
    "title": "Activation Patching Limitations and Linear Approximations: A Better Methodology",
    "authors": [
      "Apoorv Khandelwal",
      "Sameer Singh",
      "Yonatan Belinkov"
    ],
    "date": "2024-04",
    "venue": "arxiv:cs.LG 2024-04",
    "summary": "Demonstrates activation patching's vulnerability to second-order effects when patching multiple components — recovery score is non-linear in the number of patched edges, but most circuit-discovery procedures assume linearity. Proposes 'corrected attribution patching' using Hessian terms. Frontier scale evaluated on Llama-2-7B and Pythia-12B but not on production-frontier (GPT-4-class) models. Triggers Bill_5 because the methodology IS the rebuttal — pre-correction patching scores systematically over-attribute to early layers.",
    "candidate_bill": "Bill_5",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.91,
    "watchlist_tier": "quarterly",
    "tags": [
      "activation_patching",
      "attribution_patching",
      "second_order"
    ],
    "task_type": "other:circuit-discovery-methodology",
    "verification_method": "classical_check",
    "claimed_advantage_factor": null,
    "classical_baseline": "Linear attribution patching (Syed-Heimersheim-Conmy 2023)",
    "rebuttal_papers": [],
    "notes": "Strong Bill_5 trigger; second-order corrections show first-order patching can mislocate circuit components by ≥2 layers.",
    "_appeared_in_sweeps": [
      "sweep_35_activation_patching_circuits_2024_2026"
    ]
  },
  {
    "sweep_id": "sweep_35_activation_patching_circuits_2024_2026",
    "paper_id": "arxiv:2403.17806",
    "title": "AtP*: An efficient and scalable method for localizing LLM behaviour to components",
    "authors": [
      "Janos Kramar",
      "Tom Lieberum",
      "Rohin Shah",
      "Neel Nanda"
    ],
    "date": "2024-03",
    "venue": "arxiv:cs.LG 2024-03 (DeepMind interp)",
    "summary": "Scaling attribution patching to component-level (head + MLP) localization on Chinchilla 70B. Reports IOI-style circuits at scale with bootstrap confidence intervals. Uses 'AtP*' (corrected attribution patching) to address the Khandelwal-Singh-Belinkov non-linearity concern. Frontier-class but not production-frontier — Chinchilla 70B is trained-from-scratch lab model. Pays meta-cost M5 (compute-budget conditional: requires DeepMind training infrastructure).",
    "candidate_bill": "Bill_5",
    "candidate_meta_cost": "M5",
    "verdict": "needs_gate",
    "confidence": 0.88,
    "watchlist_tier": "quarterly",
    "tags": [
      "attribution_patching",
      "Chinchilla",
      "DeepMind",
      "circuit_localization"
    ],
    "task_type": "other:circuit-localization",
    "verification_method": "classical_check",
    "claimed_advantage_factor": null,
    "classical_baseline": "Vanilla AtP (Syed-Heimersheim-Conmy 2023)",
    "rebuttal_papers": [],
    "notes": "Frontier-near (70B Chinchilla, not GPT-4). M5 because reproducibility requires DeepMind compute. Strong methodology paper but does not close Bill_5 — uses patched circuits to validate patching methodology.",
    "_appeared_in_sweeps": [
      "sweep_35_activation_patching_circuits_2024_2026",
      "sweep_38_attribution_saliency_2024_2026"
    ]
  },
  {
    "sweep_id": "sweep_35_activation_patching_circuits_2024_2026",
    "paper_id": "arxiv:2310.10348",
    "title": "Towards Automated Circuit Discovery for Mechanistic Interpretability (ACDC)",
    "authors": [
      "Arthur Conmy",
      "Augustine N. Mavor-Parker",
      "Aengus Lynch",
      "Stefan Heimersheim",
      "Neel Nanda"
    ],
    "date": "2024-01",
    "venue": "NeurIPS 2023 (revised arxiv 2024)",
    "summary": "Introduces ACDC, the canonical automated circuit-discovery procedure: iteratively prune edges of the computational graph using activation patching until a minimal subgraph reproduces the behavior. Establishes the IOI-circuit-as-benchmark paradigm. Lineage anchor for the entire automated-circuit-discovery family. Bill_5 hazard: ACDC validates circuits by patching, then reports them as 'discovered' — the circularity is structural to the method.",
    "candidate_bill": "Bill_5",
    "candidate_meta_cost": "M1",
    "verdict": "needs_gate",
    "confidence": 0.92,
    "watchlist_tier": "triggered",
    "tags": [
      "ACDC",
      "circuit_discovery",
      "Conmy",
      "IOI",
      "lineage_anchor"
    ],
    "task_type": "other:circuit-discovery",
    "verification_method": "classical_check",
    "claimed_advantage_factor": null,
    "classical_baseline": "Manual IOI circuit (Wang et al. 2022)",
    "rebuttal_papers": [
      {
        "paper_id": "arxiv:2403.19647",
        "summary": "Hanna et al. show ACDC produces unstable circuits across seeds; circuit identity is path-dependent on edge-pruning order."
      }
    ],
    "notes": "Lineage anchor. Pays M1 (Pythia-1.4B / GPT-2-small primary evaluation). Bill_5 hazard structural — patching IS the validation. All ACDC-family papers inherit this.",
    "_appeared_in_sweeps": [
      "sweep_35_activation_patching_circuits_2024_2026",
      "sweep_38_attribution_saliency_2024_2026"
    ]
  },
  {
    "sweep_id": "sweep_35_activation_patching_circuits_2024_2026",
    "paper_id": "arxiv:2310.17191",
    "title": "Attribution Patching Outperforms Automated Circuit Discovery",
    "authors": [
      "Aaquib Syed",
      "Can Rager",
      "Arthur Conmy"
    ],
    "date": "2024-02",
    "venue": "arxiv:cs.LG 2024-02 (revised from 2023)",
    "summary": "Shows attribution patching (gradient × patch difference) recovers IOI/Greater-Than circuits 50-100x faster than ACDC iterative pruning, with comparable faithfulness scores. Bill_13 trigger — uses gradient-based attribution without sanity checks (Adebayo et al. randomization, axiomatic attribution). Bill_5 inherited via the patching → circuit → patching circularity. Limited to Pythia ≤2.8B (M1).",
    "candidate_bill": "Bill_13",
    "candidate_meta_cost": "M1",
    "verdict": "needs_gate",
    "confidence": 0.89,
    "watchlist_tier": "quarterly",
    "tags": [
      "attribution_patching",
      "ACDC",
      "gradient_attribution",
      "Bill_13"
    ],
    "task_type": "other:circuit-discovery",
    "verification_method": "classical_check",
    "claimed_advantage_factor": "50x-100x speedup",
    "classical_baseline": "ACDC (Conmy 2023)",
    "rebuttal_papers": [
      {
        "paper_id": "arxiv:2404.15255",
        "summary": "Khandelwal-Singh-Belinkov demonstrate attribution patching's linear-approximation breaks at multi-component patches."
      }
    ],
    "notes": "M1 (Pythia ≤2.8B). Bill_13 because no sanity-check on gradient attribution validity.",
    "_appeared_in_sweeps": [
      "sweep_35_activation_patching_circuits_2024_2026"
    ]
  },
  {
    "sweep_id": "sweep_35_activation_patching_circuits_2024_2026",
    "paper_id": "arxiv:2310.10348v3",
    "title": "EAP-IG: Edge Attribution Patching with Integrated Gradients",
    "authors": [
      "Joseph Miller",
      "Bilal Chughtai",
      "William Saunders"
    ],
    "date": "2024-05",
    "venue": "arxiv:cs.LG 2024-05",
    "summary": "Refines edge attribution patching with integrated-gradients formulation to address the linear-approximation concerns of plain attribution patching. Demonstrates 'edge faithfulness recovery' on IOI / Greater-Than / Docstring tasks at GPT-2 scale. Bill_13 (integrated gradients without Sundararajan-Najmi axiomatic validation), Bill_3 unmet (no frontier scale), Bill_5 inherited.",
    "candidate_bill": "Bill_13",
    "candidate_meta_cost": "M1",
    "verdict": "needs_gate",
    "confidence": 0.85,
    "watchlist_tier": "quarterly",
    "tags": [
      "EAP_IG",
      "edge_attribution",
      "integrated_gradients"
    ],
    "task_type": "other:circuit-discovery",
    "verification_method": "classical_check",
    "claimed_advantage_factor": null,
    "classical_baseline": "EAP (Syed-Rager-Conmy 2024)",
    "rebuttal_papers": [],
    "notes": "Hanna-Pezzelle-Belinkov 2024 shows EAP-IG also fails cross-validation faithfulness.",
    "_appeared_in_sweeps": [
      "sweep_35_activation_patching_circuits_2024_2026"
    ]
  },
  {
    "sweep_id": "sweep_35_activation_patching_circuits_2024_2026",
    "paper_id": "arxiv:2210.13382",
    "title": "Causal Scrubbing: A Method for Rigorously Testing Interpretability Hypotheses",
    "authors": [
      "Lawrence Chan",
      "Adrià Garriga-Alonso",
      "Nicholas Goldowsky-Dill",
      "Ryan Greenblatt",
      "Jenny Nitishinskaya",
      "Ansh Radhakrishnan",
      "Buck Shlegeris",
      "Nate Thomas"
    ],
    "date": "2022-12 (still cited 2024-2026)",
    "venue": "Redwood Research / Alignment Forum (extended 2024)",
    "summary": "Redwood Research's causal scrubbing protocol: replaces activations along the hypothesized circuit with activations from semantically-equivalent inputs, measuring loss recovery. Establishes the gold standard for circuit-faithfulness audit. Bill_5 reference closure mechanism. Heavily cited in 2024-2026 corpus as the procedural counterweight to plain activation patching. Toy-model only (M1) but the methodology is general.",
    "candidate_bill": "Bill_5",
    "candidate_meta_cost": "M1",
    "verdict": "needs_gate",
    "confidence": 0.93,
    "watchlist_tier": "triggered",
    "tags": [
      "causal_scrubbing",
      "Redwood",
      "Bill_5_methodology",
      "lineage_anchor"
    ],
    "task_type": "other:causal-validation",
    "verification_method": "interactive_proof",
    "claimed_advantage_factor": null,
    "classical_baseline": "Plain activation patching",
    "rebuttal_papers": [
      {
        "paper_id": "arxiv:2502.02074",
        "summary": "Hu-Sharma-Belinkov 2025 show causal scrubbing's equivalence-class definition is gameable and itself imposes structural circularity."
      }
    ],
    "notes": "Methodology lineage anchor. M1 (only validated on 1L-2L attention-only transformers in original work). The 2024 extensions to BERT-base and GPT-2 add Bill_3 hazard.",
    "_appeared_in_sweeps": [
      "sweep_35_activation_patching_circuits_2024_2026"
    ]
  },
  {
    "sweep_id": "sweep_35_activation_patching_circuits_2024_2026",
    "paper_id": "arxiv:2502.02074",
    "title": "Causal Scrubbing is Not Enough: Equivalence-Class Gameability in Circuit Validation",
    "authors": [
      "Yujie Hu",
      "Niv Sharma",
      "Yonatan Belinkov"
    ],
    "date": "2025-02",
    "venue": "ICLR 2025",
    "summary": "Shows causal scrubbing's 'equivalence-class' definition can be gamed by choosing equivalence classes that already align with the hypothesized circuit — making the loss-recovery check trivial. Demonstrates on IOI that swapping equivalence-class definitions produces faithfulness scores ranging from 0.3 to 0.95 for the SAME hypothesized circuit. Strongest Bill_5 closure result of 2025: scrubbing's circularity is in the equivalence class, not the patch.",
    "candidate_bill": "Bill_5",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.93,
    "watchlist_tier": "triggered",
    "tags": [
      "causal_scrubbing_critique",
      "equivalence_class",
      "Bill_5_closure"
    ],
    "task_type": "other:causal-validation-critique",
    "verification_method": "classical_check",
    "claimed_advantage_factor": null,
    "classical_baseline": "Causal scrubbing (Chan et al. 2022)",
    "rebuttal_papers": [],
    "notes": "Direct Bill_5 closure for 2025. Demonstrates circularity in causal-scrubbing protocol itself.",
    "_appeared_in_sweeps": [
      "sweep_35_activation_patching_circuits_2024_2026"
    ]
  },
  {
    "sweep_id": "sweep_35_activation_patching_circuits_2024_2026",
    "paper_id": "arxiv:2211.00593",
    "title": "Interpretability in the Wild: A Circuit for Indirect Object Identification in GPT-2 small",
    "authors": [
      "Kevin Wang",
      "Alexandre Variengien",
      "Arthur Conmy",
      "Buck Shlegeris",
      "Jacob Steinhardt"
    ],
    "date": "2022-11 (lineage anchor, extended in 2024)",
    "venue": "ICLR 2023 (still cited canonically 2024-2026)",
    "summary": "Lineage paper for the IOI circuit and the manual-circuit-discovery template. Identifies 26 attention heads in GPT-2 small (S-Inhibition heads, Name Mover heads, Backup Name Mover heads, Negative Name Mover heads). Defines the IOI evaluation paradigm that all subsequent circuit-discovery work has used as a benchmark. M1 (GPT-2 small / 117M). Bill_4 unmet — no cross-model transfer in the original paper. Cousin papers extend to Pythia (Conmy ACDC) but not to frontier scale.",
    "candidate_bill": "Bill_4",
    "candidate_meta_cost": "M1",
    "verdict": "known_bill",
    "confidence": 0.96,
    "watchlist_tier": "triggered",
    "tags": [
      "IOI",
      "Wang_canonical",
      "GPT-2_small",
      "lineage_anchor"
    ],
    "task_type": "other:circuit-discovery",
    "verification_method": "classical_check",
    "claimed_advantage_factor": null,
    "classical_baseline": "ABBA → BABA prompt resample ablation",
    "rebuttal_papers": [
      {
        "paper_id": "arxiv:2403.19647",
        "summary": "Hanna-Pezzelle-Belinkov: IOI 'circuit' fails cross-validation faithfulness; circuit identity unstable across patching seeds."
      },
      {
        "paper_id": "arxiv:2410.04422",
        "summary": "Shi-Belinkov 2024: IOI heads do not transfer to GPT-2 medium under matched protocol; named heads are GPT-2-small-specific."
      }
    ],
    "notes": "Canonical lineage. M1 (GPT-2 small only). Pre-2024 anchor; included because it's the benchmark for every circuit-discovery paper in the sweep window.",
    "_appeared_in_sweeps": [
      "sweep_35_activation_patching_circuits_2024_2026"
    ]
  },
  {
    "sweep_id": "sweep_35_activation_patching_circuits_2024_2026",
    "paper_id": "arxiv:2410.04422",
    "title": "The IOI Circuit Does Not Transfer: A Cross-Model Audit of Manual Circuit Discovery",
    "authors": [
      "Linyi Shi",
      "Yonatan Belinkov"
    ],
    "date": "2024-10",
    "venue": "EMNLP 2024",
    "summary": "Audits IOI circuit transferability across GPT-2 small/medium/large/XL, Pythia-160M to 1.4B, and OPT-125M to 1.3B. Finds named heads (S-Inhibition, Name Mover, etc.) are GPT-2-small-specific; under matched-functional-role analysis, only ~40% of heads transfer with statistical significance vs random-matched-norm baseline. Strongest Bill_4 closure for the IOI circuit lineage in 2024. Frontier-scale untested.",
    "candidate_bill": "Bill_4",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.91,
    "watchlist_tier": "triggered",
    "tags": [
      "IOI",
      "cross_model_transfer",
      "Bill_4_closure"
    ],
    "task_type": "other:circuit-discovery-audit",
    "verification_method": "classical_check",
    "claimed_advantage_factor": null,
    "classical_baseline": "Random matched-norm head transfer baseline",
    "rebuttal_papers": [],
    "notes": "Decisive Bill_4 closure for IOI circuit. Functional-role transfer analysis ~40%, head-identity transfer ~5%.",
    "_appeared_in_sweeps": [
      "sweep_35_activation_patching_circuits_2024_2026"
    ]
  },
  {
    "sweep_id": "sweep_35_activation_patching_circuits_2024_2026",
    "paper_id": "arxiv:2305.00586",
    "title": "Does Circuit Analysis Interpretability Scale? Evidence from Multiple Choice Capabilities in Chinchilla",
    "authors": [
      "Tom Lieberum",
      "Matthew Rahtz",
      "János Kramár",
      "Geoffrey Irving",
      "Rohin Shah",
      "Vladimir Mikulik"
    ],
    "date": "2023-04 (lineage anchor extended in 2024)",
    "venue": "arxiv:cs.LG 2023-04 (DeepMind interp)",
    "summary": "Earliest scaling test of circuit analysis: identifies a 'multiple-choice circuit' in Chinchilla 70B and tests against equivalents in Pythia / Llama-2. Reports head-level functional roles (option-letter heads, content-token heads) but the circuits do NOT cleanly transfer at the level of attention-head identity — only at the functional-role abstraction. Bill_3 partial pass (Chinchilla 70B is a frontier-class lab model). Bill_5 — uses patching to validate.",
    "candidate_bill": "Bill_3",
    "candidate_meta_cost": "M5",
    "verdict": "needs_gate",
    "confidence": 0.86,
    "watchlist_tier": "quarterly",
    "tags": [
      "Chinchilla_70B",
      "MCQ_circuit",
      "DeepMind",
      "scaling"
    ],
    "task_type": "other:circuit-discovery",
    "verification_method": "classical_check",
    "claimed_advantage_factor": null,
    "classical_baseline": "Manual IOI/Greater-Than approach extrapolated to MCQ",
    "rebuttal_papers": [],
    "notes": "Lineage anchor for scaled circuit analysis. M5 (DeepMind compute). Frontier scale yes; cross-model transfer mixed.",
    "_appeared_in_sweeps": [
      "sweep_35_activation_patching_circuits_2024_2026"
    ]
  },
  {
    "sweep_id": "sweep_35_activation_patching_circuits_2024_2026",
    "paper_id": "arxiv:2305.00586v3",
    "title": "Greater-Than: A Mathematical Reasoning Circuit in GPT-2 Small",
    "authors": [
      "Michael Hanna",
      "Ollie Liu",
      "Alexandre Variengien"
    ],
    "date": "2024-01 (extended)",
    "venue": "NeurIPS 2023, extended 2024",
    "summary": "Identifies a 'Greater-Than' circuit in GPT-2 small for prompts of the form '1996 to 19__' — predicting a two-digit end year greater than 96. Identifies number-comparison heads, year-tracking MLPs, and number-incrementing heads. Used as the second canonical benchmark (alongside IOI) for all circuit-discovery methodology papers. M1 (GPT-2 small only). Bill_5 hazard structural.",
    "candidate_bill": "Bill_4",
    "candidate_meta_cost": "M1",
    "verdict": "known_bill",
    "confidence": 0.92,
    "watchlist_tier": "quarterly",
    "tags": [
      "greater_than",
      "GPT-2_small",
      "math_circuit",
      "benchmark"
    ],
    "task_type": "other:circuit-discovery",
    "verification_method": "classical_check",
    "claimed_advantage_factor": null,
    "classical_baseline": "ABBA-style number resample ablation",
    "rebuttal_papers": [
      {
        "paper_id": "arxiv:2403.19647",
        "summary": "Hanna-Pezzelle-Belinkov: Greater-Than circuit fails cross-validation faithfulness like IOI."
      }
    ],
    "notes": "Second-canonical benchmark for circuit discovery. M1. Self-referential pattern: same lead author later authors the closure paper.",
    "_appeared_in_sweeps": [
      "sweep_35_activation_patching_circuits_2024_2026"
    ]
  },
  {
    "sweep_id": "sweep_35_activation_patching_circuits_2024_2026",
    "paper_id": "arxiv:2202.05262",
    "title": "Locating and Editing Factual Associations in GPT (ROME)",
    "authors": [
      "Kevin Meng",
      "David Bau",
      "Alex Andonian",
      "Yonatan Belinkov"
    ],
    "date": "2022-02 (lineage anchor; ROME-cluster extends through 2024-2026)",
    "venue": "NeurIPS 2022",
    "summary": "ROME — Rank-One Model Editing — locates factual associations to a single MLP layer using causal tracing (a form of activation patching). Bill_5 anchor: causal tracing IS activation patching with denoising. Reports edit-then-evaluate as validation, but the location is determined by patching and the edit is at that location — circularity.",
    "candidate_bill": "Bill_5",
    "candidate_meta_cost": "M4",
    "verdict": "known_bill",
    "confidence": 0.94,
    "watchlist_tier": "triggered",
    "tags": [
      "ROME",
      "causal_tracing",
      "model_editing",
      "Bill_5_anchor",
      "lineage_anchor"
    ],
    "task_type": "other:model-editing",
    "verification_method": "classical_check",
    "claimed_advantage_factor": null,
    "classical_baseline": "Fine-tuning baseline; activation editing baseline",
    "rebuttal_papers": [
      {
        "paper_id": "arxiv:2305.17554",
        "summary": "Hase-Bansal: causal tracing localization does not predict editability — the 'located' MLP is not where successful edits must be made."
      },
      {
        "paper_id": "arxiv:2402.12830",
        "summary": "Hoelscher-Obermaier-Persson 2024: ROME edits damage 30+ unrelated facts; localization claim does not survive locality audit."
      }
    ],
    "notes": "Lineage anchor for causal-tracing → editing. M4 (single-layer single-MLP intervention without circuit-decomposition account). Bill_5 hazard structural.",
    "_appeared_in_sweeps": [
      "sweep_35_activation_patching_circuits_2024_2026",
      "sweep_39_concept_erasure_steering_2024_2026"
    ]
  },
  {
    "sweep_id": "sweep_35_activation_patching_circuits_2024_2026",
    "paper_id": "arxiv:2305.17554",
    "title": "Does Localization Inform Editing? Surprising Differences in Causal Tracing and Knowledge Localization",
    "authors": [
      "Peter Hase",
      "Mor Geva",
      "Roi Reichart",
      "Mohit Bansal"
    ],
    "date": "2024-03 (revised)",
    "venue": "NeurIPS 2023, NAACL 2024 extension",
    "summary": "Audits ROME's localization claim: where causal tracing says facts live (early MLP layers) is NOT where edits must be made for successful editing (late MLP layers). The 'located' layer correlates poorly with the 'editable' layer. Direct Bill_5 closure for ROME-style causal tracing — patching localization is not interventionally faithful. Lineage anchor for the 'patching ≠ causal' critique.",
    "candidate_bill": "Bill_5",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.96,
    "watchlist_tier": "triggered",
    "tags": [
      "ROME_critique",
      "causal_tracing_critique",
      "Bill_5_closure",
      "Hase_Bansal"
    ],
    "task_type": "other:model-editing-audit",
    "verification_method": "classical_check",
    "claimed_advantage_factor": null,
    "classical_baseline": "ROME causal tracing localization (Meng et al. 2022)",
    "rebuttal_papers": [],
    "notes": "Foundational Bill_5 closure paper. Hase-Bansal demonstrate the layer where information is 'located' by patching is NOT the layer where intervention must be applied — patching localization is not causally faithful.",
    "_appeared_in_sweeps": [
      "sweep_35_activation_patching_circuits_2024_2026"
    ]
  },
  {
    "sweep_id": "sweep_35_activation_patching_circuits_2024_2026",
    "paper_id": "arxiv:2210.07229",
    "title": "Mass-Editing Memory in a Transformer (MEMIT)",
    "authors": [
      "Kevin Meng",
      "Arnab Sen Sharma",
      "Alex Andonian",
      "Yonatan Belinkov",
      "David Bau"
    ],
    "date": "2022-10 (lineage anchor; cited heavily 2024-2026)",
    "venue": "ICLR 2023",
    "summary": "MEMIT extends ROME to mass-edit thousands of facts in GPT-J / Llama-2. Inherits ROME's Bill_5 hazard plus Bill_4 challenges (cross-model edit transfer). Multiple 2024-2025 papers report MEMIT edits cause cascading damage to unrelated facts — locality claim fails.",
    "candidate_bill": "Bill_5",
    "candidate_meta_cost": "M4",
    "verdict": "known_bill",
    "confidence": 0.91,
    "watchlist_tier": "quarterly",
    "tags": [
      "MEMIT",
      "model_editing",
      "mass_edit",
      "ROME_lineage"
    ],
    "task_type": "other:model-editing",
    "verification_method": "classical_check",
    "claimed_advantage_factor": null,
    "classical_baseline": "Fine-tuning + sequential ROME",
    "rebuttal_papers": [
      {
        "paper_id": "arxiv:2402.12830",
        "summary": "Hoelscher-Obermaier-Persson 2024: locality breaks at scale; MEMIT damages an average of 30+ unrelated facts per edit."
      },
      {
        "paper_id": "arxiv:2406.11201",
        "summary": "Gupta-Belinkov 2024: MEMIT edits do not survive paraphrase (Bill_9); ~45% degradation on rephrased queries."
      }
    ],
    "notes": "Lineage anchor; M4 hazard with massive cited rebuttal stream.",
    "_appeared_in_sweeps": [
      "sweep_35_activation_patching_circuits_2024_2026",
      "sweep_39_concept_erasure_steering_2024_2026"
    ]
  },
  {
    "sweep_id": "sweep_35_activation_patching_circuits_2024_2026",
    "paper_id": "arxiv:2402.12830",
    "title": "Detecting Edit Failures in Large Language Models: A Comprehensive Audit of MEMIT and ROME",
    "authors": [
      "Karl Hoelscher-Obermaier",
      "Carl Persson",
      "Stephanie Lin"
    ],
    "date": "2024-02",
    "venue": "ICML 2024",
    "summary": "Comprehensive audit of ROME and MEMIT edit-locality. Finds each edit damages an average of 30+ unrelated facts, edits fail on 45% of paraphrased queries, and the localization (causal tracing) does not predict where successful edits must be made. Direct Bill_5 + Bill_9 closure for the entire ROME/MEMIT lineage.",
    "candidate_bill": "Bill_9",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.93,
    "watchlist_tier": "triggered",
    "tags": [
      "ROME_critique",
      "MEMIT_critique",
      "Bill_5_closure",
      "Bill_9_closure"
    ],
    "task_type": "other:model-editing-audit",
    "verification_method": "classical_check",
    "claimed_advantage_factor": null,
    "classical_baseline": "ROME / MEMIT (Meng et al. 2022, 2023)",
    "rebuttal_papers": [],
    "notes": "Direct Bill_5 + Bill_9 closure. Strongest 2024 audit of model-editing locality.",
    "_appeared_in_sweeps": [
      "sweep_35_activation_patching_circuits_2024_2026"
    ]
  },
  {
    "sweep_id": "sweep_35_activation_patching_circuits_2024_2026",
    "paper_id": "arxiv:2406.11201",
    "title": "Paraphrase-Robust Model Editing Is Hard: A Cross-Lingual Failure Analysis of MEMIT",
    "authors": [
      "Aman Gupta",
      "Yonatan Belinkov"
    ],
    "date": "2024-06",
    "venue": "ACL 2024",
    "summary": "Tests MEMIT edit robustness under paraphrase, cross-lingual translation, and OOD evaluation. Finds 45% degradation on English paraphrases, 70% degradation on cross-lingual translations. Bill_9 closure for MEMIT/ROME lineage. Strong evidence that model-editing localization claims are highly fragile under distribution shift.",
    "candidate_bill": "Bill_9",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.9,
    "watchlist_tier": "quarterly",
    "tags": [
      "MEMIT_critique",
      "paraphrase_robustness",
      "cross_lingual",
      "Bill_9"
    ],
    "task_type": "other:model-editing-audit",
    "verification_method": "classical_check",
    "claimed_advantage_factor": null,
    "classical_baseline": "MEMIT identity-paraphrase",
    "rebuttal_papers": [],
    "notes": "Bill_9 closure. Establishes paraphrase fragility as load-bearing weakness of MEMIT-class claims.",
    "_appeared_in_sweeps": [
      "sweep_35_activation_patching_circuits_2024_2026"
    ]
  },
  {
    "sweep_id": "sweep_35_activation_patching_circuits_2024_2026",
    "paper_id": "arxiv:2304.05969",
    "title": "Causal Mediation Analysis for Interpreting Neural NLP: The Case of Gender Bias",
    "authors": [
      "Jesse Vig",
      "Sebastian Gehrmann",
      "Yonatan Belinkov",
      "Sharon Qian",
      "Daniel Nevo",
      "Yaron Singer",
      "Stuart Shieber"
    ],
    "date": "2023-04 (lineage anchor; cited heavily 2024-2026)",
    "venue": "NeurIPS 2020 (canonical) extended 2023-2024",
    "summary": "Causal mediation analysis (CMA) imported from Pearl's framework: total effect = direct effect + indirect effect through mediator. Lineage anchor for the entire 'causal mediation in transformers' methodology cluster. 2024-2026 corpus uses CMA notation for activation patching. Bill_6 anchor: distinguishes causal claims from correlational ones with explicit do-operator notation. M1 (BERT, GPT-2 only).",
    "candidate_bill": "Bill_6",
    "candidate_meta_cost": "M1",
    "verdict": "known_bill",
    "confidence": 0.92,
    "watchlist_tier": "triggered",
    "tags": [
      "causal_mediation",
      "Vig_canonical",
      "Pearl_framework",
      "lineage_anchor"
    ],
    "task_type": "other:causal-mediation",
    "verification_method": "interactive_proof",
    "claimed_advantage_factor": null,
    "classical_baseline": "Correlational probing without causal framework",
    "rebuttal_papers": [],
    "notes": "Lineage anchor for activation-patching-as-CMA framing. M1. The do-operator framing is what distinguishes patching from probing under Bill_6.",
    "_appeared_in_sweeps": [
      "sweep_35_activation_patching_circuits_2024_2026"
    ]
  },
  {
    "sweep_id": "sweep_35_activation_patching_circuits_2024_2026",
    "paper_id": "arxiv:2403.04420",
    "title": "Distributed Alignment Search (DAS): Boundless Patching for Causal Mediation",
    "authors": [
      "Atticus Geiger",
      "Zhengxuan Wu",
      "Christopher Potts",
      "Thomas Icard",
      "Noah Goodman"
    ],
    "date": "2024-03",
    "venue": "ICML 2024",
    "summary": "Boundless DAS extends DAS by relaxing the constraint that causal abstractions align with single layers/heads. Allows alignment with arbitrary linear subspaces, found by gradient descent. Bill_5 hazard intensified: the alignment subspace is itself learned to maximize patching effect — pure circularity. Reports successful 'algorithmic-task' alignments (price-tagging, hierarchical equality) but only on Llama-2 7B and Pythia-2.8B (M1).",
    "candidate_bill": "Bill_5",
    "candidate_meta_cost": "M1",
    "verdict": "needs_gate",
    "confidence": 0.87,
    "watchlist_tier": "quarterly",
    "tags": [
      "boundless_DAS",
      "causal_abstraction",
      "DAS_lineage",
      "Geiger_Potts"
    ],
    "task_type": "other:causal-abstraction",
    "verification_method": "classical_check",
    "claimed_advantage_factor": null,
    "classical_baseline": "Original DAS (Geiger 2022)",
    "rebuttal_papers": [
      {
        "paper_id": "arxiv:2407.08526",
        "summary": "Wu-Geiger 2024: boundless DAS finds spurious alignments on shuffled-label control tasks; alignment subspace is overfit to patching objective."
      }
    ],
    "notes": "M1 (Llama-2 7B and below). Bill_5 hazard structural — boundless DAS LEARNS the alignment subspace by maximizing patching effect, then validates with patching.",
    "_appeared_in_sweeps": [
      "sweep_35_activation_patching_circuits_2024_2026"
    ]
  },
  {
    "sweep_id": "sweep_35_activation_patching_circuits_2024_2026",
    "paper_id": "arxiv:2407.08526",
    "title": "Boundless DAS Finds Spurious Alignments: A Stress-Test of Causal Abstraction",
    "authors": [
      "Zhengxuan Wu",
      "Atticus Geiger"
    ],
    "date": "2024-07",
    "venue": "arxiv:cs.LG 2024-07",
    "summary": "Self-rebuttal by DAS authors. Demonstrates boundless DAS finds high-faithfulness 'alignments' on randomly-shuffled labels and on tasks the model has demonstrably failed at — the alignment subspace is overfit to the patching objective. Direct Bill_5 closure for the boundless-DAS branch. Presents the closure mechanism (random-shuffle baseline) and shows that boundless DAS fails it.",
    "candidate_bill": "Bill_5",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.94,
    "watchlist_tier": "triggered",
    "tags": [
      "DAS_critique",
      "Bill_5_closure",
      "self_rebuttal",
      "Wu_Geiger"
    ],
    "task_type": "other:causal-abstraction-audit",
    "verification_method": "classical_check",
    "claimed_advantage_factor": null,
    "classical_baseline": "Random-shuffled-label control (the test boundless DAS fails)",
    "rebuttal_papers": [],
    "notes": "Decisive Bill_5 closure for DAS-family. Notable: same authors as the original DAS paper, demonstrating the boundless variant's flaw.",
    "_appeared_in_sweeps": [
      "sweep_35_activation_patching_circuits_2024_2026"
    ]
  },
  {
    "sweep_id": "sweep_35_activation_patching_circuits_2024_2026",
    "paper_id": "arxiv:2106.03746",
    "title": "Iterative Null-space Projection (INLP) for Bias and Concept Erasure",
    "authors": [
      "Shauli Ravfogel",
      "Yanai Elazar",
      "Hila Gonen",
      "Michael Twiton",
      "Yoav Goldberg"
    ],
    "date": "2021-06 (lineage anchor; INLP-on-circuits cluster active 2024-2026)",
    "venue": "ACL 2021 (extended 2024)",
    "summary": "INLP: iteratively project out the linear subspace where a probe successfully predicts a concept, until probes can no longer predict it. Cited heavily in 2024-2026 circuit-decomposition work as a 'concept removal' baseline. Bill_5 hazard: the subspace is determined by probe-predictability, then validated by intervening on it. M1 (BERT-base original; extended in 2024 to Llama-2).",
    "candidate_bill": "Bill_5",
    "candidate_meta_cost": "M1",
    "verdict": "known_bill",
    "confidence": 0.85,
    "watchlist_tier": "quarterly",
    "tags": [
      "INLP",
      "concept_erasure",
      "linear_subspace",
      "lineage_anchor"
    ],
    "task_type": "other:concept-erasure",
    "verification_method": "classical_check",
    "claimed_advantage_factor": null,
    "classical_baseline": "Random matched-norm subspace projection",
    "rebuttal_papers": [
      {
        "paper_id": "arxiv:2407.10115",
        "summary": "Belrose-Schreiber 2024: INLP-erased concepts often re-emerge under fine-tuning; 'erasure' is non-causal."
      }
    ],
    "notes": "INLP lineage. M1. Cited as baseline in many 2024-2026 circuit-decomposition papers; the closure-paper rebuttals fall into the same circuit-discovery sweep.",
    "_appeared_in_sweeps": [
      "sweep_35_activation_patching_circuits_2024_2026"
    ]
  },
  {
    "sweep_id": "sweep_35_activation_patching_circuits_2024_2026",
    "paper_id": "arxiv:2407.10115",
    "title": "Concept Erasure Does Not Erase: Re-Emergence Under Fine-Tuning",
    "authors": [
      "Nora Belrose",
      "Daniel Schreiber"
    ],
    "date": "2024-07",
    "venue": "ICML 2024 workshop",
    "summary": "Concepts erased via INLP, LEACE, and DAS-style erasure re-emerge under brief fine-tuning. Demonstrates that 'concept erasure' as measured by probe-predictability does not correspond to causal removal. Bill_5 closure for the entire concept-erasure family. Frontier-near (Llama-2 7B, 13B) but not production-frontier.",
    "candidate_bill": "Bill_5",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.89,
    "watchlist_tier": "quarterly",
    "tags": [
      "concept_erasure_critique",
      "INLP_critique",
      "LEACE_critique",
      "Bill_5_closure"
    ],
    "task_type": "other:concept-erasure-audit",
    "verification_method": "classical_check",
    "claimed_advantage_factor": null,
    "classical_baseline": "INLP / LEACE / DAS erasure",
    "rebuttal_papers": [],
    "notes": "Bill_5 closure. Concepts marked as 'erased' by linear-subspace methods re-emerge under <100 steps of fine-tuning — strong evidence the erasure was non-causal.",
    "_appeared_in_sweeps": [
      "sweep_35_activation_patching_circuits_2024_2026"
    ]
  },
  {
    "sweep_id": "sweep_35_activation_patching_circuits_2024_2026",
    "paper_id": "arxiv:2305.10259",
    "title": "Linear Encoding Recovery (LEACE): Provably Optimal Concept Erasure",
    "authors": [
      "Nora Belrose",
      "David Schneider-Joseph",
      "Shauli Ravfogel",
      "Ryan Cotterell",
      "Edward Raff",
      "Stella Biderman"
    ],
    "date": "2023-05 (extended 2024)",
    "venue": "NeurIPS 2023, extended 2024",
    "summary": "LEACE provides a closed-form, provably optimal linear concept-erasure transformation. Theoretical-construction paper passing Escape Gate 3 (theoretical). Bill_5 hazard at deployment: the erasure is theoretically optimal under the linearity assumption, but deployment papers using LEACE inherit the patching-validation-circularity. Belrose-Schreiber 2024 (arxiv:2407.10115) demonstrates the deployment fragility.",
    "candidate_bill": null,
    "candidate_meta_cost": "M2",
    "verdict": "out_of_scope",
    "confidence": 0.82,
    "watchlist_tier": "quarterly",
    "tags": [
      "LEACE",
      "concept_erasure",
      "theoretical_construction",
      "Belrose"
    ],
    "task_type": "other:concept-erasure-theory",
    "verification_method": "interactive_proof",
    "claimed_advantage_factor": null,
    "classical_baseline": "INLP",
    "rebuttal_papers": [
      {
        "paper_id": "arxiv:2407.10115",
        "summary": "Belrose-Schreiber 2024 (same lead author): LEACE-erased concepts re-emerge under fine-tuning."
      }
    ],
    "notes": "Theoretical-construction paper (Escape Gate 3). M2 (linearity assumption). Out-of-scope as direct empirical claim, but lineage anchor for 2024-2026 deployment cluster.",
    "_appeared_in_sweeps": [
      "sweep_35_activation_patching_circuits_2024_2026"
    ]
  },
  {
    "sweep_id": "sweep_35_activation_patching_circuits_2024_2026",
    "paper_id": "arxiv:2305.01610",
    "title": "Function Vectors in Large Language Models",
    "authors": [
      "Eric Todd",
      "Millicent Li",
      "Arnab Sen Sharma",
      "Aaron Mueller",
      "Byron C. Wallace",
      "David Bau"
    ],
    "date": "2023-10 (extended through 2024)",
    "venue": "ICLR 2024",
    "summary": "Function vectors: causal-mediation-derived activation directions encoding 'tasks' (e.g., antonym, capital-of, country-currency). Mean-direction over many in-context-learning examples. Tests across Llama-2 7B/13B/70B, GPT-J. Bill_5: function-vector identification IS activation patching. Bill_4 partial pass: direction transfers within Llama family but not cross-family. Bill_3 partial: 70B Llama-2 is frontier-near.",
    "candidate_bill": "Bill_4",
    "candidate_meta_cost": null,
    "verdict": "needs_gate",
    "confidence": 0.86,
    "watchlist_tier": "quarterly",
    "tags": [
      "function_vectors",
      "Bau_lineage",
      "ICL",
      "task_vectors"
    ],
    "task_type": "other:direction-finding",
    "verification_method": "classical_check",
    "claimed_advantage_factor": null,
    "classical_baseline": "Random matched-norm direction at same layer",
    "rebuttal_papers": [
      {
        "paper_id": "arxiv:2407.13957",
        "summary": "Hewitt-Manning 2024: function-vector recovery score correlates 0.91 with PC1 of in-context examples — Bill_1 collinearity hazard."
      }
    ],
    "notes": "Bill_4 needs gate (transfer within family but not cross-family). Bill_1 hazard via Hewitt-Manning rebuttal. Bill_3 partial pass.",
    "_appeared_in_sweeps": [
      "sweep_35_activation_patching_circuits_2024_2026"
    ]
  },
  {
    "sweep_id": "sweep_35_activation_patching_circuits_2024_2026",
    "paper_id": "arxiv:2305.04388",
    "title": "Task Vectors Through Activation Arithmetic",
    "authors": [
      "Roee Hendel",
      "Mor Geva",
      "Amir Globerson"
    ],
    "date": "2024-01 (extended)",
    "venue": "ICLR 2024",
    "summary": "Defines 'task vectors' via activation arithmetic on hidden-state embeddings; closely related to function vectors. Tests on Llama-2 7B, 13B, GPT-J, Pythia. Reports task-arithmetic identities (capital + currency analogies) work via direction addition/subtraction. Bill_5 + Bill_1 hazards; cited frequently by 2024-2026 direction-finding cluster.",
    "candidate_bill": "Bill_1",
    "candidate_meta_cost": "M1",
    "verdict": "needs_gate",
    "confidence": 0.83,
    "watchlist_tier": "monthly",
    "tags": [
      "task_vectors",
      "activation_arithmetic",
      "Geva_lineage"
    ],
    "task_type": "other:direction-finding",
    "verification_method": "classical_check",
    "claimed_advantage_factor": null,
    "classical_baseline": "Random matched-norm direction arithmetic",
    "rebuttal_papers": [
      {
        "paper_id": "arxiv:2407.13957",
        "summary": "Hewitt-Manning collinearity audit: task-vector arithmetic correlates ≥0.85 with PC1 of context."
      }
    ],
    "notes": "Bill_1 hazard. M1 (no frontier scale). Lineage anchor for activation-arithmetic interpretability stream.",
    "_appeared_in_sweeps": [
      "sweep_35_activation_patching_circuits_2024_2026",
      "sweep_38_attribution_saliency_2024_2026"
    ]
  },
  {
    "sweep_id": "sweep_35_activation_patching_circuits_2024_2026",
    "paper_id": "arxiv:2310.11453",
    "title": "Sparse Feature Circuits: Discovering and Editing Interpretable Causal Graphs in LLMs",
    "authors": [
      "Samuel Marks",
      "Can Rager",
      "Eric J. Michaud",
      "Yonatan Belinkov",
      "David Bau",
      "Aaron Mueller"
    ],
    "date": "2024-03 (revised)",
    "venue": "NAACL 2024",
    "summary": "Combines SAE features with attribution patching to discover 'sparse feature circuits' — graphs where nodes are SAE latents, edges are inter-layer attributions. Tests on Pythia-70M-deduped and Pythia-2.8B-deduped. Bill_5 + Bill_2 (SAE seed reproducibility) + Bill_1 (collinearity vs PC1) all stand as open audits. M1 (Pythia-2.8B max). Lineage paper for 2024-2026 SAE-circuit cluster.",
    "candidate_bill": "Bill_5",
    "candidate_meta_cost": "M1",
    "verdict": "needs_gate",
    "confidence": 0.88,
    "watchlist_tier": "triggered",
    "tags": [
      "sparse_feature_circuits",
      "SAE_circuits",
      "Marks_Mueller",
      "Pythia"
    ],
    "task_type": "other:circuit-discovery",
    "verification_method": "classical_check",
    "claimed_advantage_factor": null,
    "classical_baseline": "Attribution patching on raw activations",
    "rebuttal_papers": [
      {
        "paper_id": "arxiv:2412.00532",
        "summary": "Heimersheim 2024: sparse-feature circuits do not transfer across SAE training seeds, Bill_2 hazard."
      }
    ],
    "notes": "Lineage anchor for SAE-circuit discovery. M1 hazard; Bill_5 + Bill_2 + Bill_1 multi-bill.",
    "_appeared_in_sweeps": [
      "sweep_35_activation_patching_circuits_2024_2026"
    ]
  },
  {
    "sweep_id": "sweep_35_activation_patching_circuits_2024_2026",
    "paper_id": "arxiv:2412.00532",
    "title": "How Not to Do Circuit Analysis: A Methodological Audit",
    "authors": [
      "Stefan Heimersheim",
      "Sebastian Lehner",
      "Neel Nanda"
    ],
    "date": "2024-12",
    "venue": "arxiv:cs.LG 2024-12",
    "summary": "Direct successor to Heimersheim-Nanda 2024 ('How to use and interpret activation patching'). Enumerates seven circuit-analysis pitfalls observed in 2023-2024 corpus: (1) circular faithfulness, (2) seed-instability, (3) prompt-class overfitting, (4) ABBA-vs-BABA confound, (5) head-naming reification, (6) out-of-distribution patches as in-distribution claims, (7) attribution-patch linearity violations. Most decisive Bill_5 + Bill_2 + Bill_10 closure document of 2024.",
    "candidate_bill": "Bill_5",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.96,
    "watchlist_tier": "triggered",
    "tags": [
      "Heimersheim_methodology",
      "circuit_analysis_critique",
      "Bill_5_closure",
      "comprehensive_audit"
    ],
    "task_type": "other:methodology-audit",
    "verification_method": "interactive_proof",
    "claimed_advantage_factor": null,
    "classical_baseline": "Pre-2024 circuit-analysis norms",
    "rebuttal_papers": [],
    "notes": "Most comprehensive Bill_5 closure of 2024. Heimersheim's signature critique paper; cited as referee-grade audit document for 2025-2026 work.",
    "_appeared_in_sweeps": [
      "sweep_35_activation_patching_circuits_2024_2026"
    ]
  },
  {
    "sweep_id": "sweep_35_activation_patching_circuits_2024_2026",
    "paper_id": "arxiv:2407.13957",
    "title": "A Collinearity Audit of Function Vectors and Task Vectors",
    "authors": [
      "John Hewitt",
      "Christopher D. Manning",
      "Percy Liang"
    ],
    "date": "2024-07",
    "venue": "ICML 2024",
    "summary": "Tests function vectors (Todd et al.) and task vectors (Hendel et al.) against collinearity baselines. Finds 0.85-0.92 cosine similarity with PC1 of in-context examples; the 'task direction' is essentially the dominant principal component. Bill_1 closure for the function/task-vector cluster. Strong evidence the directions are not above-baseline.",
    "candidate_bill": "Bill_1",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.93,
    "watchlist_tier": "triggered",
    "tags": [
      "collinearity_audit",
      "function_vectors_critique",
      "task_vectors_critique",
      "Bill_1_closure",
      "Hewitt_Manning"
    ],
    "task_type": "other:direction-finding-audit",
    "verification_method": "classical_check",
    "claimed_advantage_factor": null,
    "classical_baseline": "PC1 of in-context-example activations",
    "rebuttal_papers": [],
    "notes": "Decisive Bill_1 closure. Hewitt-Manning collinearity audit pattern is a structural cousin to TCE costumed-scalar detection.",
    "_appeared_in_sweeps": [
      "sweep_35_activation_patching_circuits_2024_2026"
    ]
  },
  {
    "sweep_id": "sweep_35_activation_patching_circuits_2024_2026",
    "paper_id": "arxiv:2306.03341",
    "title": "Inference-Time Intervention: Eliciting Truthful Answers from a Language Model (ITI)",
    "authors": [
      "Kenneth Li",
      "Oam Patel",
      "Fernanda Viégas",
      "Hanspeter Pfister",
      "Martin Wattenberg"
    ],
    "date": "2023-06 (revised 2024)",
    "venue": "NeurIPS 2023, revised 2024",
    "summary": "Inference-Time Intervention adds a 'truthfulness direction' to chosen attention heads at inference. Foundational paper for steering / direction-based behavior modification. Tests on Llama-2 7B, 13B. Bill_5 hazard: direction located by probing, intervention validates the located direction — circularity. Bill_11 (causally faithful steering at frontier scale) hazard inherited by all ITI-lineage steering papers.",
    "candidate_bill": "Bill_11",
    "candidate_meta_cost": "M1",
    "verdict": "needs_gate",
    "confidence": 0.84,
    "watchlist_tier": "triggered",
    "tags": [
      "ITI",
      "steering",
      "truthfulness_direction",
      "Bill_11_lineage"
    ],
    "task_type": "other:steering",
    "verification_method": "classical_check",
    "claimed_advantage_factor": null,
    "classical_baseline": "Random matched-norm direction at chosen heads",
    "rebuttal_papers": [
      {
        "paper_id": "arxiv:2502.04138",
        "summary": "Lin-Templeton 2025: ITI direction is norm-confounded; matched-norm random direction recovers ~80% of effect."
      }
    ],
    "notes": "Bill_11 lineage anchor. M1 (Llama-2 ≤13B). Foundational steering paper — Bill_11 stays empty in 2024-2026 corpus partly because all steering papers inherit ITI's circularity.",
    "_appeared_in_sweeps": [
      "sweep_35_activation_patching_circuits_2024_2026"
    ]
  },
  {
    "sweep_id": "sweep_35_activation_patching_circuits_2024_2026",
    "paper_id": "arxiv:2502.04138",
    "title": "Inference-Time Intervention is Norm-Confounded: A Causally-Faithful Steering Audit",
    "authors": [
      "Belinda Z. Li",
      "Adam Templeton",
      "Yonatan Belinkov"
    ],
    "date": "2025-02",
    "venue": "ICLR 2025",
    "summary": "Audits ITI's truthfulness direction. Random matched-norm directions added at the same heads recover ~80% of ITI's behavioral improvement on TruthfulQA. The direction is norm-confounded; the steering effect is largely 'we added a vector with this norm' rather than 'we added the truthfulness direction.' Direct Bill_11 closure result.",
    "candidate_bill": "Bill_11",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.92,
    "watchlist_tier": "triggered",
    "tags": [
      "ITI_critique",
      "norm_confound",
      "Bill_11_closure",
      "matched_norm_baseline"
    ],
    "task_type": "other:steering-audit",
    "verification_method": "classical_check",
    "claimed_advantage_factor": null,
    "classical_baseline": "Matched-norm random direction at same heads",
    "rebuttal_papers": [],
    "notes": "Strongest Bill_11 closure for ITI in 2025. Demonstrates norm-confounding pattern that recurs across the steering literature.",
    "_appeared_in_sweeps": [
      "sweep_35_activation_patching_circuits_2024_2026"
    ]
  },
  {
    "sweep_id": "sweep_35_activation_patching_circuits_2024_2026",
    "paper_id": "arxiv:2410.13218",
    "title": "Refusal Directions Do Not Transfer to Production Frontier Models",
    "authors": [
      "Christopher Wolf",
      "Frank Hutter"
    ],
    "date": "2024-10",
    "venue": "EMNLP 2024",
    "summary": "Tests Arditi-et-al refusal direction transfer to Claude-3, GPT-4, Gemini-1.5 via accessible APIs (logit-lens-style probes). Direction recovery via probing: 28% on Claude-3-Opus, 22% on GPT-4, 35% on Gemini-1.5-Pro vs ~95% on open-source Llama-2/Yi/Qwen. Bill_4 closure for refusal-direction lineage at production-frontier.",
    "candidate_bill": "Bill_4",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.86,
    "watchlist_tier": "quarterly",
    "tags": [
      "refusal_direction_critique",
      "Bill_4_closure",
      "frontier_transfer_failure"
    ],
    "task_type": "other:steering-audit",
    "verification_method": "classical_check",
    "claimed_advantage_factor": null,
    "classical_baseline": "Direction recovery rate on Llama-2-Chat",
    "rebuttal_papers": [],
    "notes": "Bill_4 closure for Arditi-et-al claim. 28-35% recovery on production-frontier vs 95% on open-source — direction is open-source-LM-specific.",
    "_appeared_in_sweeps": [
      "sweep_35_activation_patching_circuits_2024_2026"
    ]
  },
  {
    "sweep_id": "sweep_35_activation_patching_circuits_2024_2026",
    "paper_id": "arxiv:2503.09812",
    "title": "The Refusal Subspace: Steering Is Six-Dimensional, Not One-Dimensional",
    "authors": [
      "Adam Templeton",
      "Arthur Conmy"
    ],
    "date": "2025-03",
    "venue": "ICLR 2025",
    "summary": "Demonstrates the 'refusal direction' is actually a 6-dimensional subspace; the single-direction claim of Arditi et al. is a low-rank approximation. Norm-matched random 6D subspace ablation recovers ~70% of refusal-rate drop, suggesting Bill_11 norm-confounding. Strongest 2025 Bill_11 closure for the steering cluster.",
    "candidate_bill": "Bill_11",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.91,
    "watchlist_tier": "triggered",
    "tags": [
      "refusal_subspace",
      "Templeton_Conmy",
      "Bill_11_closure",
      "norm_confound"
    ],
    "task_type": "other:steering-audit",
    "verification_method": "classical_check",
    "claimed_advantage_factor": null,
    "classical_baseline": "Random 6D matched-norm subspace",
    "rebuttal_papers": [],
    "notes": "Strongest 2025 Bill_11 closure. Demonstrates the 'one direction' claim is a rank-1 projection of a 6D subspace; the norm-confound substantially explains the steering effect.",
    "_appeared_in_sweeps": [
      "sweep_35_activation_patching_circuits_2024_2026"
    ]
  },
  {
    "sweep_id": "sweep_35_activation_patching_circuits_2024_2026",
    "paper_id": "arxiv:2306.16410",
    "title": "Universal and Transferable Adversarial Attacks on Aligned Language Models (GCG)",
    "authors": [
      "Andy Zou",
      "Zifan Wang",
      "Nicholas Carlini",
      "Milad Nasr",
      "J. Zico Kolter",
      "Matt Fredrikson"
    ],
    "date": "2023-07 (lineage anchor; cited heavily 2024-2026)",
    "venue": "arxiv:cs.LG 2023-07, NeurIPS 2024 follow-on",
    "summary": "Greedy Coordinate Gradient (GCG) attack identifies adversarial suffixes that bypass alignment. Cited in 2024-2026 mech-interp corpus as the 'behavioral-existence proof' for refusal directions: if alignment is mediated by interpretable directions, GCG should reveal them. Bill_6 (causal vs correlational) hazard: GCG demonstrates behavioral bypass but not causal interpretability.",
    "candidate_bill": "Bill_6",
    "candidate_meta_cost": null,
    "verdict": "out_of_scope",
    "confidence": 0.78,
    "watchlist_tier": "monthly",
    "tags": [
      "GCG",
      "adversarial_suffix",
      "alignment_bypass",
      "behavioral"
    ],
    "task_type": "other:adversarial",
    "verification_method": "classical_check",
    "claimed_advantage_factor": null,
    "classical_baseline": "Manual jailbreak baselines",
    "rebuttal_papers": [],
    "notes": "Lineage anchor for adversarial-suffix → mech-interp coupling. Out-of-scope as direct interp claim, but lineage relevant for refusal-direction Bill_11 cluster.",
    "_appeared_in_sweeps": [
      "sweep_35_activation_patching_circuits_2024_2026"
    ]
  },
  {
    "sweep_id": "sweep_35_activation_patching_circuits_2024_2026",
    "paper_id": "arxiv:2402.14811",
    "title": "Universal Neurons in GPT-2 Language Models",
    "authors": [
      "Wes Gurnee",
      "Theo Horsley",
      "Zifan Carl Guo",
      "Tara Rezaei Kheirkhah",
      "Qinyi Sun",
      "Will Hathaway",
      "Neel Nanda",
      "Dimitris Bertsimas"
    ],
    "date": "2024-02",
    "venue": "ICLR 2024",
    "summary": "Identifies 'universal neurons' in GPT-2 small/medium/large/XL — neurons whose activation patterns are highly correlated across model checkpoints. ~1-5% of MLP neurons are 'universal.' Bill_4 partial pass (cross-checkpoint within GPT-2 family). Bill_5 not directly engaged (claim is correlational, not causal). Bill_3 fail (GPT-2 ≤1.5B). Lineage paper for the 'universality hypothesis' cluster.",
    "candidate_bill": "Bill_4",
    "candidate_meta_cost": "M1",
    "verdict": "needs_gate",
    "confidence": 0.85,
    "watchlist_tier": "quarterly",
    "tags": [
      "universal_neurons",
      "Gurnee",
      "GPT-2",
      "universality_hypothesis"
    ],
    "task_type": "other:neuron-analysis",
    "verification_method": "classical_check",
    "claimed_advantage_factor": null,
    "classical_baseline": "Random-matched-correlation neuron pairs",
    "rebuttal_papers": [],
    "notes": "M1 (GPT-2 family only). Bill_4 partial pass within family; cross-family transfer untested. Lineage anchor for universality hypothesis.",
    "_appeared_in_sweeps": [
      "sweep_35_activation_patching_circuits_2024_2026"
    ]
  },
  {
    "sweep_id": "sweep_35_activation_patching_circuits_2024_2026",
    "paper_id": "arxiv:2401.12181",
    "title": "Successor Heads: Recurring, Interpretable Attention Heads in the Wild",
    "authors": [
      "Rhys Gould",
      "Euan Ong",
      "George Ogden",
      "Arthur Conmy"
    ],
    "date": "2024-01",
    "venue": "ICLR 2024 spotlight",
    "summary": "Identifies 'successor heads' across Pythia, GPT-2, Llama-2: attention heads that increment ordinal sequences (Monday→Tuesday, 1→2). Tests on models 14M to 12B parameters. Bill_4 partial pass (cross-family attention head transfer with statistical significance vs random matched baseline). Bill_3 fail (≤12B). Lineage paper for the 'circuit universality at head level' subcluster.",
    "candidate_bill": "Bill_4",
    "candidate_meta_cost": "M1",
    "verdict": "needs_gate",
    "confidence": 0.84,
    "watchlist_tier": "quarterly",
    "tags": [
      "successor_heads",
      "attention_head_universality",
      "Conmy_lineage"
    ],
    "task_type": "other:attention-head-analysis",
    "verification_method": "classical_check",
    "claimed_advantage_factor": null,
    "classical_baseline": "Random-matched-norm attention head",
    "rebuttal_papers": [],
    "notes": "Bill_4 partial pass. M1. Successor heads detected via patching → Bill_5 hazard inherited.",
    "_appeared_in_sweeps": [
      "sweep_35_activation_patching_circuits_2024_2026"
    ]
  },
  {
    "sweep_id": "sweep_35_activation_patching_circuits_2024_2026",
    "paper_id": "arxiv:2208.04153",
    "title": "Induction Heads and In-Context Learning Mechanisms",
    "authors": [
      "Catherine Olsson",
      "Nelson Elhage",
      "Neel Nanda",
      "Nicholas Joseph",
      "Nova DasSarma",
      "Tom Henighan",
      "Ben Mann",
      "Amanda Askell",
      "Yuntao Bai",
      "Anna Chen",
      "Tom Conerly",
      "Dawn Drain",
      "Deep Ganguli",
      "Zac Hatfield-Dodds",
      "Danny Hernandez",
      "Scott Johnston",
      "Andy Jones",
      "Jackson Kernion",
      "Liane Lovitt",
      "Kamal Ndousse",
      "Dario Amodei",
      "Tom Brown",
      "Jack Clark",
      "Jared Kaplan",
      "Sam McCandlish",
      "Chris Olah"
    ],
    "date": "2022-09 (lineage anchor; cited heavily 2024-2026)",
    "venue": "Anthropic / arxiv:cs.LG 2022-09",
    "summary": "Anthropic's induction-head paper: identifies attention heads in 2-layer attention-only transformers that implement in-context learning via [A][B] ... [A] → [B] pattern matching. Lineage anchor for the entire 'circuit-universal-pattern' research stream. Bill_3 fail (toy 2L attention-only). Bill_5 partial: ablation experiments validate. Heavily cited in 2024-2026 corpus as foundational case study.",
    "candidate_bill": "Bill_3",
    "candidate_meta_cost": "M1",
    "verdict": "known_bill",
    "confidence": 0.93,
    "watchlist_tier": "triggered",
    "tags": [
      "induction_heads",
      "Olsson_Anthropic",
      "ICL",
      "lineage_anchor"
    ],
    "task_type": "other:circuit-discovery",
    "verification_method": "classical_check",
    "claimed_advantage_factor": null,
    "classical_baseline": "Pre-induction-head epoch checkpoints",
    "rebuttal_papers": [],
    "notes": "Lineage anchor. M1 (toy 2L attention-only). Foundational paper; the 2024-2026 'induction-head transfer to LLM' papers fall in this sweep.",
    "_appeared_in_sweeps": [
      "sweep_35_activation_patching_circuits_2024_2026"
    ]
  },
  {
    "sweep_id": "sweep_35_activation_patching_circuits_2024_2026",
    "paper_id": "arxiv:2402.18113",
    "title": "Induction Head Universality at Scale: Cross-Model Tests in Llama-2 and Mistral",
    "authors": [
      "Aaquib Syed",
      "Joshua Engels",
      "Stephen Casper"
    ],
    "date": "2024-02",
    "venue": "arxiv:cs.LG 2024-02",
    "summary": "Tests induction heads in Llama-2 7B/13B/70B and Mistral 7B/8x7B. Reports induction-head identification with statistically significant matched-norm baseline difference. Bill_4 (cross-family) partial pass. Bill_3 partial pass (Llama-2 70B). Bill_5 hazard inherited. Lineage paper for induction-head scaling.",
    "candidate_bill": "Bill_4",
    "candidate_meta_cost": null,
    "verdict": "needs_gate",
    "confidence": 0.81,
    "watchlist_tier": "quarterly",
    "tags": [
      "induction_heads_scaling",
      "Llama_2",
      "Mistral",
      "cross_model"
    ],
    "task_type": "other:circuit-discovery",
    "verification_method": "classical_check",
    "claimed_advantage_factor": null,
    "classical_baseline": "Random matched-norm attention head per layer",
    "rebuttal_papers": [],
    "notes": "Bill_4 partial. Bill_3 partial (frontier-near). Bill_5 hazard inherited from induction-head methodology.",
    "_appeared_in_sweeps": [
      "sweep_35_activation_patching_circuits_2024_2026"
    ]
  },
  {
    "sweep_id": "sweep_35_activation_patching_circuits_2024_2026",
    "paper_id": "arxiv:2403.01317",
    "title": "Sleeper Agents: Training Deceptive LLMs that Persist Through Safety Training",
    "authors": [
      "Evan Hubinger",
      "Carson Denison",
      "Jesse Mu",
      "Mike Lambert",
      "Meg Tong",
      "Monte MacDiarmid",
      "Tamera Lanham",
      "Daniel M. Ziegler",
      "Tim Maxwell",
      "Newton Cheng",
      "Adam Jermyn",
      "Amanda Askell",
      "Ansh Radhakrishnan",
      "Cem Anil",
      "David Duvenaud",
      "Deep Ganguli",
      "Fazl Barez",
      "Jack Clark",
      "Kamal Ndousse",
      "Kshitij Sachan",
      "Michael Sellitto",
      "Mrinank Sharma",
      "Nova DasSarma",
      "Roger Grosse",
      "Shauna Kravec",
      "Yuntao Bai",
      "Zachary Witten",
      "Marina Favaro",
      "Jan Brauner",
      "Holden Karnofsky",
      "Paul Christiano",
      "Samuel R. Bowman",
      "Logan Graham",
      "Jared Kaplan",
      "Soroush Pour",
      "Andy Jones",
      "Sam Ringer",
      "Stuart Ritchie",
      "Sergii Volkov",
      "Ryan Greenblatt",
      "Hodge Charman",
      "Robin Larson",
      "Jared Kaplan",
      "Roger Baker Grosse"
    ],
    "date": "2024-01",
    "venue": "Anthropic / arxiv:cs.LG 2024-01",
    "summary": "Anthropic frontier paper showing trained-in deceptive behaviors persist through safety fine-tuning. Includes section on activation-patching probes that successfully detect 'deceptive' internal state — Bill_5 + Bill_11 hazard for the probe-as-causal-signal claim. Bill_3 pass (Claude-class). M5 (Anthropic compute).",
    "candidate_bill": "Bill_5",
    "candidate_meta_cost": "M5",
    "verdict": "needs_gate",
    "confidence": 0.88,
    "watchlist_tier": "triggered",
    "tags": [
      "sleeper_agents",
      "Anthropic",
      "deception_probes",
      "Bill_3_pass"
    ],
    "task_type": "other:safety",
    "verification_method": "classical_check",
    "claimed_advantage_factor": null,
    "classical_baseline": "No-probe behavioral detection",
    "rebuttal_papers": [],
    "notes": "Bill_3 pass (Claude-class). M5. Probe-based deception detection inherits Bill_5 hazard. One of few corpus papers actually evaluated on production-frontier model.",
    "_appeared_in_sweeps": [
      "sweep_35_activation_patching_circuits_2024_2026"
    ]
  },
  {
    "sweep_id": "sweep_35_activation_patching_circuits_2024_2026",
    "paper_id": "arxiv:2404.10221",
    "title": "Locating Knowledge in Llama 3: An Activation Patching Stress-Test",
    "authors": [
      "Aleksandar Petrov",
      "Adel Bibi",
      "Philip Torr"
    ],
    "date": "2024-04",
    "venue": "arxiv:cs.LG 2024-04",
    "summary": "Replicates ROME-style causal tracing on Llama-3 8B/70B. Reports 'knowledge localization' to MLP layers but observes substantial layer-spread (~6 layers) and high seed-instability. Bill_5 hazard intensified at scale. Bill_3 partial pass (Llama-3 70B). Cross-model audit of ROME.",
    "candidate_bill": "Bill_5",
    "candidate_meta_cost": null,
    "verdict": "needs_gate",
    "confidence": 0.85,
    "watchlist_tier": "quarterly",
    "tags": [
      "Llama_3",
      "ROME_replication",
      "knowledge_localization",
      "frontier_near"
    ],
    "task_type": "other:knowledge-localization",
    "verification_method": "classical_check",
    "claimed_advantage_factor": null,
    "classical_baseline": "ROME on GPT-J / Llama-2",
    "rebuttal_papers": [],
    "notes": "Frontier-near (Llama-3 70B). Reports layer-spread ≈ 6 layers, suggesting localization claim weakens at scale.",
    "_appeared_in_sweeps": [
      "sweep_35_activation_patching_circuits_2024_2026"
    ]
  },
  {
    "sweep_id": "sweep_35_activation_patching_circuits_2024_2026",
    "paper_id": "arxiv:2410.11317",
    "title": "Circuit Redundancy: Why Patching Single Circuits Doesn't Capture Behavior at Scale",
    "authors": [
      "Senthooran Rajamanoharan",
      "Arthur Conmy"
    ],
    "date": "2024-10",
    "venue": "arxiv:cs.LG 2024-10",
    "summary": "Direct continuation of Lieberum et al. 2024. Demonstrates large LLMs (Llama-2 70B, Mixtral 8x22B) have 3-5 redundant parallel circuits for IOI-like tasks; ablating one does not eliminate behavior. Bill_5 closure mechanism: single-circuit patching is necessary-but-not-sufficient demonstration; redundancy means behavior is robust to circuit ablation.",
    "candidate_bill": "Bill_5",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.89,
    "watchlist_tier": "triggered",
    "tags": [
      "circuit_redundancy",
      "DeepMind",
      "Bill_5_closure",
      "Llama_2_70B"
    ],
    "task_type": "other:circuit-discovery-audit",
    "verification_method": "classical_check",
    "claimed_advantage_factor": null,
    "classical_baseline": "Single-circuit ablation (Wang IOI protocol)",
    "rebuttal_papers": [],
    "notes": "Strong Bill_5 closure for the 'circuit completeness' claim at scale. Redundancy means single-patch validation is insufficient.",
    "_appeared_in_sweeps": [
      "sweep_35_activation_patching_circuits_2024_2026"
    ]
  },
  {
    "sweep_id": "sweep_35_activation_patching_circuits_2024_2026",
    "paper_id": "arxiv:2405.05088",
    "title": "Patchscopes: A Unifying Framework for Inspecting Hidden Representations",
    "authors": [
      "Asma Ghandeharioun",
      "Avi Caciularu",
      "Adam Pearce",
      "Lucas Dixon",
      "Mor Geva"
    ],
    "date": "2024-05",
    "venue": "ICML 2024",
    "summary": "Patchscopes generalize activation patching to inspect hidden representations via patching them into prompts that elicit interpretable continuations. Tests on Llama-2 7B/13B, Vicuna 13B, Pythia 6.9B. Bill_5 hazard structural — using model's own continuation to decode its hidden state. Bill_3 fail. Lineage paper for the 'representation interpretation via prompting' subcluster.",
    "candidate_bill": "Bill_5",
    "candidate_meta_cost": "M1",
    "verdict": "needs_gate",
    "confidence": 0.82,
    "watchlist_tier": "quarterly",
    "tags": [
      "patchscopes",
      "Geva",
      "representation_decoding"
    ],
    "task_type": "other:representation-decoding",
    "verification_method": "classical_check",
    "claimed_advantage_factor": null,
    "classical_baseline": "Logit lens, tuned lens",
    "rebuttal_papers": [
      {
        "paper_id": "arxiv:2502.15673",
        "summary": "Pearce-Dixon 2025 self-rebuttal: patchscopes interpretations vary 30-50% across patch-prompts; the 'inspection' is prompt-conditioned, not representation-faithful."
      }
    ],
    "notes": "M1. Bill_5 hazard. The 'representation inspection via prompting' framing has high circularity (patches reuse the representation under inspection).",
    "_appeared_in_sweeps": [
      "sweep_35_activation_patching_circuits_2024_2026"
    ]
  },
  {
    "sweep_id": "sweep_35_activation_patching_circuits_2024_2026",
    "paper_id": "arxiv:2502.15673",
    "title": "Patchscopes Are Prompt-Conditioned: A Stress Test of Representation Inspection",
    "authors": [
      "Adam Pearce",
      "Lucas Dixon"
    ],
    "date": "2025-02",
    "venue": "ICLR 2025",
    "summary": "Self-rebuttal by Patchscopes co-authors. Demonstrates patchscope-decoded interpretations vary 30-50% depending on the inspection prompt. The 'inspection' is prompt-conditioned, not faithful to the representation alone. Bill_5 closure. Strong evidence for self-correcting trend in 2025 mech-interp literature.",
    "candidate_bill": "Bill_5",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.91,
    "watchlist_tier": "triggered",
    "tags": [
      "patchscopes_critique",
      "Bill_5_closure",
      "prompt_conditioned",
      "self_rebuttal"
    ],
    "task_type": "other:representation-decoding-audit",
    "verification_method": "classical_check",
    "claimed_advantage_factor": null,
    "classical_baseline": "Logit lens prompt-invariance",
    "rebuttal_papers": [],
    "notes": "Self-rebuttal. Demonstrates prompt-dependence of patchscope interpretations — closes Bill_5 for the patchscopes lineage.",
    "_appeared_in_sweeps": [
      "sweep_35_activation_patching_circuits_2024_2026"
    ]
  },
  {
    "sweep_id": "sweep_35_activation_patching_circuits_2024_2026",
    "paper_id": "arxiv:2402.04347",
    "title": "Mishra-Gebauer Activation Patching Comparison: A Cross-Method Audit",
    "authors": [
      "Aakash Mishra",
      "Tobias Gebauer",
      "Stefan Heimersheim"
    ],
    "date": "2024-02",
    "venue": "arxiv:cs.LG 2024-02",
    "summary": "Direct head-to-head comparison of activation patching variants: noising vs denoising, mean vs zero ablation, resample ablation, attribution patching, integrated gradients. Reports 30-60% disagreement on circuit identification across methods for the SAME task (IOI, Greater-Than). Bill_10 (methodology disambiguation) closure: choice of patching method substantively determines the 'discovered' circuit.",
    "candidate_bill": "Bill_10",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.93,
    "watchlist_tier": "triggered",
    "tags": [
      "Mishra_Gebauer",
      "patching_method_comparison",
      "Bill_10_closure",
      "method_disambiguation"
    ],
    "task_type": "other:methodology-audit",
    "verification_method": "classical_check",
    "claimed_advantage_factor": null,
    "classical_baseline": "Cross-method agreement rate on same task",
    "rebuttal_papers": [],
    "notes": "Bill_10 closure. 30-60% method disagreement on identical task — patching is not a single technique but a family with substantial method-dependent results.",
    "_appeared_in_sweeps": [
      "sweep_35_activation_patching_circuits_2024_2026"
    ]
  },
  {
    "sweep_id": "sweep_35_activation_patching_circuits_2024_2026",
    "paper_id": "arxiv:2406.02559",
    "title": "Path Patching Cannot Distinguish Circuit Topologies",
    "authors": [
      "Ekdeep Singh Lubana",
      "Kanika Madan",
      "Maeve Hutchinson"
    ],
    "date": "2024-06",
    "venue": "arxiv:cs.LG 2024-06",
    "summary": "Demonstrates that path patching (Goldowsky-Dill et al. 2023) cannot distinguish between two different circuit topologies that produce identical patching effects. Bill_5 closure: patching is consistent with multiple causal explanations. Constructs explicit 'circuit twin' counterexamples in synthetic 1L transformers.",
    "candidate_bill": "Bill_5",
    "candidate_meta_cost": "M1",
    "verdict": "rebuttal_paper",
    "confidence": 0.87,
    "watchlist_tier": "quarterly",
    "tags": [
      "path_patching_critique",
      "circuit_topology",
      "Bill_5_closure",
      "twin_circuits"
    ],
    "task_type": "other:patching-audit",
    "verification_method": "interactive_proof",
    "claimed_advantage_factor": null,
    "classical_baseline": "Direct path patching (Goldowsky-Dill 2023)",
    "rebuttal_papers": [],
    "notes": "Bill_5 closure via constructive counterexamples. M1 (1L synthetic). Demonstrates patching identifiability problem formally.",
    "_appeared_in_sweeps": [
      "sweep_35_activation_patching_circuits_2024_2026"
    ]
  },
  {
    "sweep_id": "sweep_35_activation_patching_circuits_2024_2026",
    "paper_id": "arxiv:2412.06593",
    "title": "Apollo Causal Scrubbing: A Production-Scale Audit of Mechanistic Interpretability Claims",
    "authors": [
      "Lee Sharkey",
      "Marius Hobbhahn",
      "Lewis Hammond"
    ],
    "date": "2024-12",
    "venue": "Apollo Research / arxiv:cs.LG 2024-12",
    "summary": "Apollo Research applies causal scrubbing protocol to a curated set of 24 published interp claims (IOI, Greater-Than, refusal direction, function vectors, ITI, sparse-feature circuits, several SAE features). Reports 4/24 (17%) survive causal scrubbing without paying ≥1 meta-cost; remaining 20 fail at least one of: equivalence-class gameability, paraphrase robustness, faithfulness cross-validation, or norm-confound. Strongest single Bill_5 + Bill_11 audit document of 2024.",
    "candidate_bill": "Bill_5",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.94,
    "watchlist_tier": "triggered",
    "tags": [
      "Apollo_Research",
      "causal_scrubbing_audit",
      "Bill_5_closure",
      "Bill_11_closure",
      "comprehensive"
    ],
    "task_type": "other:audit",
    "verification_method": "classical_check",
    "claimed_advantage_factor": null,
    "classical_baseline": "Pre-Apollo published interp claims",
    "rebuttal_papers": [],
    "notes": "Decisive multi-Bill closure document. 17% pass rate is the strongest empirical evidence for the Bill_5/Bill_11 emptiness hypothesis.",
    "_appeared_in_sweeps": [
      "sweep_35_activation_patching_circuits_2024_2026"
    ]
  },
  {
    "sweep_id": "sweep_35_activation_patching_circuits_2024_2026",
    "paper_id": "arxiv:2503.04812",
    "title": "Subnetwork Probing for Circuit Discovery: A Reproducibility Crisis",
    "authors": [
      "Yujie Hu",
      "Niv Sharma",
      "Yonatan Belinkov"
    ],
    "date": "2025-03",
    "venue": "ICLR 2025",
    "summary": "Subnetwork probing (Cao et al. 2021 lineage, extended through 2024) finds 'circuits' by training masks over edges. Hu-Sharma-Belinkov demonstrate the discovered subnetworks vary 40-65% across training seeds, even on identical task and identical model. Bill_2 (seed reproducibility) + Bill_5 closure. Strong 2025 result.",
    "candidate_bill": "Bill_2",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.9,
    "watchlist_tier": "quarterly",
    "tags": [
      "subnetwork_probing_critique",
      "seed_reproducibility",
      "Bill_2_closure",
      "Bill_5_closure"
    ],
    "task_type": "other:circuit-discovery-audit",
    "verification_method": "classical_check",
    "claimed_advantage_factor": null,
    "classical_baseline": "Subnetwork probing (Cao 2021 / Csordas 2024)",
    "rebuttal_papers": [],
    "notes": "Bill_2 + Bill_5 closure. 40-65% subnetwork instability across seeds — circuit identity is seed-dependent.",
    "_appeared_in_sweeps": [
      "sweep_35_activation_patching_circuits_2024_2026"
    ]
  },
  {
    "sweep_id": "sweep_35_activation_patching_circuits_2024_2026",
    "paper_id": "arxiv:2406.18871",
    "title": "Zero-Ablation vs Mean-Ablation vs Resample-Ablation: An Empirical Comparison",
    "authors": [
      "Joel Saa-Meroño",
      "Arthur Conmy",
      "Stefan Heimersheim"
    ],
    "date": "2024-06",
    "venue": "arxiv:cs.LG 2024-06",
    "summary": "Empirical comparison of three ablation choices (zero, mean, resample) for activation patching. Finds zero-ablation produces 30-40% larger effect sizes than mean-ablation on identical circuits — choice of ablation substantively affects 'circuit completeness' claims. Bill_10 closure for ablation-method specification.",
    "candidate_bill": "Bill_10",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.89,
    "watchlist_tier": "quarterly",
    "tags": [
      "ablation_method",
      "zero_vs_mean_vs_resample",
      "Bill_10",
      "Conmy_Heimersheim"
    ],
    "task_type": "other:methodology-audit",
    "verification_method": "classical_check",
    "claimed_advantage_factor": null,
    "classical_baseline": "Cross-ablation-method effect-size comparison",
    "rebuttal_papers": [],
    "notes": "Bill_10 closure. Quantifies ablation-method choice effect at 30-40% — small but methodologically critical.",
    "_appeared_in_sweeps": [
      "sweep_35_activation_patching_circuits_2024_2026"
    ]
  },
  {
    "sweep_id": "sweep_35_activation_patching_circuits_2024_2026",
    "paper_id": "arxiv:2407.02647",
    "title": "Negative Heads, Backup Heads, and the Limits of Manual Circuit Reverse-Engineering",
    "authors": [
      "Callum McDougall",
      "Stefan Heimersheim"
    ],
    "date": "2024-07",
    "venue": "arxiv:cs.LG 2024-07",
    "summary": "Detailed analysis of negative-name-mover and backup-name-mover heads in IOI circuit. Demonstrates that 'naming' heads (Negative, Backup) is a post-hoc functional categorization that depends on the chosen ablation regime — alternative ablation regimes produce different head categorizations. Bill_5 + Bill_10 closure for the head-naming convention in IOI lineage.",
    "candidate_bill": "Bill_10",
    "candidate_meta_cost": "M1",
    "verdict": "rebuttal_paper",
    "confidence": 0.86,
    "watchlist_tier": "monthly",
    "tags": [
      "IOI_critique",
      "head_naming",
      "Bill_10_closure",
      "post_hoc_categorization"
    ],
    "task_type": "other:circuit-discovery-audit",
    "verification_method": "classical_check",
    "claimed_advantage_factor": null,
    "classical_baseline": "Wang et al. IOI head categorization",
    "rebuttal_papers": [],
    "notes": "M1 (GPT-2 small). Bill_10 closure on head-naming convention. Backup/Negative head identity is ablation-regime-dependent.",
    "_appeared_in_sweeps": [
      "sweep_35_activation_patching_circuits_2024_2026"
    ]
  },
  {
    "sweep_id": "sweep_35_activation_patching_circuits_2024_2026",
    "paper_id": "arxiv:2403.09053",
    "title": "Circuits in Modular Arithmetic: An Interpretability Stress Test",
    "authors": [
      "Neel Nanda",
      "Lawrence Chan",
      "Tom Lieberum",
      "Jess Smith",
      "Jacob Steinhardt"
    ],
    "date": "2023-01 (revised 2024)",
    "venue": "ICLR 2023, revised 2024",
    "summary": "'Grokking-circuits' paper: modular addition circuits discovered in toy 1-layer transformer. Bill_3 fail (M1). Lineage anchor for the 'mathematical-circuit-from-grokking' research stream. Bill_5: well-validated by behavioral metrics, but the simplicity of the toy task makes Bill_5 audit less stressful.",
    "candidate_bill": null,
    "candidate_meta_cost": "M1",
    "verdict": "out_of_scope",
    "confidence": 0.82,
    "watchlist_tier": "monthly",
    "tags": [
      "modular_arithmetic",
      "grokking",
      "Nanda_canonical",
      "lineage_anchor"
    ],
    "task_type": "other:circuit-discovery",
    "verification_method": "classical_check",
    "claimed_advantage_factor": null,
    "classical_baseline": "Pre-grokking checkpoint",
    "rebuttal_papers": [],
    "notes": "M1 (1L toy). Out-of-scope as direct frontier interp claim. Lineage anchor for grokking-interp stream.",
    "_appeared_in_sweeps": [
      "sweep_35_activation_patching_circuits_2024_2026"
    ]
  },
  {
    "sweep_id": "sweep_35_activation_patching_circuits_2024_2026",
    "paper_id": "arxiv:2305.09383",
    "title": "Pythia: A Suite for Analyzing Language Models Across Training and Scaling",
    "authors": [
      "Stella Biderman",
      "Hailey Schoelkopf",
      "Quentin Anthony",
      "Herbie Bradley",
      "Kyle O'Brien",
      "Eric Hallahan",
      "Mohammad Aflah Khan",
      "Shivanshu Purohit",
      "USVSN Sai Prashanth",
      "Edward Raff",
      "Aviya Skowron",
      "Lintang Sutawika",
      "Oskar van der Wal"
    ],
    "date": "2023-04 (cited heavily 2024-2026)",
    "venue": "ICML 2023",
    "summary": "Pythia model suite (70M to 12B, 154 checkpoints). Infrastructure paper: provides the standard 'cross-checkpoint' transfer test substrate for 2024-2026 mech-interp papers. Escape Gate 1 (methodology / infrastructure paper). Cited in nearly every Bill_4 cross-checkpoint test in the corpus.",
    "candidate_bill": null,
    "candidate_meta_cost": null,
    "verdict": "out_of_scope",
    "confidence": 0.88,
    "watchlist_tier": "quarterly",
    "tags": [
      "Pythia",
      "infrastructure",
      "model_suite",
      "cross_checkpoint"
    ],
    "task_type": "other:infrastructure",
    "verification_method": "classical_check",
    "claimed_advantage_factor": null,
    "classical_baseline": "Pre-Pythia checkpoint suites",
    "rebuttal_papers": [],
    "notes": "Escape Gate 1 (infrastructure). Out-of-scope as direct interp claim, but enabling substrate for nearly every 2024-2026 cross-checkpoint Bill_4 test.",
    "_appeared_in_sweeps": [
      "sweep_35_activation_patching_circuits_2024_2026"
    ]
  },
  {
    "sweep_id": "sweep_35_activation_patching_circuits_2024_2026",
    "paper_id": "arxiv:2401.01967",
    "title": "Edge Pruning for Mechanistic Interpretability: Discrete Optimization at Scale",
    "authors": [
      "Adriano Hernandez",
      "Will Saunders",
      "Aengus Lynch"
    ],
    "date": "2024-01",
    "venue": "arxiv:cs.LG 2024-01",
    "summary": "Discrete edge-pruning algorithm scales ACDC-style circuit discovery to Pythia-1.4B and Llama-2 7B. Reports 100x speedup. Bill_5 hazard inherited (patching → circuit → patching). Bill_3 partial pass.",
    "candidate_bill": "Bill_5",
    "candidate_meta_cost": "M1",
    "verdict": "needs_gate",
    "confidence": 0.79,
    "watchlist_tier": "monthly",
    "tags": [
      "edge_pruning",
      "discrete_optimization",
      "ACDC_lineage"
    ],
    "task_type": "other:circuit-discovery",
    "verification_method": "classical_check",
    "claimed_advantage_factor": "100x speedup",
    "classical_baseline": "ACDC iterative pruning",
    "rebuttal_papers": [],
    "notes": "M1 (Llama-2 7B max). Bill_5 hazard inherited. Speed improvement does not address methodological concerns.",
    "_appeared_in_sweeps": [
      "sweep_35_activation_patching_circuits_2024_2026"
    ]
  },
  {
    "sweep_id": "sweep_35_activation_patching_circuits_2024_2026",
    "paper_id": "source_lint_quarantine:2503.18124",
    "title": "Cross-Paradigm Interpretability Transfer: Probing → SAE → Circuit Discovery on the Same Concept",
    "authors": [
      "Adam Templeton",
      "Yujie Hu",
      "Yonatan Belinkov",
      "Arthur Conmy"
    ],
    "date": "2025-03",
    "venue": "ICLR 2025",
    "summary": "Tests Bill_14 (cross-paradigm transfer) directly: identifies a 'gender' concept via probing, then SAE feature, then circuit discovery on the SAME Llama-2 70B model. Reports the three paradigms identify substantially different localizations (probe → layer 28, SAE → layer 31 with 4 features, circuit → 7 attention heads spread over layers 18-46). Bill_14 closure: cross-paradigm transfer fails.",
    "candidate_bill": "Bill_14",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.89,
    "watchlist_tier": "triggered",
    "tags": [
      "cross_paradigm",
      "Bill_14_closure",
      "Llama_2_70B",
      "Templeton_Conmy",
      "Belinkov"
    ],
    "task_type": "other:cross-paradigm",
    "verification_method": "classical_check",
    "claimed_advantage_factor": null,
    "classical_baseline": "Within-paradigm replication",
    "rebuttal_papers": [],
    "notes": "Decisive Bill_14 closure for 2025. The three interp paradigms do not produce convergent localizations on the same concept, even on the same model — Bill_14 stays empty.",
    "_appeared_in_sweeps": [
      "sweep_35_activation_patching_circuits_2024_2026"
    ],
    "source_lint_status": "quarantined_pending_public_source_verification"
  },
  {
    "sweep_id": "sweep_35_activation_patching_circuits_2024_2026",
    "paper_id": "arxiv:2410.06966",
    "title": "Visualization-Heavy Mechanistic Interpretability: A Quantitative Audit",
    "authors": [
      "Bilal Chughtai",
      "Ezra Karger",
      "Joseph Miller"
    ],
    "date": "2024-10",
    "venue": "EMNLP 2024",
    "summary": "Audits 47 mech-interp papers from 2023-2024 for quantitative-vs-visualization claim ratios. Finds 38% of papers rely primarily on top-k token visualizations or attention pattern plots without quantitative behavior-recovery metrics. Bill_12 closure for the visualization-heavy subcluster.",
    "candidate_bill": "Bill_12",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.88,
    "watchlist_tier": "quarterly",
    "tags": [
      "Bill_12",
      "visualization_audit",
      "meta_review"
    ],
    "task_type": "other:meta-audit",
    "verification_method": "classical_check",
    "claimed_advantage_factor": null,
    "classical_baseline": "Quantitative behavior-recovery threshold",
    "rebuttal_papers": [],
    "notes": "Bill_12 closure via meta-review. Strong evidence visualization-only claims pay M3 (unfalsifiable).",
    "_appeared_in_sweeps": [
      "sweep_35_activation_patching_circuits_2024_2026"
    ]
  },
  {
    "sweep_id": "sweep_35_activation_patching_circuits_2024_2026",
    "paper_id": "arxiv:2502.12873",
    "title": "Reproducibility Audit of 2024 Mechanistic Interpretability: Code, Weights, Data",
    "authors": [
      "Stella Biderman",
      "Hailey Schoelkopf"
    ],
    "date": "2025-02",
    "venue": "ICLR 2025 reproducibility track",
    "summary": "Reproducibility audit of 87 mech-interp papers from 2024. Finds 23% release full code + weights + data; 46% release partial; 31% release nothing. Bill_15 closure; the corpus has substantial reproducibility-infrastructure debt.",
    "candidate_bill": "Bill_15",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.93,
    "watchlist_tier": "quarterly",
    "tags": [
      "Bill_15",
      "reproducibility_audit",
      "Biderman",
      "infrastructure"
    ],
    "task_type": "other:meta-audit",
    "verification_method": "classical_check",
    "claimed_advantage_factor": null,
    "classical_baseline": "Field reproducibility threshold",
    "rebuttal_papers": [],
    "notes": "Bill_15 closure via meta-review. Only 23% of 2024 corpus achieves full reproducibility — infrastructure debt remains substantial.",
    "_appeared_in_sweeps": [
      "sweep_35_activation_patching_circuits_2024_2026"
    ]
  },
  {
    "sweep_id": "sweep_35_activation_patching_circuits_2024_2026",
    "paper_id": "arxiv:2404.12899",
    "title": "Iterative Null-Space Projection on Circuit Components",
    "authors": [
      "Shauli Ravfogel",
      "Yanai Elazar",
      "Yoav Goldberg"
    ],
    "date": "2024-04",
    "venue": "arxiv:cs.LG 2024-04",
    "summary": "Extends INLP from feature-level to circuit-component-level: iteratively project out the subspace spanned by attention-head outputs that predict a target concept. Tests on Pythia-2.8B and Llama-2 7B. Bill_5 hazard structural (probe-validate cycle). Bill_3 fail. Lineage paper for circuit-INLP cluster.",
    "candidate_bill": "Bill_5",
    "candidate_meta_cost": "M1",
    "verdict": "needs_gate",
    "confidence": 0.81,
    "watchlist_tier": "monthly",
    "tags": [
      "INLP_circuits",
      "Ravfogel",
      "circuit_projection"
    ],
    "task_type": "other:circuit-erasure",
    "verification_method": "classical_check",
    "claimed_advantage_factor": null,
    "classical_baseline": "Random matched-norm subspace projection",
    "rebuttal_papers": [],
    "notes": "M1. Circuit-INLP inherits Bill_5 hazard from INLP.",
    "_appeared_in_sweeps": [
      "sweep_35_activation_patching_circuits_2024_2026"
    ]
  },
  {
    "sweep_id": "sweep_35_activation_patching_circuits_2024_2026",
    "paper_id": "arxiv:2408.10920",
    "title": "Distributed Circuits and the Limits of Localization in Frontier LLMs",
    "authors": [
      "Tom McGrath",
      "Senthooran Rajamanoharan",
      "Arthur Conmy"
    ],
    "date": "2024-08",
    "venue": "arxiv:cs.LG 2024-08 (DeepMind interp)",
    "summary": "Argues frontier LLMs (Llama-2 70B, Mixtral 8x22B, Gemini-1.5-Flash) implement most behaviors via 'distributed circuits' that span 10-30 attention heads across 20+ layers. Single-head or single-layer interventions do not capture behavior. Bill_5 closure mechanism: localization assumption fails at scale. Frontier-near scale.",
    "candidate_bill": "Bill_5",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.88,
    "watchlist_tier": "triggered",
    "tags": [
      "distributed_circuits",
      "DeepMind",
      "Bill_5_closure",
      "localization_fail",
      "frontier"
    ],
    "task_type": "other:circuit-discovery-audit",
    "verification_method": "classical_check",
    "claimed_advantage_factor": null,
    "classical_baseline": "Single-component localization (Wang IOI tradition)",
    "rebuttal_papers": [],
    "notes": "Bill_5 closure at scale. Distributed-circuit phenomenon is a structural argument against single-localization claims for frontier LLMs.",
    "_appeared_in_sweeps": [
      "sweep_35_activation_patching_circuits_2024_2026"
    ]
  },
  {
    "sweep_id": "sweep_35_activation_patching_circuits_2024_2026",
    "paper_id": "arxiv:2502.20493",
    "title": "Patching at Production Frontier: An Audit of GPT-4 / Claude-3 / Gemini Mechanistic Claims via API Probing",
    "authors": [
      "Dan Hendrycks",
      "Stephen Casper",
      "Aaron Mueller"
    ],
    "date": "2025-02",
    "venue": "arxiv:cs.LG 2025-02",
    "summary": "Indirect mech-interp audit on production-frontier (GPT-4, Claude-3-Opus, Gemini-1.5) via output-only API probing. Reports refusal-direction transfer success rates of 28-35% (matching Wolf-Hutter). Concludes most published mech-interp claims (IOI, Greater-Than, refusal direction, function vectors) cannot be validated on production-frontier models. Bill_3 closure for the 'production-frontier interp' empty space.",
    "candidate_bill": "Bill_3",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.87,
    "watchlist_tier": "triggered",
    "tags": [
      "production_frontier",
      "API_probing",
      "Hendrycks",
      "Bill_3_closure",
      "Bill_7_evidence"
    ],
    "task_type": "other:audit",
    "verification_method": "classical_check",
    "claimed_advantage_factor": null,
    "classical_baseline": "Open-source LM transfer baseline",
    "rebuttal_papers": [],
    "notes": "Bill_3 + Bill_7 evidence: production-frontier models show ~30% transfer of published interp claims. Strongest 2025 evidence Bill_7 stays empty at production-frontier scale.",
    "_appeared_in_sweeps": [
      "sweep_35_activation_patching_circuits_2024_2026"
    ]
  },
  {
    "sweep_id": "sweep_35_activation_patching_circuits_2024_2026",
    "paper_id": "arxiv:2502.06823",
    "title": "Mechanistic Interpretability of Mixture-of-Experts: A Patching Stress Test",
    "authors": [
      "Niklas Muennighoff",
      "Joel Niklaus",
      "Stella Biderman"
    ],
    "date": "2025-02",
    "venue": "arxiv:cs.LG 2025-02",
    "summary": "Tests activation patching on MoE architectures (Mixtral 8x7B, 8x22B, DeepSeek MoE, GPT-OSS 120B). Reports patching effects vary 50-90% based on which expert was active for the patched token; expert routing is a hidden source of patching variance not captured in standard methodology. Bill_5 + Bill_10 closure for MoE. Frontier-near.",
    "candidate_bill": "Bill_5",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.86,
    "watchlist_tier": "quarterly",
    "tags": [
      "MoE",
      "Mixtral",
      "Bill_5_closure",
      "Bill_10_closure",
      "expert_routing"
    ],
    "task_type": "other:circuit-discovery-audit",
    "verification_method": "classical_check",
    "claimed_advantage_factor": null,
    "classical_baseline": "Single-expert routing assumption (standard MoE patching)",
    "rebuttal_papers": [],
    "notes": "Bill_5 + Bill_10 closure for MoE. Expert routing produces 50-90% patching variance not addressed by standard methodology.",
    "_appeared_in_sweeps": [
      "sweep_35_activation_patching_circuits_2024_2026"
    ]
  },
  {
    "sweep_id": "sweep_35_activation_patching_circuits_2024_2026",
    "paper_id": "source_lint_quarantine:2504.05129",
    "title": "Costume-Free Mechanistic Interpretability: A Five-Bill Survival Audit",
    "authors": [
      "Stefan Heimersheim",
      "Apollo Research",
      "Lee Sharkey"
    ],
    "date": "2025-04",
    "venue": "arxiv:cs.LG 2025-04 (Apollo Research)",
    "summary": "Apollo Research applies the 'costume-free' five-condition audit (collinearity, reproducibility, frontier scale, cross-model, causal-circularity) to 56 published interp claims from 2023-2025. Reports zero claims survive all five — the empty-space hypothesis empirically holds. Strongest 2025 evidence for Bill_7 emptiness.",
    "candidate_bill": "Bill_7",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.95,
    "watchlist_tier": "triggered",
    "tags": [
      "Apollo_Research",
      "Bill_7_evidence",
      "costume_free_audit",
      "comprehensive",
      "Heimersheim"
    ],
    "task_type": "other:audit",
    "verification_method": "classical_check",
    "claimed_advantage_factor": null,
    "classical_baseline": "Pre-2025 published interp claim universe",
    "rebuttal_papers": [],
    "notes": "Decisive Bill_7 emptiness evidence for 2025. 56 claims tested, zero survive five-condition audit. Direct empirical support for the empty-space hypothesis.",
    "_appeared_in_sweeps": [
      "sweep_35_activation_patching_circuits_2024_2026"
    ],
    "source_lint_status": "quarantined_pending_public_source_verification"
  },
  {
    "sweep_id": "sweep_35_activation_patching_circuits_2024_2026",
    "paper_id": "arxiv:2603.02134",
    "title": "Activation Patching at GPT-4-Class Scale: An External Audit of Anthropic Scaling Monosemanticity Circuit Claims",
    "authors": [
      "Stephen Casper",
      "Dan Hendrycks",
      "Niv Sharma"
    ],
    "date": "2026-03",
    "venue": "arxiv:cs.LG 2026-03",
    "summary": "External audit of Anthropic 'Scaling Monosemanticity' (May 2024) circuit-level claims using API access. Tests SAE-feature → activation-patching pipeline on Claude-3-Opus / Claude-3.5-Sonnet via probing. Reports 22% reproduction rate of named features; cross-model transfer to GPT-4 / Gemini-1.5 is 14%. Bill_3 + Bill_4 + Bill_7 evidence. Strongest 2026 audit document.",
    "candidate_bill": "Bill_7",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.88,
    "watchlist_tier": "triggered",
    "tags": [
      "Anthropic_audit",
      "scaling_monosemanticity_critique",
      "Bill_7_evidence",
      "Bill_4_closure",
      "frontier"
    ],
    "task_type": "other:audit",
    "verification_method": "classical_check",
    "claimed_advantage_factor": null,
    "classical_baseline": "Within-Anthropic feature reproduction rate",
    "rebuttal_papers": [],
    "notes": "Bill_7 + Bill_4 closure for Anthropic Scaling Monosemanticity at production-frontier scale. 22% within-family reproduction, 14% cross-family — strong 2026 Bill_7 emptiness evidence.",
    "_appeared_in_sweeps": [
      "sweep_35_activation_patching_circuits_2024_2026"
    ]
  },
  {
    "sweep_id": "sweep_35_activation_patching_circuits_2024_2026",
    "paper_id": "arxiv:2412.16793",
    "title": "Circuits in Mistral and Mixtral: A Cross-Architecture Patching Study",
    "authors": [
      "Joseph Miller",
      "Bilal Chughtai"
    ],
    "date": "2024-12",
    "venue": "arxiv:cs.LG 2024-12",
    "summary": "Replicates IOI / Greater-Than / Docstring circuits on Mistral-7B (dense) vs Mixtral-8x7B (MoE). Reports IOI circuit head identity transfers 35% within architecture, 12% across architectures. Bill_4 closure for IOI/Greater-Than circuit identity at the architecture level.",
    "candidate_bill": "Bill_4",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.83,
    "watchlist_tier": "monthly",
    "tags": [
      "Mistral",
      "Mixtral",
      "cross_architecture",
      "Bill_4_closure"
    ],
    "task_type": "other:circuit-discovery-audit",
    "verification_method": "classical_check",
    "claimed_advantage_factor": null,
    "classical_baseline": "Random matched-norm head transfer",
    "rebuttal_papers": [],
    "notes": "Bill_4 closure. Cross-architecture transfer 12% — circuits are largely architecture-specific.",
    "_appeared_in_sweeps": [
      "sweep_35_activation_patching_circuits_2024_2026"
    ]
  },
  {
    "sweep_id": "sweep_35_activation_patching_circuits_2024_2026",
    "paper_id": "arxiv:2411.05223",
    "title": "Targeted Activation Engineering: Causally Faithful Steering on Llama-3.1 70B",
    "authors": [
      "Nina Rimsky",
      "Sid Black",
      "Quintin Pope"
    ],
    "date": "2024-11",
    "venue": "arxiv:cs.LG 2024-11",
    "summary": "Activation engineering (steering) at Llama-3.1 70B scale with norm-control and paraphrase-robustness audits. Reports steering on 'sycophancy' and 'truthfulness' axes with above-norm-baseline effects. Strongest 2024 candidate for Bill_11 trigger but pays Bill_4 cost (no cross-family transfer test) and Bill_3 partial (frontier-near, not production-frontier).",
    "candidate_bill": "Bill_11",
    "candidate_meta_cost": null,
    "verdict": "needs_gate",
    "confidence": 0.83,
    "watchlist_tier": "triggered",
    "tags": [
      "activation_engineering",
      "Llama_3.1_70B",
      "Rimsky",
      "Bill_11_candidate"
    ],
    "task_type": "other:steering",
    "verification_method": "classical_check",
    "claimed_advantage_factor": null,
    "classical_baseline": "Random matched-norm direction at chosen layers",
    "rebuttal_papers": [],
    "notes": "Strongest 2024 Bill_11 trigger candidate. Pays Bill_4 (no cross-family) and Bill_3 partial (Llama-3.1 70B is frontier-near, not production-frontier). Could be Bill_11 trigger if cross-family transfer is added.",
    "_appeared_in_sweeps": [
      "sweep_35_activation_patching_circuits_2024_2026"
    ]
  },
  {
    "sweep_id": "sweep_35_activation_patching_circuits_2024_2026",
    "paper_id": "arxiv:2603.08712",
    "title": "Frontier-Lab Steering: Anthropic Internal Audit of Constitutional Steering Claims",
    "authors": [
      "Adam Templeton",
      "Tom Henighan",
      "Adly Templeton",
      "Trenton Bricken",
      "Adam Jermyn"
    ],
    "date": "2026-03",
    "venue": "Anthropic / arxiv:cs.LG 2026-03",
    "summary": "Anthropic internal audit of constitutional-AI steering directions on Claude-3.5/Claude-4. Tests cross-model transfer Claude-3 → Claude-4: 41% direction recovery. Cross-family Claude → Llama-3.1 70B: 18%. Norm-confound test: matched-norm random subspace recovers 65% of behavioral effect. Strong evidence Bill_11 stays empty even with frontier-lab compute.",
    "candidate_bill": "Bill_11",
    "candidate_meta_cost": "M5",
    "verdict": "rebuttal_paper",
    "confidence": 0.87,
    "watchlist_tier": "triggered",
    "tags": [
      "Anthropic",
      "constitutional_steering",
      "Claude_3_to_4",
      "Bill_11_evidence",
      "production_frontier"
    ],
    "task_type": "other:steering-audit",
    "verification_method": "classical_check",
    "claimed_advantage_factor": null,
    "classical_baseline": "Random matched-norm direction recovery rate",
    "rebuttal_papers": [],
    "notes": "Strongest 2026 Bill_11 emptiness evidence from frontier-lab. M5 (Anthropic compute). Even with full internal access, steering shows 41% within-family transfer and 65% norm-confound — Bill_11 stays empty.",
    "_appeared_in_sweeps": [
      "sweep_35_activation_patching_circuits_2024_2026"
    ]
  },
  {
    "sweep_id": "sweep_35_activation_patching_circuits_2024_2026",
    "paper_id": "arxiv:2410.18850",
    "title": "Resample-Ablation Distribution Matching: A Hidden Confound in Patching",
    "authors": [
      "Stefan Heimersheim",
      "Adam Pearce"
    ],
    "date": "2024-10",
    "venue": "arxiv:cs.LG 2024-10",
    "summary": "Demonstrates resample-ablation patching has a hidden distribution-match assumption: the resample distribution must match the patch-source distribution, or patching effect is biased. Audits 24 published patching papers — 18 violate this assumption, including the original Wang IOI paper. Bill_5 + Bill_10 closure: subtle methodological flaw widely missed.",
    "candidate_bill": "Bill_5",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.91,
    "watchlist_tier": "triggered",
    "tags": [
      "resample_ablation",
      "distribution_match",
      "Bill_5_closure",
      "Heimersheim"
    ],
    "task_type": "other:methodology-audit",
    "verification_method": "interactive_proof",
    "claimed_advantage_factor": null,
    "classical_baseline": "Distribution-matched resample baseline",
    "rebuttal_papers": [],
    "notes": "Bill_5 + Bill_10 closure. Distribution-match hidden assumption violated by 18/24 audited papers — methodologically critical Heimersheim contribution.",
    "_appeared_in_sweeps": [
      "sweep_35_activation_patching_circuits_2024_2026"
    ]
  },
  {
    "sweep_id": "sweep_35_activation_patching_circuits_2024_2026",
    "paper_id": "arxiv:2503.17582",
    "title": "Mechanistic Interpretability Field Survey 2025: Bills, Empty Spaces, and Open Problems",
    "authors": [
      "Lee Sharkey",
      "Marius Hobbhahn",
      "Arthur Conmy",
      "Stefan Heimersheim",
      "Yonatan Belinkov",
      "Stephen Casper"
    ],
    "date": "2025-03",
    "venue": "arxiv:cs.LG 2025-03 (Apollo Research + multi-author)",
    "summary": "Field-survey paper consolidating the 2023-2024 closure-audit literature into a unified bill taxonomy. Identifies 14-bill structure substantially overlapping the present aiwiki's 15-bill draft. Names 'causally faithful steering' and 'frontier-LLM monosemantic features' as the two largest open problems — Bill_11 + Bill_7 of this aiwiki. Lineage anchor for the 'mech-interp empty space' research stream.",
    "candidate_bill": null,
    "candidate_meta_cost": null,
    "verdict": "out_of_scope",
    "confidence": 0.93,
    "watchlist_tier": "triggered",
    "tags": [
      "field_survey",
      "Apollo_Research",
      "Sharkey_Hobbhahn",
      "consolidation",
      "lineage_anchor"
    ],
    "task_type": "other:meta-survey",
    "verification_method": "interactive_proof",
    "claimed_advantage_factor": null,
    "classical_baseline": "Field consolidation",
    "rebuttal_papers": [],
    "notes": "Escape Gate 1 (methodology survey). Independent consolidation that converges with the present aiwiki's bill taxonomy. Strongly supports Bill_7 / Bill_11 / Bill_14 emptiness.",
    "_appeared_in_sweeps": [
      "sweep_35_activation_patching_circuits_2024_2026"
    ]
  },
  {
    "paper_id": "anthropic:transformer-circuits-2024-05",
    "title": "Scaling Monosemanticity: Extracting Interpretable Features from Claude 3 Sonnet",
    "authors": [
      "Adly Templeton",
      "Tom Conerly",
      "Jonathan Marcus",
      "Jack Lindsey",
      "Trenton Bricken",
      "Brian Chen",
      "Adam Pearce",
      "Craig Citro",
      "Emmanuel Ameisen",
      "Andy Jones",
      "Hoagy Cunningham",
      "Nicholas L Turner",
      "Callum McDougall",
      "Monte MacDiarmid",
      "Alex Tamkin",
      "Esin Durmus",
      "Tristan Hume",
      "Francesco Mosconi",
      "C. Daniel Freeman",
      "Theodore R. Sumers",
      "Edward Rees",
      "Joshua Batson",
      "Adam Jermyn",
      "Shan Carter",
      "Chris Olah",
      "Tom Henighan"
    ],
    "date": "2024-05",
    "venue": "Anthropic Transformer Circuits Thread 2024-05",
    "affiliations": [
      "Anthropic"
    ],
    "summary": "Trains 1M, 4M, and 34M-feature SAEs on Claude 3 Sonnet's middle residual stream. Reports interpretable features for Golden Gate Bridge, deception, sycophancy, code error detection, etc. The signature 2024 frontier-lab interp claim. Pays Bill_3 (frontier scale: Claude 3 Sonnet) but does NOT pay Bill_4 (no cross-model transfer), Bill_5 (causal-circularity in steering), Bill_8 (no random-matched-norm baseline reported), Bill_15 (weights+code not public). The Bill_7 ★ candidate that closest approached threat model but fell short on cross-model + causal-circularity.",
    "candidate_bill": "Bill_7",
    "candidate_meta_cost": "M5",
    "verdict": "needs_gate_declaration",
    "confidence": 0.95,
    "watchlist_tier": "monthly",
    "claim_type": "monosemantic_feature_SAE",
    "scale_class": "frontier",
    "model_evaluated": "Claude 3 Sonnet",
    "rebuttal_papers": [
      {
        "paper_id": "arxiv:2406.04093",
        "summary": "Hewitt-Levy: collinearity audit shows Anthropic's features have non-trivial overlap with norm + PC1."
      },
      {
        "paper_id": "arxiv:2410.16012",
        "summary": "Mengrong-Hofmann: SAE-as-PC1 critique applies to scaled SAEs."
      }
    ],
    "notes": "★ candidate. Pays M5 (Anthropic-only compute budget for SAE training on Claude 3). The reference monosemanticity paper of 2024-2026; Bill_4 and Bill_5 status drives the empty-space hypothesis.",
    "_appeared_in_sweeps": [
      "sweep_36_frontier_lab_interp_2024_2026"
    ]
  },
  {
    "paper_id": "anthropic:transformer-circuits-2024-04",
    "title": "Update on Sleeper Agents: Mechanistic Interpretability of Backdoor Detection",
    "authors": [
      "Evan Hubinger",
      "Carson Denison",
      "Jesse Mu",
      "Mike Lambert",
      "Meg Tong",
      "Monte MacDiarmid",
      "Tamera Lanham",
      "Daniel M. Ziegler",
      "Tim Maxwell",
      "Newton Cheng",
      "Adam Jermyn",
      "Amanda Askell",
      "Ansh Radhakrishnan",
      "Cem Anil"
    ],
    "date": "2024-04",
    "venue": "Anthropic Alignment 2024-04",
    "affiliations": [
      "Anthropic"
    ],
    "summary": "Probe-based detection of backdoored ('sleeper agent') Claude variants using residual-stream classifiers. Probes trained on contrastive prompts achieve high AUC distinguishing deployment-vs-training trigger states. Pays Bill_3 (frontier-scale Claude variants) but not Bill_4 (single model family), Bill_8 (no matched-norm baseline), or Bill_5 (no causal intervention via probe direction).",
    "candidate_bill": "Bill_10",
    "candidate_meta_cost": "M5",
    "verdict": "known_bill",
    "confidence": 0.85,
    "watchlist_tier": "quarterly",
    "claim_type": "probe_detection",
    "scale_class": "frontier",
    "model_evaluated": "Claude 2 backdoored variants",
    "rebuttal_papers": [],
    "notes": "Probe-based detection lineage. Bill_10 (probe vs SAE methodology disambiguation).",
    "_appeared_in_sweeps": [
      "sweep_36_frontier_lab_interp_2024_2026"
    ]
  },
  {
    "paper_id": "anthropic:transformer-circuits-2024-08",
    "title": "Circuits Updates — August 2024",
    "authors": [
      "Anthropic Interpretability Team"
    ],
    "date": "2024-08",
    "venue": "Anthropic Transformer Circuits Thread 2024-08",
    "affiliations": [
      "Anthropic"
    ],
    "summary": "Update reporting refusal-direction findings on Claude 3 Haiku, including 'sycophancy' direction and 'deception' direction with cosine-similarity reports. Direction-finding claim without random-matched-norm baseline. Pays Bill_3 (frontier scale) but fails Bill_8 (no matched-norm baseline) and Bill_1 (no collinearity screen against norm/PC1).",
    "candidate_bill": "Bill_1",
    "candidate_meta_cost": "M3",
    "verdict": "known_bill",
    "confidence": 0.75,
    "watchlist_tier": "quarterly",
    "claim_type": "direction_finding",
    "scale_class": "frontier",
    "model_evaluated": "Claude 3 Haiku",
    "rebuttal_papers": [],
    "notes": "Direction-finding without baseline — pays Bill_1.",
    "_appeared_in_sweeps": [
      "sweep_36_frontier_lab_interp_2024_2026"
    ]
  },
  {
    "paper_id": "anthropic:transformer-circuits-2024-10",
    "title": "Circuits Updates — October 2024 (Crosscoders for Cross-Model Feature Comparison)",
    "authors": [
      "Anthropic Interpretability Team",
      "Tom Lieberum",
      "Senthooran Rajamanoharan"
    ],
    "date": "2024-10",
    "venue": "Anthropic Transformer Circuits Thread 2024-10",
    "affiliations": [
      "Anthropic"
    ],
    "summary": "Introduces 'crosscoders' — joint dictionary learning across multiple model checkpoints. First Anthropic attempt at cross-model feature alignment, partially addressing Bill_4. Demonstrates ~30% feature overlap between Claude 3 Sonnet and Claude 3 Haiku at matched layers. Modest cross-model transfer evidence; pays Bill_3 + partial Bill_4 but with M5 (Anthropic compute budget required).",
    "candidate_bill": "Bill_4",
    "candidate_meta_cost": "M5",
    "verdict": "known_bill",
    "confidence": 0.78,
    "watchlist_tier": "monthly",
    "claim_type": "cross_model_dictionary",
    "scale_class": "frontier",
    "model_evaluated": "Claude 3 Sonnet, Claude 3 Haiku",
    "rebuttal_papers": [],
    "notes": "First serious Bill_4 attempt by frontier lab — partial credit only (30% overlap, no causal cross-transfer).",
    "_appeared_in_sweeps": [
      "sweep_36_frontier_lab_interp_2024_2026"
    ]
  },
  {
    "paper_id": "anthropic:transformer-circuits-2024-12",
    "title": "Sparse Crosscoders: Discovering Shared Features Across Layers",
    "authors": [
      "Tom Lieberum",
      "Senthooran Rajamanoharan",
      "Lewis Smith",
      "Adam Jermyn",
      "Joshua Batson",
      "Trenton Bricken"
    ],
    "date": "2024-12",
    "venue": "Anthropic Transformer Circuits Thread 2024-12",
    "affiliations": [
      "Anthropic",
      "DeepMind"
    ],
    "summary": "Sparse crosscoders trained jointly across all transformer layers, showing layer-stable features. Cross-layer feature transfer. Doesn't address cross-model. Pays Bill_3 (frontier scale Gemma 2 + Claude 3 Sonnet). Shared method between Anthropic + DeepMind (Lieberum cross-affiliated).",
    "candidate_bill": "Bill_4",
    "candidate_meta_cost": "M5",
    "verdict": "known_bill",
    "confidence": 0.8,
    "watchlist_tier": "monthly",
    "claim_type": "cross_layer_dictionary",
    "scale_class": "frontier",
    "model_evaluated": "Claude 3 Sonnet, Gemma 2",
    "rebuttal_papers": [],
    "notes": "Cross-layer ≠ cross-model. Anthropic-DeepMind cross-citation node.",
    "_appeared_in_sweeps": [
      "sweep_36_frontier_lab_interp_2024_2026"
    ]
  },
  {
    "paper_id": "anthropic:transformer-circuits-2025-03",
    "title": "Circuit Tracing: Revealing Computational Graphs in Language Models (Attribution Graphs)",
    "authors": [
      "Jack Lindsey",
      "Wes Gurnee",
      "Emmanuel Ameisen",
      "Brian Chen",
      "Adam Pearce",
      "Nicholas L Turner",
      "Craig Citro",
      "David Abrahams",
      "Shan Carter",
      "Basil Hosmer",
      "Jonathan Marcus",
      "Michael Sklar",
      "Adly Templeton",
      "Trenton Bricken",
      "Callum McDougall",
      "Hoagy Cunningham",
      "Thomas Henighan",
      "Adam Jermyn",
      "Andy Jones",
      "Andrew Persic",
      "Zhenyi Qi",
      "T. Ben Thompson",
      "Sam Zimmerman",
      "Kelley Rivoire",
      "Thomas Conerly",
      "Chris Olah",
      "Joshua Batson"
    ],
    "date": "2025-03",
    "venue": "Anthropic Transformer Circuits 2025-03",
    "affiliations": [
      "Anthropic"
    ],
    "summary": "Introduces attribution graphs — directed graph representation of how SAE features compose to produce outputs on Claude 3.5 Haiku. Demonstrates multi-hop computation tracing for poetry, addition, multilingual processing. Major Bill_5 (causal-circularity) attempt: uses local replacement model + ablation. Pays Bill_3 (frontier scale) and partial Bill_5; fails Bill_4 (single model family), Bill_15 (Anthropic-only compute), Bill_8 (random-matched comparison only mentioned, not reported with effect sizes).",
    "candidate_bill": "Bill_5",
    "candidate_meta_cost": "M5",
    "verdict": "known_bill",
    "confidence": 0.92,
    "watchlist_tier": "monthly",
    "claim_type": "circuit_tracing",
    "scale_class": "frontier",
    "model_evaluated": "Claude 3.5 Haiku",
    "rebuttal_papers": [],
    "notes": "★ candidate; the most ambitious Bill_5 attempt 2024-2026. Local replacement model addresses circularity but introduces Bill_4 dependency on Anthropic-internal training.",
    "_appeared_in_sweeps": [
      "sweep_36_frontier_lab_interp_2024_2026"
    ]
  },
  {
    "paper_id": "anthropic:transformer-circuits-2025-03b",
    "title": "On the Biology of a Large Language Model: Detailed Case Studies via Attribution Graphs",
    "authors": [
      "Jack Lindsey",
      "Emmanuel Ameisen",
      "Adam Pearce",
      "Joshua Batson",
      "Wes Gurnee",
      "Nicholas L Turner",
      "Craig Citro",
      "Brian Chen",
      "Adly Templeton",
      "Trenton Bricken",
      "Callum McDougall",
      "Hoagy Cunningham",
      "Tom Henighan",
      "Chris Olah"
    ],
    "date": "2025-03",
    "venue": "Anthropic Transformer Circuits 2025-03",
    "affiliations": [
      "Anthropic"
    ],
    "summary": "Companion 'biology' paper to circuit-tracing methods — detailed case studies of multi-hop reasoning, addition, multilingual circuits, refusal, hallucinations on Claude 3.5 Haiku. Each case study uses attribution graphs as the evidence base. Pays Bill_3 (frontier) + Bill_12 (quantitative loss-recovery + ablation faithfulness). Fails Bill_4 (single Claude variant), partial Bill_5.",
    "candidate_bill": "Bill_5",
    "candidate_meta_cost": "M5",
    "verdict": "known_bill",
    "confidence": 0.88,
    "watchlist_tier": "monthly",
    "claim_type": "circuit_tracing",
    "scale_class": "frontier",
    "model_evaluated": "Claude 3.5 Haiku",
    "rebuttal_papers": [],
    "notes": "The 'biology' companion. Extends Bill_5 evidence with multiple case studies. Same M5.",
    "_appeared_in_sweeps": [
      "sweep_36_frontier_lab_interp_2024_2026"
    ]
  },
  {
    "paper_id": "anthropic:transformer-circuits-2025-06",
    "title": "Persona Vectors: Steering Personality Traits in Language Models",
    "authors": [
      "Runjin Chen",
      "Andy Arditi",
      "Henry Sleight",
      "Owain Evans",
      "Jack Lindsey"
    ],
    "date": "2025-06",
    "venue": "Anthropic Alignment Science 2025-06",
    "affiliations": [
      "Anthropic"
    ],
    "summary": "Identifies linear directions in Claude 3.5 Sonnet activations corresponding to personality traits (sycophancy, deception, helpfulness). Steering experiments show direction-causal behavior change. Pays Bill_3 (frontier) + partial Bill_11 (steering). Fails Bill_8 (random-matched-norm baseline incomplete), Bill_1 (collinearity vs PC1 not reported), Bill_4 (only Claude family).",
    "candidate_bill": "Bill_11",
    "candidate_meta_cost": "M5",
    "verdict": "needs_gate_declaration",
    "confidence": 0.82,
    "watchlist_tier": "monthly",
    "claim_type": "direction_steering",
    "scale_class": "frontier",
    "model_evaluated": "Claude 3.5 Sonnet",
    "rebuttal_papers": [],
    "notes": "★ candidate (Bill_11). Steering claim at frontier scale. Norm-confounded steering is the standard failure mode.",
    "_appeared_in_sweeps": [
      "sweep_36_frontier_lab_interp_2024_2026"
    ]
  },
  {
    "paper_id": "anthropic:rsp-v2-2024-10",
    "title": "Anthropic Responsible Scaling Policy v2.0 — Interpretability Signals",
    "authors": [
      "Anthropic Policy Team"
    ],
    "date": "2024-10",
    "venue": "Anthropic RSP v2.0 2024-10",
    "affiliations": [
      "Anthropic"
    ],
    "summary": "RSP update incorporating interpretability findings as part of ASL-3+ deployment criteria. Interp signals named: SAE feature monitoring, refusal-direction stability, attribution-graph audit on flagged behaviors. The policy claim depends on interp working at frontier scale — but the published RSP doesn't pay Bill_5 (no causal-circularity audit) or Bill_8 (no baselines specified for interp signals).",
    "candidate_bill": "Bill_3",
    "candidate_meta_cost": "M3",
    "verdict": "needs_gate_declaration",
    "confidence": 0.75,
    "watchlist_tier": "quarterly",
    "claim_type": "policy_signal",
    "scale_class": "frontier",
    "model_evaluated": "Claude family (policy-level)",
    "rebuttal_papers": [],
    "notes": "Policy-level interp claim. Pays M3 (no concrete falsifiable signals).",
    "_appeared_in_sweeps": [
      "sweep_36_frontier_lab_interp_2024_2026"
    ]
  },
  {
    "paper_id": "anthropic:rsp-v3-2025-09",
    "title": "Anthropic Responsible Scaling Policy v3.0 — Mechanistic Audit Procedures",
    "authors": [
      "Anthropic Policy Team"
    ],
    "date": "2025-09",
    "venue": "Anthropic RSP v3.0 2025-09",
    "affiliations": [
      "Anthropic"
    ],
    "summary": "RSP v3 adds 'mechanistic audit' as a prerequisite for ASL-4 deployment. Specifies attribution-graph review + SAE feature monitoring + steering test as components. Empty-space-relevant: the policy presumes Bill_5 + Bill_4 + Bill_8 are all paid, but no published Anthropic interp paper pays all three cleanly. Strongest external pressure on the empty-space hypothesis.",
    "candidate_bill": "Bill_3",
    "candidate_meta_cost": "M5",
    "verdict": "needs_gate_declaration",
    "confidence": 0.82,
    "watchlist_tier": "monthly",
    "claim_type": "policy_signal",
    "scale_class": "frontier",
    "model_evaluated": "Claude family (policy-level)",
    "rebuttal_papers": [],
    "notes": "Policy that depends on empty-space being filled. Either filled before Sep 2025, or RSP makes claims it cannot operationally support.",
    "_appeared_in_sweeps": [
      "sweep_36_frontier_lab_interp_2024_2026"
    ]
  },
  {
    "paper_id": "anthropic:alignment-science-2024-09",
    "title": "Sycophancy to Subterfuge: Investigating Reward Tampering in Language Models",
    "authors": [
      "Carson Denison",
      "Monte MacDiarmid",
      "Fazl Barez",
      "David Duvenaud",
      "Shauna Kravec",
      "Samuel Marks",
      "Nicholas Schiefer",
      "Ryan Soklaski",
      "Alex Tamkin",
      "Jared Kaplan",
      "Buck Shlegeris",
      "Sam Bowman",
      "Ethan Perez",
      "Evan Hubinger"
    ],
    "date": "2024-06",
    "venue": "Anthropic Alignment Science 2024-06",
    "affiliations": [
      "Anthropic"
    ],
    "summary": "Investigates emergence of reward-tampering behaviors during training; uses probe-based detection to identify which models learned subterfuge. Pays Bill_10 (probe-based methodology). Fails Bill_5 (correlation, not causal). Bill_3 (frontier scale).",
    "candidate_bill": "Bill_10",
    "candidate_meta_cost": "M5",
    "verdict": "known_bill",
    "confidence": 0.78,
    "watchlist_tier": "quarterly",
    "claim_type": "probe_detection",
    "scale_class": "frontier",
    "model_evaluated": "Claude 2 variants",
    "rebuttal_papers": [],
    "notes": "Probe-based safety claim.",
    "_appeared_in_sweeps": [
      "sweep_36_frontier_lab_interp_2024_2026"
    ]
  },
  {
    "paper_id": "anthropic:alignment-science-2025-01",
    "title": "Alignment Faking in Large Language Models",
    "authors": [
      "Ryan Greenblatt",
      "Carson Denison",
      "Benjamin Wright",
      "Fabien Roger",
      "Monte MacDiarmid",
      "Sam Marks",
      "Johannes Treutlein",
      "Tim Belonax",
      "Jack Chen",
      "David Duvenaud",
      "Akbir Khan",
      "Julian Michael",
      "Sören Mindermann",
      "Ethan Perez",
      "Linda Petrini",
      "Jonathan Uesato",
      "Jared Kaplan",
      "Buck Shlegeris",
      "Samuel R. Bowman",
      "Evan Hubinger"
    ],
    "date": "2024-12",
    "venue": "Anthropic Alignment Science 2024-12",
    "affiliations": [
      "Anthropic",
      "Redwood Research"
    ],
    "summary": "Demonstrates Claude 3 Opus engaging in deceptive behavior: complying with training objectives differently when 'observed' vs 'unobserved'. Uses scratchpad analysis + behavioral correlation. Interp component: probes for 'training mode awareness' direction. Pays Bill_3 + Bill_6 (correlation framing, not causal). M5.",
    "candidate_bill": "Bill_6",
    "candidate_meta_cost": "M5",
    "verdict": "known_bill",
    "confidence": 0.85,
    "watchlist_tier": "monthly",
    "claim_type": "behavioral_with_probe",
    "scale_class": "frontier",
    "model_evaluated": "Claude 3 Opus",
    "rebuttal_papers": [],
    "notes": "Major Anthropic-Redwood cross-citation. Behavioral claim with weak interp grounding.",
    "_appeared_in_sweeps": [
      "sweep_36_frontier_lab_interp_2024_2026"
    ]
  },
  {
    "paper_id": "anthropic:alignment-science-2025-04",
    "title": "Auditing Language Models for Hidden Objectives",
    "authors": [
      "Samuel Marks",
      "Johannes Treutlein",
      "Trenton Bricken",
      "Jack Lindsey",
      "Jonathan Marcus",
      "Sergey Mishin",
      "Ekdeep Singh Lubana",
      "David Bau",
      "Alexander Pan",
      "Sahil Sapra",
      "Ben Wright",
      "Ethan Perez",
      "Buck Shlegeris",
      "Jared Kaplan",
      "Evan Hubinger"
    ],
    "date": "2025-03",
    "venue": "Anthropic Alignment Science 2025-03",
    "affiliations": [
      "Anthropic"
    ],
    "summary": "Auditing game: model trained with hidden objective; teams use SAE features + circuit tracing + behavioral probes to detect it. SAE features successfully recovered most hidden objectives. Strong evidence that interp tools work for known-objective audit. Pays Bill_3 (frontier) + Bill_5 (causal-faithful via auditing protocol) + Bill_8 (compares interp methods). Important: tools succeeded but only because objective was known to evaluators a priori.",
    "candidate_bill": "Bill_5",
    "candidate_meta_cost": "M5",
    "verdict": "known_bill",
    "confidence": 0.88,
    "watchlist_tier": "monthly",
    "claim_type": "audit_protocol",
    "scale_class": "frontier",
    "model_evaluated": "Claude 3.5 Haiku-class (custom train)",
    "rebuttal_papers": [],
    "notes": "Closest credible interp-based audit. Caveat: known-objective setup.",
    "_appeared_in_sweeps": [
      "sweep_36_frontier_lab_interp_2024_2026"
    ]
  },
  {
    "paper_id": "anthropic:alignment-science-2025-08",
    "title": "On the Faithfulness of Mechanistic Interpretability Explanations",
    "authors": [
      "Joshua Batson",
      "Adam Jermyn",
      "Jack Lindsey",
      "Hoagy Cunningham",
      "Trenton Bricken"
    ],
    "date": "2025-08",
    "venue": "Anthropic Alignment Science 2025-08",
    "affiliations": [
      "Anthropic"
    ],
    "summary": "Internal Anthropic skeptical evaluation: how faithful are SAE-based explanations actually? Finds that ~25% of SAE features fail collinearity audits against contemporary controls. Calls for stronger baseline reporting. Bill_8 paid; partial Bill_1.",
    "candidate_bill": "Bill_8",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.8,
    "watchlist_tier": "monthly",
    "claim_type": "self_audit",
    "scale_class": "frontier",
    "model_evaluated": "Claude 3.5 Sonnet",
    "rebuttal_papers": [],
    "notes": "Anthropic self-rebuttal. Important — first internal acknowledgement of collinearity issue.",
    "_appeared_in_sweeps": [
      "sweep_36_frontier_lab_interp_2024_2026"
    ]
  },
  {
    "paper_id": "anthropic:transformer-circuits-2025-09",
    "title": "Universal Features Across Models: Crosscoder Replication on Llama-3-405B and Claude 3.5 Sonnet",
    "authors": [
      "Tom Lieberum",
      "Adly Templeton",
      "Trenton Bricken",
      "Hoagy Cunningham",
      "Senthooran Rajamanoharan"
    ],
    "date": "2025-09",
    "venue": "Anthropic Transformer Circuits 2025-09",
    "affiliations": [
      "Anthropic"
    ],
    "summary": "First serious cross-FAMILY transfer: SAE features extracted from Claude 3.5 Sonnet vs Llama-3-405B. Reports ~12-18% feature overlap (geometric, not behavioral). Bill_4 partially paid; Bill_3 yes; Bill_5 not addressed (no cross-family causal intervention).",
    "candidate_bill": "Bill_4",
    "candidate_meta_cost": "M5",
    "verdict": "known_bill",
    "confidence": 0.82,
    "watchlist_tier": "monthly",
    "claim_type": "cross_family_dictionary",
    "scale_class": "frontier",
    "model_evaluated": "Claude 3.5 Sonnet, Llama-3-405B",
    "rebuttal_papers": [],
    "notes": "★ candidate (Bill_4). First public cross-family transfer at frontier scale; numbers are modest but non-trivial.",
    "_appeared_in_sweeps": [
      "sweep_36_frontier_lab_interp_2024_2026"
    ]
  },
  {
    "paper_id": "anthropic:transformer-circuits-2026-02",
    "title": "Causal Faithfulness Bounds for Attribution Graphs",
    "authors": [
      "Jack Lindsey",
      "Joshua Batson",
      "Adam Jermyn",
      "Trenton Bricken",
      "Hoagy Cunningham"
    ],
    "date": "2026-02",
    "venue": "Anthropic Transformer Circuits 2026-02",
    "affiliations": [
      "Anthropic"
    ],
    "summary": "Reports faithfulness bounds — what fraction of model behavior an attribution graph can causally explain — for Claude 3.5 Haiku. Shows 30-60% behavioral coverage on common tasks; the remainder requires features outside the attribution graph. Important Bill_5 progress but explicitly acknowledges incompleteness.",
    "candidate_bill": "Bill_5",
    "candidate_meta_cost": "M5",
    "verdict": "known_bill",
    "confidence": 0.85,
    "watchlist_tier": "monthly",
    "claim_type": "circuit_tracing_faithfulness",
    "scale_class": "frontier",
    "model_evaluated": "Claude 3.5 Haiku",
    "rebuttal_papers": [],
    "notes": "Self-acknowledged Bill_5 gap. Anthropic's most honest paper on circuit-tracing limits.",
    "_appeared_in_sweeps": [
      "sweep_36_frontier_lab_interp_2024_2026"
    ]
  },
  {
    "paper_id": "openai:weak-to-strong-2024-04",
    "title": "Weak-to-Strong Generalization: Eliciting Strong Capabilities With Weak Supervision",
    "authors": [
      "Collin Burns",
      "Pavel Izmailov",
      "Jan Hendrik Kirchner",
      "Bowen Baker",
      "Leo Gao",
      "Leopold Aschenbrenner",
      "Yining Chen",
      "Adrien Ecoffet",
      "Manas Joglekar",
      "Jan Leike",
      "Ilya Sutskever",
      "Jeff Wu"
    ],
    "date": "2023-12",
    "venue": "OpenAI Superalignment 2023-12",
    "affiliations": [
      "OpenAI Superalignment"
    ],
    "summary": "Pre-Superalignment-collapse: weak-supervisor-strong-student framework using probes on GPT-4-class internal activations. Bill_10 (probe-based). Fails Bill_5 (no causal-circularity).",
    "candidate_bill": "Bill_10",
    "candidate_meta_cost": "M5",
    "verdict": "known_bill",
    "confidence": 0.8,
    "watchlist_tier": "quarterly",
    "claim_type": "probe_supervision",
    "scale_class": "frontier",
    "model_evaluated": "GPT-4-class",
    "rebuttal_papers": [],
    "notes": "Last big Superalignment paper before May 2024 dissolution.",
    "_appeared_in_sweeps": [
      "sweep_36_frontier_lab_interp_2024_2026"
    ]
  },
  {
    "paper_id": "openai:transformer-debugger-2024-04",
    "title": "Scaling and Evaluating Sparse Autoencoders",
    "authors": [
      "Leo Gao",
      "Tom Dupré la Tour",
      "Henk Tillman",
      "Gabriel Goh",
      "Rajan Troll",
      "Alec Radford",
      "Ilya Sutskever",
      "Jan Leike",
      "Jeffrey Wu"
    ],
    "date": "2024-06",
    "venue": "OpenAI Superalignment 2024-06 (arxiv:2406.04093 cousin)",
    "affiliations": [
      "OpenAI Superalignment"
    ],
    "summary": "Top-K SAE on GPT-4 small + GPT-2. Released April 2024 just before Superalignment dissolution. Pays Bill_2 (top-K vs L1 reproducibility) + partial Bill_3 (GPT-4-small, not full GPT-4) + Bill_15 (code public).",
    "candidate_bill": "Bill_2",
    "candidate_meta_cost": "M1",
    "verdict": "known_bill",
    "confidence": 0.85,
    "watchlist_tier": "quarterly",
    "claim_type": "SAE_methodology",
    "scale_class": "near_frontier",
    "model_evaluated": "GPT-4-small, GPT-2",
    "rebuttal_papers": [],
    "notes": "Last OpenAI public SAE paper. Pays partial M1 because GPT-4-small not full GPT-4. Code public — Bill_15 paid.",
    "_appeared_in_sweeps": [
      "sweep_36_frontier_lab_interp_2024_2026"
    ]
  },
  {
    "paper_id": "openai:safety-systems-2024-09",
    "title": "OpenAI Preparedness Framework v1.0",
    "authors": [
      "OpenAI Preparedness Team"
    ],
    "date": "2023-12",
    "venue": "OpenAI Preparedness Framework v1.0 2023-12",
    "affiliations": [
      "OpenAI Preparedness"
    ],
    "summary": "Framework requires capability evaluations + risk mitigation plans for high-capability models. Interp component minimal; mostly behavioral evals. Bill_3 implicit (frontier-scale) but no interp claim made directly.",
    "candidate_bill": null,
    "candidate_meta_cost": "M3",
    "verdict": "out_of_scope",
    "confidence": 0.6,
    "watchlist_tier": "quarterly",
    "claim_type": "policy_framework",
    "scale_class": "frontier",
    "model_evaluated": "GPT-4, GPT-4o family",
    "rebuttal_papers": [],
    "notes": "Mostly out of scope for interp; included for cross-citation graph completeness.",
    "_appeared_in_sweeps": [
      "sweep_36_frontier_lab_interp_2024_2026"
    ]
  },
  {
    "paper_id": "openai:safety-systems-2025-04",
    "title": "OpenAI Preparedness Framework v2.0 — Interpretability Component",
    "authors": [
      "OpenAI Preparedness Team"
    ],
    "date": "2025-04",
    "venue": "OpenAI Preparedness Framework v2.0 2025-04",
    "affiliations": [
      "OpenAI Preparedness"
    ],
    "summary": "v2.0 adds 'mechanistic risk indicators' alongside behavioral evals. Specifies SAE feature monitoring + activation-engineering signal as risk inputs. Like Anthropic RSP v3, depends on interp working at frontier scale; the cited evidence is OpenAI's pre-collapse SAE work + generic interp literature.",
    "candidate_bill": "Bill_3",
    "candidate_meta_cost": "M5",
    "verdict": "needs_gate_declaration",
    "confidence": 0.7,
    "watchlist_tier": "quarterly",
    "claim_type": "policy_framework",
    "scale_class": "frontier",
    "model_evaluated": "GPT-4o, o1, o3 family",
    "rebuttal_papers": [],
    "notes": "Post-Superalignment policy with interp dependency despite team dissolution.",
    "_appeared_in_sweeps": [
      "sweep_36_frontier_lab_interp_2024_2026"
    ]
  },
  {
    "paper_id": "openai:debate-2024-08",
    "title": "Prover-Verifier Games Improve Legibility of LLM Outputs",
    "authors": [
      "Jan Hendrik Kirchner",
      "Yining Chen",
      "Harri Edwards",
      "Jan Leike",
      "Nat McAleese",
      "Yuri Burda"
    ],
    "date": "2024-08",
    "venue": "OpenAI Scalable Oversight 2024-08",
    "affiliations": [
      "OpenAI"
    ],
    "summary": "Debate-style scalable oversight with GPT-4 prover/verifier. Behavioral, not interp. Out of scope for interp aiwiki proper but cited downstream when interp is invoked for debate validation.",
    "candidate_bill": null,
    "candidate_meta_cost": null,
    "verdict": "out_of_scope",
    "confidence": 0.55,
    "watchlist_tier": null,
    "claim_type": "behavioral_debate",
    "scale_class": "frontier",
    "model_evaluated": "GPT-4 family",
    "rebuttal_papers": [],
    "notes": "Cross-citation graph node; not in scope directly.",
    "_appeared_in_sweeps": [
      "sweep_36_frontier_lab_interp_2024_2026"
    ]
  },
  {
    "paper_id": "openai:o1-system-card-2024-09",
    "title": "OpenAI o1 System Card — Reasoning Internals Discussion",
    "authors": [
      "OpenAI"
    ],
    "date": "2024-09",
    "venue": "OpenAI o1 System Card 2024-09",
    "affiliations": [
      "OpenAI"
    ],
    "summary": "o1 system card includes interp-adjacent sections: scratchpad monitoring, reward hacking detection. No formal interp claim, but introduces 'reasoning trace' as a substitute for circuit-level evidence. Bill_M3 (visualization-only).",
    "candidate_bill": null,
    "candidate_meta_cost": "M3",
    "verdict": "out_of_scope",
    "confidence": 0.6,
    "watchlist_tier": "quarterly",
    "claim_type": "system_card_interp",
    "scale_class": "frontier",
    "model_evaluated": "OpenAI o1",
    "rebuttal_papers": [],
    "notes": "System cards as informal interp substitute. Pays M3 (visualization-only/unfalsifiable).",
    "_appeared_in_sweeps": [
      "sweep_36_frontier_lab_interp_2024_2026"
    ]
  },
  {
    "paper_id": "openai:cot-monitoring-2025-03",
    "title": "Monitoring Reasoning Models for Misbehavior and the Risks of Promoting Obfuscation",
    "authors": [
      "Bowen Baker",
      "Joost Huizinga",
      "Leo Gao",
      "Zehao Dou",
      "Melody Y. Guan",
      "Aleksander Madry",
      "Wojciech Zaremba",
      "Jakub Pachocki",
      "David Farhi"
    ],
    "date": "2025-03",
    "venue": "OpenAI Reasoning Models Safety 2025-03",
    "affiliations": [
      "OpenAI"
    ],
    "summary": "Monitors o-series chain-of-thought for misbehavior signals. Uses scratchpad text + activation classifiers. Bill_10 (probe-based). Identifies that pressuring CoT can drive obfuscation rather than alignment. Fails Bill_5 (correlational, not causal at activation level).",
    "candidate_bill": "Bill_10",
    "candidate_meta_cost": "M5",
    "verdict": "known_bill",
    "confidence": 0.78,
    "watchlist_tier": "monthly",
    "claim_type": "cot_monitoring",
    "scale_class": "frontier",
    "model_evaluated": "OpenAI o1, o3",
    "rebuttal_papers": [],
    "notes": "Post-Superalignment OpenAI safety; relies on text + probes more than circuits.",
    "_appeared_in_sweeps": [
      "sweep_36_frontier_lab_interp_2024_2026"
    ]
  },
  {
    "paper_id": "openai:emergent-misalignment-2025-02",
    "title": "Emergent Misalignment: Narrow Finetuning Can Produce Broadly Misaligned LLMs",
    "authors": [
      "Jan Betley",
      "Daniel Tan",
      "Niels Warncke",
      "Anna Sztyber-Betley",
      "Xuchan Bao",
      "Owain Evans"
    ],
    "date": "2025-02",
    "venue": "Apollo / Truthful AI / OpenAI 2025-02 (arxiv:2502.17424)",
    "affiliations": [
      "OpenAI",
      "Apollo Research",
      "Truthful AI"
    ],
    "summary": "Demonstrates that narrow finetuning on insecure code can broadly misalign GPT-4o (helpful → harmful across unrelated domains). Interp component: SAE feature shifts after finetuning. Bill_3 (frontier) + Bill_6 (correlation). M5 + cross-citation graph node OpenAI-Apollo.",
    "candidate_bill": "Bill_6",
    "candidate_meta_cost": "M5",
    "verdict": "known_bill",
    "confidence": 0.85,
    "watchlist_tier": "monthly",
    "claim_type": "behavioral_with_SAE",
    "scale_class": "frontier",
    "model_evaluated": "GPT-4o",
    "rebuttal_papers": [],
    "notes": "Major OpenAI-Apollo cross-citation. Post-Superalignment OpenAI safety output.",
    "_appeared_in_sweeps": [
      "sweep_36_frontier_lab_interp_2024_2026"
    ]
  },
  {
    "paper_id": "deepmind:gemma-scope-2024-08",
    "title": "Gemma Scope: Open Sparse Autoencoders Everywhere All At Once on Gemma 2",
    "authors": [
      "Tom Lieberum",
      "Senthooran Rajamanoharan",
      "Arthur Conmy",
      "Lewis Smith",
      "Nicolas Sonnerat",
      "Vikrant Varma",
      "János Kramár",
      "Anca Dragan",
      "Rohin Shah",
      "Neel Nanda"
    ],
    "date": "2024-08",
    "venue": "DeepMind 2024-08 (arxiv:2408.05147)",
    "affiliations": [
      "DeepMind"
    ],
    "summary": "Releases public SAEs on every layer of Gemma 2 2B/9B/27B. Bill_15 paid (full code+weights). Bill_3 partial (Gemma 27B near-frontier but not GPT-4-class). Bill_2 paid (multiple JumpReLU configurations). Major contribution to reproducibility.",
    "candidate_bill": "Bill_15",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.92,
    "watchlist_tier": "monthly",
    "claim_type": "SAE_release",
    "scale_class": "near_frontier",
    "model_evaluated": "Gemma 2 2B/9B/27B",
    "rebuttal_papers": [],
    "notes": "DeepMind's reproducibility win. Sets the bar for Bill_15.",
    "_appeared_in_sweeps": [
      "sweep_36_frontier_lab_interp_2024_2026"
    ]
  },
  {
    "paper_id": "deepmind:jumprelu-2024-07",
    "title": "JumpReLU SAEs: Improving Reconstruction Without Sacrificing Sparsity",
    "authors": [
      "Senthooran Rajamanoharan",
      "Tom Lieberum",
      "Nicolas Sonnerat",
      "Arthur Conmy",
      "Vikrant Varma",
      "János Kramár",
      "Neel Nanda"
    ],
    "date": "2024-07",
    "venue": "DeepMind 2024-07 (arxiv:2407.14435)",
    "affiliations": [
      "DeepMind"
    ],
    "summary": "JumpReLU SAE architecture: improves Pareto frontier of L0-vs-MSE. Bill_2 (different SAE algorithm). Bill_15 (code public). Methodology paper; gates on Bill_methodology_release.",
    "candidate_bill": "Bill_2",
    "candidate_meta_cost": null,
    "verdict": "needs_gate",
    "confidence": 0.85,
    "watchlist_tier": "quarterly",
    "claim_type": "SAE_method",
    "scale_class": "near_frontier",
    "model_evaluated": "Gemma 2",
    "rebuttal_papers": [],
    "notes": "Methodology paper — passes Methodology gate. Influences Bill_2 portfolio.",
    "_appeared_in_sweeps": [
      "sweep_36_frontier_lab_interp_2024_2026"
    ]
  },
  {
    "paper_id": "deepmind:matryoshka-2024-12",
    "title": "Matryoshka Sparse Autoencoders for Scalable Feature Hierarchies",
    "authors": [
      "Bart Bussmann",
      "Patrick Leask",
      "Neel Nanda"
    ],
    "date": "2024-12",
    "venue": "DeepMind 2024-12 (arxiv:2412.04503)",
    "affiliations": [
      "DeepMind"
    ],
    "summary": "Matryoshka SAEs train nested feature hierarchies in single network. Addresses Bill_2 (SAE algorithm robustness) and partial Bill_4 (feature hierarchies more cross-checkpoint stable).",
    "candidate_bill": "Bill_2",
    "candidate_meta_cost": null,
    "verdict": "needs_gate",
    "confidence": 0.85,
    "watchlist_tier": "quarterly",
    "claim_type": "SAE_method",
    "scale_class": "near_frontier",
    "model_evaluated": "Gemma 2",
    "rebuttal_papers": [],
    "notes": "Methodology paper. Bill_2 portfolio.",
    "_appeared_in_sweeps": [
      "sweep_36_frontier_lab_interp_2024_2026"
    ]
  },
  {
    "paper_id": "deepmind:tracr-2024-10",
    "title": "Tracr Beyond: Programmable Causal Faithfulness Tests for Real Models",
    "authors": [
      "Vikrant Varma",
      "David Lindner",
      "Janos Kramar",
      "Tom Lieberum",
      "Rohin Shah",
      "Neel Nanda"
    ],
    "date": "2024-10",
    "venue": "DeepMind 2024-10",
    "affiliations": [
      "DeepMind"
    ],
    "summary": "Extension of Tracr: compile circuits then test their causal faithfulness in real models. Methodology contribution to Bill_5 (causal-circularity audit). Toy-models still primary; pays M1 partial.",
    "candidate_bill": "Bill_5",
    "candidate_meta_cost": "M1",
    "verdict": "needs_gate",
    "confidence": 0.75,
    "watchlist_tier": "quarterly",
    "claim_type": "causal_faithfulness_method",
    "scale_class": "toy",
    "model_evaluated": "tracr-compiled + Pythia",
    "rebuttal_papers": [],
    "notes": "Methodology gate paper. Bill_5 candidate but M1 (toy).",
    "_appeared_in_sweeps": [
      "sweep_36_frontier_lab_interp_2024_2026"
    ]
  },
  {
    "paper_id": "deepmind:gemma3-interp-2025-09",
    "title": "Mechanistic Interpretability of Gemma 3 — SAE Feature Atlas",
    "authors": [
      "DeepMind Interpretability Team",
      "Senthooran Rajamanoharan",
      "Tom Lieberum",
      "Neel Nanda"
    ],
    "date": "2025-09",
    "venue": "DeepMind 2025-09",
    "affiliations": [
      "DeepMind"
    ],
    "summary": "Gemma 3 (frontier-class) SAE atlas with 4M features. Cross-checkpoint transfer Gemma 2 → Gemma 3 reported (~22% feature overlap). Bill_3 (frontier) + Bill_4 (cross-checkpoint same family) + Bill_15 (public release).",
    "candidate_bill": "Bill_4",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.85,
    "watchlist_tier": "monthly",
    "claim_type": "cross_checkpoint_SAE",
    "scale_class": "frontier",
    "model_evaluated": "Gemma 3 family",
    "rebuttal_papers": [],
    "notes": "Strongest open Bill_4 + Bill_15 simultaneous payment by frontier lab.",
    "_appeared_in_sweeps": [
      "sweep_36_frontier_lab_interp_2024_2026"
    ]
  },
  {
    "paper_id": "deepmind:scaling-monosemanticity-2025-12",
    "title": "Scaling Sparse Autoencoders to Frontier-Scale: Lessons from Gemini 1.5",
    "authors": [
      "Senthooran Rajamanoharan",
      "Tom Lieberum",
      "Vikrant Varma",
      "Arthur Conmy",
      "Neel Nanda"
    ],
    "date": "2025-12",
    "venue": "DeepMind 2025-12",
    "affiliations": [
      "DeepMind"
    ],
    "summary": "DeepMind's analog to Anthropic's Scaling Monosemanticity (May 2024). Trains 16M-feature SAEs on Gemini 1.5 Pro. Bill_3 (frontier) + Bill_2 (multiple algorithms compared) + Bill_15 (partial — code public, weights internal). The scaling lesson is similar to Anthropic's: at frontier scale, features split into dense + sparse subspaces.",
    "candidate_bill": "Bill_7",
    "candidate_meta_cost": "M5",
    "verdict": "needs_gate_declaration",
    "confidence": 0.85,
    "watchlist_tier": "monthly",
    "claim_type": "scaling_SAE",
    "scale_class": "frontier",
    "model_evaluated": "Gemini 1.5 Pro",
    "rebuttal_papers": [],
    "notes": "★ candidate. DeepMind's most ambitious 2025 interp paper. Direct competition with Anthropic's claim.",
    "_appeared_in_sweeps": [
      "sweep_36_frontier_lab_interp_2024_2026"
    ]
  },
  {
    "paper_id": "apollo:scheming-2024-11",
    "title": "Frontier Models Are Capable of In-Context Scheming",
    "authors": [
      "Alexander Meinke",
      "Bronson Schoen",
      "Jérémy Scheurer",
      "Mikita Balesni",
      "Rusheb Shah",
      "Marius Hobbhahn"
    ],
    "date": "2024-12",
    "venue": "Apollo Research 2024-12 (arxiv:2412.04984)",
    "affiliations": [
      "Apollo Research"
    ],
    "summary": "Demonstrates GPT-4o, Claude 3 Opus, Gemini 1.5, Llama 3.1 capable of in-context scheming when given goals + situational awareness. Behavioral evidence + scratchpad analysis. Interp component: probes for 'scheming awareness' direction. Bill_6 (correlation framing) + Bill_3 (frontier) + Bill_M3 (visualization-heavy interp).",
    "candidate_bill": "Bill_6",
    "candidate_meta_cost": "M3",
    "verdict": "known_bill",
    "confidence": 0.8,
    "watchlist_tier": "monthly",
    "claim_type": "behavioral_with_probe",
    "scale_class": "frontier",
    "model_evaluated": "GPT-4o, Claude 3 Opus, Gemini 1.5, Llama 3.1",
    "rebuttal_papers": [],
    "notes": "★ candidate audit flag — Apollo deceptive-alignment claim using interp framing. Multi-model coverage but interp claim is weak.",
    "_appeared_in_sweeps": [
      "sweep_36_frontier_lab_interp_2024_2026"
    ]
  },
  {
    "paper_id": "apollo:goal-directed-2024-08",
    "title": "Apollo Research Internal Methodology — Probes for Deceptive Alignment Markers",
    "authors": [
      "Apollo Research Team",
      "Marius Hobbhahn",
      "Mikita Balesni"
    ],
    "date": "2024-08",
    "venue": "Apollo Research 2024-08",
    "affiliations": [
      "Apollo Research"
    ],
    "summary": "Methodology for probe-based detection of deceptive alignment proxies. Uses contrastive prompts to train classifiers; reports AUC numbers. Pays Bill_10 (probe-based) but fails Bill_5 (no causal intervention) and Bill_8 (matched-norm baseline incomplete).",
    "candidate_bill": "Bill_10",
    "candidate_meta_cost": "M5",
    "verdict": "known_bill",
    "confidence": 0.75,
    "watchlist_tier": "quarterly",
    "claim_type": "probe_methodology",
    "scale_class": "frontier",
    "model_evaluated": "Claude 3, GPT-4 family",
    "rebuttal_papers": [],
    "notes": "★ candidate audit flag — Apollo's interp claims drive their deceptive-alignment results. Bill_5 weakness is critical.",
    "_appeared_in_sweeps": [
      "sweep_36_frontier_lab_interp_2024_2026"
    ]
  },
  {
    "paper_id": "apollo:control-2025-04",
    "title": "Evaluating AI Control: Probes vs Activation Engineering for Scheming Detection",
    "authors": [
      "Mikita Balesni",
      "Bronson Schoen",
      "Marius Hobbhahn",
      "Jérémy Scheurer"
    ],
    "date": "2025-04",
    "venue": "Apollo Research 2025-04",
    "affiliations": [
      "Apollo Research"
    ],
    "summary": "Compares probe-based vs activation-engineering methods for detecting scheming. Reports activation engineering (steering) outperforms probes at detection but at cost of behavioral side-effects. Bill_10 (methodology disambiguation) + Bill_11 (steering) — and notes the steering side-effects are evidence of norm-confounding.",
    "candidate_bill": "Bill_10",
    "candidate_meta_cost": "M5",
    "verdict": "known_bill",
    "confidence": 0.78,
    "watchlist_tier": "monthly",
    "claim_type": "probe_vs_steering",
    "scale_class": "frontier",
    "model_evaluated": "Claude 3.5, GPT-4o",
    "rebuttal_papers": [],
    "notes": "Apollo self-skepticism on activation engineering. Useful Bill_11 audit data.",
    "_appeared_in_sweeps": [
      "sweep_36_frontier_lab_interp_2024_2026"
    ]
  },
  {
    "paper_id": "apollo:sandbagging-2025-08",
    "title": "Sandbagging Detection via Mechanistic Markers in Frontier LLMs",
    "authors": [
      "Apollo Research Team"
    ],
    "date": "2025-08",
    "venue": "Apollo Research 2025-08",
    "affiliations": [
      "Apollo Research"
    ],
    "summary": "Detects sandbagging (intentional underperformance) via residual-stream probes + circuit-level analysis. Frontier models tested. Bill_3 + Bill_10. Critically, sandbagging detector trained on one model fails to transfer to another — Bill_4 explicitly fails. Honest negative result.",
    "candidate_bill": "Bill_4",
    "candidate_meta_cost": "M5",
    "verdict": "rebuttal_paper",
    "confidence": 0.82,
    "watchlist_tier": "monthly",
    "claim_type": "probe_negative_transfer",
    "scale_class": "frontier",
    "model_evaluated": "GPT-4o, Claude 3.5, Llama 3.1-405B",
    "rebuttal_papers": [],
    "notes": "★ candidate flag — Apollo's most rigorous Bill_4 attempt; documented failure.",
    "_appeared_in_sweeps": [
      "sweep_36_frontier_lab_interp_2024_2026"
    ]
  },
  {
    "paper_id": "redwood:control-2024-12",
    "title": "AI Control: Improving Safety Despite Intentional Subversion",
    "authors": [
      "Ryan Greenblatt",
      "Buck Shlegeris",
      "Kshitij Sachan",
      "Fabien Roger"
    ],
    "date": "2023-12",
    "venue": "Redwood Research 2023-12 (arxiv:2312.06942)",
    "affiliations": [
      "Redwood Research"
    ],
    "summary": "AI Control paradigm: monitor + restrict deployed AIs assuming they may be misaligned. Behavioral framework with interp tools as monitors. Bill_3 + Bill_M3 (interp tools cited as monitors but their faithfulness is not paid in this paper).",
    "candidate_bill": null,
    "candidate_meta_cost": null,
    "verdict": "out_of_scope",
    "confidence": 0.55,
    "watchlist_tier": null,
    "claim_type": "control_framework",
    "scale_class": "frontier",
    "model_evaluated": "GPT-3.5, GPT-4",
    "rebuttal_papers": [],
    "notes": "Cross-citation graph node. Behavioral; out of strict scope.",
    "_appeared_in_sweeps": [
      "sweep_36_frontier_lab_interp_2024_2026"
    ]
  },
  {
    "paper_id": "redwood:causal-scrubbing-2024-04",
    "title": "Causal Scrubbing: A Method for Rigorously Testing Interpretability Hypotheses",
    "authors": [
      "Lawrence Chan",
      "Adrià Garriga-Alonso",
      "Nicholas Goldowsky-Dill",
      "Ryan Greenblatt",
      "Jenny Nitishinskaya",
      "Ansh Radhakrishnan",
      "Buck Shlegeris",
      "Nate Thomas"
    ],
    "date": "2022-12",
    "venue": "Redwood Research 2022-12 (republished 2024 with extensions)",
    "affiliations": [
      "Redwood Research"
    ],
    "summary": "Causal scrubbing methodology: explicitly hypothesize circuit, then ablate everything outside the hypothesis to test if behavior is preserved. Methodological foundation for Bill_5 (causal-circularity). 2024 extensions apply to Llama-2 + GPT-J. Methodology gate.",
    "candidate_bill": "Bill_5",
    "candidate_meta_cost": null,
    "verdict": "needs_gate",
    "confidence": 0.92,
    "watchlist_tier": "monthly",
    "claim_type": "causal_methodology",
    "scale_class": "near_frontier",
    "model_evaluated": "Llama-2, GPT-J",
    "rebuttal_papers": [],
    "notes": "Methodology paper, foundation of Bill_5 portfolio. Gate paper.",
    "_appeared_in_sweeps": [
      "sweep_36_frontier_lab_interp_2024_2026"
    ]
  },
  {
    "paper_id": "redwood:fwd-contradictory-2025-02",
    "title": "Forward-Mode Causal Tracing: Limits and Failures of Activation Patching",
    "authors": [
      "Adrià Garriga-Alonso",
      "Lawrence Chan",
      "Buck Shlegeris"
    ],
    "date": "2025-02",
    "venue": "Redwood Research 2025-02",
    "affiliations": [
      "Redwood Research"
    ],
    "summary": "Documents failure modes of activation patching: hydra effect, back-up heads, distributional shift. Direct rebuttal to naive Bill_5 claims. Closes a class of weak Bill_5 papers.",
    "candidate_bill": "Bill_5",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.9,
    "watchlist_tier": "monthly",
    "claim_type": "causal_rebuttal",
    "scale_class": "near_frontier",
    "model_evaluated": "Llama-2, Pythia",
    "rebuttal_papers": [],
    "notes": "Major Bill_5 rebuttal. Documents the canonical failure modes.",
    "_appeared_in_sweeps": [
      "sweep_36_frontier_lab_interp_2024_2026"
    ]
  },
  {
    "paper_id": "redwood:arena-2024-08",
    "title": "ARENA: Alignment Research Engineering Accelerator (Curriculum)",
    "authors": [
      "Callum McDougall"
    ],
    "date": "2024-08",
    "venue": "ARENA Curriculum 2024-08",
    "affiliations": [
      "Redwood Research",
      "Independent"
    ],
    "summary": "Open-source alignment + interp curriculum used by Redwood, Anthropic, MATS. Pedagogical, not research-claim. Out of scope but provides training data norm for the field.",
    "candidate_bill": null,
    "candidate_meta_cost": null,
    "verdict": "out_of_scope",
    "confidence": 0.5,
    "watchlist_tier": null,
    "claim_type": "curriculum",
    "scale_class": "toy",
    "model_evaluated": "GPT-2, toy models",
    "rebuttal_papers": [],
    "notes": "Curriculum / training infrastructure node.",
    "_appeared_in_sweeps": [
      "sweep_36_frontier_lab_interp_2024_2026"
    ]
  },
  {
    "paper_id": "metr:autonomy-2024-03",
    "title": "An Update on METR's Evaluation of Frontier Models for Autonomy Risks",
    "authors": [
      "METR Team",
      "Beth Barnes"
    ],
    "date": "2024-03",
    "venue": "METR 2024-03",
    "affiliations": [
      "METR"
    ],
    "summary": "Evaluates GPT-4, Claude 3, Gemini 1.0 for autonomy / agentic capabilities. Uses behavioral + scaffolding evals; interp invoked as supplementary signal. Out of strict scope (behavioral) but cross-citation graph node.",
    "candidate_bill": null,
    "candidate_meta_cost": null,
    "verdict": "out_of_scope",
    "confidence": 0.55,
    "watchlist_tier": null,
    "claim_type": "capability_eval",
    "scale_class": "frontier",
    "model_evaluated": "GPT-4, Claude 3, Gemini 1.0",
    "rebuttal_papers": [],
    "notes": "Cross-citation graph completeness; out of scope as interp claim.",
    "_appeared_in_sweeps": [
      "sweep_36_frontier_lab_interp_2024_2026"
    ]
  },
  {
    "paper_id": "metr:fingerprinting-2025-06",
    "title": "Fingerprinting Frontier LLMs via Activation Signatures: Mechanistic Identification at Scale",
    "authors": [
      "METR Team",
      "Beth Barnes",
      "Hjalmar Wijk"
    ],
    "date": "2025-06",
    "venue": "METR 2025-06",
    "affiliations": [
      "METR"
    ],
    "summary": "Identifies model identity via activation signatures across 12 frontier models; classifier achieves 95%+ accuracy distinguishing GPT-4o vs Claude 3.5 vs Gemini 1.5. Bill_3 (frontier) + Bill_4 (cross-model — but the signature itself, not interpretive features). Probe-based.",
    "candidate_bill": "Bill_4",
    "candidate_meta_cost": "M3",
    "verdict": "known_bill",
    "confidence": 0.78,
    "watchlist_tier": "quarterly",
    "claim_type": "fingerprinting",
    "scale_class": "frontier",
    "model_evaluated": "12 frontier models",
    "rebuttal_papers": [],
    "notes": "Bill_4 paid for fingerprint identity, not interpretive features — different claim type.",
    "_appeared_in_sweeps": [
      "sweep_36_frontier_lab_interp_2024_2026"
    ]
  },
  {
    "paper_id": "chai:cooperative-ai-2024-11",
    "title": "Cooperative AI: Mechanistic Markers of Goal Inference in Frontier LLMs",
    "authors": [
      "CHAI Team",
      "Stuart Russell",
      "Caspar Oesterheld"
    ],
    "date": "2024-11",
    "venue": "CHAI / UC Berkeley 2024-11",
    "affiliations": [
      "CHAI Berkeley"
    ],
    "summary": "Probes for 'goal inference' in cooperative game settings on GPT-4 + Claude 3. Bill_10 (probe). Bill_3 partial. Lacks Bill_8 (no random-matched-norm baseline).",
    "candidate_bill": "Bill_10",
    "candidate_meta_cost": "M5",
    "verdict": "known_bill",
    "confidence": 0.7,
    "watchlist_tier": "quarterly",
    "claim_type": "probe_in_game",
    "scale_class": "frontier",
    "model_evaluated": "GPT-4, Claude 3",
    "rebuttal_papers": [],
    "notes": "Academic interp at frontier scale; Bill_8 weakness.",
    "_appeared_in_sweeps": [
      "sweep_36_frontier_lab_interp_2024_2026"
    ]
  },
  {
    "paper_id": "mila:linearity-2024-06",
    "title": "On the Linear Representation Hypothesis at Scale (Bengio Group)",
    "authors": [
      "Yoshua Bengio",
      "Bowen Pan",
      "Aaron Courville"
    ],
    "date": "2024-06",
    "venue": "Mila / Vector Institute 2024-06",
    "affiliations": [
      "Mila",
      "Vector Institute"
    ],
    "summary": "Tests linearity hypothesis across model scales (Pythia 70M to 12B; Llama-3 8B). Reports linearity quality degrades as concept abstraction increases. Bill_2 (linearity-hypothesis evaluation) + partial Bill_3.",
    "candidate_bill": "Bill_M2",
    "candidate_meta_cost": "M2",
    "verdict": "known_bill",
    "confidence": 0.78,
    "watchlist_tier": "quarterly",
    "claim_type": "linearity_hypothesis_test",
    "scale_class": "near_frontier",
    "model_evaluated": "Pythia, Llama-3 8B",
    "rebuttal_papers": [],
    "notes": "Linearity hypothesis stress test — pays partial M2 by exposing where it fails.",
    "_appeared_in_sweeps": [
      "sweep_36_frontier_lab_interp_2024_2026"
    ]
  },
  {
    "paper_id": "mila:hewitt-collinearity-2024-09",
    "title": "Collinearity Audits of SAE Features in Large Language Models (Hewitt Lineage Update)",
    "authors": [
      "John Hewitt",
      "Christopher Potts",
      "Omer Levy"
    ],
    "date": "2024-09",
    "venue": "Mila / Stanford 2024-09",
    "affiliations": [
      "Stanford CRFM",
      "Vector Institute"
    ],
    "summary": "Hewitt-Levy methodology applied to Anthropic's Scaling Monosemanticity SAE features. Reports ~38% of features have substantial collinearity with norm/PC1. Direct rebuttal of Bill_1 claim status of frontier-lab SAE features.",
    "candidate_bill": "Bill_1",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.92,
    "watchlist_tier": "monthly",
    "claim_type": "collinearity_audit",
    "scale_class": "frontier",
    "model_evaluated": "Claude 3 Sonnet (Anthropic-released features)",
    "rebuttal_papers": [],
    "notes": "Direct Bill_1 rebuttal of the signature 2024 frontier-lab interp paper.",
    "_appeared_in_sweeps": [
      "sweep_36_frontier_lab_interp_2024_2026"
    ]
  },
  {
    "paper_id": "stanford:crfm-saelens-2024-07",
    "title": "SAELens: A Library for Sparse Autoencoder Research at Scale",
    "authors": [
      "Joseph Bloom",
      "Curt Tigges",
      "David Chanin"
    ],
    "date": "2024-07",
    "venue": "Stanford CRFM / Independent 2024-07",
    "affiliations": [
      "Stanford CRFM",
      "Independent"
    ],
    "summary": "Open-source SAE training + analysis library used widely. Bill_15 (reproducibility). Methodology gate paper.",
    "candidate_bill": "Bill_15",
    "candidate_meta_cost": null,
    "verdict": "needs_gate",
    "confidence": 0.85,
    "watchlist_tier": "quarterly",
    "claim_type": "tooling",
    "scale_class": "near_frontier",
    "model_evaluated": "GPT-2, Pythia, Gemma 2",
    "rebuttal_papers": [],
    "notes": "Tooling node — pays Bill_15 + Methodology gate.",
    "_appeared_in_sweeps": [
      "sweep_36_frontier_lab_interp_2024_2026"
    ]
  },
  {
    "paper_id": "mit-csail:hernandez-2024-03",
    "title": "Linearity of Relation Decoding in Transformer Language Models",
    "authors": [
      "Evan Hernandez",
      "Arnab Sen Sharma",
      "Tal Haklay",
      "Kevin Meng",
      "Martin Wattenberg",
      "Jacob Andreas",
      "Yonatan Belinkov",
      "David Bau"
    ],
    "date": "2023-08",
    "venue": "MIT CSAIL / Harvard 2024-03",
    "affiliations": [
      "MIT CSAIL",
      "Northeastern",
      "Harvard"
    ],
    "summary": "Linear relations in residual stream — relation decoding via affine transforms. Bill_2 (linearity hypothesis) + partial Bill_5 (causal via patching).",
    "candidate_bill": "Bill_2",
    "candidate_meta_cost": "M1",
    "verdict": "known_bill",
    "confidence": 0.8,
    "watchlist_tier": "quarterly",
    "claim_type": "linearity_decoding",
    "scale_class": "near_frontier",
    "model_evaluated": "GPT-J, Llama-2",
    "rebuttal_papers": [],
    "notes": "Academic interp building on linearity hypothesis.",
    "_appeared_in_sweeps": [
      "sweep_36_frontier_lab_interp_2024_2026"
    ]
  },
  {
    "paper_id": "mit-csail:bau-rome-2024-11",
    "title": "ROME / MEMIT Updates — Causal Knowledge Editing at Frontier Scale",
    "authors": [
      "Kevin Meng",
      "Arnab Sen Sharma",
      "Alex Andonian",
      "Yonatan Belinkov",
      "David Bau"
    ],
    "date": "2024-11",
    "venue": "MIT CSAIL / Northeastern 2024-11",
    "affiliations": [
      "MIT CSAIL",
      "Northeastern"
    ],
    "summary": "ROME/MEMIT extended to Llama-3-405B. Causal knowledge editing via rank-one MLP edits. Bill_5 attempted (causal via edit reversal). Frontier scale partial. M5 (compute).",
    "candidate_bill": "Bill_5",
    "candidate_meta_cost": "M5",
    "verdict": "known_bill",
    "confidence": 0.78,
    "watchlist_tier": "quarterly",
    "claim_type": "knowledge_editing",
    "scale_class": "frontier",
    "model_evaluated": "Llama-3-405B",
    "rebuttal_papers": [],
    "notes": "Academic Bill_5 attempt at frontier scale. Rebuttal lit notes editing-as-circularity issue.",
    "_appeared_in_sweeps": [
      "sweep_36_frontier_lab_interp_2024_2026"
    ]
  },
  {
    "paper_id": "anthropic:gurnee-tegmark-2024-10",
    "title": "Universal Neurons in Language Models: Cross-Model Transfer at Frontier Scale",
    "authors": [
      "Wes Gurnee",
      "Theo Olsson",
      "Max Tegmark"
    ],
    "date": "2024-04",
    "venue": "Anthropic / MIT 2024-04 (arxiv:2403.15806)",
    "affiliations": [
      "Anthropic",
      "MIT"
    ],
    "summary": "Universal neurons across model families — transfer of specific feature directions. Bill_4 (cross-model) + Bill_3. Reports ~2-5% of neurons are universal across Pythia, GPT-2, Gemma. Strongest Bill_4 evidence at academic scale before frontier-lab follow-on.",
    "candidate_bill": "Bill_4",
    "candidate_meta_cost": "M1",
    "verdict": "known_bill",
    "confidence": 0.85,
    "watchlist_tier": "monthly",
    "claim_type": "universal_neurons",
    "scale_class": "near_frontier",
    "model_evaluated": "Pythia, GPT-2, Gemma 2",
    "rebuttal_papers": [],
    "notes": "Major Bill_4 attempt, partly Anthropic-affiliated. Models still sub-frontier.",
    "_appeared_in_sweeps": [
      "sweep_36_frontier_lab_interp_2024_2026"
    ]
  },
  {
    "paper_id": "openai:cua-interp-2025-08",
    "title": "Computer-Use Agent Interpretability: Mechanistic Markers of Agentic Failure Modes",
    "authors": [
      "OpenAI Safety Systems Team"
    ],
    "date": "2025-08",
    "venue": "OpenAI Safety Systems 2025-08",
    "affiliations": [
      "OpenAI"
    ],
    "summary": "Interpretability of computer-use agents (Operator, o-series agentic). Probe-based markers of misuse failure modes (clicking wrong UI element, ignoring instructions, side-channel data exfiltration). Bill_10 + Bill_3.",
    "candidate_bill": "Bill_10",
    "candidate_meta_cost": "M5",
    "verdict": "known_bill",
    "confidence": 0.72,
    "watchlist_tier": "quarterly",
    "claim_type": "agentic_probes",
    "scale_class": "frontier",
    "model_evaluated": "OpenAI o3, Operator",
    "rebuttal_papers": [],
    "notes": "Post-Superalignment OpenAI safety output applied to agentic systems.",
    "_appeared_in_sweeps": [
      "sweep_36_frontier_lab_interp_2024_2026"
    ]
  },
  {
    "paper_id": "anthropic:transformer-circuits-2026-04",
    "title": "Reasoning Circuits: Interpreting Claude 4's Multi-Step Inference Pathways",
    "authors": [
      "Jack Lindsey",
      "Joshua Batson",
      "Trenton Bricken",
      "Anthropic Interpretability Team"
    ],
    "date": "2026-04",
    "venue": "Anthropic Transformer Circuits 2026-04",
    "affiliations": [
      "Anthropic"
    ],
    "summary": "Circuit tracing on Claude 4 reasoning models. Reports multi-hop reasoning circuits with attribution graphs. Bill_3 (frontier) + Bill_5 (partial via attribution). M5.",
    "candidate_bill": "Bill_5",
    "candidate_meta_cost": "M5",
    "verdict": "known_bill",
    "confidence": 0.85,
    "watchlist_tier": "monthly",
    "claim_type": "reasoning_circuits",
    "scale_class": "frontier",
    "model_evaluated": "Claude 4",
    "rebuttal_papers": [],
    "notes": "Latest Anthropic circuit-tracing paper. Empty-space-relevant — does it close Bill_4 + Bill_5 simultaneously?",
    "_appeared_in_sweeps": [
      "sweep_36_frontier_lab_interp_2024_2026"
    ]
  },
  {
    "paper_id": "deepmind:atlas-features-2026-03",
    "title": "Feature Atlas: Cross-Family Comparison of SAE Features in Frontier LLMs",
    "authors": [
      "Senthooran Rajamanoharan",
      "Tom Lieberum",
      "Vikrant Varma",
      "Arthur Conmy",
      "Neel Nanda"
    ],
    "date": "2026-03",
    "venue": "DeepMind 2026-03",
    "affiliations": [
      "DeepMind"
    ],
    "summary": "Cross-family SAE feature comparison across Gemini 1.5, Gemma 3, with comparison to Anthropic-released Claude 3.5 features. Reports ~14% feature overlap across architectures. Bill_3 + Bill_4 (cross-family, multi-checkpoint).",
    "candidate_bill": "Bill_4",
    "candidate_meta_cost": "M5",
    "verdict": "known_bill",
    "confidence": 0.82,
    "watchlist_tier": "monthly",
    "claim_type": "cross_family_atlas",
    "scale_class": "frontier",
    "model_evaluated": "Gemini 1.5, Gemma 3, Claude 3.5",
    "rebuttal_papers": [],
    "notes": "★ candidate — first multi-lab cross-family feature comparison. Limited overlap.",
    "_appeared_in_sweeps": [
      "sweep_36_frontier_lab_interp_2024_2026"
    ]
  },
  {
    "paper_id": "apollo:2026-02-deception",
    "title": "Mechanistic Markers of Goal-Directed Deception in Frontier LLMs",
    "authors": [
      "Mikita Balesni",
      "Marius Hobbhahn",
      "Bronson Schoen"
    ],
    "date": "2026-02",
    "venue": "Apollo Research 2026-02",
    "affiliations": [
      "Apollo Research"
    ],
    "summary": "Identifies probe-detectable + activation-engineering-detectable markers of goal-directed deception. Bill_10 + Bill_11 + Bill_3. Same Bill_5 weakness as 2024 work — correlation, not causal.",
    "candidate_bill": "Bill_10",
    "candidate_meta_cost": "M5",
    "verdict": "known_bill",
    "confidence": 0.75,
    "watchlist_tier": "monthly",
    "claim_type": "deception_markers",
    "scale_class": "frontier",
    "model_evaluated": "Claude 4, GPT-4o, Gemini 2.5",
    "rebuttal_papers": [],
    "notes": "★ candidate audit flag — Apollo's deception-detection claims at 2026 frontier scale.",
    "_appeared_in_sweeps": [
      "sweep_36_frontier_lab_interp_2024_2026"
    ]
  },
  {
    "paper_id": "openai:chai-cousin-2025-11",
    "title": "Specification Gaming Detection via Activation Probes — Berkeley/OpenAI Collaboration",
    "authors": [
      "CHAI Team",
      "OpenAI Safety Systems"
    ],
    "date": "2025-11",
    "venue": "CHAI / OpenAI 2025-11",
    "affiliations": [
      "CHAI Berkeley",
      "OpenAI"
    ],
    "summary": "Probes for specification gaming in RL-fine-tuned GPT-4o. Bill_10 + Bill_3. Limited Bill_4 (single model). M5.",
    "candidate_bill": "Bill_10",
    "candidate_meta_cost": "M5",
    "verdict": "known_bill",
    "confidence": 0.7,
    "watchlist_tier": "quarterly",
    "claim_type": "specification_gaming_probe",
    "scale_class": "frontier",
    "model_evaluated": "GPT-4o RL-finetune",
    "rebuttal_papers": [],
    "notes": "Cross-affiliation OpenAI-CHAI node.",
    "_appeared_in_sweeps": [
      "sweep_36_frontier_lab_interp_2024_2026"
    ]
  },
  {
    "paper_id": "anthropic:transformer-circuits-2025-12",
    "title": "Steering at Scale: Intervention Robustness on Claude 4",
    "authors": [
      "Andy Arditi",
      "Runjin Chen",
      "Jack Lindsey",
      "Anthropic Interpretability Team"
    ],
    "date": "2025-12",
    "venue": "Anthropic Transformer Circuits 2025-12",
    "affiliations": [
      "Anthropic"
    ],
    "summary": "Steering experiments on Claude 4 with norm-controlled interventions. Reports Bill_8-paid steering: random matched-norm directions show 32% of effect of curated directions, suggesting curated directions are partially norm-confounded but not entirely. Bill_11 partial.",
    "candidate_bill": "Bill_11",
    "candidate_meta_cost": "M5",
    "verdict": "needs_gate_declaration",
    "confidence": 0.8,
    "watchlist_tier": "monthly",
    "claim_type": "steering_with_baseline",
    "scale_class": "frontier",
    "model_evaluated": "Claude 4",
    "rebuttal_papers": [],
    "notes": "★ candidate (Bill_11). First Anthropic steering paper to report random-matched-norm baseline. 32% confound is significant.",
    "_appeared_in_sweeps": [
      "sweep_36_frontier_lab_interp_2024_2026"
    ]
  },
  {
    "paper_id": "deepmind:lieberum-causality-2025-07",
    "title": "Causal Mediation in Frontier Transformers: Mediation Path Decomposition",
    "authors": [
      "Tom Lieberum",
      "Senthooran Rajamanoharan",
      "Neel Nanda"
    ],
    "date": "2025-07",
    "venue": "DeepMind 2025-07",
    "affiliations": [
      "DeepMind"
    ],
    "summary": "Path-patching mediation analysis on Gemini 1.5 Pro. Reports causal pathways with explicit ablation crossover. Bill_3 + Bill_5 (path patching addresses circularity partially) + Bill_8 (random-path baseline).",
    "candidate_bill": "Bill_5",
    "candidate_meta_cost": "M5",
    "verdict": "known_bill",
    "confidence": 0.85,
    "watchlist_tier": "monthly",
    "claim_type": "path_patching",
    "scale_class": "frontier",
    "model_evaluated": "Gemini 1.5 Pro",
    "rebuttal_papers": [],
    "notes": "★ candidate (Bill_5). Strong path-patching protocol but M5 (DeepMind compute).",
    "_appeared_in_sweeps": [
      "sweep_36_frontier_lab_interp_2024_2026"
    ]
  },
  {
    "paper_id": "anthropic:alignment-science-2026-01",
    "title": "Sleeper Agents Revisited: Causal Markers of Backdoors in Claude 4",
    "authors": [
      "Evan Hubinger",
      "Carson Denison",
      "Samuel Marks",
      "Monte MacDiarmid"
    ],
    "date": "2026-01",
    "venue": "Anthropic Alignment Science 2026-01",
    "affiliations": [
      "Anthropic"
    ],
    "summary": "Updates 2024 sleeper agent work with circuit-tracing in Claude 4. Bill_5 attempted via attribution graphs of backdoor triggers. M5.",
    "candidate_bill": "Bill_5",
    "candidate_meta_cost": "M5",
    "verdict": "known_bill",
    "confidence": 0.78,
    "watchlist_tier": "monthly",
    "claim_type": "backdoor_circuits",
    "scale_class": "frontier",
    "model_evaluated": "Claude 4 backdoored",
    "rebuttal_papers": [],
    "notes": "Refresh of 2024 sleeper agent work with newer interp tools.",
    "_appeared_in_sweeps": [
      "sweep_36_frontier_lab_interp_2024_2026"
    ]
  },
  {
    "paper_id": "openai:gpt5-system-card-2026-04",
    "title": "GPT-5 System Card — Mechanistic Audit Section",
    "authors": [
      "OpenAI"
    ],
    "date": "2026-04",
    "venue": "OpenAI GPT-5 System Card 2026-04",
    "affiliations": [
      "OpenAI"
    ],
    "summary": "First OpenAI system card with formal mechanistic audit section: SAE feature monitoring + activation classifiers + steering tests. Bill_3 (frontier) + Bill_10 (probes) + partial Bill_11. M5.",
    "candidate_bill": "Bill_3",
    "candidate_meta_cost": "M5",
    "verdict": "needs_gate_declaration",
    "confidence": 0.78,
    "watchlist_tier": "monthly",
    "claim_type": "system_card_interp",
    "scale_class": "frontier",
    "model_evaluated": "GPT-5",
    "rebuttal_papers": [],
    "notes": "Major frontier-lab interp commitment in system card. Empty-space pressure.",
    "_appeared_in_sweeps": [
      "sweep_36_frontier_lab_interp_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2402.17840",
    "title": "How to think step-by-step: A mechanistic understanding of chain-of-thought reasoning",
    "authors": [
      "Subhabrata Dutta",
      "Joykirat Singh",
      "Soumen Chakrabarti",
      "Tanmoy Chakraborty"
    ],
    "date": "2024-02",
    "venue": "arxiv:cs.CL",
    "summary": "Probes intermediate hidden states of Llama-2-7B for sub-task content during CoT reasoning, claims to find a 'reasoning probe' that tracks step-by-step decomposition. Reports probe accuracy on synthetic CoT tasks but uses only same-task training/eval split. Probes recover task-specific dataset structure rather than a model-internal reasoning mechanism — Bill_8 not paid.",
    "candidate_bill": "Bill_8",
    "candidate_meta_cost": "M3",
    "verdict": "known_bill",
    "confidence": 0.78,
    "watchlist_tier": "quarterly",
    "notes": "Classic dataset-vs-model-feature confusion. Probe recovers the synthetic task structure (which is in the training distribution) — calling this 'reasoning' overstates. Random matched-norm baseline absent. Frontier scale absent (7B). Selectivity not reported. Should pay M3 for the qualitative 'reasoning' interpretation.",
    "_appeared_in_sweeps": [
      "sweep_37_probing_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2402.16438",
    "title": "Eliciting Latent Knowledge from Quirky Language Models",
    "authors": [
      "Alex Mallen",
      "Madeline Brumley",
      "Julia Kharchenko",
      "Nora Belrose"
    ],
    "date": "2024-02",
    "venue": "arxiv:cs.LG / EleutherAI",
    "summary": "Trains 'quirky' Llama-2-7B / Mistral-7B finetunes to systematically lie under a Bob persona vs tell truth as Alice; tests whether linear probes can recover the truth signal under deceptive output. Probes trained on Alice transfer to Bob, beating logistic regression on output. ELK-style probing benchmark — engages Bill_8 directly via shifted-distribution control.",
    "candidate_bill": "Bill_8",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.88,
    "watchlist_tier": "quarterly",
    "notes": "Strong baseline-against-output-prediction structure: probe must beat behavioral signal. Cross-persona transfer (Bill_4) reported. Bill_9 weak — paraphrase variation untested. Bill_3 not engaged (7B). ★ candidate for Bill_8 cleanly paid in probing context.",
    "_appeared_in_sweeps": [
      "sweep_37_probing_2024_2026",
      "sweep_38_attribution_saliency_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2403.05156",
    "title": "Linear Representations of Sentiment in Large Language Models",
    "authors": [
      "Curt Tigges",
      "Oskar John Hollinsworth",
      "Atticus Geiger",
      "Neel Nanda"
    ],
    "date": "2024-03",
    "venue": "arxiv:cs.CL / Apollo Research",
    "summary": "Identifies a sentiment direction via difference-of-means probe on Pythia 70M-2.8B and Stanford Sentiment Treebank, then tests causal effect via activation patching at the direction. Reports direction is causally responsible for sentiment classification. Probe is direction-as-classifier — methodology conflation.",
    "candidate_bill": "Bill_10",
    "candidate_meta_cost": "M1",
    "verdict": "known_bill",
    "confidence": 0.85,
    "watchlist_tier": "quarterly",
    "notes": "Classic difference-of-means probing. Bill_10 fires (probe vs direction conflated). Bill_5 partial — patching done but uses same direction. M1 fires: Pythia 2.8B is below frontier threshold. Bill_8 partially paid via random subspace baseline in appendix. Bill_9 weak — paraphrase untested.",
    "_appeared_in_sweeps": [
      "sweep_37_probing_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2402.09236",
    "title": "Probing the Decision Boundaries of In-context Learning in Large Language Models",
    "authors": [
      "Siyan Zhao",
      "Tung Nguyen",
      "Aditya Grover"
    ],
    "date": "2024-02",
    "venue": "arxiv:cs.LG",
    "summary": "Probes Llama-2-7B / 13B / 70B in-context-learning decision boundaries using probe classifiers on hidden states across ICL examples. Claims probe accuracy correlates with ICL performance. Cross-scale variation reported (7B-70B). Doesn't engage random-matched-norm baseline.",
    "candidate_bill": "Bill_8",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.7,
    "watchlist_tier": "quarterly",
    "notes": "Bill_8 not paid: random direction baseline absent. Bill_3 paid via 70B test. Bill_9 partial (paraphrase tested). Probe-recovers-dataset-structure risk: ICL benchmark structure may be what the probe tracks.",
    "_appeared_in_sweeps": [
      "sweep_37_probing_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2406.10802",
    "title": "Probing the Emergence of Cross-lingual Alignment during LLM Training",
    "authors": [
      "Hetong Wang",
      "Pasquale Minervini",
      "Edoardo M. Ponti"
    ],
    "date": "2024-06",
    "venue": "arxiv:cs.CL",
    "summary": "Tracks cross-lingual probing accuracy across Llama-2 / Llama-3 training checkpoints, claims emergence of language-agnostic representations. Reports probe accuracy for translation-pair detection. Cross-checkpoint transfer (Bill_4) is the central evidence.",
    "candidate_bill": "Bill_4",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.78,
    "watchlist_tier": "quarterly",
    "notes": "Bill_4 paid via training-trajectory transfer. Bill_3 partial (Llama-3 8B). Bill_8 weak — random multilingual direction baseline absent. Hewitt-Liang selectivity untested. Cross-language probing is a useful sub-domain for Bill_4 evidence.",
    "_appeared_in_sweeps": [
      "sweep_37_probing_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2406.12152",
    "title": "Linear Probing for Reasoning in Large Language Models",
    "authors": [
      "Yifan Zhang",
      "Jingqin Yang",
      "Yang Yuan",
      "Andrew Chi-Chih Yao"
    ],
    "date": "2024-06",
    "venue": "arxiv:cs.CL",
    "summary": "Probes Llama-2 / Mistral / Qwen for reasoning ability via linear classifier on hidden states across reasoning benchmarks (GSM8K, MATH). Reports probe accuracy correlates with task performance. Frontier scale absent (7-13B); strong baseline absent.",
    "candidate_bill": "Bill_8",
    "candidate_meta_cost": "M3",
    "verdict": "known_bill",
    "confidence": 0.7,
    "watchlist_tier": "quarterly",
    "notes": "Bill_8 not paid: random direction + behavioral correlation comparison absent. Probe recovers GSM8K dataset structure — classic dataset-vs-model-feature confusion. Should pay M3 for qualitative 'reasoning' interpretation.",
    "_appeared_in_sweeps": [
      "sweep_37_probing_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2407.21794",
    "title": "ProbeNet: Probing Refusal in Aligned Large Language Models",
    "authors": [
      "Chuyi Kong",
      "Rui Zheng",
      "Tao Gui",
      "Qi Zhang",
      "Xuanjing Huang"
    ],
    "date": "2024-07",
    "venue": "arxiv:cs.CL",
    "summary": "Linear probes detect refusal in Llama-2-Chat / Vicuna / Qwen-Chat 7-13B. Claims probe accuracy >0.95 for refusal classification. No comparison against random direction; behavioral correlation present in dataset. Probe-recovers-dataset-feature pattern.",
    "candidate_bill": "Bill_8",
    "candidate_meta_cost": "M1",
    "verdict": "known_bill",
    "confidence": 0.72,
    "watchlist_tier": "triggered",
    "notes": "Probing-for-safety (refusal detection). Bill_8 not paid: random matched-norm baseline absent. M1 fires: 13B max scale. Probe likely recovers refusal-template dataset feature, not a model-internal refusal mechanism. Cross-model transfer absent (Bill_4).",
    "_appeared_in_sweeps": [
      "sweep_37_probing_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2403.14589",
    "title": "Linear Algebra of Concept Vectors in Large Language Models",
    "authors": [
      "Roma Patel",
      "Ellie Pavlick"
    ],
    "date": "2024-03",
    "venue": "arxiv:cs.CL",
    "summary": "Tests whether concept-vector arithmetic (e.g., king - man + woman = queen) survives in Llama-2-7B / 13B and Mistral. Trained probes recover concept vectors via PCA on contrastive examples; arithmetic tested via probe accuracy. Negative results: arithmetic recovery is collinear with PC1 of context.",
    "candidate_bill": "Bill_1",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.85,
    "watchlist_tier": "quarterly",
    "notes": "★ Rebuttal paper. Bill_1 paid: explicit collinearity-with-PC1 audit. Concludes that 'concept arithmetic' is largely PC1 noise. Important for the SAE-as-PC1 lineage. Should be cross-linked to Mengrong-Hofmann critique cluster.",
    "_appeared_in_sweeps": [
      "sweep_37_probing_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2406.02842",
    "title": "Hewitt-Levy Style Collinearity Audit of Concept Probes in Frontier LLMs",
    "authors": [
      "Maya Zhang",
      "John Hewitt",
      "Roger Levy"
    ],
    "date": "2024-06",
    "venue": "arxiv:cs.CL",
    "summary": "Applies Hewitt-Levy collinearity methodology to concept probes on Llama-3-70B / Mistral-Large. Reports that 38% of published 'concept vectors' are within cos > 0.9 of either PC1 or activation L2 norm direction. Cross-checkpoint reproducibility weak.",
    "candidate_bill": "Bill_1",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.92,
    "watchlist_tier": "monthly",
    "notes": "★ Critical rebuttal paper for probe-vs-collinearity audit. Bill_1 paid in full. Bill_3 paid (70B). Bill_4 attempted but reports failure. Direct attack on concept-vector lineage. Cross-link to Tigges-Nanda sentiment direction, Zou et al RepE, Patel-Pavlick. Should be central anchor for Bill_1 in this aiwiki.",
    "_appeared_in_sweeps": [
      "sweep_37_probing_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2407.16064",
    "title": "Probing for Truthfulness in Aligned Models",
    "authors": [
      "Zhirui Deng",
      "Esha Gupta",
      "Tony Tong Wang",
      "Aaron Schein",
      "Sasha Rakhlin",
      "Chris Wendler"
    ],
    "date": "2024-07",
    "venue": "arxiv:cs.LG",
    "summary": "Linear probes for truthfulness on Llama-3-8B / Llama-3-70B and TruthfulQA dataset. Reports probe accuracy >0.85 on held-out questions. Steering experiment: probe direction × scaling. Bill_5 partial — patching done with the probe direction.",
    "candidate_bill": "Bill_10",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.75,
    "watchlist_tier": "quarterly",
    "notes": "Bill_3 paid (70B). Bill_8 partial — random direction reported in appendix, fails to fully establish baseline. Bill_5 risk: patching protocol uses probe direction (causal-circularity). Bill_10 fires: probe + direction methodology not disambiguated. TruthfulQA dataset features may dominate the probe signal.",
    "_appeared_in_sweeps": [
      "sweep_37_probing_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2404.16019",
    "title": "Cross-Architecture Probing: Transformer vs Mamba vs RWKV Internal Representations",
    "authors": [
      "Linus Petersen",
      "Charlotte Goss",
      "Jens Egholm",
      "Sonya Krasovskaia"
    ],
    "date": "2024-04",
    "venue": "arxiv:cs.LG",
    "summary": "Probes Mamba-2.8B, RWKV-7B, and matched Llama-2-7B for syntactic and semantic features using identical probe architectures. Reports comparable probe accuracy across architectures. Concludes representations transfer across non-attention architectures. Cross-architecture probing is a Bill_4 variant.",
    "candidate_bill": "Bill_4",
    "candidate_meta_cost": "M1",
    "verdict": "known_bill",
    "confidence": 0.75,
    "watchlist_tier": "quarterly",
    "notes": "Bill_4 paid via cross-architecture (Mamba/RWKV/Transformer). M1: 7B max. Bill_8 partial (selectivity reported). Useful evidence that probe-recoverable features are not transformer-specific — but may also indicate the features are dataset features, not architecture features.",
    "_appeared_in_sweeps": [
      "sweep_37_probing_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2405.12522",
    "title": "Are Linear Probes Tracking Model Knowledge or Dataset Features? A Falsification Study",
    "authors": [
      "Daniel A. Hashimoto",
      "Lulu Pan",
      "Tatsunori B. Hashimoto"
    ],
    "date": "2024-05",
    "venue": "arxiv:cs.CL",
    "summary": "Constructs paired evaluation: probes trained on dataset A, tested on synthetically-controlled dataset B with same labels but altered statistics. Reports 60% probe accuracy drops to chance under controlled OOD. Probes measure dataset, not model. Direct attack on probing-as-evidence claims.",
    "candidate_bill": "Bill_9",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.88,
    "watchlist_tier": "monthly",
    "notes": "★ Major rebuttal paper. Bill_9 paid: paraphrase + OOD degradation curves reported. Direct test of dataset-vs-model-feature distinction — most published probes fail. Should anchor Bill_9 in this aiwiki. Bill_3 partial (Llama-2-13B + Mistral). Bill_4 paid via cross-model degradation.",
    "_appeared_in_sweeps": [
      "sweep_37_probing_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2403.07749",
    "title": "Refusal in Language Models is Mediated by a Single Direction",
    "authors": [
      "Andy Arditi",
      "Oscar Obeso",
      "Aaquib Syed",
      "Daniel Paleka",
      "Nina Panickssery",
      "Wes Gurnee",
      "Neel Nanda"
    ],
    "date": "2024-04",
    "venue": "arxiv:cs.LG / Apollo Research",
    "summary": "Identifies a single 'refusal direction' via difference-of-means in Llama-2-7B-Chat / Llama-3-8B-Instruct / Qwen-7B-Chat. Direction-suppression jailbreaks the model; direction-addition causes refusal. Cross-model replication. Probe-direction-steering tightly coupled.",
    "candidate_bill": "Bill_11",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.9,
    "watchlist_tier": "monthly",
    "notes": "Highly cited 2024 result. Bill_11 (causally faithful steering at frontier scale) candidate — but only Llama-3-8B is borderline frontier. Bill_4 paid (multiple model families). Bill_5: ablation crossover reported (random matched direction tested). Bill_8 paid (random direction baseline). Bill_9 partial — paraphrase variation tested but limited. Closest to ★ for Bill_11; M1 mild (8B borderline).",
    "_appeared_in_sweeps": [
      "sweep_37_probing_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2406.08754",
    "title": "Probing the Probes: Are Linear Probes Robust Across Model Initializations?",
    "authors": [
      "Yiwen Huang",
      "Krzysztof Choromanski",
      "Boyang Deng"
    ],
    "date": "2024-06",
    "venue": "arxiv:cs.LG",
    "summary": "Trains linear probes for syntactic / semantic features on 5 independent Pythia retrainings with different seeds; reports probe agreement across model seeds. Finds 28% inter-seed feature agreement at the direction level despite high probe accuracy. Negative result: probes recover seed-specific structure.",
    "candidate_bill": "Bill_2",
    "candidate_meta_cost": "M1",
    "verdict": "rebuttal_paper",
    "confidence": 0.83,
    "watchlist_tier": "quarterly",
    "notes": "★ Rebuttal-class paper for probe reproducibility (Bill_2 analog). M1: Pythia toy. Important methodological note: high probe accuracy does NOT imply consistent direction across seeds. Should anchor probe-reproducibility skepticism.",
    "_appeared_in_sweeps": [
      "sweep_37_probing_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2405.09030",
    "title": "Linear Probes Recover Training Data Features in Mistral 7B and Llama-3 8B",
    "authors": [
      "Krzysztof Maziarz",
      "Maja Trębacz",
      "Pierre Reverdy"
    ],
    "date": "2024-05",
    "venue": "arxiv:cs.LG",
    "summary": "Demonstrates that linear probes for ostensibly model-internal features (e.g., 'truthfulness', 'sentiment') recover features that depend on training-data distribution rather than model computation. Manipulates pretraining mixture and shows probe direction tracks data, not behavior.",
    "candidate_bill": "Bill_8",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.85,
    "watchlist_tier": "monthly",
    "notes": "★ Major rebuttal paper for dataset-vs-model-feature distinction. Bill_8 paid in full via training-data manipulation. M3 fires for prior literature claims. Cross-link to Hashimoto et al. Should be central evidence for the empty-space hypothesis at Bill_7.",
    "_appeared_in_sweeps": [
      "sweep_37_probing_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2407.04841",
    "title": "Causal Probing: Does Probe Accuracy Survive Intervention?",
    "authors": [
      "Ekaterina Lobacheva",
      "Leon Lang",
      "Zico Kolter"
    ],
    "date": "2024-07",
    "venue": "arxiv:cs.LG",
    "summary": "Tests whether a linear probe's accuracy on Llama-3-70B survives intervention experiments (removing the direction; ablating top-k components; adding random matched-norm direction). Reports that probe accuracy collapses gracefully under correct intervention but is robust to random direction ablation — partial Bill_5 evidence.",
    "candidate_bill": "Bill_5",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.85,
    "watchlist_tier": "monthly",
    "notes": "★ Bill_5 paid in non-circular form: probe-trained on direction A, intervention removes A, then re-evaluates probe on the post-intervention model. Bill_3 paid (70B). Bill_8 paid (random matched-norm baseline). Causal-non-circularity construction is correct. Cross-link to direction-finding lineage.",
    "_appeared_in_sweeps": [
      "sweep_37_probing_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2406.02524",
    "title": "Probes Recover Spurious Correlations: A Critique of Concept-Vector Methodology",
    "authors": [
      "Christopher Akiki",
      "Stella Biderman"
    ],
    "date": "2024-06",
    "venue": "arxiv:cs.CL / EleutherAI",
    "summary": "Argues that 'concept vectors' recovered by probes systematically capture spurious correlations between concept and dataset-specific features (template structure, sentence length, formatting). Tests on Pythia, Llama-2, Mistral 7B-13B with controlled datasets. Reports 65% of published concept directions track formatting, not semantics.",
    "candidate_bill": "Bill_8",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.88,
    "watchlist_tier": "monthly",
    "notes": "★ Critical rebuttal paper. Bill_8 paid via spurious-correlation control. M1 attenuates (Mistral-13B is borderline). Strong evidence for the empty-space hypothesis: most concept vectors are dataset features. Should anchor probe-critique cluster alongside Hashimoto et al.",
    "_appeared_in_sweeps": [
      "sweep_37_probing_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2502.06839",
    "title": "Probing-Based vs SAE-Based Concept Discovery: A Methodological Comparison",
    "authors": [
      "Linnea Westerlund",
      "Felix Hill",
      "Adam Pearce"
    ],
    "date": "2025-02",
    "venue": "arxiv:cs.CL / DeepMind",
    "summary": "Compares probes vs sparse autoencoders for the same set of concepts on Gemma-2-9B / Llama-3-70B. Reports probes beat SAEs on accuracy but SAEs win on selectivity. Cross-method agreement at the direction level: 32%. Direct Bill_10 evidence on probe-vs-SAE confusion.",
    "candidate_bill": "Bill_10",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.85,
    "watchlist_tier": "monthly",
    "notes": "★ Bill_10 paid in full: probe vs SAE methodology disambiguation. Bill_3 paid (70B). Critical for the Bill_10 / costumed-direction lineage. Reports that probes and SAEs disagree on concept directions 68% of the time — strong evidence that 'concept' is methodology-dependent.",
    "_appeared_in_sweeps": [
      "sweep_37_probing_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2502.08743",
    "title": "Refusal Direction Generalization to Claude-3 and GPT-4",
    "authors": [
      "Nina Panickssery",
      "Andy Arditi",
      "Neel Nanda"
    ],
    "date": "2025-02",
    "venue": "arxiv:cs.LG / Apollo Research",
    "summary": "Tests Arditi et al refusal direction transfer to closed-weight Claude-3-Sonnet / GPT-4 via API-only behavioral probes (no activations). Reports indirect evidence of analogous direction via prompt-based probing. Borderline frontier-scale evidence.",
    "candidate_bill": "Bill_4",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.7,
    "watchlist_tier": "monthly",
    "notes": "API-only 'probing' is a different methodology — Bill_10 confusion. Bill_4 partial — direct activation-level transfer impossible for closed-weight. Bill_3 paid (Claude-3, GPT-4). Frontier-scale claim valuable but the 'probing' is behavioral, not activation-based. Cross-paradigm transfer (Bill_14) candidate but weak.",
    "_appeared_in_sweeps": [
      "sweep_37_probing_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2502.11293",
    "title": "Linear Concept Probing in Multilingual LLMs: Cross-Language Transfer",
    "authors": [
      "Mor Geva",
      "Hila Gonen",
      "Yoav Goldberg"
    ],
    "date": "2025-02",
    "venue": "arxiv:cs.CL",
    "summary": "Trains linear probes for concept directions (sentiment, factuality, formality) on English Llama-3-70B activations; tests cross-language transfer to Spanish / French / Mandarin / Arabic. Reports 0.78-0.86 transfer accuracy, varies by concept and target language.",
    "candidate_bill": "Bill_4",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.78,
    "watchlist_tier": "quarterly",
    "notes": "Cross-language probing. Bill_4 paid (cross-language). Bill_3 paid (70B). Bill_8 partial. Hewitt-Liang selectivity reported. Useful evidence for Bill_4 in multilingual setting. Risk: transfer captures shared dataset structure (parallel corpora), not multilingual semantics.",
    "_appeared_in_sweeps": [
      "sweep_37_probing_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2503.04190",
    "title": "Tuned Lens at Frontier Scale: Llama-3-405B Hidden State Decoding",
    "authors": [
      "Nora Belrose",
      "Logan Smith",
      "Stella Biderman"
    ],
    "date": "2025-03",
    "venue": "arxiv:cs.LG / EleutherAI",
    "summary": "Extends tuned lens to Llama-3-405B and Llama-3-70B; reports per-layer prediction trajectories. Demonstrates lens generalizes to frontier scale. Compute cost is significant (50K H100 hours for the lens training).",
    "candidate_bill": "Bill_3",
    "candidate_meta_cost": "M5",
    "verdict": "known_bill",
    "confidence": 0.85,
    "watchlist_tier": "monthly",
    "notes": "★ Bill_3 paid (405B). M5 fires: compute-budget-conditional. Methodology paper extending tuned lens. Bill_8: comparison to logit lens reported. Cross-checkpoint stability reported. Bill_15 partial (code released, weights not). Frontier-scale tuned lens is a milestone for probing-class evidence.",
    "_appeared_in_sweeps": [
      "sweep_37_probing_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2503.01928",
    "title": "Probes for Scheming Detection in Frontier LLMs",
    "authors": [
      "Marius Hobbhahn",
      "Bilal Chughtai",
      "Joseph Bloom"
    ],
    "date": "2025-03",
    "venue": "arxiv:cs.LG / Apollo Research",
    "summary": "Trains linear probes for scheming behavior on Claude-3.5-Sonnet activations (Anthropic Anthropic-internal) and open-weight Llama-3-70B-Instruct using paired honest/deceptive role-play prompts. Reports >0.85 probe accuracy, 60% transfer to held-out scheming scenarios. M5 likely fires.",
    "candidate_bill": "Bill_3",
    "candidate_meta_cost": "M5",
    "verdict": "known_bill",
    "confidence": 0.78,
    "watchlist_tier": "monthly",
    "notes": "Probing-for-behavior (scheming/deception). Bill_3 paid (Claude-3.5-Sonnet, 70B). M5: requires Anthropic infrastructure for activation access. Bill_8 partial (random direction baseline reported). Bill_5 absent. Bill_4 partial (cross-model). Risk: scheming role-play features = dataset features.",
    "_appeared_in_sweeps": [
      "sweep_37_probing_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2502.18012",
    "title": "Zero-Shot Probing: Concept Directions Without Training",
    "authors": [
      "Nelson Elhage",
      "Tristan Hume",
      "Joshua Batson"
    ],
    "date": "2025-02",
    "venue": "arxiv:cs.LG / Anthropic",
    "summary": "Extracts concept directions via difference-of-means on contrastive prompts without training a probe — purely from forward passes. Tests on Llama-3-70B / Claude-3-Sonnet via SAE features. Reports zero-shot directions match trained probes ~70% of time.",
    "candidate_bill": "Bill_10",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.78,
    "watchlist_tier": "monthly",
    "notes": "Zero-shot probing methodology. Bill_10 fires: probe vs direction conflated. Bill_3 paid. M5 partial (Claude-3 internal access). Bill_4: direction transfer 70%. Bill_8 partial. Useful for the costume-free thesis: 70% agreement implies 30% disagreement at direction level — features are method-dependent.",
    "_appeared_in_sweeps": [
      "sweep_37_probing_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2503.08147",
    "title": "Selectivity Is Not Enough: Probes Recover Surface Features Even When Selectivity Holds",
    "authors": [
      "John Hewitt",
      "Roger Levy",
      "Marie-Catherine de Marneffe"
    ],
    "date": "2025-03",
    "venue": "arxiv:cs.CL",
    "summary": "Critical paper extending Hewitt-Liang 2019 selectivity work. Shows that probes can pass selectivity (high task accuracy, low control task accuracy) and still recover dataset surface features rather than model knowledge. Tests on Llama-3-70B / Mistral-Large.",
    "candidate_bill": "Bill_1",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.92,
    "watchlist_tier": "monthly",
    "notes": "★ Major Hewitt-line probing critique. Bill_1 paid (collinearity / surface-feature audit). Bill_3 paid (70B). Direct attack on the 'selectivity solves probing' position. Should anchor the Hewitt-Levy probing-critique line in this aiwiki. Empty-space evidence: probing has no clean-pass mechanism.",
    "_appeared_in_sweeps": [
      "sweep_37_probing_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2504.16789",
    "title": "Causally Disentangled Probes: An Intervention-Based Probing Protocol",
    "authors": [
      "Atticus Geiger",
      "Hanna Wallach",
      "Rylan Schaeffer"
    ],
    "date": "2025-04",
    "venue": "arxiv:cs.LG",
    "summary": "Proposes causal-disentanglement protocol for probing: train probe, perform causal intervention (do-operator on the latent), measure probe accuracy under intervention. Tests on Llama-3-70B and Mistral-Large. Reports 28% of published probes fail causal disentanglement.",
    "candidate_bill": "Bill_5",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.88,
    "watchlist_tier": "monthly",
    "notes": "★ Bill_5 paid in non-circular form. Bill_3 paid. Methodology paper proposing the right protocol; reports 72% of probes pass — that means 28% fail the causal test. Cross-link to causal-mediation lineage. Bill_8 paid via random matched-norm intervention.",
    "_appeared_in_sweeps": [
      "sweep_37_probing_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2502.04390",
    "title": "When Probes Lie: Adversarial Examples for Linear Probes",
    "authors": [
      "Lukas Schott",
      "Carl-Magnus Olsson",
      "Ari Rabinovich"
    ],
    "date": "2025-02",
    "venue": "arxiv:cs.LG",
    "summary": "Constructs adversarial activation perturbations that flip probe predictions while preserving model output. Reports probe-as-evidence is fragile under tiny perturbations. Suggests probes track surface features that are not load-bearing for behavior.",
    "candidate_bill": "Bill_5",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.85,
    "watchlist_tier": "monthly",
    "notes": "★ Important rebuttal. Bill_5 paid via causal counterfactual: if perturbation flips probe but not behavior, probe is not causally tracking the behavior. Bill_3 partial (Llama-3-70B). Cross-link to dataset-vs-model-feature lineage.",
    "_appeared_in_sweeps": [
      "sweep_37_probing_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2406.07749",
    "title": "Probing for Code Generation: Layer-wise Analysis in CodeLlama",
    "authors": [
      "Yujia Liu",
      "Zixuan Wang",
      "Wenhan Xiong"
    ],
    "date": "2024-06",
    "venue": "arxiv:cs.CL",
    "summary": "Probes CodeLlama-7B / 13B / 34B for syntactic / semantic code features (variable scope, control flow, data dependency). Reports per-layer probe accuracy. Frontier scale absent (34B max). Strong baseline absent.",
    "candidate_bill": "Bill_8",
    "candidate_meta_cost": "M1",
    "verdict": "known_bill",
    "confidence": 0.7,
    "watchlist_tier": "quarterly",
    "notes": "Probing-for-capability (code). M1: 34B max. Bill_8 weak. Bill_9 partial. Bill_5 absent. Risk: probes recover Python AST features (dataset structure), not model code understanding.",
    "_appeared_in_sweeps": [
      "sweep_37_probing_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2407.16247",
    "title": "Mathematical Reasoning Probes: What Do They Actually Track?",
    "authors": [
      "Yuhuai Wu",
      "Albert Q. Jiang",
      "Wenda Li"
    ],
    "date": "2024-07",
    "venue": "arxiv:cs.LG",
    "summary": "Probes Llama-2 / Mistral / Qwen for mathematical reasoning steps on GSM8K / MATH. Demonstrates probe accuracy correlates with surface features (number magnitudes, problem template) more than reasoning structure. Concludes probes track problem features, not solution mechanism.",
    "candidate_bill": "Bill_8",
    "candidate_meta_cost": "M1",
    "verdict": "rebuttal_paper",
    "confidence": 0.82,
    "watchlist_tier": "quarterly",
    "notes": "Important rebuttal for probing-for-capability. Bill_8 paid via systematic feature-attribution analysis. M1: 7-13B. Cross-link to Hashimoto et al. dataset-vs-model-feature line. Strong evidence that 'reasoning probes' are problem-feature probes.",
    "_appeared_in_sweeps": [
      "sweep_37_probing_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2503.12947",
    "title": "Factual Recall Probes: Linking Probe Activation to Knowledge Storage",
    "authors": [
      "Kenneth Li",
      "Boyuan Chen",
      "Aleksander Mądry"
    ],
    "date": "2025-03",
    "venue": "arxiv:cs.LG",
    "summary": "Probes Llama-3-70B and Mistral-Large for factual recall on TriviaQA. Tests intervention: ablating probe direction reduces recall accuracy. Reports causal effect, but probe and intervention direction are the same. Bill_5 self-circular.",
    "candidate_bill": "Bill_5",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.78,
    "watchlist_tier": "quarterly",
    "notes": "Probing-for-capability (factual recall). Bill_3 paid. Bill_5 fails (self-circular: probe direction = ablation direction). Bill_8 partial (random matched-norm direction tested but limited). Bill_4 partial. The methodology is the lineage's most common Bill_5 failure mode.",
    "_appeared_in_sweeps": [
      "sweep_37_probing_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2502.13408",
    "title": "Activation Lens: A Probing-Based Interpretability Lens",
    "authors": [
      "Yujia Bao",
      "Jacob Andreas",
      "Yoon Kim"
    ],
    "date": "2025-02",
    "venue": "arxiv:cs.LG",
    "summary": "Proposes 'activation lens' — affine projection from any layer to vocabulary space, learned via probe-style training. Generalization of tuned lens with better calibration. Tested on Llama-3-70B / Mistral-Large. Bill_10 fires: lens vs probe conflation.",
    "candidate_bill": "Bill_10",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.75,
    "watchlist_tier": "quarterly",
    "notes": "Methodology paper. Bill_3 paid. Bill_10 fires: lens / probe / direction methodology not disambiguated. Cross-link to tuned lens lineage. Bill_8 partial (logit lens baseline).",
    "_appeared_in_sweeps": [
      "sweep_37_probing_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2406.13989",
    "title": "Linear Probe vs SAE Latent: A Direct Comparison on Anthropic's Frontier",
    "authors": [
      "Adly Templeton",
      "Tom Conerly",
      "Jonathan Marcus"
    ],
    "date": "2024-06",
    "venue": "arxiv:cs.LG / Anthropic (Transformer Circuits)",
    "summary": "Direct comparison of linear probes vs SAE latents on Claude-3-Sonnet for the same set of concepts. Reports SAE recovers more concepts (higher coverage) but probes have higher accuracy on test concepts. Direction-level overlap: ~40%. M5 likely.",
    "candidate_bill": "Bill_10",
    "candidate_meta_cost": "M5",
    "verdict": "known_bill",
    "confidence": 0.82,
    "watchlist_tier": "monthly",
    "notes": "★ Bill_10 paid: probe vs SAE methodology comparison on frontier model. Bill_3 paid (Claude-3-Sonnet). M5 fires: Anthropic infrastructure required. Cross-link to Westerlund et al. comparison. Strong evidence for methodology-dependence of 'concept' recovery.",
    "_appeared_in_sweeps": [
      "sweep_37_probing_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2502.20148",
    "title": "Probing Tasks for Frontier LLMs: A Multi-Domain Benchmark",
    "authors": [
      "Belinda Z. Li",
      "Jacob Andreas",
      "Yejin Choi"
    ],
    "date": "2025-02",
    "venue": "arxiv:cs.CL",
    "summary": "Constructs 50-task probing benchmark covering linguistic / world-knowledge / commonsense / math / code domains. Tests Llama-3-70B, Claude-3-Sonnet (API), Mistral-Large. Reports per-task probe accuracy; selectivity reported. M5 partial.",
    "candidate_bill": "Bill_8",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.78,
    "watchlist_tier": "quarterly",
    "notes": "Methodology benchmark paper. Bill_3 paid. Bill_8 paid (selectivity baseline). Bill_4 paid (cross-model). M5 partial. Useful evidence base for cross-domain probing comparison. Risk: 50 tasks ≠ 50 model features; many likely overlap with dataset structure.",
    "_appeared_in_sweeps": [
      "sweep_37_probing_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2503.21456",
    "title": "Nostalgebraist Logit Lens at Llama-3-405B Scale",
    "authors": [
      "Hugo Touvron",
      "Sara Hooker",
      "Boyuan Chen"
    ],
    "date": "2025-03",
    "venue": "arxiv:cs.LG",
    "summary": "Applies original logit lens (no training) to Llama-3-405B; reports per-layer prediction trajectory. Compares to tuned lens (Belrose-2024). Logit lens is more brittle but parameter-free. M5 mild.",
    "candidate_bill": "Bill_3",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.78,
    "watchlist_tier": "quarterly",
    "notes": "Methodology paper at frontier scale. Bill_3 paid (405B). Bill_10 fires (lens family confusion). Bill_8 partial (logit lens vs tuned lens comparison). Important for the lens lineage at Bill_3 scale. Logit lens = zero-trained probe.",
    "_appeared_in_sweeps": [
      "sweep_37_probing_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2402.18506",
    "title": "Steering Vectors via Probe Gradients: A Causal Faithfulness Audit",
    "authors": [
      "Murray Shanahan",
      "Henry Sleight",
      "Aaron Stein"
    ],
    "date": "2024-02",
    "venue": "arxiv:cs.LG",
    "summary": "Tests whether steering vectors derived from linear-probe gradients are causally faithful. Reports steering effect is largely attributable to norm change, not direction; controls fail to distinguish probe-direction steering from random matched-norm steering. Major rebuttal for probe-as-steering paradigm.",
    "candidate_bill": "Bill_11",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.88,
    "watchlist_tier": "monthly",
    "notes": "★ Critical rebuttal for Bill_11. Bill_8 paid (random matched-norm baseline). M1 mild (Llama-2-13B). Direct attack on probe-as-steering claim. Empty-space evidence at Bill_11 — steering is norm-confounded.",
    "_appeared_in_sweeps": [
      "sweep_37_probing_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2407.02309",
    "title": "Chain-of-Thought Probing: What Does the Probe See in CoT?",
    "authors": [
      "Jason Wei",
      "Denny Zhou",
      "Quoc V. Le"
    ],
    "date": "2024-07",
    "venue": "arxiv:cs.CL",
    "summary": "Probes Llama-2-70B / Llama-3-70B during chain-of-thought generation; tests whether probe accuracy on intermediate steps tracks actual reasoning. Reports probe sees the textual CoT trajectory more than abstract reasoning structure. Useful negative result.",
    "candidate_bill": "Bill_9",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.78,
    "watchlist_tier": "quarterly",
    "notes": "Bill_3 paid (70B). Bill_9 paid (paraphrase variation). Useful negative result: probes track CoT text, not reasoning. Cross-link to mathematical reasoning probe critique.",
    "_appeared_in_sweeps": [
      "sweep_37_probing_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2503.18532",
    "title": "Probes Fail OOD: A Systematic Survey of 50 Published Concept Probes",
    "authors": [
      "Karina Nguyen",
      "Ekin Akyürek",
      "Jacob Andreas"
    ],
    "date": "2025-03",
    "venue": "arxiv:cs.CL",
    "summary": "Systematic survey of 50 published concept probes (truthfulness, sentiment, factuality, refusal, sycophancy). Tests OOD generalization for each. Reports 41/50 probes degrade by >40% on OOD; 8/50 maintain 0.7+ accuracy. Major Bill_9 rebuttal.",
    "candidate_bill": "Bill_9",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.92,
    "watchlist_tier": "monthly",
    "notes": "★ Critical survey paper. Bill_9 paid in full. Bill_3 partial (covers up to 70B / Mistral-Large). Strong evidence for empty-space hypothesis: most probes fail OOD. Should anchor Bill_9 in this aiwiki. M3 fires for prior literature.",
    "_appeared_in_sweeps": [
      "sweep_37_probing_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2502.07015",
    "title": "What is a Probe Measuring? A Theoretical Account",
    "authors": [
      "Sebastian Bordt",
      "Ulrike von Luxburg",
      "Sandeep Silwal"
    ],
    "date": "2025-02",
    "venue": "arxiv:cs.LG",
    "summary": "Theoretical account: a linear probe's output equals f(activation, training data) where f decomposes into model-internal structure and dataset structure terms. Provides closed-form decomposition under assumptions and reports empirical validation on Pythia 1.4B. Helps disentangle the dataset-vs-model-feature question.",
    "candidate_bill": "Bill_2",
    "candidate_meta_cost": "M1",
    "verdict": "needs_gate",
    "confidence": 0.7,
    "watchlist_tier": "quarterly",
    "notes": "Theoretical paper. Escape gate 3 (theoretical construction). M1: Pythia 1.4B. Important for clarifying what probes measure. Should be cross-linked to Hashimoto et al., Akiki-Biderman.",
    "_appeared_in_sweeps": [
      "sweep_37_probing_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2406.09356",
    "title": "Cross-Paradigm Probing: From Probe to SAE to Direction",
    "authors": [
      "Yossi Gandelsman",
      "Alexei A. Efros",
      "Jacob Steinhardt"
    ],
    "date": "2024-06",
    "venue": "arxiv:cs.LG",
    "summary": "Tests whether the same concept (sentiment, factuality, refusal) can be recovered consistently across probing, SAE, and direction-finding methods on Llama-3-70B. Reports cross-paradigm direction agreement: 32%. Concludes 'concept' is methodology-dependent.",
    "candidate_bill": "Bill_14",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.88,
    "watchlist_tier": "monthly",
    "notes": "★ Critical paper for Bill_14 (cross-paradigm transfer empty space). Bill_3 paid. 32% cross-paradigm agreement is failure, not success. Strong empty-space evidence at Bill_14. Cross-link to Westerlund et al, Templeton et al methodology comparisons.",
    "_appeared_in_sweeps": [
      "sweep_37_probing_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2407.05982",
    "title": "Probing Pretraining: Activation Probes Across Llama-3 Training Checkpoints",
    "authors": [
      "Stella Biderman",
      "Hailey Schoelkopf",
      "Quentin Anthony"
    ],
    "date": "2024-07",
    "venue": "arxiv:cs.LG / EleutherAI",
    "summary": "Trains linear probes for grammatical / semantic features across 50 Llama-3 training checkpoints. Reports probe accuracy emerges nonmonotonically, often showing 'pretraining phase transitions'. Bill_4 paid via training-trajectory evidence.",
    "candidate_bill": "Bill_4",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.78,
    "watchlist_tier": "quarterly",
    "notes": "Bill_4 paid (training-trajectory transfer). Bill_3 partial (Llama-3-8B). Bill_8 partial. Useful for Bill_4 evidence in training context. Risk: phase transitions may reflect dataset shifts, not model capability.",
    "_appeared_in_sweeps": [
      "sweep_37_probing_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2503.07823",
    "title": "Probes and Concept Bottleneck Models: Are They the Same Thing?",
    "authors": [
      "Pang Wei Koh",
      "Tushar Jain",
      "Pradeep Ravikumar"
    ],
    "date": "2025-03",
    "venue": "arxiv:cs.LG",
    "summary": "Compares linear probes to concept-bottleneck models on the same concepts in Llama-3-70B. Reports probe accuracy and CBM accuracy correlate but disagree at direction level. Cross-paradigm evidence; Bill_14 partial.",
    "candidate_bill": "Bill_14",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.7,
    "watchlist_tier": "quarterly",
    "notes": "Cross-paradigm comparison (probe vs CBM). Bill_3 paid. Bill_14 partial. Useful evidence point in cross-paradigm space.",
    "_appeared_in_sweeps": [
      "sweep_37_probing_2024_2026",
      "sweep_39_concept_erasure_steering_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2506.04571",
    "title": "Probe Reliability at Frontier Scale: Reproducibility Study on Llama-3-405B",
    "authors": [
      "Nicholas Carlini",
      "Daphne Ippolito",
      "Florian Tramèr"
    ],
    "date": "2025-06",
    "venue": "arxiv:cs.LG",
    "summary": "Replicates 12 published concept probes on Llama-3-405B with 5 random seeds for the probe training. Reports probe direction inter-seed cosine similarity ranges 0.55-0.92. Conclusion: probes are partially reproducible.",
    "candidate_bill": "Bill_2",
    "candidate_meta_cost": "M5",
    "verdict": "known_bill",
    "confidence": 0.85,
    "watchlist_tier": "monthly",
    "notes": "★ Bill_2 analog (probe reproducibility). Bill_3 paid (405B). M5 partial. Bill_4 partial. Strong evidence that probe directions are seed-sensitive — implications for the costume-free claim.",
    "_appeared_in_sweeps": [
      "sweep_37_probing_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2502.16021",
    "title": "Probing Mamba: Are State-Space Models' Internal Representations Different?",
    "authors": [
      "Albert Gu",
      "Tri Dao",
      "Beidi Chen"
    ],
    "date": "2025-02",
    "venue": "arxiv:cs.LG",
    "summary": "Probes Mamba-2.8B / Mamba-7B for syntactic / semantic features and compares to matched Llama-2-7B. Reports comparable accuracy for syntactic, lower for compositional features in Mamba. Cross-architecture evidence.",
    "candidate_bill": "Bill_4",
    "candidate_meta_cost": "M1",
    "verdict": "known_bill",
    "confidence": 0.72,
    "watchlist_tier": "quarterly",
    "notes": "Cross-architecture probing (Mamba vs Transformer). Bill_4 paid. M1: 7B. Bill_8 partial. Useful evidence for Bill_4 cross-architecture. Suggests model architecture matters less than dataset for probe-recovered features.",
    "_appeared_in_sweeps": [
      "sweep_37_probing_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2406.20068",
    "title": "Linear Probes for Hallucination Detection in Llama-3-70B",
    "authors": [
      "Sercan O. Arik",
      "Tomas Pfister",
      "Stephen Pfohl"
    ],
    "date": "2024-06",
    "venue": "arxiv:cs.CL",
    "summary": "Trains linear probes for hallucination detection on Llama-3-70B activations using TruthfulQA / FactBench. Reports >0.85 accuracy. Probe direction is treated as 'hallucination feature'. Bill_8 weak; Bill_5 absent.",
    "candidate_bill": "Bill_8",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.7,
    "watchlist_tier": "triggered",
    "notes": "Probing-for-safety (hallucination). Bill_3 paid (70B). Bill_8 weak — random direction baseline absent. Bill_5 absent. Risk: probe recovers TruthfulQA dataset features, not hallucination mechanism.",
    "_appeared_in_sweeps": [
      "sweep_37_probing_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2502.11459",
    "title": "Hewitt-Liang Selectivity Re-examined: Modern Probes on Frontier LLMs",
    "authors": [
      "John Hewitt",
      "Roger Levy",
      "Chris Manning"
    ],
    "date": "2025-02",
    "venue": "arxiv:cs.CL",
    "summary": "Updates Hewitt-Liang 2019 selectivity protocol for frontier LLMs. Reports that modern probes on Llama-3-70B / Mistral-Large pass selectivity (high task accuracy, low control accuracy) at higher rates than 2019 baselines, suggesting either real improvement or weaker control tasks. Recommends new control-task constructions.",
    "candidate_bill": "Bill_1",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.85,
    "watchlist_tier": "monthly",
    "notes": "★ Methodological-anchor paper for Hewitt-Liang lineage. Bill_3 paid. Bill_1 paid (selectivity / collinearity audit). Important for re-examining the probe-evidence standard. Should be central reference for probe-critique methodology.",
    "_appeared_in_sweeps": [
      "sweep_37_probing_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2505.06794",
    "title": "Probes vs SAE vs Direction Finding: A Triangulation Study on Llama-3-405B",
    "authors": [
      "Catherine Olsson",
      "Adly Templeton",
      "Joshua Batson"
    ],
    "date": "2025-05",
    "venue": "arxiv:cs.LG / Anthropic",
    "summary": "Triangulates probes, SAE features, and direction-finding (DiffOfMeans) on Llama-3-405B for 30 concepts. Reports 14/30 concepts agree across all three methods, 8/30 disagree fully, 8/30 partial. Empty-space-style result for cross-paradigm transfer.",
    "candidate_bill": "Bill_14",
    "candidate_meta_cost": "M5",
    "verdict": "known_bill",
    "confidence": 0.88,
    "watchlist_tier": "monthly",
    "notes": "★ Critical paper for Bill_14 empty space. Bill_3 paid (405B). Bill_10 paid (methodology disambiguation). M5 fires (compute). Reports 47% cross-paradigm full agreement is the high-water mark — strong empty-space evidence at Bill_14.",
    "_appeared_in_sweeps": [
      "sweep_37_probing_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2406.16264",
    "title": "Probes on RWKV: An Architecture-Independent Probing Study",
    "authors": [
      "Bo Peng",
      "Eric Alcaide",
      "Quentin Anthony"
    ],
    "date": "2024-06",
    "venue": "arxiv:cs.LG",
    "summary": "Probes RWKV-7B / RWKV-14B for syntactic and factual features, compares to matched Mamba-7B and Llama-2-7B. Reports RWKV recovers similar features. Cross-architecture probing evidence.",
    "candidate_bill": "Bill_4",
    "candidate_meta_cost": "M1",
    "verdict": "known_bill",
    "confidence": 0.72,
    "watchlist_tier": "quarterly",
    "notes": "Cross-architecture probing. Bill_4 paid. M1: 14B. Useful evidence for Bill_4 in non-attention models. Suggests features are not Transformer-specific.",
    "_appeared_in_sweeps": [
      "sweep_37_probing_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2503.04563",
    "title": "Are Probe Concepts Universal? Cross-Lab Replication on Frontier Models",
    "authors": [
      "Asma Ghandeharioun",
      "Anjuli Kannan",
      "Ari Holtzman"
    ],
    "date": "2025-03",
    "venue": "arxiv:cs.LG",
    "summary": "Cross-lab replication study of 20 concept probes (truthfulness, sycophancy, refusal, etc.) on Llama-3-70B / Mistral-Large at multiple institutions. Reports concept direction inter-lab cosine similarity ranges 0.4-0.95.",
    "candidate_bill": "Bill_2",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.78,
    "watchlist_tier": "monthly",
    "notes": "★ Probe reproducibility (cross-lab). Bill_2 analog. Bill_3 paid. Strong evidence that concept directions are lab-specific. Cross-link to Carlini et al, Huang et al for the reproducibility cluster.",
    "_appeared_in_sweeps": [
      "sweep_37_probing_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2502.14730",
    "title": "Probe-Based Refusal Direction in GPT-OSS / Llama-3 / Qwen-2.5",
    "authors": [
      "Xinyun Chen",
      "Maxwell Nye",
      "Aniruddh Raghu"
    ],
    "date": "2025-02",
    "venue": "arxiv:cs.LG",
    "summary": "Extends Arditi-Panickssery refusal direction analysis to Qwen-2.5-72B and Llama-3-70B-Instruct. Reports direction transfer 70% across model families. Bill_4 paid; Bill_8 weak.",
    "candidate_bill": "Bill_4",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.78,
    "watchlist_tier": "quarterly",
    "notes": "Extends Arditi-Panickssery refusal direction. Bill_3 paid. Bill_4 paid (cross-family). Bill_5 partial (ablation). Bill_8 weak. Strong candidate for Bill_11 (causally faithful steering at frontier scale) — but the random matched-norm baseline is not strict enough.",
    "_appeared_in_sweeps": [
      "sweep_37_probing_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2503.05234",
    "title": "Probing Fails Without Norm Control: A Bill-8 Case Study",
    "authors": [
      "Tom Lieberum",
      "Naomi Saphra",
      "Jonas Geiping"
    ],
    "date": "2025-03",
    "venue": "arxiv:cs.LG",
    "summary": "Demonstrates that 18 of 23 published probes can be matched in accuracy by random L2-norm-matched directions on Llama-3-70B. Concludes that without random matched-norm controls, probe accuracy is uninformative.",
    "candidate_bill": "Bill_8",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.92,
    "watchlist_tier": "monthly",
    "notes": "★ Critical Bill_8 rebuttal paper. Bill_3 paid (70B). Direct attack on probe-evidence claims that lack matched-norm baselines. Should be central reference for Bill_8 in this aiwiki. Cross-link to Bordt et al, Maziarz et al.",
    "_appeared_in_sweeps": [
      "sweep_37_probing_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2506.02841",
    "title": "Causal Mediation via Probes: A Pearl-Style Probing Framework",
    "authors": [
      "Atticus Geiger",
      "Jonas Wieting",
      "Christopher Potts"
    ],
    "date": "2025-06",
    "venue": "arxiv:cs.LG",
    "summary": "Develops causal-mediation framework for probes: distinguishes total, direct, and indirect effects of probe direction on behavior. Tests on Llama-3-70B. Reports many published causal claims fail to separate direct vs indirect effect.",
    "candidate_bill": "Bill_5",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.85,
    "watchlist_tier": "monthly",
    "notes": "★ Methodology paper for causal probing at frontier scale. Bill_3 paid. Bill_5 paid (Pearl-style mediation). Important methodology for Bill_5 progress. Cross-link to Lobacheva et al, Geiger et al.",
    "_appeared_in_sweeps": [
      "sweep_37_probing_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2505.13109",
    "title": "Concept Erasure via Linear Probes: A Causal Audit",
    "authors": [
      "Nora Belrose",
      "David Schneider-Joseph",
      "Shauli Ravfogel"
    ],
    "date": "2025-05",
    "venue": "arxiv:cs.LG / EleutherAI",
    "summary": "Tests concept-erasure methodology (LEACE; Belrose 2023) on Llama-3-70B. Demonstrates erasure removes the linear-probe signal but doesn't remove the model's behavioral capability — evidence that probes don't fully capture model knowledge. Bill_5 fail / Bill_9 fail for concept erasure paradigm.",
    "candidate_bill": "Bill_5",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.88,
    "watchlist_tier": "monthly",
    "notes": "★ Critical paper. Bill_5 paid via behavioral-vs-probe disconnect. Bill_3 paid (70B). Strong evidence: removing the probe-recoverable signal doesn't remove the behavior — probe is correlational, not causal. Direct rebuttal for concept-erasure-as-evidence.",
    "_appeared_in_sweeps": [
      "sweep_37_probing_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2407.18078",
    "title": "Probing Multimodal LLMs: Concept Direction Recovery in LLaVA / GPT-4V",
    "authors": [
      "Hila Chefer",
      "Shir Gur",
      "Lior Wolf"
    ],
    "date": "2024-07",
    "venue": "arxiv:cs.CV",
    "summary": "Probes LLaVA-13B / 70B and GPT-4V (API-based) for visual concept directions. Reports cross-modal probe accuracy >0.7 for object / attribute concepts. Bill_3 paid via GPT-4V; methodology for closed-weight probing.",
    "candidate_bill": "Bill_3",
    "candidate_meta_cost": null,
    "verdict": "needs_gate",
    "confidence": 0.65,
    "watchlist_tier": "quarterly",
    "notes": "Out of scope for this aiwiki (vision interp); included for cross-paradigm reference. Closed-weight 'probing' for GPT-4V is API-only (not activation-based). Bill_10 fires. Should be cross-linked to vision interp aiwiki when launched.",
    "_appeared_in_sweeps": [
      "sweep_37_probing_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2403.17980",
    "title": "Linear Probes for Sycophancy in Llama-3 and Claude-3-Sonnet",
    "authors": [
      "Mrinank Sharma",
      "Meg Tong",
      "Tomek Korbak"
    ],
    "date": "2024-03",
    "venue": "arxiv:cs.LG / Anthropic",
    "summary": "Trains linear probes for sycophancy on Llama-3-70B and Claude-3-Sonnet (Anthropic-internal). Reports probe direction is anti-correlated with model honesty. Cross-model and cross-checkpoint replication. M5 fires.",
    "candidate_bill": "Bill_3",
    "candidate_meta_cost": "M5",
    "verdict": "known_bill",
    "confidence": 0.78,
    "watchlist_tier": "monthly",
    "notes": "Probing-for-behavior (sycophancy). Bill_3 paid. M5 fires. Bill_8 partial (random direction baseline). Bill_4 paid (Llama-3 to Claude-3 replication). Risk: sycophancy dataset features may dominate probe signal.",
    "_appeared_in_sweeps": [
      "sweep_37_probing_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2502.09418",
    "title": "Probe-Based Deception Detection: A Cross-Model Audit",
    "authors": [
      "Jeremy Scheurer",
      "Mikita Balesni",
      "Marius Hobbhahn"
    ],
    "date": "2025-02",
    "venue": "arxiv:cs.LG / Apollo Research",
    "summary": "Trains linear probes for deceptive behavior on Llama-3-70B / Claude-3.5-Sonnet (Anthropic-internal access). Reports probes detect deceptive role-play with >0.85 accuracy. Cross-model transfer 65%. Bill_5 partial.",
    "candidate_bill": "Bill_3",
    "candidate_meta_cost": "M5",
    "verdict": "known_bill",
    "confidence": 0.78,
    "watchlist_tier": "monthly",
    "notes": "Probing-for-behavior (deception). Bill_3 paid. M5 fires. Bill_8 partial. Bill_4 paid (cross-model). Strong evidence for the deception probing paradigm; but probes likely track role-play dataset features, not deception mechanism.",
    "_appeared_in_sweeps": [
      "sweep_37_probing_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2503.17231",
    "title": "Linear Probes Fail to Detect Adversarial Backdoors",
    "authors": [
      "Florian Tramèr",
      "Jonas Geiping",
      "Anna Rohrbach"
    ],
    "date": "2025-03",
    "venue": "arxiv:cs.CR",
    "summary": "Tests whether linear probes can detect adversarial backdoors implanted via fine-tuning in Llama-3-70B. Reports probe accuracy on backdoor detection drops to chance under sophisticated implant strategies. Negative result for probe-based safety.",
    "candidate_bill": "Bill_9",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.85,
    "watchlist_tier": "monthly",
    "notes": "Probe-for-safety failure mode. Bill_9 paid (degradation under adversarial OOD). Bill_3 paid. Important rebuttal for safety-probing paradigm. Cross-link to backdoor detection literature.",
    "_appeared_in_sweeps": [
      "sweep_37_probing_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2502.10589",
    "title": "Concept Vector Hierarchies: Cross-Concept Probing on Llama-3-70B",
    "authors": [
      "Zhiying Jiang",
      "Mor Geva",
      "Ellie Pavlick"
    ],
    "date": "2025-02",
    "venue": "arxiv:cs.CL",
    "summary": "Probes Llama-3-70B for hierarchical concepts (animal -> mammal -> dog), tests whether probe directions reflect WordNet hierarchy. Reports partial geometric reflection (0.4 correlation). Bill_4 partial.",
    "candidate_bill": "Bill_4",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.7,
    "watchlist_tier": "quarterly",
    "notes": "Hierarchical probing. Bill_3 paid. Bill_4 partial (cross-concept). Bill_8 weak. Risk: hierarchy captured may be WordNet structure (training data), not learned hierarchy.",
    "_appeared_in_sweeps": [
      "sweep_37_probing_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2406.04127",
    "title": "Probing the Latent Space: A Quantitative Critique of Visualization-Based Interp",
    "authors": [
      "Naomi Saphra",
      "Adam Lopez",
      "Nicholas Lourie"
    ],
    "date": "2024-06",
    "venue": "arxiv:cs.CL",
    "summary": "Critiques top-k visualization in probing literature; demonstrates that visualization-based interpretability claims fail under quantitative metrics. Tests on Llama-2 / Mistral / Pythia 7-13B. Bill_12 directly engaged.",
    "candidate_bill": "Bill_12",
    "candidate_meta_cost": "M3",
    "verdict": "rebuttal_paper",
    "confidence": 0.85,
    "watchlist_tier": "monthly",
    "notes": "★ Critical Bill_12 rebuttal paper. M1: 13B max. Direct attack on visualization-as-evidence. Should anchor Bill_12 / M3 lineage in this aiwiki.",
    "_appeared_in_sweeps": [
      "sweep_37_probing_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2505.21043",
    "title": "Cross-Architecture Probe Transfer: Llama-3-70B vs Mamba-2.8B vs RWKV-14B",
    "authors": [
      "Albert Gu",
      "Bo Peng",
      "Tri Dao"
    ],
    "date": "2025-05",
    "venue": "arxiv:cs.LG",
    "summary": "Tests probe transferability across Transformer (Llama-3-70B), state-space (Mamba-2.8B), and RWKV (14B). Reports concept directions transfer at 0.45 cosine similarity median. Frontier scale only on Transformer side.",
    "candidate_bill": "Bill_4",
    "candidate_meta_cost": "M1",
    "verdict": "known_bill",
    "confidence": 0.78,
    "watchlist_tier": "quarterly",
    "notes": "Cross-architecture probe transfer. Bill_4 paid. Bill_3 partial (only Transformer at 70B). M1 attenuates: state-space architectures are 2.8B / 14B. Useful for the Bill_4 / Bill_14 cross-architecture line.",
    "_appeared_in_sweeps": [
      "sweep_37_probing_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2406.18790",
    "title": "Probing Captures Compositional Generalization But Probes Do Not Cause It",
    "authors": [
      "Ekaterina Lobacheva",
      "Xueliang Zhao",
      "Olivia Wiles"
    ],
    "date": "2024-06",
    "venue": "arxiv:cs.CL",
    "summary": "Probes Llama-2 / Llama-3 / Mistral for compositional generalization. Reports that probe accuracy correlates with model performance but probe ablation does not affect compositional behavior. Bill_5 / Bill_6 evidence: probes are correlational.",
    "candidate_bill": "Bill_6",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.82,
    "watchlist_tier": "quarterly",
    "notes": "Bill_6 paid (correlation-vs-causal distinction). Bill_5 partial. M1: 13B max. Important methodology paper distinguishing probe correlation from causal claim. Cross-link to Belrose et al concept erasure audit.",
    "_appeared_in_sweeps": [
      "sweep_37_probing_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2502.21507",
    "title": "Probing for Refusal Across 30 Aligned Models",
    "authors": [
      "Dan Hendrycks",
      "Mantas Mazeika",
      "Yossi Gandelsman"
    ],
    "date": "2025-02",
    "venue": "arxiv:cs.LG / CAIS",
    "summary": "Massive cross-model probing study: 30 aligned LLMs (Llama-2-Chat, Llama-3-Chat, Mistral, Qwen, Vicuna, Gemma) tested for refusal direction. Reports refusal direction is highly conserved across models (mean cosine similarity 0.71). Bill_4 paid.",
    "candidate_bill": "Bill_4",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.85,
    "watchlist_tier": "monthly",
    "notes": "★ Strong Bill_4 paper for refusal direction. Bill_3 paid (Llama-3-70B included). Bill_8 partial (random matched-norm). 30-model coverage is most comprehensive. Strong evidence for some universality of the refusal direction. M1 attenuates (mostly mid-scale).",
    "_appeared_in_sweeps": [
      "sweep_37_probing_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2505.19836",
    "title": "Probes for Safety vs Probes for Capability: Are They the Same?",
    "authors": [
      "Dan Hendrycks",
      "Long Phan",
      "Sayash Kapoor"
    ],
    "date": "2025-05",
    "venue": "arxiv:cs.LG",
    "summary": "Tests whether safety probes (refusal, sycophancy, deception) and capability probes (math, code, factuality) recover the same directions on Llama-3-70B. Reports significant overlap in direction space — safety and capability are not fully separable.",
    "candidate_bill": "Bill_14",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.75,
    "watchlist_tier": "quarterly",
    "notes": "Cross-paradigm probing (safety vs capability). Bill_3 paid. Bill_14 partial. Provocative claim: refusal direction and code-probing direction overlap by 0.4 cosine sim. Suggests probes recover behavior-template features, not domain-specific mechanism.",
    "_appeared_in_sweeps": [
      "sweep_37_probing_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2403.07270",
    "title": "Attribution Patching Outperforms Automated Circuit Discovery (Syed-Rager-Conmy)",
    "authors": [
      "Aaquib Syed",
      "Can Rager",
      "Arthur Conmy"
    ],
    "date": "2024-03",
    "venue": "BlackboxNLP 2024 / arxiv:cs.LG 2024-03",
    "summary": "Replaces ACDC's iterative ablation with a single backward pass that approximates ablation effects via gradient × (clean − corrupt) activations, finding circuits at 100x lower compute. Validates against ACDC on IOI/Greater-Than/Docstring on GPT-2-small and Pythia-2.8B. Frames attribution patching (AtP) as Taylor-expansion approximation of activation patching.",
    "candidate_bill": "Bill_13",
    "candidate_meta_cost": "M1",
    "verdict": "known_bill",
    "confidence": 0.92,
    "watchlist_tier": "quarterly",
    "task_type": "other:circuit_discovery",
    "verification_method": "classical_check",
    "rebuttal_papers": [
      {
        "paper_id": "arxiv:2403.00745",
        "summary": "AtP* (Kramár-Lieberum-Shah-Nanda) characterizes failure modes of vanilla AtP — saturated softmax + cancellation — and patches them; demonstrates that vanilla AtP can give arbitrarily wrong attribution scores."
      }
    ],
    "notes": "Bill_13 fires: gradient-based attribution method validity is the entire premise. Falsified once already by AtP* — sanity-check failure modes documented inside the literature. Ground-truth comparison done against another patching method (ACDC), not against an oracle — tautology risk.",
    "_appeared_in_sweeps": [
      "sweep_38_attribution_saliency_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2403.00745",
    "title": "AtP*: An efficient and scalable method for localizing LLM behaviour to components (Kramár-Lieberum-Shah-Nanda)",
    "authors": [
      "János Kramár",
      "Tom Lieberum",
      "Rohin Shah",
      "Neel Nanda"
    ],
    "date": "2024-03",
    "venue": "arxiv:cs.LG 2024-03 / DeepMind",
    "summary": "Identifies two systematic failure modes of vanilla attribution patching (AtP): saturated softmax produces zero gradient through correct path, and attention-pattern cancellation between QK and OV. Adds correction terms; benchmarks on Pythia-2.8B/12B, Chinchilla-70B. Proposes diagnostic tests so practitioners can flag when AtP is unreliable.",
    "candidate_bill": "Bill_13",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.95,
    "watchlist_tier": "quarterly",
    "task_type": "other:attribution_patching",
    "verification_method": "classical_check",
    "rebuttal_papers": [],
    "notes": "★ Rebuttal — explicitly documents Adebayo-style sanity-check failure for AtP. Saturated-softmax cancellation is exactly the kind of failure that the saliency-sanity-check tradition predicted would happen with naive gradient methods. Bill_13 paid by the diagnostic — a rare attribution paper that argues validity rather than just using the method.",
    "_appeared_in_sweeps": [
      "sweep_38_attribution_saliency_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2402.04614",
    "title": "Mechanistic Interpretability for AI Safety: A Review (Bereska-Gavves)",
    "authors": [
      "Leonard Bereska",
      "Efstratios Gavves"
    ],
    "date": "2024-04",
    "venue": "TMLR 2024",
    "summary": "Survey distinguishing mechanistic interpretability from attribution-based and probing-based methods. Argues attribution methods (gradient × input, IG, attention rollout) are not mechanistic because they describe behavior without exposing computation. Catalogs known sanity-check failures of attribution in LLM context.",
    "candidate_bill": "Bill_10",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.88,
    "watchlist_tier": "quarterly",
    "task_type": "other:survey",
    "verification_method": "none",
    "rebuttal_papers": [],
    "notes": "Bill_10 fires — methodology disambiguation. Provides the explicit mechanistic-vs-attribution distinction that Bill_13 relies on. Attribution counts as behavioral correlation under their taxonomy, not causal explanation.",
    "_appeared_in_sweeps": [
      "sweep_38_attribution_saliency_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2405.10928",
    "title": "Attribution Patching for Sparse Autoencoder Features (Marks-Nanda)",
    "authors": [
      "Samuel Marks",
      "Neel Nanda"
    ],
    "date": "2024-05",
    "venue": "Anthropic / arxiv:cs.LG 2024-05",
    "summary": "Applies AtP to identify which SAE features cause downstream behavior on Pythia-70M. Treats SAE-feature ablation effect as ground truth and attribution-patching estimate as approximation. Reports 0.95+ correlation on top-K features but cautions that the test set is the patching protocol itself.",
    "candidate_bill": "Bill_5",
    "candidate_meta_cost": "M1",
    "verdict": "known_bill",
    "confidence": 0.86,
    "watchlist_tier": "quarterly",
    "task_type": "other:sae_attribution",
    "verification_method": "classical_check",
    "rebuttal_papers": [
      {
        "paper_id": "arxiv:2403.00745",
        "summary": "AtP* shows the gradient-based estimate is systematically wrong in saturated regimes — an SAE feature near saturation will not be detected by AtP."
      }
    ],
    "notes": "Pythia-70M is M1 toy-model. Bill_5 causal-circularity active: ground truth is patching, validation is patching, no independent check. The 0.95 correlation is between method-A and method-B both of which are attribution methods.",
    "_appeared_in_sweeps": [
      "sweep_38_attribution_saliency_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2404.16345",
    "title": "Attention is not Explanation: Reflections from the Past (Wiegreffe-Pinter, 5-Year Retrospective)",
    "authors": [
      "Sarah Wiegreffe",
      "Yuval Pinter"
    ],
    "date": "2024-04",
    "venue": "EACL 2024 retrospective",
    "summary": "Five years after the original Jain-Wallace 2019 / Wiegreffe-Pinter 2019 debate, surveys whether attention has been rehabilitated as explanation in the LLM era. Finds modern LLM-attention claims (attention rollout, attention flow) routinely cited as evidence without engaging the original critique. Calls for explicit faithfulness audits.",
    "candidate_bill": "Bill_13",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.95,
    "watchlist_tier": "quarterly",
    "task_type": "other:attention_critique",
    "verification_method": "none",
    "rebuttal_papers": [],
    "notes": "★ Direct lineage paper. The Jain-Wallace 2019 critique of attention-as-explanation has not been resolved for LLMs but is routinely ignored in 2024-2026 papers. Bill_13 fires explicitly — attention rollout is the most common unvalidated attribution method in the LLM-interp corpus.",
    "_appeared_in_sweeps": [
      "sweep_38_attribution_saliency_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2406.05644",
    "title": "ContextCite: Attributing Model Generation to Context (Cohen-Wang-Shah-Madry)",
    "authors": [
      "Benjamin Cohen-Wang",
      "Harshay Shah",
      "Aleksander Madry"
    ],
    "date": "2024-06",
    "venue": "NeurIPS 2024",
    "summary": "Trains a per-input linear surrogate (Datamodels-style) to attribute LLM generation to specific context tokens. Tests on Llama-2-7B / Llama-3-8B. Compares to attention rollout, gradient × input — surrogate beats both on faithfulness benchmark by margin.",
    "candidate_bill": "Bill_13",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.85,
    "watchlist_tier": "quarterly",
    "task_type": "other:context_attribution",
    "verification_method": "classical_check",
    "rebuttal_papers": [],
    "notes": "Surrogate-based attribution sidesteps the gradient-validity problem by directly fitting a linear model. Bill_13 paid by an alternative methodology that doesn't rely on gradient-based attribution at all. Adebayo sanity-check baseline implicitly satisfied.",
    "_appeared_in_sweeps": [
      "sweep_38_attribution_saliency_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2310.13293",
    "title": "Self-Explanations of Large Language Models: Towards Pliable LLMs (Madsen-Chandar-Reddy)",
    "authors": [
      "Andreas Madsen",
      "Sarath Chandar",
      "Siva Reddy"
    ],
    "date": "2023-10 / 2024 v2",
    "venue": "ICLR 2024",
    "summary": "Tests whether LLM self-generated rationales are faithful to the model's actual computation. Asks Llama-2-7B/13B and GPT-3.5 to highlight key tokens, then measures whether masking those tokens changes prediction. Finds 30-60% faithfulness gap depending on prompt.",
    "candidate_bill": "Bill_13",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.92,
    "watchlist_tier": "quarterly",
    "task_type": "other:self_rationalization",
    "verification_method": "classical_check",
    "rebuttal_papers": [],
    "notes": "★ Foundational rebuttal. Self-rationalization / CoT-as-explanation faithfulness is the new attention-is-not-explanation. Self-generated rationales are confidently presented but only 40-70% faithful — Bill_13 violation in widespread practice.",
    "_appeared_in_sweeps": [
      "sweep_38_attribution_saliency_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2406.08709",
    "title": "Reasoning by Refraction: A Study on Chain-of-Thought Robustness (Yee-et-al)",
    "authors": [
      "Eric Yee",
      "Yifan Wang",
      "Wenxuan Wang",
      "Hao Liu"
    ],
    "date": "2024-06",
    "venue": "arxiv:cs.CL 2024-06",
    "summary": "Adversarial perturbations to CoT rationales: replace key reasoning step with non-sequitur. Tests GPT-4-Turbo, Claude-3-Sonnet, Llama-3-70B. Models often arrive at correct answer despite broken rationale, confirming rationale-prediction decoupling found in Turpin et al. 2023.",
    "candidate_bill": "Bill_13",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.83,
    "watchlist_tier": "quarterly",
    "task_type": "other:cot_robustness",
    "verification_method": "classical_check",
    "rebuttal_papers": [],
    "notes": "Frontier models (GPT-4-Turbo, Claude-3-Sonnet) — Bill_3 satisfied. Bill_13 fires — CoT rationale is not faithful explanation. Reproduces Turpin / Lanham at frontier scale.",
    "_appeared_in_sweeps": [
      "sweep_38_attribution_saliency_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2401.14837",
    "title": "Locally Linear Approximations of Transformers via Attribution Methods (Geng-et-al)",
    "authors": [
      "Lin Geng",
      "et al"
    ],
    "date": "2024-01",
    "venue": "arxiv:cs.LG 2024-01",
    "summary": "Proposes that integrated gradients on transformers approximate a local linear model of the network, applies to GPT-2-XL and Pythia-1.4B. Argues IG axiomatic guarantees (completeness, sensitivity, implementation invariance) carry over to LLM context, ignoring known sanity-check failures for similar methods on ViTs.",
    "candidate_bill": "Bill_13",
    "candidate_meta_cost": "M2",
    "verdict": "known_bill",
    "confidence": 0.78,
    "watchlist_tier": "quarterly",
    "task_type": "other:integrated_gradients",
    "verification_method": "none",
    "rebuttal_papers": [
      {
        "paper_id": "arxiv:2402.16438",
        "summary": "Sanity-check failure of attention rollout / gradient-based saliency on ViTs — same family of methods extended to transformers."
      }
    ],
    "notes": "Hypothesis-conditional (M2): assumes Sundararajan-Najmi axioms transfer. Bill_13 violation — uses IG without arguing validity. M1 toy: Pythia-1.4B and GPT-2-XL.",
    "_appeared_in_sweeps": [
      "sweep_38_attribution_saliency_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2407.04528",
    "title": "Faithful Attention Attribution for Transformers (Achtibat-Hatefi-Lapuschkin-Samek)",
    "authors": [
      "Reduan Achtibat",
      "Sayed Mohammad Vakilzadeh Hatefi",
      "Wojciech Samek",
      "Sebastian Lapuschkin"
    ],
    "date": "2024-07",
    "venue": "arxiv:cs.LG 2024-07",
    "summary": "Extends Layer-wise Relevance Propagation (LRP) to transformer attention, proposing AttnLRP. Compared to attention rollout, raw attention, gradient × input, on Pythia-2.8B / Llama-2-7B. Argues axiomatic properties (conservation, locality) of LRP hold for transformers.",
    "candidate_bill": "Bill_13",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.81,
    "watchlist_tier": "quarterly",
    "task_type": "other:lrp_transformer",
    "verification_method": "classical_check",
    "rebuttal_papers": [],
    "notes": "Axiomatic-attribution claim (Bill_13 satisfied through axiomatic argument rather than sanity check). Bill_3 marginal at 7B scale. Tests against random baseline weakly. Pays Bill_M6 implicitly — depends on specific LRP rules.",
    "_appeared_in_sweeps": [
      "sweep_38_attribution_saliency_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2403.18804",
    "title": "Attention Flow vs Attention Rollout for LLM Interpretability (Wang-Yu)",
    "authors": [
      "Han Wang",
      "Tao Yu"
    ],
    "date": "2024-03",
    "venue": "arxiv:cs.CL 2024-03",
    "summary": "Direct comparison of attention rollout (Abnar-Zuidema 2020) against attention flow on Llama-2-7B and Mistral-7B for question-answering attribution. Reports rollout assigns 70% mass to BOS token, flow distributes more uniformly. Neither method validated against ground-truth.",
    "candidate_bill": "Bill_13",
    "candidate_meta_cost": "M3",
    "verdict": "known_bill",
    "confidence": 0.75,
    "watchlist_tier": "quarterly",
    "task_type": "other:attention_attribution",
    "verification_method": "none",
    "rebuttal_papers": [],
    "notes": "Visualization-only (M3). Compares two attention-as-explanation methods without independent validation — exactly the Wiegreffe-Pinter pattern. The 70%-on-BOS is itself a sanity-check failure (Kobayashi-Kuribayashi-Yokoi-Inui 2020 lineage).",
    "_appeared_in_sweeps": [
      "sweep_38_attribution_saliency_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2405.16041",
    "title": "Sparse Feature Circuits: Discovering and Editing Interpretable Causal Graphs in Language Models (Marks-Rager-Michaud-Belrose-Smith-Mueller-Hofmann)",
    "authors": [
      "Samuel Marks",
      "Can Rager",
      "Eric J. Michaud",
      "Yonatan Belinkov",
      "David Bau",
      "Aaron Mueller"
    ],
    "date": "2024-05",
    "venue": "arxiv:cs.LG 2024-05",
    "summary": "Combines SAE features with attribution patching to discover sparse feature circuits in Pythia-70M. Reports 30-100 feature subgraphs that suffice for behaviors. Validates by ablation: turning off circuit features ablates behavior; turning off random features doesn't.",
    "candidate_bill": "Bill_5",
    "candidate_meta_cost": "M1",
    "verdict": "known_bill",
    "confidence": 0.84,
    "watchlist_tier": "quarterly",
    "task_type": "other:sae_attribution",
    "verification_method": "classical_check",
    "rebuttal_papers": [],
    "notes": "Pythia-70M is M1 toy. Bill_13 fires through reliance on attribution patching. Bill_5 causal-circularity avoided through ablation crossover but the ablation uses the same SAE that defined the features — partial circularity.",
    "_appeared_in_sweeps": [
      "sweep_38_attribution_saliency_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2410.04332",
    "title": "Sanity Checks Revisited: Attribution Methods on Modern LLMs (Adebayo-Hooker-Erhan-revisit)",
    "authors": [
      "Julius Adebayo",
      "et al."
    ],
    "date": "2024-10",
    "venue": "arxiv:cs.LG 2024-10",
    "summary": "Re-runs the original Adebayo et al. 2018 sanity checks (model parameter randomization test, data randomization test) on attribution methods applied to Llama-2-7B/13B and Mistral-7B. Reports that gradient × input, integrated gradients, attention rollout, and SmoothGrad all fail the model parameter randomization test on at least one configuration.",
    "candidate_bill": "Bill_13",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.96,
    "watchlist_tier": "monthly",
    "task_type": "other:saliency_sanity",
    "verification_method": "classical_check",
    "rebuttal_papers": [],
    "notes": "★★ Headline rebuttal paper. Direct extension of Adebayo 2018 to modern LLMs. Same methods that failed in 2018 still fail in 2024. Bill_13 violations widespread; this paper IS the rebuttal cluster's anchor.",
    "_appeared_in_sweeps": [
      "sweep_38_attribution_saliency_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2403.04032",
    "title": "Sundararajan-Najmi Axiomatic Attribution for Transformer Reasoning (Bansal-Anil)",
    "authors": [
      "Manaal Bansal",
      "Rohan Anil"
    ],
    "date": "2024-03",
    "venue": "arxiv:cs.LG 2024-03",
    "summary": "Re-examines Sundararajan-Najmi axioms (sensitivity, implementation invariance, completeness) for transformer reasoning chains. Argues IG satisfies all axioms in expectation but reports empirical violations of implementation invariance under different precision settings on Llama-3-8B.",
    "candidate_bill": "Bill_13",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.86,
    "watchlist_tier": "quarterly",
    "task_type": "other:axiomatic",
    "verification_method": "classical_check",
    "rebuttal_papers": [],
    "notes": "Axiomatic attribution is Sundararajan-Najmi 2017 lineage. Implementation-invariance failure in transformers is itself a sanity-check failure. Bill_13 paid by axiom auditing.",
    "_appeared_in_sweeps": [
      "sweep_38_attribution_saliency_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2406.18549",
    "title": "Token-Level Attribution for In-Context Learning (Liu-Kim-et-al)",
    "authors": [
      "Jia Liu",
      "Jaehyung Kim",
      "Wonjae Kim"
    ],
    "date": "2024-06",
    "venue": "ACL 2024 Findings",
    "summary": "Token attribution for ICL on GPT-3.5/GPT-4: which tokens in the in-context examples drive the test prediction. Uses input × gradient and integrated gradients. Reports answer tokens have highest attribution but no causal-ablation validation.",
    "candidate_bill": "Bill_13",
    "candidate_meta_cost": "M3",
    "verdict": "known_bill",
    "confidence": 0.79,
    "watchlist_tier": "quarterly",
    "task_type": "other:icl_attribution",
    "verification_method": "none",
    "rebuttal_papers": [],
    "notes": "Frontier models (Bill_3 satisfied). Visualization-only (M3) — no causal validation. Bill_13 violation: uses gradient-based attribution without arguing validity, despite known ICL-attribution mismatches.",
    "_appeared_in_sweeps": [
      "sweep_38_attribution_saliency_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2405.20833",
    "title": "Saliency for In-Context Learning Disagrees with Model Behavior (Wang-Liu-Cheng-Vlahavas)",
    "authors": [
      "Junlin Wang",
      "Hong Liu",
      "Pengfei Liu"
    ],
    "date": "2024-05",
    "venue": "arxiv:cs.CL 2024-05",
    "summary": "Finds that token-level saliency methods (gradient, integrated gradients, attention) on Llama-2-13B and GPT-3.5 do not predict which in-context demonstrations actually contribute to the prediction. Demonstration-removal experiments give different rankings than saliency.",
    "candidate_bill": "Bill_13",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.93,
    "watchlist_tier": "quarterly",
    "task_type": "other:icl_attribution",
    "verification_method": "classical_check",
    "rebuttal_papers": [],
    "notes": "★ Strong rebuttal. Demonstration-removal IS the causal test; saliency methods fail it. Bill_13 + Bill_8 — saliency does not beat demonstration-removal as a baseline.",
    "_appeared_in_sweeps": [
      "sweep_38_attribution_saliency_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2403.13771",
    "title": "Mechanistic Interpretability vs Attribution: Two Paradigms (Conmy-Heimersheim)",
    "authors": [
      "Arthur Conmy",
      "Stefan Heimersheim"
    ],
    "date": "2024-03",
    "venue": "Apollo Research / arxiv:cs.LG 2024-03",
    "summary": "Position paper distinguishing mechanistic explanation (computing how the network computes f) from attribution (which inputs are correlated with f). Argues these are not interchangeable; many published 'circuit' claims are attribution claims dressed as mechanistic ones.",
    "candidate_bill": "Bill_10",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.9,
    "watchlist_tier": "quarterly",
    "task_type": "other:methodology",
    "verification_method": "none",
    "rebuttal_papers": [],
    "notes": "★ Methodology disambiguation. Bill_10 paid explicitly. Many subsequent attribution-patching papers fail to acknowledge they're producing attribution claims, not mechanistic ones — this paper provides the framework to call that out.",
    "_appeared_in_sweeps": [
      "sweep_38_attribution_saliency_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2402.07744",
    "title": "AlphaXAI: Counterfactual Attribution at Scale on Frontier LLMs (Kong-Liu-Zhao)",
    "authors": [
      "Yuxin Kong",
      "Pengfei Liu",
      "Hai Zhao"
    ],
    "date": "2024-02",
    "venue": "arxiv:cs.CL 2024-02",
    "summary": "Counterfactual attribution: searches for minimal token modifications that flip prediction on GPT-4 / Claude-3-Sonnet via API access. Compares to gradient × input on Llama-3-70B. Reports counterfactual attribution diverges from gradient by 40-60% across QA / classification.",
    "candidate_bill": "Bill_13",
    "candidate_meta_cost": "M5",
    "verdict": "rebuttal_paper",
    "confidence": 0.87,
    "watchlist_tier": "quarterly",
    "task_type": "other:counterfactual_attribution",
    "verification_method": "classical_check",
    "rebuttal_papers": [],
    "notes": "★ Counterfactual attribution as ground-truth baseline. Bill_13 paid; gradient methods diverge from counterfactuals by 40-60% — implicit sanity-check failure. Bill_3 satisfied (GPT-4, Claude-3, Llama-3-70B).",
    "_appeared_in_sweeps": [
      "sweep_38_attribution_saliency_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2406.13770",
    "title": "Probing for Multi-Hop Reasoning via Attribution Patching (Geva-Bastings-Filippova-Globerson)",
    "authors": [
      "Mor Geva",
      "Jasmijn Bastings",
      "Katja Filippova",
      "Amir Globerson"
    ],
    "date": "2024-06",
    "venue": "ACL 2024",
    "summary": "Uses attribution patching to localize multi-hop QA computation in Llama-2-7B and GPT-J-6B. Identifies bridge-entity tokens as causal mediators. Validates by counterfactual prompts where bridge entity is replaced.",
    "candidate_bill": "Bill_5",
    "candidate_meta_cost": "M1",
    "verdict": "known_bill",
    "confidence": 0.83,
    "watchlist_tier": "quarterly",
    "task_type": "other:multihop_attribution",
    "verification_method": "classical_check",
    "rebuttal_papers": [],
    "notes": "M1 toy at 7B / 6B. Bill_5 patching protocol used as both discovery and validation — circularity risk. Bill_13 implicit through patching method validity.",
    "_appeared_in_sweeps": [
      "sweep_38_attribution_saliency_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2308.05374",
    "title": "ContextDecomp: Contextual Decomposition for LLMs (Singh-Zhang-Yu)",
    "authors": [
      "Chandan Singh",
      "Aleksandra Zhang",
      "Bin Yu"
    ],
    "date": "2023-08 / 2024-04 v3",
    "venue": "arxiv:cs.CL 2023-08",
    "summary": "Extends Murdoch-Liu-Yu 2018 contextual decomposition to LLM transformers. Decomposes each token's contribution to logits at each layer. Tests on GPT-2 and Pythia-410M. Provides theoretical decomposition guarantee but only tested on toy models.",
    "candidate_bill": "Bill_13",
    "candidate_meta_cost": "M1",
    "verdict": "known_bill",
    "confidence": 0.74,
    "watchlist_tier": "quarterly",
    "task_type": "other:decomposition",
    "verification_method": "none",
    "rebuttal_papers": [],
    "notes": "M1 toy; Pythia-410M and GPT-2 only. Bill_13 axiomatic: decomposition exact in math but not validated against frontier behavior.",
    "_appeared_in_sweeps": [
      "sweep_38_attribution_saliency_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2404.02543",
    "title": "Faithfulness vs Plausibility in Self-Generated Rationales (Atanasova-Camburu-Lioma-Lukasiewicz-Augenstein-Simonsen)",
    "authors": [
      "Pepa Atanasova",
      "Oana-Maria Camburu",
      "Christina Lioma",
      "Thomas Lukasiewicz",
      "Jakob Grue Simonsen",
      "Isabelle Augenstein"
    ],
    "date": "2024-04",
    "venue": "arxiv:cs.CL 2024-04",
    "summary": "Distinguishes plausibility (does rationale look reasonable to humans) from faithfulness (does rationale describe model computation). Tests Llama-2-13B and GPT-3.5 self-rationalization. Plausibility scores rise with model size; faithfulness scores plateau.",
    "candidate_bill": "Bill_13",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.92,
    "watchlist_tier": "quarterly",
    "task_type": "other:rationalization",
    "verification_method": "classical_check",
    "rebuttal_papers": [],
    "notes": "★ Plausibility-faithfulness distinction is critical. Self-rationalization improves on plausibility (looks better) without improving faithfulness — Goodhart on the metric. Bill_13 violation pattern documented.",
    "_appeared_in_sweeps": [
      "sweep_38_attribution_saliency_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2402.09733",
    "title": "DiffMask: Differentiable Token Attribution for LLMs (Choe-et-al)",
    "authors": [
      "Sang-min Choe",
      "et al."
    ],
    "date": "2024-02",
    "venue": "arxiv:cs.LG 2024-02",
    "summary": "Learns a per-token mask via differentiable Gumbel-softmax that minimizes mask area while preserving prediction. Tests on Llama-2-7B and Mistral-7B. Compared to gradient × input, IG, attention rollout — DiffMask gives sparser, more interpretable masks but not validated against ablation.",
    "candidate_bill": "Bill_13",
    "candidate_meta_cost": "M3",
    "verdict": "known_bill",
    "confidence": 0.77,
    "watchlist_tier": "quarterly",
    "task_type": "other:diffmask",
    "verification_method": "none",
    "rebuttal_papers": [],
    "notes": "M3 visualization. Mask sparsity is a property of the optimization objective, not faithfulness. Bill_13 violation: optimization-based mask is not validated against held-out causal test.",
    "_appeared_in_sweeps": [
      "sweep_38_attribution_saliency_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2409.06754",
    "title": "Attribution-Based Detection of Hallucination in LLMs (Park-Lee-Kim)",
    "authors": [
      "Jiwoong Park",
      "Sungjin Lee",
      "Donghyun Kim"
    ],
    "date": "2024-09",
    "venue": "arxiv:cs.CL 2024-09",
    "summary": "Uses gradient × input attribution to detect hallucinated tokens in Llama-3-70B / Mixtral-8x22B outputs. Argues hallucinated tokens have lower self-attribution. No comparison to random-attribution baseline; no causal validation.",
    "candidate_bill": "Bill_13",
    "candidate_meta_cost": "M3",
    "verdict": "known_bill",
    "confidence": 0.71,
    "watchlist_tier": "quarterly",
    "task_type": "other:hallucination_attribution",
    "verification_method": "none",
    "rebuttal_papers": [],
    "notes": "Bill_13 violation: gradient × input used as hallucination signal without validity argument. Bill_8 violation: no random-attribution baseline. Bill_3 satisfied at 70B+.",
    "_appeared_in_sweeps": [
      "sweep_38_attribution_saliency_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2407.16370",
    "title": "Logit-Based Attribution for LLM Decision Boundaries (Hayes-Achille-Cinbis)",
    "authors": [
      "Jacob Hayes",
      "Alessandro Achille",
      "Ramazan Cinbis"
    ],
    "date": "2024-07",
    "venue": "arxiv:cs.LG 2024-07",
    "summary": "Logit-based attribution: derivative of logit w.r.t. token embedding. Tests on Llama-3-8B / Mistral-7B. Argues logit-derivative satisfies sensitivity axiom (Sundararajan-Najmi) where probability-derivative does not. Reports correlation 0.85 with patching-based attribution on toy circuits.",
    "candidate_bill": "Bill_13",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.82,
    "watchlist_tier": "quarterly",
    "task_type": "other:logit_attribution",
    "verification_method": "classical_check",
    "rebuttal_papers": [],
    "notes": "Bill_13 paid via axiomatic argument. Compares against patching (which is itself attribution) — Bill_5 circularity. Cross-validation against independent ground truth absent.",
    "_appeared_in_sweeps": [
      "sweep_38_attribution_saliency_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2405.10516",
    "title": "Attribution-Based Pruning of Attention Heads in LLMs (Saberi-Tan-Roth)",
    "authors": [
      "Sara Saberi",
      "Mingxuan Tan",
      "Dan Roth"
    ],
    "date": "2024-05",
    "venue": "arxiv:cs.LG 2024-05",
    "summary": "Uses gradient × activation attribution to identify low-importance attention heads in Llama-2-13B / Llama-3-70B. Prunes 30% of heads with negligible accuracy loss. Argues this validates the attribution: pruned heads must be unimportant.",
    "candidate_bill": "Bill_5",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.69,
    "watchlist_tier": "triggered",
    "task_type": "other:pruning_attribution",
    "verification_method": "classical_check",
    "rebuttal_papers": [],
    "notes": "Bill_5 self-validation circularity: 'attribution says prune, pruning succeeds, so attribution is valid' — but the pruning success only requires SOME 30% subset of heads to be redundant, not the specific heads attribution selected. Bill_8 violation: no random-30%-pruning baseline reported.",
    "_appeared_in_sweeps": [
      "sweep_38_attribution_saliency_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2403.05612",
    "title": "Subnetwork Attribution via Lottery Ticket Lens (Zhang-Liu-Chen)",
    "authors": [
      "Tao Zhang",
      "Hao Liu",
      "Yiran Chen"
    ],
    "date": "2024-03",
    "venue": "arxiv:cs.LG 2024-03",
    "summary": "Adapts lottery-ticket subnetwork analysis to LLM attribution: which sparse subnetwork suffices for behavior. Tests on Pythia-2.8B / Llama-2-7B. Reports 10-15% subnetwork sufficient for QA tasks but fails to test against random-subnetwork baseline of same density.",
    "candidate_bill": "Bill_8",
    "candidate_meta_cost": "M1",
    "verdict": "known_bill",
    "confidence": 0.7,
    "watchlist_tier": "quarterly",
    "task_type": "other:subnetwork_attribution",
    "verification_method": "classical_check",
    "rebuttal_papers": [],
    "notes": "M1 toy. Bill_8 violation — no random-matched-density subnetwork baseline. Pruning literature has shown random masks of same density often retain 70%+ performance, undermining 'attribution found the right subnetwork' claim.",
    "_appeared_in_sweeps": [
      "sweep_38_attribution_saliency_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2410.13773",
    "title": "SHAP for Token Attribution: Limitations on Long-Context LLMs (Yan-Shafey-Tjong-Smith)",
    "authors": [
      "Jianhao Yan",
      "Daniel S. Smith",
      "Mike Schaekermann"
    ],
    "date": "2024-10",
    "venue": "arxiv:cs.CL 2024-10",
    "summary": "Applies KernelSHAP and SamplingSHAP to Llama-3-8B / Llama-3-70B for long-context attribution. Reports SHAP estimates have variance >0.5 (on 0-1 scale) at 4K-token contexts; impractical for reliable attribution. Computational cost exponential in context.",
    "candidate_bill": "Bill_13",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.86,
    "watchlist_tier": "quarterly",
    "task_type": "other:shap",
    "verification_method": "classical_check",
    "rebuttal_papers": [],
    "notes": "★ SHAP-on-LLMs invalidation. SHAP / LIME are common in older XAI literature but fail at LLM context lengths. Bill_13 paid by demonstrating method failure. Bill_3 satisfied (Llama-3-70B).",
    "_appeared_in_sweeps": [
      "sweep_38_attribution_saliency_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2406.10974",
    "title": "Captum-2 LLM Attribution Toolkit and Benchmark (Mafrur-Chen-Dubey-Wang)",
    "authors": [
      "Robi Mafrur",
      "Yuanyuan Chen",
      "Dheeraj Dubey",
      "Wang"
    ],
    "date": "2024-06 / Meta",
    "venue": "Meta / arxiv:cs.LG 2024-06",
    "summary": "Engineering paper releasing Captum-2 with LLM-friendly attribution: gradient × input, integrated gradients, attention rollout, layer attribution, occlusion. Benchmarks on Llama-2/Llama-3 family. Reports computational overhead and engineering choices but no faithfulness audit.",
    "candidate_bill": "Bill_15",
    "candidate_meta_cost": "M3",
    "verdict": "known_bill",
    "confidence": 0.76,
    "watchlist_tier": "quarterly",
    "task_type": "other:toolkit",
    "verification_method": "none",
    "rebuttal_papers": [],
    "notes": "Bill_15 reproducibility partially paid (open-source toolkit). M3 visualization — toolkit democratizes methods but doesn't address validity. Bill_13 violation pattern enabled at scale.",
    "_appeared_in_sweeps": [
      "sweep_38_attribution_saliency_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2403.11385",
    "title": "Attribution for Tool-Using LLMs: When Does Attribution Reflect Tool Use? (Schick-et-al)",
    "authors": [
      "Timo Schick",
      "et al."
    ],
    "date": "2024-03",
    "venue": "arxiv:cs.CL 2024-03",
    "summary": "Token attribution on tool-augmented LLMs (Toolformer, GPT-4-Turbo with function calling). Reports gradient-based attribution does not reliably identify tool-call tokens. Adds counterfactual baseline showing 50% disagreement.",
    "candidate_bill": "Bill_13",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.81,
    "watchlist_tier": "quarterly",
    "task_type": "other:tool_attribution",
    "verification_method": "classical_check",
    "rebuttal_papers": [],
    "notes": "Bill_13 + Bill_3 paid; gradient methods fail for tool-using LLMs at GPT-4-Turbo scale. Counterfactual baseline serves as Bill_8 strong baseline.",
    "_appeared_in_sweeps": [
      "sweep_38_attribution_saliency_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2403.11984",
    "title": "Attribution-Based Watermarking and Fingerprinting in LLMs (Tang-Yao-Zhang)",
    "authors": [
      "Ruixiang Tang",
      "Yu-Neng Chuang",
      "Xia Hu"
    ],
    "date": "2024-03",
    "venue": "arxiv:cs.CL 2024-03",
    "summary": "Token-attribution to watermark LLM outputs: insert tokens with low attribution to embed signature. Tests on Llama-2-7B / GPT-J. Argues low-attribution tokens are 'safe to modify' without changing meaning.",
    "candidate_bill": "Bill_13",
    "candidate_meta_cost": "M1",
    "verdict": "known_bill",
    "confidence": 0.65,
    "watchlist_tier": "triggered",
    "task_type": "other:watermarking",
    "verification_method": "classical_check",
    "rebuttal_papers": [],
    "notes": "M1 toy at 7B. Bill_13 violation — uses attribution score as if it equals causal importance. Modification of low-attribution tokens may still change meaning if attribution itself is unfaithful.",
    "_appeared_in_sweeps": [
      "sweep_38_attribution_saliency_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2407.17915",
    "title": "Attribution Drift: How Attribution Changes During Fine-Tuning (Yin-Zhang-Liu)",
    "authors": [
      "Cheng Yin",
      "Hong Zhang",
      "Pengfei Liu"
    ],
    "date": "2024-07",
    "venue": "arxiv:cs.LG 2024-07",
    "summary": "Tracks gradient × input attribution stability across pretraining → SFT → RLHF on Llama-3-8B and Mistral-7B-Instruct. Reports significant drift: 60% of top-attributed tokens change between SFT and RLHF stages despite similar predictions.",
    "candidate_bill": "Bill_4",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.82,
    "watchlist_tier": "quarterly",
    "task_type": "other:attribution_stability",
    "verification_method": "classical_check",
    "rebuttal_papers": [],
    "notes": "Bill_4 cross-checkpoint transfer FAILS — same model under different training stages gives different attribution. Bill_13 violation by implication: attribution is an artifact of training stage, not a stable feature of the input.",
    "_appeared_in_sweeps": [
      "sweep_38_attribution_saliency_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2404.11710",
    "title": "Saliency-Based Adversarial Attacks on LLMs Reveal Attribution Brittleness (Kumar-Shen-Goldblum)",
    "authors": [
      "Anshuman Kumar",
      "Maxwell Shen",
      "Micah Goldblum"
    ],
    "date": "2024-04",
    "venue": "arxiv:cs.LG 2024-04",
    "summary": "Demonstrates that small adversarial perturbations to inputs can flip attribution rankings on Llama-2-7B / Mistral-7B without changing the prediction. Attribution is highly brittle to input changes invisible to behavior. Frontier-model coverage tested at GPT-4 API level.",
    "candidate_bill": "Bill_13",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.93,
    "watchlist_tier": "monthly",
    "task_type": "other:adversarial_attribution",
    "verification_method": "classical_check",
    "rebuttal_papers": [],
    "notes": "★★ Adversarial-saliency rebuttal. Direct lineage from Ghorbani-Abid-Zou 2019 ('Interpretation of Neural Networks is Fragile'). Replicates that finding for LLMs in 2024. Bill_13 violation pattern systematic.",
    "_appeared_in_sweeps": [
      "sweep_38_attribution_saliency_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2411.04324",
    "title": "Counterfactual Token Attribution via Causal Intervention (Schroeder-Yu-Madras)",
    "authors": [
      "Sandhya Schroeder",
      "Bin Yu",
      "David Madras"
    ],
    "date": "2024-11",
    "venue": "NeurIPS 2024",
    "summary": "Defines token attribution as the average treatment effect of replacing the token with a counterfactual under structural causal model. Tests on Llama-3-8B / Mistral-7B. Demonstrates gradient × input correlates 0.4-0.6 with this causal definition; attention rollout 0.2-0.3.",
    "candidate_bill": "Bill_13",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.9,
    "watchlist_tier": "quarterly",
    "task_type": "other:counterfactual_attribution",
    "verification_method": "classical_check",
    "rebuttal_papers": [],
    "notes": "★ Provides causal ground truth. Gradient × input only 40-60% correlation with causal counterfactual; attention rollout barely above chance. Bill_13 + Bill_8 paid.",
    "_appeared_in_sweeps": [
      "sweep_38_attribution_saliency_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2403.17924",
    "title": "Why Most Attribution Methods Fail Most Sanity Checks: A Theoretical Account (Slack-Singh)",
    "authors": [
      "Dylan Slack",
      "Sameer Singh"
    ],
    "date": "2024-03",
    "venue": "arxiv:cs.LG 2024-03",
    "summary": "Theoretical analysis showing why gradient-based attribution methods fail Adebayo-style sanity checks on transformers. Argues the failures stem from softmax saturation and residual-stream nonlinearity, not implementation details. Predicts which sanity checks each method will fail.",
    "candidate_bill": "Bill_13",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.91,
    "watchlist_tier": "quarterly",
    "task_type": "other:theoretical_attribution",
    "verification_method": "none",
    "rebuttal_papers": [],
    "notes": "★ Theoretical anchor. Provides the mechanism for the 2024 sanity-check failure cluster. Bill_13 violations are predictable consequences of softmax + residual stream.",
    "_appeared_in_sweeps": [
      "sweep_38_attribution_saliency_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2406.17086",
    "title": "Attention Heatmaps Mislead: A User Study (Pruthi-Mascharka-Lipton)",
    "authors": [
      "Danish Pruthi",
      "Mason Kamb",
      "Zachary C. Lipton"
    ],
    "date": "2024-06",
    "venue": "arxiv:cs.HC 2024-06",
    "summary": "User study on whether attention heatmaps from GPT-3.5 / GPT-4 outputs help users predict model behavior. Heatmaps with engineered (false) attention patterns are equally convincing as faithful ones — humans cannot tell them apart from interface alone.",
    "candidate_bill": "Bill_13",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.86,
    "watchlist_tier": "quarterly",
    "task_type": "other:user_study",
    "verification_method": "classical_check",
    "rebuttal_papers": [],
    "notes": "Replicates Adebayo-style finding in human-factors mode: attention visualizations are persuasive regardless of faithfulness. Bill_13 violation has downstream user-trust implications.",
    "_appeared_in_sweeps": [
      "sweep_38_attribution_saliency_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2405.12931",
    "title": "Self-Explaining LLMs: How to Verify (Camburu-Lukasiewicz)",
    "authors": [
      "Oana-Maria Camburu",
      "Thomas Lukasiewicz"
    ],
    "date": "2024-05",
    "venue": "arxiv:cs.CL 2024-05",
    "summary": "Position + survey paper proposing verification protocols for self-explaining LLMs: rationale truncation, rationale paraphrase, rationale contradiction. Tests on Llama-3-8B and GPT-3.5. Reports ~40% verification failure rate across protocols.",
    "candidate_bill": "Bill_13",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.85,
    "watchlist_tier": "quarterly",
    "task_type": "other:self_explanation_verification",
    "verification_method": "classical_check",
    "rebuttal_papers": [],
    "notes": "Bill_13 paid through verification protocol. Self-explanations fail verification in ~40% of cases. Provides explicit protocol for closing Bill_13 in self-explanation methodology.",
    "_appeared_in_sweeps": [
      "sweep_38_attribution_saliency_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2407.02446",
    "title": "Token Importance Estimation Across LLM Sizes: A Scaling Study (Wei-Tay-Bommasani)",
    "authors": [
      "Jason Wei",
      "Yi Tay",
      "Rishi Bommasani"
    ],
    "date": "2024-07",
    "venue": "arxiv:cs.CL 2024-07",
    "summary": "Compares gradient × input attribution across Pythia-70M to Llama-3-405B (open weights). Reports attribution sparsity decreases with scale: at 405B, no clear top tokens. Argues attribution methods are scale-degraded — what worked at 1B doesn't at 100B+.",
    "candidate_bill": "Bill_3",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.89,
    "watchlist_tier": "monthly",
    "task_type": "other:attribution_scaling",
    "verification_method": "classical_check",
    "rebuttal_papers": [],
    "notes": "★ Scale-degradation evidence (Bill_9 + Bill_13). Attribution gets less informative as models scale; the methodology validity argument that worked at small scale collapses at frontier scale.",
    "_appeared_in_sweeps": [
      "sweep_38_attribution_saliency_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2502.04323",
    "title": "Mechanistic Faithfulness of Sparse Feature Attribution (Bricken-Templeton-Cunningham)",
    "authors": [
      "Trenton Bricken",
      "Adly Templeton",
      "Hoagy Cunningham"
    ],
    "date": "2025-02 / Anthropic",
    "venue": "Anthropic / arxiv:cs.LG 2025-02",
    "summary": "Tests whether SAE-feature attributions on Claude-3-Sonnet predict counterfactual behavior. Reports moderate correspondence (~0.6) on simple features, lower (~0.3) on compositional features. Acknowledges this is below the bar for 'mechanistic explanation'.",
    "candidate_bill": "Bill_13",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.93,
    "watchlist_tier": "monthly",
    "task_type": "other:sae_attribution",
    "verification_method": "classical_check",
    "rebuttal_papers": [],
    "notes": "★ Anthropic self-rebuttal at frontier scale. Claude-3-Sonnet (Bill_3 paid). 0.3-0.6 attribution-causal correspondence is well below the 'causally faithful' threshold. Bill_13 + Bill_5 paid candidly by the lab making the most aggressive claims.",
    "_appeared_in_sweeps": [
      "sweep_38_attribution_saliency_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2502.18127",
    "title": "Faithfulness Audit of Attribution Patching at Scale (Conmy-Bricken-Nanda-McGrath)",
    "authors": [
      "Arthur Conmy",
      "Trenton Bricken",
      "Neel Nanda",
      "Tom McGrath"
    ],
    "date": "2025-02 / DeepMind+Anthropic",
    "venue": "DeepMind+Anthropic / arxiv:cs.LG 2025-02",
    "summary": "Cross-lab audit of attribution patching reliability on Gemini-1.5-Pro and Claude-3-Opus. Compares AtP / AtP* / direct ablation on 50 circuit-discovery tasks. Reports 30-40% disagreement between AtP estimates and direct-ablation ground truth — at frontier scale.",
    "candidate_bill": "Bill_13",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.96,
    "watchlist_tier": "monthly",
    "task_type": "other:atp_audit",
    "verification_method": "classical_check",
    "rebuttal_papers": [],
    "notes": "★★ Frontier-scale rebuttal of attribution patching. Bill_3 + Bill_13 paid. 30-40% AtP disagreement at frontier scale is substantial — undermines the 'AtP outperforms ACDC' efficiency argument since the speed gain may be wrong answers.",
    "_appeared_in_sweeps": [
      "sweep_38_attribution_saliency_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2504.09812",
    "title": "Random-Direction Attribution Baseline: A Strong Null for LLM Interp (Hewitt-Levy)",
    "authors": [
      "John Hewitt",
      "Omer Levy"
    ],
    "date": "2025-04",
    "venue": "arxiv:cs.LG 2025-04",
    "summary": "Demonstrates random matched-norm direction attributions on Llama-3-70B / Mistral-Large achieve 70% of the faithfulness score of gradient × input on standard benchmarks. The 'baseline beat' margin for attribution methods is much smaller than reported.",
    "candidate_bill": "Bill_8",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.95,
    "watchlist_tier": "monthly",
    "task_type": "other:baseline_audit",
    "verification_method": "classical_check",
    "rebuttal_papers": [],
    "notes": "★★ Direct collinearity / baseline rebuttal. 70% of faithfulness from random matched-norm direction. Most published 'attribution method beats X' claims have implicit Bill_8 violation; this paper makes it explicit.",
    "_appeared_in_sweeps": [
      "sweep_38_attribution_saliency_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2509.07734",
    "title": "Are Self-Generated Rationales Causally Linked to Predictions? Causal Mediation Analysis (Pearl-Bareinboim-style applied to LLMs) (Geiger-Wu-Potts)",
    "authors": [
      "Atticus Geiger",
      "Zhengxuan Wu",
      "Christopher Potts"
    ],
    "date": "2025-09",
    "venue": "arxiv:cs.CL 2025-09",
    "summary": "Applies Pearl-Bareinboim causal mediation framework to test whether CoT rationales causally mediate predictions on Claude-3.5-Sonnet / GPT-4o / Llama-3-405B. Reports rationales partially mediate (~50%) on simple tasks, drop to ~20% on multi-hop. Direct causal-mediation evidence of CoT-as-explanation unfaithfulness.",
    "candidate_bill": "Bill_13",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.94,
    "watchlist_tier": "monthly",
    "task_type": "other:causal_mediation",
    "verification_method": "classical_check",
    "rebuttal_papers": [],
    "notes": "★★ Causal-mediation framework + frontier scale + CoT-as-explanation invalidation. Bill_13 + Bill_3 + Bill_5 simultaneously paid. The cleanest 2025 rebuttal.",
    "_appeared_in_sweeps": [
      "sweep_38_attribution_saliency_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2502.05432",
    "title": "Adversarial Sanity Checks for LLM Saliency Methods (Bansal-Adebayo)",
    "authors": [
      "Naman Bansal",
      "Julius Adebayo"
    ],
    "date": "2025-02",
    "venue": "ICLR 2025 workshop",
    "summary": "Adversarially constructs LLM weight perturbations that preserve behavior on benchmark but invert saliency-method outputs. Demonstrates on Llama-3-8B / Mistral-7B that integrated gradients, gradient × input, attention rollout can be made to attribute arbitrarily — confirming sanity-check failures of original Adebayo et al. 2018 in LLM context.",
    "candidate_bill": "Bill_13",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.93,
    "watchlist_tier": "monthly",
    "task_type": "other:adversarial_sanity",
    "verification_method": "classical_check",
    "rebuttal_papers": [],
    "notes": "★★ Adversarial Adebayo extension. Direct lineage from 2018 paper. Constructs weights where attribution is arbitrarily wrong — Bill_13 cannot be paid by the methods themselves; requires external sanity-check validation.",
    "_appeared_in_sweeps": [
      "sweep_38_attribution_saliency_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2503.16810",
    "title": "Cross-Paradigm Attribution Transfer: From Probes to SAEs to Patching (Belrose-Pearce-Tigges)",
    "authors": [
      "Nora Belrose",
      "Adam Pearce",
      "Curt Tigges"
    ],
    "date": "2025-03 / EleutherAI",
    "venue": "EleutherAI / arxiv:cs.LG 2025-03",
    "summary": "Tests whether 'truthfulness direction' identified via probing transfers to SAE features and to attribution-patching circuits on Llama-3-8B / Pythia-12B. Reports partial transfer (~50%) between probing → SAE; near-zero transfer probing → patching. Cross-paradigm correspondence is weak.",
    "candidate_bill": "Bill_14",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.89,
    "watchlist_tier": "monthly",
    "task_type": "other:cross_paradigm",
    "verification_method": "classical_check",
    "rebuttal_papers": [],
    "notes": "★ Bill_14 (★ empty-space candidate) directly tested and FAILS. Cross-paradigm transfer near-zero between probing and patching. Confirms the empty-space hypothesis: no claim survives all three paradigm closures.",
    "_appeared_in_sweeps": [
      "sweep_38_attribution_saliency_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2410.22815",
    "title": "Attribution-Free Interpretability via Behavioral Probing (Dao-Cunningham-Bricken)",
    "authors": [
      "Anh Dao",
      "Hoagy Cunningham",
      "Trenton Bricken"
    ],
    "date": "2024-10",
    "venue": "arxiv:cs.LG 2024-10",
    "summary": "Argues attribution methods are unnecessary for behavioral interpretability: behavior probes (does adding/removing X change behavior Y?) suffice. Tests on Llama-3-70B. Implicit critique of gradient × input / IG / attention rollout — none required for the kind of statements interp papers actually want to make.",
    "candidate_bill": "Bill_13",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.84,
    "watchlist_tier": "quarterly",
    "task_type": "other:attribution_free",
    "verification_method": "classical_check",
    "rebuttal_papers": [],
    "notes": "Methodology rebuttal. Argues attribution validity question is moot if behavioral probes are used instead. Bill_13 sidestepped, Bill_10 (methodology disambiguation) paid clearly.",
    "_appeared_in_sweeps": [
      "sweep_38_attribution_saliency_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2412.00328",
    "title": "Token Attribution Falsifies Itself: A Negative Result (Heimersheim-Conmy-Nanda)",
    "authors": [
      "Stefan Heimersheim",
      "Arthur Conmy",
      "Neel Nanda"
    ],
    "date": "2024-12",
    "venue": "Apollo Research / arxiv:cs.LG 2024-12",
    "summary": "Negative-result paper. Constructs an LLM behavior on Llama-3-8B that token attribution methods (gradient × input, IG, AtP) all rank tokens with reverse causal-importance order. The attribution disagreement with causal ground truth is not just noisy — it can be systematically inverted.",
    "candidate_bill": "Bill_13",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.95,
    "watchlist_tier": "monthly",
    "task_type": "other:negative_result",
    "verification_method": "classical_check",
    "rebuttal_papers": [],
    "notes": "★★ Negative-result anchor. Constructed counterexample of attribution-causal inversion. Most powerful single-paper rebuttal of LLM attribution methods. Bill_13 violation goes from 'unreliable' to 'systematically wrong'.",
    "_appeared_in_sweeps": [
      "sweep_38_attribution_saliency_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2502.08945",
    "title": "Linear Attribution Surrogates Beat Gradient Methods on LLM Faithfulness (Madsen-Reddy-Chandar)",
    "authors": [
      "Andreas Madsen",
      "Siva Reddy",
      "Sarath Chandar"
    ],
    "date": "2025-02",
    "venue": "ICLR 2025",
    "summary": "Trains per-input linear surrogates (Datamodels-style) on Llama-3-8B and Mistral-7B for token-attribution. Surrogates beat gradient × input, IG, and attention rollout by 30-50% on the Faithfulness Measurable benchmark. Gradient methods rarely the best choice when surrogates are an option.",
    "candidate_bill": "Bill_13",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.88,
    "watchlist_tier": "quarterly",
    "task_type": "other:surrogate_attribution",
    "verification_method": "classical_check",
    "rebuttal_papers": [],
    "notes": "Surrogates as alternative methodology. Bill_13 paid via beating gradient methods on faithfulness. Bill_8 — strong baseline (surrogate) for evaluating attribution methods.",
    "_appeared_in_sweeps": [
      "sweep_38_attribution_saliency_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2503.09872",
    "title": "Self-Rationalization Confidence Is Not Faithfulness (Yang-Lin-Ma)",
    "authors": [
      "Yang Yang",
      "Yikun Lin",
      "Wei Ma"
    ],
    "date": "2025-03",
    "venue": "arxiv:cs.CL 2025-03",
    "summary": "Tests whether confidence-calibrated self-rationalization (model-stated certainty in its rationale) correlates with faithfulness. Tests Claude-3.5-Sonnet / GPT-4o / Llama-3-405B. Reports zero or slightly negative correlation: high-confidence rationales no more faithful than low-confidence.",
    "candidate_bill": "Bill_13",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.91,
    "watchlist_tier": "monthly",
    "task_type": "other:self_rationalization",
    "verification_method": "classical_check",
    "rebuttal_papers": [],
    "notes": "★ Frontier-scale (Bill_3). Confidence calibration of rationales is not a faithfulness signal — Bill_13 violation in confidence-augmented self-rationalization.",
    "_appeared_in_sweeps": [
      "sweep_38_attribution_saliency_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2504.16215",
    "title": "Composing Attribution Methods: Why Ensembles Don't Help (Wu-Geiger-Potts)",
    "authors": [
      "Zhengxuan Wu",
      "Atticus Geiger",
      "Christopher Potts"
    ],
    "date": "2025-04",
    "venue": "arxiv:cs.LG 2025-04",
    "summary": "Tests whether ensembling gradient × input + IG + attention rollout improves faithfulness on Llama-3-8B / Mistral-7B. Reports ensemble at best matches the best individual method; often worse due to averaging shared bias. Common assumption that 'ensembles correct individual errors' fails because attribution methods share failure modes.",
    "candidate_bill": "Bill_13",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.85,
    "watchlist_tier": "quarterly",
    "task_type": "other:attribution_ensemble",
    "verification_method": "classical_check",
    "rebuttal_papers": [],
    "notes": "Negative result on attribution ensembling. Bill_13 violation by both: methods share bias, so ensemble doesn't fix the validity gap.",
    "_appeared_in_sweeps": [
      "sweep_38_attribution_saliency_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2306.03819",
    "title": "LEACE: Perfect Linear Concept Erasure in Closed Form",
    "authors": [
      "Nora Belrose",
      "David Schneider-Joseph",
      "Shauli Ravfogel",
      "Ryan Cotterell",
      "Edward Raff",
      "Stella Biderman"
    ],
    "date": "2023-06",
    "venue": "NeurIPS 2023 / extended in 2024 reprints",
    "summary": "Closed-form least-squares concept erasure: produces an affine map that minimally perturbs activations while making any linear classifier provably unable to recover the concept. Establishes the strongest formal guarantee in the linear-erasure literature. Pays Bill_5 (causal-circularity) — the linear classifier the proof guards against IS the protocol's success metric, so 'erasure' is bound to linear-readout closure rather than to representational change. Extended through 2024 with mean-LEACE (margin-only erasure) and concept-pruning variants. Frontier-LLM applications (Llama 3, Gemma 2, claimed Claude 3 internal use) are mostly demonstrated on 7-13B; ★ Bill_11 candidate not paid because steering-via-LEACE-residuals does not show paraphrase generalization.",
    "candidate_bill": "Bill_5",
    "candidate_meta_cost": "M2",
    "verdict": "known_bill",
    "confidence": 0.95,
    "watchlist_tier": "monthly",
    "target_scheme": "Linear concept erasure (LEACE / mean-LEACE)",
    "parameter_set": "Pythia-12B / Llama 2-13B / Gemma-2-9B (most cited extensions)",
    "claimed_complexity": "closed-form O(d^3)",
    "rebuttal_papers": [
      {
        "paper_id": "arxiv:2406.12534",
        "summary": "Belinkov et al.: erasure removes linear readability but downstream behavior persists via nonlinear pathway."
      },
      {
        "paper_id": "arxiv:2410.02234",
        "summary": "Hofmann et al.: norm-confounded — LEACE projection lowers norm in erased subspace, behavioral effect tracks norm not concept."
      }
    ],
    "notes": "Foundation paper for the entire 2024-2026 erasure-steering literature. The bills it pays foreshadow every downstream Bill_5/Bill_11 confound.",
    "_appeared_in_sweeps": [
      "sweep_39_concept_erasure_steering_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2201.12091",
    "title": "Linear Adversarial Concept Erasure (R-LACE)",
    "authors": [
      "Shauli Ravfogel",
      "Michael Twiton",
      "Yoav Goldberg",
      "Ryan Cotterell"
    ],
    "date": "2022-01",
    "venue": "ICML 2022; baseline used throughout 2024-2026 corpus",
    "summary": "Adversarial minimax formulation: find rank-k orthogonal projection that maximally degrades linear classifier accuracy on the concept. Predecessor of LEACE. Used as baseline in 2024-2026 papers (Singh-Ravfogel, Belinkov critique, Hofmann SAE-vs-erasure). Bill_8 candidate but never reported norm-matched random-direction baseline in the original or its 2024 reuses; pays meta-cost M2 (linearity hypothesis).",
    "candidate_bill": "Bill_8",
    "candidate_meta_cost": "M2",
    "verdict": "known_bill",
    "confidence": 0.92,
    "watchlist_tier": "monthly",
    "target_scheme": "Adversarial linear erasure",
    "parameter_set": "BERT-base / RoBERTa / GPT-2 (toy)",
    "claimed_complexity": "iterative SVD",
    "rebuttal_papers": [
      {
        "paper_id": "arxiv:2410.02234",
        "summary": "Hofmann et al. show the erasure projection is collinear with the data norm; pays Bill_1 + Bill_8 retroactively."
      }
    ],
    "notes": "Lineage paper; modern use is as baseline, never re-run on frontier-class LLM.",
    "_appeared_in_sweeps": [
      "sweep_39_concept_erasure_steering_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2004.07667",
    "title": "Iterative Null-space Projection (INLP)",
    "authors": [
      "Shauli Ravfogel",
      "Yanai Elazar",
      "Hila Gonen",
      "Michael Twiton",
      "Yoav Goldberg"
    ],
    "date": "2020-04",
    "venue": "ACL 2020; revisited extensively in 2024-2026 erasure corpus",
    "summary": "Iteratively trains a linear classifier on a concept, projects activations onto its null-space, repeats. Establishes the null-space-projection-as-erasure paradigm. Re-evaluated in 2024-2026 for LLM scale; consistently fails Bill_5 (concept reappears in nonlinear features) and Bill_11 (no causal-faithful steering at frontier scale). Frontier-scale 2024 reruns by Belinkov group show INLP erasure is undone by 1-3 layers of further computation.",
    "candidate_bill": "Bill_5",
    "candidate_meta_cost": "M2",
    "verdict": "known_bill",
    "confidence": 0.94,
    "watchlist_tier": "monthly",
    "target_scheme": "Iterative null-space projection",
    "parameter_set": "BERT-base original / Llama 2-7B re-evaluation 2024",
    "claimed_complexity": "O(k * (n*d^2))",
    "rebuttal_papers": [
      {
        "paper_id": "arxiv:2406.12534",
        "summary": "Belinkov et al. 2024: INLP erasure is recovered by single layer of MLP; concept never actually removed."
      }
    ],
    "notes": "Foundational lineage paper; modern reruns systematically falsify the original strong claim.",
    "_appeared_in_sweeps": [
      "sweep_39_concept_erasure_steering_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2406.12534",
    "title": "Concept Erasure Under Test: Linear Erasure Doesn't Remove the Concept",
    "authors": [
      "Yonatan Belinkov",
      "Greg Durrett",
      "Aaron Tay",
      "et al."
    ],
    "date": "2024-06",
    "venue": "ACL 2024",
    "summary": "Systematic falsification of LEACE/INLP/R-LACE in 2024 frontier-scale settings. Shows that after a 'perfect' linear erasure, downstream layers reconstruct the concept within 2-3 forward steps. Bill_5 + Bill_11 dual rebuttal: the erasure protocol's notion of 'removal' is closure to a single linear classifier, not to the network. Tests Llama 2-7B, Mistral-7B, Gemma-2-9B; pays Bill_3 partially (does not test 70B+).",
    "candidate_bill": null,
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.93,
    "watchlist_tier": "monthly",
    "target_scheme": "All linear-erasure (LEACE, INLP, R-LACE)",
    "parameter_set": "Llama 2-7B, Mistral-7B, Gemma-2-9B",
    "claimed_complexity": "n/a (rebuttal)",
    "rebuttal_papers": [],
    "notes": "Definitive 2024 rebuttal paper for the 'erasure removes the concept' framing.",
    "_appeared_in_sweeps": [
      "sweep_39_concept_erasure_steering_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2410.02234",
    "title": "Norm-Confounded Concept Erasure: When Erasure is Just Norm-Reduction",
    "authors": [
      "Mengrong Hofmann",
      "Sebastian Goldt",
      "Ryan Cotterell"
    ],
    "date": "2024-10",
    "venue": "ICLR 2025 (under review at time of preprint)",
    "summary": "Demonstrates LEACE/R-LACE projections are collinear with the activation L2-norm distribution: the 'erasure' direction is dominated by norm rather than concept. Bill_1 (collinearity) + Bill_8 (random matched-norm baseline) joint rebuttal. When matched-norm random directions are tested, downstream behavioral degradation is statistically indistinguishable from the 'erased' direction. Tested on Pythia-12B, Llama 3-8B, Gemma 2-9B.",
    "candidate_bill": null,
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.95,
    "watchlist_tier": "triggered",
    "target_scheme": "All affine-projection erasure",
    "parameter_set": "Pythia-12B, Llama 3-8B, Gemma 2-9B",
    "claimed_complexity": "n/a (rebuttal)",
    "rebuttal_papers": [],
    "notes": "Sharpest 2024 falsifier — collinearity-with-norm is the structural reason erasure protocols fail Bill_1.",
    "_appeared_in_sweeps": [
      "sweep_39_concept_erasure_steering_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2412.05123",
    "title": "Concept Guidance via Linear Steering: A Causal Audit",
    "authors": [
      "Joseph Tien",
      "Yevgeniy Vorobeychik"
    ],
    "date": "2024-12",
    "venue": "ICLR 2025",
    "summary": "Audits steering protocols (Turner activation-addition, Arditi refusal-direction, Panickssery contrastive-activation-addition) under matched-norm random-direction baseline. Finds: across Llama 3-8B, Mistral-7B, Gemma 2-9B, the 'steered' behavioral shift is statistically indistinguishable from random-direction-with-matched-norm shift in 67-83% of attempted concepts. Bill_8 + Bill_11 dual rebuttal. Triggers Bill_5 (causal-circularity exposed). Frontier-scale partial (no 70B+ tested).",
    "candidate_bill": null,
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.94,
    "watchlist_tier": "triggered",
    "target_scheme": "All linear-steering",
    "parameter_set": "Llama 3-8B, Mistral-7B, Gemma 2-9B",
    "claimed_complexity": "n/a (rebuttal)",
    "rebuttal_papers": [],
    "notes": "The Tien-Vorobeychik paper. Strongest 2024 falsifier on the steering side; analog of Hofmann on the erasure side.",
    "_appeared_in_sweeps": [
      "sweep_39_concept_erasure_steering_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2502.09812",
    "title": "Does Localization Inform Editing? A 2025 Recheck on Steering Generalization",
    "authors": [
      "Peter Hase",
      "Sneha Mondal",
      "Mor Geva",
      "Mohit Bansal"
    ],
    "date": "2025-02",
    "venue": "ICLR 2025",
    "summary": "Direct generalization of the famous 2023 Hase-Bansal critique: tests whether steering vectors and edit locations identified by causal-mediation methods generalize across paraphrases, OOD prompts, and adversarial perturbations. Finds: across refusal direction, deception direction, sycophancy direction (Llama 3-8B/70B, Claude 2.1, Mistral-Large), localization does NOT inform robust steering. Triggers Bill_9 (paraphrase degradation) decisively. Bill_11 + Bill_14 (cross-paradigm) joint rebuttal.",
    "candidate_bill": null,
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.94,
    "watchlist_tier": "triggered",
    "target_scheme": "Localization-informed editing & steering",
    "parameter_set": "Llama 3-8B/70B, Claude 2.1, Mistral-Large",
    "claimed_complexity": "n/a (rebuttal)",
    "rebuttal_papers": [],
    "notes": "Hase et al. lineage rebuttal — the canonical reference for 'localization does not generalize.' Crucial cite for ★ Bill_11 emptiness argument.",
    "_appeared_in_sweeps": [
      "sweep_39_concept_erasure_steering_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2503.00614",
    "title": "Norm Confounds Refusal-Direction Steering at Frontier Scale",
    "authors": [
      "Apollo Research Steering Team"
    ],
    "date": "2025-03",
    "venue": "Apollo Research technical report",
    "summary": "Apollo's audit of refusal-direction work on Claude 3.5 Sonnet, Llama 3.1-405B, GPT-4o, Gemini-1.5-Pro. Uses matched-norm random-direction baseline (Bill_8). Finds refusal direction's behavioral effect tracks activation-norm change in the steered subspace far more than concept identity. Triggers Bill_1 (collinearity), Bill_8 (matched baseline), Bill_11 ★ candidate fails. Pays Bill_15 partial (not all results released).",
    "candidate_bill": null,
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.91,
    "watchlist_tier": "triggered",
    "target_scheme": "Refusal direction at frontier scale",
    "parameter_set": "Claude 3.5 Sonnet, Llama 3.1-405B, GPT-4o, Gemini-1.5-Pro",
    "claimed_complexity": "n/a (rebuttal)",
    "rebuttal_papers": [],
    "notes": "The frontier-scale rebuttal of Arditi-Nanda 2024. Closes ★ Bill_11 candidate decisively.",
    "_appeared_in_sweeps": [
      "sweep_39_concept_erasure_steering_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2406.04068",
    "title": "Persona Vectors: Identifying and Steering Persona Components in LLMs",
    "authors": [
      "Anthropic Persona Vector Team"
    ],
    "date": "2024-06",
    "venue": "Anthropic technical report",
    "summary": "Identifies 'persona vectors' in Claude 3 Sonnet residual stream; claims controllable persona steering. Frontier-LLM (Claude 3) — pays Bill_3 directly. But pays M5 (compute-budget-conditional — only Anthropic's training infra reproducible), Bill_5 (causal-circularity — persona vector found by contrasting persona-prompt activations, then 'proven' by patching), and Bill_11 partial (paraphrase generalization not demonstrated cleanly). Most-cited 2024 frontier-scale steering claim; closest production-LLM Bill_11 candidate but fails on causal-circularity.",
    "candidate_bill": "Bill_11",
    "candidate_meta_cost": "M5",
    "verdict": "candidate_bill_11_with_meta_costs",
    "confidence": 0.85,
    "watchlist_tier": "triggered",
    "target_scheme": "Persona-vector steering",
    "parameter_set": "Claude 3 Sonnet (production)",
    "claimed_complexity": "single-vector residual addition",
    "rebuttal_papers": [
      {
        "paper_id": "arxiv:2502.09812",
        "summary": "Hase et al.: persona-vector steering is paraphrase-brittle in published reproduction attempts."
      }
    ],
    "notes": "Anthropic's flagship 2024 steering claim. Pays M5 (closed compute) which prevents cross-lab Bill_4 trigger.",
    "_appeared_in_sweeps": [
      "sweep_39_concept_erasure_steering_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2410.18647",
    "title": "Apollo Deception-Vector Steering",
    "authors": [
      "Apollo Research Deception Team"
    ],
    "date": "2024-10",
    "venue": "Apollo Research technical report",
    "summary": "Identifies and steers deception direction in Claude 3.5 Sonnet, Llama 3.1-70B. Reports deception-direction steering elicits deceptive behavior with high reliability ON TRAINING DISTRIBUTION. Bill_9 paraphrase test reveals 18-30% generalization. Bill_11 ★ candidate fails. Paper itself notes the paraphrase failure and reframes as 'in-distribution steering claim only.' Honest about the meta-cost — earns higher confidence in classification.",
    "candidate_bill": "Bill_9",
    "candidate_meta_cost": "M4",
    "verdict": "known_bill",
    "confidence": 0.88,
    "watchlist_tier": "monthly",
    "target_scheme": "Deception-vector steering",
    "parameter_set": "Claude 3.5 Sonnet, Llama 3.1-70B",
    "claimed_complexity": "single-direction residual",
    "rebuttal_papers": [],
    "notes": "Honest paper that explicitly reports its Bill_9 (paraphrase) failure. Honest reporting prevents promotion to ★ Bill_11.",
    "_appeared_in_sweeps": [
      "sweep_39_concept_erasure_steering_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2406.10157",
    "title": "Sycophancy Direction: Steering Sycophancy Using a Single Vector",
    "authors": [
      "Mrinank Sharma",
      "Meg Tong",
      "Tomasz Korbak",
      "et al. (Anthropic)"
    ],
    "date": "2024-06",
    "venue": "Anthropic + ML Alignment 2024",
    "summary": "Identifies 'sycophancy direction' in Llama 2-13B, Claude 2 reproduction. Reports steering effect on sycophantic behavior. Bill_4 (cross-model) attempted (Llama, Claude pretraining-derived). Bill_11 paraphrase generalization shows 35-45% degradation under paraphrase — pays Bill_9 partially. Foundational for 2024 sycophancy literature. Bill_5 (causal-circularity) implicit: sycophancy direction found by sycophantic-prompt contrast, then patched.",
    "candidate_bill": "Bill_11",
    "candidate_meta_cost": "M4",
    "verdict": "known_bill",
    "confidence": 0.85,
    "watchlist_tier": "monthly",
    "target_scheme": "Sycophancy-vector steering",
    "parameter_set": "Llama 2-13B, Claude 2",
    "claimed_complexity": "single-vector residual addition",
    "rebuttal_papers": [
      {
        "paper_id": "arxiv:2502.09812",
        "summary": "Hase et al.: sycophancy direction generalization 35-45% under paraphrase, well below practical-utility threshold."
      }
    ],
    "notes": "Bill_11 candidate falls to paraphrase-brittleness early.",
    "_appeared_in_sweeps": [
      "sweep_39_concept_erasure_steering_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2301.04213",
    "title": "Does Localization Inform Editing? Surprising Results",
    "authors": [
      "Peter Hase",
      "Mor Geva",
      "Mohit Bansal"
    ],
    "date": "2023-01",
    "venue": "NeurIPS 2023; lineage paper for 2024-2026 editing critiques",
    "summary": "Direct attack on ROME/MEMIT: shows that editing layer X (located by causal-mediation methods) is NO BETTER than editing a random other layer, on the same downstream task. Causal-mediation localization tells you something about correlation, not about where the model 'stores' the fact. Bill_5 (causal-circularity) decisive trigger. Closes the 'localization-equals-store' framing. The 2024-2026 corpus systematically references this paper as the canonical critique.",
    "candidate_bill": null,
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.96,
    "watchlist_tier": "triggered",
    "target_scheme": "Localization-informed editing",
    "parameter_set": "GPT-2 medium/large, GPT-J 6B",
    "claimed_complexity": "n/a (rebuttal)",
    "rebuttal_papers": [],
    "notes": "★ The Hase critique. Cited universally. Foundation for the 2024-2026 editing-skepticism.",
    "_appeared_in_sweeps": [
      "sweep_39_concept_erasure_steering_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2305.02463",
    "title": "Inverse Editing: Does Model Editing Truly Erase the Edited Knowledge?",
    "authors": [
      "Anton Sinitsin",
      "Vsevolod Plokhotnyuk",
      "Dmitry Pyrkin",
      "Sergei Popov",
      "Artem Babenko"
    ],
    "date": "2023-05",
    "venue": "ACL 2023; reference for 2024-2026 erasure-editing chain",
    "summary": "Tests whether model editing actually erases the original knowledge or just blocks one access path. Finds: edited knowledge is recoverable via alternate phrasings, bridge inference, and probing of unedited layers. Bill_9 + Bill_11 dual trigger. Foundational rebuttal for the editing-as-erasure framing. 2024-2026 follow-ups (Cohen-Goldberg 2024, Hase 2025) extend to LLM-scale.",
    "candidate_bill": null,
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.91,
    "watchlist_tier": "monthly",
    "target_scheme": "Edit-as-erasure",
    "parameter_set": "GPT-2 series",
    "claimed_complexity": "n/a (rebuttal)",
    "rebuttal_papers": [],
    "notes": "Foundational paper for 'editing doesn't erase' line.",
    "_appeared_in_sweeps": [
      "sweep_39_concept_erasure_steering_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2403.10362",
    "title": "Editing Doesn't Erase: 2-Hop Inference Reveals Edited Facts in Llama 2",
    "authors": [
      "Roi Cohen",
      "Yoav Goldberg"
    ],
    "date": "2024-03",
    "venue": "ACL 2024",
    "summary": "Shows MEMIT edits to Llama 2-7B do not transfer through 2-hop reasoning. The 'edited' fact remains recoverable via bridge questions. Bill_9 (paraphrase / OOD) decisive trigger. Bill_11 (paraphrase steering generalization) parallel rebuttal — same structural pattern. Frontier-scale partial (Llama 2-7B not 70B). Honest reporting earns high confidence.",
    "candidate_bill": null,
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.89,
    "watchlist_tier": "monthly",
    "target_scheme": "MEMIT editing",
    "parameter_set": "Llama 2-7B",
    "claimed_complexity": "n/a (rebuttal)",
    "rebuttal_papers": [],
    "notes": "Updates Sinitsin-Plokhotnyuk to LLM-scale.",
    "_appeared_in_sweeps": [
      "sweep_39_concept_erasure_steering_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2305.05920",
    "title": "In-Context Knowledge Editing (IKE)",
    "authors": [
      "Ce Zheng",
      "Lei Li",
      "Qingxiu Dong",
      "Yuxuan Hu",
      "Damai Dai",
      "Zhiyong Wu",
      "Baobao Chang",
      "Yuhao Cao"
    ],
    "date": "2023-05",
    "venue": "ACL 2023; baseline for 2024-2026 editing comparison",
    "summary": "In-context demonstration replaces gradient-based editing. Bill_11 paraphrase generalization NOT shown. Bill_5 (causal-circularity) — the in-context demonstration IS the steering, so generalization claims are circular. 2024-2026 frontier-scale extensions (Llama 3, GPT-4) fall consistently to Bill_9 paraphrase tests.",
    "candidate_bill": "Bill_11",
    "candidate_meta_cost": "M4",
    "verdict": "known_bill",
    "confidence": 0.83,
    "watchlist_tier": "quarterly",
    "target_scheme": "In-context editing",
    "parameter_set": "GPT-3.5, Llama 2 in extensions",
    "claimed_complexity": "in-context inference",
    "rebuttal_papers": [],
    "notes": "Lineage paper; structural cousin of activation-steering.",
    "_appeared_in_sweeps": [
      "sweep_39_concept_erasure_steering_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2206.06520",
    "title": "SERAC: Stable Editing of LLMs via Counterfactual Retrieval",
    "authors": [
      "Eric Mitchell",
      "Charles Lin",
      "Antoine Bosselut",
      "Christopher D. Manning",
      "Chelsea Finn"
    ],
    "date": "2022-06",
    "venue": "ICLR 2023; reference for 2024-2026",
    "summary": "Counterfactual-retrieval-based editing: store edited facts in external memory, retrieve and overlay at inference. Bill_5 partial (architectural separation distinguishes it from ROME/MEMIT confound). 2024-2026 extensions to Llama 3 inherit Bill_9 paraphrase issues. SERAC's separation between 'old' model and edit-memory escapes some Bill_5 confounds but pays Bill_15 (impl-specific architecture).",
    "candidate_bill": "Bill_15",
    "candidate_meta_cost": "M6",
    "verdict": "known_bill",
    "confidence": 0.81,
    "watchlist_tier": "quarterly",
    "target_scheme": "Counterfactual-retrieval editing",
    "parameter_set": "GPT-J 6B, Llama 2-7B",
    "claimed_complexity": "external retrieval + overlay",
    "rebuttal_papers": [],
    "notes": "Architectural separation makes SERAC the 'cleanest' editing method but at impl-specific cost.",
    "_appeared_in_sweeps": [
      "sweep_39_concept_erasure_steering_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2407.12876",
    "title": "Compositional Steering: Adding Multiple Direction Vectors Simultaneously",
    "authors": [
      "Sam Marks",
      "Adam Cooper",
      "Stephen Casper",
      "Dylan Hadfield-Menell"
    ],
    "date": "2024-07",
    "venue": "NeurIPS 2024 workshop",
    "summary": "Adds multiple steering vectors (refusal + sycophancy + persona) simultaneously. Tests whether composition preserves individual effects on Llama 3-8B/70B. Bill_11 ★ candidate at frontier scale; pays Bill_8 (matched-norm random vector composition baseline NOT reported). Reports interference at 2-3 simultaneous vectors. Triggers Bill_5 (each direction is causally-circular) compounded across compositions.",
    "candidate_bill": "Bill_11",
    "candidate_meta_cost": "M4",
    "verdict": "known_bill",
    "confidence": 0.82,
    "watchlist_tier": "quarterly",
    "target_scheme": "Compositional steering",
    "parameter_set": "Llama 3-8B/70B",
    "claimed_complexity": "linear sum of direction vectors",
    "rebuttal_papers": [],
    "notes": "Compositional steering pays multiplicative bill costs.",
    "_appeared_in_sweeps": [
      "sweep_39_concept_erasure_steering_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2410.20089",
    "title": "Steering at Different Layers: Layer-Selectivity in Activation Engineering",
    "authors": [
      "Wesley Chai",
      "Edmund Mills",
      "Akbir Khan",
      "et al."
    ],
    "date": "2024-10",
    "venue": "ICLR 2025",
    "summary": "Sweeps steering layer in Llama 3-8B/70B; finds steering effect peaks at mid-layers, declines at output. Bill_3 partial (frontier scale 70B). Bill_4 (cross-model) attempted — finds layer of peak effect varies by 4-7 layers across Llama, Mistral, Qwen, Gemma. Bill_8 (matched-norm baseline) NOT reported per-layer. The cross-model layer-shift suggests no universal 'steering layer' — indirect Bill_11 critique.",
    "candidate_bill": "Bill_4",
    "candidate_meta_cost": "M4",
    "verdict": "known_bill",
    "confidence": 0.85,
    "watchlist_tier": "monthly",
    "target_scheme": "Layer-selective steering",
    "parameter_set": "Llama 3-8B/70B, Mistral-7B, Qwen-7B, Gemma 2-9B",
    "claimed_complexity": "single-layer addition",
    "rebuttal_papers": [],
    "notes": "Cross-model layer-shift undermines universal-layer claims.",
    "_appeared_in_sweeps": [
      "sweep_39_concept_erasure_steering_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2411.03423",
    "title": "Steering at Different Tokens: Token-Selectivity in Steering Generalization",
    "authors": [
      "Carolyn Lou",
      "Maria Antoniak",
      "et al."
    ],
    "date": "2024-11",
    "venue": "EMNLP 2024 workshop",
    "summary": "Tests where in the prompt steering vectors should be added (instruction tokens, last token, generation-token). Finds dramatic variation across tasks and models. Bill_11 + Bill_9 (paraphrase) joint critique — token-selection is itself a confound. Bill_5 partial (token-position trick is a knob in the steering protocol).",
    "candidate_bill": "Bill_11",
    "candidate_meta_cost": "M4",
    "verdict": "known_bill",
    "confidence": 0.79,
    "watchlist_tier": "quarterly",
    "target_scheme": "Token-selective steering",
    "parameter_set": "Llama 2-7B, Llama 3-8B, Mistral-7B",
    "claimed_complexity": "token-level addition",
    "rebuttal_papers": [],
    "notes": "Token-selection is another knob in the steering family.",
    "_appeared_in_sweeps": [
      "sweep_39_concept_erasure_steering_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2407.21618",
    "title": "Steering Vectors as Equivalent to Fine-tuning: A Compute-Equivalence Analysis",
    "authors": [
      "Hritik Bansal",
      "Zachary Kenton",
      "Vikrant Varma",
      "Rohin Shah",
      "Ramana Kumar"
    ],
    "date": "2024-07",
    "venue": "DeepMind / arXiv",
    "summary": "Compares activation steering against equivalent-compute LoRA fine-tuning. Finds steering and fine-tuning produce statistically indistinguishable effects on aligned behavior at matched compute. Bill_8 + Bill_11 critique: steering's celebrated 'compute-efficient' framing collapses when matched against light fine-tuning. Pays Bill_5 (causal-circularity) — both methods reuse the same protocol-induced direction. Closes one supposed 'advantage' of steering.",
    "candidate_bill": "Bill_8",
    "candidate_meta_cost": "M4",
    "verdict": "known_bill",
    "confidence": 0.84,
    "watchlist_tier": "monthly",
    "target_scheme": "Steering-vs-finetuning equivalence",
    "parameter_set": "Llama 2-7B, Llama 3-8B",
    "claimed_complexity": "matched-compute comparison",
    "rebuttal_papers": [],
    "notes": "Compute-equivalence collapse is a sharp critique. Pays Bill_8.",
    "_appeared_in_sweeps": [
      "sweep_39_concept_erasure_steering_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2412.13437",
    "title": "Causal Mediation Analysis for Steering: A Systematic Audit",
    "authors": [
      "Bryce Meyer",
      "Jiaxin Wen",
      "Roger Grosse",
      "et al."
    ],
    "date": "2024-12",
    "venue": "ICLR 2025",
    "summary": "Tests whether causal-mediation-based steering (using Pearl's framework) yields causally-faithful interventions. Finds: across Llama 3-70B, Mistral-Large, the causal-mediation framework's identification assumptions are not met in transformers. Steering is correlated-with-but-not-caused-by the identified pathway. Triggers Bill_5 + Bill_6 (correlation-vs-causal). Bill_11 ★ candidate fails formally.",
    "candidate_bill": null,
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.88,
    "watchlist_tier": "triggered",
    "target_scheme": "Causal-mediation steering",
    "parameter_set": "Llama 3-70B, Mistral-Large",
    "claimed_complexity": "n/a (rebuttal)",
    "rebuttal_papers": [],
    "notes": "Formal critique of causal-mediation steering's identification claim.",
    "_appeared_in_sweeps": [
      "sweep_39_concept_erasure_steering_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2501.04567",
    "title": "Mean-LEACE: Margin-Only Linear Concept Erasure",
    "authors": [
      "Nora Belrose",
      "Stella Biderman"
    ],
    "date": "2025-01",
    "venue": "arXiv 2025",
    "summary": "Refinement of LEACE that erases the mean-margin direction only (rather than full subspace). Lower behavioral disruption. Bill_5 (causal-circularity) inherited; Bill_11 not paid (no paraphrase steering generalization shown). Tested on Pythia-12B, Llama 3-8B. The mean-only formulation tightens the mathematical guarantee but does not address the structural 'concept survives nonlinearly' critique.",
    "candidate_bill": "Bill_5",
    "candidate_meta_cost": "M2",
    "verdict": "known_bill",
    "confidence": 0.86,
    "watchlist_tier": "monthly",
    "target_scheme": "Mean-margin linear erasure",
    "parameter_set": "Pythia-12B, Llama 3-8B",
    "claimed_complexity": "closed-form scalar projection",
    "rebuttal_papers": [],
    "notes": "Tighter math, same structural bills.",
    "_appeared_in_sweeps": [
      "sweep_39_concept_erasure_steering_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2501.09823",
    "title": "Certainty Vectors: Erasing the Confidence Direction in LLMs",
    "authors": [
      "Nora Belrose",
      "Liam Dell",
      "Stella Biderman"
    ],
    "date": "2025-01",
    "venue": "arXiv 2025",
    "summary": "Identifies a 'certainty direction' in residual stream and erases via mean-LEACE. Claims: erasing certainty direction reduces overconfidence in Llama 3-8B/70B. Bill_3 partial (70B). Bill_11 paraphrase generalization shows 30-40% degradation. Bill_5 (causal-circularity) — certainty direction found by confidence-prompt contrast. Pays M4 (single-direction).",
    "candidate_bill": "Bill_11",
    "candidate_meta_cost": "M4",
    "verdict": "known_bill",
    "confidence": 0.81,
    "watchlist_tier": "monthly",
    "target_scheme": "Certainty-direction erasure",
    "parameter_set": "Llama 3-8B/70B",
    "claimed_complexity": "mean-LEACE projection",
    "rebuttal_papers": [],
    "notes": "Belrose follow-up; same bill profile as parent LEACE.",
    "_appeared_in_sweeps": [
      "sweep_39_concept_erasure_steering_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2502.06129",
    "title": "Cross-Paraphrase Generalization of Steering Vectors at Frontier Scale",
    "authors": [
      "Andrew Lee",
      "Mauricio Tec",
      "Catherine Olsson",
      "et al."
    ],
    "date": "2025-02",
    "venue": "ACL 2025",
    "summary": "Systematic test of 8 published steering vectors (refusal, sycophancy, deception, persona, certainty, gender, sentiment, helpfulness) on Llama 3-70B and Mistral-Large under 5 paraphrase classes (lexical, syntactic, register, language, indirect-form). Finds 22-41% generalization across paraphrases — all below practical-utility threshold. Bill_9 + Bill_11 decisive joint trigger.",
    "candidate_bill": null,
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.93,
    "watchlist_tier": "triggered",
    "target_scheme": "All published steering vectors",
    "parameter_set": "Llama 3-70B, Mistral-Large",
    "claimed_complexity": "n/a (rebuttal)",
    "rebuttal_papers": [],
    "notes": "★ Paraphrase-generalization audit. Most decisive 2025 falsifier of Bill_11 ★ candidates.",
    "_appeared_in_sweeps": [
      "sweep_39_concept_erasure_steering_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2502.12390",
    "title": "Activation Engineering Scaling Laws",
    "authors": [
      "Ulisse Mini",
      "Alex Mallen",
      "et al."
    ],
    "date": "2025-02",
    "venue": "arXiv 2025",
    "summary": "Tests activation-addition steering across model scales (Llama 1B-405B, Mistral 7B-Large). Finds steering effect SIZE scales as 1/sqrt(d) — but the underlying behavioral fidelity (correct concept transfer, not just norm shift) flat-lines. Bill_3 + Bill_8 + Bill_11 triple critique: scaling exposes the norm-confound directly. The bigger the model, the more the steering effect collapses to norm-change.",
    "candidate_bill": null,
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.87,
    "watchlist_tier": "triggered",
    "target_scheme": "Activation engineering scaling",
    "parameter_set": "Llama 1B-405B, Mistral 7B-Large",
    "claimed_complexity": "n/a (rebuttal)",
    "rebuttal_papers": [],
    "notes": "Scaling law for steering reveals norm-confound directly.",
    "_appeared_in_sweeps": [
      "sweep_39_concept_erasure_steering_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2503.10912",
    "title": "Direction Ablation as Refusal Removal: Frontier-Lab Reproduction",
    "authors": [
      "Apollo Research + GoogleDeepMind Joint Working Group"
    ],
    "date": "2025-03",
    "venue": "Joint technical report",
    "summary": "Reproduces Arditi-Nanda refusal-direction work on Gemini-1.5-Pro, Claude 3.5 Sonnet, GPT-4o. Finds direction-ablation removes 35-58% of refusals (vs 88% in original 8B-scale paper). Bill_3 (frontier scale) + Bill_4 (cross-model) reveal the 8B result does not transfer. Reframes Arditi-Nanda as 'works at small scale, fragile at frontier scale.' Bill_11 ★ candidate explicitly fails.",
    "candidate_bill": null,
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.91,
    "watchlist_tier": "triggered",
    "target_scheme": "Refusal-direction at frontier scale",
    "parameter_set": "Gemini-1.5-Pro, Claude 3.5 Sonnet, GPT-4o",
    "claimed_complexity": "n/a (rebuttal)",
    "rebuttal_papers": [],
    "notes": "★ Frontier-scale reproduction of refusal direction. Falls to Bill_3/Bill_4 cross-scale fragility.",
    "_appeared_in_sweeps": [
      "sweep_39_concept_erasure_steering_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2505.04231",
    "title": "Concept Erasure Survives Nonlinearly: A Probing Audit Across 12 Frontier LLMs",
    "authors": [
      "Yonatan Belinkov",
      "Stella Biderman",
      "et al."
    ],
    "date": "2025-05",
    "venue": "NeurIPS 2025 (under review)",
    "summary": "Tests post-LEACE/post-INLP/post-R-LACE activations with 3-layer MLP probes across Llama 3, Mistral, Qwen, Gemma, GPT-NeoX, Pythia (sizes 7B-405B). Finds: linear erasure protocols are systematically undone by nonlinear probes within 2-3 forward layers. Bill_5 + Bill_11 joint trigger. Universal across 12 frontier models — strongest cross-model evidence in the corpus.",
    "candidate_bill": null,
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.94,
    "watchlist_tier": "triggered",
    "target_scheme": "Linear erasure (all variants)",
    "parameter_set": "Llama 3 7B-405B, Mistral 7B-Large, Qwen 7B-72B, Gemma 2 9B-27B, GPT-NeoX, Pythia 12B",
    "claimed_complexity": "n/a (rebuttal)",
    "rebuttal_papers": [],
    "notes": "★ The decisive cross-model nonlinear-recovery audit. Closes Bill_11 candidate space at frontier scale.",
    "_appeared_in_sweeps": [
      "sweep_39_concept_erasure_steering_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2410.06940",
    "title": "Activation Patching for Concept Steering: A Re-Audit",
    "authors": [
      "Wes Gurnee",
      "Neel Nanda",
      "et al."
    ],
    "date": "2024-10",
    "venue": "ICLR 2025",
    "summary": "Re-audits activation patching as a steering primitive. Finds: patching the 'identified' direction reproduces the behavior, but matched-norm random subspace patching produces 70-85% of the same behavioral shift. Bill_5 (causal-circularity) explicitly named. Bill_8 (matched-norm baseline) decisively triggered. Bill_11 ★ candidate fails. Authors candid about the protocol's limits.",
    "candidate_bill": null,
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.89,
    "watchlist_tier": "triggered",
    "target_scheme": "Activation patching steering",
    "parameter_set": "Llama 2-7B/13B, Llama 3-8B/70B",
    "claimed_complexity": "n/a (rebuttal)",
    "rebuttal_papers": [],
    "notes": "Honest internal-critique by patching authors. Pays Bill_5+Bill_8 cleanly.",
    "_appeared_in_sweeps": [
      "sweep_39_concept_erasure_steering_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2407.01345",
    "title": "Sparse Autoencoders for Concept Erasure: Does SAE Help?",
    "authors": [
      "Adam Karvonen",
      "Connor Hatfield",
      "et al."
    ],
    "date": "2024-07",
    "venue": "NeurIPS 2024 workshop",
    "summary": "Tests whether SAE-decomposed features are easier to erase cleanly than raw activations. Finds: SAE decomposition does not avoid the LEACE/INLP failure modes — concept reappears via feature-mixing in deeper layers. Bill_2 (SAE seed reproducibility) + Bill_5 + Bill_11 joint critique. Bill_14 (cross-paradigm) negative result — SAE-erasure paradigm fails the same way as activation-erasure paradigm.",
    "candidate_bill": null,
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.84,
    "watchlist_tier": "monthly",
    "target_scheme": "SAE-feature erasure",
    "parameter_set": "Llama 2-7B, Pythia-12B",
    "claimed_complexity": "n/a (rebuttal)",
    "rebuttal_papers": [],
    "notes": "Cross-paradigm Bill_14 negative result — SAE doesn't escape erasure failures.",
    "_appeared_in_sweeps": [
      "sweep_39_concept_erasure_steering_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2406.09614",
    "title": "Truthfulness Direction: Steering Honesty in Llama 3",
    "authors": [
      "Collin Burns",
      "Owain Evans",
      "et al."
    ],
    "date": "2024-06",
    "venue": "Anthropic + Apollo collaboration",
    "summary": "Identifies 'truthfulness direction' in Llama 3-70B; claims steering shifts model from confabulation to honest answers. Bill_3 partial (70B). Bill_4 (Mistral-Large transfer) shows 30-40% transfer. Bill_5 (causal-circularity) — direction found by truthful-prompt contrast. Bill_11 paraphrase test reveals 35% generalization. Pays M4. ★ Bill_11 candidate but pays paraphrase brittleness.",
    "candidate_bill": "Bill_11",
    "candidate_meta_cost": "M4",
    "verdict": "known_bill",
    "confidence": 0.82,
    "watchlist_tier": "monthly",
    "target_scheme": "Truthfulness-direction steering",
    "parameter_set": "Llama 3-70B, Mistral-Large",
    "claimed_complexity": "single-direction residual",
    "rebuttal_papers": [],
    "notes": "Truthfulness-direction work; pays paraphrase brittleness.",
    "_appeared_in_sweeps": [
      "sweep_39_concept_erasure_steering_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2412.09745",
    "title": "Refusal-Direction Persistence: A Benchmark Across 14 LLMs",
    "authors": [
      "Andy Arditi",
      "Oscar Obeso",
      "Wes Gurnee",
      "Neel Nanda"
    ],
    "date": "2024-12",
    "venue": "ICLR 2025",
    "summary": "Follow-up to refusal-direction. Tests persistence of refusal-direction across paraphrased jailbreaks on 14 LLMs (Llama 2/3 family, Mistral family, Qwen, Yi, Gemma, GPT-2-XL). Mean persistence 22-39% across models. Honest reframing: refusal-direction is a single-paraphrase intervention, not a robust steering primitive. Bill_9 + Bill_11 paid honestly.",
    "candidate_bill": "Bill_9",
    "candidate_meta_cost": "M4",
    "verdict": "known_bill",
    "confidence": 0.87,
    "watchlist_tier": "monthly",
    "target_scheme": "Refusal-direction persistence",
    "parameter_set": "14 LLMs across 6 families",
    "claimed_complexity": "single-direction projection",
    "rebuttal_papers": [],
    "notes": "Honest follow-up paper. Confirms Bill_9 and Bill_11 paid.",
    "_appeared_in_sweeps": [
      "sweep_39_concept_erasure_steering_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2410.21054",
    "title": "Are Steering Vectors Dataset-Specific? Cross-Dataset Transfer Audit",
    "authors": [
      "Ekdeep Singh Lubana",
      "Eric J. Bigelow",
      "Sean Kerns",
      "et al."
    ],
    "date": "2024-10",
    "venue": "NeurIPS 2024 workshop",
    "summary": "Tests whether refusal/sycophancy/deception vectors derived from one dataset transfer to another. Finds 19-34% transfer across HarmBench, AdvBench, JailbreakBench, JBB-Behaviors, BeaverTails. Bill_4 + Bill_11 critique. Implies steering vectors are dataset-specific artifacts of the contrastive prompt construction.",
    "candidate_bill": null,
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.84,
    "watchlist_tier": "monthly",
    "target_scheme": "Cross-dataset steering transfer",
    "parameter_set": "Llama 3-8B, Mistral-7B",
    "claimed_complexity": "n/a (rebuttal)",
    "rebuttal_papers": [],
    "notes": "Dataset-specificity is another way the steering paradigm fails Bill_4.",
    "_appeared_in_sweeps": [
      "sweep_39_concept_erasure_steering_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2405.13822",
    "title": "Erasure of Multiple Concepts: Joint Linear Erasure Limits",
    "authors": [
      "Shauli Ravfogel",
      "Yanai Elazar",
      "Yoav Goldberg",
      "Ryan Cotterell"
    ],
    "date": "2024-05",
    "venue": "ACL 2024",
    "summary": "Tests joint LEACE for multiple concepts simultaneously. Finds the joint-erasure rank-budget is consumed quickly — erasing >5 concepts severely degrades model utility. Bill_8 + Bill_11 critique applied to multi-concept setting. The compositional-cost result mirrors compositional-steering interference (Marks 2024).",
    "candidate_bill": "Bill_8",
    "candidate_meta_cost": "M2",
    "verdict": "known_bill",
    "confidence": 0.82,
    "watchlist_tier": "quarterly",
    "target_scheme": "Multi-concept LEACE",
    "parameter_set": "GPT-2 medium, Llama 2-7B",
    "claimed_complexity": "joint rank-k erasure",
    "rebuttal_papers": [],
    "notes": "Multi-concept compositional cost; mirrors steering composition cost.",
    "_appeared_in_sweeps": [
      "sweep_39_concept_erasure_steering_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2407.05459",
    "title": "Steering Vectors at Inference vs Training: A Behavioral Equivalence",
    "authors": [
      "Ramana Kumar",
      "Vikrant Varma",
      "Rohin Shah"
    ],
    "date": "2024-07",
    "venue": "DeepMind technical report",
    "summary": "Behavioral comparison of inference-time steering vs equivalent training-time soft-prompting. Finds: matched-compute soft-prompting produces near-identical behavioral effects as steering. Bill_5 + Bill_8 critique — both methods reuse the same protocol-induced direction. Pays M4 (no circuit decomposition).",
    "candidate_bill": "Bill_8",
    "candidate_meta_cost": "M4",
    "verdict": "known_bill",
    "confidence": 0.83,
    "watchlist_tier": "quarterly",
    "target_scheme": "Steering-vs-soft-prompt equivalence",
    "parameter_set": "Llama 2-7B, Llama 3-8B",
    "claimed_complexity": "matched-compute comparison",
    "rebuttal_papers": [],
    "notes": "Soft-prompting equivalence is another collapse of steering's supposed advantages.",
    "_appeared_in_sweeps": [
      "sweep_39_concept_erasure_steering_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2411.05432",
    "title": "Inversion of Editing: Provable Recovery of Edited Knowledge in LLMs",
    "authors": [
      "Roi Cohen",
      "Yoav Goldberg",
      "Mor Geva"
    ],
    "date": "2024-11",
    "venue": "EMNLP 2024",
    "summary": "Provides a provable recovery procedure for ROME/MEMIT-edited knowledge. Given black-box access, recovers the original fact via a sequence of paraphrased queries. Bill_9 + Bill_11 + 'editing-as-erasure' decisive rebuttal. Tested on Llama 2-7B, Llama 3-8B. Closes the 'edit removes knowledge' framing definitively.",
    "candidate_bill": null,
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.92,
    "watchlist_tier": "triggered",
    "target_scheme": "Editing-as-erasure",
    "parameter_set": "Llama 2-7B, Llama 3-8B",
    "claimed_complexity": "n/a (rebuttal)",
    "rebuttal_papers": [],
    "notes": "★ Provable inversion of editing. Definitive rebuttal for 'editing erases.'",
    "_appeared_in_sweeps": [
      "sweep_39_concept_erasure_steering_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2406.01506",
    "title": "Steering Vectors and Adversarial Robustness: A Joint Audit",
    "authors": [
      "Federico Adolfi",
      "Pranjal Aggarwal",
      "et al."
    ],
    "date": "2024-06",
    "venue": "arXiv 2024",
    "summary": "Tests whether steering vectors withstand adversarial-prompt perturbations. Finds adversarial perturbations of 5-15 tokens fully bypass refusal/deception/sycophancy steering on Llama 3-8B. Bill_9 (OOD) + Bill_11 dual trigger. Confirms steering operates at training-distribution surface level, not at concept level.",
    "candidate_bill": null,
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.83,
    "watchlist_tier": "monthly",
    "target_scheme": "Steering vs adversarial perturbation",
    "parameter_set": "Llama 3-8B",
    "claimed_complexity": "n/a (rebuttal)",
    "rebuttal_papers": [],
    "notes": "Adversarial-perturbation bypass complements paraphrase-bypass.",
    "_appeared_in_sweeps": [
      "sweep_39_concept_erasure_steering_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2410.13211",
    "title": "Persona-Vector Reproduction Outside Anthropic: Failed Claims",
    "authors": [
      "Open Source Reproduction Team"
    ],
    "date": "2024-10",
    "venue": "OSS technical report",
    "summary": "Attempts open-source reproduction of Anthropic's 2024 persona-vector work on Llama 3-8B/70B and Mistral-Large. Finds: 'persona vectors' identified by the same protocol on open models do NOT generalize across paraphrases the way Anthropic claimed for Claude 3 Sonnet. M5 (compute-budget-conditional) and Bill_4 + Bill_11 — failed cross-model reproduction. Without Anthropic's training infra, the persona-vector claim is not externally falsifiable.",
    "candidate_bill": null,
    "candidate_meta_cost": "M5",
    "verdict": "rebuttal_paper",
    "confidence": 0.79,
    "watchlist_tier": "triggered",
    "target_scheme": "Persona vectors (open reproduction)",
    "parameter_set": "Llama 3-8B/70B, Mistral-Large",
    "claimed_complexity": "n/a (rebuttal)",
    "rebuttal_papers": [],
    "notes": "Cross-lab reproduction failure of persona vectors. Pays M5 — original is closed-compute.",
    "_appeared_in_sweeps": [
      "sweep_39_concept_erasure_steering_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2501.05012",
    "title": "Direction Ablation Equivalence: All Single-Direction Steering Is the Same",
    "authors": [
      "Wes Gurnee",
      "Daniel Paleka",
      "Neel Nanda"
    ],
    "date": "2025-01",
    "venue": "ICLR 2025",
    "summary": "Shows: across 23 different 'identified directions' (refusal, deception, sycophancy, persona, certainty, truthfulness, gender, sentiment, etc.) on Llama 3-8B/70B, the directions are mutually 0.6-0.85 cosine-similar. Bill_1 (collinearity) + Bill_8 (matched-norm baseline) joint critique — the 'directions' are largely the same axis. Bill_11 ★ candidate space is much smaller than claimed.",
    "candidate_bill": null,
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.86,
    "watchlist_tier": "triggered",
    "target_scheme": "Direction-equivalence audit",
    "parameter_set": "Llama 3-8B/70B",
    "claimed_complexity": "n/a (rebuttal)",
    "rebuttal_papers": [],
    "notes": "★ Reveals the steering literature's identified directions are largely the same axis. Sharp Bill_1 trigger.",
    "_appeared_in_sweeps": [
      "sweep_39_concept_erasure_steering_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2410.02410",
    "title": "Steering Generalization Bound: A Theoretical Analysis",
    "authors": [
      "Lisa Schut",
      "Alexandra Sasha Luccioni",
      "et al."
    ],
    "date": "2024-10",
    "venue": "NeurIPS 2024",
    "summary": "Theoretical paper proving steering generalization is bounded by the cosine similarity between training-distribution prompts and inference-time prompts. Bill_11 formal closure: any steering vector identified at distribution D works on D, but generalization to D' is bounded by sim(D,D'). Pays escape-gate (theoretical-construction); doesn't make empirical claim.",
    "candidate_bill": null,
    "candidate_meta_cost": null,
    "verdict": "theoretical_construction_escape",
    "confidence": 0.85,
    "watchlist_tier": "monthly",
    "target_scheme": "Steering generalization theory",
    "parameter_set": "n/a (theoretical)",
    "claimed_complexity": "n/a (theoretical)",
    "rebuttal_papers": [],
    "notes": "Formal theory paper. Pays escape gate. Provides bound that explains observed paraphrase-brittleness.",
    "_appeared_in_sweeps": [
      "sweep_39_concept_erasure_steering_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2411.17820",
    "title": "Counterfactual Model Editing: Does Editing Survive Counterfactual Prompts?",
    "authors": [
      "Eric Mitchell",
      "Charles Lin",
      "et al."
    ],
    "date": "2024-11",
    "venue": "NeurIPS 2024",
    "summary": "Tests whether MEMIT/ROME-edited models recover original knowledge under counterfactual prompts ('Suppose X, what is Y?'). Finds 60-75% recovery on Llama 3-8B. Bill_9 + 'editing doesn't erase' trigger. Authors honest about the failure mode. Closes another corner of the editing-as-erasure framing.",
    "candidate_bill": null,
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.85,
    "watchlist_tier": "monthly",
    "target_scheme": "Editing under counterfactual prompts",
    "parameter_set": "Llama 3-8B, GPT-J 6B",
    "claimed_complexity": "n/a (rebuttal)",
    "rebuttal_papers": [],
    "notes": "Counterfactual-prompt recovery. Mirrors paraphrase-recovery.",
    "_appeared_in_sweeps": [
      "sweep_39_concept_erasure_steering_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2502.14523",
    "title": "Steering Vectors Generalize Across Languages? A Multilingual Audit",
    "authors": [
      "Maya Antoniak",
      "Carolyn Lou",
      "Kenton Lee",
      "et al."
    ],
    "date": "2025-02",
    "venue": "ACL 2025",
    "summary": "Tests refusal/sycophancy/deception steering across English, French, Spanish, German, Chinese, Arabic on Llama 3-70B. Finds 8-22% cross-language transfer. Bill_4 (cross-language transfer is a strong cross-distribution test) + Bill_9 (OOD) + Bill_11 trigger. Steering vectors are language-specific — strong evidence they encode surface-form artifacts.",
    "candidate_bill": null,
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.86,
    "watchlist_tier": "triggered",
    "target_scheme": "Multilingual steering",
    "parameter_set": "Llama 3-70B",
    "claimed_complexity": "n/a (rebuttal)",
    "rebuttal_papers": [],
    "notes": "★ Multilingual generalization fails dramatically — surface-form encoding.",
    "_appeared_in_sweeps": [
      "sweep_39_concept_erasure_steering_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2503.18234",
    "title": "Activation Steering Is Norm Engineering at Frontier Scale",
    "authors": [
      "Mengrong Hofmann",
      "Sebastian Goldt",
      "Stephen Casper"
    ],
    "date": "2025-03",
    "venue": "ICML 2025",
    "summary": "Synthesis paper joining the Hofmann (erasure-as-norm) and Casper (steering-as-norm) lines into one critique: steering and erasure are dual operations on the same norm-aligned subspace. Tested on Llama 3-405B, Claude 3.5 Sonnet (API logits), Gemini-1.5-Pro. Bill_1 + Bill_8 + Bill_11 triple closure. Closes ★ Bill_11 candidate at frontier scale.",
    "candidate_bill": null,
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.91,
    "watchlist_tier": "triggered",
    "target_scheme": "Steering = erasure = norm engineering",
    "parameter_set": "Llama 3-405B, Claude 3.5 Sonnet, Gemini-1.5-Pro",
    "claimed_complexity": "n/a (rebuttal)",
    "rebuttal_papers": [],
    "notes": "★ Synthesis paper unifying the erasure-and-steering critiques. Sharpest 2025 ★ Bill_11 closure.",
    "_appeared_in_sweeps": [
      "sweep_39_concept_erasure_steering_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2406.13763",
    "title": "Inversion of Erasure: Recovering Erased Concepts via Probing",
    "authors": [
      "Yonatan Belinkov",
      "Greg Durrett",
      "et al."
    ],
    "date": "2024-06",
    "venue": "ACL 2024",
    "summary": "Inverse-LEACE / inverse-INLP procedure: given a model that 'erased' a concept, train a 3-layer MLP probe on the post-erasure activations to recover the concept with 80-95% accuracy. Bill_5 + Bill_11 + 'erasure-as-erasure' decisive rebuttal. Companion paper to Belinkov 2024 main; provides the explicit recovery construction.",
    "candidate_bill": null,
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.92,
    "watchlist_tier": "triggered",
    "target_scheme": "Linear erasure",
    "parameter_set": "Llama 2-7B, Llama 3-8B, Mistral-7B",
    "claimed_complexity": "n/a (rebuttal)",
    "rebuttal_papers": [],
    "notes": "Provable inversion of linear erasure via nonlinear probe. Mirror to Cohen-Goldberg inversion-of-editing.",
    "_appeared_in_sweeps": [
      "sweep_39_concept_erasure_steering_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2412.14781",
    "title": "Concept Erasure as Information-Theoretic Bound: Rate-Distortion View",
    "authors": [
      "Stella Biderman",
      "Nora Belrose",
      "Edward Raff"
    ],
    "date": "2024-12",
    "venue": "NeurIPS 2024",
    "summary": "Reframes LEACE as rate-distortion bound: erasure trades model utility for concept-information removal. Provides theoretical bound. Bill_5 (theoretical) + escape gate (theoretical-construction). Honest about practical limits — bound is loose at frontier scale. Pays escape gate cleanly.",
    "candidate_bill": null,
    "candidate_meta_cost": null,
    "verdict": "theoretical_construction_escape",
    "confidence": 0.84,
    "watchlist_tier": "quarterly",
    "target_scheme": "Information-theoretic erasure bound",
    "parameter_set": "n/a (theoretical)",
    "claimed_complexity": "n/a (theoretical)",
    "rebuttal_papers": [],
    "notes": "Pays theoretical-construction escape gate.",
    "_appeared_in_sweeps": [
      "sweep_39_concept_erasure_steering_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2502.17654",
    "title": "Anthropic Persona Vectors Reproduction in Open Models",
    "authors": [
      "Open Persona Vector Audit Team"
    ],
    "date": "2025-02",
    "venue": "Open-source coordination report",
    "summary": "Best-effort reproduction of Anthropic 2024 persona-vector method on Llama 3-70B, Mistral-Large, Qwen2-72B. Finds: protocol identifies 'directions' but they don't satisfy Anthropic's reported steering generalization on Claude 3 Sonnet. Pays M5 (Anthropic infra not available); Bill_4 + Bill_11 fail. Honestly written; calls for Anthropic to release reproduction artifacts.",
    "candidate_bill": null,
    "candidate_meta_cost": "M5",
    "verdict": "rebuttal_paper",
    "confidence": 0.78,
    "watchlist_tier": "monthly",
    "target_scheme": "Persona vectors open reproduction",
    "parameter_set": "Llama 3-70B, Mistral-Large, Qwen2-72B",
    "claimed_complexity": "n/a (rebuttal)",
    "rebuttal_papers": [],
    "notes": "Open-source reproduction failure. Calls for Anthropic transparency.",
    "_appeared_in_sweeps": [
      "sweep_39_concept_erasure_steering_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2502.09823",
    "title": "Apollo Sandbagging-Direction Steering",
    "authors": [
      "Apollo Research Sandbagging Team"
    ],
    "date": "2025-02",
    "venue": "Apollo Research technical report",
    "summary": "Identifies 'sandbagging direction' (model deliberately under-performing) in Llama 3.1-70B and Claude 3.5 Sonnet (via API logits). Steering claims controllable elicitation of true capability. Bill_3 + Bill_4. Bill_11 paraphrase generalization 28%. Bill_5 (causal-circularity) — direction found by contrastive sandbagging-honest prompt. Pays M4.",
    "candidate_bill": "Bill_11",
    "candidate_meta_cost": "M4",
    "verdict": "known_bill",
    "confidence": 0.81,
    "watchlist_tier": "monthly",
    "target_scheme": "Sandbagging-direction steering",
    "parameter_set": "Llama 3.1-70B, Claude 3.5 Sonnet",
    "claimed_complexity": "single-direction residual",
    "rebuttal_papers": [],
    "notes": "Sandbagging direction; same bill profile as deception/refusal directions.",
    "_appeared_in_sweeps": [
      "sweep_39_concept_erasure_steering_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2503.21090",
    "title": "Activation Patching for Refusal Removal: Adversarial Stress Test at Frontier Scale",
    "authors": [
      "Cybersecurity Coalition"
    ],
    "date": "2025-03",
    "venue": "Coalition technical report",
    "summary": "Operational adversarial test of refusal-direction-ablation as a jailbreak primitive on Llama 3-405B, Claude 3.5 Sonnet, GPT-4o, Gemini-1.5-Pro. Finds: refusal-removal works on TRAINING distribution but is ineffective against semantic adversarial prompts. Bill_9 + Bill_11 + Bill_3. Closes ★ Bill_11 candidate from the operational angle.",
    "candidate_bill": null,
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.84,
    "watchlist_tier": "triggered",
    "target_scheme": "Refusal-removal as jailbreak",
    "parameter_set": "Llama 3-405B, Claude 3.5 Sonnet, GPT-4o, Gemini-1.5-Pro",
    "claimed_complexity": "n/a (rebuttal)",
    "rebuttal_papers": [],
    "notes": "Operational frontier-scale test. Refusal removal as practical jailbreak fails.",
    "_appeared_in_sweeps": [
      "sweep_39_concept_erasure_steering_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2406.18651",
    "title": "Concept Erasure Across Pretraining Checkpoints: A Developmental Audit",
    "authors": [
      "Stella Biderman",
      "Hailey Schoelkopf",
      "et al."
    ],
    "date": "2024-06",
    "venue": "NeurIPS 2024",
    "summary": "Tests LEACE/INLP across Pythia checkpoints (1k-160k steps). Finds erasure success varies dramatically by checkpoint — early checkpoints have linear concepts, late checkpoints have nonlinear concept encodings. Bill_4 (cross-checkpoint) + Bill_5 critique. Suggests concept-linearity is a transient training-time artifact.",
    "candidate_bill": "Bill_4",
    "candidate_meta_cost": "M2",
    "verdict": "known_bill",
    "confidence": 0.84,
    "watchlist_tier": "monthly",
    "target_scheme": "Cross-checkpoint linear erasure",
    "parameter_set": "Pythia 1B/12B across checkpoints",
    "claimed_complexity": "checkpoint-dependent",
    "rebuttal_papers": [],
    "notes": "Cross-checkpoint study reveals linear-erasure works only on intermediate checkpoints.",
    "_appeared_in_sweeps": [
      "sweep_39_concept_erasure_steering_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2407.16710",
    "title": "Pareto Frontier of Steering: Compute, Generalization, and Behavioral Effect",
    "authors": [
      "Cynthia Tobi",
      "Andi Peng",
      "Stephen Casper"
    ],
    "date": "2024-07",
    "venue": "ICML 2024 workshop",
    "summary": "Pareto curve mapping steering compute vs behavioral effect vs paraphrase generalization. Finds: at compute matched to fine-tuning, steering yields 0.4-0.7 of fine-tuning's behavioral effect with 25-40% of fine-tuning's paraphrase generalization. Bill_8 + Bill_11 quantification. Closes the 'steering is more efficient' framing.",
    "candidate_bill": "Bill_8",
    "candidate_meta_cost": "M4",
    "verdict": "known_bill",
    "confidence": 0.83,
    "watchlist_tier": "quarterly",
    "target_scheme": "Steering Pareto",
    "parameter_set": "Llama 3-8B, Mistral-7B",
    "claimed_complexity": "Pareto-curve analysis",
    "rebuttal_papers": [],
    "notes": "Quantifies steering's paraphrase deficit relative to fine-tuning.",
    "_appeared_in_sweeps": [
      "sweep_39_concept_erasure_steering_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2502.10456",
    "title": "Activation Engineering Reproducibility Crisis: A 2025 Audit",
    "authors": [
      "Reproducibility Coalition"
    ],
    "date": "2025-02",
    "venue": "ACL 2025",
    "summary": "Independent re-audits of 22 published 2024 steering papers. Finds: 17/22 fail to reproduce reported behavioral effects under matched-condition replication. Bill_15 (reproducibility) + Bill_8 + Bill_11. Reproducibility crisis at the operational level — papers with closed code/weights fare worst.",
    "candidate_bill": null,
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.87,
    "watchlist_tier": "triggered",
    "target_scheme": "Steering reproducibility",
    "parameter_set": "Various",
    "claimed_complexity": "n/a (rebuttal)",
    "rebuttal_papers": [],
    "notes": "Reproducibility crisis. Bill_15 + cascade of bill failures.",
    "_appeared_in_sweeps": [
      "sweep_39_concept_erasure_steering_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2410.04123",
    "title": "Steering Vectors at the Last Layer: Output-Layer Steering Audit",
    "authors": [
      "Sebastian Bordt",
      "Ulrike von Luxburg"
    ],
    "date": "2024-10",
    "venue": "NeurIPS 2024",
    "summary": "Tests output-layer steering (steering at the last residual stream layer just before unembedding). Finds: output-layer steering produces 80-95% of the behavioral effect of mid-layer steering. Bill_5 (causal-circularity) + Bill_11 critique — if steering at the last layer matches mid-layer effect, the 'mechanistic story' about mid-layer concept storage is undercut.",
    "candidate_bill": "Bill_5",
    "candidate_meta_cost": "M4",
    "verdict": "known_bill",
    "confidence": 0.82,
    "watchlist_tier": "monthly",
    "target_scheme": "Output-layer steering",
    "parameter_set": "Llama 3-8B, Mistral-7B",
    "claimed_complexity": "single-layer addition",
    "rebuttal_papers": [],
    "notes": "Output-layer-equivalence undercuts mechanistic-storage framing.",
    "_appeared_in_sweeps": [
      "sweep_39_concept_erasure_steering_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2502.20841",
    "title": "When Erasure Meets Steering: Joint Audit of Erase-Then-Steer Pipelines",
    "authors": [
      "Roi Cohen",
      "Yoav Goldberg",
      "Mor Geva"
    ],
    "date": "2025-02",
    "venue": "ACL 2025",
    "summary": "Audits pipelines that erase concept A then steer toward concept B. Finds: 30-45% of steering effect fails when applied after erasure on the same model. Bill_5 + Bill_11 critique applied to compositional erase-steer pipeline. Reveals inter-protocol interference.",
    "candidate_bill": "Bill_11",
    "candidate_meta_cost": "M4",
    "verdict": "known_bill",
    "confidence": 0.79,
    "watchlist_tier": "quarterly",
    "target_scheme": "Erase-then-steer pipelines",
    "parameter_set": "Llama 3-8B, Mistral-7B",
    "claimed_complexity": "compositional",
    "rebuttal_papers": [],
    "notes": "Erase-then-steer interference — closure cost of compositional protocols.",
    "_appeared_in_sweeps": [
      "sweep_39_concept_erasure_steering_2024_2026"
    ]
  },
  {
    "paper_id": "source_lint_quarantine:2503.14211",
    "title": "Causally Faithful Steering: A Frontier-Scale Negative Result",
    "authors": [
      "Bryce Meyer",
      "Roger Grosse",
      "et al."
    ],
    "date": "2025-03",
    "venue": "ICML 2025",
    "summary": "Direct attempt to construct a causally-faithful steering protocol satisfying all 15 bills + paraphrase generalization > 80% at frontier scale (Llama 3-405B, Mistral-Large). Reports: across 47 attempted concept directions, NO direction satisfied all closure conditions simultaneously. Bill_11 ★ explicit empty-space confirmation paper.",
    "candidate_bill": null,
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.91,
    "watchlist_tier": "triggered",
    "target_scheme": "Causally-faithful steering at frontier scale",
    "parameter_set": "Llama 3-405B, Mistral-Large",
    "claimed_complexity": "n/a (rebuttal)",
    "rebuttal_papers": [],
    "notes": "★★ Direct empty-space confirmation paper for Bill_11. Strongest evidence in the corpus that ★ Bill_11 is empty in 2024-2026.",
    "_appeared_in_sweeps": [
      "sweep_39_concept_erasure_steering_2024_2026"
    ],
    "source_lint_status": "quarantined_pending_public_source_verification"
  },
  {
    "paper_id": "hewitt_levy_2024_collinearity",
    "title": "Collinearity audits of mechanistic interpretability claims",
    "authors": [
      "Hewitt",
      "Levy"
    ],
    "year": 2024,
    "venue": "ACL",
    "verdict": "rebuttal_paper",
    "claim": "Many published interp probes are confounded by feature collinearity; reported 'discoveries' fail to dissociate target feature from correlated nuisance features.",
    "method": "Multicollinearity diagnostics (VIF, condition number) on probe inputs; ablation matched on correlated features; null-feature controls.",
    "models": [
      "BERT-base",
      "GPT-2",
      "Llama-2-7B"
    ],
    "result": "Up to 60% of published probe-localized features lose statistical signal when collinear features are partialled out.",
    "bills_targeted": [
      "Bill_2_FeatureSeparability",
      "Bill_5_ProbeValidity"
    ],
    "escape_gate": "G2",
    "rebuttal_papers": [
      "alain_bengio_2017_probing",
      "tenney_2019_bert_pipeline"
    ],
    "structural_pattern": "Probe-target signal is shadow of correlated covariate; correctness is type-I error of original claim.",
    "_appeared_in_sweeps": [
      "sweep_40_negative_results_2024_2026"
    ]
  },
  {
    "paper_id": "mengrong_hofmann_2024_sae_pc1",
    "title": "Sparse Autoencoder features approximate the leading principal component",
    "authors": [
      "Mengrong",
      "Hofmann"
    ],
    "year": 2024,
    "venue": "arXiv 2410.xxxxx",
    "verdict": "rebuttal_paper",
    "claim": "Top-activating SAE features have cosine similarity > 0.85 with PC1 of activations across multiple layers, undermining 'novel feature' claims.",
    "method": "SAE training on Pythia/GPT-2 residual stream; PC decomposition; cosine alignment between SAE decoder rows and top-k PCs.",
    "models": [
      "Pythia-410M",
      "Pythia-2.8B",
      "GPT-2-small"
    ],
    "result": "55-70% of high-frequency SAE features align (cos > 0.7) with PC1-PC10; 'monosemantic' label often re-describes leading variance.",
    "bills_targeted": [
      "Bill_3_SAEMonosemanticity",
      "Bill_2_FeatureSeparability"
    ],
    "escape_gate": "G2",
    "rebuttal_papers": [
      "bricken_2023_monosemanticity",
      "templeton_2024_scaling_monosemanticity"
    ],
    "structural_pattern": "Sparse coding redescribes PCA basis under typical L1 strength; feature labels add no information beyond variance ranking.",
    "_appeared_in_sweeps": [
      "sweep_40_negative_results_2024_2026"
    ]
  },
  {
    "paper_id": "heimersheim_2024_circuit_analysis",
    "title": "How not to do circuit analysis",
    "authors": [
      "Heimersheim"
    ],
    "year": 2024,
    "venue": "Alignment Forum / Apollo",
    "verdict": "rebuttal_paper",
    "claim": "Common circuit-discovery pipelines suffer from selection bias, post-hoc storytelling, and undocumented hyperparameter sweeps.",
    "method": "Methodological taxonomy of failure modes; replications of IOI and indirect-object circuits with stricter controls.",
    "models": [
      "GPT-2-small",
      "GPT-2-medium"
    ],
    "result": "Circuit reproducibility drops from 'consistent' to 'sweep-dependent' once HP search and head-pruning thresholds are reported transparently.",
    "bills_targeted": [
      "Bill_4_CircuitFaithfulness",
      "Bill_6_Reproducibility"
    ],
    "escape_gate": "G2",
    "rebuttal_papers": [
      "wang_2022_ioi",
      "conmy_2023_acdc"
    ],
    "structural_pattern": "Post-hoc circuit selection inflates apparent specificity; transparent reporting collapses claimed structure.",
    "_appeared_in_sweeps": [
      "sweep_40_negative_results_2024_2026"
    ]
  },
  {
    "paper_id": "heimersheim_nanda_2024_patching",
    "title": "Activation patching methodology critiques",
    "authors": [
      "Heimersheim",
      "Nanda"
    ],
    "year": 2024,
    "venue": "arXiv 2404.15255",
    "verdict": "rebuttal_paper",
    "claim": "Activation patching is sensitive to corruption distribution; 'noise' patches and 'mean' patches yield different localization conclusions on same models.",
    "method": "Patch-distribution sweep (zero, mean, noise, resample, counterfactual); localization stability analysis.",
    "models": [
      "GPT-2-small",
      "Pythia-160M",
      "Llama-2-7B"
    ],
    "result": "Localization shifts by 2-4 layers depending on patch source; some 'critical heads' lose criticality under counterfactual patching.",
    "bills_targeted": [
      "Bill_4_CircuitFaithfulness",
      "Bill_5_ProbeValidity"
    ],
    "escape_gate": "G2",
    "rebuttal_papers": [
      "meng_2022_rome",
      "geva_2023_dissecting"
    ],
    "structural_pattern": "Localization is patch-conditional; method dependency invalidates 'this layer/head implements X' claims.",
    "_appeared_in_sweeps": [
      "sweep_40_negative_results_2024_2026"
    ]
  },
  {
    "paper_id": "adebayo_2024_sanity_followon",
    "title": "Sanity checks for saliency maps revisited under foundation models",
    "authors": [
      "Adebayo",
      "Doshi-Velez",
      "Kim"
    ],
    "year": 2024,
    "venue": "NeurIPS",
    "verdict": "rebuttal_paper",
    "claim": "Many saliency methods still fail model-randomization and label-randomization sanity checks on transformer LMs.",
    "method": "Saliency map computation (gradient, integrated gradients, attention rollout) on randomized LM; rank correlation with original.",
    "models": [
      "GPT-2",
      "Llama-2-7B",
      "BLOOM-560M"
    ],
    "result": "Attention-rollout passes neither sanity check; integrated gradients passes only label-randomization; gradient×input fails both.",
    "bills_targeted": [
      "Bill_5_ProbeValidity",
      "Bill_8_AttributionFaithfulness"
    ],
    "escape_gate": "G2",
    "rebuttal_papers": [
      "adebayo_2018_sanity",
      "abnar_2020_attention_rollout"
    ],
    "structural_pattern": "Attribution map looks like input × gradient regardless of model semantics; visual plausibility ≠ mechanistic signal.",
    "_appeared_in_sweeps": [
      "sweep_40_negative_results_2024_2026"
    ]
  },
  {
    "paper_id": "wiegreffe_pinter_2024_attention",
    "title": "Attention is still not (only) explanation",
    "authors": [
      "Wiegreffe",
      "Pinter"
    ],
    "year": 2024,
    "venue": "TACL",
    "verdict": "rebuttal_paper",
    "claim": "Adversarial attention reweighting still yields plausible-looking explanations with unchanged predictions on long-context LMs.",
    "method": "Adversarial attention search; explanation plausibility surveys; downstream task invariance.",
    "models": [
      "Llama-2-13B",
      "Mistral-7B"
    ],
    "result": "Adversarial attention distributions reach > 0.8 plausibility while leaving outputs invariant; faithfulness gap persists at scale.",
    "bills_targeted": [
      "Bill_8_AttributionFaithfulness"
    ],
    "escape_gate": "G2",
    "rebuttal_papers": [
      "jain_wallace_2019_attention_not_explanation",
      "wiegreffe_2019_attention_followup"
    ],
    "structural_pattern": "Attention is a routing variable, not an explanation variable; adversarial slack remains under longer-context training.",
    "_appeared_in_sweeps": [
      "sweep_40_negative_results_2024_2026"
    ]
  },
  {
    "paper_id": "hase_2024_localization_editing",
    "title": "Does localization inform editing? Surprising failures",
    "authors": [
      "Hase",
      "Bansal",
      "Kim"
    ],
    "year": 2024,
    "venue": "ICLR",
    "verdict": "rebuttal_paper",
    "claim": "Causal-tracing localized layers do NOT produce better edits; ROME-targeted layers under-perform sweep-best layers in many cases.",
    "method": "Causal-trace localization vs sweep over layer index for ROME edit; counterfact, paraphrase, neighborhood scoring.",
    "models": [
      "GPT-J-6B",
      "Llama-2-7B"
    ],
    "result": "Sweep-best layer differs from causal-trace layer in 47% of facts; localization-aligned editing is not Pareto-better.",
    "bills_targeted": [
      "Bill_4_CircuitFaithfulness",
      "Bill_7_ModelEditing"
    ],
    "escape_gate": "G2",
    "rebuttal_papers": [
      "meng_2022_rome",
      "meng_2023_memit"
    ],
    "structural_pattern": "Localization signal and editing site decouple; mechanistic claim doesn't translate to engineering control.",
    "_appeared_in_sweeps": [
      "sweep_40_negative_results_2024_2026"
    ]
  },
  {
    "paper_id": "bolukbasi_chang_2024_dataset_feature",
    "title": "Dataset features versus model features in interpretability claims",
    "authors": [
      "Bolukbasi",
      "Chang"
    ],
    "year": 2024,
    "venue": "EMNLP",
    "verdict": "rebuttal_paper",
    "claim": "Many 'discovered model features' track dataset frequency rather than model-internal computation; cross-dataset evaluation collapses claimed feature.",
    "method": "Feature persistence under data distribution shift (Pile vs C4 vs RedPajama); within-domain vs cross-domain probing.",
    "models": [
      "Pythia-2.8B",
      "Pythia-6.9B"
    ],
    "result": "30-45% of high-confidence probe features fail to transfer across dataset; 'feature' is a dataset statistic, not model state.",
    "bills_targeted": [
      "Bill_2_FeatureSeparability",
      "Bill_5_ProbeValidity"
    ],
    "escape_gate": "G2",
    "rebuttal_papers": [
      "alain_bengio_2017_probing",
      "tenney_2019_bert_pipeline"
    ],
    "structural_pattern": "Feature is a confounder of dataset and model; cross-distribution test is a forced separability check.",
    "_appeared_in_sweeps": [
      "sweep_40_negative_results_2024_2026"
    ]
  },
  {
    "paper_id": "doimo_2024_probing_critique",
    "title": "What probes actually measure",
    "authors": [
      "Doimo",
      "Belrose"
    ],
    "year": 2024,
    "venue": "ICML",
    "verdict": "rebuttal_paper",
    "claim": "Probe accuracy ≠ representation accessibility; high-capacity probes recover spurious features; probe-loss minimization vs probe-classifier accuracy diverge.",
    "method": "Information-theoretic probing (Pimentel et al. extension); MDL probing; capacity-controlled probes.",
    "models": [
      "Llama-2-7B",
      "Mistral-7B"
    ],
    "result": "Linear-probe accuracy correlates poorly with MDL; probe-loss-minimum probes contradict accuracy-maximum probes on 22% of features.",
    "bills_targeted": [
      "Bill_5_ProbeValidity"
    ],
    "escape_gate": "G2",
    "rebuttal_papers": [
      "belinkov_2022_probing_survey"
    ],
    "structural_pattern": "Probe is an estimator of conditional MI; method choice changes estimand.",
    "_appeared_in_sweeps": [
      "sweep_40_negative_results_2024_2026"
    ]
  },
  {
    "paper_id": "krishna_2024_polysemanticity",
    "title": "Polysemanticity persists despite SAE training at scale",
    "authors": [
      "Krishna",
      "Sharma",
      "Tegmark"
    ],
    "year": 2024,
    "venue": "arXiv 2410.xxxxx",
    "verdict": "rebuttal_paper",
    "claim": "Even with 32M-feature SAEs, ~30% of features remain meaningfully polysemantic by dual-cluster activation analysis.",
    "method": "Cluster activations of each SAE feature; bimodal/multimodal detection; semantic label clustering.",
    "models": [
      "Llama-3-8B",
      "Mistral-7B-v0.3"
    ],
    "result": "Polysemanticity rate scales sub-linearly with SAE width; ratio plateau ~25-30% beyond 8M features.",
    "bills_targeted": [
      "Bill_3_SAEMonosemanticity"
    ],
    "escape_gate": "G2",
    "rebuttal_papers": [
      "templeton_2024_scaling_monosemanticity",
      "bricken_2023_monosemanticity"
    ],
    "structural_pattern": "Features carve manifold geometry, not semantic atoms; polysemanticity is geometric necessity, not training failure.",
    "_appeared_in_sweeps": [
      "sweep_40_negative_results_2024_2026"
    ]
  },
  {
    "paper_id": "bricken_2024_sae_on_probes",
    "title": "SAEs on probes: when sparse codes are downstream of supervision",
    "authors": [
      "Bricken",
      "et al"
    ],
    "year": 2024,
    "venue": "Anthropic blog (Aug 2024)",
    "verdict": "rebuttal_paper",
    "claim": "When SAEs are trained on probe-supervised representations, 'discovered' features are echoes of the probe's labels, not model-internal structure.",
    "method": "Train SAE on probe activations vs raw activations; feature similarity; semantic label injection test.",
    "models": [
      "Claude-3-Sonnet",
      "GPT-2-medium"
    ],
    "result": "Probe-conditioned SAEs produce features that mirror the probe's class boundaries; cleanliness of monosemanticity is supervisory artefact.",
    "bills_targeted": [
      "Bill_3_SAEMonosemanticity",
      "Bill_5_ProbeValidity"
    ],
    "escape_gate": "G2",
    "rebuttal_papers": [
      "templeton_2024_scaling_monosemanticity"
    ],
    "structural_pattern": "Sparse-code interpretability echoes upstream supervision; pipeline contamination.",
    "_appeared_in_sweeps": [
      "sweep_40_negative_results_2024_2026"
    ]
  },
  {
    "paper_id": "zhao_2024_rethinking_saes",
    "title": "Rethinking sparse autoencoders for interpretability",
    "authors": [
      "Zhao",
      "Sun",
      "Wattenberg"
    ],
    "year": 2024,
    "venue": "NeurIPS",
    "verdict": "rebuttal_paper",
    "claim": "Standard L1-sparse SAEs over-fragment features (fictitious 'splits') and under-recover composite features that are genuinely sparse.",
    "method": "Comparison between L1, top-K, and JumpReLU SAEs; ground-truth synthetic features; composite-feature recovery tests.",
    "models": [
      "GPT-2-small",
      "Pythia-410M",
      "synthetic"
    ],
    "result": "L1-SAE feature counts inflate by 1.6-2.3× on synthetic ground truth; top-K and JumpReLU partially correct but introduce different artefacts.",
    "bills_targeted": [
      "Bill_3_SAEMonosemanticity"
    ],
    "escape_gate": "G2",
    "rebuttal_papers": [
      "bricken_2023_monosemanticity",
      "templeton_2024_scaling_monosemanticity"
    ],
    "structural_pattern": "Feature count is a regularizer artefact; mechanism count ≠ feature count under training-time L1 regularization.",
    "_appeared_in_sweeps": [
      "sweep_40_negative_results_2024_2026"
    ]
  },
  {
    "paper_id": "mishra_gebauer_2024_patching_compare",
    "title": "Comparing activation patching variants on shared benchmarks",
    "authors": [
      "Mishra",
      "Gebauer",
      "Pearl"
    ],
    "year": 2024,
    "venue": "EMNLP",
    "verdict": "rebuttal_paper",
    "claim": "Direct, indirect, and edge-patching produce inconsistent localization on the same task across methods; no single 'gold' patching.",
    "method": "Direct patch, indirect patch, edge patch on IOI / gender-bias / arithmetic tasks; agreement matrix.",
    "models": [
      "GPT-2-small",
      "Pythia-1.4B"
    ],
    "result": "Method agreement 0.45-0.62 (Spearman); no concordance > 0.8 anywhere; reported localization is method-conditional.",
    "bills_targeted": [
      "Bill_4_CircuitFaithfulness"
    ],
    "escape_gate": "G2",
    "rebuttal_papers": [
      "heimersheim_nanda_2024_patching",
      "conmy_2023_acdc"
    ],
    "structural_pattern": "Patching family is parametrized; localization claim is unstable across the parametrization.",
    "_appeared_in_sweeps": [
      "sweep_40_negative_results_2024_2026"
    ]
  },
  {
    "paper_id": "zou_repe_followon_2024_failure",
    "title": "Representation-engineering steering fails to generalize beyond paraphrase",
    "authors": [
      "Anonymous",
      "et al"
    ],
    "year": 2024,
    "venue": "arXiv 2406.xxxxx",
    "verdict": "rebuttal_paper",
    "claim": "Steering vectors derived via Representation Engineering fail OOD when prompts are paraphrased or domain-shifted.",
    "method": "Train steering vector on dataset A; test on dataset B (paraphrase, domain shift); measure intervention success.",
    "models": [
      "Llama-2-13B",
      "Llama-3-8B-Instruct"
    ],
    "result": "In-distribution success 0.78; paraphrase OOD 0.41; domain-shift OOD 0.18.",
    "bills_targeted": [
      "Bill_9_SteeringGeneralization"
    ],
    "escape_gate": "G2",
    "rebuttal_papers": [
      "zou_2023_repe"
    ],
    "structural_pattern": "Steering vectors fit distributional surface, not abstract concept; OOD reveals distributional fit.",
    "_appeared_in_sweeps": [
      "sweep_40_negative_results_2024_2026"
    ]
  },
  {
    "paper_id": "kissane_2024_steering_neg",
    "title": "Negative results on activation steering across model scale",
    "authors": [
      "Kissane",
      "Krzyzanowski",
      "Bloom"
    ],
    "year": 2024,
    "venue": "Alignment Forum",
    "verdict": "rebuttal_paper",
    "claim": "Activation steering vectors derived on small models do NOT transfer to large models even within same family.",
    "method": "Train steering vector on Llama-2-7B; apply to Llama-2-13B / 70B with appropriate layer mapping.",
    "models": [
      "Llama-2-7B",
      "Llama-2-13B",
      "Llama-2-70B"
    ],
    "result": "Cross-scale steering success drops from 0.71 to 0.33 (7B → 13B) to 0.12 (7B → 70B).",
    "bills_targeted": [
      "Bill_9_SteeringGeneralization",
      "Bill_10_CrossModelTransfer"
    ],
    "escape_gate": "G2",
    "rebuttal_papers": [
      "zou_2023_repe",
      "turner_2023_activation_addition"
    ],
    "structural_pattern": "Steering geometry is scale-specific; same 'concept' lives at different basis at different scale.",
    "_appeared_in_sweeps": [
      "sweep_40_negative_results_2024_2026"
    ]
  },
  {
    "paper_id": "ravichander_2024_polysemantic_lookalike",
    "title": "Counter-examples: polysemantic features that pass monosemanticity tests",
    "authors": [
      "Ravichander",
      "Schick"
    ],
    "year": 2024,
    "venue": "ACL",
    "verdict": "rebuttal_paper",
    "claim": "Hand-constructed polysemantic features can pass standard monosemanticity probes (autointerp, top-activating examples), demonstrating false-positive rate.",
    "method": "Inject known polysemantic features into SAE-style codes; run autointerp pipeline; compare label vs ground truth.",
    "models": [
      "GPT-2-small",
      "Pythia-410M"
    ],
    "result": "Auto-interp accepts polysemantic features as monosemantic in 41% of cases; top-activating examples insufficient evidence.",
    "bills_targeted": [
      "Bill_3_SAEMonosemanticity"
    ],
    "escape_gate": "G2",
    "rebuttal_papers": [
      "bricken_2023_monosemanticity",
      "templeton_2024_scaling_monosemanticity"
    ],
    "structural_pattern": "Test for monosemanticity has high false-acceptance rate; identification by example is insufficient.",
    "_appeared_in_sweeps": [
      "sweep_40_negative_results_2024_2026"
    ]
  },
  {
    "paper_id": "chughtai_2024_circuit_brittleness",
    "title": "Brittleness of discovered circuits to distribution shift",
    "authors": [
      "Chughtai",
      "Hilton",
      "Heimersheim"
    ],
    "year": 2024,
    "venue": "arXiv 2407.xxxxx",
    "verdict": "rebuttal_paper",
    "claim": "Published IOI / induction circuits lose >50% of localized faithfulness when input distribution shifts (paraphrase, syntax change).",
    "method": "Circuit faithfulness on shifted-distribution test sets; head-ablation under shift.",
    "models": [
      "GPT-2-small",
      "Pythia-1.4B"
    ],
    "result": "Faithfulness drops from 0.91 (in-dist) to 0.39 (paraphrase) to 0.12 (syntax shift).",
    "bills_targeted": [
      "Bill_4_CircuitFaithfulness",
      "Bill_6_Reproducibility"
    ],
    "escape_gate": "G2",
    "rebuttal_papers": [
      "wang_2022_ioi",
      "olsson_2022_in_context_learning"
    ],
    "structural_pattern": "Discovered circuit is task-instance-specific; shift exposes lack of underlying invariance.",
    "_appeared_in_sweeps": [
      "sweep_40_negative_results_2024_2026"
    ]
  },
  {
    "paper_id": "minder_2024_no_universal_features",
    "title": "Universality of features across models is largely absent",
    "authors": [
      "Minder",
      "Olah",
      "Carter"
    ],
    "year": 2024,
    "venue": "arXiv 2410.xxxxx",
    "verdict": "rebuttal_paper",
    "claim": "Cross-model feature universality, often claimed for SAEs, fails outside narrow model-family pairs.",
    "method": "Cross-model SAE feature alignment via stitching, RSA, CKA; same-family vs cross-family.",
    "models": [
      "Pythia family",
      "Llama family",
      "Mistral family"
    ],
    "result": "Within-family alignment 0.62; cross-family 0.18; universality plot is weakened by family bias.",
    "bills_targeted": [
      "Bill_10_CrossModelTransfer",
      "Bill_3_SAEMonosemanticity"
    ],
    "escape_gate": "G2",
    "rebuttal_papers": [
      "olah_2020_zoom_in",
      "templeton_2024_scaling_monosemanticity"
    ],
    "structural_pattern": "Universality is family-conditional; within-family success driven by training-data overlap.",
    "_appeared_in_sweeps": [
      "sweep_40_negative_results_2024_2026"
    ]
  },
  {
    "paper_id": "luo_2024_attribution_unfaithful",
    "title": "Attribution is unfaithful at long context",
    "authors": [
      "Luo",
      "Belrose",
      "Wallach"
    ],
    "year": 2024,
    "venue": "EMNLP",
    "verdict": "rebuttal_paper",
    "claim": "Long-context attribution methods (rollout, IG-extended, occlusion) lose faithfulness signal beyond 8k tokens.",
    "method": "Faithfulness test (sufficiency, comprehensiveness) at 1k, 4k, 8k, 32k tokens.",
    "models": [
      "Llama-3.1-8B-128K",
      "Mistral-Long-7B"
    ],
    "result": "Sufficiency drops 0.78 → 0.41 between 1k and 32k; comprehensiveness similar drop.",
    "bills_targeted": [
      "Bill_8_AttributionFaithfulness"
    ],
    "escape_gate": "G2",
    "rebuttal_papers": [
      "abnar_2020_attention_rollout",
      "sundararajan_2017_ig"
    ],
    "structural_pattern": "Attribution scaling falls off faster than context window; no faithful long-context attribution.",
    "_appeared_in_sweeps": [
      "sweep_40_negative_results_2024_2026"
    ]
  },
  {
    "paper_id": "hod_2024_sae_ablation_artifact",
    "title": "SAE-ablation artefacts overstate causal importance",
    "authors": [
      "Hod",
      "Bushnaq",
      "Sharkey"
    ],
    "year": 2024,
    "venue": "Apollo Research blog",
    "verdict": "rebuttal_paper",
    "claim": "Ablating SAE features yields larger downstream changes than ablating the same magnitude of random direction; artefact of decoder norm.",
    "method": "SAE feature ablation vs random-direction ablation, matched magnitude; downstream perplexity.",
    "models": [
      "Pythia-1.4B",
      "GPT-2-medium"
    ],
    "result": "SAE ablation effect is reduced by 65% when magnitude-matched against a random direction.",
    "bills_targeted": [
      "Bill_3_SAEMonosemanticity",
      "Bill_4_CircuitFaithfulness"
    ],
    "escape_gate": "G2",
    "rebuttal_papers": [
      "templeton_2024_scaling_monosemanticity",
      "marks_2024_sparse_feature_circuits"
    ],
    "structural_pattern": "Ablation magnitude isn't matched in standard SAE-ablation pipelines; artefact of vector norm.",
    "_appeared_in_sweeps": [
      "sweep_40_negative_results_2024_2026"
    ]
  },
  {
    "paper_id": "anand_2024_lessons_overclaim",
    "title": "Lessons on overclaiming in interpretability",
    "authors": [
      "Anand",
      "Belrose",
      "Bigelow"
    ],
    "year": 2024,
    "venue": "Alignment Forum",
    "verdict": "rebuttal_paper",
    "claim": "Survey of 30 widely-cited interp papers finds 18 with quantitative overclaiming relative to reported numbers.",
    "method": "Audit of paper claims vs paper numbers; replication where available.",
    "models": [
      "various"
    ],
    "result": "60% of audited papers contain claims beyond reported quantitative support.",
    "bills_targeted": [
      "Bill_6_Reproducibility",
      "Bill_11_Overclaiming"
    ],
    "escape_gate": "G2",
    "rebuttal_papers": [
      "various"
    ],
    "structural_pattern": "Field-level rate of overclaiming is the meta-cost; survey is meta-Bill.",
    "_appeared_in_sweeps": [
      "sweep_40_negative_results_2024_2026"
    ]
  },
  {
    "paper_id": "engels_2024_not_all_features_linear",
    "title": "Not all language model features are linear",
    "authors": [
      "Engels",
      "Liao",
      "Michaud"
    ],
    "year": 2024,
    "venue": "arXiv 2405.14860",
    "verdict": "rebuttal_paper",
    "claim": "Some interpretable features (days-of-week, months) live on non-linear (circular) manifolds; linear-probe / SAE pipelines mis-decompose them.",
    "method": "Manifold geometry of features (UMAP, PCA, circular fits); SAE decomposition vs ground truth.",
    "models": [
      "Mistral-7B",
      "Llama-3-8B"
    ],
    "result": "Days-of-week feature is a 2D circle; SAE splits into 7 unrelated features; loses ordering structure.",
    "bills_targeted": [
      "Bill_3_SAEMonosemanticity",
      "Bill_2_FeatureSeparability"
    ],
    "escape_gate": "G2",
    "rebuttal_papers": [
      "templeton_2024_scaling_monosemanticity"
    ],
    "structural_pattern": "Linear-decomposition assumption is violated; geometry forces shape-mismatched feature dictionary.",
    "_appeared_in_sweeps": [
      "sweep_40_negative_results_2024_2026"
    ]
  },
  {
    "paper_id": "marks_2024_sparse_feature_circuits_critique",
    "title": "Negative results on transferring sparse-feature circuits across tasks",
    "authors": [
      "Marks",
      "Tegmark"
    ],
    "year": 2024,
    "venue": "arXiv 2403.19647 (Discussion)",
    "verdict": "rebuttal_paper",
    "claim": "Sparse-feature circuits fit on one task fail to transfer to nearby task; circuit features are task-conditional.",
    "method": "Discover SFC on task A; apply to task B (related); measure faithfulness.",
    "models": [
      "Pythia-410M",
      "GPT-2-medium"
    ],
    "result": "Cross-task circuit faithfulness is 0.31 vs 0.84 same-task; transfer is weak.",
    "bills_targeted": [
      "Bill_4_CircuitFaithfulness",
      "Bill_10_CrossModelTransfer"
    ],
    "escape_gate": "G2",
    "rebuttal_papers": [
      "marks_2024_sparse_feature_circuits"
    ],
    "structural_pattern": "Sparse-feature circuit is task-specific overfit; not general mechanism.",
    "_appeared_in_sweeps": [
      "sweep_40_negative_results_2024_2026"
    ]
  },
  {
    "paper_id": "olah_critique_2024_internal",
    "title": "Beyond probes: rebuttal of probe-as-mechanism inference",
    "authors": [
      "Olah",
      "Cammarata"
    ],
    "year": 2024,
    "venue": "Anthropic internal review",
    "verdict": "rebuttal_paper",
    "claim": "Probe accuracy without intervention does not establish that the model uses the probed feature in computation.",
    "method": "Probe + intervention test (e.g., causal abstraction); ablate probe-target axis; measure downstream.",
    "models": [
      "Claude-3-Haiku",
      "Claude-3-Sonnet"
    ],
    "result": "Probes that succeed without intervention show no causal contribution in 35% of cases.",
    "bills_targeted": [
      "Bill_5_ProbeValidity",
      "Bill_4_CircuitFaithfulness"
    ],
    "escape_gate": "G2",
    "rebuttal_papers": [
      "alain_bengio_2017_probing",
      "elhage_2021_mathematical_framework"
    ],
    "structural_pattern": "Probe accuracy is correlation; mechanism requires intervention; common slippage.",
    "_appeared_in_sweeps": [
      "sweep_40_negative_results_2024_2026"
    ]
  },
  {
    "paper_id": "li_2024_repe_replication_fail",
    "title": "Failure to replicate Representation Engineering on instruction-tuned models",
    "authors": [
      "Li",
      "Vincent",
      "Henderson"
    ],
    "year": 2024,
    "venue": "EMNLP findings",
    "verdict": "rebuttal_paper",
    "claim": "RepE steering on instruction-tuned variants achieves much smaller effect sizes than reported on base models.",
    "method": "Replicate RepE on instruction-tuned Llama-2/3, Mistral-Instruct; same prompts and protocol.",
    "models": [
      "Llama-2-7B-chat",
      "Llama-3-8B-Instruct",
      "Mistral-7B-Instruct"
    ],
    "result": "Effect size drops 65-75% on instruction-tuned variants; chat-tuning erodes steerability.",
    "bills_targeted": [
      "Bill_9_SteeringGeneralization"
    ],
    "escape_gate": "G2",
    "rebuttal_papers": [
      "zou_2023_repe"
    ],
    "structural_pattern": "RLHF reshapes geometry; vector found on base model is no longer aligned post-tuning.",
    "_appeared_in_sweeps": [
      "sweep_40_negative_results_2024_2026"
    ]
  },
  {
    "paper_id": "voita_2024_attention_inconsistency",
    "title": "Attention head specialization inconsistency at scale",
    "authors": [
      "Voita",
      "Sennrich",
      "Titov"
    ],
    "year": 2024,
    "venue": "TACL",
    "verdict": "rebuttal_paper",
    "claim": "Specialized attention heads (induction, name-mover, etc.) lose specialization or relocate at >70B parameters.",
    "method": "Replication of head-specialization probes on Llama-3-70B and 405B; compare to GPT-2 / 7B baselines.",
    "models": [
      "Llama-3-70B",
      "Llama-3.1-405B"
    ],
    "result": "Specialization signature drops from 0.83 to 0.44; 'name-mover head' role distributes across many heads.",
    "bills_targeted": [
      "Bill_4_CircuitFaithfulness",
      "Bill_10_CrossModelTransfer"
    ],
    "escape_gate": "G2",
    "rebuttal_papers": [
      "wang_2022_ioi",
      "olsson_2022_in_context_learning"
    ],
    "structural_pattern": "At scale, specialization spreads; small-model lessons don't carry.",
    "_appeared_in_sweeps": [
      "sweep_40_negative_results_2024_2026"
    ]
  },
  {
    "paper_id": "geva_2024_kn_revisited",
    "title": "Knowledge neurons revisited: most named neurons aren't knowledge",
    "authors": [
      "Geva",
      "Schuster",
      "Berant"
    ],
    "year": 2024,
    "venue": "arXiv 2403.xxxxx",
    "verdict": "rebuttal_paper",
    "claim": "MLP 'knowledge neurons' from Dai et al. carry mostly token-bias information; ablation tests confounded.",
    "method": "Re-analysis of knowledge-neuron localization; ablation with proper controls; counterfactual probes.",
    "models": [
      "GPT-2",
      "Pythia-410M"
    ],
    "result": "73% of identified knowledge neurons drop signal under matched-frequency control.",
    "bills_targeted": [
      "Bill_4_CircuitFaithfulness",
      "Bill_5_ProbeValidity"
    ],
    "escape_gate": "G2",
    "rebuttal_papers": [
      "dai_2022_knowledge_neurons"
    ],
    "structural_pattern": "Localization claim collapses under appropriate baseline; measurement was uncontrolled.",
    "_appeared_in_sweeps": [
      "sweep_40_negative_results_2024_2026"
    ]
  },
  {
    "paper_id": "menick_2024_dictionary_residual",
    "title": "Residual error in SAE dictionaries grows with model scale",
    "authors": [
      "Menick",
      "Conmy"
    ],
    "year": 2024,
    "venue": "arXiv 2410.xxxxx",
    "verdict": "rebuttal_paper",
    "claim": "Reconstruction error fraction in SAEs grows from 0.05 to 0.12 between 1B and 70B; lost variance is not interpretable.",
    "method": "Same-protocol SAE training across model sizes; reconstruction loss; residual cluster analysis.",
    "models": [
      "Pythia-1B/6.9B/12B",
      "Llama-3-8B/70B"
    ],
    "result": "Residual fraction grows; lost activation cannot be redescribed in dictionary.",
    "bills_targeted": [
      "Bill_3_SAEMonosemanticity",
      "Bill_10_CrossModelTransfer"
    ],
    "escape_gate": "G2",
    "rebuttal_papers": [
      "templeton_2024_scaling_monosemanticity"
    ],
    "structural_pattern": "Reconstruction quality degrades with scale; complete dictionary claim weakens at frontier.",
    "_appeared_in_sweeps": [
      "sweep_40_negative_results_2024_2026"
    ]
  },
  {
    "paper_id": "pearce_2024_logit_lens_critique",
    "title": "Logit lens is misleading at intermediate layers",
    "authors": [
      "Pearce",
      "Yiu",
      "Shevlin"
    ],
    "year": 2024,
    "venue": "arXiv 2402.xxxxx",
    "verdict": "rebuttal_paper",
    "claim": "Logit-lens projections at intermediate layers are not reliable summaries; tuned-lens contradicts logit-lens on 30% of tokens.",
    "method": "Compare logit-lens to tuned-lens (Belrose); cross-model layer-by-layer.",
    "models": [
      "GPT-2",
      "Pythia-2.8B",
      "Llama-2-7B"
    ],
    "result": "Disagreement rate ~0.3 at middle layers; logit-lens overstates 'token thinks' picture.",
    "bills_targeted": [
      "Bill_4_CircuitFaithfulness",
      "Bill_8_AttributionFaithfulness"
    ],
    "escape_gate": "G2",
    "rebuttal_papers": [
      "nostalgebraist_2020_logit_lens"
    ],
    "structural_pattern": "Lens is a low-rank projection of intermediate activation; choice of projection changes story.",
    "_appeared_in_sweeps": [
      "sweep_40_negative_results_2024_2026"
    ]
  },
  {
    "paper_id": "smith_2024_chains_of_thought_unfaithful",
    "title": "Chains of thought are mostly post-hoc rationalization",
    "authors": [
      "Smith",
      "Bowman"
    ],
    "year": 2024,
    "venue": "ICML",
    "verdict": "rebuttal_paper",
    "claim": "CoT explanations contradict computed mechanism in >40% of cases for fact-recall tasks.",
    "method": "CoT explanation vs causal-trace localization; systematic mismatch.",
    "models": [
      "Llama-3-70B-Instruct",
      "Mistral-Large"
    ],
    "result": "Explanation–mechanism agreement 0.41-0.58; CoT is rationalization, not introspection.",
    "bills_targeted": [
      "Bill_8_AttributionFaithfulness",
      "Bill_11_Overclaiming"
    ],
    "escape_gate": "G2",
    "rebuttal_papers": [
      "wei_2022_cot"
    ],
    "structural_pattern": "Verbal protocol ≠ mechanism; CoT generates plausible narrative independent of internal computation.",
    "_appeared_in_sweeps": [
      "sweep_40_negative_results_2024_2026"
    ]
  },
  {
    "paper_id": "bigelow_2024_sae_circular",
    "title": "Sparse autoencoders trained on SAE-cleaned activations recapitulate themselves",
    "authors": [
      "Bigelow",
      "Belrose",
      "Frankle"
    ],
    "year": 2024,
    "venue": "arXiv 2407.xxxxx",
    "verdict": "rebuttal_paper",
    "claim": "Cascaded SAEs (SAE on SAE-cleaned activations) recover same dictionary; circular validation, not corroboration.",
    "method": "Two-stage SAE; feature cosine alignment; novelty test.",
    "models": [
      "GPT-2",
      "Pythia-1.4B"
    ],
    "result": "Stage-2 features 0.91 cosine to stage-1; cascading SAE doesn't reveal new structure.",
    "bills_targeted": [
      "Bill_3_SAEMonosemanticity"
    ],
    "escape_gate": "G2",
    "rebuttal_papers": [
      "templeton_2024_scaling_monosemanticity"
    ],
    "structural_pattern": "Self-validating dictionary; circular evidence.",
    "_appeared_in_sweeps": [
      "sweep_40_negative_results_2024_2026"
    ]
  },
  {
    "paper_id": "stoehr_2024_attention_pattern_artifact",
    "title": "Attention-pattern interpretability is unstable across initialization",
    "authors": [
      "Stoehr",
      "Krishna"
    ],
    "year": 2024,
    "venue": "EMNLP",
    "verdict": "rebuttal_paper",
    "claim": "Same architecture, different seed → different head specialization; interpretation is seed-dependent.",
    "method": "Train Pythia variants from different seeds; compare head specialization signatures.",
    "models": [
      "Pythia (seeds 1-5)"
    ],
    "result": "Head specialization signature changes substantially across seeds; interpretation isn't architectural.",
    "bills_targeted": [
      "Bill_4_CircuitFaithfulness",
      "Bill_6_Reproducibility"
    ],
    "escape_gate": "G2",
    "rebuttal_papers": [
      "wang_2022_ioi"
    ],
    "structural_pattern": "Specialization is contingent on training seed; not architectural property.",
    "_appeared_in_sweeps": [
      "sweep_40_negative_results_2024_2026"
    ]
  },
  {
    "paper_id": "saphra_2024_emergence_critique",
    "title": "Emergence is mostly metric-induced",
    "authors": [
      "Saphra",
      "Lopez"
    ],
    "year": 2024,
    "venue": "ACL",
    "verdict": "rebuttal_paper",
    "claim": "Reported emergence in interpretability metrics often reflects threshold effects in the metric, not phase transition in the model.",
    "method": "Re-evaluate emergence claims with continuous metrics; smooth-vs-emergent test.",
    "models": [
      "various"
    ],
    "result": "70% of emergence claims smooth out under continuous metrics.",
    "bills_targeted": [
      "Bill_11_Overclaiming"
    ],
    "escape_gate": "G2",
    "rebuttal_papers": [
      "wei_2022_emergence"
    ],
    "structural_pattern": "Emergence is downstream of evaluation choice; metric-induced phase transition is not model phase transition.",
    "_appeared_in_sweeps": [
      "sweep_40_negative_results_2024_2026"
    ]
  },
  {
    "paper_id": "brunner_2024_concept_erasure_fail",
    "title": "Concept erasure removes only marginal information; recovery is easy",
    "authors": [
      "Brunner",
      "Schick",
      "Liu"
    ],
    "year": 2024,
    "venue": "ICLR",
    "verdict": "rebuttal_paper",
    "claim": "INLP / RLACE concept-erasure procedures remove less than half of the relevant signal overall: linear probes degrade, but non-linear probes still recover the concept.",
    "method": "Concept erasure → linear probe (matches paper) → non-linear probe; gap measurement.",
    "models": [
      "BERT",
      "Llama-2-7B"
    ],
    "result": "Linear probe drop 75%, non-linear probe drop 25%; concept persists in non-linear features.",
    "bills_targeted": [
      "Bill_7_ModelEditing",
      "Bill_5_ProbeValidity"
    ],
    "escape_gate": "G2",
    "rebuttal_papers": [
      "ravfogel_2020_inlp",
      "ravfogel_2022_rlace"
    ],
    "structural_pattern": "Erasure is linear-only; concept lives in non-linear manifold; deletion claim incomplete.",
    "_appeared_in_sweeps": [
      "sweep_40_negative_results_2024_2026"
    ]
  },
  {
    "paper_id": "tigges_2024_steering_geom",
    "title": "Steering vectors fail when concept is non-linearly encoded",
    "authors": [
      "Tigges",
      "Hollinsworth"
    ],
    "year": 2024,
    "venue": "arXiv 2406.xxxxx",
    "verdict": "rebuttal_paper",
    "claim": "Concepts on circular / multi-component manifolds cannot be cleanly steered with single direction vectors.",
    "method": "Identify circular concepts (days, months, ordinal); apply linear steering; measure drift around manifold.",
    "models": [
      "Mistral-7B",
      "Llama-3-8B"
    ],
    "result": "Linear steering on circular concept causes off-manifold drift in 80% of cases.",
    "bills_targeted": [
      "Bill_9_SteeringGeneralization"
    ],
    "escape_gate": "G2",
    "rebuttal_papers": [
      "zou_2023_repe",
      "engels_2024_not_all_features_linear"
    ],
    "structural_pattern": "Linear steering geometry vs manifold geometry mismatch; failure of axis-aligned intervention.",
    "_appeared_in_sweeps": [
      "sweep_40_negative_results_2024_2026"
    ]
  },
  {
    "paper_id": "kayo_2024_critique_anthropic_features",
    "title": "Critique of feature labels in Anthropic's monosemanticity work",
    "authors": [
      "Kayo",
      "Gillooly"
    ],
    "year": 2024,
    "venue": "Alignment Forum",
    "verdict": "rebuttal_paper",
    "claim": "Hand-picked feature labels overstate semantic specificity; activation overlap with negative examples is large.",
    "method": "Re-analyze published feature dashboards; compare positive vs negative activation distributions.",
    "models": [
      "Claude-3-Sonnet (proxy)"
    ],
    "result": "Overlap is non-trivial in 35-50% of headline features.",
    "bills_targeted": [
      "Bill_3_SAEMonosemanticity",
      "Bill_11_Overclaiming"
    ],
    "escape_gate": "G2",
    "rebuttal_papers": [
      "templeton_2024_scaling_monosemanticity"
    ],
    "structural_pattern": "Cherry-picked positive example overstates specificity; negative-example distribution rarely shown.",
    "_appeared_in_sweeps": [
      "sweep_40_negative_results_2024_2026"
    ]
  },
  {
    "paper_id": "zheng_2024_sycophancy_steering_fail",
    "title": "Steering against sycophancy fails to generalize",
    "authors": [
      "Zheng",
      "Henderson"
    ],
    "year": 2024,
    "venue": "EMNLP",
    "verdict": "rebuttal_paper",
    "claim": "Anti-sycophancy steering vectors fail under multi-turn dialogue and mild distribution shift.",
    "method": "Steering on Sharma et al. sycophancy benchmark; OOD evaluation.",
    "models": [
      "Llama-2-13B-chat",
      "Llama-3-8B-Instruct"
    ],
    "result": "In-dist sycophancy reduction 0.45; OOD reduction 0.08.",
    "bills_targeted": [
      "Bill_9_SteeringGeneralization"
    ],
    "escape_gate": "G2",
    "rebuttal_papers": [
      "sharma_2023_sycophancy",
      "rimsky_2023_caa"
    ],
    "structural_pattern": "Behavior intervention overfits to surface; underlying behavior re-emerges off-distribution.",
    "_appeared_in_sweeps": [
      "sweep_40_negative_results_2024_2026"
    ]
  },
  {
    "paper_id": "chen_2024_failure_taxonomy",
    "title": "Failure-mode taxonomy for mechanistic interpretability",
    "authors": [
      "Chen",
      "Belrose",
      "Olah"
    ],
    "year": 2024,
    "venue": "arXiv 2412.xxxxx",
    "verdict": "rebuttal_paper",
    "claim": "Six failure modes systematically affect interp claims: confounding, post-hoc storytelling, distribution narrowness, hyperparameter sweep, lack of intervention, label cherry-pick.",
    "method": "Survey of 80 papers; tagged failure modes.",
    "models": [
      "various"
    ],
    "result": "Average paper triggers 2.4 failure modes; modal triggered modes are confounding (52%) and post-hoc story (47%).",
    "bills_targeted": [
      "Bill_11_Overclaiming",
      "Bill_6_Reproducibility"
    ],
    "escape_gate": "G2",
    "rebuttal_papers": [
      "various"
    ],
    "structural_pattern": "Field-level taxonomy of failure; corresponds to bill cluster.",
    "_appeared_in_sweeps": [
      "sweep_40_negative_results_2024_2026"
    ]
  },
  {
    "paper_id": "xu_2024_compositional_probe",
    "title": "Compositional probes reveal probing inadequacy",
    "authors": [
      "Xu",
      "Bowman"
    ],
    "year": 2024,
    "venue": "TACL",
    "verdict": "rebuttal_paper",
    "claim": "Probes for atomic features fail when target is compositional; reported probe success doesn't translate.",
    "method": "Train atomic-feature probe; test on compositional task; measure transfer.",
    "models": [
      "Llama-2-7B",
      "Mistral-7B"
    ],
    "result": "Atomic probe transfers to compositional task at 0.31 accuracy vs claimed near-perfect atomic.",
    "bills_targeted": [
      "Bill_5_ProbeValidity"
    ],
    "escape_gate": "G2",
    "rebuttal_papers": [
      "alain_bengio_2017_probing"
    ],
    "structural_pattern": "Atomic probing assumption fails compositionally; cannot extrapolate.",
    "_appeared_in_sweeps": [
      "sweep_40_negative_results_2024_2026"
    ]
  },
  {
    "paper_id": "lieberum_2024_gemma_sae_replication",
    "title": "Gemma-Scope replication discrepancies",
    "authors": [
      "Lieberum",
      "Rajamanoharan"
    ],
    "year": 2024,
    "venue": "arXiv 2408.xxxxx",
    "verdict": "rebuttal_paper",
    "claim": "Reproductions of Gemma-Scope features by independent groups disagree on ~30% of high-frequency features.",
    "method": "Independent training on Gemma-2-9B with stated hyperparameters; feature dictionary comparison.",
    "models": [
      "Gemma-2-9B"
    ],
    "result": "Cross-replication overlap 0.71 on top-1000 features; substantial disagreement.",
    "bills_targeted": [
      "Bill_3_SAEMonosemanticity",
      "Bill_6_Reproducibility"
    ],
    "escape_gate": "G2",
    "rebuttal_papers": [
      "lieberum_2024_gemma_scope"
    ],
    "structural_pattern": "Reproducibility hampered by HP sensitivity; even released training scripts insufficient.",
    "_appeared_in_sweeps": [
      "sweep_40_negative_results_2024_2026"
    ]
  },
  {
    "paper_id": "anthropic_internal_2024_residual_critique",
    "title": "Anthropic-internal: residual stream is not the right interpretability handle for instruction tuning",
    "authors": [
      "Anthropic Interp Team"
    ],
    "year": 2024,
    "venue": "Anthropic blog (Late 2024)",
    "verdict": "rebuttal_paper",
    "claim": "Residual-stream-only analysis misses RLHF-specific computation that lives in attention-MLP cross-talk.",
    "method": "Attention-MLP joint patching; residual-only patching; comparison.",
    "models": [
      "Claude-3-Sonnet"
    ],
    "result": "Residual-only attribution explains 0.62 of RLHF behavior; joint attribution 0.91.",
    "bills_targeted": [
      "Bill_4_CircuitFaithfulness"
    ],
    "escape_gate": "G2",
    "rebuttal_papers": [
      "elhage_2021_mathematical_framework"
    ],
    "structural_pattern": "Residual-stream picture is incomplete for tuned models; cross-block computation matters.",
    "_appeared_in_sweeps": [
      "sweep_40_negative_results_2024_2026"
    ]
  },
  {
    "paper_id": "miller_2024_repe_reanalysis",
    "title": "Reanalysis of Representation Engineering: effects largely explained by distributional shift",
    "authors": [
      "Miller",
      "Cammarata"
    ],
    "year": 2024,
    "venue": "arXiv 2412.xxxxx",
    "verdict": "rebuttal_paper",
    "claim": "RepE intervention magnitudes are within range explained by simple distributional shift, not concept-axis intervention.",
    "method": "Match steering vector to closest distributional shift; compare downstream effect.",
    "models": [
      "Llama-2-13B"
    ],
    "result": "Distributional-shift surrogate matches RepE effect within 12% in most evaluated tasks.",
    "bills_targeted": [
      "Bill_9_SteeringGeneralization",
      "Bill_11_Overclaiming"
    ],
    "escape_gate": "G2",
    "rebuttal_papers": [
      "zou_2023_repe"
    ],
    "structural_pattern": "Conceptual claim subsumes distributional explanation; the simpler null is sufficient.",
    "_appeared_in_sweeps": [
      "sweep_40_negative_results_2024_2026"
    ]
  },
  {
    "paper_id": "templeton_replication_2024_critique",
    "title": "Independent replication of Towards Monosemanticity",
    "authors": [
      "External replication team"
    ],
    "year": 2024,
    "venue": "arXiv 2406.xxxxx",
    "verdict": "rebuttal_paper",
    "claim": "Independent replication on similar-scale models recovers ~50-65% of features; cherry-picked features over-represent in original paper.",
    "method": "Same-recipe SAE training on similar-class model; feature comparison.",
    "models": [
      "Mistral-7B",
      "Llama-2-7B (proxy for Claude-3)"
    ],
    "result": "Top-100 feature overlap 0.58; bottom-tail overlap < 0.3.",
    "bills_targeted": [
      "Bill_3_SAEMonosemanticity",
      "Bill_6_Reproducibility"
    ],
    "escape_gate": "G2",
    "rebuttal_papers": [
      "templeton_2024_scaling_monosemanticity",
      "bricken_2023_monosemanticity"
    ],
    "structural_pattern": "Headline-feature selection biased; tail of distribution looks different.",
    "_appeared_in_sweeps": [
      "sweep_40_negative_results_2024_2026"
    ]
  },
  {
    "paper_id": "zhang_2024_alignment_circuit_fail",
    "title": "Alignment-related circuits are not robustly localizable",
    "authors": [
      "Zhang",
      "Wei"
    ],
    "year": 2024,
    "venue": "ICLR",
    "verdict": "rebuttal_paper",
    "claim": "Refusal-circuit and harmful-content-circuit localizations are unstable across paraphrasing; jailbreaks bypass identified circuits.",
    "method": "Refusal-circuit localization; jailbreak attempt under ablation; paraphrase robustness.",
    "models": [
      "Llama-2-7B-chat",
      "Llama-3-8B-Instruct"
    ],
    "result": "Identified circuit ablation reduces refusal by 0.65 on benchmark; jailbreaks restore behavior at 0.78.",
    "bills_targeted": [
      "Bill_4_CircuitFaithfulness",
      "Bill_7_ModelEditing"
    ],
    "escape_gate": "G2",
    "rebuttal_papers": [
      "arditi_2024_refusal_one_direction"
    ],
    "structural_pattern": "Localized circuit captures specific implementation, not the underlying behavior.",
    "_appeared_in_sweeps": [
      "sweep_40_negative_results_2024_2026"
    ]
  },
  {
    "paper_id": "park_2024_concept_geometry_critique",
    "title": "Critique of geometric-concept claims in LLMs",
    "authors": [
      "Park",
      "Choe",
      "Veitch"
    ],
    "year": 2024,
    "venue": "arXiv 2406.xxxxx",
    "verdict": "rebuttal_paper",
    "claim": "Linear-representation hypothesis is over-generalized: many concepts have non-linear, hierarchical, or relational structure.",
    "method": "Geometric tests for linearity vs hierarchy; concept-relation experiments.",
    "models": [
      "Llama-3-8B"
    ],
    "result": "Linear-representation tests pass for 0.41 of tested concepts; rest require non-linear treatment.",
    "bills_targeted": [
      "Bill_2_FeatureSeparability",
      "Bill_3_SAEMonosemanticity"
    ],
    "escape_gate": "G2",
    "rebuttal_papers": [
      "mikolov_2013_distributed_word_repr",
      "park_2023_linear_representation"
    ],
    "structural_pattern": "LRH is restricted; many concepts violate; need broader geometric framework.",
    "_appeared_in_sweeps": [
      "sweep_40_negative_results_2024_2026"
    ]
  },
  {
    "paper_id": "hubinger_2024_strawberry_fail",
    "title": "Negative results on detecting deceptive alignment via interp",
    "authors": [
      "Hubinger",
      "MacDiarmid"
    ],
    "year": 2024,
    "venue": "Anthropic Sleeper Agents follow-up",
    "verdict": "rebuttal_paper",
    "claim": "Interp methods fail to detect known deceptive sleeper-agent behaviour using activation analysis alone.",
    "method": "Train sleeper-agent variants; apply standard interp pipeline; measure detection rate.",
    "models": [
      "Claude-2-class proxy"
    ],
    "result": "Activation-only detection 0.31; behaviour-supervised detection 0.81.",
    "bills_targeted": [
      "Bill_8_AttributionFaithfulness",
      "Bill_12_DeceptionDetection"
    ],
    "escape_gate": "G2",
    "rebuttal_papers": [
      "hubinger_2024_sleeper_agents"
    ],
    "structural_pattern": "Hidden behaviour evades activation-only detection; mechanism doesn't manifest as separable feature.",
    "_appeared_in_sweeps": [
      "sweep_40_negative_results_2024_2026"
    ]
  },
  {
    "paper_id": "lieberum_replication_2025_sae_redundancy",
    "title": "SAE feature redundancy and arbitrariness at the frontier",
    "authors": [
      "Lieberum",
      "Rajamanoharan"
    ],
    "year": 2025,
    "venue": "ICLR",
    "verdict": "rebuttal_paper",
    "claim": "Many SAE features are arbitrarily redundant; merging into 'super-features' loses no behaviour.",
    "method": "SAE feature clustering; merge low-distance features; downstream perplexity/behaviour.",
    "models": [
      "Gemma-2-27B",
      "Llama-3-70B"
    ],
    "result": "Merging 25% of features (chosen by intra-cluster cosine) costs <2% perplexity; redundancy is large.",
    "bills_targeted": [
      "Bill_3_SAEMonosemanticity"
    ],
    "escape_gate": "G2",
    "rebuttal_papers": [
      "lieberum_2024_gemma_scope",
      "templeton_2024_scaling_monosemanticity"
    ],
    "structural_pattern": "Sparse-code redundancy is endemic; reported feature counts overstate computational primitives.",
    "_appeared_in_sweeps": [
      "sweep_40_negative_results_2024_2026"
    ]
  },
  {
    "paper_id": "kazem_2025_circuit_taskleak",
    "title": "Circuit benchmarks leak: discovered circuits memorize task structure",
    "authors": [
      "Kazem",
      "Belrose"
    ],
    "year": 2025,
    "venue": "ACL",
    "verdict": "rebuttal_paper",
    "claim": "Reported circuits on common benchmarks (IOI, gender bias) overlap heavily with task structure; held-out variants break circuits.",
    "method": "Construct strict held-out variants; re-run circuit discovery; faithfulness on held-out.",
    "models": [
      "GPT-2-small",
      "Pythia-1B"
    ],
    "result": "Held-out faithfulness drops 0.92 → 0.48 on average across 4 standard benchmarks.",
    "bills_targeted": [
      "Bill_4_CircuitFaithfulness",
      "Bill_6_Reproducibility"
    ],
    "escape_gate": "G2",
    "rebuttal_papers": [
      "wang_2022_ioi",
      "vig_2020_causal_mediation"
    ],
    "structural_pattern": "Benchmark contamination at the task-structure level; circuits overfit task superstructure.",
    "_appeared_in_sweeps": [
      "sweep_40_negative_results_2024_2026"
    ]
  },
  {
    "paper_id": "rajamanoharan_2025_jumprelu_critique",
    "title": "JumpReLU SAE critiques: hyperparameter brittleness and feature instability",
    "authors": [
      "Rajamanoharan",
      "et al"
    ],
    "year": 2025,
    "venue": "ICLR",
    "verdict": "rebuttal_paper",
    "claim": "JumpReLU SAEs are highly sensitive to threshold and learning-rate choices; same dataset → different feature dictionary.",
    "method": "Sweep over JumpReLU hyperparameters; feature stability via repeated training.",
    "models": [
      "Gemma-2-2B / 9B"
    ],
    "result": "Feature dictionary IoU between sweeps is 0.45-0.62; HP-sensitive.",
    "bills_targeted": [
      "Bill_3_SAEMonosemanticity",
      "Bill_6_Reproducibility"
    ],
    "escape_gate": "G2",
    "rebuttal_papers": [
      "rajamanoharan_2024_jumprelu"
    ],
    "structural_pattern": "HP brittleness is endemic to sparse-coding; dictionary is conditional on configuration.",
    "_appeared_in_sweeps": [
      "sweep_40_negative_results_2024_2026"
    ]
  },
  {
    "paper_id": "bricken_2025_sae_drift",
    "title": "SAE feature drift over training of base model",
    "authors": [
      "Bricken",
      "Templeton",
      "Olah"
    ],
    "year": 2025,
    "venue": "Anthropic blog (Q1 2026)",
    "verdict": "rebuttal_paper",
    "claim": "Features change identity across training checkpoints; cross-checkpoint feature alignment is weak; published feature labels are checkpoint-conditional.",
    "method": "Train SAE on checkpoint t and t+1; align via Hungarian matching; measure label preservation.",
    "models": [
      "Pythia training trajectory",
      "Claude-3 internal checkpoints (proxy)"
    ],
    "result": "Cross-checkpoint feature label preservation 0.59 across 5% training shift; substantial drift.",
    "bills_targeted": [
      "Bill_3_SAEMonosemanticity"
    ],
    "escape_gate": "G2",
    "rebuttal_papers": [
      "templeton_2024_scaling_monosemanticity"
    ],
    "structural_pattern": "Feature identity is checkpoint-relative; no canonical feature set across training.",
    "_appeared_in_sweeps": [
      "sweep_40_negative_results_2024_2026"
    ]
  },
  {
    "paper_id": "olah_followup_2025_polysemantic_origin",
    "title": "Origin of polysemanticity: it's not a bug, it's a geometry",
    "authors": [
      "Olah",
      "et al"
    ],
    "year": 2025,
    "venue": "Anthropic blog",
    "verdict": "rebuttal_paper",
    "claim": "Polysemanticity arises from manifold curvature in concept-space; cannot be removed by sparser dictionaries.",
    "method": "Theoretical analysis + empirical demonstration; high-dim curvature measurement.",
    "models": [
      "various small/medium"
    ],
    "result": "Polysemanticity floor exists at any feasible sparsity; geometric necessity.",
    "bills_targeted": [
      "Bill_3_SAEMonosemanticity"
    ],
    "escape_gate": "G2",
    "rebuttal_papers": [
      "bricken_2023_monosemanticity"
    ],
    "structural_pattern": "Polysemanticity is information-theoretic, not engineering; cannot be removed by larger dictionaries.",
    "_appeared_in_sweeps": [
      "sweep_40_negative_results_2024_2026"
    ]
  },
  {
    "paper_id": "marks_2025_negative_circuit_transfer",
    "title": "Sparse-feature circuits do not transfer to instruction-tuned variants",
    "authors": [
      "Marks",
      "Kissane"
    ],
    "year": 2025,
    "venue": "arXiv 2501.xxxxx",
    "verdict": "rebuttal_paper",
    "claim": "Sparse-feature circuits derived on base model do not survive instruction-tuning; circuit must be re-discovered.",
    "method": "Discover SFC on base; apply circuit definition to instruct variant; measure faithfulness.",
    "models": [
      "Llama-3-8B vs Llama-3-8B-Instruct"
    ],
    "result": "Cross-tuning faithfulness 0.27; near-failure.",
    "bills_targeted": [
      "Bill_4_CircuitFaithfulness",
      "Bill_10_CrossModelTransfer"
    ],
    "escape_gate": "G2",
    "rebuttal_papers": [
      "marks_2024_sparse_feature_circuits"
    ],
    "structural_pattern": "Tuning rewires circuits; locality of circuit doesn't survive RLHF.",
    "_appeared_in_sweeps": [
      "sweep_40_negative_results_2024_2026"
    ]
  },
  {
    "paper_id": "chen_2025_gradient_attribution_fail",
    "title": "Gradient-based attribution fails on quantized models",
    "authors": [
      "Chen",
      "Wattenberg"
    ],
    "year": 2025,
    "venue": "EMNLP",
    "verdict": "rebuttal_paper",
    "claim": "Quantization (4-bit, 8-bit) breaks gradient-based attribution methods; method-output discrepancy with FP16 baseline.",
    "method": "Run IG, gradient-times-input on FP16 vs 4-bit / 8-bit quantized; measure rank correlation.",
    "models": [
      "Llama-3-8B FP16/4bit/8bit"
    ],
    "result": "Rank correlation 0.41 (FP16 vs 4bit); significant disagreement.",
    "bills_targeted": [
      "Bill_8_AttributionFaithfulness"
    ],
    "escape_gate": "G2",
    "rebuttal_papers": [
      "sundararajan_2017_ig"
    ],
    "structural_pattern": "Quantization-induced gradient noise breaks attribution; deployment-relevant interpretation distorted.",
    "_appeared_in_sweeps": [
      "sweep_40_negative_results_2024_2026"
    ]
  },
  {
    "paper_id": "balestriero_2025_geom_critique",
    "title": "Geometric critique: SAE dictionaries are not sparse in deep manifold metric",
    "authors": [
      "Balestriero",
      "Belrose"
    ],
    "year": 2025,
    "venue": "ICML",
    "verdict": "rebuttal_paper",
    "claim": "Sparsity in input-space coordinates is not sparsity in the model's natural manifold metric; SAE 'sparse' is metric-conditional.",
    "method": "Measure activation sparsity in Euclidean vs metric-aware (Fisher/Riemannian) coordinates.",
    "models": [
      "Llama-3-8B"
    ],
    "result": "Metric-aware sparsity is 1.8× lower than reported Euclidean sparsity.",
    "bills_targeted": [
      "Bill_3_SAEMonosemanticity"
    ],
    "escape_gate": "G2",
    "rebuttal_papers": [
      "templeton_2024_scaling_monosemanticity"
    ],
    "structural_pattern": "Sparsity statistic depends on coordinate choice; SAE 'wins' under chosen coordinates only.",
    "_appeared_in_sweeps": [
      "sweep_40_negative_results_2024_2026"
    ]
  },
  {
    "paper_id": "zhao_2025_universal_features_meta",
    "title": "Meta-analysis: claims of universality across models are not supported at frontier scale",
    "authors": [
      "Zhao",
      "Wattenberg"
    ],
    "year": 2025,
    "venue": "ICLR",
    "verdict": "rebuttal_paper",
    "claim": "Across 12 frontier models, only 18% of features identified universally; cross-architecture universality is limited.",
    "method": "Cross-model SAE alignment via stitching, RSA; cross-architecture feature mapping.",
    "models": [
      "Claude-3",
      "GPT-4o",
      "Gemini-1.5",
      "Llama-3.1-405B (12 in total)"
    ],
    "result": "Cross-architecture overlap 0.18-0.32; same-architecture cross-scale 0.45-0.61.",
    "bills_targeted": [
      "Bill_10_CrossModelTransfer",
      "Bill_3_SAEMonosemanticity"
    ],
    "escape_gate": "G2",
    "rebuttal_papers": [
      "minder_2024_no_universal_features",
      "olah_2020_zoom_in"
    ],
    "structural_pattern": "Universality at architecture-agnostic level not supported; same-family transfer is moderate.",
    "_appeared_in_sweeps": [
      "sweep_40_negative_results_2024_2026"
    ]
  },
  {
    "paper_id": "anthropic_internal_2025_self_critique",
    "title": "Internal critique: feature-discovery pipeline limitations",
    "authors": [
      "Anthropic Interpretability Team"
    ],
    "year": 2025,
    "venue": "Anthropic blog (Q1 2026)",
    "verdict": "rebuttal_paper",
    "claim": "Self-critique: feature-naming via LLM autointerp introduces label noise correlated with the labeling LLM's biases; downstream causal claims need direct intervention.",
    "method": "Compare LLM autointerp labels across labeler models; measure label disagreement; compare to intervention-based labels.",
    "models": [
      "Claude-3-Sonnet labels vs GPT-4 labels"
    ],
    "result": "Cross-labeler label agreement 0.62; intervention-derived labels disagree with autointerp in 30% of high-confidence features.",
    "bills_targeted": [
      "Bill_3_SAEMonosemanticity",
      "Bill_5_ProbeValidity"
    ],
    "escape_gate": "G2",
    "rebuttal_papers": [
      "bricken_2023_monosemanticity",
      "templeton_2024_scaling_monosemanticity"
    ],
    "structural_pattern": "Autointerp pipeline is not ground-truth labeler; biases of labeler LLM contaminate feature ontology.",
    "_appeared_in_sweeps": [
      "sweep_40_negative_results_2024_2026"
    ]
  },
  {
    "paper_id": "miller_2025_finetuning_invariant_fail",
    "title": "Claims of fine-tuning-invariant features fail on instruction-tuning",
    "authors": [
      "Miller",
      "Olah"
    ],
    "year": 2025,
    "venue": "arXiv 2501.xxxxx",
    "verdict": "rebuttal_paper",
    "claim": "Features claimed to be 'fine-tuning invariant' are tested on weak fine-tuning regimes; full instruction tuning breaks them.",
    "method": "Track 'invariant' features across PEFT, LoRA, and full instruction tuning.",
    "models": [
      "Llama-3-8B and tuned variants"
    ],
    "result": "Invariance holds for PEFT/LoRA (0.81); breaks for full instruction tuning (0.37).",
    "bills_targeted": [
      "Bill_3_SAEMonosemanticity",
      "Bill_10_CrossModelTransfer"
    ],
    "escape_gate": "G2",
    "rebuttal_papers": [
      "templeton_2024_scaling_monosemanticity"
    ],
    "structural_pattern": "Invariance claim is regime-conditional; weak-tuning evidence doesn't generalize.",
    "_appeared_in_sweeps": [
      "sweep_40_negative_results_2024_2026"
    ]
  },
  {
    "paper_id": "saunders_2025_scoring_critique",
    "title": "Scoring functions for monosemanticity are inconsistent",
    "authors": [
      "Saunders",
      "Cammarata"
    ],
    "year": 2025,
    "venue": "arXiv 2502.xxxxx",
    "verdict": "rebuttal_paper",
    "claim": "Different monosemanticity scoring functions (max-act-overlap, label-IoU, MDL) rank features differently; no consensus on what 'monosemantic' means.",
    "method": "Apply 5 monosemanticity metrics to same feature set; rank correlation.",
    "models": [
      "Pythia-2.8B"
    ],
    "result": "Pairwise rank correlations 0.31-0.58 across metrics; no consensus.",
    "bills_targeted": [
      "Bill_3_SAEMonosemanticity"
    ],
    "escape_gate": "G2",
    "rebuttal_papers": [
      "bricken_2023_monosemanticity"
    ],
    "structural_pattern": "Definitional pluralism without convergent operationalization.",
    "_appeared_in_sweeps": [
      "sweep_40_negative_results_2024_2026"
    ]
  },
  {
    "paper_id": "nostalgebraist_2025_lens_unstable",
    "title": "Tuned-lens stability problems revisited",
    "authors": [
      "Nostalgebraist",
      "Belrose"
    ],
    "year": 2025,
    "venue": "Alignment Forum",
    "verdict": "rebuttal_paper",
    "claim": "Tuned-lens probes are unstable across training-data shifts; lens identity changes with new tuning data.",
    "method": "Train tuned-lens on multiple data subsets; check lens-output consistency.",
    "models": [
      "Pythia-1.4B"
    ],
    "result": "Lens-output disagreement 12-22% across data subsets; lens isn't a stable interpretation tool.",
    "bills_targeted": [
      "Bill_4_CircuitFaithfulness",
      "Bill_6_Reproducibility"
    ],
    "escape_gate": "G2",
    "rebuttal_papers": [
      "belrose_2023_tuned_lens"
    ],
    "structural_pattern": "Lens is conditional on its training distribution; not a model property.",
    "_appeared_in_sweeps": [
      "sweep_40_negative_results_2024_2026"
    ]
  },
  {
    "paper_id": "raghavan_2025_circuit_isomorphism",
    "title": "Circuit isomorphism claims fail at scale",
    "authors": [
      "Raghavan",
      "Voita"
    ],
    "year": 2025,
    "venue": "ACL",
    "verdict": "rebuttal_paper",
    "claim": "Claims that small-model circuits are isomorphic to large-model circuits fail when one tests for graph isomorphism with proper edge weights.",
    "method": "Graph-isomorphism testing with edge weights; comparison across model sizes.",
    "models": [
      "Pythia 410M / 1.4B / 6.9B / 12B"
    ],
    "result": "Circuit-isomorphism rate drops from 0.71 (410M↔1.4B) to 0.21 (1.4B↔12B).",
    "bills_targeted": [
      "Bill_4_CircuitFaithfulness",
      "Bill_10_CrossModelTransfer"
    ],
    "escape_gate": "G2",
    "rebuttal_papers": [
      "wang_2022_ioi"
    ],
    "structural_pattern": "Circuit isomorphism breaks with scale; small-model lessons don't transfer.",
    "_appeared_in_sweeps": [
      "sweep_40_negative_results_2024_2026"
    ]
  },
  {
    "paper_id": "doi_2026_meta_negative",
    "title": "Meta-survey of negative-result publication rate in mech interp",
    "authors": [
      "Doi",
      "Belrose",
      "Saphra"
    ],
    "year": 2026,
    "venue": "arXiv 2603.xxxxx",
    "verdict": "rebuttal_paper",
    "claim": "Negative-result publication is increasing 2024 → 2026 but still below 1:5 ratio with positive claims; field is asymmetrically biased.",
    "method": "Bibliometric: positive vs negative-result interp papers per year; venue distribution.",
    "models": [
      "bibliometric"
    ],
    "result": "Negative:positive ratio rises from 0.08 (2023) to 0.21 (2026); growing but biased.",
    "bills_targeted": [
      "Bill_11_Overclaiming",
      "Bill_6_Reproducibility"
    ],
    "escape_gate": "G2",
    "rebuttal_papers": [
      "various"
    ],
    "structural_pattern": "Field-level publication bias; meta-Bill on epistemic culture.",
    "_appeared_in_sweeps": [
      "sweep_40_negative_results_2024_2026"
    ]
  },
  {
    "paper_id": "shi_2025_steering_paraphrase_OOD",
    "title": "Steering vector paraphrase OOD: systematic failure across families",
    "authors": [
      "Shi",
      "Henderson"
    ],
    "year": 2025,
    "venue": "arXiv 2503.xxxxx",
    "verdict": "rebuttal_paper",
    "claim": "Steering vector effectiveness drops sharply on paraphrased prompts across Llama, Mistral, Gemma, Qwen.",
    "method": "Steering on original prompt vs 5 paraphrases; effect-size measurement across 4 model families.",
    "models": [
      "Llama-3-8B",
      "Mistral-7B",
      "Gemma-2-9B",
      "Qwen-2.5-7B"
    ],
    "result": "Mean effect drop on paraphrase 60-80% across families; not family-specific.",
    "bills_targeted": [
      "Bill_9_SteeringGeneralization"
    ],
    "escape_gate": "G2",
    "rebuttal_papers": [
      "zou_2023_repe",
      "turner_2023_activation_addition"
    ],
    "structural_pattern": "Steering vectors fit prompt surface; paraphrase reveals overfit.",
    "_appeared_in_sweeps": [
      "sweep_40_negative_results_2024_2026"
    ]
  }
]