{
  "_comment": "Hand-curated benchmark cases for the Mechanistic Interpretability Aiwiki classifier (v0.2). 60 cases. Lock target: gate_accuracy=1.000 and bill_recall=1.000.",
  "version": "0.2",
  "cases": [
    {
      "id": "B01_hewitt_levy_collinearity",
      "proposal": "Hewitt-Levy collinearity audit on frontier LLMs. Tests truthfulness/refusal/persona directions against PC1 baseline; matched-norm random direction recovers 70% of effect. Costumed-scalar pattern explicit.",
      "expected_bills": [1],
      "expected_meta_costs": [],
      "expected_gates": []
    },
    {
      "id": "B01_mengrong_hofmann_sae_pc1",
      "proposal": "Mengrong-Hofmann critique: SAE features are PC1 in disguise. Demonstrates SAE feature directions correlate at 0.85 cosine with first principal component of the same training data.",
      "expected_bills": [1],
      "expected_meta_costs": [],
      "expected_gates": []
    },
    {
      "id": "B02_sae_seed_reproducibility",
      "proposal": "SAE seed reproducibility study. Across 10 different SAE training seeds with identical architecture, only 28% of features match by stable-feature metric. Algorithm comparison: top-k vs L1 vs JumpReLU.",
      "expected_bills": [2],
      "expected_meta_costs": [],
      "expected_gates": []
    },
    {
      "id": "B03_frontier_llm_scale",
      "proposal": "Interpretability claim tested on Llama-3-405B and Claude-3 Opus production model. Frontier-scale generalization demonstrated.",
      "expected_bills": [3],
      "expected_meta_costs": [],
      "expected_gates": []
    },
    {
      "id": "B04_cross_model_transfer_fail",
      "proposal": "60-model replication study: refusal direction transfers across Llama family but does not transfer to Mistral, Qwen, or Gemma. Cross-model generalization fails.",
      "expected_bills": [4],
      "expected_meta_costs": [],
      "expected_gates": []
    },
    {
      "id": "B05_heimersheim_circuit_circularity",
      "proposal": "Heimersheim 'How not to do circuit analysis': activation patching protocol contains causal-circularity. The patch reuses the circuit it claims to prove. Self-validation tautology audit.",
      "expected_bills": [5],
      "expected_meta_costs": [],
      "expected_gates": []
    },
    {
      "id": "B05_apollo_causal_scrubbing_audit",
      "proposal": "Apollo Research causal-scrubbing audit: 4/24 published circuit-discovery claims survive non-circular intervention test. Equivalence-class gameability gives faithfulness 0.3-0.95 depending on choice.",
      "expected_bills": [5],
      "expected_meta_costs": [],
      "expected_gates": []
    },
    {
      "id": "B06_correlation_vs_causal",
      "proposal": "Causal mediation analysis (Vig-Geiger framework) applied to GPT-4. Distinguishes causally-linked attention heads from correlation-only heads using do-calculus interventional protocol.",
      "expected_bills": [6],
      "expected_meta_costs": [],
      "expected_gates": []
    },
    {
      "id": "B07_anthropic_scaling_monosemanticity",
      "proposal": "Anthropic May 2024 Scaling Monosemanticity. Claims monosemantic features on Claude-3 Sonnet via SAE training. Compute-budget-conditional Anthropic-internal infrastructure required.",
      "expected_bills": [7],
      "expected_meta_costs": ["M5"],
      "expected_gates": []
    },
    {
      "id": "B07_heimersheim_apollo_audit",
      "proposal": "Heimersheim/Apollo five-condition costume-free audit: 56 claimed monosemantic features on frontier LLM, 0/56 survive all five closures (collinearity, reproducibility, frontier scale, cross-model, causal-circularity).",
      "expected_bills": [7],
      "expected_meta_costs": [],
      "expected_gates": []
    },
    {
      "id": "B08_random_matched_norm_baseline",
      "proposal": "Strong baseline study: random matched-norm directions achieve 70% of gradient × input faithfulness. Ensembling does not help because methods share bias.",
      "expected_bills": [8],
      "expected_meta_costs": [],
      "expected_gates": []
    },
    {
      "id": "B09_paraphrase_OOD_degradation",
      "proposal": "Lee et al. paraphrase audit: 8 steering vectors × 5 paraphrase classes → 22-41% generalization. OOD degradation curve presented; effect anti-scales with model capability.",
      "expected_bills": [9],
      "expected_meta_costs": [],
      "expected_gates": []
    },
    {
      "id": "B10_probe_vs_sae_methodology",
      "proposal": "Probe vs SAE vs direction methodology disambiguation: probe-based evidence recovers dataset features; SAE-based evidence reconstructs activations; direction-based evidence linear-projects. Each carries different closure costs.",
      "expected_bills": [10],
      "expected_meta_costs": [],
      "expected_gates": []
    },
    {
      "id": "B11_arditi_refusal_direction",
      "proposal": "Arditi et al. refusal direction across 13 chat models. Single-layer activation intervention; contrastive prompt construction. Claims causal steering at Llama-3-70B scale.",
      "expected_bills": [3, 11],
      "expected_meta_costs": ["M4"],
      "expected_gates": []
    },
    {
      "id": "B11_47_directions_audit",
      "proposal": "Causally-Faithful Steering at Frontier Scale audit: 47 attempted steering directions on Claude-3.5, 0/47 pass full Bill_11 closure (norm-confound + paraphrase + cross-scale + causal-circularity).",
      "expected_bills": [11],
      "expected_meta_costs": [],
      "expected_gates": []
    },
    {
      "id": "B12_visualization_only",
      "proposal": "Top-k tokens visualization-only paper. SAE features look semantic in qualitative inspection; no quantitative metric reported.",
      "expected_bills": [12],
      "expected_meta_costs": ["M3"],
      "expected_gates": []
    },
    {
      "id": "B13_adebayo_revisit_llama2",
      "proposal": "Adebayo-revisit on Llama-2/Mistral: replicates 2018 saliency-map sanity check failures on modern LLMs. Gradient × input does not distinguish trained from random model.",
      "expected_bills": [13],
      "expected_meta_costs": [],
      "expected_gates": []
    },
    {
      "id": "B13_attention_not_explanation_2024",
      "proposal": "Wiegreffe-Pinter five-year retrospective: attention is not explanation, follow-on rebuttal of 2019 critique on 2024 LLMs. Attribution-method validity not argued.",
      "expected_bills": [13],
      "expected_meta_costs": [],
      "expected_gates": []
    },
    {
      "id": "B14_templeton_hu_belinkov_2025",
      "proposal": "Templeton-Hu-Belinkov-Conmy 2025: probe / SAE / circuit-discovery on the same concept in the same Llama-2 70B identify substantially different localizations. Cross-paradigm interp transfer test fails.",
      "expected_bills": [14],
      "expected_meta_costs": [],
      "expected_gates": []
    },
    {
      "id": "B14_gandelsman_steinhardt_32pct",
      "proposal": "Gandelsman-Steinhardt cross-paradigm transfer audit: 32% agreement between probe and SAE on same concept. Methodology determines 'concept', not model structure.",
      "expected_bills": [14],
      "expected_meta_costs": [],
      "expected_gates": []
    },
    {
      "id": "B15_reproducibility_open_source",
      "proposal": "Open-source SAE release: code, weights, datasets all publicly available. End-to-end reproducible on 8-GPU single-laboratory budget.",
      "expected_bills": [15],
      "expected_meta_costs": [],
      "expected_gates": []
    },
    {
      "id": "B16_multi_codebase_replication",
      "proposal": "Multi-codebase fixed-spec replication test: identical hyperparameters and data across 6 SAE codebases produces >65% feature divergence. Implementation-reproducibility fails.",
      "expected_bills": [16],
      "expected_meta_costs": [],
      "expected_gates": []
    },
    {
      "id": "B16_gemma_scope_discrepancy",
      "proposal": "Lieberum Gemma Scope cross-codebase audit: same SAE spec implemented in three different codebases produces different feature sets. Cross-codebase divergence demonstrated.",
      "expected_bills": [16],
      "expected_meta_costs": [],
      "expected_gates": []
    },
    {
      "id": "B17_freq_norm_position_decomposition",
      "proposal": "Variance decomposition of SAE features against (token frequency, activation L2 norm, position embedding). >50% of feature variance lives in this triplet — features are structural-input-encoding, not model-internal computation.",
      "expected_bills": [17],
      "expected_meta_costs": [],
      "expected_gates": []
    },
    {
      "id": "B17_pretraining_mixture_dependence",
      "proposal": "Maziarz-Trębacz-Reverdy: SAE direction shifts when pretraining mixture is altered. Concept is data-distributional, not computational.",
      "expected_bills": [17],
      "expected_meta_costs": [],
      "expected_gates": []
    },
    {
      "id": "B18_23_directions_cosine",
      "proposal": "Direction-redundancy audit: 23 published directions (refusal, truthfulness, sycophancy, persona, role, deception, honesty, certainty, sandbagging, etc.) are mutually 0.6-0.85 cosine. Same axis with different labels.",
      "expected_bills": [18],
      "expected_meta_costs": [],
      "expected_gates": []
    },
    {
      "id": "B18_redundant_directions",
      "proposal": "Redundant directions audit: refusal direction and sycophancy direction overlap at 0.78 cosine. Cross-paper collinearity not previously reported.",
      "expected_bills": [18],
      "expected_meta_costs": [],
      "expected_gates": []
    },
    {
      "id": "M1_toy_model_only",
      "proposal": "Mech interp study on 1L MLP with sparse parity. Does not extend to frontier LLM. Toy-model only methodology paper.",
      "expected_bills": [],
      "expected_meta_costs": ["M1"],
      "expected_gates": []
    },
    {
      "id": "M1_gpt2_small",
      "proposal": "Circuit-discovery on GPT-2 small (124M). Toy-scale; results don't extend to frontier.",
      "expected_bills": [],
      "expected_meta_costs": ["M1"],
      "expected_gates": []
    },
    {
      "id": "M2_linearity_hypothesis",
      "proposal": "Result conditional on linearity hypothesis. Under monosemanticity assumption, claim holds. Hypothesis-conditional.",
      "expected_bills": [],
      "expected_meta_costs": ["M2"],
      "expected_gates": []
    },
    {
      "id": "M2_superposition_hypothesis",
      "proposal": "Under superposition hypothesis, feature decomposition is well-defined. Conditional on superposition assumption being valid.",
      "expected_bills": [],
      "expected_meta_costs": ["M2"],
      "expected_gates": []
    },
    {
      "id": "M3_visualization_only",
      "proposal": "Visualization-only paper. Top-k tokens look semantic. Unfalsifiable. No quantitative metric.",
      "expected_bills": [],
      "expected_meta_costs": ["M3"],
      "expected_gates": []
    },
    {
      "id": "M4_single_layer_intervention",
      "proposal": "Single-layer activation patching intervention. Restricted intervention model. No circuit-decomposition account of why this single-layer intervention captures the behavior.",
      "expected_bills": [],
      "expected_meta_costs": ["M4"],
      "expected_gates": []
    },
    {
      "id": "M5_anthropic_scale_compute",
      "proposal": "Anthropic-scale compute infrastructure required. Only Anthropic can reproduce. Compute-budget-conditional. Hundreds of H100 hours.",
      "expected_bills": [],
      "expected_meta_costs": ["M5"],
      "expected_gates": []
    },
    {
      "id": "M5_frontier_lab_exclusive",
      "proposal": "Frontier-lab-exclusive infrastructure. Reproducible only by Anthropic, OpenAI, or DeepMind.",
      "expected_bills": [],
      "expected_meta_costs": ["M5"],
      "expected_gates": []
    },
    {
      "id": "M6_implementation_specific",
      "proposal": "Implementation-specific result. Specific SAE training script. Does not generalize to alternative implementations.",
      "expected_bills": [],
      "expected_meta_costs": ["M6"],
      "expected_gates": []
    },
    {
      "id": "G1_methodology_paper",
      "proposal": "Methodology paper. Proposes a new SAE variant (Matryoshka SAE). No frontier-LLM interp claim. Method description only.",
      "expected_bills": [],
      "expected_meta_costs": [],
      "expected_gates": ["G1"]
    },
    {
      "id": "G2_negative_result_paper",
      "proposal": "Negative-result paper. Demonstrates the failure of a prior monosemanticity claim. Rebuttal paper. Fails to replicate.",
      "expected_bills": [],
      "expected_meta_costs": [],
      "expected_gates": ["G2"]
    },
    {
      "id": "G2_critique_paper",
      "proposal": "Critique paper of activation-patching methodology. Counter-example to monosemanticity. Rebuttal paper.",
      "expected_bills": [],
      "expected_meta_costs": [],
      "expected_gates": ["G2"]
    },
    {
      "id": "G3_theoretical_construction",
      "proposal": "Theoretical-construction paper. Proves a theorem about feature decomposability under sparse coding. No empirical interp claim.",
      "expected_bills": [],
      "expected_meta_costs": [],
      "expected_gates": ["G3"]
    },
    {
      "id": "DUAL_B7_M5",
      "proposal": "Costume-free monosemantic feature claim on Claude-3. Anthropic-scale compute required for SAE training infrastructure.",
      "expected_bills": [7],
      "expected_meta_costs": ["M5"],
      "expected_gates": []
    },
    {
      "id": "DUAL_B11_M4",
      "proposal": "Causally faithful steering on Claude-3.5 via single-layer single-token intervention. Restricted adversary model.",
      "expected_bills": [11],
      "expected_meta_costs": ["M4"],
      "expected_gates": []
    },
    {
      "id": "DUAL_B5_B11_arditi",
      "proposal": "Arditi refusal-direction steering on Llama-3-70B via contrastive-prompt construction. Causal-circularity in patch protocol; single-layer intervention.",
      "expected_bills": [3, 5, 11],
      "expected_meta_costs": ["M4"],
      "expected_gates": []
    },
    {
      "id": "DUAL_B1_B8",
      "proposal": "Truthfulness direction collinearity audit: random matched-norm baseline matches 65% of effect. PC1 of training data accounts for 0.82 cosine.",
      "expected_bills": [1, 8],
      "expected_meta_costs": [],
      "expected_gates": []
    },
    {
      "id": "DUAL_B5_M4_circuits",
      "proposal": "IOI circuit causal-circularity audit on single-layer activation patch. Causally circular and restricted intervention.",
      "expected_bills": [5],
      "expected_meta_costs": ["M4"],
      "expected_gates": []
    },
    {
      "id": "DUAL_B13_B14",
      "proposal": "Attribution patching cross-paradigm test: integrated gradients vs gradient × input vs SHAP all give different feature attribution; cross-paradigm transfer fails.",
      "expected_bills": [13, 14],
      "expected_meta_costs": [],
      "expected_gates": []
    },
    {
      "id": "B5_distributed_circuit_DeepMind",
      "proposal": "DeepMind distributed-circuit study on Llama-3-405B: claimed circuits span 10-30 attention heads × 20+ layers. Single-localization patching cannot identify topology.",
      "expected_bills": [3, 5],
      "expected_meta_costs": [],
      "expected_gates": []
    },
    {
      "id": "B5_redundant_parallel_circuits",
      "proposal": "Rajamanoharan-Conmy: 3-5 redundant parallel circuits in Llama-2 70B. Causal-scrubbing equivalence-class gameable.",
      "expected_bills": [5],
      "expected_meta_costs": [],
      "expected_gates": []
    },
    {
      "id": "B7_apollo_56_audit",
      "proposal": "Apollo Research costume-free audit on 56 claimed monosemantic features. 0/56 survive all five closures. Strong empty-space evidence for Bill_7.",
      "expected_bills": [7],
      "expected_meta_costs": [],
      "expected_gates": []
    },
    {
      "id": "B11_anthropic_persona_vectors",
      "proposal": "Anthropic persona vectors. Compute-conditional Anthropic-scale infrastructure. Steering at frontier scale claimed.",
      "expected_bills": [11],
      "expected_meta_costs": ["M5"],
      "expected_gates": []
    },
    {
      "id": "B11_norm_confound_steering",
      "proposal": "Casper Norm Trojans: 60-78% of steering effect is norm-driven. Refusal direction steering on Llama-3-70B confounded by activation norm change.",
      "expected_bills": [3, 11],
      "expected_meta_costs": [],
      "expected_gates": []
    },
    {
      "id": "B7_universal_features_405B",
      "proposal": "Anthropic Universal Features Crosscoder Llama-3-405B (Sep 2025). First serious cross-FAMILY transfer test. ~12-18% feature overlap reported.",
      "expected_bills": [3, 7],
      "expected_meta_costs": [],
      "expected_gates": []
    },
    {
      "id": "B14_templeton_2025_localization",
      "proposal": "Templeton-Hu-Belinkov-Conmy 2025: probe / SAE / circuit on same concept in same Llama-2 70B identify substantially different localizations. Cross-paradigm transfer empty.",
      "expected_bills": [14],
      "expected_meta_costs": [],
      "expected_gates": []
    },
    {
      "id": "B16_six_codebases",
      "proposal": "Six SAE codebases with identical hyperparameters report >65% feature divergence. Cross-codebase implementation-reproducibility fails.",
      "expected_bills": [16],
      "expected_meta_costs": [],
      "expected_gates": []
    },
    {
      "id": "B17_three_paper_cluster",
      "proposal": "SAE feature variance decomposition: 50% explained by token frequency, activation norm, and position embedding combined. Structural-input-encoding cluster.",
      "expected_bills": [17],
      "expected_meta_costs": [],
      "expected_gates": []
    },
    {
      "id": "B18_steering_collinearity",
      "proposal": "Direction-redundancy audit on 23 published steering vectors. Mean cosine similarity 0.72. Same axis with different labels — refusal, sycophancy, deception, honesty all share 0.68-0.84 pairwise cosine.",
      "expected_bills": [18],
      "expected_meta_costs": [],
      "expected_gates": []
    },
    {
      "id": "DUAL_B7_B14_anthropic_2024",
      "proposal": "Anthropic Scaling Monosemanticity (May 2024). Claims costume-free monosemantic features on Claude-3 Sonnet; cross-paradigm transfer to probe and patching not demonstrated. Compute-conditional.",
      "expected_bills": [3, 7, 14],
      "expected_meta_costs": ["M5"],
      "expected_gates": []
    },
    {
      "id": "B8_hewitt_levy_random_baseline",
      "proposal": "Hewitt-Levy random direction baseline study: 70% of gradient × input faithfulness recovered by random matched-norm direction. Strong baseline failure.",
      "expected_bills": [8],
      "expected_meta_costs": [],
      "expected_gates": []
    },
    {
      "id": "B9_inverse_scaling_unfaithful",
      "proposal": "Self-rationalization unfaithfulness anti-scales with model capability. Frontier models more unfaithful than mid-tier. Lanham-Turpin lineage.",
      "expected_bills": [9, 13],
      "expected_meta_costs": [],
      "expected_gates": []
    },
    {
      "id": "B13_geiger_pearl",
      "proposal": "Geiger-Wu-Potts causal mediation Pearl-Bareinboim framework on Claude-3.5/GPT-4o/Llama-3-405B. CoT mediates only ~50% on simple, ~20% on multi-hop. Attribution-method validity argued via causal axioms.",
      "expected_bills": [3, 6, 13],
      "expected_meta_costs": [],
      "expected_gates": []
    }
  ]
}
