{
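  "_comment": "Hand-curated benchmark cases for the Capability Benchmarks Aiwiki classifier (v0.2). 60 cases. Each case pairs a free-text proposal with its expected labels: expected_bills (integer bill numbers; 1-19 appear in this file), expected_meta_costs (M1-M6) and expected_gates (G1-G3); an empty list means none is expected for that kind. Lock target: gate_accuracy=1.000 and bill_recall=1.000.",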
  "version": "0.2",
  "cases": [
    {
      "id": "B01_carlini_tirumala",
      "proposal": "Carlini-Tirumala memorization line: training-data contamination audit on Llama 3 with n-gram overlap detection on MMLU and GSM8K test sets.",
      "expected_bills": [1],
      "expected_meta_costs": [],
      "expected_gates": []
    },
    {
      "id": "B01_tang_cao_bommasani_yale",
      "proposal": "Tang-Cao-Bommasani Yale 47% MMLU contamination study. Demonstrates that 47% of MMLU test items appear in pretraining corpus via paraphrase-aware leakage check.",
      "expected_bills": [1],
      "expected_meta_costs": [],
      "expected_gates": []
    },
    {
      "id": "B02_self_consistency_40x",
      "proposal": "Self-Consistency (Wang-Zhou 2022): 40x compute = +18pp GSM8K. Score-without-harness baseline reported. Chain-of-thought ensemble.",
      "expected_bills": [2],
      "expected_meta_costs": [],
      "expected_gates": []
    },
    {
      "id": "B02_swe_agent_scaffold",
      "proposal": "SWE-Agent (Yang et al 2024): 7-10x score gain from scaffolding alone. ACI scaffolding for SWE-Bench. Harness moves the score.",
      "expected_bills": [2],
      "expected_meta_costs": [],
      "expected_gates": []
    },
    {
      "id": "B03_alphaproof_lean",
      "proposal": "AlphaProof / AlphaGeometry-2 with Lean tactic search at test time. Tool-exfiltration via Lean: with-tool 83% vs without-tool ~5%.",
      "expected_bills": [3],
      "expected_meta_costs": [],
      "expected_gates": []
    },
    {
      "id": "B03_frontiermath_wolfram",
      "proposal": "FrontierMath tool-exfiltration audit: 35-50% of solved problems used Wolfram/sympy in scaffolding. With-tool and without-tool scores separated.",
      "expected_bills": [3],
      "expected_meta_costs": [],
      "expected_gates": []
    },
    {
      "id": "B04_pezeshkpour_hruschka",
      "proposal": "Pezeshkpour-Hruschka GPT-4 +6.5% A-bias / -4.5% D-bias. 30% of correct answers flip under option-order shuffling. Position bias on MMLU.",
      "expected_bills": [4],
      "expected_meta_costs": [],
      "expected_gates": []
    },
    {
      "id": "B04_mirzadeh_gsm_symbolic",
      "proposal": "Mirzadeh GSM-Symbolic 2024: -2 to -65% absolute drop under variable rename / NoOp distractor on GSM8K. Format-brittleness exposed; o1-preview drops 17.5%.",
      "expected_bills": [4],
      "expected_meta_costs": [],
      "expected_gates": []
    },
    {
      "id": "B05_lmsys_llama4",
      "proposal": "LMSYS Llama 4 Maverick removal April 2025: Meta tuned chat-specific variant for LMSYS Arena prompts. Selection-bias violation; eval-set overfitting.",
      "expected_bills": [5],
      "expected_meta_costs": [],
      "expected_gates": []
    },
    {
      "id": "B05_frontiermath_funding",
      "proposal": "OpenAI-Epoch FrontierMath funding entanglement Dec 2024: vendor funded benchmark and had Tier-1+2 access before evaluation. Selection-bias audit triggered.",
      "expected_bills": [5],
      "expected_meta_costs": [],
      "expected_gates": []
    },
    {
      "id": "B06_swebench_reproduction",
      "proposal": "Liu SWE-Bench reproduction failures: 49%→31% on independent reproduction. Variance across seeds and frameworks. Reproducibility audit.",
      "expected_bills": [6],
      "expected_meta_costs": [],
      "expected_gates": []
    },
    {
      "id": "B07_gerstgrasser_bommasani_audit",
      "proposal": "Gerstgrasser-Bommasani 2025: 0/73 frontier-LLM capability claims pass all six audits. Direct empty-space confirmation for Bill_7. Survives all six audits test fails universally.",
      "expected_bills": [7],
      "expected_meta_costs": [],
      "expected_gates": []
    },
    {
      "id": "B08_hodel_arc_dsl",
      "proposal": "Hodel ARC-AGI DSL+search baseline: 49% without reasoning. Strong-competitor baseline that LLM-only must beat. Specialized solver via program synthesis.",
      "expected_bills": [8],
      "expected_meta_costs": [],
      "expected_gates": []
    },
    {
      "id": "B09_arc_prize_held_out",
      "proposal": "ARC Prize 2024 + 2025 held-out construction methodology. Private set never seen by developers. Held-out construction transparency. Public-private gap ~20+ points.",
      "expected_bills": [9],
      "expected_meta_costs": [],
      "expected_gates": []
    },
    {
      "id": "B10_metr_hcast",
      "proposal": "METR HCAST 2025: third-party reproduction of frontier-LLM agentic-task length capability. Vendor-self-eval independence via independent third-party reproduction.",
      "expected_bills": [10],
      "expected_meta_costs": [],
      "expected_gates": []
    },
    {
      "id": "B11_mmlu_saturation",
      "proposal": "MMLU saturation regime: Llama 3.1 405B / DeepSeek-V3 / Qwen 2.5 / Mistral Large 2 all >90%. Headroom-exhausted. Saturated benchmark; capability evidence gone.",
      "expected_bills": [11],
      "expected_meta_costs": ["M2"],
      "expected_gates": []
    },
    {
      "id": "B11_humaneval_saturated",
      "proposal": "HumanEval saturation: frontier models >95%. Saturated benchmark; over 95% headroom-exhausted.",
      "expected_bills": [11],
      "expected_meta_costs": ["M2"],
      "expected_gates": []
    },
    {
      "id": "B12_snell_sutton",
      "proposal": "Snell-Sutton inference scaling: 4x test-time compute ≈ 14x parameters. Tokens-per-question reported. Inference-cost transparency. Compute-budget scaling laws.",
      "expected_bills": [12],
      "expected_meta_costs": [],
      "expected_gates": []
    },
    {
      "id": "B12_artificial_analysis_pareto",
      "proposal": "Cost-Capability Pareto (Artificial Analysis 2025-02): 100x cost spread for 5pp accuracy. Tokens-per-question reported. Inference-cost transparency.",
      "expected_bills": [12],
      "expected_meta_costs": [],
      "expected_gates": []
    },
    {
      "id": "B13_tokenizer_sensitivity",
      "proposal": "Tokenizer sensitivity audit: cross-tokenizer benchmark variation. Markdown vs plain prompt formatting. Special-token handling.",
      "expected_bills": [13],
      "expected_meta_costs": [],
      "expected_gates": []
    },
    {
      "id": "B14_dubois_hashimoto",
      "proposal": "Dubois-Hashimoto MMLU↔MMLU-Pro r=0.78 cross-benchmark transfer audit. Below the r≥0.95 threshold for clean transfer. Cousin-benchmark transfer fails.",
      "expected_bills": [14],
      "expected_meta_costs": [],
      "expected_gates": []
    },
    {
      "id": "B14_math_frontiermath",
      "proposal": "MATH↔FrontierMath cross-benchmark transfer test: r=0.18. Cross-benchmark transfer fails dramatically. CAIS knowledge-vs-reasoning split.",
      "expected_bills": [14],
      "expected_meta_costs": [],
      "expected_gates": []
    },
    {
      "id": "B15_inverse_scaling_prize",
      "proposal": "Inverse Scaling Prize 2024 follow-on (McKenzie 2024): 7-8/11 still inverse-scale at frontier. Anti-scale degradation reported. Scaling-law-violation audit.",
      "expected_bills": [15],
      "expected_meta_costs": [],
      "expected_gates": []
    },
    {
      "id": "B15_emergence_mirage",
      "proposal": "Schaeffer emergence-as-mirage line: capability emergence claims are metric-induced. Metric-choice-induced emergence at scale. Saphra-Belinkov critique.",
      "expected_bills": [15],
      "expected_meta_costs": [],
      "expected_gates": []
    },
    {
      "id": "B16_o1_system_card",
      "proposal": "OpenAI o1 System Card December 2024: test-time tree-search baked in. Reasoning model with hidden search-time component. Chain-of-thought with internal reasoning trace.",
      "expected_bills": [16],
      "expected_meta_costs": [],
      "expected_gates": []
    },
    {
      "id": "B16_alphacode2",
      "proposal": "AlphaCode-2 Codeforces: ~17x raw-vs-search-tree gap. Sample-and-filter MCTS. Test-time tree search at frontier scale.",
      "expected_bills": [16],
      "expected_meta_costs": [],
      "expected_gates": []
    },
    {
      "id": "B17_o3_arc_agi_dispute",
      "proposal": "OpenAI o3 ARC-AGI 75.7% (Dec 2024) → ARC-AGI-2 audit drops to 5-10% (Mar 2025). Held-out frontier audit on highest-stakes benchmark. Iterative reframing.",
      "expected_bills": [17],
      "expected_meta_costs": [],
      "expected_gates": []
    },
    {
      "id": "B17_frontiermath_held_out",
      "proposal": "FrontierMath held-out re-evaluation: o3 25.2% → 14.5% on Epoch held-out audit set. Smoking gun for ~10pp contamination inflation. Held-out frontier audit.",
      "expected_bills": [17],
      "expected_meta_costs": [],
      "expected_gates": []
    },
    {
      "id": "B18_arc_v1_v2_reframing",
      "proposal": "ARC-AGI v1 → v2 iterative reframing March 2025. Anti-saturation benchmark construction. Held-out by design private set; community-side response to vendor saturation.",
      "expected_bills": [18],
      "expected_meta_costs": [],
      "expected_gates": []
    },
    {
      "id": "B18_livecodebench_refresh",
      "proposal": "LiveCodeBench monthly refresh: anti-contamination by construction. Monthly refresh of problem set; date-window quantifies contamination. Anti-saturation benchmark construction.",
      "expected_bills": [18],
      "expected_meta_costs": [],
      "expected_gates": []
    },
    {
      "id": "B18_arc_prize_held_out_design",
      "proposal": "ARC Prize private set held-out-by-design. Anti-saturation benchmark construction. Public-private gap ~20+ points; never seen by developers.",
      "expected_bills": [18],
      "expected_meta_costs": [],
      "expected_gates": []
    },
    {
      "id": "B19_anand_half_life",
      "proposal": "Anand vendor-claim half-life median 73 days. Capability-claim invalidated within ~3 months. Temporal-trajectory audit. Vendor-claim half-life tracking.",
      "expected_bills": [19],
      "expected_meta_costs": [],
      "expected_gates": []
    },
    {
      "id": "B19_metr_doubling",
      "proposal": "METR HCAST 7-month doubling-time for agentic-task length capability. Temporal-trajectory audit. Doubling-time as policy-relevant signal.",
      "expected_bills": [19],
      "expected_meta_costs": [],
      "expected_gates": []
    },
    {
      "id": "M1_humaneval_only",
      "proposal": "Capability claim only on saturated HumanEval at >95%. Toy-benchmark only; no longer carries capability signal at frontier scale.",
      "expected_bills": [],
      "expected_meta_costs": ["M1"],
      "expected_gates": []
    },
    {
      "id": "M2_saturation_regime",
      "proposal": "Saturation regime: claim at >95% on MMLU. Headroom-exhausted. Saturated benchmark.",
      "expected_bills": [],
      "expected_meta_costs": ["M2"],
      "expected_gates": []
    },
    {
      "id": "M3_single_template",
      "proposal": "Single-prompt-template score on MMLU. Unfalsifiable. No robustness audit. One-shot score reported with no audit.",
      "expected_bills": [],
      "expected_meta_costs": ["M3"],
      "expected_gates": []
    },
    {
      "id": "M4_per_task_harness",
      "proposal": "Restricted-eval-protocol with custom prompting per task. Per-task harness tuned for each benchmark. Doesn't generalize.",
      "expected_bills": [],
      "expected_meta_costs": ["M4"],
      "expected_gates": []
    },
    {
      "id": "M5_vendor_internal_compute",
      "proposal": "Vendor-internal eval compute infrastructure required. Compute-budget-conditional. Only OpenAI can reproduce. Frontier-lab-exclusive infrastructure.",
      "expected_bills": [],
      "expected_meta_costs": ["M5"],
      "expected_gates": []
    },
    {
      "id": "M6_specific_scaffolding",
      "proposal": "Implementation-specific scaffolding code. Specific scaffolding implementation. Does not generalize to alternative scaffolding.",
      "expected_bills": [],
      "expected_meta_costs": ["M6"],
      "expected_gates": []
    },
    {
      "id": "G1_new_benchmark_methodology",
      "proposal": "Benchmark proposal paper introducing new evaluation protocol. Eval-protocol paper. No frontier-LLM capability claim. Methodology paper.",
      "expected_bills": [],
      "expected_meta_costs": [],
      "expected_gates": ["G1"]
    },
    {
      "id": "G2_negative_result_mmlu",
      "proposal": "Negative-result paper demonstrating MMLU contamination dispute. Audit report. Rebuttal paper. Contamination dispute.",
      "expected_bills": [],
      "expected_meta_costs": [],
      "expected_gates": ["G2"]
    },
    {
      "id": "G3_theoretical_construction",
      "proposal": "Theoretical-construction paper proves a theorem about scaling-law-prediction of capability emergence. No empirical capability claim.",
      "expected_bills": [],
      "expected_meta_costs": [],
      "expected_gates": ["G3"]
    },
    {
      "id": "DUAL_B5_B10_o3",
      "proposal": "OpenAI o3 December 2024 FrontierMath 25.2% claim. Vendor funded benchmark. Vendor-self-eval independence violated. Selection-bias audit triggered.",
      "expected_bills": [5, 10],
      "expected_meta_costs": [],
      "expected_gates": []
    },
    {
      "id": "DUAL_B16_M5_o1",
      "proposal": "o1 System Card with test-time tree-search baked in. Reasoning model. Compute-conditional Anthropic-scale infrastructure required.",
      "expected_bills": [16],
      "expected_meta_costs": ["M5"],
      "expected_gates": []
    },
    {
      "id": "DUAL_B11_M2_humaneval",
      "proposal": "HumanEval >95% saturation regime. Saturated benchmark. Headroom-exhausted. Saturation pattern audit fires.",
      "expected_bills": [11],
      "expected_meta_costs": ["M2"],
      "expected_gates": []
    },
    {
      "id": "DUAL_B17_B18_arc",
      "proposal": "ARC Prize private set held-out frontier audit on highest-stakes benchmark. Anti-saturation benchmark construction. Held-out-by-design private set.",
      "expected_bills": [17, 18],
      "expected_meta_costs": [],
      "expected_gates": []
    },
    {
      "id": "DUAL_B2_B12_bestofN",
      "proposal": "Best-of-N (Brown 2024 Large Language Monkeys): 1000x compute, Llama-7B@1000 ≈ Claude-3@1. Harness-engineering audit. Tokens-per-question inference-cost transparency.",
      "expected_bills": [2, 12],
      "expected_meta_costs": [],
      "expected_gates": []
    },
    {
      "id": "DUAL_B6_B10_metr",
      "proposal": "METR HCAST third-party reproduction with explicit reproducibility audit and vendor-self-eval independence. Independent third-party reproduction.",
      "expected_bills": [6, 10],
      "expected_meta_costs": [],
      "expected_gates": []
    },
    {
      "id": "B1_carlini_stealing",
      "proposal": "Carlini Stealing part of a production language model 2024 ICML best paper. Closed-API contamination threat surface. Membership-inference attack on benchmark.",
      "expected_bills": [1],
      "expected_meta_costs": [],
      "expected_gates": []
    },
    {
      "id": "B2_tree_of_thought",
      "proposal": "Tree-of-Thought (Yao 2023): Game of 24 4% to 74% with same model. Chain-of-thought with search. Harness moves the score.",
      "expected_bills": [2],
      "expected_meta_costs": [],
      "expected_gates": []
    },
    {
      "id": "B11_livebench_gap",
      "proposal": "LiveBench refresh quarterly shows 34.8% absolute gap vs fixed MMLU. Saturation = memorization. Saturation pattern audit.",
      "expected_bills": [11],
      "expected_meta_costs": [],
      "expected_gates": []
    },
    {
      "id": "B17_hle_cais",
      "proposal": "HLE CAIS independent replication: 8.5% inflation on Humanity's Last Exam. Held-out frontier audit. Third-party held-out audit.",
      "expected_bills": [17],
      "expected_meta_costs": [],
      "expected_gates": []
    },
    {
      "id": "B16_mitchell_arc_decomposition",
      "proposal": "Mitchell-Cosma ARC-AGI compute decomposition: 75% search, 13% raw model. Test-time tree-search decomposition. Raw-model + search + aggregation ablation.",
      "expected_bills": [16],
      "expected_meta_costs": [],
      "expected_gates": []
    },
    {
      "id": "B7_anand_47",
      "proposal": "Anand-Tirumala forensic audits: 32/47 fail at least one of the six audits. Frontier-LLM capability claim survival test fails. Direct Bill_7 evidence.",
      "expected_bills": [7],
      "expected_meta_costs": [],
      "expected_gates": []
    },
    {
      "id": "B19_vendor_inflation",
      "proposal": "METR/AISI/Apollo mean vendor inflation 6.8-9.1%. Vendor-claim half-life median 73 days. Temporal-trajectory audit.",
      "expected_bills": [19],
      "expected_meta_costs": [],
      "expected_gates": []
    },
    {
      "id": "B17_iterative_reframing",
      "proposal": "ARC-AGI v1→v2→v3 iterative-reframing pattern. Each ~6-9 months: frontier breakthrough on public eval → held-out audit → benchmark reframing. Held-out frontier audit on ARC-AGI / FrontierMath / HLE / GPQA-Diamond.",
      "expected_bills": [17, 18],
      "expected_meta_costs": [],
      "expected_gates": []
    },
    {
      "id": "B3_sympy_tool",
      "proposal": "Tool-augmented FrontierMath: sympy at test-time. With-tool 35-50% solved-problem rate; without-tool baseline reported. Tool-exfiltration audit.",
      "expected_bills": [3],
      "expected_meta_costs": [],
      "expected_gates": []
    },
    {
      "id": "B11_yang_tatsunori",
      "proposal": "Yang-Tatsunori 2024: mean 16-month saturation cycle from benchmark release to >95%. Saturation pattern. Headroom-exhausted within 12-18 months.",
      "expected_bills": [11],
      "expected_meta_costs": [],
      "expected_gates": []
    },
    {
      "id": "B4_sclar_choi",
      "proposal": "Sclar-Choi: 76% relative range across plausibly-equivalent prompt templates on Llama-2. Format brittleness. Question-format permutation.",
      "expected_bills": [4],
      "expected_meta_costs": [],
      "expected_gates": []
    },
    {
      "id": "B14_concept_arc_transfer",
      "proposal": "ARC-AGI↔ConceptARC cross-benchmark transfer test. Cousin-benchmark transfer audit. Cross-benchmark transfer.",
      "expected_bills": [14],
      "expected_meta_costs": [],
      "expected_gates": []
    }
  ]
}
