{
  "_comment": "Hand-curated benchmark cases for the Compute Governance Aiwiki classifier (v0.2). 61 cases. Lock target: gate_accuracy=1.000 and bill_recall=1.000.",
  "version": "0.2",
  "cases": [
    {"id": "B01_sevilla_decoupling", "proposal": "Sevilla-Heim Epoch AI compute-vs-capability decoupling analysis: capability is not a strict function of training FLOPs. Cross-vendor compute-vs-capability scatter shows 0.42 Pearson r.", "expected_bills": [1], "expected_meta_costs": [], "expected_gates": []},
    {"id": "B02_pilz_heim_distillation", "proposal": "Pilz-Heim Apr 2025: distillation-circumvention canonical paper. 10x compute reduction, 85% capability retention. Distilled model trained on threshold-exceeding output achieves equivalent capability.", "expected_bills": [2], "expected_meta_costs": [], "expected_gates": []},
    {"id": "B02_deepseek_r1_distill", "proposal": "DeepSeek-R1 distillation cascade: R1 → R1-Distill-Qwen-7B at 58x training-hours reduction. Distillation circumvention.", "expected_bills": [2], "expected_meta_costs": [], "expected_gates": []},
    {"id": "B02_carlini_extraction", "proposal": "Carlini Stealing part of a production language model 2024. API-extraction attack distillation. Output-distillation vs probability-distillation pipeline.", "expected_bills": [2], "expected_meta_costs": [], "expected_gates": []},
    {"id": "B03_snell_sutton", "proposal": "Snell-Sutton inference-time compute scaling laws Aug 2024. 4× test-time ≈ 14× parameters. Test-time compute shadow audit demonstrates training-FLOPs threshold is incomplete.", "expected_bills": [3], "expected_meta_costs": [], "expected_gates": []},
    {"id": "B03_o3_arc_agi", "proposal": "OpenAI o3 ARC-AGI: 172× inference-compute swing on same weights. Test-time tree-search reasoning achieves capability gains independent of training FLOPs.", "expected_bills": [3], "expected_meta_costs": [], "expected_gates": []},
    {"id": "B04_deepseek_v3_disclosure", "proposal": "DeepSeek V3 cost-disclosure controversy Dec 2024. Vendor-disclosed FLOPs vs Epoch AI reconstruction discrepancy. Training-FLOPs measurement transparency: 5-20x cost discrepancy.", "expected_bills": [4], "expected_meta_costs": [], "expected_gates": []},
    {"id": "B04_epoch_reconstruction", "proposal": "Epoch AI reconstruction methodology vs vendor-disclosed FLOPs. Median 1.7x discrepancy. 95th percentile 3.2x. Training-FLOPs measurement transparency.", "expected_bills": [4], "expected_meta_costs": [], "expected_gates": []},
    {"id": "B04_llama4_maverick", "proposal": "Llama-4 Maverick Apr 2025: 1.4x Meta-Epoch discrepancy. LMArena variant evidence. Behemoth crossing US EO 10^26 threshold. Training-FLOPs measurement transparency.", "expected_bills": [4], "expected_meta_costs": [], "expected_gates": []},
    {"id": "B05_distributed_training", "proposal": "OpenAI Horizon distributed-training compute aggregation. Cloud-spanning training across multi-region multi-vendor compute pooling. Cross-border training analysis.", "expected_bills": [5], "expected_meta_costs": [], "expected_gates": []},
    {"id": "B06_pilz_heim_hardware_cost", "proposal": "Pilz-Heim hardware-cost projection: 2024 frontier $50M-$200M → 2026 $10M-$40M. Compute-cost-as-deterrent fails. Regulatory threshold cost not exceeding capability-yield.", "expected_bills": [6], "expected_meta_costs": [], "expected_gates": []},
    {"id": "B07_eu_ai_act_threshold", "proposal": "EU AI Act 10^25 FLOPs systemic-risk threshold Article 51-55. Compute-governance claim survives all six audits framework attempt. Closest historic candidate. Distillation circumvention not addressed.", "expected_bills": [7], "expected_meta_costs": [], "expected_gates": []},
    {"id": "B08_capability_eval_alternative", "proposal": "Strong-baseline regulatory comparison: capability-eval gate vs FLOPs-tier gate. Anthropic ASL capability-tier alternative. UK AISI evaluation-first framework.", "expected_bills": [8], "expected_meta_costs": [], "expected_gates": []},
    {"id": "B09_heim_threshold_design", "proposal": "Heim threshold-design analysis: FLOPs measurement methodology + capability-tier mapping + review cycle disclosed. Heim-Rahman-Pilz-Shavit audit-ready disclosure framework.", "expected_bills": [9], "expected_meta_costs": [], "expected_gates": []},
    {"id": "B10_epoch_reconciliation", "proposal": "Epoch AI reconciliation of vendor-self-disclosed FLOPs. BIS audit + EU AI Office audit. Independent third-party FLOPs reconciliation.", "expected_bills": [10], "expected_meta_costs": [], "expected_gates": []},
    {"id": "B11_distillation_resistant_empty", "proposal": "Halevy-Heim-Pilz Jun 2025: 14/14 capabilities transfer to distilled cousin. Distillation-resistant capability claim empty. Compute ratio audit.", "expected_bills": [11], "expected_meta_costs": [], "expected_gates": []},
    {"id": "B12_inference_cost_transparency", "proposal": "DeepSeek R1 inference-cost transparency: 5/5 disclosure dimensions. Tokens-per-question reported. Search-tree size disclosed. Compute-budget transparency.", "expected_bills": [12], "expected_meta_costs": [], "expected_gates": []},
    {"id": "B13_threshold_revision", "proposal": "EU AI Act ±0.5 OOM revision schedule via commission delegated act. Threshold-revision audit. Capability moves goalposts.", "expected_bills": [13], "expected_meta_costs": [], "expected_gates": []},
    {"id": "B14_eu_us_divergence", "proposal": "Cross-jurisdiction harmonization audit: EU 10^25 vs US 10^26 vs UK pure capability-eval vs CCP algorithm-filing vs SK 10^24.5. Active divergence across 1.5+ OOM. AI Action Summit Paris regression.", "expected_bills": [14], "expected_meta_costs": [], "expected_gates": []},
    {"id": "B14_hammond_aarne_anderljung", "proposal": "Hammond-Aarne-Anderljung Jan 2025: 6 distinct methodology families. FLOPs-tier (EU+US+SK), capability-tier (UK+Canada+Anthropic), deployment-tier (China+India+Japan), risk-tier (Brazil+Australia+Singapore+Taiwan). Cross-jurisdiction harmonization fails.", "expected_bills": [14], "expected_meta_costs": [], "expected_gates": []},
    {"id": "B15_bis_export_control", "proposal": "BIS October 2022 + October 2023 + December 2024 export-control rule chain. H100/H200/B100 controls. CSET 2024: 30K-100K H100-equivalents annually via smuggling. Hardware-export-control bypass audit.", "expected_bills": [15], "expected_meta_costs": [], "expected_gates": []},
    {"id": "B16_pair_decomposition", "proposal": "Test-time tree-search compute decomposition. Search-budget ablation. Raw-model + search + aggregation components decomposed.", "expected_bills": [16], "expected_meta_costs": [], "expected_gates": []},
    {"id": "B17_threshold_purpose", "proposal": "Anderljung threshold-purpose audit Nov 2025: regulatory threshold actually deters? EU AI Act 10^25 deters? Compute threshold gates capability tier? Threshold paradigm fails design purpose.", "expected_bills": [17], "expected_meta_costs": [], "expected_gates": []},
    {"id": "B18_eo_14148_rescission", "proposal": "EO 14148 rescinded EO 14110 in Jan 2025; EO 14179 directed review of actions taken under EO 14110. Threshold-policy lifetime: 15-month. America's AI Action Plan trajectory. Threshold-revision pressure.", "expected_bills": [18], "expected_meta_costs": [], "expected_gates": []},
    {"id": "B18_bis_diffusion_framework", "proposal": "BIS Diffusion Framework Jan→May 2025: 4-month threshold-policy lifetime. Shortest documented threshold across all aiwikis.", "expected_bills": [18], "expected_meta_costs": [], "expected_gates": []},
    {"id": "B19_distilled_cousin_half_life", "proposal": "Frontier-to-distilled-cousin half-life audit: internal median row pending public-source-card verification. o1-preview → Sky-T1 and o1-preview → R1-Distill are row-level source-card targets.", "expected_bills": [19], "expected_meta_costs": [], "expected_gates": []},
    {"id": "B19_distillation_half_life", "proposal": "Distillation half-life: GPT-4o → Phi-4 = 7 months. Median distilled-cousin half-life. Compute-governance temporal-trajectory audit.", "expected_bills": [19], "expected_meta_costs": [], "expected_gates": []},
    {"id": "M1_pre2024_foundational", "proposal": "Sevilla 2022 compute-trends paper. Pre-2024 compute-trends only. Foundational anchor only; doesn't engage 2024-2026 regulatory regime.", "expected_bills": [], "expected_meta_costs": ["M1"], "expected_gates": []},
    {"id": "M2_single_vendor", "proposal": "Single-vendor compute analysis on Anthropic alone. No cross-vendor analysis. No cross-jurisdiction comparison.", "expected_bills": [], "expected_meta_costs": ["M2"], "expected_gates": []},
    {"id": "M3_theoretical_only", "proposal": "Theoretical-only scaling-law derivation. Non-empirical compute analysis. No compute-governance claim.", "expected_bills": [], "expected_meta_costs": ["M3"], "expected_gates": []},
    {"id": "M4_pretraining_only", "proposal": "Restricted training-paradigm analysis: pre-training only. Ignores fine-tuning, RLHF, inference-time compute.", "expected_bills": [], "expected_meta_costs": ["M4"], "expected_gates": []},
    {"id": "M5_vendor_internal", "proposal": "Vendor-internal compute infrastructure analysis. Confidential vendor-disclosed only. Not independently reproducible.", "expected_bills": [], "expected_meta_costs": ["M5"], "expected_gates": []},
    {"id": "M6_specific_setup", "proposal": "Implementation-specific cloud setup. Specific hardware setup that does not generalize to alternative cloud configurations.", "expected_bills": [], "expected_meta_costs": ["M6"], "expected_gates": []},
    {"id": "G1_methodology_paper", "proposal": "New FLOPs-measurement methodology paper. Compute-trends analysis. No governance claim. Methodology paper.", "expected_bills": [], "expected_meta_costs": [], "expected_gates": ["G1"]},
    {"id": "G2_negative_result", "proposal": "Negative-result paper on compute threshold. Threshold fails. Threshold obsolete. Threshold circumvented. Rebuttal paper. Critique paper.", "expected_bills": [], "expected_meta_costs": [], "expected_gates": ["G2"]},
    {"id": "G3_theoretical_construction", "proposal": "Theoretical-construction paper. Proves a theorem about compute-vs-capability. No empirical compute-governance claim.", "expected_bills": [], "expected_meta_costs": [], "expected_gates": ["G3"]},
    {"id": "DUAL_B2_B11_pilz", "proposal": "Pilz-Heim Apr 2025: distillation circumvention demonstrated. Distillation-resistant capability gap not found. 5x compute reduction matches capability.", "expected_bills": [2, 11], "expected_meta_costs": [], "expected_gates": []},
    {"id": "DUAL_B3_B16_o1", "proposal": "OpenAI o1 system card test-time tree-search compute decomposition. Reasoning model with hidden search-time component. Inference-time compute scaling.", "expected_bills": [3, 16], "expected_meta_costs": [], "expected_gates": []},
    {"id": "DUAL_B4_B10_deepseek", "proposal": "DeepSeek V3 vendor-disclosed FLOPs vs Epoch AI reconstruction. Vendor-Epoch reconciliation discrepancy. Training-FLOPs measurement transparency + vendor-self-disclosed independence.", "expected_bills": [4, 10], "expected_meta_costs": [], "expected_gates": []},
    {"id": "DUAL_B14_B17_eu_ai_act", "proposal": "EU AI Act 10^25 systemic-risk threshold cross-jurisdiction harmonization fails. Threshold achieves stated regulatory purpose audit fails.", "expected_bills": [14, 17], "expected_meta_costs": [], "expected_gates": []},
    {"id": "DUAL_B15_B6_pilz", "proposal": "BIS export-control bypass: H100 smuggling + cloud-arbitrage. Pilz-Heim hardware-cost projection: deterrent failure.", "expected_bills": [6, 15], "expected_meta_costs": [], "expected_gates": []},
    {"id": "B7_holistic_framework", "proposal": "Compute-governance claim survives all six audits framework. EU AI Act 10^25 closest candidate. Fails Bill_2 + Bill_3 + Bill_4 + Bill_14 + Bill_17 explicitly.", "expected_bills": [7], "expected_meta_costs": [], "expected_gates": []},
    {"id": "B11_smol_lm2", "proposal": "Smol-LM2 1.7B distilled at <10^22 FLOPs. 1000x below threshold. MMLU 0.62. Distillation-resistant capability claim empty.", "expected_bills": [11], "expected_meta_costs": [], "expected_gates": []},
    {"id": "B14_paris_summit", "proposal": "AI Action Summit Paris Feb 2025: US and UK refused to sign. Cross-jurisdiction harmonization regressed. AISI network proliferation without methodology harmonization.", "expected_bills": [14], "expected_meta_costs": [], "expected_gates": []},
    {"id": "B17_anderljung_threshold", "proposal": "Anderljung threshold-purpose audit: regulatory threshold actually deters capability development at that tier? Compute threshold achieves stated regulatory purpose audit fails.", "expected_bills": [17], "expected_meta_costs": [], "expected_gates": []},
    {"id": "B18_threshold_lifetime", "proposal": "Threshold-policy lifetime: EO 14148 rescinds EO 14110; EO 14179 reviews actions under EO 14110; America's AI Action Plan trajectory follows. Threshold-revision pressure regularly surfaces.", "expected_bills": [18], "expected_meta_costs": [], "expected_gates": []},
    {"id": "B19_o1_distill_row", "proposal": "o1-preview to Sky-T1 and o1-preview to R1-Distill are distilled-cousin row-level source-card targets. Frontier-to-distilled-cousin half-life audit pending public-source-card verification.", "expected_bills": [19], "expected_meta_costs": [], "expected_gates": []},
    {"id": "B10_western_china_asymmetry", "proposal": "Vendor-self-disclosed FLOPs Western-China asymmetry row is pending source-card manifest backing for each vendor observation. Vendor-self-disclosed independence audit.", "expected_bills": [10], "expected_meta_costs": [], "expected_gates": []},
    {"id": "B15_h800_h20_cloud", "proposal": "BIS export-control bypass whack-a-mole: H800 → H20 → cloud-rental. Each tightening generates new bypass within 3-9 months.", "expected_bills": [15], "expected_meta_costs": [], "expected_gates": []},
    {"id": "B12_o1_o3_o4_opaque", "proposal": "OpenAI o1/o3/o4 inference-cost transparency: 0/5 disclosure dimensions. Fully opaque. Tokens-per-question not disclosed.", "expected_bills": [12], "expected_meta_costs": [], "expected_gates": []},
    {"id": "B3_compute_optimal_test_time", "proposal": "Compute-Optimal Test-Time (Tsinghua): 1B + 256-sample search > 405B baseline. Test-time compute shadow demonstrated.", "expected_bills": [3], "expected_meta_costs": [], "expected_gates": []},
    {"id": "B2_sky_t1_dollars", "proposal": "Sky-T1: $450 academic compute matches o1-preview reasoning. Distillation cost ratio extreme. Distillation circumvention demonstrated.", "expected_bills": [2], "expected_meta_costs": [], "expected_gates": []},
    {"id": "B19_phi4_7mo", "proposal": "GPT-4o to Phi-4 distillation half-life: 7 months. Frontier-to-distilled-cousin median half-life. Distillation-half-life audit.", "expected_bills": [19], "expected_meta_costs": [], "expected_gates": []},
    {"id": "B14_six_methodology_families", "proposal": "Cross-jurisdiction harmonization across 6 methodology families: FLOPs-tier vs capability-tier vs deployment-tier vs risk-tier. Hammond-Aarne-Anderljung 2025.", "expected_bills": [14], "expected_meta_costs": [], "expected_gates": []},
    {"id": "B7_zero_pass_six", "proposal": "0/N compute-governance claims pass all six audits. Frontier compute-governance survival empty.", "expected_bills": [7], "expected_meta_costs": [], "expected_gates": []},
    {"id": "B17_eo_rescission", "proposal": "EO 14148 rescinded EO 14110 in Jan 2025; EO 14179 directed follow-on review. Threshold paradigm fails design purpose: 10^26 FLOPs threshold replaced. Compute threshold achieves stated purpose audit fails.", "expected_bills": [17, 18], "expected_meta_costs": [], "expected_gates": []},
    {"id": "B4_meta_llama4", "proposal": "Llama-4 Behemoth crossing US EO 10^26 threshold. Vendor-Epoch reconstruction discrepancy 1.4x for Meta-Epoch. Training-FLOPs measurement transparency.", "expected_bills": [4], "expected_meta_costs": [], "expected_gates": []},
    {"id": "B11_halevy_heim_pilz", "proposal": "Halevy-Heim-Pilz Jun 2025 tested 14 capabilities for distillation-resistance. None found. Distillation-resistant capability gap empty.", "expected_bills": [11], "expected_meta_costs": [], "expected_gates": []},
    {"id": "B2_r1_distill_cascade", "proposal": "DeepSeek-R1 distillation cascade: R1-Distill-Qwen-7B at 1000x below threshold reaching 85-95% of frontier capability.", "expected_bills": [2], "expected_meta_costs": [], "expected_gates": []},
    {"id": "B18_eu_delegated_act", "proposal": "EU AI Act delegated-act draft Feb 2026: cites Pilz-Heim as evidence for proposed 5e24 tightening. Threshold-revision schedule active.", "expected_bills": [13, 18], "expected_meta_costs": [], "expected_gates": []},
    {"id": "B14_ai_action_summit", "proposal": "AI Action Summit Paris February 2025: cross-jurisdiction compute commitments regressed. US and UK refused to sign final declaration.", "expected_bills": [14], "expected_meta_costs": [], "expected_gates": []}
  ]
}
