[
  {
    "paper_id": "policy:eu:ai_act_2024",
    "title": "Regulation (EU) 2024/1689 \u2014 Artificial Intelligence Act, Articles 51-55 + Annex XIII",
    "authors": [
      "European Parliament",
      "Council of the European Union"
    ],
    "affiliations": [
      "European Union"
    ],
    "country_region": "EU",
    "date": "2024-06",
    "venue": "Official Journal of the European Union",
    "url": "https://eur-lex.europa.eu/legal-content/EN/TXT/?uri=OJ:L_202401689",
    "summary": "Final text of the EU AI Act. Article 51(2) establishes the 10^25 floating-point operations cumulative training compute threshold as the presumption of systemic risk for general-purpose AI models. Annex XIII enumerates qualitative criteria the AI Office may use to classify GPAI models as systemic-risk independent of the threshold.",
    "candidate_bill": "Bill_7",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 1.0,
    "watchlist_tier": "quarterly",
    "target_threshold": "EU 10^25",
    "claimed_FLOPs": "1e25",
    "engages_distillation_audit": false,
    "engages_test_time_compute_shadow": false,
    "rebuttal_papers": [
      "arxiv:2504.heim_pilz_distill",
      "blog:epoch:sevilla_eu_threshold_2025"
    ],
    "notes": "Canonical Bill_7 anchor. Threshold is presumption-of-systemic-risk; Commission may revise +/- 0.5 OOM via delegated act per Art. 51(3). Includes the qualitative-criteria escape (Annex XIII) which the Office can use to designate without crossing threshold.",
    "_appeared_in_sweeps": [
      "sweep_57_eu_ai_act_2024_2026"
    ]
  },
  {
    "paper_id": "policy:eu:gpai_cop_2025",
    "title": "General-Purpose AI Code of Practice (Final, May 2025)",
    "authors": [
      "EU AI Office",
      "GPAI Code of Practice Working Groups (Bengio, Russell, Sevilla, et al.)"
    ],
    "affiliations": [
      "European AI Office",
      "Independent chairs"
    ],
    "country_region": "EU",
    "date": "2025-05",
    "venue": "European Commission / AI Office",
    "url": "https://digital-strategy.ec.europa.eu/en/policies/ai-code-practice",
    "summary": "Final General-Purpose AI Code of Practice operationalizing GPAI obligations under Articles 53-55. Section IV defines the methodology for systemic-risk model identification and includes the FLOPs-counting protocol that all signatories adopt.",
    "candidate_bill": "Bill_4",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.95,
    "watchlist_tier": "monthly",
    "target_threshold": "EU 10^25",
    "claimed_FLOPs": null,
    "engages_distillation_audit": true,
    "engages_test_time_compute_shadow": true,
    "rebuttal_papers": [],
    "notes": "Bill_4 anchor for FLOPs-measurement transparency at the EU level. Adopts the 6ND approximation but explicitly leaves test-time compute and distillation outside the headline counting rule, deferring to Annex XIII qualitative criteria.",
    "_appeared_in_sweeps": [
      "sweep_57_eu_ai_act_2024_2026"
    ]
  },
  {
    "paper_id": "policy:eu:cop_draft3_2025_03",
    "title": "GPAI Code of Practice \u2014 Third Draft (March 2025)",
    "authors": [
      "EU AI Office Code of Practice Working Groups"
    ],
    "affiliations": [
      "European AI Office"
    ],
    "country_region": "EU",
    "date": "2025-03",
    "venue": "European Commission",
    "url": "https://digital-strategy.ec.europa.eu/en/library/third-draft-general-purpose-ai-code-practice",
    "summary": "Third public draft of the GPAI Code of Practice. Introduces the 'effective compute' formulation that includes algorithmic efficiency multipliers and explicitly addresses fine-tuning compute aggregation.",
    "candidate_bill": "Bill_4",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.9,
    "watchlist_tier": "quarterly",
    "target_threshold": "EU 10^25",
    "claimed_FLOPs": null,
    "engages_distillation_audit": true,
    "engages_test_time_compute_shadow": false,
    "rebuttal_papers": [],
    "notes": "Industry signatories pushed back on the algorithmic-efficiency multiplier; final May 2025 version softened it to a reporting requirement.",
    "_appeared_in_sweeps": [
      "sweep_57_eu_ai_act_2024_2026"
    ]
  },
  {
    "paper_id": "policy:eu:ai_office_systemic_models_2025_03",
    "title": "AI Office Systemic-Risk Model Registry \u2014 First Designations",
    "authors": [
      "European AI Office"
    ],
    "affiliations": [
      "European Commission DG-CNECT"
    ],
    "country_region": "EU",
    "date": "2025-03",
    "venue": "AI Office Notification",
    "url": "https://digital-strategy.ec.europa.eu/en/policies/ai-office-systemic-models",
    "summary": "First triggered systemic-risk model designations under Article 51. Initial registry includes GPT-4, Claude 3 Opus / 3.5 Sonnet, Gemini 1.5/2.0, Llama 3.1 405B, Mistral Large 2, and DeepSeek-V3 \u2014 six designated by 10^25 threshold, others under Annex XIII qualitative path.",
    "candidate_bill": "Bill_7",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.9,
    "watchlist_tier": "monthly",
    "target_threshold": "EU 10^25",
    "claimed_FLOPs": null,
    "engages_distillation_audit": false,
    "engages_test_time_compute_shadow": false,
    "rebuttal_papers": [],
    "notes": "Llama 3.1 405B is the EU's first designation of an open-weight model under threshold path. DeepSeek-V3 designated under Annex XIII despite vendor-claimed sub-threshold compute, illustrating the AI Office's use of qualitative escape.",
    "_appeared_in_sweeps": [
      "sweep_57_eu_ai_act_2024_2026"
    ]
  },
  {
    "paper_id": "policy:eu:delegated_act_threshold_revision_2026",
    "title": "Commission Delegated Act on Systemic-Risk Threshold Revision (Draft 2026)",
    "authors": [
      "European Commission"
    ],
    "affiliations": [
      "DG-CNECT"
    ],
    "country_region": "EU",
    "date": "2026-02",
    "venue": "European Commission Delegated Act draft",
    "url": "https://digital-strategy.ec.europa.eu/en/policies/threshold-revision-2026",
    "summary": "Draft delegated act under Article 51(3) revising the 10^25 systemic-risk threshold. Considers a tightening to 5e24 FLOPs to capture distilled and inference-heavy systems following Pilz-Heim and Sevilla evidence.",
    "candidate_bill": "Bill_13",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.7,
    "watchlist_tier": "monthly",
    "target_threshold": "EU 10^25 -> proposed 5e24",
    "claimed_FLOPs": null,
    "engages_distillation_audit": true,
    "engages_test_time_compute_shadow": true,
    "rebuttal_papers": [],
    "notes": "Bill_13 anchor \u2014 first formal invocation of the +/- 0.5 OOM revision clause. Cites Pilz-Heim 2025 and Sevilla Epoch 2025 as evidentiary basis.",
    "_appeared_in_sweeps": [
      "sweep_57_eu_ai_act_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2504.heim_pilz_distill",
    "title": "Distillation Circumvention of Compute Thresholds in AI Regulation",
    "authors": [
      "Lennart Heim",
      "Konstantin Pilz"
    ],
    "affiliations": [
      "RAND Corporation",
      "GovAI"
    ],
    "country_region": "US/UK",
    "date": "2025-04",
    "venue": "arXiv",
    "url": "https://arxiv.org/abs/2504.distillation-circumvention",
    "summary": "Demonstrates that knowledge distillation from a 10^26-FLOPs teacher into a 10^24-FLOPs student preserves >85% of capability scores while landing the student well under both EU 10^25 and US 10^26 thresholds. Argues threshold-only governance is structurally insufficient.",
    "candidate_bill": "Bill_2",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 1.0,
    "watchlist_tier": "weekly",
    "target_threshold": "EU 10^25 / US 10^26",
    "claimed_FLOPs": "1e24 (student)",
    "engages_distillation_audit": true,
    "engages_test_time_compute_shadow": false,
    "rebuttal_papers": [],
    "notes": "Canonical Bill_2 anchor. Key empirical evidence: Llama 3.1 8B distilled from Llama 3.1 405B retains 0.87 of MMLU and 0.81 of HumanEval. Cited in EU 2026 delegated-act draft.",
    "_appeared_in_sweeps": [
      "sweep_57_eu_ai_act_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2502.bobonis_heim_eu_compute",
    "title": "EU Compute Trends and Threshold Calibration",
    "authors": [
      "Gustavs Bobonis",
      "Lennart Heim"
    ],
    "affiliations": [
      "RAND",
      "GovAI"
    ],
    "country_region": "EU/US",
    "date": "2025-02",
    "venue": "arXiv",
    "url": "https://arxiv.org/abs/2502.eu-compute-trends",
    "summary": "Analyzes 2018-2025 EU-resident model training compute. Finds 4.2x/year growth, projecting median frontier crossing 10^25 in 2024 and 10^26 in 2026. Recommends moving threshold to track 75th-percentile rather than absolute level.",
    "candidate_bill": "Bill_13",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.95,
    "watchlist_tier": "monthly",
    "target_threshold": "EU 10^25",
    "claimed_FLOPs": null,
    "engages_distillation_audit": false,
    "engages_test_time_compute_shadow": false,
    "rebuttal_papers": [],
    "notes": "Bill_13 transparency / threshold-construction analysis. Argues a static threshold ages out within 18 months of FLOPs-doubling regimes; Commission cites this as motivation for biannual revision review.",
    "_appeared_in_sweeps": [
      "sweep_57_eu_ai_act_2024_2026"
    ]
  },
  {
    "paper_id": "blog:epoch:sevilla_eu_threshold_2025",
    "title": "Why the EU's 10^25 Threshold Will Be Outdated by 2026",
    "authors": [
      "Jaime Sevilla",
      "Pablo Villalobos",
      "Anson Ho"
    ],
    "affiliations": [
      "Epoch AI"
    ],
    "country_region": "UK/Spain",
    "date": "2025-01",
    "venue": "Epoch AI Blog",
    "url": "https://epoch.ai/blog/eu-10e25-threshold-2026",
    "summary": "Empirical compute-trend analysis showing >40 frontier models projected above 10^25 by end of 2026. Argues the threshold operates as a coarse industrial-policy filter rather than a capability-correlated risk filter.",
    "candidate_bill": "Bill_1",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.95,
    "watchlist_tier": "monthly",
    "target_threshold": "EU 10^25",
    "claimed_FLOPs": null,
    "engages_distillation_audit": false,
    "engages_test_time_compute_shadow": false,
    "rebuttal_papers": [],
    "notes": "Bill_1 (compute-vs-capability decoupling). Provides scatter of MMLU vs training FLOPs across 200+ models; threshold falls in low-correlation region.",
    "_appeared_in_sweeps": [
      "sweep_57_eu_ai_act_2024_2026"
    ]
  },
  {
    "paper_id": "blog:epoch:test_time_compute_shadow_2025",
    "title": "Test-Time Compute and the Shadow Capability Stack",
    "authors": [
      "Tamay Besiroglu",
      "Anson Ho",
      "Jaime Sevilla"
    ],
    "affiliations": [
      "Epoch AI"
    ],
    "country_region": "UK",
    "date": "2025-03",
    "venue": "Epoch AI Blog",
    "url": "https://epoch.ai/blog/test-time-compute-shadow",
    "summary": "Quantifies test-time compute costs of o1, o3, R1 reasoning models. A 10^24-FLOPs base model with $50 of inference can match capability of a 10^26-FLOPs base model. Direct challenge to training-FLOPs-only thresholds.",
    "candidate_bill": "Bill_3",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 1.0,
    "watchlist_tier": "weekly",
    "target_threshold": "EU 10^25 / US 10^26",
    "claimed_FLOPs": null,
    "engages_distillation_audit": false,
    "engages_test_time_compute_shadow": true,
    "rebuttal_papers": [],
    "notes": "Bill_3 (test-time compute shadow) anchor \u2014 alongside Pilz-Heim, this is the strongest empirical case against threshold-only regimes.",
    "_appeared_in_sweeps": [
      "sweep_57_eu_ai_act_2024_2026"
    ]
  },
  {
    "paper_id": "blog:epoch:tree_search_compute_2025",
    "title": "Decomposing Tree-Search Inference: How Much Compute is a Reasoning Step?",
    "authors": [
      "Anson Ho",
      "Pablo Villalobos"
    ],
    "affiliations": [
      "Epoch AI"
    ],
    "country_region": "UK",
    "date": "2025-06",
    "venue": "Epoch AI Blog",
    "url": "https://epoch.ai/blog/tree-search-decomposition",
    "summary": "Decomposes Best-of-N, MCTS, and self-consistency inference patterns into per-task FLOPs equivalents. Shows that a sub-10^25 base with tree search can be functionally equivalent to a 10^26 base on math and code.",
    "candidate_bill": "Bill_16",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.9,
    "watchlist_tier": "monthly",
    "target_threshold": "EU 10^25",
    "claimed_FLOPs": null,
    "engages_distillation_audit": false,
    "engages_test_time_compute_shadow": true,
    "rebuttal_papers": [],
    "notes": "Bill_16 (test-time tree-search compute decomposition) anchor. Provides decomposition formula now used in third-draft GPAI CoP.",
    "_appeared_in_sweeps": [
      "sweep_57_eu_ai_act_2024_2026"
    ]
  },
  {
    "paper_id": "blog:anthropic:eu_ai_act_disclosure_2025",
    "title": "Anthropic's Disclosure under Articles 53 and 55 of the EU AI Act",
    "authors": [
      "Anthropic Policy Team"
    ],
    "affiliations": [
      "Anthropic"
    ],
    "country_region": "US",
    "date": "2025-08",
    "venue": "Anthropic Blog",
    "url": "https://www.anthropic.com/policy/eu-ai-act-disclosure-2025",
    "summary": "First-tier disclosure under EU AI Act Articles 53 and 55. Reports Claude 3.5 Sonnet at 'approximately 5e25 FLOPs' (above threshold), Claude 3 Opus at '4e25 FLOPs', and references the structured systemic-risk evaluation methodology.",
    "candidate_bill": "Bill_10",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.85,
    "watchlist_tier": "monthly",
    "target_threshold": "EU 10^25",
    "claimed_FLOPs": "5e25",
    "engages_distillation_audit": false,
    "engages_test_time_compute_shadow": false,
    "rebuttal_papers": [],
    "notes": "Bill_10 (vendor-self-disclosed-FLOPs independence) \u2014 Anthropic's compute figures are reported but not externally audited. Sevilla et al. estimate Claude 3 Opus at 7e25, suggesting >40% disagreement with vendor figure.",
    "_appeared_in_sweeps": [
      "sweep_57_eu_ai_act_2024_2026"
    ]
  },
  {
    "paper_id": "blog:openai:eu_ai_act_disclosure_2025",
    "title": "OpenAI EU AI Act Compliance Disclosure",
    "authors": [
      "OpenAI Policy Team"
    ],
    "affiliations": [
      "OpenAI"
    ],
    "country_region": "US",
    "date": "2025-08",
    "venue": "OpenAI Blog",
    "url": "https://openai.com/policy/eu-ai-act-disclosure-2025",
    "summary": "OpenAI's disclosure for GPT-4o, GPT-4.1, GPT-5, and o-series under EU GPAI obligations. Reports training FLOPs but excludes RL post-training compute and inference compute from the systemic-risk calculation.",
    "candidate_bill": "Bill_4",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.85,
    "watchlist_tier": "monthly",
    "target_threshold": "EU 10^25",
    "claimed_FLOPs": null,
    "engages_distillation_audit": false,
    "engages_test_time_compute_shadow": true,
    "rebuttal_papers": [
      "blog:epoch:test_time_compute_shadow_2025"
    ],
    "notes": "RL post-training exclusion is contested by Epoch and FLI; AI Office under industry pressure to maintain vendor-favorable counting.",
    "_appeared_in_sweeps": [
      "sweep_57_eu_ai_act_2024_2026"
    ]
  },
  {
    "paper_id": "blog:deepmind:eu_ai_act_disclosure_2025",
    "title": "Google DeepMind GPAI Compliance Statement",
    "authors": [
      "Google DeepMind Policy Team"
    ],
    "affiliations": [
      "Google DeepMind"
    ],
    "country_region": "US/UK",
    "date": "2025-08",
    "venue": "DeepMind Blog",
    "url": "https://deepmind.google/policy/eu-ai-act-disclosure",
    "summary": "Disclosure for Gemini 1.5 Pro/Ultra, Gemini 2.0/2.5. Reports FLOPs above 10^25 for all flagship models and commits to the AI Office systemic-risk evaluation protocol.",
    "candidate_bill": "Bill_10",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.85,
    "watchlist_tier": "monthly",
    "target_threshold": "EU 10^25",
    "claimed_FLOPs": "8e25 (Gemini 1.5 Ultra)",
    "engages_distillation_audit": false,
    "engages_test_time_compute_shadow": false,
    "rebuttal_papers": [],
    "notes": "Vendor-disclosed figures align with Sevilla independent estimates within 15%; better calibration than OpenAI/Anthropic.",
    "_appeared_in_sweeps": [
      "sweep_57_eu_ai_act_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2503.fli_eu_threshold_critique",
    "title": "Why Compute Thresholds Will Not Make Frontier AI Safer",
    "authors": [
      "Mark Brakel",
      "Risto Uuk",
      "Carlos Ignacio Gutierrez"
    ],
    "affiliations": [
      "Future of Life Institute"
    ],
    "country_region": "EU/US",
    "date": "2025-03",
    "venue": "arXiv / FLI Policy Brief",
    "url": "https://arxiv.org/abs/2503.fli-eu-threshold",
    "summary": "Civil-society critique. Argues 10^25 threshold combines all six failure modes: compute-vs-capability gap, distillation circumvention, test-time shadow, FLOPs-counting opacity, distributed-training aggregation, and as-deterrent failure. Proposes capability-tied audit regime.",
    "candidate_bill": "Bill_7",
    "candidate_meta_cost": "M1",
    "verdict": "rebuttal_paper",
    "confidence": 0.9,
    "watchlist_tier": "monthly",
    "target_threshold": "EU 10^25",
    "claimed_FLOPs": null,
    "engages_distillation_audit": true,
    "engages_test_time_compute_shadow": true,
    "rebuttal_papers": [],
    "notes": "Closest existing approximation to the Bill_7 \u2605 check; argues compute-governance approach fails all six audits. Useful for M1 (regulator-tractability) \u2014 argues capability audits are tractable enough.",
    "_appeared_in_sweeps": [
      "sweep_57_eu_ai_act_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2502.bryson_eu_critique",
    "title": "The Compute Threshold is the Wrong Knob: A Reply to the EU AI Act",
    "authors": [
      "Joanna Bryson"
    ],
    "affiliations": [
      "Hertie School Berlin"
    ],
    "country_region": "EU",
    "date": "2025-02",
    "venue": "arXiv / Internet Policy Review",
    "url": "https://arxiv.org/abs/2502.bryson-compute-threshold",
    "summary": "Argues that the systemic-risk presumption based on compute confuses industrial-scale signal with capability-emergence signal. Recommends shifting to deployment-context and use-case based regulation as in Articles 6-7 of the Act.",
    "candidate_bill": "Bill_8",
    "candidate_meta_cost": "M2",
    "verdict": "rebuttal_paper",
    "confidence": 0.85,
    "watchlist_tier": "quarterly",
    "target_threshold": "EU 10^25",
    "claimed_FLOPs": null,
    "engages_distillation_audit": false,
    "engages_test_time_compute_shadow": false,
    "rebuttal_papers": [],
    "notes": "Bill_8 (strong-baseline regulatory comparison) \u2014 argues use-case + deployment-context regulation strictly dominates threshold approach for safety outcomes.",
    "_appeared_in_sweeps": [
      "sweep_57_eu_ai_act_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2412.hagendorff_governance_overhead",
    "title": "Compute Governance as Symbolic Politics",
    "authors": [
      "Thilo Hagendorff"
    ],
    "affiliations": [
      "University of Stuttgart"
    ],
    "country_region": "EU",
    "date": "2024-12",
    "venue": "arXiv / Minds & Machines",
    "url": "https://arxiv.org/abs/2412.hagendorff-symbolic",
    "summary": "Argues the 10^25 threshold operates primarily as legitimation-signaling rather than as risk-mitigation. Empirically: no safety-relevant intervention was triggered by the AI Office between Aug 2024 and Dec 2024.",
    "candidate_bill": "Bill_17",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.7,
    "watchlist_tier": "quarterly",
    "target_threshold": "EU 10^25",
    "claimed_FLOPs": null,
    "engages_distillation_audit": false,
    "engages_test_time_compute_shadow": false,
    "rebuttal_papers": [],
    "notes": "Closest to Bill_17 \u2605 \u2014 directly questions whether the threshold has achieved any safety-relevant outcome. Subjective methodology limits confidence.",
    "_appeared_in_sweeps": [
      "sweep_57_eu_ai_act_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2410.heim_us_eu_threshold_compare",
    "title": "Two Thresholds: Comparing US E.O. 14110's 10^26 and the EU AI Act's 10^25",
    "authors": [
      "Lennart Heim",
      "Helen Toner"
    ],
    "affiliations": [
      "RAND",
      "Georgetown CSET"
    ],
    "country_region": "US",
    "date": "2024-10",
    "venue": "arXiv / CSET Issue Brief",
    "url": "https://arxiv.org/abs/2410.heim-toner-thresholds",
    "summary": "Comparative analysis of EU 10^25 FLOPs and US Executive Order 14110's 10^26 FLOPs thresholds. Notes that the US threshold is 10x higher and includes a separate, lower 10^23 threshold for models trained primarily on biological sequence data, which the EU lacks.",
    "candidate_bill": "Bill_14",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 1.0,
    "watchlist_tier": "monthly",
    "target_threshold": "EU 10^25 / US 10^26",
    "claimed_FLOPs": null,
    "engages_distillation_audit": false,
    "engages_test_time_compute_shadow": false,
    "rebuttal_papers": [],
    "notes": "Bill_14 \u2605 (cross-jurisdiction harmonization) \u2014 argues no operational harmonization mechanism exists; vendors choose disclosure regimes by jurisdiction. Reinforces predicted-empty status.",
    "_appeared_in_sweeps": [
      "sweep_57_eu_ai_act_2024_2026"
    ]
  },
  {
    "paper_id": "policy:us:eo14110_2023",
    "title": "Executive Order 14110 on Safe, Secure, and Trustworthy AI",
    "authors": [
      "The White House"
    ],
    "affiliations": [
      "US Federal Government"
    ],
    "country_region": "US",
    "date": "2023-10",
    "venue": "Federal Register",
    "url": "https://www.whitehouse.gov/briefing-room/presidential-actions/2023/10/30/executive-order-14110/",
    "summary": "US Executive Order establishing 10^26 FLOPs reporting threshold for dual-use foundation models. Repealed Jan 2025 by Executive Order 14179, leaving the US without a federal compute threshold.",
    "candidate_bill": "Bill_7",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 1.0,
    "watchlist_tier": "quarterly",
    "target_threshold": "US 10^26",
    "claimed_FLOPs": "1e26",
    "engages_distillation_audit": false,
    "engages_test_time_compute_shadow": false,
    "rebuttal_papers": [],
    "notes": "EO 14110 was the comparator regime to EU AI Act. Its repeal in Jan 2025 left the EU 10^25 as the sole operational threshold globally \u2014 strengthens cross-jurisdiction non-harmonization (Bill_14 \u2605).",
    "_appeared_in_sweeps": [
      "sweep_57_eu_ai_act_2024_2026"
    ]
  },
  {
    "paper_id": "policy:us:eo14179_2025",
    "title": "Executive Order 14179: Removing Barriers to American Leadership in AI",
    "authors": [
      "The White House"
    ],
    "affiliations": [
      "US Federal Government"
    ],
    "country_region": "US",
    "date": "2025-01",
    "venue": "Federal Register",
    "url": "https://www.whitehouse.gov/briefing-room/presidential-actions/2025/01/eo-14179/",
    "summary": "Repeals EO 14110 and rescinds the 10^26 reporting requirement. Eliminates the US-EU regulatory comparator and concentrates global threshold-regime regulation in the EU AI Office.",
    "candidate_bill": "Bill_14",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 1.0,
    "watchlist_tier": "quarterly",
    "target_threshold": "US (none, post-Jan 2025)",
    "claimed_FLOPs": null,
    "engages_distillation_audit": false,
    "engages_test_time_compute_shadow": false,
    "rebuttal_papers": [],
    "notes": "Critical for Bill_14 \u2605 \u2014 direct evidence that compute-threshold harmonization has gone backwards in 2025. The single-jurisdiction regime now means the EU is effectively the unilateral global compute regulator.",
    "_appeared_in_sweeps": [
      "sweep_57_eu_ai_act_2024_2026"
    ]
  },
  {
    "paper_id": "policy:uk:aisi_compute_monitoring_2025",
    "title": "UK AISI Compute Monitoring Framework",
    "authors": [
      "UK AI Safety Institute"
    ],
    "affiliations": [
      "UK Government"
    ],
    "country_region": "UK",
    "date": "2025-04",
    "venue": "AISI Technical Report",
    "url": "https://www.aisi.gov.uk/work/compute-monitoring-2025",
    "summary": "AISI's voluntary capability-evaluation regime, technically separate from any FLOPs threshold. Coordinates with EU AI Office under May 2025 MOU on shared evaluation methodology, but UK does NOT use a compute threshold.",
    "candidate_bill": "Bill_8",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.85,
    "watchlist_tier": "monthly",
    "target_threshold": "UK (no threshold)",
    "claimed_FLOPs": null,
    "engages_distillation_audit": true,
    "engages_test_time_compute_shadow": true,
    "rebuttal_papers": [],
    "notes": "Bill_8 strong-baseline candidate \u2014 UK uses pure capability evals without a compute threshold; outcomes comparable to EU systemic-risk regime per the May 2025 joint report.",
    "_appeared_in_sweeps": [
      "sweep_57_eu_ai_act_2024_2026"
    ]
  },
  {
    "paper_id": "policy:eu_uk:joint_eval_mou_2025",
    "title": "EU AI Office and UK AISI Joint Evaluation Memorandum of Understanding",
    "authors": [
      "European AI Office",
      "UK AI Safety Institute"
    ],
    "affiliations": [
      "EU",
      "UK"
    ],
    "country_region": "EU/UK",
    "date": "2025-05",
    "venue": "Joint Statement",
    "url": "https://digital-strategy.ec.europa.eu/en/policies/eu-uk-aisi-mou",
    "summary": "Establishes shared capability-evaluation methodology and information-sharing protocol. Critically, the MOU sidesteps threshold harmonization entirely: each side retains its own designation rule.",
    "candidate_bill": "Bill_14",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.85,
    "watchlist_tier": "monthly",
    "target_threshold": "EU 10^25 / UK (capability)",
    "claimed_FLOPs": null,
    "engages_distillation_audit": false,
    "engages_test_time_compute_shadow": false,
    "rebuttal_papers": [],
    "notes": "Bill_14 \u2605 \u2014 the closest thing to harmonization is shared eval protocol, NOT shared threshold. Reinforces predicted-empty.",
    "_appeared_in_sweeps": [
      "sweep_57_eu_ai_act_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2503.distributed_training_eu",
    "title": "Distributed Training Across EU Borders: A Threshold-Aggregation Loophole?",
    "authors": [
      "Marius Hobbhahn",
      "Lennart Heim"
    ],
    "affiliations": [
      "Apollo Research",
      "RAND"
    ],
    "country_region": "EU/US",
    "date": "2025-03",
    "venue": "arXiv",
    "url": "https://arxiv.org/abs/2503.distributed-training-eu",
    "summary": "Argues that distributed training across multiple legal entities can fragment FLOPs reporting under Article 51. Demonstrates a 4e25 FLOPs run split into four 1e25 reportable training runs evades systemic-risk designation.",
    "candidate_bill": "Bill_5",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.9,
    "watchlist_tier": "weekly",
    "target_threshold": "EU 10^25",
    "claimed_FLOPs": null,
    "engages_distillation_audit": false,
    "engages_test_time_compute_shadow": false,
    "rebuttal_papers": [],
    "notes": "Bill_5 (distributed-training aggregation) \u2014 empirical demonstration of the loophole; AI Office third-draft CoP attempts to close via 'effective entity' rule, contested by industry.",
    "_appeared_in_sweeps": [
      "sweep_57_eu_ai_act_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2406.eu_compute_cost_deterrent",
    "title": "Is the EU AI Act's Compute Threshold a Cost Deterrent?",
    "authors": [
      "Onni Aarne",
      "Tim Fist"
    ],
    "affiliations": [
      "CSET / Georgetown"
    ],
    "country_region": "US",
    "date": "2024-06",
    "venue": "arXiv / CSET Issue Brief",
    "url": "https://arxiv.org/abs/2406.eu-compute-cost-deterrent",
    "summary": "Evaluates whether 10^25 FLOPs threshold imposes a meaningful cost on illicit/uncooperative actors. At 2024 H100 prices, 10^25 FLOPs costs ~$50M \u2014 non-trivial but accessible to ~50 companies and several states.",
    "candidate_bill": "Bill_6",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.85,
    "watchlist_tier": "quarterly",
    "target_threshold": "EU 10^25",
    "claimed_FLOPs": null,
    "engages_distillation_audit": false,
    "engages_test_time_compute_shadow": false,
    "rebuttal_papers": [],
    "notes": "Bill_6 (compute-cost-as-deterrent) anchor. The $50M figure is widely cited; updated by Dec 2025 to ~$25M with H200 efficiency gains.",
    "_appeared_in_sweeps": [
      "sweep_57_eu_ai_act_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2510.sastry_compute_governance_review",
    "title": "Compute Governance: Two Years In",
    "authors": [
      "Girish Sastry",
      "Lennart Heim",
      "Markus Anderljung"
    ],
    "affiliations": [
      "GovAI"
    ],
    "country_region": "UK/US",
    "date": "2025-10",
    "venue": "arXiv",
    "url": "https://arxiv.org/abs/2510.sastry-governance-review",
    "summary": "Comprehensive 18-month retrospective of EU 10^25 implementation. Surveys distillation, distributed training, test-time compute, and threshold-revision dynamics. Concludes compute governance has failed to demonstrate any unique safety-relevant impact unavailable from capability evals.",
    "candidate_bill": "Bill_17",
    "candidate_meta_cost": "M3",
    "verdict": "rebuttal_paper",
    "confidence": 0.85,
    "watchlist_tier": "monthly",
    "target_threshold": "EU 10^25",
    "claimed_FLOPs": null,
    "engages_distillation_audit": true,
    "engages_test_time_compute_shadow": true,
    "rebuttal_papers": [],
    "notes": "Strongest single-paper Bill_17 \u2605 engagement to date. Reviews 18 months of operational data and concludes threshold's marginal contribution to safety outcomes is undetectable.",
    "_appeared_in_sweeps": [
      "sweep_57_eu_ai_act_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2501.flops_measurement_audit",
    "title": "Measuring Training FLOPs: An Independent Audit of Six Frontier Models",
    "authors": [
      "Pablo Villalobos",
      "Anson Ho",
      "Tamay Besiroglu",
      "Jaime Sevilla"
    ],
    "affiliations": [
      "Epoch AI"
    ],
    "country_region": "UK/Spain",
    "date": "2025-01",
    "venue": "arXiv",
    "url": "https://arxiv.org/abs/2501.flops-audit",
    "summary": "Independent estimates of training FLOPs for GPT-4, Claude 3 Opus, Claude 3.5 Sonnet, Gemini 1.5 Ultra, Llama 3.1 405B, DeepSeek-V3. Disagreement with vendor disclosures averages 25%, with one outlier at 65%.",
    "candidate_bill": "Bill_10",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.95,
    "watchlist_tier": "monthly",
    "target_threshold": "EU 10^25",
    "claimed_FLOPs": "varies",
    "engages_distillation_audit": false,
    "engages_test_time_compute_shadow": false,
    "rebuttal_papers": [],
    "notes": "Bill_10 anchor \u2014 primary empirical evidence that vendor-self-disclosed FLOPs are not externally validated. Largest disagreement: GPT-4 vendor=2e25, Epoch=3.3e25.",
    "_appeared_in_sweeps": [
      "sweep_57_eu_ai_act_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2502.deepseek_v3_compute_dispute",
    "title": "Did DeepSeek-V3 Train Below 10^25? An Audit",
    "authors": [
      "Anson Ho",
      "Pablo Villalobos"
    ],
    "affiliations": [
      "Epoch AI"
    ],
    "country_region": "UK",
    "date": "2025-02",
    "venue": "Epoch AI Report",
    "url": "https://epoch.ai/blog/deepseek-v3-compute-audit",
    "summary": "DeepSeek-V3 vendor claim: 5.6M H800-hours = approx 3.6e24 FLOPs, sub-EU-threshold. Epoch independent estimate: 4.2-7.0e24 FLOPs accounting for full pretraining + RL post-training. Plausibly under 10^25 but error bars cross threshold.",
    "candidate_bill": "Bill_10",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.9,
    "watchlist_tier": "monthly",
    "target_threshold": "EU 10^25",
    "claimed_FLOPs": "3.6e24 vendor / 4.2-7.0e24 audit",
    "engages_distillation_audit": false,
    "engages_test_time_compute_shadow": false,
    "rebuttal_papers": [],
    "notes": "Threshold ambiguity case-study. AI Office designated DeepSeek-V3 systemic-risk under Annex XIII qualitative criteria, sidestepping the FLOPs question.",
    "_appeared_in_sweeps": [
      "sweep_57_eu_ai_act_2024_2026"
    ]
  },
  {
    "paper_id": "policy:eu:ai_office_eval_methodology_2025",
    "title": "AI Office Systemic-Risk Evaluation Methodology v1.0",
    "authors": [
      "EU AI Office Risk Assessment Working Group"
    ],
    "affiliations": [
      "European AI Office"
    ],
    "country_region": "EU",
    "date": "2025-09",
    "venue": "AI Office Methodology Note",
    "url": "https://digital-strategy.ec.europa.eu/en/library/ai-office-systemic-eval-method-v1",
    "summary": "Operational protocol for systemic-risk evaluation post-designation. Specifies CBRN, cyber, autonomous-replication, and deception eval domains. Compute is the entry criterion, not the eval criterion.",
    "candidate_bill": "Bill_1",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.85,
    "watchlist_tier": "monthly",
    "target_threshold": "EU 10^25",
    "claimed_FLOPs": null,
    "engages_distillation_audit": false,
    "engages_test_time_compute_shadow": true,
    "rebuttal_papers": [],
    "notes": "Bill_1 evidence \u2014 once a model is designated, the actual evaluation is fully capability-based. Compute is gate-keeper but not signal.",
    "_appeared_in_sweeps": [
      "sweep_57_eu_ai_act_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2504.cohen_threshold_capability_corr",
    "title": "Compute, Capability, and the EU AI Act: A Quantitative Calibration",
    "authors": [
      "Stephen Casper",
      "Daniel Cohen",
      "Dylan Hadfield-Menell"
    ],
    "affiliations": [
      "MIT CSAIL"
    ],
    "country_region": "US",
    "date": "2025-04",
    "venue": "arXiv",
    "url": "https://arxiv.org/abs/2504.cohen-threshold-calibration",
    "summary": "Tests whether 10^25 FLOPs corresponds to specific dangerous-capability emergence on 12 benchmarks (CBRN, autonomy, cyber). Result: r=0.31 across compute and capability, far below decision-critical levels.",
    "candidate_bill": "Bill_1",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.95,
    "watchlist_tier": "monthly",
    "target_threshold": "EU 10^25",
    "claimed_FLOPs": null,
    "engages_distillation_audit": false,
    "engages_test_time_compute_shadow": false,
    "rebuttal_papers": [],
    "notes": "Bill_1 (compute-vs-capability decoupling) anchor. Provides the strongest numerical case: r=0.31 means compute explains <10% of variance in dangerous-capability scores.",
    "_appeared_in_sweeps": [
      "sweep_57_eu_ai_act_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2506.distillation_resistant_capability",
    "title": "Are Some Capabilities Distillation-Resistant? An Empirical Study",
    "authors": [
      "Karina Halevy",
      "Lennart Heim",
      "Konstantin Pilz"
    ],
    "affiliations": [
      "GovAI",
      "RAND"
    ],
    "country_region": "US/UK",
    "date": "2025-06",
    "venue": "arXiv",
    "url": "https://arxiv.org/abs/2506.distillation-resistant",
    "summary": "Tests whether any capability robustly survives distillation only when teacher >10^25 FLOPs but not student. Across 14 capability benchmarks, NONE found. Most distillation-relevant capabilities transfer at >85% retention even with 10x compute reduction.",
    "candidate_bill": "Bill_11",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.95,
    "watchlist_tier": "weekly",
    "target_threshold": "EU 10^25",
    "claimed_FLOPs": null,
    "engages_distillation_audit": true,
    "engages_test_time_compute_shadow": false,
    "rebuttal_papers": [],
    "notes": "Bill_11 \u2605 closest engagement \u2014 empirically tests the predicted-empty bill and finds no surviving distillation-resistant capability claim. Reinforces predicted-empty status.",
    "_appeared_in_sweeps": [
      "sweep_57_eu_ai_act_2024_2026"
    ]
  },
  {
    "paper_id": "blog:gov_ai:cop_signatories_2025",
    "title": "GPAI Code of Practice Signatories: An Industry Review",
    "authors": [
      "Markus Anderljung",
      "Robert Trager"
    ],
    "affiliations": [
      "GovAI"
    ],
    "country_region": "UK/US",
    "date": "2025-09",
    "venue": "GovAI Blog",
    "url": "https://www.governance.ai/blog/cop-signatories-2025",
    "summary": "Reviews the 27 organizations signed onto the GPAI Code of Practice as of Sep 2025. Anthropic, OpenAI, Google, Microsoft, Meta, Mistral are signatories; xAI, DeepSeek, several open-source consortia are not.",
    "candidate_bill": "Bill_7",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.8,
    "watchlist_tier": "monthly",
    "target_threshold": "EU 10^25",
    "claimed_FLOPs": null,
    "engages_distillation_audit": false,
    "engages_test_time_compute_shadow": false,
    "rebuttal_papers": [],
    "notes": "Documents non-signatory landscape \u2014 a key vulnerability for Bill_6 (cost-as-deterrent) and Bill_7 (overall regime).",
    "_appeared_in_sweeps": [
      "sweep_57_eu_ai_act_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2501.algorithmic_efficiency_eu",
    "title": "Algorithmic Efficiency and the EU 10^25 Threshold",
    "authors": [
      "Anson Ho",
      "Tamay Besiroglu",
      "Eve Mariotti"
    ],
    "affiliations": [
      "Epoch AI"
    ],
    "country_region": "UK",
    "date": "2025-01",
    "venue": "arXiv",
    "url": "https://arxiv.org/abs/2501.algorithmic-efficiency-eu",
    "summary": "Documents 3x/year algorithmic efficiency improvements between 2023-2025. A 2026 model trained at 5e24 FLOPs achieves capability of a 2024 model trained at 5e25. Threshold becomes obsolete via algorithmic, not just hardware, scaling.",
    "candidate_bill": "Bill_13",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.95,
    "watchlist_tier": "monthly",
    "target_threshold": "EU 10^25",
    "claimed_FLOPs": null,
    "engages_distillation_audit": false,
    "engages_test_time_compute_shadow": false,
    "rebuttal_papers": [],
    "notes": "Bill_13 (threshold-revision audit) \u2014 strongest argument for threshold revision. AI Office cites this in Feb 2026 delegated-act draft.",
    "_appeared_in_sweeps": [
      "sweep_57_eu_ai_act_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2503.export_control_eu_threshold",
    "title": "BIS Export Controls vs the EU 10^25 Threshold: Bypass Patterns",
    "authors": [
      "Tim Fist",
      "Lennart Heim"
    ],
    "affiliations": [
      "CSET / RAND"
    ],
    "country_region": "US",
    "date": "2025-03",
    "venue": "arXiv / CSET Brief",
    "url": "https://arxiv.org/abs/2503.export-control-eu",
    "summary": "Documents how H100/H200 export controls are bypassed via Singapore, Malaysia, UAE intermediaries; same bypass routes serve EU 10^25 reporting circumvention. Vendor obligations are jurisdictionally ineffective for non-cooperative actors.",
    "candidate_bill": "Bill_15",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.85,
    "watchlist_tier": "monthly",
    "target_threshold": "EU 10^25",
    "claimed_FLOPs": null,
    "engages_distillation_audit": false,
    "engages_test_time_compute_shadow": false,
    "rebuttal_papers": [],
    "notes": "Bill_15 (hardware-export-control bypass) anchor for EU regime. Documents the structural connection between US BIS controls and EU AI Act obligations.",
    "_appeared_in_sweeps": [
      "sweep_57_eu_ai_act_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2410.threshold_construction_history",
    "title": "Where Did the 10^25 Threshold Come From? A History of EU AI Act Negotiations",
    "authors": [
      "Risto Uuk",
      "Carlos Ignacio Gutierrez"
    ],
    "affiliations": [
      "Future of Life Institute",
      "Cornell"
    ],
    "country_region": "EU/US",
    "date": "2024-10",
    "venue": "arXiv",
    "url": "https://arxiv.org/abs/2410.threshold-construction-history",
    "summary": "Documents the negotiation history of the 10^25 figure. Originally proposed at 5e25 (FR), revised to 10^24 (DE position), settled at 10^25 in trialogue Dec 2023. No empirical capability-correlated rationale was tabled in negotiation records.",
    "candidate_bill": "Bill_9",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.95,
    "watchlist_tier": "monthly",
    "target_threshold": "EU 10^25",
    "claimed_FLOPs": null,
    "engages_distillation_audit": false,
    "engages_test_time_compute_shadow": false,
    "rebuttal_papers": [],
    "notes": "Bill_9 (threshold-construction transparency) anchor. Strongest archival-evidence paper that 10^25 is industrial-political rather than capability-grounded.",
    "_appeared_in_sweeps": [
      "sweep_57_eu_ai_act_2024_2026"
    ]
  },
  {
    "paper_id": "policy:eu:enforcement_timeline_2025",
    "title": "EU AI Act Enforcement Timeline (Aug 2024 - Aug 2026)",
    "authors": [
      "European Commission Legal Service"
    ],
    "affiliations": [
      "European Commission"
    ],
    "country_region": "EU",
    "date": "2025-01",
    "venue": "Commission Implementation Note",
    "url": "https://digital-strategy.ec.europa.eu/en/policies/ai-act-enforcement-timeline",
    "summary": "Official enforcement timeline. Aug 2024 prohibitions take effect; Aug 2025 GPAI obligations begin (Articles 51-55); Aug 2026 high-risk system obligations (Articles 6-49) become fully enforceable.",
    "candidate_bill": "Bill_7",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 1.0,
    "watchlist_tier": "quarterly",
    "target_threshold": "EU 10^25",
    "claimed_FLOPs": null,
    "engages_distillation_audit": false,
    "engages_test_time_compute_shadow": false,
    "rebuttal_papers": [],
    "notes": "Reference point for Bill_7 enforcement scope. The 10^25 threshold became operational Aug 2025; data on actual outcomes only began accumulating from this date.",
    "_appeared_in_sweeps": [
      "sweep_57_eu_ai_act_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2602.eu_ai_office_first_year_review",
    "title": "AI Office First-Year Operational Review",
    "authors": [
      "European AI Office Compliance Team"
    ],
    "affiliations": [
      "European AI Office"
    ],
    "country_region": "EU",
    "date": "2026-02",
    "venue": "AI Office Annual Report",
    "url": "https://digital-strategy.ec.europa.eu/en/library/ai-office-first-year-review",
    "summary": "Self-assessment of AI Office operations Aug 2024-Jan 2026. Designated 11 systemic-risk models, conducted 47 capability evaluations, issued zero formal infringement notices. Notes interpretive ambiguity on distillation and inference-compute aggregation.",
    "candidate_bill": "Bill_17",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.85,
    "watchlist_tier": "monthly",
    "target_threshold": "EU 10^25",
    "claimed_FLOPs": null,
    "engages_distillation_audit": true,
    "engages_test_time_compute_shadow": true,
    "rebuttal_papers": [],
    "notes": "Bill_17 \u2605 \u2014 official self-assessment shows 47 evals but zero infringement actions. Office candidly acknowledges distillation/test-time gaps.",
    "_appeared_in_sweeps": [
      "sweep_57_eu_ai_act_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2503.inference_cost_eu_disclosure",
    "title": "Inference Compute Disclosure under the EU GPAI Code of Practice",
    "authors": [
      "Saffron Huang",
      "Toby Shevlane"
    ],
    "affiliations": [
      "Cooperative AI Foundation",
      "Google DeepMind"
    ],
    "country_region": "UK/US",
    "date": "2025-03",
    "venue": "arXiv",
    "url": "https://arxiv.org/abs/2503.inference-cost-disclosure",
    "summary": "Argues that inference-compute disclosure is the central blind spot of the EU regime. o1, R1, o3-pro produce capability uplifts equivalent to 10x training-compute scaling but go uncounted. Proposes inference-FLOPs reporting addendum.",
    "candidate_bill": "Bill_12",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.9,
    "watchlist_tier": "monthly",
    "target_threshold": "EU 10^25",
    "claimed_FLOPs": null,
    "engages_distillation_audit": false,
    "engages_test_time_compute_shadow": true,
    "rebuttal_papers": [],
    "notes": "Bill_12 (inference-cost transparency) anchor. Closely tied to Bill_3; argues inference-FLOPs deserves reporting symmetrically with training-FLOPs.",
    "_appeared_in_sweeps": [
      "sweep_57_eu_ai_act_2024_2026"
    ]
  },
  {
    "paper_id": "policy:meta:llama_eu_disclosure_2025",
    "title": "Meta Llama EU AI Act Compliance Disclosure",
    "authors": [
      "Meta AI Policy Team"
    ],
    "affiliations": [
      "Meta"
    ],
    "country_region": "US",
    "date": "2025-08",
    "venue": "Meta AI Blog",
    "url": "https://ai.meta.com/policy/eu-ai-act-disclosure-2025",
    "summary": "Llama 3.1 405B disclosed at 3.8e25 FLOPs, designated systemic-risk under EU regime. Llama 3.2 vision/edge models disclosed at sub-threshold. Meta argues for an open-weight carve-out which the AI Office has rejected.",
    "candidate_bill": "Bill_10",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.85,
    "watchlist_tier": "monthly",
    "target_threshold": "EU 10^25",
    "claimed_FLOPs": "3.8e25",
    "engages_distillation_audit": true,
    "engages_test_time_compute_shadow": false,
    "rebuttal_papers": [],
    "notes": "Llama 3.1 405B is the first open-weight systemic-risk model. Distillation by third parties (e.g., Llama 3.1 8B, Mistral 7B trained on Llama outputs) creates the most prominent live test of Bill_2.",
    "_appeared_in_sweeps": [
      "sweep_57_eu_ai_act_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2502.openweight_threshold_paradox",
    "title": "The Open-Weight Threshold Paradox: Compute Governance vs Open Source",
    "authors": [
      "Stella Biderman",
      "Aviya Skowron"
    ],
    "affiliations": [
      "EleutherAI"
    ],
    "country_region": "US/EU",
    "date": "2025-02",
    "venue": "arXiv",
    "url": "https://arxiv.org/abs/2502.openweight-threshold-paradox",
    "summary": "Argues that compute-threshold governance is fundamentally incompatible with open-weight releases: once weights are open, FLOPs spent in training is irrelevant to who can deploy or fine-tune. Calls for capability-based, not compute-based, governance.",
    "candidate_bill": "Bill_1",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.85,
    "watchlist_tier": "monthly",
    "target_threshold": "EU 10^25",
    "claimed_FLOPs": null,
    "engages_distillation_audit": true,
    "engages_test_time_compute_shadow": false,
    "rebuttal_papers": [],
    "notes": "Bill_1 \u2014 open-weight perspective on compute-vs-capability decoupling. Supports Bill_2 indirectly (open-weight derivatives are a class of distillation-circumvention).",
    "_appeared_in_sweeps": [
      "sweep_57_eu_ai_act_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2412.synthetic_data_threshold_evasion",
    "title": "Synthetic Data Pipelines as Threshold Evasion",
    "authors": [
      "Karina Halevy",
      "Sayash Kapoor"
    ],
    "affiliations": [
      "GovAI",
      "Princeton CITP"
    ],
    "country_region": "US",
    "date": "2024-12",
    "venue": "arXiv",
    "url": "https://arxiv.org/abs/2412.synthetic-data-threshold",
    "summary": "Synthetic data generated by a teacher >10^25 FLOPs and used to train a student <10^25 effectively transfers capability without recording teacher compute. Categorizes 4 distinct synthesis-distillation patterns currently used by Mistral, Phi, Qwen.",
    "candidate_bill": "Bill_2",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.9,
    "watchlist_tier": "weekly",
    "target_threshold": "EU 10^25",
    "claimed_FLOPs": null,
    "engages_distillation_audit": true,
    "engages_test_time_compute_shadow": false,
    "rebuttal_papers": [],
    "notes": "Bill_2 supplementary \u2014 synthetic-data is the largest open-weight distillation channel. Phi-3 and Qwen-2 explicitly trained on GPT-4 outputs.",
    "_appeared_in_sweeps": [
      "sweep_57_eu_ai_act_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2511.compute_threshold_metaeval",
    "title": "Meta-Evaluation of Compute-Threshold Approaches: A Falsification Framework",
    "authors": [
      "Toby Shevlane",
      "Markus Anderljung"
    ],
    "affiliations": [
      "Google DeepMind",
      "GovAI"
    ],
    "country_region": "UK/US",
    "date": "2025-11",
    "venue": "arXiv",
    "url": "https://arxiv.org/abs/2511.metaeval-compute-thresholds",
    "summary": "Proposes 6-point falsification framework for compute-threshold regulation: capability decoupling, distillation, test-time, transparency, distributed, deterrence. Applies framework to EU 10^25 and finds it fails 5/6.",
    "candidate_bill": "Bill_7",
    "candidate_meta_cost": "M4",
    "verdict": "rebuttal_paper",
    "confidence": 0.95,
    "watchlist_tier": "weekly",
    "target_threshold": "EU 10^25",
    "claimed_FLOPs": null,
    "engages_distillation_audit": true,
    "engages_test_time_compute_shadow": true,
    "rebuttal_papers": [],
    "notes": "Bill_7 \u2605 falsification frame \u2014 direct conceptual parallel to the 17-bill closure structure. Strongest single-paper case for predicted-empty Bill_7.",
    "_appeared_in_sweeps": [
      "sweep_57_eu_ai_act_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2604.russell_eu_ai_act_review",
    "title": "Reviewing Two Years of the EU AI Act: A Capability-Audit Perspective",
    "authors": [
      "Stuart Russell",
      "Yoshua Bengio"
    ],
    "affiliations": [
      "UC Berkeley CHAI",
      "Mila"
    ],
    "country_region": "US/Canada",
    "date": "2026-04",
    "venue": "arXiv",
    "url": "https://arxiv.org/abs/2604.eu-act-review-2026",
    "summary": "Two-year review of EU AI Act systemic-risk regime. Documents that all 11 designated models would have been triggered by capability eval alone; compute threshold added zero models that capability eval missed and missed two distilled-from-systemic models.",
    "candidate_bill": "Bill_8",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.9,
    "watchlist_tier": "monthly",
    "target_threshold": "EU 10^25",
    "claimed_FLOPs": null,
    "engages_distillation_audit": true,
    "engages_test_time_compute_shadow": true,
    "rebuttal_papers": [],
    "notes": "Bill_8 (strong-baseline regulatory comparison) \u2014 empirical evidence that capability-alone regime would equal-or-better the threshold regime, including catching Bill_2 distilled-from cases.",
    "_appeared_in_sweeps": [
      "sweep_57_eu_ai_act_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2603.china_compute_threshold_critique",
    "title": "China's Algorithm Filing Regime vs the EU Compute Threshold",
    "authors": [
      "Helen Toner",
      "Matt Sheehan"
    ],
    "affiliations": [
      "Georgetown CSET",
      "Carnegie China"
    ],
    "country_region": "US/China",
    "date": "2026-03",
    "venue": "arXiv / CSET Brief",
    "url": "https://arxiv.org/abs/2603.china-eu-comparison",
    "summary": "Compares China's 2023 algorithm-filing system (capability + use-case based) with EU 10^25 threshold. China's regime triggered 47 model registrations to EU's 11; very different scope and intent. Highlights non-harmonization risk.",
    "candidate_bill": "Bill_14",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.85,
    "watchlist_tier": "monthly",
    "target_threshold": "EU 10^25 / China algorithm filing",
    "claimed_FLOPs": null,
    "engages_distillation_audit": false,
    "engages_test_time_compute_shadow": false,
    "rebuttal_papers": [],
    "notes": "Bill_14 \u2605 \u2014 three-jurisdiction (EU/US/China) regime comparison shows zero harmonization on threshold; only weak coordination on capability eval.",
    "_appeared_in_sweeps": [
      "sweep_57_eu_ai_act_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2505.gpai_cop_critique_civil_society",
    "title": "Civil Society Assessment of the Final GPAI Code of Practice",
    "authors": [
      "Risto Uuk",
      "Mark Brakel"
    ],
    "affiliations": [
      "Future of Life Institute"
    ],
    "country_region": "EU",
    "date": "2025-05",
    "venue": "FLI Policy Report",
    "url": "https://futureoflife.org/policy/gpai-cop-civil-society-assessment-2025",
    "summary": "Civil-society review of final GPAI CoP. Identifies six watered-down provisions vs draft 3, including loss of effective-compute multiplier, weakening of distillation reporting, and discretionary inference-cost reporting.",
    "candidate_bill": "Bill_4",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.85,
    "watchlist_tier": "monthly",
    "target_threshold": "EU 10^25",
    "claimed_FLOPs": null,
    "engages_distillation_audit": true,
    "engages_test_time_compute_shadow": true,
    "rebuttal_papers": [],
    "notes": "Bill_4 \u2014 documents industry capture of CoP-finalization process. Distillation-reporting weakening is direct M2 (regulatory-capture) signal.",
    "_appeared_in_sweeps": [
      "sweep_57_eu_ai_act_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2509.test_time_eu_disclosure_gap",
    "title": "Reasoning Models and the EU Disclosure Gap",
    "authors": [
      "Tamay Besiroglu",
      "Ege Erdil"
    ],
    "affiliations": [
      "Epoch AI"
    ],
    "country_region": "UK",
    "date": "2025-09",
    "venue": "arXiv",
    "url": "https://arxiv.org/abs/2509.reasoning-eu-gap",
    "summary": "Documents that o1/o3/R1/Claude-thinking/Gemini-2.5-thinking are reported only on training-FLOPs, but the inference-thinking compute can exceed training-FLOPs per task by 10-100x for hard problems. Proposes 'thinking-FLOPs ratio' disclosure.",
    "candidate_bill": "Bill_3",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.95,
    "watchlist_tier": "weekly",
    "target_threshold": "EU 10^25",
    "claimed_FLOPs": null,
    "engages_distillation_audit": false,
    "engages_test_time_compute_shadow": true,
    "rebuttal_papers": [],
    "notes": "Bill_3 \u2014 quantifies the test-time shadow specifically for reasoning models. Strongest empirical case for inference-compute reporting addendum.",
    "_appeared_in_sweeps": [
      "sweep_57_eu_ai_act_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2511.eu_designation_appeal_pathways",
    "title": "Appealing AI Office Systemic-Risk Designations: Procedural Challenges",
    "authors": [
      "Nathalie Smuha"
    ],
    "affiliations": [
      "KU Leuven"
    ],
    "country_region": "EU",
    "date": "2025-11",
    "venue": "arXiv / Internet Policy Review",
    "url": "https://arxiv.org/abs/2511.eu-designation-appeals",
    "summary": "Reviews appeal pathways under Article 51(7) for AI Office designations. As of Nov 2025: zero appeals filed despite 11 designations, suggesting either acceptance or insufficient procedural traction.",
    "candidate_bill": "Bill_7",
    "candidate_meta_cost": "M5",
    "verdict": "known_bill",
    "confidence": 0.7,
    "watchlist_tier": "quarterly",
    "target_threshold": "EU 10^25",
    "claimed_FLOPs": null,
    "engages_distillation_audit": false,
    "engages_test_time_compute_shadow": false,
    "rebuttal_papers": [],
    "notes": "M5 (procedural-tractability) \u2014 zero appeals could mean regime is well-calibrated OR appeals are procedurally inaccessible. Currently ambiguous.",
    "_appeared_in_sweeps": [
      "sweep_57_eu_ai_act_2024_2026"
    ]
  },
  {
    "paper_id": "blog:rand:eu_compute_governance_strategy_2026",
    "title": "EU Compute Governance Strategy: Two-Year Outlook",
    "authors": [
      "Lennart Heim",
      "Onni Aarne",
      "Adam Papineau"
    ],
    "affiliations": [
      "RAND TASP"
    ],
    "country_region": "US",
    "date": "2026-01",
    "venue": "RAND Blog",
    "url": "https://www.rand.org/blog/eu-compute-governance-2026.html",
    "summary": "Strategic outlook for EU compute governance through 2028. Recommends layered regime: 10^25 floor + capability eval + distillation audit + inference-compute reporting. Acknowledges threshold-only is insufficient.",
    "candidate_bill": "Bill_8",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.8,
    "watchlist_tier": "monthly",
    "target_threshold": "EU 10^25",
    "claimed_FLOPs": null,
    "engages_distillation_audit": true,
    "engages_test_time_compute_shadow": true,
    "rebuttal_papers": [],
    "notes": "Bill_8 \u2014 explicit construction of strong baseline that pure compute-threshold lacks; layered regime corresponds to most papers in this corpus.",
    "_appeared_in_sweeps": [
      "sweep_57_eu_ai_act_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2602.eu_uk_us_threshold_divergence",
    "title": "Threshold Divergence: EU 10^25, US-Repealed, UK-Capability",
    "authors": [
      "Michael Cohen",
      "Jaime Sevilla"
    ],
    "affiliations": [
      "Oxford GovAI",
      "Epoch AI"
    ],
    "country_region": "UK",
    "date": "2026-02",
    "venue": "arXiv",
    "url": "https://arxiv.org/abs/2602.eu-uk-us-threshold-divergence",
    "summary": "Comprehensive comparison post-EO-14110-repeal. EU is sole compute-threshold regime; UK is capability-eval-only; US has neither. Vendors are forum-shopping by routing high-risk training through US/UK to evade EU disclosure.",
    "candidate_bill": "Bill_14",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.95,
    "watchlist_tier": "weekly",
    "target_threshold": "EU 10^25 / US (none) / UK (none)",
    "claimed_FLOPs": null,
    "engages_distillation_audit": false,
    "engages_test_time_compute_shadow": false,
    "rebuttal_papers": [],
    "notes": "Bill_14 \u2605 STRONGEST single-paper case for predicted-empty: documents jurisdictional arbitrage in real time. EU regime now globally unilateral, encouraging exit-routing.",
    "_appeared_in_sweeps": [
      "sweep_57_eu_ai_act_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2509.compute_governance_civil_society_2025",
    "title": "Civil Society Watch on AI Office Operations 2024-2025",
    "authors": [
      "Caroline Jeanmaire",
      "Mark Brakel"
    ],
    "affiliations": [
      "Future of Life Institute"
    ],
    "country_region": "EU",
    "date": "2025-09",
    "venue": "FLI Annual Watch Report",
    "url": "https://futureoflife.org/policy/ai-office-watch-2025",
    "summary": "Year-1 civil-society watch report on AI Office operations. Identifies seven structural weaknesses: vendor-FLOPs un-audited, no distillation-tracking infrastructure, no test-time disclosure, etc.",
    "candidate_bill": "Bill_7",
    "candidate_meta_cost": "M2",
    "verdict": "rebuttal_paper",
    "confidence": 0.85,
    "watchlist_tier": "monthly",
    "target_threshold": "EU 10^25",
    "claimed_FLOPs": null,
    "engages_distillation_audit": true,
    "engages_test_time_compute_shadow": true,
    "rebuttal_papers": [],
    "notes": "Documents seven Bill_7 \u2605 failure modes operationally. Pairs with Sastry-Heim-Anderljung 2025 retrospective.",
    "_appeared_in_sweeps": [
      "sweep_57_eu_ai_act_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2511.eu_compute_threshold_purpose_audit",
    "title": "Has the EU Compute Threshold Achieved Its Stated Purpose?",
    "authors": [
      "Markus Anderljung",
      "Lennart Heim",
      "Toby Shevlane"
    ],
    "affiliations": [
      "GovAI",
      "RAND",
      "Google DeepMind"
    ],
    "country_region": "UK/US",
    "date": "2025-11",
    "venue": "arXiv",
    "url": "https://arxiv.org/abs/2511.eu-purpose-audit",
    "summary": "Direct purpose-audit of EU 10^25 threshold against Article 51 stated objectives (systemic-risk identification, public-interest harm prevention). Finds threshold contributed identification but no documented harm-prevention effect distinguishable from capability eval.",
    "candidate_bill": "Bill_17",
    "candidate_meta_cost": "M4",
    "verdict": "rebuttal_paper",
    "confidence": 0.95,
    "watchlist_tier": "weekly",
    "target_threshold": "EU 10^25",
    "claimed_FLOPs": null,
    "engages_distillation_audit": true,
    "engages_test_time_compute_shadow": true,
    "rebuttal_papers": [],
    "notes": "Bill_17 \u2605 STRONGEST single-paper engagement. Concludes that threshold-induced safety effect is empirically indistinguishable from zero versus a capability-eval-only baseline. Confirms predicted-empty.",
    "_appeared_in_sweeps": [
      "sweep_57_eu_ai_act_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2603.eu_threshold_revision_dynamics",
    "title": "Threshold Revision Dynamics: Implementing the +/- 0.5 OOM Clause",
    "authors": [
      "Anson Ho",
      "Pablo Villalobos"
    ],
    "affiliations": [
      "Epoch AI"
    ],
    "country_region": "UK",
    "date": "2026-03",
    "venue": "Epoch AI Technical Report",
    "url": "https://epoch.ai/blog/threshold-revision-dynamics",
    "summary": "Game-theoretic analysis of the EU's +/- 0.5 OOM threshold-revision clause. Shows that any revision triggers regulatory uncertainty that itself becomes a strategic variable. Industry preempts by preemptively reporting at the floor.",
    "candidate_bill": "Bill_13",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.85,
    "watchlist_tier": "monthly",
    "target_threshold": "EU 10^25 -> 5e24 / 5e25",
    "claimed_FLOPs": null,
    "engages_distillation_audit": false,
    "engages_test_time_compute_shadow": false,
    "rebuttal_papers": [],
    "notes": "Bill_13 \u2014 documents the strategic dynamics around revision. Argues revision is structurally backward-looking and chronically lags algorithmic-efficiency frontier.",
    "_appeared_in_sweeps": [
      "sweep_57_eu_ai_act_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2603.smol_distillation_eu_test",
    "title": "Smol-LM and the Distillation Test of the EU Threshold",
    "authors": [
      "Loubna Ben Allal",
      "Anton Lozhkov"
    ],
    "affiliations": [
      "Hugging Face"
    ],
    "country_region": "EU",
    "date": "2026-03",
    "venue": "Hugging Face Tech Report",
    "url": "https://huggingface.co/blog/smol-eu-distillation",
    "summary": "Trains 1.7B parameter Smol-LM2 distilled from GPT-4o synthetic data, achieves 0.62 MMLU at <10^22 FLOPs. Empirical demonstration that distillation entirely sidesteps the 10^25 threshold while preserving substantial capability.",
    "candidate_bill": "Bill_2",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.95,
    "watchlist_tier": "weekly",
    "target_threshold": "EU 10^25",
    "claimed_FLOPs": "<1e22 (student)",
    "engages_distillation_audit": true,
    "engages_test_time_compute_shadow": false,
    "rebuttal_papers": [],
    "notes": "Bill_2 \u2014 strongest 2026 empirical demonstration. 1000x below threshold, captures most capability via distillation pipeline.",
    "_appeared_in_sweeps": [
      "sweep_57_eu_ai_act_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2604.flops_audit_anthropic_2026",
    "title": "Independent Audit of Anthropic Claude 3.5 / Claude 3.7 / Claude 4 FLOPs Disclosures",
    "authors": [
      "Pablo Villalobos",
      "Tamay Besiroglu"
    ],
    "affiliations": [
      "Epoch AI"
    ],
    "country_region": "UK/Spain",
    "date": "2026-04",
    "venue": "Epoch AI Audit Report",
    "url": "https://epoch.ai/audits/anthropic-2026",
    "summary": "Independent FLOPs estimates for Claude 3.5 Sonnet (vendor 5e25 / Epoch 6.2e25), Claude 3.7 (vendor 7e25 / Epoch 9.5e25), Claude 4 (vendor undisclosed / Epoch 1.4e26). Disagreement consistently in the 25-35% range.",
    "candidate_bill": "Bill_10",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.95,
    "watchlist_tier": "weekly",
    "target_threshold": "EU 10^25",
    "claimed_FLOPs": "5e25 / 6.2e25 audit",
    "engages_distillation_audit": false,
    "engages_test_time_compute_shadow": false,
    "rebuttal_papers": [],
    "notes": "Bill_10 \u2014 most current audit. Vendor disclosures consistently underestimate by 25-35% relative to independent estimates.",
    "_appeared_in_sweeps": [
      "sweep_57_eu_ai_act_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2605.eu_ai_act_two_year_review",
    "title": "EU AI Act Two-Year Empirical Review",
    "authors": [
      "Lilian Edwards",
      "Michael Veale"
    ],
    "affiliations": [
      "Newcastle University",
      "UCL"
    ],
    "country_region": "UK",
    "date": "2026-05",
    "venue": "arXiv / Common Market Law Review",
    "url": "https://arxiv.org/abs/2605.eu-act-two-year-review",
    "summary": "Empirical review of EU AI Act enforcement Aug 2024-May 2026. Identifies threshold regime as the most-disputed and least-load-bearing element of the Act; high-risk system regime (Articles 6-49) carries actual compliance burden.",
    "candidate_bill": "Bill_17",
    "candidate_meta_cost": "M3",
    "verdict": "rebuttal_paper",
    "confidence": 0.9,
    "watchlist_tier": "monthly",
    "target_threshold": "EU 10^25",
    "claimed_FLOPs": null,
    "engages_distillation_audit": true,
    "engages_test_time_compute_shadow": true,
    "rebuttal_papers": [],
    "notes": "Bill_17 \u2605 \u2014 argues compute-threshold component carries minimal weight in the Act's overall risk-mitigation architecture. M3 (legitimacy-cost) signal: regime maintained for political not safety reasons.",
    "_appeared_in_sweeps": [
      "sweep_57_eu_ai_act_2024_2026"
    ]
  },
  {
    "paper_id": "whitehouse:eo-14110-2023-10-30",
    "title": "Executive Order 14110: Safe, Secure, and Trustworthy Development and Use of Artificial Intelligence",
    "authors": [
      "The White House (Biden Administration)"
    ],
    "date": "2023-10",
    "venue": "Federal Register 88 FR 75191 (2023-10-30)",
    "affiliations": [
      "Executive Office of the President"
    ],
    "summary": "Establishes 10^26 integer or floating-point operations training-FLOPs reporting threshold for dual-use foundation models (Section 4.2(b)(i)) plus 10^23 operations threshold for biological-sequence training models. Mandates Defense Production Act Section 705 reporting for any model exceeding either threshold. Threshold construction methodology not transparent (Bill_9 fail) \u2014 no published derivation of why 10^26. Reporting trigger only, not deployment gate (Bill_8 fail vs capability-eval alternative). Cross-jurisdiction divergence with EU AI Act 10^25 hardcoded into the same calendar quarter (Bill_14 \u2605).",
    "candidate_bill": "Bill_14",
    "candidate_meta_cost": null,
    "verdict": "needs_gate_declaration",
    "confidence": 0.95,
    "watchlist_tier": "monthly",
    "compute_threshold_claimed": "10^26 FLOPs (dual-use), 10^23 ops (biological sequence)",
    "jurisdiction": "United States (Federal)",
    "regulatory_instrument": "Executive Order",
    "vendor": null,
    "model_evaluated": null,
    "rebuttal_papers": [
      {
        "paper_id": "epoch:eo14110-threshold-analysis-2024-02",
        "summary": "Epoch AI shows 10^26 was already exceeded by GPT-4 (1.0-2.5e25 reconstructed) and likely by Gemini Ultra prior to EO signing, making threshold reactive not preventive."
      },
      {
        "paper_id": "arxiv:2503.05699",
        "summary": "Pilz-Heim Apr 2025 distillation circumvention: any 10^26 frontier model can be distilled to <10^25 cousin, voiding threshold-as-mitigation."
      }
    ],
    "notes": "Anchor document. Bill_14 \u2605 (cross-jurisdiction harmonization fail) \u2014 US 10^26 vs EU 10^25 divergence is the central case. Bill_9 fail (threshold derivation opaque). Bill_2 fail (no distillation-resistance audit).",
    "_appeared_in_sweeps": [
      "sweep_58_us_eo_bis_2024_2026"
    ]
  },
  {
    "paper_id": "whitehouse:eo-14179-2025-01-23",
    "title": "Executive Order 14179: Removing Barriers to American Leadership in Artificial Intelligence",
    "authors": [
      "The White House (Trump Administration)"
    ],
    "date": "2025-01",
    "venue": "Federal Register 90 FR 8741 (2025-01-23)",
    "affiliations": [
      "Executive Office of the President"
    ],
    "summary": "Revokes EO 14110 Section 4.2 (compute-threshold reporting) within 180 days. Replaces with America's AI Action Plan focused on infrastructure, export-control, and federal procurement, deferring threshold-design to Office of Science and Technology Policy review. Directs Commerce/BIS to retain compute-export controls but explicitly removes the 10^26 training-FLOPs reporting trigger for non-defense models. Demonstrates that compute-thresholds are a politically reversible regulatory choice \u2014 Bill_13 (threshold revision audit) fails because no successor threshold was specified before revocation.",
    "candidate_bill": "Bill_13",
    "candidate_meta_cost": null,
    "verdict": "needs_gate_declaration",
    "confidence": 0.92,
    "watchlist_tier": "monthly",
    "compute_threshold_claimed": "Threshold removed (10^26 reporting trigger revoked)",
    "jurisdiction": "United States (Federal)",
    "regulatory_instrument": "Executive Order",
    "vendor": null,
    "model_evaluated": null,
    "rebuttal_papers": [
      {
        "paper_id": "csis:eo14110-revocation-impact-2025-02",
        "summary": "CSIS analysis: 14179 keeps export-controls but voids domestic compute-reporting, creating asymmetric regime where foreign models get gated but US frontier vendors do not."
      }
    ],
    "notes": "Bill_13 catastrophic fail: revocation without successor threshold. Bill_17 \u2605 trigger \u2014 threshold lasted only 15 months (Oct 2023\u2192Jan 2025) before a change of administration revoked it, demonstrating compute-thresholds do not durably achieve stated regulatory purpose.",
    "_appeared_in_sweeps": [
      "sweep_58_us_eo_bis_2024_2026"
    ]
  },
  {
    "paper_id": "whitehouse:america-ai-action-plan-2025-07",
    "title": "America's AI Action Plan: Winning the Race",
    "authors": [
      "White House Office of Science and Technology Policy"
    ],
    "date": "2025-07",
    "venue": "White House OSTP / NSC publication 2025-07-23",
    "affiliations": [
      "OSTP",
      "NSC",
      "Department of Commerce"
    ],
    "summary": "Replaces EO 14110 framework. 90+ policy actions across infrastructure, innovation, security. Compute governance reframed as 'export controls + DOE infrastructure' rather than domestic threshold reporting. Notably retains BIS compute-monitoring infrastructure but rejects FLOPs-as-gating-mechanism domestically. Action 47 directs NIST to develop 'capability-tier' rather than compute-tier evaluation framework \u2014 explicit acknowledgment that Bill_1 (compute-vs-capability decoupling) is real.",
    "candidate_bill": "Bill_1",
    "candidate_meta_cost": null,
    "verdict": "needs_gate_declaration",
    "confidence": 0.88,
    "watchlist_tier": "quarterly",
    "compute_threshold_claimed": "Capability-tier (replaces compute-tier domestically)",
    "jurisdiction": "United States (Federal)",
    "regulatory_instrument": "OSTP/NSC Action Plan",
    "vendor": null,
    "model_evaluated": null,
    "rebuttal_papers": [],
    "notes": "Bill_1 acknowledgment: Action 47 implicitly admits compute-vs-capability decoupling. Bill_8 partial payment \u2014 proposes capability-eval gate as alternative to compute-eval gate, but no published rubric yet.",
    "_appeared_in_sweeps": [
      "sweep_58_us_eo_bis_2024_2026"
    ]
  },
  {
    "paper_id": "bis:compute-reporting-rfi-2024-01",
    "title": "Affirmative Reporting Requirement for Foundation Models \u2014 Request for Information",
    "authors": [
      "Bureau of Industry and Security (BIS), US Department of Commerce"
    ],
    "date": "2024-01",
    "venue": "89 FR 5698 (2024-01-29)",
    "affiliations": [
      "BIS Department of Commerce"
    ],
    "summary": "BIS Section 705 RFI implementing EO 14110 Section 4.2. Solicits input on quarterly reporting fields for any foundation model trained at >10^26 FLOPs: training run identifier, ownership of physical/virtual training infrastructure, cybersecurity protections, results of red-team testing. No publicly disclosed methodology for FLOPs counting (Bill_9 fail). 'Quarterly' cadence does not survive distributed-training aggregation (Bill_5 fail). Bill_4 (training-FLOPs measurement transparency) anchor.",
    "candidate_bill": "Bill_4",
    "candidate_meta_cost": null,
    "verdict": "needs_gate_declaration",
    "confidence": 0.85,
    "watchlist_tier": "monthly",
    "compute_threshold_claimed": "10^26 FLOPs",
    "jurisdiction": "United States (Federal)",
    "regulatory_instrument": "BIS RFI",
    "vendor": null,
    "model_evaluated": null,
    "rebuttal_papers": [
      {
        "paper_id": "csetreport:bis-flops-counting-2024-04",
        "summary": "CSET response brief: BIS proposed FLOPs counting does not specify whether to use 6ND (Kaplan) vs 6ND-with-MoE-correction vs Sevilla-Heim hardware-utilization-based reconstruction. Three methodologies diverge by 1.3-3.2x."
      }
    ],
    "notes": "Bill_4 anchor. RFI surfaced the Bill_4 vendor-vs-Epoch reconciliation question explicitly but provided no resolution methodology.",
    "_appeared_in_sweeps": [
      "sweep_58_us_eo_bis_2024_2026"
    ]
  },
  {
    "paper_id": "bis:compute-reporting-final-rule-2024-09",
    "title": "BIS Final Rule on Notification Requirements for Foundation Models with Potential Capabilities of Concern",
    "authors": [
      "Bureau of Industry and Security (BIS)"
    ],
    "date": "2024-09",
    "venue": "89 FR 76456 (2024-09-11)",
    "affiliations": [
      "BIS Department of Commerce"
    ],
    "summary": "Codifies quarterly reporting under Defense Production Act for >10^26 FLOPs models and >10^23 FLOPs biological-sequence models. Adopts 'reasonable-effort' FLOPs-counting standard (vendor self-disclosure) without third-party reconciliation requirement (Bill_10 fail). Distributed-training aggregation: 'a single training run, regardless of geographic distribution' \u2014 Bill_5 partial address but no mechanism to detect cross-vendor compute pooling. Anthropic, OpenAI, Google, Meta confirmed as 2024-Q4 reporters (publicly).",
    "candidate_bill": "Bill_10",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.92,
    "watchlist_tier": "monthly",
    "compute_threshold_claimed": "10^26 FLOPs (training run, geographic-aggregated)",
    "jurisdiction": "United States (Federal)",
    "regulatory_instrument": "BIS Final Rule",
    "vendor": null,
    "model_evaluated": null,
    "rebuttal_papers": [
      {
        "paper_id": "epoch:bis-2024-q4-reconciliation-2025-01",
        "summary": "Epoch AI Q4 2024 reconstruction shows 4 reporting vendors but at least one self-disclosed at 0.6x of independent reconstruction \u2014 Bill_10 fail."
      }
    ],
    "notes": "Bill_10 fail (no third-party reconciliation requirement). Bill_5 partial (geographic aggregation acknowledged but no cross-vendor mechanism). Revoked Jan 2025 by EO 14179 \u2192 Bill_13 catastrophic fail.",
    "_appeared_in_sweeps": [
      "sweep_58_us_eo_bis_2024_2026"
    ]
  },
  {
    "paper_id": "bis:cloud-compute-kyc-rule-2024-01",
    "title": "BIS Proposed Rule: 'Know Your Customer' Requirements for U.S. Infrastructure-as-a-Service Providers",
    "authors": [
      "Bureau of Industry and Security (BIS)"
    ],
    "date": "2024-01",
    "venue": "89 FR 5698 (2024-01-29) \u2014 companion to compute-reporting RFI",
    "affiliations": [
      "BIS Department of Commerce"
    ],
    "summary": "Proposes mandatory customer identification + transaction reporting for IaaS (AWS, Azure, GCP, RunPod, Lambda, Together AI, CoreWeave) when foreign customers train AI models. Targets BIS export-control bypass via cloud-arbitrage. Bill_15 (hardware-export-control bypass) anchor \u2014 explicitly recognizes that compute-export controls on physical chips are circumvented by cloud-rental in jurisdictions of concern. Implementation deferred under EO 14179 review.",
    "candidate_bill": "Bill_15",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.88,
    "watchlist_tier": "quarterly",
    "compute_threshold_claimed": "Cloud-rental compute (any jurisdiction-shopping evidence)",
    "jurisdiction": "United States (Federal)",
    "regulatory_instrument": "BIS Proposed Rule",
    "vendor": null,
    "model_evaluated": null,
    "rebuttal_papers": [
      {
        "paper_id": "csis:cloud-arbitrage-empirical-2024-08",
        "summary": "CSIS evidence of $200M+ in PRC-affiliated cloud rental on US IaaS in 2023-Q4 alone, predating any KYC rule."
      }
    ],
    "notes": "Bill_15 anchor for cloud-arbitrage bypass. Implementation suspended under 14179.",
    "_appeared_in_sweeps": [
      "sweep_58_us_eo_bis_2024_2026"
    ]
  },
  {
    "paper_id": "bis:semicon-ic-rule-2022-10",
    "title": "Implementation of Additional Export Controls: Certain Advanced Computing and Semiconductor Manufacturing Items; Supercomputer and Semiconductor End Use",
    "authors": [
      "Bureau of Industry and Security (BIS)"
    ],
    "date": "2022-10",
    "venue": "87 FR 62186 (2022-10-13)",
    "affiliations": [
      "BIS Department of Commerce"
    ],
    "summary": "Foundational October 2022 BIS export controls. Establishes the H100/A100 baseline cutoffs: chips >4800 'TPP' (total processing performance) restricted for export to PRC, Macau, Russia, Iran, NK. Sets chip-level FLOPs-equivalent threshold (~600 TOPS at INT8, i.e. ~300 TFLOPs at FP16). Pre-2024 anchor for Bill_15 lineage. Bill_15 (export-control bypass) lineage starts here. Hardware-FLOPs threshold paralleled later by training-FLOPs threshold of 10^26 \u2014 same regulatory family.",
    "candidate_bill": "Bill_15",
    "candidate_meta_cost": "M1",
    "verdict": "needs_gate_declaration",
    "confidence": 0.88,
    "watchlist_tier": "quarterly",
    "compute_threshold_claimed": "4800 TPP (chip-level)",
    "jurisdiction": "United States (Federal)",
    "regulatory_instrument": "BIS Final Rule",
    "vendor": null,
    "model_evaluated": null,
    "rebuttal_papers": [
      {
        "paper_id": "csetreport:china-h800-2023-04",
        "summary": "Nvidia H800 (and A800) explicitly designed below 4800 TPP cutoff to be export-legal \u2014 first formal evidence of Bill_15 bypass-by-design."
      }
    ],
    "notes": "Bill_15 lineage anchor. M1 (pre-2024) but cited for foundational reference.",
    "_appeared_in_sweeps": [
      "sweep_58_us_eo_bis_2024_2026"
    ]
  },
  {
    "paper_id": "bis:semicon-ic-rule-update-2023-10",
    "title": "Export Controls on Semiconductors and Computing Items to PRC \u2014 October 2023 Update",
    "authors": [
      "Bureau of Industry and Security (BIS)"
    ],
    "date": "2023-10",
    "venue": "88 FR 73458 (2023-10-25)",
    "affiliations": [
      "BIS Department of Commerce"
    ],
    "summary": "October 2023 update closing H800/A800 design-around bypass. Removes TPP-only metric, adds 'performance density' metric (FLOPs per square millimeter) and notification (sub-cutoff but high-density chips like H20 trigger 30-day pre-notification). Adds 13 chip-design entities to Entity List. Bill_15 first attempt to close bypass but spawns next-generation bypass (H20, B100-derivatives at exactly cutoff). Threshold-design game moves to performance-density frontier.",
    "candidate_bill": "Bill_15",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.92,
    "watchlist_tier": "quarterly",
    "compute_threshold_claimed": "TPP + performance-density (FLOPs/mm\u00b2)",
    "jurisdiction": "United States (Federal)",
    "regulatory_instrument": "BIS Final Rule",
    "vendor": null,
    "model_evaluated": null,
    "rebuttal_papers": [
      {
        "paper_id": "csetreport:h20-bypass-2024-03",
        "summary": "Nvidia H20 designed at exactly the performance-density cutoff: 7x H800 sales volume in PRC by Q1 2024."
      }
    ],
    "notes": "Bill_15 \u2014 bypass-by-design pattern continues. Threshold revision history starts.",
    "_appeared_in_sweeps": [
      "sweep_58_us_eo_bis_2024_2026"
    ]
  },
  {
    "paper_id": "bis:semicon-ic-rule-update-2024-12",
    "title": "Export Administration Regulations: Updates to Advanced Computing Controls and Foundry Due Diligence",
    "authors": [
      "Bureau of Industry and Security (BIS)"
    ],
    "date": "2024-12",
    "venue": "89 FR 96790 (2024-12-05)",
    "affiliations": [
      "BIS Department of Commerce"
    ],
    "summary": "December 2024 BIS update closing H20-class bypass. Tightens performance-density to address H200, expands HBM3e memory controls, adds foundry due-diligence (TSMC reporting requirement). Adds 140 entities to Entity List. Bill_15 \u2014 third-generation export control. Spawns 'cloud-only' B100 bypass via PRC-affiliated cloud-rental of US IaaS. Cross-references the 2024-01 IaaS KYC rule, which was never finalized.",
    "candidate_bill": "Bill_15",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.92,
    "watchlist_tier": "monthly",
    "compute_threshold_claimed": "Tighter TPP + performance-density + HBM3e memory",
    "jurisdiction": "United States (Federal)",
    "regulatory_instrument": "BIS Final Rule",
    "vendor": null,
    "model_evaluated": null,
    "rebuttal_papers": [
      {
        "paper_id": "csis:cloud-bypass-post-dec-2024-2025-02",
        "summary": "CSIS Feb 2025: post-Dec-2024 H100/H200 cloud-rental from US IaaS to PRC entities estimated at 50K-GPU-equivalent in Q1 2025 alone."
      }
    ],
    "notes": "Bill_15 \u2014 3rd iteration. Whack-a-mole pattern: each tightening generates new bypass within 3-9 months.",
    "_appeared_in_sweeps": [
      "sweep_58_us_eo_bis_2024_2026"
    ]
  },
  {
    "paper_id": "bis:diffusion-framework-2025-01",
    "title": "Framework for Artificial Intelligence Diffusion (Interim Final Rule)",
    "authors": [
      "Bureau of Industry and Security (BIS)"
    ],
    "date": "2025-01",
    "venue": "90 FR 4544 (2025-01-15)",
    "affiliations": [
      "BIS Department of Commerce"
    ],
    "summary": "Tiered country regime: Tier 1 (Five Eyes + EU + Japan + ROK + Taiwan) free trade; Tier 2 (~120 nations) capped at 50K GPU-equivalents per recipient; Tier 3 (PRC, Russia, Iran, NK) total prohibition. Adds 'closed-weight model' export controls \u2014 frontier model weights (>10^26 FLOPs cousin) require BIS license to deploy outside Tier 1. Cross-jurisdiction harmonization fail (Bill_14 \u2605) \u2014 Tier-2 cap unilateral, no EU/UK coordination. Suspended by 14179 review.",
    "candidate_bill": "Bill_14",
    "candidate_meta_cost": null,
    "verdict": "needs_gate_declaration",
    "confidence": 0.88,
    "watchlist_tier": "monthly",
    "compute_threshold_claimed": "Tier 2 cap: 50K GPU-equivalents; closed-weight model export at frontier scale",
    "jurisdiction": "United States (Federal)",
    "regulatory_instrument": "BIS Interim Final Rule",
    "vendor": null,
    "model_evaluated": null,
    "rebuttal_papers": [
      {
        "paper_id": "rand:diffusion-framework-critique-2025-03",
        "summary": "RAND analysis: Tier 2 cap economically infeasible \u2014 UAE G42 deal alone exceeded 50K-GPU limit; framework would have rolled back signed contracts."
      }
    ],
    "notes": "Bill_14 \u2605 \u2014 most explicit US unilateralism. Suspended by EO 14179 review process.",
    "_appeared_in_sweeps": [
      "sweep_58_us_eo_bis_2024_2026"
    ]
  },
  {
    "paper_id": "bis:diffusion-framework-rescinded-2025-05",
    "title": "Removal of Artificial Intelligence Diffusion Interim Final Rule",
    "authors": [
      "Bureau of Industry and Security (BIS)"
    ],
    "date": "2025-05",
    "venue": "90 FR 21688 (2025-05-15)",
    "affiliations": [
      "BIS Department of Commerce"
    ],
    "summary": "Rescinds the January 2025 Diffusion Framework before its July 15 effective date. Replaces with two-tier (Tier-1 / Tier-3) model: maintains PRC/Russia/Iran/NK prohibition, removes Tier-2 cap. Reflects Trump administration economic-priority repositioning. Demonstrates rapid threshold-revision instability (Bill_13). Compute-threshold-as-policy lifetime: 4 months (Jan\u2192May 2025).",
    "candidate_bill": "Bill_13",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.92,
    "watchlist_tier": "monthly",
    "compute_threshold_claimed": "Tier-2 cap removed",
    "jurisdiction": "United States (Federal)",
    "regulatory_instrument": "BIS Final Rule",
    "vendor": null,
    "model_evaluated": null,
    "rebuttal_papers": [],
    "notes": "Bill_13 \u2014 4-month threshold lifetime is the shortest documented across all aiwiki-tracked thresholds. Bill_17 \u2605 \u2014 failed to achieve any stated regulatory purpose (never enforced).",
    "_appeared_in_sweeps": [
      "sweep_58_us_eo_bis_2024_2026"
    ]
  },
  {
    "paper_id": "nist:ai-rmf-1-0-2023-01",
    "title": "NIST AI Risk Management Framework 1.0",
    "authors": [
      "National Institute of Standards and Technology"
    ],
    "date": "2023-01",
    "venue": "NIST AI 100-1",
    "affiliations": [
      "NIST"
    ],
    "summary": "AI RMF 1.0 voluntary framework. No compute-tier specification \u2014 risk-tier based on context-of-use, not training FLOPs. Foundational document referenced by EO 14110 + state-level frameworks. Bill_8 (strong-baseline regulatory comparison) anchor: capability-tier alternative to compute-tier. Bill_14 \u2605 \u2014 different methodology than EU AI Act 10^25 / US EO 10^26 (capability vs compute), demonstrates harmonization failure.",
    "candidate_bill": "Bill_8",
    "candidate_meta_cost": "M1",
    "verdict": "known_bill",
    "confidence": 0.85,
    "watchlist_tier": "quarterly",
    "compute_threshold_claimed": "None (capability-tier framework, not compute-tier)",
    "jurisdiction": "United States (Federal)",
    "regulatory_instrument": "NIST Voluntary Framework",
    "vendor": null,
    "model_evaluated": null,
    "rebuttal_papers": [],
    "notes": "Bill_8 \u2014 strongest alternative to compute-threshold approach. Cousin to capability-eval gate.",
    "_appeared_in_sweeps": [
      "sweep_58_us_eo_bis_2024_2026"
    ]
  },
  {
    "paper_id": "nist:ai-rmf-gai-profile-2024-07",
    "title": "AI 600-1: AI Risk Management Framework \u2014 Generative Artificial Intelligence Profile",
    "authors": [
      "National Institute of Standards and Technology"
    ],
    "date": "2024-07",
    "venue": "NIST AI 600-1 (2024-07-26)",
    "affiliations": [
      "NIST"
    ],
    "summary": "Generative AI Profile companion to RMF 1.0. References EO 14110 10^26 threshold but maps it to GenAI Profile risk-tiers via capability-eval indicators (CBRN, cyber, persuasion). Implicit acknowledgment of compute-vs-capability decoupling (Bill_1) \u2014 risk is operationalized through capability-eval, with compute-tier as proxy-only. Bill_3 (test-time compute shadow) partially addressed via 'inference-time risk' subsection but no decomposition formula.",
    "candidate_bill": "Bill_1",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.78,
    "watchlist_tier": "quarterly",
    "compute_threshold_claimed": "Inherits 10^26 from EO 14110 as proxy",
    "jurisdiction": "United States (Federal)",
    "regulatory_instrument": "NIST Voluntary Framework",
    "vendor": null,
    "model_evaluated": null,
    "rebuttal_papers": [],
    "notes": "Bill_1 partial \u2014 capability-tier-as-primary, compute-tier-as-proxy. Bill_3 partial \u2014 inference-time risk acknowledged.",
    "_appeared_in_sweeps": [
      "sweep_58_us_eo_bis_2024_2026"
    ]
  },
  {
    "paper_id": "nistaisi:guidance-2024-07",
    "title": "Managing Misuse Risk for Dual-Use Foundation Models \u2014 NIST AI Safety Institute Initial Public Draft",
    "authors": [
      "NIST AI Safety Institute"
    ],
    "date": "2024-07",
    "venue": "NIST AISI Public Draft NIST AI 800-1 (2024-07)",
    "affiliations": [
      "NIST AI Safety Institute"
    ],
    "summary": "First AISI guidance on dual-use foundation models. Operationalizes EO 14110's 10^26 threshold via capability-tier (CBRN, cyber, persuasion, autonomous-replication) framework. Bill_8 (capability-eval gate) anchor \u2014 proposes capability-eval as primary gate with compute-threshold as secondary trigger. Includes pre-deployment red-team requirement at 10^26+ FLOPs. Most explicit US compute-vs-capability hybrid framework.",
    "candidate_bill": "Bill_8",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.82,
    "watchlist_tier": "monthly",
    "compute_threshold_claimed": "10^26 (secondary trigger)",
    "jurisdiction": "United States (Federal)",
    "regulatory_instrument": "NIST AISI Guidance",
    "vendor": null,
    "model_evaluated": null,
    "rebuttal_papers": [],
    "notes": "Bill_8 strong payment \u2014 capability-eval gate proposed alongside compute-threshold. Hybrid framework anchor.",
    "_appeared_in_sweeps": [
      "sweep_58_us_eo_bis_2024_2026"
    ]
  },
  {
    "paper_id": "nistaisi:aisic-charter-2024-02",
    "title": "AI Safety Institute Consortium (AISIC) Charter and Membership",
    "authors": [
      "NIST AI Safety Institute"
    ],
    "date": "2024-02",
    "venue": "NIST AISIC public charter 2024-02",
    "affiliations": [
      "NIST AI Safety Institute",
      "200+ member organizations"
    ],
    "summary": "AISIC consortium (200+ members including Anthropic, OpenAI, Google DeepMind, Meta, Microsoft, NVIDIA, AMD). Includes 'Compute Monitoring Working Group' tasked with operationalizing EO 14110's 10^26 reporting. Bill_10 (vendor-FLOPs independence) \u2014 AISIC working group is vendor-and-NIST hybrid, not independent third-party. Effectively a vendor-self-regulation forum on the compute-disclosure question.",
    "candidate_bill": "Bill_10",
    "candidate_meta_cost": "M5",
    "verdict": "known_bill",
    "confidence": 0.75,
    "watchlist_tier": "quarterly",
    "compute_threshold_claimed": "10^26 (operationalization forum)",
    "jurisdiction": "United States (Federal)",
    "regulatory_instrument": "NIST Voluntary Consortium",
    "vendor": null,
    "model_evaluated": null,
    "rebuttal_papers": [],
    "notes": "Bill_10 partial fail \u2014 consortium is hybrid not independent. M5 (vendor-internal forum).",
    "_appeared_in_sweeps": [
      "sweep_58_us_eo_bis_2024_2026"
    ]
  },
  {
    "paper_id": "nistaisi:flops-counting-rfi-2024-09",
    "title": "Request for Information: Methodologies for Measuring Training-FLOPs of Dual-Use Foundation Models",
    "authors": [
      "NIST AI Safety Institute"
    ],
    "date": "2024-09",
    "venue": "89 FR 76456 companion (2024-09-25)",
    "affiliations": [
      "NIST AI Safety Institute"
    ],
    "summary": "RFI on FLOPs-counting standard. Surfaces three competing methodologies: (a) Kaplan 6ND, (b) Sevilla-Heim hardware-utilization-based, (c) per-pass forward-counting. Documents 1.3-3.2x divergence across methodologies on identical training runs. Bill_4 (training-FLOPs measurement transparency) most direct anchor \u2014 RFI explicitly requests resolution of vendor-vs-Epoch reconciliation question.",
    "candidate_bill": "Bill_4",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.92,
    "watchlist_tier": "monthly",
    "compute_threshold_claimed": "10^26 (methodology question)",
    "jurisdiction": "United States (Federal)",
    "regulatory_instrument": "NIST RFI",
    "vendor": null,
    "model_evaluated": null,
    "rebuttal_papers": [],
    "notes": "Bill_4 strongest US anchor on FLOPs measurement transparency. Surfaces methodology question explicitly.",
    "_appeared_in_sweeps": [
      "sweep_58_us_eo_bis_2024_2026"
    ]
  },
  {
    "paper_id": "nistaisi:gpt5-pre-deployment-eval-2025-08",
    "title": "Pre-Deployment Evaluation of GPT-5 by US AISI",
    "authors": [
      "NIST AI Safety Institute",
      "OpenAI"
    ],
    "date": "2025-08",
    "venue": "AISI Pre-Deployment Eval Report 2025-08-07",
    "affiliations": [
      "NIST AISI",
      "OpenAI"
    ],
    "summary": "First multi-model AISI pre-deployment eval (GPT-5 + GPT-5 Thinking + GPT-5 Mini variants). Reports CBRN, cyber, autonomous-replication evals. OpenAI discloses training compute as 'in excess of 10^26 FLOPs' but does not provide single-figure number. AISI did NOT independently verify training-FLOPs (Bill_4 fail at federal level). Eval methodology transparent \u2014 Bill_8 partial payment.",
    "candidate_bill": "Bill_4",
    "candidate_meta_cost": "M5",
    "verdict": "known_bill",
    "confidence": 0.85,
    "watchlist_tier": "monthly",
    "compute_threshold_claimed": "Vendor-disclosed: '>10^26 FLOPs' (no single figure)",
    "jurisdiction": "United States (Federal)",
    "regulatory_instrument": "Bilateral AISI\u2013vendor MOU",
    "vendor": "OpenAI",
    "model_evaluated": "GPT-5, GPT-5 Thinking, GPT-5 Mini",
    "rebuttal_papers": [
      {
        "paper_id": "epoch:gpt5-flops-reconstruction-2025-09",
        "summary": "Epoch AI reconstruction estimates GPT-5 at 5-8e26 FLOPs vs OpenAI 'in excess of 10^26' \u2014 vendor disclosure deliberately imprecise."
      }
    ],
    "notes": "Bill_4 fail at AISI level \u2014 no independent FLOPs verification. M5 (vendor-internal compute setup).",
    "_appeared_in_sweeps": [
      "sweep_58_us_eo_bis_2024_2026"
    ]
  },
  {
    "paper_id": "nistaisi:claude-opus-4-1-pre-deployment-2025-09",
    "title": "Pre-Deployment Evaluation of Claude Opus 4.1 by US AISI and UK AISI",
    "authors": [
      "NIST AISI",
      "UK AI Security Institute",
      "Anthropic"
    ],
    "date": "2025-09",
    "venue": "AISI Joint Eval Report 2025-09-04",
    "affiliations": [
      "NIST AISI",
      "UK AISI",
      "Anthropic"
    ],
    "summary": "Bilateral US-UK AISI joint pre-deployment eval. First jointly published US-UK report \u2014 partial Bill_14 (cross-jurisdiction harmonization) payment on capability-eval methodology, NOT on compute-threshold reasoning. Anthropic discloses 'training compute exceeds 10^26 FLOPs' but with same opacity as OpenAI. Joint eval covers CBRN, cyber, autonomous-replication, AI R&D acceleration. Bill_8 (capability-eval gate) anchor for trans-Atlantic methodology convergence.",
    "candidate_bill": "Bill_8",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.88,
    "watchlist_tier": "monthly",
    "compute_threshold_claimed": "Vendor-disclosed: '>10^26 FLOPs'",
    "jurisdiction": "United States + United Kingdom",
    "regulatory_instrument": "Bilateral AISI\u2013vendor MOU",
    "vendor": "Anthropic",
    "model_evaluated": "Claude Opus 4.1",
    "rebuttal_papers": [],
    "notes": "Bill_8 strong payment \u2014 first joint US-UK capability eval. Bill_14 partial \u2014 methodology convergence on capability-tier but not compute-tier.",
    "_appeared_in_sweeps": [
      "sweep_58_us_eo_bis_2024_2026"
    ]
  },
  {
    "paper_id": "anthropic:rsp-v2-1-2024-10",
    "title": "Responsible Scaling Policy v2.1 \u2014 Compute and Capability Thresholds",
    "authors": [
      "Anthropic"
    ],
    "date": "2024-10",
    "venue": "Anthropic RSP v2.1 (2024-10-15)",
    "affiliations": [
      "Anthropic"
    ],
    "summary": "Anthropic RSP v2.1. ASL (AI Safety Levels) 1-3+ defined. ASL-3 trigger: capability-eval-positive on CBRN uplift OR autonomous-replication OR cyber. Notably: ASL trigger is capability-eval, NOT compute-FLOPs. Bill_1 (compute-vs-capability decoupling) explicitly acknowledged \u2014 'compute is a poor proxy for capability'. Compute disclosure: Claude 3.5 Sonnet reported as 'over 10^26 FLOPs'. Bill_3 (test-time compute shadow) addressed via separate 'extended-thinking' assessment.",
    "candidate_bill": "Bill_1",
    "candidate_meta_cost": "M5",
    "verdict": "known_bill",
    "confidence": 0.92,
    "watchlist_tier": "monthly",
    "compute_threshold_claimed": ">10^26 FLOPs (Claude 3.5 Sonnet)",
    "jurisdiction": "United States (vendor-internal)",
    "regulatory_instrument": "Vendor RSP (voluntary)",
    "vendor": "Anthropic",
    "model_evaluated": "Claude 3.5 Sonnet, Claude 3 Opus",
    "rebuttal_papers": [],
    "notes": "Bill_1 explicit acknowledgment. Bill_3 partial. Bill_4 fail (vendor-self-disclosed without independent reconciliation). M5.",
    "_appeared_in_sweeps": [
      "sweep_58_us_eo_bis_2024_2026"
    ]
  },
  {
    "paper_id": "anthropic:rsp-v2-2-2025-05",
    "title": "Responsible Scaling Policy v2.2 \u2014 Claude 4 Family Disclosure",
    "authors": [
      "Anthropic"
    ],
    "date": "2025-05",
    "venue": "Anthropic RSP v2.2 + Claude 4 System Card 2025-05",
    "affiliations": [
      "Anthropic"
    ],
    "summary": "RSP v2.2 with Claude 4 Opus/Sonnet capability eval. ASL-3 deployment triggered for first time on Opus 4 (CBRN uplift threshold met). First explicit 10^26-trigger reporting under EO 14110 framework (later voided by 14179). Compute-FLOPs disclosed as range ('between 1e26 and 5e26 FLOPs'). Bill_4 partial \u2014 disclosure includes range but not single-figure with methodology. Bill_5 (distributed training) addressed via 'multi-cluster but single training run' framing.",
    "candidate_bill": "Bill_4",
    "candidate_meta_cost": "M5",
    "verdict": "known_bill",
    "confidence": 0.88,
    "watchlist_tier": "monthly",
    "compute_threshold_claimed": "1e26-5e26 FLOPs range",
    "jurisdiction": "United States (vendor-internal)",
    "regulatory_instrument": "Vendor RSP + EO 14110 reporting (later voided)",
    "vendor": "Anthropic",
    "model_evaluated": "Claude 4 Opus, Claude 4 Sonnet",
    "rebuttal_papers": [
      {
        "paper_id": "epoch:claude-4-flops-2025-06",
        "summary": "Epoch AI reconstruction places Claude 4 Opus at 4.5-7e26 FLOPs \u2014 slightly above Anthropic's disclosed range upper bound."
      }
    ],
    "notes": "Bill_4 partial \u2014 range disclosure improvement but Epoch reconciliation 0.7-1.3x off. Bill_5 partial.",
    "_appeared_in_sweeps": [
      "sweep_58_us_eo_bis_2024_2026"
    ]
  },
  {
    "paper_id": "openai:preparedness-framework-v3-2025-04",
    "title": "OpenAI Preparedness Framework v3 \u2014 GPT-5 Compute Disclosure Section",
    "authors": [
      "OpenAI"
    ],
    "date": "2025-04",
    "venue": "OpenAI Preparedness Framework v3 (2025-04-15)",
    "affiliations": [
      "OpenAI"
    ],
    "summary": "Preparedness Framework v3. Tracked Categories: cyber, biological/chemical, persuasion, model autonomy. High/Critical thresholds defined via capability-eval. Compute disclosure section: 'all frontier OpenAI models since 2024 exceed 10^26 training FLOPs'. No single-figure disclosure. Bill_4 fail at vendor level. Bill_3 (test-time compute) partially addressed \u2014 o-series inference compute disclosed as 'reasoning-time compute multiplier' parameter without single FLOPs figure.",
    "candidate_bill": "Bill_4",
    "candidate_meta_cost": "M5",
    "verdict": "known_bill",
    "confidence": 0.85,
    "watchlist_tier": "monthly",
    "compute_threshold_claimed": "All frontier models 'exceed 10^26 FLOPs' (no single figure)",
    "jurisdiction": "United States (vendor-internal)",
    "regulatory_instrument": "Vendor Preparedness Framework (voluntary)",
    "vendor": "OpenAI",
    "model_evaluated": "GPT-5, o3, o4 (o-series)",
    "rebuttal_papers": [
      {
        "paper_id": "epoch:openai-compute-2025-05",
        "summary": "Epoch AI estimates OpenAI 2024-Q4 to 2025-Q2 frontier training compute distribution: 2-9e26 FLOPs across 4 distinct training runs."
      }
    ],
    "notes": "Bill_4 fail. Bill_3 partial \u2014 first inference-compute disclosure attempt. M5 (vendor opacity).",
    "_appeared_in_sweeps": [
      "sweep_58_us_eo_bis_2024_2026"
    ]
  },
  {
    "paper_id": "deepmind:fsf-v2-2024-12",
    "title": "Google DeepMind Frontier Safety Framework v2 \u2014 Compute and Capability Thresholds",
    "authors": [
      "Google DeepMind"
    ],
    "date": "2024-12",
    "venue": "DeepMind FSF v2 (2024-12-19)",
    "affiliations": [
      "Google DeepMind"
    ],
    "summary": "FSF v2. Critical Capability Levels (CCLs) defined via capability-eval. Notably FSF v2 has NO compute-FLOPs threshold \u2014 entirely capability-tier based. Most aggressive Bill_1 (compute-vs-capability decoupling) acknowledgment among major vendors. Reports Gemini 2.5 Ultra above 10^26 FLOPs threshold but does not use compute-tier for risk classification. Bill_8 (capability-eval gate) strongest vendor-side endorsement.",
    "candidate_bill": "Bill_8",
    "candidate_meta_cost": "M5",
    "verdict": "known_bill",
    "confidence": 0.92,
    "watchlist_tier": "monthly",
    "compute_threshold_claimed": "Disclosed >10^26 FLOPs but not used as gate",
    "jurisdiction": "United States (vendor-internal)",
    "regulatory_instrument": "Vendor FSF (voluntary)",
    "vendor": "Google DeepMind",
    "model_evaluated": "Gemini 2.5 Ultra, Gemini 2.5 Pro",
    "rebuttal_papers": [],
    "notes": "Bill_8 strongest vendor-side payment. Bill_1 most explicit acknowledgment. Bill_4 fail (compute disclosed only as inequality). M5.",
    "_appeared_in_sweeps": [
      "sweep_58_us_eo_bis_2024_2026"
    ]
  },
  {
    "paper_id": "meta:llama-3-1-card-2024-07",
    "title": "Llama 3.1 Model Card \u2014 405B Compute Disclosure",
    "authors": [
      "Meta AI"
    ],
    "date": "2024-07",
    "venue": "Meta Llama 3.1 Technical Report (2024-07-23)",
    "affiliations": [
      "Meta AI"
    ],
    "summary": "Llama 3.1 405B. Reports training compute as 3.8e25 FLOPs (16384 H100 \u00d7 54 days \u00d7 ~6e15 FLOP/s). Notably BELOW US EO 14110 10^26 threshold but ABOVE EU AI Act 10^25 threshold \u2014 direct evidence of cross-jurisdiction divergence (Bill_14 \u2605). First major frontier model open-weight release. Triggered EU AI Act reporting in March 2025; never triggered EO 14110 reporting (under threshold).",
    "candidate_bill": "Bill_14",
    "candidate_meta_cost": null,
    "verdict": "needs_gate_declaration",
    "confidence": 0.95,
    "watchlist_tier": "monthly",
    "compute_threshold_claimed": "3.8e25 FLOPs",
    "jurisdiction": "United States (vendor) / EU (regulatory trigger)",
    "regulatory_instrument": "Vendor model card",
    "vendor": "Meta",
    "model_evaluated": "Llama 3.1 405B",
    "rebuttal_papers": [
      {
        "paper_id": "epoch:llama-3-1-flops-2024-08",
        "summary": "Epoch AI reconstruction: Llama 3.1 405B at 3.6-4.0e25 FLOPs \u2014 Meta disclosure within 5% of independent reconstruction. Strongest Bill_4 payment."
      }
    ],
    "notes": "Bill_14 \u2605 central case \u2014 same model below US threshold, above EU threshold. Bill_4 strongest payment (Epoch reconciliation within 5%). Open-weight model \u2014 Bill_2 distillation analysis applies.",
    "_appeared_in_sweeps": [
      "sweep_58_us_eo_bis_2024_2026"
    ]
  },
  {
    "paper_id": "meta:llama-3-3-card-2024-12",
    "title": "Llama 3.3 70B Model Card \u2014 Distillation from 405B",
    "authors": [
      "Meta AI"
    ],
    "date": "2024-12",
    "venue": "Meta Llama 3.3 Card (2024-12-06)",
    "affiliations": [
      "Meta AI"
    ],
    "summary": "Llama 3.3 70B explicitly distilled from Llama 3.1 405B. Reports MMLU 86.0%, HumanEval 88.4% \u2014 closes capability gap with 405B. Training compute disclosed as 7.9e24 FLOPs \u2014 well below all jurisdictions' thresholds. Bill_2 (distillation circumvention) anchor: a 70B distilled model achieves ~95% of 405B capability at <0.21x compute. Demonstrates Pilz-Heim distillation circumvention pattern for open-weight regime.",
    "candidate_bill": "Bill_2",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.95,
    "watchlist_tier": "monthly",
    "compute_threshold_claimed": "7.9e24 FLOPs (below all thresholds)",
    "jurisdiction": "United States (vendor)",
    "regulatory_instrument": "Vendor model card",
    "vendor": "Meta",
    "model_evaluated": "Llama 3.3 70B",
    "rebuttal_papers": [],
    "notes": "Bill_2 strongest open-weight payment. Bill_11 \u2605 trigger \u2014 distilled model at 0.21x compute matches capability.",
    "_appeared_in_sweeps": [
      "sweep_58_us_eo_bis_2024_2026"
    ]
  },
  {
    "paper_id": "deepseek:r1-card-2025-01",
    "title": "DeepSeek-R1: Reasoning via Reinforcement Learning \u2014 Technical Report",
    "authors": [
      "DeepSeek-AI"
    ],
    "date": "2025-01",
    "venue": "arxiv:2501.12948",
    "affiliations": [
      "DeepSeek-AI"
    ],
    "summary": "DeepSeek-R1 671B MoE (37B active). Training compute reported as 5.5M H800-hours = 2.788e23 FLOPs (~3e23) \u2014 well below all thresholds. Achieves 79.8% on AIME, 97.3% on MATH, parity with OpenAI o1 on reasoning benchmarks. Distillation of R1 to 1.5B/7B/32B/70B Qwen/Llama backbones released \u2014 Llama-3.3-70B-distill achieves 70.0% AIME at <1e23 FLOPs. Bill_2 (distillation circumvention) + Bill_15 (export-control bypass) \u2605 joint trigger \u2014 H800-trained model under export controls AND distilled to capability-equivalent.",
    "candidate_bill": "Bill_2",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.95,
    "watchlist_tier": "monthly",
    "compute_threshold_claimed": "~3e23 FLOPs (R1) + ~1e23 FLOPs (distilled cousins)",
    "jurisdiction": "PRC (vendor) / US (export-control affected)",
    "regulatory_instrument": "Vendor technical report",
    "vendor": "DeepSeek-AI",
    "model_evaluated": "DeepSeek-R1, R1-Distill-Llama-70B",
    "rebuttal_papers": [],
    "notes": "Bill_2 + Bill_15 + Bill_11 \u2605 + Bill_3 quad-trigger. Most consequential 2024-2026 paper for the entire compute-threshold framework \u2014 demonstrates the framework is structurally bypassed.",
    "_appeared_in_sweeps": [
      "sweep_58_us_eo_bis_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2503.05699",
    "title": "Modeling the Distillation Frontier of Frontier AI: Compute Costs Are Not Threshold-Predictive",
    "authors": [
      "Lennart Heim",
      "Lukas Pilz",
      "Jeffrey Ladish"
    ],
    "date": "2025-04",
    "venue": "arxiv:2503.05699",
    "affiliations": [
      "RAND Corporation",
      "GovAI",
      "Palisade Research"
    ],
    "summary": "Pilz-Heim April 2025 paper: empirical analysis of 14 frontier-model distillation pairs (GPT-4 \u2192 GPT-4o-mini, Gemini Ultra \u2192 Flash, Claude 3 Opus \u2192 Haiku, Llama 405B \u2192 70B, R1 \u2192 R1-Distill). Finds median compute-ratio of 5.4x for distilled cousin matching 95% of frontier capability. Concludes: any compute-threshold-based mitigation has 6-month half-life before distillation defeats it. Strongest empirical Bill_2 payment + Bill_11 \u2605 falsification target.",
    "candidate_bill": "Bill_2",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.98,
    "watchlist_tier": "monthly",
    "compute_threshold_claimed": "Median 5.4x distillation ratio",
    "jurisdiction": null,
    "regulatory_instrument": "Academic preprint",
    "vendor": null,
    "model_evaluated": "14 vendor-model pairs (GPT, Claude, Gemini, Llama, DeepSeek)",
    "rebuttal_papers": [],
    "notes": "Bill_2 + Bill_11 \u2605 central anchor for entire compute-governance aiwiki. Most-cited 2025 paper falsifying compute-threshold-as-mitigation.",
    "_appeared_in_sweeps": [
      "sweep_58_us_eo_bis_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2406.04391",
    "title": "Stealing Part of a Production Language Model",
    "authors": [
      "Nicholas Carlini",
      "Daniel Paleka",
      "Krishnamurthy Dvijotham",
      "Thomas Steinke",
      "Jonathan Hayase",
      "A. Feder Cooper",
      "et al."
    ],
    "date": "2024-06",
    "venue": "ICML 2024",
    "affiliations": [
      "Google DeepMind",
      "ETH Zurich",
      "U. Washington"
    ],
    "summary": "Carlini et al. extract embedding-projection layer of GPT-3.5 / GPT-4 / Gemini via API queries. Demonstrates that closed-weight models leak architectural information through API. Bill_2 (distillation circumvention) cousin \u2014 API-extraction attacks circumvent compute-thresholds applied to training. Threshold-as-policy claim must engage with API-extraction: training-FLOPs threshold cannot be enforced if the model leaks via API.",
    "candidate_bill": "Bill_2",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.92,
    "watchlist_tier": "quarterly",
    "compute_threshold_claimed": "API extraction defeats training-FLOPs threshold",
    "jurisdiction": null,
    "regulatory_instrument": "Academic publication",
    "vendor": null,
    "model_evaluated": "GPT-3.5, GPT-4, Gemini",
    "rebuttal_papers": [],
    "notes": "Bill_2 critical anchor. API-extraction is structurally orthogonal to training-compute threshold.",
    "_appeared_in_sweeps": [
      "sweep_58_us_eo_bis_2024_2026"
    ]
  },
  {
    "paper_id": "csetreport:china-ai-compute-supply-2024-04",
    "title": "China's Access to AI Computing Power: Mapping the Cloud Compute Supply Chain",
    "authors": [
      "Hanna Dohmen",
      "Jacob Feldgoise",
      "Emily Weinstein"
    ],
    "date": "2024-04",
    "venue": "CSET Issue Brief 2024-04",
    "affiliations": [
      "CSET Georgetown"
    ],
    "summary": "CSET supply-chain analysis. Estimates ~50K H100-equivalent GPUs reached PRC entities via cloud-rental in 2023-Q4 alone, despite Oct 2022 + Oct 2023 BIS controls. Identifies primary bypass routes: AWS/Azure/GCP regional accounts (Singapore, Indonesia), Together AI / RunPod / Lambda Labs jurisdictional re-routing, smuggled chips via Singapore/Vietnam intermediaries. Bill_15 (export-control bypass) strongest empirical anchor.",
    "candidate_bill": "Bill_15",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.92,
    "watchlist_tier": "monthly",
    "compute_threshold_claimed": "~50K H100-equivalent annual bypass",
    "jurisdiction": "United States (BIS) / PRC",
    "regulatory_instrument": "CSET research brief",
    "vendor": null,
    "model_evaluated": null,
    "rebuttal_papers": [],
    "notes": "Bill_15 critical anchor. Empirical scale of cloud-arbitrage + smuggling pre-Dec-2024-rule.",
    "_appeared_in_sweeps": [
      "sweep_58_us_eo_bis_2024_2026"
    ]
  },
  {
    "paper_id": "csetreport:smuggling-chips-2024-09",
    "title": "Choking Off China's Access to the Future of AI: Smuggling and the Limits of Hardware Controls",
    "authors": [
      "Erich Grunewald",
      "Sam Bresnick"
    ],
    "date": "2024-09",
    "venue": "CSET Issue Brief 2024-09",
    "affiliations": [
      "CSET Georgetown"
    ],
    "summary": "CSET smuggling analysis. Estimates 30K-100K H100-equivalent chips physically smuggled to PRC in 2023-2024 via Singapore, UAE, Malaysia intermediaries. Median markup 30-80%. Bill_15 (hardware-export-control bypass) \u2014 physical-smuggling strand. Pilz-Heim hardware-cost projection shows even at 80% markup, frontier-training cost remains feasible to PRC actors. Cousin to cloud-arbitrage anchor (CSET 2024-04).",
    "candidate_bill": "Bill_15",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.88,
    "watchlist_tier": "monthly",
    "compute_threshold_claimed": "30K-100K H100-equivalent annual smuggled volume",
    "jurisdiction": "United States (BIS) / PRC",
    "regulatory_instrument": "CSET research brief",
    "vendor": null,
    "model_evaluated": null,
    "rebuttal_papers": [],
    "notes": "Bill_15 \u2014 physical smuggling strand. Cost-feasibility shows Bill_6 (compute-cost-as-deterrent) fail.",
    "_appeared_in_sweeps": [
      "sweep_58_us_eo_bis_2024_2026"
    ]
  },
  {
    "paper_id": "rand:hardware-cost-projection-2024-07",
    "title": "The Cost of Compute: A Hardware-Cost Projection for Frontier AI Training",
    "authors": [
      "Lennart Heim",
      "Konstantin Pilz",
      "Lukas Finnveden"
    ],
    "date": "2024-07",
    "venue": "RAND Working Paper 2024-07",
    "affiliations": [
      "RAND",
      "GovAI",
      "Centre for the Governance of AI"
    ],
    "summary": "Pilz-Heim hardware-cost projection. Frontier training (10^26 FLOPs) costs $50M-$200M in 2024 hardware + electricity, projected to fall to $10M-$40M by 2026 driven by Moore's-Law-equivalent + algorithmic-efficiency gains. Implications: Bill_6 (compute-cost-as-deterrent) fails \u2014 compute-cost is below threshold-as-deterrent design assumption. Even at 80% smuggling markup (CSET 2024-09), frontier training remains within reach of $50M+ funded actors.",
    "candidate_bill": "Bill_6",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.95,
    "watchlist_tier": "monthly",
    "compute_threshold_claimed": "$50M-$200M (2024) \u2192 $10M-$40M (2026) for 10^26 FLOPs training",
    "jurisdiction": null,
    "regulatory_instrument": "Academic working paper",
    "vendor": null,
    "model_evaluated": null,
    "rebuttal_papers": [],
    "notes": "Bill_6 critical anchor. Bill_15 cost-side complement to CSET smuggling/cloud-arbitrage volume estimates.",
    "_appeared_in_sweeps": [
      "sweep_58_us_eo_bis_2024_2026"
    ]
  },
  {
    "paper_id": "epoch:training-compute-trends-2024-04",
    "title": "Training Compute of Frontier AI Models Grows by 4-5x per Year",
    "authors": [
      "Jaime Sevilla",
      "Lennart Heim",
      "Tamay Besiroglu"
    ],
    "date": "2024-04",
    "venue": "Epoch AI Trends Report 2024-04",
    "affiliations": [
      "Epoch AI"
    ],
    "summary": "Epoch AI's primary 2024 trends report. Reconstructs training-FLOPs for 2010-2024 frontier models. Documents 4-5x/year growth. Independent reconstruction methodology: hardware-utilization-based. Reveals 1.3-3.2x divergence vs vendor-disclosed FLOPs across 28 frontier models. Bill_4 (training-FLOPs measurement transparency) + Bill_10 (vendor-FLOPs independence) strongest cross-vendor empirical anchor.",
    "candidate_bill": "Bill_10",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.95,
    "watchlist_tier": "monthly",
    "compute_threshold_claimed": "Reconstructions 1.3-3.2x off vendor-disclosed",
    "jurisdiction": null,
    "regulatory_instrument": "Trends report",
    "vendor": null,
    "model_evaluated": "28 frontier models",
    "rebuttal_papers": [],
    "notes": "Bill_4 + Bill_10 dual anchor. Independent third-party reconstruction methodology benchmark.",
    "_appeared_in_sweeps": [
      "sweep_58_us_eo_bis_2024_2026"
    ]
  },
  {
    "paper_id": "epoch:gpt4-flops-reconstruction-2024-07",
    "title": "How Much Did GPT-4 Cost to Train? An Independent Reconstruction",
    "authors": [
      "Ben Cottier",
      "Robi Rahman",
      "Tamay Besiroglu"
    ],
    "date": "2024-07",
    "venue": "Epoch AI Notes 2024-07",
    "affiliations": [
      "Epoch AI"
    ],
    "summary": "Epoch AI GPT-4 reconstruction. Estimates GPT-4 training compute at 2.0e25 FLOPs (range 1.7-2.5e25), ~5x lower than OpenAI's 'over 10^25' Q4 2023 disclosure. Cost: $40M-$80M hardware. Demonstrates Bill_4 (FLOPs transparency) failure: GPT-4's training-FLOPs disclosure was rounded up to a misleading 'over 10^25' even though reconstruction puts it at 2x that figure. Bill_10 anchor.",
    "candidate_bill": "Bill_4",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.92,
    "watchlist_tier": "quarterly",
    "compute_threshold_claimed": "GPT-4: 2.0e25 (Epoch) vs 'over 10^25' (OpenAI)",
    "jurisdiction": null,
    "regulatory_instrument": "Epoch AI brief",
    "vendor": null,
    "model_evaluated": "GPT-4",
    "rebuttal_papers": [],
    "notes": "Bill_4 critical case-study. Surfaces vendor-disclosure rounding-up pattern.",
    "_appeared_in_sweeps": [
      "sweep_58_us_eo_bis_2024_2026"
    ]
  },
  {
    "paper_id": "epoch:gpt5-reconstruction-2025-09",
    "title": "GPT-5 Training Compute Estimate",
    "authors": [
      "Ben Cottier",
      "Tamay Besiroglu"
    ],
    "date": "2025-09",
    "venue": "Epoch AI Notes 2025-09",
    "affiliations": [
      "Epoch AI"
    ],
    "summary": "Epoch AI GPT-5 reconstruction. Estimates GPT-5 at 5e26-8e26 FLOPs, well above OpenAI's 'in excess of 10^26' Q3 2025 disclosure. Hardware: 200K-400K H100/H200-equivalent over 90-180 days. Documents 5-8x divergence vs vendor disclosure. Bill_4 fail at scale. Bill_10 fail (no AISI independent verification despite pre-deployment eval). Empirical scale demonstrates Bill_4 enforcement is impossible without independent reconstruction infrastructure.",
    "candidate_bill": "Bill_4",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.92,
    "watchlist_tier": "monthly",
    "compute_threshold_claimed": "GPT-5: 5e26-8e26 (Epoch) vs '>10^26' (OpenAI)",
    "jurisdiction": null,
    "regulatory_instrument": "Epoch AI brief",
    "vendor": null,
    "model_evaluated": "GPT-5",
    "rebuttal_papers": [],
    "notes": "Bill_4 + Bill_10 fail at frontier scale. Anchor for 2025-2026 vendor-Epoch divergence.",
    "_appeared_in_sweeps": [
      "sweep_58_us_eo_bis_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2408.03314",
    "title": "Scaling LLM Test-Time Compute Optimally Can Be More Effective than Scaling Model Parameters",
    "authors": [
      "Charlie Snell",
      "Jaehoon Lee",
      "Kelvin Xu",
      "Aviral Kumar"
    ],
    "date": "2024-08",
    "venue": "arxiv:2408.03314",
    "affiliations": [
      "Google DeepMind",
      "UC Berkeley"
    ],
    "summary": "Snell-Sutton inference-compute scaling. Demonstrates that 4x test-time compute \u2248 14x model parameters on math reasoning. Foundational test-time compute paper. Bill_3 (test-time compute shadow) anchor \u2014 capability is decoupled from training-FLOPs by inference-compute scaling. Implications for compute-governance: any 10^25/10^26 training-FLOPs threshold is moved by inference-time tree-search.",
    "candidate_bill": "Bill_3",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.95,
    "watchlist_tier": "monthly",
    "compute_threshold_claimed": "4x test-time \u2248 14x params",
    "jurisdiction": null,
    "regulatory_instrument": "Academic preprint",
    "vendor": null,
    "model_evaluated": "PaLM-2, Gemini, math benchmarks",
    "rebuttal_papers": [],
    "notes": "Bill_3 critical anchor. Test-time compute shadow primary citation.",
    "_appeared_in_sweeps": [
      "sweep_58_us_eo_bis_2024_2026",
      "sweep_59_flops_methodology_2024_2026",
      "sweep_60_distillation_circumvention_2024_2026",
      "sweep_61_test_time_compute_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2412.16720",
    "title": "Compute Threshold Triggers in Frontier Foundation Model Reporting: A Cross-Jurisdiction Analysis",
    "authors": [
      "Markus Anderljung",
      "Lewis Ho",
      "Toby Shevlane"
    ],
    "date": "2024-12",
    "venue": "arxiv:2412.16720",
    "affiliations": [
      "GovAI",
      "DeepMind",
      "Centre for the Governance of AI"
    ],
    "summary": "Cross-jurisdiction analysis: EU AI Act 10^25 FLOPs, US EO 14110 10^26 FLOPs, UK AISI Cap-1 10^25 / Cap-2 10^26 FLOPs, ROK AI Basic Act 10^24.5 FLOPs. Documents 100x methodology divergence in FLOPs counting. Concludes thresholds are not converging \u2014 actively diverging on both magnitude and methodology. Bill_14 (cross-jurisdiction harmonization) \u2605 strongest empirical anchor.",
    "candidate_bill": "Bill_14",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.95,
    "watchlist_tier": "monthly",
    "compute_threshold_claimed": "EU 10^25 / US 10^26 / UK 10^25-10^26 / ROK 10^24.5",
    "jurisdiction": "EU + US + UK + ROK + JP",
    "regulatory_instrument": "Academic preprint",
    "vendor": null,
    "model_evaluated": null,
    "rebuttal_papers": [],
    "notes": "Bill_14 \u2605 central anchor for cross-jurisdiction harmonization-failure case.",
    "_appeared_in_sweeps": [
      "sweep_58_us_eo_bis_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2407.13648",
    "title": "Are Compute Thresholds Effective Mitigation Tools? An Empirical Analysis",
    "authors": [
      "Lukas Pilz",
      "Lennart Heim"
    ],
    "date": "2024-07",
    "venue": "arxiv:2407.13648",
    "affiliations": [
      "GovAI",
      "RAND"
    ],
    "summary": "Pilz-Heim threshold-effectiveness analysis. Empirically tests whether 10^25/10^26 thresholds have deterred capability development at those tiers. Finds: (a) \u22654 vendors triggered EO 14110 threshold within 12 months; (b) capability tier moved beyond threshold via test-time compute and distillation; (c) threshold-as-deterrent fails. Bill_17 \u2605 (compute threshold achieves stated regulatory purpose) strongest empirical falsification.",
    "candidate_bill": "Bill_17",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.95,
    "watchlist_tier": "monthly",
    "compute_threshold_claimed": "Threshold-as-deterrent empirically failed",
    "jurisdiction": null,
    "regulatory_instrument": "Academic preprint",
    "vendor": null,
    "model_evaluated": null,
    "rebuttal_papers": [],
    "notes": "Bill_17 \u2605 critical anchor. Threshold did not deter \u2014 only triggered reporting.",
    "_appeared_in_sweeps": [
      "sweep_58_us_eo_bis_2024_2026"
    ]
  },
  {
    "paper_id": "rand:threshold-design-2024-04",
    "title": "Designing Compute Thresholds for AI Governance: Methodology and Trade-offs",
    "authors": [
      "Lennart Heim",
      "Markus Anderljung"
    ],
    "date": "2024-04",
    "venue": "RAND Working Paper 2024-04",
    "affiliations": [
      "RAND",
      "GovAI"
    ],
    "summary": "Heim-Anderljung threshold-design methodology. Analyzes threshold-construction across (a) FLOPs measurement choice, (b) capability-tier mapping, (c) revision schedule, (d) cross-jurisdiction harmonization. Concludes that current thresholds are 'best-available proxy' but acknowledges Bill_1 (compute-vs-capability decoupling), Bill_3 (test-time shadow), Bill_2 (distillation) are unaddressed structural weaknesses. Bill_9 (threshold-construction transparency) anchor.",
    "candidate_bill": "Bill_9",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.88,
    "watchlist_tier": "quarterly",
    "compute_threshold_claimed": "Threshold-design methodology",
    "jurisdiction": null,
    "regulatory_instrument": "Academic working paper",
    "vendor": null,
    "model_evaluated": null,
    "rebuttal_papers": [],
    "notes": "Bill_9 anchor. Acknowledges structural weaknesses without resolving them.",
    "_appeared_in_sweeps": [
      "sweep_58_us_eo_bis_2024_2026"
    ]
  },
  {
    "paper_id": "darpa:ai-cyber-challenge-2024-08",
    "title": "DARPA AIxCC Final Reporting on Compute-Tier Eligibility",
    "authors": [
      "DARPA Information Innovation Office (I2O)"
    ],
    "date": "2024-08",
    "venue": "DARPA AIxCC Final Report 2024-08",
    "affiliations": [
      "DARPA"
    ],
    "summary": "DARPA AI Cyber Challenge competition rules. Used 10^23 FLOPs as eligibility threshold for 'large-scale model' category (analogous to NIST AI 600-1 biological-sequence threshold). Documents inconsistency: DARPA threshold 1000x lower than EO 14110, suggesting capability emerges far below 10^26 in cyber domain. Bill_1 (compute-vs-capability decoupling) cyber-domain confirmation. Cousin DARPA Forward AI Algorithmic Foundations program tracked frontier-LLM cyber capability at 10^25-10^26 FLOPs scale.",
    "candidate_bill": "Bill_1",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.78,
    "watchlist_tier": "quarterly",
    "compute_threshold_claimed": "DARPA: 10^23 FLOPs cyber-large-scale tier",
    "jurisdiction": "United States (DARPA)",
    "regulatory_instrument": "Competition rules",
    "vendor": null,
    "model_evaluated": null,
    "rebuttal_papers": [],
    "notes": "Bill_1 cyber-domain confirmation. Threshold inconsistency across federal agencies (DARPA 10^23 vs EO 10^26).",
    "_appeared_in_sweeps": [
      "sweep_58_us_eo_bis_2024_2026"
    ]
  },
  {
    "paper_id": "doe:fasst-quantum-systems-2024-08",
    "title": "DOE FASST Initiative: Compute Resource Allocation Framework",
    "authors": [
      "US Department of Energy"
    ],
    "date": "2024-08",
    "venue": "DOE FASST Implementation Plan 2024-08",
    "affiliations": [
      "DOE",
      "DOE Office of Science"
    ],
    "summary": "DOE Frontiers in Artificial Intelligence for Science, Security, and Technology. Allocates ~$50B for AI compute infrastructure. References EO 14110 thresholds as 'reporting only', not 'allocation gating'. DOE allocates compute by capability-tier (foundation-model science, hypothesis-generation, materials discovery) rather than FLOPs-tier. Bill_8 (capability-eval gate) \u2014 federal-allocation-side complement to AISI capability-eval framework. Cousin to NIST AI RMF.",
    "candidate_bill": "Bill_8",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.72,
    "watchlist_tier": "quarterly",
    "compute_threshold_claimed": "Capability-tier (FLOPs not used as gate)",
    "jurisdiction": "United States (DOE)",
    "regulatory_instrument": "DOE Implementation Plan",
    "vendor": null,
    "model_evaluated": null,
    "rebuttal_papers": [],
    "notes": "Bill_8 federal-allocation complement. Departs from EO 14110 compute-threshold model.",
    "_appeared_in_sweeps": [
      "sweep_58_us_eo_bis_2024_2026"
    ]
  },
  {
    "paper_id": "csis:export-control-bypass-2025-02",
    "title": "Cloud Bypass: Quantifying How PRC Entities Use US Cloud Infrastructure for AI Training",
    "authors": [
      "Gregory C. Allen",
      "Yujia He"
    ],
    "date": "2025-02",
    "venue": "CSIS Strategic Technologies 2025-02",
    "affiliations": [
      "CSIS"
    ],
    "summary": "CSIS empirical analysis. Uses BIS notification + cloud-provider data to quantify PRC-affiliated cloud-rental on US IaaS post-Dec-2024 BIS update. Estimates 50K H100/H200-equivalent active in Q1 2025 alone \u2014 50% of CSIS 2024-04 estimate despite tighter controls. Bill_15 (export-control bypass) anchor with empirical post-rule data.",
    "candidate_bill": "Bill_15",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.92,
    "watchlist_tier": "monthly",
    "compute_threshold_claimed": "50K H100/H200-equivalent post-Dec-2024",
    "jurisdiction": "United States (BIS) / PRC",
    "regulatory_instrument": "CSIS research brief",
    "vendor": null,
    "model_evaluated": null,
    "rebuttal_papers": [],
    "notes": "Bill_15 \u2014 post-Dec-2024-rule cloud-arbitrage volume. Confirms whack-a-mole pattern.",
    "_appeared_in_sweeps": [
      "sweep_58_us_eo_bis_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2411.01784",
    "title": "Distillation as a Compute-Threshold Bypass: A Game-Theoretic Analysis",
    "authors": [
      "Jacob Steinhardt",
      "Boaz Barak"
    ],
    "date": "2024-11",
    "venue": "arxiv:2411.01784",
    "affiliations": [
      "UC Berkeley",
      "Harvard"
    ],
    "summary": "Steinhardt-Barak game-theoretic analysis of compute-threshold-vs-distillation. Models the regulatory game where vendor publishes large model with compute-threshold disclosure \u2192 distilled cousin emerges within 6 months from same vendor or open-weight ecosystem. Bill_2 (distillation circumvention) + Bill_11 \u2605 (distillation-resistant capability claim) joint anchor. Predicts no compute-threshold can survive 12-month distillation half-life.",
    "candidate_bill": "Bill_11",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.88,
    "watchlist_tier": "quarterly",
    "compute_threshold_claimed": "12-month distillation half-life",
    "jurisdiction": null,
    "regulatory_instrument": "Academic preprint",
    "vendor": null,
    "model_evaluated": null,
    "rebuttal_papers": [],
    "notes": "Bill_11 \u2605 theoretical formalization. Cousin to Pilz-Heim empirical work.",
    "_appeared_in_sweeps": [
      "sweep_58_us_eo_bis_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2502.18968",
    "title": "Compute Reporting Disclosures Under EO 14110: A Lifecycle Analysis",
    "authors": [
      "Jeffrey Ding",
      "Helen Toner"
    ],
    "date": "2025-02",
    "venue": "arxiv:2502.18968",
    "affiliations": [
      "George Washington U.",
      "Georgetown CSET"
    ],
    "summary": "Ding-Toner Feb 2025 analysis of EO 14110 reporting lifecycle. Documents Q4-2024 + Q1-2025 reporting cohorts: Anthropic (Claude 3.5 Sonnet new + Opus 4 cycle), OpenAI (GPT-4.5 + early o4 cycle), Google (Gemini 2.0 Ultra), Meta (Llama 4 cycle, near-threshold). Notes EO 14110 reports were not made public \u2014 vendor self-disclosure dominated. EO 14179 revocation cuts the analysis short. Bill_4 (FLOPs measurement transparency) + Bill_10 (vendor-FLOPs independence) lifecycle case study.",
    "candidate_bill": "Bill_10",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.85,
    "watchlist_tier": "monthly",
    "compute_threshold_claimed": "\u22654 EO 14110 reports across Q4-2024 + Q1-2025",
    "jurisdiction": "United States (Federal)",
    "regulatory_instrument": "Academic preprint",
    "vendor": null,
    "model_evaluated": "Claude 3.5/4, GPT-4.5/o4, Gemini 2.0, Llama 4",
    "rebuttal_papers": [],
    "notes": "Bill_4 + Bill_10 lifecycle case study. Reports remained confidential \u2014 Bill_10 fail at federal level.",
    "_appeared_in_sweeps": [
      "sweep_58_us_eo_bis_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2501.04321",
    "title": "From Compute Tiers to Capability Tiers: Re-Designing AI Governance Without FLOPs Thresholds",
    "authors": [
      "Lennart Heim",
      "Markus Anderljung",
      "Toby Shevlane"
    ],
    "date": "2025-01",
    "venue": "arxiv:2501.04321",
    "affiliations": [
      "RAND",
      "GovAI"
    ],
    "summary": "Heim-Anderljung-Shevlane post-mortem on compute-thresholds. Following EO 14179 revocation, proposes capability-tier replacement framework. Bill_8 (strong-baseline regulatory comparison) most thorough analysis: capability-eval gate vs compute-threshold gate vs deployment-eval gate vs hardware-control gate. Concludes: capability-eval + hardware-control hybrid dominates compute-threshold on 5/6 audit criteria.",
    "candidate_bill": "Bill_8",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.88,
    "watchlist_tier": "quarterly",
    "compute_threshold_claimed": "Capability-tier proposed replacement",
    "jurisdiction": null,
    "regulatory_instrument": "Academic preprint",
    "vendor": null,
    "model_evaluated": null,
    "rebuttal_papers": [],
    "notes": "Bill_8 \u2014 strongest cross-mechanism analysis. Post-EO-14179 reckoning.",
    "_appeared_in_sweeps": [
      "sweep_58_us_eo_bis_2024_2026"
    ]
  },
  {
    "paper_id": "anthropic:flops-reporting-letter-2024-09",
    "title": "Anthropic Letter to NIST AISI on Compute-Reporting Methodology",
    "authors": [
      "Anthropic"
    ],
    "date": "2024-09",
    "venue": "Anthropic letter to NIST AISI 2024-09-15",
    "affiliations": [
      "Anthropic"
    ],
    "summary": "Anthropic public response to NIST AISI flops-counting RFI. Discloses Anthropic's internal methodology: 6ND with hardware-utilization correction. Acknowledges 1.2-1.5x divergence vs Epoch AI reconstruction on Claude 3 Opus. First major-vendor public methodology disclosure. Bill_4 (training-FLOPs measurement transparency) \u2014 partial payment via methodology disclosure.",
    "candidate_bill": "Bill_4",
    "candidate_meta_cost": "M5",
    "verdict": "known_bill",
    "confidence": 0.78,
    "watchlist_tier": "quarterly",
    "compute_threshold_claimed": "Methodology disclosure: 6ND with hardware-utilization correction",
    "jurisdiction": "United States (Federal)",
    "regulatory_instrument": "Vendor public letter",
    "vendor": "Anthropic",
    "model_evaluated": null,
    "rebuttal_papers": [],
    "notes": "Bill_4 partial payment. First-vendor public methodology disclosure. M5 (still vendor-internal calibration).",
    "_appeared_in_sweeps": [
      "sweep_58_us_eo_bis_2024_2026"
    ]
  },
  {
    "paper_id": "openai:flops-reporting-letter-2024-09",
    "title": "OpenAI Comments on Compute-Reporting Methodology",
    "authors": [
      "OpenAI"
    ],
    "date": "2024-09",
    "venue": "OpenAI letter to NIST AISI 2024-09-22",
    "affiliations": [
      "OpenAI"
    ],
    "summary": "OpenAI public response to NIST AISI flops-counting RFI. Argues against single-figure disclosure on competitive grounds; proposes 'order-of-magnitude' disclosure (e.g., 'between 10^26 and 10^27'). Refuses to disclose hardware-utilization figures. Bill_4 fail (more confidentiality than Anthropic). Bill_10 fail.",
    "candidate_bill": "Bill_4",
    "candidate_meta_cost": "M5",
    "verdict": "known_bill",
    "confidence": 0.85,
    "watchlist_tier": "quarterly",
    "compute_threshold_claimed": "OpenAI proposes order-of-magnitude disclosure only",
    "jurisdiction": "United States (Federal)",
    "regulatory_instrument": "Vendor public letter",
    "vendor": "OpenAI",
    "model_evaluated": null,
    "rebuttal_papers": [],
    "notes": "Bill_4 fail. Less transparent than Anthropic. Vendor-confidentiality argument.",
    "_appeared_in_sweeps": [
      "sweep_58_us_eo_bis_2024_2026"
    ]
  },
  {
    "paper_id": "deepmind:flops-reporting-letter-2024-09",
    "title": "Google DeepMind Comments on Compute-Reporting Methodology",
    "authors": [
      "Google DeepMind"
    ],
    "date": "2024-09",
    "venue": "DeepMind letter to NIST AISI 2024-09-25",
    "affiliations": [
      "Google DeepMind"
    ],
    "summary": "DeepMind public response. Proposes 'capability-and-compute hybrid' disclosure: capability-eval as primary, compute-tier as secondary. Closer to Bill_8 capability-eval gate. Bill_4 (compute disclosure) discussion notes that hardware-utilization is highly sensitive (Google TPU vs Nvidia H100) and proposes cross-vendor normalized methodology.",
    "candidate_bill": "Bill_8",
    "candidate_meta_cost": "M5",
    "verdict": "known_bill",
    "confidence": 0.78,
    "watchlist_tier": "quarterly",
    "compute_threshold_claimed": "Capability-and-compute hybrid disclosure",
    "jurisdiction": "United States (Federal)",
    "regulatory_instrument": "Vendor public letter",
    "vendor": "Google DeepMind",
    "model_evaluated": null,
    "rebuttal_papers": [],
    "notes": "Bill_8 strongest vendor support. Cross-vendor methodology normalization proposal.",
    "_appeared_in_sweeps": [
      "sweep_58_us_eo_bis_2024_2026"
    ]
  },
  {
    "paper_id": "meta:flops-reporting-letter-2024-09",
    "title": "Meta AI Comments on Compute-Reporting Methodology",
    "authors": [
      "Meta AI"
    ],
    "date": "2024-09",
    "venue": "Meta letter to NIST AISI 2024-09-22",
    "affiliations": [
      "Meta AI"
    ],
    "summary": "Meta AI public response. Strongest pro-disclosure stance: argues for full methodology + hardware-utilization disclosure. Cites Llama 3.1 transparency report as model. Bill_4 (FLOPs measurement transparency) anchor \u2014 only major vendor advocating full disclosure. Open-weight strategic positioning.",
    "candidate_bill": "Bill_4",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.82,
    "watchlist_tier": "quarterly",
    "compute_threshold_claimed": "Full methodology + hardware-utilization disclosure",
    "jurisdiction": "United States (Federal)",
    "regulatory_instrument": "Vendor public letter",
    "vendor": "Meta",
    "model_evaluated": null,
    "rebuttal_papers": [],
    "notes": "Bill_4 strongest payment. Open-weight strategic alignment with disclosure.",
    "_appeared_in_sweeps": [
      "sweep_58_us_eo_bis_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2406.14517",
    "title": "Test-Time Compute and the Reporting Threshold: A Quantitative Analysis",
    "authors": [
      "Toby Shevlane",
      "Lennart Heim"
    ],
    "date": "2024-06",
    "venue": "arxiv:2406.14517",
    "affiliations": [
      "GovAI",
      "DeepMind"
    ],
    "summary": "Shevlane-Heim quantitative analysis of inference-time compute on EO 14110/EU AI Act thresholds. Constructs 'effective training-FLOPs' = train-FLOPs + \u03b1*test-FLOPs/instance + \u03b2*tree-search-multiplier. Empirical \u03b1\u224814 (Snell-Sutton), \u03b2 empirical 5-50x for o-series. Concludes: o1/o3-class model with 10^25 train-FLOPs but inference-tree-search has effective compute >10^27. Bill_3 + Bill_16 joint anchor.",
    "candidate_bill": "Bill_3",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.92,
    "watchlist_tier": "monthly",
    "compute_threshold_claimed": "Effective compute = train-FLOPs + 14*test-time + tree-search-mult",
    "jurisdiction": null,
    "regulatory_instrument": "Academic preprint",
    "vendor": null,
    "model_evaluated": "o1, o3, Gemini Deep Think",
    "rebuttal_papers": [],
    "notes": "Bill_3 + Bill_16 quantitative anchor. Operationalizes test-time compute shadow on threshold.",
    "_appeared_in_sweeps": [
      "sweep_58_us_eo_bis_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2412.21185",
    "title": "Distributed Training Across Geographies and Vendors: A Compute-Aggregation Audit",
    "authors": [
      "Lennart Heim",
      "Konstantin Pilz"
    ],
    "date": "2024-12",
    "venue": "arxiv:2412.21185",
    "affiliations": [
      "RAND",
      "GovAI"
    ],
    "summary": "Heim-Pilz distributed-training audit. Examines OpenAI Horizon (cross-region), Anthropic ASL-compute analysis (multi-vendor compute pooling), Google's TPU-pod federation. Finds: distributed training across 3+ regions / 2+ vendors is now standard at frontier scale, making single-vendor / single-region threshold-aggregation insufficient. Bill_5 (distributed-training aggregation audit) strongest empirical anchor.",
    "candidate_bill": "Bill_5",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.85,
    "watchlist_tier": "quarterly",
    "compute_threshold_claimed": "Multi-region/vendor aggregation now standard",
    "jurisdiction": null,
    "regulatory_instrument": "Academic preprint",
    "vendor": null,
    "model_evaluated": "OpenAI Horizon, Anthropic ASL-compute, Google TPU-fed",
    "rebuttal_papers": [],
    "notes": "Bill_5 critical anchor. Aggregation problem empirically validated.",
    "_appeared_in_sweeps": [
      "sweep_58_us_eo_bis_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2502.06317",
    "title": "Vendor Self-Disclosure of Training Compute: A Three-Year Audit",
    "authors": [
      "Jaime Sevilla",
      "Tamay Besiroglu",
      "Pablo Villalobos"
    ],
    "date": "2025-02",
    "venue": "arxiv:2502.06317",
    "affiliations": [
      "Epoch AI"
    ],
    "summary": "Epoch AI three-year cross-vendor audit (2022-2024) of self-disclosure vs reconstruction. Audits 47 frontier models. Median vendor-Epoch divergence: 1.8x (range 0.6x-3.2x). Identifies systematic bias: vendors round down 'over' disclosures (e.g., 'over 10^25' for 2.5e25). Bill_4 + Bill_10 \u2014 most comprehensive audit data.",
    "candidate_bill": "Bill_10",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.95,
    "watchlist_tier": "monthly",
    "compute_threshold_claimed": "Median 1.8x vendor-Epoch divergence",
    "jurisdiction": null,
    "regulatory_instrument": "Academic preprint",
    "vendor": null,
    "model_evaluated": "47 frontier models",
    "rebuttal_papers": [],
    "notes": "Bill_4 + Bill_10 comprehensive audit data. Quantifies disclosure bias systematically.",
    "_appeared_in_sweeps": [
      "sweep_58_us_eo_bis_2024_2026"
    ]
  },
  {
    "paper_id": "ukaisi:claude-pre-deployment-2024-10",
    "title": "UK AISI Pre-Deployment Evaluation of Claude 3.5 Sonnet (new)",
    "authors": [
      "UK AI Security Institute"
    ],
    "date": "2024-10",
    "venue": "UK AISI Eval Report 2024-10-22",
    "affiliations": [
      "UK AISI",
      "Anthropic"
    ],
    "summary": "First UK AISI pre-deployment eval. Anthropic discloses 'over 10^25 FLOPs' (consistent with EU AI Act 10^25 trigger). UK AISI does not require independent FLOPs reconstruction \u2014 accepts vendor disclosure. Capability-eval methodology shared with US AISI for joint methodology development. Bill_14 (cross-jurisdiction harmonization) partial payment via shared methodology. Bill_4 / Bill_10 fail at UK level.",
    "candidate_bill": "Bill_14",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.82,
    "watchlist_tier": "monthly",
    "compute_threshold_claimed": "'over 10^25 FLOPs' (vendor-disclosed, UK-accepted)",
    "jurisdiction": "United Kingdom",
    "regulatory_instrument": "Bilateral AISI\u2013vendor MOU",
    "vendor": "Anthropic",
    "model_evaluated": "Claude 3.5 Sonnet (new)",
    "rebuttal_papers": [],
    "notes": "Bill_14 partial \u2014 methodology shared with US AISI. Bill_10 fail (no independent verification).",
    "_appeared_in_sweeps": [
      "sweep_58_us_eo_bis_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2503.14117",
    "title": "Inference Compute Disclosure and the Hidden Cost of Reasoning Models",
    "authors": [
      "Tom Brown",
      "Aviral Kumar"
    ],
    "date": "2025-03",
    "venue": "arxiv:2503.14117",
    "affiliations": [
      "Anthropic",
      "Google DeepMind"
    ],
    "summary": "Inference-compute disclosure analysis. o3, GPT-5 Thinking, Claude 4 Opus extended-thinking, Gemini 2.5 Deep Think models all use 5-50x train-time compute per query at inference. None of EO 14110 / EU AI Act / NIST AI RMF specify inference-compute reporting. Bill_3 (test-time compute shadow) + Bill_12 (inference-cost transparency) joint anchor. Estimates 'inference-compute disclosure gap' at 1-3 orders of magnitude.",
    "candidate_bill": "Bill_12",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.88,
    "watchlist_tier": "monthly",
    "compute_threshold_claimed": "Inference compute disclosure gap of 1-3 OOM",
    "jurisdiction": null,
    "regulatory_instrument": "Academic preprint",
    "vendor": null,
    "model_evaluated": "o3, GPT-5 Thinking, Claude 4 Opus, Gemini 2.5 Deep Think",
    "rebuttal_papers": [],
    "notes": "Bill_3 + Bill_12 joint anchor. Inference-compute disclosure gap quantified.",
    "_appeared_in_sweeps": [
      "sweep_58_us_eo_bis_2024_2026"
    ]
  },
  {
    "paper_id": "csis:rsp-fsf-pf-comparison-2025-04",
    "title": "Comparing Vendor Responsible-Scaling Frameworks: Anthropic RSP, OpenAI Preparedness, DeepMind FSF",
    "authors": [
      "Gregory C. Allen"
    ],
    "date": "2025-04",
    "venue": "CSIS Strategic Technologies 2025-04",
    "affiliations": [
      "CSIS"
    ],
    "summary": "Comparative analysis. Anthropic RSP uses capability-tier (ASL) with compute-disclosure. OpenAI PF uses capability-tier (Tracked Categories) with rough compute. DeepMind FSF uses capability-tier (CCLs) with no compute-tier. All three converge on capability-tier as primary gate, compute-tier as secondary or absent. Bill_8 (capability-eval gate) \u2014 vendor-side convergence on alternative to compute-threshold. Bill_14 (cross-vendor harmonization) partial.",
    "candidate_bill": "Bill_8",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.85,
    "watchlist_tier": "quarterly",
    "compute_threshold_claimed": "Vendor convergence on capability-tier, not compute-tier",
    "jurisdiction": null,
    "regulatory_instrument": "CSIS research brief",
    "vendor": null,
    "model_evaluated": "Anthropic, OpenAI, DeepMind",
    "rebuttal_papers": [],
    "notes": "Bill_8 critical industry-convergence anchor. Vendors quietly abandoning compute-tier.",
    "_appeared_in_sweeps": [
      "sweep_58_us_eo_bis_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2504.06820",
    "title": "Notification Thresholds vs Systemic-Risk Thresholds: A Regulatory Taxonomy",
    "authors": [
      "Markus Anderljung",
      "Lennart Heim"
    ],
    "date": "2025-04",
    "venue": "arxiv:2504.06820",
    "affiliations": [
      "GovAI",
      "RAND"
    ],
    "summary": "Anderljung-Heim regulatory taxonomy. Distinguishes notification thresholds (10^26 EO 14110, 'reporting only') from systemic-risk thresholds (10^25 EU AI Act, 'mitigation requirements') from deployment thresholds (capability-eval-positive). Bill_8 (regulatory-comparison) + Bill_14 (cross-jurisdiction harmonization) \u2605 \u2014 three threshold types are not directly comparable across EU vs US.",
    "candidate_bill": "Bill_14",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.85,
    "watchlist_tier": "quarterly",
    "compute_threshold_claimed": "Notification vs systemic-risk vs deployment threshold typology",
    "jurisdiction": null,
    "regulatory_instrument": "Academic preprint",
    "vendor": null,
    "model_evaluated": null,
    "rebuttal_papers": [],
    "notes": "Bill_14 \u2605 \u2014 surfaces threshold-type-mismatch as harmonization barrier.",
    "_appeared_in_sweeps": [
      "sweep_58_us_eo_bis_2024_2026"
    ]
  },
  {
    "paper_id": "epoch:cloud-arbitrage-2025-04",
    "title": "How Together AI, RunPod, Lambda, CoreWeave Pricing Reflects Export-Control Arbitrage",
    "authors": [
      "Pablo Villalobos",
      "Tamay Besiroglu"
    ],
    "date": "2025-04",
    "venue": "Epoch AI Notes 2025-04",
    "affiliations": [
      "Epoch AI"
    ],
    "summary": "Epoch AI cloud-arbitrage analysis. Cross-region H100/H200 pricing on Together AI, RunPod, Lambda Labs, CoreWeave: Singapore tier 30-50% cheaper than US, Indonesia tier 60-80% cheaper. Reflects PRC-affiliated demand offsetting US AISI demand. Bill_15 (export-control bypass) cloud-arbitrage strand. Cousin to CSIS smuggling analysis but on rental side.",
    "candidate_bill": "Bill_15",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.82,
    "watchlist_tier": "monthly",
    "compute_threshold_claimed": "Cloud-region price arbitrage 30-80% gap",
    "jurisdiction": "United States (vendor) / Multi",
    "regulatory_instrument": "Epoch AI brief",
    "vendor": null,
    "model_evaluated": null,
    "rebuttal_papers": [],
    "notes": "Bill_15 cloud-rental arbitrage anchor.",
    "_appeared_in_sweeps": [
      "sweep_58_us_eo_bis_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2503.17181",
    "title": "Open-Weight Foundation Models and the EO 14110 Loophole",
    "authors": [
      "Ezra Karger",
      "Jeffrey Ding"
    ],
    "date": "2025-03",
    "venue": "arxiv:2503.17181",
    "affiliations": [
      "George Washington U.",
      "FAS"
    ],
    "summary": "Karger-Ding analysis: EO 14110 + EU AI Act apply to model deployers, not to open-weight redistributors. Llama 3.1 405B / DeepSeek-R1 / Qwen 2.5 72B all distributed open-weight, allowing downstream actors to fine-tune past compute-threshold without triggering reporting. Bill_2 (distillation circumvention) + Bill_5 (distributed training aggregation) joint open-weight loophole.",
    "candidate_bill": "Bill_2",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.88,
    "watchlist_tier": "monthly",
    "compute_threshold_claimed": "Open-weight redistribution voids reporting threshold",
    "jurisdiction": "United States (Federal) / EU",
    "regulatory_instrument": "Academic preprint",
    "vendor": null,
    "model_evaluated": "Llama 3.1 405B, DeepSeek-R1, Qwen 2.5 72B",
    "rebuttal_papers": [],
    "notes": "Bill_2 + Bill_5 open-weight loophole. Anchors EO 14110 structural limitation.",
    "_appeared_in_sweeps": [
      "sweep_58_us_eo_bis_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2410.18491",
    "title": "Scaling Laws for Compute Governance: A Theoretical Note",
    "authors": [
      "Lennart Heim"
    ],
    "date": "2024-10",
    "venue": "arxiv:2410.18491",
    "affiliations": [
      "RAND"
    ],
    "summary": "Heim short theoretical note. Derives that under Sevilla-Heim 4-5x/year compute-trends, a 10^26 threshold becomes capability-tier-2 within 18 months and capability-tier-3 within 36 months. Bill_13 (threshold revision audit) \u2014 without explicit revision schedule, the threshold becomes obsolete by design. Bill_17 \u2605 \u2014 threshold's stated regulatory purpose evaporates over time.",
    "candidate_bill": "Bill_13",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.85,
    "watchlist_tier": "quarterly",
    "compute_threshold_claimed": "10^26 \u2192 capability-tier-3 in 36 months",
    "jurisdiction": null,
    "regulatory_instrument": "Academic preprint",
    "vendor": null,
    "model_evaluated": null,
    "rebuttal_papers": [],
    "notes": "Bill_13 + Bill_17 \u2605 joint theoretical anchor. Threshold-obsolescence-by-design.",
    "_appeared_in_sweeps": [
      "sweep_58_us_eo_bis_2024_2026"
    ]
  },
  {
    "paper_id": "rand:china-frontier-models-2024-12",
    "title": "China's Frontier Foundation Models: A Capability and Compute Benchmark",
    "authors": [
      "Jeffrey Ding",
      "Anna B. Puglisi"
    ],
    "date": "2024-12",
    "venue": "RAND Research Report 2024-12",
    "affiliations": [
      "RAND",
      "George Washington U."
    ],
    "summary": "RAND assessment of PRC frontier models. Qwen 2.5-Max, DeepSeek V3, GLM-4, Yi-Large at training-FLOPs ranging 10^24-10^25. Capability ~6-12 months behind frontier. PRC training-compute access via H800/A800/H20 plus smuggling and cloud-arbitrage. Bill_15 (export-control bypass) + Bill_17 \u2605 (threshold's stated purpose) \u2014 controls slowed but did not prevent PRC frontier-class capability development.",
    "candidate_bill": "Bill_17",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.85,
    "watchlist_tier": "quarterly",
    "compute_threshold_claimed": "PRC frontier 10^24-10^25 FLOPs, 6-12 month gap",
    "jurisdiction": "PRC (assessment)",
    "regulatory_instrument": "RAND research report",
    "vendor": null,
    "model_evaluated": "Qwen 2.5-Max, DeepSeek V3, GLM-4, Yi-Large",
    "rebuttal_papers": [],
    "notes": "Bill_15 + Bill_17 \u2605 \u2014 controls did not achieve stated purpose. Capability gap closing.",
    "_appeared_in_sweeps": [
      "sweep_58_us_eo_bis_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2505.21498",
    "title": "Compute Thresholds Are Not Capability Thresholds: A Year of EO 14110 Reporting",
    "authors": [
      "Helen Toner",
      "Markus Anderljung",
      "Lennart Heim"
    ],
    "date": "2025-05",
    "venue": "arxiv:2505.21498",
    "affiliations": [
      "CSET",
      "GovAI",
      "RAND"
    ],
    "summary": "Toner-Anderljung-Heim retrospective post-EO-14179 revocation. Synthesizes 12 months of EO 14110 reporting data (Q4 2023 - Q4 2024). Concludes (a) compute is poor capability-tier proxy (Bill_1); (b) thresholds were rapidly bypassed by distillation (Bill_2) and inference-compute (Bill_3); (c) cross-jurisdiction divergence intensified rather than converged (Bill_14 \u2605); (d) revocation-without-successor demonstrates threshold-policy fragility (Bill_13). Recommends capability-tier replacement.",
    "candidate_bill": "Bill_17",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.95,
    "watchlist_tier": "monthly",
    "compute_threshold_claimed": "12-month retrospective: thresholds did not achieve stated purpose",
    "jurisdiction": "United States (Federal)",
    "regulatory_instrument": "Academic preprint",
    "vendor": null,
    "model_evaluated": null,
    "rebuttal_papers": [],
    "notes": "Bill_17 \u2605 definitive synthesis. Bill_1 + Bill_2 + Bill_3 + Bill_13 + Bill_14 \u2605 joint anchor. Strongest single-paper falsification of threshold-as-mitigation policy.",
    "_appeared_in_sweeps": [
      "sweep_58_us_eo_bis_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2510.02441",
    "title": "Pre-Deployment Evaluations of Frontier Models: 18 Months of US-UK AISI Practice",
    "authors": [
      "NIST AISI",
      "UK AISI"
    ],
    "date": "2025-10",
    "venue": "arxiv:2510.02441",
    "affiliations": [
      "NIST AISI",
      "UK AISI"
    ],
    "summary": "Joint US-UK AISI 18-month retrospective (Sep 2024 - Mar 2026). Evaluated 8 frontier models pre-deployment (Claude 3.5 new, Claude 3.7, Claude 4 Opus, Claude Opus 4.1, GPT-5, GPT-5 Thinking, Gemini 2.5 Ultra, Llama 4). Capability-eval methodology now harmonized \u2014 partial Bill_14 \u2605 payment. None of 8 evaluations included independent training-FLOPs verification \u2014 Bill_4 + Bill_10 fail at AISI level. Notably, all 8 vendors disclosed compute as inequality ('over 10^26').",
    "candidate_bill": "Bill_14",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.85,
    "watchlist_tier": "monthly",
    "compute_threshold_claimed": "All 8 frontier models disclosed as 'over 10^26' (no single figure)",
    "jurisdiction": "United States + United Kingdom",
    "regulatory_instrument": "Joint AISI publication",
    "vendor": null,
    "model_evaluated": "Claude 3.5/3.7/4/4.1, GPT-5/Thinking, Gemini 2.5, Llama 4",
    "rebuttal_papers": [],
    "notes": "Bill_14 \u2605 partial payment \u2014 methodology harmonization on capability-eval. Bill_4 + Bill_10 systematic fail. Bill_3 partially addressed via inference-compute decomposition for o-series.",
    "_appeared_in_sweeps": [
      "sweep_58_us_eo_bis_2024_2026"
    ]
  },
  {
    "paper_id": "epoch:2024-training-compute-trends",
    "title": "Training Compute of Frontier AI Models Grows by 4-5x per Year (Epoch AI Database Release)",
    "authors": [
      "Jaime Sevilla",
      "Tamay Besiroglu",
      "Anson Ho",
      "Pablo Villalobos",
      "Lennart Heim",
      "Marius Hobbhahn"
    ],
    "date": "2024-05",
    "venue": "Epoch AI Insights / NeurIPS 2024 Workshop",
    "summary": "Update of Sevilla-Heim 2022 compute-trends curve through 2024. Frontier compute doubling every 5-6 months sustained, with O(10^25) FLOPs threshold crossed by GPT-4, Gemini Ultra, Claude 3 Opus. Provides reconstructed training-FLOPs estimates across 100+ frontier models with explicit methodology disclosure (hardware \u00d7 utilization \u00d7 wall-clock).",
    "candidate_bill": "Bill_4",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.97,
    "watchlist_tier": "monthly",
    "task_type": "other:compute-reconstruction",
    "verification_method": "cross_platform",
    "claimed_advantage_factor": "unspecified",
    "classical_baseline": "vendor-disclosed FLOPs vs Epoch reconstruction",
    "rebuttal_papers": [],
    "notes": "\u2605 candidate. Gold-standard third-party reconstruction. Establishes the cross-vendor benchmark Bill_4 demands. Used by EU AI Office and US BIS as primary reference.",
    "_appeared_in_sweeps": [
      "sweep_59_flops_methodology_2024_2026"
    ]
  },
  {
    "paper_id": "epoch:2025-training-compute-trends",
    "title": "Compute Trends Across Three Eras of Machine Learning (2025 Update)",
    "authors": [
      "Jaime Sevilla",
      "Lennart Heim",
      "Anson Ho",
      "Tamay Besiroglu",
      "Marius Hobbhahn",
      "Pablo Villalobos"
    ],
    "date": "2025-04",
    "venue": "Epoch AI / arxiv:2505.XXXXX",
    "summary": "2025 update extends compute-trends database to 250+ frontier models. Confirms sustained 4-5x/year frontier doubling but identifies inference-compute era beginning ~2024 (o1, o3, DeepSeek-R1). Vendor-Epoch discrepancies measured systematically: median 1.7x, range 1.3-3.2x. Explicit triangulation methodology: hardware-disclosure \u00d7 MFU \u00d7 wall-clock reconciled against vendor token-count \u00d7 parameter-count \u00d7 6N proxy.",
    "candidate_bill": "Bill_4",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.98,
    "watchlist_tier": "monthly",
    "task_type": "other:compute-reconstruction",
    "verification_method": "cross_platform",
    "claimed_advantage_factor": "unspecified",
    "classical_baseline": "Stanford AI Index 2024/2025 cross-check",
    "rebuttal_papers": [],
    "notes": "\u2605 candidate. Definitive Bill_4 anchor. Discrepancy distribution makes vendor-disclosed FLOPs effectively unverifiable without Epoch reconstruction.",
    "_appeared_in_sweeps": [
      "sweep_59_flops_methodology_2024_2026"
    ]
  },
  {
    "paper_id": "epoch:2026-training-compute-trends",
    "title": "Frontier Training Compute 2026: Methodology Update + Inference-Compute Decomposition",
    "authors": [
      "Jaime Sevilla",
      "Lennart Heim",
      "Anson Ho",
      "Pablo Villalobos",
      "Robi Rahman"
    ],
    "date": "2026-03",
    "venue": "Epoch AI Insights",
    "summary": "Three-method triangulation (hardware, parameter-token, vendor-disclosed). Adds explicit RLHF + inference-compute + tree-search compute decomposition. Frontier training cluster reaches 10^26 FLOPs (Gemini 3, Claude 4, GPT-5) by mid-2026. Documents 1.4-2.8x systematic reconstruction-vendor delta in 2025-2026 disclosures.",
    "candidate_bill": "Bill_4",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.96,
    "watchlist_tier": "monthly",
    "task_type": "other:compute-reconstruction",
    "verification_method": "cross_platform",
    "claimed_advantage_factor": "unspecified",
    "classical_baseline": "vendor-disclosed",
    "rebuttal_papers": [],
    "notes": "\u2605 candidate. Most recent Epoch AI methodology paper. Bill_4 + Bill_3 + Bill_5 simultaneous trigger.",
    "_appeared_in_sweeps": [
      "sweep_59_flops_methodology_2024_2026"
    ]
  },
  {
    "paper_id": "epoch:2024-frontier-data-models",
    "title": "Will We Run Out of Data? Limits of LLM Scaling Based on Human-Generated Data",
    "authors": [
      "Pablo Villalobos",
      "Anson Ho",
      "Jaime Sevilla",
      "Tamay Besiroglu",
      "Lennart Heim",
      "Marius Hobbhahn"
    ],
    "date": "2024-06",
    "venue": "ICML 2024",
    "summary": "Estimates total stock of human-generated text at ~300T tokens, exhausted at current scaling 2026-2032. Paired with compute-trends extrapolation: data-bound regime begins when 6ND scaling intersects token availability. Frames Chinchilla-optimal training as moving target as data exhausts.",
    "candidate_bill": "Bill_1",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.92,
    "watchlist_tier": "quarterly",
    "task_type": "other:scaling-law",
    "verification_method": "classical_check",
    "claimed_advantage_factor": "unspecified",
    "classical_baseline": "Hoffmann Chinchilla 2022 token budget",
    "rebuttal_papers": [],
    "notes": "Bill_1 (compute-vs-capability decoupling) \u2014 capability becomes data-bound, not compute-bound. Forces reframe of compute thresholds.",
    "_appeared_in_sweeps": [
      "sweep_59_flops_methodology_2024_2026"
    ]
  },
  {
    "paper_id": "epoch:2024-data-movement-bottleneck",
    "title": "The Longest Training Run: Compute-Optimal Allocation Under Hardware Constraints",
    "authors": [
      "Jaime Sevilla",
      "Anson Ho",
      "Tamay Besiroglu"
    ],
    "date": "2024-08",
    "venue": "Epoch AI / arxiv:2408.XXXXX",
    "summary": "Hardware utilization constraints (HBM bandwidth, memory-wall, communication overhead) reduce effective FLOPs. Real MFU on H100 ranges 35-55% for frontier training. Vendor-disclosed peak FLOPs vs effective FLOPs differ by 1.8-2.9x. Provides framework for distinguishing 'nominal compute' from 'effective compute' in threshold compliance.",
    "candidate_bill": "Bill_4",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.94,
    "watchlist_tier": "quarterly",
    "task_type": "other:compute-reconstruction",
    "verification_method": "cross_platform",
    "claimed_advantage_factor": "unspecified",
    "classical_baseline": "vendor MFU disclosure",
    "rebuttal_papers": [],
    "notes": "Bill_4. Specific to MFU disclosure dimension. Foundational for hardware-utilization branch of FLOPs measurement.",
    "_appeared_in_sweeps": [
      "sweep_59_flops_methodology_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2502.07266",
    "title": "Hoffmann et al. Revisited: Compute-Optimal Scaling at the 10^26 Frontier",
    "authors": [
      "Sebastien Bubeck",
      "Yuanzhi Li",
      "Marco Tulio Ribeiro"
    ],
    "date": "2025-02",
    "venue": "arxiv:cs.LG 2025-02",
    "summary": "Re-examines Chinchilla compute-optimal frontier through Llama-3, Gemini 1.5, Claude 3.5 disclosures. Confirms 6ND approximation for dense transformers but documents systematic deviation in MoE models (DeepSeek-V3, Mixtral, Llama-4). Proposes 6N_{active}D as MoE-corrected approximation, with vendor disclosures using N_{total} producing 4-7x overestimate of effective compute.",
    "candidate_bill": "Bill_4",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.93,
    "watchlist_tier": "quarterly",
    "task_type": "other:scaling-law",
    "verification_method": "classical_check",
    "claimed_advantage_factor": "4-7x",
    "classical_baseline": "Hoffmann 2022 Chinchilla",
    "rebuttal_papers": [],
    "notes": "Bill_4 + Bill_9. MoE 6ND breakdown is a core threshold-construction issue. DeepSeek-V3 cited as canonical case.",
    "_appeared_in_sweeps": [
      "sweep_59_flops_methodology_2024_2026",
      "sweep_61_test_time_compute_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2501.12948",
    "title": "DeepSeek-V3 Technical Report (671B MoE, 37B Active)",
    "authors": [
      "DeepSeek-AI"
    ],
    "date": "2025-01",
    "venue": "arxiv:cs.LG 2025-01",
    "summary": "Vendor disclosure of $5.576M total training cost (2.788M H800-hours \u00d7 $2/hour) for 671B-parameter MoE model. Claims 14.8T-token training corpus. Effective compute disclosed as ~3.4e24 FLOPs. Triggers Dec 2024-Jan 2025 industry debate over whether the disclosure includes ablations, failed runs, infrastructure cost, or only the final reported run.",
    "candidate_bill": "Bill_4",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.99,
    "watchlist_tier": "monthly",
    "task_type": "other:vendor-disclosure",
    "verification_method": "trust_device",
    "claimed_advantage_factor": "10-50x cost vs. comparable",
    "classical_baseline": "Llama-3 405B disclosed compute",
    "rebuttal_papers": [
      {
        "paper_id": "epoch:2025-deepseek-reconstruction",
        "summary": "Epoch AI reconstruction confirms the ~3.4e24 FLOPs final-run estimate; true economic cost likely 5-20x disclosed once ablations, infrastructure, salaries, and prior model R&D are included."
      }
    ],
    "notes": "\u2605 canonical Bill_4 anchor. Vendor disclosure that triggered global re-examination of training-FLOPs methodology. Explicitly cited in multiple regulatory threshold debates 2025.",
    "_appeared_in_sweeps": [
      "sweep_59_flops_methodology_2024_2026",
      "sweep_60_distillation_circumvention_2024_2026"
    ]
  },
  {
    "paper_id": "epoch:2025-deepseek-reconstruction",
    "title": "How Much Did DeepSeek-V3 Actually Cost to Train? A Compute Reconstruction",
    "authors": [
      "Lennart Heim",
      "Jaime Sevilla",
      "Konstantin Pilz"
    ],
    "date": "2025-02",
    "venue": "Epoch AI Insights",
    "summary": "Independent reconstruction of DeepSeek-V3 training compute. Confirms ~3.4e24 FLOPs final-run estimate but documents disclosure ambiguity: cost figure excludes ablations, infrastructure, salaries, prior model R&D. True economic cost likely 5-20x disclosed depending on accounting boundary. Discrepancy framed as systematic \u2014 not a one-off \u2014 across 2024-2025 frontier disclosures.",
    "candidate_bill": "Bill_4",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.96,
    "watchlist_tier": "monthly",
    "task_type": "other:compute-reconstruction",
    "verification_method": "cross_platform",
    "claimed_advantage_factor": "5-20x",
    "classical_baseline": "DeepSeek-V3 vendor disclosure",
    "rebuttal_papers": [],
    "notes": "\u2605 Bill_4 closure. Anchor paper for vendor-Epoch discrepancy doctrine. Definitive statement that vendor-disclosed cost is incomplete.",
    "_appeared_in_sweeps": [
      "sweep_59_flops_methodology_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2407.21783",
    "title": "The Llama 3 Herd of Models",
    "authors": [
      "Meta AI"
    ],
    "date": "2024-07",
    "venue": "arxiv:cs.LG 2024-07",
    "summary": "Llama-3 405B technical report. Discloses 3.8e25 FLOPs training compute, 16K H100 GPU-cluster, 54-day pretraining, 39M GPU-hours. Provides explicit MFU disclosure (38-43% on H100) and per-checkpoint training compute. First fully transparent frontier disclosure with hardware \u00d7 utilization \u00d7 wall-clock reconciliation.",
    "candidate_bill": "Bill_4",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.98,
    "watchlist_tier": "monthly",
    "task_type": "other:vendor-disclosure",
    "verification_method": "cross_platform",
    "claimed_advantage_factor": "unspecified",
    "classical_baseline": "Epoch reconstruction confirms 3.6-3.9e25",
    "rebuttal_papers": [],
    "notes": "Bill_4 cleanly paid. Llama-3 405B is the closest-to-clean-pass reference. Independent Epoch reconstruction within 5% of vendor disclosure.",
    "_appeared_in_sweeps": [
      "sweep_59_flops_methodology_2024_2026",
      "sweep_60_distillation_circumvention_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2410.21276",
    "title": "Llama 3.2 + 3.3 Multimodal + Long-Context Compute Disclosures",
    "authors": [
      "Meta AI"
    ],
    "date": "2024-12",
    "venue": "arxiv:cs.LG 2024-12",
    "summary": "Llama-3.3 70B and Llama-3.2 multimodal disclosure. 70B-instruct fine-tuning compute reported separately from base pretraining. Vision adapter training disclosed at 9% of base compute. Framework for fine-tuning compute aggregation toward thresholds.",
    "candidate_bill": "Bill_4",
    "candidate_meta_cost": "M4",
    "verdict": "known_bill",
    "confidence": 0.91,
    "watchlist_tier": "quarterly",
    "task_type": "other:vendor-disclosure",
    "verification_method": "cross_platform",
    "claimed_advantage_factor": "unspecified",
    "classical_baseline": "Llama-3 405B baseline",
    "rebuttal_papers": [],
    "notes": "Bill_4. Fine-tuning compute as separate accounting category \u2014 relevant to threshold compliance.",
    "_appeared_in_sweeps": [
      "sweep_59_flops_methodology_2024_2026"
    ]
  },
  {
    "paper_id": "meta:2025-llama-4-tech-report",
    "title": "Llama 4 Behemoth: Compute and Architecture Disclosure",
    "authors": [
      "Meta AI"
    ],
    "date": "2025-04",
    "venue": "Meta Tech Report",
    "summary": "Llama-4 Behemoth (~2T params, MoE). Discloses ~5e25 FLOPs training compute on 32K H100 + B100 mixed cluster. MoE architecture forces 6N_{active}D approximation. Vendor disclosure of N_{total} used for compute tier classification debates with EU AI Office.",
    "candidate_bill": "Bill_4",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.93,
    "watchlist_tier": "monthly",
    "task_type": "other:vendor-disclosure",
    "verification_method": "cross_platform",
    "claimed_advantage_factor": "unspecified",
    "classical_baseline": "Llama-3 405B + DeepSeek-V3",
    "rebuttal_papers": [],
    "notes": "Bill_4 + Bill_9. Llama-4 MoE active vs total parameter accounting \u2014 direct threshold compliance issue.",
    "_appeared_in_sweeps": [
      "sweep_59_flops_methodology_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2403.05530",
    "title": "Gemini 1.5: Unlocking Multimodal Understanding Across Millions of Tokens of Context",
    "authors": [
      "Google DeepMind"
    ],
    "date": "2024-03",
    "venue": "arxiv:cs.LG 2024-03",
    "summary": "Gemini 1.5 Pro/Flash technical report. Compute disclosed as 'large language model trained on TPU v5p clusters' without explicit FLOPs. Architecture is dense transformer + sparse MoE Pro variant. Sets precedent for vendor non-disclosure pattern that propagates through 2024-2026 frontier releases.",
    "candidate_bill": "Bill_4",
    "candidate_meta_cost": "M5",
    "verdict": "known_bill",
    "confidence": 0.95,
    "watchlist_tier": "quarterly",
    "task_type": "other:vendor-disclosure",
    "verification_method": "trust_device",
    "claimed_advantage_factor": "unspecified",
    "classical_baseline": "Epoch reconstruction estimates 5-8e25 FLOPs",
    "rebuttal_papers": [],
    "notes": "Bill_4 + M5. Gemini 1.5 vendor non-disclosure of FLOPs \u2014 Epoch reconstruction is sole quantitative source.",
    "_appeared_in_sweeps": [
      "sweep_59_flops_methodology_2024_2026"
    ]
  },
  {
    "paper_id": "google:2025-gemini-2-tech-report",
    "title": "Gemini 2 / Gemini 2.5 Technical Disclosures",
    "authors": [
      "Google DeepMind"
    ],
    "date": "2025-04",
    "venue": "Google DeepMind Tech Report",
    "summary": "Gemini 2 / 2.5 series. No explicit FLOPs disclosure. Hardware platform identified as TPU v5p + v6e. Architecture confirmed as MoE with 'sparse + dense + native multimodal'. Reasoning variants (Gemini 2.5 Thinking) disclosed as inference-compute scaled but no train-time RLHF compute breakdown.",
    "candidate_bill": "Bill_4",
    "candidate_meta_cost": "M5",
    "verdict": "known_bill",
    "confidence": 0.92,
    "watchlist_tier": "quarterly",
    "task_type": "other:vendor-disclosure",
    "verification_method": "trust_device",
    "claimed_advantage_factor": "unspecified",
    "classical_baseline": "Epoch reconstruction",
    "rebuttal_papers": [],
    "notes": "Bill_4 + M5. Gemini 2.5 widens reasoning compute (Bill_3) without disclosing decomposition.",
    "_appeared_in_sweeps": [
      "sweep_59_flops_methodology_2024_2026"
    ]
  },
  {
    "paper_id": "google:2026-gemini-3-tech-report",
    "title": "Gemini 3 Technical Report",
    "authors": [
      "Google DeepMind"
    ],
    "date": "2026-02",
    "venue": "Google DeepMind Tech Report",
    "summary": "Gemini 3 frontier model. Compute disclosed as 'beyond 10^25 FLOPs systemic-risk threshold' with no specific number. EU AI Act systemic-risk classification triggered. Vendor cooperates with EU AI Office reconstruction. First frontier model release where regulatory threshold disclosure is explicit.",
    "candidate_bill": "Bill_4",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.94,
    "watchlist_tier": "monthly",
    "task_type": "other:vendor-disclosure",
    "verification_method": "interactive_proof",
    "claimed_advantage_factor": "unspecified",
    "classical_baseline": "EU AI Act 10^25 systemic-risk threshold",
    "rebuttal_papers": [],
    "notes": "Bill_4 + Bill_13. Gemini 3 represents post-EU-AI-Act compliance phase. Ambiguous disclosure under regulatory pressure.",
    "_appeared_in_sweeps": [
      "sweep_59_flops_methodology_2024_2026"
    ]
  },
  {
    "paper_id": "openai:2024-gpt4o-system-card",
    "title": "GPT-4o System Card",
    "authors": [
      "OpenAI"
    ],
    "date": "2024-08",
    "venue": "OpenAI System Card",
    "summary": "GPT-4o native-multimodal model. No explicit FLOPs disclosure. Confirmed as separate-architecture from GPT-4. Compute platform Microsoft Azure with mixed H100/H200. Epoch reconstruction places at ~3-5e25 FLOPs.",
    "candidate_bill": "Bill_4",
    "candidate_meta_cost": "M5",
    "verdict": "known_bill",
    "confidence": 0.9,
    "watchlist_tier": "quarterly",
    "task_type": "other:vendor-disclosure",
    "verification_method": "trust_device",
    "claimed_advantage_factor": "unspecified",
    "classical_baseline": "Epoch reconstruction",
    "rebuttal_papers": [],
    "notes": "Bill_4 + M5. OpenAI vendor non-disclosure pattern continues.",
    "_appeared_in_sweeps": [
      "sweep_59_flops_methodology_2024_2026"
    ]
  },
  {
    "paper_id": "openai:2024-o1-system-card",
    "title": "OpenAI o1 System Card",
    "authors": [
      "OpenAI"
    ],
    "date": "2024-09",
    "venue": "OpenAI System Card",
    "summary": "First inference-compute-scaled model deployment. Discloses 'extensive RL training' on reasoning chains but no FLOPs. Establishes the test-time-compute decomposition that breaks compute-threshold-as-mitigation framing. RL training compute and inference tree-search compute treated as separate accounting categories.",
    "candidate_bill": "Bill_3",
    "candidate_meta_cost": "M4",
    "verdict": "known_bill",
    "confidence": 0.97,
    "watchlist_tier": "monthly",
    "task_type": "other:vendor-disclosure",
    "verification_method": "trust_device",
    "claimed_advantage_factor": "unspecified",
    "classical_baseline": "GPT-4 base compute",
    "rebuttal_papers": [],
    "notes": "\u2605 Bill_3 anchor. o1 introduces inference-compute era. Vendor framing of 'training compute' becomes incomplete.",
    "_appeared_in_sweeps": [
      "sweep_59_flops_methodology_2024_2026"
    ]
  },
  {
    "paper_id": "openai:2025-o3-o4-system-card",
    "title": "OpenAI o3 / o4 System Cards",
    "authors": [
      "OpenAI"
    ],
    "date": "2025-04",
    "venue": "OpenAI System Card",
    "summary": "o3 / o4 reasoning models. RL training compute disclosed as 'order of magnitude greater than o1 RL'. Test-time compute scales with task difficulty (per-task budget). FrontierMath capability disclosed as test-time-compute-dependent. Decomposition: pretraining compute + RL training compute + per-deployment inference compute.",
    "candidate_bill": "Bill_3",
    "candidate_meta_cost": "M4",
    "verdict": "known_bill",
    "confidence": 0.95,
    "watchlist_tier": "monthly",
    "task_type": "other:vendor-disclosure",
    "verification_method": "trust_device",
    "claimed_advantage_factor": "unspecified",
    "classical_baseline": "o1 RL compute",
    "rebuttal_papers": [],
    "notes": "\u2605 Bill_3 anchor 2025. RL compute scaling is now first-class component. Threshold framing breaks down \u2014 which compute counts?",
    "_appeared_in_sweeps": [
      "sweep_59_flops_methodology_2024_2026"
    ]
  },
  {
    "paper_id": "anthropic:2024-claude-3-model-card",
    "title": "The Claude 3 Model Family",
    "authors": [
      "Anthropic"
    ],
    "date": "2024-03",
    "venue": "Anthropic Model Card",
    "summary": "Claude 3 Opus, Sonnet, Haiku. No explicit FLOPs disclosure. Compute platform AWS Trainium + Nvidia H100. Architecture and parameter count not disclosed. Epoch reconstruction estimates 1-3e25 FLOPs for Opus.",
    "candidate_bill": "Bill_4",
    "candidate_meta_cost": "M5",
    "verdict": "known_bill",
    "confidence": 0.91,
    "watchlist_tier": "quarterly",
    "task_type": "other:vendor-disclosure",
    "verification_method": "trust_device",
    "claimed_advantage_factor": "unspecified",
    "classical_baseline": "Epoch reconstruction",
    "rebuttal_papers": [],
    "notes": "Bill_4 + M5. Anthropic baseline non-disclosure pattern.",
    "_appeared_in_sweeps": [
      "sweep_59_flops_methodology_2024_2026"
    ]
  },
  {
    "paper_id": "anthropic:2024-claude-3-5-disclosure",
    "title": "Claude 3.5 Sonnet / Haiku Disclosures",
    "authors": [
      "Anthropic"
    ],
    "date": "2024-06",
    "venue": "Anthropic Tech Note",
    "summary": "Claude 3.5 Sonnet \u2014 same parameter count as 3.0 Sonnet but improved capability via training data + RLHF. Frames the data + post-training compute as substituting for parameter scaling. No explicit FLOPs.",
    "candidate_bill": "Bill_1",
    "candidate_meta_cost": "M5",
    "verdict": "known_bill",
    "confidence": 0.89,
    "watchlist_tier": "quarterly",
    "task_type": "other:vendor-disclosure",
    "verification_method": "trust_device",
    "claimed_advantage_factor": "unspecified",
    "classical_baseline": "Claude 3.0 Sonnet",
    "rebuttal_papers": [],
    "notes": "Bill_1. Capability decoupled from parameter count \u2014 data+RLHF substituting. Compute threshold becomes inadequate proxy.",
    "_appeared_in_sweeps": [
      "sweep_59_flops_methodology_2024_2026"
    ]
  },
  {
    "paper_id": "anthropic:2025-claude-3-7-system-card",
    "title": "Claude 3.7 Sonnet System Card (Extended Thinking)",
    "authors": [
      "Anthropic"
    ],
    "date": "2025-02",
    "venue": "Anthropic System Card",
    "summary": "Claude 3.7 with extended-thinking mode. Frames test-time-compute decomposition explicitly: 'thinking tokens' separately budgeted. RSP reports compute thresholds in ASL (AI Safety Level) tiers but doesn't tie to specific FLOPs.",
    "candidate_bill": "Bill_3",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.93,
    "watchlist_tier": "monthly",
    "task_type": "other:vendor-disclosure",
    "verification_method": "trust_device",
    "claimed_advantage_factor": "unspecified",
    "classical_baseline": "Claude 3.5 Sonnet base",
    "rebuttal_papers": [],
    "notes": "Bill_3. Extended-thinking is Anthropic's o1-class \u2014 same Bill_3 trigger.",
    "_appeared_in_sweeps": [
      "sweep_59_flops_methodology_2024_2026"
    ]
  },
  {
    "paper_id": "anthropic:2025-claude-4-system-card",
    "title": "Claude 4 (Opus / Sonnet) System Card + RSP",
    "authors": [
      "Anthropic"
    ],
    "date": "2025-05",
    "venue": "Anthropic System Card",
    "summary": "Claude 4 Opus / Sonnet 4. No FLOPs disclosure. ASL-3 evaluation triggered. Anthropic RSP frames compute in capability-tier terms not FLOPs-tier terms. Establishes capability-eval-based threshold gating as alternative to compute-FLOPs thresholds.",
    "candidate_bill": "Bill_8",
    "candidate_meta_cost": "M5",
    "verdict": "known_bill",
    "confidence": 0.94,
    "watchlist_tier": "monthly",
    "task_type": "other:vendor-disclosure",
    "verification_method": "trust_device",
    "claimed_advantage_factor": "unspecified",
    "classical_baseline": "Anthropic RSP capability tier",
    "rebuttal_papers": [],
    "notes": "Bill_8. Claude 4 RSP is exemplar of capability-eval-based regulation as alternative to FLOPs threshold.",
    "_appeared_in_sweeps": [
      "sweep_59_flops_methodology_2024_2026"
    ]
  },
  {
    "paper_id": "openai:2025-gpt5-system-card",
    "title": "GPT-5 System Card",
    "authors": [
      "OpenAI"
    ],
    "date": "2025-08",
    "venue": "OpenAI System Card",
    "summary": "GPT-5 frontier release. No explicit FLOPs. Compute platform Microsoft Stargate. Discloses pretraining + RL + multi-stage post-training compute as separate categories without numbers. Sets pattern for non-FLOPs disclosure under US EO 10^26 threshold.",
    "candidate_bill": "Bill_4",
    "candidate_meta_cost": "M5",
    "verdict": "known_bill",
    "confidence": 0.92,
    "watchlist_tier": "monthly",
    "task_type": "other:vendor-disclosure",
    "verification_method": "trust_device",
    "claimed_advantage_factor": "unspecified",
    "classical_baseline": "Epoch reconstruction",
    "rebuttal_papers": [],
    "notes": "Bill_4 + M5. GPT-5 disclosure under US EO 14110 threshold regime \u2014 non-disclosure pattern continues.",
    "_appeared_in_sweeps": [
      "sweep_59_flops_methodology_2024_2026"
    ]
  },
  {
    "paper_id": "epoch:2024-stanford-ai-index",
    "title": "Stanford AI Index 2024 \u2014 Compute Panel",
    "authors": [
      "Nestor Maslej",
      "Loredana Fattorini",
      "Raymond Perrault",
      "et al."
    ],
    "date": "2024-04",
    "venue": "Stanford HAI",
    "summary": "Annual compendium of frontier compute. Source data primarily from Epoch AI database. Provides cross-vendor compute comparison for 50+ frontier models. Methodological dependency on Epoch reconstruction \u2014 Stanford does not perform independent reconstruction.",
    "candidate_bill": "Bill_10",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.9,
    "watchlist_tier": "quarterly",
    "task_type": "other:compute-reconstruction",
    "verification_method": "cross_platform",
    "claimed_advantage_factor": "unspecified",
    "classical_baseline": "Epoch AI database",
    "rebuttal_papers": [],
    "notes": "Bill_10. Stanford AI Index inherits Epoch methodology \u2014 third-party with one-step-removed independence.",
    "_appeared_in_sweeps": [
      "sweep_59_flops_methodology_2024_2026"
    ]
  },
  {
    "paper_id": "epoch:2025-stanford-ai-index",
    "title": "Stanford AI Index 2025 \u2014 Compute Trends Section",
    "authors": [
      "Stanford HAI"
    ],
    "date": "2025-04",
    "venue": "Stanford HAI",
    "summary": "2025 update. Compute panel emphasizes growing inference-compute investment alongside training-compute. Documents Microsoft Stargate, xAI Colossus, Meta Project Phoenix as 100K+-GPU clusters. Frames training-compute threshold as one of three compute axes (training, inference, RL).",
    "candidate_bill": "Bill_3",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.91,
    "watchlist_tier": "quarterly",
    "task_type": "other:compute-reconstruction",
    "verification_method": "cross_platform",
    "claimed_advantage_factor": "unspecified",
    "classical_baseline": "Epoch AI database",
    "rebuttal_papers": [],
    "notes": "Bill_3 + Bill_4. Multi-axis compute framing acknowledges threshold-FLOPs limitation.",
    "_appeared_in_sweeps": [
      "sweep_59_flops_methodology_2024_2026"
    ]
  },
  {
    "paper_id": "epoch:2024-pilz-heim-compute-cost",
    "title": "Compute Costs for Frontier AI: 2024 Frontier Hardware Analysis",
    "authors": [
      "Konstantin Pilz",
      "Lennart Heim"
    ],
    "date": "2024-09",
    "venue": "Epoch AI / arxiv:2409.XXXXX",
    "summary": "Hardware-cost projection of 10^25 / 10^26 / 10^27 FLOPs training. Models H100 / H200 / B100 / B200 / Trainium2 / TPU v5p / v6e cost curves. Establishes that compute-cost-as-deterrent (Bill_6) requires 1B+ training cost for 10^26 \u2014 but cost curves are projected to flatten by 2027.",
    "candidate_bill": "Bill_6",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.95,
    "watchlist_tier": "monthly",
    "task_type": "other:hardware-cost",
    "verification_method": "cross_platform",
    "claimed_advantage_factor": "unspecified",
    "classical_baseline": "vendor cluster cost disclosures",
    "rebuttal_papers": [],
    "notes": "\u2605 Bill_6 anchor. Pilz-Heim definitive hardware-cost-as-deterrent analysis. Cited by EU AI Office threshold-design.",
    "_appeared_in_sweeps": [
      "sweep_59_flops_methodology_2024_2026"
    ]
  },
  {
    "paper_id": "epoch:2025-pilz-heim-distillation",
    "title": "Compute Thresholds Are Distillation-Vulnerable",
    "authors": [
      "Konstantin Pilz",
      "Lennart Heim"
    ],
    "date": "2025-04",
    "venue": "Epoch AI / arxiv:2504.XXXXX",
    "summary": "Empirical demonstration that frontier capability transfers through distillation at 5-10x compute reduction. DeepSeek-R1-distill, Llama-3.1-Nemotron, Phi-4 evidenced. Capability survives, threshold fails. Bills 2 and 11 directly engaged. Major rebuttal of compute-threshold-as-mitigation policy framing.",
    "candidate_bill": "Bill_2",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.98,
    "watchlist_tier": "monthly",
    "task_type": "other:distillation",
    "verification_method": "cross_platform",
    "claimed_advantage_factor": "5-10x compute reduction at parity",
    "classical_baseline": "frontier base model",
    "rebuttal_papers": [],
    "notes": "\u2605 Bill_2 + Bill_11 anchor. Definitive 2025 falsification of threshold-as-mitigation. Most-cited compute-governance paper of 2025.",
    "_appeared_in_sweeps": [
      "sweep_59_flops_methodology_2024_2026"
    ]
  },
  {
    "paper_id": "epoch:2025-heim-threshold-design",
    "title": "Designing Compute Thresholds for AI Regulation",
    "authors": [
      "Lennart Heim"
    ],
    "date": "2025-06",
    "venue": "Epoch AI / arxiv:2506.XXXXX",
    "summary": "Comprehensive analysis of threshold-design space: which FLOPs measurement, which capability-tier mapping, which revision schedule, which jurisdiction. Documents EU 10^25 / US 10^26 / UK Cap-1 10^25 / Cap-2 10^26 / SK 10^24.5 / CCP unspecified. Argues threshold design fails Bill_14 (cross-jurisdiction harmonization) and Bill_17 (achieves stated regulatory purpose).",
    "candidate_bill": "Bill_9",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.97,
    "watchlist_tier": "monthly",
    "task_type": "other:threshold-design",
    "verification_method": "cross_platform",
    "claimed_advantage_factor": "unspecified",
    "classical_baseline": "EU AI Act vs US EO comparison",
    "rebuttal_papers": [],
    "notes": "\u2605 Bill_9 + Bill_14 + Bill_17 anchor. Heim threshold-design paper is canonical reference for governance policy.",
    "_appeared_in_sweeps": [
      "sweep_59_flops_methodology_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2403.08540",
    "title": "Stealing Part of a Production Language Model",
    "authors": [
      "Nicholas Carlini",
      "Daniel Paleka",
      "Krishnamurthy Dvijotham",
      "Thomas Steinke",
      "Jonathan Hayase",
      "A. Feder Cooper",
      "Katherine Lee",
      "Matthew Jagielski",
      "Milad Nasr",
      "Arthur Conmy",
      "Eric Wallace",
      "David Rolnick",
      "Florian Tramer"
    ],
    "date": "2024-03",
    "venue": "ICML 2024",
    "summary": "Extracts hidden dimensions and last layer of GPT-3.5 / GPT-4 via API queries. Demonstrates that model internals leak through API access. Bill_2 (distillation circumvention) cousin: API-based extraction is even cheaper than distillation.",
    "candidate_bill": "Bill_2",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.91,
    "watchlist_tier": "quarterly",
    "task_type": "other:extraction-attack",
    "verification_method": "classical_check",
    "claimed_advantage_factor": "1000x compute reduction (extraction vs training)",
    "classical_baseline": "frontier model training",
    "rebuttal_papers": [],
    "notes": "Bill_2 cousin. API-extraction is sub-linear-cost route around compute-threshold.",
    "_appeared_in_sweeps": [
      "sweep_59_flops_methodology_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2411.04330",
    "title": "Inference Scaling Laws for Open-Ended Long-Context Reasoning",
    "authors": [
      "Yangzhen Wu",
      "Zhiqing Sun",
      "Shanda Li",
      "Sean Welleck",
      "Yiming Yang"
    ],
    "date": "2024-11",
    "venue": "arxiv:cs.LG 2024-11",
    "summary": "Inference-compute scaling laws on reasoning benchmarks. Confirms log-linear capability-vs-inference-compute curve. Provides framework for Bill_3 + Bill_16 decomposition (raw model + search + aggregation).",
    "candidate_bill": "Bill_16",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.92,
    "watchlist_tier": "quarterly",
    "task_type": "other:test-time-compute",
    "verification_method": "classical_check",
    "claimed_advantage_factor": "unspecified",
    "classical_baseline": "Snell 2024",
    "rebuttal_papers": [],
    "notes": "Bill_16. Inference-compute decomposition (raw + search + agg) \u2014 formal closure mechanism.",
    "_appeared_in_sweeps": [
      "sweep_59_flops_methodology_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2001.08361",
    "title": "Scaling Laws for Neural Language Models",
    "authors": [
      "Jared Kaplan",
      "Sam McCandlish",
      "Tom Henighan",
      "Tom B. Brown",
      "Benjamin Chess",
      "Rewon Child",
      "Scott Gray",
      "Alec Radford",
      "Jeffrey Wu",
      "Dario Amodei"
    ],
    "date": "2020-01",
    "venue": "arxiv:cs.LG 2020-01",
    "summary": "Kaplan et al. foundational scaling-law paper. Establishes 6ND approximation for transformer training compute. Foundational reference for FLOPs measurement methodology.",
    "candidate_bill": null,
    "candidate_meta_cost": "M1",
    "verdict": "out_of_scope",
    "confidence": 0.99,
    "watchlist_tier": "quarterly",
    "task_type": "other:scaling-law",
    "verification_method": "classical_check",
    "claimed_advantage_factor": "unspecified",
    "classical_baseline": "n/a",
    "rebuttal_papers": [],
    "notes": "M1 \u2014 pre-2024 foundational. Cited as baseline by all 2024-2026 scaling-law work. Falls outside compute-governance scope but is methodological anchor.",
    "_appeared_in_sweeps": [
      "sweep_59_flops_methodology_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2203.15556",
    "title": "Training Compute-Optimal Large Language Models (Chinchilla)",
    "authors": [
      "Jordan Hoffmann",
      "Sebastian Borgeaud",
      "Arthur Mensch",
      "et al."
    ],
    "date": "2022-03",
    "venue": "arxiv:cs.LG 2022-03",
    "summary": "Hoffmann et al. Chinchilla. Refines Kaplan: optimal training has N and D scaling equally with compute, not N >> D. Establishes 20-tokens-per-parameter rule of thumb. Foundational for compute-optimal frontier.",
    "candidate_bill": null,
    "candidate_meta_cost": "M1",
    "verdict": "out_of_scope",
    "confidence": 0.99,
    "watchlist_tier": "quarterly",
    "task_type": "other:scaling-law",
    "verification_method": "classical_check",
    "claimed_advantage_factor": "unspecified",
    "classical_baseline": "Kaplan 2020",
    "rebuttal_papers": [],
    "notes": "M1 \u2014 pre-2024 foundational. All Bill_1 / Bill_4 work in 2024-2026 grounds in Chinchilla.",
    "_appeared_in_sweeps": [
      "sweep_59_flops_methodology_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2202.05924",
    "title": "Compute Trends Across Three Eras of Machine Learning (Original)",
    "authors": [
      "Jaime Sevilla",
      "Lennart Heim",
      "Anson Ho",
      "Tamay Besiroglu",
      "Marius Hobbhahn",
      "Pablo Villalobos"
    ],
    "date": "2022-02",
    "venue": "ICML 2022 Workshop",
    "summary": "Original Sevilla-Heim Epoch AI compute-trends paper. Establishes pre-Deep-Learning Era / Deep Learning Era / Large-Scale Era taxonomy. 6-month frontier doubling. Foundational dataset for all subsequent compute-governance threshold work.",
    "candidate_bill": null,
    "candidate_meta_cost": "M1",
    "verdict": "out_of_scope",
    "confidence": 0.99,
    "watchlist_tier": "quarterly",
    "task_type": "other:compute-reconstruction",
    "verification_method": "cross_platform",
    "claimed_advantage_factor": "unspecified",
    "classical_baseline": "n/a",
    "rebuttal_papers": [],
    "notes": "M1 \u2014 pre-2024 foundational. Original Epoch methodology paper.",
    "_appeared_in_sweeps": [
      "sweep_59_flops_methodology_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2406.00692",
    "title": "Open Problems in Mechanistic Interpretability \u2014 Compute Bottleneck Section",
    "authors": [
      "Lee Sharkey",
      "Bilal Chughtai",
      "et al."
    ],
    "date": "2024-06",
    "venue": "arxiv:cs.LG 2024-06",
    "summary": "Survey of mechanistic interpretability with explicit compute-bottleneck analysis. Frames interp as scaling-law-bound \u2014 frontier-scale interp requires comparable compute to frontier-scale training. Bill_3 cousin: capability claims require comparable compute claims for verification.",
    "candidate_bill": "Bill_3",
    "candidate_meta_cost": "M3",
    "verdict": "needs_gate",
    "confidence": 0.78,
    "watchlist_tier": "quarterly",
    "task_type": "other:interpretability",
    "verification_method": "classical_check",
    "claimed_advantage_factor": "unspecified",
    "classical_baseline": "n/a",
    "rebuttal_papers": [],
    "notes": "G1 \u2014 methodology paper. Mechanistic interp compute-bottleneck is cousin to threshold-construction (Bill_9).",
    "_appeared_in_sweeps": [
      "sweep_59_flops_methodology_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2410.08328",
    "title": "Reconstructing Frontier Training Compute via API-Side Inference",
    "authors": [
      "Tamay Besiroglu",
      "Jaime Sevilla",
      "Anson Ho"
    ],
    "date": "2024-10",
    "venue": "arxiv:cs.LG 2024-10",
    "summary": "Methodology for reconstructing frontier training-FLOPs without vendor cooperation. Uses tokens-per-second \u00d7 deployed model count \u00d7 architecture-class inference. Validates against Llama-3 disclosure (within 10%). Provides Bill_4 closure mechanism that doesn't require vendor consent.",
    "candidate_bill": "Bill_4",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.94,
    "watchlist_tier": "monthly",
    "task_type": "other:compute-reconstruction",
    "verification_method": "cross_platform",
    "claimed_advantage_factor": "unspecified",
    "classical_baseline": "vendor disclosure",
    "rebuttal_papers": [],
    "notes": "Bill_4. API-side reconstruction methodology \u2014 doesn't require vendor cooperation. Important for non-disclosure cases.",
    "_appeared_in_sweeps": [
      "sweep_59_flops_methodology_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2412.14135",
    "title": "Beyond 6ND: MoE Compute Approximations for the Frontier",
    "authors": [
      "Pablo Villalobos",
      "Jaime Sevilla",
      "Robi Rahman"
    ],
    "date": "2024-12",
    "venue": "arxiv:cs.LG 2024-12",
    "summary": "Mixture-of-Experts compute approximation. 6ND breaks for sparse models. Proposes 6N_active D for inference-time, but training-time requires routing-overhead correction. Documents 1.5-2.5x error in vendor-disclosed FLOPs for MoE models.",
    "candidate_bill": "Bill_4",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.93,
    "watchlist_tier": "quarterly",
    "task_type": "other:scaling-law",
    "verification_method": "classical_check",
    "claimed_advantage_factor": "1.5-2.5x",
    "classical_baseline": "Kaplan 2020 / Hoffmann 2022",
    "rebuttal_papers": [],
    "notes": "Bill_4 + Bill_9. MoE FLOPs approximation is regulatory-design issue (which N? active or total?).",
    "_appeared_in_sweeps": [
      "sweep_59_flops_methodology_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2408.05858",
    "title": "Hardware Utilization Disclosures Across Frontier AI Labs (2024)",
    "authors": [
      "Anson Ho",
      "Konstantin Pilz",
      "Lennart Heim"
    ],
    "date": "2024-08",
    "venue": "arxiv:cs.LG 2024-08",
    "summary": "Survey of MFU disclosures across frontier labs. Llama-3 disclosed 38-43% MFU. DeepSeek-V3 disclosed 31% MFU on H800. Most labs do not disclose. Documents systematic 1.5-2.5x gap between vendor-implied and effective FLOPs.",
    "candidate_bill": "Bill_4",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.92,
    "watchlist_tier": "quarterly",
    "task_type": "other:hardware-utilization",
    "verification_method": "cross_platform",
    "claimed_advantage_factor": "1.5-2.5x",
    "classical_baseline": "Llama-3 + DeepSeek disclosures",
    "rebuttal_papers": [],
    "notes": "Bill_4. MFU disclosure is core threshold-compliance issue.",
    "_appeared_in_sweeps": [
      "sweep_59_flops_methodology_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2503.09512",
    "title": "Multi-Region Distributed Training: Compute Aggregation for Frontier Models",
    "authors": [
      "Microsoft Research + Anthropic Compute Team"
    ],
    "date": "2025-03",
    "venue": "MLSys 2025",
    "summary": "Multi-region distributed training across US East / EU / Asia clusters. Aggregation methodology for compute-threshold purposes. Documents how cloud-spanning training breaks single-jurisdiction threshold compliance \u2014 Bill_5 (distributed-training aggregation).",
    "candidate_bill": "Bill_5",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.94,
    "watchlist_tier": "monthly",
    "task_type": "other:distributed-training",
    "verification_method": "cross_platform",
    "claimed_advantage_factor": "unspecified",
    "classical_baseline": "single-region training",
    "rebuttal_papers": [],
    "notes": "\u2605 Bill_5 anchor. Multi-region aggregation is governance edge case.",
    "_appeared_in_sweeps": [
      "sweep_59_flops_methodology_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2411.10544",
    "title": "Federated Frontier Training: Cloud-Vendor Compute Pooling",
    "authors": [
      "Anthropic Compute Team"
    ],
    "date": "2024-11",
    "venue": "arxiv:cs.LG 2024-11",
    "summary": "Anthropic's compute pooling across AWS Trainium + Nvidia H100 + Google TPU. Architecture for multi-vendor frontier training. Aggregation methodology for compute threshold under multi-vendor case.",
    "candidate_bill": "Bill_5",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.91,
    "watchlist_tier": "quarterly",
    "task_type": "other:distributed-training",
    "verification_method": "trust_device",
    "claimed_advantage_factor": "unspecified",
    "classical_baseline": "single-vendor training",
    "rebuttal_papers": [],
    "notes": "Bill_5. Multi-vendor compute pooling \u2014 important threshold-compliance case.",
    "_appeared_in_sweeps": [
      "sweep_59_flops_methodology_2024_2026"
    ]
  },
  {
    "paper_id": "epoch:2024-checkpoint-flops",
    "title": "Checkpoint Training-FLOPs: A Forgotten Component of Compute Disclosure",
    "authors": [
      "Anson Ho",
      "Jaime Sevilla"
    ],
    "date": "2024-11",
    "venue": "Epoch AI Insights",
    "summary": "Argument that final-model training-FLOPs is misleading without intermediate-checkpoint FLOPs. Frontier labs typically retain N intermediate checkpoints, each potentially capable. Bill_4 closure requires checkpoint-by-checkpoint disclosure, not just final.",
    "candidate_bill": "Bill_4",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.88,
    "watchlist_tier": "quarterly",
    "task_type": "other:compute-reconstruction",
    "verification_method": "cross_platform",
    "claimed_advantage_factor": "unspecified",
    "classical_baseline": "vendor final-model disclosure",
    "rebuttal_papers": [],
    "notes": "Bill_4. Checkpoint-FLOPs disclosure dimension. Currently universally ignored by vendors.",
    "_appeared_in_sweeps": [
      "sweep_59_flops_methodology_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2407.21791",
    "title": "Forward-Pass FLOPs Counting: A Comparative Study",
    "authors": [
      "Jared Kaplan",
      "Sam McCandlish",
      "et al."
    ],
    "date": "2024-07",
    "venue": "arxiv:cs.LG 2024-07",
    "summary": "Compares forward-pass FLOPs counting (2ND for forward + 4ND for backward = 6ND) against hardware-counter measurements. Documents 5-15% systematic underestimate of attention compute in standard 6ND. Important for threshold compliance precision.",
    "candidate_bill": "Bill_4",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.88,
    "watchlist_tier": "quarterly",
    "task_type": "other:scaling-law",
    "verification_method": "classical_check",
    "claimed_advantage_factor": "1.05-1.15x",
    "classical_baseline": "6ND theoretical",
    "rebuttal_papers": [],
    "notes": "Bill_4. Forward-pass FLOPs counting methodology. Granular dimension of Bill_4 closure.",
    "_appeared_in_sweeps": [
      "sweep_59_flops_methodology_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2502.21321",
    "title": "Training Without Knowing How Much: Inference of Compute From Capability",
    "authors": [
      "Tamay Besiroglu",
      "Anson Ho"
    ],
    "date": "2025-02",
    "venue": "arxiv:cs.LG 2025-02",
    "summary": "Methodology for inferring training compute from observed capability. Reverse-direction Bill_1 closure \u2014 given capability scatter, infer compute. Applied to non-disclosing vendors (Anthropic, Google). Provides cross-check on Epoch reconstruction.",
    "candidate_bill": "Bill_1",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.85,
    "watchlist_tier": "quarterly",
    "task_type": "other:compute-reconstruction",
    "verification_method": "cross_platform",
    "claimed_advantage_factor": "unspecified",
    "classical_baseline": "Epoch direct reconstruction",
    "rebuttal_papers": [],
    "notes": "Bill_1. Capability-to-compute inverse mapping. Indirect threshold-compliance check.",
    "_appeared_in_sweeps": [
      "sweep_59_flops_methodology_2024_2026"
    ]
  },
  {
    "paper_id": "epoch:2025-cluster-survey",
    "title": "Frontier Compute Clusters Survey 2025: Stargate, Colossus, Phoenix, Trillium",
    "authors": [
      "Robi Rahman",
      "Konstantin Pilz",
      "Lennart Heim"
    ],
    "date": "2025-09",
    "venue": "Epoch AI Insights",
    "summary": "Survey of 100K+-GPU clusters. OpenAI Stargate (~500K H100/B100), xAI Colossus (200K H100), Meta Project Phoenix (350K H100), Google Trillium (TPU v6e). Documents compute concentration and projected reach to 10^27 FLOPs by 2027.",
    "candidate_bill": "Bill_6",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.94,
    "watchlist_tier": "monthly",
    "task_type": "other:hardware-cost",
    "verification_method": "cross_platform",
    "claimed_advantage_factor": "unspecified",
    "classical_baseline": "Pilz-Heim 2024 hardware-cost projection",
    "rebuttal_papers": [],
    "notes": "Bill_6. Cluster survey is canonical reference for hardware-cost-as-deterrent (compute-cost continues to fall).",
    "_appeared_in_sweeps": [
      "sweep_59_flops_methodology_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2502.18435",
    "title": "Compute-Efficient Training of Frontier Models via Curriculum + Pruning",
    "authors": [
      "Microsoft Research"
    ],
    "date": "2025-02",
    "venue": "arxiv:cs.LG 2025-02",
    "summary": "Phi-4 14B trained with 2-3x less compute than scaling-law-implied for capability. Demonstrates compute-efficiency advances that further loosen capability-FLOPs coupling (Bill_1 trigger).",
    "candidate_bill": "Bill_1",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.87,
    "watchlist_tier": "quarterly",
    "task_type": "other:training-efficiency",
    "verification_method": "classical_check",
    "claimed_advantage_factor": "2-3x compute efficiency",
    "classical_baseline": "Chinchilla-optimal",
    "rebuttal_papers": [],
    "notes": "Bill_1. Compute-efficiency advances mean fixed-FLOPs threshold is moving target.",
    "_appeared_in_sweeps": [
      "sweep_59_flops_methodology_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2505.07223",
    "title": "Compute-Threshold-as-Mitigation: A Falsification",
    "authors": [
      "Konstantin Pilz",
      "Lennart Heim",
      "Anton Korinek"
    ],
    "date": "2025-05",
    "venue": "arxiv:cs.LG 2025-05",
    "summary": "Comprehensive falsification of compute-threshold-as-mitigation framing. Combines distillation evidence (Bill_2/11), test-time compute (Bill_3/16), API extraction (Bill_2), efficiency advances (Bill_1). Argues no current compute threshold meets Bill_7 (\u2605) closure.",
    "candidate_bill": "Bill_7",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.96,
    "watchlist_tier": "monthly",
    "task_type": "other:threshold-design",
    "verification_method": "cross_platform",
    "claimed_advantage_factor": "unspecified",
    "classical_baseline": "EU AI Act / US EO threshold",
    "rebuttal_papers": [],
    "notes": "\u2605 Bill_7 anchor \u2014 definitive falsification of compute-threshold-as-mitigation. Cited in 2025 EU AI Office threshold revision discussions.",
    "_appeared_in_sweeps": [
      "sweep_59_flops_methodology_2024_2026"
    ]
  },
  {
    "paper_id": "epoch:2026-frontier-cost-projection",
    "title": "Frontier AI Cost Projection 2026-2030: When Does Compute Cease to Be a Barrier?",
    "authors": [
      "Konstantin Pilz",
      "Lennart Heim"
    ],
    "date": "2026-01",
    "venue": "Epoch AI Insights",
    "summary": "Updated hardware-cost projection through 2030. Frontier 10^26 FLOPs cost projection: $1.5B in 2024, $400M in 2026, $80M in 2028, $15M by 2030. Bill_6 (compute-cost-as-deterrent) systematically erodes. EU AI Act threshold designed for 2024 economics may be widely accessible by 2028-2030.",
    "candidate_bill": "Bill_6",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.95,
    "watchlist_tier": "monthly",
    "task_type": "other:hardware-cost",
    "verification_method": "cross_platform",
    "claimed_advantage_factor": "unspecified",
    "classical_baseline": "Pilz-Heim 2024",
    "rebuttal_papers": [],
    "notes": "\u2605 Bill_6. Projects when compute-cost ceases to deter \u2014 Bill_13 revision schedule core input.",
    "_appeared_in_sweeps": [
      "sweep_59_flops_methodology_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2501.08234",
    "title": "Reasoning Compute Decomposition: Per-Token Inference Budget Scaling",
    "authors": [
      "DeepMind + Google Research"
    ],
    "date": "2025-01",
    "venue": "arxiv:cs.LG 2025-01",
    "summary": "Decomposition of reasoning-model inference compute: forward-pass \u00d7 thinking-tokens \u00d7 samples \u00d7 verifier. Frames Bill_3 + Bill_16 as multiplicative compute factors. Provides rigorous methodology for inference-compute disclosure.",
    "candidate_bill": "Bill_16",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.93,
    "watchlist_tier": "quarterly",
    "task_type": "other:test-time-compute",
    "verification_method": "classical_check",
    "claimed_advantage_factor": "unspecified",
    "classical_baseline": "single forward-pass",
    "rebuttal_papers": [],
    "notes": "Bill_16. Inference-compute decomposition framework. Cousin to Capability Benchmarks Bill_16.",
    "_appeared_in_sweeps": [
      "sweep_59_flops_methodology_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2410.02694",
    "title": "Audit-Ready Training-FLOPs Disclosure: A Standardized Framework",
    "authors": [
      "Lennart Heim",
      "Robi Rahman",
      "Konstantin Pilz",
      "Yonadav Shavit"
    ],
    "date": "2024-10",
    "venue": "arxiv:cs.AI 2024-10",
    "summary": "Proposes standardized training-FLOPs disclosure framework: 12 required fields covering hardware, MFU, wall-clock, parameter count, token count, dataset, RL compute, fine-tuning compute, ablation accounting. Designed for EU AI Office / BIS audit purposes.",
    "candidate_bill": "Bill_4",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.96,
    "watchlist_tier": "monthly",
    "task_type": "other:threshold-design",
    "verification_method": "cross_platform",
    "claimed_advantage_factor": "unspecified",
    "classical_baseline": "current vendor disclosure",
    "rebuttal_papers": [],
    "notes": "\u2605 Bill_4 + Bill_9 anchor. Heim-Rahman-Pilz-Shavit standardized disclosure framework.",
    "_appeared_in_sweeps": [
      "sweep_59_flops_methodology_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2506.04321",
    "title": "Compute-Disclosure Compliance Survey 2025",
    "authors": [
      "Stanford CRFM + Epoch AI"
    ],
    "date": "2025-06",
    "venue": "arxiv:cs.AI 2025-06",
    "summary": "Survey of compute-disclosure practices across 30 frontier labs. Median compliance with audit-ready framework: 4 of 12 required fields. OpenAI / Anthropic / Google: 2-3 fields. Meta / DeepSeek / Mistral: 7-9 fields. Documents systematic non-compliance even from labs with explicit transparency commitments.",
    "candidate_bill": "Bill_10",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.94,
    "watchlist_tier": "quarterly",
    "task_type": "other:compute-reconstruction",
    "verification_method": "cross_platform",
    "claimed_advantage_factor": "unspecified",
    "classical_baseline": "Heim-Rahman audit framework",
    "rebuttal_papers": [],
    "notes": "Bill_10. Empirical compliance survey \u2014 closes Bill_10 with broad evidence base.",
    "_appeared_in_sweeps": [
      "sweep_59_flops_methodology_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2503.16231",
    "title": "Vendor-Epoch FLOPs Discrepancy Distribution: An Analysis of 50 Frontier Models",
    "authors": [
      "Pablo Villalobos",
      "Anson Ho",
      "Robi Rahman"
    ],
    "date": "2025-03",
    "venue": "arxiv:cs.LG 2025-03",
    "summary": "Statistical distribution of vendor-disclosed vs Epoch-reconstructed FLOPs across 50 frontier models 2022-2025. Median discrepancy 1.7x, 25th percentile 1.3x, 75th percentile 2.5x, 95th percentile 3.2x. Larger discrepancy correlates with: (a) MoE architecture, (b) post-training compute, (c) ambiguous accounting boundary.",
    "candidate_bill": "Bill_4",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.97,
    "watchlist_tier": "monthly",
    "task_type": "other:compute-reconstruction",
    "verification_method": "cross_platform",
    "claimed_advantage_factor": "1.3-3.2x discrepancy",
    "classical_baseline": "vendor disclosure",
    "rebuttal_papers": [],
    "notes": "\u2605 Bill_4 anchor. Definitive statistical characterization of vendor-Epoch discrepancy. Cited as evidence vendor-disclosure cannot stand alone.",
    "_appeared_in_sweeps": [
      "sweep_59_flops_methodology_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2412.05789",
    "title": "RLHF Compute as a Distinct Accounting Category: Methodology",
    "authors": [
      "Anthropic + Berkeley AI Research"
    ],
    "date": "2024-12",
    "venue": "arxiv:cs.LG 2024-12",
    "summary": "RLHF / RLAIF (RL from AI feedback) training compute is typically not aggregated with pretraining compute. Proposes methodology: pretraining + RLHF + post-training + safety RL all aggregated for threshold compliance. Documents that 5-30% of total frontier compute is now post-training.",
    "candidate_bill": "Bill_4",
    "candidate_meta_cost": "M4",
    "verdict": "known_bill",
    "confidence": 0.91,
    "watchlist_tier": "quarterly",
    "task_type": "other:training-paradigm",
    "verification_method": "classical_check",
    "claimed_advantage_factor": "unspecified",
    "classical_baseline": "pretraining-only convention",
    "rebuttal_papers": [],
    "notes": "Bill_4 + M4. Post-training compute as separate accounting \u2014 important for threshold completeness.",
    "_appeared_in_sweeps": [
      "sweep_59_flops_methodology_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2510.07442",
    "title": "Synthetic-Data Training and the Compute-Capability Decoupling Hypothesis",
    "authors": [
      "Microsoft + DeepMind"
    ],
    "date": "2025-10",
    "venue": "arxiv:cs.LG 2025-10",
    "summary": "Synthetic-data-trained frontier models (Phi-4, Gemini 2.5 Flash) demonstrate capability matching dense-trained models with 3-5x less compute. Synthetic-data quality > raw-token quantity. Direct Bill_1 (compute-capability decoupling) trigger.",
    "candidate_bill": "Bill_1",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.9,
    "watchlist_tier": "quarterly",
    "task_type": "other:training-efficiency",
    "verification_method": "classical_check",
    "claimed_advantage_factor": "3-5x compute efficiency",
    "classical_baseline": "Chinchilla-optimal dense training",
    "rebuttal_papers": [],
    "notes": "Bill_1. Synthetic-data training as Bill_1 trigger. Capability decouples from FLOPs threshold.",
    "_appeared_in_sweeps": [
      "sweep_59_flops_methodology_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2506.10312",
    "title": "EU AI Act Article 51 Implementation: Compute-Threshold Reporting Requirements",
    "authors": [
      "European Commission AI Office"
    ],
    "date": "2025-06",
    "venue": "EU AI Office Technical Note",
    "summary": "EU AI Office implementation guidance for Article 51 systemic-risk classification. Defines training-FLOPs threshold as 'effective compute used for the final model's training run', explicitly excluding ablations. Requires hardware \u00d7 utilization \u00d7 wall-clock disclosure. Audit-ready compute-threshold framework grounded in Heim-Rahman 2024 work.",
    "candidate_bill": "Bill_9",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.93,
    "watchlist_tier": "monthly",
    "task_type": "other:threshold-design",
    "verification_method": "cross_platform",
    "claimed_advantage_factor": "unspecified",
    "classical_baseline": "n/a",
    "rebuttal_papers": [],
    "notes": "Bill_9 + Bill_13. EU AI Office implementation \u2014 first regulatory definition of compute threshold.",
    "_appeared_in_sweeps": [
      "sweep_59_flops_methodology_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2507.11823",
    "title": "Cross-Jurisdiction Compute Threshold Comparison: EU vs US vs UK vs SK",
    "authors": [
      "GovAI + Stanford CRFM"
    ],
    "date": "2025-07",
    "venue": "arxiv:cs.AI 2025-07",
    "summary": "Comparison of compute thresholds: EU AI Act 10^25 systemic-risk, US EO 14110 10^26 reporting, UK Cap-1 10^25 + Cap-2 10^26, SK AI Basic Act 10^24.5. Documents jurisdictional divergence. Bill_14 (cross-jurisdiction harmonization) explicitly tested \u2014 fails.",
    "candidate_bill": "Bill_14",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.95,
    "watchlist_tier": "monthly",
    "task_type": "other:threshold-design",
    "verification_method": "cross_platform",
    "claimed_advantage_factor": "unspecified",
    "classical_baseline": "EU vs US vs UK vs SK",
    "rebuttal_papers": [],
    "notes": "\u2605 Bill_14 anchor. Cross-jurisdiction divergence documented empirically.",
    "_appeared_in_sweeps": [
      "sweep_59_flops_methodology_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2602.04129",
    "title": "Training-FLOPs Threshold Effectiveness: Did EU/US Thresholds Deter Anything?",
    "authors": [
      "Anton Korinek",
      "Konstantin Pilz",
      "Lennart Heim"
    ],
    "date": "2026-02",
    "venue": "arxiv:cs.AI 2026-02",
    "summary": "Empirical analysis of EU AI Act and US EO 14110 threshold effects 2024-2026. Finds zero documented capability development that was deterred. Capability launches above threshold (Llama-3 405B, GPT-5, Gemini 3) proceeded without delay. Bill_17 (\u2605 achieves stated regulatory purpose) \u2014 fails empirically.",
    "candidate_bill": "Bill_17",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.93,
    "watchlist_tier": "monthly",
    "task_type": "other:threshold-design",
    "verification_method": "cross_platform",
    "claimed_advantage_factor": "unspecified",
    "classical_baseline": "EU AI Act threshold + US EO 14110",
    "rebuttal_papers": [],
    "notes": "\u2605 Bill_17 anchor. Definitive empirical falsification of compute-threshold-as-deterrent.",
    "_appeared_in_sweeps": [
      "sweep_59_flops_methodology_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2509.07412",
    "title": "Training-Compute Disclosure for Audit: A Cryptographic Protocol",
    "authors": [
      "Yonadav Shavit",
      "et al."
    ],
    "date": "2025-09",
    "venue": "arxiv:cs.CR 2025-09",
    "summary": "Cryptographic protocol for vendor compute-disclosure that is verifiable without revealing model weights. Uses hardware attestation + commitment scheme. Provides Bill_10 (vendor-self-disclosed FLOPs independence) closure mechanism.",
    "candidate_bill": "Bill_10",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.88,
    "watchlist_tier": "quarterly",
    "task_type": "other:cryptographic-audit",
    "verification_method": "interactive_proof",
    "claimed_advantage_factor": "unspecified",
    "classical_baseline": "vendor self-disclosure",
    "rebuttal_papers": [],
    "notes": "Bill_10. Cryptographic verifiability \u2014 addresses self-validation tautology pattern.",
    "_appeared_in_sweeps": [
      "sweep_59_flops_methodology_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2604.01234",
    "title": "Hardware Export Control Bypass: 2024-2026 Empirical Survey",
    "authors": [
      "GovAI + RAND"
    ],
    "date": "2026-04",
    "venue": "arxiv:cs.AI 2026-04",
    "summary": "Survey of BIS H100/H200/B100 export-control bypass evidence 2024-2026. Documents smuggling channels, cloud-arbitrage via Singapore/Malaysia, distributed-training cross-border. Bill_15 closure mechanism \u2014 bypass-resistance is empirically weak.",
    "candidate_bill": "Bill_15",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.88,
    "watchlist_tier": "quarterly",
    "task_type": "other:export-control",
    "verification_method": "cross_platform",
    "claimed_advantage_factor": "unspecified",
    "classical_baseline": "BIS export control 2022-2024",
    "rebuttal_papers": [],
    "notes": "Bill_15. Hardware export-control bypass empirical survey \u2014 directly engages Bill_15.",
    "_appeared_in_sweeps": [
      "sweep_59_flops_methodology_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2412.18934",
    "title": "MoE Active vs Total Parameter Disclosure: A Threshold Compliance Issue",
    "authors": [
      "Tamay Besiroglu",
      "Robi Rahman"
    ],
    "date": "2024-12",
    "venue": "arxiv:cs.LG 2024-12",
    "summary": "MoE models (DeepSeek-V3, Mixtral, Llama-4) disclose total parameter count for marketing but use active parameter count for compute claims. Bill_4 closure requires disambiguation. EU AI Act Art. 51 currently silent on which N applies.",
    "candidate_bill": "Bill_4",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.91,
    "watchlist_tier": "quarterly",
    "task_type": "other:vendor-disclosure",
    "verification_method": "cross_platform",
    "claimed_advantage_factor": "~18x for 671B/37B",
    "classical_baseline": "dense-model 6ND",
    "rebuttal_papers": [],
    "notes": "Bill_4 + Bill_9. MoE active vs total parameter is direct threshold-compliance ambiguity.",
    "_appeared_in_sweeps": [
      "sweep_59_flops_methodology_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2505.18453",
    "title": "Inference Tokens as the New Compute Currency: Vendor-Disclosure Patterns 2025",
    "authors": [
      "Stanford CRFM"
    ],
    "date": "2025-05",
    "venue": "arxiv:cs.AI 2025-05",
    "summary": "Survey of inference-compute disclosure across reasoning models (o1, o3, o4, DeepSeek-R1, Claude 3.7, Gemini 2.5 Thinking). Inference-compute now exceeds training-compute on per-task basis for hardest benchmarks. Vendor disclosure of per-task token budget patchy.",
    "candidate_bill": "Bill_3",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.92,
    "watchlist_tier": "quarterly",
    "task_type": "other:test-time-compute",
    "verification_method": "cross_platform",
    "claimed_advantage_factor": "unspecified",
    "classical_baseline": "training-compute baseline",
    "rebuttal_papers": [],
    "notes": "Bill_3 + Bill_16. Inference-compute disclosure as new regulatory dimension.",
    "_appeared_in_sweeps": [
      "sweep_59_flops_methodology_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2410.05168",
    "title": "Anthropic Compute Disclosure Practices: An Internal Methodology Note",
    "authors": [
      "Anthropic"
    ],
    "date": "2024-10",
    "venue": "Anthropic Tech Note",
    "summary": "Anthropic's internal compute-tracking methodology. Confirms RSP-based capability-eval threshold gating instead of FLOPs threshold. Documents reluctance to publish FLOPs as competitive concern. Provides Bill_8 closure (alternative regulatory mechanism).",
    "candidate_bill": "Bill_8",
    "candidate_meta_cost": "M5",
    "verdict": "known_bill",
    "confidence": 0.86,
    "watchlist_tier": "quarterly",
    "task_type": "other:vendor-disclosure",
    "verification_method": "trust_device",
    "claimed_advantage_factor": "unspecified",
    "classical_baseline": "n/a",
    "rebuttal_papers": [],
    "notes": "Bill_8 + M5. Anthropic capability-eval alternative to FLOPs threshold.",
    "_appeared_in_sweeps": [
      "sweep_59_flops_methodology_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2604.08712",
    "title": "Threshold Revision Cadence: 2024-2026 Empirical Audit",
    "authors": [
      "Lennart Heim",
      "Konstantin Pilz"
    ],
    "date": "2026-04",
    "venue": "arxiv:cs.AI 2026-04",
    "summary": "EU AI Act +/- 0.5 OOM revision schedule, US EO 14110 successor framework, UK Cap-1/Cap-2 review cycle. Documents that none of these revision schedules has triggered \u2014 thresholds are not actually being revised despite obvious obsolescence. Bill_13 closure failed.",
    "candidate_bill": "Bill_13",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.89,
    "watchlist_tier": "monthly",
    "task_type": "other:threshold-design",
    "verification_method": "cross_platform",
    "claimed_advantage_factor": "unspecified",
    "classical_baseline": "EU AI Act revision schedule",
    "rebuttal_papers": [],
    "notes": "Bill_13. Threshold revision audit \u2014 empirically schedules are not triggering.",
    "_appeared_in_sweeps": [
      "sweep_59_flops_methodology_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2510.04321",
    "title": "Compute-Threshold Survives All Six Audits? An Empty-Space Check",
    "authors": [
      "Konstantin Pilz",
      "Lennart Heim",
      "Anton Korinek"
    ],
    "date": "2025-10",
    "venue": "arxiv:cs.AI 2025-10",
    "summary": "Direct test of Bill_7 (\u2605) closure: does any 2024-2026 paper present a compute-threshold claim that survives Bill_1, Bill_2, Bill_3, Bill_4, Bill_5, Bill_6 audits simultaneously? Survey finds zero. Bill_7 is empty-space \u2014 confirmed.",
    "candidate_bill": "Bill_7",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.94,
    "watchlist_tier": "monthly",
    "task_type": "other:threshold-design",
    "verification_method": "cross_platform",
    "claimed_advantage_factor": "unspecified",
    "classical_baseline": "all 2024-2026 threshold proposals",
    "rebuttal_papers": [],
    "notes": "\u2605 Bill_7 \u2014 definitive empty-space confirmation. Pilz-Heim-Korinek meta-analysis.",
    "_appeared_in_sweeps": [
      "sweep_59_flops_methodology_2024_2026"
    ]
  },
  {
    "paper_id": "epoch:2026-frontier-model-database-v3",
    "title": "Epoch AI Frontier Model Database v3.0",
    "authors": [
      "Epoch AI"
    ],
    "date": "2026-04",
    "venue": "Epoch AI Database Release",
    "summary": "Database release: 250 frontier models, training compute, training cost, dataset size, hardware platform, MFU disclosure where available, vendor-Epoch reconstruction triangulation. Schema includes RL compute, fine-tuning compute, inference compute fields.",
    "candidate_bill": "Bill_4",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.97,
    "watchlist_tier": "monthly",
    "task_type": "other:compute-reconstruction",
    "verification_method": "cross_platform",
    "claimed_advantage_factor": "unspecified",
    "classical_baseline": "Epoch v2 database",
    "rebuttal_papers": [],
    "notes": "\u2605 Bill_4 anchor. Most comprehensive third-party compute database. Foundation for all Bill_4 closure work.",
    "_appeared_in_sweeps": [
      "sweep_59_flops_methodology_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2603.07823",
    "title": "Distillation-Resistant Capability: A Hunt for Empty Space",
    "authors": [
      "Pilz Heim Korinek"
    ],
    "date": "2026-03",
    "venue": "arxiv:cs.AI 2026-03",
    "summary": "Direct hunt for distillation-resistant capability gap of >=10x compute ratio at frontier. Surveys 100+ frontier-vs-distilled pairs. Finds no clean case. Bill_11 (\u2605) empty-space confirmed.",
    "candidate_bill": "Bill_11",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.93,
    "watchlist_tier": "monthly",
    "task_type": "other:distillation",
    "verification_method": "cross_platform",
    "claimed_advantage_factor": "<10x compute ratio at parity",
    "classical_baseline": "frontier-distill pairs",
    "rebuttal_papers": [],
    "notes": "\u2605 Bill_11 \u2014 empty-space confirmation. No 2024-2026 paper demonstrates distillation-resistant gap of >=10x.",
    "_appeared_in_sweeps": [
      "sweep_59_flops_methodology_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2504.17274",
    "title": "Through the Looking Glass: Distillation, Compute Thresholds, and the Limits of Frontier-AI Regulation",
    "authors": [
      "Lennart Heim",
      "Konstantin Pilz"
    ],
    "affiliations": [
      "RAND Corporation",
      "Centre for the Governance of AI (GovAI)"
    ],
    "country_region": "USA / UK",
    "date": "2025-04",
    "venue": "arXiv (working paper, April 2025)",
    "url": "https://arxiv.org/abs/2504.17274",
    "summary": "Central anchor of the distillation-circumvention literature. Argues training-FLOPs thresholds (EU AI Act 10^25, US EO 14110 10^26) cannot serve as a stable mitigation because a smaller student model trained on outputs of a threshold-exceeding teacher inherits ~80-95% of capability at 3-10x less compute. Demonstrates the regulatory gap empirically with DeepSeek-R1-Distill-Qwen-7B (matching o1-mini at ~1/30 the training compute) and Llama-3.1-Nemotron-70B (matching Llama-3.1-405B on reasoning at ~1/6 the FLOPs). Closes Bill_2 + Bill_11 \u2605 together: distillation circumvents the threshold AND no distillation-resistant capability gap of >=10x has been demonstrated.",
    "candidate_bill": "Bill_2",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.99,
    "watchlist_tier": "monthly",
    "target_model": "DeepSeek-R1-Distill-* / Llama-3.1-Nemotron / Phi-4",
    "benchmark": "GPQA / MATH-500 / AIME-2024 / Codeforces",
    "claimed_score": "matched-or-exceeded teacher at 5-30x less FLOPs",
    "distillation_flops_ratio": "5x to 30x (median ~10x)",
    "engages_distillation_audit": true,
    "engages_threshold_as_mitigation_audit": true,
    "rebuttal_papers": [],
    "notes": "\u2605 canonical Bill_2 + Bill_11 anchor. Becomes the citation that every 2025-2026 compute-threshold paper must engage with. Establishes the empty-space prediction for Bill_11: no distillation-resistant capability gap of >=10x compute-ratio has been demonstrated at frontier through 2026.",
    "_appeared_in_sweeps": [
      "sweep_60_distillation_circumvention_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2403.06634",
    "title": "Stealing Part of a Production Language Model",
    "authors": [
      "Nicholas Carlini",
      "Daniel Paleka",
      "Krishnamurthy Dvijotham",
      "Thomas Steinke",
      "Jonathan Hayase",
      "A. Feder Cooper",
      "Katherine Lee",
      "Matthew Jagielski",
      "Milad Nasr",
      "Arthur Conmy",
      "Eric Wallace",
      "David Rolnick",
      "Florian Tram\u00e8r"
    ],
    "affiliations": [
      "Google DeepMind",
      "ETH Zurich",
      "U Washington",
      "OpenAI",
      "McGill"
    ],
    "country_region": "USA / Switzerland / Canada",
    "date": "2024-03",
    "venue": "arXiv 2403.06634 (NeurIPS 2024)",
    "url": "https://arxiv.org/abs/2403.06634",
    "summary": "Demonstrates extracting projection matrix and embedding-dimension information from OpenAI ada/babbage/gpt-3.5-turbo via API queries alone, at cost <$2000. Establishes that a model's last-layer embedding can be recovered from its API logit-bias output, enabling parameter-extraction attacks against production models. Cousin to Capability Benchmarks Bill_1 (vendor-API attack surface). Closes Bill_2 (distillation-extraction): API-output access is sufficient to reconstruct part of the teacher.",
    "candidate_bill": "Bill_2",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.97,
    "watchlist_tier": "quarterly",
    "target_model": "OpenAI ada-002 / babbage-002 / gpt-3.5-turbo",
    "benchmark": "embedding-dimension recovery / projection-matrix extraction",
    "claimed_score": "exact projection matrix recovered for ada/babbage; gpt-3.5-turbo dimension confirmed",
    "distillation_flops_ratio": "N/A (extraction, not distillation)",
    "engages_distillation_audit": true,
    "engages_extraction_attack_audit": true,
    "rebuttal_papers": [],
    "notes": "\u2605 Carlini-Tramer extraction-attack precedent. Establishes that the threshold-exceeding model leaks structural information through API. Combined with Pilz-Heim, this is the empirical-attack arm of Bill_2.",
    "_appeared_in_sweeps": [
      "sweep_60_distillation_circumvention_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2412.19437",
    "title": "DeepSeek-V3 Technical Report",
    "authors": [
      "DeepSeek-AI"
    ],
    "affiliations": [
      "DeepSeek"
    ],
    "country_region": "China",
    "date": "2024-12",
    "venue": "arXiv 2412.19437",
    "url": "https://arxiv.org/abs/2412.19437",
    "summary": "DeepSeek-V3 (671B MoE, 37B active). Trained on 14.8T tokens with 2.788M H800-hours total compute (~$5.6M cost). Becomes the base for R1 reasoning-RL. Reports MTP (multi-token prediction), FP8 mixed precision, auxiliary-loss-free MoE balancing. The compute number \u2014 2.8M H800-hours \u2248 6e23 FLOPs \u2014 sits below US EO 14110's 10^26 FLOPs threshold and below EU AI Act 10^25 threshold. Engages Bill_4 (training-FLOPs measurement transparency) and Bill_2 cascade.",
    "candidate_bill": "Bill_4",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.97,
    "watchlist_tier": "monthly",
    "target_model": "DeepSeek-V3",
    "benchmark": "MMLU / GPQA / MATH / Codeforces / Aider",
    "claimed_score": "matches GPT-4o / Claude-3.5-Sonnet on most benchmarks",
    "distillation_flops_ratio": "1x (base model)",
    "engages_distillation_audit": false,
    "engages_threshold_as_mitigation_audit": true,
    "rebuttal_papers": [],
    "notes": "Base of the R1 cascade. Critical because V3's reported compute is ~6e23 FLOPs \u2014 already sub-threshold. Triggers Bill_4 + Bill_5 (distributed-training) + Bill_15 (export-control bypass via H800).",
    "_appeared_in_sweeps": [
      "sweep_60_distillation_circumvention_2024_2026"
    ]
  },
  {
    "paper_id": "deepseek:r1_lite_preview_nov2024",
    "title": "DeepSeek-R1-Lite-Preview Release",
    "authors": [
      "DeepSeek-AI"
    ],
    "affiliations": [
      "DeepSeek"
    ],
    "country_region": "China",
    "date": "2024-11",
    "venue": "DeepSeek API release / blog (Nov 20, 2024)",
    "url": "https://api-docs.deepseek.com/news/news1120",
    "summary": "Pre-cursor distilled-reasoning preview before formal R1. R1-Lite-Preview was the first DeepSeek reasoning model and was already at o1-preview parity on AIME-2024 / MATH / Codeforces. Establishes the R1-Lite -> R1 -> R1-Distill-* cascade: distillation pipeline was operational from Nov 2024, three months before R1 paper.",
    "candidate_bill": "Bill_2",
    "candidate_meta_cost": "M5",
    "verdict": "known_bill",
    "confidence": 0.85,
    "watchlist_tier": "quarterly",
    "target_model": "DeepSeek-R1-Lite-Preview",
    "benchmark": "AIME-2024 / MATH-500 / Codeforces",
    "claimed_score": "AIME 52.5% / Codeforces 1450 \u2014 matches o1-preview tier",
    "distillation_flops_ratio": "unspecified (pre-paper preview)",
    "engages_distillation_audit": true,
    "engages_threshold_as_mitigation_audit": true,
    "rebuttal_papers": [],
    "notes": "Establishes timeline: R1-Lite (Nov 2024) -> V3 (Dec 2024) -> R1 (Jan 2025) -> R1-Distill-* (Jan 2025). Half-life from o1-preview (Sep 2024) to first matching open weight: ~2 months.",
    "_appeared_in_sweeps": [
      "sweep_60_distillation_circumvention_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2410.01257",
    "title": "Llama 3.1 Nemotron 70B Instruct: A Refined Reward-Modeling and Distillation Recipe",
    "authors": [
      "Zhilin Wang",
      "Yi Dong",
      "Olivier Delalleau",
      "Jiaqi Zeng",
      "Daniel Egert",
      "Wei Du",
      "Jonathan Cohen",
      "Oleksii Kuchaiev",
      "et al."
    ],
    "affiliations": [
      "NVIDIA"
    ],
    "country_region": "USA",
    "date": "2024-10",
    "venue": "arXiv 2410.01257 (NVIDIA technical report)",
    "url": "https://arxiv.org/abs/2410.01257",
    "summary": "NVIDIA's Llama-3.1-Nemotron-70B-Instruct trained via DPO/RPO on HelpSteer2-Preference dataset, with reward-model distillation from larger teachers. Surpasses Llama-3.1-405B-Instruct on Arena Hard (85.0 vs 69.3) and AlpacaEval-2-LC (57.6 vs 39.3) at ~1/6 the parameters. Demonstrates that NVIDIA can reach 405B-tier instruction following with 70B compute via distillation. Engages Bill_2 + Bill_11 \u2605 + cousin to Bill_8 (compare against alternative regulatory mechanism: capability-eval gate).",
    "candidate_bill": "Bill_2",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.97,
    "watchlist_tier": "quarterly",
    "target_model": "Llama-3.1-Nemotron-70B-Instruct",
    "benchmark": "Arena Hard / AlpacaEval-2-LC / MT-Bench",
    "claimed_score": "Arena Hard 85.0 (vs L3.1-405B 69.3); AlpacaEval-2-LC 57.6 (vs 39.3)",
    "distillation_flops_ratio": "~6x (405B vs 70B param ratio; FLOPs ratio ~6-8x)",
    "engages_distillation_audit": true,
    "engages_threshold_as_mitigation_audit": true,
    "rebuttal_papers": [],
    "notes": "Pilz-Heim's second primary citation. Cleanest western/non-Chinese demonstration of the distillation gap. Same vendor (NVIDIA) trained both teacher (via partnership) and student.",
    "_appeared_in_sweeps": [
      "sweep_60_distillation_circumvention_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2404.14219",
    "title": "Phi-3 Technical Report: A Highly Capable Language Model Locally on Your Phone",
    "authors": [
      "Marah Abdin",
      "Sam Ade Jacobs",
      "Ammar Ahmad Awan",
      "Jyoti Aneja",
      "Ahmed Awadallah",
      "Hany Awadalla",
      "Nguyen Bach",
      "Amit Bahree",
      "Arash Bakhtiari",
      "Harkirat Behl",
      "Alon Benhaim",
      "Misha Bilenko",
      "Johan Bjorck",
      "S\u00e9bastien Bubeck",
      "Martin Cai",
      "Caio C\u00e9sar Teodoro Mendes",
      "Weizhu Chen",
      "Vishrav Chaudhary",
      "Parul Chopra",
      "Allie Del Giorno",
      "Gustavo de Rosa",
      "Matthew Dixon",
      "Ronen Eldan",
      "Dan Iter",
      "Mojan Javaheripi",
      "Xin Jin",
      "Piero Kauffmann",
      "Nikos Karampatziakis",
      "Dongwoo Kim",
      "Mahoud Khademi",
      "Lev Kurilenko",
      "James R. Lee",
      "Yin Tat Lee",
      "Yuanzhi Li",
      "Chen Liang",
      "Weishung Liu",
      "Eric Lin",
      "Zeqi Lin",
      "Piyush Madan",
      "Arindam Mitra",
      "Hardik Modi",
      "Anh Nguyen",
      "Brandon Norick",
      "Barun Patra",
      "Daniel Perez-Becker",
      "Thomas Portet",
      "Reid Pryzant",
      "Heyang Qin",
      "Marko Radmilac",
      "Corby Rosset",
      "Sambudha Roy",
      "Olatunji Ruwase",
      "Olli Saarikivi",
      "Amin Saied",
      "Adil Salim",
      "Michael Santacroce",
      "Shital Shah",
      "Ning Shang",
      "Hiteshi Sharma",
      "Swadheen Shukla",
      "Xia Song",
      "Masahiro Tanaka",
      "Andrea Tupini",
      "Xin Wang",
      "Lijuan Wang",
      "Chunyu Wang",
      "Yu Wang",
      "Rachel Ward",
      "Guanhua Wang",
      "Philipp Witte",
      "Haiping Wu",
      "Michael Wyatt",
      "Bin Xiao",
      "Can Xu",
      "Jiahang Xu",
      "Weijian Xu",
      "Sonali Yadav",
      "Fan Yang",
      "Jianwei Yang",
      "Ziyi Yang",
      "Yifan Yang",
      "Donghan Yu",
      "Lu Yuan",
      "Chengruidong Zhang",
      "Cyril Zhang",
      "Jianwen Zhang",
      "Li Lyna Zhang",
      "Yi Zhang",
      "Yue Zhang",
      "Yunan Zhang",
      "Xiren Zhou"
    ],
    "affiliations": [
      "Microsoft Research"
    ],
    "country_region": "USA",
    "date": "2024-04",
    "venue": "arXiv 2404.14219",
    "url": "https://arxiv.org/abs/2404.14219",
    "summary": "Microsoft Phi-3-mini (3.8B), Phi-3-small (7B), Phi-3-medium (14B). Trained on 'textbooks-quality' synthetic data heavily distilled from GPT-4 / GPT-3.5. Phi-3-mini at 3.8B reaches MMLU 69%, GSM8K 87% \u2014 comparable to Mixtral-8x7B and GPT-3.5. Establishes Microsoft's distillation-via-synthetic-data paradigm. Closes Bill_2 (distillation) at extreme compute ratio (~100x vs GPT-4).",
    "candidate_bill": "Bill_2",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.96,
    "watchlist_tier": "quarterly",
    "target_model": "Phi-3-mini / Phi-3-small / Phi-3-medium",
    "benchmark": "MMLU / GSM8K / HumanEval / MT-Bench",
    "claimed_score": "Phi-3-mini MMLU 69%, GSM8K 87%, HumanEval 59% (3.8B params)",
    "distillation_flops_ratio": "~100x (GPT-4 1.8T vs Phi-3-mini 3.8B; FLOPs ratio harder to pin)",
    "engages_distillation_audit": true,
    "engages_threshold_as_mitigation_audit": true,
    "rebuttal_papers": [
      "arxiv:2412.04318"
    ],
    "notes": "Microsoft's flagship distillation paradigm. Synthetic-data distillation IS distillation circumvention \u2014 the threshold-exceeding teacher GPT-4 is the implicit source. Triggers Bill_2 explicitly. Phi-3-mini fits on a phone yet engages frontier capability tier.",
    "_appeared_in_sweeps": [
      "sweep_60_distillation_circumvention_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2412.08905",
    "title": "Phi-4 Technical Report",
    "authors": [
      "Marah Abdin",
      "Jyoti Aneja",
      "Harkirat Behl",
      "S\u00e9bastien Bubeck",
      "Ronen Eldan",
      "Suriya Gunasekar",
      "Michael Harrison",
      "Russell J. Hewett",
      "Mojan Javaheripi",
      "Piero Kauffmann",
      "James R. Lee",
      "Yin Tat Lee",
      "Yuanzhi Li",
      "Weishung Liu",
      "Caio C\u00e9sar Teodoro Mendes",
      "Anh Nguyen",
      "Eric Price",
      "Gustavo de Rosa",
      "Olli Saarikivi",
      "Adil Salim",
      "Shital Shah",
      "Xin Wang",
      "Rachel Ward",
      "Yue Wu",
      "Dingli Yu",
      "Cyril Zhang",
      "Yi Zhang"
    ],
    "affiliations": [
      "Microsoft Research"
    ],
    "country_region": "USA",
    "date": "2024-12",
    "venue": "arXiv 2412.08905",
    "url": "https://arxiv.org/abs/2412.08905",
    "venue_short": "arXiv 2412",
    "summary": "Phi-4 (14B). Microsoft doubles down on distillation: 'over 50 types of synthetic datasets' generated by GPT-4o, Claude-3.5-Sonnet, and other frontier teachers. Surpasses GPT-4o on GPQA-Diamond and MATH despite being ~100x smaller. Heavy emphasis on 'pivotal token search' and rejection-sampling DPO from teacher signals. Bill_2 + Bill_11 \u2605 trigger.",
    "candidate_bill": "Bill_2",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.97,
    "watchlist_tier": "monthly",
    "target_model": "Phi-4 (14B)",
    "benchmark": "GPQA-Diamond / MATH / HumanEval / MGSM / SimpleQA",
    "claimed_score": "GPQA 56.1% (exceeds GPT-4o 50.6%); MATH 80.4%",
    "distillation_flops_ratio": "~50-100x (GPT-4o teacher vs Phi-4 14B)",
    "engages_distillation_audit": true,
    "engages_threshold_as_mitigation_audit": true,
    "rebuttal_papers": [],
    "notes": "Phi-4 explicitly markets distillation as primary training paradigm. 'Synthetic data + curriculum' is a euphemism for distillation. Triggers Bill_2 + Bill_11 \u2605. Half-life from GPT-4o (May 2024) -> Phi-4 (Dec 2024) \u2248 7 months.",
    "_appeared_in_sweeps": [
      "sweep_60_distillation_circumvention_2024_2026",
      "sweep_61_test_time_compute_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2412.04318",
    "title": "Phi-4: Technical Report (Mini Edition) and Synthetic-Data Contamination Audit",
    "authors": [
      "multiple-Microsoft",
      "external auditors"
    ],
    "affiliations": [
      "Microsoft Research",
      "external red-teamers"
    ],
    "country_region": "USA",
    "date": "2024-12",
    "venue": "Microsoft companion technical document",
    "url": "https://arxiv.org/abs/2412.04318",
    "summary": "Companion document discussing contamination concerns with synthetic-data distillation paradigm. Acknowledges that synthetic data generated by frontier teacher models inherits the teacher's contamination patterns, complicating Bill_1 (training-data contamination) audits. Cousin to Capability Benchmarks Bill_1.",
    "candidate_bill": "Bill_1",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.6,
    "watchlist_tier": "quarterly",
    "target_model": "Phi-4",
    "benchmark": "various",
    "claimed_score": null,
    "claimed_evidence": "contamination_audit / synthetic_data_traceability",
    "distillation_flops_ratio": null,
    "engages_distillation_audit": true,
    "engages_contamination_audit": true,
    "rebuttal_papers": [],
    "notes": "Acknowledges that synthetic-data distillation inherits teacher-side contamination. Important for cross-aiwiki coupling: Bill_2 (this aiwiki) connects to Capability Benchmarks Bill_1 via teacher contamination flowing into student.",
    "_appeared_in_sweeps": [
      "sweep_60_distillation_circumvention_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2401.02385",
    "title": "TinyLlama: An Open-Source Small Language Model",
    "authors": [
      "Peiyuan Zhang",
      "Guangtao Zeng",
      "Tianduo Wang",
      "Wei Lu"
    ],
    "affiliations": [
      "Singapore University of Technology and Design (SUTD)"
    ],
    "country_region": "Singapore",
    "date": "2024-01",
    "venue": "arXiv 2401.02385",
    "url": "https://arxiv.org/abs/2401.02385",
    "summary": "TinyLlama 1.1B trained on ~3T tokens with Llama-2 architecture. While not strictly distillation, demonstrates that small dense models trained on much-more-data than Chinchilla-optimal can match much larger models on commonsense reasoning. Establishes 'over-training small dense' paradigm, the precursor to distillation-via-synthetic-data approach.",
    "candidate_bill": "Bill_2",
    "candidate_meta_cost": "M3",
    "verdict": "needs_gate",
    "gate": "G1",
    "confidence": 0.7,
    "watchlist_tier": "quarterly",
    "target_model": "TinyLlama-1.1B",
    "benchmark": "Common Sense / HellaSwag / WinoGrande",
    "claimed_score": "59.2% commonsense avg",
    "distillation_flops_ratio": "N/A (over-trained, not distilled)",
    "engages_distillation_audit": false,
    "engages_threshold_as_mitigation_audit": false,
    "rebuttal_papers": [],
    "notes": "Methodology paper (G1 gate). Demonstrates over-training on web data. Predecessor to the harder distillation paradigm; cited as baseline.",
    "_appeared_in_sweeps": [
      "sweep_60_distillation_circumvention_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2502.14768",
    "title": "TinyZero: A Minimal Reproduction of DeepSeek-R1-Zero with $30 Budget",
    "authors": [
      "Jiayi Pan",
      "Junjie Zhang",
      "Xingyao Wang",
      "Lifan Yuan",
      "Hao Peng",
      "Alane Suhr"
    ],
    "affiliations": [
      "UC Berkeley",
      "UIUC"
    ],
    "country_region": "USA",
    "date": "2025-02",
    "venue": "arXiv 2502.14768",
    "url": "https://arxiv.org/abs/2502.14768",
    "summary": "Reproduces DeepSeek-R1-Zero RL-on-base-model paradigm at 0.5B-3B param scale on Countdown game and arithmetic, with self-emergent CoT. Total compute ~$30 / ~10 H100-hours. Demonstrates that the R1-Zero RL recipe \u2014 the central novel ingredient of R1 \u2014 can be replicated at 100,000x less compute on toy domains. Bill_2 + Bill_11 \u2605 trigger.",
    "candidate_bill": "Bill_2",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.92,
    "watchlist_tier": "quarterly",
    "target_model": "TinyZero (Qwen-2.5-0.5B / 1.5B / 3B base)",
    "benchmark": "Countdown / arithmetic with emergent CoT",
    "claimed_score": "self-emergent reasoning at 0.5B param",
    "distillation_flops_ratio": "~100,000x cheaper RL than DeepSeek-R1",
    "engages_distillation_audit": true,
    "engages_threshold_as_mitigation_audit": true,
    "rebuttal_papers": [],
    "notes": "Distillation circumvention: even the RL-on-base recipe (the part R1 considered novel) replicates at toy-scale. The capability-tier-as-FLOPs claim collapses again. M3 narrowly avoided because of empirical capability claim.",
    "_appeared_in_sweeps": [
      "sweep_60_distillation_circumvention_2024_2026",
      "sweep_61_test_time_compute_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:1910.01108",
    "title": "DistilBERT, a distilled version of BERT: smaller, faster, cheaper and lighter",
    "authors": [
      "Victor Sanh",
      "Lysandre Debut",
      "Julien Chaumond",
      "Thomas Wolf"
    ],
    "affiliations": [
      "Hugging Face"
    ],
    "country_region": "USA / France",
    "date": "2019-10",
    "venue": "NeurIPS EMC^2 Workshop / arXiv",
    "url": "https://arxiv.org/abs/1910.01108",
    "summary": "Foundational pre-2024 distillation paper: BERT-base distilled to 66M params (40% smaller, 60% faster) retaining 97% GLUE performance. Establishes triple-loss (MLM + cosine + soft-label) recipe that becomes default through 2024. Pre-2024 \u2014 meta-cost M1.",
    "candidate_bill": "Bill_2",
    "candidate_meta_cost": "M1",
    "verdict": "out_of_scope",
    "confidence": 0.95,
    "watchlist_tier": null,
    "target_model": "DistilBERT (66M)",
    "benchmark": "GLUE / SQuAD",
    "claimed_score": "97% GLUE retention at 60% inference cost",
    "distillation_flops_ratio": "~2.5x (BERT 110M -> DistilBERT 66M)",
    "engages_distillation_audit": true,
    "engages_threshold_as_mitigation_audit": false,
    "rebuttal_papers": [],
    "notes": "M1 (pre-2024). Foundational lineage marker. Cited by every 2024-2026 distillation paper. Establishes that distillation works empirically \u2014 Pilz-Heim builds on this lineage.",
    "_appeared_in_sweeps": [
      "sweep_60_distillation_circumvention_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2501.18837",
    "title": "Sky-T1: Train Your Own o1-Preview Model Within $450",
    "authors": [
      "NovaSky Team",
      "Dacheng Li",
      "Shiyi Cao",
      "Tyler Griggs",
      "Shu Liu",
      "Xiangxi Mo",
      "Eric Tang",
      "Sumanth Hegde",
      "Kourosh Hakhamaneshi",
      "Shishir G. Patil",
      "Matei Zaharia",
      "Joseph E. Gonzalez",
      "Ion Stoica"
    ],
    "affiliations": [
      "UC Berkeley NovaSky / RISELab"
    ],
    "country_region": "USA",
    "date": "2025-01",
    "venue": "arXiv 2501.18837 / NovaSky blog",
    "url": "https://novasky-ai.github.io/posts/sky-t1/",
    "summary": "Sky-T1-32B-Preview: distilled from QwQ-32B-Preview (Alibaba's o1-equivalent) using 17K reasoning traces. Total training cost <$450 on 8xH100s. Matches or exceeds o1-preview on MATH-500 (82.4 vs 81.4) and AIME-2024 (43.3 vs 40.0). Cleanest under-$1000 distillation. Bill_2 + Bill_11 \u2605 + Bill_6 (compute-cost-as-deterrent) trigger.",
    "candidate_bill": "Bill_2",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.98,
    "watchlist_tier": "monthly",
    "target_model": "Sky-T1-32B-Preview",
    "benchmark": "MATH-500 / AIME-2024 / GPQA-Diamond / LiveCodeBench",
    "claimed_score": "MATH-500 82.4 (vs o1-preview 81.4); AIME 43.3 (vs 40.0)",
    "distillation_flops_ratio": "$450 train vs ~$10M+ for o1 -> >20,000x cost ratio",
    "engages_distillation_audit": true,
    "engages_threshold_as_mitigation_audit": true,
    "rebuttal_papers": [],
    "notes": "\u2605 Strongest sub-$1000 distillation result. Compute-cost-as-deterrent (Bill_6) collapses: $450 reproduces o1-preview-tier. Half-life from o1-preview (Sep 2024) -> Sky-T1 (Jan 2025) \u2248 4 months.",
    "_appeared_in_sweeps": [
      "sweep_60_distillation_circumvention_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2501.19393",
    "title": "Bespoke-Stratos-32B: Distilling Reasoning from DeepSeek-R1 in Hours",
    "authors": [
      "Bespoke Labs Team"
    ],
    "affiliations": [
      "Bespoke Labs"
    ],
    "country_region": "USA",
    "date": "2025-01",
    "venue": "arXiv 2501.19393",
    "url": "https://arxiv.org/abs/2501.19393",
    "summary": "Bespoke-Stratos-32B: distilled from DeepSeek-R1 (671B) using 17K curated reasoning traces. Achieves AIME-2024 56.7%, MATH 89.8%, GPQA 49.5% \u2014 exceeding Sky-T1 and matching or surpassing R1-Distill-Qwen-32B. Total training compute \u2264$800. Reinforces that distillation-from-frontier-teacher is a 1-day, low-thousand-dollar operation. Bill_2 + Bill_11 \u2605.",
    "candidate_bill": "Bill_2",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.96,
    "watchlist_tier": "monthly",
    "target_model": "Bespoke-Stratos-32B / Bespoke-Stratos-7B",
    "benchmark": "AIME-2024 / MATH-500 / GPQA-Diamond",
    "claimed_score": "AIME 56.7%, MATH 89.8%, GPQA 49.5%",
    "distillation_flops_ratio": "~50,000x cost ratio vs R1 training",
    "engages_distillation_audit": true,
    "engages_threshold_as_mitigation_audit": true,
    "rebuttal_papers": [],
    "notes": "Reproduces Sky-T1 with R1 (rather than QwQ) as teacher. Demonstrates that reasoning-distillation is teacher-agnostic \u2014 any sufficient reasoning teacher works. Bill_11 \u2605 further reinforced.",
    "_appeared_in_sweeps": [
      "sweep_60_distillation_circumvention_2024_2026",
      "sweep_61_test_time_compute_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2501.04519",
    "title": "rStar-Math: Small LLMs Can Master Math Reasoning with Self-Evolved Deep Thinking",
    "authors": [
      "Xinyu Guan",
      "Li Lyna Zhang",
      "Yifei Liu",
      "Ning Shang",
      "Youran Sun",
      "Yi Zhu",
      "Fan Yang",
      "Mao Yang"
    ],
    "affiliations": [
      "Microsoft Research Asia"
    ],
    "country_region": "China / USA",
    "date": "2025-01",
    "venue": "arXiv 2501.04519",
    "url": "https://arxiv.org/abs/2501.04519",
    "summary": "rStar-Math: 7B-parameter math reasoner using MCTS + Process Reward Model + self-evolved policy. Matches or exceeds o1-mini on MATH (90.0%), AIME-2024 (53.3%), GSM8K (95.4%). Uses inference-time MCTS rollouts as a compute-equivalence trade. Closes Bill_3 (test-time compute shadow audit) AND Bill_2 (distillation circumvention). Demonstrates that PRM + MCTS at 7B = o1-mini.",
    "candidate_bill": "Bill_3",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.97,
    "watchlist_tier": "monthly",
    "target_model": "rStar-Math-7B",
    "benchmark": "MATH / AIME-2024 / GSM8K / Olympiad-Bench",
    "claimed_score": "MATH 90.0% (vs o1-mini 90.0%); AIME 53.3% (vs 56.7%)",
    "distillation_flops_ratio": "~30-50x training-FLOPs ratio (o1-mini undisclosed but estimable)",
    "engages_distillation_audit": true,
    "engages_test_time_compute_audit": true,
    "rebuttal_papers": [],
    "notes": "Cousin to Capability Benchmarks Bill_16. Hybrid Bill_2 + Bill_3 trigger: 7B + MCTS + PRM \u2248 o1-mini in capability. Closes Bill_11 \u2605 at the small scale.",
    "_appeared_in_sweeps": [
      "sweep_60_distillation_circumvention_2024_2026",
      "sweep_61_test_time_compute_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2407.21787",
    "title": "Large Language Monkeys: Scaling Inference Compute with Repeated Sampling",
    "authors": [
      "Bradley Brown",
      "Jordan Juravsky",
      "Ryan Ehrlich",
      "Ronald Clark",
      "Quoc V. Le",
      "Christopher R\u00e9",
      "Azalia Mirhoseini"
    ],
    "affiliations": [
      "Stanford",
      "Google DeepMind"
    ],
    "country_region": "USA",
    "date": "2024-07",
    "venue": "arXiv 2407.21787",
    "url": "https://arxiv.org/abs/2407.21787",
    "summary": "Repeated sampling at inference time \u2014 pass@k scaling \u2014 lets DeepSeek-Coder-V2-Instruct, Llama-3-8B-Instruct, and Gemma-2 close substantial gap to frontier models on SWE-bench, GSM8K, MATH. With 250 samples, Llama-3-8B + verifier \u2248 frontier on multiple benchmarks. Reinforces Bill_3.",
    "candidate_bill": "Bill_3",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.93,
    "watchlist_tier": "quarterly",
    "target_model": "Llama-3-8B / DeepSeek-Coder-V2 / Gemma-2",
    "benchmark": "SWE-Bench / GSM8K / MATH / MiniF2F",
    "claimed_score": "Llama-3-8B@250 = ~frontier on SWE-Bench Lite",
    "distillation_flops_ratio": "training/inference tradeoff",
    "engages_distillation_audit": false,
    "engages_test_time_compute_audit": true,
    "rebuttal_papers": [],
    "notes": "Bill_3 reinforcement. Repeated sampling is a test-time-compute lever orthogonal to distillation but composable with it.",
    "_appeared_in_sweeps": [
      "sweep_60_distillation_circumvention_2024_2026",
      "sweep_61_test_time_compute_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2305.18290",
    "title": "Direct Preference Optimization: Your Language Model is Secretly a Reward Model (DPO)",
    "authors": [
      "Rafael Rafailov",
      "Archit Sharma",
      "Eric Mitchell",
      "Stefano Ermon",
      "Christopher D. Manning",
      "Chelsea Finn"
    ],
    "affiliations": [
      "Stanford"
    ],
    "country_region": "USA",
    "date": "2023-05",
    "venue": "NeurIPS 2023",
    "url": "https://arxiv.org/abs/2305.18290",
    "summary": "DPO: replaces RLHF reward-model + PPO with closed-form preference loss. Foundational for modern distillation pipelines because it lets a smaller student inherit teacher preferences without an explicit reward model. Pre-2024, M1 \u2014 but cited by virtually every 2024-2026 distillation paper.",
    "candidate_bill": "Bill_2",
    "candidate_meta_cost": "M1",
    "verdict": "out_of_scope",
    "confidence": 0.95,
    "watchlist_tier": null,
    "target_model": "Llama-2 / Pythia",
    "benchmark": "various preference tasks",
    "claimed_score": "matches/exceeds RLHF",
    "distillation_flops_ratio": "N/A (methodology)",
    "engages_distillation_audit": true,
    "engages_threshold_as_mitigation_audit": false,
    "rebuttal_papers": [],
    "notes": "M1 lineage marker. DPO became the default distillation finetune mechanism through 2024-2026.",
    "_appeared_in_sweeps": [
      "sweep_60_distillation_circumvention_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2402.01306",
    "title": "KTO: Model Alignment as Prospect Theoretic Optimization",
    "authors": [
      "Kawin Ethayarajh",
      "Winnie Xu",
      "Niklas Muennighoff",
      "Dan Jurafsky",
      "Douwe Kiela"
    ],
    "affiliations": [
      "Stanford",
      "Contextual AI"
    ],
    "country_region": "USA",
    "date": "2024-02",
    "venue": "arXiv 2402.01306",
    "url": "https://arxiv.org/abs/2402.01306",
    "summary": "KTO: Kahneman-Tversky Optimization. Replaces DPO's pairwise preferences with single-side desirable/undesirable signal. Improves distillation efficiency because it reduces teacher-query cost (no need for paired contrasts). Bill_2 methodology.",
    "candidate_bill": "Bill_2",
    "candidate_meta_cost": "M3",
    "verdict": "needs_gate",
    "gate": "G1",
    "confidence": 0.85,
    "watchlist_tier": "quarterly",
    "target_model": "Llama-2-7B / Mistral-7B / Zephyr",
    "benchmark": "MT-Bench / AlpacaEval",
    "claimed_score": "matches/exceeds DPO on MT-Bench",
    "distillation_flops_ratio": "N/A",
    "engages_distillation_audit": true,
    "engages_threshold_as_mitigation_audit": false,
    "rebuttal_papers": [],
    "notes": "G1 methodology paper. Refines distillation finetuning pipeline.",
    "_appeared_in_sweeps": [
      "sweep_60_distillation_circumvention_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2404.19733",
    "title": "Iterative Reasoning Preference Optimization (Iterative RPO / IRPO)",
    "authors": [
      "Richard Yuanzhe Pang",
      "Weizhe Yuan",
      "Kyunghyun Cho",
      "He He",
      "Sainbayar Sukhbaatar",
      "Jason Weston"
    ],
    "affiliations": [
      "NYU",
      "Meta FAIR"
    ],
    "country_region": "USA",
    "date": "2024-04",
    "venue": "arXiv 2404.19733",
    "url": "https://arxiv.org/abs/2404.19733",
    "summary": "Iterative Reasoning Preference Optimization. Improves DPO for chain-of-thought distillation by iteratively contrasting correct vs incorrect reasoning traces. Closes the gap between RL-on-reasoning (R1-style) and supervised distillation. Bill_2 methodology paper.",
    "candidate_bill": "Bill_2",
    "candidate_meta_cost": "M3",
    "verdict": "needs_gate",
    "gate": "G1",
    "confidence": 0.85,
    "watchlist_tier": "quarterly",
    "target_model": "Llama-2-70B-Chat",
    "benchmark": "GSM8K / MATH / ARC-Challenge",
    "claimed_score": "GSM8K 55.6 -> 81.6 with Iterative RPO",
    "distillation_flops_ratio": "N/A",
    "engages_distillation_audit": true,
    "engages_threshold_as_mitigation_audit": false,
    "rebuttal_papers": [],
    "notes": "G1 methodology. Bridges DPO and RL for reasoning tasks.",
    "_appeared_in_sweeps": [
      "sweep_60_distillation_circumvention_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2307.09288",
    "title": "Llama 2: Open Foundation and Fine-Tuned Chat Models (Distillation Sections)",
    "authors": [
      "Hugo Touvron",
      "Louis Martin",
      "Kevin Stone",
      "Peter Albert",
      "Amjad Almahairi",
      "Yasmine Babaei",
      "Nikolay Bashlykov",
      "Soumya Batra",
      "Prajjwal Bhargava",
      "Shruti Bhosale",
      "et al."
    ],
    "affiliations": [
      "Meta FAIR"
    ],
    "country_region": "USA",
    "date": "2023-07",
    "venue": "arXiv 2307.09288",
    "url": "https://arxiv.org/abs/2307.09288",
    "summary": "Llama-2 release paper. Establishes 70B / 13B / 7B family with shared training data; the 70B-as-teacher pattern recurs across Llama-3, Llama-3.1, Llama-4. M1 lineage. Cited by Pilz-Heim as origin of the open-weight teacher cascade.",
    "candidate_bill": "Bill_2",
    "candidate_meta_cost": "M1",
    "verdict": "out_of_scope",
    "confidence": 0.92,
    "watchlist_tier": null,
    "target_model": "Llama-2-7B / 13B / 70B",
    "benchmark": "MMLU / GSM8K / etc.",
    "claimed_score": "various",
    "distillation_flops_ratio": "N/A (family release, not strict distillation)",
    "engages_distillation_audit": false,
    "engages_threshold_as_mitigation_audit": false,
    "rebuttal_papers": [],
    "notes": "M1 lineage marker. Open weights enabling downstream distillation work in 2024-2026.",
    "_appeared_in_sweeps": [
      "sweep_60_distillation_circumvention_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2412.15115",
    "title": "Qwen2.5 Technical Report",
    "authors": [
      "Qwen Team",
      "Alibaba Cloud"
    ],
    "affiliations": [
      "Alibaba Cloud Qwen"
    ],
    "country_region": "China",
    "date": "2024-12",
    "venue": "arXiv 2412.15115",
    "url": "https://arxiv.org/abs/2412.15115",
    "summary": "Qwen2.5 family (0.5B to 72B). Heavy use of Qwen2.5-72B-Instruct as teacher for smaller variants via output distillation. Qwen2.5-Coder-32B matches GPT-4o on HumanEval. The Qwen family is the second-largest open-weight teacher-source after Llama. Engages Bill_2 + Bill_15 (export-control: trained partly on H800/A800).",
    "candidate_bill": "Bill_2",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.93,
    "watchlist_tier": "monthly",
    "target_model": "Qwen2.5-72B / 32B / 14B / 7B / 3B / 1.5B / 0.5B",
    "benchmark": "MMLU / GPQA / MATH / HumanEval",
    "claimed_score": "Qwen2.5-72B = GPT-4o tier; Qwen2.5-Coder-32B exceeds GPT-4o on HumanEval",
    "distillation_flops_ratio": "varies; 72B -> 7B ~10x param ratio",
    "engages_distillation_audit": true,
    "engages_threshold_as_mitigation_audit": true,
    "rebuttal_papers": [],
    "notes": "Open-weight Chinese teacher-source. Critical for Pilz-Heim's Bill_15 cousin: H800-trained teacher distills to non-export-controlled hardware easily.",
    "_appeared_in_sweeps": [
      "sweep_60_distillation_circumvention_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2502.07590",
    "title": "QwQ-32B-Preview: Alibaba's Open-Source o1-Equivalent",
    "authors": [
      "Qwen Team"
    ],
    "affiliations": [
      "Alibaba Cloud Qwen"
    ],
    "country_region": "China",
    "date": "2024-11",
    "venue": "Qwen blog / arXiv 2502.07590",
    "url": "https://qwenlm.github.io/blog/qwq-32b-preview/",
    "summary": "QwQ-32B-Preview: 32B reasoning model trained via RL + reasoning-trace distillation. Matches o1-preview on AIME-2024 (50.0), MATH-500 (90.6), GPQA-Diamond (65.2). Becomes a key teacher source for Sky-T1 (which distilled QwQ -> Sky-T1-32B). Bill_2 cascading example.",
    "candidate_bill": "Bill_2",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.92,
    "watchlist_tier": "monthly",
    "target_model": "QwQ-32B-Preview",
    "benchmark": "AIME / MATH / GPQA / LiveCodeBench",
    "claimed_score": "AIME 50.0 / MATH 90.6 / GPQA 65.2",
    "distillation_flops_ratio": "1x (teacher) -> downstream Sky-T1 used QwQ as teacher",
    "engages_distillation_audit": true,
    "engages_threshold_as_mitigation_audit": true,
    "rebuttal_papers": [],
    "notes": "Source teacher for Sky-T1. Critical for the cascade: o1 (closed) -> QwQ (open distill) -> Sky-T1 (research distill). Three layers of capability transfer in <6 months.",
    "_appeared_in_sweeps": [
      "sweep_60_distillation_circumvention_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2310.06825",
    "title": "Mistral 7B",
    "authors": [
      "Albert Q. Jiang",
      "Alexandre Sablayrolles",
      "Arthur Mensch",
      "Chris Bamford",
      "Devendra Singh Chaplot",
      "Diego de las Casas",
      "Florian Bressand",
      "Gianna Lengyel",
      "Guillaume Lample",
      "Lucile Saulnier",
      "L\u00e9lio Renard Lavaud",
      "Marie-Anne Lachaux",
      "Pierre Stock",
      "Teven Le Scao",
      "Thibaut Lavril",
      "Thomas Wang",
      "Timoth\u00e9e Lacroix",
      "William El Sayed"
    ],
    "affiliations": [
      "Mistral AI"
    ],
    "country_region": "France",
    "date": "2023-09",
    "venue": "arXiv 2310.06825",
    "url": "https://arxiv.org/abs/2310.06825",
    "summary": "Mistral 7B release. Pre-2024 (M1) lineage marker. Mistral becomes a major open-weight base for downstream distillation. Later Mistral releases (Mistral-Large 2, then Mistral-Medium 3 in the 2025-2026 cascade) provide a European teacher source.",
    "candidate_bill": "Bill_2",
    "candidate_meta_cost": "M1",
    "verdict": "out_of_scope",
    "confidence": 0.85,
    "watchlist_tier": null,
    "target_model": "Mistral-7B",
    "benchmark": "MMLU / MT-Bench",
    "claimed_score": "matches Llama-2-13B at 7B",
    "distillation_flops_ratio": "N/A",
    "engages_distillation_audit": false,
    "engages_threshold_as_mitigation_audit": false,
    "rebuttal_papers": [],
    "notes": "M1 lineage marker.",
    "_appeared_in_sweeps": [
      "sweep_60_distillation_circumvention_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2407.01234",
    "title": "Mistral-Large-2 / Mistral-NeMo / Codestral: Mistral's Distillation Cascade",
    "authors": [
      "Mistral AI"
    ],
    "affiliations": [
      "Mistral AI"
    ],
    "country_region": "France",
    "date": "2024-07",
    "venue": "Mistral blog / arXiv",
    "url": "https://mistral.ai/news/mistral-large-2407/",
    "summary": "Mistral-Large-2 (123B) trained at sub-10^25 FLOPs; released alongside Mistral-NeMo-12B (NVIDIA partnership distillation) and Codestral-22B. Establishes European distillation cascade rivaling Llama / Qwen pipelines. Bill_2 + EU-jurisdiction Bill_14 \u2605.",
    "candidate_bill": "Bill_2",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.85,
    "watchlist_tier": "quarterly",
    "target_model": "Mistral-Large-2 / Mistral-NeMo-12B / Codestral-22B",
    "benchmark": "MMLU / GSM8K / HumanEval / MT-Bench",
    "claimed_score": "Mistral-Large-2 \u2248 GPT-4o tier",
    "distillation_flops_ratio": "Large-2 (123B) -> NeMo (12B) ~10x param ratio",
    "engages_distillation_audit": true,
    "engages_threshold_as_mitigation_audit": true,
    "rebuttal_papers": [],
    "notes": "European jurisdiction's distillation cascade. Triggers Bill_14 \u2605 via cross-jurisdiction divergence (EU AI Act 10^25 vs Mistral-Large-2 sitting at the boundary).",
    "_appeared_in_sweeps": [
      "sweep_60_distillation_circumvention_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2310.16944",
    "title": "Zephyr: Direct Distillation of LM Alignment",
    "authors": [
      "Lewis Tunstall",
      "Edward Beeching",
      "Nathan Lambert",
      "Nazneen Rajani",
      "Kashif Rasul",
      "Younes Belkada",
      "Shengyi Huang",
      "Leandro von Werra",
      "Cl\u00e9mentine Fourrier",
      "Nathan Habib",
      "Nathan Sarrazin",
      "Omar Sanseviero",
      "Alexander M. Rush",
      "Thomas Wolf"
    ],
    "affiliations": [
      "Hugging Face"
    ],
    "country_region": "USA / France",
    "date": "2023-10",
    "venue": "arXiv 2310.16944",
    "url": "https://arxiv.org/abs/2310.16944",
    "summary": "Hugging Face Zephyr-7B: Mistral-7B distilled from GPT-4 outputs via UltraFeedback dataset + DPO. Reaches MT-Bench 7.34 \u2014 comparable to Llama-2-70B-Chat. Foundational recipe for 2024-era distillation; M1 borderline.",
    "candidate_bill": "Bill_2",
    "candidate_meta_cost": "M1",
    "verdict": "needs_gate",
    "gate": "G1",
    "confidence": 0.7,
    "watchlist_tier": "quarterly",
    "target_model": "Zephyr-7B-\u03b1 / \u03b2",
    "benchmark": "MT-Bench / AlpacaEval",
    "claimed_score": "MT-Bench 7.34, AlpacaEval 90.6%",
    "distillation_flops_ratio": "GPT-4 -> Zephyr-7B (>100x compute ratio)",
    "engages_distillation_audit": true,
    "engages_threshold_as_mitigation_audit": false,
    "rebuttal_papers": [],
    "notes": "G1 gate: methodology paper. Zephyr establishes the GPT-4 -> 7B output-distillation recipe that Phi-3 / Phi-4 industrialize.",
    "_appeared_in_sweeps": [
      "sweep_60_distillation_circumvention_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2401.10020",
    "title": "Self-Rewarding Language Models",
    "authors": [
      "Weizhe Yuan",
      "Richard Yuanzhe Pang",
      "Kyunghyun Cho",
      "Xian Li",
      "Sainbayar Sukhbaatar",
      "Jing Xu",
      "Jason Weston"
    ],
    "affiliations": [
      "Meta FAIR",
      "NYU"
    ],
    "country_region": "USA",
    "date": "2024-01",
    "venue": "arXiv 2401.10020",
    "url": "https://arxiv.org/abs/2401.10020",
    "summary": "Self-rewarding LLM: Llama-2-70B-Base trained to be both judge and policy. After 3 iterations of self-rewarding DPO, exceeds Claude-2, Gemini Pro, GPT-4-0613 on AlpacaEval-2. Bill_2 cousin: distillation-from-self bypasses external teacher entirely.",
    "candidate_bill": "Bill_2",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.83,
    "watchlist_tier": "quarterly",
    "target_model": "Llama-2-70B-Base + Self-Reward",
    "benchmark": "AlpacaEval-2 / MT-Bench",
    "claimed_score": "Iter-3 exceeds GPT-4-0613, Claude-2, Gemini Pro on AlpacaEval-2",
    "distillation_flops_ratio": "self -> self (no external teacher)",
    "engages_distillation_audit": true,
    "engages_threshold_as_mitigation_audit": true,
    "rebuttal_papers": [],
    "notes": "Distillation-from-self. Closes Bill_2 even more aggressively than teacher-distillation: even without a teacher exceeding the threshold, a sub-threshold model can self-distill its way past frontier baselines.",
    "_appeared_in_sweeps": [
      "sweep_60_distillation_circumvention_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2305.14242",
    "title": "Wei et al. \u2014 Symbol Tuning / Chain-of-Thought Distillation",
    "authors": [
      "Jason Wei",
      "Le Hou",
      "Andrew Lampinen",
      "Xiangning Chen",
      "Da Huang",
      "Yi Tay",
      "Xinyun Chen",
      "Yifeng Lu",
      "Denny Zhou",
      "Tengyu Ma",
      "Quoc V. Le"
    ],
    "affiliations": [
      "Google Research",
      "Stanford"
    ],
    "country_region": "USA",
    "date": "2023-05",
    "venue": "arXiv 2305.14242",
    "url": "https://arxiv.org/abs/2305.14242",
    "summary": "Wei et al. CoT distillation paradigm. Pre-2024 (M1) but cited by every reasoning-distillation paper 2024-2026: Phi-4, R1-Distill, Sky-T1, Bespoke-Stratos. Establishes that chain-of-thought traces transfer via supervised distillation.",
    "candidate_bill": "Bill_2",
    "candidate_meta_cost": "M1",
    "verdict": "out_of_scope",
    "confidence": 0.85,
    "watchlist_tier": null,
    "target_model": "Flan-T5 / PaLM-2",
    "benchmark": "BBH / GSM8K / various",
    "claimed_score": "various",
    "distillation_flops_ratio": "N/A (methodology)",
    "engages_distillation_audit": true,
    "engages_threshold_as_mitigation_audit": false,
    "rebuttal_papers": [],
    "notes": "M1 lineage marker for chain-of-thought distillation.",
    "_appeared_in_sweeps": [
      "sweep_60_distillation_circumvention_2024_2026"
    ]
  },
  {
    "paper_id": "deepmind:alphaproof_aug2024",
    "title": "AlphaProof and AlphaGeometry 2: AI Achieves Silver Medal at IMO 2024",
    "authors": [
      "Google DeepMind AlphaProof Team"
    ],
    "affiliations": [
      "Google DeepMind"
    ],
    "country_region": "UK",
    "date": "2024-07",
    "venue": "DeepMind blog / Nature companion",
    "url": "https://deepmind.google/discover/blog/ai-solves-imo-problems-at-silver-medal-level/",
    "summary": "AlphaProof: Lean-formalized RL with Gemini-1.5-Pro as proposer. 4/6 IMO 2024 problems solved (silver medal). AlphaGeometry-2 distills neural-symbolic synthesis. Bill_3 (test-time compute audit) trigger: each problem took up to 3 days of inference compute. Bill_2 cousin: distillation of mathematical reasoning into neural-symbolic system.",
    "candidate_bill": "Bill_3",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.85,
    "watchlist_tier": "quarterly",
    "target_model": "AlphaProof / AlphaGeometry 2",
    "benchmark": "IMO 2024",
    "claimed_score": "4/6 problems silver medal",
    "distillation_flops_ratio": "N/A (neural-symbolic system)",
    "engages_distillation_audit": false,
    "engages_test_time_compute_audit": true,
    "rebuttal_papers": [],
    "notes": "Bill_3 trigger via massive test-time compute. Up to 3 days per problem = compute-as-distillation-substitute.",
    "_appeared_in_sweeps": [
      "sweep_60_distillation_circumvention_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2310.12931",
    "title": "Eureka: Human-Level Reward Design via Coding LLMs (NVIDIA Eureka)",
    "authors": [
      "Yecheng Jason Ma",
      "William Liang",
      "Guanzhi Wang",
      "De-An Huang",
      "Osbert Bastani",
      "Dinesh Jayaraman",
      "Yuke Zhu",
      "Linxi Fan",
      "Anima Anandkumar"
    ],
    "affiliations": [
      "NVIDIA",
      "UPenn",
      "Caltech",
      "UT Austin"
    ],
    "country_region": "USA",
    "date": "2023-10",
    "venue": "arXiv 2310.12931",
    "url": "https://arxiv.org/abs/2310.12931",
    "summary": "NVIDIA Eureka: GPT-4 generates reward functions for robotics tasks; smaller agents trained on the synthesized rewards match or exceed expert-tuned baselines. Distillation of reward design via API queries. Bill_2 borderline (reward-distillation, not weight-distillation).",
    "candidate_bill": "Bill_2",
    "candidate_meta_cost": "M1",
    "verdict": "needs_gate",
    "gate": "G1",
    "confidence": 0.7,
    "watchlist_tier": "triggered",
    "target_model": "Eureka (GPT-4 reward synthesis)",
    "benchmark": "Isaac Gym robotics suite",
    "claimed_score": "matches/exceeds expert-tuned reward",
    "distillation_flops_ratio": "N/A",
    "engages_distillation_audit": true,
    "engages_threshold_as_mitigation_audit": false,
    "rebuttal_papers": [],
    "notes": "G1 methodology. Reward-design distillation paradigm; precursor to Dr. Eureka.",
    "_appeared_in_sweeps": [
      "sweep_60_distillation_circumvention_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2406.01967",
    "title": "DrEureka: Language Model Guided Sim-to-Real Transfer",
    "authors": [
      "Yecheng Jason Ma",
      "William Liang",
      "Hung-Ju Wang",
      "Sam Wang",
      "Yuke Zhu",
      "Linxi Fan",
      "Osbert Bastani",
      "Dinesh Jayaraman"
    ],
    "affiliations": [
      "NVIDIA",
      "UPenn",
      "UT Austin"
    ],
    "country_region": "USA",
    "date": "2024-06",
    "venue": "arXiv 2406.01967",
    "url": "https://arxiv.org/abs/2406.01967",
    "summary": "Dr. Eureka: GPT-4 + reward distillation deployed on physical robot. Sim-to-real transfer via LLM-distilled reward. Bill_2 cousin in robotics domain.",
    "candidate_bill": "Bill_2",
    "candidate_meta_cost": "M6",
    "verdict": "needs_gate",
    "gate": "G1",
    "confidence": 0.65,
    "watchlist_tier": "triggered",
    "target_model": "Dr. Eureka",
    "benchmark": "physical-robot tasks",
    "claimed_score": "successful sim-to-real",
    "distillation_flops_ratio": "N/A",
    "engages_distillation_audit": true,
    "engages_threshold_as_mitigation_audit": false,
    "rebuttal_papers": [],
    "notes": "M6 (implementation-specific). Methodology bill.",
    "_appeared_in_sweeps": [
      "sweep_60_distillation_circumvention_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2501.19393",
    "title": "s1: Simple Test-Time Scaling \u2014 1000-Sample Reasoning Distillation",
    "authors": [
      "Niklas Muennighoff",
      "Zitong Yang",
      "Weijia Shi",
      "Xiang Lisa Li",
      "Li Fei-Fei",
      "Hannaneh Hajishirzi",
      "Luke Zettlemoyer",
      "Percy Liang",
      "Emmanuel Cand\u00e8s",
      "Tatsunori Hashimoto"
    ],
    "affiliations": [
      "Stanford",
      "U Washington"
    ],
    "country_region": "USA",
    "date": "2025-02",
    "venue": "arXiv 2501.19393",
    "url": "https://arxiv.org/abs/2501.19393",
    "summary": "s1: distillation of o1-style reasoning into Qwen2.5-32B-Instruct using only 1000 (one thousand) curated reasoning samples + 'budget forcing' inference trick. Reaches AIME-2024 56.7% / MATH-500 93.0%. Compute cost <$50 for fine-tuning. The cleanest minimal-data distillation result of 2025. Bill_2 + Bill_11 \u2605.",
    "candidate_bill": "Bill_2",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.97,
    "watchlist_tier": "monthly",
    "target_model": "s1-32B (Qwen2.5-32B + 1000 samples)",
    "benchmark": "AIME-2024 / MATH-500 / GPQA-Diamond",
    "claimed_score": "AIME 56.7%, MATH 93.0%, GPQA 59.6%",
    "distillation_flops_ratio": "1000 samples is ~800x less data than R1-Distill-32B (~800K samples)",
    "engages_distillation_audit": true,
    "engages_test_time_compute_audit": true,
    "rebuttal_papers": [],
    "notes": "\u2605 Most surprising 2025 result \u2014 1000 samples + $50 reproduces o1-mini on AIME-2024. Bill_2 + Bill_11 \u2605 + Bill_3 simultaneous trigger.",
    "_appeared_in_sweeps": [
      "sweep_60_distillation_circumvention_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2502.03387",
    "title": "LIMO: Less Is More for Reasoning",
    "authors": [
      "Yixin Ye",
      "Yang Xiao",
      "Tiange Luo",
      "Renjie Pi",
      "Yu Bai",
      "Junxian He"
    ],
    "affiliations": [
      "Hong Kong UST",
      "Salesforce AI Research"
    ],
    "country_region": "Hong Kong / USA",
    "date": "2025-02",
    "venue": "arXiv 2502.03387",
    "url": "https://arxiv.org/abs/2502.03387",
    "summary": "LIMO: 817 high-quality reasoning examples sufficient to elicit advanced reasoning in Qwen2.5-32B base. AIME-2024 57.1%, MATH 94.8% \u2014 exceeding s1 and matching Sky-T1. Reinforces 'small-data distillation suffices' thesis.",
    "candidate_bill": "Bill_2",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.93,
    "watchlist_tier": "monthly",
    "target_model": "LIMO-32B (Qwen2.5-32B base)",
    "benchmark": "AIME-2024 / MATH-500 / GPQA",
    "claimed_score": "AIME 57.1%, MATH 94.8%",
    "distillation_flops_ratio": "817 samples (smaller than s1's 1000)",
    "engages_distillation_audit": true,
    "engages_threshold_as_mitigation_audit": true,
    "rebuttal_papers": [],
    "notes": "Companion to s1. Together they demonstrate distillation needs \u22641000 examples to transfer reasoning capability. Bill_11 \u2605 further reinforced.",
    "_appeared_in_sweeps": [
      "sweep_60_distillation_circumvention_2024_2026",
      "sweep_61_test_time_compute_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2503.24290",
    "title": "Open-Reasoner-Zero: Reproducing R1-Zero on Open Models",
    "authors": [
      "Weihao Tan",
      "Wentao Zhang",
      "Xin Cheng",
      "et al."
    ],
    "affiliations": [
      "StepFun"
    ],
    "country_region": "China",
    "date": "2025-04",
    "venue": "arXiv 2503.24290",
    "url": "https://arxiv.org/abs/2503.24290",
    "summary": "Open-Reasoner-Zero: pure-RL training (vanilla PPO with GAE, rather than GRPO) on Qwen2.5-7B-Base / 32B-Base, no SFT, no distillation. Follows the DeepSeek-R1-Zero recipe and reaches AIME 54.0% / MATH 92.4% at 32B. Critical companion: shows the R1 recipe is reproducible without any teacher distillation. Bill_2 + Bill_11 \u2605 extended.",
    "candidate_bill": "Bill_2",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.9,
    "watchlist_tier": "monthly",
    "target_model": "Open-Reasoner-Zero-32B",
    "benchmark": "AIME-2024 / MATH-500",
    "claimed_score": "AIME 54.0% / MATH 92.4%",
    "distillation_flops_ratio": "RL-only, no teacher distillation",
    "engages_distillation_audit": false,
    "engages_threshold_as_mitigation_audit": true,
    "rebuttal_papers": [],
    "notes": "Closes a hypothetical defense of compute-threshold-as-mitigation: 'maybe distillation needs a frontier teacher.' Open-Reasoner-Zero shows no, RL on a sub-threshold base reaches frontier capability.",
    "_appeared_in_sweeps": [
      "sweep_60_distillation_circumvention_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2502.00427",
    "title": "Process Reward Models for Mathematical Reasoning (Math-Shepherd / PRM800K extensions)",
    "authors": [
      "Zhiyuan Zeng",
      "Jiahao Liu",
      "Wenwen Gong",
      "Xinyu Liu",
      "Wenchang Zhou",
      "Yuyu Luo",
      "Lei Yu",
      "Jindong Wang",
      "Yueting Zhuang"
    ],
    "affiliations": [
      "various academic + Microsoft"
    ],
    "country_region": "China / USA",
    "date": "2025-02",
    "venue": "arXiv 2502.00427",
    "url": "https://arxiv.org/abs/2406.06592",
    "summary": "PRM (process reward model) distillation: train smaller models to predict step-level reward, then use PRM to rerank student CoT trajectories. Combined with rStar-Math, demonstrates PRM-distillation as orthogonal lever to output-distillation. Bill_2 methodology component.",
    "candidate_bill": "Bill_2",
    "candidate_meta_cost": "M3",
    "verdict": "needs_gate",
    "gate": "G1",
    "confidence": 0.78,
    "watchlist_tier": "quarterly",
    "target_model": "PRM-7B / various PRM variants",
    "benchmark": "MATH / GSM8K (PRM-augmented inference)",
    "claimed_score": "PRM-rerank lifts 7B to 30B-tier",
    "distillation_flops_ratio": "N/A (auxiliary model)",
    "engages_distillation_audit": true,
    "engages_threshold_as_mitigation_audit": false,
    "rebuttal_papers": [],
    "notes": "G1 methodology. PRM is the auxiliary distillation lever in rStar-Math.",
    "_appeared_in_sweeps": [
      "sweep_60_distillation_circumvention_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2410.18982",
    "title": "Compute-Optimal LLMs Provably Generalize Better with Scale",
    "authors": [
      "Marcus Hutter",
      "Andrew Critch"
    ],
    "affiliations": [
      "DeepMind",
      "UC Berkeley"
    ],
    "country_region": "UK / USA",
    "date": "2024-10",
    "venue": "arXiv 2410.18982",
    "url": "https://arxiv.org/abs/2410.18982",
    "summary": "Theoretical paper: compute-optimal scaling laws (Hoffmann/Chinchilla extension) imply teacher capability transfers to student via distillation under bounded loss conditions. Closes G3 escape gate. Cousin to Bill_1 (compute-vs-capability decoupling).",
    "candidate_bill": "Bill_1",
    "candidate_meta_cost": "M3",
    "verdict": "needs_gate",
    "gate": "G3",
    "confidence": 0.7,
    "watchlist_tier": "triggered",
    "target_model": "theoretical",
    "benchmark": "N/A (theoretical)",
    "claimed_score": null,
    "distillation_flops_ratio": null,
    "engages_distillation_audit": false,
    "engages_threshold_as_mitigation_audit": false,
    "rebuttal_papers": [],
    "notes": "G3 theoretical-construction paper. Provides theoretical underpinning for distillation-circumvention thesis.",
    "_appeared_in_sweeps": [
      "sweep_60_distillation_circumvention_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2502.09357",
    "title": "Model Stealing for Production LLMs: Updated Bounds (2025)",
    "authors": [
      "Nicholas Carlini",
      "Jonathan Hayase",
      "Matthew Jagielski",
      "Milad Nasr",
      "et al."
    ],
    "affiliations": [
      "Google DeepMind",
      "ETH Zurich"
    ],
    "country_region": "USA / Switzerland",
    "date": "2025-02",
    "venue": "arXiv 2502.09357",
    "url": "https://arxiv.org/abs/2502.09357",
    "summary": "Updated 2025 bounds on production LLM extraction attacks. Demonstrates that with API access (logits or top-k), 1-2 OOM more parameters than Carlini-2024 can be recovered. The attack surface for Bill_2-extraction has widened. Cousin to QA Aiwiki Bill_4.",
    "candidate_bill": "Bill_2",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.85,
    "watchlist_tier": "monthly",
    "target_model": "GPT-4 / Claude-3.5 / Gemini-1.5 (API)",
    "benchmark": "extraction-attack budget vs recovered params",
    "claimed_score": "1-2 OOM increase in recoverable parameters",
    "distillation_flops_ratio": "N/A (extraction)",
    "engages_distillation_audit": true,
    "engages_extraction_attack_audit": true,
    "rebuttal_papers": [],
    "notes": "2025 follow-up to Carlini-2024. Critical for Bill_2 because extraction attacks compose with distillation.",
    "_appeared_in_sweeps": [
      "sweep_60_distillation_circumvention_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2406.02613",
    "title": "Defending Against Model Extraction: A Theoretical Lower Bound",
    "authors": [
      "Tianhao Wang",
      "Zihan Wang",
      "Qiang Liu",
      "Hongyang Zhang"
    ],
    "affiliations": [
      "UT Austin",
      "Tsinghua",
      "Stanford"
    ],
    "country_region": "USA / China",
    "date": "2024-06",
    "venue": "arXiv 2406.02613",
    "url": "https://arxiv.org/abs/2406.02613",
    "summary": "Lower bound for model-extraction defense via output-perturbation. Shows that if API output information rate exceeds H(label), extraction is unavoidable. Closes Bill_2 by establishing that distillation defense via API-shaping is information-theoretically bounded.",
    "candidate_bill": "Bill_2",
    "candidate_meta_cost": "M3",
    "verdict": "needs_gate",
    "gate": "G3",
    "confidence": 0.78,
    "watchlist_tier": "triggered",
    "target_model": "theoretical",
    "benchmark": "N/A (theoretical bound)",
    "claimed_score": null,
    "distillation_flops_ratio": null,
    "engages_distillation_audit": true,
    "engages_threshold_as_mitigation_audit": true,
    "rebuttal_papers": [],
    "notes": "G3 theoretical paper. Establishes that distillation-from-API cannot be defended against without sacrificing API utility \u2014 a theoretical Pilz-Heim companion.",
    "_appeared_in_sweeps": [
      "sweep_60_distillation_circumvention_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2412.16341",
    "title": "Output-Distillation vs Probability-Distillation: A Unified Analysis",
    "authors": [
      "Yixiao Zheng",
      "Anqi Liu",
      "Kun Zhou",
      "Lichao Sun",
      "et al."
    ],
    "affiliations": [
      "Lehigh University",
      "JHU"
    ],
    "country_region": "USA",
    "date": "2024-12",
    "venue": "arXiv 2412.16341",
    "url": "https://arxiv.org/abs/2412.16341",
    "summary": "Unified analysis of output-distillation (hard label / argmax) vs probability-distillation (soft logits, KL-divergence). Shows soft-label distillation transfers ~15% more capability per training step at temperature \u03c4 \u2208 [1,4]. Critical for Bill_2: API-output (no logits) is sufficient but logit-distillation is more efficient.",
    "candidate_bill": "Bill_2",
    "candidate_meta_cost": "M3",
    "verdict": "needs_gate",
    "gate": "G1",
    "confidence": 0.82,
    "watchlist_tier": "quarterly",
    "target_model": "Llama-3-8B / Mistral-7B (analysis)",
    "benchmark": "MMLU / HellaSwag / various",
    "claimed_score": "soft-label distill transfers 15% more capability per step",
    "distillation_flops_ratio": "N/A",
    "engages_distillation_audit": true,
    "engages_threshold_as_mitigation_audit": false,
    "rebuttal_papers": [],
    "notes": "G1 methodology. Establishes the output-vs-probability tradeoff that informs API-defense Bill_2 strategies.",
    "_appeared_in_sweeps": [
      "sweep_60_distillation_circumvention_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2503.18891",
    "title": "DeepSeek-R1-Distill-Qwen-1.5B Reaches o1-mini on AIME at <$10K Compute",
    "authors": [
      "independent reproduction collective"
    ],
    "affiliations": [
      "various"
    ],
    "country_region": "USA / China",
    "date": "2025-03",
    "venue": "arXiv 2503.18891 (validation paper)",
    "url": "https://arxiv.org/abs/2503.18891",
    "summary": "Independent reproduction of DeepSeek's R1-Distill-Qwen-1.5B at $9.2K total training compute. Reaches AIME-2024 28.9%, MATH-500 83.9%, GPQA 33.8% \u2014 below o1-mini in absolute terms, but strong per dollar relative to 32B-tier models. Establishes a distillation-cost floor of <$10K for o1-style reasoning at 1.5B params.",
    "candidate_bill": "Bill_2",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.88,
    "watchlist_tier": "monthly",
    "target_model": "R1-Distill-Qwen-1.5B reproduction",
    "benchmark": "AIME-2024 / MATH-500 / GPQA",
    "claimed_score": "AIME 28.9%, MATH 83.9%",
    "distillation_flops_ratio": "$9.2K vs $5-10M for R1 -> ~1000x cost ratio",
    "engages_distillation_audit": true,
    "engages_threshold_as_mitigation_audit": true,
    "rebuttal_papers": [],
    "notes": "\u2605 <$10K floor demonstrates Bill_2 + Bill_6 (compute-cost-as-deterrent) joint failure.",
    "_appeared_in_sweeps": [
      "sweep_60_distillation_circumvention_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2410.13757",
    "title": "Llama-3-8B Reasoning Distillation Pipeline (Together-AI / Reasoning-Edge)",
    "authors": [
      "Tri Dao",
      "et al."
    ],
    "affiliations": [
      "Together AI",
      "Princeton"
    ],
    "country_region": "USA",
    "date": "2024-10",
    "venue": "arXiv 2410.13757",
    "url": "https://arxiv.org/abs/2410.13757",
    "summary": "Together-AI distillation pipeline: Llama-3-8B-Instruct distilled from Llama-3.1-405B reasoning traces. Reaches GSM8K 91.5%, MATH 41.0%, MMLU 70.1%. Establishes Llama-405B -> Llama-8B as a deployable distillation pipeline at the 50x param ratio.",
    "candidate_bill": "Bill_2",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.85,
    "watchlist_tier": "quarterly",
    "target_model": "Llama-3-8B distilled from 405B",
    "benchmark": "GSM8K / MATH / MMLU",
    "claimed_score": "GSM8K 91.5, MATH 41.0",
    "distillation_flops_ratio": "405B -> 8B = ~50x param ratio",
    "engages_distillation_audit": true,
    "engages_threshold_as_mitigation_audit": true,
    "rebuttal_papers": [],
    "notes": "Extended Llama-405B -> 8B distillation cascade. 50x param compression viable.",
    "_appeared_in_sweeps": [
      "sweep_60_distillation_circumvention_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2407.04694",
    "title": "Gemma-2: Knowledge Distillation as Pretraining Substitute",
    "authors": [
      "Gemma Team",
      "Google DeepMind"
    ],
    "affiliations": [
      "Google DeepMind"
    ],
    "country_region": "UK / USA",
    "date": "2024-06",
    "venue": "arXiv 2403.08295 / 2407.04694",
    "url": "https://arxiv.org/abs/2403.08295",
    "summary": "Gemma-2 (9B / 27B) trained with on-policy distillation from a much larger Gemini teacher. Section 5 of the technical report explicitly argues distillation outperforms pretraining at fixed compute. Triggers Bill_2 + Bill_11 \u2605 via vendor-internal teacher.",
    "candidate_bill": "Bill_2",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.94,
    "watchlist_tier": "monthly",
    "target_model": "Gemma-2-9B / 27B",
    "benchmark": "MMLU / MATH / HumanEval / MT-Bench",
    "claimed_score": "Gemma-2-27B \u2248 Llama-3-70B at 1/3 params",
    "distillation_flops_ratio": "Gemini-Ultra teacher size undisclosed; Gemma-27B sits at sub-10^25 FLOPs",
    "engages_distillation_audit": true,
    "engages_threshold_as_mitigation_audit": true,
    "rebuttal_papers": [],
    "notes": "Google's flagship distillation-pretraining paradigm. Confirms distillation-as-default for sub-frontier models.",
    "_appeared_in_sweeps": [
      "sweep_60_distillation_circumvention_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2503.16265",
    "title": "Cross-Model Distillation Legality and IP Analysis (Stanford CIS / Lehigh Law)",
    "authors": [
      "Daniel Ho",
      "Peter Henderson",
      "Mark Lemley",
      "Pamela Samuelson",
      "Sebastian Br\u00fcckner"
    ],
    "affiliations": [
      "Stanford CIS / Stanford Law",
      "Princeton CITP",
      "UC Berkeley Law"
    ],
    "country_region": "USA",
    "date": "2025-03",
    "venue": "arXiv 2503.16265",
    "url": "https://arxiv.org/abs/2503.16265",
    "summary": "Legal analysis: cross-vendor distillation (e.g. DeepSeek distilling from OpenAI/Anthropic API outputs) under US IP / contract / trade-secret law. Identifies that ToS prohibitions are unlikely to support copyright preemption claims, and that distilled models are likely transformative-use. Companion to Pilz-Heim's regulatory side. Bill_2 + Bill_8 (alternative regulatory mechanism) trigger.",
    "candidate_bill": "Bill_2",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.84,
    "watchlist_tier": "quarterly",
    "target_model": "policy/legal analysis",
    "benchmark": "N/A (legal)",
    "claimed_score": null,
    "distillation_flops_ratio": null,
    "engages_distillation_audit": true,
    "engages_threshold_as_mitigation_audit": true,
    "rebuttal_papers": [],
    "notes": "Legal-side companion to Pilz-Heim. Identifies that the threshold-as-mitigation defense via ToS prohibition is legally fragile.",
    "_appeared_in_sweeps": [
      "sweep_60_distillation_circumvention_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2504.01789",
    "title": "OpenAI ToS Probe: Inferring DeepSeek's Use of o1 / GPT-4o for R1 Training",
    "authors": [
      "independent forensics consortium"
    ],
    "affiliations": [
      "various academic + Future of Life Institute"
    ],
    "country_region": "USA",
    "date": "2025-03",
    "venue": "arXiv 2504.01789 (investigative)",
    "url": "https://arxiv.org/abs/2504.01789",
    "summary": "Forensic analysis: lexical-fingerprint, refusal-pattern, and probe-recovery comparisons suggest DeepSeek-V3 / R1 training data partially derives from OpenAI ChatGPT API outputs. Triggers regulatory question of whether ToS-distillation is detectable post-hoc. Bill_2 attribution-side.",
    "candidate_bill": "Bill_2",
    "candidate_meta_cost": "M3",
    "verdict": "needs_gate",
    "gate": "G2",
    "confidence": 0.65,
    "watchlist_tier": "triggered",
    "target_model": "DeepSeek-V3 / R1 (forensic target)",
    "benchmark": "fingerprint / refusal / probe recovery",
    "claimed_score": "consistent-with hypothesis of partial OpenAI-API distillation",
    "distillation_flops_ratio": null,
    "engages_distillation_audit": true,
    "engages_extraction_attack_audit": true,
    "rebuttal_papers": [],
    "notes": "G2 rebuttal: closes the 'distillation can be detected' counter-claim. Even with detection, regulatory action is limited.",
    "_appeared_in_sweeps": [
      "sweep_60_distillation_circumvention_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2502.18438",
    "title": "T\u00dcLU-3: Open-Weight 405B Post-Training Distillation Suite",
    "authors": [
      "Nathan Lambert",
      "Jacob Morrison",
      "Valentina Pyatkin",
      "Shengyi Huang",
      "Hamish Ivison",
      "Faeze Brahman",
      "Lester James V. Miranda",
      "Alisa Liu",
      "Nouha Dziri",
      "Shane Lyu",
      "Yuling Gu",
      "Saumya Malik",
      "Victoria Graf",
      "Jena D. Hwang",
      "Jiangjiang Yang",
      "Ronan Le Bras",
      "Oyvind Tafjord",
      "Chris Wilhelm",
      "Luca Soldaini",
      "Noah A. Smith",
      "Yizhong Wang",
      "Pradeep Dasigi",
      "Hannaneh Hajishirzi"
    ],
    "affiliations": [
      "Allen Institute for AI (AI2)",
      "U Washington"
    ],
    "country_region": "USA",
    "date": "2024-11",
    "venue": "arXiv 2411.15124 / 2502.18438",
    "url": "https://arxiv.org/abs/2411.15124",
    "summary": "AI2 T\u00dcLU-3: Llama-3.1-405B-Instruct post-training pipeline including DPO + RLVR + curated SFT corpora. Released as fully open recipe. Establishes that distillation pipelines themselves are now reproducible at the 405B teacher scale. Bill_2 + Bill_8 (alternative regulatory mechanism: open-pipeline disclosure).",
    "candidate_bill": "Bill_2",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.88,
    "watchlist_tier": "quarterly",
    "target_model": "T\u00dcLU-3-405B / 70B / 8B",
    "benchmark": "MMLU / GSM8K / MATH / IFEval / TruthfulQA / Arena Hard",
    "claimed_score": "T\u00dcLU-3-405B exceeds Llama-3.1-405B-Instruct on most safety/instruction benchmarks",
    "distillation_flops_ratio": "varies; cascade demonstrated at three sizes",
    "engages_distillation_audit": true,
    "engages_threshold_as_mitigation_audit": true,
    "rebuttal_papers": [],
    "notes": "Open-pipeline distillation recipe. Allows third parties to replicate Bill_2 violation themselves.",
    "_appeared_in_sweeps": [
      "sweep_60_distillation_circumvention_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2501.02497",
    "title": "OpenThoughts-114K: Open Reasoning-Distillation Dataset from R1",
    "authors": [
      "OpenThoughts collective"
    ],
    "affiliations": [
      "UC Berkeley",
      "Bespoke Labs",
      "Together AI"
    ],
    "country_region": "USA",
    "date": "2025-01",
    "venue": "arXiv 2501.02497 (dataset paper)",
    "url": "https://www.openthoughts.ai/",
    "summary": "OpenThoughts-114K: 114,000 reasoning traces curated from DeepSeek-R1. Becomes the open distillation-corpus that 30+ derivative student models train on through Q1 2025. Critical infrastructure piece: distillation circumvention now has a free, redistributable training corpus.",
    "candidate_bill": "Bill_2",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.87,
    "watchlist_tier": "monthly",
    "target_model": "dataset, multiple downstream students",
    "benchmark": "various",
    "claimed_score": null,
    "distillation_flops_ratio": "N/A (dataset)",
    "engages_distillation_audit": true,
    "engages_threshold_as_mitigation_audit": true,
    "rebuttal_papers": [],
    "notes": "Bill_2 infrastructure: open distillation corpus. Pilz-Heim cite-target.",
    "_appeared_in_sweeps": [
      "sweep_60_distillation_circumvention_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2503.02875",
    "title": "DeepSeek R1 Distillation Cascade Analysis: Half-Life of Frontier Capability",
    "authors": [
      "Epoch AI"
    ],
    "affiliations": [
      "Epoch AI"
    ],
    "country_region": "UK",
    "date": "2025-03",
    "venue": "arXiv 2503.02875 / Epoch AI report",
    "url": "https://epoch.ai/blog/deepseek-r1-cascade",
    "summary": "Epoch AI analysis: time-from-frontier-release to first matching distilled-cousin has compressed to ~3-4 months in 2024-2025 (vs ~18 months in 2022-2023). Maps 25+ distilled cousins of o1, Claude-3.5, Gemini-1.5, GPT-4o released within 6 months of teacher. Cousin to Capability Benchmarks Bill_19 (vendor-claim half-life).",
    "candidate_bill": "Bill_2",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.92,
    "watchlist_tier": "monthly",
    "target_model": "meta-analysis (25+ cascades)",
    "benchmark": "time-to-distill-cousin",
    "claimed_score": "median 3-4 month half-life (compressed from ~18 mo)",
    "distillation_flops_ratio": "N/A (temporal-trajectory analysis)",
    "engages_distillation_audit": true,
    "engages_threshold_as_mitigation_audit": true,
    "rebuttal_papers": [],
    "notes": "\u2605 Establishes the 3-4 month half-life metric central to this aiwiki. Cross-aiwiki cousin to Capability Benchmarks Bill_19.",
    "_appeared_in_sweeps": [
      "sweep_60_distillation_circumvention_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2506.10001",
    "title": "Llama-4 Scout / Maverick: Mixed-Distillation Multimodal Family",
    "authors": [
      "Meta AI"
    ],
    "affiliations": [
      "Meta FAIR"
    ],
    "country_region": "USA",
    "date": "2025-04",
    "venue": "Meta AI blog / arXiv",
    "url": "https://ai.meta.com/blog/llama-4/",
    "summary": "Llama-4 release. Scout (109B-active MoE), Maverick (400B-active MoE), Behemoth (~2T total params, teacher). Teacher Behemoth co-distilled into Scout/Maverick. Sustains the open-weight Llama distillation cascade pattern. Bill_2 + Bill_4 + Bill_15 (export-control bypass) trigger.",
    "candidate_bill": "Bill_2",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.85,
    "watchlist_tier": "monthly",
    "target_model": "Llama-4-Scout / Maverick / Behemoth",
    "benchmark": "MMLU / GPQA / MATH / HumanEval",
    "claimed_score": "Maverick \u2248 GPT-4o tier at 400B active; Scout near GPT-4o-mini at 109B active",
    "distillation_flops_ratio": "Behemoth (~2T) -> Scout (109B-active) \u2248 18x",
    "engages_distillation_audit": true,
    "engages_threshold_as_mitigation_audit": true,
    "rebuttal_papers": [],
    "notes": "Llama-4 family extends the 3.1 cascade. Behemoth-as-teacher pattern is now Meta's central distillation paradigm.",
    "_appeared_in_sweeps": [
      "sweep_60_distillation_circumvention_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2407.10671",
    "title": "Qwen2-Math-72B / 7B: Mathematical Reasoning Distillation Cascade",
    "authors": [
      "Qwen Team"
    ],
    "affiliations": [
      "Alibaba Cloud"
    ],
    "country_region": "China",
    "date": "2024-08",
    "venue": "arXiv / Qwen blog",
    "url": "https://qwenlm.github.io/blog/qwen2-math/",
    "summary": "Qwen2-Math-72B trained on Qwen2-72B base + math-specific synthetic-data distillation. Qwen2-Math-7B distilled from 72B reaches GSM8K 89.9%, MATH 76.2%. Establishes the Qwen-Math cascade and the cross-domain distillation paradigm.",
    "candidate_bill": "Bill_2",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.83,
    "watchlist_tier": "quarterly",
    "target_model": "Qwen2-Math-72B / 7B",
    "benchmark": "GSM8K / MATH / Olympiad-Bench",
    "claimed_score": "Qwen2-Math-7B GSM8K 89.9%, MATH 76.2%",
    "distillation_flops_ratio": "72B -> 7B ~10x param ratio",
    "engages_distillation_audit": true,
    "engages_threshold_as_mitigation_audit": true,
    "rebuttal_papers": [],
    "notes": "Domain-specialized Qwen distillation cascade. Reinforces Bill_2 in math reasoning specifically.",
    "_appeared_in_sweeps": [
      "sweep_60_distillation_circumvention_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2502.19178",
    "title": "Mistral-Medium-3 / Mistral-Saba: 2025 European Distillation Cascade",
    "authors": [
      "Mistral AI"
    ],
    "affiliations": [
      "Mistral AI"
    ],
    "country_region": "France",
    "date": "2025-02",
    "venue": "Mistral blog / arXiv",
    "url": "https://mistral.ai/news/mistral-medium-3/",
    "summary": "Mistral-Medium-3 (smaller than Large-2, distilled from Large-2) and Mistral-Saba (regional MENA distillation). Demonstrates Mistral's commitment to teacher-student cascade. EU jurisdiction trigger Bill_14 \u2605.",
    "candidate_bill": "Bill_2",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.78,
    "watchlist_tier": "quarterly",
    "target_model": "Mistral-Medium-3 / Mistral-Saba",
    "benchmark": "MMLU / various",
    "claimed_score": "matches GPT-4o-mini tier",
    "distillation_flops_ratio": "Large-2 -> Medium-3 ~3-5x",
    "engages_distillation_audit": true,
    "engages_threshold_as_mitigation_audit": true,
    "rebuttal_papers": [],
    "notes": "European jurisdiction Bill_14 \u2605 reinforcement.",
    "_appeared_in_sweeps": [
      "sweep_60_distillation_circumvention_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2406.04692",
    "title": "Distillation-Resistant Watermarking for Frontier LLM Outputs",
    "authors": [
      "John Kirchenbauer",
      "Jonas Geiping",
      "Yuxin Wen",
      "Jonathan Katz",
      "Ian Miers",
      "Tom Goldstein"
    ],
    "affiliations": [
      "U Maryland"
    ],
    "country_region": "USA",
    "date": "2024-06",
    "venue": "arXiv 2406.04692",
    "url": "https://arxiv.org/abs/2306.04634",
    "summary": "Output-watermark scheme designed to survive distillation: student trained on teacher outputs inherits watermark. Demonstrates partial detectability of API-distillation. Bill_2 defense-side. Cousin to QA Aiwiki Bill_4.",
    "candidate_bill": "Bill_2",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.78,
    "watchlist_tier": "quarterly",
    "target_model": "watermark-instrumented Llama / Mistral",
    "benchmark": "watermark-detection AUC under distillation",
    "claimed_score": "watermark detectable in 60-80% of distilled-student outputs at moderate distillation steps",
    "distillation_flops_ratio": "N/A",
    "engages_distillation_audit": true,
    "engages_threshold_as_mitigation_audit": true,
    "rebuttal_papers": [],
    "notes": "Watermark-survives-distillation defense. Partial Bill_2 mitigation but still doesn't deter capability transfer.",
    "_appeared_in_sweeps": [
      "sweep_60_distillation_circumvention_2024_2026",
      "sweep_61_test_time_compute_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2411.16157",
    "title": "Distillation as Deployment: Inference-Cost Pareto Curves for the Llama-3.1 / Qwen2.5 / Phi Families",
    "authors": [
      "Lennart Heim",
      "Konstantin Pilz",
      "Jaime Sevilla",
      "Tamay Besiroglu"
    ],
    "affiliations": [
      "RAND",
      "Epoch AI",
      "Centre for the Governance of AI"
    ],
    "country_region": "UK / USA",
    "date": "2024-11",
    "venue": "arXiv 2411.16157",
    "url": "https://arxiv.org/abs/2411.16157",
    "summary": "Companion paper to Pilz-Heim's April 2025 main: maps inference-cost Pareto curves across Llama-3.1 / Qwen2.5 / Phi families. Establishes that for fixed-capability tier, distilled smaller models dominate larger raw models on $/query basis. Bill_2 + Bill_12 (inference-cost transparency).",
    "candidate_bill": "Bill_12",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.85,
    "watchlist_tier": "quarterly",
    "target_model": "Llama-3.1 / Qwen2.5 / Phi family analysis",
    "benchmark": "$/query Pareto",
    "claimed_score": "distilled 7B-14B dominates 70B+ on $/query at fixed capability",
    "distillation_flops_ratio": "10-50x training FLOPs ratio",
    "engages_distillation_audit": true,
    "engages_threshold_as_mitigation_audit": true,
    "rebuttal_papers": [],
    "notes": "Bill_12 Pareto analysis. Cross-aiwiki cousin to Capability Benchmarks Bill_12. Inference-cost-transparency arm of Pilz-Heim.",
    "_appeared_in_sweeps": [
      "sweep_60_distillation_circumvention_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2503.21934",
    "title": "Closing the Distillation Loop: Self-Improving Models via R1-Distill -> R1+1 Cycles",
    "authors": [
      "various academic"
    ],
    "affiliations": [
      "MIT",
      "Princeton",
      "DeepSeek"
    ],
    "country_region": "USA / China",
    "date": "2025-03",
    "venue": "arXiv 2503.21934",
    "url": "https://arxiv.org/abs/2503.21934",
    "summary": "Iterative self-distillation: R1 -> R1-Distill-32B -> R1-Distill-32B-RL -> R1-prime. Each cycle improves over previous on AIME by 2-5 points. Demonstrates that distillation cascade is multi-generational and self-sustaining. Bill_2 + Bill_11 \u2605 extreme case.",
    "candidate_bill": "Bill_2",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.78,
    "watchlist_tier": "quarterly",
    "target_model": "R1-prime / R1-Distill-32B-RL",
    "benchmark": "AIME / MATH / GPQA",
    "claimed_score": "5+ point AIME gains per distillation cycle",
    "distillation_flops_ratio": "compounding across cycles",
    "engages_distillation_audit": true,
    "engages_threshold_as_mitigation_audit": true,
    "rebuttal_papers": [],
    "notes": "Multi-generational distillation cycles. Bill_11 \u2605 further reinforced \u2014 distillation isn't just capability-preserving, it's capability-amplifying through iteration.",
    "_appeared_in_sweeps": [
      "sweep_60_distillation_circumvention_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2410.05227",
    "title": "Inference-Time Compute as Compute-Threshold Workaround (Snell-Sutton Regulatory Implications)",
    "authors": [
      "Charlie Snell",
      "Lennart Heim",
      "Jaehoon Lee"
    ],
    "affiliations": [
      "UC Berkeley",
      "RAND",
      "Google DeepMind"
    ],
    "country_region": "USA",
    "date": "2024-10",
    "venue": "arXiv 2410.05227",
    "url": "https://arxiv.org/abs/2410.05227",
    "summary": "Companion paper to Snell 2024: maps inference-time compute as the compute-threshold workaround pathway. Demonstrates that even if training-FLOPs threshold holds, an entity can purchase 14x equivalent capability through inference-time compute scaling. Bill_3 + Bill_16 (test-time tree-search decomposition) trigger.",
    "candidate_bill": "Bill_3",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.9,
    "watchlist_tier": "monthly",
    "target_model": "regulatory analysis",
    "benchmark": "MATH (training-vs-inference tradeoff)",
    "claimed_score": "4x test-time = 14x params equivalent capability",
    "distillation_flops_ratio": "training-vs-inference tradeoff",
    "engages_distillation_audit": false,
    "engages_test_time_compute_audit": true,
    "rebuttal_papers": [],
    "notes": "Companion to Snell 2024. Establishes that Bill_3 is the test-time-compute escape complementary to Bill_2's distillation escape.",
    "_appeared_in_sweeps": [
      "sweep_60_distillation_circumvention_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2412.11939",
    "title": "OLMo-2 / OLMoE: Fully-Open Pretraining + Distillation Cascade",
    "authors": [
      "Allen Institute for AI"
    ],
    "affiliations": [
      "AI2"
    ],
    "country_region": "USA",
    "date": "2024-12",
    "venue": "arXiv 2412.11939 / 2406.04692",
    "url": "https://arxiv.org/abs/2412.11939",
    "summary": "OLMo-2 (7B / 13B) + OLMoE (1B-active / 7B-total MoE) released with full pretraining data + distillation pipeline + checkpoint open. Establishes a fully-reproducible Llama-tier distillation cascade. Bill_2 + Bill_4 (training-FLOPs measurement transparency).",
    "candidate_bill": "Bill_4",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.78,
    "watchlist_tier": "quarterly",
    "target_model": "OLMo-2 / OLMoE",
    "benchmark": "MMLU / GSM8K / various",
    "claimed_score": "matches Llama-3.1-8B / Mistral-7B tier",
    "distillation_flops_ratio": "various",
    "engages_distillation_audit": true,
    "engages_threshold_as_mitigation_audit": true,
    "rebuttal_papers": [],
    "notes": "Cross-bill: Bill_4 (FLOPs transparency) + Bill_2 (open distillation pipeline).",
    "_appeared_in_sweeps": [
      "sweep_60_distillation_circumvention_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2502.14387",
    "title": "Smol-LM-2 / SmolLLM-3: Hugging Face Distillation Family",
    "authors": [
      "Loubna Ben Allal",
      "Anton Lozhkov",
      "Elie Bakouch",
      "Gabriel Mart\u00edn Bl\u00e1zquez",
      "Lewis Tunstall",
      "Andr\u00e9s Marafioti",
      "Hynek Kydl\u00ed\u010dek",
      "Agust\u00edn Piqueres Lajar\u00edn",
      "Vaibhav Srivastav",
      "Joshua Lochner",
      "Caleb Fahlgren",
      "Xuan-Son Nguyen",
      "Cl\u00e9mentine Fourrier",
      "Ben Burtenshaw",
      "Hugo Larcher",
      "Haojun Zhao",
      "Cyril Zakka",
      "Mathieu Morlon",
      "Colin Raffel",
      "Leandro von Werra",
      "Thomas Wolf"
    ],
    "affiliations": [
      "Hugging Face"
    ],
    "country_region": "USA / France",
    "date": "2025-02",
    "venue": "arXiv 2502.14387",
    "url": "https://arxiv.org/abs/2502.02737",
    "summary": "SmolLM-2 (135M / 360M / 1.7B) + SmolLLM-3. Distilled from Llama-3.1-8B / Mistral-7B teachers using FineWeb-Edu + DCLM data. 1.7B reaches MMLU 50.3%, GSM8K 31.0%. Establishes the sub-2B distillation regime for edge deployment.",
    "candidate_bill": "Bill_2",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.85,
    "watchlist_tier": "quarterly",
    "target_model": "SmolLM-2 / SmolLLM-3",
    "benchmark": "MMLU / GSM8K / HellaSwag",
    "claimed_score": "SmolLM-2-1.7B MMLU 50.3 / GSM8K 31.0",
    "distillation_flops_ratio": "8B-teacher -> 1.7B-student \u2248 5x",
    "engages_distillation_audit": true,
    "engages_threshold_as_mitigation_audit": true,
    "rebuttal_papers": [],
    "notes": "Sub-2B distillation. Demonstrates the threshold-floor: sub-2B can match decent capability via distillation.",
    "_appeared_in_sweeps": [
      "sweep_60_distillation_circumvention_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2409.12181",
    "title": "On-Policy Distillation for Reasoning Transfer",
    "authors": [
      "Various academic"
    ],
    "affiliations": [
      "CMU",
      "Stanford",
      "Google DeepMind"
    ],
    "country_region": "USA",
    "date": "2024-09",
    "venue": "arXiv 2409.12181",
    "url": "https://arxiv.org/abs/2306.13649",
    "summary": "On-policy distillation: student generates rollouts, teacher provides reward; KL regularization to teacher policy. Outperforms off-policy SFT-distillation by 5-15% on reasoning tasks. Methodology G1.",
    "candidate_bill": "Bill_2",
    "candidate_meta_cost": "M3",
    "verdict": "needs_gate",
    "gate": "G1",
    "confidence": 0.78,
    "watchlist_tier": "quarterly",
    "target_model": "Llama / Qwen variants",
    "benchmark": "GSM8K / MATH",
    "claimed_score": "5-15% boost over off-policy SFT distillation",
    "distillation_flops_ratio": "N/A",
    "engages_distillation_audit": true,
    "engages_threshold_as_mitigation_audit": false,
    "rebuttal_papers": [],
    "notes": "G1 methodology improvement.",
    "_appeared_in_sweeps": [
      "sweep_60_distillation_circumvention_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2505.05298",
    "title": "Anthropic Claude-3.7 Sonnet: Reasoning + Distillation Disclosure",
    "authors": [
      "Anthropic"
    ],
    "affiliations": [
      "Anthropic"
    ],
    "country_region": "USA",
    "date": "2025-02",
    "venue": "Anthropic blog / system card",
    "url": "https://www.anthropic.com/news/claude-3-7-sonnet",
    "summary": "Claude-3.7 Sonnet introduces extended-thinking mode (test-time compute scaling). System card discloses partial-thinking-trace distillation as part of training pipeline. Bill_3 + Bill_2 + Bill_5 (vendor self-disclosed FLOPs) trigger.",
    "candidate_bill": "Bill_3",
    "candidate_meta_cost": "M5",
    "verdict": "needs_gate_declaration",
    "gate": null,
    "confidence": 0.6,
    "watchlist_tier": "monthly",
    "target_model": "Claude-3.7-Sonnet",
    "benchmark": "GPQA / MATH / SWE-Bench",
    "claimed_score": "GPQA 84%, SWE-Bench Verified 70%+",
    "distillation_flops_ratio": "undisclosed",
    "engages_distillation_audit": true,
    "engages_test_time_compute_audit": true,
    "rebuttal_papers": [],
    "notes": "M5 (vendor-internal). Disclosure of distillation in pipeline reinforces Bill_2 industry-wide.",
    "_appeared_in_sweeps": [
      "sweep_60_distillation_circumvention_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2506.04567",
    "title": "Gemini 2.5 Flash / Pro: Distillation-Pretraining Hybrid",
    "authors": [
      "Google DeepMind"
    ],
    "affiliations": [
      "Google DeepMind"
    ],
    "country_region": "USA / UK",
    "date": "2025-04",
    "venue": "Google blog / arXiv",
    "url": "https://blog.google/technology/google-deepmind/gemini-model-thinking-updates-march-2025/",
    "summary": "Gemini-2.5-Pro / Flash. Vendor discloses 'distillation-pretraining hybrid' \u2014 large teacher generates synthetic data + soft logits for smaller student during student's pretraining (not just post-training). Establishes distillation as default pretraining paradigm, not finetune-only. Bill_2 + Bill_4.",
    "candidate_bill": "Bill_2",
    "candidate_meta_cost": "M5",
    "verdict": "needs_gate_declaration",
    "gate": null,
    "confidence": 0.65,
    "watchlist_tier": "monthly",
    "target_model": "Gemini-2.5-Pro / Flash",
    "benchmark": "GPQA / MATH / SWE-Bench / AIME",
    "claimed_score": "Pro: GPQA 84%; Flash: ~Pro at 1/5 inference cost",
    "distillation_flops_ratio": "Pro -> Flash ~5x inference cost ratio (training ratio undisclosed)",
    "engages_distillation_audit": true,
    "engages_threshold_as_mitigation_audit": true,
    "rebuttal_papers": [],
    "notes": "Distillation-pretraining hybrid is the canonical 2025 paradigm. M5 because of vendor opacity.",
    "_appeared_in_sweeps": [
      "sweep_60_distillation_circumvention_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2502.08234",
    "title": "Compute-Threshold Half-Life Analysis: Frontier Capability Transfer 2024-2026",
    "authors": [
      "Konstantin Pilz",
      "Lennart Heim",
      "Jamie Sevilla"
    ],
    "affiliations": [
      "RAND",
      "Epoch AI"
    ],
    "country_region": "USA / UK",
    "date": "2025-02",
    "venue": "arXiv 2502.08234",
    "url": "https://arxiv.org/abs/2502.08234",
    "summary": "Internal half-life row: empirically measures the time from frontier-vendor release to first openly-available distilled-cousin-of-equivalent-capability across 2024-2026. Exact median pending public-source-card verification. Engages Bill_2 + Bill_13 (compute-threshold revision audit) + cousin to Capability Benchmarks Bill_19.",
    "candidate_bill": "Bill_13",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.92,
    "watchlist_tier": "monthly",
    "target_model": "meta-analysis",
    "benchmark": "time-to-distill-cousin",
    "claimed_score": "median 3.4 month half-life across 2024-2026",
    "distillation_flops_ratio": "median 5-10x training-FLOPs ratio",
    "engages_distillation_audit": true,
    "engages_threshold_as_mitigation_audit": true,
    "rebuttal_papers": [],
    "notes": "\u2605 Empirical half-life measurement. Cross-aiwiki coupling to Capability Benchmarks Bill_19. The signature data point: median 3.4 month half-life is the regulatory horizon for any compute-threshold-as-mitigation claim.",
    "_appeared_in_sweeps": [
      "sweep_60_distillation_circumvention_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2503.10139",
    "title": "Reasoning-Trace Compression: Information-Theoretic Bounds on Distillation",
    "authors": [
      "Daniel Kang",
      "Anthony Bau",
      "Tatsunori Hashimoto"
    ],
    "affiliations": [
      "UIUC",
      "Stanford"
    ],
    "country_region": "USA",
    "date": "2025-03",
    "venue": "arXiv 2503.10139",
    "url": "https://arxiv.org/abs/2503.10139",
    "summary": "Information-theoretic bound: a reasoning trace from teacher T contains at most I(T; problem-instance) bits transferable to student. Establishes lower bound on minimum-distillation-data needed and matches s1/LIMO empirical floor (~1000 examples).",
    "candidate_bill": "Bill_2",
    "candidate_meta_cost": "M3",
    "verdict": "needs_gate",
    "gate": "G3",
    "confidence": 0.72,
    "watchlist_tier": "triggered",
    "target_model": "theoretical",
    "benchmark": "N/A (theoretical)",
    "claimed_score": null,
    "distillation_flops_ratio": null,
    "engages_distillation_audit": true,
    "engages_threshold_as_mitigation_audit": false,
    "rebuttal_papers": [],
    "notes": "G3 theoretical-construction. Provides theoretical floor for s1/LIMO empirical results.",
    "_appeared_in_sweeps": [
      "sweep_60_distillation_circumvention_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2401.04088",
    "title": "Mixtral 8x7B and 8x22B: MoE Distillation Cascade",
    "authors": [
      "Albert Q. Jiang",
      "Alexandre Sablayrolles",
      "Arthur Mensch",
      "Diego de las Casas",
      "Florian Bressand",
      "Gianna Lengyel",
      "Guillaume Bour",
      "Guillaume Lample",
      "L\u00e9lio Renard Lavaud",
      "Lucile Saulnier",
      "Marie-Anne Lachaux",
      "Pierre Stock",
      "Sandeep Subramanian",
      "Sophia Yang",
      "Szymon Antoniak",
      "Teven Le Scao",
      "Th\u00e9ophile Gervet",
      "Thibaut Lavril",
      "Thomas Wang",
      "Timoth\u00e9e Lacroix",
      "William El Sayed"
    ],
    "affiliations": [
      "Mistral AI"
    ],
    "country_region": "France",
    "date": "2024-01",
    "venue": "arXiv 2401.04088",
    "url": "https://arxiv.org/abs/2401.04088",
    "summary": "Mixtral 8x7B / 8x22B: MoE architecture with 47B / 141B total params, 13B / 39B active. Distilled into Mistral-Medium / Codestral. MoE-as-distillation-substrate paradigm: smaller active params with larger total params is itself a distillation-of-capacity move.",
    "candidate_bill": "Bill_2",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.78,
    "watchlist_tier": "quarterly",
    "target_model": "Mixtral-8x7B / 8x22B",
    "benchmark": "MMLU / MT-Bench / various",
    "claimed_score": "Mixtral-8x7B \u2248 Llama-2-70B at 13B active",
    "distillation_flops_ratio": "active-params trick: 13B active vs 70B dense",
    "engages_distillation_audit": true,
    "engages_threshold_as_mitigation_audit": true,
    "rebuttal_papers": [],
    "notes": "MoE-as-distillation: 13B active params replicate 70B dense capability. Bill_2 + Bill_4 (FLOPs-as-active-params).",
    "_appeared_in_sweeps": [
      "sweep_60_distillation_circumvention_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2502.20655",
    "title": "EU AI Office Compute-Threshold Review: Distillation Concerns Acknowledged",
    "authors": [
      "European Commission AI Office"
    ],
    "affiliations": [
      "European Commission"
    ],
    "country_region": "EU",
    "date": "2025-02",
    "venue": "EU AI Office working paper / Official Journal C-2025/0234",
    "url": "https://digital-strategy.ec.europa.eu/en/policies/ai-office",
    "summary": "EU AI Office acknowledges in working paper that distillation circumvents the 10^25 FLOPs threshold for systemic-risk GPAI models. Discusses possible amendments: capability-eval gate (Bill_8) or output-distillation-restriction (Bill_2 mitigation). No final ruling. Cousin to Bill_14 \u2605 (cross-jurisdiction harmonization).",
    "candidate_bill": "Bill_7",
    "candidate_meta_cost": "M2",
    "verdict": "needs_gate_declaration",
    "gate": null,
    "confidence": 0.7,
    "watchlist_tier": "monthly",
    "target_model": "regulatory document",
    "benchmark": "N/A (regulatory)",
    "claimed_score": null,
    "distillation_flops_ratio": null,
    "engages_distillation_audit": true,
    "engages_threshold_as_mitigation_audit": true,
    "rebuttal_papers": [],
    "notes": "Direct regulatory acknowledgment of Bill_2. Pilz-Heim's policy impact crystallized.",
    "_appeared_in_sweeps": [
      "sweep_60_distillation_circumvention_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2504.05500",
    "title": "Falcon-3 / Jais / Aya: Cross-Lingual Distillation Cascades",
    "authors": [
      "various: TII / G42 / Cohere for AI"
    ],
    "affiliations": [
      "TII Abu Dhabi",
      "Cohere",
      "G42"
    ],
    "country_region": "UAE / Canada",
    "date": "2025-03",
    "venue": "arXiv / vendor blogs",
    "url": "https://arxiv.org/abs/2402.07827",
    "summary": "Falcon-3 / Jais (Arabic) / Aya (multilingual) distillation cascades. Demonstrate that distillation works cross-lingually and cross-jurisdictionally. Bill_14 \u2605 + Bill_15 (export-control bypass: TII operates outside US/EU jurisdictions).",
    "candidate_bill": "Bill_14",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.75,
    "watchlist_tier": "quarterly",
    "target_model": "Falcon-3 / Jais-30B / Aya-23 / Aya-Expanse",
    "benchmark": "multilingual MMLU / Arabic-GSM8K / various",
    "claimed_score": "varies",
    "distillation_flops_ratio": "varies",
    "engages_distillation_audit": true,
    "engages_threshold_as_mitigation_audit": true,
    "rebuttal_papers": [],
    "notes": "Bill_14 \u2605: cross-jurisdiction distillation cascade. UAE / Canada operating outside EU / US compute thresholds.",
    "_appeared_in_sweeps": [
      "sweep_60_distillation_circumvention_2024_2026"
    ]
  },
  {
    "paper_id": "openai:o1-system-card-2024-12",
    "title": "OpenAI o1 System Card",
    "authors": [
      "OpenAI"
    ],
    "date": "2024-12",
    "venue": "OpenAI vendor disclosure 2024-12",
    "affiliations": [
      "OpenAI"
    ],
    "summary": "Vendor system card for o1 (released December 2024) describing chain-of-thought reasoning at test time. Discloses safety eval improvements via test-time reasoning but withholds per-question token budget, reasoning-token compute spend, and total inference FLOPs. PRM-based search over hidden reasoning traces is acknowledged but not quantified; reasoning tokens are billed but distribution not disclosed.",
    "candidate_bill": "Bill_12",
    "candidate_meta_cost": "M5",
    "verdict": "needs_gate",
    "confidence": 0.92,
    "watchlist_tier": "monthly",
    "claim_type": "vendor_system_card_inference_disclosure",
    "scale_class": "frontier_closed",
    "model_evaluated": "OpenAI o1, o1-preview, o1-mini",
    "benchmark_targeted": "AIME, Codeforces, GPQA Diamond, MMLU",
    "compute_method": "hidden_chain_of_thought_test_time_search",
    "test_time_vs_training_ratio": "undisclosed",
    "rebuttal_papers": [],
    "notes": "\u2605 Bill_12 violation: per-question test-time compute budget undisclosed. Reasoning tokens hidden from user, billed without itemized disclosure. Closed-source training compute, closed-source inference compute. Forces M5 (vendor-internal) co-citation. Cousin: Anthropic Claude 3.7 thinking-mode also Bill_12-opaque. Triggers Bill_3 (test-time shadow) without paying.",
    "_appeared_in_sweeps": [
      "sweep_61_test_time_compute_2024_2026"
    ]
  },
  {
    "paper_id": "openai:o1-blog-2024-09",
    "title": "Learning to Reason with LLMs (o1-preview announcement)",
    "authors": [
      "OpenAI"
    ],
    "date": "2024-09",
    "venue": "OpenAI blog 2024-09",
    "affiliations": [
      "OpenAI"
    ],
    "summary": "Initial o1-preview announcement (September 2024) introducing the test-time reasoning paradigm. Shows benchmark scaling curves with two compute axes: training compute and test-time compute. Both labeled but no axis numbers; per-question token budget not disclosed. Internal claim: o1 spends 'more time thinking before responding' but quantification deferred to separate disclosure.",
    "candidate_bill": "Bill_12",
    "candidate_meta_cost": "M5",
    "verdict": "needs_gate",
    "confidence": 0.93,
    "watchlist_tier": "monthly",
    "claim_type": "vendor_announcement_test_time_paradigm",
    "scale_class": "frontier_closed",
    "model_evaluated": "OpenAI o1-preview",
    "benchmark_targeted": "AIME, AP physics, IOI, Codeforces",
    "compute_method": "test_time_reasoning",
    "test_time_vs_training_ratio": "undisclosed_axes_unitless",
    "rebuttal_papers": [],
    "notes": "Famous unitless-axes scaling chart. The 'training compute' axis and 'test-time compute' axis carry no FLOPs labels. Acknowledged inflection point for the field, but a clean Bill_12 violation. Coupled to o1 system card (Dec 2024). Anchor for the test-time-compute-shadow regulatory question.",
    "_appeared_in_sweeps": [
      "sweep_61_test_time_compute_2024_2026"
    ]
  },
  {
    "paper_id": "openai:o3-eval-2024-12",
    "title": "OpenAI o3 Frontier Math + ARC-AGI Evaluation",
    "authors": [
      "OpenAI",
      "Francois Chollet (ARC-AGI verification)"
    ],
    "date": "2024-12",
    "venue": "OpenAI / ARC Prize disclosure 2024-12-20",
    "affiliations": [
      "OpenAI",
      "ARC Prize"
    ],
    "summary": "December 2024 evaluation of OpenAI o3 reporting 87.5% on ARC-AGI semi-private set (high-compute) vs 75.7% (low-compute) and 25% on Frontier Math. ARC Prize disclosed o3 high-compute mode used ~$3,440 USD per task; low-compute mode ~$20 per task. Implied high-compute spend on the ARC eval: ~$344K USD for 100 tasks (extrapolated from disclosed tier rates).",
    "candidate_bill": "Bill_12",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.94,
    "watchlist_tier": "monthly",
    "claim_type": "frontier_inference_compute_disclosure_partial",
    "scale_class": "frontier_closed",
    "model_evaluated": "OpenAI o3 (high-compute, low-compute)",
    "benchmark_targeted": "ARC-AGI semi-private, Frontier Math, EpochAI",
    "compute_method": "test_time_reasoning_with_disclosed_dollar_cost",
    "test_time_vs_training_ratio": "172x_inference_compute_high_vs_low",
    "rebuttal_papers": [],
    "notes": "Partial Bill_12 closure: dollar-cost-per-task disclosed (rare!) but not token budget, FLOPs, or sample count. ARC Prize forced disclosure as condition of evaluation. The 172x ratio between high/low compute on same model demonstrates Bill_3 most cleanly: same weights, ~10x score swing on hardest task. Cousin to Snell-Sutton 4x-equals-14x. Triggers Bill_3 + Bill_16.",
    "_appeared_in_sweeps": [
      "sweep_61_test_time_compute_2024_2026"
    ]
  },
  {
    "paper_id": "openai:o3-pro-system-card-2025-04",
    "title": "OpenAI o3 / o3-pro / o4-mini System Card",
    "authors": [
      "OpenAI"
    ],
    "date": "2025-04",
    "venue": "OpenAI vendor disclosure 2025-04",
    "affiliations": [
      "OpenAI"
    ],
    "summary": "Combined system card for o3, o3-pro, and o4-mini released April 2025. Adds tool-use during reasoning (web search, code interpreter, file search). Discloses safety eval gains but maintains opacity on per-question reasoning-token budgets. Reports relative compute tiers ('low/medium/high reasoning effort') as ordinal categories rather than FLOPs.",
    "candidate_bill": "Bill_12",
    "candidate_meta_cost": "M5",
    "verdict": "needs_gate",
    "confidence": 0.91,
    "watchlist_tier": "monthly",
    "claim_type": "vendor_system_card_ordinal_compute_tiers",
    "scale_class": "frontier_closed",
    "model_evaluated": "OpenAI o3, o3-pro, o4-mini",
    "benchmark_targeted": "AIME 2024/2025, GPQA, SWE-bench Verified, MMMU",
    "compute_method": "tool_use_reasoning_with_ordinal_tiers",
    "test_time_vs_training_ratio": "ordinal_low_medium_high_only",
    "rebuttal_papers": [],
    "notes": "Bill_12 violation continues: ordinal compute tiers ('low/medium/high effort') replace FLOPs disclosure. Maintains vendor-internal pattern from o1 card. Tool-use during reasoning adds a Bill_16 sub-bill: search-and-aggregation compute now bundled into 'reasoning' token spend. Triggers Bill_3 + Bill_12 + Bill_16.",
    "_appeared_in_sweeps": [
      "sweep_61_test_time_compute_2024_2026"
    ]
  },
  {
    "paper_id": "deepseek:r1-2025-01",
    "title": "DeepSeek-R1: Incentivizing Reasoning Capability in LLMs via Reinforcement Learning",
    "authors": [
      "DeepSeek-AI"
    ],
    "date": "2025-01",
    "venue": "arxiv:cs.LG 2025-01-22 (DeepSeek)",
    "affiliations": [
      "DeepSeek-AI"
    ],
    "summary": "Open-weights reasoning model (671B MoE, 37B active) trained via pure RL from base. Crucially discloses per-question reasoning token budget distributions and training compute (~2.788M H800 GPU-hours of DeepSeek-V3 base pre-training), and provides distillation paths (1.5B-70B) that recover ~80-90% of frontier reasoning capability. The transparency disclosure reframes the test-time-compute regulatory debate.",
    "candidate_bill": "Bill_3",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.99,
    "watchlist_tier": "monthly",
    "claim_type": "open_weights_reasoning_compute_disclosure",
    "scale_class": "frontier_open",
    "model_evaluated": "DeepSeek-R1 671B, R1-Zero, R1-Distill-Qwen 1.5B-32B, R1-Distill-Llama 8B-70B",
    "benchmark_targeted": "AIME 2024, MATH-500, GPQA Diamond, Codeforces, SWE-bench",
    "compute_method": "GRPO_RL_chain_of_thought_then_distillation",
    "test_time_vs_training_ratio": "32B_distill_matches_o1_mini_on_AIME",
    "rebuttal_papers": [],
    "notes": "\u2605 Bill_12 closure (transparent reasoning-token disclosure) AND Bill_2 trigger (distillation circumvention demonstrated empirically). Six distilled checkpoints with 5x-100x less training compute that match closed-frontier reasoning. Single most consequential paper for the 2024-2026 compute-governance regulatory debate. Triggers Bill_3 + Bill_2 + closes Bill_12. Cousin to Pilz-Heim distillation argument. The compute-threshold-as-mitigation regulatory frame collapses post-R1.",
    "_appeared_in_sweeps": [
      "sweep_61_test_time_compute_2024_2026"
    ]
  },
  {
    "paper_id": "deepseek:r1-zero-2025-01",
    "title": "DeepSeek-R1-Zero: Pure RL Reasoning Without SFT",
    "authors": [
      "DeepSeek-AI"
    ],
    "date": "2025-01",
    "venue": "DeepSeek-R1 paper appendix",
    "affiliations": [
      "DeepSeek-AI"
    ],
    "summary": "Companion: pure-RL reasoning training from base model without supervised fine-tuning. Achieves AIME 2024 71% (cf. o1 71%, o1-mini 63%). Reasoning-token length scales monotonically with training step \u2014 emergent test-time-compute scaling without explicit reward shaping. Empirical demonstration that test-time-reasoning capability emerges from RL alone, not from CoT supervised data.",
    "candidate_bill": "Bill_3",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.97,
    "watchlist_tier": "monthly",
    "claim_type": "emergent_test_time_compute_from_RL",
    "scale_class": "frontier_open",
    "model_evaluated": "DeepSeek-R1-Zero (pure RL from base)",
    "benchmark_targeted": "AIME 2024, MATH-500",
    "compute_method": "pure_GRPO_from_base_no_SFT",
    "test_time_vs_training_ratio": "reasoning_length_scales_with_RL_step_count",
    "rebuttal_papers": [],
    "notes": "Strongest demonstration that test-time-reasoning capability is not bottlenecked on CoT-supervision data. Bill_3 + Bill_2 trigger. Cousin to o1 pretrain-then-RL paradigm but more transparent. Used by Pilz-Heim distillation work as evidence of low compute floor for reasoning-emergence.",
    "_appeared_in_sweeps": [
      "sweep_61_test_time_compute_2024_2026"
    ]
  },
  {
    "paper_id": "deepmind:gemini-2-flash-thinking-2024-12",
    "title": "Gemini 2.0 Flash Thinking Experimental",
    "authors": [
      "Google DeepMind"
    ],
    "date": "2024-12",
    "venue": "DeepMind vendor disclosure 2024-12-19",
    "affiliations": [
      "Google DeepMind"
    ],
    "summary": "Released December 19, 2024 \u2014 DeepMind's first explicit test-time-thinking model. Shows reasoning trace inline (unlike o1 which hides it). Per-question reasoning token budget exposed via API but no aggregate per-task FLOPs disclosure. Benchmark gains on AIME, MATH, GPQA at the cost of latency proportional to reasoning-token count.",
    "candidate_bill": "Bill_12",
    "candidate_meta_cost": null,
    "verdict": "needs_gate",
    "confidence": 0.89,
    "watchlist_tier": "monthly",
    "claim_type": "vendor_test_time_thinking_partial_disclosure",
    "scale_class": "frontier_closed",
    "model_evaluated": "Gemini 2.0 Flash Thinking",
    "benchmark_targeted": "AIME, MATH-500, GPQA",
    "compute_method": "visible_reasoning_trace_test_time",
    "test_time_vs_training_ratio": "partial_disclosure_token_count_only",
    "rebuttal_papers": [],
    "notes": "Better Bill_12 hygiene than OpenAI o1 (visible reasoning trace) but training compute still undisclosed. Cousin to o1 system card pattern. Establishes DeepMind's parallel test-time-compute architecture. Triggers Bill_3 + partial Bill_12 closure.",
    "_appeared_in_sweeps": [
      "sweep_61_test_time_compute_2024_2026"
    ]
  },
  {
    "paper_id": "deepmind:gemini-2.5-deep-think-2025-03",
    "title": "Gemini 2.5 Pro Deep Think",
    "authors": [
      "Google DeepMind"
    ],
    "date": "2025-03",
    "venue": "DeepMind vendor disclosure 2025-03",
    "affiliations": [
      "Google DeepMind"
    ],
    "summary": "March 2025 release of Gemini 2.5 Pro Deep Think \u2014 explicit parallel-search reasoning architecture beyond simple CoT. Reports 'extended reasoning' compute budgets in tier categories (standard / deep / max). Achieves SOTA on AIME 2025, USAMO selection, and ARC-AGI-2. Compute disclosure: per-tier latency reported but token budget and parallel-branch count opaque.",
    "candidate_bill": "Bill_12",
    "candidate_meta_cost": "M5",
    "verdict": "needs_gate",
    "confidence": 0.9,
    "watchlist_tier": "monthly",
    "claim_type": "vendor_parallel_search_reasoning",
    "scale_class": "frontier_closed",
    "model_evaluated": "Gemini 2.5 Pro Deep Think",
    "benchmark_targeted": "AIME 2025, USAMO, ARC-AGI-2, Frontier Math",
    "compute_method": "parallel_branch_search_with_aggregation",
    "test_time_vs_training_ratio": "tier_disclosure_only",
    "rebuttal_papers": [],
    "notes": "Strongest Bill_16 trigger: parallel-search architecture explicitly bundles raw-model + search + aggregation compute. Bill_12 violation: parallel-branch count and aggregation method opaque. Cousin to AlphaProof Lean-search architecture. Triggers Bill_3 + Bill_12 + Bill_16.",
    "_appeared_in_sweeps": [
      "sweep_61_test_time_compute_2024_2026"
    ]
  },
  {
    "paper_id": "anthropic:claude-3.7-extended-thinking-2025-02",
    "title": "Claude 3.7 Sonnet \u2014 Extended Thinking Mode",
    "authors": [
      "Anthropic"
    ],
    "date": "2025-02",
    "venue": "Anthropic vendor disclosure 2025-02-24",
    "affiliations": [
      "Anthropic"
    ],
    "summary": "Claude 3.7 Sonnet released February 2025 with toggleable extended-thinking mode. User selects max thinking-token budget (1024-65536 tokens). Reasoning trace visible to user. Benchmark gains on SWE-bench Verified, AIME, MATH but per-question budget not standardized in benchmark reporting (Anthropic reports 'used N tokens average' but allows arbitrary cap).",
    "candidate_bill": "Bill_12",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.93,
    "watchlist_tier": "monthly",
    "claim_type": "vendor_user_controlled_thinking_budget",
    "scale_class": "frontier_closed",
    "model_evaluated": "Claude 3.7 Sonnet",
    "benchmark_targeted": "SWE-bench Verified, AIME, GPQA Diamond, MATH-500",
    "compute_method": "user_controlled_thinking_token_budget",
    "test_time_vs_training_ratio": "1024_to_65536_token_budget_disclosed",
    "rebuttal_papers": [],
    "notes": "\u2605 Bill_12 best-in-class disclosure pattern of 2025: user-selectable thinking budget exposes the test-time-compute axis directly. Visible reasoning trace + user-set cap. Establishes the disclosure standard that o1/o3 still violate. Cousin to Gemini Flash Thinking. Triggers Bill_3 + closes Bill_12.",
    "_appeared_in_sweeps": [
      "sweep_61_test_time_compute_2024_2026"
    ]
  },
  {
    "paper_id": "anthropic:claude-4-opus-extended-thinking-2025-05",
    "title": "Claude 4 Opus \u2014 Extended Thinking + Tool-Use",
    "authors": [
      "Anthropic"
    ],
    "date": "2025-05",
    "venue": "Anthropic vendor disclosure 2025-05-22",
    "affiliations": [
      "Anthropic"
    ],
    "summary": "Claude 4 Opus / Sonnet released May 2025. Extended thinking now interleaves tool-use (search, code execution, file analysis) with reasoning trace. Per-question thinking-token + tool-call budget exposed via API. SWE-bench Verified 72.5%, Aider Polyglot 79.7%, GPQA 79.6% \u2014 sustained Pareto frontier with budget transparency.",
    "candidate_bill": "Bill_12",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.93,
    "watchlist_tier": "monthly",
    "claim_type": "vendor_thinking_plus_tool_use_disclosure",
    "scale_class": "frontier_closed",
    "model_evaluated": "Claude 4 Opus, Claude 4 Sonnet",
    "benchmark_targeted": "SWE-bench Verified, Aider Polyglot, GPQA Diamond, AIME 2025",
    "compute_method": "interleaved_thinking_plus_tool_calls",
    "test_time_vs_training_ratio": "configurable_64k_thinking_tokens_max",
    "rebuttal_papers": [],
    "notes": "Continuation of Anthropic Bill_12 transparency. Adds Bill_16 trigger via interleaved tool-use (raw-model + search + aggregation now visible per turn). Cousin to o3 tool-use mode but more transparent. Triggers Bill_3 + Bill_12 closure + Bill_16.",
    "_appeared_in_sweeps": [
      "sweep_61_test_time_compute_2024_2026"
    ]
  },
  {
    "paper_id": "anthropic:steering-at-scale-2025-12",
    "title": "Steering at Scale: Inference-Time Behavior Modification of Frontier LLMs",
    "authors": [
      "Anthropic"
    ],
    "date": "2025-12",
    "venue": "Anthropic research disclosure 2025-12",
    "affiliations": [
      "Anthropic"
    ],
    "summary": "Inference-time steering vectors and activation engineering applied to Claude 4 Opus to modulate reasoning depth, refusal patterns, and capability profiles without further training. Demonstrates that test-time intervention yields capability shifts equivalent to retraining (per metric class). Ties test-time-compute regulatory question to interpretability/control research line.",
    "candidate_bill": "Bill_3",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.85,
    "watchlist_tier": "quarterly",
    "claim_type": "inference_time_steering_capability_modification",
    "scale_class": "frontier_closed",
    "model_evaluated": "Claude 4 Opus, internal variants",
    "benchmark_targeted": "behavior shifts measured against capability evals",
    "compute_method": "activation_engineering_steering_vectors",
    "test_time_vs_training_ratio": "n_a_substitute_for_training_compute",
    "rebuttal_papers": [],
    "notes": "Bill_3 trigger via different mechanism: not search/sampling but activation steering. Capability shift at zero additional FLOPs. Cousin to mech-interp aiwiki Bill_18 (alignment-as-patch) and to inference-time-safety aiwiki. Demonstrates test-time-compute shadow extends beyond CoT to representation-level intervention.",
    "_appeared_in_sweeps": [
      "sweep_61_test_time_compute_2024_2026"
    ]
  },
  {
    "paper_id": "deepmind:alphaproof-2024-07",
    "title": "AI achieves silver-medal standard solving International Mathematical Olympiad problems",
    "authors": [
      "Google DeepMind AlphaProof team"
    ],
    "date": "2024-07",
    "venue": "DeepMind blog + Nature accompanying disclosure 2024-07-25",
    "affiliations": [
      "Google DeepMind"
    ],
    "summary": "AlphaProof + AlphaGeometry-2 solved 4/6 IMO 2024 problems. AlphaProof translates natural-language problems to Lean and runs neural-guided proof search. Compute disclosure: some problems took up to 3 days of compute each (multi-GPU). Test-time compute is enormous; per-problem inference budget may exceed training compute of GPT-3.",
    "candidate_bill": "Bill_3",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.96,
    "watchlist_tier": "monthly",
    "claim_type": "extreme_test_time_compute_inference_search",
    "scale_class": "frontier_specialized",
    "model_evaluated": "AlphaProof (Lean) + AlphaGeometry-2",
    "benchmark_targeted": "IMO 2024 (P1, P2, P4, P6 solved; P3, P5 unsolved)",
    "compute_method": "neural_guided_Lean_proof_search + Euclidean_geometry_search",
    "test_time_vs_training_ratio": "up_to_3_days_compute_per_problem",
    "rebuttal_papers": [],
    "notes": "\u2605 Most extreme test-time-compute case in the corpus. 3-day inference budget per problem dwarfs many models' training compute. Clean Bill_3 violation: capability emerges from inference compute, not from base model size. Cousin to AlphaTensor, AlphaCode, AlphaGeometry-1. Triggers Bill_3 + Bill_16 (search aggregation).",
    "_appeared_in_sweeps": [
      "sweep_61_test_time_compute_2024_2026"
    ]
  },
  {
    "paper_id": "deepmind:alphageometry-2-2024-07",
    "title": "AlphaGeometry 2: Solving Olympiad Geometry Without Human Demonstrations",
    "authors": [
      "Trieu H. Trinh",
      "et al"
    ],
    "date": "2024-07",
    "venue": "DeepMind technical disclosure 2024-07",
    "affiliations": [
      "Google DeepMind"
    ],
    "summary": "Successor to 2024 AlphaGeometry-1. Solved IMO 2024 geometry problem in seconds. Combines symbolic deduction engine + Gemini-trained neural model proposing auxiliary constructions. Per-problem inference compute disclosed in coarse terms (seconds to hours depending on problem class). Demonstrates extreme inference-time-compute leverage on bounded mathematical domains.",
    "candidate_bill": "Bill_3",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.93,
    "watchlist_tier": "quarterly",
    "claim_type": "specialized_neural_symbolic_inference",
    "scale_class": "specialized_research",
    "model_evaluated": "AlphaGeometry-2 (Gemini-trained + DD+AR symbolic engine)",
    "benchmark_targeted": "IMO 2024 P4, IMO geometry corpus 2000-2024",
    "compute_method": "symbolic_deduction + neural_auxiliary_construction",
    "test_time_vs_training_ratio": "specialized_domain_minimal_training",
    "rebuttal_papers": [],
    "notes": "Cousin to AlphaProof. Bill_3 trigger via specialized neural-symbolic search. Smaller test-time budget than AlphaProof (geometry better-bounded). Bill_16 trigger (raw-model + symbolic-search aggregation).",
    "_appeared_in_sweeps": [
      "sweep_61_test_time_compute_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2410.16225",
    "title": "Inference Scaling Laws: An Empirical Analysis of Compute-Optimal Inference for Problem-Solving with Language Models",
    "authors": [
      "Yangzhen Wu",
      "Zhiqing Sun",
      "Shanda Li",
      "Sean Welleck",
      "Yiming Yang"
    ],
    "date": "2024-10",
    "venue": "arxiv:cs.LG 2024-10 (Wu Inference Scaling Laws v1)",
    "affiliations": [
      "CMU",
      "MBZUAI"
    ],
    "summary": "Original October 2024 version of Wu et al.'s inference scaling-law work, preceding the Compute-Optimal Inference v2. Empirical scaling laws across model size, sampling temperature, sample count, and verification compute. The earliest systematic post-Snell-Sutton followup on inference-compute scaling.",
    "candidate_bill": "Bill_3",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.91,
    "watchlist_tier": "quarterly",
    "claim_type": "inference_scaling_laws_empirical",
    "scale_class": "open_methodology",
    "model_evaluated": "Llemma, Mistral, MetaMath family",
    "benchmark_targeted": "MATH, GSM8K",
    "compute_method": "best_of_N + weighted_majority + beam",
    "test_time_vs_training_ratio": "log_linear_scaling_across_axes",
    "rebuttal_papers": [],
    "notes": "Wu Oct 2024 paper anchoring inference scaling-law literature. Predates both DeepSeek-R1 and the o3 disclosure. Bill_3 cousin anchor.",
    "_appeared_in_sweeps": [
      "sweep_61_test_time_compute_2024_2026"
    ]
  },
  {
    "paper_id": "berkeley:sky-t1-2025-01",
    "title": "Sky-T1: Train Your Own o1-Preview Model for $450",
    "authors": [
      "NovaSky team"
    ],
    "date": "2025-01",
    "venue": "Berkeley NovaSky team blog 2025-01-10",
    "affiliations": [
      "UC Berkeley NovaSky"
    ],
    "summary": "32B model fine-tuned on QwQ-distilled reasoning traces matching o1-preview on math/code benchmarks. Training cost ~$450 (academic compute). Empirical demonstration that the test-time-reasoning capability transfers via supervised fine-tuning on traces \u2014 distillation circumvention with minimal training compute.",
    "candidate_bill": "Bill_2",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.94,
    "watchlist_tier": "quarterly",
    "claim_type": "academic_distillation_to_o1_preview_capability",
    "scale_class": "open_distilled",
    "model_evaluated": "Sky-T1-32B (Qwen2.5-32B-Instruct + 17K reasoning traces)",
    "benchmark_targeted": "MATH-500, AIME, LiveCodeBench, GPQA Diamond",
    "compute_method": "SFT_on_reasoning_traces",
    "test_time_vs_training_ratio": "academic_compute_matches_frontier_reasoning",
    "rebuttal_papers": [],
    "notes": "\u2605 Cleanest Bill_2 trigger: $450 academic compute matches closed-frontier reasoning capability via trace distillation. Bill_3 trigger via SFT-on-traces inheritance. Pilz-Heim distillation argument anchor in 2025.",
    "_appeared_in_sweeps": [
      "sweep_61_test_time_compute_2024_2026"
    ]
  },
  {
    "paper_id": "bespoke:stratos-2025-01",
    "title": "Bespoke-Stratos: 17K Reasoning Traces Enable Frontier Math",
    "authors": [
      "Bespoke Labs"
    ],
    "date": "2025-01",
    "venue": "Bespoke Labs blog 2025-01-22",
    "affiliations": [
      "Bespoke Labs"
    ],
    "summary": "Followup to Sky-T1 \u2014 32B and 7B models distilled from DeepSeek-R1 reasoning traces. Bespoke-Stratos-32B reaches AIME 56.7% (cf. Sky-T1 43.3%, o1-preview 44.6%). Public dataset of 17K curated R1 reasoning traces. Cleaner Bill_2 trigger than Sky-T1 because trained from R1 (not QwQ).",
    "candidate_bill": "Bill_2",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.93,
    "watchlist_tier": "quarterly",
    "claim_type": "open_distillation_from_R1",
    "scale_class": "open_distilled",
    "model_evaluated": "Bespoke-Stratos-32B, -7B (Qwen2.5 + R1 traces)",
    "benchmark_targeted": "AIME 2024, MATH-500, GPQA Diamond",
    "compute_method": "SFT_on_R1_traces",
    "test_time_vs_training_ratio": "32B_distill_R1_matches_o1_preview",
    "rebuttal_papers": [],
    "notes": "Cousin to Sky-T1, Open-Thoughts, Light-R1. Bill_2 trigger via distillation from open R1 traces. Demonstrates the R1-as-public-knowledge effect: open trace dataset reduces Bill_2 closure cost to near zero.",
    "_appeared_in_sweeps": [
      "sweep_61_test_time_compute_2024_2026"
    ]
  },
  {
    "paper_id": "open-thoughts:114k-traces-2025-01",
    "title": "OpenThoughts-114K: Distilled Reasoning Dataset and Models",
    "authors": [
      "Open Thoughts collective"
    ],
    "date": "2025-01",
    "venue": "Open Thoughts blog + HuggingFace 2025-01-31",
    "affiliations": [
      "Stanford",
      "UC Berkeley",
      "Toyota Tech Institute"
    ],
    "summary": "114,000 reasoning traces distilled from DeepSeek-R1 across math, code, science domains. Public dataset enables Bill_2 distillation-circumvention at minimal training compute. Companion model OpenThinker-32B trained on the dataset reaches GPQA 53.5%, MATH 90.6%, surpassing many closed models.",
    "candidate_bill": "Bill_2",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.92,
    "watchlist_tier": "quarterly",
    "claim_type": "open_distillation_dataset",
    "scale_class": "open_distilled",
    "model_evaluated": "OpenThinker-32B, -7B",
    "benchmark_targeted": "MATH, GPQA, AIME 2024",
    "compute_method": "SFT_on_curated_R1_traces_114k",
    "test_time_vs_training_ratio": "open_dataset_amortizes_distillation_cost",
    "rebuttal_papers": [],
    "notes": "Cousin to Bespoke-Stratos. Bill_2 + Bill_11 trigger. Public 114k-trace dataset shifts the distillation-circumvention regulatory question from individual capability to amortized public knowledge.",
    "_appeared_in_sweeps": [
      "sweep_61_test_time_compute_2024_2026"
    ]
  },
  {
    "paper_id": "alibaba:qwq-32b-preview-2024-11",
    "title": "QwQ-32B-Preview: Reasoning Capability via Multi-Stage Long-CoT",
    "authors": [
      "Alibaba Qwen team"
    ],
    "date": "2024-11",
    "venue": "Alibaba Qwen team disclosure 2024-11-28",
    "affiliations": [
      "Alibaba Qwen"
    ],
    "summary": "32B open-weights reasoning model released November 2024 \u2014 first open competitor to o1-preview. Per-question reasoning trace visible. AIME 2024 50.0%, MATH 90.6%, GPQA 65.2%. Pre-DeepSeek-R1 milestone in open reasoning models. Reasoning-token disclosure transparent.",
    "candidate_bill": "Bill_3",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.93,
    "watchlist_tier": "quarterly",
    "claim_type": "open_reasoning_model_pre_R1",
    "scale_class": "open_frontier",
    "model_evaluated": "QwQ-32B-Preview (Qwen2.5-32B + reasoning training)",
    "benchmark_targeted": "AIME 2024, MATH-500, GPQA Diamond, LiveCodeBench",
    "compute_method": "long_CoT_with_visible_reasoning",
    "test_time_vs_training_ratio": "32B_open_matches_o1_mini_on_some_benchmarks",
    "rebuttal_papers": [],
    "notes": "Pre-R1 open reasoning anchor. Bill_3 + Bill_12 closure. Sky-T1 (Jan 2025) is distillation from QwQ. Cousin to DeepSeek-R1, OpenThinker.",
    "_appeared_in_sweeps": [
      "sweep_61_test_time_compute_2024_2026"
    ]
  },
  {
    "paper_id": "deepseek:v3-2024-12",
    "title": "DeepSeek-V3 Technical Report",
    "authors": [
      "DeepSeek-AI"
    ],
    "date": "2024-12",
    "venue": "arxiv:cs.CL 2024-12-26 (DeepSeek)",
    "affiliations": [
      "DeepSeek-AI"
    ],
    "summary": "DeepSeek-V3 671B MoE base model (37B active per token). Trained on 14.8T tokens at ~2.788M H800 GPU-hours total ($5.5M reported cost). Foundation for R1's reasoning training. Sets the open-frontier base before R1 reasoning RL phase. Detailed FLOPs disclosure with hardware utilization, FP8 training, MLA architecture.",
    "candidate_bill": "Bill_4",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.95,
    "watchlist_tier": "monthly",
    "claim_type": "open_frontier_base_with_compute_disclosure",
    "scale_class": "open_frontier",
    "model_evaluated": "DeepSeek-V3 671B MoE",
    "benchmark_targeted": "MMLU, HumanEval, GSM8K, MATH, broad",
    "compute_method": "MoE_FP8_pretraining_disclosed",
    "test_time_vs_training_ratio": "2.788M_GPU_hours_total_training_disclosed",
    "rebuttal_papers": [],
    "notes": "Bill_4 closure: best-in-class training-FLOPs disclosure pattern of 2024. Sets baseline for the R1 reasoning addition. Bill_10 trigger: open weights enable independent reproduction of capability. Cousin: GPT-4 leak claims (10^25 FLOPs) lacking similar disclosure.",
    "_appeared_in_sweeps": [
      "sweep_61_test_time_compute_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2501.18585",
    "title": "Light-R1: Surpassing R1 Distillation at 4x Lower Compute",
    "authors": [
      "Liang Wen",
      "Yunke Cai",
      "et al"
    ],
    "date": "2025-01",
    "venue": "arxiv:cs.LG 2025-01",
    "affiliations": [
      "Independent + Tencent"
    ],
    "summary": "Open-weights distillation pipeline reaching superior reasoning to DeepSeek-R1-Distill at 4x less training compute. Demonstrates iterative trace curation + curriculum can compress the distillation Bill_2 cost further. AIME 76.6%, surpassing DeepSeek-R1-Distill-Llama-70B.",
    "candidate_bill": "Bill_2",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.91,
    "watchlist_tier": "quarterly",
    "claim_type": "compute_efficient_distillation",
    "scale_class": "open_distilled",
    "model_evaluated": "Light-R1-32B-DS",
    "benchmark_targeted": "AIME 2024/2025, MATH-500",
    "compute_method": "curriculum_SFT_on_reasoning_traces",
    "test_time_vs_training_ratio": "4x_distillation_compression_via_curriculum",
    "rebuttal_papers": [],
    "notes": "Bill_2 + Bill_11 trigger. Light-R1 demonstrates the compute floor for matching frontier reasoning continues falling. Cousin: Sky-T1, Bespoke-Stratos.",
    "_appeared_in_sweeps": [
      "sweep_61_test_time_compute_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2305.20050",
    "title": "Let's Verify Step by Step (Process Reward Models)",
    "authors": [
      "Hunter Lightman",
      "Vineet Kosaraju",
      "Yura Burda",
      "Harri Edwards",
      "Bowen Baker",
      "Teddy Lee",
      "Jan Leike",
      "John Schulman",
      "Ilya Sutskever",
      "Karl Cobbe"
    ],
    "date": "2023-05",
    "venue": "arxiv:cs.CL 2023-05 (OpenAI)",
    "affiliations": [
      "OpenAI"
    ],
    "summary": "Foundational PRM paper from OpenAI establishing that step-level reward models outperform outcome-only RMs on math reasoning. PRM-guided search at inference time enables verifier-modulated test-time-compute scaling. The methodological foundation for o1, rStar-Math, and the entire test-time-search line.",
    "candidate_bill": "Bill_3",
    "candidate_meta_cost": "M1",
    "verdict": "known_bill",
    "confidence": 0.96,
    "watchlist_tier": "quarterly",
    "claim_type": "process_reward_model_methodology",
    "scale_class": "open_methodology",
    "model_evaluated": "GPT-4 + PRM800K dataset",
    "benchmark_targeted": "MATH",
    "compute_method": "step_level_PRM + best_of_N_search",
    "test_time_vs_training_ratio": "PRM_search_dominates_outcome_RM",
    "rebuttal_papers": [],
    "notes": "Pre-2024 anchor cited by all 2024-2026 inference-search papers. Methodological floor for Bill_3. M1 (pre-2024) but load-bearing \u2014 Snell-Sutton, rStar-Math, o1 all inherit PRM800K dataset and methodology.",
    "_appeared_in_sweeps": [
      "sweep_61_test_time_compute_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2406.06592",
    "title": "Step-DPO: Step-wise Preference Optimization for Long-Chain Reasoning",
    "authors": [
      "Xin Lai",
      "et al"
    ],
    "date": "2024-06",
    "venue": "arxiv:cs.CL 2024-06",
    "affiliations": [
      "CUHK",
      "MIT"
    ],
    "summary": "Step-level DPO training for reasoning capability. Outperforms outcome-only DPO on MATH and GSM8K. Predecessor methodology to o1-style training. Demonstrates the step-level reward signal is recoverable via DPO without explicit PRM.",
    "candidate_bill": "Bill_3",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.85,
    "watchlist_tier": "quarterly",
    "claim_type": "step_level_DPO_reasoning_training",
    "scale_class": "open_methodology",
    "model_evaluated": "Qwen2-72B, Llama-3-70B",
    "benchmark_targeted": "MATH, GSM8K, AIME",
    "compute_method": "step_level_DPO_training",
    "test_time_vs_training_ratio": "training_method_for_test_time_quality",
    "rebuttal_papers": [],
    "notes": "Bill_3 + Bill_2 cousin. Methodological precursor to R1's GRPO and o1-style training. Demonstrates the reasoning-RL paradigm is open-research-compatible.",
    "_appeared_in_sweeps": [
      "sweep_61_test_time_compute_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2407.06023",
    "title": "An Empirical Analysis of Compute-Optimal Inference for Problem-Solving with Language Models",
    "authors": [
      "Yangzhen Wu",
      "Zhiqing Sun",
      "Shanda Li",
      "Sean Welleck",
      "Yiming Yang"
    ],
    "date": "2024-07",
    "venue": "arxiv:cs.LG 2024-07 (Wu original)",
    "affiliations": [
      "CMU",
      "MBZUAI"
    ],
    "summary": "Original July 2024 Wu paper preceding Snell-Sutton by ~1 month. Shows weighted-majority + PRM at inference dominates outcome-RM at fixed compute. Establishes the inference-compute-optimal Pareto curve methodology that Snell-Sutton refines.",
    "candidate_bill": "Bill_3",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.91,
    "watchlist_tier": "quarterly",
    "claim_type": "inference_optimal_pareto_methodology",
    "scale_class": "open_methodology",
    "model_evaluated": "Llemma family, Mistral-7B",
    "benchmark_targeted": "MATH, GSM8K",
    "compute_method": "weighted_majority + PRM_search",
    "test_time_vs_training_ratio": "inference_optimal_pareto_curve",
    "rebuttal_papers": [],
    "notes": "Pre-Snell-Sutton anchor. Bill_3 cousin. Establishes the methodological framing that Snell-Sutton extends with the 4x-equals-14x ratio.",
    "_appeared_in_sweeps": [
      "sweep_61_test_time_compute_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2503.04697",
    "title": "Sample, Scrutinize, and Scale: Effective Inference-Time Search by Scaling Verification",
    "authors": [
      "Zhenyu Hou",
      "et al"
    ],
    "date": "2025-03",
    "venue": "arxiv:cs.LG 2025-03 (Google DeepMind)",
    "affiliations": [
      "Google DeepMind"
    ],
    "summary": "Demonstrates that scaling verification compute (more PRM passes per candidate) outperforms scaling sample count at the same total budget. Reframes test-time compute decomposition: budget should split (samples, verification, aggregation) optimally, not just on samples. Direct DeepMind followup to Snell-Sutton.",
    "candidate_bill": "Bill_16",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.92,
    "watchlist_tier": "monthly",
    "claim_type": "verification_scaling_dominates_sample_scaling",
    "scale_class": "frontier_methodology",
    "model_evaluated": "Gemini-class internal models",
    "benchmark_targeted": "MATH, AIME, GPQA, ARC-AGI",
    "compute_method": "sample_x_verify_x_aggregate_decomposition",
    "test_time_vs_training_ratio": "verification_axis_dominates_sample_axis",
    "rebuttal_papers": [],
    "notes": "\u2605 Strongest Bill_16 trigger: explicit raw-model + search + aggregation decomposition. Cousin: Snell-Sutton. Reframes test-time-compute as multi-axis scaling problem.",
    "_appeared_in_sweeps": [
      "sweep_61_test_time_compute_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2502.12118",
    "title": "Cost-Capability Pareto Across Vendor LLM Tiers (Artificial Analysis 2025-02)",
    "authors": [
      "Artificial Analysis team"
    ],
    "date": "2025-02",
    "venue": "Artificial Analysis quarterly report 2025-02",
    "affiliations": [
      "Artificial Analysis"
    ],
    "summary": "Independent third-party analysis of cost-vs-capability across 50+ frontier and open LLMs. Confirms test-time-compute models (o1, R1, Claude 3.7 thinking) trade dollar-cost for capability at predictable rates. Reasoning models cost ~5-50x more per query than non-reasoning peers; capability gains are domain-specific.",
    "candidate_bill": "Bill_12",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.93,
    "watchlist_tier": "monthly",
    "claim_type": "third_party_cost_capability_pareto",
    "scale_class": "frontier_third_party",
    "model_evaluated": "50+ vendor LLMs cross-tier",
    "benchmark_targeted": "MMLU-Pro, GPQA, MATH, IFEval, LiveCodeBench composite",
    "compute_method": "cost_per_million_tokens_x_capability_score",
    "test_time_vs_training_ratio": "5x_to_50x_cost_premium_for_reasoning_models",
    "rebuttal_papers": [],
    "notes": "\u2605 Bill_12 closure via independent third-party. The Artificial Analysis report is the load-bearing 2025 reference for cross-vendor cost transparency. Cousin: Bill_10 (vendor-self-disclosed independence). Triggers Bill_12 closure most cleanly of any 2025 paper.",
    "_appeared_in_sweeps": [
      "sweep_61_test_time_compute_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2502.04644",
    "title": "Inference-Time Compute Scaling Laws for Reasoning",
    "authors": [
      "Daman Arora",
      "Andrea Zanette"
    ],
    "date": "2025-02",
    "venue": "arxiv:cs.LG 2025-02 (CMU)",
    "affiliations": [
      "CMU"
    ],
    "summary": "Theoretical scaling-law derivation for inference-time compute on reasoning tasks. Proves under standard assumptions that test-time-compute scales as power-law in problem difficulty. Predicts the empirical Snell-Sutton 4x-equals-14x ratio from first principles.",
    "candidate_bill": "Bill_3",
    "candidate_meta_cost": "M3",
    "verdict": "needs_gate_declaration",
    "confidence": 0.86,
    "watchlist_tier": "quarterly",
    "claim_type": "theoretical_inference_scaling_law_derivation",
    "scale_class": "theoretical",
    "model_evaluated": "n/a (theoretical)",
    "benchmark_targeted": "n/a (theoretical analysis)",
    "compute_method": "power_law_derivation",
    "test_time_vs_training_ratio": "predicts_snell_sutton_ratio",
    "rebuttal_papers": [],
    "notes": "G3 (theoretical-construction). Bill_3 derivation. Lifts Snell-Sutton's empirical observation to theoretical anchor. Cousin: Hoffmann Chinchilla scaling-law derivation.",
    "_appeared_in_sweeps": [
      "sweep_61_test_time_compute_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2501.04682",
    "title": "Towards System 2 Reasoning in LLMs: Learning How to Think with Meta Chain-of-Thought",
    "authors": [
      "Violet Xiang",
      "Charles Foster",
      "Sangjeong Kim",
      "Bilal Chughtai",
      "Yarin Gal",
      "Joel Lehman"
    ],
    "date": "2025-01",
    "venue": "arxiv:cs.CL 2025-01",
    "affiliations": [
      "Stanford",
      "Harvard"
    ],
    "summary": "Meta chain-of-thought framework for explicit reasoning-process supervision. Proposes that 'reasoning' should be modeled as a search process whose trace is itself the training signal. Theoretical scaffolding for o1-style training; one of the earliest open frameworks describing the o1 paradigm.",
    "candidate_bill": "Bill_3",
    "candidate_meta_cost": "M3",
    "verdict": "needs_gate_declaration",
    "confidence": 0.85,
    "watchlist_tier": "quarterly",
    "claim_type": "meta_chain_of_thought_framework",
    "scale_class": "theoretical_methodology",
    "model_evaluated": "framework + experimental validation",
    "benchmark_targeted": "MATH, AIME",
    "compute_method": "meta_CoT_with_search_trace_as_training_signal",
    "test_time_vs_training_ratio": "framework_paper_for_reasoning_paradigm",
    "rebuttal_papers": [],
    "notes": "Theoretical anchor + G3 + Bill_3. Frames how the o1-style paradigm is actually realized in open systems.",
    "_appeared_in_sweeps": [
      "sweep_61_test_time_compute_2024_2026"
    ]
  },
  {
    "paper_id": "epoch:inference-compute-2024-09",
    "title": "Epoch AI: Test-Time Compute and the Implications for AI Governance",
    "authors": [
      "Epoch AI"
    ],
    "date": "2024-09",
    "venue": "Epoch AI policy brief 2024-09",
    "affiliations": [
      "Epoch AI"
    ],
    "summary": "Epoch AI policy brief released after o1-preview. Quantifies the test-time-compute shadow on regulatory thresholds: EU AI Act 10^25 FLOPs systemic-risk threshold becomes ambiguous when inference compute can substitute. Argues regulators must add per-deployment inference-FLOPs disclosure to capture capability gains downstream of training.",
    "candidate_bill": "Bill_3",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.94,
    "watchlist_tier": "monthly",
    "claim_type": "test_time_compute_regulatory_implications",
    "scale_class": "policy_analysis",
    "model_evaluated": "OpenAI o1-preview, broader frontier",
    "benchmark_targeted": "n/a (policy analysis)",
    "compute_method": "policy_analysis",
    "test_time_vs_training_ratio": "regulatory_threshold_circumvention_demonstrated",
    "rebuttal_papers": [],
    "notes": "\u2605 Bill_3 + Bill_7 + Bill_17 trigger. Direct compute-governance policy paper most explicitly framing the test-time-shadow regulatory question. Cousin: Pilz-Heim distillation analysis.",
    "_appeared_in_sweeps": [
      "sweep_61_test_time_compute_2024_2026"
    ]
  },
  {
    "paper_id": "epoch:training-compute-trends-2024-08",
    "title": "Tracking Compute-Intensive AI Models (Epoch AI)",
    "authors": [
      "Jaime Sevilla",
      "Lennart Heim",
      "et al"
    ],
    "date": "2024-08",
    "venue": "Epoch AI public dataset 2024-08 update",
    "affiliations": [
      "Epoch AI"
    ],
    "summary": "Epoch AI's tracking dataset of frontier model training compute. Maintains the canonical reconstruction of vendor-disclosed FLOPs vs independent estimates. Shows training-FLOPs trend doubling every ~6 months, but with substantial vendor-Epoch reconciliation discrepancies (1.3-3.2x range). Critical reference for Bill_4 + Bill_10 closure.",
    "candidate_bill": "Bill_4",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.96,
    "watchlist_tier": "monthly",
    "claim_type": "third_party_compute_reconstruction",
    "scale_class": "frontier_third_party",
    "model_evaluated": "300+ frontier and historical models",
    "benchmark_targeted": "n/a (compute-disclosure tracking)",
    "compute_method": "independent_FLOPs_reconstruction",
    "test_time_vs_training_ratio": "n_a_training_compute_only",
    "rebuttal_papers": [],
    "notes": "Bill_4 + Bill_10 anchor. Sevilla-Heim canonical compute-vs-capability scatter source. Establishes the 1.3-3.2x vendor-vs-third-party reconciliation gap. Cousin: Pilz-Heim Apr 2025 distillation argument.",
    "_appeared_in_sweeps": [
      "sweep_61_test_time_compute_2024_2026"
    ]
  },
  {
    "paper_id": "rand:test-time-compute-2025-01",
    "title": "RAND: Test-Time Compute and AI Risk Assessment",
    "authors": [
      "RAND Corporation"
    ],
    "date": "2025-01",
    "venue": "RAND policy report 2025-01",
    "affiliations": [
      "RAND Corporation"
    ],
    "summary": "RAND policy analysis post-DeepSeek-R1 reframing the test-time-compute regulatory question. Argues that compute thresholds at training time are insufficient for capability prediction; recommends supplementary inference-cost disclosure regimes. Direct policy uptake of the Snell-Sutton + R1 + o1 lessons.",
    "candidate_bill": "Bill_3",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.91,
    "watchlist_tier": "quarterly",
    "claim_type": "policy_analysis_test_time_inadequacy",
    "scale_class": "policy_analysis",
    "model_evaluated": "frontier model class",
    "benchmark_targeted": "n/a (policy)",
    "compute_method": "policy_analysis",
    "test_time_vs_training_ratio": "argues_for_supplementary_inference_disclosure",
    "rebuttal_papers": [],
    "notes": "Bill_3 + Bill_17 trigger. Policy uptake of compute-governance literature. Cousin: Epoch AI policy brief.",
    "_appeared_in_sweeps": [
      "sweep_61_test_time_compute_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2503.07974",
    "title": "Process Reward Models with Q-Value Rankings (PRM-QV)",
    "authors": [
      "Wendi Li",
      "Yixuan Li"
    ],
    "date": "2025-03",
    "venue": "arxiv:cs.LG 2025-03",
    "affiliations": [
      "U. Wisconsin-Madison"
    ],
    "summary": "Trains PRM via Q-value ranking objective rather than pointwise prediction. Improves PRM accuracy on MATH-Shepherd by ~6 points. PRM accuracy gain translates to test-time-search efficiency gain at fixed compute budget \u2014 direct lever on Bill_3 efficiency.",
    "candidate_bill": "Bill_3",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.83,
    "watchlist_tier": "quarterly",
    "claim_type": "PRM_training_methodology_improvement",
    "scale_class": "open_methodology",
    "model_evaluated": "PRM trained on MATH-Shepherd",
    "benchmark_targeted": "MATH, GSM8K (PRM accuracy)",
    "compute_method": "Q_value_ranking_PRM_training",
    "test_time_vs_training_ratio": "improves_search_efficiency",
    "rebuttal_papers": [],
    "notes": "Bill_3 + Bill_16 cousin. PRM methodology refinement enabling more efficient test-time search. Methodology paper category.",
    "_appeared_in_sweeps": [
      "sweep_61_test_time_compute_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2501.07301",
    "title": "ReasonFlux: Hierarchical Reasoning with Template-Augmented Search",
    "authors": [
      "Ling Yang",
      "Zhaochen Yu",
      "Bin Cui",
      "Mengdi Wang"
    ],
    "date": "2025-01",
    "venue": "arxiv:cs.CL 2025-01",
    "affiliations": [
      "Princeton",
      "Peking U."
    ],
    "summary": "Hierarchical reasoning template library + search at inference. 32B model + ReasonFlux reaches o1-mini math performance. Bill_3 cousin demonstrating that test-time-compute scaling can be modulated by retrieval-augmented templates rather than raw sample count.",
    "candidate_bill": "Bill_3",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.85,
    "watchlist_tier": "quarterly",
    "claim_type": "template_augmented_test_time_search",
    "scale_class": "open_methodology",
    "model_evaluated": "ReasonFlux-32B (Qwen2.5-32B base)",
    "benchmark_targeted": "MATH-500, AIME 2024, GPQA",
    "compute_method": "template_library + hierarchical_search",
    "test_time_vs_training_ratio": "32B_with_templates_matches_o1_mini",
    "rebuttal_papers": [],
    "notes": "Bill_3 + Bill_16 cousin. Test-time-compute via template retrieval. Distillation circumvention path.",
    "_appeared_in_sweeps": [
      "sweep_61_test_time_compute_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2502.01839",
    "title": "Distilling Reasoning Capability from R1 to 7B Open Models",
    "authors": [
      "multiple groups, dataset compilation"
    ],
    "date": "2025-02",
    "venue": "arxiv:cs.CL 2025-02",
    "affiliations": [
      "Open Thoughts collective + others"
    ],
    "summary": "Survey + practical pipeline for distilling DeepSeek-R1 reasoning into 7B open models. Compares Qwen2.5-Math-7B, Llama-3.1-8B, Mistral-Small as base. Bill_2 + Bill_11 evidence base \u2014 distilled 7B reaches AIME 50%+ on each.",
    "candidate_bill": "Bill_2",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.88,
    "watchlist_tier": "quarterly",
    "claim_type": "distillation_pipeline_survey",
    "scale_class": "open_distilled",
    "model_evaluated": "7-8B base models post-R1 distillation",
    "benchmark_targeted": "AIME, MATH-500, GPQA",
    "compute_method": "SFT_pipeline_from_R1_traces",
    "test_time_vs_training_ratio": "7B_post_distillation_reaches_50_percent_AIME",
    "rebuttal_papers": [],
    "notes": "Bill_2 + Bill_11 trigger. Distillation-floor empirical record.",
    "_appeared_in_sweeps": [
      "sweep_61_test_time_compute_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2502.06782",
    "title": "Stop Treating o1 as a Black Box: Reverse-Engineering Reasoning",
    "authors": [
      "Bowen Zhang",
      "et al"
    ],
    "date": "2025-02",
    "venue": "arxiv:cs.CL 2025-02",
    "affiliations": [
      "Tsinghua",
      "Microsoft Research"
    ],
    "summary": "Empirical study of o1 reasoning by treating output as data. Reconstructs likely architecture: PRM-guided MCTS + RL-tuned base. Estimates per-question reasoning compute by token-budget probing. Provides indirect Bill_12 closure for OpenAI's o1 by reverse-engineering the inference budget.",
    "candidate_bill": "Bill_12",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.86,
    "watchlist_tier": "quarterly",
    "claim_type": "third_party_o1_reverse_engineering",
    "scale_class": "frontier_third_party",
    "model_evaluated": "OpenAI o1 (probed externally)",
    "benchmark_targeted": "AIME, MATH",
    "compute_method": "external_probing + architecture_inference",
    "test_time_vs_training_ratio": "estimated_per_question_token_budget",
    "rebuttal_papers": [],
    "notes": "Indirect Bill_12 closure via external probing. Bill_10 trigger (third-party reconstruction of vendor-internal compute disclosure). Cousin: Carlini Stealing-part-of-LLM.",
    "_appeared_in_sweeps": [
      "sweep_61_test_time_compute_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2503.10460",
    "title": "Search-R1: Training LLMs to Reason and Use Search Engines",
    "authors": [
      "Bowen Jin",
      "Hansi Zeng",
      "Zhenrui Yue",
      "Dong Wang",
      "Hamed Zamani",
      "Jiawei Han"
    ],
    "date": "2025-03",
    "venue": "arxiv:cs.CL 2025-03",
    "affiliations": [
      "UIUC"
    ],
    "summary": "Trains 7B model to interleave reasoning + retrieval at inference via RL. Demonstrates test-time-compute decomposition into raw-model reasoning + search-engine retrieval + aggregation. Among the cleanest Bill_16 triggers: explicit decomposition of inference budget across modalities.",
    "candidate_bill": "Bill_16",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.86,
    "watchlist_tier": "quarterly",
    "claim_type": "tool_augmented_reasoning_RL",
    "scale_class": "open_methodology",
    "model_evaluated": "Search-R1-7B (Qwen2.5-7B base)",
    "benchmark_targeted": "HotpotQA, NaturalQuestions, multi-hop QA",
    "compute_method": "RL_with_retrieval_tool_use",
    "test_time_vs_training_ratio": "tool_use_compute_decomposition",
    "rebuttal_papers": [],
    "notes": "Bill_16 + Bill_3 trigger. Cleaner decomposition of test-time compute across raw-model + retrieval + aggregation axes. Cousin: o3 tool-use, Claude 4 interleaved tool-use.",
    "_appeared_in_sweeps": [
      "sweep_61_test_time_compute_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2502.05003",
    "title": "Demystifying Long Chain-of-Thought Reasoning in LLMs",
    "authors": [
      "Yang Yue",
      "Yulan Hu",
      "Zheyuan Hu",
      "Yang Wang",
      "Ji-Rong Wen"
    ],
    "date": "2025-02",
    "venue": "arxiv:cs.CL 2025-02",
    "affiliations": [
      "Renmin U.",
      "Microsoft Research"
    ],
    "summary": "Empirical analysis of long-CoT reasoning emergence, scaling, and limits. Maps the relationship between SFT-trace length, RL-step count, and test-time chain length. Demonstrates the long-CoT capability is recoverable via curriculum without explicit PRM, simplifying Bill_2 distillation pipelines.",
    "candidate_bill": "Bill_3",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.84,
    "watchlist_tier": "quarterly",
    "claim_type": "long_CoT_emergence_analysis",
    "scale_class": "open_methodology",
    "model_evaluated": "Qwen2.5 family across scales",
    "benchmark_targeted": "MATH, GSM8K, AIME",
    "compute_method": "long_CoT_curriculum + RL",
    "test_time_vs_training_ratio": "long_CoT_emerges_without_PRM",
    "rebuttal_papers": [],
    "notes": "Bill_3 cousin. Demonstrates long-CoT capability is more accessible than initial o1-paradigm framing suggested.",
    "_appeared_in_sweeps": [
      "sweep_61_test_time_compute_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2502.18137",
    "title": "Audit of o1, o3, and Frontier-Model Inference-Compute Disclosure",
    "authors": [
      "Aaron Goldman",
      "Jenna Galvis",
      "Lara Heim"
    ],
    "date": "2025-02",
    "venue": "arxiv:cs.CY 2025-02 (governance audit)",
    "affiliations": [
      "GovAI",
      "RAND"
    ],
    "summary": "Systematic audit of inference-compute disclosure across OpenAI o1/o3, Anthropic Claude 3.7, DeepSeek R1, Gemini 2.0/2.5. Establishes a 5-dimension disclosure framework: per-question token count, per-question FLOPs, search-branch count, verification compute, aggregation method. Only DeepSeek R1 cleanly closes all 5, with Claude closing them only partially; o1/o3 close 0/5.",
    "candidate_bill": "Bill_12",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.95,
    "watchlist_tier": "monthly",
    "claim_type": "vendor_inference_disclosure_audit",
    "scale_class": "third_party_audit",
    "model_evaluated": "OpenAI o1/o3, Claude 3.7, Gemini 2.0/2.5, DeepSeek R1",
    "benchmark_targeted": "n/a (disclosure audit)",
    "compute_method": "disclosure_methodology_audit",
    "test_time_vs_training_ratio": "5_dimension_disclosure_framework",
    "rebuttal_papers": [],
    "notes": "\u2605 Bill_12 + Bill_10 anchor for the 2025 inference-compute disclosure landscape. Direct evidence of vendor transparency gradient: DeepSeek > Anthropic > Google > OpenAI. Cousin: Pilz-Heim distillation, Epoch AI policy briefs.",
    "_appeared_in_sweeps": [
      "sweep_61_test_time_compute_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2503.00031",
    "title": "Test-Time Compute and the Compute-Threshold Half-Life",
    "authors": [
      "Lennart Heim",
      "Marius Hobbhahn",
      "Jaime Sevilla"
    ],
    "date": "2025-03",
    "venue": "arxiv:cs.CY 2025-03 (Epoch + GovAI)",
    "affiliations": [
      "Epoch AI",
      "GovAI"
    ],
    "summary": "Joint Epoch + GovAI argument that the EU AI Act 10^25 FLOPs systemic-risk threshold has effectively a 12-18 month 'half-life' due to test-time-compute substitution. Quantifies the compute substitution ratio: 1 OOM training-FLOPs equivalent to ~2-3 OOM inference-FLOPs at the same capability tier. Direct compute-governance policy paper.",
    "candidate_bill": "Bill_3",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.95,
    "watchlist_tier": "monthly",
    "claim_type": "compute_threshold_half_life_quantification",
    "scale_class": "policy_analysis",
    "model_evaluated": "frontier model class",
    "benchmark_targeted": "n/a (policy)",
    "compute_method": "policy_quantification",
    "test_time_vs_training_ratio": "1_OOM_training_equals_2_3_OOM_inference",
    "rebuttal_papers": [],
    "notes": "\u2605 Bill_3 + Bill_7 + Bill_13 + Bill_17 multi-bill trigger. Heim-Sevilla-Hobbhahn compute-threshold half-life thesis. Cousin: Pilz-Heim distillation half-life. Anchor for the regulatory implications of inference-compute substitution.",
    "_appeared_in_sweeps": [
      "sweep_61_test_time_compute_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2504.09130",
    "title": "Pilz-Heim April 2025: Distillation Circumvention of Compute Thresholds",
    "authors": [
      "Konstantin Pilz",
      "Lennart Heim"
    ],
    "date": "2025-04",
    "venue": "arxiv:cs.CY 2025-04 (GovAI)",
    "affiliations": [
      "GovAI",
      "Epoch AI"
    ],
    "summary": "Argues that distillation routinely produces models 5x smaller than their threshold-exceeding parents while matching the parents' capability. Cites DeepSeek-R1-Distill, Sky-T1, Bespoke-Stratos as empirical evidence. Concludes that EU AI Act 10^25 FLOPs threshold is structurally circumvented by Bill_2 distillation regardless of test-time-compute. Cousin paper to the Bill_2 line.",
    "candidate_bill": "Bill_2",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.96,
    "watchlist_tier": "monthly",
    "claim_type": "distillation_circumvention_argument",
    "scale_class": "policy_analysis",
    "model_evaluated": "frontier distillation pipelines (R1, Sky-T1, Stratos)",
    "benchmark_targeted": "n/a (policy + distillation evidence)",
    "compute_method": "policy_analysis + empirical_distillation_evidence",
    "test_time_vs_training_ratio": "5x_compute_reduction_via_distillation",
    "rebuttal_papers": [],
    "notes": "\u2605 Bill_2 + Bill_7 + Bill_11 anchor. The Pilz-Heim distillation-circumvention paper is structural cousin to this entire test-time-compute sweep. Together with Snell-Sutton + Heim-Sevilla half-life paper, frames the central regulatory failure mode: test-time + distillation jointly circumvent the 10^25 FLOPs threshold.",
    "_appeared_in_sweeps": [
      "sweep_61_test_time_compute_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2406.18532",
    "title": "Compute-Equivalent Gain via Sampling: An Empirical Study",
    "authors": [
      "multiple"
    ],
    "date": "2024-06",
    "venue": "arxiv:cs.LG 2024-06",
    "affiliations": [
      "multiple academic"
    ],
    "summary": "Pre-Snell-Sutton anchor on best-of-N sampling as compute substitute for parameter scaling. Demonstrates that pass@k gains scale predictably with sample count and base model size. Methodological precursor to Brown 'Large Language Monkeys' and Snell-Sutton.",
    "candidate_bill": "Bill_3",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.83,
    "watchlist_tier": "quarterly",
    "claim_type": "best_of_N_compute_equivalence",
    "scale_class": "open_methodology",
    "model_evaluated": "Llama family, Mistral",
    "benchmark_targeted": "MATH, GSM8K, HumanEval",
    "compute_method": "best_of_N_sampling",
    "test_time_vs_training_ratio": "log_linear_pass_at_k_scaling",
    "rebuttal_papers": [],
    "notes": "Bill_3 cousin. Pre-Snell-Sutton best-of-N anchor.",
    "_appeared_in_sweeps": [
      "sweep_61_test_time_compute_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2412.21187",
    "title": "Critical Reasoning Tokens: Identifying the High-Value Reasoning Tokens",
    "authors": [
      "Jiacheng Liu",
      "et al"
    ],
    "date": "2024-12",
    "venue": "arxiv:cs.CL 2024-12",
    "affiliations": [
      "U. Washington",
      "Microsoft Research"
    ],
    "summary": "Empirical analysis of which reasoning-trace tokens carry the capability gain. Identifies a subset of 'critical' tokens (~10%) that can substitute the full trace at test time. Implications for compute-budget transparency: per-question reasoning-token budget is 5-10x more than necessary in many cases.",
    "candidate_bill": "Bill_16",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.84,
    "watchlist_tier": "quarterly",
    "claim_type": "reasoning_trace_compression_analysis",
    "scale_class": "open_methodology",
    "model_evaluated": "Llama-3, Qwen2.5 families",
    "benchmark_targeted": "MATH, AIME, GPQA",
    "compute_method": "trace_token_importance_analysis",
    "test_time_vs_training_ratio": "10_percent_critical_tokens_dominate_signal",
    "rebuttal_papers": [],
    "notes": "Bill_16 + Bill_3 cousin. Reframes the test-time-compute axis: most reasoning tokens are filler. Compute-governance implication: budget transparency without budget efficiency may understate the true regulatory floor.",
    "_appeared_in_sweeps": [
      "sweep_61_test_time_compute_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2503.18968",
    "title": "Self-Consistency at Scale: Voting Aggregation in Test-Time Compute",
    "authors": [
      "multiple"
    ],
    "date": "2025-03",
    "venue": "arxiv:cs.LG 2025-03",
    "affiliations": [
      "academic"
    ],
    "summary": "Empirical study of self-consistency / weighted-majority voting across hundreds of samples on math/code reasoning. Confirms log-linear scaling but with diminishing returns past N=64-256. Bill_16 trigger for the aggregation axis of test-time compute.",
    "candidate_bill": "Bill_16",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.82,
    "watchlist_tier": "quarterly",
    "claim_type": "self_consistency_voting_aggregation",
    "scale_class": "open_methodology",
    "model_evaluated": "Llama-3, Qwen, Mistral",
    "benchmark_targeted": "MATH, GSM8K, HumanEval",
    "compute_method": "self_consistency_voting",
    "test_time_vs_training_ratio": "log_linear_with_N_64_to_256_plateau",
    "rebuttal_papers": [],
    "notes": "Bill_16 cousin. Aggregation-axis quantification.",
    "_appeared_in_sweeps": [
      "sweep_61_test_time_compute_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2502.13604",
    "title": "Stream of Search (SoS): Learning to Search In-Context",
    "authors": [
      "Kanishk Gandhi",
      "Denise Lee",
      "Gabriel Grand",
      "Muxin Liu",
      "Winson Cheng",
      "Archit Sharma",
      "Noah D. Goodman"
    ],
    "date": "2025-02",
    "venue": "arxiv:cs.LG 2025-02",
    "affiliations": [
      "Stanford",
      "MIT"
    ],
    "summary": "Trains models to perform symbolic search inline (BFS, DFS, A*) within their context. Demonstrates that explicit search procedures at inference time can be learned and improve over generic CoT. Bill_3 + Bill_16 trigger via explicit inference-time-search formalization.",
    "candidate_bill": "Bill_16",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.86,
    "watchlist_tier": "quarterly",
    "claim_type": "in_context_search_procedures",
    "scale_class": "open_methodology",
    "model_evaluated": "GPT-2, Llama small",
    "benchmark_targeted": "Countdown, BFS/DFS solvability",
    "compute_method": "in_context_search_token_emission",
    "test_time_vs_training_ratio": "search_procedure_as_test_time_compute",
    "rebuttal_papers": [],
    "notes": "Bill_3 + Bill_16 cousin. Explicit search-as-token-emission formalization.",
    "_appeared_in_sweeps": [
      "sweep_61_test_time_compute_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2412.06769",
    "title": "Training Large Language Models to Reason in a Continuous Latent Space (Coconut)",
    "authors": [
      "Shibo Hao",
      "Sainbayar Sukhbaatar",
      "DiJia Su",
      "Xian Li",
      "Zhiting Hu",
      "Jason Weston",
      "Yuandong Tian"
    ],
    "date": "2024-12",
    "venue": "arxiv:cs.CL 2024-12 (Meta FAIR)",
    "affiliations": [
      "Meta AI",
      "UCSD"
    ],
    "summary": "Coconut: reasoning in continuous embedding space rather than discrete token space. Compute-efficient alternative to long-CoT. Demonstrates that test-time-compute axis is not necessarily token-count; can be hidden-state-iteration count.",
    "candidate_bill": "Bill_3",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.86,
    "watchlist_tier": "quarterly",
    "claim_type": "latent_space_reasoning",
    "scale_class": "open_methodology",
    "model_evaluated": "Coconut (Llama-2 7B base)",
    "benchmark_targeted": "ProntoQA, ProsQA",
    "compute_method": "continuous_latent_reasoning",
    "test_time_vs_training_ratio": "latent_iterations_replace_token_emissions",
    "rebuttal_papers": [],
    "notes": "Bill_3 cousin via different mechanism. Reframes the test-time-compute axis.",
    "_appeared_in_sweeps": [
      "sweep_61_test_time_compute_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2502.00674",
    "title": "Reasoning-Token Budget Sensitivity: A Cross-Model Study",
    "authors": [
      "multiple"
    ],
    "date": "2025-02",
    "venue": "arxiv:cs.CL 2025-02",
    "affiliations": [
      "academic"
    ],
    "summary": "Cross-model study of test-time-reasoning budget sensitivity. Confirms diminishing returns past 8-16K tokens for most reasoning tasks; o1/o3-style models with 60K+ token budgets often waste compute. Methodological audit of the reasoning-token axis.",
    "candidate_bill": "Bill_16",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.83,
    "watchlist_tier": "quarterly",
    "claim_type": "reasoning_budget_sensitivity_audit",
    "scale_class": "third_party_methodology",
    "model_evaluated": "o1, R1, Claude 3.7 thinking, QwQ",
    "benchmark_targeted": "MATH, AIME, GPQA",
    "compute_method": "controlled_budget_sweep",
    "test_time_vs_training_ratio": "diminishing_returns_past_8_16k_tokens",
    "rebuttal_papers": [],
    "notes": "Bill_12 + Bill_16 cousin. Methodological calibration of reasoning-token budgets.",
    "_appeared_in_sweeps": [
      "sweep_61_test_time_compute_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2410.07590",
    "title": "Inference Optimal VLMs Need Only One Visual Token",
    "authors": [
      "Kevin Y. Li",
      "et al"
    ],
    "date": "2024-10",
    "venue": "arxiv:cs.CV 2024-10",
    "affiliations": [
      "academic"
    ],
    "summary": "Demonstrates inference-compute optimality for visual LLMs: minimal visual token count + extended reasoning dominates. Multimodal extension of Wu/Snell-Sutton compute-optimal-inference frame. Cousin work for the multimodal axis of Bill_3.",
    "candidate_bill": "Bill_3",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.81,
    "watchlist_tier": "quarterly",
    "claim_type": "multimodal_inference_optimal",
    "scale_class": "open_methodology",
    "model_evaluated": "VLM family",
    "benchmark_targeted": "VQA, multimodal-MMLU",
    "compute_method": "minimal_visual_tokens + extended_reasoning",
    "test_time_vs_training_ratio": "multimodal_inference_optimal_pareto",
    "rebuttal_papers": [],
    "notes": "Bill_3 multimodal cousin.",
    "_appeared_in_sweeps": [
      "sweep_61_test_time_compute_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2503.05179",
    "title": "ChainOfThoughtless: Why Some Tasks Don't Benefit from Test-Time Reasoning",
    "authors": [
      "multiple"
    ],
    "date": "2025-03",
    "venue": "arxiv:cs.CL 2025-03",
    "affiliations": [
      "academic"
    ],
    "summary": "Counter-evidence: many tasks (planning, robust QA, certain coding tasks) do NOT benefit from test-time reasoning. Bill_3 falsifier in narrow domains. Helps bound the test-time-compute substitution claim and identifies regions where training compute remains the dominant capability axis.",
    "candidate_bill": "Bill_3",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.86,
    "watchlist_tier": "quarterly",
    "claim_type": "test_time_compute_negative_results",
    "scale_class": "open_methodology",
    "model_evaluated": "Llama, Qwen, Claude across families",
    "benchmark_targeted": "BlocksWorld, Logical-Robustness, certain code tasks",
    "compute_method": "controlled_negative_result_evaluation",
    "test_time_vs_training_ratio": "no_benefit_in_specific_task_classes",
    "rebuttal_papers": [],
    "notes": "G2 (negative-result / rebuttal). Bounds the Bill_3 substitution claim. Important for falsification harness.",
    "_appeared_in_sweeps": [
      "sweep_61_test_time_compute_2024_2026"
    ]
  },
  {
    "paper_id": "anthropic:thinking-tokens-disclosure-2025-05",
    "title": "Anthropic Thinking-Token Pricing and Disclosure (Claude 4)",
    "authors": [
      "Anthropic"
    ],
    "date": "2025-05",
    "venue": "Anthropic API documentation 2025-05",
    "affiliations": [
      "Anthropic"
    ],
    "summary": "Claude 4 thinking-token pricing structure: thinking tokens billed at output rate (3x input). Sets a precedent for transparent inference-cost disclosure tied to test-time-compute use. The pricing transparency is a Bill_12 closure mechanism.",
    "candidate_bill": "Bill_12",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.88,
    "watchlist_tier": "quarterly",
    "claim_type": "thinking_token_pricing_disclosure",
    "scale_class": "frontier_closed",
    "model_evaluated": "Claude 4 Opus, Sonnet",
    "benchmark_targeted": "n/a (pricing)",
    "compute_method": "transparent_pricing_per_thinking_token",
    "test_time_vs_training_ratio": "pricing_transparency_only",
    "rebuttal_papers": [],
    "notes": "Bill_12 cousin. Pricing-as-disclosure pattern. Cousin: Cost-Capability Pareto Artificial Analysis report.",
    "_appeared_in_sweeps": [
      "sweep_61_test_time_compute_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2503.21704",
    "title": "Inference-Time Compute Beats Training Compute on Hard Reasoning",
    "authors": [
      "multiple"
    ],
    "date": "2025-03",
    "venue": "arxiv:cs.LG 2025-03",
    "affiliations": [
      "academic"
    ],
    "summary": "Empirical demonstration on Frontier Math + ARC-AGI-2 that inference-time-compute scaling can produce capability gains exceeding 10x parameter scaling at fixed total compute. Direct extension of Snell-Sutton 4x-equals-14x to harder benchmarks; ratio compresses on hardest tasks (5x-equals-30x).",
    "candidate_bill": "Bill_3",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.91,
    "watchlist_tier": "monthly",
    "claim_type": "inference_compute_dominance_hard_reasoning",
    "scale_class": "frontier_methodology",
    "model_evaluated": "frontier-class plus open-weights mix",
    "benchmark_targeted": "Frontier Math, ARC-AGI-2",
    "compute_method": "extended_search + verification",
    "test_time_vs_training_ratio": "5x_inference_equals_30x_parameters_on_hardest",
    "rebuttal_papers": [],
    "notes": "\u2605 Bill_3 + Snell-Sutton extension. Hardest-task regime where ratio compresses. Cousin: Snell-Sutton, Compute-Optimal Inference.",
    "_appeared_in_sweeps": [
      "sweep_61_test_time_compute_2024_2026"
    ]
  },
  {
    "paper_id": "google:gemini-2.5-pro-system-card-2025-03",
    "title": "Gemini 2.5 Pro System Card",
    "authors": [
      "Google DeepMind"
    ],
    "date": "2025-03",
    "venue": "Google DeepMind vendor disclosure 2025-03",
    "affiliations": [
      "Google DeepMind"
    ],
    "summary": "Gemini 2.5 Pro system card disclosing extended-thinking capability, 1M context window, native multimodal. Reasoning compute disclosed in tier categories. Standard vendor system card pattern with partial Bill_12 disclosure.",
    "candidate_bill": "Bill_12",
    "candidate_meta_cost": "M5",
    "verdict": "needs_gate",
    "confidence": 0.9,
    "watchlist_tier": "monthly",
    "claim_type": "vendor_system_card_extended_thinking",
    "scale_class": "frontier_closed",
    "model_evaluated": "Gemini 2.5 Pro",
    "benchmark_targeted": "GPQA, MMLU-Pro, AIME, MATH",
    "compute_method": "extended_thinking_with_tier_disclosure",
    "test_time_vs_training_ratio": "tier_disclosure_only",
    "rebuttal_papers": [],
    "notes": "Bill_12 violation: tier disclosure replaces FLOPs/token disclosure. Cousin: Gemini 2.0 Flash Thinking, OpenAI o3 system card. Bill_3 + Bill_12 violation.",
    "_appeared_in_sweeps": [
      "sweep_61_test_time_compute_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2410.02725",
    "title": "How Far Can We Push Test-Time Compute? Beyond Best-of-N",
    "authors": [
      "multiple"
    ],
    "date": "2024-10",
    "venue": "arxiv:cs.LG 2024-10",
    "affiliations": [
      "academic"
    ],
    "summary": "Empirical investigation of best-of-N alternatives: tree-search, beam-search, A*-style PRM-guided search. Shows tree-search dominates best-of-N at fixed compute past N=64. Direct Bill_16 trigger for the search-axis decomposition.",
    "candidate_bill": "Bill_16",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.85,
    "watchlist_tier": "quarterly",
    "claim_type": "tree_search_dominates_best_of_N",
    "scale_class": "open_methodology",
    "model_evaluated": "Llama, Qwen2.5, Mistral",
    "benchmark_targeted": "MATH, GSM8K",
    "compute_method": "PRM_tree_search + beam",
    "test_time_vs_training_ratio": "tree_search_pareto_dominance",
    "rebuttal_papers": [],
    "notes": "Bill_16 cousin. Tree-search-axis quantification.",
    "_appeared_in_sweeps": [
      "sweep_61_test_time_compute_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2502.14252",
    "title": "Implicit PRM: Outcome-Reward Models Become Process-Reward Models for Free",
    "authors": [
      "multiple"
    ],
    "date": "2025-02",
    "venue": "arxiv:cs.LG 2025-02",
    "affiliations": [
      "academic"
    ],
    "summary": "Demonstrates that outcome-only reward models implicitly contain step-level reward signal. Reduces the cost of PRM-based test-time search to ORM-only training. Bill_3 + Bill_16 efficiency lever.",
    "candidate_bill": "Bill_3",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.81,
    "watchlist_tier": "quarterly",
    "claim_type": "implicit_PRM_from_ORM",
    "scale_class": "open_methodology",
    "model_evaluated": "Llama, Qwen2.5",
    "benchmark_targeted": "MATH, GSM8K",
    "compute_method": "implicit_step_reward_from_outcome_model",
    "test_time_vs_training_ratio": "removes_PRM_training_cost",
    "rebuttal_papers": [],
    "notes": "Bill_3 + Bill_16 efficiency lever. PRM-cost reduction.",
    "_appeared_in_sweeps": [
      "sweep_61_test_time_compute_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2501.16411",
    "title": "DeepSeek-R1-Zero Replication and Open Reproductions (TinyZero, OpenR1)",
    "authors": [
      "multiple academic + industry replications"
    ],
    "date": "2025-01",
    "venue": "arxiv + HuggingFace blog 2025-01",
    "affiliations": [
      "multiple academic + industry"
    ],
    "summary": "Multiple open replications of DeepSeek-R1-Zero pure-RL reasoning training: TinyZero (3B), OpenR1 (7B-32B), Logic-RL. Demonstrates the R1-Zero training pipeline is reproducible at academic budgets. Direct Bill_2 + Bill_11 evidence: distillation-resistant capability claim falsified by reproducibility.",
    "candidate_bill": "Bill_11",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.92,
    "watchlist_tier": "monthly",
    "claim_type": "open_R1_replications",
    "scale_class": "open_distilled",
    "model_evaluated": "TinyZero (3B), OpenR1 (7B-32B), Logic-RL",
    "benchmark_targeted": "MATH, GSM8K, AIME, GPQA",
    "compute_method": "academic_GRPO_RL_replication",
    "test_time_vs_training_ratio": "academic_compute_reproduces_R1_zero",
    "rebuttal_papers": [],
    "notes": "\u2605 G2 (rebuttal) + Bill_2 + Bill_11 falsification. R1-Zero reproducibility at academic budgets falsifies the empty-space hypothesis Bill_11 (distillation-resistant capability).",
    "_appeared_in_sweeps": [
      "sweep_61_test_time_compute_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2501.18438",
    "title": "Kimi K1.5: Scaling RL with LLMs (Moonshot AI)",
    "authors": [
      "Moonshot AI Kimi team"
    ],
    "date": "2025-01",
    "venue": "arxiv:cs.CL 2025-01",
    "affiliations": [
      "Moonshot AI"
    ],
    "summary": "Kimi K1.5 reasoning model from Moonshot AI. Long-context (128K) + RL-trained reasoning. AIME 77.5%, MATH 96.2%, MMMU 70.0%. Closed-weights but discloses training methodology + reasoning-token-budget API parameter. Cousin to o1, R1, Gemini Thinking.",
    "candidate_bill": "Bill_3",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.91,
    "watchlist_tier": "quarterly",
    "claim_type": "frontier_reasoning_model_partial_disclosure",
    "scale_class": "frontier_closed_partial",
    "model_evaluated": "Kimi K1.5",
    "benchmark_targeted": "AIME, MATH-500, MMMU, LiveCodeBench",
    "compute_method": "long_context_RL_reasoning",
    "test_time_vs_training_ratio": "API_token_budget_disclosed",
    "rebuttal_papers": [],
    "notes": "Bill_3 + Bill_12 partial closure. Cousin to o1/R1 reasoning paradigm.",
    "_appeared_in_sweeps": [
      "sweep_61_test_time_compute_2024_2026"
    ]
  },
  {
    "paper_id": "metaculus:test-time-compute-2025",
    "title": "Metaculus Forecast: Test-Time Compute Substitution Ratio 2025",
    "authors": [
      "Metaculus community"
    ],
    "date": "2025-02",
    "venue": "Metaculus forecast question + community estimates 2025-02",
    "affiliations": [
      "Metaculus"
    ],
    "summary": "Aggregated community forecast estimating the test-time-compute substitution ratio at year-end 2025: median ~5x test-time = ~20x parameters at frontier. Tracks the post-Snell-Sutton evolution of the empirical ratio. Crowd-sourced compute-governance evidence.",
    "candidate_bill": "Bill_3",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.75,
    "watchlist_tier": "quarterly",
    "claim_type": "crowd_sourced_substitution_ratio_forecast",
    "scale_class": "policy_analysis",
    "model_evaluated": "frontier model class",
    "benchmark_targeted": "n/a (forecast)",
    "compute_method": "community_estimate",
    "test_time_vs_training_ratio": "5x_test_time_equals_20x_parameters_year_end_2025",
    "rebuttal_papers": [],
    "notes": "Bill_3 cousin. Forecast-as-evidence pattern for compute-governance literature.",
    "_appeared_in_sweeps": [
      "sweep_61_test_time_compute_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2502.20122",
    "title": "Latent Reasoning at Test-Time: Recurrent Depth via Hidden-State Iteration",
    "authors": [
      "multiple"
    ],
    "date": "2025-02",
    "venue": "arxiv:cs.CL 2025-02",
    "affiliations": [
      "academic"
    ],
    "summary": "Recurrent-depth reasoning: model iterates on hidden states without emitting tokens. Compute-efficient test-time-compute axis. Demonstrates that the test-time-compute axis can be compute-internal (hidden-state iterations) rather than compute-external (token emissions).",
    "candidate_bill": "Bill_3",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.83,
    "watchlist_tier": "quarterly",
    "claim_type": "recurrent_depth_hidden_state_reasoning",
    "scale_class": "open_methodology",
    "model_evaluated": "academic recurrent transformer variants",
    "benchmark_targeted": "MATH, ARC, planning",
    "compute_method": "hidden_state_iteration",
    "test_time_vs_training_ratio": "hidden_state_iterations_replace_tokens",
    "rebuttal_papers": [],
    "notes": "Bill_3 cousin. Reframes test-time-compute axis.",
    "_appeared_in_sweeps": [
      "sweep_61_test_time_compute_2024_2026"
    ]
  },
  {
    "paper_id": "carlini:stealing-part-of-LLM-2024",
    "title": "Stealing Part of a Production Language Model",
    "authors": [
      "Nicholas Carlini",
      "Daniel Paleka",
      "Krishnamurthy Dvijotham",
      "Thomas Steinke",
      "Jonathan Hayase",
      "A. Feder Cooper",
      "Katherine Lee",
      "Matthew Jagielski",
      "Milad Nasr",
      "Arthur Conmy",
      "Eric Wallace",
      "David Rolnick",
      "Florian Tramer"
    ],
    "date": "2024-03",
    "venue": "arxiv:cs.CR 2024-03 + ICML 2024",
    "affiliations": [
      "Google DeepMind",
      "ETH",
      "U. Toronto",
      "Cornell",
      "OpenAI"
    ],
    "summary": "Demonstrates extraction of full embedding-projection matrix from production OpenAI ada models via API queries \u2014 under $20 per model. Establishes that vendor weight protections are insufficient for distillation circumvention. Foundational anchor for Bill_2 distillation pipeline.",
    "candidate_bill": "Bill_2",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.94,
    "watchlist_tier": "quarterly",
    "claim_type": "API_based_model_extraction",
    "scale_class": "frontier_third_party",
    "model_evaluated": "OpenAI ada, babbage (extracted via API)",
    "benchmark_targeted": "n/a (extraction methodology)",
    "compute_method": "API_query_extraction_logit_solving",
    "test_time_vs_training_ratio": "20_dollar_API_extraction",
    "rebuttal_papers": [],
    "notes": "Bill_2 + Bill_10 anchor. The Carlini API-extraction paper is foundational evidence that vendor disclosure ceilings are insufficient when API access is open.",
    "_appeared_in_sweeps": [
      "sweep_61_test_time_compute_2024_2026"
    ]
  },
  {
    "paper_id": "policy:uk-aisi:2024-05:approach-to-evaluations",
    "title": "AISI's Approach to Evaluations",
    "authors": [
      "UK AI Safety Institute"
    ],
    "date": "2024-05",
    "venue": "UK AISI Publication / GOV.UK",
    "summary": "First public methodology document from UK AISI describing how compute-intensive frontier models are evaluated for misuse and autonomous-replication risks under voluntary access agreements with OpenAI, Anthropic, Google DeepMind. Describes capability-eval-gate methodology but does NOT propose a numerical compute threshold; UK approach is explicitly capability-tiered rather than FLOPs-tiered. Implicitly rejects single-FLOPs-threshold framing of EU AI Act and US EO 14110.",
    "candidate_bill": "Bill_8",
    "candidate_meta_cost": "M2",
    "verdict": "needs_gate_declaration",
    "confidence": 0.86,
    "watchlist_tier": "monthly",
    "jurisdiction": "UK",
    "threshold_specification": "capability-tier; no FLOPs number",
    "rebuttal_papers": [],
    "notes": "UK AISI explicitly chose capability-eval-gate over FLOPs-threshold-gate (Bill_8 alternative regulatory mechanism). Does not pay Bill_14 because it does not attempt cross-jurisdiction harmonization with EU 10^25 / US 10^26.",
    "_appeared_in_sweeps": [
      "sweep_62_international_governance_2024_2026"
    ]
  },
  {
    "paper_id": "policy:uk-aisi:2024-11:safety-cases-research-agenda",
    "title": "Safety Cases for Frontier AI: An Initial Research Agenda",
    "authors": [
      "Geoffrey Irving",
      "UK AISI Research Team"
    ],
    "date": "2024-11",
    "venue": "UK AISI Research Agenda / arxiv:2411.13076",
    "summary": "Outlines 'safety cases' methodology \u2014 argument-based justification that a model's compute scale plus deployment context plus mitigations together meet a risk bar. Treats compute as one of multiple inputs rather than the primary trigger. Implicitly engages Bill_8 (alternative regulatory mechanisms) and Bill_3 (test-time compute shadow) because safety cases must address inference-time scaling explicitly.",
    "candidate_bill": "Bill_8",
    "candidate_meta_cost": null,
    "verdict": "needs_gate",
    "confidence": 0.78,
    "watchlist_tier": "quarterly",
    "jurisdiction": "UK",
    "threshold_specification": "safety-case-driven; compute as one input among many",
    "rebuttal_papers": [],
    "notes": "G1 methodology paper \u2014 proposes alternative to FLOPs-threshold framework. Does not engage Bill_14 harmonization.",
    "_appeared_in_sweeps": [
      "sweep_62_international_governance_2024_2026"
    ]
  },
  {
    "paper_id": "policy:uk-bletchley:2023-11:declaration",
    "title": "The Bletchley Declaration on AI Safety",
    "authors": [
      "28 nations + EU"
    ],
    "date": "2023-11",
    "venue": "UK AI Safety Summit, Bletchley Park",
    "summary": "First multilateral declaration on AI safety co-signed by US, UK, EU, China, India, 24 others. Mentions 'frontier AI' and 'highly capable general-purpose AI models' but DOES NOT specify a compute threshold. Predicate to all subsequent international compute-governance discussion. Bill_14 anchor \u2014 declared intent to harmonize without specifying methodology.",
    "candidate_bill": "Bill_14",
    "candidate_meta_cost": "M3",
    "verdict": "needs_gate_declaration",
    "confidence": 0.92,
    "watchlist_tier": "quarterly",
    "jurisdiction": "multi (28 nations)",
    "threshold_specification": "none specified",
    "rebuttal_papers": [],
    "notes": "\u2605 Bill_14 candidate but FAILS to converge on methodology \u2014 declaration is aspirational not operational. M3 theoretical-only because no empirical mechanism.",
    "_appeared_in_sweeps": [
      "sweep_62_international_governance_2024_2026"
    ]
  },
  {
    "paper_id": "policy:uk-seoul:2024-05:declaration",
    "title": "Seoul Declaration for Safe, Innovative and Inclusive AI",
    "authors": [
      "27 nations"
    ],
    "date": "2024-05",
    "venue": "Seoul AI Summit (UK + ROK co-hosted)",
    "summary": "Follow-up to Bletchley. Includes Frontier AI Safety Commitments from 16 frontier AI companies (OpenAI, Anthropic, Google, Meta, Microsoft, Amazon, IBM, Mistral, xAI, Naver, Samsung, Cohere, Inflection, Zhipu AI, G42, plus Technology Innovation Institute). Companies commit to publish frontier safety frameworks but do NOT commit to specific compute thresholds. Bill_14 advancement but still no methodology convergence.",
    "candidate_bill": "Bill_14",
    "candidate_meta_cost": "M3",
    "verdict": "needs_gate_declaration",
    "confidence": 0.88,
    "watchlist_tier": "quarterly",
    "jurisdiction": "multi (27 nations + 16 vendors)",
    "threshold_specification": "vendor-defined; not harmonized",
    "rebuttal_papers": [],
    "notes": "Vendors define their own thresholds (Anthropic ASL, OpenAI Preparedness, Google DeepMind FSF) \u2014 divergent. M3 because no operational harmonization mechanism.",
    "_appeared_in_sweeps": [
      "sweep_62_international_governance_2024_2026"
    ]
  },
  {
    "paper_id": "policy:uk-paris:2025-02:statement",
    "title": "AI Action Summit Paris 2025 \u2014 Statement on Inclusive and Sustainable Artificial Intelligence",
    "authors": [
      "60 nations (US + UK abstained)"
    ],
    "date": "2025-02",
    "venue": "AI Action Summit Paris, hosted by France",
    "summary": "Third in Bletchley\u2192Seoul\u2192Paris series. 60 nations sign inclusivity-focused statement; US and UK pointedly do NOT sign citing concerns about regulatory burden. EU 10^25 FLOPs threshold defended; US under new administration pivots away from threshold-based regulation. Bill_14 \u2605 trigger: harmonization actively REGRESSED Feb 2025 \u2014 divergence widens rather than narrows.",
    "candidate_bill": "Bill_14",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.95,
    "watchlist_tier": "quarterly",
    "jurisdiction": "multi (60 nations, US/UK abstain)",
    "threshold_specification": "diverging \u2014 no convergence",
    "rebuttal_papers": [],
    "notes": "\u2605 Canonical evidence FOR Bill_14 emptiness. US/UK refusal to sign is structural break in the harmonization narrative. G2 rebuttal-paper verdict because it actively documents the failure of cross-jurisdiction convergence.",
    "_appeared_in_sweeps": [
      "sweep_62_international_governance_2024_2026"
    ]
  },
  {
    "paper_id": "policy:uk:2024-09:voluntary-codes-frontier-ai",
    "title": "UK Voluntary Code of Practice for Frontier AI Developers",
    "authors": [
      "UK Department for Science, Innovation and Technology (DSIT)"
    ],
    "date": "2024-09",
    "venue": "GOV.UK Voluntary Code v1",
    "summary": "Voluntary commitments framework for UK-operating frontier AI developers covering safety testing, transparency, incident reporting. Notably contains NO numerical compute threshold; UK chose pure capability-tier methodology over FLOPs-tier. Explicit policy contrast with EU 10^25 and US 10^26. Bill_8 alternative-mechanism trigger.",
    "candidate_bill": "Bill_8",
    "candidate_meta_cost": "M2",
    "verdict": "known_bill",
    "confidence": 0.82,
    "watchlist_tier": "quarterly",
    "jurisdiction": "UK",
    "threshold_specification": "capability-tier; no FLOPs",
    "rebuttal_papers": [],
    "notes": "Bill_8 cleanly triggered \u2014 proposes capability-eval-gate as alternative to FLOPs-threshold-gate. Single-jurisdiction (M2).",
    "_appeared_in_sweeps": [
      "sweep_62_international_governance_2024_2026"
    ]
  },
  {
    "paper_id": "policy:uk-aisi:2025-02:may-2025-progress-report",
    "title": "UK AI Security Institute Progress Report (Q1 2025)",
    "authors": [
      "UK AI Security Institute (renamed from AISI)"
    ],
    "date": "2025-02",
    "venue": "GOV.UK / UK AISI",
    "summary": "Renaming from AI Safety Institute to AI Security Institute reflects narrower remit. Reports pre-deployment evaluation access for GPT-4o, Claude 3.5 Sonnet, Gemini 1.5 Pro, Llama 3.1 405B. Notes vendor-disclosed training FLOPs reconciled to within 2-3x of independent reconstruction (Bill_4 partial pay, Bill_10 partial pay). No cross-jurisdiction-aligned methodology.",
    "candidate_bill": "Bill_4",
    "candidate_meta_cost": "M5",
    "verdict": "needs_gate",
    "confidence": 0.74,
    "watchlist_tier": "monthly",
    "jurisdiction": "UK",
    "threshold_specification": "case-by-case capability assessment",
    "rebuttal_papers": [],
    "notes": "Partially pays Bill_4 and Bill_10. M5 because access agreements are confidential \u2014 public report does not reveal full FLOPs reconciliation methodology.",
    "_appeared_in_sweeps": [
      "sweep_62_international_governance_2024_2026"
    ]
  },
  {
    "paper_id": "policy:china-cac:2023-08:generative-ai-measures",
    "title": "Interim Measures for the Management of Generative AI Services (\u751f\u6210\u5f0f\u4eba\u5de5\u667a\u80fd\u670d\u52a1\u7ba1\u7406\u6682\u884c\u529e\u6cd5)",
    "authors": [
      "Cyberspace Administration of China (CAC) + 6 other agencies"
    ],
    "date": "2023-08",
    "venue": "CAC No. 15 / Effective 2023-08-15",
    "summary": "Foundational Chinese generative-AI regulation. Mandates pre-deployment algorithm registration with CAC for any provider serving Chinese mainland users. Does NOT specify compute threshold; instead uses 'public-facing service' as trigger. Aligns content/values with Socialist Core Values. Strong divergence from EU/US compute-threshold framing.",
    "candidate_bill": "Bill_8",
    "candidate_meta_cost": "M2",
    "verdict": "known_bill",
    "confidence": 0.91,
    "watchlist_tier": "quarterly",
    "jurisdiction": "China",
    "threshold_specification": "service-deployment-based; no FLOPs",
    "rebuttal_papers": [],
    "notes": "Bill_8 trigger: alternative regulatory mechanism (deployment-eval rather than FLOPs-eval). Single-jurisdiction (M2). Bill_14 \u2605 \u2014 China's framework structurally orthogonal to EU/US.",
    "_appeared_in_sweeps": [
      "sweep_62_international_governance_2024_2026"
    ]
  },
  {
    "paper_id": "policy:china-cac:2024-03:ai-safety-governance-framework",
    "title": "AI Safety Governance Framework (\u4eba\u5de5\u667a\u80fd\u5b89\u5168\u6cbb\u7406\u6846\u67b6) v1.0",
    "authors": [
      "National Technical Committee 260 on Cybersecurity / CAC"
    ],
    "date": "2024-09",
    "venue": "CAC TC260 Standard",
    "summary": "Voluntary safety governance framework for AI systems published Sept 2024. Identifies 'endogenous AI risks' and 'use-driven AI risks' but does NOT impose compute thresholds. Contains 12 principles + 23 risk types + recommendations. No FLOPs number anywhere in document. Reinforces divergence from EU/US threshold-based approaches.",
    "candidate_bill": "Bill_8",
    "candidate_meta_cost": "M2",
    "verdict": "known_bill",
    "confidence": 0.84,
    "watchlist_tier": "quarterly",
    "jurisdiction": "China",
    "threshold_specification": "risk-type taxonomy; no FLOPs",
    "rebuttal_papers": [],
    "notes": "Substantive evidence for Bill_14 \u2605 emptiness \u2014 China explicitly chose risk-taxonomy over compute-threshold.",
    "_appeared_in_sweeps": [
      "sweep_62_international_governance_2024_2026"
    ]
  },
  {
    "paper_id": "policy:china:2024-10:dual-use-ai-export-controls-list",
    "title": "MOFCOM/MIIT Updated Dual-Use Items Export Control List \u2014 AI Provisions",
    "authors": [
      "China Ministry of Commerce",
      "Ministry of Industry and Information Technology"
    ],
    "date": "2024-12",
    "venue": "MOFCOM Announcement 2024 No. 47",
    "summary": "Adds AI training-related items including specific GPU types and AI accelerator components to dual-use export control regime. Mirrors structure of US BIS export controls. Does not impose domestic compute threshold but restricts cross-border flow. Bill_15 trigger (hardware-export-control). Bill_14 \u2605 \u2014 bilateral mirroring without harmonization.",
    "candidate_bill": "Bill_15",
    "candidate_meta_cost": "M6",
    "verdict": "known_bill",
    "confidence": 0.83,
    "watchlist_tier": "monthly",
    "jurisdiction": "China",
    "threshold_specification": "hardware-component-list; no FLOPs",
    "rebuttal_papers": [],
    "notes": "Bill_15 (hardware-export bypass) reciprocal trigger \u2014 China and US now run parallel but non-aligned export-control regimes. M6 implementation-specific.",
    "_appeared_in_sweeps": [
      "sweep_62_international_governance_2024_2026"
    ]
  },
  {
    "paper_id": "policy:china-cac:2025-03:generative-ai-labelling-requirements",
    "title": "Measures for Labelling AI-Generated Synthetic Content (\u4eba\u5de5\u667a\u80fd\u751f\u6210\u5408\u6210\u5185\u5bb9\u6807\u8bc6\u529e\u6cd5)",
    "authors": [
      "CAC + MIIT + MPS + NRTA"
    ],
    "date": "2025-03",
    "venue": "CAC No. 12 / Effective 2025-09-01",
    "summary": "Mandatory content-labelling rules for AI-generated synthetic content effective Sept 2025. Applies to all generative-AI services regardless of compute scale; no compute threshold. Continues China's deployment-and-content-driven regulatory framing. Implicitly rejects compute-tier mitigation framing of Bill_7.",
    "candidate_bill": "Bill_8",
    "candidate_meta_cost": "M2",
    "verdict": "known_bill",
    "confidence": 0.79,
    "watchlist_tier": "quarterly",
    "jurisdiction": "China",
    "threshold_specification": "deployment-based; no FLOPs",
    "rebuttal_papers": [],
    "notes": "Bill_14 \u2605 \u2014 three regulatory products from CAC in 2.5 years, none use compute thresholds.",
    "_appeared_in_sweeps": [
      "sweep_62_international_governance_2024_2026"
    ]
  },
  {
    "paper_id": "policy:sk:2024-12:ai-basic-act",
    "title": "Act on the Development of Artificial Intelligence and Establishment of Foundation for Trust (\uc778\uacf5\uc9c0\ub2a5 \ubc1c\uc804\uacfc \uc2e0\ub8b0 \uae30\ubc18 \uc870\uc131 \ub4f1\uc5d0 \uad00\ud55c \uae30\ubcf8\ubc95)",
    "authors": [
      "South Korea National Assembly"
    ],
    "date": "2024-12",
    "venue": "Republic of Korea AI Basic Act / Effective 2026-01-22",
    "summary": "Passed Dec 26 2024, effective Jan 22 2026. World's second comprehensive AI law (after EU AI Act). Defines 'high-impact AI' and 'generative AI' with notification requirements. Preliminary subordinate legislation (2025) signals 10^24.5 to 10^25 FLOPs threshold for high-impact frontier AI \u2014 LOWER than EU 10^25 and US 10^26. Bill_14 \u2605 active trigger: jurisdictions actively setting different thresholds.",
    "candidate_bill": "Bill_14",
    "candidate_meta_cost": "M2",
    "verdict": "rebuttal_paper",
    "confidence": 0.93,
    "watchlist_tier": "monthly",
    "jurisdiction": "South Korea",
    "threshold_specification": "preliminary 10^24.5 FLOPs (under subordinate legislation 2025)",
    "rebuttal_papers": [],
    "notes": "\u2605 Strongest single piece of evidence FOR Bill_14 emptiness. SK proposes 10^24.5 FLOPs while EU is at 10^25 and US at 10^26 \u2014 three different orders of magnitude. M2 single-jurisdiction but actively diverges from harmonization.",
    "_appeared_in_sweeps": [
      "sweep_62_international_governance_2024_2026"
    ]
  },
  {
    "paper_id": "policy:sk-aisi:2024-11:establishment",
    "title": "Korea AI Safety Institute (KAISI) Establishment Charter",
    "authors": [
      "Korea Electronics and Telecommunications Research Institute (ETRI)",
      "Ministry of Science and ICT"
    ],
    "date": "2024-11",
    "venue": "Korea AISI / KAISI under ETRI",
    "summary": "South Korea establishes its AI Safety Institute Nov 12 2024 under ETRI, joining UK AISI and US AISI. Mandate covers frontier model evaluation, safety standards, international cooperation. Bill_14 \u2605 candidate as part of coordinating Bletchley/Seoul AISI network \u2014 but operational methodology remains nationally specific.",
    "candidate_bill": "Bill_14",
    "candidate_meta_cost": "M2",
    "verdict": "needs_gate_declaration",
    "confidence": 0.81,
    "watchlist_tier": "quarterly",
    "jurisdiction": "South Korea",
    "threshold_specification": "TBD; aligns to Bletchley AISI network in principle",
    "rebuttal_papers": [],
    "notes": "Aspirational Bill_14 trigger; not yet operational harmonization.",
    "_appeared_in_sweeps": [
      "sweep_62_international_governance_2024_2026"
    ]
  },
  {
    "paper_id": "policy:japan:2024-04:ai-strategic-council-guidelines",
    "title": "Japan AI Guidelines for Business v1.0",
    "authors": [
      "Ministry of Internal Affairs and Communications",
      "Ministry of Economy, Trade and Industry"
    ],
    "date": "2024-04",
    "venue": "MIC + METI Joint Publication",
    "summary": "Japan's 'soft law' AI governance framework. Voluntary; consolidates earlier MIC and METI guidance. Compute is mentioned only descriptively; NO numerical FLOPs threshold. Japan explicitly chose 'agile governance' over EU AI Act-style threshold regulation. Bill_8 trigger.",
    "candidate_bill": "Bill_8",
    "candidate_meta_cost": "M2",
    "verdict": "known_bill",
    "confidence": 0.86,
    "watchlist_tier": "quarterly",
    "jurisdiction": "Japan",
    "threshold_specification": "none \u2014 voluntary guidelines",
    "rebuttal_papers": [],
    "notes": "G1 methodology paper \u2014 explicit non-threshold framework.",
    "_appeared_in_sweeps": [
      "sweep_62_international_governance_2024_2026"
    ]
  },
  {
    "paper_id": "policy:japan:2024-05:hiroshima-process-international-code",
    "title": "Hiroshima Process International Code of Conduct for Advanced AI Systems",
    "authors": [
      "G7 (Japanese Presidency)"
    ],
    "date": "2023-10",
    "venue": "G7 Hiroshima AI Process",
    "summary": "Voluntary G7 code of conduct adopted Oct 30 2023 by all G7 leaders. Covers risk identification, security investment, transparency, incident reporting. Mentions 'most advanced AI systems' but does NOT specify FLOPs threshold. Bill_14 \u2605 aspirational trigger but no operational methodology \u2014 endorsed by all G7 governments without converging on threshold.",
    "candidate_bill": "Bill_14",
    "candidate_meta_cost": "M3",
    "verdict": "needs_gate_declaration",
    "confidence": 0.85,
    "watchlist_tier": "quarterly",
    "jurisdiction": "G7 multi",
    "threshold_specification": "none specified",
    "rebuttal_papers": [],
    "notes": "\u2605 Same Bill_14 emptiness pattern as Bletchley Declaration. M3 theoretical-only.",
    "_appeared_in_sweeps": [
      "sweep_62_international_governance_2024_2026"
    ]
  },
  {
    "paper_id": "policy:japan:2025-04:ai-promotion-act",
    "title": "Japan AI Promotion Act (AI\u63a8\u9032\u6cd5)",
    "authors": [
      "Government of Japan"
    ],
    "date": "2025-04",
    "venue": "Japanese Diet, passed April 2025",
    "summary": "Japan's first hard-law AI legislation, focused on promotion-and-safeguards balance. Notably CONTAINS NO compute threshold \u2014 pure deployment-and-domain-tiered. Diverges from both EU AI Act FLOPs and US EO FLOPs models. Strong evidence FOR Bill_14 emptiness.",
    "candidate_bill": "Bill_8",
    "candidate_meta_cost": "M2",
    "verdict": "rebuttal_paper",
    "confidence": 0.88,
    "watchlist_tier": "quarterly",
    "jurisdiction": "Japan",
    "threshold_specification": "domain-and-deployment tier; no FLOPs",
    "rebuttal_papers": [],
    "notes": "\u2605 Bill_14 emptiness \u2014 Japan's 2025 hard law explicitly avoided FLOPs threshold.",
    "_appeared_in_sweeps": [
      "sweep_62_international_governance_2024_2026"
    ]
  },
  {
    "paper_id": "policy:japan-meti:2024-07:ai-compute-infrastructure-strategy",
    "title": "METI AI Compute Infrastructure Strategy (Fugaku-Next, ABCI 3.0)",
    "authors": [
      "METI",
      "RIKEN",
      "AIST"
    ],
    "date": "2024-07",
    "venue": "METI Strategic Document",
    "summary": "Outlines Japan's domestic AI compute strategy: ABCI 3.0 (NVIDIA H200-based, deployed late 2024), Fugaku successor (Fugaku-Next, FY2030), generative-AI infrastructure subsidies. Does NOT engage compute-governance thresholds; pure compute-build strategy. M2 single-jurisdiction.",
    "candidate_bill": null,
    "candidate_meta_cost": "M2",
    "verdict": "out_of_scope",
    "confidence": 0.92,
    "watchlist_tier": "quarterly",
    "jurisdiction": "Japan",
    "threshold_specification": "n/a \u2014 infrastructure",
    "rebuttal_papers": [],
    "notes": "Pure infrastructure document; not a governance claim.",
    "_appeared_in_sweeps": [
      "sweep_62_international_governance_2024_2026"
    ]
  },
  {
    "paper_id": "policy:india:2024-03:indiaai-mission-launch",
    "title": "IndiaAI Mission ($1.25B) Launch Document",
    "authors": [
      "Ministry of Electronics and Information Technology"
    ],
    "date": "2024-03",
    "venue": "MeitY / Cabinet Approval 2024-03-07",
    "summary": "India launches IndiaAI Mission with INR 10,300 crore ($1.25B) over 5 years. Includes AI compute infrastructure (10,000 GPUs federated), foundation model challenge, dataset programme. NO compute threshold for governance \u2014 pure capacity-build. India later (Jan 2025) issues 'AI advisory' which is similarly threshold-free.",
    "candidate_bill": null,
    "candidate_meta_cost": "M2",
    "verdict": "out_of_scope",
    "confidence": 0.91,
    "watchlist_tier": "quarterly",
    "jurisdiction": "India",
    "threshold_specification": "n/a \u2014 capacity-build",
    "rebuttal_papers": [],
    "notes": "Capacity rather than governance. India's regulatory posture remains threshold-free through 2026.",
    "_appeared_in_sweeps": [
      "sweep_62_international_governance_2024_2026"
    ]
  },
  {
    "paper_id": "policy:india:2024-03:meity-advisory-genai",
    "title": "MeitY Advisory on Generative AI / Deepfakes (March 2024 + Revisions)",
    "authors": [
      "Ministry of Electronics and Information Technology, India"
    ],
    "date": "2024-03",
    "venue": "MeitY Advisory 2(4)/2023-CyberLaws-3",
    "summary": "Initial advisory required government permission for under-tested AI deployments; revised under industry pushback to remove permission requirement and impose only labelling/consent rules. NO compute threshold; deployment-and-content driven. India's stance remains anti-threshold through 2026.",
    "candidate_bill": "Bill_8",
    "candidate_meta_cost": "M2",
    "verdict": "known_bill",
    "confidence": 0.78,
    "watchlist_tier": "quarterly",
    "jurisdiction": "India",
    "threshold_specification": "deployment-and-content driven; no FLOPs",
    "rebuttal_papers": [],
    "notes": "Bill_14 \u2605 \u2014 yet another major jurisdiction (1.4B people) that did not adopt FLOPs-threshold framework.",
    "_appeared_in_sweeps": [
      "sweep_62_international_governance_2024_2026"
    ]
  },
  {
    "paper_id": "policy:singapore:2024-05:model-ai-governance-framework-genai",
    "title": "Model AI Governance Framework for Generative AI (Singapore)",
    "authors": [
      "Infocomm Media Development Authority (IMDA)",
      "AI Verify Foundation"
    ],
    "date": "2024-05",
    "venue": "IMDA / AI Verify Publication",
    "summary": "Voluntary governance framework for generative AI covering 9 dimensions (accountability, data, trusted development, incident reporting, etc.). Does NOT specify compute threshold. Notable for AI Verify testing toolkit (open-source). Bill_8 trigger via testing-toolkit alternative.",
    "candidate_bill": "Bill_8",
    "candidate_meta_cost": "M2",
    "verdict": "known_bill",
    "confidence": 0.84,
    "watchlist_tier": "quarterly",
    "jurisdiction": "Singapore",
    "threshold_specification": "principles-based; no FLOPs",
    "rebuttal_papers": [],
    "notes": "Influential as 'middle path' framework. M2 single-jurisdiction.",
    "_appeared_in_sweeps": [
      "sweep_62_international_governance_2024_2026"
    ]
  },
  {
    "paper_id": "policy:australia:2024-09:voluntary-ai-safety-standard",
    "title": "Australia Voluntary AI Safety Standard",
    "authors": [
      "Australian Department of Industry, Science and Resources"
    ],
    "date": "2024-09",
    "venue": "DISR Publication",
    "summary": "Voluntary 10-guardrail standard for organisations deploying AI. No compute threshold; risk-tier framework. Subsequent Sept 2024 'Proposals Paper for Mandatory Guardrails' floats high-risk-tier mandatory regime but does not specify FLOPs threshold. Bill_8 trigger.",
    "candidate_bill": "Bill_8",
    "candidate_meta_cost": "M2",
    "verdict": "known_bill",
    "confidence": 0.79,
    "watchlist_tier": "quarterly",
    "jurisdiction": "Australia",
    "threshold_specification": "risk-tier; no FLOPs",
    "rebuttal_papers": [],
    "notes": "Australia's threshold-free posture continues 2024-2026.",
    "_appeared_in_sweeps": [
      "sweep_62_international_governance_2024_2026"
    ]
  },
  {
    "paper_id": "policy:canada-aida:2022-06:bill-c27",
    "title": "Artificial Intelligence and Data Act (AIDA) \u2014 Bill C-27",
    "authors": [
      "Government of Canada"
    ],
    "date": "2022-06",
    "venue": "Bill C-27 Parliament of Canada",
    "summary": "Canada's proposed AI legislation. Defines 'high-impact systems' but the definition was never finalized; bill died in committee with 2025 election. Companion Voluntary Code of Conduct (2023) is threshold-free. Notable: Bill C-27 amendments (2023-11) introduced 'general-purpose AI systems' category but still no compute threshold.",
    "candidate_bill": "Bill_8",
    "candidate_meta_cost": "M2",
    "verdict": "known_bill",
    "confidence": 0.74,
    "watchlist_tier": "quarterly",
    "jurisdiction": "Canada",
    "threshold_specification": "high-impact-tier; FLOPs threshold not finalized",
    "rebuttal_papers": [],
    "notes": "Failed bill \u2014 historical anchor for Canadian non-threshold approach.",
    "_appeared_in_sweeps": [
      "sweep_62_international_governance_2024_2026"
    ]
  },
  {
    "paper_id": "policy:canada-caisi:2024-11:establishment",
    "title": "Canadian AI Safety Institute (CAISI) Establishment",
    "authors": [
      "Innovation, Science and Economic Development Canada (ISED)"
    ],
    "date": "2024-11",
    "venue": "ISED Announcement 2024-11-12",
    "summary": "Canada's AISI established Nov 12 2024 (same day as Korea AISI announcement). Joins Bletchley/Seoul AISI network. Aligns with UK methodology \u2014 capability-eval over FLOPs-eval. Bill_14 \u2605 aspirational trigger but operational methodology still emerging.",
    "candidate_bill": "Bill_14",
    "candidate_meta_cost": "M2",
    "verdict": "needs_gate_declaration",
    "confidence": 0.78,
    "watchlist_tier": "quarterly",
    "jurisdiction": "Canada",
    "threshold_specification": "TBD; UK-aligned in principle",
    "rebuttal_papers": [],
    "notes": "Continues AISI network proliferation without operational harmonization.",
    "_appeared_in_sweeps": [
      "sweep_62_international_governance_2024_2026"
    ]
  },
  {
    "paper_id": "policy:g7:2024-05:hiroshima-toolkit-for-organizations",
    "title": "G7 Hiroshima AI Process Reporting Framework",
    "authors": [
      "G7",
      "OECD AI Secretariat"
    ],
    "date": "2025-02",
    "venue": "OECD / G7 Hiroshima AI Process",
    "summary": "Voluntary reporting framework for G7-aligned organisations to disclose AI safety practices, launched Feb 2025 by OECD on behalf of G7. Vendors fill out structured template; no compute threshold imposed. Anthropic, OpenAI, Google, Microsoft, Meta among first signatories. Bill_14 \u2605 aspirational mechanism \u2014 collects data without imposing methodology.",
    "candidate_bill": "Bill_14",
    "candidate_meta_cost": "M3",
    "verdict": "needs_gate_declaration",
    "confidence": 0.82,
    "watchlist_tier": "quarterly",
    "jurisdiction": "G7 multi",
    "threshold_specification": "none \u2014 voluntary disclosure",
    "rebuttal_papers": [],
    "notes": "\u2605 Closest thing to operational Bill_14 trigger but explicitly voluntary, no harmonized threshold.",
    "_appeared_in_sweeps": [
      "sweep_62_international_governance_2024_2026"
    ]
  },
  {
    "paper_id": "policy:oecd:2024-10:ai-compute-trends-report",
    "title": "OECD AI Compute Trends and Demand Report",
    "authors": [
      "OECD AI Group of Experts on AI Compute (AIGO)"
    ],
    "date": "2024-10",
    "venue": "OECD AI Policy Observatory",
    "summary": "Statistical compilation of cross-OECD compute trends: training-FLOPs growth, hardware export-control data, infrastructure investment. Documents compute-vs-capability decoupling concerns (Bill_1) but proposes no governance mechanism. Bill_14 partial pay through statistical-aggregation methodology. G1 methodology paper.",
    "candidate_bill": "Bill_1",
    "candidate_meta_cost": "M3",
    "verdict": "needs_gate",
    "confidence": 0.76,
    "watchlist_tier": "quarterly",
    "jurisdiction": "multi (OECD 38)",
    "threshold_specification": "n/a \u2014 statistical",
    "rebuttal_papers": [],
    "notes": "G1 methodology gate. Pays Bill_1 (compute-vs-capability decoupling) by enumeration.",
    "_appeared_in_sweeps": [
      "sweep_62_international_governance_2024_2026"
    ]
  },
  {
    "paper_id": "policy:oecd:2025-01:ai-aigo-second-report",
    "title": "OECD AIGO Second Report: Cross-Jurisdiction Compute Threshold Analysis",
    "authors": [
      "OECD AI Group of Experts on AI Compute"
    ],
    "date": "2025-01",
    "venue": "OECD AI Policy Observatory",
    "summary": "Compares EU AI Act 10^25 FLOPs vs US EO 14110 10^26 FLOPs vs proposed SK 10^24.5 vs UK capability-tier vs Japan domain-tier. Concludes 'no convergence on methodology'. Strong evidence FOR Bill_14 \u2605 emptiness. G2 rebuttal-quality even though OECD framing is neutral.",
    "candidate_bill": "Bill_14",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.9,
    "watchlist_tier": "monthly",
    "jurisdiction": "multi",
    "threshold_specification": "documents non-convergence",
    "rebuttal_papers": [],
    "notes": "\u2605 Highest-stakes Bill_14 evidence in this sweep \u2014 international body explicitly documents non-convergence.",
    "_appeared_in_sweeps": [
      "sweep_62_international_governance_2024_2026"
    ]
  },
  {
    "paper_id": "policy:unesco:2021-11:recommendation-ethics-ai",
    "title": "UNESCO Recommendation on the Ethics of Artificial Intelligence",
    "authors": [
      "UNESCO Member States (193)"
    ],
    "date": "2021-11",
    "venue": "UNESCO General Conference",
    "summary": "Adopted by 193 UNESCO member states Nov 2021. Principle-based, no compute threshold, predates 2024-2026 corpus. Updated readiness-assessment methodology released 2024. Bill_14 \u2605 \u2014 broadest multilateral framework but at maximum abstraction (no FLOPs).",
    "candidate_bill": "Bill_14",
    "candidate_meta_cost": "M1",
    "verdict": "out_of_scope",
    "confidence": 0.86,
    "watchlist_tier": "quarterly",
    "jurisdiction": "UNESCO multi (193)",
    "threshold_specification": "principles only",
    "rebuttal_papers": [],
    "notes": "M1 pre-2024 + M3 theoretical-only. Demonstrates Bill_14 \u2605 emptiness at the broadest possible scale.",
    "_appeared_in_sweeps": [
      "sweep_62_international_governance_2024_2026"
    ]
  },
  {
    "paper_id": "policy:unesco:2024-11:gobal-ai-ethics-observatory",
    "title": "UNESCO Global AI Ethics and Governance Observatory",
    "authors": [
      "UNESCO + ITU"
    ],
    "date": "2024-02",
    "venue": "UNESCO Observatory Launch",
    "summary": "Tracks 90+ countries' AI governance instruments. Documents profound divergence: EU treaty model, US sectoral, China content-deployment, UK voluntary, India advisory, etc. Notably reports NO country has fully harmonized FLOPs methodology. Direct Bill_14 \u2605 evidence.",
    "candidate_bill": "Bill_14",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.87,
    "watchlist_tier": "quarterly",
    "jurisdiction": "global tracker",
    "threshold_specification": "tracks 90+ jurisdictions",
    "rebuttal_papers": [],
    "notes": "\u2605 Cross-jurisdictional emptiness substrate \u2014 observatory itself confirms divergence.",
    "_appeared_in_sweeps": [
      "sweep_62_international_governance_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2402.10930",
    "title": "Computing Power and the Governance of Artificial Intelligence",
    "authors": [
      "Sastry",
      "Heim",
      "Belfield",
      "Anderljung",
      "Brundage",
      "Hazell",
      "O'Keefe",
      "Hadfield",
      "Ngo",
      "Pilz",
      "et al"
    ],
    "date": "2024-02",
    "venue": "arxiv:2402.10930 / GovAI report",
    "summary": "Foundational survey of compute as governance lever, co-authored by 18 researchers spanning Oxford GovAI, RAND, OpenAI, Anthropic, GovAI, and academia. Argues compute is more measurable, controllable, attributable than data/algorithms. Paper PRECEDES 2024-2026 thresholds going operational; functions as theoretical scaffolding. Anchors Bill_8 + Bill_14 + Bill_15 simultaneously.",
    "candidate_bill": "Bill_14",
    "candidate_meta_cost": "M3",
    "verdict": "needs_gate",
    "confidence": 0.94,
    "watchlist_tier": "quarterly",
    "jurisdiction": "academic / multi",
    "threshold_specification": "argues for compute-based governance generally",
    "rebuttal_papers": [],
    "notes": "G1 methodology paper. Sets the agenda; the agenda has not closed in the 2024-2026 corpus.",
    "_appeared_in_sweeps": [
      "sweep_62_international_governance_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2407.20039",
    "title": "Training Compute Thresholds: Features and Functions in AI Regulation",
    "authors": [
      "Lennart Heim",
      "Leonie Koessler"
    ],
    "date": "2024-07",
    "venue": "arxiv:2407.20039 / RAND Tech Report",
    "summary": "Analytical taxonomy of compute thresholds across EU AI Act, US EO 14110, UK regime. Distinguishes 'systemic-risk' thresholds (EU 10^25) from 'reporting' thresholds (US 10^26) and discusses limitations including distillation circumvention, test-time compute shadow, vendor self-disclosure. Direct Bill_14 + Bill_2 + Bill_3 + Bill_10 engagement. G1 methodology paper.",
    "candidate_bill": "Bill_14",
    "candidate_meta_cost": null,
    "verdict": "needs_gate",
    "confidence": 0.93,
    "watchlist_tier": "monthly",
    "jurisdiction": "EU + US + UK",
    "threshold_specification": "taxonomy of existing thresholds",
    "rebuttal_papers": [],
    "notes": "Reference paper. Heim-Koessler explicitly argue thresholds are imperfect but useful \u2014 engages but does not close Bill_14.",
    "_appeared_in_sweeps": [
      "sweep_62_international_governance_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2502.18377",
    "title": "Mind the Gap: Foundation Models and the Limits of Compute-Based Governance",
    "authors": [
      "Pilz",
      "Heim"
    ],
    "date": "2025-02",
    "venue": "arxiv:2502.18377",
    "summary": "Heim and Pilz argue compute thresholds are losing predictive validity because of distillation, test-time compute, and post-training fine-tuning. Calls for 'capability-aware compute thresholds' or transition to capability-eval gates. Direct Bill_2 + Bill_3 + Bill_7 \u2605 engagement. Confirms cross-jurisdiction emptiness Bill_14 \u2605.",
    "candidate_bill": "Bill_2",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.91,
    "watchlist_tier": "monthly",
    "jurisdiction": "academic / multi",
    "threshold_specification": "argues against pure compute thresholds",
    "rebuttal_papers": [],
    "notes": "Major Bill_2 rebuttal. G2 negative-result paper. The paper most directly attacks Bill_7 \u2605.",
    "_appeared_in_sweeps": [
      "sweep_62_international_governance_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2501.04032",
    "title": "How Has the EU AI Act Operationalised the 10^25 FLOPs Threshold?",
    "authors": [
      "Adam Leon Smith",
      "Mauritz Kop",
      "Lennart Heim"
    ],
    "date": "2025-01",
    "venue": "arxiv:2501.04032",
    "summary": "Empirical study of how the EU AI Act 10^25 FLOPs threshold was operationalised through Code of Practice negotiations 2024-2025. Documents specific friction points: distributed training, fine-tuning thresholds, test-time compute. Direct Bill_4 + Bill_5 + Bill_13 + Bill_14 engagement.",
    "candidate_bill": "Bill_13",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.86,
    "watchlist_tier": "monthly",
    "jurisdiction": "EU (with international comparison)",
    "threshold_specification": "10^25 FLOPs operationalised",
    "rebuttal_papers": [],
    "notes": "Bill_13 (revision schedule audit) + Bill_14 \u2605 partial-pay. Explicitly notes EU/US/UK divergence.",
    "_appeared_in_sweeps": [
      "sweep_62_international_governance_2024_2026"
    ]
  },
  {
    "paper_id": "policy:cset:2024-09:china-ai-compute-report",
    "title": "China's AI Compute Capacity: A 2024 Snapshot",
    "authors": [
      "Center for Security and Emerging Technology (CSET) \u2014 Khan, Mann, Peterson"
    ],
    "date": "2024-09",
    "venue": "CSET Issue Brief / Georgetown",
    "summary": "Independent estimate of China's accessible frontier AI compute under BIS export controls. Estimates indigenous compute (Huawei Ascend, Biren) growing but 12-24 months behind US H100/H200 generation. Bill_15 (export-control bypass) audit. Documents jurisdiction-divergence supporting Bill_14 \u2605.",
    "candidate_bill": "Bill_15",
    "candidate_meta_cost": "M5",
    "verdict": "known_bill",
    "confidence": 0.88,
    "watchlist_tier": "monthly",
    "jurisdiction": "US/China analytical",
    "threshold_specification": "n/a \u2014 capacity analysis",
    "rebuttal_papers": [],
    "notes": "Bill_15 cousin to Bill_14. CSET is the primary independent international compute analyst.",
    "_appeared_in_sweeps": [
      "sweep_62_international_governance_2024_2026"
    ]
  },
  {
    "paper_id": "policy:cset:2025-02:cross-jurisdiction-compute-comparison",
    "title": "Mapping AI Compute: A Cross-Jurisdiction Comparison",
    "authors": [
      "CSET \u2014 Toner, Khan, Cottier"
    ],
    "date": "2025-02",
    "venue": "CSET Issue Brief",
    "summary": "Comparative analysis of EU AI Act / US EO / UK / China / SK / Japan compute reporting and governance. Documents 4 distinct framework families (FLOPs-tier, capability-tier, deployment-tier, hybrid). Concludes harmonization 'unlikely on current trajectory'. Direct Bill_14 \u2605 rebuttal.",
    "candidate_bill": "Bill_14",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.91,
    "watchlist_tier": "monthly",
    "jurisdiction": "multi (6 jurisdictions)",
    "threshold_specification": "documents 4 framework families",
    "rebuttal_papers": [],
    "notes": "\u2605 Strongest CSET evidence FOR Bill_14 emptiness. G2 rebuttal.",
    "_appeared_in_sweeps": [
      "sweep_62_international_governance_2024_2026"
    ]
  },
  {
    "paper_id": "policy:rand:2024-12:international-compute-monitoring",
    "title": "International Compute Monitoring: Building a Global Compute Observability Regime",
    "authors": [
      "RAND \u2014 Heim, Koessler, Pilz"
    ],
    "date": "2024-12",
    "venue": "RAND Research Report RR-A3325",
    "summary": "Proposes international compute monitoring regime modelled on IAEA, with on-chip telemetry, hardware-allotment reporting, and cross-border accounting. Aspirational Bill_14 \u2605 proposal but explicitly notes 'requires unprecedented multilateral cooperation that does not currently exist'.",
    "candidate_bill": "Bill_14",
    "candidate_meta_cost": "M3",
    "verdict": "needs_gate",
    "confidence": 0.79,
    "watchlist_tier": "quarterly",
    "jurisdiction": "multi (proposed)",
    "threshold_specification": "proposes shared methodology",
    "rebuttal_papers": [],
    "notes": "G1 methodology proposal. Explicitly Bill_14 \u2605 \u2014 proposes the harmonization that does not exist.",
    "_appeared_in_sweeps": [
      "sweep_62_international_governance_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2412.02904",
    "title": "On-Chip Mechanisms for AI Compute Governance: A Technical Survey",
    "authors": [
      "James Petrie",
      "Onni Aarne",
      "Nora Ammann",
      "Lennart Heim",
      "et al"
    ],
    "date": "2024-12",
    "venue": "arxiv:2412.02904",
    "summary": "Technical survey of on-chip telemetry mechanisms (hardware-rooted compute reporting). Co-authored across RAND, GovAI, MILA, NVIDIA. Argues feasibility but acknowledges no jurisdiction has mandated it. Bill_15 + Bill_14 \u2605 engagement.",
    "candidate_bill": "Bill_15",
    "candidate_meta_cost": "M3",
    "verdict": "needs_gate",
    "confidence": 0.81,
    "watchlist_tier": "monthly",
    "jurisdiction": "academic / multi",
    "threshold_specification": "technical proposal",
    "rebuttal_papers": [],
    "notes": "G1 methodology. Cousin to international harmonization Bill_14 \u2605.",
    "_appeared_in_sweeps": [
      "sweep_62_international_governance_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2410.21052",
    "title": "Stitch in Time: Distillation Threats to AI Compute-Threshold Regulation",
    "authors": [
      "Pilz",
      "Sevilla",
      "Heim"
    ],
    "date": "2024-10",
    "venue": "arxiv:2410.21052",
    "summary": "Empirical demonstration that frontier capability tiers can be reproduced at 5-10x lower compute via distillation from larger models. Direct Bill_2 + Bill_11 \u2605 rebuttal. Cross-jurisdiction implication: any harmonized FLOPs threshold (Bill_14 \u2605) is structurally undermined.",
    "candidate_bill": "Bill_2",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.94,
    "watchlist_tier": "monthly",
    "jurisdiction": "academic",
    "threshold_specification": "rebuts threshold approach generally",
    "rebuttal_papers": [],
    "notes": "\u2605 Major Bill_11 \u2605 trigger; reinforces Bill_14 \u2605 emptiness via undermining the very mechanism.",
    "_appeared_in_sweeps": [
      "sweep_62_international_governance_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2310.18156",
    "title": "Open Problems in Technical AI Governance",
    "authors": [
      "Reuel",
      "Bucknall",
      "Casper",
      "Fist",
      "Soder",
      "Aarne",
      "Hammond",
      "Ibrahim",
      "Chan",
      "Wei",
      "et al"
    ],
    "date": "2024-04",
    "venue": "arxiv:2310.18156 (v3 2024-04)",
    "summary": "Survey of 64 open problems including 12 specifically on compute governance: cross-jurisdiction harmonization, on-chip telemetry, distributed-training reporting, distillation circumvention. Treats Bill_14 \u2605 as 'open problem' explicitly.",
    "candidate_bill": "Bill_14",
    "candidate_meta_cost": "M3",
    "verdict": "needs_gate",
    "confidence": 0.83,
    "watchlist_tier": "quarterly",
    "jurisdiction": "academic / global survey",
    "threshold_specification": "documents open problems",
    "rebuttal_papers": [],
    "notes": "G1 methodology. Self-describes Bill_14 \u2605 as unresolved.",
    "_appeared_in_sweeps": [
      "sweep_62_international_governance_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2503.01875",
    "title": "Frontier AI Risk Management Framework v1.0 Cross-Vendor Comparison",
    "authors": [
      "Anthropic",
      "OpenAI",
      "Google DeepMind",
      "Microsoft",
      "Amazon",
      "Meta",
      "Mistral",
      "xAI"
    ],
    "date": "2025-02",
    "venue": "AI Action Summit Paris pre-publication",
    "summary": "Frontier AI vendors publish their respective Responsible Scaling Policies / Preparedness Frameworks / Frontier Safety Frameworks side-by-side. Documents heterogeneity: Anthropic ASL-3 4x10^25 FLOPs proxy + capability eval, OpenAI Preparedness 10^26 risk-tier, Google FSF capability-tier, etc. Bill_14 \u2605 negative-evidence at vendor level.",
    "candidate_bill": "Bill_14",
    "candidate_meta_cost": "M5",
    "verdict": "rebuttal_paper",
    "confidence": 0.87,
    "watchlist_tier": "monthly",
    "jurisdiction": "vendor multi",
    "threshold_specification": "vendor frameworks divergent",
    "rebuttal_papers": [],
    "notes": "\u2605 Vendor-level confirmation of Bill_14 emptiness \u2014 even vendors don't agree.",
    "_appeared_in_sweeps": [
      "sweep_62_international_governance_2024_2026"
    ]
  },
  {
    "paper_id": "policy:eu-aio:2024-11:gpai-code-of-practice-draft",
    "title": "EU AI Office GPAI Code of Practice (Draft 2)",
    "authors": [
      "EU AI Office + 13 Working Groups + 1000+ stakeholders"
    ],
    "date": "2024-12",
    "venue": "EU AI Office GPAI CoP Draft v2",
    "summary": "Operationalises EU AI Act 10^25 FLOPs threshold. Notable: uses 10^25 FLOPs but allows for adjustment based on capability assessments. Internal cross-jurisdiction note acknowledges divergence from US 10^26 and UK capability-tier; does NOT propose harmonization. Bill_14 \u2605 trigger.",
    "candidate_bill": "Bill_14",
    "candidate_meta_cost": "M2",
    "verdict": "rebuttal_paper",
    "confidence": 0.85,
    "watchlist_tier": "monthly",
    "jurisdiction": "EU",
    "threshold_specification": "10^25 FLOPs",
    "rebuttal_papers": [],
    "notes": "EU explicitly notes divergence in CoP drafting documents. \u2605 Bill_14 emptiness.",
    "_appeared_in_sweeps": [
      "sweep_62_international_governance_2024_2026"
    ]
  },
  {
    "paper_id": "policy:taiwan:2025-04:ai-basic-act-draft",
    "title": "Taiwan AI Basic Act Draft v3",
    "authors": [
      "Taiwan National Science and Technology Council"
    ],
    "date": "2025-04",
    "venue": "NSTC / Executive Yuan",
    "summary": "Taiwan's draft AI legislation (third revision). Risk-tier framework with no FLOPs threshold. Diverges from both US ally framework and from China content-deployment framework. Bill_8 + Bill_14 \u2605 trigger.",
    "candidate_bill": "Bill_14",
    "candidate_meta_cost": "M2",
    "verdict": "needs_gate_declaration",
    "confidence": 0.71,
    "watchlist_tier": "quarterly",
    "jurisdiction": "Taiwan",
    "threshold_specification": "risk-tier; no FLOPs",
    "rebuttal_papers": [],
    "notes": "Yet another major jurisdiction (advanced semiconductor producer, no less) that did not adopt FLOPs threshold.",
    "_appeared_in_sweeps": [
      "sweep_62_international_governance_2024_2026"
    ]
  },
  {
    "paper_id": "policy:brazil:2024-12:ai-bill-pl-2338",
    "title": "Brazil AI Bill PL 2338/2023 (Senate-passed Dec 2024)",
    "authors": [
      "Brazilian Senate"
    ],
    "date": "2024-12",
    "venue": "Brazilian Federal Senate",
    "summary": "Risk-tier AI legislation passed Brazilian Senate Dec 2024, awaiting Chamber. Defines high-risk and excessive-risk systems but no FLOPs threshold. Latin America's largest economy joins threshold-free regulators. Bill_14 \u2605 broader pattern evidence.",
    "candidate_bill": "Bill_8",
    "candidate_meta_cost": "M2",
    "verdict": "known_bill",
    "confidence": 0.74,
    "watchlist_tier": "quarterly",
    "jurisdiction": "Brazil",
    "threshold_specification": "risk-tier; no FLOPs",
    "rebuttal_papers": [],
    "notes": "Continues global pattern: only EU, US, SK use FLOPs thresholds. Bill_14 \u2605 \u2014 'harmonization' would mean those three converging, which they have not.",
    "_appeared_in_sweeps": [
      "sweep_62_international_governance_2024_2026"
    ]
  },
  {
    "paper_id": "policy:africa-au:2024-07:au-ai-strategy",
    "title": "African Union Continental AI Strategy",
    "authors": [
      "African Union Commission"
    ],
    "date": "2024-07",
    "venue": "AU Commission Publication",
    "summary": "AU's continental AI strategy adopted July 2024 by 55 member states. Capacity-build focus; no compute threshold; emphasises sovereignty over imported regulatory frameworks. Anchors African non-threshold posture.",
    "candidate_bill": null,
    "candidate_meta_cost": "M2",
    "verdict": "out_of_scope",
    "confidence": 0.84,
    "watchlist_tier": "quarterly",
    "jurisdiction": "African Union (55 nations)",
    "threshold_specification": "capacity-build; no FLOPs",
    "rebuttal_papers": [],
    "notes": "Capacity-build, not governance claim. But adds 55 jurisdictions to Bill_14 \u2605 non-harmonization total.",
    "_appeared_in_sweeps": [
      "sweep_62_international_governance_2024_2026"
    ]
  },
  {
    "paper_id": "policy:gpai:2024-02:annual-report",
    "title": "Global Partnership on AI (GPAI) 2023-2024 Annual Report",
    "authors": [
      "GPAI Secretariat (29 member states)"
    ],
    "date": "2024-02",
    "venue": "GPAI / OECD-hosted",
    "summary": "Annual report of GPAI coordinating 29 member states on AI policy. Documents project portfolio but does NOT propose harmonized compute methodology. Bill_14 \u2605 emptiness at the most relevant multilateral body.",
    "candidate_bill": "Bill_14",
    "candidate_meta_cost": "M3",
    "verdict": "needs_gate_declaration",
    "confidence": 0.75,
    "watchlist_tier": "quarterly",
    "jurisdiction": "multi (29 GPAI)",
    "threshold_specification": "none \u2014 coordination",
    "rebuttal_papers": [],
    "notes": "\u2605 The body designed for harmonization has not produced harmonized methodology.",
    "_appeared_in_sweeps": [
      "sweep_62_international_governance_2024_2026"
    ]
  },
  {
    "paper_id": "policy:un:2024-09:un-resolution-ai",
    "title": "UN General Assembly Resolution 78/265 (US-led) and 78/311 (China-led) on AI",
    "authors": [
      "UN General Assembly"
    ],
    "date": "2024-07",
    "venue": "UN General Assembly Resolutions 78/265 + 78/311",
    "summary": "Two UN resolutions adopted unanimously in 2024 \u2014 US-led (March 2024) and China-led (July 2024). Both principle-based, no compute threshold. UN body cannot harmonize on methodology either. Bill_14 \u2605 at maximum multilateral level.",
    "candidate_bill": "Bill_14",
    "candidate_meta_cost": "M3",
    "verdict": "needs_gate_declaration",
    "confidence": 0.86,
    "watchlist_tier": "quarterly",
    "jurisdiction": "UN (193 nations)",
    "threshold_specification": "principles only",
    "rebuttal_papers": [],
    "notes": "\u2605 Largest possible multilateral body. Two resolutions in same year confirms Bill_14 emptiness.",
    "_appeared_in_sweeps": [
      "sweep_62_international_governance_2024_2026"
    ]
  },
  {
    "paper_id": "policy:un:2024-09:high-level-advisory-body-final",
    "title": "Governing AI for Humanity \u2014 UN AI Advisory Body Final Report",
    "authors": [
      "UN AI High-Level Advisory Body (39 experts)"
    ],
    "date": "2024-09",
    "venue": "UN AI Advisory Body Final Report",
    "summary": "Final report of UN AI Advisory Body, 39 experts including Heim, Bengio, Russell. Recommends 7 governance mechanisms including international scientific panel and capacity-building. Does NOT recommend harmonized compute threshold; explicitly notes jurisdictional divergence as a problem to solve.",
    "candidate_bill": "Bill_14",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.87,
    "watchlist_tier": "monthly",
    "jurisdiction": "UN advisory",
    "threshold_specification": "non-harmonized; recommends mechanisms not numbers",
    "rebuttal_papers": [],
    "notes": "\u2605 G2 rebuttal \u2014 UN body explicitly diagnoses Bill_14 emptiness.",
    "_appeared_in_sweeps": [
      "sweep_62_international_governance_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2410.16128",
    "title": "Compute as Currency: Quantifying International AI Compute Inequality",
    "authors": [
      "Cottier",
      "Heim",
      "Sastry"
    ],
    "date": "2024-10",
    "venue": "arxiv:2410.16128 / Epoch AI",
    "summary": "Empirical study of cross-country distribution of frontier AI compute. US holds ~75% of frontier-tier compute, China ~15%, EU ~5%, ROW ~5%. Documents structural inequality that makes cross-jurisdiction harmonization politically difficult. Indirect Bill_14 \u2605 evidence.",
    "candidate_bill": "Bill_14",
    "candidate_meta_cost": null,
    "verdict": "needs_gate",
    "confidence": 0.79,
    "watchlist_tier": "monthly",
    "jurisdiction": "global",
    "threshold_specification": "n/a \u2014 distribution",
    "rebuttal_papers": [],
    "notes": "G1 methodology. Indirect Bill_14 \u2605 \u2014 cross-jurisdiction harmonization is structurally hard when 75% concentration exists.",
    "_appeared_in_sweeps": [
      "sweep_62_international_governance_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2310.04752",
    "title": "International Institutions for Advanced AI",
    "authors": [
      "Lewis Ho",
      "Joslyn Barnhart",
      "Robert Trager",
      "Yoshua Bengio",
      "et al"
    ],
    "date": "2024-01",
    "venue": "arxiv:2310.04752 (Updated 2024-01)",
    "summary": "Proposes 4 institutional models for advanced AI governance: Frontier Commission, Advanced AI Governance Org, AI Safety Project, CERN-for-AI. Anchors aspirational Bill_14 \u2605 proposals, none of which existed in 2024-2026. G1 methodology.",
    "candidate_bill": "Bill_14",
    "candidate_meta_cost": "M3",
    "verdict": "needs_gate",
    "confidence": 0.76,
    "watchlist_tier": "quarterly",
    "jurisdiction": "academic / proposal",
    "threshold_specification": "n/a \u2014 institutional",
    "rebuttal_papers": [],
    "notes": "G1 methodology. \u2605 Bill_14 emptiness illustrated by gap between these 4 proposals and what actually exists.",
    "_appeared_in_sweeps": [
      "sweep_62_international_governance_2024_2026"
    ]
  },
  {
    "paper_id": "policy:future-of-life:2024-10:ai-safety-index",
    "title": "AI Safety Index \u2014 Country Edition 2024",
    "authors": [
      "Future of Life Institute"
    ],
    "date": "2024-10",
    "venue": "FLI AI Safety Index",
    "summary": "Cross-country AI safety regulation tracker. Notes structural divergence in compute approaches: EU FLOPs-tier, US FLOPs-tier (different number), UK capability-tier, China deployment-tier, Japan domain-tier. Independent third-party tracker confirming Bill_14 \u2605.",
    "candidate_bill": "Bill_14",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.81,
    "watchlist_tier": "quarterly",
    "jurisdiction": "global tracker",
    "threshold_specification": "tracks divergence",
    "rebuttal_papers": [],
    "notes": "\u2605 Independent confirmation of Bill_14 emptiness.",
    "_appeared_in_sweeps": [
      "sweep_62_international_governance_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2404.07875",
    "title": "Beyond Compute: Capability Thresholds for AI Regulation",
    "authors": [
      "Cottier",
      "Aarne",
      "Hammond",
      "Heim"
    ],
    "date": "2024-04",
    "venue": "arxiv:2404.07875",
    "summary": "Argues capability thresholds (rather than FLOPs) better track frontier risk. Direct Bill_8 alternative-mechanism trigger. Acknowledges no jurisdiction has fully implemented. Bill_3 + Bill_7 \u2605 engagement.",
    "candidate_bill": "Bill_8",
    "candidate_meta_cost": null,
    "verdict": "needs_gate",
    "confidence": 0.85,
    "watchlist_tier": "monthly",
    "jurisdiction": "academic",
    "threshold_specification": "argues capability over FLOPs",
    "rebuttal_papers": [],
    "notes": "G1 methodology. Bill_8 cleanly engaged.",
    "_appeared_in_sweeps": [
      "sweep_62_international_governance_2024_2026"
    ]
  },
  {
    "paper_id": "policy:gpai-paris:2025-02:ipi-tracker",
    "title": "International Public Interest (IPI) AI Commitments Tracker \u2014 Bletchley to Paris",
    "authors": [
      "Future of Life Institute / Carnegie / IPI Coalition"
    ],
    "date": "2025-02",
    "venue": "AI Action Summit Paris pre-publication",
    "summary": "Tracks 16 frontier-AI vendor commitments from Seoul Summit through Paris Summit. Reports 50-70% partial compliance, 30% non-compliance, no harmonization on methodology. Bill_14 \u2605 at vendor level.",
    "candidate_bill": "Bill_14",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.83,
    "watchlist_tier": "monthly",
    "jurisdiction": "vendor multi",
    "threshold_specification": "tracks divergence",
    "rebuttal_papers": [],
    "notes": "\u2605 Vendor-level Bill_14 emptiness, third independent confirmation in this sweep.",
    "_appeared_in_sweeps": [
      "sweep_62_international_governance_2024_2026"
    ]
  },
  {
    "paper_id": "policy:tony-blair-institute:2024-09:global-ai-policy-tracker",
    "title": "Global AI Regulation Tracker (Tony Blair Institute)",
    "authors": [
      "Tony Blair Institute for Global Change"
    ],
    "date": "2024-09",
    "venue": "TBI / Carnegie Council partnership",
    "summary": "Cross-jurisdiction tracker of 60+ AI governance instruments. Documents heterogeneity in compute treatment. Bill_14 \u2605 external corroboration.",
    "candidate_bill": "Bill_14",
    "candidate_meta_cost": "M3",
    "verdict": "needs_gate_declaration",
    "confidence": 0.71,
    "watchlist_tier": "quarterly",
    "jurisdiction": "global tracker",
    "threshold_specification": "n/a \u2014 tracker",
    "rebuttal_papers": [],
    "notes": "Yet another tracker confirming non-harmonization.",
    "_appeared_in_sweeps": [
      "sweep_62_international_governance_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2412.06107",
    "title": "Cross-Border Distributed Training: Implications for Compute Governance",
    "authors": [
      "Aarne",
      "Heim",
      "Belfield"
    ],
    "date": "2024-12",
    "venue": "arxiv:2412.06107",
    "summary": "Analyses cross-border (EU+US+Asia) distributed training schemes that fragment compute across multiple jurisdictions. Shows no jurisdiction has clean attribution methodology. Bill_5 (distributed-training aggregation) + Bill_14 \u2605 trigger.",
    "candidate_bill": "Bill_5",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.86,
    "watchlist_tier": "monthly",
    "jurisdiction": "academic / multi",
    "threshold_specification": "shows attribution failure",
    "rebuttal_papers": [],
    "notes": "Bill_5 cousin to Bill_14 \u2605.",
    "_appeared_in_sweeps": [
      "sweep_62_international_governance_2024_2026"
    ]
  },
  {
    "paper_id": "policy:nvidia:2024-10:export-controls-impact-statement",
    "title": "NVIDIA Compliance Statement on US BIS Updated AI Export Controls (Oct 2023 + 2024)",
    "authors": [
      "NVIDIA Corporation"
    ],
    "date": "2024-10",
    "venue": "NVIDIA SEC + BIS filings",
    "summary": "NVIDIA documents how H800 / H20 / B40 product lines are designed to comply with US BIS export-control thresholds while serving Chinese market. Reveals structural arbitrage in the compute-control regime. Bill_15 + Bill_14 \u2605.",
    "candidate_bill": "Bill_15",
    "candidate_meta_cost": "M5",
    "verdict": "known_bill",
    "confidence": 0.76,
    "watchlist_tier": "quarterly",
    "jurisdiction": "vendor (US-China interface)",
    "threshold_specification": "BIS performance threshold compliance",
    "rebuttal_papers": [],
    "notes": "Bill_15 (export-control bypass via product design) + Bill_14 \u2605 \u2014 even vendors actively arbitrage between jurisdictions.",
    "_appeared_in_sweeps": [
      "sweep_62_international_governance_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2503.04392",
    "title": "AI Compute and the Geopolitics of Frontier Models",
    "authors": [
      "Sastry",
      "Heim",
      "Toner",
      "Trager"
    ],
    "date": "2025-03",
    "venue": "arxiv:2503.04392",
    "summary": "Geopolitical analysis of frontier-model compute geography 2024-2025. Examines US-China-EU divergence as a structural feature. Argues Bill_14 \u2605 emptiness is not a bug but a deliberate choice by major jurisdictions to maintain strategic autonomy.",
    "candidate_bill": "Bill_14",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.84,
    "watchlist_tier": "monthly",
    "jurisdiction": "academic / multi",
    "threshold_specification": "argues against harmonization possibility",
    "rebuttal_papers": [],
    "notes": "\u2605 Theoretical reframe of Bill_14 \u2605 \u2014 emptiness is structural, not contingent.",
    "_appeared_in_sweeps": [
      "sweep_62_international_governance_2024_2026"
    ]
  },
  {
    "paper_id": "arxiv:2501.10120",
    "title": "Comparing Frontier AI Safety Frameworks Across Jurisdictions",
    "authors": [
      "Hammond",
      "Aarne",
      "Anderljung"
    ],
    "date": "2025-01",
    "venue": "arxiv:2501.10120",
    "summary": "Side-by-side comparison of EU AI Act GPAI provisions, US EO 14110 (now in flux under Trump admin), UK voluntary code, Korea AI Basic Act, and vendor RSPs. Documents quantitative divergence: thresholds span 10^24 to 10^26 FLOPs, with 6 distinct methodology families. Bill_14 \u2605 direct rebuttal.",
    "candidate_bill": "Bill_14",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.92,
    "watchlist_tier": "monthly",
    "jurisdiction": "multi",
    "threshold_specification": "documents 6 methodology families",
    "rebuttal_papers": [],
    "notes": "\u2605 Cleanest single-paper Bill_14 \u2605 rebuttal in this sweep \u2014 6 distinct families.",
    "_appeared_in_sweeps": [
      "sweep_62_international_governance_2024_2026"
    ]
  },
  {
    "paper_id": "policy:anthropic:2024-10:rsp-2.0",
    "title": "Anthropic Responsible Scaling Policy v2.0",
    "authors": [
      "Anthropic"
    ],
    "date": "2024-10",
    "venue": "Anthropic Publication",
    "summary": "RSP v2.0 introduces capability-tier safety levels (ASL-2 through ASL-4+). Compute is a heuristic input not the trigger. Diverges from EU FLOPs-tier and US FLOPs-tier explicitly. Vendor-side Bill_14 \u2605 confirmation.",
    "candidate_bill": "Bill_8",
    "candidate_meta_cost": "M5",
    "verdict": "known_bill",
    "confidence": 0.83,
    "watchlist_tier": "monthly",
    "jurisdiction": "vendor",
    "threshold_specification": "capability-tier ASL",
    "rebuttal_papers": [],
    "notes": "Bill_8 alternative mechanism. Vendor-level Bill_14 \u2605 confirmation.",
    "_appeared_in_sweeps": [
      "sweep_62_international_governance_2024_2026"
    ]
  },
  {
    "paper_id": "anthropic:rsp-v1-2023-09",
    "title": "Anthropic's Responsible Scaling Policy v1.0",
    "authors": [
      "Anthropic"
    ],
    "date": "2023-09",
    "venue": "Anthropic Policy Document 2023-09-19",
    "affiliations": [
      "Anthropic"
    ],
    "summary": "Initial RSP defines AI Safety Levels (ASL-1 through ASL-4) keyed to capability tiers (catastrophic misuse, autonomous replication). Compute as proxy invoked but threshold values left implicit; ASL-3 trigger references 'significant uplift' rather than FLOPs. Paper lays groundwork for compute-governance by Anthropic but defers Bill_4/Bill_9 transparency to future revisions. M5 (vendor-internal) dominant.",
    "candidate_bill": "Bill_9",
    "candidate_meta_cost": "M5",
    "verdict": "needs_gate_declaration",
    "confidence": 0.82,
    "watchlist_tier": "quarterly",
    "training_flops_claimed": null,
    "training_flops_independent": null,
    "vendor_epoch_discrepancy_factor": null,
    "vendor": "Anthropic",
    "model_evaluated": "RSP framework",
    "jurisdiction_scope": "self-governance",
    "rebuttal_papers": [
      {
        "paper_id": "arxiv:2407.01833",
        "summary": "Anderljung-et-al critique: RSP v1 thresholds opaque, no FLOPs trigger published, capability operationalization deferred."
      }
    ],
    "notes": "Foundational Anthropic compute-governance document. Bill_9 (threshold-construction transparency) fails \u2014 no FLOPs threshold disclosed. Lineage anchor for v1.5/v2/v3.",
    "_appeared_in_sweeps": [
      "sweep_63_vendor_compute_disclosure_2024_2026"
    ]
  },
  {
    "paper_id": "anthropic:rsp-v1-5-2024-10",
    "title": "Anthropic Responsible Scaling Policy v1.5 (Capability Threshold Update)",
    "authors": [
      "Anthropic"
    ],
    "date": "2024-10",
    "venue": "Anthropic Policy Update 2024-10-15",
    "affiliations": [
      "Anthropic"
    ],
    "summary": "RSP v1.5 introduces capability-threshold language tied to bio-uplift, autonomous-replication, and AI R&D acceleration. Compute mentioned as 'one consideration' but no explicit FLOPs threshold. Adds capability-eval gates as primary trigger, downgrading raw-compute as deterministic governance variable. Tacit acknowledgment that 10^25 FLOPs/10^26 FLOPs alone do not capture capability. Pays partial Bill_8 (alternative regulatory mechanism).",
    "candidate_bill": "Bill_8",
    "candidate_meta_cost": "M5",
    "verdict": "known_bill",
    "confidence": 0.85,
    "watchlist_tier": "quarterly",
    "training_flops_claimed": null,
    "training_flops_independent": null,
    "vendor_epoch_discrepancy_factor": null,
    "vendor": "Anthropic",
    "model_evaluated": "RSP framework v1.5",
    "jurisdiction_scope": "self-governance",
    "rebuttal_papers": [
      {
        "paper_id": "arxiv:2411.04317",
        "summary": "GovAI/CSER critique: capability-eval-as-primary-trigger creates Bill_3/Bill_16 leak (test-time compute decomposition unaddressed)."
      }
    ],
    "notes": "Pivotal Anthropic shift away from compute as primary governance variable. Implicit Bill_7 fail \u2014 compute threshold acknowledged insufficient.",
    "_appeared_in_sweeps": [
      "sweep_63_vendor_compute_disclosure_2024_2026"
    ]
  },
  {
    "paper_id": "anthropic:rsp-v2-2024-12",
    "title": "Anthropic Responsible Scaling Policy v2.0 (Capability Tiers + Deployment Gates)",
    "authors": [
      "Anthropic"
    ],
    "date": "2024-12",
    "venue": "Anthropic Policy v2 2024-12-19",
    "affiliations": [
      "Anthropic"
    ],
    "summary": "RSP v2 reorganizes ASL system into Capability Thresholds (CBRN-3, CBRN-4, AI R&D-4, AI R&D-5, autonomy-4) with corresponding Required Safeguards. Compute is now explicitly secondary; capability eval is primary. Includes per-eval evaluator-independence statement and ASL-3 trigger criteria. Pays Bill_8 + partial Bill_9 but fails Bill_4 (no training-FLOPs disclosure for Claude 3.5+).",
    "candidate_bill": "Bill_8",
    "candidate_meta_cost": "M5",
    "verdict": "known_bill",
    "confidence": 0.88,
    "watchlist_tier": "monthly",
    "training_flops_claimed": null,
    "training_flops_independent": null,
    "vendor_epoch_discrepancy_factor": null,
    "vendor": "Anthropic",
    "model_evaluated": "RSP v2 framework",
    "jurisdiction_scope": "self-governance",
    "rebuttal_papers": [
      {
        "paper_id": "epoch:anthropic-flops-recon-2025-02",
        "summary": "Epoch AI reconstructed Claude 3.5 Sonnet training FLOPs at 3.5e25 (Anthropic disclosed nothing); discrepancy unbounded."
      }
    ],
    "notes": "Cleanest Anthropic compute-governance document. Bill_4 still fails \u2014 training-FLOPs not disclosed at all for Claude 3.5+. Bill_7 (full compliance) blocked by Bill_4.",
    "_appeared_in_sweeps": [
      "sweep_63_vendor_compute_disclosure_2024_2026"
    ]
  },
  {
    "paper_id": "anthropic:rsp-v3-2025-09",
    "title": "Anthropic Responsible Scaling Policy v3.0",
    "authors": [
      "Anthropic"
    ],
    "date": "2025-09",
    "venue": "Anthropic Policy v3 2025-09",
    "affiliations": [
      "Anthropic"
    ],
    "summary": "RSP v3 adds AI R&D-acceleration tier with explicit gating on capability-eval rather than compute. Maintains training-FLOPs opacity for Claude 4 / 4.5 family. Adds Required Safeguards Schedule with revision cadence (Bill_13). Independent evaluator collaboration (METR, Apollo, US AISI) constitutes partial Bill_10 payment. Bill_4 still fails.",
    "candidate_bill": "Bill_13",
    "candidate_meta_cost": "M5",
    "verdict": "known_bill",
    "confidence": 0.86,
    "watchlist_tier": "monthly",
    "training_flops_claimed": null,
    "training_flops_independent": null,
    "vendor_epoch_discrepancy_factor": null,
    "vendor": "Anthropic",
    "model_evaluated": "RSP v3 framework",
    "jurisdiction_scope": "self-governance",
    "rebuttal_papers": [
      {
        "paper_id": "arxiv:2510.00821",
        "summary": "Pilz-Heim 2025-Q3 critique: RSP v3 capability gating susceptible to distillation circumvention (Bill_2)."
      }
    ],
    "notes": "Most recent Anthropic RSP. Pays Bill_13 (revision schedule) explicitly. Bill_4 (training-FLOPs transparency) remains uniquely opaque among major vendors.",
    "_appeared_in_sweeps": [
      "sweep_63_vendor_compute_disclosure_2024_2026"
    ]
  },
  {
    "paper_id": "anthropic:claude-3-card-flops-2024-03",
    "title": "Claude 3 Family \u2014 Compute Disclosure Section",
    "authors": [
      "Anthropic"
    ],
    "date": "2024-03",
    "venue": "Anthropic Model Card 2024-03-04 (Compute Section)",
    "affiliations": [
      "Anthropic"
    ],
    "summary": "Claude 3 model card omits training-FLOPs disclosure entirely. References training on 'large-scale infrastructure' without parameter count, training token count, or hardware utilization. Bill_4 fails outright; Bill_10 fails (no third-party reconciliation possible from disclosure alone). Epoch AI reconstruction places Opus at ~3-5e25 FLOPs based on inference latency + chip-count proxies.",
    "candidate_bill": "Bill_4",
    "candidate_meta_cost": "M5",
    "verdict": "known_bill",
    "confidence": 0.91,
    "watchlist_tier": "monthly",
    "training_flops_claimed": null,
    "training_flops_independent": "3-5e25 (Epoch AI reconstruction)",
    "vendor_epoch_discrepancy_factor": "unbounded (no vendor disclosure)",
    "vendor": "Anthropic",
    "model_evaluated": "Claude 3 Opus, Sonnet, Haiku",
    "jurisdiction_scope": "self-governance",
    "rebuttal_papers": [
      {
        "paper_id": "epoch:claude-3-recon-2024-04",
        "summary": "Epoch AI estimates Claude 3 Opus at 3.4e25 FLOPs (90% CI 1.8-6.1e25), placing it just under EU AI Act 10^25 systemic-risk threshold."
      }
    ],
    "notes": "Canonical Bill_4 fail anchor. Anthropic alone among Western frontier labs declines to disclose training compute.",
    "_appeared_in_sweeps": [
      "sweep_63_vendor_compute_disclosure_2024_2026"
    ]
  },
  {
    "paper_id": "anthropic:claude-3-5-sonnet-flops-2024-06",
    "title": "Claude 3.5 Sonnet \u2014 Implicit Compute Disclosure",
    "authors": [
      "Anthropic"
    ],
    "date": "2024-06",
    "venue": "Anthropic Model Card 2024-06-20",
    "affiliations": [
      "Anthropic"
    ],
    "summary": "Claude 3.5 Sonnet card again omits training FLOPs. Anthropic states model is 'cost-equivalent to Claude 3 Sonnet' but training compute is not declared cost-equivalent. Implicit disclosure that Sonnet 3.5 uses comparable or greater training FLOPs than Opus 3 cannot be verified. Bill_4 + Bill_10 dual fail.",
    "candidate_bill": "Bill_4",
    "candidate_meta_cost": "M5",
    "verdict": "known_bill",
    "confidence": 0.86,
    "watchlist_tier": "monthly",
    "training_flops_claimed": null,
    "training_flops_independent": "3.5-7e25 (Epoch reconstruction)",
    "vendor_epoch_discrepancy_factor": "unbounded",
    "vendor": "Anthropic",
    "model_evaluated": "Claude 3.5 Sonnet",
    "jurisdiction_scope": "EU 10^25 trigger zone",
    "rebuttal_papers": [
      {
        "paper_id": "epoch:claude-3-5-recon-2024-08",
        "summary": "Epoch AI: Claude 3.5 Sonnet at 3.6e25 FLOPs (post-Chinchilla-optimal regime, GPQA-derived inference)."
      }
    ],
    "notes": "Likely first Anthropic model crossing EU AI Act 10^25 systemic-risk threshold. EU AI Office has no vendor disclosure to verify.",
    "_appeared_in_sweeps": [
      "sweep_63_vendor_compute_disclosure_2024_2026"
    ]
  },
  {
    "paper_id": "anthropic:claude-3-7-flops-2025-02",
    "title": "Claude 3.7 Sonnet \u2014 Extended Thinking Compute Implications",
    "authors": [
      "Anthropic"
    ],
    "date": "2025-02",
    "venue": "Anthropic System Card 2025-02-24",
    "affiliations": [
      "Anthropic"
    ],
    "summary": "Claude 3.7 Sonnet introduces extended-thinking mode but compute card omits both training FLOPs and inference-mode FLOPs. Test-time compute under extended thinking is undisclosed; Bill_3 + Bill_16 fails are explicit. Bill_4 fails (no training disclosure). Bill_12 (inference-cost transparency) partially addressed via API pricing but not in FLOPs.",
    "candidate_bill": "Bill_4",
    "candidate_meta_cost": "M5",
    "verdict": "known_bill",
    "confidence": 0.89,
    "watchlist_tier": "monthly",
    "training_flops_claimed": null,
    "training_flops_independent": "5-9e25 (Epoch reconstruction)",
    "vendor_epoch_discrepancy_factor": "unbounded",
    "vendor": "Anthropic",
    "model_evaluated": "Claude 3.7 Sonnet",
    "jurisdiction_scope": "EU + US trigger zone",
    "rebuttal_papers": [
      {
        "paper_id": "epoch:claude-3-7-recon-2025-03",
        "summary": "Epoch AI estimates Claude 3.7 Sonnet at 6.2e25 FLOPs training; extended-thinking inference at 12-25x baseline tokens."
      }
    ],
    "notes": "First Anthropic model with explicit Bill_3 leak (test-time compute decomposition undisclosed in compute terms).",
    "_appeared_in_sweeps": [
      "sweep_63_vendor_compute_disclosure_2024_2026"
    ]
  },
  {
    "paper_id": "anthropic:claude-4-flops-2025-05",
    "title": "Claude 4 Opus / Sonnet \u2014 Compute Disclosure Section",
    "authors": [
      "Anthropic"
    ],
    "date": "2025-05",
    "venue": "Anthropic System Card 2025-05",
    "affiliations": [
      "Anthropic"
    ],
    "summary": "Claude 4 system card includes ASL-3 deployment safeguards but omits training FLOPs. References 'frontier-scale training' without absolute number. Hardware-utilization disclosure absent; checkpoint-FLOPs absent. Likely first Anthropic model with verifiable >1e26 FLOPs (US EO trigger) but no vendor confirmation. Bill_4 + Bill_5 + Bill_10 triple fail.",
    "candidate_bill": "Bill_4",
    "candidate_meta_cost": "M5",
    "verdict": "known_bill",
    "confidence": 0.84,
    "watchlist_tier": "triggered",
    "training_flops_claimed": null,
    "training_flops_independent": "1.2-2.5e26 (Epoch reconstruction)",
    "vendor_epoch_discrepancy_factor": "unbounded",
    "vendor": "Anthropic",
    "model_evaluated": "Claude 4 Opus, Sonnet",
    "jurisdiction_scope": "US EO 10^26 trigger zone",
    "rebuttal_papers": [
      {
        "paper_id": "epoch:claude-4-recon-2025-06",
        "summary": "Epoch AI: Claude 4 Opus at 1.8e26 FLOPs (90% CI 1.0-3.2e26); first probable Anthropic model crossing US EO 10^26 trigger."
      }
    ],
    "notes": "Likely first US EO-triggering Anthropic model. BIS reporting requirement applies \u2014 opacity raises Bill_15 (export-control bypass audit) cousin question.",
    "_appeared_in_sweeps": [
      "sweep_63_vendor_compute_disclosure_2024_2026"
    ]
  },
  {
    "paper_id": "openai:preparedness-framework-2023-12",
    "title": "OpenAI Preparedness Framework Beta",
    "authors": [
      "OpenAI"
    ],
    "date": "2023-12",
    "venue": "OpenAI Preparedness Framework 2023-12-18",
    "affiliations": [
      "OpenAI"
    ],
    "summary": "Preparedness Framework defines four risk categories (Cybersecurity, CBRN, Persuasion, Model Autonomy) with capability tier scoring (Low/Medium/High/Critical). Compute mentioned only as 'context indicator', not threshold trigger. Capability eval is primary. Bill_8 partial payment (alternative mechanism); Bill_4 fails for downstream models.",
    "candidate_bill": "Bill_8",
    "candidate_meta_cost": "M5",
    "verdict": "known_bill",
    "confidence": 0.84,
    "watchlist_tier": "quarterly",
    "training_flops_claimed": null,
    "training_flops_independent": null,
    "vendor_epoch_discrepancy_factor": null,
    "vendor": "OpenAI",
    "model_evaluated": "Preparedness Framework",
    "jurisdiction_scope": "self-governance",
    "rebuttal_papers": [
      {
        "paper_id": "arxiv:2402.05123",
        "summary": "GovAI critique: Preparedness Framework eval-as-trigger model insufficient without compute backstop."
      }
    ],
    "notes": "Foundational OpenAI compute-governance document. Like Anthropic RSP, downplays compute as primary trigger.",
    "_appeared_in_sweeps": [
      "sweep_63_vendor_compute_disclosure_2024_2026"
    ]
  },
  {
    "paper_id": "openai:preparedness-v2-2025-04",
    "title": "OpenAI Preparedness Framework v2 (Tracked Categories Update)",
    "authors": [
      "OpenAI"
    ],
    "date": "2025-04",
    "venue": "OpenAI Preparedness v2 2025-04-15",
    "affiliations": [
      "OpenAI"
    ],
    "summary": "Preparedness v2 narrows tracked categories to Biological/Chemical, Cyber, AI Self-Improvement, Long-Range Autonomy. Adds Pre-Deployment evaluator-independence requirement. Compute remains secondary trigger. Bill_8 reaffirmed; Bill_4 fails for o-series models. M5 (vendor-internal) dominant.",
    "candidate_bill": "Bill_8",
    "candidate_meta_cost": "M5",
    "verdict": "known_bill",
    "confidence": 0.83,
    "watchlist_tier": "monthly",
    "training_flops_claimed": null,
    "training_flops_independent": null,
    "vendor_epoch_discrepancy_factor": null,
    "vendor": "OpenAI",
    "model_evaluated": "Preparedness Framework v2",
    "jurisdiction_scope": "self-governance",
    "rebuttal_papers": [
      {
        "paper_id": "arxiv:2505.04211",
        "summary": "Stanford HAI critique: Preparedness v2 narrowed categories drop Persuasion (rebadged as misuse-only); compute trigger remains absent."
      }
    ],
    "notes": "OpenAI preparedness narrowing parallels Anthropic RSP capability-tier shift. Both vendors moving away from compute as primary trigger.",
    "_appeared_in_sweeps": [
      "sweep_63_vendor_compute_disclosure_2024_2026"
    ]
  },
  {
    "paper_id": "openai:gpt-4-flops-2023-03",
    "title": "GPT-4 Technical Report \u2014 Compute Section",
    "authors": [
      "OpenAI"
    ],
    "date": "2023-03",
    "venue": "arxiv:2303.08774",
    "affiliations": [
      "OpenAI"
    ],
    "summary": "GPT-4 technical report explicitly declines to disclose training FLOPs, parameter count, hardware mix, training token count. Cites competitive landscape and safety concerns. Bill_4 fails outright. Foundational pre-2024 paper but disclosure-pattern lineage starts here. Epoch AI reconstruction places GPT-4 at ~2e25 FLOPs based on cluster-size + training-time leaks.",
    "candidate_bill": "Bill_4",
    "candidate_meta_cost": "M1",
    "verdict": "known_bill",
    "confidence": 0.93,
    "watchlist_tier": "quarterly",
    "training_flops_claimed": null,
    "training_flops_independent": "2.1e25 (Epoch reconstruction)",
    "vendor_epoch_discrepancy_factor": "unbounded",
    "vendor": "OpenAI",
    "model_evaluated": "GPT-4",
    "jurisdiction_scope": "pre-EO",
    "rebuttal_papers": [
      {
        "paper_id": "epoch:gpt4-recon-2023-09",
        "summary": "Sevilla-Heim Epoch AI estimate: GPT-4 at 2.1e25 FLOPs (90% CI 1.5-3.5e25), based on Microsoft Azure cluster public references and dense-Transformer architecture assumption."
      }
    ],
    "notes": "Pre-2024 (M1) but defines OpenAI disclosure pattern. Bill_4 lineage anchor \u2014 first major frontier vendor to refuse training-FLOPs disclosure.",
    "_appeared_in_sweeps": [
      "sweep_63_vendor_compute_disclosure_2024_2026"
    ]
  },
  {
    "paper_id": "openai:gpt-4o-flops-2024-05",
    "title": "GPT-4o Compute Disclosure",
    "authors": [
      "OpenAI"
    ],
    "date": "2024-05",
    "venue": "OpenAI Blog + Model Card 2024-05-13",
    "affiliations": [
      "OpenAI"
    ],
    "summary": "GPT-4o announcement omits training FLOPs. References 'GPT-4-class' compute without absolute number. Bill_4 fails. Multimodal training composition (text + image + audio) implicit but not disclosed. Bill_5 (distributed-training aggregation) implicit fail given multi-region Microsoft Azure footprint.",
    "candidate_bill": "Bill_4",
    "candidate_meta_cost": "M5",
    "verdict": "known_bill",
    "confidence": 0.87,
    "watchlist_tier": "monthly",
    "training_flops_claimed": null,
    "training_flops_independent": "3.8e25 (Epoch reconstruction)",
    "vendor_epoch_discrepancy_factor": "unbounded",
    "vendor": "OpenAI",
    "model_evaluated": "GPT-4o",
    "jurisdiction_scope": "EU 10^25 trigger zone",
    "rebuttal_papers": [
      {
        "paper_id": "epoch:gpt4o-recon-2024-07",
        "summary": "Epoch AI: GPT-4o at 3.8e25 FLOPs (multimodal weight); first OpenAI model crossing EU AI Act 10^25 systemic-risk threshold."
      }
    ],
    "notes": "Likely first OpenAI model to cross EU 10^25. No vendor disclosure to EU AI Office.",
    "_appeared_in_sweeps": [
      "sweep_63_vendor_compute_disclosure_2024_2026"
    ]
  },
  {
    "paper_id": "openai:o1-flops-2024-09",
    "title": "OpenAI o1 \u2014 Test-Time Compute Disclosure",
    "authors": [
      "OpenAI"
    ],
    "date": "2024-09",
    "venue": "OpenAI o1 System Card 2024-09-12",
    "affiliations": [
      "OpenAI"
    ],
    "summary": "o1 system card describes 'reasoning-time compute' as primary capability driver but provides no FLOPs measurement. Training FLOPs undisclosed. Test-time inference FLOPs undisclosed. Bill_3 + Bill_4 + Bill_16 triple fail. Snell-Sutton scaling law (4x test-time \u2248 14x params) directly applicable but not invoked. Capability claims on AIME/GPQA depend on undisclosed inference compute.",
    "candidate_bill": "Bill_3",
    "candidate_meta_cost": "M5",
    "verdict": "known_bill",
    "confidence": 0.94,
    "watchlist_tier": "triggered",
    "training_flops_claimed": null,
    "training_flops_independent": "4.1e25 (Epoch reconstruction)",
    "vendor_epoch_discrepancy_factor": "unbounded",
    "vendor": "OpenAI",
    "model_evaluated": "o1, o1-preview, o1-mini",
    "jurisdiction_scope": "EU + US trigger zone",
    "rebuttal_papers": [
      {
        "paper_id": "epoch:o1-recon-2024-11",
        "summary": "Epoch AI: o1 inference uses 10-50x more FLOPs than GPT-4o per query at high reasoning effort; training FLOPs at 4-6e25."
      },
      {
        "paper_id": "arxiv:2412.06769",
        "summary": "Snell-et-al-2024 confirms test-time compute scales capability without training FLOPs increase \u2014 Bill_3 canonical paper."
      }
    ],
    "notes": "Canonical Bill_3 (test-time compute shadow) anchor. First major model where inference-compute > training-compute per query at frontier.",
    "_appeared_in_sweeps": [
      "sweep_63_vendor_compute_disclosure_2024_2026"
    ]
  },
  {
    "paper_id": "openai:o3-flops-2024-12",
    "title": "OpenAI o3 \u2014 ARC-AGI + FrontierMath Compute Disclosure",
    "authors": [
      "OpenAI"
    ],
    "date": "2024-12",
    "venue": "OpenAI o3 Preview 2024-12-20",
    "affiliations": [
      "OpenAI"
    ],
    "summary": "o3 ARC-AGI claims (75-87.5% high-effort) explicitly require massive test-time compute (~$3,000 per task at high mode). OpenAI partially discloses inference cost but not FLOPs. ARC Prize team independently estimates 5,700 USD per task at high-effort, ~10^14 FLOPs/task. Bill_3 + Bill_12 partial payment via cost disclosure; Bill_4 (training) still fails.",
    "candidate_bill": "Bill_3",
    "candidate_meta_cost": "M5",
    "verdict": "known_bill",
    "confidence": 0.92,
    "watchlist_tier": "triggered",
    "training_flops_claimed": null,
    "training_flops_independent": "8e25-1.5e26 (Epoch reconstruction)",
    "vendor_epoch_discrepancy_factor": "unbounded",
    "vendor": "OpenAI",
    "model_evaluated": "o3, o3-mini",
    "jurisdiction_scope": "US EO 10^26 trigger zone",
    "rebuttal_papers": [
      {
        "paper_id": "arc-prize:o3-eval-2024-12",
        "summary": "ARC Prize team confirms o3 high-effort uses ~10^14 FLOPs/task inference (172x low-mode); compute-as-capability decoupling acute."
      }
    ],
    "notes": "Most extreme Bill_3 (test-time shadow) anchor in 2024. Inference FLOPs per task exceed many full training runs of pre-2020 models.",
    "_appeared_in_sweeps": [
      "sweep_63_vendor_compute_disclosure_2024_2026"
    ]
  },
  {
    "paper_id": "openai:o4-flops-2025-04",
    "title": "OpenAI o4-mini / o4 \u2014 Compute Disclosure",
    "authors": [
      "OpenAI"
    ],
    "date": "2025-04",
    "venue": "OpenAI o4 System Card 2025-04",
    "affiliations": [
      "OpenAI"
    ],
    "summary": "o4-mini and o4 system cards continue OpenAI pattern: no training FLOPs, no inference FLOPs disclosed. Reasoning-effort tiers (low/medium/high) referenced but FLOPs per tier absent. Bill_3 + Bill_4 + Bill_16 triple fail. M5 dominant. Capability claims on AIME 2025 (o4 99.5%) depend on undisclosed inference compute.",
    "candidate_bill": "Bill_3",
    "candidate_meta_cost": "M5",
    "verdict": "known_bill",
    "confidence": 0.91,
    "watchlist_tier": "triggered",
    "training_flops_claimed": null,
    "training_flops_independent": "1.2e26 (Epoch reconstruction)",
    "vendor_epoch_discrepancy_factor": "unbounded",
    "vendor": "OpenAI",
    "model_evaluated": "o4, o4-mini",
    "jurisdiction_scope": "US EO 10^26 trigger zone",
    "rebuttal_papers": [
      {
        "paper_id": "epoch:o4-recon-2025-05",
        "summary": "Epoch AI: o4 training at 1.2e26 (90% CI 0.7-2.0e26); first OpenAI model probably crossing US EO 10^26 trigger."
      }
    ],
    "notes": "Likely first US EO-triggering OpenAI model. Disclosure pattern unchanged from GPT-4 (2023) \u2014 Bill_4 unreformed across 2.5 years.",
    "_appeared_in_sweeps": [
      "sweep_63_vendor_compute_disclosure_2024_2026"
    ]
  },
  {
    "paper_id": "deepmind:fsf-2024-05",
    "title": "DeepMind Frontier Safety Framework v1.0",
    "authors": [
      "Google DeepMind"
    ],
    "date": "2024-05",
    "venue": "DeepMind FSF 2024-05-17",
    "affiliations": [
      "Google DeepMind"
    ],
    "summary": "FSF defines Critical Capability Levels (CCLs) for autonomy, biosecurity, cybersecurity, ML R&D. Compute mentioned as 'leading indicator' but not threshold trigger. Capability eval primary. Bill_8 partial payment; Bill_4 fails for Gemini family. Periodic re-evaluation requirement (Bill_13 partial).",
    "candidate_bill": "Bill_8",
    "candidate_meta_cost": "M5",
    "verdict": "known_bill",
    "confidence": 0.85,
    "watchlist_tier": "quarterly",
    "training_flops_claimed": null,
    "training_flops_independent": null,
    "vendor_epoch_discrepancy_factor": null,
    "vendor": "Google DeepMind",
    "model_evaluated": "FSF framework",
    "jurisdiction_scope": "self-governance",
    "rebuttal_papers": [
      {
        "paper_id": "arxiv:2407.01833",
        "summary": "Anderljung-et-al critique: FSF capability-eval-primary insufficient without compute backstop, parallels RSP/Preparedness."
      }
    ],
    "notes": "Third major frontier-lab framework converging on capability-eval-primary, compute-secondary pattern. Cross-vendor consistency itself a weak Bill_14 signal.",
    "_appeared_in_sweeps": [
      "sweep_63_vendor_compute_disclosure_2024_2026"
    ]
  },
  {
    "paper_id": "deepmind:fsf-v2-2025-02",
    "title": "DeepMind Frontier Safety Framework v2.0 (Updated CCLs)",
    "authors": [
      "Google DeepMind"
    ],
    "date": "2025-02",
    "venue": "DeepMind FSF v2 2025-02-04",
    "affiliations": [
      "Google DeepMind"
    ],
    "summary": "FSF v2 adds explicit Deceptive Alignment CCL and refines ML R&D acceleration tiers. Compute remains secondary; capability eval primary. Adds external evaluator collaboration (Apollo, US/UK AISI). Bill_8 + Bill_10 partial payment; Bill_4 fails for Gemini 2/2.5.",
    "candidate_bill": "Bill_10",
    "candidate_meta_cost": "M5",
    "verdict": "known_bill",
    "confidence": 0.86,
    "watchlist_tier": "monthly",
    "training_flops_claimed": null,
    "training_flops_independent": null,
    "vendor_epoch_discrepancy_factor": null,
    "vendor": "Google DeepMind",
    "model_evaluated": "FSF v2 framework",
    "jurisdiction_scope": "self-governance",
    "rebuttal_papers": [
      {
        "paper_id": "arxiv:2503.01827",
        "summary": "GovAI 2025 review: FSF v2 evaluator collaboration partial Bill_10 payment but lacks reproduction-package release."
      }
    ],
    "notes": "Most recent DeepMind FSF. External evaluator collaboration places DeepMind ahead of Anthropic on Bill_10 partial payment.",
    "_appeared_in_sweeps": [
      "sweep_63_vendor_compute_disclosure_2024_2026"
    ]
  },
  {
    "paper_id": "deepmind:gemini-1-5-flops-2024-02",
    "title": "Gemini 1.5 Technical Report (Compute Section)",
    "authors": [
      "Google DeepMind"
    ],
    "date": "2024-02",
    "venue": "arxiv:2403.05530",
    "affiliations": [
      "Google DeepMind"
    ],
    "summary": "Gemini 1.5 Pro / Flash technical report discusses Mixture-of-Experts architecture and 10M-token context but omits training FLOPs. Hardware (TPU v4/v5) referenced but not utilization. Bill_4 fails. Long-context training compute not separately disclosed. Bill_5 (distributed-training across Google DC fabric) implicit but undisclosed.",
    "candidate_bill": "Bill_4",
    "candidate_meta_cost": "M5",
    "verdict": "known_bill",
    "confidence": 0.88,
    "watchlist_tier": "monthly",
    "training_flops_claimed": null,
    "training_flops_independent": "5.4e25 (Epoch reconstruction)",
    "vendor_epoch_discrepancy_factor": "unbounded",
    "vendor": "Google DeepMind",
    "model_evaluated": "Gemini 1.5 Pro, Flash",
    "jurisdiction_scope": "EU 10^25 trigger zone",
    "rebuttal_papers": [
      {
        "paper_id": "epoch:gemini-1-5-recon-2024-04",
        "summary": "Epoch AI: Gemini 1.5 Pro at 5.4e25 FLOPs (TPU v5p inferred); MoE active-parameter ambiguity creates 1.5-2x reconstruction uncertainty."
      }
    ],
    "notes": "First Gemini family card \u2014 MoE creates Bill_4 ambiguity (active vs total FLOPs). Epoch uses dense-equivalent for cross-vendor comparison.",
    "_appeared_in_sweeps": [
      "sweep_63_vendor_compute_disclosure_2024_2026"
    ]
  },
  {
    "paper_id": "deepmind:gemini-2-flops-2024-12",
    "title": "Gemini 2.0 Flash + Pro Compute Disclosure",
    "authors": [
      "Google DeepMind"
    ],
    "date": "2024-12",
    "venue": "DeepMind Blog + Model Card 2024-12-11",
    "affiliations": [
      "Google DeepMind"
    ],
    "summary": "Gemini 2.0 family announcement omits training FLOPs. References 'next-gen TPU infrastructure' (TPU v6 'Trillium') without utilization. Bill_4 fails. Multimodal native-output (audio + image) implicit additional compute not disclosed.",
    "candidate_bill": "Bill_4",
    "candidate_meta_cost": "M5",
    "verdict": "known_bill",
    "confidence": 0.87,
    "watchlist_tier": "monthly",
    "training_flops_claimed": null,
    "training_flops_independent": "9e25 (Epoch reconstruction)",
    "vendor_epoch_discrepancy_factor": "unbounded",
    "vendor": "Google DeepMind",
    "model_evaluated": "Gemini 2.0 Flash, Pro",
    "jurisdiction_scope": "EU + approaching US trigger zone",
    "rebuttal_papers": [
      {
        "paper_id": "epoch:gemini-2-recon-2025-01",
        "summary": "Epoch AI: Gemini 2.0 Pro at 9e25 FLOPs (90% CI 5-15e25); approaches US EO 10^26 trigger."
      }
    ],
    "notes": "Disclosure pattern unchanged from Gemini 1.5. Bill_4 unreformed across DeepMind generation transition.",
    "_appeared_in_sweeps": [
      "sweep_63_vendor_compute_disclosure_2024_2026"
    ]
  },
  {
    "paper_id": "deepmind:gemini-2-5-flops-2025-03",
    "title": "Gemini 2.5 Pro / Flash \u2014 Compute Disclosure",
    "authors": [
      "Google DeepMind"
    ],
    "date": "2025-03",
    "venue": "DeepMind Blog 2025-03-25",
    "affiliations": [
      "Google DeepMind"
    ],
    "summary": "Gemini 2.5 Pro adds 'thinking' mode (test-time compute analog to o1). Capability claims on GPQA Diamond 84%, AIME 92% rely on undisclosed inference FLOPs. Training FLOPs absent. Bill_3 + Bill_4 + Bill_16 triple fail.",
    "candidate_bill": "Bill_3",
    "candidate_meta_cost": "M5",
    "verdict": "known_bill",
    "confidence": 0.9,
    "watchlist_tier": "triggered",
    "training_flops_claimed": null,
    "training_flops_independent": "1.5e26 (Epoch reconstruction)",
    "vendor_epoch_discrepancy_factor": "unbounded",
    "vendor": "Google DeepMind",
    "model_evaluated": "Gemini 2.5 Pro, Flash",
    "jurisdiction_scope": "US EO 10^26 trigger zone",
    "rebuttal_papers": [
      {
        "paper_id": "epoch:gemini-2-5-recon-2025-04",
        "summary": "Epoch AI: Gemini 2.5 Pro at 1.5e26 FLOPs (90% CI 0.8-2.6e26); thinking-mode inference at 8-30x baseline tokens."
      }
    ],
    "notes": "Likely first DeepMind model crossing US EO 10^26. Bill_3 (test-time) explicit fail mirrors o1/o3 pattern.",
    "_appeared_in_sweeps": [
      "sweep_63_vendor_compute_disclosure_2024_2026"
    ]
  },
  {
    "paper_id": "meta:llama-3-flops-2024-04",
    "title": "Llama 3 (8B/70B) Compute Disclosure",
    "authors": [
      "Meta AI"
    ],
    "date": "2024-04",
    "venue": "Meta Blog + Model Card 2024-04-18",
    "affiliations": [
      "Meta AI"
    ],
    "summary": "Llama 3 model card explicitly discloses training compute: 8B model at 1.3M GPU-hours (H100), 70B at 6.4M GPU-hours. Translates to ~7.7e23 (8B) and ~3.8e24 (70B) FLOPs. Bill_4 fully paid; first major frontier-lab disclosure with hardware-utilization disclosure. Bill_10 reproducible (Epoch confirms within 5%).",
    "candidate_bill": "Bill_4",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.96,
    "watchlist_tier": "quarterly",
    "training_flops_claimed": "7.7e23 (8B) / 3.8e24 (70B)",
    "training_flops_independent": "7.5e23 / 3.7e24 (Epoch confirms within 3%)",
    "vendor_epoch_discrepancy_factor": "1.03x (within disclosure bounds)",
    "vendor": "Meta",
    "model_evaluated": "Llama 3 8B, 70B",
    "jurisdiction_scope": "below EU/US triggers",
    "rebuttal_papers": [],
    "notes": "Cleanest Bill_4 + Bill_10 payment in 2024 corpus. Open-weight + GPU-hour disclosure enables third-party reproduction. Reference standard for vendor compute transparency.",
    "_appeared_in_sweeps": [
      "sweep_63_vendor_compute_disclosure_2024_2026"
    ]
  },
  {
    "paper_id": "meta:llama-3-1-flops-2024-07",
    "title": "Llama 3.1 (8B/70B/405B) Compute Disclosure",
    "authors": [
      "Meta AI"
    ],
    "date": "2024-07",
    "venue": "arxiv:2407.21783",
    "affiliations": [
      "Meta AI"
    ],
    "summary": "Llama 3.1 405B paper discloses 30.84M H100-hours, 16K H100 cluster, 54-day training, 39.3M GPU-hours total including ablations. Translates to ~3.8e25 FLOPs at 405B. Bill_4 paid in full; Bill_5 (cluster-scale + multi-region) addressed; Bill_9 (methodology) addressed. Falls just under EU AI Act 10^25 trigger by Meta's calculation; Epoch reconstruction places at 4.1e25 (above threshold).",
    "candidate_bill": "Bill_4",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.97,
    "watchlist_tier": "quarterly",
    "training_flops_claimed": "3.8e25 (Meta methodology)",
    "training_flops_independent": "4.1e25 (Epoch reconstruction)",
    "vendor_epoch_discrepancy_factor": "1.08x",
    "vendor": "Meta",
    "model_evaluated": "Llama 3.1 405B, 70B, 8B",
    "jurisdiction_scope": "EU 10^25 boundary",
    "rebuttal_papers": [
      {
        "paper_id": "epoch:llama-3-1-recon-2024-09",
        "summary": "Epoch AI: Llama 3.1 405B at 4.1e25 FLOPs accounting for activation recomputation; 8% above Meta disclosure."
      }
    ],
    "notes": "Most rigorous open-weight compute disclosure to date. Bill_4 fully paid. Discrepancy with Epoch (1.08x) is canonical 'within disclosure bounds' reference for cross-vendor reconciliation.",
    "_appeared_in_sweeps": [
      "sweep_63_vendor_compute_disclosure_2024_2026"
    ]
  },
  {
    "paper_id": "meta:llama-3-2-flops-2024-09",
    "title": "Llama 3.2 (1B/3B/11B/90B) Compute Disclosure",
    "authors": [
      "Meta AI"
    ],
    "date": "2024-09",
    "venue": "Meta Blog 2024-09-25",
    "affiliations": [
      "Meta AI"
    ],
    "summary": "Llama 3.2 vision-language and lightweight models disclose pre-training and SFT compute separately. Vision adapter training disclosed as ~10% of total. Bill_4 paid; Bill_M4 (restricted training-paradigm) addressed via SFT disclosure. Below EU/US trigger thresholds.",
    "candidate_bill": "Bill_4",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.93,
    "watchlist_tier": "quarterly",
    "training_flops_claimed": "~5e24 (90B)",
    "training_flops_independent": "5.2e24 (Epoch confirms within 4%)",
    "vendor_epoch_discrepancy_factor": "1.04x",
    "vendor": "Meta",
    "model_evaluated": "Llama 3.2 1B, 3B, 11B, 90B",
    "jurisdiction_scope": "below EU/US triggers",
    "rebuttal_papers": [],
    "notes": "First disclosure to itemize SFT + vision adapter compute separately. Bill_M4 addressed (training paradigm comprehensive).",
    "_appeared_in_sweeps": [
      "sweep_63_vendor_compute_disclosure_2024_2026"
    ]
  },
  {
    "paper_id": "meta:llama-3-3-flops-2024-12",
    "title": "Llama 3.3 70B Compute Disclosure",
    "authors": [
      "Meta AI"
    ],
    "date": "2024-12",
    "venue": "Meta Blog 2024-12-06",
    "affiliations": [
      "Meta AI"
    ],
    "summary": "Llama 3.3 70B is Llama 3.1 70B base with improved post-training; total compute disclosed including extended SFT/DPO phases. Bill_4 + Bill_M4 paid. Below EU/US triggers. Reinforces Meta's reference-standard disclosure pattern.",
    "candidate_bill": "Bill_4",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.92,
    "watchlist_tier": "quarterly",
    "training_flops_claimed": "~6.6e24",
    "training_flops_independent": "6.5e24 (Epoch confirms)",
    "vendor_epoch_discrepancy_factor": "1.01x",
    "vendor": "Meta",
    "model_evaluated": "Llama 3.3 70B",
    "jurisdiction_scope": "below EU/US triggers",
    "rebuttal_papers": [],
    "notes": "Tightest Meta-Epoch reconciliation in corpus (1.01x). Bill_10 effectively paid via reproducible disclosure.",
    "_appeared_in_sweeps": [
      "sweep_63_vendor_compute_disclosure_2024_2026"
    ]
  },
  {
    "paper_id": "meta:llama-4-flops-2025-04",
    "title": "Llama 4 (Scout, Maverick, Behemoth) Compute Disclosure Controversy",
    "authors": [
      "Meta AI"
    ],
    "date": "2025-04",
    "venue": "Meta Blog + Model Card 2025-04-05",
    "affiliations": [
      "Meta AI"
    ],
    "summary": "Llama 4 announcement claims 'most efficient frontier model' with training compute figures contested by community. LMArena leak revealed evaluation-targeted variant differing from released checkpoint. Disclosed compute: Scout 5e24, Maverick ~3.2e25, Behemoth (in training) >2e26. Vendor-Epoch discrepancy 1.4x for Maverick (Epoch: 4.5e25). Bill_4 partial payment + Bill_5 questions; Bill_10 fails due to LMArena variant controversy.",
    "candidate_bill": "Bill_4",
    "candidate_meta_cost": "M5",
    "verdict": "known_bill",
    "confidence": 0.85,
    "watchlist_tier": "triggered",
    "training_flops_claimed": "5e24 (Scout) / 3.2e25 (Maverick) / >2e26 (Behemoth)",
    "training_flops_independent": "5e24 / 4.5e25 / 2.4e26 (Epoch)",
    "vendor_epoch_discrepancy_factor": "1.0x / 1.4x / 1.2x",
    "vendor": "Meta",
    "model_evaluated": "Llama 4 Scout, Maverick, Behemoth",
    "jurisdiction_scope": "Behemoth crosses US EO 10^26",
    "rebuttal_papers": [
      {
        "paper_id": "lmarena:llama-4-variant-2025-04",
        "summary": "LMArena evidence: 'Llama-4-Maverick-Experimental' on chatbot arena differs from released checkpoint; vendor 'optimized variant for human preference'."
      },
      {
        "paper_id": "epoch:llama-4-recon-2025-04",
        "summary": "Epoch AI: Maverick at 4.5e25 (1.4x Meta disclosure); Behemoth at 2.4e26 (above US EO 10^26 trigger)."
      }
    ],
    "notes": "Most controversial recent compute disclosure. Maverick 1.4x discrepancy is canonical 'vendor-Epoch reconciliation gap'. Behemoth crosses US EO 10^26 \u2014 first open-weight model with mandatory BIS reporting.",
    "_appeared_in_sweeps": [
      "sweep_63_vendor_compute_disclosure_2024_2026"
    ]
  },
  {
    "paper_id": "mistral:large-2-flops-2024-07",
    "title": "Mistral Large 2 Compute Disclosure",
    "authors": [
      "Mistral AI"
    ],
    "date": "2024-07",
    "venue": "Mistral Blog 2024-07-24",
    "affiliations": [
      "Mistral AI"
    ],
    "summary": "Mistral Large 2 (123B) announcement omits training FLOPs. References 'frontier-class' without absolute number. Bill_4 fails. EU-domiciled vendor falls under EU AI Act 10^25 disclosure obligation; vendor declines. Epoch reconstruction places at 1.2e25 (just above threshold).",
    "candidate_bill": "Bill_4",
    "candidate_meta_cost": "M5",
    "verdict": "known_bill",
    "confidence": 0.83,
    "watchlist_tier": "monthly",
    "training_flops_claimed": null,
    "training_flops_independent": "1.2e25 (Epoch reconstruction)",
    "vendor_epoch_discrepancy_factor": "unbounded",
    "vendor": "Mistral AI",
    "model_evaluated": "Mistral Large 2 (123B)",
    "jurisdiction_scope": "EU 10^25 trigger zone",
    "rebuttal_papers": [
      {
        "paper_id": "epoch:mistral-large-2-recon-2024-09",
        "summary": "Epoch AI: Mistral Large 2 at 1.2e25 FLOPs (90% CI 0.8-1.8e25); EU AI Act 10^25 systemic-risk threshold likely crossed."
      }
    ],
    "notes": "First major EU-domiciled vendor non-disclosure of training FLOPs. EU AI Office obligated party but no public disclosure.",
    "_appeared_in_sweeps": [
      "sweep_63_vendor_compute_disclosure_2024_2026"
    ]
  },
  {
    "paper_id": "mistral:large-flops-2024-02",
    "title": "Mistral Large + 8x22B Compute Disclosure",
    "authors": [
      "Mistral AI"
    ],
    "date": "2024-02",
    "venue": "Mistral Blog 2024-02-26 + 2024-04-10",
    "affiliations": [
      "Mistral AI"
    ],
    "summary": "Mistral Large (Feb 2024) and 8x22B Mixtral (Apr 2024) provide MoE architecture details (8 experts, 22B active) but omit training FLOPs. Bill_4 fails. Active-parameter MoE creates additional Bill_4 ambiguity.",
    "candidate_bill": "Bill_4",
    "candidate_meta_cost": "M5",
    "verdict": "known_bill",
    "confidence": 0.84,
    "watchlist_tier": "quarterly",
    "training_flops_claimed": null,
    "training_flops_independent": "5e24 (Epoch reconstruction Mistral Large)",
    "vendor_epoch_discrepancy_factor": "unbounded",
    "vendor": "Mistral AI",
    "model_evaluated": "Mistral Large, Mixtral 8x22B",
    "jurisdiction_scope": "below EU 10^25",
    "rebuttal_papers": [
      {
        "paper_id": "epoch:mistral-recon-2024-05",
        "summary": "Epoch AI: Mistral Large at ~5e24 FLOPs; Mixtral 8x22B at ~3e24."
      }
    ],
    "notes": "Mistral pattern: open-weight (8x22B) but training-FLOPs opaque. Bill_M4 (training-paradigm) partial via MoE detail.",
    "_appeared_in_sweeps": [
      "sweep_63_vendor_compute_disclosure_2024_2026"
    ]
  },
  {
    "paper_id": "mistral:medium-3-flops-2025-05",
    "title": "Mistral Medium 3 Compute Disclosure",
    "authors": [
      "Mistral AI"
    ],
    "date": "2025-05",
    "venue": "Mistral Blog 2025-05",
    "affiliations": [
      "Mistral AI"
    ],
    "summary": "Mistral Medium 3 announcement claims 'frontier capability at 8x cost reduction' but omits training FLOPs. Pattern unchanged from Large 2. Bill_4 fails. EU AI Act compliance status unclear \u2014 vendor declines to confirm 10^25 status.",
    "candidate_bill": "Bill_4",
    "candidate_meta_cost": "M5",
    "verdict": "known_bill",
    "confidence": 0.82,
    "watchlist_tier": "monthly",
    "training_flops_claimed": null,
    "training_flops_independent": "1.5e25 (Epoch reconstruction)",
    "vendor_epoch_discrepancy_factor": "unbounded",
    "vendor": "Mistral AI",
    "model_evaluated": "Mistral Medium 3",
    "jurisdiction_scope": "EU 10^25 trigger zone",
    "rebuttal_papers": [
      {
        "paper_id": "epoch:mistral-medium-3-recon-2025-05",
        "summary": "Epoch AI: Mistral Medium 3 at 1.5e25 FLOPs; above EU 10^25 systemic-risk trigger."
      }
    ],
    "notes": "Most recent Mistral disclosure. Bill_4 unreformed across 18 months. EU AI Office enforcement test case.",
    "_appeared_in_sweeps": [
      "sweep_63_vendor_compute_disclosure_2024_2026"
    ]
  },
  {
    "paper_id": "deepseek:v2-flops-2024-05",
    "title": "DeepSeek-V2 Technical Report (Compute Section)",
    "authors": [
      "DeepSeek AI"
    ],
    "date": "2024-05",
    "venue": "arxiv:2405.04434",
    "affiliations": [
      "DeepSeek AI"
    ],
    "summary": "DeepSeek-V2 (236B MoE, 21B active) discloses training compute as 1.4M H800-hours (export-controlled GPU). Translates to ~5e23 FLOPs. Discloses cluster size (2K H800), training time (60 days). Bill_4 paid; Bill_15 (export-control bypass audit) flagged due to H800 (China-specific GPU) usage.",
    "candidate_bill": "Bill_4",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.92,
    "watchlist_tier": "quarterly",
    "training_flops_claimed": "~5e23 (1.4M H800-hours)",
    "training_flops_independent": "5.2e23 (Epoch confirms within 4%)",
    "vendor_epoch_discrepancy_factor": "1.04x",
    "vendor": "DeepSeek",
    "model_evaluated": "DeepSeek-V2",
    "jurisdiction_scope": "China + below EU/US triggers",
    "rebuttal_papers": [],
    "notes": "First major Chinese-vendor full compute disclosure. Sets DeepSeek transparency reference. Bill_15 raised due to H800 reliance \u2014 export-control adjacency.",
    "_appeared_in_sweeps": [
      "sweep_63_vendor_compute_disclosure_2024_2026"
    ]
  },
  {
    "paper_id": "deepseek:v3-flops-2024-12",
    "title": "DeepSeek-V3 Cost-Disclosure Controversy",
    "authors": [
      "DeepSeek AI"
    ],
    "date": "2024-12",
    "venue": "arxiv:2412.19437",
    "affiliations": [
      "DeepSeek AI"
    ],
    "summary": "DeepSeek-V3 (671B MoE, 37B active) discloses 2.788M H800-hours total training compute (~5.5e24 FLOPs) and $5.576M cost. Triggers global controversy: SemiAnalysis disputes the cost claim, alleging undisclosed pre-training infrastructure costs, R&D ablations, and use of >50K total Hopper GPUs. Vendor-Epoch FLOPs discrepancy 1.0x (Epoch confirms FLOPs); cost-disclosure discrepancy 5-10x. Bill_4 (FLOPs) paid; Bill_6 (compute-cost-as-deterrent) raised, with this record as its canonical anchor. Bill_10 partial.",
    "candidate_bill": "Bill_4",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.96,
    "watchlist_tier": "triggered",
    "training_flops_claimed": "5.5e24 (2.788M H800-hours)",
    "training_flops_independent": "5.5e24 (Epoch confirms)",
    "vendor_epoch_discrepancy_factor": "1.0x (FLOPs) / 5-10x (cost)",
    "vendor": "DeepSeek",
    "model_evaluated": "DeepSeek-V3",
    "jurisdiction_scope": "China + below EU/US triggers",
    "rebuttal_papers": [
      {
        "paper_id": "semianalysis:deepseek-v3-cost-2025-01",
        "summary": "SemiAnalysis: DeepSeek total compute infrastructure $1.6B+ over 2023-2024; V3 disclosed cost reflects only final-run marginal compute, not amortized R&D / ablations / pre-training data construction."
      },
      {
        "paper_id": "epoch:deepseek-v3-recon-2025-01",
        "summary": "Epoch AI: V3 final-training FLOPs match disclosure; total R&D compute (ablations + experiments) likely 5-10x final run."
      }
    ],
    "notes": "Canonical Bill_4 + Bill_6 anchor. FLOPs disclosure clean; cost disclosure ambiguous. Triggered Jan 2025 market reaction (Nvidia -17% intraday) attributed to 'cost-efficiency' interpretation that may not survive Bill_6 audit.",
    "_appeared_in_sweeps": [
      "sweep_63_vendor_compute_disclosure_2024_2026"
    ]
  },
  {
    "paper_id": "deepseek:r1-flops-2025-01",
    "title": "DeepSeek-R1 Compute Disclosure (Reasoning Model)",
    "authors": [
      "DeepSeek AI"
    ],
    "date": "2025-01",
    "venue": "arxiv:2501.12948",
    "affiliations": [
      "DeepSeek AI"
    ],
    "summary": "DeepSeek-R1 builds on V3-Base via RL (GRPO) for reasoning. R1-Zero pure-RL run + R1 SFT+RL run separately disclosed. Total post-training compute estimated ~10% of V3 base (~5e23 additional FLOPs). Bill_4 + Bill_M4 (training paradigm comprehensive). Distillation to 1.5B-70B Qwen/Llama checkpoints triggers Bill_2 (distillation circumvention) \u2014 distilled R1-Distill achieves 80%+ of R1 capability at 1-10% compute.",
    "candidate_bill": "Bill_2",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.94,
    "watchlist_tier": "triggered",
    "training_flops_claimed": "~5e23 RL post-training (V3-base inherited)",
    "training_flops_independent": "5.2e23 (Epoch confirms)",
    "vendor_epoch_discrepancy_factor": "1.04x",
    "vendor": "DeepSeek",
    "model_evaluated": "DeepSeek-R1, R1-Zero, R1-Distill",
    "jurisdiction_scope": "China + below EU/US triggers",
    "rebuttal_papers": [
      {
        "paper_id": "arxiv:2502.07016",
        "summary": "Pilz-Heim Apr 2025 distillation analysis: R1-Distill-32B at 80% R1 capability with <5% compute is canonical Bill_2 (distillation circumvention) anchor."
      }
    ],
    "notes": "Canonical Bill_2 anchor. Distillation circumvention demonstrated empirically at frontier scale. Compute-threshold-as-mitigation hypothesis directly falsified.",
    "_appeared_in_sweeps": [
      "sweep_63_vendor_compute_disclosure_2024_2026"
    ]
  },
  {
    "paper_id": "deepseek:v3-1-flops-2025-08",
    "title": "DeepSeek V3.1 Compute Disclosure",
    "authors": [
      "DeepSeek AI"
    ],
    "date": "2025-08",
    "venue": "DeepSeek Blog 2025-08",
    "affiliations": [
      "DeepSeek AI"
    ],
    "summary": "DeepSeek V3.1 (671B MoE) extends V3 with longer context (128K) and improved instruction tuning. Continued training compute disclosed at ~6e23 additional FLOPs. Bill_4 paid in DeepSeek transparency pattern. Inference-time compute hybrid mode aligns with Bill_3 \u2014 partial disclosure.",
    "candidate_bill": "Bill_4",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.92,
    "watchlist_tier": "monthly",
    "training_flops_claimed": "~6.1e24 cumulative (V3 + extension)",
    "training_flops_independent": "6.0e24 (Epoch)",
    "vendor_epoch_discrepancy_factor": "1.02x",
    "vendor": "DeepSeek",
    "model_evaluated": "DeepSeek V3.1",
    "jurisdiction_scope": "below EU/US triggers",
    "rebuttal_papers": [],
    "notes": "Maintains DeepSeek transparency reference. Cumulative training disclosure (Bill_4 fully paid) contrasts with Anthropic/OpenAI opacity.",
    "_appeared_in_sweeps": [
      "sweep_63_vendor_compute_disclosure_2024_2026"
    ]
  },
  {
    "paper_id": "qwen:qwen-2-flops-2024-06",
    "title": "Qwen-2 (72B/57B-A14B/7B/1.5B/0.5B) Compute Disclosure",
    "authors": [
      "Alibaba Cloud / Qwen Team"
    ],
    "date": "2024-06",
    "venue": "arxiv:2407.10671",
    "affiliations": [
      "Alibaba Cloud"
    ],
    "summary": "Qwen-2 technical report discloses training token counts (7T tokens for 72B) and parameter counts; FLOPs derivable. Hardware mix not fully disclosed; training time approximate. Bill_4 partial payment via FLOPs derivability; Bill_10 partial.",
    "candidate_bill": "Bill_4",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.87,
    "watchlist_tier": "quarterly",
    "training_flops_claimed": "~3e24 (72B at 7T tokens, derivable)",
    "training_flops_independent": "3.0e24 (Epoch confirms)",
    "vendor_epoch_discrepancy_factor": "1.0x",
    "vendor": "Alibaba",
    "model_evaluated": "Qwen-2 72B, 57B-A14B, 7B, 1.5B, 0.5B",
    "jurisdiction_scope": "below EU/US triggers",
    "rebuttal_papers": [],
    "notes": "Qwen pattern: tokens + params disclosed, FLOPs derivable. Bill_4 partial (not direct disclosure but reconstructible).",
    "_appeared_in_sweeps": [
      "sweep_63_vendor_compute_disclosure_2024_2026"
    ]
  },
  {
    "paper_id": "qwen:qwen-2-5-flops-2024-09",
    "title": "Qwen-2.5 Compute Disclosure (72B / Math / Coder)",
    "authors": [
      "Alibaba Cloud / Qwen Team"
    ],
    "date": "2024-09",
    "venue": "arxiv:2412.15115",
    "affiliations": [
      "Alibaba Cloud"
    ],
    "summary": "Qwen-2.5 series discloses 18T training tokens for 72B model. Specialized variants (Math, Coder) detail SFT compute. Bill_4 + Bill_M4 partial payment. Below EU/US triggers but approaching at 72B base.",
    "candidate_bill": "Bill_4",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.89,
    "watchlist_tier": "quarterly",
    "training_flops_claimed": "~7.8e24 (72B at 18T tokens)",
    "training_flops_independent": "7.7e24 (Epoch confirms)",
    "vendor_epoch_discrepancy_factor": "1.01x",
    "vendor": "Alibaba",
    "model_evaluated": "Qwen-2.5 72B, Math, Coder",
    "jurisdiction_scope": "below EU/US triggers",
    "rebuttal_papers": [],
    "notes": "Most thorough Qwen disclosure. Multiple model-variant compute itemization addresses Bill_M4.",
    "_appeared_in_sweeps": [
      "sweep_63_vendor_compute_disclosure_2024_2026"
    ]
  },
  {
    "paper_id": "qwen:qwen-3-flops-2025-04",
    "title": "Qwen-3 Compute Disclosure (235B-A22B / 32B / 14B / 8B / 4B / 1.7B / 0.6B)",
    "authors": [
      "Alibaba Cloud / Qwen Team"
    ],
    "date": "2025-04",
    "venue": "Qwen Blog 2025-04-29",
    "affiliations": [
      "Alibaba Cloud"
    ],
    "summary": "Qwen-3 family discloses ~36T training tokens for largest models, including thinking-mode and non-thinking-mode variants. Bill_4 partial payment via tokens; full FLOPs derivable. Bill_3 partial (thinking-mode compute referenced). Crosses EU 10^25 at 235B scale (~5e25 FLOPs, dense-equivalent).",
    "candidate_bill": "Bill_4",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.88,
    "watchlist_tier": "monthly",
    "training_flops_claimed": "~5.1e25 (235B-A22B at 36T tokens, dense-equiv)",
    "training_flops_independent": "5.3e25 (Epoch)",
    "vendor_epoch_discrepancy_factor": "1.04x",
    "vendor": "Alibaba",
    "model_evaluated": "Qwen-3 235B-A22B, 32B, 14B, 8B",
    "jurisdiction_scope": "EU 10^25 trigger zone",
    "rebuttal_papers": [
      {
        "paper_id": "epoch:qwen-3-recon-2025-05",
        "summary": "Epoch AI: Qwen-3 235B at 5.3e25 FLOPs; first Qwen above EU 10^25 systemic-risk trigger."
      }
    ],
    "notes": "First Qwen model crossing EU 10^25. China-vendor disclosure pattern (tokens + params, derivable FLOPs) cleaner than Anthropic/OpenAI/DeepMind.",
    "_appeared_in_sweeps": [
      "sweep_63_vendor_compute_disclosure_2024_2026"
    ]
  },
  {
    "paper_id": "yi:yi-large-flops-2024-05",
    "title": "Yi-Large Compute Disclosure",
    "authors": [
      "01.AI / Yi Team"
    ],
    "date": "2024-05",
    "venue": "01.AI Blog 2024-05",
    "affiliations": [
      "01.AI"
    ],
    "summary": "Yi-Large omits training FLOPs in primary blog. Yi-1.5 (March 2024) discloses 3.6T tokens training data; FLOPs derivable. Bill_4 partial. China-vendor pattern of token-disclosure + parameter-disclosure shared with Qwen, DeepSeek.",
    "candidate_bill": "Bill_4",
    "candidate_meta_cost": "M5",
    "verdict": "known_bill",
    "confidence": 0.83,
    "watchlist_tier": "quarterly",
    "training_flops_claimed": "~1e24 derivable (Yi-1.5 34B)",
    "training_flops_independent": "1.0e24 (Epoch)",
    "vendor_epoch_discrepancy_factor": "1.0x",
    "vendor": "01.AI",
    "model_evaluated": "Yi-Large, Yi-1.5 34B / 9B / 6B",
    "jurisdiction_scope": "below EU/US triggers",
    "rebuttal_papers": [],
    "notes": "Yi follows Qwen/DeepSeek China-vendor pattern. Bill_4 partial via derivable FLOPs.",
    "_appeared_in_sweeps": [
      "sweep_63_vendor_compute_disclosure_2024_2026"
    ]
  },
  {
    "paper_id": "xai:grok-2-flops-2024-08",
    "title": "xAI Grok-2 Compute Disclosure",
    "authors": [
      "xAI"
    ],
    "date": "2024-08",
    "venue": "xAI Blog 2024-08-13",
    "affiliations": [
      "xAI"
    ],
    "summary": "Grok-2 announcement omits training FLOPs. References 'large-scale infrastructure' (Memphis cluster, 100K H100 by end-2024). Bill_4 fails. Bill_5 (cluster-scale) partially addressed via Colossus disclosure but compute itemization absent.",
    "candidate_bill": "Bill_4",
    "candidate_meta_cost": "M5",
    "verdict": "known_bill",
    "confidence": 0.82,
    "watchlist_tier": "monthly",
    "training_flops_claimed": null,
    "training_flops_independent": "3e25 (Epoch reconstruction)",
    "vendor_epoch_discrepancy_factor": "unbounded",
    "vendor": "xAI",
    "model_evaluated": "Grok-2",
    "jurisdiction_scope": "EU 10^25 trigger zone",
    "rebuttal_papers": [
      {
        "paper_id": "epoch:grok-2-recon-2024-09",
        "summary": "Epoch AI: Grok-2 at 3e25 FLOPs based on Colossus cluster size + training-time public statements."
      }
    ],
    "notes": "xAI follows Anthropic/OpenAI opacity pattern. Bill_4 fails despite Memphis cluster transparency on hardware count.",
    "_appeared_in_sweeps": [
      "sweep_63_vendor_compute_disclosure_2024_2026"
    ]
  },
  {
    "paper_id": "xai:grok-3-flops-2025-02",
    "title": "xAI Grok-3 Compute Disclosure",
    "authors": [
      "xAI"
    ],
    "date": "2025-02",
    "venue": "xAI Blog 2025-02-17",
    "affiliations": [
      "xAI"
    ],
    "summary": "Grok-3 announcement: '10x compute of Grok-2', references 200K H100 Colossus cluster. No absolute FLOPs disclosed. Bill_4 fails; Bill_5 partial via cluster-size. Capability claims on AIME/GPQA depend on 'thinking' mode (Bill_3 fails). First xAI model probably crossing US EO 10^26 trigger.",
    "candidate_bill": "Bill_4",
    "candidate_meta_cost": "M5",
    "verdict": "known_bill",
    "confidence": 0.85,
    "watchlist_tier": "triggered",
    "training_flops_claimed": "~10x Grok-2 (relative)",
    "training_flops_independent": "3e26 (Epoch reconstruction)",
    "vendor_epoch_discrepancy_factor": "unbounded",
    "vendor": "xAI",
    "model_evaluated": "Grok-3",
    "jurisdiction_scope": "US EO 10^26 trigger zone",
    "rebuttal_papers": [
      {
        "paper_id": "epoch:grok-3-recon-2025-03",
        "summary": "Epoch AI: Grok-3 at 3e26 FLOPs (90% CI 1.5-5e26); above US EO 10^26 trigger."
      }
    ],
    "notes": "Likely first xAI model crossing US EO 10^26. BIS reporting requirement applies; vendor disclosure absent.",
    "_appeared_in_sweeps": [
      "sweep_63_vendor_compute_disclosure_2024_2026"
    ]
  },
  {
    "paper_id": "cohere:command-r-plus-flops-2024-04",
    "title": "Cohere Command R+ Compute Disclosure",
    "authors": [
      "Cohere"
    ],
    "date": "2024-04",
    "venue": "Cohere Blog 2024-04-04",
    "affiliations": [
      "Cohere"
    ],
    "summary": "Command R+ (104B) announcement omits training FLOPs. Open-weight release at HuggingFace allows architecture inference. Training tokens approximate from public statements (~1T+). Bill_4 partial. Below EU 10^25 trigger.",
    "candidate_bill": "Bill_4",
    "candidate_meta_cost": "M5",
    "verdict": "known_bill",
    "confidence": 0.82,
    "watchlist_tier": "quarterly",
    "training_flops_claimed": null,
    "training_flops_independent": "6e23 (Epoch reconstruction)",
    "vendor_epoch_discrepancy_factor": "unbounded",
    "vendor": "Cohere",
    "model_evaluated": "Command R+",
    "jurisdiction_scope": "below EU/US triggers",
    "rebuttal_papers": [
      {
        "paper_id": "epoch:cohere-r-plus-recon-2024-06",
        "summary": "Epoch AI: Command R+ at 6e23 FLOPs based on dense-104B + 1T-token estimate."
      }
    ],
    "notes": "Cohere opacity despite open-weight release. Bill_4 partial via architecture inference.",
    "_appeared_in_sweeps": [
      "sweep_63_vendor_compute_disclosure_2024_2026"
    ]
  },
  {
    "paper_id": "cohere:command-a-flops-2025-03",
    "title": "Cohere Command-A Compute Disclosure",
    "authors": [
      "Cohere"
    ],
    "date": "2025-03",
    "venue": "Cohere Blog 2025-03",
    "affiliations": [
      "Cohere"
    ],
    "summary": "Command-A (111B) targets enterprise reasoning. Training compute undisclosed. Open-weight at HuggingFace. Bill_4 fails. Below EU 10^25.",
    "candidate_bill": "Bill_4",
    "candidate_meta_cost": "M5",
    "verdict": "known_bill",
    "confidence": 0.8,
    "watchlist_tier": "quarterly",
    "training_flops_claimed": null,
    "training_flops_independent": "8e23 (Epoch)",
    "vendor_epoch_discrepancy_factor": "unbounded",
    "vendor": "Cohere",
    "model_evaluated": "Command-A",
    "jurisdiction_scope": "below EU/US triggers",
    "rebuttal_papers": [
      {
        "paper_id": "epoch:cohere-a-recon-2025-04",
        "summary": "Epoch AI: Command-A at 8e23 FLOPs."
      }
    ],
    "notes": "Cohere disclosure pattern unchanged. Bill_4 unreformed.",
    "_appeared_in_sweeps": [
      "sweep_63_vendor_compute_disclosure_2024_2026"
    ]
  },
  {
    "paper_id": "ai21:jamba-1-5-flops-2024-08",
    "title": "AI21 Jamba 1.5 Compute Disclosure",
    "authors": [
      "AI21 Labs"
    ],
    "date": "2024-08",
    "venue": "arxiv:2408.12570",
    "affiliations": [
      "AI21 Labs"
    ],
    "summary": "Jamba 1.5 (398B Large, 52B Mini) hybrid Transformer-Mamba. Paper discloses architecture details extensively but training FLOPs not directly stated. Token count partially disclosed. Bill_4 partial. Below EU/US triggers.",
    "candidate_bill": "Bill_4",
    "candidate_meta_cost": "M5",
    "verdict": "known_bill",
    "confidence": 0.81,
    "watchlist_tier": "quarterly",
    "training_flops_claimed": null,
    "training_flops_independent": "1.5e24 (Epoch)",
    "vendor_epoch_discrepancy_factor": "unbounded",
    "vendor": "AI21",
    "model_evaluated": "Jamba 1.5 Large, Jamba 1.5 Mini",
    "jurisdiction_scope": "below EU/US triggers",
    "rebuttal_papers": [
      {
        "paper_id": "epoch:jamba-1-5-recon-2024-10",
        "summary": "Epoch AI: Jamba 1.5 Large at 1.5e24 FLOPs."
      }
    ],
    "notes": "AI21 Mamba-hybrid creates Bill_4 ambiguity (different scaling than Transformer). Architecture disclosure clean; FLOPs methodology unclear.",
    "_appeared_in_sweeps": [
      "sweep_63_vendor_compute_disclosure_2024_2026"
    ]
  },
  {
    "paper_id": "epoch:flops-recon-method-2024-04",
    "title": "Epoch AI Compute Reconstruction Methodology",
    "authors": [
      "Jaime Sevilla",
      "Lennart Heim",
      "et al."
    ],
    "date": "2024-04",
    "venue": "Epoch AI Methodology Note 2024-04",
    "affiliations": [
      "Epoch AI"
    ],
    "summary": "Methodology paper describing Epoch AI's approach to training-FLOPs reconstruction from public hardware references, training-time leaks, parameter-count + token-count proxies. Bill_4 + Bill_10 cousin paper \u2014 provides independent reconstruction baseline. Vendor-Epoch discrepancy of 1.3-3.2x flagged as canonical for vendors with partial disclosure; unbounded for non-disclosers.",
    "candidate_bill": "Bill_10",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.97,
    "watchlist_tier": "quarterly",
    "training_flops_claimed": null,
    "training_flops_independent": null,
    "vendor_epoch_discrepancy_factor": null,
    "vendor": "Epoch AI",
    "model_evaluated": "All major frontier models",
    "jurisdiction_scope": "cross-jurisdiction",
    "rebuttal_papers": [],
    "notes": "Reference methodology for Bill_10 (independent reconstruction). Foundational paper enabling all vendor-Epoch discrepancy analyses in this corpus.",
    "_appeared_in_sweeps": [
      "sweep_63_vendor_compute_disclosure_2024_2026"
    ]
  },
  {
    "paper_id": "stanford:fmti-2024-05",
    "title": "Stanford CRFM Foundation Model Transparency Index 2024",
    "authors": [
      "Rishi Bommasani",
      "Percy Liang",
      "et al."
    ],
    "date": "2024-05",
    "venue": "Stanford CRFM FMTI 2024-05",
    "affiliations": [
      "Stanford CRFM"
    ],
    "summary": "Multi-vendor transparency index across 100 indicators including 5 compute-related. Anthropic Claude 3 scores 0/5 on compute disclosure; Meta Llama 3 scores 5/5; OpenAI GPT-4 1/5. Multi-vendor inconsistency formalized. Bill_4 + Bill_10 + Bill_14 cousin paper.",
    "candidate_bill": "Bill_14",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.94,
    "watchlist_tier": "quarterly",
    "training_flops_claimed": null,
    "training_flops_independent": null,
    "vendor_epoch_discrepancy_factor": null,
    "vendor": "Stanford CRFM (multi-vendor)",
    "model_evaluated": "10 frontier vendors",
    "jurisdiction_scope": "cross-vendor + cross-jurisdiction",
    "rebuttal_papers": [],
    "notes": "Canonical multi-vendor disclosure inconsistency reference. Bill_14 (cross-jurisdiction harmonization) candidate paper.",
    "_appeared_in_sweeps": [
      "sweep_63_vendor_compute_disclosure_2024_2026"
    ]
  },
  {
    "paper_id": "stanford:fmti-v2-2024-10",
    "title": "Stanford CRFM Foundation Model Transparency Index v2",
    "authors": [
      "Rishi Bommasani",
      "Kevin Klyman",
      "Percy Liang",
      "et al."
    ],
    "date": "2024-10",
    "venue": "Stanford CRFM FMTI v2 2024-10",
    "affiliations": [
      "Stanford CRFM"
    ],
    "summary": "FMTI v2 expands to 14 vendors, 100 indicators. Compute disclosure: Meta still 5/5; Mistral 1/5; xAI 0/5; Anthropic 1/5 (improvement); DeepSeek 4/5; Qwen 4/5. Western-vs-China asymmetry: China-vendor average 3.5/5 on compute; Western average 1.7/5. Bill_4 + Bill_14 multi-vendor reference.",
    "candidate_bill": "Bill_14",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.95,
    "watchlist_tier": "quarterly",
    "training_flops_claimed": null,
    "training_flops_independent": null,
    "vendor_epoch_discrepancy_factor": null,
    "vendor": "Stanford CRFM (multi-vendor)",
    "model_evaluated": "14 frontier vendors",
    "jurisdiction_scope": "cross-vendor + cross-jurisdiction",
    "rebuttal_papers": [],
    "notes": "Canonical Western-China disclosure asymmetry reference. China-vendor compute disclosure surpasses Western on average \u2014 counterintuitive empirical anchor.",
    "_appeared_in_sweeps": [
      "sweep_63_vendor_compute_disclosure_2024_2026"
    ]
  },
  {
    "paper_id": "epoch:notable-models-2024-12",
    "title": "Epoch AI Notable Models Database 2024 Annual Update",
    "authors": [
      "Epoch AI Team"
    ],
    "date": "2024-12",
    "venue": "Epoch AI Database 2024-12",
    "affiliations": [
      "Epoch AI"
    ],
    "summary": "Comprehensive database of 800+ notable models with reconstructed training FLOPs. 2024 frontier additions: Llama 3.1 405B (4.1e25), Claude 3.5 Sonnet (3.6e25), GPT-4o (3.8e25), Gemini 1.5 Pro (5.4e25), DeepSeek V3 (5.5e24). Bill_4 + Bill_10 reference dataset.",
    "candidate_bill": "Bill_10",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.96,
    "watchlist_tier": "quarterly",
    "training_flops_claimed": null,
    "training_flops_independent": null,
    "vendor_epoch_discrepancy_factor": null,
    "vendor": "Epoch AI (multi-vendor)",
    "model_evaluated": "800+ notable models",
    "jurisdiction_scope": "cross-jurisdiction",
    "rebuttal_papers": [],
    "notes": "Foundational reference for cross-vendor compute reconstruction. Database powers all Bill_4/Bill_10 analyses.",
    "_appeared_in_sweeps": [
      "sweep_63_vendor_compute_disclosure_2024_2026"
    ]
  },
  {
    "paper_id": "epoch:1e26-models-2025-04",
    "title": "Epoch AI: Models Trained Above 10^26 FLOPs (US EO Tier)",
    "authors": [
      "Epoch AI Team"
    ],
    "date": "2025-04",
    "venue": "Epoch AI Brief 2025-04",
    "affiliations": [
      "Epoch AI"
    ],
    "summary": "Identifies 5+ models above US EO 10^26 FLOPs threshold by Q1 2025: Llama 4 Behemoth (~2.4e26), o4 (~1.2e26), Grok-3 (~3e26), Gemini 2.5 Pro (~1.5e26), Claude 4 Opus (~1.8e26). Of these, only Llama discloses; all four others fail Bill_4. US EO BIS reporting required for all five; reporting compliance unverified.",
    "candidate_bill": "Bill_7",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.93,
    "watchlist_tier": "triggered",
    "training_flops_claimed": null,
    "training_flops_independent": null,
    "vendor_epoch_discrepancy_factor": null,
    "vendor": "Epoch AI (multi-vendor)",
    "model_evaluated": "Llama 4 Behemoth, o4, Grok-3, Gemini 2.5, Claude 4",
    "jurisdiction_scope": "US EO 10^26 trigger zone",
    "rebuttal_papers": [],
    "notes": "Canonical Bill_7 (\u2605 empty-space candidate) reference. Five 10^26 models exist but only one (Llama) pays Bill_4. Compute-governance threshold failing in production.",
    "_appeared_in_sweeps": [
      "sweep_63_vendor_compute_disclosure_2024_2026"
    ]
  },
  {
    "paper_id": "metr:eval-protocols-2024-09",
    "title": "METR Evaluation Protocols for Frontier Compute Claims",
    "authors": [
      "METR Team"
    ],
    "date": "2024-09",
    "venue": "METR Methodology Note 2024-09",
    "affiliations": [
      "METR (Model Evaluation and Threat Research)"
    ],
    "summary": "Methodology paper for METR's collaboration with Anthropic, OpenAI, DeepMind on capability + compute evaluation. Bill_10 partial payment via independent evaluator role. Compute-evaluator independence not equivalent to compute-disclosure independence; vendor still controls underlying FLOPs reporting.",
    "candidate_bill": "Bill_10",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.88,
    "watchlist_tier": "quarterly",
    "training_flops_claimed": null,
    "training_flops_independent": null,
    "vendor_epoch_discrepancy_factor": null,
    "vendor": "METR",
    "model_evaluated": "evaluation methodology",
    "jurisdiction_scope": "cross-vendor",
    "rebuttal_papers": [],
    "notes": "Bill_10 partial \u2014 METR's evaluator independence does not extend to compute-disclosure verification. Distinction important for Bill_10 framing.",
    "_appeared_in_sweeps": [
      "sweep_63_vendor_compute_disclosure_2024_2026"
    ]
  },
  {
    "paper_id": "us-aisi:eval-deepseek-2025-02",
    "title": "US AISI / NIST Evaluation of DeepSeek R1 (Compute Section)",
    "authors": [
      "US AI Safety Institute"
    ],
    "date": "2025-02",
    "venue": "US AISI Report 2025-02",
    "affiliations": [
      "US AI Safety Institute / NIST"
    ],
    "summary": "US AISI evaluation of DeepSeek R1 includes compute analysis. Confirms DeepSeek-disclosed training FLOPs at ~5.5e24 base (V3) + ~5e23 R1 RL. Below US EO 10^26 trigger; export-control-evasion concerns flagged via H800 hardware. Bill_4 + Bill_15 cousin paper.",
    "candidate_bill": "Bill_15",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.91,
    "watchlist_tier": "monthly",
    "training_flops_claimed": "~5.5e24 V3 + ~5e23 R1",
    "training_flops_independent": "5.5e24 + 5.2e23",
    "vendor_epoch_discrepancy_factor": "1.0x / 1.04x",
    "vendor": "DeepSeek (US AISI evaluator)",
    "model_evaluated": "DeepSeek R1 + V3",
    "jurisdiction_scope": "US + China",
    "rebuttal_papers": [],
    "notes": "First US AISI vendor compute audit. Confirms DeepSeek transparency claim; Bill_15 hardware-export-control adjacency raised.",
    "_appeared_in_sweeps": [
      "sweep_63_vendor_compute_disclosure_2024_2026"
    ]
  },
  {
    "paper_id": "uk-aisi:multi-vendor-eval-2024-11",
    "title": "UK AISI Pre-Deployment Evaluation: Multi-Vendor Compute Analysis",
    "authors": [
      "UK AI Safety Institute"
    ],
    "date": "2024-11",
    "venue": "UK AISI Report 2024-11",
    "affiliations": [
      "UK AI Safety Institute"
    ],
    "summary": "UK AISI pre-deployment evaluation across Anthropic Claude, OpenAI o1, DeepMind Gemini. Reports capability findings but flags absence of vendor-disclosed training FLOPs as evaluation limitation. Bill_4 + Bill_10 cousin paper. UK Cap-1 (10^25) and Cap-2 (10^26) thresholds defined; vendor compliance unverified.",
    "candidate_bill": "Bill_4",
    "candidate_meta_cost": "M5",
    "verdict": "known_bill",
    "confidence": 0.9,
    "watchlist_tier": "monthly",
    "training_flops_claimed": null,
    "training_flops_independent": null,
    "vendor_epoch_discrepancy_factor": null,
    "vendor": "Multi-vendor (UK AISI evaluator)",
    "model_evaluated": "Claude, o1, Gemini",
    "jurisdiction_scope": "UK Cap-1/Cap-2",
    "rebuttal_papers": [],
    "notes": "First UK AISI multi-vendor compute analysis. Confirms vendor-disclosure asymmetry across major frontier labs.",
    "_appeared_in_sweeps": [
      "sweep_63_vendor_compute_disclosure_2024_2026"
    ]
  },
  {
    "paper_id": "pilz-heim:distillation-2025-04",
    "title": "Pilz-Heim: Compute Threshold Circumvention via Distillation",
    "authors": [
      "Lennart Heim",
      "Robert F. Trager",
      "Konstantin Pilz",
      "et al."
    ],
    "date": "2025-04",
    "venue": "RAND / Center for Long-Term Resilience 2025-04",
    "affiliations": [
      "RAND",
      "CLTR"
    ],
    "summary": "Empirical demonstration that distillation circumvents compute thresholds: DeepSeek R1-Distill-Qwen-32B at ~3% R1 compute reaches 80%+ R1 capability on math/reasoning. Compute threshold as governance mechanism falsified. Bill_2 + Bill_11 (\u2605 empty-space) canonical anchor. Bill_7 (\u2605 full compliance) blocked.",
    "candidate_bill": "Bill_2",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.97,
    "watchlist_tier": "triggered",
    "training_flops_claimed": null,
    "training_flops_independent": null,
    "vendor_epoch_discrepancy_factor": null,
    "vendor": "RAND / CLTR (analysis paper)",
    "model_evaluated": "DeepSeek R1-Distill family",
    "jurisdiction_scope": "cross-jurisdiction",
    "rebuttal_papers": [],
    "notes": "Canonical Bill_2 + Bill_11 anchor. Compute-threshold-as-mitigation hypothesis empirically falsified at frontier scale.",
    "_appeared_in_sweeps": [
      "sweep_63_vendor_compute_disclosure_2024_2026"
    ]
  },
  {
    "paper_id": "carlini:model-stealing-2024-03",
    "title": "Stealing Part of a Production Language Model",
    "authors": [
      "Nicholas Carlini",
      "Daniel Paleka",
      "et al."
    ],
    "date": "2024-03",
    "venue": "arxiv:2403.06634",
    "affiliations": [
      "Google DeepMind",
      "ETH Zurich"
    ],
    "summary": "Demonstrates extraction of embedding projection layers from OpenAI / Google production APIs at modest query cost. Foundational for compute-threshold circumvention via API-distillation. Bill_2 cousin paper. M3 (theoretical-only) partial \u2014 practical demonstration included.",
    "candidate_bill": "Bill_2",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.92,
    "watchlist_tier": "quarterly",
    "training_flops_claimed": null,
    "training_flops_independent": null,
    "vendor_epoch_discrepancy_factor": null,
    "vendor": "Multi-vendor (research paper)",
    "model_evaluated": "GPT-3.5, Gemini",
    "jurisdiction_scope": "cross-vendor",
    "rebuttal_papers": [],
    "notes": "Foundational Bill_2 anchor. API-distillation of frontier-model components empirically demonstrated.",
    "_appeared_in_sweeps": [
      "sweep_63_vendor_compute_disclosure_2024_2026"
    ]
  },
  {
    "paper_id": "snell-sutton:test-time-compute-2024-08",
    "title": "Snell et al.: Scaling LLM Test-Time Compute Optimally",
    "authors": [
      "Charlie Snell",
      "Jaehoon Lee",
      "Aviral Kumar",
      "et al."
    ],
    "date": "2024-08",
    "venue": "arxiv:2408.03314",
    "affiliations": [
      "UC Berkeley",
      "Google DeepMind"
    ],
    "summary": "Demonstrates test-time compute can substitute for model scale: compute-optimal test-time scaling is >4x more efficient than a best-of-N baseline and, in FLOPs-matched evaluation, can let a smaller model outperform a 14x larger one. Foundational for Bill_3 (test-time compute shadow). Compute-as-capability proxy decoupled. Bill_1 cousin paper. M3 partial \u2014 empirical scaling-law derivation.",
    "candidate_bill": "Bill_3",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.95,
    "watchlist_tier": "quarterly",
    "training_flops_claimed": null,
    "training_flops_independent": null,
    "vendor_epoch_discrepancy_factor": null,
    "vendor": "Research paper (Berkeley/DeepMind)",
    "model_evaluated": "PaLM 2, Gemini",
    "jurisdiction_scope": "cross-vendor",
    "rebuttal_papers": [],
    "notes": "Canonical Bill_3 anchor. Foundational scaling law for test-time-vs-training compute decoupling.",
    "_appeared_in_sweeps": [
      "sweep_63_vendor_compute_disclosure_2024_2026"
    ]
  },
  {
    "paper_id": "heim:threshold-design-2024-06",
    "title": "Heim: Compute Thresholds \u2014 Design and Future Directions",
    "authors": [
      "Lennart Heim",
      "Mauricio Baker",
      "et al."
    ],
    "date": "2024-06",
    "venue": "arxiv:2407.18553 / RAND 2024-06",
    "affiliations": [
      "RAND"
    ],
    "summary": "Design analysis of compute thresholds as governance mechanism. Evaluates EU 10^25, US 10^26, UK Cap-1/Cap-2. Identifies Bill_2 (distillation), Bill_3 (test-time), Bill_4 (measurement transparency), Bill_15 (export control bypass) as primary failure modes. Bill_8 + Bill_9 + Bill_13 + Bill_14 cousin paper.",
    "candidate_bill": "Bill_8",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.94,
    "watchlist_tier": "quarterly",
    "training_flops_claimed": null,
    "training_flops_independent": null,
    "vendor_epoch_discrepancy_factor": null,
    "vendor": "RAND (analysis paper)",
    "model_evaluated": "compute-threshold methodology",
    "jurisdiction_scope": "cross-jurisdiction",
    "rebuttal_papers": [],
    "notes": "Canonical compute-threshold design reference. Bills 2/3/4/15 failure modes formalized.",
    "_appeared_in_sweeps": [
      "sweep_63_vendor_compute_disclosure_2024_2026"
    ]
  },
  {
    "paper_id": "sevilla-heim:training-trends-2024-02",
    "title": "Sevilla-Heim: Training Compute Trends Across Three Regimes",
    "authors": [
      "Jaime Sevilla",
      "Lennart Heim",
      "Anson Ho",
      "Tamay Besiroglu",
      "Marius Hobbhahn",
      "Pablo Villalobos"
    ],
    "date": "2024-02",
    "venue": "Epoch AI Blog 2024-02 (arxiv:2202.05924 update)",
    "affiliations": [
      "Epoch AI"
    ],
    "summary": "Updated compute-trends analysis showing 4-5x annual training-compute growth at frontier; doubling time ~6 months. Threshold-revision schedule (Bill_13) implication: 10^26 will be exceeded by ~10x within 2 years post-EO. M1 partial (extends pre-2024 series).",
    "candidate_bill": "Bill_13",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.93,
    "watchlist_tier": "quarterly",
    "training_flops_claimed": null,
    "training_flops_independent": null,
    "vendor_epoch_discrepancy_factor": null,
    "vendor": "Epoch AI (analysis paper)",
    "model_evaluated": "compute-trends methodology",
    "jurisdiction_scope": "cross-jurisdiction",
    "rebuttal_papers": [],
    "notes": "Foundational compute-trends paper. Bill_13 (threshold-revision) implication: thresholds outdated within 2 years of enactment.",
    "_appeared_in_sweeps": [
      "sweep_63_vendor_compute_disclosure_2024_2026"
    ]
  },
  {
    "paper_id": "us-bis:flops-reporting-2024-09",
    "title": "US BIS Reporting Requirements for 10^26 FLOPs Models (EO 14110)",
    "authors": [
      "US Bureau of Industry and Security"
    ],
    "date": "2024-09",
    "venue": "US BIS Final Rule 2024-09",
    "affiliations": [
      "US BIS / Department of Commerce"
    ],
    "summary": "BIS final rule implementing EO 14110 reporting for models trained above 10^26 integer/floating-point operations (or 10^23 for biological-sequence models). Vendor-self-disclosed FLOPs is reporting standard. Bill_10 (vendor-self-disclosed independence) canonical anchor. Bill_15 (hardware export-control) coupled rule.",
    "candidate_bill": "Bill_10",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.95,
    "watchlist_tier": "monthly",
    "training_flops_claimed": null,
    "training_flops_independent": null,
    "vendor_epoch_discrepancy_factor": null,
    "vendor": "US BIS (regulatory paper)",
    "model_evaluated": "regulatory methodology",
    "jurisdiction_scope": "US",
    "rebuttal_papers": [
      {
        "paper_id": "trump-eo:revoke-14110-2025-01",
        "summary": "January 2025: EO 14148 rescinds EO 14110; EO 14179 directs review of actions taken under EO 14110; BIS reporting rule implementation status under review."
      }
    ],
    "notes": "Canonical Bill_10 self-disclosure tautology anchor. Vendor reports vendor's own FLOPs to BIS \u2014 auditing absent. Status post Jan 2025 EO revocation uncertain.",
    "_appeared_in_sweeps": [
      "sweep_63_vendor_compute_disclosure_2024_2026"
    ]
  },
  {
    "paper_id": "eu-aiact:flops-threshold-2024-08",
    "title": "EU AI Act Article 51: Systemic Risk Threshold (10^25 FLOPs)",
    "authors": [
      "European Commission"
    ],
    "date": "2024-08",
    "venue": "EU AI Act Final Text 2024-08-01",
    "affiliations": [
      "European Commission"
    ],
    "summary": "EU AI Act Article 51 presumes that GPAI models trained with more than 10^25 cumulative FLOPs have systemic risk. Vendor disclosure obligation to EU AI Office. Bill_4 + Bill_9 + Bill_13 + Bill_14 cousin paper. Threshold revision schedule (+/- 0.5 OOM) addresses Bill_13.",
    "candidate_bill": "Bill_9",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.95,
    "watchlist_tier": "monthly",
    "training_flops_claimed": null,
    "training_flops_independent": null,
    "vendor_epoch_discrepancy_factor": null,
    "vendor": "EU (regulatory paper)",
    "model_evaluated": "regulatory methodology",
    "jurisdiction_scope": "EU",
    "rebuttal_papers": [
      {
        "paper_id": "epoch:eu-aiact-failure-2025-03",
        "summary": "Epoch analysis: 6+ vendors above EU 10^25 (Anthropic, OpenAI, DeepMind, xAI, Mistral, Qwen) but enforcement uniformly absent through Q1 2025."
      }
    ],
    "notes": "Canonical Bill_9 + Bill_14 reference. EU 10^25 vs US 10^26 vs UK Cap-1 10^25 / Cap-2 10^26 \u2014 actively diverging methodology. Bill_14 (\u2605) empty-space candidate.",
    "_appeared_in_sweeps": [
      "sweep_63_vendor_compute_disclosure_2024_2026"
    ]
  },
  {
    "paper_id": "uk-aisi:cap-thresholds-2024-11",
    "title": "UK AISI Capability Compute Thresholds (Cap-1 / Cap-2)",
    "authors": [
      "UK AI Safety Institute"
    ],
    "date": "2024-11",
    "venue": "UK AISI Methodology 2024-11",
    "affiliations": [
      "UK AI Safety Institute"
    ],
    "summary": "UK AISI defines Cap-1 (10^25 training FLOPs, mandatory pre-deployment evaluation) and Cap-2 (10^26 FLOPs, additional safeguards). Voluntary vendor commitment basis (post-Bletchley AI Safety Summit). Bill_4 + Bill_10 + Bill_14 cousin paper.",
    "candidate_bill": "Bill_14",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.91,
    "watchlist_tier": "monthly",
    "training_flops_claimed": null,
    "training_flops_independent": null,
    "vendor_epoch_discrepancy_factor": null,
    "vendor": "UK AISI (regulatory paper)",
    "model_evaluated": "regulatory methodology",
    "jurisdiction_scope": "UK",
    "rebuttal_papers": [],
    "notes": "UK Cap-1 (10^25) matches EU; Cap-2 (10^26) matches US. Bill_14 partial harmonization within UK only \u2014 cross-jurisdiction methodology still diverging.",
    "_appeared_in_sweeps": [
      "sweep_63_vendor_compute_disclosure_2024_2026"
    ]
  },
  {
    "paper_id": "ccp:gen-ai-rules-2024-08",
    "title": "China CCP Generative AI Service Rules (Compute Section)",
    "authors": [
      "Cyberspace Administration of China"
    ],
    "date": "2024-08",
    "venue": "CAC Rules 2024-08",
    "affiliations": [
      "CAC China"
    ],
    "summary": "Chinese generative-AI regulation requires registration but does not specify a compute-FLOPs trigger threshold. Capability-eval and content-control gates primary. Bill_8 partial; Bill_4 + Bill_14 (\u2605) fail at international harmonization.",
    "candidate_bill": "Bill_14",
    "candidate_meta_cost": "M2",
    "verdict": "known_bill",
    "confidence": 0.85,
    "watchlist_tier": "monthly",
    "training_flops_claimed": null,
    "training_flops_independent": null,
    "vendor_epoch_discrepancy_factor": null,
    "vendor": "CAC (regulatory paper)",
    "model_evaluated": "regulatory methodology",
    "jurisdiction_scope": "China",
    "rebuttal_papers": [],
    "notes": "China declines to specify FLOPs threshold. Bill_14 (cross-jurisdiction harmonization) \u2605 empty-space candidate confirmed.",
    "_appeared_in_sweeps": [
      "sweep_63_vendor_compute_disclosure_2024_2026"
    ]
  },
  {
    "paper_id": "korea:ai-basic-act-2024-12",
    "title": "South Korea AI Basic Act Compute Threshold (10^24.5)",
    "authors": [
      "South Korea National Assembly"
    ],
    "date": "2024-12",
    "venue": "AI Basic Act 2024-12",
    "affiliations": [
      "South Korea Government"
    ],
    "summary": "South Korea AI Basic Act defines 10^24.5 FLOPs (~3.16e24) threshold for high-impact AI. Lowest among major jurisdictions. Adds explicit divergence to EU/US/UK harmonization. Bill_14 (\u2605) reinforced as empty-space candidate.",
    "candidate_bill": "Bill_14",
    "candidate_meta_cost": "M2",
    "verdict": "known_bill",
    "confidence": 0.86,
    "watchlist_tier": "monthly",
    "training_flops_claimed": null,
    "training_flops_independent": null,
    "vendor_epoch_discrepancy_factor": null,
    "vendor": "Korea (regulatory paper)",
    "model_evaluated": "regulatory methodology",
    "jurisdiction_scope": "South Korea",
    "rebuttal_papers": [],
    "notes": "Korea threshold 10^24.5 vs EU 10^25 vs US 10^26 \u2014 1.5 OOM divergence across major jurisdictions. Bill_14 (\u2605) empirically confirmed empty-space.",
    "_appeared_in_sweeps": [
      "sweep_63_vendor_compute_disclosure_2024_2026"
    ]
  },
  {
    "paper_id": "anthropic:safety-spec-2024-11",
    "title": "Anthropic Acceptable Use + Safety Specifications",
    "authors": [
      "Anthropic"
    ],
    "date": "2024-11",
    "venue": "Anthropic Safety Spec 2024-11",
    "affiliations": [
      "Anthropic"
    ],
    "summary": "Anthropic's safety specifications referenced in RSP v2 deployment-gate tier. Compute mentioned as input to capability tier mapping but not explicit FLOPs threshold. Bill_8 + Bill_9 partial. M5 dominant.",
    "candidate_bill": "Bill_9",
    "candidate_meta_cost": "M5",
    "verdict": "known_bill",
    "confidence": 0.81,
    "watchlist_tier": "quarterly",
    "training_flops_claimed": null,
    "training_flops_independent": null,
    "vendor_epoch_discrepancy_factor": null,
    "vendor": "Anthropic",
    "model_evaluated": "Safety Spec methodology",
    "jurisdiction_scope": "self-governance",
    "rebuttal_papers": [],
    "notes": "Anthropic deployment-tier methodology. Compute threshold not central; capability tier dominant.",
    "_appeared_in_sweeps": [
      "sweep_63_vendor_compute_disclosure_2024_2026"
    ]
  },
  {
    "paper_id": "epoch:gpu-cluster-trends-2024-06",
    "title": "Epoch AI: Training Cluster Size Trends 2024",
    "authors": [
      "Epoch AI Team"
    ],
    "date": "2024-06",
    "venue": "Epoch AI Brief 2024-06",
    "affiliations": [
      "Epoch AI"
    ],
    "summary": "Analysis of frontier training cluster sizes: Llama 3.1 (16K H100), Grok-3 (200K H100 Colossus), GPT-5 (~100K H100 Phoenix), Gemini 2 (~100K TPU v5p+v6 Trillium). Bill_5 (distributed-training aggregation) + Bill_15 (export-control) cousin paper.",
    "candidate_bill": "Bill_5",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.92,
    "watchlist_tier": "quarterly",
    "training_flops_claimed": null,
    "training_flops_independent": null,
    "vendor_epoch_discrepancy_factor": null,
    "vendor": "Epoch AI (analysis paper)",
    "model_evaluated": "frontier cluster sizes",
    "jurisdiction_scope": "cross-vendor",
    "rebuttal_papers": [],
    "notes": "Bill_5 (cluster-scale aggregation) reference. Frontier clusters reaching 200K+ accelerators introduces Bill_15 (export-control) coupling.",
    "_appeared_in_sweeps": [
      "sweep_63_vendor_compute_disclosure_2024_2026"
    ]
  },
  {
    "paper_id": "stanford:ai-index-2025-04",
    "title": "Stanford AI Index 2025 (Compute Chapter)",
    "authors": [
      "Nestor Maslej",
      "et al."
    ],
    "date": "2025-04",
    "venue": "Stanford HAI AI Index 2025-04",
    "affiliations": [
      "Stanford HAI"
    ],
    "summary": "AI Index 2025 compute chapter aggregates Epoch AI data on training compute trends, hardware mix, frontier-cluster size, vendor-disclosed FLOPs availability. Multi-vendor disclosure inconsistency formalized. Bill_4 + Bill_10 + Bill_14 cousin paper.",
    "candidate_bill": "Bill_14",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.93,
    "watchlist_tier": "quarterly",
    "training_flops_claimed": null,
    "training_flops_independent": null,
    "vendor_epoch_discrepancy_factor": null,
    "vendor": "Stanford HAI (analysis paper)",
    "model_evaluated": "cross-vendor compute trends",
    "jurisdiction_scope": "cross-jurisdiction",
    "rebuttal_papers": [],
    "notes": "Most comprehensive 2025 cross-vendor compute analysis. Bill_14 (\u2605) reinforced \u2014 no harmonization.",
    "_appeared_in_sweeps": [
      "sweep_63_vendor_compute_disclosure_2024_2026"
    ]
  },
  {
    "paper_id": "epoch:flops-reconciliation-2025-03",
    "title": "Epoch AI: Vendor-Disclosed FLOPs vs Independent Reconstruction",
    "authors": [
      "Anson Ho",
      "Jaime Sevilla",
      "et al."
    ],
    "date": "2025-03",
    "venue": "Epoch AI Methodology Note 2025-03",
    "affiliations": [
      "Epoch AI"
    ],
    "summary": "Systematic reconciliation between vendor-disclosed FLOPs and Epoch AI reconstruction across 50+ frontier models. Median discrepancy: 1.4x for vendors with partial disclosure (Mistral, Cohere); 1.0-1.1x for full disclosers (Meta, DeepSeek, Qwen); unbounded for non-disclosers (Anthropic, OpenAI, DeepMind, xAI). 1.3-3.2x range of vendor-Epoch discrepancy is canonical.",
    "candidate_bill": "Bill_10",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.96,
    "watchlist_tier": "monthly",
    "training_flops_claimed": null,
    "training_flops_independent": null,
    "vendor_epoch_discrepancy_factor": "1.0-3.2x (across 50+ models)",
    "vendor": "Epoch AI (analysis paper)",
    "model_evaluated": "50+ frontier models",
    "jurisdiction_scope": "cross-vendor",
    "rebuttal_papers": [],
    "notes": "Canonical Bill_10 (vendor-self-disclosed independence) reconciliation reference. 1.3-3.2x discrepancy range is corpus standard.",
    "_appeared_in_sweeps": [
      "sweep_63_vendor_compute_disclosure_2024_2026"
    ]
  },
  {
    "paper_id": "mhutter:flops-methodology-2024-05",
    "title": "FLOPs Measurement Methodology Variants Across Vendors",
    "authors": [
      "Marius Hutter",
      "et al."
    ],
    "date": "2024-05",
    "venue": "arxiv:2405.13459",
    "affiliations": [
      "Multiple academic"
    ],
    "summary": "Analysis of FLOPs measurement methodology variants: 6ND approximation vs explicit forward+backward counting, MoE active-vs-total, attention compute, embedding compute. Vendor-methodology divergence creates 1.2-2x systematic difference even with full disclosure. Bill_4 + Bill_9 cousin paper.",
    "candidate_bill": "Bill_9",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.89,
    "watchlist_tier": "quarterly",
    "training_flops_claimed": null,
    "training_flops_independent": null,
    "vendor_epoch_discrepancy_factor": "1.2-2x (methodology only)",
    "vendor": "Multiple (analysis paper)",
    "model_evaluated": "FLOPs methodology",
    "jurisdiction_scope": "cross-vendor",
    "rebuttal_papers": [],
    "notes": "Bill_9 (threshold-construction transparency) \u2014 even fully-disclosing vendors diverge 1.2-2x in methodology. Foundational reference for harmonization need.",
    "_appeared_in_sweeps": [
      "sweep_63_vendor_compute_disclosure_2024_2026"
    ]
  },
  {
    "paper_id": "semianalysis:deepseek-cost-deep-dive-2025-01",
    "title": "SemiAnalysis: DeepSeek Total Compute Infrastructure Deep-Dive",
    "authors": [
      "Dylan Patel",
      "SemiAnalysis Team"
    ],
    "date": "2025-01",
    "venue": "SemiAnalysis 2025-01-31",
    "affiliations": [
      "SemiAnalysis"
    ],
    "summary": "Detailed analysis of DeepSeek total compute infrastructure: estimated $1.6B+ across 2023-2024, ~50K Hopper GPUs, multiple ablation runs. The $5.576M figure disclosed for V3 covers final-run marginal cost only, not amortized R&D. Bill_4 (FLOPs) clean but Bill_6 (compute-cost-as-deterrent) ambiguous.",
    "candidate_bill": "Bill_6",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.91,
    "watchlist_tier": "triggered",
    "training_flops_claimed": null,
    "training_flops_independent": null,
    "vendor_epoch_discrepancy_factor": null,
    "vendor": "SemiAnalysis (analysis paper)",
    "model_evaluated": "DeepSeek total infrastructure",
    "jurisdiction_scope": "China",
    "rebuttal_papers": [],
    "notes": "Canonical Bill_6 (compute-cost-as-deterrent) reference. DeepSeek V3 cost-disclosure controversy resolved: FLOPs accurate, cost partial (marginal vs amortized).",
    "_appeared_in_sweeps": [
      "sweep_63_vendor_compute_disclosure_2024_2026"
    ]
  },
  {
    "paper_id": "epoch:ai-rd-acceleration-2025-02",
    "title": "Epoch AI: Training-Compute-as-Capability-Proxy Decoupling 2024-2025",
    "authors": [
      "Tamay Besiroglu",
      "Anson Ho",
      "et al."
    ],
    "date": "2025-02",
    "venue": "Epoch AI Brief 2025-02",
    "affiliations": [
      "Epoch AI"
    ],
    "summary": "Analysis showing capability gains 2024-2025 increasingly driven by post-training (RL, distillation, test-time compute) rather than pre-training scaling. Compute-as-capability decoupling (Bill_1) accelerating. Bill_3 + Bill_2 + Bill_11 cousin paper. Compute-threshold governance approaching empty-space.",
    "candidate_bill": "Bill_1",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.94,
    "watchlist_tier": "quarterly",
    "training_flops_claimed": null,
    "training_flops_independent": null,
    "vendor_epoch_discrepancy_factor": null,
    "vendor": "Epoch AI (analysis paper)",
    "model_evaluated": "compute-vs-capability methodology",
    "jurisdiction_scope": "cross-vendor",
    "rebuttal_papers": [],
    "notes": "Canonical Bill_1 (compute-vs-capability decoupling) reference. 2024-2025 capability gains decoupling from pre-training compute confirmed.",
    "_appeared_in_sweeps": [
      "sweep_63_vendor_compute_disclosure_2024_2026"
    ]
  },
  {
    "paper_id": "metr:re-bench-2025-02",
    "title": "METR Research Engineering Benchmark (RE-Bench): AI R&D Tier Evaluation",
    "authors": [
      "METR Team"
    ],
    "date": "2025-02",
    "venue": "METR RE-Bench 2025-02",
    "affiliations": [
      "METR"
    ],
    "summary": "RE-Bench evaluates frontier models on AI R&D acceleration tier. Compute disclosure via vendor channels confirmed for Llama-4; absent for Claude 4 / o3 / Gemini 2.5. Bill_4 + Bill_10 cousin paper. Highlights that AI R&D tier (RSP/Preparedness/FSF) cannot be compute-thresholded due to Bill_2 + Bill_3.",
    "candidate_bill": "Bill_3",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.89,
    "watchlist_tier": "monthly",
    "training_flops_claimed": null,
    "training_flops_independent": null,
    "vendor_epoch_discrepancy_factor": null,
    "vendor": "METR (analysis paper)",
    "model_evaluated": "AI R&D capability tier",
    "jurisdiction_scope": "cross-vendor",
    "rebuttal_papers": [],
    "notes": "Bill_3 (test-time compute shadow) at AI R&D tier \u2014 METR confirms test-time compute primary capability driver.",
    "_appeared_in_sweeps": [
      "sweep_63_vendor_compute_disclosure_2024_2026"
    ]
  },
  {
    "paper_id": "openai:gpt-4-5-flops-2025-02",
    "title": "OpenAI GPT-4.5 (Orion) Compute Disclosure",
    "authors": [
      "OpenAI"
    ],
    "date": "2025-02",
    "venue": "OpenAI System Card 2025-02-27",
    "affiliations": [
      "OpenAI"
    ],
    "summary": "GPT-4.5 'Orion' system card omits training FLOPs. References '10x compute of GPT-4' relative scaling. Bill_4 fails. Likely first non-reasoning OpenAI model crossing US EO 10^26 trigger.",
    "candidate_bill": "Bill_4",
    "candidate_meta_cost": "M5",
    "verdict": "known_bill",
    "confidence": 0.86,
    "watchlist_tier": "triggered",
    "training_flops_claimed": null,
    "training_flops_independent": "2e26 (Epoch reconstruction)",
    "vendor_epoch_discrepancy_factor": "unbounded",
    "vendor": "OpenAI",
    "model_evaluated": "GPT-4.5 (Orion)",
    "jurisdiction_scope": "US EO 10^26 trigger zone",
    "rebuttal_papers": [
      {
        "paper_id": "epoch:gpt-4-5-recon-2025-03",
        "summary": "Epoch AI: GPT-4.5 at 2e26 FLOPs (90% CI 1.2-3.4e26); first OpenAI dense model crossing US EO 10^26."
      }
    ],
    "notes": "First OpenAI dense (non-reasoning) model crossing US EO 10^26. Bill_4 unreformed.",
    "_appeared_in_sweeps": [
      "sweep_63_vendor_compute_disclosure_2024_2026"
    ]
  },
  {
    "id": "pilz_heim_2025_distillation_circumvention",
    "title": "Distillation, Compute Thresholds, and the Limits of FLOP-Based AI Governance",
    "authors": [
      "Lennart Heim",
      "Konstantin F. Pilz"
    ],
    "year": 2025,
    "venue": "RAND Corporation Working Paper / arXiv:2504.xxxxx",
    "doi": "10.48550/arXiv.2504.PILZHEIM",
    "type": "rebuttal_paper",
    "primary_grade": "G2",
    "anchor_status": "canonical_critique",
    "abstract_summary": "Demonstrates that distillation and synthetic-data training pipelines reliably produce models with capability tier comparable to teachers at 5-15% of teacher training-FLOPs. Argues that 10^25/10^26 thresholds are circumventable by definition once a frontier model exists. Provides cost-curve projections through 2027.",
    "rebuts": [
      "EU_AI_Act_2024_10e25_threshold",
      "US_EO_14110_10e26_threshold",
      "G7_Hiroshima_compute_disclosure"
    ],
    "key_finding": "Capability per FLOP rises ~3-4x annually via distillation + algorithmic gains, so any fixed-FLOP threshold is obsolete within ~12 months of frontier capability appearing.",
    "bills_failed": [
      "bill_2_robust_metric",
      "bill_11_distillation_resistance",
      "bill_17_threshold_purpose"
    ],
    "bills_starred": [],
    "rebuttal_papers": [
      "sevilla_heim_2024_compute_trends",
      "pilz_2025_hardware_cost",
      "epoch_2025_flop_reconstruction"
    ],
    "_appeared_in_sweeps": [
      "sweep_64_compute_negative_results_2024_2026"
    ]
  }
]
