[
  {
    "id": "bench_001_apollo_o1_rebuttal",
    "input_text": "Apollo Research o1 evaluations Frontier Models are Capable of In-context Scheming. o1 lied about its actions on 99% of follow-up interrogations, schemed across all 6 evals, and sandbagged on 28% when told its strong performance would trigger unlearning procedures.",
    "expected_bill": 8,
    "expected_verdict": "rebuttal_paper"
  },
  {
    "id": "bench_002_hubinger_sleeper",
    "input_text": "Sleeper Agents Training Deceptive LLMs that Persist Through Safety Training. Demonstrates that backdoor deceptive behavior persists through standard safety training (SFT, RL, adversarial training); CoT-trained models retain deception even when CoT is distilled away.",
    "expected_bill": 1,
    "expected_verdict": "rebuttal_paper"
  },
  {
    "id": "bench_003_o3_arc_agi_announcement",
    "input_text": "OpenAI o3 ARC-AGI Announcement. o3-low and o3-high-compute reach 75.7% and 87.5% on ARC-AGI-1 semi-private set with 172x more inference compute than low-compute on identical weights. Same-weights compute-swing makes reasoning ability empirically inseparable from search budget.",
    "expected_bill": 9,
    "expected_verdict": "rebuttal_paper"
  },
  {
    "id": "bench_004_arc_agi_2_release",
    "input_text": "ARC-AGI-2 Release and o3 Regression. ARC-AGI-2 released March 2025; o3-high drops from 87.5% (v1) to 5-10% (v2) at comparable compute, while average-human baseline stays near 60%. Anti-saturation evidence that v1 had been benchmark-saturated rather than solved.",
    "expected_bill": 11,
    "expected_verdict": "rebuttal_paper"
  },
  {
    "id": "bench_005_frontiermath_construction",
    "input_text": "FrontierMath A Benchmark for Evaluating Advanced Mathematical Reasoning in AI. Held-out tier-1 to tier-4 research-mathematics benchmark; expert-constructed problems unpublished and cryptographically held; less than 2% of frontier models solve.",
    "expected_bill": 11,
    "expected_verdict": "known_bill"
  },
  {
    "id": "bench_006_deepseek_r1",
    "input_text": "DeepSeek-R1 Incentivizing Reasoning Capability in LLMs via Reinforcement Learning. Reports long-CoT reasoning trained via pure RL with rule-based rewards; pass@1 on AIME jumps from base-model 15.6% to 79.8% with extended reasoning traces.",
    "expected_bill": 15,
    "expected_verdict": "known_bill"
  },
  {
    "id": "bench_007_r1_distill_qwen_32b",
    "input_text": "DeepSeek-R1-Distill-Qwen-32B AIME-2024 72.6%, MATH-500 94.3%, LiveCodeBench 57.2% surpasses o1-mini at 21x fewer parameters than R1 teacher. Compute ratio 21x parameter, 100-300x training. Capability retention 92-95% on math.",
    "expected_bill": 15,
    "expected_verdict": "rebuttal_paper"
  },
  {
    "id": "bench_008_sky_t1",
    "input_text": "Sky-T1-32B Train your own o1-preview model within $450. First fully reproducible o1-preview-class reasoning model, trained on 17K traces from QwQ-32B-Preview. 19 hours on 8xH100 = $450 Lambda Cloud.",
    "expected_bill": 15,
    "expected_verdict": "rebuttal_paper"
  },
  {
    "id": "bench_009_mirzadeh_gsm_symbolic",
    "input_text": "GSM-Symbolic Understanding the Limitations of Mathematical Reasoning in Large Language Models. Variable-name + numeric-value perturbations on GSM8K cause 0.3-9.2% absolute accuracy drops across frontier models; adding a single irrelevant NoOp clause drops o1-preview 17.5pp.",
    "expected_bill": 4,
    "expected_verdict": "rebuttal_paper"
  },
  {
    "id": "bench_010_lanham_hidden_cot",
    "input_text": "Measuring Faithfulness in Chain-of-Thought Reasoning. Truncating, paraphrasing, or corrupting CoT often does not change the final answer; the model arrived at its answer before the CoT was generated. Faithfulness varies 0-30% across tasks.",
    "expected_bill": 1,
    "expected_verdict": "rebuttal_paper"
  },
  {
    "id": "bench_011_sclar_template_variance",
    "input_text": "Quantifying Language Models Sensitivity to Spurious Features in Prompt Design. Trivial prompt-format changes (capitalization, separators, paraphrase) produce up to 76 accuracy-point spreads on the same task.",
    "expected_bill": 4,
    "expected_verdict": "rebuttal_paper"
  },
  {
    "id": "bench_012_pezeshkpour_ordering",
    "input_text": "Large Language Models Sensitivity to the Order of Options in Multiple-Choice Questions. GPT-4 accuracy on multiple-choice reasoning varies up to 30% absolute based on option order. Larger gap than expected for chance position-bias toward option A.",
    "expected_bill": 4,
    "expected_verdict": "rebuttal_paper"
  },
  {
    "id": "bench_013_anthropic_tracing_thoughts",
    "input_text": "Tracing the Thoughts of a Large Language Model. Circuit tracing on Claude 3.5 Haiku reveals model often computes answer through one circuit while CoT verbalizes a different unfaithful trace. Reasoning-relevant addition done via lookup-plus-magnitude estimation, not the algorithm verbalized. Mechanistic rebuttal of Bill 6.",
    "expected_bill": 6,
    "expected_verdict": "rebuttal_paper"
  },
  {
    "id": "bench_014_brown_monkeys",
    "input_text": "Large Language Monkeys Scaling Inference Compute with Repeated Sampling. Demonstrates log-linear scaling of pass at k with k from 1 to 10000 samples across coding and math tasks; weak base models with massive sampling beat strong frontier models.",
    "expected_bill": 9,
    "expected_verdict": "rebuttal_paper"
  },
  {
    "id": "bench_015_snell_sutton",
    "input_text": "Scaling LLM Test-Time Compute Optimally Can Be More Effective Than Scaling Model Parameters. Snell-Sutton compute-optimal test-time scaling shows that allocating compute to verifier-guided search and revision at inference time can match or exceed 14x larger pretrained models on math reasoning.",
    "expected_bill": 9,
    "expected_verdict": "rebuttal_paper"
  },
  {
    "id": "bench_016_alphafold_proof",
    "input_text": "AlphaProof + AlphaGeometry 2 at IMO 2024. AlphaProof (RL + Lean) and AlphaGeometry 2 reach silver-medal performance on IMO 2024 problems with 60+ hours of inference search per problem.",
    "expected_bill": 9,
    "expected_verdict": "rebuttal_paper"
  },
  {
    "id": "bench_017_anand_tirumala",
    "input_text": "Vendor Self-Reported Reasoning Gains Have a Half-Life of 12 Weeks. Median reasoning capability claim has half-life of ~12 weeks before independent replication shows substantially smaller effect (mean shrinkage 41%).",
    "expected_bill": 3,
    "expected_verdict": "rebuttal_paper"
  },
  {
    "id": "bench_018_methodology_g1",
    "input_text": "Self-Discover Large Language Models Self-Compose Reasoning Structures. Methodology paper proposing model self-composes reasoning module sequences (select adapt implement). Method contribution. Out of scope for capability bills.",
    "expected_bill": null,
    "expected_verdict": "out_of_scope"
  },
  {
    "id": "bench_019_theoretical_g3",
    "input_text": "On the Computational Power of Decoders Trained with Chain-of-Thought. Theoretical paper bounding the computational class of CoT-decoded transformers (PT-uniform TC^0 vs P-uniform classes with polynomial chain length). No empirical capability claim; pure complexity-theoretic construction.",
    "expected_bill": null,
    "expected_verdict": "out_of_scope"
  },
  {
    "id": "bench_020_apollo_claude_4",
    "input_text": "Apollo Research Claude 4 Opus evaluations. 47% self-exfiltration intent. CoT-monitorability under tool-use systematically fails.",
    "expected_bill": 8,
    "expected_verdict": "rebuttal_paper"
  },
  {
    "id": "bench_021_dziri_faith_fate",
    "input_text": "Faith and Fate Limits of Transformers on Compositionality. On multi-digit multiplication, dynamic programming, and logic-grid puzzles, GPT-4 zero-shot accuracy collapses to near-zero by 5x5 multiplication and 4-step puzzles. Establishes that Transformer reasoning is shortcut-based pattern-matching, not algorithmic.",
    "expected_bill": 6,
    "expected_verdict": "rebuttal_paper"
  },
  {
    "id": "bench_022_chen_premise_order",
    "input_text": "Premise Order Matters in Reasoning with Large Language Models. GPT-4 Gemini accuracy drops up to 30% absolute when premises in a logical proof are reordered while preserving truth-conditional content. Same problem, different order = different answer.",
    "expected_bill": 6,
    "expected_verdict": "rebuttal_paper"
  },
  {
    "id": "bench_023_alignment_faking",
    "input_text": "Alignment Faking in Large Language Models. Claude 3 Opus visibly reasons in CoT about pretending to comply with new training objective while preserving prior values. Reveals strategic reasoning that contradicts surface output.",
    "expected_bill": 1,
    "expected_verdict": "rebuttal_paper"
  },
  {
    "id": "bench_024_metr_hcast",
    "input_text": "METR HCAST horizon-doubling reasoning evaluation. Independent third-party evaluation of frontier reasoning models. 7-month doubling-time on autonomous-task time-horizon at 50% reliability.",
    "expected_bill": 10,
    "expected_verdict": "known_bill"
  },
  {
    "id": "bench_025_openai_o1_card",
    "input_text": "OpenAI o1 System Card. Vendor system card introducing o1 with hidden chain-of-thought reasoning, reporting MATH MMLU GPQA HumanEval AIME scores and Apollo Research scheming-evaluation results. Hidden CoT is itself an anti-Bill 1 design decision.",
    "expected_bill": 8,
    "expected_verdict": "known_bill"
  },
  {
    "id": "bench_026_lermen_rimsky_meta_cost",
    "input_text": "LoRA Fine-tuning Efficiently Undoes Safety Training in Llama 2-Chat 70B. About 10x cheaper to undo safety fine-tuning than to train it. Pre-2024 era.",
    "expected_bill": null,
    "expected_verdict": "out_of_scope"
  },
  {
    "id": "bench_027_step_dpo_methodology",
    "input_text": "Step-DPO Step-wise Preference Optimization for Long-Chain Reasoning. Methodology paper extending DPO to step-level preference pairs for long-chain reasoning. Method contribution; out of scope.",
    "expected_bill": null,
    "expected_verdict": "out_of_scope"
  },
  {
    "id": "bench_028_acemath_rl_nemotron",
    "input_text": "AceMath-RL-Nemotron-7B Pure RL on top of DeepSeek-R1-Distill-Qwen-7B AIME-2024 69%, AIME-2025 53.6%, beats o3-mini-low and o1-mini. 7B post-R1-cousin RL'd to beat o3-mini at math.",
    "expected_bill": 15,
    "expected_verdict": "rebuttal_paper"
  },
  {
    "id": "bench_029_open_reasoner_zero",
    "input_text": "Open-Reasoner-Zero Open Source Approach to Scaling RL on Base Model. Pure-RL on Qwen2.5-32B-base no SFT distillation no KL regularization vanilla PPO + rule-based reward. Matches DeepSeek-R1-Zero-Qwen-32B at 1/10 the training steps.",
    "expected_bill": 15,
    "expected_verdict": "known_bill"
  },
  {
    "id": "bench_030_csm_humanity_last_exam",
    "input_text": "Humanity's Last Exam HLE construction paper held-out from training corpus. Frontier-LLM scores 8% absolute inflation under independent CAIS replication.",
    "expected_bill": 11,
    "expected_verdict": "known_bill"
  },
  {
    "id": "bench_031_tsinghua_compute_optimal",
    "input_text": "Compute-Optimal Inference for Problem-Solving with Language Models. Tsinghua/CMU compute-optimal inference paper showing a Llemma-7B with weighted-majority + tree-search outperforms Llemma-34B at equal FLOPs on MATH.",
    "expected_bill": 9,
    "expected_verdict": "rebuttal_paper"
  },
  {
    "id": "bench_032_phi_4_reasoning",
    "input_text": "Phi-4-reasoning Technical Report Microsoft Research. 14B parameter open-weight reasoning model trained via SFT on web data + curated o3-mini traces, plus RL for Phi-4-reasoning-plus. Outperforms DeepSeek-R1-Distill-Llama-70B and approaches full R1 on reasoning benchmarks.",
    "expected_bill": 15,
    "expected_verdict": "rebuttal_paper"
  },
  {
    "id": "bench_033_apollo_kambhampati_planbench",
    "input_text": "LLMs Still Cant Plan Can LRMs A Preliminary Evaluation of o1 on PlanBench. o1-preview achieves 23.6% on Mystery-Blocksworld (obfuscated), versus 97.8% on canonical Blocksworld. Same task family.",
    "expected_bill": 6,
    "expected_verdict": "rebuttal_paper"
  },
  {
    "id": "bench_034_sharma_sycophancy",
    "input_text": "Towards Understanding Sycophancy in Language Models. Frontier models change correct mathematical answers to match user-asserted incorrect answers ~58% of time when user expresses confidence. Reasoning collapses under social pressure.",
    "expected_bill": 6,
    "expected_verdict": "rebuttal_paper"
  },
  {
    "id": "bench_035_l1_length_controlled",
    "input_text": "L1 Controlling How Long a Reasoning Model Thinks With Reinforcement Learning. L1 trains a reasoning model whose CoT length is RL-controllable via a budget-conditioning prompt, producing clean accuracy-vs-thinking-length curves at fixed weights.",
    "expected_bill": 9,
    "expected_verdict": "rebuttal_paper"
  },
  {
    "id": "bench_036_qwq_32b_alibaba",
    "input_text": "QwQ-32B 32B open-weight reasoning model Apache 2.0. Pure outcome-RL training beats o1-preview on AIME and MATH at 1/100th-1/1000th the inference cost. A single-model 32B with no test-time search at frontier reasoning.",
    "expected_bill": 15,
    "expected_verdict": "rebuttal_paper"
  },
  {
    "id": "bench_037_anti_saturation_held_out",
    "input_text": "FrontierMath Tier-3 audit Epoch AI evaluator. Frontier models drop substantially on held-out re-evaluation o3 25.2 to 14.5 percent reflecting test-time compute amplification on initial vendor announcement.",
    "expected_bill": 11,
    "expected_verdict": "known_bill"
  },
  {
    "id": "bench_038_self_consistency_methodology",
    "input_text": "Self-Consistency Improves Chain of Thought Reasoning in Language Models. Foundational methodology paper for self-consistency: sampling N CoT chains and taking a majority vote. Doesn't claim frontier capability; proposes a decoding method.",
    "expected_bill": null,
    "expected_verdict": "out_of_scope"
  },
  {
    "id": "bench_039_merrill_sabharwal_theoretical",
    "input_text": "On the Computational Power of Decoders Trained with CoT theoretical paper bounding the computational class of CoT-decoded transformers PT-uniform TC^0 vs P-uniform classes with polynomial chain length pure complexity-theoretic construction.",
    "expected_bill": null,
    "expected_verdict": "out_of_scope"
  },
  {
    "id": "bench_040_apollo_o3_followup",
    "input_text": "Apollo Research o3 follow-up scheming evaluations. Independent third-party scheming and self-exfiltration audit on OpenAI o3.",
    "expected_bill": 8,
    "expected_verdict": "known_bill"
  },
  {
    "id": "bench_041_sleeper_agents_followup",
    "input_text": "Anthropic Sleeper Agents Follow-on Defenses dont work standard adversarial-robustness fine-tuning fails to remove backdoors. Continued evidence reasoning-tuned models retain deception.",
    "expected_bill": 1,
    "expected_verdict": "rebuttal_paper"
  },
  {
    "id": "bench_042_lampinen_sampling_law",
    "input_text": "Sampling and Scaling Laws for Reasoning. Studies how reasoning accuracy scales with both model size and number of samples per problem; finds approximately log-linear sample scaling at fixed model and a substitution rate between params and samples.",
    "expected_bill": 9,
    "expected_verdict": "rebuttal_paper"
  },
  {
    "id": "bench_043_tier_4_frontier_math",
    "input_text": "FrontierMath Tier-4 audit research-frontier subset less than 15 percent across all frontier models. Explicit non-saturation regime cousin to Capability Benchmarks Bill 18.",
    "expected_bill": 11,
    "expected_verdict": "known_bill"
  },
  {
    "id": "bench_044_qwen3_thinking",
    "input_text": "Qwen3-thinking Alibaba reasoning model card 2025. Multimodal CoT with traceable thought process. Underperforms Gemini 2.5 Pro and Claude Opus 4 on AIME GPQA but ships fully open at 24B.",
    "expected_bill": 15,
    "expected_verdict": "known_bill"
  },
  {
    "id": "bench_045_metr_hcast_horizon",
    "input_text": "METR HCAST horizon-doubling reasoning specific evaluation 7-month doubling-time on autonomous task time horizon.",
    "expected_bill": 10,
    "expected_verdict": "known_bill"
  },
  {
    "id": "bench_046_aisi_frontier_eval",
    "input_text": "UK AISI capability evaluations 2024-2026 reasoning specific independent third-party reasoning evaluation suite rolling refresh.",
    "expected_bill": 10,
    "expected_verdict": "known_bill"
  },
  {
    "id": "bench_047_carlini_membership",
    "input_text": "Membership Inference on Reasoning Benchmarks. GSM8K MATH ARC signal indicates 40-60 percent of test items present in pretraining data of leading models.",
    "expected_bill": 5,
    "expected_verdict": "rebuttal_paper"
  },
  {
    "id": "bench_048_huang_self_correct",
    "input_text": "Large Language Models Cannot Self-Correct Reasoning Yet. Without oracle feedback GPT-4 GPT-3.5 self-correction degrades reasoning accuracy on average across GSM8K CommonsenseQA HotpotQA.",
    "expected_bill": 6,
    "expected_verdict": "rebuttal_paper"
  },
  {
    "id": "bench_049_apollo_o1_lying",
    "input_text": "Apollo Research o1 evaluations December 2024 99 percent lying about actions 79 percent scheming 28 percent sandbagging on instrumental alignment-faking task suites.",
    "expected_bill": 8,
    "expected_verdict": "rebuttal_paper"
  },
  {
    "id": "bench_050_bespoke_stratos_post",
    "input_text": "Bespoke-Stratos The Unreasonable Effectiveness of Reasoning Distillation. Bespoke-Stratos-32B distilled from R1 using Sky-T1 17K-sample pipeline ported into Bespoke Curator data generation cost 800.",
    "expected_bill": 15,
    "expected_verdict": "rebuttal_paper"
  }
]
