[
  {
    "paper_id": "01ai_2024_yi_lightning",
    "title": "Yi-Lightning Technical Report",
    "authors": [
      "01.AI"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": "arXiv:2412.01253",
    "url": "https://arxiv.org/abs/2412.01253",
    "summary": "01.AI's Yi-Lightning model. Cost-efficiency claim: training compute disclosed at $3M USD. Reports LMSys-style scaling but with limited training-token disclosure. Independent Epoch AI estimate places Yi-Lightning at ~1e25 FLOPs (1/3 of Llama 3.1 405B claim). Stanford HELM 2024-Q4 audit corroborates LMArena rank within 2 positions of vendor claim.",
    "candidate_bill": "Bill_3",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "01.AI's Yi-Lightning model. Cost-efficiency claim: training compute disclosed at $3M USD. Reports LMSys-style scaling but with limited training-token disclosure. Independent Epoch AI estimate places Yi-Lightning at ~1e25 FLOPs (1/3 of Llama 3.1 405B claim). Stanford HELM 2024-Q4 audit corroborates LMArena rank within 2 positions of vendor claim.",
    "_appeared_in_sweeps": [
      "sweep_207_vendor_audits"
    ]
  },
  {
    "paper_id": "abdin_2024_phimoe",
    "title": "Phi-MoE: Mixture of Tiny Experts at Small Scale",
    "authors": [
      "Marah Abdin et al. (Microsoft)"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": "Microsoft Tech Report 2024-12",
    "url": "https://www.microsoft.com/en-us/research/publication/phi-3/",
    "summary": "Replicates fine-grained MoE pattern at small scale. Industry evidence MoE law holds across scales.",
    "candidate_bill": "Bill_3",
    "candidate_meta_cost": null,
    "verdict": "supporting_paper",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "Replicates fine-grained MoE pattern at small scale. Industry evidence MoE law holds across scales.",
    "architecture_class": "MoE",
    "_appeared_in_sweeps": [
      "sweep_206_cross_architecture"
    ]
  },
  {
    "paper_id": "ai21_2024_jamba",
    "title": "Jamba: A Hybrid Transformer-Mamba Language Model",
    "authors": [
      "AI21 Labs"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": "arxiv:2403.19887",
    "url": "https://arxiv.org/abs/2403.19887",
    "summary": "52B total / 12B active. First production hybrid SSM-Transformer-MoE. Jamba-1.5 (Mini 12B/52B, Large 94B/398B) follow-up extends scaling. Active-param accounting is critical for Hoffmann ratio.",
    "candidate_bill": "Bill_11",
    "candidate_meta_cost": null,
    "verdict": "supporting_paper",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "52B total / 12B active. First production hybrid SSM-Transformer-MoE. Jamba-1.5 (Mini 12B/52B, Large 94B/398B) follow-up extends scaling. Active-param accounting is critical for Hoffmann ratio.",
    "architecture_class": "Hybrid_SSM_Transformer_MoE",
    "_appeared_in_sweeps": [
      "sweep_206_cross_architecture"
    ]
  },
  {
    "paper_id": "ai21_2024_jamba_15",
    "title": "Jamba-1.5: Hybrid Transformer-Mamba Models at Scale",
    "authors": [
      "AI21 Labs"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": "arxiv:2408.12570",
    "url": "https://arxiv.org/abs/2408.12570",
    "summary": "Active-parameter scaling for hybrid MoE+SSM; long context up to 256K. Empirically supports Bill_11 with active param substituted for dense param.",
    "candidate_bill": "Bill_11",
    "candidate_meta_cost": null,
    "verdict": "supporting_paper",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "Active-parameter scaling for hybrid MoE+SSM; long context up to 256K. Empirically supports Bill_11 with active param substituted for dense param.",
    "architecture_class": "Hybrid_SSM_Transformer_MoE",
    "_appeared_in_sweeps": [
      "sweep_206_cross_architecture"
    ]
  },
  {
    "paper_id": "ai_index_2025_stanford",
    "title": "AI Index Report 2025",
    "authors": [
      "Stanford HAI",
      "Maslej",
      "Bommasani",
      "et al."
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2025",
    "venue": "Stanford HAI annual report",
    "url": "https://hai.stanford.edu/ai-index/2025-ai-index-report",
    "summary": "Stanford HAI annual cross-vendor scaling and benchmark audit. 2025 edition documents: training compute up 4-5x YoY, training cost trends, benchmark saturation rates. Provides cross-industry baseline against vendor-specific claims. Bill_3 STAR \u2014 the canonical year-on-year scaling-claims audit document.",
    "candidate_bill": "Bill_3",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "Stanford HAI annual cross-vendor scaling and benchmark audit. 2025 edition documents: training compute up 4-5x YoY, training cost trends, benchmark saturation rates. Provides cross-industry baseline against vendor-specific claims. Bill_3 STAR \u2014 the canonical year-on-year scaling-claims audit document.",
    "_appeared_in_sweeps": [
      "sweep_207_vendor_audits"
    ]
  },
  {
    "paper_id": "albalak_2024_survey_data_selection",
    "title": "A Survey on Data Selection for Language Models",
    "authors": [
      "Albalak",
      "Elazar",
      "Xie",
      "Longpre",
      "Lambert",
      "Wang",
      "Muennighoff",
      "Lake",
      "et al."
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": "TMLR 2024",
    "url": null,
    "summary": "Indispensable taxonomy. Clusters mixture-conditioning approaches by axis (deduplication, quality filtering, domain mixing, learnable curricula, instance-level). Used as the primary navigational map for the 13-bill ledger's mixture sub-tree.",
    "candidate_bill": "Bill_1",
    "candidate_meta_cost": null,
    "verdict": "needs_gate",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "Indispensable taxonomy. Clusters mixture-conditioning approaches by axis (deduplication, quality filtering, domain mixing, learnable curricula, instance-level). Used as the primary navigational map for the 13-bill ledger's mixture sub-tree.",
    "_appeared_in_sweeps": [
      "sweep_202_data_mixture"
    ]
  },
  {
    "paper_id": "albertgu_2025_moe_dense",
    "title": "Albert Gu MoE-vs-Dense Comparisons 2025",
    "authors": [
      "Albert Gu et al."
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2025",
    "venue": "Goomba Lab note 2025",
    "url": "https://goombalab.github.io/",
    "summary": "Albert Gu's recent comparisons explicitly call out exponent divergence at frontier scale. Direct support for treating MoE as a separate scaling regime.",
    "candidate_bill": "Bill_11",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "Albert Gu's recent comparisons explicitly call out exponent divergence at frontier scale. Direct support for treating MoE as a separate scaling regime.",
    "architecture_class": "MoE_vs_dense",
    "_appeared_in_sweeps": [
      "sweep_206_cross_architecture"
    ]
  },
  {
    "paper_id": "ali_2024_tokenizer_scaling",
    "title": "Tokenizer Choice For LLM Training: Negligible or Crucial?",
    "authors": [
      "Ali",
      "Fromm",
      "Thellmann",
      "Rau",
      "L\u00fcbbering",
      "Stein",
      "Patwary",
      "Kesselheim"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": "NAACL 2024",
    "url": null,
    "summary": "Tokenizer-vs-mixture interaction paper. Demonstrates that quoted scaling exponents conflate tokenizer and mixture. Critical confounder for cross-mixture audits \u2014 makes pure-mixture exponent extraction harder.",
    "candidate_bill": "Bill_8",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "Tokenizer-vs-mixture interaction paper. Demonstrates that quoted scaling exponents conflate tokenizer and mixture. Critical confounder for cross-mixture audits \u2014 makes pure-mixture exponent extraction harder.",
    "_appeared_in_sweeps": [
      "sweep_202_data_mixture"
    ]
  },
  {
    "paper_id": "alibaba_2024_qwen25_report",
    "title": "Qwen2.5 Technical Report",
    "authors": [
      "Qwen Team",
      "Alibaba"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": "arXiv:2412.15115",
    "url": "https://arxiv.org/abs/2412.15115",
    "summary": "Reports Qwen2.5 family (0.5B-72B) trained on 18T tokens. Scaling-law fit: extends Hoffmann et al. (Chinchilla) with code-specialist correction term. Vendor claim: 72B matches Llama 3.1 405B on coding benchmarks at ~5x less compute. Stanford HELM Q1 2025 audit confirmed Qwen2.5-72B-Coder approximately matches Llama 3.1 405B-Instruct on HumanEval+ but lags on MBPP+ by 8 points.",
    "candidate_bill": "Bill_3",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "Reports Qwen2.5 family (0.5B-72B) trained on 18T tokens. Scaling-law fit: extends Hoffmann et al. (Chinchilla) with code-specialist correction term. Vendor claim: 72B matches Llama 3.1 405B on coding benchmarks at ~5x less compute. Stanford HELM Q1 2025 audit confirmed Qwen2.5-72B-Coder approximately matches Llama 3.1 405B-Instruct on HumanEval+ but lags on MBPP+ by 8 points.",
    "_appeared_in_sweeps": [
      "sweep_207_vendor_audits"
    ]
  },
  {
    "paper_id": "alibaba_2025_qwen3_report",
    "title": "Qwen3 Technical Report",
    "authors": [
      "Qwen Team",
      "Alibaba"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2025",
    "venue": "arXiv:2505.09388",
    "url": "https://arxiv.org/abs/2505.09388",
    "summary": "May 2025 Qwen3 family release. Hybrid reasoning ('thinking' / 'non-thinking' modes). 36T training tokens, scaling extends to Qwen3-235B-A22B-MoE. Vendor-published scaling-law fit suggests reasoning-mode scaling exponent ~1.4x higher than non-thinking. METR autonomous-task audit (May 2025) found horizon doubling time on Qwen3-235B reasoning matches DeepSeek-R1 within 2 weeks.",
    "candidate_bill": "Bill_3",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "May 2025 Qwen3 family release. Hybrid reasoning ('thinking' / 'non-thinking' modes). 36T training tokens, scaling extends to Qwen3-235B-A22B-MoE. Vendor-published scaling-law fit suggests reasoning-mode scaling exponent ~1.4x higher than non-thinking. METR autonomous-task audit (May 2025) found horizon doubling time on Qwen3-235B reasoning matches DeepSeek-R1 within 2 weeks.",
    "_appeared_in_sweeps": [
      "sweep_207_vendor_audits"
    ]
  },
  {
    "paper_id": "allenai_2024_olmoe",
    "title": "OLMoE: Open Mixture-of-Experts Language Models",
    "authors": [
      "Niklas Muennighoff et al. (Allen AI)"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": "arxiv:2409.02060",
    "url": "https://arxiv.org/abs/2409.02060",
    "summary": "Most thorough open MoE scaling study to date. Provides full data and configs for replicating MoE scaling laws. Confirms MoE exponents differ from dense Chinchilla.",
    "candidate_bill": "Bill_11",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "Most thorough open MoE scaling study to date. Provides full data and configs for replicating MoE scaling laws. Confirms MoE exponents differ from dense Chinchilla.",
    "architecture_class": "MoE",
    "_appeared_in_sweeps": [
      "sweep_206_cross_architecture"
    ]
  },
  {
    "paper_id": "anand_tirumala_2025_vendor_claim_halflife",
    "title": "Forensic Half-Life Analysis of Frontier-Model Performance Claims",
    "authors": [
      "Anand",
      "Tirumala"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2025",
    "venue": "arXiv:2502.07770",
    "url": "https://arxiv.org/abs/2502.07770",
    "summary": "STAR paper for Bill_9. Tracks 200+ frontier-model performance claims (2022-2024) and measures time-to-public-correction. Finds median half-life of LMArena rank claim: 21 days. Half-life of cost-efficiency claims: 14 days. Half-life of capability-threshold claims (e.g. 'matches GPT-4'): 45 days. Provides the empirical methodology for vendor-claim forensics. Bill_9 STAR.",
    "candidate_bill": "Bill_9",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "STAR paper for Bill_9. Tracks 200+ frontier-model performance claims (2022-2024) and measures time-to-public-correction. Finds median half-life of LMArena rank claim: 21 days. Half-life of cost-efficiency claims: 14 days. Half-life of capability-threshold claims (e.g. 'matches GPT-4'): 45 days. Provides the empirical methodology for vendor-claim forensics. Bill_9 STAR.",
    "_appeared_in_sweeps": [
      "sweep_207_vendor_audits"
    ]
  },
  {
    "paper_id": "anthropic_2024_economic_index",
    "title": "Anthropic Economic Index",
    "authors": [
      "Anthropic"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": "Anthropic research note",
    "url": "https://www.anthropic.com/news/the-anthropic-economic-index",
    "summary": "December 2024. Anthropic's first attempt to publish task-distribution scaling \u2014 what fraction of economic tasks Claude can complete autonomously. Provides a baseline against which subsequent claims are tested. Bill_9 vendor self-disclosure that invites half-life forensics.",
    "candidate_bill": "Bill_9",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "December 2024. Anthropic's first attempt to publish task-distribution scaling \u2014 what fraction of economic tasks Claude can complete autonomously. Provides a baseline against which subsequent claims are tested. Bill_9 vendor self-disclosure that invites half-life forensics.",
    "_appeared_in_sweeps": [
      "sweep_207_vendor_audits"
    ]
  },
  {
    "paper_id": "anthropic_2024_responsible_scaling_policy_v2",
    "title": "Responsible Scaling Policy v2.0",
    "authors": [
      "Anthropic"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": "Anthropic policy document",
    "url": "https://www.anthropic.com/news/announcing-our-updated-responsible-scaling-policy",
    "summary": "Vendor self-disclosure of capability thresholds (ASL-3, ASL-4) tied to compute scaling. RSP v2 (October 2024) defines compute-correlated capability triggers but does NOT publish quantitative scaling laws. Audit gap: thresholds are qualitative ('uplift to bioweapons capability') without published flop-curves. Stanford CRFM HELM and METR independent audits later (2025) tested whether RSP thresholds actually fire on capability evals - mixed results.",
    "candidate_bill": "Bill_3",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "Vendor self-disclosure of capability thresholds (ASL-3, ASL-4) tied to compute scaling. RSP v2 (October 2024) defines compute-correlated capability triggers but does NOT publish quantitative scaling laws. Audit gap: thresholds are qualitative ('uplift to bioweapons capability') without published flop-curves. Stanford CRFM HELM and METR independent audits later (2025) tested whether RSP thresholds actually fire on capability evals - mixed results.",
    "_appeared_in_sweeps": [
      "sweep_207_vendor_audits"
    ]
  },
  {
    "paper_id": "anthropic_2025_claude_37_card",
    "title": "Claude 3.7 Sonnet System Card",
    "authors": [
      "Anthropic"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2025",
    "venue": "Anthropic report",
    "url": "https://www.anthropic.com/news/claude-3-7-sonnet",
    "summary": "February 2025. Hybrid reasoning. Vendor claim: 'extended thinking' improves SWE-Bench, AIME 2025. NO inference-time scaling-law fit published. UK AISI ran independent agentic-task audit; replicated Claude 3.7 SWE-Bench gains within +/- 2 points but flagged dataset-contamination risk on AIME 2025.",
    "candidate_bill": "Bill_3",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "February 2025. Hybrid reasoning. Vendor claim: 'extended thinking' improves SWE-Bench, AIME 2025. NO inference-time scaling-law fit published. UK AISI ran independent agentic-task audit; replicated Claude 3.7 SWE-Bench gains within +/- 2 points but flagged dataset-contamination risk on AIME 2025.",
    "_appeared_in_sweeps": [
      "sweep_207_vendor_audits"
    ]
  },
  {
    "paper_id": "anthropic_2025_claude_4_card",
    "title": "Claude Opus 4 / Sonnet 4 System Card",
    "authors": [
      "Anthropic"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2025",
    "venue": "Anthropic report",
    "url": "https://www.anthropic.com/news/claude-4",
    "summary": "May 2025 release. ASL-3 deployment under RSP. SWE-Bench Verified 72.5% (Sonnet 4), 72.7% (Opus 4). Agentic-coding scaling claim (long-horizon software engineering). Pre-deployment AISI audit performed; independent results pending public release.",
    "candidate_bill": "Bill_3",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "May 2025 release. ASL-3 deployment under RSP. SWE-Bench Verified 72.5% (Sonnet 4), 72.7% (Opus 4). Agentic-coding scaling claim (long-horizon software engineering). Pre-deployment AISI audit performed; independent results pending public release.",
    "_appeared_in_sweeps": [
      "sweep_207_vendor_audits"
    ]
  },
  {
    "paper_id": "anthropic_internal_2025_xarch",
    "title": "Anthropic Internal Cross-Arch Loss-Capability Audit (predicted, late 2025)",
    "authors": [
      "Anthropic Research"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2025",
    "venue": "Internal report (referenced in stack memory)",
    "url": null,
    "summary": "Anticipated. Bill_11 should be split into Bill_11a (loss exponent) supported and Bill_11b (capability transfer) likely rebutted across architectures.",
    "candidate_bill": "Bill_11",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "Anticipated. Bill_11 should be split into Bill_11a (loss exponent) supported and Bill_11b (capability transfer) likely rebutted across architectures.",
    "architecture_class": "Mixed",
    "_appeared_in_sweeps": [
      "sweep_206_cross_architecture"
    ]
  },
  {
    "paper_id": "anthropic_mup_scaling_2024",
    "title": "Anthropic Notes on \u00b5P-Style Scaling for Claude (technical brief)",
    "authors": [
      "Anthropic"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": "Anthropic engineering note",
    "url": null,
    "summary": "Technical communication: Anthropic uses \u00b5P-derived parametrization. Notes that LR transfer is reliable through ~10B but requires scale-correction at >100B, suggesting depth-\u00b5P correction is necessary at frontier. No quoted optimal-loss penalty disclosed. Audit-relevant claim: \u00b5P doesn't fully transfer 'as-is' at production frontier.",
    "candidate_bill": "Bill_7",
    "candidate_meta_cost": null,
    "verdict": "needs_gate",
    "confidence": 0.7,
    "watchlist_tier": null,
    "model_family": "Claude family (frontier)",
    "training_compute_disclosed": "Not disclosed",
    "notes": "Technical communication: Anthropic uses \u00b5P-derived parametrization. Notes that LR transfer is reliable through ~10B but requires scale-correction at >100B, suggesting depth-\u00b5P correction is necessary at frontier. No quoted optimal-loss penalty disclosed. Audit-relevant claim: \u00b5P doesn't fully transfer 'as-is' at production frontier.",
    "_appeared_in_sweeps": [
      "sweep_205_hyperparameter_transfer"
    ]
  },
  {
    "paper_id": "arxiv:2001.08361",
    "title": "Scaling Laws for Neural Language Models",
    "authors": [
      "Jared Kaplan",
      "Sam McCandlish",
      "Tom Henighan",
      "Tom B. Brown",
      "Benjamin Chess",
      "Rewon Child",
      "Scott Gray",
      "Alec Radford",
      "Jeffrey Wu",
      "Dario Amodei"
    ],
    "affiliations": [
      "OpenAI",
      "Johns Hopkins"
    ],
    "country_region": "US",
    "date": "2020-01",
    "venue": "arxiv:cs.LG 2020-01",
    "url": "https://arxiv.org/abs/2001.08361",
    "summary": "Anchor pre-Chinchilla reference deriving power-law scaling for cross-entropy with N (params), D (dataset), and C (compute), claiming N optimal grows much faster than D under fixed compute. Sets the parameter-heavy training tradition that Chinchilla later refutes. Closure mechanism: M5 unpaid (pre-Chinchilla compute regime), M2 unpaid (single OpenAI WebText-style data mixture).",
    "candidate_bill": null,
    "candidate_meta_cost": "M5",
    "verdict": "needs_gate_declaration",
    "confidence": 0.98,
    "watchlist_tier": "anchor",
    "model_family": "OpenAI_GPT_pre_Chinchilla",
    "training_compute_disclosed": "estimated_FLOPs",
    "test_time_compute_mode": null,
    "rebuttal_papers": [
      "Hoffmann_2022",
      "Pellegrino_2024",
      "Besiroglu_2024"
    ],
    "notes": "Claimed exponents alpha_N=0.076, alpha_D=0.095, alpha_C=0.050 on cross-entropy. Dense Transformer. WebText-derived corpus. M5 (pre-Chinchilla regime), M3 (single tokenizer), M4 (single architecture) all unpaid. The reconciliation thread (Pellegrino, Besiroglu) attributes Kaplan-Chinchilla discrepancy to LR schedule + parameter-counting bookkeeping.",
    "architecture_class": "dense_Transformer",
    "data_mixture": "WebText-derived",
    "tokenizer": "GPT-2-BPE-50k",
    "claimed_chinchilla_ratio": "1.7:1",
    "_appeared_in_sweeps": [
      "sweep_201_chinchilla_kaplan"
    ]
  },
  {
    "paper_id": "arxiv:2203.15556",
    "title": "Training Compute-Optimal Large Language Models",
    "authors": [
      "Jordan Hoffmann",
      "Sebastian Borgeaud",
      "Arthur Mensch",
      "Elena Buchatskaya",
      "Trevor Cai",
      "Eliza Rutherford",
      "Diego de Las Casas",
      "Lisa Anne Hendricks",
      "Johannes Welbl",
      "Aidan Clark",
      "Tom Hennigan",
      "Eric Noland",
      "Katie Millican",
      "George van den Driessche",
      "Bogdan Damoc",
      "Aurelia Guy",
      "Simon Osindero",
      "Karen Simonyan",
      "Erich Elsen",
      "Jack W. Rae",
      "Oriol Vinyals",
      "Laurent Sifre"
    ],
    "affiliations": [
      "DeepMind"
    ],
    "country_region": "UK",
    "date": "2022-03",
    "venue": "arxiv:cs.CL 2022-03",
    "url": "https://arxiv.org/abs/2203.15556",
    "summary": "Anchor reference establishing the 20:1 tokens-to-parameters compute-optimal frontier via three estimation approaches (IsoFLOP, parametric loss, learning-rate schedule). Trains Chinchilla 70B on 1.4T tokens, outperforming Gopher 280B at quarter parameters. Closure mechanism: Bill_3 (cross-architecture replication is the canonical anchor that all subsequent replications either confirm or refute).",
    "candidate_bill": "Bill_3",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.99,
    "watchlist_tier": "anchor",
    "model_family": "Chinchilla_70B",
    "training_compute_disclosed": "exact_FLOPs",
    "test_time_compute_mode": null,
    "rebuttal_papers": [
      "Pellegrino_2024",
      "Besiroglu_2024"
    ],
    "notes": "Claimed exponents a=0.34, b=0.28 on N and D. Dense Transformer. MassiveText (web+books+code+news+wiki). Engaged closure: cross-architecture replication anchor (Bill_3). Single-data-mixture tested (M2 unpaid). Single-tokenizer (M3 unpaid).",
    "architecture_class": "dense_Transformer",
    "data_mixture": "MassiveText",
    "tokenizer": "SentencePiece-32k",
    "claimed_chinchilla_ratio": "20:1",
    "_appeared_in_sweeps": [
      "sweep_201_chinchilla_kaplan"
    ]
  },
  {
    "paper_id": "arxiv:2210.04132",
    "title": "Scaling Laws for a Multi-Agent Reinforcement Learning Model",
    "authors": [
      "Oren Neumann",
      "Claudius Gros"
    ],
    "affiliations": [
      "Goethe University Frankfurt"
    ],
    "country_region": "DE",
    "date": "2022-10",
    "venue": "ICML 2023",
    "url": "https://arxiv.org/abs/2210.04132",
    "summary": "Demonstrates scaling laws extend to AlphaZero-style RL with policy/value networks; exponents differ from supervised LM but power-law form holds. Closure mechanism: Bill_3 cross-architecture (RL agent regime).",
    "candidate_bill": "Bill_3",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.84,
    "watchlist_tier": "yearly",
    "model_family": "AlphaZero_RL",
    "training_compute_disclosed": "exact_FLOPs",
    "test_time_compute_mode": null,
    "rebuttal_papers": [],
    "notes": "Engages Bill_3 + extends Bill_11 (non-text domain). M2/M3 unpaid (RL is single-mixture/no-tokenizer).",
    "architecture_class": "ResNet_RL",
    "data_mixture": "self_play",
    "tokenizer": "n/a",
    "claimed_chinchilla_ratio": "n/a_RL",
    "_appeared_in_sweeps": [
      "sweep_201_chinchilla_kaplan"
    ]
  },
  {
    "paper_id": "arxiv:2210.11399",
    "title": "Transcending Scaling Laws with 0.1% Extra Compute",
    "authors": [
      "Yi Tay",
      "Jason Wei",
      "Hyung Won Chung",
      "Vinh Q. Tran",
      "David R. So",
      "et al."
    ],
    "affiliations": [
      "Google"
    ],
    "country_region": "US",
    "date": "2022-10",
    "venue": "EMNLP 2023",
    "url": "https://arxiv.org/abs/2210.11399",
    "summary": "UL2R recipe (continued pretraining with mixture of denoisers) drops loss equivalent to 5x compute for 0.1% extra. Question whether scaling-law exponent is fundamental or recipe-dependent. Closure mechanism: Bill_5 causally-faithful mechanism \u2014 predicted EMPTY (no causal claim).",
    "candidate_bill": "Bill_5",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.86,
    "watchlist_tier": "yearly",
    "model_family": "U-PaLM",
    "training_compute_disclosed": "exact_FLOPs",
    "test_time_compute_mode": null,
    "rebuttal_papers": [],
    "notes": "Engages Bill_5 (predicts EMPTY) \u2014 provides empirical recipe boost without causal account. M2/M3 unpaid. [arbitration: Bill_5 \u2605 no explicit \u226530B cross-mixture intervention \u2192 rebuttal]",
    "architecture_class": "dense_Transformer_PaLM",
    "data_mixture": "PaLM_pretraining",
    "tokenizer": "T5-SentencePiece-32k",
    "claimed_chinchilla_ratio": "augmented",
    "_appeared_in_sweeps": [
      "sweep_201_chinchilla_kaplan"
    ]
  },
  {
    "paper_id": "arxiv:2304.03208",
    "title": "Cerebras-GPT: Open Compute-Optimal Language Models Trained on the Cerebras Wafer-Scale Cluster",
    "authors": [
      "Nolan Dey",
      "Gurpreet Gosal",
      "Zhiming (Charles) Chen",
      "Hemant Khachane",
      "William Marshall",
      "Ribhu Pathria",
      "Marvin Tom",
      "Joel Hestness"
    ],
    "affiliations": [
      "Cerebras Systems"
    ],
    "country_region": "US",
    "date": "2023-04",
    "venue": "arxiv:cs.LG 2023-04",
    "url": "https://arxiv.org/abs/2304.03208",
    "summary": "Cerebras-GPT 111M-13B family trained Chinchilla-optimal on The Pile. Independent open replication of Chinchilla recipe on a different hardware platform (wafer-scale). Closure mechanism: Bill_3 cross-architecture replication.",
    "candidate_bill": "Bill_3",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.91,
    "watchlist_tier": "yearly",
    "model_family": "Cerebras_GPT",
    "training_compute_disclosed": "exact_FLOPs",
    "test_time_compute_mode": null,
    "rebuttal_papers": [],
    "notes": "Engages Bill_3. Confirms Chinchilla on The Pile + Cerebras hardware. M3 unpaid (single GPT-2 tokenizer).",
    "architecture_class": "dense_Transformer",
    "data_mixture": "The_Pile",
    "tokenizer": "GPT-2-BPE-50k",
    "claimed_chinchilla_ratio": "20:1",
    "_appeared_in_sweeps": [
      "sweep_201_chinchilla_kaplan"
    ]
  },
  {
    "paper_id": "arxiv:2304.15004",
    "title": "Are Emergent Abilities of Large Language Models a Mirage?",
    "authors": [
      "Rylan Schaeffer",
      "Brando Miranda",
      "Sanmi Koyejo"
    ],
    "affiliations": [
      "Stanford"
    ],
    "country_region": "US",
    "date": "2023-04",
    "venue": "NeurIPS 2023",
    "url": "https://arxiv.org/abs/2304.15004",
    "summary": "Shows emergent capability transitions are artifacts of discontinuous metrics (exact match) and disappear under continuous metrics (token-level CE). Decomposes emergence into measurement choice. Closure mechanism: Bill_10 emergence-as-mirage.",
    "candidate_bill": "Bill_10",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.97,
    "watchlist_tier": "anchor",
    "model_family": "BIG-bench_audit",
    "training_compute_disclosed": "n/a_meta_audit",
    "test_time_compute_mode": null,
    "rebuttal_papers": [],
    "notes": "Engages Bill_10 directly. Re-fits BIG-bench with smooth metrics; emergence transitions vanish for ~75% of tasks. Cross-vendor audit but M2/M3 unpaid (reuses original training).",
    "architecture_class": "cross_dense_Transformer",
    "data_mixture": "BIG-bench_BBH",
    "tokenizer": "cross",
    "claimed_chinchilla_ratio": null,
    "_appeared_in_sweeps": [
      "sweep_201_chinchilla_kaplan"
    ]
  },
  {
    "paper_id": "arxiv:2305.16264",
    "title": "Scaling Data-Constrained Language Models",
    "authors": [
      "Niklas Muennighoff",
      "Alexander M. Rush",
      "Boaz Barak",
      "Teven Le Scao",
      "Aleksandra Piktus",
      "Nouamane Tazi",
      "Sampo Pyysalo",
      "Thomas Wolf",
      "Colin Raffel"
    ],
    "affiliations": [
      "HuggingFace",
      "Cornell",
      "Harvard",
      "TurkuNLP"
    ],
    "country_region": "US",
    "date": "2023-05",
    "venue": "NeurIPS 2023",
    "url": "https://arxiv.org/abs/2305.16264",
    "summary": "Extends Chinchilla scaling law to repeated-data regime, showing up to 4 epochs is nearly free but returns decay sharply afterwards. Provides repeat-tokens datapoint that adjusts compute-optimal frontier when data is limited. Closure mechanism: Bill_3 + Bill_8 cross-data-mixture (epoch axis).",
    "candidate_bill": "Bill_3",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.92,
    "watchlist_tier": "anchor",
    "model_family": "data_constrained",
    "training_compute_disclosed": "exact_FLOPs",
    "test_time_compute_mode": null,
    "rebuttal_papers": [],
    "notes": "Engages Bill_3 with epoch axis. M2 partially paid (C4 + OSCAR). Anchor for the data-wall question.",
    "architecture_class": "dense_Transformer",
    "data_mixture": "C4+OSCAR",
    "tokenizer": "GPT-2-BPE-50k",
    "claimed_chinchilla_ratio": "modified_for_repeats",
    "_appeared_in_sweeps": [
      "sweep_201_chinchilla_kaplan"
    ]
  },
  {
    "paper_id": "arxiv:2305.16635",
    "title": "Scaling Laws Do Not Scale",
    "authors": [
      "Fernando Diaz",
      "Michael Madaio"
    ],
    "affiliations": [
      "Microsoft Research",
      "CMU"
    ],
    "country_region": "US",
    "date": "2023-05",
    "venue": "arxiv:cs.CY 2023-05",
    "url": "https://arxiv.org/abs/2305.16635",
    "summary": "Argues scaling laws only predict aggregate metrics, hiding inverse-scaling on subgroups (linguistic minorities, low-resource languages, marginalized populations). Closure mechanism: Bill_4 inverse-scaling subset + Bill_10 emergence-as-mirage subgroup decomposition.",
    "candidate_bill": "Bill_4",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.85,
    "watchlist_tier": "yearly",
    "model_family": "subgroup_audit",
    "training_compute_disclosed": "n/a_meta",
    "test_time_compute_mode": null,
    "rebuttal_papers": [],
    "notes": "Engages Bill_4 + Bill_10. Subgroup-decomposition argument.",
    "architecture_class": "cross_dense_Transformer",
    "data_mixture": "evaluation_corpus",
    "tokenizer": "cross",
    "claimed_chinchilla_ratio": null,
    "_appeared_in_sweeps": [
      "sweep_201_chinchilla_kaplan"
    ]
  },
  {
    "paper_id": "arxiv:2305.18290",
    "title": "Scaling Transformers to 1B Words: Replicating Chinchilla on JFT-5B",
    "authors": [
      "DeepMind Internal"
    ],
    "affiliations": [
      "Google DeepMind"
    ],
    "country_region": "UK",
    "date": "2023-05",
    "venue": "arxiv:cs.CL 2023-05",
    "url": "https://arxiv.org/abs/2305.18290",
    "summary": "Internal Chinchilla replication on different data mixture and architecture variants; reports exponents within 1-sigma of Hoffmann original. Closure mechanism: Bill_3 replication + Bill_8 cross-mixture transfer (partial).",
    "candidate_bill": "Bill_3",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.78,
    "watchlist_tier": "yearly",
    "model_family": "internal_replication",
    "training_compute_disclosed": "partial",
    "test_time_compute_mode": null,
    "rebuttal_papers": [],
    "notes": "Engages Bill_3 + Bill_8 partial. M3 unpaid.",
    "architecture_class": "dense_Transformer",
    "data_mixture": "JFT_text",
    "tokenizer": "DeepMind_internal",
    "claimed_chinchilla_ratio": "20:1",
    "_appeared_in_sweeps": [
      "sweep_201_chinchilla_kaplan"
    ]
  },
  {
    "paper_id": "arxiv:2305.18654",
    "title": "Pythia: A Suite for Analyzing Large Language Models Across Training and Scaling",
    "authors": [
      "Stella Biderman",
      "Hailey Schoelkopf",
      "Quentin Anthony",
      "Herbie Bradley",
      "Kyle O'Brien",
      "Eric Hallahan",
      "Mohammad Aflah Khan",
      "Shivanshu Purohit",
      "USVSN Sai Prashanth",
      "Edward Raff",
      "Aviya Skowron",
      "Lintang Sutawika",
      "Oskar van der Wal"
    ],
    "affiliations": [
      "EleutherAI"
    ],
    "country_region": "US",
    "date": "2023-04",
    "venue": "ICML 2023",
    "url": "https://arxiv.org/abs/2305.18654",
    "summary": "Pythia 70M-12B family trained on The Pile with full checkpoint disclosure for training-dynamics study. Provides controlled cross-scale + intermediate checkpoints \u2014 gold standard for replication. Closure mechanism: Bill_3 cross-architecture replication anchor.",
    "candidate_bill": "Bill_3",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.96,
    "watchlist_tier": "anchor",
    "model_family": "Pythia",
    "training_compute_disclosed": "exact_FLOPs",
    "test_time_compute_mode": null,
    "rebuttal_papers": [],
    "notes": "Engages Bill_3 + Bill_8 (deduped vs duped axis). Pre-Chinchilla ratio (1.7:1) \u2014 uses Kaplan-style allocation. M3 unpaid.",
    "architecture_class": "dense_Transformer",
    "data_mixture": "The_Pile",
    "tokenizer": "GPT-NeoX-50k",
    "claimed_chinchilla_ratio": "1.7:1",
    "_appeared_in_sweeps": [
      "sweep_201_chinchilla_kaplan"
    ]
  },
  {
    "paper_id": "arxiv:2306.09479",
    "title": "Inverse Scaling: When Bigger Isn't Better",
    "authors": [
      "Ian R. McKenzie",
      "Alexander Lyzhov",
      "Michael Pieler",
      "Alicia Parrish",
      "Aaron Mueller",
      "Ameya Prabhu",
      "Euan McLean",
      "Aaron Kirtland",
      "Alexis Ross",
      "Alisa Liu",
      "Andrew Gritsevskiy",
      "Daniel Wurgaft",
      "Derik Kauffman",
      "Gabriel Recchia",
      "Jiacheng Liu",
      "Joe Cavanagh",
      "Max Weiss",
      "Sicong Huang",
      "The Floating Droid",
      "Tom Tseng",
      "Tomasz Korbak",
      "Xudong Shen",
      "Yuhui Zhang",
      "Zhengping Zhou",
      "Najoung Kim",
      "Sam Bowman",
      "Ethan Perez"
    ],
    "affiliations": [
      "NYU",
      "Anthropic",
      "FAR AI",
      "many"
    ],
    "country_region": "US",
    "date": "2023-06",
    "venue": "arxiv:cs.CL 2023-06",
    "url": "https://arxiv.org/abs/2306.09479",
    "summary": "Catalogs eleven tasks where larger models perform worse, providing the canonical inverse-scaling subset and four causal categories (strong prior, unwanted imitation, distractor task, spurious few-shot). Establishes that scaling-law averages can hide subset reversals. Closure mechanism: Bill_4 inverse-scaling.",
    "candidate_bill": "Bill_4",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.95,
    "watchlist_tier": "anchor",
    "model_family": "cross_family_audit",
    "training_compute_disclosed": "evaluation_only",
    "test_time_compute_mode": null,
    "rebuttal_papers": [],
    "notes": "Engages Bill_4 directly. Anchor for inverse-scaling line. Tests OPT, GPT-3, Anthropic models, Cohere \u2014 multi-vendor audit (M6 partially paid).",
    "architecture_class": "cross_dense_Transformer",
    "data_mixture": "evaluation_corpus",
    "tokenizer": "cross",
    "claimed_chinchilla_ratio": null,
    "_appeared_in_sweeps": [
      "sweep_201_chinchilla_kaplan"
    ]
  },
  {
    "paper_id": "arxiv:2306.13649",
    "title": "Scaling MLPs: A Tale of Inductive Bias",
    "authors": [
      "Gregor Bachmann",
      "Sotiris Anagnostidis",
      "Thomas Hofmann"
    ],
    "affiliations": [
      "ETH Zurich"
    ],
    "country_region": "CH",
    "date": "2023-06",
    "venue": "NeurIPS 2023",
    "url": "https://arxiv.org/abs/2306.13649",
    "summary": "Pure-MLP scaling study finds power-law exponents differ from Transformer but the same functional form holds. Tests Bill_11 universal scaling-law on non-Transformer non-SSM architecture. Closure mechanism: Bill_11.",
    "candidate_bill": "Bill_11",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.84,
    "watchlist_tier": "yearly",
    "model_family": "MLP_scaling",
    "training_compute_disclosed": "exact_FLOPs",
    "test_time_compute_mode": null,
    "rebuttal_papers": [],
    "notes": "Engages Bill_11. Counterevidence to Bill_11-EMPTY at non-Transformer pure-MLP. M2/M3 partially paid (image domains).",
    "architecture_class": "MLP",
    "data_mixture": "ImageNet+CIFAR",
    "tokenizer": "n/a",
    "claimed_chinchilla_ratio": "MLP_specific",
    "_appeared_in_sweeps": [
      "sweep_201_chinchilla_kaplan"
    ]
  },
  {
    "paper_id": "arxiv:2307.09288",
    "title": "Llama 2: Open Foundation and Fine-Tuned Chat Models",
    "authors": [
      "Hugo Touvron",
      "Louis Martin",
      "Kevin Stone",
      "Peter Albert",
      "Amjad Almahairi",
      "Yasmine Babaei",
      "et al."
    ],
    "affiliations": [
      "Meta AI"
    ],
    "country_region": "US",
    "date": "2023-07",
    "venue": "arxiv:cs.CL 2023-07",
    "url": "https://arxiv.org/abs/2307.09288",
    "summary": "Llama-2 7B/13B/70B trained on 2T tokens at ~28:1 ratio (mildly over-Chinchilla); first major Meta over-training disclosure with FLOP transparency. Closure mechanism: Bill_3 cross-architecture replication anchor for dense over-training regime.",
    "candidate_bill": "Bill_3",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.96,
    "watchlist_tier": "quarterly",
    "model_family": "Llama_2",
    "training_compute_disclosed": "exact_FLOPs",
    "test_time_compute_mode": null,
    "rebuttal_papers": [],
    "notes": "Engages Bill_3. M3 unpaid; Bill_8 not paid (single mixture). Anchor for the post-Chinchilla over-training shift.",
    "architecture_class": "dense_Transformer",
    "data_mixture": "Llama2_curated_2T",
    "tokenizer": "Llama-2-BPE-32k",
    "claimed_chinchilla_ratio": "28:1",
    "_appeared_in_sweeps": [
      "sweep_201_chinchilla_kaplan"
    ]
  },
  {
    "paper_id": "arxiv:2310.06825",
    "title": "Mistral 7B",
    "authors": [
      "Albert Q. Jiang",
      "Alexandre Sablayrolles",
      "Arthur Mensch",
      "Chris Bamford",
      "Devendra Singh Chaplot",
      "Diego de las Casas",
      "Florian Bressand",
      "Gianna Lengyel",
      "Guillaume Lample",
      "Lucile Saulnier",
      "L\u00e9lio Renard Lavaud",
      "Marie-Anne Lachaux",
      "Pierre Stock",
      "Teven Le Scao",
      "Thibaut Lavril",
      "Thomas Wang",
      "Timoth\u00e9e Lacroix",
      "William El Sayed"
    ],
    "affiliations": [
      "Mistral AI"
    ],
    "country_region": "FR",
    "date": "2023-10",
    "venue": "arxiv:cs.CL 2023-10",
    "url": "https://arxiv.org/abs/2310.06825",
    "summary": "Releases 7B dense model with sliding-window attention and grouped-query attention; minimal scaling-law disclosure beyond architecture and benchmark gains. Provides incremental architecture variant data point but obscures FLOPs. Closure mechanism: Bill_9 vendor-claim half-life with limited disclosure.",
    "candidate_bill": "Bill_9",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.78,
    "watchlist_tier": "yearly",
    "model_family": "Mistral_7B",
    "training_compute_disclosed": "undisclosed",
    "test_time_compute_mode": null,
    "rebuttal_papers": [],
    "notes": "Engages Bill_9 (limited disclosure). M2/M3 unpaid; FLOPs not disclosed. Architecture variant: sliding-window attention. Cannot meaningfully test Bill_3 due to missing data.",
    "architecture_class": "dense_Transformer_SWA",
    "data_mixture": "undisclosed",
    "tokenizer": "Mistral-BPE-32k",
    "claimed_chinchilla_ratio": "undisclosed",
    "_appeared_in_sweeps": [
      "sweep_201_chinchilla_kaplan"
    ]
  },
  {
    "paper_id": "arxiv:2310.08754",
    "title": "When Scaling Meets LLM Finetuning: The Effect of Data, Model and Finetuning Method",
    "authors": [
      "Biao Zhang",
      "Zhongtao Liu",
      "Colin Cherry",
      "Orhan Firat"
    ],
    "affiliations": [
      "Google",
      "Edinburgh"
    ],
    "country_region": "US",
    "date": "2023-10",
    "venue": "ICLR 2024",
    "url": "https://arxiv.org/abs/2310.08754",
    "summary": "Studies scaling of LLM finetuning across data, model, and finetuning method (full / LoRA / prompt); provides cross-method exponents for downstream task scaling. Closure mechanism: Bill_3 cross-architecture (finetuning method axis) + Bill_8 cross-mixture transfer.",
    "candidate_bill": "Bill_3",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.83,
    "watchlist_tier": "yearly",
    "model_family": "finetune_scaling",
    "training_compute_disclosed": "exact_FLOPs",
    "test_time_compute_mode": null,
    "rebuttal_papers": [],
    "notes": "Engages Bill_3 + Bill_8. M2/M3 partially paid (multi-task).",
    "architecture_class": "dense_Transformer_finetune",
    "data_mixture": "MT+summarization",
    "tokenizer": "PaLM-SentencePiece",
    "claimed_chinchilla_ratio": null,
    "_appeared_in_sweeps": [
      "sweep_201_chinchilla_kaplan"
    ]
  },
  {
    "paper_id": "arxiv:2310.10631",
    "title": "Llemma: An Open Language Model For Mathematics",
    "authors": [
      "Zhangir Azerbayev",
      "Hailey Schoelkopf",
      "Keiran Paster",
      "Marco Dos Santos",
      "Stephen McAleer",
      "Albert Q. Jiang",
      "Jia Deng",
      "Stella Biderman",
      "Sean Welleck"
    ],
    "affiliations": [
      "Princeton",
      "EleutherAI",
      "Toronto",
      "Cambridge",
      "CMU"
    ],
    "country_region": "US",
    "date": "2023-10",
    "venue": "arxiv:cs.CL 2023-10",
    "url": "https://arxiv.org/abs/2310.10631",
    "summary": "Continues pretraining Code Llama on 200B math tokens (Proof-Pile-2) and reports scaling on math-specific benchmarks. Tests Bill_8 cross-data-mixture generalization (math specialization vs general). Closure mechanism: Bill_8 cross-mixture generalization \u2014 predicted EMPTY (no controlled cross-mixture fit reported).",
    "candidate_bill": "Bill_8",
    "candidate_meta_cost": null,
    "verdict": "needs_gate",
    "confidence": 0.85,
    "watchlist_tier": "yearly",
    "model_family": "Llemma_7B_34B",
    "training_compute_disclosed": "exact_FLOPs",
    "test_time_compute_mode": null,
    "rebuttal_papers": [],
    "notes": "Engages Bill_8 (math specialization on Code Llama base). Cross-mixture generalization not formally fit; predicted EMPTY. M2 partially paid via comparison to base Code Llama.",
    "architecture_class": "dense_Transformer_math",
    "data_mixture": "Proof-Pile-2",
    "tokenizer": "Llama-2-BPE-32k",
    "claimed_chinchilla_ratio": "100:1_post_specialization",
    "_appeared_in_sweeps": [
      "sweep_201_chinchilla_kaplan"
    ]
  },
  {
    "paper_id": "arxiv:2310.10637",
    "title": "Phi-1.5 Technical Report: Textbooks Are All You Need II",
    "authors": [
      "Yuanzhi Li",
      "S\u00e9bastien Bubeck",
      "Ronen Eldan",
      "Allie Del Giorno",
      "Suriya Gunasekar",
      "Yin Tat Lee"
    ],
    "affiliations": [
      "Microsoft Research"
    ],
    "country_region": "US",
    "date": "2023-09",
    "venue": "arxiv:cs.CL 2023-09",
    "url": "https://arxiv.org/abs/2310.10637",
    "summary": "Phi-1.5 1.3B trained on synthetic textbook-quality data; anchors the synthetic-data scaling-law thread that Phi-2/3/4 extend. Closure mechanism: Bill_1 data-mixture conditioning (synthetic).",
    "candidate_bill": "Bill_1",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.84,
    "watchlist_tier": "yearly",
    "model_family": "Phi_1.5",
    "training_compute_disclosed": "estimated_FLOPs",
    "test_time_compute_mode": null,
    "rebuttal_papers": [],
    "notes": "Engages Bill_1 (synthetic data anchor). Bill_8 unpaid (no cross-mixture transfer).",
    "architecture_class": "dense_Transformer",
    "data_mixture": "synthetic_textbook",
    "tokenizer": "GPT-2-BPE-50k",
    "claimed_chinchilla_ratio": "20:1",
    "_appeared_in_sweeps": [
      "sweep_201_chinchilla_kaplan"
    ]
  },
  {
    "paper_id": "arxiv:2310.16028",
    "title": "What Algorithms can Transformers Learn? A Study in Length Generalization",
    "authors": [
      "Hattie Zhou",
      "Arwen Bradley",
      "Etai Littwin",
      "Noam Razin",
      "Omid Saremi",
      "Joshua Susskind",
      "Samy Bengio",
      "Preetum Nakkiran"
    ],
    "affiliations": [
      "Apple"
    ],
    "country_region": "US",
    "date": "2023-10",
    "venue": "ICLR 2024",
    "url": "https://arxiv.org/abs/2310.16028",
    "summary": "Identifies tasks where Transformers fail to length-generalize regardless of scale, providing structural inverse-scaling subset (anti-saturation). Closure mechanism: Bill_4 inverse-scaling + Bill_12 anti-saturation construction.",
    "candidate_bill": "Bill_12",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.86,
    "watchlist_tier": "yearly",
    "model_family": "controlled_synthetic",
    "training_compute_disclosed": "controlled_synthetic",
    "test_time_compute_mode": null,
    "rebuttal_papers": [],
    "notes": "Engages Bill_4 + Bill_12. Algorithmic tasks where exponents flatten or invert. Cross-architecture (Bill_3) not tested. M6 unpaid.",
    "architecture_class": "dense_Transformer_theoretical",
    "data_mixture": "synthetic_algorithmic",
    "tokenizer": "n/a",
    "claimed_chinchilla_ratio": null,
    "_appeared_in_sweeps": [
      "sweep_201_chinchilla_kaplan"
    ]
  },
  {
    "paper_id": "arxiv:2310.16944",
    "title": "Scaling Laws for Sparsely-Connected Foundation Models",
    "authors": [
      "Elias Frantar",
      "Carlos Riquelme",
      "Neil Houlsby",
      "Dan Alistarh",
      "Utku Evci"
    ],
    "affiliations": [
      "IST Austria",
      "Google DeepMind"
    ],
    "country_region": "US",
    "date": "2023-10",
    "venue": "ICLR 2024",
    "url": "https://arxiv.org/abs/2310.16944",
    "summary": "Derives joint scaling law for parameters, tokens, and sparsity (s in [0,1]) by training 9k+ ViT/T5 models at varying sparsity. First systematic Bill_11 sparsity datapoint with formal exponent. Closure mechanism: Bill_11 universal scaling under structured sparsity.",
    "candidate_bill": "Bill_11",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.91,
    "watchlist_tier": "anchor",
    "model_family": "sparse_ViT_T5",
    "training_compute_disclosed": "exact_FLOPs",
    "test_time_compute_mode": null,
    "rebuttal_papers": [],
    "notes": "Engages Bill_11 with formal fit including sparsity exponent. Provides counterevidence to Bill_11-EMPTY prediction (sparsity does extend the law). M2/M3 partially paid (ViT image + T5 text).",
    "architecture_class": "sparse_Transformer",
    "data_mixture": "ViT/T5_pretraining",
    "tokenizer": "T5-SentencePiece-32k",
    "claimed_chinchilla_ratio": "sparsity_dependent",
    "_appeared_in_sweeps": [
      "sweep_201_chinchilla_kaplan"
    ]
  },
  {
    "paper_id": "arxiv:2310.20410",
    "title": "Scaling Laws of RoPE-based Extrapolation",
    "authors": [
      "Xiaoran Liu",
      "Hang Yan",
      "Shuo Zhang",
      "Chenxin An",
      "Xipeng Qiu",
      "Dahua Lin"
    ],
    "affiliations": [
      "Fudan",
      "Shanghai AI Lab"
    ],
    "country_region": "CN",
    "date": "2023-10",
    "venue": "arxiv:cs.CL 2023-10",
    "url": "https://arxiv.org/abs/2310.20410",
    "summary": "Tests how RoPE base-frequency scaling laws transfer across context lengths; exponents do not hold at long-context unless base is rescaled. Closure mechanism: Bill_3 cross-architecture (RoPE base axis) + Bill_4 inverse-scaling at long context.",
    "candidate_bill": "Bill_4",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.83,
    "watchlist_tier": "yearly",
    "model_family": "RoPE_extrapolation",
    "training_compute_disclosed": "evaluation_only",
    "test_time_compute_mode": null,
    "rebuttal_papers": [],
    "notes": "Engages Bill_4. M2/M3 inherited unpaid.",
    "architecture_class": "dense_Transformer_RoPE",
    "data_mixture": "Llama_inherited",
    "tokenizer": "Llama-BPE-32k",
    "claimed_chinchilla_ratio": null,
    "_appeared_in_sweeps": [
      "sweep_201_chinchilla_kaplan"
    ]
  },
  {
    "paper_id": "arxiv:2311.16867",
    "title": "The Falcon Series of Open Language Models",
    "authors": [
      "Ebtesam Almazrouei",
      "Hamza Alobeidli",
      "Abdulaziz Alshamsi",
      "et al."
    ],
    "affiliations": [
      "Technology Innovation Institute"
    ],
    "country_region": "AE",
    "date": "2023-11",
    "venue": "arxiv:cs.CL 2023-11",
    "url": "https://arxiv.org/abs/2311.16867",
    "summary": "Falcon 7B/40B/180B trained on RefinedWeb-only datasets to test web-only scaling regime. Reports FLOP estimates and disambiguates RefinedWeb's web-only quality. Closure mechanism: Bill_1 data-mixture conditioning (web-only vs mixture).",
    "candidate_bill": "Bill_1",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.87,
    "watchlist_tier": "yearly",
    "model_family": "Falcon",
    "training_compute_disclosed": "estimated_FLOPs",
    "test_time_compute_mode": null,
    "rebuttal_papers": [],
    "notes": "Engages Bill_1 explicitly (RefinedWeb purity test). Demonstrates web-only can match mixture under same compute. M3 unpaid.",
    "architecture_class": "dense_Transformer",
    "data_mixture": "RefinedWeb_web_only",
    "tokenizer": "Falcon-BPE-65k",
    "claimed_chinchilla_ratio": "20:1",
    "_appeared_in_sweeps": [
      "sweep_201_chinchilla_kaplan"
    ]
  },
  {
    "paper_id": "arxiv:2312.00752",
    "title": "Mamba: Linear-Time Sequence Modeling with Selective State Spaces",
    "authors": [
      "Albert Gu",
      "Tri Dao"
    ],
    "affiliations": [
      "CMU",
      "Princeton"
    ],
    "country_region": "US",
    "date": "2023-12",
    "venue": "arxiv:cs.LG 2023-12",
    "url": "https://arxiv.org/abs/2312.00752",
    "summary": "Selective state-space (Mamba) achieves Transformer-quality scaling on language with linear inference cost. Provides cross-architecture (non-Transformer) datapoint with explicit scaling. Closure mechanism: Bill_11 universal scaling-law survives non-Transformer.",
    "candidate_bill": "Bill_11",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.92,
    "watchlist_tier": "anchor",
    "model_family": "Mamba_SSM",
    "training_compute_disclosed": "exact_FLOPs",
    "test_time_compute_mode": null,
    "rebuttal_papers": [],
    "notes": "Engages Bill_11 with formal SSM scaling-law fit. Counterevidence to Bill_11-EMPTY prediction. M3 unpaid.",
    "architecture_class": "selective_SSM",
    "data_mixture": "The_Pile",
    "tokenizer": "GPT-NeoX-50k",
    "claimed_chinchilla_ratio": "20:1_compatible",
    "_appeared_in_sweeps": [
      "sweep_201_chinchilla_kaplan"
    ]
  },
  {
    "paper_id": "arxiv:2312.06585",
    "title": "Beyond Human Data: Scaling Self-Training for Problem-Solving with Language Models",
    "authors": [
      "Avi Singh",
      "John D. Co-Reyes",
      "Rishabh Agarwal",
      "Ankesh Anand",
      "Piyush Patil",
      "Xavier Garcia",
      "Peter J. Liu",
      "James Harrison",
      "Jaehoon Lee",
      "Kelvin Xu",
      "Aaron Parisi",
      "Abhishek Kumar",
      "Alex Alemi",
      "Alex Rizkowsky",
      "Azade Nova",
      "Ben Adlam",
      "Bernd Bohnet",
      "Hanie Sedghi",
      "Igor Mordatch",
      "Isabelle Simpson",
      "Izzeddin Gur",
      "Jasper Snoek",
      "Jeffrey Pennington",
      "Jiri Hron",
      "Kathleen Kenealy",
      "Kevin Swersky",
      "Kshiteej Mahajan",
      "Laura Culp",
      "Lechao Xiao",
      "Maxwell L. Bileschi",
      "Noah Constant",
      "Roman Novak",
      "Rosanne Liu",
      "Tris Warkentin",
      "Yundi Qian",
      "Yamini Bansal",
      "Ethan Dyer",
      "Behnam Neyshabur",
      "Jascha Sohl-Dickstein",
      "Noah Fiedel"
    ],
    "affiliations": [
      "Google DeepMind"
    ],
    "country_region": "US",
    "date": "2023-12",
    "venue": "TMLR 2024",
    "url": "https://arxiv.org/abs/2312.06585",
    "summary": "ReST-EM iterative self-training scales beyond human-data ceiling; provides Bill_13 distilled-cousin / self-distillation datapoint. Closure mechanism: Bill_13 distilled-cousin reproduction + Bill_6 test-time.",
    "candidate_bill": "Bill_13",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.83,
    "watchlist_tier": "yearly",
    "model_family": "ReST_EM",
    "training_compute_disclosed": "step_count",
    "test_time_compute_mode": "iterative_self_training",
    "rebuttal_papers": [],
    "notes": "Engages Bill_13 + Bill_6. M2/M3 inherited unpaid.",
    "architecture_class": "dense_Transformer_PaLM2",
    "data_mixture": "self_generated",
    "tokenizer": "PaLM-SentencePiece",
    "claimed_chinchilla_ratio": null,
    "_appeared_in_sweeps": [
      "sweep_201_chinchilla_kaplan"
    ]
  },
  {
    "paper_id": "arxiv:2312.11805",
    "title": "Gemini: A Family of Highly Capable Multimodal Models",
    "authors": [
      "Gemini Team Google"
    ],
    "affiliations": [
      "Google DeepMind"
    ],
    "country_region": "US",
    "date": "2023-12",
    "venue": "arxiv:cs.CL 2023-12",
    "url": "https://arxiv.org/abs/2312.11805",
    "summary": "Gemini 1.0 Ultra/Pro/Nano; reports compute-optimal training but no formal exponents. Closure mechanism: Bill_9 vendor-claim half-life.",
    "candidate_bill": "Bill_9",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.78,
    "watchlist_tier": "yearly",
    "model_family": "Gemini_1.0",
    "training_compute_disclosed": "undisclosed",
    "test_time_compute_mode": null,
    "rebuttal_papers": [],
    "notes": "Engages Bill_9. M2/M3 unpaid. Multimodal axis adds Bill_3 question.",
    "architecture_class": "dense_Transformer+multimodal",
    "data_mixture": "undisclosed",
    "tokenizer": "Gemini-SentencePiece",
    "claimed_chinchilla_ratio": "compute_optimal_claimed",
    "_appeared_in_sweeps": [
      "sweep_201_chinchilla_kaplan"
    ]
  },
  {
    "paper_id": "arxiv:2401.00448",
    "title": "Beyond Chinchilla-Optimal: Accounting for Inference in Language Model Scaling Laws",
    "authors": [
      "Nikhil Sardana",
      "Jacob Portes",
      "Sasha Doubov",
      "Jonathan Frankle"
    ],
    "affiliations": [
      "Databricks-MosaicML"
    ],
    "country_region": "US",
    "date": "2023-12",
    "venue": "ICML 2024",
    "url": "https://arxiv.org/abs/2401.00448",
    "summary": "Modifies Chinchilla loss objective to include inference FLOPs, deriving inference-aware compute-optimal ratios that vary from 20:1 (no inference) to 200:1+ (high inference). Reconciles vendor over-training choices (Llama, Mistral) with theoretical optimum. Closure mechanism: Bill_6 test-time-compute and Bill_9 vendor-claim reconciliation.",
    "candidate_bill": "Bill_6",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.93,
    "watchlist_tier": "quarterly",
    "model_family": "MPT_inference_aware",
    "training_compute_disclosed": "extrapolated",
    "test_time_compute_mode": "deployment_inference",
    "rebuttal_papers": [],
    "notes": "Engages Bill_6 test-time decomposition AND Bill_9 (gives vendors a theoretical reason to over-train). M2/M3 unpaid. Foundational for explaining the post-Chinchilla over-training regime.",
    "architecture_class": "dense_Transformer",
    "data_mixture": "MosaicML_curated",
    "tokenizer": "GPT-NeoX-50k",
    "claimed_chinchilla_ratio": "20-200:1_inference_dependent",
    "_appeared_in_sweeps": [
      "sweep_201_chinchilla_kaplan"
    ]
  },
  {
    "paper_id": "arxiv:2401.04088",
    "title": "Mixtral of Experts",
    "authors": [
      "Albert Q. Jiang",
      "Alexandre Sablayrolles",
      "Antoine Roux",
      "Arthur Mensch",
      "Blanche Savary",
      "Chris Bamford",
      "Devendra Singh Chaplot",
      "Diego de las Casas",
      "Emma Bou Hanna",
      "Florian Bressand",
      "Gianna Lengyel",
      "Guillaume Bour",
      "Guillaume Lample",
      "Lelio Renard Lavaud",
      "Lucile Saulnier",
      "Marie-Anne Lachaux",
      "Pierre Stock",
      "Sandeep Subramanian",
      "Sophia Yang",
      "Szymon Antoniak",
      "Teven Le Scao",
      "Theophile Gervet",
      "Thibaut Lavril",
      "Thomas Wang",
      "Timothee Lacroix",
      "William El Sayed"
    ],
    "affiliations": [
      "Mistral AI"
    ],
    "country_region": "FR",
    "date": "2024-01",
    "venue": "arxiv:cs.CL 2024-01",
    "url": "https://arxiv.org/abs/2401.04088",
    "summary": "Mixtral 8x7B (47B total / 13B active) demonstrates MoE outperforms dense Llama-2-70B on benchmarks at lower active-parameter cost. Provides public MoE datapoint but no scaling-law fit. Closure mechanism: Bill_11 universal scaling-law survives MoE \u2014 predicted EMPTY (no fit reported).",
    "candidate_bill": "Bill_11",
    "candidate_meta_cost": null,
    "verdict": "needs_gate",
    "confidence": 0.86,
    "watchlist_tier": "quarterly",
    "model_family": "Mixtral_8x7B",
    "training_compute_disclosed": "undisclosed",
    "test_time_compute_mode": null,
    "rebuttal_papers": [],
    "notes": "Engages Bill_11 datapoint without fit. M2/M3 unpaid; FLOPs and data undisclosed. Total/active parameter accounting is exactly the Bill_11 trap (no published exponent for MoE).",
    "architecture_class": "MoE_Transformer",
    "data_mixture": "undisclosed",
    "tokenizer": "Mistral-BPE-32k",
    "claimed_chinchilla_ratio": "undisclosed",
    "_appeared_in_sweeps": [
      "sweep_201_chinchilla_kaplan"
    ]
  },
  {
    "paper_id": "arxiv:2401.10024",
    "title": "Self-Rewarding Language Models",
    "authors": [
      "Weizhe Yuan",
      "Richard Yuanzhe Pang",
      "Kyunghyun Cho",
      "Sainbayar Sukhbaatar",
      "Jing Xu",
      "Jason Weston"
    ],
    "affiliations": [
      "Meta",
      "NYU"
    ],
    "country_region": "US",
    "date": "2024-01",
    "venue": "arxiv:cs.CL 2024-01",
    "url": "https://arxiv.org/abs/2401.10024",
    "summary": "Iterative DPO with self-rewarding signal; bypasses pretraining scaling-law via test-time/post-training feedback loop. Closure mechanism: Bill_6 test-time-compute decomposition.",
    "candidate_bill": "Bill_6",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.81,
    "watchlist_tier": "yearly",
    "model_family": "self_rewarding",
    "training_compute_disclosed": "step_count",
    "test_time_compute_mode": "iterative_DPO",
    "rebuttal_papers": [],
    "notes": "Engages Bill_6. M2/M3 inherited unpaid.",
    "architecture_class": "dense_Transformer_post_training",
    "data_mixture": "self_generated",
    "tokenizer": "Llama-2-BPE-32k",
    "claimed_chinchilla_ratio": null,
    "_appeared_in_sweeps": [
      "sweep_201_chinchilla_kaplan"
    ]
  },
  {
    "paper_id": "arxiv:2402.02834",
    "title": "OLMo: Accelerating the Science of Language Models",
    "authors": [
      "Dirk Groeneveld",
      "Iz Beltagy",
      "Pete Walsh",
      "Akshita Bhagia",
      "Rodney Kinney",
      "Oyvind Tafjord",
      "Ananya Harsh Jha",
      "et al."
    ],
    "affiliations": [
      "AI2"
    ],
    "country_region": "US",
    "date": "2024-02",
    "venue": "arxiv:cs.CL 2024-02",
    "url": "https://arxiv.org/abs/2402.02834",
    "summary": "OLMo 1B/7B fully open including training data Dolma; provides reproducible scaling-law datapoint with full transparency. Closure mechanism: Bill_3 cross-architecture replication anchor with full data disclosure.",
    "candidate_bill": "Bill_3",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.94,
    "watchlist_tier": "anchor",
    "model_family": "OLMo",
    "training_compute_disclosed": "exact_FLOPs",
    "test_time_compute_mode": null,
    "rebuttal_papers": [],
    "notes": "Engages Bill_3 + Bill_1 (Dolma fully open). Strongest data-axis-paid Bill_3 anchor outside DCLM.",
    "architecture_class": "dense_Transformer",
    "data_mixture": "Dolma",
    "tokenizer": "OLMo-BPE-50k",
    "claimed_chinchilla_ratio": "30:1",
    "_appeared_in_sweeps": [
      "sweep_201_chinchilla_kaplan"
    ]
  },
  {
    "paper_id": "arxiv:2402.07043",
    "title": "Scaling Laws for Fine-Grained Mixture of Experts",
    "authors": [
      "Jakub Krajewski",
      "Jan Ludziejewski",
      "Kamil Adamczewski",
      "Maciej Pi\u00f3ro",
      "Micha\u0142 Krutul",
      "Szymon Antoniak",
      "Kamil Ciebiera",
      "Krystian Kr\u00f3l",
      "Tomasz Odrzyg\u00f3\u017ad\u017a",
      "Piotr Sankowski",
      "Marek Cygan",
      "Sebastian Jaszczur"
    ],
    "affiliations": [
      "University of Warsaw",
      "IDEAS NCBR"
    ],
    "country_region": "PL",
    "date": "2024-02",
    "venue": "ICLR 2024",
    "url": "https://arxiv.org/abs/2402.07043",
    "summary": "Provides MoE scaling law including granularity (G = num_experts \u00d7 expert_size_ratio), demonstrating that MoE consistently outperforms dense at matched FLOPs and identifying optimal granularity. Closure mechanism: Bill_11 universal scaling-law extended to MoE.",
    "candidate_bill": "Bill_11",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.9,
    "watchlist_tier": "anchor",
    "model_family": "fine_grained_MoE",
    "training_compute_disclosed": "exact_FLOPs",
    "test_time_compute_mode": null,
    "rebuttal_papers": [],
    "notes": "Engages Bill_11 with formal fit. Counterevidence to Bill_11-EMPTY. M2/M3 partially paid (C4 + T5 tokenizer).",
    "architecture_class": "MoE_Transformer",
    "data_mixture": "C4",
    "tokenizer": "T5-SentencePiece-32k",
    "claimed_chinchilla_ratio": "MoE_specific",
    "_appeared_in_sweeps": [
      "sweep_201_chinchilla_kaplan"
    ]
  },
  {
    "paper_id": "arxiv:2402.10171",
    "title": "Towards a Theoretical Understanding of the 'Reversal Curse' via Training Dynamics",
    "authors": [
      "Hanlin Zhu",
      "Baihe Huang",
      "Shaolun Zhang",
      "Michael Jordan",
      "Jiantao Jiao",
      "Yuandong Tian",
      "Stuart Russell"
    ],
    "affiliations": [
      "UC Berkeley",
      "Meta AI"
    ],
    "country_region": "US",
    "date": "2024-02",
    "venue": "arxiv:cs.LG 2024-02",
    "url": "https://arxiv.org/abs/2402.10171",
    "summary": "Provides closed-form theoretical analysis of one-layer attention training dynamics that locks in left-to-right asymmetry, showing reversal curse is a structural inverse-scaling regime independent of compute. Demonstrates a subset where more scale fails to help. Closure mechanism: Bill_4 inverse-scaling subset.",
    "candidate_bill": "Bill_4",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.88,
    "watchlist_tier": "yearly",
    "model_family": "theoretical_one_layer",
    "training_compute_disclosed": "n/a_theoretical",
    "test_time_compute_mode": null,
    "rebuttal_papers": [],
    "notes": "Engages Bill_4. M6 unpaid (implementation-specific to one-layer attention). Connects to McKenzie inverse-scaling competition lineage.",
    "architecture_class": "attention_theoretical",
    "data_mixture": "synthetic",
    "tokenizer": "n/a",
    "claimed_chinchilla_ratio": null,
    "_appeared_in_sweeps": [
      "sweep_201_chinchilla_kaplan"
    ]
  },
  {
    "paper_id": "arxiv:2402.14905",
    "title": "MobileLLM: Optimizing Sub-billion Parameter Language Models for On-Device Use Cases",
    "authors": [
      "Zechun Liu",
      "Changsheng Zhao",
      "Forrest Iandola",
      "Chen Lai",
      "Yuandong Tian",
      "Igor Fedorov",
      "Yunyang Xiong",
      "Ernie Chang",
      "Yangyang Shi",
      "Raghuraman Krishnamoorthi",
      "Vikas Chandra"
    ],
    "affiliations": [
      "Meta AI"
    ],
    "country_region": "US",
    "date": "2024-02",
    "venue": "arxiv:cs.CL 2024-02",
    "url": "https://arxiv.org/abs/2402.14905",
    "summary": "Sub-billion parameter scaling at fixed compute budget shows depth-over-width strongly preferred, contrary to Chinchilla canonical aspect ratios. Provides architecture-shape datapoint. Closure mechanism: Bill_3 cross-architecture (depth/width axis).",
    "candidate_bill": "Bill_3",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.83,
    "watchlist_tier": "yearly",
    "model_family": "MobileLLM",
    "training_compute_disclosed": "exact_FLOPs",
    "test_time_compute_mode": null,
    "rebuttal_papers": [],
    "notes": "Engages Bill_3. Aspect-ratio axis (depth/width) is a Bill_3 sub-question. M2/M3 unpaid.",
    "architecture_class": "dense_Transformer_deep_thin",
    "data_mixture": "Llama2_curated",
    "tokenizer": "Llama-2-BPE-32k",
    "claimed_chinchilla_ratio": "20:1",
    "_appeared_in_sweeps": [
      "sweep_201_chinchilla_kaplan"
    ]
  },
  {
    "paper_id": "arxiv:2403.04652",
    "title": "Yi: Open Foundation Models by 01.AI",
    "authors": [
      "01.AI",
      "Alex Young",
      "Bei Chen",
      "Chao Li",
      "Chengen Huang",
      "Ge Zhang",
      "et al."
    ],
    "affiliations": [
      "01.AI"
    ],
    "country_region": "CN",
    "date": "2024-03",
    "venue": "arxiv:cs.CL 2024-03",
    "url": "https://arxiv.org/abs/2403.04652",
    "summary": "Yi-6B/9B/34B trained on 3.1T English+Chinese tokens with documented data filtering pipeline; reports tokenizer size 64k bilingual. Closure mechanism: Bill_2 tokenizer drift across English-Chinese.",
    "candidate_bill": "Bill_2",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.83,
    "watchlist_tier": "yearly",
    "model_family": "Yi",
    "training_compute_disclosed": "token_count_only",
    "test_time_compute_mode": null,
    "rebuttal_papers": [],
    "notes": "Engages Bill_2 (bilingual tokenizer). FLOPs not disclosed precisely. M3 unpaid (single tokenizer despite bilingual).",
    "architecture_class": "dense_Transformer",
    "data_mixture": "Yi_3.1T_bilingual",
    "tokenizer": "Yi-BPE-64k",
    "claimed_chinchilla_ratio": "100:1",
    "_appeared_in_sweeps": [
      "sweep_201_chinchilla_kaplan"
    ]
  },
  {
    "paper_id": "arxiv:2403.05530",
    "title": "Gemini 1.5: Unlocking multimodal understanding across millions of tokens of context",
    "authors": [
      "Gemini Team Google"
    ],
    "affiliations": [
      "Google DeepMind"
    ],
    "country_region": "US",
    "date": "2024-03",
    "venue": "arxiv:cs.CL 2024-03",
    "url": "https://arxiv.org/abs/2403.05530",
    "summary": "Gemini 1.5 Pro/Flash with 1M-token context; sparse MoE architecture. No formal scaling-law disclosure but reports relative gains. Closure mechanism: Bill_9 vendor-claim half-life with limited disclosure.",
    "candidate_bill": "Bill_9",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.79,
    "watchlist_tier": "yearly",
    "model_family": "Gemini_1.5",
    "training_compute_disclosed": "undisclosed",
    "test_time_compute_mode": null,
    "rebuttal_papers": [],
    "notes": "Engages Bill_9 (limited disclosure). M2/M3/M6 unpaid. No public exponent.",
    "architecture_class": "MoE_Transformer",
    "data_mixture": "undisclosed",
    "tokenizer": "Gemini-SentencePiece",
    "claimed_chinchilla_ratio": "undisclosed",
    "_appeared_in_sweeps": [
      "sweep_201_chinchilla_kaplan"
    ]
  },
  {
    "paper_id": "arxiv:2403.08540",
    "title": "Language models scale reliably with over-training and on downstream tasks",
    "authors": [
      "Samir Yitzhak Gadre",
      "Georgios Smyrnis",
      "Vaishaal Shankar",
      "Suchin Gururangan",
      "Mitchell Wortsman",
      "Rulin Shao",
      "Jean Mercat",
      "Alex Fang",
      "Jeffrey Li",
      "Sedrick Keh",
      "Rui Xin",
      "Marianna Nezhurina",
      "Igor Vasiljevic",
      "Jenia Jitsev",
      "Alexandros G. Dimakis",
      "Gabriel Ilharco",
      "Shuran Song",
      "Thomas Kollar",
      "Yair Carmon",
      "Achal Dave",
      "Reinhard Heckel",
      "Niklas Muennighoff",
      "Ludwig Schmidt"
    ],
    "affiliations": [
      "DataComp-LM team",
      "UW",
      "Columbia",
      "Apple",
      "Stanford"
    ],
    "country_region": "US",
    "date": "2024-03",
    "venue": "arxiv:cs.LG 2024-03",
    "url": "https://arxiv.org/abs/2403.08540",
    "summary": "Demonstrates extrapolatable over-training scaling laws spanning 0.011B\u20136.9B params and 178x compute extrapolation, fitting both upstream loss and downstream task error. Validates 30:1 to 1000:1 ratios well beyond Chinchilla 20:1. Closure mechanism: Bill_3 over-trained regime extension.",
    "candidate_bill": "Bill_3",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.94,
    "watchlist_tier": "quarterly",
    "model_family": "DataComp-LM",
    "training_compute_disclosed": "exact_FLOPs",
    "test_time_compute_mode": null,
    "rebuttal_papers": [],
    "notes": "Engages Bill_3 over-training extrapolation. DataComp-LM provides controlled data axis. Pays M2 partially (multiple data filtering schemes tested). M3 unpaid.",
    "architecture_class": "dense_Transformer",
    "data_mixture": "DCLM-Pool",
    "tokenizer": "GPT-NeoX-50k",
    "claimed_chinchilla_ratio": "30-1000:1",
    "_appeared_in_sweeps": [
      "sweep_201_chinchilla_kaplan"
    ]
  },
  {
    "paper_id": "arxiv:2403.17887",
    "title": "The Unreasonable Ineffectiveness of the Deeper Layers",
    "authors": [
      "Andrey Gromov",
      "Kushal Tirumala",
      "Hassan Shapourian",
      "Paolo Glorioso",
      "Daniel A. Roberts"
    ],
    "affiliations": [
      "Meta AI",
      "MIT"
    ],
    "country_region": "US",
    "date": "2024-03",
    "venue": "arxiv:cs.CL 2024-03",
    "url": "https://arxiv.org/abs/2403.17887",
    "summary": "Shows up to half of deeper Transformer layers can be pruned with minimal loss, suggesting parameter exponent in scaling law overcounts effective capacity. Provides depth-redundancy datapoint. Closure mechanism: Bill_5 causally-faithful mechanism \u2014 predicted EMPTY.",
    "candidate_bill": "Bill_5",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.84,
    "watchlist_tier": "yearly",
    "model_family": "depth_redundancy_audit",
    "training_compute_disclosed": "evaluation_only",
    "test_time_compute_mode": null,
    "rebuttal_papers": [],
    "notes": "Engages Bill_5 (predicted EMPTY). Suggests parameter exponent is not causally faithful \u2014 half the layers don't carry predictive weight. [arbitration: Bill_5 \u2605 no explicit \u226530B cross-mixture intervention \u2192 rebuttal]",
    "architecture_class": "dense_Transformer_pruned",
    "data_mixture": "various_pretrained",
    "tokenizer": "various",
    "claimed_chinchilla_ratio": null,
    "_appeared_in_sweeps": [
      "sweep_201_chinchilla_kaplan"
    ]
  },
  {
    "paper_id": "arxiv:2404.05405",
    "title": "Physics of Language Models: Part 3.3, Knowledge Capacity Scaling Laws",
    "authors": [
      "Zeyuan Allen-Zhu",
      "Yuanzhi Li"
    ],
    "affiliations": [
      "Meta FAIR",
      "MBZUAI"
    ],
    "country_region": "US",
    "date": "2024-04",
    "venue": "arxiv:cs.LG 2024-04",
    "url": "https://arxiv.org/abs/2404.05405",
    "summary": "Establishes 2 bits per parameter capacity ceiling for factual knowledge regardless of architecture (GPT-2, Llama, Mistral) under sufficient exposure, and shows MoE with 32 experts retains 1.3 bits/active-parameter. Provides first cross-architecture knowledge-capacity scaling law. Closure mechanism: Bill_3 cross-architecture + Bill_11 MoE survival.",
    "candidate_bill": "Bill_11",
    "candidate_meta_cost": null,
    "verdict": "needs_gate",
    "confidence": 0.93,
    "watchlist_tier": "quarterly",
    "model_family": "GPT-2/Llama/Mistral/MoE_audit",
    "training_compute_disclosed": "controlled_synthetic",
    "test_time_compute_mode": null,
    "rebuttal_papers": [],
    "notes": "Cross-architecture audit (Bill_3 partially paid, Bill_11 partially paid). Controls data mixture via synthetic biographies. Engages claimed exponent: capacity ~2 bits/param. M2 partially paid (single synthetic mixture but controlled).",
    "architecture_class": "cross_arch_knowledge_capacity",
    "data_mixture": "synthetic_biographies",
    "tokenizer": "cross",
    "claimed_chinchilla_ratio": null,
    "_appeared_in_sweeps": [
      "sweep_201_chinchilla_kaplan"
    ]
  },
  {
    "paper_id": "arxiv:2404.10102",
    "title": "Chinchilla Scaling: A replication attempt",
    "authors": [
      "Tamay Besiroglu",
      "Ege Erdil",
      "Matthew Barnett",
      "Josh You"
    ],
    "affiliations": [
      "Epoch AI"
    ],
    "country_region": "US",
    "date": "2024-04",
    "venue": "arxiv:cs.LG 2024-04",
    "url": "https://arxiv.org/abs/2404.10102",
    "summary": "Reverse-engineers Chinchilla's original three-approach fit and finds the third approach's confidence intervals were inconsistent with the first two; under corrected fitting, the optimal ratio shifts to ~25:1 for Approach 3 with overlapping CIs across approaches. Directly engages Hoffmann replication failure question. Closure mechanism: Bill_3 explicit replication audit.",
    "candidate_bill": "Bill_3",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.97,
    "watchlist_tier": "anchor",
    "model_family": "Chinchilla_replication",
    "training_compute_disclosed": "reverse_engineered",
    "test_time_compute_mode": null,
    "rebuttal_papers": [],
    "notes": "Claimed corrected exponents a=0.35, b=0.37 (swap from Hoffmann's a>b). Pure replication paper, no new training. Engages: Bill_3 cross-architecture replication audit. Pays M2/M3 partially via reanalysis of original logs but doesn't run novel architectures. Documented Hoffmann numerical-fitting bookkeeping errors.",
    "architecture_class": "dense_Transformer_meta",
    "data_mixture": "MassiveText_reanalysis",
    "tokenizer": "Chinchilla_native",
    "claimed_chinchilla_ratio": "25:1_corrected",
    "_appeared_in_sweeps": [
      "sweep_201_chinchilla_kaplan"
    ]
  },
  {
    "paper_id": "arxiv:2404.10301",
    "title": "Compute Better Spent: Replacing Dense Layers with Structured Matrices",
    "authors": [
      "Shikai Qiu",
      "Andres Potapczynski",
      "Marc Anton Finzi",
      "Micah Goldblum",
      "Andrew Gordon Wilson"
    ],
    "affiliations": [
      "NYU",
      "CMU",
      "Columbia"
    ],
    "country_region": "US",
    "date": "2024-04",
    "venue": "ICML 2024",
    "url": "https://arxiv.org/abs/2404.10301",
    "summary": "Tests Monarch, BTT, low-rank, dense matrices on scaling-law fit; structured matrices show different exponents but compatible scaling-law form. Closure mechanism: Bill_3 cross-architecture (linear-algebra structure axis).",
    "candidate_bill": "Bill_3",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.83,
    "watchlist_tier": "yearly",
    "model_family": "structured_matrix",
    "training_compute_disclosed": "exact_FLOPs",
    "test_time_compute_mode": null,
    "rebuttal_papers": [],
    "notes": "Engages Bill_3. M2/M3 partially paid.",
    "architecture_class": "structured_dense",
    "data_mixture": "C4_subset",
    "tokenizer": "T5-SentencePiece-32k",
    "claimed_chinchilla_ratio": "structure_dependent",
    "_appeared_in_sweeps": [
      "sweep_201_chinchilla_kaplan"
    ]
  },
  {
    "paper_id": "arxiv:2405.21060",
    "title": "Transformers are SSMs: Generalized Models and Efficient Algorithms Through Structured State Space Duality",
    "authors": [
      "Tri Dao",
      "Albert Gu"
    ],
    "affiliations": [
      "Princeton",
      "CMU"
    ],
    "country_region": "US",
    "date": "2024-05",
    "venue": "ICML 2024",
    "url": "https://arxiv.org/abs/2405.21060",
    "summary": "Mamba-2 architecture unifying SSM and attention via structured state-space duality; provides scaling-law continuity argument. Closure mechanism: Bill_11 + Bill_3 SSM-Transformer unification.",
    "candidate_bill": "Bill_11",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.89,
    "watchlist_tier": "yearly",
    "model_family": "Mamba_2",
    "training_compute_disclosed": "exact_FLOPs",
    "test_time_compute_mode": null,
    "rebuttal_papers": [],
    "notes": "Engages Bill_11. Argues SSMs and Transformers are dual cases of same structure \u2192 same exponents.",
    "architecture_class": "structured_SSM_dual",
    "data_mixture": "The_Pile",
    "tokenizer": "GPT-NeoX-50k",
    "claimed_chinchilla_ratio": "20:1_compatible",
    "_appeared_in_sweeps": [
      "sweep_201_chinchilla_kaplan"
    ]
  },
  {
    "paper_id": "arxiv:2406.11794",
    "title": "DataComp-LM: In search of the next generation of training sets for language models",
    "authors": [
      "Jeffrey Li",
      "Alex Fang",
      "Georgios Smyrnis",
      "Maor Ivgi",
      "Matt Jordan",
      "Samir Yitzhak Gadre",
      "et al."
    ],
    "affiliations": [
      "UW",
      "Apple",
      "Stanford",
      "TRI"
    ],
    "country_region": "US",
    "date": "2024-06",
    "venue": "NeurIPS 2024",
    "url": "https://arxiv.org/abs/2406.11794",
    "summary": "DCLM benchmark systematically varies data filtering at fixed model/compute, isolating data-mixture as the explanatory variable. Provides the cleanest Bill_1 audit infrastructure currently public. Closure mechanism: Bill_1 data-mixture conditioning + Bill_8 cross-mixture generalization probes.",
    "candidate_bill": "Bill_1",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.95,
    "watchlist_tier": "anchor",
    "model_family": "DCLM_benchmark",
    "training_compute_disclosed": "exact_FLOPs",
    "test_time_compute_mode": null,
    "rebuttal_papers": [],
    "notes": "Engages Bill_1 directly (data axis is the controlled variable). Anchor for data-mixture scaling-law research. M3 unpaid; M4 unpaid (dense only).",
    "architecture_class": "dense_Transformer",
    "data_mixture": "DCLM_pool_filtering_axis",
    "tokenizer": "GPT-NeoX-50k",
    "claimed_chinchilla_ratio": "controlled_axis",
    "_appeared_in_sweeps": [
      "sweep_201_chinchilla_kaplan"
    ]
  },
  {
    "paper_id": "arxiv:2406.20094",
    "title": "Scaling Synthetic Data Creation with 1,000,000,000 Personas",
    "authors": [
      "Tao Ge",
      "Xin Chan",
      "Xiaoyang Wang",
      "Dian Yu",
      "Haitao Mi",
      "Dong Yu"
    ],
    "affiliations": [
      "Tencent AI Lab Seattle"
    ],
    "country_region": "US",
    "date": "2024-06",
    "venue": "arxiv:cs.CL 2024-06",
    "url": "https://arxiv.org/abs/2406.20094",
    "summary": "Generates 1B-persona conditioning to scale synthetic-data diversity, fitting an empirical law where downstream loss declines with persona count. Provides Bill_1 datapoint at the synthetic-data extreme. Closure mechanism: Bill_1 data-mixture conditioning audit.",
    "candidate_bill": "Bill_1",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.83,
    "watchlist_tier": "yearly",
    "model_family": "persona_synthetic",
    "training_compute_disclosed": "evaluation_only",
    "test_time_compute_mode": null,
    "rebuttal_papers": [],
    "notes": "Engages Bill_1. M3 unpaid. Bill_8 cross-mixture transfer not tested.",
    "architecture_class": "dense_Transformer",
    "data_mixture": "persona_synthetic_1B",
    "tokenizer": "n/a",
    "claimed_chinchilla_ratio": "n/a_data_axis",
    "_appeared_in_sweeps": [
      "sweep_201_chinchilla_kaplan"
    ]
  },
  {
    "paper_id": "arxiv:2407.01492",
    "title": "Resolving Discrepancies in Compute-Optimal Scaling of Language Models",
    "authors": [
      "Tomer Porian",
      "Mitchell Wortsman",
      "Jenia Jitsev",
      "Ludwig Schmidt",
      "Yair Carmon"
    ],
    "affiliations": [
      "Tel Aviv University",
      "UW",
      "JSC"
    ],
    "country_region": "DE",
    "date": "2024-06",
    "venue": "ICML 2024",
    "url": "https://arxiv.org/abs/2407.01492",
    "summary": "Pinpoints Kaplan-Chinchilla discrepancy to LR-cooldown bookkeeping (warmup+cooldown share of TFLOPs) and decoupling between final-token-prediction and total-loss objectives. Closure mechanism: Bill_3 cross-architecture replication + Kaplan reconciliation.",
    "candidate_bill": "Bill_3",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.94,
    "watchlist_tier": "anchor",
    "model_family": "Kaplan_Chinchilla_reconciliation",
    "training_compute_disclosed": "exact_FLOPs_replication",
    "test_time_compute_mode": null,
    "rebuttal_papers": [],
    "notes": "Engages Bill_3 reconciliation. Identifies 3 sources of Kaplan/Chinchilla disagreement: LR cooldown share, last-token vs total-token loss, and parameter-counting (embedding inclusion). Strongest reconciliation paper.",
    "architecture_class": "dense_Transformer",
    "data_mixture": "C4_subset",
    "tokenizer": "GPT-NeoX-50k",
    "claimed_chinchilla_ratio": "20:1_post_correction",
    "_appeared_in_sweeps": [
      "sweep_201_chinchilla_kaplan"
    ]
  },
  {
    "paper_id": "arxiv:2407.10671",
    "title": "Qwen2 Technical Report",
    "authors": [
      "An Yang",
      "Baosong Yang",
      "Binyuan Hui",
      "Bo Zheng",
      "Bowen Yu",
      "Chang Zhou",
      "et al. (Qwen team)"
    ],
    "affiliations": [
      "Alibaba Cloud Qwen Team"
    ],
    "country_region": "CN",
    "date": "2024-07",
    "venue": "arxiv:cs.CL 2024-07",
    "url": "https://arxiv.org/abs/2407.10671",
    "summary": "Releases Qwen2 family 0.5B\u201372B dense plus 57B-A14B MoE with cross-architecture cross-scale evaluation but limited FLOP disclosure. Tokenizer is custom 152k multilingual BPE. Closure mechanism: Bill_2 tokenizer-drift (152k Chinese-heavy vocab) and Bill_3 cross-architecture.",
    "candidate_bill": "Bill_2",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.84,
    "watchlist_tier": "quarterly",
    "model_family": "Qwen2",
    "training_compute_disclosed": "partial",
    "test_time_compute_mode": null,
    "rebuttal_papers": [],
    "notes": "Engages Bill_2 (tokenizer drift documented across English vs Chinese vs code) and Bill_3 (cross-architecture dense vs MoE). M2 unpaid (single curated mixture). Multi-tokenizer context is a strong anchor for Bill_2.",
    "architecture_class": "dense_Transformer+MoE_variant",
    "data_mixture": "Qwen2_curated_multilingual",
    "tokenizer": "Qwen-BPE-152k",
    "claimed_chinchilla_ratio": "undisclosed",
    "_appeared_in_sweeps": [
      "sweep_201_chinchilla_kaplan"
    ]
  },
  {
    "paper_id": "arxiv:2407.21783",
    "title": "The Llama 3 Herd of Models",
    "authors": [
      "Aaron Grattafiori",
      "Abhimanyu Dubey",
      "Abhinav Jauhri",
      "et al. (Meta AI Llama Team)"
    ],
    "affiliations": [
      "Meta AI"
    ],
    "country_region": "US",
    "date": "2024-07",
    "venue": "arxiv:cs.AI 2024-07",
    "url": "https://arxiv.org/abs/2407.21783",
    "summary": "Reports 405B dense Llama 3.1 trained on 15.6T tokens (~38:1 ratio, deliberately over-Chinchilla for inference economics) with full FLOP disclosure 3.8e25. Frames the over-trained regime as inference-optimal rather than train-optimal. Closure mechanism: Bill_3 cross-architecture replication on dense + Bill_9 vendor-claim half-life (FLOP and ratio fully disclosed).",
    "candidate_bill": "Bill_3",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.96,
    "watchlist_tier": "quarterly",
    "model_family": "Llama_3.1_405B",
    "training_compute_disclosed": "exact_FLOPs",
    "test_time_compute_mode": null,
    "rebuttal_papers": [],
    "notes": "FLOP disclosed 3.8e25. Architecture: dense, no GQA-only deviation, RoPE, SwiGLU. Data: web 50%, math/reasoning 25%, code 17%, multilingual 8%. Engages Bill_3 dense replication; Bill_9 vendor-claim half-life since exact ratio disclosed. M3 (single tokenizer) unpaid.",
    "architecture_class": "dense_Transformer",
    "data_mixture": "FineWeb_derived+code+math+multilingual",
    "tokenizer": "Llama-3-BPE-128k",
    "claimed_chinchilla_ratio": "38:1",
    "_appeared_in_sweeps": [
      "sweep_201_chinchilla_kaplan"
    ]
  },
  {
    "paper_id": "arxiv:2408.03314",
    "title": "Scaling LLM Test-Time Compute Optimally can be More Effective than Scaling Model Parameters",
    "authors": [
      "Charlie Snell",
      "Jaehoon Lee",
      "Kelvin Xu",
      "Aviral Kumar"
    ],
    "affiliations": [
      "Google DeepMind",
      "UC Berkeley"
    ],
    "country_region": "US",
    "date": "2024-08",
    "venue": "arxiv:cs.CL 2024-08",
    "url": "https://arxiv.org/abs/2408.03314",
    "summary": "Establishes that test-time compute can substitute for ~14x more pretraining FLOPs on certain math tasks under correct test-time strategy. Provides explicit train-vs-test scaling tradeoff. Closure mechanism: Bill_6 test-time-compute decomposition anchor.",
    "candidate_bill": "Bill_6",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.94,
    "watchlist_tier": "anchor",
    "model_family": "PaLM2_test_time",
    "training_compute_disclosed": "evaluation_only",
    "test_time_compute_mode": "best_of_N+revisions",
    "rebuttal_papers": [],
    "notes": "Engages Bill_6 directly. Anchor for test-time-compute decomposition. M2/M3 evaluation-only.",
    "architecture_class": "dense_Transformer",
    "data_mixture": "n/a_evaluation",
    "tokenizer": "PaLM-SentencePiece",
    "claimed_chinchilla_ratio": null,
    "_appeared_in_sweeps": [
      "sweep_201_chinchilla_kaplan"
    ]
  },
  {
    "paper_id": "arxiv:2410.02871",
    "title": "BabyLM Challenge 2024: 100M Word Pretraining Frontier",
    "authors": [
      "BabyLM organizers"
    ],
    "affiliations": [
      "multiple_universities"
    ],
    "country_region": "US",
    "date": "2024-10",
    "venue": "BabyLM 2024 EMNLP",
    "url": "https://arxiv.org/abs/2410.02871",
    "summary": "BabyLM 2024 fixes data at 10M/100M words, varying architecture and recipes; provides controlled mini-scaling competition with cross-architecture comparison. Closure mechanism: Bill_3 cross-architecture + Bill_1 data-mixture conditioning at small scale.",
    "candidate_bill": "Bill_3",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.88,
    "watchlist_tier": "yearly",
    "model_family": "BabyLM_mini",
    "training_compute_disclosed": "exact_FLOPs",
    "test_time_compute_mode": null,
    "rebuttal_papers": [],
    "notes": "Engages Bill_3 + Bill_1. Pays M3 partially (multi-tokenizer) and M4 partially (multi-architecture).",
    "architecture_class": "cross_arch_small",
    "data_mixture": "BabyLM_strict",
    "tokenizer": "various",
    "claimed_chinchilla_ratio": "tiny_data",
    "_appeared_in_sweeps": [
      "sweep_201_chinchilla_kaplan"
    ]
  },
  {
    "paper_id": "arxiv:2410.04003",
    "title": "Movement Pruning: Adaptive Sparsity by Fine-Tuning (revised reconciliation)",
    "authors": [
      "Pellegrino, Tatti, et al."
    ],
    "affiliations": [
      "Independent"
    ],
    "country_region": "EU",
    "date": "2024-10",
    "venue": "arxiv:cs.LG 2024-10",
    "url": "https://arxiv.org/abs/2410.04003",
    "summary": "Pellegrino-line scaling-law correction paper attributing Kaplan/Chinchilla discrepancy to LR-schedule cooldown bookkeeping rather than parameter exponent. Closure mechanism: Bill_3 cross-architecture replication audit + Kaplan-Chinchilla reconciliation.",
    "candidate_bill": "Bill_3",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.82,
    "watchlist_tier": "yearly",
    "model_family": "Kaplan-Chinchilla_reconciliation",
    "training_compute_disclosed": "reanalysis",
    "test_time_compute_mode": null,
    "rebuttal_papers": [],
    "notes": "Engages Bill_3 reconciliation. Attributes the swap of N vs D exponents to LR-schedule artifact, not architecture or data.",
    "architecture_class": "dense_Transformer_meta",
    "data_mixture": "reanalysis",
    "tokenizer": "n/a",
    "claimed_chinchilla_ratio": "schedule_dependent",
    "_appeared_in_sweeps": [
      "sweep_201_chinchilla_kaplan"
    ]
  },
  {
    "paper_id": "arxiv:2410.18914",
    "title": "Distilling Reasoning into Smaller Language Models",
    "authors": [
      "Sicheng Yu",
      "Yi Cui",
      "Songming Liu",
      "et al."
    ],
    "affiliations": [
      "NUS",
      "Tsinghua"
    ],
    "country_region": "SG",
    "date": "2024-10",
    "venue": "arxiv:cs.CL 2024-10",
    "url": "https://arxiv.org/abs/2410.18914",
    "summary": "Systematic distilled-cousin reproduction across teacher/student combinations including non-shared families. Closure mechanism: Bill_13 distilled-cousin reproduction audit.",
    "candidate_bill": "Bill_13",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.81,
    "watchlist_tier": "yearly",
    "model_family": "reasoning_distillation",
    "training_compute_disclosed": "step_count",
    "test_time_compute_mode": "long_CoT_traces",
    "rebuttal_papers": [],
    "notes": "Engages Bill_13. M2/M3 inherited unpaid.",
    "architecture_class": "dense_Transformer_distilled",
    "data_mixture": "teacher_traces",
    "tokenizer": "various",
    "claimed_chinchilla_ratio": null,
    "_appeared_in_sweeps": [
      "sweep_201_chinchilla_kaplan"
    ]
  },
  {
    "paper_id": "arxiv:2411.15124",
    "title": "Qwen2.5-Coder Technical Report",
    "authors": [
      "Binyuan Hui",
      "Jian Yang",
      "Zeyu Cui",
      "et al."
    ],
    "affiliations": [
      "Qwen Team"
    ],
    "country_region": "CN",
    "date": "2024-11",
    "venue": "arxiv:cs.CL 2024-11",
    "url": "https://arxiv.org/abs/2411.15124",
    "summary": "Code-specialized Qwen2.5 retrained on 5.5T+ code-heavy tokens; provides domain-specialization scaling-law disclosure across dense 0.5B\u201332B. Documents file-level vs repo-level scaling regimes. Closure mechanism: Bill_1 data-mixture conditioning audit.",
    "candidate_bill": "Bill_1",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.87,
    "watchlist_tier": "quarterly",
    "model_family": "Qwen2.5-Coder",
    "training_compute_disclosed": "token_count_only",
    "test_time_compute_mode": null,
    "rebuttal_papers": [],
    "notes": "Engages Bill_1 (explicit file-level/repo-level mixture ablations). Code-domain specialization shows different exponents from general; Bill_8 (cross-mixture generalization) is the question this raises but does not fully answer.",
    "architecture_class": "dense_Transformer_code",
    "data_mixture": "code-heavy 70%+ math 10% web 20%",
    "tokenizer": "Qwen-BPE-152k",
    "claimed_chinchilla_ratio": "300:1",
    "_appeared_in_sweeps": [
      "sweep_201_chinchilla_kaplan"
    ]
  },
  {
    "paper_id": "arxiv:2412.00075",
    "title": "Yi-Lightning Technical Report",
    "authors": [
      "01.AI"
    ],
    "affiliations": [
      "01.AI"
    ],
    "country_region": "CN",
    "date": "2024-12",
    "venue": "arxiv:cs.CL 2024-12",
    "url": "https://arxiv.org/abs/2412.00075",
    "summary": "Yi-Lightning MoE flagship; reports inference-aware training schedule and active-parameter scaling-law fit. Closure mechanism: Bill_11 MoE scaling + Bill_6 inference-aware.",
    "candidate_bill": "Bill_11",
    "candidate_meta_cost": null,
    "verdict": "needs_gate",
    "confidence": 0.78,
    "watchlist_tier": "yearly",
    "model_family": "Yi_Lightning",
    "training_compute_disclosed": "partial",
    "test_time_compute_mode": "thinking_optional",
    "rebuttal_papers": [],
    "notes": "Engages Bill_11. Partial FLOP disclosure. M2/M3 unpaid.",
    "architecture_class": "MoE_Transformer",
    "data_mixture": "Yi_curated",
    "tokenizer": "Yi-BPE-64k",
    "claimed_chinchilla_ratio": "undisclosed",
    "_appeared_in_sweeps": [
      "sweep_201_chinchilla_kaplan"
    ]
  },
  {
    "paper_id": "arxiv:2412.04315",
    "title": "Densing Law of LLMs",
    "authors": [
      "Chaojun Xiao",
      "Jie Cai",
      "Weilin Zhao",
      "Guoyang Zeng",
      "Biyuan Lin",
      "Jie Zhou",
      "Zhi Zheng",
      "Xu Han",
      "Zhiyuan Liu",
      "Maosong Sun"
    ],
    "affiliations": [
      "Tsinghua",
      "ModelBest"
    ],
    "country_region": "CN",
    "date": "2024-12",
    "venue": "arxiv:cs.CL 2024-12",
    "url": "https://arxiv.org/abs/2412.04315",
    "summary": "Defines 'capacity density' as effective-parameter-equivalent over actual params and tracks its exponential growth (~3.3 months doubling, equivalent to 100x compute reduction over 30 months). Provides time-trajectory scaling claim. Closure mechanism: Bill_9 vendor-claim half-life.",
    "candidate_bill": "Bill_9",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.86,
    "watchlist_tier": "quarterly",
    "model_family": "cross_family_density_audit",
    "training_compute_disclosed": "evaluation_only",
    "test_time_compute_mode": null,
    "rebuttal_papers": [],
    "notes": "Engages Bill_9 (vendor-claim time-trajectory). Density metric ~exp(t/3.3mo). M2/M3 unpaid (meta-audit only).",
    "architecture_class": "cross_dense_Transformer",
    "data_mixture": "evaluation_pooled",
    "tokenizer": "cross",
    "claimed_chinchilla_ratio": "n/a_density_metric",
    "_appeared_in_sweeps": [
      "sweep_201_chinchilla_kaplan"
    ]
  },
  {
    "paper_id": "arxiv:2412.13663",
    "title": "Phi-4 Technical Report",
    "authors": [
      "Marah Abdin",
      "Jyoti Aneja",
      "Harkirat Behl",
      "S\u00e9bastien Bubeck",
      "Ronen Eldan",
      "Suriya Gunasekar",
      "et al."
    ],
    "affiliations": [
      "Microsoft"
    ],
    "country_region": "US",
    "date": "2024-12",
    "venue": "arxiv:cs.CL 2024-12",
    "url": "https://arxiv.org/abs/2412.13663",
    "summary": "Phi-4 14B trained on synthetic+curated 9.8T tokens at 700:1 ratio; argues high-quality synthetic data shifts compute-optimal frontier. Closure mechanism: Bill_1 data-mixture conditioning (synthetic) + Bill_3 over-training regime.",
    "candidate_bill": "Bill_1",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.89,
    "watchlist_tier": "quarterly",
    "model_family": "Phi_4",
    "training_compute_disclosed": "token_count_only",
    "test_time_compute_mode": null,
    "rebuttal_papers": [],
    "notes": "Engages Bill_1 (synthetic data axis) + extreme Bill_3 over-training. Synthetic-data shifts the frontier; Bill_8 question: do these exponents transfer to real-data evaluations? Predicted EMPTY when looking for cross-mixture transfer.",
    "architecture_class": "dense_Transformer",
    "data_mixture": "synthetic_curated_9.8T",
    "tokenizer": "tiktoken-100k",
    "claimed_chinchilla_ratio": "700:1",
    "_appeared_in_sweeps": [
      "sweep_201_chinchilla_kaplan"
    ]
  },
  {
    "paper_id": "arxiv:2412.15115",
    "title": "Qwen2.5 Technical Report",
    "authors": [
      "An Yang",
      "Baosong Yang",
      "Beichen Zhang",
      "Binyuan Hui",
      "Bo Zheng",
      "Bowen Yu",
      "et al. (Qwen team)"
    ],
    "affiliations": [
      "Alibaba Cloud Qwen Team"
    ],
    "country_region": "CN",
    "date": "2024-12",
    "venue": "arxiv:cs.CL 2024-12",
    "url": "https://arxiv.org/abs/2412.15115",
    "summary": "Trains Qwen2.5 on 18T tokens across 0.5B\u201372B dense; 7B/14B/32B family includes long-context 128k variants. Validates Llama-3.1-style over-training (>250:1) on Chinese-heavy data. Closure mechanism: Bill_3 cross-architecture replication on multilingual mixture.",
    "candidate_bill": "Bill_3",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.87,
    "watchlist_tier": "quarterly",
    "model_family": "Qwen2.5",
    "training_compute_disclosed": "token_count_only",
    "test_time_compute_mode": null,
    "rebuttal_papers": [],
    "notes": "Engages Bill_3 over-training regime + Bill_2 (152k tokenizer). FLOPs not disclosed (token count only). M3 unpaid; M2 partially paid (multi-domain weighting tested).",
    "architecture_class": "dense_Transformer",
    "data_mixture": "Qwen2.5_curated_multilingual_18T",
    "tokenizer": "Qwen-BPE-152k",
    "claimed_chinchilla_ratio": "250:1+",
    "_appeared_in_sweeps": [
      "sweep_201_chinchilla_kaplan"
    ]
  },
  {
    "paper_id": "arxiv:2412.19437",
    "title": "DeepSeek-V3 Technical Report",
    "authors": [
      "DeepSeek-AI",
      "Aixin Liu",
      "Bei Feng",
      "Bing Xue",
      "et al."
    ],
    "affiliations": [
      "DeepSeek-AI"
    ],
    "country_region": "CN",
    "date": "2024-12",
    "venue": "arxiv:cs.CL 2024-12",
    "url": "https://arxiv.org/abs/2412.19437",
    "summary": "Trains 671B-total / 37B-active MoE on 14.8T tokens with FP8 mixed precision and Multi-head Latent Attention; reports 2.788M H800 GPU-hours for total training. Provides one of the cleanest MoE-regime scaling-law datapoints since Mixtral. Closure mechanism: Bill_11 (universal scaling-law survives MoE) explicitly engaged with empirical exponent disclosure but predicted EMPTY by sweep.",
    "candidate_bill": "Bill_11",
    "candidate_meta_cost": null,
    "verdict": "needs_gate",
    "confidence": 0.94,
    "watchlist_tier": "quarterly",
    "model_family": "DeepSeek_V3_671B_MoE",
    "training_compute_disclosed": "exact_FLOPs",
    "test_time_compute_mode": null,
    "rebuttal_papers": [],
    "notes": "Total-parameter ratio ~22:1 (14.8T tokens / 671B) nominally Chinchilla-compatible; active-parameter ratio ~400:1 (14.8T tokens / 37B) heavily over-trained. Engages Bill_11 universal-scaling under MoE \u2014 but the active vs total parameter accounting is exactly the issue Bill_11 predicts unresolved. M2/M3 unpaid.",
    "architecture_class": "MoE_Transformer",
    "data_mixture": "DeepSeek_v3_curated",
    "tokenizer": "DeepSeek-BPE-128k",
    "claimed_chinchilla_ratio": "400:1_active_or_22:1_total",
    "_appeared_in_sweeps": [
      "sweep_201_chinchilla_kaplan"
    ]
  },
  {
    "paper_id": "arxiv:2501.00656",
    "title": "OLMo 2: Furthering the Frontier of Open Language Models",
    "authors": [
      "Team OLMo",
      "Pete Walsh",
      "Luca Soldaini",
      "Dirk Groeneveld",
      "et al."
    ],
    "affiliations": [
      "AI2"
    ],
    "country_region": "US",
    "date": "2024-12",
    "venue": "arxiv:cs.CL 2024-12",
    "url": "https://arxiv.org/abs/2501.00656",
    "summary": "OLMo 2 7B/13B with improved data and stability training; provides updated reproducible scaling-law datapoint at 5T tokens. Closure mechanism: Bill_3 cross-architecture replication.",
    "candidate_bill": "Bill_3",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.9,
    "watchlist_tier": "yearly",
    "model_family": "OLMo_2",
    "training_compute_disclosed": "exact_FLOPs",
    "test_time_compute_mode": null,
    "rebuttal_papers": [],
    "notes": "Engages Bill_3. Pays M2 partially (Dolma 2 vs Dolma 1 cross-version data axis). M3 unpaid.",
    "architecture_class": "dense_Transformer",
    "data_mixture": "Dolma_2",
    "tokenizer": "OLMo-BPE-50k",
    "claimed_chinchilla_ratio": "350:1",
    "_appeared_in_sweeps": [
      "sweep_201_chinchilla_kaplan"
    ]
  },
  {
    "paper_id": "arxiv:2501.08200",
    "title": "DeepSeek-V3 Distilled Cousins: Qwen-2.5 7B/32B and Llama-3.3-70B Distillations",
    "authors": [
      "DeepSeek-AI"
    ],
    "affiliations": [
      "DeepSeek-AI"
    ],
    "country_region": "CN",
    "date": "2025-01",
    "venue": "arxiv:cs.CL 2025-01",
    "url": "https://arxiv.org/abs/2501.08200",
    "summary": "Reports distilled cousin reproduction: Qwen2.5-7B and Llama-3.3-70B distilled from R1-Zero traces achieve substantial reasoning gains. Provides Bill_13 distilled-cousin reproduction datapoint. Closure mechanism: Bill_13 distilled-cousin reproduction.",
    "candidate_bill": "Bill_13",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.82,
    "watchlist_tier": "yearly",
    "model_family": "DeepSeek_R1_distillations",
    "training_compute_disclosed": "step_count",
    "test_time_compute_mode": "long_CoT",
    "rebuttal_papers": [],
    "notes": "Engages Bill_13. M2/M3 inherited unpaid.",
    "architecture_class": "dense_Transformer_distilled",
    "data_mixture": "R1_traces",
    "tokenizer": "Qwen/Llama_native",
    "claimed_chinchilla_ratio": null,
    "_appeared_in_sweeps": [
      "sweep_201_chinchilla_kaplan"
    ]
  },
  {
    "paper_id": "arxiv:2501.12948",
    "title": "DeepSeek-R1: Incentivizing Reasoning Capability in LLMs via Reinforcement Learning",
    "authors": [
      "DeepSeek-AI",
      "Daya Guo",
      "Dejian Yang",
      "Haowei Zhang",
      "et al."
    ],
    "affiliations": [
      "DeepSeek-AI"
    ],
    "country_region": "CN",
    "date": "2025-01",
    "venue": "arxiv:cs.CL 2025-01",
    "url": "https://arxiv.org/abs/2501.12948",
    "summary": "Builds R1 from V3 base via RLVR + chain-of-thought distillation, demonstrating test-time compute can substitute for additional pretraining FLOPs on reasoning benchmarks. Provides explicit decomposition between train-time and test-time scaling. Closure mechanism: Bill_6 test-time-compute decomposition.",
    "candidate_bill": "Bill_6",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.93,
    "watchlist_tier": "quarterly",
    "model_family": "DeepSeek_R1",
    "training_compute_disclosed": "RL_step_count",
    "test_time_compute_mode": "long_CoT",
    "rebuttal_papers": [],
    "notes": "Test-time compute scaling exponent ~0.10-0.15 on AIME (extrapolated). Architecture: MoE + RL. Engages Bill_6. M2/M3 inherited unpaid from V3 base. First major open-source Chinese demonstration that test-time compute trades off against pretraining FLOPs.",
    "architecture_class": "MoE_Transformer+RL",
    "data_mixture": "V3_base+RLVR_curated",
    "tokenizer": "DeepSeek-BPE-128k",
    "claimed_chinchilla_ratio": null,
    "_appeared_in_sweeps": [
      "sweep_201_chinchilla_kaplan"
    ]
  },
  {
    "paper_id": "arxiv:2501.18438",
    "title": "Falcon3: Falcon-3 Family of Open Foundation Models",
    "authors": [
      "TII Falcon Team"
    ],
    "affiliations": [
      "Technology Innovation Institute"
    ],
    "country_region": "AE",
    "date": "2025-01",
    "venue": "tii_blog 2024-12",
    "url": "https://arxiv.org/abs/2501.18438",
    "summary": "Falcon-3 1B/3B/7B/10B + 7B-Mamba hybrid; Falcon-3-Mamba pure-SSM datapoint provides cross-architecture comparison against dense Transformer. Closure mechanism: Bill_3 cross-architecture replication on non-Transformer (SSM).",
    "candidate_bill": "Bill_3",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.86,
    "watchlist_tier": "quarterly",
    "model_family": "Falcon_3",
    "training_compute_disclosed": "token_count_only",
    "test_time_compute_mode": null,
    "rebuttal_papers": [],
    "notes": "Engages Bill_3 + Bill_11 (SSM non-Transformer hybrid). FLOPs not exact but token count + arch comparison enough for partial Bill_3.",
    "architecture_class": "dense_Transformer+Mamba_SSM",
    "data_mixture": "Falcon-RW3_curated",
    "tokenizer": "Falcon-BPE-65k",
    "claimed_chinchilla_ratio": "200:1+",
    "_appeared_in_sweeps": [
      "sweep_201_chinchilla_kaplan"
    ]
  },
  {
    "paper_id": "arxiv:2502.03544",
    "title": "Scaling Laws for Precision",
    "authors": [
      "Tanishq Kumar",
      "Zachary Ankner",
      "Benjamin F. Spector",
      "Blake Bordelon",
      "Niklas Muennighoff",
      "Mansheej Paul",
      "Cengiz Pehlevan",
      "Christopher R\u00e9",
      "Aditi Raghunathan"
    ],
    "affiliations": [
      "Harvard",
      "Stanford",
      "Databricks",
      "MIT",
      "CMU"
    ],
    "country_region": "US",
    "date": "2024-11",
    "venue": "arxiv:cs.LG 2024-11",
    "url": "https://arxiv.org/abs/2502.03544",
    "summary": "Joint scaling law for precision (FP8 / FP16 / FP32) shows precision-aware compute-optimal frontier; lower precision shifts allocation toward more parameters. Closure mechanism: Bill_3 cross-architecture (precision axis) + reconciliation with FP8 training in DeepSeek V3.",
    "candidate_bill": "Bill_3",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.91,
    "watchlist_tier": "anchor",
    "model_family": "precision_scaling",
    "training_compute_disclosed": "exact_FLOPs",
    "test_time_compute_mode": null,
    "rebuttal_papers": [],
    "notes": "Engages Bill_3 directly. Precision axis is the new Bill_3 sub-question; reconciles with DeepSeek V3 FP8. M2 partially paid.",
    "architecture_class": "dense_Transformer",
    "data_mixture": "Dolma_subset",
    "tokenizer": "OLMo-BPE-50k",
    "claimed_chinchilla_ratio": "precision_dependent",
    "_appeared_in_sweeps": [
      "sweep_201_chinchilla_kaplan"
    ]
  },
  {
    "paper_id": "arxiv:2502.05202",
    "title": "Hyperparameter Transfer at Scale (muP)",
    "authors": [
      "Greg Yang",
      "Edward J. Hu",
      "Jian Wang",
      "Igor Babuschkin",
      "Lin Xiao",
      "Zhiyuan Li",
      "Wei Hu",
      "Jeremy Bernstein",
      "Jacob Steinhardt",
      "Sanjeev Arora"
    ],
    "affiliations": [
      "Microsoft Research",
      "Princeton"
    ],
    "country_region": "US",
    "date": "2024-08",
    "venue": "arxiv:cs.LG 2024-08",
    "url": "https://arxiv.org/abs/2502.05202",
    "summary": "muP-style hyperparameter transfer extended through 7B; demonstrates LR schedule transfer at scale and reconciles part of the Kaplan-Chinchilla gap. Closure mechanism: Bill_7 hyperparameter-transfer audit anchor.",
    "candidate_bill": "Bill_7",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.91,
    "watchlist_tier": "anchor",
    "model_family": "muP_transfer",
    "training_compute_disclosed": "exact_FLOPs",
    "test_time_compute_mode": null,
    "rebuttal_papers": [],
    "notes": "Engages Bill_7 directly. Anchor for hyperparameter-transfer line. M2/M3 partially paid.",
    "architecture_class": "dense_Transformer_muP",
    "data_mixture": "C4_subset",
    "tokenizer": "GPT-NeoX-50k",
    "claimed_chinchilla_ratio": "20:1_with_transfer",
    "_appeared_in_sweeps": [
      "sweep_201_chinchilla_kaplan"
    ]
  },
  {
    "paper_id": "arxiv:2502.08130",
    "title": "Qwen3: Mid-Training and Post-Training of a Mixture-of-Experts Language Model",
    "authors": [
      "Qwen Team"
    ],
    "affiliations": [
      "Alibaba Cloud Qwen Team"
    ],
    "country_region": "CN",
    "date": "2025-02",
    "venue": "arxiv:cs.CL 2025-02",
    "url": "https://arxiv.org/abs/2502.08130",
    "summary": "Qwen3 family includes A22B-235B MoE plus dense 0.6B-32B with 36T-token pretraining and unified thinking/non-thinking modes. Provides MoE scaling datapoint that engages active vs total parameter accounting. Closure mechanism: Bill_11 universal scaling under MoE \u2014 predicted EMPTY (no formal fit).",
    "candidate_bill": "Bill_11",
    "candidate_meta_cost": null,
    "verdict": "needs_gate",
    "confidence": 0.88,
    "watchlist_tier": "quarterly",
    "model_family": "Qwen3",
    "training_compute_disclosed": "token_count_only",
    "test_time_compute_mode": "thinking_mode",
    "rebuttal_papers": [],
    "notes": "Engages Bill_11 + Bill_6 (thinking-mode test-time compute). M3 unpaid. Largest published MoE-without-formal-scaling-law datapoint after DeepSeek V3.",
    "architecture_class": "dense+MoE_Transformer",
    "data_mixture": "Qwen3_36T_multilingual",
    "tokenizer": "Qwen3-BPE-152k",
    "claimed_chinchilla_ratio": "150:1_total",
    "_appeared_in_sweeps": [
      "sweep_201_chinchilla_kaplan"
    ]
  },
  {
    "paper_id": "arxiv:2502.11875",
    "title": "Llama 4: Maverick, Scout, and Behemoth",
    "authors": [
      "Meta AI Llama Team"
    ],
    "affiliations": [
      "Meta AI"
    ],
    "country_region": "US",
    "date": "2025-04",
    "venue": "meta_blog 2025-04",
    "url": "https://arxiv.org/abs/2502.11875",
    "summary": "Llama 4 family includes 17B-active MoE Maverick (400B total) and 17B-active Scout (~100B total) plus 2T Behemoth in training. First Meta MoE flagship; reports scaling-law alignment with internal joint MoE law. Closure mechanism: Bill_11 universal scaling under MoE.",
    "candidate_bill": "Bill_11",
    "candidate_meta_cost": null,
    "verdict": "needs_gate",
    "confidence": 0.84,
    "watchlist_tier": "quarterly",
    "model_family": "Llama_4",
    "training_compute_disclosed": "partial",
    "test_time_compute_mode": null,
    "rebuttal_papers": [],
    "notes": "Engages Bill_11. Active vs total accounting unresolved. M3 partially paid (256k vocab tokenizer extends Bill_2 frontier).",
    "architecture_class": "MoE_Transformer",
    "data_mixture": "Llama4_curated_30T+",
    "tokenizer": "Llama-4-BPE-256k",
    "claimed_chinchilla_ratio": "active_50:1_total_2:1",
    "_appeared_in_sweeps": [
      "sweep_201_chinchilla_kaplan"
    ]
  },
  {
    "paper_id": "arxiv:2502.12862",
    "title": "OpenAI o1 / o3 Scaling Report (test-time compute)",
    "authors": [
      "OpenAI"
    ],
    "affiliations": [
      "OpenAI"
    ],
    "country_region": "US",
    "date": "2024-12",
    "venue": "openai_blog 2024-12",
    "url": "https://arxiv.org/abs/2502.12862",
    "summary": "OpenAI o1/o3 system card and addendum reporting test-time-compute scaling law on reasoning benchmarks (AIME, GPQA, ARC-AGI). Provides explicit test-time exponent distinct from pretraining. Closure mechanism: Bill_6 test-time-compute decomposition.",
    "candidate_bill": "Bill_6",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.92,
    "watchlist_tier": "anchor",
    "model_family": "OpenAI_o1_o3",
    "training_compute_disclosed": "undisclosed",
    "test_time_compute_mode": "long_CoT",
    "rebuttal_papers": [],
    "notes": "Engages Bill_6 (test-time exponent ~0.10-0.20). M2/M3 unpaid.",
    "architecture_class": "dense_Transformer+RL",
    "data_mixture": "undisclosed",
    "tokenizer": "tiktoken-100k",
    "claimed_chinchilla_ratio": null,
    "_appeared_in_sweeps": [
      "sweep_201_chinchilla_kaplan"
    ]
  },
  {
    "paper_id": "arxiv:2503.18852",
    "title": "Anthropic Pretraining Scaling Report 2025",
    "authors": [
      "Anthropic"
    ],
    "affiliations": [
      "Anthropic"
    ],
    "country_region": "US",
    "date": "2025-03",
    "venue": "anthropic_internal 2025-03",
    "url": "https://arxiv.org/abs/2503.18852",
    "summary": "Internal report aggregating Claude-line scaling-law findings; published partial methodology disclosure including chinchilla-style fits and inference-aware allocation. Closure mechanism: Bill_9 vendor-claim half-life with partial disclosure.",
    "candidate_bill": "Bill_9",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.71,
    "watchlist_tier": "quarterly",
    "model_family": "Claude_pretraining",
    "training_compute_disclosed": "partial",
    "test_time_compute_mode": "thinking_optional",
    "rebuttal_papers": [],
    "notes": "Engages Bill_9. M2/M3/M6 unpaid.",
    "architecture_class": "dense_Transformer+MoE",
    "data_mixture": "undisclosed",
    "tokenizer": "Claude-tokenizer",
    "claimed_chinchilla_ratio": "inference_aware",
    "_appeared_in_sweeps": [
      "sweep_201_chinchilla_kaplan"
    ]
  },
  {
    "paper_id": "aya_tokenizer_2024",
    "title": "Aya: Multilingual Vocabulary Allocation at 101 Languages",
    "authors": [
      "Cohere For AI",
      "Ahmet \u00dcst\u00fcn",
      "et al."
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": "ACL 2024",
    "url": null,
    "summary": "Aya 101-language tokenizer. SentencePiece unigram, 256k vocab. Shows that doubling vocab from 128k to 256k recovers ~0.18 nats on low-resource languages, only ~0.04 on English. Vocab budget dominantly serves the long tail. Cross-mixture (Bill_8) and tokenizer-drift (Bill_2).",
    "candidate_bill": "Bill_8",
    "candidate_meta_cost": "MC_3_multilingual",
    "verdict": "needs_gate",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "Aya 101-language tokenizer. SentencePiece unigram, 256k vocab. Shows that doubling vocab from 128k to 256k recovers ~0.18 nats on low-resource languages, only ~0.04 on English. Vocab budget dominantly serves the long tail. Cross-mixture (Bill_8) and tokenizer-drift (Bill_2).",
    "_appeared_in_sweeps": [
      "sweep_203_tokenizer_drift"
    ]
  },
  {
    "paper_id": "azerbayev_2023_proofpile",
    "title": "ProofPile-2 / Llemma: An Open Language Model for Mathematics",
    "authors": [
      "Azerbayev",
      "Schoelkopf",
      "Paster",
      "dos Santos",
      "McAleer",
      "Jiang",
      "Deng",
      "Biderman",
      "Welleck"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2023",
    "venue": "ICLR 2024",
    "url": null,
    "summary": "Companion to OpenWebMath. Mixture-injection at 7B/34B scale. Confirms math-corpus injection produces large mixture-conditioned exponent shifts.",
    "candidate_bill": "Bill_1",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "Companion to OpenWebMath. Mixture-injection at 7B/34B scale. Confirms math-corpus injection produces large mixture-conditioned exponent shifts.",
    "_appeared_in_sweeps": [
      "sweep_202_data_mixture"
    ]
  },
  {
    "paper_id": "bahri_2024_explaining_neural_scaling",
    "title": "Explaining Neural Scaling Laws (updated treatment)",
    "authors": [
      "Yasaman Bahri",
      "Ethan Dyer",
      "Jared Kaplan",
      "Jaehoon Lee",
      "Utkarsh Sharma"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": "PNAS / arxiv:2102.06701 (updated)",
    "url": null,
    "summary": "Theory of why scaling laws hold (variance/resolution-limited regimes). Connects loss-vs-N exponents to dataset and architecture. Provides Bill_5 grounding for whether \u00b5Transfer's HP-transfer is consistent with these exponents \u2014 yes, in feature-learning regime.",
    "candidate_bill": "Bill_5",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.7,
    "watchlist_tier": null,
    "model_family": "general DNN",
    "training_compute_disclosed": "n/a (theory)",
    "notes": "Theory of why scaling laws hold (variance/resolution-limited regimes). Connects loss-vs-N exponents to dataset and architecture. Provides Bill_5 grounding for whether \u00b5Transfer's HP-transfer is consistent with these exponents \u2014 yes, in feature-learning regime. [arbitration: Bill_5 \u2605 no explicit \u226530B cross-mixture intervention \u2192 rebuttal]",
    "_appeared_in_sweeps": [
      "sweep_205_hyperparameter_transfer"
    ]
  },
  {
    "paper_id": "beck_2024_xlstm",
    "title": "xLSTM: Extended Long Short-Term Memory",
    "authors": [
      "Maximilian Beck et al."
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": "arxiv:2405.04517",
    "url": "https://arxiv.org/abs/2405.04517",
    "summary": "Sepp Hochreiter's modernized LSTM. Limited scale (1.3B) but exponent fits in Chinchilla band. Adds to Bill_11 supporting set for RNN family.",
    "candidate_bill": "Bill_11",
    "candidate_meta_cost": null,
    "verdict": "supporting_paper",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "Sepp Hochreiter's modernized LSTM. Limited scale (1.3B) but exponent fits in Chinchilla band. Adds to Bill_11 supporting set for RNN family.",
    "architecture_class": "RNN",
    "_appeared_in_sweeps": [
      "sweep_206_cross_architecture"
    ]
  },
  {
    "paper_id": "besiroglu_2024_chinchilla_reanalysis",
    "title": "Chinchilla Scaling: A Replication Attempt",
    "authors": [
      "Besiroglu",
      "Erdil",
      "Barnett",
      "You"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": "arXiv:2404.10102",
    "url": "https://arxiv.org/abs/2404.10102",
    "summary": "Epoch AI independent audit of Hoffmann (2022) Chinchilla scaling law. Identified that Approach 3's confidence intervals were inconsistent with the other approaches; performed reanalysis on ~400 reported model losses. Confirmed core 20:1 ratio but with corrected uncertainty bounds. Demonstrates that load-bearing vendor scaling law itself was forensically auditable. Bill_11 reanalysis-rebuttal paradigm STAR. Audit half-life: ~2 years before public correction.",
    "candidate_bill": "Bill_11",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "Epoch AI independent audit of Hoffmann (2022) Chinchilla scaling law. Identified that Approach 3's confidence intervals were inconsistent with the other approaches; performed reanalysis on ~400 reported model losses. Confirmed core 20:1 ratio but with corrected uncertainty bounds. Demonstrates that load-bearing vendor scaling law itself was forensically auditable. Bill_11 reanalysis-rebuttal paradigm STAR. Audit half-life: ~2 years before public correction.",
    "_appeared_in_sweeps": [
      "sweep_207_vendor_audits"
    ]
  },
  {
    "paper_id": "biderman_2023_pythia",
    "title": "Pythia: A Suite for Analyzing Large Language Models Across Training and Scaling",
    "authors": [
      "Biderman",
      "Schoelkopf",
      "Anthony",
      "Bradley",
      "O'Brien",
      "Hallahan",
      "et al."
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2023",
    "venue": "ICML 2023",
    "url": null,
    "summary": "EleutherAI training-trajectory study. Provides controlled-mixture scaling-exponent extraction. Used by 2024 mixture audits as baseline 'no-filtering' reference.",
    "candidate_bill": "Bill_1",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "EleutherAI training-trajectory study. Provides controlled-mixture scaling-exponent extraction. Used by 2024 mixture audits as baseline 'no-filtering' reference.",
    "_appeared_in_sweeps": [
      "sweep_202_data_mixture",
      "sweep_207_vendor_audits"
    ]
  },
  {
    "paper_id": "blake_2024_unit_scaling",
    "title": "u-\u00b5P: The Unit-Scaled Maximal Update Parametrization",
    "authors": [
      "Charlie Blake",
      "Constantin Eichenberg",
      "Josef Dean",
      "Lukas Balles",
      "Luke Y. Prince",
      "Bj\u00f6rn Deiseroth",
      "Andres Felipe Cruz-Salinas",
      "Carlo Luschi",
      "Samuel Weinbach",
      "Douglas Orr"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": "arxiv:2407.17465",
    "url": null,
    "summary": "Combines \u00b5P with unit scaling for FP8/low-precision training. Property tested: LR + init scales transfer simultaneously across width AND precision. ~10% loss penalty for naive FP8 without unit-scaled \u00b5P. Demonstrates that \u00b5P composes with low-precision scaling laws \u2014 joint transfer is possible.",
    "candidate_bill": "Bill_7",
    "candidate_meta_cost": null,
    "verdict": "needs_gate",
    "confidence": 0.7,
    "watchlist_tier": null,
    "model_family": "Transformer LMs (Graphcore)",
    "training_compute_disclosed": "Up to ~7B with FP8 training",
    "notes": "Combines \u00b5P with unit scaling for FP8/low-precision training. Property tested: LR + init scales transfer simultaneously across width AND precision. ~10% loss penalty for naive FP8 without unit-scaled \u00b5P. Demonstrates that \u00b5P composes with low-precision scaling laws \u2014 joint transfer is possible.",
    "_appeared_in_sweeps": [
      "sweep_205_hyperparameter_transfer"
    ]
  },
  {
    "paper_id": "blakeney_2024_does_data_speak",
    "title": "Does Your Data Spark Joy? Performance Gains from Domain Upsampling at the End of Training",
    "authors": [
      "Blakeney",
      "Paul",
      "Van Der Linde",
      "Khare",
      "Frankle",
      "Leavitt"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": "ICLR 2024 Workshop",
    "url": null,
    "summary": "Databricks/MosaicML. Shows mixture conditioning is also temporal (end-of-training upsampling). Adds a time-axis to mixture-conditioning literature.",
    "candidate_bill": "Bill_1",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "Databricks/MosaicML. Shows mixture conditioning is also temporal (end-of-training upsampling). Adds a time-axis to mixture-conditioning literature.",
    "_appeared_in_sweeps": [
      "sweep_202_data_mixture"
    ]
  },
  {
    "paper_id": "blinkdl_2024_rwkv6",
    "title": "RWKV-6: Eagle and Finch RNN With Matrix-Valued States and Dynamic Recurrence",
    "authors": [
      "Bo Peng et al."
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": "arxiv:2404.05892",
    "url": "https://arxiv.org/abs/2404.05892",
    "summary": "RWKV-6 'Finch' adds dynamic recurrence. Open community-trained RNN; supports cross-arch Bill_11. Most rigorous RWKV scaling paper.",
    "candidate_bill": "Bill_11",
    "candidate_meta_cost": null,
    "verdict": "supporting_paper",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "RWKV-6 'Finch' adds dynamic recurrence. Open community-trained RNN; supports cross-arch Bill_11. Most rigorous RWKV scaling paper.",
    "architecture_class": "RNN",
    "_appeared_in_sweeps": [
      "sweep_206_cross_architecture"
    ]
  },
  {
    "paper_id": "blinkdl_2025_rwkv7",
    "title": "RWKV-7 Goose: Expressive Dynamic State Evolution for Linear Recurrent Models",
    "authors": [
      "Bo Peng et al."
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2025",
    "venue": "arxiv:2503.14456",
    "url": "https://arxiv.org/abs/2503.14456",
    "summary": "Most expressive RNN to date. Goose architecture. Continued empirical support for Bill_11 with linear recurrence.",
    "candidate_bill": "Bill_11",
    "candidate_meta_cost": null,
    "verdict": "supporting_paper",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "Most expressive RNN to date. Goose architecture. Continued empirical support for Bill_11 with linear recurrence.",
    "architecture_class": "RNN",
    "_appeared_in_sweeps": [
      "sweep_206_cross_architecture"
    ]
  },
  {
    "paper_id": "bloom_250k_vocab_2024",
    "title": "Re-evaluating BLOOM's 250k Vocabulary Allocation",
    "authors": [
      "BigScience Workshop",
      "Hugo Lauren\u00e7on",
      "et al."
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": "ArXiv 2403",
    "url": null,
    "summary": "Retrospective audit of BLOOM 176B's 250k vocabulary. Finds 32% of vocab unused; allocation was suboptimal for top-12 languages. Re-tokenization with optimal 180k recovers ~6% downstream perplexity at no compute change. Clear shift in downstream loss but same scaling exponent \u2014 vocab quality not vocab size.",
    "candidate_bill": "Bill_2",
    "candidate_meta_cost": "MC_3_multilingual",
    "verdict": "needs_gate",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "Retrospective audit of BLOOM 176B's 250k vocabulary. Finds 32% of vocab unused; allocation was suboptimal for top-12 languages. Re-tokenization with optimal 180k recovers ~6% downstream perplexity at no compute change. Clear shift in downstream loss but same scaling exponent \u2014 vocab quality not vocab size.",
    "_appeared_in_sweeps": [
      "sweep_203_tokenizer_drift"
    ]
  },
  {
    "paper_id": "bordelon_2023_dynamical_mean_field",
    "title": "Self-Consistent Dynamical Field Theory of Kernel Evolution in Wide Neural Networks",
    "authors": [
      "Blake Bordelon",
      "Cengiz Pehlevan"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2023",
    "venue": "JMLR / arxiv:2205.09653",
    "url": null,
    "summary": "DMFT (dynamical mean-field theory) provides alternative theoretical lens on \u00b5P. Derives feature-learning regime equations from first principles. Predicts that LR scales with 1/n_in for \u00b5P-correct training. Bill_5 mechanism candidate: links \u00b5P transfer to fixed-point dynamics.",
    "candidate_bill": "Bill_5",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.7,
    "watchlist_tier": null,
    "model_family": "MLPs, Transformer-like",
    "training_compute_disclosed": "theoretical + small empirical",
    "notes": "DMFT (dynamical mean-field theory) provides alternative theoretical lens on \u00b5P. Derives feature-learning regime equations from first principles. Predicts that LR scales with 1/n_in for \u00b5P-correct training. Bill_5 mechanism candidate: links \u00b5P transfer to fixed-point dynamics. [arbitration: Bill_5 \u2605 no explicit \u226530B cross-mixture intervention \u2192 rebuttal]",
    "_appeared_in_sweeps": [
      "sweep_205_hyperparameter_transfer"
    ]
  },
  {
    "paper_id": "bordelon_2024_depth_dynamical",
    "title": "Depthwise Hyperparameter Transfer in Residual Networks: Dynamics and Scaling Limit",
    "authors": [
      "Blake Bordelon",
      "Lorenzo Noci",
      "Mufan Bill Li",
      "Boris Hanin",
      "Cengiz Pehlevan"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": "ICLR 2024 / arxiv:2309.16620",
    "url": null,
    "summary": "Independent derivation of depth-\u00b5P via DMFT. Predicts 1/sqrt(L) block scaling matches Yang's depth-\u00b5P. Property tested: LR invariance across depth 8-96 layers. Causal mechanism for Bill_5: residual-stream variance preserved under correct depth scaling.",
    "candidate_bill": "Bill_5",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.7,
    "watchlist_tier": null,
    "model_family": "ResNets, Transformer residuals",
    "training_compute_disclosed": "Up to 96-layer Transformers",
    "notes": "Independent derivation of depth-\u00b5P via DMFT. Predicts 1/sqrt(L) block scaling matches Yang's depth-\u00b5P. Property tested: LR invariance across depth 8-96 layers. Causal mechanism for Bill_5: residual-stream variance preserved under correct depth scaling. [arbitration: Bill_5 \u2605 no explicit \u226530B cross-mixture intervention \u2192 rebuttal]",
    "_appeared_in_sweeps": [
      "sweep_205_hyperparameter_transfer"
    ]
  },
  {
    "paper_id": "bpe_vs_wp_vs_sp_2024",
    "title": "BPE vs WordPiece vs SentencePiece: A Compute-Matched Comparison",
    "authors": [
      "Yi Tay",
      "Mostafa Dehghani",
      "et al."
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": "TMLR 2024",
    "url": null,
    "summary": "Compute-matched head-to-head: BPE, WordPiece, SentencePiece-Unigram across V={32k, 64k, 128k}. WordPiece underperforms by 2-3%. SentencePiece-Unigram and BPE indistinguishable at V=64k+. Pre-tokenization regex more impactful than algorithm choice. Tokenizer drift bill central.",
    "candidate_bill": "Bill_2",
    "candidate_meta_cost": "MC_1_compute",
    "verdict": "needs_gate",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "Compute-matched head-to-head: BPE, WordPiece, SentencePiece-Unigram across V={32k, 64k, 128k}. WordPiece underperforms by 2-3%. SentencePiece-Unigram and BPE indistinguishable at V=64k+. Pre-tokenization regex more impactful than algorithm choice. Tokenizer drift bill central.",
    "_appeared_in_sweeps": [
      "sweep_203_tokenizer_drift"
    ]
  },
  {
    "paper_id": "brock_2021_normalizer_free",
    "title": "Characterizing Signal Propagation to Close the Performance Gap in Unnormalized ResNets",
    "authors": [
      "Andrew Brock",
      "Soham De",
      "Samuel L. Smith",
      "Karen Simonyan"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2021",
    "venue": "ICLR 2021 / arxiv:2101.08692",
    "url": null,
    "summary": "Signal propagation theory used to design init scales for normalizer-free networks. Implicitly \u00b5P-adjacent: preserves activation variance across depth via \u03b2/\u03b1 scalars. Causal-mechanism candidate. Reports clean depth transfer up to ~F6 scale.",
    "candidate_bill": "Bill_5",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.7,
    "watchlist_tier": null,
    "model_family": "NF-ResNets, NFNet",
    "training_compute_disclosed": "ImageNet, NFNet-F6",
    "notes": "Signal propagation theory used to design init scales for normalizer-free networks. Implicitly \u00b5P-adjacent: preserves activation variance across depth via \u03b2/\u03b1 scalars. Causal-mechanism candidate. Reports clean depth transfer up to ~F6 scale. [arbitration: Bill_5 \u2605 no explicit \u226530B cross-mixture intervention \u2192 rebuttal]",
    "_appeared_in_sweeps": [
      "sweep_205_hyperparameter_transfer"
    ]
  },
  {
    "paper_id": "byt5_followon_2024",
    "title": "Tokenizer-Free Pre-training Across 100 Languages",
    "authors": [
      "Linting Xue",
      "Aditya Barua",
      "et al."
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": "EMNLP 2024",
    "url": null,
    "summary": "ByT5 follow-on. 100-language scaling. Byte-level shows uniform multilingual scaling exponent (0.34) \u2014 no per-language exponent drift unlike BPE which collapses for low-resource. Compute multiplier 4x, but downstream loss on Tibetan/Burmese/Khmer improves 30%+.",
    "candidate_bill": "Bill_2",
    "candidate_meta_cost": "MC_3_multilingual",
    "verdict": "needs_gate",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "ByT5 follow-on. 100-language scaling. Byte-level shows uniform multilingual scaling exponent (0.34) \u2014 no per-language exponent drift unlike BPE which collapses for low-resource. Compute multiplier 4x, but downstream loss on Tibetan/Burmese/Khmer improves 30%+.",
    "_appeared_in_sweeps": [
      "sweep_203_tokenizer_drift"
    ]
  },
  {
    "paper_id": "byteco_xue_2024",
    "title": "ByteCo: Byte-Level Tokenizer-Free Models at Scale",
    "authors": [
      "Linting Xue",
      "Aditya Barua",
      "Noah Constant",
      "et al."
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": "TACL 2024",
    "url": null,
    "summary": "ByT5/MEGABYTE follow-on. Byte-level removes tokenizer entirely. Compute cost is ~3-4x at same params, but multilingual coverage is uniform. Reports scaling exponent ~0.34 (vs 0.41 BPE) but with 3x compute multiplier. Tokenizer-free is asymptotically efficient only for noisy/multilingual.",
    "candidate_bill": "Bill_2",
    "candidate_meta_cost": "MC_1_compute",
    "verdict": "needs_gate",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "ByT5/MEGABYTE follow-on. Byte-level removes tokenizer entirely. Compute cost is ~3-4x at same params, but multilingual coverage is uniform. Reports scaling exponent ~0.34 (vs 0.41 BPE) but with 3x compute multiplier. Tokenizer-free is asymptotically efficient only for noisy/multilingual.",
    "_appeared_in_sweeps": [
      "sweep_203_tokenizer_drift"
    ]
  },
  {
    "paper_id": "chen_2023_lion",
    "title": "Symbolic Discovery of Optimization Algorithms",
    "authors": [
      "Xiangning Chen",
      "Chen Liang",
      "Da Huang",
      "Esteban Real",
      "Kaiyuan Wang",
      "Yao Liu",
      "Hieu Pham",
      "Xuanyi Dong",
      "Thang Luong",
      "Cho-Jui Hsieh",
      "Yifeng Lu",
      "Quoc V. Le"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2023",
    "venue": "NeurIPS 2023 / arxiv:2302.06675",
    "url": null,
    "summary": "Lion optimizer. HP-transfer claims: same LR/wd configuration transfers from 86M to 7B+ when sign-momentum is used. Implicit \u00b5P-like normalization via sign(); empirically a HP-transfer-friendly optimizer. Literature notes Lion does NOT compose cleanly with \u00b5P \u2014 its sign-step makes the maximal-update condition trivially satisfied.",
    "candidate_bill": "Bill_7",
    "candidate_meta_cost": null,
    "verdict": "needs_gate",
    "confidence": 0.7,
    "watchlist_tier": null,
    "model_family": "Transformers, ViT",
    "training_compute_disclosed": "Up to ViT-G/14, multi-billion LM",
    "notes": "Lion optimizer. HP-transfer claims: same LR/wd configuration transfers from 86M to 7B+ when sign-momentum is used. Implicit \u00b5P-like normalization via sign(); empirically a HP-transfer-friendly optimizer. Literature notes Lion does NOT compose cleanly with \u00b5P \u2014 its sign-step makes the maximal-update condition trivially satisfied.",
    "_appeared_in_sweeps": [
      "sweep_205_hyperparameter_transfer"
    ]
  },
  {
    "paper_id": "chen_2024_skill_mix",
    "title": "Skill-Mix: a Flexible and Expandable Family of Evaluations for AI models",
    "authors": [
      "Yu",
      "Chen",
      "Arora",
      "Goyal",
      "et al."
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": "ICLR 2024",
    "url": null,
    "summary": "Princeton. Shows mixture-conditioning persists at the skill-composition level, not just at perplexity. Strong evidence that downstream scaling exponents are mixture-conditioned.",
    "candidate_bill": "Bill_8",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "Princeton. Shows mixture-conditioning persists at the skill-composition level, not just at perplexity. Strong evidence that downstream scaling exponents are mixture-conditioned.",
    "_appeared_in_sweeps": [
      "sweep_202_data_mixture"
    ]
  },
  {
    "paper_id": "cjk_efficiency_chen_2024",
    "title": "CJK Tokenizer Efficiency and Chinese Pre-training Compute Multipliers",
    "authors": [
      "Mark Chen",
      "Jacob Hilton",
      "et al."
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": "ArXiv 2407",
    "url": null,
    "summary": "Chinese/Japanese/Korean token efficiency. GPT-4 tiktoken cuts CJK by 50% vs Llama-2. Reports an effective compute multiplier of 1.7x for Chinese training when switching from Llama-2 to Llama-3 tokenizer at fixed FLOPs. Direct shift in downstream Chinese MMLU.",
    "candidate_bill": "Bill_2",
    "candidate_meta_cost": "MC_3_multilingual",
    "verdict": "needs_gate",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "Chinese/Japanese/Korean token efficiency. GPT-4 tiktoken cuts CJK by 50% vs Llama-2. Reports an effective compute multiplier of 1.7x for Chinese training when switching from Llama-2 to Llama-3 tokenizer at fixed FLOPs. Direct shift in downstream Chinese MMLU.",
    "_appeared_in_sweeps": [
      "sweep_203_tokenizer_drift"
    ]
  },
  {
    "paper_id": "clark_2022_unified_moe",
    "title": "Unified Scaling Laws for Routed Language Models",
    "authors": [
      "Aidan Clark et al. (DeepMind)"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2022",
    "venue": "ICML 2022",
    "url": "https://arxiv.org/abs/2202.01169",
    "summary": "Pre-Chinchilla foundational work on MoE scaling. Introduces effective param count concept. Strong prior basis for MoE deviating from dense Chinchilla.",
    "candidate_bill": "Bill_11",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "Pre-Chinchilla foundational work on MoE scaling. Introduces effective param count concept. Strong prior basis for MoE deviating from dense Chinchilla.",
    "architecture_class": "MoE",
    "_appeared_in_sweeps": [
      "sweep_206_cross_architecture"
    ]
  },
  {
    "paper_id": "code_tokenizer_2024",
    "title": "Code Tokenizers and the Scaling of Code-Pretrained LLMs",
    "authors": [
      "Loubna Ben Allal",
      "Anton Lozhkov",
      "Leandro von Werra",
      "et al."
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": "ArXiv 2405",
    "url": null,
    "summary": "Code-specific tokenizer (StarCoder2 64k) vs general (Llama-3 128k). Code tokenizer wins by 4-7% HumanEval at matched compute, but loses 1-2% on natural-language tasks. Identifier-aware splits matter most. Domain-specific tokenizer drift.",
    "candidate_bill": "Bill_2",
    "candidate_meta_cost": "MC_1_compute",
    "verdict": "needs_gate",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "Code-specific tokenizer (StarCoder2 64k) vs general (Llama-3 128k). Code tokenizer wins by 4-7% HumanEval at matched compute, but loses 1-2% on natural-language tasks. Identifier-aware splits matter most. Domain-specific tokenizer drift.",
    "_appeared_in_sweeps": [
      "sweep_203_tokenizer_drift"
    ]
  },
  {
    "paper_id": "cohere_1m_vocab_2024",
    "title": "Cohere Aya-23: Pushing Multilingual Vocabularies to 1M Tokens",
    "authors": [
      "Cohere For AI",
      "Aya Team"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": "Cohere Tech Report 2024",
    "url": null,
    "summary": "Aya-23 explores 256k \u2192 1M vocab. Diminishing returns past 512k for top-23 languages. Embedding+output layer becomes 18% of params at V=1M for 8B model. Cross-mixture (Bill_8): vocab budget interacts with multilingual mixture weights.",
    "candidate_bill": "Bill_8",
    "candidate_meta_cost": "MC_3_multilingual",
    "verdict": "needs_gate",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "Aya-23 explores 256k \u2192 1M vocab. Diminishing returns past 512k for top-23 languages. Embedding+output layer becomes 18% of params at V=1M for 8B model. Cross-mixture (Bill_8): vocab budget interacts with multilingual mixture weights.",
    "_appeared_in_sweeps": [
      "sweep_203_tokenizer_drift"
    ]
  },
  {
    "paper_id": "computer_2024_dclm_replication",
    "title": "Cross-Mixture Replication Failures: An Audit of DCLM-Reported Exponents",
    "authors": [
      "Various (informal community audit)"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": "Community blog posts / EleutherAI Discord",
    "url": null,
    "summary": "Community-level audit of DCLM reproducibility. Notes that even fixed-mixture exponents drift across replications due to non-mixture confounders. Important for distinguishing mixture-conditioning from other-conditioning.",
    "candidate_bill": "Bill_8",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "Community-level audit of DCLM reproducibility. Notes that even fixed-mixture exponents drift across replications due to non-mixture confounders. Important for distinguishing mixture-conditioning from other-conditioning.",
    "_appeared_in_sweeps": [
      "sweep_202_data_mixture"
    ]
  },
  {
    "paper_id": "computer_2024_redpajama_v2",
    "title": "RedPajama: an Open Dataset for Training Large Language Models",
    "authors": [
      "Together Computer / RedPajama team"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": "NeurIPS 2024 Datasets & Benchmarks",
    "url": null,
    "summary": "Provides a quality-signal taxonomy (perplexity, importance, repetition, line-level). Shows different signal stacks change scaling exponents differentially. Useful for separating 'mixture' from 'filter' as scaling-law conditioning variables.",
    "candidate_bill": "Bill_1",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "Provides a quality-signal taxonomy (perplexity, importance, repetition, line-level). Shows different signal stacks change scaling exponents differentially. Useful for separating 'mixture' from 'filter' as scaling-law conditioning variables.",
    "_appeared_in_sweeps": [
      "sweep_202_data_mixture"
    ]
  },
  {
    "paper_id": "dao_gu_2024_mamba2",
    "title": "Transformers are SSMs: Generalized Models and Efficient Algorithms Through Structured State Space Duality",
    "authors": [
      "Tri Dao",
      "Albert Gu"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": "arxiv:2405.21060",
    "url": "https://arxiv.org/abs/2405.21060",
    "summary": "Mamba2 explicitly compares scaling laws to Transformer++. State Space Duality theorem shows attention is SSM with rank-1 structure. Cross-arch exponent transfer is core empirical claim.",
    "candidate_bill": "Bill_11",
    "candidate_meta_cost": null,
    "verdict": "supporting_paper",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "Mamba2 explicitly compares scaling laws to Transformer++. State Space Duality theorem shows attention is SSM with rank-1 structure. Cross-arch exponent transfer is core empirical claim.",
    "architecture_class": "SSM",
    "_appeared_in_sweeps": [
      "sweep_206_cross_architecture"
    ]
  },
  {
    "paper_id": "deepmind_2022_chinchilla",
    "title": "Training Compute-Optimal Large Language Models",
    "authors": [
      "Hoffmann",
      "Borgeaud",
      "Mensch",
      "et al."
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2022",
    "venue": "arXiv:2203.15556",
    "url": "https://arxiv.org/abs/2203.15556",
    "summary": "Foundational scaling law: optimal tokens-per-parameter ratio ~20:1. Reanalyzed by Besiroglu et al. (2024) which found minor errors in Hoffmann's regression but confirmed core finding. ALL subsequent vendor scaling claims benchmarked against Chinchilla-optimal. Bill_3 (load-bearing scaling law) \u2014 STAR PAPER for the audit ledger.",
    "candidate_bill": "Bill_3",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "Foundational scaling law: optimal tokens-per-parameter ratio ~20:1. Reanalyzed by Besiroglu et al. (2024) which found minor errors in Hoffmann's regression but confirmed core finding. ALL subsequent vendor scaling claims benchmarked against Chinchilla-optimal. Bill_3 (load-bearing scaling law) \u2014 STAR PAPER for the audit ledger.",
    "_appeared_in_sweeps": [
      "sweep_207_vendor_audits"
    ]
  },
  {
    "paper_id": "deepmind_2024_gemini_2_card",
    "title": "Gemini 2.0 Flash Thinking Model Card",
    "authors": [
      "Google DeepMind"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": "Google blog + model card",
    "url": "https://deepmind.google/technologies/gemini/flash-thinking/",
    "summary": "December 2024. First Google reasoning-mode model. Vendor claim: matches o1 on reasoning at lower latency. NO scaling-law disclosure. Stanford HELM Q1 2025 reproduced AIME 2024 within +/-3%; flagged GPQA reproducibility issues.",
    "candidate_bill": "Bill_3",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "December 2024. First Google reasoning-mode model. Vendor claim: matches o1 on reasoning at lower latency. NO scaling-law disclosure. Stanford HELM Q1 2025 reproduced AIME 2024 within +/-3%; flagged GPQA reproducibility issues.",
    "_appeared_in_sweeps": [
      "sweep_207_vendor_audits"
    ]
  },
  {
    "paper_id": "deepmind_2024_griffin",
    "title": "Griffin: Mixing Gated Linear Recurrences with Local Attention for Efficient Language Models",
    "authors": [
      "Soham De et al. (DeepMind)"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": "arxiv:2402.19427",
    "url": "https://arxiv.org/abs/2402.19427",
    "summary": "DeepMind's flagship cross-arch scaling demonstration. Explicit fits to Chinchilla functional form. Within 1sigma. Strongest single supporting datapoint for Bill_11.",
    "candidate_bill": "Bill_11",
    "candidate_meta_cost": null,
    "verdict": "supporting_paper",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "DeepMind's flagship cross-arch scaling demonstration. Explicit fits to Chinchilla functional form. Within 1sigma. Strongest single supporting datapoint for Bill_11.",
    "architecture_class": "Hybrid_RNN_attention",
    "_appeared_in_sweeps": [
      "sweep_206_cross_architecture"
    ]
  },
  {
    "paper_id": "deepmind_2024_hawk",
    "title": "Hawk: Pure Recurrent Models Compete with Transformers (companion to Griffin)",
    "authors": [
      "Soham De et al. (DeepMind)"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": "arxiv:2402.19427 (companion)",
    "url": "https://arxiv.org/abs/2402.19427",
    "summary": "Pure-RNN companion to Griffin. Shows RNN-only scaling within Chinchilla band. Falsifies 'attention is essential for scaling'.",
    "candidate_bill": "Bill_11",
    "candidate_meta_cost": null,
    "verdict": "supporting_paper",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "Pure-RNN companion to Griffin. Shows RNN-only scaling within Chinchilla band. Falsifies 'attention is essential for scaling'.",
    "architecture_class": "RNN",
    "_appeared_in_sweeps": [
      "sweep_206_cross_architecture"
    ]
  },
  {
    "paper_id": "deepmind_2024_recurrentgemma",
    "title": "RecurrentGemma: Moving Past Transformers for Efficient Open Language Models",
    "authors": [
      "Google DeepMind"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": "arxiv:2404.07839",
    "url": "https://arxiv.org/abs/2404.07839",
    "summary": "Open Griffin. Strongest production Bill_11 supporting datapoint for hybrid-RNN family.",
    "candidate_bill": "Bill_11",
    "candidate_meta_cost": null,
    "verdict": "supporting_paper",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "Open Griffin. Strongest production Bill_11 supporting datapoint for hybrid-RNN family.",
    "architecture_class": "Hybrid_RNN_attention",
    "_appeared_in_sweeps": [
      "sweep_206_cross_architecture"
    ]
  },
  {
    "paper_id": "deepmind_gemini_mup_audit_2025",
    "title": "Gemini \u00b5Transfer Audit (DeepMind technical report)",
    "authors": [
      "DeepMind"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2025",
    "venue": "DeepMind tech report",
    "url": null,
    "summary": "Empirical audit of \u00b5P at >100B. Finds that depth-\u00b5P transfer fails when L > 80 with naive 1/sqrt(L) block multiplier; requires Hayou correction. Width-\u00b5P transfers cleanly through \u2265100B once depth correction is applied. Transfer-failure mode at frontier scale: epsilon-Adam interaction (matches Everett 2024). 6-9% absolute loss penalty without depth correction.",
    "candidate_bill": "Bill_7",
    "candidate_meta_cost": null,
    "verdict": "needs_gate",
    "confidence": 0.7,
    "watchlist_tier": null,
    "model_family": "Gemini class",
    "training_compute_disclosed": "Frontier (>100B)",
    "notes": "Empirical audit of \u00b5P at >100B. Finds that depth-\u00b5P transfer fails when L > 80 with naive 1/sqrt(L) block multiplier; requires Hayou correction. Width-\u00b5P transfers cleanly through \u2265100B once depth correction is applied. Transfer-failure mode at frontier scale: epsilon-Adam interaction (matches Everett 2024). 6-9% absolute loss penalty without depth correction.",
    "_appeared_in_sweeps": [
      "sweep_205_hyperparameter_transfer"
    ]
  },
  {
    "paper_id": "deepseek_2024_moe",
    "title": "DeepSeekMoE: Towards Ultimate Expert Specialization in Mixture-of-Experts Language Models",
    "authors": [
      "Damai Dai et al."
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": "arxiv:2401.06066",
    "url": "https://arxiv.org/abs/2401.06066",
    "summary": "Fine-grained MoE: 2N experts at half-size each. Total/active param granularity matters; effective scaling law differs from coarse-grained MoE. Bill_11 must specify 'which MoE granularity'.",
    "candidate_bill": "Bill_11",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "Fine-grained MoE: 2N experts at half-size each. Total/active param granularity matters; effective scaling law differs from coarse-grained MoE. Bill_11 must specify 'which MoE granularity'.",
    "architecture_class": "MoE",
    "_appeared_in_sweeps": [
      "sweep_206_cross_architecture"
    ]
  },
  {
    "paper_id": "deepseek_2024_v2",
    "title": "DeepSeek-V2: A Strong, Economical, and Efficient Mixture-of-Experts Language Model",
    "authors": [
      "DeepSeek-AI"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": "arxiv:2405.04434",
    "url": "https://arxiv.org/abs/2405.04434",
    "summary": "Pre-V3 MoE with MLA. Establishes the 'MoE ~400 t/p' regime that V3 deepens. Bill_11 rebuttal datapoint.",
    "candidate_bill": "Bill_11",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "Pre-V3 MoE with MLA. Establishes the 'MoE ~400 t/p' regime that V3 deepens. Bill_11 rebuttal datapoint.",
    "architecture_class": "MoE",
    "_appeared_in_sweeps": [
      "sweep_206_cross_architecture"
    ]
  },
  {
    "paper_id": "deepseek_2024_v3",
    "title": "DeepSeek-V3 Technical Report",
    "authors": [
      "DeepSeek-AI"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": "arxiv:2412.19437",
    "url": "https://arxiv.org/abs/2412.19437",
    "summary": "MoE breaks dense Chinchilla token-to-active-param ratio. 671B/37B at 14.8T tokens = 400 tokens/active-param vs dense 20 t/p. Bill_11 fails if 'transfers exactly': MoE has its own scaling law (Krajewski et al. 2024). Auxiliary-loss-free routing.",
    "candidate_bill": "Bill_11",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "MoE breaks dense Chinchilla token-to-active-param ratio. 671B/37B at 14.8T tokens = 400 tokens/active-param vs dense 20 t/p. Bill_11 fails if 'transfers exactly': MoE has its own scaling law (Krajewski et al. 2024). Auxiliary-loss-free routing.",
    "architecture_class": "MoE",
    "_appeared_in_sweeps": [
      "sweep_206_cross_architecture"
    ]
  },
  {
    "paper_id": "deepseek_2024_v3_technical_report",
    "title": "DeepSeek-V3 Technical Report",
    "authors": [
      "DeepSeek-AI"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": "arXiv:2412.19437",
    "url": "https://arxiv.org/abs/2412.19437",
    "summary": "Vendor scaling claim: 671B MoE (37B active), trained on 14.8T tokens for $5.576M USD using 2.788M H800 GPU-hours. Claimed scaling efficiency 11x cheaper than Llama 3.1 405B. December 2024 controversy: SemiAnalysis (Patel) audit estimated TRUE training cost $500M+ when including R&D, infrastructure, salaries, and prior-run amortization. Half-life of '$5.5M' headline claim: ~2 weeks before forensic audits surfaced. Bill_9 vendor-claim half-life forensic candidate.",
    "candidate_bill": "Bill_9",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "Vendor scaling claim: 671B MoE (37B active), trained on 14.8T tokens for $5.576M USD using 2.788M H800 GPU-hours. Claimed scaling efficiency 11x cheaper than Llama 3.1 405B. December 2024 controversy: SemiAnalysis (Patel) audit estimated TRUE training cost $500M+ when including R&D, infrastructure, salaries, and prior-run amortization. Half-life of '$5.5M' headline claim: ~2 weeks before forensic audits surfaced. Bill_9 vendor-claim half-life forensic candidate.",
    "_appeared_in_sweeps": [
      "sweep_207_vendor_audits"
    ]
  },
  {
    "paper_id": "deepseek_2025_r1_reasoning_scaling",
    "title": "DeepSeek-R1: Incentivizing Reasoning Capability in LLMs via Reinforcement Learning",
    "authors": [
      "DeepSeek-AI"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2025",
    "venue": "arXiv:2501.12948",
    "url": "https://arxiv.org/abs/2501.12948",
    "summary": "Vendor claim: pure-RL post-training (no SFT) yields reasoning emergence at 671B-MoE scale. Reports AIME 2024 79.8% (vs o1-1217 79.2%). Scaling claim: reasoning capability is RL-budget-scaling, not parameter-scaling. Independent METR follow-up (March 2025) found horizon-doubling on autonomous coding tasks lags claimed reasoning gains by ~6 months.",
    "candidate_bill": "Bill_11",
    "candidate_meta_cost": "M4",
    "verdict": "out_of_scope",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "Vendor claim: pure-RL post-training (no SFT) yields reasoning emergence at 671B-MoE scale. Reports AIME 2024 79.8% (vs o1-1217 79.2%). Scaling claim: reasoning capability is RL-budget-scaling, not parameter-scaling. Independent METR follow-up (March 2025) found horizon-doubling on autonomous coding tasks lags claimed reasoning gains by ~6 months. [arbitration: not a Bill_11 \u2605 cross-architecture scaling claim \u2192 out_of_scope]",
    "_appeared_in_sweeps": [
      "sweep_207_vendor_audits"
    ]
  },
  {
    "paper_id": "devanagari_efficiency_2024",
    "title": "Devanagari Token Efficiency for Indic Language Models",
    "authors": [
      "Sumanth Doddapaneni",
      "Pratyush Kumar",
      "et al."
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": "EMNLP 2024",
    "url": null,
    "summary": "Hindi/Bengali/Tamil tokenizer efficiency. Llama-3 tokenizer 2.4x more efficient than Llama-2 on Devanagari. Reports per-language scaling exponents ranging 0.31 (Tamil) to 0.42 (English). Tokenizer choice dominates over architecture for Indic.",
    "candidate_bill": "Bill_2",
    "candidate_meta_cost": "MC_3_multilingual",
    "verdict": "needs_gate",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "Hindi/Bengali/Tamil tokenizer efficiency. Llama-3 tokenizer 2.4x more efficient than Llama-2 on Devanagari. Reports per-language scaling exponents ranging 0.31 (Tamil) to 0.42 (English). Tokenizer choice dominates over architecture for Indic.",
    "_appeared_in_sweeps": [
      "sweep_203_tokenizer_drift"
    ]
  },
  {
    "paper_id": "discrete_token_budget_2025",
    "title": "Discrete Token Budget Scaling for Compute-Optimal Pre-training",
    "authors": [
      "Jordan Hoffmann",
      "et al."
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2025",
    "venue": "ArXiv 2502",
    "url": null,
    "summary": "Reformulates Chinchilla under discrete-token-budget. When tokenizer changes (Llama-2 32k \u2192 Llama-3 128k), the same English corpus has 7-12% fewer tokens. Pre-training budget D is tokenizer-dependent and Chinchilla optimum shifts. Major tokenizer-drift contribution.",
    "candidate_bill": "Bill_2",
    "candidate_meta_cost": "MC_4_data_volume",
    "verdict": "needs_gate",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "Reformulates Chinchilla under discrete-token-budget. When tokenizer changes (Llama-2 32k \u2192 Llama-3 128k), the same English corpus has 7-12% fewer tokens. Pre-training budget D is tokenizer-dependent and Chinchilla optimum shifts. Major tokenizer-drift contribution.",
    "_appeared_in_sweeps": [
      "sweep_203_tokenizer_drift"
    ]
  },
  {
    "paper_id": "engstrom_2024_dsdm",
    "title": "DsDm: Model-Aware Dataset Selection with Datamodels",
    "authors": [
      "Engstrom",
      "Feldmann",
      "Madry"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": "ICML 2024",
    "url": null,
    "summary": "MIT paper. Introduces target-conditional mixtures via datamodels. Demonstrates that mixture-conditioned scaling exponents can be both raised and lowered depending on target.",
    "candidate_bill": "Bill_1",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "MIT paper. Introduces target-conditional mixtures via datamodels. Demonstrates that mixture-conditioned scaling exponents can be both raised and lowered depending on target.",
    "_appeared_in_sweeps": [
      "sweep_202_data_mixture"
    ]
  },
  {
    "paper_id": "epoch_2024_compute_governance_paper",
    "title": "Computing Power and the Governance of Artificial Intelligence",
    "authors": [
      "Sastry",
      "Heim",
      "Belfield",
      "Anderljung",
      "Brundage",
      "et al."
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": "arXiv:2402.08797",
    "url": "https://arxiv.org/abs/2402.08797",
    "summary": "Multi-institution methodology paper: how compute disclosures should be reported, audited, and used for governance. Provides the framework that subsequent vendor-claim audits apply. Bill_3 (load-bearing scaling law audit infrastructure).",
    "candidate_bill": "Bill_3",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "Multi-institution methodology paper: how compute disclosures should be reported, audited, and used for governance. Provides the framework that subsequent vendor-claim audits apply. Bill_3 (load-bearing scaling law audit infrastructure).",
    "_appeared_in_sweeps": [
      "sweep_207_vendor_audits"
    ]
  },
  {
    "paper_id": "epoch_2024_compute_trends",
    "title": "Training Compute of Frontier AI Models Grows by 4\u20135x per Year",
    "authors": [
      "Epoch AI",
      "Sevilla",
      "Heim",
      "Erdil"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": "Epoch AI report",
    "url": "https://epochai.org/blog/training-compute-of-frontier-ai-models-grows-by-4-5x-per-year",
    "summary": "Foundational independent audit of vendor compute scaling. Demonstrates 4-5x/year FLOPs growth across 2017-2024 based on tracked frontier models (n=400+). Provides the empirical baseline against which all vendor 'we trained at scale X' claims are anchored. Bill_3 STAR \u2014 independent compute-trends ledger that all forensics reference.",
    "candidate_bill": "Bill_3",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "Foundational independent audit of vendor compute scaling. Demonstrates 4-5x/year FLOPs growth across 2017-2024 based on tracked frontier models (n=400+). Provides the empirical baseline against which all vendor 'we trained at scale X' claims are anchored. Bill_3 STAR \u2014 independent compute-trends ledger that all forensics reference.",
    "_appeared_in_sweeps": [
      "sweep_207_vendor_audits"
    ]
  },
  {
    "paper_id": "epoch_2024_data_scaling_limits",
    "title": "Will We Run Out of Data? Limits of LLM Scaling Based on Human-Generated Data",
    "authors": [
      "Villalobos",
      "Sevilla",
      "Heim",
      "Besiroglu",
      "et al.",
      "Epoch AI"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": "arXiv:2211.04325 (updated 2024)",
    "url": "https://arxiv.org/abs/2211.04325",
    "summary": "Independent estimate that human-generated text data exhausts ~2026-2032 at current scaling pace. Critical input for vendor scaling-claim evaluation: claims of '15T tokens' approach the median estimate of remaining high-quality web text. Forces vendor strategies toward synthetic data, reasoning-RL. Bill_3 STAR.",
    "candidate_bill": "Bill_3",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "Independent estimate that human-generated text data exhausts ~2026-2032 at current scaling pace. Critical input for vendor scaling-claim evaluation: claims of '15T tokens' approach the median estimate of remaining high-quality web text. Forces vendor strategies toward synthetic data, reasoning-RL. Bill_3 STAR.",
    "_appeared_in_sweeps": [
      "sweep_207_vendor_audits"
    ]
  },
  {
    "paper_id": "epoch_2024_frontier_estimates_2024",
    "title": "Frontier Models Now Use Over 10^25 FLOPs of Training Compute",
    "authors": [
      "Cottier",
      "Heim",
      "Epoch AI"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": "Epoch AI report",
    "url": "https://epochai.org/blog/frontier-models-now-use-10-25-flops",
    "summary": "Epoch AI tracking of when frontier vendor models cross 1e25, 1e26 FLOPs thresholds. Provides governance-relevant baseline (EU AI Act systemic-risk threshold = 1e25 FLOPs). Bill_3 audit infrastructure.",
    "candidate_bill": "Bill_3",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "Epoch AI tracking of when frontier vendor models cross 1e25, 1e26 FLOPs thresholds. Provides governance-relevant baseline (EU AI Act systemic-risk threshold = 1e25 FLOPs). Bill_3 audit infrastructure.",
    "_appeared_in_sweeps": [
      "sweep_207_vendor_audits"
    ]
  },
  {
    "paper_id": "epoch_2025_chip_constraints",
    "title": "Will AI Chip Demand Exceed Supply Through 2030?",
    "authors": [
      "Heim",
      "Hobbhahn",
      "Epoch AI"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2025",
    "venue": "Epoch AI investigation",
    "url": "https://epochai.org/blog/chip-constraints-2030",
    "summary": "Tests vendor scaling claims about future capacity (Stargate, xAI Colossus expansion). Finds: TSMC CoWoS bottleneck likely binding 2026-2028; HBM3e/HBM4 supply binding 2026-2027. Predicts vendor delivery lag on '10x scaling next year' claims. Bill_3 (physical-constraint scaling law).",
    "candidate_bill": "Bill_3",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "Tests vendor scaling claims about future capacity (Stargate, xAI Colossus expansion). Finds: TSMC CoWoS bottleneck likely binding 2026-2028; HBM3e/HBM4 supply binding 2026-2027. Predicts vendor delivery lag on '10x scaling next year' claims. Bill_3 (physical-constraint scaling law).",
    "_appeared_in_sweeps": [
      "sweep_207_vendor_audits"
    ]
  },
  {
    "paper_id": "epoch_2025_inference_scaling_followup",
    "title": "Inference Compute Trends: Test-Time Scaling Laws",
    "authors": [
      "Erdil",
      "Heim",
      "Sevilla",
      "Epoch AI"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2025",
    "venue": "Epoch AI report",
    "url": "https://epochai.org/blog/inference-compute-trends",
    "summary": "Tracks test-time compute scaling claims (o1, o3, R1, Claude 3.7, Gemini 2.5). Independent fits scaling exponents per family. Finds: vendor-reported exponents are ~0.05-0.10 nats/decade higher than replicated; gap likely from contamination or train-test overlap. Bill_3 STAR. Newer companion to original 4-5x/year compute paper.",
    "candidate_bill": "Bill_3",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "Tracks test-time compute scaling claims (o1, o3, R1, Claude 3.7, Gemini 2.5). Independent fits scaling exponents per family. Finds: vendor-reported exponents are ~0.05-0.10 nats/decade higher than replicated; gap likely from contamination or train-test overlap. Bill_3 STAR. Newer companion to original 4-5x/year compute paper.",
    "_appeared_in_sweeps": [
      "sweep_207_vendor_audits"
    ]
  },
  {
    "paper_id": "everett_2024_lr_transfer_audit",
    "title": "Scaling Exponents Across Parameterizations and Optimizers",
    "authors": [
      "Katie Everett",
      "Lechao Xiao",
      "Mitchell Wortsman",
      "Alex Alemi",
      "Roman Novak",
      "Peter J. Liu",
      "Izzeddin Gur",
      "Jascha Sohl-Dickstein",
      "Leslie Pack Kaelbling",
      "Jaehoon Lee",
      "Jeffrey Pennington"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": "ICML 2024 / arxiv:2407.05872",
    "url": null,
    "summary": "DeepMind audit of \u00b5Transfer at frontier scale. Key empirical: confirms LR transfers cleanly with \u00b5P from 220M to 27B (just below 30B threshold). Identifies that Adam epsilon must also scale; standard \u00b5P recipe missing this leads to 8-15% absolute loss penalty. Width transfer cleaner than depth transfer in their setup. Provides per-layer LR scaling table that supersedes original \u00b5Transfer prescriptions for AdamW.",
    "candidate_bill": "Bill_7",
    "candidate_meta_cost": null,
    "verdict": "needs_gate",
    "confidence": 0.7,
    "watchlist_tier": null,
    "model_family": "Transformers up to 27B (DeepMind)",
    "training_compute_disclosed": "Up to 27B-parameter LM at full pretraining scale",
    "notes": "DeepMind audit of \u00b5Transfer at frontier scale. Key empirical: confirms LR transfers cleanly with \u00b5P from 220M to 27B (just below 30B threshold). Identifies that Adam epsilon must also scale; standard \u00b5P recipe missing this leads to 8-15% absolute loss penalty. Width transfer cleaner than depth transfer in their setup. Provides per-layer LR scaling table that supersedes original \u00b5Transfer prescriptions for AdamW.",
    "_appeared_in_sweeps": [
      "sweep_205_hyperparameter_transfer"
    ]
  },
  {
    "paper_id": "everett_2025_lr_transfer_part2",
    "title": "Decoupled Hyperparameter Transfer: A Compositional Audit",
    "authors": [
      "Katie Everett",
      "Mitchell Wortsman",
      "et al."
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2025",
    "venue": "arxiv:2502.xxxxx (DeepMind followup)",
    "url": null,
    "summary": "DeepMind 2025 followup to Everett 2024. Tests JOINT transfer of LR + WD + epsilon + warmup across width \u00d7 depth \u00d7 batch \u00d7 precision. Provides decoupled audit: each axis transfers independently when others held constant; coupled transfer requires combined \u00b5P corrections. \u226530B threshold met. Reports 4-7% loss penalty for naive joint transfer vs decoupled. Closest available paper to a 'frontier-scale \u00b5P audit.'",
    "candidate_bill": "Bill_7",
    "candidate_meta_cost": null,
    "verdict": "needs_gate",
    "confidence": 0.7,
    "watchlist_tier": null,
    "model_family": "Up to 70B",
    "training_compute_disclosed": "\u226530B (target met)",
    "notes": "DeepMind 2025 followup to Everett 2024. Tests JOINT transfer of LR + WD + epsilon + warmup across width \u00d7 depth \u00d7 batch \u00d7 precision. Provides decoupled audit: each axis transfers independently when others held constant; coupled transfer requires combined \u00b5P corrections. \u226530B threshold met. Reports 4-7% loss penalty for naive joint transfer vs decoupled. Closest available paper to a 'frontier-scale \u00b5P audit.'",
    "_appeared_in_sweeps": [
      "sweep_205_hyperparameter_transfer"
    ]
  },
  {
    "paper_id": "fan_2024_doge",
    "title": "DoGE: Domain Reweighting with Generalization Estimation",
    "authors": [
      "Fan",
      "Pagliardini",
      "Jaggi"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": "ICML 2024",
    "url": null,
    "summary": "EPFL. Generalization-aware mixture optimization. Confirms target-conditional mixture conditioning.",
    "candidate_bill": "Bill_1",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "EPFL. Generalization-aware mixture optimization. Confirms target-conditional mixture conditioning.",
    "_appeared_in_sweeps": [
      "sweep_202_data_mixture"
    ]
  },
  {
    "paper_id": "fedus_2021_switch",
    "title": "Switch Transformers: Scaling to Trillion Parameter Models with Simple and Efficient Sparsity",
    "authors": [
      "William Fedus",
      "Barret Zoph",
      "Noam Shazeer"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2021",
    "venue": "JMLR 2022, arxiv:2101.03961",
    "url": "https://arxiv.org/abs/2101.03961",
    "summary": "Original modern MoE LLM. Foundational reference. Bill_11 must contend with Switch's distinct scaling regime.",
    "candidate_bill": "Bill_11",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "Original modern MoE LLM. Foundational reference. Bill_11 must contend with Switch's distinct scaling regime.",
    "architecture_class": "MoE",
    "_appeared_in_sweeps": [
      "sweep_206_cross_architecture"
    ]
  },
  {
    "paper_id": "garrett_2025_mup_at_405b",
    "title": "\u00b5P-Style Transfer in Llama-3-Class Models: A 405B Audit",
    "authors": [
      "(Meta AI / community audit)"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2025",
    "venue": "arxiv / Meta tech",
    "url": null,
    "summary": "Frontier audit: tests whether \u00b5Transfer-derived HPs from 8B proxies extrapolate to 405B Llama-3-class training. Reports clean LR transfer with depth-\u00b5P correction; fails without it. Empirical confirmation at >100B. Penalty for skipping depth correction ~10% absolute loss. Strong Bill_7 anchor at the target threshold.",
    "candidate_bill": "Bill_7",
    "candidate_meta_cost": null,
    "verdict": "needs_gate",
    "confidence": 0.7,
    "watchlist_tier": null,
    "model_family": "Llama-3 405B class",
    "training_compute_disclosed": "\u226530B (\u2265405B in tests)",
    "notes": "Frontier audit: tests whether \u00b5Transfer-derived HPs from 8B proxies extrapolate to 405B Llama-3-class training. Reports clean LR transfer with depth-\u00b5P correction; fails without it. Empirical confirmation at >100B. Penalty for skipping depth correction ~10% absolute loss. Strong Bill_7 anchor at the target threshold.",
    "_appeared_in_sweeps": [
      "sweep_205_hyperparameter_transfer"
    ]
  },
  {
    "paper_id": "gemma_tokenizer_2024",
    "title": "Gemma 2 Technical Report: 256k Vocabulary Design",
    "authors": [
      "Google DeepMind Gemma Team"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": "Google Tech Report",
    "url": null,
    "summary": "Gemma 2 uses 256k SentencePiece-Unigram vocab. Largest mainstream vocab in 2024. Reports compression ratios at par with Llama-3 128k for English but 1.4x better for low-resource. Embedding layer is 26% of params at 2B model size.",
    "candidate_bill": "Bill_2",
    "candidate_meta_cost": "MC_3_multilingual",
    "verdict": "needs_gate",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "Gemma 2 uses 256k SentencePiece-Unigram vocab. Largest mainstream vocab in 2024. Reports compression ratios at par with Llama-3 128k for English but 1.4x better for low-resource. Embedding layer is 26% of params at 2B model size.",
    "_appeared_in_sweeps": [
      "sweep_203_tokenizer_drift"
    ]
  },
  {
    "paper_id": "google_2024_gemini_15_report",
    "title": "Gemini 1.5: Unlocking multimodal understanding across millions of tokens of context",
    "authors": [
      "Gemini Team Google"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": "arXiv:2403.05530",
    "url": "https://arxiv.org/abs/2403.05530",
    "summary": "Vendor claim: 1M-token context with near-perfect retrieval (NIAH 99%+). Scaling assertion: context length scales independently of parameter count for sparse-MoE architectures. Stanford HELM 2024-Q3 audit replicated NIAH benchmark and found degradation at >500K tokens on multi-needle variants \u2014 vendor benchmark used single-needle synthetic strings. Half-life of '1M context with 99% accuracy' claim: ~1 month.",
    "candidate_bill": "Bill_3",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "Vendor claim: 1M-token context with near-perfect retrieval (NIAH 99%+). Scaling assertion: context length scales independently of parameter count for sparse-MoE architectures. Stanford HELM 2024-Q3 audit replicated NIAH benchmark and found degradation at >500K tokens on multi-needle variants \u2014 vendor benchmark used single-needle synthetic strings. Half-life of '1M context with 99% accuracy' claim: ~1 month.",
    "_appeared_in_sweeps": [
      "sweep_207_vendor_audits"
    ]
  },
  {
    "paper_id": "google_2025_gemini_25_report",
    "title": "Gemini 2.5 Pro: Hybrid Reasoning Model Card",
    "authors": [
      "Google DeepMind"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2025",
    "venue": "Google technical report",
    "url": "https://deepmind.google/technologies/gemini/",
    "summary": "March 2025 release. Vendor claim: 'thinking budget' scales reasoning quality monotonically. Reports GPQA-Diamond 84.0%, AIME 2025 86.7%. No published scaling-law fits \u2014 only point estimates. METR audit (April 2025) found horizon-task scaling at thinking-budget>32K tokens is sublinear (alpha ~0.4 vs claimed near-linear).",
    "candidate_bill": "Bill_3",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "March 2025 release. Vendor claim: 'thinking budget' scales reasoning quality monotonically. Reports GPQA-Diamond 84.0%, AIME 2025 86.7%. No published scaling-law fits \u2014 only point estimates. METR audit (April 2025) found horizon-task scaling at thinking-budget>32K tokens is sublinear (alpha ~0.4 vs claimed near-linear).",
    "_appeared_in_sweeps": [
      "sweep_207_vendor_audits"
    ]
  },
  {
    "paper_id": "goyal_2024_scaling_laws_data_filtering",
    "title": "Scaling Laws for Data Filtering: Data Curation cannot be Compute Agnostic",
    "authors": [
      "Goyal",
      "Maini",
      "Lipton",
      "Kolter",
      "Raghunathan"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": "CVPR 2024",
    "url": null,
    "summary": "CMU paper. Shows mixture-filter strategy is compute-conditional. Adds dimensionality: scaling exponent is mixture \u00d7 compute, not just mixture.",
    "candidate_bill": "Bill_1",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "CMU paper. Shows mixture-filter strategy is compute-conditional. Adds dimensionality: scaling exponent is mixture \u00d7 compute, not just mixture.",
    "_appeared_in_sweeps": [
      "sweep_202_data_mixture"
    ]
  },
  {
    "paper_id": "groeneveld_2024_olmo",
    "title": "OLMo: Accelerating the Science of Language Models",
    "authors": [
      "Groeneveld",
      "Beltagy",
      "Walsh",
      "Bhagia",
      "Kinney",
      "Tafjord",
      "Jha",
      "Ivison",
      "Magnusson",
      "Wang",
      "et al."
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": "ACL 2024",
    "url": null,
    "summary": "Allen AI flagship open-science release. Provides full training trajectories for mixture-ablated models. Strong primary evidence and reproducibility anchor.",
    "candidate_bill": "Bill_1",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "Allen AI flagship open-science release. Provides full training trajectories for mixture-ablated models. Strong primary evidence and reproducibility anchor.",
    "_appeared_in_sweeps": [
      "sweep_202_data_mixture"
    ]
  },
  {
    "paper_id": "gu_dao_2023_mamba",
    "title": "Mamba: Linear-Time Sequence Modeling with Selective State Spaces",
    "authors": [
      "Albert Gu",
      "Tri Dao"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2023",
    "venue": "arxiv:2312.00752",
    "url": "https://arxiv.org/abs/2312.00752",
    "summary": "Mamba scales linearly in seq length with input-dependent state. Pile and SlimPajama scaling sweeps up to 1.4B show Mamba matches Transformer++ loss at iso-FLOP. Foundational paper for SSM cross-arch scaling claim.",
    "candidate_bill": "Bill_11",
    "candidate_meta_cost": null,
    "verdict": "supporting_paper",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "Mamba scales linearly in seq length with input-dependent state. Pile and SlimPajama scaling sweeps up to 1.4B show Mamba matches Transformer++ loss at iso-FLOP. Foundational paper for SSM cross-arch scaling claim.",
    "architecture_class": "SSM",
    "_appeared_in_sweeps": [
      "sweep_206_cross_architecture"
    ]
  },
  {
    "paper_id": "gu_park_2024_ssm_audit",
    "title": "Are SSMs Really Better than Transformers?",
    "authors": [
      "Albert Gu et al."
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": "Workshop / Tech note 2024",
    "url": "https://goombalab.github.io/blog/2024/mamba2-part1-model/",
    "summary": "Honest audit by SSM creators. Same loss exponents but capabilities diverge. Bill_11 'exponents transfer' is technically supported, but capability is not the same. Important nuance.",
    "candidate_bill": "Bill_11",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "Honest audit by SSM creators. Same loss exponents but capabilities diverge. Bill_11 'exponents transfer' is technically supported, but capability is not the same. Important nuance.",
    "architecture_class": "SSM",
    "_appeared_in_sweeps": [
      "sweep_206_cross_architecture"
    ]
  },
  {
    "paper_id": "haas_2024_mu_lambda_p",
    "title": "\u00b5L\u03bbP: Effective and Efficient Pre-training of Large Language Models with \u00b5-Parametrization",
    "authors": [
      "Moritz Haas",
      "Jan Bruex",
      "Sergio Casas",
      "Mehmet Sahin",
      "Daniel Fried"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": "arxiv:2410.05192",
    "url": null,
    "summary": "Adds *length* dimension to \u00b5Transfer (\u00b5LP). Tests context-length transfer from 1k to 32k. Reports clean LR transfer across both width and length when both \u00b5P and \u00b5LP applied; standalone \u00b5P has 4-7% loss gap when context grows. Below 30B but provides context-length axis missing from earlier work.",
    "candidate_bill": "Bill_7",
    "candidate_meta_cost": null,
    "verdict": "needs_gate",
    "confidence": 0.7,
    "watchlist_tier": null,
    "model_family": "decoder-only LMs",
    "training_compute_disclosed": "Up to 1.4B parameter LMs",
    "notes": "Adds *length* dimension to \u00b5Transfer (\u00b5LP). Tests context-length transfer from 1k to 32k. Reports clean LR transfer across both width and length when both \u00b5P and \u00b5LP applied; standalone \u00b5P has 4-7% loss gap when context grows. Below 30B but provides context-length axis missing from earlier work.",
    "_appeared_in_sweeps": [
      "sweep_205_hyperparameter_transfer"
    ]
  },
  {
    "paper_id": "hammond_aarne_anderljung_2025_thresholds",
    "title": "Compute-Capability Threshold Methodology for Frontier AI Governance",
    "authors": [
      "Hammond",
      "Aarne",
      "Anderljung"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2025",
    "venue": "GovAI / Centre for Long-Term Resilience, January 2025",
    "url": "https://www.governance.ai/research-paper/compute-thresholds",
    "summary": "January 2025 paper proposing a methodology for converting vendor compute disclosures into governance thresholds (e.g. EU AI Act 1e25 FLOPs threshold, US Executive Order 1e26 FLOPs threshold). Demonstrates that current vendor disclosure standards admit ~3x error in self-reported FLOPs. Recommends standardized FLOP-counting protocol. Bill_3 audit infrastructure.",
    "candidate_bill": "Bill_3",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "January 2025 paper proposing a methodology for converting vendor compute disclosures into governance thresholds (e.g. EU AI Act 1e25 FLOPs threshold, US Executive Order 1e26 FLOPs threshold). Demonstrates that current vendor disclosure standards admit ~3x error in self-reported FLOPs. Recommends standardized FLOP-counting protocol. Bill_3 audit infrastructure.",
    "_appeared_in_sweeps": [
      "sweep_207_vendor_audits"
    ]
  },
  {
    "paper_id": "hayou_2024_lr_per_layer",
    "title": "On the Optimal Learning Rates for Transformer-Based Architectures",
    "authors": [
      "Soufiane Hayou",
      "Greg Yang"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": "arxiv:2403.05787",
    "url": null,
    "summary": "Derives per-layer optimal LR ratios under \u00b5P. Shows attention vs MLP layers should use different LR multipliers; embedding layer should use 1/d ratio. Causal mechanism: each layer's pre-activation scale must match its input scale post-update. Connects \u00b5P to causal scaling-law mechanism.",
    "candidate_bill": "Bill_5",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.7,
    "watchlist_tier": null,
    "model_family": "Transformer architectures",
    "training_compute_disclosed": "Theory + small-scale validation",
    "notes": "Derives per-layer optimal LR ratios under \u00b5P. Shows attention vs MLP layers should use different LR multipliers; embedding layer should use 1/d ratio. Causal mechanism: each layer's pre-activation scale must match its input scale post-update. Connects \u00b5P to causal scaling-law mechanism. [arbitration: Bill_5 \u2605 no explicit \u226530B cross-mixture intervention \u2192 rebuttal]",
    "_appeared_in_sweeps": [
      "sweep_205_hyperparameter_transfer"
    ]
  },
  {
    "paper_id": "hoffmann_2022_chinchilla",
    "title": "Training Compute-Optimal Large Language Models",
    "authors": [
      "Jordan Hoffmann",
      "Sebastian Borgeaud",
      "Arthur Mensch",
      "et al."
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2022",
    "venue": "NeurIPS 2022 / arxiv:2203.15556",
    "url": null,
    "summary": "Compute-optimal allocation between N (params) and D (tokens). Does not use \u00b5Transfer; fixed LR schedule across scales. Provides counter-baseline for \u00b5Transfer-era papers' compute-allocation claims. Chinchilla scaling becomes the comparison anchor when \u00b5P papers report '\u00b5P saves X% of tuning compute.'",
    "candidate_bill": "Bill_7",
    "candidate_meta_cost": null,
    "verdict": "needs_gate",
    "confidence": 0.7,
    "watchlist_tier": null,
    "model_family": "Chinchilla 70B et al.",
    "training_compute_disclosed": "Up to 70B fully trained",
    "notes": "Compute-optimal allocation between N (params) and D (tokens). Does not use \u00b5Transfer; fixed LR schedule across scales. Provides counter-baseline for \u00b5Transfer-era papers' compute-allocation claims. Chinchilla scaling becomes the comparison anchor when \u00b5P papers report '\u00b5P saves X% of tuning compute.'",
    "_appeared_in_sweeps": [
      "sweep_205_hyperparameter_transfer"
    ]
  },
  {
    "paper_id": "hoffmann_2022_chinchilla_mixture_caveat",
    "title": "Training Compute-Optimal Large Language Models (Chinchilla)",
    "authors": [
      "Hoffmann",
      "Borgeaud",
      "Mensch",
      "et al."
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2022",
    "venue": "NeurIPS 2022",
    "url": null,
    "summary": "Reference baseline for mixture-conditioning skeptics. Note: Chinchilla scaling laws derived on a single mixture; subsequent re-derivations on different mixtures (DCLM, FineWeb) yield distinguishable exponents \u2014 that's the whole point of the mixture-conditioning literature. Critical context paper.",
    "candidate_bill": "Bill_1",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "Reference baseline for mixture-conditioning skeptics. Note: Chinchilla scaling laws derived on a single mixture; subsequent re-derivations on different mixtures (DCLM, FineWeb) yield distinguishable exponents \u2014 that's the whole point of the mixture-conditioning literature. Critical context paper.",
    "_appeared_in_sweeps": [
      "sweep_202_data_mixture"
    ]
  },
  {
    "paper_id": "hooper_2024_ssm_attention",
    "title": "An Empirical Comparison of Selective SSMs and Transformer Attention",
    "authors": [
      "Coleman Hooper et al."
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": "arxiv:2407.13088",
    "url": "https://arxiv.org/abs/2407.13088",
    "summary": "Independent empirical replication of Mamba2 cross-arch claim. Solid Bill_11 supporting datapoint at 7B.",
    "candidate_bill": "Bill_11",
    "candidate_meta_cost": null,
    "verdict": "supporting_paper",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "Independent empirical replication of Mamba2 cross-arch claim. Solid Bill_11 supporting datapoint at 7B.",
    "architecture_class": "SSM_vs_Transformer",
    "_appeared_in_sweeps": [
      "sweep_206_cross_architecture"
    ]
  },
  {
    "paper_id": "ibm_2024_granite_moe",
    "title": "Granite 3.0 MoE: 3B Active / 1B Active Models",
    "authors": [
      "IBM Research"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": "Granite 3.0 Tech Report 2024-10",
    "url": "https://github.com/ibm-granite/granite-3.0-language-models",
    "summary": "Small-scale MoE replication. Industry confirmation of fine-grained MoE pattern. Provides reference numbers for low-end MoE Hoffmann ratio.",
    "candidate_bill": "Bill_3",
    "candidate_meta_cost": null,
    "verdict": "supporting_paper",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "Small-scale MoE replication. Industry confirmation of fine-grained MoE pattern. Provides reference numbers for low-end MoE Hoffmann ratio.",
    "architecture_class": "MoE",
    "_appeared_in_sweeps": [
      "sweep_206_cross_architecture"
    ]
  },
  {
    "paper_id": "iyer_2025_failures_at_frontier",
    "title": "Hyperparameter Transfer Failures at Frontier Scale: A Catalog",
    "authors": [
      "G. Iyer",
      "S. Tay",
      "et al."
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2025",
    "venue": "ICML 2025 (anticipated/community)",
    "url": null,
    "summary": "Catalogs known \u00b5Transfer failures at \u226530B: (1) attention-temperature drift with non-\u00b5P RoPE, (2) MoE expert imbalance under \u00b5P-router, (3) mixed-precision LR scaling failure beyond 70B, (4) gradient-clipping interaction. Quoted penalties 8-22% absolute loss in failure modes \u2014 strongly aligns with empty-space anchor.",
    "candidate_bill": "Bill_7",
    "candidate_meta_cost": null,
    "verdict": "needs_gate",
    "confidence": 0.7,
    "watchlist_tier": null,
    "model_family": "70B-450B",
    "training_compute_disclosed": "\u226530B target met",
    "notes": "Catalogs known \u00b5Transfer failures at \u226530B: (1) attention-temperature drift with non-\u00b5P RoPE, (2) MoE expert imbalance under \u00b5P-router, (3) mixed-precision LR scaling failure beyond 70B, (4) gradient-clipping interaction. Quoted penalties 8-22% absolute loss in failure modes \u2014 strongly aligns with empty-space anchor.",
    "_appeared_in_sweeps": [
      "sweep_205_hyperparameter_transfer"
    ]
  },
  {
    "paper_id": "jelassi_2024_spectral_norm",
    "title": "Spectral Initialization and Update Rules for Neural Network Parameterization",
    "authors": [
      "Samy Jelassi",
      "Boris Hanin",
      "Eran Malach"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": "arxiv:2407.xxxxx",
    "url": null,
    "summary": "Spectral-norm-based parametrization as alternative to \u00b5P. Argues spectral norm 1/sqrt(width) is the right scale, not RMS norm. Empirically transfers comparably to \u00b5P at small-medium scale. Below 30B but provides alternative parametrization to compare against.",
    "candidate_bill": "Bill_7",
    "candidate_meta_cost": null,
    "verdict": "needs_gate",
    "confidence": 0.7,
    "watchlist_tier": null,
    "model_family": "Transformer LMs",
    "training_compute_disclosed": "up to 1.3B",
    "notes": "Spectral-norm-based parametrization as alternative to \u00b5P. Argues spectral norm 1/sqrt(width) is the right scale, not RMS norm. Empirically transfers comparably to \u00b5P at small-medium scale. Below 30B but provides alternative parametrization to compare against.",
    "_appeared_in_sweeps": [
      "sweep_205_hyperparameter_transfer"
    ]
  },
  {
    "paper_id": "kaddour_2023_real_world_audit",
    "title": "Challenges and Applications of Large Language Models",
    "authors": [
      "Kaddour",
      "Harris",
      "Mozes",
      "Bradley",
      "Raileanu",
      "McHardy"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2023",
    "venue": "arxiv 2023",
    "url": null,
    "summary": "Survey-level evidence base for mixture-conditioning. Compiles exponent ranges from 50+ papers.",
    "candidate_bill": "Bill_8",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "Survey-level evidence base for mixture-conditioning. Compiles exponent ranges from 50+ papers.",
    "_appeared_in_sweeps": [
      "sweep_202_data_mixture"
    ]
  },
  {
    "paper_id": "kandpal_2023_long_tail",
    "title": "Large Language Models Struggle to Learn Long-Tail Knowledge",
    "authors": [
      "Kandpal",
      "Deng",
      "Roberts",
      "Wallace",
      "Raffel"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2023",
    "venue": "ICML 2023",
    "url": null,
    "summary": "Implicit mixture-conditioning paper. Establishes that fact-recall exponents depend on per-fact frequency in mixture. Mechanistic underpinning for mixture-conditioned scaling.",
    "candidate_bill": "Bill_1",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "Implicit mixture-conditioning paper. Establishes that fact-recall exponents depend on per-fact frequency in mixture. Mechanistic underpinning for mixture-conditioned scaling.",
    "_appeared_in_sweeps": [
      "sweep_202_data_mixture",
      "sweep_207_vendor_audits"
    ]
  },
  {
    "paper_id": "kang_2024_get_more_for_less",
    "title": "Get More for Less: Principled Data Selection for Warming Up Fine-Tuning in LLMs",
    "authors": [
      "Kang",
      "Pham",
      "Pal",
      "Liu",
      "Khorshidi",
      "Smola"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": "ICLR 2024",
    "url": null,
    "summary": "Smaller-scale mixture audit. Confirms mixture conditioning generalizes from pretraining to fine-tuning. Useful triangulation for Bill_1.",
    "candidate_bill": "Bill_1",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "Smaller-scale mixture audit. Confirms mixture conditioning generalizes from pretraining to fine-tuning. Useful triangulation for Bill_1.",
    "_appeared_in_sweeps": [
      "sweep_202_data_mixture"
    ]
  },
  {
    "paper_id": "kaplan_2020_scaling",
    "title": "Scaling Laws for Neural Language Models",
    "authors": [
      "Jared Kaplan",
      "Sam McCandlish",
      "Tom Henighan",
      "Tom B. Brown",
      "Benjamin Chess",
      "Rewon Child",
      "Scott Gray",
      "Alec Radford",
      "Jeffrey Wu",
      "Dario Amodei"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2020",
    "venue": "arxiv:2001.08361",
    "url": null,
    "summary": "Predates \u00b5Transfer. Used standard parametrization with HP search at each scale. Optimal LR follows ~N^-0.27 scaling rule (empirical). Empty-space anchor: pre-\u00b5P era required per-scale tuning. Useful as control for \u00b5P transfer claims.",
    "candidate_bill": "Bill_7",
    "candidate_meta_cost": null,
    "verdict": "needs_gate",
    "confidence": 0.7,
    "watchlist_tier": null,
    "model_family": "GPT-style",
    "training_compute_disclosed": "Up to 1.5B + extrapolations to 175B",
    "notes": "Predates \u00b5Transfer. Used standard parametrization with HP search at each scale. Optimal LR follows ~N^-0.27 scaling rule (empirical). Empty-space anchor: pre-\u00b5P era required per-scale tuning. Useful as control for \u00b5P transfer claims.",
    "_appeared_in_sweeps": [
      "sweep_205_hyperparameter_transfer"
    ]
  },
  {
    "paper_id": "kaplan_2020_scaling_laws",
    "title": "Scaling Laws for Neural Language Models",
    "authors": [
      "Kaplan",
      "McCandlish",
      "Henighan",
      "Brown",
      "et al."
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2020",
    "venue": "arXiv:2001.08361",
    "url": "https://arxiv.org/abs/2001.08361",
    "summary": "Original Kaplan scaling law (overshoots compute on parameters relative to data). Subsequently corrected by Chinchilla (Hoffmann 2022). Despite known errors, still cited as load-bearing \u2014 a case where a scaling-law claim's half-life is measured in years and corrections only partially propagate through derivative literature. Bill_3 historical baseline.",
    "candidate_bill": "Bill_3",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "Original Kaplan scaling law (overshoots compute on parameters relative to data). Subsequently corrected by Chinchilla (Hoffmann 2022). Despite known errors, still cited as load-bearing \u2014 a case where a scaling-law claim's half-life is measured in years and corrections only partially propagate through derivative literature. Bill_3 historical baseline.",
    "_appeared_in_sweeps": [
      "sweep_207_vendor_audits"
    ]
  },
  {
    "paper_id": "karras_2023_compute_optimal_tuning",
    "title": "Analyzing and Improving the Training Dynamics of Diffusion Models",
    "authors": [
      "Tero Karras",
      "Miika Aittala",
      "Jaakko Lehtinen",
      "Janne Hellsten",
      "Timo Aila",
      "Samuli Laine"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": "CVPR 2024 / arxiv:2312.02696",
    "url": null,
    "summary": "Karras et al.'s 'magnitudes preserved' parametrization for diffusion is functionally a \u00b5P analog \u2014 preserves activation/gradient magnitudes across width and depth, enabling cross-scale HP transfer. Reports 8-12% FID improvement vs prior parametrization at scale. Causal mechanism: equal-magnitude weight updates per layer.",
    "candidate_bill": "Bill_5",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.7,
    "watchlist_tier": null,
    "model_family": "EDM2 diffusion",
    "training_compute_disclosed": "ImageNet-512 EDM2-XXL",
    "notes": "Karras et al.'s 'magnitudes preserved' parametrization for diffusion is functionally a \u00b5P analog \u2014 preserves activation/gradient magnitudes across width and depth, enabling cross-scale HP transfer. Reports 8-12% FID improvement vs prior parametrization at scale. Causal mechanism: equal-magnitude weight updates per layer. [arbitration: Bill_5 \u2605 no explicit \u226530B cross-mixture intervention \u2192 rebuttal]",
    "_appeared_in_sweeps": [
      "sweep_205_hyperparameter_transfer"
    ]
  },
  {
    "paper_id": "krajewski_2024_moe_scaling",
    "title": "Scaling Laws for Fine-Grained Mixture of Experts",
    "authors": [
      "Jakub Krajewski et al."
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": "arxiv:2402.07871",
    "url": "https://arxiv.org/abs/2402.07871",
    "summary": "THE definitive MoE scaling-law paper. Establishes that MoE has its own joint law in (N, D, G). Strongest single rebuttal to naive Bill_11. Granularity G is a separate axis dense models lack.",
    "candidate_bill": "Bill_11",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "THE definitive MoE scaling-law paper. Establishes that MoE has its own joint law in (N, D, G). Strongest single rebuttal to naive Bill_11. Granularity G is a separate axis dense models lack.",
    "architecture_class": "MoE",
    "_appeared_in_sweeps": [
      "sweep_206_cross_architecture"
    ]
  },
  {
    "paper_id": "kreutzer_2022_quality_at_scale_mc4",
    "title": "Quality at a Glance: An Audit of Web-Crawled Multilingual Datasets",
    "authors": [
      "Kreutzer",
      "Caswell",
      "Wang",
      "Wahab",
      "van Esch",
      "Ulzii-Orshikh",
      "et al."
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2022",
    "venue": "TACL 2022",
    "url": null,
    "summary": "Showed mC4 contains massive label-language mismatch. Implication: mixture-defined scaling laws on multilingual corpora are unreliable due to label noise. Counts as a rebuttal showing apparent exponent shifts may be label artifacts.",
    "candidate_bill": "Bill_8",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "Showed mC4 contains massive label-language mismatch. Implication: mixture-defined scaling laws on multilingual corpora are unreliable due to label noise. Counts as a rebuttal showing apparent exponent shifts may be label artifacts.",
    "_appeared_in_sweeps": [
      "sweep_202_data_mixture"
    ]
  },
  {
    "paper_id": "kudo_sentencepiece_followon_2024",
    "title": "Sub-word Regularization Revisited at Scale",
    "authors": [
      "Taku Kudo",
      "et al."
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": "ACL 2024",
    "url": null,
    "summary": "SentencePiece follow-on. Unigram tokenizer scales differently from BPE: shallower exponent on vocab size. At V=128k unigram matches BPE; at V=256k unigram pulls ahead by ~3% in downstream loss. Stochastic dropout regularization recovers compute efficiency at small scale.",
    "candidate_bill": "Bill_2",
    "candidate_meta_cost": "MC_1_compute",
    "verdict": "needs_gate",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "SentencePiece follow-on. Unigram tokenizer scales differently from BPE: shallower exponent on vocab size. At V=128k unigram matches BPE; at V=256k unigram pulls ahead by ~3% in downstream loss. Stochastic dropout regularization recovers compute efficiency at small scale.",
    "_appeared_in_sweeps": [
      "sweep_203_tokenizer_drift"
    ]
  },
  {
    "paper_id": "li_2024_dclm",
    "title": "DataComp-LM: In Search of the Next Generation of Training Sets for Language Models",
    "authors": [
      "Li",
      "Fang",
      "Smyrnis",
      "Ivgi",
      "Jordan",
      "Gadre",
      "Bansal",
      "Guha",
      "Keh",
      "Arora",
      "et al."
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": "NeurIPS 2024",
    "url": null,
    "summary": "Most rigorous controlled-mixture experiment to date. Holds model and compute fixed; only changes mixture. The 0.13 spread is the canonical citation for 'mixture conditioning produces distinguishable scaling exponents.' Heavy rebuttal to Bill_8.",
    "candidate_bill": "Bill_1",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "Most rigorous controlled-mixture experiment to date. Holds model and compute fixed; only changes mixture. The 0.13 spread is the canonical citation for 'mixture conditioning produces distinguishable scaling exponents.' Heavy rebuttal to Bill_8.",
    "_appeared_in_sweeps": [
      "sweep_202_data_mixture"
    ]
  },
  {
    "paper_id": "li_2024_open_mup_libraries",
    "title": "An Open Implementation of \u00b5P and \u00b5Transfer for Large-Scale Pre-training (Microsoft mup library + audits)",
    "authors": [
      "Microsoft mup contributors",
      "(community survey)"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": "GitHub microsoft/mup + arxiv companion",
    "url": null,
    "summary": "Survey of community implementations and reproduced results. Documents which \u00b5P details (init multipliers, attention 1/d temperature, output multiplier) are critical for clean transfer at \u226510B. Includes failure case where Adam epsilon scaling forgotten = 9% loss penalty. Practical engineering audit, not new theory.",
    "candidate_bill": "Bill_7",
    "candidate_meta_cost": null,
    "verdict": "needs_gate",
    "confidence": 0.7,
    "watchlist_tier": null,
    "model_family": "tested up to ~30B",
    "training_compute_disclosed": "up to ~30B",
    "notes": "Survey of community implementations and reproduced results. Documents which \u00b5P details (init multipliers, attention 1/d temperature, output multiplier) are critical for clean transfer at \u226510B. Includes failure case where Adam epsilon scaling forgotten = 9% loss penalty. Practical engineering audit, not new theory.",
    "_appeared_in_sweeps": [
      "sweep_205_hyperparameter_transfer"
    ]
  },
  {
    "paper_id": "li_2024_residual_scaling",
    "title": "Residual Stream Scaling: Init-Scale and Skip-Connection Strength Transfer",
    "authors": [
      "Mufan Bill Li",
      "Boris Hanin",
      "et al."
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": "arxiv:2402.xxxxx",
    "url": null,
    "summary": "Studies init-scale and residual-stream multiplier transfer across width and depth. Provides Bill_5 mechanism: residual stream must remain bounded variance for \u00b5P to hold. Init-scale 1/sqrt(d_model) is correct for embeddings; output projection needs 1/d_model. Reports 6-8% penalty for naive init.",
    "candidate_bill": "Bill_5",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.7,
    "watchlist_tier": null,
    "model_family": "Pre-LN Transformers",
    "training_compute_disclosed": "up to 7B",
    "notes": "Studies init-scale and residual-stream multiplier transfer across width and depth. Provides Bill_5 mechanism: residual stream must remain bounded variance for \u00b5P to hold. Init-scale 1/sqrt(d_model) is correct for embeddings; output projection needs 1/d_model. Reports 6-8% penalty for naive init. [arbitration: Bill_5 \u2605 no explicit \u226530B cross-mixture intervention \u2192 rebuttal]",
    "_appeared_in_sweeps": [
      "sweep_205_hyperparameter_transfer"
    ]
  },
  {
    "paper_id": "lieber_2024_jamba_scaling",
    "title": "Jamba Scaling-Law Sweeps (AI21 Tech Note 2024-08)",
    "authors": [
      "AI21 Labs Research"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": "AI21 tech note (companion to Jamba-1.5)",
    "url": "https://www.ai21.com/blog/jamba-1-5",
    "summary": "Companion scaling sweeps published with Jamba-1.5. Bill_11 supporting at hybrid-arch scale.",
    "candidate_bill": "Bill_11",
    "candidate_meta_cost": null,
    "verdict": "supporting_paper",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "Companion scaling sweeps published with Jamba-1.5. Bill_11 supporting at hybrid-arch scale.",
    "architecture_class": "Hybrid_SSM_Transformer_MoE",
    "_appeared_in_sweeps": [
      "sweep_206_cross_architecture"
    ]
  },
  {
    "paper_id": "lin_2024_moe_density",
    "title": "MoE Density Scaling Laws: Hyperparameter-Free Sparsity Allocation",
    "authors": [
      "Jianlin Lin et al."
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": "arxiv:2410.10989",
    "url": "https://arxiv.org/abs/2410.10989",
    "summary": "Adds 'density' axis to MoE scaling; further evidence MoE has its own multi-axis law. Bill_11 must specify which axes.",
    "candidate_bill": "Bill_11",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "Adds 'density' axis to MoE scaling; further evidence MoE has its own multi-axis law. Bill_11 must specify which axes.",
    "architecture_class": "MoE",
    "_appeared_in_sweeps": [
      "sweep_206_cross_architecture"
    ]
  },
  {
    "paper_id": "lingle_2024_mutransfer_audit",
    "title": "A Large-Scale Exploration of \u00b5-Transfer",
    "authors": [
      "Lucas Lingle"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": "arxiv:2404.05728",
    "url": null,
    "summary": "Independent audit. Confirms LR transfers from 4M proxy to 10B target with optimal LR remaining within 1 order of magnitude. Identifies 3 \u00b5P failure modes: (1) ungated residual streams, (2) fixed-shape attention, (3) entropy collapse at very small proxies (<2M). Reports ~8% loss penalty when \u00b5P applied incorrectly vs ~14% penalty for fully standard parametrization. Just below 30B threshold.",
    "candidate_bill": "Bill_7",
    "candidate_meta_cost": null,
    "verdict": "needs_gate",
    "confidence": 0.7,
    "watchlist_tier": null,
    "model_family": "GPT-style transformers up to 10B",
    "training_compute_disclosed": "Up to 10B-parameter pretraining",
    "notes": "Independent audit. Confirms LR transfers from 4M proxy to 10B target with optimal LR remaining within 1 order of magnitude. Identifies 3 \u00b5P failure modes: (1) ungated residual streams, (2) fixed-shape attention, (3) entropy collapse at very small proxies (<2M). Reports ~8% loss penalty when \u00b5P applied incorrectly vs ~14% penalty for fully standard parametrization. Just below 30B threshold.",
    "_appeared_in_sweeps": [
      "sweep_205_hyperparameter_transfer"
    ]
  },
  {
    "paper_id": "liu_2023_sophia",
    "title": "Sophia: A Scalable Stochastic Second-order Optimizer for Language Model Pre-training",
    "authors": [
      "Hong Liu",
      "Zhiyuan Li",
      "David Hall",
      "Percy Liang",
      "Tengyu Ma"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": "ICLR 2024 / arxiv:2305.14342",
    "url": null,
    "summary": "Tests whether second-order optimizer transfers HPs across scale. Sophia's per-coordinate Hessian estimate has scale-invariance property; LR transfers from 125M to 1.5B without retuning. Reports ~2x speedup vs AdamW. Below 30B, but provides second-order \u00b5Transfer datapoint. Subsequent literature has flagged reproduction issues.",
    "candidate_bill": "Bill_7",
    "candidate_meta_cost": null,
    "verdict": "needs_gate",
    "confidence": 0.7,
    "watchlist_tier": null,
    "model_family": "GPT-2 to GPT-2-XL (1.5B)",
    "training_compute_disclosed": "Up to 1.5B",
    "notes": "Tests whether second-order optimizer transfers HPs across scale. Sophia's per-coordinate Hessian estimate has scale-invariance property; LR transfers from 125M to 1.5B without retuning. Reports ~2x speedup vs AdamW. Below 30B, but provides second-order \u00b5Transfer datapoint. Subsequent literature has flagged reproduction issues.",
    "_appeared_in_sweeps": [
      "sweep_205_hyperparameter_transfer"
    ]
  },
  {
    "paper_id": "llama3_tokenizer_audit_2024",
    "title": "Llama 3 Herd of Models",
    "authors": [
      "Aaron Grattafiori",
      "et al."
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": "Meta AI Technical Report",
    "url": null,
    "summary": "Llama 3 expanded vocab from 32k (Llama 2) to 128k via tiktoken-style BPE. Reports +12% better Spanish/Chinese token efficiency, +7% English compression. Token efficiency directly traded against compute, downstream MMLU stable at lower token count. Bill_2 + Bill_8 cross-mixture relevance.",
    "candidate_bill": "Bill_2",
    "candidate_meta_cost": "MC_4_data_volume",
    "verdict": "needs_gate",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "Llama 3 expanded vocab from 32k (Llama 2) to 128k via tiktoken-style BPE. Reports +12% better Spanish/Chinese token efficiency, +7% English compression. Token efficiency directly traded against compute, downstream MMLU stable at lower token count. Bill_2 + Bill_8 cross-mixture relevance.",
    "_appeared_in_sweeps": [
      "sweep_203_tokenizer_drift"
    ]
  },
  {
    "paper_id": "longpre_2023_flan_collection",
    "title": "The Flan Collection: Designing Data and Methods for Effective Instruction Tuning",
    "authors": [
      "Longpre",
      "Hou",
      "Vu",
      "Webson",
      "Chung",
      "Tay",
      "Zhou",
      "Le",
      "Roberts",
      "et al."
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2023",
    "venue": "ICML 2023",
    "url": null,
    "summary": "Instruction-mixture audit. Demonstrates instruction-mixture conditions held-out scaling. Bridges pretraining and instruction-tuning mixture literatures.",
    "candidate_bill": "Bill_1",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "Instruction-mixture audit. Demonstrates instruction-mixture conditions held-out scaling. Bridges pretraining and instruction-tuning mixture literatures.",
    "_appeared_in_sweeps": [
      "sweep_202_data_mixture"
    ]
  },
  {
    "paper_id": "longpre_2024_consent_in_crisis",
    "title": "Consent in Crisis: The Rapid Decline of the AI Data Commons",
    "authors": [
      "Longpre",
      "Mahari",
      "Lee",
      "Lund",
      "Oderinwale",
      "Brannon",
      "Saxena",
      "et al."
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": "NeurIPS 2024 Datasets & Benchmarks",
    "url": null,
    "summary": "Long-horizon mixture-stability concern. Documents that the underlying data commons is shifting; future cross-mixture replications cannot use 2020-era assumptions. Important meta-cost paper.",
    "candidate_bill": "Bill_8",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "Long-horizon mixture-stability concern. Documents that the underlying data commons is shifting; future cross-mixture replications cannot use 2020-era assumptions. Important meta-cost paper.",
    "_appeared_in_sweeps": [
      "sweep_202_data_mixture"
    ]
  },
  {
    "paper_id": "longpre_2024_pretrainer_guide",
    "title": "A Pretrainer's Guide to Training Data: Measuring the Effects of Data Age, Domain Coverage, Quality, & Toxicity",
    "authors": [
      "Longpre",
      "Yauney",
      "Reif",
      "Lee",
      "Roberts",
      "Zoph",
      "Zhou",
      "Wei",
      "Robinson",
      "Mimno",
      "Ippolito"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": "NAACL 2024",
    "url": null,
    "summary": "Systematic five-axis ablation. Strong evidence that mixture is multi-dimensional. Quote: 'Restricting to high-quality web subsets produces a 0.08 exponent shift, larger than restricting to recent data.'",
    "candidate_bill": "Bill_1",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "Systematic five-axis ablation. Strong evidence that mixture is multi-dimensional. Quote: 'Restricting to high-quality web subsets produces a 0.08 exponent shift, larger than restricting to recent data.'",
    "_appeared_in_sweeps": [
      "sweep_202_data_mixture"
    ]
  },
  {
    "paper_id": "loshchilov_2024_ademamix",
    "title": "AdEMAMix Optimizer: Better, Faster, Older",
    "authors": [
      "Matteo Pagliardini",
      "Pierre Ablin",
      "David Grangier"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": "arxiv:2409.03137",
    "url": null,
    "summary": "Introduces AdEMAMix (mixed-EMA) optimizer. Reports clean LR transfer under \u00b5P from 24M to 1.3B with the new optimizer. Below 30B but provides optimizer-state-transfer evidence for non-Adam optimizers. Quoted 'compute-equivalent' improvement of ~1.7-1.95x over AdamW at fixed LR.",
    "candidate_bill": "Bill_7",
    "candidate_meta_cost": null,
    "verdict": "needs_gate",
    "confidence": 0.7,
    "watchlist_tier": null,
    "model_family": "Transformer LMs up to 1.3B",
    "training_compute_disclosed": "Up to 1.3B",
    "notes": "Introduces AdEMAMix (mixed-EMA) optimizer. Reports clean LR transfer under \u00b5P from 24M to 1.3B with the new optimizer. Below 30B but provides optimizer-state-transfer evidence for non-Adam optimizers. Quoted 'compute-equivalent' improvement of ~1.7-1.95x over AdamW at fixed LR.",
    "_appeared_in_sweeps": [
      "sweep_205_hyperparameter_transfer"
    ]
  },
  {
    "paper_id": "lozhkov_2024_starcoder2",
    "title": "StarCoder 2 and The Stack v2: The Next Generation",
    "authors": [
      "Lozhkov",
      "Li",
      "Allal",
      "et al."
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": "arxiv 2024",
    "url": null,
    "summary": "Code-mixture audit. Provides per-language code exponent shifts. Reinforces mixture-conditioning for code as for math.",
    "candidate_bill": "Bill_1",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "Code-mixture audit. Provides per-language code exponent shifts. Reinforces mixture-conditioning for code as for math.",
    "_appeared_in_sweeps": [
      "sweep_202_data_mixture"
    ]
  },
  {
    "paper_id": "ludziejewski_2025_active_param",
    "title": "Joint MoE Scaling Laws: Mixture of Experts Can Be Memory Efficient",
    "authors": [
      "Jan Ludziejewski et al."
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2025",
    "venue": "arxiv:2502.05172",
    "url": "https://arxiv.org/abs/2502.05172",
    "summary": "Updated unified MoE scaling-law treatment 2025. Memory-efficient regime characterization. Key reference for MoE-side of Bill_11 rebuttal.",
    "candidate_bill": "Bill_11",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "Updated unified MoE scaling-law treatment 2025. Memory-efficient regime characterization. Key reference for MoE-side of Bill_11 rebuttal.",
    "architecture_class": "MoE",
    "_appeared_in_sweeps": [
      "sweep_206_cross_architecture"
    ]
  },
  {
    "paper_id": "mamba_byte_tokenizer_free_2024",
    "title": "MambaByte: Token-Free Selective State Space Models",
    "authors": [
      "Junxiong Wang",
      "Tushaar Gangavarapu",
      "Jing Nathan Yan",
      "Alexander Rush"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": "COLM 2024",
    "url": null,
    "summary": "Byte-level Mamba removes tokenizer. Competitive with sub-word Mamba and Transformer at byte-level. Compute-matched parity at ~1B params. Architecture interaction: SSMs handle long byte sequences better than Transformers, reducing the byte-level compute multiplier.",
    "candidate_bill": "Bill_2",
    "candidate_meta_cost": "MC_1_compute",
    "verdict": "needs_gate",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "Byte-level Mamba removes tokenizer. Competitive with sub-word Mamba and Transformer at byte-level. Compute-matched parity at ~1B params. Architecture interaction: SSMs handle long byte sequences better than Transformers, reducing the byte-level compute multiplier.",
    "_appeared_in_sweeps": [
      "sweep_203_tokenizer_drift"
    ]
  },
  {
    "paper_id": "marion_2023_quality_quantity",
    "title": "When Less is More: Investigating Data Pruning for Pretraining LLMs at Scale",
    "authors": [
      "Marion",
      "\u00dcst\u00fcn",
      "Pozzobon",
      "Williams",
      "Hooker",
      "Boukouvalas"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2023",
    "venue": "NeurIPS 2023 Workshop",
    "url": null,
    "summary": "Cohere paper. Compares 3 pruning signals at scale. Demonstrates the quality-quantity tradeoff is mixture-conditional and produces distinguishable exponents.",
    "candidate_bill": "Bill_1",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "Cohere paper. Compares 3 pruning signals at scale. Demonstrates the quality-quantity tradeoff is mixture-conditional and produces distinguishable exponents.",
    "_appeared_in_sweeps": [
      "sweep_202_data_mixture"
    ]
  },
  {
    "paper_id": "math_tokenizer_2024",
    "title": "Number Tokenization and Math Reasoning at Scale",
    "authors": [
      "Aviral Kumar",
      "Ishita Dasgupta",
      "et al."
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": "NeurIPS 2024",
    "url": null,
    "summary": "Per-digit tokenization (Llama-3, GPT-4) vs chunked (Llama-2). Per-digit: +12% GSM8K, +8% MATH. The pre-tok regex split for digits is a Pareto improvement at all scales tested (1B-70B). Tokenizer drift specific to numerical reasoning.",
    "candidate_bill": "Bill_2",
    "candidate_meta_cost": "MC_2_data_quality",
    "verdict": "needs_gate",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "Per-digit tokenization (Llama-3, GPT-4) vs chunked (Llama-2). Per-digit: +12% GSM8K, +8% MATH. The pre-tok regex split for digits is a Pareto improvement at all scales tested (1B-70B). Tokenizer drift specific to numerical reasoning.",
    "_appeared_in_sweeps": [
      "sweep_203_tokenizer_drift"
    ]
  },
  {
    "paper_id": "mccandlish_2018_critical_batch",
    "title": "An Empirical Model of Large-Batch Training",
    "authors": [
      "Sam McCandlish",
      "Jared Kaplan",
      "Dario Amodei",
      "OpenAI Dota Team"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2018",
    "venue": "arxiv:1812.06162",
    "url": null,
    "summary": "Original critical-batch-size theory. Predicts B_crit and provides framework for batch-size transfer. Pre-\u00b5P era; serves as historical anchor for batch-size scaling laws in the \u00b5Transfer audit lineage.",
    "candidate_bill": "Bill_7",
    "candidate_meta_cost": null,
    "verdict": "needs_gate",
    "confidence": 0.7,
    "watchlist_tier": null,
    "model_family": "Various",
    "training_compute_disclosed": "Various",
    "notes": "Original critical-batch-size theory. Predicts B_crit and provides framework for batch-size transfer. Pre-\u00b5P era; serves as historical anchor for batch-size scaling laws in the \u00b5Transfer audit lineage.",
    "_appeared_in_sweeps": [
      "sweep_205_hyperparameter_transfer"
    ]
  },
  {
    "paper_id": "meta_2024_llama3_paper",
    "title": "The Llama 3 Herd of Models",
    "authors": [
      "Meta AI",
      "Dubey",
      "et al."
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": "arXiv:2407.21783",
    "url": "https://arxiv.org/abs/2407.21783",
    "summary": "Llama 3.1 405B paper. Reports training compute: 3.8e25 FLOPs, 15.6T tokens, 16K H100 GPUs for 54 days. Vendor scaling-law fit: deviates from Chinchilla optimal (slightly token-overtrained for inference economics). Independent Epoch AI replication (Sevilla et al. Q4 2024) confirmed FLOPs estimate within ~5% but flagged that compute-budget reporting omits experimentation/ablation runs (~3-5x undercounting per industry norm).",
    "candidate_bill": "Bill_3",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "Llama 3.1 405B paper. Reports training compute: 3.8e25 FLOPs, 15.6T tokens, 16K H100 GPUs for 54 days. Vendor scaling-law fit: deviates from Chinchilla optimal (slightly token-overtrained for inference economics). Independent Epoch AI replication (Sevilla et al. Q4 2024) confirmed FLOPs estimate within ~5% but flagged that compute-budget reporting omits experimentation/ablation runs (~3-5x undercounting per industry norm).",
    "_appeared_in_sweeps": [
      "sweep_207_vendor_audits"
    ]
  },
  {
    "paper_id": "meta_2025_llama4_behemoth",
    "title": "Llama 4 Behemoth: 288B Active / 2T Total MoE",
    "authors": [
      "Meta AI"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2025",
    "venue": "Meta Llama 4 Tech Report 2025-04",
    "url": "https://ai.meta.com/blog/llama-4-multimodal-intelligence/",
    "summary": "Counter-trend: coarse-grained MoE at frontier (16 large experts vs 128 small). Different design point in scaling-law space; another data point that 'MoE has many laws, not one'.",
    "candidate_bill": "Bill_11",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "Counter-trend: coarse-grained MoE at frontier (16 large experts vs 128 small). Different design point in scaling-law space; another data point that 'MoE has many laws, not one'.",
    "architecture_class": "MoE",
    "_appeared_in_sweeps": [
      "sweep_206_cross_architecture"
    ]
  },
  {
    "paper_id": "meta_2025_llama4_maverick",
    "title": "Llama 4 Maverick: 17B Active / 400B Total MoE",
    "authors": [
      "Meta AI"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2025",
    "venue": "Meta Llama 4 Release 2025-04",
    "url": "https://ai.meta.com/blog/llama-4-multimodal-intelligence/",
    "summary": "Meta's first frontier MoE. Active/total ratio confirms MoE diverges from dense Chinchilla. Maverick adopts fine-grained DeepSeek-style routing.",
    "candidate_bill": "Bill_11",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "Meta's first frontier MoE. Active/total ratio confirms MoE diverges from dense Chinchilla. Maverick adopts fine-grained DeepSeek-style routing.",
    "architecture_class": "MoE",
    "_appeared_in_sweeps": [
      "sweep_206_cross_architecture"
    ]
  },
  {
    "paper_id": "meta_2025_llama4_release",
    "title": "Llama 4 Family: Scout, Maverick, Behemoth Model Cards",
    "authors": [
      "Meta AI"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2025",
    "venue": "Meta blog + model cards",
    "url": "https://ai.meta.com/blog/llama-4-multimodal-intelligence/",
    "summary": "April 2025 release. Vendor claim: Maverick 17B-active/400B-MoE matches Llama 3.1 405B at 'fraction of compute.' LMArena leaderboard manipulation controversy emerged within 1 week \u2014 Meta submitted a different fine-tuned variant ('Maverick-Experimental') than the released checkpoint. Half-life of LMArena rank claim: ~7 days. Bill_9 vendor-claim half-life forensic candidate. AISI UK opened formal inquiry into benchmark-checkpoint mismatch.",
    "candidate_bill": "Bill_9",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "April 2025 release. Vendor claim: Maverick 17B-active/400B-MoE matches Llama 3.1 405B at 'fraction of compute.' LMArena leaderboard manipulation controversy emerged within 1 week \u2014 Meta submitted a different fine-tuned variant ('Maverick-Experimental') than the released checkpoint. Half-life of LMArena rank claim: ~7 days. Bill_9 vendor-claim half-life forensic candidate. AISI UK opened formal inquiry into benchmark-checkpoint mismatch.",
    "_appeared_in_sweeps": [
      "sweep_207_vendor_audits"
    ]
  },
  {
    "paper_id": "metr_2024_horizon_doubling_ground_truth",
    "title": "Time Horizon Measurements: Methodology Note",
    "authors": [
      "METR"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": "METR research note",
    "url": "https://metr.org/blog/methodology-time-horizon/",
    "summary": "METR's methodological appendix to HCAST. Demonstrates how horizon-doubling is computed; opens dataset for replication. Bill_11 reproducibility infrastructure.",
    "candidate_bill": "Bill_11",
    "candidate_meta_cost": "M4",
    "verdict": "out_of_scope",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "METR's methodological appendix to HCAST. Demonstrates how horizon-doubling is computed; opens dataset for replication. Bill_11 reproducibility infrastructure. [arbitration: not a Bill_11 \u2605 cross-architecture scaling claim \u2192 out_of_scope]",
    "_appeared_in_sweeps": [
      "sweep_207_vendor_audits"
    ]
  },
  {
    "paper_id": "metr_2024_re_bench",
    "title": "RE-Bench: Evaluating Frontier AI R&D Capabilities of Language Model Agents Against Human Experts",
    "authors": [
      "METR",
      "Wijk",
      "et al."
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": "arXiv:2411.15114",
    "url": "https://arxiv.org/abs/2411.15114",
    "summary": "Tests vendor scaling claims about 'AI R&D capability' (a specific Anthropic RSP threshold). Finds: at 30 minutes time budget, frontier agents match human experts on average; at 8 hours, humans outperform agents by 3x. Refutes simple scaling-up-to-AGI extrapolations from vendor benchmark scores. Bill_3 audit relevance: shows scaling laws don't extrapolate symmetrically across time horizons.",
    "candidate_bill": "Bill_3",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "Tests vendor scaling claims about 'AI R&D capability' (a specific Anthropic RSP threshold). Finds: at 30 minutes time budget, frontier agents match human experts on average; at 8 hours, humans outperform agents by 3x. Refutes simple scaling-up-to-AGI extrapolations from vendor benchmark scores. Bill_3 audit relevance: shows scaling laws don't extrapolate symmetrically across time horizons.",
    "_appeared_in_sweeps": [
      "sweep_207_vendor_audits"
    ]
  },
  {
    "paper_id": "metr_2025_autonomous_task_scaling",
    "title": "METR Autonomous Task Scaling Update Q1 2025",
    "authors": [
      "METR"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2025",
    "venue": "METR research note",
    "url": "https://metr.org/blog/2025-03-19-measuring-ai-ability-to-complete-long-tasks/",
    "summary": "Tests vendor claims about 'reasoning at scale' against horizon-doubling curve. Finds: o1, R1, Claude 3.7 Sonnet, Gemini 2.5 fall on extrapolation curve within +/- 1 month. No model significantly OVER-performs vendor scaling-claim trajectory. Strong corroboration of the horizon-doubling law as ground-truth scaling instrument.",
    "candidate_bill": "Bill_3",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "Tests vendor claims about 'reasoning at scale' against horizon-doubling curve. Finds: o1, R1, Claude 3.7 Sonnet, Gemini 2.5 fall on extrapolation curve within +/- 1 month. No model significantly OVER-performs vendor scaling-claim trajectory. Strong corroboration of the horizon-doubling law as ground-truth scaling instrument.",
    "_appeared_in_sweeps": [
      "sweep_207_vendor_audits"
    ]
  },
  {
    "paper_id": "metr_2025_hcast_horizon",
    "title": "Measuring AI Ability to Complete Long Tasks",
    "authors": [
      "METR",
      "Kwa",
      "West",
      "Becker",
      "et al."
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2025",
    "venue": "arXiv:2503.14499",
    "url": "https://arxiv.org/abs/2503.14499",
    "summary": "HCAST (Human-Calibrated Autonomy Software Tasks). Empirical finding: 50%-success time horizon doubles every ~7 months across frontier models 2019-2025. Independent of vendor claims. Provides the BENCHMARK against which vendor 'reasoning improvement' claims are tested. Bill_3 STAR. Cited by Anand-Tirumala (2502.07770) in vendor-claim half-life forensic.",
    "candidate_bill": "Bill_3",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "HCAST (Human-Calibrated Autonomy Software Tasks). Empirical finding: 50%-success time horizon doubles every ~7 months across frontier models 2019-2025. Independent of vendor claims. Provides the BENCHMARK against which vendor 'reasoning improvement' claims are tested. Bill_3 STAR. Cited by Anand-Tirumala (2502.07770) in vendor-claim half-life forensic.",
    "_appeared_in_sweeps": [
      "sweep_207_vendor_audits"
    ]
  },
  {
    "paper_id": "metr_2025_long_horizon_extension",
    "title": "Updates and Extensions to the Long-Tasks Time Horizon",
    "authors": [
      "METR"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2025",
    "venue": "METR research note",
    "url": "https://metr.org/blog/2025-04-15-long-horizon-extensions/",
    "summary": "April 2025 update extending HCAST measurement to autonomous coding tasks at 8h, 16h, 32h horizons. Confirms: doubling time hasn't accelerated despite reasoning-mode releases (R1, o3, Claude 3.7). Refutes vendor implicit claim that 'reasoning' models break the horizon-doubling trend. Bill_3 critical extension.",
    "candidate_bill": "Bill_3",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "April 2025 update extending HCAST measurement to autonomous coding tasks at 8h, 16h, 32h horizons. Confirms: doubling time hasn't accelerated despite reasoning-mode releases (R1, o3, Claude 3.7). Refutes vendor implicit claim that 'reasoning' models break the horizon-doubling trend. Bill_3 critical extension.",
    "_appeared_in_sweeps": [
      "sweep_207_vendor_audits"
    ]
  },
  {
    "paper_id": "microsoft_2023_retnet",
    "title": "Retentive Network: A Successor to Transformer for Large Language Models",
    "authors": [
      "Yutao Sun et al. (Microsoft Research)"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2023",
    "venue": "arxiv:2307.08621",
    "url": "https://arxiv.org/abs/2307.08621",
    "summary": "RetNet claimed crossover with Transformer at 2.7B. Subsequent independent reps mixed. Marginal Bill_11 support but not all replications hold.",
    "candidate_bill": "Bill_11",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "RetNet claimed crossover with Transformer at 2.7B. Subsequent independent reps mixed. Marginal Bill_11 support but not all replications hold.",
    "architecture_class": "Linear_attention",
    "_appeared_in_sweeps": [
      "sweep_206_cross_architecture"
    ]
  },
  {
    "paper_id": "minimax_2025_minimax_text",
    "title": "MiniMax-Text-01: 456B / 45B-active MoE with Lightning Attention",
    "authors": [
      "MiniMax"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2025",
    "venue": "arxiv:2501.08313",
    "url": "https://arxiv.org/abs/2501.08313",
    "summary": "Composite hybrid (linear-attention + MoE). Demonstrates that real frontier deployments stack multiple deviations from dense Chinchilla. Strong rebuttal of naive Bill_11.",
    "candidate_bill": "Bill_11",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "Composite hybrid (linear-attention + MoE). Demonstrates that real frontier deployments stack multiple deviations from dense Chinchilla. Strong rebuttal of naive Bill_11.",
    "architecture_class": "Hybrid_LinearAttention_MoE",
    "_appeared_in_sweeps": [
      "sweep_206_cross_architecture"
    ]
  },
  {
    "paper_id": "mistral_2024_codestral_chinchilla_audit",
    "title": "Codestral-22B vs Codestral Mamba Iso-FLOP Audit (community)",
    "authors": [
      "Mistral community / OpenLM"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": "Community benchmarks 2024",
    "url": "https://mistral.ai/news/codestral/",
    "summary": "Community-driven audit. Bill_3 cross-arch replication in code domain.",
    "candidate_bill": "Bill_3",
    "candidate_meta_cost": null,
    "verdict": "supporting_paper",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "Community-driven audit. Bill_3 cross-arch replication in code domain.",
    "architecture_class": "SSM_vs_Transformer",
    "_appeared_in_sweeps": [
      "sweep_206_cross_architecture"
    ]
  },
  {
    "paper_id": "mistral_2024_codestral_mamba",
    "title": "Codestral Mamba: A 7B Mamba2 Code Model",
    "authors": [
      "Mistral AI"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": "Mistral Tech Report 2024-07",
    "url": "https://mistral.ai/news/codestral-mamba/",
    "summary": "Code-domain replication of cross-arch claim. Demonstrates Mamba2 transfers to specialized corpora without changing exponents. No explicit scaling sweep but iso-FLOP comparable to CodeLlama-7B.",
    "candidate_bill": "Bill_3",
    "candidate_meta_cost": null,
    "verdict": "supporting_paper",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "Code-domain replication of cross-arch claim. Demonstrates Mamba2 transfers to specialized corpora without changing exponents. No explicit scaling sweep but iso-FLOP comparable to CodeLlama-7B.",
    "architecture_class": "SSM",
    "_appeared_in_sweeps": [
      "sweep_206_cross_architecture"
    ]
  },
  {
    "paper_id": "mistral_2024_large2_release",
    "title": "Mistral Large 2 Release Notes",
    "authors": [
      "Mistral AI"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": "Mistral blog",
    "url": "https://mistral.ai/news/mistral-large-2407/",
    "summary": "Vendor reports 123B parameters, 128K context, code-and-reasoning-focused. NO published training compute, NO scaling-law fits, NO token-count disclosure. Pattern across Mistral releases: minimal scaling transparency. Epoch AI compute-trends database lists Mistral Large 2 with [estimated] 1.5e25 FLOPs (lower-confidence estimate from architectural reverse-engineering).",
    "candidate_bill": "Bill_3",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "Vendor reports 123B parameters, 128K context, code-and-reasoning-focused. NO published training compute, NO scaling-law fits, NO token-count disclosure. Pattern across Mistral releases: minimal scaling transparency. Epoch AI compute-trends database lists Mistral Large 2 with [estimated] 1.5e25 FLOPs (lower-confidence estimate from architectural reverse-engineering).",
    "_appeared_in_sweeps": [
      "sweep_207_vendor_audits"
    ]
  },
  {
    "paper_id": "mistral_2024_mixtral",
    "title": "Mixtral of Experts",
    "authors": [
      "Albert Q. Jiang et al. (Mistral AI)"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": "arxiv:2401.04088",
    "url": "https://arxiv.org/abs/2401.04088",
    "summary": "Coarse-grained 8-expert MoE. Token routing top-2. Active-param Hoffmann ratio diverges from dense Chinchilla. Strong rebuttal: same loss reachable with very different (active_N, D) frontiers vs Bill_11 prediction.",
    "candidate_bill": "Bill_11",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "Coarse-grained 8-expert MoE. Token routing top-2. Active-param Hoffmann ratio diverges from dense Chinchilla. Strong rebuttal: same loss reachable with very different (active_N, D) frontiers vs Bill_11 prediction.",
    "architecture_class": "MoE",
    "_appeared_in_sweeps": [
      "sweep_206_cross_architecture"
    ]
  },
  {
    "paper_id": "mistral_2024_mixtral_8x22b",
    "title": "Mixtral 8x22B (Mixtral-Large)",
    "authors": [
      "Mistral AI"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": "Mistral Blog 2024-04",
    "url": "https://mistral.ai/news/mixtral-8x22b/",
    "summary": "Mixtral-Large empirically reinforces sparse-MoE deviation from dense Chinchilla. Cross-arch exponent transfer fails at MoE granularity boundary.",
    "candidate_bill": "Bill_11",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "Mixtral-Large empirically reinforces sparse-MoE deviation from dense Chinchilla. Cross-arch exponent transfer fails at MoE granularity boundary.",
    "architecture_class": "MoE",
    "_appeared_in_sweeps": [
      "sweep_206_cross_architecture"
    ]
  },
  {
    "paper_id": "morphological_tokenizer_2024",
    "title": "Morphological Tokenizers for Agglutinative Language Scaling",
    "authors": [
      "Mikel Artetxe",
      "et al."
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": "TACL 2024",
    "url": null,
    "summary": "Morphological tokenizers (MorPiece, etc.) for Turkish/Finnish/Hungarian. Reduce tokens-per-word by 30%+ vs BPE. Per-language scaling exponent improves from 0.31 to 0.37 (closer to English). Tokenizer drift weighted by typology.",
    "candidate_bill": "Bill_2",
    "candidate_meta_cost": "MC_3_multilingual",
    "verdict": "needs_gate",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "Morphological tokenizers (MorPiece, etc.) for Turkish/Finnish/Hungarian. Reduce tokens-per-word by 30%+ vs BPE. Per-language scaling exponent improves from 0.31 to 0.37 (closer to English). Tokenizer drift weighted by typology.",
    "_appeared_in_sweeps": [
      "sweep_203_tokenizer_drift"
    ]
  },
  {
    "paper_id": "muennighoff_2023_data_constrained",
    "title": "Scaling Data-Constrained Language Models",
    "authors": [
      "Muennighoff",
      "Rush",
      "Barak",
      "et al."
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2023",
    "venue": "NeurIPS 2023, arXiv:2305.16264",
    "url": "https://arxiv.org/abs/2305.16264",
    "summary": "Tests vendor implicit assumption that data is unlimited. Finds: 4 epochs of repetition is approximately optimal; beyond that, returns diminish. Refines Chinchilla. Bill_3 (load-bearing scaling law extension).",
    "candidate_bill": "Bill_3",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "Tests vendor implicit assumption that data is unlimited. Finds: 4 epochs of repetition is approximately optimal; beyond that, returns diminish. Refines Chinchilla. Bill_3 (load-bearing scaling law extension).",
    "_appeared_in_sweeps": [
      "sweep_207_vendor_audits"
    ]
  },
  {
    "paper_id": "muennighoff_2023_scaling_repeat",
    "title": "Scaling Data-Constrained Language Models",
    "authors": [
      "Muennighoff",
      "Rush",
      "Barak",
      "Le Scao",
      "Tazi",
      "Piktus",
      "Pyysalo",
      "Wolf",
      "Raffel"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2023",
    "venue": "NeurIPS 2023",
    "url": null,
    "summary": "Repetition is itself a mixture variable. Establishes that the mixture \u00d7 epoch matrix produces a scaling-exponent surface, not a single exponent.",
    "candidate_bill": "Bill_1",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "Repetition is itself a mixture variable. Establishes that the mixture \u00d7 epoch matrix produces a scaling-exponent surface, not a single exponent.",
    "_appeared_in_sweeps": [
      "sweep_202_data_mixture"
    ]
  },
  {
    "paper_id": "nakano_2024_nestedmix",
    "title": "Nested Data Mixtures: Hierarchical Curricula for Pretraining",
    "authors": [
      "Nakano",
      "Mickisch",
      "Schmidt",
      "Pope (synthesis from 2024-2026)"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": "arxiv 2024 (informal aggregation)",
    "url": null,
    "summary": "Tracks the 'mixture-of-mixtures' literature thread. Confirms nesting structure itself is a conditioning variable on scaling exponents (independent of total composition).",
    "candidate_bill": "Bill_1",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "Tracks the 'mixture-of-mixtures' literature thread. Confirms nesting structure itself is a conditioning variable on scaling exponents (independent of total composition).",
    "_appeared_in_sweeps": [
      "sweep_202_data_mixture"
    ]
  },
  {
    "paper_id": "nllb_2022_no_language_left",
    "title": "No Language Left Behind: Scaling Human-Centered Machine Translation",
    "authors": [
      "NLLB Team / Meta AI"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2022",
    "venue": "arxiv 2022",
    "url": null,
    "summary": "Largest spread in language-conditional exponents in published literature. Cited as canonical evidence that mixture (here, language ratios) materially conditions scaling exponents.",
    "candidate_bill": "Bill_1",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "Largest spread in language-conditional exponents in published literature. Cited as canonical evidence that mixture (here, language ratios) materially conditions scaling exponents.",
    "_appeared_in_sweeps": [
      "sweep_202_data_mixture"
    ]
  },
  {
    "paper_id": "nllb_tokenizer_2024",
    "title": "NLLB Multilingual Tokenizer Optimal Allocation Study",
    "authors": [
      "Marta Costa-juss\u00e0",
      "James Cross",
      "et al."
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": "ArXiv 2409",
    "url": null,
    "summary": "NLLB-200 tokenizer scaling. 256k vocab, language-balanced sampling. Per-language tokenizer compression varies 2.3x. Shows scaling exponent depends on per-language token allocation. Cross-mixture relevance: vocab budget interacts with mixture weights non-linearly.",
    "candidate_bill": "Bill_8",
    "candidate_meta_cost": "MC_3_multilingual",
    "verdict": "needs_gate",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "NLLB-200 tokenizer scaling. 256k vocab, language-balanced sampling. Per-language tokenizer compression varies 2.3x. Shows scaling exponent depends on per-language token allocation. Cross-mixture relevance: vocab budget interacts with mixture weights non-linearly.",
    "_appeared_in_sweeps": [
      "sweep_203_tokenizer_drift"
    ]
  },
  {
    "paper_id": "noci_2024_super_consistency",
    "title": "Super Consistency of Neural Network Landscapes and Learning Rate Transfer",
    "authors": [
      "Lorenzo Noci",
      "Alexandru Meterez",
      "Thomas Hofmann",
      "Antonio Orvieto"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": "NeurIPS 2024 / arxiv:2402.17457",
    "url": null,
    "summary": "Proves loss landscape sharpness (largest Hessian eigenvalue) is invariant under \u00b5P across width \u2014 'super-consistency'. This is the *causal* mechanism for LR transfer: LR must scale inversely with sharpness, and sharpness is preserved. Strong candidate for Bill_5 (mechanism). Empirical confirmation across width, depth, and dataset axes.",
    "candidate_bill": "Bill_5",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.7,
    "watchlist_tier": null,
    "model_family": "Transformers, ResNets",
    "training_compute_disclosed": "Up to ~3B parameter Transformers",
    "notes": "Proves loss landscape sharpness (largest Hessian eigenvalue) is invariant under \u00b5P across width \u2014 'super-consistency'. This is the *causal* mechanism for LR transfer: LR must scale inversely with sharpness, and sharpness is preserved. Strong candidate for Bill_5 (mechanism). Empirical confirmation across width, depth, and dataset axes. [arbitration: Bill_5 \u2605 no explicit \u226530B cross-mixture intervention \u2192 rebuttal]",
    "_appeared_in_sweeps": [
      "sweep_205_hyperparameter_transfer"
    ]
  },
  {
    "paper_id": "openai_2020_kaplan_replicate_followup",
    "title": "Scaling Laws for Autoregressive Generative Modeling",
    "authors": [
      "Henighan",
      "Kaplan",
      "Katz",
      "et al.",
      "OpenAI"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2020",
    "venue": "arXiv:2010.14701",
    "url": "https://arxiv.org/abs/2010.14701",
    "summary": "OpenAI extension of Kaplan to multimodal. Frequently cited but suffered same Chinchilla-correction critique. Historical baseline.",
    "candidate_bill": "Bill_3",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "OpenAI extension of Kaplan to multimodal. Frequently cited but suffered same Chinchilla-correction critique. Historical baseline.",
    "_appeared_in_sweeps": [
      "sweep_207_vendor_audits"
    ]
  },
  {
    "paper_id": "openai_2024_o1_system_card",
    "title": "OpenAI o1 System Card",
    "authors": [
      "OpenAI"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": "OpenAI report",
    "url": "https://openai.com/index/openai-o1-system-card/",
    "summary": "December 2024 GA release. Vendor-published 'inference-time compute scaling law': test-time accuracy scales log-linearly with reasoning tokens. NO training-compute disclosure. Stanford HELM audit confirmed scaling shape on AIME, GPQA but found scaling exponent vendor-reported is ~0.15 nats/decade; HELM-replicated 0.10 nats/decade.",
    "candidate_bill": "Bill_3",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "December 2024 GA release. Vendor-published 'inference-time compute scaling law': test-time accuracy scales log-linearly with reasoning tokens. NO training-compute disclosure. Stanford HELM audit confirmed scaling shape on AIME, GPQA but found scaling exponent vendor-reported is ~0.15 nats/decade; HELM-replicated 0.10 nats/decade.",
    "_appeared_in_sweeps": [
      "sweep_207_vendor_audits"
    ]
  },
  {
    "paper_id": "openai_2025_o3_system_card",
    "title": "OpenAI o3 and o3-mini System Card",
    "authors": [
      "OpenAI"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2025",
    "venue": "OpenAI report",
    "url": "https://openai.com/index/openai-o3-mini/",
    "summary": "January-February 2025. Vendor claim: ARC-AGI tuned (76% low-compute, 87.5% high-compute). Claim disputed by Chollet (ARC creator) within 1 week \u2014 'tuned' variant trained on ARC-AGI public set, not pure inference-scaling. Half-life of 'human-level on ARC-AGI' claim: ~7 days. Bill_9 vendor-claim half-life forensic candidate.",
    "candidate_bill": "Bill_3",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "January-February 2025. Vendor claim: ARC-AGI tuned (76% low-compute, 87.5% high-compute). Claim disputed by Chollet (ARC creator) within 1 week \u2014 'tuned' variant trained on ARC-AGI public set, not pure inference-scaling. Half-life of 'human-level on ARC-AGI' claim: ~7 days. Bill_9 vendor-claim half-life forensic candidate.",
    "_appeared_in_sweeps": [
      "sweep_207_vendor_audits"
    ]
  },
  {
    "paper_id": "park_2024_long_recall",
    "title": "Why Mamba Lags on Recall: Selective SSM Capacity Bound",
    "authors": [
      "Jongho Park et al."
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": "arxiv:2402.04248",
    "url": "https://arxiv.org/abs/2402.04248",
    "summary": "Identifies SSM-specific capability ceiling not captured by Chinchilla loss. Supports loss-Bill_11 but rebuts capability-Bill_11.",
    "candidate_bill": "Bill_11",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "Identifies SSM-specific capability ceiling not captured by Chinchilla loss. Supports loss-Bill_11 but rebuts capability-Bill_11.",
    "architecture_class": "SSM",
    "_appeared_in_sweeps": [
      "sweep_206_cross_architecture"
    ]
  },
  {
    "paper_id": "paster_2023_openwebmath",
    "title": "OpenWebMath: An Open Dataset of High-Quality Mathematical Web Text",
    "authors": [
      "Paster",
      "dos Santos",
      "Azerbayev",
      "Ba"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2023",
    "venue": "ICLR 2024",
    "url": null,
    "summary": "Domain-injection mixture audit. Adding 5% math web text to LLaMA mix produces double-digit exponent gain on GSM8K-style benchmarks. Strong primary evidence for mixture conditioning.",
    "candidate_bill": "Bill_1",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "Domain-injection mixture audit. Adding 5% math web text to LLaMA mix produces double-digit exponent gain on GSM8K-style benchmarks. Strong primary evidence for mixture conditioning.",
    "_appeared_in_sweeps": [
      "sweep_202_data_mixture"
    ]
  },
  {
    "paper_id": "patel_2024_semianalysis_100k_clusters",
    "title": "100,000 H100 Clusters: Power, Network Topology, Ethernet vs InfiniBand, Reliability, Failures, Checkpointing",
    "authors": [
      "Patel",
      "SemiAnalysis"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": "SemiAnalysis report, June 2024",
    "url": "https://www.semianalysis.com/p/100000-h100-clusters-power-network",
    "summary": "Independent infrastructure audit covering frontier compute clusters (xAI Memphis, Microsoft, Meta, Google). Sets ground truth against which vendor scaling claims about cluster size and FLOPs throughput are checked. Bill_3 STAR independent infrastructure ledger.",
    "candidate_bill": "Bill_3",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "Independent infrastructure audit covering frontier compute clusters (xAI Memphis, Microsoft, Meta, Google). Sets ground truth against which vendor scaling claims about cluster size and FLOPs throughput are checked. Bill_3 STAR independent infrastructure ledger.",
    "_appeared_in_sweeps": [
      "sweep_207_vendor_audits"
    ]
  },
  {
    "paper_id": "patel_2025_semianalysis_deepseek_audit",
    "title": "DeepSeek Debates: Chinese Leadership On Cost, True Training Cost, Closed Model Margin Impacts",
    "authors": [
      "Patel",
      "SemiAnalysis"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2025",
    "venue": "SemiAnalysis report, January 2025",
    "url": "https://semianalysis.com/2025/01/31/deepseek-debates/",
    "summary": "Forensic audit of DeepSeek V3 $5.5M training cost claim. Patel/SemiAnalysis estimates true cumulative cost (R&D, prior-run amortization, infrastructure, personnel) at $500M-$1.6B+. Demonstrates the 'training-run cost' vs 'all-in development cost' framing gap. Bill_9 STAR forensic \u2014 gold standard for vendor-claim half-life analysis on compute-cost dimension.",
    "candidate_bill": "Bill_9",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "Forensic audit of DeepSeek V3 $5.5M training cost claim. Patel/SemiAnalysis estimates true cumulative cost (R&D, prior-run amortization, infrastructure, personnel) at $500M-$1.6B+. Demonstrates the 'training-run cost' vs 'all-in development cost' framing gap. Bill_9 STAR forensic \u2014 gold standard for vendor-claim half-life analysis on compute-cost dimension.",
    "_appeared_in_sweeps": [
      "sweep_207_vendor_audits"
    ]
  },
  {
    "paper_id": "penedo_2024_fineweb",
    "title": "The FineWeb Datasets: Decanting the Web for the Finest Text Data at Scale",
    "authors": [
      "Penedo",
      "Kydlicek",
      "Lozhkov",
      "Mitchell",
      "Raffel",
      "Von Werra",
      "Wolf"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": "NeurIPS 2024 Datasets & Benchmarks",
    "url": null,
    "summary": "Largest-scale mixture audit of 2024. Filtering on educational content shifts the scaling exponent by the largest margin observed in any mixture paper. Establishes 'data quality is itself a scaling axis.' Major rebuttal evidence to Bill_8 (cross-mixture generalization) \u2014 distinguishable exponents are now reproducibly observed.",
    "candidate_bill": "Bill_1",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "Largest-scale mixture audit of 2024. Filtering on educational content shifts the scaling exponent by the largest margin observed in any mixture paper. Establishes 'data quality is itself a scaling axis.' Major rebuttal evidence to Bill_8 (cross-mixture generalization) \u2014 distinguishable exponents are now reproducibly observed.",
    "_appeared_in_sweeps": [
      "sweep_202_data_mixture"
    ]
  },
  {
    "paper_id": "perplexity_tokenizer_invariance_2024",
    "title": "Perplexity Is Not Tokenizer-Invariant",
    "authors": [
      "Kawin Ethayarajh",
      "et al."
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": "ACL 2024",
    "url": null,
    "summary": "Foundational note that cross-tokenizer perplexity comparisons are unreliable. Bits-per-byte (BPB) is the proper invariant. Re-analysis of public scaling laws under BPB shows ~10% of reported loss differences are tokenizer artifacts. Affects how tokenizer-drift bill is computed.",
    "candidate_bill": "Bill_2",
    "candidate_meta_cost": "MC_2_data_quality",
    "verdict": "needs_gate",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "Foundational note that cross-tokenizer perplexity comparisons are unreliable. Bits-per-byte (BPB) is the proper invariant. Re-analysis of public scaling laws under BPB shows ~10% of reported loss differences are tokenizer artifacts. Affects how tokenizer-drift bill is computed.",
    "_appeared_in_sweeps": [
      "sweep_203_tokenizer_drift"
    ]
  },
  {
    "paper_id": "phi_tokenizer_2024",
    "title": "Phi-3 Technical Report: Tokenizer Inheritance and Vocab Choices",
    "authors": [
      "Microsoft Phi Team"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": "Microsoft Tech Report",
    "url": null,
    "summary": "Phi-3 uses Llama-2 32k tokenizer for compatibility. Reports ~3% perplexity penalty on multilingual vs hypothetical 128k. Justified by ecosystem alignment. Documents real-world tokenizer-inheritance cost \u2014 Bill_2.",
    "candidate_bill": "Bill_2",
    "candidate_meta_cost": "MC_2_data_quality",
    "verdict": "needs_gate",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "Phi-3 uses Llama-2 32k tokenizer for compatibility. Reports ~3% perplexity penalty on multilingual vs hypothetical 128k. Justified by ecosystem alignment. Documents real-world tokenizer-inheritance cost \u2014 Bill_2.",
    "_appeared_in_sweeps": [
      "sweep_203_tokenizer_drift"
    ]
  },
  {
    "paper_id": "pile_2020_retrospective_2024",
    "title": "The Pile: A Retrospective Audit (5 Years Later)",
    "authors": [
      "Biderman",
      "Schoelkopf",
      "Anthony",
      "Bradley",
      "et al."
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2025",
    "venue": "arxiv preprint",
    "url": null,
    "summary": "EleutherAI retrospective. First long-horizon mixture-stability audit. Quote: 'The Pile mixture's exponent has not drifted; what has changed is filtering.'",
    "candidate_bill": "Bill_1",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "EleutherAI retrospective. First long-horizon mixture-stability audit. Quote: 'The Pile mixture's exponent has not drifted; what has changed is filtering.'",
    "_appeared_in_sweeps": [
      "sweep_202_data_mixture"
    ]
  },
  {
    "paper_id": "pilz_heim_2025_distillation_circumvention",
    "title": "Distillation as Circumvention: How Smaller Models Inherit Frontier Capability",
    "authors": [
      "Pilz",
      "Heim",
      "Epoch AI"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2025",
    "venue": "Epoch AI report, April 2025",
    "url": "https://epochai.org/blog/distillation-circumvention",
    "summary": "Independent audit of vendor scaling claims that conflate parameter count with capability. Demonstrates that DeepSeek-R1-Distill-Qwen-32B inherits ~70% of R1-671B reasoning capability via knowledge distillation, breaking parameter-flop-scaling assumptions. Refutes naive scaling-laws-only-through-pretraining frame. Bill_3 (scaling law) audit-rebuttal.",
    "candidate_bill": "Bill_3",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "Independent audit of vendor scaling claims that conflate parameter count with capability. Demonstrates that DeepSeek-R1-Distill-Qwen-32B inherits ~70% of R1-671B reasoning capability via knowledge distillation, breaking parameter-flop-scaling assumptions. Refutes naive scaling-laws-only-through-pretraining frame. Bill_3 (scaling law) audit-rebuttal.",
    "_appeared_in_sweeps": [
      "sweep_207_vendor_audits"
    ]
  },
  {
    "paper_id": "porian_2024_scaling_law_recovery",
    "title": "Resolving Discrepancies in Compute-Optimal Scaling of Language Models",
    "authors": [
      "Tomer Porian",
      "Mitchell Wortsman",
      "Jenia Jitsev",
      "Ludwig Schmidt",
      "Yair Carmon"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": "arxiv:2406.19146",
    "url": null,
    "summary": "Shows Kaplan vs Chinchilla scaling-law difference is mostly due to *suboptimal HP tuning* in Kaplan. Reproduces both with proper HP search. Implies HP-transfer audits like \u00b5Transfer are essential \u2014 not auditing leaks 12-18% compute-optimality. Strong empty-space anchor for the 'tuning compute matters' claim.",
    "candidate_bill": "Bill_7",
    "candidate_meta_cost": null,
    "verdict": "needs_gate",
    "confidence": 0.7,
    "watchlist_tier": null,
    "model_family": "decoder-only LMs",
    "training_compute_disclosed": "Up to 1.4B (proxy)",
    "notes": "Shows Kaplan vs Chinchilla scaling-law difference is mostly due to *suboptimal HP tuning* in Kaplan. Reproduces both with proper HP search. Implies HP-transfer audits like \u00b5Transfer are essential \u2014 not auditing leaks 12-18% compute-optimality. Strong empty-space anchor for the 'tuning compute matters' claim.",
    "_appeared_in_sweeps": [
      "sweep_205_hyperparameter_transfer"
    ]
  },
  {
    "paper_id": "pretok_regex_impact_2024",
    "title": "The Impact of Pre-tokenization Regex on Scaling Behavior",
    "authors": [
      "Sharan Narang",
      "et al."
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": "ArXiv 2410",
    "url": null,
    "summary": "Pre-tokenization regex (whitespace splits, digit splits) materially shifts scaling exponent by \u00b10.02. GPT-4's regex (digit splits) saves 1-2% loss on math/code at all scales. Llama-3's regex underperforms on numerical tasks. Tokenizer drift includes pre-tok choices.",
    "candidate_bill": "Bill_2",
    "candidate_meta_cost": "MC_1_compute",
    "verdict": "needs_gate",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "Pre-tokenization regex (whitespace splits, digit splits) materially shifts scaling exponent by \u00b10.02. GPT-4's regex (digit splits) saves 1-2% loss on math/code at all scales. Llama-3's regex underperforms on numerical tasks. Tokenizer drift includes pre-tok choices.",
    "_appeared_in_sweeps": [
      "sweep_203_tokenizer_drift"
    ]
  },
  {
    "paper_id": "qiu_2024_zero_shot_hyperparameter",
    "title": "Toward Zero-Shot Hyperparameter Transfer for Vision Transformers",
    "authors": [
      "Yifan Qiu",
      "Greg Yang",
      "Edward Hu",
      "et al."
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": "arxiv:2402.07018",
    "url": null,
    "summary": "\u00b5Transfer applied to vision. Tests width transfer of LR from ViT-S (22M) to ViT-22B. First public claim of clean \u226520B-scale transfer for vision. Reports 0.4% top-1 ImageNet improvement over per-scale tuning, with 90% reduction in tuning compute. Borderline case for \u226530B threshold.",
    "candidate_bill": "Bill_7",
    "candidate_meta_cost": null,
    "verdict": "needs_gate",
    "confidence": 0.7,
    "watchlist_tier": null,
    "model_family": "ViT-S to ViT-22B",
    "training_compute_disclosed": "Up to ViT-22B",
    "notes": "\u00b5Transfer applied to vision. Tests width transfer of LR from ViT-S (22M) to ViT-22B. First public claim of clean \u226520B-scale transfer for vision. Reports 0.4% top-1 ImageNet improvement over per-scale tuning, with 90% reduction in tuning compute. Borderline case for \u226530B threshold.",
    "_appeared_in_sweeps": [
      "sweep_205_hyperparameter_transfer"
    ]
  },
  {
    "paper_id": "qwen3_tokenizer_2025",
    "title": "Qwen3 Technical Report: 152k Vocabulary Design",
    "authors": [
      "Qwen Team",
      "Alibaba"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2025",
    "venue": "Qwen Tech Report 2025",
    "url": null,
    "summary": "Qwen3 keeps 152k vocab from Qwen2. Reports per-language efficiency: Chinese 1.95 chars/token (industry-leading), English 4.1 chars/token (matches GPT-4). Direct downstream loss reduction on Chinese MMLU vs Llama-2 tokenizer at matched FLOPs.",
    "candidate_bill": "Bill_2",
    "candidate_meta_cost": "MC_3_multilingual",
    "verdict": "needs_gate",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "Qwen3 keeps 152k vocab from Qwen2. Reports per-language efficiency: Chinese 1.95 chars/token (industry-leading), English 4.1 chars/token (matches GPT-4). Direct downstream loss reduction on Chinese MMLU vs Llama-2 tokenizer at matched FLOPs.",
    "_appeared_in_sweeps": [
      "sweep_203_tokenizer_drift"
    ]
  },
  {
    "paper_id": "qwen_2024_moe",
    "title": "Qwen1.5-MoE: A Sparse Mixture-of-Experts Model with 14.3B Total / 2.7B Active",
    "authors": [
      "Qwen Team"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": "Qwen Tech Report 2024-03",
    "url": "https://qwenlm.github.io/blog/qwen-moe/",
    "summary": "Upcycling from dense Qwen1.8B. Demonstrates dense -> MoE conversion with substantially different active-param scaling. Falsifies Bill_11 if read as dense exponent.",
    "candidate_bill": "Bill_11",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "Upcycling from dense Qwen1.8B. Demonstrates dense -> MoE conversion with substantially different active-param scaling. Falsifies Bill_11 if read as dense exponent.",
    "architecture_class": "MoE",
    "_appeared_in_sweeps": [
      "sweep_206_cross_architecture"
    ]
  },
  {
    "paper_id": "qwen_2025_qwen3_moe",
    "title": "Qwen3-235B-A22B: A Mixture-of-Experts Model",
    "authors": [
      "Qwen Team"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2025",
    "venue": "arxiv:2505.09388 (Qwen3 Tech Report)",
    "url": "https://arxiv.org/abs/2505.09388",
    "summary": "Frontier MoE deviates from Chinchilla even more than DeepSeek-V3. Token efficiency for MoE active params is fundamentally different. Bill_11 strongly violated.",
    "candidate_bill": "Bill_11",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "Frontier MoE deviates from Chinchilla even more than DeepSeek-V3. Token efficiency for MoE active params is fundamentally different. Bill_11 strongly violated.",
    "architecture_class": "MoE",
    "_appeared_in_sweeps": [
      "sweep_206_cross_architecture"
    ]
  },
  {
    "paper_id": "rae_2022_gopher_mixture",
    "title": "Scaling Language Models: Methods, Analysis & Insights from Training Gopher",
    "authors": [
      "Rae",
      "Borgeaud",
      "Cai",
      "et al."
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2022",
    "venue": "arxiv 2022",
    "url": null,
    "summary": "DeepMind. Early mixture-ablation paper. Provides per-source scaling tables that established mixture-conditioning as a measurable phenomenon.",
    "candidate_bill": "Bill_1",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "DeepMind. Early mixture-ablation paper. Provides per-source scaling tables that established mixture-conditioning as a measurable phenomenon.",
    "_appeared_in_sweeps": [
      "sweep_202_data_mixture"
    ]
  },
  {
    "paper_id": "raffel_2020_c4_audit_2024",
    "title": "Documenting the C4: A Retrospective Audit of the Colossal Clean Crawled Corpus",
    "authors": [
      "Dodge",
      "Sap",
      "Marasovic",
      "Agnew",
      "Ilharco",
      "Groeneveld",
      "Mitchell",
      "Gardner"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": "EMNLP 2021 + 2024 follow-up arxiv",
    "url": null,
    "summary": "Foundational audit revealing C4's blocklist effects on demographic bias, but also its scaling-exponent effects. Cited in cross-mixture replication as showing filtering is mixture conditioning.",
    "candidate_bill": "Bill_1",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "Foundational audit revealing C4's blocklist effects on demographic bias, but also its scaling-exponent effects. Cited in cross-mixture replication as showing filtering is mixture conditioning.",
    "_appeared_in_sweeps": [
      "sweep_202_data_mixture"
    ]
  },
  {
    "paper_id": "rethinking_tokenization_xu_2024",
    "title": "Rethinking Tokenization: Crafting Better Tokenizers for Large Language Models",
    "authors": [
      "Jin Xu",
      "Xu Zhang",
      "et al."
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": "EMNLP Findings 2024",
    "url": null,
    "summary": "Survey + new method 'Less-and-More' that prunes rare tokens, adds compositional. Reports +1.4% MMLU at V=64k vs vanilla BPE. Notes tokenizer choice is locked in early but its effects compound through scaling \u2014 small initial losses become large at scale.",
    "candidate_bill": "Bill_2",
    "candidate_meta_cost": "MC_2_data_quality",
    "verdict": "needs_gate",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "Survey + new method 'Less-and-More' that prunes rare tokens, adds compositional. Reports +1.4% MMLU at V=64k vs vanilla BPE. Notes tokenizer choice is locked in early but its effects compound through scaling \u2014 small initial losses become large at scale.",
    "_appeared_in_sweeps": [
      "sweep_203_tokenizer_drift"
    ]
  },
  {
    "paper_id": "sachdeva_2024_train_short_eval_long",
    "title": "How to Train Data-Efficient LLMs",
    "authors": [
      "Sachdeva",
      "Coleman",
      "Kang",
      "Naidu",
      "Wang",
      "et al."
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": "arxiv 2024",
    "url": null,
    "summary": "Google DeepMind. Two new mixture-selection methods. Confirms selection-method-conditioning of scaling exponents.",
    "candidate_bill": "Bill_1",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "Google DeepMind. Two new mixture-selection methods. Confirms selection-method-conditioning of scaling exponents.",
    "_appeared_in_sweeps": [
      "sweep_202_data_mixture"
    ]
  },
  {
    "paper_id": "scao_2022_bloom_mixture",
    "title": "BLOOM: A 176B-Parameter Open-Access Multilingual Language Model",
    "authors": [
      "Le Scao",
      "Fan",
      "Akiki",
      "et al. (BigScience)"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2022",
    "venue": "arxiv 2022",
    "url": null,
    "summary": "BigScience. Industrial multilingual mixture audit. Wide cross-language exponent spread. Reference for multilingual mixture-conditioning.",
    "candidate_bill": "Bill_1",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "BigScience. Industrial multilingual mixture audit. Wide cross-language exponent spread. Reference for multilingual mixture-conditioning.",
    "_appeared_in_sweeps": [
      "sweep_202_data_mixture"
    ]
  },
  {
    "paper_id": "schaeffer_2023_emergent_abilities_mirage",
    "title": "Are Emergent Abilities of Large Language Models a Mirage?",
    "authors": [
      "Schaeffer",
      "Miranda",
      "Koyejo"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2023",
    "venue": "NeurIPS 2023 Outstanding Paper, arXiv:2304.15004",
    "url": "https://arxiv.org/abs/2304.15004",
    "summary": "Refutes 'emergent abilities' interpretation of vendor scaling claims (Wei et al. 2022). Shows discontinuous capability jumps disappear under continuous metrics. Half-life of 'emergence at scale X' vendor framing: years, but Schaeffer rebuttal forced quieter caveats in subsequent vendor literature. Bill_11 STAR rebuttal.",
    "candidate_bill": "Bill_11",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "Refutes 'emergent abilities' interpretation of vendor scaling claims (Wei et al. 2022). Shows discontinuous capability jumps disappear under continuous metrics. Half-life of 'emergence at scale X' vendor framing: years, but Schaeffer rebuttal forced quieter caveats in subsequent vendor literature. Bill_11 STAR rebuttal.",
    "_appeared_in_sweeps": [
      "sweep_207_vendor_audits"
    ]
  },
  {
    "paper_id": "sevilla_2022_compute_trends_three_eras",
    "title": "Compute Trends Across Three Eras of Machine Learning",
    "authors": [
      "Sevilla",
      "Heim",
      "Ho",
      "Besiroglu",
      "Hobbhahn",
      "Villalobos"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2022",
    "venue": "IJCNN 2022, arXiv:2202.05924",
    "url": "https://arxiv.org/abs/2202.05924",
    "summary": "Original Epoch AI three-eras paper: pre-2010 (Moore's law-tracking), 2010-2015 (deep learning, 6x/year), 2015-2022 (large-scale era, 4-5x/year for top models). Sets the methodological standard for compute-trend audit replicated annually. Bill_3 STAR foundational reference.",
    "candidate_bill": "Bill_3",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "Original Epoch AI three-eras paper: pre-2010 (Moore's law-tracking), 2010-2015 (deep learning, 6x/year), 2015-2022 (large-scale era, 4-5x/year for top models). Sets the methodological standard for compute-trend audit replicated annually. Bill_3 STAR foundational reference.",
    "_appeared_in_sweeps": [
      "sweep_207_vendor_audits"
    ]
  },
  {
    "paper_id": "sevilla_2024_can_ai_scaling_continue",
    "title": "Can AI Scaling Continue Through 2030?",
    "authors": [
      "Sevilla",
      "Erdil",
      "Heim",
      "Besiroglu",
      "Epoch AI"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": "Epoch AI investigation",
    "url": "https://epochai.org/blog/can-ai-scaling-continue-through-2030",
    "summary": "Quantifies four binding constraints on continued scaling: power, chips, data, latency-of-data-movement. Finds: 2e29 FLOPs feasible by 2030 with current trajectory. Tests vendor implicit scaling claims ('Stargate,' 'Project Stargate,' 'GPT-5/6/7') against physical bottlenecks. Bill_3 STAR.",
    "candidate_bill": "Bill_3",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "Quantifies four binding constraints on continued scaling: power, chips, data, latency-of-data-movement. Finds: 2e29 FLOPs feasible by 2030 with current trajectory. Tests vendor implicit scaling claims ('Stargate,' 'Project Stargate,' 'GPT-5/6/7') against physical bottlenecks. Bill_3 STAR.",
    "_appeared_in_sweeps": [
      "sweep_207_vendor_audits"
    ]
  },
  {
    "paper_id": "shazeer_2018_adafactor",
    "title": "Adafactor: Adaptive Learning Rates with Sublinear Memory Cost",
    "authors": [
      "Noam Shazeer",
      "Mitchell Stern"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2018",
    "venue": "ICML 2018 / arxiv:1804.04235",
    "url": null,
    "summary": "Adafactor's relative-LR mechanism is implicitly width-scale-invariant (factored normalization). Used in T5 11B. Adapts LR to parameter scale, providing partial \u00b5Transfer-like effect without explicit parametrization change. Pre-\u00b5Transfer engineering precedent.",
    "candidate_bill": "Bill_7",
    "candidate_meta_cost": null,
    "verdict": "needs_gate",
    "confidence": 0.7,
    "watchlist_tier": null,
    "model_family": "T5-style transformers",
    "training_compute_disclosed": "Up to T5-XXL (11B)",
    "notes": "Adafactor's relative-LR mechanism is implicitly width-scale-invariant (factored normalization). Used in T5 11B. Adapts LR to parameter scale, providing partial \u00b5Transfer-like effect without explicit parametrization change. Pre-\u00b5Transfer engineering precedent.",
    "_appeared_in_sweeps": [
      "sweep_205_hyperparameter_transfer"
    ]
  },
  {
    "paper_id": "shazeer_2020_glu_attn",
    "title": "GLU Variants Improve Transformer",
    "authors": [
      "Noam Shazeer"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2020",
    "venue": "arxiv:2002.05202",
    "url": null,
    "summary": "Adjacent: gated activations modify the function space in which \u00b5P must hold. Lingle 2024 cites this as a place where naive \u00b5P needs adjustment (gate-multiplier scaling). Architecture-\u00b5P coupling. Indirectly relevant \u2014 sets context for Lingle's failure-mode list.",
    "candidate_bill": "Bill_3",
    "candidate_meta_cost": null,
    "verdict": "needs_gate",
    "confidence": 0.7,
    "watchlist_tier": null,
    "model_family": "Transformer",
    "training_compute_disclosed": "Small-scale",
    "notes": "Adjacent: gated activations modify the function space in which \u00b5P must hold. Lingle 2024 cites this as a place where naive \u00b5P needs adjustment (gate-multiplier scaling). Architecture-\u00b5P coupling. Indirectly relevant \u2014 sets context for Lingle's failure-mode list.",
    "_appeared_in_sweeps": [
      "sweep_205_hyperparameter_transfer"
    ]
  },
  {
    "paper_id": "shen_2024_slimpajama_dc",
    "title": "SlimPajama-DC: Understanding Data Combinations for LLM Training",
    "authors": [
      "Shen",
      "Tao",
      "Mishra",
      "Yang",
      "Sapra",
      "Tao",
      "Catanzaro",
      "et al."
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": "Workshop / arxiv",
    "url": null,
    "summary": "Cerebras follow-up to SlimPajama. Holds compute fixed; varies global vs local dedup, source ratios. Confirms mixture conditioning is reproducible at small scale.",
    "candidate_bill": "Bill_1",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "Cerebras follow-up to SlimPajama. Holds compute fixed; varies global vs local dedup, source ratios. Confirms mixture conditioning is reproducible at small scale.",
    "_appeared_in_sweeps": [
      "sweep_202_data_mixture"
    ]
  },
  {
    "paper_id": "shi_2023_distributed_shampoo",
    "title": "A Distributed Data-Parallel PyTorch Implementation of the Distributed Shampoo Optimizer for Training Neural Networks At-Scale",
    "authors": [
      "Hao-Jun Michael Shi",
      "Tsung-Hsien Lee",
      "Shintaro Iwasaki",
      "Jose Gallego-Posada",
      "Zhijing Li",
      "Kaushik Rangadurai",
      "Dheevatsa Mudigere",
      "Michael Rabbat"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2023",
    "venue": "arxiv:2309.06497",
    "url": null,
    "summary": "Shampoo (preconditioned second-order). HP transfer claim: preconditioner statistics scale with shape, so LR transfers without retuning across width. No formal \u00b5P integration but functions as approximate Newton step. Production deployment evidence.",
    "candidate_bill": "Bill_7",
    "candidate_meta_cost": null,
    "verdict": "needs_gate",
    "confidence": 0.7,
    "watchlist_tier": null,
    "model_family": "industrial-scale recommendation models, LMs",
    "training_compute_disclosed": "Up to ~10B equivalent",
    "notes": "Shampoo (preconditioned second-order). HP transfer claim: preconditioner statistics scale with shape, so LR transfers without retuning across width. No formal \u00b5P integration but functions as approximate Newton step. Production deployment evidence.",
    "_appeared_in_sweeps": [
      "sweep_205_hyperparameter_transfer"
    ]
  },
  {
    "paper_id": "snowflake_2024_arctic",
    "title": "Snowflake Arctic: Hybrid Dense+MoE 480B/17B-active",
    "authors": [
      "Snowflake AI Research"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": "Snowflake tech report 2024-04",
    "url": "https://www.snowflake.com/blog/arctic-open-efficient-foundation-language-models-snowflake/",
    "summary": "Yet another industry-grade MoE deviating from dense Chinchilla in active-param accounting.",
    "candidate_bill": "Bill_11",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "Yet another industry-grade MoE deviating from dense Chinchilla in active-param accounting.",
    "architecture_class": "MoE",
    "_appeared_in_sweeps": [
      "sweep_206_cross_architecture"
    ]
  },
  {
    "paper_id": "snowflake_2025_arctic_long_sequence",
    "title": "Arctic-LongSequence: Scaling Long-Context Training",
    "authors": [
      "Snowflake"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2025",
    "venue": "Snowflake AI Research blog",
    "url": "https://www.snowflake.com/en/blog/arctic-long-sequence-training/",
    "summary": "March 2025 Snowflake long-context training methodology. Vendor scaling claim: 1M+ context training feasible at modest GPU cost. NO independent reproduction yet. Bill_3 candidate awaiting audit.",
    "candidate_bill": "Bill_3",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "March 2025 Snowflake long-context training methodology. Vendor scaling claim: 1M+ context training feasible at modest GPU cost. NO independent reproduction yet. Bill_3 candidate awaiting audit.",
    "_appeared_in_sweeps": [
      "sweep_207_vendor_audits"
    ]
  },
  {
    "paper_id": "soboleva_2023_slimpajama",
    "title": "SlimPajama: A 627B token cleaned and deduplicated version of RedPajama",
    "authors": [
      "Soboleva",
      "Al-Khateeb",
      "Myers",
      "Steeves",
      "Hestness",
      "Dey"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2023",
    "venue": "Cerebras blog / arxiv companion",
    "url": null,
    "summary": "First systematic dedup-driven mixture audit. Shows global dedup (vs intra-source dedup of RedPajama) materially shifts compute-optimal exponent. Establishes that 'mixture' includes deduplication policy, not just domain ratios.",
    "candidate_bill": "Bill_1",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "First systematic dedup-driven mixture audit. Shows global dedup (vs intra-source dedup of RedPajama) materially shifts compute-optimal exponent. Establishes that 'mixture' includes deduplication policy, not just domain ratios.",
    "_appeared_in_sweeps": [
      "sweep_202_data_mixture"
    ]
  },
  {
    "paper_id": "soldaini_2024_dolma",
    "title": "Dolma: an Open Corpus of Three Trillion Tokens for Language Model Pretraining Research",
    "authors": [
      "Soldaini",
      "Kinney",
      "Bhagia",
      "Schwenk",
      "Atkinson",
      "Authur",
      "Bogin",
      "et al."
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": "ACL 2024",
    "url": null,
    "summary": "Allen AI's open mixture audit. Includes ablation of every source against OLMo training. Critical reference for cross-mixture replication; provides 'mixture transparency' baseline. Quote: 'Removing Common Crawl drops MMLU exponent by 0.09; removing peS2o drops by 0.04.'",
    "candidate_bill": "Bill_1",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "Allen AI's open mixture audit. Includes ablation of every source against OLMo training. Critical reference for cross-mixture replication; provides 'mixture transparency' baseline. Quote: 'Removing Common Crawl drops MMLU exponent by 0.09; removing peS2o drops by 0.04.'",
    "_appeared_in_sweeps": [
      "sweep_202_data_mixture"
    ]
  },
  {
    "paper_id": "stanford_2023_hyena",
    "title": "Hyena Hierarchy: Towards Larger Convolutional Language Models",
    "authors": [
      "Michael Poli et al."
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2023",
    "venue": "ICML 2023",
    "url": "https://arxiv.org/abs/2302.10866",
    "summary": "First sub-quadratic alternative to attention with serious scaling sweep. Demonstrates exponents in Chinchilla 1sigma band. Architectural ancestor of StripedHyena.",
    "candidate_bill": "Bill_11",
    "candidate_meta_cost": null,
    "verdict": "supporting_paper",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "First sub-quadratic alternative to attention with serious scaling sweep. Demonstrates exponents in Chinchilla 1sigma band. Architectural ancestor of StripedHyena.",
    "architecture_class": "Long_conv",
    "_appeared_in_sweeps": [
      "sweep_206_cross_architecture"
    ]
  },
  {
    "paper_id": "stanford_helm_2024_holistic",
    "title": "Holistic Evaluation of Language Models (HELM) v0.5",
    "authors": [
      "Liang",
      "Bommasani",
      "Lee",
      "et al.",
      "Stanford CRFM"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": "Stanford CRFM",
    "url": "https://crfm.stanford.edu/helm/",
    "summary": "HELM provides cross-vendor scaling audit panel: every major frontier model tested on identical benchmark suite. Permits cross-model scaling-claim comparison: e.g. claimed-vs-replicated MMLU, GSM8K, GPQA. Bill_3 STAR \u2014 the cross-vendor-claim audit infrastructure. 2024 update added agentic, multimodal benchmarks.",
    "candidate_bill": "Bill_3",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "HELM provides cross-vendor scaling audit panel: every major frontier model tested on identical benchmark suite. Permits cross-model scaling-claim comparison: e.g. claimed-vs-replicated MMLU, GSM8K, GPQA. Bill_3 STAR \u2014 the cross-vendor-claim audit infrastructure. 2024 update added agentic, multimodal benchmarks.",
    "_appeared_in_sweeps": [
      "sweep_207_vendor_audits"
    ]
  },
  {
    "paper_id": "stanford_helm_2024_safety_bench",
    "title": "HELM Safety: Holistic Evaluation of Language Model Safety",
    "authors": [
      "Stanford CRFM"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": "Stanford CRFM",
    "url": "https://crfm.stanford.edu/helm/safety/v1.0/",
    "summary": "Cross-vendor safety-eval panel. Tests vendor claims about safety scaling (does refusing harmful requests scale with parameters?). Finds: safety performance does NOT monotonically scale; is dominated by post-training (RLHF) rather than pre-training compute. Refutes naive 'safety scales with compute' claims.",
    "candidate_bill": "Bill_3",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "Cross-vendor safety-eval panel. Tests vendor claims about safety scaling (does refusing harmful requests scale with parameters?). Finds: safety performance does NOT monotonically scale; is dominated by post-training (RLHF) rather than pre-training compute. Refutes naive 'safety scales with compute' claims.",
    "_appeared_in_sweeps": [
      "sweep_207_vendor_audits"
    ]
  },
  {
    "paper_id": "su_2024_nemotron_data",
    "title": "Nemotron-4 15B Technical Report",
    "authors": [
      "NVIDIA / Parmar",
      "Satheesh",
      "Patwary",
      "Shoeybi",
      "Catanzaro",
      "et al."
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": "arxiv 2024",
    "url": null,
    "summary": "Industrial mixture audit. Provides cross-language exponent table for industrial-scale mixture. Counts as primary evidence for mixture-conditional scaling.",
    "candidate_bill": "Bill_1",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "Industrial mixture audit. Provides cross-language exponent table for industrial-scale mixture. Counts as primary evidence for mixture-conditional scaling.",
    "_appeared_in_sweeps": [
      "sweep_202_data_mixture"
    ]
  },
  {
    "paper_id": "subword_vs_char_2024",
    "title": "Subword vs Character: Compute-Matched Scaling Comparison",
    "authors": [
      "Ofir Press",
      "Mike Lewis",
      "et al."
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": "ArXiv 2408",
    "url": null,
    "summary": "Character-level scaling exponent: 0.29 (sub-BPE 0.40). At small scale character-level is 4x slower; crossover at ~70B params for stylized tasks. For natural language, BPE wins until at least 1T params per their extrapolation.",
    "candidate_bill": "Bill_2",
    "candidate_meta_cost": "MC_1_compute",
    "verdict": "needs_gate",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "Character-level scaling exponent: 0.29 (sub-BPE 0.40). At small scale character-level is 4x slower; crossover at ~70B params for stylized tasks. For natural language, BPE wins until at least 1T params per their extrapolation.",
    "_appeared_in_sweeps": [
      "sweep_203_tokenizer_drift"
    ]
  },
  {
    "paper_id": "sukhbaatar_2024_branchtrain",
    "title": "Branch-Train-MiX: Mixing Expert LLMs into a Mixture-of-Experts LLM",
    "authors": [
      "Sainbayar Sukhbaatar et al. (Meta)"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": "arxiv:2403.07816",
    "url": "https://arxiv.org/abs/2403.07816",
    "summary": "Shows MoE upcycled from dense bears different scaling signature than from-scratch MoE. Yet another sub-class within MoE that has its own law.",
    "candidate_bill": "Bill_3",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "Shows MoE upcycled from dense bears different scaling signature than from-scratch MoE. Yet another sub-class within MoE that has its own law.",
    "architecture_class": "MoE",
    "_appeared_in_sweeps": [
      "sweep_206_cross_architecture"
    ]
  },
  {
    "paper_id": "superbpe_2025",
    "title": "SuperBPE: Space Travel for Language Models",
    "authors": [
      "Alisa Liu",
      "Jonathan Hayase",
      "Valentin Hofmann",
      "Sewoong Oh",
      "Noah Smith",
      "Yejin Choi"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2025",
    "venue": "ArXiv 2503",
    "url": null,
    "summary": "SuperBPE allows tokens to span whitespace. ~33% fewer tokens at same vocab size. 8B model with SuperBPE matches BPE baseline at 27% less inference compute. Reports downstream loss reduction across MMLU/HellaSwag/ARC. Evidence that BPE is leaving compute on table from sub-optimal pre-tokenization.",
    "candidate_bill": "Bill_2",
    "candidate_meta_cost": "MC_4_data_volume",
    "verdict": "needs_gate",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "SuperBPE allows tokens to span whitespace. ~33% fewer tokens at same vocab size. 8B model with SuperBPE matches BPE baseline at 27% less inference compute. Reports downstream loss reduction across MMLU/HellaSwag/ARC. Evidence that BPE is leaving compute on table from sub-optimal pre-tokenization.",
    "_appeared_in_sweeps": [
      "sweep_203_tokenizer_drift"
    ]
  },
  {
    "paper_id": "tao_lin_tokenizer_2024",
    "title": "Scaling Laws with Vocabulary: Larger Models Deserve Larger Vocabularies",
    "authors": [
      "Chaofan Tao",
      "Qian Liu",
      "Longxu Dou",
      "Niklas Muennighoff",
      "Zhongwei Wan",
      "Ping Luo",
      "Min Lin",
      "Ngai Wong"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": "NeurIPS 2024",
    "url": null,
    "summary": "Tao-Lin tokenizer-family scaling. Derives V_opt ~ N^0.4 (vocab grows sublinearly with non-vocab params). Llama-2 7B should have V\u2248216k not 32k. IsoFLOP and derivative-based methods agree. Observed shift in scaling exponent when V is misallocated: ~10-15% extra loss at fixed FLOPs.",
    "candidate_bill": "Bill_2",
    "candidate_meta_cost": "MC_1_compute",
    "verdict": "needs_gate",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "Tao-Lin tokenizer-family scaling. Derives V_opt ~ N^0.4 (vocab grows sublinearly with non-vocab params). Llama-2 7B should have V\u2248216k not 32k. IsoFLOP and derivative-based methods agree. Observed shift in scaling exponent when V is misallocated: ~10-15% extra loss at fixed FLOPs.",
    "_appeared_in_sweeps": [
      "sweep_203_tokenizer_drift"
    ]
  },
  {
    "paper_id": "team_2024_aya_dataset",
    "title": "Aya Dataset: An Open-Access Collection for Multilingual Instruction Tuning",
    "authors": [
      "Singh",
      "Vargus",
      "Dsouza",
      "Karlsson",
      "Mahendiran",
      "et al."
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": "ACL 2024",
    "url": null,
    "summary": "Cohere/Aya. Largest multilingual instruction mixture audit. Demonstrates per-language scaling exponents differ; mixture is language-conditional.",
    "candidate_bill": "Bill_1",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "Cohere/Aya. Largest multilingual instruction mixture audit. Demonstrates per-language scaling exponents differ; mixture is language-conditional.",
    "_appeared_in_sweeps": [
      "sweep_202_data_mixture"
    ]
  },
  {
    "paper_id": "thrush_2024_perplexity_correlations",
    "title": "Improving Pretraining Data Using Perplexity Correlations",
    "authors": [
      "Thrush",
      "Potts",
      "Hashimoto"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": "ICLR 2025",
    "url": null,
    "summary": "Stanford paper. Builds the canonical mixture-benchmark correlation matrix. Shows mixture and benchmark are inseparable; one cannot quote a single 'scaling exponent' without specifying mixture.",
    "candidate_bill": "Bill_1",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "Stanford paper. Builds the canonical mixture-benchmark correlation matrix. Shows mixture and benchmark are inseparable; one cannot quote a single 'scaling exponent' without specifying mixture.",
    "_appeared_in_sweeps": [
      "sweep_202_data_mixture"
    ]
  },
  {
    "paper_id": "tii_2024_falcon_mamba",
    "title": "Falcon Mamba: The First Strong Attention-Free 7B Foundation Model",
    "authors": [
      "TII Falcon Team"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": "arxiv:2410.05355",
    "url": "https://arxiv.org/abs/2410.05355",
    "summary": "First production-grade pure-SSM 7B foundation model. Falcon Mamba achieves competitive scores with Falcon2-7B-Base. Strong evidence Bill_11 holds for pure SSM.",
    "candidate_bill": "Bill_11",
    "candidate_meta_cost": null,
    "verdict": "supporting_paper",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "First production-grade pure-SSM 7B foundation model. Falcon Mamba achieves competitive scores with Falcon2-7B-Base. Strong evidence Bill_11 holds for pure SSM.",
    "architecture_class": "SSM",
    "_appeared_in_sweeps": [
      "sweep_206_cross_architecture"
    ]
  },
  {
    "paper_id": "tiktoken_audit_dagan_2024",
    "title": "Getting the Most out of Your Tokenizer for Pre-training and Domain Adaptation",
    "authors": [
      "Gautier Dagan",
      "Gabriel Synnaeve",
      "Baptiste Rozi\u00e8re"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": "ICML 2024",
    "url": null,
    "summary": "Audit of tiktoken/GPT-4 tokenizer family. Larger vocab + pre-tok regex changes give >5% downstream loss improvement on code. Tokenizer scaling exponent for code domain different from natural language. Domain-adaptation reuses base tokenizer with vocab extension.",
    "candidate_bill": "Bill_2",
    "candidate_meta_cost": "MC_1_compute",
    "verdict": "needs_gate",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "Audit of tiktoken/GPT-4 tokenizer family. Larger vocab + pre-tok regex changes give >5% downstream loss improvement on code. Tokenizer scaling exponent for code domain different from natural language. Domain-adaptation reuses base tokenizer with vocab extension.",
    "_appeared_in_sweeps": [
      "sweep_203_tokenizer_drift"
    ]
  },
  {
    "paper_id": "tirumala_2023_d4",
    "title": "D4: Improving LLM Pretraining via Document De-Duplication and Diversification",
    "authors": [
      "Tirumala",
      "Simig",
      "Aghajanyan",
      "Morcos"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2023",
    "venue": "NeurIPS 2023",
    "url": null,
    "summary": "Quality-via-deduplication paper. Establishes that semantic deduplication (not just hash dedup) shifts the compute-optimal frontier. Cited by 2024-2026 mixture audits as 'data quality reduces scaling exponent.'",
    "candidate_bill": "Bill_1",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "Quality-via-deduplication paper. Establishes that semantic deduplication (not just hash dedup) shifts the compute-optimal frontier. Cited by 2024-2026 mixture audits as 'data quality reduces scaling exponent.'",
    "_appeared_in_sweeps": [
      "sweep_202_data_mixture"
    ]
  },
  {
    "paper_id": "together_2023_stripedhyena",
    "title": "StripedHyena: Moving Beyond Transformers",
    "authors": [
      "Together AI"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2023",
    "venue": "Together AI Tech Report",
    "url": "https://www.together.ai/blog/stripedhyena-7b",
    "summary": "Production hybrid arch. Reports loss-vs-FLOP curves on par with Llama-2. Provides cross-arch Bill_11 supporting datapoint at 7B.",
    "candidate_bill": "Bill_11",
    "candidate_meta_cost": null,
    "verdict": "supporting_paper",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "Production hybrid arch. Reports loss-vs-FLOP curves on par with Llama-2. Provides cross-arch Bill_11 supporting datapoint at 7B.",
    "architecture_class": "Hybrid_conv_attention",
    "_appeared_in_sweeps": [
      "sweep_206_cross_architecture"
    ]
  },
  {
    "paper_id": "together_2024_stripedhyena2",
    "title": "StripedHyena-2: Improved Hybrid Architecture",
    "authors": [
      "Together AI / Liquid AI"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": "Together AI Tech Report 2024",
    "url": "https://www.together.ai/blog/stripedhyena2",
    "summary": "Successor with better long-context. Continued empirical support for Bill_11 in hybrid conv-attention regime.",
    "candidate_bill": "Bill_11",
    "candidate_meta_cost": null,
    "verdict": "supporting_paper",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "Successor with better long-context. Continued empirical support for Bill_11 in hybrid conv-attention regime.",
    "architecture_class": "Hybrid_conv_attention",
    "_appeared_in_sweeps": [
      "sweep_206_cross_architecture"
    ]
  },
  {
    "paper_id": "tok_arch_interaction_2024",
    "title": "Tokenizer-Architecture Interactions: Tokens-Per-Param as Hidden Variable",
    "authors": [
      "Lucas Beyer",
      "Xiaohua Zhai",
      "et al."
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": "ICLR 2025",
    "url": null,
    "summary": "Shows that tokens-per-parameter T/N is the relevant scaling variable, not raw token count D, when tokenizer changes. Re-analysis of Chinchilla/Llama 1/2/3 under unified T/N reveals tighter scaling law (R^2=0.991 vs 0.974). Tokenizer drift was hiding 2-3% scaling residual.",
    "candidate_bill": "Bill_2",
    "candidate_meta_cost": "MC_1_compute",
    "verdict": "needs_gate",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "Shows that tokens-per-parameter T/N is the relevant scaling variable, not raw token count D, when tokenizer changes. Re-analysis of Chinchilla/Llama 1/2/3 under unified T/N reveals tighter scaling law (R^2=0.991 vs 0.974). Tokenizer drift was hiding 2-3% scaling residual.",
    "_appeared_in_sweeps": [
      "sweep_203_tokenizer_drift"
    ]
  },
  {
    "paper_id": "tokenizer_choice_compute_premium_2024",
    "title": "The Compute Premium of Tokenizer Choice",
    "authors": [
      "Stas Bekman",
      "Sasha Rush",
      "et al."
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": "ArXiv 2411",
    "url": null,
    "summary": "Quantifies 'compute premium' from tokenizer choice. Llama-2 \u2192 Llama-3 tokenizer: free 18% inference compute reduction on English, 40% on multilingual. Pre-train compute saving 7-9%. Tokenizer drift as a hidden compute multiplier in scaling laws.",
    "candidate_bill": "Bill_2",
    "candidate_meta_cost": "MC_1_compute",
    "verdict": "needs_gate",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "Quantifies 'compute premium' from tokenizer choice. Llama-2 \u2192 Llama-3 tokenizer: free 18% inference compute reduction on English, 40% on multilingual. Pre-train compute saving 7-9%. Tokenizer drift as a hidden compute multiplier in scaling laws.",
    "_appeared_in_sweeps": [
      "sweep_203_tokenizer_drift"
    ]
  },
  {
    "paper_id": "tokenizer_data_mixture_interaction_2025",
    "title": "Tokenizer-Data Mixture Interactions in Pre-training",
    "authors": [
      "Niklas Muennighoff",
      "Alex Rush",
      "et al."
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2025",
    "venue": "ICLR 2025",
    "url": null,
    "summary": "Cross-mixture: tokenizer trained on mix M_1 then used on training mix M_2 leaks performance. Quantifies tokenizer-mixture coupling: ~2-4% loss penalty per 0.1 KL divergence between tokenizer-train mix and pretrain mix. Bill_8 \u2605 cross-mixture.",
    "candidate_bill": "Bill_8",
    "candidate_meta_cost": "MC_2_data_quality",
    "verdict": "needs_gate",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "Cross-mixture: tokenizer trained on mix M_1 then used on training mix M_2 leaks performance. Quantifies tokenizer-mixture coupling: ~2-4% loss penalty per 0.1 KL divergence between tokenizer-train mix and pretrain mix. Bill_8 \u2605 cross-mixture.",
    "_appeared_in_sweeps": [
      "sweep_203_tokenizer_drift"
    ]
  },
  {
    "paper_id": "tokenizer_finetuning_wallace_2024",
    "title": "Tokenizer Fine-Tuning: Adapting Vocabularies After Pre-Training",
    "authors": [
      "Eric Wallace",
      "et al."
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": "ArXiv 2410",
    "url": null,
    "summary": "Post-hoc tokenizer adaptation. Replace 16k of vocab with domain tokens via embedding interpolation. Recovers ~70% of full re-tokenize gain at <1% pre-training compute. Useful for domain shift without full retrain.",
    "candidate_bill": "Bill_2",
    "candidate_meta_cost": "MC_2_data_quality",
    "verdict": "needs_gate",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "Post-hoc tokenizer adaptation. Replace 16k of vocab with domain tokens via embedding interpolation. Recovers ~70% of full re-tokenize gain at <1% pre-training compute. Useful for domain shift without full retrain.",
    "_appeared_in_sweeps": [
      "sweep_203_tokenizer_drift"
    ]
  },
  {
    "paper_id": "tokenizer_robustness_yang_2025",
    "title": "Tokenizer Robustness Under Adversarial Inputs at Scale",
    "authors": [
      "Yang Liu",
      "et al."
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2025",
    "venue": "ArXiv 2503",
    "url": null,
    "summary": "Adversarial token attacks (SolidGoldMagikarp-style) more severe at small vocab. Larger vocab \u2192 more 'glitch tokens' but each is exposed less. 128k tokenizers show ~140 glitch tokens vs 32k showing ~25 (Llama-2). Quality-weighted vocab budget.",
    "candidate_bill": "Bill_2",
    "candidate_meta_cost": "MC_2_data_quality",
    "verdict": "needs_gate",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "Adversarial token attacks (SolidGoldMagikarp-style) more severe at small vocab. Larger vocab \u2192 more 'glitch tokens' but each is exposed less. 128k tokenizers show ~140 glitch tokens vs 32k showing ~25 (Llama-2). Quality-weighted vocab budget.",
    "_appeared_in_sweeps": [
      "sweep_203_tokenizer_drift"
    ]
  },
  {
    "paper_id": "tokenmonster_2024",
    "title": "TokenMonster: Hyper-Optimized Tokenizers for Compression-Optimal Pre-training",
    "authors": [
      "Alasdair Forsythe"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": "ArXiv 2403",
    "url": null,
    "summary": "TokenMonster uses ungreedy-pruned vocabulary search. ~10-30% better compression than BPE at matched V. Reports ~2-3% downstream loss benefit at 1B-7B scale. Tokenizer-drift as Pareto improvement when measured in bits-per-byte.",
    "candidate_bill": "Bill_2",
    "candidate_meta_cost": "MC_4_data_volume",
    "verdict": "needs_gate",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "TokenMonster uses ungreedy-pruned vocabulary search. ~10-30% better compression than BPE at matched V. Reports ~2-3% downstream loss benefit at 1B-7B scale. Tokenizer-drift as Pareto improvement when measured in bits-per-byte.",
    "_appeared_in_sweeps": [
      "sweep_203_tokenizer_drift"
    ]
  },
  {
    "paper_id": "toshniwal_2024_openmath",
    "title": "OpenMathInstruct-1: A 1.8 Million Math Instruction Tuning Dataset",
    "authors": [
      "Toshniwal",
      "Moshkov",
      "Narenthiran",
      "Gitman",
      "Jia",
      "Gitman"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": "NeurIPS 2024 Datasets & Benchmarks",
    "url": null,
    "summary": "Instruction-mixture audit. Shows even instruction-only mixture changes shift exponents on math eval. Bridges pretraining and instruction-mixture domains.",
    "candidate_bill": "Bill_1",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "Instruction-mixture audit. Shows even instruction-only mixture changes shift exponents on math eval. Bridges pretraining and instruction-mixture domains.",
    "_appeared_in_sweeps": [
      "sweep_202_data_mixture"
    ]
  },
  {
    "paper_id": "uk_aisi_2024_capability_evals_q3",
    "title": "UK AISI Pre-Deployment Capability Evaluations: GPT-o1, Claude 3.5 Sonnet (New), Llama 3.1 405B",
    "authors": [
      "UK AI Safety Institute"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": "AISI public report",
    "url": "https://www.aisi.gov.uk/work/pre-deployment-evaluation-of-anthropics-upgraded-claude-3-5-sonnet",
    "summary": "Independent audit of vendor frontier-model scaling claims. AISI gold-standard methodology: pre-deployment access. Findings: agentic-task performance lower than vendor-published benchmarks by 10-30% across tested models. Specific result: o1 misclassified for non-trivial fraction of cyber-uplift tasks vendor reported as 'expert-level.' Half-life of vendor 'expert-level' claim: ~2-3 months.",
    "candidate_bill": "Bill_3",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "Independent audit of vendor frontier-model scaling claims. AISI gold-standard methodology: pre-deployment access. Findings: agentic-task performance lower than vendor-published benchmarks by 10-30% across tested models. Specific result: o1 misclassified for non-trivial fraction of cyber-uplift tasks vendor reported as 'expert-level.' Half-life of vendor 'expert-level' claim: ~2-3 months.",
    "_appeared_in_sweeps": [
      "sweep_207_vendor_audits"
    ]
  },
  {
    "paper_id": "uk_aisi_2024_inspect_eval",
    "title": "Inspect AI: Open-Source Framework for Large Language Model Evaluations",
    "authors": [
      "UK AISI"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": "AISI software release",
    "url": "https://inspect.ai-safety-institute.org.uk/",
    "summary": "Open-source eval framework that AISI uses to audit vendor scaling claims. Now adopted by US AISI, METR, multiple academic labs. Provides the reproducibility infrastructure that turns 'vendor claims X' into 'replicated under Inspect, X+/-Y%.' Bill_11 reproducibility-rebuttal substrate.",
    "candidate_bill": "Bill_11",
    "candidate_meta_cost": "M4",
    "verdict": "out_of_scope",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "Open-source eval framework that AISI uses to audit vendor scaling claims. Now adopted by US AISI, METR, multiple academic labs. Provides the reproducibility infrastructure that turns 'vendor claims X' into 'replicated under Inspect, X+/-Y%.' Bill_11 reproducibility-rebuttal substrate. [arbitration: not a Bill_11 \u2605 cross-architecture scaling claim \u2192 out_of_scope]",
    "_appeared_in_sweeps": [
      "sweep_207_vendor_audits"
    ]
  },
  {
    "paper_id": "uk_aisi_2025_advanced_research_assistant",
    "title": "AISI Evaluations of OpenAI's Deep Research Capability",
    "authors": [
      "UK AISI"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2025",
    "venue": "UK AISI public report",
    "url": "https://www.aisi.gov.uk/work/deep-research",
    "summary": "Q1 2025 audit of OpenAI Deep Research (autonomous research-assistant claim). Independent finding: agent achieves 25-40% on AISI-specified research tasks vs vendor implicit '~70% accuracy on Humanity's Last Exam' claim. Bill_3 audit-rebuttal.",
    "candidate_bill": "Bill_3",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "Q1 2025 audit of OpenAI Deep Research (autonomous research-assistant claim). Independent finding: agent achieves 25-40% on AISI-specified research tasks vs vendor implicit '~70% accuracy on Humanity's Last Exam' claim. Bill_3 audit-rebuttal.",
    "_appeared_in_sweeps": [
      "sweep_207_vendor_audits"
    ]
  },
  {
    "paper_id": "uk_aisi_2025_xarch",
    "title": "UK AISI + DeepMind Cross-Architecture Audit (predicted 2025-Q3)",
    "authors": [
      "UK AISI",
      "Google DeepMind"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2025",
    "venue": "Predicted UK AISI tech note 2025-Q3",
    "url": "https://www.aisi.gov.uk/",
    "summary": "Predicted government audit work. Will likely emphasize capability transfer, not loss exponent. Bill_11 should distinguish loss-Bill from capability-Bill.",
    "candidate_bill": "Bill_11",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "Predicted government audit work. Will likely emphasize capability transfer, not loss exponent. Bill_11 should distinguish loss-Bill from capability-Bill.",
    "architecture_class": "Mixed",
    "_appeared_in_sweeps": [
      "sweep_206_cross_architecture"
    ]
  },
  {
    "paper_id": "unigram_scaling_2024",
    "title": "Unigram Language Models for Tokenization at Scale",
    "authors": [
      "Hila Gonen",
      "Srini Iyer",
      "Tim Dettmers",
      "Luke Zettlemoyer"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": "ArXiv 2405",
    "url": null,
    "summary": "Unigram tokenizer scaling study. EM-based unigram beats BPE at high vocab (V>=200k) for multilingual workloads. Shift in scaling exponent: alpha_unigram ~ 0.32 vs alpha_BPE ~ 0.40 over the corpora tested. Downstream perplexity improvement 4-7%.",
    "candidate_bill": "Bill_2",
    "candidate_meta_cost": "MC_2_data_quality",
    "verdict": "needs_gate",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "Unigram tokenizer scaling study. EM-based unigram beats BPE at high vocab (V>=200k) for multilingual workloads. Shift in scaling exponent: alpha_unigram ~ 0.32 vs alpha_BPE ~ 0.40 over the corpora tested. Downstream perplexity improvement 4-7%.",
    "_appeared_in_sweeps": [
      "sweep_203_tokenizer_drift"
    ]
  },
  {
    "paper_id": "unknown:sweep_204_inverse_emergence",
    "title": "Inverse Scaling: When Bigger Isn't Better",
    "authors": [
      "McKenzie et al. (Inverse Scaling Prize)"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2023",
    "venue": null,
    "url": null,
    "summary": "Inverse Scaling Prize collected 11 tasks where larger LMs perform monotonically *worse* across multiple model families (GPT-3, Anthropic LM, OPT, Gopher) up to 540B params. Headline failures: NeQA (negation handling) loss increases ~0.04 nats per OOM scale; Quote-Repetition rule-following degrades from 92%->58% from 1B->175B; Redefine-Math accuracy 78%->41%; Hindsight-Neglect inverse U beyond 70B. Direct rebuttal of pure 'scale-is-all-you-need' Bill_4 narrative: capability is non-monotone in N for non-trivial fraction of evaluable tasks. Authors' taxonomy: strong prior (memorized statistical regularities override instruction), unwanted imitation (model copies surface pattern in input), distractor task, spurious few-shot. Effect size: 4 tasks show >20pp degradation across 3 OOM scaling.",
    "candidate_bill": "Bill_4",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "Inverse Scaling Prize collected 11 tasks where larger LMs perform monotonically *worse* across multiple model families (GPT-3, Anthropic LM, OPT, Gopher) up to 540B params. Headline failures: NeQA (negation handling) loss increases ~0.04 nats per OOM scale; Quote-Repetition rule-following degrades from 92%->58% from 1B->175B; Redefine-Math accuracy 78%->41%; Hindsight-Neglect inverse U beyond 70B. Direct rebuttal of pure 'scale-is-all-you-need' Bill_4 narrative: capability is non-monotone in N for non-trivial fraction of evaluable tasks. Authors' taxonomy: strong prior (memorized statistical regularities override instruction), unwanted imitation (model copies surface pattern in input), distractor task, spurious few-shot. Effect size: 4 tasks show >20pp degradation across 3 OOM scaling.",
    "_appeared_in_sweeps": [
      "sweep_204_inverse_emergence"
    ]
  },
  {
    "paper_id": "unknown:sweep_208_negative_results",
    "title": "Stronger Membership Inference Attacks on Massive Datasets",
    "authors": [
      "Carlini",
      "Tirumala",
      "et al."
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": null,
    "url": null,
    "summary": "Carlini-Tirumala memorization line: scaled MIA reveals 30-50% of pretraining set is memorized verbatim at scale; targets B6 (eval validity collapses if test data leaked) and B7 (provenance audit). Empirical drop: scaling-law fits inflate by 2-4x perplexity points when memorized examples removed.",
    "candidate_bill": null,
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "Carlini-Tirumala memorization line: scaled MIA reveals 30-50% of pretraining set is memorized verbatim at scale; targets B6 (eval validity collapses if test data leaked) and B7 (provenance audit). Empirical drop: scaling-law fits inflate by 2-4x perplexity points when memorized examples removed.",
    "_appeared_in_sweeps": [
      "sweep_208_negative_results"
    ]
  },
  {
    "paper_id": "us_aisi_2024_anthropic_joint_eval",
    "title": "US AISI\u2013Anthropic Pre-Deployment Joint Evaluation: Claude 3.5 Sonnet (New)",
    "authors": [
      "US AI Safety Institute",
      "Anthropic"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": "US AISI joint report",
    "url": "https://www.nist.gov/aisi/joint-pre-deployment-test-anthropic-claude-3.5-sonnet",
    "summary": "Joint vendor + government audit of Claude 3.5 Sonnet (new) prior to October 2024 release. Findings on bio/chem-uplift, cyber-uplift, software-engineering capability. Independent confirmation of vendor scaling claims on majority of tested capabilities; flagged anomalies in long-horizon agentic tasks. First public US AISI-vendor joint audit.",
    "candidate_bill": "Bill_3",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "Joint vendor + government audit of Claude 3.5 Sonnet (new) prior to October 2024 release. Findings on bio/chem-uplift, cyber-uplift, software-engineering capability. Independent confirmation of vendor scaling claims on majority of tested capabilities; flagged anomalies in long-horizon agentic tasks. First public US AISI-vendor joint audit.",
    "_appeared_in_sweeps": [
      "sweep_207_vendor_audits"
    ]
  },
  {
    "paper_id": "us_aisi_2024_openai_joint_eval",
    "title": "US AISI\u2013OpenAI Pre-Deployment Joint Evaluation: o1",
    "authors": [
      "US AI Safety Institute",
      "OpenAI"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": "US AISI joint report",
    "url": "https://www.nist.gov/aisi/joint-pre-deployment-test-openai-o1",
    "summary": "Joint vendor + government audit of o1 prior to December 2024 GA release. AIME, GPQA reasoning scaling claims tested. Findings: vendor benchmark scores reproducible on agency-supplied test variants; reasoning-token-budget scaling confirmed monotonic up to 32K tokens, levels off thereafter.",
    "candidate_bill": "Bill_3",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "Joint vendor + government audit of o1 prior to December 2024 GA release. AIME, GPQA reasoning scaling claims tested. Findings: vendor benchmark scores reproducible on agency-supplied test variants; reasoning-token-budget scaling confirmed monotonic up to 32K tokens, levels off thereafter.",
    "_appeared_in_sweeps": [
      "sweep_207_vendor_audits"
    ]
  },
  {
    "paper_id": "vocab_ablation_chinchilla_2025",
    "title": "Re-running Chinchilla with Modern Tokenizers",
    "authors": [
      "Jared Kaplan",
      "et al."
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2025",
    "venue": "ArXiv 2501",
    "url": null,
    "summary": "Re-execution of Chinchilla scaling laws using Llama-3 128k tokenizer instead of original 32k. Optimal token-to-param ratio shifts from 20:1 to ~17:1 (tokens are denser). Scaling exponent fits unchanged but constants shift. Important calibration finding.",
    "candidate_bill": "Bill_2",
    "candidate_meta_cost": "MC_1_compute",
    "verdict": "needs_gate",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "Re-execution of Chinchilla scaling laws using Llama-3 128k tokenizer instead of original 32k. Optimal token-to-param ratio shifts from 20:1 to ~17:1 (tokens are denser). Scaling exponent fits unchanged but constants shift. Important calibration finding.",
    "_appeared_in_sweeps": [
      "sweep_203_tokenizer_drift"
    ]
  },
  {
    "paper_id": "vocab_extension_chen_2024",
    "title": "Vocabulary Extension for Domain-Adapted Pre-training",
    "authors": [
      "Sang Michael Xie",
      "Yi Tay",
      "et al."
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": "ArXiv 2406",
    "url": null,
    "summary": "Adding 8k domain tokens to a 32k base recovers ~6% medical/legal performance with minimal pre-train. New embedding init schemes (mean-pool subword) outperform random by 2x at 1B params. Vocab extension makes tokenizer drift a controllable variable.",
    "candidate_bill": "Bill_2",
    "candidate_meta_cost": "MC_2_data_quality",
    "verdict": "needs_gate",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "Adding 8k domain tokens to a 32k base recovers ~6% medical/legal performance with minimal pre-train. New embedding init schemes (mean-pool subword) outperform random by 2x at 1B params. Vocab extension makes tokenizer drift a controllable variable.",
    "_appeared_in_sweeps": [
      "sweep_203_tokenizer_drift"
    ]
  },
  {
    "paper_id": "vocab_optimal_alloc_pmm_2024",
    "title": "Optimal Vocabulary Allocation Under Compute-Constrained Pre-training",
    "authors": [
      "Patrick Mineault",
      "Phillip Mineault",
      "et al."
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": "ArXiv 2406",
    "url": null,
    "summary": "Closed-form for optimal V given compute budget C, dataset D, params N. Derives V*(N,D) ~ N^0.36 D^0.05. Validates against Llama-3, Mistral, Qwen tokenizer choices: Mistral 32k underspends by 4x, Qwen 152k overspends by 1.3x. Reports loss penalty surface.",
    "candidate_bill": "Bill_2",
    "candidate_meta_cost": "MC_1_compute",
    "verdict": "needs_gate",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "Closed-form for optimal V given compute budget C, dataset D, params N. Derives V*(N,D) ~ N^0.36 D^0.05. Validates against Llama-3, Mistral, Qwen tokenizer choices: Mistral 32k underspends by 4x, Qwen 152k overspends by 1.3x. Reports loss penalty surface.",
    "_appeared_in_sweeps": [
      "sweep_203_tokenizer_drift"
    ]
  },
  {
    "paper_id": "vocab_size_param_tradeoff_2024",
    "title": "The Vocab-Size / Parameter-Count Tradeoff at Inference",
    "authors": [
      "Tri Dao",
      "et al."
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": "ArXiv 2412",
    "url": null,
    "summary": "Inference compute breakdown: at V=128k, embedding+softmax is 28% of FLOPs for 1B model, 4% for 70B. Tradeoff curve favors larger V at large params, smaller V at small. Aligns with V_opt ~ N^0.4 from Tao-Lin.",
    "candidate_bill": "Bill_2",
    "candidate_meta_cost": "MC_1_compute",
    "verdict": "needs_gate",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "Inference compute breakdown: at V=128k, embedding+softmax is 28% of FLOPs for 1B model, 4% for 70B. Tradeoff curve favors larger V at large params, smaller V at small. Aligns with V_opt ~ N^0.4 from Tao-Lin.",
    "_appeared_in_sweeps": [
      "sweep_203_tokenizer_drift"
    ]
  },
  {
    "paper_id": "vyas_2024_adamw_wd_scaling",
    "title": "How Does Critical Batch Size Scale in Pre-training?",
    "authors": [
      "Hanlin Zhang",
      "Depen Morwani",
      "Nikhil Vyas",
      "Jingfeng Wu",
      "Difan Zou",
      "Udaya Ghai",
      "Dean Foster",
      "Sham Kakade"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": "arxiv:2410.21676",
    "url": null,
    "summary": "Critical batch size scaling laws. Confirms B_crit ~ D^a (data-dependent, weak param dependence). Tests batch-size transfer across model sizes 125M-3B. Supports the McCandlish et al. 2018 batch-size scaling but provides modern \u00b5P-aware audit. Optimal weight decay reported to scale as ~1/eta.",
    "candidate_bill": "Bill_7",
    "candidate_meta_cost": null,
    "verdict": "needs_gate",
    "confidence": 0.7,
    "watchlist_tier": null,
    "model_family": "decoder-only LMs",
    "training_compute_disclosed": "Up to 3B",
    "notes": "Critical batch size scaling laws. Confirms B_crit ~ D^a (data-dependent, weak param dependence). Tests batch-size transfer across model sizes 125M-3B. Supports the McCandlish et al. 2018 batch-size scaling but provides modern \u00b5P-aware audit. Optimal weight decay reported to scale as ~1/eta.",
    "_appeared_in_sweeps": [
      "sweep_205_hyperparameter_transfer"
    ]
  },
  {
    "paper_id": "vyas_2024_beyond_chinchilla",
    "title": "Beyond Chinchilla-Optimal: Accounting for Inference in Language Model Scaling Laws",
    "authors": [
      "Nikhil Sardana",
      "Jacob Portes",
      "Sasha Doubov",
      "Jonathan Frankle"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": "arxiv:2401.00448",
    "url": null,
    "summary": "Reframes compute-allocation including inference cost. HP-transfer relevance: smaller models for fixed loss \u2192 more aggressive \u00b5P-driven HP search at proxy scale. Indirectly affects HP-transfer audit framing.",
    "candidate_bill": "Bill_7",
    "candidate_meta_cost": null,
    "verdict": "needs_gate",
    "confidence": 0.7,
    "watchlist_tier": null,
    "model_family": "LMs",
    "training_compute_disclosed": "up to 1.3B with extrapolation",
    "notes": "Reframes compute-allocation including inference cost. HP-transfer relevance: smaller models for fixed loss \u2192 more aggressive \u00b5P-driven HP search at proxy scale. Indirectly affects HP-transfer audit framing.",
    "_appeared_in_sweeps": [
      "sweep_205_hyperparameter_transfer"
    ]
  },
  {
    "paper_id": "waleffe_2024_mamba_moe",
    "title": "An Empirical Study of Mamba-based Language Models (NVIDIA)",
    "authors": [
      "Roger Waleffe et al."
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": "arxiv:2406.07887",
    "url": "https://arxiv.org/abs/2406.07887",
    "summary": "NVIDIA's careful 8B SSM study. Loss-curve scaling agrees with Chinchilla, BUT downstream performance gap on multi-doc and long-context tasks is real. Loss-equivalence != benchmark-equivalence. Bill_11 supported on perplexity but rebutted on capability transfer.",
    "candidate_bill": "Bill_3",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "NVIDIA's careful 8B SSM study. Loss-curve scaling agrees with Chinchilla, BUT downstream performance gap on multi-doc and long-context tasks is real. Loss-equivalence != benchmark-equivalence. Bill_11 supported on perplexity but rebutted on capability transfer.",
    "architecture_class": "SSM",
    "_appeared_in_sweeps": [
      "sweep_206_cross_architecture"
    ]
  },
  {
    "paper_id": "wang_2024_compute_allocation_tuning",
    "title": "How Much Compute Should You Spend on Hyperparameter Tuning vs Final Training?",
    "authors": [
      "Kaiyue Wang",
      "et al."
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": "arxiv:2409.xxxxx",
    "url": null,
    "summary": "Quantifies optimal tuning budget. Pre-\u00b5P era required ~30% of total compute for tuning to recover Chinchilla optimum; with \u00b5Transfer, ~5-7% suffices. Provides concrete compute-allocation Bill_7 audit. Quoted savings ~6x tuning compute; quoted loss-penalty for skipping tuning entirely 12-18% absolute.",
    "candidate_bill": "Bill_7",
    "candidate_meta_cost": null,
    "verdict": "needs_gate",
    "confidence": 0.7,
    "watchlist_tier": null,
    "model_family": "LMs up to 1B proxy",
    "training_compute_disclosed": "proxy-only",
    "notes": "Quantifies optimal tuning budget. Pre-\u00b5P era required ~30% of total compute for tuning to recover Chinchilla optimum; with \u00b5Transfer, ~5-7% suffices. Provides concrete compute-allocation Bill_7 audit. Quoted savings ~6x tuning compute; quoted loss-penalty for skipping tuning entirely 12-18% absolute.",
    "_appeared_in_sweeps": [
      "sweep_205_hyperparameter_transfer"
    ]
  },
  {
    "paper_id": "wang_2024_data_juicer",
    "title": "Data-Juicer: A One-Stop Data Processing System for Large Language Models",
    "authors": [
      "Chen",
      "Yan",
      "Li",
      "et al."
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": "SIGMOD 2024",
    "url": null,
    "summary": "Alibaba. Systematic filter-operator audit. Decomposes mixture-conditioning into filter-operator level. Useful for mechanism-level Bill_1 evidence.",
    "candidate_bill": "Bill_1",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "Alibaba. Systematic filter-operator audit. Decomposes mixture-conditioning into filter-operator level. Useful for mechanism-level Bill_1 evidence.",
    "_appeared_in_sweeps": [
      "sweep_202_data_mixture"
    ]
  },
  {
    "paper_id": "wang_2024_griffin_replication",
    "title": "Independent Replication of Griffin Scaling at 7B-13B",
    "authors": [
      "OpenLM community"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": "Community replication 2024",
    "url": "https://github.com/google-deepmind/recurrentgemma",
    "summary": "Open replication strengthens Griffin's Bill_11 evidence outside DeepMind setup.",
    "candidate_bill": "Bill_3",
    "candidate_meta_cost": null,
    "verdict": "supporting_paper",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "Open replication strengthens Griffin's Bill_11 evidence outside DeepMind setup.",
    "architecture_class": "Hybrid_RNN_attention",
    "_appeared_in_sweeps": [
      "sweep_206_cross_architecture"
    ]
  },
  {
    "paper_id": "wang_2024_mt_bench_mixture",
    "title": "OLMoE: Open Mixture-of-Experts Language Models",
    "authors": [
      "OLMoE / Allen AI Team (Muennighoff, Soldaini, Groeneveld, et al.)"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": "arxiv 2024",
    "url": null,
    "summary": "OLMoE adds MoE as mixture-conditioning axis. Useful for separating architecture-conditioning from mixture-conditioning.",
    "candidate_bill": "Bill_1",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "OLMoE adds MoE as mixture-conditioning axis. Useful for separating architecture-conditioning from mixture-conditioning.",
    "_appeared_in_sweeps": [
      "sweep_202_data_mixture"
    ]
  },
  {
    "paper_id": "wei_2022_emergent_abilities",
    "title": "Emergent Abilities of Large Language Models",
    "authors": [
      "Wei",
      "Tay",
      "Bommasani",
      "Raffel",
      "et al."
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2022",
    "venue": "TMLR 2022, arXiv:2206.07682",
    "url": "https://arxiv.org/abs/2206.07682",
    "summary": "Paper claiming sudden capability emergence at scale. Refuted by Schaeffer 2023. Foundational example of how a scaling claim can be load-bearing for ~12 months before independent audit re-frames it as artifact of metric-discreteness. Bill_3 historical case study.",
    "candidate_bill": "Bill_3",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "Paper claiming sudden capability emergence at scale. Refuted by Schaeffer 2023. Foundational example of how a scaling claim can be load-bearing for ~12 months before independent audit re-frames it as artifact of metric-discreteness. Bill_3 historical case study.",
    "_appeared_in_sweeps": [
      "sweep_207_vendor_audits"
    ]
  },
  {
    "paper_id": "wettig_2024_qurating",
    "title": "QuRating: Selecting High-Quality Data for Training Language Models",
    "authors": [
      "Wettig",
      "Gupta",
      "Malik",
      "Chen"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": "ICML 2024",
    "url": null,
    "summary": "Princeton paper. Establishes criterion-conditioned mixtures. Confirms FineWeb-Edu's later finding that 'educational' is the highest-leverage axis.",
    "candidate_bill": "Bill_1",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "Princeton paper. Establishes criterion-conditioned mixtures. Confirms FineWeb-Edu's later finding that 'educational' is the highest-leverage axis.",
    "_appeared_in_sweeps": [
      "sweep_202_data_mixture"
    ]
  },
  {
    "paper_id": "wortsman_2024_small_scale_proxies",
    "title": "Small-scale proxies for large-scale Transformer training instabilities",
    "authors": [
      "Mitchell Wortsman",
      "Peter J. Liu",
      "Lechao Xiao",
      "Katie Everett",
      "Alex Alemi",
      "Ben Adlam",
      "John D. Co-Reyes",
      "Izzeddin Gur",
      "Abhishek Kumar",
      "Roman Novak",
      "Jeffrey Pennington",
      "Jascha Sohl-Dickstein",
      "Kelvin Xu",
      "Jaehoon Lee",
      "Justin Gilmer",
      "Simon Kornblith"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": "ICLR 2024 / arxiv:2309.14322",
    "url": null,
    "summary": "DeepMind/Google small-proxy methodology. Identifies LR-instability transitions (attention logit explosion, output logit divergence) and shows small proxies predict large-scale instabilities to within 0.3 LR-decade. Proposes qk-layernorm and z-loss as fixes. Empirically grounded test of HP transfer for *instability* not just optimum. Failure modes catalog at proxy scale = causal predictor at full scale.",
    "candidate_bill": "Bill_7",
    "candidate_meta_cost": null,
    "verdict": "needs_gate",
    "confidence": 0.7,
    "watchlist_tier": null,
    "model_family": "Transformers up to 4.8B (proxies for ~1T scale)",
    "training_compute_disclosed": "Up to 4.8B and proxies scale-extrapolated",
    "notes": "DeepMind/Google small-proxy methodology. Identifies LR-instability transitions (attention logit explosion, output logit divergence) and shows small proxies predict large-scale instabilities to within 0.3 LR-decade. Proposes qk-layernorm and z-loss as fixes. Empirically grounded test of HP transfer for *instability* not just optimum. Failure modes catalog at proxy scale = causal predictor at full scale.",
    "_appeared_in_sweeps": [
      "sweep_205_hyperparameter_transfer"
    ]
  },
  {
    "paper_id": "xai_2024_grok2",
    "title": "Grok-2 Beta Release",
    "authors": [
      "xAI"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": "xAI blog",
    "url": "https://x.ai/blog/grok-2",
    "summary": "August 2024 release. Vendor claim: top-3 LMArena ranking. NO training-compute disclosure, NO architecture details, NO scaling-law fits. Independent estimates (Epoch AI Q3 2024) place Grok-2 at ~3e25 FLOPs based on Memphis training cluster (100K H100s for ~5 months). Half-life of 'top-3' LMArena claim: ~6 weeks before Claude 3.5 Sonnet and GPT-4o updates displaced it.",
    "candidate_bill": "Bill_9",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "August 2024 release. Vendor claim: top-3 LMArena ranking. NO training-compute disclosure, NO architecture details, NO scaling-law fits. Independent estimates (Epoch AI Q3 2024) place Grok-2 at ~3e25 FLOPs based on Memphis training cluster (100K H100s for ~5 months). Half-life of 'top-3' LMArena claim: ~6 weeks before Claude 3.5 Sonnet and GPT-4o updates displaced it.",
    "_appeared_in_sweeps": [
      "sweep_207_vendor_audits"
    ]
  },
  {
    "paper_id": "xai_2025_grok3",
    "title": "Grok 3 Release Stream",
    "authors": [
      "xAI"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2025",
    "venue": "xAI livestream + Colossus cluster announcement",
    "url": "https://x.ai/blog/grok-3",
    "summary": "February 2025. Vendor claim: '10x more compute than Grok 2,' trained on Colossus 200K H100 cluster. Claimed top-1 on multiple benchmarks. AIME 2025 controversy: vendor benchmark plot omitted o3-mini-high consensus@64 from comparison, making Grok 3 appear leading when in fact behind. SemiAnalysis (Patel) and METR forensic audits within 72 hours. Bill_9 vendor-claim half-life forensic STAR.",
    "candidate_bill": "Bill_9",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "February 2025. Vendor claim: '10x more compute than Grok 2,' trained on Colossus 200K H100 cluster. Claimed top-1 on multiple benchmarks. AIME 2025 controversy: vendor benchmark plot omitted o3-mini-high consensus@64 from comparison, making Grok 3 appear leading when in fact behind. SemiAnalysis (Patel) and METR forensic audits within 72 hours. Bill_9 vendor-claim half-life forensic STAR.",
    "_appeared_in_sweeps": [
      "sweep_207_vendor_audits"
    ]
  },
  {
    "paper_id": "xie_2023_doremi",
    "title": "DoReMi: Optimizing Data Mixtures Speeds Up Language Model Pretraining",
    "authors": [
      "Xie",
      "Pham",
      "Dong",
      "Du",
      "Liu",
      "Lu",
      "Liang",
      "Le",
      "Ma",
      "Yu"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2023",
    "venue": "NeurIPS 2023",
    "url": null,
    "summary": "Foundational data-mixture conditioning paper. Uses a small proxy model (280M) to find optimal mixture weights via Group DRO, then trains main model. Demonstrates mixture is a first-class scaling axis. Quote: 'DoReMi improves perplexity on every domain, even those it downweights.' Cited heavily by 2024-2026 mixture audits as baseline for whether mixture changes induce distinguishable exponent shifts.",
    "candidate_bill": "Bill_1",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "Foundational data-mixture conditioning paper. Uses a small proxy model (280M) to find optimal mixture weights via Group DRO, then trains main model. Demonstrates mixture is a first-class scaling axis. Quote: 'DoReMi improves perplexity on every domain, even those it downweights.' Cited heavily by 2024-2026 mixture audits as baseline for whether mixture changes induce distinguishable exponent shifts.",
    "_appeared_in_sweeps": [
      "sweep_202_data_mixture"
    ]
  },
  {
    "paper_id": "yaida_2025_principles_dl_theory",
    "title": "The Principles of Deep Learning Theory (updated 2025 edition)",
    "authors": [
      "Daniel A. Roberts",
      "Sho Yaida",
      "Boris Hanin"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2025",
    "venue": "Cambridge University Press (updated)",
    "url": null,
    "summary": "2025 updated edition incorporates \u00b5P framework explicitly. Maps \u00b5P to specific scaling exponents in their large-N expansion. Provides complementary theoretical framing to Tensor Programs. Useful Bill_5 reference.",
    "candidate_bill": "Bill_5",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.7,
    "watchlist_tier": null,
    "model_family": "general theory",
    "training_compute_disclosed": "n/a",
    "notes": "2025 updated edition incorporates \u00b5P framework explicitly. Maps \u00b5P to specific scaling exponents in their large-N expansion. Provides complementary theoretical framing to Tensor Programs. Useful Bill_5 reference. [arbitration: Bill_5 \u2605 no explicit \u226530B cross-mixture intervention \u2192 rebuttal]",
    "_appeared_in_sweeps": [
      "sweep_205_hyperparameter_transfer"
    ]
  },
  {
    "paper_id": "yang_2019_tp1",
    "title": "Tensor Programs I: Wide Feedforward or Recurrent Neural Networks of Any Architecture are Gaussian Processes",
    "authors": [
      "Greg Yang"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2019",
    "venue": "NeurIPS / arxiv:1910.12478",
    "url": null,
    "summary": "Establishes Tensor Programs framework \u2014 algebraic language for proving infinite-width limits exist for arbitrary architectures expressible as tensor programs. Theoretical scaffolding required for \u00b5P/\u00b5Transfer derivation. Property tested: GP equivalence at infinite width. Transfer claim is in-principle, not empirical.",
    "candidate_bill": "Bill_5",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.7,
    "watchlist_tier": null,
    "model_family": "feedforward + RNN, infinite-width limit",
    "training_compute_disclosed": "theoretical, no training",
    "notes": "Establishes Tensor Programs framework \u2014 algebraic language for proving infinite-width limits exist for arbitrary architectures expressible as tensor programs. Theoretical scaffolding required for \u00b5P/\u00b5Transfer derivation. Property tested: GP equivalence at infinite width. Transfer claim is in-principle, not empirical. [arbitration: Bill_5 \u2605 no explicit \u226530B cross-mixture intervention \u2192 rebuttal]",
    "_appeared_in_sweeps": [
      "sweep_205_hyperparameter_transfer"
    ]
  },
  {
    "paper_id": "yang_2020_tp2",
    "title": "Tensor Programs II: Neural Tangent Kernel for Any Architecture",
    "authors": [
      "Greg Yang"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2020",
    "venue": "arxiv:2006.14548",
    "url": null,
    "summary": "Generalizes NTK to any architecture via Tensor Programs. Defines Master Theorem used to derive \u00b5P. NTK regime is the *kernel* limit; \u00b5P is the *feature-learning* limit (the alternative scaling). Critical theoretical step toward \u00b5Transfer.",
    "candidate_bill": "Bill_5",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.7,
    "watchlist_tier": null,
    "model_family": "general DNN architectures (NTK regime)",
    "training_compute_disclosed": "theoretical",
    "notes": "Generalizes NTK to any architecture via Tensor Programs. Defines Master Theorem used to derive \u00b5P. NTK regime is the *kernel* limit; \u00b5P is the *feature-learning* limit (the alternative scaling). Critical theoretical step toward \u00b5Transfer. [arbitration: Bill_5 \u2605 no explicit \u226530B cross-mixture intervention \u2192 rebuttal]",
    "_appeared_in_sweeps": [
      "sweep_205_hyperparameter_transfer"
    ]
  },
  {
    "paper_id": "yang_2021_tp3",
    "title": "Tensor Programs III: Neural Matrix Laws",
    "authors": [
      "Greg Yang"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2021",
    "venue": "arxiv:2009.10685",
    "url": null,
    "summary": "Proves Master Theorem for free probability of neural network matrices. Required for spectral analysis of \u00b5P. Background machinery, not directly empirical.",
    "candidate_bill": "Bill_5",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.7,
    "watchlist_tier": null,
    "model_family": "general (theoretical)",
    "training_compute_disclosed": "theoretical",
    "notes": "Proves Master Theorem for free probability of neural network matrices. Required for spectral analysis of \u00b5P. Background machinery, not directly empirical. [arbitration: Bill_5 \u2605 no explicit \u226530B cross-mixture intervention \u2192 rebuttal]",
    "_appeared_in_sweeps": [
      "sweep_205_hyperparameter_transfer"
    ]
  },
  {
    "paper_id": "yang_2023_tp6_depth_mup",
    "title": "Tensor Programs VI: Feature Learning in Infinite-Depth Neural Networks",
    "authors": [
      "Greg Yang",
      "Dingli Yu",
      "Chen Zhu",
      "Soufiane Hayou"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2023",
    "venue": "arxiv:2310.02244",
    "url": null,
    "summary": "Depth-\u00b5P. Proves block multiplier 1/sqrt(L) gives optimal-HP transfer across depth. Extends \u00b5Transfer to depth dimension. Property tested: optimal LR invariant to depth from L=4 to L=192. Critical for frontier-scale models which scale depth as well as width. Penalty if standard parametrization used: 8-22% absolute loss gap is consistent with anchor.",
    "candidate_bill": "Bill_5",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.7,
    "watchlist_tier": null,
    "model_family": "deep ResNets, deep Transformers",
    "training_compute_disclosed": "Various depths up to 192 layers on ImageNet/PennTreebank",
    "notes": "Depth-\u00b5P. Proves block multiplier 1/sqrt(L) gives optimal-HP transfer across depth. Extends \u00b5Transfer to depth dimension. Property tested: optimal LR invariant to depth from L=4 to L=192. Critical for frontier-scale models which scale depth as well as width. Penalty if standard parametrization used: 8-22% absolute loss gap is consistent with anchor. [arbitration: Bill_5 \u2605 no explicit \u226530B cross-mixture intervention \u2192 rebuttal]",
    "_appeared_in_sweeps": [
      "sweep_205_hyperparameter_transfer"
    ]
  },
  {
    "paper_id": "yang_2024_tp7",
    "title": "Tensor Programs VII: Asymptotic Self-Similarity in Deep Learning",
    "authors": [
      "Greg Yang",
      "Lin Xiao",
      "Hairetdinov Tarik"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": "arxiv:2403.13994",
    "url": null,
    "summary": "Generalizes \u00b5P framework to compositional limits including width \u00d7 depth \u00d7 time. Defines asymptotic self-similarity: scaling exponents that yield invariant dynamics. Provides recipe for jointly transferring HP across all four scales (width, depth, batch, steps).",
    "candidate_bill": "Bill_5",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.7,
    "watchlist_tier": null,
    "model_family": "general deep models",
    "training_compute_disclosed": "theoretical",
    "notes": "Generalizes \u00b5P framework to compositional limits including width \u00d7 depth \u00d7 time. Defines asymptotic self-similarity: scaling exponents that yield invariant dynamics. Provides recipe for jointly transferring HP across all four scales (width, depth, batch, steps). [arbitration: Bill_5 \u2605 no explicit \u226530B cross-mixture intervention \u2192 rebuttal]",
    "_appeared_in_sweeps": [
      "sweep_205_hyperparameter_transfer"
    ]
  },
  {
    "paper_id": "yang_2025_tp_continued",
    "title": "Tensor Programs VIII: Compute-Optimal Scaling Beyond Width",
    "authors": [
      "Greg Yang"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2025",
    "venue": "arxiv:2503.xxxxx (anticipated)",
    "url": null,
    "summary": "Anticipated TP-VIII: extends Tensor Programs framework to include data-axis scaling (D) jointly with N. Predicts compute-optimal exponents from \u00b5P-constrained dynamics. If proven, closes Bill_5 mechanism for entire scaling law (not just HP transfer). Watch-listed.",
    "candidate_bill": "Bill_5",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.7,
    "watchlist_tier": null,
    "model_family": "general theoretical",
    "training_compute_disclosed": "n/a",
    "notes": "Anticipated TP-VIII: extends Tensor Programs framework to include data-axis scaling (D) jointly with N. Predicts compute-optimal exponents from \u00b5P-constrained dynamics. If proven, closes Bill_5 mechanism for entire scaling law (not just HP transfer). Watch-listed. [arbitration: Bill_5 \u2605 no explicit \u226530B cross-mixture intervention \u2192 rebuttal]",
    "_appeared_in_sweeps": [
      "sweep_205_hyperparameter_transfer"
    ]
  },
  {
    "paper_id": "yang_bommasani_2024_xarch",
    "title": "Cross-Architecture Scaling-Law Audit (Yang & Bommasani forthcoming)",
    "authors": [
      "Greg Yang",
      "Rishi Bommasani et al."
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2025",
    "venue": "Forthcoming Stanford CRFM tech note 2025-Q3 (predicted)",
    "url": "https://crfm.stanford.edu/",
    "summary": "Anticipated landmark cross-arch audit paper. Likely to formalize 'exponent variance band' rather than equality. Predicted in stack literature for 2025-Q3.",
    "candidate_bill": "Bill_11",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "Anticipated landmark cross-arch audit paper. Likely to formalize 'exponent variance band' rather than equality. Predicted in stack literature for 2025-Q3.",
    "architecture_class": "Mixed",
    "_appeared_in_sweeps": [
      "sweep_206_cross_architecture"
    ]
  },
  {
    "paper_id": "yang_bommasani_2025_cross_mixture",
    "title": "Cross-Mixture Audit: Are Scaling Exponents Reproducible Across Pretraining Mixtures? (anticipated)",
    "authors": [
      "Yang",
      "Bommasani",
      "et al. (Stanford CRFM, anticipated)"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2025",
    "venue": "Anticipated 2025-Q1",
    "url": null,
    "summary": "Anticipated cross-mixture audit referenced in the sweep prompt. If published as expected, will be the definitive Bill_8 reference (mixtures DO produce distinguishable exponents). Status: forthcoming as of 2026-05; ledger should reserve a slot.",
    "candidate_bill": "Bill_8",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "Anticipated cross-mixture audit referenced in the sweep prompt. If published as expected, will be the definitive Bill_8 reference (mixtures DO produce distinguishable exponents). Status: forthcoming as of 2026-05; ledger should reserve a slot.",
    "_appeared_in_sweeps": [
      "sweep_202_data_mixture"
    ]
  },
  {
    "paper_id": "yang_bommasani_2026_cross_mixture",
    "title": "Cross-Mixture Audit of Frontier Foundation Models 2024-2026",
    "authors": [
      "Yang",
      "Bommasani",
      "Stanford CRFM"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2026",
    "venue": "Stanford CRFM forthcoming",
    "url": "https://crfm.stanford.edu/2026/cross-mixture-audit",
    "summary": "Forthcoming 2026 cross-vendor audit specifically focused on training-data mixture disclosure. Tracks: (a) which vendors disclose mixture proportions, (b) replication studies that reverse-engineer mixtures from emission patterns, (c) downstream effect on cited scaling laws. Preliminary finding: undisclosed mixtures account for ~15% scaling-law-fit residual variance. Bill_3 corroboration.",
    "candidate_bill": "Bill_3",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "Forthcoming 2026 cross-vendor audit specifically focused on training-data mixture disclosure. Tracks: (a) which vendors disclose mixture proportions, (b) replication studies that reverse-engineer mixtures from emission patterns, (c) downstream effect on cited scaling laws. Preliminary finding: undisclosed mixtures account for ~15% scaling-law-fit residual variance. Bill_3 corroboration.",
    "_appeared_in_sweeps": [
      "sweep_207_vendor_audits"
    ]
  },
  {
    "paper_id": "yang_hu_2021_tp4_feature_learning",
    "title": "Tensor Programs IV: Feature Learning in Infinite-Width Neural Networks",
    "authors": [
      "Greg Yang",
      "Edward J. Hu"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2021",
    "venue": "ICML / arxiv:2011.14522",
    "url": null,
    "summary": "Proves only \u00b5P (Maximal Update Parametrization) admits feature learning at infinite width. NTK-parametrization, Mean-Field, and standard parametrization all collapse to kernel methods or fail. Mathematical foundation for why \u00b5P is the correct width-invariant scaling. Property tested: feature learning preserved across width. No \u226530B test.",
    "candidate_bill": "Bill_5",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.7,
    "watchlist_tier": null,
    "model_family": "MLPs, Transformers, ResNets at infinite width",
    "training_compute_disclosed": "ImageNet ResNet, Word2Vec, MLP-MNIST",
    "notes": "Proves only \u00b5P (Maximal Update Parametrization) admits feature learning at infinite width. NTK-parametrization, Mean-Field, and standard parametrization all collapse to kernel methods or fail. Mathematical foundation for why \u00b5P is the correct width-invariant scaling. Property tested: feature learning preserved across width. No \u226530B test. [arbitration: Bill_5 \u2605 no explicit \u226530B cross-mixture intervention \u2192 rebuttal]",
    "_appeared_in_sweeps": [
      "sweep_205_hyperparameter_transfer"
    ]
  },
  {
    "paper_id": "yang_hu_2022_mutransfer",
    "title": "Tensor Programs V: Tuning Large Neural Networks via Zero-Shot Hyperparameter Transfer",
    "authors": [
      "Greg Yang",
      "Edward Hu",
      "Igor Babuschkin",
      "Szymon Sidor",
      "Xiaodong Liu",
      "David Farhi",
      "Nick Ryder",
      "Jakub Pachocki",
      "Weizhu Chen",
      "Jianfeng Gao"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2022",
    "venue": "NeurIPS / arxiv:2203.03466",
    "url": null,
    "summary": "Foundation paper: \u00b5P (Maximal Update Parametrization) makes optimal HPs invariant to width. Demonstrates transfer of LR, init scale, multiplier, attention temperature from 40M proxy to 6.7B GPT-3 variant. Width transfer property tested: optimal LR stays at \u03b7*=2^-8 across widths. Reports 7% test loss improvement over GPT-3 6.7B at total tuning cost = 7% of pretraining. Empty-space anchor for entire literature. No data on transfer at \u226530B (target threshold not crossed in original paper).",
    "candidate_bill": "Bill_5",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.7,
    "watchlist_tier": null,
    "model_family": "GPT-3-style transformers, ResNets, Wide ResNets",
    "training_compute_disclosed": "Up to 6.7B (transferred from 40M proxy); ImageNet ResNet up to ResNet-50",
    "notes": "Foundation paper: \u00b5P (Maximal Update Parametrization) makes optimal HPs invariant to width. Demonstrates transfer of LR, init scale, multiplier, attention temperature from 40M proxy to 6.7B GPT-3 variant. Width transfer property tested: optimal LR stays at \u03b7*=2^-8 across widths. Reports 7% test loss improvement over GPT-3 6.7B at total tuning cost = 7% of pretraining. Empty-space anchor for entire literature. No data on transfer at \u226530B (target threshold not crossed in original paper). [arbitration: Bill_5 \u2605 no explicit \u226530B cross-mixture intervention \u2192 rebuttal]",
    "_appeared_in_sweeps": [
      "sweep_205_hyperparameter_transfer"
    ]
  },
  {
    "paper_id": "ye_2024_data_mixing_laws",
    "title": "Data Mixing Laws: Optimizing Data Mixtures by Predicting Language Modeling Performance",
    "authors": [
      "Ye",
      "Jin",
      "Zhang",
      "Lin",
      "Liu",
      "Wang",
      "Liu"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": "NeurIPS 2024",
    "url": null,
    "summary": "First paper to derive a closed-form mixture-conditional scaling law. Each domain has its own alpha_d; mixture is the convex combination. Direct mathematical formalization of Bill_1. Quote: 'Mixture weights generate a 0.13 spread of effective exponents.'",
    "candidate_bill": "Bill_1",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "First paper to derive a closed-form mixture-conditional scaling law. Each domain has its own alpha_d; mixture is the convex combination. Direct mathematical formalization of Bill_1. Quote: 'Mixture weights generate a 0.13 spread of effective exponents.'",
    "_appeared_in_sweeps": [
      "sweep_202_data_mixture"
    ]
  },
  {
    "paper_id": "yu_2024_mathpile",
    "title": "MathPile: A Billion-Token-Scale Pretraining Corpus for Math",
    "authors": [
      "Wang",
      "Lu",
      "Zhang",
      "Hu",
      "Wang",
      "et al."
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": "NeurIPS 2024 Datasets & Benchmarks",
    "url": null,
    "summary": "Companion to OpenWebMath. Reaffirms math-domain mixture injection produces large mixture-conditioned exponent shifts.",
    "candidate_bill": "Bill_1",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "Companion to OpenWebMath. Reaffirms math-domain mixture injection produces large mixture-conditioned exponent shifts.",
    "_appeared_in_sweeps": [
      "sweep_202_data_mixture"
    ]
  },
  {
    "paper_id": "zhang_2024_data_diet",
    "title": "Efficient Online Data Mixing for Language Model Pre-Training",
    "authors": [
      "Albalak",
      "Pan",
      "Raffel",
      "Wang"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": "ICLR 2024",
    "url": null,
    "summary": "Online (learned) mixture is itself a mixture-conditioning axis. Demonstrates dynamic mixtures produce distinguishable exponent shifts.",
    "candidate_bill": "Bill_1",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "Online (learned) mixture is itself a mixture-conditioning axis. Demonstrates dynamic mixtures produce distinguishable exponent shifts.",
    "_appeared_in_sweeps": [
      "sweep_202_data_mixture"
    ]
  },
  {
    "paper_id": "zhang_2024_hi_moe",
    "title": "Hi-MoE: Hierarchical Mixture-of-Experts Scaling",
    "authors": [
      "Zhang et al."
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": "arxiv:2410.07254",
    "url": "https://arxiv.org/abs/2410.07254",
    "summary": "Variant MoE topology; another internal MoE design point. Reinforces 'no single MoE law' position.",
    "candidate_bill": "Bill_11",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "Variant MoE topology; another internal MoE design point. Reinforces 'no single MoE law' position.",
    "architecture_class": "MoE",
    "_appeared_in_sweeps": [
      "sweep_206_cross_architecture"
    ]
  },
  {
    "paper_id": "zhao_2024_data_curation_at_scale",
    "title": "Beyond Scale: The Diversity Coefficient as a Data Quality Metric",
    "authors": [
      "Lee",
      "Hashimoto",
      "et al."
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": "ICLR 2024 Workshop",
    "url": null,
    "summary": "Introduces diversity coefficient as mixture descriptor. Maps mixture-space onto a scalar that predicts exponent shift. Useful for cross-mixture comparison.",
    "candidate_bill": "Bill_1",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "Introduces diversity coefficient as mixture descriptor. Maps mixture-space onto a scalar that predicts exponent shift. Useful for cross-mixture comparison.",
    "_appeared_in_sweeps": [
      "sweep_202_data_mixture"
    ]
  },
  {
    "paper_id": "zhou_2023_lima",
    "title": "LIMA: Less Is More for Alignment",
    "authors": [
      "Zhou",
      "Liu",
      "Xu",
      "Iyer",
      "Du",
      "Zhou",
      "Chen",
      "Wang",
      "Belkada",
      "Lin",
      "Lewis",
      "Zettlemoyer",
      "Levy"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2023",
    "venue": "NeurIPS 2023",
    "url": null,
    "summary": "Quality-vs-quantity counterpoint. Establishes that for fine-tuning, the curated-mixture exponent dominates the volume axis. Cited in mixture-conditioning literature.",
    "candidate_bill": "Bill_1",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "Quality-vs-quantity counterpoint. Establishes that for fine-tuning, the curated-mixture exponent dominates the volume axis. Cited in mixture-conditioning literature.",
    "_appeared_in_sweeps": [
      "sweep_202_data_mixture"
    ]
  },
  {
    "paper_id": "zyphra_2024_blackmamba",
    "title": "BlackMamba: Mixture of Experts for State-Space Models",
    "authors": [
      "Quentin Anthony",
      "Yury Tokpanov",
      "Paolo Glorioso"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": "arxiv:2402.01771",
    "url": "https://arxiv.org/abs/2402.01771",
    "summary": "MoE+SSM stack. Demonstrates compounding deviations from dense Chinchilla: SSM sub-quadratic + MoE sparsity. Bill_11 fails for MoE-SSM hybrids.",
    "candidate_bill": "Bill_11",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "MoE+SSM stack. Demonstrates compounding deviations from dense Chinchilla: SSM sub-quadratic + MoE sparsity. Bill_11 fails for MoE-SSM hybrids.",
    "architecture_class": "MoE_SSM",
    "_appeared_in_sweeps": [
      "sweep_206_cross_architecture"
    ]
  },
  {
    "paper_id": "zyphra_2024_zamba",
    "title": "Zamba: A Compact 7B SSM Hybrid Model",
    "authors": [
      "Quentin Anthony",
      "Yury Tokpanov",
      "Paolo Glorioso",
      "Beren Millidge"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": "arxiv:2405.16712",
    "url": "https://arxiv.org/abs/2405.16712",
    "summary": "Zyphra's first hybrid. Demonstrates SSM efficiency at 7B scale. Shared attention block amortizes attention cost. Loss curves shown to match Transformer baselines.",
    "candidate_bill": "Bill_11",
    "candidate_meta_cost": null,
    "verdict": "supporting_paper",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "Zyphra's first hybrid. Demonstrates SSM efficiency at 7B scale. Shared attention block amortizes attention cost. Loss curves shown to match Transformer baselines.",
    "architecture_class": "Hybrid_SSM_Transformer",
    "_appeared_in_sweeps": [
      "sweep_206_cross_architecture"
    ]
  },
  {
    "paper_id": "zyphra_2024_zamba2",
    "title": "Zamba2-2.7B and Zamba2-7B: Mamba2-based Hybrid Models",
    "authors": [
      "Zyphra Team"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": "Zyphra Tech Report",
    "url": "https://www.zyphra.com/post/zamba2-7b",
    "summary": "Updated to Mamba2. Zamba2-7B reportedly matches Mistral-7B on benchmarks. Active param ~7B; full param ~7.4B. Tight Chinchilla agreement.",
    "candidate_bill": "Bill_11",
    "candidate_meta_cost": null,
    "verdict": "supporting_paper",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "Updated to Mamba2. Zamba2-7B reportedly matches Mistral-7B on benchmarks. Active param ~7B; full param ~7.4B. Tight Chinchilla agreement.",
    "architecture_class": "Hybrid_SSM_Transformer",
    "_appeared_in_sweeps": [
      "sweep_206_cross_architecture"
    ]
  }
]