{
  "slug": "capability_benchmarks",
  "generated": "2026-05-08",
  "status": "public-draft-source-linted",
  "verified_public": [
    {
      "id": "arxiv:2403.06634",
      "title": "Stealing Part of a Production Language Model",
      "role": "Bill 1 contamination/model-extraction lineage"
    },
    {
      "id": "arxiv:2403.07974",
      "title": "LiveCodeBench",
      "role": "Bill 18 anti-saturation benchmark construction"
    },
    {
      "id": "arxiv:2410.05229",
      "title": "GSM-Symbolic",
      "role": "Bill 4 problem-format brittleness anchor"
    },
    {
      "id": "arxiv:2411.04872",
      "title": "FrontierMath",
      "role": "Bill 17 held-out frontier benchmark anchor"
    },
    {
      "id": "arxiv:2412.04604",
      "title": "ARC-AGI / ARC Prize public anchor",
      "role": "Bill 17 held-out/iterative reframing anchor"
    },
    {
      "id": "arxiv:2503.14499",
      "title": "Measuring AI Ability to Complete Long Tasks (METR)",
      "role": "Bill 19 temporal trajectory and independent-auditor anchor"
    }
  ],
  "quarantined": [
    {
      "id": "source_lint_quarantine:2509.13301",
      "reason": "Prior Gerstgrasser-Bommasani six-audit framework handle failed lint.",
      "replacement_status": "Public claim narrowed; not used as evidence."
    },
    {
      "id": "source_lint_quarantine:2502.07770",
      "reason": "Prior Anand-Tirumala vendor-claim half-life handle failed lint.",
      "replacement_status": "Temporal-trajectory claim retained only through verified public anchors."
    },
    {
      "id": "source_lint_quarantine:2511.04832",
      "reason": "FrontierMath tool-exfiltration row quarantined in corpus_batch_1.json.",
      "replacement_status": "Not used as public evidence."
    }
  ],
  "internal_or_synthesis": [
    {
      "id": "bills_draft.md",
      "role": "Bill definitions, meta-costs, escape gates, and public-summary counts."
    },
    {
      "id": "corpus_batch_1.json",
      "role": "Batch 1 corpus export."
    },
    {
      "id": "bill_classifier_benchmark.json",
      "role": "Classifier benchmark fixtures."
    }
  ],
  "notes": [
    "Quarantined handles remain in the manifest to make the prior source-link failure explicit."
  ]
}
