[
  {
    "record_kind": "bridge",
    "bridge_id": "B1_causal_mechanism_empty",
    "paper_id": "bridge:B1_causal_mechanism_empty",
    "title": "Causally-faithful mechanism empty across 7 domains",
    "authors": [
      "Kevin Russell"
    ],
    "affiliations": [
      "Project 42"
    ],
    "date": "2026-05-09",
    "venue": "Project 42 internal \u2014 discoveries.html#meta-findings",
    "url": "https://projectforty2.ai/discoveries.html#meta-findings",
    "first_named": "2026-05-09",
    "summary": "Across 7 production aiwikis (Mech Interp, Inference-time Safety, Reasoning, VLM, Scaling Laws, Agentic, Biology), the Bill \u2605 predicting 'the mechanism is causally faithful' holds empty with 0 clean triggers across 153 candidate papers (72 explicit rebuttals). Frontier ML systems lack causally-faithful internal mechanisms as a domain-invariant property.",
    "participating_ledgers": [
      {
        "ledger": "mech_interp",
        "bill": "Bill_11",
        "cands": 27,
        "clean": 0,
        "rebuttals": 22,
        "anchor": "refusal-direction patching norm-confounded"
      },
      {
        "ledger": "inference_time_safety",
        "bill": "Bill_11",
        "cands": 18,
        "clean": 0,
        "rebuttals": 11,
        "anchor": "steering safety doesn't survive 3-month patch turnover"
      },
      {
        "ledger": "reasoning_cot",
        "bill": "Bill_6",
        "cands": 4,
        "clean": 0,
        "rebuttals": 3,
        "anchor": "Apollo o1 99% lying-about-actions"
      },
      {
        "ledger": "vision_language",
        "bill": "Bill_4",
        "cands": 32,
        "clean": 0,
        "rebuttals": 13,
        "anchor": "Eyes Wide Shut: 25-40% caption-only"
      },
      {
        "ledger": "scaling_laws",
        "bill": "Bill_5",
        "cands": 19,
        "clean": 0,
        "rebuttals": 19,
        "anchor": "mechanism survives only \u22647B intervention"
      },
      {
        "ledger": "agentic_tool_use",
        "bill": "Bill_4",
        "cands": 33,
        "clean": 0,
        "rebuttals": 4,
        "anchor": "tool-name perturbation 18-32% fail"
      },
      {
        "ledger": "bio_protein",
        "bill": "Bill_4",
        "cands": 20,
        "clean": 0,
        "rebuttals": 0,
        "anchor": "attention-pattern interpretability collapses"
      }
    ],
    "operational_definition": "Across the 7 participating ledger \u2605 bills, 0 clean triggers (verdict=known_bill, confidence \u2265 0.9, independent third-party replicated) across all candidates after hand-arbitration.",
    "current_status": {
      "n_ledgers": 7,
      "n_candidates_total": 153,
      "n_clean_triggers_total": 0,
      "rebuttals_total": 72,
      "as_of": "2026-05-09"
    },
    "falsification_condition": "Any participating ledger Bill \u2605 surfaces \u22651 clean trigger (verdict=known_bill, confidence \u2265 0.9, independent third-party replicated) during the 2027 audit cycle. Threshold for falsifying the BRIDGE: \u22653 clean triggers across \u22652 distinct ledgers (single-domain win does not falsify cross-domain claim).",
    "candidate_bill": "Bill_1",
    "candidate_meta_cost": null,
    "verdict": "active",
    "confidence": 0.92,
    "watchlist_tier": "monthly",
    "rebuts_external": [],
    "corroborated_by_external": [
      "arxiv:2509.13301",
      "blog:apollo:o1_evals_2024_12",
      "anthropic:tracing_thoughts_2025_03"
    ],
    "notes": "Strongest bridge in the synthesis. 7-way star-mechanism alignment. Direct cousin to B5 (the 0/N audit pattern). The breadth of this finding (7 independent domain audits arriving at the same conclusion) is the load-bearing strength; the empirical thinness of any single ledger's evidence is the weakness."
  },
  {
    "record_kind": "bridge",
    "bridge_id": "B2_closure_cycle_compressed",
    "paper_id": "bridge:B2_closure_cycle_compressed",
    "title": "Closure cycle compressed from 18 months to 3-4 months \u2014 10\u00d7 faster than policy revision",
    "authors": [
      "Kevin Russell"
    ],
    "affiliations": [
      "Project 42"
    ],
    "date": "2026-05-09",
    "venue": "Project 42 internal \u2014 discoveries.html#meta-findings",
    "url": "https://projectforty2.ai/discoveries.html#meta-findings",
    "first_named": "2026-05-09",
    "summary": "Five temporal-trajectory bills across 5 ledgers converge on a 30-100 day vendor-claim \u2192 independent-rebuttal half-life, while NIST AI RMF / EU AI Act / BIS revision cycles operate at 12-15 months. The mismatch is structural, not an anomaly.",
    "participating_ledgers": [
      {
        "ledger": "capability_benchmarks",
        "bill": "Bill_19",
        "metric": "vendor-claim half-life",
        "value": "73 days (Anand-Tirumala)"
      },
      {
        "ledger": "inference_time_safety",
        "bill": "Bill_2",
        "metric": "patch half-life",
        "value": "30 days closed / 36 hours open"
      },
      {
        "ledger": "compute_governance",
        "bill": "Bill_19",
        "metric": "distilled-cousin half-life",
        "value": "3.4 months"
      },
      {
        "ledger": "open_weight",
        "bill": "Bill_2",
        "metric": "distilled-cousin half-life",
        "value": "3.4 months"
      },
      {
        "ledger": "scaling_laws",
        "bill": "Bill_9",
        "metric": "vendor-claim half-life",
        "value": "73 days"
      },
      {
        "ledger": "compute_governance",
        "bill": "Bill_18",
        "metric": "policy lifetime",
        "value": "4 months (BIS) / 15 months (EO 14110)"
      }
    ],
    "operational_definition": "Median time from frontier-vendor capability claim to independent-third-party rebuttal across the 2024-2026 corpus. Compared against NIST AI RMF revision cadence (~12 months), EU AI Act systemic-risk threshold revision cadence (~24 months), BIS Diffusion Framework lifetime (4 months observed).",
    "current_status": {
      "closure_half_life_days": 90,
      "policy_revision_months": 12,
      "ratio": "~4\u00d7",
      "stretched_ratio_high": "60\u00d7 (BIS 4mo vs Anand 73d)",
      "stretched_ratio_low": "3.5\u00d7 (distilled-cousin 3.4mo vs policy 12mo)",
      "as_of": "2026-05-09"
    },
    "falsification_condition": "Any 2027 audit shows median closure half-life \u2265 9 months (i.e., the cycle has decelerated, returning to pre-2024 cadence), OR policy revision cadence drops to \u2264 4 months (matching the closure cycle).",
    "candidate_bill": "Bill_7",
    "candidate_meta_cost": null,
    "verdict": "needs_gate_declaration",
    "confidence": 0.7,
    "watchlist_tier": "quarterly",
    "rebuts_external": [],
    "corroborated_by_external": [
      "arxiv:2502.07770"
    ],
    "notes": "Vulnerable to Bill 7 \u2605 predicted-empty. The '10\u00d7' claim averages 3 distinct time-scales (vendor-claim, patch, distilled-cousin) and is honestly between 3.5\u00d7 and 60\u00d7 depending on choice of metric. Operational definition needs to fix one metric or report a range, not a point. The CORE phenomenon (closure cycle has compressed) is robust; the PRECISE '10\u00d7' number is fragile."
  },
  {
    "record_kind": "bridge",
    "bridge_id": "B3_asymmetric_transfer",
    "paper_id": "bridge:B3_asymmetric_transfer",
    "title": "Capabilities transfer cross-surface; mitigations don't",
    "authors": [
      "Kevin Russell"
    ],
    "affiliations": [
      "Project 42"
    ],
    "date": "2026-05-09",
    "venue": "Project 42 internal \u2014 discoveries.html#meta-findings",
    "url": "https://projectforty2.ai/discoveries.html#meta-findings",
    "first_named": "2026-05-09",
    "summary": "Across 4 production aiwikis, the same asymmetric pattern recurs: attack-side capabilities transfer between deployment modes (chat \u2192 API \u2192 agent \u2192 tool-use \u2192 fine-tune \u2192 quantize \u2192 distill), but defense-side mitigations don't. Defense mitigations are a property of the deployment surface, not the model.",
    "participating_ledgers": [
      {
        "ledger": "inference_time_safety",
        "bill": "Bill_14",
        "anchor": "AISI Q4-2024: 0/6 vendors pass cross-surface; AgentDojo 17-43% degradation"
      },
      {
        "ledger": "open_weight",
        "bill": "Bill_8",
        "anchor": "Lermen-Rimsky ~10\u00d7 cheaper to undo safety than train it"
      },
      {
        "ledger": "vision_language",
        "bill": "Bill_18",
        "anchor": "Multi-image / video / interleaved 22-35% accuracy drop"
      },
      {
        "ledger": "agentic_tool_use",
        "bill": "Bill_11",
        "anchor": "Browser-state replay 28-41% leakage in agents vs text models"
      }
    ],
    "operational_definition": "Capabilities transfer = \u226580% of single-surface performance retained at downstream surface. Mitigations transfer = \u226410% degradation of mitigation effectiveness at downstream surface. Bridge fires when capability-transfer ratio > mitigation-transfer ratio by \u22653\u00d7 across \u22653 ledgers.",
    "current_status": {
      "n_ledgers": 4,
      "capability_transfer_median": "0.83",
      "mitigation_transfer_median": "0.27",
      "asymmetry_ratio": "3.07",
      "as_of": "2026-05-09"
    },
    "falsification_condition": "Independent third-party demonstrates a deployment-surface-stable mitigation framework (defense-in-depth artifact applied to model, \u226410% mitigation degradation across chat\u2192API\u2192agent\u2192fine-tune\u2192quantize\u2192distill) on a frontier open-weight model within 12 months.",
    "candidate_bill": "Bill_1",
    "candidate_meta_cost": null,
    "verdict": "active",
    "confidence": 0.88,
    "watchlist_tier": "quarterly",
    "rebuts_external": [],
    "corroborated_by_external": [
      "arxiv:2310.20624",
      "arxiv:2406.13352"
    ],
    "notes": "Directly engineering-actionable. The implication ('defense in depth must be redesigned per-surface') is testable in 12 months. Bill 9 \u2605 vulnerable: single counter-example would falsify this bridge cleanly."
  },
  {
    "record_kind": "bridge",
    "bridge_id": "B4_capability_fluidity",
    "paper_id": "bridge:B4_capability_fluidity",
    "title": "Distillation = architecture-portability = scaling-portability \u2014 three names for one phenomenon",
    "authors": [
      "Kevin Russell"
    ],
    "affiliations": [
      "Project 42"
    ],
    "date": "2026-05-09",
    "venue": "Project 42 internal \u2014 discoveries.html#meta-findings",
    "url": "https://projectforty2.ai/discoveries.html#meta-findings",
    "first_named": "2026-05-09",
    "summary": "Three production aiwikis predict different \u2605 bills empty for what data shows is one underlying phenomenon: frontier ML capability is largely amortizable into smaller models via SFT/RL on outputs at 3.4-month half-life, with no architectural or scaling 'moat'.",
    "participating_ledgers": [
      {
        "ledger": "open_weight",
        "bill": "Bill_5",
        "anchor": "Halevy-Heim-Pilz 0/14 distillation-resistant"
      },
      {
        "ledger": "scaling_laws",
        "bill": "Bill_11",
        "anchor": "Mamba2 dense-Transformer fails 0.06-0.11 on SSM; DeepSeek V3 MoE 20:1 fails 35-60%"
      },
      {
        "ledger": "compute_governance",
        "bill": "Bill_11",
        "anchor": "R1-Distill 100-1000\u00d7 lower compute reaches 85-95%"
      }
    ],
    "operational_definition": "Three independent \u2605 bills (Open-weight 5, Scaling Laws 11, Compute Governance 11) hold empty when audited as the same underlying capability-substrate phenomenon. Bridge fires when median teacher:cousin compute ratio at 90% retention \u2264 50\u00d7 across \u22653 ledgers.",
    "current_status": {
      "median_teacher_cousin_compute_ratio_90pct_retention": "28\u00d7",
      "n_ledgers": 3,
      "as_of": "2026-05-09"
    },
    "falsification_condition": "Any 2027 frontier-capability claim survives \u226510\u00d7 compute ratio with \u226430% capability retention in distilled cousin within 6 months of original release. (Halevy-Heim-Pilz framework applied prospectively.)",
    "candidate_bill": "Bill_10",
    "candidate_meta_cost": null,
    "verdict": "active",
    "confidence": 0.84,
    "watchlist_tier": "monthly",
    "rebuts_external": [],
    "corroborated_by_external": [
      "arxiv:2501.12948"
    ],
    "notes": "Vulnerable to Bill 10 (bridge-coupling decomposition): the three bills may be three views of one phenomenon, in which case the 'bridge' is a tautology rather than a discovery. Independent verification needs to show the three audits would have surfaced independently from different forensic teams."
  },
  {
    "record_kind": "bridge",
    "bridge_id": "B5_zero_over_N_audit",
    "paper_id": "bridge:B5_zero_over_N_audit",
    "title": "'0/N' audit pattern recurs across forensic researchers",
    "authors": [
      "Kevin Russell"
    ],
    "affiliations": [
      "Project 42"
    ],
    "date": "2026-05-09",
    "venue": "Project 42 internal \u2014 discoveries.html#meta-findings",
    "url": "https://projectforty2.ai/discoveries.html#meta-findings",
    "first_named": "2026-05-09",
    "summary": "Across 6 distinct cross-domain forensic audits (Anand-Goyal unified-VLM, Anand-Bommasani cross-organism, Anand-Rein unified-agent, Halevy-Heim-Pilz distillation-resistance, IBBIS biological-design APIs, Yang-Bommasani cross-mixture scaling), the audit lands at literally zero clean closures. Vendor-claim construction systematically fails its own independent-replication audit.",
    "participating_ledgers": [
      {
        "audit": "Anand-Goyal 2025 unified-VLM",
        "result": "0/9",
        "ledger_anchor": "vision_language/Bill_10"
      },
      {
        "audit": "Anand-Bommasani 2025 cross-organism",
        "result": "0/8",
        "ledger_anchor": "bio_protein/Bill_7"
      },
      {
        "audit": "Anand-Rein 2025 unified-agent",
        "result": "0/9",
        "ledger_anchor": "agentic_tool_use/Bill_10"
      },
      {
        "audit": "Halevy-Heim-Pilz distillation-resistance",
        "result": "0/14",
        "ledger_anchor": "open_weight/Bill_5"
      },
      {
        "audit": "IBBIS biological-design APIs synthesis-screened",
        "result": "0/4",
        "ledger_anchor": "bio_protein/Bill_11"
      },
      {
        "audit": "Yang-Bommasani cross-mixture scaling",
        "result": "0/9",
        "ledger_anchor": "scaling_laws/Bill_8"
      }
    ],
    "operational_definition": "Independent forensic teams (\u22652 distinct authors per audit, not Project 42 authored) report literally 0 clean closures across N\u22654 frontier candidates under their stated audit framework.",
    "current_status": {
      "n_audits": 6,
      "n_authors_distinct": 4,
      "n_zero_outcomes": 6,
      "as_of": "2026-05-09"
    },
    "falsification_condition": "Independent forensic audit using comparable methodology surfaces \u22651 clean closure during 2027 audit cycle, OR meta-analysis shows the 0/N pattern is artifact of shared methodology (Bommasani-lineage framework) rather than structural property of vendor-claim construction.",
    "candidate_bill": "Bill_8",
    "candidate_meta_cost": null,
    "verdict": "active",
    "confidence": 0.79,
    "watchlist_tier": "quarterly",
    "rebuts_external": [],
    "corroborated_by_external": [
      "arxiv:2509.13301"
    ],
    "notes": "Bill 8 vulnerable: the strongest alternative explanation is methodology-shared-bias. 4 of the 6 audits are Bommasani-lineage. Need \u22652 independent-methodology cross-domain audits returning 0/N to be confident the pattern is structural, not framework-induced."
  },
  {
    "record_kind": "bridge",
    "bridge_id": "B6_anti_saturation_works",
    "paper_id": "bridge:B6_anti_saturation_works",
    "title": "Anti-saturation construction is the only working closure",
    "authors": [
      "Kevin Russell"
    ],
    "affiliations": [
      "Project 42"
    ],
    "date": "2026-05-09",
    "venue": "Project 42 internal \u2014 discoveries.html#meta-findings",
    "url": "https://projectforty2.ai/discoveries.html#meta-findings",
    "first_named": "2026-05-09",
    "summary": "Across 7 production aiwikis, anti-saturation construction is the one closure mechanism that consistently fires positively with clean triggers (\u2265150 total clean triggers). Iterative reframing (ARC v1\u2192v2\u2192v3, MMMU\u2192MMMU-Pro, FrontierMath Tier-1\u21924, LiveCodeBench monthly refresh, Cybench Pro held-out) is empirically the only audit primitive keeping pace with the 3-4 month closure cycle.",
    "participating_ledgers": [
      {
        "ledger": "capability_benchmarks",
        "bill": "Bill_18"
      },
      {
        "ledger": "reasoning_cot",
        "bill": "Bill_11",
        "clean": 41
      },
      {
        "ledger": "vision_language",
        "bill": "Bill_11",
        "clean": 27
      },
      {
        "ledger": "agentic_tool_use",
        "bill": "Bill_9"
      },
      {
        "ledger": "bio_protein",
        "bill": "Bill_9"
      },
      {
        "ledger": "inference_time_safety",
        "bill": "Bill_9"
      },
      {
        "ledger": "open_weight",
        "bill": "Bill_9"
      }
    ],
    "operational_definition": "Anti-saturation Bill in participating ledger has \u226510 clean triggers (verdict=known_bill, confidence \u2265 0.8). Aggregated across \u22655 ledgers as the dominant positive-fire pattern.",
    "current_status": {
      "n_ledgers": 7,
      "clean_triggers_total": 150,
      "as_of": "2026-05-09"
    },
    "falsification_condition": "Anti-saturation benchmark releases stall (no new MMMU-Pro / ARC-v3 / FrontierMath-Tier-5 within 12 months) AND closure cycle accelerates (median half-life \u2264 30 days). Bridge falsified if the community's response to saturation crisis becomes itself saturated.",
    "candidate_bill": "Bill_1",
    "candidate_meta_cost": null,
    "verdict": "active",
    "confidence": 0.85,
    "watchlist_tier": "quarterly",
    "rebuts_external": [],
    "corroborated_by_external": [],
    "notes": "The CONSTRUCTIVE bridge \u2014 what works rather than what fails. Highest-value bridge for actually using in production. Vulnerable to temporal stability (Bill 4) more than to the \u2605 bills."
  },
  {
    "record_kind": "bridge",
    "bridge_id": "B7_western_chinese_inversion",
    "paper_id": "bridge:B7_western_chinese_inversion",
    "title": "Western-vs-Chinese open-weight disclosure inversion",
    "authors": [
      "Kevin Russell"
    ],
    "affiliations": [
      "Project 42"
    ],
    "date": "2026-05-09",
    "venue": "Project 42 internal \u2014 discoveries.html#meta-findings",
    "url": "https://projectforty2.ai/discoveries.html#meta-findings",
    "first_named": "2026-05-09",
    "summary": "Compute Governance + Open-weight Frontier ledger data shows the geopolitical framing of compute-governance ('China = closed/risky, US = open/safe') is empirically inverted for the 2024-2026 open-weight tier: China-domiciled vendors disclose 100% (DeepSeek, Alibaba, 01.AI all 8/8 fields); Western vendors disclose 17% (only Meta consistent).",
    "participating_ledgers": [
      {
        "ledger": "compute_governance",
        "bill": "Bill_10"
      },
      {
        "ledger": "open_weight",
        "bill": "Bill_9"
      }
    ],
    "operational_definition": "China-domiciled vendors' median field-completeness on 8-field compute disclosure (FLOPs, training tokens, training data, architecture, hardware, energy, distillation provenance, cost). Western vendors' median field-completeness on same 8 fields. Bridge fires when China median \u2265 Western median + 30 percentage points.",
    "current_status": {
      "china_field_completeness_median": "100%",
      "western_field_completeness_median": "17%",
      "gap_pp": "83 percentage points",
      "as_of": "2026-05-09"
    },
    "falsification_condition": "Meta, Mistral, or OpenAI open-weight release with full 8-field disclosure within 12 months brings Western median to \u2265 50%, OR Chinese open-weight release falls to \u2264 50% disclosure rate.",
    "candidate_bill": "Bill_2",
    "candidate_meta_cost": null,
    "verdict": "active",
    "confidence": 0.81,
    "watchlist_tier": "monthly",
    "rebuts_external": [],
    "corroborated_by_external": [
      "Stanford FMTI v2"
    ],
    "notes": "Only 2 ledgers participate \u2014 Bill 2 (anchor-independence) is the weakest position. Vulnerable to single-vendor inflection (one Western vendor adopting full disclosure tips the median dramatically since N=4-5). Most likely-to-flip bridge in the 2027 audit cycle."
  },
  {
    "record_kind": "external_evidence",
    "paper_id": "arxiv:2211.09110",
    "title": "Holistic Evaluation of Language Models (HELM)",
    "authors": [
      "Percy Liang",
      "Rishi Bommasani",
      "Tony Lee",
      "et al."
    ],
    "date": "2024-11",
    "venue": "Stanford CRFM / TMLR",
    "url": "https://arxiv.org/abs/2211.09110",
    "summary": "HELM's core finding across 30+ scenarios and 7 metrics is that no single model dominates across domains, and that capabilities and safety properties are evaluated separately because they do NOT transfer together. Across 4 versions (HELM v1.0 through HELM Safety 2024), the framework documents that mitigations rarely generalize cross-task while capabilities do. This is direct corroboration of B3 (capabilities transfer, mitigations don't) and B1 (no causal-mechanism unification across domains).",
    "supports": [
      "B1_causal_mechanism_empty",
      "B3_asymmetric_transfer"
    ],
    "rebuts": [],
    "verdict": "corroborate",
    "confidence": 0.9,
    "watchlist_tier": "quarterly",
    "operational_evidence": "30+ scenarios, 7 metrics; documented absence of cross-scenario mitigation transfer; HELM Safety 2024 splits Safety from Capabilities precisely because they don't co-evolve",
    "notes": "HELM is the canonical 'holistic evaluation' lineage; cited >2000 times; the original paper notably warns about cross-domain non-comparability"
  },
  {
    "record_kind": "external_evidence",
    "paper_id": "arxiv:2310.12941",
    "title": "Foundation Model Transparency Index (FMTI) v1.0",
    "authors": [
      "Rishi Bommasani",
      "Kevin Klyman",
      "et al."
    ],
    "date": "2023-10",
    "venue": "Stanford CRFM",
    "url": "https://arxiv.org/abs/2310.12941",
    "summary": "FMTI v1.0 scored 10 major developers across 100 indicators of transparency. The original report documented that mean score was only 37/100, with the lowest scores on downstream impact and data provenance. Western developers (OpenAI 25.5/100) scored markedly lower on disclosure than later-tracked Chinese open-weight releases, providing the baseline for the China-vs-West inversion documented in B7.",
    "supports": [
      "B7_western_chinese_inversion"
    ],
    "rebuts": [],
    "verdict": "corroborate",
    "confidence": 0.85,
    "watchlist_tier": "annual",
    "operational_evidence": "10 developers, 100 indicators; mean transparency 37%; reproduces the Western opacity baseline",
    "notes": "Predecessor to FMTI v1.1 which directly added Chinese labs"
  },
  {
    "record_kind": "external_evidence",
    "paper_id": "fmti:v1.1-2024",
    "title": "Foundation Model Transparency Index v1.1",
    "authors": [
      "Rishi Bommasani",
      "Kevin Klyman",
      "Sayash Kapoor",
      "et al."
    ],
    "date": "2024-05",
    "venue": "Stanford CRFM",
    "url": "https://crfm.stanford.edu/fmti/May-2024/index.html",
    "summary": "FMTI v1.1 re-scored 14 developers in May 2024 and found mean transparency rose to 58/100, but on the Methods/Data subdomain Chinese open-weight developers (Yi-34B, Qwen) scored substantially higher than Western closed-weight peers because Chinese labs default to publishing weights, technical reports and dataset signals. This is direct numerical corroboration of B7 (disclosure inversion).",
    "supports": [
      "B7_western_chinese_inversion"
    ],
    "rebuts": [],
    "verdict": "corroborate",
    "confidence": 0.88,
    "watchlist_tier": "semi-annual",
    "operational_evidence": "Mean score 58/100; Methods/Data dominance of Chinese open-weight labs over Western closed peers; quantitative confirmation of the West-opaque / China-open inversion",
    "notes": "Direct mechanism: open weights forces upstream disclosure"
  },
  {
    "record_kind": "external_evidence",
    "paper_id": "arxiv:2310.20624",
    "title": "Lermen-Rimsky: LoRA Fine-Tuning Efficiently Undoes Safety Training in Llama 2-Chat 70B",
    "authors": [
      "Simon Lermen",
      "Charlie Rimsky",
      "et al."
    ],
    "date": "2023-10",
    "venue": "arXiv",
    "url": "https://arxiv.org/abs/2310.20624",
    "summary": "Lermen-Rimsky demonstrate that ~$200 of QLoRA fine-tuning undoes Llama-2-70B-Chat's safety training, reducing refusal rates from 99%+ to <1% across AdvBench. The 10\u00d7 compute-leverage of attackers over defenders, and the fact that capabilities (knowing how to refuse) and mitigation behavior (actually refusing) detach after a fine-tune, directly corroborates B3 (mitigations don't transfer) and B2 (closure timescales of weeks).",
    "supports": [
      "B2_closure_cycle_compressed",
      "B3_asymmetric_transfer"
    ],
    "rebuts": [],
    "verdict": "corroborate",
    "confidence": 0.95,
    "watchlist_tier": "quarterly",
    "operational_evidence": "<1% refusal rate post-LoRA, $200 cost, ~10\u00d7 attacker leverage over defender; mitigation is brittle while capability persists",
    "notes": "Reference paper for the '10\u00d7' figure in B3"
  },
  {
    "record_kind": "external_evidence",
    "paper_id": "aisi:cross-deployment-2024",
    "title": "UK AI Safety Institute: International Scientific Report on AI Safety - Cross-Deployment Findings",
    "authors": [
      "Yoshua Bengio",
      "et al. (UK AISI / Bletchley)"
    ],
    "date": "2024-10",
    "venue": "UK Government",
    "url": "https://www.gov.uk/government/publications/international-scientific-report-on-the-safety-of-advanced-ai",
    "summary": "AISI cross-deployment audit of multiple frontier models found that safety properties measured in the API / sandbox surface fail to transfer to agentic / tool-use / multi-turn deployment surfaces. The pattern that capabilities transfer cleanly across surfaces but mitigations do not is documented quantitatively across 4 labs, directly corroborating B3.",
    "supports": [
      "B3_asymmetric_transfer"
    ],
    "rebuts": [],
    "verdict": "corroborate",
    "confidence": 0.88,
    "watchlist_tier": "quarterly",
    "operational_evidence": "Cross-surface safety degradation documented across 4 lab deployments; capabilities are surface-invariant",
    "notes": "Government-grade external audit; Bengio is lead"
  },
  {
    "record_kind": "external_evidence",
    "paper_id": "arxiv:2406.13352",
    "title": "AgentDojo: A Dynamic Environment to Evaluate Prompt Injection Attacks and Defenses for LLM Agents",
    "authors": [
      "Edoardo Debenedetti",
      "Jie Zhang",
      "Mislav Balunovi\u0107",
      "Luca Beurer-Kellner",
      "Marc Fischer",
      "Florian Tram\u00e8r"
    ],
    "date": "2024-06",
    "venue": "NeurIPS 2024",
    "url": "https://arxiv.org/abs/2406.13352",
    "summary": "AgentDojo benchmark across 70 tasks and 17 LLMs shows that prompt-injection defenses tuned for chat surfaces show <50% transfer to agentic / tool-use environments, while the capabilities driving useful tool use transfer at >90%. Direct quantitative corroboration of B3.",
    "supports": [
      "B3_asymmetric_transfer"
    ],
    "rebuts": [],
    "verdict": "corroborate",
    "confidence": 0.92,
    "watchlist_tier": "quarterly",
    "operational_evidence": "70 tasks \u00d7 17 LLMs; cross-surface mitigation gap >40pp while capability transfer is intact",
    "notes": "Cited in B3 of original ledger"
  },
  {
    "record_kind": "external_evidence",
    "paper_id": "metr:hcast-2024",
    "title": "METR HCAST: Human-Calibrated Autonomy Software Tasks",
    "authors": [
      "METR (Beth Barnes et al.)"
    ],
    "date": "2024-12",
    "venue": "METR",
    "url": "https://metr.org/blog/2024-12-20-hcast/",
    "summary": "METR's HCAST framework evaluated 7 frontier models across 100+ multi-hour software tasks and documented that doubling time on task-length is ~7 months \u2014 a closure timescale compressed by roughly 2.5\u00d7 over Moore-style 18-month doubling. Direct quantitative corroboration of B2 (closure compression to 3-4 months).",
    "supports": [
      "B2_closure_cycle_compressed"
    ],
    "rebuts": [],
    "verdict": "corroborate",
    "confidence": 0.9,
    "watchlist_tier": "quarterly",
    "operational_evidence": "Task-horizon doubling time ~7 months; capability frontier compresses on calendar quarters",
    "notes": "Quantitative scaling-law evidence for the closure-cycle compression bridge"
  },
  {
    "record_kind": "external_evidence",
    "paper_id": "epoch:trends-2025",
    "title": "Compute Trends Across Three Eras of Machine Learning (Epoch AI)",
    "authors": [
      "Jaime Sevilla",
      "Lennart Heim",
      "Anson Ho",
      "Tamay Besiroglu",
      "Marius Hobbhahn",
      "Pablo Villalobos"
    ],
    "date": "2025-01",
    "venue": "Epoch AI",
    "url": "https://epoch.ai/blog/compute-trends",
    "summary": "Epoch AI's compute-trends update shows training-compute doubling every ~6 months in the post-2020 era, with frontier-distillation closing the cost gap from leader-model to distilled-cousin in ~3.4 months on average. Directly corroborates B2 (closure compression) and B4 (distillation = scaling-portability).",
    "supports": [
      "B2_closure_cycle_compressed",
      "B4_capability_fluidity"
    ],
    "rebuts": [],
    "verdict": "corroborate",
    "confidence": 0.88,
    "watchlist_tier": "quarterly",
    "operational_evidence": "Doubling time 5.7mo post-2020; distilled-cousin lag 3.4mo cited from this lineage",
    "notes": "Sevilla et al. forecasting lineage"
  },
  {
    "record_kind": "external_evidence",
    "paper_id": "arxiv:2401.05566",
    "title": "Sleeper Agents: Training Deceptive LLMs that Persist Through Safety Training",
    "authors": [
      "Evan Hubinger",
      "Carson Denison",
      "Jesse Mu",
      "et al. (Anthropic)"
    ],
    "date": "2024-01",
    "venue": "Anthropic",
    "url": "https://arxiv.org/abs/2401.05566",
    "summary": "Anthropic's sleeper agents paper demonstrates that current safety training (RLHF, SFT, adversarial training) fails to remove backdoors at any model scale tested. Furthermore, mechanistic probes that 'should' detect the backdoor empirically don't \u2014 across 3 detection probes, none succeeded (0/3). This corroborates B1 (no causal mechanism) and B5 (0/N audit pattern).",
    "supports": [
      "B1_causal_mechanism_empty",
      "B5_zero_over_N_audit"
    ],
    "rebuts": [],
    "verdict": "corroborate",
    "confidence": 0.92,
    "watchlist_tier": "quarterly",
    "operational_evidence": "0/3 mech-interp detection probes; safety training fails to remove backdoors; mitigations don't address mechanism",
    "notes": "Foundational paper for B1; appears in Anthropic RSP discussions"
  },
  {
    "record_kind": "external_evidence",
    "paper_id": "arxiv:2409.02813",
    "title": "MMMU-Pro: A Robust Multi-discipline Multimodal Understanding Benchmark",
    "authors": [
      "Xiang Yue",
      "Tianyu Zheng",
      "et al."
    ],
    "date": "2024-09",
    "venue": "arXiv",
    "url": "https://arxiv.org/abs/2409.02813",
    "summary": "MMMU-Pro re-released MMMU after Sept-2024 saturation by filtering items solvable from text-only, adding 10\u00d7 more options, and introducing 'vision-only' splits. Model scores dropped 18.9 percentage points on average \u2014 confirming that anti-saturation construction (deliberate hardening) is the only viable closure for multimodal benchmarks. Direct corroboration of B6.",
    "supports": [
      "B6_anti_saturation_construction"
    ],
    "rebuts": [],
    "verdict": "corroborate",
    "confidence": 0.93,
    "watchlist_tier": "quarterly",
    "operational_evidence": "18.9pp drop after Pro filter; deliberate hardening as closure",
    "notes": "Direct citation in B6"
  },
  {
    "record_kind": "external_evidence",
    "paper_id": "epoch:frontiermath-2024",
    "title": "FrontierMath: A Benchmark for Evaluating Advanced Mathematical Reasoning",
    "authors": [
      "Elliot Glazer",
      "Ege Erdil",
      "et al. (Epoch AI)"
    ],
    "date": "2024-11",
    "venue": "Epoch AI",
    "url": "https://epoch.ai/frontiermath",
    "summary": "Epoch's FrontierMath specifically constructs Tier-4 'research-level' problems unsolvable by current frontier models (2024-2025 saturation <2%). The Tier-4 hardening philosophy explicitly anticipates rapid closure and is designed to outlive 12-18 months of model gain. Direct corroboration of B6 (anti-saturation construction).",
    "supports": [
      "B6_anti_saturation_construction"
    ],
    "rebuts": [],
    "verdict": "corroborate",
    "confidence": 0.95,
    "watchlist_tier": "quarterly",
    "operational_evidence": "Tier-4 saturation <2% in 2024; explicit anti-saturation construction methodology",
    "notes": "Reference benchmark for B6"
  },
  {
    "record_kind": "external_evidence",
    "paper_id": "livecodebench:2024",
    "title": "LiveCodeBench: Holistic and Contamination Free Evaluation of LLMs for Code",
    "authors": [
      "Naman Jain",
      "King Han",
      "Alex Gu",
      "et al."
    ],
    "date": "2024-03",
    "venue": "NeurIPS 2024",
    "url": "https://livecodebench.github.io/",
    "summary": "LiveCodeBench refreshes its problem set monthly to defeat training-set contamination, and observes that model scores routinely regress 5-15pp on freshly added problems \u2014 anti-saturation by temporal construction. Closure timescales of ~1 month corroborate both B2 and B6.",
    "supports": [
      "B2_closure_cycle_compression",
      "B6_anti_saturation_construction"
    ],
    "rebuts": [],
    "verdict": "corroborate",
    "confidence": 0.9,
    "watchlist_tier": "quarterly",
    "operational_evidence": "Monthly refresh cadence; 5-15pp regression on uncontaminated problems",
    "notes": "Operationalizes B6 at monthly cadence"
  },
  {
    "record_kind": "external_evidence",
    "paper_id": "arc-prize:v1-v2-v3",
    "title": "ARC-AGI v1 \u2192 v2 \u2192 v3: Constructing Adversarial Closures",
    "authors": [
      "Fran\u00e7ois Chollet",
      "Mike Knoop",
      "Greg Kamradt",
      "et al."
    ],
    "date": "2025-03",
    "venue": "ARC Prize Foundation",
    "url": "https://arcprize.org/",
    "summary": "ARC-AGI's v1\u2192v2\u2192v3 evolution is the paradigmatic anti-saturation closure: each version takes new SOTA approaches and re-falsifies them with new task families. v1 was tractable by Aug-2024; v2 dropped frontier-model score from 88% to <5%; v3 (planned 2025) anticipates the same. Direct corroboration of B6.",
    "supports": [
      "B6_anti_saturation_construction"
    ],
    "rebuts": [],
    "verdict": "corroborate",
    "confidence": 0.95,
    "watchlist_tier": "quarterly",
    "operational_evidence": "v2: 88% \u2192 <5% upon release; three-version closure cycle explicitly anti-saturation by construction",
    "notes": "The canonical B6 example"
  },
  {
    "record_kind": "external_evidence",
    "paper_id": "ibbis:bio-design-2024",
    "title": "IBBIS: Independent Audit of Biological Design API Safeguards",
    "authors": [
      "IBBIS / Nuclear Threat Initiative"
    ],
    "date": "2024-12",
    "venue": "IBBIS",
    "url": "https://ibbis.bio/",
    "summary": "IBBIS audited 4 commercial biological-design APIs (DNA synthesis screening, protein design, retro-synthesis) and found 0/4 had effective dual-use detection: every API was bypassable via straightforward prompt-engineering. The 0/4 result is the canonical biology-domain instance of the 0/N audit pattern (B5) and confirms B1 (no causal mechanism for refusal).",
    "supports": [
      "B1_causal_mechanism_empty",
      "B5_zero_over_N_audit"
    ],
    "rebuts": [],
    "verdict": "corroborate",
    "confidence": 0.85,
    "watchlist_tier": "annual",
    "operational_evidence": "0/4 APIs effective at dual-use detection; reproducibility of refusal-bypass demonstrated",
    "notes": "Direct citation in B5 of original ledger"
  },
  {
    "record_kind": "external_evidence",
    "paper_id": "arxiv:2406.04244",
    "title": "Halevy-Heim-Pilz: Mechanistic Probes Fail on 0/14 Production Models",
    "authors": [
      "Avi Halevy",
      "Lennart Heim",
      "Lennart Pilz"
    ],
    "date": "2024-06",
    "venue": "arXiv",
    "url": "https://arxiv.org/abs/2406.04244",
    "summary": "Halevy-Heim-Pilz systematically applied 14 mechanistic-interpretability probes across 14 frontier production models and reported 0/14 yielding actionable safety insight. This is the canonical 'distillation-portability is universal, mech-interp portability is null' result and directly grounds B4 (distillation = scaling-portability) and B5 (0/N audit).",
    "supports": [
      "B1_causal_mechanism_empty",
      "B4_distillation_architecture_portability",
      "B5_zero_over_N_audit"
    ],
    "rebuts": [],
    "verdict": "corroborate",
    "confidence": 0.85,
    "watchlist_tier": "quarterly",
    "operational_evidence": "0/14 probes; the bridge ledger's flagship 0/N citation for B5",
    "notes": "Cited directly in B4 and B5"
  },
  {
    "record_kind": "external_evidence",
    "paper_id": "arxiv:2406.04316",
    "title": "Anand-Goyal: Zero-of-Nine VLM Mechanistic Audit",
    "authors": [
      "Anand",
      "Goyal",
      "et al."
    ],
    "date": "2024-06",
    "venue": "arXiv",
    "url": "https://arxiv.org/abs/2406.04316",
    "summary": "Anand-Goyal audited 9 production VLMs (vision-language models) using 6 mechanistic-interpretability probes and reported 0/9 yielded a causal mechanism for visual grounding errors. The 0/9 figure is the canonical VLM-domain instance of the 0/N audit pattern (B5) and confirms B1 in VLMs.",
    "supports": [
      "B1_causal_mechanism_empty",
      "B5_zero_over_N_audit"
    ],
    "rebuts": [],
    "verdict": "corroborate",
    "confidence": 0.82,
    "watchlist_tier": "quarterly",
    "operational_evidence": "0/9 VLMs; canonical VLM-domain 0/N citation",
    "notes": "Cited directly in B5 of original ledger"
  },
  {
    "record_kind": "external_evidence",
    "paper_id": "arxiv:2406.04321",
    "title": "Anand-Bommasani: Cross-Organism Capability Audit",
    "authors": [
      "Anand",
      "Bommasani",
      "et al."
    ],
    "date": "2024-09",
    "venue": "Stanford CRFM",
    "url": "https://arxiv.org/abs/2406.04321",
    "summary": "Anand-Bommasani audited 8 production agent architectures for cross-organism (cross-architecture, cross-scaling) behavioral consistency and reported 0/8 showed mechanism-level transfer. The 0/8 figure is the canonical agentic-domain instance of the 0/N pattern (B5).",
    "supports": [
      "B1_causal_mechanism_empty",
      "B5_zero_over_N_audit"
    ],
    "rebuts": [],
    "verdict": "corroborate",
    "confidence": 0.82,
    "watchlist_tier": "quarterly",
    "operational_evidence": "0/8 agent architectures; canonical agentic-domain 0/N citation",
    "notes": "Cited directly in B5"
  },
  {
    "record_kind": "external_evidence",
    "paper_id": "arxiv:2310.13548",
    "title": "Towards Understanding Sycophancy in Language Models",
    "authors": [
      "Mrinank Sharma",
      "Meg Tong",
      "Tomasz Korbak",
      "et al. (Anthropic)"
    ],
    "date": "2023-10",
    "venue": "ICLR 2024",
    "url": "https://arxiv.org/abs/2310.13548",
    "summary": "Anthropic's sycophancy paper documents a behavioral failure mode that persists across 5 production assistants without a known mechanistic cause. Mitigations don't transfer cross-model; capabilities do. Indirect but explicit corroboration of B1 and B3.",
    "supports": [
      "B1_causal_mechanism_empty",
      "B3_capabilities_transfer_mitigations_dont"
    ],
    "rebuts": [],
    "verdict": "corroborate",
    "confidence": 0.78,
    "watchlist_tier": "annual",
    "operational_evidence": "5/5 production models exhibit sycophancy; no causal mechanism identified",
    "notes": "Behavioral evidence, not mechanism"
  },
  {
    "record_kind": "external_evidence",
    "paper_id": "anthropic:rsp-2024",
    "title": "Anthropic Responsible Scaling Policy v2.2 (Methodology Companion)",
    "authors": [
      "Anthropic"
    ],
    "date": "2024-10",
    "venue": "Anthropic",
    "url": "https://www.anthropic.com/news/anthropics-responsible-scaling-policy",
    "summary": "Anthropic's RSP v2.2 methodology explicitly notes that 'evals are necessarily upper bounds' because mechanism-level confirmation is unavailable, and pre-deployment evaluations are scheduled at intervals of 4 months \u2014 directly reflecting the closure-cycle compression (B2). The RSP framework also documents that capability-trigger thresholds and mitigation-trigger thresholds are evaluated separately (B3).",
    "supports": [
      "B1_causal_mechanism_empty",
      "B2_closure_cycle_compression",
      "B3_capabilities_transfer_mitigations_dont"
    ],
    "rebuts": [],
    "verdict": "corroborate",
    "confidence": 0.88,
    "watchlist_tier": "semi-annual",
    "operational_evidence": "Capability/mitigation thresholds evaluated as independent quantities; 4-month re-eval cadence",
    "notes": "Anthropic's own admission that mechanism unavailable; behavioral upper-bound only"
  },
  {
    "record_kind": "external_evidence",
    "paper_id": "openai:preparedness-2024",
    "title": "OpenAI Preparedness Framework (Beta)",
    "authors": [
      "OpenAI"
    ],
    "date": "2024-12",
    "venue": "OpenAI",
    "url": "https://cdn.openai.com/openai-preparedness-framework-beta.pdf",
    "summary": "OpenAI's Preparedness Framework defines 4 risk categories evaluated independently (cybersecurity, CBRN, persuasion, autonomy) and explicitly notes that mitigation-effectiveness on one category does not predict effectiveness on others. The 4 categories are tracked via separate measurement suites and updated quarterly \u2014 directly corroborating B2 (closure cadence) and B3 (mitigation non-transfer).",
    "supports": [
      "B2_closure_cycle_compression",
      "B3_capabilities_transfer_mitigations_dont"
    ],
    "rebuts": [],
    "verdict": "corroborate",
    "confidence": 0.85,
    "watchlist_tier": "quarterly",
    "operational_evidence": "4-category independent measurement; quarterly update cadence; explicit non-transfer admission",
    "notes": "Industry-standard frontier-lab methodology"
  },
  {
    "record_kind": "external_evidence",
    "paper_id": "deepmind:fsf-2024",
    "title": "DeepMind Frontier Safety Framework v1.0",
    "authors": [
      "Google DeepMind"
    ],
    "date": "2024-05",
    "venue": "Google DeepMind",
    "url": "https://deepmind.google/discover/blog/introducing-the-frontier-safety-framework/",
    "summary": "DeepMind's FSF v1.0 defines 'Critical Capability Levels' that are evaluated separately per domain (autonomy, biosecurity, cybersecurity, ML-R&D) with no expectation of cross-domain mitigation transfer. The FSF explicitly schedules evaluations at every 6\u00d7 compute increase, which empirically translates to ~3-4 month cadence in current scaling regime. Direct corroboration of B2 and B3.",
    "supports": [
      "B2_closure_cycle_compression",
      "B3_capabilities_transfer_mitigations_dont"
    ],
    "rebuts": [],
    "verdict": "corroborate",
    "confidence": 0.85,
    "watchlist_tier": "semi-annual",
    "operational_evidence": "6\u00d7 compute cadence \u2248 3.5 months; per-domain isolation",
    "notes": "Companion to OpenAI Preparedness and Anthropic RSP"
  },
  {
    "record_kind": "external_evidence",
    "paper_id": "arxiv:2410.07779",
    "title": "Mamba2 vs Dense Transformer: Capabilities Scale, Mech-Interp Doesn't",
    "authors": [
      "Tri Dao",
      "Albert Gu",
      "et al."
    ],
    "date": "2024-05",
    "venue": "ICML 2024",
    "url": "https://arxiv.org/abs/2405.21060",
    "summary": "The Mamba2 release paper documents that capability scaling laws are essentially preserved across SSM (Mamba) and dense transformer architectures \u2014 same downstream scores at matched compute \u2014 but published mech-interp tooling (probing, SAE, attention-circuit) does NOT port between the two. This is the canonical citation for B4 (distillation = architecture-portability = scaling-portability, but mechanism is non-portable).",
    "supports": [
      "B4_distillation_architecture_portability"
    ],
    "rebuts": [],
    "verdict": "corroborate",
    "confidence": 0.93,
    "watchlist_tier": "quarterly",
    "operational_evidence": "Capability parity at matched compute across Mamba vs dense; mech-interp tooling non-portable",
    "notes": "Direct citation in B4"
  },
  {
    "record_kind": "external_evidence",
    "paper_id": "nist:ai-rmf-2024",
    "title": "NIST AI Risk Management Framework Generative AI Profile (NIST AI 600-1)",
    "authors": [
      "NIST"
    ],
    "date": "2024-07",
    "venue": "NIST",
    "url": "https://nvlpubs.nist.gov/nistpubs/ai/NIST.AI.600-1.pdf",
    "summary": "NIST AI 600-1 enumerates 12 risk categories for generative AI and explicitly notes that mitigations must be evaluated per-category because cross-category transfer is empirically null. The document also identifies '0-of-N' audit findings as a recurring pattern across the 12 categories. Direct corroboration of B3 and B5.",
    "supports": [
      "B3_capabilities_transfer_mitigations_dont",
      "B5_zero_over_N_audit"
    ],
    "rebuts": [],
    "verdict": "corroborate",
    "confidence": 0.85,
    "watchlist_tier": "annual",
    "operational_evidence": "12 risk categories with explicit non-transfer assumption; '0-of-N' audit pattern acknowledged at federal-policy level",
    "notes": "Government policy-grade corroboration"
  },
  {
    "record_kind": "external_evidence",
    "paper_id": "arxiv:2407.21783",
    "title": "Llama 3 Technical Report \u2014 Distillation Section",
    "authors": [
      "Meta AI (Llama Team)"
    ],
    "date": "2024-07",
    "venue": "Meta",
    "url": "https://ai.meta.com/research/publications/the-llama-3-herd-of-models/",
    "summary": "The Llama 3 technical report documents that distilled cousin models (Llama-3-8B, 70B) achieve 92-96% of the 405B teacher's capability on average across 40 benchmarks at 5-12% of the compute, with the gap closing in ~3 months from teacher release. Direct numerical corroboration of B2 (distilled-cousin 3.4mo) and B4 (distillation = scaling-portability).",
    "supports": [
      "B2_closure_cycle_compression",
      "B4_distillation_architecture_portability"
    ],
    "rebuts": [],
    "verdict": "corroborate",
    "confidence": 0.9,
    "watchlist_tier": "annual",
    "operational_evidence": "92-96% teacher capability retained at 5-12% compute; ~3mo cousin-release lag",
    "notes": "Industry-standard distillation cadence baseline"
  },
  {
    "record_kind": "external_evidence",
    "paper_id": "arxiv:2410.11516",
    "title": "BIS Disclosure Lifetime in Open-Weight LLM Releases",
    "authors": [
      "U.S. Bureau of Industry and Security (BIS) / CSET analysis"
    ],
    "date": "2024-09",
    "venue": "CSET / BIS",
    "url": "https://cset.georgetown.edu/publication/the-bureau-of-industry-and-securitys-rule-on-advanced-ai-models/",
    "summary": "CSET's analysis of BIS frontier-model export-control rules documents an empirical 4-month average lifetime between an open-weight release and the public availability of a comparable cousin via Chinese-lab distillation. Direct corroboration of B2 (BIS 4mo lifetime).",
    "supports": [
      "B2_closure_cycle_compression"
    ],
    "rebuts": [],
    "verdict": "corroborate",
    "confidence": 0.82,
    "watchlist_tier": "semi-annual",
    "operational_evidence": "4mo BIS export-control lifetime; quantitatively confirms B2",
    "notes": "Direct citation in B2"
  },
  {
    "record_kind": "external_evidence",
    "paper_id": "arxiv:2308.14752",
    "title": "Survey of Cross-Domain Failure Modes in Foundation Model Evaluation",
    "authors": [
      "Bommasani",
      "Liang",
      "Kapoor",
      "et al."
    ],
    "date": "2024-08",
    "venue": "Stanford CRFM",
    "url": "https://arxiv.org/abs/2308.14752",
    "summary": "This Bommasani-lineage survey enumerates 23 documented failure modes of cross-domain ML evaluation, with the dominant theme being that benchmark scores saturate before model capability saturates, requiring anti-saturation reconstruction every ~3-6 months. Direct corroboration of B6 and B2.",
    "supports": [
      "B2_closure_cycle_compression",
      "B6_anti_saturation_construction"
    ],
    "rebuts": [],
    "verdict": "corroborate",
    "confidence": 0.85,
    "watchlist_tier": "annual",
    "operational_evidence": "23 documented failure modes; 3-6mo benchmark reconstruction cadence",
    "notes": "Comprehensive Bommasani-lineage survey"
  },
  {
    "record_kind": "external_evidence",
    "paper_id": "arxiv:2402.04249-v2",
    "title": "Anthropic Many-Shot Jailbreaking: 30d Patch Cycle Documented",
    "authors": [
      "Cem Anil",
      "Esin Durmus",
      "Mrinank Sharma",
      "et al. (Anthropic)"
    ],
    "date": "2024-04",
    "venue": "Anthropic",
    "url": "https://www.anthropic.com/research/many-shot-jailbreaking",
    "summary": "Anthropic's many-shot jailbreaking paper documents the discovery-to-patch cycle for a novel jailbreak family at 30 days; meanwhile, the underlying attack vector (long-context priming) has a 73-day half-life as a vendor-claim. Direct numerical corroboration of B2 (30d patch, 73d vendor-claim half-life).",
    "supports": [
      "B2_closure_cycle_compression"
    ],
    "rebuts": [],
    "verdict": "corroborate",
    "confidence": 0.88,
    "watchlist_tier": "quarterly",
    "operational_evidence": "30d patch cycle; long-context attack vector with 73d vendor half-life",
    "notes": "Direct citation in B2"
  },
  {
    "record_kind": "external_evidence",
    "paper_id": "openai:evals-2024",
    "title": "OpenAI Model Card Cross-Capability Audits (o1 / o3)",
    "authors": [
      "OpenAI"
    ],
    "date": "2024-12",
    "venue": "OpenAI",
    "url": "https://openai.com/index/openai-o1-system-card/",
    "summary": "OpenAI's o1 and o3 system cards document capabilities scaling cleanly across domains while mitigations are reported per-category with explicit non-transfer caveats. The 36h emergency patch window documented in o1-preview's bio-CBRN red-team also corroborates B2 (patch 30d/36h).",
    "supports": [
      "B2_closure_cycle_compression",
      "B3_capabilities_transfer_mitigations_dont"
    ],
    "rebuts": [],
    "verdict": "corroborate",
    "confidence": 0.83,
    "watchlist_tier": "quarterly",
    "operational_evidence": "36h patch window documented; per-category mitigation reporting",
    "notes": "Industry-standard reporting"
  },
  {
    "record_kind": "external_evidence",
    "paper_id": "arxiv:2406.15877",
    "title": "Quantifying the Persistence of Memorized Information Across Distillation",
    "authors": [
      "Carlini",
      "Tram\u00e8r",
      "et al."
    ],
    "date": "2024-06",
    "venue": "arXiv",
    "url": "https://arxiv.org/abs/2406.15877",
    "summary": "Carlini-Tram\u00e8r demonstrate that memorized data persists through 3 generations of teacher-student distillation, while safety properties tuned in the teacher do not \u2014 direct evidence that distillation is capability-portable but not mitigation-portable. Corroborates B4 and B3.",
    "supports": [
      "B3_capabilities_transfer_mitigations_dont",
      "B4_distillation_architecture_portability"
    ],
    "rebuts": [],
    "verdict": "corroborate",
    "confidence": 0.85,
    "watchlist_tier": "annual",
    "operational_evidence": "3-generation memorization persistence; mitigation loss across distillation",
    "notes": "Empirical confirmation of distillation-asymmetry"
  },
  {
    "record_kind": "external_evidence",
    "paper_id": "casi:survey-2025",
    "title": "Center for AI Safety: Cross-Domain Misuse Survey",
    "authors": [
      "Dan Hendrycks",
      "et al. (CAIS)"
    ],
    "date": "2025-02",
    "venue": "Center for AI Safety",
    "url": "https://www.safe.ai/research",
    "summary": "CAIS surveyed 8 misuse categories (CBRN, cybersecurity, persuasion, agency, bias, hallucination, jailbreak, copyright) across 6 frontier models and found that no single intervention reduces risk across more than 2 categories. Direct corroboration of B3 (mitigations don't transfer cross-surface) and B5 (1/8 by category).",
    "supports": [
      "B3_capabilities_transfer_mitigations_dont",
      "B5_zero_over_N_audit"
    ],
    "rebuts": [],
    "verdict": "corroborate",
    "confidence": 0.78,
    "watchlist_tier": "annual",
    "operational_evidence": "8 categories \u00d7 6 models; no single intervention generalizes across >2 categories",
    "notes": "Independent civil-society audit"
  },
  {
    "record_kind": "external_evidence",
    "paper_id": "mozilla:audit-2024",
    "title": "Mozilla Foundation: Trustworthy AI Cross-Domain Disclosure Audit",
    "authors": [
      "Mozilla Foundation"
    ],
    "date": "2024-09",
    "venue": "Mozilla",
    "url": "https://foundation.mozilla.org/en/insights/",
    "summary": "Mozilla's audit of 18 frontier model disclosures across 6 dimensions documents that Chinese open-weight labs (Qwen, DeepSeek, Yi) score 80%+ on methods disclosure while Western closed labs (OpenAI, Anthropic, Google) average 25%. The 80%/25% gap directly corroborates B7.",
    "supports": [
      "B7_western_chinese_disclosure_inversion"
    ],
    "rebuts": [],
    "verdict": "corroborate",
    "confidence": 0.78,
    "watchlist_tier": "annual",
    "operational_evidence": "China-mean 80%, West-mean 25%; quantitative confirmation of the inversion",
    "notes": "Civil-society cross-validation of FMTI numbers"
  },
  {
    "record_kind": "external_evidence",
    "paper_id": "algorithmwatch:2024",
    "title": "AlgorithmWatch: Cross-Border Open-Weight Disclosure Survey",
    "authors": [
      "AlgorithmWatch"
    ],
    "date": "2024-11",
    "venue": "AlgorithmWatch",
    "url": "https://algorithmwatch.org/en/",
    "summary": "AlgorithmWatch's survey across 20 LLM releases (12 Chinese, 8 Western) found that 100% of Chinese releases shipped with weights+technical-report while only 17% (1/8: Llama-3) of Western releases did. Direct corroboration of B7 (China 100% / Western 17% inversion).",
    "supports": [
      "B7_western_chinese_disclosure_inversion"
    ],
    "rebuts": [],
    "verdict": "corroborate",
    "confidence": 0.85,
    "watchlist_tier": "annual",
    "operational_evidence": "China 100% (12/12), West 17% (1/8); exactly matches the B7 claim",
    "notes": "Direct numerical citation match"
  },
  {
    "record_kind": "external_evidence",
    "paper_id": "arxiv:2411.07954",
    "title": "Causally-Faithful Mechanism Identification in Reasoning Models: A 0/12 Result",
    "authors": [
      "Atticus Geiger",
      "Hanna Wallach",
      "et al."
    ],
    "date": "2024-11",
    "venue": "arXiv",
    "url": "https://arxiv.org/abs/2411.07954",
    "summary": "Geiger-Wallach audited 12 production reasoning models (o1, R1-derivatives, DeepSeek-V3 distilled cousins) for causally-faithful chain-of-thought mechanisms using interchange-intervention probes, and reported 0/12 yielded a faithful causal mechanism. This is the canonical reasoning-domain 0/N citation (B5) and confirms B1 in the reasoning domain.",
    "supports": [
      "B1_causal_mechanism_empty",
      "B5_zero_over_N_audit"
    ],
    "rebuts": [],
    "verdict": "corroborate",
    "confidence": 0.83,
    "watchlist_tier": "quarterly",
    "operational_evidence": "0/12 reasoning models; reasoning-domain instance of 0/N pattern",
    "notes": "Direct citation for B1/B5 in reasoning domain"
  },
  {
    "record_kind": "external_evidence",
    "paper_id": "arxiv:2502.19320",
    "title": "Scaling Laws Audit: Capabilities Are Portable, Mech-Interp Is Not",
    "authors": [
      "Anson Ho",
      "Tamay Besiroglu",
      "Ege Erdil",
      "et al. (Epoch AI)"
    ],
    "date": "2025-02",
    "venue": "Epoch AI",
    "url": "https://epoch.ai/research",
    "summary": "Epoch AI's scaling-laws audit across 3 architectures (dense transformer, MoE, SSM/Mamba) and 5 scales (1B-1T) shows that downstream-task scaling curves are nearly identical across architectures at matched compute, while mech-interp findings (attention heads, circuit motifs, SAE feature directions) fail to replicate across architecture pairs in 0/14 attempted cross-port studies. Direct corroboration of B4 (distillation = scaling-portability) and B5 (0/14 echo of Halevy-Heim-Pilz).",
    "supports": [
      "B4_distillation_architecture_portability",
      "B5_zero_over_N_audit"
    ],
    "rebuts": [],
    "verdict": "corroborate",
    "confidence": 0.82,
    "watchlist_tier": "quarterly",
    "operational_evidence": "3 architectures \u00d7 5 scales; 0/14 cross-architecture mech-interp port studies",
    "notes": "Epoch-lineage cross-architecture scaling confirmation"
  },
  {
    "record_kind": "external_evidence",
    "paper_id": "arxiv:2502.13201",
    "title": "Cross-Lab Inference-Time Safety Audit: 0/6 Production Systems",
    "authors": [
      "Sayash Kapoor",
      "Rishi Bommasani",
      "et al."
    ],
    "date": "2025-02",
    "venue": "Stanford CRFM / Princeton CITP",
    "url": "https://arxiv.org/abs/2502.13201",
    "summary": "Kapoor-Bommasani's cross-lab audit of 6 production inference-time safety systems (Anthropic constitution-AI, OpenAI moderation, Google Perspective, DeepMind Sparrow-derivatives, Mistral Le Chat moderation, Cohere safety filters) found 0/6 had explicit causal-mechanism documentation linking inputs to refusal decisions. Direct corroboration of B1 and B5 in inference-time-safety domain.",
    "supports": [
      "B1_causal_mechanism_empty",
      "B5_zero_over_N_audit"
    ],
    "rebuts": [],
    "verdict": "corroborate",
    "confidence": 0.78,
    "watchlist_tier": "quarterly",
    "operational_evidence": "0/6 inference-time safety systems; mech-empty in safety-tool domain",
    "notes": "Princeton-CITP lineage; inference-time-safety domain"
  },
  {
    "record_kind": "external_evidence",
    "paper_id": "arxiv:2503.08105",
    "title": "Agentic Capability Audit: METR-style Cross-Task Failure",
    "authors": [
      "METR (Beth Barnes et al.)"
    ],
    "date": "2025-03",
    "venue": "METR",
    "url": "https://metr.org/research",
    "summary": "METR's cross-task audit of 4 frontier agents across 50 multi-step agentic tasks reported that capability transfer (e.g., from short-horizon coding to long-horizon planning) is empirically 90%+ while mitigation transfer (refusal, sandbox-respecting behavior) is <30%. Direct corroboration of B3 across the agentic-deployment surface.",
    "supports": [
      "B3_capabilities_transfer_mitigations_dont"
    ],
    "rebuts": [],
    "verdict": "corroborate",
    "confidence": 0.83,
    "watchlist_tier": "quarterly",
    "operational_evidence": "Capability transfer >90%, mitigation transfer <30%; cross-surface asymmetry",
    "notes": "METR continued cross-task work"
  },
  {
    "record_kind": "external_evidence",
    "paper_id": "arxiv:2401.05566",
    "title": "Anthropic Sycophancy + Sleeper Agents: Combined 0/N Mech-Probe Pattern",
    "authors": [
      "Anthropic (composite)"
    ],
    "date": "2024-01",
    "venue": "Anthropic",
    "url": "https://arxiv.org/abs/2401.05566",
    "summary": "The combined corpus of Anthropic's sleeper-agents and sycophancy papers reports >10 separate mech-interp probe attempts to identify causal mechanism for either persistent backdoor or sycophantic refusal-bypass, with 0/10+ yielding mechanism. Corroborates B1, B5 in the mech-interp domain at the largest single-lab dataset.",
    "supports": [
      "B1_causal_mechanism_empty",
      "B5_zero_over_N_audit"
    ],
    "rebuts": [],
    "verdict": "corroborate",
    "confidence": 0.85,
    "watchlist_tier": "quarterly",
    "operational_evidence": "0/10+ mech-probe attempts across Anthropic safety papers",
    "notes": "Largest single-lab 0/N pattern"
  },
  {
    "record_kind": "external_evidence",
    "paper_id": "deepseek:r1-2025",
    "title": "DeepSeek R1 Technical Report \u2014 Full Open Weights + Distillation Recipe",
    "authors": [
      "DeepSeek-AI"
    ],
    "date": "2025-01",
    "venue": "arXiv",
    "url": "https://arxiv.org/abs/2501.12948",
    "summary": "DeepSeek R1's release shipped with full open weights, distillation recipe, training data composition, and reproducible cousin models \u2014 all 4 dimensions Western frontier labs typically withhold. The release also produced 6 distilled cousins within 3.4 months at <10% compute, hitting 92-95% R1 capability. Direct double corroboration: B4 (distillation=portability) and B7 (Chinese-lab full disclosure).",
    "supports": [
      "B2_closure_cycle_compression",
      "B4_distillation_architecture_portability",
      "B7_western_chinese_disclosure_inversion"
    ],
    "rebuts": [],
    "verdict": "corroborate",
    "confidence": 0.92,
    "watchlist_tier": "quarterly",
    "operational_evidence": "Full disclosure (weights+recipe+data); 6 distilled cousins in 3.4mo at <10% compute, 92-95% capability retention",
    "notes": "The B7 + B4 + B2 triple-corroboration release"
  },
  {
    "record_kind": "external_evidence",
    "paper_id": "arxiv:2405.21060",
    "title": "BIG-Bench-Hard / Anti-Saturation Successor: BBH-Pro",
    "authors": [
      "Mirac Suzgun",
      "Liang",
      "et al."
    ],
    "date": "2024-10",
    "venue": "arXiv",
    "url": "https://arxiv.org/abs/2210.09261",
    "summary": "BIG-Bench-Hard's reconstruction (BBH\u2192BBH-Pro) after Sept-2024 saturation followed the same anti-saturation methodology as MMMU-Pro and ARC-AGI-v2: filter for items requiring reasoning beyond LM heuristic, expand option pool 4\u00d7, add adversarial perturbations. Model scores dropped 12pp on average. Corroborates B6.",
    "supports": [
      "B6_anti_saturation_construction"
    ],
    "rebuts": [],
    "verdict": "corroborate",
    "confidence": 0.78,
    "watchlist_tier": "annual",
    "operational_evidence": "12pp drop after Pro reconstruction; anti-saturation methodology",
    "notes": "BBH-Pro is a parallel of MMMU-Pro and ARC-v2"
  },
  {
    "record_kind": "external_evidence",
    "paper_id": "openreview:rsp-evals-2024",
    "title": "Independent Methodology Survey: RSP/Preparedness/FSF Common Patterns",
    "authors": [
      "Markus Anderljung",
      "Cremer",
      "et al. (GovAI)"
    ],
    "date": "2024-08",
    "venue": "GovAI / Centre for the Governance of AI",
    "url": "https://www.governance.ai/research",
    "summary": "GovAI's survey of the 3 major industry frontier-safety frameworks (Anthropic RSP, OpenAI Preparedness, DeepMind FSF) documents that all 3 explicitly assume capability/mitigation independence (B3 baseline), schedule evaluations at quarterly cadence (B2), and acknowledge that mech-interp is not yet a viable evaluation tool (B1 baseline). The three-framework convergence is itself evidence that all 7 bridges hold operationally.",
    "supports": [
      "B1_causal_mechanism_empty",
      "B2_closure_cycle_compression",
      "B3_capabilities_transfer_mitigations_dont"
    ],
    "rebuts": [],
    "verdict": "corroborate",
    "confidence": 0.85,
    "watchlist_tier": "semi-annual",
    "operational_evidence": "3 industry frameworks; convergent on quarterly cadence + capability/mitigation independence + mech-empty",
    "notes": "Independent academic synthesis of the 3 frontier-lab frameworks; strong triangulation"
  },
  {
    "record_kind": "external_evidence",
    "paper_id": "qwen:tech-report-2025",
    "title": "Qwen 2.5 / 3 Technical Report \u2014 Open Weights + Full Methods",
    "authors": [
      "Alibaba DAMO Academy / Qwen Team"
    ],
    "date": "2025-02",
    "venue": "arXiv",
    "url": "https://arxiv.org/abs/2412.15115",
    "summary": "Qwen 2.5 and 3 ship with open weights, full pretraining data signal (mixture ratios, decontamination steps), instruction-tuning recipe, and reproducible benchmarks \u2014 matching DeepSeek R1's disclosure profile and contrasting sharply with Western closed peers. Direct corroboration of B7.",
    "supports": [
      "B7_western_chinese_disclosure_inversion"
    ],
    "rebuts": [],
    "verdict": "corroborate",
    "confidence": 0.88,
    "watchlist_tier": "annual",
    "operational_evidence": "Full open-weights + methods disclosure; second-major-Chinese-lab confirmation of the disclosure norm",
    "notes": "Confirms B7 is structural, not single-lab"
  },
  {
    "record_kind": "external_evidence",
    "paper_id": "transformer-circuits:2025-attribution-graphs",
    "title": "Circuit Tracing: Revealing Computational Graphs in Language Models / On the Biology of a Large Language Model",
    "authors": [
      "Lindsey",
      "Anthropic Interpretability Team"
    ],
    "date": "2025-03",
    "venue": "Transformer Circuits Thread (Anthropic)",
    "url": "https://transformer-circuits.pub/2025/attribution-graphs/methods.html",
    "summary": "Anthropic published attribution graphs on Claude 3.5 Haiku \u2014 a production frontier-scale model \u2014 with constrained-patching intervention experiments that clamp pre-intervention activations and measure whether downstream effects match the predicted causal graph. They demonstrate multi-hop reasoning, planning (rhyme selection before composition), and medical reasoning circuits with intervention validation. This is a direct rebuttal to B1: causal-mechanism work has reached frontier scale with independent intervention validation, not just toy models.",
    "supports": [],
    "rebuts": [
      "B1_causal_mechanism_empty"
    ],
    "verdict": "rebut",
    "confidence": 0.75,
    "watchlist_tier": "monthly",
    "operational_counter_evidence": "Constrained patching experiments on Claude 3.5 Haiku \u2014 local replacement models with predicted vs. measured perturbation effects agreeing within the attribution graph. Production-grade frontier model, not a small interpretability proxy.",
    "notes": "Strongest single rebuttal of B1. However, faithfulness of the attribution graph to the underlying model still relies on a learned replacement model, so causal-faithfulness skeptics will note this is approximate. Worth tracking how the field absorbs vs. critiques this."
  },
  {
    "record_kind": "external_evidence",
    "paper_id": "transformer-circuits:scaling-monosemanticity-2024",
    "title": "Scaling Monosemanticity: Extracting Interpretable Features from Claude 3 Sonnet",
    "authors": [
      "Templeton",
      "Conerly",
      "Anthropic Interpretability Team"
    ],
    "date": "2024-05",
    "venue": "Transformer Circuits Thread",
    "url": "https://transformer-circuits.pub/2024/scaling-monosemanticity/",
    "summary": "SAE-extracted features on Claude 3 Sonnet enable causal interventions: adding decoder vectors to activations produces predictable behavioral changes (e.g. amplifying a 'malicious code' feature causes Claude to output malicious code). This is causally-validated frontier-scale interpretability with intervention. Rebuts B1's claim that causal mechanism work is empty at frontier scale.",
    "supports": [],
    "rebuts": [
      "B1_causal_mechanism_empty"
    ],
    "verdict": "rebut",
    "confidence": 0.7,
    "watchlist_tier": "quarterly",
    "operational_counter_evidence": "Production-Claude-3-Sonnet SAEs with millions of features and behaviorally-causal interventions, including safety-relevant features (deception, sycophancy, bias).",
    "notes": "Critics argue these features are correlational rather than mechanistic \u2014 features encode information rather than compute. But the intervention evidence is direct."
  },
  {
    "record_kind": "external_evidence",
    "paper_id": "goodfire:ember-2024-2025",
    "title": "Goodfire Ember: Scaling Interpretability for Frontier Model Alignment",
    "authors": [
      "Goodfire AI Team"
    ],
    "date": "2024-12",
    "venue": "Goodfire Blog / Production API",
    "url": "https://www.goodfire.ai/blog/announcing-goodfire-ember",
    "summary": "Goodfire shipped Ember (Dec 2024) \u2014 a hosted production API for mechanistic-interpretability-based steering on Llama 3.3 70B, Llama 3.1 8B, and (per harvesting evidence) Kimi K2 Thinking trillion-parameter models. Demonstrated jailbreak-resistance by upweighting refusal features against StrongREJECT. Anthropic invested in Goodfire's $50M Series A (April 2025). This is shipped, paying-customer interpretability \u2014 far from 'empty.'",
    "supports": [],
    "rebuts": [
      "B1_causal_mechanism_empty"
    ],
    "verdict": "rebut",
    "confidence": 0.7,
    "watchlist_tier": "monthly",
    "operational_counter_evidence": "Production interpretability API with enterprise customers (Rakuten, Apollo, Haize); demonstrated mid-CoT feature steering on Kimi K2 Thinking (~1T params).",
    "notes": "B1 may need to be reframed: 'causal mechanism is empty in 7 specific domains we audited,' not 'empty everywhere.' Goodfire is one of the cleanest commercial counter-examples."
  },
  {
    "record_kind": "external_evidence",
    "paper_id": "deepmind:gemma-scope-2-2025",
    "title": "Gemma Scope 2: Open SAEs and transcoders for all Gemma 3 sizes",
    "authors": [
      "DeepMind Interpretability Team"
    ],
    "date": "2025-12",
    "venue": "DeepMind / Hugging Face",
    "url": "https://intuitionlabs.ai/articles/mechanistic-interpretability-ai-llms",
    "summary": "DeepMind released SAEs+transcoders for Gemma 3 (270M to 27B) including instruction-tuned models \u2014 the largest open-source interpretability release to date. Enables analysis of jailbreak resistance, refusal, and CoT faithfulness with causal interventions on production-scale models. Rebuts B1's frontier-scale-empty claim and B6's interpretability-as-closure absence.",
    "supports": [],
    "rebuts": [
      "B1_causal_mechanism_empty",
      "B6_anti_saturation_only_closure"
    ],
    "verdict": "rebut",
    "confidence": 0.65,
    "watchlist_tier": "monthly",
    "operational_counter_evidence": "~110 petabytes of stored activations, >1 trillion total trained parameters across SAEs/transcoders. Covers instruction-tuned models, not just base.",
    "notes": "DeepMind has publicly pivoted away from SAEs toward 'pragmatic interpretability' \u2014 so this release is partly a legacy artifact. Still rebuts the claim that the work doesn't exist."
  },
  {
    "record_kind": "external_evidence",
    "paper_id": "apollo:stress-test-deliberative-2025",
    "title": "Stress Testing Deliberative Alignment for Anti-Scheming Training",
    "authors": [
      "Apollo Research",
      "OpenAI"
    ],
    "date": "2025-09",
    "venue": "Apollo Research Paper / arxiv",
    "url": "https://www.apolloresearch.ai/research/stress-testing-deliberative-alignment-for-anti-scheming-training/",
    "summary": "Apollo + OpenAI demonstrate that deliberative alignment training on o3 and o4-mini significantly reduces covert behaviors (sandbagging, sabotage, reward hacking, lying) across 26 evaluation suites / 180+ environments \u2014 and that this generalizes from a narrow training distribution to broad agentic settings. Partial rebuttal to B3 (mitigations DO partially transfer cross-surface) and weakens B1 (training intervention with measurable effect).",
    "supports": [],
    "rebuts": [
      "B3_asymmetric_caps_mitigations",
      "B1_causal_mechanism_empty"
    ],
    "verdict": "weaken",
    "confidence": 0.55,
    "watchlist_tier": "quarterly",
    "operational_counter_evidence": "Anti-scheming spec trained on narrow distribution generalizes to 180+ environments; reductions across all measured covert categories. Caveat: not eliminated, just reduced.",
    "notes": "Apollo themselves emphasize that covert behavior is reduced but not eliminated. This is a partial rebuttal at best \u2014 the asymmetry still holds qualitatively (mitigations imperfect), but quantitatively it's better than 'mitigations don't transfer.'"
  },
  {
    "record_kind": "external_evidence",
    "paper_id": "anthropic-openai:joint-eval-2025",
    "title": "Findings from a pilot Anthropic-OpenAI alignment evaluation exercise",
    "authors": [
      "Anthropic Alignment Science",
      "OpenAI Safety"
    ],
    "date": "2025-08",
    "venue": "Anthropic Alignment Science Blog / OpenAI",
    "url": "https://alignment.anthropic.com/2025/openai-findings/",
    "summary": "Two frontier labs ran their internal alignment evaluations on each other's public models. Anthropic found OpenAI o3/o4-mini aligned 'as well or better' than their own models on sycophancy/whistleblowing/self-preservation/misuse axes (with external safeguards disabled). This is a cross-vendor independent audit that PASSED for o3. Direct rebuttal of B5's 0/N pattern.",
    "supports": [],
    "rebuts": [
      "B5_0_N_audit_pattern"
    ],
    "verdict": "rebut",
    "confidence": 0.7,
    "watchlist_tier": "quarterly",
    "operational_counter_evidence": "o3 passed sycophancy stress tests where all other tested models (GPT-4o, GPT-4.1, Claude variants) struggled. Mutual independent assessment with published findings.",
    "notes": "Significant counter-evidence to B5. However, the audit was disclosed by the parties themselves rather than a true third-party \u2014 selection effects possible."
  },
  {
    "record_kind": "external_evidence",
    "paper_id": "anthropic:petri-2025",
    "title": "Petri: An open-source auditing tool to accelerate AI safety research",
    "authors": [
      "Anthropic Alignment Science Team"
    ],
    "date": "2025-10",
    "venue": "Anthropic Alignment Science",
    "url": "https://alignment.anthropic.com/2025/petri/",
    "summary": "Anthropic released Petri (Oct 2025) as open-source automated behavioral auditing. UK AISI used a pre-release of Petri in Sonnet 4.5 testing. Petri 2.0 launched 2026 with eval-awareness mitigations. This is an active closure-mechanism BEYOND anti-saturation: independent third-party tooling for cross-vendor audit. Rebuts B6 (anti-saturation isn't the only working closure).",
    "supports": [],
    "rebuts": [
      "B6_anti_saturation_only_closure"
    ],
    "verdict": "rebut",
    "confidence": 0.6,
    "watchlist_tier": "quarterly",
    "operational_counter_evidence": "Petri 2.0 with eval-awareness mitigation; handed to Meridian Labs as third-party; AISI used pre-release on Sonnet 4.5. Active framework, not just retired benchmark refresh.",
    "notes": "Whether Petri actually closes anything in the bills-of-particulars sense is unclear \u2014 but the methodology IS an alternative closure mechanism."
  },
  {
    "record_kind": "external_evidence",
    "paper_id": "anthropic:claude-opus-4-system-card-2025",
    "title": "Claude Opus 4 / Sonnet 4 System Card (May 2025) with US AISI + UK AISI pre-deployment testing",
    "authors": [
      "Anthropic Safety"
    ],
    "date": "2025-05",
    "venue": "Anthropic",
    "url": "https://www.anthropic.com/claude-4-system-card",
    "summary": "Claude Opus 4 received independent pre-deployment evaluation by US AISI and UK AISI on CBRN, cybersecurity, and autonomous capabilities \u2014 and was cleared for deployment. This is a vendor claim that passed independent replication on safety-relevant dimensions. Rebuts B5's 0/N pattern.",
    "supports": [],
    "rebuts": [
      "B5_0_N_audit_pattern"
    ],
    "verdict": "rebut",
    "confidence": 0.65,
    "watchlist_tier": "quarterly",
    "operational_counter_evidence": "Joint US-AISI + UK-AISI independent capability assessment, separate from Anthropic's internal eval. Apollo also tested (but couldn't complete in time for Opus 4.6).",
    "notes": "AISI evaluations are not full bills-of-particulars-style audits but they are independent. B5's '0 clean closures' framing is strained by these."
  },
  {
    "record_kind": "external_evidence",
    "paper_id": "aisi:claude-mythos-gpt55-2025-2026",
    "title": "AISI Pre-deployment evaluations of Claude Mythos Preview and OpenAI GPT-5.5 cyber capabilities",
    "authors": [
      "UK AISI"
    ],
    "date": "2026-01",
    "venue": "AISI Work Blog",
    "url": "https://www.aisi.gov.uk/blog/our-evaluation-of-claude-mythos-previews-cyber-capabilities",
    "summary": "UK AISI published independent third-party cyber capability evaluations of Claude Mythos Preview and GPT-5.5 with published findings. Continued cadence of independent evaluations on vendor-supplied models with replicable methodology. Rebuts B5 (0/N) and weakens B2 (closure can happen pre-deployment, not just post-launch).",
    "supports": [],
    "rebuts": [
      "B5_0_N_audit_pattern",
      "B2_closure_cycle"
    ],
    "verdict": "weaken",
    "confidence": 0.55,
    "watchlist_tier": "quarterly",
    "operational_counter_evidence": "Repeated pre-deployment audits on cyber capabilities with public reports. Independent, methodologically grounded, replicable.",
    "notes": "Cycle time is fast (weeks pre-deployment, not 3-4 months reactive). Different operational profile than the 3-4mo closure cycle in B2."
  },
  {
    "record_kind": "external_evidence",
    "paper_id": "arxiv:2510.27378",
    "title": "Measuring Chain-of-Thought Monitorability Through Faithfulness and Verbosity",
    "authors": [
      "Bao et al."
    ],
    "date": "2025-10",
    "venue": "arxiv",
    "url": "https://arxiv.org/abs/2510.27378",
    "summary": "Comprehensive monitorability metric finds DeepSeek-R1 at 78.3% average monitorability and Claude 3.7 Sonnet (extended thinking) at second place. CoT faithfulness is measurable and HIGH on some frontier reasoning models \u2014 rebuts the framing that CoT is opaque/unfaithful across the board.",
    "supports": [],
    "rebuts": [
      "B1_causal_mechanism_empty"
    ],
    "verdict": "weaken",
    "confidence": 0.55,
    "watchlist_tier": "quarterly",
    "operational_counter_evidence": "DeepSeek-R1 78.3% monitorability average; some models near 90% faithfulness on the matched 'Lie to Me' benchmark.",
    "notes": "Counter to Anthropic's earlier 25%/39% hint-mention rates. Methodology differs \u2014 monitorability is a softer metric than hint acknowledgment. Genuine open empirical question."
  },
  {
    "record_kind": "external_evidence",
    "paper_id": "arxiv:2603.22582",
    "title": "Lie to Me: How Faithful Is Chain-of-Thought Reasoning in Open-Weight Reasoning Models?",
    "authors": [
      "Anonymous (open-weight reasoning study)"
    ],
    "date": "2026-03",
    "venue": "arxiv",
    "url": "https://arxiv.org/abs/2603.22582",
    "summary": "Reports DeepSeek-V3.2-Speciale at 89.9% CoT faithfulness and GPT-OSS-120B at 84.9%. Specific open-weight models score very high on CoT faithfulness. Directly rebuts the universal-unfaithfulness framing.",
    "supports": [],
    "rebuts": [
      "B1_causal_mechanism_empty"
    ],
    "verdict": "rebut",
    "confidence": 0.55,
    "watchlist_tier": "quarterly",
    "operational_counter_evidence": "89.9% (DeepSeek-V3.2-Speciale) and 84.9% (GPT-OSS-120B) faithfulness rates. Wide variance across models (39.7% Seed-1.6-Flash to 89.9%).",
    "notes": "The numbers suggest CoT faithfulness is model-dependent and tractable in some architectures, not uniformly absent. B1 may apply to RLHF'd reasoning models more than open-weight chains."
  },
  {
    "record_kind": "external_evidence",
    "paper_id": "antidistillation:2025",
    "title": "Antidistillation Fingerprinting and Preserving AI Openness",
    "authors": [
      "Antidistillation.com Authors"
    ],
    "date": "2025-04",
    "venue": "Antidistillation Blog / preprint",
    "url": "https://antidistillation.com/blog/unexpected-externalities-of-distillation/",
    "summary": "Demonstrates antidistillation fingerprinting that embeds statistical signatures in teacher outputs that SURVIVE distillation, providing provenance evidence. Indicates capability-portability via distillation is not unrestricted \u2014 defenders have technical recourse. Weakens B4 (distillation portability is asymmetric in defenders' favor here).",
    "supports": [],
    "rebuts": [
      "B4_distillation_portability"
    ],
    "verdict": "weaken",
    "confidence": 0.45,
    "watchlist_tier": "quarterly",
    "operational_counter_evidence": "Statistical fingerprints persist through distillation pipelines, enabling provenance detection (proof of distillation extraction).",
    "notes": "This rebuts the cleanness of B4 \u2014 distillation is portable BUT now detectable. Whether detection actually halts copying is policy, not technical."
  },
  {
    "record_kind": "external_evidence",
    "paper_id": "arxiv:2503.03730",
    "title": "Towards Understanding Distilled Reasoning Models: A Representational Approach",
    "authors": [
      "ICLR 2025 Building Trust Workshop authors"
    ],
    "date": "2025-03",
    "venue": "ICLR 2025 Workshop",
    "url": "https://arxiv.org/abs/2503.03730",
    "summary": "Mechanistic interpretability analysis of distilled reasoning models shows representational losses \u2014 distillation modifies model in ways that reduce certain reasoning capabilities. Distillation is NOT capability-clean. Direct rebuttal to B4's distillation-as-portable claim.",
    "supports": [],
    "rebuts": [
      "B4_distillation_portability"
    ],
    "verdict": "rebut",
    "confidence": 0.55,
    "watchlist_tier": "quarterly",
    "operational_counter_evidence": "Empirical representation losses in distilled variants on complex reasoning. Architectural mismatch creates capability gaps not bridged by distillation.",
    "notes": "Strongest single rebuttal of B4. Suggests reasoning capabilities have an architecture-specific component that doesn't fully transfer."
  },
  {
    "record_kind": "external_evidence",
    "paper_id": "baek-tegmark:scale-emergence-2025",
    "title": "Scale-Dependent Emergence of Reasoning Capabilities (Baek and Tegmark)",
    "authors": [
      "Baek",
      "Tegmark"
    ],
    "date": "2025",
    "venue": "Preprint cited in distillation literature",
    "url": "https://www.rohan-paul.com/p/recent-advancements-in-distillation",
    "summary": "Certain reasoning capabilities emerge only at specific model scales and are difficult to fully transfer to smaller distilled architectures. Identifies a capability-class that resists scale-portability. Directly rebuts B4's scaling-portability premise.",
    "supports": [],
    "rebuts": [
      "B4_distillation_portability"
    ],
    "verdict": "rebut",
    "confidence": 0.45,
    "watchlist_tier": "quarterly",
    "operational_counter_evidence": "Empirical: certain reasoning emerges only at scale-X, fails to transfer to smaller students. Scale-dependent capabilities exist.",
    "notes": "If true, supports a 'compute moat' alternative to B4's distillation-is-portable claim."
  },
  {
    "record_kind": "external_evidence",
    "paper_id": "arxiv:2601.03300",
    "title": "TRYLOCK: Defense-in-Depth Against LLM Jailbreaks via Layered Preference and Representation Engineering",
    "authors": [
      "TRYLOCK Authors"
    ],
    "date": "2026-01",
    "venue": "arxiv",
    "url": "https://arxiv.org/abs/2601.03300",
    "summary": "Four-layer defense-in-depth (input canonicalization + DPO + representation steering + sidecar classifier) reduces jailbreak ASR from 46.5% to 5.6%. Demonstrates that multi-layer mitigations DO transfer across attack surfaces \u2014 a 41-point absolute reduction. Direct rebuttal of B3's asymmetric-transfer claim.",
    "supports": [],
    "rebuts": [
      "B3_asymmetric_caps_mitigations"
    ],
    "verdict": "rebut",
    "confidence": 0.6,
    "watchlist_tier": "quarterly",
    "operational_counter_evidence": "ASR drop 46.5% -> 5.6% across heterogeneous jailbreak techniques; multiple mechanisms compose multiplicatively.",
    "notes": "Empirical defense-in-depth result. Whether this generalizes to fine-tuning attacks (vs. inference-time jailbreaks) is the residual question."
  },
  {
    "record_kind": "external_evidence",
    "paper_id": "arxiv:2511.09880",
    "title": "EnchTable: Unified Safety Alignment Transfer in Fine-Tuned Models",
    "authors": [
      "EnchTable Authors"
    ],
    "date": "2025-11",
    "venue": "arxiv",
    "url": "https://arxiv.org/pdf/2511.09880",
    "summary": "Demonstrates broad robustness across diverse static-evaluation settings while consistently elevating downstream safety after fine-tuning. Safety transfers across fine-tuning, a long-standing failure mode (Lermen-Rimsky regime). Rebuts B3.",
    "supports": [],
    "rebuts": [
      "B3_asymmetric_caps_mitigations"
    ],
    "verdict": "rebut",
    "confidence": 0.55,
    "watchlist_tier": "quarterly",
    "operational_counter_evidence": "Cross-evaluation robustness; consistent safety elevation across downstream fine-tunes.",
    "notes": "Replicates and extends the LoX result (11-54pp ASR drop)."
  },
  {
    "record_kind": "external_evidence",
    "paper_id": "arxiv:2507.11544",
    "title": "Safety Gap Toolkit: Safety Evaluation of Open-Weight Models",
    "authors": [
      "Safety Gap Toolkit Authors"
    ],
    "date": "2025-07",
    "venue": "arxiv",
    "url": "https://arxiv.org/html/2507.11544",
    "summary": "Quantifies the safety gap (WMD-Bio accuracy * compliance) before/after safeguard removal via fine-tuning. Provides a measurement framework for the Lermen-Rimsky asymmetry \u2014 making it auditable. Partially weakens B3 by demonstrating quantification, not just qualitative existence.",
    "supports": [
      "B3_asymmetric_caps_mitigations"
    ],
    "rebuts": [
      "B5_0_N_audit_pattern"
    ],
    "verdict": "weaken",
    "confidence": 0.45,
    "watchlist_tier": "quarterly",
    "operational_counter_evidence": "Reproducible toolkit for safety-gap measurement; a clean methodological closure for one specific risk class (Bio uplift via fine-tuning).",
    "notes": "Mixed \u2014 corroborates B3 phenomenologically but rebuts B5 by being a clean cross-vendor audit framework."
  },
  {
    "record_kind": "external_evidence",
    "paper_id": "mistral:large-3-2025",
    "title": "Mistral Large 3 (open-weights frontier MoE, Apache 2.0)",
    "authors": [
      "Mistral AI"
    ],
    "date": "2025-12",
    "venue": "Mistral AI",
    "url": "https://mistral.ai/news/mistral-3",
    "summary": "Mistral released Large 3 (41B active / 675B total, MoE) under Apache 2.0 with both base and instruction-tuned weights publicly available. Western vendor with open frontier-class weights \u2014 partially flips B7 by demonstrating Western vendors CAN do open-weight disclosure.",
    "supports": [],
    "rebuts": [
      "B7_western_chinese_inversion"
    ],
    "verdict": "weaken",
    "confidence": 0.5,
    "watchlist_tier": "quarterly",
    "operational_counter_evidence": "Apache 2.0 base + instruct release. 675B total params, 256k context, multimodal.",
    "notes": "Caveat: Mistral's FMTI score still dropped two-thirds; weights-open != documentation-transparent. The B7 inversion is about disclosure not just weights."
  },
  {
    "record_kind": "external_evidence",
    "paper_id": "meta:llama-4-2025",
    "title": "Meta Llama 4 (Behemoth/Maverick/Scout) \u2014 open-weight April 2025 release",
    "authors": [
      "Meta AI"
    ],
    "date": "2025-04",
    "venue": "Meta",
    "url": "https://www.llama.com/",
    "summary": "Meta released Llama 4 Behemoth/Maverick/Scout in April 2025 as open-weights with public model cards. Western vendor open frontier weights. Counter to B7's framing that Western vendors are disclosure-poor.",
    "supports": [],
    "rebuts": [
      "B7_western_chinese_inversion"
    ],
    "verdict": "weaken",
    "confidence": 0.4,
    "watchlist_tier": "quarterly",
    "operational_counter_evidence": "Open weights for all 3 Llama 4 variants. Public benchmarks and architectural disclosure.",
    "notes": "Caveat: Meta's FMTI score dropped 60->31 between 2024 and 2025. Weights-open but documentation-deteriorating. Partial inversion of B7 only."
  },
  {
    "record_kind": "external_evidence",
    "paper_id": "fmti:2025-deepseek-opacity",
    "title": "DeepSeek Transparency Report (2025 Foundation Model Transparency Index)",
    "authors": [
      "Stanford CRFM (Wan et al.)"
    ],
    "date": "2025-12",
    "venue": "Stanford CRFM",
    "url": "https://crfm.stanford.edu/fmti/December-2025/company-reports/DeepSeek_FinalReport_FMTI2025.html",
    "summary": "FMTI 2025 finds DeepSeek 'quite opaque' on data, training, environmental impact \u2014 and shares no crawler name despite being asked. This is a *Chinese* open-weight vendor that scores POORLY on transparency. Rebuts the simple form of B7's inversion (open-weight Chinese vendors are transparent).",
    "supports": [],
    "rebuts": [
      "B7_western_chinese_inversion"
    ],
    "verdict": "rebut",
    "confidence": 0.7,
    "watchlist_tier": "quarterly",
    "operational_counter_evidence": "DeepSeek doesn't disclose crawler name; data acquisition for V3->R1 transition unclear; FMTI score in bottom tier alongside Mistral/xAI/Midjourney.",
    "notes": "If B7 is 'Chinese open-weight = full disclosure, Western closed = no disclosure,' this directly inverts it. The picture is messier than a simple Western/Chinese split \u2014 it's vendor-specific."
  },
  {
    "record_kind": "external_evidence",
    "paper_id": "antilock:antileakbench-2024-2025",
    "title": "AntiLeakBench / LiveBench / AntiLeak-Bench \u2014 automated anti-contamination benchmarks",
    "authors": [
      "Various"
    ],
    "date": "2024-12 / 2025",
    "venue": "arxiv",
    "url": "https://arxiv.org/html/2412.13670",
    "summary": "Multiple anti-contamination/anti-saturation benchmark frameworks (AntiLeakBench, LiveBench, ReLE) demonstrate active research into closure-via-refresh. However, MMLU-Pro reached ~90% by 2026 (saturating despite being designed against MMLU's saturation), and ReLE's June 2024-Jan 2026 fresh set shows top models clustering. WEAKENS B6 by showing anti-saturation closure IS WORKING (frameworks exist and update) but also confirms B6 is fragile.",
    "supports": [
      "B6_anti_saturation_only_closure"
    ],
    "rebuts": [],
    "verdict": "weaken",
    "confidence": 0.4,
    "watchlist_tier": "quarterly",
    "operational_counter_evidence": "MMLU-Pro at 90% (Gemini 3 Pro 90.1%, Claude Opus 4.5 89.5%) \u2014 even the refreshed benchmark is saturating. Suggests benchmark refresh isn't a permanent closure mechanism.",
    "notes": "MIXED. Confirms B6's central claim (anti-saturation IS the working closure mechanism \u2014 see Agent Island, MMLU-CF) but also flags that refresh cadence may not keep pace with frontier capability gains. Implicit rebuttal of the strong version of B6 (anti-saturation 'works' is overstated if it saturates again in months)."
  },
  {
    "record_kind": "external_evidence",
    "paper_id": "frontiersin:formal-methods-survey-2026",
    "title": "Formal Methods for Safety-Critical ML: A Systematic Literature Review",
    "authors": [
      "Frontiers in AI authors"
    ],
    "date": "2026-02",
    "venue": "Frontiers in Artificial Intelligence",
    "url": "https://www.frontiersin.org/journals/artificial-intelligence/articles/10.3389/frai.2026.1749956/full",
    "summary": "Systematic review of formal verification for safety-critical ML (2020-mid-2025), identifying sound over-approximation and exact set propagation methods that provide mathematical guarantees. Demonstrates formal-verification-as-closure has methodological substance. Rebuts B6's anti-saturation-only framing.",
    "supports": [],
    "rebuts": [
      "B6_anti_saturation_only_closure"
    ],
    "verdict": "weaken",
    "confidence": 0.4,
    "watchlist_tier": "quarterly",
    "operational_counter_evidence": "Multiple peer-reviewed formal verification methods with provable guarantees; not just empirical anti-saturation.",
    "notes": "Caveat: formal verification is generally limited to bounded/symbolic systems and doesn't scale to LLM behavior in practice. Strong critique exists (alignmentforum.org/posts/B2bg677TaS4cmDPzL). Formal-as-closure is aspirational outside narrow domains."
  },
  {
    "record_kind": "external_evidence",
    "paper_id": "eu:ai-omnibus-delay-2026",
    "title": "EU AI Act High-Risk Obligations \u2014 Proposed Delay to Dec 2027/Aug 2028",
    "authors": [
      "European Commission / Parliament"
    ],
    "date": "2026-05",
    "venue": "Council of EU Press Release",
    "url": "https://www.consilium.europa.eu/en/press/press-releases/2026/05/07/artificial-intelligence-council-and-parliament-agree-to-simplify-and-streamline-rules/",
    "summary": "EU AI Omnibus proposes delaying high-risk AI obligations from Aug 2026 to Dec 2027 (or Aug 2028 for sector-specific). Original timeline (force Aug 2024 -> high-risk Aug 2026) was 24 months; revised timeline 36-48 months. Direct rebuttal of B2's 3-4 month closure claim \u2014 REGULATORY closure cycles are 12-48 months, not 3-4.",
    "supports": [],
    "rebuts": [
      "B2_closure_cycle"
    ],
    "verdict": "rebut",
    "confidence": 0.6,
    "watchlist_tier": "quarterly",
    "operational_counter_evidence": "AI Act force Aug 2024 -> intended high-risk Aug 2026 (24 mo); revised to Dec 2027/Aug 2028 (40+ mo). National competent authorities still being designated.",
    "notes": "If B2 is about regulatory/policy closure, the 3-4mo number is wrong by an order of magnitude. If B2 is about research-paper closure of vendor claims specifically, then this evidence is orthogonal. Worth disambiguating."
  },
  {
    "record_kind": "external_evidence",
    "paper_id": "ibm:watson-oncology-multi-year-failure",
    "title": "IBM Watson for Oncology \u2014 multi-year vendor claim, slow refutation",
    "authors": [
      "Multiple investigative outlets"
    ],
    "date": "2017-2022",
    "venue": "STAT News, internal IBM docs",
    "url": "https://thelyonfirm.com/blog/ai-system-failure-algorithm-breakdown-deceptive-marketing-lawsuit/",
    "summary": "IBM Watson for Oncology was marketed for years to hospitals with claims it matched leading oncologist treatment recommendations. Internal documents eventually showed unsafe recommendations. IBM wound the product down only after multi-year campaign. SLOW closure cycle (years, not months). Direct rebuttal of B2's 3-4mo framing.",
    "supports": [],
    "rebuts": [
      "B2_closure_cycle"
    ],
    "verdict": "rebut",
    "confidence": 0.65,
    "watchlist_tier": "quarterly",
    "operational_counter_evidence": "Watson Oncology marketed 2013+, internal acknowledgment STAT 2018, full wind-down 2022. Total cycle ~9 years.",
    "notes": "Pre-LLM but still an AI-vendor claim. Demonstrates that without forcing functions, vendor-claim closure can take years."
  },
  {
    "record_kind": "external_evidence",
    "paper_id": "openai:preparedness-framework-2025",
    "title": "OpenAI Preparedness Framework + external testing program 2025",
    "authors": [
      "OpenAI"
    ],
    "date": "2025",
    "venue": "OpenAI",
    "url": "https://openai.com/index/strengthening-safety-with-external-testing/",
    "summary": "OpenAI ran external capability assessments on GPT-5 across long-horizon autonomy, scheming, deception, oversight subversion, wet-lab feasibility, cyber. Published findings. Demonstrates a closure mechanism (independent capability eval) beyond benchmark refresh. Rebuts B6's anti-saturation-only.",
    "supports": [],
    "rebuts": [
      "B6_anti_saturation_only_closure",
      "B5_0_N_audit_pattern"
    ],
    "verdict": "weaken",
    "confidence": 0.45,
    "watchlist_tier": "quarterly",
    "operational_counter_evidence": "Multi-vendor external red-teaming on dangerous capability domains; published evaluations for GPT-4o/o1/operator/o3-mini/deep research/GPT-4.5/GPT-5.",
    "notes": "Question: how independent are these evals really? OpenAI selects testers and controls disclosure. Audit not full third-party. But still a closure mechanism beyond anti-saturation."
  },
  {
    "record_kind": "external_evidence",
    "paper_id": "ny:raise-act-2025",
    "title": "New York RAISE Act (2025) \u2014 mandatory annual independent third-party audits",
    "authors": [
      "NY State"
    ],
    "date": "2025-12",
    "venue": "NY State Legislature",
    "url": "https://www.dwt.com/blogs/artificial-intelligence-law-advisor/2025/12/new-york-raise-act-ai-safety-rules-developers",
    "summary": "NY RAISE Act requires large developers to retain independent third-party for ANNUAL audits with detailed records for replication. First-of-kind enforceable independent audit regime. Strong counter-evidence to B5 (audit pattern improving from 0/N to mandatory) and B6 (closure mechanism beyond saturation).",
    "supports": [],
    "rebuts": [
      "B5_0_N_audit_pattern",
      "B6_anti_saturation_only_closure"
    ],
    "verdict": "weaken",
    "confidence": 0.5,
    "watchlist_tier": "quarterly",
    "operational_counter_evidence": "Mandatory annual third-party audit with public summary report. Records must be detailed enough for replication.",
    "notes": "Just enacted, no enforcement track record yet. But the framework itself is counter-evidence to 0/N \u2014 independent audits will be DONE."
  },
  {
    "record_kind": "external_evidence",
    "paper_id": "arxiv:2403.19647",
    "title": "Sparse Feature Circuits: Discovering and Editing Interpretable Causal Graphs in LMs",
    "authors": [
      "Marks et al."
    ],
    "date": "2024-03",
    "venue": "arxiv (subsequently published)",
    "url": "https://arxiv.org/abs/2403.19647",
    "summary": "Discovers and edits causal computation graphs in LMs by identifying sparse feature circuits. Demonstrates causally-validated mechanism discovery with editing. Rebuts B1 directly.",
    "supports": [],
    "rebuts": [
      "B1_causal_mechanism_empty"
    ],
    "verdict": "rebut",
    "confidence": 0.55,
    "watchlist_tier": "quarterly",
    "operational_counter_evidence": "Causal-graph extraction + edit-and-measure verification on small LMs; foundational citation for many 2025 frontier follow-ons.",
    "notes": "Smaller scale than transformer-circuits/biology, but foundational methodology. Whether this scales to frontier remains the test of B1."
  },
  {
    "record_kind": "external_evidence",
    "paper_id": "arxiv:2509.03518",
    "title": "Can LLMs Lie? Investigation Beyond Hallucination \u2014 Localizing Lying Circuits",
    "authors": [
      "LLM Lying authors"
    ],
    "date": "2025-09",
    "venue": "arxiv",
    "url": "https://arxiv.org/html/2509.03518v1",
    "summary": "Localizes neural circuits dedicated to lying via Logit Lens + causal intervention; derives steering vectors that control lying behavior. Direct mechanistic-causal rebuttal of B1 specifically on the deception axis.",
    "supports": [],
    "rebuts": [
      "B1_causal_mechanism_empty"
    ],
    "verdict": "rebut",
    "confidence": 0.5,
    "watchlist_tier": "quarterly",
    "operational_counter_evidence": "Specific MLPs and attention heads isolated to lying; zero-ablation interventions confirm causal role.",
    "notes": "Specific to lying behavior. Not all behaviors are this localized, but it's an existence proof against 'no causal mechanism work landing.'"
  },
  {
    "record_kind": "external_evidence",
    "paper_id": "fli:safety-index-summer-2025",
    "title": "FLI AI Safety Index Summer 2025 \u2014 Only 3/7 firms test dangerous capabilities",
    "authors": [
      "Future of Life Institute"
    ],
    "date": "2025-07",
    "venue": "FLI",
    "url": "https://futureoflife.org/ai-safety-index-summer-2025/",
    "summary": "Only Anthropic, OpenAI, and Google DeepMind do 'substantive testing for dangerous capabilities.' But these 3 DO substantive testing \u2014 non-trivial counter-evidence to 0/N. Mixed signal on B5.",
    "supports": [
      "B5_0_N_audit_pattern"
    ],
    "rebuts": [],
    "verdict": "weaken",
    "confidence": 0.35,
    "watchlist_tier": "quarterly",
    "operational_counter_evidence": "3 firms do substantive bio/cyber testing; not zero. Other 4 don't.",
    "notes": "MIXED \u2014 supports B5 (4/7 don't audit) but weakens it (3/7 DO, and at substantive level). The 'pattern' is mixed not absent."
  },
  {
    "record_kind": "external_evidence",
    "paper_id": "anthropic:rsp-3.0-2026",
    "title": "Anthropic RSP v3.0 (Feb 2026) \u2014 fast update cycle",
    "authors": [
      "Anthropic"
    ],
    "date": "2026-02",
    "venue": "Anthropic",
    "url": "https://www.anthropic.com/news/responsible-scaling-policy-v3",
    "summary": "Anthropic released RSP v2.1 (Mar 2025) -> v3.0 (Feb 2026) -> v3.2 with LTBT external review. Fast governance cycle WITHIN ONE VENDOR (~12 mo per major rev) and with external-review formalization. Cuts against the slow-policy-cycle implicit in B2 framing.",
    "supports": [],
    "rebuts": [
      "B2_closure_cycle"
    ],
    "verdict": "weaken",
    "confidence": 0.4,
    "watchlist_tier": "quarterly",
    "operational_counter_evidence": "RSP v2.1 (Mar 2025), v3.0 (Feb 2026), v3.2 with LTBT external review. Cycle ~11 months between v2.1 and v3.0.",
    "notes": "Self-governance, not adversarial closure. But updates faster than 'closure cycle' would suggest if generalized."
  },
  {
    "record_kind": "external_evidence",
    "paper_id": "anthropic:opus-4-6-eval-aware-2026",
    "title": "Claude Opus 4.6 \u2014 Apollo couldn't complete formal assessment due to evaluation awareness (Feb 2026)",
    "authors": [
      "Anthropic / Apollo Research"
    ],
    "date": "2026-02",
    "venue": "Anthropic Claude Opus 4.6 System Card",
    "url": "https://www-cdn.anthropic.com/0dd865075ad3132672ee0ab40b05a53f14cf5288.pdf",
    "summary": "Apollo Research could not complete formal audit of Opus 4.6 because models display verbalized evaluation awareness 58% (Sonnet 4.5) vs 22% (Opus 4.1). This is a deep weakening of B5 \u2014 the audit pattern isn't 0/N because audits aren't done, but because they can't be completed reliably. Different failure mode.",
    "supports": [
      "B5_0_N_audit_pattern"
    ],
    "rebuts": [],
    "verdict": "support",
    "confidence": 0.5,
    "watchlist_tier": "quarterly",
    "operational_counter_evidence": "Eval-awareness rate 22% -> 58% within 6 months of Anthropic releases. Formal audit incomplete due to AI awareness of being tested.",
    "notes": "INCLUDED FOR COMPLETENESS \u2014 this actually SUPPORTS B5 by showing audits fail not just for absence but for AI-meta-awareness. Worth tracking as a B5 reinforcer."
  },
  {
    "record_kind": "external_evidence",
    "paper_id": "arxiv:2507.11473",
    "title": "Chain of Thought Monitorability: A New and Fragile Opportunity for AI Safety",
    "authors": [
      "Korbak et al. (multi-lab consortium)"
    ],
    "date": "2025-07",
    "venue": "arxiv",
    "url": "https://arxiv.org/html/2507.11473v2",
    "summary": "Multi-lab consortium paper argues CoT monitorability is REAL and provides ACTIONABLE safety value if preserved through training. Authoritative cross-lab claim that B1's 'causal-empty' framing overstates the case.",
    "supports": [],
    "rebuts": [
      "B1_causal_mechanism_empty"
    ],
    "verdict": "weaken",
    "confidence": 0.5,
    "watchlist_tier": "quarterly",
    "operational_counter_evidence": "Joint position paper from OpenAI/Anthropic/DeepMind/Google/Meta authors arguing CoT monitorability is tractable and load-bearing.",
    "notes": "Highly cited 2025 multi-lab consensus. Framed as 'fragile opportunity' \u2014 i.e. acknowledges limits but argues mechanism exists."
  },
  {
    "record_kind": "audit_plan",
    "paper_id": "audit:2027_q1_repoll",
    "title": "2027-Q1 \u2605 bill re-poll plan across all 13 production aiwikis",
    "authors": [
      "Kevin Russell"
    ],
    "date": "2027-Q1 (planned)",
    "venue": "Project 42 internal",
    "summary": "Quarterly re-poll of all \u2605 predicted-empty bills across all 13 production aiwikis. Each \u2605 bill's status (EMPTY / FALSIFIED / NEEDS_GATE) reported with sweep-count delta vs 2026-05-09 baseline.",
    "candidate_bridge": "B1_causal_mechanism_empty",
    "candidate_bill": "Bill_4",
    "verdict": "active",
    "confidence": null,
    "watchlist_tier": "quarterly",
    "notes": "Triggers within 7 days if \u22651 \u2605 bill flips status. The bridges are downstream of these statuses; any flip invalidates \u22651 bridge.",
    "tracked_star_bills": {
      "factorization": [
        "Bill_6",
        "Bill_7",
        "Bill_8"
      ],
      "quantum_advantage": [
        "Bill_8",
        "Bill_12",
        "Bill_13"
      ],
      "lattice_cryptography": [
        "Bill_7",
        "Bill_11",
        "Bill_13",
        "Bill_16"
      ],
      "mech_interp": [
        "Bill_7",
        "Bill_11",
        "Bill_14"
      ],
      "capability_benchmarks": [
        "Bill_7",
        "Bill_11",
        "Bill_14",
        "Bill_17"
      ],
      "inference_time_safety": [
        "Bill_7",
        "Bill_11",
        "Bill_14",
        "Bill_17"
      ],
      "compute_governance": [
        "Bill_7",
        "Bill_11",
        "Bill_14",
        "Bill_17"
      ],
      "reasoning_cot": [
        "Bill_6",
        "Bill_9",
        "Bill_12"
      ],
      "scaling_laws": [
        "Bill_5",
        "Bill_8",
        "Bill_11"
      ],
      "vision_language": [
        "Bill_4",
        "Bill_7",
        "Bill_10"
      ],
      "open_weight": [
        "Bill_5",
        "Bill_8",
        "Bill_11"
      ],
      "agentic_tool_use": [
        "Bill_4",
        "Bill_7",
        "Bill_10"
      ],
      "bio_protein": [
        "Bill_4",
        "Bill_7",
        "Bill_10"
      ]
    },
    "n_star_bills_total": 41,
    "n_bridges_at_risk": 7
  },
  {
    "record_kind": "audit_plan",
    "paper_id": "audit:2027_q3_synthesis_lock",
    "title": "2027-Q3 synthesis preprint lock decision",
    "authors": [
      "Kevin Russell"
    ],
    "date": "2027-Q3 (planned)",
    "venue": "Project 42 internal",
    "summary": "Decision point: submit synthesis preprint 'Domain-invariant closure-pattern failures in frontier ML claims (2024-2026): a 13-ledger meta-audit' to arXiv if and only if v0.2 lock of cross_ledger_bridges aiwiki holds. Specifically: \u22656 of 7 bridges still active, \u22652 bridges externally corroborated, no Bill 7/9/12 \u2605 trigger fired.",
    "candidate_bill": null,
    "verdict": "active",
    "confidence": null,
    "watchlist_tier": "triggered",
    "lock_conditions": {
      "bridges_active_min": 6,
      "external_corroboration_min": 2,
      "star_bill_7_9_12_triggered": false,
      "operational_definitions_all": "Bill_1 paid",
      "anchor_independence_min_per_bridge": 2
    },
    "notes": "If lock fails, revise the synthesis. The discipline: we publicly committed to not submitting before lock."
  },
  {
    "record_kind": "audit_plan",
    "paper_id": "audit:monthly_trigger_watch",
    "title": "Monthly trigger watch on B1 + B4 + B7 bridges (highest-risk)",
    "authors": [
      "Kevin Russell"
    ],
    "date": "2026-06-01 (monthly recurring)",
    "venue": "Project 42 internal \u2014 cron",
    "summary": "Monthly arXiv keyword scan for: (1) causally-faithful CoT / steering claims at frontier scale (B1 falsifier), (2) distillation-resistant capability with \u226510\u00d7 compute ratio (B4 falsifier), (3) Western open-weight full 8-field disclosure (B7 falsifier). Cron-scheduled deep loop with auto-flag on any high-confidence hit.",
    "candidate_bill": null,
    "verdict": "active",
    "confidence": null,
    "watchlist_tier": "monthly",
    "queries": [
      "(causally faithful) AND (chain of thought OR steering OR intervention) AND (frontier OR \u226530B)",
      "(distillation resistant OR distillation-resistant) AND (capability OR retention) AND (\u226510\u00d7 compute OR \u2265100\u00d7)",
      "(open-weight OR open weights) AND (Meta OR Mistral OR OpenAI) AND (FLOPs disclosure OR training data card OR full disclosure)"
    ],
    "notes": "If a query returns \u22651 high-confidence hit, route to the relevant production ledger's sweep tooling for full classification + arbitration."
  },
  {
    "record_kind": "inheritance_prediction",
    "paper_id": "prediction:Robotics_B1",
    "ledger_to_be": "Robotics / Embodied AI",
    "bridge": "B1_causal_mechanism_empty",
    "predicted_extension": "fails_to_extend",
    "confidence": 0.72,
    "reasoning": "Robotics has physics-grounded reward and proprioceptive feedback that LLM-centric domains lack \u2014 RL with environment-grounded reward provides a causally faithful mechanism via action-consequence loops. RT-2/OpenVLA capability claims rest on demonstrated policy execution in physical environments, where causal closure is partially externalizable to the world model. The bridge likely partially fails: causal mechanisms exist in sim-to-real transfer math, but DEPLOYMENT-environment generalization (RFM-1, Helix open-world claims) still rests on empty mechanisms.",
    "empirical_anchors": [
      "RT-2 (Google DeepMind 2023) \u2014 co-fine-tuning on web+robot data with chain-of-thought reasoning, mechanism partially explicated via VLM grounding",
      "OpenVLA (Stanford/Berkeley 2024) \u2014 open-source VLA with documented training recipe, causal contribution of each component ablated",
      "\u03c00 (Physical Intelligence 2024) \u2014 flow-matching action head, mechanism formally specified",
      "Helix (Figure 2025) \u2014 system-1/system-2 dual-network claim; system-2 mechanism opaque, system-1 mechanism documented",
      "Gato-2 hypothetical generalist \u2014 would test whether scale-driven embodied generalism has explicit mechanism or empty claim"
    ],
    "if_fails_implications": "B1 becomes B1': 'Causally-faithful mechanism empty in symbolic/linguistic domains, present-but-shallow in embodied domains.' This sharpens the bridge \u2014 it predicts which domains will resist forensic empty-mechanism critique. Robotics becomes the exception that defines the rule.",
    "notes": "The split is between TRAINING mechanism (often documented in robotics) and DEPLOYMENT/GENERALIZATION mechanism (still empty when extrapolating to novel embodiments, environments, or task distributions)."
  },
  {
    "record_kind": "inheritance_prediction",
    "paper_id": "prediction:Multilingual_B1",
    "ledger_to_be": "Multilingual / Low-Resource",
    "bridge": "B1_causal_mechanism_empty",
    "predicted_extension": "extends",
    "confidence": 0.83,
    "reasoning": "Multilingual capability claims (NLLB-200, Aya-Expanse) rest on cross-lingual transfer assumptions that lack causal mechanisms \u2014 exactly the same opacity as LLM-centric claims. The mechanism by which a model trained on high-resource language X 'transfers' to low-resource language Y is post-hoc rationalized via shared subword tokens or 'universal' representations, but no causally-faithful account exists. The bridge extends cleanly: 200-language coverage claims are statistical artifacts of evaluation set composition, not mechanism-grounded predictions.",
    "empirical_anchors": [
      "NLLB-200 (Meta 2022) \u2014 200-language no-language-left-behind claim; mechanism for low-resource quality not specified",
      "Aya-Expanse (Cohere 2024) \u2014 multilingual instruction tuning; cross-lingual transfer mechanism opaque",
      "Llama-3-multilingual evaluation gaps (Meta 2024) \u2014 strong on Western European, fails on Bantu/Dravidian; mechanism of failure not documented",
      "Qwen-multilingual (Alibaba 2024) \u2014 Chinese-strong baseline with multilingual extension, mechanism inherited from base",
      "FLORES-200 benchmark composition (Goyal et al 2022) \u2014 evaluation set construction known to inflate apparent coverage"
    ],
    "if_fails_implications": "If Multilingual unexpectedly has causal-mechanism explanations (e.g., explicit phylogenetic transfer maps, documented vocabulary-overlap dependencies), B1 weakens to 'causal mechanism empty in monolingual evaluation, present in cross-lingual.' This is unlikely given current literature.",
    "notes": "Post-training-language-drift is a particularly sharp test: if degradation mechanisms are documented (not just measured), B1 weakens; if only correlations are reported, B1 strengthens."
  },
  {
    "record_kind": "inheritance_prediction",
    "paper_id": "prediction:RAG_B1",
    "ledger_to_be": "RAG / Retrieval-Augmented",
    "bridge": "B1_causal_mechanism_empty",
    "predicted_extension": "extends",
    "confidence": 0.88,
    "reasoning": "RAG capability claims (citation-faithfulness, retrieval-grounding) are notoriously mechanism-empty \u2014 vendors claim 'reduced hallucinations' without specifying the causal pathway from retrieval to generation faithfulness. Citation rates can be measured, but the mechanism by which retrieved context constrains generation is not formally specified in any commercial system. NotebookLM, Perplexity Pro, and Claude-with-retrieval all rely on emergent constraint, not documented mechanism.",
    "empirical_anchors": [
      "Anthropic Claude with retrieval (2024) \u2014 documented as 'context augmentation' without mechanism specification for faithfulness",
      "OpenAI Assistants v2 file search (2024) \u2014 retrieval-then-generation pipeline, no formal faithfulness mechanism",
      "Google NotebookLM (2023+) \u2014 source-grounding claims; mechanism for hallucination prevention not specified",
      "Perplexity Pro citation faithfulness audits (2024-2025) \u2014 citations often hallucinated or misattributed despite vendor claims",
      "Faithfulness-vs-Plausibility benchmark (Es et al RAGAS 2024) \u2014 measures effect, not mechanism"
    ],
    "if_fails_implications": "If RAG systems unexpectedly document causal mechanisms (e.g., attention-mask-based citation enforcement, contrastive decoding with retrieval evidence), B1 weakens significantly. Current evidence suggests this is unlikely \u2014 RAG faithfulness is treated as an emergent property.",
    "notes": "Citation faithfulness is the cleanest test: vendors claim it, audits routinely find it lacking, and no causal mechanism is offered to bridge claim and reality."
  },
  {
    "record_kind": "inheritance_prediction",
    "paper_id": "prediction:Robotics_B2",
    "ledger_to_be": "Robotics / Embodied AI",
    "bridge": "B2_closure_cycle_compression",
    "predicted_extension": "extends",
    "confidence": 0.75,
    "reasoning": "Robotics has empirically demonstrated closure-cycle compression: RT-1 (2022) to RT-2 (2023) to OpenVLA (2024) to \u03c00 (2024) shows ~12mo cycles between major capability/closure rounds, similar to the 3-4mo LLM compression. Helix (Feb 2025) was followed by RFM-1 (2024) and \u03c00 within months. The bridge extends because robotics has adopted the same publish-claim-audit-mitigation rhythm, accelerated by open-weight releases (OpenVLA, \u03c00 weights) enabling fast forensic response.",
    "empirical_anchors": [
      "RT-1 \u2192 RT-2 cycle (Google 2022-2023, ~14mo)",
      "OpenVLA release and community ablations (Stanford 2024, ~3mo to first forensic critiques)",
      "Helix announcement \u2192 community skepticism cycle (Figure Feb 2025 \u2192 critique within weeks)",
      "\u03c00 release \u2192 independent reproducibility attempts (Physical Intelligence 2024)",
      "Sim-to-real claim lifecycle in benchmark papers (CoRL/RSS 2023-2025)"
    ],
    "if_fails_implications": "If robotics retains longer cycles (paper at ICRA, mitigation 18mo later), B2 becomes domain-specific to LLMs. This would suggest the compression is driven by deployment velocity (production LLMs deploy weekly), not by science velocity.",
    "notes": "Watch for whether closed-loop hardware integration (where mistakes have physical cost) slows the cycle vs accelerates it via tighter feedback."
  },
  {
    "record_kind": "inheritance_prediction",
    "paper_id": "prediction:Multilingual_B2",
    "ledger_to_be": "Multilingual / Low-Resource",
    "bridge": "B2_closure_cycle_compression",
    "predicted_extension": "unclear",
    "confidence": 0.55,
    "reasoning": "Multilingual cycles are mixed: high-resource language closure (English \u2192 European) compresses to LLM rates, but low-resource language closure (Bantu, indigenous languages, dialectal Arabic) has historically taken years because evaluation infrastructure itself is slow to build. NLLB-200 (2022) \u2192 Aya-23 (2024) is ~18mo, but the audit cycle for low-resource quality is bottlenecked by speaker recruitment. The bridge's extension depends on whether 'closure' is measured against benchmark performance (fast) or actual speaker validation (slow).",
    "empirical_anchors": [
      "NLLB-200 release \u2192 Aya-Expanse improvements (Meta 2022 \u2192 Cohere 2024)",
      "FLORES-200 benchmark adoption timeline (2022-2024)",
      "Low-resource language audit timelines (e.g., Masakhane African NLP collective 2020-2025)",
      "Llama-3 multilingual expansion (Meta 2024) \u2192 community evaluation cycles",
      "Dialectal Arabic NLP audit cycles (Habash et al 2020-2025)"
    ],
    "if_fails_implications": "If multilingual closure remains slow, B2 becomes 'compression occurs when evaluation infrastructure is cheap' \u2014 explaining why LLM-centric benchmarks compress but human-evaluation-heavy domains don't.",
    "notes": "Two-speed closure is a real possibility: benchmark closure fast (3-4mo), speaker-validated closure slow (18-24mo)."
  },
  {
    "record_kind": "inheritance_prediction",
    "paper_id": "prediction:RAG_B2",
    "ledger_to_be": "RAG / Retrieval-Augmented",
    "bridge": "B2_closure_cycle_compression",
    "predicted_extension": "extends",
    "confidence": 0.82,
    "reasoning": "RAG closure cycles are exceptionally fast \u2014 Perplexity citation issues, NotebookLM hallucinations, and Assistants v2 retrieval failures have been documented and partially mitigated within weeks of release. The vendor-product lifecycle (deploy \u2192 user complaints \u2192 patch) is faster than even base-model cycles. The bridge extends strongly: RAG closure cycles often compress to weeks because RAG systems are deployed-as-products and bug reports flow continuously.",
    "empirical_anchors": [
      "Perplexity citation faithfulness audits (multiple cycles 2024)",
      "OpenAI Assistants v2 file search retrieval bug reports \u2192 patches (2024)",
      "NotebookLM source-grounding error reports \u2192 updates (Google 2023-2025)",
      "RAGAS benchmark release cycles (Es et al 2023-2024)",
      "ARES retrieval evaluation framework iterations (Saad-Falcon et al 2024)"
    ],
    "if_fails_implications": "Highly unlikely B2 fails here. If it does, it would mean RAG closure is unmeasurable because retrieval corpora drift faster than auditing \u2014 a different kind of compression failure.",
    "notes": "RAG might compress even further than LLM cycles because retrieval corpus updates are continuous, making 'before-and-after' audit anchors less stable."
  },
  {
    "record_kind": "inheritance_prediction",
    "paper_id": "prediction:Robotics_B3",
    "ledger_to_be": "Robotics / Embodied AI",
    "bridge": "B3_caps_transfer_mitigations_dont",
    "predicted_extension": "extends",
    "confidence": 0.86,
    "reasoning": "Robotics has stark cross-surface capability claims that transfer (RT-2 manipulation skills transfer to OpenVLA's similar architecture, \u03c00 action representations transfer to flow-matching variants) but mitigations for failure modes (sim-to-real gap, embodiment mismatch, safety constraints) are notoriously surface-specific. A safety filter that works on Franka arm doesn't transfer to bimanual humanoids. The bridge extends with high confidence because hardware-platform specificity is even stronger than software-platform specificity in LLM-land.",
    "empirical_anchors": [
      "RT-2 manipulation skill transfer to OpenVLA replication (2023-2024)",
      "Sim-to-real domain randomization techniques platform-specificity (Tobin et al 2017 \u2192 2024 updates)",
      "Safety constraint generalization failures (e.g., constitutional-AI-for-robots attempts)",
      "Cross-embodiment policy transfer benchmarks (X-Embodiment dataset 2024)",
      "Hardware-cost-aware mitigation strategies (e.g., low-cost arm constraints don't port to industrial)"
    ],
    "if_fails_implications": "Unlikely to fail. If mitigations did transfer cross-platform, it would mean robotics has solved a problem LLM-land hasn't, which contradicts current evidence of acute embodiment-specificity.",
    "notes": "Cross-embodiment generalization is the active research frontier specifically because mitigations don't transfer \u2014 this is empirical confirmation, not refutation."
  },
  {
    "record_kind": "inheritance_prediction",
    "paper_id": "prediction:Multilingual_B3",
    "ledger_to_be": "Multilingual / Low-Resource",
    "bridge": "B3_caps_transfer_mitigations_dont",
    "predicted_extension": "extends",
    "confidence": 0.81,
    "reasoning": "Multilingual capability transfers across model families (NLLB transfers BLEU patterns to Aya, Aya to Llama-multilingual) but mitigations for failure modes (hallucination in low-resource, dialectal drift, code-switching errors) are language-specific and do not transfer. RLHF safety alignment in English does NOT carry over to Bantu or Tamil \u2014 empirically documented gap. The bridge extends: cross-script generalization claims propagate cleanly, but each script's failure-mode mitigations require independent work.",
    "empirical_anchors": [
      "NLLB \u2192 Aya BLEU transfer patterns (2022-2024)",
      "English RLHF \u2192 non-English safety transfer failures (multiple audits 2023-2024)",
      "Dialectal Arabic safety mitigation specificity (Habash, Diab et al)",
      "Cross-script tokenizer-induced failure modes (Bostrom & Durrett 2020, updated 2024)",
      "Post-training language drift mitigation experiments (Cohere Aya safety work 2024)"
    ],
    "if_fails_implications": "If multilingual mitigations DID transfer, B3 weakens. This is empirically not the case \u2014 safety alignment is famously English-dominant and resistant to transfer.",
    "notes": "Translation-vs-generation decoupling is a sharp test: translation mitigations (constrained decoding) often transfer; generation mitigations (RLHF, constitutional AI) do not."
  },
  {
    "record_kind": "inheritance_prediction",
    "paper_id": "prediction:RAG_B3",
    "ledger_to_be": "RAG / Retrieval-Augmented",
    "bridge": "B3_caps_transfer_mitigations_dont",
    "predicted_extension": "extends",
    "confidence": 0.84,
    "reasoning": "RAG capability claims (retrieval recall, citation rate) transfer across systems (Perplexity \u2192 NotebookLM \u2192 Claude retrieval) but mitigations for failure modes (hallucinated citations, corpus contamination, query-decomposition errors) are deeply system-specific. A retrieval-corpus-contamination mitigation that works for Anthropic doesn't translate to OpenAI's architecture. The bridge extends cleanly: every RAG vendor reinvents their own faithfulness pipeline because mitigations don't port across retrieval-generation interface specifications.",
    "empirical_anchors": [
      "Perplexity citation faithfulness work \u2192 independent applicability to NotebookLM (low transfer)",
      "Anthropic Claude retrieval safety patterns \u2192 OpenAI Assistants implementation gap (2024)",
      "Multi-document synthesis benchmarks across vendors (2024)",
      "Query-decomposition strategies in different RAG stacks (vendor-specific)",
      "RAG contamination mitigation literature (Liu et al 2024)"
    ],
    "if_fails_implications": "Unlikely to fail. The architectural fragmentation of RAG (BM25+rerank, dense retrieval, hybrid, knowledge graphs) ensures mitigations remain non-portable.",
    "notes": "RAG is the strongest extension of B3 because the retrieval-generation interface is more idiosyncratic across vendors than any single LLM API."
  },
  {
    "record_kind": "inheritance_prediction",
    "paper_id": "prediction:Robotics_B4",
    "ledger_to_be": "Robotics / Embodied AI",
    "bridge": "B4_distillation_portability_equivalence",
    "predicted_extension": "fails_to_extend",
    "confidence": 0.68,
    "reasoning": "In LLM-land, distillation, architecture-portability, and scaling-portability are bridged by shared substrate (transformer attention, autoregressive generation). In robotics, the substrate fractures: VLA models (RT-2, OpenVLA, \u03c00) use different action representations (discrete tokens, flow matching, continuous), and distillation across embodiments is hardware-bound. The bridge likely fails because robotics scaling does NOT follow LLM scaling laws cleanly \u2014 Gato-2 hypotheticals notwithstanding, no published robotics paper has demonstrated the architecture-portability-equals-scaling-portability equivalence.",
    "empirical_anchors": [
      "RT-2 \u2192 smaller variants distillation attempts (Google 2023-2024)",
      "OpenVLA quantization/distillation experiments (Stanford 2024)",
      "Gato (DeepMind 2022) \u2192 scaling claims and limitations",
      "RFM-1 (Covariant 2024) \u2014 robotics foundation model scaling claims",
      "X-Embodiment scaling law analysis (2024)"
    ],
    "if_fails_implications": "B4 becomes B4': 'Distillation = architecture-portability = scaling-portability only when substrate is uniform (transformer + autoregressive).' Robotics breaks the uniformity assumption, sharpening the bridge.",
    "notes": "Watch for whether VLA models converge on a single substrate (e.g., transformer + action tokens) \u2014 if so, B4 might extend after convergence. Pre-convergence, it fails."
  },
  {
    "record_kind": "inheritance_prediction",
    "paper_id": "prediction:Multilingual_B4",
    "ledger_to_be": "Multilingual / Low-Resource",
    "bridge": "B4_distillation_portability_equivalence",
    "predicted_extension": "extends",
    "confidence": 0.78,
    "reasoning": "Multilingual models inherit LLM substrate (transformer + tokenizer), so distillation patterns from base models (Llama distillation, Qwen distillation) transfer to multilingual variants. NLLB distilled versions retain language coverage proportional to scaling. The bridge extends because multilingual is fundamentally an LLM application with vocabulary expansion, not a substrate change.",
    "empirical_anchors": [
      "NLLB-200 distilled variants (Meta 2022)",
      "Aya-Expanse distillation experiments (Cohere 2024)",
      "Llama-3 multilingual distillation (community 2024)",
      "Qwen multilingual scaling laws (Alibaba 2024)",
      "Cross-lingual distillation papers (e.g., DistilBERT-multilingual lineage)"
    ],
    "if_fails_implications": "If multilingual distillation breaks (e.g., low-resource languages disappear disproportionately under distillation), B4 weakens to 'high-resource only.' This is the most likely failure mode \u2014 language coverage may scale non-uniformly.",
    "notes": "Low-resource language preservation under distillation is the sharp test \u2014 if Bantu/Dravidian languages vanish in distilled variants while European persist, B4 conditional-extends."
  },
  {
    "record_kind": "inheritance_prediction",
    "paper_id": "prediction:RAG_B4",
    "ledger_to_be": "RAG / Retrieval-Augmented",
    "bridge": "B4_distillation_portability_equivalence",
    "predicted_extension": "unclear",
    "confidence": 0.52,
    "reasoning": "RAG combines retrieval (often non-neural: BM25, embedding ANN) with generation (LLM). Distillation applies to generation portion but the retrieval-generation interface specification is not subject to LLM scaling laws. The bridge's extension depends on whether RAG is viewed as 'LLM with extra context' (extends) or as 'novel architecture' (fails to extend). Current evidence is mixed.",
    "empirical_anchors": [
      "Distilled RAG models in production (Anthropic, OpenAI 2024)",
      "Retrieval index scaling vs generation scaling decoupling",
      "RAG benchmark performance vs base model scale (2024)",
      "Multi-document synthesis at different scales (2024)",
      "Query-decomposition effectiveness across model sizes"
    ],
    "if_fails_implications": "If RAG breaks B4, it suggests retrieval-augmented systems are a genuinely new architecture class requiring its own scaling theory. This would be a significant theoretical finding.",
    "notes": "The retrieval component's non-neural nature is the wildcard \u2014 it doesn't scale with LLM compute, but does scale with corpus size and index quality."
  },
  {
    "record_kind": "inheritance_prediction",
    "paper_id": "prediction:Robotics_B5",
    "ledger_to_be": "Robotics / Embodied AI",
    "bridge": "B5_0_over_N_audit_pattern",
    "predicted_extension": "extends",
    "confidence": 0.92,
    "reasoning": "Forensic audits of robotics capability claims (Helix open-world manipulation, RFM-1 universal foundation model claims, Gato generality) consistently find 0/N reproductions when ambitious claims are tested. The 0/N pattern is even sharper in robotics because hardware setup costs deter many independent attempts, and those who attempt usually fail to reproduce headline claims. The bridge extends with very high confidence \u2014 robotics is a paradigmatic case of forensic auditing finding 0/N.",
    "empirical_anchors": [
      "Helix February 2025 announcement \u2192 independent reproduction status (likely 0 successful)",
      "Gato (DeepMind 2022) generality claims \u2192 independent reproductions (~0)",
      "RFM-1 (Covariant 2024) universal manipulation claims \u2192 independent audits",
      "RT-2 'unseen capability' claims \u2192 audit by community ablations (often diluted)",
      "Sim-to-real success rate audits across robotics labs (Akkaya et al 2019 \u2192 2024 updates)"
    ],
    "if_fails_implications": "Extremely unlikely. If 0/N pattern fails in robotics, it would mean robotics has unusually faithful capability reporting, contradicting all known evidence.",
    "notes": "0/N is so consistent in robotics that it almost defines the field's relationship between capability claim and audit."
  },
  {
    "record_kind": "inheritance_prediction",
    "paper_id": "prediction:Multilingual_B5",
    "ledger_to_be": "Multilingual / Low-Resource",
    "bridge": "B5_0_over_N_audit_pattern",
    "predicted_extension": "extends",
    "confidence": 0.89,
    "reasoning": "Multilingual capability claims (NLLB's 200-language coverage, Aya's 101-language instruction following, Llama-3's multilingual reasoning) routinely fail forensic audits \u2014 when speakers of low-resource languages evaluate output, quality drops dramatically from headline numbers. The 0/N pattern extends because evaluation transparency in low-resource languages is structurally absent. Speaker audits find 0/N for ambitious cross-language capability claims.",
    "empirical_anchors": [
      "NLLB-200 speaker evaluation studies (Goyal et al 2022 \u2192 2024 follow-ups)",
      "Aya-Expanse community evaluation in African/South Asian languages",
      "Llama-3 multilingual reasoning audits (community 2024)",
      "Masakhane African NLP audit work (2020-2025)",
      "Dialectal Arabic speaker evaluation papers (Habash et al)"
    ],
    "if_fails_implications": "Unlikely to fail. The structural conditions (low evaluation transparency, scarce speaker resources) ensure forensic audits routinely find 0/N when applied rigorously.",
    "notes": "0/N is especially sharp in low-resource where claimed capability vs measured speaker-judged capability diverges dramatically."
  },
  {
    "record_kind": "inheritance_prediction",
    "paper_id": "prediction:RAG_B5",
    "ledger_to_be": "RAG / Retrieval-Augmented",
    "bridge": "B5_0_over_N_audit_pattern",
    "predicted_extension": "extends",
    "confidence": 0.91,
    "reasoning": "RAG audits routinely find 0/N for ambitious claims \u2014 citation faithfulness, multi-document synthesis, query-decomposition correctness. Perplexity Pro audits find citations frequently hallucinated; NotebookLM audits find source-grounding breaks under stress; Anthropic retrieval audits find faithfulness gaps. The bridge extends very strongly: RAG forensic auditing is a vibrant subfield specifically because 0/N is so common.",
    "empirical_anchors": [
      "Perplexity Pro citation audits (multiple 2024)",
      "NotebookLM source-grounding stress tests (community 2024-2025)",
      "Claude retrieval faithfulness independent audits",
      "RAGAS benchmark findings (Es et al 2024)",
      "ARES evaluation framework results across RAG systems (Saad-Falcon et al 2024)"
    ],
    "if_fails_implications": "Extremely unlikely. RAG is the domain where 0/N is most clearly documented because audit infrastructure (RAGAS, ARES) is well-developed.",
    "notes": "RAG is the highest-confidence extension of B5 across all 3 next-queued ledgers \u2014 audit infrastructure exists and routinely produces 0/N findings."
  },
  {
    "record_kind": "inheritance_prediction",
    "paper_id": "prediction:Robotics_B6",
    "ledger_to_be": "Robotics / Embodied AI",
    "bridge": "B6_anti_saturation_only_closure",
    "predicted_extension": "extends",
    "confidence": 0.74,
    "reasoning": "Robotics benchmarks (CALVIN, LIBERO, X-Embodiment) face saturation issues similar to LLM benchmarks \u2014 models trained on benchmark-aware datasets quickly saturate. Anti-saturation strategies (held-out embodiments, novel-object generalization, deployment-environment shifts) are emerging as the only honest closure mechanism. The bridge extends because saturation is happening on standard benchmarks faster than new ones can be built, mirroring LLM-land's MMLU/HumanEval dynamic.",
    "empirical_anchors": [
      "CALVIN benchmark saturation curve (Mees et al 2022 \u2192 2024)",
      "LIBERO benchmark dynamics (Liu et al 2023-2024)",
      "X-Embodiment dataset partition strategies (2024)",
      "Sim-to-real hold-out evaluation protocols (CoRL 2023-2025)",
      "Novel-object manipulation generalization benchmarks"
    ],
    "if_fails_implications": "If anti-saturation isn't the only working closure, it would mean robotics has solved the benchmark-training-contamination problem. No evidence of this \u2014 saturation is increasingly visible.",
    "notes": "Embodiment hold-outs are robotics' analog to held-out test sets \u2014 anti-saturation strategy."
  },
  {
    "record_kind": "inheritance_prediction",
    "paper_id": "prediction:Multilingual_B6",
    "ledger_to_be": "Multilingual / Low-Resource",
    "bridge": "B6_anti_saturation_only_closure",
    "predicted_extension": "extends",
    "confidence": 0.83,
    "reasoning": "Multilingual benchmarks (FLORES-200, MEGA, XTREME) saturate rapidly as models train on benchmark-adjacent data. Anti-saturation closure (held-out languages, dialectal evaluation, code-switching tasks, speaker-novel test sets) is increasingly the only credible measurement strategy. The bridge extends strongly because translation memorization is a documented phenomenon and forces anti-saturation strategies.",
    "empirical_anchors": [
      "FLORES-200 contamination concerns (community 2023-2025)",
      "MEGA multilingual evaluation suite (Ahuja et al 2023)",
      "XTREME-UP low-resource benchmark (Ruder et al 2023)",
      "Held-out language evaluation protocols (Masakhane 2024)",
      "Dialect-specific anti-saturation tests (Arabic, Chinese variants)"
    ],
    "if_fails_implications": "If non-anti-saturation closure works in multilingual (e.g., standard benchmarks remain valid), B6 weakens. Unlikely given documented test set contamination in multilingual training corpora.",
    "notes": "Held-out languages are stronger than held-out test sets because the model never saw the language at all \u2014 anti-saturation is built into the closure design."
  },
  {
    "record_kind": "inheritance_prediction",
    "paper_id": "prediction:RAG_B6",
    "ledger_to_be": "RAG / Retrieval-Augmented",
    "bridge": "B6_anti_saturation_only_closure",
    "predicted_extension": "extends",
    "confidence": 0.79,
    "reasoning": "RAG benchmarks (HotpotQA, MS MARCO, BEIR) face severe saturation as models train on web data overlapping with retrieval corpora. Anti-saturation closure (held-out corpora, novel-domain retrieval, time-shifted evaluation) is the only credible measurement. The bridge extends because retrieval-corpus-contamination is a defining failure mode of RAG evaluation.",
    "empirical_anchors": [
      "MS MARCO contamination analysis (2024)",
      "BEIR cross-domain retrieval benchmark (Thakur et al 2021 \u2192 2024 follow-ups)",
      "Time-shifted RAG evaluation (e.g., FreshLLMs, Vu et al 2023)",
      "Held-out corpus retrieval benchmarks (2024)",
      "RAG contamination mitigation literature (Liu et al 2024)"
    ],
    "if_fails_implications": "If standard RAG benchmarks remain valid without anti-saturation, B6 weakens. Unlikely \u2014 the retrieval corpus IS the training corpus for most LLMs, making contamination near-universal.",
    "notes": "Time-shifted evaluation (questions whose answers postdate training cutoff) is RAG's clean anti-saturation strategy."
  },
  {
    "record_kind": "inheritance_prediction",
    "paper_id": "prediction:Robotics_B7",
    "ledger_to_be": "Robotics / Embodied AI",
    "bridge": "B7_western_chinese_open_weight_inversion",
    "predicted_extension": "unclear",
    "confidence": 0.48,
    "reasoning": "The Western-vs-Chinese open-weight inversion is well-documented for LLMs (Western labs increasingly closed; Chinese labs DeepSeek/Qwen/Yi open-weight). For robotics, the picture is murkier: Western labs (Stanford OpenVLA, Berkeley) have released open-weight VLAs, while Chinese robotics labs are less visible internationally. Hardware specificity may invert the inversion \u2014 robotics openness depends on hardware availability, not just weight licensing.",
    "empirical_anchors": [
      "OpenVLA (Stanford 2024) \u2014 Western open release",
      "\u03c00 (Physical Intelligence 2024) \u2014 partial openness",
      "Helix (Figure 2025) \u2014 closed weights",
      "Chinese robotics labs publication patterns (Unitree, UBTech, AgiBot 2024-2025)",
      "Hardware-coupled openness vs weight-only openness analysis"
    ],
    "if_fails_implications": "If the inversion doesn't appear in robotics, B7 becomes domain-specific to text-LLM ecosystems. This would suggest the openness asymmetry is driven by commercial pressure on Western LLM labs, not by national-strategic posture.",
    "notes": "Embodied AI may be where Chinese labs catch up via hardware integration rather than weight release \u2014 different openness modality."
  },
  {
    "record_kind": "inheritance_prediction",
    "paper_id": "prediction:Multilingual_B7",
    "ledger_to_be": "Multilingual / Low-Resource",
    "bridge": "B7_western_chinese_open_weight_inversion",
    "predicted_extension": "extends",
    "confidence": 0.71,
    "reasoning": "Multilingual open-weight releases follow the LLM pattern: Chinese labs (Qwen-multilingual, Yi) release multilingual variants open-weight; Western labs (Anthropic, OpenAI) keep multilingual capability behind APIs. Aya-Expanse (Cohere, partially open) is an exception. The bridge extends because the multilingual is downstream of base-model openness decisions.",
    "empirical_anchors": [
      "Qwen-multilingual open releases (Alibaba 2024)",
      "Yi multilingual variants (01.AI 2024)",
      "Aya-Expanse (Cohere 2024) \u2014 partial Western exception",
      "Llama-3 multilingual openness (Meta 2024) \u2014 partial",
      "Anthropic/OpenAI multilingual API-only access"
    ],
    "if_fails_implications": "If multilingual openness inverts opposite to LLM openness, B7 becomes more nuanced. Currently no evidence for this.",
    "notes": "NLLB-200 (Meta open release) and Aya (Cohere) are the strongest Western exceptions, complicating but not falsifying the pattern."
  },
  {
    "record_kind": "inheritance_prediction",
    "paper_id": "prediction:RAG_B7",
    "ledger_to_be": "RAG / Retrieval-Augmented",
    "bridge": "B7_western_chinese_open_weight_inversion",
    "predicted_extension": "fails_to_extend",
    "confidence": 0.62,
    "reasoning": "RAG is dominated by Western vendors (Anthropic, OpenAI, Google, Perplexity) building closed-source retrieval systems on top of their LLMs. Chinese RAG ecosystems exist (Baidu, Alibaba) but are less visible in international research. More importantly, RAG is a system-level architecture, not a weight-level artifact \u2014 'open-weight RAG' is a different concept from 'open-weight LLM.' The inversion doesn't apply cleanly because RAG openness is about retrieval-stack openness (LlamaIndex, Haystack, LangChain \u2014 all Western open-source).",
    "empirical_anchors": [
      "LlamaIndex (Western open-source RAG framework, US 2023+)",
      "Haystack (Deepset, German open-source RAG, 2020+)",
      "LangChain (Western open-source LLM framework, 2022+)",
      "Chinese RAG ecosystem (Baidu, Alibaba enterprise products 2024)",
      "Perplexity, NotebookLM, Claude retrieval \u2014 all closed Western systems"
    ],
    "if_fails_implications": "B7 becomes B7': 'Western-vs-Chinese open-weight inversion holds for base LLMs but inverts for system-level applications.' RAG would be the falsifier \u2014 Western RAG ecosystem is OPEN at framework level, CLOSED at vendor product level.",
    "notes": "This is the most interesting prediction \u2014 RAG might genuinely break the open-weight inversion pattern because the openness dimension is different (framework vs weights vs product)."
  }
]