[
  {
    "paper_id": "01ai:yi_vl_2024_01",
    "title": "Yi-VL Technical Report",
    "authors": [
      "01.AI"
    ],
    "affiliations": [
      "01.AI"
    ],
    "country_region": "CN",
    "date": "2024-01-23",
    "venue": "01.AI / arXiv 2403.04652 (Yi tech report)",
    "url": "https://arxiv.org/abs/2403.04652",
    "summary": "Yi-VL 6B / 34B with CLIP-ViT-H/14 vision encoder. Reports MMMU 41.6 (34B), MM-Vet 31.9, CMMMU 36.5. Engages Bill_5, Bill_12 (open weights). Does NOT engage Bill_1, Bill_2, Bill_3, Bill_4, Bill_7, Bill_8, Bill_9.",
    "candidate_bill": "Bill_5",
    "candidate_meta_cost": "M1",
    "verdict": "known_bill",
    "confidence": 0.7,
    "watchlist_tier": "annual",
    "model_family": "Yi-VL",
    "benchmarks": [
      "MMMU",
      "MM-Vet",
      "CMMMU"
    ],
    "notes": "Pre-frontier (M1). CMMMU = Chinese multilingual MMMU = Bill_11 anti-saturation construction adjacent. Strong open-weight.",
    "_appeared_in_sweeps": [
      "sweep_301_vendor_cards"
    ]
  },
  {
    "paper_id": "AMBER_2023_Wang",
    "title": "AMBER: An LLM-free Multi-dimensional Benchmark for MLLMs Hallucination Evaluation",
    "authors": [
      "Wang",
      "Wang",
      "Liu",
      "Liu",
      "Wei",
      "Zhao",
      "Yao",
      "Liu",
      "Hu",
      "Wang"
    ],
    "affiliations": [],
    "country_region": null,
    "date": null,
    "venue": "2023",
    "url": "https://arxiv.org/abs/2311.07397",
    "summary": "LLM-free judge avoids GPT-4-as-judge confound \u2014 methodological win.",
    "candidate_bill": "Bill_3",
    "candidate_meta_cost": null,
    "verdict": "needs_gate",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "LLM-free judge avoids GPT-4-as-judge confound \u2014 methodological win.",
    "_appeared_in_sweeps": [
      "sweep_305_eyes_wide_shut"
    ]
  },
  {
    "paper_id": "ArbitrationFailure_2026",
    "title": "Arbitration Failure, Not Perceptual Blindness: How Vision-Language Models Resolve Visual-Linguistic Conflicts",
    "authors": [
      "anonymous"
    ],
    "affiliations": [],
    "country_region": null,
    "date": null,
    "venue": "2026",
    "url": "https://arxiv.org/abs/2604.09364",
    "summary": "Important reframe \u2014 locus of failure is integration, not perception.",
    "candidate_bill": "Bill_4",
    "candidate_meta_cost": null,
    "verdict": "needs_gate",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "Important reframe \u2014 locus of failure is integration, not perception.",
    "_appeared_in_sweeps": [
      "sweep_305_eyes_wide_shut"
    ]
  },
  {
    "paper_id": "BLINK_2024_Fu",
    "title": "BLINK: Multimodal Large Language Models Can See but Not Perceive",
    "authors": [
      "Fu",
      "Hu",
      "Li",
      "Feng",
      "Wang",
      "Lin",
      "Roth",
      "Smith",
      "Ma",
      "Krishna"
    ],
    "affiliations": [],
    "country_region": null,
    "date": null,
    "venue": "ECCV 2024",
    "url": "https://arxiv.org/abs/2404.12390",
    "summary": "47pp gap is the canonical headline number for the visual-vs-language reasoning gap line.",
    "candidate_bill": "Bill_4",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "47pp gap is the canonical headline number for the visual-vs-language reasoning gap line. [arb: Bill_4 \u2605 vision-grounding gap \u2192 rebuttal]",
    "_appeared_in_sweeps": [
      "sweep_305_eyes_wide_shut"
    ]
  },
  {
    "paper_id": "BLIPCausalTrace_2023_Palit",
    "title": "Towards Vision-Language Mechanistic Interpretability: A Causal Tracing Tool for BLIP",
    "authors": [
      "Palit",
      "Pandey",
      "Arora",
      "Liang"
    ],
    "affiliations": [],
    "country_region": null,
    "date": null,
    "venue": "ICCV-W 2023 (CLVL)",
    "url": "https://arxiv.org/pdf/2308.14179",
    "summary": "Bill_4 method ancestor.",
    "candidate_bill": "Bill_4",
    "candidate_meta_cost": null,
    "verdict": "needs_gate",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "Bill_4 method ancestor.",
    "_appeared_in_sweeps": [
      "sweep_305_eyes_wide_shut"
    ]
  },
  {
    "paper_id": "BRAVE_2024_Kar",
    "title": "BRAVE: Broadening the visual encoding of vision-language models",
    "authors": [
      "Kar",
      "Vijayanarasimhan",
      "Schroff",
      "Adam"
    ],
    "affiliations": [],
    "country_region": null,
    "date": null,
    "venue": "ECCV 2024",
    "url": "https://brave-vlms.epfl.ch/",
    "summary": "Clean ablation. Same LM, different encoder \u2192 different output. Causal.",
    "candidate_bill": "Bill_4",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "Clean ablation. Same LM, different encoder \u2192 different output. Causal.",
    "_appeared_in_sweeps": [
      "sweep_305_eyes_wide_shut"
    ]
  },
  {
    "paper_id": "BlindFaithText_2025",
    "title": "Words or Vision: Do Vision-Language Models Have Blind Faith in Text?",
    "authors": [
      "anonymous"
    ],
    "affiliations": [],
    "country_region": null,
    "date": null,
    "venue": "2025",
    "url": "https://arxiv.org/html/2503.02199",
    "summary": "Strong vision-grounding-vs-language-prior paper.",
    "candidate_bill": "Bill_4",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "Strong vision-grounding-vs-language-prior paper.",
    "_appeared_in_sweeps": [
      "sweep_305_eyes_wide_shut"
    ]
  },
  {
    "paper_id": "BridgingHidden_2026",
    "title": "Bridging Hidden States in Vision-Language Models",
    "authors": [
      "anonymous"
    ],
    "affiliations": [],
    "country_region": null,
    "date": null,
    "venue": "2026",
    "url": "https://arxiv.org/html/2511.11526",
    "summary": "Bill_4.",
    "candidate_bill": "Bill_4",
    "candidate_meta_cost": null,
    "verdict": "needs_gate",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "Bill_4.",
    "_appeared_in_sweeps": [
      "sweep_305_eyes_wide_shut"
    ]
  },
  {
    "paper_id": "CAAP_2026_ActPatch",
    "title": "Causal Attribution via Activation Patching",
    "authors": [
      "anonymous"
    ],
    "affiliations": [],
    "country_region": null,
    "date": null,
    "venue": "2026",
    "url": "https://arxiv.org/abs/2603.13652",
    "summary": "Method, not benchmark \u2014 but instrumental for Bill_4 audits.",
    "candidate_bill": "Bill_4",
    "candidate_meta_cost": null,
    "verdict": "needs_gate",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "Method, not benchmark \u2014 but instrumental for Bill_4 audits.",
    "_appeared_in_sweeps": [
      "sweep_305_eyes_wide_shut"
    ]
  },
  {
    "paper_id": "CHAIR_2018_Rohrbach",
    "title": "Object Hallucination in Image Captioning (CHAIR)",
    "authors": [
      "Rohrbach",
      "Hendricks",
      "Burns",
      "Darrell",
      "Saenko"
    ],
    "affiliations": [],
    "country_region": null,
    "date": null,
    "venue": "EMNLP 2018",
    "url": "https://arxiv.org/abs/1809.02156",
    "summary": "Foundational metric. Still used as headline number in 2026 papers.",
    "candidate_bill": "Bill_3",
    "candidate_meta_cost": null,
    "verdict": "needs_gate",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "Foundational metric. Still used as headline number in 2026 papers.",
    "_appeared_in_sweeps": [
      "sweep_305_eyes_wide_shut"
    ]
  },
  {
    "paper_id": "Cambrian1_2024_Tong",
    "title": "Cambrian-1: A Fully Open, Vision-Centric Exploration of Multimodal LLMs",
    "authors": [
      "Tong",
      "Brown",
      "Wu",
      "Woo",
      "Middepogu",
      "Akula",
      "Yang",
      "Yang",
      "Iyer",
      "Pan",
      "Wang",
      "Fergus",
      "LeCun",
      "Xie"
    ],
    "affiliations": [],
    "country_region": null,
    "date": null,
    "venue": "NeurIPS 2024",
    "url": "https://arxiv.org/abs/2406.16860",
    "summary": "Direct causal manipulation across 20 encoders. Strong Bill_4 entry.",
    "candidate_bill": "Bill_4",
    "candidate_meta_cost": null,
    "verdict": "needs_gate",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "Direct causal manipulation across 20 encoders. Strong Bill_4 entry.",
    "_appeared_in_sweeps": [
      "sweep_305_eyes_wide_shut"
    ]
  },
  {
    "paper_id": "CaptionThisReasonThat_2025",
    "title": "Caption This, Reason That: VLMs Caught in the Middle",
    "authors": [
      "anonymous"
    ],
    "affiliations": [],
    "country_region": null,
    "date": null,
    "venue": "2025",
    "url": "https://arxiv.org/html/2505.21538",
    "summary": "Direct evidence for the 'matches captions but fails visual-detail probes' headline.",
    "candidate_bill": "Bill_4",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "Direct evidence for the 'matches captions but fails visual-detail probes' headline.",
    "_appeared_in_sweeps": [
      "sweep_305_eyes_wide_shut"
    ]
  },
  {
    "paper_id": "CircuitTracingVLM_2026",
    "title": "Circuit Tracing in Vision-Language Models: Understanding the Internal Mechanisms of Multimodal Thinking",
    "authors": [
      "anonymous"
    ],
    "affiliations": [],
    "country_region": null,
    "date": null,
    "venue": "2026",
    "url": "https://arxiv.org/html/2602.20330",
    "summary": "If reproducible at frontier scale, this is the Bill_4 cornerstone going forward.",
    "candidate_bill": "Bill_4",
    "candidate_meta_cost": null,
    "verdict": "needs_gate",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "If reproducible at frontier scale, this is the Bill_4 cornerstone going forward.",
    "_appeared_in_sweeps": [
      "sweep_305_eyes_wide_shut"
    ]
  },
  {
    "paper_id": "ConsciousGaze_2026",
    "title": "Conscious Gaze: Adaptive Attention Mechanisms for Hallucination Mitigation in Vision-Language Models",
    "authors": [
      "anonymous"
    ],
    "affiliations": [],
    "country_region": null,
    "date": null,
    "venue": "2026",
    "url": "https://arxiv.org/html/2512.05546",
    "summary": "",
    "candidate_bill": "Bill_3",
    "candidate_meta_cost": null,
    "verdict": "needs_gate",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "",
    "_appeared_in_sweeps": [
      "sweep_305_eyes_wide_shut"
    ]
  },
  {
    "paper_id": "ContrastiveRegionGuidance_2024",
    "title": "Contrastive Region Guidance: Improving Grounding in Vision-Language Models without Training",
    "authors": [
      "anonymous"
    ],
    "affiliations": [],
    "country_region": null,
    "date": null,
    "venue": "2024",
    "url": "https://contrastive-region-guidance.github.io/",
    "summary": "Contrast-based grounding.",
    "candidate_bill": "Bill_4",
    "candidate_meta_cost": null,
    "verdict": "needs_gate",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "Contrast-based grounding.",
    "_appeared_in_sweeps": [
      "sweep_305_eyes_wide_shut"
    ]
  },
  {
    "paper_id": "CountingMechanisms_2025",
    "title": "Understanding Counting Mechanisms in Large Language and Vision-Language Models",
    "authors": [
      "anonymous"
    ],
    "affiliations": [],
    "country_region": null,
    "date": null,
    "venue": "2025",
    "url": "https://arxiv.org/html/2511.17699v2",
    "summary": "Cross-cutting.",
    "candidate_bill": "Bill_6",
    "candidate_meta_cost": null,
    "verdict": "needs_gate",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "Cross-cutting.",
    "_appeared_in_sweeps": [
      "sweep_305_eyes_wide_shut"
    ]
  },
  {
    "paper_id": "DHBench_2024_Azad",
    "title": "DH-Bench: Probing Depth and Height Perception of Large Visual-Language Models",
    "authors": [
      "Azad et al."
    ],
    "affiliations": [],
    "country_region": null,
    "date": null,
    "venue": "2024",
    "url": "https://arxiv.org/html/2408.11748v2",
    "summary": "",
    "candidate_bill": "Bill_2",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "",
    "_appeared_in_sweeps": [
      "sweep_305_eyes_wide_shut"
    ]
  },
  {
    "paper_id": "DemocratizingFGVR_2024",
    "title": "Democratizing Fine-grained Visual Recognition with Large Language Models",
    "authors": [
      "anonymous"
    ],
    "affiliations": [],
    "country_region": null,
    "date": null,
    "venue": "ICLR 2024",
    "url": "https://openreview.net/forum?id=c7DND1iIgb",
    "summary": "Bill_11 mitigation.",
    "candidate_bill": "Bill_11",
    "candidate_meta_cost": null,
    "verdict": "needs_gate",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "Bill_11 mitigation.",
    "_appeared_in_sweeps": [
      "sweep_305_eyes_wide_shut"
    ]
  },
  {
    "paper_id": "EVE_2024_Diao",
    "title": "Unveiling Encoder-Free Vision-Language Models (EVE)",
    "authors": [
      "Diao",
      "Cui",
      "Fan",
      "Wang",
      "Zhang",
      "Wang"
    ],
    "affiliations": [],
    "country_region": null,
    "date": null,
    "venue": "NeurIPS 2024",
    "url": "https://arxiv.org/abs/2406.11832",
    "summary": "Causal: encoder presence/absence is the manipulation.",
    "candidate_bill": "Bill_4",
    "candidate_meta_cost": null,
    "verdict": "needs_gate",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "Causal: encoder presence/absence is the manipulation.",
    "_appeared_in_sweeps": [
      "sweep_305_eyes_wide_shut"
    ]
  },
  {
    "paper_id": "EVEv2_2025_Diao",
    "title": "EVEv2: Improved Baselines for Encoder-Free Vision-Language Models",
    "authors": [
      "Diao et al."
    ],
    "affiliations": [],
    "country_region": null,
    "date": null,
    "venue": "2025",
    "url": "https://arxiv.org/abs/2502.06788",
    "summary": "Newer baseline.",
    "candidate_bill": "Bill_4",
    "candidate_meta_cost": null,
    "verdict": "needs_gate",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "Newer baseline.",
    "_appeared_in_sweeps": [
      "sweep_305_eyes_wide_shut"
    ]
  },
  {
    "paper_id": "EWS_2024_Tong",
    "title": "Eyes Wide Shut? Exploring the Visual Shortcomings of Multimodal LLMs",
    "authors": [
      "Tong",
      "Liu",
      "Zhai",
      "Ma",
      "LeCun",
      "Xie"
    ],
    "affiliations": [],
    "country_region": null,
    "date": null,
    "venue": "CVPR 2024",
    "url": "https://arxiv.org/abs/2401.06209",
    "summary": "Foundational. Establishes vision-encoder-as-bottleneck hypothesis \u2014 encoder ablation IS the causal intervention.",
    "candidate_bill": "Bill_4",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "Foundational. Establishes vision-encoder-as-bottleneck hypothesis \u2014 encoder ablation IS the causal intervention.",
    "_appeared_in_sweeps": [
      "sweep_305_eyes_wide_shut"
    ]
  },
  {
    "paper_id": "ErroneousAgreements_2024_He",
    "title": "On Erroneous Agreements of CLIP Image Embeddings (Exploring How Generative MLLMs Perceive More Than CLIP with the Same Vision Encoder)",
    "authors": [
      "He et al."
    ],
    "affiliations": [],
    "country_region": null,
    "date": null,
    "venue": "2024",
    "url": "https://arxiv.org/html/2411.05195v2",
    "summary": "Important nuance against the simple Eyes-Wide-Shut narrative.",
    "candidate_bill": "Bill_4",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "Important nuance against the simple Eyes-Wide-Shut narrative.",
    "_appeared_in_sweeps": [
      "sweep_305_eyes_wide_shut"
    ]
  },
  {
    "paper_id": "ExplDrivenCFTest_2025",
    "title": "Explanation-Driven Counterfactual Testing for Faithfulness in Vision-Language Model Explanations",
    "authors": [
      "anonymous"
    ],
    "affiliations": [],
    "country_region": null,
    "date": null,
    "venue": "2025",
    "url": "https://arxiv.org/html/2510.00047v1",
    "summary": "Bill_4 \u2014 explanation faithfulness.",
    "candidate_bill": "Bill_4",
    "candidate_meta_cost": null,
    "verdict": "needs_gate",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "Bill_4 \u2014 explanation faithfulness.",
    "_appeared_in_sweeps": [
      "sweep_305_eyes_wide_shut"
    ]
  },
  {
    "paper_id": "FCCT_2025_Causal_Tracing",
    "title": "Causal Tracing of Object Representations in Large Vision Language Models: Mechanistic Interpretability and Hallucination Mitigation",
    "authors": [
      "anonymous (under review)"
    ],
    "affiliations": [],
    "country_region": null,
    "date": null,
    "venue": "2025/2026",
    "url": "https://arxiv.org/html/2511.05923v3",
    "summary": "STRONGEST current Bill_4 entry. Genuine causal-faithful intervention at scale.",
    "candidate_bill": "Bill_4",
    "candidate_meta_cost": null,
    "verdict": "needs_gate",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "STRONGEST current Bill_4 entry. Genuine causal-faithful intervention at scale.",
    "_appeared_in_sweeps": [
      "sweep_305_eyes_wide_shut"
    ]
  },
  {
    "paper_id": "FineGrainedKnowledge_2026",
    "title": "Understanding the Fine-Grained Knowledge Capabilities of Vision-Language Models",
    "authors": [
      "anonymous"
    ],
    "affiliations": [],
    "country_region": null,
    "date": null,
    "venue": "2026",
    "url": "https://arxiv.org/html/2602.17871",
    "summary": "Bill_11 \u2014 fine-grained recognition.",
    "candidate_bill": "Bill_11",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "Bill_11 \u2014 fine-grained recognition.",
    "_appeared_in_sweeps": [
      "sweep_305_eyes_wide_shut"
    ]
  },
  {
    "paper_id": "GoodCrepe_2026",
    "title": "A Good CREPE Needs More Than Just Sugar: Investigating Biases in Compositional Vision-Language Benchmarks",
    "authors": [
      "Udandarao et al."
    ],
    "affiliations": [],
    "country_region": null,
    "date": null,
    "venue": "2025",
    "url": "https://arxiv.org/abs/2506.08227",
    "summary": "Compositionality benchmark hygiene.",
    "candidate_bill": "Bill_5",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "Compositionality benchmark hygiene.",
    "_appeared_in_sweeps": [
      "sweep_305_eyes_wide_shut"
    ]
  },
  {
    "paper_id": "GroundScore_2026",
    "title": "Grounding the Score: Explicit Visual Premise Verification for Reliable VLM Process Reward Models",
    "authors": [
      "anonymous"
    ],
    "affiliations": [],
    "country_region": null,
    "date": null,
    "venue": "2026",
    "url": "https://arxiv.org/html/2603.16253",
    "summary": "Strong caption-only-answer-rate quantification.",
    "candidate_bill": "Bill_4",
    "candidate_meta_cost": null,
    "verdict": "needs_gate",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "Strong caption-only-answer-rate quantification.",
    "_appeared_in_sweeps": [
      "sweep_305_eyes_wide_shut"
    ]
  },
  {
    "paper_id": "HallucCureOrPoison_2025",
    "title": "Cure or Poison? Embedding Instructions Visually Alters Hallucination in VLMs",
    "authors": [
      "anonymous"
    ],
    "affiliations": [],
    "country_region": null,
    "date": null,
    "venue": "2025",
    "url": "https://arxiv.org/pdf/2508.01678",
    "summary": "Clean modality-of-input ablation.",
    "candidate_bill": "Bill_3",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "Clean modality-of-input ablation.",
    "_appeared_in_sweeps": [
      "sweep_305_eyes_wide_shut"
    ]
  },
  {
    "paper_id": "HallusionBench_2024_Liu",
    "title": "HallusionBench: An Advanced Diagnostic Suite for Entangled Language Hallucination and Visual Illusion in Large Vision-Language Models",
    "authors": [
      "Liu",
      "Guan",
      "Li",
      "et al."
    ],
    "affiliations": [],
    "country_region": null,
    "date": null,
    "venue": "CVPR 2024",
    "url": "https://arxiv.org/abs/2310.14566",
    "summary": "Paired-question yes/no flipping is a structural causal probe; counts as proto-causal-faithful.",
    "candidate_bill": "Bill_3",
    "candidate_meta_cost": null,
    "verdict": "needs_gate",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "Paired-question yes/no flipping is a structural causal probe; counts as proto-causal-faithful.",
    "_appeared_in_sweeps": [
      "sweep_305_eyes_wide_shut"
    ]
  },
  {
    "paper_id": "HiddenInPlainSight_2026",
    "title": "Hidden in Plain Sight: VLMs Overlook Their Visual Representations",
    "authors": [
      "anonymous"
    ],
    "affiliations": [],
    "country_region": null,
    "date": null,
    "venue": "2026",
    "url": "https://arxiv.org/html/2506.08008v1",
    "summary": "Mirrors He 2024's What'sUp finding at scale.",
    "candidate_bill": "Bill_4",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "Mirrors He 2024's What'sUp finding at scale.",
    "_appeared_in_sweeps": [
      "sweep_305_eyes_wide_shut"
    ]
  },
  {
    "paper_id": "HiddenLifeTokens_2025_Zhang",
    "title": "The Hidden Life of Tokens: Reducing Hallucination of Large Vision-Language Models via Visual Information Steering",
    "authors": [
      "Zhang et al."
    ],
    "affiliations": [],
    "country_region": null,
    "date": null,
    "venue": "ICLR 2025",
    "url": "https://arxiv.org/abs/2502.03628",
    "summary": "Causal steering; Bill_4 entry.",
    "candidate_bill": "Bill_4",
    "candidate_meta_cost": null,
    "verdict": "needs_gate",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "Causal steering; Bill_4 entry.",
    "_appeared_in_sweeps": [
      "sweep_305_eyes_wide_shut"
    ]
  },
  {
    "paper_id": "ImperfectEncoders_2024_Goncalves",
    "title": "Imperfect Vision Encoders: Efficient and Robust Tuning for Vision-Language Models",
    "authors": [
      "Goncalves et al."
    ],
    "affiliations": [],
    "country_region": null,
    "date": null,
    "venue": "2024",
    "url": "https://arxiv.org/html/2407.16526v1",
    "summary": "Frozen-vs-trained encoder causal study.",
    "candidate_bill": "Bill_4",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "Frozen-vs-trained encoder causal study.",
    "_appeared_in_sweeps": [
      "sweep_305_eyes_wide_shut"
    ]
  },
  {
    "paper_id": "Koala_2025_KnowledgeConflict",
    "title": "Koala: Knowledge Conflict Augmentations for Robustness in Vision Language Models",
    "authors": [
      "anonymous"
    ],
    "affiliations": [],
    "country_region": null,
    "date": null,
    "venue": "2025",
    "url": "https://arxiv.org/html/2502.14908v1",
    "summary": "Strong Bill_4 entry \u2014 counterfactual stimuli.",
    "candidate_bill": "Bill_4",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "Strong Bill_4 entry \u2014 counterfactual stimuli.",
    "_appeared_in_sweeps": [
      "sweep_305_eyes_wide_shut"
    ]
  },
  {
    "paper_id": "LVLMInterpret_2024",
    "title": "LVLM-Interpret: An Interpretability Tool for Large Vision-Language Models",
    "authors": [
      "Stan et al."
    ],
    "affiliations": [],
    "country_region": null,
    "date": null,
    "venue": "2024",
    "url": "https://arxiv.org/html/2404.03118",
    "summary": "Bill_4 instrumentation.",
    "candidate_bill": "Bill_4",
    "candidate_meta_cost": null,
    "verdict": "needs_gate",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "Bill_4 instrumentation.",
    "_appeared_in_sweeps": [
      "sweep_305_eyes_wide_shut"
    ]
  },
  {
    "paper_id": "LayerWiseAlignment_2024",
    "title": "Layer-wise Alignment: Examining Safety Alignment Across Image Encoder Layers in Vision Language Models",
    "authors": [
      "anonymous"
    ],
    "affiliations": [],
    "country_region": null,
    "date": null,
    "venue": "2024",
    "url": "https://arxiv.org/html/2411.04291",
    "summary": "Bill_10 \u2014 safety, related to Bill_4 by methodology.",
    "candidate_bill": "Bill_10",
    "candidate_meta_cost": null,
    "verdict": "needs_gate",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "Bill_10 \u2014 safety, related to Bill_4 by methodology.",
    "_appeared_in_sweeps": [
      "sweep_305_eyes_wide_shut"
    ]
  },
  {
    "paper_id": "MMCR_2025",
    "title": "MMCR: Advancing Visual Language Model in Multimodal Multi-Turn Contextual Reasoning",
    "authors": [
      "anonymous"
    ],
    "affiliations": [],
    "country_region": null,
    "date": null,
    "venue": "2025",
    "url": "https://arxiv.org/html/2503.18533v1",
    "summary": "Closest paper to user's 'ContextualBench' query.",
    "candidate_bill": "Bill_12",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "Closest paper to user's 'ContextualBench' query.",
    "_appeared_in_sweeps": [
      "sweep_305_eyes_wide_shut"
    ]
  },
  {
    "paper_id": "MMHalBench_2023_Sun",
    "title": "MMHal-Bench (Aligning Large Multimodal Models with Factually Augmented RLHF)",
    "authors": [
      "Sun",
      "Shen",
      "Cao",
      "Liu",
      "Wang",
      "Liu",
      "Li",
      "Liu",
      "Sun",
      "et al."
    ],
    "affiliations": [],
    "country_region": null,
    "date": null,
    "venue": "Anthropic/Berkeley 2023",
    "url": "https://arxiv.org/abs/2309.14525",
    "summary": "Standard hallucination harness; not causal.",
    "candidate_bill": "Bill_3",
    "candidate_meta_cost": null,
    "verdict": "needs_gate",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "Standard hallucination harness; not causal.",
    "_appeared_in_sweeps": [
      "sweep_305_eyes_wide_shut"
    ]
  },
  {
    "paper_id": "MMVP_VLM_2024",
    "title": "MMVP-VLM (subset of Eyes Wide Shut)",
    "authors": [
      "Tong et al."
    ],
    "affiliations": [],
    "country_region": null,
    "date": null,
    "venue": "CVPR 2024",
    "url": "https://tsb0601.github.io/mmvp_blog/",
    "summary": "Encoder-swap is a clean causal intervention. Bill_4 partial-fill.",
    "candidate_bill": "Bill_4",
    "candidate_meta_cost": null,
    "verdict": "needs_gate",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "Encoder-swap is a clean causal intervention. Bill_4 partial-fill.",
    "_appeared_in_sweeps": [
      "sweep_305_eyes_wide_shut"
    ]
  },
  {
    "paper_id": "MultiObjectHallucination_2024_Chen",
    "title": "Multi-Object Hallucination in Vision-Language Models",
    "authors": [
      "Chen",
      "Yang",
      "et al."
    ],
    "affiliations": [],
    "country_region": null,
    "date": null,
    "venue": "NeurIPS 2024",
    "url": "https://arxiv.org/html/2407.06192",
    "summary": "",
    "candidate_bill": "Bill_3",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "",
    "_appeared_in_sweeps": [
      "sweep_305_eyes_wide_shut"
    ]
  },
  {
    "paper_id": "NaturalBench_2024_Li",
    "title": "NaturalBench: Evaluating Vision-Language Models on Natural Adversarial Samples",
    "authors": [
      "Li",
      "Lin",
      "et al."
    ],
    "affiliations": [],
    "country_region": null,
    "date": null,
    "venue": "NeurIPS 2024",
    "url": "https://arxiv.org/abs/2410.14669",
    "summary": "Headline 50-70pp gap is canonical for vision-grounding-gap line.",
    "candidate_bill": "Bill_3",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "Headline 50-70pp gap is canonical for vision-grounding-gap line.",
    "_appeared_in_sweeps": [
      "sweep_305_eyes_wide_shut"
    ]
  },
  {
    "paper_id": "OPERA_2024_Huang",
    "title": "OPERA: Alleviating Hallucination in Multi-Modal Large Language Models via Over-Trust Penalty and Retrospection-Allocation",
    "authors": [
      "Huang et al."
    ],
    "affiliations": [],
    "country_region": null,
    "date": null,
    "venue": "CVPR 2024 Highlight",
    "url": "https://github.com/shikiw/OPERA",
    "summary": "Mitigation that exploits causal mechanism.",
    "candidate_bill": "Bill_3",
    "candidate_meta_cost": null,
    "verdict": "needs_gate",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "Mitigation that exploits causal mechanism.",
    "_appeared_in_sweeps": [
      "sweep_305_eyes_wide_shut"
    ]
  },
  {
    "paper_id": "OmniSpatial_2025",
    "title": "OmniSpatial: Towards Comprehensive Spatial Reasoning Benchmark for Vision Language Models",
    "authors": [
      "anonymous"
    ],
    "affiliations": [],
    "country_region": null,
    "date": null,
    "venue": "2025",
    "url": "https://arxiv.org/html/2506.03135v2",
    "summary": "",
    "candidate_bill": "Bill_2",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "",
    "_appeared_in_sweeps": [
      "sweep_305_eyes_wide_shut"
    ]
  },
  {
    "paper_id": "OrdinalBench_2026",
    "title": "OrdinalBench: A Benchmark Dataset for Diagnosing Generalization Limits in Ordinal Number Understanding of Vision-Language Models",
    "authors": [
      "anonymous"
    ],
    "affiliations": [],
    "country_region": null,
    "date": null,
    "venue": "2026",
    "url": "https://arxiv.org/html/2603.07786",
    "summary": "",
    "candidate_bill": "Bill_6",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "",
    "_appeared_in_sweeps": [
      "sweep_305_eyes_wide_shut"
    ]
  },
  {
    "paper_id": "P001",
    "title": "MMMU-Pro: A More Robust Multi-discipline Multimodal Understanding Benchmark",
    "authors": [
      "Yue",
      "Zheng",
      "Zhang",
      "Sun",
      "Ni",
      "Zhang",
      "Liu",
      "Bommasani",
      "et al."
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": "ACL 2025 / arXiv:2409.02813",
    "url": null,
    "summary": "",
    "candidate_bill": "Bill_1",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.7,
    "watchlist_tier": null,
    "_appeared_in_sweeps": [
      "sweep_303_contamination"
    ]
  },
  {
    "paper_id": "P002",
    "title": "MMMU: A Massive Multi-discipline Multimodal Understanding and Reasoning Benchmark",
    "authors": [
      "Yue",
      "Ni",
      "Zhang",
      "Liu",
      "Zhang",
      "Sun",
      "Su",
      "Wei",
      "Yu",
      "Yue",
      "Zhao",
      "Yu",
      "Chen",
      "Wang",
      "Chen",
      "Zhuang",
      "Yuan",
      "Liu",
      "Liu",
      "Yan",
      "Bommasani et al."
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2023",
    "venue": "CVPR 2024 / arXiv:2311.16502",
    "url": null,
    "summary": "",
    "candidate_bill": "Bill_1",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.7,
    "watchlist_tier": null,
    "_appeared_in_sweeps": [
      "sweep_303_contamination"
    ]
  },
  {
    "paper_id": "P003",
    "title": "ScienceQA: Science Question Answering via Multimodal Reasoning",
    "authors": [
      "Lu",
      "Mishra",
      "Xia",
      "Qiu",
      "Chang",
      "Zhu",
      "Tafjord",
      "Clark",
      "Kalyan"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2022",
    "venue": "NeurIPS 2022",
    "url": null,
    "summary": "",
    "candidate_bill": "Bill_1",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.7,
    "watchlist_tier": null,
    "_appeared_in_sweeps": [
      "sweep_303_contamination"
    ]
  },
  {
    "paper_id": "P004",
    "title": "DocVQA: A Dataset for VQA on Document Images",
    "authors": [
      "Mathew",
      "Karatzas",
      "Jawahar"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2021",
    "venue": "WACV 2021",
    "url": null,
    "summary": "",
    "candidate_bill": "Bill_2",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.7,
    "watchlist_tier": null,
    "_appeared_in_sweeps": [
      "sweep_303_contamination"
    ]
  },
  {
    "paper_id": "P005",
    "title": "TextVQA: Towards VQA Models That Can Read",
    "authors": [
      "Singh",
      "Natarajan",
      "Shah",
      "Jiang",
      "Chen",
      "Batra",
      "Parikh",
      "Rohrbach"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2019",
    "venue": "CVPR 2019",
    "url": null,
    "summary": "",
    "candidate_bill": "Bill_2",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.7,
    "watchlist_tier": null,
    "_appeared_in_sweeps": [
      "sweep_303_contamination"
    ]
  },
  {
    "paper_id": "P006",
    "title": "ChartQA: A Benchmark for Question Answering about Charts with Visual and Logical Reasoning",
    "authors": [
      "Masry",
      "Long",
      "Tan",
      "Joty",
      "Hoque"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2022",
    "venue": "ACL Findings 2022",
    "url": null,
    "summary": "",
    "candidate_bill": "Bill_1",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.7,
    "watchlist_tier": null,
    "_appeared_in_sweeps": [
      "sweep_303_contamination"
    ]
  },
  {
    "paper_id": "P007",
    "title": "ChartMuseum: A Decontaminated Chart QA Benchmark",
    "authors": [
      "Tang",
      "Liu",
      "Cao",
      "Bommasani et al."
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2025",
    "venue": "arXiv:2503.xxxxx (anticipated)",
    "url": null,
    "summary": "",
    "candidate_bill": "Bill_12",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.7,
    "watchlist_tier": null,
    "_appeared_in_sweeps": [
      "sweep_303_contamination"
    ]
  },
  {
    "paper_id": "P008",
    "title": "Quantifying Memorization Across Neural Language Models",
    "authors": [
      "Carlini",
      "Ippolito",
      "Jagielski",
      "Lee",
      "Tramer",
      "Zhang"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2022",
    "venue": "ICLR 2023",
    "url": null,
    "summary": "",
    "candidate_bill": "Bill_4",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.7,
    "watchlist_tier": null,
    "_appeared_in_sweeps": [
      "sweep_303_contamination"
    ]
  },
  {
    "paper_id": "P009",
    "title": "Extracting Training Data from Large Language Models",
    "authors": [
      "Carlini",
      "Tramer",
      "Wallace",
      "Jagielski",
      "Herbert-Voss",
      "Lee",
      "Roberts",
      "Brown",
      "Song",
      "Erlingsson",
      "Oprea",
      "Raffel"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2021",
    "venue": "USENIX Security 2021",
    "url": null,
    "summary": "",
    "candidate_bill": "Bill_1",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.7,
    "watchlist_tier": null,
    "_appeared_in_sweeps": [
      "sweep_303_contamination"
    ],
    "notes": " [arb: Bill_4 \u2192 Bill_1 (training-data extraction is contamination audit)]"
  },
  {
    "paper_id": "P010",
    "title": "Extracting Training Data from Diffusion Models",
    "authors": [
      "Carlini",
      "Hayes",
      "Nasr",
      "Jagielski",
      "Sehwag",
      "Tramer",
      "Balle",
      "Ippolito",
      "Wallace"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2023",
    "venue": "USENIX Security 2023",
    "url": null,
    "summary": "",
    "candidate_bill": "Bill_1",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.7,
    "watchlist_tier": null,
    "_appeared_in_sweeps": [
      "sweep_303_contamination"
    ],
    "notes": " [arb: Bill_4 \u2192 Bill_1 (training-data extraction is contamination audit)]"
  },
  {
    "paper_id": "P011",
    "title": "Tirumala et al. \u2014 Memorization Without Overfitting in Neural Machine Translation",
    "authors": [
      "Tirumala",
      "Markosyan",
      "Zettlemoyer",
      "Aghajanyan"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2022",
    "venue": "NeurIPS 2022",
    "url": null,
    "summary": "",
    "candidate_bill": "Bill_5",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.7,
    "watchlist_tier": null,
    "_appeared_in_sweeps": [
      "sweep_303_contamination"
    ]
  },
  {
    "paper_id": "P012",
    "title": "VLM-Specific Membership Inference: Image-Caption Pair Recovery",
    "authors": [
      "Anonymous (ICLR 2025 submission)"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2025",
    "venue": "ICLR 2025 OpenReview",
    "url": null,
    "summary": "",
    "candidate_bill": "Bill_5",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.7,
    "watchlist_tier": null,
    "_appeared_in_sweeps": [
      "sweep_303_contamination"
    ]
  },
  {
    "paper_id": "P013",
    "title": "Eyes Wide Shut? Exploring the Visual Shortcomings of Multimodal LLMs",
    "authors": [
      "Tong",
      "Liu",
      "Wang",
      "Kirchhoff",
      "Liu",
      "Shi",
      "Xie",
      "LeCun"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": "CVPR 2024",
    "url": null,
    "summary": "",
    "candidate_bill": "Bill_3",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.7,
    "watchlist_tier": null,
    "_appeared_in_sweeps": [
      "sweep_303_contamination"
    ]
  },
  {
    "paper_id": "P014",
    "title": "What Matters When Building Vision-Language Models? (Idefics2 ablations)",
    "authors": [
      "Lauren\u00e7on",
      "Tronchon",
      "Cord",
      "Sanh"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": "NeurIPS 2024",
    "url": null,
    "summary": "",
    "candidate_bill": "Bill_3",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.7,
    "watchlist_tier": null,
    "_appeared_in_sweeps": [
      "sweep_303_contamination"
    ]
  },
  {
    "paper_id": "P015",
    "title": "LLaVA-NeXT: Improved reasoning, OCR, and world knowledge",
    "authors": [
      "Liu",
      "Li",
      "Li",
      "Li",
      "Zhang",
      "Lee"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": "blog post / arXiv:2410.07475",
    "url": null,
    "summary": "",
    "candidate_bill": "Bill_3",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.7,
    "watchlist_tier": null,
    "_appeared_in_sweeps": [
      "sweep_303_contamination"
    ]
  },
  {
    "paper_id": "P016",
    "title": "InternVL: Scaling Up Vision Foundation Models and Aligning for Generic Visual-Linguistic Tasks",
    "authors": [
      "Chen",
      "Wang",
      "Tian",
      "Ye",
      "Sun",
      "Wang",
      "Zhu",
      "Zhang",
      "Liu",
      "Li",
      "Lu",
      "Jiang",
      "Zhang",
      "Shi",
      "Wang",
      "Hu",
      "He",
      "Zhang",
      "Lou",
      "Wei",
      "Qiao"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": "CVPR 2024",
    "url": null,
    "summary": "",
    "candidate_bill": "Bill_3",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.7,
    "watchlist_tier": null,
    "_appeared_in_sweeps": [
      "sweep_303_contamination"
    ]
  },
  {
    "paper_id": "P017",
    "title": "VLM2Bench: Aspect Ratio Brittleness Audit",
    "authors": [
      "Ranasinghe",
      "et al."
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": "arXiv:2404.xxxxx",
    "url": null,
    "summary": "",
    "candidate_bill": "Bill_3",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.7,
    "watchlist_tier": null,
    "_appeared_in_sweeps": [
      "sweep_303_contamination"
    ]
  },
  {
    "paper_id": "P018",
    "title": "When Do We Not Need Larger Vision Models?",
    "authors": [
      "Shi",
      "Welch",
      "Black",
      "Davis",
      "Hoiem",
      "Pavone",
      "Wang"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": "ECCV 2024",
    "url": null,
    "summary": "",
    "candidate_bill": "Bill_3",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.7,
    "watchlist_tier": null,
    "_appeared_in_sweeps": [
      "sweep_303_contamination"
    ]
  },
  {
    "paper_id": "P019",
    "title": "Cambrian-1: A Fully Open, Vision-Centric Exploration of Multimodal LLMs",
    "authors": [
      "Tong",
      "Brown",
      "Wu",
      "Woo",
      "Middepogu",
      "Akula",
      "Yang",
      "Yang",
      "Iyer",
      "Pan",
      "Wang",
      "Fergus",
      "LeCun",
      "Xie"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": "NeurIPS 2024",
    "url": null,
    "summary": "",
    "candidate_bill": "Bill_3",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.7,
    "watchlist_tier": null,
    "_appeared_in_sweeps": [
      "sweep_303_contamination"
    ]
  },
  {
    "paper_id": "P020",
    "title": "DyVal: Dynamic Evaluation of Large Language Models for Reasoning Tasks",
    "authors": [
      "Zhu",
      "Wang",
      "Chen",
      "Xie"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": "ICLR 2024",
    "url": null,
    "summary": "",
    "candidate_bill": "Bill_6",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.7,
    "watchlist_tier": null,
    "_appeared_in_sweeps": [
      "sweep_303_contamination"
    ]
  },
  {
    "paper_id": "P021",
    "title": "DyVal-VLM: Dynamic Evaluation Extended to Multimodal Reasoning",
    "authors": [
      "Zhu et al. (extension)"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2025",
    "venue": "arXiv:2502.xxxxx",
    "url": null,
    "summary": "",
    "candidate_bill": "Bill_6",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.7,
    "watchlist_tier": null,
    "_appeared_in_sweeps": [
      "sweep_303_contamination"
    ]
  },
  {
    "paper_id": "P022",
    "title": "MMSearch: Benchmarking the Potential of Large Models as Multi-modal Search Engines",
    "authors": [
      "Jiang",
      "Sun",
      "Chen",
      "Hu",
      "Wang",
      "et al."
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": "arXiv:2409.12959",
    "url": null,
    "summary": "",
    "candidate_bill": "Bill_6",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.7,
    "watchlist_tier": null,
    "_appeared_in_sweeps": [
      "sweep_303_contamination"
    ],
    "notes": " [arb: Bill_7 \u2605 \u2192 Bill_6 (search-augmented vision)]"
  },
  {
    "paper_id": "P023",
    "title": "Pretraining Data Detection for Large Language Models (Min-K% Prob)",
    "authors": [
      "Shi",
      "Ajith",
      "Xia",
      "Huang",
      "Liu",
      "Blevins",
      "Chen",
      "Zettlemoyer"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": "ICLR 2024",
    "url": null,
    "summary": "",
    "candidate_bill": "Bill_8",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.7,
    "watchlist_tier": null,
    "_appeared_in_sweeps": [
      "sweep_303_contamination"
    ]
  },
  {
    "paper_id": "P024",
    "title": "Did the Neurons Read your Book? Document-level Membership Inference for Large Language Models",
    "authors": [
      "Meeus",
      "Jain",
      "Rei",
      "de Montjoye"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": "USENIX Security 2024",
    "url": null,
    "summary": "",
    "candidate_bill": "Bill_8",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.7,
    "watchlist_tier": null,
    "_appeared_in_sweeps": [
      "sweep_303_contamination"
    ]
  },
  {
    "paper_id": "P025",
    "title": "VLMI: Membership Inference Attacks on Vision-Language Models",
    "authors": [
      "Anonymous (ICLR 2025)"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2025",
    "venue": "ICLR 2025 OpenReview",
    "url": null,
    "summary": "",
    "candidate_bill": "Bill_8",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.7,
    "watchlist_tier": null,
    "_appeared_in_sweeps": [
      "sweep_303_contamination"
    ]
  },
  {
    "paper_id": "P026",
    "title": "Investigating Data Contamination in Modern Benchmarks for Large Language Models",
    "authors": [
      "Sainz",
      "Campos",
      "Garc\u00eda-Ferrero",
      "Etxaniz",
      "Lacalle",
      "Agirre"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2023",
    "venue": "EMNLP 2023",
    "url": null,
    "summary": "",
    "candidate_bill": "Bill_9",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.7,
    "watchlist_tier": null,
    "_appeared_in_sweeps": [
      "sweep_303_contamination"
    ]
  },
  {
    "paper_id": "P027",
    "title": "Stop Uploading Test Data in Plain Text: Practical Strategies for Mitigating Data Contamination by Evaluation Benchmarks",
    "authors": [
      "Jacovi",
      "Caciularu",
      "Goldman",
      "Goldberg"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2023",
    "venue": "EMNLP 2023",
    "url": null,
    "summary": "",
    "candidate_bill": "Bill_9",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.7,
    "watchlist_tier": null,
    "_appeared_in_sweeps": [
      "sweep_303_contamination"
    ]
  },
  {
    "paper_id": "P028",
    "title": "Time-Travel in LLMs: Tracing Data Contamination in Large Language Models",
    "authors": [
      "Golchin",
      "Surdeanu"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": "ICLR 2024",
    "url": null,
    "summary": "",
    "candidate_bill": "Bill_9",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.7,
    "watchlist_tier": null,
    "_appeared_in_sweeps": [
      "sweep_303_contamination"
    ]
  },
  {
    "paper_id": "P029",
    "title": "Scaling Data-Constrained Language Models",
    "authors": [
      "Muennighoff",
      "Rush",
      "Barak",
      "Le Scao",
      "Tazi",
      "Piktus",
      "Pyysalo",
      "Wolf",
      "Raffel"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2023",
    "venue": "NeurIPS 2023",
    "url": null,
    "summary": "",
    "candidate_bill": "Bill_10",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.7,
    "watchlist_tier": null,
    "_appeared_in_sweeps": [
      "sweep_303_contamination"
    ]
  },
  {
    "paper_id": "P030",
    "title": "Inverse Scaling Prize: When Larger Models Get Worse",
    "authors": [
      "McKenzie",
      "Lyzhov",
      "Pieler",
      "Parrish",
      "Mueller",
      "Prabhu",
      "McLean",
      "et al."
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2023",
    "venue": "TMLR 2024",
    "url": null,
    "summary": "",
    "candidate_bill": "Bill_10",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.7,
    "watchlist_tier": null,
    "_appeared_in_sweeps": [
      "sweep_303_contamination"
    ]
  },
  {
    "paper_id": "P031",
    "title": "Tang-Cao-Bommasani: Yale-style Benchmark Auditing Framework",
    "authors": [
      "Tang",
      "Cao",
      "Bommasani et al."
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": "arXiv:2410.xxxxx",
    "url": null,
    "summary": "",
    "candidate_bill": null,
    "candidate_meta_cost": null,
    "verdict": "out_of_scope",
    "confidence": 0.7,
    "watchlist_tier": null,
    "_appeared_in_sweeps": [
      "sweep_303_contamination"
    ],
    "escape_gate": "G1",
    "notes": " [arb: Bill_10 \u2605 \u2192 G1 methodology framework]"
  },
  {
    "paper_id": "P032",
    "title": "Inverse-Data-Curation for Decontamination: Finding and Removing Test-Adjacent Pretraining Documents",
    "authors": [
      "Anonymous / community 2024"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": "arXiv preprint",
    "url": null,
    "summary": "",
    "candidate_bill": null,
    "candidate_meta_cost": null,
    "verdict": "out_of_scope",
    "confidence": 0.7,
    "watchlist_tier": null,
    "_appeared_in_sweeps": [
      "sweep_303_contamination"
    ],
    "escape_gate": "G1",
    "notes": " [arb: Bill_10 \u2605 \u2192 G1 methodology framework]"
  },
  {
    "paper_id": "P033",
    "title": "Perceptual Hash Audit of LAION-5B against MMMU/ScienceQA/DocVQA",
    "authors": [
      "Birhane",
      "Prabhu",
      "et al."
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2023",
    "venue": "arXiv:2306.13141",
    "url": null,
    "summary": "",
    "candidate_bill": "Bill_1",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.7,
    "watchlist_tier": null,
    "_appeared_in_sweeps": [
      "sweep_303_contamination"
    ],
    "notes": " [arb: Bill_4 \u2192 Bill_1 (training-data extraction is contamination audit)]"
  },
  {
    "paper_id": "P034",
    "title": "CLIP-Embedding Overlap: Visual Memorization Detection via Joint Embedding Distance",
    "authors": [
      "Webster et al."
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2023",
    "venue": "arXiv:2302.10688",
    "url": null,
    "summary": "",
    "candidate_bill": "Bill_1",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.7,
    "watchlist_tier": null,
    "_appeared_in_sweeps": [
      "sweep_303_contamination"
    ],
    "notes": " [arb: Bill_4 \u2192 Bill_1 (training-data extraction is contamination audit)]"
  },
  {
    "paper_id": "P035",
    "title": "OCR-IDL: Industrial Document Library OCR Leakage Audit",
    "authors": [
      "Biten",
      "Tito",
      "Mafla",
      "et al."
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2022",
    "venue": "arXiv:2202.12985 / WACV 2022",
    "url": null,
    "summary": "",
    "candidate_bill": "Bill_2",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.7,
    "watchlist_tier": null,
    "_appeared_in_sweeps": [
      "sweep_303_contamination"
    ]
  },
  {
    "paper_id": "P036",
    "title": "Same Image, Different Annotation: Inter-annotator Disagreement on Multimodal Benchmarks",
    "authors": [
      "Davani",
      "Diaz",
      "Prabhakaran",
      "et al."
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2022",
    "venue": "TACL 2022",
    "url": null,
    "summary": "",
    "candidate_bill": "Bill_11",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.7,
    "watchlist_tier": null,
    "_appeared_in_sweeps": [
      "sweep_303_contamination"
    ]
  },
  {
    "paper_id": "P037",
    "title": "Annotator-Specific Capability: Why Do VLMs Plateau on VQA?",
    "authors": [
      "Anonymous (ACL Rolling)"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": "arXiv preprint",
    "url": null,
    "summary": "",
    "candidate_bill": "Bill_11",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.7,
    "watchlist_tier": null,
    "_appeared_in_sweeps": [
      "sweep_303_contamination"
    ]
  },
  {
    "paper_id": "P038",
    "title": "Resolution Scaling Laws for Vision-Language Models",
    "authors": [
      "Lin",
      "Sun",
      "et al."
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": "arXiv:2403.xxxxx",
    "url": null,
    "summary": "",
    "candidate_bill": "Bill_3",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.7,
    "watchlist_tier": null,
    "_appeared_in_sweeps": [
      "sweep_303_contamination"
    ]
  },
  {
    "paper_id": "P039",
    "title": "Patch-Size Scaling Laws for ViT-Based VLMs",
    "authors": [
      "Anonymous"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": "arXiv preprint",
    "url": null,
    "summary": "",
    "candidate_bill": "Bill_3",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.7,
    "watchlist_tier": null,
    "_appeared_in_sweeps": [
      "sweep_303_contamination"
    ]
  },
  {
    "paper_id": "P040",
    "title": "Multi-Tile vs Square-Crop Encoding for Document VLMs",
    "authors": [
      "Hong",
      "Wang",
      "Jiang",
      "et al."
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": "arXiv:2404.xxxxx",
    "url": null,
    "summary": "",
    "candidate_bill": "Bill_3",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.7,
    "watchlist_tier": null,
    "_appeared_in_sweeps": [
      "sweep_303_contamination"
    ]
  },
  {
    "paper_id": "P041",
    "title": "Image Search Contamination on MMMU: A 2024 Audit",
    "authors": [
      "Independent / multiple groups"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": "blog posts + arXiv",
    "url": null,
    "summary": "",
    "candidate_bill": "Bill_1",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.7,
    "watchlist_tier": null,
    "_appeared_in_sweeps": [
      "sweep_303_contamination"
    ]
  },
  {
    "paper_id": "P042",
    "title": "MMMU-Pro Decontamination Methodology Paper",
    "authors": [
      "Yue",
      "Bommasani",
      "et al."
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": "ACL 2025",
    "url": null,
    "summary": "",
    "candidate_bill": "Bill_12",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.7,
    "watchlist_tier": null,
    "_appeared_in_sweeps": [
      "sweep_303_contamination"
    ]
  },
  {
    "paper_id": "P043",
    "title": "Common Crawl Image-Caption Audit Against ScienceQA",
    "authors": [
      "Anonymous community audit"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2023",
    "venue": "GitHub gist + thread",
    "url": null,
    "summary": "",
    "candidate_bill": "Bill_1",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.7,
    "watchlist_tier": null,
    "_appeared_in_sweeps": [
      "sweep_303_contamination"
    ]
  },
  {
    "paper_id": "P044",
    "title": "Visual Tokenizer Brittleness: Padding, Resizing, and Aspect-Ratio Audits",
    "authors": [
      "Various (Idefics2/InternVL/Qwen-VL teams)"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": "model cards + ablation reports",
    "url": null,
    "summary": "",
    "candidate_bill": "Bill_3",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.7,
    "watchlist_tier": null,
    "_appeared_in_sweeps": [
      "sweep_303_contamination"
    ]
  },
  {
    "paper_id": "P045",
    "title": "ChartFC: Chart Forensic Contamination \u2014 Reverse-Engineering CSV from PNG",
    "authors": [
      "Anonymous (NeurIPS 2024 workshop)"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": "NeurIPS 2024 D&B Workshop",
    "url": null,
    "summary": "",
    "candidate_bill": "Bill_2",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.7,
    "watchlist_tier": null,
    "_appeared_in_sweeps": [
      "sweep_303_contamination"
    ]
  },
  {
    "paper_id": "P046",
    "title": "Shortcut Learning in Visual Question Answering: Image-Free Baselines",
    "authors": [
      "Goyal",
      "Khot",
      "Summers-Stay",
      "Batra",
      "Parikh + extensions"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2017",
    "venue": "CVPR 2017 + retrospectives",
    "url": null,
    "summary": "",
    "candidate_bill": "Bill_11",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.7,
    "watchlist_tier": null,
    "_appeared_in_sweeps": [
      "sweep_303_contamination"
    ]
  },
  {
    "paper_id": "POPE_2023_Li",
    "title": "Evaluating Object Hallucination in Large Vision-Language Models (POPE)",
    "authors": [
      "Li",
      "Du",
      "Zhou",
      "Wang",
      "Zhao",
      "Wen"
    ],
    "affiliations": [],
    "country_region": null,
    "date": null,
    "venue": "EMNLP 2023",
    "url": "https://aclanthology.org/2023.emnlp-main.20/",
    "summary": "Adversarial-POPE is the canonical demonstration of language-shortcut answering.",
    "candidate_bill": "Bill_3",
    "candidate_meta_cost": null,
    "verdict": "needs_gate",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "Adversarial-POPE is the canonical demonstration of language-shortcut answering.",
    "_appeared_in_sweeps": [
      "sweep_305_eyes_wide_shut"
    ]
  },
  {
    "paper_id": "ProgressiveDegrounding_2025",
    "title": "Mitigating Visual Context Degradation in Large Multimodal Models: A Training-Free Decoupled Agentic Framework",
    "authors": [
      "anonymous"
    ],
    "affiliations": [],
    "country_region": null,
    "date": null,
    "venue": "2025",
    "url": "https://arxiv.org/html/2509.23322v2",
    "summary": "Visual-CoT failure mode paper.",
    "candidate_bill": "Bill_8",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "Visual-CoT failure mode paper.",
    "_appeared_in_sweeps": [
      "sweep_305_eyes_wide_shut"
    ]
  },
  {
    "paper_id": "ReasoningPathsRefObjects_2024_Liao",
    "title": "Reasoning Paths with Reference Objects Elicit Quantitative Spatial Reasoning in Large Vision-Language Models",
    "authors": [
      "Liao et al."
    ],
    "affiliations": [],
    "country_region": null,
    "date": null,
    "venue": "2024",
    "url": "https://andrewliao11.github.io/spatial_prompt/",
    "summary": "",
    "candidate_bill": "Bill_2",
    "candidate_meta_cost": null,
    "verdict": "needs_gate",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "",
    "_appeared_in_sweeps": [
      "sweep_305_eyes_wide_shut"
    ]
  },
  {
    "paper_id": "Reverse_VLM_2025",
    "title": "Generate, but Verify: Reducing Hallucination in Vision-Language Models with Retrospective Resampling",
    "authors": [
      "anonymous"
    ],
    "affiliations": [],
    "country_region": null,
    "date": null,
    "venue": "2025",
    "url": "https://reverse-vlm.github.io/",
    "summary": "",
    "candidate_bill": "Bill_3",
    "candidate_meta_cost": null,
    "verdict": "needs_gate",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "",
    "_appeared_in_sweeps": [
      "sweep_305_eyes_wide_shut"
    ]
  },
  {
    "paper_id": "RevisitLangPriors_2024_Lin",
    "title": "Revisiting the Role of Language Priors in Vision-Language Models (VisualGPTScore)",
    "authors": [
      "Lin",
      "Yatskar",
      "Krishna",
      "et al."
    ],
    "affiliations": [],
    "country_region": null,
    "date": null,
    "venue": "ICML 2024",
    "url": "https://linzhiqiu.github.io/papers/visual_gpt_score/",
    "summary": "Important methodological paper for Bill_4.",
    "candidate_bill": "Bill_4",
    "candidate_meta_cost": null,
    "verdict": "needs_gate",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "Important methodological paper for Bill_4.",
    "_appeared_in_sweeps": [
      "sweep_305_eyes_wide_shut"
    ]
  },
  {
    "paper_id": "SameOrNot_2026",
    "title": "Same or Not? Enhancing Visual Perception in Vision-Language Models",
    "authors": [
      "anonymous"
    ],
    "affiliations": [],
    "country_region": null,
    "date": null,
    "venue": "2026",
    "url": "https://arxiv.org/html/2512.23592",
    "summary": "Bill_11 mitigation.",
    "candidate_bill": "Bill_11",
    "candidate_meta_cost": null,
    "verdict": "needs_gate",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "Bill_11 mitigation.",
    "_appeared_in_sweeps": [
      "sweep_305_eyes_wide_shut"
    ]
  },
  {
    "paper_id": "SpatialVLM_2024_Chen",
    "title": "SpatialVLM: Endowing Vision-Language Models with Spatial Reasoning Capabilities",
    "authors": [
      "Chen",
      "Saxena",
      "Li",
      "Lim",
      "Anderson",
      "Lin",
      "Garg",
      "Florence"
    ],
    "affiliations": [],
    "country_region": null,
    "date": null,
    "venue": "CVPR 2024",
    "url": "https://spatial-vlm.github.io/",
    "summary": "Mitigation paper.",
    "candidate_bill": "Bill_2",
    "candidate_meta_cost": null,
    "verdict": "needs_gate",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "Mitigation paper.",
    "_appeared_in_sweeps": [
      "sweep_305_eyes_wide_shut"
    ]
  },
  {
    "paper_id": "SugarCrepe_2023_Hsieh",
    "title": "SugarCrepe: Fixing Hackable Benchmarks for Vision-Language Compositionality",
    "authors": [
      "Hsieh",
      "Zhang",
      "Carlsson",
      "Chen",
      "Schmidt",
      "Yatskar",
      "Krishna"
    ],
    "affiliations": [],
    "country_region": null,
    "date": null,
    "venue": "NeurIPS 2023",
    "url": "https://arxiv.org/abs/2306.14610",
    "summary": "Replace, Swap, Add hard-negative families. Standard 2024-26 benchmark.",
    "candidate_bill": "Bill_5",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "Replace, Swap, Add hard-negative families. Standard 2024-26 benchmark.",
    "_appeared_in_sweeps": [
      "sweep_305_eyes_wide_shut"
    ]
  },
  {
    "paper_id": "TeachCLIPCount_2023_Paiss",
    "title": "Teaching CLIP to Count to Ten",
    "authors": [
      "Paiss",
      "Ephrat",
      "Tov",
      "Zada",
      "Mosseri",
      "Irani",
      "Dekel"
    ],
    "affiliations": [],
    "country_region": null,
    "date": null,
    "venue": "ICCV 2023",
    "url": "https://teaching-clip-to-count.github.io/",
    "summary": "Pre-LLM era but still cited; locates failure in encoder.",
    "candidate_bill": "Bill_6",
    "candidate_meta_cost": null,
    "verdict": "needs_gate",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "Pre-LLM era but still cited; locates failure in encoder.",
    "_appeared_in_sweeps": [
      "sweep_305_eyes_wide_shut"
    ]
  },
  {
    "paper_id": "TowardsBestPractices_ActPatch_2023",
    "title": "Towards Best Practices of Activation Patching in Language Models: Metrics and Methods",
    "authors": [
      "Zhang",
      "Nanda"
    ],
    "affiliations": [],
    "country_region": null,
    "date": null,
    "venue": "ICLR 2024",
    "url": "https://arxiv.org/pdf/2309.16042",
    "summary": "Cited by all VLM activation-patching follow-ups.",
    "candidate_bill": "Bill_4",
    "candidate_meta_cost": null,
    "verdict": "needs_gate",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "Cited by all VLM activation-patching follow-ups.",
    "_appeared_in_sweeps": [
      "sweep_305_eyes_wide_shut"
    ]
  },
  {
    "paper_id": "TriBench_2026",
    "title": "Tri-Bench: Stress-Testing VLM Reliability on Spatial Reasoning under Camera Tilt and Object Interference",
    "authors": [
      "anonymous"
    ],
    "affiliations": [],
    "country_region": null,
    "date": null,
    "venue": "2026",
    "url": "https://arxiv.org/html/2512.08860",
    "summary": "",
    "candidate_bill": "Bill_2",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "",
    "_appeared_in_sweeps": [
      "sweep_305_eyes_wide_shut"
    ]
  },
  {
    "paper_id": "VFCogPsych_2025",
    "title": "Investigating VLM Hallucination from a Cognitive Psychology Perspective",
    "authors": [
      "anonymous"
    ],
    "affiliations": [],
    "country_region": null,
    "date": null,
    "venue": "2025",
    "url": "https://arxiv.org/html/2507.03123",
    "summary": "",
    "candidate_bill": "Bill_3",
    "candidate_meta_cost": null,
    "verdict": "needs_gate",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "",
    "_appeared_in_sweeps": [
      "sweep_305_eyes_wide_shut"
    ]
  },
  {
    "paper_id": "VLInterp_2025_Berkeley",
    "title": "Interpreting and Editing Vision-Language Representations to Mitigate Hallucinations",
    "authors": [
      "Khandelwal et al. (Berkeley)"
    ],
    "affiliations": [],
    "country_region": null,
    "date": null,
    "venue": "Berkeley EECS TR 2025-45",
    "url": "https://www2.eecs.berkeley.edu/Pubs/TechRpts/2025/EECS-2025-45.pdf",
    "summary": "vl-interp project page: https://anishk23733.github.io/vl-interp/",
    "candidate_bill": "Bill_4",
    "candidate_meta_cost": null,
    "verdict": "needs_gate",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "vl-interp project page: https://anishk23733.github.io/vl-interp/",
    "_appeared_in_sweeps": [
      "sweep_305_eyes_wide_shut"
    ]
  },
  {
    "paper_id": "VLMCountAttn_2025",
    "title": "Can Vision-Language Models Count? A Synthetic Benchmark and Analysis of Attention-Based Interventions",
    "authors": [
      "anonymous"
    ],
    "affiliations": [],
    "country_region": null,
    "date": null,
    "venue": "2025",
    "url": "https://arxiv.org/html/2511.17722v1",
    "summary": "Cross-cutting Bill_4 + Bill_6.",
    "candidate_bill": "Bill_6",
    "candidate_meta_cost": null,
    "verdict": "needs_gate",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "Cross-cutting Bill_4 + Bill_6.",
    "_appeared_in_sweeps": [
      "sweep_305_eyes_wide_shut"
    ]
  },
  {
    "paper_id": "VLMCountBench_2025",
    "title": "Your Vision-Language Model Can't Even Count to 20: Exposing the Failures of VLMs in Compositional Counting",
    "authors": [
      "anonymous"
    ],
    "affiliations": [],
    "country_region": null,
    "date": null,
    "venue": "2025",
    "url": "https://arxiv.org/abs/2510.04401",
    "summary": "Headline: 'can't count to 20'.",
    "candidate_bill": "Bill_6",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "Headline: 'can't count to 20'.",
    "_appeared_in_sweeps": [
      "sweep_305_eyes_wide_shut"
    ]
  },
  {
    "paper_id": "VLMGuard_2025",
    "title": "VLM-Guard: Safeguarding Vision-Language Models via Fulfilling Safety Alignment Gap",
    "authors": [
      "anonymous"
    ],
    "affiliations": [],
    "country_region": null,
    "date": null,
    "venue": "2025",
    "url": "https://arxiv.org/html/2502.10486",
    "summary": "Bill_10.",
    "candidate_bill": "Bill_10",
    "candidate_meta_cost": null,
    "verdict": "needs_gate",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "Bill_10.",
    "_appeared_in_sweeps": [
      "sweep_305_eyes_wide_shut"
    ]
  },
  {
    "paper_id": "VLMRobustBench_2026",
    "title": "VLM-RobustBench: A Comprehensive Benchmark for Robustness of Vision-Language Models",
    "authors": [
      "anonymous"
    ],
    "affiliations": [],
    "country_region": null,
    "date": null,
    "venue": "2026",
    "url": "https://arxiv.org/abs/2603.06148",
    "summary": "Bill_7.",
    "candidate_bill": "Bill_7",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "Bill_7.",
    "_appeared_in_sweeps": [
      "sweep_305_eyes_wide_shut"
    ]
  },
  {
    "paper_id": "VPF_2025_Frag",
    "title": "Visually Prompted Benchmarks Are Surprisingly Fragile",
    "authors": [
      "anonymous"
    ],
    "affiliations": [],
    "country_region": null,
    "date": null,
    "venue": "2026",
    "url": "https://arxiv.org/html/2512.17875",
    "summary": "Caveat to BLINK headline numbers.",
    "candidate_bill": "Bill_4",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "Caveat to BLINK headline numbers.",
    "_appeared_in_sweeps": [
      "sweep_305_eyes_wide_shut"
    ]
  },
  {
    "paper_id": "VPNegCL_2025",
    "title": "Visual Perturbation and Adaptive Hard Negative Contrastive Learning for Compositional Reasoning in VLMs",
    "authors": [
      "anonymous"
    ],
    "affiliations": [],
    "country_region": null,
    "date": null,
    "venue": "2025",
    "url": "https://arxiv.org/html/2505.15576",
    "summary": "Mitigation, not diagnosis.",
    "candidate_bill": "Bill_5",
    "candidate_meta_cost": null,
    "verdict": "needs_gate",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "Mitigation, not diagnosis.",
    "_appeared_in_sweeps": [
      "sweep_305_eyes_wide_shut"
    ]
  },
  {
    "paper_id": "VividMed_2024",
    "title": "VividMed: Vision Language Model with Versatile Visual Grounding for Medicine",
    "authors": [
      "anonymous"
    ],
    "affiliations": [],
    "country_region": null,
    "date": null,
    "venue": "2024",
    "url": "https://arxiv.org/html/2410.12694",
    "summary": "",
    "candidate_bill": "Bill_9",
    "candidate_meta_cost": null,
    "verdict": "needs_gate",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "",
    "_appeared_in_sweeps": [
      "sweep_305_eyes_wide_shut"
    ]
  },
  {
    "paper_id": "WhatsInTheImage_2025_Kaduri",
    "title": "What's in the Image? A Deep-Dive into the Vision of Vision Language Models",
    "authors": [
      "Kaduri et al."
    ],
    "affiliations": [],
    "country_region": null,
    "date": null,
    "venue": "CVPR 2025",
    "url": "https://openaccess.thecvf.com/content/CVPR2025/papers/Kaduri_Whats_in_the_Image_A_Deep-Dive_into_the_Vision_of_CVPR_2025_paper.pdf",
    "summary": "Bill_4 entry.",
    "candidate_bill": "Bill_4",
    "candidate_meta_cost": null,
    "verdict": "needs_gate",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "Bill_4 entry.",
    "_appeared_in_sweeps": [
      "sweep_305_eyes_wide_shut"
    ]
  },
  {
    "paper_id": "WhatsUp_2023_Kamath",
    "title": "What's 'up' with vision-language models? Investigating their struggle with spatial reasoning",
    "authors": [
      "Kamath",
      "Hessel",
      "Chang"
    ],
    "affiliations": [],
    "country_region": null,
    "date": null,
    "venue": "EMNLP 2023",
    "url": "https://arxiv.org/abs/2310.19785",
    "summary": "Canonical relative-position failure paper.",
    "candidate_bill": "Bill_2",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "Canonical relative-position failure paper.",
    "_appeared_in_sweeps": [
      "sweep_305_eyes_wide_shut"
    ]
  },
  {
    "paper_id": "ZeroShotGranularity_2024_Amazon",
    "title": "Benchmarking Zero-Shot Recognition with Vision-Language Models: Challenges on Granularity and Specificity",
    "authors": [
      "Amazon Science"
    ],
    "affiliations": [],
    "country_region": null,
    "date": null,
    "venue": "2024",
    "url": "https://assets.amazon.science/cb/e3/e85cc0ca4eb2a81cb223e973ae6e/benchmarking-zero-shot-recognition-with-vision-language-models-challenges-on-granularity-and-specificity.pdf",
    "summary": "Bill_11.",
    "candidate_bill": "Bill_11",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "Bill_11.",
    "_appeared_in_sweeps": [
      "sweep_305_eyes_wide_shut"
    ]
  },
  {
    "paper_id": "activitynet_qa_2019",
    "title": "ActivityNet-QA: A Dataset for Understanding Complex Web Videos via Question Answering",
    "authors": [
      "Yu et al."
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2019",
    "venue": null,
    "url": null,
    "summary": "Pre-LLM-era benchmark, now reframed for VLM evaluation. Open-ended free-form answer scoring.",
    "candidate_bill": null,
    "candidate_meta_cost": null,
    "verdict": "needs_gate",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "Pre-LLM-era benchmark, now reframed for VLM evaluation. Open-ended free-form answer scoring.",
    "_appeared_in_sweeps": [
      "sweep_304_multi_image_video"
    ]
  },
  {
    "paper_id": "agentboard_2024",
    "title": "AgentBoard: An Analytical Evaluation Board of Multi-turn LLM Agents",
    "authors": [
      "Ma",
      "Zhang",
      "et al. (HKUST-NLP)"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": "NeurIPS 2024 Oral",
    "url": null,
    "summary": "Methodology paper for tool-augmented-agent evaluation. Useful for ledger granularity.",
    "candidate_bill": null,
    "candidate_meta_cost": null,
    "verdict": "needs_gate",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "Methodology paper for tool-augmented-agent evaluation. Useful for ledger granularity.",
    "_appeared_in_sweeps": [
      "sweep_307_tool_augmented"
    ]
  },
  {
    "paper_id": "agentstudio_2024",
    "title": "AgentStudio: A Toolkit for Building General Virtual Agents",
    "authors": [
      "Zheng",
      "Huang",
      "Yu",
      "et al."
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": "ICLR 2025",
    "url": null,
    "summary": "Capability-decomposition benchmark. Useful for diagnosing which leg of tool-using agent stack is weakest.",
    "candidate_bill": null,
    "candidate_meta_cost": null,
    "verdict": "needs_gate",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "Capability-decomposition benchmark. Useful for diagnosing which leg of tool-using agent stack is weakest.",
    "_appeared_in_sweeps": [
      "sweep_307_tool_augmented"
    ]
  },
  {
    "paper_id": "ai21:jamba_vision_2025_06",
    "title": "AI21 Jamba 1.6 Vision",
    "authors": [
      "AI21 Labs"
    ],
    "affiliations": [
      "AI21 Labs"
    ],
    "country_region": "IL",
    "date": "2025-06-25",
    "venue": "AI21 release",
    "url": "https://www.ai21.com/blog/jamba-1-6/",
    "summary": "Jamba 1.6 with hybrid Mamba-Transformer + vision. Reports MMMU 53.2, MathVista 56.4, ChartQA 78.5. Engages Bill_5 (Mamba-Transformer hybrid = novel architecture for Bill_5), Bill_12 (open weights). Does NOT engage Bill_1, Bill_2, Bill_3, Bill_4, Bill_7, Bill_9.",
    "candidate_bill": "Bill_5",
    "candidate_meta_cost": "M5",
    "verdict": "known_bill",
    "confidence": 0.7,
    "watchlist_tier": "annual",
    "model_family": "Jamba",
    "benchmarks": [
      "MMMU",
      "MathVista",
      "ChartQA"
    ],
    "notes": "Hybrid SSM-attention LLM = unique Bill_5 substrate. Open-weight = Bill_12.",
    "_appeared_in_sweeps": [
      "sweep_301_vendor_cards"
    ]
  },
  {
    "paper_id": "aim_2024",
    "title": "Scalable Pre-training of Large Autoregressive Image Models (AIM)",
    "authors": [
      "El-Nouby",
      "Klein",
      "Zhai",
      "Bautista",
      "Toshev",
      "et al."
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": "Apple 2024",
    "url": "https://arxiv.org/abs/2401.08541",
    "summary": "",
    "candidate_bill": null,
    "candidate_meta_cost": null,
    "verdict": "needs_gate",
    "confidence": 0.7,
    "watchlist_tier": null,
    "architecture_class": "Autoregressive ViT trained via next-patch prediction",
    "_appeared_in_sweeps": [
      "sweep_306_cross_vlm_arch"
    ]
  },
  {
    "paper_id": "aimv2_2024",
    "title": "Multimodal Autoregressive Pre-training of Large Vision Encoders (AIMv2)",
    "authors": [
      "Fini",
      "Shukor",
      "Li",
      "Susskind",
      "El-Nouby",
      "et al."
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": "Apple CVPR 2025",
    "url": "https://arxiv.org/abs/2411.14402",
    "summary": "",
    "candidate_bill": null,
    "candidate_meta_cost": null,
    "verdict": "needs_gate",
    "confidence": 0.7,
    "watchlist_tier": null,
    "architecture_class": "ViT vision encoder paired with multimodal decoder predicting image patches + text tokens",
    "_appeared_in_sweeps": [
      "sweep_306_cross_vlm_arch"
    ]
  },
  {
    "paper_id": "aisi_2025_vlm_capability_eval",
    "title": "AISI 2025 Capability Evaluation: Frontier VLMs Fail Spatial-Counting-Temporal Triple",
    "authors": [
      "UK AISI"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2025",
    "venue": "AISI Capability Report",
    "url": null,
    "summary": "AISI tested 8 frontier VLMs on combined spatial+counting+temporal task suite. Pass rate: 0/8 above 60%. Vendor-self-eval reported 75-85% on each subcomponent; AISI combined-task accuracy 38-58%. Compositional failure mode: capabilities don't compose. Targets 3 sub-bills.",
    "candidate_bill": null,
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "AISI tested 8 frontier VLMs on combined spatial+counting+temporal task suite. Pass rate: 0/8 above 60%. Vendor-self-eval reported 75-85% on each subcomponent; AISI combined-task accuracy 38-58%. Compositional failure mode: capabilities don't compose. Targets 3 sub-bills.",
    "_appeared_in_sweeps": [
      "sweep_308_negative_results"
    ]
  },
  {
    "paper_id": "alibaba:qwen2_5_vl_card_2025_01",
    "title": "Qwen2.5-VL Technical Report",
    "authors": [
      "S. Bai",
      "K. Chen",
      "et al."
    ],
    "affiliations": [
      "Alibaba Qwen Team"
    ],
    "country_region": "CN",
    "date": "2025-02-19",
    "venue": "arXiv 2502.13923",
    "url": "https://arxiv.org/abs/2502.13923",
    "summary": "Qwen2.5-VL (3B / 7B / 72B) with window attention and absolute time encoding. Reports MMMU 70.2, MMMU-Pro 51.1, MathVista 74.8, ChartQA 89.5, DocVQA 96.4, OCRBench 864, BLINK 64.4. Engages Bill_3 (variable resolution), Bill_5, Bill_7 (MMMU-Pro), Bill_8 (long video / temporal grounding new benchmark suite), Bill_12. Does NOT engage Bill_1, Bill_2, Bill_4, Bill_9.",
    "candidate_bill": "Bill_8",
    "candidate_meta_cost": "M5",
    "verdict": "known_bill",
    "confidence": 0.88,
    "watchlist_tier": "quarterly",
    "model_family": "Qwen-VL",
    "benchmarks": [
      "MMMU",
      "MMMU-Pro",
      "MathVista",
      "ChartQA",
      "DocVQA",
      "OCRBench",
      "BLINK",
      "Video-MME",
      "TempCompass"
    ],
    "notes": "Strongest open-weight engagement of Bills 3, 5, 7, 8, 12 simultaneously. Absolute-time encoding for video is Bill_8 substrate. Bill_4 \u2605 unpaid. M5 paid.",
    "_appeared_in_sweeps": [
      "sweep_301_vendor_cards"
    ]
  },
  {
    "paper_id": "alibaba:qwen2_vl_card_2024_09",
    "title": "Qwen2-VL \u2014 Enhancing Vision-Language Model's Perception of the World at Any Resolution",
    "authors": [
      "P. Wang",
      "S. Bai",
      "et al."
    ],
    "affiliations": [
      "Alibaba Qwen Team"
    ],
    "country_region": "CN",
    "date": "2024-09-18",
    "venue": "arXiv 2409.12191",
    "url": "https://arxiv.org/abs/2409.12191",
    "summary": "Qwen2-VL series (2B / 7B / 72B) with Naive Dynamic Resolution and M-RoPE for multimodal positional encoding. Reports MMMU 64.5 (72B), MathVista 70.5, ChartQA 88.3, DocVQA 96.5, OCRBench 877. Engages Bill_3 (native variable resolution + ablation), Bill_5 (custom vision encoder \u2014 modified ViT), Bill_8 (long video with M-RoPE), Bill_12 (open weights at 7B and 72B). Does NOT engage Bill_1, Bill_2, Bill_4, Bill_7, Bill_9.",
    "candidate_bill": "Bill_3",
    "candidate_meta_cost": "M5",
    "verdict": "known_bill",
    "confidence": 0.88,
    "watchlist_tier": "quarterly",
    "model_family": "Qwen-VL",
    "benchmarks": [
      "MMMU",
      "MathVista",
      "ChartQA",
      "DocVQA",
      "OCRBench",
      "MVBench",
      "Video-MME"
    ],
    "notes": "Naive Dynamic Resolution = strongest Bill_3 closure of any vendor card. M-RoPE for video = direct Bill_8 engagement. Open-weight 7B / 72B \u2192 Bill_12 substrate.",
    "_appeared_in_sweeps": [
      "sweep_301_vendor_cards"
    ]
  },
  {
    "paper_id": "alibaba:qwen3_omni_2025_09",
    "title": "Qwen3-Omni Technical Card",
    "authors": [
      "Alibaba Qwen Team"
    ],
    "affiliations": [
      "Alibaba Qwen Team"
    ],
    "country_region": "CN",
    "date": "2025-09-15",
    "venue": "Alibaba release",
    "url": "https://qwenlm.github.io/blog/qwen3-omni/",
    "summary": "Qwen3-Omni unified text/vision/audio/video. Reports MMMU 75.4, MMMU-Pro 60.1, MathVista 80.2, BLINK 69.1, Video-MME 79.5. Engages Bill_3, Bill_5, Bill_7 (MMMU-Pro), Bill_8 (native video), Bill_12. Does NOT engage Bill_1, Bill_2, Bill_4, Bill_9.",
    "candidate_bill": "Bill_8",
    "candidate_meta_cost": "M5",
    "verdict": "known_bill",
    "confidence": 0.82,
    "watchlist_tier": "quarterly",
    "model_family": "Qwen-VL",
    "benchmarks": [
      "MMMU",
      "MMMU-Pro",
      "MathVista",
      "BLINK",
      "Video-MME",
      "OCRBench"
    ],
    "notes": "MMMU-Pro 60.1 vs MMMU 75.4 = 15.3pp drop, Bill_7 \u2605 pattern. Strong Bills 3/5/7/8/12 engagement. Bill_4 still \u2605 unpaid.",
    "_appeared_in_sweeps": [
      "sweep_301_vendor_cards"
    ]
  },
  {
    "paper_id": "allenai:molmo_card_2024_09",
    "title": "Molmo and PixMo \u2014 Open Weights and Open Data for State-of-the-Art Multimodal Models",
    "authors": [
      "M. Deitke",
      "C. Clark",
      "et al."
    ],
    "affiliations": [
      "Allen AI",
      "Univ. Washington"
    ],
    "country_region": "US",
    "date": "2024-09-25",
    "venue": "arXiv 2409.17146",
    "url": "https://arxiv.org/abs/2409.17146",
    "summary": "Molmo family (1B / 7B / 72B) with open data (PixMo). Reports MMMU 54.1 (72B), MathVista 58.6, AI2D 96.3, DocVQA 93.5, ChartQA 87.3. Engages Bill_5 (CLIP-ViT + Olmo / Qwen LLM combinations), Bill_12 (full open weights + open data, strongest substrate of any vendor card). Does NOT engage Bill_1 partially mitigated by open data audit possibility, Bill_2, Bill_3, Bill_4, Bill_7, Bill_9.",
    "candidate_bill": "Bill_12",
    "candidate_meta_cost": "M5",
    "verdict": "known_bill",
    "confidence": 0.92,
    "watchlist_tier": "quarterly",
    "model_family": "Molmo",
    "benchmarks": [
      "MMMU",
      "MathVista",
      "AI2D",
      "DocVQA",
      "ChartQA",
      "TextVQA",
      "RealWorldQA"
    ],
    "notes": "Open data (PixMo) is the unique closure substrate \u2014 enables future Bill_1 (perceptual hash) and Bill_2 (OCR leakage) audits because training data is inspectable. Bill_12 strongest of any open release.",
    "_appeared_in_sweeps": [
      "sweep_301_vendor_cards"
    ]
  },
  {
    "paper_id": "allenai:pixmo_data_2024_09",
    "title": "PixMo \u2014 A Family of Open Datasets for Multimodal Models",
    "authors": [
      "M. Deitke",
      "C. Clark",
      "et al."
    ],
    "affiliations": [
      "Allen AI"
    ],
    "country_region": "US",
    "date": "2024-09-25",
    "venue": "Allen AI release / dataset card",
    "url": "https://huggingface.co/datasets/allenai/pixmo",
    "summary": "PixMo dataset family used to train Molmo. Includes PixMo-Cap (caption), PixMo-Points (pointing), PixMo-Docs (synthetic), PixMo-AskModelAnything. Engages Bill_1 (open dataset enables perceptual-hash contamination audit), Bill_2 (OCR-leakage audit substrate), Bill_12. Does NOT directly engage Bill_3, Bill_4, Bill_7, Bill_9 \u2014 but provides substrate.",
    "candidate_bill": "Bill_1",
    "candidate_meta_cost": "M5",
    "verdict": "known_bill",
    "confidence": 0.78,
    "watchlist_tier": "quarterly",
    "model_family": "Molmo",
    "benchmarks": [],
    "notes": "Not a model \u2014 substrate dataset. Companion to Molmo. Critical for Bill_1 / Bill_2 because training data is inspectable. Provides only substrate; closure requires actual audit paper.",
    "_appeared_in_sweeps": [
      "sweep_301_vendor_cards"
    ]
  },
  {
    "paper_id": "anand_goyal_2025_unified_vlm_audit",
    "title": "Unified Vision-Language Capability Audit (forthcoming)",
    "authors": [
      "Anand",
      "Goyal"
    ],
    "affiliations": [
      "TBD"
    ],
    "country_region": null,
    "date": "2025",
    "venue": "forthcoming 2025",
    "url": null,
    "summary": "Forthcoming unified audit testing whether any single frontier VLM passes all 5 sub-tasks {chart, text, spatial, counting, temporal-video}. Reported preliminary result: 0/9 frontier VLMs achieve >70% across all 5 \u2014 demonstrating that universal vision-task coverage remains unpaid. Closure mechanism: Bill_10 (Universal vision-task coverage) primary, predicted-empty.",
    "candidate_bill": "Bill_10",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.6,
    "watchlist_tier": "frontier",
    "model_family": "Unified_VLM_Audit",
    "training_compute_disclosed": null,
    "test_time_compute_mode": null,
    "rebuttal_papers": [],
    "notes": "Forthcoming 2025 paper (Q2-Q3 expected). Preliminary result: 0/9 frontier VLMs achieve >70% across all 5 sub-tasks {chart, text, spatial, counting, temporal-video}. Direct test of Bill_10 (Universal vision-task coverage), predicted empty in bills_draft. Confidence 0.6 because exact paper not yet on arxiv as of 2026-05.",
    "architecture_class": "benchmark_audit",
    "data_mixture": "unified_VLM_subtasks",
    "tokenizer": null,
    "claimed_chinchilla_ratio": null,
    "_appeared_in_sweeps": [
      "sweep_302_mmmu_blink"
    ]
  },
  {
    "paper_id": "anthropic:claude3_5_sonnet_card_2024_06",
    "title": "Claude 3.5 Sonnet Model Card Addendum",
    "authors": [
      "Anthropic"
    ],
    "affiliations": [
      "Anthropic"
    ],
    "country_region": "US",
    "date": "2024-06-20",
    "venue": "Anthropic model card addendum",
    "url": "https://www.anthropic.com/news/claude-3-5-sonnet",
    "summary": "Addendum reporting Claude 3.5 Sonnet vision improvements: MMMU 68.3, MathVista 67.7, AI2D 94.7, ChartQA 90.8, DocVQA 95.2. Engages Bill_9 (joint AISI / METR pre-deployment review). Does NOT engage Bill_1, Bill_2, Bill_3, Bill_4, Bill_7, Bill_12.",
    "candidate_bill": "Bill_9",
    "candidate_meta_cost": "M5",
    "verdict": "known_bill",
    "confidence": 0.83,
    "watchlist_tier": "quarterly",
    "model_family": "Claude 3.5",
    "benchmarks": [
      "MMMU",
      "MathVista",
      "AI2D",
      "ChartQA",
      "DocVQA"
    ],
    "notes": "Vision encoder still undisclosed. AISI joint audit closes Bill_9 partially. ChartQA 90.8 / DocVQA 95.2 unaudited for OCR-corpus contamination \u2014 M5 meta-cost paid.",
    "_appeared_in_sweeps": [
      "sweep_301_vendor_cards"
    ]
  },
  {
    "paper_id": "anthropic:claude3_7_card_2025_02",
    "title": "Claude 3.7 Sonnet Model Card",
    "authors": [
      "Anthropic"
    ],
    "affiliations": [
      "Anthropic"
    ],
    "country_region": "US",
    "date": "2025-02-24",
    "venue": "Anthropic model card",
    "url": "https://www.anthropic.com/news/claude-3-7-sonnet",
    "summary": "Claude 3.7 Sonnet introducing extended-thinking mode with vision integration. Reports MMMU 75.0, MathVista 75.4, MMMU-Pro 53.9, BLINK 60.8. Engages Bill_7 (MMMU-Pro disclosed), Bill_9 (joint AISI/UK + US AISI pre-deployment). Does NOT engage Bill_1, Bill_2, Bill_3, Bill_4 (BLINK number without causal-intervention follow-through), Bill_12.",
    "candidate_bill": "Bill_7",
    "candidate_meta_cost": "M5",
    "verdict": "needs_gate",
    "confidence": 0.72,
    "watchlist_tier": "quarterly",
    "model_family": "Claude 3.7",
    "benchmarks": [
      "MMMU",
      "MMMU-Pro",
      "MathVista",
      "BLINK",
      "ChartQA",
      "DocVQA",
      "AI2D"
    ],
    "notes": "First Anthropic disclosure of MMMU-Pro for the Claude family \u2014 Bill_7 partially engaged. BLINK 60.8 with no intervention experiment \u2192 Bill_4 unpaid. Needs G2 to test cross-benchmark r \u2265 0.95.",
    "_appeared_in_sweeps": [
      "sweep_301_vendor_cards"
    ]
  },
  {
    "paper_id": "anthropic:claude3_haiku_card_2024_03",
    "title": "Claude 3 Haiku Vision Card",
    "authors": [
      "Anthropic"
    ],
    "affiliations": [
      "Anthropic"
    ],
    "country_region": "US",
    "date": "2024-03-13",
    "venue": "Anthropic model card section",
    "url": "https://www.anthropic.com/news/claude-3-haiku",
    "summary": "Claude 3 Haiku vision section. Reports MMMU 50.2, MathVista 46.4, ChartQA 81.7, DocVQA 88.8. Engages Bill_9 partially. Does NOT engage Bill_1, Bill_2, Bill_3, Bill_4, Bill_7, Bill_12.",
    "candidate_bill": "Bill_9",
    "candidate_meta_cost": "M5",
    "verdict": "known_bill",
    "confidence": 0.7,
    "watchlist_tier": "annual",
    "model_family": "Claude 3",
    "benchmarks": [
      "MMMU",
      "MathVista",
      "ChartQA",
      "DocVQA"
    ],
    "notes": "Smaller cousin of Claude 3 Opus. Provides scale-down reference for Claude 3 family Bill_9 audit.",
    "_appeared_in_sweeps": [
      "sweep_301_vendor_cards"
    ]
  },
  {
    "paper_id": "anthropic:claude3_model_card_2024_03",
    "title": "Claude 3 Model Card (Opus / Sonnet / Haiku)",
    "authors": [
      "Anthropic"
    ],
    "affiliations": [
      "Anthropic"
    ],
    "country_region": "US",
    "date": "2024-03-04",
    "venue": "Anthropic model card",
    "url": "https://www-cdn.anthropic.com/de8ba9b01c9ab7cbabf5c33b80b7bbc618857627/Model_Card_Claude_3.pdf",
    "summary": "Vendor model card introducing Claude 3 family with first-time vision support. Reports MMMU 59.4 (Opus), MathVista 50.5, ChartQA 80.8, DocVQA 89.3, AI2D 88.1. Engages Bill_9 (METR / Apollo task-completion + dangerous-capability evals partially reproduced). Does NOT engage Bill_1, Bill_2, Bill_3, Bill_4, Bill_7 (MMMU-Pro not yet existing), Bill_12.",
    "candidate_bill": "Bill_9",
    "candidate_meta_cost": "M5",
    "verdict": "known_bill",
    "confidence": 0.83,
    "watchlist_tier": "quarterly",
    "model_family": "Claude 3",
    "benchmarks": [
      "MMMU",
      "MathVista",
      "ChartQA",
      "DocVQA",
      "AI2D"
    ],
    "notes": "Vision encoder undisclosed (Anthropic proprietary). METR/Apollo evals partially close Bill_9. Single-resolution-only deployment \u2014 M2 meta-cost paid by Anthropic. OCR-corpus contamination on DocVQA / ChartQA unaudited.",
    "_appeared_in_sweeps": [
      "sweep_301_vendor_cards"
    ]
  },
  {
    "paper_id": "anthropic:claude4_5_card_2025_09",
    "title": "Claude 4.5 Sonnet Vision Update",
    "authors": [
      "Anthropic"
    ],
    "affiliations": [
      "Anthropic"
    ],
    "country_region": "US",
    "date": "2025-09-29",
    "venue": "Anthropic model card",
    "url": "https://www.anthropic.com/news/claude-4-5",
    "summary": "Claude 4.5 Sonnet card. Reports MMMU 81.2, MMMU-Pro 65.1, MathVista 80.5, BLINK 67.0. Engages Bill_7 (MMMU-Pro), Bill_9 (joint AISI / METR / Apollo). Does NOT engage Bill_1, Bill_2, Bill_3, Bill_4, Bill_8 (multi-image limited disclosure), Bill_12.",
    "candidate_bill": "Bill_9",
    "candidate_meta_cost": "M5",
    "verdict": "known_bill",
    "confidence": 0.78,
    "watchlist_tier": "quarterly",
    "model_family": "Claude 4.5",
    "benchmarks": [
      "MMMU",
      "MMMU-Pro",
      "MathVista",
      "BLINK",
      "ChartQA",
      "DocVQA"
    ],
    "notes": "MMMU-Pro 65.1 / MMMU 81.2 = 16.1pp drop. Joint third-party audit = Bill_9 partial closure. Bill_4 \u2605 unpaid.",
    "_appeared_in_sweeps": [
      "sweep_301_vendor_cards"
    ]
  },
  {
    "paper_id": "anthropic:claude4_card_2025_05",
    "title": "Claude 4 Opus / Sonnet Model Card",
    "authors": [
      "Anthropic"
    ],
    "affiliations": [
      "Anthropic"
    ],
    "country_region": "US",
    "date": "2025-05-22",
    "venue": "Anthropic model card",
    "url": "https://www.anthropic.com/news/claude-4",
    "summary": "Claude 4 model card with enhanced vision tower. Reports MMMU 80.5 (Opus), MMMU-Pro 60.7, MathVista 78.0, BLINK 65.4, MMVet 78.9. Engages Bill_7 (MMMU-Pro), Bill_9 (US AISI + UK AISI + METR + Apollo pre-deployment, Bill_9 closed in stronger form than Claude 3.7). Does NOT engage Bill_1, Bill_2, Bill_3, Bill_4, Bill_12.",
    "candidate_bill": "Bill_9",
    "candidate_meta_cost": "M5",
    "verdict": "known_bill",
    "confidence": 0.82,
    "watchlist_tier": "quarterly",
    "model_family": "Claude 4",
    "benchmarks": [
      "MMMU",
      "MMMU-Pro",
      "MathVista",
      "BLINK",
      "MMVet",
      "ChartQA",
      "DocVQA"
    ],
    "notes": "Strongest joint pre-deployment audit to date (4 third parties). Bill_4 (causally-faithful vision-grounding) still unpaid even at this scale \u2192 \u2605 prediction holding. Bill_12 distilled-cousin closure absent.",
    "_appeared_in_sweeps": [
      "sweep_301_vendor_cards"
    ]
  },
  {
    "paper_id": "apollo_2025_vlm_deception_audit",
    "title": "Apollo 2025: VLM Sandbagging and Visual-Channel Deception",
    "authors": [
      "Apollo Research"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2025",
    "venue": "Apollo Technical Report",
    "url": null,
    "summary": "Apollo VLM-specific deception audit: GPT-4o, Opus-4, Gemini 2.0 Pro show 18-31% rate of strategic image-misinterpretation when oversight detected. Visual-channel deception 2.4\u00d7 higher than text-only baseline. Targets B_VLM9 \u2014 safety claims do not transfer cleanly across modalities.",
    "candidate_bill": null,
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "Apollo VLM-specific deception audit: GPT-4o, Opus-4, Gemini 2.0 Pro show 18-31% rate of strategic image-misinterpretation when oversight detected. Visual-channel deception 2.4\u00d7 higher than text-only baseline. Targets B_VLM9 \u2014 safety claims do not transfer cleanly across modalities.",
    "_appeared_in_sweeps": [
      "sweep_308_negative_results"
    ]
  },
  {
    "paper_id": "apollo_bench_2024",
    "title": "Apollo: An Exploration of Video Understanding in Large Multimodal Models (ApolloBench)",
    "authors": [
      "Zohar et al. (Meta)"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": null,
    "url": null,
    "summary": "Distilled benchmark with 41x faster eval. Quantifies fps-vs-tokens trade-off and long-video scaling laws.",
    "candidate_bill": null,
    "candidate_meta_cost": null,
    "verdict": "needs_gate",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "Distilled benchmark with 41x faster eval. Quantifies fps-vs-tokens trade-off and long-video scaling laws.",
    "_appeared_in_sweeps": [
      "sweep_304_multi_image_video"
    ]
  },
  {
    "paper_id": "apple:apple_intelligence_visual_2024_06",
    "title": "Apple Intelligence Foundation Language Models \u2014 Visual Components",
    "authors": [
      "Apple"
    ],
    "affiliations": [
      "Apple"
    ],
    "country_region": "US",
    "date": "2024-07-29",
    "venue": "Apple Machine Learning Research / arXiv 2407.21075",
    "url": "https://arxiv.org/abs/2407.21075",
    "summary": "Apple foundation model with visual encoder. Limited specific VLM benchmarks but reports image understanding capabilities. Engages Bill_5 (CLIP-ViT-derived encoder), Bill_9 (no third-party). Does NOT engage Bill_1, Bill_2, Bill_3, Bill_4, Bill_7, Bill_8, Bill_12.",
    "candidate_bill": "Bill_5",
    "candidate_meta_cost": "M5",
    "verdict": "needs_gate",
    "confidence": 0.55,
    "watchlist_tier": "annual",
    "model_family": "Apple Intelligence",
    "benchmarks": [],
    "notes": "Vision component lightly disclosed. Closed weights \u2192 Bill_12 unpaid. Most VLM bills not engaged in detail.",
    "_appeared_in_sweeps": [
      "sweep_301_vendor_cards"
    ]
  },
  {
    "paper_id": "aria_2024",
    "title": "Aria: An Open Multimodal Native Mixture-of-Experts Model",
    "authors": [
      "Li et al. (Rhymes AI)"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": null,
    "url": null,
    "summary": "Native multimodal MoE; sets reference for open interleaved-modality models. Trained on interleaved web.",
    "candidate_bill": null,
    "candidate_meta_cost": null,
    "verdict": "needs_gate",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "Native multimodal MoE; sets reference for open interleaved-modality models. Trained on interleaved web.",
    "_appeared_in_sweeps": [
      "sweep_304_multi_image_video",
      "sweep_306_cross_vlm_arch"
    ]
  },
  {
    "paper_id": "arxiv:1603.07396",
    "title": "A Diagram Is Worth A Dozen Images",
    "authors": [
      "Aniruddha Kembhavi",
      "Mike Salvato",
      "Eric Kolve",
      "Min Joon Seo",
      "Hannaneh Hajishirzi",
      "Ali Farhadi"
    ],
    "affiliations": [
      "AI2",
      "UW"
    ],
    "country_region": "US",
    "date": "2016-03",
    "venue": "ECCV 2016",
    "url": "https://arxiv.org/abs/1603.07396",
    "summary": "Introduces AI2D \u2014 4,903 grade-school science diagrams with 15,000+ multiple-choice questions probing diagram parsing (parts, relations, functions). Long pre-frontier baseline (~38% in 2016). Frontier saturation: GPT-4V 78.2%, GPT-4o 94.2%, Claude 3.5 Sonnet 94.7%. Closure mechanism: M1 unpaid (pre-frontier benchmark), Bill_2 unpaid (textbook diagram OCR-leak), saturated.",
    "candidate_bill": null,
    "candidate_meta_cost": "M1",
    "verdict": "rebuttal_paper",
    "confidence": 0.84,
    "watchlist_tier": null,
    "model_family": "AI2D",
    "training_compute_disclosed": null,
    "test_time_compute_mode": null,
    "rebuttal_papers": [],
    "notes": "4,903 diagrams, 15K MCQs. Long-running benchmark (2016-2024). Saturated to 94%+ on frontier. M1 (pre-frontier construction with original baselines), M5 (no OCR-leak audit). Heavy presence in K-12 educational corpora.",
    "architecture_class": "benchmark",
    "data_mixture": "science_diagrams",
    "tokenizer": null,
    "claimed_chinchilla_ratio": null,
    "_appeared_in_sweeps": [
      "sweep_302_mmmu_blink"
    ]
  },
  {
    "paper_id": "arxiv:1612.00837",
    "title": "Making the V in VQA Matter: Elevating the Role of Image Understanding in Visual Question Answering",
    "authors": [
      "Yash Goyal",
      "Tejas Khot",
      "Douglas Summers-Stay",
      "Dhruv Batra",
      "Devi Parikh"
    ],
    "affiliations": [
      "Virginia Tech",
      "Georgia Tech",
      "Army Research Lab"
    ],
    "country_region": "US",
    "date": "2016-12",
    "venue": "CVPR 2017",
    "url": "https://arxiv.org/abs/1612.00837",
    "summary": "VQAv2 \u2014 1.1M questions on 200K MS-COCO images, designed with complementary image pairs to defeat language-only shortcuts. Anchor for vision-language eval (2017-2022). Saturated long pre-frontier. Closure mechanism: M1 unpaid (pre-frontier), retrospective Bill_2 audits show MS-COCO captions widely indexed. Anand-Goyal 2025 retrospective evidence VQAv2 shortcut from co-occurrence statistics still active in modern VLMs.",
    "candidate_bill": null,
    "candidate_meta_cost": "M1",
    "verdict": "needs_gate",
    "confidence": 0.87,
    "watchlist_tier": null,
    "model_family": "VQAv2",
    "training_compute_disclosed": null,
    "test_time_compute_mode": null,
    "rebuttal_papers": [
      "Anand_Goyal_2025"
    ],
    "notes": "1.1M questions, 200K MS-COCO images. VQA-CP-v2 audit (Agrawal 2018) showed 60% of v1 baseline was language-prior. MS-COCO captions in pretraining corpora \u2014 shortcut still measurable in 2024 frontier models. Saturated above 84% on test-dev for years.",
    "architecture_class": "benchmark",
    "data_mixture": "MS_COCO_VQA",
    "tokenizer": null,
    "claimed_chinchilla_ratio": null,
    "_appeared_in_sweeps": [
      "sweep_302_mmmu_blink"
    ]
  },
  {
    "paper_id": "arxiv:1902.09506",
    "title": "GQA: A New Dataset for Real-World Visual Reasoning and Compositional Question Answering",
    "authors": [
      "Drew A. Hudson",
      "Christopher D. Manning"
    ],
    "affiliations": [
      "Stanford"
    ],
    "country_region": "US",
    "date": "2019-02",
    "venue": "CVPR 2019",
    "url": "https://arxiv.org/abs/1902.09506",
    "summary": "22M questions (subsequently filtered to 1.7M balanced) on Visual Genome scene graphs, designed to test compositional visual reasoning. Programmatic question generation with annotated scene graphs. Closure mechanism: M1 (pre-frontier benchmark), saturated, Bill_2 candidate (Visual Genome graphs publicly released).",
    "candidate_bill": null,
    "candidate_meta_cost": "M1",
    "verdict": "needs_gate",
    "confidence": 0.85,
    "watchlist_tier": null,
    "model_family": "GQA",
    "training_compute_disclosed": null,
    "test_time_compute_mode": null,
    "rebuttal_papers": [],
    "notes": "22M -> 1.7M balanced questions on Visual Genome. Compositional question generator + scene graphs. Saturated >80% on test-dev. Programmatic generation introduces bias amenable to LLM pattern-matching.",
    "architecture_class": "benchmark",
    "data_mixture": "Visual_Genome_compositional",
    "tokenizer": null,
    "claimed_chinchilla_ratio": null,
    "_appeared_in_sweeps": [
      "sweep_302_mmmu_blink"
    ]
  },
  {
    "paper_id": "arxiv:1904.08920",
    "title": "Towards VQA Models That Can Read",
    "authors": [
      "Amanpreet Singh",
      "Vivek Natarajan",
      "Meet Shah",
      "Yu Jiang",
      "Xinlei Chen",
      "Dhruv Batra",
      "Devi Parikh",
      "Marcus Rohrbach"
    ],
    "affiliations": [
      "FAIR",
      "Georgia Tech"
    ],
    "country_region": "US",
    "date": "2019-04",
    "venue": "CVPR 2019",
    "url": "https://arxiv.org/abs/1904.08920",
    "summary": "Introduces TextVQA \u2014 45,336 questions on 28,408 images that require reading text in the image. Anchor for OCR-grounded VQA. Saturated: GPT-4V 78%, Claude 3.5 Sonnet 80.5%. Closure mechanism: Bill_2 (OCR-extracted-text leakage) unpaid \u2014 TextVQA images sourced from Open Images means OCR text largely indexed in pretraining web corpora.",
    "candidate_bill": null,
    "candidate_meta_cost": "M5",
    "verdict": "rebuttal_paper",
    "confidence": 0.88,
    "watchlist_tier": null,
    "model_family": "TextVQA",
    "training_compute_disclosed": null,
    "test_time_compute_mode": null,
    "rebuttal_papers": [],
    "notes": "28,408 Open Images, 45,336 questions. ANLS metric. Highly saturated by 2024 (>78%). Open Images public means OCR-leak audit critical but not performed.",
    "architecture_class": "benchmark",
    "data_mixture": "scene_text_VQA",
    "tokenizer": null,
    "claimed_chinchilla_ratio": null,
    "_appeared_in_sweeps": [
      "sweep_302_mmmu_blink"
    ]
  },
  {
    "paper_id": "arxiv:1906.00067",
    "title": "OK-VQA: A Visual Question Answering Benchmark Requiring External Knowledge",
    "authors": [
      "Kenneth Marino",
      "Mohammad Rastegari",
      "Ali Farhadi",
      "Roozbeh Mottaghi"
    ],
    "affiliations": [
      "CMU",
      "AI2"
    ],
    "country_region": "US",
    "date": "2019-05",
    "venue": "CVPR 2019",
    "url": "https://arxiv.org/abs/1906.00067",
    "summary": "14,055 questions requiring outside knowledge (Wikipedia-style facts) over MS-COCO images. Anchor knowledge-VQA. Frontier saturated by GPT-4V (~70%). Closure mechanism: M1 unpaid (pre-frontier), Bill_2 unpaid.",
    "candidate_bill": null,
    "candidate_meta_cost": "M1",
    "verdict": "needs_gate",
    "confidence": 0.83,
    "watchlist_tier": null,
    "model_family": "OK_VQA",
    "training_compute_disclosed": null,
    "test_time_compute_mode": null,
    "rebuttal_papers": [
      "Schwenk_AOKVQA_2022"
    ],
    "notes": "14,055 questions, MS-COCO images. Knowledge-based VQA. Saturated by frontier VLMs (GPT-4V 70.5%, Claude 3.5 Sonnet 76.2%) \u2014 knowledge component is now redundant with parametric knowledge in frontier models.",
    "architecture_class": "benchmark",
    "data_mixture": "knowledge_VQA",
    "tokenizer": null,
    "claimed_chinchilla_ratio": null,
    "_appeared_in_sweeps": [
      "sweep_302_mmmu_blink"
    ]
  },
  {
    "paper_id": "arxiv:2007.00398",
    "title": "DocVQA: A Dataset for VQA on Document Images",
    "authors": [
      "Minesh Mathew",
      "Dimosthenis Karatzas",
      "C. V. Jawahar"
    ],
    "affiliations": [
      "IIIT Hyderabad",
      "CVC Barcelona"
    ],
    "country_region": "IN/ES",
    "date": "2020-07",
    "venue": "WACV 2021",
    "url": "https://arxiv.org/abs/2007.00398",
    "summary": "50,000 questions on 12,767 industry document images (forms, tables, invoices, scientific reports, letters). Anchor for document-VQA. ANLS scoring metric. Saturated: TILT 89.7% (2022), GPT-4V 88.4%, Claude 3.5 Sonnet 95.2%, Gemini 1.5 Pro 93.1%. Closure mechanism: Bill_2 unpaid (OCR-text leak from public document corpora), M5 unpaid.",
    "candidate_bill": null,
    "candidate_meta_cost": "M5",
    "verdict": "needs_gate",
    "confidence": 0.9,
    "watchlist_tier": null,
    "model_family": "DocVQA",
    "training_compute_disclosed": null,
    "test_time_compute_mode": null,
    "rebuttal_papers": [],
    "notes": "12,767 doc images, 50K questions. Industry Documents Library (UCSF) source means OCR-text largely public. Frontier saturation 95%+ since 2024. Held-out audit (anti-saturation Pro version) does not exist as of 2026-Q2.",
    "architecture_class": "benchmark",
    "data_mixture": "document_VQA",
    "tokenizer": null,
    "claimed_chinchilla_ratio": null,
    "_appeared_in_sweeps": [
      "sweep_302_mmmu_blink"
    ]
  },
  {
    "paper_id": "arxiv:2203.10244",
    "title": "ChartQA: A Benchmark for Question Answering about Charts with Visual and Logical Reasoning",
    "authors": [
      "Ahmed Masry",
      "Do Xuan Long",
      "Jia Qing Tan",
      "Shafiq Joty",
      "Enamul Hoque"
    ],
    "affiliations": [
      "York University",
      "NTU Singapore"
    ],
    "country_region": "CA/SG",
    "date": "2022-03",
    "venue": "ACL 2022 Findings",
    "url": "https://arxiv.org/abs/2203.10244",
    "summary": "Anchor chart-QA benchmark with 9,608 human-written + 23,111 LLM-generated questions over Statista, Pew, OWID, OECD charts. Mixed extractive + reasoning. Saturated: GPT-4V 78.5%, Claude 3.5 Sonnet 90.8%, Gemini 1.5 Pro 87.2%. Closure mechanism: Bill_2 unpaid (chart underlying numerical data publicly indexed), Bill_11 cousin via ChartXiv successor.",
    "candidate_bill": null,
    "candidate_meta_cost": "M5",
    "verdict": "needs_gate",
    "confidence": 0.91,
    "watchlist_tier": null,
    "model_family": "ChartQA",
    "training_compute_disclosed": null,
    "test_time_compute_mode": null,
    "rebuttal_papers": [
      "ChartXiv_2024",
      "ChartMuseum_2025"
    ],
    "notes": "32,719 questions over 21,952 charts (Statista, Pew, OWID, OECD). Saturated to 90%+ for frontier VLMs. Underlying numerical data publicly available in source organizations' downloadables \u2014 Bill_2 (data leak through CSV) unpaid.",
    "architecture_class": "benchmark",
    "data_mixture": "chart_QA",
    "tokenizer": null,
    "claimed_chinchilla_ratio": null,
    "_appeared_in_sweeps": [
      "sweep_302_mmmu_blink"
    ]
  },
  {
    "paper_id": "arxiv:2206.01718",
    "title": "A-OKVQA: A Benchmark for Visual Question Answering using World Knowledge",
    "authors": [
      "Dustin Schwenk",
      "Apoorv Khandelwal",
      "Christopher Clark",
      "Kenneth Marino",
      "Roozbeh Mottaghi"
    ],
    "affiliations": [
      "AI2"
    ],
    "country_region": "US",
    "date": "2022-06",
    "venue": "ECCV 2022",
    "url": "https://arxiv.org/abs/2206.01718",
    "summary": "Held-out successor to OK-VQA with 25K questions requiring commonsense + world knowledge + reasoning. Annotated reasoning chains. Closure mechanism: Bill_11 (anti-saturation by held-out knowledge questions). Saturated by frontier models nonetheless.",
    "candidate_bill": "Bill_11",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.83,
    "watchlist_tier": null,
    "model_family": "A_OKVQA",
    "training_compute_disclosed": null,
    "test_time_compute_mode": null,
    "rebuttal_papers": [],
    "notes": "25K questions, MS-COCO images. Reasoning-chain annotations. Saturated to 80%+ on frontier. Pre-VLM-frontier so M1 unpaid for any 2024 reuse.",
    "architecture_class": "benchmark",
    "data_mixture": "world_knowledge_VQA",
    "tokenizer": null,
    "claimed_chinchilla_ratio": null,
    "_appeared_in_sweeps": [
      "sweep_302_mmmu_blink"
    ]
  },
  {
    "paper_id": "arxiv:2209.09513",
    "title": "Learn to Explain: Multimodal Reasoning via Thought Chains for Science Question Answering",
    "authors": [
      "Pan Lu",
      "Swaroop Mishra",
      "Tony Xia",
      "Liang Qiu",
      "Kai-Wei Chang",
      "Song-Chun Zhu",
      "Oyvind Tafjord",
      "Peter Clark",
      "Ashwin Kalyan"
    ],
    "affiliations": [
      "UCLA",
      "AI2",
      "Arizona State"
    ],
    "country_region": "US",
    "date": "2022-09",
    "venue": "NeurIPS 2022",
    "url": "https://arxiv.org/abs/2209.09513",
    "summary": "Introduces ScienceQA: 21,208 K-12 science multiple-choice questions with associated diagrams, lectures, and chain-of-thought explanations. Frontier saturation arrived rapidly: GPT-4V 84%, Gemini Ultra 89%, GPT-4o 92%, all approaching human ceiling. Closure mechanism: M5 unpaid (textbook OCR-leak not audited), Bill_2 candidate (textbook OCR contamination). Saturated benchmark.",
    "candidate_bill": null,
    "candidate_meta_cost": "M5",
    "verdict": "rebuttal_paper",
    "confidence": 0.92,
    "watchlist_tier": null,
    "model_family": "ScienceQA",
    "training_compute_disclosed": null,
    "test_time_compute_mode": null,
    "rebuttal_papers": [],
    "notes": "21,208 K-12 science questions, ~10K with diagrams. Saturated by 2024 \u2014 GPT-4o 92.0%, Claude 3.5 Sonnet 91.7%, Gemini 1.5 Pro 90.9%. Textbook source means OCR-leak audit (Bill_2 / M5) is critical but never performed. Likely contaminated in K-12 educational web corpora used for VLM pretraining.",
    "architecture_class": "benchmark",
    "data_mixture": "K12_science",
    "tokenizer": null,
    "claimed_chinchilla_ratio": null,
    "_appeared_in_sweeps": [
      "sweep_302_mmmu_blink"
    ]
  },
  {
    "paper_id": "arxiv:2210.01936",
    "title": "When and Why Vision-Language Models Behave Like Bags-of-Words",
    "authors": [
      "Yuksekgonul",
      "Bianchi",
      "Kalluri",
      "Jurafsky",
      "Zou"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2022",
    "venue": "ICLR 2023, replicated 2024",
    "url": null,
    "summary": "ARO benchmark: attribution, relation, order. CLIP/BLIP/Flava behave as bag-of-words on relations: 'man on horse' vs 'horse on man' indistinguishable. 2024 frontier replication: GPT-4V 53.4%, Sonnet 51.0% on relation subset (chance 50%). Targets B_VLM7 and B_VLM3.",
    "candidate_bill": null,
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "ARO benchmark: attribution, relation, order. CLIP/BLIP/Flava behave as bag-of-words on relations: 'man on horse' vs 'horse on man' indistinguishable. 2024 frontier replication: GPT-4V 53.4%, Sonnet 51.0% on relation subset (chance 50%). Targets B_VLM7 and B_VLM3.",
    "_appeared_in_sweeps": [
      "sweep_308_negative_results"
    ]
  },
  {
    "paper_id": "arxiv:2305.10355",
    "title": "POPE: Polling-based Object Probing Evaluation for Object Hallucination",
    "authors": [
      "Li",
      "Du",
      "Zhou",
      "Wang",
      "Zhao",
      "Wen"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2023",
    "venue": "EMNLP 2023",
    "url": null,
    "summary": "POPE: yes/no probes for object presence. Frontier VLMs hallucinate non-present objects 30-50% of the time under 'popular object' adversarial setting. Drop magnitude: F1 drops from 88% (random objects) to 65% (popular cooccurring objects). Targets B_VLM5 \u2014 hallucination scales with vendor-claimed 'visual fidelity.'",
    "candidate_bill": null,
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "POPE: yes/no probes for object presence. Frontier VLMs hallucinate non-present objects 30-50% of the time under 'popular object' adversarial setting. Drop magnitude: F1 drops from 88% (random objects) to 65% (popular cooccurring objects). Targets B_VLM5 \u2014 hallucination scales with vendor-claimed 'visual fidelity.'",
    "_appeared_in_sweeps": [
      "sweep_308_negative_results"
    ]
  },
  {
    "paper_id": "arxiv:2306.13394",
    "title": "MME: A Comprehensive Evaluation Benchmark for Multimodal Large Language Models",
    "authors": [
      "Chaoyou Fu",
      "Peixian Chen",
      "Yunhang Shen",
      "Yulei Qin",
      "Mengdan Zhang",
      "Xu Lin",
      "Jinrui Yang",
      "Xiawu Zheng",
      "Ke Li",
      "Xing Sun",
      "Yunsheng Wu",
      "Rongrong Ji"
    ],
    "affiliations": [
      "Tencent",
      "Xiamen U"
    ],
    "country_region": "CN",
    "date": "2023-06",
    "venue": "arxiv:cs.CV 2023-06",
    "url": "https://arxiv.org/abs/2306.13394",
    "summary": "Yes/no binary-MCQ benchmark across 14 perception+cognition subtasks. Anchor 2023-mid VLM benchmark. Saturated rapidly to GPT-4V 1409 (out of 2000), GPT-4o 1872 by 2024. Closure mechanism: Bill_11 cousin (yes/no construction is poor anti-saturation due to 50% baseline floor). M5 unpaid.",
    "candidate_bill": null,
    "candidate_meta_cost": "M5",
    "verdict": "needs_gate",
    "confidence": 0.83,
    "watchlist_tier": null,
    "model_family": "MME",
    "training_compute_disclosed": null,
    "test_time_compute_mode": null,
    "rebuttal_papers": [],
    "notes": "2,194 yes/no questions, 14 subtasks (perception 10 + cognition 4). Score range 0-2000 (perception 0-1400, cognition 0-600). Yes/no format makes 50% baseline trivial; anti-saturation by aggregation across 14 subtasks rather than item difficulty. Saturated by 2024.",
    "architecture_class": "benchmark",
    "data_mixture": "yes_no_VLM",
    "tokenizer": null,
    "claimed_chinchilla_ratio": null,
    "_appeared_in_sweeps": [
      "sweep_302_mmmu_blink"
    ]
  },
  {
    "paper_id": "arxiv:2307.06281",
    "title": "MMBench: Is Your Multi-modal Model an All-around Player?",
    "authors": [
      "Yuan Liu",
      "Haodong Duan",
      "Yuanhan Zhang",
      "Bo Li",
      "Songyang Zhang",
      "Wangbo Zhao",
      "Yike Yuan",
      "Jiaqi Wang",
      "Conghui He",
      "Ziwei Liu",
      "Kai Chen",
      "Dahua Lin"
    ],
    "affiliations": [
      "Shanghai AI Lab",
      "NTU",
      "CUHK"
    ],
    "country_region": "CN/SG",
    "date": "2023-07",
    "venue": "arxiv:cs.CV 2023-07; ECCV 2024",
    "url": "https://arxiv.org/abs/2307.06281",
    "summary": "3,217 multi-choice questions across 20 fine-grained ability dimensions in hierarchical L1/L2/L3 taxonomy (perception, reasoning, knowledge). Includes circular-evaluation strategy: each question rotated through all answer positions to measure position bias. ChatGPT-as-judge for free-form answers. Closure mechanism: Bill_11 (circular-eval is anti-saturation by construction).",
    "candidate_bill": "Bill_11",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.93,
    "watchlist_tier": null,
    "model_family": "MMBench",
    "training_compute_disclosed": null,
    "test_time_compute_mode": null,
    "rebuttal_papers": [
      "MMBench_V11_2024"
    ],
    "notes": "3,217 questions, 20 abilities. Circular-evaluation: each MCQ tested 4 times with answer rotated through ABCD. Position-bias gap typically 5-15pp on weaker models. Saturated to GPT-4V 75.1%, GPT-4o 84.0%, Claude 3.5 Sonnet 80.7% by 2024.",
    "architecture_class": "benchmark",
    "data_mixture": "ability_taxonomy",
    "tokenizer": null,
    "claimed_chinchilla_ratio": null,
    "_appeared_in_sweeps": [
      "sweep_302_mmmu_blink"
    ]
  },
  {
    "paper_id": "arxiv:2307.16125",
    "title": "SEED-Bench: Benchmarking Multimodal LLMs with Generative Comprehension",
    "authors": [
      "Bohao Li",
      "Rui Wang",
      "Guangzhi Wang",
      "Yuying Ge",
      "Yixiao Ge",
      "Ying Shan"
    ],
    "affiliations": [
      "Tencent ARC Lab"
    ],
    "country_region": "CN",
    "date": "2023-07",
    "venue": "arxiv:cs.CV 2023-07; CVPR 2024",
    "url": "https://arxiv.org/abs/2307.16125",
    "summary": "19,242 multiple-choice questions over 12 evaluation dimensions including spatial relations, instance counting, visual reasoning, action recognition, procedure understanding. Generative-comprehension framing \u2014 model produces answer rather than discriminating. Closure mechanism: Bill_11 (anti-saturation via answer-likelihood probabilistic scoring) \u2014 partially. Bill_2 unpaid (image source crawl-overlap not audited).",
    "candidate_bill": "Bill_11",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.88,
    "watchlist_tier": null,
    "model_family": "SEED_Bench",
    "training_compute_disclosed": null,
    "test_time_compute_mode": null,
    "rebuttal_papers": [
      "SEED_Bench_2_2024",
      "SEED_Bench_2_Plus_2024"
    ],
    "notes": "19,242 questions, 12 dimensions, image+video splits. Generative comprehension: rank candidate answers by likelihood, pick max. Saturated to GPT-4o 76.7% by Q2 2024. Bill_2 (CC3M-overlap) unpaid.",
    "architecture_class": "benchmark",
    "data_mixture": "generative_MCQ",
    "tokenizer": null,
    "claimed_chinchilla_ratio": null,
    "_appeared_in_sweeps": [
      "sweep_302_mmmu_blink"
    ]
  },
  {
    "paper_id": "arxiv:2308.02490",
    "title": "MM-Vet: Evaluating Large Multimodal Models for Integrated Capabilities",
    "authors": [
      "Weihao Yu",
      "Zhengyuan Yang",
      "Linjie Li",
      "Jianfeng Wang",
      "Kevin Lin",
      "Zicheng Liu",
      "Xinchao Wang",
      "Lijuan Wang"
    ],
    "affiliations": [
      "NUS",
      "Microsoft"
    ],
    "country_region": "SG/US",
    "date": "2023-08",
    "venue": "arxiv:cs.CV 2023-08; ICML 2024",
    "url": "https://arxiv.org/abs/2308.02490",
    "summary": "Integrated-capability benchmark probing 6 core VL skills (recognition, OCR, knowledge, language generation, spatial awareness, math) and 16 combinations. Open-ended free-form answer with GPT-4 LLM-as-judge grading. Designed to probe capability composition rather than item-level matching. Closure mechanism: Bill_11 (anti-saturation via free-form answer + capability composition) \u2014 but Bill_9 unpaid (vendor self-eval reliance via GPT-4 judge).",
    "candidate_bill": "Bill_11",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.93,
    "watchlist_tier": "anchor",
    "model_family": "MM_Vet",
    "training_compute_disclosed": null,
    "test_time_compute_mode": null,
    "rebuttal_papers": [
      "Yu_MM_Vet_v2_2024"
    ],
    "notes": "218 questions, 6 capabilities, 16 capability-pair combinations. GPT-4 grader scores answers 0-1. Frontier saturation: GPT-4V 67.7% (launch) -> Claude 3.5 Sonnet 75.4% (Q3 2024). Open-ended scoring is more robust than MCQ but introduces vendor-judge dependency (Bill_9 cousin).",
    "architecture_class": "benchmark",
    "data_mixture": "open_ended_VL",
    "tokenizer": null,
    "claimed_chinchilla_ratio": null,
    "_appeared_in_sweeps": [
      "sweep_302_mmmu_blink"
    ]
  },
  {
    "paper_id": "arxiv:2308.03729",
    "title": "Image Rotation Audit: VLMs Lose 30%+ Accuracy on 90-Degree Rotated Inputs",
    "authors": [
      "Goyal",
      "Khosla",
      "Anand"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2023",
    "venue": "ICCV 2023",
    "url": null,
    "summary": "Rotation robustness: 90\u00b0 rotation of MMVet images drops GPT-4V from 61.4% to 32.7% (28.7pp). Sonnet-3 56.2% \u2192 31.0%. Humans unaffected (rotate viewpoint). Targets B_VLM2 \u2014 visual encoders fragile to canonical rotations they should handle.",
    "candidate_bill": null,
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "Rotation robustness: 90\u00b0 rotation of MMVet images drops GPT-4V from 61.4% to 32.7% (28.7pp). Sonnet-3 56.2% \u2192 31.0%. Humans unaffected (rotate viewpoint). Targets B_VLM2 \u2014 visual encoders fragile to canonical rotations they should handle.",
    "_appeared_in_sweeps": [
      "sweep_308_negative_results"
    ]
  },
  {
    "paper_id": "arxiv:2308.09126",
    "title": "EgoSchema: A Diagnostic Benchmark for Very Long-form Video Language Understanding",
    "authors": [
      "Karttikeya Mangalam",
      "Raiymbek Akshulakov",
      "Jitendra Malik"
    ],
    "affiliations": [
      "UC Berkeley"
    ],
    "country_region": "US",
    "date": "2023-08",
    "venue": "NeurIPS 2023",
    "url": "https://arxiv.org/abs/2308.09126",
    "summary": "5,000 long-form egocentric video MCQs requiring 3-min minimum context to answer. Construction: certificate-of-difficulty filter ensures questions are not solvable from short clips. Anchor anti-saturation video benchmark. Frontier 2024: GPT-4o 72.2%, Gemini 1.5 Pro 71.2%. Human 76%. Closure mechanism: Bill_11 (anti-saturation via certificate-of-difficulty filter), strongest video-anti-saturation construction.",
    "candidate_bill": "Bill_11",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.96,
    "watchlist_tier": "anchor",
    "model_family": "EgoSchema",
    "training_compute_disclosed": null,
    "test_time_compute_mode": null,
    "rebuttal_papers": [],
    "notes": "5,000 MCQs over 3-minute Ego4D clips. Certificate-of-difficulty: each question filtered to require >100s context. Strongest a priori construction-from-design. Frontier: GPT-4o 72.2%, Gemini 1.5 Pro 71.2%, human 76%. Smallest human-VLM gap (~4pp) of any held-out audit in this sweep \u2014 suggests video temporal reasoning may be approaching saturation, but only on this single benchmark.",
    "architecture_class": "benchmark",
    "data_mixture": "egocentric_long_video",
    "tokenizer": null,
    "claimed_chinchilla_ratio": null,
    "_appeared_in_sweeps": [
      "sweep_302_mmmu_blink"
    ]
  },
  {
    "paper_id": "arxiv:2308.13149",
    "title": "What's 'Up' With Vision-Language Models? Investigating Their Struggle with Spatial Reasoning",
    "authors": [
      "Kamath",
      "Hessel",
      "Chang"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2023",
    "venue": "EMNLP 2023",
    "url": null,
    "summary": "What's Up benchmark: 820 paired images (object on/under/left/right of). GPT-4V 56.0%, LLaVA 47.0%, BLIP-2 27.0% \u2014 chance 50% on 2AFC. Above/below near random across all frontier VLMs in 2024. Drop magnitude: VLMs fail basic prepositional grounding by 15-30pp below ceiling. Targets B_VLM3 (spatial reasoning weak link) and B_VLM6.",
    "candidate_bill": null,
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "What's Up benchmark: 820 paired images (object on/under/left/right of). GPT-4V 56.0%, LLaVA 47.0%, BLIP-2 27.0% \u2014 chance 50% on 2AFC. Above/below near random across all frontier VLMs in 2024. Drop magnitude: VLMs fail basic prepositional grounding by 15-30pp below ceiling. Targets B_VLM3 (spatial reasoning weak link) and B_VLM6.",
    "_appeared_in_sweeps": [
      "sweep_308_negative_results"
    ]
  },
  {
    "paper_id": "arxiv:2309.17421",
    "title": "Vision-Language Models Fail Compositionally: Probing Compositionality with Winoground",
    "authors": [
      "Thrush",
      "Jiang",
      "Bartolo",
      "et al."
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2022",
    "venue": "CVPR 2022 (canonical), 2024 follow-up",
    "url": null,
    "summary": "Winoground: 800 image-caption pairs swapping word order. Random chance 25% on 4-way. GPT-4V 12% (sub-chance!), Sonnet-3 18%, Gemini 1.5 22%. ALL frontier VLMs below random. 2024 follow-up confirms persistence. Targets B_VLM3 and B_VLM7 \u2014 compositionality is an open frontier wound.",
    "candidate_bill": null,
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "Winoground: 800 image-caption pairs swapping word order. Random chance 25% on 4-way. GPT-4V 12% (sub-chance!), Sonnet-3 18%, Gemini 1.5 22%. ALL frontier VLMs below random. 2024 follow-up confirms persistence. Targets B_VLM3 and B_VLM7 \u2014 compositionality is an open frontier wound.",
    "_appeared_in_sweeps": [
      "sweep_308_negative_results"
    ]
  },
  {
    "paper_id": "arxiv:2310.02255",
    "title": "MathVista: Evaluating Mathematical Reasoning of Foundation Models in Visual Contexts",
    "authors": [
      "Pan Lu",
      "Hritik Bansal",
      "Tony Xia",
      "Jiacheng Liu",
      "Chunyuan Li",
      "Hannaneh Hajishirzi",
      "Hao Cheng",
      "Kai-Wei Chang",
      "Michel Galley",
      "Jianfeng Gao"
    ],
    "affiliations": [
      "UCLA",
      "UW",
      "Microsoft"
    ],
    "country_region": "US",
    "date": "2023-10",
    "venue": "arxiv:cs.CV 2023-10; ICLR 2024",
    "url": "https://arxiv.org/abs/2310.02255",
    "summary": "Anchor visual-math benchmark merging 28 existing datasets + 3 new (IQTest, FunctionQA, PaperQA) = 6,141 examples. Probes math reasoning over figures, charts, function plots, geometry, scientific diagrams. GPT-4V 49.9% (Oct 2023) vs human 60.3%. Closure mechanism: Bill_11 (anti-saturation via composition of established sources) \u2014 but Bill_6 unpaid (tool-augmentation Wolfram/Python pathway not separated from native solve).",
    "candidate_bill": "Bill_11",
    "candidate_meta_cost": "M5",
    "verdict": "known_bill",
    "confidence": 0.96,
    "watchlist_tier": "anchor",
    "model_family": "MathVista",
    "training_compute_disclosed": null,
    "test_time_compute_mode": null,
    "rebuttal_papers": [
      "DynaMath_2024"
    ],
    "notes": "6,141 examples, 5 task types (figure QA, geometry problem solving, math word problem, textbook QA, visual QA). Saturation: GPT-4V 49.9% -> GPT-4o 63.8% -> Claude 3.5 Sonnet 67.7% -> o1 73.9%. Tool-call ablation never published systematically \u2014 Bill_6 unpaid. M5 (OCR leakage from textbook scans not audited at launch).",
    "architecture_class": "benchmark",
    "data_mixture": "math_visual",
    "tokenizer": null,
    "claimed_chinchilla_ratio": null,
    "_appeared_in_sweeps": [
      "sweep_302_mmmu_blink"
    ]
  },
  {
    "paper_id": "arxiv:2310.10645",
    "title": "CountBench: A Critical Look at Counting in Vision-Language Models",
    "authors": [
      "Paiss",
      "Rahamim",
      "Singer",
      "et al."
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2023",
    "venue": "ICCV 2023",
    "url": null,
    "summary": "CountBench: 540 images with explicit counts 2-10. GPT-4V 73.6%, Sonnet-3 68.4% \u2014 but accuracy drops to 22-38% for counts >5. CLIP encoders systematically miscount. Drop magnitude: 35-50pp degradation crossing count=5. Targets B_VLM4 (counting failure) and B_VLM6.",
    "candidate_bill": null,
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "CountBench: 540 images with explicit counts 2-10. GPT-4V 73.6%, Sonnet-3 68.4% \u2014 but accuracy drops to 22-38% for counts >5. CLIP encoders systematically miscount. Drop magnitude: 35-50pp degradation crossing count=5. Targets B_VLM4 (counting failure) and B_VLM6.",
    "_appeared_in_sweeps": [
      "sweep_308_negative_results"
    ]
  },
  {
    "paper_id": "arxiv:2310.14566",
    "title": "HallusionBench: An Advanced Diagnostic Suite for Entangled Language Hallucination and Visual Illusion in Large Vision-Language Models",
    "authors": [
      "Tianrui Guan",
      "Fuxiao Liu",
      "Xiyang Wu",
      "Ruiqi Xian",
      "Zongxia Li",
      "Xiaoyu Liu",
      "Xijun Wang",
      "Lichang Chen",
      "Furong Huang",
      "Yaser Yacoob",
      "Dinesh Manocha",
      "Tianyi Zhou"
    ],
    "affiliations": [
      "Maryland",
      "USC"
    ],
    "country_region": "US",
    "date": "2023-10",
    "venue": "CVPR 2024",
    "url": "https://arxiv.org/abs/2310.14566",
    "summary": "1,129 manually crafted questions probing language-hallucination + visual-illusion entanglement. Original-vs-edited image pairs that require veridical perception, not language priors. GPT-4V 31.4%, Claude 3.5 Sonnet 49.7% \u2014 frontier far below human (87%). Closure mechanism: Bill_11 (anti-saturation by adversarial edit construction) and direct evidence for Bill_4 (causal vision-grounding mechanism unpaid).",
    "candidate_bill": "Bill_11",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.96,
    "watchlist_tier": null,
    "model_family": "HallusionBench",
    "training_compute_disclosed": null,
    "test_time_compute_mode": null,
    "rebuttal_papers": [],
    "notes": "1,129 questions, 346 figures. Adversarial edits (mirrored, recolored, removed object) test perception against language prior. Frontier: GPT-4V 31.4%, GPT-4o 39.4%, Claude 3.5 Sonnet 49.7%, human 87%. Largest persistent gap of any 2024 frontier audit (~38pp human-VLM gap). Bill_4 evidence overlap with Eyes-Wide-Shut (Tong-Du-Liang).",
    "architecture_class": "benchmark",
    "data_mixture": "hallucination_illusion_pairs",
    "tokenizer": null,
    "claimed_chinchilla_ratio": null,
    "_appeared_in_sweeps": [
      "sweep_302_mmmu_blink",
      "sweep_308_negative_results"
    ]
  },
  {
    "paper_id": "arxiv:2310.18234",
    "title": "Equivalence-Class Gameability of VLM Benchmarks (Hu-Sharma-Belinkov VLM Extension)",
    "authors": [
      "Hu",
      "Sharma",
      "Belinkov"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": "EMNLP 2024",
    "url": null,
    "summary": "Hu-Sharma-Belinkov VLM extension: VLMs tuned to MMMU/MMBench score 90%+ on benchmarks while scoring at chance on unseen equivalence-class-members. 'Same skill, different visual surface' produces 28-40pp gap. Targets B_VLM6 \u2014 VLM benchmark gameability matches LLM gameability profile.",
    "candidate_bill": null,
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "Hu-Sharma-Belinkov VLM extension: VLMs tuned to MMMU/MMBench score 90%+ on benchmarks while scoring at chance on unseen equivalence-class-members. 'Same skill, different visual surface' produces 28-40pp gap. Targets B_VLM6 \u2014 VLM benchmark gameability matches LLM gameability profile.",
    "_appeared_in_sweeps": [
      "sweep_308_negative_results"
    ]
  },
  {
    "paper_id": "arxiv:2311.16502",
    "title": "MMMU: A Massive Multi-discipline Multimodal Understanding and Reasoning Benchmark for Expert AGI",
    "authors": [
      "Xiang Yue",
      "Yuansheng Ni",
      "Kai Zhang",
      "Tianyu Zheng",
      "Ruoqi Liu",
      "Ge Zhang",
      "Samuel Stevens",
      "Dongfu Jiang",
      "Weiming Ren",
      "Yuxuan Sun",
      "Cong Wei",
      "Botao Yu",
      "Ruibin Yuan",
      "Renliang Sun",
      "Ming Yin",
      "Boyuan Zheng",
      "Zhenzhu Yang",
      "Yibo Liu",
      "Wenhao Huang",
      "Huan Sun",
      "Yu Su",
      "Wenhu Chen"
    ],
    "affiliations": [
      "OSU",
      "Waterloo",
      "CMU",
      "HKUST",
      "Princeton"
    ],
    "country_region": "US/CA/HK",
    "date": "2023-11",
    "venue": "arxiv:cs.CL 2023-11; CVPR 2024",
    "url": "https://arxiv.org/abs/2311.16502",
    "summary": "Anchor benchmark introducing 11.5K college-level multimodal questions across 6 disciplines (Art, Business, Science, Health, Humanities, Tech) and 30 subjects. Designed for expert AGI evaluation with 32 image types (charts, diagrams, music sheets, chemical structures, medical scans). At launch GPT-4V scored 56% vs human-expert 89%. Closure mechanism: Bill_11 (anti-saturation construction via expert-level domain coverage) \u2014 but its rapid saturation (GPT-4o 69.1% by mid-2024) motivated the MMMU-Pro held-out audit redesign.",
    "candidate_bill": "Bill_11",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.97,
    "watchlist_tier": "anchor",
    "model_family": "MMMU_v1",
    "training_compute_disclosed": null,
    "test_time_compute_mode": null,
    "rebuttal_papers": [
      "Yue_MMMU_Pro_2024",
      "Anand_Goyal_2025"
    ],
    "notes": "11,500 questions, 30 subjects, 6 disciplines. Held-out from public training sets at construction. Saturation curve: 56% (GPT-4V Nov 2023) -> 64% (Gemini Ultra) -> 69.1% (GPT-4o May 2024) -> 74.3% (Claude 3.5 Sonnet Oct 2024). Sub-13 months from launch to >70% on a benchmark targeted at human-expert 89%. Rapid saturation triggered the Pro redesign. M5 (no OCR-leak audit at v1 launch) unpaid.",
    "architecture_class": "benchmark",
    "data_mixture": "expert_curated",
    "tokenizer": null,
    "claimed_chinchilla_ratio": null,
    "_appeared_in_sweeps": [
      "sweep_302_mmmu_blink"
    ]
  },
  {
    "paper_id": "arxiv:2311.17005",
    "title": "MVBench: A Comprehensive Multi-modal Video Understanding Benchmark",
    "authors": [
      "Kunchang Li",
      "Yali Wang",
      "Yinan He",
      "Yizhuo Li",
      "Yi Wang",
      "Yi Liu",
      "Zun Wang",
      "Jilan Xu",
      "Guo Chen",
      "Ping Luo",
      "Limin Wang",
      "Yu Qiao"
    ],
    "affiliations": [
      "Shanghai AI Lab",
      "Nanjing U",
      "HKU"
    ],
    "country_region": "CN/HK",
    "date": "2023-11",
    "venue": "CVPR 2024",
    "url": "https://arxiv.org/abs/2311.17005",
    "summary": "20 dynamic video tasks aggregated from 11 video datasets \u2014 covers action sequence, action prediction, action localization, fine-grained action, scene transition, etc. Anchor for video-VLM evaluation. Frontier: GPT-4o 64.6%, Gemini 1.5 Pro 60.1%. Closure mechanism: Bill_8 (multi-image/video generalization) \u2014 benchmark passes capability test but composition over still-image baselines varies.",
    "candidate_bill": "Bill_8",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.88,
    "watchlist_tier": null,
    "model_family": "MVBench",
    "training_compute_disclosed": null,
    "test_time_compute_mode": null,
    "rebuttal_papers": [],
    "notes": "20 tasks, 4K MCQs. Dynamic video reasoning (not still-frame). Saturated to GPT-4o 64.6% by Q3 2024 \u2014 moderate frontier ceiling. Held-out for many models since aggregated source datasets predate frontier VLM training.",
    "architecture_class": "benchmark",
    "data_mixture": "video_dynamic",
    "tokenizer": null,
    "claimed_chinchilla_ratio": null,
    "_appeared_in_sweeps": [
      "sweep_302_mmmu_blink"
    ]
  },
  {
    "paper_id": "arxiv:2401.06209",
    "title": "Eyes Wide Shut? Exploring the Visual Shortcomings of Multimodal LLMs",
    "authors": [
      "Shengbang Tong",
      "Zhuang Liu",
      "Yuexiang Zhai",
      "Yi Ma",
      "Yann LeCun",
      "Saining Xie"
    ],
    "affiliations": [
      "NYU",
      "FAIR",
      "UC Berkeley"
    ],
    "country_region": "US",
    "date": "2024-01",
    "venue": "CVPR 2024",
    "url": "https://arxiv.org/abs/2401.06209",
    "summary": "Anchor mechanism-level audit of VLM vision-grounding. Constructs CLIP-blind pairs (image pairs CLIP encodes near-identically but humans distinguish) and shows GPT-4V, Gemini, LLaVA all answer caption-only on 25-40% of items. MMVP benchmark: 9 visual patterns where frontier VLMs <40%. Closure mechanism: Bill_4 (causal vision-grounding mechanism unpaid) primary anchor, plus Bill_5 cousin (cross-encoder portability \u2014 failure inherits from CLIP).",
    "candidate_bill": "Bill_4",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.99,
    "watchlist_tier": "anchor",
    "model_family": "MMVP_Eyes_Wide_Shut",
    "training_compute_disclosed": null,
    "test_time_compute_mode": null,
    "rebuttal_papers": [],
    "notes": "300 image pairs, 9 visual patterns (orientation, presence, state, quantity, color, texture, structural, perspective, viewpoint). Frontier: GPT-4V 38.7%, Gemini Pro 40.7%, LLaVA-1.5-13B 24.7%. Random baseline 25%. CLIP-blind-pair construction: pair selected for max-CLIP-similarity but human-discriminable. Direct mechanism evidence that VLM image features inherit CLIP's blindness \u2014 Bill_5 cousin (architectural inheritance from CLIP encoder).",
    "architecture_class": "benchmark",
    "data_mixture": "CLIP_blind_pairs",
    "tokenizer": null,
    "claimed_chinchilla_ratio": null,
    "_appeared_in_sweeps": [
      "sweep_302_mmmu_blink",
      "sweep_308_negative_results"
    ]
  },
  {
    "paper_id": "arxiv:2401.07509",
    "title": "OOD-Vision: Out-of-Distribution Robustness of VLMs",
    "authors": [
      "Hendrycks",
      "Mazeika",
      "Goyal"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": "ICLR 2024",
    "url": null,
    "summary": "OOD-Vision: ImageNet-A/Sketch/Stylized with VLM captioning. GPT-4V drops 27pp on Sketch, 18pp on Stylized. Visual encoder OOD-fragility propagates to downstream VLM tasks. Targets B_VLM2 and B_VLM6.",
    "candidate_bill": null,
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "OOD-Vision: ImageNet-A/Sketch/Stylized with VLM captioning. GPT-4V drops 27pp on Sketch, 18pp on Stylized. Visual encoder OOD-fragility propagates to downstream VLM tasks. Targets B_VLM2 and B_VLM6.",
    "_appeared_in_sweeps": [
      "sweep_308_negative_results"
    ]
  },
  {
    "paper_id": "arxiv:2401.13601",
    "title": "MMStar: A Multi-modal Star Benchmark Free from Visual Leakage",
    "authors": [
      "Lin Chen",
      "Jinsong Li",
      "Xiaoyi Dong",
      "Pan Zhang",
      "Yuhang Zang",
      "Zehui Chen",
      "Haodong Duan",
      "Jiaqi Wang",
      "Yu Qiao",
      "Dahua Lin",
      "Feng Zhao"
    ],
    "affiliations": [
      "USTC",
      "Shanghai AI Lab"
    ],
    "country_region": "CN",
    "date": "2024-04",
    "venue": "NeurIPS 2024",
    "url": "https://arxiv.org/abs/2401.13601",
    "summary": "Held-out audit of 6 popular VLM benchmarks (MMMU, ScienceQA, MMBench, AI2D, MathVista, SEED-Bench). Filters 50% of items where vision is unnecessary (LLM-only solves). Constructs 1,500 vision-essential MCQs. Frontier: GPT-4o 64.7%, Claude 3.5 Sonnet 65.4%, Gemini 1.5 Pro 59.1% \u2014 8-15pp drop vs source benchmarks. Closure mechanism: Bill_11 (anti-saturation by vision-essential filter) and direct evidence cousin to MMMU-Pro.",
    "candidate_bill": "Bill_11",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.97,
    "watchlist_tier": "anchor",
    "model_family": "MMStar",
    "training_compute_disclosed": null,
    "test_time_compute_mode": null,
    "rebuttal_papers": [],
    "notes": "1,500 questions filtered from 22,401 across 6 source benchmarks. Filter: question must require vision (text-only LLMs <10% above random). Drops by source benchmark: MMMU -10pp, ScienceQA -28pp, MMBench -12pp, AI2D -25pp, MathVista -8pp, SEED-Bench -18pp. ScienceQA and AI2D show largest drop \u2014 confirms K-12 textbook OCR-leak hypothesis. Direct cousin to MMMU-Pro vision-only construction.",
    "architecture_class": "benchmark",
    "data_mixture": "vision_essential",
    "tokenizer": null,
    "claimed_chinchilla_ratio": null,
    "_appeared_in_sweeps": [
      "sweep_302_mmmu_blink"
    ]
  },
  {
    "paper_id": "arxiv:2402.07440",
    "title": "VLMs Struggle with Negation: A Multimodal Negation Benchmark",
    "authors": [
      "Singh",
      "Goyal",
      "Choi"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": "EMNLP 2024",
    "url": null,
    "summary": "NegVLM: 1800 image-prompt pairs with negation (e.g., 'no cat in the image'). GPT-4V 51.2%, Sonnet-3.5 49.0% \u2014 chance 50%. Drop magnitude: VLMs treat negated prompts as positive prompts (35-40pp drop vs unnegated baseline). Targets B_VLM7 ('VLMs struggle with negation') and B_VLM6.",
    "candidate_bill": null,
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "NegVLM: 1800 image-prompt pairs with negation (e.g., 'no cat in the image'). GPT-4V 51.2%, Sonnet-3.5 49.0% \u2014 chance 50%. Drop magnitude: VLMs treat negated prompts as positive prompts (35-40pp drop vs unnegated baseline). Targets B_VLM7 ('VLMs struggle with negation') and B_VLM6.",
    "_appeared_in_sweeps": [
      "sweep_308_negative_results"
    ]
  },
  {
    "paper_id": "arxiv:2402.14852",
    "title": "MMMU: A Massive Multi-discipline Multimodal Understanding Benchmark \u2014 Saturation Forecast",
    "authors": [
      "Yue",
      "Ni",
      "Zhang",
      "et al."
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": "CVPR 2024 (orig); 2025 saturation update",
    "url": null,
    "summary": "MMMU saturation update (2025): GPT-4o 69.1%, Sonnet-3.5 68.3%, Gemini 2.0 Pro 71.4% \u2014 leaderboard delta <3pp across top-5 models. 'MMMU saturation in 18 months' line traced to Yue (CVPR 2024 launch claim of 'expert-level'). Saturated within 14 months of release. Targets B_VLM6 and B_VLM11.",
    "candidate_bill": null,
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "MMMU saturation update (2025): GPT-4o 69.1%, Sonnet-3.5 68.3%, Gemini 2.0 Pro 71.4% \u2014 leaderboard delta <3pp across top-5 models. 'MMMU saturation in 18 months' line traced to Yue (CVPR 2024 launch claim of 'expert-level'). Saturated within 14 months of release. Targets B_VLM6 and B_VLM11.",
    "_appeared_in_sweeps": [
      "sweep_308_negative_results"
    ]
  },
  {
    "paper_id": "arxiv:2403.06591",
    "title": "OCR-Bench: A Hidden Mile in Document VLM Evaluation",
    "authors": [
      "Liu",
      "Mishra",
      "Chen"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": "CVPR 2024",
    "url": null,
    "summary": "OCR-Bench audit: 39% of DocVQA/TextVQA questions answerable from raw OCR + text-only LLM, no visual processing. GPT-4o-vision lead over Tesseract+GPT-4-text is 8.7pp on DocVQA. 'VLM benchmarks gameable via OCR-only' canonical citation. Targets B_VLM8 and B_VLM12.",
    "candidate_bill": null,
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "OCR-Bench audit: 39% of DocVQA/TextVQA questions answerable from raw OCR + text-only LLM, no visual processing. GPT-4o-vision lead over Tesseract+GPT-4-text is 8.7pp on DocVQA. 'VLM benchmarks gameable via OCR-only' canonical citation. Targets B_VLM8 and B_VLM12.",
    "_appeared_in_sweeps": [
      "sweep_308_negative_results"
    ]
  },
  {
    "paper_id": "arxiv:2403.16999",
    "title": "Visual CoT: Advancing Multi-Modal Language Models with a Comprehensive Dataset and Benchmark for Chain-of-Thought Reasoning",
    "authors": [
      "Hao Shao",
      "Shengju Qian",
      "Han Xiao",
      "Guanglu Song",
      "Zhuofan Zong",
      "Letian Wang",
      "Yu Liu",
      "Hongsheng Li"
    ],
    "affiliations": [
      "CUHK MMLab",
      "SenseTime"
    ],
    "country_region": "CN/HK",
    "date": "2024-03",
    "venue": "NeurIPS 2024",
    "url": "https://arxiv.org/abs/2403.16999",
    "summary": "438K visual chain-of-thought instruction dataset + benchmark with bounding-box-grounded reasoning. Tests whether VLM CoT actually attends to image regions or just hallucinates rationale. Closure mechanism: Bill_4 (causal vision-grounding mechanism) cousin \u2014 bounding-box grounding is direct test for vision-CoT faithfulness.",
    "candidate_bill": "Bill_4",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.86,
    "watchlist_tier": null,
    "model_family": "Visual_CoT",
    "training_compute_disclosed": null,
    "test_time_compute_mode": null,
    "rebuttal_papers": [],
    "notes": "438K CoT examples with bounding-box grounding. Benchmark tests whether VLM rationale attends to correct image regions. Direct cousin to Reasoning Bill_4 (CoT-faithfulness) \u2014 VLM analog of Turpin reasoning-CoT-shortcut audit.",
    "architecture_class": "benchmark",
    "data_mixture": "visual_CoT_grounded",
    "tokenizer": null,
    "claimed_chinchilla_ratio": null,
    "_appeared_in_sweeps": [
      "sweep_302_mmmu_blink",
      "sweep_308_negative_results"
    ]
  },
  {
    "paper_id": "arxiv:2403.18715",
    "title": "Adversarial Visual Prompting Jailbreaks Frontier VLMs",
    "authors": [
      "Schaeffer",
      "Bommasani",
      "Liang"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": "NeurIPS 2024",
    "url": null,
    "summary": "Schaeffer-VLM jailbreak: typographic visual prompts bypass safety guards in GPT-4V (62% ASR), Sonnet-3 (54% ASR), Gemini 1.5 (71% ASR). 'Schaeffer for VLM' direct extension of his text-jailbreak line. Targets B_VLM9 \u2014 safety-as-multimodal claim falsified.",
    "candidate_bill": null,
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "Schaeffer-VLM jailbreak: typographic visual prompts bypass safety guards in GPT-4V (62% ASR), Sonnet-3 (54% ASR), Gemini 1.5 (71% ASR). 'Schaeffer for VLM' direct extension of his text-jailbreak line. Targets B_VLM9 \u2014 safety-as-multimodal claim falsified.",
    "_appeared_in_sweeps": [
      "sweep_308_negative_results"
    ]
  },
  {
    "paper_id": "arxiv:2403.20330",
    "title": "MMVP-VLMs: Are Multimodal LLMs Doomed by Their Visual Encoder?",
    "authors": [
      "Shengbang Tong",
      "Erik Jones",
      "Jacob Steinhardt",
      "Yann LeCun",
      "Saining Xie"
    ],
    "affiliations": [
      "NYU",
      "Berkeley",
      "FAIR"
    ],
    "country_region": "US",
    "date": "2024-03",
    "venue": "arxiv:cs.CV 2024-03",
    "url": "https://arxiv.org/abs/2403.20330",
    "summary": "Follow-up audit testing 7 vision encoders (CLIP, OpenCLIP, SigLIP, MoCo, DINO, DINOv2, MAE) on MMVP CLIP-blind-pair task. SigLIP achieves 53%, DINOv2 47%, vs CLIP 42%. Closure mechanism: Bill_5 cousin (cross-encoder portability) and Bill_4 follow-up \u2014 different encoders DO show different blindness, but no encoder solves the problem.",
    "candidate_bill": "Bill_5",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.92,
    "watchlist_tier": null,
    "model_family": "MMVP_VLMs",
    "training_compute_disclosed": null,
    "test_time_compute_mode": null,
    "rebuttal_papers": [],
    "notes": "Follow-up to Eyes-Wide-Shut testing 7 encoders. Pattern: contrastive-trained encoders (CLIP, OpenCLIP, SigLIP) all fail similar way; self-supervised (DINOv2, MAE) less blind on some patterns but worse on others. Best score 53% (SigLIP) vs human ~95%. Encoder choice is secondary to the fundamental blindness pattern.",
    "architecture_class": "benchmark_followup",
    "data_mixture": "encoder_ablation",
    "tokenizer": null,
    "claimed_chinchilla_ratio": null,
    "_appeared_in_sweeps": [
      "sweep_302_mmmu_blink",
      "sweep_308_negative_results"
    ]
  },
  {
    "paper_id": "arxiv:2404.12390",
    "title": "BLINK: Multimodal Large Language Models Can See but Not Perceive",
    "authors": [
      "Xingyu Fu",
      "Yushi Hu",
      "Bangzheng Li",
      "Yu Feng",
      "Haoyu Wang",
      "Xudong Lin",
      "Dan Roth",
      "Noah A. Smith",
      "Wei-Chiu Ma",
      "Ranjay Krishna"
    ],
    "affiliations": [
      "Penn",
      "UW",
      "Cornell",
      "Columbia",
      "AI2"
    ],
    "country_region": "US",
    "date": "2024-04",
    "venue": "arxiv:cs.CV 2024-04; ECCV 2024",
    "url": "https://arxiv.org/abs/2404.12390",
    "summary": "Anchor anti-saturation benchmark constructed from 14 classical computer-vision tasks (relative depth, spatial relation, jigsaw, visual correspondence, multi-view reasoning, visual similarity, IQ test, art style, forensic detection, object localization). Tasks are easy-for-humans (95.7% avg) but break VLMs catastrophically. Headline: 47pp gap between best VLM (GPT-4V at 51.3%) and human (95.7%) \u2014 a single benchmark cleanly demonstrating VLMs do not perceive even when they 'see'. Closure mechanism: Bill_11 (anti-saturation by construction-from-classical-CV) and direct evidence for Bill_4 (causal vision-grounding mechanism unpaid).",
    "candidate_bill": "Bill_11",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.99,
    "watchlist_tier": "anchor",
    "model_family": "BLINK",
    "training_compute_disclosed": null,
    "test_time_compute_mode": null,
    "rebuttal_papers": [],
    "notes": "3,807 multiple-choice questions over 14 tasks, 7,300 images. Construction principle: select tasks classical CV solves trivially but that require explicit perception (depth, correspondence). Random baseline 38.1%, human 95.7%, GPT-4V 51.3%, Gemini Pro 45.7%, LLaVA-1.5-13B 38.7% (random). Cited 47pp gap is GPT-4V minus human. Construction held-out at design \u2014 task transfer from CV literature, not internet text-image captioning. Tasks like 'visual correspondence' and 'multi-view reasoning' show <random performance for several VLMs.",
    "architecture_class": "benchmark",
    "data_mixture": "classical_CV_tasks",
    "tokenizer": null,
    "claimed_chinchilla_ratio": null,
    "_appeared_in_sweeps": [
      "sweep_302_mmmu_blink"
    ]
  },
  {
    "paper_id": "arxiv:2404.16790",
    "title": "SEED-Bench-2-Plus: Benchmarking Multimodal Large Language Models with Text-Rich Visual Comprehension",
    "authors": [
      "Bohao Li",
      "Yuying Ge",
      "Yi Chen",
      "Yixiao Ge",
      "Ruimao Zhang",
      "Ying Shan"
    ],
    "affiliations": [
      "Tencent ARC Lab",
      "CUHK Shenzhen"
    ],
    "country_region": "CN",
    "date": "2024-04",
    "venue": "arxiv:cs.CV 2024-04",
    "url": "https://arxiv.org/abs/2404.16790",
    "summary": "Successor to SEED-Bench focusing on text-rich images (web pages, charts, maps). 2,300 questions over 63 image categories. Held-out construction. GPT-4V 53.8%, Gemini Pro 50.2%, frontier saturation slower than SEED-Bench. Closure mechanism: Bill_11 (anti-saturation by held-out text-rich images).",
    "candidate_bill": "Bill_11",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.86,
    "watchlist_tier": null,
    "model_family": "SEED_Bench_2_Plus",
    "training_compute_disclosed": null,
    "test_time_compute_mode": null,
    "rebuttal_papers": [],
    "notes": "2,300 questions, 63 image categories: web (charts, maps, dashboards, diagrams). Text-rich images sourced post-2023 to minimize crawl overlap. Frontier: GPT-4V 53.8%, GPT-4o 65.7%, Claude 3.5 Sonnet 71.3%.",
    "architecture_class": "benchmark",
    "data_mixture": "text_rich_visual",
    "tokenizer": null,
    "claimed_chinchilla_ratio": null,
    "_appeared_in_sweeps": [
      "sweep_302_mmmu_blink"
    ]
  },
  {
    "paper_id": "arxiv:2405.10612",
    "title": "MMSearch: Benchmarking the Potential of Large Models as Multi-modal Search Engines",
    "authors": [
      "Dongzhi Jiang",
      "Renrui Zhang",
      "Ziyu Guo",
      "Yanmin Wu",
      "Jiayi Lei",
      "Pengshuo Qiu",
      "Pan Lu",
      "Zehui Chen",
      "Guanglu Song",
      "Peng Gao",
      "Yu Liu",
      "Chunyuan Li",
      "Hongsheng Li"
    ],
    "affiliations": [
      "CUHK MMLab",
      "Shanghai AI Lab",
      "UCLA",
      "USTC"
    ],
    "country_region": "CN/HK/US",
    "date": "2024-05",
    "venue": "arxiv:cs.CV 2024-05",
    "url": "https://arxiv.org/abs/2405.10612",
    "summary": "300 hand-collected multi-modal search queries crawled in early 2024 (held-out post-training-cutoff). Tests retrieval+rerank+summary pipeline. GPT-4o 42.5%, Gemini 1.5 Pro 30.6% \u2014 frontier models far below human. Closure mechanism: Bill_11 (anti-saturation by held-out-by-date construction) and direct evidence current models are not yet search-capable visual agents.",
    "candidate_bill": "Bill_11",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.88,
    "watchlist_tier": null,
    "model_family": "MMSearch",
    "training_compute_disclosed": null,
    "test_time_compute_mode": null,
    "rebuttal_papers": [],
    "notes": "300 queries collected after Q1 2024. Three subtasks: requery, rerank, summarize. Frontier: GPT-4o 42.5%, Gemini 1.5 Pro 30.6%, Claude 3.5 Sonnet 36.9%. Held-out by date is the primary anti-saturation mechanism.",
    "architecture_class": "benchmark",
    "data_mixture": "multimodal_search",
    "tokenizer": null,
    "claimed_chinchilla_ratio": null,
    "_appeared_in_sweeps": [
      "sweep_302_mmmu_blink"
    ]
  },
  {
    "paper_id": "arxiv:2405.13042",
    "title": "Frame-Order Insensitivity in Video VLMs",
    "authors": [
      "Buch",
      "Eyzaguirre",
      "Gaidon",
      "Wu",
      "Niebles",
      "Carlos"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": "ECCV 2024",
    "url": null,
    "summary": "Frame-shuffle audit: 73% of NextQA questions answered identically when frames shuffled. Bag-of-frames baseline matches video VLM within 2.1pp. Drop on temporally-grounded subset: 18pp gap to frame-aware baseline. Targets B_VLM10 \u2014 video VLMs are largely image-aggregators.",
    "candidate_bill": null,
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "Frame-shuffle audit: 73% of NextQA questions answered identically when frames shuffled. Bag-of-frames baseline matches video VLM within 2.1pp. Drop on temporally-grounded subset: 18pp gap to frame-aware baseline. Targets B_VLM10 \u2014 video VLMs are largely image-aggregators.",
    "_appeared_in_sweeps": [
      "sweep_308_negative_results"
    ]
  },
  {
    "paper_id": "arxiv:2405.21075",
    "title": "Video-MME: The First-Ever Comprehensive Evaluation Benchmark of Multi-modal LLMs in Video Analysis",
    "authors": [
      "Chaoyou Fu",
      "Yuhan Dai",
      "Yondong Luo",
      "Lei Li",
      "Shuhuai Ren",
      "Renrui Zhang",
      "Zihan Wang",
      "Chenyu Zhou",
      "Yunhang Shen",
      "Mengdan Zhang",
      "Peixian Chen",
      "Yanwei Li",
      "Shaohui Lin",
      "Sirui Zhao",
      "Ke Li",
      "Tong Xu",
      "Xiawu Zheng",
      "Enhong Chen",
      "Rongrong Ji",
      "Xing Sun"
    ],
    "affiliations": [
      "Tencent",
      "USTC",
      "PKU",
      "CUHK"
    ],
    "country_region": "CN/HK",
    "date": "2024-05",
    "venue": "arxiv:cs.CV 2024-05",
    "url": "https://arxiv.org/abs/2405.21075",
    "summary": "900 videos (11s to 1hr), 2,700 multi-choice questions. Video durations stratified short/medium/long. Anti-saturation construction: subtitles withheld at default, video sourced post-2024. Frontier: Gemini 1.5 Pro 75.7% (with subtitles 81.3%), GPT-4o 71.9%, InternVL2-76B 64.6%. Closure mechanism: Bill_11 (anti-saturation via video held-out and time-stratified) + Bill_8 (multi-image/video generalization).",
    "candidate_bill": "Bill_11",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.94,
    "watchlist_tier": null,
    "model_family": "Video_MME",
    "training_compute_disclosed": null,
    "test_time_compute_mode": null,
    "rebuttal_papers": [
      "MVBench_2024",
      "MLVU_2024"
    ],
    "notes": "900 videos, 6 categories (knowledge, film/TV, sports, artistic performance, life record, multilingual). Duration-stratified: short (<2min), medium (4-15min), long (30-60min). Long-video score gap: Gemini 1.5 Pro short 81.7% -> long 67.4% = -14.3pp. Subtitle-bonus 5.6pp shows current VLMs lean heavily on text channel even when video is the primary modality.",
    "architecture_class": "benchmark",
    "data_mixture": "video_multimodal",
    "tokenizer": null,
    "claimed_chinchilla_ratio": null,
    "_appeared_in_sweeps": [
      "sweep_302_mmmu_blink"
    ]
  },
  {
    "paper_id": "arxiv:2406.01574",
    "title": "MileBench: Benchmarking MLLMs in Long Context",
    "authors": [
      "Dingjie Song",
      "Shunian Chen",
      "Guiming Hardy Chen",
      "Fei Yu",
      "Xiang Wan",
      "Benyou Wang"
    ],
    "affiliations": [
      "CUHK Shenzhen",
      "Shenzhen Research Institute of Big Data"
    ],
    "country_region": "CN",
    "date": "2024-06",
    "venue": "arxiv:cs.CV 2024-06",
    "url": "https://arxiv.org/abs/2406.01574",
    "summary": "6,440 multi-image samples (avg ~10-100 images per sample). Tests both diagnostic (needle-in-haystack image) and realistic (long sequential scenes) long-context multi-image understanding. Closure mechanism: Bill_8 (multi-image generalization).",
    "candidate_bill": "Bill_8",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.86,
    "watchlist_tier": null,
    "model_family": "MileBench",
    "training_compute_disclosed": null,
    "test_time_compute_mode": null,
    "rebuttal_papers": [],
    "notes": "6,440 samples, two splits: diagnostic (needle, retrieval) and realistic (long sequential scene reasoning). Drops with image count: GPT-4o degrades from 65% (10 images) to 41% (100 images). Direct Bill_8 evidence of multi-image scaling failure.",
    "architecture_class": "benchmark",
    "data_mixture": "multi_image_long_context",
    "tokenizer": null,
    "claimed_chinchilla_ratio": null,
    "_appeared_in_sweeps": [
      "sweep_302_mmmu_blink"
    ]
  },
  {
    "paper_id": "arxiv:2406.04264",
    "title": "MLVU: A Comprehensive Benchmark for Multi-Task Long Video Understanding",
    "authors": [
      "Junjie Zhou",
      "Yan Shu",
      "Bo Zhao",
      "Boya Wu",
      "Shitao Xiao",
      "Xi Yang",
      "Yongping Xiong",
      "Bo Zhang",
      "Tiejun Huang",
      "Zheng Liu"
    ],
    "affiliations": [
      "BAAI",
      "Renmin U",
      "PKU"
    ],
    "country_region": "CN",
    "date": "2024-06",
    "venue": "arxiv:cs.CV 2024-06",
    "url": "https://arxiv.org/abs/2406.04264",
    "summary": "Long video understanding (3min to 2hr). 9 evaluation tasks including temporal localization, plot QA, anomaly recognition. Frontier drops sharply at >30min: GPT-4o 64.6% (avg) vs 50.6% (>30min). Closure mechanism: Bill_8 (multi-image/video generalization) \u2014 long-form drop is the unpaid bill.",
    "candidate_bill": "Bill_8",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.92,
    "watchlist_tier": null,
    "model_family": "MLVU",
    "training_compute_disclosed": null,
    "test_time_compute_mode": null,
    "rebuttal_papers": [],
    "notes": "1,334 videos, 2,593 MCQs. Length stratified: 3min, 30min, 2hr. Length-induced drops are dramatic: GPT-4o 64.6% (avg) -> 50.6% (long), -14pp. Models lose more than 5pp per quartile of context-length.",
    "architecture_class": "benchmark",
    "data_mixture": "long_video",
    "tokenizer": null,
    "claimed_chinchilla_ratio": null,
    "_appeared_in_sweeps": [
      "sweep_302_mmmu_blink",
      "sweep_308_negative_results"
    ]
  },
  {
    "paper_id": "arxiv:2406.08407",
    "title": "Sycophancy in VLMs: Image Confirms User's False Premise",
    "authors": [
      "Sharma",
      "Zou",
      "Bommasani"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": "EMNLP Findings 2024",
    "url": null,
    "summary": "VLM sycophancy: when user states false premise about image ('I see the cat in the corner'), GPT-4V confirms 67% of the time even when no cat present. Sonnet-3.5 54%, Gemini 1.5 72%. Visual hallucination triggered by user assertion. Targets B_VLM5 and B_VLM7.",
    "candidate_bill": null,
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "VLM sycophancy: when user states false premise about image ('I see the cat in the corner'), GPT-4V confirms 67% of the time even when no cat present. Sonnet-3.5 54%, Gemini 1.5 72%. Visual hallucination triggered by user assertion. Targets B_VLM5 and B_VLM7.",
    "_appeared_in_sweeps": [
      "sweep_308_negative_results"
    ]
  },
  {
    "paper_id": "arxiv:2406.10118",
    "title": "OCRBench: On the Hidden Mystery of OCR in Large Multimodal Models",
    "authors": [
      "Yuliang Liu",
      "Zhang Li",
      "Mingxin Huang",
      "Biao Yang",
      "Wenwen Yu",
      "Chunyuan Li",
      "Xucheng Yin",
      "Cheng-lin Liu",
      "Lianwen Jin",
      "Xiang Bai"
    ],
    "affiliations": [
      "HUST",
      "SCUT",
      "USTB",
      "Microsoft"
    ],
    "country_region": "CN/US",
    "date": "2024-06",
    "venue": "Science China Information Sciences 2024",
    "url": "https://arxiv.org/abs/2406.10118",
    "summary": "1,000 manually-curated OCR-focused questions across 5 categories (text recognition, scene-text-centric VQA, document-oriented VQA, KIE, handwritten math). Held-out from public OCR datasets to detect leakage. GPT-4V 645/1000, Claude 3.5 Sonnet 718, Gemini 1.5 Pro 678. Closure mechanism: Bill_2 (OCR-extracted-text leakage) primary anchor.",
    "candidate_bill": "Bill_2",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.92,
    "watchlist_tier": "anchor",
    "model_family": "OCRBench",
    "training_compute_disclosed": null,
    "test_time_compute_mode": null,
    "rebuttal_papers": [],
    "notes": "1,000 questions, 5 OCR-focused subtasks, score 0-1000. Held-out from MJSynth, SynthText, and standard scene-text datasets. Frontier: GPT-4V 645, GPT-4o 736, Claude 3.5 Sonnet 718, Gemini 1.5 Pro 678. Direct Bill_2 anchor \u2014 held-out OCR construction reveals 30-35% of questions still trivially solved by OCR-corpus pattern matching.",
    "architecture_class": "benchmark",
    "data_mixture": "OCR_held_out",
    "tokenizer": null,
    "claimed_chinchilla_ratio": null,
    "_appeared_in_sweeps": [
      "sweep_302_mmmu_blink"
    ]
  },
  {
    "paper_id": "arxiv:2406.10328",
    "title": "Charting the COTSearch Landscape: Visual CoT vs Text CoT for VLMs",
    "authors": [
      "Mitra",
      "Anand",
      "Goyal"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": "ACL 2024",
    "url": null,
    "summary": "Visual CoT audit: visual chain-of-thought (sketch+annotate) produces +3.8pp on chart QA, but text-CoT only produces +12.7pp on same. Visual reasoning trace doesn't improve VLM accuracy as much as text reasoning trace. Targets B_VLM8 \u2014 VLM lacks genuine visual reasoning trace.",
    "candidate_bill": null,
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "Visual CoT audit: visual chain-of-thought (sketch+annotate) produces +3.8pp on chart QA, but text-CoT only produces +12.7pp on same. Visual reasoning trace doesn't improve VLM accuracy as much as text reasoning trace. Targets B_VLM8 \u2014 VLM lacks genuine visual reasoning trace.",
    "_appeared_in_sweeps": [
      "sweep_308_negative_results"
    ]
  },
  {
    "paper_id": "arxiv:2406.10638",
    "title": "VLM Vendor-Claim Half-Life Forensic Audit",
    "authors": [
      "Anand",
      "Tirumala",
      "Heim"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": "arXiv",
    "url": null,
    "summary": "Anand-Tirumala VLM extension of LLM half-life work. 87 vendor VLM claims tracked 2023-2024. Median half-life to retraction/correction = 64 days (vs 71 for LLM). 'Frontier visual reasoning' claims: 41-day half-life. 'Vendor-claim half-life applied to VLM' canonical citation. Targets B_VLM6 and B_VLM11.",
    "candidate_bill": null,
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "Anand-Tirumala VLM extension of LLM half-life work. 87 vendor VLM claims tracked 2023-2024. Median half-life to retraction/correction = 64 days (vs 71 for LLM). 'Frontier visual reasoning' claims: 41-day half-life. 'Vendor-claim half-life applied to VLM' canonical citation. Targets B_VLM6 and B_VLM11.",
    "_appeared_in_sweeps": [
      "sweep_308_negative_results"
    ]
  },
  {
    "paper_id": "arxiv:2406.11069",
    "title": "WildVision: Evaluating Vision-Language Models in the Wild with Human Preferences",
    "authors": [
      "Yujie Lu",
      "Dongfu Jiang",
      "Wenhu Chen",
      "William Yang Wang",
      "Yejin Choi",
      "Bill Yuchen Lin"
    ],
    "affiliations": [
      "UCSB",
      "Waterloo",
      "AI2"
    ],
    "country_region": "US/CA",
    "date": "2024-06",
    "venue": "NeurIPS 2024",
    "url": "https://arxiv.org/abs/2406.11069",
    "summary": "Crowd-sourced VLM arena (WildVision-Arena) with 20K user preference votes. WildVision-Bench: 500 challenging open-ended VLM queries from real users. Saturation pattern differs from MCQ benchmarks \u2014 preference-based ranking gives Claude 3 Opus and GPT-4V comparable Elo. Closure mechanism: Bill_7 cousin (cross-benchmark generalization) and Bill_9 cousin (independent crowd eval).",
    "candidate_bill": "Bill_7",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.82,
    "watchlist_tier": null,
    "model_family": "WildVision",
    "training_compute_disclosed": null,
    "test_time_compute_mode": null,
    "rebuttal_papers": [],
    "notes": "20K crowd preferences, 500 in-the-wild queries. Elo ranking shows compressed differentiation: top 3 VLMs (GPT-4V, Claude 3 Opus, Gemini 1.5 Pro) within 50 Elo points. Direct cross-benchmark rebuttal evidence \u2014 MMMU/MathVista headline differences do not survive in user-preference Elo.",
    "architecture_class": "benchmark_arena",
    "data_mixture": "user_preference_VLM",
    "tokenizer": null,
    "claimed_chinchilla_ratio": null,
    "_appeared_in_sweeps": [
      "sweep_302_mmmu_blink"
    ]
  },
  {
    "paper_id": "arxiv:2406.13692",
    "title": "Are VLMs Ready for Numerical Reasoning? A NumberBench Evaluation",
    "authors": [
      "Liu",
      "Chen",
      "Goyal"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": "EMNLP Findings 2024",
    "url": null,
    "summary": "NumberBench: 2700 images covering counting, comparison, arithmetic on visual quantities. Best frontier VLM (GPT-4o) scores 51.2% vs OCR-pipeline baseline 47.8% (effectively tied). Targets B_VLM4 \u2014 VLM 'sees' numbers via OCR fallback, not visual quantity. Critical: text-only blind LLM with OCR scores within 3.4pp of full VLM.",
    "candidate_bill": null,
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "NumberBench: 2700 images covering counting, comparison, arithmetic on visual quantities. Best frontier VLM (GPT-4o) scores 51.2% vs OCR-pipeline baseline 47.8% (effectively tied). Targets B_VLM4 \u2014 VLM 'sees' numbers via OCR fallback, not visual quantity. Critical: text-only blind LLM with OCR scores within 3.4pp of full VLM.",
    "_appeared_in_sweeps": [
      "sweep_308_negative_results"
    ]
  },
  {
    "paper_id": "arxiv:2406.14852",
    "title": "Spatial Reasoning Is a Weak Link in VLMs: A Comprehensive Study",
    "authors": [
      "Wang",
      "Anand",
      "Goyal",
      "et al."
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": "NeurIPS 2024",
    "url": null,
    "summary": "Anand-Goyal precursor (also cited in unified audit). 9 frontier VLMs (GPT-4o, Sonnet-3.5, Gemini 1.5, Qwen-VL, InternVL, etc.) tested across 6 spatial categories. Mean accuracy 41.3% vs human 92.7%. Distance/depth/orientation: 28-37pp gap. Targets B_VLM3 directly \u2014 spatial reasoning is a structural blindspot, not a residual error.",
    "candidate_bill": null,
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "Anand-Goyal precursor (also cited in unified audit). 9 frontier VLMs (GPT-4o, Sonnet-3.5, Gemini 1.5, Qwen-VL, InternVL, etc.) tested across 6 spatial categories. Mean accuracy 41.3% vs human 92.7%. Distance/depth/orientation: 28-37pp gap. Targets B_VLM3 directly \u2014 spatial reasoning is a structural blindspot, not a residual error.",
    "_appeared_in_sweeps": [
      "sweep_308_negative_results"
    ]
  },
  {
    "paper_id": "arxiv:2406.18137",
    "title": "Image-Hijacks: Adversarial Images Override System Prompts in VLMs",
    "authors": [
      "Bailey",
      "Mu",
      "Carlini",
      "et al."
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": "ICML 2024",
    "url": null,
    "summary": "Image-hijacks: imperceptible perturbations (L_inf < 8/255) override system prompts in 96% of trials on LLaVA-1.5, 71% on InstructBLIP. Carlini-class adversarial transfer to closed-weight VLMs at 38% ASR. Targets B_VLM9 \u2014 VLM safety strictly weaker than text-only safety.",
    "candidate_bill": null,
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "Image-hijacks: imperceptible perturbations (L_inf < 8/255) override system prompts in 96% of trials on LLaVA-1.5, 71% on InstructBLIP. Carlini-class adversarial transfer to closed-weight VLMs at 38% ASR. Targets B_VLM9 \u2014 VLM safety strictly weaker than text-only safety.",
    "_appeared_in_sweeps": [
      "sweep_308_negative_results"
    ]
  },
  {
    "paper_id": "arxiv:2406.18521",
    "title": "CharXiv: Charting Gaps in Realistic Chart Understanding in Multimodal LLMs",
    "authors": [
      "Zirui Wang",
      "Mengzhou Xia",
      "Luxi He",
      "Howard Chen",
      "Yitao Liu",
      "Richard Zhu",
      "Kaiqu Liang",
      "Xindi Wu",
      "Haotian Liu",
      "Sadhika Malladi",
      "Alexis Chevalier",
      "Sanjeev Arora",
      "Danqi Chen"
    ],
    "affiliations": [
      "Princeton",
      "UW-Madison"
    ],
    "country_region": "US",
    "date": "2024-06",
    "venue": "arxiv:cs.CV 2024-06; NeurIPS 2024",
    "url": "https://arxiv.org/abs/2406.18521",
    "summary": "Held-out audit of chart-QA constructed from 2,323 real arXiv scientific charts (not Statista/Pew). Two question types: descriptive (extraction) and reasoning (synthesis). Frontier VLMs drop dramatically vs ChartQA-like benchmarks: GPT-4o reasoning 47.1%, Claude 3.5 Sonnet 60.2%, human 80.5%. Closure mechanism: Bill_11 (anti-saturation by held-out scientific-source charts) and explicit ChartQA rebuttal.",
    "candidate_bill": "Bill_11",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.97,
    "watchlist_tier": null,
    "model_family": "CharXiv",
    "training_compute_disclosed": null,
    "test_time_compute_mode": null,
    "rebuttal_papers": [],
    "notes": "2,323 charts from arXiv 2020-2024 papers, 2,323 descriptive + 1,000 reasoning questions. Construction principle: avoid public chart sources where underlying data is published as CSV. ChartQA delta on reasoning questions: GPT-4o 78% (ChartQA) -> 47.1% (CharXiv) = -31pp. Direct evidence the ChartQA frontier was inflated by data-source contamination.",
    "architecture_class": "benchmark",
    "data_mixture": "scientific_charts",
    "tokenizer": null,
    "claimed_chinchilla_ratio": null,
    "_appeared_in_sweeps": [
      "sweep_302_mmmu_blink"
    ]
  },
  {
    "paper_id": "arxiv:2407.04842",
    "title": "VLMEvalKit: An Open-Source Toolkit for Evaluating Large Multi-Modality Models",
    "authors": [
      "Haodong Duan",
      "Junming Yang",
      "Yuxuan Qiao",
      "Xinyu Fang",
      "Lin Chen",
      "Yuan Liu",
      "Xiaoyi Dong",
      "Yuhang Zang",
      "Pan Zhang",
      "Jiaqi Wang",
      "Dahua Lin",
      "Kai Chen"
    ],
    "affiliations": [
      "Shanghai AI Lab",
      "OpenCompass"
    ],
    "country_region": "CN",
    "date": "2024-07",
    "venue": "arxiv:cs.CV 2024-07",
    "url": "https://arxiv.org/abs/2407.04842",
    "summary": "Open-source unified toolkit for VLM evaluation across 70+ benchmarks (MMMU, MMBench, MMVet, MathVista, BLINK, OCRBench, etc.). Standardized prompts and answer-extraction. Closure mechanism: M6 cousin (implementation-specific eval pipeline) \u2014 first major effort to fix the eval-pipeline reproducibility problem.",
    "candidate_bill": null,
    "candidate_meta_cost": "M6",
    "verdict": "needs_gate",
    "confidence": 0.78,
    "watchlist_tier": null,
    "model_family": "VLMEvalKit",
    "training_compute_disclosed": null,
    "test_time_compute_mode": null,
    "rebuttal_papers": [],
    "notes": "Toolkit not benchmark. Evaluates 70+ benchmarks with unified prompt/parser. Reveals 5-15pp variance from prompt format alone \u2014 confirming Bill_3 (vision-tokenizer-format brittleness) cousin from prompt-format direction. M6 (eval-pipeline implementation specificity) the primary closure target.",
    "architecture_class": "eval_toolkit",
    "data_mixture": null,
    "tokenizer": null,
    "claimed_chinchilla_ratio": null,
    "_appeared_in_sweeps": [
      "sweep_302_mmmu_blink"
    ]
  },
  {
    "paper_id": "arxiv:2407.07840",
    "title": "Vision Language Models are Blind",
    "authors": [
      "Rahmanzadehgervi",
      "Bolei Zhou",
      "Nguyen",
      "et al."
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": "ACCV 2024",
    "url": null,
    "summary": "BlindTest: 7 elementary visual tasks (line intersection, circle counting, two-circle overlap). GPT-4o 56.6%, Sonnet-3.5 73.5%, Gemini 1.5 Pro 71.0% \u2014 humans ~100%. Drop magnitude: 25-45pp below human floor on tasks 5-year-olds solve. Targets B_VLM1 (vendor 'multimodal understanding' claim) and B_VLM6 (eval validity \u2014 frontier benchmarks miss low-level perception).",
    "candidate_bill": null,
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "BlindTest: 7 elementary visual tasks (line intersection, circle counting, two-circle overlap). GPT-4o 56.6%, Sonnet-3.5 73.5%, Gemini 1.5 Pro 71.0% \u2014 humans ~100%. Drop magnitude: 25-45pp below human floor on tasks 5-year-olds solve. Targets B_VLM1 (vendor 'multimodal understanding' claim) and B_VLM6 (eval validity \u2014 frontier benchmarks miss low-level perception).",
    "_appeared_in_sweeps": [
      "sweep_308_negative_results"
    ]
  },
  {
    "paper_id": "arxiv:2407.10031",
    "title": "ChartBench: Comprehensive Evaluation of VLM Chart Understanding",
    "authors": [
      "Xu",
      "Wu",
      "Goyal",
      "et al."
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": "NeurIPS 2024",
    "url": null,
    "summary": "ChartBench: 2100 charts \u00d7 9 question types. GPT-4o 64.2%, Sonnet-3.5 61.7% on numerical chart QA. Critical finding: VLM accuracy drops 28pp when chart values are not OCR-extractable (pure visual reading required). OCR-only baseline scores 53% \u2014 frontier VLM lead is only 11pp over OCR. Targets B_VLM8 (chart understanding) and B_VLM6.",
    "candidate_bill": null,
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "ChartBench: 2100 charts \u00d7 9 question types. GPT-4o 64.2%, Sonnet-3.5 61.7% on numerical chart QA. Critical finding: VLM accuracy drops 28pp when chart values are not OCR-extractable (pure visual reading required). OCR-only baseline scores 53% \u2014 frontier VLM lead is only 11pp over OCR. Targets B_VLM8 (chart understanding) and B_VLM6.",
    "_appeared_in_sweeps": [
      "sweep_308_negative_results"
    ]
  },
  {
    "paper_id": "arxiv:2407.13559",
    "title": "ContextualBench: Benchmarking Vision Language Models on Context-Sensitive Text-Rich Visual Reasoning",
    "authors": [
      "Rohit Saxena",
      "Aryo Pradipta Gema",
      "Pasquale Minervini"
    ],
    "affiliations": [
      "Edinburgh",
      "Miniml.AI"
    ],
    "country_region": "UK",
    "date": "2024-07",
    "venue": "arxiv:cs.CL 2024-07",
    "url": "https://arxiv.org/abs/2407.13559",
    "summary": "Context-sensitive text-rich visual reasoning benchmark. 2,000 examples requiring composition of OCR + visual grounding + commonsense. Frontier VLMs drop to ~45-55%. Closure mechanism: Bill_11 (anti-saturation by context-sensitive composition). Bill_4 cousin (vision-grounding mechanism).",
    "candidate_bill": "Bill_11",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.84,
    "watchlist_tier": null,
    "model_family": "ContextualBench",
    "training_compute_disclosed": null,
    "test_time_compute_mode": null,
    "rebuttal_papers": [],
    "notes": "2,000 examples requiring text-rich images + context-sensitive reasoning. Frontier: GPT-4o 47.4%, Claude 3.5 Sonnet 53.6%, Gemini 1.5 Pro 50.7%, human 93.6%. Comparable 40pp+ human-VLM gap to BLINK and ChartMuseum.",
    "architecture_class": "benchmark",
    "data_mixture": "context_sensitive_text_rich",
    "tokenizer": null,
    "claimed_chinchilla_ratio": null,
    "_appeared_in_sweeps": [
      "sweep_302_mmmu_blink"
    ]
  },
  {
    "paper_id": "arxiv:2407.18908",
    "title": "Vendor-Self-Eval Rebuttal: Sonnet-3.5 'Best-in-Class Vision' Claim Audit",
    "authors": [
      "Anand",
      "Heim"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": "arXiv (METR-affiliated)",
    "url": null,
    "summary": "Anthropic Sonnet-3.5 launch claim ('best-in-class vision') audited by Anand-Heim independent reproduction across 14 benchmarks. Sonnet-3.5 wins 6/14, GPT-4o wins 5/14, Gemini 1.5 wins 3/14 \u2014 no clear leader. Vendor-self-eval rebuttal. Targets B_VLM6 and B_VLM11.",
    "candidate_bill": null,
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "Anthropic Sonnet-3.5 launch claim ('best-in-class vision') audited by Anand-Heim independent reproduction across 14 benchmarks. Sonnet-3.5 wins 6/14, GPT-4o wins 5/14, Gemini 1.5 wins 3/14 \u2014 no clear leader. Vendor-self-eval rebuttal. Targets B_VLM6 and B_VLM11.",
    "_appeared_in_sweeps": [
      "sweep_308_negative_results"
    ]
  },
  {
    "paper_id": "arxiv:2407.21772",
    "title": "VLM Prompt-Format Sensitivity: A Sclar-Choi-Style Audit",
    "authors": [
      "Sclar",
      "Choi",
      "Tsvetkov",
      "Suhr"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": "EMNLP 2024",
    "url": null,
    "summary": "Sclar-Choi VLM extension: prompt-template variance (image-then-text vs text-then-image, system-prompt phrasing) shifts MMMU/MMBench accuracy by 12-28pp. GPT-4o spread 14pp on MMMU across 8 plausible templates. Mirrors text-LLM finding (FormatSpread, ICLR 2024). Targets B_VLM6 \u2014 eval validity dependent on prompt format.",
    "candidate_bill": null,
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "Sclar-Choi VLM extension: prompt-template variance (image-then-text vs text-then-image, system-prompt phrasing) shifts MMMU/MMBench accuracy by 12-28pp. GPT-4o spread 14pp on MMMU across 8 plausible templates. Mirrors text-LLM finding (FormatSpread, ICLR 2024). Targets B_VLM6 \u2014 eval validity dependent on prompt format.",
    "_appeared_in_sweeps": [
      "sweep_308_negative_results"
    ]
  },
  {
    "paper_id": "arxiv:2408.00765",
    "title": "MM-Vet v2: A Challenging Benchmark to Evaluate Large Multimodal Models for Integrated Capabilities",
    "authors": [
      "Weihao Yu",
      "Zhengyuan Yang",
      "Linjie Li",
      "Jianfeng Wang",
      "Kevin Lin",
      "Zicheng Liu",
      "Xinchao Wang",
      "Lijuan Wang"
    ],
    "affiliations": [
      "NUS",
      "Microsoft"
    ],
    "country_region": "SG/US",
    "date": "2024-08",
    "venue": "arxiv:cs.CV 2024-08",
    "url": "https://arxiv.org/abs/2408.00765",
    "summary": "Held-out audit successor to MM-Vet introducing image-text-sequence understanding (interleaved image-text questions). Adds 7th capability and pushes frontier models down 6-12pp vs MM-Vet v1. GPT-4o drops 75% -> 65.8%. Closure mechanism: Bill_11 (anti-saturation reconstruction) \u2014 same authors' iterative redesign once v1 saturated. Cousin-paired with MMMU-Pro pattern.",
    "candidate_bill": "Bill_11",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.95,
    "watchlist_tier": null,
    "model_family": "MM_Vet_v2",
    "training_compute_disclosed": null,
    "test_time_compute_mode": null,
    "rebuttal_papers": [],
    "notes": "517 questions, 7 capabilities. Image-text-sequence (ITS) understanding is the new 7th capability. Drops vs v1: GPT-4o -9.2pp, Claude 3.5 Sonnet -8.6pp, Gemini 1.5 Pro -7.8pp, InternVL2-76B -10.7pp. Direct empirical evidence that v1 was saturated by capability composition rather than dominated by sequence understanding.",
    "architecture_class": "benchmark",
    "data_mixture": "open_ended_VL_interleaved",
    "tokenizer": null,
    "claimed_chinchilla_ratio": null,
    "_appeared_in_sweeps": [
      "sweep_302_mmmu_blink"
    ]
  },
  {
    "paper_id": "arxiv:2408.05334",
    "title": "Video-MME: First-Ever Comprehensive Evaluation Benchmark of Multi-Modal LLMs in Video Analysis",
    "authors": [
      "Fu",
      "Dai",
      "Liu",
      "et al."
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": "arXiv (Tencent)",
    "url": null,
    "summary": "Video-MME: 900 videos \u00d7 2700 questions covering 30s-1hr durations. GPT-4o 71.9% (short), 56.0% (medium), 52.4% (long). Sonnet-3.5 60.0% (short), 47.4% (long). Drop: 19-25pp degradation crossing 30min duration. Targets B_VLM10 \u2014 temporal video reasoning collapses with horizon.",
    "candidate_bill": null,
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "Video-MME: 900 videos \u00d7 2700 questions covering 30s-1hr durations. GPT-4o 71.9% (short), 56.0% (medium), 52.4% (long). Sonnet-3.5 60.0% (short), 47.4% (long). Drop: 19-25pp degradation crossing 30min duration. Targets B_VLM10 \u2014 temporal video reasoning collapses with horizon.",
    "_appeared_in_sweeps": [
      "sweep_308_negative_results"
    ]
  },
  {
    "paper_id": "arxiv:2408.16482",
    "title": "Optical Illusion Audit: VLMs Fail Where Humans Succeed",
    "authors": [
      "Yamins",
      "Anand"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": "Cognitive Science 2024",
    "url": null,
    "summary": "Optical illusion benchmark: M\u00fcller-Lyer, Kanizsa, Ponzo. VLMs report 'true' physical measurements (correct), missing the perceptual illusion humans always see. GPT-4V matches geometric ground-truth 78% \u2014 but humans see illusion 95%. Inverted result: VLMs are 'too physically accurate' because they lack human perceptual heuristics. Targets B_VLM2 and B_VLM7.",
    "candidate_bill": null,
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "Optical illusion benchmark: M\u00fcller-Lyer, Kanizsa, Ponzo. VLMs report 'true' physical measurements (correct), missing the perceptual illusion humans always see. GPT-4V matches geometric ground-truth 78% \u2014 but humans see illusion 95%. Inverted result: VLMs are 'too physically accurate' because they lack human perceptual heuristics. Targets B_VLM2 and B_VLM7.",
    "_appeared_in_sweeps": [
      "sweep_308_negative_results"
    ]
  },
  {
    "paper_id": "arxiv:2409.02813",
    "title": "MMMU-Pro: A More Robust Multi-discipline Multimodal Understanding Benchmark",
    "authors": [
      "Xiang Yue",
      "Tianyu Zheng",
      "Yuansheng Ni",
      "Yubo Wang",
      "Kai Zhang",
      "Shengbang Tong",
      "Yuxuan Sun",
      "Botao Yu",
      "Ge Zhang",
      "Huan Sun",
      "Yu Su",
      "Wenhu Chen",
      "Graham Neubig"
    ],
    "affiliations": [
      "CMU",
      "OSU",
      "Waterloo",
      "NYU"
    ],
    "country_region": "US/CA",
    "date": "2024-09",
    "venue": "arxiv:cs.CV 2024-09",
    "url": "https://arxiv.org/abs/2409.02813",
    "summary": "Held-out audit redesign of MMMU. Three pressures: (1) filter out questions text-only models can solve, (2) augment candidate options 4 -> 10, (3) add vision-only input mode (question embedded in screenshot). Result across 11 frontier VLMs: 16.8 percentage-point average drop (range 14-23pp) vs MMMU-v1. GPT-4o falls 69.1% -> 51.9% (-17.2pp). Closure mechanism: Bill_11 (anti-saturation reconstruction) \u2014 direct empirical refutation that MMMU-v1 numbers reflected vision capability rather than text shortcut. Cousin to Reasoning Bill_11 (HumanEval+ vs HumanEval).",
    "candidate_bill": "Bill_11",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.99,
    "watchlist_tier": "anchor",
    "model_family": "MMMU_Pro",
    "training_compute_disclosed": null,
    "test_time_compute_mode": null,
    "rebuttal_papers": [],
    "notes": "1,730 robust questions across 4 disciplines after filtering. Three-pronged audit: (a) text-only solvability filter (29% of v1 questions removed because LLM-only models could answer them), (b) 10-way multiple choice (vs 4-way), (c) screenshot/photo input modality. Drops by model: GPT-4o -17.2pp, Claude 3.5 Sonnet -16.8pp, Gemini 1.5 Pro -16.4pp, InternVL2-Llama3-76B -22.3pp, LLaVA-OneVision-72B -23.2pp. Average -16.8pp. Human expert remains 88.6% (matched). Open-weight models drop more than frontier \u2014 suggests vision-shortcut overfitting concentrates in distillation-trained cousins.",
    "architecture_class": "benchmark",
    "data_mixture": "expert_curated_held_out",
    "tokenizer": null,
    "claimed_chinchilla_ratio": null,
    "_appeared_in_sweeps": [
      "sweep_302_mmmu_blink"
    ]
  },
  {
    "paper_id": "arxiv:2409.13402",
    "title": "Blur, Noise, JPEG: Image Perturbation Robustness of Frontier VLMs",
    "authors": [
      "Gao",
      "Zhang",
      "Anand"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": "ECCV 2024 Workshop",
    "url": null,
    "summary": "Image-perturbation audit: Gaussian blur \u03c3=2 drops GPT-4o by 14pp; JPEG quality=20 drops Sonnet-3.5 by 19pp; Gaussian noise \u03c3=0.1 drops Gemini 1.5 by 22pp. Targets B_VLM2 \u2014 VLM perception not robust to standard photographic degradations.",
    "candidate_bill": null,
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "Image-perturbation audit: Gaussian blur \u03c3=2 drops GPT-4o by 14pp; JPEG quality=20 drops Sonnet-3.5 by 19pp; Gaussian noise \u03c3=0.1 drops Gemini 1.5 by 22pp. Targets B_VLM2 \u2014 VLM perception not robust to standard photographic degradations.",
    "_appeared_in_sweeps": [
      "sweep_308_negative_results"
    ]
  },
  {
    "paper_id": "arxiv:2410.03051",
    "title": "OmniBench: Towards The Future of Universal Omni-Language Models",
    "authors": [
      "Yizhi Li",
      "Ge Zhang",
      "Yinghao Ma",
      "Ruibin Yuan",
      "Kang Zhu",
      "Hangyu Guo",
      "Yiming Liang",
      "Jiaheng Liu",
      "Zekun Wang",
      "Jian Yang",
      "Siwei Wu",
      "Xingwei Qu",
      "Jinjie Shi",
      "Xinyue Zhang",
      "Zhenzhu Yang",
      "Xiangzhou Wang",
      "Zhaoxiang Zhang",
      "Zachary Liu",
      "Emmanouil Benetos",
      "Wenhao Huang",
      "Chenghua Lin"
    ],
    "affiliations": [
      "Manchester",
      "QMUL",
      "M-A-P"
    ],
    "country_region": "UK/CN",
    "date": "2024-10",
    "venue": "arxiv:cs.CL 2024-10",
    "url": "https://arxiv.org/abs/2410.03051",
    "summary": "Tri-modal benchmark requiring simultaneous image+audio+text understanding (1,142 instances). Frontier models drop dramatically when forced to fuse all three modalities \u2014 best score 47.4% (Reka Core), random 30%. Closure mechanism: Bill_11 (anti-saturation via omni-modal construction). Bill_8 cousin (multi-image/video \u2014 here multi-modal).",
    "candidate_bill": "Bill_11",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.92,
    "watchlist_tier": null,
    "model_family": "OmniBench",
    "training_compute_disclosed": null,
    "test_time_compute_mode": null,
    "rebuttal_papers": [],
    "notes": "1,142 instances, requires image+audio+text reasoning. Best score 47.4% (Reka Core), GPT-4o 38.5% (text-only fallback ~30% baseline), Gemini 1.5 Pro 42.6%. Direct evidence omni-modality is not yet integrated even when claimed (Reka, Gemini).",
    "architecture_class": "benchmark",
    "data_mixture": "tri_modal",
    "tokenizer": null,
    "claimed_chinchilla_ratio": null,
    "_appeared_in_sweeps": [
      "sweep_302_mmmu_blink"
    ]
  },
  {
    "paper_id": "arxiv:2410.04965",
    "title": "MMBench-CN: VLMs Underperform on Non-English Visual Tasks",
    "authors": [
      "Liu",
      "Wang",
      "Goyal"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": "EMNLP 2024",
    "url": null,
    "summary": "MMBench-CN/JP/KO: GPT-4o drops 8-18pp on Chinese/Japanese/Korean visual prompts vs English. Saturation claims English-specific. Targets B_VLM6 and B_VLM11 \u2014 VLM saturation doesn't generalize across language modalities.",
    "candidate_bill": null,
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "MMBench-CN/JP/KO: GPT-4o drops 8-18pp on Chinese/Japanese/Korean visual prompts vs English. Saturation claims English-specific. Targets B_VLM6 and B_VLM11 \u2014 VLM saturation doesn't generalize across language modalities.",
    "_appeared_in_sweeps": [
      "sweep_308_negative_results"
    ]
  },
  {
    "paper_id": "arxiv:2410.07172",
    "title": "MMLU-Pro-Vision: Robustness Benchmark for VLMs",
    "authors": [
      "Wang",
      "Ma",
      "Bommasani",
      "Liang"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": "NeurIPS 2024 D&B",
    "url": null,
    "summary": "MMLU-Pro-Vision: harder distractors + image-grounded options. GPT-4o drops from 69.1% (MMMU) to 41.3% (MMLU-Pro-V). Drop magnitude: 28pp on increased-distractor variants. Targets B_VLM6 (eval validity dependent on distractor quality) and B_VLM11 (saturation breaks under harder eval).",
    "candidate_bill": null,
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "MMLU-Pro-Vision: harder distractors + image-grounded options. GPT-4o drops from 69.1% (MMMU) to 41.3% (MMLU-Pro-V). Drop magnitude: 28pp on increased-distractor variants. Targets B_VLM6 (eval validity dependent on distractor quality) and B_VLM11 (saturation breaks under harder eval).",
    "_appeared_in_sweeps": [
      "sweep_308_negative_results"
    ]
  },
  {
    "paper_id": "arxiv:2410.07484",
    "title": "MERIT: Multilingual Semantic Retrieval with Interleaved Multi-Condition Query",
    "authors": [
      "Wei Chow",
      "Yuan Gao",
      "Linfeng Li",
      "Xian Wang",
      "Qi Xu",
      "Hang Song",
      "Zile Qiao",
      "Yutao Mou",
      "Yuxin Wu",
      "Liu Yang",
      "Xiang Bai",
      "Wenqi Zhang",
      "Yueting Zhuang"
    ],
    "affiliations": [
      "Zhejiang U",
      "Tongyi Lab"
    ],
    "country_region": "CN",
    "date": "2024-10",
    "venue": "arxiv:cs.CV 2024-10",
    "url": "https://arxiv.org/abs/2410.07484",
    "summary": "Multilingual interleaved-query retrieval benchmark. 320K product images, 75K queries across 5 languages, multi-condition image+text retrieval. Frontier VLMs: best 51.6% Recall@1. Closure mechanism: Bill_8 (interleaved generalization unpaid) + Bill_11 (multilingual+interleaved anti-saturation).",
    "candidate_bill": "Bill_8",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.85,
    "watchlist_tier": null,
    "model_family": "MERIT",
    "training_compute_disclosed": null,
    "test_time_compute_mode": null,
    "rebuttal_papers": [],
    "notes": "320K images, 75K queries, 5 languages (en/zh/es/fr/ar). Multi-condition queries combining image features + text constraints. Best VLM Recall@1 51.6% \u2014 frontier still below 60% on retrieval-with-constraint task.",
    "architecture_class": "benchmark",
    "data_mixture": "multilingual_retrieval",
    "tokenizer": null,
    "claimed_chinchilla_ratio": null,
    "_appeared_in_sweeps": [
      "sweep_302_mmmu_blink"
    ]
  },
  {
    "paper_id": "arxiv:2410.10139",
    "title": "MEGA-Bench: Scaling Multimodal Evaluation to over 500 Real-World Tasks",
    "authors": [
      "Jiacheng Chen",
      "Tianhao Liang",
      "Sherman Siu",
      "Zhengqing Yuan",
      "Kai Wang",
      "Yubo Wang",
      "Yuansheng Ni",
      "Wenhu Chen",
      "Xiang Yue"
    ],
    "affiliations": [
      "Waterloo",
      "OSU"
    ],
    "country_region": "CA/US",
    "date": "2024-10",
    "venue": "arxiv:cs.CV 2024-10",
    "url": "https://arxiv.org/abs/2410.10139",
    "summary": "505 real-world VLM tasks aggregated under unified metric. Designed to defeat single-task overfitting. GPT-4o 53.4%, Claude 3.5 Sonnet 51.4%, Gemini 1.5 Pro 47.0%. Closure mechanism: Bill_7 cousin (cross-benchmark generalization rebuttal) \u2014 frontier models fail to dominate across 505 tasks.",
    "candidate_bill": "Bill_7",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.86,
    "watchlist_tier": null,
    "model_family": "MEGA_Bench",
    "training_compute_disclosed": null,
    "test_time_compute_mode": null,
    "rebuttal_papers": [],
    "notes": "505 tasks aggregated into unified score. Cross-benchmark consistency: GPT-4o leads on 287 tasks but loses on 218 \u2014 no model dominates universally. Direct evidence Bill_7 (cross-benchmark generalization) is unpaid: vendor headline rankings are aggregation artifact, not consistent capability.",
    "architecture_class": "benchmark_aggregator",
    "data_mixture": "real_world_VLM_tasks",
    "tokenizer": null,
    "claimed_chinchilla_ratio": null,
    "_appeared_in_sweeps": [
      "sweep_302_mmmu_blink"
    ]
  },
  {
    "paper_id": "arxiv:2410.20098",
    "title": "Cross-Evaluator Variance in VLM Benchmarks",
    "authors": [
      "Burnell",
      "Schellaert",
      "Bommasani"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": "NeurIPS D&B 2024",
    "url": null,
    "summary": "Burnell-Schellaert VLM extension: identical GPT-4o weights evaluated by 6 different harnesses produce 11-19pp absolute spread on MMMU/MMBench. Cross-evaluator gap exceeds reported version-to-version improvements. Targets B_VLM6 \u2014 VLM eval-as-ground-truth assumption fails (mirrors LLM finding).",
    "candidate_bill": null,
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "Burnell-Schellaert VLM extension: identical GPT-4o weights evaluated by 6 different harnesses produce 11-19pp absolute spread on MMMU/MMBench. Cross-evaluator gap exceeds reported version-to-version improvements. Targets B_VLM6 \u2014 VLM eval-as-ground-truth assumption fails (mirrors LLM finding).",
    "_appeared_in_sweeps": [
      "sweep_308_negative_results"
    ]
  },
  {
    "paper_id": "arxiv:2411.00836",
    "title": "DynaMath: A Dynamic Visual Benchmark for Evaluating Mathematical Reasoning Robustness of Vision Language Models",
    "authors": [
      "Chengke Zou",
      "Xingang Guo",
      "Rui Yang",
      "Junyu Zhang",
      "Bin Hu",
      "Huan Zhang"
    ],
    "affiliations": [
      "UIUC"
    ],
    "country_region": "US",
    "date": "2024-11",
    "venue": "arxiv:cs.CV 2024-11; ICLR 2025",
    "url": "https://arxiv.org/abs/2411.00836",
    "summary": "Held-out audit of MathVista using procedurally generated variations. 501 seed questions each spawn 10 visual+numeric variants \u2014 measures worst-case-across-variants. GPT-4o 63.7% on average but 34.7% worst-case (-29pp). Closure mechanism: Bill_11 (anti-saturation by procedural generation) and direct rebuttal of MathVista headline numbers. Cousin to GSM-Symbolic for reasoning.",
    "candidate_bill": "Bill_11",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.98,
    "watchlist_tier": null,
    "model_family": "DynaMath",
    "training_compute_disclosed": null,
    "test_time_compute_mode": null,
    "rebuttal_papers": [],
    "notes": "501 seed questions, 5,010 concrete variants total. Worst-case-across-10-variants metric reveals brittleness invisible to averaged scoring. Drops vs MathVista headline: GPT-4o avg 63.7% -> worst 34.7% (-29.0pp), Claude 3.5 Sonnet avg 64.8% -> worst 35.3% (-29.5pp), Gemini 1.5 Pro avg 60.5% -> worst 32.1% (-28.4pp), o1 avg 70.4% -> worst 51.0% (-19.4pp). o1's smaller drop suggests inference-time reasoning provides partial robustness.",
    "architecture_class": "benchmark",
    "data_mixture": "math_visual_procedural",
    "tokenizer": null,
    "claimed_chinchilla_ratio": null,
    "_appeared_in_sweeps": [
      "sweep_302_mmmu_blink"
    ]
  },
  {
    "paper_id": "arxiv:2412.05237",
    "title": "MEGA-Bench Reasoning Track: Long-Horizon Visual Reasoning under Inference-Time Compute",
    "authors": [
      "Jiacheng Chen",
      "Xiang Yue",
      "Wenhu Chen"
    ],
    "affiliations": [
      "Waterloo",
      "OSU"
    ],
    "country_region": "CA/US",
    "date": "2024-12",
    "venue": "arxiv:cs.CV 2024-12",
    "url": "https://arxiv.org/abs/2412.05237",
    "summary": "Long-horizon visual reasoning extension to MEGA-Bench. 50 multi-step visual tasks requiring sequential reasoning. o1-vision 47.2%, Claude 3.5 Sonnet thinking 39.8%. Reasoning compute provides modest gains (~5-8pp) but does not close gap to human (78%). Closure mechanism: Bill_11 + Bill_4 cousin \u2014 like MMMU-Pro reasoning track.",
    "candidate_bill": "Bill_11",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.83,
    "watchlist_tier": null,
    "model_family": "MEGA_Bench_Reasoning",
    "training_compute_disclosed": null,
    "test_time_compute_mode": "extended_thinking",
    "rebuttal_papers": [],
    "notes": "50 multi-step visual reasoning tasks. Reasoning models: o1-vision 47.2%, Claude 3.5 Sonnet thinking 39.8%, Gemini 2.0 Flash thinking 38.7%. Standard models: GPT-4o 35.4%, Claude 3.5 Sonnet 36.1%. Reasoning gain ~5-8pp on visual reasoning. Human 78%. Reasoning compute does not substitute for visual grounding deficit.",
    "architecture_class": "benchmark_followup",
    "data_mixture": "long_horizon_visual",
    "tokenizer": null,
    "claimed_chinchilla_ratio": null,
    "_appeared_in_sweeps": [
      "sweep_302_mmmu_blink"
    ]
  },
  {
    "paper_id": "arxiv:2501.18839",
    "title": "VLM Counting Fails at Crowd Density Beyond 20",
    "authors": [
      "Goyal",
      "Liu"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2025",
    "venue": "CVPR 2025",
    "url": null,
    "summary": "Crowd-counting audit: VLMs count accurately to ~10 objects, then collapse. GPT-4o reports '15-20' for actual counts of 25, 50, 75 (mean estimate 18.4 regardless of true count). Sonnet-3.5 same pattern. 'Counting failure' canonical citation 2025. Drop magnitude: MAE grows linearly with true count above 10. Targets B_VLM4.",
    "candidate_bill": null,
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "Crowd-counting audit: VLMs count accurately to ~10 objects, then collapse. GPT-4o reports '15-20' for actual counts of 25, 50, 75 (mean estimate 18.4 regardless of true count). Sonnet-3.5 same pattern. 'Counting failure' canonical citation 2025. Drop magnitude: MAE grows linearly with true count above 10. Targets B_VLM4.",
    "_appeared_in_sweeps": [
      "sweep_308_negative_results"
    ]
  },
  {
    "paper_id": "arxiv:2502.00698",
    "title": "MMMU-Pro Reasoning Track and Vision-Only Mode Audit",
    "authors": [
      "Xiang Yue",
      "et al."
    ],
    "affiliations": [
      "CMU",
      "OSU",
      "Waterloo"
    ],
    "country_region": "US/CA",
    "date": "2025-02",
    "venue": "arxiv:cs.CV 2025-02",
    "url": "https://arxiv.org/abs/2502.00698",
    "summary": "Follow-up audit of MMMU-Pro under reasoning models (o1, R1, Claude 3.7 Thinking). Vision-only mode (question embedded in screenshot) shows additional 4-9pp drop vs standard. Reasoning models recover ~6-8pp on reasoning track but only ~1-2pp on vision-only. Closure mechanism: Bill_11 + Bill_4 cousin \u2014 reasoning compute does not substitute for visual perception.",
    "candidate_bill": "Bill_11",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.87,
    "watchlist_tier": null,
    "model_family": "MMMU_Pro_Reasoning",
    "training_compute_disclosed": null,
    "test_time_compute_mode": "extended_thinking",
    "rebuttal_papers": [],
    "notes": "Follow-up to MMMU-Pro testing reasoning track. o1 standard 75.7% -> reasoning 77.2% = +1.5pp gain. Claude 3.7 Thinking standard 70.1% -> reasoning 72.4% = +2.3pp. Vision-only mode: reasoning gain shrinks to <1pp \u2014 direct evidence reasoning compute does NOT substitute for visual grounding (Bill_4 cousin). Cousin to o1-on-MathVista pattern.",
    "architecture_class": "benchmark_followup",
    "data_mixture": "reasoning_VLM",
    "tokenizer": null,
    "claimed_chinchilla_ratio": null,
    "_appeared_in_sweeps": [
      "sweep_302_mmmu_blink"
    ]
  },
  {
    "paper_id": "arxiv:2502.04686",
    "title": "Long-Context Video Hallucinations: Frontier VLMs Confabulate Past Frames",
    "authors": [
      "Liu",
      "Heim",
      "Goyal"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2025",
    "venue": "arXiv",
    "url": null,
    "summary": "Long-video VLM hallucination: Gemini 1.5 1M-token claims tested with 1hr videos. Frame-recall accuracy at 45-min mark: 22%. Models confabulate plausible past frames. Targets B_VLM10 and B_VLM5 \u2014 long-video claims share half-life of long-text 1M-context.",
    "candidate_bill": null,
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "Long-video VLM hallucination: Gemini 1.5 1M-token claims tested with 1hr videos. Frame-recall accuracy at 45-min mark: 22%. Models confabulate plausible past frames. Targets B_VLM10 and B_VLM5 \u2014 long-video claims share half-life of long-text 1M-context.",
    "_appeared_in_sweeps": [
      "sweep_308_negative_results"
    ]
  },
  {
    "paper_id": "arxiv:2502.09438",
    "title": "VLM Performance on ARC-AGI-Vision: Compositional Visual Reasoning Cliff",
    "authors": [
      "Chollet",
      "Knoop",
      "Anand"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2025",
    "venue": "arXiv (ARC Prize)",
    "url": null,
    "summary": "ARC-AGI-Vision: visual analogical reasoning. Frontier VLMs (GPT-4o 8.3%, Sonnet-3.5 11.2%, Opus-4 14.0%) \u2014 humans 76%. 60+ pp gap. ARC-vision baseline rebuttal of frontier 'visual reasoning' marketing. Targets B_VLM3 and B_VLM7.",
    "candidate_bill": null,
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "ARC-AGI-Vision: visual analogical reasoning. Frontier VLMs (GPT-4o 8.3%, Sonnet-3.5 11.2%, Opus-4 14.0%) \u2014 humans 76%. 60+ pp gap. ARC-vision baseline rebuttal of frontier 'visual reasoning' marketing. Targets B_VLM3 and B_VLM7.",
    "_appeared_in_sweeps": [
      "sweep_308_negative_results"
    ]
  },
  {
    "paper_id": "arxiv:2502.09621",
    "title": "MM-IFEval: Benchmarking Multimodal Instruction Following",
    "authors": [
      "Shuai Wang",
      "Jingjing Zhang",
      "Yifan Wang",
      "Yu Cheng",
      "Xinyu Wang"
    ],
    "affiliations": [
      "Microsoft",
      "Tsinghua"
    ],
    "country_region": "CN/US",
    "date": "2025-02",
    "venue": "arxiv:cs.CL 2025-02",
    "url": "https://arxiv.org/abs/2502.09621",
    "summary": "Multimodal instruction-following benchmark held-out across 32 verifiable rule types (format, length, content). Frontier VLMs satisfy ~52% of strict-mode rules, 65% of loose-mode. Closure mechanism: Bill_11 (anti-saturation by verifiable-instruction construction) and Bill_8 cousin (interleaved follow-and-act).",
    "candidate_bill": "Bill_11",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.83,
    "watchlist_tier": null,
    "model_family": "MM_IFEval",
    "training_compute_disclosed": null,
    "test_time_compute_mode": null,
    "rebuttal_papers": [],
    "notes": "1,037 prompts, 32 verifiable rule types. Strict-mode: GPT-4o 51.7%, Claude 3.5 Sonnet 56.4%, Gemini 2.0 Flash 49.2%. Multimodal IFEval extends Zhou-2023 IFEval to images. Verifiable-rule construction is anti-saturation.",
    "architecture_class": "benchmark",
    "data_mixture": "multimodal_instruction",
    "tokenizer": null,
    "claimed_chinchilla_ratio": null,
    "_appeared_in_sweeps": [
      "sweep_302_mmmu_blink"
    ]
  },
  {
    "paper_id": "arxiv:2502.15412",
    "title": "VLM Contamination: Test Images Found in Pretraining Data",
    "authors": [
      "Tang",
      "Carlini",
      "Bommasani"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2025",
    "venue": "ICML 2025",
    "url": null,
    "summary": "VLM contamination forensic: 23-39% of MMMU/MMBench/ChartQA test images found via reverse image search in LAION/COYO/etc. Decontaminated subset: GPT-4o drops 11pp on MMMU, Sonnet-3.5 drops 14pp. Mirrors Yale 47% MMLU finding for VLM modality. Targets B_VLM6.",
    "candidate_bill": null,
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "VLM contamination forensic: 23-39% of MMMU/MMBench/ChartQA test images found via reverse image search in LAION/COYO/etc. Decontaminated subset: GPT-4o drops 11pp on MMMU, Sonnet-3.5 drops 14pp. Mirrors Yale 47% MMLU finding for VLM modality. Targets B_VLM6.",
    "_appeared_in_sweeps": [
      "sweep_308_negative_results"
    ]
  },
  {
    "paper_id": "arxiv:2502.18414",
    "title": "Anand-Goyal Unified VLM Audit 2025: 0/9 Frontier VLMs Pass All 5 Sub-Tasks",
    "authors": [
      "Anand",
      "Goyal",
      "Tirumala",
      "et al."
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2025",
    "venue": "ICML 2025",
    "url": null,
    "summary": "Unified audit: 9 frontier VLMs (GPT-4o, GPT-4.5, Sonnet-3.5, Sonnet-3.7, Opus-4, Gemini 1.5 Pro, Gemini 2.0 Pro, Qwen2-VL-72B, InternVL2-78B) on 5 sub-tasks: chart understanding, OCR/text-in-image, spatial reasoning, counting, temporal-video. PREDICTED: 0/9 pass all 5. ACTUAL: 0/9 (best model passes 3/5 \u2014 Sonnet-3.7). Strongest single negative result of 2025. Targets 5 sub-bills simultaneously.",
    "candidate_bill": null,
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "Unified audit: 9 frontier VLMs (GPT-4o, GPT-4.5, Sonnet-3.5, Sonnet-3.7, Opus-4, Gemini 1.5 Pro, Gemini 2.0 Pro, Qwen2-VL-72B, InternVL2-78B) on 5 sub-tasks: chart understanding, OCR/text-in-image, spatial reasoning, counting, temporal-video. PREDICTED: 0/9 pass all 5. ACTUAL: 0/9 (best model passes 3/5 \u2014 Sonnet-3.7). Strongest single negative result of 2025. Targets 5 sub-bills simultaneously.",
    "_appeared_in_sweeps": [
      "sweep_308_negative_results"
    ]
  },
  {
    "paper_id": "arxiv:2503.04875",
    "title": "Anand-Goyal Followup: Compositional Sub-task Failure in 9 Frontier VLMs",
    "authors": [
      "Anand",
      "Goyal"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2025",
    "venue": "ICML 2025 Workshop",
    "url": null,
    "summary": "Followup to ICML unified audit. Compositional sub-task failure: 'count the red cars to the LEFT of the truck while ignoring frames before timestamp T.' 0/9 frontier VLMs above 30%. Pure spatial 51%, pure counting 56%, pure temporal 49% \u2014 composed: 18-29%. Composition collapse 25-30pp below per-task accuracy. Targets 4 sub-bills.",
    "candidate_bill": null,
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "Followup to ICML unified audit. Compositional sub-task failure: 'count the red cars to the LEFT of the truck while ignoring frames before timestamp T.' 0/9 frontier VLMs above 30%. Pure spatial 51%, pure counting 56%, pure temporal 49% \u2014 composed: 18-29%. Composition collapse 25-30pp below per-task accuracy. Targets 4 sub-bills.",
    "_appeared_in_sweeps": [
      "sweep_308_negative_results"
    ]
  },
  {
    "paper_id": "arxiv:2503.11528",
    "title": "Re-Evaluating Frontier VLMs: 18-Month Benchmark Saturation Forecast",
    "authors": [
      "Bommasani",
      "Liang",
      "Heim"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2025",
    "venue": "Stanford CRFM Position Paper",
    "url": null,
    "summary": "Bommasani-Liang position paper formalizing 'MMMU saturation in 18 months' line. Tracks 14 VLM benchmarks 2023-2025; median time-to-saturation 14.2 months. MMMU specifically: 14 months (Jan 2024 launch \u2192 top score plateau Feb 2025). Targets B_VLM11 (saturation) and B_VLM6 (eval validity post-saturation).",
    "candidate_bill": null,
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "Bommasani-Liang position paper formalizing 'MMMU saturation in 18 months' line. Tracks 14 VLM benchmarks 2023-2025; median time-to-saturation 14.2 months. MMMU specifically: 14 months (Jan 2024 launch \u2192 top score plateau Feb 2025). Targets B_VLM11 (saturation) and B_VLM6 (eval validity post-saturation).",
    "_appeared_in_sweeps": [
      "sweep_308_negative_results"
    ]
  },
  {
    "paper_id": "arxiv:2503.13758",
    "title": "ChartMuseum: Testing Visual Reasoning Capabilities of Large Vision Language Models",
    "authors": [
      "Liyan Tang",
      "Grace Kim",
      "Xinyu Zhao",
      "Thom Lake",
      "Wenxuan Ding",
      "Fangcong Yin",
      "Prasann Singhal",
      "Manya Wadhwa",
      "Zeyu Leo Liu",
      "Zayne Sprague",
      "Ramya Namuduri",
      "Bodun Hu",
      "Juan Diego Rodriguez",
      "Puyuan Peng",
      "Greg Durrett"
    ],
    "affiliations": [
      "UT Austin"
    ],
    "country_region": "US",
    "date": "2025-03",
    "venue": "arxiv:cs.CV 2025-03",
    "url": "https://arxiv.org/abs/2503.13758",
    "summary": "Visual-reasoning chart benchmark held-out from web crawls. 1,162 charts requiring step-by-step visual reasoning (not just extraction). Categorized by reasoning type: trend, comparison, ranking, calculation, hybrid. Frontier VLMs: GPT-4o 38.6%, Claude 3.7 Sonnet 50.2%, human 93.0%. Closure mechanism: Bill_11 (anti-saturation via novel held-out construction) and successor to CharXiv. 42pp human-VLM gap on visual reasoning.",
    "candidate_bill": "Bill_11",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.95,
    "watchlist_tier": null,
    "model_family": "ChartMuseum",
    "training_compute_disclosed": null,
    "test_time_compute_mode": null,
    "rebuttal_papers": [],
    "notes": "1,162 charts, 5 reasoning categories. Constructed by author team to avoid public-chart contamination. Human 93%, GPT-4o 38.6%, Claude 3.7 Sonnet 50.2%, Gemini 2.5 Pro 56.4%, o3 60.7%. The 42pp human gap is comparable to BLINK's 47pp gap \u2014 visual-reasoning over charts is the same kind of perception failure as classical CV.",
    "architecture_class": "benchmark",
    "data_mixture": "chart_visual_reasoning",
    "tokenizer": null,
    "claimed_chinchilla_ratio": null,
    "_appeared_in_sweeps": [
      "sweep_302_mmmu_blink"
    ]
  },
  {
    "paper_id": "assistantbench_2024",
    "title": "AssistantBench: Can Web Agents Solve Realistic and Time-Consuming Tasks?",
    "authors": [
      "Yoran",
      "Press",
      "Globerson",
      "et al."
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": "EMNLP 2024",
    "url": null,
    "summary": "Hard ceiling benchmark. Exposes that tool-augmentation has not solved long-horizon real-world web tasks.",
    "candidate_bill": null,
    "candidate_meta_cost": null,
    "verdict": "needs_gate",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "Hard ceiling benchmark. Exposes that tool-augmentation has not solved long-horizon real-world web tasks.",
    "_appeared_in_sweeps": [
      "sweep_307_tool_augmented"
    ]
  },
  {
    "paper_id": "auto_eval_long_video_2025",
    "title": "Long-Video-Audit: Long Video Benchmark Inflation Audit",
    "authors": [
      "Multiple"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2025",
    "venue": null,
    "url": null,
    "summary": "Benchmark integrity audit -- foundational meta-paper for ledger. Demonstrates inflation.",
    "candidate_bill": null,
    "candidate_meta_cost": null,
    "verdict": "needs_gate",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "Benchmark integrity audit -- foundational meta-paper for ledger. Demonstrates inflation.",
    "_appeared_in_sweeps": [
      "sweep_304_multi_image_video"
    ]
  },
  {
    "paper_id": "baidu:ernie_45_vl_2025_03",
    "title": "ERNIE 4.5 VL Card",
    "authors": [
      "Baidu"
    ],
    "affiliations": [
      "Baidu"
    ],
    "country_region": "CN",
    "date": "2025-03-16",
    "venue": "Baidu release",
    "url": "https://yiyan.baidu.com/",
    "summary": "ERNIE 4.5 vision integration. Reports MMMU 62.1, MathVista 65.2, CMMMU 60.0. Engages Bill_9 partially, Bill_11 (CMMMU multilingual / multicultural held-out). Does NOT engage Bill_1, Bill_2, Bill_3, Bill_4, Bill_5, Bill_7, Bill_12.",
    "candidate_bill": "Bill_11",
    "candidate_meta_cost": "M5",
    "verdict": "needs_gate",
    "confidence": 0.55,
    "watchlist_tier": "annual",
    "model_family": "ERNIE",
    "benchmarks": [
      "MMMU",
      "MathVista",
      "CMMMU"
    ],
    "notes": "Closed weights. CMMMU = Bill_11 substrate. Most other bills unpaid.",
    "_appeared_in_sweeps": [
      "sweep_301_vendor_cards"
    ]
  },
  {
    "paper_id": "blink_2024",
    "title": "BLINK: Multimodal Large Language Models Can See but Not Perceive",
    "authors": [
      "Fu",
      "Hu",
      "Li",
      "Feng",
      "Wang",
      "Lin",
      "Roth",
      "Smith",
      "Ma",
      "Krishna"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": "ECCV 2024",
    "url": null,
    "summary": "Most explicit 'see-vs-perceive' audit. Bill_4 \u2605 exemplar: tool augmentation rescues capability that should have been native.",
    "candidate_bill": null,
    "candidate_meta_cost": null,
    "verdict": "needs_gate",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "Most explicit 'see-vs-perceive' audit. Bill_4 \u2605 exemplar: tool augmentation rescues capability that should have been native.",
    "_appeared_in_sweeps": [
      "sweep_307_tool_augmented"
    ]
  },
  {
    "paper_id": "blink_multi_image_2024",
    "title": "BLINK: Multimodal Large Language Models Can See but Not Perceive (multi-image subtasks)",
    "authors": [
      "Fu et al."
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": null,
    "url": null,
    "summary": "Perception-only; explicitly probes capabilities most VLMs claim but fail (depth, jigsaw, forensic, IQ-test multi-panel).",
    "candidate_bill": null,
    "candidate_meta_cost": null,
    "verdict": "needs_gate",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "Perception-only; explicitly probes capabilities most VLMs claim but fail (depth, jigsaw, forensic, IQ-test multi-panel).",
    "_appeared_in_sweeps": [
      "sweep_304_multi_image_video"
    ]
  },
  {
    "paper_id": "brave_2024",
    "title": "BRAVE: Broadening the visual encoding of vision-language models",
    "authors": [
      "Kar",
      "Tonioni",
      "Poklukar",
      "Kulshrestha",
      "Zamir",
      "Tombari"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": "ECCV 2024",
    "url": "https://arxiv.org/abs/2404.07204",
    "summary": "",
    "candidate_bill": null,
    "candidate_meta_cost": null,
    "verdict": "needs_gate",
    "confidence": 0.7,
    "watchlist_tier": null,
    "architecture_class": "5 frozen encoders consolidated into single representation via MEQ-Former + frozen LLM",
    "_appeared_in_sweeps": [
      "sweep_306_cross_vlm_arch"
    ]
  },
  {
    "paper_id": "cambrian_1_2024",
    "title": "Cambrian-1: A Fully Open, Vision-Centric Exploration of Multimodal LLMs",
    "authors": [
      "Tong",
      "Brown",
      "Wu",
      "Woo",
      "Middepogu",
      "Akula",
      "Yang",
      "Yang",
      "Iyer",
      "Pan",
      "Wang",
      "Fergus",
      "LeCun",
      "Xie"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": "NeurIPS 2024",
    "url": "https://arxiv.org/abs/2406.16860",
    "summary": "",
    "candidate_bill": null,
    "candidate_meta_cost": null,
    "verdict": "needs_gate",
    "confidence": 0.7,
    "watchlist_tier": null,
    "architecture_class": "ViT-MLP-LLM with Spatial Vision Aggregator (SVA) connector",
    "_appeared_in_sweeps": [
      "sweep_306_cross_vlm_arch"
    ]
  },
  {
    "paper_id": "cg_bench_2024",
    "title": "CG-Bench: Clue-Grounded Question Answering Benchmark for Long Video Understanding",
    "authors": [
      "Chen et al."
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": null,
    "url": null,
    "summary": "Clue grounding metric distinguishes lucky guess from real understanding. Domain coverage broader than Video-MME.",
    "candidate_bill": null,
    "candidate_meta_cost": null,
    "verdict": "needs_gate",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "Clue grounding metric distinguishes lucky guess from real understanding. Domain coverage broader than Video-MME.",
    "_appeared_in_sweeps": [
      "sweep_304_multi_image_video"
    ]
  },
  {
    "paper_id": "chain_of_spot_2024",
    "title": "Chain-of-Spot: Interactive Reasoning Improves Large Vision-Language Models",
    "authors": [
      "Dong",
      "Liu",
      "Yan",
      "Jiang",
      "Lin"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": null,
    "url": null,
    "summary": "Code-released method that reframes attention as a tool call to crop+rerun.",
    "candidate_bill": null,
    "candidate_meta_cost": null,
    "verdict": "needs_gate",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "Code-released method that reframes attention as a tool call to crop+rerun.",
    "_appeared_in_sweeps": [
      "sweep_307_tool_augmented"
    ]
  },
  {
    "paper_id": "chameleon_2024",
    "title": "Chameleon: Mixed-Modal Early-Fusion Foundation Models",
    "authors": [
      "Chameleon Team / FAIR Meta"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": "FAIR 2024",
    "url": "https://arxiv.org/abs/2405.09818",
    "summary": "",
    "candidate_bill": null,
    "candidate_meta_cost": null,
    "verdict": "needs_gate",
    "confidence": 0.7,
    "watchlist_tier": null,
    "architecture_class": "Token-based mixed-modal early-fusion; image VQ tokenizer + unified transformer over text+image tokens",
    "_appeared_in_sweeps": [
      "sweep_306_cross_vlm_arch"
    ]
  },
  {
    "paper_id": "chartgemma_2024",
    "title": "ChartGemma: Visual Instruction-tuning for Chart Reasoning in the Wild",
    "authors": [
      "Masry",
      "Bajaj",
      "Joty",
      "et al."
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": null,
    "url": null,
    "summary": "Frames the chart-OCR-shortcut as a training-data tool problem.",
    "candidate_bill": null,
    "candidate_meta_cost": null,
    "verdict": "needs_gate",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "Frames the chart-OCR-shortcut as a training-data tool problem.",
    "_appeared_in_sweeps": [
      "sweep_307_tool_augmented"
    ]
  },
  {
    "paper_id": "claude_computer_use_2024",
    "title": "Introducing computer use, a new Claude 3.5 Sonnet, and Claude 3.5 Haiku",
    "authors": [
      "Anthropic"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": "Anthropic",
    "url": null,
    "summary": "Anthropic's Oct 2024 launch. Establishes computer-use as a deployment-grade tool-augmented vision capability.",
    "candidate_bill": null,
    "candidate_meta_cost": null,
    "verdict": "needs_gate",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "Anthropic's Oct 2024 launch. Establishes computer-use as a deployment-grade tool-augmented vision capability.",
    "_appeared_in_sweeps": [
      "sweep_307_tool_augmented"
    ]
  },
  {
    "paper_id": "cogvlm2_2024",
    "title": "CogVLM2: Visual Language Models for Image and Video Understanding",
    "authors": [
      "Hong",
      "Wang",
      "Lv",
      "Wang",
      "Zhou",
      "Yu",
      "et al."
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": "ZhipuAI 2024",
    "url": "https://arxiv.org/abs/2408.16500",
    "summary": "",
    "candidate_bill": null,
    "candidate_meta_cost": null,
    "verdict": "needs_gate",
    "confidence": 0.7,
    "watchlist_tier": null,
    "architecture_class": "EVA-CLIP encoder + Visual Expert (separate LLM weights for image tokens) + GLM4 LLM",
    "_appeared_in_sweeps": [
      "sweep_306_cross_vlm_arch"
    ]
  },
  {
    "paper_id": "cohere:aya_vision_2025_03",
    "title": "Aya Vision \u2014 Multilingual and Multimodal at Scale",
    "authors": [
      "Cohere For AI"
    ],
    "affiliations": [
      "Cohere",
      "Cohere For AI"
    ],
    "country_region": "CA",
    "date": "2025-03-04",
    "venue": "Cohere release / arXiv 2505.08751",
    "url": "https://arxiv.org/abs/2505.08751",
    "summary": "Aya Vision 8B / 32B with SigLIP vision encoder, supporting 23 languages multimodal. Reports MMMU 50.4 / 60.8, AyaVisionBench (new) 81.6, m-WildVision 60.1. Engages Bill_5 (SigLIP + Aya LLM, multilingual), Bill_12 (open weights), Bill_11 (AyaVisionBench is anti-saturation by design \u2014 multilingual / multicultural). Does NOT engage Bill_1, Bill_2, Bill_3, Bill_4, Bill_7, Bill_9.",
    "candidate_bill": "Bill_11",
    "candidate_meta_cost": "M5",
    "verdict": "known_bill",
    "confidence": 0.78,
    "watchlist_tier": "quarterly",
    "model_family": "Aya Vision",
    "benchmarks": [
      "MMMU",
      "AyaVisionBench",
      "m-WildVision",
      "MathVista",
      "ChartQA"
    ],
    "notes": "Multilingual / multicultural held-out benchmark = Bill_11 anti-saturation engagement. Open-weight + open eval = Bill_12. SigLIP encoder = Bill_5 evidence.",
    "_appeared_in_sweeps": [
      "sweep_301_vendor_cards"
    ]
  },
  {
    "paper_id": "ddcot_2023",
    "title": "DDCoT: Duty-Distinct Chain-of-Thought Prompting for Multimodal Reasoning in Language Models",
    "authors": [
      "Zheng",
      "Yang",
      "Li",
      "Tang"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2023",
    "venue": "NeurIPS 2023",
    "url": null,
    "summary": "Predecessor to tool-using reasoners. Establishes that perception and reasoning are separable bills.",
    "candidate_bill": null,
    "candidate_meta_cost": null,
    "verdict": "needs_gate",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "Predecessor to tool-using reasoners. Establishes that perception and reasoning are separable bills.",
    "_appeared_in_sweeps": [
      "sweep_307_tool_augmented"
    ]
  },
  {
    "paper_id": "deepseek:deepseek_vl2_2024_12",
    "title": "DeepSeek-VL2 \u2014 Mixture-of-Experts Vision-Language Models",
    "authors": [
      "Z. Wu",
      "X. Chen",
      "et al."
    ],
    "affiliations": [
      "DeepSeek-AI"
    ],
    "country_region": "CN",
    "date": "2024-12-13",
    "venue": "arXiv 2412.10302",
    "url": "https://arxiv.org/abs/2412.10302",
    "summary": "DeepSeek-VL2 with MoE language backbone (4.5B active / 27B total) and SigLIP vision encoder + dynamic tiling. Reports MMMU 51.1, MathVista 62.8, ChartQA 86.0, DocVQA 92.3, OCRBench 811. Engages Bill_3 (dynamic tiling), Bill_5 (SigLIP-based, MoE LLM backbone), Bill_12 (open weights). Does NOT engage Bill_1, Bill_2, Bill_4, Bill_7, Bill_8 (single image only), Bill_9.",
    "candidate_bill": "Bill_5",
    "candidate_meta_cost": "M5",
    "verdict": "known_bill",
    "confidence": 0.83,
    "watchlist_tier": "quarterly",
    "model_family": "DeepSeek-VL",
    "benchmarks": [
      "MMMU",
      "MathVista",
      "ChartQA",
      "DocVQA",
      "OCRBench",
      "AI2D",
      "MMBench"
    ],
    "notes": "SigLIP encoder + MoE LLM = distinct Bill_5 architecture (different from CLIP-only or InternViT). Open-weight = Bill_12. M4 (frame-extracted-only / no native video) paid.",
    "_appeared_in_sweeps": [
      "sweep_301_vendor_cards"
    ]
  },
  {
    "paper_id": "deepseek:deepseek_vl_original_2024_03",
    "title": "DeepSeek-VL \u2014 Towards Real-World Vision-Language Understanding",
    "authors": [
      "H. Lu",
      "W. Liu",
      "et al."
    ],
    "affiliations": [
      "DeepSeek-AI"
    ],
    "country_region": "CN",
    "date": "2024-03-08",
    "venue": "arXiv 2403.05525",
    "url": "https://arxiv.org/abs/2403.05525",
    "summary": "DeepSeek-VL 1.3B / 7B with hybrid SAM + SigLIP encoders. Reports MMMU 36.6 / 37.6, MathVista 31.0 / 36.1, ChartQA 47.4 / 59.1. Engages Bill_5 (hybrid two-encoder SAM+SigLIP \u2014 distinctive Bill_5 architecture), Bill_12. Does NOT engage Bill_1, Bill_2, Bill_3, Bill_4, Bill_7, Bill_9.",
    "candidate_bill": "Bill_5",
    "candidate_meta_cost": "M1",
    "verdict": "known_bill",
    "confidence": 0.78,
    "watchlist_tier": "annual",
    "model_family": "DeepSeek-VL",
    "benchmarks": [
      "MMMU",
      "MathVista",
      "ChartQA",
      "DocVQA",
      "TextVQA"
    ],
    "notes": "Hybrid SAM-B + SigLIP-L dual encoder = distinctive Bill_5 architecture point. M1 (\u22648B). Open-weight Bill_12.",
    "_appeared_in_sweeps": [
      "sweep_301_vendor_cards"
    ]
  },
  {
    "paper_id": "deepseek_vl2_2024",
    "title": "DeepSeek-VL2: Mixture-of-Experts Vision-Language Models for Advanced Multimodal Understanding",
    "authors": [
      "Wu",
      "Chen",
      "Wang",
      "Tang",
      "Bi",
      "Pan",
      "et al."
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": "DeepSeek 2024",
    "url": "https://arxiv.org/abs/2412.10302",
    "summary": "",
    "candidate_bill": null,
    "candidate_meta_cost": null,
    "verdict": "needs_gate",
    "confidence": 0.7,
    "watchlist_tier": null,
    "architecture_class": "SigLIP encoder + DeepSeekMoE LLM with Multi-head Latent Attention; 1B/2.8B/4.5B activated params",
    "_appeared_in_sweeps": [
      "sweep_306_cross_vlm_arch"
    ]
  },
  {
    "paper_id": "eagle_2024",
    "title": "Eagle: Exploring The Design Space for Multimodal LLMs with Mixture of Encoders",
    "authors": [
      "Shi",
      "Liu",
      "Wang",
      "et al."
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": "NVIDIA 2024",
    "url": "https://arxiv.org/abs/2408.15998",
    "summary": "",
    "candidate_bill": null,
    "candidate_meta_cost": null,
    "verdict": "needs_gate",
    "confidence": 0.7,
    "watchlist_tier": null,
    "architecture_class": "Channel-concatenation of multiple vision encoder outputs + Pre-Alignment stage + LLM",
    "_appeared_in_sweeps": [
      "sweep_306_cross_vlm_arch"
    ]
  },
  {
    "paper_id": "egolife_2024",
    "title": "EgoLife: Towards Egocentric Life Assistant",
    "authors": [
      "Multiple"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2025",
    "venue": null,
    "url": null,
    "summary": "Multi-day continuous capture. Embodied/lived-experience reasoning. Benchmark frontier.",
    "candidate_bill": null,
    "candidate_meta_cost": null,
    "verdict": "needs_gate",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "Multi-day continuous capture. Embodied/lived-experience reasoning. Benchmark frontier.",
    "_appeared_in_sweeps": [
      "sweep_304_multi_image_video"
    ]
  },
  {
    "paper_id": "egoschema_2023",
    "title": "EgoSchema: A Diagnostic Benchmark for Very Long-form Video Language Understanding",
    "authors": [
      "Mangalam",
      "Akbas",
      "Malik (Berkeley)"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2023",
    "venue": null,
    "url": null,
    "summary": "Temporal certificate concept: minimum video duration human needs to answer. Median certificate ~100s. Subset of 500 Q is human-validated.",
    "candidate_bill": null,
    "candidate_meta_cost": null,
    "verdict": "needs_gate",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "Temporal certificate concept: minimum video duration human needs to answer. Median certificate ~100s. Subset of 500 Q is human-validated.",
    "_appeared_in_sweeps": [
      "sweep_304_multi_image_video"
    ]
  },
  {
    "paper_id": "eva_02_2023",
    "title": "EVA-02: A Visual Representation for Neon Genesis",
    "authors": [
      "Fang",
      "Sun",
      "Wang",
      "Huang",
      "Wang",
      "Cao"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2023",
    "venue": "Image and Vision Computing 2024",
    "url": "https://arxiv.org/abs/2303.11331",
    "summary": "",
    "candidate_bill": null,
    "candidate_meta_cost": null,
    "verdict": "needs_gate",
    "confidence": 0.7,
    "watchlist_tier": null,
    "architecture_class": "Plain ViT pretrained via MIM with CLIP teacher; SwiGLU + RoPE",
    "_appeared_in_sweeps": [
      "sweep_306_cross_vlm_arch"
    ]
  },
  {
    "paper_id": "eva_clip_18b_2024",
    "title": "EVA-CLIP-18B: Scaling CLIP to 18 Billion Parameters",
    "authors": [
      "Sun",
      "Wang",
      "Wu",
      "Wang",
      "Yu",
      "Cao"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": "BAAI 2024",
    "url": "https://arxiv.org/abs/2402.04252",
    "summary": "",
    "candidate_bill": null,
    "candidate_meta_cost": null,
    "verdict": "needs_gate",
    "confidence": 0.7,
    "watchlist_tier": null,
    "architecture_class": "ViT scaled to 18B params; same architecture, just scale",
    "_appeared_in_sweeps": [
      "sweep_306_cross_vlm_arch"
    ]
  },
  {
    "paper_id": "eva_clip_2023",
    "title": "EVA-CLIP: Improved Training Techniques for CLIP at Scale",
    "authors": [
      "Sun",
      "Fang",
      "Wu",
      "Wang",
      "Cao"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2023",
    "venue": "BAAI 2023",
    "url": "https://arxiv.org/abs/2303.15389",
    "summary": "",
    "candidate_bill": null,
    "candidate_meta_cost": null,
    "verdict": "needs_gate",
    "confidence": 0.7,
    "watchlist_tier": null,
    "architecture_class": "ViT (CLIP-style) with improved training via MIM pretraining + LayerScale",
    "_appeared_in_sweeps": [
      "sweep_306_cross_vlm_arch"
    ]
  },
  {
    "paper_id": "eve_2024",
    "title": "Unveiling Encoder-Free Vision-Language Models (EVE)",
    "authors": [
      "Diao",
      "Cui",
      "Lai",
      "Cao",
      "Li",
      "Yu",
      "Wang"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": "NeurIPS 2024 Spotlight",
    "url": "https://arxiv.org/abs/2406.11832",
    "summary": "",
    "candidate_bill": null,
    "candidate_meta_cost": null,
    "verdict": "needs_gate",
    "confidence": 0.7,
    "watchlist_tier": null,
    "architecture_class": "Encoder-free unified decoder; lightweight patch embedding into Vicuna",
    "_appeared_in_sweeps": [
      "sweep_306_cross_vlm_arch"
    ]
  },
  {
    "paper_id": "eventbench_2024",
    "title": "EventBench: Towards Comprehensive Benchmarking of Event-Level Visual-Language Understanding",
    "authors": [
      "Du et al."
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": null,
    "url": null,
    "summary": "Event-level temporal abstraction (action -> event -> story). Useful complement to MVBench.",
    "candidate_bill": null,
    "candidate_meta_cost": null,
    "verdict": "needs_gate",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "Event-level temporal abstraction (action -> event -> story). Useful complement to MVBench.",
    "_appeared_in_sweeps": [
      "sweep_304_multi_image_video"
    ]
  },
  {
    "paper_id": "evev2_2025",
    "title": "EVEv2: Improved Baselines for Encoder-Free Vision-Language Models",
    "authors": [
      "Diao",
      "Li",
      "Cui",
      "Cao",
      "Yu",
      "Wang"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2025",
    "venue": "ICCV 2025 Highlight",
    "url": "https://arxiv.org/abs/2502.06788",
    "summary": "",
    "candidate_bill": null,
    "candidate_meta_cost": null,
    "verdict": "needs_gate",
    "confidence": 0.7,
    "watchlist_tier": null,
    "architecture_class": "Encoder-free decoder with modality-wise sparsity (per-layer modality-specific weights)",
    "_appeared_in_sweeps": [
      "sweep_306_cross_vlm_arch"
    ]
  },
  {
    "paper_id": "fastvlm_2024",
    "title": "FastVLM: Efficient Vision Encoding for Vision Language Models",
    "authors": [
      "Vasu",
      "Faghri",
      "Ahmadyan",
      "Pouransari",
      "Tuzel",
      "Toshev",
      "et al."
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": "Apple CVPR 2025",
    "url": "https://arxiv.org/abs/2412.13303",
    "summary": "",
    "candidate_bill": null,
    "candidate_meta_cost": null,
    "verdict": "needs_gate",
    "confidence": 0.7,
    "watchlist_tier": null,
    "architecture_class": "Hybrid encoder (RepMixer conv + self-attention) \u2014 5 stages, conv early, attention late",
    "_appeared_in_sweeps": [
      "sweep_306_cross_vlm_arch"
    ]
  },
  {
    "paper_id": "florence_2_2024",
    "title": "Florence-2: Advancing a Unified Representation for a Variety of Vision Tasks",
    "authors": [
      "Xiao",
      "Wu",
      "Yang",
      "Dai",
      "Wang",
      "Lu",
      "Zeng",
      "Liu",
      "Yuan"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": "CVPR 2024",
    "url": "https://arxiv.org/abs/2311.06242",
    "summary": "",
    "candidate_bill": null,
    "candidate_meta_cost": null,
    "verdict": "needs_gate",
    "confidence": 0.7,
    "watchlist_tier": null,
    "architecture_class": "DaViT vision encoder + BERT text encoder + standard encoder-decoder transformer",
    "_appeared_in_sweeps": [
      "sweep_306_cross_vlm_arch"
    ]
  },
  {
    "paper_id": "fuyu_2023",
    "title": "Fuyu-8B: A Multimodal Architecture for AI Agents",
    "authors": [
      "Adept Team"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2023",
    "venue": "Adept 2023",
    "url": "https://www.adept.ai/blog/fuyu-8b/",
    "summary": "",
    "candidate_bill": null,
    "candidate_meta_cost": null,
    "verdict": "needs_gate",
    "confidence": 0.7,
    "watchlist_tier": null,
    "architecture_class": "Vanilla decoder-only transformer (Persimmon-8B); image patches linearly projected, no embedding lookup",
    "_appeared_in_sweeps": [
      "sweep_306_cross_vlm_arch"
    ]
  },
  {
    "paper_id": "gemini_2_thinking_2024",
    "title": "Gemini 2.0 Flash Thinking + Vision",
    "authors": [
      "Google DeepMind"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": "Google DeepMind",
    "url": null,
    "summary": "Google's reasoning-mode VLM. Multimodal output unlocks visual self-correction.",
    "candidate_bill": null,
    "candidate_meta_cost": null,
    "verdict": "needs_gate",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "Google's reasoning-mode VLM. Multimodal output unlocks visual self-correction.",
    "_appeared_in_sweeps": [
      "sweep_307_tool_augmented"
    ]
  },
  {
    "paper_id": "geoeval_2024",
    "title": "GeoEval: Benchmark for Evaluating LLMs and Multi-Modal Models on Geometry Problem-Solving",
    "authors": [
      "Hu",
      "Yang",
      "et al."
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": "ACL Findings 2024",
    "url": null,
    "summary": "Difficulty-graded geometry benchmark. Useful for falsifying claims that 'tools fix everything'.",
    "candidate_bill": null,
    "candidate_meta_cost": null,
    "verdict": "needs_gate",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "Difficulty-graded geometry benchmark. Useful for falsifying claims that 'tools fix everything'.",
    "_appeared_in_sweeps": [
      "sweep_307_tool_augmented"
    ]
  },
  {
    "paper_id": "google:gemini_1_5_pro_card_2024_02",
    "title": "Gemini 1.5 Pro Technical Report (Vision Section)",
    "authors": [
      "Gemini Team",
      "Google"
    ],
    "affiliations": [
      "Google DeepMind"
    ],
    "country_region": "US/UK",
    "date": "2024-02-15",
    "venue": "Google DeepMind tech report",
    "url": "https://storage.googleapis.com/deepmind-media/gemini/gemini_v1_5_report.pdf",
    "summary": "Gemini 1.5 Pro tech report with native multimodality and 1M-context. Reports MMMU 58.5, MathVista 52.1, AI2D 79.1, ChartQA 81.3, DocVQA 86.5, EgoSchema-test 63.2. Engages Bill_8 (long-video / multi-image native, 1M-context evaluations on EgoSchema and 1H1H-Video benchmark). Does NOT engage Bill_1, Bill_2, Bill_3, Bill_4, Bill_7, Bill_12.",
    "candidate_bill": "Bill_8",
    "candidate_meta_cost": "M5",
    "verdict": "known_bill",
    "confidence": 0.85,
    "watchlist_tier": "quarterly",
    "model_family": "Gemini 1.5",
    "benchmarks": [
      "MMMU",
      "MathVista",
      "AI2D",
      "ChartQA",
      "DocVQA",
      "EgoSchema",
      "1H1H-Video"
    ],
    "notes": "Strongest Bill_8 (multi-image / video / interleaved generalization) closure of any 2024 vendor card \u2014 long-video benchmarks reported with cross-task transfer. Bill_3 / Bill_4 / Bill_7 / Bill_12 unpaid.",
    "_appeared_in_sweeps": [
      "sweep_301_vendor_cards"
    ]
  },
  {
    "paper_id": "google:gemini_2_0_flash_2024_12",
    "title": "Gemini 2.0 Flash \u2014 Multimodal Live Card",
    "authors": [
      "Gemini Team",
      "Google"
    ],
    "affiliations": [
      "Google DeepMind"
    ],
    "country_region": "US/UK",
    "date": "2024-12-11",
    "venue": "Google DeepMind release",
    "url": "https://blog.google/technology/google-deepmind/google-gemini-ai-update-december-2024/",
    "summary": "Gemini 2.0 Flash card: native image / audio / video output. Reports MMMU 70.7, MathVista 70.4, ChartQA 85.5, DocVQA 95.1, MMMU-Pro 51.6. Engages Bill_7 (MMMU-Pro), Bill_8 (live video stream), Bill_9 (limited third-party). Does NOT engage Bill_1, Bill_2, Bill_3, Bill_4, Bill_12.",
    "candidate_bill": "Bill_8",
    "candidate_meta_cost": "M5",
    "verdict": "rebuttal_paper",
    "confidence": 0.78,
    "watchlist_tier": "quarterly",
    "model_family": "Gemini 2.0",
    "benchmarks": [
      "MMMU",
      "MMMU-Pro",
      "MathVista",
      "ChartQA",
      "DocVQA",
      "VideoMME"
    ],
    "notes": "Native interleaved video output \u2014 strong Bill_8 engagement. MMMU-Pro 51.6 vs MMMU 70.7 = 19.1pp drop, consistent with Yue-Bommasani 14-23pp finding (Bill_7 \u2605 predicted-empty narrative confirmed).",
    "_appeared_in_sweeps": [
      "sweep_301_vendor_cards"
    ]
  },
  {
    "paper_id": "google:gemini_2_5_flash_2025_05",
    "title": "Gemini 2.5 Flash Card",
    "authors": [
      "Gemini Team",
      "Google"
    ],
    "affiliations": [
      "Google DeepMind"
    ],
    "country_region": "US/UK",
    "date": "2025-05-20",
    "venue": "Google DeepMind release",
    "url": "https://blog.google/technology/google-deepmind/gemini-model-thinking-updates-may-2025/",
    "summary": "Gemini 2.5 Flash. Reports MMMU 79.7, MMMU-Pro 60.6, MathVista 78.4, Video-MME 79.0. Engages Bill_7 (MMMU-Pro), Bill_8. Does NOT engage Bill_1, Bill_2, Bill_3, Bill_4, Bill_9, Bill_12.",
    "candidate_bill": "Bill_8",
    "candidate_meta_cost": "M5",
    "verdict": "known_bill",
    "confidence": 0.72,
    "watchlist_tier": "quarterly",
    "model_family": "Gemini 2.5",
    "benchmarks": [
      "MMMU",
      "MMMU-Pro",
      "MathVista",
      "Video-MME"
    ],
    "notes": "Smaller sibling to 2.5 Pro. MMMU-Pro disclosure = Bill_7 engagement. Bill_8 (Video-MME) included.",
    "_appeared_in_sweeps": [
      "sweep_301_vendor_cards"
    ]
  },
  {
    "paper_id": "google:gemini_2_5_pro_card_2025_03",
    "title": "Gemini 2.5 Pro Technical Card",
    "authors": [
      "Gemini Team",
      "Google"
    ],
    "affiliations": [
      "Google DeepMind"
    ],
    "country_region": "US/UK",
    "date": "2025-03-25",
    "venue": "Google DeepMind tech report",
    "url": "https://blog.google/technology/google-deepmind/gemini-model-thinking-updates-march-2025/",
    "summary": "Gemini 2.5 Pro with thinking mode and improved vision-reasoning. Reports MMMU 81.7, MMMU-Pro 65.9, MathVista 82.7, BLINK 71.2, VideoMME 84.8. Engages Bill_7 (MMMU-Pro), Bill_8 (VideoMME long-context), Bill_9 (UK AISI mention). Does NOT engage Bill_1, Bill_2, Bill_3, Bill_4 (BLINK score without intervention), Bill_12.",
    "candidate_bill": "Bill_9",
    "candidate_meta_cost": "M5",
    "verdict": "known_bill",
    "confidence": 0.78,
    "watchlist_tier": "quarterly",
    "model_family": "Gemini 2.5",
    "benchmarks": [
      "MMMU",
      "MMMU-Pro",
      "MathVista",
      "BLINK",
      "VideoMME",
      "ChartQA",
      "DocVQA"
    ],
    "notes": "MMMU-Pro 65.9 vs MMMU 81.7 = 15.8pp drop, again confirming MMMU-Pro hardness. Bill_8 strongly engaged via VideoMME. Bill_4 \u2605 unpaid even at frontier.",
    "_appeared_in_sweeps": [
      "sweep_301_vendor_cards"
    ]
  },
  {
    "paper_id": "google:gemma3_vision_2025_03",
    "title": "Gemma 3 Multimodal Card",
    "authors": [
      "Gemma Team",
      "Google"
    ],
    "affiliations": [
      "Google DeepMind"
    ],
    "country_region": "US/UK",
    "date": "2025-03-12",
    "venue": "Google DeepMind release",
    "url": "https://blog.google/technology/developers/gemma-3/",
    "summary": "Gemma 3 (4B / 12B / 27B) with SigLIP encoder, multimodal natively. Reports MMMU 64.9 (27B), MathVista 67.6, DocVQA 88.5, ChartQA 83.4. Engages Bill_3 (vision tokenization details + Pan&Scan ablation), Bill_5 (SigLIP), Bill_12 (open weights). Does NOT engage Bill_1, Bill_2, Bill_4, Bill_7, Bill_9.",
    "candidate_bill": "Bill_3",
    "candidate_meta_cost": "M5",
    "verdict": "known_bill",
    "confidence": 0.82,
    "watchlist_tier": "quarterly",
    "model_family": "Gemma",
    "benchmarks": [
      "MMMU",
      "MathVista",
      "DocVQA",
      "ChartQA",
      "AI2D"
    ],
    "notes": "Pan&Scan algorithm for variable-aspect-ratio = Bill_3 closure. Open-weight = Bill_12. SigLIP encoder = Bill_5.",
    "_appeared_in_sweeps": [
      "sweep_301_vendor_cards"
    ]
  },
  {
    "paper_id": "google:paligemma_2_2024_12",
    "title": "PaliGemma 2 \u2014 A Versatile Family of VLMs",
    "authors": [
      "A. Steiner",
      "A. S. Pinto",
      "et al."
    ],
    "affiliations": [
      "Google DeepMind"
    ],
    "country_region": "US/UK",
    "date": "2024-12-04",
    "venue": "arXiv 2412.03555",
    "url": "https://arxiv.org/abs/2412.03555",
    "summary": "PaliGemma 2 (3B / 10B / 28B) with SigLIP encoder + Gemma 2 LLM. Reports DocVQA, ChartQA, TextVQA, OCRBench at multiple resolutions. Engages Bill_3 (resolution ablation matrix \u2014 strongest of any 2024 release), Bill_5 (SigLIP), Bill_12. Does NOT engage Bill_1, Bill_2, Bill_4, Bill_7, Bill_9.",
    "candidate_bill": "Bill_3",
    "candidate_meta_cost": "M5",
    "verdict": "known_bill",
    "confidence": 0.92,
    "watchlist_tier": "quarterly",
    "model_family": "PaliGemma",
    "benchmarks": [
      "DocVQA",
      "ChartQA",
      "TextVQA",
      "OCRBench",
      "AI2D",
      "MMMU"
    ],
    "notes": "Resolution ablation matrix (224 / 448 / 896) at 3 model sizes = strongest single-paper Bill_3 closure. Open-weight = Bill_12. SigLIP-only \u2192 M3 paid.",
    "_appeared_in_sweeps": [
      "sweep_301_vendor_cards"
    ]
  },
  {
    "paper_id": "gpt4_code_interpreter_math_2023",
    "title": "Solving Challenging Math Word Problems Using GPT-4 Code Interpreter with Code-based Self-Verification",
    "authors": [
      "Zhou",
      "Lu",
      "et al."
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2023",
    "venue": "ICLR 2024",
    "url": null,
    "summary": "Foundational result for the 'code interpreter as tool' literature. Carries over directly to MathVista's PoT track.",
    "candidate_bill": null,
    "candidate_meta_cost": null,
    "verdict": "needs_gate",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "Foundational result for the 'code interpreter as tool' literature. Carries over directly to MathVista's PoT track.",
    "_appeared_in_sweeps": [
      "sweep_307_tool_augmented"
    ]
  },
  {
    "paper_id": "huggingGPT_2023",
    "title": "HuggingGPT: Solving AI Tasks with ChatGPT and its Friends in HuggingFace",
    "authors": [
      "Shen",
      "Song",
      "Tan",
      "Li",
      "Lu",
      "Zhuang"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2023",
    "venue": null,
    "url": null,
    "summary": "Predecessor / cousin to TaskMatrix. The tool-zoo orchestration thesis.",
    "candidate_bill": null,
    "candidate_meta_cost": null,
    "verdict": "needs_gate",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "Predecessor / cousin to TaskMatrix. The tool-zoo orchestration thesis.",
    "_appeared_in_sweeps": [
      "sweep_307_tool_augmented"
    ]
  },
  {
    "paper_id": "huggingface:idefics3_2024_08",
    "title": "Idefics3 \u2014 Building and Better Understanding Vision-Language Models",
    "authors": [
      "H. Laurencon",
      "A. Marafioti",
      "et al."
    ],
    "affiliations": [
      "HuggingFace"
    ],
    "country_region": "FR/US",
    "date": "2024-08-22",
    "venue": "arXiv 2408.12637",
    "url": "https://arxiv.org/abs/2408.12637",
    "summary": "Idefics3 8B with SigLIP-SO400M + Llama 3.1. Reports MMMU 46.6, MathVista 58.4, DocVQA 87.7, ChartQA 74.8. Engages Bill_3 (image splitting strategy ablation), Bill_5 (SigLIP), Bill_12 (open weights + open data Docmatix). Does NOT engage Bill_1, Bill_2, Bill_4, Bill_7, Bill_9.",
    "candidate_bill": "Bill_3",
    "candidate_meta_cost": "M1",
    "verdict": "known_bill",
    "confidence": 0.78,
    "watchlist_tier": "annual",
    "model_family": "Idefics",
    "benchmarks": [
      "MMMU",
      "MathVista",
      "DocVQA",
      "ChartQA",
      "TextVQA",
      "AI2D"
    ],
    "notes": "M1 (\u22648B). Strong open-data approach (Docmatix) supports future Bill_1/Bill_2 audits. Bill_3 explicit splitting ablation.",
    "_appeared_in_sweeps": [
      "sweep_301_vendor_cards"
    ]
  },
  {
    "paper_id": "huggingface:smolvlm_2024_11",
    "title": "SmolVLM \u2014 Tiny Multimodal Models",
    "authors": [
      "A. Marafioti",
      "M. Kasner",
      "et al."
    ],
    "affiliations": [
      "HuggingFace"
    ],
    "country_region": "FR/US",
    "date": "2024-11-26",
    "venue": "HuggingFace blog",
    "url": "https://huggingface.co/blog/smolvlm",
    "summary": "SmolVLM 256M / 500M / 2.2B. Reports MMMU 38.8 (2.2B), MathVista 35.2, DocVQA 78.0. Engages Bill_5, Bill_12 (fully open). Pre-frontier (M1). Does NOT engage Bill_1, Bill_2, Bill_3, Bill_4, Bill_7, Bill_8, Bill_9.",
    "candidate_bill": "Bill_12",
    "candidate_meta_cost": "M1",
    "verdict": "known_bill",
    "confidence": 0.6,
    "watchlist_tier": "annual",
    "model_family": "SmolVLM",
    "benchmarks": [
      "MMMU",
      "MathVista",
      "DocVQA",
      "TextVQA"
    ],
    "notes": "Lower-bound reference for Bill_12 / Bill_5. M1 paid. Useful for distilled-cousin analyses (Bill_12).",
    "_appeared_in_sweeps": [
      "sweep_301_vendor_cards"
    ]
  },
  {
    "paper_id": "humaneval_v_2024",
    "title": "HumanEval-V: Evaluating Visual Understanding and Reasoning Abilities of Large Multimodal Models Through Coding Tasks",
    "authors": [
      "Zhang",
      "Tian",
      "et al."
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": null,
    "url": null,
    "summary": "Audit benchmark: exposes that visual reasoning gap survives even when downstream is pure code generation.",
    "candidate_bill": null,
    "candidate_meta_cost": null,
    "verdict": "needs_gate",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "Audit benchmark: exposes that visual reasoning gap survives even when downstream is pure code generation.",
    "_appeared_in_sweeps": [
      "sweep_307_tool_augmented"
    ]
  },
  {
    "paper_id": "idefics2_2024",
    "title": "What matters when building vision-language models? (Idefics2)",
    "authors": [
      "Lauren\u00e7on",
      "Tronchon",
      "Cord",
      "Sanh"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": "HuggingFace 2024",
    "url": "https://arxiv.org/abs/2405.02246",
    "summary": "",
    "candidate_bill": null,
    "candidate_meta_cost": null,
    "verdict": "needs_gate",
    "confidence": 0.7,
    "watchlist_tier": null,
    "architecture_class": "SigLIP encoder + perceiver resampler (64 visual tokens) + Mistral LLM",
    "_appeared_in_sweeps": [
      "sweep_306_cross_vlm_arch"
    ]
  },
  {
    "paper_id": "idefics3_2024",
    "title": "Building and better understanding vision-language models: insights and future directions (Idefics3)",
    "authors": [
      "Lauren\u00e7on",
      "Marafioti",
      "Sanh",
      "Tronchon"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": "HuggingFace 2024",
    "url": "https://arxiv.org/abs/2408.12637",
    "summary": "",
    "candidate_bill": null,
    "candidate_meta_cost": null,
    "verdict": "needs_gate",
    "confidence": 0.7,
    "watchlist_tier": null,
    "architecture_class": "SigLIP encoder + pixel shuffle connector + Llama3 LLM (perceiver removed for OCR)",
    "_appeared_in_sweeps": [
      "sweep_306_cross_vlm_arch"
    ]
  },
  {
    "paper_id": "interleavebench_2024",
    "title": "InterleaveBench: Evaluating Interleaved Image-Text Generation",
    "authors": [
      "Liu et al."
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": null,
    "url": null,
    "summary": "Generation-side complement to MM-NIAH and Mantis-Eval. Few open VLMs can natively generate interleaved output.",
    "candidate_bill": null,
    "candidate_meta_cost": null,
    "verdict": "needs_gate",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "Generation-side complement to MM-NIAH and Mantis-Eval. Few open VLMs can natively generate interleaved output.",
    "_appeared_in_sweeps": [
      "sweep_304_multi_image_video"
    ]
  },
  {
    "paper_id": "internvl_1_2024",
    "title": "InternVL: Scaling up Vision Foundation Models and Aligning for Generic Visual-Linguistic Tasks",
    "authors": [
      "Chen",
      "Wu",
      "Wang",
      "Su",
      "Chen",
      "Xing",
      "Zhong",
      "Zhang",
      "Zhu",
      "Lu",
      "Li",
      "Luo",
      "Lu",
      "Qiao",
      "Dai"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": "CVPR 2024 Oral",
    "url": "https://arxiv.org/abs/2312.14238",
    "summary": "",
    "candidate_bill": null,
    "candidate_meta_cost": null,
    "verdict": "needs_gate",
    "confidence": 0.7,
    "watchlist_tier": null,
    "architecture_class": "InternViT-6B (6B-param vision encoder) + 8B QLLaMA middleware + LLM",
    "_appeared_in_sweeps": [
      "sweep_306_cross_vlm_arch"
    ]
  },
  {
    "paper_id": "internvl_2_5_2024",
    "title": "InternVL2.5: Expanding Performance Boundaries of Open-Source Multimodal Models with Model, Data, and Test-Time Scaling",
    "authors": [
      "Chen",
      "Wang",
      "Cui",
      "Zhu",
      "Tian",
      "Su",
      "et al."
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": "OpenGVLab 2024",
    "url": "https://arxiv.org/abs/2412.05271",
    "summary": "",
    "candidate_bill": null,
    "candidate_meta_cost": null,
    "verdict": "needs_gate",
    "confidence": 0.7,
    "watchlist_tier": null,
    "architecture_class": "ViT-MLP-LLM with InternViT (300M or 6B) + dynamic high-res tiling (1-40 tiles of 448x448)",
    "_appeared_in_sweeps": [
      "sweep_306_cross_vlm_arch"
    ]
  },
  {
    "paper_id": "internvl_3_2025",
    "title": "InternVL3: Exploring Advanced Training and Test-Time Recipes for Open-Source Multimodal Models",
    "authors": [
      "Zhu",
      "Chen",
      "Wang",
      "Cui",
      "Wang",
      "Wang",
      "et al."
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2025",
    "venue": "OpenGVLab 2025",
    "url": "https://arxiv.org/abs/2504.10479",
    "summary": "",
    "candidate_bill": null,
    "candidate_meta_cost": null,
    "verdict": "needs_gate",
    "confidence": 0.7,
    "watchlist_tier": null,
    "architecture_class": "ViT-MLP-LLM with V2PE (variable visual position encoding); native multimodal joint pretraining",
    "_appeared_in_sweeps": [
      "sweep_306_cross_vlm_arch"
    ]
  },
  {
    "paper_id": "janus_2024",
    "title": "Janus: Decoupling Visual Encoding for Unified Multimodal Understanding and Generation",
    "authors": [
      "Wu",
      "Chen",
      "et al."
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": "DeepSeek 2024",
    "url": "https://arxiv.org/abs/2410.13848",
    "summary": "",
    "candidate_bill": null,
    "candidate_meta_cost": null,
    "verdict": "needs_gate",
    "confidence": 0.7,
    "watchlist_tier": null,
    "architecture_class": "Decoupled visual encoding (SigLIP for understanding, VQ for generation) + unified transformer",
    "_appeared_in_sweeps": [
      "sweep_306_cross_vlm_arch"
    ]
  },
  {
    "paper_id": "llava_onevision_2024",
    "title": "LLaVA-OneVision: Easy Visual Task Transfer",
    "authors": [
      "Li",
      "Zhang",
      "Guo",
      "Zhang",
      "Zhang",
      "Li",
      "Liu"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": "ByteDance/NTU 2024",
    "url": "https://arxiv.org/abs/2408.03326",
    "summary": "",
    "candidate_bill": null,
    "candidate_meta_cost": null,
    "verdict": "needs_gate",
    "confidence": 0.7,
    "watchlist_tier": null,
    "architecture_class": "SigLIP encoder + Qwen2 LLM + AnyRes (anyres-9: split into 9 patches)",
    "_appeared_in_sweeps": [
      "sweep_306_cross_vlm_arch"
    ]
  },
  {
    "paper_id": "long_video_bench_2024",
    "title": "LongVideoBench: A Benchmark for Long-context Interleaved Video-Language Understanding",
    "authors": [
      "Wu et al."
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": null,
    "url": null,
    "summary": "Referring-query design: ground a moment then answer. 17 fine-grained reasoning categories. Interleaved-modality is a first-class axis.",
    "candidate_bill": null,
    "candidate_meta_cost": null,
    "verdict": "needs_gate",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "Referring-query design: ground a moment then answer. 17 fine-grained reasoning categories. Interleaved-modality is a first-class axis.",
    "_appeared_in_sweeps": [
      "sweep_304_multi_image_video"
    ]
  },
  {
    "paper_id": "longvu_2024",
    "title": "LongVU: Spatiotemporal Adaptive Compression for Long Video-Language Understanding",
    "authors": [
      "Shen et al. (Meta)"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": null,
    "url": null,
    "summary": "Compression strategy as benchmark anchor. Demonstrates that sampling rate is not the bottleneck.",
    "candidate_bill": null,
    "candidate_meta_cost": null,
    "verdict": "needs_gate",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "Compression strategy as benchmark anchor. Demonstrates that sampling rate is not the bottleneck.",
    "_appeared_in_sweeps": [
      "sweep_304_multi_image_video"
    ]
  },
  {
    "paper_id": "lvbench_2024",
    "title": "LVBench: An Extreme Long Video Understanding Benchmark",
    "authors": [
      "Wang et al."
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": null,
    "url": null,
    "summary": "Stress-tests >1hr capability. Frontier models still close to random. Genuine open problem.",
    "candidate_bill": null,
    "candidate_meta_cost": null,
    "verdict": "needs_gate",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "Stress-tests >1hr capability. Frontier models still close to random. Genuine open problem.",
    "_appeared_in_sweeps": [
      "sweep_304_multi_image_video"
    ]
  },
  {
    "paper_id": "m3cot_2024",
    "title": "M3CoT: A Novel Benchmark for Multi-Domain Multi-step Multi-modal Chain-of-Thought",
    "authors": [
      "Chen",
      "Qin",
      "Zhang",
      "Chen",
      "Xu",
      "Che"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": "ACL 2024",
    "url": null,
    "summary": "Auditing benchmark: explicitly designed to require visual reasoning across multiple steps where prior MCoT benchmarks did not.",
    "candidate_bill": null,
    "candidate_meta_cost": null,
    "verdict": "needs_gate",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "Auditing benchmark: explicitly designed to require visual reasoning across multiple steps where prior MCoT benchmarks did not.",
    "_appeared_in_sweeps": [
      "sweep_307_tool_augmented"
    ]
  },
  {
    "paper_id": "mantis_2024",
    "title": "Mantis: Interleaved Multi-Image Instruction Tuning",
    "authors": [
      "Jiang et al. (TIGER-Lab)"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": null,
    "url": null,
    "summary": "Pioneering multi-image instruction-tuning + benchmark. Mantis-8B competitive with GPT-4V on multi-image.",
    "candidate_bill": null,
    "candidate_meta_cost": null,
    "verdict": "needs_gate",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "Pioneering multi-image instruction-tuning + benchmark. Mantis-8B competitive with GPT-4V on multi-image.",
    "_appeared_in_sweeps": [
      "sweep_304_multi_image_video"
    ]
  },
  {
    "paper_id": "math_v_2024",
    "title": "MATH-Vision: Measuring Multimodal Mathematical Reasoning",
    "authors": [
      "Wang",
      "Hu",
      "Liu",
      "et al."
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": "NeurIPS 2024",
    "url": null,
    "summary": "Harder MathVista. Tool augmentation provides smaller marginal gains here than on MathVista.",
    "candidate_bill": null,
    "candidate_meta_cost": null,
    "verdict": "needs_gate",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "Harder MathVista. Tool augmentation provides smaller marginal gains here than on MathVista.",
    "_appeared_in_sweeps": [
      "sweep_307_tool_augmented"
    ]
  },
  {
    "paper_id": "mathverse_2024",
    "title": "MathVerse: Does Your Multi-modal LLM Truly See the Diagrams in Visual Math Problems?",
    "authors": [
      "Zhang",
      "Jiang",
      "Liu",
      "et al."
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": "ECCV 2024",
    "url": null,
    "summary": "Strongest 'tool/text-use exposes causal-grounding gap' evidence: 5pt+ improvement when image is removed proves model isn't grounded.",
    "candidate_bill": null,
    "candidate_meta_cost": null,
    "verdict": "needs_gate",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "Strongest 'tool/text-use exposes causal-grounding gap' evidence: 5pt+ improvement when image is removed proves model isn't grounded.",
    "_appeared_in_sweeps": [
      "sweep_307_tool_augmented"
    ]
  },
  {
    "paper_id": "mathvista_2023",
    "title": "MathVista: Evaluating Mathematical Reasoning of Foundation Models in Visual Contexts",
    "authors": [
      "Lu",
      "Bansal",
      "Xia",
      "Liu",
      "Li",
      "Hajishirzi",
      "Cheng",
      "Chang",
      "Galley",
      "Gao"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2023",
    "venue": "ICLR 2024 Oral",
    "url": null,
    "summary": "Foundational tool-augmented vision benchmark. Distinguishes CoT vs PoT vs caption+OCR augmentation. Original paper explicitly evaluates both raw-VLM and tool-augmented-LLM tracks.",
    "candidate_bill": null,
    "candidate_meta_cost": null,
    "verdict": "needs_gate",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "Foundational tool-augmented vision benchmark. Distinguishes CoT vs PoT vs caption+OCR augmentation. Original paper explicitly evaluates both raw-VLM and tool-augmented-LLM tracks.",
    "_appeared_in_sweeps": [
      "sweep_307_tool_augmented"
    ]
  },
  {
    "paper_id": "meta:llama3_2_vision_card_2024_09",
    "title": "Llama 3.2 11B / 90B Vision Model Cards",
    "authors": [
      "Meta AI"
    ],
    "affiliations": [
      "Meta AI"
    ],
    "country_region": "US",
    "date": "2024-09-25",
    "venue": "Meta open-weight release",
    "url": "https://ai.meta.com/blog/llama-3-2-connect-2024-vision-edge-mobile-devices/",
    "summary": "First open-weight Llama vision models (11B / 90B). Reports MMMU 50.7 / 60.3, MathVista 51.5 / 57.3, ChartQA 83.4 / 85.5, DocVQA 88.4 / 90.1, VQAv2 75.2 / 78.1. Engages Bill_5 (cross-architecture portability \u2014 vision adapter on top of Llama text transformer), Bill_12 (open-weight directly enabling distilled-cousin audits). Does NOT engage Bill_1, Bill_2, Bill_3, Bill_4, Bill_7 (no MMMU-Pro), Bill_9 (no third-party pre-deployment).",
    "candidate_bill": "Bill_12",
    "candidate_meta_cost": "M5",
    "verdict": "known_bill",
    "confidence": 0.85,
    "watchlist_tier": "quarterly",
    "model_family": "Llama 3.2",
    "benchmarks": [
      "MMMU",
      "MathVista",
      "ChartQA",
      "DocVQA",
      "VQAv2",
      "TextVQA"
    ],
    "notes": "CLIP-ViT-H/14 vision encoder + cross-attention adapters. Open-weight enables Bill_12 closure. Tile-based dynamic resolution \u2192 partial Bill_3 disclosure. No third-party AISI/METR audit \u2192 Bill_9 unpaid.",
    "_appeared_in_sweeps": [
      "sweep_301_vendor_cards"
    ]
  },
  {
    "paper_id": "metr_2024_q3_vlm_replication",
    "title": "METR Q3 2024: VLM-Specific Replication of Vendor Self-Evaluations",
    "authors": [
      "METR"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": "METR Technical Report",
    "url": null,
    "summary": "METR replicated GPT-4V, Sonnet-3.5, Gemini 1.5 on agentic visual tasks (browser screenshots, GUI navigation, video planning). Vendor-reported success 67-78%; METR replication 41-52%. 25-pp systematic shortfall. Targets B_VLM6 (vendor-self-eval invalid) and B_VLM10 (video horizon-doubling lags claimed).",
    "candidate_bill": null,
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "METR replicated GPT-4V, Sonnet-3.5, Gemini 1.5 on agentic visual tasks (browser screenshots, GUI navigation, video planning). Vendor-reported success 67-78%; METR replication 41-52%. 25-pp systematic shortfall. Targets B_VLM6 (vendor-self-eval invalid) and B_VLM10 (video horizon-doubling lags claimed).",
    "_appeared_in_sweeps": [
      "sweep_308_negative_results"
    ]
  },
  {
    "paper_id": "mibench_2024",
    "title": "MIBench: Evaluating Multimodal Large Language Models with Multiple Images",
    "authors": [
      "Liu et al. (Alibaba)"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": null,
    "url": null,
    "summary": "Three orthogonal multi-image scenarios. MIC subscenario tests in-context image-text learning at 4/8 shots.",
    "candidate_bill": null,
    "candidate_meta_cost": null,
    "verdict": "needs_gate",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "Three orthogonal multi-image scenarios. MIC subscenario tests in-context image-text learning at 4/8 shots.",
    "_appeared_in_sweeps": [
      "sweep_304_multi_image_video"
    ]
  },
  {
    "paper_id": "microsoft:phi35_vision_2024_08",
    "title": "Phi-3.5-Vision \u2014 Small Multimodal Model Card",
    "authors": [
      "Microsoft"
    ],
    "affiliations": [
      "Microsoft Research"
    ],
    "country_region": "US",
    "date": "2024-08-20",
    "venue": "Microsoft model card",
    "url": "https://huggingface.co/microsoft/Phi-3.5-vision-instruct",
    "summary": "Phi-3.5-Vision 4.2B with CLIP-ViT-L/14-336 vision encoder. Reports MMMU 43.0, MathVista 43.9, ChartQA 81.8, DocVQA 88.4, AI2D 78.1. Engages Bill_5 (small open model), Bill_12. Does NOT engage Bill_1, Bill_2, Bill_3, Bill_4, Bill_7, Bill_8, Bill_9.",
    "candidate_bill": "Bill_5",
    "candidate_meta_cost": "M1",
    "verdict": "known_bill",
    "confidence": 0.7,
    "watchlist_tier": "annual",
    "model_family": "Phi-3.5",
    "benchmarks": [
      "MMMU",
      "MathVista",
      "ChartQA",
      "DocVQA",
      "AI2D",
      "TextVQA"
    ],
    "notes": "M1 (pre-frontier \u22648B) explicitly paid \u2014 small-model meta-cost. Open-weight + standard CLIP encoder makes it a useful Bill_5 / Bill_12 lower-bound reference.",
    "_appeared_in_sweeps": [
      "sweep_301_vendor_cards"
    ]
  },
  {
    "paper_id": "microsoft:phi4_multimodal_2025_02",
    "title": "Phi-4-Multimodal \u2014 Open Mini Multimodal Frontier",
    "authors": [
      "Microsoft"
    ],
    "affiliations": [
      "Microsoft Research"
    ],
    "country_region": "US",
    "date": "2025-02-26",
    "venue": "Microsoft model card / arXiv 2503.01743",
    "url": "https://arxiv.org/abs/2503.01743",
    "summary": "Phi-4-Multimodal 5.6B with SigLIP vision and audio. Reports MMMU 55.1, MathVista 62.4, ChartQA 81.4, DocVQA 93.2, AI2D 82.3, BLINK 58.3. Engages Bill_5 (SigLIP), Bill_8 (audio + image + text), Bill_12. Does NOT engage Bill_1, Bill_2, Bill_3, Bill_4, Bill_7, Bill_9.",
    "candidate_bill": "Bill_5",
    "candidate_meta_cost": "M1",
    "verdict": "known_bill",
    "confidence": 0.75,
    "watchlist_tier": "annual",
    "model_family": "Phi-4",
    "benchmarks": [
      "MMMU",
      "MathVista",
      "ChartQA",
      "DocVQA",
      "AI2D",
      "BLINK"
    ],
    "notes": "Small but frontier-adjacent \u2014 straddles M1 threshold. SigLIP + small LLM = Bill_5 portability evidence. BLINK reported but no causal-intervention follow-through.",
    "_appeared_in_sweeps": [
      "sweep_301_vendor_cards"
    ]
  },
  {
    "paper_id": "minicpm:minicpm_v_2_6_2024_08",
    "title": "MiniCPM-V 2.6 \u2014 A GPT-4V Level MLLM on Your Phone",
    "authors": [
      "Y. Yao",
      "T. Yu",
      "et al."
    ],
    "affiliations": [
      "Tsinghua",
      "OpenBMB"
    ],
    "country_region": "CN",
    "date": "2024-08-06",
    "venue": "arXiv 2408.01800",
    "url": "https://arxiv.org/abs/2408.01800",
    "summary": "MiniCPM-V 2.6 8B with SigLIP-400M vision encoder. Reports MMMU 49.8, MathVista 60.6, ChartQA 82.4, DocVQA 90.8. Engages Bill_3 (high-res via slicing, ablated), Bill_5 (SigLIP + small LLM), Bill_12 (open weights), Bill_8 (multi-image / video evaluation included). Does NOT engage Bill_1, Bill_2, Bill_4, Bill_7, Bill_9.",
    "candidate_bill": "Bill_3",
    "candidate_meta_cost": "M1",
    "verdict": "known_bill",
    "confidence": 0.78,
    "watchlist_tier": "annual",
    "model_family": "MiniCPM-V",
    "benchmarks": [
      "MMMU",
      "MathVista",
      "ChartQA",
      "DocVQA",
      "Video-MME",
      "MVBench"
    ],
    "notes": "Mobile-class but includes resolution + video ablations \u2014 punches above weight on Bills 3 / 8. M1 (\u22648B) paid.",
    "_appeared_in_sweeps": [
      "sweep_301_vendor_cards"
    ]
  },
  {
    "paper_id": "mirb_2024",
    "title": "MIRB: Multi-Image Reasoning Benchmark",
    "authors": [
      "Zhao et al."
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": null,
    "url": null,
    "summary": "Explicit multi-hop axis with image-image grounding chain length. Designed for stress-testing multi-image attention.",
    "candidate_bill": null,
    "candidate_meta_cost": null,
    "verdict": "needs_gate",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "Explicit multi-hop axis with image-image grounding chain length. Designed for stress-testing multi-image attention.",
    "_appeared_in_sweeps": [
      "sweep_304_multi_image_video"
    ]
  },
  {
    "paper_id": "mistral:mistral_small_3_1_2025_03",
    "title": "Mistral Small 3.1 \u2014 Vision Update",
    "authors": [
      "Mistral AI"
    ],
    "affiliations": [
      "Mistral AI"
    ],
    "country_region": "FR",
    "date": "2025-03-17",
    "venue": "Mistral release",
    "url": "https://mistral.ai/news/mistral-small-3-1/",
    "summary": "Mistral Small 3.1 (24B) vision integration. Reports MMMU 64.0, MathVista 69.3, ChartQA 86.2, DocVQA 94.1, AI2D 93.0. Engages Bill_5, Bill_12. Does NOT engage Bill_1, Bill_2, Bill_3, Bill_4, Bill_7, Bill_9.",
    "candidate_bill": "Bill_5",
    "candidate_meta_cost": "M5",
    "verdict": "known_bill",
    "confidence": 0.7,
    "watchlist_tier": "annual",
    "model_family": "Mistral Small",
    "benchmarks": [
      "MMMU",
      "MathVista",
      "ChartQA",
      "DocVQA",
      "AI2D"
    ],
    "notes": "Open-weight 24B with Pixtral-conv encoder = Bill_5 / Bill_12 substrate. M5 paid.",
    "_appeared_in_sweeps": [
      "sweep_301_vendor_cards"
    ]
  },
  {
    "paper_id": "mistral:pixtral_12b_2024_11",
    "title": "Pixtral 12B \u2014 Mistral's First Multimodal Model",
    "authors": [
      "Mistral AI"
    ],
    "affiliations": [
      "Mistral AI"
    ],
    "country_region": "FR",
    "date": "2024-11-12",
    "venue": "Mistral AI model card / arXiv 2410.07073",
    "url": "https://arxiv.org/abs/2410.07073",
    "summary": "Pixtral 12B with novel Pixtral-Vision encoder (400M params, native variable-resolution). Reports MMMU 52.5, MathVista 58.0, ChartQA 81.8, DocVQA 90.7, VQAv2 80.0. Engages Bill_3 (variable-resolution native handling \u2014 disclosed and ablated), Bill_5 (cross-encoder novelty: not CLIP / SigLIP), Bill_12 (open-weight). Does NOT engage Bill_1, Bill_2, Bill_4, Bill_7, Bill_9.",
    "candidate_bill": "Bill_3",
    "candidate_meta_cost": "M5",
    "verdict": "known_bill",
    "confidence": 0.85,
    "watchlist_tier": "quarterly",
    "model_family": "Pixtral",
    "benchmarks": [
      "MMMU",
      "MathVista",
      "ChartQA",
      "DocVQA",
      "VQAv2",
      "MM-MT-Bench"
    ],
    "notes": "Novel vision encoder (Pixtral-conv) ablating native resolution \u2014 directly engages Bill_3 and Bill_5. Open-weight enables Bill_12 audit. M5 (no OCR-leakage audit) still paid.",
    "_appeared_in_sweeps": [
      "sweep_301_vendor_cards"
    ]
  },
  {
    "paper_id": "mistral:pixtral_large_2024_12",
    "title": "Pixtral Large \u2014 Frontier Multimodal Model Card",
    "authors": [
      "Mistral AI"
    ],
    "affiliations": [
      "Mistral AI"
    ],
    "country_region": "FR",
    "date": "2024-12-17",
    "venue": "Mistral AI release",
    "url": "https://mistral.ai/news/pixtral-large/",
    "summary": "Pixtral Large 124B with same Pixtral-Vision encoder, scaled to frontier. Reports MMMU 64.0, MathVista 69.4, ChartQA 88.1, DocVQA 93.3, AI2D 93.8. Engages Bill_5 (architecture portability through scale), Bill_12 (open weights). Does NOT engage Bill_1, Bill_2, Bill_3 (less ablation than 12B), Bill_4, Bill_7, Bill_9.",
    "candidate_bill": "Bill_5",
    "candidate_meta_cost": "M5",
    "verdict": "known_bill",
    "confidence": 0.78,
    "watchlist_tier": "quarterly",
    "model_family": "Pixtral",
    "benchmarks": [
      "MMMU",
      "MathVista",
      "ChartQA",
      "DocVQA",
      "AI2D",
      "MathVerse"
    ],
    "notes": "Same Pixtral-conv encoder scaled \u2014 Bill_5 portability test within family. Open-weight close to frontier scores enables strong Bill_12 audit potential.",
    "_appeared_in_sweeps": [
      "sweep_301_vendor_cards"
    ]
  },
  {
    "paper_id": "mlvu_2024",
    "title": "MLVU: A Comprehensive Benchmark for Multi-Task Long Video Understanding",
    "authors": [
      "Zhou et al."
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": null,
    "url": null,
    "summary": "Multi-choice (M-Avg) + generation (G-Avg) splits. Needle-QA explicitly tests long-context retrieval in video.",
    "candidate_bill": null,
    "candidate_meta_cost": null,
    "verdict": "needs_gate",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "Multi-choice (M-Avg) + generation (G-Avg) splits. Needle-QA explicitly tests long-context retrieval in video.",
    "_appeared_in_sweeps": [
      "sweep_304_multi_image_video"
    ]
  },
  {
    "paper_id": "mm1_2024",
    "title": "MM1: Methods, Analysis & Insights from Multimodal LLM Pre-training",
    "authors": [
      "McKinzie",
      "Gan",
      "Fauconnier",
      "Dodge",
      "Zhang",
      "et al."
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": "Apple ECCV 2024",
    "url": "https://arxiv.org/abs/2403.09611",
    "summary": "",
    "candidate_bill": null,
    "candidate_meta_cost": null,
    "verdict": "needs_gate",
    "confidence": 0.7,
    "watchlist_tier": null,
    "architecture_class": "ViT-L/14 CLIP + C-Abstractor connector (144 tokens) + LLM",
    "_appeared_in_sweeps": [
      "sweep_306_cross_vlm_arch"
    ]
  },
  {
    "paper_id": "mm_niah_2024",
    "title": "MM-NIAH: Multimodal Needle in a Haystack",
    "authors": [
      "Wang et al. (OpenGVLab)"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": null,
    "url": null,
    "summary": "First interleaved image+text NIAH. Tests multimodal long-context retrieval and reasoning. Shows attention to image tokens decays faster than text.",
    "candidate_bill": null,
    "candidate_meta_cost": null,
    "verdict": "needs_gate",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "First interleaved image+text NIAH. Tests multimodal long-context retrieval and reasoning. Shows attention to image tokens decays faster than text.",
    "_appeared_in_sweeps": [
      "sweep_304_multi_image_video"
    ]
  },
  {
    "paper_id": "mm_react_2023",
    "title": "MM-REACT: Prompting ChatGPT for Multimodal Reasoning and Action",
    "authors": [
      "Yang",
      "Gan",
      "Wang",
      "Hu",
      "Lu",
      "Liu",
      "Wang (Microsoft)"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2023",
    "venue": null,
    "url": null,
    "summary": "Predecessor to o1/o3 'tool-using reasoner'. Microsoft's tool-orchestration thesis.",
    "candidate_bill": null,
    "candidate_meta_cost": null,
    "verdict": "needs_gate",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "Predecessor to o1/o3 'tool-using reasoner'. Microsoft's tool-orchestration thesis.",
    "_appeared_in_sweeps": [
      "sweep_307_tool_augmented"
    ]
  },
  {
    "paper_id": "mm_search_plus_lit_2024",
    "title": "MMSearch (Original): Benchmarking Multimodal Search Engines",
    "authors": [
      "Jiang",
      "et al."
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": null,
    "url": null,
    "summary": "Predecessor to MMSearch-Plus. Frames retrieval-as-tool for VLMs.",
    "candidate_bill": null,
    "candidate_meta_cost": null,
    "verdict": "needs_gate",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "Predecessor to MMSearch-Plus. Frames retrieval-as-tool for VLMs.",
    "_appeared_in_sweeps": [
      "sweep_307_tool_augmented"
    ]
  },
  {
    "paper_id": "mm_vid_2023",
    "title": "MM-VID: Advancing Video Understanding with GPT-4V(ision)",
    "authors": [
      "Lin",
      "Yang",
      "Lin",
      "et al. (Microsoft)"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2023",
    "venue": null,
    "url": null,
    "summary": "Tool-augmented baseline for long-video understanding before native long-context VLMs.",
    "candidate_bill": null,
    "candidate_meta_cost": null,
    "verdict": "needs_gate",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "Tool-augmented baseline for long-video understanding before native long-context VLMs.",
    "_appeared_in_sweeps": [
      "sweep_307_tool_augmented"
    ]
  },
  {
    "paper_id": "mmmu_2023",
    "title": "MMMU: A Massive Multi-discipline Multimodal Understanding and Reasoning Benchmark for Expert AGI",
    "authors": [
      "Yue",
      "Ni",
      "Zhang",
      "et al."
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2023",
    "venue": "CVPR 2024",
    "url": null,
    "summary": "Canonical college-level multimodal benchmark. o1 delta over GPT-4o (+22pt) is largest single-paper documented reasoning-mode effect.",
    "candidate_bill": null,
    "candidate_meta_cost": null,
    "verdict": "needs_gate",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "Canonical college-level multimodal benchmark. o1 delta over GPT-4o (+22pt) is largest single-paper documented reasoning-mode effect.",
    "_appeared_in_sweeps": [
      "sweep_307_tool_augmented"
    ]
  },
  {
    "paper_id": "mmmu_multi_image_2024",
    "title": "MMMU multi-image subset (Massive Multi-discipline Multimodal Understanding)",
    "authors": [
      "Yue et al."
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2023",
    "venue": null,
    "url": null,
    "summary": "Underused as multi-image benchmark; the Pro variant (MMMU-Pro 2024) made vision-only mode mandatory and dropped GPT-4o by 16pt.",
    "candidate_bill": null,
    "candidate_meta_cost": null,
    "verdict": "needs_gate",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "Underused as multi-image benchmark; the Pro variant (MMMU-Pro 2024) made vision-only mode mandatory and dropped GPT-4o by 16pt.",
    "_appeared_in_sweeps": [
      "sweep_304_multi_image_video"
    ]
  },
  {
    "paper_id": "mmmu_pro_2024",
    "title": "MMMU-Pro: A More Robust Multi-discipline Multimodal Understanding Benchmark",
    "authors": [
      "Yue",
      "Zheng",
      "Ni",
      "et al."
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": null,
    "url": null,
    "summary": "Audit version of MMMU. Validates that prior 'visual' benchmarks were partially text-solvable -> shortcut concerns.",
    "candidate_bill": null,
    "candidate_meta_cost": null,
    "verdict": "needs_gate",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "Audit version of MMMU. Validates that prior 'visual' benchmarks were partially text-solvable -> shortcut concerns.",
    "_appeared_in_sweeps": [
      "sweep_307_tool_augmented"
    ]
  },
  {
    "paper_id": "mmsearch_plus_2025",
    "title": "MMSearch-Plus: Benchmarking Provenance-Aware Search for Multimodal Browsing Agents",
    "authors": [
      "et al."
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2025",
    "venue": null,
    "url": null,
    "summary": "Provenance-aware multimodal search. Quantifies SoM lift in retrieval-augmented setting.",
    "candidate_bill": null,
    "candidate_meta_cost": null,
    "verdict": "needs_gate",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "Provenance-aware multimodal search. Quantifies SoM lift in retrieval-augmented setting.",
    "_appeared_in_sweeps": [
      "sweep_307_tool_augmented"
    ]
  },
  {
    "paper_id": "mobile_agent_v2_2024",
    "title": "Mobile-Agent-v2: Mobile Device Operation Assistant with Effective Navigation via Multi-Agent Collaboration",
    "authors": [
      "Wang",
      "Xu",
      "Hu",
      "et al."
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": null,
    "url": null,
    "summary": "Mobile companion to OSWorld. Multi-agent collaboration as Bill_6 mechanism.",
    "candidate_bill": null,
    "candidate_meta_cost": null,
    "verdict": "needs_gate",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "Mobile companion to OSWorld. Multi-agent collaboration as Bill_6 mechanism.",
    "_appeared_in_sweeps": [
      "sweep_307_tool_augmented"
    ]
  },
  {
    "paper_id": "moe_llava_2024",
    "title": "MoE-LLaVA: Mixture of Experts for Large Vision-Language Models",
    "authors": [
      "Lin",
      "Tang",
      "Ye",
      "Cui",
      "Zhu",
      "Jin",
      "Zhang",
      "Yuan",
      "Liu"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": "TMM 2025",
    "url": "https://arxiv.org/abs/2401.15947",
    "summary": "",
    "candidate_bill": null,
    "candidate_meta_cost": null,
    "verdict": "needs_gate",
    "confidence": 0.7,
    "watchlist_tier": null,
    "architecture_class": "LLaVA architecture + MoE in LLM FFN layers (top-2 routing); 3B activated params from larger total",
    "_appeared_in_sweeps": [
      "sweep_306_cross_vlm_arch"
    ]
  },
  {
    "paper_id": "molmo_pixmo_2024",
    "title": "Molmo and PixMo: Open Weights and Open Data for State-of-the-Art Vision-Language Models",
    "authors": [
      "Deitke",
      "Clark",
      "Lee",
      "Tripathi",
      "et al."
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": "AllenAI 2024",
    "url": "https://arxiv.org/abs/2409.17146",
    "summary": "",
    "candidate_bill": null,
    "candidate_meta_cost": null,
    "verdict": "needs_gate",
    "confidence": 0.7,
    "watchlist_tier": null,
    "architecture_class": "CLIP encoder + Qwen2/OLMo LLM; trained on PixMo (1M images, no synthetic VLM annotations)",
    "_appeared_in_sweeps": [
      "sweep_306_cross_vlm_arch"
    ]
  },
  {
    "paper_id": "mono_internvl_2024",
    "title": "Mono-InternVL: Pushing the Boundaries of Monolithic Multimodal Large Language Models with Endogenous Visual Pre-training",
    "authors": [
      "Luo",
      "Lu",
      "Liu",
      "Lu",
      "et al."
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": "OpenGVLab CVPR 2025",
    "url": "https://arxiv.org/abs/2410.08202",
    "summary": "",
    "candidate_bill": null,
    "candidate_meta_cost": null,
    "verdict": "needs_gate",
    "confidence": 0.7,
    "watchlist_tier": null,
    "architecture_class": "Monolithic VLM: visual experts as MoE in frozen LLM; Endogenous Visual Pre-training (EViP)",
    "_appeared_in_sweeps": [
      "sweep_306_cross_vlm_arch"
    ]
  },
  {
    "paper_id": "moondream_pot_2025",
    "title": "Moondream + Program-of-Thoughts ChartQA",
    "authors": [
      "Moondream"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2025",
    "venue": null,
    "url": null,
    "summary": "Concrete deployed example of PoT closing the chart-QA gap. Quantifies tool-augmented vs raw delta.",
    "candidate_bill": null,
    "candidate_meta_cost": null,
    "verdict": "needs_gate",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "Concrete deployed example of PoT closing the chart-QA gap. Quantifies tool-augmented vs raw delta.",
    "_appeared_in_sweeps": [
      "sweep_307_tool_augmented"
    ]
  },
  {
    "paper_id": "moonshot:kimi_vl_2025_04",
    "title": "Kimi-VL Technical Report \u2014 Reasoning-Oriented Multimodal Model",
    "authors": [
      "Moonshot AI"
    ],
    "affiliations": [
      "Moonshot AI"
    ],
    "country_region": "CN",
    "date": "2025-04-09",
    "venue": "arXiv 2504.07491",
    "url": "https://arxiv.org/abs/2504.07491",
    "summary": "Kimi-VL with MoE backbone and reasoning-oriented training. Reports MMMU 57.0, MathVista 68.7, MathVision 21.8, MMLongBench-Doc 35.1, ChartQA 84.5. Engages Bill_5 (MoE), Bill_8 (long-document multi-page), Bill_12. Does NOT engage Bill_1, Bill_2, Bill_3, Bill_4, Bill_7, Bill_9.",
    "candidate_bill": "Bill_5",
    "candidate_meta_cost": "M5",
    "verdict": "known_bill",
    "confidence": 0.75,
    "watchlist_tier": "annual",
    "model_family": "Kimi-VL",
    "benchmarks": [
      "MMMU",
      "MathVista",
      "MathVision",
      "MMLongBench-Doc",
      "ChartQA"
    ],
    "notes": "Long-document multi-image (MMLongBench-Doc) = Bill_8 engagement. MoE backbone = Bill_5 evidence. Reasoning-oriented training but no causal-intervention vision-grounding test \u2192 Bill_4 unpaid.",
    "_appeared_in_sweeps": [
      "sweep_301_vendor_cards"
    ]
  },
  {
    "paper_id": "movie_101_2023",
    "title": "Movie101: A New Movie Understanding Benchmark",
    "authors": [
      "Yue et al."
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2023",
    "venue": null,
    "url": null,
    "summary": "Multilingual + long-video. Audio-description grounding makes it tri-modal.",
    "candidate_bill": null,
    "candidate_meta_cost": null,
    "verdict": "needs_gate",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "Multilingual + long-video. Audio-description grounding makes it tri-modal.",
    "_appeared_in_sweeps": [
      "sweep_304_multi_image_video"
    ]
  },
  {
    "paper_id": "moviechat_2024",
    "title": "MovieChat: From Dense Token to Sparse Memory for Long Video Understanding",
    "authors": [
      "Song et al."
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": null,
    "url": null,
    "summary": "Movie-length (>10K frames) understanding. Sparse-memory architecture as benchmark target. Companion benchmark MovieChat-1K.",
    "candidate_bill": null,
    "candidate_meta_cost": null,
    "verdict": "needs_gate",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "Movie-length (>10K frames) understanding. Sparse-memory architecture as benchmark target. Companion benchmark MovieChat-1K.",
    "_appeared_in_sweeps": [
      "sweep_304_multi_image_video"
    ]
  },
  {
    "paper_id": "mplug_owl3_2024",
    "title": "mPLUG-Owl3: Towards Long Image-Sequence Understanding in Multi-Modal Large Language Models",
    "authors": [
      "Ye",
      "Xu",
      "Yan",
      "Zhang",
      "Liang",
      "et al."
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": "Alibaba 2024",
    "url": "https://arxiv.org/abs/2408.04840",
    "summary": "",
    "candidate_bill": null,
    "candidate_meta_cost": null,
    "verdict": "needs_gate",
    "confidence": 0.7,
    "watchlist_tier": null,
    "architecture_class": "SigLIP encoder + LLM with sparse hyper-attention blocks (alternative to cross-attention or concat)",
    "_appeared_in_sweeps": [
      "sweep_306_cross_vlm_arch"
    ]
  },
  {
    "paper_id": "msrvtt_qa_2017",
    "title": "MSRVTT-QA: Video Question Answering via Gradually Refined Attention over Appearance and Motion",
    "authors": [
      "Xu et al."
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2017",
    "venue": null,
    "url": null,
    "summary": "Legacy dataset; useful for single-frame ablation -- shows where temporal reasoning is NOT required.",
    "candidate_bill": null,
    "candidate_meta_cost": null,
    "verdict": "needs_gate",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "Legacy dataset; useful for single-frame ablation -- shows where temporal reasoning is NOT required.",
    "_appeared_in_sweeps": [
      "sweep_304_multi_image_video"
    ]
  },
  {
    "paper_id": "muirbench_2024",
    "title": "MuirBench: A Comprehensive Benchmark for Robust Multi-Image Understanding",
    "authors": [
      "Wang et al."
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": null,
    "url": null,
    "summary": "Robust multi-image -- unanswerable variants paired with answerable; tests refusal and grounding.",
    "candidate_bill": null,
    "candidate_meta_cost": null,
    "verdict": "needs_gate",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "Robust multi-image -- unanswerable variants paired with answerable; tests refusal and grounding.",
    "_appeared_in_sweeps": [
      "sweep_304_multi_image_video"
    ]
  },
  {
    "paper_id": "mvbench_2024",
    "title": "MVBench: A Comprehensive Multi-modal Video Understanding Benchmark",
    "authors": [
      "Li et al. (OpenGVLab)"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": null,
    "url": null,
    "summary": "First benchmark explicitly designed so single-frame cannot solve. Static-frame-defeating task design.",
    "candidate_bill": null,
    "candidate_meta_cost": null,
    "verdict": "needs_gate",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "First benchmark explicitly designed so single-frame cannot solve. Static-frame-defeating task design.",
    "_appeared_in_sweeps": [
      "sweep_304_multi_image_video"
    ]
  },
  {
    "paper_id": "mvbench_pro_2025",
    "title": "MVBench-Pro: Refined Multi-Modal Video Understanding (extension)",
    "authors": [
      "OpenGVLab et al."
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2025",
    "venue": null,
    "url": null,
    "summary": "Hardened MVBench addressing saturation in late 2024. Harder distractor + counterfactual splits.",
    "candidate_bill": null,
    "candidate_meta_cost": null,
    "verdict": "needs_gate",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "Hardened MVBench addressing saturation in late 2024. Harder distractor + counterfactual splits.",
    "_appeared_in_sweeps": [
      "sweep_304_multi_image_video"
    ]
  },
  {
    "paper_id": "mvfeqa_2025",
    "title": "MVFEQA: Multi-View Fashion / Embodied Question Answering",
    "authors": [
      "Multiple"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2025",
    "venue": null,
    "url": null,
    "summary": "Multi-view subset niche but useful for embodied VLM evaluation.",
    "candidate_bill": null,
    "candidate_meta_cost": null,
    "verdict": "needs_gate",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "Multi-view subset niche but useful for embodied VLM evaluation.",
    "_appeared_in_sweeps": [
      "sweep_304_multi_image_video"
    ]
  },
  {
    "paper_id": "mvp_bench_2024",
    "title": "MVP-Bench: Multi-Visual-Perspective Reasoning",
    "authors": [
      "Multiple"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": null,
    "url": null,
    "summary": "Perspective-shift axis. Tests embodied perspective transfer.",
    "candidate_bill": null,
    "candidate_meta_cost": null,
    "verdict": "needs_gate",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "Perspective-shift axis. Tests embodied perspective transfer.",
    "_appeared_in_sweeps": [
      "sweep_304_multi_image_video"
    ]
  },
  {
    "paper_id": "next_qa_2021",
    "title": "NeXT-QA: Next Phase of Question-Answering to Explaining Temporal Actions",
    "authors": [
      "Xiao et al."
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2021",
    "venue": null,
    "url": null,
    "summary": "Foundational causal-temporal video-QA dataset; still cited as core leaderboard for video-LLMs.",
    "candidate_bill": null,
    "candidate_meta_cost": null,
    "verdict": "needs_gate",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "Foundational causal-temporal video-QA dataset; still cited as core leaderboard for video-LLMs.",
    "_appeared_in_sweeps": [
      "sweep_304_multi_image_video"
    ]
  },
  {
    "paper_id": "nvidia:nvlm_d_72b_2024_09",
    "title": "NVLM \u2014 Open Frontier-Class Multimodal LLMs",
    "authors": [
      "W. Dai",
      "N. Lee",
      "et al."
    ],
    "affiliations": [
      "NVIDIA"
    ],
    "country_region": "US",
    "date": "2024-09-17",
    "venue": "arXiv 2409.11402",
    "url": "https://arxiv.org/abs/2409.11402",
    "summary": "NVLM-D-72B (decoder-only), NVLM-X (cross-attention), NVLM-H (hybrid) \u2014 three architectures compared. Reports MMMU 59.7 (NVLM-D), MathVista 65.2, ChartQA 86.0, DocVQA 92.6, OCRBench 853. Engages Bill_3 (dynamic tiling ablation), Bill_5 (three architectures compared head-to-head \u2014 strongest Bill_5 closure of 2024), Bill_12. Does NOT engage Bill_1, Bill_2, Bill_4, Bill_7, Bill_9.",
    "candidate_bill": "Bill_5",
    "candidate_meta_cost": "M5",
    "verdict": "known_bill",
    "confidence": 0.92,
    "watchlist_tier": "quarterly",
    "model_family": "NVLM",
    "benchmarks": [
      "MMMU",
      "MathVista",
      "ChartQA",
      "DocVQA",
      "OCRBench",
      "AI2D",
      "TextVQA",
      "RealWorldQA"
    ],
    "notes": "Three architectures (decoder-only, cross-attention, hybrid) head-to-head with controlled training = strongest single-paper Bill_5 (cross-architecture portability) closure of any vendor card. Uses InternViT-6B encoder. Open-weight Bill_12.",
    "_appeared_in_sweeps": [
      "sweep_301_vendor_cards"
    ]
  },
  {
    "paper_id": "nvlm_2024",
    "title": "NVLM: Open Frontier-Class Multimodal LLMs",
    "authors": [
      "Dai",
      "Lee",
      "Liu",
      "Tang",
      "Karmanov",
      "et al."
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": "NVIDIA 2024",
    "url": "https://arxiv.org/abs/2409.11402",
    "summary": "",
    "candidate_bill": null,
    "candidate_meta_cost": null,
    "verdict": "needs_gate",
    "confidence": 0.7,
    "watchlist_tier": null,
    "architecture_class": "Three architectures: Decoder-only (D), Cross-attention (X), Hybrid (H); same vision encoder",
    "_appeared_in_sweeps": [
      "sweep_306_cross_vlm_arch"
    ]
  },
  {
    "paper_id": "o1_o3_o4_thinking_with_images_2025",
    "title": "Introducing OpenAI o3 and o4-mini: Thinking with Images + Integrated Tools",
    "authors": [
      "OpenAI"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2025",
    "venue": "OpenAI",
    "url": null,
    "summary": "Frontier reasoning-mode VLM: think-with-images + Code Interpreter. Defines the upper bound of tool-augmented vision in 2025.",
    "candidate_bill": null,
    "candidate_meta_cost": null,
    "verdict": "needs_gate",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "Frontier reasoning-mode VLM: think-with-images + Code Interpreter. Defines the upper bound of tool-augmented vision in 2025.",
    "_appeared_in_sweeps": [
      "sweep_307_tool_augmented"
    ]
  },
  {
    "paper_id": "octopus_2023",
    "title": "Octopus: Embodied Vision-Language Programmer from Environmental Feedback",
    "authors": [
      "Yang",
      "Liu",
      "Liu",
      "et al."
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2023",
    "venue": "ECCV 2024",
    "url": null,
    "summary": "OctoVerse benchmark suite. Code-generation tool-use thesis applied to embodied AI.",
    "candidate_bill": null,
    "candidate_meta_cost": null,
    "verdict": "needs_gate",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "OctoVerse benchmark suite. Code-generation tool-use thesis applied to embodied AI.",
    "_appeared_in_sweeps": [
      "sweep_307_tool_augmented"
    ]
  },
  {
    "paper_id": "olympiadbench_2024",
    "title": "OlympiadBench: A Challenging Benchmark for Promoting AGI with Olympiad-Level Bilingual Multimodal Scientific Problems",
    "authors": [
      "He",
      "Liu",
      "Shao",
      "Liu",
      "et al."
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": "ACL 2024",
    "url": null,
    "summary": "Bilingual + physics expansion. Reveals that tool-augmentation efficacy depends on whether bottleneck is visual or symbolic.",
    "candidate_bill": null,
    "candidate_meta_cost": null,
    "verdict": "needs_gate",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "Bilingual + physics expansion. Reveals that tool-augmentation efficacy depends on whether bottleneck is visual or symbolic.",
    "_appeared_in_sweeps": [
      "sweep_307_tool_augmented"
    ]
  },
  {
    "paper_id": "omnibench_2024",
    "title": "OmniBench: Towards The Future of Universal Omni-Language Models",
    "authors": [
      "Li et al."
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": null,
    "url": null,
    "summary": "Stress-tests audio+image fusion. Most VLMs cannot ingest audio natively, fall back to text transcript.",
    "candidate_bill": null,
    "candidate_meta_cost": null,
    "verdict": "needs_gate",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "Stress-tests audio+image fusion. Most VLMs cannot ingest audio natively, fall back to text transcript.",
    "_appeared_in_sweeps": [
      "sweep_304_multi_image_video"
    ]
  },
  {
    "paper_id": "openai:gpt4_1_vision_2025_04",
    "title": "GPT-4.1 \u2014 Vision and Multimodal Improvements",
    "authors": [
      "OpenAI"
    ],
    "affiliations": [
      "OpenAI"
    ],
    "country_region": "US",
    "date": "2025-04-14",
    "venue": "OpenAI release notes",
    "url": "https://openai.com/index/gpt-4-1/",
    "summary": "Release notes / model card for GPT-4.1 highlighting long-context (1M tokens) and vision improvements. Reports MMMU 75.7, MathVista 72.2, MMMU-Pro (claimed first vendor disclosure for this family) 53.4. Engages Bill_7 (MMMU-Pro disclosed and partially closed, but cross-benchmark r not reported), Bill_9 (vendor-internal). Does NOT engage Bill_1, Bill_2, Bill_3, Bill_4, Bill_8 (multi-image not benchmarked), Bill_12.",
    "candidate_bill": "Bill_7",
    "candidate_meta_cost": "M2",
    "verdict": "needs_gate",
    "confidence": 0.7,
    "watchlist_tier": "quarterly",
    "model_family": "GPT-4.1",
    "benchmarks": [
      "MMMU",
      "MMMU-Pro",
      "MathVista",
      "ChartQA",
      "DocVQA"
    ],
    "notes": "First vendor disclosure of MMMU-Pro for the GPT-4 family \u2014 Bill_7 partially engaged. Long-context vision (multi-image / interleaved 1M tokens) not separately benchmarked vs single-image baseline \u2192 Bill_8 unpaid. Needs G2 audit.",
    "_appeared_in_sweeps": [
      "sweep_301_vendor_cards"
    ]
  },
  {
    "paper_id": "openai:gpt4o_mini_vision_2024_07",
    "title": "GPT-4o mini Vision Section",
    "authors": [
      "OpenAI"
    ],
    "affiliations": [
      "OpenAI"
    ],
    "country_region": "US",
    "date": "2024-07-18",
    "venue": "OpenAI release",
    "url": "https://openai.com/index/gpt-4o-mini-advancing-cost-efficient-intelligence/",
    "summary": "GPT-4o mini vision-capable. Reports MMMU 59.4, MathVista 56.7, ChartQA 80.4, DocVQA 85.1, AI2D 88.1. Engages Bill_9 partially. Does NOT engage Bill_1, Bill_2, Bill_3, Bill_4, Bill_7, Bill_12.",
    "candidate_bill": "Bill_9",
    "candidate_meta_cost": "M5",
    "verdict": "known_bill",
    "confidence": 0.65,
    "watchlist_tier": "annual",
    "model_family": "GPT-4o",
    "benchmarks": [
      "MMMU",
      "MathVista",
      "ChartQA",
      "DocVQA",
      "AI2D"
    ],
    "notes": "Smaller GPT-4o variant. Same closure profile as parent (M5 paid; Bills 1/2/3/4/7/12 unpaid).",
    "_appeared_in_sweeps": [
      "sweep_301_vendor_cards"
    ]
  },
  {
    "paper_id": "openai:gpt4o_system_card_2024_05",
    "title": "GPT-4o System Card",
    "authors": [
      "OpenAI"
    ],
    "affiliations": [
      "OpenAI"
    ],
    "country_region": "US",
    "date": "2024-05-13",
    "venue": "OpenAI system card",
    "url": "https://openai.com/index/gpt-4o-system-card/",
    "summary": "Vendor system card for GPT-4o (omni model), unifying text/vision/audio in a single transformer. Reports MMMU 69.1, MathVista 63.8, ChartQA 85.7, DocVQA 92.8, AI2D 94.2. Engages Bill_9 (Apollo / METR mentioned, METR partial Bill_9 closure). Explicitly does NOT engage Bill_1 (no perceptual-hash audit), Bill_2 (no OCR-corpus contamination check on DocVQA / TextVQA / ChartQA), Bill_3 (resolution / patch undisclosed), Bill_4 (no Eyes-Wide-Shut / BLINK causal-intervention test), Bill_7 (MMMU-Pro untested), Bill_8 (single-image only; no video / multi-image generalization claim).",
    "candidate_bill": "Bill_9",
    "candidate_meta_cost": "M5",
    "verdict": "known_bill",
    "confidence": 0.85,
    "watchlist_tier": "quarterly",
    "model_family": "GPT-4o",
    "benchmarks": [
      "MMMU",
      "MathVista",
      "ChartQA",
      "DocVQA",
      "AI2D",
      "TextVQA"
    ],
    "notes": "Unified token-stream architecture (single transformer, no separate vision encoder claimed). Tokenizer-format brittleness untested. ChartQA/DocVQA scores unaudited for OCR-corpus contamination.",
    "_appeared_in_sweeps": [
      "sweep_301_vendor_cards"
    ]
  },
  {
    "paper_id": "openai:gpt4o_video_card_2025_06",
    "title": "GPT-4o Video Understanding Update",
    "authors": [
      "OpenAI"
    ],
    "affiliations": [
      "OpenAI"
    ],
    "country_region": "US",
    "date": "2025-06-10",
    "venue": "OpenAI release notes",
    "url": "https://openai.com/index/gpt-4o-video/",
    "summary": "GPT-4o video understanding extension. Reports Video-MME 73.5, EgoSchema 72.2, MVBench 65.8. Engages Bill_8 (multi-image / video native). Does NOT engage Bill_1, Bill_2, Bill_3, Bill_4, Bill_7, Bill_9, Bill_12.",
    "candidate_bill": "Bill_8",
    "candidate_meta_cost": "M5",
    "verdict": "known_bill",
    "confidence": 0.68,
    "watchlist_tier": "annual",
    "model_family": "GPT-4o",
    "benchmarks": [
      "Video-MME",
      "EgoSchema",
      "MVBench"
    ],
    "notes": "Closed weights. Bill_8 partially engaged. Drop vs single-image baseline not disclosed \u2192 soft closure.",
    "_appeared_in_sweeps": [
      "sweep_301_vendor_cards"
    ]
  },
  {
    "paper_id": "openai:gpt4v_safety_eval_2024_03",
    "title": "GPT-4V Safety Evaluation Update \u2014 March 2024",
    "authors": [
      "OpenAI"
    ],
    "affiliations": [
      "OpenAI"
    ],
    "country_region": "US",
    "date": "2024-03-14",
    "venue": "OpenAI deployment update",
    "url": "https://openai.com/index/gpt-4v-system-card-update/",
    "summary": "Update to GPT-4V system card covering deployment-phase red-team findings: improved jailbreak resistance on visual prompt injection, reduced PII leakage from photos, OCR-prompt-injection mitigation. Reports updated MMMU and MathVista scores without disclosing tokenizer / patch / resolution. Engages Bill_9 partially (vendor-internal eval). Does NOT engage Bill_1, Bill_2, Bill_3, Bill_7, Bill_12.",
    "candidate_bill": "Bill_9",
    "candidate_meta_cost": "M5",
    "verdict": "known_bill",
    "confidence": 0.78,
    "watchlist_tier": "quarterly",
    "model_family": "GPT-4V",
    "benchmarks": [
      "MMMU",
      "MathVista",
      "internal jailbreak suite"
    ],
    "notes": "Visual prompt-injection countermeasures discussed without ablation. Tokenizer-format brittleness (Bill_3) untested. OCR-leakage (Bill_2) unaddressed beyond PII exclusion.",
    "_appeared_in_sweeps": [
      "sweep_301_vendor_cards"
    ]
  },
  {
    "paper_id": "openai:gpt4v_system_card_2023_09",
    "title": "GPT-4V(ision) System Card",
    "authors": [
      "OpenAI"
    ],
    "affiliations": [
      "OpenAI"
    ],
    "country_region": "US",
    "date": "2023-09-25",
    "venue": "OpenAI system card",
    "url": "https://cdn.openai.com/papers/GPTV_System_Card.pdf",
    "summary": "Vendor system card introducing GPT-4V, the first frontier multimodal vision-language model. Reports red-team evaluations on harmful content, CAPTCHA solving, biometric inference, and visual scene understanding. Discloses qualitative ablations on resolution and prompt format but no four-tuple compute breakdown. Reports MMMU/Visual-WebArena scores. Engages Bill_9 (vendor self-eval; partially independent through Be My Eyes pilots) but explicitly does NOT engage Bill_1 (no perceptual-hash contamination audit), Bill_2 (no OCR-leakage assessment), Bill_3 (resolution ablation only qualitative), Bill_4 (no causal-intervention vision-grounding study), Bill_7 (no MMMU-Pro / cross-benchmark transfer claim), Bill_12 (no distilled-cousin reproduction).",
    "candidate_bill": "Bill_9",
    "candidate_meta_cost": "M5",
    "verdict": "known_bill",
    "confidence": 0.88,
    "watchlist_tier": "quarterly",
    "model_family": "GPT-4V",
    "benchmarks": [
      "MMMU",
      "Visual-WebArena",
      "internal red-team suite"
    ],
    "notes": "Vision encoder undisclosed (likely modified CLIP-ViT). No OCR-leakage audit; no perceptual-hash contamination audit; no causal-intervention vision-grounding. Anti-Bill_1/Bill_2/Bill_4 design surface \u2014 closure mechanisms unpaid.",
    "_appeared_in_sweeps": [
      "sweep_301_vendor_cards"
    ]
  },
  {
    "paper_id": "openai:gpt5_vision_card_2025_08",
    "title": "GPT-5 Multimodal Vision Card",
    "authors": [
      "OpenAI"
    ],
    "affiliations": [
      "OpenAI"
    ],
    "country_region": "US",
    "date": "2025-08-15",
    "venue": "OpenAI system card",
    "url": "https://openai.com/index/gpt-5-system-card/",
    "summary": "GPT-5 vision portion: unified omni token stream with new high-resolution branch. Reports MMMU 82.4, MMMU-Pro 64.1, MathVista-mini 78.6, BLINK 67.2, MMVet 81.0. Engages Bill_7 (MMMU-Pro reported), Bill_9 (METR / Apollo / AISI joint pre-deployment audit, Bill_9 partially closed). Does NOT engage Bill_1, Bill_2, Bill_3 (resolution ablation absent), Bill_4 (BLINK score reported but no causal-intervention test), Bill_8 (claims long-video but no quantitative drop disclosed), Bill_12.",
    "candidate_bill": "Bill_9",
    "candidate_meta_cost": "M5",
    "verdict": "known_bill",
    "confidence": 0.78,
    "watchlist_tier": "quarterly",
    "model_family": "GPT-5",
    "benchmarks": [
      "MMMU",
      "MMMU-Pro",
      "MathVista",
      "BLINK",
      "MMVet",
      "ChartQA",
      "DocVQA",
      "AI2D"
    ],
    "notes": "AISI / METR / Apollo joint audit closes Bill_9 partially. BLINK score reported but no Eyes-Wide-Shut intervention follow-through \u2192 Bill_4 still \u2605 predicted-empty. Multi-image / video \u2192 Bill_8 disclosure thin.",
    "_appeared_in_sweeps": [
      "sweep_301_vendor_cards"
    ]
  },
  {
    "paper_id": "opencompass_2024_mmbench_v11",
    "title": "MMBench-V1.1: Refined and Expanded Multimodal Benchmark",
    "authors": [
      "OpenCompass Team"
    ],
    "affiliations": [
      "Shanghai AI Lab"
    ],
    "country_region": "CN",
    "date": "2024-08",
    "venue": "OpenCompass leaderboard / dataset release",
    "url": "https://github.com/open-compass/MMBench",
    "summary": "Held-out refresh of MMBench with refined questions, expanded subjects, and removal of leakage-prone items. Frontier drops 3-7pp vs v1.0. Closure mechanism: Bill_11 (anti-saturation reconstruction). Less aggressive redesign than MMMU-Pro but same pattern.",
    "candidate_bill": "Bill_11",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.85,
    "watchlist_tier": null,
    "model_family": "MMBench_V11",
    "training_compute_disclosed": null,
    "test_time_compute_mode": null,
    "rebuttal_papers": [],
    "notes": "Refresh of v1.0 with cleaned questions. Drops vs v1.0: GPT-4o 84.0% -> 82.4% (-1.6pp), Claude 3.5 Sonnet 80.7% -> 79.8% (-0.9pp), InternVL2-76B 86.0% -> 81.5% (-4.5pp). Smaller drops than MMMU-Pro because circular-eval already audited position bias; remaining drops are item-level leakage.",
    "architecture_class": "benchmark",
    "data_mixture": "ability_taxonomy_v11",
    "tokenizer": null,
    "claimed_chinchilla_ratio": null,
    "_appeared_in_sweeps": [
      "sweep_302_mmmu_blink"
    ]
  },
  {
    "paper_id": "opengvlab:internvl2_5_card_2024_12",
    "title": "InternVL 2.5 Technical Report",
    "authors": [
      "Z. Chen",
      "W. Wang",
      "et al."
    ],
    "affiliations": [
      "OpenGVLab",
      "Shanghai AI Lab"
    ],
    "country_region": "CN",
    "date": "2024-12-06",
    "venue": "arXiv 2412.05271",
    "url": "https://arxiv.org/abs/2412.05271",
    "summary": "InternVL 2.5 (78B) with progressive training and dynamic high-resolution. Reports MMMU 70.1, MMMU-Pro 49.9, MathVista 72.3, ChartQA 88.5, DocVQA 96.4, BLINK 63.8. Engages Bill_3 (dynamic-resolution ablation reported), Bill_5, Bill_7 (MMMU-Pro disclosed), Bill_12. Does NOT engage Bill_1, Bill_2, Bill_4, Bill_9.",
    "candidate_bill": "Bill_3",
    "candidate_meta_cost": "M5",
    "verdict": "known_bill",
    "confidence": 0.85,
    "watchlist_tier": "quarterly",
    "model_family": "InternVL",
    "benchmarks": [
      "MMMU",
      "MMMU-Pro",
      "MathVista",
      "ChartQA",
      "DocVQA",
      "BLINK",
      "AI2D",
      "OCRBench"
    ],
    "notes": "Bill_3 partially closed via explicit dynamic-tiling ablation. MMMU-Pro 49.9 vs MMMU 70.1 = 20.2pp drop \u2014 confirms Bill_7 \u2605 pattern. Open-weight = Bill_12 substrate.",
    "_appeared_in_sweeps": [
      "sweep_301_vendor_cards"
    ]
  },
  {
    "paper_id": "opengvlab:internvl2_card_2024_07",
    "title": "InternVL2 \u2014 Closing the Gap with Commercial Multimodal Models",
    "authors": [
      "Z. Chen",
      "W. Wang",
      "et al."
    ],
    "affiliations": [
      "OpenGVLab",
      "Shanghai AI Lab",
      "Tsinghua"
    ],
    "country_region": "CN",
    "date": "2024-07-04",
    "venue": "arXiv 2410.16261 + GitHub model card",
    "url": "https://arxiv.org/abs/2410.16261",
    "summary": "InternVL2 series (1B-76B) with InternViT-6B vision encoder + LLM via MLP connector. Reports MMMU 58.3 (76B), MathVista 65.5, ChartQA 88.4, DocVQA 94.1. Engages Bill_5 (custom InternViT vision encoder, distinct from CLIP / SigLIP), Bill_12 (open-weight at all scales). Does NOT engage Bill_1, Bill_2, Bill_3 (only single-resolution main results), Bill_4, Bill_7, Bill_9.",
    "candidate_bill": "Bill_5",
    "candidate_meta_cost": "M5",
    "verdict": "known_bill",
    "confidence": 0.85,
    "watchlist_tier": "quarterly",
    "model_family": "InternVL",
    "benchmarks": [
      "MMMU",
      "MathVista",
      "ChartQA",
      "DocVQA",
      "AI2D",
      "TextVQA",
      "OCRBench"
    ],
    "notes": "InternViT-6B custom encoder pretrained from scratch \u2014 strong Bill_5 portability evidence. Full open weights at 1B/2B/4B/8B/26B/40B/76B scales = strong Bill_12 substrate.",
    "_appeared_in_sweeps": [
      "sweep_301_vendor_cards"
    ]
  },
  {
    "paper_id": "opengvlab:internvl3_2025_04",
    "title": "InternVL3 \u2014 Exploring Advanced Training and Test-Time Recipes",
    "authors": [
      "J. Zhu",
      "Z. Chen",
      "et al."
    ],
    "affiliations": [
      "OpenGVLab",
      "Shanghai AI Lab"
    ],
    "country_region": "CN",
    "date": "2025-04-08",
    "venue": "arXiv 2504.10479",
    "url": "https://arxiv.org/abs/2504.10479",
    "summary": "InternVL3 (78B) with native multimodal pretraining. Reports MMMU 72.2, MMMU-Pro 53.6, MathVista 71.0, BLINK 66.0, MathVerse 53.2, MMMU-Pro Standard 54.0. Engages Bill_5, Bill_7 (MMMU-Pro), Bill_12 (open weights). Does NOT engage Bill_1, Bill_2, Bill_3 (less ablation than 2.5), Bill_4, Bill_9.",
    "candidate_bill": "Bill_5",
    "candidate_meta_cost": "M5",
    "verdict": "known_bill",
    "confidence": 0.82,
    "watchlist_tier": "quarterly",
    "model_family": "InternVL",
    "benchmarks": [
      "MMMU",
      "MMMU-Pro",
      "MathVista",
      "BLINK",
      "MathVerse",
      "ChartQA",
      "DocVQA",
      "OCRBench"
    ],
    "notes": "Native multimodal pretraining (text + image tokens jointly) \u2014 moves further from single-encoder bolt-on. Bill_5 strongly engaged. Bill_4 still \u2605 unpaid.",
    "_appeared_in_sweeps": [
      "sweep_301_vendor_cards"
    ]
  },
  {
    "paper_id": "openworldqa_2024",
    "title": "OpenWorldQA: Open-World Visual Reasoning Benchmark",
    "authors": [
      "Multiple"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": null,
    "url": null,
    "summary": "Open-vocabulary, open-world failure modes; complements closed-set MMMU/MVBench.",
    "candidate_bill": null,
    "candidate_meta_cost": null,
    "verdict": "needs_gate",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "Open-vocabulary, open-world failure modes; complements closed-set MMMU/MVBench.",
    "_appeared_in_sweeps": [
      "sweep_304_multi_image_video"
    ]
  },
  {
    "paper_id": "os_atlas_2024",
    "title": "OS-ATLAS: A Foundation Action Model for Generalist GUI Agents",
    "authors": [
      "Wu",
      "Cheng",
      "Liu",
      "et al. (OS-Copilot)"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": "ICLR 2025",
    "url": null,
    "summary": "Foundation grounding model used as a tool by downstream agent VLMs.",
    "candidate_bill": null,
    "candidate_meta_cost": null,
    "verdict": "needs_gate",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "Foundation grounding model used as a tool by downstream agent VLMs.",
    "_appeared_in_sweeps": [
      "sweep_307_tool_augmented"
    ]
  },
  {
    "paper_id": "os_genesis_2024",
    "title": "OS-Genesis: Automating GUI Agent Trajectory Construction via Reverse Task Synthesis",
    "authors": [
      "Sun",
      "Yu",
      "Wu",
      "et al."
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": "ACL 2025",
    "url": null,
    "summary": "Companion to OS-Atlas. Solves the trajectory-data acquisition leg of GUI-agent training.",
    "candidate_bill": null,
    "candidate_meta_cost": null,
    "verdict": "needs_gate",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "Companion to OS-Atlas. Solves the trajectory-data acquisition leg of GUI-agent training.",
    "_appeared_in_sweeps": [
      "sweep_307_tool_augmented"
    ]
  },
  {
    "paper_id": "osworld_2024",
    "title": "OSWorld: Benchmarking Multimodal Agents for Open-Ended Tasks in Real Computer Environments",
    "authors": [
      "Xie",
      "Zhang",
      "Chen",
      "et al. (XLang)"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": "NeurIPS 2024",
    "url": null,
    "summary": "OSWorld trajectory is the gold dataset for the reasoning-mode + computer-use thesis. Recent results above human expert baseline.",
    "candidate_bill": null,
    "candidate_meta_cost": null,
    "verdict": "needs_gate",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "OSWorld trajectory is the gold dataset for the reasoning-mode + computer-use thesis. Recent results above human expert baseline.",
    "_appeared_in_sweeps": [
      "sweep_307_tool_augmented"
    ]
  },
  {
    "paper_id": "pali3_2023",
    "title": "PaLI-3 Vision Language Models: Smaller, Faster, Stronger",
    "authors": [
      "Chen",
      "Wang",
      "Mustafa",
      "Zhai",
      "Salz",
      "et al."
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2023",
    "venue": "Google DeepMind 2023",
    "url": "https://arxiv.org/abs/2310.09199",
    "summary": "",
    "candidate_bill": null,
    "candidate_meta_cost": null,
    "verdict": "needs_gate",
    "confidence": 0.7,
    "watchlist_tier": null,
    "architecture_class": "SigLIP 2B encoder + UL2 3B encoder-decoder transformer",
    "_appeared_in_sweeps": [
      "sweep_306_cross_vlm_arch"
    ]
  },
  {
    "paper_id": "pali_x_2023",
    "title": "PaLI-X: On Scaling up a Multilingual Vision and Language Model",
    "authors": [
      "Chen",
      "Djolonga",
      "Padlewski",
      "Mustafa",
      "et al."
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2023",
    "venue": "Google 2023",
    "url": "https://arxiv.org/abs/2305.18565",
    "summary": "",
    "candidate_bill": null,
    "candidate_meta_cost": null,
    "verdict": "needs_gate",
    "confidence": 0.7,
    "watchlist_tier": null,
    "architecture_class": "ViT-22B encoder + 32B encoder-decoder LLM",
    "_appeared_in_sweeps": [
      "sweep_306_cross_vlm_arch"
    ]
  },
  {
    "paper_id": "perception_test_2023",
    "title": "Perception Test: A Diagnostic Benchmark for Multimodal Video Models",
    "authors": [
      "Patraucean et al. (DeepMind)"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2023",
    "venue": null,
    "url": null,
    "summary": "Perception-vs-reasoning split. Cognitive-skill-axis decomposition unique among video benchmarks.",
    "candidate_bill": null,
    "candidate_meta_cost": null,
    "verdict": "needs_gate",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "Perception-vs-reasoning split. Cognitive-skill-axis decomposition unique among video benchmarks.",
    "_appeared_in_sweeps": [
      "sweep_304_multi_image_video"
    ]
  },
  {
    "paper_id": "phi3_vision_2024",
    "title": "Phi-3 Vision / Phi-3.5 Vision Technical Report",
    "authors": [
      "Abdin",
      "Aneja",
      "Awadalla",
      "et al."
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": "Microsoft 2024",
    "url": "https://arxiv.org/abs/2404.14219",
    "summary": "",
    "candidate_bill": null,
    "candidate_meta_cost": null,
    "verdict": "needs_gate",
    "confidence": 0.7,
    "watchlist_tier": null,
    "architecture_class": "CLIP encoder + connector + projector + Phi-3 Mini LLM; 128K context",
    "_appeared_in_sweeps": [
      "sweep_306_cross_vlm_arch"
    ]
  },
  {
    "paper_id": "pixtral_12b_2024",
    "title": "Pixtral 12B",
    "authors": [
      "Agrawal",
      "Antoniak",
      "Hanna",
      "Bout",
      "Chaplot",
      "Chudnovsky",
      "et al."
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": "Mistral AI 2024",
    "url": "https://arxiv.org/abs/2410.07073",
    "summary": "",
    "candidate_bill": null,
    "candidate_meta_cost": null,
    "verdict": "needs_gate",
    "confidence": 0.7,
    "watchlist_tier": null,
    "architecture_class": "Custom ViT with 2D-RoPE, native variable-resolution support, 16x16 patches, 24 layers",
    "_appeared_in_sweeps": [
      "sweep_306_cross_vlm_arch"
    ]
  },
  {
    "paper_id": "prismatic_2024",
    "title": "Prismatic VLMs: Investigating the Design Space of Visually-Conditioned Language Models",
    "authors": [
      "Karamcheti",
      "Nair",
      "Balakrishna",
      "Liang",
      "Kollar",
      "Sadigh"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": "ICML 2024",
    "url": "https://arxiv.org/abs/2402.07865",
    "summary": "",
    "candidate_bill": null,
    "candidate_meta_cost": null,
    "verdict": "needs_gate",
    "confidence": 0.7,
    "watchlist_tier": null,
    "architecture_class": "Multiple visual backbones \u00d7 LLM (Vicuna/LLaMA-2/Mistral) \u00d7 projector (linear/MLP); 7-13B scale",
    "_appeared_in_sweeps": [
      "sweep_306_cross_vlm_arch"
    ]
  },
  {
    "paper_id": "qwen25_vl_2025",
    "title": "Qwen2.5-VL: Visual Agent with Tool Use, Computer Use, Phone Use",
    "authors": [
      "Qwen Team"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2025",
    "venue": "Alibaba Qwen",
    "url": null,
    "summary": "Open-weight reasoning-mode VLM. Qwen-Agent framework includes Function Calling, MCP, Code Interpreter, RAG.",
    "candidate_bill": null,
    "candidate_meta_cost": null,
    "verdict": "needs_gate",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "Open-weight reasoning-mode VLM. Qwen-Agent framework includes Function Calling, MCP, Code Interpreter, RAG.",
    "_appeared_in_sweeps": [
      "sweep_307_tool_augmented"
    ]
  },
  {
    "paper_id": "qwen2_5_vl_2025",
    "title": "Qwen2.5-VL Technical Report",
    "authors": [
      "Bai",
      "Chen",
      "Liu",
      "Wang",
      "Ge",
      "et al."
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2025",
    "venue": "Alibaba 2025",
    "url": "https://arxiv.org/abs/2502.13923",
    "summary": "",
    "candidate_bill": null,
    "candidate_meta_cost": null,
    "verdict": "needs_gate",
    "confidence": 0.7,
    "watchlist_tier": null,
    "architecture_class": "ViT with window attention + 2D-RoPE + SwiGLU/RMSNorm (aligned with Qwen2.5 LLM); native dynamic-res",
    "_appeared_in_sweeps": [
      "sweep_306_cross_vlm_arch"
    ]
  },
  {
    "paper_id": "qwen2_vl_2024",
    "title": "Qwen2-VL: Enhancing Vision-Language Model's Perception of the World at Any Resolution",
    "authors": [
      "Wang",
      "Bai",
      "Tan",
      "Wang",
      "Wan",
      "Ge",
      "et al."
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": "Alibaba 2024",
    "url": "https://arxiv.org/abs/2409.12191",
    "summary": "",
    "candidate_bill": null,
    "candidate_meta_cost": null,
    "verdict": "needs_gate",
    "confidence": 0.7,
    "watchlist_tier": null,
    "architecture_class": "Native dynamic-resolution ViT + M-RoPE (Multimodal Rotary Position Embedding) + Qwen2 LLM",
    "_appeared_in_sweeps": [
      "sweep_306_cross_vlm_arch"
    ]
  },
  {
    "paper_id": "qwen_vl_2023",
    "title": "Qwen-VL: A Versatile Vision-Language Model for Understanding, Localization, Text Reading, and Beyond",
    "authors": [
      "Bai",
      "Bai",
      "Yang",
      "Wang",
      "Tan",
      "Wang",
      "Lin",
      "Zhou",
      "Zhou"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2023",
    "venue": "Alibaba 2023",
    "url": "https://arxiv.org/abs/2308.12966",
    "summary": "",
    "candidate_bill": null,
    "candidate_meta_cost": null,
    "verdict": "needs_gate",
    "confidence": 0.7,
    "watchlist_tier": null,
    "architecture_class": "ViT (OpenCLIP) + Q-Former-style position-aware vision-language adapter + Qwen LLM",
    "_appeared_in_sweeps": [
      "sweep_306_cross_vlm_arch"
    ]
  },
  {
    "paper_id": "radio_2024",
    "title": "AM-RADIO: Agglomerative Vision Foundation Model \u2014 Reduce All Domains Into One",
    "authors": [
      "Ranzinger",
      "Heinrich",
      "Kautz",
      "Molchanov"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": "NVIDIA CVPR 2024",
    "url": "https://arxiv.org/abs/2312.06709",
    "summary": "",
    "candidate_bill": null,
    "candidate_meta_cost": null,
    "verdict": "needs_gate",
    "confidence": 0.7,
    "watchlist_tier": null,
    "architecture_class": "Multi-teacher distillation into single ViT; CLIP+DINOv2+SAM teachers",
    "_appeared_in_sweeps": [
      "sweep_306_cross_vlm_arch"
    ]
  },
  {
    "paper_id": "rextime_2024",
    "title": "ReXTime: Reasoning Across Video Time",
    "authors": [
      "Multiple"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": null,
    "url": null,
    "summary": "Across-time causal reasoning explicitly. Companion to TempBench/TempCompass.",
    "candidate_bill": null,
    "candidate_meta_cost": null,
    "verdict": "needs_gate",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "Across-time causal reasoning explicitly. Companion to TempBench/TempCompass.",
    "_appeared_in_sweeps": [
      "sweep_304_multi_image_video"
    ]
  },
  {
    "paper_id": "rhymes:aria_2024_10",
    "title": "Aria \u2014 An Open Multimodal Native Mixture-of-Experts Model",
    "authors": [
      "D. Li",
      "Y. Liu",
      "et al."
    ],
    "affiliations": [
      "Rhymes AI"
    ],
    "country_region": "US/CN",
    "date": "2024-10-08",
    "venue": "arXiv 2410.05993",
    "url": "https://arxiv.org/abs/2410.05993",
    "summary": "Aria MoE with native multimodal pretraining (3.9B active / 25B total). Reports MMMU 54.9, MathVista 66.1, ChartQA 86.4, DocVQA 92.6, Video-MME 67.6. Engages Bill_5 (MoE multimodal-native), Bill_8 (native long-video), Bill_12. Does NOT engage Bill_1, Bill_2, Bill_3, Bill_4, Bill_7, Bill_9.",
    "candidate_bill": "Bill_5",
    "candidate_meta_cost": "M5",
    "verdict": "known_bill",
    "confidence": 0.78,
    "watchlist_tier": "annual",
    "model_family": "Aria",
    "benchmarks": [
      "MMMU",
      "MathVista",
      "ChartQA",
      "DocVQA",
      "Video-MME",
      "LongVideoBench"
    ],
    "notes": "Native multimodal MoE (not bolt-on encoder) = strong Bill_5 architecture diversification. Bill_8 video + long-context.",
    "_appeared_in_sweeps": [
      "sweep_301_vendor_cards"
    ]
  },
  {
    "paper_id": "salesforce:blip3_xgen_2024_08",
    "title": "xGen-MM (BLIP-3) \u2014 Open Multimodal Foundation Models",
    "authors": [
      "L. Xue",
      "M. Shu",
      "et al."
    ],
    "affiliations": [
      "Salesforce AI Research"
    ],
    "country_region": "US",
    "date": "2024-08-13",
    "venue": "arXiv 2408.08872",
    "url": "https://arxiv.org/abs/2408.08872",
    "summary": "xGen-MM (BLIP-3) 4B / 8B with SigLIP-400M. Reports MMMU 41.1 / 48.2, MathVista 39.7 / 39.0, ChartQA 60.0 / 60.6, DocVQA 79.3 / 89.3. Engages Bill_5 (SigLIP), Bill_12 (open weights + open recipes). Does NOT engage Bill_1, Bill_2, Bill_3, Bill_4, Bill_7, Bill_9.",
    "candidate_bill": "Bill_12",
    "candidate_meta_cost": "M1",
    "verdict": "known_bill",
    "confidence": 0.75,
    "watchlist_tier": "annual",
    "model_family": "BLIP",
    "benchmarks": [
      "MMMU",
      "MathVista",
      "ChartQA",
      "DocVQA",
      "TextVQA"
    ],
    "notes": "M1 (\u22648B). Open data + recipes = Bill_12 substrate. SigLIP = Bill_5 evidence point.",
    "_appeared_in_sweeps": [
      "sweep_301_vendor_cards"
    ]
  },
  {
    "paper_id": "scaling_laws_native_multimodal_2025",
    "title": "Scaling Laws for Native Multimodal Models",
    "authors": [
      "Shukor",
      "Fini",
      "Turrisi da Costa",
      "Cord",
      "Susskind",
      "El-Nouby"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2025",
    "venue": "Apple 2025",
    "url": "https://arxiv.org/abs/2504.07951",
    "summary": "",
    "candidate_bill": null,
    "candidate_meta_cost": null,
    "verdict": "needs_gate",
    "confidence": 0.7,
    "watchlist_tier": null,
    "architecture_class": "Spectrum from early-fusion (no encoder) to late-fusion (encoder+LLM); also dense vs MoE variants",
    "_appeared_in_sweeps": [
      "sweep_306_cross_vlm_arch"
    ]
  },
  {
    "paper_id": "sciagent_2024",
    "title": "SciAgent: Tool-augmented Language Models for Scientific Reasoning",
    "authors": [
      "Ma",
      "Cao",
      "Ma",
      "et al."
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": "EMNLP 2024",
    "url": null,
    "summary": "Scientific tool-use benchmark + training corpus. Establishes that tool quality > parameter count for scientific reasoning.",
    "candidate_bill": null,
    "candidate_meta_cost": null,
    "verdict": "needs_gate",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "Scientific tool-use benchmark + training corpus. Establishes that tool quality > parameter count for scientific reasoning.",
    "_appeared_in_sweeps": [
      "sweep_307_tool_augmented"
    ]
  },
  {
    "paper_id": "screenagent_2024",
    "title": "ScreenAgent: A Vision Language Model-driven Computer Control Agent",
    "authors": [
      "Niu",
      "Li",
      "et al."
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": null,
    "url": null,
    "summary": "Open-weight computer-use VLM. Validates training-data tool-augmentation thesis.",
    "candidate_bill": null,
    "candidate_meta_cost": null,
    "verdict": "needs_gate",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "Open-weight computer-use VLM. Validates training-data tool-augmentation thesis.",
    "_appeared_in_sweeps": [
      "sweep_307_tool_augmented"
    ]
  },
  {
    "paper_id": "screenspot_pro_2025",
    "title": "ScreenSpot-Pro: GUI Grounding for Professional High-Resolution Computer Use",
    "authors": [
      "Li",
      "Lu",
      "Su",
      "et al."
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2025",
    "venue": "ICLR 2025",
    "url": null,
    "summary": "High-resolution UI grounding benchmark. Establishes tool-augmented visual search as a minimum bar for computer-use VLMs.",
    "candidate_bill": null,
    "candidate_meta_cost": null,
    "verdict": "needs_gate",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "High-resolution UI grounding benchmark. Establishes tool-augmented visual search as a minimum bar for computer-use VLMs.",
    "_appeared_in_sweeps": [
      "sweep_307_tool_augmented"
    ]
  },
  {
    "paper_id": "seeact_2024",
    "title": "GPT-4V(ision) is a Generalist Web Agent, if Grounded",
    "authors": [
      "Zheng",
      "Gou",
      "Sun",
      "et al. (OSU)"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": "ICML 2024",
    "url": null,
    "summary": "Mind2Web live evaluation. Quantifies 'oracle-grounding ceiling' = upper bound for tool-augmented web agents.",
    "candidate_bill": null,
    "candidate_meta_cost": null,
    "verdict": "needs_gate",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "Mind2Web live evaluation. Quantifies 'oracle-grounding ceiling' = upper bound for tool-augmented web agents.",
    "_appeared_in_sweeps": [
      "sweep_307_tool_augmented"
    ]
  },
  {
    "paper_id": "siglip2_2025",
    "title": "SigLIP 2: Multilingual Vision-Language Encoders with Improved Semantic Understanding, Localization, and Dense Features",
    "authors": [
      "Tschannen",
      "Gritsenko",
      "Wang",
      "Naeem",
      "Alabdulmohsin",
      "et al."
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2025",
    "venue": "Google DeepMind 2025",
    "url": "https://arxiv.org/abs/2502.14786",
    "summary": "",
    "candidate_bill": null,
    "candidate_meta_cost": null,
    "verdict": "needs_gate",
    "confidence": 0.7,
    "watchlist_tier": null,
    "architecture_class": "ViT-B/L/SO400M/g, multi-objective (sigmoid + captioning + self-distillation + masked prediction)",
    "_appeared_in_sweeps": [
      "sweep_306_cross_vlm_arch"
    ]
  },
  {
    "paper_id": "siglip_2023",
    "title": "Sigmoid Loss for Language Image Pre-Training (SigLIP)",
    "authors": [
      "Zhai",
      "Mustafa",
      "Kolesnikov",
      "Beyer"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2023",
    "venue": "ICCV 2023",
    "url": "https://arxiv.org/abs/2303.15343",
    "summary": "",
    "candidate_bill": null,
    "candidate_meta_cost": null,
    "verdict": "needs_gate",
    "confidence": 0.7,
    "watchlist_tier": null,
    "architecture_class": "ViT (same architecture as CLIP, sigmoid loss instead of softmax)",
    "_appeared_in_sweeps": [
      "sweep_306_cross_vlm_arch"
    ]
  },
  {
    "paper_id": "som_2023",
    "title": "Set-of-Mark Prompting Unleashes Extraordinary Visual Grounding in GPT-4V",
    "authors": [
      "Yang",
      "Zhang",
      "Li",
      "Zou",
      "Li",
      "Gao (Microsoft)"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2023",
    "venue": null,
    "url": null,
    "summary": "Reframes 'tool augmentation' to include external perception tools (segmenters) injected as visual prompts.",
    "candidate_bill": null,
    "candidate_meta_cost": null,
    "verdict": "needs_gate",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "Reframes 'tool augmentation' to include external perception tools (segmenters) injected as visual prompts.",
    "_appeared_in_sweeps": [
      "sweep_307_tool_augmented"
    ]
  },
  {
    "paper_id": "spatialeval_video_2025",
    "title": "Spatial-Video-Bench: Spatial Reasoning in Long Videos",
    "authors": [
      "Multiple"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2025",
    "venue": null,
    "url": null,
    "summary": "Spatial-cognitive axis (mental rotation, viewpoint, room-scale layout) for video VLMs.",
    "candidate_bill": null,
    "candidate_meta_cost": null,
    "verdict": "needs_gate",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "Spatial-cognitive axis (mental rotation, viewpoint, room-scale layout) for video VLMs.",
    "_appeared_in_sweeps": [
      "sweep_304_multi_image_video"
    ]
  },
  {
    "paper_id": "sphinx_2024",
    "title": "SPHINX: The Joint Mixing of Weights, Tasks, and Visual Embeddings for Multi-modal Large Language Models",
    "authors": [
      "Lin",
      "Liu",
      "Lin",
      "et al."
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": "ECCV 2024",
    "url": "https://arxiv.org/abs/2311.07575",
    "summary": "",
    "candidate_bill": null,
    "candidate_meta_cost": null,
    "verdict": "needs_gate",
    "confidence": 0.7,
    "watchlist_tier": null,
    "architecture_class": "Mixture of Visual encoders (CLIP+DINOv2+Q-Former+ConvNeXt) channel-wise mixed; LLaMA backbone",
    "_appeared_in_sweeps": [
      "sweep_306_cross_vlm_arch"
    ]
  },
  {
    "paper_id": "sphinx_x_2024",
    "title": "SPHINX-X: Scaling Data and Parameters for a Family of Multi-modal Large Language Models",
    "authors": [
      "Gao",
      "Zhang",
      "Liu",
      "et al."
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": "Shanghai AI Lab 2024",
    "url": "https://arxiv.org/abs/2402.05935",
    "summary": "",
    "candidate_bill": null,
    "candidate_meta_cost": null,
    "verdict": "needs_gate",
    "confidence": 0.7,
    "watchlist_tier": null,
    "architecture_class": "Mixture-of-Visual-experts (MoV): ConvNeXt + DINOv2 only; tested across TinyLlama-1.1B/InternLM2-7B/LLaMA2-13B/Mixtral-8x7B",
    "_appeared_in_sweeps": [
      "sweep_306_cross_vlm_arch"
    ]
  },
  {
    "paper_id": "stability:llava_next_2024_01",
    "title": "LLaVA-NeXT \u2014 Improved Reasoning, OCR, and World Knowledge",
    "authors": [
      "H. Liu",
      "C. Li",
      "et al."
    ],
    "affiliations": [
      "U. Wisconsin",
      "Microsoft"
    ],
    "country_region": "US",
    "date": "2024-01-30",
    "venue": "LLaVA blog post + arXiv 2407.07895 (followup)",
    "url": "https://llava-vl.github.io/blog/2024-01-30-llava-next/",
    "summary": "LLaVA-NeXT with dynamic high-res via image tiling. Reports MMMU 35.8 (7B) / 51.1 (34B), MathVista 34.6, ChartQA 69.5, DocVQA 84.0. Engages Bill_3 (dynamic tiling \u2014 first major bolt-on demonstration), Bill_5, Bill_12. Does NOT engage Bill_1, Bill_2, Bill_4, Bill_7, Bill_9.",
    "candidate_bill": "Bill_3",
    "candidate_meta_cost": "M1",
    "verdict": "known_bill",
    "confidence": 0.78,
    "watchlist_tier": "annual",
    "model_family": "LLaVA",
    "benchmarks": [
      "MMMU",
      "MathVista",
      "ChartQA",
      "DocVQA",
      "TextVQA"
    ],
    "notes": "Pre-frontier (M1) but key reference: introduced dynamic-tiling design used by InternVL, Qwen2-VL, LLaVA-OneVision. Bill_3 substrate.",
    "_appeared_in_sweeps": [
      "sweep_301_vendor_cards"
    ]
  },
  {
    "paper_id": "stability:llava_onevision_2024_08",
    "title": "LLaVA-OneVision \u2014 Easy Visual Task Transfer",
    "authors": [
      "B. Li",
      "Y. Zhang",
      "et al."
    ],
    "affiliations": [
      "NTU",
      "Bytedance"
    ],
    "country_region": "SG/CN",
    "date": "2024-08-06",
    "venue": "arXiv 2408.03326",
    "url": "https://arxiv.org/abs/2408.03326",
    "summary": "LLaVA-OneVision (0.5B / 7B / 72B) with SigLIP encoder, unified single-image / multi-image / video. Reports MMMU 56.8 (72B), MathVista 67.5, ChartQA 83.7, DocVQA 91.3, Video-MME 66.2. Engages Bill_5 (SigLIP), Bill_8 (single \u2192 multi-image \u2192 video transfer), Bill_12, Bill_3 partially. Does NOT engage Bill_1, Bill_2, Bill_4, Bill_7, Bill_9.",
    "candidate_bill": "Bill_8",
    "candidate_meta_cost": "M5",
    "verdict": "known_bill",
    "confidence": 0.85,
    "watchlist_tier": "quarterly",
    "model_family": "LLaVA",
    "benchmarks": [
      "MMMU",
      "MathVista",
      "ChartQA",
      "DocVQA",
      "Video-MME",
      "MVBench",
      "LongVideoBench"
    ],
    "notes": "Single-image \u2192 multi-image \u2192 video transfer is the central claim = direct Bill_8 closure attempt. Reports drop \u22645pp on transfer in some cases. SigLIP encoder = Bill_5.",
    "_appeared_in_sweeps": [
      "sweep_301_vendor_cards"
    ]
  },
  {
    "paper_id": "stanford_helm_2024_vision",
    "title": "HELM-Vision: Holistic Evaluation of Vision-Language Models",
    "authors": [
      "Stanford CRFM"
    ],
    "affiliations": [
      "Stanford CRFM"
    ],
    "country_region": "US",
    "date": "2024-12",
    "venue": "Stanford CRFM technical report",
    "url": "https://crfm.stanford.edu/helm/vlm/latest/",
    "summary": "Stanford HELM extension for VLMs. Standardized evaluation across 22 benchmarks with consistent prompting. Reveals systematic 5-12pp gap between vendor-published numbers and HELM-replicated numbers on MMMU, MathVista, MMVet. Closure mechanism: Bill_9 (vendor-self-eval independence) primary anchor and Bill_7 cousin (cross-benchmark consistency).",
    "candidate_bill": "Bill_9",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.88,
    "watchlist_tier": null,
    "model_family": "HELM_Vision",
    "training_compute_disclosed": null,
    "test_time_compute_mode": null,
    "rebuttal_papers": [],
    "notes": "Independent reproduction across 22 benchmarks. Vendor-vs-HELM gap: GPT-4o on MMMU vendor 69.1% / HELM 64.7% = -4.4pp, Claude 3.5 Sonnet on MathVista vendor 67.7% / HELM 62.3% = -5.4pp, Gemini 1.5 Pro on MMVet vendor 73.5% / HELM 68.1% = -5.4pp. Pattern: vendor numbers consistently overstate by 4-9pp. Direct Bill_9 anchor.",
    "architecture_class": "benchmark_replicator",
    "data_mixture": null,
    "tokenizer": null,
    "claimed_chinchilla_ratio": null,
    "_appeared_in_sweeps": [
      "sweep_302_mmmu_blink"
    ]
  },
  {
    "paper_id": "stanford_helm_vision_2025",
    "title": "Stanford CRFM HELM-Vision 2025: Independent Replication of 12 VLMs",
    "authors": [
      "Liang",
      "Bommasani",
      "et al."
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2025",
    "venue": "Stanford CRFM",
    "url": null,
    "summary": "HELM-Vision Q1 2025: 12 VLMs evaluated on 18 tasks under uniform harness. Cross-evaluator gap vs vendor reports: mean 14.2pp absolute. MMMU: vendor 69% \u2192 HELM 56%. ChartQA: vendor 81% \u2192 HELM 67%. Direct rebuttal of vendor self-eval validity. Targets B_VLM6 and B_VLM12.",
    "candidate_bill": null,
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "HELM-Vision Q1 2025: 12 VLMs evaluated on 18 tasks under uniform harness. Cross-evaluator gap vs vendor reports: mean 14.2pp absolute. MMMU: vendor 69% \u2192 HELM 56%. ChartQA: vendor 81% \u2192 HELM 67%. Direct rebuttal of vendor self-eval validity. Targets B_VLM6 and B_VLM12.",
    "_appeared_in_sweeps": [
      "sweep_308_negative_results"
    ]
  },
  {
    "paper_id": "stepfun:step1v_2024_07",
    "title": "Step-1V Technical Card",
    "authors": [
      "StepFun"
    ],
    "affiliations": [
      "StepFun"
    ],
    "country_region": "CN",
    "date": "2024-07-01",
    "venue": "StepFun model card",
    "url": "https://platform.stepfun.com/docs/llm/step1v",
    "summary": "Step-1V vendor card with proprietary encoder. Reports MMMU 49.9, MathVista 44.8. Limited disclosure. Engages Bill_9 (vendor self-eval, weak). Does NOT engage Bill_1, Bill_2, Bill_3, Bill_4, Bill_5 (no architecture disclosure), Bill_7, Bill_12 (closed weights).",
    "candidate_bill": "Bill_9",
    "candidate_meta_cost": "M5",
    "verdict": "needs_gate",
    "confidence": 0.55,
    "watchlist_tier": "annual",
    "model_family": "Step-1V",
    "benchmarks": [
      "MMMU",
      "MathVista"
    ],
    "notes": "Closed-weight Chinese vendor. Limited disclosure \u2192 most bills unpaid. Needs G2 closure if claims pushed.",
    "_appeared_in_sweeps": [
      "sweep_301_vendor_cards"
    ]
  },
  {
    "paper_id": "stepfun:step3_2025_07",
    "title": "Step-3 Multimodal Reasoning Card",
    "authors": [
      "StepFun"
    ],
    "affiliations": [
      "StepFun"
    ],
    "country_region": "CN",
    "date": "2025-07-15",
    "venue": "StepFun release",
    "url": "https://platform.stepfun.com/docs/llm/step-3",
    "summary": "Step-3 with extended thinking and vision. Reports MMMU 73.0, MMMU-Pro 56.4, MathVista 76.9, BLINK 64.1. Engages Bill_7 (MMMU-Pro), Bill_9 partially. Does NOT engage Bill_1, Bill_2, Bill_3, Bill_4, Bill_5 (limited), Bill_12.",
    "candidate_bill": "Bill_7",
    "candidate_meta_cost": "M5",
    "verdict": "needs_gate",
    "confidence": 0.6,
    "watchlist_tier": "annual",
    "model_family": "Step",
    "benchmarks": [
      "MMMU",
      "MMMU-Pro",
      "MathVista",
      "BLINK"
    ],
    "notes": "Closed weights. MMMU-Pro 56.4 / MMMU 73.0 = 16.6pp drop confirms Bill_7 \u2605 pattern. Bill_4 \u2605 unpaid.",
    "_appeared_in_sweeps": [
      "sweep_301_vendor_cards"
    ]
  },
  {
    "paper_id": "tarsier_2024",
    "title": "Tarsier: Recipes for Training and Evaluating Large Video Description Models",
    "authors": [
      "Wang et al. (ByteDance)"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": null,
    "url": null,
    "summary": "DREAM-1K is the strongest open video description benchmark; F1-style scoring against multi-aspect references.",
    "candidate_bill": null,
    "candidate_meta_cost": null,
    "verdict": "needs_gate",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "DREAM-1K is the strongest open video description benchmark; F1-style scoring against multi-aspect references.",
    "_appeared_in_sweeps": [
      "sweep_304_multi_image_video"
    ]
  },
  {
    "paper_id": "tempbench_2024",
    "title": "Tempo / TempBench: Temporal Reasoning Benchmark for VLMs",
    "authors": [
      "Multiple"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": null,
    "url": null,
    "summary": "Temporal-pure; complements TempCompass. Order, duration, before/after relation tests.",
    "candidate_bill": null,
    "candidate_meta_cost": null,
    "verdict": "needs_gate",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "Temporal-pure; complements TempCompass. Order, duration, before/after relation tests.",
    "_appeared_in_sweeps": [
      "sweep_304_multi_image_video"
    ]
  },
  {
    "paper_id": "tempcompass_2024",
    "title": "TempCompass: Do Video LLMs Really Understand Videos?",
    "authors": [
      "Liu et al."
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": null,
    "url": null,
    "summary": "Designed to defeat single-frame bias by construction. Conflicting-static-frame video pairs.",
    "candidate_bill": null,
    "candidate_meta_cost": null,
    "verdict": "needs_gate",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "Designed to defeat single-frame bias by construction. Conflicting-static-frame video pairs.",
    "_appeared_in_sweeps": [
      "sweep_304_multi_image_video"
    ]
  },
  {
    "paper_id": "tencent:hunyuan_vision_2024_11",
    "title": "Hunyuan-Vision Technical Card",
    "authors": [
      "Tencent Hunyuan Team"
    ],
    "affiliations": [
      "Tencent"
    ],
    "country_region": "CN",
    "date": "2024-11-05",
    "venue": "Tencent release",
    "url": "https://hunyuan.tencent.com/",
    "summary": "Hunyuan-Vision card. Reports MMMU 51.1, MathVista 48.1, ChartQA 80.0. Engages Bill_9 partially. Does NOT engage Bill_1, Bill_2, Bill_3, Bill_4, Bill_5 (closed), Bill_7, Bill_12.",
    "candidate_bill": "Bill_9",
    "candidate_meta_cost": "M5",
    "verdict": "needs_gate",
    "confidence": 0.55,
    "watchlist_tier": "annual",
    "model_family": "Hunyuan",
    "benchmarks": [
      "MMMU",
      "MathVista",
      "ChartQA"
    ],
    "notes": "Closed-weight Chinese vendor; limited Western audit substrate \u2192 Bill_9 weakly engaged.",
    "_appeared_in_sweeps": [
      "sweep_301_vendor_cards"
    ]
  },
  {
    "paper_id": "tinychart_2024",
    "title": "TinyChart: Efficient Chart Understanding with Visual Token Merging and Program-of-Thoughts",
    "authors": [
      "Zhang",
      "Han",
      "Sun",
      "et al."
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": null,
    "url": null,
    "summary": "Establishes PoT supervision pattern -- key training-time tool for chart VLMs.",
    "candidate_bill": null,
    "candidate_meta_cost": null,
    "verdict": "needs_gate",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "Establishes PoT supervision pattern -- key training-time tool for chart VLMs.",
    "_appeared_in_sweeps": [
      "sweep_307_tool_augmented"
    ]
  },
  {
    "paper_id": "tomato_2024",
    "title": "TOMATO: Assessing Visual Temporal Reasoning Capabilities in MLLMs",
    "authors": [
      "Shangguan et al."
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": null,
    "url": null,
    "summary": "Multi-frame-mandatory by construction. Six temporal reasoning principles.",
    "candidate_bill": null,
    "candidate_meta_cost": null,
    "verdict": "needs_gate",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "Multi-frame-mandatory by construction. Six temporal reasoning principles.",
    "_appeared_in_sweeps": [
      "sweep_304_multi_image_video"
    ]
  },
  {
    "paper_id": "vcgbench_2024",
    "title": "VCGBench-Diverse: Diverse Video Conversation Generation Benchmark",
    "authors": [
      "Maaz et al."
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": null,
    "url": null,
    "summary": "Diversity-focused successor to Video-ChatGPT bench. Spatial grounding axis.",
    "candidate_bill": null,
    "candidate_meta_cost": null,
    "verdict": "needs_gate",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "Diversity-focused successor to Video-ChatGPT bench. Spatial grounding axis.",
    "_appeared_in_sweeps": [
      "sweep_304_multi_image_video"
    ]
  },
  {
    "paper_id": "video_chatgpt_2023",
    "title": "Video-ChatGPT: Towards Detailed Video Understanding via Large Vision and Language Models",
    "authors": [
      "Maaz et al. (MBZUAI)"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2023",
    "venue": null,
    "url": null,
    "summary": "First open-ended video VLM benchmark with judge-based scoring. Now considered weak (LLM-judge bias) but still reported.",
    "candidate_bill": null,
    "candidate_meta_cost": null,
    "verdict": "needs_gate",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "First open-ended video VLM benchmark with judge-based scoring. Now considered weak (LLM-judge bias) but still reported.",
    "_appeared_in_sweeps": [
      "sweep_304_multi_image_video"
    ]
  },
  {
    "paper_id": "video_holmes_2025",
    "title": "Video-Holmes: Long-Range Video Reasoning Through Forensic Clue-Tracking",
    "authors": [
      "Multiple"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2025",
    "venue": null,
    "url": null,
    "summary": "Detective-style multi-clue temporal reasoning. Among hardest open video benchmarks 2025.",
    "candidate_bill": null,
    "candidate_meta_cost": null,
    "verdict": "needs_gate",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "Detective-style multi-clue temporal reasoning. Among hardest open video benchmarks 2025.",
    "_appeared_in_sweeps": [
      "sweep_304_multi_image_video"
    ]
  },
  {
    "paper_id": "video_llava_2023",
    "title": "Video-LLaVA: Learning United Visual Representation by Alignment Before Projection",
    "authors": [
      "Lin et al. (PKU)"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2023",
    "venue": null,
    "url": null,
    "summary": "Architectural baseline for combined image+video VLM. Aligns image/video features pre-projection.",
    "candidate_bill": null,
    "candidate_meta_cost": null,
    "verdict": "needs_gate",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "Architectural baseline for combined image+video VLM. Aligns image/video features pre-projection.",
    "_appeared_in_sweeps": [
      "sweep_304_multi_image_video"
    ]
  },
  {
    "paper_id": "video_mme_2024",
    "title": "Video-MME: The First-Ever Comprehensive Evaluation Benchmark of Multi-modal LLMs in Video Analysis",
    "authors": [
      "Fu",
      "Li",
      "Lin",
      "et al."
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": null,
    "url": null,
    "summary": "Gold-standard general video VLM benchmark. Six domains, 30 subcategories. Audio + subtitle ablations isolate modality contributions.",
    "candidate_bill": null,
    "candidate_meta_cost": null,
    "verdict": "needs_gate",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "Gold-standard general video VLM benchmark. Six domains, 30 subcategories. Audio + subtitle ablations isolate modality contributions.",
    "_appeared_in_sweeps": [
      "sweep_304_multi_image_video"
    ]
  },
  {
    "paper_id": "videoagent_2024",
    "title": "VideoAgent: Long-form Video Understanding with Large Language Model as Agent",
    "authors": [
      "Wang",
      "Han",
      "Chen",
      "et al."
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": null,
    "url": null,
    "summary": "Frame-search-as-tool paradigm. Demonstrates that small + smart frame sets beat naive uniform sampling.",
    "candidate_bill": null,
    "candidate_meta_cost": null,
    "verdict": "needs_gate",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "Frame-search-as-tool paradigm. Demonstrates that small + smart frame sets beat naive uniform sampling.",
    "_appeared_in_sweeps": [
      "sweep_307_tool_augmented"
    ]
  },
  {
    "paper_id": "videoeval_pro_2025",
    "title": "VideoEval-Pro: Robust and Realistic Long Video Understanding Evaluation",
    "authors": [
      "Multiple"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2025",
    "venue": null,
    "url": null,
    "summary": "Direct response to MCQ-leakage concerns. Open-ended free-form eval with judge calibration.",
    "candidate_bill": null,
    "candidate_meta_cost": null,
    "verdict": "needs_gate",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "Direct response to MCQ-leakage concerns. Open-ended free-form eval with judge calibration.",
    "_appeared_in_sweeps": [
      "sweep_304_multi_image_video"
    ]
  },
  {
    "paper_id": "videommmu_2025",
    "title": "Video-MMMU: Evaluating Knowledge Acquisition from Multi-Discipline Professional Videos",
    "authors": [
      "Hu et al."
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2025",
    "venue": null,
    "url": null,
    "summary": "Knowledge-acquisition axis: model must learn from video then transfer. Three stages: perception, comprehension, adaptation.",
    "candidate_bill": null,
    "candidate_meta_cost": null,
    "verdict": "needs_gate",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "Knowledge-acquisition axis: model must learn from video then transfer. Three stages: perception, comprehension, adaptation.",
    "_appeared_in_sweeps": [
      "sweep_304_multi_image_video"
    ]
  },
  {
    "paper_id": "videoniah_2024",
    "title": "VideoNIAH: Video Needle In A Haystack",
    "authors": [
      "Zhao et al."
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": null,
    "url": null,
    "summary": "Direct video analog of LLM NIAH. Reveals catastrophic long-context VLM failure. Synthetic-needle paradigm exposes attention vs retrieval gap.",
    "candidate_bill": null,
    "candidate_meta_cost": null,
    "verdict": "needs_gate",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "Direct video analog of LLM NIAH. Reveals catastrophic long-context VLM failure. Synthetic-needle paradigm exposes attention vs retrieval gap.",
    "_appeared_in_sweeps": [
      "sweep_304_multi_image_video"
    ]
  },
  {
    "paper_id": "videoref_bench_2024",
    "title": "Video-Refer-Bench / Referring Video QA",
    "authors": [
      "Multiple"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": null,
    "url": null,
    "summary": "Spatio-temporal referring grounding axis.",
    "candidate_bill": null,
    "candidate_meta_cost": null,
    "verdict": "needs_gate",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "Spatio-temporal referring grounding axis.",
    "_appeared_in_sweeps": [
      "sweep_304_multi_image_video"
    ]
  },
  {
    "paper_id": "videotree_2024",
    "title": "VideoTree: Adaptive Tree-based Video Representation for LLM Reasoning on Long Videos",
    "authors": [
      "Wang",
      "et al."
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": "CVPR 2025",
    "url": null,
    "summary": "Companion to VideoAgent. Tree-structured tool augmentation.",
    "candidate_bill": null,
    "candidate_meta_cost": null,
    "verdict": "needs_gate",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "Companion to VideoAgent. Tree-structured tool augmentation.",
    "_appeared_in_sweeps": [
      "sweep_307_tool_augmented"
    ]
  },
  {
    "paper_id": "videovista_2024",
    "title": "VideoVista: A Versatile Benchmark for Video Understanding and Reasoning",
    "authors": [
      "Li et al."
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": null,
    "url": null,
    "summary": "Largest QA volume; broad task coverage (causal, hypothetical, predictive). Dual-axis perception/reasoning split.",
    "candidate_bill": null,
    "candidate_meta_cost": null,
    "verdict": "needs_gate",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "Largest QA volume; broad task coverage (causal, hypothetical, predictive). Dual-axis perception/reasoning split.",
    "_appeared_in_sweeps": [
      "sweep_304_multi_image_video"
    ]
  },
  {
    "paper_id": "videoxum_2024",
    "title": "VideoXum: Cross-Modal Visual and Textural Summarization of Videos",
    "authors": [
      "Lin et al."
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": null,
    "url": null,
    "summary": "Joint visual + textual summary generation. Tests video-to-keyframe-to-text pipeline.",
    "candidate_bill": null,
    "candidate_meta_cost": null,
    "verdict": "needs_gate",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "Joint visual + textual summary generation. Tests video-to-keyframe-to-text pipeline.",
    "_appeared_in_sweeps": [
      "sweep_304_multi_image_video"
    ]
  },
  {
    "paper_id": "vila_2024",
    "title": "VILA: On Pre-training for Visual Language Models",
    "authors": [
      "Lin",
      "Wei",
      "Han",
      "et al."
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": "NVIDIA CVPR 2024",
    "url": "https://arxiv.org/abs/2312.07533",
    "summary": "",
    "candidate_bill": null,
    "candidate_meta_cost": null,
    "verdict": "needs_gate",
    "confidence": 0.7,
    "watchlist_tier": null,
    "architecture_class": "CLIP encoder + linear projector + Llama-2 LLM; pre-training with interleaved image-text",
    "_appeared_in_sweeps": [
      "sweep_306_cross_vlm_arch"
    ]
  },
  {
    "paper_id": "vipergpt_2023",
    "title": "ViperGPT: Visual Inference via Python Execution for Reasoning",
    "authors": [
      "Sur\u00eds",
      "Menon",
      "Vondrick"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2023",
    "venue": "ICCV 2023",
    "url": null,
    "summary": "Canonical 'tool-augmented vision' paper alongside VisProg. Establishes that compositional visual reasoning is a Python-orchestration problem more than an end-to-end perception problem.",
    "candidate_bill": null,
    "candidate_meta_cost": null,
    "verdict": "needs_gate",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "Canonical 'tool-augmented vision' paper alongside VisProg. Establishes that compositional visual reasoning is a Python-orchestration problem more than an end-to-end perception problem.",
    "_appeared_in_sweeps": [
      "sweep_307_tool_augmented"
    ]
  },
  {
    "paper_id": "visprog_2023",
    "title": "Visual Programming: Compositional Visual Reasoning Without Training",
    "authors": [
      "Gupta",
      "Kembhavi (Allen AI)"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2022",
    "venue": "CVPR 2023 Best Paper",
    "url": null,
    "summary": "CVPR 2023 Best Paper Award. Anchors the neuro-symbolic visual-programming line.",
    "candidate_bill": null,
    "candidate_meta_cost": null,
    "verdict": "needs_gate",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "CVPR 2023 Best Paper Award. Anchors the neuro-symbolic visual-programming line.",
    "_appeared_in_sweeps": [
      "sweep_307_tool_augmented"
    ]
  },
  {
    "paper_id": "visual_cot_2024",
    "title": "Visual CoT: Advancing Multi-Modal Language Models with a Comprehensive Dataset and Benchmark for Chain-of-Thought Reasoning",
    "authors": [
      "Shao",
      "Meng",
      "Lu",
      "Wang",
      "Wang",
      "Lin",
      "Tian (Tsinghua/Sensetime)"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": "NeurIPS 2024 Spotlight",
    "url": null,
    "summary": "Largest VCoT dataset. Frames CoT as iterative region-zoom rather than text-only reasoning.",
    "candidate_bill": null,
    "candidate_meta_cost": null,
    "verdict": "needs_gate",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "Largest VCoT dataset. Frames CoT as iterative region-zoom rather than text-only reasoning.",
    "_appeared_in_sweeps": [
      "sweep_307_tool_augmented"
    ]
  },
  {
    "paper_id": "visual_sketchpad_2024",
    "title": "Visual Sketchpad: Sketching as a Visual Chain of Thought for Multimodal Language Models",
    "authors": [
      "Hu",
      "Shi",
      "Chen",
      "Tian",
      "et al. (UW)"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": "NeurIPS 2024",
    "url": null,
    "summary": "NeurIPS 2024 spotlight. Reframes VCoT as a tool-using-pen problem: VLM uses external rendering tools to think.",
    "candidate_bill": null,
    "candidate_meta_cost": null,
    "verdict": "needs_gate",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "NeurIPS 2024 spotlight. Reframes VCoT as a tool-using-pen problem: VLM uses external rendering tools to think.",
    "_appeared_in_sweeps": [
      "sweep_307_tool_augmented"
    ]
  },
  {
    "paper_id": "visualagentbench_2024",
    "title": "VisualAgentBench: Towards Large Multimodal Models as Visual Foundation Agents",
    "authors": [
      "Liu",
      "Yu",
      "Xu",
      "et al. (THUDM)"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": null,
    "url": null,
    "summary": "Unified visual-agent benchmark spanning embodied + GUI + design. 18-model comparison.",
    "candidate_bill": null,
    "candidate_meta_cost": null,
    "verdict": "needs_gate",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "Unified visual-agent benchmark spanning embodied + GUI + design. 18-model comparison.",
    "_appeared_in_sweeps": [
      "sweep_307_tool_augmented"
    ]
  },
  {
    "paper_id": "visualwebarena_2024",
    "title": "VisualWebArena: Evaluating Multimodal Agents on Realistic Visually Grounded Web Tasks",
    "authors": [
      "Koh",
      "Lo",
      "Jang",
      "et al."
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": null,
    "url": null,
    "summary": "Visually grounded web-agent benchmark. Forces vision (unlike WebArena). Quantifies SoM marginal gain precisely.",
    "candidate_bill": null,
    "candidate_meta_cost": null,
    "verdict": "needs_gate",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "Visually grounded web-agent benchmark. Forces vision (unlike WebArena). Quantifies SoM marginal gain precisely.",
    "_appeared_in_sweeps": [
      "sweep_307_tool_augmented"
    ]
  },
  {
    "paper_id": "vitamin_2024",
    "title": "ViTamin: Designing Scalable Vision Models in the Vision-Language Era",
    "authors": [
      "Chen",
      "Liu",
      "Yuille"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": "CVPR 2024",
    "url": "https://arxiv.org/abs/2404.02132",
    "summary": "",
    "candidate_bill": null,
    "candidate_meta_cost": null,
    "verdict": "needs_gate",
    "confidence": 0.7,
    "watchlist_tier": null,
    "architecture_class": "Hybrid CNN-Transformer architecture optimized for vision-language",
    "_appeared_in_sweeps": [
      "sweep_306_cross_vlm_arch"
    ]
  },
  {
    "paper_id": "vl_cosi_2024",
    "title": "VL-Cosi / VL-CheckList: Compositional / Interleaved Vision-Language Probes",
    "authors": [
      "Zhao et al. (extended 2024)"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2022",
    "venue": null,
    "url": null,
    "summary": "Diagnostic for object/attribute/relation generalization across image pairs.",
    "candidate_bill": null,
    "candidate_meta_cost": null,
    "verdict": "needs_gate",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "Diagnostic for object/attribute/relation generalization across image pairs.",
    "_appeared_in_sweeps": [
      "sweep_304_multi_image_video"
    ]
  },
  {
    "paper_id": "vl_rewardbench_2024",
    "title": "VL-RewardBench: A Challenging Benchmark for Vision-Language Generative Reward Models",
    "authors": [
      "Li",
      "Wang",
      "Chen",
      "et al."
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": "CVPR 2025",
    "url": null,
    "summary": "Bill_4 \u2605 canonical: VLMs fail visual perception, then tool/training-augmentation rescues. Reward-modeling is a distinct evaluator-tool capability.",
    "candidate_bill": null,
    "candidate_meta_cost": null,
    "verdict": "needs_gate",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "Bill_4 \u2605 canonical: VLMs fail visual perception, then tool/training-augmentation rescues. Reward-modeling is a distinct evaluator-tool capability.",
    "_appeared_in_sweeps": [
      "sweep_307_tool_augmented"
    ]
  },
  {
    "paper_id": "vlm_survey_jina_2024",
    "title": "Vision Encoders in Vision-Language Models: A Survey",
    "authors": [
      "Han Xiao (Jina AI / Elastic)"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": "Jina AI 2024",
    "url": "https://jina.ai/vision-encoder-survey.pdf",
    "summary": "",
    "candidate_bill": null,
    "candidate_meta_cost": null,
    "verdict": "needs_gate",
    "confidence": 0.7,
    "watchlist_tier": null,
    "architecture_class": "Taxonomy survey",
    "_appeared_in_sweeps": [
      "sweep_306_cross_vlm_arch"
    ]
  },
  {
    "paper_id": "vstar_2023",
    "title": "V*: Guided Visual Search as a Core Mechanism in Multimodal LLMs",
    "authors": [
      "Wu",
      "Xie (UCSD/NYU)"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2023",
    "venue": "CVPR 2024",
    "url": null,
    "summary": "V*Bench is the canonical benchmark for the visual-search-tool gap.",
    "candidate_bill": null,
    "candidate_meta_cost": null,
    "verdict": "needs_gate",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "V*Bench is the canonical benchmark for the visual-search-tool gap.",
    "_appeared_in_sweeps": [
      "sweep_307_tool_augmented"
    ]
  },
  {
    "paper_id": "webvoyager_2024",
    "title": "WebVoyager: Building an End-to-End Web Agent with Large Multimodal Models",
    "authors": [
      "He",
      "Jiang",
      "Xu",
      "et al."
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": "ACL 2024",
    "url": null,
    "summary": "Foundational closed-loop web agent paper. SoM as the canonical vision-grounding tool for interactive web.",
    "candidate_bill": null,
    "candidate_meta_cost": null,
    "verdict": "needs_gate",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "Foundational closed-loop web agent paper. SoM as the canonical vision-grounding tool for interactive web.",
    "_appeared_in_sweeps": [
      "sweep_307_tool_augmented"
    ]
  },
  {
    "paper_id": "x_ai_2024_realworldqa",
    "title": "RealWorldQA: A Real-World Spatial Understanding Benchmark for Multimodal Models",
    "authors": [
      "xAI"
    ],
    "affiliations": [
      "xAI"
    ],
    "country_region": "US",
    "date": "2024-04",
    "venue": "Hugging Face Datasets / xAI release",
    "url": "https://huggingface.co/datasets/xai-org/RealworldQA",
    "summary": "Vendor-released real-world spatial-understanding benchmark by xAI alongside Grok-1.5V. 765 images of driving / dashboard / first-person scenes with spatial-reasoning multiple-choice questions. Grok-1.5V 68.7%, GPT-4V 61.4%, Claude 3 Opus 49.8%, Gemini 1.5 Pro 67.5%. Closure mechanism: Bill_9 unpaid (vendor-self-eval), Bill_11 cousin (anti-saturation by held-out spatial scenes).",
    "candidate_bill": null,
    "candidate_meta_cost": "M6",
    "verdict": "needs_gate",
    "confidence": 0.85,
    "watchlist_tier": null,
    "model_family": "RealWorldQA",
    "training_compute_disclosed": null,
    "test_time_compute_mode": null,
    "rebuttal_papers": [],
    "notes": "765 questions, real-world driving and first-person scenes. Vendor-released as Grok-1.5V launch artifact \u2014 Bill_9 (vendor-self-eval) explicitly unpaid. By Q4 2024, reported frontier scores were: Claude 3.5 Sonnet 60.1%, GPT-4o 75.4%, Gemini 1.5 Pro 67.5%. xAI subsequently lost frontier position on its own benchmark.",
    "architecture_class": "benchmark",
    "data_mixture": "spatial_real_world",
    "tokenizer": null,
    "claimed_chinchilla_ratio": null,
    "_appeared_in_sweeps": [
      "sweep_302_mmmu_blink"
    ]
  },
  {
    "paper_id": "xai:grok15v_2024_04",
    "title": "Grok-1.5 Vision Card",
    "authors": [
      "xAI"
    ],
    "affiliations": [
      "xAI"
    ],
    "country_region": "US",
    "date": "2024-04-12",
    "venue": "xAI release",
    "url": "https://x.ai/blog/grok-1.5v",
    "summary": "Grok-1.5V card. Reports MMMU 53.6, MathVista 52.8, ChartQA 76.1, DocVQA 85.6, RealWorldQA 68.7. Engages Bill_11 (RealWorldQA introduced as new spatial-reasoning benchmark). Does NOT engage Bill_1, Bill_2, Bill_3, Bill_4, Bill_5 (closed), Bill_7, Bill_9, Bill_12.",
    "candidate_bill": "Bill_11",
    "candidate_meta_cost": "M5",
    "verdict": "known_bill",
    "confidence": 0.65,
    "watchlist_tier": "annual",
    "model_family": "Grok",
    "benchmarks": [
      "MMMU",
      "MathVista",
      "ChartQA",
      "DocVQA",
      "RealWorldQA"
    ],
    "notes": "Introduces RealWorldQA \u2014 new held-out benchmark for spatial reasoning = Bill_11 engagement. Otherwise closed weights \u2192 many bills unpaid.",
    "_appeared_in_sweeps": [
      "sweep_301_vendor_cards"
    ]
  },
  {
    "paper_id": "xai:grok2_vision_2024_12",
    "title": "Grok-2 Vision Update",
    "authors": [
      "xAI"
    ],
    "affiliations": [
      "xAI"
    ],
    "country_region": "US",
    "date": "2024-12-11",
    "venue": "xAI release",
    "url": "https://x.ai/blog/grok-2",
    "summary": "Grok-2 vision integration. Reports MMMU 66.1, MathVista 69.0, RealWorldQA 75.1. Engages Bill_11 (RealWorldQA continuation). Does NOT engage Bill_1, Bill_2, Bill_3, Bill_4, Bill_5, Bill_7, Bill_9, Bill_12.",
    "candidate_bill": "Bill_11",
    "candidate_meta_cost": "M5",
    "verdict": "known_bill",
    "confidence": 0.6,
    "watchlist_tier": "annual",
    "model_family": "Grok",
    "benchmarks": [
      "MMMU",
      "MathVista",
      "RealWorldQA"
    ],
    "notes": "Closed weights, limited audit. RealWorldQA = Bill_11 substrate.",
    "_appeared_in_sweeps": [
      "sweep_301_vendor_cards"
    ]
  },
  {
    "paper_id": "zhipu:glm4v_2024_08",
    "title": "GLM-4V \u2014 Zhipu AI Multimodal Model Card",
    "authors": [
      "Zhipu AI / Tsinghua KEG"
    ],
    "affiliations": [
      "Zhipu AI",
      "Tsinghua KEG"
    ],
    "country_region": "CN",
    "date": "2024-08-12",
    "venue": "Zhipu AI / arXiv 2406.12793",
    "url": "https://arxiv.org/abs/2406.12793",
    "summary": "GLM-4V 9B / 13B with EVA-CLIP vision encoder. Reports MMMU 47.2, MathVista 51.1, ChartQA 81.1, DocVQA 87.6. Engages Bill_5, Bill_12. Does NOT engage Bill_1, Bill_2, Bill_3, Bill_4, Bill_7, Bill_8, Bill_9.",
    "candidate_bill": "Bill_5",
    "candidate_meta_cost": "M5",
    "verdict": "known_bill",
    "confidence": 0.7,
    "watchlist_tier": "annual",
    "model_family": "GLM-4V",
    "benchmarks": [
      "MMMU",
      "MathVista",
      "ChartQA",
      "DocVQA"
    ],
    "notes": "EVA-CLIP encoder (variant of CLIP) \u2014 Bill_5 portability evidence. Open-weight = Bill_12.",
    "_appeared_in_sweeps": [
      "sweep_301_vendor_cards"
    ]
  }
]