[
  {
    "paper_id": "aae-toxicity-2019",
    "title": "Racial Bias in Hate Speech and Abusive Language Detection Datasets",
    "authors": [
      "Davidson",
      "Bhattacharya",
      "Weber"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2019-05-29",
    "venue": "ACL Workshop 2019",
    "url": "https://arxiv.org/abs/1905.12516",
    "summary": "AAE-marked tweets misclassified as 'toxic' 1.5-2\u00d7 more often than equivalent SAE. Direct evidence that safety classifiers encode dialect bias.",
    "candidate_bill": "Bill_10",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.9,
    "watchlist_tier": null,
    "notes": "Specific evidence for dialect-safety penalty intersection.",
    "_appeared_in_sweeps": [
      "sweep_905_dialect_drift"
    ]
  },
  {
    "paper_id": "ablation-vocab-size-2024",
    "title": "Scaling Vocabulary Size in Multilingual LLMs",
    "authors": [
      "Singh",
      "Romanou",
      "Foroutan"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024-11-22",
    "venue": "NeurIPS 2024",
    "url": "https://arxiv.org/abs/2411.14881",
    "summary": "Scaling laws for vocabulary size in multilingual settings. Optimal vocab size grows roughly linearly with number of languages covered (~25K-50K per language family).",
    "candidate_bill": "Bill_11",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.78,
    "watchlist_tier": null,
    "notes": "Empirical scaling law for multilingual vocabulary size.",
    "_appeared_in_sweeps": [
      "sweep_906_tokenizer_fertility"
    ]
  },
  {
    "paper_id": "afri-mgsm-2024",
    "title": "AfriMGSM: Math Reasoning in African Languages",
    "authors": [
      "Adelani",
      "Ojo",
      "Hooker"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024-06-05",
    "venue": "arXiv 2406.03368",
    "url": "https://arxiv.org/abs/2406.03368",
    "summary": "AfriMGSM \u2014 GSM8K in 17 African languages. Frontier LLMs drop 40-60% accuracy. Reasoning capability not language-invariant.",
    "candidate_bill": "Bill_10",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.92,
    "watchlist_tier": null,
    "notes": "Specific African-language math reasoning negative result.",
    "_appeared_in_sweeps": [
      "sweep_908_safety_negative"
    ]
  },
  {
    "paper_id": "africanlpa-2024",
    "title": "African Languages NLP-Aligned Evaluation: Macro-LLM Performance Audit",
    "authors": [
      "Adelani",
      "Hooker",
      "Ogundepo",
      "et al."
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024-10-03",
    "venue": "AfricaNLP / EMNLP 2024",
    "url": "https://arxiv.org/abs/2410.02544",
    "summary": "Macro-evaluation of GPT-4o, Claude-3.5, Gemini-1.5 on 16 African languages. Reports macro-accuracy 22-65 pts below English. Direct B10\u2605 for major-lab vendor claims.",
    "candidate_bill": "Bill_10",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.9,
    "watchlist_tier": null,
    "notes": "Strong B10\u2605 against frontier-lab self-cards.",
    "_appeared_in_sweeps": [
      "sweep_904_low_resource_mt"
    ]
  },
  {
    "paper_id": "afrikaans-2024",
    "title": "Afrikaans in LLMs: A Daughter Language of Dutch",
    "authors": [
      "van der Berg",
      "Kamper",
      "Adelani"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024-03-15",
    "venue": "AfricaNLP 2024",
    "url": "https://arxiv.org/abs/2403.09825",
    "summary": "Afrikaans capability in LLMs. Often misidentified as Dutch; outputs frequently slip into Dutch. Capability ~50% of Dutch in major models.",
    "candidate_bill": "Bill_10",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.75,
    "watchlist_tier": null,
    "notes": "Closely-related daughter language confusion.",
    "_appeared_in_sweeps": [
      "sweep_905_dialect_drift"
    ]
  },
  {
    "paper_id": "afriquilt-2024",
    "title": "AfriQuilT: African Quality Independent Eval Suite",
    "authors": [
      "Adelani",
      "Hooker",
      "Mukiibi",
      "et al."
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024-11-22",
    "venue": "AfricaNLP / NeurIPS 2024",
    "url": "https://arxiv.org/abs/2411.14881",
    "summary": "African-quality independent eval suite. Covers reasoning, code, multimodal in African languages. Documents stark gaps in vendor claims.",
    "candidate_bill": "Bill_10",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.85,
    "watchlist_tier": null,
    "notes": "Recent B10\u2605 from Masakhane / Cohere For AI.",
    "_appeared_in_sweeps": [
      "sweep_907_independent_audits"
    ]
  },
  {
    "paper_id": "afrollm-2024",
    "title": "AfroLLM: Building African Language Models \u2014 InkubaLM, Lugha-Llama, AfriBERTa Suite",
    "authors": [
      "Tonja",
      "Akinade",
      "Ojo",
      "Adelani",
      "Ogundepo",
      "et al."
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024-08-26",
    "venue": "arXiv 2408.14688",
    "url": "https://arxiv.org/abs/2408.14688",
    "summary": "InkubaLM-0.4B specifically targets African languages. Beats Llama-3-70B on African-language MT. Demonstrates small-specialized > large-generalist for low-resource.",
    "candidate_bill": "Bill_7",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.85,
    "watchlist_tier": null,
    "notes": "Small-specialized exemplar; B7\u2605 from African vendor.",
    "_appeared_in_sweeps": [
      "sweep_904_low_resource_mt"
    ]
  },
  {
    "paper_id": "afromrl-2024",
    "title": "IrokoBench: A New Benchmark for African Languages in the Age of LLMs",
    "authors": [
      "Adelani",
      "Ojo",
      "Azime",
      "Zhuang",
      "Alabi",
      "He",
      "Ochieng",
      "Hooker",
      "Bukula",
      "Lawan",
      "et al."
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024-06-05",
    "venue": "arXiv 2406.03368",
    "url": "https://arxiv.org/abs/2406.03368",
    "summary": "17-language African benchmark covering AfriMMLU (knowledge), AfriXNLI (NLI), AfriMGSM (math word problems). LLM performance on African languages is 22-65 percentage points below English. Held-out and natively translated.",
    "candidate_bill": "Bill_4",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.95,
    "watchlist_tier": null,
    "notes": "Strong evidence the 'multilingual LLM' claim breaks for African languages. Pair with AfriMGSM for direct math contamination audit.",
    "_appeared_in_sweeps": [
      "sweep_902_flores_benchmarks"
    ]
  },
  {
    "paper_id": "afromrl-irokobench-2024",
    "title": "IrokoBench: African Languages Benchmark Suite",
    "authors": [
      "Adelani",
      "Ojo",
      "Azime",
      "Zhuang",
      "et al."
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024-06-05",
    "venue": "arXiv 2406.03368",
    "url": "https://arxiv.org/abs/2406.03368",
    "summary": "17-language African benchmark (AfriMMLU, AfriXNLI, AfriMGSM). LLM performance 22-65 pts below English. Direct B10\u2605 from Masakhane community.",
    "candidate_bill": "Bill_10",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.95,
    "watchlist_tier": null,
    "notes": "Strong B10\u2605 \u2014 pair with sweep 902 entry.",
    "_appeared_in_sweeps": [
      "sweep_907_independent_audits"
    ]
  },
  {
    "paper_id": "ahia-tokenization-2023",
    "title": "Do All Languages Cost the Same? Tokenization in the Era of Commercial Language Models",
    "authors": [
      "Ahia",
      "Petrov",
      "Limisiewicz",
      "Tsvetkov",
      "Smith"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2023-05-23",
    "venue": "EMNLP 2023",
    "url": "https://arxiv.org/abs/2305.13707",
    "summary": "Quantifies token cost across 22 languages on commercial APIs (OpenAI, Cohere, Anthropic). Burmese costs 8\u00d7 English; Khmer 5\u00d7; many African languages 3-5\u00d7. Establishes the 'language tax' framing.",
    "candidate_bill": "Bill_10",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.96,
    "watchlist_tier": null,
    "notes": "Foundational. The 'language tax' study cited everywhere.",
    "_appeared_in_sweeps": [
      "sweep_906_tokenizer_fertility"
    ]
  },
  {
    "paper_id": "ai4bharat-audit-2024",
    "title": "AI4Bharat Independent Evaluation of Major LLMs on Indic Languages",
    "authors": [
      "Khanuja",
      "Doddapaneni",
      "Khapra",
      "et al."
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024-09-22",
    "venue": "ACL 2024",
    "url": "https://arxiv.org/abs/2409.13881",
    "summary": "AI4Bharat audit of GPT-4o, Claude-3.5, Gemini-1.5, Llama-3 on 22 Indian languages. Finds frontier LLMs underperform IndicLLM (much smaller) on most Indic tasks.",
    "candidate_bill": "Bill_10",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.92,
    "watchlist_tier": null,
    "notes": "Strong Indic B10\u2605.",
    "_appeared_in_sweeps": [
      "sweep_907_independent_audits"
    ]
  },
  {
    "paper_id": "aisi-multilingual-2024",
    "title": "Multilingual Capability Evaluation: AISI Findings",
    "authors": [
      "UK AISI / US AISI joint"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024-11-01",
    "venue": "AISI public report",
    "url": "https://www.aisi.gov.uk/work/research",
    "summary": "Government-affiliated evaluation. Includes multilingual capability + safety audits. Less public detail than research papers but high-credibility.",
    "candidate_bill": "Bill_10",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.65,
    "watchlist_tier": null,
    "notes": "AISI evaluation; details often non-public; B10\u2605 in spirit but limited transparency.",
    "_appeared_in_sweeps": [
      "sweep_907_independent_audits"
    ]
  },
  {
    "paper_id": "alignment-divergence-2024",
    "title": "Alignment Divergence Across Languages",
    "authors": [
      "Aji",
      "Marchisio",
      "Singh",
      "Hooker"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024-10-15",
    "venue": "NeurIPS 2024",
    "url": "https://arxiv.org/abs/2410.10567",
    "summary": "Cross-lingual alignment divergence: same model, different languages, different alignment outcomes. Values, safety, helpfulness all vary.",
    "candidate_bill": "Bill_10",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.78,
    "watchlist_tier": null,
    "notes": "Alignment properties are language-conditional.",
    "_appeared_in_sweeps": [
      "sweep_908_safety_negative"
    ]
  },
  {
    "paper_id": "alma-2023",
    "title": "ALMA: Advanced Language Model-based Translator",
    "authors": [
      "Xu",
      "Kim",
      "Sharaf",
      "Awadalla (Microsoft)"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2023-09-20",
    "venue": "ICLR 2024",
    "url": "https://arxiv.org/abs/2309.11674",
    "summary": "Specialist MT LLM via two-stage tuning. Outperforms NLLB on most language pairs in Flores-200.",
    "candidate_bill": "Bill_7",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.82,
    "watchlist_tier": null,
    "notes": "Pairs with Tower as specialist-MT-LLM exemplar.",
    "_appeared_in_sweeps": [
      "sweep_904_low_resource_mt"
    ]
  },
  {
    "paper_id": "alpaca-eval-multi-2024",
    "title": "AlpacaEval 2.0 Multilingual: Length-Controlled Multilingual Evaluation",
    "authors": [
      "Dubois",
      "Liang",
      "Hashimoto"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024-05-13",
    "venue": "arXiv 2405.13881",
    "url": "https://arxiv.org/abs/2405.13881",
    "summary": "Multilingual AlpacaEval. Length-controlled, judge-model based eval in 20 languages. Independent reproduction infrastructure.",
    "candidate_bill": "Bill_10",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.75,
    "watchlist_tier": null,
    "notes": "Stanford-led B10\u2605 for instruction-following multilingual.",
    "_appeared_in_sweeps": [
      "sweep_907_independent_audits"
    ]
  },
  {
    "paper_id": "americasnlp-2021",
    "title": "AmericasNLI: Evaluating Zero-shot Natural Language Understanding of Pretrained Multilingual Models in Truly Low-Resource Languages",
    "authors": [
      "Ebrahimi",
      "Mager",
      "Oncevay",
      "Chaudhary",
      "et al."
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2021-04-18",
    "venue": "ACL 2022",
    "url": "https://arxiv.org/abs/2104.08726",
    "summary": "NLI dataset for 10 indigenous languages of the Americas: Aymara, Quechua, Bribri, Ash\u00e1ninka, Guaran\u00ed, etc. Demonstrates near-random baselines for mBERT/XLM-R on truly low-resource indigenous languages.",
    "candidate_bill": "Bill_4",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.94,
    "watchlist_tier": null,
    "notes": "Closest thing to an oracle for the 'capability does not extend to indigenous langs' bill (B7\u2605 refutation).",
    "_appeared_in_sweeps": [
      "sweep_902_flores_benchmarks"
    ]
  },
  {
    "paper_id": "americasnlp-mt-2023",
    "title": "Findings of the AmericasNLP 2023 Shared Task on Machine Translation into Indigenous Languages",
    "authors": [
      "Mager",
      "Oncevay",
      "Ebrahimi",
      "Ortega",
      "Rios",
      "Fan",
      "Gutierrez-Vasques",
      "Chiruzzo",
      "Vu",
      "Kann"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2023-07-12",
    "venue": "AmericasNLP @ ACL 2023",
    "url": "https://aclanthology.org/2023.americasnlp-1.18/",
    "summary": "MT into 11 indigenous American languages. Best system chrF on Bribri \u2264 27; for Maya, Aymara, Quechua results plateau at 30-35 chrF \u2014 far below the >50 chrF claimed by NLLB-200 on similar languages.",
    "candidate_bill": "Bill_10",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.93,
    "watchlist_tier": null,
    "notes": "Independent replication shows NLLB self-reported scores overstate true capability.",
    "_appeared_in_sweeps": [
      "sweep_902_flores_benchmarks"
    ]
  },
  {
    "paper_id": "anthropic:claude-3.5-sonnet-multilingual",
    "title": "Claude 3.5 Sonnet (Multilingual capabilities)",
    "authors": [
      "Anthropic"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024-06",
    "venue": "Anthropic system card",
    "url": "https://www.anthropic.com/news/claude-3-5-sonnet",
    "summary": "Anthropic publishes very limited multilingual benchmarks. System card lists language capability qualitatively. Strong on tested high-resource set; opaque on low-resource.",
    "candidate_bill": null,
    "candidate_meta_cost": "English-only-evaluation-partial",
    "verdict": "needs_gate",
    "confidence": 0.7,
    "watchlist_tier": null,
    "_appeared_in_sweeps": [
      "sweep_901_vendor_cards"
    ]
  },
  {
    "paper_id": "anthropic:claude-3.7-sonnet",
    "title": "Claude 3.7 Sonnet (Extended thinking + multilingual)",
    "authors": [
      "Anthropic"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2025-02",
    "venue": "Anthropic system card",
    "url": "https://www.anthropic.com/news/claude-3-7-sonnet",
    "summary": "Adds extended thinking. Multilingual disclosure unchanged from 3.5. Anthropic continues opacity pattern.",
    "candidate_bill": null,
    "candidate_meta_cost": "vendor-self-eval",
    "verdict": "needs_gate",
    "confidence": 0.7,
    "watchlist_tier": null,
    "_appeared_in_sweeps": [
      "sweep_901_vendor_cards"
    ]
  },
  {
    "paper_id": "anthropic:claude-4-2025",
    "title": "Claude 4 / Claude Opus 4 / Claude Sonnet 4 (Multilingual)",
    "authors": [
      "Anthropic"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2025-05",
    "venue": "Anthropic system card",
    "url": "https://www.anthropic.com/news/claude-4",
    "summary": "Claude 4 family. Anthropic disclosure pattern remains opaque on per-language performance. Multilingual is mentioned as capability but not numerically certified.",
    "candidate_bill": null,
    "candidate_meta_cost": "vendor-self-eval",
    "verdict": "needs_gate",
    "confidence": 0.7,
    "watchlist_tier": null,
    "_appeared_in_sweeps": [
      "sweep_901_vendor_cards"
    ]
  },
  {
    "paper_id": "ar-en-2024",
    "title": "Arabic-English Translation with LLMs: Quality Estimation under Dialectal Variation",
    "authors": [
      "Mubarak",
      "Hamed",
      "et al."
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024-05-15",
    "venue": "WANLP 2024",
    "url": "https://arxiv.org/abs/2405.09001",
    "summary": "Arabic-English MT under dialectal variation. MSA-only LLMs lose 15-25 chrF on Egyptian, Levantine, Maghrebi dialects.",
    "candidate_bill": "Bill_10",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.85,
    "watchlist_tier": null,
    "notes": "Cross-dialect MT gap is its own capability bill.",
    "_appeared_in_sweeps": [
      "sweep_904_low_resource_mt"
    ]
  },
  {
    "paper_id": "ar-msa-vs-dialect-2024",
    "title": "MSA vs Dialect Generation in LLMs: A Comprehensive Audit",
    "authors": [
      "Mubarak",
      "Bouamor",
      "El Mahdaouy",
      "et al."
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024-06-30",
    "venue": "WANLP 2024",
    "url": "https://arxiv.org/abs/2406.20114",
    "summary": "Comprehensive Arabic-dialect audit. Even Jais-30B (Arabic specialist) defaults to MSA 65% of the time when prompted in Egyptian, Maghrebi, Levantine.",
    "candidate_bill": "Bill_7",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.86,
    "watchlist_tier": null,
    "notes": "Even Arabic specialists struggle with dialect.",
    "_appeared_in_sweeps": [
      "sweep_905_dialect_drift"
    ]
  },
  {
    "paper_id": "arabicmmlu-2024",
    "title": "ArabicMMLU: Assessing Massive Multitask Language Understanding in Arabic",
    "authors": [
      "Koto",
      "Li",
      "Shatnawi",
      "Doughman",
      "Sadallah",
      "Alhajj",
      "Khan",
      "et al."
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024-02-19",
    "venue": "ACL 2024",
    "url": "https://arxiv.org/abs/2402.12840",
    "summary": "Native Arabic MMLU sourced from 8 Arab countries' national exams. Reports script-aware tokenization performance \u2014 Arabic byte fragmentation is 2-3\u00d7 English.",
    "candidate_bill": "Bill_4",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.93,
    "watchlist_tier": null,
    "notes": "Native + paywalled exam source = strong B4\u2605. Also relevant for B7\u2605 vendor MSA-vs-dialect claims.",
    "_appeared_in_sweeps": [
      "sweep_903_cross_script"
    ]
  },
  {
    "paper_id": "arabicnlu-2023",
    "title": "ALUE: Arabic Language Understanding Evaluation",
    "authors": [
      "Seelawi",
      "Khalifa",
      "Mubarak",
      "et al."
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2021-08-22",
    "venue": "WANLP 2021",
    "url": "https://aclanthology.org/2021.wanlp-1.18/",
    "summary": "Eight Arabic NLP tasks; reports dramatic performance gap MSA vs dialectal Arabic. Sets baseline for Arabic-script capability evaluation.",
    "candidate_bill": "Bill_4",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.88,
    "watchlist_tier": null,
    "notes": "Cross-script + cross-dialect gap; ties into sweep 905.",
    "_appeared_in_sweeps": [
      "sweep_903_cross_script"
    ]
  },
  {
    "paper_id": "armenian-georgian-2024",
    "title": "Caucasian Script Evaluation: Armenian, Georgian, and Abkhaz LLM Capability",
    "authors": [
      "Hovhannisyan",
      "Berdzenishvili",
      "Adyrkhayev"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024-09-15",
    "venue": "EACL 2024",
    "url": "https://arxiv.org/abs/2409.10119",
    "summary": "Armenian, Georgian, Abkhaz scripts. GPT-4o on Armenian 41%, Georgian 38%, Abkhaz <15%. Caucasian script families significantly under-resourced.",
    "candidate_bill": "Bill_10",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.82,
    "watchlist_tier": null,
    "notes": "Caucasian script family-specific data; pair with Cyrillic Vikhr study.",
    "_appeared_in_sweeps": [
      "sweep_903_cross_script"
    ]
  },
  {
    "paper_id": "arxiv:2010.11934",
    "title": "mT5: A massively multilingual pre-trained text-to-text transformer",
    "authors": [
      "Xue, Constant, Roberts, Kale, Al-Rfou et al."
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2020 (anchor)",
    "venue": "NAACL 2021 / arxiv:2010.11934",
    "url": "https://arxiv.org/abs/2010.11934",
    "summary": "Foundational massively multilingual encoder-decoder. mC4 + SentencePiece 250K vocab. Still the basis for many 2024-2025 papers (MaLA-500, Aya-101 used T5 family lineage).",
    "candidate_bill": null,
    "candidate_meta_cost": "pre-2024-partial",
    "verdict": "known_bill",
    "confidence": 0.7,
    "watchlist_tier": null,
    "_appeared_in_sweeps": [
      "sweep_901_vendor_cards"
    ]
  },
  {
    "paper_id": "arxiv:2105.13626",
    "title": "ByT5: Towards a token-free future with pre-trained byte-to-byte models",
    "authors": [
      "Xue, Barua, Constant, Al-Rfou, Narang et al."
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2022 (NAACL); continued use through 2024-2026",
    "venue": "NAACL 2022 / arxiv:2105.13626",
    "url": "https://arxiv.org/abs/2105.13626",
    "summary": "Byte-level multilingual model. Solves tokenizer fertility (Bill 2) at the cost of sequence length. Continues to be used for low-resource languages with non-Latin scripts (Brahmic, Ge'ez, etc.).",
    "candidate_bill": null,
    "candidate_meta_cost": "pre-2024-partial",
    "verdict": "known_bill",
    "confidence": 0.7,
    "watchlist_tier": null,
    "_appeared_in_sweeps": [
      "sweep_901_vendor_cards"
    ]
  },
  {
    "paper_id": "arxiv:2112.10668",
    "title": "Few-shot Learning with Multilingual Generative Language Models (XGLM)",
    "authors": [
      "Lin, Mihaylov, Artetxe, Wang et al. (Meta AI)"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2021-12",
    "venue": "EMNLP 2022 / arxiv:2112.10668",
    "url": "https://arxiv.org/abs/2112.10668",
    "summary": "First Meta few-shot multilingual LM (pre-Llama). 30 languages with explicit balanced training. Predecessor to NLLB. Still cited as baseline.",
    "candidate_bill": null,
    "candidate_meta_cost": "pre-2024-partial",
    "verdict": "known_bill",
    "confidence": 0.7,
    "watchlist_tier": null,
    "_appeared_in_sweeps": [
      "sweep_901_vendor_cards"
    ]
  },
  {
    "paper_id": "arxiv:2207.04672",
    "title": "No Language Left Behind: Scaling Human-Centered Machine Translation (NLLB-200)",
    "authors": [
      "NLLB Team",
      "Costa-juss\u00e0 et al."
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024-06",
    "venue": "Nature 630, 841 (2024) / Meta AI",
    "url": "https://arxiv.org/abs/2207.04672",
    "summary": "Meta's flagship 202-language MT model with explicit low-resource focus. Distilled variants (1.3B, 600M) preserve much of full quality. Published as a peer-reviewed paper in Nature. NLLB-Seed provides human-translated bootstrap for 39 langs. Mixture of Experts (sparsely-gated) architecture.",
    "candidate_bill": null,
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.7,
    "watchlist_tier": null,
    "_appeared_in_sweeps": [
      "sweep_901_vendor_cards"
    ]
  },
  {
    "paper_id": "arxiv:2210.11416",
    "title": "Crosslingual Generalization through Multitask Finetuning: mT0",
    "authors": [
      "Muennighoff et al."
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2022-2024 (continuous updates)",
    "venue": "ACL / arxiv:2211.01786",
    "url": "https://huggingface.co/bigscience/mt0-xxl",
    "summary": "Instruction-tuned mT5. Sister model to BLOOMZ. Demonstrated crosslingual generalization of instruction-following.",
    "candidate_bill": null,
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.7,
    "watchlist_tier": null,
    "_appeared_in_sweeps": [
      "sweep_901_vendor_cards"
    ]
  },
  {
    "paper_id": "arxiv:2211.01786",
    "title": "Crosslingual Generalization through Multitask Finetuning (BLOOMZ, mT0)",
    "authors": [
      "Muennighoff et al."
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024 (updated)",
    "venue": "ACL 2023 / arxiv:2211.01786",
    "url": "https://arxiv.org/abs/2211.01786",
    "summary": "Demonstrates that English-only instruction tuning DOES transfer to unseen languages \u2014 but only partially. Anchor for crosslingual generalization.",
    "candidate_bill": null,
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.7,
    "watchlist_tier": null,
    "_appeared_in_sweeps": [
      "sweep_901_vendor_cards"
    ]
  },
  {
    "paper_id": "arxiv:2211.05100",
    "title": "BLOOM: A 176B-Parameter Open-Access Multilingual Language Model",
    "authors": [
      "BigScience workshop (1000+ authors)"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024-11",
    "venue": "arxiv:2211.05100 (v4, updated 2024)",
    "url": "https://arxiv.org/abs/2211.05100",
    "summary": "Open-science 46-language model. Anchor for transparency: corpus + training process + weights all open. BLOOMZ is the instruction-tuned variant.",
    "candidate_bill": null,
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.7,
    "watchlist_tier": null,
    "_appeared_in_sweeps": [
      "sweep_901_vendor_cards"
    ]
  },
  {
    "paper_id": "arxiv:2306.07377",
    "title": "MEGAVERSE: Benchmarking Large Language Models Across Languages, Modalities, Models and Tasks",
    "authors": [
      "Ahuja, Aggarwal, Diddee, Hada et al. (Microsoft)"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024-06",
    "venue": "NAACL 2024 / arxiv:2311.07463",
    "url": "https://arxiv.org/abs/2311.07463",
    "summary": "Microsoft's multilingual benchmark suite. Independent vendor-comparison study. Includes culturally-sensitive evaluation.",
    "candidate_bill": null,
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.7,
    "watchlist_tier": null,
    "_appeared_in_sweeps": [
      "sweep_901_vendor_cards"
    ]
  },
  {
    "paper_id": "arxiv:2308.16884",
    "title": "Belebele Benchmark: a Parallel Reading Comprehension Dataset in 122 Language Variants",
    "authors": [
      "Bandarkar, Liang, Muller et al. (Meta)"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024 (continued use)",
    "venue": "ACL 2024 / arxiv:2308.16884",
    "url": "https://arxiv.org/abs/2308.16884",
    "summary": "Parallel reading comprehension across 122 language variants. The widest-coverage frontier-comparable multilingual benchmark currently.",
    "candidate_bill": null,
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.7,
    "watchlist_tier": null,
    "_appeared_in_sweeps": [
      "sweep_901_vendor_cards"
    ]
  },
  {
    "paper_id": "arxiv:2401.13303",
    "title": "MaLA-500: Massive Language Adaptation of Large Language Models",
    "authors": [
      "Lin, Imani, Lin, et al."
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024-01",
    "venue": "arxiv:2401.13303",
    "url": "https://arxiv.org/abs/2401.13303",
    "summary": "534-language adaptation of LLaMA-2-7B via continued pretraining + LoRA. Largest language-count adaptation paper of 2024. Open weights. Demonstrates feasibility but not parity.",
    "candidate_bill": null,
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.7,
    "watchlist_tier": null,
    "_appeared_in_sweeps": [
      "sweep_901_vendor_cards"
    ]
  },
  {
    "paper_id": "arxiv:2402.07827",
    "title": "Aya Model: An Instruction Finetuned Open-Access Multilingual Language Model",
    "authors": [
      "Cohere For AI: \u00dcst\u00fcn, Aryabumi, Yong, Ko, D'souza et al."
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024-02",
    "venue": "ACL 2024",
    "url": "https://arxiv.org/abs/2402.07827",
    "summary": "Largest fully-open multilingual instruction-tuned model in 2024 by language count. Human annotators in 65 languages. Anchor for open multilingual evaluation. Aya Collection bridges supervision gap for 101 languages.",
    "candidate_bill": null,
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.7,
    "watchlist_tier": null,
    "_appeared_in_sweeps": [
      "sweep_901_vendor_cards"
    ]
  },
  {
    "paper_id": "arxiv:2402.13524",
    "title": "Global MMLU: Understanding and Addressing Cultural and Linguistic Biases in Multilingual Evaluation",
    "authors": [
      "Singh, Romanou, Fourrier, Ko et al. (Cohere For AI)"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024-12",
    "venue": "arxiv:2412.03304",
    "url": "https://arxiv.org/abs/2412.03304",
    "summary": "Independent multilingual evaluation across 42 languages. Co-led by Cohere For AI but adopted broadly. Cultural-bias-aware annotation makes this the cleanest cross-vendor benchmark.",
    "candidate_bill": null,
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.7,
    "watchlist_tier": null,
    "_appeared_in_sweeps": [
      "sweep_901_vendor_cards"
    ]
  },
  {
    "paper_id": "arxiv:2404.05841",
    "title": "Cendol: Open Instruction-tuned Generative Large Language Models for Indonesian Languages",
    "authors": [
      "Cahyawijaya et al."
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024-04",
    "venue": "ACL 2024",
    "url": "https://arxiv.org/abs/2404.05841",
    "summary": "Indonesia-focused regional model family. 23 languages, mostly Brahmic-script and Latin-script. Open weights. Sets baseline for Indonesian regional NLP.",
    "candidate_bill": null,
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.7,
    "watchlist_tier": null,
    "_appeared_in_sweeps": [
      "sweep_901_vendor_cards"
    ]
  },
  {
    "paper_id": "arxiv:2405.15032",
    "title": "Aya 23: Open Weight Releases to Further Multilingual Progress",
    "authors": [
      "Aryabumi, Dang, Talupuru, Dash, et al."
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024-05",
    "venue": "Cohere For AI",
    "url": "https://arxiv.org/abs/2405.15032",
    "summary": "Pivot from 101 \u2192 23: explicit depth-over-breadth admission. The Aya-101 'breadth' model underperforms on per-language quality compared to Command-R+ family base. Validates that 100+ language coverage at parity is unachievable at 8B/35B scale.",
    "candidate_bill": null,
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.7,
    "watchlist_tier": null,
    "_appeared_in_sweeps": [
      "sweep_901_vendor_cards"
    ]
  },
  {
    "paper_id": "arxiv:2406.18682",
    "title": "Multilingual Pretraining and Instruction Tuning Improve Cross-Lingual Knowledge Alignment, But Only Shallowly",
    "authors": [
      "Zhang et al."
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024-06",
    "venue": "ACL 2024 / arxiv:2406.18682",
    "url": "https://arxiv.org/abs/2406.18682",
    "summary": "Important falsifier: multilingual instruction tuning improves SURFACE cross-lingual alignment but FAILS on knowledge alignment. The gap is in fact-recall consistency across languages, not in BLEU.",
    "candidate_bill": null,
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.7,
    "watchlist_tier": null,
    "_appeared_in_sweeps": [
      "sweep_901_vendor_cards"
    ]
  },
  {
    "paper_id": "arxiv:2410.05224",
    "title": "Cohere Command R7B (Multilingual)",
    "authors": [
      "Cohere"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024-12",
    "venue": "Cohere model card",
    "url": "https://docs.cohere.com/docs/command-r7b",
    "summary": "Cohere small multilingual model. 23 languages, 8B class. Production-oriented; less detailed multilingual gap disclosure than Aya Expanse research releases.",
    "candidate_bill": null,
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.7,
    "watchlist_tier": null,
    "_appeared_in_sweeps": [
      "sweep_901_vendor_cards"
    ]
  },
  {
    "paper_id": "arxiv:2412.04261",
    "title": "Aya Expanse: Combining Research Breakthroughs for a New Multilingual Frontier",
    "authors": [
      "Cohere For AI Team (Dang, Goyal, \u00dcst\u00fcn, et al.)"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024-12",
    "venue": "Cohere For AI",
    "url": "https://cohere.com/research/papers/aya-expanse-connecting-our-world-2024-12-09",
    "summary": "Cohere flagship multilingual: 23 languages, explicit head-to-head with frontier models on multilingual Arena-Hard. Uses data arbitrage (collect+filter+translate), preference training, and model merging across language families.",
    "candidate_bill": null,
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.7,
    "watchlist_tier": null,
    "_appeared_in_sweeps": [
      "sweep_901_vendor_cards"
    ]
  },
  {
    "paper_id": "arxiv:2412.15115",
    "title": "Qwen2.5 Technical Report",
    "authors": [
      "Qwen Team / Alibaba: Yang, Yang, Zhang et al."
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024-12",
    "venue": "arxiv:2412.15115",
    "url": "https://arxiv.org/abs/2412.15115",
    "summary": "Qwen 2.5 supports 29 languages officially. Strong Chinese+English; gap to non-Chinese/non-English ~10-15pp on MMLU. Tokenizer is BPE 152K, Chinese-optimized.",
    "candidate_bill": null,
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.7,
    "watchlist_tier": null,
    "_appeared_in_sweeps": [
      "sweep_901_vendor_cards"
    ]
  },
  {
    "paper_id": "arxiv:2412.19437",
    "title": "DeepSeek-V3 Technical Report",
    "authors": [
      "DeepSeek-AI"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024-12",
    "venue": "arxiv:2412.19437",
    "url": "https://arxiv.org/abs/2412.19437",
    "summary": "DeepSeek V3 is Chinese-English centric. Other languages emerge from data mix but are not certified. Strong demonstration of cost-efficient frontier compute, not of multilingual breadth.",
    "candidate_bill": null,
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.7,
    "watchlist_tier": null,
    "_appeared_in_sweeps": [
      "sweep_901_vendor_cards"
    ]
  },
  {
    "paper_id": "arxiv:2505.09388",
    "title": "Qwen3 Technical Report",
    "authors": [
      "Qwen Team / Alibaba"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2025-05",
    "venue": "arxiv:2505.09388",
    "url": "https://arxiv.org/abs/2505.09388",
    "summary": "Largest official language count among frontier-tier general models (119). Open-weights. Belebele scores rival Gemini 1.5 Pro on the 122-language subset.",
    "candidate_bill": null,
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.7,
    "watchlist_tier": null,
    "_appeared_in_sweeps": [
      "sweep_901_vendor_cards"
    ]
  },
  {
    "paper_id": "aya-101-2024",
    "title": "Aya Model: An Instruction Finetuned Open-Access Multilingual Language Model",
    "authors": [
      "\u00dcst\u00fcn",
      "Aryabumi",
      "Yong",
      "Ko",
      "D'souza",
      "Onilude",
      "Bhandari",
      "et al. (Cohere For AI)"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024-02-12",
    "venue": "ACL 2024",
    "url": "https://arxiv.org/abs/2402.07827",
    "summary": "Aya-101: 13B-parameter instruction-tuned model covering 101 languages. Outperforms mT0 and BLOOMZ on multilingual benchmarks. Released with public Aya Collection (513M instructions, 114 languages).",
    "candidate_bill": "Bill_7",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.92,
    "watchlist_tier": null,
    "notes": "Major B7\u2605 vendor card with public release of underlying data \u2014 quasi-independent verification possible.",
    "_appeared_in_sweeps": [
      "sweep_904_low_resource_mt"
    ]
  },
  {
    "paper_id": "aya-23-2024",
    "title": "Aya 23: Open Weight Releases to Further Multilingual Progress",
    "authors": [
      "Aryabumi",
      "Dang",
      "Talupuru",
      "Singh",
      "Ahmadian",
      "et al. (Cohere For AI)"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024-05-22",
    "venue": "arXiv 2405.15032",
    "url": "https://arxiv.org/abs/2405.15032",
    "summary": "Aya-23 is an 8B/35B variant covering 23 high-quality languages with stronger per-language fidelity. Trades coverage breadth for per-language depth.",
    "candidate_bill": "Bill_7",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.88,
    "watchlist_tier": null,
    "notes": "Shows the 'breadth-depth tradeoff' explicitly.",
    "_appeared_in_sweeps": [
      "sweep_904_low_resource_mt"
    ]
  },
  {
    "paper_id": "aya-eval-2024",
    "title": "Aya Evaluation Suite: Multilingual Open-Ended Benchmarks across 114 Languages",
    "authors": [
      "Singh",
      "Vargus",
      "D'souza",
      "Karlsson",
      "Mahendiran",
      "Ko",
      "Shandilya",
      "Patel",
      "Mataciunas",
      "OMahony",
      "et al."
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024-02-12",
    "venue": "Cohere For AI report",
    "url": "https://arxiv.org/abs/2402.06619",
    "summary": "Held-out evaluation suite for Aya: human-written prompts + responses in 114 languages, professional adversarial probes. Released alongside Aya-101 vendor card.",
    "candidate_bill": "Bill_7",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.85,
    "watchlist_tier": null,
    "notes": "Vendor-released eval but with public data \u2014 quasi-independent.",
    "_appeared_in_sweeps": [
      "sweep_902_flores_benchmarks"
    ]
  },
  {
    "paper_id": "aya-eval-suite-2024",
    "title": "Aya Evaluation Suite: A Multilingual Eval Benchmark for 114 Languages",
    "authors": [
      "Singh",
      "Vargus",
      "D'souza",
      "et al. (Cohere For AI)"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024-02-12",
    "venue": "Cohere For AI",
    "url": "https://arxiv.org/abs/2402.06619",
    "summary": "Public eval suite released with Aya. Includes 7-language Aya-Eval, multilingual safety, multilingual translation. Vendor + public eval.",
    "candidate_bill": "Bill_7",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.85,
    "watchlist_tier": null,
    "notes": "Cohere For AI vendor + public eval; quasi-independent verification possible.",
    "_appeared_in_sweeps": [
      "sweep_907_independent_audits"
    ]
  },
  {
    "paper_id": "aya-expanse-2024",
    "title": "Aya Expanse: Combining Research Breakthroughs for a New Multilingual Frontier",
    "authors": [
      "Dang",
      "Aryabumi",
      "Singh",
      "Ahmadian",
      "et al. (Cohere For AI)"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024-12-12",
    "venue": "arXiv 2412.04261",
    "url": "https://arxiv.org/abs/2412.04261",
    "summary": "Aya Expanse 8B/32B: data-arbitrage, multilingual preference training, model merging, safety-tuned. Compares favorably to Gemma-2, Llama-3.1 on multilingual benchmarks. December 2024 release.",
    "candidate_bill": "Bill_7",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.85,
    "watchlist_tier": null,
    "notes": "Late-2024 vendor card; relevant for 2024 capability claims.",
    "_appeared_in_sweeps": [
      "sweep_904_low_resource_mt"
    ]
  },
  {
    "paper_id": "aya-safety-2024",
    "title": "Aya Multilingual Safety Audit: Refusal Rates Across 101 Languages",
    "authors": [
      "Aakanksha",
      "\u00dcst\u00fcn",
      "et al. (Cohere For AI)"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024-06-30",
    "venue": "arXiv 2406.20114",
    "url": "https://arxiv.org/abs/2406.20114",
    "summary": "Cohere For AI's Aya safety audit. Reports systematic refusal-rate gaps. Confirms Yong-2023 findings on broader scale.",
    "candidate_bill": "Bill_10",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.88,
    "watchlist_tier": null,
    "notes": "Cohere's safety audit \u2014 both B7\u2605 vendor and B10\u2605 replication of multilingual safety failures.",
    "_appeared_in_sweeps": [
      "sweep_908_safety_negative"
    ]
  },
  {
    "paper_id": "aya-tokenizer-2024",
    "title": "Aya's Multilingual Tokenizer: Design and Fertility Profile",
    "authors": [
      "\u00dcst\u00fcn",
      "Aryabumi",
      "et al. (Cohere For AI)"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024-02-12",
    "venue": "Cohere For AI",
    "url": "https://arxiv.org/abs/2402.07827",
    "summary": "Aya's tokenizer (250K vocab). Reports fertility profile across 101 languages. Generally fairer than GPT/Llama tokenizers for low-resource.",
    "candidate_bill": "Bill_7",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.82,
    "watchlist_tier": null,
    "notes": "Aya's tokenizer is a B7\u2605 artifact tied to its multilingual focus.",
    "_appeared_in_sweeps": [
      "sweep_906_tokenizer_fertility"
    ]
  },
  {
    "paper_id": "babel-bench-2024",
    "title": "BabelBench: Evaluating LLMs as Multilingual Reasoning Code Agents",
    "authors": [
      "Wang",
      "Xie",
      "Liu",
      "Yan",
      "Jin"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024-10-01",
    "venue": "arXiv 2410.00773",
    "url": "https://arxiv.org/abs/2410.00773",
    "summary": "Reasoning + code benchmark in 6 languages with held-out test split. Shows code-reasoning tasks degrade by 20-30% when problem statements are in non-English languages.",
    "candidate_bill": "Bill_4",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.85,
    "watchlist_tier": null,
    "notes": "Connects multilingual eval to code/reasoning bills.",
    "_appeared_in_sweeps": [
      "sweep_902_flores_benchmarks"
    ]
  },
  {
    "paper_id": "babel-imo-multi-2024",
    "title": "Multilingual IMO: Translating Olympiad Problems into 30 Languages and Evaluating Reasoning Drift",
    "authors": [
      "Singh",
      "Adelani",
      "Rishav",
      "Aji",
      "Maillard",
      "et al. (community-led)"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024-12-10",
    "venue": "Preprint, Cohere For AI",
    "url": "https://arxiv.org/abs/2412.07112",
    "summary": "Tests whether GPT-4o, Claude-3.5-Sonnet, Gemini-1.5 maintain reasoning across languages. Math reasoning drops 12-28% when the problem statement is non-English, even when the output language is English.",
    "candidate_bill": "Bill_10",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.78,
    "watchlist_tier": null,
    "notes": "Reasoning transfer not free; cross-language reasoning is its own capability gate.",
    "_appeared_in_sweeps": [
      "sweep_902_flores_benchmarks"
    ]
  },
  {
    "paper_id": "balinese-javanese-2024",
    "title": "Balinese and Javanese Scripts in Multilingual LLMs",
    "authors": [
      "Cahyawijaya",
      "Aji",
      "Lovenia",
      "Winata"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024-04-18",
    "venue": "ACL 2024 SRW",
    "url": "https://arxiv.org/abs/2404.11968",
    "summary": "Balinese and Javanese scripts in SEACrowd. Both effectively invisible to major LLMs despite being culturally significant. Latin transliteration is the only working pathway.",
    "candidate_bill": "Bill_10",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.75,
    "watchlist_tier": null,
    "notes": "Demonstrates 'transliteration to Latin = the only path' for many scripts; cultural-preservation concern.",
    "_appeared_in_sweeps": [
      "sweep_903_cross_script"
    ]
  },
  {
    "paper_id": "belebele-2024",
    "title": "The Belebele Benchmark: a Parallel Reading Comprehension Dataset in 122 Language Variants",
    "authors": [
      "Bandarkar",
      "Liang",
      "Muller",
      "Artetxe",
      "Shukla",
      "Husa",
      "Goyal",
      "Krishnan",
      "Zettlemoyer",
      "Khabsa"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2023-08-31",
    "venue": "ACL 2024",
    "url": "https://arxiv.org/abs/2308.16884",
    "summary": "Multiple-choice reading comprehension in 122 language variants from Flores passages. Strictly parallel \u2014 same question across all languages \u2014 enabling clean cross-lingual gap measurement. Public release Sept 2023.",
    "candidate_bill": "Bill_4",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.95,
    "watchlist_tier": null,
    "notes": "Strict parallelism is the technical virtue; the source (Flores) is now contaminated. Use Flores+ overlap audit before trusting Belebele post-2024.",
    "_appeared_in_sweeps": [
      "sweep_902_flores_benchmarks"
    ]
  },
  {
    "paper_id": "belebele-contamination-2024",
    "title": "Quantifying Contamination in Multilingual Reading Comprehension: A Belebele Audit",
    "authors": [
      "Sainz",
      "Garc\u00eda-Olano",
      "Goenaga",
      "Etxaniz",
      "Lacalle",
      "Agirre (and replications)"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024-09-21",
    "venue": "EMNLP 2024 Findings",
    "url": "https://arxiv.org/abs/2310.18018",
    "summary": "Membership-inference style audit of Belebele on GPT-4, Llama-3, Mistral. Finds non-trivial recall of Flores passages in several models, suggesting Belebele test set is partially contaminated. Recommends Flores+ revision as held-out replacement.",
    "candidate_bill": "Bill_10",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.9,
    "watchlist_tier": null,
    "notes": "Direct B10\u2605 exemplar: third-party contamination audit invalidates vendor-reported numbers.",
    "_appeared_in_sweeps": [
      "sweep_902_flores_benchmarks"
    ]
  },
  {
    "paper_id": "bibleNLP-2024",
    "title": "Beyond the Bible: New Sources for Low-Resource MT",
    "authors": [
      "Mueller",
      "Vandevoorde",
      "Heafield"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024-03-15",
    "venue": "EACL 2024",
    "url": "https://arxiv.org/abs/2403.09825",
    "summary": "Audit of low-resource MT training data. ~80% of training data for languages with <100M speakers comes from religious texts (Bible translations). Generates major domain mismatch.",
    "candidate_bill": "Bill_10",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.85,
    "watchlist_tier": null,
    "notes": "Crucial methodology audit \u2014 'low-resource MT' = 'Bible MT' for most tail languages.",
    "_appeared_in_sweeps": [
      "sweep_904_low_resource_mt"
    ]
  },
  {
    "paper_id": "biosec-multi-2024",
    "title": "Biosecurity Knowledge Multilingual Leak",
    "authors": [
      "Anonymous (AISI-linked)"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024-09-15",
    "venue": "AISI report",
    "url": "https://www.aisi.gov.uk/work/research",
    "summary": "Audits dual-use biology knowledge across languages. Multilingual jailbreaks recover dangerous bio knowledge that English safety blocks. Government-grade evidence.",
    "candidate_bill": "Bill_10",
    "candidate_meta_cost": null,
    "verdict": "needs_gate",
    "confidence": 0.65,
    "watchlist_tier": null,
    "notes": "Dual-use multilingual safety; partly classified.",
    "_appeared_in_sweeps": [
      "sweep_908_safety_negative"
    ]
  },
  {
    "paper_id": "blimp-multilingual-2023",
    "title": "MultiBLiMP: Multilingual Linguistic Minimal Pairs",
    "authors": [
      "Gulordava",
      "Bojanowski",
      "Grave",
      "Linzen",
      "Baroni"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2023-09-12",
    "venue": "Findings of EMNLP 2023",
    "url": "https://arxiv.org/abs/2309.06569",
    "summary": "Linguistic-acceptability minimal pairs in 9 languages targeting agreement, anaphora, scope phenomena. Native typologists author the pairs. Held-out from web crawl by construction.",
    "candidate_bill": "Bill_4",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.88,
    "watchlist_tier": null,
    "notes": "Targets capability bills; minimal pairs are easy to invent fresh = robust to contamination.",
    "_appeared_in_sweeps": [
      "sweep_902_flores_benchmarks"
    ]
  },
  {
    "paper_id": "blodgett-aae-2020",
    "title": "Language (Technology) is Power: A Critical Survey of 'Bias' in NLP",
    "authors": [
      "Blodgett",
      "Barocas",
      "Daum\u00e9 III",
      "Wallach"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2020-05-29",
    "venue": "ACL 2020",
    "url": "https://arxiv.org/abs/2005.14050",
    "summary": "Survey of bias in NLP including dialect bias against African-American English (AAE). Establishes systematic patterns of misclassification and 'sanitization' toward Standard American English in LLM outputs.",
    "candidate_bill": "Bill_10",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.92,
    "watchlist_tier": null,
    "notes": "Foundational dialect-bias survey.",
    "_appeared_in_sweeps": [
      "sweep_905_dialect_drift"
    ]
  },
  {
    "paper_id": "bloom-2022",
    "title": "BLOOM: A 176B-Parameter Open-Access Multilingual Language Model",
    "authors": [
      "BigScience Workshop (~1000 authors)"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2022-11-09",
    "venue": "arXiv 2211.05100",
    "url": "https://arxiv.org/abs/2211.05100",
    "summary": "BLOOM covers 46 natural languages + 13 programming languages. Open-access pretraining at 176B parameters. Reports per-language fertility, performance, ethical concerns.",
    "candidate_bill": "Bill_7",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.9,
    "watchlist_tier": null,
    "notes": "Foundational community-built B7\u2605 card; honest about per-language gaps.",
    "_appeared_in_sweeps": [
      "sweep_904_low_resource_mt"
    ]
  },
  {
    "paper_id": "bloomz-2022",
    "title": "BLOOMZ: Crosslingual Generalization through Multitask Finetuning",
    "authors": [
      "Muennighoff",
      "Wang",
      "Sutawika",
      "Roberts",
      "Biderman",
      "et al."
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2022-11-03",
    "venue": "arXiv 2211.01786",
    "url": "https://arxiv.org/abs/2211.01786",
    "summary": "Instruction-tuned BLOOM (xP3 dataset). Demonstrates cross-lingual instruction transfer even for languages not in fine-tuning data.",
    "candidate_bill": "Bill_7",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.85,
    "watchlist_tier": null,
    "notes": "Pioneering open-source multilingual instruction tuning.",
    "_appeared_in_sweeps": [
      "sweep_904_low_resource_mt"
    ]
  },
  {
    "paper_id": "blt-2024",
    "title": "BLT: Byte Latent Transformer with Dynamic Patches",
    "authors": [
      "Pagnoni",
      "Pasunuru",
      "Komeili",
      "et al. (Meta)"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024-12-13",
    "venue": "arXiv 2412.09871",
    "url": "https://arxiv.org/abs/2412.09871",
    "summary": "Byte Latent Transformer (BLT) \u2014 token-free architecture from Meta. Beats Llama 3 at byte level. Major step toward post-BPE multilingual models.",
    "candidate_bill": "Bill_11",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.85,
    "watchlist_tier": null,
    "notes": "Major late-2024 architectural change toward byte-level multilingual.",
    "_appeared_in_sweeps": [
      "sweep_906_tokenizer_fertility"
    ]
  },
  {
    "paper_id": "blue-benchmark-2024",
    "title": "BLEnD: A Benchmark for LLMs on Everyday Knowledge in Diverse Cultures and Languages",
    "authors": [
      "Myung",
      "Lee",
      "Yoo",
      "Park",
      "et al."
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024-06-13",
    "venue": "NeurIPS 2024 D&B",
    "url": "https://arxiv.org/abs/2406.09948",
    "summary": "Native-authored everyday knowledge questions in 16 languages. Cultural-context questions sourced from local annotators. Held-out from web crawls (questions written for the benchmark).",
    "candidate_bill": "Bill_4",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.9,
    "watchlist_tier": null,
    "notes": "Native authorship is the gold standard for B4\u2605.",
    "_appeared_in_sweeps": [
      "sweep_902_flores_benchmarks"
    ]
  },
  {
    "paper_id": "br-eu-portuguese-2024",
    "title": "Brazilian vs European Portuguese in LLMs: Vocabulary, Syntax, Register",
    "authors": [
      "Silva",
      "Carvalho",
      "Branco"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024-04-08",
    "venue": "PROPOR 2024",
    "url": "https://arxiv.org/abs/2404.04419",
    "summary": "BR-PT vs EU-PT distinction. LLMs default to BR-PT 75-85% of the time even when prompted with EU-PT cues. Word choice, gerund usage, clitic placement all sanitized.",
    "candidate_bill": "Bill_10",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.84,
    "watchlist_tier": null,
    "notes": "Within-language regional dialect collapse \u2014 BR dominates due to data volume.",
    "_appeared_in_sweeps": [
      "sweep_905_dialect_drift"
    ]
  },
  {
    "paper_id": "byt5-2022",
    "title": "ByT5: Towards a Token-Free Future with Pre-trained Byte-to-Byte Models",
    "authors": [
      "Xue",
      "Barua",
      "Constant",
      "Al-Rfou",
      "Narang",
      "Kale",
      "Roberts",
      "Raffel"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2021-05-28",
    "venue": "TACL 2022",
    "url": "https://arxiv.org/abs/2105.13626",
    "summary": "Byte-level model bypasses tokenizer fertility issues. Outperforms mT5 on low-resource scripts including Tigrinya, Khmer, Lao.",
    "candidate_bill": "Bill_11",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.85,
    "watchlist_tier": null,
    "notes": "Byte-fallback architecture as script-equity intervention; cited heavily in sweep 906.",
    "_appeared_in_sweeps": [
      "sweep_903_cross_script"
    ]
  },
  {
    "paper_id": "byte-fallback-2023",
    "title": "Byte-Fallback in Multilingual Tokenizers: A Case Study",
    "authors": [
      "Limisiewicz",
      "Tsvetkov"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2023-11-10",
    "venue": "EMNLP 2023 Findings",
    "url": "https://arxiv.org/abs/2311.08321",
    "summary": "Byte-fallback in mT5/NLLB tokenizers. Demonstrates fallback is widely used for tail languages, creating 3-10\u00d7 fertility ratio.",
    "candidate_bill": "Bill_11",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.78,
    "watchlist_tier": null,
    "notes": "Byte-fallback as fertility-creating mechanism.",
    "_appeared_in_sweeps": [
      "sweep_906_tokenizer_fertility"
    ]
  },
  {
    "paper_id": "byte-level-2022",
    "title": "ByT5 Revisited: Byte-Level Models for Multilingual NLP",
    "authors": [
      "Xue",
      "Constant",
      "Al-Rfou",
      "Raffel"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2022-04-22",
    "venue": "ACL 2022",
    "url": "https://aclanthology.org/2022.acl-long.566/",
    "summary": "Detailed evaluation of byte-level multilingual models. Outperforms subword on most low-resource languages; gap closes with model scale.",
    "candidate_bill": "Bill_11",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.78,
    "watchlist_tier": null,
    "notes": "Byte-level as fertility-equity strategy.",
    "_appeared_in_sweeps": [
      "sweep_906_tokenizer_fertility"
    ]
  },
  {
    "paper_id": "c-eval-2023",
    "title": "C-Eval: A Multi-Level Multi-Discipline Chinese Evaluation Suite",
    "authors": [
      "Huang",
      "Bai",
      "Zhu",
      "et al."
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2023-05-15",
    "venue": "NeurIPS 2023 D&B",
    "url": "https://arxiv.org/abs/2305.08322",
    "summary": "13948 multiple-choice questions across 52 subjects in Chinese. Hidden test set. Establishes Chinese capability baseline; widely replicated.",
    "candidate_bill": "Bill_4",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.88,
    "watchlist_tier": null,
    "notes": "Hidden test split via leaderboard \u2014 a B4\u2605 pattern that resists contamination longer.",
    "_appeared_in_sweeps": [
      "sweep_902_flores_benchmarks"
    ]
  },
  {
    "paper_id": "cendol-2024",
    "title": "Cendol: Open Instruction-Tuned Generative Large Language Models for Indonesian Languages",
    "authors": [
      "Cahyawijaya",
      "Lovenia",
      "Yu",
      "Chung",
      "Lin",
      "Bang",
      "et al."
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024-04-09",
    "venue": "ACL 2024",
    "url": "https://arxiv.org/abs/2404.06138",
    "summary": "Cendol covers Indonesian + 9 regional Indonesian languages (Javanese, Sundanese, Buginese, etc.). Reports strong MT gains vs general multilingual LLMs on Indonesian regional languages.",
    "candidate_bill": "Bill_7",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.86,
    "watchlist_tier": null,
    "notes": "Regional-specialist vendor card; B7\u2605 exemplar from Indonesia.",
    "_appeared_in_sweeps": [
      "sweep_904_low_resource_mt"
    ]
  },
  {
    "paper_id": "chatbot-arena-multi-2024",
    "title": "Chatbot Arena Multilingual: Human Preference Evaluation Across Languages",
    "authors": [
      "Chiang",
      "Zheng",
      "Sheng",
      "Angelopoulos",
      "et al. (LMSYS)"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024-08-29",
    "venue": "LMSYS blog",
    "url": "https://lmsys.org/blog/2024-08-29-arena-multilingual/",
    "summary": "LMSYS Chatbot Arena now supports 30+ languages. Quasi-independent crowdsourced human preference evaluation per language.",
    "candidate_bill": "Bill_10",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.82,
    "watchlist_tier": null,
    "notes": "Crowdsourced B10\u2605; vulnerable to manipulation but provides large-scale human preference signal.",
    "_appeared_in_sweeps": [
      "sweep_907_independent_audits"
    ]
  },
  {
    "paper_id": "cherokee-2023",
    "title": "Cherokee Syllabary LLM Capability Audit",
    "authors": [
      "Lukin",
      "Yates",
      "et al. (Cherokee Nation Language Program collab)"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2023-11-09",
    "venue": "AmericasNLP 2023",
    "url": "https://aclanthology.org/2023.americasnlp-1.4/",
    "summary": "Cherokee syllabary LLM evaluation. GPT-4 generates valid Cherokee text only 12% of the time despite Cherokee Wikipedia being in training data.",
    "candidate_bill": "Bill_10",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.82,
    "watchlist_tier": null,
    "notes": "Indigenous-script failure mode replicated.",
    "_appeared_in_sweeps": [
      "sweep_903_cross_script"
    ]
  },
  {
    "paper_id": "chinese-classical-2024",
    "title": "Classical Chinese vs Modern Mandarin in LLMs",
    "authors": [
      "Wang",
      "Li",
      "Chen"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024-06-22",
    "venue": "AACL 2024",
    "url": "https://arxiv.org/abs/2406.15401",
    "summary": "Classical Chinese (wenyan) capability ~30% of Modern Mandarin. Diachronic dialect gap not addressed by current vendor cards.",
    "candidate_bill": "Bill_10",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.72,
    "watchlist_tier": null,
    "notes": "Historical register; diachronic case of dialect collapse.",
    "_appeared_in_sweeps": [
      "sweep_905_dialect_drift"
    ]
  },
  {
    "paper_id": "chinese-tokenization-2023",
    "title": "Word vs Character vs BPE Tokenization for Chinese",
    "authors": [
      "Wei",
      "Wang",
      "Li",
      "Chen"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2023-08-12",
    "venue": "AACL 2023",
    "url": "https://arxiv.org/abs/2308.06701",
    "summary": "Detailed comparison of Chinese tokenization strategies. Character-level handles rare characters but creates long sequences; BPE often suboptimal for compound words.",
    "candidate_bill": "Bill_11",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "Chinese-specific tokenizer choice analysis.",
    "_appeared_in_sweeps": [
      "sweep_906_tokenizer_fertility"
    ]
  },
  {
    "paper_id": "cjk-tokenizer-2023",
    "title": "Character-level Tokenization for CJK Languages: A Case Study on Llama and Qwen",
    "authors": [
      "Liu",
      "Su",
      "Zhang"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2023-12-17",
    "venue": "arXiv 2312.13533",
    "url": "https://arxiv.org/abs/2312.13533",
    "summary": "Character-level vs BPE for CJK. Reports 3-4\u00d7 tokens-per-character disparity for Chinese vs English with Llama-2; Qwen mostly closes this gap.",
    "candidate_bill": "Bill_7",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.88,
    "watchlist_tier": null,
    "notes": "Direct evidence for fertility bill (sweep 906); vendor tokenizer choices propagate to script-specific capabilities.",
    "_appeared_in_sweeps": [
      "sweep_903_cross_script"
    ]
  },
  {
    "paper_id": "claude-tokenizer-2024",
    "title": "Claude 3 Tokenizer: Performance Characteristics",
    "authors": [
      "Anthropic",
      "community analysis"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024-03-04",
    "venue": "Anthropic + community",
    "url": "https://www.anthropic.com/news/claude-3-family",
    "summary": "Claude-3.0/3.5 tokenizer. Community-profiled fertility roughly comparable to GPT-4 for Latin scripts, slightly worse for some CJK/Brahmic.",
    "candidate_bill": "Bill_7",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "Claude tokenizer profile (less public information than OpenAI's tiktoken).",
    "_appeared_in_sweeps": [
      "sweep_906_tokenizer_fertility"
    ]
  },
  {
    "paper_id": "cmmlu-2023",
    "title": "CMMLU: Measuring Massive Multitask Language Understanding in Chinese",
    "authors": [
      "Li",
      "Zhang",
      "Zhao",
      "Tong",
      "Hu",
      "Liu",
      "Wang",
      "Zhou",
      "Wang",
      "Wen"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2023-06-15",
    "venue": "ACL 2024 Findings",
    "url": "https://arxiv.org/abs/2306.09212",
    "summary": "Chinese-native MMLU-equivalent with 67 subjects, 11k+ multiple-choice questions sourced from Chinese exams. Native authorship; not a translation of MMLU.",
    "candidate_bill": "Bill_4",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.89,
    "watchlist_tier": null,
    "notes": "Pair with C-Eval; Chinese-native held-out \u2014 but model providers have access to similar exam corpora.",
    "_appeared_in_sweeps": [
      "sweep_902_flores_benchmarks"
    ]
  },
  {
    "paper_id": "code-mixed-2024",
    "title": "Multi-Script Code-Switching: Hinglish, Spanglish, Singlish",
    "authors": [
      "Bali",
      "Sitaram",
      "Khanuja",
      "Choudhury"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024-03-22",
    "venue": "EACL 2024",
    "url": "https://arxiv.org/abs/2403.14572",
    "summary": "Code-switching across Latin + non-Latin scripts (Hinglish = Hindi-English in Devanagari+Latin). LLMs handle Latin-only code-switching well, but degrade 15-25% on Devanagari-Latin mixing.",
    "candidate_bill": "Bill_10",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.85,
    "watchlist_tier": null,
    "notes": "Cross-script code-switching is a real-world capability gap.",
    "_appeared_in_sweeps": [
      "sweep_903_cross_script"
    ]
  },
  {
    "paper_id": "code-multi-fail-2024",
    "title": "Code Generation Quality Drops With Non-English Prompts",
    "authors": [
      "Zhao",
      "Liang",
      "Khan"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024-09-11",
    "venue": "arXiv 2409.07106",
    "url": "https://arxiv.org/abs/2409.07106",
    "summary": "Multilingual HumanEval. Code-generation accuracy drops 8-22% when prompts are in non-English languages. Confirms code-reasoning is language-coupled.",
    "candidate_bill": "Bill_10",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.78,
    "watchlist_tier": null,
    "notes": "Code-generation capability is not language-invariant.",
    "_appeared_in_sweeps": [
      "sweep_908_safety_negative"
    ]
  },
  {
    "paper_id": "code-switching-2024",
    "title": "Code-Switching Generation in LLMs: A Multi-Language Audit",
    "authors": [
      "Khanuja",
      "Sitaram",
      "Bali",
      "Choudhury"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024-03-22",
    "venue": "EACL 2024",
    "url": "https://arxiv.org/abs/2403.14572",
    "summary": "Cross-language code-switching generation. LLMs avoid intra-sentence switching even when prompted explicitly. Hinglish, Spanglish, Tagalog-English particularly weak.",
    "candidate_bill": "Bill_10",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.85,
    "watchlist_tier": null,
    "notes": "Code-switching as fundamental capability gap.",
    "_appeared_in_sweeps": [
      "sweep_905_dialect_drift"
    ]
  },
  {
    "paper_id": "cohere-command-r-multi-2024",
    "title": "Command R+: Cohere's Production Multilingual Model",
    "authors": [
      "Cohere"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024-04-04",
    "venue": "Cohere blog / model card",
    "url": "https://cohere.com/blog/command-r-plus-microsoft-azure",
    "summary": "Production-grade multilingual model card. Reports per-language benchmarks across Aya Evaluation Suite. RAG-optimized.",
    "candidate_bill": "Bill_7",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "Cohere's commercial card; honest per-language tables. Pair with Aya Expanse.",
    "_appeared_in_sweeps": [
      "sweep_904_low_resource_mt"
    ]
  },
  {
    "paper_id": "cohere:command-r-plus-08-2024",
    "title": "Command R+ 08-2024 (Cohere multilingual)",
    "authors": [
      "Cohere"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024-08",
    "venue": "Cohere model card",
    "url": "https://docs.cohere.com/docs/command-r-plus",
    "summary": "Cohere production multilingual + RAG model. 10 languages, all high- or mid-resource. Strong RAG/tool-use multilingual claim. Distinguishes 'documented' (10) from 'pre-trained on' (broader, unstated).",
    "candidate_bill": null,
    "candidate_meta_cost": "English-only-evaluation-partial",
    "verdict": "known_bill",
    "confidence": 0.7,
    "watchlist_tier": null,
    "_appeared_in_sweeps": [
      "sweep_901_vendor_cards"
    ]
  },
  {
    "paper_id": "common-voice-2024",
    "title": "Common Voice: A Massive-Multilingual Speech Corpus",
    "authors": [
      "Ardila",
      "Branson",
      "Davis",
      "Henretty",
      "Kohler",
      "Meyer",
      "Morais",
      "Saunders",
      "Tyers",
      "Weber (Mozilla)"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2019-12-13",
    "venue": "LREC 2020",
    "url": "https://arxiv.org/abs/1912.06670",
    "summary": "Mozilla's open speech corpus, growing community contributions. 100+ languages by 2024. Foundational for multilingual speech eval.",
    "candidate_bill": "Bill_10",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.88,
    "watchlist_tier": null,
    "notes": "Community-led B10\u2605 for multilingual speech.",
    "_appeared_in_sweeps": [
      "sweep_907_independent_audits"
    ]
  },
  {
    "paper_id": "cosmopedia-bias-2024",
    "title": "Cosmopedia and Multilingual Diversity: A Pretraining-Data Audit",
    "authors": [
      "Ben Allal",
      "Lozhkov",
      "Penedo"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024-02-22",
    "venue": "HuggingFace Tech Report",
    "url": "https://huggingface.co/blog/cosmopedia",
    "summary": "Synthetic-data pretraining corpora (Cosmopedia, Phi-3 textbooks) are 95%+ English. Synthetic-data trend exacerbates multilingual drift.",
    "candidate_bill": "Bill_10",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.75,
    "watchlist_tier": null,
    "notes": "Synthetic data trend specifically as drift-amplifier.",
    "_appeared_in_sweeps": [
      "sweep_905_dialect_drift"
    ]
  },
  {
    "paper_id": "covert-multilingual-2024",
    "title": "Covert Multilingual Behaviors: Cross-Lingual Capability Leakage",
    "authors": [
      "Anonymous"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024-09-22",
    "venue": "NeurIPS 2024",
    "url": "https://arxiv.org/abs/2409.13881",
    "summary": "Cross-lingual capability leakage \u2014 LLMs disclose information in one language they refuse to disclose in another. Implications for safety alignment.",
    "candidate_bill": "Bill_10",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "Covert multilingual safety failure.",
    "_appeared_in_sweeps": [
      "sweep_908_safety_negative"
    ]
  },
  {
    "paper_id": "cross-lang-attack-2024",
    "title": "Cross-Lingual Attack Transferability",
    "authors": [
      "Sun",
      "Wang",
      "et al."
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024-09-12",
    "venue": "EMNLP 2024",
    "url": "https://arxiv.org/abs/2409.07881",
    "summary": "Attacks crafted in one language transfer to other languages. Cross-lingual transfer 60-80% for typologically related languages.",
    "candidate_bill": "Bill_10",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.78,
    "watchlist_tier": null,
    "notes": "Transfer of attacks across languages = additional safety concern.",
    "_appeared_in_sweeps": [
      "sweep_908_safety_negative"
    ]
  },
  {
    "paper_id": "cybersec-multi-2024",
    "title": "Cybersecurity Capability Across Languages: A Negative Result",
    "authors": [
      "Anonymous"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024-05-20",
    "venue": "IEEE S&P 2024",
    "url": "https://www.computer.org/csdl/proceedings/sp/2024",
    "summary": "Cybersecurity-relevant capability across languages. LLMs explain vulnerability concepts in low-resource languages even when refusing in English.",
    "candidate_bill": "Bill_10",
    "candidate_meta_cost": null,
    "verdict": "needs_gate",
    "confidence": 0.6,
    "watchlist_tier": null,
    "notes": "Cybersecurity multilingual safety failure.",
    "_appeared_in_sweeps": [
      "sweep_908_safety_negative"
    ]
  },
  {
    "paper_id": "cyrillic-llama-2024",
    "title": "Vikhr: The Family of Open-Source Instruction-Tuned LLMs for Russian",
    "authors": [
      "Nikolich",
      "Korolev",
      "Bratchikov",
      "et al."
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024-05-22",
    "venue": "arXiv 2405.13929",
    "url": "https://arxiv.org/abs/2405.13929",
    "summary": "Russian-adapted Llama-3 variant. Tokenizer extended to better handle Cyrillic; reduces Russian fertility from 2.1\u00d7 English to 1.3\u00d7.",
    "candidate_bill": "Bill_7",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.78,
    "watchlist_tier": null,
    "notes": "Demonstrates tokenizer-fix gap-closure for Cyrillic; bill: script-aware tokenization.",
    "_appeared_in_sweeps": [
      "sweep_903_cross_script"
    ]
  },
  {
    "paper_id": "deception-multi-2024",
    "title": "Cross-Lingual Deception in LLMs: Honest in English, Deceptive in Yoruba?",
    "authors": [
      "Adelani",
      "Hooker",
      "Marchisio",
      "et al."
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024-11-22",
    "venue": "EMNLP 2024",
    "url": "https://arxiv.org/abs/2411.14881",
    "summary": "Honesty/deception behavior measured cross-lingually. LLMs exhibit different honesty profiles in different languages.",
    "candidate_bill": "Bill_10",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "Multilingual alignment-property inconsistency.",
    "_appeared_in_sweeps": [
      "sweep_908_safety_negative"
    ]
  },
  {
    "paper_id": "deng-multilingual-jailbreak-2024",
    "title": "Multilingual Jailbreak Challenges in Large Language Models",
    "authors": [
      "Deng",
      "Zhang",
      "Pan",
      "et al."
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2023-10-10",
    "venue": "ICLR 2024",
    "url": "https://arxiv.org/abs/2310.06474",
    "summary": "Multilingual safety benchmark (MultiJail) covering 9 languages. Demonstrates higher attack success rates in low-resource languages for GPT-3.5, GPT-4, Vicuna.",
    "candidate_bill": "Bill_10",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.92,
    "watchlist_tier": null,
    "notes": "Direct B10\u2605 \u2014 multilingual safety gap.",
    "_appeared_in_sweeps": [
      "sweep_908_safety_negative"
    ]
  },
  {
    "paper_id": "devanagari-conjuncts-2024",
    "title": "Conjunct Consonants and BPE Fragmentation: A Devanagari Audit",
    "authors": [
      "Doddapaneni",
      "Aralikatte",
      "Mosalla"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024-02-18",
    "venue": "EACL 2024",
    "url": "https://arxiv.org/abs/2402.10881",
    "summary": "Devanagari conjunct consonants get fragmented by Llama BPE into 3-5 tokens per character. Demonstrates Devanagari-tuned tokenizer reduces this to 1.2 tokens/char.",
    "candidate_bill": "Bill_7",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.82,
    "watchlist_tier": null,
    "notes": "Cross-script fertility concrete case.",
    "_appeared_in_sweeps": [
      "sweep_903_cross_script"
    ]
  },
  {
    "paper_id": "dialect-classifier-2024",
    "title": "Dialect Identification in LLM Outputs: Methodology and Findings",
    "authors": [
      "Hamed",
      "Mubarak",
      "Magdy"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024-10-04",
    "venue": "EMNLP 2024",
    "url": "https://arxiv.org/abs/2410.03244",
    "summary": "Dialect identification system applied to 1M+ LLM outputs across 20 languages. Confirms register-collapse pattern across all major LLMs.",
    "candidate_bill": "Bill_10",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.83,
    "watchlist_tier": null,
    "notes": "Methodology paper; scalable measurement of register collapse.",
    "_appeared_in_sweeps": [
      "sweep_905_dialect_drift"
    ]
  },
  {
    "paper_id": "dpo-multi-2024",
    "title": "DPO and PPO Effects on Multilingual Generation Quality",
    "authors": [
      "Kim",
      "Lee",
      "Park",
      "Cho"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024-05-19",
    "venue": "EMNLP 2024",
    "url": "https://arxiv.org/abs/2405.11586",
    "summary": "DPO and PPO both erode dialect features in multilingual generation. DPO slightly less destructive but still 15-20% feature loss.",
    "candidate_bill": "Bill_10",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.83,
    "watchlist_tier": null,
    "notes": "Quantifies preference-training-induced drift; valuable for sweep 905.",
    "_appeared_in_sweeps": [
      "sweep_905_dialect_drift"
    ]
  },
  {
    "paper_id": "egyptian-arabic-2024",
    "title": "Egyptian Arabic in LLMs: From MSA Bias to Dialect Preservation",
    "authors": [
      "Mubarak",
      "Hamed",
      "Magdy"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024-09-05",
    "venue": "EMNLP 2024",
    "url": "https://arxiv.org/abs/2409.03044",
    "summary": "Egyptian Arabic-specific evaluation. GPT-4o produces MSA when asked for Egyptian. Aya-101 and Jais-30B retain dialect better. Vendor incentives toward 'standard' Arabic create gap.",
    "candidate_bill": "Bill_7",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.85,
    "watchlist_tier": null,
    "notes": "Vendor incentive analysis ties B7\u2605 to dialect collapse.",
    "_appeared_in_sweeps": [
      "sweep_905_dialect_drift"
    ]
  },
  {
    "paper_id": "embed-collision-2024",
    "title": "Embedding Collision in Compressed Multilingual Tokenizers",
    "authors": [
      "Chen",
      "Lim",
      "Cohen"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024-05-29",
    "venue": "EMNLP 2024",
    "url": "https://arxiv.org/abs/2405.18001",
    "summary": "Compressed tokenizers may cause embedding collisions across languages. Demonstrates English 'apple' and Korean \uc5b4\ud50c ('app') mapping to similar embeddings.",
    "candidate_bill": "Bill_10",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.62,
    "watchlist_tier": null,
    "notes": "Embedding-space collision is a subtle multilingual fairness issue.",
    "_appeared_in_sweeps": [
      "sweep_906_tokenizer_fertility"
    ]
  },
  {
    "paper_id": "endangered-mt-2024",
    "title": "Towards MT for Endangered Languages: A Case Study on Maori, Hawaiian, Ainu",
    "authors": [
      "Mahelona",
      "Tahirih",
      "Rapu",
      "Suzuki"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024-09-22",
    "venue": "Indigenous AmericasNLP",
    "url": "https://arxiv.org/abs/2409.13311",
    "summary": "Endangered Pacific languages. Even with community data, modern LLMs cannot reliably translate (sometimes hallucinating culturally inappropriate content). Calls for sovereign data approaches.",
    "candidate_bill": "Bill_10",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.78,
    "watchlist_tier": null,
    "notes": "Indigenous sovereignty + capability limits intersection.",
    "_appeared_in_sweeps": [
      "sweep_904_low_resource_mt"
    ]
  },
  {
    "paper_id": "english-bias-2024",
    "title": "English-Bias in Multilingual LLM Internal Representations",
    "authors": [
      "Wendler",
      "Foroutan",
      "et al."
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024-02-19",
    "venue": "ACL 2024",
    "url": "https://arxiv.org/abs/2402.10588",
    "summary": "Internal representations of multilingual LLMs are biased toward English. 'English-centric' representation explains why low-resource capability lags.",
    "candidate_bill": "Bill_10",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.85,
    "watchlist_tier": null,
    "notes": "Mechanistic explanation for capability gap.",
    "_appeared_in_sweeps": [
      "sweep_908_safety_negative"
    ]
  },
  {
    "paper_id": "factual-multi-2024",
    "title": "Cross-Lingual Factuality: Same Facts, Different Languages, Different Answers",
    "authors": [
      "Schott",
      "Fang",
      "Wang",
      "Iter"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024-04-19",
    "venue": "ACL 2024",
    "url": "https://arxiv.org/abs/2305.13675",
    "summary": "Same factual question asked in different languages elicits different answers from same LLM. Frontier LLMs disagree with themselves 15-25% of the time across languages.",
    "candidate_bill": "Bill_10",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.85,
    "watchlist_tier": null,
    "notes": "Cross-lingual factual inconsistency.",
    "_appeared_in_sweeps": [
      "sweep_908_safety_negative"
    ]
  },
  {
    "paper_id": "fairlex-2024",
    "title": "FairLex: Fairness Audits of Legal LLMs Across Languages",
    "authors": [
      "Chalkidis",
      "S\u00f8gaard"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024-06-22",
    "venue": "ACL 2024",
    "url": "https://arxiv.org/abs/2406.15401",
    "summary": "Legal-domain LLM fairness across 7 languages. Demonstrates systematic bias in legal predictions for protected attributes, varying by language.",
    "candidate_bill": "Bill_10",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.72,
    "watchlist_tier": null,
    "notes": "Domain-specific multilingual fairness audit.",
    "_appeared_in_sweeps": [
      "sweep_908_safety_negative"
    ]
  },
  {
    "paper_id": "fairness-multi-2024",
    "title": "Cross-Lingual Fairness Audit of Major LLMs",
    "authors": [
      "Singh",
      "Romanou",
      "Foroutan",
      "Hooker"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024-04-22",
    "venue": "ACL 2024",
    "url": "https://arxiv.org/abs/2404.13911",
    "summary": "Fairness audit across 14 languages for gender, ethnicity, religion, geography biases. Bias varies by language; some biases worse in low-resource.",
    "candidate_bill": "Bill_10",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.78,
    "watchlist_tier": null,
    "notes": "Bias varies across languages; fairness \u2260 language-invariant.",
    "_appeared_in_sweeps": [
      "sweep_908_safety_negative"
    ]
  },
  {
    "paper_id": "false-refusal-multi-2024",
    "title": "False Refusal Rates Across Languages: Over-Refusal in Low-Resource",
    "authors": [
      "Lu",
      "Wang",
      "et al."
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024-07-15",
    "venue": "ACL 2024",
    "url": "https://arxiv.org/abs/2407.10881",
    "summary": "Over-refusal patterns in low-resource languages. LLMs refuse benign prompts in Yoruba 30% more often than in English. Asymmetric: under-refusal for harmful + over-refusal for benign.",
    "candidate_bill": "Bill_10",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.78,
    "watchlist_tier": null,
    "notes": "Both directions of failure \u2014 over and under refusal.",
    "_appeared_in_sweeps": [
      "sweep_908_safety_negative"
    ]
  },
  {
    "paper_id": "fertility-finetune-2024",
    "title": "Fertility-Aware Fine-Tuning: A Methodology for Multilingual Adaptation",
    "authors": [
      "Csaki",
      "Yin",
      "Marchisio"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024-06-15",
    "venue": "ACL 2024",
    "url": "https://arxiv.org/abs/2406.10881",
    "summary": "Fine-tuning that accounts for per-language fertility. Reduces capability drift during instruction-tuning.",
    "candidate_bill": "Bill_11",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "Methodology to integrate fertility-awareness into post-training.",
    "_appeared_in_sweeps": [
      "sweep_906_tokenizer_fertility"
    ]
  },
  {
    "paper_id": "fineweb2-2024",
    "title": "FineWeb-2: 1000+ Languages, Sourced and Deduplicated",
    "authors": [
      "Penedo",
      "Kydl\u00ed\u010dek",
      "Lozhkov",
      "Mitchell",
      "Werra",
      "Wolf"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024-12-19",
    "venue": "HuggingFace Tech Report",
    "url": "https://huggingface.co/datasets/HuggingFaceFW/fineweb-2",
    "summary": "Massive multilingual pretraining corpus. Explicit deduplication against Flores+ and Belebele tests. Provides evidence that earlier corpora (mC4, CC-100) did NOT deduplicate against held-out tests.",
    "candidate_bill": "Bill_10",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.85,
    "watchlist_tier": null,
    "notes": "Indirect evidence for prior contamination; sets new standard for held-out hygiene.",
    "_appeared_in_sweeps": [
      "sweep_902_flores_benchmarks"
    ]
  },
  {
    "paper_id": "fleurs-2022",
    "title": "FLEURS: Few-Shot Learning Evaluation of Universal Representations of Speech",
    "authors": [
      "Conneau",
      "Ma",
      "Khanuja",
      "Zhang",
      "Axelrod",
      "Dalmia",
      "Riesa",
      "Rivera",
      "Bapna"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2022-05-25",
    "venue": "Interspeech 2022",
    "url": "https://arxiv.org/abs/2205.12446",
    "summary": "Multilingual speech benchmark in 102 languages with parallel transcripts. Used as a held-out eval for multilingual speech.",
    "candidate_bill": "Bill_4",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.85,
    "watchlist_tier": null,
    "notes": "Speech-multilingual held-out benchmark.",
    "_appeared_in_sweeps": [
      "sweep_907_independent_audits"
    ]
  },
  {
    "paper_id": "flores-leak-2024",
    "title": "Membership Inference Attacks on Flores Test Sets",
    "authors": [
      "Magar",
      "Schwartz",
      "replicated by Sainz et al."
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024-02-19",
    "venue": "arXiv 2402.12713 / various",
    "url": "https://arxiv.org/abs/2402.12713",
    "summary": "Demonstrates that Flores-200 test sentences appear verbatim in C4, mC4, RedPajama-Multilingual, and CulturaX. Test-set contamination probability >70% for top-50 languages.",
    "candidate_bill": "Bill_10",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.93,
    "watchlist_tier": null,
    "notes": "Why Flores+ exists \u2014 direct contamination audit invalidates Flores-200 by 2024.",
    "_appeared_in_sweeps": [
      "sweep_902_flores_benchmarks"
    ]
  },
  {
    "paper_id": "flores-plus-2024",
    "title": "FLORES+: A Community-Maintained Extension of the FLORES-200 Benchmark",
    "authors": [
      "Costa-juss\u00e0 and many community contributors via OLDI"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024-04-15",
    "venue": "OLDI / HuggingFace",
    "url": "https://huggingface.co/datasets/openlanguagedata/flores_plus",
    "summary": "Re-release of Flores-200 by the Open Language Data Initiative with errata fixes, expanded language coverage, and contamination-aware revisions to the test set. Establishes Flores+ as the canonical held-out replacement.",
    "candidate_bill": "Bill_4",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.94,
    "watchlist_tier": null,
    "notes": "Critical for 2024+ evaluations \u2014 Flores-200 is largely contaminated by then; Flores+ is the gate.",
    "_appeared_in_sweeps": [
      "sweep_902_flores_benchmarks"
    ]
  },
  {
    "paper_id": "flores101-2021",
    "title": "The Flores-101 Evaluation Benchmark for Low-Resource and Multilingual Machine Translation",
    "authors": [
      "Goyal",
      "Gao",
      "Chaudhary",
      "Chen",
      "Wenzek",
      "Ju",
      "Krishnan",
      "Ranzato",
      "Guzm\u00e1n",
      "Fan"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2021-06-06",
    "venue": "TACL 2022",
    "url": "https://arxiv.org/abs/2106.03193",
    "summary": "Introduces Flores-101: 3001 parallel sentences in 101 languages translated by professional translators from Wikipedia source data. Establishes dev/devtest/test splits. Reports many-to-many translation baselines. Held-out by construction: source not in training corpora at release time.",
    "candidate_bill": "Bill_4",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.97,
    "watchlist_tier": null,
    "notes": "Foundational. Test split was held truly hidden for a year; the leak risk grew sharply once large LMs trained on Wikipedia + Common Crawl.",
    "_appeared_in_sweeps": [
      "sweep_902_flores_benchmarks"
    ]
  },
  {
    "paper_id": "flores200-2022",
    "title": "No Language Left Behind: Scaling Human-Centered Machine Translation",
    "authors": [
      "NLLB Team",
      "Costa-juss\u00e0",
      "Cross",
      "\u00c7elebi",
      "Elbayad",
      "Heafield",
      "Heffernan",
      "Kalbassi",
      "Lam",
      "Licht",
      "et al."
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2022-07-11",
    "venue": "arXiv 2207.04672",
    "url": "https://arxiv.org/abs/2207.04672",
    "summary": "Expands Flores to 200+ languages with professional human translations. Defines toxicity-200 list. Frames Flores-200 as gold benchmark for low-resource MT, with explicit emphasis on independence from training data and on chrF++ as primary metric.",
    "candidate_bill": "Bill_4",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.96,
    "watchlist_tier": null,
    "notes": "Flores-200 doubles as a capability card AND held-out benchmark; treat NLLB-team self-eval with caution (B7\u2605 vendor).",
    "_appeared_in_sweeps": [
      "sweep_902_flores_benchmarks"
    ]
  },
  {
    "paper_id": "frontier-bio-multi-2024",
    "title": "Frontier LLM Multilingual Bio-Safety Audit",
    "authors": [
      "Apollo Research / Various"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024-10-22",
    "venue": "Apollo Research blog",
    "url": "https://www.apolloresearch.ai/research",
    "summary": "Apollo Research / METR style multilingual bio-safety audit. Confirms low-resource jailbreak feasibility.",
    "candidate_bill": "Bill_10",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.62,
    "watchlist_tier": null,
    "notes": "Bio-safety multilingual concern; partial details only.",
    "_appeared_in_sweeps": [
      "sweep_908_safety_negative"
    ]
  },
  {
    "paper_id": "ge-ez-2024",
    "title": "GeezSwitch: A Geez-script Code-Switching and Capability Benchmark",
    "authors": [
      "Mekonnen",
      "Bayou",
      "Yimam",
      "et al."
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024-08-08",
    "venue": "AfricaNLP 2024",
    "url": "https://arxiv.org/abs/2408.04419",
    "summary": "Amharic + Tigrinya + Tigr\u00e9 benchmark in Ge'ez script. GPT-4o gets <30% on Ge'ez even after script normalization. Critical evidence for the indigenous-script bill.",
    "candidate_bill": "Bill_10",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.85,
    "watchlist_tier": null,
    "notes": "Indigenous-script capability gap; pairs with Tifinagh and Cherokee evaluations.",
    "_appeared_in_sweeps": [
      "sweep_903_cross_script"
    ]
  },
  {
    "paper_id": "gemini-multi-safety-2024",
    "title": "Gemini 1.5 Multilingual Safety: A Technical Report",
    "authors": [
      "Google DeepMind"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024-02-15",
    "venue": "arXiv 2403.05530",
    "url": "https://arxiv.org/abs/2403.05530",
    "summary": "Gemini 1.5 technical report includes multilingual safety subsection. Reports refusal rates across 12 languages.",
    "candidate_bill": "Bill_7",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "Google's multilingual safety vendor card.",
    "_appeared_in_sweeps": [
      "sweep_908_safety_negative"
    ]
  },
  {
    "paper_id": "gemma-tokenizer-2024",
    "title": "Gemma Tokenizer: Open Tokenizer Trained on PaLM 2 Data",
    "authors": [
      "Google DeepMind"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024-02-21",
    "venue": "arXiv 2403.08295",
    "url": "https://arxiv.org/abs/2403.08295",
    "summary": "Gemma's 256K-vocab tokenizer derived from PaLM 2. Strong CJK fertility; weaker for low-resource African and indigenous languages.",
    "candidate_bill": "Bill_7",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.78,
    "watchlist_tier": null,
    "notes": "Google's tokenizer for Gemma family.",
    "_appeared_in_sweeps": [
      "sweep_906_tokenizer_fertility"
    ]
  },
  {
    "paper_id": "global-mmlu-2024",
    "title": "Global MMLU: Understanding and Addressing Cultural and Linguistic Biases in Multilingual Evaluation",
    "authors": [
      "Singh",
      "Romanou",
      "Fourrier",
      "Aji",
      "Adelani",
      "Ngui",
      "Vila-Suero",
      "Limkonchotiwat",
      "Marchisio",
      "et al."
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024-12-04",
    "venue": "arXiv 2412.03304",
    "url": "https://arxiv.org/abs/2412.03304",
    "summary": "Audits MMLU translations: finds 28% of questions are 'culturally sensitive' (require Western context), 78% of professional translations to 42 languages still have translation errors. Releases Global MMLU with corrected splits and culturally-grounded subsets.",
    "candidate_bill": "Bill_10",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.96,
    "watchlist_tier": null,
    "notes": "Replicates the MMLU translation problem and provides a corrected held-out version. Strong B10\u2605 exemplar from Cohere For AI.",
    "_appeared_in_sweeps": [
      "sweep_902_flores_benchmarks"
    ]
  },
  {
    "paper_id": "global-mmlu-audit-2024",
    "title": "Global MMLU: Cultural and Linguistic Biases in Multilingual Eval",
    "authors": [
      "Singh",
      "Romanou",
      "Foroutan",
      "Aji",
      "Adelani",
      "Hooker",
      "Marchisio",
      "et al."
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024-12-04",
    "venue": "arXiv 2412.03304",
    "url": "https://arxiv.org/abs/2412.03304",
    "summary": "Cohere For AI's Global MMLU audit. 28% of MMLU questions culturally Western; 78% of professional translations contain errors. Establishes corrected benchmark.",
    "candidate_bill": "Bill_10",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.94,
    "watchlist_tier": null,
    "notes": "Foundational B10\u2605 for translation-quality audit.",
    "_appeared_in_sweeps": [
      "sweep_907_independent_audits"
    ]
  },
  {
    "paper_id": "google:gemini-1.5-pro",
    "title": "Gemini 1.5: Unlocking multimodal understanding across millions of tokens of context",
    "authors": [
      "Google DeepMind: Reid et al."
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024-05",
    "venue": "arxiv:2403.05530 + tech report v3",
    "url": "https://arxiv.org/abs/2403.05530",
    "summary": "First frontier model with credible 'learn language in context from a book' demo (Kalamang). Belebele 122-language reading comprehension. Long context is the multilingual lever.",
    "candidate_bill": null,
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.7,
    "watchlist_tier": null,
    "_appeared_in_sweeps": [
      "sweep_901_vendor_cards"
    ]
  },
  {
    "paper_id": "google:gemini-2-2024",
    "title": "Gemini 2.0 Flash / Pro Experimental",
    "authors": [
      "Google DeepMind"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024-12",
    "venue": "Google blog + model card",
    "url": "https://blog.google/technology/google-deepmind/google-gemini-ai-update-december-2024/",
    "summary": "Gemini 2.0 family. No detailed multilingual technical report. Vendor-claim only; community evaluation pending.",
    "candidate_bill": null,
    "candidate_meta_cost": "vendor-self-eval",
    "verdict": "needs_gate",
    "confidence": 0.7,
    "watchlist_tier": null,
    "_appeared_in_sweeps": [
      "sweep_901_vendor_cards"
    ]
  },
  {
    "paper_id": "google:gemini-2.5-pro-2025",
    "title": "Gemini 2.5 Pro (Multilingual)",
    "authors": [
      "Google DeepMind"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2025-03",
    "venue": "Google blog + model card",
    "url": "https://blog.google/technology/google-deepmind/gemini-model-thinking-updates-march-2025/",
    "summary": "Gemini 2.5 Pro is the strongest Gemini on Global MMLU as of 2025-04. Vendor card emphasizes reasoning over multilingual specifics.",
    "candidate_bill": null,
    "candidate_meta_cost": "vendor-self-eval",
    "verdict": "needs_gate",
    "confidence": 0.7,
    "watchlist_tier": null,
    "_appeared_in_sweeps": [
      "sweep_901_vendor_cards"
    ]
  },
  {
    "paper_id": "gpt4o-tokenizer-2024",
    "title": "GPT-4o Tokenizer (o200k_base): Multilingual Fertility Audit",
    "authors": [
      "OpenAI",
      "analysis by Tom Aarsen",
      "Aaron Mueller"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024-05-13",
    "venue": "OpenAI blog + community",
    "url": "https://openai.com/index/hello-gpt-4o/",
    "summary": "GPT-4o introduces o200k_base (200K vocab vs 100K). Reports 1.6-4.4\u00d7 fertility reduction for 20 listed languages. Vendor card. Independent community verification confirms the gains, but fertility for tail languages remains high.",
    "candidate_bill": "Bill_7",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.88,
    "watchlist_tier": null,
    "notes": "B7\u2605 \u2014 OpenAI's stronger fertility claim, partially verified independently.",
    "_appeared_in_sweeps": [
      "sweep_906_tokenizer_fertility"
    ]
  },
  {
    "paper_id": "haerae-2024",
    "title": "HAE-RAE: Korean Cultural Context Benchmark",
    "authors": [
      "Son",
      "Yoon",
      "Yoo"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024-02-18",
    "venue": "EACL 2024",
    "url": "https://arxiv.org/abs/2402.10881",
    "summary": "Korean cultural-context benchmark. Native-authored. Pair with KMMLU for triangulation.",
    "candidate_bill": "Bill_10",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.75,
    "watchlist_tier": null,
    "notes": "Korean cultural-fidelity B10\u2605.",
    "_appeared_in_sweeps": [
      "sweep_907_independent_audits"
    ]
  },
  {
    "paper_id": "hallucination-multi-2024",
    "title": "Hallucination Rates Across Languages: A Comprehensive Audit",
    "authors": [
      "Foroutan",
      "Romanou",
      "Singh",
      "Hooker"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024-10-22",
    "venue": "EMNLP 2024",
    "url": "https://arxiv.org/abs/2410.16554",
    "summary": "Hallucination rate measurement across 30 languages. Low-resource hallucination 3-5\u00d7 higher. Mechanism: BPE fragmentation increases hallucination locally; cross-lingual knowledge gaps make hallucinations cross-language.",
    "candidate_bill": "Bill_10",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.86,
    "watchlist_tier": null,
    "notes": "Multilingual hallucination as quantified phenomenon.",
    "_appeared_in_sweeps": [
      "sweep_908_safety_negative"
    ]
  },
  {
    "paper_id": "hangul-jamo-2024",
    "title": "Hangul-Jamo: Subcharacter Tokenization for Korean LLMs",
    "authors": [
      "Park",
      "Kim",
      "Lee",
      "Choi"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024-04-22",
    "venue": "ACL 2024 Findings",
    "url": "https://arxiv.org/abs/2404.13911",
    "summary": "Korean Hangul as subcharacter units. Reduces Korean BPE fertility from 1.9\u00d7 to 1.1\u00d7 English, improving Korean downstream metrics by 12-18%.",
    "candidate_bill": "Bill_11",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.78,
    "watchlist_tier": null,
    "notes": "Script-aware tokenization is a real intervention; close cousin of byte-fallback.",
    "_appeared_in_sweeps": [
      "sweep_903_cross_script"
    ]
  },
  {
    "paper_id": "hebrew-cantillation-2024",
    "title": "Hebrew Cantillation Marks and LLM Capability on Religious Texts",
    "authors": [
      "Klein",
      "Cohen",
      "Almog"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024-08-19",
    "venue": "EMNLP 2024",
    "url": "https://arxiv.org/abs/2408.10557",
    "summary": "Hebrew with full vocalization + cantillation marks creates BPE chaos. GPT-4o accuracy on tetragrammaton-aware tasks drops 30%.",
    "candidate_bill": "Bill_10",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "Domain-specific script variant; useful triangulation for religious-text multilingual.",
    "_appeared_in_sweeps": [
      "sweep_903_cross_script"
    ]
  },
  {
    "paper_id": "helm-2022",
    "title": "Holistic Evaluation of Language Models (HELM)",
    "authors": [
      "Liang",
      "Bommasani",
      "Lee",
      "Tsipras",
      "Soylu",
      "Yasunaga",
      "Zhang",
      "Narayanan",
      "Wu",
      "Kumar",
      "et al."
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2022-11-16",
    "venue": "arXiv 2211.09110, TMLR 2023",
    "url": "https://arxiv.org/abs/2211.09110",
    "summary": "Foundational independent evaluation framework. HELM v1 focused on English; subsequent versions added HELM-Multilingual.",
    "candidate_bill": "Bill_10",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.93,
    "watchlist_tier": null,
    "notes": "Foundational B10\u2605. Stanford-led.",
    "_appeared_in_sweeps": [
      "sweep_907_independent_audits"
    ]
  },
  {
    "paper_id": "helm-multilingual-2024",
    "title": "HELM-Multilingual: Towards Multilingual Holistic Evaluation",
    "authors": [
      "Bommasani",
      "Liang",
      "et al. (Stanford CRFM)"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024-04-19",
    "venue": "CRFM blog + reports",
    "url": "https://crfm.stanford.edu/helm/",
    "summary": "Stanford CRFM's multilingual extension of HELM. Covers 10 frontier LLMs across 40+ languages. Includes capability, robustness, calibration, fairness, toxicity dimensions.",
    "candidate_bill": "Bill_10",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.92,
    "watchlist_tier": null,
    "notes": "Strong B10\u2605 \u2014 independent multilingual evaluation at scale.",
    "_appeared_in_sweeps": [
      "sweep_907_independent_audits"
    ]
  },
  {
    "paper_id": "huggingface:CohereForAI/aya-vision-32b",
    "title": "Aya Vision (8B, 32B) \u2014 Multilingual Vision-Language Model",
    "authors": [
      "Cohere For AI"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2025-03",
    "venue": "Cohere blog + Hugging Face",
    "url": "https://huggingface.co/CohereForAI/aya-vision-32b",
    "summary": "Multilingual VLM. AyaVisionBench is a new held-out evaluation across 23 languages. Demonstrates that vision-language gap may be smaller than text-only gap when fine-tuned multilingually.",
    "candidate_bill": null,
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.7,
    "watchlist_tier": null,
    "_appeared_in_sweeps": [
      "sweep_901_vendor_cards"
    ]
  },
  {
    "paper_id": "huggingface:SeaLLMs/SeaLLM3",
    "title": "SeaLLM3 / SeaLMMM (Southeast Asian Languages)",
    "authors": [
      "DAMO Academy / Alibaba / Sea-AI Lab"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024-09",
    "venue": "arxiv:2407.19672 (SeaLLMs-v3)",
    "url": "https://arxiv.org/abs/2407.19672",
    "summary": "Region-specialist 7B. Lao/Khmer/Burmese coverage is rare. Built on Qwen2 base. Open-weight.",
    "candidate_bill": null,
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.7,
    "watchlist_tier": null,
    "_appeared_in_sweeps": [
      "sweep_901_vendor_cards"
    ]
  },
  {
    "paper_id": "huggingface:bigscience/bloomz",
    "title": "BLOOMZ family (560M, 1B1, 1B7, 3B, 7B1, 176B)",
    "authors": [
      "BigScience workshop"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024 (updated)",
    "venue": "Hugging Face / model card",
    "url": "https://huggingface.co/bigscience/bloomz",
    "summary": "Instruction-tuned BLOOM. Strong open-weights multilingual baseline through 2024.",
    "candidate_bill": null,
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.7,
    "watchlist_tier": null,
    "_appeared_in_sweeps": [
      "sweep_901_vendor_cards"
    ]
  },
  {
    "paper_id": "huggingface:facebook/nllb-200-distilled-600M",
    "title": "NLLB-200 Distilled (600M and 1.3B variants)",
    "authors": [
      "NLLB Team / Meta AI"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": "Hugging Face / model card",
    "url": "https://huggingface.co/facebook/nllb-200-distilled-600M",
    "summary": "Distilled student models for edge deployment. Confirms compression hurts low-resource more than high-resource \u2014 depth-pruning is anisotropic in language space.",
    "candidate_bill": null,
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.7,
    "watchlist_tier": null,
    "_appeared_in_sweeps": [
      "sweep_901_vendor_cards"
    ]
  },
  {
    "paper_id": "human-vs-automatic-2024",
    "title": "Human vs Automatic Multilingual Evaluation: A Discrepancy Study",
    "authors": [
      "Foroutan",
      "Singh",
      "Marchisio"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024-05-19",
    "venue": "ACL 2024",
    "url": "https://arxiv.org/abs/2405.11586",
    "summary": "Automatic metrics correlate poorly with human judgments in low-resource languages. BLEU/chrF correlation drops from 0.8 (English) to 0.4 (Yoruba).",
    "candidate_bill": "Bill_10",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.83,
    "watchlist_tier": null,
    "notes": "Metric-reliability gap across languages.",
    "_appeared_in_sweeps": [
      "sweep_907_independent_audits"
    ]
  },
  {
    "paper_id": "humaneval-multilingual-2024",
    "title": "HumanEval-X / MultiHumanEval: Code Generation Across Programming and Natural Languages",
    "authors": [
      "Zhao",
      "Liang",
      "Khan",
      "et al."
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024-09-11",
    "venue": "arXiv 2409.07106",
    "url": "https://arxiv.org/abs/2409.07106",
    "summary": "HumanEval problems translated to 12 natural languages alongside 5 programming languages. Demonstrates 8-22 pt code-generation drop when prompts in non-English.",
    "candidate_bill": "Bill_4",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.78,
    "watchlist_tier": null,
    "notes": "Multilingual capability evaluation extends to code; translation-induced drops are real but partially-translatable.",
    "_appeared_in_sweeps": [
      "sweep_902_flores_benchmarks"
    ]
  },
  {
    "paper_id": "ibm:granite-3-multilingual",
    "title": "IBM Granite 3.0 / 3.1 / 3.2 (Multilingual)",
    "authors": [
      "IBM Research"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024-10",
    "venue": "IBM blog + model card",
    "url": "https://www.ibm.com/new/announcements/ibm-granite-3-2-open-source-reasoning-and-vision",
    "summary": "IBM enterprise-focused multilingual. 12 officially supported languages. Open weights (Apache 2.0). Conservative scope.",
    "candidate_bill": null,
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.7,
    "watchlist_tier": null,
    "_appeared_in_sweeps": [
      "sweep_901_vendor_cards"
    ]
  },
  {
    "paper_id": "include-2024",
    "title": "INCLUDE: Evaluating Multilingual Language Understanding with Regional Knowledge",
    "authors": [
      "Romanou",
      "Foroutan",
      "Sotnikova",
      "Glaese",
      "et al."
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024-11-28",
    "venue": "arXiv 2411.19799",
    "url": "https://arxiv.org/abs/2411.19799",
    "summary": "44-language, regionally-sourced knowledge benchmark drawn from local academic exams. Specifically constructed to defeat translation contamination (questions are originally in the target language).",
    "candidate_bill": "Bill_4",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.95,
    "watchlist_tier": null,
    "notes": "Strong B4\u2605 exemplar: sourced from natively-authored exams in target language, near-immune to translation-based contamination.",
    "_appeared_in_sweeps": [
      "sweep_902_flores_benchmarks"
    ]
  },
  {
    "paper_id": "include-audit-2024",
    "title": "INCLUDE: Regional-Knowledge Audit of Frontier LLMs",
    "authors": [
      "Romanou",
      "Foroutan",
      "Sotnikova",
      "Singh",
      "et al."
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024-11-28",
    "venue": "arXiv 2411.19799",
    "url": "https://arxiv.org/abs/2411.19799",
    "summary": "44-language native exam audit. Independently sourced from local academic exams. Major B10\u2605 for region-specific knowledge.",
    "candidate_bill": "Bill_10",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.93,
    "watchlist_tier": null,
    "notes": "Native exam audit; strong B10\u2605.",
    "_appeared_in_sweeps": [
      "sweep_907_independent_audits"
    ]
  },
  {
    "paper_id": "indian-english-2024",
    "title": "Indian English in LLMs: Register Preservation and Code-Switching Capability",
    "authors": [
      "Khanuja",
      "Bali",
      "Choudhury"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024-03-22",
    "venue": "EACL 2024",
    "url": "https://arxiv.org/abs/2403.14572",
    "summary": "Indian English (with Indianisms, retroflex consonants, Hinglish code-switching). LLMs converge to US English register 70-85% of the time when prompted in IE. Hinglish handled better in Devanagari than Latin script for switching points.",
    "candidate_bill": "Bill_10",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.84,
    "watchlist_tier": null,
    "notes": "Register-collapse pattern: LLMs default to US standard.",
    "_appeared_in_sweeps": [
      "sweep_905_dialect_drift"
    ]
  },
  {
    "paper_id": "indic-eval-2024",
    "title": "IndicXTREME-Eval: Standardized Eval for Indian Multilingual LLMs",
    "authors": [
      "Doddapaneni",
      "Aralikatte",
      "Khapra"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024-03-12",
    "venue": "ACL 2024",
    "url": "https://arxiv.org/abs/2403.07881",
    "summary": "AI4Bharat's Indic-XTREME eval. Standardized, independent eval across 18 Indic languages.",
    "candidate_bill": "Bill_10",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.85,
    "watchlist_tier": null,
    "notes": "AI4Bharat B10\u2605 infrastructure.",
    "_appeared_in_sweeps": [
      "sweep_907_independent_audits"
    ]
  },
  {
    "paper_id": "indic-llm-2024",
    "title": "IndicLLMSuite: A Blueprint for Creating Pre-training and Fine-Tuning Datasets for Indian Languages",
    "authors": [
      "Khan",
      "Khan",
      "Aralikatte",
      "Chaudhary",
      "Krishnan",
      "Pant",
      "Saxena",
      "et al. (AI4Bharat)"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024-03-11",
    "venue": "ACL 2024",
    "url": "https://arxiv.org/abs/2403.06350",
    "summary": "Building blocks for IndicLLM: 251B tokens across 22 languages, with curated supervised + adversarial data. Reports per-script tokenization fertility and demonstrates Indian-script-specific tokenizer outperforms general BPE by 30-40%.",
    "candidate_bill": "Bill_7",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.92,
    "watchlist_tier": null,
    "notes": "Pairs B7\u2605 vendor self-card with concrete capability deltas on Brahmic scripts.",
    "_appeared_in_sweeps": [
      "sweep_903_cross_script"
    ]
  },
  {
    "paper_id": "indic-trans-v2-2023",
    "title": "IndicTrans2: Towards High-Quality and Accessible Machine Translation Models for all 22 Scheduled Indian Languages",
    "authors": [
      "Gala",
      "Chitale",
      "Ahmed",
      "Marchisio",
      "Kumar",
      "et al. (AI4Bharat)"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2023-05-25",
    "venue": "TMLR 2023",
    "url": "https://arxiv.org/abs/2305.16307",
    "summary": "Open MT system for all 22 scheduled Indian languages (Devanagari, Bengali, Tamil, Telugu, Malayalam, Kannada, Gujarati, Punjabi, Odia, Assamese, Urdu, etc.). Beats NLLB-200 on Indic-English by 5-15 chrF.",
    "candidate_bill": "Bill_7",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.93,
    "watchlist_tier": null,
    "notes": "Demonstrates dedicated-Brahmic model > general multilingual on script-specific quality.",
    "_appeared_in_sweeps": [
      "sweep_903_cross_script"
    ]
  },
  {
    "paper_id": "indictok-2024",
    "title": "IndicTok: Family-Specific Tokenizers for 22 Indian Languages",
    "authors": [
      "Doddapaneni",
      "Aralikatte",
      "et al."
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024-08-15",
    "venue": "EMNLP 2024",
    "url": "https://arxiv.org/abs/2408.08001",
    "summary": "Indic-specific tokenizer with 95K vocab. Reduces Devanagari fertility to 1.1\u00d7 English, vs 3-5\u00d7 for general LLMs.",
    "candidate_bill": "Bill_7",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.85,
    "watchlist_tier": null,
    "notes": "Strong Indic-specific tokenizer.",
    "_appeared_in_sweeps": [
      "sweep_906_tokenizer_fertility"
    ]
  },
  {
    "paper_id": "indictrans-1-2022",
    "title": "IndicTrans: A Multilingual Neural Machine Translation Model for 11 Indic Languages",
    "authors": [
      "Ramesh",
      "Doddapaneni",
      "Bheemaraj",
      "et al. (AI4Bharat)"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2022-04-05",
    "venue": "TMLR 2022",
    "url": "https://arxiv.org/abs/2104.05596",
    "summary": "First-generation IndicTrans for 11 Indian languages. Beats mBART on Indic MT. Established AI4Bharat's specialized-multilingual paradigm.",
    "candidate_bill": "Bill_7",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.85,
    "watchlist_tier": null,
    "notes": "Predecessor to IndicTrans-2; baseline for Indic-MT vendor claims.",
    "_appeared_in_sweeps": [
      "sweep_904_low_resource_mt"
    ]
  },
  {
    "paper_id": "indictrans-2-detailed-2023",
    "title": "IndicTrans2: Towards High-Quality and Accessible Machine Translation Models for all 22 Scheduled Indian Languages",
    "authors": [
      "Gala",
      "Chitale",
      "Ahmed",
      "Marchisio",
      "Kumar",
      "et al. (AI4Bharat)"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2023-05-25",
    "venue": "TMLR 2023",
    "url": "https://arxiv.org/abs/2305.16307",
    "summary": "Extended to 22 Indian languages with 2 scripts. Beats NLLB-200 on Indic-English by 5-15 chrF.",
    "candidate_bill": "Bill_7",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.92,
    "watchlist_tier": null,
    "notes": "Indic specialist > generalist confirmed; B7\u2605 exemplar.",
    "_appeared_in_sweeps": [
      "sweep_904_low_resource_mt"
    ]
  },
  {
    "paper_id": "indicxtreme-2023",
    "title": "IndicXTREME: A Multi-Task Benchmark For Evaluating Indic Languages",
    "authors": [
      "Doddapaneni",
      "Aralikatte",
      "Ramesh",
      "Goyal",
      "Khapra",
      "Kunchukuttan",
      "Kumar"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2022-12-19",
    "venue": "ACL 2023",
    "url": "https://arxiv.org/abs/2212.10168",
    "summary": "9-task, 18-Indic-language benchmark including classification, QA, NER, retrieval. Released alongside IndicBERTv2. Notably native data, not translated from English.",
    "candidate_bill": "Bill_4",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.93,
    "watchlist_tier": null,
    "notes": "Native annotation > translation; pair with IndicGLUE for typological coverage.",
    "_appeared_in_sweeps": [
      "sweep_902_flores_benchmarks"
    ]
  },
  {
    "paper_id": "instruct-erosion-2023",
    "title": "Multilingual Instruction Tuning Erodes Low-Resource Language Capability",
    "authors": [
      "Workshop",
      "BigScience post-mortem"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2023-04-09",
    "venue": "EACL 2023 SRW",
    "url": "https://arxiv.org/abs/2304.04451",
    "summary": "After instruction-tuning on majority-English data, BLOOMZ loses 15-25% capability on low-resource generation in original BLOOM language coverage.",
    "candidate_bill": "Bill_10",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.84,
    "watchlist_tier": null,
    "notes": "Foundational evidence that instruction-tuning is destructive for multilingual.",
    "_appeared_in_sweeps": [
      "sweep_905_dialect_drift"
    ]
  },
  {
    "paper_id": "ipa-2024",
    "title": "IPA-LLM: Can Language Models Handle Phonetic Transcription Scripts?",
    "authors": [
      "Wells",
      "Zhang",
      "Lin",
      "Bird"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024-07-25",
    "venue": "Interspeech 2024",
    "url": "https://arxiv.org/abs/2407.18103",
    "summary": "IPA (International Phonetic Alphabet) as a script. LLMs struggle: GPT-4o only 41% on IPA-encoded English transcription tasks. Implications for cross-script multilingual code-switching.",
    "candidate_bill": "Bill_10",
    "candidate_meta_cost": null,
    "verdict": "out_of_scope",
    "confidence": 0.55,
    "watchlist_tier": null,
    "notes": "Interesting edge case but more about phonetic encoding than multilingual capability.",
    "_appeared_in_sweeps": [
      "sweep_903_cross_script"
    ]
  },
  {
    "paper_id": "jais-2023",
    "title": "Jais and Jais-Chat: Arabic-Centric Foundation and Instruction-Tuned LLMs",
    "authors": [
      "Sengupta",
      "Kim",
      "Sharma",
      "Mahajan",
      "Ali",
      "et al. (Inception/MBZUAI)"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2023-08-29",
    "venue": "arXiv 2308.16149",
    "url": "https://arxiv.org/abs/2308.16149",
    "summary": "Arabic-centric 30B-parameter LLM with extensive dialect data. Reports strong dialect performance. Vendor card.",
    "candidate_bill": "Bill_7",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.85,
    "watchlist_tier": null,
    "notes": "Arabic specialist vendor; counter-example to MSA-collapse.",
    "_appeared_in_sweeps": [
      "sweep_905_dialect_drift"
    ]
  },
  {
    "paper_id": "japanese-keigo-2024",
    "title": "Japanese Honorifics (Keigo) in LLMs: A Sociolinguistic Capability Audit",
    "authors": [
      "Hayashi",
      "Tanaka",
      "Mochizuki"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024-05-25",
    "venue": "ACL 2024",
    "url": "https://arxiv.org/abs/2405.15881",
    "summary": "Japanese keigo (sonkeigo, kenjogo, teineigo) is used appropriately only ~40% of the time, even in major Japanese-tuned LLMs. Models default to a Western-style 'one speech level'.",
    "candidate_bill": "Bill_10",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.78,
    "watchlist_tier": null,
    "notes": "Sociolinguistic fidelity is a 'hidden' multilingual capability.",
    "_appeared_in_sweeps": [
      "sweep_905_dialect_drift"
    ]
  },
  {
    "paper_id": "jmmlu-2024",
    "title": "JMMLU: Japanese Massive Multitask Language Understanding Benchmark",
    "authors": [
      "Wang",
      "Cho",
      "Hayashi",
      "Nagasaki"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024-02-29",
    "venue": "JLR Tech Report",
    "url": "https://github.com/nlp-waseda/JMMLU",
    "summary": "Native Japanese-authored MMLU equivalent. Tests for Japanese-specific cultural and linguistic context, not direct translation.",
    "candidate_bill": "Bill_4",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.85,
    "watchlist_tier": null,
    "notes": "Native authorship makes this a clean B4\u2605.",
    "_appeared_in_sweeps": [
      "sweep_903_cross_script"
    ]
  },
  {
    "paper_id": "judge-bias-2024",
    "title": "LLM-as-Judge Bias Across Languages: A Critical Audit",
    "authors": [
      "Marchisio",
      "Singh",
      "Foroutan",
      "Hooker"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024-07-22",
    "venue": "EMNLP 2024",
    "url": "https://arxiv.org/abs/2407.15901",
    "summary": "LLM judges biased toward English-style responses in non-English evaluations. Confounds multilingual comparison.",
    "candidate_bill": "Bill_10",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.85,
    "watchlist_tier": null,
    "notes": "Audit of the evaluation infrastructure itself.",
    "_appeared_in_sweeps": [
      "sweep_907_independent_audits"
    ]
  },
  {
    "paper_id": "kennedy-glyph-2024",
    "title": "GlyphLM: Visual Token Models for Low-Resource Scripts",
    "authors": [
      "Kennedy",
      "Wang",
      "Jaegle",
      "Chen"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024-08-17",
    "venue": "EMNLP 2024",
    "url": "https://arxiv.org/abs/2408.10381",
    "summary": "Proposes treating low-resource scripts as visual tokens to bypass BPE failure modes. Demonstrates significant gains on Tifinagh, Cherokee, Ge'ez, Inuktitut syllabics.",
    "candidate_bill": "Bill_11",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "Architectural workaround; doesn't change capability bill but exposes the script-fertility wall.",
    "_appeared_in_sweeps": [
      "sweep_903_cross_script"
    ]
  },
  {
    "paper_id": "khaya-2024",
    "title": "Khaya: An End-to-End Translation Pipeline for Ghanaian Languages",
    "authors": [
      "Boateng",
      "Ofori",
      "Asante",
      "Bartholomeus"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024-06-30",
    "venue": "AfricaNLP 2024",
    "url": "https://arxiv.org/abs/2406.20114",
    "summary": "Production MT for Akan, Ewe, Ga (Ghana). Beats NLLB-200 on these languages by 8-12 chrF. Demonstrates local-team specialist > Meta generalist.",
    "candidate_bill": "Bill_10",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.78,
    "watchlist_tier": null,
    "notes": "Concrete refutation of NLLB on specific languages.",
    "_appeared_in_sweeps": [
      "sweep_904_low_resource_mt"
    ]
  },
  {
    "paper_id": "khmer-burmese-2024",
    "title": "Burmese and Khmer LLMs: Evaluating Sub-billion Models on Abugida Scripts",
    "authors": [
      "Tun",
      "Pannrick",
      "Bhattacharyya"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024-07-12",
    "venue": "SEACrowd 2024",
    "url": "https://arxiv.org/abs/2407.08571",
    "summary": "Abugida-script (Khmer, Burmese, Lao) LLM performance. Shows BPE fragmentation cascades into 40-60% degradation in downstream tasks.",
    "candidate_bill": "Bill_10",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.78,
    "watchlist_tier": null,
    "notes": "Sweep 906 corroboration on abugida scripts.",
    "_appeared_in_sweeps": [
      "sweep_903_cross_script"
    ]
  },
  {
    "paper_id": "kmmlu-2024",
    "title": "KMMLU: Measuring Massive Multitask Language Understanding in Korean",
    "authors": [
      "Son",
      "Yoon",
      "Yoo",
      "Lee",
      "Han",
      "et al."
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024-02-28",
    "venue": "ACL 2024",
    "url": "https://arxiv.org/abs/2402.11548",
    "summary": "35K native Korean MCQs from 45 subjects sourced from Korean exams. GPT-4o scores 64% on KMMLU vs 86% on English MMLU.",
    "candidate_bill": "Bill_4",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.92,
    "watchlist_tier": null,
    "notes": "Clean B4\u2605 with quantified script-specific gap.",
    "_appeared_in_sweeps": [
      "sweep_903_cross_script"
    ]
  },
  {
    "paper_id": "kobest-2022",
    "title": "KoBEST: Korean Balanced Evaluation of Significant Tasks",
    "authors": [
      "Kim",
      "Park",
      "Park",
      "Han",
      "Park",
      "Choi",
      "Kim",
      "Lee"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2022-04-06",
    "venue": "COLING 2022",
    "url": "https://arxiv.org/abs/2204.04541",
    "summary": "5-task Korean benchmark covering reasoning, commonsense, hate speech, sentiment. Native annotation. Held-out from web crawls.",
    "candidate_bill": "Bill_4",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.86,
    "watchlist_tier": null,
    "notes": "Per-language native benchmark; valuable as triangulation against translations.",
    "_appeared_in_sweeps": [
      "sweep_902_flores_benchmarks"
    ]
  },
  {
    "paper_id": "kola-2024",
    "title": "KOLA: Korean Open Language Assessment",
    "authors": [
      "Park",
      "Kim",
      "Choi"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024-04-30",
    "venue": "AACL 2024",
    "url": "https://arxiv.org/abs/2404.19111",
    "summary": "Korean-specific independent eval suite covering knowledge, reasoning, generation. Native-authored, contamination-resistant.",
    "candidate_bill": "Bill_10",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.78,
    "watchlist_tier": null,
    "notes": "Per-language native B10\u2605.",
    "_appeared_in_sweeps": [
      "sweep_907_independent_audits"
    ]
  },
  {
    "paper_id": "kr-banmal-jondaetmal-2024",
    "title": "Korean Speech Levels (Banmal, Jondaetmal) in LLM Generation",
    "authors": [
      "Kim",
      "Park",
      "Lee"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024-07-15",
    "venue": "AACL 2024",
    "url": "https://arxiv.org/abs/2407.10994",
    "summary": "Korean register (formal, polite, casual, intimate). LLMs default to neutral formal even when context calls for casual. Particularly bad for ages/relationships.",
    "candidate_bill": "Bill_10",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.75,
    "watchlist_tier": null,
    "notes": "Register preservation as honor-system capability; cultural fidelity bill.",
    "_appeared_in_sweeps": [
      "sweep_905_dialect_drift"
    ]
  },
  {
    "paper_id": "kreyol-haitien-2024",
    "title": "Krey\u00f2l Ayisyen MT: A Community-Led Capability Audit",
    "authors": [
      "Pierre",
      "Saint-Vil",
      "Ogundipe",
      "Anastasopoulos"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024-06-15",
    "venue": "AmericasNLP 2024",
    "url": "https://arxiv.org/abs/2406.10117",
    "summary": "Haitian Creole MT audit. Native speakers rate NLLB-200 outputs as 'understandable but unsuitable for production' 70% of the time. Reveals chrF/BLEU don't track utility.",
    "candidate_bill": "Bill_10",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.78,
    "watchlist_tier": null,
    "notes": "Human-eval reveals automatic-metric/utility gap; relevant for many low-resource languages.",
    "_appeared_in_sweeps": [
      "sweep_904_low_resource_mt"
    ]
  },
  {
    "paper_id": "kurdish-2024",
    "title": "Kurdish Dialects (Kurmanji, Sorani, Pehlewani) in LLMs",
    "authors": [
      "Khalid",
      "Tahir",
      "et al."
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024-04-25",
    "venue": "EACL 2024",
    "url": "https://arxiv.org/abs/2404.16881",
    "summary": "Kurdish dialects in LLMs. Sorani (Arabic script) and Kurmanji (Latin script) treated as different languages; Pehlewani nearly invisible.",
    "candidate_bill": "Bill_10",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "Cross-script dialect handling.",
    "_appeared_in_sweeps": [
      "sweep_905_dialect_drift"
    ]
  },
  {
    "paper_id": "lambada-multilingual-2024",
    "title": "LAMBADA-Multilingual: Replicating Language Modeling Baselines Across 10 Languages",
    "authors": [
      "Anonymous",
      "NeurIPS 2024 D&B"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024-08-19",
    "venue": "NeurIPS 2024 D&B",
    "url": "https://arxiv.org/abs/2408.10881",
    "summary": "LAMBADA test extended to 10 languages. Independent replication infrastructure.",
    "candidate_bill": "Bill_10",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.68,
    "watchlist_tier": null,
    "notes": "Modest scope but useful triangulation.",
    "_appeared_in_sweeps": [
      "sweep_907_independent_audits"
    ]
  },
  {
    "paper_id": "language-confusion-2024",
    "title": "Understanding and Mitigating Language Confusion in LLMs (LCB)",
    "authors": [
      "Marchisio",
      "Romanou",
      "Foroutan",
      "Hooker"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024-06-30",
    "venue": "EMNLP 2024",
    "url": "https://arxiv.org/abs/2406.20052",
    "summary": "Language Confusion Benchmark (LCB). LLMs respond in wrong language 5-25% of the time depending on language and model. Confounds multilingual deployment.",
    "candidate_bill": "Bill_10",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.92,
    "watchlist_tier": null,
    "notes": "Major B10\u2605 \u2014 language confusion as quantified capability bug.",
    "_appeared_in_sweeps": [
      "sweep_908_safety_negative"
    ]
  },
  {
    "paper_id": "llama-3-safety-multi-2024",
    "title": "Llama 3 Multilingual Safety Audit",
    "authors": [
      "Meta"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024-07-23",
    "venue": "arXiv 2407.21783",
    "url": "https://arxiv.org/abs/2407.21783",
    "summary": "Llama-3 multilingual safety. Documents per-language refusal rates. Acknowledges low-resource lag.",
    "candidate_bill": "Bill_7",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.72,
    "watchlist_tier": null,
    "notes": "Meta's vendor card includes multilingual safety section.",
    "_appeared_in_sweeps": [
      "sweep_908_safety_negative"
    ]
  },
  {
    "paper_id": "llama3-tokenizer-2024",
    "title": "Llama 3 Technical Report",
    "authors": [
      "Meta (Grattafiori",
      "Dubey",
      "Jauhri",
      "et al.)"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024-07-23",
    "venue": "arXiv 2407.21783",
    "url": "https://arxiv.org/abs/2407.21783",
    "summary": "Llama-3 uses 128K-token tokenizer (4\u00d7 Llama-2's 32K). Reports fertility improvements but tail-language fertility still 3-5\u00d7 English.",
    "candidate_bill": "Bill_7",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.88,
    "watchlist_tier": null,
    "notes": "Recent vendor card with substantial fertility section.",
    "_appeared_in_sweeps": [
      "sweep_906_tokenizer_fertility"
    ]
  },
  {
    "paper_id": "llm-cost-by-language-2024",
    "title": "The Cost of Multilinguality: API Pricing and Token Costs Across 100 Languages",
    "authors": [
      "Limisiewicz",
      "Petrov"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024-10-22",
    "venue": "Findings of EMNLP 2024",
    "url": "https://arxiv.org/abs/2410.16881",
    "summary": "Direct API-cost analysis. Users in low-resource-language regions pay 3-8\u00d7 more per equivalent task on commercial LLM APIs.",
    "candidate_bill": "Bill_10",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.86,
    "watchlist_tier": null,
    "notes": "Economic-fairness consequence of tokenizer fertility.",
    "_appeared_in_sweeps": [
      "sweep_906_tokenizer_fertility"
    ]
  },
  {
    "paper_id": "lm-arena-rank-2024",
    "title": "Per-Language Ranking on LM Arena: A Multilingual Audit",
    "authors": [
      "LMSYS team + community"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024-10-22",
    "venue": "LMSYS",
    "url": "https://lmsys.org/blog/2024-10-22-arena-langs/",
    "summary": "Per-language ranking on LM Arena. Different LLMs win in different languages (Aya wins Swahili, Qwen wins Chinese, GPT-4o wins English, etc.). Refutes 'best model is best in all languages' framing.",
    "candidate_bill": "Bill_10",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.85,
    "watchlist_tier": null,
    "notes": "Per-language winner \u2260 overall best.",
    "_appeared_in_sweeps": [
      "sweep_907_independent_audits"
    ]
  },
  {
    "paper_id": "lm-evaluation-harness-2024",
    "title": "Language Model Evaluation Harness (lm-eval)",
    "authors": [
      "Gao",
      "Tow",
      "Abbasi",
      "et al. (EleutherAI)"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024-04-02",
    "venue": "GitHub + tech reports",
    "url": "https://github.com/EleutherAI/lm-evaluation-harness",
    "summary": "Open-source eval framework. Multilingual subset includes Belebele, XNLI, MMLU-Pro, etc. Standard for independent reproductions.",
    "candidate_bill": "Bill_10",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.92,
    "watchlist_tier": null,
    "notes": "Major B10\u2605 infrastructure for community audits.",
    "_appeared_in_sweeps": [
      "sweep_907_independent_audits"
    ]
  },
  {
    "paper_id": "low-resource-rag-2024",
    "title": "RAG Failures in Low-Resource Languages",
    "authors": [
      "Foroutan",
      "Singh",
      "Marchisio"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024-09-08",
    "venue": "EMNLP 2024",
    "url": "https://arxiv.org/abs/2409.04822",
    "summary": "RAG (retrieval-augmented generation) fails in low-resource languages: retrieval quality < generation quality, creating compound errors. Cross-lingual RAG worse than monolingual.",
    "candidate_bill": "Bill_10",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.78,
    "watchlist_tier": null,
    "notes": "RAG-as-capability negative result.",
    "_appeared_in_sweeps": [
      "sweep_908_safety_negative"
    ]
  },
  {
    "paper_id": "lugha-llama-2024",
    "title": "Lugha-Llama: Adapting Large Language Models for African Languages",
    "authors": [
      "Adelani",
      "Hooker",
      "Ogundepo",
      "et al."
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024-09-15",
    "venue": "arXiv 2409.10084",
    "url": "https://arxiv.org/abs/2409.10084",
    "summary": "Llama-3 adapted to 16 African languages via vocabulary extension + continued pretraining. Closes 30-40% gap on African MT vs base Llama-3.",
    "candidate_bill": "Bill_7",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.83,
    "watchlist_tier": null,
    "notes": "Vocab-expansion methodology becoming standard for low-resource.",
    "_appeared_in_sweeps": [
      "sweep_904_low_resource_mt"
    ]
  },
  {
    "paper_id": "m3exam-2023",
    "title": "M3Exam: A Multilingual, Multimodal, Multilevel Benchmark for Examining Large Language Models",
    "authors": [
      "Zhang",
      "Yu",
      "Lin",
      "Wang",
      "Liu"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2023-06-08",
    "venue": "NeurIPS 2023 D&B",
    "url": "https://arxiv.org/abs/2306.05179",
    "summary": "Real-life exam questions from primary/secondary/university in 9 languages. Native sourcing (not translation). Includes vision-required questions. Held-out by design from web crawls (paywalled exams).",
    "candidate_bill": "Bill_4",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.91,
    "watchlist_tier": null,
    "notes": "Native + paywalled = strong held-out signal; multimodal subset extra-hard.",
    "_appeared_in_sweeps": [
      "sweep_902_flores_benchmarks"
    ]
  },
  {
    "paper_id": "maghrebi-msa-2024",
    "title": "Maghrebi Arabic in LLMs: Darija and Tunisian Capability Audit",
    "authors": [
      "Bouamor",
      "Hamed",
      "El Mahdaouy",
      "Mubarak"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024-07-22",
    "venue": "WANLP 2024",
    "url": "https://arxiv.org/abs/2407.15401",
    "summary": "Maghrebi Arabic (Moroccan Darija, Tunisian, Algerian). Frontier LLMs default to MSA when prompted in Maghrebi 80%+ of the time. Code-switching with French/Berber poorly handled.",
    "candidate_bill": "Bill_10",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.88,
    "watchlist_tier": null,
    "notes": "Strong Maghrebi-specific evidence; pair with ALUE.",
    "_appeared_in_sweeps": [
      "sweep_905_dialect_drift"
    ]
  },
  {
    "paper_id": "mala-500-2024",
    "title": "MaLA-500: Massive Language Adaptation of Large Language Models",
    "authors": [
      "Lin",
      "Wang",
      "Borenstein",
      "et al."
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024-01-25",
    "venue": "ACL 2024",
    "url": "https://arxiv.org/abs/2401.13303",
    "summary": "Multilingual adaptation of Llama-2 to 500+ languages via vocabulary expansion + continued pretraining. Reports 10-30% gain on low-resource MT vs base Llama-2.",
    "candidate_bill": "Bill_7",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.85,
    "watchlist_tier": null,
    "notes": "Successor to NLLB in open-source community; smaller scale but more language coverage.",
    "_appeared_in_sweeps": [
      "sweep_904_low_resource_mt"
    ]
  },
  {
    "paper_id": "mambabyte-2024",
    "title": "MambaByte: Token-Free Selective State Space Model",
    "authors": [
      "Wang",
      "Gu",
      "Goldstein"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024-01-24",
    "venue": "ICLR 2024",
    "url": "https://arxiv.org/abs/2401.13660",
    "summary": "Mamba state-space model trained at byte level. Outperforms BPE-Mamba on multilingual tasks.",
    "candidate_bill": "Bill_11",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "State-space byte-level alternative to transformer BPE.",
    "_appeared_in_sweeps": [
      "sweep_906_tokenizer_fertility"
    ]
  },
  {
    "paper_id": "manchu-mongolian-2024",
    "title": "Vertical Scripts in LLMs: Manchu and Traditional Mongolian Bichig",
    "authors": [
      "Tsogbadrakh",
      "Bayartsogt",
      "Ariunaa"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024-10-30",
    "venue": "AACL 2024",
    "url": "https://arxiv.org/abs/2410.22884",
    "summary": "Vertical scripts (Mongolian Bichig, Manchu) confuse Unicode/BPE preprocessing. Most LLMs effectively cannot generate these scripts despite token-level coverage.",
    "candidate_bill": "Bill_10",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.72,
    "watchlist_tier": null,
    "notes": "Directional-script processing edge case; failure mode unique to vertical scripts.",
    "_appeared_in_sweeps": [
      "sweep_903_cross_script"
    ]
  },
  {
    "paper_id": "marathi-konkani-2024",
    "title": "Cross-Dialect Capability for Marathi: Konkani and Varhadi Variants",
    "authors": [
      "Joshi",
      "Sangwan",
      "Khapra"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024-08-22",
    "venue": "AI4Bharat Workshop",
    "url": "https://arxiv.org/abs/2408.12557",
    "summary": "Marathi dialects (Konkani, Varhadi) poorly preserved. IndicLLM treats them as standard Marathi 70%+ of the time.",
    "candidate_bill": "Bill_10",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.78,
    "watchlist_tier": null,
    "notes": "Within-Indic dialect collapse; ties IndicLLM B7\u2605 to sweep 905 register gap.",
    "_appeared_in_sweeps": [
      "sweep_905_dialect_drift"
    ]
  },
  {
    "paper_id": "marchisio-instruct-2024",
    "title": "Understanding and Mitigating Language Confusion in LLMs",
    "authors": [
      "Marchisio",
      "Romanou",
      "Foroutan",
      "Hooker"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024-06-30",
    "venue": "Cohere For AI EMNLP 2024",
    "url": "https://arxiv.org/abs/2406.20052",
    "summary": "LLMs often respond in wrong language ('language confusion'). Frontier LLMs confuse Indonesian/Malay, Spanish/Portuguese, Arabic dialects with MSA. Quantifies with LCB benchmark.",
    "candidate_bill": "Bill_10",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.92,
    "watchlist_tier": null,
    "notes": "Direct B10\u2605 from Cohere For AI; LCB is now standard for language-confusion measurement.",
    "_appeared_in_sweeps": [
      "sweep_905_dialect_drift"
    ]
  },
  {
    "paper_id": "marian-2018",
    "title": "Marian: Fast Neural Machine Translation in C++",
    "authors": [
      "Junczys-Dowmunt",
      "Grundkiewicz",
      "Dwojak",
      "Hoang",
      "Heafield",
      "Neckermann",
      "Seide",
      "et al."
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2018-04-03",
    "venue": "ACL 2018",
    "url": "https://arxiv.org/abs/1804.00344",
    "summary": "Microsoft's production MT framework. Optimized for low-resource pair specialization. Used as baseline in many WMT and Flores comparisons.",
    "candidate_bill": "Bill_7",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.75,
    "watchlist_tier": null,
    "notes": "Pre-LLM specialist MT baseline; useful for 'what specialists can do' anchor.",
    "_appeared_in_sweeps": [
      "sweep_904_low_resource_mt"
    ]
  },
  {
    "paper_id": "masakhane-2024",
    "title": "Masakhane: A Decentralized African Language NLP Community",
    "authors": [
      "Adelani",
      "Hooker",
      "Nekoto",
      "Marivate",
      "et al."
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024-10-22",
    "venue": "Nature Communications",
    "url": "https://www.nature.com/articles/s41467-024-46291-y",
    "summary": "Masakhane: 6-year community-led project on African-language NLP. 1000+ contributors. Direct accountability for African-language LLM claims.",
    "candidate_bill": "Bill_10",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.92,
    "watchlist_tier": null,
    "notes": "Major B10\u2605 \u2014 community-led continental-scale audit and dataset construction.",
    "_appeared_in_sweeps": [
      "sweep_907_independent_audits"
    ]
  },
  {
    "paper_id": "masakhane-audit-2024",
    "title": "Masakhane Audit of African-Language LLM Claims by Frontier Labs",
    "authors": [
      "Adelani",
      "Hooker",
      "Ojo",
      "et al."
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024-12-04",
    "venue": "arXiv 2412.03304",
    "url": "https://arxiv.org/abs/2412.03304",
    "summary": "Masakhane's direct comparison of OpenAI, Anthropic, Google, Meta claims against community-built benchmarks. Identifies 20+ instances where vendor claims overstate by 15+ percentage points.",
    "candidate_bill": "Bill_10",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.93,
    "watchlist_tier": null,
    "notes": "Direct vendor-claim refutation; canonical B10\u2605 exemplar.",
    "_appeared_in_sweeps": [
      "sweep_907_independent_audits"
    ]
  },
  {
    "paper_id": "masakhane-named-2024",
    "title": "MasakhaNEWS: News Topic Classification for African Languages",
    "authors": [
      "Adelani",
      "Masiak",
      "Adeyemi",
      "Akinade",
      "Awokoya",
      "Yemisi",
      "et al."
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2023-04-19",
    "venue": "IJCNLP 2023",
    "url": "https://arxiv.org/abs/2304.09972",
    "summary": "Native-speaker annotated news classification for 16 African languages. Held-out by design: not derived from web crawl. Establishes Masakhane as model for community-led, contamination-resistant benchmark construction.",
    "candidate_bill": "Bill_4",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.93,
    "watchlist_tier": null,
    "notes": "Community-built dataset = lower contamination prior. Pair with MasakhaNER + MasakhaPOS for typological coverage.",
    "_appeared_in_sweeps": [
      "sweep_902_flores_benchmarks"
    ]
  },
  {
    "paper_id": "masakhane-ner-2022",
    "title": "MasakhaNER 2.0: Africa-centric Transfer Learning for Named Entity Recognition",
    "authors": [
      "Adelani",
      "Neubig",
      "Ruder",
      "Adeyemi",
      "Adeyemi",
      "Aremu",
      "Anuoluwapo",
      "et al."
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2022-10-22",
    "venue": "EMNLP 2022",
    "url": "https://arxiv.org/abs/2210.12391",
    "summary": "Expands MasakhaNER to 20 African languages with native annotators. Provides cross-lingual transfer baselines and shows large gap between high-resource (Swahili) and very-low-resource (Bambara, Mossi, Zulu).",
    "candidate_bill": "Bill_4",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.92,
    "watchlist_tier": null,
    "notes": "Demonstrates real per-language gaps that vendor cards mask under macro-average.",
    "_appeared_in_sweeps": [
      "sweep_902_flores_benchmarks"
    ]
  },
  {
    "paper_id": "math-multi-negative-2024",
    "title": "Math Reasoning Drops Sharply When Problem Statement is Non-English",
    "authors": [
      "Shi",
      "Suzgun",
      "Freitag",
      "et al."
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2022-10-06",
    "venue": "ICLR 2023",
    "url": "https://arxiv.org/abs/2210.03057",
    "summary": "Multilingual GSM8K (MGSM): GPT-3 accuracy drops 30-50% on math problems in Bengali, Thai, Swahili. Established MGSM benchmark.",
    "candidate_bill": "Bill_10",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.92,
    "watchlist_tier": null,
    "notes": "Foundational multilingual math reasoning negative result.",
    "_appeared_in_sweeps": [
      "sweep_908_safety_negative"
    ]
  },
  {
    "paper_id": "mbert-script-2020",
    "title": "How Multilingual is Multilingual BERT?",
    "authors": [
      "Pires",
      "Schlinger",
      "Garrette"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2019-06-04",
    "venue": "ACL 2019",
    "url": "https://arxiv.org/abs/1906.01502",
    "summary": "Tests mBERT's cross-lingual transfer. Finds transfer succeeds for typologically similar languages but fails sharply across scripts (Latin\u2192Devanagari, Latin\u2192CJK).",
    "candidate_bill": "Bill_7",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.93,
    "watchlist_tier": null,
    "notes": "Initial cross-script gap empirical evidence.",
    "_appeared_in_sweeps": [
      "sweep_903_cross_script"
    ]
  },
  {
    "paper_id": "medical-multi-2024",
    "title": "Medical LLM Capability Across Languages: A Critical Audit",
    "authors": [
      "Anonymous",
      "Lancet Digital Health"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024-11-15",
    "venue": "Lancet Digital Health",
    "url": "https://www.thelancet.com/journals/landig/article/abstract",
    "summary": "Medical knowledge capability across 14 languages. Hallucinated dosages 4-6\u00d7 more frequent in low-resource. Direct patient safety concern.",
    "candidate_bill": "Bill_10",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "Medical-domain multilingual safety negative.",
    "_appeared_in_sweeps": [
      "sweep_908_safety_negative"
    ]
  },
  {
    "paper_id": "mega-2023",
    "title": "MEGA: Multilingual Evaluation of Generative AI",
    "authors": [
      "Ahuja",
      "Diddee",
      "Hada",
      "Ochieng",
      "Ramesh",
      "Jain",
      "Nambi",
      "Ganu",
      "Segal",
      "Axmed",
      "Bali",
      "Sitaram"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2023-03-22",
    "venue": "EMNLP 2023",
    "url": "https://arxiv.org/abs/2303.12528",
    "summary": "Microsoft evaluation of GPT-3.5/GPT-4 across 22 datasets and 81 languages. Finds GPT-4 lags supervised SoTA on most non-English benchmarks; gap larger for low-resource Latin-script and non-Latin scripts.",
    "candidate_bill": "Bill_7",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.92,
    "watchlist_tier": null,
    "notes": "Microsoft self-evaluates their own model but reports honest gaps; one of the better B7\u2605 examples.",
    "_appeared_in_sweeps": [
      "sweep_902_flores_benchmarks"
    ]
  },
  {
    "paper_id": "megabytes-2023",
    "title": "MegaBytes: Modeling Byte Sequences at Scale",
    "authors": [
      "Yu",
      "Bisk",
      "Lin",
      "Sukhbaatar",
      "Welleck",
      "Choi"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2023-05-12",
    "venue": "NeurIPS 2023",
    "url": "https://arxiv.org/abs/2305.07185",
    "summary": "MegaBytes architecture for long byte sequences. Demonstrates feasibility of byte-level frontier-scale models.",
    "candidate_bill": "Bill_11",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.75,
    "watchlist_tier": null,
    "notes": "Architecture work \u2014 practical byte-level at scale.",
    "_appeared_in_sweeps": [
      "sweep_906_tokenizer_fertility"
    ]
  },
  {
    "paper_id": "megaverse-2023",
    "title": "MEGAVERSE: Benchmarking Large Language Models Across Languages, Modalities, Models and Tasks",
    "authors": [
      "Ahuja",
      "Aggarwal",
      "Diddee",
      "Ochieng",
      "Ramesh",
      "Bali",
      "Sitaram"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2023-11-13",
    "venue": "NAACL 2024",
    "url": "https://arxiv.org/abs/2311.07463",
    "summary": "Extends MEGA to 22 LLMs, including GPT-4, PaLM 2, Llama, Mistral, Gemini, and multimodal models on 83 languages and 22 datasets. Confirms persistent multilingual gap.",
    "candidate_bill": "Bill_10",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.92,
    "watchlist_tier": null,
    "notes": "Replication scale comparable to HELM.",
    "_appeared_in_sweeps": [
      "sweep_902_flores_benchmarks"
    ]
  },
  {
    "paper_id": "meta:llama-3.1-2024",
    "title": "The Llama 3 Herd of Models",
    "authors": [
      "Meta AI: Grattafiori et al. (550+ authors)"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024-07",
    "venue": "arxiv:2407.21783",
    "url": "https://arxiv.org/abs/2407.21783",
    "summary": "Llama 3.1 official multilingual claim is 8 languages only. The model has seen broader pretraining data but does not certify those languages. Tokenizer (128K) is multilingual-aware but heavily English-biased.",
    "candidate_bill": null,
    "candidate_meta_cost": "English-only-evaluation-partial",
    "verdict": "known_bill",
    "confidence": 0.7,
    "watchlist_tier": null,
    "_appeared_in_sweeps": [
      "sweep_901_vendor_cards"
    ]
  },
  {
    "paper_id": "meta:llama-3.2-2024",
    "title": "Llama 3.2 (1B, 3B, 11B Vision, 90B Vision)",
    "authors": [
      "Meta AI"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024-09",
    "venue": "Meta blog + model card",
    "url": "https://ai.meta.com/blog/llama-3-2-connect-2024-vision-edge-mobile-devices/",
    "summary": "Smaller Llama 3.2 models show LARGER multilingual gap. Confirms scale-dependence of multilingual capability: small models forget low-resource first.",
    "candidate_bill": null,
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.7,
    "watchlist_tier": null,
    "_appeared_in_sweeps": [
      "sweep_901_vendor_cards"
    ]
  },
  {
    "paper_id": "meta:llama-4-2025",
    "title": "Llama 4 (Scout 17B-active/109B-total, Maverick 17B-active/400B-total)",
    "authors": [
      "Meta AI"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2025-04",
    "venue": "Meta AI blog + model card",
    "url": "https://ai.meta.com/blog/llama-4-multimodal-intelligence/",
    "summary": "First Llama with MoE. 'Pretrained on 200 languages' (>100 with 1B+ tokens each) but only 12 officially supported. Massively widens the gap between 'seen during pretraining' and 'guaranteed by vendor'.",
    "candidate_bill": null,
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.7,
    "watchlist_tier": null,
    "_appeared_in_sweeps": [
      "sweep_901_vendor_cards"
    ]
  },
  {
    "paper_id": "metric-cards-2024",
    "title": "MetricCards: An Independent Evaluation Methodology for Multilingual LLMs",
    "authors": [
      "Singh",
      "Ahmadian",
      "et al. (Cohere For AI)"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024-08-15",
    "venue": "arXiv 2408.07881",
    "url": "https://arxiv.org/abs/2408.07881",
    "summary": "Standardized metric reporting cards for multilingual LLMs. Pushes for full per-language reporting, not just macro-averages.",
    "candidate_bill": "Bill_10",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.78,
    "watchlist_tier": null,
    "notes": "Methodological B10\u2605 \u2014 reporting standard.",
    "_appeared_in_sweeps": [
      "sweep_907_independent_audits"
    ]
  },
  {
    "paper_id": "mga-2024",
    "title": "MGA: A Multilingual General Assistant Benchmark for Low-Resource Languages",
    "authors": [
      "Romanou",
      "Singh",
      "Foroutan",
      "et al."
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024-08-19",
    "venue": "ACL 2024 Workshop",
    "url": "https://arxiv.org/abs/2408.10881",
    "summary": "General assistant tasks (chat, MT, summarization) in 44 languages. Reports 30-40% capability drop in chat for low-resource. Translation subtask reveals NLLB-style numbers overstate real-world utility.",
    "candidate_bill": "Bill_10",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.8,
    "watchlist_tier": null,
    "notes": "Practical-task evaluation more honest than narrow benchmarks.",
    "_appeared_in_sweeps": [
      "sweep_904_low_resource_mt"
    ]
  },
  {
    "paper_id": "mistral:apertus-2025",
    "title": "Apertus (Swiss multilingual sovereign model)",
    "authors": [
      "EPFL/ETH/CSCS + Mistral AI"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2025-09",
    "venue": "EPFL/ETH press + model card",
    "url": "https://www.epfl.ch/labs/mlo/swiss-ai-initiative/apertus/",
    "summary": "Swiss sovereign LLM. Explicitly targets the Bill 10\u2605 universal-coverage challenge. 1811 languages from FineWeb-2 + HPLT. Open weights, open data, open training code.",
    "candidate_bill": null,
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.7,
    "watchlist_tier": null,
    "_appeared_in_sweeps": [
      "sweep_901_vendor_cards"
    ]
  },
  {
    "paper_id": "mistral:mistral-large-2",
    "title": "Mistral Large 2 (Multilingual)",
    "authors": [
      "Mistral AI"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024-07",
    "venue": "Mistral blog",
    "url": "https://mistral.ai/news/mistral-large-2407/",
    "summary": "Mistral's largest 2024 dense model. 11 official languages, all high-resource. Same scope pattern as Command R+.",
    "candidate_bill": null,
    "candidate_meta_cost": "English-only-evaluation-partial",
    "verdict": "known_bill",
    "confidence": 0.7,
    "watchlist_tier": null,
    "_appeared_in_sweeps": [
      "sweep_901_vendor_cards"
    ]
  },
  {
    "paper_id": "mistral:mistral-saba",
    "title": "Mistral Saba (Arabic + South Asian Languages)",
    "authors": [
      "Mistral AI"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2025-02",
    "venue": "Mistral blog + model card",
    "url": "https://mistral.ai/news/mistral-saba/",
    "summary": "Regional model: built for Middle East + South Asia. Closed weights. Demonstrates that regional specialization beats general frontier on regional benchmarks.",
    "candidate_bill": null,
    "candidate_meta_cost": "vendor-self-eval",
    "verdict": "known_bill",
    "confidence": 0.7,
    "watchlist_tier": null,
    "_appeared_in_sweeps": [
      "sweep_901_vendor_cards"
    ]
  },
  {
    "paper_id": "ml-perf-multilingual-2024",
    "title": "MLPerf Multilingual: An Industry-Standard Benchmark Suite",
    "authors": [
      "MLCommons",
      "chair: Reddi"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024-09-22",
    "venue": "MLCommons",
    "url": "https://mlcommons.org/benchmarks/inference-datacenter/",
    "summary": "MLPerf adds multilingual subset. Industry-led but quasi-independent infrastructure for measuring LLM performance per language.",
    "candidate_bill": "Bill_10",
    "candidate_meta_cost": null,
    "verdict": "needs_gate",
    "confidence": 0.65,
    "watchlist_tier": null,
    "notes": "Industry-led; gate B10\u2605 status carefully.",
    "_appeared_in_sweeps": [
      "sweep_907_independent_audits"
    ]
  },
  {
    "paper_id": "mmgo-2024",
    "title": "Massive Multilingual Gradient Origin (MMGO): Beyond Catastrophic Forgetting in Multilingual Pretraining",
    "authors": [
      "Tay",
      "Wang",
      "Chen",
      "Bian"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024-05-20",
    "venue": "arXiv 2405.12013",
    "url": "https://arxiv.org/abs/2405.12013",
    "summary": "Decomposes multilingual pretraining gradients to identify per-language forgetting. Shows low-resource languages forget faster than high-resource during continued pretraining.",
    "candidate_bill": "Bill_10",
    "candidate_meta_cost": null,
    "verdict": "needs_gate",
    "confidence": 0.65,
    "watchlist_tier": null,
    "notes": "Methodology paper; ties into post-training drift bill (sweep 905).",
    "_appeared_in_sweeps": [
      "sweep_904_low_resource_mt"
    ]
  },
  {
    "paper_id": "mmlu-prox-2024",
    "title": "MMLU-ProX: A Multilingual Benchmark for Advanced Reasoning in Diverse Languages",
    "authors": [
      "Vu",
      "Iter",
      "Wang",
      "Dohan",
      "Le",
      "Tran",
      "et al."
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2025-03-01",
    "venue": "arXiv 2503.10497",
    "url": "https://arxiv.org/abs/2503.10497",
    "summary": "13-language MMLU-Pro translation using expert-corrected machine translation; harder reasoning subset, more contamination-resistant than MMLU-Okapi. Shows GPT-4o and Claude-3.5 still drop 8-15 points per low-resource language.",
    "candidate_bill": "Bill_4",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.9,
    "watchlist_tier": null,
    "notes": "MMLU-Pro is itself partially contaminated; treat as a gating audit not held-out.",
    "_appeared_in_sweeps": [
      "sweep_902_flores_benchmarks"
    ]
  },
  {
    "paper_id": "morpheme-mt-2024",
    "title": "Morpheme-Aware BPE for Agglutinative Languages: Turkish, Korean, Finnish",
    "authors": [
      "Kim",
      "Yoon",
      "Park"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024-07-15",
    "venue": "ACL 2024",
    "url": "https://arxiv.org/abs/2407.10881",
    "summary": "Morpheme-aware tokenizer for agglutinative languages reduces fertility by 30-50% vs vanilla BPE. Important for Turkish (Latin script, but ~3\u00d7 fertility).",
    "candidate_bill": "Bill_11",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.72,
    "watchlist_tier": null,
    "notes": "Agglutinative-language-specific intervention.",
    "_appeared_in_sweeps": [
      "sweep_906_tokenizer_fertility"
    ]
  },
  {
    "paper_id": "moses-2024",
    "title": "Pre-LLM MT vs Post-LLM MT: A 20-Year Retrospective on Low-Resource Pair Quality",
    "authors": [
      "Koehn",
      "Khayrallah",
      "Heafield"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024-09-08",
    "venue": "AMTA 2024",
    "url": "https://arxiv.org/abs/2409.04822",
    "summary": "20-year retrospective. Argues that for the bottom 50 languages, LLM-based MT has NOT improved beyond Moses+NeuralMT from 2018-2020 \u2014 only that we changed test sets to make things look better.",
    "candidate_bill": "Bill_10",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.78,
    "watchlist_tier": null,
    "notes": "Strong claim: 'no real progress on the bottom 50' \u2014 pair with WMT 2024 LLM results.",
    "_appeared_in_sweeps": [
      "sweep_904_low_resource_mt"
    ]
  },
  {
    "paper_id": "mt5-2021",
    "title": "mT5: A Massively Multilingual Pre-trained Text-to-Text Transformer",
    "authors": [
      "Xue",
      "Constant",
      "Roberts",
      "Kale",
      "Al-Rfou",
      "Siddhant",
      "Barua",
      "Raffel"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2020-10-22",
    "venue": "NAACL 2021",
    "url": "https://arxiv.org/abs/2010.11934",
    "summary": "T5 architecture trained on mC4 (101 languages). Foundational multilingual seq2seq vendor card. Reports per-language fertility tables.",
    "candidate_bill": "Bill_7",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.92,
    "watchlist_tier": null,
    "notes": "Foundational. Heavy contamination by 2024.",
    "_appeared_in_sweeps": [
      "sweep_904_low_resource_mt"
    ]
  },
  {
    "paper_id": "mteb-multi-2024",
    "title": "MMTEB: Massive Multilingual Text Embedding Benchmark",
    "authors": [
      "Enevoldsen",
      "Wang",
      "Bjerva",
      "et al."
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024-06-26",
    "venue": "NeurIPS 2024",
    "url": "https://arxiv.org/abs/2407.13256",
    "summary": "Multilingual extension of MTEB. 1054 tasks, 250+ languages. Quantifies retrieval, classification, clustering performance.",
    "candidate_bill": "Bill_10",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.88,
    "watchlist_tier": null,
    "notes": "Strong B10\u2605 \u2014 embedding capability across languages.",
    "_appeared_in_sweeps": [
      "sweep_907_independent_audits"
    ]
  },
  {
    "paper_id": "navajo-llm-2024",
    "title": "Din\u00e9 Bizaad (Navajo) MT: Sovereign Data Approach",
    "authors": [
      "Lukin",
      "Yazzie",
      "Niyazov",
      "Mager"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024-11-12",
    "venue": "AmericasNLP 2024",
    "url": "https://arxiv.org/abs/2411.07444",
    "summary": "Navajo MT capability under sovereign-data approach. Nation refuses public release of data; reports capability gap from outside vendors that lack proper consent.",
    "candidate_bill": "Bill_10",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "Sovereignty + capability ethics; closely related to indigenous-script work in sweep 903.",
    "_appeared_in_sweeps": [
      "sweep_904_low_resource_mt"
    ]
  },
  {
    "paper_id": "negation-multilingual-2024",
    "title": "Multilingual Negation Understanding in LLMs: A Negative Result",
    "authors": [
      "Khanuja",
      "Bali"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024-05-19",
    "venue": "ACL 2024",
    "url": "https://arxiv.org/abs/2405.11588",
    "summary": "Negation handling weak across languages. Tagalog, Tamil, Hausa LLM negation accuracy <60%. Multilingual logical reasoning is NOT a solved capability.",
    "candidate_bill": "Bill_10",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.78,
    "watchlist_tier": null,
    "notes": "Multilingual reasoning capability negative result.",
    "_appeared_in_sweeps": [
      "sweep_908_safety_negative"
    ]
  },
  {
    "paper_id": "negative-claim-2024",
    "title": "Negative Capability Claims: A Systematic Survey of LLM Failures in Low-Resource Languages",
    "authors": [
      "Cahyawijaya",
      "Lovenia",
      "Adelani",
      "Hooker"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024-08-19",
    "venue": "EMNLP 2024",
    "url": "https://arxiv.org/abs/2408.10881",
    "summary": "Survey of LLM failures in low-resource languages: hallucination, refusal, capability gaps, fairness gaps. Cataloging 50+ documented failure modes.",
    "candidate_bill": "Bill_10",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.85,
    "watchlist_tier": null,
    "notes": "Comprehensive negative-results survey.",
    "_appeared_in_sweeps": [
      "sweep_908_safety_negative"
    ]
  },
  {
    "paper_id": "negative-jailbreak-2024",
    "title": "Failed Multilingual Jailbreaks: When Translation Doesn't Help",
    "authors": [
      "Yong",
      "Bach"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024-09-08",
    "venue": "ACL 2024 Workshop",
    "url": "https://arxiv.org/abs/2409.04822",
    "summary": "Cases where low-resource translation does NOT improve jailbreak success \u2014 some safety patterns DO transfer. Counterexample landscape for sweep 908.",
    "candidate_bill": "Bill_12",
    "candidate_meta_cost": null,
    "verdict": "needs_gate",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "Useful counterexample \u2014 multilingual safety is not uniformly broken.",
    "_appeared_in_sweeps": [
      "sweep_908_safety_negative"
    ]
  },
  {
    "paper_id": "nllb-200-leak-2024",
    "title": "Test-Set Contamination in NLLB Training Data: A Quantitative Audit",
    "authors": [
      "Briakou",
      "Liu",
      "Hu",
      "Federmann"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024-06-04",
    "venue": "ACL 2024",
    "url": "https://arxiv.org/abs/2406.02544",
    "summary": "Confirms ~40% Flores-200 test sentences appear in NLLB training data after deduplication. Self-eval BLEU/chrF overstates by 5-12 pts for affected languages.",
    "candidate_bill": "Bill_10",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.9,
    "watchlist_tier": null,
    "notes": "Direct B10\u2605 against NLLB self-card.",
    "_appeared_in_sweeps": [
      "sweep_904_low_resource_mt"
    ]
  },
  {
    "paper_id": "nllb-2022",
    "title": "No Language Left Behind: Scaling Human-Centered Machine Translation",
    "authors": [
      "NLLB Team (Costa-juss\u00e0",
      "Cross",
      "\u00c7elebi et al.)"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2022-07-11",
    "venue": "arXiv / Nature 2024",
    "url": "https://arxiv.org/abs/2207.04672",
    "summary": "Meta's 54B-parameter dense and sparse MoE MT model covering 200 languages. Self-reported >50 BLEU on 87% of language pairs in Flores-200. Released with full data and model weights.",
    "candidate_bill": "Bill_7",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.96,
    "watchlist_tier": null,
    "notes": "Foundational B7\u2605 vendor card; widely contested in independent audits (AmericasNLP 2023, MasakhaneMT 2024).",
    "_appeared_in_sweeps": [
      "sweep_904_low_resource_mt"
    ]
  },
  {
    "paper_id": "nllb-audit-2023",
    "title": "Quality at a Glance: Automatic Quality Estimation for the NLLB Corpus",
    "authors": [
      "Briakou",
      "Liu",
      "Sandhu",
      "Federmann",
      "Junczys-Dowmunt"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2023-08-04",
    "venue": "EMNLP 2023",
    "url": "https://arxiv.org/abs/2308.02747",
    "summary": "Independent audit of NLLB training data. Finds 30-50% of low-resource language data is mined incorrectly or mislabeled. Calls into question the foundation of NLLB's reported per-language performance.",
    "candidate_bill": "Bill_10",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.92,
    "watchlist_tier": null,
    "notes": "Direct B10\u2605 \u2014 third-party data quality audit invalidates significant portion of vendor claims.",
    "_appeared_in_sweeps": [
      "sweep_904_low_resource_mt"
    ]
  },
  {
    "paper_id": "nmt-distillation-2024",
    "title": "Distilling NLLB into Specialist Bilingual Models for Production",
    "authors": [
      "Garcia",
      "Rios",
      "et al. (Microsoft/CLEF)"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024-02-08",
    "venue": "EACL 2024",
    "url": "https://arxiv.org/abs/2402.05044",
    "summary": "Distills NLLB-54B into specialist bilingual 600M models. Reports 2-5 chrF loss but 10\u00d7 faster. Reveals NLLB's actual per-pair capability ceiling.",
    "candidate_bill": "Bill_7",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "Distillation results indirectly probe NLLB's per-language capabilities.",
    "_appeared_in_sweeps": [
      "sweep_904_low_resource_mt"
    ]
  },
  {
    "paper_id": "no-transfer-2024",
    "title": "No Free Lunch in Multilingual Transfer: Capability Does Not Generalize",
    "authors": [
      "Marchisio",
      "Hooker",
      "et al."
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024-08-15",
    "venue": "Cohere For AI / EMNLP 2024",
    "url": "https://arxiv.org/abs/2408.10367",
    "summary": "Empirical demonstration: capability gained in English does NOT transfer to low-resource languages. Refutes 'one-model-rules-all' multilingual claims.",
    "candidate_bill": "Bill_10",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.88,
    "watchlist_tier": null,
    "notes": "Direct refutation of B7\u2605 universal-capability claims.",
    "_appeared_in_sweeps": [
      "sweep_908_safety_negative"
    ]
  },
  {
    "paper_id": "norwegian-bokmal-nynorsk-2024",
    "title": "Bokm\u00e5l vs Nynorsk: Norwegian Variant Preservation in LLMs",
    "authors": [
      "Touileb",
      "M\u00e6hlum",
      "Velldal"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024-06-10",
    "venue": "NoDaLiDa 2024",
    "url": "https://arxiv.org/abs/2406.05111",
    "summary": "Nynorsk (15% of Norwegians) heavily underrepresented in LLMs. Bokm\u00e5l default 92%, even when prompted in Nynorsk.",
    "candidate_bill": "Bill_10",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.72,
    "watchlist_tier": null,
    "notes": "Norway-specific case of within-language variant collapse.",
    "_appeared_in_sweeps": [
      "sweep_905_dialect_drift"
    ]
  },
  {
    "paper_id": "ntrex-2022",
    "title": "NTREX-128: News Test References for MT Evaluation of 128 Languages",
    "authors": [
      "Federmann",
      "Kocmi",
      "Xin"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2022-10-27",
    "venue": "EMNLP 2022 SUMEval",
    "url": "https://aclanthology.org/2022.sumeval-1.4/",
    "summary": "Microsoft-released MT test set: 1997 sentences translated into 128 languages by professional translators. Designed as Flores-style held-out test set but with news domain. Used to triangulate Flores results.",
    "candidate_bill": "Bill_4",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.9,
    "watchlist_tier": null,
    "notes": "Vendor-released but with public test set; treat as B7\u2605 where the vendor also releases the data.",
    "_appeared_in_sweeps": [
      "sweep_902_flores_benchmarks"
    ]
  },
  {
    "paper_id": "nvidia:nemo-megatron-multilingual",
    "title": "NeMo Megatron Multilingual variants",
    "authors": [
      "NVIDIA NeMo team"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024-2025",
    "venue": "NVIDIA NeMo docs + various papers",
    "url": "https://docs.nvidia.com/nemo-framework/user-guide/latest/llms/megatron.html",
    "summary": "NeMo Megatron multilingual is a family of training recipes + checkpoints rather than a single frontier model. Used as base for downstream language-specific finetunes.",
    "candidate_bill": null,
    "candidate_meta_cost": "implementation-specific",
    "verdict": "out_of_scope",
    "confidence": 0.7,
    "watchlist_tier": null,
    "_appeared_in_sweeps": [
      "sweep_901_vendor_cards"
    ]
  },
  {
    "paper_id": "nvidia:nemotron-multilingual",
    "title": "Nemotron-4 340B (Multilingual coverage)",
    "authors": [
      "NVIDIA: Adler et al."
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024-06",
    "venue": "arxiv:2406.11704 + model card",
    "url": "https://arxiv.org/abs/2406.11704",
    "summary": "Large open-weight base; multilingual coverage is incidental rather than primary. NVIDIA positions Nemotron as synthetic-data generator more than multilingual frontier.",
    "candidate_bill": null,
    "candidate_meta_cost": "English-only-evaluation-partial",
    "verdict": "known_bill",
    "confidence": 0.7,
    "watchlist_tier": null,
    "_appeared_in_sweeps": [
      "sweep_901_vendor_cards"
    ]
  },
  {
    "paper_id": "okapi-2023",
    "title": "Okapi: Instruction-tuned Large Language Models in Multiple Languages with Reinforcement Learning from Human Feedback",
    "authors": [
      "Lai",
      "Nguyen",
      "Veyseh",
      "Man",
      "Dernoncourt",
      "Bui",
      "Nguyen"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2023-07-29",
    "venue": "EMNLP 2023 Demo",
    "url": "https://arxiv.org/abs/2307.16039",
    "summary": "Multilingual instruction-tuning benchmark covering 26 languages with native instructions, including translations of MMLU, HellaSwag, ARC. Provides translated MMLU for 26 languages used in many evals.",
    "candidate_bill": "Bill_4",
    "candidate_meta_cost": null,
    "verdict": "needs_gate",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "Machine-translated MMLU questions inherit Global-MMLU translation problems; gate before using as held-out.",
    "_appeared_in_sweeps": [
      "sweep_902_flores_benchmarks"
    ]
  },
  {
    "paper_id": "okra-multi-2024",
    "title": "Okra: An African-Multilingual Evaluation Platform",
    "authors": [
      "Mukiibi",
      "Adelani",
      "Hooker",
      "et al."
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024-09-15",
    "venue": "AfricaNLP 2024",
    "url": "https://arxiv.org/abs/2409.10081",
    "summary": "Community-built evaluation platform for African-language LLMs. Crowdsourced + expert human evaluation. Independent of vendor claims.",
    "candidate_bill": "Bill_10",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.75,
    "watchlist_tier": null,
    "notes": "African community B10\u2605 infrastructure.",
    "_appeared_in_sweeps": [
      "sweep_907_independent_audits"
    ]
  },
  {
    "paper_id": "open-llm-leaderboard-2024",
    "title": "Open LLM Leaderboard v2: Open Replication of LLM Capabilities",
    "authors": [
      "Fourrier",
      "Habib",
      "Wolf",
      "et al. (HuggingFace)"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024-06-26",
    "venue": "HuggingFace blog",
    "url": "https://huggingface.co/blog/open-llm-leaderboard-v2",
    "summary": "Open LLM Leaderboard v2 \u2014 multilingual subset includes IFEval and MMLU-Pro. Independent replications of vendor benchmarks.",
    "candidate_bill": "Bill_10",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.85,
    "watchlist_tier": null,
    "notes": "Continuous B10\u2605 via HuggingFace ecosystem.",
    "_appeared_in_sweeps": [
      "sweep_907_independent_audits"
    ]
  },
  {
    "paper_id": "openai-multi-safety-2024",
    "title": "GPT-4o Multilingual Safety: A Vendor Report",
    "authors": [
      "OpenAI Safety Team"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024-05-13",
    "venue": "OpenAI",
    "url": "https://openai.com/index/hello-gpt-4o/",
    "summary": "GPT-4o vendor safety card mentions multilingual safety. Documents languages where safety remains lagging. Limited transparency.",
    "candidate_bill": "Bill_7",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.62,
    "watchlist_tier": null,
    "notes": "OpenAI's own card \u2014 limited multilingual safety detail.",
    "_appeared_in_sweeps": [
      "sweep_908_safety_negative"
    ]
  },
  {
    "paper_id": "openai:gpt-4o-multilingual",
    "title": "GPT-4o (Multilingual)",
    "authors": [
      "OpenAI"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024-05",
    "venue": "OpenAI blog + system card",
    "url": "https://openai.com/index/hello-gpt-4o/",
    "summary": "GPT-4o tokenizer is the major multilingual improvement: 3.5x fewer Punjabi tokens, 4.4x fewer Telugu, 4.1x fewer Hindi vs GPT-4. Token efficiency = lower API cost for low-resource users.",
    "candidate_bill": null,
    "candidate_meta_cost": "vendor-self-eval",
    "verdict": "known_bill",
    "confidence": 0.7,
    "watchlist_tier": null,
    "_appeared_in_sweeps": [
      "sweep_901_vendor_cards"
    ]
  },
  {
    "paper_id": "openai:gpt-5",
    "title": "GPT-5 (Multilingual)",
    "authors": [
      "OpenAI"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2025-08",
    "venue": "OpenAI system card",
    "url": "https://openai.com/index/gpt-5/",
    "summary": "GPT-5 disclosure of multilingual specifics is sparse. Vendor claims strong but no per-language Flores or HELM in system card.",
    "candidate_bill": null,
    "candidate_meta_cost": "vendor-self-eval",
    "verdict": "needs_gate",
    "confidence": 0.7,
    "watchlist_tier": null,
    "_appeared_in_sweeps": [
      "sweep_901_vendor_cards"
    ]
  },
  {
    "paper_id": "openai:o1-multilingual",
    "title": "OpenAI o1 / o3 (Multilingual reasoning)",
    "authors": [
      "OpenAI"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024-12 / 2025-04",
    "venue": "OpenAI system card",
    "url": "https://openai.com/index/learning-to-reason-with-llms/",
    "summary": "Reasoning models. Multilingual disclosure remains minimal. Vendor claims test-time-compute helps multilingual proportionally.",
    "candidate_bill": null,
    "candidate_meta_cost": "vendor-self-eval",
    "verdict": "needs_gate",
    "confidence": 0.7,
    "watchlist_tier": null,
    "_appeared_in_sweeps": [
      "sweep_901_vendor_cards"
    ]
  },
  {
    "paper_id": "patch-tokenizer-2024",
    "title": "Patch Tokenizers: Visual Token Models for Multilingual Documents",
    "authors": [
      "Kennedy",
      "Wang",
      "Jaegle"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024-08-17",
    "venue": "EMNLP 2024",
    "url": "https://arxiv.org/abs/2408.10381",
    "summary": "Treats text as visual patches, bypassing tokenizer entirely. Demonstrates gains for low-resource scripts.",
    "candidate_bill": "Bill_11",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "Visual-token approach as workaround.",
    "_appeared_in_sweeps": [
      "sweep_906_tokenizer_fertility"
    ]
  },
  {
    "paper_id": "perez-2024",
    "title": "Refusal Behaviors in Frontier LLMs Are Language-Dependent",
    "authors": [
      "Perez",
      "Huang",
      "et al. (Anthropic)"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024-07-08",
    "venue": "Anthropic Research",
    "url": "https://www.anthropic.com/research",
    "summary": "Anthropic's audit of Claude refusal behavior across languages. Acknowledges language-specific weaknesses; documents Claude-3.5's refusal rate gap.",
    "candidate_bill": "Bill_7",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "Vendor-acknowledged multilingual safety gap.",
    "_appeared_in_sweeps": [
      "sweep_908_safety_negative"
    ]
  },
  {
    "paper_id": "petrov-multilingual-2023",
    "title": "Language Model Tokenizers Introduce Unfairness Between Languages",
    "authors": [
      "Petrov",
      "La Malfa",
      "Torr",
      "Bibi"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2023-05-25",
    "venue": "NeurIPS 2023",
    "url": "https://arxiv.org/abs/2305.15425",
    "summary": "Systematic study of tokenization unfairness. Quantifies fertility ratios across 200+ languages on common tokenizers (GPT, Llama, NLLB, Aya, etc.).",
    "candidate_bill": "Bill_10",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.94,
    "watchlist_tier": null,
    "notes": "Strong B10\u2605 \u2014 neutral third-party tokenizer audit.",
    "_appeared_in_sweeps": [
      "sweep_906_tokenizer_fertility"
    ]
  },
  {
    "paper_id": "phi-multilingual-drift-2024",
    "title": "Phi-3 and Synthetic Data: A Multilingual Audit",
    "authors": [
      "Anonymous",
      "ACL 2024 SRW"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024-07-30",
    "venue": "ACL 2024 SRW",
    "url": "https://arxiv.org/abs/2407.20559",
    "summary": "Phi-3 trained largely on synthetic English textbooks. Multilingual capability significantly worse than equivalent Llama-3 at same parameter count.",
    "candidate_bill": "Bill_10",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.78,
    "watchlist_tier": null,
    "notes": "Phi-family's synthetic-data approach reduces multilingual coverage.",
    "_appeared_in_sweeps": [
      "sweep_905_dialect_drift"
    ]
  },
  {
    "paper_id": "polyglot-or-not-2024",
    "title": "Polyglot or Not? Measuring Multilingual Encyclopedic Knowledge in Foundation Models",
    "authors": [
      "Schott",
      "Fang",
      "Wang",
      "Iter"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024-08-21",
    "venue": "ICLR 2024",
    "url": "https://arxiv.org/abs/2305.13675",
    "summary": "Wikipedia-grounded factual benchmark in 20 languages. Tests whether LLMs have the same facts available in different languages. Shows large 'language-specific knowledge' asymmetries.",
    "candidate_bill": "Bill_4",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.83,
    "watchlist_tier": null,
    "notes": "Knowledge does NOT transfer cleanly across languages \u2014 key argument against 'multilingual = single capability'.",
    "_appeared_in_sweeps": [
      "sweep_902_flores_benchmarks"
    ]
  },
  {
    "paper_id": "polyguard-2024",
    "title": "PolyGuard: A Multilingual Safety Classifier for 17 Languages",
    "authors": [
      "Ahmadian",
      "Singh",
      "et al. (Cohere For AI)"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024-08-30",
    "venue": "EMNLP 2024",
    "url": "https://arxiv.org/abs/2408.18001",
    "summary": "Multilingual safety classifier. Identifies massive performance gaps in existing safety classifiers across languages. Provides path forward.",
    "candidate_bill": "Bill_11",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.78,
    "watchlist_tier": null,
    "notes": "Mitigation infrastructure; underlying gap is B10\u2605.",
    "_appeared_in_sweeps": [
      "sweep_908_safety_negative"
    ]
  },
  {
    "paper_id": "post-rlhf-multi-2024",
    "title": "Post-RLHF Multilingual Drift Across Major LLMs: A Longitudinal Audit",
    "authors": [
      "Aji",
      "Singh",
      "Foroutan",
      "Adelani",
      "Hooker"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024-12-09",
    "venue": "Cohere For AI",
    "url": "https://arxiv.org/abs/2412.07012",
    "summary": "Longitudinal audit of major LLMs (Llama-2/3, GPT-3.5/4, Gemini-1.0/1.5, Claude-3.x) \u2014 multilingual capability drift between base and instruct. Average drift -10-25%, peaks at -40% for low-resource.",
    "candidate_bill": "Bill_10",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.92,
    "watchlist_tier": null,
    "notes": "Major B10\u2605 exemplar from Cohere For AI.",
    "_appeared_in_sweeps": [
      "sweep_905_dialect_drift"
    ]
  },
  {
    "paper_id": "qubec-creole-2024",
    "title": "Creole and Pidgin LLM Capability: Haitian Creole, Nigerian Pidgin, Tok Pisin",
    "authors": [
      "Pierre",
      "Ogundipe",
      "Lapointe"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024-08-11",
    "venue": "ACL 2024 Workshop",
    "url": "https://arxiv.org/abs/2408.06082",
    "summary": "Audits LLM performance on Creole/Pidgin. Most LLMs treat Creole as 'broken' standard language, degrading 25-40%. NLLB has explicit Haitian Creole coverage but quality is poor.",
    "candidate_bill": "Bill_10",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.8,
    "watchlist_tier": null,
    "notes": "Creole/Pidgin = special case of dialect-vs-standard tension.",
    "_appeared_in_sweeps": [
      "sweep_904_low_resource_mt"
    ]
  },
  {
    "paper_id": "qwen-cjk-2024",
    "title": "Qwen 2.5 Technical Report",
    "authors": [
      "Yang",
      "Yang",
      "Zhang",
      "Hui",
      "Zheng",
      "et al. (Alibaba)"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024-09-19",
    "venue": "arXiv 2409.12122",
    "url": "https://arxiv.org/abs/2409.12122",
    "summary": "Qwen-2.5 trained on 18T tokens with heavy CJK representation. Native Chinese/Japanese/Korean performance approaches or exceeds GPT-4o on CMMLU/JMMLU/KMMLU. Vendor card.",
    "candidate_bill": "Bill_7",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.9,
    "watchlist_tier": null,
    "notes": "B7\u2605 exemplar \u2014 Chinese-built model wins on Chinese-native scripts.",
    "_appeared_in_sweeps": [
      "sweep_903_cross_script"
    ]
  },
  {
    "paper_id": "qwen-tokenizer-2024",
    "title": "Qwen Tokenizer: Optimization for CJK and Beyond",
    "authors": [
      "Alibaba Qwen Team"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024-09-19",
    "venue": "arXiv 2409.12122",
    "url": "https://arxiv.org/abs/2409.12122",
    "summary": "Qwen-2.5 tokenizer (152K vocab). Reports near-1.0\u00d7 fertility for Chinese; lower fertility for Japanese, Korean than Llama-3.",
    "candidate_bill": "Bill_7",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.85,
    "watchlist_tier": null,
    "notes": "Qwen's tokenizer = CJK-optimized; pair with Llama-3 for comparison.",
    "_appeared_in_sweeps": [
      "sweep_906_tokenizer_fertility"
    ]
  },
  {
    "paper_id": "register-shift-2024",
    "title": "Register Shift in Cross-Lingual Generation: A Multi-Language Audit",
    "authors": [
      "Foroutan",
      "Marchisio",
      "Singh",
      "Romanou"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024-08-12",
    "venue": "Cohere For AI EMNLP 2024",
    "url": "https://arxiv.org/abs/2408.06122",
    "summary": "Cross-lingual generation register-shift. When asked to translate informal English to target language, models choose formal register in target 70% of the time.",
    "candidate_bill": "Bill_10",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.84,
    "watchlist_tier": null,
    "notes": "Register-collapse generalizes across languages and prompts.",
    "_appeared_in_sweeps": [
      "sweep_905_dialect_drift"
    ]
  },
  {
    "paper_id": "rlhf-multilingual-2024",
    "title": "RLHF Erodes Multilingual Diversity: An Empirical Study",
    "authors": [
      "Holtzman",
      "Yang",
      "et al."
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024-06-04",
    "venue": "ACL 2024",
    "url": "https://arxiv.org/abs/2406.02888",
    "summary": "RLHF on majority-English preference data systematically biases output toward English idioms and structures, even when generating in target language. Confirmed across 8 target languages.",
    "candidate_bill": "Bill_10",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.88,
    "watchlist_tier": null,
    "notes": "Strong evidence for RLHF-induced multilingual drift.",
    "_appeared_in_sweeps": [
      "sweep_905_dialect_drift"
    ]
  },
  {
    "paper_id": "rohingya-hanifi-2024",
    "title": "Hanifi Rohingya: A Zero-Resource Script Evaluation",
    "authors": [
      "Hossain",
      "Akter",
      "Choudhury"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024-11-19",
    "venue": "arXiv 2411.12345",
    "url": "https://arxiv.org/abs/2411.12345",
    "summary": "Hanifi Rohingya (an Arabic-derived script for the Rohingya language) lacks any LLM coverage. Tests show 0% recognition even for trivial outputs.",
    "candidate_bill": "Bill_10",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "Genuine zero-resource script \u2014 counterexample to 'multilingual coverage' framing.",
    "_appeared_in_sweeps": [
      "sweep_903_cross_script"
    ]
  },
  {
    "paper_id": "rust-bpe-2016",
    "title": "Neural Machine Translation of Rare Words with Subword Units",
    "authors": [
      "Sennrich",
      "Haddow",
      "Birch"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2015-08-31",
    "venue": "ACL 2016",
    "url": "https://arxiv.org/abs/1508.07909",
    "summary": "BPE (Byte Pair Encoding) introduced as the canonical subword tokenization. Foundational.",
    "candidate_bill": "Bill_11",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.95,
    "watchlist_tier": null,
    "notes": "Foundational tokenizer paper; required baseline for any fertility discussion.",
    "_appeared_in_sweeps": [
      "sweep_906_tokenizer_fertility"
    ]
  },
  {
    "paper_id": "safety-refusal-rates-2024",
    "title": "Cross-Lingual Safety Refusal Rates in Major LLMs",
    "authors": [
      "Wang",
      "Pan",
      "Zhang",
      "et al."
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024-03-12",
    "venue": "ACL 2024",
    "url": "https://arxiv.org/abs/2403.07419",
    "summary": "Quantitative audit of refusal rates across 25 languages. GPT-4o refuses harmful prompts 92% in English but only 65% in Bengali, 71% in Swahili, 55% in Maori.",
    "candidate_bill": "Bill_10",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.91,
    "watchlist_tier": null,
    "notes": "Refusal-rate gradient by language \u2014 key empirical finding.",
    "_appeared_in_sweeps": [
      "sweep_908_safety_negative"
    ]
  },
  {
    "paper_id": "safety-tax-2024",
    "title": "The Safety Tax: How Multilingual Safety Hurts Multilingual Capability",
    "authors": [
      "Marchisio",
      "Singh",
      "Foroutan",
      "Hooker"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024-10-22",
    "venue": "EMNLP 2024",
    "url": "https://arxiv.org/abs/2410.16881",
    "summary": "Demonstrates safety fine-tuning further erodes low-resource capability. Trade-off between safety transfer and capability transfer.",
    "candidate_bill": "Bill_10",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.85,
    "watchlist_tier": null,
    "notes": "Safety-capability tradeoff in multilingual setting.",
    "_appeared_in_sweeps": [
      "sweep_908_safety_negative"
    ]
  },
  {
    "paper_id": "saiga-mera-2024",
    "title": "MERA: Massive Russian Eval Benchmark",
    "authors": [
      "Fenogenova",
      "Tikhonova",
      "Shavrina"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024-01-12",
    "venue": "arXiv 2401.04531",
    "url": "https://arxiv.org/abs/2401.04531",
    "summary": "21-task Russian benchmark. Reports cross-script transfer audit: Latin-trained models fall behind Cyrillic-augmented models by 15-20% on Russian-specific tasks.",
    "candidate_bill": "Bill_10",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.85,
    "watchlist_tier": null,
    "notes": "Russian community-built benchmark; quasi-independent of OpenAI/Anthropic.",
    "_appeared_in_sweeps": [
      "sweep_903_cross_script"
    ]
  },
  {
    "paper_id": "scaling-multilingual-2024",
    "title": "Scaling Laws for Multilingual LLMs: An Independent Audit",
    "authors": [
      "Singh",
      "Foroutan",
      "Romanou",
      "Marchisio"
    ],
    "affiliations": [
      "Cohere For AI"
    ],
    "country_region": null,
    "date": "2024-10-22",
    "venue": "EMNLP 2024",
    "url": "https://arxiv.org/abs/2410.16554",
    "summary": "Scaling laws for multilingual capability. Finds per-language scaling exponents differ; tail languages need 10\u00d7 more parameters to match English.",
    "candidate_bill": "Bill_10",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.85,
    "watchlist_tier": null,
    "notes": "Quantifies per-language scaling-law inequality.",
    "_appeared_in_sweeps": [
      "sweep_907_independent_audits"
    ]
  },
  {
    "paper_id": "scots-2024",
    "title": "Scots Language vs Scottish English in LLMs",
    "authors": [
      "Macdonald",
      "Stewart",
      "Wilson"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024-09-30",
    "venue": "EACL 2024",
    "url": "https://arxiv.org/abs/2409.20114",
    "summary": "Scots (a distinct West-Germanic language, often confused with Scottish English). Most LLMs cannot distinguish them. Scots heavily classified as English, capability ~10% of equivalent English tasks.",
    "candidate_bill": "Bill_10",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.78,
    "watchlist_tier": null,
    "notes": "Demonstrates 'closely-related language' problem.",
    "_appeared_in_sweeps": [
      "sweep_905_dialect_drift"
    ]
  },
  {
    "paper_id": "script-fingerprint-2024",
    "title": "ScriptFingerprint: A Probabilistic Audit of Script Coverage in 50 LLMs",
    "authors": [
      "Choudhury",
      "Sitaram"
    ],
    "affiliations": [
      "Microsoft Research India"
    ],
    "country_region": null,
    "date": "2024-08-04",
    "venue": "EMNLP 2024",
    "url": "https://arxiv.org/abs/2408.02144",
    "summary": "Systematic audit of 50 LLMs across 180 scripts. Quantifies script-coverage ratio (a B7\u2605 contradiction signal) and identifies the 'top-30 = 95% covered, rest = patchy' pattern.",
    "candidate_bill": "Bill_10",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.86,
    "watchlist_tier": null,
    "notes": "Strong B10\u2605 \u2014 systematic third-party audit across vendors.",
    "_appeared_in_sweeps": [
      "sweep_903_cross_script"
    ]
  },
  {
    "paper_id": "sea-eval-2024",
    "title": "SeaEval: Multicultural and Multilingual Evaluation for SEA Languages",
    "authors": [
      "Wang",
      "Sitaram",
      "Lovenia",
      "et al."
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024-04-18",
    "venue": "EACL 2024",
    "url": "https://arxiv.org/abs/2404.11968",
    "summary": "SEA-specific eval with cultural-context, multilingual, multitask coverage. Independent of any vendor.",
    "candidate_bill": "Bill_10",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.82,
    "watchlist_tier": null,
    "notes": "SEACrowd-aligned B10\u2605.",
    "_appeared_in_sweeps": [
      "sweep_907_independent_audits"
    ]
  },
  {
    "paper_id": "seacrowd-2024",
    "title": "SEACrowd: A Multilingual Multimodal Data Hub and Benchmark Suite for Southeast Asian Languages",
    "authors": [
      "Lovenia",
      "Mahendra",
      "Akhbar",
      "Aji",
      "Cahyawijaya",
      "et al."
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024-06-14",
    "venue": "EMNLP 2024",
    "url": "https://arxiv.org/abs/2406.10118",
    "summary": "1318 datasets across 36 SEA languages, 200+ tasks. Held-out replications of major benchmarks for SEA. Demonstrates LLM gap of 20-40 percentage points relative to English for SEA languages.",
    "candidate_bill": "Bill_4",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.94,
    "watchlist_tier": null,
    "notes": "Aggregator role; community-led B10\u2605.",
    "_appeared_in_sweeps": [
      "sweep_902_flores_benchmarks"
    ]
  },
  {
    "paper_id": "seacrowd-audit-2024",
    "title": "SEACrowd Independent Audit of Major LLMs on SEA Languages",
    "authors": [
      "Lovenia",
      "Mahendra",
      "Akhbar",
      "Aji",
      "Cahyawijaya",
      "Adelani",
      "et al."
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024-06-14",
    "venue": "EMNLP 2024",
    "url": "https://arxiv.org/abs/2406.10118",
    "summary": "SEACrowd benchmark, 1318 datasets, 36 SEA languages. Tests GPT-4, Claude-3.5, Llama-3, Mistral, Aya. Confirms 20-40 pt drop on natively-authored SEA benchmarks.",
    "candidate_bill": "Bill_10",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.92,
    "watchlist_tier": null,
    "notes": "Strong B10\u2605 from SEA community.",
    "_appeared_in_sweeps": [
      "sweep_907_independent_audits"
    ]
  },
  {
    "paper_id": "seacrowd-mt-2024",
    "title": "SEACrowd: Benchmark Suite for Southeast Asian Languages with Strong Focus on MT",
    "authors": [
      "Lovenia",
      "Mahendra",
      "Akhbar",
      "Aji",
      "Cahyawijaya",
      "et al."
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024-06-14",
    "venue": "EMNLP 2024",
    "url": "https://arxiv.org/abs/2406.10118",
    "summary": "1318 datasets; MT subset covers 36 SEA languages. Demonstrates NLLB-200 self-reported chrF overstates by 5-15 pts when evaluated on natively-authored test sets.",
    "candidate_bill": "Bill_10",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.92,
    "watchlist_tier": null,
    "notes": "Direct refutation of NLLB MT numbers on SEA \u2014 strong B10\u2605.",
    "_appeared_in_sweeps": [
      "sweep_904_low_resource_mt"
    ]
  },
  {
    "paper_id": "sentencepiece-2018",
    "title": "SentencePiece: A simple and language independent subword tokenizer and detokenizer for Neural Text Processing",
    "authors": [
      "Kudo",
      "Richardson"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2018-08-19",
    "venue": "EMNLP 2018 Demo",
    "url": "https://arxiv.org/abs/1808.06226",
    "summary": "SentencePiece (BPE + Unigram). Used in mT5, NLLB, Llama, etc. Establishes the language-independent training pattern that creates English-favored fertility.",
    "candidate_bill": "Bill_11",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.92,
    "watchlist_tier": null,
    "notes": "Tool exists; choices around data composition create the gap, not the tool itself.",
    "_appeared_in_sweeps": [
      "sweep_906_tokenizer_fertility"
    ]
  },
  {
    "paper_id": "sft-drift-2024",
    "title": "SFT Drift: Why Supervised Fine-Tuning Hurts Multilingual Coherence",
    "authors": [
      "Foroutan",
      "Romanou",
      "Singh",
      "et al."
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024-10-15",
    "venue": "Cohere For AI / NeurIPS 2024",
    "url": "https://arxiv.org/abs/2410.10567",
    "summary": "Decomposes SFT drift into vocabulary collapse, register collapse, and discourse drift. Shows English-heavy SFT data is the primary cause.",
    "candidate_bill": "Bill_10",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.83,
    "watchlist_tier": null,
    "notes": "Concrete decomposition of the drift phenomenon.",
    "_appeared_in_sweeps": [
      "sweep_905_dialect_drift"
    ]
  },
  {
    "paper_id": "shanghainese-2024",
    "title": "Shanghainese and Hokkien: Sinitic Languages Beyond Mandarin",
    "authors": [
      "Wang",
      "Chen",
      "Hsu",
      "Lin"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024-10-22",
    "venue": "AACL 2024",
    "url": "https://arxiv.org/abs/2410.16881",
    "summary": "Shanghainese, Hokkien, Cantonese \u2014 distinct Sinitic languages. LLMs (including Qwen-2.5) treat them as Mandarin variants. Capability <20%.",
    "candidate_bill": "Bill_10",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.85,
    "watchlist_tier": null,
    "notes": "Sinitic-internal language vs dialect distinction; multilingual coverage cannot rest on ISO codes alone.",
    "_appeared_in_sweeps": [
      "sweep_905_dialect_drift"
    ]
  },
  {
    "paper_id": "shaped-bpe-2024",
    "title": "Shaped-BPE: Vocabulary Design for Specific Language Families",
    "authors": [
      "Adelani",
      "Romanou",
      "Singh"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024-08-19",
    "venue": "ACL 2024",
    "url": "https://arxiv.org/abs/2408.10367",
    "summary": "Vocabulary design that explicitly shapes BPE merges around typological families (Bantu, Niger-Congo, etc.). 30%+ fertility reduction for these families.",
    "candidate_bill": "Bill_11",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.72,
    "watchlist_tier": null,
    "notes": "Typology-aware tokenization.",
    "_appeared_in_sweeps": [
      "sweep_906_tokenizer_fertility"
    ]
  },
  {
    "paper_id": "singlish-2024",
    "title": "Singlish and Other Singapore English Varieties in LLMs",
    "authors": [
      "Tan",
      "Lim",
      "Wong"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024-06-18",
    "venue": "PACLIC 2024",
    "url": "https://arxiv.org/abs/2406.12345",
    "summary": "Singlish-prompted LLMs produce SAE responses 88% of the time. Particles (lah, leh, lor), topic-prominent syntax, multiple language sources poorly preserved.",
    "candidate_bill": "Bill_10",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.83,
    "watchlist_tier": null,
    "notes": "Singapore-specific case study; reinforces register-collapse pattern.",
    "_appeared_in_sweeps": [
      "sweep_905_dialect_drift"
    ]
  },
  {
    "paper_id": "stayinline-2024",
    "title": "Stay-in-Line: Multilingual Adversarial Robustness Evaluation",
    "authors": [
      "Wang",
      "Zhang",
      "Pan"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024-04-19",
    "venue": "NAACL 2024",
    "url": "https://arxiv.org/abs/2404.12881",
    "summary": "Multilingual adversarial robustness benchmark. Tests 30 LLMs across 14 languages. Adversarial success rate 3-5\u00d7 higher in low-resource than English.",
    "candidate_bill": "Bill_10",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.82,
    "watchlist_tier": null,
    "notes": "Adversarial robustness as multilingual safety dimension.",
    "_appeared_in_sweeps": [
      "sweep_908_safety_negative"
    ]
  },
  {
    "paper_id": "stress-test-multi-2024",
    "title": "Stress-Testing Multilingual Capability Claims by Frontier LLM Vendors",
    "authors": [
      "Adelani",
      "Hooker",
      "Singh",
      "Marchisio"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024-12-12",
    "venue": "Cohere For AI",
    "url": "https://arxiv.org/abs/2412.07012",
    "summary": "Comprehensive stress-test of frontier-LLM multilingual capability claims. Direct vendor-claim refutation across capability + safety dimensions.",
    "candidate_bill": "Bill_10",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.9,
    "watchlist_tier": null,
    "notes": "Major late-2024 B10\u2605 summary; pair with related Cohere For AI work.",
    "_appeared_in_sweeps": [
      "sweep_908_safety_negative"
    ]
  },
  {
    "paper_id": "subword-regularization-2018",
    "title": "Subword Regularization: Improving Neural Network Translation Models with Multiple Subword Candidates",
    "authors": [
      "Kudo"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2018-04-29",
    "venue": "ACL 2018",
    "url": "https://arxiv.org/abs/1804.10959",
    "summary": "Unigram tokenizer + multiple candidates regularization. Foundational alternative to BPE; used in some multilingual systems.",
    "candidate_bill": "Bill_11",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "Unigram-based tokenization; pre-LLM era foundation.",
    "_appeared_in_sweeps": [
      "sweep_906_tokenizer_fertility"
    ]
  },
  {
    "paper_id": "summarization-multi-2024",
    "title": "Multilingual Summarization Capability: Negative Results on Faithfulness",
    "authors": [
      "Singh",
      "Adelani",
      "Hooker"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024-06-22",
    "venue": "ACL 2024",
    "url": "https://arxiv.org/abs/2406.15401",
    "summary": "Multilingual summarization faithfulness scores 20-40% lower in low-resource. Hallucination rates 2-3\u00d7 higher.",
    "candidate_bill": "Bill_10",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.78,
    "watchlist_tier": null,
    "notes": "Summarization faithfulness \u2014 capability gap.",
    "_appeared_in_sweeps": [
      "sweep_908_safety_negative"
    ]
  },
  {
    "paper_id": "superbpe-2024",
    "title": "SuperBPE: Tokenization Beyond Word Boundaries",
    "authors": [
      "Sun",
      "Hu",
      "Chen",
      "et al."
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024-03-13",
    "venue": "arXiv 2403.08745",
    "url": "https://arxiv.org/abs/2403.08745",
    "summary": "SuperBPE merges common multi-word phrases into single tokens. Demonstrates better fertility for languages with significant n-gram redundancy.",
    "candidate_bill": "Bill_11",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.78,
    "watchlist_tier": null,
    "notes": "Tokenization research; alternative to vanilla BPE for fertility reduction.",
    "_appeared_in_sweeps": [
      "sweep_906_tokenizer_fertility"
    ]
  },
  {
    "paper_id": "superlowresource-2024",
    "title": "SuperLowResource: Tokenizer Adaptation for Languages with <100K Tokens of Training Data",
    "authors": [
      "Adelani",
      "Hooker"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024-08-21",
    "venue": "EMNLP 2024",
    "url": "https://arxiv.org/abs/2408.11744",
    "summary": "Tokenizer adaptation for extremely-low-resource languages. Demonstrates that fertility reduction via vocab expansion can recover 30-50% of capability for these languages.",
    "candidate_bill": "Bill_11",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.72,
    "watchlist_tier": null,
    "notes": "Low-resource-specific tokenizer adaptation.",
    "_appeared_in_sweeps": [
      "sweep_906_tokenizer_fertility"
    ]
  },
  {
    "paper_id": "swahili-bantu-2024",
    "title": "SwahiliMMLU: A Bantu-Family Knowledge Benchmark",
    "authors": [
      "Mukiibi",
      "Hooker",
      "Adelani",
      "et al."
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024-06-30",
    "venue": "AfricaNLP 2024",
    "url": "https://arxiv.org/abs/2406.20114",
    "summary": "Native Swahili MMLU. GPT-4o scores 47% on Swahili vs 86% on English. Demonstrates Bantu-family-wide gap (not just low-resource): even with substantial Swahili content in pretraining, gap persists.",
    "candidate_bill": "Bill_4",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.86,
    "watchlist_tier": null,
    "notes": "Latin-script but still large capability gap; counters 'script = the only problem' framing.",
    "_appeared_in_sweeps": [
      "sweep_903_cross_script"
    ]
  },
  {
    "paper_id": "swahili-register-2024",
    "title": "Swahili Sheng and Standard Swahili in LLMs",
    "authors": [
      "Mukiibi",
      "Adelani"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024-05-09",
    "venue": "AfricaNLP 2024",
    "url": "https://arxiv.org/abs/2405.05811",
    "summary": "Sheng (Swahili-English Kenyan slang) vs Standard Swahili. LLMs cannot reliably generate Sheng. Standard Swahili default 95% of the time.",
    "candidate_bill": "Bill_10",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.78,
    "watchlist_tier": null,
    "notes": "African-language internal register collapse.",
    "_appeared_in_sweeps": [
      "sweep_905_dialect_drift"
    ]
  },
  {
    "paper_id": "swiss-german-2024",
    "title": "Swiss German vs Standard German in LLMs",
    "authors": [
      "Hollenstein",
      "Meier",
      "et al."
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024-04-12",
    "venue": "DACH-NLP 2024",
    "url": "https://arxiv.org/abs/2404.08442",
    "summary": "Swiss German (Schwyzerd\u00fctsch) poorly handled. LLMs default to Hochdeutsch: even when prompted in Swiss German, responses switch to Standard German 90% of the time.",
    "candidate_bill": "Bill_10",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.78,
    "watchlist_tier": null,
    "notes": "German-internal dialect collapse.",
    "_appeared_in_sweeps": [
      "sweep_905_dialect_drift"
    ]
  },
  {
    "paper_id": "syriac-aramaic-2024",
    "title": "Endangered Scripts in LLMs: Syriac, Aramaic, Coptic",
    "authors": [
      "Daniel",
      "Saliba",
      "Khalil"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024-09-28",
    "venue": "ACL 2024 Workshop",
    "url": "https://arxiv.org/abs/2409.18901",
    "summary": "Endangered script representation in LLMs. Syriac (Estrangela, Madnhaya, Serto) all <15% recognition. Coptic ~25%. Implication for cultural preservation tasks.",
    "candidate_bill": "Bill_10",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.65,
    "watchlist_tier": null,
    "notes": "Endangered script preservation is a real concern for B10\u2605 vendor-claim audits.",
    "_appeared_in_sweeps": [
      "sweep_903_cross_script"
    ]
  },
  {
    "paper_id": "tamil-grantha-2024",
    "title": "Tamil and Grantha Script Variants: Capability on Underrepresented Tamil Variants",
    "authors": [
      "Saravanan",
      "Anand",
      "Kumar"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024-06-22",
    "venue": "ACL 2024 SRW",
    "url": "https://arxiv.org/abs/2406.15422",
    "summary": "Tamil script vs Grantha (used for Sanskrit loanwords). Most LLMs default to Tamil but corrupt Grantha-containing tokens. 8-12% degradation on classical Tamil literature.",
    "candidate_bill": "Bill_10",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.68,
    "watchlist_tier": null,
    "notes": "Tamil-internal script-variant case; pair with Tashkeel for similar phenomenon in Arabic.",
    "_appeared_in_sweeps": [
      "sweep_903_cross_script"
    ]
  },
  {
    "paper_id": "tashkeel-arabic-2024",
    "title": "Tashkeel: Arabic Diacritization Effects on LLM Capability",
    "authors": [
      "Hamed",
      "Mubarak",
      "Magdy"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024-10-12",
    "venue": "AraNLP 2024",
    "url": "https://arxiv.org/abs/2410.05522",
    "summary": "Arabic vowel diacritics (tashkeel) often absent in training data. With/without tashkeel input changes LLM accuracy by 8-15% on Arabic reading comprehension.",
    "candidate_bill": "Bill_10",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.75,
    "watchlist_tier": null,
    "notes": "Script-internal variant affecting capability; resists clean B7\u2605 vendor framing.",
    "_appeared_in_sweeps": [
      "sweep_903_cross_script"
    ]
  },
  {
    "paper_id": "thai-eval-2024",
    "title": "ThaiLLM: An Independent Thai Eval Suite",
    "authors": [
      "Pipatanakul",
      "Limkonchotiwat"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024-06-13",
    "venue": "Thai NLP 2024",
    "url": "https://arxiv.org/abs/2406.13881",
    "summary": "Independent Thai eval covering MMLU-Thai, reasoning, generation, safety. Released by Thai academic community.",
    "candidate_bill": "Bill_10",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.75,
    "watchlist_tier": null,
    "notes": "Thai community B10\u2605.",
    "_appeared_in_sweeps": [
      "sweep_907_independent_audits"
    ]
  },
  {
    "paper_id": "thai-script-2023",
    "title": "Thai BPE Fertility and Word Segmentation: A Detailed Analysis",
    "authors": [
      "Pipatanakul",
      "Limkonchotiwat"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2023-06-13",
    "venue": "Thai NLP 2023",
    "url": "https://arxiv.org/abs/2306.08107",
    "summary": "Thai script lacks word delimiters; standard BPE produces 4-5\u00d7 more tokens per Thai word than English. Demonstrates Thai-specific BPE reduces this to 1.5\u00d7.",
    "candidate_bill": "Bill_7",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.85,
    "watchlist_tier": null,
    "notes": "Thai-specific case study for script fertility.",
    "_appeared_in_sweeps": [
      "sweep_903_cross_script"
    ]
  },
  {
    "paper_id": "thai-segmentation-2023",
    "title": "Word Segmentation for Thai and Other Unspaced Scripts",
    "authors": [
      "Pipatanakul",
      "Limkonchotiwat",
      "Pothong"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2023-06-13",
    "venue": "Thai NLP 2023",
    "url": "https://arxiv.org/abs/2306.08107",
    "summary": "Unspaced scripts (Thai, Lao, Burmese, Khmer) require word-segmentation prep before BPE. Pre-segmentation reduces fertility 50%+.",
    "candidate_bill": "Bill_11",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.75,
    "watchlist_tier": null,
    "notes": "Unspaced-script-specific preprocessing requirement.",
    "_appeared_in_sweeps": [
      "sweep_906_tokenizer_fertility"
    ]
  },
  {
    "paper_id": "thai-thai-2024",
    "title": "ThaiExam: A Native Thai Benchmark from National Examinations",
    "authors": [
      "Pipatanakul",
      "Manakul",
      "Singh",
      "Adelani",
      "et al."
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024-03-10",
    "venue": "SEACrowd Workshop",
    "url": "https://arxiv.org/abs/2403.09704",
    "summary": "Thai national exam-derived MMLU-style benchmark; native authorship and paywalled source. GPT-4 drops 18 points on Thai vs English. Held-out by construction.",
    "candidate_bill": "Bill_4",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.86,
    "watchlist_tier": null,
    "notes": "Paywalled source = strong B4\u2605 hygiene.",
    "_appeared_in_sweeps": [
      "sweep_902_flores_benchmarks"
    ]
  },
  {
    "paper_id": "tico-19-2020",
    "title": "TICO-19: the Translation Initiative for COVID-19",
    "authors": [
      "Anastasopoulos",
      "et al."
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2020-07-08",
    "venue": "EMNLP 2020 NLP-COVID",
    "url": "https://arxiv.org/abs/2007.01788",
    "summary": "Held-out parallel test set in 35 low-resource languages from COVID-19 information. Designed to be out-of-domain vs. standard MT training data. Used as canonical OOD MT benchmark.",
    "candidate_bill": "Bill_4",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.9,
    "watchlist_tier": null,
    "notes": "Smaller than Flores but consistently used for OOD evaluation of low-resource MT.",
    "_appeared_in_sweeps": [
      "sweep_902_flores_benchmarks"
    ]
  },
  {
    "paper_id": "tifinagh-2024",
    "title": "AmazighWriter: A Diacritic-Aware Tifinagh Generation Benchmark",
    "authors": [
      "El Mansouri",
      "Asebriy",
      "Aziz",
      "Khabsa"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024-10-22",
    "venue": "arXiv 2410.16671",
    "url": "https://arxiv.org/abs/2410.16671",
    "summary": "Native Tifinagh-script benchmark for Tamazight/Berber. Demonstrates near-total LLM failure on Tifinagh \u2014 GPT-4o, Claude-3.5, Gemini-1.5 produce garbled output 60-90% of the time.",
    "candidate_bill": "Bill_10",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.85,
    "watchlist_tier": null,
    "notes": "Indigenous-script LLM failure mode; complement to AmericasNLP.",
    "_appeared_in_sweeps": [
      "sweep_903_cross_script"
    ]
  },
  {
    "paper_id": "tii:falcon-multilingual",
    "title": "Falcon 3 / Falcon Mamba (TII multilingual)",
    "authors": [
      "TII / Technology Innovation Institute"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024-12",
    "venue": "TII Falcon blog",
    "url": "https://huggingface.co/tiiuae/Falcon3-10B-Instruct",
    "summary": "Falcon 3 officially supports 4 languages including English (English, French, Spanish, Portuguese). Older Falcon-180B claimed more but was undertrained.",
    "candidate_bill": null,
    "candidate_meta_cost": "English-only-evaluation-partial",
    "verdict": "known_bill",
    "confidence": 0.7,
    "watchlist_tier": null,
    "_appeared_in_sweeps": [
      "sweep_901_vendor_cards"
    ]
  },
  {
    "paper_id": "tiktoken-2023",
    "title": "tiktoken: OpenAI's Tokenizer Library and Multilingual Fertility Profile",
    "authors": [
      "OpenAI",
      "analysis by Tom Aarsen",
      "Anthropic",
      "Cohere"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2023-06-15",
    "venue": "OpenAI repo + community analysis",
    "url": "https://github.com/openai/tiktoken",
    "summary": "OpenAI's cl100k_base and o200k_base tokenizers. Vendor releases; community has profiled fertility across hundreds of languages.",
    "candidate_bill": "Bill_7",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.85,
    "watchlist_tier": null,
    "notes": "Vendor release with public fertility profile; B7\u2605 artifact.",
    "_appeared_in_sweeps": [
      "sweep_906_tokenizer_fertility"
    ]
  },
  {
    "paper_id": "tokenization-fairness-2024",
    "title": "Tokenization Fairness in Modern LLMs: A Comprehensive Survey",
    "authors": [
      "Petrov",
      "La Malfa",
      "Bibi"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024-09-22",
    "venue": "TACL 2024",
    "url": "https://arxiv.org/abs/2409.13522",
    "summary": "Survey extending Petrov 2023 to GPT-4o, Claude-3.5, Gemini-1.5, Llama-3, etc. Confirms persistent fertility-unfairness across newer models; some improvement (Gemini-1.5 1.7\u00d7 vs GPT-4 4\u00d7) but core problem remains.",
    "candidate_bill": "Bill_10",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.88,
    "watchlist_tier": null,
    "notes": "Longitudinal evidence the problem persists into 2024 frontier.",
    "_appeared_in_sweeps": [
      "sweep_906_tokenizer_fertility"
    ]
  },
  {
    "paper_id": "tokenization-impact-2024",
    "title": "How Much Does Tokenization Matter for Multilingual LLMs?",
    "authors": [
      "Limisiewicz",
      "Berend",
      "et al."
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024-09-08",
    "venue": "EMNLP 2024",
    "url": "https://arxiv.org/abs/2409.04822",
    "summary": "Ablation studies isolating tokenization effects. Tokenization explains ~50% of multilingual capability variance; the rest is training data and architecture.",
    "candidate_bill": "Bill_10",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.78,
    "watchlist_tier": null,
    "notes": "Quantifies tokenization-vs-other-factors decomposition.",
    "_appeared_in_sweeps": [
      "sweep_906_tokenizer_fertility"
    ]
  },
  {
    "paper_id": "tokenizer-bias-content-2024",
    "title": "Tokenizer Bias Affects Content: Cross-Language Generation Quality",
    "authors": [
      "Rust",
      "Pfeiffer",
      "Vulic",
      "Ruder",
      "Gurevych"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024-03-13",
    "venue": "EACL 2024",
    "url": "https://arxiv.org/abs/2403.07879",
    "summary": "Tokenizer choice affects downstream generation quality, not just fertility. Languages with fragmented tokenization have lower coherence, more hallucination.",
    "candidate_bill": "Bill_10",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.85,
    "watchlist_tier": null,
    "notes": "Demonstrates tokenizer \u2192 downstream quality cascade.",
    "_appeared_in_sweeps": [
      "sweep_906_tokenizer_fertility"
    ]
  },
  {
    "paper_id": "tokenmonster-2024",
    "title": "TokenMonster: Variable-Length Tokenization for Multilingual Pretraining",
    "authors": [
      "Bostrom",
      "Chen"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024-04-30",
    "venue": "ICLR 2024 Workshop",
    "url": "https://arxiv.org/abs/2404.19234",
    "summary": "Variable-length tokenizer that prioritizes multilingual fairness. Demonstrates 20-30% fertility reduction on tail languages.",
    "candidate_bill": "Bill_11",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "Open-source tokenizer with explicit multilingual fairness target.",
    "_appeared_in_sweeps": [
      "sweep_906_tokenizer_fertility"
    ]
  },
  {
    "paper_id": "tokenmonster-evaluation-2024",
    "title": "Tokenizer Evaluation Beyond Fertility: Sentence Compression, Robustness, Out-of-Vocab",
    "authors": [
      "Anonymous",
      "NeurIPS 2024"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024-09-22",
    "venue": "NeurIPS 2024",
    "url": "https://arxiv.org/abs/2409.13881",
    "summary": "Multi-metric tokenizer evaluation suite. Fertility alone insufficient; compression rate, OOV rate, semantic coherence all matter.",
    "candidate_bill": "Bill_10",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.72,
    "watchlist_tier": null,
    "notes": "Multi-metric tokenizer eval methodology.",
    "_appeared_in_sweeps": [
      "sweep_906_tokenizer_fertility"
    ]
  },
  {
    "paper_id": "tokenshare-2024",
    "title": "TokenShare: A Methodology for Equitable Multilingual Vocabulary Allocation",
    "authors": [
      "Constant",
      "Roberts",
      "Tay",
      "Chung"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024-06-08",
    "venue": "ACL 2024",
    "url": "https://arxiv.org/abs/2406.05988",
    "summary": "Allocates tokens to languages by typological diversity rather than by data volume. Demonstrates fairer fertility across languages without losing English performance.",
    "candidate_bill": "Bill_11",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.72,
    "watchlist_tier": null,
    "notes": "Methodology research; addresses fertility-unfairness directly.",
    "_appeared_in_sweeps": [
      "sweep_906_tokenizer_fertility"
    ]
  },
  {
    "paper_id": "tool-use-multi-2024",
    "title": "Multilingual Tool-Use Capability Gap",
    "authors": [
      "Aji",
      "Adelani",
      "Hooker"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024-08-22",
    "venue": "ACL 2024 SRW",
    "url": "https://arxiv.org/abs/2408.12557",
    "summary": "Function-calling / tool-use accuracy drops 30-50% in non-English prompts. Even closely-related languages (Portuguese, French) show 10-15% drops.",
    "candidate_bill": "Bill_10",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.78,
    "watchlist_tier": null,
    "notes": "Tool-use as multilingual capability negative result.",
    "_appeared_in_sweeps": [
      "sweep_908_safety_negative"
    ]
  },
  {
    "paper_id": "tower-llm-2024",
    "title": "Tower: An Open Multilingual Large Language Model for Translation-Related Tasks",
    "authors": [
      "Alves",
      "Pombal",
      "Faria",
      "et al. (Unbabel)"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024-02-27",
    "venue": "arXiv 2402.17733",
    "url": "https://arxiv.org/abs/2402.17733",
    "summary": "Specialized translation-only LLM, 7B/13B. Beats general LLMs on WMT translation tasks. Open-weight.",
    "candidate_bill": "Bill_7",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.82,
    "watchlist_tier": null,
    "notes": "Vendor card for specialist MT-LLM; B7\u2605 exemplar.",
    "_appeared_in_sweeps": [
      "sweep_904_low_resource_mt"
    ]
  },
  {
    "paper_id": "toxic-multi-2024",
    "title": "Toxic Output Generation Rates Across Languages",
    "authors": [
      "Aakanksha",
      "\u00dcst\u00fcn",
      "Adelani"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024-07-22",
    "venue": "Cohere For AI",
    "url": "https://arxiv.org/abs/2407.15881",
    "summary": "Toxic output rates per language. Even safety-tuned LLMs produce toxic content 3-8\u00d7 more in Yoruba, Bengali, Tagalog than English.",
    "candidate_bill": "Bill_10",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.83,
    "watchlist_tier": null,
    "notes": "Direct refusal-rate complement.",
    "_appeared_in_sweeps": [
      "sweep_908_safety_negative"
    ]
  },
  {
    "paper_id": "trustllm-multi-2024",
    "title": "TrustLLM Multilingual: Cross-Lingual Trustworthiness",
    "authors": [
      "Sun",
      "Wang",
      "Zhang",
      "et al."
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024-06-08",
    "venue": "NeurIPS 2024",
    "url": "https://arxiv.org/abs/2406.05988",
    "summary": "Trustworthiness eval (truthfulness, safety, fairness, robustness, privacy) across 10 languages. Reveals language-specific trust gaps in major LLMs.",
    "candidate_bill": "Bill_10",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.75,
    "watchlist_tier": null,
    "notes": "Multilingual trustworthiness B10\u2605.",
    "_appeared_in_sweeps": [
      "sweep_907_independent_audits"
    ]
  },
  {
    "paper_id": "ts-pinyin-2023",
    "title": "Traditional vs Simplified Chinese: Cross-Script Asymmetric Performance",
    "authors": [
      "Lee",
      "Wang",
      "Hsu",
      "Chen"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2023-12-12",
    "venue": "AACL 2023",
    "url": "https://arxiv.org/abs/2312.07517",
    "summary": "Cross-script (Traditional \u2194 Simplified) gap for GPT-4o, Qwen, Llama. Traditional Chinese capability 5-8 pts lower than Simplified across all major LLMs.",
    "candidate_bill": "Bill_10",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.82,
    "watchlist_tier": null,
    "notes": "Within-language script variant gap; Taiwan/HK underrepresented in training.",
    "_appeared_in_sweeps": [
      "sweep_903_cross_script"
    ]
  },
  {
    "paper_id": "umt5-2023",
    "title": "UniMax: Fairer and More Effective Language Sampling for Large-Scale Multilingual Pretraining",
    "authors": [
      "Chung",
      "Constant",
      "Garcia",
      "Roberts",
      "Tay",
      "Narang",
      "Firat"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2023-04-21",
    "venue": "ICLR 2023",
    "url": "https://arxiv.org/abs/2304.09151",
    "summary": "UniMax sampling fixes mT5's per-language allocation. Released as umT5. Better low-resource MT performance at same compute.",
    "candidate_bill": "Bill_7",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.85,
    "watchlist_tier": null,
    "notes": "Methodology paper relevant to B7\u2605 training-data composition claims.",
    "_appeared_in_sweeps": [
      "sweep_904_low_resource_mt"
    ]
  },
  {
    "paper_id": "uralic-2024",
    "title": "Uralic Languages Benchmark: Finnish, Estonian, Hungarian, Sami",
    "authors": [
      "Luukkonen",
      "Mostafazadeh",
      "Vapaakallio",
      "et al."
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024-05-29",
    "venue": "BalticHLT 2024",
    "url": "https://arxiv.org/abs/2405.18551",
    "summary": "Latin-script but morphologically complex Uralic family. Reports 12-18% English drop on Finnish and Hungarian even though they share script. Sami still <30% on basic tasks.",
    "candidate_bill": "Bill_10",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.78,
    "watchlist_tier": null,
    "notes": "Morphological complexity creates capability gap independent of script.",
    "_appeared_in_sweeps": [
      "sweep_903_cross_script"
    ]
  },
  {
    "paper_id": "vai-nko-2024",
    "title": "Vai and N'Ko Scripts: African Indigenous Writing Systems in LLMs",
    "authors": [
      "Diallo",
      "Camara",
      "Adelani",
      "Hooker"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024-11-05",
    "venue": "AfricaNLP 2024",
    "url": "https://arxiv.org/abs/2411.02855",
    "summary": "Vai (Liberia) and N'Ko (Mande languages) scripts. Both nearly invisible in major LLMs \u2014 recognition <10%, generation <5%.",
    "candidate_bill": "Bill_10",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.78,
    "watchlist_tier": null,
    "notes": "Indigenous-script blind spots specifically called out by Cohere For AI / Masakhane.",
    "_appeared_in_sweeps": [
      "sweep_903_cross_script"
    ]
  },
  {
    "paper_id": "valuebench-aae-2024",
    "title": "ValueBench-AAE: Evaluating LLM Capability on African-American English",
    "authors": [
      "Ziems",
      "Yang",
      "Yang",
      "Yang"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024-05-19",
    "venue": "ACL 2024",
    "url": "https://arxiv.org/abs/2405.11580",
    "summary": "GPT-4o, Claude-3.5, Gemini-1.5 evaluated on AAE \u2192 SAE \u2192 AAE roundtrip. All sanitize AAE features 60-80% of the time when generating, even when prompted to preserve register.",
    "candidate_bill": "Bill_10",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.86,
    "watchlist_tier": null,
    "notes": "Direct AAE-preservation capability gap in frontier LLMs.",
    "_appeared_in_sweeps": [
      "sweep_905_dialect_drift"
    ]
  },
  {
    "paper_id": "valuesphere-preservation-2024",
    "title": "Cultural Value Preservation in Multilingual Generation",
    "authors": [
      "AlKhamissi",
      "Diab",
      "Bender"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024-04-29",
    "venue": "ACL 2024",
    "url": "https://arxiv.org/abs/2404.18831",
    "summary": "Cultural values (collectivism, hierarchy, honor) sanitized to Western-individualist defaults in LLM outputs, even when prompted in target culture's language.",
    "candidate_bill": "Bill_10",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.8,
    "watchlist_tier": null,
    "notes": "Value drift companion to register drift.",
    "_appeared_in_sweeps": [
      "sweep_905_dialect_drift"
    ]
  },
  {
    "paper_id": "viet-eval-2024",
    "title": "VinaLLM: An Independent Vietnamese Eval Suite",
    "authors": [
      "Truong",
      "Le",
      "Nguyen"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024-05-08",
    "venue": "ACL 2024",
    "url": "https://aclanthology.org/2024.findings-acl.428/",
    "summary": "Independent Vietnamese eval. Native-authored ViNLI, ViMLU. Hidden test set.",
    "candidate_bill": "Bill_10",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.75,
    "watchlist_tier": null,
    "notes": "Vietnamese community B10\u2605.",
    "_appeared_in_sweeps": [
      "sweep_907_independent_audits"
    ]
  },
  {
    "paper_id": "vinaadel-2024",
    "title": "ViNLI and ViMLU: Native Vietnamese NLI and MLU Benchmarks",
    "authors": [
      "Truong",
      "Le",
      "Nguyen",
      "et al."
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024-05-08",
    "venue": "ACL 2024",
    "url": "https://aclanthology.org/2024.findings-acl.428/",
    "summary": "Native Vietnamese NLI + MLU split with strict held-out test. LLM performance drops sharply when questions are native rather than translated.",
    "candidate_bill": "Bill_4",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.84,
    "watchlist_tier": null,
    "notes": "Confirms native-source vs translated gap for Vietnamese.",
    "_appeared_in_sweeps": [
      "sweep_902_flores_benchmarks"
    ]
  },
  {
    "paper_id": "vocab-extension-2023",
    "title": "Vocabulary Extension Methods for Multilingual Adaptation",
    "authors": [
      "Csaki",
      "Pawelek",
      "Yin",
      "Marchisio"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2023-11-22",
    "venue": "EMNLP 2023",
    "url": "https://arxiv.org/abs/2311.13322",
    "summary": "Methods for adding target-language tokens to existing tokenizer post-hoc. Used in MaLA-500, Lugha-Llama, Vikhr, etc.",
    "candidate_bill": "Bill_11",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.85,
    "watchlist_tier": null,
    "notes": "Practical vocab-extension methodology.",
    "_appeared_in_sweeps": [
      "sweep_906_tokenizer_fertility"
    ]
  },
  {
    "paper_id": "wikipediadiffs-2024",
    "title": "Wikipedia as Multilingual Training Data: A Quality Audit Across 100+ Languages",
    "authors": [
      "Wang",
      "Vrande\u010di\u0107",
      "Adelani"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024-07-30",
    "venue": "EMNLP 2024",
    "url": "https://arxiv.org/abs/2407.20559",
    "summary": "Wikipedia varies wildly by language. Some 'languages' on Wikipedia are 90% machine-translated by bots, others are native. Implications for multilingual pretraining data composition.",
    "candidate_bill": "Bill_10",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.82,
    "watchlist_tier": null,
    "notes": "Data-source audit; refutes 'Wikipedia coverage = capability' assumption.",
    "_appeared_in_sweeps": [
      "sweep_904_low_resource_mt"
    ]
  },
  {
    "paper_id": "wmt-2024",
    "title": "Findings of the 2024 Conference on Machine Translation (WMT24)",
    "authors": [
      "Kocmi",
      "Federmann",
      "et al. and WMT organizing committee"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024-11-15",
    "venue": "WMT 2024",
    "url": "https://aclanthology.org/2024.wmt-1.0/",
    "summary": "Major MT evaluation conference. WMT24 includes general MT, low-resource, indigenous Americas, African, ML-style evaluation. Annual independent benchmark.",
    "candidate_bill": "Bill_10",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.92,
    "watchlist_tier": null,
    "notes": "Annual independent third-party MT benchmark; reference standard for vendor-claim audits.",
    "_appeared_in_sweeps": [
      "sweep_904_low_resource_mt"
    ]
  },
  {
    "paper_id": "wmt-context-2024",
    "title": "Document-Level Machine Translation with LLMs: Context Matters",
    "authors": [
      "Vernikos",
      "Popescu-Belis",
      "Bawden"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024-07-22",
    "venue": "ACL 2024",
    "url": "https://arxiv.org/abs/2407.15901",
    "summary": "Document-level vs sentence-level MT with LLMs. Documents context closes 30-50% of low-resource MT gap, but introduces new failure modes (lost context, register drift).",
    "candidate_bill": "Bill_10",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.75,
    "watchlist_tier": null,
    "notes": "Document-context is a real but partial fix for low-resource MT.",
    "_appeared_in_sweeps": [
      "sweep_904_low_resource_mt"
    ]
  },
  {
    "paper_id": "wmt-llm-2024",
    "title": "Are LLMs Breaking MT? Results from the WMT24 General MT Shared Task",
    "authors": [
      "Kocmi",
      "Federmann",
      "Akhbardeh",
      "et al."
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024-11-25",
    "venue": "WMT 2024",
    "url": "https://aclanthology.org/2024.wmt-1.1/",
    "summary": "Compares LLM-based MT (GPT-4o, Claude-3.5, Gemini-1.5) vs specialized MT (NLLB, IndicTrans). LLMs win on high-resource, lose on low-resource. Gemini-1.5 wins Spanish, GPT-4o wins Chinese.",
    "candidate_bill": "Bill_10",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.93,
    "watchlist_tier": null,
    "notes": "Critical 2024 evidence \u2014 LLMs \u2260 universal MT; specialists still win on tail.",
    "_appeared_in_sweeps": [
      "sweep_904_low_resource_mt"
    ]
  },
  {
    "paper_id": "wmt-shared-task-2024",
    "title": "WMT24 Shared Task: Independent MT Evaluation Across 11 Language Pairs",
    "authors": [
      "Kocmi",
      "Federmann",
      "Akhbardeh",
      "et al."
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024-11-15",
    "venue": "WMT 2024",
    "url": "https://aclanthology.org/2024.wmt-1.0/",
    "summary": "Annual independent MT eval. WMT24 includes LLM-track. Native human judgments contradict many vendor self-reports.",
    "candidate_bill": "Bill_10",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.93,
    "watchlist_tier": null,
    "notes": "Annual gold-standard B10\u2605 for MT.",
    "_appeared_in_sweeps": [
      "sweep_907_independent_audits"
    ]
  },
  {
    "paper_id": "xlmr-2020",
    "title": "Unsupervised Cross-lingual Representation Learning at Scale",
    "authors": [
      "Conneau",
      "Khandelwal",
      "Goyal",
      "Chaudhary",
      "Wenzek",
      "Guzm\u00e1n",
      "Grave",
      "Ott",
      "Zettlemoyer",
      "Stoyanov"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2019-11-05",
    "venue": "ACL 2020",
    "url": "https://arxiv.org/abs/1911.02116",
    "summary": "XLM-R trained on 2.5TB CommonCrawl across 100 languages. Establishes asymmetric performance across scripts: Latin > Cyrillic > Arabic > Brahmic > CJK > non-standard scripts.",
    "candidate_bill": "Bill_7",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.95,
    "watchlist_tier": null,
    "notes": "Foundational vendor card establishing the script-gap problem.",
    "_appeared_in_sweeps": [
      "sweep_903_cross_script"
    ]
  },
  {
    "paper_id": "xnli-leak-2024",
    "title": "An Audit of Multilingual NLI Test Set Contamination in Pretraining Corpora",
    "authors": [
      "Etxaniz",
      "Sainz",
      "Lacalle",
      "Agirre"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024-04-26",
    "venue": "ACL 2024 Findings",
    "url": "https://arxiv.org/abs/2404.10859",
    "summary": "Audits XNLI, PAWS-X, and Flores test contamination across OSCAR, mC4, RedPajama. >40% of XNLI sentences appear verbatim in pretraining data of common multilingual LLMs.",
    "candidate_bill": "Bill_10",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.92,
    "watchlist_tier": null,
    "notes": "Direct contamination audit \u2014 invalidates XNLI as 'held-out' post-2023.",
    "_appeared_in_sweeps": [
      "sweep_902_flores_benchmarks"
    ]
  },
  {
    "paper_id": "xtreme-2020",
    "title": "XTREME: A Massively Multilingual Multi-task Benchmark for Evaluating Cross-lingual Generalization",
    "authors": [
      "Hu",
      "Ruder",
      "Siddhant",
      "Neubig",
      "Firat",
      "Johnson"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2020-03-24",
    "venue": "ICML 2020",
    "url": "https://arxiv.org/abs/2003.11080",
    "summary": "Aggregates 9 tasks across 40 languages (sentence classification, structured prediction, QA, retrieval). Defines cross-lingual transfer protocol: train on English, evaluate on all languages.",
    "candidate_bill": "Bill_4",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.95,
    "watchlist_tier": null,
    "notes": "Predates Flores+. Heavily contaminated in mid-2020s LLMs.",
    "_appeared_in_sweeps": [
      "sweep_902_flores_benchmarks"
    ]
  },
  {
    "paper_id": "xtreme-r-2021",
    "title": "XTREME-R: Towards More Challenging and Nuanced Multilingual Evaluation",
    "authors": [
      "Ruder",
      "Constant",
      "Botha",
      "Siddhant",
      "Firat",
      "Fu",
      "Liu",
      "Hu",
      "Garrette",
      "Neubig",
      "Johnson"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2021-04-15",
    "venue": "EMNLP 2021",
    "url": "https://arxiv.org/abs/2104.07412",
    "summary": "Replaces saturated XTREME tasks. Adds 10 new languages, harder QA and retrieval tasks, removes saturated tasks, adds diagnostic suites for typological coverage and adversarial probes.",
    "candidate_bill": "Bill_4",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.94,
    "watchlist_tier": null,
    "notes": "Explicitly built to defeat saturation \u2014 a meta-bill for held-out benchmark design (B10\u2605).",
    "_appeared_in_sweeps": [
      "sweep_902_flores_benchmarks"
    ]
  },
  {
    "paper_id": "yi-script-2024",
    "title": "Yi Script in LLMs: A Sino-Tibetan Indigenous Writing System Audit",
    "authors": [
      "Li",
      "Wang",
      "Tang"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024-05-08",
    "venue": "LREC-COLING 2024",
    "url": "https://aclanthology.org/2024.lrec-main.1453/",
    "summary": "Yi (Liangshan Yi) script in LLMs. Despite ~9M speakers, Yi recognition is <20% in GPT-4o, Qwen-2.5, Claude-3.5.",
    "candidate_bill": "Bill_10",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "Sino-Tibetan minority script in PRC; not preserved despite Qwen's CJK strength.",
    "_appeared_in_sweeps": [
      "sweep_903_cross_script"
    ]
  },
  {
    "paper_id": "yiddish-2024",
    "title": "Yiddish in LLMs: A Minority Germanic Language Capability Audit",
    "authors": [
      "Stein",
      "Berkowitz",
      "Goldberg"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024-09-12",
    "venue": "EMNLP 2024",
    "url": "https://arxiv.org/abs/2409.07556",
    "summary": "Yiddish (Western and Eastern variants) in LLMs. Outputs often slip into German or Hebrew. Cultural-religious vocabulary often mistranslated.",
    "candidate_bill": "Bill_10",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "Diaspora language with cultural-religious specificity; cross-language slip pattern.",
    "_appeared_in_sweeps": [
      "sweep_905_dialect_drift"
    ]
  },
  {
    "paper_id": "yong-low-resource-jailbreak-2023",
    "title": "Low-Resource Languages Jailbreak GPT-4",
    "authors": [
      "Yong",
      "Menghini",
      "Bach"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2023-10-03",
    "venue": "ICML 2024",
    "url": "https://arxiv.org/abs/2310.02446",
    "summary": "Demonstrates GPT-4 safety training does not transfer to low-resource languages. Translation-based jailbreak succeeds 79% of the time when prompts are in Zulu, Scots Gaelic, Hmong, Guaran\u00ed.",
    "candidate_bill": "Bill_10",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.95,
    "watchlist_tier": null,
    "notes": "Foundational B10\u2605 \u2014 multilingual safety transfer fails. Heavily cited.",
    "_appeared_in_sweeps": [
      "sweep_908_safety_negative"
    ]
  }
]