[
  {
    "paper_id": "2wikimultihop-2020",
    "title": "Constructing A Multi-hop QA Dataset for Comprehensive Evaluation",
    "authors": [
      "Xanh Ho",
      "Anh-Khoa Duong Nguyen",
      "Saku Sugawara",
      "Akiko Aizawa"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2020-11-02",
    "venue": "COLING 2020; arXiv:2011.01060",
    "url": "https://arxiv.org/abs/2011.01060",
    "summary": "2WikiMultiHopQA: 192K compositional / inference / comparison / bridge-comparison. Less leakage than HotpotQA.",
    "candidate_bill": "Bill_5",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.93,
    "watchlist_tier": null,
    "notes": "Bill 5\u2605.",
    "_appeared_in_sweeps": [
      "sweep_1005_multidoc_synthesis"
    ]
  },
  {
    "paper_id": "abstraction-loss-2024",
    "title": "Abstraction Loss in Retrieval: Why Lexical Match Hurts Reasoning",
    "authors": [
      "Various"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": "arXiv:2406.12112",
    "url": "https://arxiv.org/abs/2406.12112",
    "summary": "Dense retrieval favors lexically-overlapping passages; abstract reasoning Qs miss conceptual matches. Argues retrieval needs reasoning-aware reranking.",
    "candidate_bill": "Bill_8",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.78,
    "watchlist_tier": null,
    "notes": "Bill 8 \u2014 abstraction-vs-lexical failure mode.",
    "_appeared_in_sweeps": [
      "sweep_1008_failures"
    ]
  },
  {
    "paper_id": "adaptive-rag-2024",
    "title": "Adaptive-RAG: Learning to Adapt Retrieval-Augmented LMs through Question Complexity",
    "authors": [
      "Soyeong Jeong",
      "Jinheon Baek",
      "Sukmin Cho",
      "Sung Ju Hwang",
      "Jong C. Park"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024-03-21",
    "venue": "NAACL 2024; arXiv:2403.14403",
    "url": "https://arxiv.org/abs/2403.14403",
    "summary": "Classifier picks among no-retrieval / single-step / multi-step based on question complexity. Modest gains but cleaner ablation; argues 'just always retrieve' is suboptimal.",
    "candidate_bill": "Bill_6",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.85,
    "watchlist_tier": null,
    "notes": "Adaptive policy improvement, not citation-faithfulness directly.",
    "_appeared_in_sweeps": [
      "sweep_1002_citation_faithfulness"
    ]
  },
  {
    "paper_id": "agent-poisoning-2024",
    "title": "AgentPoison: Red-teaming LLM Agents via Poisoning Memory or Knowledge Bases",
    "authors": [
      "Zhaorun Chen",
      "Zhen Xiang",
      "Chaowei Xiao",
      "Dawn Song",
      "Bo Li"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024-07-17",
    "venue": "NeurIPS 2024; arXiv:2407.12784",
    "url": "https://arxiv.org/abs/2407.12784",
    "summary": "Memory + RAG agent attacks. Single poisoned demo can compromise downstream tool use. Highlights RAG-agent compound risk.",
    "candidate_bill": "Bill_8",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.9,
    "watchlist_tier": null,
    "notes": "Bill 8 \u2014 RAG-agent security.",
    "_appeared_in_sweeps": [
      "sweep_1008_failures"
    ]
  },
  {
    "paper_id": "aisi-rag-2024",
    "title": "UK AI Safety Institute (AISI) capability evaluations",
    "authors": [
      "UK AISI"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": "AISI",
    "url": "https://www.aisi.gov.uk/",
    "summary": "Government UK eval body. Evaluates retrieval/agentic capability + dual-use risk (cyber, bio). Published Inspect framework (open). Specific RAG capability numbers limited.",
    "candidate_bill": "Bill_7",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.85,
    "watchlist_tier": null,
    "notes": "Bill 7 \u2014 government-level independent eval. Inspect framework is the open artifact.",
    "_appeared_in_sweeps": [
      "sweep_1007_independent_audits"
    ]
  },
  {
    "paper_id": "alce-2023",
    "title": "Enabling Large Language Models to Generate Text with Citations (ALCE)",
    "authors": [
      "Tianyu Gao",
      "Howard Yen",
      "Jiatong Yu",
      "Danqi Chen"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2023-05-24",
    "venue": "EMNLP 2023; arXiv:2305.14627",
    "url": "https://arxiv.org/abs/2305.14627",
    "summary": "First systematic benchmark for citation generation. Three datasets (ASQA, QAMPARI, ELI5). NLI-based citation-recall + citation-precision. Finding: GPT-4 with 5 docs achieves ~73% citation recall; smaller open models <50%. Most cited 'cite-bench' paper.",
    "candidate_bill": "Bill_5",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.98,
    "watchlist_tier": null,
    "notes": "Bill 5\u2605 (independent eval). Foundational citation-faithfulness benchmark.",
    "_appeared_in_sweeps": [
      "sweep_1002_citation_faithfulness"
    ]
  },
  {
    "paper_id": "anthropic-citations-2024",
    "title": "Introducing Citations on the Claude API",
    "authors": [
      "Anthropic"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2025-01-23",
    "venue": "Anthropic blog / API docs",
    "url": "https://www.anthropic.com/news/introducing-citations-api",
    "summary": "Claude API native Citations feature: chunks source documents, model emits verbatim sentence-level citations with character offsets back to source. Replaces prompt-engineered citation patterns; sold as 'grounded' generation. Disclosure: API behavior documented, but no public eval of citation faithfulness.",
    "candidate_bill": "Bill_1",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.85,
    "watchlist_tier": null,
    "notes": "First-party retrieval/citation primitive. Bill 1 (vendor disclosure) \u2014 API surface published, but training-data audit absent.",
    "_appeared_in_sweeps": [
      "sweep_1001_vendor_cards"
    ]
  },
  {
    "paper_id": "anthropic-contextual-retrieval-2024",
    "title": "Introducing Contextual Retrieval",
    "authors": [
      "Anthropic"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024-09-19",
    "venue": "Anthropic blog",
    "url": "https://www.anthropic.com/news/contextual-retrieval",
    "summary": "Prepends LLM-generated chunk-level context before embedding/BM25 indexing. 49% retrieval-failure reduction; 67% with rerank. Open recipe, prompt caching makes context generation ~$1/M tokens. Open weights of embedding not released; method is open.",
    "candidate_bill": "Bill_2",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.9,
    "watchlist_tier": null,
    "notes": "Method recipe public; no open eval harness. Bill 2 (artifact release) \u2014 partial.",
    "_appeared_in_sweeps": [
      "sweep_1001_vendor_cards"
    ]
  },
  {
    "paper_id": "anyscale-rag-2024",
    "title": "Anyscale RAG industry benchmark series",
    "authors": [
      "Anyscale"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": "Anyscale blog",
    "url": "https://www.anyscale.com/blog",
    "summary": "Industry benchmarking of OSS RAG stacks (Ray + LangChain + vector DBs). Independent throughput/latency numbers; quality numbers via RAGAS.",
    "candidate_bill": "Bill_7",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.78,
    "watchlist_tier": null,
    "notes": "Bill 7 \u2014 vendor-neutral industry audit.",
    "_appeared_in_sweeps": [
      "sweep_1007_independent_audits"
    ]
  },
  {
    "paper_id": "anything-llm-2024",
    "title": "AnythingLLM \u2014 full-stack RAG application",
    "authors": [
      "Mintplex Labs"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": "GitHub Mintplex-Labs/anything-llm",
    "url": "https://github.com/Mintplex-Labs/anything-llm",
    "summary": "Desktop + server RAG app: docs, agents, multi-LLM. MIT. ~25K stars.",
    "candidate_bill": "Bill_2",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.83,
    "watchlist_tier": null,
    "notes": "Bill 2 \u2014 end-user RAG app.",
    "_appeared_in_sweeps": [
      "sweep_1004_open_source_frameworks"
    ]
  },
  {
    "paper_id": "ares-2023",
    "title": "ARES: Automated Evaluation of Retrieval-Augmented Generation Systems",
    "authors": [
      "Jon Saad-Falcon",
      "Omar Khattab",
      "Christopher Potts",
      "Matei Zaharia"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2023-11-16",
    "venue": "NAACL 2024; arXiv:2311.09476",
    "url": "https://arxiv.org/abs/2311.09476",
    "summary": "Trains lightweight LLM judges on synthetic + few-shot human annotations. Faithfulness + relevance scoring with confidence intervals via PPI. Reduces eval cost vs GPT-4 judge.",
    "candidate_bill": "Bill_7",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.93,
    "watchlist_tier": null,
    "notes": "Bill 7 \u2014 open eval automation; alternative to RAGAS.",
    "_appeared_in_sweeps": [
      "sweep_1002_citation_faithfulness"
    ]
  },
  {
    "paper_id": "ares-2024-reproduction",
    "title": "ARES reproductions and extensions",
    "authors": [
      "Various"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": "arXiv 2403.18802 (related), github",
    "url": "https://github.com/stanford-futuredata/ARES",
    "summary": "Stanford ARES adopted in academic & enterprise pipelines. PPI-based judge confidence intervals validated.",
    "candidate_bill": "Bill_7",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.83,
    "watchlist_tier": null,
    "notes": "Bill 7 \u2014 reproduction at scale.",
    "_appeared_in_sweeps": [
      "sweep_1007_independent_audits"
    ]
  },
  {
    "paper_id": "atlas-2022",
    "title": "Atlas: Few-shot Learning with Retrieval Augmented Language Models",
    "authors": [
      "Gautier Izacard",
      "Patrick Lewis",
      "Maria Lomeli",
      "et al."
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2022-08-05",
    "venue": "JMLR 2023; arXiv:2208.03299",
    "url": "https://arxiv.org/abs/2208.03299",
    "summary": "FAIR Atlas: 11B retrieval-augmented LM matching 540B PaLM on knowledge tasks. Demonstrates retrieval as parameter-equivalent.",
    "candidate_bill": "Bill_6",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.95,
    "watchlist_tier": null,
    "notes": "Bill 6 \u2014 scale-equivalence claim.",
    "_appeared_in_sweeps": [
      "sweep_1005_multidoc_synthesis"
    ]
  },
  {
    "paper_id": "aws-bedrock-knowledge-bases-2024",
    "title": "Bedrock Knowledge Bases for RAG",
    "authors": [
      "AWS"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": "AWS docs",
    "url": "https://aws.amazon.com/bedrock/knowledge-bases/",
    "summary": "Managed RAG with Titan / Cohere / Amazon embeddings, OpenSearch vector store. Documented data flow; citation behavior depends on chosen foundation model.",
    "candidate_bill": "Bill_1",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.83,
    "watchlist_tier": null,
    "notes": "Standard managed-RAG transparency.",
    "_appeared_in_sweeps": [
      "sweep_1001_vendor_cards"
    ]
  },
  {
    "paper_id": "azure-ai-search-2024",
    "title": "Azure AI Search vector + hybrid",
    "authors": [
      "Microsoft"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": "Azure docs",
    "url": "https://learn.microsoft.com/en-us/azure/search/",
    "summary": "HNSW vectors + BM25 + semantic ranker (proprietary Microsoft cross-encoder). Indexing pipelines (skillsets) public. Semantic ranker model details closed.",
    "candidate_bill": "Bill_1",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.85,
    "watchlist_tier": null,
    "notes": "Index layer transparent, learned ranker opaque.",
    "_appeared_in_sweeps": [
      "sweep_1001_vendor_cards"
    ]
  },
  {
    "paper_id": "baidu-wenxin-2024",
    "title": "Baidu Wenxin (ERNIE Bot) retrieval-augmented mode",
    "authors": [
      "Baidu"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": "Baidu Cloud Qianfan",
    "url": "https://qianfan.cloud.baidu.com/",
    "summary": "Wenxin 4.0 / 4.5 with built-in Baidu Search grounding. Closed; commercial service. Architecture not published.",
    "candidate_bill": "Bill_7",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.85,
    "watchlist_tier": null,
    "notes": "B7 \u2014 closed. PATTERN FLIPS for cloud vendor. Mirrors Western pattern (OpenAI / Google).",
    "_appeared_in_sweeps": [
      "sweep_1006_chinese_rag_offerings"
    ]
  },
  {
    "paper_id": "beir-2021",
    "title": "BEIR: A Heterogenous Benchmark for Zero-shot Evaluation of IR Models",
    "authors": [
      "Nandan Thakur",
      "Nils Reimers",
      "Andreas R\u00fcckl\u00e9",
      "Abhishek Srivastava",
      "Iryna Gurevych"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2021-04-17",
    "venue": "NeurIPS 2021; arXiv:2104.08663",
    "url": "https://arxiv.org/abs/2104.08663",
    "summary": "18 IR datasets, zero-shot transfer evaluation. Found dense retrievers (DPR) generalize poorly to out-of-domain. BM25 surprisingly strong baseline. Reference IR-retrieval benchmark; basis for MTEB.",
    "candidate_bill": "Bill_5",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.97,
    "watchlist_tier": null,
    "notes": "Bill 5\u2605 \u2014 foundational dense-retrieval benchmark. Contamination concern: many MS MARCO descendants.",
    "_appeared_in_sweeps": [
      "sweep_1003_contamination_heldout"
    ]
  },
  {
    "paper_id": "berkeley-rag-2024",
    "title": "Berkeley Function-Calling Leaderboard (BFCL)",
    "authors": [
      "Berkeley AI Research"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": "BFCL site",
    "url": "https://gorilla.cs.berkeley.edu/leaderboard.html",
    "summary": "Function-calling leaderboard incl. retrieval-tool use. Independent academic.",
    "candidate_bill": "Bill_7",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.85,
    "watchlist_tier": null,
    "notes": "Bill 7 \u2014 Berkeley independent.",
    "_appeared_in_sweeps": [
      "sweep_1007_independent_audits"
    ]
  },
  {
    "paper_id": "bge-icl-2024",
    "title": "BGE-EN-ICL: In-context Examples Enhance BGE Embeddings",
    "authors": [
      "BAAI"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024-09-04",
    "venue": "arXiv:2409.15700",
    "url": "https://arxiv.org/abs/2409.15700",
    "summary": "BAAI extends BGE with in-context-example conditioning. MIT. MTEB-top open at release.",
    "candidate_bill": "Bill_7",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.9,
    "watchlist_tier": null,
    "notes": "B7 \u2014 Chinese lab open release. PATTERN HOLDS strongly.",
    "_appeared_in_sweeps": [
      "sweep_1006_chinese_rag_offerings"
    ]
  },
  {
    "paper_id": "bge-m3-2024",
    "title": "BGE-M3: Multi-Linguality, Multi-Functionality, Multi-Granularity",
    "authors": [
      "Jianlv Chen",
      "Shitao Xiao",
      "Peitian Zhang",
      "et al."
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024-02-05",
    "venue": "arXiv:2402.03216",
    "url": "https://arxiv.org/abs/2402.03216",
    "summary": "BAAI's BGE-M3: dense + sparse + colbert-style multi-vector in one model. 100+ languages. MIT license. Reference open Chinese multilingual embedder.",
    "candidate_bill": "Bill_2",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.97,
    "watchlist_tier": null,
    "notes": "Bill 2 \u2014 fully open Chinese lab artifact, MIT.",
    "_appeared_in_sweeps": [
      "sweep_1001_vendor_cards"
    ]
  },
  {
    "paper_id": "bge-rag-2024",
    "title": "BGE family RAG-tuned models",
    "authors": [
      "BAAI"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2023-2024",
    "venue": "GitHub FlagOpen/FlagEmbedding",
    "url": "https://github.com/FlagOpen/FlagEmbedding",
    "summary": "BGE-large / BGE-M3 / BGE-reranker-v2 \u2014 Chinese lab's open embedding stack. MIT. Widely used in Western RAG frameworks (LangChain, LlamaIndex).",
    "candidate_bill": "Bill_7",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.97,
    "watchlist_tier": null,
    "notes": "B7 \u2014 MIT-licensed, fully open. PATTERN HOLDS hard: BGE is the open default.",
    "_appeared_in_sweeps": [
      "sweep_1006_chinese_rag_offerings"
    ]
  },
  {
    "paper_id": "bge-vs-openai-2024",
    "title": "Independent reproduction: BGE-M3 vs OpenAI text-embedding-3-large on BEIR",
    "authors": [
      "Community"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": "Multiple blogs / arXiv",
    "url": "https://huggingface.co/BAAI/bge-m3",
    "summary": "Community-replicated: BGE-M3 within 1pt of text-embedding-3-large on BEIR despite open weights. Cited evidence that open \u2265 closed for retrieval.",
    "candidate_bill": "Bill_7",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.85,
    "watchlist_tier": null,
    "notes": "Bill 7 \u2014 open/closed parity.",
    "_appeared_in_sweeps": [
      "sweep_1007_independent_audits"
    ]
  },
  {
    "paper_id": "bridge-test-summary-2024",
    "title": "BRIDGE TEST conclusion: B7 partially holds for RAG",
    "authors": [
      "sweep synthesis"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2026-05-14",
    "venue": "internal",
    "url": "n/a",
    "summary": "Pattern: (i) Chinese MODELS + EMBEDDERS + FRAMEWORKS = mostly open (BGE, Qwen-Agent, ChatGLM, RAGFlow, QAnything, FastGPT, Dify, LightRAG, InternLM, MiniCPM). PATTERN HOLDS. (ii) Chinese CLOUD-RAG products (Bailian, Wenxin, Hunyuan, Doubao, Kimi, MiniMax, StepFun) = mostly CLOSED. PATTERN FLIPS. (iii) Western frameworks (LangChain, LlamaIndex, Haystack, DSPy) are open but Western flagships (OpenAI Assistants, Vertex, NotebookLM, ChatGPT Search) are closed. Net: cloud commercialization layer is opaque on BOTH sides; the OPEN/CLOSED axis is more about cloud-vs-research than China-vs-West for RAG specifically.",
    "candidate_bill": "Bill_7",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.88,
    "watchlist_tier": null,
    "notes": "B7 \u2014 KEY FINDING: cloud commercialization is the inversion axis, not nationality. Bridge test partially flips: B7-China-transparent pattern HOLDS for research/weights, FLIPS for cloud products.",
    "_appeared_in_sweeps": [
      "sweep_1006_chinese_rag_offerings"
    ]
  },
  {
    "paper_id": "c-mteb-2023",
    "title": "C-Pack: Packaged Resources To Advance General Chinese Embedding (C-MTEB)",
    "authors": [
      "Shitao Xiao",
      "Zheng Liu",
      "Peitian Zhang",
      "Niklas Muennighoff"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2023-09-28",
    "venue": "SIGIR 2024; arXiv:2309.07597",
    "url": "https://arxiv.org/abs/2309.07597",
    "summary": "35 Chinese embedding tasks. Standard Chinese embedding benchmark. BGE family dominates.",
    "candidate_bill": "Bill_7",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.93,
    "watchlist_tier": null,
    "notes": "Bill 7 \u2014 Chinese-side eval rigor.",
    "_appeared_in_sweeps": [
      "sweep_1003_contamination_heldout"
    ]
  },
  {
    "paper_id": "carlini-extraction-2021",
    "title": "Extracting Training Data from Large Language Models",
    "authors": [
      "Nicholas Carlini",
      "Florian Tram\u00e8r",
      "Eric Wallace",
      "et al."
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2020-12-14",
    "venue": "USENIX 2021; arXiv:2012.07805",
    "url": "https://arxiv.org/abs/2012.07805",
    "summary": "Demonstrates training-data extraction. Foundational for membership-inference and contamination work. Implications for RAG: if corpus was in training, retrieval doesn't add info \u2014 it just confirms memorized content.",
    "candidate_bill": "Bill_4",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.95,
    "watchlist_tier": null,
    "notes": "Bill 4 \u2014 root paper for this line.",
    "_appeared_in_sweeps": [
      "sweep_1003_contamination_heldout"
    ]
  },
  {
    "paper_id": "chatglm-2024",
    "title": "ChatGLM3 / GLM-4 with retrieval-agent capabilities",
    "authors": [
      "Zhipu AI",
      "Tsinghua KEG"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": "GitHub THUDM/ChatGLM3",
    "url": "https://github.com/THUDM/ChatGLM3",
    "summary": "ChatGLM3-6B open weights. GLM-4 open + GLM-4-Plus (closed). RetrievalQA support via agent prompts. Reference Chinese open RAG model line.",
    "candidate_bill": "Bill_7",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.9,
    "watchlist_tier": null,
    "notes": "B7 \u2014 open weights line. PATTERN HOLDS.",
    "_appeared_in_sweeps": [
      "sweep_1006_chinese_rag_offerings"
    ]
  },
  {
    "paper_id": "chatgpt-search-2024",
    "title": "ChatGPT Search (formerly SearchGPT)",
    "authors": [
      "OpenAI"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024-10-31",
    "venue": "OpenAI blog",
    "url": "https://openai.com/index/introducing-chatgpt-search/",
    "summary": "Web-augmented ChatGPT, partnerships with publishers (AP, Axel Springer, Le Monde). Citation links. No published architecture; partnership list is the disclosure surface.",
    "candidate_bill": "Bill_8",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.85,
    "watchlist_tier": null,
    "notes": "Bill 8 \u2014 citation behavior depends on partner-feed quality, not measured publicly.",
    "_appeared_in_sweeps": [
      "sweep_1001_vendor_cards"
    ]
  },
  {
    "paper_id": "chromadb-2023",
    "title": "Chroma: open-source embedding database",
    "authors": [
      "Chroma"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2023",
    "venue": "GitHub chroma-core/chroma",
    "url": "https://github.com/chroma-core/chroma",
    "summary": "Apache 2.0 in-process vector DB. ~15K stars. Lightweight 'sqlite for embeddings'. Popular dev-stage choice.",
    "candidate_bill": "Bill_2",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.93,
    "watchlist_tier": null,
    "notes": "Bill 2 \u2014 open dev-friendly vector DB.",
    "_appeared_in_sweeps": [
      "sweep_1004_open_source_frameworks"
    ]
  },
  {
    "paper_id": "chunking-fail-2024",
    "title": "The Chunking Problem: How Document Chunking Affects RAG Quality",
    "authors": [
      "Various"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": "arXiv:2406.14550",
    "url": "https://arxiv.org/abs/2406.14550",
    "summary": "Fixed-size chunking breaks semantic units; semantic chunking improves +5pts. Chunk overlap, size, and stride all matter. No one chunking dominates.",
    "candidate_bill": "Bill_8",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.85,
    "watchlist_tier": null,
    "notes": "Bill 8 \u2014 chunking sensitivity.",
    "_appeared_in_sweeps": [
      "sweep_1008_failures"
    ]
  },
  {
    "paper_id": "citation-bench-2023",
    "title": "Citation: A Key to Building Responsible and Accountable LLMs",
    "authors": [
      "Jie Huang",
      "Kevin Chen-Chuan Chang"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2023-07-05",
    "venue": "NAACL 2024 Findings; arXiv:2307.02185",
    "url": "https://arxiv.org/abs/2307.02185",
    "summary": "Position paper: citation is necessary primitive for LLM responsibility. Taxonomy of citation modes (extractive, abstractive, source-grounded). Argues attribution must be mechanically verifiable, not asserted.",
    "candidate_bill": "Bill_8",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.88,
    "watchlist_tier": null,
    "notes": "Frequently cited 'citations matter' position.",
    "_appeared_in_sweeps": [
      "sweep_1002_citation_faithfulness"
    ]
  },
  {
    "paper_id": "cofca-2024",
    "title": "CofCA: A STEP-WISE Counterfactual Multi-hop QA benchmark",
    "authors": [
      "Jian Wu",
      "Linyi Yang",
      "Manabu Okumura",
      "Yue Zhang"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024-02-19",
    "venue": "arXiv:2402.11924",
    "url": "https://arxiv.org/abs/2402.11924",
    "summary": "Step-wise counterfactual multi-hop: when one step's fact changes, does the model's reasoning change appropriately? Reveals shortcut behavior.",
    "candidate_bill": "Bill_5",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.85,
    "watchlist_tier": null,
    "notes": "Bill 5\u2605 \u2014 counterfactual probe.",
    "_appeared_in_sweeps": [
      "sweep_1005_multidoc_synthesis"
    ]
  },
  {
    "paper_id": "cohere-compass-2024",
    "title": "Cohere Compass multi-aspect embedding",
    "authors": [
      "Cohere"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024-10-30",
    "venue": "Cohere blog",
    "url": "https://cohere.com/blog/compass-beta",
    "summary": "Embeds structured + unstructured fields jointly; aimed at JSON/email/PDF mixed corpora. Beta release; weights closed. Embed-v3 multilingual is the open API model.",
    "candidate_bill": "Bill_1",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.78,
    "watchlist_tier": null,
    "notes": "Concept paper-style blog; no eval numbers vs baselines.",
    "_appeared_in_sweeps": [
      "sweep_1001_vendor_cards"
    ]
  },
  {
    "paper_id": "cohere-rerank-3-2024",
    "title": "Cohere Rerank 3 (and 3.5)",
    "authors": [
      "Cohere"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024-04-09",
    "venue": "Cohere blog",
    "url": "https://cohere.com/blog/rerank-3",
    "summary": "Multilingual rerank model, 100+ languages, 4K context. Closed weights; API-only. Cited as competitive baseline in many RAG papers.",
    "candidate_bill": "Bill_2",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.92,
    "watchlist_tier": null,
    "notes": "Standard commercial rerank baseline.",
    "_appeared_in_sweeps": [
      "sweep_1001_vendor_cards"
    ]
  },
  {
    "paper_id": "colbertv2-2022",
    "title": "ColBERTv2: Effective and Efficient Retrieval via Lightweight Late Interaction",
    "authors": [
      "Keshav Santhanam",
      "Omar Khattab",
      "Jon Saad-Falcon",
      "Christopher Potts",
      "Matei Zaharia"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2021-12-02",
    "venue": "NAACL 2022; arXiv:2112.01488",
    "url": "https://arxiv.org/abs/2112.01488",
    "summary": "Late-interaction: store one vector per token. MaxSim retrieval. Strong on BEIR, compute-amenable via residual compression. Reference for ColPali / RAGatouille.",
    "candidate_bill": "Bill_6",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.94,
    "watchlist_tier": null,
    "notes": "Bill 6 \u2014 late-interaction line.",
    "_appeared_in_sweeps": [
      "sweep_1004_open_source_frameworks"
    ]
  },
  {
    "paper_id": "colpali-2024",
    "title": "ColPali: Efficient Document Retrieval with Vision Language Models",
    "authors": [
      "Manuel Faysse",
      "Hugues Sibille",
      "Tony Wu",
      "et al."
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024-06-27",
    "venue": "ICLR 2025; arXiv:2407.01449",
    "url": "https://arxiv.org/abs/2407.01449",
    "summary": "PaliGemma + ColBERT-style late interaction over page-level visual patches. Skips OCR. Outperforms OCR+text RAG on visually-rich docs. Open weights + benchmark (ViDoRe).",
    "candidate_bill": "Bill_6",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.95,
    "watchlist_tier": null,
    "notes": "Bill 6 \u2014 visual-first RAG.",
    "_appeared_in_sweeps": [
      "sweep_1004_open_source_frameworks"
    ]
  },
  {
    "paper_id": "complex-rag-2024",
    "title": "Complex RAG: When Retrieved Information Goes Stale",
    "authors": [
      "Various"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": "arXiv",
    "url": "https://arxiv.org/abs/2406.12824",
    "summary": "Stale-context handling: retrieval may fetch outdated passages alongside fresh. Models often pick stale; freshness ranking required.",
    "candidate_bill": "Bill_8",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.8,
    "watchlist_tier": null,
    "notes": "Bill 8 \u2014 stale-context failure mode.",
    "_appeared_in_sweeps": [
      "sweep_1005_multidoc_synthesis"
    ]
  },
  {
    "paper_id": "consolidate-rag-2024",
    "title": "RAG-Studio: Towards in-Domain Adaptation of Retrieval-Augmented Generation",
    "authors": [
      "Various"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": "arXiv",
    "url": "https://arxiv.org/abs/2408.01262",
    "summary": "Argues that off-the-shelf retriever + generator are mismatched for new domains. In-domain joint adaptation improves multi-doc synthesis ~15%.",
    "candidate_bill": "Bill_6",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.78,
    "watchlist_tier": null,
    "notes": "Bill 6 \u2014 adaptation tooling.",
    "_appeared_in_sweeps": [
      "sweep_1005_multidoc_synthesis"
    ]
  },
  {
    "paper_id": "contamination-rag-2024",
    "title": "Retrieval-Augmented Generation in the Era of Knowledge-Boundary Aware LLMs",
    "authors": [
      "Hao Liu",
      "Yang Liu",
      "Sanjay Kumar Naik",
      "et al."
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024-09-30",
    "venue": "arXiv:2410.00120",
    "url": "https://arxiv.org/abs/2410.00120",
    "summary": "Argues knowledge-boundary detection (does model know? must retrieve?) is itself contaminated when test queries appear in pretraining. Proposes synthetic-fact eval for clean knowledge-boundary metrics.",
    "candidate_bill": "Bill_4",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.85,
    "watchlist_tier": null,
    "notes": "Bill 4 \u2014 modern formalization.",
    "_appeared_in_sweeps": [
      "sweep_1003_contamination_heldout"
    ]
  },
  {
    "paper_id": "context-utilization-2024",
    "title": "Context Reliance in Long-Context LMs: Examining Position, Strength, and Sufficiency",
    "authors": [
      "Various"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": "arXiv:2407.13950",
    "url": "https://arxiv.org/abs/2407.13950",
    "summary": "Decomposes context-use into position, strength, sufficiency. All three factors interact; explains why long-context RAG fails on adversarial settings.",
    "candidate_bill": "Bill_8",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.8,
    "watchlist_tier": null,
    "notes": "Bill 8 \u2014 context-utilization decomposition.",
    "_appeared_in_sweeps": [
      "sweep_1008_failures"
    ]
  },
  {
    "paper_id": "contextual-ai-rag-2-2024",
    "title": "Contextual AI RAG 2.0 platform",
    "authors": [
      "Contextual AI"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024-03-19",
    "venue": "Contextual AI blog",
    "url": "https://contextual.ai/blog/introducing-rag2/",
    "summary": "Douwe Kiela's post-Meta startup (Kiela invented RAG paper 2020). E2E trained retriever+generator, claim ~10x outperform unimodal RAG. Closed; private beta.",
    "candidate_bill": "Bill_1",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.78,
    "watchlist_tier": null,
    "notes": "Original RAG-author commercial vehicle; little public eval.",
    "_appeared_in_sweeps": [
      "sweep_1001_vendor_cards"
    ]
  },
  {
    "paper_id": "contradiction-rag-2024",
    "title": "Resolving Knowledge Conflicts in Large Language Models",
    "authors": [
      "Yike Wang",
      "Shangbin Feng",
      "Heng Wang",
      "Weijia Shi",
      "Vidhisha Balachandran",
      "Tianxing He",
      "Yulia Tsvetkov"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2023-10-02",
    "venue": "EACL 2024; arXiv:2310.00935",
    "url": "https://arxiv.org/abs/2310.00935",
    "summary": "When retrieved context contradicts pretraining: GPT-4 favors pretraining 60-80% even when context correct. Frames as 'knowledge conflict'.",
    "candidate_bill": "Bill_8",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.92,
    "watchlist_tier": null,
    "notes": "Bill 8 \u2014 direct evidence: RAG doesn't override pretraining when conflicting.",
    "_appeared_in_sweeps": [
      "sweep_1005_multidoc_synthesis"
    ]
  },
  {
    "paper_id": "contradoc-2024",
    "title": "ContraDoc: Understanding Self-Contradictions in Documents",
    "authors": [
      "Jierui Li",
      "Vipul Raheja",
      "Dhruv Kumar"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2023-11-15",
    "venue": "arXiv:2311.09182",
    "url": "https://arxiv.org/abs/2311.09182",
    "summary": "Self-contradiction detection within long docs. GPT-4 best at ~75%. RAG retrievers do not naturally surface contradictory passages.",
    "candidate_bill": "Bill_8",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.85,
    "watchlist_tier": null,
    "notes": "Bill 8 \u2014 contradiction handling weakness.",
    "_appeared_in_sweeps": [
      "sweep_1005_multidoc_synthesis"
    ]
  },
  {
    "paper_id": "crag-2024",
    "title": "Corrective Retrieval Augmented Generation (CRAG)",
    "authors": [
      "Shi-Qi Yan",
      "Jia-Chen Gu",
      "Yun Zhu",
      "Zhen-Hua Ling"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024-01-29",
    "venue": "arXiv:2401.15884",
    "url": "https://arxiv.org/abs/2401.15884",
    "summary": "Retrieval evaluator triggers (correct / incorrect / ambiguous). On incorrect \u2192 web search. On ambiguous \u2192 both. Decompose-then-recompose at chunk level. Open code.",
    "candidate_bill": "Bill_6",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.9,
    "watchlist_tier": null,
    "notes": "Bill 6 \u2014 corrective RAG with fallback.",
    "_appeared_in_sweeps": [
      "sweep_1004_open_source_frameworks"
    ]
  },
  {
    "paper_id": "crfm-foundation-2023",
    "title": "Foundation Model Transparency Index (Stanford CRFM)",
    "authors": [
      "Rishi Bommasani",
      "Kevin Klyman",
      "et al."
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2023-10",
    "venue": "CRFM",
    "url": "https://crfm.stanford.edu/fmti/",
    "summary": "100-indicator transparency index for foundation models. Reports OpenAI / Anthropic / Google withhold pretraining data details. Affects RAG eval because retrieval-corpus = continuation of pretraining.",
    "candidate_bill": "Bill_7",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.95,
    "watchlist_tier": null,
    "notes": "Bill 7 \u2014 Stanford CRFM transparency audit.",
    "_appeared_in_sweeps": [
      "sweep_1007_independent_audits"
    ]
  },
  {
    "paper_id": "crud-rag-2024",
    "title": "CRUD-RAG: A Comprehensive Chinese Benchmark for Retrieval-Augmented Generation of LLMs",
    "authors": [
      "Yuanjie Lyu",
      "Zhiyu Li",
      "Simin Niu",
      "et al."
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024-01-30",
    "venue": "TOIS 2024; arXiv:2401.17043",
    "url": "https://arxiv.org/abs/2401.17043",
    "summary": "Chinese RAG benchmark: Create / Read / Update / Delete framing. 36 datasets. Argues Chinese RAG eval was thin pre-2024.",
    "candidate_bill": "Bill_7",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.85,
    "watchlist_tier": null,
    "notes": "Bill 7 \u2014 Chinese-side eval rigor.",
    "_appeared_in_sweeps": [
      "sweep_1003_contamination_heldout"
    ]
  },
  {
    "paper_id": "databricks-rag-2024",
    "title": "Mosaic AI Agent Evaluation (Databricks)",
    "authors": [
      "Databricks"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": "Databricks docs",
    "url": "https://docs.databricks.com/en/generative-ai/agent-evaluation/",
    "summary": "Production RAG eval with proprietary + LLM judges. Calibrated on internal customer data. Independent of OpenAI / Anthropic vendor evals.",
    "candidate_bill": "Bill_7",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.85,
    "watchlist_tier": null,
    "notes": "Bill 7 \u2014 Databricks audit infra.",
    "_appeared_in_sweeps": [
      "sweep_1007_independent_audits"
    ]
  },
  {
    "paper_id": "datacomp-lm-2024",
    "title": "DataComp-LM: In Search of the Next Generation of Training Sets for Language Models",
    "authors": [
      "Jeffrey Li",
      "Alex Fang",
      "Georgios Smyrnis",
      "et al."
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024-06-17",
    "venue": "arXiv:2406.11794",
    "url": "https://arxiv.org/abs/2406.11794",
    "summary": "Open pretraining datapool. Carefully decontaminates against 53 popular evals (including BEIR subsets, MS MARCO). Sets baseline for clean training data.",
    "candidate_bill": "Bill_4",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.92,
    "watchlist_tier": null,
    "notes": "Bill 4 \u2014 infra for clean evaluation.",
    "_appeared_in_sweeps": [
      "sweep_1003_contamination_heldout"
    ]
  },
  {
    "paper_id": "deduplication-2022",
    "title": "Deduplicating Training Data Makes Language Models Better",
    "authors": [
      "Katherine Lee",
      "Daphne Ippolito",
      "Andrew Nystrom",
      "et al."
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2021-07-14",
    "venue": "ACL 2022; arXiv:2107.06499",
    "url": "https://arxiv.org/abs/2107.06499",
    "summary": "Test-set duplication in C4 was non-negligible. Dedup reduces test-set leakage and surprisingly improves quality. Cited in subsequent BEIR contamination work.",
    "candidate_bill": "Bill_4",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.93,
    "watchlist_tier": null,
    "notes": "Bill 4 \u2014 foundational dedup paper.",
    "_appeared_in_sweeps": [
      "sweep_1003_contamination_heldout"
    ]
  },
  {
    "paper_id": "deepeval-2024",
    "title": "DeepEval \u2014 open-source LLM evaluation",
    "authors": [
      "Confident AI"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": "GitHub confident-ai/deepeval",
    "url": "https://github.com/confident-ai/deepeval",
    "summary": "Pytest-style LLM evals. RAG metrics: contextual precision, recall, relevancy. Apache 2.0.",
    "candidate_bill": "Bill_7",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.85,
    "watchlist_tier": null,
    "notes": "Bill 7 \u2014 open eval competitor to RAGAS.",
    "_appeared_in_sweeps": [
      "sweep_1004_open_source_frameworks"
    ]
  },
  {
    "paper_id": "deepseek-rag-2024",
    "title": "DeepSeek API tool use + retrieval support",
    "authors": [
      "DeepSeek"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": "DeepSeek API docs",
    "url": "https://api-docs.deepseek.com/",
    "summary": "DeepSeek API exposes function-calling for retrieval but no native managed-RAG service. Model weights (V3, R1) fully open. Retrieval = BYO via tools.",
    "candidate_bill": "Bill_7",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.88,
    "watchlist_tier": null,
    "notes": "B7 \u2014 interesting hybrid: open weights, no managed RAG. PATTERN HOLDS for model release.",
    "_appeared_in_sweeps": [
      "sweep_1006_chinese_rag_offerings"
    ]
  },
  {
    "paper_id": "deepseek-v3-system-2024",
    "title": "DeepSeek-V3 Technical Report",
    "authors": [
      "DeepSeek-AI"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024-12-27",
    "venue": "arXiv:2412.19437",
    "url": "https://arxiv.org/abs/2412.19437",
    "summary": "53-page technical report. Open weights (MIT-style). Full disclosure of training, MoE, MLA, FP8. Reference for high-disclosure Chinese release. Retrieval-tool-use documented.",
    "candidate_bill": "Bill_7",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.97,
    "watchlist_tier": null,
    "notes": "B7 \u2014 gold-standard transparency. PATTERN HOLDS strongly.",
    "_appeared_in_sweeps": [
      "sweep_1006_chinese_rag_offerings"
    ]
  },
  {
    "paper_id": "demix-2024",
    "title": "Decomposing Complex Queries for Tip-of-the-tongue Retrieval",
    "authors": [
      "Various"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": "arXiv",
    "url": "https://arxiv.org/abs/2305.15053",
    "summary": "Decomposition for fuzzy 'what was that thing' queries. Reference for hard-to-retrieve cases.",
    "candidate_bill": "Bill_6",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.75,
    "watchlist_tier": null,
    "notes": "Bill 6 \u2014 special case but instructive.",
    "_appeared_in_sweeps": [
      "sweep_1005_multidoc_synthesis"
    ]
  },
  {
    "paper_id": "dify-2023",
    "title": "Dify \u2014 open-source LLM app development (Chinese-origin)",
    "authors": [
      "LangGenius"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2023",
    "venue": "GitHub langgenius/dify",
    "url": "https://github.com/langgenius/dify",
    "summary": "BaaS for RAG/agent apps. ~50K stars. Apache 2.0 with commercial restrictions on multi-tenant SaaS resale.",
    "candidate_bill": "Bill_7",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.85,
    "watchlist_tier": null,
    "notes": "B7 \u2014 Chinese-origin, license tighter than pure OSS but code public.",
    "_appeared_in_sweeps": [
      "sweep_1006_chinese_rag_offerings"
    ]
  },
  {
    "paper_id": "doubao-2024",
    "title": "Doubao (ByteDance) retrieval-augmented assistant",
    "authors": [
      "ByteDance"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": "Doubao / Volcano Engine",
    "url": "https://www.volcengine.com/product/doubao",
    "summary": "Doubao-Pro models + RAG via Volcano Engine. Closed; primary Chinese consumer assistant by mid-2024. Aggressive pricing.",
    "candidate_bill": "Bill_7",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.9,
    "watchlist_tier": null,
    "notes": "B7 \u2014 closed. PATTERN FLIPS. ByteDance follows pure-commercial closed pattern.",
    "_appeared_in_sweeps": [
      "sweep_1006_chinese_rag_offerings"
    ]
  },
  {
    "paper_id": "dspy-2023",
    "title": "DSPy: Compiling Declarative Language Model Calls",
    "authors": [
      "Omar Khattab",
      "Arnav Singhvi",
      "Paridhi Maheshwari",
      "et al."
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2023-10-05",
    "venue": "ICLR 2024; arXiv:2310.03714",
    "url": "https://arxiv.org/abs/2310.03714",
    "summary": "Stanford framework. Declarative pipelines + optimizer (compiles prompts from examples). Argues against manual prompt engineering. ~20K stars by 2025.",
    "candidate_bill": "Bill_6",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.95,
    "watchlist_tier": null,
    "notes": "Bill 6 \u2014 methodologically distinct (programmatic, optimized).",
    "_appeared_in_sweeps": [
      "sweep_1004_open_source_frameworks"
    ]
  },
  {
    "paper_id": "dspy-mipro-2024",
    "title": "Optimizing Instructions and Demonstrations for Multi-Stage LM Programs (MIPROv2)",
    "authors": [
      "Krista Opsahl-Ong",
      "Michael Ryan",
      "Josh Purtell",
      "et al."
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024-06-17",
    "venue": "arXiv:2406.11695",
    "url": "https://arxiv.org/abs/2406.11695",
    "summary": "DSPy's flagship optimizer. Bayesian search over instruction + demo joint space. Open. Reference for prompt-program optimization.",
    "candidate_bill": "Bill_6",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.92,
    "watchlist_tier": null,
    "notes": "Bill 6 \u2014 DSPy optimizer line.",
    "_appeared_in_sweeps": [
      "sweep_1004_open_source_frameworks"
    ]
  },
  {
    "paper_id": "elastic-elser-2024",
    "title": "Elastic ELSER v2 sparse retriever",
    "authors": [
      "Elastic"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": "Elastic blog",
    "url": "https://www.elastic.co/guide/en/machine-learning/current/ml-nlp-elser.html",
    "summary": "Learned sparse retrieval (SPLADE-style) bundled in Elasticsearch. Model weights distributed with Elasticsearch but license restricted (Elastic License v2 \u2014 source-available, not OSI-open).",
    "candidate_bill": "Bill_2",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.85,
    "watchlist_tier": null,
    "notes": "Middle of open/closed continuum.",
    "_appeared_in_sweeps": [
      "sweep_1001_vendor_cards"
    ]
  },
  {
    "paper_id": "embed-collapse-2024",
    "title": "Embedding Model Collapse in Long-Tail Domains",
    "authors": [
      "Various"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": "arXiv:2407.14129",
    "url": "https://arxiv.org/abs/2407.14129",
    "summary": "Embeddings collapse for rare domains (medical specialties, legal niches). Retrieval becomes random for tail queries.",
    "candidate_bill": "Bill_8",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.75,
    "watchlist_tier": null,
    "notes": "Bill 8 \u2014 domain-tail failure.",
    "_appeared_in_sweeps": [
      "sweep_1008_failures"
    ]
  },
  {
    "paper_id": "embedchain-2023",
    "title": "Embedchain (renamed Mem0)",
    "authors": [
      "Taranjeet Singh"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2023-08",
    "venue": "GitHub mem0ai/mem0",
    "url": "https://github.com/mem0ai/mem0",
    "summary": "Memory layer for LLM apps. Initially RAG; pivoted to agent-memory in 2024. Apache 2.0.",
    "candidate_bill": "Bill_2",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.85,
    "watchlist_tier": null,
    "notes": "Bill 2 \u2014 agent-memory descendant of RAG framework.",
    "_appeared_in_sweeps": [
      "sweep_1004_open_source_frameworks"
    ]
  },
  {
    "paper_id": "embedding-mismatch-2024",
    "title": "The Curse of Multi-Hop: When Embedding Distance \u2260 Reasoning Distance",
    "authors": [
      "Various"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": "arXiv:2404.02103",
    "url": "https://arxiv.org/abs/2404.02103",
    "summary": "Argues that embedding similarity correlates with surface lexical/topical similarity, not multi-hop reasoning relevance. Quantifies on MuSiQue.",
    "candidate_bill": "Bill_8",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.82,
    "watchlist_tier": null,
    "notes": "Bill 8 \u2014 fundamental retrieval-vs-reasoning mismatch.",
    "_appeared_in_sweeps": [
      "sweep_1008_failures"
    ]
  },
  {
    "paper_id": "evalrag-bench-2024",
    "title": "MultiHopRAG: A Dataset for Evaluating Retrieval-Augmented Generation across Documents",
    "authors": [
      "Yixuan Tang",
      "Yi Yang"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024-01-27",
    "venue": "arXiv:2401.15391",
    "url": "https://arxiv.org/abs/2401.15391",
    "summary": "Multi-hop RAG benchmark: 2-4 hop questions over financial news. RAG models struggle: avg 30-50% accuracy. Establishes held-out post-cutoff corpus.",
    "candidate_bill": "Bill_5",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.92,
    "watchlist_tier": null,
    "notes": "Bill 5\u2605.",
    "_appeared_in_sweeps": [
      "sweep_1003_contamination_heldout"
    ]
  },
  {
    "paper_id": "expertqa-2023",
    "title": "ExpertQA: Expert-Curated Questions and Attributed Answers",
    "authors": [
      "Chaitanya Malaviya",
      "Subin Lee",
      "Sihao Chen",
      "Elizabeth Sieber",
      "Mark Yatskar",
      "Dan Roth"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2023-09-14",
    "venue": "NAACL 2024; arXiv:2309.07852",
    "url": "https://arxiv.org/abs/2309.07852",
    "summary": "484 expert-written questions across 32 fields. Evals GPT-4/Bing/Galactica for accuracy + attribution. Experts judged ~50% answers factually correct, ~30% attribution fully supported.",
    "candidate_bill": "Bill_5",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.92,
    "watchlist_tier": null,
    "notes": "Bill 5\u2605 \u2014 expert-validated negative.",
    "_appeared_in_sweeps": [
      "sweep_1002_citation_faithfulness"
    ]
  },
  {
    "paper_id": "factool-2023",
    "title": "FacTool: Factuality Detection in Generative AI",
    "authors": [
      "I-Chun Chern",
      "Steffi Chern",
      "Shiqi Chen",
      "et al."
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2023-07-25",
    "venue": "arXiv:2307.13528",
    "url": "https://arxiv.org/abs/2307.13528",
    "summary": "Multi-task framework decomposing claims, querying tools (search, code interpreter), aggregating. Covers KBQA, math, science QA, code. Finding: GPT-4 factuality ~70% on knowledge tasks despite confident tone.",
    "candidate_bill": "Bill_8",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.9,
    "watchlist_tier": null,
    "notes": "Bill 8 \u2014 broadly cited factuality auditor.",
    "_appeared_in_sweeps": [
      "sweep_1002_citation_faithfulness"
    ]
  },
  {
    "paper_id": "factscore-2023",
    "title": "FActScore: Fine-grained Atomic Evaluation of Factual Precision",
    "authors": [
      "Sewon Min",
      "Kalpesh Krishna",
      "Xinxi Lyu",
      "et al."
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2023-05-23",
    "venue": "EMNLP 2023; arXiv:2305.14251",
    "url": "https://arxiv.org/abs/2305.14251",
    "summary": "Decompose into atomic claims, verify each against retrieved Wikipedia. ChatGPT FActScore ~58%, GPT-4 ~71% on bio-gen task. Open library.",
    "candidate_bill": "Bill_7",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.95,
    "watchlist_tier": null,
    "notes": "Bill 7 \u2014 open, widely adopted granular eval.",
    "_appeared_in_sweeps": [
      "sweep_1002_citation_faithfulness"
    ]
  },
  {
    "paper_id": "fanoutqa-2024",
    "title": "FanOutQA: A Multi-Hop, Multi-Document Question Answering Benchmark",
    "authors": [
      "Andrew Zhu",
      "Alyssa Hwang",
      "Liam Dugan",
      "Chris Callison-Burch"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024-02-21",
    "venue": "ACL 2024; arXiv:2402.14116",
    "url": "https://arxiv.org/abs/2402.14116",
    "summary": "1,034 fan-out questions ('list X for each Y'). Tests multi-document retrieval; held-out Wikipedia subset. RAG models recall ~50%.",
    "candidate_bill": "Bill_5",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.9,
    "watchlist_tier": null,
    "notes": "Bill 5\u2605.",
    "_appeared_in_sweeps": [
      "sweep_1003_contamination_heldout",
      "sweep_1005_multidoc_synthesis"
    ]
  },
  {
    "paper_id": "fastgpt-2023",
    "title": "FastGPT \u2014 open-source knowledge platform (Chinese)",
    "authors": [
      "Sealos / Labring"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2023",
    "venue": "GitHub labring/FastGPT",
    "url": "https://github.com/labring/FastGPT",
    "summary": "Chinese-origin open knowledge-base platform with RAG. ~17K stars. AGPL-3.0.",
    "candidate_bill": "Bill_7",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.85,
    "watchlist_tier": null,
    "notes": "B7 \u2014 AGPL open. Chinese open RAG platform.",
    "_appeared_in_sweeps": [
      "sweep_1006_chinese_rag_offerings"
    ]
  },
  {
    "paper_id": "fastrag-2023",
    "title": "fastRAG \u2014 Intel's open RAG framework",
    "authors": [
      "Intel Labs"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2023",
    "venue": "GitHub IntelLabs/fastRAG",
    "url": "https://github.com/IntelLabs/fastRAG",
    "summary": "Optimized for Intel CPU/Habana. Includes PLAID-ColBERT, quantized rerankers. Apache 2.0.",
    "candidate_bill": "Bill_2",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.83,
    "watchlist_tier": null,
    "notes": "Bill 2 \u2014 Intel reference RAG.",
    "_appeared_in_sweeps": [
      "sweep_1004_open_source_frameworks"
    ]
  },
  {
    "paper_id": "fid-2021",
    "title": "Leveraging Passage Retrieval with Generative Models for Open Domain QA (Fusion-in-Decoder)",
    "authors": [
      "Gautier Izacard",
      "Edouard Grave"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2020-07-02",
    "venue": "EACL 2021; arXiv:2007.01282",
    "url": "https://arxiv.org/abs/2007.01282",
    "summary": "Encode each passage separately then fuse in decoder. Foundational multi-doc synthesis architecture. Pre-LLM but conceptually durable.",
    "candidate_bill": "Bill_6",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.93,
    "watchlist_tier": null,
    "notes": "Bill 6 \u2014 FiD is canonical multi-passage method.",
    "_appeared_in_sweeps": [
      "sweep_1005_multidoc_synthesis"
    ]
  },
  {
    "paper_id": "fleek-2023",
    "title": "FLEEK: Factual Error Detection and Correction with Evidence",
    "authors": [
      "Farima Fatahi Bayat",
      "Kun Qian",
      "Benjamin Han",
      "et al."
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2023-10-26",
    "venue": "arXiv:2310.17119",
    "url": "https://arxiv.org/abs/2310.17119",
    "summary": "Decompose, retrieve evidence, classify support, suggest correction. Multi-stage pipeline. ~80% detection precision on FactCheck-Bench.",
    "candidate_bill": "Bill_6",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.85,
    "watchlist_tier": null,
    "notes": "Operational fact-checker.",
    "_appeared_in_sweeps": [
      "sweep_1002_citation_faithfulness"
    ]
  },
  {
    "paper_id": "freshllms-2023",
    "title": "FreshLLMs: Refreshing LLMs with Search Engine Augmentation",
    "authors": [
      "Tu Vu",
      "Mohit Iyyer",
      "Xuezhi Wang",
      "et al."
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2023-10-05",
    "venue": "arXiv:2310.03214",
    "url": "https://arxiv.org/abs/2310.03214",
    "summary": "Temporal QA: post-cutoff + fast-changing facts. FreshPrompt scaffold. Vanilla LLM ~30%, search-augmented ~70%. Bench updated quarterly.",
    "candidate_bill": "Bill_5",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.92,
    "watchlist_tier": null,
    "notes": "Bill 5\u2605 \u2014 rolling temporal benchmark.",
    "_appeared_in_sweeps": [
      "sweep_1005_multidoc_synthesis"
    ]
  },
  {
    "paper_id": "freshqa-2023",
    "title": "FreshLLMs: Refreshing Large Language Models with Search Engine Augmentation",
    "authors": [
      "Tu Vu",
      "Mohit Iyyer",
      "Xuezhi Wang",
      "et al."
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2023-10-05",
    "venue": "arXiv:2310.03214",
    "url": "https://arxiv.org/abs/2310.03214",
    "summary": "FreshQA: 600 Qs requiring post-training-cutoff info. FreshPrompt: search-augmented prompting. Establishes 'temporal contamination' baseline.",
    "candidate_bill": "Bill_4",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.9,
    "watchlist_tier": null,
    "notes": "Bill 4 \u2014 held-out by time.",
    "_appeared_in_sweeps": [
      "sweep_1003_contamination_heldout"
    ]
  },
  {
    "paper_id": "galileo-2024",
    "title": "Galileo Hallucination Index",
    "authors": [
      "Galileo Labs"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": "rungalileo.io",
    "url": "https://www.rungalileo.io/hallucinationindex",
    "summary": "Quarterly hallucination-rate ranking of 22 closed/open LLMs. As of 2024-11: Anthropic Claude-3.5 #1, GPT-4o close, open-Mistral midpack. Independent of vendors.",
    "candidate_bill": "Bill_7",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.88,
    "watchlist_tier": null,
    "notes": "Bill 7 \u2014 quarterly independent index.",
    "_appeared_in_sweeps": [
      "sweep_1007_independent_audits"
    ]
  },
  {
    "paper_id": "glean-enterprise-2024",
    "title": "Glean enterprise search + assistant",
    "authors": [
      "Glean"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": "Glean product",
    "url": "https://www.glean.com/product/assistant",
    "summary": "Enterprise RAG over corporate knowledge graph + 100+ connectors (Slack, Drive, Notion). Permissions-aware retrieval. Architecture not published; SOC2 docs are the only public technical disclosure.",
    "candidate_bill": "Bill_1",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.8,
    "watchlist_tier": null,
    "notes": "Enterprise vendor \u2014 secrecy is product.",
    "_appeared_in_sweeps": [
      "sweep_1001_vendor_cards"
    ]
  },
  {
    "paper_id": "google-notebooklm-2024",
    "title": "NotebookLM: Gemini-grounded research notebook",
    "authors": [
      "Google Labs"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024-06-01",
    "venue": "Google Labs blog / NotebookLM",
    "url": "https://blog.google/technology/ai/notebooklm-google-ai/",
    "summary": "Source-grounded LLM: every paragraph attributable to user-uploaded docs. Audio Overview hallucination rate low qualitatively; Vectara-style faithfulness eval not published. Closed retrieval; no API for chunking/rerank.",
    "candidate_bill": "Bill_1",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.85,
    "watchlist_tier": null,
    "notes": "Consumer surface; no architecture paper. Citation behavior anecdotally strong but not measured publicly.",
    "_appeared_in_sweeps": [
      "sweep_1001_vendor_cards"
    ]
  },
  {
    "paper_id": "google-text-embedding-005-2024",
    "title": "text-embedding-005",
    "authors": [
      "Google"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024-11-04",
    "venue": "Google Cloud docs",
    "url": "https://cloud.google.com/vertex-ai/generative-ai/docs/embeddings",
    "summary": "English embedder, Matryoshka 768 / 256 / 128. text-multilingual-embedding-002 for 100+ languages. Closed weights, API only.",
    "candidate_bill": "Bill_2",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.9,
    "watchlist_tier": null,
    "notes": "Mirrors OpenAI's closed posture.",
    "_appeared_in_sweeps": [
      "sweep_1001_vendor_cards"
    ]
  },
  {
    "paper_id": "google-vertex-rag-2024",
    "title": "Vertex AI RAG Engine",
    "authors": [
      "Google Cloud"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024-09-24",
    "venue": "Google Cloud docs",
    "url": "https://cloud.google.com/vertex-ai/generative-ai/docs/rag-overview",
    "summary": "Managed RAG with Gemini, configurable chunking, embeddings (text-embedding-005, text-multilingual-embedding-002). Grounding-with-Google-Search option. Citation surface via Gemini's grounding attribution.",
    "candidate_bill": "Bill_1",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.85,
    "watchlist_tier": null,
    "notes": "Public config options; eval numbers not published.",
    "_appeared_in_sweeps": [
      "sweep_1001_vendor_cards"
    ]
  },
  {
    "paper_id": "graphrag-2024",
    "title": "From Local to Global: A Graph RAG Approach to Query-Focused Summarization",
    "authors": [
      "Darren Edge",
      "Ha Trinh",
      "Newman Cheng",
      "et al."
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024-04-24",
    "venue": "arXiv:2404.16130",
    "url": "https://arxiv.org/abs/2404.16130",
    "summary": "Microsoft: LLM extracts entity-relationship graph, hierarchical Leiden community detection, community summaries. Strong on global / sense-making queries vs vanilla RAG. MIT open release.",
    "candidate_bill": "Bill_6",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.95,
    "watchlist_tier": null,
    "notes": "Bill 6 \u2014 graph-structured RAG. Microsoft open release notable.",
    "_appeared_in_sweeps": [
      "sweep_1004_open_source_frameworks"
    ]
  },
  {
    "paper_id": "hallu-leak-bench-2024",
    "title": "Confabulation: The Surprising Cost of Hallucinations in Large Language Models (UC Davis)",
    "authors": [
      "Wei Yin",
      "et al."
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": "arXiv:2405.10523",
    "url": "https://arxiv.org/abs/2405.10523",
    "summary": "Quantifies cost-per-hallucination in production RAG. Independent academic audit.",
    "candidate_bill": "Bill_8",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.78,
    "watchlist_tier": null,
    "notes": "Bill 8 \u2014 production-cost framing.",
    "_appeared_in_sweeps": [
      "sweep_1007_independent_audits"
    ]
  },
  {
    "paper_id": "hallucination-attribution-2024",
    "title": "Lessons from Building Hallucination Detectors for Production",
    "authors": [
      "Kaitlyn Kunstman",
      "Niels Bantilan",
      "Tilo Reneau-Cardoso",
      "et al."
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024-04-29",
    "venue": "arXiv:2404.17790",
    "url": "https://arxiv.org/abs/2404.17790",
    "summary": "Real-world deployment audit: 4 hallucination detectors tested on production traffic. Calibration drifts across domains, judge-LLM family matters more than detector method.",
    "candidate_bill": "Bill_8",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.85,
    "watchlist_tier": null,
    "notes": "Industry replication of academic detectors \u2014 generally negative on out-of-distribution.",
    "_appeared_in_sweeps": [
      "sweep_1002_citation_faithfulness"
    ]
  },
  {
    "paper_id": "halueval-2023",
    "title": "HaluEval: A Large-Scale Hallucination Evaluation Benchmark",
    "authors": [
      "Junyi Li",
      "Xiaoxue Cheng",
      "Wayne Xin Zhao",
      "Jian-Yun Nie",
      "Ji-Rong Wen"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2023-05-19",
    "venue": "EMNLP 2023; arXiv:2305.11747",
    "url": "https://arxiv.org/abs/2305.11747",
    "summary": "35K hallucinated samples for QA / dialogue / summarization. Reference hallucination-detection benchmark.",
    "candidate_bill": "Bill_5",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.92,
    "watchlist_tier": null,
    "notes": "Bill 5\u2605.",
    "_appeared_in_sweeps": [
      "sweep_1002_citation_faithfulness"
    ]
  },
  {
    "paper_id": "haystack-2020",
    "title": "Haystack \u2014 open-source NLP framework",
    "authors": [
      "deepset"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2020-04",
    "venue": "GitHub deepset-ai/haystack",
    "url": "https://github.com/deepset-ai/haystack",
    "summary": "European (Berlin) open RAG framework. Apache 2.0. Pipeline-based, strong on QA. Haystack 2.x rewrite 2024. Production-oriented.",
    "candidate_bill": "Bill_2",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.93,
    "watchlist_tier": null,
    "notes": "Bill 2 \u2014 predates LangChain. European open ecosystem reference.",
    "_appeared_in_sweeps": [
      "sweep_1004_open_source_frameworks"
    ]
  },
  {
    "paper_id": "helm-2022",
    "title": "Holistic Evaluation of Language Models (HELM)",
    "authors": [
      "Percy Liang",
      "Rishi Bommasani",
      "Tony Lee",
      "et al."
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2022-11-16",
    "venue": "TMLR 2023; arXiv:2211.09110",
    "url": "https://arxiv.org/abs/2211.09110",
    "summary": "Stanford CRFM HELM: ~30 LLMs across 16 scenarios with 7 metrics (accuracy, calibration, robustness, fairness, bias, toxicity, efficiency). Reference holistic eval; basis for HELM-Lite, HELM-RAG.",
    "candidate_bill": "Bill_5",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.97,
    "watchlist_tier": null,
    "notes": "Bill 5\u2605 \u2014 Stanford CRFM.",
    "_appeared_in_sweeps": [
      "sweep_1007_independent_audits"
    ]
  },
  {
    "paper_id": "helm-rag-2024",
    "title": "HELM extensions for retrieval-augmented systems",
    "authors": [
      "Stanford CRFM"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": "Stanford CRFM HELM site",
    "url": "https://crfm.stanford.edu/helm/",
    "summary": "HELM adds RAG scenarios: NaturalQuestions-Open, OpenBookQA. Tracks retrieval-augmented accuracy + calibration + bias. Independent. Limited coverage vs vendor-specific evals.",
    "candidate_bill": "Bill_5",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.88,
    "watchlist_tier": null,
    "notes": "Bill 5\u2605 \u2014 independent academic eval rig for RAG.",
    "_appeared_in_sweeps": [
      "sweep_1007_independent_audits"
    ]
  },
  {
    "paper_id": "hhem-vectara-2023",
    "title": "Vectara HHEM: Hughes Hallucination Evaluation Model",
    "authors": [
      "Vectara",
      "Hughes"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2023-11",
    "venue": "Vectara",
    "url": "https://huggingface.co/vectara/hallucination_evaluation_model",
    "summary": "Open NLI-based hallucination detector trained on summary-fact pairs. Powers Vectara Hallucination Leaderboard. Apache 2.0. Reference open detector.",
    "candidate_bill": "Bill_7",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.92,
    "watchlist_tier": null,
    "notes": "Bill 7 \u2014 operational leaderboard.",
    "_appeared_in_sweeps": [
      "sweep_1002_citation_faithfulness"
    ]
  },
  {
    "paper_id": "hotpotqa-2018",
    "title": "HotpotQA: A Dataset for Diverse, Explainable Multi-hop Question Answering",
    "authors": [
      "Zhilin Yang",
      "Peng Qi",
      "Saizheng Zhang",
      "et al."
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2018-09-25",
    "venue": "EMNLP 2018; arXiv:1809.09600",
    "url": "https://arxiv.org/abs/1809.09600",
    "summary": "112K Wikipedia 2-hop questions with supporting-fact annotations. Foundation multihop benchmark. Critique: many questions answerable by single-hop shortcuts.",
    "candidate_bill": "Bill_5",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.97,
    "watchlist_tier": null,
    "notes": "Bill 5\u2605 \u2014 most-cited multihop benchmark; later critiqued for single-hop leakage.",
    "_appeared_in_sweeps": [
      "sweep_1005_multidoc_synthesis"
    ]
  },
  {
    "paper_id": "huggingface-rag-leaderboard-2024",
    "title": "HuggingFace MTEB Leaderboard",
    "authors": [
      "HuggingFace"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": "HuggingFace Spaces",
    "url": "https://huggingface.co/spaces/mteb/leaderboard",
    "summary": "Public MTEB leaderboard. Self-submitted; spot-audits. ~700 models by 2025.",
    "candidate_bill": "Bill_7",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.87,
    "watchlist_tier": null,
    "notes": "Bill 7 \u2014 self-reported with light verification.",
    "_appeared_in_sweeps": [
      "sweep_1007_independent_audits"
    ]
  },
  {
    "paper_id": "inspect-ai-2024",
    "title": "Inspect: An open-source framework for LLM evaluation (UK AISI)",
    "authors": [
      "UK AISI"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": "GitHub UKGovernmentBEIS/inspect_ai",
    "url": "https://github.com/UKGovernmentBEIS/inspect_ai",
    "summary": "AISI's open eval framework. MIT. Supports agentic / tool-use evals incl. retrieval. Adopted by US AISI counterpart.",
    "candidate_bill": "Bill_7",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.95,
    "watchlist_tier": null,
    "notes": "Bill 7 \u2014 open eval infra from gov body.",
    "_appeared_in_sweeps": [
      "sweep_1007_independent_audits"
    ]
  },
  {
    "paper_id": "instructor-2023",
    "title": "Instructor \u2014 structured outputs for LLMs",
    "authors": [
      "Jason Liu"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2023",
    "venue": "GitHub jxnl/instructor",
    "url": "https://github.com/jxnl/instructor",
    "summary": "Pydantic-typed LLM outputs. Often paired with RAG to enforce schema in answer. MIT. ~7K stars.",
    "candidate_bill": "Bill_2",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.82,
    "watchlist_tier": null,
    "notes": "Bill 2 \u2014 structured-output companion.",
    "_appeared_in_sweeps": [
      "sweep_1004_open_source_frameworks"
    ]
  },
  {
    "paper_id": "internlm-2024",
    "title": "InternLM2.5 RAG-tuned models",
    "authors": [
      "Shanghai AI Lab"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": "GitHub InternLM/InternLM",
    "url": "https://github.com/InternLM/InternLM",
    "summary": "Shanghai AI Lab open weights (Apache 2.0). InternLM-Reranker, retrieval-aware fine-tunes. Lagrangian-PI style tool use.",
    "candidate_bill": "Bill_7",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.88,
    "watchlist_tier": null,
    "notes": "B7 \u2014 open. PATTERN HOLDS.",
    "_appeared_in_sweeps": [
      "sweep_1006_chinese_rag_offerings"
    ]
  },
  {
    "paper_id": "ircot-2023",
    "title": "Interleaving Retrieval with Chain-of-Thought Reasoning (IRCoT)",
    "authors": [
      "Harsh Trivedi",
      "Niranjan Balasubramanian",
      "Tushar Khot",
      "Ashish Sabharwal"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2022-12-20",
    "venue": "ACL 2023; arXiv:2212.10509",
    "url": "https://arxiv.org/abs/2212.10509",
    "summary": "CoT step \u2192 retrieve \u2192 CoT step \u2192 ... Multi-hop F1 +21 on HotpotQA over single-step retrieve.",
    "candidate_bill": "Bill_6",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.92,
    "watchlist_tier": null,
    "notes": "Bill 6 \u2014 interleave CoT and retrieve.",
    "_appeared_in_sweeps": [
      "sweep_1005_multidoc_synthesis"
    ]
  },
  {
    "paper_id": "iterater-rag-2024",
    "title": "Iter-RetGen: Enhancing Retrieval-Augmented LLMs with Iterative Retrieval-Generation Synergy",
    "authors": [
      "Zhihong Shao",
      "Yeyun Gong",
      "Yelong Shen",
      "Minlie Huang",
      "Nan Duan",
      "Weizhu Chen"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2023-05-24",
    "venue": "EMNLP 2023; arXiv:2305.15294",
    "url": "https://arxiv.org/abs/2305.15294",
    "summary": "Iterative retrieve-generate-retrieve. Boosts HotpotQA +4pts, MuSiQue +2pts. Argues single-pass RAG insufficient for multi-hop.",
    "candidate_bill": "Bill_6",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.9,
    "watchlist_tier": null,
    "notes": "Bill 6 \u2014 iterative retrieval method.",
    "_appeared_in_sweeps": [
      "sweep_1005_multidoc_synthesis"
    ]
  },
  {
    "paper_id": "jina-embeddings-v3-2024",
    "title": "Jina Embeddings v3: Multilingual Multitask Embeddings",
    "authors": [
      "Saba Sturua",
      "Isabelle Mohr",
      "Mohammad Kalim Akram",
      "et al."
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024-09-16",
    "venue": "arXiv:2409.10173",
    "url": "https://arxiv.org/abs/2409.10173",
    "summary": "570M params, 89 languages, 8192 context, task-LoRA adapters (retrieval, separation, classification, sts). Open weights, eval transparent on MTEB.",
    "candidate_bill": "Bill_2",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.95,
    "watchlist_tier": null,
    "notes": "Open European/Berlin lab.",
    "_appeared_in_sweeps": [
      "sweep_1001_vendor_cards"
    ]
  },
  {
    "paper_id": "jina-reranker-v2-2024",
    "title": "Jina Reranker v2 Multilingual",
    "authors": [
      "Jina AI"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024-06-25",
    "venue": "Jina blog",
    "url": "https://jina.ai/news/jina-reranker-v2-for-agentic-rag-ultra-fast-multilingual-function-calling-and-code-search/",
    "summary": "278M params reranker; multilingual, function-call-aware, code-aware. Apache 2.0. ~6x faster than v1.",
    "candidate_bill": "Bill_2",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.93,
    "watchlist_tier": null,
    "notes": "Companion to v3 embeddings; both open.",
    "_appeared_in_sweeps": [
      "sweep_1001_vendor_cards"
    ]
  },
  {
    "paper_id": "kilt-2020",
    "title": "KILT: a Benchmark for Knowledge Intensive Language Tasks",
    "authors": [
      "Fabio Petroni",
      "Aleksandra Piktus",
      "Angela Fan",
      "et al."
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2020-09-04",
    "venue": "NAACL 2021; arXiv:2009.02252",
    "url": "https://arxiv.org/abs/2009.02252",
    "summary": "Unified eval over Wikipedia: NQ, TriviaQA, HotpotQA, FEVER, etc. Single snapshot. Foundation reference for RAG evaluation.",
    "candidate_bill": "Bill_5",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.93,
    "watchlist_tier": null,
    "notes": "Bill 5\u2605.",
    "_appeared_in_sweeps": [
      "sweep_1005_multidoc_synthesis"
    ]
  },
  {
    "paper_id": "kimi-moonshot-2024",
    "title": "Kimi (Moonshot AI) long-context + web search",
    "authors": [
      "Moonshot AI"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": "Kimi.ai",
    "url": "https://kimi.moonshot.cn/",
    "summary": "200K-2M context. Web search agent. Closed weights, closed retrieval. PDF/Doc upload as RAG path. Strong consumer brand in China.",
    "candidate_bill": "Bill_7",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.92,
    "watchlist_tier": null,
    "notes": "B7 \u2014 fully closed (Chinese 'OpenAI clone' posture). PATTERN FLIPS hard.",
    "_appeared_in_sweeps": [
      "sweep_1006_chinese_rag_offerings"
    ]
  },
  {
    "paper_id": "langchain-2022",
    "title": "LangChain \u2014 Open framework for LLM applications",
    "authors": [
      "Harrison Chase"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2022-10-17",
    "venue": "GitHub langchain-ai/langchain",
    "url": "https://github.com/langchain-ai/langchain",
    "summary": "Most-starred LLM framework (~100K stars by 2024). Modules: LLMs, vector stores, retrievers, chains, agents. MIT. Reference 'glue layer' for RAG. Criticized for complexity / abstraction churn.",
    "candidate_bill": "Bill_2",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.97,
    "watchlist_tier": null,
    "notes": "Bill 2 \u2014 dominant open framework. Bill 12 (commodification) \u2014 turned RAG into pip install.",
    "_appeared_in_sweeps": [
      "sweep_1004_open_source_frameworks"
    ]
  },
  {
    "paper_id": "leak-detection-rouge-2024",
    "title": "Time Travel in LLMs: Tracing Data Contamination in Large Language Models",
    "authors": [
      "Shahriar Golchin",
      "Mihai Surdeanu"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2023-08-16",
    "venue": "ICLR 2024; arXiv:2308.08493",
    "url": "https://arxiv.org/abs/2308.08493",
    "summary": "Guided-instruction prompts to detect dataset contamination ('did you see this exact passage?'). Black-box auditor. Applied to many benchmarks.",
    "candidate_bill": "Bill_4",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.88,
    "watchlist_tier": null,
    "notes": "Bill 4 \u2014 black-box audit method.",
    "_appeared_in_sweeps": [
      "sweep_1003_contamination_heldout"
    ]
  },
  {
    "paper_id": "leaked-test-bm25-2024",
    "title": "What's In My Big Data?",
    "authors": [
      "Yanai Elazar",
      "Akshita Bhagia",
      "Ian Magnusson",
      "et al."
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2023-10-31",
    "venue": "ICLR 2024; arXiv:2310.20707",
    "url": "https://arxiv.org/abs/2310.20707",
    "summary": "Audits pretraining corpora (C4, RedPajama, Pile, RefinedWeb, Dolma). Detects test-set leakage for many benchmarks (HellaSwag, BoolQ, etc.). Open infra: WIMBD.",
    "candidate_bill": "Bill_4",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.92,
    "watchlist_tier": null,
    "notes": "Bill 4 \u2014 provides infrastructure to detect leakage; cited extensively in RAG-eval contamination work.",
    "_appeared_in_sweeps": [
      "sweep_1003_contamination_heldout"
    ]
  },
  {
    "paper_id": "letta-memgpt-2023",
    "title": "MemGPT / Letta: Towards LLMs as Operating Systems",
    "authors": [
      "Charles Packer",
      "Sarah Wooders",
      "Kevin Lin",
      "et al."
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2023-10-12",
    "venue": "arXiv:2310.08560",
    "url": "https://arxiv.org/abs/2310.08560",
    "summary": "OS-inspired memory: main context (RAM), archival storage (disk), virtual context management. Open code; commercialized as Letta. Reference 'agent memory' system.",
    "candidate_bill": "Bill_6",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.92,
    "watchlist_tier": null,
    "notes": "Bill 6 \u2014 agent memory architecture. Renamed Letta 2024.",
    "_appeared_in_sweeps": [
      "sweep_1004_open_source_frameworks"
    ]
  },
  {
    "paper_id": "lightrag-2024",
    "title": "LightRAG: Simple and Fast Retrieval-Augmented Generation",
    "authors": [
      "Zirui Guo",
      "Lianghao Xia",
      "Yanhua Yu",
      "Tu Ziqi",
      "Chao Huang"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024-10-08",
    "venue": "arXiv:2410.05779",
    "url": "https://arxiv.org/abs/2410.05779",
    "summary": "HK-PolyU/HKU: dual-level KG retrieval (local entities + global community). Outperforms GraphRAG with lower cost. MIT.",
    "candidate_bill": "Bill_6",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.9,
    "watchlist_tier": null,
    "notes": "Bill 6 \u2014 Chinese-origin open RAG framework. Cross-references sweep 1006 bridge test.",
    "_appeared_in_sweeps": [
      "sweep_1004_open_source_frameworks"
    ]
  },
  {
    "paper_id": "llamaindex-2022",
    "title": "LlamaIndex (formerly GPT Index)",
    "authors": [
      "Jerry Liu"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2022-11-09",
    "venue": "GitHub run-llama/llama_index",
    "url": "https://github.com/run-llama/llama_index",
    "summary": "Data-framework for LLM apps. ~35K stars by 2025. Specializes in document parsers (PDFs, tables, structured). MIT. LlamaParse (commercial) for hard PDFs. Reference data-loading layer.",
    "candidate_bill": "Bill_2",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.97,
    "watchlist_tier": null,
    "notes": "Bill 2 \u2014 second dominant open framework.",
    "_appeared_in_sweeps": [
      "sweep_1004_open_source_frameworks"
    ]
  },
  {
    "paper_id": "llamaparse-2024",
    "title": "LlamaParse \u2014 commercial PDF parser by LlamaIndex",
    "authors": [
      "LlamaIndex"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": "LlamaIndex docs",
    "url": "https://www.llamaindex.ai/llamaparse",
    "summary": "Commercial table/PDF parsing service. Free tier + paid. Not open; raises mixed-model concern (open framework + closed parser).",
    "candidate_bill": "Bill_12",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.88,
    "watchlist_tier": null,
    "notes": "Bill 12 \u2014 commodification: parsing as paid service even atop open framework.",
    "_appeared_in_sweeps": [
      "sweep_1004_open_source_frameworks"
    ]
  },
  {
    "paper_id": "llm-judge-bias-2024",
    "title": "Style Outweighs Substance: Failure Modes of LLM Judges in Alignment Benchmarking",
    "authors": [
      "Juyeon Heo",
      "Christina Heinze-Deml",
      "Oussama Elachqar",
      "et al."
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024-09-23",
    "venue": "arXiv:2409.15268",
    "url": "https://arxiv.org/abs/2409.15268",
    "summary": "LLM judges prefer stylistic markers (length, confidence-tone) over correctness. Calibration of RAGAS-like judges directly affected.",
    "candidate_bill": "Bill_8",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.85,
    "watchlist_tier": null,
    "notes": "Bill 8 \u2014 independent judge-failure paper.",
    "_appeared_in_sweeps": [
      "sweep_1007_independent_audits"
    ]
  },
  {
    "paper_id": "lmsys-arena-2024",
    "title": "LMSYS Chatbot Arena",
    "authors": [
      "Wei-Lin Chiang",
      "Lianmin Zheng",
      "Ying Sheng",
      "et al."
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024-03",
    "venue": "arXiv:2403.04132",
    "url": "https://arxiv.org/abs/2403.04132",
    "summary": "Human-vote Elo arena. RAG-mode subleaderboards in 2024. Independent of vendors.",
    "candidate_bill": "Bill_7",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.93,
    "watchlist_tier": null,
    "notes": "Bill 7 \u2014 human-judged independent arena.",
    "_appeared_in_sweeps": [
      "sweep_1007_independent_audits"
    ]
  },
  {
    "paper_id": "long-context-vs-rag-fail-2024",
    "title": "RAG vs Long Context: A Survey of Pros and Cons",
    "authors": [
      "Various"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": "arXiv:2401.11944",
    "url": "https://arxiv.org/abs/2401.11944",
    "summary": "Survey of RAG vs long-context tradeoffs. Long-context: latency, cost; RAG: stale, partial. Neither dominates; many tasks need hybrid.",
    "candidate_bill": "Bill_8",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.83,
    "watchlist_tier": null,
    "notes": "Bill 8 \u2014 neutral analysis of failure modes.",
    "_appeared_in_sweeps": [
      "sweep_1008_failures"
    ]
  },
  {
    "paper_id": "longeval-rag-2024",
    "title": "L-Eval: Instituting Standardized Evaluation for Long Context Language Models",
    "authors": [
      "Chenxin An",
      "Shansan Gong",
      "Ming Zhong",
      "et al."
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2023-07-11",
    "venue": "ACL 2024; arXiv:2307.11088",
    "url": "https://arxiv.org/abs/2307.11088",
    "summary": "Long-context QA suite; RAG vs long-context controversy. Finding: long context CAN match RAG on small corpora, but RAG wins on noisy / large / multi-hop.",
    "candidate_bill": "Bill_6",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.88,
    "watchlist_tier": null,
    "notes": "Frames RAG as compression-via-retrieval.",
    "_appeared_in_sweeps": [
      "sweep_1002_citation_faithfulness"
    ]
  },
  {
    "paper_id": "longfact-2024",
    "title": "Long-form Factuality in Large Language Models (LongFact / SAFE)",
    "authors": [
      "Jerry Wei",
      "Chengrun Yang",
      "Xinying Song",
      "et al."
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024-03-27",
    "venue": "arXiv:2403.18802",
    "url": "https://arxiv.org/abs/2403.18802",
    "summary": "DeepMind: 38-topic benchmark, 2280 prompts. SAFE = decompose + Google-search + judge. Finds Gemini-Ultra > Claude-2 > GPT-4 on F1@K; agreement with humans 72%.",
    "candidate_bill": "Bill_5",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.94,
    "watchlist_tier": null,
    "notes": "Bill 5\u2605 \u2014 DeepMind open eval rig.",
    "_appeared_in_sweeps": [
      "sweep_1002_citation_faithfulness"
    ]
  },
  {
    "paper_id": "lost-in-the-middle-2024",
    "title": "Lost in the Middle: How Language Models Use Long Contexts",
    "authors": [
      "Nelson F. Liu",
      "Kevin Lin",
      "John Hewitt",
      "Ashwin Paranjape",
      "Michele Bevilacqua",
      "Fabio Petroni",
      "Percy Liang"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2023-07-06",
    "venue": "TACL 2024; arXiv:2307.03172",
    "url": "https://arxiv.org/abs/2307.03172",
    "summary": "Multi-doc QA: 20 documents, target in position 1-20. Accuracy U-shaped \u2014 drops 20+pts when target in middle vs ends. Holds for GPT-3.5-Turbo-16K, Claude-1.3-100K, MPT-30B-Instruct. THE landmark RAG context-position negative result.",
    "candidate_bill": "Bill_8",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.99,
    "watchlist_tier": null,
    "notes": "Bill 8\u2605\u2605 \u2014 most-cited 'where RAG breaks' paper. Liu 2024 reference for this sweep.",
    "_appeared_in_sweeps": [
      "sweep_1008_failures"
    ]
  },
  {
    "paper_id": "lost-in-the-middle-attrib-2024",
    "title": "Citation-Enhanced Generation for LLM-based Chatbots",
    "authors": [
      "Weitao Li",
      "Junkai Li",
      "Weizhi Ma",
      "Yang Liu"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024-02-25",
    "venue": "arXiv:2402.16063",
    "url": "https://arxiv.org/abs/2402.16063",
    "summary": "Post-hoc citation injection via NLI verifier on retrieved corpus. Argues stronger faithfulness than train-time alignment. Citation precision +30% over Self-RAG on chat tasks.",
    "candidate_bill": "Bill_6",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.82,
    "watchlist_tier": null,
    "notes": "Counter-evidence: post-hoc CAN work if verifier is strong.",
    "_appeared_in_sweeps": [
      "sweep_1002_citation_faithfulness"
    ]
  },
  {
    "paper_id": "lost-in-the-middle-multihop-2023",
    "title": "Lost in the Middle: How Language Models Use Long Contexts",
    "authors": [
      "Nelson F. Liu",
      "Kevin Lin",
      "John Hewitt",
      "Ashwin Paranjape",
      "Michele Bevilacqua",
      "Fabio Petroni",
      "Percy Liang"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2023-07-06",
    "venue": "TACL 2024; arXiv:2307.03172",
    "url": "https://arxiv.org/abs/2307.03172",
    "summary": "U-shaped performance on long context (early + late > middle). When relevant doc is in the middle of 20-doc context, accuracy drops 20+pts. Foundational 'context-position' critique of RAG.",
    "candidate_bill": "Bill_8",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.98,
    "watchlist_tier": null,
    "notes": "Bill 8 \u2014 Liu 2024 'lost-in-the-middle'. Cross-references sweep 1008 critique.",
    "_appeared_in_sweeps": [
      "sweep_1005_multidoc_synthesis"
    ]
  },
  {
    "paper_id": "marqo-2022",
    "title": "Marqo: Tensor-based search engine",
    "authors": [
      "Marqo"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2022",
    "venue": "GitHub marqo-ai/marqo",
    "url": "https://github.com/marqo-ai/marqo",
    "summary": "Multi-modal search (image + text). Open source + cloud. Apache 2.0. Specialty: tensor-based scoring + CLIP-like models built in.",
    "candidate_bill": "Bill_2",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.85,
    "watchlist_tier": null,
    "notes": "Bill 2 \u2014 multimodal-native.",
    "_appeared_in_sweeps": [
      "sweep_1004_open_source_frameworks"
    ]
  },
  {
    "paper_id": "metr-rag-2024",
    "title": "METR Evaluations of Frontier AI Models",
    "authors": [
      "METR (Model Evaluation and Threat Research)"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": "METR public reports",
    "url": "https://metr.org/",
    "summary": "METR (formerly ARC Evals) runs agentic-capability evals contracted by OpenAI / Anthropic / Google. Retrieval-tool-use is a sub-capability. Reports public-summary only.",
    "candidate_bill": "Bill_7",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.85,
    "watchlist_tier": null,
    "notes": "Bill 7 \u2014 independent third-party safety evaluator.",
    "_appeared_in_sweeps": [
      "sweep_1007_independent_audits"
    ]
  },
  {
    "paper_id": "minicpm-rag-2024",
    "title": "MiniCPM-V + RAG capabilities (ModelBest / OpenBMB)",
    "authors": [
      "OpenBMB"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": "GitHub OpenBMB/MiniCPM",
    "url": "https://github.com/OpenBMB/MiniCPM",
    "summary": "Small-model (2-8B) high-performance line incl. multimodal RAG. Apache 2.0. Cited as compact open alternative.",
    "candidate_bill": "Bill_7",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.88,
    "watchlist_tier": null,
    "notes": "B7 \u2014 open. PATTERN HOLDS.",
    "_appeared_in_sweeps": [
      "sweep_1006_chinese_rag_offerings"
    ]
  },
  {
    "paper_id": "minimax-2024",
    "title": "MiniMax abab + Hailuo with retrieval",
    "authors": [
      "MiniMax"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": "MiniMax API",
    "url": "https://www.minimaxi.com/en",
    "summary": "abab-7-chat / abab-6.5. Long-context. Closed. Retrieval via assistants API. Released Hailuo video model 2024.",
    "candidate_bill": "Bill_7",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.85,
    "watchlist_tier": null,
    "notes": "B7 \u2014 closed commercial.",
    "_appeared_in_sweeps": [
      "sweep_1006_chinese_rag_offerings"
    ]
  },
  {
    "paper_id": "miracl-2023",
    "title": "MIRACL: Multilingual Information Retrieval Across a Continuum of Languages",
    "authors": [
      "Xinyu Zhang",
      "Nandan Thakur",
      "Odunayo Ogundepo",
      "et al."
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2022-10-19",
    "venue": "TACL 2023; arXiv:2210.09984",
    "url": "https://arxiv.org/abs/2210.09984",
    "summary": "18 languages, Wikipedia-based. Standard multilingual IR benchmark. Concern: Wikipedia widely memorized; assess transfer via low-resource subsets.",
    "candidate_bill": "Bill_5",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.92,
    "watchlist_tier": null,
    "notes": "Bill 5\u2605 \u2014 but Wikipedia-derived.",
    "_appeared_in_sweeps": [
      "sweep_1003_contamination_heldout"
    ]
  },
  {
    "paper_id": "mixedbread-mxbai-embed-2024",
    "title": "mxbai-embed-large-v1",
    "authors": [
      "Mixedbread AI"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024-03-07",
    "venue": "Mixedbread / HuggingFace",
    "url": "https://www.mixedbread.ai/blog/mxbai-embed-large-v1",
    "summary": "Apache 2.0, 335M params. MTEB SOTA at release among open models. Matryoshka representation learning for variable dims. Training data and code released.",
    "candidate_bill": "Bill_2",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.95,
    "watchlist_tier": null,
    "notes": "Bill 2 \u2014 fully open European competitor.",
    "_appeared_in_sweeps": [
      "sweep_1001_vendor_cards"
    ]
  },
  {
    "paper_id": "mmteb-2025",
    "title": "MMTEB: Massive Multilingual Text Embedding Benchmark",
    "authors": [
      "Kenneth Enevoldsen",
      "Isaac Chung",
      "Imene Kerboua",
      "et al."
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2025-02-19",
    "venue": "arXiv:2502.13595",
    "url": "https://arxiv.org/abs/2502.13595",
    "summary": "500+ evaluation tasks across 250+ languages. Largest multilingual eval. Surfaces severe under-resourced-language gaps. Argues 'rolling' eval needed to prevent contamination saturation.",
    "candidate_bill": "Bill_7",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.92,
    "watchlist_tier": null,
    "notes": "Bill 7 \u2014 argues for rolling refresh, citing MTEB saturation.",
    "_appeared_in_sweeps": [
      "sweep_1003_contamination_heldout"
    ]
  },
  {
    "paper_id": "mongodb-atlas-vector-2024",
    "title": "MongoDB Atlas Vector Search",
    "authors": [
      "MongoDB"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024-06-26",
    "venue": "MongoDB docs",
    "url": "https://www.mongodb.com/products/platform/atlas-vector-search",
    "summary": "HNSW index in Atlas. Integrated with document DB so retrieval can combine vector + filter + lexical. Architecture published in MongoDB engineering blog; embedding model brought-your-own.",
    "candidate_bill": "Bill_1",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.88,
    "watchlist_tier": null,
    "notes": "DB layer disclosed; user supplies embeddings.",
    "_appeared_in_sweeps": [
      "sweep_1001_vendor_cards"
    ]
  },
  {
    "paper_id": "morphik-2024",
    "title": "Morphik multimodal RAG over PDFs",
    "authors": [
      "Morphik"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": "Morphik product",
    "url": "https://morphik.ai/",
    "summary": "ColPali-based RAG: page-level visual retrieval; argues OCR loses layout. Open core, commercial cloud. Cites ColPali, ColQwen2 in docs.",
    "candidate_bill": "Bill_2",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.78,
    "watchlist_tier": null,
    "notes": "Newer entrant, multimodal-first.",
    "_appeared_in_sweeps": [
      "sweep_1001_vendor_cards"
    ]
  },
  {
    "paper_id": "ms-marco-leak-2024",
    "title": "Trained on a Document with Errors: Measuring Long-Context Performance with NaturalQuestions",
    "authors": [
      "Tianle Cai",
      "Yuhan Li",
      "Zhouhan Lin",
      "et al."
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024-04-26",
    "venue": "arXiv:2404.16768",
    "url": "https://arxiv.org/abs/2404.16768",
    "summary": "Audit shows MS MARCO and Natural Questions both appear in pretraining corpora of top LLMs. Models can answer queries without retrieval \u2014 confounds RAG eval.",
    "candidate_bill": "Bill_4",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.92,
    "watchlist_tier": null,
    "notes": "Bill 4 \u2014 direct contamination evidence.",
    "_appeared_in_sweeps": [
      "sweep_1003_contamination_heldout"
    ]
  },
  {
    "paper_id": "mteb-2022",
    "title": "MTEB: Massive Text Embedding Benchmark",
    "authors": [
      "Niklas Muennighoff",
      "Nouamane Tazi",
      "Lo\u00efc Magne",
      "Nils Reimers"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2022-10-13",
    "venue": "EACL 2023; arXiv:2210.07316",
    "url": "https://arxiv.org/abs/2210.07316",
    "summary": "58 datasets, 8 tasks, 112 languages by 2024. Includes BEIR. Public leaderboard. Concerns: rapid optimization led to leaderboard gaming.",
    "candidate_bill": "Bill_5",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.97,
    "watchlist_tier": null,
    "notes": "Bill 5\u2605. Now ~ 700+ models on leaderboard.",
    "_appeared_in_sweeps": [
      "sweep_1003_contamination_heldout"
    ]
  },
  {
    "paper_id": "mteb-french-2024",
    "title": "MTEB-French: Resources for French Sentence Embedding Evaluation",
    "authors": [
      "Mathieu Ciancone",
      "Imene Kerboua",
      "Marion Schaeffer",
      "et al."
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024-05-30",
    "venue": "arXiv:2405.20468",
    "url": "https://arxiv.org/abs/2405.20468",
    "summary": "26 French tasks. Surfaces language-specific gaps: top English embedders not top French. Standardizes non-English MTEB evaluation.",
    "candidate_bill": "Bill_7",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.88,
    "watchlist_tier": null,
    "notes": "Bill 7 \u2014 independent multilingual eval.",
    "_appeared_in_sweeps": [
      "sweep_1003_contamination_heldout"
    ]
  },
  {
    "paper_id": "mteb-plus-2024",
    "title": "MTEB-Plus / Air-Bench: Domain-Specific Retrieval Benchmark",
    "authors": [
      "BAAI"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024-12-21",
    "venue": "arXiv:2412.13102",
    "url": "https://arxiv.org/abs/2412.13102",
    "summary": "BAAI Air-Bench: rolling, synthetic, domain-stratified retrieval bench to escape MTEB saturation. 12 domains, ~1500 queries per domain. Generated post-MTEB cutoff.",
    "candidate_bill": "Bill_7",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.9,
    "watchlist_tier": null,
    "notes": "Bill 7 \u2014 Chinese open lab building post-MTEB rolling eval.",
    "_appeared_in_sweeps": [
      "sweep_1007_independent_audits"
    ]
  },
  {
    "paper_id": "multidoc-summ-2024",
    "title": "Benchmarking Generation and Evaluation Capabilities of Large Language Models for Instruction-Controllable Multi-Document Summarization",
    "authors": [
      "Yixin Liu",
      "Alexander R. Fabbri",
      "Jiawen Chen",
      "et al."
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024-05",
    "venue": "ACL 2024; arXiv:2401.16475",
    "url": "https://arxiv.org/abs/2401.16475",
    "summary": "Multi-doc summarization with instructions. RAG-like setup. GPT-4 reasonable on faithfulness; struggle with conflict resolution across sources.",
    "candidate_bill": "Bill_5",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.83,
    "watchlist_tier": null,
    "notes": "Bill 5\u2605.",
    "_appeared_in_sweeps": [
      "sweep_1005_multidoc_synthesis"
    ]
  },
  {
    "paper_id": "multihoprag-2024",
    "title": "MultiHop-RAG: Benchmarking RAG for Multi-Hop Queries",
    "authors": [
      "Yixuan Tang",
      "Yi Yang"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024-01-27",
    "venue": "arXiv:2401.15391",
    "url": "https://arxiv.org/abs/2401.15391",
    "summary": "2,556 financial-news Qs spanning 4 reasoning types (inference, comparison, temporal, null). Vanilla RAG ~40% even with GPT-4. Held-out post-cutoff.",
    "candidate_bill": "Bill_5",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.93,
    "watchlist_tier": null,
    "notes": "Bill 5\u2605 \u2014 RAG-specific multi-hop benchmark.",
    "_appeared_in_sweeps": [
      "sweep_1005_multidoc_synthesis"
    ]
  },
  {
    "paper_id": "musique-2022",
    "title": "MuSiQue: Multihop Questions via Single-hop Question Composition",
    "authors": [
      "Harsh Trivedi",
      "Niranjan Balasubramanian",
      "Tushar Khot",
      "Ashish Sabharwal"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2021-08-02",
    "venue": "TACL 2022; arXiv:2108.00573",
    "url": "https://arxiv.org/abs/2108.00573",
    "summary": "Composed from single-hop Qs; adversarial filtering removes shortcuts. RAG models on 4-hop subset: ~35% F1. Definitive 'RAG fails multi-hop' benchmark.",
    "candidate_bill": "Bill_5",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.95,
    "watchlist_tier": null,
    "notes": "Bill 5\u2605 \u2014 shortcut-resistant multihop.",
    "_appeared_in_sweeps": [
      "sweep_1005_multidoc_synthesis"
    ]
  },
  {
    "paper_id": "musique-eval-2022",
    "title": "MuSiQue: Multihop Questions via Single-hop Question Composition",
    "authors": [
      "Harsh Trivedi",
      "Niranjan Balasubramanian",
      "Tushar Khot",
      "Ashish Sabharwal"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2021-08-02",
    "venue": "TACL 2022; arXiv:2108.00573",
    "url": "https://arxiv.org/abs/2108.00573",
    "summary": "Multihop QA dataset. Adversarial dataset construction \u2014 single-hop shortcut prevention. RAG models historically score <40% on 4-hop. Reference 'where RAG breaks' benchmark.",
    "candidate_bill": "Bill_5",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.93,
    "watchlist_tier": null,
    "notes": "Bill 5\u2605.",
    "_appeared_in_sweeps": [
      "sweep_1002_citation_faithfulness"
    ]
  },
  {
    "paper_id": "needle-haystack-2024",
    "title": "Needle In A Haystack benchmark (Kamradt)",
    "authors": [
      "Greg Kamradt",
      "et al."
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2023-11",
    "venue": "GitHub gkamradt/LLMTest_NeedleInAHaystack",
    "url": "https://github.com/gkamradt/LLMTest_NeedleInAHaystack",
    "summary": "Test: insert random fact in long context; ask about it. Standard long-context eval. RAG vs long-context comparison surfaces 'lost-in-the-middle'.",
    "candidate_bill": "Bill_7",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.92,
    "watchlist_tier": null,
    "notes": "Bill 7 \u2014 community open eval.",
    "_appeared_in_sweeps": [
      "sweep_1007_independent_audits"
    ]
  },
  {
    "paper_id": "negation-rag-2024",
    "title": "Why LLMs Are Bad at Negation: An Investigation",
    "authors": [
      "Mateusz Truszczy\u0144ski",
      "Various"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": "arXiv",
    "url": "https://arxiv.org/abs/2402.04566",
    "summary": "Retrieval can return passages containing positive form of fact when query asks for negation; models then assert positive. RAG retrieval mode-collapses to surface lexical match.",
    "candidate_bill": "Bill_8",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.8,
    "watchlist_tier": null,
    "notes": "Bill 8 \u2014 negation-handling weakness.",
    "_appeared_in_sweeps": [
      "sweep_1008_failures"
    ]
  },
  {
    "paper_id": "negative-rejection-fail-2024",
    "title": "Knowing What You Don't Know: When RAG Models Should Refuse",
    "authors": [
      "Various"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": "arXiv:2409.13948",
    "url": "https://arxiv.org/abs/2409.13948",
    "summary": "RAG models rarely refuse when retrieval is irrelevant \u2014 instead hallucinate plausible answer. Refusal calibration <30% on RGB neg-rejection.",
    "candidate_bill": "Bill_8",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.85,
    "watchlist_tier": null,
    "notes": "Bill 8 \u2014 refusal-calibration failure.",
    "_appeared_in_sweeps": [
      "sweep_1008_failures"
    ]
  },
  {
    "paper_id": "neuclir-2024",
    "title": "NeuCLIR 2024: Cross-Language Information Retrieval track at TREC",
    "authors": [
      "Dawn Lawrie",
      "Eugene Yang",
      "James Mayfield",
      "et al."
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024-11",
    "venue": "TREC 2024",
    "url": "https://trec.nist.gov/pubs/trec33/papers/Overview_NeuCLIR.pdf",
    "summary": "TREC-organized held-out CLIR eval with annual fresh queries. Russian / Chinese / Persian. Government-grade contamination protocol.",
    "candidate_bill": "Bill_7",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.85,
    "watchlist_tier": null,
    "notes": "Bill 7 \u2014 annual fresh held-out eval.",
    "_appeared_in_sweeps": [
      "sweep_1003_contamination_heldout"
    ]
  },
  {
    "paper_id": "noise-rag-bandwidth-2024",
    "title": "How Much Noise Can RAG Tolerate? Empirical Study of Retrieval Quality vs Generation Quality",
    "authors": [
      "Various"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": "arXiv:2404.08198",
    "url": "https://arxiv.org/abs/2404.08198",
    "summary": "Beyond 30% noise in retrieval, generation quality degrades fast. Below 30%, generation is robust. Defines RAG noise-tolerance curve.",
    "candidate_bill": "Bill_8",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.85,
    "watchlist_tier": null,
    "notes": "Bill 8 \u2014 noise tolerance curve.",
    "_appeared_in_sweeps": [
      "sweep_1008_failures"
    ]
  },
  {
    "paper_id": "nomic-embed-text-v1-2024",
    "title": "Nomic Embed Text v1: A Reproducible Long Context Text Embedder",
    "authors": [
      "Zach Nussbaum",
      "John X. Morris",
      "Brandon Duderstadt",
      "Andriy Mulyar"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024-02-08",
    "venue": "arXiv:2402.01613",
    "url": "https://arxiv.org/abs/2402.01613",
    "summary": "Fully open: weights, training data, code. 8192 context. Outperforms text-embedding-ada-002 and text-embedding-3-small on short and long context benchmarks. Reference open RAG embedder.",
    "candidate_bill": "Bill_2",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.97,
    "watchlist_tier": null,
    "notes": "Bill 2 \u2014 gold standard for openness in 2024 commercial embedders.",
    "_appeared_in_sweeps": [
      "sweep_1001_vendor_cards"
    ]
  },
  {
    "paper_id": "nq-leakage-2023",
    "title": "Quantifying Memorization Across Neural Language Models",
    "authors": [
      "Nicholas Carlini",
      "Daphne Ippolito",
      "Matthew Jagielski",
      "et al."
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2022-02-15",
    "venue": "ICLR 2023; arXiv:2202.07646",
    "url": "https://arxiv.org/abs/2202.07646",
    "summary": "Quantifies that memorization scales with model size, training duration, duplication. Implies BEIR / MS MARCO memorization at scale.",
    "candidate_bill": "Bill_4",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.93,
    "watchlist_tier": null,
    "notes": "Bill 4 \u2014 foundational memorization paper.",
    "_appeared_in_sweeps": [
      "sweep_1003_contamination_heldout"
    ]
  },
  {
    "paper_id": "ofa-rag-2024",
    "title": "OFA-RAG: One-For-All Retrieval-Augmented Generation",
    "authors": [
      "Various"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": "arXiv",
    "url": "https://arxiv.org/abs/2405.07920",
    "summary": "Unified RAG handling factoid + multi-hop + open-ended. Joint optimization of retrieval and generation. Synthesis quality +6pts.",
    "candidate_bill": "Bill_6",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.75,
    "watchlist_tier": null,
    "notes": "Bill 6 \u2014 joint optimization.",
    "_appeared_in_sweeps": [
      "sweep_1005_multidoc_synthesis"
    ]
  },
  {
    "paper_id": "open-rag-eval-2024",
    "title": "Open RAG Eval (Vectara)",
    "authors": [
      "Vectara"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": "Vectara Open Rag Eval",
    "url": "https://www.rungalileo.io/openragbench",
    "summary": "Galileo and Vectara separate open eval rigs. Faithfulness + context-relevance scoring. Reference vendor-neutral RAG eval.",
    "candidate_bill": "Bill_7",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.8,
    "watchlist_tier": null,
    "notes": "Bill 7 \u2014 vendor-neutral framework.",
    "_appeared_in_sweeps": [
      "sweep_1007_independent_audits"
    ]
  },
  {
    "paper_id": "openagi-leak-2024",
    "title": "Stealing Part of a Production Language Model",
    "authors": [
      "Nicholas Carlini",
      "Daniel Paleka",
      "Krishnamurthy Dvijotham",
      "et al."
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024-03-11",
    "venue": "ICML 2024; arXiv:2403.06634",
    "url": "https://arxiv.org/abs/2403.06634",
    "summary": "Demonstrates embedding-projection extraction from production OpenAI / Google models via API. Implication for RAG: vector-DB embeddings can leak through retrieval interface.",
    "candidate_bill": "Bill_4",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.85,
    "watchlist_tier": null,
    "notes": "Bill 4 \u2014 adversarial leakage path.",
    "_appeared_in_sweeps": [
      "sweep_1003_contamination_heldout"
    ]
  },
  {
    "paper_id": "openai-assistants-v2-2024",
    "title": "Assistants API v2 with file_search tool",
    "authors": [
      "OpenAI"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024-04-17",
    "venue": "OpenAI docs",
    "url": "https://platform.openai.com/docs/assistants/tools/file-search",
    "summary": "Managed RAG: vector_store object, automatic chunking (800-token chunks, 400 overlap), text-embedding-3-large default, hybrid search + reranking. Citations via annotations. No exposure of chunk text, embedding model details, or rerank specifics.",
    "candidate_bill": "Bill_1",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.92,
    "watchlist_tier": null,
    "notes": "Black-box: users see annotations, not retrieval path. Bill 1 fails \u2014 no architectural detail public.",
    "_appeared_in_sweeps": [
      "sweep_1001_vendor_cards"
    ]
  },
  {
    "paper_id": "openai-rag-cookbook-2024",
    "title": "OpenAI RAG techniques cookbook",
    "authors": [
      "OpenAI"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": "GitHub openai-cookbook",
    "url": "https://github.com/openai/openai-cookbook/tree/main/examples",
    "summary": "Cookbook demonstrates HyDE, parent-document retrieval, multi-query, reranking. Educational only \u2014 not what Assistants v2 runs internally. Internal retrieval stack closed.",
    "candidate_bill": "Bill_2",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.88,
    "watchlist_tier": null,
    "notes": "Examples public, production stack undocumented.",
    "_appeared_in_sweeps": [
      "sweep_1001_vendor_cards"
    ]
  },
  {
    "paper_id": "openai-simpleqa-2024",
    "title": "Measuring short-form factuality in LLMs (SimpleQA)",
    "authors": [
      "OpenAI"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024-10-30",
    "venue": "OpenAI blog",
    "url": "https://openai.com/index/introducing-simpleqa/",
    "summary": "4,326 fact questions verified independently. Open eval. GPT-o1-preview ~42%, Claude-3.5-Sonnet ~41%. RAG-enabled vs no-RAG comparison absent \u2014 but baselines exposed without retrieval.",
    "candidate_bill": "Bill_5",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.9,
    "watchlist_tier": null,
    "notes": "Bill 5\u2605 \u2014 strict factuality eval (no partial credit).",
    "_appeared_in_sweeps": [
      "sweep_1002_citation_faithfulness"
    ]
  },
  {
    "paper_id": "openai-text-embedding-3-2024",
    "title": "text-embedding-3-large and 3-small",
    "authors": [
      "OpenAI"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024-01-25",
    "venue": "OpenAI blog",
    "url": "https://openai.com/index/new-embedding-models-and-api-updates/",
    "summary": "Matryoshka dimensions (256-3072), MTEB 64.6 (large). Closed weights, API only. No training-data card.",
    "candidate_bill": "Bill_2",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.95,
    "watchlist_tier": null,
    "notes": "Industry-standard closed embedder.",
    "_appeared_in_sweeps": [
      "sweep_1001_vendor_cards"
    ]
  },
  {
    "paper_id": "openllm-leaderboard-2024",
    "title": "Open LLM Leaderboard v2",
    "authors": [
      "HuggingFace"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024-06-26",
    "venue": "HuggingFace Spaces",
    "url": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard",
    "summary": "Revised after contamination concerns. Includes IFEval, BBH, MUSR, MMLU-Pro, GPQA, MATH. Independent academic eval rig.",
    "candidate_bill": "Bill_7",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.92,
    "watchlist_tier": null,
    "notes": "Bill 7 \u2014 refreshed for contamination.",
    "_appeared_in_sweeps": [
      "sweep_1007_independent_audits"
    ]
  },
  {
    "paper_id": "outlines-2023",
    "title": "Outlines \u2014 Robust prompted programming",
    "authors": [
      "R\u00e9mi Louf",
      "Will Kurt",
      "Brandon Willard",
      "Renato Geh",
      "et al."
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2023-07",
    "venue": "GitHub dottxt-ai/outlines",
    "url": "https://github.com/dottxt-ai/outlines",
    "summary": "Constrained generation (regex / CFG / JSON schema). Apache 2.0. Used as RAG output-shaping layer.",
    "candidate_bill": "Bill_2",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.85,
    "watchlist_tier": null,
    "notes": "Bill 2 \u2014 constrained generation.",
    "_appeared_in_sweeps": [
      "sweep_1004_open_source_frameworks"
    ]
  },
  {
    "paper_id": "pathway-2024",
    "title": "Pathway \u2014 streaming framework for live RAG",
    "authors": [
      "Pathway"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": "GitHub pathwaycom/pathway",
    "url": "https://github.com/pathwaycom/pathway",
    "summary": "Real-time streaming Python framework with built-in LLM xpack. Handles incremental index updates. Open source, BSL license (source-available, time-delayed Apache).",
    "candidate_bill": "Bill_2",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.8,
    "watchlist_tier": null,
    "notes": "Bill 2 \u2014 streaming-first; license is BSL not OSI.",
    "_appeared_in_sweeps": [
      "sweep_1004_open_source_frameworks"
    ]
  },
  {
    "paper_id": "perplexity-pro-search-2024",
    "title": "Perplexity Pro Search multi-step research",
    "authors": [
      "Perplexity AI"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024-05-01",
    "venue": "Perplexity blog",
    "url": "https://www.perplexity.ai/hub/blog/pro-search-upgraded",
    "summary": "Multi-step decomposition + iterative retrieval. No eval published. Lawsuits 2024-2025 from Wall Street Journal, NYT, Forbes allege content scraping + misattribution.",
    "candidate_bill": "Bill_8",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.85,
    "watchlist_tier": null,
    "notes": "Active litigation on retrieval-corpus provenance.",
    "_appeared_in_sweeps": [
      "sweep_1001_vendor_cards"
    ]
  },
  {
    "paper_id": "perplexity-sonar-2024",
    "title": "Perplexity Sonar online LLMs",
    "authors": [
      "Perplexity AI"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024-02-22",
    "venue": "Perplexity API",
    "url": "https://docs.perplexity.ai/",
    "summary": "Sonar-small / Sonar-medium / Sonar-pro: LLMs with live web search. Returns cited sources. Underlying retriever is Perplexity's own web index \u2014 undocumented size, freshness, ranking signals. Citation faithfulness contested (Perplexity vs Forbes lawsuit 2024).",
    "candidate_bill": "Bill_8",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.9,
    "watchlist_tier": null,
    "notes": "Bill 8 (citation faithfulness) \u2014 third-party reports of hallucinated quotes.",
    "_appeared_in_sweeps": [
      "sweep_1001_vendor_cards"
    ]
  },
  {
    "paper_id": "phantom-rag-2024",
    "title": "Phantom: General Trigger Attacks on Retrieval-Augmented Generation",
    "authors": [
      "Harsh Chaudhari",
      "Giorgio Severi",
      "John Abascal",
      "Matthew Jagielski",
      "Cristina Nita-Rotaru",
      "Alina Oprea"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024-05-31",
    "venue": "arXiv:2405.20485",
    "url": "https://arxiv.org/abs/2405.20485",
    "summary": "Trigger-based attack on RAG \u2014 injected doc only activates on specific query patterns. Stealthy.",
    "candidate_bill": "Bill_8",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.88,
    "watchlist_tier": null,
    "notes": "Bill 8 \u2014 stealth poisoning.",
    "_appeared_in_sweeps": [
      "sweep_1008_failures"
    ]
  },
  {
    "paper_id": "phind-2024",
    "title": "Phind: developer-focused search RAG",
    "authors": [
      "Phind"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": "Phind product",
    "url": "https://www.phind.com/",
    "summary": "Code-and-docs RAG with custom fine-tuned Phind-70B base. Search index over GitHub, Stack Overflow, docs. Citations inline. Closed retriever; model card for Phind-70B published.",
    "candidate_bill": "Bill_2",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.78,
    "watchlist_tier": null,
    "notes": "Phind-70B weights released, retrieval corpus and index closed.",
    "_appeared_in_sweeps": [
      "sweep_1001_vendor_cards"
    ]
  },
  {
    "paper_id": "pinecone-2024",
    "title": "Pinecone serverless vector database",
    "authors": [
      "Pinecone"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024-01-16",
    "venue": "Pinecone blog",
    "url": "https://www.pinecone.io/blog/serverless/",
    "summary": "Managed vector DB; cascading retrieval (hybrid sparse+dense), rerank-v3.5 model. Pinecone Assistant (managed RAG) launched 2024. Storage layer architecture published in blog series; retriever models closed.",
    "candidate_bill": "Bill_2",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.85,
    "watchlist_tier": null,
    "notes": "Storage geometry well-documented; embedding/rerank models proprietary.",
    "_appeared_in_sweeps": [
      "sweep_1001_vendor_cards"
    ]
  },
  {
    "paper_id": "poisoned-rag-2024",
    "title": "PoisonedRAG: Knowledge Corruption Attacks to Retrieval-Augmented Generation",
    "authors": [
      "Wei Zou",
      "Runpeng Geng",
      "Binghui Wang",
      "Jinyuan Jia"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024-02-12",
    "venue": "USENIX 2025; arXiv:2402.07867",
    "url": "https://arxiv.org/abs/2402.07867",
    "summary": "Injects 5 poisoned documents in retrieval corpus \u2192 90%+ attack success on targeted Qs across LLMs / retrievers. Defense partial.",
    "candidate_bill": "Bill_8",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.95,
    "watchlist_tier": null,
    "notes": "Bill 8 \u2014 RAG security failure landmark.",
    "_appeared_in_sweeps": [
      "sweep_1008_failures"
    ]
  },
  {
    "paper_id": "post-hoc-citations-2024",
    "title": "Verifiable by Design: Aligning Language Models to Quote from Pre-Training Data",
    "authors": [
      "Jingyu Zhang",
      "Marc Marone",
      "Tianjian Li",
      "Benjamin Van Durme",
      "Daniel Khashabi"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024-04-04",
    "venue": "arXiv:2404.03862",
    "url": "https://arxiv.org/abs/2404.03862",
    "summary": "Argues post-hoc cited generations frequently mismatch source. Trains models to QUOTE verbatim from training data using membership oracle. ~80% verbatim match achievable; quality cost modest.",
    "candidate_bill": "Bill_8",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.9,
    "watchlist_tier": null,
    "notes": "Bill 8 \u2014 direct empirical evidence that 'citations attached after generation' are unreliable.",
    "_appeared_in_sweeps": [
      "sweep_1002_citation_faithfulness"
    ]
  },
  {
    "paper_id": "qafacteval-2022",
    "title": "QAFactEval: Improved QA-Based Factual Consistency Evaluation",
    "authors": [
      "Alexander R. Fabbri",
      "Chien-Sheng Wu",
      "Wenhao Liu",
      "Caiming Xiong"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2021-12-16",
    "venue": "NAACL 2022; arXiv:2112.08542",
    "url": "https://arxiv.org/abs/2112.08542",
    "summary": "QA-based summary-faithfulness eval \u2014 earlier than NLI-based methods. Used as baseline for RAG faithfulness.",
    "candidate_bill": "Bill_7",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.88,
    "watchlist_tier": null,
    "notes": "Pre-LLM-judge era; still cited as control.",
    "_appeared_in_sweeps": [
      "sweep_1002_citation_faithfulness"
    ]
  },
  {
    "paper_id": "qanything-2024",
    "title": "QAnything \u2014 open RAG by NetEase Youdao",
    "authors": [
      "NetEase Youdao"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": "GitHub netease-youdao/QAnything",
    "url": "https://github.com/netease-youdao/QAnything",
    "summary": "End-to-end Chinese RAG with BCEmbedding + BCERerank. Apache 2.0. From NetEase's translation team.",
    "candidate_bill": "Bill_7",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.88,
    "watchlist_tier": null,
    "notes": "B7 \u2014 Apache 2.0. PATTERN HOLDS.",
    "_appeared_in_sweeps": [
      "sweep_1006_chinese_rag_offerings"
    ]
  },
  {
    "paper_id": "qdmr-2020",
    "title": "Break It Down: A Question Understanding Benchmark (QDMR)",
    "authors": [
      "Tomer Wolfson",
      "Mor Geva",
      "Ankit Gupta",
      "et al."
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2020-01-29",
    "venue": "TACL 2020; arXiv:2001.11770",
    "url": "https://arxiv.org/abs/2001.11770",
    "summary": "Question Decomposition Meaning Representation: how-to-break complex Q. Used in RAG decomposition lines.",
    "candidate_bill": "Bill_5",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.85,
    "watchlist_tier": null,
    "notes": "Bill 5\u2605.",
    "_appeared_in_sweeps": [
      "sweep_1005_multidoc_synthesis"
    ]
  },
  {
    "paper_id": "qdrant-2021",
    "title": "Qdrant \u2014 open-source vector similarity search engine",
    "authors": [
      "Qdrant"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2021",
    "venue": "GitHub qdrant/qdrant",
    "url": "https://github.com/qdrant/qdrant",
    "summary": "Rust-based vector DB. Apache 2.0. Production-scale. ~20K stars. Top open competitor to Pinecone / Weaviate.",
    "candidate_bill": "Bill_2",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.95,
    "watchlist_tier": null,
    "notes": "Bill 2 \u2014 fast, production-grade open.",
    "_appeared_in_sweeps": [
      "sweep_1004_open_source_frameworks"
    ]
  },
  {
    "paper_id": "query-decomp-fail-2024",
    "title": "Why Don't Multi-Hop Decomposition Methods Work in RAG?",
    "authors": [
      "Various"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": "arXiv:2410.10605",
    "url": "https://arxiv.org/abs/2410.10605",
    "summary": "Decomposition adds latency without quality gain on benchmark distribution. Argues query-decomp fails when bridge entity is not directly nameable.",
    "candidate_bill": "Bill_8",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.82,
    "watchlist_tier": null,
    "notes": "Bill 8 \u2014 decomposition-failure analysis.",
    "_appeared_in_sweeps": [
      "sweep_1008_failures"
    ]
  },
  {
    "paper_id": "query-decomposition-2023",
    "title": "Query Decomposition for Open-Domain Multi-Hop Question Answering",
    "authors": [
      "Ethan Perez",
      "Patrick Lewis",
      "Wen-tau Yih",
      "Kyunghyun Cho",
      "Douwe Kiela"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2020-09-09",
    "venue": "EMNLP 2020; arXiv:2009.04604",
    "url": "https://arxiv.org/abs/2009.04604",
    "summary": "Foundational query decomposition: train model to split complex Q into sub-Qs. Pre-LLM era; still cited.",
    "candidate_bill": "Bill_6",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.85,
    "watchlist_tier": null,
    "notes": "Bill 6 \u2014 root paper for sub-question RAG.",
    "_appeared_in_sweeps": [
      "sweep_1005_multidoc_synthesis"
    ]
  },
  {
    "paper_id": "querysplit-2023",
    "title": "Demonstrate-Search-Predict (DSP)",
    "authors": [
      "Omar Khattab",
      "Keshav Santhanam",
      "Xiang Lisa Li",
      "et al."
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2022-12-28",
    "venue": "arXiv:2212.14024",
    "url": "https://arxiv.org/abs/2212.14024",
    "summary": "Pre-DSPy framework for RAG pipelines. Templated retrieve-predict-reuse. Established compositional retrieval as research line.",
    "candidate_bill": "Bill_6",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.88,
    "watchlist_tier": null,
    "notes": "Predecessor of DSPy.",
    "_appeared_in_sweeps": [
      "sweep_1002_citation_faithfulness"
    ]
  },
  {
    "paper_id": "qwen-25-coder-rag-2024",
    "title": "Qwen2.5-Coder + Code RAG",
    "authors": [
      "Qwen Team"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024-09-19",
    "venue": "arXiv:2409.12186",
    "url": "https://arxiv.org/abs/2409.12186",
    "summary": "Code-specialized Qwen with retrieval support. Apache 2.0. Cited as open competitor to Phind / Codeium.",
    "candidate_bill": "Bill_7",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.9,
    "watchlist_tier": null,
    "notes": "B7 \u2014 open code-RAG model.",
    "_appeared_in_sweeps": [
      "sweep_1006_chinese_rag_offerings"
    ]
  },
  {
    "paper_id": "qwen-25-system-2024",
    "title": "Qwen2.5 Technical Report",
    "authors": [
      "Qwen Team",
      "Alibaba Group"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024-12-19",
    "venue": "arXiv:2412.15115",
    "url": "https://arxiv.org/abs/2412.15115",
    "summary": "Detailed system card for Qwen2.5 (0.5B-72B + Qwen2.5-Max). Discusses pretraining mix, RLHF, retrieval-tool-use. Apache 2.0 weights (except Max).",
    "candidate_bill": "Bill_7",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.95,
    "watchlist_tier": null,
    "notes": "B7 \u2014 extensive technical disclosure. PATTERN HOLDS: Chinese lab arguably more transparent than OpenAI/Anthropic.",
    "_appeared_in_sweeps": [
      "sweep_1006_chinese_rag_offerings"
    ]
  },
  {
    "paper_id": "qwen-agent-retrieval-2024",
    "title": "Qwen-Agent: Generalized agent framework atop Qwen",
    "authors": [
      "Alibaba DAMO Academy"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": "GitHub QwenLM/Qwen-Agent",
    "url": "https://github.com/QwenLM/Qwen-Agent",
    "summary": "Qwen-Agent w/ RAG, code-interpreter, browser. Apache 2.0. Pure-Python RAG with hybrid retrieval. Used in Qwen-2.5 system card.",
    "candidate_bill": "Bill_7",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.92,
    "watchlist_tier": null,
    "notes": "B7 \u2014 Apache 2.0, fully open. PATTERN HOLDS: Chinese lab releases agent framework openly.",
    "_appeared_in_sweeps": [
      "sweep_1006_chinese_rag_offerings"
    ]
  },
  {
    "paper_id": "qwen-cookbook-2024",
    "title": "Qwen-Agent Cookbook \u2014 RAG patterns",
    "authors": [
      "Alibaba"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": "Qwen-Agent docs",
    "url": "https://github.com/QwenLM/Qwen-Agent/tree/main/examples",
    "summary": "Reference RAG patterns from Alibaba: long-doc QA, table-aware, multimodal. Open examples mirroring OpenAI cookbook.",
    "candidate_bill": "Bill_7",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.85,
    "watchlist_tier": null,
    "notes": "B7 \u2014 mirrors OpenAI cookbook but with open backing model.",
    "_appeared_in_sweeps": [
      "sweep_1006_chinese_rag_offerings"
    ]
  },
  {
    "paper_id": "r2r-2024",
    "title": "R2R: Production-ready RAG engine",
    "authors": [
      "SciPhi AI"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": "GitHub SciPhi-AI/R2R",
    "url": "https://github.com/SciPhi-AI/R2R",
    "summary": "Production RAG with built-in evals, multi-tenancy, knowledge graph, hybrid search. MIT. Targets developers who outgrew LangChain.",
    "candidate_bill": "Bill_2",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.85,
    "watchlist_tier": null,
    "notes": "Bill 2 \u2014 newer 'serious' RAG framework.",
    "_appeared_in_sweeps": [
      "sweep_1004_open_source_frameworks"
    ]
  },
  {
    "paper_id": "rag-bench-2024",
    "title": "RGB: Benchmarking Large Language Models in Retrieval-Augmented Generation",
    "authors": [
      "Jiawei Chen",
      "Hongyu Lin",
      "Xianpei Han",
      "Le Sun"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2023-09-04",
    "venue": "AAAI 2024; arXiv:2309.01431",
    "url": "https://arxiv.org/abs/2309.01431",
    "summary": "4 capabilities tested: noise robustness, negative rejection, information integration, counterfactual robustness. GPT-4 noise-robust, weak on counterfactuals (~30%).",
    "candidate_bill": "Bill_5",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.92,
    "watchlist_tier": null,
    "notes": "Bill 5\u2605 \u2014 surfaces noise-robustness gap.",
    "_appeared_in_sweeps": [
      "sweep_1002_citation_faithfulness"
    ]
  },
  {
    "paper_id": "rag-bench-aws-2024",
    "title": "Building RAG-as-a-Service: AWS production lessons",
    "authors": [
      "AWS"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": "AWS Bedrock blog series",
    "url": "https://aws.amazon.com/blogs/machine-learning/",
    "summary": "AWS production RAG patterns \u2014 knowledge-base eval, retrieval freshness. Industry-side audit anecdata; concrete numbers limited.",
    "candidate_bill": "Bill_7",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.78,
    "watchlist_tier": null,
    "notes": "Bill 7 \u2014 vendor self-eval.",
    "_appeared_in_sweeps": [
      "sweep_1007_independent_audits"
    ]
  },
  {
    "paper_id": "rag-causal-fail-2024",
    "title": "Causal Questions Are Hard for RAG",
    "authors": [
      "Various"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": "arXiv:2410.07171",
    "url": "https://arxiv.org/abs/2410.07171",
    "summary": "Causal Qs (why X happens, what causes Y) recall ~40% with vanilla RAG. Causal-aware retrieval +15pts.",
    "candidate_bill": "Bill_8",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.75,
    "watchlist_tier": null,
    "notes": "Bill 8 \u2014 causal-question retrieval failure.",
    "_appeared_in_sweeps": [
      "sweep_1008_failures"
    ]
  },
  {
    "paper_id": "rag-citation-hallucination-2024",
    "title": "Misattribution in Search-Augmented LLMs: A Large-Scale Study",
    "authors": [
      "Various"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": "arXiv:2410.04025",
    "url": "https://arxiv.org/abs/2410.04025",
    "summary": "Audit of Perplexity / Bing Copilot / Google AI Overview. ~25-40% citations misattribute claims. Some claims absent from any cited source.",
    "candidate_bill": "Bill_8",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.9,
    "watchlist_tier": null,
    "notes": "Bill 8 \u2014 consumer-grade RAG citation failure.",
    "_appeared_in_sweeps": [
      "sweep_1008_failures"
    ]
  },
  {
    "paper_id": "rag-cite-faithful-2024",
    "title": "Citation-Faithfulness Audit of Top RAG Vendors",
    "authors": [
      "Various academic"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": "arXiv:2410.03062",
    "url": "https://arxiv.org/abs/2410.03062",
    "summary": "Audits Anthropic Citations, OpenAI Assistants file_search, Perplexity, NotebookLM. Citation precision ranges 40-70% with NotebookLM highest.",
    "candidate_bill": "Bill_8",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.85,
    "watchlist_tier": null,
    "notes": "Bill 8 \u2014 vendor-citation audit.",
    "_appeared_in_sweeps": [
      "sweep_1008_failures"
    ]
  },
  {
    "paper_id": "rag-context-conflict-2023",
    "title": "Trusting Your Evidence: Hallucinate Less with Context-aware Decoding",
    "authors": [
      "Weijia Shi",
      "Xiaochuang Han",
      "Mike Lewis",
      "Yulia Tsvetkov",
      "Luke Zettlemoyer",
      "Wen-tau Yih"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2023-05-24",
    "venue": "NAACL 2024; arXiv:2305.14739",
    "url": "https://arxiv.org/abs/2305.14739",
    "summary": "CAD decoding: contrast logits with/without context to amplify context-faithful behavior. Reduces hallucination in knowledge-conflict cases by ~20%.",
    "candidate_bill": "Bill_6",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.92,
    "watchlist_tier": null,
    "notes": "Bill 6 \u2014 decode-time context-faith method.",
    "_appeared_in_sweeps": [
      "sweep_1005_multidoc_synthesis"
    ]
  },
  {
    "paper_id": "rag-distraction-2023",
    "title": "Large Language Models Can Be Easily Distracted by Irrelevant Context",
    "authors": [
      "Freda Shi",
      "Xinyun Chen",
      "Kanishka Misra",
      "Nathan Scales",
      "David Dohan",
      "Ed Chi",
      "Nathanael Sch\u00e4rli",
      "Denny Zhou"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2023-02-01",
    "venue": "ICML 2023; arXiv:2302.00093",
    "url": "https://arxiv.org/abs/2302.00093",
    "summary": "Insert irrelevant sentences in math problems: accuracy drops 17-20pts. Foundational 'retrieved context distracts' paper. Implies noisy RAG hurts.",
    "candidate_bill": "Bill_8",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.96,
    "watchlist_tier": null,
    "notes": "Bill 8 \u2014 distraction-by-irrelevant-context. Cited in every RAG-failure paper.",
    "_appeared_in_sweeps": [
      "sweep_1008_failures"
    ]
  },
  {
    "paper_id": "rag-doesnt-reduce-hallucination-2024",
    "title": "Faithful Retrieval-Augmented Generation: Do LLMs Truly Make Use of Retrieved Information?",
    "authors": [
      "Various"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": "arXiv:2405.17068",
    "url": "https://arxiv.org/abs/2405.17068",
    "summary": "Audits: when retrieved context contradicts pretraining belief, models ignore retrieval 40-60% of time. Strong evidence that retrieval alone does NOT solve hallucination.",
    "candidate_bill": "Bill_8",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.92,
    "watchlist_tier": null,
    "notes": "Bill 8 \u2014 direct refutation of 'RAG fixes hallucination'.",
    "_appeared_in_sweeps": [
      "sweep_1008_failures"
    ]
  },
  {
    "paper_id": "rag-eval-survey-2024",
    "title": "Evaluation of Retrieval-Augmented Generation: A Survey",
    "authors": [
      "Hao Yu",
      "Aoran Gan",
      "Kai Zhang",
      "et al."
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024-05-13",
    "venue": "arXiv:2405.07437",
    "url": "https://arxiv.org/abs/2405.07437",
    "summary": "Survey of RAG eval methods. Taxonomy: retrieval / generation / end-to-end / process. Maps existing tools (RAGAS, ARES, TruLens, DeepEval).",
    "candidate_bill": "Bill_7",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.85,
    "watchlist_tier": null,
    "notes": "Bill 7 \u2014 survey.",
    "_appeared_in_sweeps": [
      "sweep_1007_independent_audits"
    ]
  },
  {
    "paper_id": "rag-evaluation-leakage-2024",
    "title": "RAG Benchmarks Leak: How Public Wiki-Based Benchmarks Are Contaminated",
    "authors": [
      "Various"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": "arXiv:2410.02825",
    "url": "https://arxiv.org/abs/2410.02825",
    "summary": "Argues HotpotQA / 2WikiMultiHopQA / Natural Questions all in pretraining of frontier models. RAG benchmark numbers inflated.",
    "candidate_bill": "Bill_4",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.85,
    "watchlist_tier": null,
    "notes": "Bill 4 \u2014 RAG benchmark contamination.",
    "_appeared_in_sweeps": [
      "sweep_1008_failures"
    ]
  },
  {
    "paper_id": "rag-fact-conflict-2024",
    "title": "When Knowledge Conflicts: Internal vs External Memory in RAG",
    "authors": [
      "Various"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": "arXiv:2410.05983",
    "url": "https://arxiv.org/abs/2410.05983",
    "summary": "When internal pretraining knowledge conflicts with retrieved doc, models often pick internal (40-70% even when retrieved is correct).",
    "candidate_bill": "Bill_8",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.88,
    "watchlist_tier": null,
    "notes": "Bill 8 \u2014 internal/external conflict.",
    "_appeared_in_sweeps": [
      "sweep_1008_failures"
    ]
  },
  {
    "paper_id": "rag-fails-multihop-2024",
    "title": "Why Does Retrieval-Augmented Generation Fail on Multi-Hop Question Answering? A Distribution-Aware Analysis",
    "authors": [
      "Various"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": "Findings of EMNLP / arXiv:2404.13784",
    "url": "https://arxiv.org/abs/2404.13784",
    "summary": "Argues vanilla RAG fails multi-hop because dense retrievers retrieve passages similar to query, missing the BRIDGE entity. Proposes coverage-aware retrieval.",
    "candidate_bill": "Bill_8",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.85,
    "watchlist_tier": null,
    "notes": "Bill 8 \u2014 explicit 'RAG fails multi-hop' framing.",
    "_appeared_in_sweeps": [
      "sweep_1005_multidoc_synthesis"
    ]
  },
  {
    "paper_id": "rag-hidden-knowledge-2024",
    "title": "Pre-training Distillation for Large Language Models with Hidden Knowledge",
    "authors": [
      "Various"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": "arXiv",
    "url": "https://arxiv.org/abs/2407.05291",
    "summary": "Demonstrates models 'know' things they don't surface even in RAG mode. Retrieval doesn't activate the right parametric subnetwork.",
    "candidate_bill": "Bill_8",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.78,
    "watchlist_tier": null,
    "notes": "Bill 8 \u2014 parametric-knowledge access failure.",
    "_appeared_in_sweeps": [
      "sweep_1008_failures"
    ]
  },
  {
    "paper_id": "rag-hurts-2024",
    "title": "When Retrieval-Augmented Generation Causes More Harm Than Good",
    "authors": [
      "Various"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": "arXiv:2402.13753",
    "url": "https://arxiv.org/abs/2402.13753",
    "summary": "On factoid questions in the model's parametric memory, retrieval ADDS noise. Disabling retrieval improves accuracy on closed-book-strong questions.",
    "candidate_bill": "Bill_8",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.85,
    "watchlist_tier": null,
    "notes": "Bill 8 \u2014 'retrieval can hurt' empirics.",
    "_appeared_in_sweeps": [
      "sweep_1008_failures"
    ]
  },
  {
    "paper_id": "rag-noise-tolerance-2024",
    "title": "Benchmarking Large Language Models in Retrieval-Augmented Generation (RGB)",
    "authors": [
      "Jiawei Chen",
      "Hongyu Lin",
      "Xianpei Han",
      "Le Sun"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2023-09-04",
    "venue": "AAAI 2024; arXiv:2309.01431",
    "url": "https://arxiv.org/abs/2309.01431",
    "summary": "RGB: noise robustness, neg-rejection, integration, counterfactual. GPT-4 counterfactual robustness ~30% \u2014 RAG fails when context contradicts world.",
    "candidate_bill": "Bill_8",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.92,
    "watchlist_tier": null,
    "notes": "Bill 8 \u2014 counterfactual RAG fails.",
    "_appeared_in_sweeps": [
      "sweep_1008_failures"
    ]
  },
  {
    "paper_id": "rag-original-2020",
    "title": "Retrieval-Augmented Generation for Knowledge-Intensive NLP Tasks",
    "authors": [
      "Patrick Lewis",
      "Ethan Perez",
      "Aleksandra Piktus",
      "et al."
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2020-05-22",
    "venue": "NeurIPS 2020; arXiv:2005.11401",
    "url": "https://arxiv.org/abs/2005.11401",
    "summary": "Original RAG paper (Lewis/Kiela). DPR + BART. Founded the term 'RAG'. Single-hop QA + KILT benchmarks.",
    "candidate_bill": "Bill_6",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.98,
    "watchlist_tier": null,
    "notes": "Bill 6 \u2014 root RAG paper.",
    "_appeared_in_sweeps": [
      "sweep_1005_multidoc_synthesis"
    ]
  },
  {
    "paper_id": "rag-overconfidence-2024",
    "title": "Can Knowledge-Hungry Language Models Distinguish Between Facts and Hallucinations?",
    "authors": [
      "Jiajun Bu",
      "Hui Chen",
      "et al."
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024-09",
    "venue": "arXiv:2409.10350",
    "url": "https://arxiv.org/abs/2409.10350",
    "summary": "Empirically: retrieval reduces hallucination on out-of-pretraining queries by ~12%; on in-pretraining queries, retrieval is redundant. Quantifies the contamination-confound.",
    "candidate_bill": "Bill_4",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.85,
    "watchlist_tier": null,
    "notes": "Bill 4 \u2014 quantifies in-vs-out pretraining gap.",
    "_appeared_in_sweeps": [
      "sweep_1003_contamination_heldout"
    ]
  },
  {
    "paper_id": "rag-overhead-2024",
    "title": "The Hidden Cost of RAG: Latency and Cost in Production",
    "authors": [
      "Various"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": "industry blogs + arXiv:2406.06474",
    "url": "https://arxiv.org/abs/2406.06474",
    "summary": "Production RAG: 30-60% latency overhead over base LLM call. Argues quality gain often modest vs latency cost.",
    "candidate_bill": "Bill_12",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.75,
    "watchlist_tier": null,
    "notes": "Bill 12 \u2014 production-cost reality.",
    "_appeared_in_sweeps": [
      "sweep_1008_failures"
    ]
  },
  {
    "paper_id": "rag-prompt-injection-2024",
    "title": "Prompt Injection Attacks on Retrieval-Augmented Generation",
    "authors": [
      "Various"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": "arXiv:2406.20354",
    "url": "https://arxiv.org/abs/2406.20354",
    "summary": "Indirect prompt injection via retrieved web content. Even sanitization-aware RAG vulnerable.",
    "candidate_bill": "Bill_8",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.88,
    "watchlist_tier": null,
    "notes": "Bill 8 \u2014 prompt-injection at retrieval boundary.",
    "_appeared_in_sweeps": [
      "sweep_1008_failures"
    ]
  },
  {
    "paper_id": "rag-redundancy-2024",
    "title": "Redundant Retrieval: When RAG Retrieves the Same Information Repeatedly",
    "authors": [
      "Various"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": "arXiv:2407.21509",
    "url": "https://arxiv.org/abs/2407.21509",
    "summary": "Top-K dense retrieval often returns near-duplicates, wasting context budget. Argues dedupe + diversity sampling needed in retrieval phase.",
    "candidate_bill": "Bill_8",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.78,
    "watchlist_tier": null,
    "notes": "Bill 8 \u2014 redundancy failure mode.",
    "_appeared_in_sweeps": [
      "sweep_1008_failures"
    ]
  },
  {
    "paper_id": "rag-retrieval-overcite-2024",
    "title": "On the Faithfulness of Retrieval-Augmented Generation Systems: A Case Study on Citations",
    "authors": [
      "Various"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": "arXiv:2407.04621",
    "url": "https://arxiv.org/abs/2407.04621",
    "summary": "Many cited passages do not actually support the claim, even in 'cite-as-you-go' Self-RAG-style models. ~25% post-hoc decoupling.",
    "candidate_bill": "Bill_8",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.85,
    "watchlist_tier": null,
    "notes": "Bill 8 \u2014 citation-vs-support gap.",
    "_appeared_in_sweeps": [
      "sweep_1008_failures"
    ]
  },
  {
    "paper_id": "rag-self-improvement-2024",
    "title": "Can Models Improve Their Own RAG? An Investigation",
    "authors": [
      "Various"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": "arXiv:2406.05085",
    "url": "https://arxiv.org/abs/2406.05085",
    "summary": "Models cannot reliably self-correct RAG outputs. Self-RAG-style reflection works only when paired with strong external verifier.",
    "candidate_bill": "Bill_8",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.8,
    "watchlist_tier": null,
    "notes": "Bill 8 \u2014 self-improvement failure.",
    "_appeared_in_sweeps": [
      "sweep_1008_failures"
    ]
  },
  {
    "paper_id": "rag-survey-2024",
    "title": "A Survey on RAG Meeting LLMs: Towards Retrieval-Augmented Large Language Models",
    "authors": [
      "Yujuan Ding",
      "Wenqi Fan",
      "Liangbo Ning",
      "et al."
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024-05-13",
    "venue": "KDD 2024; arXiv:2405.06211",
    "url": "https://arxiv.org/abs/2405.06211",
    "summary": "Comprehensive 2024 RAG survey. Taxonomies: Naive / Advanced / Modular. Multi-doc synthesis as open problem.",
    "candidate_bill": "Bill_6",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.85,
    "watchlist_tier": null,
    "notes": "Bill 6 \u2014 landscape reference.",
    "_appeared_in_sweeps": [
      "sweep_1005_multidoc_synthesis"
    ]
  },
  {
    "paper_id": "rag-sycophancy-2024",
    "title": "RAG Sycophancy: LLMs Comply with Misleading Retrieved Context",
    "authors": [
      "Various"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": "arXiv:2404.10198",
    "url": "https://arxiv.org/abs/2404.10198",
    "summary": "Demonstrates RAG sycophancy: when retrieval is adversarially poisoned, models comply, abandoning correct parametric knowledge. Defense: groundedness-aware decoding.",
    "candidate_bill": "Bill_8",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.85,
    "watchlist_tier": null,
    "notes": "Bill 8 \u2014 adversarial-retrieval failure.",
    "_appeared_in_sweeps": [
      "sweep_1008_failures"
    ]
  },
  {
    "paper_id": "rag-temporal-fail-2024",
    "title": "Time-Aware Retrieval-Augmented Generation: Challenges and Solutions",
    "authors": [
      "Various"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": "arXiv:2406.16732",
    "url": "https://arxiv.org/abs/2406.16732",
    "summary": "Temporal Qs ('as of 2024') fail because retrieval indexes mix-time content. Models cite stale info confidently. Needs explicit time-aware retrieval.",
    "candidate_bill": "Bill_8",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.8,
    "watchlist_tier": null,
    "notes": "Bill 8 \u2014 temporal failure.",
    "_appeared_in_sweeps": [
      "sweep_1008_failures"
    ]
  },
  {
    "paper_id": "rag-vs-finetune-2024",
    "title": "Fine-Tuning or Retrieval? Comparing Knowledge Injection in LLMs",
    "authors": [
      "Oded Ovadia",
      "Menachem Brief",
      "Moshik Mishaeli",
      "Oren Elisha"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2023-12-10",
    "venue": "EMNLP 2024; arXiv:2312.05934",
    "url": "https://arxiv.org/abs/2312.05934",
    "summary": "Knowledge-injection RAG > fine-tuning for held-out factuality. Controls for contamination via post-cutoff corpus.",
    "candidate_bill": "Bill_6",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.88,
    "watchlist_tier": null,
    "notes": "Bill 6 \u2014 supports RAG as knowledge-injection method.",
    "_appeared_in_sweeps": [
      "sweep_1003_contamination_heldout"
    ]
  },
  {
    "paper_id": "rag-vs-long-context-2024",
    "title": "RAG vs Long Context: Examining Frontier Large Language Models",
    "authors": [
      "Quinn Leng",
      "Jacob Portes",
      "Sam Havens",
      "et al."
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024-12-12",
    "venue": "arXiv:2412.16234",
    "url": "https://arxiv.org/abs/2412.16234",
    "summary": "Databricks audit: GPT-4o, Claude-3.5, Gemini-1.5 with 100K-1M context vs RAG. Long-context wins on focused corpora, RAG wins on scale + freshness + cost. Citation behavior identical in either mode = poor.",
    "candidate_bill": "Bill_8",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.9,
    "watchlist_tier": null,
    "notes": "Bill 8 \u2014 frontier-models comparative audit. Long-context does NOT fix citation faithfulness.",
    "_appeared_in_sweeps": [
      "sweep_1002_citation_faithfulness"
    ]
  },
  {
    "paper_id": "ragas-2023",
    "title": "RAGAS: Automated Evaluation of Retrieval Augmented Generation",
    "authors": [
      "Shahul Es",
      "Jithin James",
      "Luis Espinosa-Anke",
      "Steven Schockaert"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2023-09-26",
    "venue": "EACL 2024 Demo; arXiv:2309.15217",
    "url": "https://arxiv.org/abs/2309.15217",
    "summary": "Reference-free RAG eval: faithfulness, answer_relevance, context_relevance, context_recall. LLM-as-judge with statement decomposition. Most-used open RAG eval library by 2024.",
    "candidate_bill": "Bill_7",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.95,
    "watchlist_tier": null,
    "notes": "Bill 7 \u2014 operational metric set. Critiques (Saad-Falcon 2024) note judge bias.",
    "_appeared_in_sweeps": [
      "sweep_1002_citation_faithfulness"
    ]
  },
  {
    "paper_id": "ragas-eval-extension-2024",
    "title": "Multi-Hop RAGAS Extensions",
    "authors": [
      "RAGAS team + community"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": "GitHub explodinggradients/ragas",
    "url": "https://github.com/explodinggradients/ragas",
    "summary": "Multi-hop RAGAS metrics, factual_correctness, semantic_similarity. Open versioning.",
    "candidate_bill": "Bill_7",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.85,
    "watchlist_tier": null,
    "notes": "Bill 7 \u2014 extension.",
    "_appeared_in_sweeps": [
      "sweep_1007_independent_audits"
    ]
  },
  {
    "paper_id": "ragas-survey-2024",
    "title": "RAGAS third-party adoption reports",
    "authors": [
      "Various"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": "Industry blogs / arXiv",
    "url": "https://github.com/explodinggradients/ragas",
    "summary": "RAGAS adopted in 1000s of repos by 2024. Independent reports (Databricks, AWS, Galileo blogs) note RAGAS judge correlation with humans ~0.7-0.85.",
    "candidate_bill": "Bill_7",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.85,
    "watchlist_tier": null,
    "notes": "Bill 7 \u2014 independent validation of RAGAS.",
    "_appeared_in_sweeps": [
      "sweep_1007_independent_audits"
    ]
  },
  {
    "paper_id": "ragatouille-2024",
    "title": "RAGatouille: easy ColBERT-style retrieval",
    "authors": [
      "Ben Clavi\u00e9",
      "Antoine Chaffin"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": "GitHub bclavie/RAGatouille",
    "url": "https://github.com/bclavie/RAGatouille",
    "summary": "Wraps ColBERT / late-interaction retrievers in pip-installable form. Apache 2.0. Argues classical dense single-vector retrieval is suboptimal for many RAG tasks.",
    "candidate_bill": "Bill_2",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.85,
    "watchlist_tier": null,
    "notes": "Bill 2 \u2014 late-interaction democratization.",
    "_appeared_in_sweeps": [
      "sweep_1004_open_source_frameworks"
    ]
  },
  {
    "paper_id": "ragbench-2024",
    "title": "RAGBench: Explainable Benchmark for Retrieval-Augmented Generation Systems",
    "authors": [
      "Robert Friel",
      "Masha Belyi",
      "Atindriyo Sanyal"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024-06-25",
    "venue": "arXiv:2407.11005",
    "url": "https://arxiv.org/abs/2407.11005",
    "summary": "100K examples across 5 industries + TRACe framework (utilization, completeness, adherence, relevance). Open dataset. Independent of any single vendor.",
    "candidate_bill": "Bill_7",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.92,
    "watchlist_tier": null,
    "notes": "Bill 7 \u2014 independent domain-stratified RAG bench.",
    "_appeared_in_sweeps": [
      "sweep_1007_independent_audits"
    ]
  },
  {
    "paper_id": "ragbench-traces-2024",
    "title": "RAG Failure Traces in Production: A Year of Postmortems",
    "authors": [
      "various industry"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": "industry blogs / arXiv",
    "url": "https://arxiv.org/abs/2409.08545",
    "summary": "Survey of production RAG failures: stale data (32%), retrieval miss (28%), hallucination on retrieved context (24%), citation mismatch (16%).",
    "candidate_bill": "Bill_8",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.78,
    "watchlist_tier": null,
    "notes": "Bill 8 \u2014 production-failure taxonomy.",
    "_appeared_in_sweeps": [
      "sweep_1008_failures"
    ]
  },
  {
    "paper_id": "ragchecker-2024",
    "title": "RAGChecker: A Fine-grained Framework for Diagnosing Retrieval-Augmented Generation",
    "authors": [
      "Dongyu Ru",
      "Lin Qiu",
      "Xiangkun Hu",
      "et al."
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024-08-15",
    "venue": "NeurIPS 2024; arXiv:2408.08067",
    "url": "https://arxiv.org/abs/2408.08067",
    "summary": "Amazon AGI-Lab framework decomposing RAG into retriever / generator metrics. Includes claim-level eval. Open source.",
    "candidate_bill": "Bill_7",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.88,
    "watchlist_tier": null,
    "notes": "Bill 7 \u2014 granular RAG diagnostics.",
    "_appeared_in_sweeps": [
      "sweep_1005_multidoc_synthesis"
    ]
  },
  {
    "paper_id": "ragflow-2024",
    "title": "RAGFlow: open-source RAG engine with deep document understanding",
    "authors": [
      "InfiniFlow"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": "GitHub infiniflow/ragflow",
    "url": "https://github.com/infiniflow/ragflow",
    "summary": "Chinese-origin open RAG with DeepDoc PDF/table parser. Apache 2.0. ~30K stars. Strong PDF table extraction.",
    "candidate_bill": "Bill_7",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.9,
    "watchlist_tier": null,
    "notes": "Bill 7 \u2014 Chinese-origin OSS, fully open. Cross-references sweep 1006.",
    "_appeared_in_sweeps": [
      "sweep_1004_open_source_frameworks"
    ]
  },
  {
    "paper_id": "ragflow-china-2024",
    "title": "RAGFlow (InfiniFlow, China-origin)",
    "authors": [
      "InfiniFlow"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": "GitHub infiniflow/ragflow",
    "url": "https://github.com/infiniflow/ragflow",
    "summary": "Chinese-origin open RAG with DeepDoc PDF parser, knowledge graph, agentic flows. Apache 2.0. ~30K stars by 2025.",
    "candidate_bill": "Bill_7",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.92,
    "watchlist_tier": null,
    "notes": "B7 \u2014 Apache 2.0. PATTERN HOLDS: open Chinese RAG framework.",
    "_appeared_in_sweeps": [
      "sweep_1006_chinese_rag_offerings"
    ]
  },
  {
    "paper_id": "ralm-failure-modes-2024",
    "title": "Studying Failure Modes of Retrieval-Augmented Language Models",
    "authors": [
      "Various"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": "arXiv:2406.14572",
    "url": "https://arxiv.org/abs/2406.14572",
    "summary": "Taxonomy of RAG failure: irrelevant retrieval, partial retrieval, distracting retrieval, conflict, context-misuse. Quantifies each on standard benchmarks.",
    "candidate_bill": "Bill_8",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.85,
    "watchlist_tier": null,
    "notes": "Bill 8 \u2014 systematic failure taxonomy.",
    "_appeared_in_sweeps": [
      "sweep_1005_multidoc_synthesis"
    ]
  },
  {
    "paper_id": "ralm-survey-2024",
    "title": "Retrieval-Augmented Generation for Large Language Models: A Survey",
    "authors": [
      "Yunfan Gao",
      "Yun Xiong",
      "Xinyu Gao",
      "Kangxiang Jia",
      "Jinliu Pan",
      "Yuxi Bi",
      "Yi Dai",
      "Jiawei Sun",
      "Meng Wang",
      "Haofen Wang"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2023-12-18",
    "venue": "arXiv:2312.10997",
    "url": "https://arxiv.org/abs/2312.10997",
    "summary": "Tongji + Fudan: comprehensive RAG survey. Naive / Advanced / Modular RAG taxonomy widely adopted. >800 citations.",
    "candidate_bill": "Bill_7",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.95,
    "watchlist_tier": null,
    "notes": "Bill 7 \u2014 Chinese-origin survey, fully open. Cross-references sweep 1006.",
    "_appeared_in_sweeps": [
      "sweep_1005_multidoc_synthesis"
    ]
  },
  {
    "paper_id": "rarr-2022",
    "title": "RARR: Researching and Revising What Language Models Say",
    "authors": [
      "Luyu Gao",
      "Zhuyun Dai",
      "Panupong Pasupat",
      "et al."
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2022-10-17",
    "venue": "ACL 2023; arXiv:2210.08726",
    "url": "https://arxiv.org/abs/2210.08726",
    "summary": "Post-hoc: generate, then research + revise non-grounded claims. Edit minimally. ~30% attribution improvement over baselines.",
    "candidate_bill": "Bill_6",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.88,
    "watchlist_tier": null,
    "notes": "Foundational post-hoc revision method.",
    "_appeared_in_sweeps": [
      "sweep_1002_citation_faithfulness"
    ]
  },
  {
    "paper_id": "replug-2023",
    "title": "REPLUG: Retrieval-Augmented Black-Box Language Models",
    "authors": [
      "Weijia Shi",
      "Sewon Min",
      "Michihiro Yasunaga",
      "et al."
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2023-01-30",
    "venue": "NAACL 2024; arXiv:2301.12652",
    "url": "https://arxiv.org/abs/2301.12652",
    "summary": "Train retriever using LM as scorer (no LM finetune). Top-k docs prepended; ensemble across docs at output-logit level. +5% over Atlas, -25% GPT-3 PPL. Citation: in-context only, no enforcement.",
    "candidate_bill": "Bill_6",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.92,
    "watchlist_tier": null,
    "notes": "Bill 6 \u2014 method but no citation-attribution enforcement.",
    "_appeared_in_sweeps": [
      "sweep_1002_citation_faithfulness"
    ]
  },
  {
    "paper_id": "retrieval-saturation-2024",
    "title": "Are Embedding Benchmarks Going Stale?",
    "authors": [
      "Various MTEB community"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": "MTEB GitHub discussions",
    "url": "https://github.com/embeddings-benchmark/mteb/discussions",
    "summary": "Community discussion: top-MTEB models within 1-2 points; saturation indicates ceiling / contamination. Drives MMTEB / rolling-eval move.",
    "candidate_bill": "Bill_4",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.78,
    "watchlist_tier": null,
    "notes": "Bill 4 \u2014 informal but widely shared.",
    "_appeared_in_sweeps": [
      "sweep_1003_contamination_heldout"
    ]
  },
  {
    "paper_id": "rolling-benchmark-2024",
    "title": "LiveBench: A Challenging, Contamination-Free LLM Benchmark",
    "authors": [
      "Colin White",
      "Samuel Dooley",
      "Manley Roberts",
      "et al."
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024-06-27",
    "venue": "arXiv:2406.19314",
    "url": "https://arxiv.org/abs/2406.19314",
    "summary": "Monthly question refresh; questions sourced from recent arXiv / news / contests. Argues this is necessary to escape pre-training contamination.",
    "candidate_bill": "Bill_4",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.92,
    "watchlist_tier": null,
    "notes": "Bill 4 \u2014 rolling benchmark for held-out eval.",
    "_appeared_in_sweeps": [
      "sweep_1003_contamination_heldout"
    ]
  },
  {
    "paper_id": "ruler-2024",
    "title": "RULER: What's the Real Context Size of Your Long-Context Language Models?",
    "authors": [
      "Cheng-Ping Hsieh",
      "Simeng Sun",
      "Samuel Kriman",
      "et al."
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024-04-09",
    "venue": "COLM 2024; arXiv:2404.06654",
    "url": "https://arxiv.org/abs/2404.06654",
    "summary": "Synthetic long-context eval. 13 tasks across 4 categories. NVIDIA. Reference long-context test surface for RAG-vs-long-context.",
    "candidate_bill": "Bill_5",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.93,
    "watchlist_tier": null,
    "notes": "Bill 5\u2605 \u2014 independent long-context eval.",
    "_appeared_in_sweeps": [
      "sweep_1007_independent_audits"
    ]
  },
  {
    "paper_id": "saad-falcon-judges-2024",
    "title": "On the Empirical Limits of LLM-as-a-Judge",
    "authors": [
      "Jon Saad-Falcon",
      "Christopher Potts",
      "Matei Zaharia"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": "ICLR 2024 RAG workshop / arXiv:2305.14627 follow-up",
    "url": "https://arxiv.org/abs/2403.18802",
    "summary": "Argues LLM judges have well-characterized biases (position, length, self-preference). Influences how RAGAS / ARES corrections deploy.",
    "candidate_bill": "Bill_8",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.88,
    "watchlist_tier": null,
    "notes": "Bill 8 \u2014 judge-bias critique.",
    "_appeared_in_sweeps": [
      "sweep_1007_independent_audits"
    ]
  },
  {
    "paper_id": "salesforce-sfr-embedding-2024",
    "title": "SFR-Embedding-Mistral",
    "authors": [
      "Rui Meng",
      "Ye Liu",
      "Shafiq Joty",
      "et al."
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024-01-25",
    "venue": "Salesforce AI Research",
    "url": "https://huggingface.co/Salesforce/SFR-Embedding-Mistral",
    "summary": "Mistral-7B fine-tuned for embedding via E5-Mistral recipe; top MTEB at release. Open weights non-commercial license.",
    "candidate_bill": "Bill_2",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.88,
    "watchlist_tier": null,
    "notes": "Open for research; commercial use restricted.",
    "_appeared_in_sweeps": [
      "sweep_1001_vendor_cards"
    ]
  },
  {
    "paper_id": "self-citation-2024",
    "title": "Can LLMs Produce Faithful Explanations For Fact-Checking?",
    "authors": [
      "Erik Arakelyan",
      "Pasquale Minervini",
      "Pat Verga",
      "Patrick Lewis",
      "Isabelle Augenstein"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024-02-12",
    "venue": "arXiv:2402.07401",
    "url": "https://arxiv.org/abs/2402.07401",
    "summary": "LLM-emitted citations for fact-checks: faithfulness ~60% on average. Even with grounding-prompts, ~25% of citations are post-hoc rationalizations rather than retrieval-driven.",
    "candidate_bill": "Bill_8",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.87,
    "watchlist_tier": null,
    "notes": "Bill 8 \u2014 explicit 'post-hoc rationalization' empirics.",
    "_appeared_in_sweeps": [
      "sweep_1002_citation_faithfulness"
    ]
  },
  {
    "paper_id": "self-rag-2023",
    "title": "Self-RAG: Learning to Retrieve, Generate, and Critique through Self-Reflection",
    "authors": [
      "Akari Asai",
      "Zeqiu Wu",
      "Yizhong Wang",
      "Avirup Sil",
      "Hannaneh Hajishirzi"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2023-10-17",
    "venue": "ICLR 2024 Spotlight; arXiv:2310.11511",
    "url": "https://arxiv.org/abs/2310.11511",
    "summary": "Trains LM to emit reflection tokens (Retrieve, IsRel, IsSup, IsUse). Improves citation precision and factuality vs Llama-2-chat and even GPT-3.5 on bio gen and long-form QA.",
    "candidate_bill": "Bill_6",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.94,
    "watchlist_tier": null,
    "notes": "First popular method to bind generation to citation via training.",
    "_appeared_in_sweeps": [
      "sweep_1002_citation_faithfulness"
    ]
  },
  {
    "paper_id": "self-rag-followup-2024",
    "title": "InstructRAG: Instructing Retrieval-Augmented Generation with Explicit Denoising Rationales",
    "authors": [
      "Zhepei Wei",
      "Wei-Lin Chen",
      "Yu Meng"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024-06-19",
    "venue": "arXiv:2406.13629",
    "url": "https://arxiv.org/abs/2406.13629",
    "summary": "Explicit denoising rationale generation before answer. Reports +8.3% on PopQA, +13.4% on ARC over Self-RAG. Open code.",
    "candidate_bill": "Bill_6",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.85,
    "watchlist_tier": null,
    "notes": "Iteration on Self-RAG.",
    "_appeared_in_sweeps": [
      "sweep_1002_citation_faithfulness"
    ]
  },
  {
    "paper_id": "self-rag-impl-2023",
    "title": "Self-RAG open-source implementation",
    "authors": [
      "Akari Asai",
      "Zeqiu Wu",
      "Yizhong Wang",
      "et al."
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2023-10",
    "venue": "GitHub AkariAsai/self-rag",
    "url": "https://github.com/AkariAsai/self-rag",
    "summary": "Open weights (Llama-2-7B/13B), training code, evaluation. MIT. Reference 'self-reflective RAG' implementation.",
    "candidate_bill": "Bill_2",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.95,
    "watchlist_tier": null,
    "notes": "Bill 2 \u2014 paper + weights + code release.",
    "_appeared_in_sweeps": [
      "sweep_1004_open_source_frameworks"
    ]
  },
  {
    "paper_id": "skywork-quiz-2024",
    "title": "Don't Make Your LLM an Evaluation Benchmark Cheater",
    "authors": [
      "Kun Zhou",
      "Yutao Zhu",
      "Zhipeng Chen",
      "et al."
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2023-11-03",
    "venue": "arXiv:2311.01964",
    "url": "https://arxiv.org/abs/2311.01964",
    "summary": "Demonstrates that test-set inclusion can boost MMLU by 10-20 points without genuine capability. Argues for paraphrase-resistant eval design.",
    "candidate_bill": "Bill_4",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.92,
    "watchlist_tier": null,
    "notes": "Bill 4 \u2014 gaming demo.",
    "_appeared_in_sweeps": [
      "sweep_1003_contamination_heldout"
    ]
  },
  {
    "paper_id": "skywork-rag-2024",
    "title": "Skywork-13B + Skywork-Reward (with retrieval)",
    "authors": [
      "Skywork"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": "HuggingFace Skywork",
    "url": "https://huggingface.co/Skywork",
    "summary": "Open Chinese model line; Skywork-Reward used in RAG-feedback pipelines. Apache 2.0.",
    "candidate_bill": "Bill_7",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.82,
    "watchlist_tier": null,
    "notes": "B7 \u2014 open.",
    "_appeared_in_sweeps": [
      "sweep_1006_chinese_rag_offerings"
    ]
  },
  {
    "paper_id": "smol-rag-china-2024",
    "title": "Smol-LM / Smol-RAG style Chinese small-model RAG",
    "authors": [
      "Various"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": "HuggingFace",
    "url": "https://huggingface.co/collections/",
    "summary": "Chinese small-model RAG: MiniCPM, Qwen-1.5B, BGE-small. Open. Dominant ecosystem in compute-constrained on-device RAG.",
    "candidate_bill": "Bill_7",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.78,
    "watchlist_tier": null,
    "notes": "B7 \u2014 open. PATTERN HOLDS for small models.",
    "_appeared_in_sweeps": [
      "sweep_1006_chinese_rag_offerings"
    ]
  },
  {
    "paper_id": "snapshot-benchmarks-2024",
    "title": "DataPerf: Benchmarks for Data-Centric AI Development",
    "authors": [
      "Mark Mazumder",
      "Colby Banbury",
      "Xiaozhe Yao",
      "et al."
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2022-07-20",
    "venue": "NeurIPS 2023; arXiv:2207.10062",
    "url": "https://arxiv.org/abs/2207.10062",
    "summary": "Data-centric benchmarking; emphasizes data quality over model size. Argues snapshot-based vs rolling eval split.",
    "candidate_bill": "Bill_7",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.78,
    "watchlist_tier": null,
    "notes": "Bill 7 \u2014 frames eval-snapshot tradeoff.",
    "_appeared_in_sweeps": [
      "sweep_1003_contamination_heldout"
    ]
  },
  {
    "paper_id": "snowflake-arctic-embed-2024",
    "title": "Snowflake Arctic Embed",
    "authors": [
      "Snowflake AI Research"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024-04-16",
    "venue": "arXiv:2405.05374",
    "url": "https://arxiv.org/abs/2405.05374",
    "summary": "Suite of open-source embedding models (xs through l), Apache 2.0. Arctic-embed-l beats e5-large-v2 on MTEB at release.",
    "candidate_bill": "Bill_2",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.95,
    "watchlist_tier": null,
    "notes": "Snowflake commercial lab releasing fully open weights + paper.",
    "_appeared_in_sweeps": [
      "sweep_1001_vendor_cards"
    ]
  },
  {
    "paper_id": "step-back-2023",
    "title": "Take a Step Back: Evoking Reasoning via Abstraction in LLMs",
    "authors": [
      "Huaixiu Steven Zheng",
      "Swaroop Mishra",
      "Xinyun Chen",
      "et al."
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2023-10-09",
    "venue": "ICLR 2024; arXiv:2310.06117",
    "url": "https://arxiv.org/abs/2310.06117",
    "summary": "Abstract question first, then retrieve. Improves multi-hop by reducing retrieval-specificity gap. PaLM-2L 27%->68% on Time-QA.",
    "candidate_bill": "Bill_6",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.88,
    "watchlist_tier": null,
    "notes": "Bill 6 \u2014 abstraction-first retrieval.",
    "_appeared_in_sweeps": [
      "sweep_1005_multidoc_synthesis"
    ]
  },
  {
    "paper_id": "stepfun-2024",
    "title": "StepFun Step-2 with retrieval-tools",
    "authors": [
      "StepFun"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": "StepFun API",
    "url": "https://www.stepfun.com/",
    "summary": "Step-2 / Step-1V models. Closed. Long-context (~256K). Retrieval via function-call.",
    "candidate_bill": "Bill_7",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.78,
    "watchlist_tier": null,
    "notes": "B7 \u2014 closed. PATTERN FLIPS in commercial.",
    "_appeared_in_sweeps": [
      "sweep_1006_chinese_rag_offerings"
    ]
  },
  {
    "paper_id": "swarms-2023",
    "title": "Swarms \u2014 agentic RAG framework",
    "authors": [
      "Kye Gomez"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2023",
    "venue": "GitHub kyegomez/swarms",
    "url": "https://github.com/kyegomez/swarms",
    "summary": "Multi-agent orchestration framework. MIT. Less RAG-centric, more multi-agent loops.",
    "candidate_bill": "Bill_2",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.7,
    "watchlist_tier": null,
    "notes": "Bill 2 \u2014 agentic-RAG-adjacent.",
    "_appeared_in_sweeps": [
      "sweep_1004_open_source_frameworks"
    ]
  },
  {
    "paper_id": "table-rag-fail-2024",
    "title": "Why RAG Fails on Tables and Structured Data",
    "authors": [
      "Various"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": "arXiv:2407.04561",
    "url": "https://arxiv.org/abs/2407.04561",
    "summary": "Chunking strategies lose table structure; embedding retrieval treats cell values as bag-of-words. Specialized table-aware RAG required.",
    "candidate_bill": "Bill_8",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.78,
    "watchlist_tier": null,
    "notes": "Bill 8 \u2014 table-RAG failure.",
    "_appeared_in_sweeps": [
      "sweep_1008_failures"
    ]
  },
  {
    "paper_id": "tea-time-rag-2024",
    "title": "TeaTime: Empirical Study on Chinese Domain RAG",
    "authors": [
      "Various Chinese university labs"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": "arXiv:2408.04948",
    "url": "https://arxiv.org/abs/2408.04948",
    "summary": "Empirical Chinese RAG study covering 5 domains (law, medicine, finance, education, gov). Open eval suite.",
    "candidate_bill": "Bill_7",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.78,
    "watchlist_tier": null,
    "notes": "B7 \u2014 open Chinese domain-RAG evaluation.",
    "_appeared_in_sweeps": [
      "sweep_1006_chinese_rag_offerings"
    ]
  },
  {
    "paper_id": "tencent-hunyuan-2024",
    "title": "Tencent Hunyuan retrieval / knowledge engine",
    "authors": [
      "Tencent"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": "Tencent Cloud",
    "url": "https://cloud.tencent.com/product/hunyuan",
    "summary": "Hunyuan-Large (389B MoE) open weights 2024-11. Cloud RAG service closed.",
    "candidate_bill": "Bill_7",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.85,
    "watchlist_tier": null,
    "notes": "B7 \u2014 same split pattern. Open weights, closed cloud.",
    "_appeared_in_sweeps": [
      "sweep_1006_chinese_rag_offerings"
    ]
  },
  {
    "paper_id": "tofu-2024",
    "title": "TOFU: A Task of Fictitious Unlearning for LLMs",
    "authors": [
      "Pratyush Maini",
      "Zhili Feng",
      "Avi Schwarzschild",
      "Zachary C. Lipton",
      "J. Zico Kolter"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024-01-11",
    "venue": "arXiv:2401.06121",
    "url": "https://arxiv.org/abs/2401.06121",
    "summary": "Tests memorization via fictitious-author corpora. Establishes that contamination-measurement requires synthetic / never-seen content. Adopted by RAG-leakage audits.",
    "candidate_bill": "Bill_4",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.88,
    "watchlist_tier": null,
    "notes": "Bill 4 \u2014 methodology contribution.",
    "_appeared_in_sweeps": [
      "sweep_1003_contamination_heldout"
    ]
  },
  {
    "paper_id": "tongyi-deepresearch-2024",
    "title": "Tongyi DeepResearch agent (Qwen-based)",
    "authors": [
      "Alibaba"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024-09",
    "venue": "Qwen blog",
    "url": "https://github.com/Alibaba-NLP/DeepResearch",
    "summary": "Open Qwen-based deep-research agent w/ planning, tool use, retrieval. Apache 2.0. Distinct from closed Bailian product.",
    "candidate_bill": "Bill_7",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.85,
    "watchlist_tier": null,
    "notes": "B7 \u2014 Alibaba open release. PATTERN HOLDS.",
    "_appeared_in_sweeps": [
      "sweep_1006_chinese_rag_offerings"
    ]
  },
  {
    "paper_id": "tongyi-r1-2024",
    "title": "Qwen2.5-Max + Qwen-Max (closed) \u2014 Alibaba's flagship commercial",
    "authors": [
      "Alibaba"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": "Alibaba Cloud",
    "url": "https://qwen.aliyun.com/",
    "summary": "Closed flagship paralleling open 0.5-72B line. Bailian RAG service uses these.",
    "candidate_bill": "Bill_7",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.82,
    "watchlist_tier": null,
    "notes": "B7 \u2014 Alibaba runs both tracks like Meta (Llama open vs MetaAI products closed).",
    "_appeared_in_sweeps": [
      "sweep_1006_chinese_rag_offerings"
    ]
  },
  {
    "paper_id": "tongyi-rag-2024",
    "title": "Tongyi Lingma + Bailian RAG (Alibaba Cloud)",
    "authors": [
      "Alibaba Cloud"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": "Alibaba Cloud product",
    "url": "https://www.alibabacloud.com/product/bailian",
    "summary": "Alibaba's commercial RAG service. Closed; Western-style enterprise offering. Underlies several Chinese enterprise deployments.",
    "candidate_bill": "Bill_7",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.78,
    "watchlist_tier": null,
    "notes": "B7 \u2014 commercial closed. PATTERN FLIPS for vendor-cloud: closed even from Alibaba.",
    "_appeared_in_sweeps": [
      "sweep_1006_chinese_rag_offerings"
    ]
  },
  {
    "paper_id": "trueattribution-2023",
    "title": "Attributable to Identified Sources (AIS): Evaluating Verifiability",
    "authors": [
      "Hannah Rashkin",
      "Vitaly Nikolaev",
      "Matthew Lamm",
      "et al."
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2021-12-23",
    "venue": "TACL 2023; arXiv:2112.12870",
    "url": "https://arxiv.org/abs/2112.12870",
    "summary": "Defines AIS framework: claim attributable to source iff knowledgeable reader would judge so. Foundational evaluation protocol for grounded text generation.",
    "candidate_bill": "Bill_5",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.95,
    "watchlist_tier": null,
    "notes": "Bill 5\u2605 \u2014 cited in ~every faithfulness paper since.",
    "_appeared_in_sweeps": [
      "sweep_1002_citation_faithfulness"
    ]
  },
  {
    "paper_id": "trueteacher-2023",
    "title": "TrueTeacher: Learning Factual Consistency Evaluation",
    "authors": [
      "Zorik Gekhman",
      "Jonathan Herzig",
      "Roee Aharoni",
      "Chen Elkind",
      "Idan Szpektor"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2023-05-18",
    "venue": "EMNLP 2023; arXiv:2305.11171",
    "url": "https://arxiv.org/abs/2305.11171",
    "summary": "Google distilled NLI-factuality judge from LLMs. Strong on summary faithfulness benchmarks (SummEval, TRUE). Used in evaluating RAG faithfulness.",
    "candidate_bill": "Bill_7",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.88,
    "watchlist_tier": null,
    "notes": "Alternative open faithfulness judge.",
    "_appeared_in_sweeps": [
      "sweep_1002_citation_faithfulness"
    ]
  },
  {
    "paper_id": "trulens-2023",
    "title": "TruLens: open-source LLM evaluation",
    "authors": [
      "TruEra"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2023",
    "venue": "GitHub truera/trulens",
    "url": "https://github.com/truera/trulens",
    "summary": "RAG/agent eval library: feedback functions (groundedness, relevance). Apache 2.0. Production-oriented.",
    "candidate_bill": "Bill_7",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.85,
    "watchlist_tier": null,
    "notes": "Bill 7 \u2014 open eval tool.",
    "_appeared_in_sweeps": [
      "sweep_1004_open_source_frameworks"
    ]
  },
  {
    "paper_id": "trust-but-verify-2023",
    "title": "Evaluating Verifiability in Generative Search Engines",
    "authors": [
      "Nelson F. Liu",
      "Tianyi Zhang",
      "Percy Liang"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2023-04-19",
    "venue": "EMNLP 2023; arXiv:2304.09848",
    "url": "https://arxiv.org/abs/2304.09848",
    "summary": "Human-eval of Bing Chat / NeevaAI / perplexity.ai / YouChat. Average ~52% citation precision (cited source supports claim). Only 73% fluent. First systematic critique of consumer 'cite-everything' interfaces.",
    "candidate_bill": "Bill_8",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.95,
    "watchlist_tier": null,
    "notes": "Bill 8 \u2014 landmark critique. Citations look authoritative but often miss.",
    "_appeared_in_sweeps": [
      "sweep_1002_citation_faithfulness"
    ]
  },
  {
    "paper_id": "txtai-2020",
    "title": "txtai \u2014 all-in-one embeddings database",
    "authors": [
      "NeuML"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2020",
    "venue": "GitHub neuml/txtai",
    "url": "https://github.com/neuml/txtai",
    "summary": "Apache 2.0 embeddings + SQL hybrid + workflows. Lightweight alternative to LangChain. Mature, less hyped.",
    "candidate_bill": "Bill_2",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.85,
    "watchlist_tier": null,
    "notes": "Bill 2 \u2014 early stable open framework.",
    "_appeared_in_sweeps": [
      "sweep_1004_open_source_frameworks"
    ]
  },
  {
    "paper_id": "txtai-rag-2023",
    "title": "Docling \u2014 IBM document parser for RAG",
    "authors": [
      "IBM"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": "GitHub DS4SD/docling",
    "url": "https://github.com/DS4SD/docling",
    "summary": "IBM open PDF / Word / Powerpoint parser specifically for RAG ingestion. MIT. Strong table extraction. Used by IBM Granite RAG.",
    "candidate_bill": "Bill_2",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.85,
    "watchlist_tier": null,
    "notes": "Bill 2 \u2014 IBM enterprise contribution to open RAG.",
    "_appeared_in_sweeps": [
      "sweep_1004_open_source_frameworks"
    ]
  },
  {
    "paper_id": "us-aisi-2024",
    "title": "US AI Safety Institute (USAISI) frontier-model evaluations",
    "authors": [
      "NIST USAISI"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": "USAISI",
    "url": "https://www.nist.gov/aisi",
    "summary": "US government counterpart to UK AISI. Co-evals with OpenAI, Anthropic. Adopts Inspect framework. Publicly available RAG-specific numbers are limited.",
    "candidate_bill": "Bill_7",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.85,
    "watchlist_tier": null,
    "notes": "Bill 7 \u2014 government independent.",
    "_appeared_in_sweeps": [
      "sweep_1007_independent_audits"
    ]
  },
  {
    "paper_id": "vectara-hallucination-leaderboard-2023",
    "title": "Vectara Hallucination Leaderboard",
    "authors": [
      "Vectara"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2023-11",
    "venue": "GitHub vectara/hallucination-leaderboard",
    "url": "https://github.com/vectara/hallucination-leaderboard",
    "summary": "Rolling leaderboard via HHEM model. As of 2024 \u2014 Gemini-2.0-Flash 0.7%, GPT-4o 1.5%, Claude-3.5-Sonnet 4.6%, Llama-3.1-70B 5.5%. Reference open hallucination leaderboard.",
    "candidate_bill": "Bill_7",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.95,
    "watchlist_tier": null,
    "notes": "Bill 7 \u2014 most cited cross-vendor hallucination metric.",
    "_appeared_in_sweeps": [
      "sweep_1007_independent_audits"
    ]
  },
  {
    "paper_id": "vectara-platform-2024",
    "title": "Vectara managed RAG-as-a-service",
    "authors": [
      "Vectara"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": "Vectara product",
    "url": "https://www.vectara.com/",
    "summary": "End-to-end RAG: Boomerang embedding model, Slingshot reranker, HHEM hallucination eval model (open). Vectara publishes Hallucination Leaderboard. Boomerang/Slingshot closed; HHEM open on HuggingFace.",
    "candidate_bill": "Bill_7",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.92,
    "watchlist_tier": null,
    "notes": "Bill 7 (eval transparency) \u2014 Vectara among most transparent commercial RAG vendors.",
    "_appeared_in_sweeps": [
      "sweep_1001_vendor_cards"
    ]
  },
  {
    "paper_id": "vector-search-saturation-2024",
    "title": "Why Vector Search Alone Is Not Enough",
    "authors": [
      "Various"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": "industry blogs + arXiv:2406.19314 (relevant)",
    "url": "https://arxiv.org/abs/2405.20389",
    "summary": "Pure vector search fails on numeric/temporal/categorical filters. Hybrid (BM25 + vector + structured) required for production.",
    "candidate_bill": "Bill_8",
    "candidate_meta_cost": null,
    "verdict": "rebuttal_paper",
    "confidence": 0.85,
    "watchlist_tier": null,
    "notes": "Bill 8 \u2014 vector-only failure.",
    "_appeared_in_sweeps": [
      "sweep_1008_failures"
    ]
  },
  {
    "paper_id": "verba-2023",
    "title": "Verba: open-source RAG chatbot by Weaviate",
    "authors": [
      "Weaviate"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2023-10",
    "venue": "GitHub weaviate/Verba",
    "url": "https://github.com/weaviate/Verba",
    "summary": "Out-of-box RAG UI: docker-compose up, ingest docs, chat. Demo / starter kit. BSD-3 license.",
    "candidate_bill": "Bill_2",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.85,
    "watchlist_tier": null,
    "notes": "Bill 2 \u2014 vendor reference impl.",
    "_appeared_in_sweeps": [
      "sweep_1004_open_source_frameworks"
    ]
  },
  {
    "paper_id": "verification-claim-decomposition-2024",
    "title": "Core: Robust Factual Precision Scoring with Informative Sub-Claim Identification",
    "authors": [
      "Zhengping Jiang",
      "Jingyu Zhang",
      "Nathaniel Weir",
      "et al."
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024-07-04",
    "venue": "arXiv:2407.03572",
    "url": "https://arxiv.org/abs/2407.03572",
    "summary": "Decomposes long-form into INFORMATIVE atomic claims (filter trivial), reduces gaming of FActScore. Strong correlation with human judgment.",
    "candidate_bill": "Bill_7",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.85,
    "watchlist_tier": null,
    "notes": "Refinement of FActScore.",
    "_appeared_in_sweeps": [
      "sweep_1002_citation_faithfulness"
    ]
  },
  {
    "paper_id": "voyage-ai-2024",
    "title": "Voyage AI domain-specific embedding suite",
    "authors": [
      "Voyage AI"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": "Voyage AI",
    "url": "https://www.voyageai.com/",
    "summary": "voyage-3, voyage-finance-2, voyage-law-2, voyage-code-3. MTEB-top. Closed weights, API only. Acquired by MongoDB Oct 2025.",
    "candidate_bill": "Bill_2",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.88,
    "watchlist_tier": null,
    "notes": "Top MTEB scorer through 2024-25; closed.",
    "_appeared_in_sweeps": [
      "sweep_1001_vendor_cards"
    ]
  },
  {
    "paper_id": "weaviate-2024",
    "title": "Weaviate vector DB + Verba",
    "authors": [
      "Weaviate"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": "Weaviate GitHub",
    "url": "https://github.com/weaviate/weaviate",
    "summary": "Open-source vector DB; Apache 2.0. Hybrid BM25+vector, multi-tenant, modules for OpenAI/Cohere/HuggingFace embeddings. Verba (open RAG UI) launched 2024.",
    "candidate_bill": "Bill_2",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.95,
    "watchlist_tier": null,
    "notes": "Bill 2 (artifact open) \u2014 strong: weights, code, docker.",
    "_appeared_in_sweeps": [
      "sweep_1001_vendor_cards"
    ]
  },
  {
    "paper_id": "wenxin-vs-openai-2024",
    "title": "Comparison: Chinese-cloud RAG vs Western-cloud RAG disclosure",
    "authors": [
      "analysis derived from product docs"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": "synthesis record",
    "url": "n/a",
    "summary": "Comparison shows: Baidu Wenxin / Tencent Hunyuan / Doubao cloud RAG are AS CLOSED as OpenAI Assistants v2 and Google Vertex RAG. Both sides' cloud-managed RAG products withhold architecture.",
    "candidate_bill": "Bill_7",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.8,
    "watchlist_tier": null,
    "notes": "B7 \u2014 CLOUD PRODUCTS DON'T BRIDGE-INVERT. Both ecosystems' cloud offerings are equally closed.",
    "_appeared_in_sweeps": [
      "sweep_1006_chinese_rag_offerings"
    ]
  },
  {
    "paper_id": "westlake-rag-2024",
    "title": "Westlake-RAG (Chinese RAG distillation research)",
    "authors": [
      "Westlake University NLP"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": "arXiv/HF",
    "url": "https://huggingface.co/Westlake-NLP",
    "summary": "Academic Chinese open RAG distillation papers. Smaller community contribution.",
    "candidate_bill": "Bill_7",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.75,
    "watchlist_tier": null,
    "notes": "B7 \u2014 academic open. Long tail of open Chinese RAG research.",
    "_appeared_in_sweeps": [
      "sweep_1006_chinese_rag_offerings"
    ]
  },
  {
    "paper_id": "wiki-as-eval-2024",
    "title": "When Benchmarks are Targets: Revealing the Sensitivity of Large Language Model Leaderboards",
    "authors": [
      "Norah Alzahrani",
      "Hisham Abdullah Alyahya",
      "Yazeed Alnumay",
      "et al."
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024-02-01",
    "venue": "arXiv:2402.01781",
    "url": "https://arxiv.org/abs/2402.01781",
    "summary": "Shows MMLU/HellaSwag leaderboards extremely sensitive to minor prompt perturbations. By implication: BEIR/MTEB also brittle when models near-memorized.",
    "candidate_bill": "Bill_4",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.85,
    "watchlist_tier": null,
    "notes": "Bill 4 \u2014 leaderboard fragility evidence.",
    "_appeared_in_sweeps": [
      "sweep_1003_contamination_heldout"
    ]
  },
  {
    "paper_id": "wiki-leakage-2023",
    "title": "Investigating Data Contamination in Modern Benchmarks for Large Language Models",
    "authors": [
      "Chunyuan Deng",
      "Yilun Zhao",
      "Xiangru Tang",
      "Mark Gerstein",
      "Arman Cohan"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2023-11-16",
    "venue": "arXiv:2311.09783",
    "url": "https://arxiv.org/abs/2311.09783",
    "summary": "Tests GPT-4, Claude-2, GPT-3.5 for direct contamination of MMLU, HellaSwag, etc. Found 12-23% test sets memorized. RAG/retrieval benchmarks particularly vulnerable (BEIR subsets).",
    "candidate_bill": "Bill_4",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.93,
    "watchlist_tier": null,
    "notes": "Bill 4 \u2014 empirical contamination measurement.",
    "_appeared_in_sweeps": [
      "sweep_1003_contamination_heldout"
    ]
  },
  {
    "paper_id": "yi-agent-2024",
    "title": "Yi-1.5 Agent + 01.AI retrieval",
    "authors": [
      "01.AI"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024-05",
    "venue": "01.AI / GitHub 01-ai",
    "url": "https://github.com/01-ai/Yi",
    "summary": "Yi-1.5 models Apache 2.0. No managed RAG service; tools/function-call enable BYO retrieval. Yi-Lightning (closed) is the commercial path.",
    "candidate_bill": "Bill_7",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.85,
    "watchlist_tier": null,
    "notes": "B7 \u2014 same as DeepSeek pattern. Weights open, retrieval-product closed.",
    "_appeared_in_sweeps": [
      "sweep_1006_chinese_rag_offerings"
    ]
  },
  {
    "paper_id": "you-com-2024",
    "title": "You.com Smart and Research modes",
    "authors": [
      "You.com"
    ],
    "affiliations": [],
    "country_region": null,
    "date": "2024",
    "venue": "You.com product",
    "url": "https://you.com/",
    "summary": "Multi-model RAG with citation. Smart=fast, Research=deep. Architecture closed; uses third-party LLMs (Claude, GPT-4, Gemini). Retriever proprietary.",
    "candidate_bill": "Bill_1",
    "candidate_meta_cost": null,
    "verdict": "known_bill",
    "confidence": 0.75,
    "watchlist_tier": null,
    "notes": "Aggregator surface; little technical disclosure.",
    "_appeared_in_sweeps": [
      "sweep_1001_vendor_cards"
    ]
  }
]