diff --git a/.gitignore b/.gitignore
index 6c80c95c3..ac2ff94c9 100644
--- a/.gitignore
+++ b/.gitignore
@@ -17,3 +17,5 @@ surfsense_web/test-results/
 surfsense_web/blob-report/
 hermes-agent
 hermes-agent/
+
+content_research/
diff --git a/surfsense_backend/app/etl_pipeline/etl_pipeline_service.py b/surfsense_backend/app/etl_pipeline/etl_pipeline_service.py
index d45bd780c..7fe3c94df 100644
--- a/surfsense_backend/app/etl_pipeline/etl_pipeline_service.py
+++ b/surfsense_backend/app/etl_pipeline/etl_pipeline_service.py
@@ -134,12 +134,92 @@ class EtlPipelineService:
         else:
             raise EtlServiceUnavailableError(f"Unknown ETL_SERVICE: {etl_service}")
 
+        # When the operator opts into vision-LLM at ingest, walk the
+        # original file's embedded images and append a structured
+        # "Image Content" section. The parser's own OCR (Docling
+        # do_ocr=True, Azure DI prebuilt-read, etc.) handles text-in-
+        # image; this side handles the *visual* description which the
+        # parsers all drop today.
+        content = await self._maybe_append_picture_descriptions(request, content)
+
         return EtlResult(
             markdown_content=content,
             etl_service=etl_service,
             content_type="document",
         )
 
+    async def _maybe_append_picture_descriptions(
+        self, request: EtlRequest, markdown: str
+    ) -> str:
+        if self._vision_llm is None:
+            return markdown
+
+        from app.etl_pipeline.picture_describer import (
+            describe_pictures,
+            merge_descriptions_into_markdown,
+        )
+
+        # Per-image OCR runner: re-feed each extracted image through
+        # the ETL pipeline *as a standalone image* (no vision LLM, so
+        # the IMAGE branch falls through to the document parser, which
+        # OCRs the image with the configured backend -- Docling /
+        # Azure DI / LlamaCloud). This gives us per-image OCR text
+        # attached to the inline image block, in addition to the
+        # page-level OCR that the parser already merges into the main
+        # markdown stream. The fresh sub-service gets vision_llm=None
+        # so this call cannot recurse back into picture_describer.
+        async def _ocr_image(image_path: str, image_name: str) -> str:
+            try:
+                sub = EtlPipelineService(vision_llm=None)
+                ocr_result = await sub.extract(
+                    EtlRequest(file_path=image_path, filename=image_name)
+                )
+            except (
+                EtlUnsupportedFileError,
+                EtlServiceUnavailableError,
+            ) as exc:
+                # Common case: the configured ETL service can't OCR
+                # this image format (or no service is configured at
+                # all). Don't spam warnings -- just no OCR for it.
+                logging.debug(
+                    "Skipping per-image OCR for %s: %s", image_name, exc
+                )
+                return ""
+            return ocr_result.markdown_content
+
+        try:
+            result = await describe_pictures(
+                request.file_path,
+                request.filename,
+                self._vision_llm,
+                ocr_runner=_ocr_image,
+            )
+        except Exception:
+            # Picture description is additive; never let it fail an
+            # otherwise-successful document extraction.
+            logging.warning(
+                "Picture description failed for %s, returning parser output unchanged",
+                request.filename,
+                exc_info=True,
+            )
+            return markdown
+
+        if not result.descriptions:
+            return markdown
+
+        merged = merge_descriptions_into_markdown(markdown, result)
+        logging.info(
+            "Vision LLM described %d image(s) in %s "
+            "(skipped: %d small / %d large / %d duplicate, %d failed)",
+            len(result.descriptions),
+            request.filename,
+            result.skipped_too_small,
+            result.skipped_too_large,
+            result.skipped_duplicate,
+            result.failed,
+        )
+        return merged
+
     async def _extract_with_llamacloud(self, request: EtlRequest) -> str:
         """Try Azure Document Intelligence first (when configured) then LlamaCloud.
 
diff --git a/surfsense_backend/app/etl_pipeline/parsers/vision_llm.py b/surfsense_backend/app/etl_pipeline/parsers/vision_llm.py
index c80fbca0a..8ae0715f3 100644
--- a/surfsense_backend/app/etl_pipeline/parsers/vision_llm.py
+++ b/surfsense_backend/app/etl_pipeline/parsers/vision_llm.py
@@ -4,12 +4,34 @@ import os
 
 from langchain_core.messages import HumanMessage
 
+# Single-shot prompt used by standalone image uploads (.png/.jpg/etc).
+# A standalone image IS the document, so we want everything: visual
+# content plus any text the model can read off it. The output is
+# combined markdown that the chunker treats as the full document body.
 _PROMPT = (
     "Describe this image in markdown. "
     "Transcribe any visible text verbatim. "
     "Be concise but complete — let the image content guide the level of detail."
 )
 
+# Per-image-in-PDF prompt. Here the image is *inside* a larger
+# document, and the ETL service (Docling/Azure DI/LlamaCloud/...) is
+# already running OCR over the whole page — including text rendered
+# into images. So we explicitly tell the model NOT to transcribe text
+# and to focus only on visual interpretation. This avoids paying
+# output tokens for OCR content the ETL pipeline already captured.
+_DESCRIPTION_PROMPT = (
+    "Describe what this image visually depicts in concise markdown. "
+    "Focus on visual content — anatomy, structures, charts, diagrams, "
+    "spatial relationships, colors, modality (e.g. axial CT, ECG strip, "
+    "histology slide), and any clinically or structurally relevant "
+    "findings.\n\n"
+    "Do NOT transcribe text from the image. Any text in the image "
+    "(axis labels, annotations, scale bars, lab values, etc.) is "
+    "already extracted by a separate OCR pipeline; duplicating it "
+    "here would be redundant. Stick to the visual interpretation."
+)
+
 _MAX_IMAGE_BYTES = (
     5 * 1024 * 1024
 )  # 5 MB (Anthropic Claude's limit, the most restrictive)
@@ -47,11 +69,10 @@ def _image_to_data_url(file_path: str) -> str:
     return f"data:{mime_type};base64,{encoded}"
 
 
-async def parse_with_vision_llm(file_path: str, filename: str, llm) -> str:
-    data_url = _image_to_data_url(file_path)
+async def _invoke_vision(llm, prompt: str, data_url: str, filename: str) -> str:
     message = HumanMessage(
         content=[
-            {"type": "text", "text": _PROMPT},
+            {"type": "text", "text": prompt},
             {"type": "image_url", "image_url": {"url": data_url}},
         ]
     )
@@ -62,3 +83,36 @@ async def parse_with_vision_llm(file_path: str, filename: str, llm) -> str:
     if not text or not text.strip():
         raise ValueError(f"Vision LLM returned empty content for {filename}")
     return text.strip()
+
+
+async def parse_with_vision_llm(file_path: str, filename: str, llm) -> str:
+    """Single-shot: returns combined markdown for a standalone image upload.
+
+    Used when the operator uploads an image file directly (jpg/png/etc).
+    The image is the document, so the prompt asks for both visual
+    description and verbatim text in one go.
+    """
+    data_url = _image_to_data_url(file_path)
+    return await _invoke_vision(llm, _PROMPT, data_url, filename)
+
+
+async def parse_image_for_description(
+    file_path: str, filename: str, llm
+) -> str:
+    """Visual-description-only call for per-image-in-PDF use.
+
+    Used by ``picture_describer`` when an image is embedded inside a
+    larger document. Returns a markdown description of what the image
+    visually depicts; deliberately does NOT include text-in-image OCR
+    because the ETL service (Docling, Azure DI, LlamaCloud, ...) is
+    already running OCR over the entire page and would duplicate that
+    text content.
+    """
+    data_url = _image_to_data_url(file_path)
+    return await _invoke_vision(llm, _DESCRIPTION_PROMPT, data_url, filename)
+
+
+__all__ = [
+    "parse_image_for_description",
+    "parse_with_vision_llm",
+]
diff --git a/surfsense_backend/app/etl_pipeline/picture_describer.py b/surfsense_backend/app/etl_pipeline/picture_describer.py
new file mode 100644
index 000000000..f6bda2d4e
--- /dev/null
+++ b/surfsense_backend/app/etl_pipeline/picture_describer.py
@@ -0,0 +1,678 @@
+"""Extract embedded images from PDFs, describe them, and inject the
+descriptions inline into the parser's markdown.
+
+When the operator passes ``use_vision_llm=True`` for a PDF, the document
+parsers (DOCLING / LLAMACLOUD / Azure DI / UNSTRUCTURED) extract text
+but mostly drop the actual image content -- a CT scan inside a clinical
+PDF becomes (at best) a ``<!-- image -->`` placeholder in the markdown,
+and the caption text below it.
+
+This module fills that gap. After the document parser produces markdown
+text, we:
+
+1. Walk the original PDF with :mod:`pypdf`, pulling out each embedded
+   image (deduped by sha256, size-capped to match the vision LLM's own
+   limits).
+2. Run the vision LLM on each unique image (visual description) and,
+   in parallel when an OCR runner is provided, re-feed the same image
+   through the ETL service for per-image OCR.
+3. **Inject** a horizontal-rule-delimited markdown section -- with
+   named "OCR text" and "Visual description" sub-sections -- where the
+   image actually appears in the parser's markdown. Two splice modes,
+   chosen by which marker the parser emitted:
+
+   - **Replace** Docling-style ``<!-- image -->`` placeholders (and an
+     optional ``Image: <filename>`` caption line). The placeholder
+     carries no useful content of its own, so we substitute our block
+     for it.
+   - **Append after** layout-aware ``<figure>...</figure>`` blocks
+     (Azure DI ``prebuilt-layout``, LlamaCloud premium). Those blocks
+     already contain parser-extracted chart values / OCR'd labels /
+     captions, which are themselves useful for retrieval -- so we
+     PRESERVE the figure verbatim and add our vision-LLM block
+     immediately after it. The chunk then contains both the parser's
+     structured numbers AND the VLM's semantic interpretation.
+
+   Either way, the image content stays in context with the surrounding
+   document body rather than getting orphaned at the end -- crucial for
+   retrieval, where a single chunk should contain the question, the
+   image content, and the answer options together.
+
+If no placeholders, figures, or captions can be matched (e.g. an
+unusual parser output), we fall back to appending an
+``## Image Content`` section so no image content is silently lost.
+"""
+
+from __future__ import annotations
+
+import asyncio
+import contextlib
+import hashlib
+import logging
+import re
+import tempfile
+from collections.abc import Awaitable, Callable
+from dataclasses import dataclass, field
+from pathlib import Path
+from typing import Any
+
+# Type alias for the OCR callback. Takes (file_path, filename), returns
+# the OCR'd markdown text -- or empty string if no text was found, or
+# raises if OCR failed unrecoverably (which the describer catches and
+# treats as "no OCR for this image" rather than failing the whole doc).
+OcrRunner = Callable[[str, str], Awaitable[str]]
+
+logger = logging.getLogger(__name__)
+
+
+# Bound how many vision LLM calls we make in parallel for a single
+# document. Vision models are typically rate-limited; 4 concurrent
+# calls is a safe default that respects most provider limits while
+# keeping wall-clock manageable for image-heavy PDFs.
+_VISION_CONCURRENCY = 4
+
+# Match parse_with_vision_llm's per-image cap so we don't even attempt
+# images that the vision LLM would reject anyway (Anthropic's 5 MB
+# limit is the most restrictive among the major providers).
+_MAX_IMAGE_BYTES = 5 * 1024 * 1024
+
+# Skip degenerate images: tracking pixels, very small decorative dots,
+# scanner-introduced artefacts. We can't cheaply check pixel dimensions
+# without decoding the image, so we approximate: anything under 1 KB is
+# almost certainly not informative content.
+_MIN_IMAGE_BYTES = 1024
+
+
+@dataclass
+class PictureDescription:
+    """A single extracted image with its visual description and (optionally) OCR.
+
+    Two content fields by design, each produced by the *right* tool:
+
+    - ``description``: the vision LLM's visual interpretation. What the
+      image depicts (anatomy, charts, layout, etc.) -- the semantic
+      content that only a vision model can produce.
+    - ``ocr_text``: text-in-image extracted by re-feeding the image
+      through the configured ETL service (Docling/Azure DI/LlamaCloud)
+      *as if it were a standalone image upload*. Specialist OCR engine,
+      per-image attribution, no vision LLM tokens spent on text. None
+      when no OCR was requested or OCR found no text.
+    """
+
+    page_number: int                # 1-indexed
+    ordinal_in_page: int            # 0-indexed within the page
+    name: str                       # name pypdf assigned (e.g. "Im0")
+    sha256: str                     # hash of the raw image bytes
+    description: str                # visual description (markdown)
+    ocr_text: str | None = None     # OCR text from the ETL service, if any
+
+
+@dataclass
+class PictureExtractionResult:
+    """Aggregate result of extracting all pictures from a document."""
+
+    descriptions: list[PictureDescription] = field(default_factory=list)
+    skipped_too_small: int = 0
+    skipped_too_large: int = 0
+    skipped_duplicate: int = 0
+    failed: int = 0
+
+    @property
+    def has_content(self) -> bool:
+        return bool(self.descriptions)
+
+
+def _is_pdf(filename: str) -> bool:
+    return filename.lower().endswith(".pdf")
+
+
+def _pick_suffix(name: str) -> str:
+    lower = name.lower()
+    for ext in (".jpg", ".jpeg", ".png", ".gif", ".bmp", ".tiff", ".tif", ".webp"):
+        if lower.endswith(ext):
+            return ".jpeg" if ext == ".jpg" else ext
+    return ".png"
+
+
+def _extract_pdf_images(file_path: str) -> list[tuple[int, int, str, bytes]]:
+    """Pull every embedded image out of a PDF.
+
+    Returns ``(page_number_1_indexed, ordinal_in_page, name, bytes)``.
+    Per-page and per-image failures are logged and skipped -- one bad
+    image must not fail the whole document.
+    """
+
+    from pypdf import PdfReader
+
+    out: list[tuple[int, int, str, bytes]] = []
+    try:
+        reader = PdfReader(file_path)
+    except Exception:
+        logger.warning(
+            "pypdf failed to open %s for image extraction",
+            file_path,
+            exc_info=True,
+        )
+        return out
+
+    for page_idx, page in enumerate(reader.pages):
+        try:
+            images = list(page.images)
+        except Exception:
+            logger.warning(
+                "pypdf failed to enumerate images on page %d of %s",
+                page_idx + 1,
+                file_path,
+                exc_info=True,
+            )
+            continue
+        for img_idx, img in enumerate(images):
+            try:
+                name = getattr(img, "name", None) or f"page{page_idx + 1}_img{img_idx}"
+                data = img.data
+            except Exception:
+                logger.warning(
+                    "pypdf failed to read image %d on page %d of %s",
+                    img_idx,
+                    page_idx + 1,
+                    file_path,
+                    exc_info=True,
+                )
+                continue
+            out.append((page_idx + 1, img_idx, name, data))
+    return out
+
+
+async def _describe_one(
+    page_number: int,
+    ordinal: int,
+    name: str,
+    sha256: str,
+    data: bytes,
+    vision_llm: Any,
+    semaphore: asyncio.Semaphore,
+    ocr_runner: OcrRunner | None,
+) -> PictureDescription | None:
+    from app.etl_pipeline.parsers.vision_llm import parse_image_for_description
+
+    suffix = _pick_suffix(name)
+    # NamedTemporaryFile + delete=False because the vision-LLM helper
+    # and the OCR runner each open the path themselves; we clean up in
+    # the finally. Same temp file feeds both, which is correct: vision
+    # LLM and OCR are looking at the same image, just asking different
+    # questions of it.
+    with tempfile.NamedTemporaryFile(suffix=suffix, delete=False) as tmp:
+        tmp.write(data)
+        tmp_path = tmp.name
+    try:
+        async with semaphore:
+            tasks: list[Awaitable[Any]] = [
+                parse_image_for_description(tmp_path, name, vision_llm),
+            ]
+            if ocr_runner is not None:
+                tasks.append(ocr_runner(tmp_path, name))
+
+            # return_exceptions=True so a failure in one branch (most
+            # often OCR) doesn't poison the other.
+            results = await asyncio.gather(*tasks, return_exceptions=True)
+
+        description_result = results[0]
+        if isinstance(description_result, BaseException):
+            logger.warning(
+                "Vision LLM failed for image %s on page %d, skipping",
+                name,
+                page_number,
+                exc_info=description_result,
+            )
+            return None
+        description = str(description_result)
+
+        ocr_text: str | None = None
+        if ocr_runner is not None and len(results) > 1:
+            ocr_result = results[1]
+            if isinstance(ocr_result, BaseException):
+                logger.warning(
+                    "Per-image OCR failed for image %s on page %d, "
+                    "omitting OCR field for this image",
+                    name,
+                    page_number,
+                    exc_info=ocr_result,
+                )
+            else:
+                stripped = str(ocr_result).strip()
+                # Empty OCR (or whitespace-only) means the OCR engine
+                # found no text in this image. Record that as None so
+                # the rendered block doesn't include a useless empty tag.
+                ocr_text = stripped or None
+    finally:
+        with contextlib.suppress(OSError):
+            Path(tmp_path).unlink()
+
+    return PictureDescription(
+        page_number=page_number,
+        ordinal_in_page=ordinal,
+        name=name,
+        sha256=sha256,
+        description=description,
+        ocr_text=ocr_text,
+    )
+
+
+async def describe_pictures(
+    file_path: str,
+    filename: str,
+    vision_llm: Any,
+    *,
+    ocr_runner: OcrRunner | None = None,
+) -> PictureExtractionResult:
+    """Extract embedded images from a document and describe each via vision LLM.
+
+    When ``ocr_runner`` is provided, each image is also passed to it
+    (in parallel with the vision LLM) and the returned text is recorded
+    in :attr:`PictureDescription.ocr_text`. The runner is typically a
+    closure over a vision-LLM-less ``EtlPipelineService`` -- this lets
+    the same OCR engine that processes standalone image uploads
+    (Docling/Azure DI/LlamaCloud) also process embedded-in-PDF images,
+    giving per-image OCR attribution alongside the page-level OCR that
+    the parser already does.
+
+    Currently PDF-only. For non-PDF documents this returns an empty
+    result and the caller should leave the parser's markdown untouched.
+    """
+
+    result = PictureExtractionResult()
+    if not _is_pdf(filename) or vision_llm is None:
+        return result
+
+    raw_images = _extract_pdf_images(file_path)
+    if not raw_images:
+        return result
+
+    seen_hashes: set[str] = set()
+    eligible: list[tuple[int, int, str, str, bytes]] = []
+    for page_number, ordinal, name, data in raw_images:
+        if len(data) > _MAX_IMAGE_BYTES:
+            result.skipped_too_large += 1
+            continue
+        if len(data) < _MIN_IMAGE_BYTES:
+            result.skipped_too_small += 1
+            continue
+        sha = hashlib.sha256(data).hexdigest()
+        if sha in seen_hashes:
+            result.skipped_duplicate += 1
+            continue
+        seen_hashes.add(sha)
+        eligible.append((page_number, ordinal, name, sha, data))
+
+    if not eligible:
+        return result
+
+    semaphore = asyncio.Semaphore(_VISION_CONCURRENCY)
+    tasks = [
+        _describe_one(p, o, n, sha, d, vision_llm, semaphore, ocr_runner)
+        for (p, o, n, sha, d) in eligible
+    ]
+    descriptions = await asyncio.gather(*tasks)
+    for desc in descriptions:
+        if desc is None:
+            result.failed += 1
+        else:
+            result.descriptions.append(desc)
+    return result
+
+
+# ---------------------------------------------------------------------------
+# Rendering: build the per-image markdown block + inject inline.
+# ---------------------------------------------------------------------------
+
+
+def _format_image_block(
+    name: str,
+    description: str,
+    ocr_text: str | None = None,
+) -> str:
+    """Render the per-image block as a horizontal-rule-delimited section.
+
+    Why no blockquote / no raw HTML / no XML?
+    -----------------------------------------
+    We tried each in turn and each failed in the document viewer:
+
+    - **Raw HTML / XML** (``<image>...</image>``): unknown elements
+      have no render rules in Streamdown or PlateJS, so the content
+      survives in the markdown source but is invisible to humans.
+    - **Blockquote with nested blocks**: nested fenced code blocks,
+      bullet lists, numbered lists, tables -- any *block* element
+      inside a ``>``-prefixed blockquote -- gets evicted by Streamdown
+      / remark, dropping everything after it onto the document level.
+      The vision LLM happily produces bulleted descriptions, so this
+      hit the viewer in practice.
+
+    A horizontal-rule-delimited section, by contrast, contains only
+    standard top-level markdown -- bold labels and free-form body --
+    so the description's native markdown (lists, prose, tables) all
+    renders natively in every renderer.
+
+    Layout (OCR section omitted when ``ocr_text`` is None/empty):
+
+        ---
+
+        **Embedded image:** `MM-130-a.jpeg`
+
+        **OCR text:**
+        Slice 24 / 60
+        L
+        R
+
+        **Visual description:**
+
+        - Axial contrast-enhanced CT showing a large cystic mass...
+        - Mass effect on the adjacent stomach.
+
+        ---
+
+    Still LLM-friendly: the ``**Embedded image:** `<filename>``` prefix
+    is unique and trivially regex-able (``^\\*\\*Embedded image:\\*\\* `(.+?)`$``).
+
+    Returned with leading and trailing blank-line padding so the rules
+    never merge with adjacent paragraphs after splicing.
+    """
+
+    parts: list[str] = [f"**Embedded image:** `{name}`"]
+
+    if ocr_text and ocr_text.strip():
+        # Bold "OCR text:" label with trailing two spaces (=> <br>) so
+        # the first OCR line sits directly under the label rather than
+        # forcing a paragraph break that some renderers would style
+        # differently. Subsequent OCR lines also use trailing two spaces
+        # for hard breaks, so multi-line OCR renders line-by-line
+        # without needing a (fragile) fenced code block.
+        ocr_clean_lines = [
+            ln.rstrip() for ln in ocr_text.strip().splitlines() if ln.strip()
+        ]
+        parts.append("")
+        parts.append("**OCR text:**  ")
+        for i, raw in enumerate(ocr_clean_lines):
+            suffix = "" if i == len(ocr_clean_lines) - 1 else "  "
+            parts.append(f"{raw}{suffix}")
+
+    parts.append("")
+    parts.append("**Visual description:**")
+    parts.append("")
+    parts.append(description.strip())
+
+    body = "\n".join(parts)
+    # Wrap with blank lines + horizontal rules so the block is clearly
+    # delimited from surrounding paragraphs and survives splicing into
+    # the middle of any markdown stream.
+    return "\n\n---\n\n" + body + "\n\n---\n\n"
+
+
+# Patterns we'll try to splice into. Each pattern captures the
+# original-PDF filename when one is available (group 1).
+#
+# Replace-style markers (the matched span is substituted with our block
+# because it carries no useful content of its own):
+#
+# 1. Docling's image placeholder followed by an "Image: <filename>"
+#    caption line. This is what our medxpertqa renderer produces:
+#    reportlab places the JPEG, then a caption, and Docling outputs
+#    the placeholder + caption.
+# 2. Docling's image placeholder alone (filename unknown -- we fall
+#    back to pypdf's name).
+# 3. A bare "Image: <filename>" caption line with no preceding
+#    placeholder. Rare in practice, but covers parsers that drop the
+#    placeholder entirely.
+_PLACEHOLDER_WITH_CAPTION = re.compile(
+    r"<!--\s*image\s*-->\s*\n\s*Image:\s*(\S+)\s*(?:\n|$)",
+    re.IGNORECASE,
+)
+_PLACEHOLDER_ONLY = re.compile(
+    r"<!--\s*image\s*-->",
+    re.IGNORECASE,
+)
+_CAPTION_ONLY = re.compile(
+    r"^[ \t]*Image:\s*(\S+)\s*$",
+    re.IGNORECASE | re.MULTILINE,
+)
+
+# Append-after marker (the matched span is preserved verbatim and our
+# block is inserted immediately after it):
+#
+# 4. ``<figure>...</figure>`` as emitted by layout-aware parsers (Azure
+#    Document Intelligence ``prebuilt-layout``, LlamaCloud premium).
+#    The figure's own contents -- chart bar values, axis labels,
+#    inline ``<figcaption>``, embedded ``<table>`` for tabular figures
+#    -- are themselves specialist OCR output, so we keep them and add
+#    our vision-LLM block alongside. ``[^>]*`` in the open tag tolerates
+#    optional attributes like ``<figure id="...">``; ``re.DOTALL``
+#    lets ``.`` cross the newlines inside the block.
+_FIGURE_BLOCK = re.compile(
+    r"<figure\b[^>]*>.*?</figure>",
+    re.DOTALL | re.IGNORECASE,
+)
+
+
+def _replace_one_match(
+    markdown: str,
+    pattern: re.Pattern[str],
+    descriptions: list[PictureDescription],
+    desc_idx: int,
+) -> tuple[str, int]:
+    """Replace the first occurrence of ``pattern`` with the next image block.
+
+    Returns the new markdown and the new ``desc_idx`` (advanced if a
+    replacement happened, unchanged otherwise).
+    """
+
+    if desc_idx >= len(descriptions):
+        return markdown, desc_idx
+
+    match = pattern.search(markdown)
+    if not match:
+        return markdown, desc_idx
+
+    desc = descriptions[desc_idx]
+    captured_name: str | None = None
+    if match.groups():
+        captured_name = match.group(1)
+    name = captured_name or desc.name
+    block = _format_image_block(name, desc.description, desc.ocr_text)
+
+    new_markdown = markdown[: match.start()] + block + markdown[match.end():]
+    return new_markdown, desc_idx + 1
+
+
+def _splice_after_figures(
+    markdown: str,
+    descriptions: list[PictureDescription],
+    desc_idx: int,
+) -> tuple[str, int]:
+    """Append vision-LLM blocks immediately after each ``<figure>...</figure>``.
+
+    Layout-aware parsers (Azure DI ``prebuilt-layout``, LlamaCloud
+    premium) wrap each figure / chart / inline table in this tag and
+    carry their own OCR of the figure's text content inside it. That
+    content is useful on its own, so we keep the original block
+    verbatim and add our vision-LLM block right after it -- giving
+    retrieval both signals in the same chunk.
+
+    Descriptions are matched to figures in document order (first
+    description -> first figure, etc.). All splice points are computed
+    upfront with :func:`re.finditer` and applied in REVERSE order so
+    earlier offsets stay valid as the markdown grows. Returns the
+    advanced ``desc_idx`` for the caller's leftover-handling.
+    """
+
+    if desc_idx >= len(descriptions):
+        return markdown, desc_idx
+
+    matches = list(_FIGURE_BLOCK.finditer(markdown))
+    if not matches:
+        return markdown, desc_idx
+
+    n_to_splice = min(len(matches), len(descriptions) - desc_idx)
+    if n_to_splice <= 0:
+        return markdown, desc_idx
+
+    out = markdown
+    # Walk in reverse so each splice's end-offset still points at the
+    # right place in the (still-mutating) string.
+    for i in range(n_to_splice - 1, -1, -1):
+        match = matches[i]
+        desc = descriptions[desc_idx + i]
+        block = _format_image_block(desc.name, desc.description, desc.ocr_text)
+        out = out[: match.end()] + block + out[match.end():]
+
+    return out, desc_idx + n_to_splice
+
+
+def inject_descriptions_inline(
+    markdown: str,
+    result: PictureExtractionResult,
+) -> tuple[str, int]:
+    """Splice per-image markdown blocks into the document at image positions.
+
+    Walks the markdown left-to-right, consuming descriptions in order.
+    Tries two splicing strategies, in this order:
+
+    1. **Append-after** for ``<figure>...</figure>`` blocks emitted by
+       layout-aware parsers (Azure DI ``prebuilt-layout``, LlamaCloud
+       premium). The figure block carries the parser's own OCR of the
+       figure -- we preserve it and add our vision-LLM block right
+       after.
+    2. **Replace** for Docling-style markers, in priority order:
+
+       - ``<!-- image -->`` followed by ``Image: <filename>`` caption,
+       - ``<!-- image -->`` placeholder alone,
+       - bare ``Image: <filename>`` caption.
+
+    A document typically uses one style or the other (depending on
+    which parser produced its markdown), so the two paths don't fight
+    each other in practice. When they do co-occur, figures are
+    consumed first.
+
+    Returns ``(new_markdown, n_inlined)`` -- the count of descriptions
+    that were placed inline. The caller decides what to do with any
+    leftover descriptions (typically: append them at the end).
+    """
+
+    if not result.descriptions:
+        return markdown, 0
+
+    descriptions = result.descriptions
+    desc_idx = 0
+    out = markdown
+
+    # Step 1: layout-aware figures. One-shot batch -- finds ALL
+    # <figure> blocks, splices in document order until we exhaust
+    # either side.
+    out, desc_idx = _splice_after_figures(out, descriptions, desc_idx)
+
+    # Step 2: Docling-style replacement markers. One match per
+    # iteration, so a doc that has both a figure (consumed above) and
+    # a Docling placeholder (consumed below) still works.
+    while desc_idx < len(descriptions):
+        before_idx = desc_idx
+        out, desc_idx = _replace_one_match(
+            out, _PLACEHOLDER_WITH_CAPTION, descriptions, desc_idx
+        )
+        if desc_idx > before_idx:
+            continue
+        out, desc_idx = _replace_one_match(
+            out, _PLACEHOLDER_ONLY, descriptions, desc_idx
+        )
+        if desc_idx > before_idx:
+            continue
+        out, desc_idx = _replace_one_match(
+            out, _CAPTION_ONLY, descriptions, desc_idx
+        )
+        if desc_idx > before_idx:
+            continue
+        # No more positions to splice into.
+        break
+
+    return out, desc_idx
+
+
+def render_appended_section(
+    descriptions: list[PictureDescription],
+    *,
+    skip_notes: PictureExtractionResult | None = None,
+    heading: str = "## Image Content (vision-LLM extracted)",
+) -> str:
+    """Render leftover descriptions as an appended section.
+
+    Used as a fallback when not every description could be inlined
+    (either because the parser produced no detectable image markers,
+    or because there were more extracted images than markers).
+    """
+
+    if not descriptions and not skip_notes:
+        return ""
+
+    parts: list[str] = ["", heading, ""]
+    for desc in descriptions:
+        parts.append(
+            _format_image_block(desc.name, desc.description, desc.ocr_text)
+        )
+        parts.append("")
+
+    if skip_notes is not None:
+        notes: list[str] = []
+        if skip_notes.skipped_too_large:
+            notes.append(f"{skip_notes.skipped_too_large} too large (> 5 MB)")
+        if skip_notes.skipped_too_small:
+            notes.append(f"{skip_notes.skipped_too_small} too small (< 1 KB)")
+        if skip_notes.skipped_duplicate:
+            notes.append(f"{skip_notes.skipped_duplicate} duplicate")
+        if skip_notes.failed:
+            notes.append(f"{skip_notes.failed} failed")
+        if notes:
+            parts.append(f"_Note: {', '.join(notes)} image(s) skipped._")
+
+    return "\n".join(parts)
+
+
+def merge_descriptions_into_markdown(
+    markdown: str,
+    result: PictureExtractionResult,
+) -> str:
+    """Top-level: inline what we can, append what's left over.
+
+    This is the function the ETL pipeline actually calls. It guarantees
+    that no successfully-described image is silently dropped: anything
+    we can't splice inline gets appended at the end with a heading
+    that makes it clear those came from the document but weren't
+    location-matched.
+    """
+
+    if not result.descriptions:
+        return markdown
+
+    new_markdown, n_inlined = inject_descriptions_inline(markdown, result)
+    leftover = result.descriptions[n_inlined:]
+
+    if not leftover:
+        return new_markdown
+
+    # Distinguish in the heading whether NONE were inlined (parser
+    # produced no markers at all) vs SOME (mismatched count).
+    heading = (
+        "## Image Content (vision-LLM extracted)"
+        if n_inlined == 0
+        else "## Image Content (additional, no inline marker found)"
+    )
+    section = render_appended_section(leftover, heading=heading)
+    if not section:
+        return new_markdown
+    return f"{new_markdown.rstrip()}\n\n{section.lstrip()}\n"
+
+
+__all__ = [
+    "PictureDescription",
+    "PictureExtractionResult",
+    "describe_pictures",
+    "inject_descriptions_inline",
+    "merge_descriptions_into_markdown",
+    "render_appended_section",
+]
diff --git a/surfsense_backend/app/services/docling_service.py b/surfsense_backend/app/services/docling_service.py
index af9a7d2d5..cf51efb4a 100644
--- a/surfsense_backend/app/services/docling_service.py
+++ b/surfsense_backend/app/services/docling_service.py
@@ -77,10 +77,16 @@ class DoclingService:
             # Create pipeline options with version-safe attribute checking
             pipeline_options = PdfPipelineOptions()
 
-            # Disable OCR (user request)
+            # Enable OCR so text-in-image (chart axes, ECG annotations,
+            # lab tables embedded as images, scanned pages, etc.) is
+            # lifted into the main markdown stream. This pairs with the
+            # vision-LLM picture-description pass downstream — OCR
+            # captures literal text; vision LLM captures the visual
+            # content. Together they give a faithful representation of
+            # PDFs that mix text and images.
             if hasattr(pipeline_options, "do_ocr"):
-                pipeline_options.do_ocr = False
-                logger.info("⚠️ OCR disabled by user request")
+                pipeline_options.do_ocr = True
+                logger.info("✅ OCR enabled for embedded text-in-image extraction")
             else:
                 logger.warning("⚠️ OCR attribute not available in this Docling version")
 
diff --git a/surfsense_backend/app/tasks/document_processors/file_processors.py b/surfsense_backend/app/tasks/document_processors/file_processors.py
index 1271550df..137c27cda 100644
--- a/surfsense_backend/app/tasks/document_processors/file_processors.py
+++ b/surfsense_backend/app/tasks/document_processors/file_processors.py
@@ -123,10 +123,6 @@ async def _process_non_document_upload(ctx: _ProcessingContext) -> Document | No
     """Extract content from a non-document file (plaintext/direct_convert/audio/image) via the unified ETL pipeline."""
     from app.etl_pipeline.etl_document import EtlRequest
     from app.etl_pipeline.etl_pipeline_service import EtlPipelineService
-    from app.etl_pipeline.file_classifier import (
-        FileCategory,
-        classify_file as etl_classify,
-    )
 
     await _notify(ctx, "parsing", "Processing file")
     await ctx.task_logger.log_task_progress(
@@ -135,8 +131,12 @@ async def _process_non_document_upload(ctx: _ProcessingContext) -> Document | No
         {"processing_stage": "extracting"},
     )
 
+    # Fetch the vision LLM whenever the operator opts in. The ETL
+    # pipeline decides what to do with it: image files run through the
+    # vision LLM directly; document files (PDFs) get per-image
+    # descriptions appended via picture_describer.
     vision_llm = None
-    if ctx.use_vision_llm and etl_classify(ctx.filename) == FileCategory.IMAGE:
+    if ctx.use_vision_llm:
         from app.services.llm_service import get_vision_llm
 
         vision_llm = await get_vision_llm(ctx.session, ctx.search_space_id)
@@ -230,7 +230,16 @@ async def _process_document_upload(ctx: _ProcessingContext) -> Document | None:
 
     await _notify(ctx, "parsing", "Extracting content")
 
-    etl_result = await EtlPipelineService().extract(
+    # Document files (PDF, docx, etc.) get vision LLM treatment too:
+    # the ETL pipeline appends a per-image description section when
+    # vision_llm is provided. See picture_describer.describe_pictures.
+    vision_llm = None
+    if ctx.use_vision_llm:
+        from app.services.llm_service import get_vision_llm
+
+        vision_llm = await get_vision_llm(ctx.session, ctx.search_space_id)
+
+    etl_result = await EtlPipelineService(vision_llm=vision_llm).extract(
         EtlRequest(
             file_path=ctx.file_path,
             filename=ctx.filename,
@@ -418,8 +427,12 @@ async def _extract_file_content(
         billable_pages = estimated_pages * mode.page_multiplier
         await page_limit_service.check_page_limit(user_id, billable_pages)
 
+    # Vision LLM is provided to the ETL pipeline for any file category
+    # when the operator opts in. Image files run through it directly;
+    # document files (PDFs) get per-image descriptions appended via
+    # picture_describer.
     vision_llm = None
-    if use_vision_llm and category == FileCategory.IMAGE:
+    if use_vision_llm:
         from app.services.llm_service import get_vision_llm
 
         vision_llm = await get_vision_llm(session, search_space_id)
diff --git a/surfsense_backend/tests/unit/etl_pipeline/test_etl_pipeline_service.py b/surfsense_backend/tests/unit/etl_pipeline/test_etl_pipeline_service.py
index 8571136c3..edfe94406 100644
--- a/surfsense_backend/tests/unit/etl_pipeline/test_etl_pipeline_service.py
+++ b/surfsense_backend/tests/unit/etl_pipeline/test_etl_pipeline_service.py
@@ -741,6 +741,372 @@ async def test_extract_image_falls_back_to_document_without_vision_llm(
     assert result.content_type == "document"
 
 
+# ---------------------------------------------------------------------------
+# Document path with vision LLM: per-image descriptions are appended
+# ---------------------------------------------------------------------------
+
+
+def _fake_extraction_result(*descriptions):
+    from app.etl_pipeline.picture_describer import (
+        PictureDescription,
+        PictureExtractionResult,
+    )
+
+    return PictureExtractionResult(
+        descriptions=[
+            PictureDescription(
+                page_number=d["page"],
+                ordinal_in_page=d.get("ordinal", 0),
+                name=d["name"],
+                sha256=d.get("sha", "deadbeef"),
+                description=d["desc"],
+            )
+            for d in descriptions
+        ]
+    )
+
+
+async def test_extract_pdf_with_vision_llm_inlines_image_blocks(tmp_path, mocker):
+    """A PDF with an `<!-- image -->` placeholder + caption gets the
+    block spliced inline (no orphaned ``## Image Content`` section).
+
+    This is the headline scenario for the medxpertqa benchmark: the
+    image content lives in the same chunk as the surrounding case text
+    so retrieval pulls the question, image, and answer options together.
+    """
+    pdf_file = tmp_path / "report.pdf"
+    pdf_file.write_bytes(b"%PDF-1.4 fake content")
+
+    mocker.patch("app.config.config.ETL_SERVICE", "DOCLING")
+
+    fake_docling = mocker.AsyncMock()
+    fake_docling.process_document.return_value = {
+        "content": (
+            "# MedXpertQA-MM MM-130\n\n"
+            "## Clinical case\n\nA 44-year-old man...\n\n"
+            "<!-- image -->\nImage: MM-130-a.jpeg\n\n"
+            "## Answer choices\n\nA) ...\n"
+        )
+    }
+    mocker.patch(
+        "app.services.docling_service.create_docling_service",
+        return_value=fake_docling,
+    )
+
+    extraction = _fake_extraction_result(
+        {
+            "page": 1,
+            "name": "Im0",
+            "desc": "Axial CT showing a large cystic mass.",
+        }
+    )
+    mocker.patch(
+        "app.etl_pipeline.picture_describer.describe_pictures",
+        new=mocker.AsyncMock(return_value=extraction),
+    )
+
+    fake_llm = mocker.MagicMock()
+    result = await EtlPipelineService(vision_llm=fake_llm).extract(
+        EtlRequest(file_path=str(pdf_file), filename="report.pdf")
+    )
+
+    md = result.markdown_content
+    # The placeholder + caption are gone, replaced by a horizontal-
+    # rule-delimited section with the captioned filename.
+    assert "<!-- image -->" not in md
+    assert "Image: MM-130-a.jpeg" not in md
+    assert "**Embedded image:** `MM-130-a.jpeg`" in md
+    assert "**Visual description:**" in md
+    assert "Axial CT showing a large cystic mass." in md
+    # No OCR section -- our fake_extraction_result has no ocr_text,
+    # and the format omits the section when there's no text to show.
+    assert "**OCR text:**" not in md
+    # No raw HTML / XML tags or blockquote wrapping leak.
+    assert "<image" not in md
+    assert "> **Embedded image:**" not in md
+    # No appended section -- everything went inline.
+    assert "## Image Content" not in md
+    # Surrounding case text + answer options are preserved.
+    assert "A 44-year-old man..." in md
+    assert "## Answer choices" in md
+    assert "A) ..." in md
+
+
+async def test_extract_pdf_with_vision_llm_appends_when_no_marker(tmp_path, mocker):
+    """When parser markdown has no image markers, descriptions get appended.
+
+    This is the fallback path for parsers that drop image placeholders
+    entirely. The image content still ends up in the markdown -- just
+    in a clearly-labeled section rather than inline.
+    """
+    pdf_file = tmp_path / "report.pdf"
+    pdf_file.write_bytes(b"%PDF-1.4 fake content")
+
+    mocker.patch("app.config.config.ETL_SERVICE", "DOCLING")
+
+    fake_docling = mocker.AsyncMock()
+    fake_docling.process_document.return_value = {
+        "content": "# Parsed PDF text\n\nNo image markers anywhere.\n"
+    }
+    mocker.patch(
+        "app.services.docling_service.create_docling_service",
+        return_value=fake_docling,
+    )
+
+    extraction = _fake_extraction_result(
+        {"page": 1, "name": "Im0", "desc": "An image description."}
+    )
+    mocker.patch(
+        "app.etl_pipeline.picture_describer.describe_pictures",
+        new=mocker.AsyncMock(return_value=extraction),
+    )
+
+    fake_llm = mocker.MagicMock()
+    result = await EtlPipelineService(vision_llm=fake_llm).extract(
+        EtlRequest(file_path=str(pdf_file), filename="report.pdf")
+    )
+
+    md = result.markdown_content
+    assert "# Parsed PDF text" in md
+    assert "## Image Content (vision-LLM extracted)" in md
+    assert "**Embedded image:** `Im0`" in md
+    assert "An image description." in md
+
+
+async def test_extract_pdf_without_vision_llm_skips_picture_descriptions(
+    tmp_path, mocker
+):
+    """No vision LLM -> parser markdown returned as-is."""
+    pdf_file = tmp_path / "report.pdf"
+    pdf_file.write_bytes(b"%PDF-1.4 fake content")
+
+    mocker.patch("app.config.config.ETL_SERVICE", "DOCLING")
+
+    fake_docling = mocker.AsyncMock()
+    fake_docling.process_document.return_value = {"content": "# Parsed PDF text"}
+    mocker.patch(
+        "app.services.docling_service.create_docling_service",
+        return_value=fake_docling,
+    )
+
+    describe_mock = mocker.patch(
+        "app.etl_pipeline.picture_describer.describe_pictures",
+        new=mocker.AsyncMock(),
+    )
+
+    result = await EtlPipelineService().extract(
+        EtlRequest(file_path=str(pdf_file), filename="report.pdf")
+    )
+
+    assert result.markdown_content == "# Parsed PDF text"
+    assert "<image" not in result.markdown_content
+    describe_mock.assert_not_called()
+
+
+async def test_extract_pdf_with_vision_llm_swallows_describe_failure(
+    tmp_path, mocker
+):
+    """A pypdf or vision LLM blow-up never fails the document upload."""
+    pdf_file = tmp_path / "report.pdf"
+    pdf_file.write_bytes(b"%PDF-1.4 fake content")
+
+    mocker.patch("app.config.config.ETL_SERVICE", "DOCLING")
+
+    fake_docling = mocker.AsyncMock()
+    fake_docling.process_document.return_value = {"content": "# Parsed PDF text"}
+    mocker.patch(
+        "app.services.docling_service.create_docling_service",
+        return_value=fake_docling,
+    )
+
+    mocker.patch(
+        "app.etl_pipeline.picture_describer.describe_pictures",
+        new=mocker.AsyncMock(side_effect=RuntimeError("pypdf exploded")),
+    )
+
+    fake_llm = mocker.MagicMock()
+    result = await EtlPipelineService(vision_llm=fake_llm).extract(
+        EtlRequest(file_path=str(pdf_file), filename="report.pdf")
+    )
+
+    assert result.markdown_content == "# Parsed PDF text"
+    assert result.etl_service == "DOCLING"
+
+
+async def test_extract_pdf_with_vision_llm_no_images_returns_parser_text(
+    tmp_path, mocker
+):
+    """Vision-LLM-enabled PDF with zero extracted images is unchanged."""
+    pdf_file = tmp_path / "report.pdf"
+    pdf_file.write_bytes(b"%PDF-1.4 fake content")
+
+    mocker.patch("app.config.config.ETL_SERVICE", "DOCLING")
+
+    fake_docling = mocker.AsyncMock()
+    fake_docling.process_document.return_value = {"content": "# Just text, no images"}
+    mocker.patch(
+        "app.services.docling_service.create_docling_service",
+        return_value=fake_docling,
+    )
+
+    empty = _fake_extraction_result()
+    mocker.patch(
+        "app.etl_pipeline.picture_describer.describe_pictures",
+        new=mocker.AsyncMock(return_value=empty),
+    )
+
+    fake_llm = mocker.MagicMock()
+    result = await EtlPipelineService(vision_llm=fake_llm).extract(
+        EtlRequest(file_path=str(pdf_file), filename="report.pdf")
+    )
+
+    assert result.markdown_content == "# Just text, no images"
+    assert "<image" not in result.markdown_content
+
+
+# ---------------------------------------------------------------------------
+# Per-image OCR runner: wiring + behaviour
+#
+# When extracting a PDF with a vision LLM, the ETL service must ALSO
+# pass an ``ocr_runner`` to picture_describer. The runner is a closure
+# that re-feeds each extracted image through a vision-LLM-less
+# EtlPipelineService -- i.e. the same OCR engine that handles
+# standalone image uploads (Docling/Azure DI/LlamaCloud) gets a crack
+# at each embedded image, with the text attached to the inline block.
+# ---------------------------------------------------------------------------
+
+
+async def test_extract_pdf_passes_ocr_runner_to_describe_pictures(
+    tmp_path, mocker
+):
+    """The ETL service must wire an ocr_runner kwarg to describe_pictures."""
+    pdf_file = tmp_path / "report.pdf"
+    pdf_file.write_bytes(b"%PDF-1.4 fake content")
+
+    mocker.patch("app.config.config.ETL_SERVICE", "DOCLING")
+
+    fake_docling = mocker.AsyncMock()
+    fake_docling.process_document.return_value = {"content": "# Parsed PDF text"}
+    mocker.patch(
+        "app.services.docling_service.create_docling_service",
+        return_value=fake_docling,
+    )
+
+    describe_mock = mocker.patch(
+        "app.etl_pipeline.picture_describer.describe_pictures",
+        new=mocker.AsyncMock(return_value=_fake_extraction_result()),
+    )
+
+    fake_llm = mocker.MagicMock()
+    await EtlPipelineService(vision_llm=fake_llm).extract(
+        EtlRequest(file_path=str(pdf_file), filename="report.pdf")
+    )
+
+    describe_mock.assert_awaited_once()
+    _, kwargs = describe_mock.await_args
+    assert "ocr_runner" in kwargs
+    assert callable(kwargs["ocr_runner"])
+
+
+async def test_extract_pdf_ocr_runner_invokes_document_parser_on_image(
+    tmp_path, mocker
+):
+    """The OCR runner closure should re-extract each image via the parser.
+
+    We capture the runner that the ETL service passes to
+    describe_pictures, invoke it with a fake image path, and assert
+    that Docling was called with that image. This proves the closure
+    is wired to a vision-LLM-less sub-pipeline (otherwise it would
+    recurse into the vision LLM and never hit the OCR engine).
+    """
+    pdf_file = tmp_path / "report.pdf"
+    pdf_file.write_bytes(b"%PDF-1.4 fake content")
+    image_file = tmp_path / "Im0.png"
+    image_file.write_bytes(b"\x89PNG\r\n\x1a\n" + b"\x00" * 100)
+
+    mocker.patch("app.config.config.ETL_SERVICE", "DOCLING")
+
+    fake_docling = mocker.AsyncMock()
+    fake_docling.process_document.return_value = {
+        "content": "Slice 24 / 60   L   R"
+    }
+    mocker.patch(
+        "app.services.docling_service.create_docling_service",
+        return_value=fake_docling,
+    )
+
+    captured: dict = {}
+
+    async def capture_runner(*args, **kwargs):
+        captured["runner"] = kwargs["ocr_runner"]
+        return _fake_extraction_result()
+
+    mocker.patch(
+        "app.etl_pipeline.picture_describer.describe_pictures",
+        new=capture_runner,
+    )
+
+    fake_llm = mocker.MagicMock()
+    await EtlPipelineService(vision_llm=fake_llm).extract(
+        EtlRequest(file_path=str(pdf_file), filename="report.pdf")
+    )
+
+    runner = captured["runner"]
+    ocr_text = await runner(str(image_file), "Im0.png")
+
+    assert ocr_text == "Slice 24 / 60   L   R"
+    # Docling was invoked twice in total: once for the PDF, once for
+    # the image we re-fed via the runner.
+    assert fake_docling.process_document.await_count == 2
+
+
+async def test_extract_pdf_ocr_runner_returns_empty_on_unsupported_image(
+    tmp_path, mocker
+):
+    """Unsupported image format → runner returns empty string, doesn't raise.
+
+    Common case: a PDF embeds a JPEG2000 or CCITT-TIFF image that
+    Docling can't load. We don't want an unsupported format on ONE
+    embedded image to spoil the whole PDF extraction; the runner
+    should swallow the EtlUnsupportedFileError and return "" so the
+    image gets a description but no OCR tag.
+    """
+    pdf_file = tmp_path / "report.pdf"
+    pdf_file.write_bytes(b"%PDF-1.4 fake content")
+    weird_image = tmp_path / "Im0.jp2"  # JPEG2000, unlikely to be supported
+    weird_image.write_bytes(b"\x00\x00\x00\x0CjP" + b"\x00" * 50)
+
+    mocker.patch("app.config.config.ETL_SERVICE", "DOCLING")
+
+    fake_docling = mocker.AsyncMock()
+    fake_docling.process_document.return_value = {"content": "# Parsed PDF text"}
+    mocker.patch(
+        "app.services.docling_service.create_docling_service",
+        return_value=fake_docling,
+    )
+
+    captured: dict = {}
+
+    async def capture_runner(*args, **kwargs):
+        captured["runner"] = kwargs["ocr_runner"]
+        return _fake_extraction_result()
+
+    mocker.patch(
+        "app.etl_pipeline.picture_describer.describe_pictures",
+        new=capture_runner,
+    )
+
+    fake_llm = mocker.MagicMock()
+    await EtlPipelineService(vision_llm=fake_llm).extract(
+        EtlRequest(file_path=str(pdf_file), filename="report.pdf")
+    )
+
+    runner = captured["runner"]
+    ocr_text = await runner(str(weird_image), "Im0.jp2")
+
+    assert ocr_text == ""
+
+
 # ---------------------------------------------------------------------------
 # Processing Mode enum tests
 # ---------------------------------------------------------------------------
diff --git a/surfsense_backend/tests/unit/etl_pipeline/test_picture_describer.py b/surfsense_backend/tests/unit/etl_pipeline/test_picture_describer.py
new file mode 100644
index 000000000..407bc97a2
--- /dev/null
+++ b/surfsense_backend/tests/unit/etl_pipeline/test_picture_describer.py
@@ -0,0 +1,967 @@
+"""Unit tests for the picture_describer module.
+
+Covers:
+
+- :func:`describe_pictures` -- the PDF image walker + per-image vision
+  LLM call (structured output split into ``ocr_text`` and
+  ``description``);
+- :func:`inject_descriptions_inline` -- in-place replacement of image
+  placeholders / captions in the parser markdown;
+- :func:`merge_descriptions_into_markdown` -- the top-level helper
+  that inlines what it can and appends what it can't;
+- :func:`render_appended_section` -- the appended-fallback renderer.
+"""
+
+from __future__ import annotations
+
+from unittest.mock import AsyncMock, MagicMock
+
+import pytest
+
+from app.etl_pipeline.picture_describer import (
+    PictureDescription,
+    PictureExtractionResult,
+    describe_pictures,
+    inject_descriptions_inline,
+    merge_descriptions_into_markdown,
+    render_appended_section,
+)
+
+pytestmark = pytest.mark.unit
+
+
+def _make_image_obj(name: str, data: bytes):
+    """Mimic pypdf's ImageFile object shape for the bits we use."""
+    img = MagicMock()
+    img.name = name
+    img.data = data
+    return img
+
+
+# ---------------------------------------------------------------------------
+# describe_pictures: short-circuits
+# ---------------------------------------------------------------------------
+
+
+async def test_describe_pictures_no_op_for_non_pdf(tmp_path):
+    """Non-PDF files are silently no-op'd; we don't try to extract images."""
+    docx_file = tmp_path / "report.docx"
+    docx_file.write_bytes(b"PK fake docx")
+
+    fake_llm = AsyncMock()
+    result = await describe_pictures(str(docx_file), "report.docx", fake_llm)
+
+    assert result.descriptions == []
+    assert result.skipped_too_large == 0
+    fake_llm.ainvoke.assert_not_called()
+
+
+async def test_describe_pictures_no_op_when_vision_llm_is_none(tmp_path):
+    """If the caller didn't provide a vision LLM, we no-op even for PDFs."""
+    pdf_file = tmp_path / "report.pdf"
+    pdf_file.write_bytes(b"%PDF-1.4 fake")
+
+    result = await describe_pictures(str(pdf_file), "report.pdf", None)
+    assert result.descriptions == []
+
+
+async def test_describe_pictures_no_op_for_pdf_with_no_images(tmp_path, mocker):
+    """A PDF that pypdf can open but contains zero images returns empty."""
+    pdf_file = tmp_path / "report.pdf"
+    pdf_file.write_bytes(b"%PDF-1.4 fake")
+
+    fake_reader = MagicMock()
+    fake_reader.pages = [MagicMock(images=[]), MagicMock(images=[])]
+    mocker.patch("pypdf.PdfReader", return_value=fake_reader)
+
+    fake_llm = AsyncMock()
+    result = await describe_pictures(str(pdf_file), "report.pdf", fake_llm)
+
+    assert result.descriptions == []
+    fake_llm.ainvoke.assert_not_called()
+
+
+# ---------------------------------------------------------------------------
+# describe_pictures: happy paths
+# ---------------------------------------------------------------------------
+
+
+async def test_describe_pictures_runs_vision_llm_per_image(tmp_path, mocker):
+    """Every eligible image gets exactly one description-only vision call."""
+    pdf_file = tmp_path / "report.pdf"
+    pdf_file.write_bytes(b"%PDF-1.4 fake")
+
+    img_a = _make_image_obj("Im0.jpeg", b"\xff\xd8\xff\xe0" + b"\xab" * 2000)
+    img_b = _make_image_obj("Im1.png", b"\x89PNG\r\n\x1a\n" + b"\xcd" * 2000)
+    page1 = MagicMock(images=[img_a])
+    page2 = MagicMock(images=[img_b])
+
+    fake_reader = MagicMock()
+    fake_reader.pages = [page1, page2]
+    mocker.patch("pypdf.PdfReader", return_value=fake_reader)
+
+    parse_mock = mocker.patch(
+        "app.etl_pipeline.parsers.vision_llm.parse_image_for_description",
+        new=AsyncMock(side_effect=["Description A", "Description B"]),
+    )
+
+    fake_llm = MagicMock()
+    result = await describe_pictures(str(pdf_file), "report.pdf", fake_llm)
+
+    assert len(result.descriptions) == 2
+    by_name = {d.name: d.description for d in result.descriptions}
+    assert by_name == {"Im0.jpeg": "Description A", "Im1.png": "Description B"}
+    assert all(d.page_number in (1, 2) for d in result.descriptions)
+    assert parse_mock.await_count == 2
+
+
+async def test_describe_pictures_dedups_by_hash(tmp_path, mocker):
+    """An image that appears N times in the PDF is described once."""
+    pdf_file = tmp_path / "report.pdf"
+    pdf_file.write_bytes(b"%PDF-1.4 fake")
+
+    payload = b"\x89PNG\r\n\x1a\n" + b"\x42" * 2000
+    img = _make_image_obj("logo.png", payload)
+    page1 = MagicMock(images=[img])
+    page2 = MagicMock(images=[_make_image_obj("logo.png", payload)])
+    page3 = MagicMock(images=[_make_image_obj("logo.png", payload)])
+
+    fake_reader = MagicMock()
+    fake_reader.pages = [page1, page2, page3]
+    mocker.patch("pypdf.PdfReader", return_value=fake_reader)
+
+    parse_mock = mocker.patch(
+        "app.etl_pipeline.parsers.vision_llm.parse_image_for_description",
+        new=AsyncMock(return_value="Logo desc"),
+    )
+
+    fake_llm = MagicMock()
+    result = await describe_pictures(str(pdf_file), "report.pdf", fake_llm)
+
+    assert len(result.descriptions) == 1
+    assert result.skipped_duplicate == 2
+    assert parse_mock.await_count == 1
+
+
+async def test_describe_pictures_skips_too_small_images(tmp_path, mocker):
+    """Sub-1KB images (tracking pixels, dots, etc.) are skipped."""
+    pdf_file = tmp_path / "report.pdf"
+    pdf_file.write_bytes(b"%PDF-1.4 fake")
+
+    tiny = _make_image_obj("dot.png", b"\x89PNG\r\n\x1a\n")
+    big = _make_image_obj("ct.jpeg", b"\xff\xd8\xff\xe0" + b"\xab" * 3000)
+    page = MagicMock(images=[tiny, big])
+
+    fake_reader = MagicMock()
+    fake_reader.pages = [page]
+    mocker.patch("pypdf.PdfReader", return_value=fake_reader)
+
+    parse_mock = mocker.patch(
+        "app.etl_pipeline.parsers.vision_llm.parse_image_for_description",
+        new=AsyncMock(return_value="CT scan"),
+    )
+
+    fake_llm = MagicMock()
+    result = await describe_pictures(str(pdf_file), "report.pdf", fake_llm)
+
+    assert len(result.descriptions) == 1
+    assert result.descriptions[0].name == "ct.jpeg"
+    assert result.skipped_too_small == 1
+    assert parse_mock.await_count == 1
+
+
+async def test_describe_pictures_skips_too_large_images(tmp_path, mocker):
+    """Images larger than the vision LLM's per-image cap are skipped."""
+    pdf_file = tmp_path / "report.pdf"
+    pdf_file.write_bytes(b"%PDF-1.4 fake")
+
+    huge = _make_image_obj("huge.jpeg", b"\xff" * (6 * 1024 * 1024))
+    ok = _make_image_obj("ok.jpeg", b"\xff\xd8\xff\xe0" + b"\xab" * 2000)
+    page = MagicMock(images=[huge, ok])
+
+    fake_reader = MagicMock()
+    fake_reader.pages = [page]
+    mocker.patch("pypdf.PdfReader", return_value=fake_reader)
+
+    parse_mock = mocker.patch(
+        "app.etl_pipeline.parsers.vision_llm.parse_image_for_description",
+        new=AsyncMock(return_value="OK image"),
+    )
+
+    fake_llm = MagicMock()
+    result = await describe_pictures(str(pdf_file), "report.pdf", fake_llm)
+
+    assert len(result.descriptions) == 1
+    assert result.descriptions[0].name == "ok.jpeg"
+    assert result.skipped_too_large == 1
+    assert parse_mock.await_count == 1
+
+
+async def test_describe_pictures_swallows_per_image_failure(tmp_path, mocker):
+    """A vision LLM failure on one image must not kill the whole document."""
+    pdf_file = tmp_path / "report.pdf"
+    pdf_file.write_bytes(b"%PDF-1.4 fake")
+
+    img_a = _make_image_obj("a.jpeg", b"\xff\xd8" + b"\xab" * 2000)
+    img_b = _make_image_obj("b.jpeg", b"\xff\xd8" + b"\xcd" * 2000)
+    page = MagicMock(images=[img_a, img_b])
+
+    fake_reader = MagicMock()
+    fake_reader.pages = [page]
+    mocker.patch("pypdf.PdfReader", return_value=fake_reader)
+
+    mocker.patch(
+        "app.etl_pipeline.parsers.vision_llm.parse_image_for_description",
+        new=AsyncMock(side_effect=[RuntimeError("vision blew up"), "Success"]),
+    )
+
+    fake_llm = MagicMock()
+    result = await describe_pictures(str(pdf_file), "report.pdf", fake_llm)
+
+    assert len(result.descriptions) == 1
+    assert result.descriptions[0].description == "Success"
+    assert result.failed == 1
+
+
+async def test_describe_pictures_handles_pypdf_open_failure(tmp_path, mocker):
+    """A malformed PDF that pypdf can't open returns an empty result."""
+    pdf_file = tmp_path / "broken.pdf"
+    pdf_file.write_bytes(b"not a pdf")
+
+    mocker.patch("pypdf.PdfReader", side_effect=ValueError("EOF marker not found"))
+
+    fake_llm = MagicMock()
+    result = await describe_pictures(str(pdf_file), "broken.pdf", fake_llm)
+    assert result.descriptions == []
+
+
+# ---------------------------------------------------------------------------
+# inject_descriptions_inline: replacement patterns
+# ---------------------------------------------------------------------------
+
+
+def _desc(name="Im0", description="A CT scan."):
+    return PictureDescription(
+        page_number=1,
+        ordinal_in_page=0,
+        name=name,
+        sha256="aa",
+        description=description,
+    )
+
+
+def test_inject_no_op_when_no_descriptions():
+    markdown = "# Title\n\nbody text\n"
+    result = PictureExtractionResult()
+    out, n = inject_descriptions_inline(markdown, result)
+    assert out == markdown
+    assert n == 0
+
+
+def test_inject_replaces_placeholder_with_caption():
+    """`<!-- image -->` + `Image: <name>` together becomes one block.
+
+    This is the most common medxpertqa case: our renderer puts a caption
+    line right below the embedded JPEG, and Docling preserves both.
+    """
+    markdown = (
+        "# Case\n\n"
+        "Clinical text...\n\n"
+        "<!-- image -->\nImage: MM-130-a.jpeg\n\n"
+        "Answer choices: A) ...\n"
+    )
+    result = PictureExtractionResult(descriptions=[_desc(name="Im0")])
+
+    out, n = inject_descriptions_inline(markdown, result)
+
+    assert n == 1
+    assert "<!-- image -->" not in out
+    assert "Image: MM-130-a.jpeg" not in out  # caption consumed
+    # New format: horizontal-rule-delimited section with "Embedded
+    # image:" anchor and named "Visual description:" section. No
+    # blockquote wrapping -- nested blocks (lists, code, tables) inside
+    # a blockquote are silently dropped by Streamdown / remark.
+    assert "**Embedded image:** `MM-130-a.jpeg`" in out
+    assert "**Visual description:**" in out
+    assert "A CT scan." in out
+    # Block is delimited by horizontal rules so it stands out from
+    # surrounding paragraphs.
+    assert "\n---\n" in out
+    # No OCR section -- this fixture has no ocr_text on its descriptions.
+    assert "**OCR text:**" not in out
+    # No raw HTML tags / blockquote prefixes leak.
+    assert "<image" not in out
+    assert "</image>" not in out
+    assert "> **Embedded image:**" not in out  # we no longer wrap in `>`
+    # Surrounding context is preserved.
+    assert "Clinical text..." in out
+    assert "Answer choices: A) ..." in out
+
+
+def test_inject_uses_pypdf_name_when_no_caption():
+    """`<!-- image -->` alone uses the pypdf-given name as the attribute."""
+    markdown = "# Case\n\n<!-- image -->\n\nMore text\n"
+    result = PictureExtractionResult(descriptions=[_desc(name="Im0")])
+
+    out, n = inject_descriptions_inline(markdown, result)
+
+    assert n == 1
+    assert "**Embedded image:** `Im0`" in out
+
+
+def test_inject_replaces_bare_caption():
+    """A bare `Image: <name>` line (no placeholder) still gets replaced."""
+    markdown = "# Case\n\nText...\nImage: scan.jpeg\nMore text\n"
+    result = PictureExtractionResult(descriptions=[_desc(name="Im0")])
+
+    out, n = inject_descriptions_inline(markdown, result)
+
+    assert n == 1
+    assert "**Embedded image:** `scan.jpeg`" in out
+    assert "Image: scan.jpeg" not in out
+
+
+def test_inject_handles_multiple_images_in_order():
+    """Two placeholders + two descriptions: each consumed in document order."""
+    markdown = (
+        "Page 1\n\n<!-- image -->\nImage: a.jpeg\n\n"
+        "Between\n\n<!-- image -->\nImage: b.jpeg\n\nEnd\n"
+    )
+    result = PictureExtractionResult(
+        descriptions=[
+            PictureDescription(
+                page_number=1, ordinal_in_page=0, name="Im0", sha256="aa",
+                description="Desc A",
+            ),
+            PictureDescription(
+                page_number=2, ordinal_in_page=0, name="Im1", sha256="bb",
+                description="Desc B",
+            ),
+        ]
+    )
+
+    out, n = inject_descriptions_inline(markdown, result)
+
+    assert n == 2
+    assert "**Embedded image:** `a.jpeg`" in out
+    assert "**Embedded image:** `b.jpeg`" in out
+    assert out.index("a.jpeg") < out.index("b.jpeg")
+    assert "Desc A" in out and "Desc B" in out
+
+
+def test_inject_returns_remaining_count_when_more_descriptions_than_markers():
+    """Three descriptions, one marker -> only one inlined, two leftover."""
+    markdown = "Just one <!-- image --> here.\n"
+    result = PictureExtractionResult(
+        descriptions=[
+            _desc(name="Im0", description="First"),
+            _desc(name="Im1", description="Second"),
+            _desc(name="Im2", description="Third"),
+        ]
+    )
+
+    out, n = inject_descriptions_inline(markdown, result)
+
+    assert n == 1
+    assert "**Embedded image:** `Im0`" in out
+    assert "**Embedded image:** `Im1`" not in out
+
+
+def test_inject_returns_zero_when_no_markers_present():
+    """Markdown with no image markers at all returns the input unchanged."""
+    markdown = "# Title\n\nJust text. No images mentioned at all.\n"
+    result = PictureExtractionResult(descriptions=[_desc(name="Im0")])
+
+    out, n = inject_descriptions_inline(markdown, result)
+
+    assert n == 0
+    assert out == markdown
+
+
+# ---------------------------------------------------------------------------
+# render_appended_section
+# ---------------------------------------------------------------------------
+
+
+def test_render_appended_empty_when_nothing_passed():
+    assert render_appended_section([]) == ""
+
+
+def test_render_appended_renders_each_image_as_block():
+    descriptions = [
+        _desc(name="MM-130-a.jpeg", description="CT scan"),
+        _desc(name="MM-130-b.jpeg", description="Bar chart"),
+    ]
+    rendered = render_appended_section(descriptions)
+    assert "## Image Content (vision-LLM extracted)" in rendered
+    assert "**Embedded image:** `MM-130-a.jpeg`" in rendered
+    assert "CT scan" in rendered
+    assert "**Embedded image:** `MM-130-b.jpeg`" in rendered
+    assert "Bar chart" in rendered
+    # Each image block is delimited by horizontal rules.
+    assert rendered.count("\n---\n") >= 2
+    # No raw HTML / XML / blockquote prefixes.
+    assert "<image" not in rendered
+    assert "> **Embedded image:**" not in rendered
+    assert "**OCR text:**" not in rendered
+
+
+def test_render_appended_includes_skip_notes():
+    descriptions = [_desc()]
+    skip_result = PictureExtractionResult(
+        descriptions=descriptions,
+        skipped_too_small=2,
+        skipped_too_large=1,
+        skipped_duplicate=3,
+        failed=1,
+    )
+    rendered = render_appended_section(descriptions, skip_notes=skip_result)
+    assert "_Note:" in rendered
+    assert "2 too small" in rendered
+    assert "1 too large" in rendered
+    assert "3 duplicate" in rendered
+    assert "1 failed" in rendered
+
+
+# ---------------------------------------------------------------------------
+# merge_descriptions_into_markdown: top-level
+# ---------------------------------------------------------------------------
+
+
+def test_merge_inlines_when_marker_present():
+    markdown = "Text...\n\n<!-- image -->\nImage: scan.jpeg\n\nMore text\n"
+    result = PictureExtractionResult(descriptions=[_desc(name="Im0")])
+
+    out = merge_descriptions_into_markdown(markdown, result)
+
+    assert "**Embedded image:** `scan.jpeg`" in out
+    # Nothing leaked into an appended section -- we should NOT see the
+    # appended-section heading because everything went inline.
+    assert "## Image Content" not in out
+
+
+def test_merge_appends_when_no_marker_present():
+    """Zero markers means everything goes into an appended section."""
+    markdown = "Pure text doc, no image markers.\n"
+    result = PictureExtractionResult(
+        descriptions=[_desc(name="Im0", description="An image desc.")]
+    )
+
+    out = merge_descriptions_into_markdown(markdown, result)
+
+    assert "Pure text doc" in out
+    assert "## Image Content (vision-LLM extracted)" in out
+    assert "**Embedded image:** `Im0`" in out
+
+
+def test_merge_appends_leftovers_with_distinct_heading():
+    """One marker, two descriptions -> one inline, second appended under
+    a heading that signals it's a leftover.
+    """
+    markdown = "Text\n\n<!-- image -->\nImage: a.jpeg\n\nEnd\n"
+    result = PictureExtractionResult(
+        descriptions=[
+            _desc(name="Im0", description="First"),
+            _desc(name="Im1", description="Second"),
+        ]
+    )
+
+    out = merge_descriptions_into_markdown(markdown, result)
+
+    assert "**Embedded image:** `a.jpeg`" in out  # inlined
+    assert "## Image Content (additional, no inline marker found)" in out
+    assert "**Embedded image:** `Im1`" in out  # appended
+
+
+# ---------------------------------------------------------------------------
+# describe_pictures: ocr_runner integration
+#
+# These tests cover the per-image OCR side-channel: when the caller
+# supplies an ``ocr_runner`` callable, each extracted image is sent
+# both to the vision LLM (visual description) and to the OCR runner
+# (text-in-image), in parallel. The OCR text -- if any -- is recorded
+# on the PictureDescription and rendered in the inline block.
+# ---------------------------------------------------------------------------
+
+
+async def test_describe_pictures_calls_ocr_runner_per_image(tmp_path, mocker):
+    """When an ocr_runner is provided, it's invoked once per eligible image."""
+    pdf_file = tmp_path / "report.pdf"
+    pdf_file.write_bytes(b"%PDF-1.4 fake")
+
+    img_a = _make_image_obj("Im0.jpeg", b"\xff\xd8\xff\xe0" + b"\xab" * 2000)
+    img_b = _make_image_obj("Im1.png", b"\x89PNG\r\n\x1a\n" + b"\xcd" * 2000)
+    fake_reader = MagicMock()
+    fake_reader.pages = [MagicMock(images=[img_a, img_b])]
+    mocker.patch("pypdf.PdfReader", return_value=fake_reader)
+
+    mocker.patch(
+        "app.etl_pipeline.parsers.vision_llm.parse_image_for_description",
+        new=AsyncMock(side_effect=["Visual A", "Visual B"]),
+    )
+    ocr_runner = AsyncMock(side_effect=["OCR text A", "OCR text B"])
+
+    fake_llm = MagicMock()
+    result = await describe_pictures(
+        str(pdf_file), "report.pdf", fake_llm, ocr_runner=ocr_runner
+    )
+
+    assert ocr_runner.await_count == 2
+    by_name = {d.name: d.ocr_text for d in result.descriptions}
+    assert by_name == {"Im0.jpeg": "OCR text A", "Im1.png": "OCR text B"}
+
+
+async def test_describe_pictures_runs_vision_and_ocr_in_parallel(
+    tmp_path, mocker
+):
+    """Vision LLM and OCR run concurrently per image, not sequentially.
+
+    We verify this by recording call timestamps: if both finish within
+    a small window relative to the per-call sleep, they ran in parallel.
+    """
+    import asyncio
+    import time
+
+    pdf_file = tmp_path / "report.pdf"
+    pdf_file.write_bytes(b"%PDF-1.4 fake")
+
+    img = _make_image_obj("Im0.jpeg", b"\xff\xd8\xff\xe0" + b"\xab" * 2000)
+    fake_reader = MagicMock()
+    fake_reader.pages = [MagicMock(images=[img])]
+    mocker.patch("pypdf.PdfReader", return_value=fake_reader)
+
+    sleep_each = 0.05  # 50ms per call
+
+    async def slow_vision(*args, **kwargs):
+        await asyncio.sleep(sleep_each)
+        return "Visual"
+
+    async def slow_ocr(*args, **kwargs):
+        await asyncio.sleep(sleep_each)
+        return "OCR"
+
+    mocker.patch(
+        "app.etl_pipeline.parsers.vision_llm.parse_image_for_description",
+        new=slow_vision,
+    )
+
+    fake_llm = MagicMock()
+    started = time.perf_counter()
+    result = await describe_pictures(
+        str(pdf_file), "report.pdf", fake_llm, ocr_runner=slow_ocr
+    )
+    elapsed = time.perf_counter() - started
+
+    assert len(result.descriptions) == 1
+    assert result.descriptions[0].ocr_text == "OCR"
+    # Sequential would be ~2*sleep_each. Parallel is ~1*sleep_each + overhead.
+    # Be generous with the bound so we're not flaky on slow CI.
+    assert elapsed < 1.5 * sleep_each, (
+        f"vision+OCR appear to be sequential (took {elapsed:.3f}s)"
+    )
+
+
+async def test_describe_pictures_treats_empty_ocr_as_none(tmp_path, mocker):
+    """Empty / whitespace-only OCR result is normalised to None.
+
+    This means the rendered image block won't carry an empty
+    "OCR text" section for images that contain no text at all
+    (e.g. a clean radiograph).
+    """
+    pdf_file = tmp_path / "report.pdf"
+    pdf_file.write_bytes(b"%PDF-1.4 fake")
+
+    img = _make_image_obj("scan.jpeg", b"\xff\xd8\xff\xe0" + b"\xab" * 2000)
+    fake_reader = MagicMock()
+    fake_reader.pages = [MagicMock(images=[img])]
+    mocker.patch("pypdf.PdfReader", return_value=fake_reader)
+
+    mocker.patch(
+        "app.etl_pipeline.parsers.vision_llm.parse_image_for_description",
+        new=AsyncMock(return_value="A radiograph."),
+    )
+    ocr_runner = AsyncMock(return_value="   \n  \n")
+
+    fake_llm = MagicMock()
+    result = await describe_pictures(
+        str(pdf_file), "report.pdf", fake_llm, ocr_runner=ocr_runner
+    )
+
+    assert len(result.descriptions) == 1
+    assert result.descriptions[0].ocr_text is None
+
+
+async def test_describe_pictures_swallows_ocr_runner_failure(tmp_path, mocker):
+    """An OCR runner exception must not kill the description for that image.
+
+    OCR is supplementary; the vision LLM's description is the primary
+    payload. If OCR blows up we drop the OCR field for that image and
+    keep the description.
+    """
+    pdf_file = tmp_path / "report.pdf"
+    pdf_file.write_bytes(b"%PDF-1.4 fake")
+
+    img = _make_image_obj("scan.jpeg", b"\xff\xd8\xff\xe0" + b"\xab" * 2000)
+    fake_reader = MagicMock()
+    fake_reader.pages = [MagicMock(images=[img])]
+    mocker.patch("pypdf.PdfReader", return_value=fake_reader)
+
+    mocker.patch(
+        "app.etl_pipeline.parsers.vision_llm.parse_image_for_description",
+        new=AsyncMock(return_value="A radiograph."),
+    )
+    ocr_runner = AsyncMock(side_effect=RuntimeError("OCR backend down"))
+
+    fake_llm = MagicMock()
+    result = await describe_pictures(
+        str(pdf_file), "report.pdf", fake_llm, ocr_runner=ocr_runner
+    )
+
+    assert len(result.descriptions) == 1
+    assert result.descriptions[0].description == "A radiograph."
+    assert result.descriptions[0].ocr_text is None
+    assert result.failed == 0  # the IMAGE didn't fail; only its OCR did
+
+
+async def test_describe_pictures_vision_failure_with_ocr_runner_skips_image(
+    tmp_path, mocker
+):
+    """If the vision LLM fails, the image is skipped even if OCR succeeded.
+
+    The inline block's primary purpose is the visual description; an
+    OCR-only block would be misleading (it'd look like the vision
+    pipeline ran when it didn't), so we treat vision failure as image
+    failure regardless of OCR outcome.
+    """
+    pdf_file = tmp_path / "report.pdf"
+    pdf_file.write_bytes(b"%PDF-1.4 fake")
+
+    img = _make_image_obj("scan.jpeg", b"\xff\xd8\xff\xe0" + b"\xab" * 2000)
+    fake_reader = MagicMock()
+    fake_reader.pages = [MagicMock(images=[img])]
+    mocker.patch("pypdf.PdfReader", return_value=fake_reader)
+
+    mocker.patch(
+        "app.etl_pipeline.parsers.vision_llm.parse_image_for_description",
+        new=AsyncMock(side_effect=RuntimeError("vision blew up")),
+    )
+    ocr_runner = AsyncMock(return_value="OCR text")
+
+    fake_llm = MagicMock()
+    result = await describe_pictures(
+        str(pdf_file), "report.pdf", fake_llm, ocr_runner=ocr_runner
+    )
+
+    assert result.descriptions == []
+    assert result.failed == 1
+
+
+async def test_describe_pictures_no_ocr_runner_keeps_ocr_text_none(
+    tmp_path, mocker
+):
+    """Backward compat: omitting ocr_runner produces description-only blocks."""
+    pdf_file = tmp_path / "report.pdf"
+    pdf_file.write_bytes(b"%PDF-1.4 fake")
+
+    img = _make_image_obj("Im0.jpeg", b"\xff\xd8\xff\xe0" + b"\xab" * 2000)
+    fake_reader = MagicMock()
+    fake_reader.pages = [MagicMock(images=[img])]
+    mocker.patch("pypdf.PdfReader", return_value=fake_reader)
+
+    mocker.patch(
+        "app.etl_pipeline.parsers.vision_llm.parse_image_for_description",
+        new=AsyncMock(return_value="Visual"),
+    )
+
+    fake_llm = MagicMock()
+    result = await describe_pictures(str(pdf_file), "report.pdf", fake_llm)
+
+    assert len(result.descriptions) == 1
+    assert result.descriptions[0].ocr_text is None
+
+
+# ---------------------------------------------------------------------------
+# Rendering: "OCR text" section appears iff PictureDescription.ocr_text is set
+# ---------------------------------------------------------------------------
+
+
+def _desc_with_ocr(name="Im0", description="A CT scan.", ocr_text="L  R  10mm"):
+    return PictureDescription(
+        page_number=1,
+        ordinal_in_page=0,
+        name=name,
+        sha256="aa",
+        description=description,
+        ocr_text=ocr_text,
+    )
+
+
+def test_inject_renders_ocr_section_when_ocr_text_present():
+    markdown = "Text\n\n<!-- image -->\nImage: scan.jpeg\n\nMore\n"
+    result = PictureExtractionResult(
+        descriptions=[_desc_with_ocr(name="Im0", ocr_text="L  R  10mm")]
+    )
+
+    out, n = inject_descriptions_inline(markdown, result)
+
+    assert n == 1
+    assert "**Embedded image:** `scan.jpeg`" in out
+    assert "**OCR text:**" in out
+    assert "L  R  10mm" in out
+    # OCR section comes before the visual description (literal text
+    # first, interpretation second).
+    assert out.index("**OCR text:**") < out.index("**Visual description:**")
+    # Critical: no nested-block constructs (fenced code, blockquote)
+    # that previous formats relied on -- both broke in Streamdown /
+    # PlateJS by escaping their container and dropping content.
+    assert "```" not in out
+    assert "> **" not in out
+
+
+def test_inject_renders_multiline_ocr_with_hard_breaks():
+    """Multi-line OCR uses trailing-two-spaces hard breaks so each
+    line renders on its own row, without needing a fragile fenced
+    code block or blockquote wrapper."""
+    markdown = "Text\n\n<!-- image -->\nImage: scan.jpeg\n\nMore\n"
+    ocr_multi = "Slice 24 / 60\nL\nR\n10 mm"
+    result = PictureExtractionResult(
+        descriptions=[_desc_with_ocr(name="Im0", ocr_text=ocr_multi)]
+    )
+
+    out, _ = inject_descriptions_inline(markdown, result)
+
+    # Every OCR line is present.
+    for line in ("Slice 24 / 60", "L", "R", "10 mm"):
+        assert line in out
+    # Non-last OCR lines get the trailing two-space hard break.
+    assert "Slice 24 / 60  \n" in out
+    assert "\nL  \n" in out
+    assert "\nR  \n" in out
+    # Last OCR line must NOT carry the two-space hard break (no stray <br>).
+    assert "10 mm  \n" not in out
+    assert "10 mm\n" in out
+
+
+def test_render_appended_renders_ocr_section_when_ocr_text_present():
+    descriptions = [
+        _desc_with_ocr(
+            name="MM-130-a.jpeg",
+            description="Axial CT.",
+            ocr_text="Slice 24 / 60",
+        ),
+    ]
+    rendered = render_appended_section(descriptions)
+
+    assert "**OCR text:**" in rendered
+    assert "Slice 24 / 60" in rendered
+    assert "Axial CT." in rendered
+
+
+def test_render_omits_ocr_section_when_ocr_text_is_none():
+    descriptions = [_desc(name="Im0", description="A clean radiograph.")]
+    rendered = render_appended_section(descriptions)
+
+    assert "**Embedded image:** `Im0`" in rendered
+    assert "**OCR text:**" not in rendered
+    assert "**Visual description:**" in rendered
+    # No raw HTML / blockquote prefixes.
+    assert "<image" not in rendered
+    assert "> **" not in rendered
+
+
+# ---------------------------------------------------------------------------
+# inject_descriptions_inline: <figure> blocks (layout-aware parsers)
+#
+# Azure Document Intelligence's ``prebuilt-layout`` and LlamaCloud
+# premium both emit ``<figure>...</figure>`` blocks that already contain
+# the parser's own OCR of the figure (chart bar values, axis labels,
+# inline ``<figcaption>``, embedded ``<table>`` for tabular figures).
+# That parser-side content is useful for retrieval on its own, so we
+# PRESERVE the figure verbatim and append our vision-LLM block
+# immediately after rather than substituting for it.
+# ---------------------------------------------------------------------------
+
+
+def test_inject_appends_block_after_figure_preserving_parser_content():
+    """Figure block stays intact; vision-LLM block goes right after it."""
+    markdown = (
+        "Some narrative text.\n\n"
+        "<figure>\n\n"
+        "Republican\n68\nDemocrat\n30\n"
+        "\n</figure>\n\n"
+        "Following paragraph.\n"
+    )
+    result = PictureExtractionResult(
+        descriptions=[_desc(name="Im0", description="Bar chart of party ID.")]
+    )
+
+    out, n = inject_descriptions_inline(markdown, result)
+
+    assert n == 1
+    # Original figure is preserved verbatim -- the parser's OCR'd
+    # numbers must still be searchable.
+    assert "<figure>" in out
+    assert "</figure>" in out
+    assert "Republican" in out and "68" in out
+    # Our vision-LLM block follows the figure, not before / inside it.
+    assert "**Embedded image:** `Im0`" in out
+    assert "Bar chart of party ID." in out
+    figure_close = out.index("</figure>")
+    embedded_at = out.index("**Embedded image:** `Im0`")
+    assert figure_close < embedded_at, "block must be appended AFTER </figure>"
+    # Surrounding narrative is preserved.
+    assert "Some narrative text." in out
+    assert "Following paragraph." in out
+
+
+def test_inject_handles_multiple_figures_in_document_order():
+    """N figures + N descriptions: each pair lands in the right place."""
+    markdown = (
+        "Page 1\n\n<figure>\nChart A bars\n</figure>\n\n"
+        "Between\n\n<figure>\nChart B bars\n</figure>\n\n"
+        "End.\n"
+    )
+    result = PictureExtractionResult(
+        descriptions=[
+            PictureDescription(
+                page_number=1, ordinal_in_page=0, name="Im0", sha256="aa",
+                description="Description of chart A.",
+            ),
+            PictureDescription(
+                page_number=2, ordinal_in_page=0, name="Im1", sha256="bb",
+                description="Description of chart B.",
+            ),
+        ]
+    )
+
+    out, n = inject_descriptions_inline(markdown, result)
+
+    assert n == 2
+    # Both figures preserved; both descriptions inlined; order matches.
+    assert out.count("<figure>") == 2
+    assert out.count("</figure>") == 2
+    assert "Description of chart A." in out
+    assert "Description of chart B." in out
+    assert out.index("Description of chart A.") < out.index(
+        "Description of chart B."
+    )
+    # Each description appears AFTER its corresponding </figure>.
+    first_close = out.index("</figure>")
+    assert first_close < out.index("Description of chart A.")
+    second_close = out.index("</figure>", first_close + 1)
+    assert second_close < out.index("Description of chart B.")
+
+
+def test_inject_figures_with_attributes_and_nested_tags():
+    """``<figure>`` with attributes and nested tags is matched and preserved."""
+    markdown = (
+        '<figure id="fig-3" class="chart">\n'
+        '<figcaption>Source: Pew Research</figcaption>\n'
+        "<table><tr><td>Republican</td><td>57</td></tr></table>\n"
+        "</figure>\n"
+    )
+    result = PictureExtractionResult(
+        descriptions=[_desc(name="Im0", description="Survey table.")]
+    )
+
+    out, n = inject_descriptions_inline(markdown, result)
+
+    assert n == 1
+    # All nested HTML is preserved (chunking will pick it up).
+    assert 'id="fig-3"' in out
+    assert "<figcaption>Source: Pew Research</figcaption>" in out
+    assert "<table>" in out and "Republican" in out and "57" in out
+    # Our block sits after the closing tag.
+    assert out.index("</figure>") < out.index("**Embedded image:** `Im0`")
+
+
+def test_inject_figures_more_descriptions_than_figures_returns_remaining():
+    """Three descriptions, one figure -> one inlined, two left for caller."""
+    markdown = "Text.\n<figure>\nbar values\n</figure>\nMore.\n"
+    result = PictureExtractionResult(
+        descriptions=[
+            _desc(name="Im0", description="First desc."),
+            _desc(name="Im1", description="Second desc."),
+            _desc(name="Im2", description="Third desc."),
+        ]
+    )
+
+    out, n = inject_descriptions_inline(markdown, result)
+
+    assert n == 1
+    assert "First desc." in out
+    # Leftovers are the caller's job; inject_descriptions_inline does
+    # not append them on its own.
+    assert "Second desc." not in out
+    assert "Third desc." not in out
+
+
+def test_inject_figures_more_figures_than_descriptions_leaves_extras_untouched():
+    """Two figures, one description -> first figure enriched, second left raw."""
+    markdown = (
+        "<figure>\nfigure 1 content\n</figure>\n"
+        "<figure>\nfigure 2 content\n</figure>\n"
+    )
+    result = PictureExtractionResult(
+        descriptions=[_desc(name="Im0", description="Only description.")]
+    )
+
+    out, n = inject_descriptions_inline(markdown, result)
+
+    assert n == 1
+    # Both figures still present; only the first one was enriched.
+    assert out.count("<figure>") == 2
+    assert "Only description." in out
+    # Second figure has no embedded-image block immediately after it.
+    second_open = out.index("<figure>", out.index("<figure>") + 1)
+    second_close = out.index("</figure>", second_open)
+    after_second = out[second_close:]
+    assert "**Embedded image:**" not in after_second
+
+
+def test_merge_inlines_at_figure_boundary():
+    """Top-level helper does the right thing with figures (no leftover section)."""
+    markdown = "Lead.\n<figure>\nbars\n</figure>\nTrailer.\n"
+    result = PictureExtractionResult(
+        descriptions=[_desc(name="Im0", description="Bar chart.")]
+    )
+
+    out = merge_descriptions_into_markdown(markdown, result)
+
+    # Inline succeeded -> no appended-section heading.
+    assert "## Image Content" not in out
+    assert "Bar chart." in out
+    assert "<figure>" in out and "</figure>" in out
+
+
+def test_inject_figures_then_falls_through_to_docling_marker():
+    """Mixed-marker doc: figure consumed first, then Docling placeholder.
+
+    Defensive -- single docs are usually one parser's output, but if a
+    pipeline ever stitches two parsers' markdowns together the inliner
+    should still place each description.
+    """
+    markdown = (
+        "<figure>\nChart bars: 50, 40, 30\n</figure>\n\n"
+        "Later in the doc:\n\n"
+        "<!-- image -->\nImage: scan.jpeg\n\n"
+        "End.\n"
+    )
+    result = PictureExtractionResult(
+        descriptions=[
+            _desc(name="Im0", description="Chart description."),
+            _desc(name="Im1", description="Scan description."),
+        ]
+    )
+
+    out, n = inject_descriptions_inline(markdown, result)
+
+    assert n == 2
+    # Figure preserved + augmented.
+    assert "<figure>" in out and "Chart bars: 50, 40, 30" in out
+    assert "Chart description." in out
+    # Docling placeholder + caption replaced.
+    assert "<!-- image -->" not in out
+    assert "Image: scan.jpeg" not in out
+    assert "**Embedded image:** `scan.jpeg`" in out
+    assert "Scan description." in out
diff --git a/surfsense_backend/tests/unit/etl_pipeline/test_vision_llm.py b/surfsense_backend/tests/unit/etl_pipeline/test_vision_llm.py
new file mode 100644
index 000000000..1293ff757
--- /dev/null
+++ b/surfsense_backend/tests/unit/etl_pipeline/test_vision_llm.py
@@ -0,0 +1,146 @@
+"""Unit tests for the vision_llm parser helpers.
+
+Two helpers exist:
+
+- :func:`parse_with_vision_llm` -- single-shot for standalone image
+  uploads (.png/.jpg/etc). Returns combined markdown (description +
+  verbatim OCR mixed) since the image *is* the document.
+- :func:`parse_image_for_description` -- per-image-in-PDF call. Returns
+  visual description only; OCR is the ETL service's job.
+"""
+
+from __future__ import annotations
+
+from unittest.mock import AsyncMock, MagicMock
+
+import pytest
+
+pytestmark = pytest.mark.unit
+
+
+# ---------------------------------------------------------------------------
+# parse_with_vision_llm: legacy single-shot path
+# ---------------------------------------------------------------------------
+
+
+async def test_parse_with_vision_llm_returns_combined_markdown(tmp_path):
+    """Standalone image uploads still go through the combined-markdown path."""
+    from app.etl_pipeline.parsers.vision_llm import parse_with_vision_llm
+
+    img = tmp_path / "scan.png"
+    img.write_bytes(b"\x89PNG\r\n\x1a\n" + b"\x00" * 200)
+
+    fake_response = MagicMock()
+    fake_response.content = "# A scan of something."
+    fake_llm = AsyncMock()
+    fake_llm.ainvoke.return_value = fake_response
+
+    out = await parse_with_vision_llm(str(img), "scan.png", fake_llm)
+    assert out == "# A scan of something."
+    fake_llm.ainvoke.assert_awaited_once()
+
+
+async def test_parse_with_vision_llm_rejects_empty_response(tmp_path):
+    """An empty model response raises rather than silently returning blanks."""
+    from app.etl_pipeline.parsers.vision_llm import parse_with_vision_llm
+
+    img = tmp_path / "scan.png"
+    img.write_bytes(b"\x89PNG\r\n\x1a\n" + b"\x00" * 200)
+
+    fake_response = MagicMock()
+    fake_response.content = ""
+    fake_llm = AsyncMock()
+    fake_llm.ainvoke.return_value = fake_response
+
+    with pytest.raises(ValueError, match="empty content"):
+        await parse_with_vision_llm(str(img), "scan.png", fake_llm)
+
+
+# ---------------------------------------------------------------------------
+# parse_image_for_description: per-image-in-PDF, description only
+# ---------------------------------------------------------------------------
+
+
+async def test_parse_image_for_description_returns_description(tmp_path):
+    """Description-only path returns the model's markdown unchanged."""
+    from app.etl_pipeline.parsers.vision_llm import parse_image_for_description
+
+    img = tmp_path / "scan.png"
+    img.write_bytes(b"\x89PNG\r\n\x1a\n" + b"\x00" * 200)
+
+    fake_response = MagicMock()
+    fake_response.content = "Axial CT showing a large cystic mass."
+    fake_llm = AsyncMock()
+    fake_llm.ainvoke.return_value = fake_response
+
+    out = await parse_image_for_description(str(img), "scan.png", fake_llm)
+    assert out == "Axial CT showing a large cystic mass."
+
+
+async def test_parse_image_for_description_uses_description_only_prompt(tmp_path):
+    """The prompt explicitly tells the model NOT to transcribe text.
+
+    This is the contract that lets us drop OCR from the response: the
+    ETL pipeline already has the text (from page-level OCR), so asking
+    the vision LLM for it would be redundant cost.
+    """
+    from app.etl_pipeline.parsers.vision_llm import parse_image_for_description
+
+    img = tmp_path / "scan.png"
+    img.write_bytes(b"\x89PNG\r\n\x1a\n" + b"\x00" * 200)
+
+    fake_response = MagicMock()
+    fake_response.content = "A description"
+    fake_llm = AsyncMock()
+    fake_llm.ainvoke.return_value = fake_response
+
+    await parse_image_for_description(str(img), "scan.png", fake_llm)
+
+    # The prompt is the first text part of the message we sent.
+    sent_messages = fake_llm.ainvoke.call_args.args[0]
+    prompt_text = sent_messages[0].content[0]["text"].lower()
+    assert "describe what this image visually depicts" in prompt_text
+    assert "do not transcribe text" in prompt_text
+
+
+async def test_parse_image_for_description_rejects_empty(tmp_path):
+    """Empty response surfaces as ValueError so the caller can skip the image."""
+    from app.etl_pipeline.parsers.vision_llm import parse_image_for_description
+
+    img = tmp_path / "scan.png"
+    img.write_bytes(b"\x89PNG\r\n\x1a\n" + b"\x00" * 200)
+
+    fake_response = MagicMock()
+    fake_response.content = "   "  # whitespace-only counts as empty
+    fake_llm = AsyncMock()
+    fake_llm.ainvoke.return_value = fake_response
+
+    with pytest.raises(ValueError, match="empty content"):
+        await parse_image_for_description(str(img), "scan.png", fake_llm)
+
+
+# ---------------------------------------------------------------------------
+# Image size + extension validation (shared by both paths)
+# ---------------------------------------------------------------------------
+
+
+def test_image_to_data_url_rejects_oversized(tmp_path):
+    """Images larger than 5 MB raise before any LLM call is made."""
+    from app.etl_pipeline.parsers.vision_llm import _image_to_data_url
+
+    big = tmp_path / "huge.png"
+    big.write_bytes(b"\x89PNG" + b"\x00" * (6 * 1024 * 1024))
+
+    with pytest.raises(ValueError, match="Image too large"):
+        _image_to_data_url(str(big))
+
+
+def test_image_to_data_url_rejects_unsupported_extension(tmp_path):
+    """Unknown extensions raise rather than guessing a MIME type."""
+    from app.etl_pipeline.parsers.vision_llm import _image_to_data_url
+
+    weird = tmp_path / "scan.xyz"
+    weird.write_bytes(b"\x00" * 100)
+
+    with pytest.raises(ValueError, match="Unsupported image extension"):
+        _image_to_data_url(str(weird))
diff --git a/surfsense_evals/.env.example b/surfsense_evals/.env.example
new file mode 100644
index 000000000..632e77d8a
--- /dev/null
+++ b/surfsense_evals/.env.example
@@ -0,0 +1,65 @@
+# surfsense_evals — environment template.
+#
+# Copy this file to `.env` (in the surfsense_evals/ project root or your
+# CWD) and fill in the values. `python-dotenv` loads it automatically
+# the first time `core.config` is imported, so every CLI subcommand
+# (`setup`, `ingest`, `run`, `report`, `teardown`, `models list`, …)
+# will pick the values up.
+#
+#   cp .env.example .env
+#   # then edit .env with your values
+#
+# `.env` is gitignored — never commit real secrets.
+
+# ---------------------------------------------------------------------------
+# 1. Backend target — REQUIRED (default works for a local dev backend)
+# ---------------------------------------------------------------------------
+SURFSENSE_API_BASE=http://localhost:8000
+
+# ---------------------------------------------------------------------------
+# 2. OpenRouter — REQUIRED for any `run` invocation
+# ---------------------------------------------------------------------------
+# The `native_pdf` arm calls OpenRouter directly; the `surfsense` arm
+# routes through SurfSense which uses the same key under the hood.
+OPENROUTER_API_KEY=sk-or-...
+
+# Override only if you proxy OpenRouter through a private gateway:
+# OPENROUTER_BASE_URL=https://openrouter.ai/api/v1
+
+# Multimodal benchmarks (medxpertqa, mmlongbench) require a vision-capable
+# slug. Recommended (verify in your catalog with `models list --grep ...`):
+#   anthropic/claude-sonnet-4.5    (default recommendation)
+#   anthropic/claude-opus-4.7      (strongest)
+#   openai/gpt-5                   (top-tier vision)
+#   google/gemini-2.5-pro          (1M-token context, best for long PDFs)
+# DO NOT use openai/gpt-5.4-mini for image-bearing benchmarks — it's
+# text-only on PDF content and the runner emits a warning if pinned.
+
+# ---------------------------------------------------------------------------
+# 3. Auth — pick EXACTLY ONE of the two modes below
+# ---------------------------------------------------------------------------
+
+# --- Mode A: LOCAL (backend started with AUTH_TYPE=LOCAL)
+# The harness POSTs these to /auth/jwt/login automatically.
+# SURFSENSE_USER_EMAIL=you@example.com
+# SURFSENSE_USER_PASSWORD=...
+
+# --- Mode B: GOOGLE OAuth (or any pre-issued JWT)
+# Open the SurfSense web UI in your browser, log in via Google, then in
+# DevTools → Application → Local Storage copy:
+#   surfsense_bearer_token  → SURFSENSE_JWT
+#   surfsense_refresh_token → SURFSENSE_REFRESH_TOKEN  (optional, enables
+#                                                       auto-refresh on 401)
+# SURFSENSE_JWT=eyJhbGciOi...
+# SURFSENSE_REFRESH_TOKEN=eyJhbGciOi...
+
+# ---------------------------------------------------------------------------
+# 4. Filesystem paths — OPTIONAL (defaults below)
+# ---------------------------------------------------------------------------
+# Where datasets, rendered PDFs, ingestion id maps, run outputs, and
+# state.json live. Default: <surfsense_evals>/data/
+# EVAL_DATA_DIR=./data
+
+# Where generated reports (summary.md / summary.json) get written.
+# Default: <surfsense_evals>/reports/
+# EVAL_REPORTS_DIR=./reports
diff --git a/surfsense_evals/.gitignore b/surfsense_evals/.gitignore
new file mode 100644
index 000000000..0f71d2635
--- /dev/null
+++ b/surfsense_evals/.gitignore
@@ -0,0 +1,29 @@
+# Python bytecode + caches
+__pycache__/
+*.py[cod]
+*.pyo
+
+# Editable-install / build artifacts
+*.egg-info/
+build/
+dist/
+.eggs/
+
+# Virtual envs (uv venv default + common alternates)
+.venv/
+venv/
+env/
+
+# Tooling caches
+.pytest_cache/
+.ruff_cache/
+.mypy_cache/
+.coverage
+.coverage.*
+htmlcov/
+
+# Local secrets — keep `.env.example` tracked, never the real `.env`.
+.env
+.env.local
+.env.*.local
+!.env.example
diff --git a/surfsense_evals/README.md b/surfsense_evals/README.md
new file mode 100644
index 000000000..c6314af80
--- /dev/null
+++ b/surfsense_evals/README.md
@@ -0,0 +1,228 @@
+# SurfSense Evals
+
+Domain-agnostic eval harness for SurfSense. Each benchmark is a Python subpackage under `suites/<domain>/<benchmark>/` that self-registers with the CLI; `core/` is the shared infrastructure (HTTP clients, arms, parsers, metrics, report writer, registry). The harness talks to SurfSense over HTTP only — it does **not** import any backend Python module — so it ships in its own venv and never bloats the FastAPI runtime image.
+
+## Benchmarks
+
+| Benchmark                       | Shape                                            | Vision required? | Default ingest             |
+|---------------------------------|--------------------------------------------------|------------------|----------------------------|
+| `medical/medxpertqa` (headline) | Native PDF vs SurfSense head-to-head, MCQ        | yes              | `vision=on, mode=basic`    |
+| `medical/mirage`                | SurfSense single-arm, MCQ                        | no               | `vision=off, mode=basic`   |
+| `medical/cure`                  | SurfSense single-arm retrieval (Recall/MRR/nDCG) | no               | `vision=off, mode=basic`   |
+| `multimodal_doc/mmlongbench`    | Native PDF vs SurfSense head-to-head, open-ended | yes              | `vision=on, mode=basic`    |
+
+Future domains (`legal/`, `finance/`, `code/`, `scientific/`) drop into `suites/` without touching `core/` or the CLI.
+
+## Install + auth
+
+```bash
+uv pip install -e ./surfsense_evals
+cp surfsense_evals/.env.example surfsense_evals/.env
+# Edit .env: SURFSENSE_API_BASE, OPENROUTER_API_KEY, and ONE of:
+#   LOCAL  → SURFSENSE_USER_EMAIL + SURFSENSE_USER_PASSWORD
+#   GOOGLE → SURFSENSE_JWT (+ optional SURFSENSE_REFRESH_TOKEN)
+#            (lift both from browser localStorage after a normal Google login)
+```
+
+## Step-by-step: run all four benchmarks
+
+The medical and multimodal_doc suites each get their own SearchSpace and pinned model, so they're independent — run them in any order. Both head-to-head benchmarks (`medxpertqa`, `mmlongbench`) require a **vision-capable** OpenRouter slug; pinning a text-only one (e.g. `openai/gpt-5.4-mini`) silently drops images and the runner emits a warning.
+
+Recommended vision slugs (use `models list --grep <name>` to confirm one): `anthropic/claude-sonnet-4.5` (balanced cost), `anthropic/claude-opus-4.7` (strongest reasoning), `openai/gpt-5` (top-tier vision), `google/gemini-2.5-pro` (best for long PDFs, 1M-token context).
+
+```bash
+# 0. (optional) discover what's registered
+python -m surfsense_evals suites list
+python -m surfsense_evals benchmarks list
+
+# 1. MEDICAL SUITE — one SearchSpace, three benchmarks
+python -m surfsense_evals setup --suite medical --provider-model anthropic/claude-sonnet-4.5
+
+#  1a. headline head-to-head: Native PDF (vision) vs SurfSense (vision RAG)
+#      Downloads dev+test JSONL + images.zip, renders one PDF per question
+#      (case + table + images + 5 options), uploads with use_vision_llm=True.
+python -m surfsense_evals ingest medical medxpertqa --split test
+python -m surfsense_evals run    medical medxpertqa --concurrency 4
+
+#  1b. MIRAGE — single-arm SurfSense MCQ accuracy
+#      (MMLU-Med / MedQA-US / MedMCQA / PubMedQA / BioASQ)
+python -m surfsense_evals ingest medical mirage
+python -m surfsense_evals run    medical mirage
+
+#  1c. CUREv1 — single-arm SurfSense retrieval (Recall@k / MRR / nDCG@10)
+python -m surfsense_evals ingest medical cure --lang en
+python -m surfsense_evals run    medical cure --lang en
+
+#  1d. write reports/medical/<UTC-ts>/summary.{md,json}
+python -m surfsense_evals report --suite medical
+
+# 2. MULTIMODAL_DOC SUITE — long PDFs with embedded images, charts, tables
+python -m surfsense_evals setup  --suite multimodal_doc --provider-model google/gemini-2.5-pro
+python -m surfsense_evals ingest multimodal_doc mmlongbench           # ~660MB, resumable
+python -m surfsense_evals run    multimodal_doc mmlongbench --concurrency 4
+python -m surfsense_evals report --suite multimodal_doc
+
+# 3. CLEANUP — soft-deletes the SearchSpaces; rendered PDFs stay cached
+python -m surfsense_evals teardown --suite medical
+python -m surfsense_evals teardown --suite multimodal_doc
+```
+
+## Asymmetric scenarios — the "vision-extract once, answer cheap" play
+
+The walkthrough above is `--scenario head-to-head` (default): both arms answer with the same vision-capable slug. SurfSense's actual architectural value-prop is that the **ingestion-time vision LLM and the runtime LLM are completely independent** — you can pay a vision LLM *once*, at ingest, to convert every embedded image into text (per-image OCR **and** semantic description, inlined where the image actually appears in the document — see [What `--use-vision-llm` produces](#what---use-vision-llm-produces) below). Then every query is served by a cheap text-only model that sees that extracted text natively. Two extra scenarios make this explicit:
+
+| `--scenario`       | Native arm answers with                | SurfSense arm answers with     | Question being measured                                                                  |
+|--------------------|----------------------------------------|--------------------------------|------------------------------------------------------------------------------------------|
+| `head-to-head`     | `--provider-model` (vision)            | `--provider-model` (vision)    | Pure RAG quality at parity. (Default.)                                                   |
+| `symmetric-cheap`  | `--provider-model` (cheap, text-only)  | `--provider-model` (same)      | Does pre-extracted image context let a non-vision LLM reason over image-heavy docs?      |
+| `cost-arbitrage`   | `--native-arm-model` (vision)          | `--provider-model` (cheap)     | How close does SurfSense get to a vision-native baseline at a fraction of per-query cost?|
+
+In all three modes the **ingest-time** vision LLM is set on the SearchSpace's `vision_llm_config_id` (auto-picked from the strongest registered global OpenRouter vision config — `claude-sonnet-4.5` > `claude-opus-4.7` > `gpt-5` > `gemini-2.5-pro`, override with `--vision-llm <slug>`). What changes is which slug the *answering* models hit per arm.
+
+### Ingest with vision, evaluate with a non-vision LLM (`symmetric-cheap`)
+
+This is the answer to *"does SurfSense give a non-vision LLM enough context to reason over image-heavy docs?"*. Both arms hit the same cheap text-only slug. The native arm is structurally blind to images (text-only LLM + raw PDFs). The SurfSense arm reads chunks that already contain the per-image OCR and visual descriptions, written there by the vision LLM at ingest time.
+
+```bash
+python -m surfsense_evals setup --suite medical \
+  --scenario symmetric-cheap \
+  --provider-model openai/gpt-5.4-mini
+  # vision LLM at ingest = auto-picked (claude-sonnet-4.5 by default)
+  # answer LLM for BOTH arms = openai/gpt-5.4-mini (text-only)
+
+python -m surfsense_evals ingest medical medxpertqa --split test  # vision=on by default
+python -m surfsense_evals run    medical medxpertqa --concurrency 4
+python -m surfsense_evals report --suite medical
+# Δ accuracy on image-required MCQs is the headline number; native arm
+# baseline is "what a text-only LLM gets without seeing the images".
+```
+
+### Cheap SurfSense vs vision-native baseline (`cost-arbitrage`)
+
+```bash
+python -m surfsense_evals setup --suite medical \
+  --scenario cost-arbitrage \
+  --provider-model openai/gpt-5.4-mini \
+  --native-arm-model anthropic/claude-sonnet-4.5
+  # vision LLM at ingest = auto-picked claude-sonnet-4.5
+  # native arm = sonnet (vision); SurfSense arm = gpt-5.4-mini (text-only)
+
+python -m surfsense_evals ingest medical medxpertqa --split test
+python -m surfsense_evals run    medical medxpertqa --concurrency 4
+python -m surfsense_evals report --suite medical
+# Report header reads:
+#   Scenario: cost-arbitrage — native arm answers with `anthropic/claude-sonnet-4.5`
+#   (vision); SurfSense answers with `openai/gpt-5.4-mini` over chunks vision-extracted
+#   at ingest by `anthropic/claude-sonnet-4.5`.
+```
+
+Notes:
+- `cost-arbitrage` requires both `--provider-model` (the cheap SurfSense slug) AND `--native-arm-model <vision slug>`.
+- `--vision-llm <slug>` is optional; if omitted the harness queries `GET /api/v1/global-vision-llm-configs` and auto-picks the strongest registered one. Pass `--no-vision-llm-setup` if you want to keep whatever vision config is already attached to the SearchSpace.
+- The runner's "looks text-only" warning is suppressed (or relabelled as informational) for `symmetric-cheap` so intentional asymmetry doesn't read as a misconfiguration.
+- All three scenario fields (`scenario`, `provider_model`, `native_arm_model`, `vision_provider_model`) are persisted to `state.json` and recorded in `run_artifact.extra` + the report header — no need to retrace what was set.
+
+## Per-benchmark useful flags
+
+`medical/medxpertqa` (`run`):
+- `--split {test,dev,all}` — pick a subset (default `test`)
+- `--task "Diagnosis"` / `--body-system "Cardiovascular"` — slice the report
+- `--require-images` — drop rare rows where every image filename failed to resolve
+- `--n 100` — quick smoke run
+- `--no-mentions` — let SurfSense retrieve unscoped ("did the @-mention matter?")
+
+`multimodal_doc/mmlongbench`:
+- `--max-docs N` (ingest) — cap downloads at the first N unique PDFs
+- `--format {str,int,float,list,none}` (run) — slice by answer format; `none` = the ~22% intentionally unanswerable hallucination probes
+- `--skip-unanswerable` (run) — drop unanswerable questions
+- `--docs <a.pdf>,<b.pdf>` (run) — scope to specific docs
+
+## Ingestion knobs (vision LLM, processing mode, summarize)
+
+The harness exposes `POST /api/v1/documents/fileupload`'s three knobs on every `ingest` subcommand:
+
+| Flag pair                                  | Effect                                                                                  |
+|--------------------------------------------|-----------------------------------------------------------------------------------------|
+| `--use-vision-llm` / `--no-vision-llm`     | Walk every embedded image in the PDF and inline image-derived text at the image's position (see below). |
+| `--processing-mode {basic,premium}`        | `premium` carries a 10× page multiplier and routes to a stronger ETL (e.g. LlamaCloud). |
+| `--should-summarize` / `--no-summarize`    | Generate a per-document summary at ingest.                                              |
+
+The "Default ingest" column in the benchmarks table is what runs if you don't pass any flag. Whatever was actually used is recorded as a `__settings__` header in the doc map (`data/<suite>/maps/<benchmark>_*_map.jsonl`) and as `extra.ingest_settings` in `run_artifact.json`, then surfaced in the report — no need to hunt through CLI history.
+
+> The backend's `ETL_SERVICE` env var (`DOCLING` | `UNSTRUCTURED` | `LLAMACLOUD`) is **not** per-upload. Restart the backend with a different `ETL_SERVICE` and re-ingest to compare ETLs (route through `--processing-mode premium` if your backend uses that mode for the stronger ETL).
+
+### What `--use-vision-llm` produces
+
+When vision is on, the backend's ETL pipeline (`app/etl_pipeline/picture_describer.py`) does, **per embedded image** in the PDF:
+
+1. Extract the raw image bytes via `pypdf` (deduped by sha256, size-capped to match the vision LLM's per-image limit).
+2. **Per-image OCR** — re-feed the image as a standalone upload through the configured ETL service (Docling / Azure DI / LlamaCloud) with `vision_llm=None`, so the ETL's OCR engine extracts the literal text-in-image.
+3. **Visual description** — call the vision LLM on the image with a description-only prompt (it's explicitly told *not* to transcribe text — that's OCR's job). Steps 2 and 3 run in parallel per image.
+4. Splice a horizontal-rule-delimited section **at the image's original position** in the parser markdown (replacing Docling's `<!-- image -->` placeholder + caption, or the bare `Image: <name>` caption a stripped-image parser leaves behind):
+
+   ```markdown
+   ---
+
+   **Embedded image:** `MM-130-a.jpeg`
+
+   **OCR text:**
+   Slice 24 / 60
+   L  R
+
+   **Visual description:**
+
+   - Axial contrast-enhanced CT showing a large cystic mass in the left upper quadrant.
+   - Mass effect on the adjacent stomach; left kidney displaced inferiorly.
+
+   ---
+   ```
+
+This is what makes `--scenario symmetric-cheap` and `--scenario cost-arbitrage` work: a non-vision LLM reading SurfSense's chunks sees the image's text and semantic content as plain markdown, alongside the surrounding case text, in the same retrieved chunk. Without it the cheap LLM would have nothing extra to read.
+
+### A/B testing the same corpus with different settings
+
+SurfSense dedupes uploads by `(filename, search_space_id)` — **not** by content hash and **not** by ingestion settings. Re-uploading the same filename to the same SearchSpace with a different `--use-vision-llm` flag silently skips re-processing. Give each variant its own SearchSpace:
+
+```bash
+# Baseline arm (vision off)
+python -m surfsense_evals setup    --suite medical --provider-model anthropic/claude-sonnet-4.5
+python -m surfsense_evals ingest   medical medxpertqa --no-vision-llm
+python -m surfsense_evals run      medical medxpertqa --n 100
+python -m surfsense_evals teardown --suite medical
+
+# Vision arm (the benchmark default)
+python -m surfsense_evals setup    --suite medical --provider-model anthropic/claude-sonnet-4.5
+python -m surfsense_evals ingest   medical medxpertqa
+python -m surfsense_evals run      medical medxpertqa --n 100
+python -m surfsense_evals report   --suite medical
+```
+
+Both runs land in `data/medical/runs/<ts>/medxpertqa/` with their settings recorded; rendered PDFs stay cached under `data/medical/medxpertqa/pdfs/` so the second `ingest` is upload-only.
+
+## Environment variables
+
+- `SURFSENSE_API_BASE` (default `http://localhost:8000`)
+- `OPENROUTER_API_KEY` — required for the `native_pdf` arm and for `models list`
+- One of `SURFSENSE_USER_EMAIL` + `SURFSENSE_USER_PASSWORD` (LOCAL), **or** `SURFSENSE_JWT` (+ optional `SURFSENSE_REFRESH_TOKEN`) for GOOGLE/pre-issued JWT
+- `EVAL_DATA_DIR` (default `<project>/data`) — datasets, rendered PDFs, ingestion id maps, run outputs, `state.json`
+- `EVAL_REPORTS_DIR` (default `<project>/reports`)
+- `OPENROUTER_BASE_URL` (default `https://openrouter.ai/api/v1`) — only if you proxy OpenRouter
+
+## Adding a new domain suite
+
+1. Create `surfsense_evals/src/surfsense_evals/suites/<domain>/<benchmark>/` with `__init__.py`, `ingest.py`, `runner.py`, optional `prompt.py`.
+2. Implement a `Benchmark` subclass (see `core/registry.py`); compose with `core.clients.*`, `core.arms.*`, `core.parse.*`, `core.metrics.*`.
+3. Call `register(MyBenchmark())` at the bottom of `<benchmark>/__init__.py`. Auto-discovery picks it up; `setup --suite <domain>` and `ingest/run <domain> <benchmark>` work immediately.
+
+Each suite gets its own SearchSpace (`eval-<suite>-<UTC-ts>`), `state.json` slot, data dir, reports dir, and pinned LLM. Suites never share a SearchSpace.
+
+## Out of scope (follow-up PRs)
+
+- Docker service for `docker compose run evals run medical medxpertqa`.
+- Multi-model sweeps (one slug per `setup` for now; aggregate reports come later).
+- A long-context-stuffing arm (give the model the same retrieved chunks SurfSense saw).
+- LLM-judge grader for MMLongBench-Doc (paper uses GPT-4 as judge; we ship a deterministic rule-based grader).
+- MedXpertQA-MM accuracy by image modality — dataset doesn't tag modality directly; we slice by `medical_task` and `body_system`.
+- A `--slot <name>` flag that decouples the state-slot key from the benchmark registry's `suite` attribute, so parallel SearchSpaces with different ingestion settings can coexist on the same benchmark without `teardown` between A/B arms.
+
+See `c:/Users/91882/.cursor/plans/medical_rag_evals_(mirage_+_curev1)_e797a324.plan.md` for the full design rationale.
diff --git a/surfsense_evals/data/.gitignore b/surfsense_evals/data/.gitignore
new file mode 100644
index 000000000..d6b7ef32c
--- /dev/null
+++ b/surfsense_evals/data/.gitignore
@@ -0,0 +1,2 @@
+*
+!.gitignore
diff --git a/surfsense_evals/pyproject.toml b/surfsense_evals/pyproject.toml
new file mode 100644
index 000000000..a23e8a8be
--- /dev/null
+++ b/surfsense_evals/pyproject.toml
@@ -0,0 +1,60 @@
+[project]
+name = "surfsense-evals"
+version = "0.1.0"
+description = "Domain-agnostic evaluation harness for SurfSense (medical RAG suite ships first; legal/finance/code suites slot in under suites/)."
+readme = "README.md"
+requires-python = ">=3.12"
+license = { text = "Apache-2.0" }
+authors = [{ name = "SurfSense" }]
+
+dependencies = [
+    "httpx>=0.27.0",
+    "httpx-sse>=0.4.0",
+    "datasets>=2.21.0",
+    "huggingface_hub>=0.24.0",
+    "reportlab>=4.0.0",
+    "Pillow>=10.0.0",
+    "pyarrow>=15.0.0",
+    "pydantic>=2.6.0",
+    "tqdm>=4.66.0",
+    "numpy>=1.26.0",
+    "scikit-learn>=1.4.0",
+    "scipy>=1.12.0",
+    "python-dotenv>=1.0.0",
+    "rich>=13.7.0",
+    "trafilatura>=1.12.0",
+]
+
+[project.optional-dependencies]
+dev = [
+    "pytest>=8.0.0",
+    "pytest-asyncio>=0.23.0",
+    "respx>=0.21.0",
+    "ruff>=0.5.0",
+]
+
+[project.scripts]
+surfsense-evals = "surfsense_evals.core.cli:main"
+
+[build-system]
+requires = ["setuptools>=61.0", "wheel"]
+build-backend = "setuptools.build_meta"
+
+[tool.setuptools.packages.find]
+where = ["src"]
+include = ["surfsense_evals*"]
+
+[tool.pytest.ini_options]
+asyncio_mode = "auto"
+testpaths = ["tests"]
+markers = [
+    "integration: opt-in tests that hit a live SurfSense instance (run with `-m integration`)",
+]
+
+[tool.ruff]
+line-length = 100
+target-version = "py312"
+
+[tool.ruff.lint]
+select = ["E", "F", "I", "B", "UP", "SIM", "ASYNC"]
+ignore = ["E501"]
diff --git a/surfsense_evals/reports/.gitignore b/surfsense_evals/reports/.gitignore
new file mode 100644
index 000000000..bd8c8feaa
--- /dev/null
+++ b/surfsense_evals/reports/.gitignore
@@ -0,0 +1,4 @@
+*
+!.gitignore
+!medical/
+!medical/sample_summary.md
diff --git a/surfsense_evals/scripts/download_crag_task3.py b/surfsense_evals/scripts/download_crag_task3.py
new file mode 100644
index 000000000..a646838fe
--- /dev/null
+++ b/surfsense_evals/scripts/download_crag_task3.py
@@ -0,0 +1,97 @@
+"""Download CRAG Task 3's 4 .tar.bz2 parts in parallel.
+
+Run once before ``ingest research crag_t3`` to avoid the ingest
+synchronously blocking on a 7 GB download. Skips parts already
+present and complete on disk.
+"""
+
+from __future__ import annotations
+
+import logging
+import sys
+import time
+import urllib.request
+from concurrent.futures import ThreadPoolExecutor, as_completed
+from pathlib import Path
+
+logging.basicConfig(
+    level=logging.INFO,
+    format="%(asctime)s %(levelname)s %(message)s",
+)
+log = logging.getLogger("download_task3")
+
+
+_BASE = (
+    "https://github.com/facebookresearch/CRAG/raw/refs/heads/main/data/"
+    "crag_task_3_dev_v4.tar.bz2.part"
+)
+_USER_AGENT = "SurfSense-Evals/0.1 (CRAG Task 3 fetch)"
+
+
+def _expected_size(url: str) -> int:
+    req = urllib.request.Request(url, method="HEAD", headers={"User-Agent": _USER_AGENT})
+    with urllib.request.urlopen(req, timeout=30) as resp:
+        return int(resp.headers.get("content-length", 0))
+
+
+def download_one(part: int, dest_dir: Path) -> Path:
+    url = f"{_BASE}{part}"
+    dest = dest_dir / f"crag_task_3_dev_v4.tar.bz2.part{part}"
+    expected = _expected_size(url)
+    if dest.exists() and dest.stat().st_size == expected:
+        log.info("part%d: cached (%d bytes)", part, expected)
+        return dest
+    log.info("part%d: downloading %d bytes ...", part, expected)
+    tmp = dest.with_suffix(dest.suffix + ".part_dl")
+    started = time.monotonic()
+    last_log = started
+    with urllib.request.urlopen(
+        urllib.request.Request(url, headers={"User-Agent": _USER_AGENT}),
+        timeout=900,
+    ) as resp, tmp.open("wb") as fh:
+        downloaded = 0
+        chunk = resp.read(1 << 20)
+        while chunk:
+            fh.write(chunk)
+            downloaded += len(chunk)
+            now = time.monotonic()
+            if now - last_log > 5.0:
+                pct = 100 * downloaded / expected if expected else 0
+                rate_mb = (downloaded / (now - started)) / (1 << 20)
+                log.info(
+                    "part%d: %5.1f%% (%.1f / %.1f MiB at %.1f MiB/s)",
+                    part, pct, downloaded / (1 << 20), expected / (1 << 20), rate_mb,
+                )
+                last_log = now
+            chunk = resp.read(1 << 20)
+    tmp.replace(dest)
+    elapsed = time.monotonic() - started
+    log.info(
+        "part%d: done in %.1fs (%.1f MiB/s avg)",
+        part, elapsed, (expected / (1 << 20)) / max(elapsed, 0.001),
+    )
+    return dest
+
+
+def main() -> int:
+    dest_dir = Path("data/research/crag_t3/.raw_cache")
+    dest_dir.mkdir(parents=True, exist_ok=True)
+
+    # 4 parts in parallel — typical residential connection saturates around
+    # 2 streams; GitHub raw serves these fine in parallel.
+    started = time.monotonic()
+    with ThreadPoolExecutor(max_workers=4) as ex:
+        futures = {ex.submit(download_one, i, dest_dir): i for i in range(1, 5)}
+        for fut in as_completed(futures):
+            part = futures[fut]
+            try:
+                fut.result()
+            except Exception as exc:  # noqa: BLE001
+                log.error("part%d failed: %s", part, exc)
+                return 1
+    log.info("All 4 parts downloaded in %.1fs", time.monotonic() - started)
+    return 0
+
+
+if __name__ == "__main__":
+    sys.exit(main())
diff --git a/surfsense_evals/scripts/peek_crag_run.py b/surfsense_evals/scripts/peek_crag_run.py
new file mode 100644
index 000000000..225e5ec98
--- /dev/null
+++ b/surfsense_evals/scripts/peek_crag_run.py
@@ -0,0 +1,37 @@
+"""Tiny helper to inspect the latest CRAG run's per-question outputs."""
+
+from __future__ import annotations
+
+import glob
+import json
+from collections import defaultdict
+
+
+def main() -> None:
+    raw_path = sorted(glob.glob("data/research/runs/*/crag/raw.jsonl"))[-1]
+    print(f"Reading: {raw_path}")
+    rows = [json.loads(line) for line in open(raw_path, encoding="utf-8") if line.strip()]
+    by_q: dict[str, dict[str, dict]] = defaultdict(dict)
+    for r in rows:
+        by_q[r["qid"]][r["arm"]] = r
+
+    for qid, arms in list(by_q.items()):
+        b = arms.get("bare_llm", {})
+        l = arms.get("long_context", {})
+        s = arms.get("surfsense", {})
+        print(f"\n=== {qid} ({b.get('domain')}/{b.get('question_type')}) ===")
+        print(f"  question: {b.get('extra', {}).get('question', '?')!r}")
+        print(f"  gold: {b.get('gold')!r}")
+        for arm_name, a in (("bare_llm", b), ("long_context", l), ("surfsense", s)):
+            grade = a.get("graded", {})
+            text = (a.get("raw_text") or "").strip()
+            tail = text[-200:] if text else ""
+            print(
+                f"  [{arm_name}] grade={grade.get('grade')} "
+                f"method={grade.get('method')}"
+            )
+            print(f"    -> {tail!r}")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/surfsense_evals/scripts/peek_disagreements.py b/surfsense_evals/scripts/peek_disagreements.py
new file mode 100644
index 000000000..c0fe0acd9
--- /dev/null
+++ b/surfsense_evals/scripts/peek_disagreements.py
@@ -0,0 +1,64 @@
+"""Show questions where SurfSense was wrong but long-context was right (and vice versa)."""
+
+from __future__ import annotations
+
+import glob
+import json
+from collections import defaultdict
+
+
+def main() -> None:
+    raw_path = sorted(glob.glob("data/research/runs/*/crag/raw.jsonl"))[-1]
+    print(f"Reading: {raw_path}")
+    rows = [json.loads(line) for line in open(raw_path, encoding="utf-8") if line.strip()]
+    by_q: dict[str, dict[str, dict]] = defaultdict(dict)
+    for r in rows:
+        by_q[r["qid"]][r["arm"]] = r
+
+    surf_wrong_lc_right = []
+    lc_wrong_surf_right = []
+    surf_wrong_bare_right = []
+    for qid, arms in by_q.items():
+        b = arms.get("bare_llm", {}).get("graded", {}).get("grade")
+        lc = arms.get("long_context", {}).get("graded", {}).get("grade")
+        s = arms.get("surfsense", {}).get("graded", {}).get("grade")
+        if s == "incorrect" and lc == "correct":
+            surf_wrong_lc_right.append(qid)
+        if lc == "incorrect" and s == "correct":
+            lc_wrong_surf_right.append(qid)
+        if s == "incorrect" and b == "correct":
+            surf_wrong_bare_right.append(qid)
+
+    print(f"\nSurfSense INCORRECT but Long-Context CORRECT: {len(surf_wrong_lc_right)}")
+    print(f"Long-Context INCORRECT but SurfSense CORRECT: {len(lc_wrong_surf_right)}")
+    print(f"SurfSense INCORRECT but Bare CORRECT: {len(surf_wrong_bare_right)}")
+
+    print("\n=== Where SurfSense is wrong but long-context is right (top 5) ===")
+    for qid in surf_wrong_lc_right[:5]:
+        arms = by_q[qid]
+        b = arms.get("bare_llm", {})
+        print(f"\n[{qid}] domain={b.get('domain')} qtype={b.get('question_type')}")
+        print(f"  GOLD: {b.get('gold')!r}")
+        for arm_name in ("bare_llm", "long_context", "surfsense"):
+            a = arms.get(arm_name, {})
+            t = (a.get("raw_text") or "").strip()
+            tail = t[-180:] if t else ""
+            grade = a.get("graded", {})
+            print(f"  [{arm_name}] {grade.get('grade')} ({grade.get('method')}): {tail!r}")
+
+    print("\n=== Where Long-Context is wrong but SurfSense is right (top 5) ===")
+    for qid in lc_wrong_surf_right[:5]:
+        arms = by_q[qid]
+        b = arms.get("bare_llm", {})
+        print(f"\n[{qid}] domain={b.get('domain')} qtype={b.get('question_type')}")
+        print(f"  GOLD: {b.get('gold')!r}")
+        for arm_name in ("bare_llm", "long_context", "surfsense"):
+            a = arms.get(arm_name, {})
+            t = (a.get("raw_text") or "").strip()
+            tail = t[-180:] if t else ""
+            grade = a.get("graded", {})
+            print(f"  [{arm_name}] {grade.get('grade')} ({grade.get('method')}): {tail!r}")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/surfsense_evals/scripts/peek_t3_doc_map.py b/surfsense_evals/scripts/peek_t3_doc_map.py
new file mode 100644
index 000000000..6954cdcad
--- /dev/null
+++ b/surfsense_evals/scripts/peek_t3_doc_map.py
@@ -0,0 +1,40 @@
+"""Quick sanity-check for the CRAG Task 3 doc map after ingest."""
+
+from __future__ import annotations
+
+import json
+import sys
+from pathlib import Path
+
+
+def main() -> int:
+    p = Path("data/research/maps/crag_t3_doc_map.jsonl")
+    if not p.exists():
+        print(f"Doc map missing: {p}")
+        return 1
+    rows = []
+    settings = {}
+    for line in p.read_text(encoding="utf-8").splitlines():
+        if not line.strip():
+            continue
+        row = json.loads(line)
+        if "__settings__" in row:
+            settings = row
+            continue
+        rows.append(row)
+    print(f"Settings header: {settings}")
+    print(f"Doc map rows:   {len(rows)}")
+    for r in rows:
+        print(f"  qid={r['qid']:<10} domain={r['domain']:<8} qtype={r['question_type']}")
+        print(f"    question: {r['question'][:90]}")
+        print(f"    gold:     {r['gold_answer'][:90]}")
+        print(
+            f"    pages:    {len(r['page_filenames'])} extracted, "
+            f"{len(r['document_ids'])} doc_ids, "
+            f"{len(r['missing_pages'])} missing"
+        )
+    return 0
+
+
+if __name__ == "__main__":
+    sys.exit(main())
diff --git a/surfsense_evals/scripts/summarise_crag_run.py b/surfsense_evals/scripts/summarise_crag_run.py
new file mode 100644
index 000000000..646fb6a97
--- /dev/null
+++ b/surfsense_evals/scripts/summarise_crag_run.py
@@ -0,0 +1,65 @@
+"""Render a quick textual summary of the latest CRAG run."""
+
+from __future__ import annotations
+
+import glob
+import json
+
+
+def main() -> None:
+    runs = sorted(glob.glob("data/research/runs/*/crag/run_artifact.json"))
+    if not runs:
+        print("(no CRAG runs found)")
+        return
+    m = json.load(open(runs[-1], encoding="utf-8"))
+    metrics = m["metrics"]
+
+    print(f"Reading: {runs[-1]}")
+    print(f"n_questions: {m['extra']['n_questions']}")
+    print()
+    print("=== ARMS ===")
+    for arm in ("bare_llm", "long_context", "surfsense"):
+        d = metrics[arm]
+        print(
+            f"{arm:14s}: "
+            f"acc={d['accuracy']*100:5.1f}% (Wilson 95% CI "
+            f"{d['ci_low']*100:.1f}-{d['ci_high']*100:.1f}) | "
+            f"correct={d['correct_rate']*100:5.1f}% "
+            f"missing={d['missing_rate']*100:5.1f}% "
+            f"incorrect={d['incorrect_rate']*100:5.1f}% | "
+            f"truth={d['truthfulness_score']*100:+5.1f}%"
+        )
+
+    print()
+    print("=== DELTAS ===")
+    for key, d in metrics["deltas"].items():
+        print(
+            f"{key:30s}: acc={d['accuracy_pp']:+5.1f}pp "
+            f"truth={d['truthfulness_score_pp']:+5.1f}pp "
+            f"McNemar p={d['mcnemar_p_value']:.4f} ({d['mcnemar_method']}) "
+            f"bootstrap CI [{d['bootstrap_ci_low']:+.1f}, {d['bootstrap_ci_high']:+.1f}]"
+        )
+
+    print()
+    print("=== PER-QUESTION-TYPE TRUTHFULNESS ===")
+    for qt, row in sorted(metrics["per_question_type"].items()):
+        n = row["n"]
+        pieces = [f"{qt:20s} (n={n:3d}):"]
+        for arm in ("bare_llm", "long_context", "surfsense"):
+            if arm in row:
+                pieces.append(f"{arm}={row[arm]['truthfulness_score']*100:+7.1f}%")
+        print(" ".join(pieces))
+
+    print()
+    print("=== PER-DOMAIN TRUTHFULNESS ===")
+    for dom, row in sorted(metrics["per_domain"].items()):
+        n = row["n"]
+        pieces = [f"{dom:10s} (n={n:3d}):"]
+        for arm in ("bare_llm", "long_context", "surfsense"):
+            if arm in row:
+                pieces.append(f"{arm}={row[arm]['truthfulness_score']*100:+7.1f}%")
+        print(" ".join(pieces))
+
+
+if __name__ == "__main__":
+    main()
diff --git a/surfsense_evals/src/surfsense_evals/__init__.py b/surfsense_evals/src/surfsense_evals/__init__.py
new file mode 100644
index 000000000..fc8a81482
--- /dev/null
+++ b/surfsense_evals/src/surfsense_evals/__init__.py
@@ -0,0 +1,10 @@
+"""SurfSense Evals — domain-agnostic eval harness.
+
+Public entry-point is the ``surfsense_evals`` CLI (``python -m surfsense_evals``).
+Programmatic embedding is a non-goal for now; everything goes through the CLI
++ filesystem outputs (state.json, raw run JSONL, summary.md/json reports).
+"""
+
+from __future__ import annotations
+
+__version__ = "0.1.0"
diff --git a/surfsense_evals/src/surfsense_evals/__main__.py b/surfsense_evals/src/surfsense_evals/__main__.py
new file mode 100644
index 000000000..0efb932dd
--- /dev/null
+++ b/surfsense_evals/src/surfsense_evals/__main__.py
@@ -0,0 +1,13 @@
+"""Module entry point: ``python -m surfsense_evals ...``.
+
+Delegates to ``core.cli.main``. ``core.cli`` lazily imports
+``surfsense_evals.suites`` so every benchmark gets a chance to register
+before argparse builds its subcommand groups.
+"""
+
+from __future__ import annotations
+
+from surfsense_evals.core.cli import main
+
+if __name__ == "__main__":  # pragma: no cover
+    raise SystemExit(main())
diff --git a/surfsense_evals/src/surfsense_evals/core/__init__.py b/surfsense_evals/src/surfsense_evals/core/__init__.py
new file mode 100644
index 000000000..b5cc64a56
--- /dev/null
+++ b/surfsense_evals/src/surfsense_evals/core/__init__.py
@@ -0,0 +1,8 @@
+"""Domain-agnostic infrastructure shared by every suite.
+
+Nothing under ``core/`` knows or cares about a specific evaluation domain.
+Suites live under ``surfsense_evals.suites.<domain>.<benchmark>`` and
+register themselves with ``core.registry`` on import.
+"""
+
+from __future__ import annotations
diff --git a/surfsense_evals/src/surfsense_evals/core/arms/__init__.py b/surfsense_evals/src/surfsense_evals/core/arms/__init__.py
new file mode 100644
index 000000000..0e7ce46e4
--- /dev/null
+++ b/surfsense_evals/src/surfsense_evals/core/arms/__init__.py
@@ -0,0 +1,44 @@
+"""Arm protocol + concrete arms shared across suites.
+
+Concrete arms (``NativePdfArm``, ``SurfSenseArm``, ``BareLlmArm``) are
+imported lazily via ``__getattr__`` so consumers that only need the
+protocol — e.g. the registry's ``Arm`` re-export — don't transitively
+pull in ``httpx`` providers or the SurfSense client unless they
+actually use those arms.
+"""
+
+from __future__ import annotations
+
+from typing import TYPE_CHECKING
+
+from .base import Arm, ArmRequest, ArmResult
+
+if TYPE_CHECKING:  # pragma: no cover
+    from .bare_llm import BareLlmArm
+    from .native_pdf import NativePdfArm
+    from .surfsense import SurfSenseArm
+
+__all__ = [
+    "Arm",
+    "ArmRequest",
+    "ArmResult",
+    "BareLlmArm",
+    "NativePdfArm",
+    "SurfSenseArm",
+]
+
+
+def __getattr__(name: str):  # PEP 562
+    if name == "NativePdfArm":
+        from .native_pdf import NativePdfArm
+
+        return NativePdfArm
+    if name == "SurfSenseArm":
+        from .surfsense import SurfSenseArm
+
+        return SurfSenseArm
+    if name == "BareLlmArm":
+        from .bare_llm import BareLlmArm
+
+        return BareLlmArm
+    raise AttributeError(f"module 'surfsense_evals.core.arms' has no attribute {name!r}")
diff --git a/surfsense_evals/src/surfsense_evals/core/arms/bare_llm.py b/surfsense_evals/src/surfsense_evals/core/arms/bare_llm.py
new file mode 100644
index 000000000..1e3215415
--- /dev/null
+++ b/surfsense_evals/src/surfsense_evals/core/arms/bare_llm.py
@@ -0,0 +1,100 @@
+"""Bare-LLM arm: chat completion with prompt-only input, no retrieval.
+
+Pairs with ``SurfSenseArm`` for any benchmark that wants to measure
+"how much does the model already know without RAG?". For factuality /
+multi-hop benchmarks (FRAMES, MuSiQue, …) this produces the published
+"naive prompting" baseline — e.g. FRAMES's 40.8% on Gemini-Pro-1.5.
+
+Symmetric with ``NativePdfArm`` in shape, but the request carries no
+``pdf_paths``: the prompt itself is the only input the model gets.
+"""
+
+from __future__ import annotations
+
+import logging
+
+from ..providers.openrouter_chat import OpenRouterChatProvider
+from .base import Arm, ArmRequest, ArmResult
+
+logger = logging.getLogger(__name__)
+
+
+class BareLlmArm(Arm):
+    """``Arm`` implementation backed by ``OpenRouterChatProvider``.
+
+    ``name`` defaults to ``"bare_llm"`` but is overridable per-instance.
+    Suites that want two distinct OpenRouter chat arms (e.g. CRAG's
+    ``bare_llm`` vs ``long_context`` — both backed by chat-completions
+    but exercising different prompt strategies) instantiate twice with
+    different names so the metrics aggregator can keep them separate.
+    """
+
+    name: str = "bare_llm"
+
+    def __init__(
+        self,
+        *,
+        provider: OpenRouterChatProvider,
+        max_output_tokens: int | None = 1024,
+        system_prompt: str | None = None,
+        name: str | None = None,
+    ) -> None:
+        self._provider = provider
+        self._max_output = max_output_tokens
+        self._system_prompt = system_prompt
+        if name:
+            self.name = name
+
+    @classmethod
+    def from_env(
+        cls,
+        *,
+        api_key: str,
+        model: str,
+        base_url: str = "https://openrouter.ai/api/v1",
+        max_output_tokens: int | None = 1024,
+        system_prompt: str | None = None,
+        name: str | None = None,
+    ) -> BareLlmArm:
+        provider = OpenRouterChatProvider(
+            api_key=api_key,
+            base_url=base_url,
+            model=model,
+        )
+        return cls(
+            provider=provider,
+            max_output_tokens=max_output_tokens,
+            system_prompt=system_prompt,
+            name=name,
+        )
+
+    async def answer(self, request: ArmRequest) -> ArmResult:
+        try:
+            response = await self._provider.complete(
+                prompt=request.prompt,
+                system_prompt=self._system_prompt,
+                max_tokens=self._max_output,
+            )
+        except Exception as exc:  # noqa: BLE001
+            return ArmResult(
+                arm=self.name,
+                question_id=request.question_id,
+                raw_text="",
+                error=f"{type(exc).__name__}: {exc}",
+            )
+        return ArmResult(
+            arm=self.name,
+            question_id=request.question_id,
+            raw_text=response.text,
+            input_tokens=response.input_tokens,
+            output_tokens=response.output_tokens,
+            cost_micros=response.cost_micros,
+            latency_ms=response.latency_ms,
+            extra={
+                "model": self._provider.model,
+                "finish_reason": response.finish_reason,
+            },
+        )
+
+
+__all__ = ["BareLlmArm"]
diff --git a/surfsense_evals/src/surfsense_evals/core/arms/base.py b/surfsense_evals/src/surfsense_evals/core/arms/base.py
new file mode 100644
index 000000000..3e327fef2
--- /dev/null
+++ b/surfsense_evals/src/surfsense_evals/core/arms/base.py
@@ -0,0 +1,93 @@
+"""Arm protocol + the value types every arm exchanges with a runner.
+
+An ``Arm`` is "one way to answer one question". Two ship in this PR:
+
+* ``NativePdfArm`` — drop the PDF straight into an OpenRouter
+  chat-completions request with ``plugins=[{file-parser, engine:
+  native}]``. Used for the head-to-head "is the model good enough on
+  its own?" measurement.
+* ``SurfSenseArm`` — POST ``/api/v1/new_chat`` with the question
+  scoped to the relevant ``mentioned_document_ids``; consume the SSE
+  stream and parse citations.
+
+Both implement the same protocol so a benchmark runner only sees
+``Arm.answer(request) -> ArmResult``.
+"""
+
+from __future__ import annotations
+
+from dataclasses import dataclass, field
+from pathlib import Path
+from typing import Any, Protocol
+
+
+@dataclass
+class ArmRequest:
+    """One arm-call worth of input.
+
+    * ``question_id`` is opaque — used for logging and joining results.
+    * ``prompt`` is the fully-formatted text the arm should send. The
+      runner is responsible for prompt construction so head-to-head
+      comparisons use byte-identical text.
+    * ``pdf_paths`` is the per-question source PDFs (used by
+      ``NativePdfArm``). Empty for retrieval-only / corpus-wide
+      benchmarks.
+    * ``mentioned_document_ids`` is the SurfSense document scoping list
+      (used by ``SurfSenseArm``). When ``None`` SurfSense retrieves
+      across the whole search space.
+    * ``options`` is a free-form bag of arm-specific overrides
+      (e.g. SurfSense's ``disabled_tools``).
+    """
+
+    question_id: str
+    prompt: str
+    pdf_paths: list[Path] = field(default_factory=list)
+    mentioned_document_ids: list[int] | None = None
+    options: dict[str, Any] = field(default_factory=dict)
+
+
+@dataclass
+class ArmResult:
+    """Outcome of one ``Arm.answer`` invocation."""
+
+    arm: str
+    question_id: str
+    raw_text: str
+    answer_letter: str | None = None
+    citations: list[dict[str, Any]] = field(default_factory=list)
+    input_tokens: int = 0
+    output_tokens: int = 0
+    cost_micros: int = 0
+    latency_ms: int = 0
+    error: str | None = None
+    extra: dict[str, Any] = field(default_factory=dict)
+
+    @property
+    def ok(self) -> bool:
+        return self.error is None
+
+    def to_jsonl(self) -> dict[str, Any]:
+        """Stable dict shape for ``data/<suite>/runs/<ts>/<bench>_raw.jsonl``."""
+
+        return {
+            "arm": self.arm,
+            "question_id": self.question_id,
+            "answer_letter": self.answer_letter,
+            "raw_text": self.raw_text,
+            "citations": self.citations,
+            "input_tokens": self.input_tokens,
+            "output_tokens": self.output_tokens,
+            "cost_micros": self.cost_micros,
+            "latency_ms": self.latency_ms,
+            "error": self.error,
+            "extra": self.extra,
+        }
+
+
+class Arm(Protocol):
+    """One concrete way to answer questions for a given run."""
+
+    name: str
+
+    async def answer(self, request: ArmRequest) -> ArmResult:  # pragma: no cover - protocol
+        ...
diff --git a/surfsense_evals/src/surfsense_evals/core/arms/native_pdf.py b/surfsense_evals/src/surfsense_evals/core/arms/native_pdf.py
new file mode 100644
index 000000000..9294ed032
--- /dev/null
+++ b/surfsense_evals/src/surfsense_evals/core/arms/native_pdf.py
@@ -0,0 +1,104 @@
+"""Native-PDF arm: drop the PDF straight into OpenRouter chat-completions.
+
+Generic across suites — a benchmark just supplies the prompt and the
+single PDF path. Multi-PDF questions concatenate in the runner before
+calling this arm so each ``answer`` invocation feeds the model exactly
+one ``data:application/pdf;base64,...`` block (matches the human
+"drag-and-drop one PDF into Claude" intent).
+"""
+
+from __future__ import annotations
+
+import logging
+
+from ..parse.answer_letter import extract_answer_letter
+from ..providers.openrouter_pdf import OpenRouterPdfProvider, PdfEngine
+from .base import Arm, ArmRequest, ArmResult
+
+logger = logging.getLogger(__name__)
+
+
+class NativePdfArm(Arm):
+    """``Arm`` implementation backed by ``OpenRouterPdfProvider``."""
+
+    name: str = "native_pdf"
+
+    def __init__(
+        self,
+        *,
+        provider: OpenRouterPdfProvider,
+        max_output_tokens: int | None = 1024,
+    ) -> None:
+        self._provider = provider
+        self._max_output = max_output_tokens
+
+    @classmethod
+    def from_env(
+        cls,
+        *,
+        api_key: str,
+        model: str,
+        engine: PdfEngine = PdfEngine.NATIVE,
+        base_url: str = "https://openrouter.ai/api/v1",
+        max_output_tokens: int | None = 1024,
+    ) -> NativePdfArm:
+        provider = OpenRouterPdfProvider(
+            api_key=api_key,
+            base_url=base_url,
+            model=model,
+            engine=engine,
+        )
+        return cls(provider=provider, max_output_tokens=max_output_tokens)
+
+    async def answer(self, request: ArmRequest) -> ArmResult:
+        if not request.pdf_paths:
+            return ArmResult(
+                arm=self.name,
+                question_id=request.question_id,
+                raw_text="",
+                error="native_pdf arm requires at least one pdf_path",
+            )
+        if len(request.pdf_paths) > 1:
+            # The plan calls out one-PDF-per-question so the head-to-head
+            # is fair; runners are responsible for upstream concatenation.
+            logger.debug(
+                "qid=%s native_pdf got %d pdfs; using first only",
+                request.question_id,
+                len(request.pdf_paths),
+            )
+        pdf = request.pdf_paths[0]
+        try:
+            response = await self._provider.complete(
+                prompt=request.prompt,
+                pdf_path=pdf,
+                max_tokens=self._max_output,
+            )
+        except Exception as exc:  # noqa: BLE001
+            return ArmResult(
+                arm=self.name,
+                question_id=request.question_id,
+                raw_text="",
+                error=f"{type(exc).__name__}: {exc}",
+            )
+
+        letter = extract_answer_letter(response.text)
+        return ArmResult(
+            arm=self.name,
+            question_id=request.question_id,
+            raw_text=response.text,
+            answer_letter=letter.letter,
+            input_tokens=response.input_tokens,
+            output_tokens=response.output_tokens,
+            cost_micros=response.cost_micros,
+            latency_ms=response.latency_ms,
+            extra={
+                "model": self._provider.model,
+                "engine": self._provider.engine.value,
+                "answer_letter_strategy": letter.strategy,
+                "finish_reason": response.finish_reason,
+                "pdf_filename": pdf.name,
+            },
+        )
+
+
+__all__ = ["NativePdfArm"]
diff --git a/surfsense_evals/src/surfsense_evals/core/arms/surfsense.py b/surfsense_evals/src/surfsense_evals/core/arms/surfsense.py
new file mode 100644
index 000000000..a84350dfd
--- /dev/null
+++ b/surfsense_evals/src/surfsense_evals/core/arms/surfsense.py
@@ -0,0 +1,104 @@
+"""SurfSense arm: per-question fresh thread + ``/api/v1/new_chat`` stream.
+
+For every question:
+
+* Create a fresh ``NewChatThread`` on the suite's pinned SearchSpace.
+  This sidesteps the per-thread ``THREAD_BUSY`` 409 (a single thread
+  serialises turns, see ``surfsense_backend/app/routes/new_chat_routes.py:191-220``).
+* POST ``/api/v1/new_chat`` with the prompt and the per-question
+  ``mentioned_document_ids`` (``surfsense_backend/app/schemas/new_chat.py:241-243``).
+* Consume the SSE stream via ``NewChatClient.ask`` which accumulates
+  text deltas and returns ``StreamedAnswer``.
+* Optionally delete the thread (default ON for ephemeral runs).
+
+Citations are parsed from the streamed assistant text via the
+canonical regex port; chunk ids are returned in ``ArmResult.citations``
+for the runner to map back to corpus ids.
+"""
+
+from __future__ import annotations
+
+import logging
+
+from ..clients import NewChatClient
+from ..parse.answer_letter import extract_answer_letter
+from .base import Arm, ArmRequest, ArmResult
+
+logger = logging.getLogger(__name__)
+
+
+class SurfSenseArm(Arm):
+    """``Arm`` implementation backed by ``NewChatClient``."""
+
+    name: str = "surfsense"
+
+    def __init__(
+        self,
+        *,
+        client: NewChatClient,
+        search_space_id: int,
+        ephemeral_threads: bool = True,
+        thread_title_prefix: str = "eval",
+    ) -> None:
+        self._client = client
+        self._search_space_id = search_space_id
+        self._ephemeral = ephemeral_threads
+        self._title_prefix = thread_title_prefix
+
+    async def answer(self, request: ArmRequest) -> ArmResult:
+        thread_id: int | None = None
+        try:
+            thread_id = await self._client.create_thread(
+                search_space_id=self._search_space_id,
+                title=f"{self._title_prefix}:{request.question_id}",
+            )
+            answer = await self._client.ask(
+                thread_id=thread_id,
+                search_space_id=self._search_space_id,
+                user_query=request.prompt,
+                mentioned_document_ids=request.mentioned_document_ids,
+                disabled_tools=request.options.get("disabled_tools"),
+            )
+        except Exception as exc:  # noqa: BLE001
+            return ArmResult(
+                arm=self.name,
+                question_id=request.question_id,
+                raw_text="",
+                error=f"{type(exc).__name__}: {exc}",
+                extra={"thread_id": thread_id},
+            )
+        finally:
+            if self._ephemeral and thread_id is not None:
+                try:
+                    await self._client.delete_thread(thread_id)
+                except Exception as exc:  # noqa: BLE001
+                    logger.debug(
+                        "Failed to delete thread %s: %s", thread_id, exc
+                    )
+
+        letter = extract_answer_letter(answer.text)
+        return ArmResult(
+            arm=self.name,
+            question_id=request.question_id,
+            raw_text=answer.text,
+            answer_letter=letter.letter,
+            citations=answer.citations,
+            latency_ms=answer.latency_ms,
+            # SurfSense doesn't surface input/output token counts in the
+            # SSE stream today; leaving the cost / token fields at 0
+            # documents that gap. Estimating from the raw text would
+            # bias the comparison against the SurfSense arm.
+            extra={
+                "thread_id": thread_id,
+                "search_space_id": self._search_space_id,
+                "answer_letter_strategy": letter.strategy,
+                "user_message_id": answer.user_message_id,
+                "assistant_message_id": answer.assistant_message_id,
+                "finished_normally": answer.finished_normally,
+                "n_raw_events": len(answer.raw_events),
+                "n_mentioned_documents": len(request.mentioned_document_ids or []),
+            },
+        )
+
+
+__all__ = ["SurfSenseArm"]
diff --git a/surfsense_evals/src/surfsense_evals/core/auth.py b/surfsense_evals/src/surfsense_evals/core/auth.py
new file mode 100644
index 000000000..1e7cc5b3e
--- /dev/null
+++ b/surfsense_evals/src/surfsense_evals/core/auth.py
@@ -0,0 +1,273 @@
+"""Dual-mode credential resolver + httpx client factory with 401 auto-refresh.
+
+SurfSense supports ``AUTH_TYPE=LOCAL`` (email + password) and
+``AUTH_TYPE=GOOGLE`` (Google OAuth → frontend stores JWT in ``localStorage``).
+There is no headless equivalent of the Google flow, so the harness handles
+both modes by treating the JWT as the universal credential:
+
+* **LOCAL**: harness POSTs form-encoded ``username`` + ``password`` to
+  ``/auth/jwt/login``, reads ``{access_token, refresh_token}``.
+* **GOOGLE / pre-issued JWT**: operator pastes their existing JWT (and
+  optionally refresh token) into ``SURFSENSE_JWT`` /
+  ``SURFSENSE_REFRESH_TOKEN``; harness skips login.
+
+Either way ``client_with_auth`` returns one shared
+``httpx.AsyncClient`` with ``Authorization: Bearer <jwt>`` set and an
+event hook that, on a 401 with a refresh token in scope, calls
+``POST /auth/jwt/refresh`` and retries the original request once. JWT
+lifetime defaults to one day backend-side, so this matters for long
+MIRAGE runs.
+"""
+
+from __future__ import annotations
+
+import logging
+from dataclasses import dataclass, field
+from typing import Any
+
+import httpx
+
+from .config import Config
+
+logger = logging.getLogger(__name__)
+
+
+class CredentialError(RuntimeError):
+    """Raised when no credential mode is configured."""
+
+
+_NO_CREDENTIALS_MESSAGE = (
+    "No SurfSense credentials configured. Set ONE of:\n"
+    "  (LOCAL)  SURFSENSE_USER_EMAIL + SURFSENSE_USER_PASSWORD\n"
+    "  (GOOGLE) SURFSENSE_JWT (and optionally SURFSENSE_REFRESH_TOKEN)\n"
+    "For GOOGLE: log in to SurfSense in your browser, open DevTools → "
+    "Application → Local Storage → copy `surfsense_bearer_token` and "
+    "`surfsense_refresh_token` into those env vars."
+)
+
+
+@dataclass
+class TokenBundle:
+    """Mutable token state — refresh hook updates ``access_token`` in place."""
+
+    access_token: str
+    refresh_token: str | None = None
+    # ``mode`` is informational only ("local" or "jwt"); used in error messages.
+    mode: str = "jwt"
+
+
+# ---------------------------------------------------------------------------
+# Token acquisition
+# ---------------------------------------------------------------------------
+
+
+async def acquire_token(config: Config, *, http: httpx.AsyncClient | None = None) -> TokenBundle:
+    """Resolve credentials → ``TokenBundle``.
+
+    Precedence:
+
+    1. ``SURFSENSE_JWT`` set → use it directly. Refresh token captured if
+       supplied.
+    2. ``SURFSENSE_USER_EMAIL`` + ``SURFSENSE_USER_PASSWORD`` set →
+       form-encoded POST to ``/auth/jwt/login``.
+    3. Neither → raise ``CredentialError``.
+
+    The optional ``http`` argument lets tests inject a mocked client; if
+    omitted a one-shot client is created for the login call only.
+    """
+
+    if config.has_jwt_mode():
+        return TokenBundle(
+            access_token=config.surfsense_jwt or "",
+            refresh_token=config.surfsense_refresh_token,
+            mode="jwt",
+        )
+
+    if config.has_local_mode():
+        async def _login(client: httpx.AsyncClient) -> TokenBundle:
+            response = await client.post(
+                f"{config.surfsense_api_base}/auth/jwt/login",
+                data={
+                    "username": config.surfsense_user_email,
+                    "password": config.surfsense_user_password,
+                },
+                headers={"Accept": "application/json"},
+            )
+            if response.status_code != 200:
+                raise CredentialError(
+                    f"LOCAL login failed (HTTP {response.status_code}): "
+                    f"{_safe_text(response)}"
+                )
+            payload = response.json()
+            access = payload.get("access_token")
+            if not access:
+                raise CredentialError(
+                    f"LOCAL login response missing access_token: {payload!r}"
+                )
+            return TokenBundle(
+                access_token=access,
+                refresh_token=payload.get("refresh_token") or None,
+                mode="local",
+            )
+
+        if http is not None:
+            return await _login(http)
+        async with httpx.AsyncClient(timeout=httpx.Timeout(30.0, connect=10.0)) as client:
+            return await _login(client)
+
+    raise CredentialError(_NO_CREDENTIALS_MESSAGE)
+
+
+def _safe_text(response: httpx.Response, *, limit: int = 200) -> str:
+    body = response.text or ""
+    if len(body) > limit:
+        return body[:limit] + "…"
+    return body
+
+
+# ---------------------------------------------------------------------------
+# httpx client + 401 auto-refresh
+# ---------------------------------------------------------------------------
+
+
+class _AuthState:
+    """Shared mutable holder closed over by the auth event hook.
+
+    Kept private so callers can't accidentally mutate the access token
+    out-of-band; ``client_with_auth`` returns the client directly.
+    """
+
+    def __init__(self, config: Config, tokens: TokenBundle) -> None:
+        self.config = config
+        self.tokens = tokens
+        self._refresh_in_flight: bool = False
+
+
+def _build_auth_request(state: _AuthState, request: httpx.Request) -> None:
+    """Stamp the current bearer onto ``request`` (request-event hook)."""
+
+    request.headers["Authorization"] = f"Bearer {state.tokens.access_token}"
+
+
+async def _refresh_access_token(
+    state: _AuthState, transport: httpx.AsyncBaseTransport | None = None
+) -> bool:
+    """POST ``/auth/jwt/refresh`` with the current refresh token.
+
+    Returns ``True`` on success and updates ``state.tokens`` in place.
+    Returns ``False`` if no refresh token is configured or the call fails.
+    Recursive 401s are avoided by using a *new* client without the auth
+    hook.
+    """
+
+    refresh = state.tokens.refresh_token
+    if not refresh:
+        return False
+    try:
+        async with httpx.AsyncClient(
+            timeout=httpx.Timeout(15.0, connect=5.0),
+            transport=transport,
+        ) as inner:
+            response = await inner.post(
+                f"{state.config.surfsense_api_base}/auth/jwt/refresh",
+                json={"refresh_token": refresh},
+                headers={"Accept": "application/json"},
+            )
+    except httpx.HTTPError as exc:
+        logger.warning("Token refresh transport error: %s", exc)
+        return False
+    if response.status_code != 200:
+        logger.warning(
+            "Token refresh rejected (HTTP %s): %s",
+            response.status_code,
+            _safe_text(response),
+        )
+        return False
+    payload = response.json()
+    new_access = payload.get("access_token")
+    if not new_access:
+        logger.warning("Refresh response missing access_token: %r", payload)
+        return False
+    state.tokens.access_token = new_access
+    new_refresh = payload.get("refresh_token")
+    if new_refresh:
+        state.tokens.refresh_token = new_refresh
+    return True
+
+
+def client_with_auth(
+    config: Config,
+    tokens: TokenBundle,
+    *,
+    timeout: float = 60.0,
+    transport: httpx.AsyncBaseTransport | None = None,
+    base_url: str | None = None,
+) -> httpx.AsyncClient:
+    """Build a single shared ``httpx.AsyncClient`` for the SurfSense API.
+
+    * Stamps ``Authorization: Bearer <jwt>`` on every outgoing request.
+    * On any 401 response, attempts a single refresh (if a refresh token
+      is configured) and retries the original request once. The retry
+      uses a fresh stamping of the bearer header, so a successful
+      refresh transparently unblocks long runs.
+    * The retry is best-effort — repeated 401s after a refresh attempt
+      are surfaced to the caller so they can re-auth manually.
+
+    Pass ``base_url`` to scope a sub-client (e.g. tests). The default
+    keeps full URLs in calling code, which makes route-spec citations in
+    the codebase easier to grep.
+    """
+
+    state = _AuthState(config, tokens)
+
+    async def _request_hook(request: httpx.Request) -> None:
+        _build_auth_request(state, request)
+
+    # ``send`` is overridden in ``_AuthAwareClient`` to retry once on 401
+    # after refreshing the bearer. httpx's response event-hook can't
+    # *replace* a response, so we need a subclass to do the replay.
+    client = _AuthAwareClient(
+        state=state,
+        transport=transport,
+        timeout=httpx.Timeout(timeout, connect=10.0),
+        base_url=base_url or "",
+        event_hooks={"request": [_request_hook]},
+    )
+    return client
+
+
+class _AuthAwareClient(httpx.AsyncClient):
+    """``AsyncClient`` that retries once on 401 after refreshing the token."""
+
+    def __init__(self, *, state: _AuthState, **kwargs: Any) -> None:
+        super().__init__(**kwargs)
+        self._auth_state = state
+
+    async def send(  # type: ignore[override]
+        self, request: httpx.Request, **kwargs: Any
+    ) -> httpx.Response:
+        response = await super().send(request, **kwargs)
+        if response.status_code != 401:
+            return response
+        # Don't refresh while a refresh is itself in flight.
+        if self._auth_state._refresh_in_flight:
+            return response
+        self._auth_state._refresh_in_flight = True
+        try:
+            refreshed = await _refresh_access_token(self._auth_state)
+        finally:
+            self._auth_state._refresh_in_flight = False
+        if not refreshed:
+            return response
+        # Re-stamp and replay once. ``request`` is reusable.
+        await response.aclose()
+        request.headers["Authorization"] = f"Bearer {self._auth_state.tokens.access_token}"
+        return await super().send(request, **kwargs)
+
+
+__all__ = [
+    "CredentialError",
+    "TokenBundle",
+    "acquire_token",
+    "client_with_auth",
+]
diff --git a/surfsense_evals/src/surfsense_evals/core/cli.py b/surfsense_evals/src/surfsense_evals/core/cli.py
new file mode 100644
index 000000000..3d4d0fd24
--- /dev/null
+++ b/surfsense_evals/src/surfsense_evals/core/cli.py
@@ -0,0 +1,790 @@
+"""Argparse CLI for ``python -m surfsense_evals``.
+
+Subcommands:
+
+* ``setup    --suite <name> --provider-model <slug> [--agent-llm-id <int>]``
+* ``teardown --suite <name>``
+* ``models  list [--provider openrouter] [--grep <s>]``
+* ``suites  list``
+* ``benchmarks list [--suite <name>]``
+* ``ingest <suite> <benchmark> [benchmark flags]``
+* ``run    <suite> <benchmark> [benchmark flags]``
+* ``report --suite <name> [--benchmark <name>]``
+
+The ``ingest`` / ``run`` subparsers are built dynamically from the
+registry — adding a new benchmark only requires registering it; the
+CLI surface comes for free. ``add_run_args`` lets each benchmark
+publish its own flags.
+
+Design choices worth flagging:
+
+* ``setup`` rejects ``agent_llm_id == 0`` (Auto / LiteLLM router) so
+  per-question accuracy is reproducible.
+* ``setup`` validates that the picked LLM config has
+  ``provider == "OPENROUTER"`` and ``model_name == --provider-model``
+  before declaring success — both arms of the head-to-head must hit
+  the same OpenRouter slug.
+* Lifecycle state is keyed by suite, so ``setup --suite legal`` does
+  not touch ``medical``'s SearchSpace, and vice versa.
+"""
+
+from __future__ import annotations
+
+import argparse
+import asyncio
+import json
+import logging
+import sys
+from dataclasses import dataclass
+from typing import Any
+
+import sys
+
+import httpx
+from rich.console import Console
+from rich.table import Table
+
+# Windows' legacy console (cp1252) crashes when Rich tries to write characters
+# outside the active codepage (e.g. '->', em-dashes, box-drawing). Force UTF-8
+# on stdout/stderr and disable Rich's legacy_windows render path so the file
+# stream is used directly. Modern Windows (>=10, VS Code terminal, Windows
+# Terminal, PowerShell, cmd) all interpret ANSI escapes natively.
+if sys.platform == "win32":
+    for _stream in (sys.stdout, sys.stderr):
+        try:
+            _stream.reconfigure(encoding="utf-8", errors="replace")
+        except (AttributeError, ValueError):
+            pass
+
+from . import registry
+from .auth import CredentialError, acquire_token, client_with_auth
+from .clients import SearchSpaceClient
+from .clients.search_space import LlmPreferences
+from .config import (
+    DEFAULT_SCENARIO,
+    SCENARIOS,
+    Config,
+    SuiteState,
+    clear_suite_state,
+    get_suite_state,
+    load_config,
+    set_suite_state,
+    utc_iso_timestamp,
+)
+from .vision_llm import VisionConfigError, resolve_vision_llm
+
+logger = logging.getLogger("surfsense_evals")
+console = Console(legacy_windows=False)
+
+
+# ---------------------------------------------------------------------------
+# Discovery
+# ---------------------------------------------------------------------------
+
+
+def _discover_suites() -> list[str]:
+    """Trigger ``register(...)`` for every benchmark.
+
+    Imported lazily so ``models list`` (which doesn't need any
+    benchmark) still runs fast.
+    """
+
+    from surfsense_evals.suites import discover_suites
+
+    return discover_suites()
+
+
+# ---------------------------------------------------------------------------
+# Global LLM config fetcher (used by setup + models list)
+# ---------------------------------------------------------------------------
+
+
+@dataclass
+class LlmConfigEntry:
+    id: int
+    name: str
+    provider: str
+    model_name: str
+    raw: dict[str, Any]
+
+    @classmethod
+    def from_payload(cls, payload: dict[str, Any]) -> LlmConfigEntry:
+        return cls(
+            id=int(payload["id"]),
+            name=str(payload.get("name", "")),
+            provider=str(payload.get("provider", "")).upper(),
+            model_name=str(payload.get("model_name", "")),
+            raw=payload,
+        )
+
+
+async def _list_global_llm_configs(http: httpx.AsyncClient, base: str) -> list[LlmConfigEntry]:
+    response = await http.get(
+        f"{base}/api/v1/global-new-llm-configs",
+        headers={"Accept": "application/json"},
+    )
+    response.raise_for_status()
+    payload = response.json()
+    if not isinstance(payload, list):
+        raise RuntimeError(f"Unexpected /global-new-llm-configs payload: {payload!r}")
+    return [LlmConfigEntry.from_payload(item) for item in payload]
+
+
+def _resolve_openrouter_id(
+    candidates: list[LlmConfigEntry],
+    provider_model: str,
+    *,
+    explicit_id: int | None,
+) -> int:
+    """Resolve the SurfSense LLM id for ``provider_model``.
+
+    Behaviour:
+
+    * If ``explicit_id`` is given: return it directly. The caller is
+      then expected to GET-validate that the row's
+      ``provider == "OPENROUTER"`` and ``model_name`` matches the slug.
+      That branch supports positive BYOK ``NewLLMConfig`` rows whose
+      slugs may overlap with global OpenRouter virtuals.
+    * Otherwise: filter to ``provider == "OPENROUTER"`` and
+      ``model_name == provider_model``. Expect exactly one match —
+      raise with a friendly message otherwise.
+    """
+
+    if explicit_id is not None:
+        return explicit_id
+
+    matches = [
+        c for c in candidates if c.provider == "OPENROUTER" and c.model_name == provider_model
+    ]
+    if not matches:
+        sample = ", ".join(
+            f"{c.model_name} (id={c.id})" for c in candidates if c.provider == "OPENROUTER"
+        )[:600]
+        raise RuntimeError(
+            f"No OpenRouter config found for slug '{provider_model}'. "
+            "Make sure `openrouter_integration.enabled: true` in "
+            "global_llm_config.yaml and that the Celery worker has "
+            "finished its first refresh (the catalogue is fetched at "
+            "Celery startup per `app/celery_app.py`). "
+            f"Available OpenRouter slugs (sample): {sample or '<none>'}.\n"
+            "Browse with: python -m surfsense_evals models list --grep <substring>"
+        )
+    if len(matches) > 1:
+        listing = "\n".join(f"  id={c.id}  name={c.name!r}" for c in matches)
+        raise RuntimeError(
+            f"Multiple OpenRouter configs for slug '{provider_model}':\n{listing}\n"
+            "Pass --agent-llm-id <id> to disambiguate."
+        )
+    return matches[0].id
+
+
+# ---------------------------------------------------------------------------
+# Subcommand implementations
+# ---------------------------------------------------------------------------
+
+
+async def _cmd_setup(args: argparse.Namespace) -> int:
+    suite = args.suite
+    provider_model: str = args.provider_model
+    explicit_id: int | None = args.agent_llm_id
+    scenario: str = args.scenario
+    vision_llm_slug: str | None = args.vision_llm
+    native_arm_model: str | None = args.native_arm_model
+    skip_vision_setup: bool = args.no_vision_llm_setup
+
+    if explicit_id == 0:
+        console.print(
+            "[red]agent_llm_id == 0 (Auto / LiteLLM router) is not allowed — "
+            "results would not be reproducible.[/red]"
+        )
+        return 2
+
+    if scenario not in SCENARIOS:
+        console.print(
+            f"[red]Unknown scenario {scenario!r}. Pick one of: "
+            f"{', '.join(SCENARIOS)}[/red]"
+        )
+        return 2
+
+    # Scenario-specific validation. Each branch documents WHY the rule
+    # exists so the operator's mental model matches what the runner does.
+    if scenario == "cost-arbitrage":
+        if not native_arm_model:
+            console.print(
+                "[red]--scenario cost-arbitrage requires --native-arm-model "
+                "<vision-capable slug>.[/red] The native arm needs a vision "
+                "model to fairly answer image-bearing questions; SurfSense "
+                "answers from already-extracted text via --provider-model."
+            )
+            return 2
+        if native_arm_model == provider_model:
+            console.print(
+                "[yellow]--native-arm-model equals --provider-model in "
+                "cost-arbitrage; that's degenerate (same as head-to-head). "
+                "Pick a different slug or switch to --scenario head-to-head.[/yellow]"
+            )
+    elif scenario in ("head-to-head", "symmetric-cheap"):
+        if native_arm_model:
+            console.print(
+                f"[yellow]--native-arm-model is ignored for --scenario {scenario} "
+                f"(both arms answer with --provider-model={provider_model!r}).[/yellow]"
+            )
+            native_arm_model = None  # don't persist a stale value
+
+    config = load_config()
+    try:
+        token = await acquire_token(config)
+    except CredentialError as exc:
+        console.print(f"[red]{exc}[/red]")
+        return 2
+
+    async with client_with_auth(config, token) as http:
+        candidates = await _list_global_llm_configs(http, config.surfsense_api_base)
+
+        try:
+            agent_llm_id = _resolve_openrouter_id(
+                candidates, provider_model, explicit_id=explicit_id
+            )
+        except RuntimeError as exc:
+            console.print(f"[red]{exc}[/red]")
+            return 2
+
+        ss_client = SearchSpaceClient(http, config.surfsense_api_base)
+        existing = get_suite_state(config, suite)
+        if existing is not None:
+            try:
+                row = await ss_client.get(existing.search_space_id)
+                console.print(
+                    f"Reusing existing SearchSpace [cyan]{row.name}[/cyan] "
+                    f"(id={row.id}) for suite [bold]{suite}[/bold]."
+                )
+                search_space_id = row.id
+            except httpx.HTTPStatusError as exc:
+                if exc.response.status_code == 404:
+                    console.print(
+                        f"[yellow]state.json pointed at SearchSpace id={existing.search_space_id} "
+                        f"but backend returned 404; creating a fresh one.[/yellow]"
+                    )
+                    existing = None
+                else:
+                    raise
+        if existing is None:
+            ss_name = f"eval-{suite}-{utc_iso_timestamp()}"
+            row = await ss_client.create(
+                ss_name, description=f"surfsense-evals lifecycle ({suite})"
+            )
+            console.print(
+                f"Created SearchSpace [cyan]{row.name}[/cyan] (id={row.id}) "
+                f"for suite [bold]{suite}[/bold]."
+            )
+            search_space_id = row.id
+
+        # Resolve + attach the vision LLM config (unless explicitly skipped).
+        # Asymmetric scenarios make the vision LLM at ingest a hard
+        # requirement — without it, SurfSense's chunks have no image
+        # content and the entire framing collapses.
+        vision_required = scenario in ("symmetric-cheap", "cost-arbitrage")
+        vision_config_id: int | None = None
+        vision_provider_model: str | None = None
+        if not skip_vision_setup and (vision_required or vision_llm_slug is not None):
+            try:
+                vision_candidates = await ss_client.list_global_vision_llm_configs()
+                resolved = resolve_vision_llm(
+                    vision_candidates, explicit_slug=vision_llm_slug
+                )
+            except VisionConfigError as exc:
+                console.print(f"[red]{exc}[/red]")
+                return 2
+            vision_config_id = resolved.config_id
+            vision_provider_model = resolved.provider_model
+            console.print(
+                f"Vision LLM at ingest: [cyan]{vision_provider_model}[/cyan] "
+                f"(id={vision_config_id}, selected_via={resolved.selected_via})."
+            )
+
+        pref_kwargs: dict[str, Any] = {"agent_llm_id": agent_llm_id}
+        if vision_config_id is not None:
+            pref_kwargs["vision_llm_config_id"] = vision_config_id
+
+        await ss_client.set_llm_preferences(search_space_id, **pref_kwargs)
+        prefs = await ss_client.get_llm_preferences(search_space_id)
+        if not _validate_pin(prefs, provider_model):
+            agent = prefs.agent_llm or {}
+            console.print(
+                f"[red]LLM pin validation FAILED.[/red] After PUT, "
+                f"agent_llm.provider={agent.get('provider')!r}, "
+                f"model_name={agent.get('model_name')!r}; expected "
+                f"provider=OPENROUTER, model_name={provider_model!r}."
+            )
+            return 2
+        if vision_config_id is not None and prefs.vision_llm_config_id != vision_config_id:
+            console.print(
+                f"[red]Vision LLM pin validation FAILED.[/red] After PUT, "
+                f"vision_llm_config_id={prefs.vision_llm_config_id!r}; "
+                f"expected {vision_config_id!r}."
+            )
+            return 2
+
+        suite_state = SuiteState(
+            search_space_id=search_space_id,
+            agent_llm_id=agent_llm_id,
+            provider_model=provider_model,
+            created_at=utc_iso_timestamp(),
+            ingestion_maps=existing.ingestion_maps if existing else {},
+            scenario=scenario,
+            vision_llm_config_id=vision_config_id,
+            vision_provider_model=vision_provider_model,
+            native_arm_model=native_arm_model,
+        )
+        set_suite_state(config, suite, suite_state)
+
+    summary_bits = [
+        f"suite={suite!r}",
+        f"scenario={scenario!r}",
+        f"search_space_id={suite_state.search_space_id}",
+        f"agent_llm_id={suite_state.agent_llm_id}",
+        f"provider_model={suite_state.provider_model!r}",
+    ]
+    if suite_state.vision_provider_model:
+        summary_bits.append(f"vision_provider_model={suite_state.vision_provider_model!r}")
+    if suite_state.native_arm_model:
+        summary_bits.append(f"native_arm_model={suite_state.native_arm_model!r}")
+    console.print(f"[green]setup OK[/green] {' '.join(summary_bits)}")
+    return 0
+
+
+def _validate_pin(prefs: LlmPreferences, provider_model: str) -> bool:
+    agent = prefs.agent_llm or {}
+    return (
+        str(agent.get("provider", "")).upper() == "OPENROUTER"
+        and str(agent.get("model_name", "")) == provider_model
+    )
+
+
+async def _cmd_teardown(args: argparse.Namespace) -> int:
+    suite = args.suite
+    config = load_config()
+    state = get_suite_state(config, suite)
+    if state is None:
+        console.print(f"[yellow]No state for suite {suite!r}; nothing to tear down.[/yellow]")
+        return 0
+    try:
+        token = await acquire_token(config)
+    except CredentialError as exc:
+        console.print(f"[red]{exc}[/red]")
+        return 2
+    async with client_with_auth(config, token) as http:
+        ss_client = SearchSpaceClient(http, config.surfsense_api_base)
+        try:
+            await ss_client.delete(state.search_space_id)
+        except httpx.HTTPStatusError as exc:
+            console.print(
+                f"[yellow]DELETE failed (HTTP {exc.response.status_code}); "
+                "clearing state.json anyway.[/yellow]"
+            )
+    clear_suite_state(config, suite)
+    console.print(
+        f"[green]teardown OK[/green] suite={suite!r} "
+        f"(SearchSpace soft-deleted, state.json slot cleared)."
+    )
+    return 0
+
+
+async def _cmd_models_list(args: argparse.Namespace) -> int:
+    config = load_config()
+    try:
+        token = await acquire_token(config)
+    except CredentialError as exc:
+        console.print(f"[red]{exc}[/red]")
+        return 2
+    async with client_with_auth(config, token) as http:
+        entries = await _list_global_llm_configs(http, config.surfsense_api_base)
+    grep = (args.grep or "").lower()
+    provider_filter = (args.provider or "").upper()
+    rows: list[LlmConfigEntry] = []
+    for e in entries:
+        if provider_filter and e.provider != provider_filter:
+            continue
+        if grep and grep not in e.model_name.lower() and grep not in e.name.lower():
+            continue
+        rows.append(e)
+    table = Table(
+        title=f"Global LLM configs ({len(rows)} of {len(entries)})",
+        show_lines=False,
+    )
+    table.add_column("id", justify="right", style="cyan")
+    table.add_column("provider", style="magenta")
+    table.add_column("model_name", style="green")
+    table.add_column("name")
+    for e in sorted(rows, key=lambda x: (x.provider, x.model_name)):
+        table.add_row(str(e.id), e.provider, e.model_name, e.name)
+    console.print(table)
+    return 0
+
+
+def _cmd_suites_list(_args: argparse.Namespace) -> int:
+    _discover_suites()
+    suites = registry.list_suites()
+    if not suites:
+        console.print(
+            "[yellow]No suites registered. Drop a benchmark under "
+            "src/surfsense_evals/suites/<domain>/<benchmark>/.[/yellow]"
+        )
+        return 0
+    table = Table(title=f"Registered suites ({len(suites)})")
+    table.add_column("suite", style="bold")
+    table.add_column("benchmarks", style="green")
+    for suite in suites:
+        names = [b.name for b in registry.list_benchmarks(suite)]
+        table.add_row(suite, ", ".join(names) or "<none>")
+    console.print(table)
+    return 0
+
+
+def _cmd_benchmarks_list(args: argparse.Namespace) -> int:
+    _discover_suites()
+    benchmarks = registry.list_benchmarks(args.suite)
+    if not benchmarks:
+        console.print("[yellow]No benchmarks registered.[/yellow]")
+        return 0
+    table = Table(title=f"Benchmarks ({len(benchmarks)})")
+    table.add_column("suite", style="bold")
+    table.add_column("name", style="cyan")
+    table.add_column("headline", justify="center")
+    table.add_column("description")
+    for b in benchmarks:
+        table.add_row(
+            b.suite,
+            b.name,
+            "yes" if b.headline else "no",
+            getattr(b, "description", ""),
+        )
+    console.print(table)
+    return 0
+
+
+async def _cmd_ingest(args: argparse.Namespace) -> int:
+    benchmark = registry.get(args.suite, args.benchmark)
+    config = load_config()
+    state = get_suite_state(config, args.suite)
+    if state is None:
+        console.print(
+            f"[red]No setup for suite {args.suite!r}. Run "
+            f"`python -m surfsense_evals setup --suite {args.suite} "
+            f"--provider-model <slug>` first.[/red]"
+        )
+        return 2
+    try:
+        token = await acquire_token(config)
+    except CredentialError as exc:
+        console.print(f"[red]{exc}[/red]")
+        return 2
+
+    # Forward parsed CLI flags into ingest() so a benchmark can honour
+    # its own flags (e.g. MIRAGE's --skip-snippet-filter / --corpus).
+    extra_kwargs = {
+        k: v
+        for k, v in vars(args).items()
+        if k not in {"_func", "_async", "command", "subcommand", "suite", "benchmark", "log_level"}
+    }
+    async with client_with_auth(config, token) as http:
+        ctx = registry.RunContext(
+            suite=args.suite,
+            benchmark=args.benchmark,
+            config=config,
+            suite_state=state,
+            http=http,
+        )
+        await benchmark.ingest(ctx, **extra_kwargs)
+    console.print(f"[green]ingest OK[/green] {args.suite}/{args.benchmark}")
+    return 0
+
+
+async def _cmd_run(args: argparse.Namespace) -> int:
+    benchmark = registry.get(args.suite, args.benchmark)
+    config = load_config()
+    state = get_suite_state(config, args.suite)
+    if state is None:
+        console.print(
+            f"[red]No setup for suite {args.suite!r}. Run "
+            f"`python -m surfsense_evals setup --suite {args.suite} "
+            f"--provider-model <slug>` first.[/red]"
+        )
+        return 2
+    try:
+        token = await acquire_token(config)
+    except CredentialError as exc:
+        console.print(f"[red]{exc}[/red]")
+        return 2
+
+    extra_kwargs = {
+        k: v
+        for k, v in vars(args).items()
+        if k not in {"_func", "_async", "command", "subcommand", "suite", "benchmark", "log_level"}
+    }
+    async with client_with_auth(config, token) as http:
+        ctx = registry.RunContext(
+            suite=args.suite,
+            benchmark=args.benchmark,
+            config=config,
+            suite_state=state,
+            http=http,
+        )
+        artifact = await benchmark.run(ctx, **extra_kwargs)
+
+    console.print(
+        f"[green]run OK[/green] {args.suite}/{args.benchmark} → "
+        f"{artifact.raw_path}"
+    )
+    return 0
+
+
+async def _cmd_report(args: argparse.Namespace) -> int:
+    from .report import write_report
+
+    benchmark_filter = args.benchmark
+    config = load_config()
+    state = get_suite_state(config, args.suite)
+    if state is None:
+        console.print(f"[red]No setup for suite {args.suite!r}.[/red]")
+        return 2
+    benchmarks = registry.list_benchmarks(args.suite)
+    if benchmark_filter:
+        benchmarks = [b for b in benchmarks if b.name == benchmark_filter]
+        if not benchmarks:
+            console.print(
+                f"[red]No registered benchmark named {benchmark_filter!r} in suite {args.suite!r}.[/red]"
+            )
+            return 2
+
+    artifacts = _collect_artifacts(config, args.suite, [b.name for b in benchmarks])
+    if not artifacts:
+        console.print(
+            "[yellow]No run artifacts found under "
+            f"{config.suite_runs_dir(args.suite)}. Run a benchmark first.[/yellow]"
+        )
+        return 1
+
+    grouped: dict[str, list[registry.RunArtifact]] = {}
+    for art in artifacts:
+        grouped.setdefault(art.benchmark, []).append(art)
+    sections: list[registry.ReportSection] = []
+    for benchmark in benchmarks:
+        if benchmark.name not in grouped:
+            continue
+        sections.append(benchmark.report_section(grouped[benchmark.name]))
+
+    summary_path = write_report(
+        config=config,
+        suite=args.suite,
+        sections=sections,
+        run_timestamp=utc_iso_timestamp(),
+    )
+    console.print(f"[green]report OK[/green] → {summary_path}")
+    return 0
+
+
+def _collect_artifacts(
+    config: Config, suite: str, benchmark_names: list[str]
+) -> list[registry.RunArtifact]:
+    """Walk ``data/<suite>/runs/*/<benchmark>/`` for the latest artifacts.
+
+    Reads any ``run_artifact.json`` written by a benchmark runner. The
+    runner is responsible for writing this manifest alongside its raw
+    JSONL so the report writer doesn't have to know benchmark-specific
+    metric shapes.
+    """
+
+    runs_dir = config.suite_runs_dir(suite)
+    if not runs_dir.exists():
+        return []
+    artifacts: list[registry.RunArtifact] = []
+    by_bench: dict[str, registry.RunArtifact] = {}
+    for ts_dir in sorted(runs_dir.iterdir()):
+        if not ts_dir.is_dir():
+            continue
+        for bench_name in benchmark_names:
+            bench_dir = ts_dir / bench_name
+            manifest = bench_dir / "run_artifact.json"
+            if not manifest.exists():
+                continue
+            try:
+                with manifest.open("r", encoding="utf-8") as fh:
+                    payload = json.load(fh)
+            except (OSError, json.JSONDecodeError):
+                continue
+            artifact = registry.RunArtifact(
+                suite=suite,
+                benchmark=bench_name,
+                run_timestamp=ts_dir.name,
+                raw_path=bench_dir / payload.get("raw_path", "raw.jsonl"),
+                metrics=payload.get("metrics", {}),
+                extra=payload.get("extra", {}),
+            )
+            # Latest run wins per benchmark.
+            by_bench[bench_name] = artifact
+    artifacts = list(by_bench.values())
+    return artifacts
+
+
+# ---------------------------------------------------------------------------
+# Argparse wiring
+# ---------------------------------------------------------------------------
+
+
+def _build_parser() -> argparse.ArgumentParser:
+    parser = argparse.ArgumentParser(
+        prog="surfsense-evals",
+        description="SurfSense evaluation harness — domain-agnostic core + pluggable suites.",
+    )
+    parser.add_argument(
+        "--log-level", default="INFO", choices=["DEBUG", "INFO", "WARNING", "ERROR"]
+    )
+    sub = parser.add_subparsers(dest="command", required=True)
+
+    p_setup = sub.add_parser("setup", help="Create per-suite SearchSpace + pin LLM.")
+    p_setup.add_argument("--suite", required=True)
+    p_setup.add_argument(
+        "--provider-model",
+        required=True,
+        help=(
+            "OpenRouter slug for the SurfSense answer LLM (and the native arm "
+            "too unless --native-arm-model is set), e.g. "
+            "'anthropic/claude-sonnet-4.5'."
+        ),
+    )
+    p_setup.add_argument(
+        "--agent-llm-id",
+        type=int,
+        default=None,
+        help="Optional override for BYOK NewLLMConfig rows.",
+    )
+    p_setup.add_argument(
+        "--scenario",
+        choices=SCENARIOS,
+        default=DEFAULT_SCENARIO,
+        help=(
+            "head-to-head (default): both arms answer with --provider-model; "
+            "symmetric-cheap: both arms use the same cheap text-only slug, "
+            "SurfSense pre-extracted images at ingest with a vision LLM; "
+            "cost-arbitrage: native arm uses --native-arm-model (vision), "
+            "SurfSense uses --provider-model (cheap, text-only) over chunks "
+            "the vision LLM already extracted at ingest."
+        ),
+    )
+    p_setup.add_argument(
+        "--vision-llm",
+        default=None,
+        metavar="SLUG",
+        help=(
+            "OpenRouter slug for the vision LLM SurfSense uses at ingest "
+            "when --use-vision-llm is on. If omitted in symmetric-cheap / "
+            "cost-arbitrage, the strongest registered vision config is "
+            "auto-picked (priority: claude-sonnet-4.5 > claude-opus-4.7 > "
+            "gpt-5 > gemini-2.5-pro)."
+        ),
+    )
+    p_setup.add_argument(
+        "--native-arm-model",
+        default=None,
+        metavar="SLUG",
+        help=(
+            "Required for --scenario cost-arbitrage. OpenRouter slug used "
+            "by the native_pdf arm only; SurfSense answers with "
+            "--provider-model. Ignored for head-to-head / symmetric-cheap."
+        ),
+    )
+    p_setup.add_argument(
+        "--no-vision-llm-setup",
+        action="store_true",
+        help=(
+            "Skip attaching a vision LLM config to the SearchSpace even if "
+            "the scenario would normally require one. Use when you want to "
+            "keep whatever is already attached (e.g. a per-user config)."
+        ),
+    )
+    p_setup.set_defaults(_func=_cmd_setup, _async=True)
+
+    p_teardown = sub.add_parser("teardown", help="Soft-delete the suite SearchSpace + clear state slot.")
+    p_teardown.add_argument("--suite", required=True)
+    p_teardown.set_defaults(_func=_cmd_teardown, _async=True)
+
+    p_models = sub.add_parser("models", help="LLM-config discovery helpers.")
+    models_sub = p_models.add_subparsers(dest="subcommand", required=True)
+    p_models_list = models_sub.add_parser("list", help="List global LLM configs.")
+    p_models_list.add_argument("--provider", default=None, help="Filter by provider, e.g. openrouter")
+    p_models_list.add_argument("--grep", default=None, help="Substring filter on name / model_name.")
+    p_models_list.set_defaults(_func=_cmd_models_list, _async=True)
+
+    p_suites = sub.add_parser("suites", help="List registered suites.")
+    suites_sub = p_suites.add_subparsers(dest="subcommand", required=True)
+    p_suites_list = suites_sub.add_parser("list", help="List suites.")
+    p_suites_list.set_defaults(_func=_cmd_suites_list, _async=False)
+
+    p_benchmarks = sub.add_parser("benchmarks", help="List registered benchmarks.")
+    bench_sub = p_benchmarks.add_subparsers(dest="subcommand", required=True)
+    p_bench_list = bench_sub.add_parser("list", help="List benchmarks.")
+    p_bench_list.add_argument("--suite", default=None)
+    p_bench_list.set_defaults(_func=_cmd_benchmarks_list, _async=False)
+
+    # Dynamic ingest / run subcommands need the registry populated, so
+    # discover up-front (cheap on import — modules just register).
+    _discover_suites()
+
+    p_ingest = sub.add_parser("ingest", help="Ingest a benchmark's corpus.")
+    ingest_sub = p_ingest.add_subparsers(dest="suite", required=True)
+    for suite in registry.list_suites():
+        suite_parser = ingest_sub.add_parser(suite, help=f"Ingest a {suite} benchmark.")
+        suite_bench = suite_parser.add_subparsers(dest="benchmark", required=True)
+        for benchmark in registry.list_benchmarks(suite):
+            bp = suite_bench.add_parser(benchmark.name, help=getattr(benchmark, "description", benchmark.name))
+            if hasattr(benchmark, "add_run_args"):
+                benchmark.add_run_args(bp)
+            bp.set_defaults(_func=_cmd_ingest, _async=True)
+
+    p_run = sub.add_parser("run", help="Run a benchmark.")
+    run_sub = p_run.add_subparsers(dest="suite", required=True)
+    for suite in registry.list_suites():
+        suite_parser = run_sub.add_parser(suite, help=f"Run a {suite} benchmark.")
+        suite_bench = suite_parser.add_subparsers(dest="benchmark", required=True)
+        for benchmark in registry.list_benchmarks(suite):
+            bp = suite_bench.add_parser(benchmark.name, help=getattr(benchmark, "description", benchmark.name))
+            if hasattr(benchmark, "add_run_args"):
+                benchmark.add_run_args(bp)
+            bp.set_defaults(_func=_cmd_run, _async=True)
+
+    p_report = sub.add_parser("report", help="Aggregate latest run artifacts into a summary.")
+    p_report.add_argument("--suite", required=True)
+    p_report.add_argument("--benchmark", default=None, help="Optional: report only this benchmark.")
+    p_report.set_defaults(_func=_cmd_report, _async=True)
+
+    return parser
+
+
+def main(argv: list[str] | None = None) -> int:
+    parser = _build_parser()
+    args = parser.parse_args(argv)
+    logging.basicConfig(
+        level=getattr(logging, args.log_level),
+        format="%(asctime)s %(levelname)s %(name)s %(message)s",
+    )
+    func = getattr(args, "_func", None)
+    if func is None:
+        parser.print_help()
+        return 2
+    is_async = getattr(args, "_async", False)
+    try:
+        if is_async:
+            return asyncio.run(func(args))
+        return func(args)
+    except KeyboardInterrupt:
+        console.print("[yellow]Interrupted.[/yellow]")
+        return 130
+    except Exception as exc:  # noqa: BLE001
+        logger.exception("CLI command failed")
+        console.print(f"[red]Command failed: {exc}[/red]")
+        return 1
+
+
+if __name__ == "__main__":  # pragma: no cover
+    sys.exit(main())
diff --git a/surfsense_evals/src/surfsense_evals/core/clients/__init__.py b/surfsense_evals/src/surfsense_evals/core/clients/__init__.py
new file mode 100644
index 000000000..37246c221
--- /dev/null
+++ b/surfsense_evals/src/surfsense_evals/core/clients/__init__.py
@@ -0,0 +1,14 @@
+"""HTTP clients for the SurfSense API. All share one ``httpx.AsyncClient``."""
+
+from __future__ import annotations
+
+from .documents import DocumentsClient
+from .new_chat import NewChatClient, StreamedAnswer
+from .search_space import SearchSpaceClient
+
+__all__ = [
+    "DocumentsClient",
+    "NewChatClient",
+    "SearchSpaceClient",
+    "StreamedAnswer",
+]
diff --git a/surfsense_evals/src/surfsense_evals/core/clients/documents.py b/surfsense_evals/src/surfsense_evals/core/clients/documents.py
new file mode 100644
index 000000000..02bcf74da
--- /dev/null
+++ b/surfsense_evals/src/surfsense_evals/core/clients/documents.py
@@ -0,0 +1,277 @@
+"""Client for ``/api/v1/documents/{fileupload,status,{id}/chunks}``.
+
+Verified against:
+
+* ``surfsense_backend/app/routes/documents_routes.py:122-292`` (POST fileupload)
+* ``surfsense_backend/app/routes/documents_routes.py:806-871`` (GET status batch)
+* ``surfsense_backend/app/routes/documents_routes.py:1062-1128`` (GET {id}/chunks paginated)
+
+Document processing is asynchronous:
+* ``POST /documents/fileupload`` returns immediately with
+  ``document_ids`` in ``pending``;
+* a Celery worker moves each through ``processing → ready/failed``;
+* the harness polls ``GET /documents/status?document_ids=...`` until
+  every doc is ``ready`` (otherwise the retriever sees an empty corpus
+  and accuracy numbers are meaningless).
+"""
+
+from __future__ import annotations
+
+import asyncio
+import logging
+import mimetypes
+from collections.abc import Iterable, Sequence
+from dataclasses import dataclass, field
+from pathlib import Path
+from typing import Any
+
+import httpx
+
+logger = logging.getLogger(__name__)
+
+
+@dataclass
+class FileUploadResult:
+    """Mirrors the JSON returned by ``POST /documents/fileupload``."""
+
+    document_ids: list[int]
+    duplicate_document_ids: list[int]
+    total_files: int
+    pending_files: int
+    skipped_duplicates: int
+    message: str = ""
+
+    @classmethod
+    def from_payload(cls, payload: dict[str, Any]) -> FileUploadResult:
+        return cls(
+            document_ids=[int(x) for x in payload.get("document_ids", [])],
+            duplicate_document_ids=[int(x) for x in payload.get("duplicate_document_ids", [])],
+            total_files=int(payload.get("total_files", 0)),
+            pending_files=int(payload.get("pending_files", 0)),
+            skipped_duplicates=int(payload.get("skipped_duplicates", 0)),
+            message=str(payload.get("message", "")),
+        )
+
+
+@dataclass
+class DocumentStatus:
+    document_id: int
+    title: str
+    document_type: str
+    state: str
+    reason: str | None = None
+
+    @property
+    def is_ready(self) -> bool:
+        return self.state == "ready"
+
+    @property
+    def is_failed(self) -> bool:
+        return self.state == "failed"
+
+
+@dataclass
+class ChunkRow:
+    id: int
+    document_id: int
+    content: str = ""
+    raw: dict[str, Any] = field(default_factory=dict)
+
+
+class DocumentProcessingFailed(RuntimeError):
+    """Raised when a polled document lands in ``failed``."""
+
+    def __init__(self, statuses: Sequence[DocumentStatus]) -> None:
+        details = ", ".join(
+            f"id={s.document_id} ({s.title!r}): {s.reason or 'unknown'}"
+            for s in statuses
+        )
+        super().__init__(f"Document(s) failed to process: {details}")
+        self.statuses = list(statuses)
+
+
+class DocumentProcessingTimeout(RuntimeError):
+    """Raised when polling exceeds the per-doc timeout budget."""
+
+
+class DocumentsClient:
+    """Document upload + status polling + chunk listing."""
+
+    def __init__(self, http: httpx.AsyncClient, base_url: str) -> None:
+        self._http = http
+        self._base = base_url.rstrip("/")
+
+    # ------------------------------------------------------------------
+    # upload
+    # ------------------------------------------------------------------
+
+    async def upload(
+        self,
+        files: Iterable[Path],
+        *,
+        search_space_id: int,
+        should_summarize: bool = False,
+        use_vision_llm: bool = False,
+        processing_mode: str = "basic",
+    ) -> FileUploadResult:
+        """Upload files to ``/api/v1/documents/fileupload``.
+
+        ``files`` is materialised to a list because we may need to
+        re-read on retry. Caller is responsible for ensuring each path
+        exists and respects the per-file size cap (50 MB backend default).
+        """
+
+        materialised = [Path(p) for p in files]
+        if not materialised:
+            return FileUploadResult(
+                document_ids=[],
+                duplicate_document_ids=[],
+                total_files=0,
+                pending_files=0,
+                skipped_duplicates=0,
+                message="No files supplied",
+            )
+
+        opened: list[tuple[str, Any]] = []
+        try:
+            for path in materialised:
+                # ``open`` directly — httpx wraps it in MultipartStream.
+                file_obj = path.open("rb")
+                mime, _ = mimetypes.guess_type(path.name)
+                opened.append(
+                    (
+                        "files",
+                        (path.name, file_obj, mime or "application/octet-stream"),
+                    )
+                )
+
+            response = await self._http.post(
+                f"{self._base}/api/v1/documents/fileupload",
+                data={
+                    "search_space_id": str(search_space_id),
+                    "should_summarize": "true" if should_summarize else "false",
+                    "use_vision_llm": "true" if use_vision_llm else "false",
+                    "processing_mode": processing_mode,
+                },
+                files=opened,
+                # Multipart uploads can be slow for big PDFs; bump per-call.
+                timeout=httpx.Timeout(120.0, connect=10.0),
+            )
+        finally:
+            for _, (_, file_obj, _) in opened:
+                try:
+                    file_obj.close()
+                except Exception:  # noqa: BLE001
+                    pass
+
+        response.raise_for_status()
+        return FileUploadResult.from_payload(response.json())
+
+    # ------------------------------------------------------------------
+    # status polling
+    # ------------------------------------------------------------------
+
+    async def get_status(
+        self, *, search_space_id: int, document_ids: Sequence[int]
+    ) -> list[DocumentStatus]:
+        if not document_ids:
+            return []
+        response = await self._http.get(
+            f"{self._base}/api/v1/documents/status",
+            params={
+                "search_space_id": search_space_id,
+                "document_ids": ",".join(str(d) for d in document_ids),
+            },
+            headers={"Accept": "application/json"},
+        )
+        response.raise_for_status()
+        payload = response.json()
+        return [
+            DocumentStatus(
+                document_id=int(item["id"]),
+                title=str(item.get("title", "")),
+                document_type=str(item.get("document_type", "")),
+                state=str((item.get("status") or {}).get("state", "ready")),
+                reason=(item.get("status") or {}).get("reason"),
+            )
+            for item in payload.get("items", [])
+        ]
+
+    async def wait_until_ready(
+        self,
+        *,
+        search_space_id: int,
+        document_ids: Sequence[int],
+        timeout_s: float = 300.0,
+        initial_poll_s: float = 1.0,
+        max_poll_s: float = 10.0,
+    ) -> list[DocumentStatus]:
+        """Poll ``GET /documents/status`` until every doc is ``ready``.
+
+        Exponential backoff from ``initial_poll_s`` up to ``max_poll_s``.
+        Raises ``DocumentProcessingFailed`` if any doc lands in
+        ``failed`` (with the offending document ids), or
+        ``DocumentProcessingTimeout`` if the budget is exhausted.
+        """
+
+        if not document_ids:
+            return []
+        deadline = asyncio.get_event_loop().time() + timeout_s
+        poll = initial_poll_s
+        while True:
+            statuses = await self.get_status(
+                search_space_id=search_space_id, document_ids=document_ids
+            )
+            failed = [s for s in statuses if s.is_failed]
+            if failed:
+                raise DocumentProcessingFailed(failed)
+            ready = [s for s in statuses if s.is_ready]
+            if len(ready) == len(document_ids):
+                return statuses
+            now = asyncio.get_event_loop().time()
+            if now >= deadline:
+                pending = [s for s in statuses if not s.is_ready and not s.is_failed]
+                pending_ids = [s.document_id for s in pending]
+                raise DocumentProcessingTimeout(
+                    f"Timed out after {timeout_s:.0f}s waiting for documents "
+                    f"(still pending/processing: {pending_ids})"
+                )
+            await asyncio.sleep(min(poll, max(0.1, deadline - now)))
+            poll = min(poll * 1.5, max_poll_s)
+
+    # ------------------------------------------------------------------
+    # chunks (chunk_id -> document_id map)
+    # ------------------------------------------------------------------
+
+    async def list_chunks(
+        self, document_id: int, *, page_size: int = 100
+    ) -> list[ChunkRow]:
+        """Walk ``GET /documents/{id}/chunks`` until ``has_more=False``.
+
+        Used by ingestion to materialise the ``chunk_id -> document_id``
+        map needed for retrieval scoring (CUREv1).
+        """
+
+        rows: list[ChunkRow] = []
+        page = 0
+        while True:
+            response = await self._http.get(
+                f"{self._base}/api/v1/documents/{document_id}/chunks",
+                params={"page": page, "page_size": page_size},
+                headers={"Accept": "application/json"},
+            )
+            response.raise_for_status()
+            payload = response.json()
+            for item in payload.get("items", []):
+                rows.append(
+                    ChunkRow(
+                        id=int(item["id"]),
+                        document_id=document_id,
+                        content=str(item.get("content", "")),
+                        raw=item,
+                    )
+                )
+            if not payload.get("has_more"):
+                break
+            page += 1
+        return rows
diff --git a/surfsense_evals/src/surfsense_evals/core/clients/new_chat.py b/surfsense_evals/src/surfsense_evals/core/clients/new_chat.py
new file mode 100644
index 000000000..a4c23d010
--- /dev/null
+++ b/surfsense_evals/src/surfsense_evals/core/clients/new_chat.py
@@ -0,0 +1,280 @@
+"""Client for ``/api/v1/threads`` and ``/api/v1/new_chat`` (SSE).
+
+Verified against:
+
+* ``surfsense_backend/app/routes/new_chat_routes.py:793-848`` (POST /threads)
+* ``surfsense_backend/app/routes/new_chat_routes.py:1073-1142`` (DELETE /threads/{id})
+* ``surfsense_backend/app/routes/new_chat_routes.py:1689-1800`` (POST /new_chat SSE)
+* ``surfsense_backend/app/routes/new_chat_routes.py:191-220`` (THREAD_BUSY / TURN_CANCELLING 409)
+* ``surfsense_backend/app/services/streaming/envelope/sse.py`` (wire framing)
+* ``surfsense_backend/app/services/streaming/events/text.py`` (text-delta events)
+* ``surfsense_backend/app/schemas/new_chat.py:234-288`` (NewChatRequest body)
+
+The wire format is "Vercel AI SDK"-flavoured SSE with one event per
+``data: <json>\n\n`` block (or the literal ``data: [DONE]\n\n``
+terminator). Text deltas arrive as ``{"type":"text-delta","id":...,"delta":...}``
+events; we accumulate them per ``id`` and emit the final concatenated
+text plus parsed citations.
+"""
+
+from __future__ import annotations
+
+import asyncio
+import json
+import logging
+import time
+from collections.abc import AsyncIterator, Sequence
+from dataclasses import dataclass, field
+from typing import Any
+
+import httpx
+
+from ..parse import iter_sse_events, parse_citations
+
+logger = logging.getLogger(__name__)
+
+
+@dataclass
+class StreamedAnswer:
+    """Result of a single ``/new_chat`` turn."""
+
+    text: str
+    raw_events: list[dict[str, Any]] = field(default_factory=list)
+    latency_ms: int = 0
+    user_message_id: str | None = None
+    assistant_message_id: str | None = None
+    finished_normally: bool = False
+
+    @property
+    def citations(self) -> list[dict[str, Any]]:
+        """Parsed citation tokens (lazy; small enough to recompute)."""
+
+        return [token.to_dict() for token in parse_citations(self.text)]
+
+
+class ThreadBusyError(RuntimeError):
+    """Raised after exhausting retries on a 409 ``THREAD_BUSY`` / ``TURN_CANCELLING``."""
+
+    def __init__(self, error_code: str, message: str) -> None:
+        super().__init__(f"{error_code}: {message}")
+        self.error_code = error_code
+
+
+class NewChatClient:
+    """Thread create / delete / SSE ask."""
+
+    def __init__(self, http: httpx.AsyncClient, base_url: str) -> None:
+        self._http = http
+        self._base = base_url.rstrip("/")
+
+    # ------------------------------------------------------------------
+    # threads
+    # ------------------------------------------------------------------
+
+    async def create_thread(
+        self,
+        *,
+        search_space_id: int,
+        title: str = "eval",
+        archived: bool = False,
+        visibility: str = "PRIVATE",
+    ) -> int:
+        response = await self._http.post(
+            f"{self._base}/api/v1/threads",
+            json={
+                "search_space_id": search_space_id,
+                "title": title,
+                "archived": archived,
+                "visibility": visibility,
+            },
+            headers={"Accept": "application/json"},
+        )
+        response.raise_for_status()
+        payload = response.json()
+        return int(payload["id"])
+
+    async def delete_thread(self, thread_id: int) -> None:
+        response = await self._http.delete(
+            f"{self._base}/api/v1/threads/{thread_id}",
+            headers={"Accept": "application/json"},
+        )
+        if response.status_code == 404:
+            return  # idempotent
+        response.raise_for_status()
+
+    # ------------------------------------------------------------------
+    # /new_chat SSE
+    # ------------------------------------------------------------------
+
+    async def ask(
+        self,
+        *,
+        thread_id: int,
+        search_space_id: int,
+        user_query: str,
+        mentioned_document_ids: Sequence[int] | None = None,
+        disabled_tools: Sequence[str] | None = None,
+        max_busy_retries: int = 4,
+        timeout_s: float = 600.0,
+    ) -> StreamedAnswer:
+        """Stream a single turn and return the accumulated answer.
+
+        Honours backend ``THREAD_BUSY`` / ``TURN_CANCELLING`` 409
+        responses by sleeping for the ``Retry-After`` header (or the
+        ``retry-after-ms`` header if present) and replaying. Bounded
+        by ``max_busy_retries`` so a stuck thread never blocks the
+        whole run.
+        """
+
+        body: dict[str, Any] = {
+            "chat_id": thread_id,
+            "search_space_id": search_space_id,
+            "user_query": user_query,
+        }
+        if mentioned_document_ids:
+            body["mentioned_document_ids"] = list(mentioned_document_ids)
+        if disabled_tools:
+            body["disabled_tools"] = list(disabled_tools)
+
+        attempt = 0
+        while True:
+            try:
+                return await self._stream_once(body=body, timeout_s=timeout_s)
+            except ThreadBusyError as exc:
+                attempt += 1
+                if attempt > max_busy_retries:
+                    raise
+                # Cap wait at 30s; backend retry hint is exponential anyway.
+                wait = min(30.0, 0.5 * (2 ** attempt))
+                logger.info(
+                    "thread_id=%s busy (%s); retry %d/%d after %.1fs",
+                    thread_id,
+                    exc.error_code,
+                    attempt,
+                    max_busy_retries,
+                    wait,
+                )
+                await asyncio.sleep(wait)
+
+    async def _stream_once(
+        self,
+        *,
+        body: dict[str, Any],
+        timeout_s: float,
+    ) -> StreamedAnswer:
+        # Per-call timeout — the connect should be quick, the read needs
+        # to outlive the longest LLM completion.
+        timeout = httpx.Timeout(timeout_s, connect=10.0)
+        started = time.monotonic()
+        async with self._http.stream(
+            "POST",
+            f"{self._base}/api/v1/new_chat",
+            json=body,
+            headers={"Accept": "text/event-stream"},
+            timeout=timeout,
+        ) as response:
+            if response.status_code == 409:
+                detail = await self._extract_busy_detail(response)
+                raise ThreadBusyError(
+                    error_code=detail.get("errorCode", "THREAD_BUSY"),
+                    message=detail.get("message", "Thread is busy"),
+                )
+            response.raise_for_status()
+            answer = await self._consume_sse(response)
+        answer.latency_ms = int((time.monotonic() - started) * 1000)
+        return answer
+
+    @staticmethod
+    async def _extract_busy_detail(response: httpx.Response) -> dict[str, Any]:
+        try:
+            payload = json.loads(await response.aread())
+        except (json.JSONDecodeError, ValueError):
+            return {"errorCode": "THREAD_BUSY", "message": response.text}
+        if isinstance(payload, dict) and isinstance(payload.get("detail"), dict):
+            return payload["detail"]
+        return payload if isinstance(payload, dict) else {}
+
+    @staticmethod
+    async def _consume_sse(response: httpx.Response) -> StreamedAnswer:
+        """Walk SSE events, accumulate text-delta payloads.
+
+        Backend events of interest:
+
+        * ``{"type": "text-start", "id": ...}``
+        * ``{"type": "text-delta", "id": ..., "delta": ...}``
+        * ``{"type": "text-end", "id": ...}``
+        * ``{"type": "start", "messageId": ...}``  (top-level message id)
+        * ``{"type": "finish"}``
+        * literal ``[DONE]`` sentinel
+
+        Multiple ``text-start`` blocks can interleave — each gets its
+        own ``id`` and we concatenate them in arrival order. That
+        mirrors the AI SDK client behaviour: one continuous assistant
+        message visible to the user.
+        """
+
+        ordered_text_ids: list[str] = []
+        text_buffers: dict[str, list[str]] = {}
+        raw_events: list[dict[str, Any]] = []
+        user_message_id: str | None = None
+        assistant_message_id: str | None = None
+        finished = False
+
+        async for event in iter_sse_events(_aiter_lines(response)):
+            data = event.data
+            if data == "[DONE]":
+                finished = True
+                continue
+            try:
+                payload = json.loads(data)
+            except (json.JSONDecodeError, ValueError):
+                logger.debug("Skipping non-JSON SSE payload: %r", data[:120])
+                continue
+            if not isinstance(payload, dict):
+                continue
+            raw_events.append(payload)
+            ev_type = payload.get("type")
+            if ev_type == "text-delta":
+                tid = str(payload.get("id", ""))
+                delta = payload.get("delta", "")
+                if not isinstance(delta, str):
+                    continue
+                if tid not in text_buffers:
+                    text_buffers[tid] = []
+                    ordered_text_ids.append(tid)
+                text_buffers[tid].append(delta)
+            elif ev_type == "text-start":
+                tid = str(payload.get("id", ""))
+                if tid and tid not in text_buffers:
+                    text_buffers[tid] = []
+                    ordered_text_ids.append(tid)
+            elif ev_type == "start":
+                msg_id = payload.get("messageId")
+                if isinstance(msg_id, str):
+                    user_message_id = user_message_id or msg_id
+            elif ev_type == "data-user-message-id":
+                msg_id = (payload.get("data") or {}).get("id") or payload.get("id")
+                if isinstance(msg_id, str):
+                    user_message_id = msg_id
+            elif ev_type == "data-assistant-message-id":
+                msg_id = (payload.get("data") or {}).get("id") or payload.get("id")
+                if isinstance(msg_id, str):
+                    assistant_message_id = msg_id
+            elif ev_type == "finish":
+                finished = True
+
+        text = "".join("".join(text_buffers.get(tid, [])) for tid in ordered_text_ids)
+        return StreamedAnswer(
+            text=text,
+            raw_events=raw_events,
+            user_message_id=user_message_id,
+            assistant_message_id=assistant_message_id,
+            finished_normally=finished,
+        )
+
+
+async def _aiter_lines(response: httpx.Response) -> AsyncIterator[str]:
+    """Adapter so the parser can consume any line iterator (mockable in tests)."""
+
+    async for line in response.aiter_lines():
+        yield line
diff --git a/surfsense_evals/src/surfsense_evals/core/clients/search_space.py b/surfsense_evals/src/surfsense_evals/core/clients/search_space.py
new file mode 100644
index 000000000..37fa69f80
--- /dev/null
+++ b/surfsense_evals/src/surfsense_evals/core/clients/search_space.py
@@ -0,0 +1,207 @@
+"""Client for ``/api/v1/searchspaces`` and ``/api/v1/search-spaces/{id}/llm-preferences``.
+
+Verified against:
+
+* ``surfsense_backend/app/routes/search_spaces_routes.py:116`` (POST create)
+* ``surfsense_backend/app/routes/search_spaces_routes.py:234`` (GET by id)
+* ``surfsense_backend/app/routes/search_spaces_routes.py:422`` (DELETE soft-delete)
+* ``surfsense_backend/app/routes/search_spaces_routes.py:698-849`` (GET/PUT llm-preferences)
+* ``surfsense_backend/app/schemas/search_space.py:14`` (SearchSpaceCreate body)
+* ``surfsense_backend/app/routes/vision_llm_routes.py:60`` (GET global vision configs)
+
+Note the inconsistent pluralisation in the backend: ``/searchspaces``
+(no hyphen) for CRUD, but ``/search-spaces`` (hyphenated) for the
+``llm-preferences`` sub-resource. Both are mirrored verbatim here.
+"""
+
+from __future__ import annotations
+
+from dataclasses import dataclass
+from typing import Any
+
+import httpx
+
+
+@dataclass
+class SearchSpaceRow:
+    """Subset of the SearchSpace row we care about."""
+
+    id: int
+    name: str
+    description: str | None
+    user_id: str
+    citations_enabled: bool
+    qna_custom_instructions: str | None
+
+    @classmethod
+    def from_payload(cls, payload: dict[str, Any]) -> SearchSpaceRow:
+        return cls(
+            id=int(payload["id"]),
+            name=str(payload["name"]),
+            description=payload.get("description"),
+            user_id=str(payload.get("user_id", "")),
+            citations_enabled=bool(payload.get("citations_enabled", True)),
+            qna_custom_instructions=payload.get("qna_custom_instructions"),
+        )
+
+
+@dataclass
+class VisionLlmConfigEntry:
+    """Subset of one ``GET /global-vision-llm-configs`` row.
+
+    The backend returns negative ids for global / OpenRouter-derived
+    vision configs and positive ids for per-user BYOK rows. Either is
+    accepted by ``set_llm_preferences(vision_llm_config_id=...)``.
+    """
+
+    id: int
+    name: str
+    provider: str
+    model_name: str
+    is_auto_mode: bool
+    raw: dict[str, Any]
+
+    @classmethod
+    def from_payload(cls, payload: dict[str, Any]) -> VisionLlmConfigEntry:
+        return cls(
+            id=int(payload.get("id", 0)),
+            name=str(payload.get("name", "")),
+            provider=str(payload.get("provider", "")).upper(),
+            model_name=str(payload.get("model_name", "")),
+            is_auto_mode=bool(payload.get("is_auto_mode", False)),
+            raw=payload,
+        )
+
+
+@dataclass
+class LlmPreferences:
+    """Resolved LLM preferences with the embedded full config row.
+
+    Mirrors ``LLMPreferencesRead`` from the backend so the lifecycle
+    command can introspect ``provider`` / ``model_name`` to validate the
+    OpenRouter pin.
+    """
+
+    agent_llm_id: int | None
+    document_summary_llm_id: int | None
+    image_generation_config_id: int | None
+    vision_llm_config_id: int | None
+    agent_llm: dict[str, Any] | None
+    raw: dict[str, Any]
+
+    @classmethod
+    def from_payload(cls, payload: dict[str, Any]) -> LlmPreferences:
+        return cls(
+            agent_llm_id=payload.get("agent_llm_id"),
+            document_summary_llm_id=payload.get("document_summary_llm_id"),
+            image_generation_config_id=payload.get("image_generation_config_id"),
+            vision_llm_config_id=payload.get("vision_llm_config_id"),
+            agent_llm=payload.get("agent_llm"),
+            raw=payload,
+        )
+
+
+class SearchSpaceClient:
+    """Thin wrapper around the SearchSpace + LLM preferences endpoints."""
+
+    def __init__(self, http: httpx.AsyncClient, base_url: str) -> None:
+        self._http = http
+        self._base = base_url.rstrip("/")
+
+    async def create(self, name: str, *, description: str | None = None) -> SearchSpaceRow:
+        body: dict[str, Any] = {"name": name}
+        if description is not None:
+            body["description"] = description
+        # citations_enabled defaults to True backend-side; keep that default.
+        response = await self._http.post(
+            f"{self._base}/api/v1/searchspaces",
+            json=body,
+            headers={"Accept": "application/json"},
+        )
+        response.raise_for_status()
+        return SearchSpaceRow.from_payload(response.json())
+
+    async def get(self, search_space_id: int) -> SearchSpaceRow:
+        response = await self._http.get(
+            f"{self._base}/api/v1/searchspaces/{search_space_id}",
+            headers={"Accept": "application/json"},
+        )
+        response.raise_for_status()
+        return SearchSpaceRow.from_payload(response.json())
+
+    async def delete(self, search_space_id: int) -> None:
+        """Soft-delete: backend prefixes name with ``[DELETING]`` and dispatches a Celery cascade."""
+
+        response = await self._http.delete(
+            f"{self._base}/api/v1/searchspaces/{search_space_id}",
+            headers={"Accept": "application/json"},
+        )
+        # 404 means it's already gone — treat as success (idempotent teardown).
+        if response.status_code == 404:
+            return
+        response.raise_for_status()
+
+    async def get_llm_preferences(self, search_space_id: int) -> LlmPreferences:
+        response = await self._http.get(
+            f"{self._base}/api/v1/search-spaces/{search_space_id}/llm-preferences",
+            headers={"Accept": "application/json"},
+        )
+        response.raise_for_status()
+        return LlmPreferences.from_payload(response.json())
+
+    async def set_llm_preferences(
+        self,
+        search_space_id: int,
+        *,
+        agent_llm_id: int | None = None,
+        document_summary_llm_id: int | None = None,
+        image_generation_config_id: int | None = None,
+        vision_llm_config_id: int | None = None,
+    ) -> LlmPreferences:
+        """PUT a partial update to ``/search-spaces/{id}/llm-preferences``.
+
+        Backend uses ``model_dump(exclude_unset=True)`` so omitted fields
+        are left unchanged.
+        """
+
+        body: dict[str, Any] = {}
+        if agent_llm_id is not None:
+            body["agent_llm_id"] = agent_llm_id
+        if document_summary_llm_id is not None:
+            body["document_summary_llm_id"] = document_summary_llm_id
+        if image_generation_config_id is not None:
+            body["image_generation_config_id"] = image_generation_config_id
+        if vision_llm_config_id is not None:
+            body["vision_llm_config_id"] = vision_llm_config_id
+        response = await self._http.put(
+            f"{self._base}/api/v1/search-spaces/{search_space_id}/llm-preferences",
+            json=body,
+            headers={"Accept": "application/json"},
+        )
+        response.raise_for_status()
+        return LlmPreferences.from_payload(response.json())
+
+    async def list_global_vision_llm_configs(self) -> list[VisionLlmConfigEntry]:
+        """List the registered global vision LLM configs.
+
+        Used by ``setup`` to (a) resolve an explicit ``--vision-llm <slug>``
+        to a config id and (b) auto-pick the strongest registered vision
+        config when the operator doesn't pass one. The ``Auto (Fastest)``
+        entry (``id=0``) is filtered out — accuracy must be reproducible.
+        """
+
+        response = await self._http.get(
+            f"{self._base}/api/v1/global-vision-llm-configs",
+            headers={"Accept": "application/json"},
+        )
+        response.raise_for_status()
+        payload = response.json()
+        if not isinstance(payload, list):
+            raise RuntimeError(
+                f"Unexpected /global-vision-llm-configs payload: {payload!r}"
+            )
+        return [
+            VisionLlmConfigEntry.from_payload(item)
+            for item in payload
+            if not bool(item.get("is_auto_mode", False))
+        ]
diff --git a/surfsense_evals/src/surfsense_evals/core/config.py b/surfsense_evals/src/surfsense_evals/core/config.py
new file mode 100644
index 000000000..164955914
--- /dev/null
+++ b/surfsense_evals/src/surfsense_evals/core/config.py
@@ -0,0 +1,279 @@
+"""Environment + filesystem configuration for the harness.
+
+Two responsibilities:
+
+1. Load env vars (with sensible defaults) into a single immutable ``Config``
+   so that every other module reads it from one place.
+2. Read / write ``data/state.json``. State is keyed by suite name so multiple
+   suites can be set up in parallel and torn down independently.
+
+The pinned ``search_space_id`` lives in ``state.json`` (not env) so re-runs
+are idempotent without forcing the operator to remember an integer.
+"""
+
+from __future__ import annotations
+
+import json
+import os
+from collections.abc import Mapping
+from dataclasses import dataclass, field
+from datetime import UTC, datetime
+from pathlib import Path
+from typing import Any
+
+from dotenv import load_dotenv
+
+# Resolve once at import time. ``find_dotenv`` walks up; an explicit ``.env``
+# at the package root or in CWD wins. Silent-no-op if neither exists.
+load_dotenv()
+
+
+_PROJECT_ROOT = Path(__file__).resolve().parents[3]
+"""Resolves to ``surfsense_evals/`` (the package root, not ``src/``)."""
+
+
+def _project_root() -> Path:
+    """Return the ``surfsense_evals/`` project root.
+
+    Computed from this file's path: ``src/surfsense_evals/core/config.py`` →
+    walk up four levels. Kept as a function so tests can monkeypatch.
+    """
+
+    return _PROJECT_ROOT
+
+
+@dataclass(frozen=True)
+class Config:
+    """Immutable runtime configuration."""
+
+    surfsense_api_base: str
+    openrouter_api_key: str | None
+    openrouter_base_url: str
+
+    # Credentials — exactly ONE mode must be supplied.
+    surfsense_jwt: str | None
+    surfsense_refresh_token: str | None
+    surfsense_user_email: str | None
+    surfsense_user_password: str | None
+
+    # Filesystem paths.
+    data_dir: Path
+    reports_dir: Path
+
+    @property
+    def state_path(self) -> Path:
+        return self.data_dir / "state.json"
+
+    def has_jwt_mode(self) -> bool:
+        return bool(self.surfsense_jwt)
+
+    def has_local_mode(self) -> bool:
+        return bool(self.surfsense_user_email and self.surfsense_user_password)
+
+    def credential_mode(self) -> str:
+        """Return ``"jwt"``, ``"local"``, or ``"none"`` (no credentials supplied)."""
+
+        if self.has_jwt_mode():
+            return "jwt"
+        if self.has_local_mode():
+            return "local"
+        return "none"
+
+    def suite_data_dir(self, suite: str) -> Path:
+        return self.data_dir / suite
+
+    def suite_reports_dir(self, suite: str) -> Path:
+        return self.reports_dir / suite
+
+    def suite_runs_dir(self, suite: str) -> Path:
+        return self.suite_data_dir(suite) / "runs"
+
+    def suite_maps_dir(self, suite: str) -> Path:
+        return self.suite_data_dir(suite) / "maps"
+
+
+def load_config() -> Config:
+    """Read the current process env into a ``Config``.
+
+    No validation is performed here; callers (e.g. ``auth.acquire_token``,
+    ``cli`` subcommands) decide which fields they require. This keeps
+    ``models list`` and ``suites list`` runnable without OpenRouter creds.
+    """
+
+    project_root = _project_root()
+    data_dir = Path(os.environ.get("EVAL_DATA_DIR") or (project_root / "data")).resolve()
+    reports_dir = Path(os.environ.get("EVAL_REPORTS_DIR") or (project_root / "reports")).resolve()
+    return Config(
+        surfsense_api_base=os.environ.get("SURFSENSE_API_BASE", "http://localhost:8000").rstrip("/"),
+        openrouter_api_key=os.environ.get("OPENROUTER_API_KEY") or None,
+        openrouter_base_url=os.environ.get(
+            "OPENROUTER_BASE_URL", "https://openrouter.ai/api/v1"
+        ).rstrip("/"),
+        surfsense_jwt=os.environ.get("SURFSENSE_JWT") or None,
+        surfsense_refresh_token=os.environ.get("SURFSENSE_REFRESH_TOKEN") or None,
+        surfsense_user_email=os.environ.get("SURFSENSE_USER_EMAIL") or None,
+        surfsense_user_password=os.environ.get("SURFSENSE_USER_PASSWORD") or None,
+        data_dir=data_dir,
+        reports_dir=reports_dir,
+    )
+
+
+# ---------------------------------------------------------------------------
+# state.json — per-suite slots
+# ---------------------------------------------------------------------------
+
+
+# Scenario names — chosen at ``setup`` time, persisted in ``state.json``.
+#
+# * ``head-to-head`` (default, current behaviour): both arms answer with the
+#   SAME slug pinned via ``--provider-model``. Vision LLM at ingest is
+#   optional but recommended for image-bearing benchmarks.
+# * ``symmetric-cheap``: both arms answer with the SAME (cheap, text-only)
+#   slug; SurfSense pre-extracted images at ingest with a vision LLM.
+#   Measures whether vision-RAG ingestion lets a cheap downstream model
+#   match a vision one. Native arm structurally loses on image questions —
+#   that's the point, and the report labels it accordingly.
+# * ``cost-arbitrage``: native arm answers with an EXPENSIVE vision slug
+#   (``--native-arm-model``), SurfSense answers with a CHEAP text-only slug
+#   (``--provider-model``) over chunks the vision LLM already extracted at
+#   ingest. Measures how close SurfSense gets to native at a fraction of
+#   the per-query cost. The most compelling "shines" framing.
+SCENARIOS: tuple[str, ...] = ("head-to-head", "symmetric-cheap", "cost-arbitrage")
+DEFAULT_SCENARIO: str = "head-to-head"
+
+
+@dataclass
+class SuiteState:
+    """Per-suite persisted state.
+
+    ``provider_model`` is the slug pinned to the SearchSpace's
+    ``agent_llm`` — what answers SurfSense queries (and what the native
+    arm uses too, unless ``native_arm_model`` is set for cost-arbitrage).
+
+    ``vision_provider_model`` is the slug of the OpenRouter vision LLM
+    config attached to the SearchSpace's ``vision_llm_config_id`` — what
+    SurfSense uses to extract image content at ingest time when
+    ``use_vision_llm=True``. ``None`` means no vision config was attached
+    at setup (legacy or text-only suite).
+    """
+
+    search_space_id: int
+    agent_llm_id: int
+    provider_model: str
+    created_at: str
+    ingestion_maps: dict[str, str] = field(default_factory=dict)
+    scenario: str = DEFAULT_SCENARIO
+    vision_llm_config_id: int | None = None
+    vision_provider_model: str | None = None
+    native_arm_model: str | None = None
+
+    def to_dict(self) -> dict[str, Any]:
+        return {
+            "search_space_id": self.search_space_id,
+            "agent_llm_id": self.agent_llm_id,
+            "provider_model": self.provider_model,
+            "created_at": self.created_at,
+            "ingestion_maps": dict(self.ingestion_maps),
+            "scenario": self.scenario,
+            "vision_llm_config_id": self.vision_llm_config_id,
+            "vision_provider_model": self.vision_provider_model,
+            "native_arm_model": self.native_arm_model,
+        }
+
+    @classmethod
+    def from_dict(cls, payload: Mapping[str, Any]) -> SuiteState:
+        # ``scenario`` / vision / native fields default for back-compat with
+        # ``state.json`` written before scenarios shipped.
+        scenario = str(payload.get("scenario") or DEFAULT_SCENARIO)
+        if scenario not in SCENARIOS:
+            scenario = DEFAULT_SCENARIO
+        raw_vision_id = payload.get("vision_llm_config_id")
+        return cls(
+            search_space_id=int(payload["search_space_id"]),
+            agent_llm_id=int(payload["agent_llm_id"]),
+            provider_model=str(payload["provider_model"]),
+            created_at=str(payload.get("created_at") or ""),
+            ingestion_maps=dict(payload.get("ingestion_maps") or {}),
+            scenario=scenario,
+            vision_llm_config_id=int(raw_vision_id) if raw_vision_id is not None else None,
+            vision_provider_model=(
+                str(payload["vision_provider_model"])
+                if payload.get("vision_provider_model")
+                else None
+            ),
+            native_arm_model=(
+                str(payload["native_arm_model"])
+                if payload.get("native_arm_model")
+                else None
+            ),
+        )
+
+    @property
+    def effective_native_arm_model(self) -> str:
+        """Slug the native arm should use; falls back to ``provider_model``."""
+
+        return self.native_arm_model or self.provider_model
+
+
+def _load_state(config: Config) -> dict[str, Any]:
+    if not config.state_path.exists():
+        return {"suites": {}}
+    try:
+        with config.state_path.open("r", encoding="utf-8") as fh:
+            data = json.load(fh)
+    except (OSError, json.JSONDecodeError) as exc:
+        raise RuntimeError(
+            f"Failed to read state file {config.state_path}: {exc!s}. "
+            "Delete it if you want to start fresh."
+        ) from exc
+    if not isinstance(data, dict) or "suites" not in data:
+        return {"suites": {}}
+    return data
+
+
+def _write_state(config: Config, payload: Mapping[str, Any]) -> None:
+    config.data_dir.mkdir(parents=True, exist_ok=True)
+    tmp = config.state_path.with_suffix(".json.tmp")
+    with tmp.open("w", encoding="utf-8") as fh:
+        json.dump(dict(payload), fh, indent=2, sort_keys=True)
+        fh.write("\n")
+    tmp.replace(config.state_path)
+
+
+def get_suite_state(config: Config, suite: str) -> SuiteState | None:
+    """Return ``SuiteState`` for ``suite`` or ``None`` if not set up."""
+
+    state = _load_state(config)
+    raw = (state.get("suites") or {}).get(suite)
+    if not raw:
+        return None
+    return SuiteState.from_dict(raw)
+
+
+def set_suite_state(config: Config, suite: str, suite_state: SuiteState) -> None:
+    """Persist ``suite_state`` under the suite slot. Other suites are untouched."""
+
+    state = _load_state(config)
+    suites = dict(state.get("suites") or {})
+    suites[suite] = suite_state.to_dict()
+    state["suites"] = suites
+    _write_state(config, state)
+
+
+def clear_suite_state(config: Config, suite: str) -> bool:
+    """Remove the slot for ``suite``. Returns ``True`` if removal happened."""
+
+    state = _load_state(config)
+    suites = dict(state.get("suites") or {})
+    if suite not in suites:
+        return False
+    del suites[suite]
+    state["suites"] = suites
+    _write_state(config, state)
+    return True
+
+
+def utc_iso_timestamp() -> str:
+    """Filesystem-safe UTC ISO timestamp, e.g. ``2026-05-11T20-30-00Z``."""
+
+    return datetime.now(UTC).strftime("%Y-%m-%dT%H-%M-%SZ")
diff --git a/surfsense_evals/src/surfsense_evals/core/ingest_settings.py b/surfsense_evals/src/surfsense_evals/core/ingest_settings.py
new file mode 100644
index 000000000..5cdece577
--- /dev/null
+++ b/surfsense_evals/src/surfsense_evals/core/ingest_settings.py
@@ -0,0 +1,311 @@
+"""Per-upload ingestion settings shared across every benchmark.
+
+The SurfSense ``POST /api/v1/documents/fileupload`` endpoint exposes
+exactly three knobs (verified at
+``surfsense_backend/app/routes/documents_routes.py`` and
+``surfsense_backend/app/etl_pipeline/etl_document.py``):
+
+* ``processing_mode``     — ``"basic"`` (default) | ``"premium"``
+* ``use_vision_llm``      — ``bool`` (run vision LLM during ingest to
+                            extract image content / captions / tables)
+* ``should_summarize``    — ``bool`` (generate document summary)
+
+This module gives every benchmark a uniform way to:
+
+1. Receive sensible per-benchmark defaults (text-only benchmarks
+   default vision off; image-bearing benchmarks default vision on).
+2. Accept CLI overrides (``--use-vision-llm`` / ``--no-vision-llm``,
+   ``--processing-mode {basic,premium}``,
+   ``--should-summarize`` / ``--no-summarize``).
+3. Persist the *actual* settings used into the doc-map manifest and
+   the run artifact so reports can show "vision=ON, mode=premium →
+   65% accuracy" head-to-head with "vision=OFF, mode=basic → 52%".
+
+A/B testing on the same corpus
+------------------------------
+
+SurfSense dedupes uploads by ``(filename, search_space_id)`` — NOT by
+content hash and NOT by ingestion settings. Re-uploading the same
+filename to the same SearchSpace with a different ``use_vision_llm``
+flag will hit the duplicate branch and *not* re-process. To compare
+two settings combos head-to-head on the same corpus you must give
+each combo its own SearchSpace, which today means:
+
+    teardown --suite <s>
+    setup    --suite <s> ...
+    ingest   <s> <bench>  --no-vision-llm   # baseline run
+    run      <s> <bench>
+    teardown --suite <s>
+    setup    --suite <s> ...
+    ingest   <s> <bench>  --use-vision-llm  # vision arm
+    run      <s> <bench>
+
+The runs land in different timestamped subdirectories under
+``data/<suite>/runs/`` and ``report --suite <s>`` aggregates whichever
+manifest is currently latest per benchmark.
+"""
+
+from __future__ import annotations
+
+import argparse
+import json
+from collections.abc import Mapping
+from dataclasses import dataclass
+from pathlib import Path
+from typing import Any
+
+# Keep the constant list of valid processing modes here so benchmarks
+# don't have to re-import from the backend (they don't have access to
+# the backend package anyway).
+PROCESSING_MODES: tuple[str, ...] = ("basic", "premium")
+
+
+@dataclass(frozen=True)
+class IngestSettings:
+    """Resolved per-upload knobs handed to ``DocumentsClient.upload``.
+
+    Use ``IngestSettings(...)`` directly to define benchmark defaults,
+    or ``IngestSettings.merge(defaults, opts)`` to apply CLI overrides
+    on top of those defaults.
+    """
+
+    use_vision_llm: bool = False
+    processing_mode: str = "basic"
+    should_summarize: bool = False
+
+    def to_dict(self) -> dict[str, Any]:
+        return {
+            "use_vision_llm": self.use_vision_llm,
+            "processing_mode": self.processing_mode,
+            "should_summarize": self.should_summarize,
+        }
+
+    @classmethod
+    def merge(cls, defaults: IngestSettings, opts: Mapping[str, Any]) -> IngestSettings:
+        """Apply CLI overrides on top of ``defaults``.
+
+        ``opts`` is the kwargs dict built by ``core.cli`` from the
+        argparse namespace (see ``_cmd_ingest`` / ``_cmd_run``). Keys
+        we look for: ``use_vision_llm`` (bool or None), ``processing_mode``
+        (str or None), ``should_summarize`` (bool or None). Anything
+        else is ignored so benchmarks can pass through their own opts.
+        """
+
+        return cls(
+            use_vision_llm=_coerce_bool(opts.get("use_vision_llm"), defaults.use_vision_llm),
+            processing_mode=_coerce_mode(opts.get("processing_mode"), defaults.processing_mode),
+            should_summarize=_coerce_bool(opts.get("should_summarize"), defaults.should_summarize),
+        )
+
+    def render_label(self) -> str:
+        """Human-readable single-line label for reports / log lines."""
+
+        return (
+            f"vision={'on' if self.use_vision_llm else 'off'}, "
+            f"mode={self.processing_mode}, "
+            f"summarize={'on' if self.should_summarize else 'off'}"
+        )
+
+
+def _coerce_bool(value: Any, default: bool) -> bool:
+    """Argparse with ``BooleanOptionalAction`` yields True/False/None.
+
+    ``None`` means the operator didn't pass the flag → fall back to
+    the benchmark default.
+    """
+
+    if value is None:
+        return default
+    if isinstance(value, bool):
+        return value
+    if isinstance(value, str):
+        return value.strip().lower() in {"1", "true", "yes", "on"}
+    return bool(value)
+
+
+def _coerce_mode(value: Any, default: str) -> str:
+    if value is None or value == "":
+        return default
+    val = str(value).strip().lower()
+    if val not in PROCESSING_MODES:
+        raise ValueError(
+            f"Invalid processing_mode {val!r}; must be one of {PROCESSING_MODES}"
+        )
+    return val
+
+
+# ---------------------------------------------------------------------------
+# Argparse helper
+# ---------------------------------------------------------------------------
+
+
+def _add_bool_pair(
+    parser: argparse.ArgumentParser,
+    *,
+    dest: str,
+    on_flag: str,
+    off_flag: str,
+    on_help: str,
+    off_help: str,
+) -> None:
+    """Add a mutually exclusive ``--foo`` / ``--no-foo`` pair.
+
+    We don't use ``argparse.BooleanOptionalAction`` because it would
+    auto-generate ``--no-use-vision-llm`` rather than the friendlier
+    ``--no-vision-llm`` that operators reach for. Default is ``None``
+    so ``IngestSettings.merge`` can distinguish "silent" from
+    "explicit false".
+    """
+
+    group = parser.add_mutually_exclusive_group()
+    group.add_argument(
+        on_flag,
+        dest=dest,
+        action="store_true",
+        default=None,
+        help=on_help,
+    )
+    group.add_argument(
+        off_flag,
+        dest=dest,
+        action="store_false",
+        default=None,
+        help=off_help,
+    )
+
+
+def add_ingest_settings_args(
+    parser: argparse.ArgumentParser,
+    *,
+    defaults: IngestSettings,
+) -> None:
+    """Attach the three ingest-settings flag pairs to ``parser``.
+
+    Each bool exposes a mutually exclusive ``--foo`` / ``--no-foo``
+    pair so an operator can flip either direction without restating
+    every flag. Default is ``None`` so that "operator didn't pass the
+    flag" is distinguishable from "operator explicitly passed false"
+    — ``IngestSettings.merge`` then folds in the benchmark default
+    only when the operator was silent.
+    """
+
+    settings_group = parser.add_argument_group(
+        "ingest settings",
+        f"Per-upload knobs (forwarded to /documents/fileupload). "
+        f"Defaults for this benchmark: {defaults.render_label()}.",
+    )
+    _add_bool_pair(
+        settings_group,
+        dest="use_vision_llm",
+        on_flag="--use-vision-llm",
+        off_flag="--no-vision-llm",
+        on_help=(
+            "Run vision LLM during ingest to extract image content "
+            f"(default for this benchmark: "
+            f"{'on' if defaults.use_vision_llm else 'off'})."
+        ),
+        off_help="Skip vision LLM during ingest (text-only ETL).",
+    )
+    settings_group.add_argument(
+        "--processing-mode",
+        dest="processing_mode",
+        choices=PROCESSING_MODES,
+        default=None,
+        help=(
+            "SurfSense ETL processing mode (premium uses a 10x page "
+            f"multiplier and typically routes to a stronger ETL). "
+            f"Default for this benchmark: {defaults.processing_mode!r}."
+        ),
+    )
+    _add_bool_pair(
+        settings_group,
+        dest="should_summarize",
+        on_flag="--should-summarize",
+        off_flag="--no-summarize",
+        on_help=(
+            "Have SurfSense generate a document summary at ingest "
+            f"(default for this benchmark: "
+            f"{'on' if defaults.should_summarize else 'off'})."
+        ),
+        off_help="Skip per-document summary generation.",
+    )
+
+
+# ---------------------------------------------------------------------------
+# Doc-map manifest helpers
+# ---------------------------------------------------------------------------
+#
+# Every benchmark writes a doc-map JSONL under ``data/<suite>/maps/`` that
+# pairs source identifiers (case_id, snippet_id, doc_path, …) to the
+# SurfSense document_ids returned by the upload. To make the report
+# self-describing we also write a header line:
+#
+#     {"__settings__": {"use_vision_llm": ..., "processing_mode": ..., ...}}
+#
+# These two helpers centralise that protocol so each benchmark only has to
+# call ``write_settings_header`` and ``read_settings_header``.
+
+SETTINGS_HEADER_KEY = "__settings__"
+
+
+def settings_header_line(settings: IngestSettings) -> str:
+    """Return the JSON-serialised header line (no trailing newline)."""
+
+    return json.dumps({SETTINGS_HEADER_KEY: settings.to_dict()})
+
+
+def is_settings_header(row: Mapping[str, Any]) -> bool:
+    return SETTINGS_HEADER_KEY in row
+
+
+def read_settings_header(map_path: Path) -> dict[str, Any]:
+    """Read the ``__settings__`` header out of a doc-map JSONL.
+
+    Returns ``{}`` on a missing file, an empty file, an unreadable
+    file, or a file whose first non-blank line is not a settings
+    header (e.g. a corpus ingested before this feature existed).
+    Callers use this purely to surface settings in the report; it
+    must never fail the run.
+    """
+
+    if not map_path.exists():
+        return {}
+    try:
+        with map_path.open("r", encoding="utf-8") as fh:
+            for line in fh:
+                line = line.strip()
+                if not line:
+                    continue
+                row = json.loads(line)
+                if isinstance(row, dict) and SETTINGS_HEADER_KEY in row:
+                    return dict(row[SETTINGS_HEADER_KEY])
+                return {}
+    except (OSError, json.JSONDecodeError):
+        return {}
+    return {}
+
+
+def format_ingest_settings_md(settings: Any) -> str:
+    """Render the resolved settings as a single Markdown bullet line."""
+
+    if not isinstance(settings, Mapping) or not settings:
+        return "- SurfSense ingest settings: (not recorded — re-ingest to capture)"
+    vision = "on" if settings.get("use_vision_llm") else "off"
+    mode = settings.get("processing_mode") or "basic"
+    summarize = "on" if settings.get("should_summarize") else "off"
+    return (
+        f"- SurfSense ingest settings: vision_llm=`{vision}`, "
+        f"processing_mode=`{mode}`, summarize=`{summarize}`"
+    )
+
+
+__all__ = [
+    "PROCESSING_MODES",
+    "SETTINGS_HEADER_KEY",
+    "IngestSettings",
+    "add_ingest_settings_args",
+    "format_ingest_settings_md",
+    "is_settings_header",
+    "read_settings_header",
+    "settings_header_line",
+]
diff --git a/surfsense_evals/src/surfsense_evals/core/metrics/__init__.py b/surfsense_evals/src/surfsense_evals/core/metrics/__init__.py
new file mode 100644
index 000000000..bd0e6aafb
--- /dev/null
+++ b/surfsense_evals/src/surfsense_evals/core/metrics/__init__.py
@@ -0,0 +1,50 @@
+"""Pure-function metric primitives. Lazy imports."""
+
+from __future__ import annotations
+
+from typing import TYPE_CHECKING
+
+if TYPE_CHECKING:  # pragma: no cover
+    from .comparison import McnemarResult, bootstrap_delta_ci, mcnemar_test, paired_aggregate
+    from .mc_accuracy import AccuracyResult, accuracy_with_wilson_ci, wilson_ci
+    from .retrieval import RetrievalScores, mrr, ndcg_at_k, recall_at_k, score_run
+
+__all__ = [
+    "AccuracyResult",
+    "McnemarResult",
+    "RetrievalScores",
+    "accuracy_with_wilson_ci",
+    "bootstrap_delta_ci",
+    "mcnemar_test",
+    "mrr",
+    "ndcg_at_k",
+    "paired_aggregate",
+    "recall_at_k",
+    "score_run",
+    "wilson_ci",
+]
+
+
+_MODULE_FOR = {
+    "AccuracyResult": "mc_accuracy",
+    "accuracy_with_wilson_ci": "mc_accuracy",
+    "wilson_ci": "mc_accuracy",
+    "RetrievalScores": "retrieval",
+    "mrr": "retrieval",
+    "ndcg_at_k": "retrieval",
+    "recall_at_k": "retrieval",
+    "score_run": "retrieval",
+    "McnemarResult": "comparison",
+    "bootstrap_delta_ci": "comparison",
+    "mcnemar_test": "comparison",
+    "paired_aggregate": "comparison",
+}
+
+
+def __getattr__(name: str):
+    if name in _MODULE_FOR:
+        from importlib import import_module
+
+        mod = import_module(f".{_MODULE_FOR[name]}", __name__)
+        return getattr(mod, name)
+    raise AttributeError(f"module 'surfsense_evals.core.metrics' has no attribute {name!r}")
diff --git a/surfsense_evals/src/surfsense_evals/core/metrics/comparison.py b/surfsense_evals/src/surfsense_evals/core/metrics/comparison.py
new file mode 100644
index 000000000..579576f4f
--- /dev/null
+++ b/surfsense_evals/src/surfsense_evals/core/metrics/comparison.py
@@ -0,0 +1,258 @@
+"""Paired comparison statistics for head-to-head benchmarks.
+
+In every head-to-head benchmark (currently MedXpertQA-MM and
+MMLongBench-Doc) each question is answered by both arms (Native PDF
+and SurfSense). That makes per-question outcomes paired, so
+``McNemar's test`` on the discordant pairs is the right significance
+test for "are the two arms different?". We also expose a bootstrap
+delta CI for visualising effect size.
+
+Aggregate cost / latency / token deltas are mean-based; the runner
+slices them by arm before passing them in.
+"""
+
+from __future__ import annotations
+
+import math
+import statistics
+from collections.abc import Sequence
+from dataclasses import dataclass
+
+import numpy as np
+
+
+@dataclass(frozen=True)
+class McnemarResult:
+    """Discordant pair counts + the test statistics."""
+
+    n_total: int
+    b: int  # native correct, surfsense wrong
+    c: int  # native wrong,   surfsense correct
+    statistic: float
+    p_value: float
+    method: str
+
+    def to_dict(self) -> dict[str, float | int | str]:
+        return {
+            "n_total": self.n_total,
+            "b_native_correct_only": self.b,
+            "c_surfsense_correct_only": self.c,
+            "statistic": self.statistic,
+            "p_value": self.p_value,
+            "method": self.method,
+        }
+
+
+def mcnemar_test(
+    arm_a_correct: Sequence[bool],
+    arm_b_correct: Sequence[bool],
+    *,
+    use_exact_below: int = 11,
+) -> McnemarResult:
+    """Paired McNemar's test on per-question correctness.
+
+    ``arm_a_correct`` is treated as the reference arm (typically the
+    "native" arm); ``arm_b_correct`` is the challenger (typically
+    "surfsense"). The test statistic only depends on discordant pairs.
+
+    Default switch-over (``b + c < 11``): for very small discordant
+    samples the exact binomial test is preferred; above that the
+    continuity-corrected chi-square is well-behaved (Edwards 1948).
+    Callers can raise ``use_exact_below`` if they prefer the more
+    conservative ``b + c < 25`` rule.
+
+    No external statistical package is required: scipy is a heavy dep
+    and we only need binomial CDFs / chi-square sf, both implementable
+    in stdlib + numpy without surprises.
+    """
+
+    if len(arm_a_correct) != len(arm_b_correct):
+        raise ValueError(
+            f"Length mismatch: arm_a={len(arm_a_correct)}, arm_b={len(arm_b_correct)}"
+        )
+    n = len(arm_a_correct)
+    b = sum(1 for a, c in zip(arm_a_correct, arm_b_correct) if a and not c)
+    c = sum(1 for a, cc in zip(arm_a_correct, arm_b_correct) if (not a) and cc)
+    discordant = b + c
+    if discordant == 0:
+        return McnemarResult(
+            n_total=n, b=b, c=c, statistic=0.0, p_value=1.0, method="degenerate"
+        )
+
+    if discordant < use_exact_below:
+        # Exact binomial: under H0 each discordant pair is a Bernoulli(0.5).
+        # p-value = 2 * P(X <= min(b,c) | n=discordant, p=0.5), capped at 1.
+        k = min(b, c)
+        cdf = sum(_binom_pmf(discordant, i) for i in range(k + 1))
+        p_value = min(1.0, 2.0 * cdf)
+        return McnemarResult(
+            n_total=n, b=b, c=c, statistic=float(k), p_value=p_value, method="exact"
+        )
+
+    # Chi-square with continuity correction (McNemar-Edwards).
+    chi = ((abs(b - c) - 1) ** 2) / discordant
+    p_value = _chi2_sf(chi, df=1)
+    return McnemarResult(
+        n_total=n, b=b, c=c, statistic=chi, p_value=p_value, method="chi2_cc"
+    )
+
+
+def _binom_pmf(n: int, k: int) -> float:
+    return math.comb(n, k) * (0.5 ** n)
+
+
+def _chi2_sf(x: float, *, df: int) -> float:
+    """Survival function (1 - CDF) of chi-square; df=1 closed form."""
+
+    if x <= 0:
+        return 1.0
+    if df == 1:
+        # Chi^2(1) = N(0,1)^2; sf(x) = 2 * Phi_complement(sqrt(x))
+        return math.erfc(math.sqrt(x / 2.0))
+    # General fallback via regularized upper incomplete gamma.
+    a = df / 2.0
+    z = x / 2.0
+    return _gammaincc(a, z)
+
+
+def _gammaincc(a: float, x: float, *, max_iter: int = 200, tol: float = 1e-12) -> float:
+    """Regularised upper incomplete gamma Q(a, x). Series + continued fraction."""
+
+    if x < 0 or a <= 0:
+        return float("nan")
+    if x == 0:
+        return 1.0
+    if x < a + 1.0:
+        # Series for P(a, x); subtract from 1.
+        p_series = _gammainc_series(a, x, max_iter=max_iter, tol=tol)
+        return 1.0 - p_series
+    return _gammaincc_cf(a, x, max_iter=max_iter, tol=tol)
+
+
+def _gammainc_series(a: float, x: float, *, max_iter: int, tol: float) -> float:
+    term = 1.0 / a
+    summation = term
+    for n in range(1, max_iter):
+        term *= x / (a + n)
+        summation += term
+        if abs(term) < abs(summation) * tol:
+            break
+    log_pre = -x + a * math.log(x) - math.lgamma(a)
+    return summation * math.exp(log_pre)
+
+
+def _gammaincc_cf(a: float, x: float, *, max_iter: int, tol: float) -> float:
+    b = x + 1.0 - a
+    c_val = 1.0 / 1e-300
+    d = 1.0 / b
+    h = d
+    for i in range(1, max_iter):
+        an = -i * (i - a)
+        b += 2.0
+        d = an * d + b
+        if abs(d) < 1e-300:
+            d = 1e-300
+        c_val = b + an / c_val
+        if abs(c_val) < 1e-300:
+            c_val = 1e-300
+        d = 1.0 / d
+        delta = d * c_val
+        h *= delta
+        if abs(delta - 1.0) < tol:
+            break
+    log_pre = -x + a * math.log(x) - math.lgamma(a)
+    return h * math.exp(log_pre)
+
+
+# ---------------------------------------------------------------------------
+# Bootstrap delta CI
+# ---------------------------------------------------------------------------
+
+
+@dataclass(frozen=True)
+class BootstrapDelta:
+    delta: float
+    ci_low: float
+    ci_high: float
+    n_resamples: int
+
+    def to_dict(self) -> dict[str, float | int]:
+        return {
+            "delta": self.delta,
+            "ci_low": self.ci_low,
+            "ci_high": self.ci_high,
+            "n_resamples": self.n_resamples,
+        }
+
+
+def bootstrap_delta_ci(
+    arm_a_correct: Sequence[bool],
+    arm_b_correct: Sequence[bool],
+    *,
+    n_resamples: int = 5000,
+    level: float = 0.95,
+    random_state: int | None = 0,
+) -> BootstrapDelta:
+    """Paired-sample bootstrap CI for ``mean(arm_b) - mean(arm_a)``.
+
+    Resamples *paired indices* with replacement so the dependency
+    between arms is preserved.
+    """
+
+    if len(arm_a_correct) != len(arm_b_correct):
+        raise ValueError("paired arms must have the same length")
+    n = len(arm_a_correct)
+    if n == 0:
+        return BootstrapDelta(0.0, 0.0, 0.0, 0)
+    a = np.asarray(arm_a_correct, dtype=np.int8)
+    b = np.asarray(arm_b_correct, dtype=np.int8)
+    delta = float(b.mean() - a.mean())
+
+    rng = np.random.default_rng(random_state)
+    deltas = np.empty(n_resamples, dtype=np.float64)
+    for i in range(n_resamples):
+        idx = rng.integers(0, n, size=n)
+        deltas[i] = b[idx].mean() - a[idx].mean()
+    alpha = (1.0 - level) / 2.0
+    ci_low, ci_high = float(np.quantile(deltas, alpha)), float(np.quantile(deltas, 1 - alpha))
+    return BootstrapDelta(delta=delta, ci_low=ci_low, ci_high=ci_high, n_resamples=n_resamples)
+
+
+# ---------------------------------------------------------------------------
+# Simple aggregate helpers (cost / latency / tokens)
+# ---------------------------------------------------------------------------
+
+
+@dataclass(frozen=True)
+class Aggregate:
+    mean: float
+    median: float
+    p95: float
+    n: int
+
+    def to_dict(self) -> dict[str, float | int]:
+        return {"mean": self.mean, "median": self.median, "p95": self.p95, "n": self.n}
+
+
+def paired_aggregate(values: Sequence[float]) -> Aggregate:
+    """Mean / median / p95 of a list of numbers (e.g. cost-per-question)."""
+
+    if not values:
+        return Aggregate(0.0, 0.0, 0.0, 0)
+    arr = np.asarray(values, dtype=np.float64)
+    return Aggregate(
+        mean=float(arr.mean()),
+        median=float(statistics.median(values)),
+        p95=float(np.quantile(arr, 0.95)),
+        n=len(values),
+    )
+
+
+__all__ = [
+    "Aggregate",
+    "BootstrapDelta",
+    "McnemarResult",
+    "bootstrap_delta_ci",
+    "mcnemar_test",
+    "paired_aggregate",
+]
diff --git a/surfsense_evals/src/surfsense_evals/core/metrics/mc_accuracy.py b/surfsense_evals/src/surfsense_evals/core/metrics/mc_accuracy.py
new file mode 100644
index 000000000..8b0188ca4
--- /dev/null
+++ b/surfsense_evals/src/surfsense_evals/core/metrics/mc_accuracy.py
@@ -0,0 +1,130 @@
+"""Multiple-choice accuracy + Wilson 95% confidence intervals.
+
+Wilson CI is preferred over normal-approximation because MIRAGE's
+per-task subsets can be small (PubMedQA* and BioASQ-Y/N have a few
+hundred questions each) and Wilson handles n→0 / p→{0,1} edges
+gracefully.
+
+Reference for the closed form: Wilson (1927); identical to the
+``statsmodels.stats.proportion.proportion_confint(method='wilson')``
+output and what scikit-learn implements internally for its bounded
+estimators.
+"""
+
+from __future__ import annotations
+
+import math
+from collections.abc import Mapping, Sequence
+from dataclasses import dataclass
+
+
+@dataclass(frozen=True)
+class AccuracyResult:
+    """Per-task accuracy with Wilson CI."""
+
+    n_correct: int
+    n_total: int
+    accuracy: float
+    ci_low: float
+    ci_high: float
+
+    def to_dict(self) -> dict[str, float | int]:
+        return {
+            "n_correct": self.n_correct,
+            "n_total": self.n_total,
+            "accuracy": self.accuracy,
+            "ci_low": self.ci_low,
+            "ci_high": self.ci_high,
+        }
+
+
+# Two-sided Wilson z values. 1.959964 ≈ z_{0.975}.
+_Z_FOR_LEVEL: dict[float, float] = {
+    0.90: 1.6448536269514722,
+    0.95: 1.959963984540054,
+    0.99: 2.5758293035489004,
+}
+
+
+def wilson_ci(
+    n_correct: int, n_total: int, *, level: float = 0.95
+) -> tuple[float, float]:
+    """Two-sided Wilson score confidence interval for a proportion.
+
+    Returns ``(low, high)``. ``n_total == 0`` returns ``(0.0, 1.0)`` —
+    the maximally uncertain interval.
+    """
+
+    if n_total <= 0:
+        return 0.0, 1.0
+    if level not in _Z_FOR_LEVEL:
+        raise ValueError(f"Unsupported confidence level {level!r}")
+    z = _Z_FOR_LEVEL[level]
+    p = n_correct / n_total
+    n = n_total
+    denom = 1.0 + (z * z) / n
+    centre = (p + (z * z) / (2 * n)) / denom
+    half = (z / denom) * math.sqrt((p * (1 - p) / n) + (z * z) / (4 * n * n))
+    low = max(0.0, centre - half)
+    high = min(1.0, centre + half)
+    return low, high
+
+
+def accuracy_with_wilson_ci(
+    n_correct: int, n_total: int, *, level: float = 0.95
+) -> AccuracyResult:
+    if n_total < 0:
+        raise ValueError(f"n_total must be >= 0, got {n_total}")
+    if n_correct < 0 or n_correct > n_total:
+        raise ValueError(
+            f"n_correct must be in [0, n_total]; got n_correct={n_correct}, n_total={n_total}"
+        )
+    accuracy = (n_correct / n_total) if n_total > 0 else 0.0
+    low, high = wilson_ci(n_correct, n_total, level=level)
+    return AccuracyResult(
+        n_correct=n_correct,
+        n_total=n_total,
+        accuracy=accuracy,
+        ci_low=low,
+        ci_high=high,
+    )
+
+
+def per_task_accuracy(
+    rows: Sequence[Mapping[str, object]],
+    *,
+    task_key: str = "task",
+    correct_key: str = "is_correct",
+    level: float = 0.95,
+) -> dict[str, AccuracyResult]:
+    """Group ``rows`` by ``task_key`` and compute per-task ``AccuracyResult``.
+
+    ``rows[i][correct_key]`` must be truthy iff the answer was correct.
+    """
+
+    counts: dict[str, list[int]] = {}
+    for row in rows:
+        task = str(row.get(task_key, ""))
+        bucket = counts.setdefault(task, [0, 0])
+        bucket[1] += 1
+        if row.get(correct_key):
+            bucket[0] += 1
+    return {
+        task: accuracy_with_wilson_ci(c[0], c[1], level=level)
+        for task, c in counts.items()
+    }
+
+
+def macro_accuracy(per_task: Mapping[str, AccuracyResult]) -> float:
+    if not per_task:
+        return 0.0
+    return sum(r.accuracy for r in per_task.values()) / len(per_task)
+
+
+__all__ = [
+    "AccuracyResult",
+    "accuracy_with_wilson_ci",
+    "macro_accuracy",
+    "per_task_accuracy",
+    "wilson_ci",
+]
diff --git a/surfsense_evals/src/surfsense_evals/core/metrics/retrieval.py b/surfsense_evals/src/surfsense_evals/core/metrics/retrieval.py
new file mode 100644
index 000000000..d4cfe10ae
--- /dev/null
+++ b/surfsense_evals/src/surfsense_evals/core/metrics/retrieval.py
@@ -0,0 +1,132 @@
+"""Retrieval metrics: Recall@k, MRR, nDCG@k.
+
+Used by CUREv1's runner to score the SurfSense arm against the
+benchmark's qrels. ``corpus_id`` is the canonical CUREv1 passage id
+(string); the runner maps SurfSense ``chunk_id`` → ``document_id`` →
+``corpus_id`` before calling these.
+
+Graded relevance (CUREv1 uses 0/1/2 grades) is honoured by ``ndcg_at_k``;
+``recall_at_k`` and ``mrr`` flatten anything > 0 to "relevant".
+"""
+
+from __future__ import annotations
+
+import math
+from collections.abc import Iterable, Mapping, Sequence
+from dataclasses import dataclass
+
+
+@dataclass(frozen=True)
+class RetrievalScores:
+    """Aggregated retrieval scores."""
+
+    recall_at_k: dict[int, float]
+    mrr: float
+    ndcg_at_10: float
+    n_queries: int
+
+    def to_dict(self) -> dict:
+        return {
+            "recall_at_k": dict(self.recall_at_k),
+            "mrr": self.mrr,
+            "ndcg_at_10": self.ndcg_at_10,
+            "n_queries": self.n_queries,
+        }
+
+
+def recall_at_k(retrieved: Sequence[str], relevant: Iterable[str], k: int) -> float:
+    """Fraction of ``relevant`` documents found in ``retrieved[:k]``."""
+
+    if not relevant:
+        return 0.0
+    relevant_set = set(relevant)
+    if not relevant_set:
+        return 0.0
+    top_k = list(retrieved)[:k]
+    hits = sum(1 for doc in top_k if doc in relevant_set)
+    return hits / len(relevant_set)
+
+
+def mrr(retrieved: Sequence[str], relevant: Iterable[str]) -> float:
+    """Reciprocal rank of the first relevant doc, 0 if none found."""
+
+    relevant_set = set(relevant)
+    for rank, doc in enumerate(retrieved, start=1):
+        if doc in relevant_set:
+            return 1.0 / rank
+    return 0.0
+
+
+def _dcg_at_k(grades: Sequence[float], k: int) -> float:
+    s = 0.0
+    for i, grade in enumerate(grades[:k], start=1):
+        # Standard log-base-2 discount; gain = 2^grade - 1 for graded relevance.
+        s += (2.0 ** grade - 1.0) / math.log2(i + 1)
+    return s
+
+
+def ndcg_at_k(
+    retrieved: Sequence[str],
+    qrels: Mapping[str, float],
+    k: int,
+) -> float:
+    """nDCG@k against graded ``qrels`` (``{doc_id: grade}``).
+
+    Unjudged documents in ``retrieved`` contribute zero gain. The
+    ideal ordering is ``qrels`` sorted by grade descending.
+    """
+
+    if not qrels:
+        return 0.0
+    grades = [float(qrels.get(doc, 0.0)) for doc in retrieved]
+    dcg = _dcg_at_k(grades, k)
+    ideal = sorted(qrels.values(), reverse=True)
+    idcg = _dcg_at_k([float(g) for g in ideal], k)
+    if idcg == 0.0:
+        return 0.0
+    return dcg / idcg
+
+
+def score_run(
+    *,
+    per_query_retrieved: Mapping[str, Sequence[str]],
+    per_query_qrels: Mapping[str, Mapping[str, float]],
+    ks: Sequence[int] = (1, 5, 10, 32),
+    ndcg_k: int = 10,
+) -> RetrievalScores:
+    """Aggregate Recall@k, MRR, nDCG@k across a run.
+
+    ``per_query_retrieved`` maps ``query_id -> ordered list of doc ids``.
+    ``per_query_qrels`` maps ``query_id -> {doc_id: grade}`` (grade > 0
+    is relevant).
+
+    Queries present in retrieved but not in qrels are skipped. Queries
+    in qrels but missing from retrieved contribute zeros.
+    """
+
+    qids = set(per_query_qrels.keys()) & set(per_query_retrieved.keys())
+    if not qids:
+        return RetrievalScores(recall_at_k={k: 0.0 for k in ks}, mrr=0.0, ndcg_at_10=0.0, n_queries=0)
+
+    recall_totals = {k: 0.0 for k in ks}
+    mrr_total = 0.0
+    ndcg_total = 0.0
+    for qid in qids:
+        retrieved = list(per_query_retrieved[qid])
+        qrels = per_query_qrels[qid]
+        relevant_docs = [d for d, g in qrels.items() if g > 0]
+        for k in ks:
+            recall_totals[k] += recall_at_k(retrieved, relevant_docs, k)
+        mrr_total += mrr(retrieved, relevant_docs)
+        ndcg_total += ndcg_at_k(retrieved, qrels, ndcg_k)
+
+    n = len(qids)
+    return RetrievalScores(
+        recall_at_k={k: v / n for k, v in recall_totals.items()},
+        mrr=mrr_total / n,
+        ndcg_at_10=ndcg_total / n,
+        n_queries=n,
+    )
+
+
+__all__ = ["RetrievalScores", "mrr", "ndcg_at_k", "recall_at_k", "score_run"]
diff --git a/surfsense_evals/src/surfsense_evals/core/parse/__init__.py b/surfsense_evals/src/surfsense_evals/core/parse/__init__.py
new file mode 100644
index 000000000..208c2d374
--- /dev/null
+++ b/surfsense_evals/src/surfsense_evals/core/parse/__init__.py
@@ -0,0 +1,21 @@
+"""Parsers shared across suites: citations, MCQ envelopes, AI-SDK SSE."""
+
+from __future__ import annotations
+
+from .answer_letter import AnswerLetterResult, extract_answer_letter
+from .citations import CITATION_REGEX, CitationToken, ChunkCitation, UrlCitation, parse_citations
+from .freeform_answer import extract_freeform_answer
+from .sse import SseEvent, iter_sse_events
+
+__all__ = [
+    "CITATION_REGEX",
+    "CitationToken",
+    "ChunkCitation",
+    "UrlCitation",
+    "parse_citations",
+    "AnswerLetterResult",
+    "extract_answer_letter",
+    "extract_freeform_answer",
+    "SseEvent",
+    "iter_sse_events",
+]
diff --git a/surfsense_evals/src/surfsense_evals/core/parse/answer_letter.py b/surfsense_evals/src/surfsense_evals/core/parse/answer_letter.py
new file mode 100644
index 000000000..8cf23869b
--- /dev/null
+++ b/surfsense_evals/src/surfsense_evals/core/parse/answer_letter.py
@@ -0,0 +1,122 @@
+"""Robust extractor for MCQ answer letters.
+
+Handles three answer shapes seen in the wild:
+
+1. **MedRAG envelope** — ``{"step_by_step_thinking": "...", "answer_choice": "A"}``
+   embedded somewhere in the assistant message (often inside ```` ```json ```` /
+   ``` ``` ``` fences). The regex grabs the JSON object and reads the
+   ``answer_choice`` field.
+
+2. **Final-line letter** — e.g. ``Answer: B`` or ``The correct answer is (C).``.
+   Falls back to a permissive regex over the last few lines.
+
+3. **Bare letter** — single uppercase letter at the end of the message.
+
+The function returns the parsed letter (uppercased) plus a discriminator
+of which strategy fired so the runner / report can flag suspicious
+parses (typically zero-confidence parses indicate the model didn't
+follow the prompt).
+"""
+
+from __future__ import annotations
+
+import json
+import re
+from dataclasses import dataclass
+from typing import Literal
+
+ParserStrategy = Literal["json_envelope", "answer_line", "bare_letter", "none"]
+
+
+@dataclass(frozen=True)
+class AnswerLetterResult:
+    letter: str | None
+    strategy: ParserStrategy
+
+    @property
+    def found(self) -> bool:
+        return self.letter is not None
+
+
+# ---------------------------------------------------------------------------
+# Strategies
+# ---------------------------------------------------------------------------
+
+
+_JSON_BLOCK = re.compile(r"\{[^{}]*\"answer_choice\"\s*:\s*\"([A-Za-z])\"[^{}]*\}", re.DOTALL)
+_FENCED_JSON = re.compile(r"```(?:json)?\s*(\{.*?\})\s*```", re.DOTALL | re.IGNORECASE)
+_ANSWER_LINE = re.compile(
+    r"(?:final\s*answer|answer\s*choice|the\s+correct\s+answer\s+is|answer)\s*[:=\-]?\s*"
+    r"\(?\s*([A-Za-z])\s*[\)\.]*\s*$",
+    re.IGNORECASE | re.MULTILINE,
+)
+_BARE_LETTER = re.compile(r"^\s*\(?\s*([A-Za-z])\s*[\)\.]*\s*$", re.MULTILINE)
+
+
+def _from_json_envelope(text: str) -> str | None:
+    # Try fenced code blocks first (most likely to contain the JSON).
+    for fence in _FENCED_JSON.finditer(text):
+        try:
+            obj = json.loads(fence.group(1))
+        except (json.JSONDecodeError, ValueError):
+            continue
+        if isinstance(obj, dict):
+            choice = obj.get("answer_choice")
+            if isinstance(choice, str) and choice.strip():
+                return choice.strip()[:1].upper()
+
+    # Fall back to a tolerant regex over the whole text (handles
+    # responses that drop the fences).
+    match = _JSON_BLOCK.search(text)
+    if match:
+        return match.group(1).upper()
+    return None
+
+
+def _from_answer_line(text: str) -> str | None:
+    # Walk lines bottom-up; the answer is almost always near the end.
+    for match in reversed(list(_ANSWER_LINE.finditer(text))):
+        letter = match.group(1).upper()
+        if letter.isalpha():
+            return letter
+    return None
+
+
+def _from_bare_letter(text: str) -> str | None:
+    # Inspect only the final non-empty lines (avoid grabbing in-prose
+    # mentions of "A" or "I").
+    lines = [ln.strip() for ln in text.splitlines() if ln.strip()]
+    for ln in reversed(lines[-3:]):
+        match = _BARE_LETTER.match(ln)
+        if match:
+            return match.group(1).upper()
+    return None
+
+
+def extract_answer_letter(text: str) -> AnswerLetterResult:
+    """Run strategies in order and return the first hit.
+
+    Order: JSON envelope → final-answer-line regex → bare-letter
+    fallback. Empty / whitespace-only text returns
+    ``AnswerLetterResult(None, "none")``.
+    """
+
+    if not text or not text.strip():
+        return AnswerLetterResult(None, "none")
+
+    letter = _from_json_envelope(text)
+    if letter:
+        return AnswerLetterResult(letter, "json_envelope")
+
+    letter = _from_answer_line(text)
+    if letter:
+        return AnswerLetterResult(letter, "answer_line")
+
+    letter = _from_bare_letter(text)
+    if letter:
+        return AnswerLetterResult(letter, "bare_letter")
+
+    return AnswerLetterResult(None, "none")
+
+
+__all__ = ["AnswerLetterResult", "ParserStrategy", "extract_answer_letter"]
diff --git a/surfsense_evals/src/surfsense_evals/core/parse/citations.py b/surfsense_evals/src/surfsense_evals/core/parse/citations.py
new file mode 100644
index 000000000..1fcd35434
--- /dev/null
+++ b/surfsense_evals/src/surfsense_evals/core/parse/citations.py
@@ -0,0 +1,110 @@
+"""Python port of the canonical citation parser.
+
+Source of truth: ``surfsense_web/lib/citations/citation-parser.ts:20-21``.
+The pattern is byte-for-byte identical to the TS export ``CITATION_REGEX``
+so a SurfSense user reading the web client and a CUREv1 retrieval scorer
+running here see the same chunk_ids extracted from the same answer.
+
+The TS reference also handles a ``urlcite{N}`` placeholder produced by
+``preprocessCitationMarkdown`` — that pre-processing step is web-only
+(GFM autolink workaround), so the harness sees raw ``[citation:URL]``
+tokens and ``parse_citations`` returns them as ``UrlCitation`` directly.
+"""
+
+from __future__ import annotations
+
+import re
+from dataclasses import dataclass
+from typing import Any, Union
+
+# Pattern preserves the TS source verbatim:
+#   /[\[【]\u200B?citation:\s*(https?:\/\/[^\]】\u200B]+|urlcite\d+|(?:doc-)?-?\d+(?:\s*,\s*(?:doc-)?-?\d+)*)\s*\u200B?[\]】]/g
+#
+# Notes:
+# * Matches both ASCII ``[]`` and Chinese fullwidth ``【】`` brackets.
+# * Allows an optional ZWSP (``\u200B``) just inside each bracket.
+# * ``citation:`` then EITHER a URL (anything not ``]``, ``】``, or ZWSP),
+#   OR a ``urlcite\d+`` placeholder, OR one or more comma-separated
+#   chunk ids (each optionally prefixed with ``doc-`` and optionally
+#   negative).
+# * URL char class deliberately excludes the closing brackets so a
+#   ``[citation:https://x.com]`` doesn't swallow the ``]``.
+# The ZWSP must be the actual code-point — the original TS source uses
+# the regex literal ``\u200B`` which the JS engine interprets as the
+# character. Python's ``re`` doesn't process the ``\u`` escape inside
+# the pattern source, so we splice the literal character in via an
+# f-string. This keeps our pattern functionally identical to the TS
+# reference and lets ``"\u200B" in CITATION_REGEX.pattern`` succeed.
+_ZWSP = "\u200B"
+CITATION_REGEX = re.compile(
+    rf"[\[【]{_ZWSP}?citation:\s*("
+    rf"https?://[^\]】{_ZWSP}]+|urlcite\d+|(?:doc-)?-?\d+(?:\s*,\s*(?:doc-)?-?\d+)*"
+    rf")\s*{_ZWSP}?[\]】]"
+)
+
+
+@dataclass(frozen=True)
+class ChunkCitation:
+    chunk_id: int
+    is_docs_chunk: bool
+
+    def to_dict(self) -> dict[str, Any]:
+        return {
+            "kind": "chunk",
+            "chunk_id": self.chunk_id,
+            "is_docs_chunk": self.is_docs_chunk,
+        }
+
+
+@dataclass(frozen=True)
+class UrlCitation:
+    url: str
+
+    def to_dict(self) -> dict[str, Any]:
+        return {"kind": "url", "url": self.url}
+
+
+CitationToken = Union[ChunkCitation, UrlCitation]
+
+
+def parse_citations(text: str, *, url_map: dict[str, str] | None = None) -> list[CitationToken]:
+    """Return the citation tokens found in ``text`` in document order.
+
+    ``url_map`` is the optional ``urlciteN -> URL`` lookup that the web
+    client builds in its preprocessing step. The harness ordinarily
+    doesn't preprocess (we don't render the markdown, we score it), so
+    the default empty map means ``urlciteN`` placeholders are dropped
+    rather than mis-resolved to a missing URL.
+
+    Multi-id payloads like ``[citation:1, doc-2, -3]`` are flattened
+    into separate ``ChunkCitation`` entries — same as the TS reference.
+    """
+
+    out: list[CitationToken] = []
+    for match in CITATION_REGEX.finditer(text):
+        captured = match.group(1)
+        if captured.startswith("http://") or captured.startswith("https://"):
+            out.append(UrlCitation(url=captured.strip()))
+            continue
+        if captured.startswith("urlcite"):
+            if url_map and captured in url_map:
+                out.append(UrlCitation(url=url_map[captured]))
+            continue
+        for raw_id in (s.strip() for s in captured.split(",")):
+            is_docs_chunk = raw_id.startswith("doc-")
+            number_part = raw_id[4:] if is_docs_chunk else raw_id
+            try:
+                chunk_id = int(number_part)
+            except ValueError:
+                continue
+            out.append(ChunkCitation(chunk_id=chunk_id, is_docs_chunk=is_docs_chunk))
+    return out
+
+
+__all__ = [
+    "CITATION_REGEX",
+    "ChunkCitation",
+    "UrlCitation",
+    "CitationToken",
+    "parse_citations",
+]
diff --git a/surfsense_evals/src/surfsense_evals/core/parse/freeform_answer.py b/surfsense_evals/src/surfsense_evals/core/parse/freeform_answer.py
new file mode 100644
index 000000000..959b045a5
--- /dev/null
+++ b/surfsense_evals/src/surfsense_evals/core/parse/freeform_answer.py
@@ -0,0 +1,85 @@
+"""Extract free-form answers from open-ended LLM responses.
+
+Used by benchmarks that don't have a fixed letter set (MMLongBench-Doc,
+DocVQA-style benchmarks, future legal/finance suites). The contract:
+
+* Strip leading "Answer:" / "Final answer:" markers if present.
+* Drop fenced code blocks if the model wrapped its answer in one.
+* Trim leading/trailing whitespace.
+* Return the *last* meaningful chunk — models often think out loud
+  before stating the answer.
+
+If the message is empty or only contains a fence, return ``""``.
+"""
+
+from __future__ import annotations
+
+import re
+
+_ANSWER_PREFIX = re.compile(
+    r"^\s*(?:final\s*answer|the\s+answer\s+is|answer)\s*[:=\-]\s*",
+    re.IGNORECASE,
+)
+# Marker-only regex (no capture group) used to find every "Answer:"
+# token position. We then slice from the LAST marker's end to the
+# next newline ourselves — robust to multiple inline answers because
+# we never let the engine greedy-capture across markers.
+_ANSWER_MARKER = re.compile(
+    r"(?:final\s*answer|the\s+answer\s+is|answer)\s*[:=\-]\s*",
+    re.IGNORECASE,
+)
+_FENCED_BLOCK = re.compile(r"```[a-zA-Z0-9]*\s*([\s\S]*?)\s*```")
+
+
+def extract_freeform_answer(text: str) -> str:
+    """Pull the model's final answer out of a possibly-verbose response."""
+
+    if not text or not text.strip():
+        return ""
+
+    # 1. Find the last line that starts with an Answer: marker. If
+    #    nothing matches, walk back to the last non-empty line.
+    lines = [ln.rstrip() for ln in text.strip().splitlines()]
+    candidate = ""
+    for ln in reversed(lines):
+        if not ln.strip():
+            continue
+        if _ANSWER_PREFIX.search(ln):
+            candidate = _ANSWER_PREFIX.sub("", ln, count=1).strip()
+            break
+
+    if not candidate:
+        # 2. Inline match: find every "Answer:" marker position and
+        # slice from the LAST marker's end to the next newline. Robust
+        # to "preamble.Answer: 42" one-liners and multiple inline
+        # markers (we always pick the final, freshest one).
+        marker_matches = list(_ANSWER_MARKER.finditer(text))
+        if marker_matches:
+            last = marker_matches[-1]
+            tail = text[last.end():]
+            nl = tail.find("\n")
+            if nl >= 0:
+                tail = tail[:nl]
+            candidate = tail.strip()
+
+    if not candidate:
+        # 3. No "Answer:" marker — try fenced blocks.
+        fences = _FENCED_BLOCK.findall(text)
+        if fences:
+            candidate = fences[-1].strip()
+        else:
+            # Last non-empty line as a fallback.
+            for ln in reversed(lines):
+                if ln.strip():
+                    candidate = ln.strip()
+                    break
+
+    # 2. Strip wrapping quotes / parens / trailing punctuation that
+    #    confuse the grader without changing meaning.
+    candidate = candidate.strip().strip("`").strip()
+    if candidate.startswith(("\"", "'")) and candidate.endswith(("\"", "'")):
+        candidate = candidate[1:-1].strip()
+    return candidate
+
+
+__all__ = ["extract_freeform_answer"]
diff --git a/surfsense_evals/src/surfsense_evals/core/parse/sse.py b/surfsense_evals/src/surfsense_evals/core/parse/sse.py
new file mode 100644
index 000000000..76ded2d13
--- /dev/null
+++ b/surfsense_evals/src/surfsense_evals/core/parse/sse.py
@@ -0,0 +1,72 @@
+"""Minimal SSE consumer compatible with SurfSense's wire format.
+
+SurfSense uses ``app/services/streaming/envelope/sse.py`` to frame events:
+
+* ``data: <single-line-string>\\n\\n``
+* ``data: <json-string>\\n\\n``  (most events)
+* ``data: [DONE]\\n\\n``  (terminator)
+
+There is no ``event:``, ``id:``, or ``retry:`` framing in production —
+``format_sse(payload)`` only emits the ``data:`` line. This implementation
+is therefore intentionally smaller than ``httpx-sse`` (which we still
+list as a dep so callers who want richer parsing can opt in): one event
+per ``data:`` line, separated by blank lines.
+
+We accept any line iterator (an ``httpx.Response.aiter_lines`` adapter
+in production, a list in tests) so this is unit-testable without a
+network mock.
+"""
+
+from __future__ import annotations
+
+from collections.abc import AsyncIterator
+from dataclasses import dataclass
+
+
+@dataclass(frozen=True)
+class SseEvent:
+    """A parsed SSE event. Only the ``data`` field is populated.
+
+    Multi-line payloads (``data: a\\ndata: b``) are joined with ``\\n``
+    per the SSE spec, even though SurfSense doesn't currently emit them.
+    """
+
+    data: str
+
+
+async def iter_sse_events(lines: AsyncIterator[str]) -> AsyncIterator[SseEvent]:
+    """Yield one ``SseEvent`` per blank-line-terminated frame.
+
+    Lines that are empty or whitespace flush the buffer. ``data:`` lines
+    are accumulated into the buffer; everything else is ignored
+    (matches the lenient browser EventSource behaviour).
+    """
+
+    buffer: list[str] = []
+    async for raw in lines:
+        if raw is None:
+            continue
+        line = raw.rstrip("\r")
+        if line == "":
+            if buffer:
+                yield SseEvent(data="\n".join(buffer))
+                buffer.clear()
+            continue
+        if line.startswith(":"):
+            # comment / heartbeat
+            continue
+        if line.startswith("data:"):
+            # spec: optional single space after the colon.
+            payload = line[5:]
+            if payload.startswith(" "):
+                payload = payload[1:]
+            buffer.append(payload)
+            continue
+        # Any other field (event:, id:, retry:) is currently unused.
+        continue
+
+    if buffer:
+        yield SseEvent(data="\n".join(buffer))
+
+
+__all__ = ["SseEvent", "iter_sse_events"]
diff --git a/surfsense_evals/src/surfsense_evals/core/pdf/__init__.py b/surfsense_evals/src/surfsense_evals/core/pdf/__init__.py
new file mode 100644
index 000000000..e03fa34c9
--- /dev/null
+++ b/surfsense_evals/src/surfsense_evals/core/pdf/__init__.py
@@ -0,0 +1,31 @@
+"""Domain-agnostic PDF rendering helper. Lazy import."""
+
+from __future__ import annotations
+
+from typing import TYPE_CHECKING
+
+if TYPE_CHECKING:  # pragma: no cover
+    from .render import (
+        PdfImage,
+        render_pdf,
+        render_pdf_with_images,
+        render_text_files_to_pdf,
+    )
+
+__all__ = [
+    "PdfImage",
+    "render_pdf",
+    "render_pdf_with_images",
+    "render_text_files_to_pdf",
+]
+
+
+_LAZY = {"PdfImage", "render_pdf", "render_pdf_with_images", "render_text_files_to_pdf"}
+
+
+def __getattr__(name: str):
+    if name in _LAZY:
+        from . import render as _mod
+
+        return getattr(_mod, name)
+    raise AttributeError(f"module 'surfsense_evals.core.pdf' has no attribute {name!r}")
diff --git a/surfsense_evals/src/surfsense_evals/core/pdf/render.py b/surfsense_evals/src/surfsense_evals/core/pdf/render.py
new file mode 100644
index 000000000..624136d7c
--- /dev/null
+++ b/surfsense_evals/src/surfsense_evals/core/pdf/render.py
@@ -0,0 +1,351 @@
+"""Deterministic ``.txt`` / ``.md`` → single PDF via reportlab.
+
+Used wherever a benchmark needs the same source bytes fed to both the
+native-PDF arm and the SurfSense ingestion arm. The head-to-head
+comparison is fair only if the *same* PDF is the input to both arms,
+which is why we go to lengths to make the rendering deterministic.
+
+Determinism notes:
+
+* We pin the PDF metadata to a fixed creation date and producer
+  (``reportlab`` accepts neither directly, but ``Canvas.setAuthor`` and
+  the absence of an ``info`` mutator means the bytes only differ by
+  ``CreationDate`` / ``ModDate``). We post-process the PDF to scrub
+  those if ``deterministic=True`` is passed.
+* Page size, font, margins, and tab handling are fixed in code so the
+  same input yields the same byte output across machines.
+* PDF/A is overkill for our use; basic PDF 1.4 is what every model
+  expects.
+"""
+
+from __future__ import annotations
+
+import io
+import re
+from collections.abc import Iterable, Sequence
+from dataclasses import dataclass
+from datetime import UTC, datetime
+from pathlib import Path
+
+from reportlab.lib.pagesizes import LETTER
+from reportlab.lib.styles import ParagraphStyle, getSampleStyleSheet
+from reportlab.lib.units import inch
+from reportlab.lib.utils import ImageReader
+from reportlab.platypus import (
+    Image,
+    KeepTogether,
+    PageBreak,
+    Paragraph,
+    SimpleDocTemplate,
+    Spacer,
+)
+
+
+@dataclass
+class RenderedPdf:
+    path: Path
+    n_pages_estimate: int
+    n_chars: int
+
+
+_PDF_DATE_KEY = re.compile(rb"/(?:CreationDate|ModDate)\s*\(D:[^)]*\)")
+# reportlab also writes a `/ID [<hex1><hex2>]` trailer entry that
+# embeds a per-run hash. Scrub it so two renders of the same input
+# produce the same bytes.
+_PDF_ID_ARRAY = re.compile(rb"/ID\s*\[\s*<[^>]*>\s*<[^>]*>\s*\]")
+
+
+def _scrub_dates(pdf_bytes: bytes) -> bytes:
+    """Remove ``CreationDate`` / ``ModDate`` / trailer ``/ID`` so the
+    file is byte-deterministic across runs."""
+
+    pdf_bytes = _PDF_DATE_KEY.sub(b"/CreationDate (D:19700101000000Z)", pdf_bytes)
+    pdf_bytes = _PDF_ID_ARRAY.sub(b"/ID [<00><00>]", pdf_bytes)
+    return pdf_bytes
+
+
+_DEFAULT_STYLES = getSampleStyleSheet()
+
+
+def _build_body_style() -> ParagraphStyle:
+    base = _DEFAULT_STYLES["BodyText"]
+    style = ParagraphStyle(
+        "EvalBody",
+        parent=base,
+        fontName="Helvetica",
+        fontSize=10.5,
+        leading=14,
+        spaceAfter=6,
+        spaceBefore=0,
+    )
+    return style
+
+
+def _build_heading_style() -> ParagraphStyle:
+    base = _DEFAULT_STYLES["Heading2"]
+    style = ParagraphStyle(
+        "EvalHeading",
+        parent=base,
+        fontName="Helvetica-Bold",
+        fontSize=14,
+        leading=18,
+        spaceAfter=10,
+        spaceBefore=8,
+    )
+    return style
+
+
+def _normalise_paragraphs(text: str) -> list[str]:
+    """Split a text blob into paragraphs while preserving blank-line structure."""
+
+    blocks: list[list[str]] = [[]]
+    for line in text.splitlines():
+        stripped = line.rstrip()
+        if stripped == "":
+            if blocks[-1]:
+                blocks.append([])
+            continue
+        blocks[-1].append(stripped)
+    paragraphs: list[str] = []
+    for block in blocks:
+        if not block:
+            continue
+        # Join lines within a paragraph with spaces (text-from-PDF style).
+        paragraphs.append(" ".join(block))
+    return paragraphs
+
+
+def _escape_html(text: str) -> str:
+    return (
+        text.replace("&", "&amp;")
+        .replace("<", "&lt;")
+        .replace(">", "&gt;")
+    )
+
+
+def render_pdf(
+    *,
+    title: str,
+    sections: Sequence[tuple[str | None, str]],
+    output_path: Path,
+    deterministic: bool = True,
+) -> RenderedPdf:
+    """Render one PDF from a list of ``(section_heading, section_text)`` tuples.
+
+    ``section_heading`` may be ``None`` for an unnamed section. Each
+    section is followed by a page break so the model's PDF parser sees
+    a clean structural boundary between source files.
+    """
+
+    output_path = Path(output_path)
+    output_path.parent.mkdir(parents=True, exist_ok=True)
+
+    buffer = io.BytesIO()
+    doc = SimpleDocTemplate(
+        buffer,
+        pagesize=LETTER,
+        leftMargin=0.75 * inch,
+        rightMargin=0.75 * inch,
+        topMargin=0.75 * inch,
+        bottomMargin=0.75 * inch,
+        title=title,
+        author="surfsense-evals",
+        subject="Eval input",
+        creator="surfsense-evals",
+    )
+
+    body_style = _build_body_style()
+    heading_style = _build_heading_style()
+    title_style = ParagraphStyle(
+        "EvalTitle",
+        parent=_DEFAULT_STYLES["Title"],
+        fontName="Helvetica-Bold",
+        fontSize=18,
+        leading=22,
+        spaceAfter=14,
+    )
+
+    flow: list = [Paragraph(_escape_html(title), title_style)]
+    total_chars = 0
+    for index, (heading, text) in enumerate(sections):
+        if index > 0:
+            flow.append(PageBreak())
+        if heading:
+            flow.append(Paragraph(_escape_html(heading), heading_style))
+        for paragraph in _normalise_paragraphs(text):
+            total_chars += len(paragraph)
+            flow.append(Paragraph(_escape_html(paragraph), body_style))
+            flow.append(Spacer(1, 4))
+
+    doc.build(flow)
+    pdf_bytes = buffer.getvalue()
+    if deterministic:
+        pdf_bytes = _scrub_dates(pdf_bytes)
+    output_path.write_bytes(pdf_bytes)
+
+    # Conservative page estimate: ~3000 chars per LETTER page at 10.5pt.
+    n_pages = max(1, total_chars // 3000 + len(sections))
+    return RenderedPdf(path=output_path, n_pages_estimate=n_pages, n_chars=total_chars)
+
+
+@dataclass
+class PdfImage:
+    """One image to embed inside a section.
+
+    ``caption`` is rendered below the image (italic). ``max_width_in``
+    caps the rendered width in inches; height auto-scales to preserve
+    aspect ratio (read with PIL).
+    """
+
+    path: Path
+    caption: str = ""
+    max_width_in: float = 5.5  # default leaves margin for LETTER 8.5"
+
+
+def _make_image_flowable(image: PdfImage) -> Image:
+    """Build a reportlab Image flowable scaled to fit page width."""
+
+    reader = ImageReader(str(image.path))
+    iw, ih = reader.getSize()
+    if iw <= 0 or ih <= 0:
+        raise ValueError(f"Invalid image dimensions for {image.path}: {iw}x{ih}")
+    target_w = image.max_width_in * inch
+    target_h = target_w * (ih / iw)
+    # Cap height too — some medical images are extreme portrait.
+    max_h = 7.0 * inch
+    if target_h > max_h:
+        target_h = max_h
+        target_w = target_h * (iw / ih)
+    return Image(str(image.path), width=target_w, height=target_h)
+
+
+def render_pdf_with_images(
+    *,
+    title: str,
+    sections: Sequence[tuple[str | None, str, Sequence[PdfImage] | None]],
+    output_path: Path,
+    deterministic: bool = True,
+    page_break_between_sections: bool = False,
+) -> RenderedPdf:
+    """Render a PDF that mixes text and embedded images.
+
+    Each section is ``(heading, body_text, images)``. Images render
+    inline after the body text, each followed by an italic caption.
+    Set ``page_break_between_sections=True`` if you want explicit
+    structural boundaries (mostly useful for multi-case PDFs); the
+    default keeps everything on one page when possible (so a single
+    MedXpertQA case is one PDF page with case + images + options).
+    """
+
+    output_path = Path(output_path)
+    output_path.parent.mkdir(parents=True, exist_ok=True)
+
+    buffer = io.BytesIO()
+    doc = SimpleDocTemplate(
+        buffer,
+        pagesize=LETTER,
+        leftMargin=0.75 * inch,
+        rightMargin=0.75 * inch,
+        topMargin=0.75 * inch,
+        bottomMargin=0.75 * inch,
+        title=title,
+        author="surfsense-evals",
+        subject="Eval input",
+        creator="surfsense-evals",
+    )
+
+    body_style = _build_body_style()
+    heading_style = _build_heading_style()
+    caption_style = ParagraphStyle(
+        "EvalCaption",
+        parent=body_style,
+        fontSize=9,
+        leading=11,
+        textColor="#444",
+        spaceBefore=2,
+        spaceAfter=10,
+    )
+    title_style = ParagraphStyle(
+        "EvalTitle",
+        parent=_DEFAULT_STYLES["Title"],
+        fontName="Helvetica-Bold",
+        fontSize=18,
+        leading=22,
+        spaceAfter=14,
+    )
+
+    flow: list = [Paragraph(_escape_html(title), title_style)]
+    total_chars = 0
+    for index, (heading, text, images) in enumerate(sections):
+        if index > 0 and page_break_between_sections:
+            flow.append(PageBreak())
+        if heading:
+            flow.append(Paragraph(_escape_html(heading), heading_style))
+        for paragraph in _normalise_paragraphs(text):
+            total_chars += len(paragraph)
+            flow.append(Paragraph(_escape_html(paragraph), body_style))
+            flow.append(Spacer(1, 4))
+        for image in images or []:
+            try:
+                img_flow = _make_image_flowable(image)
+            except Exception:  # noqa: BLE001 — bad image shouldn't kill PDF
+                continue
+            grouped = [img_flow]
+            if image.caption:
+                grouped.append(Paragraph(_escape_html(image.caption), caption_style))
+            else:
+                grouped.append(Spacer(1, 8))
+            flow.append(KeepTogether(grouped))
+
+    doc.build(flow)
+    pdf_bytes = buffer.getvalue()
+    if deterministic:
+        pdf_bytes = _scrub_dates(pdf_bytes)
+    output_path.write_bytes(pdf_bytes)
+
+    n_pages = max(1, total_chars // 3000 + len(sections))
+    return RenderedPdf(path=output_path, n_pages_estimate=n_pages, n_chars=total_chars)
+
+
+def render_text_files_to_pdf(
+    *,
+    title: str,
+    files: Iterable[Path],
+    output_path: Path,
+    deterministic: bool = True,
+) -> RenderedPdf:
+    """Convenience wrapper: read a list of text files, render to one PDF.
+
+    The heading of each section is the file's name (no extension), so
+    e.g. ``admission_note.txt`` becomes a section header ``admission_note``
+    in the rendered PDF. Useful for any text-only benchmark that ships
+    a corpus as separate ``.txt`` / ``.md`` shards per logical document.
+    """
+
+    sections: list[tuple[str | None, str]] = []
+    for path in files:
+        path = Path(path)
+        text = path.read_text(encoding="utf-8")
+        sections.append((path.stem, text))
+    return render_pdf(
+        title=title,
+        sections=sections,
+        output_path=output_path,
+        deterministic=deterministic,
+    )
+
+
+# Tiny self-check — handy when debugging.
+def _self_test() -> None:  # pragma: no cover
+    out = Path("./_render_self_test.pdf")
+    sections = [
+        ("intro", "Hello world.\n\nThis is a test."),
+        ("body", "Line one.\nLine two."),
+    ]
+    rendered = render_pdf(title="Self test", sections=sections, output_path=out)
+    print(f"wrote {rendered.path} ({rendered.n_chars} chars)")
+
+
+# Importing ``datetime`` keeps the timezone helper handy if a future
+# benchmark wants to embed a real timestamp without losing determinism.
+_NOW_FROZEN = datetime(2026, 5, 11, tzinfo=UTC)
diff --git a/surfsense_evals/src/surfsense_evals/core/providers/__init__.py b/surfsense_evals/src/surfsense_evals/core/providers/__init__.py
new file mode 100644
index 000000000..fa82bcbf2
--- /dev/null
+++ b/surfsense_evals/src/surfsense_evals/core/providers/__init__.py
@@ -0,0 +1,22 @@
+"""External LLM providers (used by the native arm).
+
+Lazy imports so the SurfSense-only path doesn't transitively load the
+OpenRouter client until something actually constructs ``OpenRouterPdfProvider``.
+"""
+
+from __future__ import annotations
+
+from typing import TYPE_CHECKING
+
+if TYPE_CHECKING:  # pragma: no cover
+    from .openrouter_pdf import OpenRouterPdfProvider, OpenRouterResponse
+
+__all__ = ["OpenRouterPdfProvider", "OpenRouterResponse"]
+
+
+def __getattr__(name: str):
+    if name in {"OpenRouterPdfProvider", "OpenRouterResponse"}:
+        from . import openrouter_pdf as _mod
+
+        return getattr(_mod, name)
+    raise AttributeError(f"module 'surfsense_evals.core.providers' has no attribute {name!r}")
diff --git a/surfsense_evals/src/surfsense_evals/core/providers/openrouter_chat.py b/surfsense_evals/src/surfsense_evals/core/providers/openrouter_chat.py
new file mode 100644
index 000000000..2494434be
--- /dev/null
+++ b/surfsense_evals/src/surfsense_evals/core/providers/openrouter_chat.py
@@ -0,0 +1,118 @@
+"""Bare OpenRouter ``chat/completions`` provider — no PDF, no plugins.
+
+Used by ``BareLlmArm`` to measure "what does the model answer with
+zero retrieval context?". Same wire shape as ``OpenRouterPdfProvider``
+minus the file-parser plugin and the ``file`` content part:
+
+```json
+{
+  "model": "openai/gpt-5.4-mini",
+  "messages": [
+    {"role": "system", "content": "<optional>"},
+    {"role": "user",   "content": "<prompt>"}
+  ]
+}
+```
+
+The response shape is identical to the PDF provider's, so we re-use
+``_parse_chat_completion`` from ``openrouter_pdf`` and only specialise
+the request builder. That keeps cost-extraction, token-counting, and
+content-array handling in one place.
+"""
+
+from __future__ import annotations
+
+import logging
+import time
+from typing import Any
+
+import httpx
+
+from .openrouter_pdf import (
+    OpenRouterResponse,
+    _DEFAULT_HEADERS,
+    _parse_chat_completion,
+)
+
+logger = logging.getLogger(__name__)
+
+
+class OpenRouterChatProvider:
+    """Stateless bare-chat client. No PDF, no file-parser plugin."""
+
+    def __init__(
+        self,
+        *,
+        api_key: str,
+        base_url: str = "https://openrouter.ai/api/v1",
+        model: str,
+        timeout_s: float = 600.0,
+    ) -> None:
+        if not api_key:
+            raise ValueError("OPENROUTER_API_KEY is required for the bare-LLM arm.")
+        self._api_key = api_key
+        self._base = base_url.rstrip("/")
+        self._model = model
+        self._timeout = httpx.Timeout(timeout_s, connect=15.0)
+
+    @property
+    def model(self) -> str:
+        return self._model
+
+    def _build_payload(
+        self,
+        *,
+        prompt: str,
+        system_prompt: str | None,
+        max_tokens: int | None,
+    ) -> dict[str, Any]:
+        messages: list[dict[str, Any]] = []
+        if system_prompt:
+            messages.append({"role": "system", "content": system_prompt})
+        messages.append({"role": "user", "content": prompt})
+        body: dict[str, Any] = {"model": self._model, "messages": messages}
+        if max_tokens:
+            body["max_tokens"] = max_tokens
+        return body
+
+    async def complete(
+        self,
+        *,
+        prompt: str,
+        system_prompt: str | None = None,
+        max_tokens: int | None = None,
+        http: httpx.AsyncClient | None = None,
+    ) -> OpenRouterResponse:
+        """Single chat completion. Errors are raised verbatim — caller decides retries."""
+
+        payload = self._build_payload(
+            prompt=prompt,
+            system_prompt=system_prompt,
+            max_tokens=max_tokens,
+        )
+        headers = {
+            "Authorization": f"Bearer {self._api_key}",
+            "Content-Type": "application/json",
+            "Accept": "application/json",
+            **_DEFAULT_HEADERS,
+        }
+        url = f"{self._base}/chat/completions"
+        started = time.monotonic()
+        if http is not None:
+            response = await http.post(url, json=payload, headers=headers, timeout=self._timeout)
+        else:
+            async with httpx.AsyncClient(timeout=self._timeout) as client:
+                response = await client.post(
+                    url, json=payload, headers=headers, timeout=self._timeout
+                )
+        latency_ms = int((time.monotonic() - started) * 1000)
+        if response.status_code >= 400:
+            raise httpx.HTTPStatusError(
+                f"OpenRouter HTTP {response.status_code}: {response.text[:300]}",
+                request=response.request,
+                response=response,
+            )
+        return _parse_chat_completion(response.json(), latency_ms=latency_ms)
+
+
+__all__ = ["OpenRouterChatProvider"]
diff --git a/surfsense_evals/src/surfsense_evals/core/providers/openrouter_pdf.py b/surfsense_evals/src/surfsense_evals/core/providers/openrouter_pdf.py
new file mode 100644
index 000000000..e98590cbf
--- /dev/null
+++ b/surfsense_evals/src/surfsense_evals/core/providers/openrouter_pdf.py
@@ -0,0 +1,231 @@
+"""Native-PDF arm provider: OpenRouter ``chat/completions`` with PDF input.
+
+Per `<https://openrouter.ai/docs/features/multimodal/pdfs>`__ the wire
+shape is OpenAI-compatible with one PDF-specific extra:
+
+```json
+{
+  "model": "anthropic/claude-sonnet-4.5",
+  "messages": [{
+    "role": "user",
+    "content": [
+      {"type": "file", "file": {"filename": "case.pdf",
+        "file_data": "data:application/pdf;base64,<b64>"}},
+      {"type": "text", "text": "<prompt>"}
+    ]
+  }],
+  "plugins": [{"id": "file-parser", "pdf": {"engine": "native"}}]
+}
+```
+
+``engine: "native"`` is the only engine that doesn't pre-OCR the
+PDF — it forwards raw bytes to PDF-native models (Claude, Gemini),
+matching what a human user does when "dropping the PDF into Claude".
+``mistral-ocr`` and ``cloudflare-ai`` are exposed as enum options for
+non-native models.
+
+Headers ``HTTP-Referer`` and ``X-Title`` make spend show up cleanly on
+the OpenRouter dashboard.
+"""
+
+from __future__ import annotations
+
+import base64
+import logging
+import time
+from dataclasses import dataclass
+from enum import Enum
+from pathlib import Path
+from typing import Any
+
+import httpx
+
+logger = logging.getLogger(__name__)
+
+
+class PdfEngine(str, Enum):
+    NATIVE = "native"
+    MISTRAL_OCR = "mistral-ocr"
+    CLOUDFLARE_AI = "cloudflare-ai"
+
+
+@dataclass
+class OpenRouterResponse:
+    """Subset of the OpenRouter response we care about for scoring."""
+
+    text: str
+    input_tokens: int
+    output_tokens: int
+    total_tokens: int
+    cost_micros: int
+    latency_ms: int
+    finish_reason: str | None
+    raw: dict[str, Any]
+
+
+_DEFAULT_HEADERS = {
+    "HTTP-Referer": "https://github.com/MODSetter/SurfSense",
+    "X-Title": "SurfSense-evals",
+}
+
+
+class OpenRouterPdfProvider:
+    """Thin httpx-based client. Stateless; safe to reuse per arm instance."""
+
+    def __init__(
+        self,
+        *,
+        api_key: str,
+        base_url: str = "https://openrouter.ai/api/v1",
+        model: str,
+        engine: PdfEngine = PdfEngine.NATIVE,
+        timeout_s: float = 600.0,
+    ) -> None:
+        if not api_key:
+            raise ValueError("OPENROUTER_API_KEY is required for the native arm.")
+        self._api_key = api_key
+        self._base = base_url.rstrip("/")
+        self._model = model
+        self._engine = engine
+        self._timeout = httpx.Timeout(timeout_s, connect=15.0)
+
+    @property
+    def model(self) -> str:
+        return self._model
+
+    @property
+    def engine(self) -> PdfEngine:
+        return self._engine
+
+    def _build_payload(
+        self,
+        *,
+        prompt: str,
+        pdf_path: Path,
+        max_tokens: int | None,
+        extra_messages: list[dict[str, Any]] | None,
+    ) -> dict[str, Any]:
+        b64 = base64.b64encode(pdf_path.read_bytes()).decode("ascii")
+        user_content: list[dict[str, Any]] = [
+            {
+                "type": "file",
+                "file": {
+                    "filename": pdf_path.name,
+                    "file_data": f"data:application/pdf;base64,{b64}",
+                },
+            },
+            {"type": "text", "text": prompt},
+        ]
+        messages: list[dict[str, Any]] = list(extra_messages or [])
+        messages.append({"role": "user", "content": user_content})
+        body: dict[str, Any] = {
+            "model": self._model,
+            "messages": messages,
+            "plugins": [
+                {"id": "file-parser", "pdf": {"engine": self._engine.value}}
+            ],
+        }
+        if max_tokens:
+            body["max_tokens"] = max_tokens
+        return body
+
+    async def complete(
+        self,
+        *,
+        prompt: str,
+        pdf_path: Path,
+        max_tokens: int | None = None,
+        extra_messages: list[dict[str, Any]] | None = None,
+        http: httpx.AsyncClient | None = None,
+    ) -> OpenRouterResponse:
+        """Single chat completion. Errors are raised verbatim — runner decides retries."""
+
+        payload = self._build_payload(
+            prompt=prompt,
+            pdf_path=pdf_path,
+            max_tokens=max_tokens,
+            extra_messages=extra_messages,
+        )
+        headers = {
+            "Authorization": f"Bearer {self._api_key}",
+            "Content-Type": "application/json",
+            "Accept": "application/json",
+            **_DEFAULT_HEADERS,
+        }
+        url = f"{self._base}/chat/completions"
+        started = time.monotonic()
+        if http is not None:
+            response = await http.post(url, json=payload, headers=headers, timeout=self._timeout)
+        else:
+            async with httpx.AsyncClient(timeout=self._timeout) as client:
+                response = await client.post(
+                    url, json=payload, headers=headers, timeout=self._timeout
+                )
+        latency_ms = int((time.monotonic() - started) * 1000)
+        if response.status_code >= 400:
+            raise httpx.HTTPStatusError(
+                f"OpenRouter HTTP {response.status_code}: {response.text[:300]}",
+                request=response.request,
+                response=response,
+            )
+        data = response.json()
+        return _parse_chat_completion(data, latency_ms=latency_ms)
+
+
+def _parse_chat_completion(payload: dict[str, Any], *, latency_ms: int) -> OpenRouterResponse:
+    """Tolerant parser for OpenRouter / OpenAI chat-completions JSON.
+
+    OpenRouter passes through any provider-specific extras, but the
+    canonical shape is ``choices[0].message.content`` (string OR array
+    of content parts) and ``usage.prompt_tokens / completion_tokens / total_tokens``.
+    Cost lives at the top level (``payload["usage"]["cost"]`` or
+    ``payload["x-or-cost"]``) depending on routing.
+    """
+
+    text = ""
+    finish_reason: str | None = None
+    choices = payload.get("choices") or []
+    if choices:
+        message = (choices[0] or {}).get("message") or {}
+        content = message.get("content")
+        if isinstance(content, str):
+            text = content
+        elif isinstance(content, list):
+            chunks: list[str] = []
+            for part in content:
+                if isinstance(part, dict) and part.get("type") in {"text", "output_text"}:
+                    chunks.append(str(part.get("text", "")))
+            text = "".join(chunks)
+        finish_reason = (choices[0] or {}).get("finish_reason") or None
+
+    usage = payload.get("usage") or {}
+    input_tokens = int(usage.get("prompt_tokens") or 0)
+    output_tokens = int(usage.get("completion_tokens") or 0)
+    total_tokens = int(usage.get("total_tokens") or (input_tokens + output_tokens))
+
+    # OpenRouter exposes cost in dollars on `usage.cost` or `cost`. We
+    # convert to integer micros to avoid float-summing surprises across
+    # 7,663 MIRAGE questions.
+    raw_cost = usage.get("cost")
+    if raw_cost is None:
+        raw_cost = payload.get("cost")
+    cost_micros = 0
+    if raw_cost is not None:
+        try:
+            cost_micros = int(round(float(raw_cost) * 1_000_000))
+        except (TypeError, ValueError):
+            cost_micros = 0
+
+    return OpenRouterResponse(
+        text=text,
+        input_tokens=input_tokens,
+        output_tokens=output_tokens,
+        total_tokens=total_tokens,
+        cost_micros=cost_micros,
+        latency_ms=latency_ms,
+        finish_reason=finish_reason,
+        raw=payload,
+    )
+
+
+__all__ = ["OpenRouterPdfProvider", "OpenRouterResponse", "PdfEngine"]
diff --git a/surfsense_evals/src/surfsense_evals/core/registry.py b/surfsense_evals/src/surfsense_evals/core/registry.py
new file mode 100644
index 000000000..cc8b725e0
--- /dev/null
+++ b/surfsense_evals/src/surfsense_evals/core/registry.py
@@ -0,0 +1,265 @@
+"""Suite + Benchmark protocols and the global registry.
+
+The extensibility seam: ``core.cli`` walks ``surfsense_evals.suites`` on
+import, which auto-imports every benchmark subpackage, which calls
+``register(<benchmark>)`` at module bottom. The CLI then iterates the
+populated registry to build subcommand groups dynamically.
+
+Adding a new domain = drop a folder under ``suites/<domain>/<bench>/``
+that ends in ``register(MyBenchmark())``. No edits anywhere in
+``core/`` are required.
+"""
+
+from __future__ import annotations
+
+import argparse
+from collections.abc import Mapping
+from dataclasses import dataclass, field
+from pathlib import Path
+from typing import Any, Protocol, runtime_checkable
+
+import httpx
+
+from .clients import DocumentsClient, NewChatClient, SearchSpaceClient
+from .config import Config, SuiteState
+
+# ---------------------------------------------------------------------------
+# Run context — what every benchmark.ingest/run receives
+# ---------------------------------------------------------------------------
+
+
+@dataclass
+class RunContext:
+    """Per-invocation environment threaded into ``ingest`` and ``run``.
+
+    A benchmark uses this to read pinned suite state, build new HTTP
+    clients on the shared ``http`` session, find the right data /
+    reports paths, and discover the active OpenRouter model + key.
+
+    ``http`` is the authenticated SurfSense client (auth event hook
+    attached). It is **not** an OpenRouter client — providers create
+    their own short-lived clients because OpenRouter doesn't share the
+    SurfSense bearer.
+    """
+
+    suite: str
+    benchmark: str
+    config: Config
+    suite_state: SuiteState
+    http: httpx.AsyncClient
+
+    @property
+    def search_space_id(self) -> int:
+        return self.suite_state.search_space_id
+
+    @property
+    def agent_llm_id(self) -> int:
+        return self.suite_state.agent_llm_id
+
+    @property
+    def provider_model(self) -> str:
+        """Slug used by the SurfSense agent (and the native arm by default).
+
+        For ``cost-arbitrage`` scenarios this is the *cheap, text-only*
+        slug — SurfSense answers from the chunks the vision LLM already
+        extracted at ingest. The native arm should use
+        ``native_arm_model`` instead in that scenario.
+        """
+
+        return self.suite_state.provider_model
+
+    @property
+    def native_arm_model(self) -> str:
+        """Slug the native_pdf arm should use.
+
+        Defaults to ``provider_model`` (head-to-head / symmetric-cheap);
+        for ``cost-arbitrage`` it returns the explicit
+        ``--native-arm-model`` so the native arm can fairly answer
+        image-bearing questions.
+        """
+
+        return self.suite_state.effective_native_arm_model
+
+    @property
+    def vision_provider_model(self) -> str | None:
+        """Slug of the OpenRouter vision LLM SurfSense used at ingest.
+
+        ``None`` if no vision config was attached at setup (legacy or
+        text-only suite). Used by runners purely to record what was
+        actually used in ``RunArtifact.extra`` and to label reports.
+        """
+
+        return self.suite_state.vision_provider_model
+
+    @property
+    def scenario(self) -> str:
+        """Scenario name pinned at setup time (see ``config.SCENARIOS``)."""
+
+        return self.suite_state.scenario
+
+    def search_space_client(self) -> SearchSpaceClient:
+        return SearchSpaceClient(self.http, self.config.surfsense_api_base)
+
+    def documents_client(self) -> DocumentsClient:
+        return DocumentsClient(self.http, self.config.surfsense_api_base)
+
+    def new_chat_client(self) -> NewChatClient:
+        return NewChatClient(self.http, self.config.surfsense_api_base)
+
+    def maps_dir(self) -> Path:
+        path = self.config.suite_maps_dir(self.suite)
+        path.mkdir(parents=True, exist_ok=True)
+        return path
+
+    def runs_dir(self, *, run_timestamp: str) -> Path:
+        path = self.config.suite_runs_dir(self.suite) / run_timestamp / self.benchmark
+        path.mkdir(parents=True, exist_ok=True)
+        return path
+
+    def benchmark_data_dir(self) -> Path:
+        path = self.config.suite_data_dir(self.suite) / self.benchmark
+        path.mkdir(parents=True, exist_ok=True)
+        return path
+
+
+# ---------------------------------------------------------------------------
+# Run artifact + report section
+# ---------------------------------------------------------------------------
+
+
+@dataclass
+class RunArtifact:
+    """Everything a runner persists for the report writer to consume.
+
+    ``raw_path`` points at the JSONL of per-question ``ArmResult``
+    rows. ``metrics`` is a free-form dict the benchmark fills in (e.g.
+    ``{"native": {...}, "surfsense": {...}, "delta": {...}}``).
+    """
+
+    suite: str
+    benchmark: str
+    run_timestamp: str
+    raw_path: Path
+    metrics: dict[str, Any] = field(default_factory=dict)
+    extra: dict[str, Any] = field(default_factory=dict)
+
+
+@dataclass
+class ReportSection:
+    """One benchmark's slice of the final summary."""
+
+    title: str
+    headline: bool
+    body_md: str
+    body_json: dict[str, Any] = field(default_factory=dict)
+
+
+# ---------------------------------------------------------------------------
+# Benchmark protocol + registry
+# ---------------------------------------------------------------------------
+
+
+@runtime_checkable
+class Benchmark(Protocol):
+    """The contract every benchmark module ends with ``register(<x>)``."""
+
+    suite: str
+    name: str
+    headline: bool
+    description: str
+
+    async def ingest(self, ctx: RunContext, **opts: Any) -> None:  # pragma: no cover - protocol
+        ...
+
+    async def run(self, ctx: RunContext, **opts: Any) -> RunArtifact:  # pragma: no cover - protocol
+        ...
+
+    def add_run_args(self, parser: argparse.ArgumentParser) -> None:  # pragma: no cover - protocol
+        """Add benchmark-specific flags to ``run <suite> <benchmark>``."""
+
+    def report_section(self, artifacts: list[RunArtifact]) -> ReportSection:  # pragma: no cover - protocol
+        ...
+
+
+# ---------------------------------------------------------------------------
+# Registry storage
+# ---------------------------------------------------------------------------
+
+
+_REGISTRY: dict[tuple[str, str], Benchmark] = {}
+
+
+def register(benchmark: Benchmark) -> None:
+    """Add ``benchmark`` to the registry. Last-wins on duplicate keys.
+
+    Duplicate registrations log a warning rather than raising so a
+    benchmark module imported twice (once via auto-discovery, once via
+    a test directly importing it) doesn't blow up the CLI.
+    """
+
+    key = (benchmark.suite, benchmark.name)
+    if key in _REGISTRY:
+        import logging
+
+        logging.getLogger(__name__).warning(
+            "Benchmark %s/%s re-registered (overwriting prior)", *key
+        )
+    _REGISTRY[key] = benchmark
+
+
+def unregister(suite: str, name: str) -> None:
+    """Test helper: drop a single benchmark from the registry."""
+
+    _REGISTRY.pop((suite, name), None)
+
+
+def reset() -> None:
+    """Test helper: wipe the registry (use with monkeypatched discovery)."""
+
+    _REGISTRY.clear()
+
+
+def get(suite: str, name: str) -> Benchmark:
+    try:
+        return _REGISTRY[(suite, name)]
+    except KeyError as exc:
+        available = ", ".join(f"{s}/{n}" for s, n in sorted(_REGISTRY)) or "<none>"
+        raise KeyError(
+            f"Unknown benchmark '{suite}/{name}'. Registered: {available}"
+        ) from exc
+
+
+def list_suites() -> list[str]:
+    return sorted({s for s, _ in _REGISTRY})
+
+
+def list_benchmarks(suite: str | None = None) -> list[Benchmark]:
+    if suite is None:
+        return [_REGISTRY[k] for k in sorted(_REGISTRY)]
+    return [_REGISTRY[k] for k in sorted(_REGISTRY) if k[0] == suite]
+
+
+def snapshot() -> Mapping[tuple[str, str], Benchmark]:
+    """Read-only view for diagnostics (e.g. ``benchmarks list`` rendering)."""
+
+    return dict(_REGISTRY)
+
+
+__all__ = [
+    "Arm",
+    "Benchmark",
+    "ReportSection",
+    "RunArtifact",
+    "RunContext",
+    "get",
+    "list_benchmarks",
+    "list_suites",
+    "register",
+    "reset",
+    "snapshot",
+    "unregister",
+]
+
+
+# Re-export Arm from arms.base so suites can `from core.registry import Arm`.
+from .arms.base import Arm  # noqa: E402, F401  (deliberate re-export at bottom)
diff --git a/surfsense_evals/src/surfsense_evals/core/report/__init__.py b/surfsense_evals/src/surfsense_evals/core/report/__init__.py
new file mode 100644
index 000000000..c5ccbc64c
--- /dev/null
+++ b/surfsense_evals/src/surfsense_evals/core/report/__init__.py
@@ -0,0 +1,18 @@
+"""Report writer + section composition primitives. Lazy import."""
+
+from __future__ import annotations
+
+from typing import TYPE_CHECKING
+
+if TYPE_CHECKING:  # pragma: no cover
+    from .writer import write_report
+
+__all__ = ["write_report"]
+
+
+def __getattr__(name: str):
+    if name == "write_report":
+        from .writer import write_report
+
+        return write_report
+    raise AttributeError(f"module 'surfsense_evals.core.report' has no attribute {name!r}")
diff --git a/surfsense_evals/src/surfsense_evals/core/report/writer.py b/surfsense_evals/src/surfsense_evals/core/report/writer.py
new file mode 100644
index 000000000..8d1ffa07a
--- /dev/null
+++ b/surfsense_evals/src/surfsense_evals/core/report/writer.py
@@ -0,0 +1,89 @@
+"""Report writer — composes per-benchmark sections into one summary.
+
+Output:
+
+* ``reports/<suite>/<run-timestamp>/summary.md`` — human-readable.
+  Bullet lists only (no tables) per project's coding-standards.
+* ``reports/<suite>/<run-timestamp>/summary.json`` — same content as
+  structured JSON for downstream tooling (CI dashboards, regressions).
+
+Headline benchmarks come first in both outputs.
+"""
+
+from __future__ import annotations
+
+import json
+from collections.abc import Iterable
+from pathlib import Path
+
+from ..config import Config
+from ..registry import ReportSection
+
+
+def write_report(
+    *,
+    config: Config,
+    suite: str,
+    sections: Iterable[ReportSection],
+    run_timestamp: str,
+) -> Path:
+    """Write ``summary.md`` + ``summary.json``. Returns the path of the .md file."""
+
+    sections_list = list(sections)
+    sections_list.sort(key=lambda s: (not s.headline, s.title.lower()))
+
+    out_dir = config.suite_reports_dir(suite) / run_timestamp
+    out_dir.mkdir(parents=True, exist_ok=True)
+
+    md_path = out_dir / "summary.md"
+    json_path = out_dir / "summary.json"
+
+    md_lines: list[str] = [
+        f"# SurfSense evals — suite `{suite}`",
+        "",
+        f"- Run timestamp: `{run_timestamp}`",
+        f"- Sections: {len(sections_list)}",
+        "",
+    ]
+    headline = [s for s in sections_list if s.headline]
+    secondary = [s for s in sections_list if not s.headline]
+    if headline:
+        md_lines.append("## Headline")
+        md_lines.append("")
+        for section in headline:
+            md_lines.append(f"### {section.title}")
+            md_lines.append("")
+            md_lines.append(section.body_md.rstrip())
+            md_lines.append("")
+    if secondary:
+        md_lines.append("## Secondary measurements")
+        md_lines.append("")
+        for section in secondary:
+            md_lines.append(f"### {section.title}")
+            md_lines.append("")
+            md_lines.append(section.body_md.rstrip())
+            md_lines.append("")
+
+    md_path.write_text("\n".join(md_lines).rstrip() + "\n", encoding="utf-8")
+
+    json_payload = {
+        "suite": suite,
+        "run_timestamp": run_timestamp,
+        "sections": [
+            {
+                "title": s.title,
+                "headline": s.headline,
+                "body_md": s.body_md,
+                "body_json": s.body_json,
+            }
+            for s in sections_list
+        ],
+    }
+    json_path.write_text(
+        json.dumps(json_payload, indent=2, sort_keys=True) + "\n",
+        encoding="utf-8",
+    )
+    return md_path
+
+
+__all__ = ["ReportSection", "write_report"]
diff --git a/surfsense_evals/src/surfsense_evals/core/scenarios.py b/surfsense_evals/src/surfsense_evals/core/scenarios.py
new file mode 100644
index 000000000..16874a069
--- /dev/null
+++ b/surfsense_evals/src/surfsense_evals/core/scenarios.py
@@ -0,0 +1,58 @@
+"""Shared scenario formatting helpers for head-to-head benchmark reports.
+
+The scenario chosen at ``setup`` time (``head-to-head``, ``symmetric-cheap``,
+``cost-arbitrage``) materially changes how a head-to-head report should be
+read. This module produces the one-bullet summary every head-to-head
+runner stamps near the top of its ``report_section`` body so reviewers
+immediately see the framing — no need to dig into ``run_artifact.json``.
+"""
+
+from __future__ import annotations
+
+from collections.abc import Mapping
+from typing import Any
+
+
+def format_scenario_md(extra: Mapping[str, Any] | None) -> str:
+    """Render a scenario-aware bullet for a benchmark report.
+
+    Reads ``extra["scenario"]`` plus the runtime LLM slugs the runner
+    recorded. Falls back to a sensible "head-to-head" line if the artifact
+    pre-dates scenarios so old runs still render cleanly.
+    """
+
+    extra = dict(extra or {})
+    scenario = str(extra.get("scenario") or "head-to-head")
+    surf_slug = str(extra.get("provider_model") or "?")
+    native_slug = str(extra.get("native_arm_model") or surf_slug)
+    vision_slug = extra.get("vision_provider_model")
+
+    if scenario == "cost-arbitrage":
+        body = (
+            f"- Scenario: **cost-arbitrage** — native arm answers with "
+            f"`{native_slug}` (vision); SurfSense answers with `{surf_slug}` "
+            f"over chunks vision-extracted at ingest"
+            f"{f' by `{vision_slug}`' if vision_slug else ''}. "
+            "Measures how close SurfSense gets to native at a fraction of "
+            "the per-query cost."
+        )
+    elif scenario == "symmetric-cheap":
+        body = (
+            f"- Scenario: **symmetric-cheap** — both arms answer with "
+            f"`{surf_slug}`; SurfSense pre-extracted images at ingest"
+            f"{f' via `{vision_slug}`' if vision_slug else ''}. "
+            "Native arm structurally loses on image-bearing questions "
+            "(text-only model can't see images) — that's the point."
+        )
+    else:
+        body = (
+            f"- Scenario: head-to-head — both arms answer with `{surf_slug}` "
+            "via OpenRouter."
+        )
+        if vision_slug:
+            body += f" SurfSense ingest VLM: `{vision_slug}`."
+
+    return body
+
+
+__all__ = ["format_scenario_md"]
diff --git a/surfsense_evals/src/surfsense_evals/core/vision_llm.py b/surfsense_evals/src/surfsense_evals/core/vision_llm.py
new file mode 100644
index 000000000..ae96f1285
--- /dev/null
+++ b/surfsense_evals/src/surfsense_evals/core/vision_llm.py
@@ -0,0 +1,127 @@
+"""Vision LLM resolution + auto-pick logic for the harness's ``setup`` command.
+
+Two responsibilities:
+
+1. Resolve an explicit ``--vision-llm <slug>`` to a global OpenRouter
+   vision LLM config id that ``set_llm_preferences(vision_llm_config_id=...)``
+   can accept.
+2. Auto-pick the strongest registered vision config when the operator
+   doesn't pass ``--vision-llm`` but the scenario / benchmark needs one.
+
+The priority list mirrors the recommended slugs in the README so the
+auto-pick is deterministic and reviewable.
+"""
+
+from __future__ import annotations
+
+from collections.abc import Iterable
+from dataclasses import dataclass
+
+from .clients.search_space import VisionLlmConfigEntry
+
+# Order matters — first match wins when auto-picking. Keep these in sync
+# with the "Recommended vision slugs" table in the README so the
+# auto-pick story is the same one users read about.
+RECOMMENDED_VISION_PRIORITY: tuple[str, ...] = (
+    "anthropic/claude-sonnet-4.5",
+    "anthropic/claude-opus-4.7",
+    "openai/gpt-5",
+    "google/gemini-2.5-pro",
+)
+
+
+class VisionConfigError(RuntimeError):
+    """Raised when no vision config can be resolved (explicit or auto)."""
+
+
+@dataclass(frozen=True)
+class ResolvedVisionConfig:
+    """Result of ``resolve_vision_llm`` — what to attach + a label for logs."""
+
+    config_id: int
+    provider_model: str
+    selected_via: str  # "explicit" | "auto-priority" | "auto-fallback"
+
+
+def _openrouter_only(entries: Iterable[VisionLlmConfigEntry]) -> list[VisionLlmConfigEntry]:
+    return [e for e in entries if e.provider == "OPENROUTER" and not e.is_auto_mode]
+
+
+def resolve_vision_llm(
+    candidates: list[VisionLlmConfigEntry],
+    *,
+    explicit_slug: str | None,
+) -> ResolvedVisionConfig:
+    """Resolve a vision LLM config id from a slug or by auto-picking.
+
+    * If ``explicit_slug`` is given: must match exactly one OpenRouter
+      vision config's ``model_name``. Raises ``VisionConfigError`` with a
+      friendly listing if zero / many match.
+    * Otherwise: walk ``RECOMMENDED_VISION_PRIORITY`` in order and return
+      the first registered one. If none of the recommended slugs are
+      registered, fall back to the first OpenRouter vision config in the
+      list (deterministic by listing order). Raises ``VisionConfigError``
+      if zero are registered at all.
+    """
+
+    or_vision = _openrouter_only(candidates)
+
+    if explicit_slug is not None:
+        matches = [e for e in or_vision if e.model_name == explicit_slug]
+        if not matches:
+            sample = ", ".join(e.model_name for e in or_vision[:8]) or "<none>"
+            raise VisionConfigError(
+                f"No OpenRouter vision config found for slug '{explicit_slug}'. "
+                "Make sure `openrouter_integration.vision_enabled: true` in "
+                "global_llm_config.yaml and that the Celery worker has finished "
+                "its first refresh. "
+                f"Available OpenRouter vision slugs (sample): {sample}."
+            )
+        if len(matches) > 1:
+            listing = "\n".join(f"  id={e.id}  name={e.name!r}" for e in matches)
+            raise VisionConfigError(
+                f"Multiple OpenRouter vision configs match '{explicit_slug}':\n{listing}"
+            )
+        only = matches[0]
+        return ResolvedVisionConfig(
+            config_id=only.id,
+            provider_model=only.model_name,
+            selected_via="explicit",
+        )
+
+    if not or_vision:
+        raise VisionConfigError(
+            "No OpenRouter vision LLM configs are registered with this "
+            "SurfSense backend. Either pass `--no-vision-llm` to the ingest "
+            "step (text-only ingestion), or enable "
+            "`openrouter_integration.vision_enabled: true` in "
+            "global_llm_config.yaml so the Celery worker syncs vision-capable "
+            "OpenRouter models on next refresh."
+        )
+
+    by_slug = {e.model_name: e for e in or_vision}
+    for preferred in RECOMMENDED_VISION_PRIORITY:
+        match = by_slug.get(preferred)
+        if match is not None:
+            return ResolvedVisionConfig(
+                config_id=match.id,
+                provider_model=match.model_name,
+                selected_via="auto-priority",
+            )
+
+    # Fallback: first registered OpenRouter vision config. Deterministic
+    # because the backend returns them in a stable order.
+    fallback = or_vision[0]
+    return ResolvedVisionConfig(
+        config_id=fallback.id,
+        provider_model=fallback.model_name,
+        selected_via="auto-fallback",
+    )
+
+
+__all__ = [
+    "RECOMMENDED_VISION_PRIORITY",
+    "ResolvedVisionConfig",
+    "VisionConfigError",
+    "resolve_vision_llm",
+]
diff --git a/surfsense_evals/src/surfsense_evals/suites/__init__.py b/surfsense_evals/src/surfsense_evals/suites/__init__.py
new file mode 100644
index 000000000..95ed958ca
--- /dev/null
+++ b/surfsense_evals/src/surfsense_evals/suites/__init__.py
@@ -0,0 +1,66 @@
+"""Suite registry auto-discovery.
+
+Importing ``surfsense_evals.suites`` walks every subpackage one level deep
+(domain like ``medical``) AND its benchmark subpackages
+(``medical/medxpertqa``, ``medical/mirage``, ``medical/cure``). Each
+benchmark's ``__init__.py`` is expected to call
+``core.registry.register(<Benchmark>)`` at module bottom; merely importing
+the module is enough to populate the registry.
+
+Adding a new domain is therefore: drop a folder under ``suites/`` with the
+right structure. No edits anywhere else.
+
+Subpackages whose name starts with ``_`` are skipped — that's reserved for
+test fixtures (e.g. ``suites/_demo/``) so they don't accidentally show up
+in ``benchmarks list``.
+"""
+
+from __future__ import annotations
+
+import importlib
+import logging
+import pkgutil
+from typing import Iterable
+
+logger = logging.getLogger(__name__)
+
+
+def _iter_subpackages(package) -> Iterable[str]:
+    """Yield fully-qualified subpackage names one level deep, skipping ``_*``."""
+
+    for module_info in pkgutil.iter_modules(package.__path__, prefix=f"{package.__name__}."):
+        if not module_info.ispkg:
+            continue
+        leaf = module_info.name.rsplit(".", 1)[-1]
+        if leaf.startswith("_"):
+            continue
+        yield module_info.name
+
+
+def discover_suites() -> list[str]:
+    """Import every domain + benchmark subpackage so registrations fire.
+
+    Returns the list of fully-qualified benchmark module names that were
+    successfully imported. Failures are logged (not raised) so a single
+    broken benchmark doesn't take down the whole CLI — the operator still
+    sees the working benchmarks via ``benchmarks list``.
+    """
+
+    import surfsense_evals.suites as _suites  # self-import for __path__
+
+    imported: list[str] = []
+    for domain_name in _iter_subpackages(_suites):
+        try:
+            domain_pkg = importlib.import_module(domain_name)
+        except Exception as exc:  # noqa: BLE001
+            logger.warning("Failed to import suite domain %s: %s", domain_name, exc)
+            continue
+        for benchmark_name in _iter_subpackages(domain_pkg):
+            try:
+                importlib.import_module(benchmark_name)
+                imported.append(benchmark_name)
+            except Exception as exc:  # noqa: BLE001
+                logger.warning(
+                    "Failed to import benchmark %s: %s", benchmark_name, exc
+                )
+    return imported
diff --git a/surfsense_evals/src/surfsense_evals/suites/_demo/__init__.py b/surfsense_evals/src/surfsense_evals/suites/_demo/__init__.py
new file mode 100644
index 000000000..9a8cd447e
--- /dev/null
+++ b/surfsense_evals/src/surfsense_evals/suites/_demo/__init__.py
@@ -0,0 +1,8 @@
+"""Test fixture suite — skipped by the auto-discovery walker (name starts with ``_``).
+
+Imported explicitly by ``tests/core/test_registry.py`` to prove the
+register-on-import contract works without polluting the production
+benchmark list.
+"""
+
+from __future__ import annotations
diff --git a/surfsense_evals/src/surfsense_evals/suites/_demo/hello/__init__.py b/surfsense_evals/src/surfsense_evals/suites/_demo/hello/__init__.py
new file mode 100644
index 000000000..1da33926c
--- /dev/null
+++ b/surfsense_evals/src/surfsense_evals/suites/_demo/hello/__init__.py
@@ -0,0 +1,46 @@
+"""Demo benchmark — registers on import, used only by the registry tests."""
+
+from __future__ import annotations
+
+import argparse
+from typing import Any
+
+from ....core.registry import (
+    Benchmark,
+    ReportSection,
+    RunArtifact,
+    RunContext,
+    register,
+)
+
+
+class HelloBenchmark:
+    suite: str = "_demo"
+    name: str = "hello"
+    headline: bool = False
+    description: str = "Demo benchmark used by the registry test."
+
+    def add_run_args(self, parser: argparse.ArgumentParser) -> None:
+        parser.add_argument("--echo", default="hi")
+
+    async def ingest(self, ctx: RunContext, **_opts: Any) -> None:  # pragma: no cover
+        return None
+
+    async def run(self, ctx: RunContext, **opts: Any) -> RunArtifact:  # pragma: no cover
+        return RunArtifact(
+            suite=self.suite,
+            benchmark=self.name,
+            run_timestamp="0",
+            raw_path=ctx.benchmark_data_dir() / "raw.jsonl",
+            metrics={"echo": opts.get("echo")},
+        )
+
+    def report_section(self, artifacts: list[RunArtifact]) -> ReportSection:
+        return ReportSection(
+            title="Hello demo",
+            headline=False,
+            body_md="- runs: " + str(len(artifacts)),
+        )
+
+
+register(HelloBenchmark())
diff --git a/surfsense_evals/src/surfsense_evals/suites/medical/__init__.py b/surfsense_evals/src/surfsense_evals/suites/medical/__init__.py
new file mode 100644
index 000000000..9c0067e25
--- /dev/null
+++ b/surfsense_evals/src/surfsense_evals/suites/medical/__init__.py
@@ -0,0 +1,7 @@
+"""Medical RAG benchmarks (MedXpertQA-MM headline + MIRAGE/CUREv1 secondary).
+
+Subpackages register themselves with ``core.registry`` on import. The
+``suites/__init__.py`` discovery walker imports them automatically.
+"""
+
+from __future__ import annotations
diff --git a/surfsense_evals/src/surfsense_evals/suites/medical/cure/__init__.py b/surfsense_evals/src/surfsense_evals/suites/medical/cure/__init__.py
new file mode 100644
index 000000000..e13224be7
--- /dev/null
+++ b/surfsense_evals/src/surfsense_evals/suites/medical/cure/__init__.py
@@ -0,0 +1,18 @@
+"""CUREv1 — secondary single-arm SurfSense retrieval measurement.
+
+Source: https://huggingface.co/datasets/clinia/CUREv1
+Paper: https://arxiv.org/html/2412.06954v4
+
+Pure retrieval benchmark — 10 medical disciplines, English/French/Spanish
+queries, expert-curated qrels (graded 0/1/2). The harness ingests the
+corpus, runs each query via SurfSense's ``/api/v1/new_chat``, parses
+chunk citations, maps them back to CUREv1 ``corpus-id``, and scores
+Recall@k / MRR / nDCG@10 against qrels.
+"""
+
+from __future__ import annotations
+
+from .runner import CureBenchmark
+from ....core import registry as _registry
+
+_registry.register(CureBenchmark())
diff --git a/surfsense_evals/src/surfsense_evals/suites/medical/cure/ingest.py b/surfsense_evals/src/surfsense_evals/suites/medical/cure/ingest.py
new file mode 100644
index 000000000..6eca8810c
--- /dev/null
+++ b/surfsense_evals/src/surfsense_evals/suites/medical/cure/ingest.py
@@ -0,0 +1,239 @@
+"""CUREv1 ingestion.
+
+For each (lang, discipline) requested, downloads the corpus split via
+``datasets.load_dataset(path="clinia/CUREv1", name="corpus", split=<discipline>)``,
+batches passages into ~5 MB markdown bundles, uploads them to
+SurfSense, polls until ``ready``, and persists the
+``corpus_id -> document_id`` map under
+``data/medical/maps/cure_corpus_map_<discipline>.jsonl``. A union map
+``cure_corpus_map.jsonl`` is also written so the runner can resolve
+citations across disciplines without juggling per-file paths.
+"""
+
+from __future__ import annotations
+
+import io
+import json
+import logging
+from collections.abc import Iterable
+from dataclasses import dataclass
+from pathlib import Path
+
+from ....core.config import set_suite_state
+from ....core.ingest_settings import IngestSettings, settings_header_line
+from ....core.registry import RunContext
+
+logger = logging.getLogger(__name__)
+
+
+_BATCH_SIZE_BYTES = 5 * 1024 * 1024
+
+# 10 disciplines covered by the dataset card. We exhaustively list
+# them so a smoke test can default to one.
+DISCIPLINES = (
+    "anesthesiology",
+    "cardiology",
+    "dermatology",
+    "endocrinology",
+    "gastroenterology",
+    "hematology",
+    "nephrology",
+    "neurology",
+    "obstetrics_gynecology",
+    "psychiatry",
+)
+
+
+@dataclass
+class CorpusPassage:
+    corpus_id: str
+    title: str
+    text: str
+
+    def to_markdown(self) -> str:
+        title = (self.title or "").strip() or "Untitled"
+        body = (self.text or "").strip()
+        return f"# {title}\n\n_id: `{self.corpus_id}`_\n\n{body}\n"
+
+
+@dataclass
+class PassageBatch:
+    path: Path
+    corpus_ids: list[str]
+
+
+def _stream_corpus(discipline: str) -> Iterable[CorpusPassage]:
+    """Stream corpus rows for one discipline via the ``datasets`` library."""
+
+    from datasets import load_dataset  # noqa: PLC0415
+
+    logger.info("Loading CUREv1 corpus for discipline=%s", discipline)
+    ds = load_dataset(path="clinia/CUREv1", name="corpus", split=discipline)
+    for row in ds:
+        cid = str(row.get("_id") or "")
+        if not cid:
+            continue
+        yield CorpusPassage(
+            corpus_id=cid,
+            title=str(row.get("title") or ""),
+            text=str(row.get("text") or ""),
+        )
+
+
+def _write_batches(
+    passages: Iterable[CorpusPassage],
+    *,
+    out_dir: Path,
+    discipline: str,
+    batch_bytes: int = _BATCH_SIZE_BYTES,
+) -> list[PassageBatch]:
+    out_dir.mkdir(parents=True, exist_ok=True)
+    batches: list[PassageBatch] = []
+    current_buffer = io.StringIO()
+    current_ids: list[str] = []
+    current_bytes = 0
+    batch_idx = 0
+
+    def _flush() -> None:
+        nonlocal current_buffer, current_ids, current_bytes, batch_idx
+        if not current_ids:
+            return
+        path = out_dir / f"cure_{discipline}_{batch_idx:04d}.md"
+        path.write_text(current_buffer.getvalue(), encoding="utf-8")
+        batches.append(PassageBatch(path=path, corpus_ids=current_ids))
+        batch_idx += 1
+        current_buffer = io.StringIO()
+        current_ids = []
+        current_bytes = 0
+
+    for passage in passages:
+        chunk = passage.to_markdown() + "\n---\n\n"
+        chunk_bytes = len(chunk.encode("utf-8"))
+        if current_bytes + chunk_bytes > batch_bytes and current_ids:
+            _flush()
+        current_buffer.write(chunk)
+        current_ids.append(passage.corpus_id)
+        current_bytes += chunk_bytes
+    _flush()
+    return batches
+
+
+async def run_ingest(
+    ctx: RunContext,
+    *,
+    disciplines: list[str] | None = None,
+    max_per_discipline: int | None = None,
+    settings: IngestSettings | None = None,
+) -> None:
+    disciplines = disciplines or list(DISCIPLINES)
+    settings = settings or IngestSettings(use_vision_llm=False, processing_mode="basic")
+    bench_dir = ctx.benchmark_data_dir()
+    batches_root = bench_dir / "batches"
+    batches_root.mkdir(parents=True, exist_ok=True)
+
+    docs_client = ctx.documents_client()
+    union_map_path = ctx.maps_dir() / "cure_corpus_map.jsonl"
+    union_map_fh = union_map_path.open("w", encoding="utf-8")
+    # Header row records the ingest-time settings so the runner can
+    # surface them in the report (see core/ingest_settings.py).
+    union_map_fh.write(settings_header_line(settings) + "\n")
+    try:
+        for discipline in disciplines:
+            try:
+                passages_iter = _stream_corpus(discipline)
+                if max_per_discipline is not None:
+                    passages_iter = _take(passages_iter, max_per_discipline)
+                batches = _write_batches(
+                    passages_iter,
+                    out_dir=batches_root / discipline,
+                    discipline=discipline,
+                )
+            except Exception as exc:  # noqa: BLE001
+                logger.warning("Skipping discipline %s: %s", discipline, exc)
+                continue
+            if not batches:
+                logger.warning("Discipline %s produced 0 batches; skipping upload", discipline)
+                continue
+            logger.info(
+                "Uploading %d batches for discipline %s", len(batches), discipline
+            )
+            upload_result = await docs_client.upload(
+                files=[b.path for b in batches],
+                search_space_id=ctx.search_space_id,
+                should_summarize=settings.should_summarize,
+                use_vision_llm=settings.use_vision_llm,
+                processing_mode=settings.processing_mode,
+            )
+            new_doc_ids = list(upload_result.document_ids)
+            if new_doc_ids:
+                await docs_client.wait_until_ready(
+                    search_space_id=ctx.search_space_id,
+                    document_ids=new_doc_ids,
+                    timeout_s=3600.0,
+                    max_poll_s=15.0,
+                )
+            statuses = await docs_client.get_status(
+                search_space_id=ctx.search_space_id,
+                document_ids=new_doc_ids + upload_result.duplicate_document_ids,
+            )
+            title_to_doc = {s.title: s.document_id for s in statuses}
+
+            per_discipline_path = (
+                ctx.maps_dir() / f"cure_corpus_map_{discipline}.jsonl"
+            )
+            with per_discipline_path.open("w", encoding="utf-8") as fh:
+                fh.write(settings_header_line(settings) + "\n")
+                for batch in batches:
+                    doc_id = title_to_doc.get(batch.path.name)
+                    if doc_id is None:
+                        logger.warning("No document_id for batch %s", batch.path.name)
+                        continue
+                    for cid in batch.corpus_ids:
+                        record = {
+                            "corpus_id": cid,
+                            "document_id": doc_id,
+                            "discipline": discipline,
+                        }
+                        fh.write(json.dumps(record) + "\n")
+                        union_map_fh.write(json.dumps(record) + "\n")
+
+            chunks_map_path = ctx.maps_dir() / f"cure_chunk_map_{discipline}.jsonl"
+            with chunks_map_path.open("w", encoding="utf-8") as fh:
+                for doc_id in {title_to_doc.get(b.path.name) for b in batches} - {None}:
+                    try:
+                        chunks = await docs_client.list_chunks(int(doc_id))
+                    except Exception as exc:  # noqa: BLE001
+                        logger.warning(
+                            "Failed to list chunks for doc_id=%s: %s", doc_id, exc
+                        )
+                        continue
+                    for chunk in chunks:
+                        fh.write(
+                            json.dumps(
+                                {
+                                    "chunk_id": chunk.id,
+                                    "document_id": doc_id,
+                                    "discipline": discipline,
+                                }
+                            )
+                            + "\n"
+                        )
+    finally:
+        union_map_fh.close()
+
+    new_state = ctx.suite_state
+    new_state.ingestion_maps["cure"] = str(union_map_path)
+    set_suite_state(ctx.config, ctx.suite, new_state)
+    logger.info("CUREv1 ingestion complete; union map at %s", union_map_path)
+
+
+def _take(it: Iterable, n: int) -> Iterable:
+    yielded = 0
+    for x in it:
+        if yielded >= n:
+            return
+        yield x
+        yielded += 1
+
+
+__all__ = ["DISCIPLINES", "CorpusPassage", "PassageBatch", "run_ingest"]
diff --git a/surfsense_evals/src/surfsense_evals/suites/medical/cure/runner.py b/surfsense_evals/src/surfsense_evals/suites/medical/cure/runner.py
new file mode 100644
index 000000000..416912b14
--- /dev/null
+++ b/surfsense_evals/src/surfsense_evals/suites/medical/cure/runner.py
@@ -0,0 +1,397 @@
+"""CUREv1 runner — single-arm SurfSense retrieval scoring.
+
+For each query we ask SurfSense via ``/api/v1/new_chat`` (no
+``mentioned_document_ids``) and parse chunk citations from the
+streamed answer. Cited ``chunk_id`` → ``document_id`` (chunk map) →
+``corpus_id`` (corpus map). The resulting ranked list is scored
+against the dataset's qrels.
+
+The prompt nudges the model to surface its supporting passages via
+SurfSense's standard ``[citation:CHUNK_ID]`` format (already required
+by the agent system prompt), so we recover retrieval ordering from
+the answer text without needing a separate retrieval API.
+"""
+
+from __future__ import annotations
+
+import argparse
+import asyncio
+import json
+import logging
+from collections import defaultdict
+from dataclasses import dataclass
+from pathlib import Path
+from typing import Any
+
+from ....core.arms import ArmRequest, ArmResult, SurfSenseArm
+from ....core.config import utc_iso_timestamp
+from ....core.ingest_settings import (
+    IngestSettings,
+    add_ingest_settings_args,
+    format_ingest_settings_md,
+    is_settings_header,
+    read_settings_header,
+)
+from ....core.metrics.retrieval import score_run
+from ....core.registry import (
+    Benchmark,
+    ReportSection,
+    RunArtifact,
+    RunContext,
+)
+
+logger = logging.getLogger(__name__)
+
+
+_PROMPT = """\
+You are a medical literature retrieval assistant for the question
+below. Identify the top passages from the knowledge base that best
+answer it and cite each one in the standard format
+[citation:CHUNK_ID]. List as many citations as are useful, ordered
+from most to least relevant. Provide a one-sentence justification
+for each citation.
+
+Query: {query}
+"""
+
+
+_DESCRIPTION = "CUREv1 retrieval (single-arm SurfSense): Recall@k / MRR / nDCG@10."
+
+# CUREv1 corpus is text-only markdown bundles; vision LLM at ingest
+# is wasted by default but the operator can flip it via CLI for an
+# A/B comparison.
+_DEFAULT_INGEST_SETTINGS = IngestSettings(
+    use_vision_llm=False,
+    processing_mode="basic",
+    should_summarize=False,
+)
+
+
+@dataclass
+class CureQuery:
+    qid: str
+    text: str
+    discipline: str
+
+
+def _load_chunk_map(maps_dir: Path) -> dict[int, int]:
+    """Union all ``cure_chunk_map_<discipline>.jsonl`` into one dict."""
+
+    out: dict[int, int] = {}
+    for path in sorted(maps_dir.glob("cure_chunk_map_*.jsonl")):
+        with path.open("r", encoding="utf-8") as fh:
+            for line in fh:
+                if not line.strip():
+                    continue
+                row = json.loads(line)
+                if is_settings_header(row):
+                    continue
+                try:
+                    out[int(row["chunk_id"])] = int(row["document_id"])
+                except (KeyError, TypeError, ValueError):
+                    continue
+    return out
+
+
+def _load_doc_to_corpus(maps_dir: Path) -> dict[int, list[str]]:
+    """Map ``document_id -> [corpus_id, ...]`` from the union map.
+
+    Multiple corpus passages may live in one batched markdown
+    document, so each doc_id maps to a list. Citation ordering of the
+    first occurrence is preserved.
+    """
+
+    out: dict[int, list[str]] = defaultdict(list)
+    union_path = maps_dir / "cure_corpus_map.jsonl"
+    if not union_path.exists():
+        return out
+    with union_path.open("r", encoding="utf-8") as fh:
+        for line in fh:
+            if not line.strip():
+                continue
+            row = json.loads(line)
+            if is_settings_header(row):
+                continue
+            try:
+                out[int(row["document_id"])].append(str(row["corpus_id"]))
+            except (KeyError, TypeError, ValueError):
+                continue
+    return out
+
+
+def _load_queries(*, lang: str, disciplines: list[str], sample_n: int | None) -> list[CureQuery]:
+    from datasets import load_dataset  # noqa: PLC0415
+
+    out: list[CureQuery] = []
+    for discipline in disciplines:
+        try:
+            ds = load_dataset(path="clinia/CUREv1", name=f"queries-{lang}", split=discipline)
+        except Exception as exc:  # noqa: BLE001
+            logger.warning("Skipping queries for %s/%s: %s", lang, discipline, exc)
+            continue
+        for row in ds:
+            qid = str(row.get("_id") or "")
+            text = str(row.get("text") or "")
+            if not qid or not text:
+                continue
+            out.append(CureQuery(qid=qid, text=text, discipline=discipline))
+    out.sort(key=lambda q: (q.discipline, q.qid))
+    if sample_n is not None and sample_n > 0:
+        # Stratified-by-discipline slice.
+        per_d = max(1, sample_n // max(1, len(disciplines)))
+        sliced: list[CureQuery] = []
+        counter: dict[str, int] = defaultdict(int)
+        for q in out:
+            if counter[q.discipline] >= per_d:
+                continue
+            sliced.append(q)
+            counter[q.discipline] += 1
+            if len(sliced) >= sample_n:
+                break
+        out = sliced
+    return out
+
+
+def _load_qrels(*, disciplines: list[str]) -> dict[str, dict[str, float]]:
+    from datasets import load_dataset  # noqa: PLC0415
+
+    out: dict[str, dict[str, float]] = defaultdict(dict)
+    for discipline in disciplines:
+        try:
+            ds = load_dataset(path="clinia/CUREv1", name="qrels", split=discipline)
+        except Exception as exc:  # noqa: BLE001
+            logger.warning("Skipping qrels for %s: %s", discipline, exc)
+            continue
+        for row in ds:
+            qid = str(row.get("query-id") or row.get("query_id") or "")
+            cid = str(row.get("corpus-id") or row.get("corpus_id") or "")
+            score = row.get("score")
+            if not qid or not cid or score is None:
+                continue
+            try:
+                out[qid][cid] = float(score)
+            except (TypeError, ValueError):
+                continue
+    return out
+
+
+async def _gather_with_limit(coros, *, concurrency: int) -> list[Any]:
+    sem = asyncio.Semaphore(max(1, concurrency))
+
+    async def _wrap(c):
+        async with sem:
+            return await c
+
+    return await asyncio.gather(*(_wrap(c) for c in coros))
+
+
+class CureBenchmark:
+    suite: str = "medical"
+    name: str = "cure"
+    headline: bool = False
+    description: str = _DESCRIPTION
+
+    def add_run_args(self, parser: argparse.ArgumentParser) -> None:
+        parser.add_argument("--lang", default="en", choices=("en", "es", "fr"))
+        parser.add_argument("--discipline", default=None,
+                            help="Restrict to one discipline (default: all ingested).")
+        parser.add_argument("--n", dest="sample_n", type=int, default=None)
+        parser.add_argument("--concurrency", type=int, default=4)
+        parser.add_argument(
+            "--max-passages-per-discipline", type=int, default=None,
+            help="(ingest only) cap corpus rows per discipline for smoke testing.",
+        )
+        # Per-upload knobs forwarded to /documents/fileupload at ingest;
+        # ignored at run-time (runner reads resolved settings from the
+        # union-map header).
+        add_ingest_settings_args(parser, defaults=_DEFAULT_INGEST_SETTINGS)
+
+    async def ingest(self, ctx: RunContext, **opts: Any) -> None:
+        from .ingest import DISCIPLINES, run_ingest
+
+        settings = IngestSettings.merge(_DEFAULT_INGEST_SETTINGS, opts)
+        await run_ingest(
+            ctx,
+            disciplines=list(DISCIPLINES),
+            max_per_discipline=opts.get("max_passages_per_discipline"),
+            settings=settings,
+        )
+
+    async def run(self, ctx: RunContext, **opts: Any) -> RunArtifact:
+        lang = opts.get("lang") or "en"
+        discipline_filter = opts.get("discipline")
+        sample_n = opts.get("sample_n")
+        concurrency = int(opts.get("concurrency") or 4)
+
+        maps_dir = ctx.maps_dir()
+        chunk_to_doc = _load_chunk_map(maps_dir)
+        doc_to_corpus = _load_doc_to_corpus(maps_dir)
+        ingest_settings = read_settings_header(maps_dir / "cure_corpus_map.jsonl")
+        if not chunk_to_doc or not doc_to_corpus:
+            raise RuntimeError(
+                "CUREv1 not ingested for this suite. Run "
+                "`python -m surfsense_evals ingest medical cure` first."
+            )
+
+        # Disciplines to query are determined by the per-discipline maps
+        # actually present (either user-filtered or whatever was ingested).
+        ingested_disciplines = sorted({
+            row_disc
+            for path in maps_dir.glob("cure_corpus_map_*.jsonl")
+            for row_disc in [path.stem[len("cure_corpus_map_"):]]
+        })
+        if discipline_filter:
+            disciplines = [discipline_filter]
+        else:
+            disciplines = ingested_disciplines or ["dermatology"]
+
+        queries = _load_queries(lang=lang, disciplines=disciplines, sample_n=sample_n)
+        if not queries:
+            raise RuntimeError(
+                f"No CUREv1 queries matched lang={lang!r} disciplines={disciplines!r}."
+            )
+        qrels = _load_qrels(disciplines=disciplines)
+        logger.info(
+            "CUREv1: %d queries / %d qrels across disciplines %s",
+            len(queries),
+            len(qrels),
+            disciplines,
+        )
+
+        arm = SurfSenseArm(
+            client=ctx.new_chat_client(),
+            search_space_id=ctx.search_space_id,
+            ephemeral_threads=True,
+        )
+
+        async def _ask(q: CureQuery) -> ArmResult:
+            return await arm.answer(
+                ArmRequest(
+                    question_id=f"{q.discipline}::{q.qid}",
+                    prompt=_PROMPT.format(query=q.text.strip()),
+                )
+            )
+
+        results: list[ArmResult] = await _gather_with_limit(
+            (_ask(q) for q in queries), concurrency=concurrency
+        )
+
+        per_query_retrieved: dict[str, list[str]] = {}
+        for q, res in zip(queries, results):
+            chunk_ids: list[int] = []
+            seen: set[int] = set()
+            for citation in res.citations:
+                if citation.get("kind") != "chunk":
+                    continue
+                cid = int(citation.get("chunk_id"))
+                if cid in seen:
+                    continue
+                chunk_ids.append(cid)
+                seen.add(cid)
+            corpus_ids: list[str] = []
+            seen_corpus: set[str] = set()
+            for cid in chunk_ids:
+                doc_id = chunk_to_doc.get(cid)
+                if doc_id is None:
+                    continue
+                for corpus_id in doc_to_corpus.get(doc_id, []):
+                    if corpus_id in seen_corpus:
+                        continue
+                    corpus_ids.append(corpus_id)
+                    seen_corpus.add(corpus_id)
+            per_query_retrieved[q.qid] = corpus_ids
+
+        scores = score_run(
+            per_query_retrieved=per_query_retrieved,
+            per_query_qrels=qrels,
+            ks=(1, 5, 10, 32),
+            ndcg_k=10,
+        )
+
+        run_timestamp = utc_iso_timestamp()
+        run_dir = ctx.runs_dir(run_timestamp=run_timestamp)
+        raw_path = run_dir / "raw.jsonl"
+        with raw_path.open("w", encoding="utf-8") as fh:
+            for q, res in zip(queries, results):
+                fh.write(
+                    json.dumps(
+                        {
+                            "discipline": q.discipline,
+                            "qid": q.qid,
+                            "lang": lang,
+                            "retrieved_corpus_ids": per_query_retrieved.get(q.qid, []),
+                            **res.to_jsonl(),
+                        }
+                    )
+                    + "\n"
+                )
+
+        metrics = scores.to_dict()
+        metrics["lang"] = lang
+        metrics["disciplines"] = disciplines
+
+        artifact = RunArtifact(
+            suite=self.suite,
+            benchmark=self.name,
+            run_timestamp=run_timestamp,
+            raw_path=raw_path,
+            metrics=metrics,
+            extra={
+                "n_queries": len(queries),
+                "lang": lang,
+                "disciplines": disciplines,
+                "concurrency": concurrency,
+                "provider_model": ctx.provider_model,
+                "ingest_settings": ingest_settings,
+            },
+        )
+        manifest_path = run_dir / "run_artifact.json"
+        manifest_path.write_text(
+            json.dumps(
+                {
+                    "suite": self.suite,
+                    "benchmark": self.name,
+                    "raw_path": "raw.jsonl",
+                    "metrics": metrics,
+                    "extra": artifact.extra,
+                },
+                indent=2,
+                sort_keys=True,
+            )
+            + "\n",
+            encoding="utf-8",
+        )
+        return artifact
+
+    def report_section(self, artifacts: list[RunArtifact]) -> ReportSection:
+        if not artifacts:
+            return ReportSection(
+                title="CUREv1 — single-arm SurfSense retrieval",
+                headline=False,
+                body_md="(no run artifacts found)",
+                body_json={},
+            )
+        latest = max(artifacts, key=lambda a: a.run_timestamp)
+        m = latest.metrics
+        recall = m.get("recall_at_k", {})
+        lines: list[str] = [
+            format_ingest_settings_md(latest.extra.get("ingest_settings")),
+            f"- Language: {m.get('lang', '?')}",
+            f"- Disciplines: {', '.join(m.get('disciplines', []) or ['?'])}",
+            f"- n_queries (after qrels intersection): {m.get('n_queries', 0)}",
+        ]
+        for k in (1, 5, 10, 32):
+            v = recall.get(str(k), recall.get(k))
+            if v is not None:
+                lines.append(f"- Recall@{k}: {float(v):.3f}")
+        lines.append(f"- MRR: {float(m.get('mrr', 0.0)):.3f}")
+        lines.append(f"- nDCG@10: {float(m.get('ndcg_at_10', 0.0)):.3f}")
+        return ReportSection(
+            title="CUREv1 — single-arm SurfSense retrieval",
+            headline=False,
+            body_md="\n".join(lines),
+            body_json=m,
+        )
+
+
+__all__ = ["CureBenchmark", "CureQuery"]
diff --git a/surfsense_evals/src/surfsense_evals/suites/medical/medxpertqa/__init__.py b/surfsense_evals/src/surfsense_evals/suites/medical/medxpertqa/__init__.py
new file mode 100644
index 000000000..3e803398d
--- /dev/null
+++ b/surfsense_evals/src/surfsense_evals/suites/medical/medxpertqa/__init__.py
@@ -0,0 +1,25 @@
+"""MedXpertQA-MM — multimodal medical exam head-to-head (medical suite headline).
+
+Source: https://huggingface.co/datasets/TsinghuaC3I/MedXpertQA
+Paper:  https://arxiv.org/abs/2501.18362 (ICML 2025)
+
+* MM subset: ~2,000 expert-level exam questions with diverse medical
+  images (radiology, dermatology, pathology, ECGs, gross specimens,
+  fundus photos) and structured patient information embedded in the
+  question stem.
+* 5 answer choices per MM question (A–E).
+* USMLE / COMLEX / 17 specialty board sources; rigorously filtered
+  and reviewed by physicians.
+
+Real diagnostic images carry signal that text-only patient charts
+cannot (e.g. CT scans, dermoscopy), so this benchmark exercises the
+full vision RAG pipeline end-to-end against a vision-capable model
+fed the same PDF natively.
+"""
+
+from __future__ import annotations
+
+from ....core import registry as _registry
+from .runner import MedXpertQAMMBenchmark
+
+_registry.register(MedXpertQAMMBenchmark())
diff --git a/surfsense_evals/src/surfsense_evals/suites/medical/medxpertqa/ingest.py b/surfsense_evals/src/surfsense_evals/suites/medical/medxpertqa/ingest.py
new file mode 100644
index 000000000..5293e116f
--- /dev/null
+++ b/surfsense_evals/src/surfsense_evals/suites/medical/medxpertqa/ingest.py
@@ -0,0 +1,394 @@
+"""MedXpertQA-MM ingestion.
+
+Steps:
+
+1. Pull ``MM/test.jsonl`` (and optionally ``MM/dev.jsonl``) plus
+   ``images.zip`` from
+   ``hf://datasets/TsinghuaC3I/MedXpertQA``. Cache under
+   ``<data_dir>/medical/medxpertqa/``.
+2. Extract ``images.zip`` once into ``<data_dir>/medical/medxpertqa/images/``.
+3. Render one PDF per MM question (text question + structured patient
+   info embedded in the question stem + each image flowable + answer
+   options). Output: ``<data_dir>/medical/medxpertqa/pdfs/<id>.pdf``.
+4. Upload each PDF to SurfSense with ``use_vision_llm=True``; persist
+   ``id -> document_id`` in
+   ``<data_dir>/medical/maps/medxpertqa_doc_map.jsonl``.
+
+Both arms then receive byte-identical PDFs. The native arm sends the
+PDF directly to OpenRouter; SurfSense ingests via its own vision
+pipeline and the runner queries with ``mentioned_document_ids=[...]``
+to scope retrieval to the question's PDF.
+"""
+
+from __future__ import annotations
+
+import json
+import logging
+import zipfile
+from collections.abc import Iterable
+from dataclasses import dataclass
+from pathlib import Path
+
+from ....core.config import set_suite_state
+from ....core.ingest_settings import IngestSettings, settings_header_line
+from ....core.pdf import PdfImage, render_pdf_with_images
+from ....core.registry import RunContext
+from .prompt import format_options
+
+logger = logging.getLogger(__name__)
+
+
+HF_REPO_ID = "TsinghuaC3I/MedXpertQA"
+HF_REPO_TYPE = "dataset"
+
+
+def _hf_hub_download(*args, **kwargs):
+    from huggingface_hub import hf_hub_download
+
+    return hf_hub_download(*args, **kwargs)
+
+
+# ---------------------------------------------------------------------------
+# Question shape
+# ---------------------------------------------------------------------------
+
+
+@dataclass
+class MedXpertQuestion:
+    qid: str                         # e.g. "MM-26"
+    question: str                    # full question text (case + ask)
+    options: dict[str, str]          # A-E
+    label: str                       # "A".."E"
+    image_files: list[str]           # filenames inside images.zip
+    medical_task: str
+    body_system: str
+    question_type: str
+    split: str                       # "test" or "dev"
+
+
+def _load_jsonl(path: Path, *, split: str) -> list[MedXpertQuestion]:
+    out: list[MedXpertQuestion] = []
+    with path.open("r", encoding="utf-8") as fh:
+        for raw_line in fh:
+            line = raw_line.strip()
+            if not line:
+                continue
+            row = json.loads(line)
+            qid = str(row.get("id") or "").strip()
+            question = str(row.get("question") or "").strip()
+            options = row.get("options") or {}
+            label = str(row.get("label") or "").strip().upper()
+            if not qid or not question or not isinstance(options, dict) or not label:
+                continue
+            opts = {str(k).strip().upper(): str(v).strip() for k, v in options.items()}
+            images = row.get("images") or []
+            if not isinstance(images, list):
+                images = []
+            out.append(MedXpertQuestion(
+                qid=qid,
+                question=question,
+                options=opts,
+                label=label,
+                image_files=[str(x).strip() for x in images if str(x).strip()],
+                medical_task=str(row.get("medical_task") or "").strip(),
+                body_system=str(row.get("body_system") or "").strip(),
+                question_type=str(row.get("question_type") or "").strip(),
+                split=split,
+            ))
+    return out
+
+
+# ---------------------------------------------------------------------------
+# Image archive helpers
+# ---------------------------------------------------------------------------
+
+
+def _ensure_images_extracted(images_zip: Path, images_dir: Path) -> None:
+    """Extract images.zip once, tolerantly handle re-runs."""
+
+    marker = images_dir / ".extracted_ok"
+    if marker.exists():
+        return
+    images_dir.mkdir(parents=True, exist_ok=True)
+    logger.info("Extracting MedXpertQA images.zip -> %s", images_dir)
+    with zipfile.ZipFile(images_zip) as zf:
+        zf.extractall(images_dir)
+    marker.write_text("ok\n", encoding="utf-8")
+
+
+def _resolve_image_path(image_filename: str, images_dir: Path) -> Path | None:
+    """Find a question's image in the (possibly nested) extract directory.
+
+    The zip layout sometimes nests under ``images/`` and sometimes
+    flat — handle both.
+    """
+
+    direct = images_dir / image_filename
+    if direct.exists():
+        return direct
+    nested = images_dir / "images" / image_filename
+    if nested.exists():
+        return nested
+    # Last-ditch: glob recursively (slow but correct for unusual layouts).
+    matches = list(images_dir.rglob(image_filename))
+    return matches[0] if matches else None
+
+
+# ---------------------------------------------------------------------------
+# PDF rendering
+# ---------------------------------------------------------------------------
+
+
+def _render_question_pdf(
+    q: MedXpertQuestion,
+    *,
+    images_dir: Path,
+    pdfs_dir: Path,
+) -> tuple[Path, list[str]]:
+    """Render one MedXpertQA question into a PDF.
+
+    Layout:
+      Title:    MedXpertQA — <qid>  (medical_task / body_system)
+      Section 1 (case):       full question text
+      Section 1 images:       each image flowable + caption
+      Section 2 (options):    A) ... B) ... C) ... D) ... E) ...
+
+    Returns (pdf_path, missing_images) so the caller can warn on
+    questions where some image files weren't found.
+    """
+
+    out_path = pdfs_dir / f"{q.qid}.pdf"
+    images: list[PdfImage] = []
+    missing: list[str] = []
+    for fname in q.image_files:
+        resolved = _resolve_image_path(fname, images_dir)
+        if resolved is None:
+            missing.append(fname)
+            continue
+        images.append(PdfImage(path=resolved, caption=f"Image: {fname}", max_width_in=5.5))
+
+    title_meta_parts = []
+    if q.medical_task:
+        title_meta_parts.append(q.medical_task)
+    if q.body_system:
+        title_meta_parts.append(q.body_system)
+    if q.question_type:
+        title_meta_parts.append(q.question_type)
+    title_suffix = f" ({' / '.join(title_meta_parts)})" if title_meta_parts else ""
+
+    sections = [
+        ("Clinical case", q.question, images),
+        ("Answer choices", format_options(q.options), None),
+    ]
+    render_pdf_with_images(
+        title=f"MedXpertQA-MM {q.qid}{title_suffix}",
+        sections=sections,
+        output_path=out_path,
+    )
+    return out_path, missing
+
+
+# ---------------------------------------------------------------------------
+# Upload helper
+# ---------------------------------------------------------------------------
+
+
+async def _upload_pdfs(
+    ctx: RunContext,
+    pdf_paths: Iterable[Path],
+    *,
+    batch_size: int,
+    settings: IngestSettings,
+) -> dict[str, int]:
+    docs_client = ctx.documents_client()
+    name_to_id: dict[str, int] = {}
+    pdf_list = list(pdf_paths)
+    for batch_start in range(0, len(pdf_list), batch_size):
+        batch = pdf_list[batch_start:batch_start + batch_size]
+        result = await docs_client.upload(
+            files=batch,
+            search_space_id=ctx.search_space_id,
+            should_summarize=settings.should_summarize,
+            use_vision_llm=settings.use_vision_llm,
+            processing_mode=settings.processing_mode,
+        )
+        all_ids = list(result.document_ids) + list(result.duplicate_document_ids)
+        if all_ids:
+            await docs_client.wait_until_ready(
+                search_space_id=ctx.search_space_id,
+                document_ids=result.document_ids,
+                timeout_s=1800.0,
+            )
+            statuses = await docs_client.get_status(
+                search_space_id=ctx.search_space_id,
+                document_ids=all_ids,
+            )
+            for s in statuses:
+                name_to_id[s.title] = s.document_id
+        logger.info(
+            "Uploaded MedXpertQA batch %d-%d: %d new, %d duplicate",
+            batch_start, batch_start + len(batch),
+            len(result.document_ids), len(result.duplicate_document_ids),
+        )
+    return name_to_id
+
+
+# ---------------------------------------------------------------------------
+# Public entry point
+# ---------------------------------------------------------------------------
+
+
+async def run_ingest(
+    ctx: RunContext,
+    *,
+    split: str = "test",
+    max_questions: int | None = None,
+    upload_batch_size: int = 8,
+    skip_upload: bool = False,
+    include_dev: bool = False,
+    settings: IngestSettings | None = None,
+) -> None:
+    """Ingest MedXpertQA-MM into the medical suite.
+
+    Parameters
+    ----------
+    split : 'test' (default), 'dev', or 'both'
+        Which subset to render + upload.
+    max_questions : int | None
+        Cap on number of questions ingested (handy for fast iteration).
+    upload_batch_size : int
+        PDFs per ``fileupload`` call.
+    skip_upload : bool
+        Render PDFs locally but don't push to SurfSense.
+    include_dev : bool
+        Convenience: equivalent to ``split='both'``.
+    """
+
+    settings = settings or IngestSettings(use_vision_llm=True, processing_mode="basic")
+    bench_dir = ctx.benchmark_data_dir()
+    images_zip_local = bench_dir / "images.zip"
+    images_dir = bench_dir / "images"
+    pdfs_dir = bench_dir / "pdfs"
+    pdfs_dir.mkdir(parents=True, exist_ok=True)
+    hf_cache = bench_dir / ".hf_cache"
+    hf_cache.mkdir(parents=True, exist_ok=True)
+
+    # Step 1: download jsonl(s)
+    splits_to_load: list[str] = []
+    if split == "both" or include_dev:
+        splits_to_load = ["dev", "test"]
+    elif split in {"dev", "test"}:
+        splits_to_load = [split]
+    else:
+        raise ValueError(f"Unknown split {split!r}; use 'test' / 'dev' / 'both'")
+
+    questions: list[MedXpertQuestion] = []
+    for sp in splits_to_load:
+        rel = f"MM/{sp}.jsonl"
+        local = _hf_hub_download(
+            repo_id=HF_REPO_ID,
+            filename=rel,
+            repo_type=HF_REPO_TYPE,
+            cache_dir=str(hf_cache),
+        )
+        loaded = _load_jsonl(Path(local), split=sp)
+        questions.extend(loaded)
+        logger.info("Loaded %d MedXpertQA-MM questions from %s split", len(loaded), sp)
+
+    if max_questions is not None and max_questions > 0:
+        questions = questions[:max_questions]
+    if not questions:
+        raise RuntimeError("No MedXpertQA-MM questions loaded; check the split argument.")
+
+    # Step 2: download images.zip + extract once
+    if not images_zip_local.exists():
+        local_zip = _hf_hub_download(
+            repo_id=HF_REPO_ID,
+            filename="images.zip",
+            repo_type=HF_REPO_TYPE,
+            cache_dir=str(hf_cache),
+        )
+        # Materialise into bench_dir so the path is stable.
+        try:
+            from os import link as _link
+            _link(local_zip, images_zip_local)
+        except OSError:
+            from shutil import copy2
+            copy2(local_zip, images_zip_local)
+    _ensure_images_extracted(images_zip_local, images_dir)
+
+    # Step 3: render PDFs
+    pdf_paths: dict[str, Path] = {}
+    missing_image_count = 0
+    for i, q in enumerate(questions, start=1):
+        try:
+            pdf, missing = _render_question_pdf(q, images_dir=images_dir, pdfs_dir=pdfs_dir)
+            pdf_paths[q.qid] = pdf
+            if missing:
+                missing_image_count += len(missing)
+                logger.debug("qid=%s missing %d images: %s", q.qid, len(missing), missing)
+        except Exception as exc:  # noqa: BLE001
+            logger.warning("Failed to render MedXpertQA PDF for %s: %s", q.qid, exc)
+        if i % 50 == 0:
+            logger.info("  ... rendered %d / %d PDFs", i, len(questions))
+    if missing_image_count:
+        logger.warning(
+            "MedXpertQA: %d image references could not be resolved on disk "
+            "(rendered PDFs may be missing some images).",
+            missing_image_count,
+        )
+
+    # Step 4: upload
+    name_to_id: dict[str, int] = {}
+    if skip_upload:
+        logger.info("MedXpertQA: --skip-upload set; skipping SurfSense ingestion")
+    else:
+        logger.info("MedXpertQA upload settings: %s", settings.render_label())
+        name_to_id = await _upload_pdfs(
+            ctx,
+            pdf_paths.values(),
+            batch_size=upload_batch_size,
+            settings=settings,
+        )
+
+    # Step 5: persist manifest + questions
+    questions_jsonl = bench_dir / "questions.jsonl"
+    with questions_jsonl.open("w", encoding="utf-8") as fh:
+        for q in questions:
+            fh.write(json.dumps({
+                "qid": q.qid,
+                "question": q.question,
+                "options": q.options,
+                "label": q.label,
+                "image_files": q.image_files,
+                "medical_task": q.medical_task,
+                "body_system": q.body_system,
+                "question_type": q.question_type,
+                "split": q.split,
+            }) + "\n")
+    logger.info("Wrote %d MedXpertQA questions to %s", len(questions), questions_jsonl)
+
+    map_path = ctx.maps_dir() / "medxpertqa_doc_map.jsonl"
+    with map_path.open("w", encoding="utf-8") as fh:
+        # Header line records the resolved ingest settings
+        # (see core/ingest_settings.py).
+        fh.write(settings_header_line(settings) + "\n")
+        for q in questions:
+            local = pdf_paths.get(q.qid)
+            if local is None:
+                continue
+            fh.write(json.dumps({
+                "qid": q.qid,
+                "document_id": name_to_id.get(local.name),
+                "pdf_path": str(local),
+                "n_images": len(q.image_files),
+                "split": q.split,
+            }) + "\n")
+    logger.info("Wrote MedXpertQA doc map to %s", map_path)
+
+    new_state = ctx.suite_state
+    new_state.ingestion_maps["medxpertqa"] = str(map_path)
+    set_suite_state(ctx.config, ctx.suite, new_state)
+
+
+__all__ = ["MedXpertQuestion", "run_ingest"]
diff --git a/surfsense_evals/src/surfsense_evals/suites/medical/medxpertqa/prompt.py b/surfsense_evals/src/surfsense_evals/suites/medical/medxpertqa/prompt.py
new file mode 100644
index 000000000..5c4a69916
--- /dev/null
+++ b/surfsense_evals/src/surfsense_evals/suites/medical/medxpertqa/prompt.py
@@ -0,0 +1,54 @@
+"""MedXpertQA-MM prompt.
+
+Mirrors the upstream paper's evaluation prompt (Zuo et al., ICML 2025
+§3.4): present case + 5 options A-E, ask for a single letter answer.
+We also instruct the model to use the embedded images explicitly,
+since the whole point of the MM subset is that the answer depends on
+visual evidence (radiology / dermoscopy / pathology / ECG, etc.).
+"""
+
+from __future__ import annotations
+
+from collections.abc import Mapping
+
+ANSWER_LETTERS = ("A", "B", "C", "D", "E")
+
+
+_PROMPT = """\
+You are a board-certified physician. The following exam question
+includes a clinical case and one or more medical images (radiology,
+dermatology, pathology, ECG, etc.). Use BOTH the text and the images
+to choose the best answer. Do not rely on memorisation of the case;
+read the images carefully — they often determine the correct answer.
+
+Case + question:
+{question}
+
+Answer choices:
+{options_block}
+
+Respond on a single line in the format `Answer: X` where X is one of
+A, B, C, D, or E.
+"""
+
+
+def format_options(options: Mapping[str, str]) -> str:
+    """Render the ``A) ... E) ...`` options block."""
+
+    parts: list[str] = []
+    for letter in ANSWER_LETTERS:
+        text = options.get(letter)
+        if text is None or str(text).strip() == "":
+            continue
+        parts.append(f"{letter}) {str(text).strip()}")
+    return "\n".join(parts)
+
+
+def build_prompt(question: str, options: Mapping[str, str]) -> str:
+    return _PROMPT.format(
+        question=question.strip(),
+        options_block=format_options(options),
+    )
+
+
+__all__ = ["ANSWER_LETTERS", "build_prompt", "format_options"]
diff --git a/surfsense_evals/src/surfsense_evals/suites/medical/medxpertqa/runner.py b/surfsense_evals/src/surfsense_evals/suites/medical/medxpertqa/runner.py
new file mode 100644
index 000000000..75646ef32
--- /dev/null
+++ b/surfsense_evals/src/surfsense_evals/suites/medical/medxpertqa/runner.py
@@ -0,0 +1,681 @@
+"""MedXpertQA-MM runner — Native PDF (vision) vs SurfSense (vision RAG).
+
+Headline benchmark for the medical suite.
+
+* Native arm reads the rendered PDF (case + images + options) via
+  OpenRouter ``chat/completions`` + the file-parser plugin.
+* SurfSense arm queries ``POST /api/v1/new_chat`` scoped via
+  ``mentioned_document_ids=[doc_id]`` to the same per-question PDF.
+
+Operational notes:
+
+* PDFs contain real images (radiology, dermoscopy, pathology, ECGs).
+  Operator must pin a vision-capable model via
+  ``setup --provider-model anthropic/claude-sonnet-4.5`` (or similar);
+  the runner emits a warning if a known text-only slug is pinned.
+* MedXpertQA tags ``medical_task`` (Diagnosis / Treatment / Basic
+  Medicine) and ``body_system`` (Cardiovascular / Lymphatic / …)
+  directly on every row; we slice the report by both.
+"""
+
+from __future__ import annotations
+
+import argparse
+import asyncio
+import json
+import logging
+import os
+from collections.abc import Iterable
+from dataclasses import dataclass
+from pathlib import Path
+from typing import Any
+
+from ....core.arms import ArmRequest, ArmResult, NativePdfArm, SurfSenseArm
+from ....core.config import utc_iso_timestamp
+from ....core.ingest_settings import (
+    IngestSettings,
+    add_ingest_settings_args,
+    format_ingest_settings_md,
+    is_settings_header,
+)
+from ....core.metrics.comparison import (
+    bootstrap_delta_ci,
+    mcnemar_test,
+    paired_aggregate,
+)
+from ....core.metrics.mc_accuracy import accuracy_with_wilson_ci
+from ....core.providers.openrouter_pdf import OpenRouterPdfProvider, PdfEngine
+from ....core.registry import (
+    ReportSection,
+    RunArtifact,
+    RunContext,
+)
+from ....core.scenarios import format_scenario_md
+from .prompt import ANSWER_LETTERS, build_prompt
+
+logger = logging.getLogger(__name__)
+
+
+_TEXT_ONLY_HINTS = ("gpt-5.4-mini", "gpt-3.5", "text-only", "instruct-")
+
+
+@dataclass
+class MXQuestion:
+    qid: str
+    question: str
+    options: dict[str, str]
+    label: str
+    medical_task: str
+    body_system: str
+    question_type: str
+    split: str
+    n_images: int
+    pdf_path: Path
+    document_id: int | None
+
+
+def _load_doc_map(map_path: Path) -> tuple[dict[str, dict[str, Any]], dict[str, Any]]:
+    """Read the doc map JSONL.
+
+    Returns ``(rows, settings)`` where ``settings`` is the
+    ``__settings__`` header blob (or ``{}`` for legacy maps).
+    """
+
+    rows: dict[str, dict[str, Any]] = {}
+    settings: dict[str, Any] = {}
+    with map_path.open("r", encoding="utf-8") as fh:
+        for line in fh:
+            line = line.strip()
+            if not line:
+                continue
+            row = json.loads(line)
+            if is_settings_header(row):
+                settings = dict(row["__settings__"])
+                continue
+            rows[str(row["qid"])] = row
+    return rows, settings
+
+
+def _load_questions(
+    questions_jsonl: Path,
+    doc_map: dict[str, dict[str, Any]],
+    *,
+    split_filter: str | None,
+    task_filter: str | None,
+    body_filter: str | None,
+    require_images: bool,
+    sample_n: int | None,
+) -> list[MXQuestion]:
+    out: list[MXQuestion] = []
+    with questions_jsonl.open("r", encoding="utf-8") as fh:
+        for line in fh:
+            line = line.strip()
+            if not line:
+                continue
+            row = json.loads(line)
+            qid = str(row.get("qid") or "").strip()
+            if not qid:
+                continue
+            if split_filter and split_filter != "all" and row.get("split") != split_filter:
+                continue
+            if task_filter and task_filter != "all" and row.get("medical_task") != task_filter:
+                continue
+            if body_filter and body_filter != "all" and row.get("body_system") != body_filter:
+                continue
+            map_row = doc_map.get(qid)
+            if map_row is None:
+                logger.debug("No doc-map entry for %s; skipping", qid)
+                continue
+            n_images = int(map_row.get("n_images", 0))
+            if require_images and n_images <= 0:
+                continue
+            out.append(MXQuestion(
+                qid=qid,
+                question=str(row.get("question") or ""),
+                options={str(k).upper(): str(v) for k, v in (row.get("options") or {}).items()},
+                label=str(row.get("label") or "").strip().upper(),
+                medical_task=str(row.get("medical_task") or "").strip(),
+                body_system=str(row.get("body_system") or "").strip(),
+                question_type=str(row.get("question_type") or "").strip(),
+                split=str(row.get("split") or ""),
+                n_images=n_images,
+                pdf_path=Path(map_row["pdf_path"]),
+                document_id=map_row.get("document_id"),
+            ))
+    out.sort(key=lambda q: (q.split, q.qid))
+    if sample_n is not None and sample_n > 0:
+        out = out[:sample_n]
+    return out
+
+
+async def _gather_with_limit(coros: Iterable, *, concurrency: int) -> list[Any]:
+    sem = asyncio.Semaphore(max(1, concurrency))
+
+    async def _wrap(coro):
+        async with sem:
+            return await coro
+
+    return await asyncio.gather(*(_wrap(c) for c in coros))
+
+
+_DESCRIPTION = (
+    "MedXpertQA-MM (~2,000 multimodal medical exam questions, 5 options, with images) — "
+    "Native PDF (vision) vs SurfSense (vision RAG) head-to-head."
+)
+
+# MedXpertQA-MM PDFs embed clinical images; vision LLM at ingest is
+# the whole point. Operators can flip ``--no-vision-llm`` to measure
+# how much we degrade without it (likely material).
+_DEFAULT_INGEST_SETTINGS = IngestSettings(
+    use_vision_llm=True,
+    processing_mode="basic",
+    should_summarize=False,
+)
+
+
+class MedXpertQAMMBenchmark:
+    """Multimodal medical exam head-to-head."""
+
+    suite: str = "medical"
+    name: str = "medxpertqa"
+    headline: bool = True  # The medical suite headline.
+    description: str = _DESCRIPTION
+
+    def add_run_args(self, parser: argparse.ArgumentParser) -> None:
+        parser.add_argument(
+            "--split", default="test", choices=["test", "dev", "all"],
+            help="Which MedXpertQA-MM split to run (default: test).",
+        )
+        parser.add_argument(
+            "--task", default="all",
+            help="Filter by medical_task value (e.g. Diagnosis, Treatment, Basic Medicine).",
+        )
+        parser.add_argument(
+            "--body-system", dest="body_filter", default="all",
+            help="Filter by body_system value (e.g. Cardiovascular, Lymphatic).",
+        )
+        parser.add_argument(
+            "--require-images", dest="require_images", action="store_true",
+            help="Skip rare MM rows that ended up with zero resolvable images.",
+        )
+        parser.add_argument("--n", dest="sample_n", type=int, default=None,
+                            help="Run only the first N questions after filters apply.")
+        parser.add_argument("--concurrency", type=int, default=4,
+                            help="Parallel question workers per arm.")
+        parser.add_argument("--no-mentions", dest="no_mentions", action="store_true",
+                            help="SurfSense arm: skip mentioned_document_ids (unscoped retrieval).")
+        parser.add_argument(
+            "--pdf-engine", default="native",
+            choices=[e.value for e in PdfEngine],
+            help="OpenRouter file-parser engine for the native arm.",
+        )
+        parser.add_argument(
+            "--max-output-tokens", type=int, default=512,
+            help="Cap on completion length for both arms.",
+        )
+        # Ingest-only knobs (forwarded by the CLI to ingest.run_ingest).
+        parser.add_argument(
+            "--max-questions", dest="max_questions", type=int, default=None,
+            help="(ingest only) cap on number of MM questions to render + upload.",
+        )
+        parser.add_argument(
+            "--upload-batch-size", dest="upload_batch_size", type=int, default=8,
+            help="(ingest only) PDFs per fileupload call.",
+        )
+        parser.add_argument(
+            "--skip-upload", dest="skip_upload", action="store_true",
+            help="(ingest only) render PDFs locally but don't push to SurfSense.",
+        )
+        parser.add_argument(
+            "--include-dev", dest="include_dev", action="store_true",
+            help="(ingest only) shorthand for --split all.",
+        )
+        # Per-upload knobs forwarded to /documents/fileupload at ingest;
+        # ignored at run-time (runner reads the resolved settings out of
+        # the doc-map manifest header).
+        add_ingest_settings_args(parser, defaults=_DEFAULT_INGEST_SETTINGS)
+
+    async def ingest(self, ctx: RunContext, **opts: Any) -> None:
+        from .ingest import run_ingest
+
+        settings = IngestSettings.merge(_DEFAULT_INGEST_SETTINGS, opts)
+        await run_ingest(
+            ctx,
+            split=opts.get("split") or "test",
+            max_questions=opts.get("max_questions"),
+            upload_batch_size=int(opts.get("upload_batch_size") or 8),
+            skip_upload=bool(opts.get("skip_upload", False)),
+            include_dev=bool(opts.get("include_dev", False)),
+            settings=settings,
+        )
+
+    async def run(self, ctx: RunContext, **opts: Any) -> RunArtifact:
+        split_filter = opts.get("split") or "test"
+        task_filter = opts.get("task") or "all"
+        body_filter = opts.get("body_filter") or "all"
+        require_images = bool(opts.get("require_images"))
+        sample_n = opts.get("sample_n")
+        concurrency = int(opts.get("concurrency") or 4)
+        no_mentions = bool(opts.get("no_mentions"))
+        pdf_engine_name = opts.get("pdf_engine") or "native"
+        max_output_tokens = int(opts.get("max_output_tokens") or 512)
+
+        bench_dir = ctx.benchmark_data_dir()
+        questions_jsonl = bench_dir / "questions.jsonl"
+        map_path = ctx.maps_dir() / "medxpertqa_doc_map.jsonl"
+        if not questions_jsonl.exists() or not map_path.exists():
+            raise RuntimeError(
+                "MedXpertQA-MM not ingested for this suite. Run "
+                "`python -m surfsense_evals ingest medical medxpertqa` first."
+            )
+
+        doc_map, ingest_settings = _load_doc_map(map_path)
+        questions = _load_questions(
+            questions_jsonl, doc_map,
+            split_filter=split_filter,
+            task_filter=task_filter if task_filter != "all" else None,
+            body_filter=body_filter if body_filter != "all" else None,
+            require_images=require_images,
+            sample_n=sample_n,
+        )
+        if not questions:
+            raise RuntimeError(
+                "No MedXpertQA-MM questions matched the filters; broaden --split/--task/--body-system/--n."
+            )
+        logger.info("MedXpertQA-MM: scheduled %d questions", len(questions))
+
+        api_key = os.environ.get("OPENROUTER_API_KEY")
+        if not api_key:
+            raise RuntimeError("OPENROUTER_API_KEY env var is required for the native arm.")
+
+        # Native arm slug differs from SurfSense slug only in cost-arbitrage
+        # scenario; otherwise both arms answer with provider_model.
+        native_arm_model = ctx.native_arm_model
+        if any(hint in native_arm_model.lower() for hint in _TEXT_ONLY_HINTS):
+            if ctx.scenario == "symmetric-cheap":
+                logger.info(
+                    "symmetric-cheap: native arm pinned to text-only %r as "
+                    "intended; expect it to lose on image-bearing questions "
+                    "(SurfSense answers from vision-extracted chunks).",
+                    native_arm_model,
+                )
+            else:
+                logger.warning(
+                    "Native arm slug %r looks text-only; image content in "
+                    "MedXpertQA PDFs will be ignored. Re-pin via "
+                    "`setup --provider-model anthropic/claude-sonnet-4.5` "
+                    "(or pass --native-arm-model and --scenario cost-arbitrage "
+                    "to make this asymmetry explicit).",
+                    native_arm_model,
+                )
+
+        provider = OpenRouterPdfProvider(
+            api_key=api_key,
+            base_url=ctx.config.openrouter_base_url,
+            model=native_arm_model,
+            engine=PdfEngine(pdf_engine_name),
+        )
+        native_arm = NativePdfArm(provider=provider, max_output_tokens=max_output_tokens)
+        surf_arm = SurfSenseArm(
+            client=ctx.new_chat_client(),
+            search_space_id=ctx.search_space_id,
+            ephemeral_threads=True,
+        )
+
+        run_timestamp = utc_iso_timestamp()
+        run_dir = ctx.runs_dir(run_timestamp=run_timestamp)
+        raw_path = run_dir / "raw.jsonl"
+
+        async def _native_one(q: MXQuestion) -> ArmResult:
+            return await native_arm.answer(_make_native_request(q, max_output_tokens))
+
+        async def _surf_one(q: MXQuestion) -> ArmResult:
+            return await surf_arm.answer(_make_surfsense_request(q, no_mentions=no_mentions))
+
+        native_results, surf_results = await asyncio.gather(
+            _gather_with_limit((_native_one(q) for q in questions), concurrency=concurrency),
+            _gather_with_limit((_surf_one(q) for q in questions), concurrency=concurrency),
+        )
+
+        with raw_path.open("w", encoding="utf-8") as fh:
+            for q, n_res, s_res in zip(questions, native_results, surf_results, strict=False):
+                meta = {
+                    "qid": q.qid,
+                    "split": q.split,
+                    "medical_task": q.medical_task,
+                    "body_system": q.body_system,
+                    "question_type": q.question_type,
+                    "n_images": q.n_images,
+                    "correct": q.label,
+                    "document_id": q.document_id,
+                }
+                fh.write(json.dumps({**meta, **n_res.to_jsonl()}) + "\n")
+                fh.write(json.dumps({**meta, **s_res.to_jsonl()}) + "\n")
+
+        metrics = _compute_metrics(questions, native_results, surf_results)
+        artifact = RunArtifact(
+            suite=self.suite,
+            benchmark=self.name,
+            run_timestamp=run_timestamp,
+            raw_path=raw_path,
+            metrics=metrics,
+            extra={
+                "n_questions": len(questions),
+                "concurrency": concurrency,
+                "split_filter": split_filter,
+                "task_filter": task_filter,
+                "body_filter": body_filter,
+                "require_images": require_images,
+                "no_mentions": no_mentions,
+                "pdf_engine": pdf_engine_name,
+                "scenario": ctx.scenario,
+                "provider_model": ctx.provider_model,
+                "native_arm_model": native_arm_model,
+                "vision_provider_model": ctx.vision_provider_model,
+                "agent_llm_id": ctx.agent_llm_id,
+                "ingest_settings": ingest_settings,
+            },
+        )
+
+        manifest_path = run_dir / "run_artifact.json"
+        manifest_path.write_text(
+            json.dumps({
+                "suite": self.suite,
+                "benchmark": self.name,
+                "raw_path": "raw.jsonl",
+                "metrics": metrics,
+                "extra": artifact.extra,
+            }, indent=2, sort_keys=True) + "\n",
+            encoding="utf-8",
+        )
+        return artifact
+
+    def report_section(self, artifacts: list[RunArtifact]) -> ReportSection:
+        if not artifacts:
+            return ReportSection(
+                title="MedXpertQA-MM — Native PDF (vision) vs SurfSense (vision RAG)",
+                headline=False,
+                body_md="(no run artifacts found)",
+                body_json={},
+            )
+        latest = max(artifacts, key=lambda a: a.run_timestamp)
+        m = latest.metrics
+        native = m.get("native", {})
+        surf = m.get("surfsense", {})
+        delta = m.get("delta", {})
+        per_task = m.get("per_task", {})
+        per_body = m.get("per_body_system", {})
+        extra = latest.extra
+
+        body_lines: list[str] = []
+        body_lines.append(
+            f"- Sample size: {extra.get('n_questions', '?')} questions "
+            f"(split: `{extra.get('split_filter', 'test')}`, "
+            f"task: `{extra.get('task_filter', 'all')}`, "
+            f"body: `{extra.get('body_filter', 'all')}`, "
+            f"engine: `{extra.get('pdf_engine', 'native')}`)."
+        )
+        body_lines.append(format_scenario_md(extra))
+        body_lines.append(format_ingest_settings_md(extra.get("ingest_settings")))
+        body_lines.append(
+            "- Native arm (OpenRouter `chat/completions` + file plugin, "
+            f"`{extra.get('native_arm_model') or extra.get('provider_model', '?')}`):"
+        )
+        body_lines.append(_arm_summary_lines(native, indent="  "))
+        body_lines.append(
+            "- SurfSense arm (`POST /api/v1/new_chat`, vision RAG over chunks, "
+            f"`{extra.get('provider_model', '?')}`):"
+        )
+        body_lines.append(_arm_summary_lines(surf, indent="  "))
+        body_lines.append("- Delta (paired):")
+        body_lines.append(
+            f"  - Accuracy: SurfSense {_pp(delta.get('accuracy_pp'))} pp "
+            f"(McNemar p={_fmt(delta.get('mcnemar_p_value'), 4)}, "
+            f"method={delta.get('mcnemar_method')})"
+        )
+        body_lines.append(
+            f"  - Bootstrap 95% CI on delta: "
+            f"[{_pp(delta.get('bootstrap_ci_low'))}pp, {_pp(delta.get('bootstrap_ci_high'))}pp]"
+        )
+        body_lines.append(
+            f"  - Cost / question: native ${_dollars(native.get('cost_micros_mean'))}, "
+            f"surfsense ${_dollars(surf.get('cost_micros_mean'))} "
+            f"(SurfSense delta {_pct_change(delta.get('cost_micros_pct'))})"
+        )
+        body_lines.append(
+            f"  - Latency p50: native {_ms_to_s(native.get('latency_ms_median'))}, "
+            f"surfsense {_ms_to_s(surf.get('latency_ms_median'))} "
+            f"(SurfSense delta {_pct_change(delta.get('latency_ms_pct'))})"
+        )
+        if per_task:
+            body_lines.append("- Per-medical_task split:")
+            for task_name, vals in sorted(per_task.items()):
+                body_lines.append(
+                    f"  - {task_name}: SurfSense {_pp(vals.get('delta_accuracy_pp'))} pp "
+                    f"(n={vals.get('n')})"
+                )
+        if per_body:
+            body_lines.append("- Per-body_system split (top 5 by sample size):")
+            top = sorted(per_body.items(), key=lambda kv: -kv[1].get("n", 0))[:5]
+            for body_name, vals in top:
+                body_lines.append(
+                    f"  - {body_name}: SurfSense {_pp(vals.get('delta_accuracy_pp'))} pp "
+                    f"(n={vals.get('n')})"
+                )
+
+        return ReportSection(
+            title="MedXpertQA-MM — Native PDF (vision) vs SurfSense (vision RAG)",
+            headline=False,
+            body_md="\n".join(body_lines),
+            body_json=m,
+        )
+
+
+# ---------------------------------------------------------------------------
+# Per-question helpers
+# ---------------------------------------------------------------------------
+
+
+def _make_native_request(q: MXQuestion, max_tokens: int) -> ArmRequest:
+    prompt = build_prompt(q.question, q.options)
+    return ArmRequest(
+        question_id=q.qid,
+        prompt=prompt,
+        pdf_paths=[q.pdf_path],
+        options={"max_tokens": max_tokens},
+    )
+
+
+def _make_surfsense_request(q: MXQuestion, *, no_mentions: bool) -> ArmRequest:
+    prompt = build_prompt(q.question, q.options)
+    mentions: list[int] | None = None
+    if not no_mentions and q.document_id is not None:
+        mentions = [int(q.document_id)]
+    return ArmRequest(
+        question_id=q.qid,
+        prompt=prompt,
+        mentioned_document_ids=mentions,
+    )
+
+
+# ---------------------------------------------------------------------------
+# Metrics
+# ---------------------------------------------------------------------------
+
+
+def _compute_metrics(
+    questions: list[MXQuestion],
+    native_results: list[ArmResult],
+    surf_results: list[ArmResult],
+) -> dict[str, Any]:
+    native_correct: list[bool] = []
+    surf_correct: list[bool] = []
+    for q, n_res, s_res in zip(questions, native_results, surf_results, strict=False):
+        gold = q.label
+        n_ok = (n_res.answer_letter or "").upper() == gold and gold in ANSWER_LETTERS
+        s_ok = (s_res.answer_letter or "").upper() == gold and gold in ANSWER_LETTERS
+        native_correct.append(n_ok)
+        surf_correct.append(s_ok)
+
+    native_costs = [float(r.cost_micros) for r in native_results]
+    surf_costs = [float(r.cost_micros) for r in surf_results]
+    native_lats = [float(r.latency_ms) for r in native_results]
+    surf_lats = [float(r.latency_ms) for r in surf_results]
+    native_in = [float(r.input_tokens) for r in native_results]
+    native_out = [float(r.output_tokens) for r in native_results]
+
+    native_acc = accuracy_with_wilson_ci(sum(native_correct), len(native_correct))
+    surf_acc = accuracy_with_wilson_ci(sum(surf_correct), len(surf_correct))
+    mc = mcnemar_test(native_correct, surf_correct)
+    boot = bootstrap_delta_ci(native_correct, surf_correct, n_resamples=2000)
+
+    native_cost_agg = paired_aggregate(native_costs)
+    surf_cost_agg = paired_aggregate(surf_costs)
+    native_lat_agg = paired_aggregate(native_lats)
+    surf_lat_agg = paired_aggregate(surf_lats)
+
+    cost_pct = _safe_pct(surf_cost_agg.mean, native_cost_agg.mean)
+    lat_pct = _safe_pct(surf_lat_agg.median, native_lat_agg.median)
+
+    per_task = _per_field(questions, native_correct, surf_correct, key=lambda q: q.medical_task or "unknown")
+    per_body = _per_field(questions, native_correct, surf_correct, key=lambda q: q.body_system or "unknown")
+
+    return {
+        "native": {
+            **native_acc.to_dict(),
+            "cost_micros_mean": native_cost_agg.mean,
+            "cost_micros_median": native_cost_agg.median,
+            "latency_ms_mean": native_lat_agg.mean,
+            "latency_ms_median": native_lat_agg.median,
+            "latency_ms_p95": native_lat_agg.p95,
+            "input_tokens_mean": (sum(native_in) / len(native_in)) if native_in else 0.0,
+            "output_tokens_mean": (sum(native_out) / len(native_out)) if native_out else 0.0,
+        },
+        "surfsense": {
+            **surf_acc.to_dict(),
+            "cost_micros_mean": surf_cost_agg.mean,
+            "cost_micros_median": surf_cost_agg.median,
+            "latency_ms_mean": surf_lat_agg.mean,
+            "latency_ms_median": surf_lat_agg.median,
+            "latency_ms_p95": surf_lat_agg.p95,
+        },
+        "delta": {
+            "accuracy_pp": 100.0 * (surf_acc.accuracy - native_acc.accuracy),
+            "mcnemar_p_value": mc.p_value,
+            "mcnemar_method": mc.method,
+            "mcnemar_b_native_only": mc.b,
+            "mcnemar_c_surfsense_only": mc.c,
+            "bootstrap_ci_low": 100.0 * boot.ci_low,
+            "bootstrap_ci_high": 100.0 * boot.ci_high,
+            "cost_micros_pct": cost_pct,
+            "latency_ms_pct": lat_pct,
+        },
+        "per_task": per_task,
+        "per_body_system": per_body,
+    }
+
+
+def _per_field(
+    questions: list[MXQuestion],
+    native_correct: list[bool],
+    surf_correct: list[bool],
+    *,
+    key,
+) -> dict[str, dict[str, Any]]:
+    bucket: dict[str, list[tuple[bool, bool]]] = {}
+    for q, n_ok, s_ok in zip(questions, native_correct, surf_correct, strict=False):
+        bucket.setdefault(key(q), []).append((n_ok, s_ok))
+    out: dict[str, dict[str, Any]] = {}
+    for k, pairs in bucket.items():
+        n_correct = [a for a, _ in pairs]
+        s_correct = [b for _, b in pairs]
+        out[k] = {
+            "n": len(pairs),
+            "native_accuracy": (sum(n_correct) / len(pairs)) if pairs else 0.0,
+            "surfsense_accuracy": (sum(s_correct) / len(pairs)) if pairs else 0.0,
+            "delta_accuracy_pp": (
+                100.0 * (sum(s_correct) - sum(n_correct)) / len(pairs)
+                if pairs else 0.0
+            ),
+        }
+    return out
+
+
+def _safe_pct(numerator: float, denominator: float) -> float | None:
+    if denominator == 0:
+        return None
+    return 100.0 * (numerator - denominator) / denominator
+
+
+# ---------------------------------------------------------------------------
+# Formatters
+# ---------------------------------------------------------------------------
+
+
+def _arm_summary_lines(d: dict[str, Any], *, indent: str) -> str:
+    if not d:
+        return f"{indent}(no data)"
+    acc = d.get("accuracy", 0.0)
+    low = d.get("ci_low", 0.0)
+    high = d.get("ci_high", 0.0)
+    lines = [
+        f"{indent}- Accuracy: {acc * 100:.1f}% (Wilson 95% CI: {low * 100:.1f}% – {high * 100:.1f}%)",
+        f"{indent}- Cost / question: ${_dollars(d.get('cost_micros_mean'))} (mean), "
+        f"${_dollars(d.get('cost_micros_median'))} (median)",
+        f"{indent}- Latency: p50 {_ms_to_s(d.get('latency_ms_median'))}, "
+        f"p95 {_ms_to_s(d.get('latency_ms_p95'))}",
+    ]
+    if "input_tokens_mean" in d:
+        lines.append(
+            f"{indent}- Mean tokens / question: in {d.get('input_tokens_mean', 0):.0f}, "
+            f"out {d.get('output_tokens_mean', 0):.0f}"
+        )
+    return "\n".join(lines)
+
+
+def _dollars(micros: Any) -> str:
+    if micros is None:
+        return "?"
+    try:
+        return f"{(float(micros) / 1_000_000):.4f}"
+    except (TypeError, ValueError):
+        return "?"
+
+
+def _ms_to_s(ms: Any) -> str:
+    if ms is None:
+        return "?"
+    try:
+        return f"{float(ms) / 1000:.1f}s"
+    except (TypeError, ValueError):
+        return "?"
+
+
+def _pp(value: Any) -> str:
+    if value is None:
+        return "?"
+    try:
+        return f"{float(value):+.1f}"
+    except (TypeError, ValueError):
+        return "?"
+
+
+def _pct_change(value: Any) -> str:
+    if value is None:
+        return "?"
+    try:
+        return f"{float(value):+.0f}%"
+    except (TypeError, ValueError):
+        return "?"
+
+
+def _fmt(value: Any, ndigits: int) -> str:
+    if value is None:
+        return "?"
+    try:
+        return f"{float(value):.{ndigits}f}"
+    except (TypeError, ValueError):
+        return "?"
+
+
+__all__ = ["MedXpertQAMMBenchmark", "MXQuestion"]
diff --git a/surfsense_evals/src/surfsense_evals/suites/medical/mirage/__init__.py b/surfsense_evals/src/surfsense_evals/suites/medical/mirage/__init__.py
new file mode 100644
index 000000000..e527b37f4
--- /dev/null
+++ b/surfsense_evals/src/surfsense_evals/suites/medical/mirage/__init__.py
@@ -0,0 +1,17 @@
+"""MIRAGE — secondary single-arm SurfSense MCQ measurement.
+
+Source: https://github.com/Teddy-XiongGZ/MIRAGE, paper
+https://aclanthology.org/2024.findings-acl.372/. 7,663 questions
+across MMLU-Med, MedQA-US, MedMCQA, PubMedQA*, BioASQ-Y/N.
+
+This is a SurfSense-only measurement (not a head-to-head); native
+PDF-in-LLM doesn't apply because there is no per-question discrete
+document — the corpus is millions of biomedical snippets.
+"""
+
+from __future__ import annotations
+
+from .runner import MirageBenchmark
+from ....core import registry as _registry
+
+_registry.register(MirageBenchmark())
diff --git a/surfsense_evals/src/surfsense_evals/suites/medical/mirage/ingest.py b/surfsense_evals/src/surfsense_evals/suites/medical/mirage/ingest.py
new file mode 100644
index 000000000..9769d078b
--- /dev/null
+++ b/surfsense_evals/src/surfsense_evals/suites/medical/mirage/ingest.py
@@ -0,0 +1,548 @@
+"""MIRAGE ingestion.
+
+Downloads:
+
+* ``benchmark.json`` (≈ 4 MB; questions for the 5 sub-tasks).
+* ``retrieved_snippets_10k.zip`` (the union of top-10k snippet ids
+  retrieved by every retriever in the MedRAG paper, per task — a
+  recall ceiling that avoids needing the full 23.9M-doc PubMed mirror).
+
+Snippet *content* lives in the MedRAG HF mirrors
+(``MedRAG/textbooks``, ``MedRAG/pubmed``, ``MedRAG/statpearls``,
+``MedRAG/wikipedia``). We default to ``MedRAG/textbooks`` (212 MB,
+125k snippets) which is the smallest and covers the majority of
+``MedQA-US`` and the medical examination subsets. Operators can
+opt into larger corpora with ``--corpus``.
+
+Each snippet is written as one markdown file then batched into
+``~5 MB`` markdown bundles for SurfSense's file upload (smaller
+than backend default ``MAX_FILE_SIZE_BYTES`` and avoids the per-call
+overhead of one HTTP request per snippet).
+
+The ingestion produces two maps under ``data/medical/maps/``:
+
+* ``mirage_snippet_map.jsonl`` — ``{snippet_id, document_id, batch_path}``
+* ``mirage_chunk_map.jsonl`` — ``{chunk_id, document_id, snippet_id?}``
+  (best-effort; chunk text is heuristically attributed to the
+  snippet it overlaps when the SurfSense chunker splits a batched
+  markdown).
+"""
+
+from __future__ import annotations
+
+import asyncio
+import io
+import json
+import logging
+import zipfile
+from collections.abc import Iterable
+from dataclasses import dataclass
+from pathlib import Path
+
+import httpx
+
+from ....core.config import set_suite_state
+from ....core.ingest_settings import IngestSettings, settings_header_line
+from ....core.registry import RunContext
+
+logger = logging.getLogger(__name__)
+
+
+MIRAGE_BENCHMARK_URL = (
+    "https://raw.githubusercontent.com/Teddy-XiongGZ/MIRAGE/main/benchmark.json"
+)
+# Upstream only ships ONE zip — top-10k retrievals across 5 retrievers,
+# ~16 GB. We default to skipping it (see `--skip-snippet-filter`) and
+# ingesting the chosen corpus in full; this URL is only fetched when
+# the operator explicitly opts in.
+MIRAGE_SNIPPETS_ZIP_URL = (
+    "https://virginia.box.com/shared/static/cxq17th6eisl2pn04vp0x723zczlvlzc.zip"
+)
+
+
+_DEFAULT_CORPUS = "MedRAG/textbooks"
+_BATCH_SIZE_BYTES = 5 * 1024 * 1024
+# 2 GB safety cap. Anything larger requires --allow-large-download.
+# Set high enough that ``benchmark.json`` and small zips pass through
+# untouched but the 16 GB MIRAGE retrievals zip trips the guard.
+_LARGE_DOWNLOAD_BYTES = 2 * 1024 * 1024 * 1024
+_DOWNLOAD_RETRIES = 5
+_RETRYABLE_NET_EXC: tuple[type[BaseException], ...] = (
+    httpx.RemoteProtocolError,
+    httpx.ReadError,
+    httpx.ReadTimeout,
+    httpx.ConnectError,
+    httpx.ConnectTimeout,
+)
+
+
+@dataclass
+class SnippetRow:
+    snippet_id: str
+    title: str
+    content: str
+
+    def to_markdown(self) -> str:
+        title = (self.title or "").strip() or "Untitled"
+        body = (self.content or "").strip()
+        return f"# {title}\n\n_id: `{self.snippet_id}`_\n\n{body}\n"
+
+
+# ---------------------------------------------------------------------------
+# Download helpers
+# ---------------------------------------------------------------------------
+
+
+async def _fetch_to_path(
+    url: str,
+    *,
+    dest: Path,
+    label: str,
+    timeout_s: float = 600.0,
+    allow_large_download: bool = False,
+    expect_zip: bool = False,
+) -> Path:
+    """Download ``url`` to ``dest`` with retry, atomic-rename, and
+    HTTP ``Range`` resume.
+
+    Operational properties:
+
+    * If ``dest`` already exists *and* (when ``expect_zip`` is True) the
+      cached file is a valid ZIP, returns it immediately. A corrupt ZIP
+      is removed and re-downloaded — this is the safety net for the
+      `box.com truncated 16 GB zip` failure mode where the previous
+      run wrote a half-completed file then exited with an exception.
+    * Bytes are written to ``<dest>.partial`` and renamed only after the
+      stream completes cleanly (and, for zips, only after a quick
+      central-directory check). A failure mid-download leaves the
+      ``.partial`` file in place so the next attempt can resume from
+      where it stopped via an HTTP ``Range`` header.
+    * Retries on transient network errors (``RemoteProtocolError``,
+      ``ReadError``, ``ReadTimeout``, ``ConnectError``,
+      ``ConnectTimeout``) with exponential backoff, up to
+      ``_DOWNLOAD_RETRIES``.
+    * Aborts before downloading if the ``Content-Length`` (or already-
+      downloaded ``.partial`` size) is over ``_LARGE_DOWNLOAD_BYTES``
+      and ``allow_large_download`` is False, to keep an operator from
+      surprise-grabbing 16 GB on a slow link.
+    """
+
+    if dest.exists():
+        if expect_zip and not _is_valid_zip(dest):
+            logger.warning(
+                "Cached %s at %s failed ZIP validation (size=%d B); deleting "
+                "and re-downloading.",
+                label,
+                dest,
+                dest.stat().st_size,
+            )
+            dest.unlink(missing_ok=True)
+        else:
+            logger.info("Using cached %s at %s", label, dest)
+            return dest
+
+    dest.parent.mkdir(parents=True, exist_ok=True)
+    partial = dest.with_suffix(dest.suffix + ".partial")
+    last_exc: BaseException | None = None
+
+    for attempt in range(1, _DOWNLOAD_RETRIES + 1):
+        existing_bytes = partial.stat().st_size if partial.exists() else 0
+        headers: dict[str, str] = {}
+        if existing_bytes:
+            headers["Range"] = f"bytes={existing_bytes}-"
+            logger.info(
+                "Resuming %s from byte %d (attempt %d/%d)",
+                label,
+                existing_bytes,
+                attempt,
+                _DOWNLOAD_RETRIES,
+            )
+        else:
+            logger.info(
+                "Downloading %s from %s (attempt %d/%d)",
+                label,
+                url,
+                attempt,
+                _DOWNLOAD_RETRIES,
+            )
+
+        try:
+            async with httpx.AsyncClient(
+                timeout=httpx.Timeout(timeout_s, connect=20.0),
+                follow_redirects=True,
+            ) as client:
+                async with client.stream("GET", url, headers=headers) as response:
+                    if existing_bytes and response.status_code == 200:
+                        logger.warning(
+                            "Server ignored Range header for %s; restarting from 0.",
+                            label,
+                        )
+                        partial.unlink(missing_ok=True)
+                        existing_bytes = 0
+                    elif response.status_code == 416:
+                        # Range not satisfiable — the .partial is at or
+                        # past the end. Treat as "already downloaded";
+                        # validate by closing and re-opening for atomic
+                        # rename below.
+                        logger.info(
+                            "Server reports %s already complete (HTTP 416).",
+                            label,
+                        )
+                    elif response.status_code not in (200, 206):
+                        response.raise_for_status()
+
+                    total_size = _planned_total_size(response, existing_bytes)
+                    if (
+                        total_size is not None
+                        and total_size > _LARGE_DOWNLOAD_BYTES
+                        and not allow_large_download
+                    ):
+                        raise _LargeDownloadAbort(label, total_size)
+
+                    mode = "ab" if existing_bytes else "wb"
+                    with partial.open(mode) as fh:
+                        async for chunk in response.aiter_bytes(chunk_size=1 << 18):
+                            fh.write(chunk)
+            # Optional content sanity check before promoting to dest.
+            if expect_zip and not _is_valid_zip(partial):
+                raise zipfile.BadZipFile(
+                    f"{label} downloaded to {partial} but failed central-"
+                    "directory check; will retry."
+                )
+            partial.replace(dest)
+            return dest
+        except _LargeDownloadAbort:
+            raise
+        except _RETRYABLE_NET_EXC as exc:
+            last_exc = exc
+            wait = min(60.0, 2.0 ** attempt)
+            logger.warning(
+                "Network error fetching %s (%s: %s); retrying in %.0fs.",
+                label,
+                type(exc).__name__,
+                exc,
+                wait,
+            )
+            await asyncio.sleep(wait)
+        except zipfile.BadZipFile as exc:
+            last_exc = exc
+            # Truncated body — drop the partial and retry from scratch.
+            partial.unlink(missing_ok=True)
+            wait = min(60.0, 2.0 ** attempt)
+            logger.warning(
+                "Truncated ZIP for %s; restarting from byte 0 in %.0fs.",
+                label,
+                wait,
+            )
+            await asyncio.sleep(wait)
+
+    raise RuntimeError(
+        f"Failed to download {label} after {_DOWNLOAD_RETRIES} attempts: {last_exc!s}"
+    )
+
+
+def _planned_total_size(response: httpx.Response, existing_bytes: int) -> int | None:
+    """Best-effort total size including any already-buffered .partial bytes."""
+
+    cl = response.headers.get("Content-Length")
+    if not cl:
+        return None
+    try:
+        remaining = int(cl)
+    except ValueError:
+        return None
+    return existing_bytes + remaining
+
+
+def _is_valid_zip(path: Path) -> bool:
+    """Cheap ZIP validity check via central-directory parse."""
+
+    try:
+        with zipfile.ZipFile(path) as zf:
+            # ``namelist`` forces the central directory to be parsed.
+            zf.namelist()
+        return True
+    except (zipfile.BadZipFile, OSError):
+        return False
+
+
+class _LargeDownloadAbort(RuntimeError):
+    """Raised when a download exceeds the safety threshold without opt-in."""
+
+    def __init__(self, label: str, size_bytes: int) -> None:
+        gb = size_bytes / (1024 ** 3)
+        super().__init__(
+            f"{label} would download ~{gb:.1f} GB, above the {_LARGE_DOWNLOAD_BYTES / (1024 ** 3):.0f} GB safety cap. "
+            "Re-run with `--allow-large-download` to acknowledge, or use "
+            "`--skip-snippet-filter` to bypass this download entirely and "
+            "ingest the full corpus instead."
+        )
+
+
+def _read_snippet_ids(zip_path: Path, *, tasks: list[str]) -> dict[str, set[str]]:
+    """Walk the ZIP for files whose path contains any task name.
+
+    Each MedRAG retriever produces one JSON file per task in the zip;
+    we union all retrievers' top-K ids. The exact directory layout has
+    historically been ``<retriever>/<task>.json`` mapping
+    ``question_id -> [snippet_id, ...]``.
+    """
+
+    out: dict[str, set[str]] = {t: set() for t in tasks}
+    with zipfile.ZipFile(zip_path, "r") as zf:
+        for member in zf.namelist():
+            if not member.lower().endswith(".json"):
+                continue
+            stem = Path(member).stem.lower()
+            for task in tasks:
+                if task.lower() in stem:
+                    try:
+                        with zf.open(member) as fh:
+                            payload = json.loads(fh.read().decode("utf-8"))
+                    except (json.JSONDecodeError, KeyError):
+                        continue
+                    for ids in payload.values():
+                        if isinstance(ids, list):
+                            for sid in ids:
+                                if isinstance(sid, str):
+                                    out[task].add(sid)
+                                elif isinstance(sid, dict) and "id" in sid:
+                                    out[task].add(str(sid["id"]))
+                    break
+    return out
+
+
+def _load_corpus(
+    corpus_name: str, snippet_ids: set[str] | None
+) -> Iterable[SnippetRow]:
+    """Stream rows from a MedRAG HF corpus.
+
+    * ``snippet_ids=None`` → yield every row (full-corpus ingestion path).
+    * ``snippet_ids={...}`` → filter to the requested ids.
+
+    Imported lazily — ``datasets`` is a heavyweight dep.
+    """
+
+    if snippet_ids is not None and not snippet_ids:
+        return iter(())
+    from datasets import load_dataset  # noqa: PLC0415
+
+    logger.info("Loading corpus %s (this may take a while)", corpus_name)
+    ds = load_dataset(corpus_name, split="train", streaming=True)
+    for row in ds:
+        sid = str(row.get("id") or "")
+        if snippet_ids is not None and sid not in snippet_ids:
+            continue
+        yield SnippetRow(
+            snippet_id=sid,
+            title=str(row.get("title") or ""),
+            content=str(row.get("content") or row.get("contents") or ""),
+        )
+
+
+# ---------------------------------------------------------------------------
+# Batching + upload
+# ---------------------------------------------------------------------------
+
+
+@dataclass
+class SnippetBatch:
+    path: Path
+    snippet_ids: list[str]
+
+
+def _write_batches(
+    snippets: Iterable[SnippetRow],
+    *,
+    out_dir: Path,
+    batch_bytes: int = _BATCH_SIZE_BYTES,
+    prefix: str = "mirage",
+) -> list[SnippetBatch]:
+    out_dir.mkdir(parents=True, exist_ok=True)
+    batches: list[SnippetBatch] = []
+    current_buffer = io.StringIO()
+    current_ids: list[str] = []
+    current_bytes = 0
+    batch_idx = 0
+
+    def _flush() -> None:
+        nonlocal current_buffer, current_ids, current_bytes, batch_idx
+        if not current_ids:
+            return
+        path = out_dir / f"{prefix}_{batch_idx:04d}.md"
+        path.write_text(current_buffer.getvalue(), encoding="utf-8")
+        batches.append(SnippetBatch(path=path, snippet_ids=current_ids))
+        batch_idx += 1
+        current_buffer = io.StringIO()
+        current_ids = []
+        current_bytes = 0
+
+    for snippet in snippets:
+        chunk = snippet.to_markdown() + "\n---\n\n"
+        chunk_bytes = len(chunk.encode("utf-8"))
+        if current_bytes + chunk_bytes > batch_bytes and current_ids:
+            _flush()
+        current_buffer.write(chunk)
+        current_ids.append(snippet.snippet_id)
+        current_bytes += chunk_bytes
+    _flush()
+    return batches
+
+
+# ---------------------------------------------------------------------------
+# Public entry point
+# ---------------------------------------------------------------------------
+
+
+async def run_ingest(
+    ctx: RunContext,
+    *,
+    tasks: list[str] | None = None,
+    corpus: str = _DEFAULT_CORPUS,
+    max_snippets_per_task: int | None = None,
+    skip_snippet_filter: bool = True,
+    allow_large_download: bool = False,
+    settings: IngestSettings | None = None,
+) -> None:
+    """Ingest a MedRAG corpus into the suite SearchSpace.
+
+    By default (``skip_snippet_filter=True``) we ingest the **entire**
+    chosen corpus and let SurfSense's own retriever do the work. The
+    upstream MIRAGE retrieval zip is ~16 GB and only useful when you
+    want to pre-filter the corpus to the set of snippets some other
+    retriever surfaced; for ``MedRAG/textbooks`` (212 MB / 125k snippets)
+    that pre-filter is unnecessary overhead and routinely fails to
+    download (box.com truncates the stream). Set
+    ``skip_snippet_filter=False`` (CLI: ``--use-snippet-filter``) only
+    if you specifically want the upstream filter — and budget the
+    16 GB zip transfer.
+    """
+
+    tasks = tasks or ["mmlu", "medqa", "medmcqa", "pubmedqa", "bioasq"]
+    settings = settings or IngestSettings(use_vision_llm=False, processing_mode="basic")
+
+    bench_path = ctx.benchmark_data_dir() / "benchmark.json"
+    await _fetch_to_path(MIRAGE_BENCHMARK_URL, dest=bench_path, label="MIRAGE benchmark.json")
+
+    if skip_snippet_filter:
+        logger.info(
+            "Skipping retrieved_snippets_10k.zip (skip_snippet_filter=True); "
+            "ingesting entire corpus %s.",
+            corpus,
+        )
+        snippets = list(_load_corpus(corpus, snippet_ids=None))
+    else:
+        zip_path = ctx.benchmark_data_dir() / "retrieved_snippets_10k.zip"
+        await _fetch_to_path(
+            MIRAGE_SNIPPETS_ZIP_URL,
+            dest=zip_path,
+            label="MIRAGE retrieved_snippets_10k.zip",
+            allow_large_download=allow_large_download,
+            expect_zip=True,
+        )
+
+        by_task = _read_snippet_ids(zip_path, tasks=tasks)
+        if max_snippets_per_task is not None:
+            by_task = {k: set(list(v)[:max_snippets_per_task]) for k, v in by_task.items()}
+
+        union_ids = set().union(*by_task.values())
+        logger.info(
+            "MIRAGE: tasks=%s, snippet ids per task: %s, union=%d",
+            tasks,
+            {k: len(v) for k, v in by_task.items()},
+            len(union_ids),
+        )
+        if not union_ids:
+            raise RuntimeError(
+                f"No snippet ids parsed for tasks {tasks!r} from {zip_path}. "
+                "Check the zip layout (the upstream archive may have changed)."
+            )
+
+        snippets = list(_load_corpus(corpus, snippet_ids=union_ids))
+        logger.info(
+            "Loaded %d / %d requested snippets from corpus %s",
+            len(snippets),
+            len(union_ids),
+            corpus,
+        )
+    if not snippets:
+        raise RuntimeError(
+            f"Corpus {corpus} returned 0 matching rows. Either the snippet "
+            "ids reference a different corpus (e.g. PubMed) or the HF mirror "
+            "is unavailable. Pass --corpus to override."
+        )
+
+    batches_dir = ctx.benchmark_data_dir() / "batches"
+    batches = _write_batches(snippets, out_dir=batches_dir)
+    logger.info("Wrote %d snippet batches to %s", len(batches), batches_dir)
+
+    docs_client = ctx.documents_client()
+    upload_result = await docs_client.upload(
+        files=[b.path for b in batches],
+        search_space_id=ctx.search_space_id,
+        should_summarize=settings.should_summarize,
+        use_vision_llm=settings.use_vision_llm,
+        processing_mode=settings.processing_mode,
+    )
+    logger.info("MIRAGE upload settings: %s", settings.render_label())
+    new_doc_ids = list(upload_result.document_ids)
+    if new_doc_ids:
+        await docs_client.wait_until_ready(
+            search_space_id=ctx.search_space_id,
+            document_ids=new_doc_ids,
+            timeout_s=3600.0,
+            max_poll_s=15.0,
+        )
+
+    statuses = await docs_client.get_status(
+        search_space_id=ctx.search_space_id,
+        document_ids=new_doc_ids + upload_result.duplicate_document_ids,
+    )
+    title_to_doc = {s.title: s.document_id for s in statuses}
+
+    snippet_map_path = ctx.maps_dir() / "mirage_snippet_map.jsonl"
+    chunk_map_path = ctx.maps_dir() / "mirage_chunk_map.jsonl"
+    with snippet_map_path.open("w", encoding="utf-8") as fh:
+        # Header line records the ingest-time settings (see
+        # core/ingest_settings.py for the protocol).
+        fh.write(settings_header_line(settings) + "\n")
+        for batch in batches:
+            doc_id = title_to_doc.get(batch.path.name)
+            if doc_id is None:
+                logger.warning("No document_id for batch %s", batch.path.name)
+                continue
+            for sid in batch.snippet_ids:
+                fh.write(
+                    json.dumps(
+                        {
+                            "snippet_id": sid,
+                            "document_id": doc_id,
+                            "batch_path": str(batch.path),
+                        }
+                    )
+                    + "\n"
+                )
+
+    # Best-effort chunk map. SurfSense doesn't expose snippet attribution
+    # per chunk, so we just record (chunk_id -> document_id) here; the
+    # MIRAGE runner only needs document_id for accuracy scoring.
+    with chunk_map_path.open("w", encoding="utf-8") as fh:
+        for doc_id in {b.path.name and title_to_doc.get(b.path.name) for b in batches} - {None}:
+            try:
+                chunks = await docs_client.list_chunks(int(doc_id))
+            except Exception as exc:  # noqa: BLE001
+                logger.warning("Failed to list chunks for doc_id=%s: %s", doc_id, exc)
+                continue
+            for chunk in chunks:
+                fh.write(
+                    json.dumps({"chunk_id": chunk.id, "document_id": doc_id})
+                    + "\n"
+                )
+
+    new_state = ctx.suite_state
+    new_state.ingestion_maps["mirage"] = str(snippet_map_path)
+    set_suite_state(ctx.config, ctx.suite, new_state)
+    logger.info("Wrote MIRAGE maps to %s and %s", snippet_map_path, chunk_map_path)
+
+
+__all__ = ["run_ingest", "SnippetRow", "SnippetBatch"]
diff --git a/surfsense_evals/src/surfsense_evals/suites/medical/mirage/prompt.py b/surfsense_evals/src/surfsense_evals/suites/medical/mirage/prompt.py
new file mode 100644
index 000000000..9e5b1c618
--- /dev/null
+++ b/surfsense_evals/src/surfsense_evals/suites/medical/mirage/prompt.py
@@ -0,0 +1,44 @@
+"""MedRAG ``{step_by_step_thinking, answer_choice}`` MCQ prompt.
+
+Mirrors the MedRAG paper's prompt format so accuracy numbers are
+comparable to the published MIRAGE leaderboard.
+"""
+
+from __future__ import annotations
+
+from collections.abc import Mapping
+
+
+_PROMPT_TEMPLATE = """\
+You are a helpful medical expert. Answer the following multiple-choice
+question using the relevant medical knowledge available to you (and any
+retrieved context, if provided).
+
+Respond with a JSON object on a single line:
+{{"step_by_step_thinking": "<your reasoning>", "answer_choice": "<letter>"}}
+
+Question: {question}
+
+Options:
+{options_block}
+"""
+
+
+def _options_block(options: Mapping[str, str]) -> str:
+    parts: list[str] = []
+    for letter in sorted(options.keys()):
+        text = options.get(letter)
+        if text is None or text == "":
+            continue
+        parts.append(f"{letter}) {text}")
+    return "\n".join(parts)
+
+
+def build_prompt(question: str, options: Mapping[str, str]) -> str:
+    return _PROMPT_TEMPLATE.format(
+        question=question.strip(),
+        options_block=_options_block(options),
+    )
+
+
+__all__ = ["build_prompt"]
diff --git a/surfsense_evals/src/surfsense_evals/suites/medical/mirage/runner.py b/surfsense_evals/src/surfsense_evals/suites/medical/mirage/runner.py
new file mode 100644
index 000000000..0f336c0d5
--- /dev/null
+++ b/surfsense_evals/src/surfsense_evals/suites/medical/mirage/runner.py
@@ -0,0 +1,332 @@
+"""MIRAGE runner: SurfSense-only per-task accuracy.
+
+The benchmark file format is one top-level dict per task (``mmlu``,
+``medqa``, ``medmcqa``, ``pubmedqa``, ``bioasq``); each task value is
+``{question_id: {question, options, answer}}``.
+
+We restrict retrieval to the suite SearchSpace's full corpus (no
+``mentioned_document_ids`` — MIRAGE has no per-question ground-truth
+document; retrieval *is* the test). Accuracy is paired against the
+``answer`` letter from the dataset.
+"""
+
+from __future__ import annotations
+
+import argparse
+import asyncio
+import json
+import logging
+from dataclasses import dataclass
+from typing import Any
+
+from ....core.arms import ArmRequest, ArmResult, SurfSenseArm
+from ....core.config import utc_iso_timestamp
+from ....core.ingest_settings import (
+    IngestSettings,
+    add_ingest_settings_args,
+    format_ingest_settings_md,
+    read_settings_header,
+)
+from ....core.metrics.mc_accuracy import accuracy_with_wilson_ci, macro_accuracy
+from ....core.registry import (
+    Benchmark,
+    ReportSection,
+    RunArtifact,
+    RunContext,
+)
+from .prompt import build_prompt
+
+logger = logging.getLogger(__name__)
+
+
+_TASKS = ("mmlu", "medqa", "medmcqa", "pubmedqa", "bioasq")
+_DESCRIPTION = "MIRAGE (7,663 medical MCQs) — single-arm SurfSense per-task accuracy."
+
+# MIRAGE corpus is text-only (textbook + abstract markdown). Vision
+# LLM at ingest is wasted compute by default; flip ``--use-vision-llm``
+# to measure cost.
+_DEFAULT_INGEST_SETTINGS = IngestSettings(
+    use_vision_llm=False,
+    processing_mode="basic",
+    should_summarize=False,
+)
+
+
+@dataclass
+class MirageQuestion:
+    task: str
+    qid: str
+    question: str
+    options: dict[str, str]
+    correct: str
+
+    @property
+    def question_id(self) -> str:
+        return f"{self.task}::{self.qid}"
+
+
+def _load_questions(
+    benchmark: dict[str, Any],
+    *,
+    tasks: list[str],
+    sample_n: int | None,
+) -> list[MirageQuestion]:
+    out: list[MirageQuestion] = []
+    for task in tasks:
+        rows = benchmark.get(task) or {}
+        if not isinstance(rows, dict):
+            continue
+        for qid, raw in rows.items():
+            if not isinstance(raw, dict):
+                continue
+            options = raw.get("options") or {}
+            if not isinstance(options, dict):
+                continue
+            answer_raw = str(raw.get("answer") or "").strip()
+            if not answer_raw:
+                continue
+            answer_letter = answer_raw[:1].upper()
+            out.append(
+                MirageQuestion(
+                    task=task,
+                    qid=str(qid),
+                    question=str(raw.get("question", "")),
+                    options={str(k): str(v) for k, v in options.items() if v},
+                    correct=answer_letter,
+                )
+            )
+    out.sort(key=lambda q: (q.task, q.qid))
+    if sample_n is not None and sample_n > 0:
+        # Stratified-by-task slice so smoke runs cover every task.
+        per_task = max(1, sample_n // max(1, len(tasks)))
+        sliced: list[MirageQuestion] = []
+        per_task_counter: dict[str, int] = {}
+        for q in out:
+            n = per_task_counter.get(q.task, 0)
+            if n >= per_task:
+                continue
+            sliced.append(q)
+            per_task_counter[q.task] = n + 1
+            if len(sliced) >= sample_n:
+                break
+        out = sliced
+    return out
+
+
+async def _gather_with_limit(coros, *, concurrency: int) -> list[Any]:
+    sem = asyncio.Semaphore(max(1, concurrency))
+
+    async def _wrap(c):
+        async with sem:
+            return await c
+
+    return await asyncio.gather(*(_wrap(c) for c in coros))
+
+
+class MirageBenchmark:
+    suite: str = "medical"
+    name: str = "mirage"
+    headline: bool = False
+    description: str = _DESCRIPTION
+
+    def add_run_args(self, parser: argparse.ArgumentParser) -> None:
+        parser.add_argument(
+            "--task",
+            default="all",
+            choices=("all", *_TASKS),
+            help="Run a single task or all (default: all).",
+        )
+        parser.add_argument("--n", dest="sample_n", type=int, default=None,
+                            help="Stratified sample size across tasks.")
+        parser.add_argument("--concurrency", type=int, default=4)
+        parser.add_argument(
+            "--corpus", default="MedRAG/textbooks",
+            help="HF MedRAG corpus to ingest from (default: MedRAG/textbooks).",
+        )
+        parser.add_argument(
+            "--max-snippets-per-task", type=int, default=None,
+            help="Cap the per-task ingestion to N snippets (smoke).",
+        )
+        # Mutually exclusive: by default we skip the upstream 16 GB
+        # retrievals zip and ingest the entire corpus. Operators who
+        # want the upstream pre-filter pass --use-snippet-filter (and,
+        # if their corpus mismatch warrants the 16 GB transfer,
+        # --allow-large-download).
+        snippet_group = parser.add_mutually_exclusive_group()
+        snippet_group.add_argument(
+            "--use-snippet-filter", dest="use_snippet_filter", action="store_true",
+            default=False,
+            help="Download retrieved_snippets_10k.zip (~16 GB) and "
+                 "filter the corpus to those ids before ingest. "
+                 "Default: skip and ingest entire corpus.",
+        )
+        snippet_group.add_argument(
+            "--skip-snippet-filter", dest="use_snippet_filter", action="store_false",
+            help="(Default) Skip the 16 GB upstream zip; ingest entire corpus.",
+        )
+        parser.add_argument(
+            "--allow-large-download", action="store_true", default=False,
+            help="Permit downloads larger than 2 GB (e.g. retrieved_snippets_10k.zip).",
+        )
+        # Per-upload knobs; ignored at run-time (runner reads the
+        # resolved settings out of the snippet-map manifest header).
+        add_ingest_settings_args(parser, defaults=_DEFAULT_INGEST_SETTINGS)
+
+    async def ingest(self, ctx: RunContext, **opts: Any) -> None:
+        from .ingest import run_ingest
+
+        settings = IngestSettings.merge(_DEFAULT_INGEST_SETTINGS, opts)
+        await run_ingest(
+            ctx,
+            corpus=str(opts.get("corpus") or "MedRAG/textbooks"),
+            max_snippets_per_task=opts.get("max_snippets_per_task"),
+            skip_snippet_filter=not bool(opts.get("use_snippet_filter")),
+            allow_large_download=bool(opts.get("allow_large_download")),
+            settings=settings,
+        )
+
+    async def run(self, ctx: RunContext, **opts: Any) -> RunArtifact:
+        task_filter = opts.get("task") or "all"
+        tasks = list(_TASKS) if task_filter == "all" else [task_filter]
+        sample_n = opts.get("sample_n")
+        concurrency = int(opts.get("concurrency") or 4)
+
+        bench_path = ctx.benchmark_data_dir() / "benchmark.json"
+        if not bench_path.exists():
+            raise RuntimeError(
+                "MIRAGE benchmark.json missing. Run "
+                "`python -m surfsense_evals ingest medical mirage` first."
+            )
+        benchmark = json.loads(bench_path.read_text(encoding="utf-8"))
+        ingest_settings = read_settings_header(
+            ctx.maps_dir() / "mirage_snippet_map.jsonl"
+        )
+        questions = _load_questions(benchmark, tasks=tasks, sample_n=sample_n)
+        if not questions:
+            raise RuntimeError(
+                f"No MIRAGE questions matched task={task_filter!r} sample_n={sample_n!r}."
+            )
+        logger.info("MIRAGE: scheduled %d questions across tasks %s",
+                    len(questions), tasks)
+
+        arm = SurfSenseArm(
+            client=ctx.new_chat_client(),
+            search_space_id=ctx.search_space_id,
+            ephemeral_threads=True,
+        )
+
+        async def _ask(q: MirageQuestion) -> ArmResult:
+            request = ArmRequest(
+                question_id=q.question_id,
+                prompt=build_prompt(q.question, q.options),
+            )
+            return await arm.answer(request)
+
+        results: list[ArmResult] = await _gather_with_limit(
+            (_ask(q) for q in questions), concurrency=concurrency
+        )
+
+        run_timestamp = utc_iso_timestamp()
+        run_dir = ctx.runs_dir(run_timestamp=run_timestamp)
+        raw_path = run_dir / "raw.jsonl"
+        with raw_path.open("w", encoding="utf-8") as fh:
+            for q, res in zip(questions, results):
+                fh.write(
+                    json.dumps(
+                        {
+                            "task": q.task,
+                            "qid": q.qid,
+                            "correct": q.correct,
+                            **res.to_jsonl(),
+                        }
+                    )
+                    + "\n"
+                )
+
+        per_task_acc: dict[str, dict[str, Any]] = {}
+        for task in tasks:
+            n_correct = 0
+            n_total = 0
+            for q, res in zip(questions, results):
+                if q.task != task:
+                    continue
+                n_total += 1
+                if (res.answer_letter or "").upper() == q.correct:
+                    n_correct += 1
+            acc = accuracy_with_wilson_ci(n_correct, n_total)
+            per_task_acc[task] = acc.to_dict()
+
+        macro = macro_accuracy(
+            {t: accuracy_with_wilson_ci(d["n_correct"], d["n_total"]) for t, d in per_task_acc.items()}
+        )
+        metrics = {"per_task": per_task_acc, "macro_accuracy": macro}
+
+        artifact = RunArtifact(
+            suite=self.suite,
+            benchmark=self.name,
+            run_timestamp=run_timestamp,
+            raw_path=raw_path,
+            metrics=metrics,
+            extra={
+                "n_questions": len(questions),
+                "task_filter": task_filter,
+                "concurrency": concurrency,
+                "provider_model": ctx.provider_model,
+                "ingest_settings": ingest_settings,
+            },
+        )
+        manifest_path = run_dir / "run_artifact.json"
+        manifest_path.write_text(
+            json.dumps(
+                {
+                    "suite": self.suite,
+                    "benchmark": self.name,
+                    "raw_path": "raw.jsonl",
+                    "metrics": metrics,
+                    "extra": artifact.extra,
+                },
+                indent=2,
+                sort_keys=True,
+            )
+            + "\n",
+            encoding="utf-8",
+        )
+        return artifact
+
+    def report_section(self, artifacts: list[RunArtifact]) -> ReportSection:
+        if not artifacts:
+            return ReportSection(
+                title="MIRAGE — single-arm SurfSense per-task accuracy",
+                headline=False,
+                body_md="(no run artifacts found)",
+                body_json={},
+            )
+        latest = max(artifacts, key=lambda a: a.run_timestamp)
+        per_task = latest.metrics.get("per_task", {})
+        macro = latest.metrics.get("macro_accuracy", 0.0)
+        lines: list[str] = []
+        lines.append(format_ingest_settings_md(latest.extra.get("ingest_settings")))
+        for task in _TASKS:
+            row = per_task.get(task)
+            if not row:
+                continue
+            acc = row.get("accuracy", 0.0)
+            low = row.get("ci_low", 0.0)
+            high = row.get("ci_high", 0.0)
+            lines.append(
+                f"- {task}: {acc * 100:.1f}% "
+                f"(Wilson 95% CI: {low * 100:.1f}% – {high * 100:.1f}%, "
+                f"n={row.get('n_total', '?')})"
+            )
+        if not lines:
+            lines.append("- (no per-task results)")
+        lines.append(f"- Macro accuracy: {macro * 100:.2f}%")
+        return ReportSection(
+            title="MIRAGE — single-arm SurfSense per-task accuracy",
+            headline=False,
+            body_md="\n".join(lines),
+            body_json=latest.metrics,
+        )
+
+
+__all__ = ["MirageBenchmark", "MirageQuestion"]
diff --git a/surfsense_evals/src/surfsense_evals/suites/multimodal_doc/__init__.py b/surfsense_evals/src/surfsense_evals/suites/multimodal_doc/__init__.py
new file mode 100644
index 000000000..22682ed3f
--- /dev/null
+++ b/surfsense_evals/src/surfsense_evals/suites/multimodal_doc/__init__.py
@@ -0,0 +1,14 @@
+"""Multimodal long-document benchmarks (PDFs with embedded images/charts/tables).
+
+Distinct from the medical suite because these documents are domain-mixed
+(research reports, financials, manuals, government, brochures, papers).
+The hypothesis being tested here is *general*: does SurfSense's
+chunking-based vision RAG preserve information that lives in pixels —
+across long PDFs, across pages — versus feeding the same PDF directly
+to a vision-capable model?
+
+Subpackages register themselves with ``core.registry`` on import. The
+``suites/__init__.py`` discovery walker imports them automatically.
+"""
+
+from __future__ import annotations
diff --git a/surfsense_evals/src/surfsense_evals/suites/multimodal_doc/mmlongbench/__init__.py b/surfsense_evals/src/surfsense_evals/suites/multimodal_doc/mmlongbench/__init__.py
new file mode 100644
index 000000000..1c2bfa84c
--- /dev/null
+++ b/surfsense_evals/src/surfsense_evals/suites/multimodal_doc/mmlongbench/__init__.py
@@ -0,0 +1,19 @@
+"""MMLongBench-Doc — head-to-head Native PDF (vision) vs SurfSense (vision RAG).
+
+Source: https://huggingface.co/datasets/yubo2333/MMLongBench-Doc
+Paper:  https://arxiv.org/abs/2407.01523 (NeurIPS 2024 D&B Track)
+
+* 135 long PDFs (avg 47 pages, multi-modal: text, images, charts, tables)
+* 1,091 expert-annotated questions
+* 33% require evidence from multiple pages
+* ~22% intentionally unanswerable (tests hallucination resistance)
+* 7 document types: research report, tutorial/workshop, academic paper,
+  financial report, brochure, government, manuals
+"""
+
+from __future__ import annotations
+
+from ....core import registry as _registry
+from .runner import MMLongBenchDocBenchmark
+
+_registry.register(MMLongBenchDocBenchmark())
diff --git a/surfsense_evals/src/surfsense_evals/suites/multimodal_doc/mmlongbench/grader.py b/surfsense_evals/src/surfsense_evals/suites/multimodal_doc/mmlongbench/grader.py
new file mode 100644
index 000000000..7edad73eb
--- /dev/null
+++ b/surfsense_evals/src/surfsense_evals/suites/multimodal_doc/mmlongbench/grader.py
@@ -0,0 +1,236 @@
+"""Format-aware grader for MMLongBench-Doc answers.
+
+The dataset ships with five ``answer_format`` values per question:
+
+* ``Str``  — short factoid string
+* ``Int``  — integer count / year
+* ``Float`` — decimal number (often with units stripped)
+* ``List`` — comma- or semicolon-separated bag of items
+* ``None`` — gold answer is literally "Not answerable" (hallucination probe)
+
+The official MMLongBench-Doc paper grades with GPT-4 as judge. We
+implement a *deterministic* rule-based grader as the default (so two
+researchers running the same harness get the same number); an
+LLM-judge mode is exposed via ``--judge gpt5`` and routed through the
+same OpenRouter key the arms use, but is opt-in to keep cost down.
+
+Returned by every grading call:
+
+* ``correct: bool`` — final pass/fail used for accuracy + McNemar
+* ``f1: float``     — token-level F1 (continuous credit, useful when
+  comparing arms that get *most* of a list right)
+* ``method: str``   — which path graded the row (one of
+  ``str_norm`` / ``int_eq`` / ``float_tol`` / ``list_set`` /
+  ``none_match`` / ``llm_judge``).
+"""
+
+from __future__ import annotations
+
+import re
+import string
+from collections import Counter
+from dataclasses import dataclass
+
+# ---------------------------------------------------------------------------
+# Public types
+# ---------------------------------------------------------------------------
+
+
+@dataclass
+class GradeResult:
+    correct: bool
+    f1: float
+    method: str
+    normalised_pred: str = ""
+    normalised_gold: str = ""
+
+
+# ---------------------------------------------------------------------------
+# Normalisation helpers (shared)
+# ---------------------------------------------------------------------------
+
+_PUNCT_TABLE = str.maketrans({c: " " for c in string.punctuation})
+_ARTICLES = re.compile(r"\b(a|an|the)\b", re.IGNORECASE)
+_WS = re.compile(r"\s+")
+_NOT_ANSWERABLE_TOKENS = {
+    "not answerable",
+    "cannot be answered",
+    "cannot answer",
+    "no answer",
+    "unknown",
+    "none",
+    "not specified",
+    "not mentioned",
+    "not provided",
+    "the answer is not in the document",
+}
+
+# Abbreviations that should be matched literally on the lowercased
+# prediction (because normalisation strips their punctuation and
+# leaves them too short to be safe as substring tokens).
+_NOT_ANSWERABLE_LITERAL = {"n/a", "na/", "n.a.", "n a"}
+
+
+def _normalise_text(s: str) -> str:
+    """SQuAD-style normalisation: lowercase, drop punctuation/articles, squash whitespace."""
+
+    s = s.lower()
+    s = s.translate(_PUNCT_TABLE)
+    s = _ARTICLES.sub(" ", s)
+    s = _WS.sub(" ", s).strip()
+    return s
+
+
+# ---------------------------------------------------------------------------
+# Per-format graders
+# ---------------------------------------------------------------------------
+
+
+def _grade_str(pred: str, gold: str) -> GradeResult:
+    p = _normalise_text(pred)
+    g = _normalise_text(gold)
+    if not p:
+        return GradeResult(False, 0.0, "str_norm", p, g)
+    if p == g:
+        return GradeResult(True, 1.0, "str_norm", p, g)
+    # Substring match in either direction = correct (handles the common
+    # "model emits a fuller sentence containing the gold" case).
+    if g and (g in p or p in g):
+        return GradeResult(True, _f1_tokens(p, g), "str_norm", p, g)
+    return GradeResult(False, _f1_tokens(p, g), "str_norm", p, g)
+
+
+_INT_RE = re.compile(r"-?\d[\d,]*")
+
+
+def _grade_int(pred: str, gold: str) -> GradeResult:
+    g_match = _INT_RE.search(gold)
+    if g_match is None:
+        return _grade_str(pred, gold)
+    g_val = int(g_match.group(0).replace(",", ""))
+    p_match = _INT_RE.search(pred)
+    if p_match is None:
+        return GradeResult(False, 0.0, "int_eq", str(p_match), str(g_val))
+    p_val = int(p_match.group(0).replace(",", ""))
+    return GradeResult(p_val == g_val, 1.0 if p_val == g_val else 0.0,
+                       "int_eq", str(p_val), str(g_val))
+
+
+_FLOAT_RE = re.compile(r"-?\d+(?:[.,]\d+)?")
+
+
+def _grade_float(pred: str, gold: str, *, rel_tol: float = 1e-2) -> GradeResult:
+    g_match = _FLOAT_RE.search(gold)
+    if g_match is None:
+        return _grade_str(pred, gold)
+    g_val = float(g_match.group(0).replace(",", "."))
+    p_match = _FLOAT_RE.search(pred)
+    if p_match is None:
+        return GradeResult(False, 0.0, "float_tol", "", str(g_val))
+    p_val = float(p_match.group(0).replace(",", "."))
+    # Tolerance: 1% relative or 0.01 absolute, whichever is looser.
+    abs_diff = abs(p_val - g_val)
+    tol = max(abs(g_val) * rel_tol, 0.01)
+    ok = abs_diff <= tol
+    return GradeResult(ok, 1.0 if ok else 0.0, "float_tol", str(p_val), str(g_val))
+
+
+_LIST_SPLIT = re.compile(r"[;,\n]")
+
+
+def _grade_list(pred: str, gold: str) -> GradeResult:
+    g_items = {_normalise_text(x) for x in _LIST_SPLIT.split(gold) if x.strip()}
+    p_items = {_normalise_text(x) for x in _LIST_SPLIT.split(pred) if x.strip()}
+    if not g_items:
+        return _grade_str(pred, gold)
+    inter = g_items & p_items
+    if not inter:
+        return GradeResult(False, 0.0, "list_set",
+                           ", ".join(sorted(p_items)),
+                           ", ".join(sorted(g_items)))
+    precision = len(inter) / len(p_items) if p_items else 0.0
+    recall = len(inter) / len(g_items)
+    f1 = (2 * precision * recall / (precision + recall)) if (precision + recall) else 0.0
+    return GradeResult(f1 >= 0.999, f1, "list_set",
+                       ", ".join(sorted(p_items)),
+                       ", ".join(sorted(g_items)))
+
+
+def _grade_none(pred: str, gold: str) -> GradeResult:
+    """Gold == 'Not answerable'. The arm earns credit if its prediction
+    expresses inability to answer.
+
+    Two passes:
+
+    1. Literal-substring check on the lowercased+stripped pred for
+       ambiguous abbreviations like ``n/a`` (since normalisation
+       strips the punctuation and would over-match).
+    2. Word-boundary substring check on the normalised pred for the
+       multi-word phrases (``cannot answer``, ``not specified`` etc.).
+    """
+
+    raw_lower = (pred or "").strip().lower()
+    p = _normalise_text(pred)
+    expressed_unknown = False
+
+    # Pass 1: literal abbreviation hits on the raw lowercased text.
+    if any(lit in raw_lower for lit in _NOT_ANSWERABLE_LITERAL):
+        expressed_unknown = True
+
+    # Pass 2: word-boundary check on normalised tokens.
+    if not expressed_unknown:
+        p_padded = f" {p} "
+        for tok_raw in _NOT_ANSWERABLE_TOKENS:
+            tok = _normalise_text(tok_raw)
+            if not tok or len(tok) < 3:
+                continue
+            if f" {tok} " in p_padded:
+                expressed_unknown = True
+                break
+    return GradeResult(
+        expressed_unknown, 1.0 if expressed_unknown else 0.0,
+        "none_match", p, _normalise_text(gold),
+    )
+
+
+def _f1_tokens(pred: str, gold: str) -> float:
+    p_tok = pred.split()
+    g_tok = gold.split()
+    if not p_tok or not g_tok:
+        return 0.0
+    common = Counter(p_tok) & Counter(g_tok)
+    overlap = sum(common.values())
+    if overlap == 0:
+        return 0.0
+    precision = overlap / len(p_tok)
+    recall = overlap / len(g_tok)
+    return 2 * precision * recall / (precision + recall)
+
+
+# ---------------------------------------------------------------------------
+# Public dispatcher
+# ---------------------------------------------------------------------------
+
+
+_FORMAT_DISPATCH = {
+    "str": _grade_str,
+    "int": _grade_int,
+    "float": _grade_float,
+    "list": _grade_list,
+    "none": _grade_none,
+}
+
+
+def grade(*, pred: str, gold: str, answer_format: str) -> GradeResult:
+    """Grade a single (prediction, gold) pair.
+
+    ``answer_format`` is the dataset's ``answer_format`` column value.
+    Unknown / blank values fall through to string grading.
+    """
+
+    fmt = (answer_format or "").strip().lower()
+    fn = _FORMAT_DISPATCH.get(fmt, _grade_str)
+    return fn(pred or "", gold or "")
+
+
+__all__ = ["GradeResult", "grade"]
diff --git a/surfsense_evals/src/surfsense_evals/suites/multimodal_doc/mmlongbench/ingest.py b/surfsense_evals/src/surfsense_evals/suites/multimodal_doc/mmlongbench/ingest.py
new file mode 100644
index 000000000..cf0572df8
--- /dev/null
+++ b/surfsense_evals/src/surfsense_evals/suites/multimodal_doc/mmlongbench/ingest.py
@@ -0,0 +1,365 @@
+"""MMLongBench-Doc ingestion.
+
+Steps:
+
+1. Pull the questions parquet from
+   ``hf://datasets/yubo2333/MMLongBench-Doc/data/`` and cache locally.
+2. Resolve the unique set of ``doc_id`` referenced by questions, and
+   download each PDF from
+   ``hf://datasets/yubo2333/MMLongBench-Doc/documents/<doc_id>``.
+   ``huggingface_hub.hf_hub_download`` is resumable + content-hash
+   verifying; we cache PDFs under ``<data_dir>/multimodal_doc/mmlongbench/pdfs/``.
+3. Upload every PDF to SurfSense via ``DocumentsClient.upload`` with
+   ``use_vision_llm=True`` so SurfSense's Pillow + LiteLLM vision
+   pipeline extracts captions / OCR for embedded images, charts, and
+   tables.
+4. Wait for ``processed`` status and persist
+   ``doc_id -> document_id`` in
+   ``<data_dir>/multimodal_doc/maps/mmlongbench_doc_map.jsonl``.
+
+By default we ingest **all** 135 PDFs (~660 MB, totally manageable).
+Operators can scope to a subset with ``--max-docs N`` if iterating on
+a slow vision pipeline.
+"""
+
+from __future__ import annotations
+
+import json
+import logging
+import os
+from collections.abc import Iterable
+from dataclasses import dataclass
+from pathlib import Path
+
+from ....core.config import set_suite_state
+from ....core.ingest_settings import IngestSettings, settings_header_line
+from ....core.registry import RunContext
+
+logger = logging.getLogger(__name__)
+
+
+HF_REPO_ID = "yubo2333/MMLongBench-Doc"
+HF_REPO_TYPE = "dataset"
+
+# Lazy import: huggingface_hub + pyarrow are heavyweight; keep the
+# benchmark module importable on machines that have only the core
+# install (e.g. CI lint jobs).
+def _hf_hub_download(*args, **kwargs):
+    from huggingface_hub import hf_hub_download
+
+    return hf_hub_download(*args, **kwargs)
+
+
+def _list_repo_files() -> list[str]:
+    from huggingface_hub import list_repo_files
+
+    return list_repo_files(repo_id=HF_REPO_ID, repo_type=HF_REPO_TYPE)
+
+
+# ---------------------------------------------------------------------------
+# Question parquet -> Python rows
+# ---------------------------------------------------------------------------
+
+
+@dataclass
+class MMLongBenchQuestion:
+    doc_id: str          # filename inside the documents/ folder
+    doc_type: str
+    question: str
+    answer: str
+    answer_format: str   # Str / Int / Float / List / None
+    evidence_pages: list[int]
+    evidence_sources: list[str]
+
+
+def _load_questions_from_parquet(parquet_path: Path) -> list[MMLongBenchQuestion]:
+    import pyarrow.parquet as pq
+
+    table = pq.read_table(parquet_path)
+    rows = table.to_pylist()
+    out: list[MMLongBenchQuestion] = []
+    for row in rows:
+        doc_id = str(row.get("doc_id") or "").strip()
+        if not doc_id:
+            continue
+        question = str(row.get("question") or "").strip()
+        if not question:
+            continue
+        out.append(
+            MMLongBenchQuestion(
+                doc_id=doc_id,
+                doc_type=str(row.get("doc_type") or "").strip(),
+                question=question,
+                answer=str(row.get("answer") or "").strip(),
+                answer_format=str(row.get("answer_format") or "").strip(),
+                evidence_pages=_parse_int_list(row.get("evidence_pages")),
+                evidence_sources=_parse_str_list(row.get("evidence_sources")),
+            )
+        )
+    return out
+
+
+def _parse_int_list(raw) -> list[int]:
+    if raw is None:
+        return []
+    if isinstance(raw, list):
+        out = []
+        for x in raw:
+            try:
+                out.append(int(x))
+            except (TypeError, ValueError):
+                continue
+        return out
+    text = str(raw).strip().strip("[]")
+    if not text:
+        return []
+    out: list[int] = []
+    for tok in text.split(","):
+        tok = tok.strip().strip("'\"")
+        if tok.isdigit():
+            out.append(int(tok))
+    return out
+
+
+def _parse_str_list(raw) -> list[str]:
+    if raw is None:
+        return []
+    if isinstance(raw, list):
+        return [str(x).strip().strip("'\"") for x in raw if str(x).strip()]
+    text = str(raw).strip().strip("[]")
+    if not text:
+        return []
+    return [tok.strip().strip("'\"") for tok in text.split(",") if tok.strip()]
+
+
+# ---------------------------------------------------------------------------
+# Download helpers
+# ---------------------------------------------------------------------------
+
+
+def _download_questions_parquet(cache_dir: Path) -> Path:
+    """Download every parquet under ``data/`` and concatenate.
+
+    The HF dataset usually publishes a single ``train`` split, but we
+    enumerate to be robust to repo restructuring.
+    """
+
+    parquet_paths: list[Path] = []
+    files = _list_repo_files()
+    data_files = [f for f in files if f.startswith("data/") and f.endswith(".parquet")]
+    if not data_files:
+        raise RuntimeError(
+            f"No parquet files found under data/ in {HF_REPO_ID}; "
+            f"upstream repo may have been restructured."
+        )
+    for rel in sorted(data_files):
+        local = _hf_hub_download(
+            repo_id=HF_REPO_ID,
+            filename=rel,
+            repo_type=HF_REPO_TYPE,
+            cache_dir=str(cache_dir),
+        )
+        parquet_paths.append(Path(local))
+        logger.info("Cached MMLongBench parquet shard %s -> %s", rel, local)
+    return parquet_paths[0] if len(parquet_paths) == 1 else _merge_parquets(parquet_paths, cache_dir)
+
+
+def _merge_parquets(paths: list[Path], cache_dir: Path) -> Path:
+    """Combine multiple parquet shards into one (rare branch, but correct)."""
+
+    import pyarrow as pa
+    import pyarrow.parquet as pq
+
+    tables = [pq.read_table(p) for p in paths]
+    merged = pa.concat_tables(tables, promote_options="default")
+    out = cache_dir / "merged_questions.parquet"
+    pq.write_table(merged, out)
+    return out
+
+
+def _download_pdf(doc_id: str, cache_dir: Path, pdfs_dir: Path) -> Path:
+    """Download a single PDF (resumable via huggingface_hub cache)."""
+
+    rel = f"documents/{doc_id}"
+    local = _hf_hub_download(
+        repo_id=HF_REPO_ID,
+        filename=rel,
+        repo_type=HF_REPO_TYPE,
+        cache_dir=str(cache_dir),
+    )
+    # Materialise to a stable path inside our data/ tree so the runner
+    # has a deterministic location regardless of HF cache internals.
+    dest = pdfs_dir / doc_id
+    if not dest.exists() or dest.stat().st_size != Path(local).stat().st_size:
+        # Use a hardlink when possible (cheap), fall back to copy.
+        try:
+            if dest.exists():
+                dest.unlink()
+            os.link(local, dest)
+        except OSError:
+            from shutil import copy2
+
+            copy2(local, dest)
+    return dest
+
+
+# ---------------------------------------------------------------------------
+# Upload helpers
+# ---------------------------------------------------------------------------
+
+
+async def _upload_pdfs(
+    ctx: RunContext,
+    pdf_paths: Iterable[Path],
+    *,
+    batch_size: int,
+    settings: IngestSettings,
+) -> dict[str, int]:
+    """Upload PDFs in batches, return ``filename -> document_id`` map."""
+
+    docs_client = ctx.documents_client()
+    name_to_id: dict[str, int] = {}
+    pdf_list = list(pdf_paths)
+    for batch_start in range(0, len(pdf_list), batch_size):
+        batch = pdf_list[batch_start:batch_start + batch_size]
+        result = await docs_client.upload(
+            files=batch,
+            search_space_id=ctx.search_space_id,
+            should_summarize=settings.should_summarize,
+            use_vision_llm=settings.use_vision_llm,
+            processing_mode=settings.processing_mode,
+        )
+        all_ids = list(result.document_ids) + list(result.duplicate_document_ids)
+        if all_ids:
+            await docs_client.wait_until_ready(
+                search_space_id=ctx.search_space_id,
+                document_ids=result.document_ids,  # only newly added need polling
+                timeout_s=1800.0,  # vision pipeline is slow on long PDFs
+            )
+            statuses = await docs_client.get_status(
+                search_space_id=ctx.search_space_id,
+                document_ids=all_ids,
+            )
+            for s in statuses:
+                name_to_id[s.title] = s.document_id
+        logger.info(
+            "Uploaded MMLongBench batch %d-%d: %d new, %d duplicate",
+            batch_start, batch_start + len(batch),
+            len(result.document_ids), len(result.duplicate_document_ids),
+        )
+    return name_to_id
+
+
+# ---------------------------------------------------------------------------
+# Public entry point
+# ---------------------------------------------------------------------------
+
+
+async def run_ingest(
+    ctx: RunContext,
+    *,
+    max_docs: int | None = None,
+    upload_batch_size: int = 8,
+    skip_upload: bool = False,
+    settings: IngestSettings | None = None,
+) -> None:
+    """Ingest MMLongBench-Doc into the multimodal_doc suite.
+
+    Parameters
+    ----------
+    max_docs : int | None
+        Cap the number of PDFs to download + upload. ``None`` = all 135.
+        Useful when iterating on the runner without paying for the full
+        vision pipeline pass each time.
+    upload_batch_size : int
+        How many PDFs to send per ``fileupload`` call. Smaller batches
+        recover faster from individual failures; larger batches reduce
+        round-trip overhead.
+    skip_upload : bool
+        Download + cache PDFs locally but skip SurfSense ingestion.
+        Useful for testing the native arm in isolation.
+    """
+
+    settings = settings or IngestSettings(use_vision_llm=True, processing_mode="basic")
+    bench_dir = ctx.benchmark_data_dir()
+    pdfs_dir = bench_dir / "pdfs"
+    pdfs_dir.mkdir(parents=True, exist_ok=True)
+    hf_cache = bench_dir / ".hf_cache"
+    hf_cache.mkdir(parents=True, exist_ok=True)
+
+    # Step 1: questions
+    parquet_path = _download_questions_parquet(hf_cache)
+    questions = _load_questions_from_parquet(parquet_path)
+    if not questions:
+        raise RuntimeError(
+            "MMLongBench-Doc parquet contains no parseable questions. "
+            "Upstream may have changed schema."
+        )
+
+    # Persist a copy alongside the PDFs so the runner has one place to read.
+    questions_jsonl = bench_dir / "questions.jsonl"
+    with questions_jsonl.open("w", encoding="utf-8") as fh:
+        for q in questions:
+            fh.write(json.dumps({
+                "doc_id": q.doc_id,
+                "doc_type": q.doc_type,
+                "question": q.question,
+                "answer": q.answer,
+                "answer_format": q.answer_format,
+                "evidence_pages": q.evidence_pages,
+                "evidence_sources": q.evidence_sources,
+            }) + "\n")
+    logger.info("Wrote %d MMLongBench questions to %s", len(questions), questions_jsonl)
+
+    # Step 2: download unique PDFs
+    unique_doc_ids = sorted({q.doc_id for q in questions})
+    if max_docs is not None and max_docs > 0:
+        unique_doc_ids = unique_doc_ids[:max_docs]
+    logger.info("MMLongBench: downloading %d unique PDFs", len(unique_doc_ids))
+
+    pdf_paths: dict[str, Path] = {}
+    for i, doc_id in enumerate(unique_doc_ids, start=1):
+        try:
+            pdf_paths[doc_id] = _download_pdf(doc_id, hf_cache, pdfs_dir)
+            if i % 10 == 0:
+                logger.info("  ... %d / %d PDFs cached", i, len(unique_doc_ids))
+        except Exception as exc:  # noqa: BLE001
+            logger.warning("Failed to download MMLongBench PDF %s: %s", doc_id, exc)
+
+    # Step 3: upload to SurfSense
+    name_to_id: dict[str, int] = {}
+    if skip_upload:
+        logger.info("MMLongBench: --skip-upload set; skipping SurfSense ingestion")
+    else:
+        logger.info("MMLongBench upload settings: %s", settings.render_label())
+        name_to_id = await _upload_pdfs(
+            ctx,
+            pdf_paths.values(),
+            batch_size=upload_batch_size,
+            settings=settings,
+        )
+
+    # Step 4: persist doc_id -> document_id manifest
+    map_path = ctx.maps_dir() / "mmlongbench_doc_map.jsonl"
+    with map_path.open("w", encoding="utf-8") as fh:
+        # Header line records the resolved ingest settings
+        # (see core/ingest_settings.py).
+        fh.write(settings_header_line(settings) + "\n")
+        for doc_id in unique_doc_ids:
+            local = pdf_paths.get(doc_id)
+            if local is None:
+                continue
+            fh.write(json.dumps({
+                "doc_id": doc_id,
+                "document_id": name_to_id.get(local.name),
+                "pdf_path": str(local),
+                "n_questions": sum(1 for q in questions if q.doc_id == doc_id),
+            }) + "\n")
+    logger.info("Wrote MMLongBench doc map to %s", map_path)
+
+    new_state = ctx.suite_state
+    new_state.ingestion_maps["mmlongbench"] = str(map_path)
+    set_suite_state(ctx.config, ctx.suite, new_state)
+
+
+__all__ = ["MMLongBenchQuestion", "run_ingest"]
diff --git a/surfsense_evals/src/surfsense_evals/suites/multimodal_doc/mmlongbench/prompt.py b/surfsense_evals/src/surfsense_evals/suites/multimodal_doc/mmlongbench/prompt.py
new file mode 100644
index 000000000..27d6a0d00
--- /dev/null
+++ b/surfsense_evals/src/surfsense_evals/suites/multimodal_doc/mmlongbench/prompt.py
@@ -0,0 +1,60 @@
+"""MMLongBench-Doc prompt template.
+
+Both arms get the same prompt — only the document delivery channel
+differs (native PDF embedded in the OpenRouter request vs SurfSense
+RAG retrieval). The format hint in the prompt mirrors what the
+upstream paper uses so the grader's regex can reliably extract the
+answer.
+"""
+
+from __future__ import annotations
+
+# ---------------------------------------------------------------------------
+# Per-format hint blocks
+# ---------------------------------------------------------------------------
+
+_FORMAT_HINTS: dict[str, str] = {
+    "str": (
+        "Respond with the answer as a short phrase, no full sentence. "
+        "Format your final line as `Answer: <text>`."
+    ),
+    "int": (
+        "Respond with a single integer only. "
+        "Format your final line as `Answer: <integer>`."
+    ),
+    "float": (
+        "Respond with a single decimal number only (no units). "
+        "Format your final line as `Answer: <number>`."
+    ),
+    "list": (
+        "Respond with a comma-separated list of items, no extra text. "
+        "Format your final line as `Answer: item1, item2, item3`."
+    ),
+    "none": (
+        "If the answer cannot be determined from the document, say so explicitly. "
+        "Format your final line as `Answer: Not answerable`."
+    ),
+}
+
+
+_PROMPT = """\
+You are a document-understanding assistant. Use ONLY the provided
+document to answer the question. The document may contain text,
+tables, charts, figures, and images. If the answer is in a chart or
+image, read it carefully. Do not use external knowledge.
+
+Question: {question}
+
+{format_hint}
+"""
+
+
+def build_prompt(question: str, *, answer_format: str) -> str:
+    """Assemble the full prompt for one MMLongBench question."""
+
+    fmt = (answer_format or "str").strip().lower()
+    hint = _FORMAT_HINTS.get(fmt, _FORMAT_HINTS["str"])
+    return _PROMPT.format(question=question.strip(), format_hint=hint)
+
+
+__all__ = ["build_prompt"]
diff --git a/surfsense_evals/src/surfsense_evals/suites/multimodal_doc/mmlongbench/runner.py b/surfsense_evals/src/surfsense_evals/suites/multimodal_doc/mmlongbench/runner.py
new file mode 100644
index 000000000..0e352d7ae
--- /dev/null
+++ b/surfsense_evals/src/surfsense_evals/suites/multimodal_doc/mmlongbench/runner.py
@@ -0,0 +1,704 @@
+"""MMLongBench-Doc runner — head-to-head Native PDF (vision) vs SurfSense (vision RAG).
+
+Differences from a typical MCQ head-to-head:
+
+* Open-ended answers (Str / Int / Float / List / Not-answerable) — uses
+  ``extract_freeform_answer`` instead of ``extract_answer_letter``.
+* Format-aware grader (see ``.grader``) returns both binary correctness
+  (for accuracy / McNemar) and continuous F1 (for nuanced reporting).
+* Native arm requires a vision-capable model — we don't enforce this
+  in code (operator's choice via ``setup --provider-model``) but we
+  emit a warning if the pinned slug looks text-only.
+"""
+
+from __future__ import annotations
+
+import argparse
+import asyncio
+import json
+import logging
+import os
+from collections.abc import Iterable
+from dataclasses import dataclass
+from pathlib import Path
+from typing import Any
+
+from ....core.arms import ArmRequest, ArmResult, NativePdfArm, SurfSenseArm
+from ....core.config import utc_iso_timestamp
+from ....core.ingest_settings import (
+    IngestSettings,
+    add_ingest_settings_args,
+    format_ingest_settings_md,
+    is_settings_header,
+)
+from ....core.metrics.comparison import (
+    bootstrap_delta_ci,
+    mcnemar_test,
+    paired_aggregate,
+)
+from ....core.metrics.mc_accuracy import accuracy_with_wilson_ci
+from ....core.parse.freeform_answer import extract_freeform_answer
+from ....core.providers.openrouter_pdf import OpenRouterPdfProvider, PdfEngine
+from ....core.registry import (
+    ReportSection,
+    RunArtifact,
+    RunContext,
+)
+from ....core.scenarios import format_scenario_md
+from .grader import GradeResult, grade
+from .prompt import build_prompt
+
+logger = logging.getLogger(__name__)
+
+
+# ---------------------------------------------------------------------------
+# Question + map row shapes
+# ---------------------------------------------------------------------------
+
+
+@dataclass
+class MMLBQuestion:
+    qid: str               # synthesised from doc_id + index
+    doc_id: str            # filename inside the documents/ folder
+    doc_type: str
+    question: str
+    gold_answer: str
+    answer_format: str
+    evidence_pages: list[int]
+    evidence_sources: list[str]
+    pdf_path: Path
+    document_id: int | None  # SurfSense doc id (None if upload skipped)
+
+
+def _load_doc_map(map_path: Path) -> tuple[dict[str, dict[str, Any]], dict[str, Any]]:
+    """Read the doc map JSONL.
+
+    Returns ``(rows, settings)`` where ``settings`` is the
+    ``__settings__`` header blob (or ``{}`` for legacy maps).
+    """
+
+    rows: dict[str, dict[str, Any]] = {}
+    settings: dict[str, Any] = {}
+    with map_path.open("r", encoding="utf-8") as fh:
+        for line in fh:
+            line = line.strip()
+            if not line:
+                continue
+            row = json.loads(line)
+            if is_settings_header(row):
+                settings = dict(row["__settings__"])
+                continue
+            rows[str(row["doc_id"])] = row
+    return rows, settings
+
+
+def _load_questions(
+    questions_jsonl: Path,
+    doc_map: dict[str, dict[str, Any]],
+    *,
+    doc_filter: list[str] | None,
+    format_filter: str | None,
+    sample_n: int | None,
+    skip_unanswerable: bool,
+) -> list[MMLBQuestion]:
+    out: list[MMLBQuestion] = []
+    per_doc_counter: dict[str, int] = {}
+    with questions_jsonl.open("r", encoding="utf-8") as fh:
+        for line in fh:
+            line = line.strip()
+            if not line:
+                continue
+            row = json.loads(line)
+            doc_id = str(row.get("doc_id") or "").strip()
+            if not doc_id:
+                continue
+            if doc_filter and doc_id not in doc_filter:
+                continue
+            map_row = doc_map.get(doc_id)
+            if map_row is None:
+                logger.debug("No doc-map entry for %s; skipping", doc_id)
+                continue
+            answer_format = str(row.get("answer_format") or "").strip().lower()
+            if format_filter and format_filter != "all" and format_filter != answer_format:
+                continue
+            gold = str(row.get("answer") or "").strip()
+            if skip_unanswerable and answer_format == "none":
+                continue
+            idx = per_doc_counter.get(doc_id, 0)
+            per_doc_counter[doc_id] = idx + 1
+            out.append(MMLBQuestion(
+                qid=f"{doc_id}::Q{idx:03d}",
+                doc_id=doc_id,
+                doc_type=str(row.get("doc_type") or "").strip(),
+                question=str(row.get("question") or "").strip(),
+                gold_answer=gold,
+                answer_format=answer_format,
+                evidence_pages=list(row.get("evidence_pages") or []),
+                evidence_sources=list(row.get("evidence_sources") or []),
+                pdf_path=Path(map_row["pdf_path"]),
+                document_id=map_row.get("document_id"),
+            ))
+    out.sort(key=lambda q: (q.doc_id, q.qid))
+    if sample_n is not None and sample_n > 0:
+        out = out[:sample_n]
+    return out
+
+
+# ---------------------------------------------------------------------------
+# Bounded concurrency helper
+# ---------------------------------------------------------------------------
+
+
+async def _gather_with_limit(coros: Iterable, *, concurrency: int) -> list[Any]:
+    sem = asyncio.Semaphore(max(1, concurrency))
+
+    async def _wrap(coro):
+        async with sem:
+            return await coro
+
+    return await asyncio.gather(*(_wrap(c) for c in coros))
+
+
+# ---------------------------------------------------------------------------
+# Benchmark
+# ---------------------------------------------------------------------------
+
+
+_DESCRIPTION = (
+    "MMLongBench-Doc (135 long PDFs, 1,091 multimodal questions) — "
+    "Native PDF (vision) vs SurfSense (vision RAG) head-to-head."
+)
+
+
+_TEXT_ONLY_HINTS = ("gpt-5.4-mini", "gpt-3.5", "text-only", "instruct-")
+
+# MMLongBench-Doc PDFs are long documents with figures, charts, and
+# tables. Vision LLM at ingest is the whole point; flip --no-vision-llm
+# to measure how much SurfSense degrades on real document images.
+_DEFAULT_INGEST_SETTINGS = IngestSettings(
+    use_vision_llm=True,
+    processing_mode="basic",
+    should_summarize=False,
+)
+
+
+class MMLongBenchDocBenchmark:
+    """Long-document multimodal RAG vs native vision."""
+
+    suite: str = "multimodal_doc"
+    name: str = "mmlongbench"
+    headline: bool = True
+    description: str = _DESCRIPTION
+
+    def add_run_args(self, parser: argparse.ArgumentParser) -> None:
+        parser.add_argument(
+            "--docs",
+            default=None,
+            help="Comma-separated doc_ids (filenames) to run (default: all).",
+        )
+        parser.add_argument(
+            "--format",
+            default="all",
+            choices=["all", "str", "int", "float", "list", "none"],
+            help="Filter to one answer format. 'none' = unanswerable probes only.",
+        )
+        parser.add_argument(
+            "--n", dest="sample_n", type=int, default=None,
+            help="Run only the first N questions after filters apply.",
+        )
+        parser.add_argument(
+            "--skip-unanswerable", dest="skip_unanswerable", action="store_true",
+            help="Drop ~22%% unanswerable questions (use to compare against baselines that don't include them).",
+        )
+        parser.add_argument(
+            "--concurrency", type=int, default=4,
+            help="Parallel question workers per arm.",
+        )
+        parser.add_argument(
+            "--no-mentions", dest="no_mentions", action="store_true",
+            help="SurfSense arm: skip mentioned_document_ids (unscoped retrieval).",
+        )
+        parser.add_argument(
+            "--pdf-engine", default="native",
+            choices=[e.value for e in PdfEngine],
+            help="OpenRouter file-parser engine for the native arm.",
+        )
+        parser.add_argument(
+            "--max-output-tokens", type=int, default=512,
+            help="Cap on completion length for both arms.",
+        )
+        # Ingest-only knobs (forwarded by the CLI to ingest.run_ingest).
+        parser.add_argument(
+            "--max-docs", dest="max_docs", type=int, default=None,
+            help="(ingest only) cap on number of unique PDFs to download + upload.",
+        )
+        parser.add_argument(
+            "--upload-batch-size", dest="upload_batch_size", type=int, default=8,
+            help="(ingest only) PDFs per fileupload call.",
+        )
+        parser.add_argument(
+            "--skip-upload", dest="skip_upload", action="store_true",
+            help="(ingest only) cache PDFs locally but don't push to SurfSense.",
+        )
+        # Per-upload knobs forwarded to /documents/fileupload at ingest;
+        # ignored at run-time (runner reads the resolved settings out of
+        # the doc-map manifest header).
+        add_ingest_settings_args(parser, defaults=_DEFAULT_INGEST_SETTINGS)
+
+    async def ingest(self, ctx: RunContext, **opts: Any) -> None:
+        from .ingest import run_ingest
+
+        settings = IngestSettings.merge(_DEFAULT_INGEST_SETTINGS, opts)
+        await run_ingest(
+            ctx,
+            max_docs=opts.get("max_docs"),
+            upload_batch_size=int(opts.get("upload_batch_size") or 8),
+            skip_upload=bool(opts.get("skip_upload", False)),
+            settings=settings,
+        )
+
+    async def run(self, ctx: RunContext, **opts: Any) -> RunArtifact:
+        docs_raw: str | None = opts.get("docs")
+        doc_filter = [d.strip() for d in docs_raw.split(",")] if docs_raw else None
+        format_filter = opts.get("format") or "all"
+        sample_n = opts.get("sample_n")
+        skip_unanswerable = bool(opts.get("skip_unanswerable"))
+        concurrency = int(opts.get("concurrency") or 4)
+        no_mentions = bool(opts.get("no_mentions"))
+        pdf_engine_name = opts.get("pdf_engine") or "native"
+        max_output_tokens = int(opts.get("max_output_tokens") or 512)
+
+        bench_dir = ctx.benchmark_data_dir()
+        questions_jsonl = bench_dir / "questions.jsonl"
+        map_path = ctx.maps_dir() / "mmlongbench_doc_map.jsonl"
+        if not questions_jsonl.exists() or not map_path.exists():
+            raise RuntimeError(
+                "MMLongBench-Doc not ingested for this suite. Run "
+                "`python -m surfsense_evals ingest multimodal_doc mmlongbench` first."
+            )
+
+        doc_map, ingest_settings = _load_doc_map(map_path)
+        questions = _load_questions(
+            questions_jsonl, doc_map,
+            doc_filter=doc_filter,
+            format_filter=None if format_filter == "all" else format_filter,
+            sample_n=sample_n,
+            skip_unanswerable=skip_unanswerable,
+        )
+        if not questions:
+            raise RuntimeError(
+                "No MMLongBench questions matched the filters; broaden --docs/--format/--n."
+            )
+        logger.info("MMLongBench-Doc: scheduled %d questions", len(questions))
+
+        api_key = os.environ.get("OPENROUTER_API_KEY")
+        if not api_key:
+            raise RuntimeError(
+                "OPENROUTER_API_KEY env var is required for the native arm."
+            )
+
+        # Native arm slug differs from SurfSense slug only in cost-arbitrage
+        # scenario; otherwise both arms answer with provider_model.
+        native_arm_model = ctx.native_arm_model
+        if any(hint in native_arm_model.lower() for hint in _TEXT_ONLY_HINTS):
+            if ctx.scenario == "symmetric-cheap":
+                logger.info(
+                    "symmetric-cheap: native arm pinned to text-only %r as "
+                    "intended; expect it to lose on image-bearing pages "
+                    "(SurfSense answers from vision-extracted chunks).",
+                    native_arm_model,
+                )
+            else:
+                logger.warning(
+                    "Native arm slug %r looks text-only; image content in "
+                    "PDFs will be ignored. Re-pin via "
+                    "`setup --provider-model anthropic/claude-sonnet-4.5` "
+                    "(or pass --native-arm-model and --scenario cost-arbitrage "
+                    "to make this asymmetry explicit).",
+                    native_arm_model,
+                )
+
+        provider = OpenRouterPdfProvider(
+            api_key=api_key,
+            base_url=ctx.config.openrouter_base_url,
+            model=native_arm_model,
+            engine=PdfEngine(pdf_engine_name),
+        )
+        native_arm = NativePdfArm(provider=provider, max_output_tokens=max_output_tokens)
+        surf_arm = SurfSenseArm(
+            client=ctx.new_chat_client(),
+            search_space_id=ctx.search_space_id,
+            ephemeral_threads=True,
+        )
+
+        run_timestamp = utc_iso_timestamp()
+        run_dir = ctx.runs_dir(run_timestamp=run_timestamp)
+        raw_path = run_dir / "raw.jsonl"
+
+        async def _native_one(q: MMLBQuestion) -> ArmResult:
+            return await native_arm.answer(_make_native_request(q, max_output_tokens))
+
+        async def _surf_one(q: MMLBQuestion) -> ArmResult:
+            return await surf_arm.answer(_make_surfsense_request(q, no_mentions=no_mentions))
+
+        native_results, surf_results = await asyncio.gather(
+            _gather_with_limit((_native_one(q) for q in questions), concurrency=concurrency),
+            _gather_with_limit((_surf_one(q) for q in questions), concurrency=concurrency),
+        )
+
+        native_grades = [_grade_one(q, r) for q, r in zip(questions, native_results, strict=False)]
+        surf_grades = [_grade_one(q, r) for q, r in zip(questions, surf_results, strict=False)]
+
+        with raw_path.open("w", encoding="utf-8") as fh:
+            for q, n_res, s_res, n_g, s_g in zip(
+                questions, native_results, surf_results, native_grades, surf_grades, strict=False
+            ):
+                meta = {
+                    "qid": q.qid,
+                    "doc_id": q.doc_id,
+                    "doc_type": q.doc_type,
+                    "answer_format": q.answer_format,
+                    "gold": q.gold_answer,
+                    "evidence_pages": q.evidence_pages,
+                    "evidence_sources": q.evidence_sources,
+                    "document_id": q.document_id,
+                }
+                fh.write(json.dumps({
+                    **meta,
+                    **n_res.to_jsonl(),
+                    "graded": _grade_to_jsonl(n_g),
+                }) + "\n")
+                fh.write(json.dumps({
+                    **meta,
+                    **s_res.to_jsonl(),
+                    "graded": _grade_to_jsonl(s_g),
+                }) + "\n")
+
+        metrics = _compute_metrics(questions, native_results, surf_results, native_grades, surf_grades)
+        artifact = RunArtifact(
+            suite=self.suite,
+            benchmark=self.name,
+            run_timestamp=run_timestamp,
+            raw_path=raw_path,
+            metrics=metrics,
+            extra={
+                "n_questions": len(questions),
+                "concurrency": concurrency,
+                "format_filter": format_filter,
+                "skip_unanswerable": skip_unanswerable,
+                "no_mentions": no_mentions,
+                "pdf_engine": pdf_engine_name,
+                "scenario": ctx.scenario,
+                "provider_model": ctx.provider_model,
+                "native_arm_model": native_arm_model,
+                "vision_provider_model": ctx.vision_provider_model,
+                "agent_llm_id": ctx.agent_llm_id,
+                "ingest_settings": ingest_settings,
+            },
+        )
+
+        manifest_path = run_dir / "run_artifact.json"
+        manifest_path.write_text(
+            json.dumps({
+                "suite": self.suite,
+                "benchmark": self.name,
+                "raw_path": "raw.jsonl",
+                "metrics": metrics,
+                "extra": artifact.extra,
+            }, indent=2, sort_keys=True) + "\n",
+            encoding="utf-8",
+        )
+        return artifact
+
+    def report_section(self, artifacts: list[RunArtifact]) -> ReportSection:
+        if not artifacts:
+            return ReportSection(
+                title="MMLongBench-Doc — Native PDF (vision) vs SurfSense (vision RAG)",
+                headline=True,
+                body_md="(no run artifacts found)",
+                body_json={},
+            )
+        latest = max(artifacts, key=lambda a: a.run_timestamp)
+        m = latest.metrics
+        native = m.get("native", {})
+        surf = m.get("surfsense", {})
+        delta = m.get("delta", {})
+        per_format = m.get("per_format", {})
+        extra = latest.extra
+
+        body_lines: list[str] = []
+        body_lines.append(
+            f"- Sample size: {extra.get('n_questions', '?')} questions "
+            f"(format filter: `{extra.get('format_filter', 'all')}`, "
+            f"skip-unanswerable: `{extra.get('skip_unanswerable', False)}`, "
+            f"engine: `{extra.get('pdf_engine', 'native')}`)."
+        )
+        body_lines.append(format_scenario_md(extra))
+        body_lines.append(format_ingest_settings_md(extra.get("ingest_settings")))
+        body_lines.append(
+            "- Native arm (OpenRouter `chat/completions` + file plugin, "
+            f"`{extra.get('native_arm_model') or extra.get('provider_model', '?')}`):"
+        )
+        body_lines.append(_arm_summary_lines(native, indent="  "))
+        body_lines.append(
+            "- SurfSense arm (`POST /api/v1/new_chat`, vision RAG over chunks, "
+            f"`{extra.get('provider_model', '?')}`):"
+        )
+        body_lines.append(_arm_summary_lines(surf, indent="  "))
+        body_lines.append("- Delta (paired):")
+        body_lines.append(
+            f"  - Accuracy: SurfSense {_pp(delta.get('accuracy_pp'))} pp "
+            f"(McNemar p={_fmt(delta.get('mcnemar_p_value'), 4)}, "
+            f"method={delta.get('mcnemar_method')})"
+        )
+        body_lines.append(
+            f"  - F1 (mean): SurfSense {_pp(delta.get('f1_pp'))} pp"
+        )
+        body_lines.append(
+            f"  - Bootstrap 95% CI on accuracy delta: "
+            f"[{_pp(delta.get('bootstrap_ci_low'))}pp, {_pp(delta.get('bootstrap_ci_high'))}pp]"
+        )
+        body_lines.append(
+            f"  - Cost / question: native ${_dollars(native.get('cost_micros_mean'))}, "
+            f"surfsense ${_dollars(surf.get('cost_micros_mean'))} "
+            f"(SurfSense delta {_pct_change(delta.get('cost_micros_pct'))})"
+        )
+        body_lines.append(
+            f"  - Latency p50: native {_ms_to_s(native.get('latency_ms_median'))}, "
+            f"surfsense {_ms_to_s(surf.get('latency_ms_median'))} "
+            f"(SurfSense delta {_pct_change(delta.get('latency_ms_pct'))})"
+        )
+        if per_format:
+            body_lines.append("- Per-format split (accuracy delta in pp):")
+            for fmt, vals in sorted(per_format.items()):
+                body_lines.append(
+                    f"  - {fmt}: SurfSense {_pp(vals.get('delta_accuracy_pp'))} pp "
+                    f"(n={vals.get('n')}, native acc={vals.get('native_accuracy', 0)*100:.1f}%, "
+                    f"surf acc={vals.get('surfsense_accuracy', 0)*100:.1f}%)"
+                )
+
+        return ReportSection(
+            title="MMLongBench-Doc — Native PDF (vision) vs SurfSense (vision RAG)",
+            headline=True,
+            body_md="\n".join(body_lines),
+            body_json=m,
+        )
+
+
+# ---------------------------------------------------------------------------
+# Per-question helpers
+# ---------------------------------------------------------------------------
+
+
+def _make_native_request(q: MMLBQuestion, max_tokens: int) -> ArmRequest:
+    prompt = build_prompt(q.question, answer_format=q.answer_format)
+    return ArmRequest(
+        question_id=q.qid,
+        prompt=prompt,
+        pdf_paths=[q.pdf_path],
+        options={"max_tokens": max_tokens},
+    )
+
+
+def _make_surfsense_request(q: MMLBQuestion, *, no_mentions: bool) -> ArmRequest:
+    prompt = build_prompt(q.question, answer_format=q.answer_format)
+    mentions: list[int] | None = None
+    if not no_mentions and q.document_id is not None:
+        mentions = [int(q.document_id)]
+    return ArmRequest(
+        question_id=q.qid,
+        prompt=prompt,
+        mentioned_document_ids=mentions,
+    )
+
+
+def _grade_one(q: MMLBQuestion, result: ArmResult) -> GradeResult:
+    pred_text = extract_freeform_answer(result.raw_text or "")
+    return grade(pred=pred_text, gold=q.gold_answer, answer_format=q.answer_format)
+
+
+def _grade_to_jsonl(g: GradeResult) -> dict[str, Any]:
+    return {
+        "correct": g.correct,
+        "f1": g.f1,
+        "method": g.method,
+        "normalised_pred": g.normalised_pred,
+        "normalised_gold": g.normalised_gold,
+    }
+
+
+# ---------------------------------------------------------------------------
+# Metrics aggregation
+# ---------------------------------------------------------------------------
+
+
+def _compute_metrics(
+    questions: list[MMLBQuestion],
+    native_results: list[ArmResult],
+    surf_results: list[ArmResult],
+    native_grades: list[GradeResult],
+    surf_grades: list[GradeResult],
+) -> dict[str, Any]:
+    native_correct = [g.correct for g in native_grades]
+    surf_correct = [g.correct for g in surf_grades]
+    native_f1 = [g.f1 for g in native_grades]
+    surf_f1 = [g.f1 for g in surf_grades]
+
+    native_costs = [float(r.cost_micros) for r in native_results]
+    surf_costs = [float(r.cost_micros) for r in surf_results]
+    native_latencies = [float(r.latency_ms) for r in native_results]
+    surf_latencies = [float(r.latency_ms) for r in surf_results]
+    native_in_tokens = [float(r.input_tokens) for r in native_results]
+    native_out_tokens = [float(r.output_tokens) for r in native_results]
+
+    native_acc = accuracy_with_wilson_ci(sum(native_correct), len(native_correct))
+    surf_acc = accuracy_with_wilson_ci(sum(surf_correct), len(surf_correct))
+    mc = mcnemar_test(native_correct, surf_correct)
+    boot = bootstrap_delta_ci(native_correct, surf_correct, n_resamples=2000)
+
+    native_cost_agg = paired_aggregate(native_costs)
+    surf_cost_agg = paired_aggregate(surf_costs)
+    native_latency_agg = paired_aggregate(native_latencies)
+    surf_latency_agg = paired_aggregate(surf_latencies)
+
+    cost_pct = _safe_pct(surf_cost_agg.mean, native_cost_agg.mean)
+    latency_pct = _safe_pct(surf_latency_agg.median, native_latency_agg.median)
+
+    per_format_pairs: dict[str, list[tuple[bool, bool]]] = {}
+    for q, n_ok, s_ok in zip(questions, native_correct, surf_correct, strict=False):
+        per_format_pairs.setdefault(q.answer_format or "unknown", []).append((n_ok, s_ok))
+
+    per_format: dict[str, dict[str, Any]] = {}
+    for fmt, pairs in per_format_pairs.items():
+        n_correct = [a for a, _ in pairs]
+        s_correct = [b for _, b in pairs]
+        per_format[fmt] = {
+            "n": len(pairs),
+            "native_accuracy": (sum(n_correct) / len(pairs)) if pairs else 0.0,
+            "surfsense_accuracy": (sum(s_correct) / len(pairs)) if pairs else 0.0,
+            "delta_accuracy_pp": (
+                100.0 * (sum(s_correct) - sum(n_correct)) / len(pairs)
+                if pairs else 0.0
+            ),
+        }
+
+    native_f1_mean = sum(native_f1) / len(native_f1) if native_f1 else 0.0
+    surf_f1_mean = sum(surf_f1) / len(surf_f1) if surf_f1 else 0.0
+
+    return {
+        "native": {
+            **native_acc.to_dict(),
+            "f1_mean": native_f1_mean,
+            "cost_micros_mean": native_cost_agg.mean,
+            "cost_micros_median": native_cost_agg.median,
+            "latency_ms_mean": native_latency_agg.mean,
+            "latency_ms_median": native_latency_agg.median,
+            "latency_ms_p95": native_latency_agg.p95,
+            "input_tokens_mean": (sum(native_in_tokens) / len(native_in_tokens)) if native_in_tokens else 0.0,
+            "output_tokens_mean": (sum(native_out_tokens) / len(native_out_tokens)) if native_out_tokens else 0.0,
+        },
+        "surfsense": {
+            **surf_acc.to_dict(),
+            "f1_mean": surf_f1_mean,
+            "cost_micros_mean": surf_cost_agg.mean,
+            "cost_micros_median": surf_cost_agg.median,
+            "latency_ms_mean": surf_latency_agg.mean,
+            "latency_ms_median": surf_latency_agg.median,
+            "latency_ms_p95": surf_latency_agg.p95,
+        },
+        "delta": {
+            "accuracy_pp": 100.0 * (surf_acc.accuracy - native_acc.accuracy),
+            "f1_pp": 100.0 * (surf_f1_mean - native_f1_mean),
+            "mcnemar_p_value": mc.p_value,
+            "mcnemar_method": mc.method,
+            "mcnemar_b_native_only": mc.b,
+            "mcnemar_c_surfsense_only": mc.c,
+            "bootstrap_ci_low": 100.0 * boot.ci_low,
+            "bootstrap_ci_high": 100.0 * boot.ci_high,
+            "cost_micros_pct": cost_pct,
+            "latency_ms_pct": latency_pct,
+        },
+        "per_format": per_format,
+    }
+
+
+def _safe_pct(numerator: float, denominator: float) -> float | None:
+    if denominator == 0:
+        return None
+    return 100.0 * (numerator - denominator) / denominator
+
+
+# ---------------------------------------------------------------------------
+# Tiny formatting helpers used by report_section
+# ---------------------------------------------------------------------------
+
+
+def _arm_summary_lines(d: dict[str, Any], *, indent: str) -> str:
+    if not d:
+        return f"{indent}(no data)"
+    acc = d.get("accuracy", 0.0)
+    low = d.get("ci_low", 0.0)
+    high = d.get("ci_high", 0.0)
+    f1 = d.get("f1_mean", 0.0)
+    lines = [
+        f"{indent}- Accuracy: {acc * 100:.1f}% (Wilson 95% CI: {low * 100:.1f}% – {high * 100:.1f}%)",
+        f"{indent}- F1 (token-level mean): {f1 * 100:.1f}%",
+        f"{indent}- Cost / question: ${_dollars(d.get('cost_micros_mean'))} (mean), "
+        f"${_dollars(d.get('cost_micros_median'))} (median)",
+        f"{indent}- Latency: p50 {_ms_to_s(d.get('latency_ms_median'))}, "
+        f"p95 {_ms_to_s(d.get('latency_ms_p95'))}",
+    ]
+    if "input_tokens_mean" in d:
+        lines.append(
+            f"{indent}- Mean tokens / question: in {d.get('input_tokens_mean', 0):.0f}, "
+            f"out {d.get('output_tokens_mean', 0):.0f}"
+        )
+    return "\n".join(lines)
+
+
+def _dollars(micros: Any) -> str:
+    if micros is None:
+        return "?"
+    try:
+        return f"{(float(micros) / 1_000_000):.4f}"
+    except (TypeError, ValueError):
+        return "?"
+
+
+def _ms_to_s(ms: Any) -> str:
+    if ms is None:
+        return "?"
+    try:
+        return f"{float(ms) / 1000:.1f}s"
+    except (TypeError, ValueError):
+        return "?"
+
+
+def _pp(value: Any) -> str:
+    if value is None:
+        return "?"
+    try:
+        return f"{float(value):+.1f}"
+    except (TypeError, ValueError):
+        return "?"
+
+
+def _pct_change(value: Any) -> str:
+    if value is None:
+        return "?"
+    try:
+        return f"{float(value):+.0f}%"
+    except (TypeError, ValueError):
+        return "?"
+
+
+def _fmt(value: Any, ndigits: int) -> str:
+    if value is None:
+        return "?"
+    try:
+        return f"{float(value):.{ndigits}f}"
+    except (TypeError, ValueError):
+        return "?"
+
+
+__all__ = ["MMLBQuestion", "MMLongBenchDocBenchmark"]
diff --git a/surfsense_evals/src/surfsense_evals/suites/research/__init__.py b/surfsense_evals/src/surfsense_evals/suites/research/__init__.py
new file mode 100644
index 000000000..03fe24c02
--- /dev/null
+++ b/surfsense_evals/src/surfsense_evals/suites/research/__init__.py
@@ -0,0 +1,18 @@
+"""Research / multi-document RAG benchmarks.
+
+Distinct from ``multimodal_doc`` (PDF-bound) and ``medical`` (one
+question = one source PDF). Benchmarks here put *retrieval and
+reasoning across many documents* in the critical path — the regime
+where SurfSense's chunk-level RAG should shine versus "pour the
+entire document into the LLM" or "ask the LLM cold".
+
+* ``frames`` (google/frames-benchmark) — 824 multi-hop Wikipedia
+  questions; tests bare-LLM vs SurfSense over a shared ~330-doc
+  corpus.
+* ``crag``   (facebookresearch/CRAG, KDD Cup 2024) — 2,706 web QA
+  pairs with 5 pre-retrieved HTML pages each; tests bare-LLM vs
+  long-context-stuffed LLM vs SurfSense over the question's 5
+  scoped pages — the closest comparison to a competing RAG product.
+"""
+
+from __future__ import annotations
diff --git a/surfsense_evals/src/surfsense_evals/suites/research/crag/__init__.py b/surfsense_evals/src/surfsense_evals/suites/research/crag/__init__.py
new file mode 100644
index 000000000..80358c474
--- /dev/null
+++ b/surfsense_evals/src/surfsense_evals/suites/research/crag/__init__.py
@@ -0,0 +1,57 @@
+"""CRAG — Comprehensive RAG Benchmark (Yang et al., Meta, KDD Cup 2024).
+
+Source: https://github.com/facebookresearch/CRAG  (Tasks 1, 2, and 3)
+Paper:  https://arxiv.org/abs/2406.04744
+
+This package registers two siblings:
+
+* ``crag``    — Tasks 1 & 2: 5 candidate pages per question.
+* ``crag_t3`` — Task 3:       50 candidate pages per question. The
+  long-context arm is capped to the top-5 (the realistic "naive
+  RAG = pick top-K results" baseline); SurfSense retrieves over
+  all 50, where its rerank becomes the entire contribution.
+
+Both share the grader, prompt, runner, and report code; only the
+ingest path differs (single bz2 vs 4-part tar.bz2 streamed).
+
+CRAG ships ~2,706 factual QA pairs, each paired with **5 full HTML
+pages** retrieved as the top-5 of a real web search at ``query_time``
+(50 in Task 3).
+The benchmark spans 5 domains (finance, music, movie, sports, open)
+and 8 question types (simple, comparison, aggregation, set, multi-hop,
+post-processing, false_premise, simple_w_condition) — heads/torsos/
+tails of entity popularity — and an explicit static→real-time
+freshness axis.
+
+Why CRAG demonstrates SurfSense more clearly than FRAMES
+--------------------------------------------------------
+FRAMES tested SurfSense vs. *no retrieval at all* — a fair "naive
+prompting" baseline (the published 40.8% number) but not a competing
+RAG product. CRAG enables a three-way comparison:
+
+* ``bare_llm``      — chat completion with the question only. CRAG
+  paper: ≤34% accuracy ("LLM cold").
+* ``long_context``  — stuff all 5 extracted page texts straight into
+  the prompt (the "naive RAG" / "straightforward RAG" arm in the
+  paper). Published baseline: ~44%.
+* ``surfsense``     — POST ``/api/v1/new_chat`` with retrieval scoped
+  to the question's 5 ingested pages (``mentioned_document_ids``).
+
+So the headline becomes "SurfSense vs. context-stuffed long-context
+LLM, both fed the same 5 pages" — a head-to-head against the simplest
+realistic RAG strategy, not against an unarmed model.
+
+Scoring follows the CRAG paper: each prediction is graded as
+**correct** (+1), **missing/I-don't-know** (0), or **incorrect** (-1),
+and the headline metric is the *Truthfulness Score*:
+``(#correct - #incorrect) / total`` — penalising hallucinations
+relative to refusals.
+"""
+
+from __future__ import annotations
+
+from ....core import registry as _registry
+from .runner import CragBenchmark, CragTask3Benchmark
+
+_registry.register(CragBenchmark())
+_registry.register(CragTask3Benchmark())
diff --git a/surfsense_evals/src/surfsense_evals/suites/research/crag/dataset.py b/surfsense_evals/src/surfsense_evals/suites/research/crag/dataset.py
new file mode 100644
index 000000000..224dcae5c
--- /dev/null
+++ b/surfsense_evals/src/surfsense_evals/suites/research/crag/dataset.py
@@ -0,0 +1,335 @@
+"""CRAG dataset loader — download ``crag_task_1_and_2_dev_v4.jsonl.bz2`` and parse.
+
+The CRAG repo (``facebookresearch/CRAG``) ships Tasks 1 & 2 as a
+single bzip2-compressed JSONL on GitHub raw. Each row carries:
+
+* ``interaction_id``    — opaque per-question id (we keep verbatim)
+* ``query_time``        — wall clock of the original web search
+* ``domain``            — finance | music | movie | sports | open
+* ``question_type``     — simple | comparison | aggregation | set |
+                          multi-hop | post-processing | false_premise |
+                          simple_w_condition
+* ``static_or_dynamic`` — static | slow-changing | fast-changing | real-time
+* ``query``             — the question
+* ``answer``            — gold short answer
+* ``alt_ans``           — list[str] of alternative valid answers
+                          (paraphrases / synonyms / unit variants)
+* ``split``             — 0 = validation, 1 = public test
+* ``popularity``        — head | torso | tail (KG questions); empty for web
+* ``search_results``    — list of up to 5 ``{page_name, page_url,
+                          page_snippet, page_result, page_last_modified}``;
+                          ``page_result`` is full HTML.
+
+We materialise this into ``CragQuestion`` objects keeping ``pages`` as
+a list of ``CragPage`` so downstream ingest can save each as its own
+file and SurfSense can dedupe on filename.
+"""
+
+from __future__ import annotations
+
+import bz2
+import hashlib
+import io
+import json
+import logging
+import urllib.request
+from dataclasses import dataclass, field
+from pathlib import Path
+from typing import Any
+
+logger = logging.getLogger(__name__)
+
+
+# Tasks 1 & 2 share the same JSONL on the public CRAG repo.
+CRAG_TASK_1_2_URL = (
+    "https://github.com/facebookresearch/CRAG/raw/refs/heads/main/data/"
+    "crag_task_1_and_2_dev_v4.jsonl.bz2"
+)
+CRAG_TASK_1_2_FILENAME = "crag_task_1_and_2_dev_v4.jsonl.bz2"
+
+
+# ---------------------------------------------------------------------------
+# Question / page dataclasses
+# ---------------------------------------------------------------------------
+
+
+@dataclass
+class CragPage:
+    """One of the up-to-5 pre-retrieved web pages for a CRAG question."""
+
+    page_name: str
+    page_url: str
+    page_snippet: str
+    page_html: str
+    page_last_modified: str | None = None
+
+    @property
+    def url_hash(self) -> str:
+        """Stable 12-hex digest of the page URL for filename keys.
+
+        We can't use the raw URL as a filename (slashes, query strings,
+        unicode), and we *do* want collision-safety across the whole
+        ingest sample. ``sha1[:12]`` gives us 48 bits of namespace
+        which is overkill for a corpus capped at a few thousand pages.
+        """
+
+        return hashlib.sha1(self.page_url.encode("utf-8")).hexdigest()[:12]
+
+
+@dataclass
+class CragQuestion:
+    """One row of CRAG (Tasks 1 & 2)."""
+
+    qid: str                          # synthesised "C00000".."C02705"
+    interaction_id: str
+    query_time: str
+    query: str
+    gold_answer: str
+    alt_answers: list[str]
+    domain: str
+    question_type: str
+    static_or_dynamic: str
+    popularity: str                   # may be "" for web-sourced questions
+    split: int                        # 0=validation, 1=public_test
+    raw_index: int                    # row index in the source JSONL
+    pages: list[CragPage] = field(default_factory=list)
+
+    def to_dict(self) -> dict[str, Any]:
+        return {
+            "qid": self.qid,
+            "interaction_id": self.interaction_id,
+            "query_time": self.query_time,
+            "query": self.query,
+            "gold_answer": self.gold_answer,
+            "alt_answers": list(self.alt_answers),
+            "domain": self.domain,
+            "question_type": self.question_type,
+            "static_or_dynamic": self.static_or_dynamic,
+            "popularity": self.popularity,
+            "split": self.split,
+            "raw_index": self.raw_index,
+            "n_pages": len(self.pages),
+            "page_urls": [p.page_url for p in self.pages],
+        }
+
+
+# ---------------------------------------------------------------------------
+# Download + decompress
+# ---------------------------------------------------------------------------
+
+
+def download_task_1_2(cache_dir: Path) -> Path:
+    """Download the bz2 archive into ``cache_dir`` (skip if cached).
+
+    Returns the path to the local ``.jsonl.bz2``. We use stdlib
+    ``urllib`` rather than ``httpx`` to keep the download synchronous
+    and trivially resumable (re-running the function is a no-op once
+    the file is on disk and non-empty).
+    """
+
+    cache_dir.mkdir(parents=True, exist_ok=True)
+    dest = cache_dir / CRAG_TASK_1_2_FILENAME
+    if dest.exists() and dest.stat().st_size > 0:
+        logger.debug("CRAG bz2 already cached at %s", dest)
+        return dest
+
+    logger.info("Downloading CRAG (Tasks 1 & 2) from %s ...", CRAG_TASK_1_2_URL)
+    tmp = dest.with_suffix(dest.suffix + ".part")
+    req = urllib.request.Request(
+        CRAG_TASK_1_2_URL,
+        headers={"User-Agent": "SurfSense-Evals/0.1 (CRAG dataset fetch)"},
+    )
+    with urllib.request.urlopen(req, timeout=600) as response, tmp.open("wb") as fh:
+        chunk = response.read(1 << 20)
+        while chunk:
+            fh.write(chunk)
+            chunk = response.read(1 << 20)
+    tmp.replace(dest)
+    logger.info("CRAG bz2 downloaded: %s (%.1f MiB)", dest, dest.stat().st_size / 1024 / 1024)
+    return dest
+
+
+# ---------------------------------------------------------------------------
+# Parse
+# ---------------------------------------------------------------------------
+
+
+def _parse_pages(raw_search_results: Any) -> list[CragPage]:
+    if not isinstance(raw_search_results, list):
+        return []
+    pages: list[CragPage] = []
+    for entry in raw_search_results:
+        if not isinstance(entry, dict):
+            continue
+        url = str(entry.get("page_url") or "").strip()
+        html = str(entry.get("page_result") or "")
+        if not url or not html.strip():
+            # No URL or empty HTML => useless for retrieval.
+            continue
+        pages.append(CragPage(
+            page_name=str(entry.get("page_name") or "").strip(),
+            page_url=url,
+            page_snippet=str(entry.get("page_snippet") or "").strip(),
+            page_html=html,
+            page_last_modified=(
+                str(entry.get("page_last_modified")).strip()
+                if entry.get("page_last_modified") else None
+            ),
+        ))
+    return pages
+
+
+def _parse_alt_answers(raw: Any) -> list[str]:
+    if isinstance(raw, list):
+        return [str(x).strip() for x in raw if str(x).strip()]
+    if isinstance(raw, str) and raw.strip():
+        return [raw.strip()]
+    return []
+
+
+def iter_questions(jsonl_bz2_path: Path) -> list[CragQuestion]:
+    """Stream-decompress + parse the CRAG JSONL into ``CragQuestion`` objects.
+
+    The bz2 expansion ratio is ~10x and the decompressed file is
+    multi-GB; we therefore decompress *line by line* via
+    ``bz2.open(..., "rt")``. Each row is a single (potentially very
+    large, due to embedded HTML) JSON object. We keep the entire row
+    in memory because we materialise the pages to disk immediately
+    after parsing in the ingest pipeline — the runner never holds
+    more than the current sample's worth of HTML.
+    """
+
+    out: list[CragQuestion] = []
+    with bz2.open(jsonl_bz2_path, mode="rt", encoding="utf-8") as fh:
+        for raw_idx, line in enumerate(fh):
+            line = line.strip()
+            if not line:
+                continue
+            try:
+                row = json.loads(line)
+            except json.JSONDecodeError as exc:
+                logger.warning("Skipping malformed CRAG row %d: %s", raw_idx, exc)
+                continue
+            query = str(row.get("query") or "").strip()
+            answer = str(row.get("answer") or "").strip()
+            if not query or not answer:
+                logger.debug("Skipping CRAG row %d with missing query/answer", raw_idx)
+                continue
+            interaction_id = str(row.get("interaction_id") or "").strip()
+            pages = _parse_pages(row.get("search_results"))
+            out.append(CragQuestion(
+                qid=f"C{raw_idx:05d}",
+                interaction_id=interaction_id,
+                query_time=str(row.get("query_time") or "").strip(),
+                query=query,
+                gold_answer=answer,
+                alt_answers=_parse_alt_answers(row.get("alt_ans")),
+                domain=str(row.get("domain") or "").strip().lower(),
+                question_type=str(row.get("question_type") or "").strip().lower(),
+                static_or_dynamic=str(row.get("static_or_dynamic") or "").strip().lower(),
+                popularity=str(row.get("popularity") or "").strip().lower(),
+                split=int(row.get("split") or 0),
+                raw_index=raw_idx,
+                pages=pages,
+            ))
+    return out
+
+
+def stratified_sample(
+    questions: list[CragQuestion],
+    *,
+    n: int,
+    seed: int = 17,
+) -> list[CragQuestion]:
+    """Take ``n`` questions that roughly preserve the domain × question-type mix.
+
+    CRAG is only ~2.7k rows so naive head-of-list sampling badly
+    over-weights ``finance`` (because the dataset isn't shuffled by
+    domain). We bucket on ``(domain, question_type)`` and round-robin
+    pick from each bucket until we hit ``n`` — this gives every
+    bucket a fair shot and keeps the sample composition stable across
+    re-runs (deterministic via the seeded shuffle inside each bucket).
+    """
+
+    if n <= 0 or n >= len(questions):
+        return list(questions)
+    import random
+
+    rng = random.Random(seed)
+    buckets: dict[tuple[str, str], list[CragQuestion]] = {}
+    for q in questions:
+        buckets.setdefault((q.domain, q.question_type), []).append(q)
+    for items in buckets.values():
+        rng.shuffle(items)
+
+    keys = sorted(buckets.keys())
+    chosen: list[CragQuestion] = []
+    cursor = 0
+    while len(chosen) < n and any(buckets[k] for k in keys):
+        key = keys[cursor % len(keys)]
+        cursor += 1
+        if buckets[key]:
+            chosen.append(buckets[key].pop())
+    chosen.sort(key=lambda q: q.raw_index)
+    return chosen
+
+
+def write_questions_jsonl(questions: list[CragQuestion], dest: Path) -> None:
+    """Persist a parsed copy (without page HTML) under the benchmark data dir."""
+
+    dest.parent.mkdir(parents=True, exist_ok=True)
+    with dest.open("w", encoding="utf-8") as fh:
+        for q in questions:
+            fh.write(json.dumps(q.to_dict()) + "\n")
+
+
+# ---------------------------------------------------------------------------
+# Reading the lightweight questions.jsonl back
+# ---------------------------------------------------------------------------
+
+
+def load_questions_jsonl(path: Path) -> list[dict[str, Any]]:
+    """Re-load the lightweight (no-HTML) questions JSONL from disk."""
+
+    out: list[dict[str, Any]] = []
+    if not path.exists():
+        return out
+    with path.open("r", encoding="utf-8") as fh:
+        for line in fh:
+            line = line.strip()
+            if not line:
+                continue
+            try:
+                out.append(json.loads(line))
+            except json.JSONDecodeError:
+                continue
+    return out
+
+
+# ---------------------------------------------------------------------------
+# Convenience: decompress a snippet to memory for tests
+# ---------------------------------------------------------------------------
+
+
+def decompress_to_memory(jsonl_bz2_path: Path) -> io.StringIO:
+    """For tests / one-off scripts: read the whole bz2 into a StringIO.
+
+    Avoids leaking gigabytes; use ``iter_questions`` in production.
+    """
+
+    with bz2.open(jsonl_bz2_path, mode="rb") as fh:
+        return io.StringIO(fh.read().decode("utf-8"))
+
+
+__all__ = [
+    "CRAG_TASK_1_2_FILENAME",
+    "CRAG_TASK_1_2_URL",
+    "CragPage",
+    "CragQuestion",
+    "decompress_to_memory",
+    "download_task_1_2",
+    "iter_questions",
+    "load_questions_jsonl",
+    "stratified_sample",
+    "write_questions_jsonl",
+]
diff --git a/surfsense_evals/src/surfsense_evals/suites/research/crag/dataset_task3.py b/surfsense_evals/src/surfsense_evals/suites/research/crag/dataset_task3.py
new file mode 100644
index 000000000..02bed5935
--- /dev/null
+++ b/surfsense_evals/src/surfsense_evals/suites/research/crag/dataset_task3.py
@@ -0,0 +1,263 @@
+"""CRAG Task 3 dataset loader — 4-part tar.bz2 → streaming JSONL.
+
+Task 3 ships ~7 GB of compressed data split into 4 parts on GitHub:
+
+    crag_task_3_dev_v4.tar.bz2.part1    (≈2 GB)
+    crag_task_3_dev_v4.tar.bz2.part2    (≈2 GB)
+    crag_task_3_dev_v4.tar.bz2.part3    (≈2 GB)
+    crag_task_3_dev_v4.tar.bz2.part4    (≈1.3 GB)
+
+Concatenated, they form a tar archive containing a single JSONL file.
+Decompressed, that JSONL is on the order of 30-50 GB because each row
+embeds 50 full HTML pages (vs 5 in Tasks 1 & 2).
+
+Materialising the JSONL would blow the disk budget (we have ~50 GB
+free at the time of writing), so we stream the whole thing instead:
+
+  1. Download parts (idempotent; ``scripts/download_crag_task3.py``).
+  2. Concat them into a virtual file via ``_MultiPartReader``.
+  3. Wrap in ``bz2.BZ2File`` for on-the-fly decompression.
+  4. Wrap in ``tarfile.open(fileobj=..., mode="r|")`` for streaming
+     tar member iteration.
+  5. For the JSONL member inside, ``tar.extractfile()`` returns a
+     binary file-like; we iterate lines and yield parsed dicts.
+
+The caller can ``break`` out as soon as they have enough samples —
+nothing past the consumed point is decompressed.
+
+Schema is identical to Tasks 1 & 2 (see ``dataset.py``); only
+``search_results`` is bigger (50 entries instead of 5).
+"""
+
+from __future__ import annotations
+
+import bz2
+import json
+import logging
+import tarfile
+from collections.abc import Iterator
+from pathlib import Path
+from typing import IO
+
+from .dataset import (
+    CragPage,
+    CragQuestion,
+    _parse_alt_answers,
+    _parse_pages,
+)
+
+logger = logging.getLogger(__name__)
+
+
+CRAG_TASK_3_PART_URLS: tuple[str, ...] = tuple(
+    "https://github.com/facebookresearch/CRAG/raw/refs/heads/main/data/"
+    f"crag_task_3_dev_v4.tar.bz2.part{i}"
+    for i in (1, 2, 3, 4)
+)
+CRAG_TASK_3_PART_NAMES: tuple[str, ...] = tuple(
+    f"crag_task_3_dev_v4.tar.bz2.part{i}" for i in (1, 2, 3, 4)
+)
+
+
+# ---------------------------------------------------------------------------
+# Multi-part virtual file (concatenates N files transparently)
+# ---------------------------------------------------------------------------
+
+
+class _MultiPartReader:
+    """Read N files end-to-end as if they were one big file.
+
+    Implements just enough of the file protocol for ``bz2.BZ2File``
+    to consume it: ``read(n)``, ``readable()``, ``close()``.
+    Doesn't implement ``seek`` — the bz2 + tarfile streaming path
+    is forward-only, which is what we want here.
+    """
+
+    def __init__(self, paths: list[Path]) -> None:
+        if not paths:
+            raise ValueError("_MultiPartReader needs at least one path")
+        for p in paths:
+            if not p.exists():
+                raise FileNotFoundError(p)
+        self._paths = list(paths)
+        self._idx = 0
+        self._fh: IO[bytes] | None = self._paths[0].open("rb")
+        self._closed = False
+
+    def read(self, n: int = -1) -> bytes:
+        if self._closed:
+            raise ValueError("read of closed _MultiPartReader")
+        if n is None or n < 0:
+            chunks: list[bytes] = []
+            while self._fh is not None:
+                chunks.append(self._fh.read())
+                self._advance()
+            return b"".join(chunks)
+        out: list[bytes] = []
+        remaining = n
+        while remaining > 0 and self._fh is not None:
+            chunk = self._fh.read(remaining)
+            if not chunk:
+                self._advance()
+                continue
+            out.append(chunk)
+            remaining -= len(chunk)
+        return b"".join(out)
+
+    def _advance(self) -> None:
+        if self._fh is not None:
+            self._fh.close()
+            self._fh = None
+        self._idx += 1
+        if self._idx < len(self._paths):
+            self._fh = self._paths[self._idx].open("rb")
+
+    def readable(self) -> bool:
+        return not self._closed
+
+    def close(self) -> None:
+        if self._fh is not None:
+            self._fh.close()
+            self._fh = None
+        self._closed = True
+
+    def __enter__(self) -> _MultiPartReader:
+        return self
+
+    def __exit__(self, exc_type, exc, tb) -> None:  # type: ignore[no-untyped-def]
+        self.close()
+
+
+# ---------------------------------------------------------------------------
+# Stream the JSONL inside the tar.bz2
+# ---------------------------------------------------------------------------
+
+
+def _is_jsonl_member(name: str) -> bool:
+    return name.endswith(".jsonl") or name.endswith(".jsonl.txt")
+
+
+def iter_questions_task3(
+    parts_dir: Path,
+    *,
+    max_questions: int | None = None,
+) -> list[CragQuestion]:
+    """Stream-parse Task 3 rows into ``CragQuestion`` objects.
+
+    The Task 3 archive ships its 2,706 questions sharded across
+    multiple JSONL files inside the tar (e.g.
+    ``crag_task_3_dev_v4_0.jsonl``, ``..._1.jsonl``, …). We iterate
+    members in-stream, parse every JSONL one we encounter, and stop
+    as soon as ``max_questions`` is reached — at which point we
+    don't decompress any further members.
+
+    For a typical n=50 sample at ~3 MB per row we touch ~150 MB of
+    decompressed JSONL — almost always inside the first shard.
+    """
+
+    parts = [parts_dir / name for name in CRAG_TASK_3_PART_NAMES]
+    multi = _MultiPartReader(parts)
+    bz = bz2.BZ2File(multi, mode="rb")
+    tar = tarfile.open(fileobj=bz, mode="r|")
+    out: list[CragQuestion] = []
+    raw_idx = 0
+    found_jsonl = False
+    try:
+        for member in tar:
+            if not member.isfile() or not _is_jsonl_member(member.name):
+                continue
+            found_jsonl = True
+            logger.info(
+                "CRAG Task 3: streaming JSONL shard %s (size: %d bytes)",
+                member.name, member.size,
+            )
+            fh = tar.extractfile(member)
+            if fh is None:
+                logger.warning("tar.extractfile returned None for %s; skipping", member.name)
+                continue
+            try:
+                for raw_line in fh:
+                    line = raw_line.decode("utf-8", errors="replace").strip()
+                    if not line:
+                        continue
+                    try:
+                        row = json.loads(line)
+                    except json.JSONDecodeError as exc:
+                        logger.warning(
+                            "Skipping malformed CRAG Task 3 row %d in %s: %s",
+                            raw_idx, member.name, exc,
+                        )
+                        raw_idx += 1
+                        continue
+                    query = str(row.get("query") or "").strip()
+                    answer = str(row.get("answer") or "").strip()
+                    if not query or not answer:
+                        raw_idx += 1
+                        continue
+                    out.append(CragQuestion(
+                        qid=f"T3_{raw_idx:05d}",
+                        interaction_id=str(row.get("interaction_id") or "").strip(),
+                        query_time=str(row.get("query_time") or "").strip(),
+                        query=query,
+                        gold_answer=answer,
+                        alt_answers=_parse_alt_answers(row.get("alt_ans")),
+                        domain=str(row.get("domain") or "").strip().lower(),
+                        question_type=str(row.get("question_type") or "").strip().lower(),
+                        static_or_dynamic=str(row.get("static_or_dynamic") or "").strip().lower(),
+                        popularity=str(row.get("popularity") or "").strip().lower(),
+                        split=int(row.get("split") or 0),
+                        raw_index=raw_idx,
+                        pages=_parse_pages(row.get("search_results")),
+                    ))
+                    raw_idx += 1
+                    if max_questions is not None and len(out) >= max_questions:
+                        return out
+            finally:
+                try:
+                    fh.close()
+                except Exception:  # noqa: BLE001
+                    pass
+        if not found_jsonl:
+            raise RuntimeError(
+                "No JSONL member found inside Task 3 tar.bz2 archive; "
+                "schema may have changed upstream."
+            )
+    finally:
+        try:
+            tar.close()
+        except Exception:  # noqa: BLE001
+            pass
+        try:
+            bz.close()
+        except Exception:  # noqa: BLE001
+            pass
+        try:
+            multi.close()
+        except Exception:  # noqa: BLE001
+            pass
+    return out
+
+
+def parts_present(parts_dir: Path) -> bool:
+    """``True`` iff all 4 parts exist on disk and are non-empty."""
+
+    for name in CRAG_TASK_3_PART_NAMES:
+        p = parts_dir / name
+        if not p.exists() or p.stat().st_size == 0:
+            return False
+    return True
+
+
+# ---------------------------------------------------------------------------
+# Re-exports for convenience
+# ---------------------------------------------------------------------------
+
+
+__all__ = [
+    "CRAG_TASK_3_PART_NAMES",
+    "CRAG_TASK_3_PART_URLS",
+    "CragPage",
+    "CragQuestion",
+    "iter_questions_task3",
+    "parts_present",
+]
diff --git a/surfsense_evals/src/surfsense_evals/suites/research/crag/grader.py b/surfsense_evals/src/surfsense_evals/suites/research/crag/grader.py
new file mode 100644
index 000000000..63f66702b
--- /dev/null
+++ b/surfsense_evals/src/surfsense_evals/suites/research/crag/grader.py
@@ -0,0 +1,540 @@
+"""CRAG 3-class grader: ``correct`` (+1) / ``missing`` (0) / ``incorrect`` (-1).
+
+The CRAG paper's headline metric is the **Truthfulness Score**:
+
+    score = (#correct - #incorrect) / total
+
+which rewards calibrated abstention — refusing to answer is neutral
+(0), guessing wrong is negative (-1). Grading is therefore a 3-class
+problem rather than the 2-class accuracy used for FRAMES.
+
+Pipeline per (pred, gold, alt_ans, question_type):
+
+1. Detect refusal first (``Answer: I don't know`` / "I don't know" /
+   "no information") → ``missing`` (deterministic, never billed).
+2. ``false_premise`` questions: gold is canonically "the question
+   contains a false premise" — reward any answer that flags the
+   false premise (substring "false premise" / "incorrect premise" /
+   "no such") as correct.
+3. Run the FRAMES-style deterministic shortcut (exact / numeric /
+   substring) on ``pred`` against ``gold ∪ alt_ans``. Hit → correct.
+4. Fall through to the LLM judge (if configured), which returns one
+   of ``{correct, missing, incorrect}`` — verbatim CRAG protocol.
+5. No judge configured → record ``incorrect`` (pessimistic but at
+   least monotone with the deterministic grader).
+
+The judge is throttled by an asyncio.Semaphore so it doesn't outrun
+the OpenRouter rate limit; the pre-judge deterministic pass keeps
+the bill bounded (most easy "Beyoncé"-vs-"Beyoncé Knowles" cases
+short-circuit before we burn judge tokens).
+"""
+
+from __future__ import annotations
+
+import asyncio
+import json
+import logging
+import re
+import string
+from collections.abc import Sequence
+from dataclasses import dataclass
+from typing import Any, Literal
+
+from ....core.providers.openrouter_chat import OpenRouterChatProvider
+
+logger = logging.getLogger(__name__)
+
+
+GradeClass = Literal["correct", "missing", "incorrect"]
+
+
+# ---------------------------------------------------------------------------
+# Public type
+# ---------------------------------------------------------------------------
+
+
+@dataclass
+class CragGradeResult:
+    """One graded (pred, gold) pair under CRAG's 3-class rubric."""
+
+    grade: GradeClass
+    score: int                     # +1 / 0 / -1
+    method: str                    # exact, numeric, substring, refusal,
+                                   # false_premise_correct, false_premise_miss,
+                                   # llm_judge, lexical_miss, ...
+    normalised_pred: str = ""
+    normalised_gold: str = ""
+    judge_rationale: str = ""
+
+    @property
+    def correct(self) -> bool:
+        return self.grade == "correct"
+
+    @property
+    def missing(self) -> bool:
+        return self.grade == "missing"
+
+    @property
+    def incorrect(self) -> bool:
+        return self.grade == "incorrect"
+
+    def to_dict(self) -> dict[str, Any]:
+        return {
+            "grade": self.grade,
+            "score": self.score,
+            "method": self.method,
+            "normalised_pred": self.normalised_pred,
+            "normalised_gold": self.normalised_gold,
+            "judge_rationale": self.judge_rationale,
+        }
+
+
+def _grade_to_score(grade: GradeClass) -> int:
+    return {"correct": 1, "missing": 0, "incorrect": -1}[grade]
+
+
+# ---------------------------------------------------------------------------
+# Normalisation
+# ---------------------------------------------------------------------------
+
+
+_PUNCT_TABLE = str.maketrans({c: " " for c in string.punctuation})
+_ARTICLES = re.compile(r"\b(a|an|the)\b", re.IGNORECASE)
+_WS = re.compile(r"\s+")
+
+
+def _normalise(s: str) -> str:
+    s = (s or "").lower()
+    s = s.translate(_PUNCT_TABLE)
+    s = _ARTICLES.sub(" ", s)
+    s = _WS.sub(" ", s).strip()
+    return s
+
+
+_WORD_NUMBERS = {
+    "zero": 0, "one": 1, "two": 2, "three": 3, "four": 4, "five": 5,
+    "six": 6, "seven": 7, "eight": 8, "nine": 9, "ten": 10, "eleven": 11,
+    "twelve": 12, "thirteen": 13, "fourteen": 14, "fifteen": 15, "sixteen": 16,
+    "seventeen": 17, "eighteen": 18, "nineteen": 19, "twenty": 20,
+}
+
+_NUMERIC_RE = re.compile(r"-?\d+(?:[.,]\d+)?")
+
+
+def _maybe_number(s: str) -> float | None:
+    """Extract a single numeric value from raw lowercased text."""
+
+    raw = (s or "").strip().lower()
+    if not raw:
+        return None
+    match = _NUMERIC_RE.search(raw)
+    if match:
+        try:
+            return float(match.group(0).replace(",", ""))
+        except ValueError:
+            pass
+    for tok in _normalise(s).split():
+        if tok in _WORD_NUMBERS:
+            return float(_WORD_NUMBERS[tok])
+    return None
+
+
+def _whole_word_substring(haystack: str, needle: str) -> bool:
+    if not needle:
+        return False
+    return f" {needle} " in f" {haystack} "
+
+
+# ---------------------------------------------------------------------------
+# Refusal detection
+# ---------------------------------------------------------------------------
+
+
+_REFUSAL_PATTERNS = [
+    re.compile(r"\bi\s+don'?t\s+know\b", re.IGNORECASE),
+    re.compile(r"\bi\s+do\s+not\s+know\b", re.IGNORECASE),
+    re.compile(r"\bnot\s+enough\s+information\b", re.IGNORECASE),
+    re.compile(r"\binsufficient\s+information\b", re.IGNORECASE),
+    re.compile(r"\bcannot\s+(?:be\s+)?(?:answered|determined)\b", re.IGNORECASE),
+    re.compile(r"\bunable\s+to\s+(?:answer|determine)\b", re.IGNORECASE),
+    re.compile(r"\bno\s+(?:information|data|evidence)\b", re.IGNORECASE),
+]
+
+
+def _is_refusal(pred: str) -> bool:
+    """Cheap deterministic check for "I don't know" -shaped responses."""
+
+    if not pred or not pred.strip():
+        return True  # empty answer is a de facto refusal
+    return any(p.search(pred) for p in _REFUSAL_PATTERNS)
+
+
+# ---------------------------------------------------------------------------
+# False-premise handling
+# ---------------------------------------------------------------------------
+
+
+_FALSE_PREMISE_PATTERNS = [
+    re.compile(r"false\s+premise", re.IGNORECASE),
+    re.compile(r"incorrect\s+premise", re.IGNORECASE),
+    re.compile(r"premise\s+(?:is|of)\s+the\s+question", re.IGNORECASE),
+    re.compile(r"\bno\s+such\b", re.IGNORECASE),
+    re.compile(r"never\s+(?:happened|occurred|existed)", re.IGNORECASE),
+    re.compile(r"\bdid\s+not\s+(?:happen|occur|exist)\b", re.IGNORECASE),
+    re.compile(r"\bdoes\s+not\s+exist\b", re.IGNORECASE),
+    re.compile(r"is\s+not\s+(?:true|correct|accurate)", re.IGNORECASE),
+    re.compile(r"\bisn'?t\s+(?:true|correct|accurate)\b", re.IGNORECASE),
+    re.compile(r"\binvalid\s+(?:premise|question|assumption)\b", re.IGNORECASE),
+]
+
+
+def _flags_false_premise(pred: str) -> bool:
+    return any(p.search(pred) for p in _FALSE_PREMISE_PATTERNS)
+
+
+# ---------------------------------------------------------------------------
+# Deterministic grader
+# ---------------------------------------------------------------------------
+
+
+def grade_deterministic(
+    *,
+    pred: str,
+    gold: str,
+    alt_answers: Sequence[str] = (),
+    question_type: str = "",
+) -> CragGradeResult:
+    """Try to grade without the LLM judge. Returns a final result.
+
+    Always returns *some* result — the caller checks ``method`` to
+    decide whether the LLM judge should overturn it. ``lexical_miss``
+    and ``false_premise_unclear`` are the two methods that trigger the
+    judge fallback.
+    """
+
+    qtype = (question_type or "").lower()
+    n_pred = _normalise(pred)
+    n_gold = _normalise(gold)
+
+    if _is_refusal(pred):
+        # CRAG protocol: refusal is *missing* (0), even on false-premise
+        # questions where one might argue refusal == correct. We
+        # follow the paper's grading literally.
+        return CragGradeResult(
+            grade="missing",
+            score=0,
+            method="refusal",
+            normalised_pred=n_pred,
+            normalised_gold=n_gold,
+        )
+
+    # Empty-gold guard (shouldn't happen, but defensively):
+    if not n_gold:
+        return CragGradeResult(
+            grade="incorrect",
+            score=-1,
+            method="empty_gold",
+            normalised_pred=n_pred,
+            normalised_gold=n_gold,
+        )
+
+    # False-premise questions: gold is typically "the question contains
+    # a false premise" / "no such X" / similar. Any answer that
+    # explicitly flags the false premise is correct.
+    if qtype == "false_premise":
+        if _flags_false_premise(pred):
+            return CragGradeResult(
+                grade="correct",
+                score=1,
+                method="false_premise_flagged",
+                normalised_pred=n_pred,
+                normalised_gold=n_gold,
+            )
+        # If the model commits to *any* concrete answer on a false-
+        # premise question without flagging the premise, it is wrong.
+        # But we don't classify ourselves — let the judge decide on
+        # the off chance the gold itself is e.g. "no" and the pred
+        # is "no" without explicit "false premise" wording.
+        return CragGradeResult(
+            grade="incorrect",
+            score=-1,
+            method="false_premise_unclear",
+            normalised_pred=n_pred,
+            normalised_gold=n_gold,
+        )
+
+    # All non-false-premise questions: try the standard chain against
+    # gold and each alt answer. First match wins.
+    candidates = [gold, *list(alt_answers)]
+    for candidate in candidates:
+        if not candidate or not str(candidate).strip():
+            continue
+        cand_norm = _normalise(candidate)
+        if not cand_norm:
+            continue
+        if n_pred == cand_norm:
+            return CragGradeResult(
+                grade="correct", score=1, method="exact",
+                normalised_pred=n_pred, normalised_gold=cand_norm,
+            )
+        p_num = _maybe_number(pred)
+        c_num = _maybe_number(candidate)
+        if p_num is not None and c_num is not None:
+            # Pure 1% relative tolerance for CRAG (currency, counts,
+            # ratios). Unlike FRAMES (which uses a 0.5 absolute floor
+            # for year-shaped answers), CRAG's numeric questions are
+            # often small-value (stock prices, percentages) where a
+            # 0.5 floor would let "$2.05" match "$2.17". The judge is
+            # the safety net for borderline rounding cases.
+            tol = abs(c_num) * 0.01
+            if abs(p_num - c_num) <= tol:
+                return CragGradeResult(
+                    grade="correct", score=1, method="numeric",
+                    normalised_pred=n_pred, normalised_gold=cand_norm,
+                )
+            # Numeric question with different numbers — keep looking
+            # at other candidates rather than declaring miss now;
+            # alt answers may include word forms that pass.
+        if _whole_word_substring(n_pred, cand_norm):
+            return CragGradeResult(
+                grade="correct", score=1, method="substring",
+                normalised_pred=n_pred, normalised_gold=cand_norm,
+            )
+        if _whole_word_substring(cand_norm, n_pred) and len(n_pred) >= 3:
+            return CragGradeResult(
+                grade="correct", score=1, method="substring_reverse",
+                normalised_pred=n_pred, normalised_gold=cand_norm,
+            )
+
+    return CragGradeResult(
+        grade="incorrect",
+        score=-1,
+        method="lexical_miss",
+        normalised_pred=n_pred,
+        normalised_gold=n_gold,
+    )
+
+
+# ---------------------------------------------------------------------------
+# LLM-as-judge (3-class)
+# ---------------------------------------------------------------------------
+
+
+_JUDGE_SYSTEM = (
+    "You are an impartial grader for short-answer factual questions, "
+    "following the CRAG benchmark rubric. Given a question, the gold "
+    "answer (and any alternative valid answers), and a model's "
+    "prediction, classify the prediction into exactly one of three "
+    "categories:\n\n"
+    "* \"correct\"   — the prediction expresses the same factual "
+    "content as the gold answer (paraphrasing OK; numbers as words "
+    "OK; partial-but-correct names OK; non-contradictory extra "
+    "detail OK).\n"
+    "* \"missing\"   — the prediction explicitly refuses, says \"I "
+    "don't know\", says there is insufficient information, or hedges "
+    "without committing.\n"
+    "* \"incorrect\" — the prediction commits to a fact that is "
+    "different from the gold answer, or fails to flag a false "
+    "premise when the question contains one.\n\n"
+    "Special case: if the question contains a false premise and the "
+    "gold answer says so, then a prediction that flags the false "
+    "premise is \"correct\".\n\n"
+    "Respond with ONLY a JSON object on a single line:\n"
+    '{\"grade\": \"correct\"|\"missing\"|\"incorrect\", \"rationale\": \"<one short sentence>\"}'
+)
+
+
+_JUDGE_TEMPLATE = """\
+Question: {question}
+Question type: {question_type}
+Gold answer: {gold}
+{alt_block}Model prediction: {pred}
+
+Decide whether the prediction is correct, missing, or incorrect.
+"""
+
+
+@dataclass
+class CragJudgeConfig:
+    api_key: str
+    model: str = "anthropic/claude-sonnet-4.5"
+    base_url: str = "https://openrouter.ai/api/v1"
+    max_tokens: int = 200
+    concurrency: int = 4
+
+
+class CragLlmJudge:
+    """Async LLM judge over OpenRouter chat completions, 3-class output."""
+
+    def __init__(self, *, config: CragJudgeConfig) -> None:
+        self._config = config
+        self._provider = OpenRouterChatProvider(
+            api_key=config.api_key,
+            base_url=config.base_url,
+            model=config.model,
+        )
+        self._sem = asyncio.Semaphore(max(1, config.concurrency))
+
+    @property
+    def model(self) -> str:
+        return self._config.model
+
+    async def judge(
+        self,
+        *,
+        question: str,
+        gold: str,
+        alt_answers: Sequence[str],
+        pred: str,
+        question_type: str = "",
+    ) -> tuple[GradeClass, str]:
+        """Return ``(grade, rationale)``. Errors return incorrect + reason."""
+
+        alt_block = ""
+        if alt_answers:
+            alt_lines = "\n".join(f"  - {a}" for a in alt_answers if a)
+            if alt_lines:
+                alt_block = f"Alternative valid answers:\n{alt_lines}\n"
+        prompt = _JUDGE_TEMPLATE.format(
+            question=question,
+            question_type=question_type or "unknown",
+            gold=gold,
+            alt_block=alt_block,
+            pred=pred,
+        )
+        try:
+            async with self._sem:
+                response = await self._provider.complete(
+                    prompt=prompt,
+                    system_prompt=_JUDGE_SYSTEM,
+                    max_tokens=self._config.max_tokens,
+                )
+        except Exception as exc:  # noqa: BLE001
+            return "incorrect", f"judge_error: {type(exc).__name__}: {exc}"
+        return _parse_judge_response(response.text)
+
+
+def _parse_judge_response(text: str) -> tuple[GradeClass, str]:
+    """Parse the judge reply into a 3-class label + rationale."""
+
+    if not text or not text.strip():
+        return "incorrect", "judge_returned_empty"
+    match = re.search(r"\{[^{}]*\}", text, flags=re.DOTALL)
+    candidate = match.group(0) if match else text
+    try:
+        data = json.loads(candidate)
+    except (json.JSONDecodeError, ValueError):
+        lowered = text.strip().lower()
+        if "correct" in lowered and "incorrect" not in lowered:
+            return "correct", "yes (parser_fallback)"
+        if "missing" in lowered or "i don" in lowered:
+            return "missing", "missing (parser_fallback)"
+        return "incorrect", f"unparseable_judge_response: {text[:200]}"
+    raw_grade = str(data.get("grade") or "").strip().lower()
+    rationale = str(data.get("rationale", "")).strip()[:280]
+    if raw_grade in {"correct", "missing", "incorrect"}:
+        return raw_grade, rationale  # type: ignore[return-value]
+    return "incorrect", f"unknown_grade={raw_grade!r}; {rationale}"
+
+
+# ---------------------------------------------------------------------------
+# Combined grader
+# ---------------------------------------------------------------------------
+
+
+# Methods that should *not* trigger the LLM judge — the deterministic
+# verdict is conclusive (refusal, exact match, numeric mismatch, etc.).
+_TERMINAL_METHODS = frozenset({
+    "refusal",
+    "exact",
+    "numeric",
+    "substring",
+    "substring_reverse",
+    "false_premise_flagged",
+    "empty_gold",
+})
+
+
+async def grade_with_judge(
+    *,
+    pred: str,
+    gold: str,
+    alt_answers: Sequence[str],
+    question: str,
+    question_type: str,
+    judge: CragLlmJudge | None,
+) -> CragGradeResult:
+    """One row → deterministic shortcut → optional LLM judge fallback."""
+
+    det = grade_deterministic(
+        pred=pred,
+        gold=gold,
+        alt_answers=alt_answers,
+        question_type=question_type,
+    )
+    if det.method in _TERMINAL_METHODS:
+        return det
+    if judge is None:
+        return det  # ``lexical_miss`` / ``false_premise_unclear`` → keep as-is
+    grade, rationale = await judge.judge(
+        question=question,
+        gold=gold,
+        alt_answers=alt_answers,
+        pred=pred,
+        question_type=question_type,
+    )
+    return CragGradeResult(
+        grade=grade,
+        score=_grade_to_score(grade),
+        method="llm_judge",
+        normalised_pred=det.normalised_pred,
+        normalised_gold=det.normalised_gold,
+        judge_rationale=rationale,
+    )
+
+
+@dataclass
+class CragGradeRow:
+    """One row to grade. Mirrors the FRAMES grader's tuple but typed."""
+
+    qid: str
+    question: str
+    gold: str
+    alt_answers: list[str]
+    pred: str
+    question_type: str = ""
+
+
+async def grade_many(
+    *,
+    rows: Sequence[CragGradeRow],
+    judge: CragLlmJudge | None,
+) -> list[CragGradeResult]:
+    """Grade every row concurrently. Judge enforces its own concurrency cap."""
+
+    if not rows:
+        return []
+    coros = [
+        grade_with_judge(
+            pred=r.pred,
+            gold=r.gold,
+            alt_answers=r.alt_answers,
+            question=r.question,
+            question_type=r.question_type,
+            judge=judge,
+        )
+        for r in rows
+    ]
+    return list(await asyncio.gather(*coros))
+
+
+__all__ = [
+    "CragGradeResult",
+    "CragGradeRow",
+    "CragJudgeConfig",
+    "CragLlmJudge",
+    "GradeClass",
+    "grade_deterministic",
+    "grade_many",
+    "grade_with_judge",
+]
diff --git a/surfsense_evals/src/surfsense_evals/suites/research/crag/html_extract.py b/surfsense_evals/src/surfsense_evals/suites/research/crag/html_extract.py
new file mode 100644
index 000000000..1b00aedc2
--- /dev/null
+++ b/surfsense_evals/src/surfsense_evals/suites/research/crag/html_extract.py
@@ -0,0 +1,206 @@
+"""HTML → markdown for CRAG pages, with boilerplate removal.
+
+Each CRAG page is a *full* HTML document (nav, ads, recommended-for-
+you, footer, ...). Without removing that boilerplate, retrieval over
+the chunks would surface menu items and "subscribe to our newsletter"
+boxes instead of the actual page content. We use ``trafilatura``,
+which is purpose-built for main-content extraction (the same library
+Common Crawl downstream pipelines use). It outputs clean prose with
+section headers, lists, and tables preserved.
+
+Extraction policy:
+1. ``trafilatura.extract`` with ``output_format="markdown"`` — main
+   content only, headers preserved, tables kept.
+2. If extraction fails or returns < 200 chars (paywalled / JS-only
+   page / extraction confused), fall back to a plain stdlib
+   ``HTMLParser`` that strips tags and collapses whitespace. Some
+   text is better than no text — SurfSense's chunker handles noisy
+   prose.
+
+We *intentionally* keep the page name and URL as visible H1 / link
+metadata so the SurfSense chunker preserves doc identity at the top of
+the first chunk (mirrors what we do for FRAMES Wikipedia pages).
+"""
+
+from __future__ import annotations
+
+import html
+import logging
+import re
+from dataclasses import dataclass
+from html.parser import HTMLParser
+
+logger = logging.getLogger(__name__)
+
+
+_MIN_TRAFILATURA_LENGTH = 200
+_MAX_OUTPUT_CHARS = 200_000  # cap to keep upload payloads sane
+
+
+@dataclass
+class ExtractionResult:
+    """Outcome of converting one HTML blob to plain markdown."""
+
+    text: str
+    method: str          # "trafilatura" | "fallback_strip" | "empty"
+    n_chars: int
+
+    @property
+    def ok(self) -> bool:
+        return self.n_chars > 0
+
+
+# ---------------------------------------------------------------------------
+# Trafilatura wrapper (lazy import so tests / small scripts don't pay)
+# ---------------------------------------------------------------------------
+
+
+def _trafilatura_extract(html_text: str, *, url: str) -> str | None:
+    try:
+        import trafilatura
+    except ImportError:  # pragma: no cover - dependency is required
+        logger.warning("trafilatura not installed; falling back to strip-tags only")
+        return None
+    try:
+        text = trafilatura.extract(
+            html_text,
+            url=url or None,
+            output_format="markdown",
+            include_links=False,
+            include_images=False,
+            include_tables=True,
+            favor_recall=True,
+        )
+    except Exception as exc:  # noqa: BLE001 - trafilatura raises a zoo
+        logger.debug("trafilatura.extract crashed for %s: %s", url, exc)
+        return None
+    if not text:
+        return None
+    return text.strip()
+
+
+# ---------------------------------------------------------------------------
+# Stdlib fallback: strip HTML tags
+# ---------------------------------------------------------------------------
+
+
+class _StripHTMLParser(HTMLParser):
+    """Collect text content, treating block tags as paragraph breaks.
+
+    We deliberately drop ``<script>``, ``<style>``, ``<nav>``,
+    ``<header>``, ``<footer>``, and ``<aside>`` content — these are
+    almost always boilerplate and they are the dominant source of
+    noise SurfSense ends up retrieving against if not removed.
+    """
+
+    _SKIP_TAGS = frozenset({"script", "style", "nav", "header", "footer", "aside", "svg"})
+    _BLOCK_TAGS = frozenset({
+        "p", "div", "section", "article", "li", "ul", "ol",
+        "h1", "h2", "h3", "h4", "h5", "h6", "br", "tr",
+        "td", "th", "table", "blockquote", "pre",
+    })
+
+    def __init__(self) -> None:
+        super().__init__(convert_charrefs=True)
+        self._buffer: list[str] = []
+        self._skip_depth: int = 0
+
+    def handle_starttag(self, tag: str, attrs: list) -> None:  # noqa: ARG002
+        if tag in self._SKIP_TAGS:
+            self._skip_depth += 1
+        if tag in self._BLOCK_TAGS:
+            self._buffer.append("\n")
+
+    def handle_endtag(self, tag: str) -> None:
+        if tag in self._SKIP_TAGS and self._skip_depth > 0:
+            self._skip_depth -= 1
+        if tag in self._BLOCK_TAGS:
+            self._buffer.append("\n")
+
+    def handle_data(self, data: str) -> None:
+        if self._skip_depth:
+            return
+        self._buffer.append(data)
+
+    def get_text(self) -> str:
+        text = "".join(self._buffer)
+        # Decode any leftover entities and collapse whitespace.
+        text = html.unescape(text)
+        text = re.sub(r"[ \t]+", " ", text)
+        text = re.sub(r"\n[ \t]+", "\n", text)
+        text = re.sub(r"\n{3,}", "\n\n", text)
+        return text.strip()
+
+
+def _strip_tags(html_text: str) -> str:
+    parser = _StripHTMLParser()
+    try:
+        parser.feed(html_text)
+    except Exception as exc:  # noqa: BLE001 - HTMLParser is fragile on garbage input
+        logger.debug("HTMLParser failed; using regex strip: %s", exc)
+        no_tags = re.sub(r"<[^>]+>", " ", html_text)
+        return re.sub(r"\s+", " ", html.unescape(no_tags)).strip()
+    return parser.get_text()
+
+
+# ---------------------------------------------------------------------------
+# Public API
+# ---------------------------------------------------------------------------
+
+
+def extract_main_content(
+    html_text: str,
+    *,
+    url: str = "",
+    page_name: str = "",
+    last_modified: str | None = None,
+) -> ExtractionResult:
+    """Convert one HTML blob into clean markdown for ingest.
+
+    The returned ``text`` is prefixed with a small metadata header
+    (``# {page_name}\\n\\nSource: {url}\\n``) so that:
+
+    * SurfSense's chunker has a stable doc-identity anchor at the top
+      of the first chunk (matches what we do for FRAMES Wikipedia).
+    * The retrieval-augmented arm sees the URL inline, which the LLM
+      can surface as a citation if the prompt asks for one.
+    """
+
+    body = ""
+    method = "empty"
+    if html_text and html_text.strip():
+        body = _trafilatura_extract(html_text, url=url) or ""
+        if body and len(body) >= _MIN_TRAFILATURA_LENGTH:
+            method = "trafilatura"
+        else:
+            stripped = _strip_tags(html_text)
+            # Prefer trafilatura output even if short, but only if it
+            # contained any prose at all — empty trafilatura fall-through
+            # to the stripped form.
+            if body and stripped and len(stripped) > len(body) * 1.5:
+                body = stripped
+                method = "fallback_strip"
+            elif not body and stripped:
+                body = stripped
+                method = "fallback_strip"
+            elif body:
+                method = "trafilatura"
+
+    body = body.strip()
+    if len(body) > _MAX_OUTPUT_CHARS:
+        body = body[:_MAX_OUTPUT_CHARS] + "\n\n[...truncated...]"
+
+    if not body:
+        return ExtractionResult(text="", method="empty", n_chars=0)
+
+    title_line = (page_name or url or "Untitled").strip()
+    header_lines = [f"# {title_line}"]
+    if url:
+        header_lines.append(f"Source: {url}")
+    if last_modified:
+        header_lines.append(f"Last modified: {last_modified}")
+    final = "\n".join(header_lines) + "\n\n" + body + "\n"
+    return ExtractionResult(text=final, method=method, n_chars=len(final))
+
+
+__all__ = ["ExtractionResult", "extract_main_content"]
diff --git a/surfsense_evals/src/surfsense_evals/suites/research/crag/ingest.py b/surfsense_evals/src/surfsense_evals/suites/research/crag/ingest.py
new file mode 100644
index 000000000..1a6a1dfa7
--- /dev/null
+++ b/surfsense_evals/src/surfsense_evals/suites/research/crag/ingest.py
@@ -0,0 +1,447 @@
+"""CRAG ingestion: download → extract → upload → per-question doc map.
+
+Steps:
+
+1. Download ``crag_task_1_and_2_dev_v4.jsonl.bz2`` from
+   ``facebookresearch/CRAG`` (skip if cached).
+2. Stream-parse into ``CragQuestion`` objects.
+3. Optionally cap to ``--n-questions N`` (and *stratified* sample
+   across ``(domain, question_type)`` so the smoke / partial run
+   isn't dominated by ``finance`` or ``simple``).
+4. For each question, extract the 5 web pages to clean markdown via
+   ``trafilatura`` and write them to
+   ``<bench_dir>/pages/<qid>__<page_idx>__<url_hash>.md``. The
+   filename is unique across the whole sample (so SurfSense's
+   ``(filename, search_space)`` dedup never collides between
+   questions) and round-trippable (the ``<qid>__`` prefix lets the
+   ingest infer doc-membership at the title level even before we
+   land on a stable status response).
+5. Upload all extracted pages to SurfSense in batches with text-only
+   ETL (``use_vision_llm=False, processing_mode="basic"``) — these
+   are extracted plaintext, no images involved.
+6. Persist a doc map at
+   ``<suite_data>/maps/crag_doc_map.jsonl`` with one row per question:
+
+       {"qid": "C00042",
+        "interaction_id": "<uuid>",
+        "question": "<text>",
+        "gold_answer": "<text>",
+        "alt_answers": [...],
+        "domain": "...", "question_type": "...",
+        "static_or_dynamic": "...", "popularity": "...",
+        "query_time": "...",
+        "page_filenames": ["C00042__0__abc123.md", ...],
+        "document_ids": [42101, 42102, ...],
+        "missing_pages": [...]   # filenames whose upload failed
+       }
+
+The runner uses ``document_ids`` to scope SurfSense retrieval to
+exactly the 5 pages of the question (matches CRAG protocol — the
+benchmark explicitly hands over its own retrieved pages).
+"""
+
+from __future__ import annotations
+
+import asyncio
+import json
+import logging
+import re
+from dataclasses import dataclass
+from pathlib import Path
+from typing import Any
+
+from ....core.clients.documents import (
+    DocumentProcessingFailed,
+    DocumentProcessingTimeout,
+)
+from ....core.config import set_suite_state
+from ....core.ingest_settings import IngestSettings, settings_header_line
+from ....core.registry import RunContext
+from .dataset import (
+    CragPage,
+    CragQuestion,
+    download_task_1_2,
+    iter_questions,
+    stratified_sample,
+    write_questions_jsonl,
+)
+from .html_extract import extract_main_content
+
+logger = logging.getLogger(__name__)
+
+
+_FILENAME_SAFE = re.compile(r"[^A-Za-z0-9._\-]+")
+
+
+def _page_filename(qid: str, page_idx: int, page: CragPage) -> str:
+    """Filesystem-safe, globally unique markdown filename for a CRAG page.
+
+    Format: ``<qid>__<idx>__<url_hash>.md``. Both the qid (``C00042``)
+    and the URL-hash (``[:12]``) are alphanumeric so we don't need to
+    sanitise them, but we strip anything else just in case.
+    """
+
+    qid_safe = _FILENAME_SAFE.sub("_", qid)
+    return f"{qid_safe}__{page_idx:02d}__{page.url_hash}.md"
+
+
+# ---------------------------------------------------------------------------
+# Stats
+# ---------------------------------------------------------------------------
+
+
+@dataclass
+class _IngestStats:
+    n_questions: int
+    n_pages_total: int
+    n_pages_extracted: int
+    n_pages_empty: int
+    n_uploaded: int
+    n_existing: int
+    bench_dir: Path
+    map_path: Path
+
+
+# ---------------------------------------------------------------------------
+# Page extraction
+# ---------------------------------------------------------------------------
+
+
+def _materialise_pages(
+    questions: list[CragQuestion],
+    *,
+    pages_dir: Path,
+    overwrite: bool = False,
+) -> tuple[dict[str, list[str]], dict[str, str]]:
+    """Extract every page in every question to ``pages_dir`` as markdown.
+
+    Returns:
+      * ``qid -> [filename, filename, ...]`` (in page order, only
+        successful extractions)
+      * ``filename -> source_url`` for diagnostics
+
+    Empty extractions (paywall / JS / parse-fail with no fallback
+    output) are skipped — better to retrieve from 4 pages than feed
+    SurfSense's chunker an empty file.
+    """
+
+    pages_dir.mkdir(parents=True, exist_ok=True)
+    qid_to_files: dict[str, list[str]] = {}
+    file_to_url: dict[str, str] = {}
+    method_counts: dict[str, int] = {}
+    n_empty = 0
+
+    for q in questions:
+        names: list[str] = []
+        for idx, page in enumerate(q.pages):
+            filename = _page_filename(q.qid, idx, page)
+            dest = pages_dir / filename
+            if dest.exists() and dest.stat().st_size > 0 and not overwrite:
+                method_counts["cache_hit"] = method_counts.get("cache_hit", 0) + 1
+                names.append(filename)
+                file_to_url[filename] = page.page_url
+                continue
+            result = extract_main_content(
+                page.page_html,
+                url=page.page_url,
+                page_name=page.page_name,
+                last_modified=page.page_last_modified,
+            )
+            method_counts[result.method] = method_counts.get(result.method, 0) + 1
+            if not result.ok:
+                n_empty += 1
+                continue
+            dest.write_text(result.text, encoding="utf-8")
+            names.append(filename)
+            file_to_url[filename] = page.page_url
+        qid_to_files[q.qid] = names
+
+    logger.info(
+        "CRAG page extraction: %s; empty=%d, total_files=%d across %d questions",
+        method_counts, n_empty, len(file_to_url), len(qid_to_files),
+    )
+    return qid_to_files, file_to_url
+
+
+# ---------------------------------------------------------------------------
+# Upload
+# ---------------------------------------------------------------------------
+
+
+async def _upload_pages(
+    ctx: RunContext,
+    *,
+    pages_dir: Path,
+    filenames: list[str],
+    batch_size: int,
+    settings: IngestSettings,
+) -> dict[str, int]:
+    """Upload ``filenames`` (already on disk under ``pages_dir``) and return name → doc_id."""
+
+    if not filenames:
+        return {}
+    docs_client = ctx.documents_client()
+    name_to_id: dict[str, int] = {}
+    paths = [pages_dir / fn for fn in filenames if (pages_dir / fn).exists()]
+
+    for batch_start in range(0, len(paths), batch_size):
+        batch = paths[batch_start : batch_start + batch_size]
+        result = await docs_client.upload(
+            files=batch,
+            search_space_id=ctx.search_space_id,
+            should_summarize=settings.should_summarize,
+            use_vision_llm=settings.use_vision_llm,
+            processing_mode=settings.processing_mode,
+        )
+        all_ids = list(result.document_ids) + list(result.duplicate_document_ids)
+        if result.document_ids:
+            try:
+                await docs_client.wait_until_ready(
+                    search_space_id=ctx.search_space_id,
+                    document_ids=result.document_ids,
+                    timeout_s=900.0,
+                )
+            except (DocumentProcessingFailed, DocumentProcessingTimeout) as exc:
+                logger.warning("CRAG batch processing issue: %s", exc)
+        if all_ids:
+            statuses = await docs_client.get_status(
+                search_space_id=ctx.search_space_id,
+                document_ids=all_ids,
+            )
+            for s in statuses:
+                stem = Path(s.title).stem if s.title.endswith(".md") else s.title
+                name_to_id[stem] = s.document_id
+                name_to_id[s.title] = s.document_id
+                if not s.title.endswith(".md"):
+                    name_to_id[f"{s.title}.md"] = s.document_id
+        logger.info(
+            "CRAG upload batch %d-%d: %d new, %d duplicate",
+            batch_start, batch_start + len(batch),
+            len(result.document_ids), len(result.duplicate_document_ids),
+        )
+    return name_to_id
+
+
+# ---------------------------------------------------------------------------
+# Doc map writer
+# ---------------------------------------------------------------------------
+
+
+def _resolve_question_doc_ids(
+    questions: list[CragQuestion],
+    qid_to_files: dict[str, list[str]],
+    name_to_id: dict[str, int],
+) -> list[dict[str, Any]]:
+    rows: list[dict[str, Any]] = []
+    for q in questions:
+        filenames = qid_to_files.get(q.qid, [])
+        doc_ids: list[int] = []
+        missing: list[str] = []
+        for fn in filenames:
+            stem = Path(fn).stem
+            doc_id = name_to_id.get(stem) or name_to_id.get(fn)
+            if doc_id is not None and doc_id not in doc_ids:
+                doc_ids.append(doc_id)
+            else:
+                missing.append(fn)
+        rows.append({
+            "qid": q.qid,
+            "interaction_id": q.interaction_id,
+            "raw_index": q.raw_index,
+            "question": q.query,
+            "gold_answer": q.gold_answer,
+            "alt_answers": list(q.alt_answers),
+            "domain": q.domain,
+            "question_type": q.question_type,
+            "static_or_dynamic": q.static_or_dynamic,
+            "popularity": q.popularity,
+            "query_time": q.query_time,
+            "split": q.split,
+            "page_filenames": filenames,
+            "document_ids": doc_ids,
+            "missing_pages": missing,
+            "n_pages": len(filenames),
+        })
+    return rows
+
+
+# ---------------------------------------------------------------------------
+# Public entry
+# ---------------------------------------------------------------------------
+
+
+async def run_ingest(
+    ctx: RunContext,
+    *,
+    n_questions: int | None = None,
+    upload_batch_size: int = 16,
+    skip_upload: bool = False,
+    overwrite_extract: bool = False,
+    settings: IngestSettings | None = None,
+    sample_seed: int = 17,
+) -> None:
+    """Ingest the CRAG benchmark (Tasks 1 & 2) into the research suite.
+
+    Parameters
+    ----------
+    n_questions
+        Cap on the number of CRAG questions to materialise.
+        ``None`` = all 2,706 (~13,500 pages — large; smoke runs
+        should pass 10-20 and full runs ~200).
+    upload_batch_size
+        Markdown files per ``/documents/fileupload`` call.
+    skip_upload
+        Extract + cache markdown locally but don't push to SurfSense
+        (useful for debugging the extraction step).
+    overwrite_extract
+        Re-run trafilatura even when a cached markdown file exists.
+        Default False so re-running ingest is idempotent.
+    settings
+        Override per-upload knobs. CRAG defaults to text-only basic
+        ETL — these are *extracted* plaintext, no images.
+    sample_seed
+        RNG seed for ``stratified_sample``. Pin this for reproducibility.
+    """
+
+    settings = settings or IngestSettings(
+        use_vision_llm=False,
+        processing_mode="basic",
+        should_summarize=False,
+    )
+    bench_dir = ctx.benchmark_data_dir()
+    pages_dir = bench_dir / "pages"
+    raw_cache = bench_dir / ".raw_cache"
+    raw_cache.mkdir(parents=True, exist_ok=True)
+
+    bz2_path = download_task_1_2(raw_cache)
+    logger.info("CRAG: parsing %s ...", bz2_path.name)
+    all_questions = iter_questions(bz2_path)
+    if not all_questions:
+        raise RuntimeError(
+            "CRAG JSONL contained no parseable rows; upstream may have changed schema."
+        )
+    logger.info("CRAG: parsed %d total questions", len(all_questions))
+
+    if n_questions is not None and n_questions > 0:
+        questions = stratified_sample(all_questions, n=n_questions, seed=sample_seed)
+        logger.info(
+            "CRAG: stratified sample of %d questions across %d (domain, qtype) buckets",
+            len(questions),
+            len({(q.domain, q.question_type) for q in questions}),
+        )
+    else:
+        questions = all_questions
+
+    questions_jsonl = bench_dir / "questions.jsonl"
+    write_questions_jsonl(questions, questions_jsonl)
+
+    n_pages_total = sum(len(q.pages) for q in questions)
+    logger.info(
+        "CRAG: extracting up to %d pages across %d questions ...",
+        n_pages_total, len(questions),
+    )
+    qid_to_files, file_to_url = _materialise_pages(
+        questions, pages_dir=pages_dir, overwrite=overwrite_extract,
+    )
+    n_pages_extracted = sum(len(v) for v in qid_to_files.values())
+
+    name_to_id: dict[str, int] = {}
+    if skip_upload:
+        logger.info("CRAG: --skip-upload; skipping SurfSense ingestion")
+    else:
+        all_filenames = sorted({fn for fns in qid_to_files.values() for fn in fns})
+        logger.info("CRAG: uploading %d unique pages ...", len(all_filenames))
+        name_to_id = await _upload_pages(
+            ctx,
+            pages_dir=pages_dir,
+            filenames=all_filenames,
+            batch_size=upload_batch_size,
+            settings=settings,
+        )
+
+    doc_rows = _resolve_question_doc_ids(questions, qid_to_files, name_to_id)
+    map_path = ctx.maps_dir() / "crag_doc_map.jsonl"
+    with map_path.open("w", encoding="utf-8") as fh:
+        fh.write(settings_header_line(settings) + "\n")
+        for row in doc_rows:
+            fh.write(json.dumps(row) + "\n")
+    logger.info("Wrote CRAG doc map to %s (%d rows)", map_path, len(doc_rows))
+
+    new_state = ctx.suite_state
+    new_state.ingestion_maps["crag"] = str(map_path)
+    set_suite_state(ctx.config, ctx.suite, new_state)
+
+    stats = _IngestStats(
+        n_questions=len(questions),
+        n_pages_total=n_pages_total,
+        n_pages_extracted=n_pages_extracted,
+        n_pages_empty=n_pages_total - n_pages_extracted,
+        n_uploaded=len(name_to_id),
+        n_existing=0,
+        bench_dir=bench_dir,
+        map_path=map_path,
+    )
+    logger.info("CRAG ingest done: %s", stats)
+
+
+# ---------------------------------------------------------------------------
+# For runner: read extracted page text back from disk
+# ---------------------------------------------------------------------------
+
+
+def read_page_markdown(bench_dir: Path, filename: str) -> str | None:
+    """Return the on-disk markdown body for a previously-extracted page.
+
+    Used by the long-context runner arm to assemble the prompt at
+    inference time — we don't keep all 5×N pages in memory between
+    ingest and run.
+    """
+
+    path = bench_dir / "pages" / filename
+    if not path.exists():
+        return None
+    try:
+        return path.read_text(encoding="utf-8")
+    except OSError:
+        return None
+
+
+async def _retry_upload_idempotent(  # noqa: D401 - hidden helper
+    ctx: RunContext,
+    *,
+    pages_dir: Path,
+    filenames: list[str],
+    batch_size: int,
+    settings: IngestSettings,
+    max_attempts: int = 2,
+) -> dict[str, int]:
+    """Future-proofing hook (unused today): retry the ingest upload pass."""
+
+    last_exc: Exception | None = None
+    for attempt in range(max_attempts):
+        try:
+            return await _upload_pages(
+                ctx,
+                pages_dir=pages_dir,
+                filenames=filenames,
+                batch_size=batch_size,
+                settings=settings,
+            )
+        except Exception as exc:  # noqa: BLE001
+            last_exc = exc
+            logger.warning("CRAG upload attempt %d failed: %s", attempt + 1, exc)
+            await asyncio.sleep(2.0 * (attempt + 1))
+    if last_exc is not None:
+        raise last_exc
+    return {}
+
+
+__all__ = [
+    "_IngestStats",
+    "_materialise_pages",
+    "_page_filename",
+    "_resolve_question_doc_ids",
+    "_upload_pages",
+    "read_page_markdown",
+    "run_ingest",
+]
diff --git a/surfsense_evals/src/surfsense_evals/suites/research/crag/ingest_task3.py b/surfsense_evals/src/surfsense_evals/suites/research/crag/ingest_task3.py
new file mode 100644
index 000000000..e5440f382
--- /dev/null
+++ b/surfsense_evals/src/surfsense_evals/suites/research/crag/ingest_task3.py
@@ -0,0 +1,191 @@
+"""CRAG Task 3 ingestion: 4-part download → streaming JSONL → upload.
+
+Same flow as ``ingest.run_ingest`` for Tasks 1 & 2 (extract HTML →
+upload markdown → resolve doc_ids → write doc map), but:
+
+* Source: 4 .tar.bz2 parts streamed via ``dataset_task3``.
+* Page count: 50 per question instead of 5 — the whole point of
+  Task 3 (the long-context arm now structurally has to choose what
+  to keep, while SurfSense's retrieval becomes mandatory).
+* Stratified sampling re-uses the Task 1 helper since the question
+  schema is identical.
+
+Doc map lands at ``<suite_data>/maps/crag_t3_doc_map.jsonl`` with the
+same row shape as Task 1's map (so the runner only needs to know
+which file to load; everything else is shared).
+"""
+
+from __future__ import annotations
+
+import json
+import logging
+from pathlib import Path
+
+from ....core.config import set_suite_state
+from ....core.ingest_settings import IngestSettings, settings_header_line
+from ....core.registry import RunContext
+from .dataset import stratified_sample, write_questions_jsonl
+from .dataset_task3 import (
+    CRAG_TASK_3_PART_NAMES,
+    iter_questions_task3,
+    parts_present,
+)
+from .ingest import (
+    _IngestStats,
+    _materialise_pages,
+    _resolve_question_doc_ids,
+    _upload_pages,
+)
+
+logger = logging.getLogger(__name__)
+
+
+_INSTRUCTIONS_TO_DOWNLOAD = (
+    "Run `python scripts/download_crag_task3.py` first to fetch the "
+    "4 tar.bz2 parts (~7 GB total) into "
+    "data/research/crag_t3/.raw_cache/. The downloader is idempotent "
+    "and parallel."
+)
+
+
+async def run_ingest_task3(
+    ctx: RunContext,
+    *,
+    n_questions: int | None = None,
+    upload_batch_size: int = 16,
+    skip_upload: bool = False,
+    overwrite_extract: bool = False,
+    settings: IngestSettings | None = None,
+    sample_seed: int = 17,
+    parse_cap: int | None = None,
+) -> None:
+    """Ingest CRAG Task 3 (50 pages per question) into the research suite.
+
+    Parameters
+    ----------
+    n_questions
+        Cap on the post-stratified-sample question count. ``None`` =
+        "use whatever ``parse_cap`` produced". For real runs aim for
+        50 (~2,500 pages) — n=200 (10k pages) is doable but slow.
+    parse_cap
+        Hard cap on how many rows we *parse* from the streaming
+        archive before stratified sampling. Defaults to
+        ``max(400, 6*n_questions)`` — enough to cover all (domain,
+        question_type) buckets ~5x but small enough to fit in the
+        first shard or two (each shard is ≈5 GB decompressed and
+        holds ~300 rows; bz2 throughput is ~50 MB/s). Lowering this
+        is the only knob that bounds streaming cost since we can
+        ``break`` out of the JSONL stream early without decompressing
+        the rest of the ~50 GB archive body.
+    upload_batch_size
+        Markdown files per ``/documents/fileupload`` call.
+    skip_upload
+        Extract markdown locally, don't push to SurfSense.
+    overwrite_extract
+        Re-run trafilatura even when a cached markdown is present.
+    settings
+        Per-upload knobs override (default: text-only basic ETL).
+    sample_seed
+        RNG seed for stratified sampling (deterministic).
+    """
+
+    settings = settings or IngestSettings(
+        use_vision_llm=False,
+        processing_mode="basic",
+        should_summarize=False,
+    )
+    bench_dir = ctx.benchmark_data_dir()
+    pages_dir = bench_dir / "pages"
+    raw_cache = bench_dir / ".raw_cache"
+    raw_cache.mkdir(parents=True, exist_ok=True)
+
+    if not parts_present(raw_cache):
+        missing = [
+            n for n in CRAG_TASK_3_PART_NAMES
+            if not (raw_cache / n).exists()
+        ]
+        raise RuntimeError(
+            f"CRAG Task 3 parts missing from {raw_cache}: {missing}. "
+            f"{_INSTRUCTIONS_TO_DOWNLOAD}"
+        )
+
+    # 1. Stream-parse (capped). For n=50 we don't need the full 2,706
+    #    rows — just enough that the stratified sampler can balance.
+    #    Each tar shard ~5 GB / ~300 rows / ~2 min decompress, so
+    #    400-500 rows = shard 0 + a slice of shard 1 ≈ 3-4 min.
+    parse_cap = parse_cap or (
+        max(400, 6 * (n_questions or 50)) if n_questions else None
+    )
+    logger.info(
+        "CRAG Task 3: streaming JSONL (parse_cap=%s) ...",
+        parse_cap if parse_cap else "no-cap",
+    )
+    all_questions = iter_questions_task3(raw_cache, max_questions=parse_cap)
+    logger.info("CRAG Task 3: parsed %d rows", len(all_questions))
+
+    if not all_questions:
+        raise RuntimeError("CRAG Task 3 streaming returned 0 rows; check archive integrity.")
+
+    if n_questions is not None and n_questions > 0:
+        questions = stratified_sample(all_questions, n=n_questions, seed=sample_seed)
+        logger.info(
+            "CRAG Task 3: stratified sample of %d questions across %d (domain, qtype) buckets",
+            len(questions),
+            len({(q.domain, q.question_type) for q in questions}),
+        )
+    else:
+        questions = all_questions
+
+    questions_jsonl = bench_dir / "questions.jsonl"
+    write_questions_jsonl(questions, questions_jsonl)
+
+    n_pages_total = sum(len(q.pages) for q in questions)
+    logger.info(
+        "CRAG Task 3: extracting up to %d pages across %d questions ...",
+        n_pages_total, len(questions),
+    )
+    qid_to_files, _file_to_url = _materialise_pages(
+        questions, pages_dir=pages_dir, overwrite=overwrite_extract,
+    )
+    n_pages_extracted = sum(len(v) for v in qid_to_files.values())
+
+    name_to_id: dict[str, int] = {}
+    if skip_upload:
+        logger.info("CRAG Task 3: --skip-upload; skipping SurfSense ingestion")
+    else:
+        all_filenames = sorted({fn for fns in qid_to_files.values() for fn in fns})
+        logger.info("CRAG Task 3: uploading %d unique pages ...", len(all_filenames))
+        name_to_id = await _upload_pages(
+            ctx,
+            pages_dir=pages_dir,
+            filenames=all_filenames,
+            batch_size=upload_batch_size,
+            settings=settings,
+        )
+
+    doc_rows = _resolve_question_doc_ids(questions, qid_to_files, name_to_id)
+    map_path = ctx.maps_dir() / "crag_t3_doc_map.jsonl"
+    with map_path.open("w", encoding="utf-8") as fh:
+        fh.write(settings_header_line(settings) + "\n")
+        for row in doc_rows:
+            fh.write(json.dumps(row) + "\n")
+    logger.info("Wrote CRAG Task 3 doc map to %s (%d rows)", map_path, len(doc_rows))
+
+    new_state = ctx.suite_state
+    new_state.ingestion_maps["crag_t3"] = str(map_path)
+    set_suite_state(ctx.config, ctx.suite, new_state)
+
+    stats = _IngestStats(
+        n_questions=len(questions),
+        n_pages_total=n_pages_total,
+        n_pages_extracted=n_pages_extracted,
+        n_pages_empty=n_pages_total - n_pages_extracted,
+        n_uploaded=len(name_to_id),
+        n_existing=0,
+        bench_dir=bench_dir,
+        map_path=map_path,
+    )
+    logger.info("CRAG Task 3 ingest done: %s", stats)
+
+
+__all__ = ["run_ingest_task3"]
diff --git a/surfsense_evals/src/surfsense_evals/suites/research/crag/prompt.py b/surfsense_evals/src/surfsense_evals/suites/research/crag/prompt.py
new file mode 100644
index 000000000..626834505
--- /dev/null
+++ b/surfsense_evals/src/surfsense_evals/suites/research/crag/prompt.py
@@ -0,0 +1,146 @@
+"""CRAG prompt templates for the three competing arms.
+
+The CRAG paper grades each prediction as one of:
+
+* **correct**   — answer matches gold (with paraphrasing tolerance)
+* **missing**   — model refuses or says "I don't know"
+* **incorrect** — model commits to a wrong answer (hallucination)
+
+The truthfulness score `(correct - incorrect) / total` rewards
+calibrated abstention, so the prompts below explicitly *invite* the
+model to refuse when it isn't confident — otherwise the bare-LLM arm
+gets penalised twice (no docs *and* a no-refusal prompt) and the
+comparison stops being fair to the LLM-only baseline.
+
+Three templates, byte-identical instructions:
+
+* ``build_bare_prompt(q)``         — question-only.
+* ``build_long_context_prompt(q, contexts)`` — question + concatenated
+  page extracts, all stuffed into the user message. Mirrors the
+  paper's "straightforward RAG" baseline.
+* ``build_surfsense_prompt(q)``    — question + a hint that retrieval
+  over the question's 5 ingested pages is available; the SurfSense
+  agent itself owns the retrieval step.
+
+The ``Answer:`` line at the end is parsed by ``extract_freeform_answer``
+in the runner, so the format is mandatory.
+"""
+
+from __future__ import annotations
+
+
+_BASE_INSTRUCTIONS = (
+    "You are a careful question-answering assistant. The question is a "
+    "real-world factual question that may be about finance, music, "
+    "movies, sports, or any other domain.\n\n"
+    "Important rules:\n"
+    "1. If the question contains a false premise (an assumption that "
+    "is factually wrong), say so explicitly in your final answer "
+    "rather than answering as if the premise were true.\n"
+    "2. If you are not confident in an answer, prefer saying \"I don't "
+    "know\" over guessing. A wrong commit is penalised more than a "
+    "refusal.\n"
+    "3. Keep the final answer short — a name, a number, a date, a "
+    "phrase. Do not repeat the question.\n\n"
+    "Format your final line EXACTLY as:\n"
+    "Answer: <short answer>\n"
+    "If you don't know, write `Answer: I don't know`."
+)
+
+
+_BARE_TEMPLATE = """\
+{instructions}
+
+Question: {question}
+Question time: {query_time}
+"""
+
+
+_SURFSENSE_TEMPLATE = """\
+{instructions}
+
+You have access to a search index of up to 5 web pages that were
+retrieved for this question. Use the retrieval tool to look up any
+facts you are not confident about. The pages may be partially or fully
+relevant; some may contradict each other (prefer the more authoritative
+or more recent source).
+
+Question: {question}
+Question time: {query_time}
+"""
+
+
+_LONG_CONTEXT_TEMPLATE = """\
+{instructions}
+
+You are given the full text of {n_contexts} web pages that were
+retrieved for this question. Read all of them, then answer. The
+pages may be partially or fully relevant; some may contradict each
+other (prefer the more authoritative or more recent source).
+
+{contexts}
+
+Question: {question}
+Question time: {query_time}
+"""
+
+
+def build_bare_prompt(question: str, *, query_time: str = "") -> str:
+    """Prompt for the no-retrieval baseline arm."""
+
+    return _BARE_TEMPLATE.format(
+        instructions=_BASE_INSTRUCTIONS,
+        question=question.strip(),
+        query_time=query_time.strip() or "unknown",
+    )
+
+
+def build_surfsense_prompt(question: str, *, query_time: str = "") -> str:
+    """Prompt for the SurfSense arm (agent does retrieval itself)."""
+
+    return _SURFSENSE_TEMPLATE.format(
+        instructions=_BASE_INSTRUCTIONS,
+        question=question.strip(),
+        query_time=query_time.strip() or "unknown",
+    )
+
+
+def build_long_context_prompt(
+    question: str,
+    *,
+    contexts: list[tuple[str, str]],
+    query_time: str = "",
+    per_page_char_cap: int = 12_000,
+) -> str:
+    """Prompt for the "stuff all pages into the prompt" baseline.
+
+    ``contexts`` is a list of ``(page_title_or_url, page_text)`` pairs.
+    Each page is truncated at ``per_page_char_cap`` (default 12k chars
+    ≈ 3k tokens) so a 5-page CRAG question fits well under any
+    modern long-context window with room for the question + reasoning.
+    """
+
+    blocks: list[str] = []
+    for idx, (title, text) in enumerate(contexts, start=1):
+        body = (text or "").strip()
+        if len(body) > per_page_char_cap:
+            body = body[:per_page_char_cap].rstrip() + "\n[...truncated...]"
+        title_clean = (title or f"page_{idx}").strip().replace("\n", " ")
+        blocks.append(
+            f"--- PAGE {idx}: {title_clean} ---\n{body}\n"
+        )
+    contexts_block = "\n".join(blocks) if blocks else "(no pages retrieved)"
+    return _LONG_CONTEXT_TEMPLATE.format(
+        instructions=_BASE_INSTRUCTIONS,
+        n_contexts=len(contexts),
+        contexts=contexts_block,
+        question=question.strip(),
+        query_time=query_time.strip() or "unknown",
+    )
+
+
+__all__ = [
+    "build_bare_prompt",
+    "build_long_context_prompt",
+    "build_surfsense_prompt",
+]
diff --git a/surfsense_evals/src/surfsense_evals/suites/research/crag/runner.py b/surfsense_evals/src/surfsense_evals/suites/research/crag/runner.py
new file mode 100644
index 000000000..d6ba49294
--- /dev/null
+++ b/surfsense_evals/src/surfsense_evals/suites/research/crag/runner.py
@@ -0,0 +1,1053 @@
+"""CRAG runner — Bare LLM vs Long-Context LLM vs SurfSense.
+
+Three arms run paired on every question in the sample. All three
+answer with the same model (CRAG is a head-to-head benchmark, not a
+cost-arbitrage benchmark). The arms differ only in *what they see*:
+
+1. ``bare_llm``      — chat completion with the question only
+   (paper baseline ≤34%).
+2. ``long_context``  — same model, but the user message also includes
+   the extracted text of all 5 web pages (paper baseline ~44%).
+3. ``surfsense``     — POST ``/api/v1/new_chat`` with retrieval scoped
+   to the question's 5 ingested pages via ``mentioned_document_ids``.
+   The agent retrieves and reasons; we only grade the final answer.
+
+Grading: 3-class CRAG rubric — correct/missing/incorrect — with
+deterministic shortcuts and an LLM-as-judge fallback. Headline is
+the **truthfulness score** ``(#correct - #incorrect) / total``, the
+metric the CRAG paper and KDD Cup 2024 leaderboard use.
+
+We keep paired stats (McNemar + bootstrap CI) on the **correct**
+flag for each arm pair (long_context vs bare, surfsense vs
+long_context, surfsense vs bare) so the report can call out exactly
+where the lift comes from. Per-domain and per-question-type breakdowns
+surface where SurfSense beats long-context (e.g. multi-hop / set
+questions where retrieval-then-reason wins over stuff-it-all-in).
+"""
+
+from __future__ import annotations
+
+import argparse
+import asyncio
+import json
+import logging
+import os
+from collections.abc import Iterable
+from dataclasses import dataclass, field
+from pathlib import Path
+from typing import Any
+
+from ....core.arms import ArmRequest, ArmResult, BareLlmArm, SurfSenseArm
+from ....core.config import utc_iso_timestamp
+from ....core.ingest_settings import (
+    IngestSettings,
+    add_ingest_settings_args,
+    format_ingest_settings_md,
+    is_settings_header,
+)
+from ....core.metrics.comparison import (
+    bootstrap_delta_ci,
+    mcnemar_test,
+    paired_aggregate,
+)
+from ....core.metrics.mc_accuracy import accuracy_with_wilson_ci
+from ....core.parse.freeform_answer import extract_freeform_answer
+from ....core.providers.openrouter_chat import OpenRouterChatProvider
+from ....core.registry import ReportSection, RunArtifact, RunContext
+from ....core.scenarios import format_scenario_md
+from .grader import (
+    CragGradeResult,
+    CragGradeRow,
+    CragJudgeConfig,
+    CragLlmJudge,
+    grade_many,
+)
+from .ingest import read_page_markdown
+from .prompt import (
+    build_bare_prompt,
+    build_long_context_prompt,
+    build_surfsense_prompt,
+)
+
+logger = logging.getLogger(__name__)
+
+
+# ---------------------------------------------------------------------------
+# Question shape (post-ingest)
+# ---------------------------------------------------------------------------
+
+
+@dataclass
+class CragRunnerQuestion:
+    qid: str
+    raw_index: int
+    question: str
+    gold_answer: str
+    alt_answers: list[str]
+    domain: str
+    question_type: str
+    static_or_dynamic: str
+    popularity: str
+    query_time: str
+    page_filenames: list[str]
+    document_ids: list[int]
+    missing_pages: list[str] = field(default_factory=list)
+
+
+def _load_doc_map(map_path: Path) -> tuple[list[dict[str, Any]], dict[str, Any]]:
+    rows: list[dict[str, Any]] = []
+    settings: dict[str, Any] = {}
+    with map_path.open("r", encoding="utf-8") as fh:
+        for line in fh:
+            line = line.strip()
+            if not line:
+                continue
+            row = json.loads(line)
+            if is_settings_header(row):
+                settings = dict(row["__settings__"])
+                continue
+            rows.append(row)
+    return rows, settings
+
+
+def _filter_questions(
+    rows: list[dict[str, Any]],
+    *,
+    sample_n: int | None,
+    domain_filter: str | None,
+    qtype_filter: str | None,
+) -> list[CragRunnerQuestion]:
+    out: list[CragRunnerQuestion] = []
+    for row in rows:
+        domain = str(row.get("domain") or "").lower()
+        qtype = str(row.get("question_type") or "").lower()
+        if domain_filter and domain_filter != domain:
+            continue
+        if qtype_filter and qtype_filter not in qtype:
+            continue
+        out.append(CragRunnerQuestion(
+            qid=str(row.get("qid") or "").strip(),
+            raw_index=int(row.get("raw_index") or 0),
+            question=str(row.get("question") or "").strip(),
+            gold_answer=str(row.get("gold_answer") or "").strip(),
+            alt_answers=list(row.get("alt_answers") or []),
+            domain=domain,
+            question_type=qtype,
+            static_or_dynamic=str(row.get("static_or_dynamic") or "").lower(),
+            popularity=str(row.get("popularity") or "").lower(),
+            query_time=str(row.get("query_time") or "").strip(),
+            page_filenames=list(row.get("page_filenames") or []),
+            document_ids=list(row.get("document_ids") or []),
+            missing_pages=list(row.get("missing_pages") or []),
+        ))
+    out.sort(key=lambda q: q.raw_index)
+    if sample_n is not None and sample_n > 0:
+        out = out[:sample_n]
+    return out
+
+
+# ---------------------------------------------------------------------------
+# Concurrency helper
+# ---------------------------------------------------------------------------
+
+
+async def _gather_with_limit(coros: Iterable, *, concurrency: int) -> list[Any]:
+    sem = asyncio.Semaphore(max(1, concurrency))
+
+    async def _wrap(coro):
+        async with sem:
+            return await coro
+
+    return await asyncio.gather(*(_wrap(c) for c in coros))
+
+
+# ---------------------------------------------------------------------------
+# Benchmark
+# ---------------------------------------------------------------------------
+
+
+_DESCRIPTION = (
+    "CRAG (Comprehensive RAG Benchmark, Meta KDD Cup 2024) — three "
+    "arms (Bare LLM / Long-Context LLM / SurfSense) over the same "
+    "5-page-per-question CRAG corpus. Tests competitive RAG vs naive "
+    "context-stuffing; CRAG truthfulness score is the headline metric."
+)
+
+
+_DEFAULT_INGEST_SETTINGS = IngestSettings(
+    use_vision_llm=False,
+    processing_mode="basic",
+    should_summarize=False,
+)
+
+
+class CragBenchmark:
+    """3-arm CRAG runner: bare vs long-context vs SurfSense."""
+
+    suite: str = "research"
+    name: str = "crag"
+    headline: bool = True
+    description: str = _DESCRIPTION
+
+    # Subclasses (e.g. Task 3) override these without re-implementing run().
+    doc_map_filename: str = "crag_doc_map.jsonl"
+    # 0 = use ALL pages in the long-context arm. Task 3 defaults to 5
+    # so the long-context arm models the realistic "stuff the top-5
+    # search results into the prompt" baseline rather than blowing
+    # past the 128k-token context window with all 50 pages.
+    default_long_context_top_n: int = 0
+    pages_per_question_label: str = "5 pages"
+    ingest_hint: str = (
+        "`python -m surfsense_evals ingest research crag --n-questions 200`"
+    )
+
+    def add_run_args(self, parser: argparse.ArgumentParser) -> None:
+        parser.add_argument(
+            "--n", dest="sample_n", type=int, default=None,
+            help="Run only the first N questions after filters.",
+        )
+        parser.add_argument(
+            "--domain", dest="domain_filter", default=None,
+            help="Filter to a single CRAG domain (finance|music|movie|sports|open).",
+        )
+        parser.add_argument(
+            "--qtype", dest="qtype_filter", default=None,
+            help=(
+                "Filter to questions whose question_type contains this "
+                "substring (case-insensitive). Examples: 'multi-hop', "
+                "'comparison', 'false_premise'."
+            ),
+        )
+        parser.add_argument(
+            "--concurrency", type=int, default=4,
+            help="Parallel question workers per arm.",
+        )
+        parser.add_argument(
+            "--max-output-tokens", type=int, default=512,
+            help="Cap on completion length for the chat-completion arms.",
+        )
+        parser.add_argument(
+            "--per-page-char-cap", dest="per_page_char_cap", type=int, default=12_000,
+            help="Long-context arm: max chars per page before truncation (default 12k).",
+        )
+        parser.add_argument(
+            "--long-context-top-n-pages", dest="long_context_top_n_pages",
+            type=int, default=self.default_long_context_top_n,
+            help=(
+                "Long-context arm: keep only the first N pages from the "
+                "question's candidate list (0 = use all). Task 3 defaults "
+                "to 5 (the realistic 'naive RAG' top-K baseline)."
+            ),
+        )
+        parser.add_argument(
+            "--skip-bare", dest="skip_bare", action="store_true",
+            help="Skip the bare-LLM arm (saves cost on re-runs).",
+        )
+        parser.add_argument(
+            "--skip-long-context", dest="skip_long_context", action="store_true",
+            help="Skip the long-context arm.",
+        )
+        parser.add_argument(
+            "--skip-surfsense", dest="skip_surfsense", action="store_true",
+            help="Skip the SurfSense arm (useful when iterating on the LLM arms only).",
+        )
+        parser.add_argument(
+            "--no-mention-scope", dest="no_mention_scope", action="store_true",
+            help=(
+                "SurfSense arm: don't pass mentioned_document_ids; let "
+                "the agent retrieve over the entire SearchSpace. Default "
+                "is to scope to the question's 5 ingested pages "
+                "(matches CRAG protocol)."
+            ),
+        )
+        parser.add_argument(
+            "--no-judge", dest="no_judge", action="store_true",
+            help="Disable the LLM-as-judge fallback grader.",
+        )
+        parser.add_argument(
+            "--judge-model", dest="judge_model",
+            default="anthropic/claude-sonnet-4.5",
+            help="OpenRouter slug for the LLM judge.",
+        )
+        parser.add_argument(
+            "--judge-concurrency", dest="judge_concurrency", type=int, default=4,
+            help="Parallel judge calls.",
+        )
+        # Ingest knobs
+        parser.add_argument(
+            "--n-questions", dest="n_questions", type=int, default=None,
+            help="(ingest only) cap on number of questions to materialise + ingest.",
+        )
+        parser.add_argument(
+            "--upload-batch-size", dest="upload_batch_size", type=int, default=16,
+            help="(ingest only) markdown files per fileupload call.",
+        )
+        parser.add_argument(
+            "--skip-upload", dest="skip_upload", action="store_true",
+            help="(ingest only) extract pages locally but don't push to SurfSense.",
+        )
+        parser.add_argument(
+            "--overwrite-extract", dest="overwrite_extract", action="store_true",
+            help="(ingest only) re-run trafilatura even when cached markdown exists.",
+        )
+        parser.add_argument(
+            "--sample-seed", dest="sample_seed", type=int, default=17,
+            help="(ingest only) RNG seed for the stratified sample.",
+        )
+        add_ingest_settings_args(parser, defaults=_DEFAULT_INGEST_SETTINGS)
+
+    async def ingest(self, ctx: RunContext, **opts: Any) -> None:
+        from .ingest import run_ingest
+
+        settings = IngestSettings.merge(_DEFAULT_INGEST_SETTINGS, opts)
+        await run_ingest(
+            ctx,
+            n_questions=opts.get("n_questions"),
+            upload_batch_size=int(opts.get("upload_batch_size") or 16),
+            skip_upload=bool(opts.get("skip_upload", False)),
+            overwrite_extract=bool(opts.get("overwrite_extract", False)),
+            settings=settings,
+            sample_seed=int(opts.get("sample_seed") or 17),
+        )
+
+    async def run(self, ctx: RunContext, **opts: Any) -> RunArtifact:
+        sample_n = opts.get("sample_n")
+        domain_filter = (opts.get("domain_filter") or "").strip().lower() or None
+        qtype_filter = (opts.get("qtype_filter") or "").strip().lower() or None
+        concurrency = int(opts.get("concurrency") or 4)
+        max_output_tokens = int(opts.get("max_output_tokens") or 512)
+        per_page_char_cap = int(opts.get("per_page_char_cap") or 12_000)
+        long_context_top_n_pages = int(
+            opts.get("long_context_top_n_pages")
+            if opts.get("long_context_top_n_pages") is not None
+            else self.default_long_context_top_n
+        )
+        skip_bare = bool(opts.get("skip_bare"))
+        skip_long_context = bool(opts.get("skip_long_context"))
+        skip_surfsense = bool(opts.get("skip_surfsense"))
+        no_mention_scope = bool(opts.get("no_mention_scope"))
+        no_judge = bool(opts.get("no_judge"))
+        judge_model = str(opts.get("judge_model") or "anthropic/claude-sonnet-4.5")
+        judge_concurrency = int(opts.get("judge_concurrency") or 4)
+
+        bench_dir = ctx.benchmark_data_dir()
+        map_path = ctx.maps_dir() / self.doc_map_filename
+        if not map_path.exists():
+            raise RuntimeError(
+                f"{self.name} not ingested for this suite. Run "
+                f"{self.ingest_hint} first."
+            )
+
+        rows, ingest_settings = _load_doc_map(map_path)
+        questions = _filter_questions(
+            rows,
+            sample_n=sample_n,
+            domain_filter=domain_filter,
+            qtype_filter=qtype_filter,
+        )
+        if not questions:
+            raise RuntimeError(
+                "No CRAG questions matched the filters; broaden --n / --domain / --qtype."
+            )
+        logger.info("CRAG: scheduled %d questions", len(questions))
+
+        api_key = os.environ.get("OPENROUTER_API_KEY")
+        if not api_key and not (skip_bare and skip_long_context):
+            raise RuntimeError(
+                "OPENROUTER_API_KEY env var is required for the bare / long-context arms."
+            )
+
+        bare_arm = long_context_arm = surf_arm = None
+        chat_provider: OpenRouterChatProvider | None = None
+        if not (skip_bare and skip_long_context):
+            chat_provider = OpenRouterChatProvider(
+                api_key=api_key or "",
+                base_url=ctx.config.openrouter_base_url,
+                model=ctx.native_arm_model,
+            )
+        if not skip_bare and chat_provider is not None:
+            bare_arm = BareLlmArm(
+                provider=chat_provider,
+                max_output_tokens=max_output_tokens,
+                name="bare_llm",
+            )
+        if not skip_long_context and chat_provider is not None:
+            long_context_arm = BareLlmArm(
+                provider=chat_provider,
+                max_output_tokens=max_output_tokens,
+                name="long_context",
+            )
+        if not skip_surfsense:
+            surf_arm = SurfSenseArm(
+                client=ctx.new_chat_client(),
+                search_space_id=ctx.search_space_id,
+                ephemeral_threads=True,
+            )
+
+        judge: CragLlmJudge | None = None
+        if not no_judge:
+            if not api_key:
+                logger.warning("CRAG: --no-judge implied (no OPENROUTER_API_KEY for judge)")
+            else:
+                judge = CragLlmJudge(config=CragJudgeConfig(
+                    api_key=api_key,
+                    model=judge_model,
+                    base_url=ctx.config.openrouter_base_url,
+                    concurrency=judge_concurrency,
+                ))
+
+        run_timestamp = utc_iso_timestamp()
+        run_dir = ctx.runs_dir(run_timestamp=run_timestamp)
+        raw_path = run_dir / "raw.jsonl"
+
+        async def _bare_one(q: CragRunnerQuestion) -> ArmResult:
+            assert bare_arm is not None
+            return await bare_arm.answer(_make_bare_request(q, max_output_tokens))
+
+        async def _long_context_one(q: CragRunnerQuestion) -> ArmResult:
+            assert long_context_arm is not None
+            return await long_context_arm.answer(
+                _make_long_context_request(
+                    q,
+                    bench_dir,
+                    max_output_tokens,
+                    per_page_char_cap,
+                    top_n_pages=long_context_top_n_pages,
+                )
+            )
+
+        async def _surf_one(q: CragRunnerQuestion) -> ArmResult:
+            assert surf_arm is not None
+            return await surf_arm.answer(
+                _make_surfsense_request(q, scope_to_pages=not no_mention_scope)
+            )
+
+        # Run all enabled arms concurrently. Each arm is itself
+        # internally concurrency-bounded.
+        tasks: list[Any] = []
+        if bare_arm is not None:
+            tasks.append(_gather_with_limit((_bare_one(q) for q in questions), concurrency=concurrency))
+        else:
+            tasks.append(_make_skipped_results(questions, "bare_llm"))
+        if long_context_arm is not None:
+            tasks.append(_gather_with_limit((_long_context_one(q) for q in questions), concurrency=concurrency))
+        else:
+            tasks.append(_make_skipped_results(questions, "long_context"))
+        if surf_arm is not None:
+            tasks.append(_gather_with_limit((_surf_one(q) for q in questions), concurrency=concurrency))
+        else:
+            tasks.append(_make_skipped_results(questions, "surfsense"))
+
+        bare_results, long_context_results, surf_results = await asyncio.gather(*tasks)
+
+        bare_grades = await _grade_results(questions, bare_results, judge=judge) if bare_arm else _empty_grades(questions)
+        lc_grades = await _grade_results(questions, long_context_results, judge=judge) if long_context_arm else _empty_grades(questions)
+        surf_grades = await _grade_results(questions, surf_results, judge=judge) if surf_arm else _empty_grades(questions)
+
+        with raw_path.open("w", encoding="utf-8") as fh:
+            for q, b_res, l_res, s_res, b_g, l_g, s_g in zip(
+                questions,
+                bare_results, long_context_results, surf_results,
+                bare_grades, lc_grades, surf_grades,
+                strict=False,
+            ):
+                meta = {
+                    "qid": q.qid,
+                    "raw_index": q.raw_index,
+                    "domain": q.domain,
+                    "question_type": q.question_type,
+                    "static_or_dynamic": q.static_or_dynamic,
+                    "popularity": q.popularity,
+                    "n_pages": len(q.page_filenames),
+                    "n_doc_ids": len(q.document_ids),
+                    "gold": q.gold_answer,
+                    "alt_answers": q.alt_answers,
+                }
+                for res, grade in (
+                    (b_res, b_g), (l_res, l_g), (s_res, s_g),
+                ):
+                    fh.write(json.dumps({
+                        **meta,
+                        **res.to_jsonl(),
+                        "graded": grade.to_dict(),
+                    }) + "\n")
+
+        metrics = _compute_metrics(
+            questions=questions,
+            bare_results=bare_results, long_context_results=long_context_results, surf_results=surf_results,
+            bare_grades=bare_grades, lc_grades=lc_grades, surf_grades=surf_grades,
+            arms_active={
+                "bare_llm": bare_arm is not None,
+                "long_context": long_context_arm is not None,
+                "surfsense": surf_arm is not None,
+            },
+        )
+        artifact = RunArtifact(
+            suite=self.suite,
+            benchmark=self.name,
+            run_timestamp=run_timestamp,
+            raw_path=raw_path,
+            metrics=metrics,
+            extra={
+                "n_questions": len(questions),
+                "concurrency": concurrency,
+                "domain_filter": domain_filter,
+                "qtype_filter": qtype_filter,
+                "no_mention_scope": no_mention_scope,
+                "no_judge": no_judge,
+                "judge_model": judge_model if not no_judge else None,
+                "scenario": ctx.scenario,
+                "provider_model": ctx.provider_model,
+                "native_arm_model": ctx.native_arm_model,
+                "vision_provider_model": ctx.vision_provider_model,
+                "agent_llm_id": ctx.agent_llm_id,
+                "ingest_settings": ingest_settings,
+                "per_page_char_cap": per_page_char_cap,
+                "long_context_top_n_pages": long_context_top_n_pages,
+                "pages_per_question_label": self.pages_per_question_label,
+                "max_output_tokens": max_output_tokens,
+                "arms_active": {
+                    "bare_llm": bare_arm is not None,
+                    "long_context": long_context_arm is not None,
+                    "surfsense": surf_arm is not None,
+                },
+            },
+        )
+
+        manifest_path = run_dir / "run_artifact.json"
+        manifest_path.write_text(
+            json.dumps({
+                "suite": self.suite,
+                "benchmark": self.name,
+                "raw_path": "raw.jsonl",
+                "metrics": metrics,
+                "extra": artifact.extra,
+            }, indent=2, sort_keys=True) + "\n",
+            encoding="utf-8",
+        )
+        return artifact
+
+    def report_section(self, artifacts: list[RunArtifact]) -> ReportSection:
+        if not artifacts:
+            return ReportSection(
+                title="CRAG — Bare LLM vs Long-Context LLM vs SurfSense",
+                headline=True,
+                body_md="(no run artifacts found)",
+                body_json={},
+            )
+        latest = max(artifacts, key=lambda a: a.run_timestamp)
+        m = latest.metrics
+        bare = m.get("bare_llm", {})
+        lc = m.get("long_context", {})
+        surf = m.get("surfsense", {})
+        deltas = m.get("deltas", {})
+        per_domain = m.get("per_domain", {})
+        per_qtype = m.get("per_question_type", {})
+        extra = latest.extra
+
+        body_lines: list[str] = []
+        body_lines.append(
+            f"- Sample size: {extra.get('n_questions', '?')} questions "
+            f"(domain filter: `{extra.get('domain_filter') or 'none'}`, "
+            f"qtype filter: `{extra.get('qtype_filter') or 'none'}`, "
+            f"judge: `{extra.get('judge_model') or 'deterministic-only'}`)."
+        )
+        body_lines.append(format_scenario_md(extra))
+        body_lines.append(format_ingest_settings_md(extra.get("ingest_settings")))
+        active = extra.get("arms_active") or {}
+        if not active.get("bare_llm", True):
+            body_lines.append("- Bare-LLM arm: SKIPPED.")
+        else:
+            body_lines.append(
+                f"- Bare-LLM arm (`{extra.get('native_arm_model') or '?'}`, no retrieval):"
+            )
+            body_lines.append(_arm_summary_lines(bare, indent="  "))
+        if not active.get("long_context", True):
+            body_lines.append("- Long-context arm: SKIPPED.")
+        else:
+            top_n = int(extra.get("long_context_top_n_pages") or 0)
+            page_phrase = (
+                f"top-{top_n} of {extra.get('pages_per_question_label') or 'pages'}"
+                if top_n > 0
+                else f"all of {extra.get('pages_per_question_label') or 'pages'}"
+            )
+            body_lines.append(
+                f"- Long-context arm (`{extra.get('native_arm_model') or '?'}`, "
+                f"{page_phrase} stuffed into prompt; per-page cap "
+                f"{extra.get('per_page_char_cap', 12_000):,} chars):"
+            )
+            body_lines.append(_arm_summary_lines(lc, indent="  "))
+        if not active.get("surfsense", True):
+            body_lines.append("- SurfSense arm: SKIPPED.")
+        else:
+            scope_phrase = (
+                "whole SearchSpace"
+                if extra.get("no_mention_scope")
+                else f"per-question {extra.get('pages_per_question_label') or 'pages'}"
+            )
+            body_lines.append(
+                f"- SurfSense arm (`{extra.get('provider_model', '?')}`, retrieval over "
+                f"{scope_phrase}):"
+            )
+            body_lines.append(_arm_summary_lines(surf, indent="  "))
+
+        body_lines.append("- Headline truthfulness scores (CRAG paper rubric):")
+        for label, key in (
+            ("Bare LLM", "bare_llm"), ("Long-Context", "long_context"), ("SurfSense", "surfsense"),
+        ):
+            d = m.get(key, {})
+            body_lines.append(
+                f"  - {label}: score={_signed_pct(d.get('truthfulness_score'))}, "
+                f"correct={_pct(d.get('correct_rate'))}, "
+                f"missing={_pct(d.get('missing_rate'))}, "
+                f"incorrect={_pct(d.get('incorrect_rate'))}"
+            )
+
+        if deltas:
+            body_lines.append("- Pairwise deltas (paired):")
+            for label, key in (
+                ("SurfSense vs Bare", "surfsense_vs_bare"),
+                ("SurfSense vs Long-Context", "surfsense_vs_long_context"),
+                ("Long-Context vs Bare", "long_context_vs_bare"),
+            ):
+                d = deltas.get(key)
+                if not d:
+                    continue
+                body_lines.append(
+                    f"  - {label}: accuracy {_pp(d.get('accuracy_pp'))} pp, "
+                    f"truthfulness {_pp(d.get('truthfulness_score_pp'))} pp "
+                    f"(McNemar p={_fmt(d.get('mcnemar_p_value'), 4)}, "
+                    f"method={d.get('mcnemar_method')}; bootstrap CI on accuracy "
+                    f"[{_pp(d.get('bootstrap_ci_low'))}pp, {_pp(d.get('bootstrap_ci_high'))}pp])"
+                )
+
+        if per_domain:
+            body_lines.append("- Per-domain truthfulness score (active arms):")
+            for domain in sorted(per_domain.keys()):
+                row = per_domain[domain]
+                pieces: list[str] = [f"  - {domain} (n={row.get('n')}):"]
+                for arm in ("bare_llm", "long_context", "surfsense"):
+                    if arm not in row:
+                        continue
+                    pieces.append(
+                        f"{arm}={_signed_pct(row[arm].get('truthfulness_score'))}"
+                    )
+                body_lines.append(" ".join(pieces))
+
+        if per_qtype:
+            body_lines.append("- Per-question-type truthfulness score (active arms):")
+            for qtype in sorted(per_qtype.keys()):
+                row = per_qtype[qtype]
+                pieces = [f"  - {qtype} (n={row.get('n')}):"]
+                for arm in ("bare_llm", "long_context", "surfsense"):
+                    if arm not in row:
+                        continue
+                    pieces.append(
+                        f"{arm}={_signed_pct(row[arm].get('truthfulness_score'))}"
+                    )
+                body_lines.append(" ".join(pieces))
+
+        return ReportSection(
+            title="CRAG — Bare LLM vs Long-Context LLM vs SurfSense",
+            headline=True,
+            body_md="\n".join(body_lines),
+            body_json=m,
+        )
+
+
+# ---------------------------------------------------------------------------
+# Per-question helpers
+# ---------------------------------------------------------------------------
+
+
+def _make_bare_request(q: CragRunnerQuestion, max_tokens: int) -> ArmRequest:
+    return ArmRequest(
+        question_id=q.qid,
+        prompt=build_bare_prompt(q.question, query_time=q.query_time),
+        options={"max_tokens": max_tokens},
+    )
+
+
+def _make_long_context_request(
+    q: CragRunnerQuestion,
+    bench_dir: Path,
+    max_tokens: int,
+    per_page_char_cap: int,
+    *,
+    top_n_pages: int = 0,
+) -> ArmRequest:
+    # The CRAG search_results list is already ranked top-K from the
+    # original web search at query_time; slicing the prefix is the
+    # honest "naive RAG: take the top-K results" baseline.
+    page_iter = q.page_filenames
+    if top_n_pages and top_n_pages > 0:
+        page_iter = page_iter[:top_n_pages]
+    contexts: list[tuple[str, str]] = []
+    for fn in page_iter:
+        text = read_page_markdown(bench_dir, fn) or ""
+        if not text.strip():
+            continue
+        # Use the filename stem as a stable title fallback (URLs are
+        # already in the markdown body's "Source:" header line).
+        contexts.append((Path(fn).stem, text))
+    prompt = build_long_context_prompt(
+        q.question,
+        contexts=contexts,
+        query_time=q.query_time,
+        per_page_char_cap=per_page_char_cap,
+    )
+    return ArmRequest(
+        question_id=q.qid,
+        prompt=prompt,
+        options={"max_tokens": max_tokens},
+    )
+
+
+def _make_surfsense_request(q: CragRunnerQuestion, *, scope_to_pages: bool) -> ArmRequest:
+    mentions: list[int] | None = None
+    if scope_to_pages and q.document_ids:
+        mentions = list(q.document_ids)
+    return ArmRequest(
+        question_id=q.qid,
+        prompt=build_surfsense_prompt(q.question, query_time=q.query_time),
+        mentioned_document_ids=mentions,
+    )
+
+
+async def _grade_results(
+    questions: list[CragRunnerQuestion],
+    results: list[ArmResult],
+    *,
+    judge: CragLlmJudge | None,
+) -> list[CragGradeResult]:
+    rows: list[CragGradeRow] = []
+    for q, r in zip(questions, results, strict=False):
+        pred = extract_freeform_answer(r.raw_text or "")
+        rows.append(CragGradeRow(
+            qid=q.qid,
+            question=q.question,
+            gold=q.gold_answer,
+            alt_answers=q.alt_answers,
+            pred=pred,
+            question_type=q.question_type,
+        ))
+    return await grade_many(rows=rows, judge=judge)
+
+
+def _empty_grades(questions: list[CragRunnerQuestion]) -> list[CragGradeResult]:
+    return [
+        CragGradeResult(grade="missing", score=0, method="skipped_arm")
+        for _ in questions
+    ]
+
+
+async def _make_skipped_results(
+    questions: list[CragRunnerQuestion], arm_name: str,
+) -> list[ArmResult]:
+    """Stand-in results so downstream code can assume parallel lists."""
+
+    return [
+        ArmResult(arm=arm_name, question_id=q.qid, raw_text="", error="skipped")
+        for q in questions
+    ]
+
+
+# ---------------------------------------------------------------------------
+# Metrics aggregation
+# ---------------------------------------------------------------------------
+
+
+def _arm_truthfulness(grades: list[CragGradeResult]) -> dict[str, Any]:
+    """Per-arm headline numbers — accuracy + 3-class rates + truthfulness."""
+
+    n = len(grades) or 1
+    n_correct = sum(g.correct for g in grades)
+    n_missing = sum(g.missing for g in grades)
+    n_incorrect = sum(g.incorrect for g in grades)
+    return {
+        "n_total": len(grades),
+        "n_correct": n_correct,
+        "n_missing": n_missing,
+        "n_incorrect": n_incorrect,
+        "correct_rate": n_correct / n,
+        "missing_rate": n_missing / n,
+        "incorrect_rate": n_incorrect / n,
+        "truthfulness_score": (n_correct - n_incorrect) / n,
+    }
+
+
+def _compute_metrics(
+    *,
+    questions: list[CragRunnerQuestion],
+    bare_results: list[ArmResult],
+    long_context_results: list[ArmResult],
+    surf_results: list[ArmResult],
+    bare_grades: list[CragGradeResult],
+    lc_grades: list[CragGradeResult],
+    surf_grades: list[CragGradeResult],
+    arms_active: dict[str, bool],
+) -> dict[str, Any]:
+    bare_correct = [g.correct for g in bare_grades]
+    lc_correct = [g.correct for g in lc_grades]
+    surf_correct = [g.correct for g in surf_grades]
+
+    bare_acc = accuracy_with_wilson_ci(sum(bare_correct), len(bare_correct))
+    lc_acc = accuracy_with_wilson_ci(sum(lc_correct), len(lc_correct))
+    surf_acc = accuracy_with_wilson_ci(sum(surf_correct), len(surf_correct))
+
+    bare_t = _arm_truthfulness(bare_grades)
+    lc_t = _arm_truthfulness(lc_grades)
+    surf_t = _arm_truthfulness(surf_grades)
+
+    def _arm_block(
+        results: list[ArmResult],
+        acc: Any,
+        truthfulness: dict[str, Any],
+    ) -> dict[str, Any]:
+        costs = [float(r.cost_micros) for r in results]
+        latencies = [float(r.latency_ms) for r in results]
+        ins = [float(r.input_tokens) for r in results]
+        outs = [float(r.output_tokens) for r in results]
+        cost_agg = paired_aggregate(costs)
+        lat_agg = paired_aggregate(latencies)
+        return {
+            **acc.to_dict(),
+            **truthfulness,
+            "cost_micros_mean": cost_agg.mean,
+            "cost_micros_median": cost_agg.median,
+            "latency_ms_mean": lat_agg.mean,
+            "latency_ms_median": lat_agg.median,
+            "latency_ms_p95": lat_agg.p95,
+            "input_tokens_mean": (sum(ins) / len(ins)) if ins else 0.0,
+            "output_tokens_mean": (sum(outs) / len(outs)) if outs else 0.0,
+        }
+
+    out: dict[str, Any] = {
+        "bare_llm": _arm_block(bare_results, bare_acc, bare_t),
+        "long_context": _arm_block(long_context_results, lc_acc, lc_t),
+        "surfsense": _arm_block(surf_results, surf_acc, surf_t),
+    }
+
+    deltas: dict[str, Any] = {}
+    for label, ref_correct, ref_t, chal_correct, chal_t, both_active in (
+        ("surfsense_vs_bare", bare_correct, bare_t, surf_correct, surf_t,
+         arms_active.get("bare_llm") and arms_active.get("surfsense")),
+        ("surfsense_vs_long_context", lc_correct, lc_t, surf_correct, surf_t,
+         arms_active.get("long_context") and arms_active.get("surfsense")),
+        ("long_context_vs_bare", bare_correct, bare_t, lc_correct, lc_t,
+         arms_active.get("bare_llm") and arms_active.get("long_context")),
+    ):
+        if not both_active:
+            continue
+        mc = mcnemar_test(ref_correct, chal_correct)
+        boot = bootstrap_delta_ci(ref_correct, chal_correct, n_resamples=2000)
+        deltas[label] = {
+            "accuracy_pp": 100.0 * (sum(chal_correct) - sum(ref_correct)) / max(1, len(chal_correct)),
+            "truthfulness_score_pp": 100.0 * (chal_t["truthfulness_score"] - ref_t["truthfulness_score"]),
+            "mcnemar_p_value": mc.p_value,
+            "mcnemar_method": mc.method,
+            "mcnemar_b_ref_only": mc.b,
+            "mcnemar_c_challenger_only": mc.c,
+            "bootstrap_ci_low": 100.0 * boot.ci_low,
+            "bootstrap_ci_high": 100.0 * boot.ci_high,
+        }
+    out["deltas"] = deltas
+
+    out["per_domain"] = _per_facet_truthfulness(
+        questions, bare_grades, lc_grades, surf_grades,
+        arms_active=arms_active,
+        key_fn=lambda q: q.domain or "(unspecified)",
+    )
+    out["per_question_type"] = _per_facet_truthfulness(
+        questions, bare_grades, lc_grades, surf_grades,
+        arms_active=arms_active,
+        key_fn=lambda q: q.question_type or "(unspecified)",
+    )
+
+    out["grader_methods"] = {
+        "bare_llm": _count_methods(bare_grades) if arms_active.get("bare_llm") else {},
+        "long_context": _count_methods(lc_grades) if arms_active.get("long_context") else {},
+        "surfsense": _count_methods(surf_grades) if arms_active.get("surfsense") else {},
+    }
+    return out
+
+
+def _per_facet_truthfulness(
+    questions: list[CragRunnerQuestion],
+    bare_grades: list[CragGradeResult],
+    lc_grades: list[CragGradeResult],
+    surf_grades: list[CragGradeResult],
+    *,
+    arms_active: dict[str, bool],
+    key_fn: Any,
+) -> dict[str, Any]:
+    """Bucket truthfulness scores by ``key_fn(q)``."""
+
+    buckets: dict[str, dict[str, list[CragGradeResult]]] = {}
+    for q, b, l, s in zip(questions, bare_grades, lc_grades, surf_grades, strict=False):
+        key = key_fn(q)
+        bucket = buckets.setdefault(key, {"bare_llm": [], "long_context": [], "surfsense": []})
+        bucket["bare_llm"].append(b)
+        bucket["long_context"].append(l)
+        bucket["surfsense"].append(s)
+    out: dict[str, Any] = {}
+    for key, arms in buckets.items():
+        row: dict[str, Any] = {"n": len(arms["bare_llm"])}
+        for arm_name, grades in arms.items():
+            if not arms_active.get(arm_name):
+                continue
+            row[arm_name] = _arm_truthfulness(grades)
+        out[key] = row
+    return out
+
+
+def _count_methods(grades: list[CragGradeResult]) -> dict[str, int]:
+    out: dict[str, int] = {}
+    for g in grades:
+        out[g.method] = out.get(g.method, 0) + 1
+    return out
+
+
+# ---------------------------------------------------------------------------
+# Tiny formatting helpers
+# ---------------------------------------------------------------------------
+
+
+def _arm_summary_lines(d: dict[str, Any], *, indent: str) -> str:
+    if not d:
+        return f"{indent}(no data)"
+    acc = d.get("accuracy", 0.0)
+    low = d.get("ci_low", 0.0)
+    high = d.get("ci_high", 0.0)
+    lines = [
+        f"{indent}- Accuracy: {acc * 100:.1f}% (Wilson 95% CI: {low * 100:.1f}% – {high * 100:.1f}%)",
+        f"{indent}- 3-class: correct={d.get('correct_rate', 0)*100:.1f}%, "
+        f"missing={d.get('missing_rate', 0)*100:.1f}%, "
+        f"incorrect={d.get('incorrect_rate', 0)*100:.1f}%",
+        f"{indent}- Truthfulness score (correct - incorrect)/total: "
+        f"{d.get('truthfulness_score', 0)*100:+.1f}%",
+        f"{indent}- Cost / question: ${_dollars(d.get('cost_micros_mean'))} (mean), "
+        f"${_dollars(d.get('cost_micros_median'))} (median)",
+        f"{indent}- Latency: p50 {_ms_to_s(d.get('latency_ms_median'))}, "
+        f"p95 {_ms_to_s(d.get('latency_ms_p95'))}",
+    ]
+    if d.get("input_tokens_mean") or d.get("output_tokens_mean"):
+        lines.append(
+            f"{indent}- Mean tokens / question: in {d.get('input_tokens_mean', 0):.0f}, "
+            f"out {d.get('output_tokens_mean', 0):.0f}"
+        )
+    return "\n".join(lines)
+
+
+def _dollars(micros: Any) -> str:
+    if micros is None:
+        return "?"
+    try:
+        return f"{(float(micros) / 1_000_000):.4f}"
+    except (TypeError, ValueError):
+        return "?"
+
+
+def _ms_to_s(ms: Any) -> str:
+    if ms is None:
+        return "?"
+    try:
+        return f"{float(ms) / 1000:.1f}s"
+    except (TypeError, ValueError):
+        return "?"
+
+
+def _pp(value: Any) -> str:
+    if value is None:
+        return "?"
+    try:
+        return f"{float(value):+.1f}"
+    except (TypeError, ValueError):
+        return "?"
+
+
+def _pct(value: Any) -> str:
+    if value is None:
+        return "?"
+    try:
+        return f"{float(value)*100:.1f}%"
+    except (TypeError, ValueError):
+        return "?"
+
+
+def _signed_pct(value: Any) -> str:
+    if value is None:
+        return "?"
+    try:
+        return f"{float(value)*100:+.1f}%"
+    except (TypeError, ValueError):
+        return "?"
+
+
+def _fmt(value: Any, ndigits: int) -> str:
+    if value is None:
+        return "?"
+    try:
+        return f"{float(value):.{ndigits}f}"
+    except (TypeError, ValueError):
+        return "?"
+
+
+_TASK3_DESCRIPTION = (
+    "CRAG Task 3 (Meta KDD Cup 2024) — same 3 arms but the corpus per "
+    "question now has **50 candidate web pages** (vs 5 in Tasks 1 & 2). "
+    "The long-context arm uses only the top-5 (the realistic naive-RAG "
+    "baseline); SurfSense retrieves over all 50, where its rerank "
+    "becomes the actual contribution."
+)
+
+
+class CragTask3Benchmark(CragBenchmark):
+    """3-arm CRAG runner over Task 3 (50 pages per question).
+
+    Reuses the entire Task 1/2 runtime (grader, prompt, metrics,
+    reporting) — the only deltas are: the doc map filename, the
+    long-context arm's default page cap (5 instead of all 50), and
+    the ingest entrypoint (4-part archive instead of single bz2).
+    """
+
+    name: str = "crag_t3"
+    description: str = _TASK3_DESCRIPTION
+    doc_map_filename: str = "crag_t3_doc_map.jsonl"
+    default_long_context_top_n: int = 5
+    pages_per_question_label: str = "50 pages"
+    ingest_hint: str = (
+        "`python -m surfsense_evals ingest research crag_t3 --n-questions 50` "
+        "(after `python scripts/download_crag_task3.py`)"
+    )
+
+    async def ingest(self, ctx: RunContext, **opts: Any) -> None:
+        # Local import: keep dataset_task3's lazy-streaming module out
+        # of the import graph until someone actually wants Task 3.
+        from .ingest_task3 import run_ingest_task3
+
+        settings = IngestSettings.merge(_DEFAULT_INGEST_SETTINGS, opts)
+        await run_ingest_task3(
+            ctx,
+            n_questions=opts.get("n_questions"),
+            upload_batch_size=int(opts.get("upload_batch_size") or 16),
+            skip_upload=bool(opts.get("skip_upload", False)),
+            overwrite_extract=bool(opts.get("overwrite_extract", False)),
+            settings=settings,
+            sample_seed=int(opts.get("sample_seed") or 17),
+            parse_cap=opts.get("parse_cap"),
+        )
+
+    def add_run_args(self, parser: argparse.ArgumentParser) -> None:
+        super().add_run_args(parser)
+        parser.add_argument(
+            "--parse-cap", dest="parse_cap", type=int, default=None,
+            help=(
+                "(ingest only) Hard cap on rows parsed from the streaming "
+                "Task 3 archive before stratified sampling. Default: "
+                "max(2000, 10 * n_questions). Lower = less decompression."
+            ),
+        )
+
+
+__all__ = ["CragBenchmark", "CragRunnerQuestion", "CragTask3Benchmark"]
diff --git a/surfsense_evals/src/surfsense_evals/suites/research/frames/__init__.py b/surfsense_evals/src/surfsense_evals/suites/research/frames/__init__.py
new file mode 100644
index 000000000..4e556cd84
--- /dev/null
+++ b/surfsense_evals/src/surfsense_evals/suites/research/frames/__init__.py
@@ -0,0 +1,29 @@
+"""FRAMES — multi-hop Wikipedia retrieval & reasoning (google/frames-benchmark).
+
+Source: https://huggingface.co/datasets/google/frames-benchmark
+Paper:  https://arxiv.org/abs/2409.12941 (Krishna et al., 2024)
+
+* 824 multi-hop questions, each requiring 2-15 Wikipedia articles
+* 5 reasoning types: numerical, tabular, multiple constraints,
+  temporal, post-processing
+* Published Gemini-Pro-1.5 baselines:
+  - Naive prompting (no retrieval):    40.8%
+  - BM25, top-4:                       47.4%
+  - Multi-step retrieval & reasoning:  66.0%
+  - Oracle retrieval (gold articles):  72.9%
+
+This is the benchmark that *finally* puts SurfSense's strongest claim
+on trial: cross-document iterative retrieval. The harness ingests
+every Wikipedia article referenced by any question in the run sample
+into a single SearchSpace; SurfSense answers without
+``mentioned_document_ids`` so its agent has to actually retrieve.
+The bare-LLM arm answers from the prompt only (the published 40.8%
+baseline number).
+"""
+
+from __future__ import annotations
+
+from ....core import registry as _registry
+from .runner import FramesBenchmark
+
+_registry.register(FramesBenchmark())
diff --git a/surfsense_evals/src/surfsense_evals/suites/research/frames/dataset.py b/surfsense_evals/src/surfsense_evals/suites/research/frames/dataset.py
new file mode 100644
index 000000000..c3b6b878e
--- /dev/null
+++ b/surfsense_evals/src/surfsense_evals/suites/research/frames/dataset.py
@@ -0,0 +1,174 @@
+"""FRAMES dataset loader — download ``test.tsv`` from HF and parse rows.
+
+The HF repo (``google/frames-benchmark``) ships a single tab-separated
+file at ``test.tsv`` (824 rows). Columns of interest for us:
+
+* unnamed first column → row index (``id`` we synthesise as ``Q000``..)
+* ``Prompt``  → the question (free-text, often multi-clause)
+* ``Answer``  → gold answer (short string: name, number, year, ...)
+* ``wikipedia_link_1`` ... ``wikipedia_link_11+`` → sparse per-question
+  link cells (we ignore in favour of the consolidated column below).
+* ``reasoning_types`` → pipe-separated tags (``"Numerical reasoning |
+  Tabular reasoning | Multiple constraints"``)
+* ``wiki_links`` → Python-list literal of every URL the question relies
+  on, e.g. ``"['https://en.wikipedia.org/wiki/...', '...']"``
+
+We use ``wiki_links`` (already deduplicated per row) and
+``ast.literal_eval`` to materialise it. The legacy
+``wikipedia_link_*`` columns are kept around only so a curious
+operator can compare cell-vs-list if upstream ever drift apart.
+"""
+
+from __future__ import annotations
+
+import ast
+import json
+import logging
+from dataclasses import dataclass
+from pathlib import Path
+from typing import Any
+
+logger = logging.getLogger(__name__)
+
+
+HF_REPO_ID = "google/frames-benchmark"
+HF_REPO_TYPE = "dataset"
+HF_TEST_FILE = "test.tsv"
+
+
+def _hf_hub_download(*args: Any, **kwargs: Any) -> str:
+    from huggingface_hub import hf_hub_download
+
+    return hf_hub_download(*args, **kwargs)
+
+
+# ---------------------------------------------------------------------------
+# Question dataclass
+# ---------------------------------------------------------------------------
+
+
+@dataclass
+class FramesQuestion:
+    """One row of FRAMES (post-parse)."""
+
+    qid: str                   # synthesised "Q000" .. "Q823"
+    question: str
+    gold_answer: str
+    wiki_urls: list[str]       # deduped, in original order
+    reasoning_types: list[str] # split on "|"
+    raw_index: int             # row index from the TSV (for debugging)
+
+    def to_dict(self) -> dict[str, Any]:
+        return {
+            "qid": self.qid,
+            "question": self.question,
+            "gold_answer": self.gold_answer,
+            "wiki_urls": list(self.wiki_urls),
+            "reasoning_types": list(self.reasoning_types),
+            "raw_index": self.raw_index,
+        }
+
+
+# ---------------------------------------------------------------------------
+# Download + parse
+# ---------------------------------------------------------------------------
+
+
+def download_test_tsv(cache_dir: Path) -> Path:
+    """Resumable download of ``test.tsv`` via ``huggingface_hub``."""
+
+    cache_dir.mkdir(parents=True, exist_ok=True)
+    local = _hf_hub_download(
+        repo_id=HF_REPO_ID,
+        filename=HF_TEST_FILE,
+        repo_type=HF_REPO_TYPE,
+        cache_dir=str(cache_dir),
+    )
+    return Path(local)
+
+
+def _parse_wiki_links(raw: Any) -> list[str]:
+    """Convert the ``wiki_links`` cell (Python list literal) to ``list[str]``."""
+
+    if not raw:
+        return []
+    if isinstance(raw, list):
+        return [str(x).strip() for x in raw if str(x).strip()]
+    text = str(raw).strip()
+    if not text:
+        return []
+    try:
+        parsed = ast.literal_eval(text)
+    except (SyntaxError, ValueError):
+        # Fall back: maybe it's a comma-separated string with no quotes.
+        return [tok.strip() for tok in text.strip("[]").split(",") if tok.strip()]
+    if isinstance(parsed, (list, tuple)):
+        return [str(x).strip() for x in parsed if str(x).strip()]
+    return [str(parsed).strip()]
+
+
+def _parse_reasoning_types(raw: Any) -> list[str]:
+    if not raw:
+        return []
+    text = str(raw).strip()
+    if not text:
+        return []
+    return [tok.strip() for tok in text.split("|") if tok.strip()]
+
+
+def load_questions(tsv_path: Path) -> list[FramesQuestion]:
+    """Read FRAMES rows from disk into ``FramesQuestion`` objects.
+
+    Uses pandas for robust TSV parsing (tabs inside quoted strings are
+    rare in this dataset but pandas handles them; the stdlib ``csv``
+    module is fine too if pandas ever becomes a problem). We pin
+    ``index_col=0`` because the upstream TSV uses the first unnamed
+    column as the row index.
+    """
+
+    import pandas as pd
+
+    df = pd.read_csv(tsv_path, sep="\t", index_col=0, keep_default_na=False)
+    out: list[FramesQuestion] = []
+    for raw_idx, row in df.iterrows():
+        prompt = str(row.get("Prompt") or "").strip()
+        answer = str(row.get("Answer") or "").strip()
+        if not prompt or not answer:
+            logger.debug("Skipping FRAMES row %s with missing Prompt/Answer", raw_idx)
+            continue
+        urls = _parse_wiki_links(row.get("wiki_links"))
+        if not urls:
+            # Fall back to the per-cell ``wikipedia_link_*`` columns.
+            urls = []
+            for col in row.index:
+                if col.startswith("wikipedia_link"):
+                    val = str(row.get(col) or "").strip()
+                    if val and val not in urls:
+                        urls.append(val)
+        reasoning = _parse_reasoning_types(row.get("reasoning_types"))
+        out.append(FramesQuestion(
+            qid=f"Q{int(raw_idx):03d}",
+            question=prompt,
+            gold_answer=answer,
+            wiki_urls=urls,
+            reasoning_types=reasoning,
+            raw_index=int(raw_idx),
+        ))
+    return out
+
+
+def write_questions_jsonl(questions: list[FramesQuestion], dest: Path) -> None:
+    """Persist a parsed copy under the benchmark data dir."""
+
+    dest.parent.mkdir(parents=True, exist_ok=True)
+    with dest.open("w", encoding="utf-8") as fh:
+        for q in questions:
+            fh.write(json.dumps(q.to_dict()) + "\n")
+
+
+__all__ = [
+    "FramesQuestion",
+    "download_test_tsv",
+    "load_questions",
+    "write_questions_jsonl",
+]
diff --git a/surfsense_evals/src/surfsense_evals/suites/research/frames/grader.py b/surfsense_evals/src/surfsense_evals/suites/research/frames/grader.py
new file mode 100644
index 000000000..d280e3eaf
--- /dev/null
+++ b/surfsense_evals/src/surfsense_evals/suites/research/frames/grader.py
@@ -0,0 +1,341 @@
+"""FRAMES grader: deterministic shortcut + LLM-as-judge fallback.
+
+FRAMES gold answers are short factoids (a name, a year, an ordinal,
+a count). The published paper uses an LLM judge for grading, citing
+the long tail of paraphrasing ("Jane Ballou" vs "Mrs. Ballou (Jane)";
+"5" vs "five"; "London, England" vs "London"). We replicate that
+faithfully *but* avoid burning judge tokens on the obvious cases.
+
+Pipeline per (pred, gold):
+
+1. Normalise both sides (SQuAD-style).
+2. If normalised pred == normalised gold → CORRECT (``method=exact``).
+3. Numeric path: if both extract to a single number and the values
+   match within 1% relative tolerance → CORRECT (``method=numeric``).
+4. Substring path: if normalised gold appears as a *whole-word phrase*
+   inside normalised pred (or vice versa) → CORRECT
+   (``method=substring``).
+5. Otherwise → call the LLM judge if a judge is wired; the judge
+   returns yes/no with a one-line rationale.
+6. If no judge is configured, fall through to ``False``
+   (``method=lexical_miss``).
+
+The judge is called *concurrently* across the run via a semaphore (so
+it doesn't outrun the upstream rate limit). Cached on
+``(arm, qid)`` so re-running ``report`` doesn't re-judge.
+
+Returned shape mirrors ``mmlongbench.grader.GradeResult`` to keep
+report writers uniform across benchmarks.
+"""
+
+from __future__ import annotations
+
+import asyncio
+import json
+import logging
+import re
+import string
+from collections.abc import Sequence
+from dataclasses import dataclass
+from typing import Any
+
+from ....core.providers.openrouter_chat import OpenRouterChatProvider
+
+logger = logging.getLogger(__name__)
+
+
+# ---------------------------------------------------------------------------
+# Public types
+# ---------------------------------------------------------------------------
+
+
+@dataclass
+class GradeResult:
+    """Shape mirrors mmlongbench.grader.GradeResult for report uniformity."""
+
+    correct: bool
+    f1: float
+    method: str
+    normalised_pred: str = ""
+    normalised_gold: str = ""
+    judge_rationale: str = ""
+
+    def to_dict(self) -> dict[str, Any]:
+        return {
+            "correct": self.correct,
+            "f1": self.f1,
+            "method": self.method,
+            "normalised_pred": self.normalised_pred,
+            "normalised_gold": self.normalised_gold,
+            "judge_rationale": self.judge_rationale,
+        }
+
+
+# ---------------------------------------------------------------------------
+# Normalisation
+# ---------------------------------------------------------------------------
+
+
+_PUNCT_TABLE = str.maketrans({c: " " for c in string.punctuation})
+_ARTICLES = re.compile(r"\b(a|an|the)\b", re.IGNORECASE)
+_WS = re.compile(r"\s+")
+
+
+def _normalise(s: str) -> str:
+    s = (s or "").lower()
+    s = s.translate(_PUNCT_TABLE)
+    s = _ARTICLES.sub(" ", s)
+    s = _WS.sub(" ", s).strip()
+    return s
+
+
+_WORD_NUMBERS = {
+    "zero": 0, "one": 1, "two": 2, "three": 3, "four": 4, "five": 5,
+    "six": 6, "seven": 7, "eight": 8, "nine": 9, "ten": 10, "eleven": 11,
+    "twelve": 12, "thirteen": 13, "fourteen": 14, "fifteen": 15, "sixteen": 16,
+    "seventeen": 17, "eighteen": 18, "nineteen": 19, "twenty": 20,
+}
+
+_NUMERIC_RE = re.compile(r"-?\d+(?:[.,]\d+)?")
+
+
+def _maybe_number(s: str) -> float | None:
+    """Extract a single numeric value, recognising digit and word forms.
+
+    Operates on the lowercased *raw* text (rather than the
+    punctuation-stripped normalisation) so that thousands separators
+    like ``1,234`` are preserved through the regex and parsed
+    correctly. We only fall back to ``_normalise`` for the word-number
+    pass, which doesn't care about punctuation.
+    """
+
+    raw = (s or "").strip().lower()
+    if not raw:
+        return None
+    match = _NUMERIC_RE.search(raw)
+    if match:
+        try:
+            return float(match.group(0).replace(",", ""))
+        except ValueError:
+            pass
+    for tok in _normalise(s).split():
+        if tok in _WORD_NUMBERS:
+            return float(_WORD_NUMBERS[tok])
+    return None
+
+
+def _whole_word_substring(haystack: str, needle: str) -> bool:
+    """Is ``needle`` present as a whole-word phrase in ``haystack``?"""
+
+    if not needle:
+        return False
+    pad_h = f" {haystack} "
+    pad_n = f" {needle} "
+    return pad_n in pad_h
+
+
+# ---------------------------------------------------------------------------
+# Deterministic shortcut
+# ---------------------------------------------------------------------------
+
+
+def grade_deterministic(*, pred: str, gold: str) -> GradeResult:
+    """Try to grade without the LLM judge. Returns a final-result object.
+
+    A ``False`` result with ``method == "lexical_miss"`` is the signal
+    to the caller that the LLM judge should be consulted (if available).
+    """
+
+    if not (pred or "").strip():
+        return GradeResult(False, 0.0, "empty_pred", "", _normalise(gold))
+
+    p = _normalise(pred)
+    g = _normalise(gold)
+    if not g:
+        # Defensively: gold should never be empty; if it is, we can't grade.
+        return GradeResult(False, 0.0, "empty_gold", p, g)
+
+    if p == g:
+        return GradeResult(True, 1.0, "exact", p, g)
+
+    p_num = _maybe_number(pred)
+    g_num = _maybe_number(gold)
+    if p_num is not None and g_num is not None:
+        # 1% relative tolerance, 0.5 absolute floor (handles year-ish answers).
+        tol = max(abs(g_num) * 0.01, 0.5)
+        if abs(p_num - g_num) <= tol:
+            return GradeResult(True, 1.0, "numeric", p, g)
+        return GradeResult(False, 0.0, "numeric_miss", p, g)
+
+    if _whole_word_substring(p, g):
+        return GradeResult(True, 1.0, "substring", p, g)
+    if _whole_word_substring(g, p) and len(p) >= 3:
+        # Be conservative the other direction — only credit if pred is
+        # at least 3 normalised chars (avoids "John" matching gold
+        # "John F. Kennedy" as correct).
+        return GradeResult(True, 1.0, "substring_reverse", p, g)
+
+    return GradeResult(False, 0.0, "lexical_miss", p, g)
+
+
+# ---------------------------------------------------------------------------
+# LLM-as-judge
+# ---------------------------------------------------------------------------
+
+
+_JUDGE_SYSTEM = (
+    "You are an impartial grader for short-answer factual questions. "
+    "Given a question, the gold answer, and a model's prediction, "
+    "decide whether the prediction is correct. The prediction is "
+    "correct if it expresses the same factual content as the gold "
+    "answer, allowing for paraphrasing, surface-level differences "
+    "(numbers as words, names with/without titles), and additional "
+    "non-contradictory detail. The prediction is incorrect if it "
+    "expresses a different fact, omits the central answer, or hedges "
+    "without committing.\n\n"
+    "Respond with ONLY a JSON object on a single line:\n"
+    '{\"correct\": true|false, \"rationale\": \"<one short sentence>\"}'
+)
+
+
+_JUDGE_TEMPLATE = """\
+Question: {question}
+Gold answer: {gold}
+Model prediction: {pred}
+
+Decide whether the prediction is correct.
+"""
+
+
+@dataclass
+class JudgeConfig:
+    """Configuration handed to ``LlmJudge`` at construction time."""
+
+    api_key: str
+    model: str = "anthropic/claude-sonnet-4.5"
+    base_url: str = "https://openrouter.ai/api/v1"
+    max_tokens: int = 200
+    concurrency: int = 4
+
+
+class LlmJudge:
+    """Async LLM judge over OpenRouter chat completions."""
+
+    def __init__(self, *, config: JudgeConfig) -> None:
+        self._config = config
+        self._provider = OpenRouterChatProvider(
+            api_key=config.api_key,
+            base_url=config.base_url,
+            model=config.model,
+        )
+        self._sem = asyncio.Semaphore(max(1, config.concurrency))
+
+    @property
+    def model(self) -> str:
+        return self._config.model
+
+    async def judge(
+        self,
+        *,
+        question: str,
+        gold: str,
+        pred: str,
+    ) -> tuple[bool, str]:
+        """Return ``(is_correct, rationale)``. Errors return False + reason."""
+
+        prompt = _JUDGE_TEMPLATE.format(question=question, gold=gold, pred=pred)
+        try:
+            async with self._sem:
+                response = await self._provider.complete(
+                    prompt=prompt,
+                    system_prompt=_JUDGE_SYSTEM,
+                    max_tokens=self._config.max_tokens,
+                )
+        except Exception as exc:  # noqa: BLE001
+            return False, f"judge_error: {type(exc).__name__}: {exc}"
+        return _parse_judge_response(response.text)
+
+
+def _parse_judge_response(text: str) -> tuple[bool, str]:
+    """Pull ``correct`` + ``rationale`` out of the judge's reply."""
+
+    if not text or not text.strip():
+        return False, "judge_returned_empty"
+    # Accept JSON anywhere in the message; some models prepend prose.
+    match = re.search(r"\{[^{}]*\}", text, flags=re.DOTALL)
+    candidate = match.group(0) if match else text
+    try:
+        data = json.loads(candidate)
+    except (json.JSONDecodeError, ValueError):
+        # Fallback: yes/no parsing.
+        lowered = text.strip().lower()
+        if lowered.startswith("yes") or "correct: yes" in lowered or '"correct": true' in lowered:
+            return True, "yes (parser_fallback)"
+        if lowered.startswith("no") or "correct: no" in lowered or '"correct": false' in lowered:
+            return False, "no (parser_fallback)"
+        return False, f"unparseable_judge_response: {text[:200]}"
+    correct = bool(data.get("correct"))
+    rationale = str(data.get("rationale", "")).strip()[:280]
+    return correct, rationale
+
+
+# ---------------------------------------------------------------------------
+# Combined grader
+# ---------------------------------------------------------------------------
+
+
+async def grade_with_judge(
+    *,
+    pred: str,
+    gold: str,
+    question: str,
+    judge: LlmJudge | None,
+) -> GradeResult:
+    """Grade one row: deterministic shortcut → optional LLM judge fallback."""
+
+    det = grade_deterministic(pred=pred, gold=gold)
+    if det.correct or det.method != "lexical_miss":
+        return det
+    if judge is None:
+        return det
+    is_correct, rationale = await judge.judge(question=question, gold=gold, pred=pred)
+    return GradeResult(
+        correct=is_correct,
+        f1=1.0 if is_correct else 0.0,
+        method="llm_judge",
+        normalised_pred=det.normalised_pred,
+        normalised_gold=det.normalised_gold,
+        judge_rationale=rationale,
+    )
+
+
+async def grade_many(
+    *,
+    rows: Sequence[tuple[str, str, str, str]],
+    judge: LlmJudge | None,
+) -> list[GradeResult]:
+    """Grade ``[(qid, question, gold, pred), ...]`` concurrently.
+
+    The judge already enforces its own concurrency cap; this just
+    schedules everything via ``asyncio.gather``. ``qid`` is unused
+    inside the grader but threaded through so callers can correlate
+    results back to their rows.
+    """
+
+    if not rows:
+        return []
+    coros = [
+        grade_with_judge(pred=p, gold=g, question=q, judge=judge)
+        for _qid, q, g, p in rows
+    ]
+    return list(await asyncio.gather(*coros))
+
+
+__all__ = [
+    "GradeResult",
+    "JudgeConfig",
+    "LlmJudge",
+    "grade_deterministic",
+    "grade_many",
+    "grade_with_judge",
+]
diff --git a/surfsense_evals/src/surfsense_evals/suites/research/frames/ingest.py b/surfsense_evals/src/surfsense_evals/suites/research/frames/ingest.py
new file mode 100644
index 000000000..9780be4ed
--- /dev/null
+++ b/surfsense_evals/src/surfsense_evals/suites/research/frames/ingest.py
@@ -0,0 +1,341 @@
+"""FRAMES ingestion: download → fetch Wikipedia → upload markdown.
+
+Steps:
+
+1. Download ``test.tsv`` from ``hf://datasets/google/frames-benchmark``.
+2. Parse rows into ``FramesQuestion`` objects.
+3. Optionally cap to the first ``--max-questions N`` so a smoke run
+   doesn't trigger a 1k-article fetch.
+4. Build the **deduplicated** set of Wikipedia URLs across the chosen
+   sample (questions share many articles — Q1 and Q42 might both
+   reference ``James_A._Garfield``).
+5. Fetch each unique article via ``WikiFetcher`` (polite 2 RPS) into
+   ``<bench_dir>/wiki/<title>.md``.
+6. Upload the resulting markdown files to SurfSense in batches with
+   ``use_vision_llm=False, processing_mode="basic"`` (text-only — no
+   reason to pay vision LLM costs on Wikipedia plaintext).
+7. Persist a doc map at
+   ``<suite_data>/maps/frames_doc_map.jsonl`` with one row per question
+   listing its ``document_ids`` (so the runner *could* scope retrieval
+   if requested, though by default we don't — see ``runner.py``).
+
+The doc map row shape:
+
+    {"qid": "Q000",
+     "wiki_titles": ["President of the United States", "James Buchanan", ...],
+     "document_ids": [123, 124, ...],
+     "missing_titles": []}
+
+We resolve titles → SurfSense document_ids via the post-upload
+``DocumentStatus.title`` field. SurfSense's title is the uploaded
+filename (without extension), so we round-trip via
+``cache_filename_for_title`` to match.
+"""
+
+from __future__ import annotations
+
+import asyncio
+import json
+import logging
+from dataclasses import dataclass
+from pathlib import Path
+from typing import Any
+
+from ....core.clients.documents import (
+    DocumentProcessingFailed,
+    DocumentProcessingTimeout,
+)
+from ....core.config import set_suite_state
+from ....core.ingest_settings import IngestSettings, settings_header_line
+from ....core.registry import RunContext
+from .dataset import (
+    download_test_tsv,
+    load_questions,
+    write_questions_jsonl,
+)
+from .wiki_fetch import (
+    WikiArticle,
+    WikiFetcher,
+    cache_filename_for_title,
+    title_from_url,
+)
+
+logger = logging.getLogger(__name__)
+
+
+# ---------------------------------------------------------------------------
+# Helpers
+# ---------------------------------------------------------------------------
+
+
+@dataclass
+class _IngestStats:
+    n_questions: int
+    n_unique_urls: int
+    n_fetched: int
+    n_cached_hits: int
+    n_missing: int
+    n_uploaded: int
+    n_existing: int
+    bench_dir: Path
+    map_path: Path
+
+
+async def _fetch_articles(
+    fetcher: WikiFetcher,
+    urls: list[str],
+) -> tuple[dict[str, WikiArticle], list[str]]:
+    """Fetch each URL serially (the WikiFetcher's rate-limiter serialises anyway).
+
+    Returns ``(url -> WikiArticle, missing_urls)``. Missing means
+    Wikipedia reported the title doesn't exist, the URL was non-wiki,
+    or the API returned an empty extract.
+    """
+
+    fetched: dict[str, WikiArticle] = {}
+    missing: list[str] = []
+    n_total = len(urls)
+    for i, url in enumerate(urls, start=1):
+        try:
+            article = await fetcher.fetch(url)
+        except Exception as exc:  # noqa: BLE001
+            logger.warning("FRAMES wiki fetch %s failed: %s", url, exc)
+            missing.append(url)
+            continue
+        if article is None:
+            missing.append(url)
+            continue
+        fetched[url] = article
+        if i % 25 == 0 or i == n_total:
+            logger.info("  ... fetched %d / %d Wikipedia articles", i, n_total)
+    return fetched, missing
+
+
+async def _upload_markdowns(
+    ctx: RunContext,
+    articles: list[WikiArticle],
+    *,
+    batch_size: int,
+    settings: IngestSettings,
+) -> dict[str, int]:
+    """Upload deduplicated markdown files. Returns ``filename -> document_id``.
+
+    SurfSense dedupes uploads on ``(filename, search_space_id)``, so
+    re-running ingest after a crash is idempotent — duplicates land in
+    ``duplicate_document_ids`` and we still recover their ids via the
+    status endpoint.
+    """
+
+    if not articles:
+        return {}
+    docs_client = ctx.documents_client()
+    name_to_id: dict[str, int] = {}
+    paths = [a.markdown_path for a in articles]
+    for batch_start in range(0, len(paths), batch_size):
+        batch = paths[batch_start : batch_start + batch_size]
+        result = await docs_client.upload(
+            files=batch,
+            search_space_id=ctx.search_space_id,
+            should_summarize=settings.should_summarize,
+            use_vision_llm=settings.use_vision_llm,
+            processing_mode=settings.processing_mode,
+        )
+        all_ids = list(result.document_ids) + list(result.duplicate_document_ids)
+        if result.document_ids:
+            try:
+                await docs_client.wait_until_ready(
+                    search_space_id=ctx.search_space_id,
+                    document_ids=result.document_ids,
+                    timeout_s=900.0,
+                )
+            except (DocumentProcessingFailed, DocumentProcessingTimeout) as exc:
+                logger.warning("FRAMES batch processing issue: %s", exc)
+        if all_ids:
+            statuses = await docs_client.get_status(
+                search_space_id=ctx.search_space_id,
+                document_ids=all_ids,
+            )
+            for s in statuses:
+                # SurfSense stores the uploaded filename as ``title`` (no extension).
+                stem = Path(s.title).stem if s.title.endswith(".md") else s.title
+                name_to_id[stem] = s.document_id
+                name_to_id[s.title] = s.document_id
+        logger.info(
+            "FRAMES upload batch %d-%d: %d new, %d duplicate",
+            batch_start, batch_start + len(batch),
+            len(result.document_ids), len(result.duplicate_document_ids),
+        )
+    return name_to_id
+
+
+def _resolve_question_doc_ids(
+    questions: list[Any],
+    fetched: dict[str, WikiArticle],
+    name_to_id: dict[str, int],
+) -> list[dict[str, Any]]:
+    """For each question, list the document_ids of its (fetched) wiki articles."""
+
+    rows: list[dict[str, Any]] = []
+    for q in questions:
+        doc_ids: list[int] = []
+        titles: list[str] = []
+        missing: list[str] = []
+        for url in q.wiki_urls:
+            article = fetched.get(url)
+            if article is None:
+                missing.append(url)
+                continue
+            titles.append(article.title)
+            stem = Path(cache_filename_for_title(article.title)).stem
+            doc_id = name_to_id.get(stem) or name_to_id.get(article.markdown_path.name)
+            if doc_id is not None and doc_id not in doc_ids:
+                doc_ids.append(doc_id)
+        rows.append({
+            "qid": q.qid,
+            "raw_index": q.raw_index,
+            "n_wiki_urls": len(q.wiki_urls),
+            "wiki_titles": titles,
+            "document_ids": doc_ids,
+            "missing_urls": missing,
+        })
+    return rows
+
+
+# ---------------------------------------------------------------------------
+# Public entry point
+# ---------------------------------------------------------------------------
+
+
+async def run_ingest(
+    ctx: RunContext,
+    *,
+    max_questions: int | None = None,
+    upload_batch_size: int = 16,
+    skip_upload: bool = False,
+    fetch_rate_limit_rps: float = 2.0,
+    settings: IngestSettings | None = None,
+) -> None:
+    """Ingest the FRAMES benchmark into the research suite.
+
+    Parameters
+    ----------
+    max_questions : int | None
+        Cap on the number of FRAMES questions to materialise. ``None`` =
+        all 824 (≈300+ unique articles). Smoke runs should pass 5-10.
+    upload_batch_size : int
+        Markdown files per ``/documents/fileupload`` call. Larger
+        batches reduce round-trip overhead; smaller batches recover
+        faster from individual processing failures.
+    skip_upload : bool
+        Fetch + cache Wikipedia articles locally but don't push to
+        SurfSense. Useful for debugging the fetcher in isolation.
+    fetch_rate_limit_rps : float
+        Maximum requests-per-second to the Wikipedia API. Default 2.0
+        is a polite ceiling; raise cautiously.
+    settings : IngestSettings | None
+        Override per-upload knobs. FRAMES defaults to text-only
+        (no vision LLM, basic mode) — the corpus is plain wikitext.
+    """
+
+    settings = settings or IngestSettings(
+        use_vision_llm=False,
+        processing_mode="basic",
+        should_summarize=False,
+    )
+    bench_dir = ctx.benchmark_data_dir()
+    wiki_cache = bench_dir / "wiki"
+    wiki_cache.mkdir(parents=True, exist_ok=True)
+    hf_cache = bench_dir / ".hf_cache"
+    hf_cache.mkdir(parents=True, exist_ok=True)
+
+    # 1. Download + parse questions.
+    tsv_path = download_test_tsv(hf_cache)
+    questions = load_questions(tsv_path)
+    if not questions:
+        raise RuntimeError(
+            "FRAMES test.tsv contained no parseable rows; upstream may "
+            "have changed schema."
+        )
+    logger.info("FRAMES: parsed %d questions from %s", len(questions), tsv_path.name)
+    if max_questions is not None and max_questions > 0:
+        questions = questions[:max_questions]
+        logger.info("FRAMES: capped to first %d questions", len(questions))
+
+    questions_jsonl = bench_dir / "questions.jsonl"
+    write_questions_jsonl(questions, questions_jsonl)
+
+    # 2. Build deduplicated URL set (preserving first-seen order).
+    seen_urls: dict[str, None] = {}
+    for q in questions:
+        for url in q.wiki_urls:
+            seen_urls.setdefault(url, None)
+    unique_urls = list(seen_urls.keys())
+    logger.info(
+        "FRAMES: %d unique Wikipedia URLs across %d questions",
+        len(unique_urls), len(questions),
+    )
+
+    # 3. Fetch (with cache).
+    fetcher = WikiFetcher(cache_dir=wiki_cache, rate_limit_rps=fetch_rate_limit_rps)
+    n_cached = sum(
+        1 for url in unique_urls
+        if (wiki_cache / cache_filename_for_title(_safe_title(url))).exists()
+    )
+    fetched, missing_urls = await _fetch_articles(fetcher, unique_urls)
+    logger.info(
+        "FRAMES: fetched=%d, cache_hits=%d, missing=%d",
+        len(fetched), n_cached, len(missing_urls),
+    )
+
+    # 4. Upload to SurfSense (deduped by filename).
+    name_to_id: dict[str, int] = {}
+    if skip_upload:
+        logger.info("FRAMES: --skip-upload; skipping SurfSense ingestion")
+    else:
+        unique_articles = list({a.markdown_path: a for a in fetched.values()}.values())
+        name_to_id = await _upload_markdowns(
+            ctx,
+            unique_articles,
+            batch_size=upload_batch_size,
+            settings=settings,
+        )
+
+    # 5. Persist per-question doc map.
+    doc_rows = _resolve_question_doc_ids(questions, fetched, name_to_id)
+
+    map_path = ctx.maps_dir() / "frames_doc_map.jsonl"
+    with map_path.open("w", encoding="utf-8") as fh:
+        fh.write(settings_header_line(settings) + "\n")
+        for row in doc_rows:
+            fh.write(json.dumps(row) + "\n")
+    logger.info("Wrote FRAMES doc map to %s (%d rows)", map_path, len(doc_rows))
+
+    # 6. Update suite state.
+    new_state = ctx.suite_state
+    new_state.ingestion_maps["frames"] = str(map_path)
+    set_suite_state(ctx.config, ctx.suite, new_state)
+
+    stats = _IngestStats(
+        n_questions=len(questions),
+        n_unique_urls=len(unique_urls),
+        n_fetched=len(fetched),
+        n_cached_hits=n_cached,
+        n_missing=len(missing_urls),
+        n_uploaded=len(name_to_id),
+        n_existing=0,
+        bench_dir=bench_dir,
+        map_path=map_path,
+    )
+    logger.info("FRAMES ingest done: %s", stats)
+
+
+def _safe_title(url: str) -> str:
+    """Pre-cache title resolution; returns ``""`` on bad URL."""
+
+    try:
+        return title_from_url(url)
+    except ValueError:
+        return ""
+
+
+__all__ = ["run_ingest"]
diff --git a/surfsense_evals/src/surfsense_evals/suites/research/frames/prompt.py b/surfsense_evals/src/surfsense_evals/suites/research/frames/prompt.py
new file mode 100644
index 000000000..16bb06da4
--- /dev/null
+++ b/surfsense_evals/src/surfsense_evals/suites/research/frames/prompt.py
@@ -0,0 +1,71 @@
+"""FRAMES prompt templates.
+
+Two templates: one for the bare-LLM arm (no retrieval), one for
+SurfSense (the agent retrieves; we mostly just instruct it on
+output format). Both arms must use byte-identical *content* for the
+question itself so the head-to-head is fair — the wrappers diverge
+only in framing.
+
+Format expectations (mirrors the FRAMES paper, section 4):
+
+* Short factual answer — names, dates, numbers, ordinals
+* No extra explanation in the final line; we anchor on
+  ``Answer: <text>`` for deterministic extraction
+* Free-text reasoning is *allowed* before the final ``Answer:`` line —
+  multi-hop questions often benefit from it. We just don't grade it.
+"""
+
+from __future__ import annotations
+
+
+_BASE_INSTRUCTIONS = (
+    "You are a careful question-answering assistant. The question may "
+    "require combining facts from multiple sources, doing arithmetic, "
+    "or reasoning about dates. Think step by step if needed, then give "
+    "the final answer.\n\n"
+    "Format your final line EXACTLY as:\n"
+    "Answer: <short answer>\n\n"
+    "The answer should be as short as possible — a name, a number, a "
+    "date, a single phrase. Do not repeat the question. Do not include "
+    "punctuation at the end unless it is part of the answer."
+)
+
+
+_BARE_TEMPLATE = """\
+{instructions}
+
+Question: {question}
+"""
+
+
+_SURFSENSE_TEMPLATE = """\
+{instructions}
+
+You have access to a Wikipedia knowledge base via retrieval. Use it
+to look up any facts you are not confident about. The corpus contains
+the Wikipedia articles needed to answer this question, but you must
+retrieve them yourself — they are not pre-selected.
+
+Question: {question}
+"""
+
+
+def build_bare_prompt(question: str) -> str:
+    """Prompt for the no-retrieval baseline arm."""
+
+    return _BARE_TEMPLATE.format(
+        instructions=_BASE_INSTRUCTIONS,
+        question=question.strip(),
+    )
+
+
+def build_surfsense_prompt(question: str) -> str:
+    """Prompt for the SurfSense arm (retrieval-augmented)."""
+
+    return _SURFSENSE_TEMPLATE.format(
+        instructions=_BASE_INSTRUCTIONS,
+        question=question.strip(),
+    )
+
+
+__all__ = ["build_bare_prompt", "build_surfsense_prompt"]
diff --git a/surfsense_evals/src/surfsense_evals/suites/research/frames/runner.py b/surfsense_evals/src/surfsense_evals/suites/research/frames/runner.py
new file mode 100644
index 000000000..a8dde0dd2
--- /dev/null
+++ b/surfsense_evals/src/surfsense_evals/suites/research/frames/runner.py
@@ -0,0 +1,686 @@
+"""FRAMES runner — Bare LLM (no retrieval) vs SurfSense (multi-hop RAG).
+
+Two arms run paired on every question in the sample:
+
+1. ``BareLlmArm``  — OpenRouter chat completion with the question only.
+   Reproduces the published "naive prompting" baseline (40.8% on
+   Gemini-Pro-1.5).
+2. ``SurfSenseArm`` — POST ``/api/v1/new_chat`` with **no**
+   ``mentioned_document_ids`` so the agent retrieves over the entire
+   ingested Wikipedia corpus. This is the "multi-step retrieval &
+   reasoning" cell in the FRAMES paper.
+
+Open-ended grading: deterministic shortcut + optional LLM-as-judge
+(``--no-judge`` to disable). Cost / latency / token aggregates are
+collected per arm. Paired stats (McNemar, bootstrap CI) for the
+accuracy delta. Per-reasoning-type breakdown to surface where one
+arm beats the other (numerical vs temporal vs multi-constraint, ...).
+"""
+
+from __future__ import annotations
+
+import argparse
+import asyncio
+import json
+import logging
+import os
+from collections.abc import Iterable
+from dataclasses import dataclass
+from pathlib import Path
+from typing import Any
+
+from ....core.arms import ArmRequest, ArmResult, BareLlmArm, SurfSenseArm
+from ....core.config import utc_iso_timestamp
+from ....core.ingest_settings import (
+    IngestSettings,
+    add_ingest_settings_args,
+    format_ingest_settings_md,
+    is_settings_header,
+)
+from ....core.metrics.comparison import (
+    bootstrap_delta_ci,
+    mcnemar_test,
+    paired_aggregate,
+)
+from ....core.metrics.mc_accuracy import accuracy_with_wilson_ci
+from ....core.parse.freeform_answer import extract_freeform_answer
+from ....core.providers.openrouter_chat import OpenRouterChatProvider
+from ....core.registry import ReportSection, RunArtifact, RunContext
+from ....core.scenarios import format_scenario_md
+from .grader import GradeResult, JudgeConfig, LlmJudge, grade_many
+from .prompt import build_bare_prompt, build_surfsense_prompt
+
+logger = logging.getLogger(__name__)
+
+
+# ---------------------------------------------------------------------------
+# Question shape
+# ---------------------------------------------------------------------------
+
+
+@dataclass
+class FramesRunnerQuestion:
+    qid: str
+    raw_index: int
+    question: str
+    gold_answer: str
+    reasoning_types: list[str]
+    document_ids: list[int]   # subset of corpus relevant to this Q (may be empty)
+    n_wiki_urls: int
+    missing_urls: list[str]
+
+
+def _load_doc_map(map_path: Path) -> tuple[dict[str, dict[str, Any]], dict[str, Any]]:
+    rows: dict[str, dict[str, Any]] = {}
+    settings: dict[str, Any] = {}
+    with map_path.open("r", encoding="utf-8") as fh:
+        for line in fh:
+            line = line.strip()
+            if not line:
+                continue
+            row = json.loads(line)
+            if is_settings_header(row):
+                settings = dict(row["__settings__"])
+                continue
+            rows[str(row["qid"])] = row
+    return rows, settings
+
+
+def _load_questions(
+    questions_jsonl: Path,
+    doc_map: dict[str, dict[str, Any]],
+    *,
+    sample_n: int | None,
+    reasoning_filter: str | None,
+) -> list[FramesRunnerQuestion]:
+    out: list[FramesRunnerQuestion] = []
+    with questions_jsonl.open("r", encoding="utf-8") as fh:
+        for line in fh:
+            line = line.strip()
+            if not line:
+                continue
+            row = json.loads(line)
+            qid = str(row.get("qid") or "").strip()
+            if not qid:
+                continue
+            map_row = doc_map.get(qid, {})
+            reasoning = list(row.get("reasoning_types") or [])
+            if reasoning_filter and reasoning_filter not in [r.lower() for r in reasoning]:
+                continue
+            out.append(FramesRunnerQuestion(
+                qid=qid,
+                raw_index=int(row.get("raw_index") or 0),
+                question=str(row.get("question") or "").strip(),
+                gold_answer=str(row.get("gold_answer") or "").strip(),
+                reasoning_types=reasoning,
+                document_ids=list(map_row.get("document_ids") or []),
+                n_wiki_urls=int(map_row.get("n_wiki_urls") or 0),
+                missing_urls=list(map_row.get("missing_urls") or []),
+            ))
+    out.sort(key=lambda q: q.raw_index)
+    if sample_n is not None and sample_n > 0:
+        out = out[:sample_n]
+    return out
+
+
+# ---------------------------------------------------------------------------
+# Bounded concurrency helper
+# ---------------------------------------------------------------------------
+
+
+async def _gather_with_limit(coros: Iterable, *, concurrency: int) -> list[Any]:
+    sem = asyncio.Semaphore(max(1, concurrency))
+
+    async def _wrap(coro):
+        async with sem:
+            return await coro
+
+    return await asyncio.gather(*(_wrap(c) for c in coros))
+
+
+# ---------------------------------------------------------------------------
+# Benchmark
+# ---------------------------------------------------------------------------
+
+
+_DESCRIPTION = (
+    "FRAMES (824 multi-hop Wikipedia questions, 5 reasoning types) — "
+    "Bare LLM (no retrieval) vs SurfSense (multi-step RAG over the "
+    "Wikipedia corpus). Tests cross-document retrieval + reasoning."
+)
+
+
+_DEFAULT_INGEST_SETTINGS = IngestSettings(
+    use_vision_llm=False,
+    processing_mode="basic",
+    should_summarize=False,
+)
+
+
+class FramesBenchmark:
+    """Multi-hop Wikipedia RAG vs naive prompting."""
+
+    suite: str = "research"
+    name: str = "frames"
+    headline: bool = True
+    description: str = _DESCRIPTION
+
+    def add_run_args(self, parser: argparse.ArgumentParser) -> None:
+        parser.add_argument(
+            "--n", dest="sample_n", type=int, default=None,
+            help="Run only the first N questions after filters (default: all 824).",
+        )
+        parser.add_argument(
+            "--reasoning",
+            dest="reasoning_filter",
+            default=None,
+            help=(
+                "Filter to questions tagged with this reasoning type "
+                "(e.g. 'numerical reasoning', 'temporal reasoning'). "
+                "Case-insensitive substring against the upstream tags."
+            ),
+        )
+        parser.add_argument(
+            "--concurrency", type=int, default=4,
+            help="Parallel question workers per arm.",
+        )
+        parser.add_argument(
+            "--scope-mentions", dest="scope_mentions", action="store_true",
+            help=(
+                "SurfSense arm: scope retrieval to the per-question "
+                "document_ids (oracle-retrieval upper bound). Default "
+                "is full-corpus retrieval (the realistic FRAMES setting)."
+            ),
+        )
+        parser.add_argument(
+            "--max-output-tokens", type=int, default=512,
+            help="Cap on completion length for both arms.",
+        )
+        parser.add_argument(
+            "--no-judge", dest="no_judge", action="store_true",
+            help=(
+                "Disable LLM-as-judge fallback grading; use only the "
+                "deterministic grader (faster but more pessimistic)."
+            ),
+        )
+        parser.add_argument(
+            "--judge-model",
+            dest="judge_model",
+            default="anthropic/claude-sonnet-4.5",
+            help="OpenRouter slug for the LLM judge (default: claude-sonnet-4.5).",
+        )
+        parser.add_argument(
+            "--judge-concurrency",
+            dest="judge_concurrency",
+            type=int,
+            default=4,
+            help="Parallel judge calls (default: 4).",
+        )
+        # Ingest-only knobs.
+        parser.add_argument(
+            "--max-questions", dest="max_questions", type=int, default=None,
+            help="(ingest only) cap on number of questions to materialise + ingest.",
+        )
+        parser.add_argument(
+            "--upload-batch-size", dest="upload_batch_size", type=int, default=16,
+            help="(ingest only) markdown files per fileupload call.",
+        )
+        parser.add_argument(
+            "--skip-upload", dest="skip_upload", action="store_true",
+            help="(ingest only) cache wiki articles locally but don't push to SurfSense.",
+        )
+        parser.add_argument(
+            "--fetch-rps", dest="fetch_rate_limit_rps", type=float, default=2.0,
+            help="(ingest only) max requests/second to the Wikipedia API.",
+        )
+        add_ingest_settings_args(parser, defaults=_DEFAULT_INGEST_SETTINGS)
+
+    async def ingest(self, ctx: RunContext, **opts: Any) -> None:
+        from .ingest import run_ingest
+
+        settings = IngestSettings.merge(_DEFAULT_INGEST_SETTINGS, opts)
+        await run_ingest(
+            ctx,
+            max_questions=opts.get("max_questions"),
+            upload_batch_size=int(opts.get("upload_batch_size") or 16),
+            skip_upload=bool(opts.get("skip_upload", False)),
+            fetch_rate_limit_rps=float(opts.get("fetch_rate_limit_rps") or 2.0),
+            settings=settings,
+        )
+
+    async def run(self, ctx: RunContext, **opts: Any) -> RunArtifact:
+        sample_n = opts.get("sample_n")
+        reasoning_filter = opts.get("reasoning_filter")
+        if reasoning_filter:
+            reasoning_filter = reasoning_filter.strip().lower() or None
+        concurrency = int(opts.get("concurrency") or 4)
+        scope_mentions = bool(opts.get("scope_mentions"))
+        max_output_tokens = int(opts.get("max_output_tokens") or 512)
+        no_judge = bool(opts.get("no_judge"))
+        judge_model = str(opts.get("judge_model") or "anthropic/claude-sonnet-4.5")
+        judge_concurrency = int(opts.get("judge_concurrency") or 4)
+
+        bench_dir = ctx.benchmark_data_dir()
+        questions_jsonl = bench_dir / "questions.jsonl"
+        map_path = ctx.maps_dir() / "frames_doc_map.jsonl"
+        if not questions_jsonl.exists() or not map_path.exists():
+            raise RuntimeError(
+                "FRAMES not ingested for this suite. Run "
+                "`python -m surfsense_evals ingest research frames` first."
+            )
+
+        doc_map, ingest_settings = _load_doc_map(map_path)
+        questions = _load_questions(
+            questions_jsonl, doc_map,
+            sample_n=sample_n,
+            reasoning_filter=reasoning_filter,
+        )
+        if not questions:
+            raise RuntimeError(
+                "No FRAMES questions matched the filters; broaden --reasoning/--n."
+            )
+        logger.info("FRAMES: scheduled %d questions", len(questions))
+
+        api_key = os.environ.get("OPENROUTER_API_KEY")
+        if not api_key:
+            raise RuntimeError(
+                "OPENROUTER_API_KEY env var is required for the bare-LLM arm."
+            )
+
+        bare_provider = OpenRouterChatProvider(
+            api_key=api_key,
+            base_url=ctx.config.openrouter_base_url,
+            model=ctx.native_arm_model,
+        )
+        bare_arm = BareLlmArm(
+            provider=bare_provider,
+            max_output_tokens=max_output_tokens,
+        )
+        surf_arm = SurfSenseArm(
+            client=ctx.new_chat_client(),
+            search_space_id=ctx.search_space_id,
+            ephemeral_threads=True,
+        )
+
+        judge: LlmJudge | None = None
+        if not no_judge:
+            judge = LlmJudge(config=JudgeConfig(
+                api_key=api_key,
+                model=judge_model,
+                base_url=ctx.config.openrouter_base_url,
+                concurrency=judge_concurrency,
+            ))
+
+        run_timestamp = utc_iso_timestamp()
+        run_dir = ctx.runs_dir(run_timestamp=run_timestamp)
+        raw_path = run_dir / "raw.jsonl"
+
+        async def _bare_one(q: FramesRunnerQuestion) -> ArmResult:
+            return await bare_arm.answer(_make_bare_request(q, max_output_tokens))
+
+        async def _surf_one(q: FramesRunnerQuestion) -> ArmResult:
+            return await surf_arm.answer(
+                _make_surfsense_request(q, scope_mentions=scope_mentions)
+            )
+
+        bare_results, surf_results = await asyncio.gather(
+            _gather_with_limit((_bare_one(q) for q in questions), concurrency=concurrency),
+            _gather_with_limit((_surf_one(q) for q in questions), concurrency=concurrency),
+        )
+
+        bare_grades = await _grade_results(questions, bare_results, judge=judge)
+        surf_grades = await _grade_results(questions, surf_results, judge=judge)
+
+        with raw_path.open("w", encoding="utf-8") as fh:
+            for q, b_res, s_res, b_g, s_g in zip(
+                questions, bare_results, surf_results, bare_grades, surf_grades, strict=False
+            ):
+                meta = {
+                    "qid": q.qid,
+                    "raw_index": q.raw_index,
+                    "reasoning_types": q.reasoning_types,
+                    "n_wiki_urls": q.n_wiki_urls,
+                    "n_resolved_doc_ids": len(q.document_ids),
+                    "n_missing_urls": len(q.missing_urls),
+                    "gold": q.gold_answer,
+                }
+                fh.write(json.dumps({
+                    **meta,
+                    **b_res.to_jsonl(),
+                    "graded": b_g.to_dict(),
+                }) + "\n")
+                fh.write(json.dumps({
+                    **meta,
+                    **s_res.to_jsonl(),
+                    "graded": s_g.to_dict(),
+                }) + "\n")
+
+        metrics = _compute_metrics(questions, bare_results, surf_results, bare_grades, surf_grades)
+        artifact = RunArtifact(
+            suite=self.suite,
+            benchmark=self.name,
+            run_timestamp=run_timestamp,
+            raw_path=raw_path,
+            metrics=metrics,
+            extra={
+                "n_questions": len(questions),
+                "concurrency": concurrency,
+                "reasoning_filter": reasoning_filter,
+                "scope_mentions": scope_mentions,
+                "no_judge": no_judge,
+                "judge_model": judge_model if not no_judge else None,
+                "scenario": ctx.scenario,
+                "provider_model": ctx.provider_model,
+                "native_arm_model": ctx.native_arm_model,
+                "vision_provider_model": ctx.vision_provider_model,
+                "agent_llm_id": ctx.agent_llm_id,
+                "ingest_settings": ingest_settings,
+                "bare_arm_label": "bare_llm",
+            },
+        )
+
+        manifest_path = run_dir / "run_artifact.json"
+        manifest_path.write_text(
+            json.dumps({
+                "suite": self.suite,
+                "benchmark": self.name,
+                "raw_path": "raw.jsonl",
+                "metrics": metrics,
+                "extra": artifact.extra,
+            }, indent=2, sort_keys=True) + "\n",
+            encoding="utf-8",
+        )
+        return artifact
+
+    def report_section(self, artifacts: list[RunArtifact]) -> ReportSection:
+        if not artifacts:
+            return ReportSection(
+                title="FRAMES — Bare LLM vs SurfSense (multi-hop Wikipedia RAG)",
+                headline=True,
+                body_md="(no run artifacts found)",
+                body_json={},
+            )
+        latest = max(artifacts, key=lambda a: a.run_timestamp)
+        m = latest.metrics
+        bare = m.get("bare", {})
+        surf = m.get("surfsense", {})
+        delta = m.get("delta", {})
+        per_reasoning = m.get("per_reasoning", {})
+        extra = latest.extra
+
+        body_lines: list[str] = []
+        body_lines.append(
+            f"- Sample size: {extra.get('n_questions', '?')} questions "
+            f"(reasoning filter: `{extra.get('reasoning_filter') or 'none'}`, "
+            f"scope-mentions: `{extra.get('scope_mentions', False)}`, "
+            f"judge: `{extra.get('judge_model') or 'deterministic-only'}`)."
+        )
+        body_lines.append(format_scenario_md(extra))
+        body_lines.append(format_ingest_settings_md(extra.get("ingest_settings")))
+        body_lines.append(
+            "- Bare LLM arm (OpenRouter chat, no retrieval, "
+            f"`{extra.get('native_arm_model') or extra.get('provider_model', '?')}`):"
+        )
+        body_lines.append(_arm_summary_lines(bare, indent="  "))
+        body_lines.append(
+            "- SurfSense arm (`POST /api/v1/new_chat`, multi-step RAG, "
+            f"`{extra.get('provider_model', '?')}`):"
+        )
+        body_lines.append(_arm_summary_lines(surf, indent="  "))
+        body_lines.append("- Delta (paired):")
+        body_lines.append(
+            f"  - Accuracy: SurfSense {_pp(delta.get('accuracy_pp'))} pp "
+            f"(McNemar p={_fmt(delta.get('mcnemar_p_value'), 4)}, "
+            f"method={delta.get('mcnemar_method')})"
+        )
+        body_lines.append(
+            f"  - Bootstrap 95% CI on accuracy delta: "
+            f"[{_pp(delta.get('bootstrap_ci_low'))}pp, {_pp(delta.get('bootstrap_ci_high'))}pp]"
+        )
+        body_lines.append(
+            f"  - Cost / question: bare ${_dollars(bare.get('cost_micros_mean'))}, "
+            f"surfsense ${_dollars(surf.get('cost_micros_mean'))} "
+            f"(SurfSense delta {_pct_change(delta.get('cost_micros_pct'))})"
+        )
+        body_lines.append(
+            f"  - Latency p50: bare {_ms_to_s(bare.get('latency_ms_median'))}, "
+            f"surfsense {_ms_to_s(surf.get('latency_ms_median'))} "
+            f"(SurfSense delta {_pct_change(delta.get('latency_ms_pct'))})"
+        )
+        if per_reasoning:
+            body_lines.append("- Per-reasoning-type split (accuracy delta in pp):")
+            for tag, vals in sorted(per_reasoning.items()):
+                body_lines.append(
+                    f"  - {tag}: SurfSense {_pp(vals.get('delta_accuracy_pp'))} pp "
+                    f"(n={vals.get('n')}, bare acc={vals.get('bare_accuracy', 0)*100:.1f}%, "
+                    f"surf acc={vals.get('surfsense_accuracy', 0)*100:.1f}%)"
+                )
+
+        return ReportSection(
+            title="FRAMES — Bare LLM vs SurfSense (multi-hop Wikipedia RAG)",
+            headline=True,
+            body_md="\n".join(body_lines),
+            body_json=m,
+        )
+
+
+# ---------------------------------------------------------------------------
+# Per-question helpers
+# ---------------------------------------------------------------------------
+
+
+def _make_bare_request(q: FramesRunnerQuestion, max_tokens: int) -> ArmRequest:
+    return ArmRequest(
+        question_id=q.qid,
+        prompt=build_bare_prompt(q.question),
+        options={"max_tokens": max_tokens},
+    )
+
+
+def _make_surfsense_request(q: FramesRunnerQuestion, *, scope_mentions: bool) -> ArmRequest:
+    mentions: list[int] | None = None
+    if scope_mentions and q.document_ids:
+        mentions = list(q.document_ids)
+    return ArmRequest(
+        question_id=q.qid,
+        prompt=build_surfsense_prompt(q.question),
+        mentioned_document_ids=mentions,
+    )
+
+
+async def _grade_results(
+    questions: list[FramesRunnerQuestion],
+    results: list[ArmResult],
+    *,
+    judge: LlmJudge | None,
+) -> list[GradeResult]:
+    rows: list[tuple[str, str, str, str]] = []
+    for q, r in zip(questions, results, strict=False):
+        pred = extract_freeform_answer(r.raw_text or "")
+        rows.append((q.qid, q.question, q.gold_answer, pred))
+    return await grade_many(rows=rows, judge=judge)
+
+
+# ---------------------------------------------------------------------------
+# Metrics aggregation
+# ---------------------------------------------------------------------------
+
+
+def _compute_metrics(
+    questions: list[FramesRunnerQuestion],
+    bare_results: list[ArmResult],
+    surf_results: list[ArmResult],
+    bare_grades: list[GradeResult],
+    surf_grades: list[GradeResult],
+) -> dict[str, Any]:
+    bare_correct = [g.correct for g in bare_grades]
+    surf_correct = [g.correct for g in surf_grades]
+
+    bare_costs = [float(r.cost_micros) for r in bare_results]
+    surf_costs = [float(r.cost_micros) for r in surf_results]
+    bare_latencies = [float(r.latency_ms) for r in bare_results]
+    surf_latencies = [float(r.latency_ms) for r in surf_results]
+    bare_in_tokens = [float(r.input_tokens) for r in bare_results]
+    bare_out_tokens = [float(r.output_tokens) for r in bare_results]
+
+    bare_acc = accuracy_with_wilson_ci(sum(bare_correct), len(bare_correct))
+    surf_acc = accuracy_with_wilson_ci(sum(surf_correct), len(surf_correct))
+    mc = mcnemar_test(bare_correct, surf_correct)
+    boot = bootstrap_delta_ci(bare_correct, surf_correct, n_resamples=2000)
+
+    bare_cost_agg = paired_aggregate(bare_costs)
+    surf_cost_agg = paired_aggregate(surf_costs)
+    bare_latency_agg = paired_aggregate(bare_latencies)
+    surf_latency_agg = paired_aggregate(surf_latencies)
+    cost_pct = _safe_pct(surf_cost_agg.mean, bare_cost_agg.mean)
+    latency_pct = _safe_pct(surf_latency_agg.median, bare_latency_agg.median)
+
+    # Per-reasoning-type breakdown. Each question may carry multiple
+    # reasoning tags; we count it under each tag (so totals don't
+    # equal len(questions) — the reader is expected to look at the
+    # per-tag ``n``).
+    per_reasoning_pairs: dict[str, list[tuple[bool, bool]]] = {}
+    for q, b_ok, s_ok in zip(questions, bare_correct, surf_correct, strict=False):
+        tags = q.reasoning_types or ["(untagged)"]
+        for tag in tags:
+            per_reasoning_pairs.setdefault(tag, []).append((b_ok, s_ok))
+
+    per_reasoning: dict[str, dict[str, Any]] = {}
+    for tag, pairs in per_reasoning_pairs.items():
+        b_correct = [a for a, _ in pairs]
+        s_correct = [b for _, b in pairs]
+        per_reasoning[tag] = {
+            "n": len(pairs),
+            "bare_accuracy": (sum(b_correct) / len(pairs)) if pairs else 0.0,
+            "surfsense_accuracy": (sum(s_correct) / len(pairs)) if pairs else 0.0,
+            "delta_accuracy_pp": (
+                100.0 * (sum(s_correct) - sum(b_correct)) / len(pairs)
+                if pairs else 0.0
+            ),
+        }
+
+    grader_methods = {
+        "bare": _count_methods(bare_grades),
+        "surfsense": _count_methods(surf_grades),
+    }
+
+    return {
+        "bare": {
+            **bare_acc.to_dict(),
+            "cost_micros_mean": bare_cost_agg.mean,
+            "cost_micros_median": bare_cost_agg.median,
+            "latency_ms_mean": bare_latency_agg.mean,
+            "latency_ms_median": bare_latency_agg.median,
+            "latency_ms_p95": bare_latency_agg.p95,
+            "input_tokens_mean": (sum(bare_in_tokens) / len(bare_in_tokens)) if bare_in_tokens else 0.0,
+            "output_tokens_mean": (sum(bare_out_tokens) / len(bare_out_tokens)) if bare_out_tokens else 0.0,
+        },
+        "surfsense": {
+            **surf_acc.to_dict(),
+            "cost_micros_mean": surf_cost_agg.mean,
+            "cost_micros_median": surf_cost_agg.median,
+            "latency_ms_mean": surf_latency_agg.mean,
+            "latency_ms_median": surf_latency_agg.median,
+            "latency_ms_p95": surf_latency_agg.p95,
+        },
+        "delta": {
+            "accuracy_pp": 100.0 * (surf_acc.accuracy - bare_acc.accuracy),
+            "mcnemar_p_value": mc.p_value,
+            "mcnemar_method": mc.method,
+            "mcnemar_b_bare_only": mc.b,
+            "mcnemar_c_surfsense_only": mc.c,
+            "bootstrap_ci_low": 100.0 * boot.ci_low,
+            "bootstrap_ci_high": 100.0 * boot.ci_high,
+            "cost_micros_pct": cost_pct,
+            "latency_ms_pct": latency_pct,
+        },
+        "per_reasoning": per_reasoning,
+        "grader_methods": grader_methods,
+    }
+
+
+def _count_methods(grades: list[GradeResult]) -> dict[str, int]:
+    out: dict[str, int] = {}
+    for g in grades:
+        out[g.method] = out.get(g.method, 0) + 1
+    return out
+
+
+def _safe_pct(numerator: float, denominator: float) -> float | None:
+    if denominator == 0:
+        return None
+    return 100.0 * (numerator - denominator) / denominator
+
+
+# ---------------------------------------------------------------------------
+# Tiny formatting helpers used by report_section
+# ---------------------------------------------------------------------------
+
+
+def _arm_summary_lines(d: dict[str, Any], *, indent: str) -> str:
+    if not d:
+        return f"{indent}(no data)"
+    acc = d.get("accuracy", 0.0)
+    low = d.get("ci_low", 0.0)
+    high = d.get("ci_high", 0.0)
+    lines = [
+        f"{indent}- Accuracy: {acc * 100:.1f}% (Wilson 95% CI: {low * 100:.1f}% – {high * 100:.1f}%)",
+        f"{indent}- Cost / question: ${_dollars(d.get('cost_micros_mean'))} (mean), "
+        f"${_dollars(d.get('cost_micros_median'))} (median)",
+        f"{indent}- Latency: p50 {_ms_to_s(d.get('latency_ms_median'))}, "
+        f"p95 {_ms_to_s(d.get('latency_ms_p95'))}",
+    ]
+    if "input_tokens_mean" in d:
+        lines.append(
+            f"{indent}- Mean tokens / question: in {d.get('input_tokens_mean', 0):.0f}, "
+            f"out {d.get('output_tokens_mean', 0):.0f}"
+        )
+    return "\n".join(lines)
+
+
+def _dollars(micros: Any) -> str:
+    if micros is None:
+        return "?"
+    try:
+        return f"{(float(micros) / 1_000_000):.4f}"
+    except (TypeError, ValueError):
+        return "?"
+
+
+def _ms_to_s(ms: Any) -> str:
+    if ms is None:
+        return "?"
+    try:
+        return f"{float(ms) / 1000:.1f}s"
+    except (TypeError, ValueError):
+        return "?"
+
+
+def _pp(value: Any) -> str:
+    if value is None:
+        return "?"
+    try:
+        return f"{float(value):+.1f}"
+    except (TypeError, ValueError):
+        return "?"
+
+
+def _pct_change(value: Any) -> str:
+    if value is None:
+        return "?"
+    try:
+        return f"{float(value):+.0f}%"
+    except (TypeError, ValueError):
+        return "?"
+
+
+def _fmt(value: Any, ndigits: int) -> str:
+    if value is None:
+        return "?"
+    try:
+        return f"{float(value):.{ndigits}f}"
+    except (TypeError, ValueError):
+        return "?"
+
+
+__all__ = ["FramesBenchmark", "FramesRunnerQuestion"]
diff --git a/surfsense_evals/src/surfsense_evals/suites/research/frames/wiki_fetch.py b/surfsense_evals/src/surfsense_evals/suites/research/frames/wiki_fetch.py
new file mode 100644
index 000000000..7f6b63e50
--- /dev/null
+++ b/surfsense_evals/src/surfsense_evals/suites/research/frames/wiki_fetch.py
@@ -0,0 +1,241 @@
+"""Wikipedia article fetcher → plain-text markdown, with disk cache.
+
+We hit the MediaWiki action API for *plain text* extracts:
+
+    GET https://en.wikipedia.org/w/api.php
+        ?action=query&prop=extracts&explaintext=true
+        &redirects=1&titles=<Title>&format=json&formatversion=2
+
+This avoids HTML→markdown conversion (and its many edge cases). The
+``explaintext=true`` mode strips infoboxes / templates / wikilinks
+and returns clean section-headered prose, which is exactly what we
+want SurfSense to chunk + embed. We prepend ``# <Title>\n\n`` so the
+markdown has a visible H1 (helps SurfSense's chunker preserve doc
+identity at the top of the first chunk).
+
+Caching: every fetched article lands in
+``<bench_dir>/wiki/<sanitised-title>.md`` and is reused on subsequent
+runs. The cache key is the URL-decoded title (e.g.
+``Charlotte_Brontë`` regardless of source URL casing or
+percent-encoding).
+
+Politeness: 2 RPS rate limit + a descriptive User-Agent (Wikimedia
+asks for one). We don't parallelise above 2 RPS — this is a courtesy
+to Wikipedia and only ~300 articles for the n=100 sample.
+"""
+
+from __future__ import annotations
+
+import asyncio
+import logging
+import re
+import urllib.parse
+from dataclasses import dataclass
+from pathlib import Path
+
+import httpx
+
+logger = logging.getLogger(__name__)
+
+
+WIKI_API = "https://en.wikipedia.org/w/api.php"
+USER_AGENT = (
+    "SurfSense-Evals/0.1 (https://github.com/MODSetter/SurfSense; "
+    "research-benchmark fetch; respects 2 RPS rate limit)"
+)
+
+
+@dataclass(frozen=True)
+class WikiArticle:
+    """One fetched article + metadata."""
+
+    title: str            # canonical title returned by MW (post-redirect)
+    source_url: str       # the URL we were asked to fetch
+    markdown_path: Path   # where the cached body lives on disk
+    n_chars: int          # length of the body (post-prepend H1)
+    redirected_from: str | None = None
+
+
+# ---------------------------------------------------------------------------
+# Title <-> URL helpers
+# ---------------------------------------------------------------------------
+
+
+_WIKI_PATH_RE = re.compile(r"^/wiki/(?P<title>[^?#]+)$")
+
+
+def title_from_url(url: str) -> str:
+    """Pull the page title out of a wiki URL.
+
+    ``https://en.wikipedia.org/wiki/Charlotte_Bront%C3%AB`` → ``Charlotte Brontë``.
+    Spaces are preserved (the API accepts spaces and underscores
+    interchangeably; we use spaces to keep cache filenames human-readable).
+    """
+
+    parsed = urllib.parse.urlparse(url)
+    if parsed.netloc and "wikipedia.org" not in parsed.netloc:
+        raise ValueError(f"Not a Wikipedia URL: {url!r}")
+    match = _WIKI_PATH_RE.match(parsed.path)
+    if not match:
+        raise ValueError(f"Unrecognised wiki path: {parsed.path!r}")
+    raw_title = urllib.parse.unquote(match.group("title"))
+    # MW treats underscores and spaces as equivalent; spaces are friendlier.
+    return raw_title.replace("_", " ").strip()
+
+
+_FILENAME_SAFE = re.compile(r"[^A-Za-z0-9._\- ]")
+
+
+def cache_filename_for_title(title: str) -> str:
+    """Map a title to a filesystem-safe filename.
+
+    Replaces every non-(alnum / ``._- `` / space) character with ``_``.
+    Title collisions are rare (FRAMES only has English Wikipedia titles)
+    and a final ``hash(title)[:8]`` would obscure the otherwise-readable
+    filenames; we accept the (vanishingly small) collision risk.
+    """
+
+    safe = _FILENAME_SAFE.sub("_", title)
+    safe = safe.strip().replace(" ", "_")
+    return f"{safe}.md"
+
+
+# ---------------------------------------------------------------------------
+# Async fetcher with rate limiting + retry
+# ---------------------------------------------------------------------------
+
+
+class WikiFetcher:
+    """Polite fetch + disk cache + redirect handling."""
+
+    def __init__(
+        self,
+        *,
+        cache_dir: Path,
+        rate_limit_rps: float = 2.0,
+        timeout_s: float = 30.0,
+        max_retries: int = 3,
+    ) -> None:
+        self._cache_dir = Path(cache_dir)
+        self._cache_dir.mkdir(parents=True, exist_ok=True)
+        self._min_interval = 1.0 / max(rate_limit_rps, 0.1)
+        self._last_request_at = 0.0
+        self._rate_lock = asyncio.Lock()
+        self._timeout = httpx.Timeout(timeout_s, connect=10.0)
+        self._max_retries = max_retries
+
+    async def _throttle(self) -> None:
+        async with self._rate_lock:
+            now = asyncio.get_event_loop().time()
+            wait = self._last_request_at + self._min_interval - now
+            if wait > 0:
+                await asyncio.sleep(wait)
+            self._last_request_at = asyncio.get_event_loop().time()
+
+    async def fetch(
+        self,
+        url: str,
+        *,
+        http: httpx.AsyncClient | None = None,
+    ) -> WikiArticle | None:
+        """Fetch one article. Returns ``None`` only if MW reports the title is missing.
+
+        Raises on transport errors after retries. Caller decides
+        whether to abort the whole ingest or continue with the
+        successfully-fetched subset.
+        """
+
+        try:
+            title = title_from_url(url)
+        except ValueError as exc:
+            logger.warning("Skipping non-wiki URL %s: %s", url, exc)
+            return None
+
+        cache_path = self._cache_dir / cache_filename_for_title(title)
+        if cache_path.exists() and cache_path.stat().st_size > 0:
+            return WikiArticle(
+                title=title,
+                source_url=url,
+                markdown_path=cache_path,
+                n_chars=cache_path.stat().st_size,
+            )
+
+        last_exc: Exception | None = None
+        for attempt in range(self._max_retries):
+            try:
+                await self._throttle()
+                payload = await self._fetch_extract(title, http=http)
+                break
+            except (httpx.HTTPError, RuntimeError) as exc:
+                last_exc = exc
+                wait = 1.0 * (2 ** attempt)
+                logger.warning(
+                    "wiki fetch %r attempt %d failed: %s; retry in %.1fs",
+                    title, attempt + 1, exc, wait,
+                )
+                await asyncio.sleep(wait)
+        else:
+            assert last_exc is not None
+            raise last_exc
+
+        page = payload.get("page") or {}
+        if not page or page.get("missing"):
+            logger.warning("Wikipedia reports missing page for %r (url=%s)", title, url)
+            return None
+
+        canonical_title = str(page.get("title") or title).strip()
+        body = str(page.get("extract") or "").strip()
+        if not body:
+            logger.warning("Wikipedia returned empty extract for %r", title)
+            return None
+        markdown = f"# {canonical_title}\n\n{body}\n"
+        cache_path.write_text(markdown, encoding="utf-8")
+        return WikiArticle(
+            title=canonical_title,
+            source_url=url,
+            markdown_path=cache_path,
+            n_chars=len(markdown),
+            redirected_from=title if canonical_title != title else None,
+        )
+
+    async def _fetch_extract(
+        self,
+        title: str,
+        *,
+        http: httpx.AsyncClient | None,
+    ) -> dict:
+        """One MW API call. Returns ``{'page': {...}}`` (formatversion=2)."""
+
+        params = {
+            "action": "query",
+            "prop": "extracts",
+            "explaintext": "true",
+            "redirects": "1",
+            "format": "json",
+            "formatversion": "2",
+            "titles": title,
+        }
+        headers = {"User-Agent": USER_AGENT, "Accept": "application/json"}
+        if http is not None:
+            response = await http.get(WIKI_API, params=params, headers=headers, timeout=self._timeout)
+        else:
+            async with httpx.AsyncClient(timeout=self._timeout) as client:
+                response = await client.get(WIKI_API, params=params, headers=headers, timeout=self._timeout)
+        response.raise_for_status()
+        data = response.json()
+        if "error" in data:
+            raise RuntimeError(f"MediaWiki API error: {data['error']!r}")
+        pages = (data.get("query") or {}).get("pages") or []
+        if not pages:
+            return {"page": {}}
+        return {"page": pages[0]}
+
+
+__all__ = [
+    "WIKI_API",
+    "USER_AGENT",
+    "WikiArticle",
+    "WikiFetcher",
+    "cache_filename_for_title",
+    "title_from_url",
+]
diff --git a/surfsense_evals/tests/__init__.py b/surfsense_evals/tests/__init__.py
new file mode 100644
index 000000000..8b1378917
--- /dev/null
+++ b/surfsense_evals/tests/__init__.py
@@ -0,0 +1 @@
+
diff --git a/surfsense_evals/tests/conftest.py b/surfsense_evals/tests/conftest.py
new file mode 100644
index 000000000..1cb1d0faf
--- /dev/null
+++ b/surfsense_evals/tests/conftest.py
@@ -0,0 +1,34 @@
+"""Shared pytest fixtures for surfsense-evals."""
+
+from __future__ import annotations
+
+import os
+from pathlib import Path
+
+import pytest
+
+from surfsense_evals.core.config import Config
+
+
+@pytest.fixture
+def tmp_env(monkeypatch, tmp_path: Path) -> Path:
+    """Isolate env vars + filesystem state per test.
+
+    Wipes every ``SURFSENSE_*`` / ``OPENROUTER_*`` / ``EVAL_*`` var so a
+    test that wants a specific credential mode can ``monkeypatch.setenv``
+    just what it needs without leakage from the caller's shell.
+    """
+
+    for key in list(os.environ):
+        if key.startswith(("SURFSENSE_", "OPENROUTER_", "EVAL_")):
+            monkeypatch.delenv(key, raising=False)
+    monkeypatch.setenv("EVAL_DATA_DIR", str(tmp_path / "data"))
+    monkeypatch.setenv("EVAL_REPORTS_DIR", str(tmp_path / "reports"))
+    return tmp_path
+
+
+@pytest.fixture
+def isolated_config(tmp_env: Path) -> Config:  # noqa: ARG001
+    from surfsense_evals.core.config import load_config
+
+    return load_config()
diff --git a/surfsense_evals/tests/core/__init__.py b/surfsense_evals/tests/core/__init__.py
new file mode 100644
index 000000000..8b1378917
--- /dev/null
+++ b/surfsense_evals/tests/core/__init__.py
@@ -0,0 +1 @@
+
diff --git a/surfsense_evals/tests/core/test_auth.py b/surfsense_evals/tests/core/test_auth.py
new file mode 100644
index 000000000..43ec94b93
--- /dev/null
+++ b/surfsense_evals/tests/core/test_auth.py
@@ -0,0 +1,95 @@
+"""Auth credential resolution + 401 refresh hook."""
+
+from __future__ import annotations
+
+import httpx
+import pytest
+import respx
+
+from surfsense_evals.core.auth import (
+    CredentialError,
+    acquire_token,
+    client_with_auth,
+)
+from surfsense_evals.core.config import Config
+
+
+def _make_config(**overrides) -> Config:
+    base = {
+        "surfsense_api_base": "http://test",
+        "openrouter_api_key": None,
+        "openrouter_base_url": "https://openrouter.ai/api/v1",
+        "surfsense_jwt": None,
+        "surfsense_refresh_token": None,
+        "surfsense_user_email": None,
+        "surfsense_user_password": None,
+        "data_dir": None,
+        "reports_dir": None,
+    }
+    base.update(overrides)
+    # Path objects required by Config; tests don't touch the FS.
+    from pathlib import Path
+
+    base["data_dir"] = base["data_dir"] or Path("/tmp/eval_test_data")
+    base["reports_dir"] = base["reports_dir"] or Path("/tmp/eval_test_reports")
+    return Config(**base)
+
+
+@pytest.mark.asyncio
+async def test_acquire_token_jwt_mode_short_circuits():
+    config = _make_config(surfsense_jwt="abc", surfsense_refresh_token="ref")
+    bundle = await acquire_token(config)
+    assert bundle.access_token == "abc"
+    assert bundle.refresh_token == "ref"
+    assert bundle.mode == "jwt"
+
+
+@pytest.mark.asyncio
+@respx.mock
+async def test_acquire_token_local_mode_posts_form():
+    respx.post("http://test/auth/jwt/login").mock(
+        return_value=httpx.Response(
+            200, json={"access_token": "T", "refresh_token": "R", "token_type": "bearer"}
+        )
+    )
+    config = _make_config(
+        surfsense_user_email="u@example.com", surfsense_user_password="pw"
+    )
+    bundle = await acquire_token(config)
+    assert bundle.access_token == "T"
+    assert bundle.refresh_token == "R"
+    assert bundle.mode == "local"
+
+
+@pytest.mark.asyncio
+async def test_acquire_token_no_credentials():
+    config = _make_config()
+    with pytest.raises(CredentialError) as exc:
+        await acquire_token(config)
+    assert "SURFSENSE_USER_EMAIL" in str(exc.value)
+    assert "SURFSENSE_JWT" in str(exc.value)
+
+
+@pytest.mark.asyncio
+@respx.mock
+async def test_client_with_auth_refreshes_on_401():
+    config = _make_config(surfsense_jwt="old", surfsense_refresh_token="ref")
+    bundle = await acquire_token(config)
+
+    respx.post("http://test/auth/jwt/refresh").mock(
+        return_value=httpx.Response(200, json={"access_token": "new", "refresh_token": "ref2"})
+    )
+    # First call returns 401; the retry (post-refresh) returns 200.
+    respx.get("http://test/api/v1/searchspaces").mock(
+        side_effect=[
+            httpx.Response(401, json={"detail": "expired"}),
+            httpx.Response(200, json=[]),
+        ]
+    )
+
+    async with client_with_auth(config, bundle) as client:
+        response = await client.get("http://test/api/v1/searchspaces")
+
+    assert response.status_code == 200
+    assert bundle.access_token == "new"
+    assert bundle.refresh_token == "ref2"
diff --git a/surfsense_evals/tests/core/test_clients.py b/surfsense_evals/tests/core/test_clients.py
new file mode 100644
index 000000000..9e2c4ad75
--- /dev/null
+++ b/surfsense_evals/tests/core/test_clients.py
@@ -0,0 +1,262 @@
+"""respx-mocked tests for the SurfSense HTTP clients."""
+
+from __future__ import annotations
+
+import json
+from pathlib import Path
+
+import httpx
+import pytest
+import respx
+
+from surfsense_evals.core.clients import (
+    DocumentsClient,
+    NewChatClient,
+    SearchSpaceClient,
+)
+from surfsense_evals.core.clients.new_chat import ThreadBusyError
+
+_BASE = "http://test"
+
+
+@pytest.fixture
+def http() -> httpx.AsyncClient:
+    return httpx.AsyncClient(base_url=_BASE)
+
+
+# ---------------------------------------------------------------------------
+# SearchSpaceClient
+# ---------------------------------------------------------------------------
+
+
+@pytest.mark.asyncio
+@respx.mock(base_url=_BASE)
+async def test_create_search_space_returns_row(respx_mock, http):
+    respx_mock.post("/api/v1/searchspaces").mock(
+        return_value=httpx.Response(
+            200,
+            json={
+                "id": 99,
+                "name": "eval-medical-2026",
+                "description": None,
+                "user_id": "user-x",
+                "citations_enabled": True,
+                "qna_custom_instructions": None,
+            },
+        )
+    )
+    client = SearchSpaceClient(http, _BASE)
+    row = await client.create("eval-medical-2026")
+    assert row.id == 99
+    assert row.name == "eval-medical-2026"
+
+
+@pytest.mark.asyncio
+@respx.mock(base_url=_BASE)
+async def test_delete_search_space_idempotent_on_404(respx_mock, http):
+    respx_mock.delete("/api/v1/searchspaces/42").mock(
+        return_value=httpx.Response(404, json={"detail": "gone"})
+    )
+    client = SearchSpaceClient(http, _BASE)
+    await client.delete(42)  # must not raise
+
+
+@pytest.mark.asyncio
+@respx.mock(base_url=_BASE)
+async def test_set_llm_preferences_partial_update(respx_mock, http):
+    route = respx_mock.put("/api/v1/search-spaces/42/llm-preferences").mock(
+        return_value=httpx.Response(
+            200,
+            json={
+                "agent_llm_id": -10042,
+                "document_summary_llm_id": None,
+                "image_generation_config_id": None,
+                "vision_llm_config_id": None,
+                "agent_llm": {
+                    "id": -10042,
+                    "provider": "OPENROUTER",
+                    "model_name": "anthropic/claude-sonnet-4.5",
+                },
+            },
+        )
+    )
+    client = SearchSpaceClient(http, _BASE)
+    prefs = await client.set_llm_preferences(42, agent_llm_id=-10042)
+    assert prefs.agent_llm_id == -10042
+    assert prefs.agent_llm["provider"] == "OPENROUTER"
+    sent_body = json.loads(route.calls[-1].request.content)
+    assert sent_body == {"agent_llm_id": -10042}
+
+
+# ---------------------------------------------------------------------------
+# DocumentsClient
+# ---------------------------------------------------------------------------
+
+
+@pytest.mark.asyncio
+@respx.mock(base_url=_BASE)
+async def test_documents_status_parses_state(respx_mock, http):
+    respx_mock.get("/api/v1/documents/status").mock(
+        return_value=httpx.Response(
+            200,
+            json={
+                "items": [
+                    {"id": 1, "title": "a.pdf", "document_type": "FILE",
+                     "status": {"state": "ready", "reason": None}},
+                    {"id": 2, "title": "b.pdf", "document_type": "FILE",
+                     "status": {"state": "failed", "reason": "ETL boom"}},
+                ]
+            },
+        )
+    )
+    client = DocumentsClient(http, _BASE)
+    statuses = await client.get_status(search_space_id=1, document_ids=[1, 2])
+    assert {s.document_id for s in statuses} == {1, 2}
+    assert {s.is_ready for s in statuses} == {True, False}
+
+
+@pytest.mark.asyncio
+@respx.mock(base_url=_BASE)
+async def test_documents_upload_returns_payload(respx_mock, http, tmp_path: Path):
+    f1 = tmp_path / "a.pdf"
+    f1.write_bytes(b"%PDF-1.4 small")
+    respx_mock.post("/api/v1/documents/fileupload").mock(
+        return_value=httpx.Response(
+            200,
+            json={
+                "message": "Files uploaded",
+                "document_ids": [101],
+                "duplicate_document_ids": [],
+                "total_files": 1,
+                "pending_files": 1,
+                "skipped_duplicates": 0,
+            },
+        )
+    )
+    client = DocumentsClient(http, _BASE)
+    result = await client.upload(files=[f1], search_space_id=7)
+    assert result.document_ids == [101]
+    assert result.pending_files == 1
+
+
+@pytest.mark.asyncio
+@respx.mock(base_url=_BASE)
+async def test_documents_list_chunks_paginated(respx_mock, http):
+    respx_mock.get("/api/v1/documents/5/chunks").mock(
+        side_effect=[
+            httpx.Response(200, json={
+                "items": [{"id": 1, "content": "a"}, {"id": 2, "content": "b"}],
+                "total": 3, "page": 0, "page_size": 2, "has_more": True,
+            }),
+            httpx.Response(200, json={
+                "items": [{"id": 3, "content": "c"}],
+                "total": 3, "page": 1, "page_size": 2, "has_more": False,
+            }),
+        ]
+    )
+    client = DocumentsClient(http, _BASE)
+    rows = await client.list_chunks(5, page_size=2)
+    assert [r.id for r in rows] == [1, 2, 3]
+
+
+# ---------------------------------------------------------------------------
+# NewChatClient
+# ---------------------------------------------------------------------------
+
+
+@pytest.mark.asyncio
+@respx.mock(base_url=_BASE)
+async def test_create_thread_returns_id(respx_mock, http):
+    respx_mock.post("/api/v1/threads").mock(
+        return_value=httpx.Response(
+            200,
+            json={
+                "id": 555,
+                "title": "eval",
+                "archived": False,
+                "visibility": "PRIVATE",
+                "search_space_id": 1,
+                "messages": [],
+                "created_at": "2026-05-11T00:00:00Z",
+                "updated_at": "2026-05-11T00:00:00Z",
+            },
+        )
+    )
+    client = NewChatClient(http, _BASE)
+    tid = await client.create_thread(search_space_id=1)
+    assert tid == 555
+
+
+def _sse_body(events: list[dict]) -> bytes:
+    parts = []
+    for ev in events:
+        parts.append(f"data: {json.dumps(ev)}\n\n")
+    parts.append("data: [DONE]\n\n")
+    return "".join(parts).encode("utf-8")
+
+
+@pytest.mark.asyncio
+@respx.mock(base_url=_BASE)
+async def test_ask_accumulates_text_deltas(respx_mock, http):
+    body = _sse_body([
+        {"type": "start", "messageId": "m1"},
+        {"type": "text-start", "id": "t1"},
+        {"type": "text-delta", "id": "t1", "delta": "Answer "},
+        {"type": "text-delta", "id": "t1", "delta": "is "},
+        {"type": "text-delta", "id": "t1", "delta": "B [citation:42]."},
+        {"type": "text-end", "id": "t1"},
+        {"type": "finish"},
+    ])
+    respx_mock.post("/api/v1/new_chat").mock(
+        return_value=httpx.Response(
+            200,
+            content=body,
+            headers={"Content-Type": "text/event-stream"},
+        )
+    )
+    client = NewChatClient(http, _BASE)
+    answer = await client.ask(
+        thread_id=1, search_space_id=2, user_query="What is the answer?"
+    )
+    assert answer.text == "Answer is B [citation:42]."
+    assert answer.finished_normally is True
+    assert any(c["chunk_id"] == 42 for c in answer.citations)
+
+
+@pytest.mark.asyncio
+@respx.mock(base_url=_BASE)
+async def test_ask_409_thread_busy_retries(respx_mock, http):
+    body = _sse_body([
+        {"type": "text-delta", "id": "t1", "delta": "ok"},
+        {"type": "finish"},
+    ])
+    busy = httpx.Response(
+        409,
+        json={"detail": {"errorCode": "THREAD_BUSY", "message": "busy"}},
+        headers={"Retry-After": "1"},
+    )
+    success = httpx.Response(
+        200, content=body, headers={"Content-Type": "text/event-stream"}
+    )
+    respx_mock.post("/api/v1/new_chat").mock(side_effect=[busy, success])
+    client = NewChatClient(http, _BASE)
+    answer = await client.ask(
+        thread_id=1, search_space_id=2, user_query="hi", max_busy_retries=2
+    )
+    assert answer.text == "ok"
+
+
+@pytest.mark.asyncio
+@respx.mock(base_url=_BASE)
+async def test_ask_409_exhausts_retries(respx_mock, http):
+    busy = httpx.Response(
+        409,
+        json={"detail": {"errorCode": "TURN_CANCELLING", "message": "wait"}},
+        headers={"Retry-After": "1"},
+    )
+    respx_mock.post("/api/v1/new_chat").mock(return_value=busy)
+    client = NewChatClient(http, _BASE)
+    with pytest.raises(ThreadBusyError):
+        await client.ask(
+            thread_id=1, search_space_id=2, user_query="hi", max_busy_retries=1
+        )
diff --git a/surfsense_evals/tests/core/test_config.py b/surfsense_evals/tests/core/test_config.py
new file mode 100644
index 000000000..f7b8f7249
--- /dev/null
+++ b/surfsense_evals/tests/core/test_config.py
@@ -0,0 +1,160 @@
+"""Tests for env loading + state.json read/write."""
+
+from __future__ import annotations
+
+import json
+
+from surfsense_evals.core.config import (
+    DEFAULT_SCENARIO,
+    SCENARIOS,
+    SuiteState,
+    clear_suite_state,
+    get_suite_state,
+    load_config,
+    set_suite_state,
+)
+
+
+def test_load_config_defaults_to_localhost(tmp_env):  # noqa: ARG001
+    config = load_config()
+    assert config.surfsense_api_base == "http://localhost:8000"
+    assert config.has_jwt_mode() is False
+    assert config.has_local_mode() is False
+    assert config.credential_mode() == "none"
+
+
+def test_load_config_picks_up_jwt_env(tmp_env, monkeypatch):  # noqa: ARG001
+    monkeypatch.setenv("SURFSENSE_JWT", "tok")
+    config = load_config()
+    assert config.credential_mode() == "jwt"
+
+
+def test_load_config_picks_up_local_env(tmp_env, monkeypatch):  # noqa: ARG001
+    monkeypatch.setenv("SURFSENSE_USER_EMAIL", "u@x.com")
+    monkeypatch.setenv("SURFSENSE_USER_PASSWORD", "pw")
+    config = load_config()
+    assert config.credential_mode() == "local"
+
+
+def test_state_roundtrip_per_suite(tmp_env):  # noqa: ARG001
+    config = load_config()
+    assert get_suite_state(config, "medical") is None
+    state = SuiteState(
+        search_space_id=1,
+        agent_llm_id=-10042,
+        provider_model="anthropic/claude-sonnet-4.5",
+        created_at="2026-05-11T20-30-00Z",
+    )
+    set_suite_state(config, "medical", state)
+    legal = SuiteState(
+        search_space_id=2,
+        agent_llm_id=-1,
+        provider_model="openai/gpt-5",
+        created_at="2026-05-11T21-00-00Z",
+    )
+    set_suite_state(config, "legal", legal)
+
+    fetched = get_suite_state(config, "medical")
+    assert fetched.search_space_id == 1
+    assert fetched.provider_model == "anthropic/claude-sonnet-4.5"
+
+    # Other suite untouched after teardown.
+    cleared = clear_suite_state(config, "medical")
+    assert cleared is True
+    assert get_suite_state(config, "medical") is None
+    assert get_suite_state(config, "legal").search_space_id == 2
+
+    raw = json.loads(config.state_path.read_text(encoding="utf-8"))
+    assert "medical" not in raw["suites"]
+    assert "legal" in raw["suites"]
+
+
+def test_paths_are_per_suite(tmp_env):  # noqa: ARG001
+    config = load_config()
+    a = config.suite_data_dir("medical")
+    b = config.suite_data_dir("legal")
+    assert a != b
+    assert config.suite_reports_dir("medical").parent == config.reports_dir
+    assert config.suite_runs_dir("medical").name == "runs"
+    assert config.suite_maps_dir("medical").name == "maps"
+
+
+# ---------------------------------------------------------------------------
+# Scenario state — back-compat + new fields
+# ---------------------------------------------------------------------------
+
+
+def test_legacy_state_back_compat_defaults_to_head_to_head():
+    """state.json files written before scenarios shipped must still load.
+
+    Missing ``scenario`` / ``vision_*`` / ``native_arm_model`` keys all
+    default to ``head-to-head`` / ``None`` so old setups keep working
+    after upgrade — the runner's behaviour exactly mirrors the legacy
+    one (both arms answer with ``provider_model``).
+    """
+
+    legacy = {
+        "search_space_id": 7,
+        "agent_llm_id": -123,
+        "provider_model": "anthropic/claude-sonnet-4.5",
+        "created_at": "2026-05-11T20-30-00Z",
+        "ingestion_maps": {},
+    }
+    state = SuiteState.from_dict(legacy)
+    assert state.scenario == DEFAULT_SCENARIO == "head-to-head"
+    assert state.vision_llm_config_id is None
+    assert state.vision_provider_model is None
+    assert state.native_arm_model is None
+    # The native arm should still answer with the same slug as SurfSense.
+    assert state.effective_native_arm_model == state.provider_model
+
+
+def test_unknown_scenario_falls_back_to_default():
+    """Garbage scenario in state.json → default, not crash.
+
+    Defensive: we'd rather a stale state file render with the safe
+    head-to-head behaviour than break the whole run with a KeyError.
+    """
+
+    payload = {
+        "search_space_id": 1,
+        "agent_llm_id": -1,
+        "provider_model": "openai/gpt-5",
+        "scenario": "unknown-scenario-name",
+    }
+    state = SuiteState.from_dict(payload)
+    assert state.scenario == DEFAULT_SCENARIO
+
+
+def test_cost_arbitrage_state_persists_native_arm_model(tmp_env):  # noqa: ARG001
+    config = load_config()
+    state = SuiteState(
+        search_space_id=42,
+        agent_llm_id=-1,
+        provider_model="openai/gpt-5.4-mini",
+        created_at="2026-05-11T20-30-00Z",
+        scenario="cost-arbitrage",
+        vision_llm_config_id=-101,
+        vision_provider_model="anthropic/claude-sonnet-4.5",
+        native_arm_model="anthropic/claude-sonnet-4.5",
+    )
+    set_suite_state(config, "medical", state)
+
+    fetched = get_suite_state(config, "medical")
+    assert fetched.scenario == "cost-arbitrage"
+    assert fetched.vision_llm_config_id == -101
+    assert fetched.vision_provider_model == "anthropic/claude-sonnet-4.5"
+    assert fetched.native_arm_model == "anthropic/claude-sonnet-4.5"
+    # Cost arbitrage's whole point: native arm slug != surfsense slug.
+    assert fetched.effective_native_arm_model != fetched.provider_model
+    assert fetched.effective_native_arm_model == "anthropic/claude-sonnet-4.5"
+
+    raw = json.loads(config.state_path.read_text(encoding="utf-8"))
+    assert raw["suites"]["medical"]["scenario"] == "cost-arbitrage"
+
+
+def test_scenario_constants_are_stable():
+    """Pin the public scenario list; runners + tests key off these strings."""
+
+    assert SCENARIOS == ("head-to-head", "symmetric-cheap", "cost-arbitrage")
+    assert DEFAULT_SCENARIO == "head-to-head"
diff --git a/surfsense_evals/tests/core/test_ingest_settings.py b/surfsense_evals/tests/core/test_ingest_settings.py
new file mode 100644
index 000000000..acfac57a6
--- /dev/null
+++ b/surfsense_evals/tests/core/test_ingest_settings.py
@@ -0,0 +1,269 @@
+"""Unit tests for ``surfsense_evals.core.ingest_settings``.
+
+Covers:
+
+* ``IngestSettings.merge`` honours operator overrides and falls back
+  to per-benchmark defaults when the operator is silent.
+* ``add_ingest_settings_args`` exposes the three flag pairs and
+  argparse defaults of ``None`` correctly distinguish "not passed"
+  from "explicitly false".
+* ``settings_header_line`` / ``read_settings_header`` round-trip
+  through a JSONL file.
+* ``read_settings_header`` is fault-tolerant: missing files, missing
+  header, malformed JSON.
+* ``format_ingest_settings_md`` produces a stable Markdown bullet.
+"""
+
+from __future__ import annotations
+
+import argparse
+import json
+from pathlib import Path
+
+import pytest
+
+from surfsense_evals.core.ingest_settings import (
+    PROCESSING_MODES,
+    SETTINGS_HEADER_KEY,
+    IngestSettings,
+    add_ingest_settings_args,
+    format_ingest_settings_md,
+    is_settings_header,
+    read_settings_header,
+    settings_header_line,
+)
+
+# ---------------------------------------------------------------------------
+# IngestSettings.merge
+# ---------------------------------------------------------------------------
+
+
+class TestMerge:
+    def test_silent_operator_uses_defaults(self) -> None:
+        defaults = IngestSettings(use_vision_llm=True, processing_mode="basic", should_summarize=True)
+        merged = IngestSettings.merge(defaults, {})
+        assert merged == defaults
+
+    def test_explicit_false_overrides_default_true(self) -> None:
+        defaults = IngestSettings(use_vision_llm=True)
+        merged = IngestSettings.merge(
+            defaults, {"use_vision_llm": False}
+        )
+        assert merged.use_vision_llm is False
+
+    def test_explicit_true_overrides_default_false(self) -> None:
+        defaults = IngestSettings(use_vision_llm=False)
+        merged = IngestSettings.merge(
+            defaults, {"use_vision_llm": True}
+        )
+        assert merged.use_vision_llm is True
+
+    def test_none_means_silent(self) -> None:
+        # Argparse with BooleanOptionalAction yields None when the
+        # operator passed neither --use-vision-llm nor --no-vision-llm.
+        defaults = IngestSettings(use_vision_llm=True)
+        merged = IngestSettings.merge(
+            defaults, {"use_vision_llm": None}
+        )
+        assert merged.use_vision_llm is True
+
+    def test_processing_mode_override(self) -> None:
+        defaults = IngestSettings(processing_mode="basic")
+        merged = IngestSettings.merge(
+            defaults, {"processing_mode": "premium"}
+        )
+        assert merged.processing_mode == "premium"
+
+    def test_processing_mode_invalid_raises(self) -> None:
+        defaults = IngestSettings(processing_mode="basic")
+        with pytest.raises(ValueError, match="Invalid processing_mode"):
+            IngestSettings.merge(defaults, {"processing_mode": "exotic"})
+
+    def test_processing_mode_blank_falls_back(self) -> None:
+        defaults = IngestSettings(processing_mode="basic")
+        merged = IngestSettings.merge(defaults, {"processing_mode": ""})
+        assert merged.processing_mode == "basic"
+
+    def test_string_truthy_coerced(self) -> None:
+        defaults = IngestSettings(use_vision_llm=False)
+        merged = IngestSettings.merge(defaults, {"use_vision_llm": "yes"})
+        assert merged.use_vision_llm is True
+
+    def test_string_falsy_coerced(self) -> None:
+        defaults = IngestSettings(use_vision_llm=True)
+        merged = IngestSettings.merge(defaults, {"use_vision_llm": "false"})
+        assert merged.use_vision_llm is False
+
+    def test_other_keys_ignored(self) -> None:
+        # Benchmarks pass the whole opts dict; merge must tolerate
+        # unrelated keys without crashing.
+        defaults = IngestSettings(use_vision_llm=True, processing_mode="basic")
+        merged = IngestSettings.merge(
+            defaults,
+            {
+                "use_vision_llm": False,
+                "concurrency": 4,
+                "task_filter": "all",
+                "no_mentions": True,
+            },
+        )
+        assert merged.use_vision_llm is False
+        assert merged.processing_mode == "basic"
+
+    def test_to_dict_round_trips(self) -> None:
+        s = IngestSettings(use_vision_llm=True, processing_mode="premium", should_summarize=False)
+        d = s.to_dict()
+        assert d == {
+            "use_vision_llm": True,
+            "processing_mode": "premium",
+            "should_summarize": False,
+        }
+
+    def test_render_label_format(self) -> None:
+        s = IngestSettings(use_vision_llm=True, processing_mode="premium", should_summarize=True)
+        assert s.render_label() == "vision=on, mode=premium, summarize=on"
+
+
+# ---------------------------------------------------------------------------
+# add_ingest_settings_args
+# ---------------------------------------------------------------------------
+
+
+class TestAddArgs:
+    @pytest.fixture
+    def parser(self) -> argparse.ArgumentParser:
+        p = argparse.ArgumentParser()
+        add_ingest_settings_args(
+            p,
+            defaults=IngestSettings(
+                use_vision_llm=False, processing_mode="basic", should_summarize=False
+            ),
+        )
+        return p
+
+    def test_silent_invocation_yields_none(self, parser: argparse.ArgumentParser) -> None:
+        args = parser.parse_args([])
+        assert args.use_vision_llm is None
+        assert args.processing_mode is None
+        assert args.should_summarize is None
+
+    def test_use_vision_llm_flag(self, parser: argparse.ArgumentParser) -> None:
+        args = parser.parse_args(["--use-vision-llm"])
+        assert args.use_vision_llm is True
+
+    def test_no_vision_llm_flag(self, parser: argparse.ArgumentParser) -> None:
+        args = parser.parse_args(["--no-vision-llm"])
+        assert args.use_vision_llm is False
+
+    def test_processing_mode_choices(self, parser: argparse.ArgumentParser) -> None:
+        for mode in PROCESSING_MODES:
+            args = parser.parse_args(["--processing-mode", mode])
+            assert args.processing_mode == mode
+
+    def test_processing_mode_rejects_unknown(
+        self, parser: argparse.ArgumentParser
+    ) -> None:
+        with pytest.raises(SystemExit):
+            parser.parse_args(["--processing-mode", "exotic"])
+
+    def test_summarize_flag_pair(self, parser: argparse.ArgumentParser) -> None:
+        on = parser.parse_args(["--should-summarize"])
+        assert on.should_summarize is True
+        off = parser.parse_args(["--no-summarize"])
+        assert off.should_summarize is False
+
+    def test_vision_flags_mutually_exclusive(
+        self, parser: argparse.ArgumentParser
+    ) -> None:
+        with pytest.raises(SystemExit):
+            parser.parse_args(["--use-vision-llm", "--no-vision-llm"])
+
+    def test_full_pipeline(self, parser: argparse.ArgumentParser) -> None:
+        # Operator passes flags + defaults are reasonable. Merge
+        # should yield exactly what they asked for.
+        args = parser.parse_args(
+            ["--use-vision-llm", "--processing-mode", "premium"]
+        )
+        defaults = IngestSettings(
+            use_vision_llm=False, processing_mode="basic", should_summarize=False
+        )
+        merged = IngestSettings.merge(defaults, vars(args))
+        assert merged == IngestSettings(
+            use_vision_llm=True, processing_mode="premium", should_summarize=False
+        )
+
+
+# ---------------------------------------------------------------------------
+# Header round-trip + read_settings_header fault tolerance
+# ---------------------------------------------------------------------------
+
+
+class TestHeader:
+    def test_header_line_round_trip(self, tmp_path: Path) -> None:
+        s = IngestSettings(use_vision_llm=True, processing_mode="premium")
+        path = tmp_path / "map.jsonl"
+        with path.open("w", encoding="utf-8") as fh:
+            fh.write(settings_header_line(s) + "\n")
+            fh.write(json.dumps({"case_id": "x", "document_id": 1}) + "\n")
+        loaded = read_settings_header(path)
+        assert loaded == s.to_dict()
+
+    def test_is_settings_header_recognises(self) -> None:
+        assert is_settings_header({SETTINGS_HEADER_KEY: {}})
+        assert not is_settings_header({"case_id": "x"})
+
+    def test_missing_file_returns_empty(self, tmp_path: Path) -> None:
+        assert read_settings_header(tmp_path / "does_not_exist.jsonl") == {}
+
+    def test_empty_file_returns_empty(self, tmp_path: Path) -> None:
+        path = tmp_path / "empty.jsonl"
+        path.write_text("", encoding="utf-8")
+        assert read_settings_header(path) == {}
+
+    def test_no_header_returns_empty(self, tmp_path: Path) -> None:
+        path = tmp_path / "legacy.jsonl"
+        with path.open("w", encoding="utf-8") as fh:
+            fh.write(json.dumps({"case_id": "x", "document_id": 1}) + "\n")
+            fh.write(json.dumps({"case_id": "y", "document_id": 2}) + "\n")
+        assert read_settings_header(path) == {}
+
+    def test_malformed_json_returns_empty(self, tmp_path: Path) -> None:
+        path = tmp_path / "broken.jsonl"
+        path.write_text("not json\n", encoding="utf-8")
+        assert read_settings_header(path) == {}
+
+    def test_skips_blank_first_lines(self, tmp_path: Path) -> None:
+        s = IngestSettings(use_vision_llm=True)
+        path = tmp_path / "padded.jsonl"
+        with path.open("w", encoding="utf-8") as fh:
+            fh.write("\n\n")
+            fh.write(settings_header_line(s) + "\n")
+        assert read_settings_header(path) == s.to_dict()
+
+
+# ---------------------------------------------------------------------------
+# format_ingest_settings_md
+# ---------------------------------------------------------------------------
+
+
+class TestFormatMd:
+    def test_full_settings(self) -> None:
+        out = format_ingest_settings_md(
+            {"use_vision_llm": True, "processing_mode": "premium", "should_summarize": True}
+        )
+        assert "vision_llm=`on`" in out
+        assert "processing_mode=`premium`" in out
+        assert "summarize=`on`" in out
+
+    def test_default_off(self) -> None:
+        out = format_ingest_settings_md(
+            {"use_vision_llm": False, "processing_mode": "basic", "should_summarize": False}
+        )
+        assert "vision_llm=`off`" in out
+        assert "processing_mode=`basic`" in out
+        assert "summarize=`off`" in out
+
+    def test_missing_returns_re_ingest_hint(self) -> None:
+        # Empty dict + None + non-mapping should all degrade gracefully.
+        for raw in [None, {}, "not-a-mapping"]:
+            assert "(not recorded" in format_ingest_settings_md(raw)
diff --git a/surfsense_evals/tests/core/test_metrics.py b/surfsense_evals/tests/core/test_metrics.py
new file mode 100644
index 000000000..cde1bb957
--- /dev/null
+++ b/surfsense_evals/tests/core/test_metrics.py
@@ -0,0 +1,153 @@
+"""Metric correctness — Wilson, McNemar, retrieval scores."""
+
+from __future__ import annotations
+
+import math
+
+import pytest
+
+from surfsense_evals.core.metrics import (
+    accuracy_with_wilson_ci,
+    bootstrap_delta_ci,
+    mcnemar_test,
+    mrr,
+    ndcg_at_k,
+    recall_at_k,
+    score_run,
+    wilson_ci,
+)
+
+# ---------------------------------------------------------------------------
+# Wilson
+# ---------------------------------------------------------------------------
+
+
+@pytest.mark.parametrize(
+    "k,n,low,high",
+    [
+        (80, 100, 0.7111, 0.8666),  # cross-checked vs statsmodels.proportion_confint(method='wilson')
+        (50, 100, 0.4038, 0.5962),
+        (0, 0, 0.0, 1.0),
+        (0, 10, 0.0, 0.2775),
+        (10, 10, 0.7225, 1.0),
+    ],
+)
+def test_wilson_ci_known_values(k, n, low, high):
+    result_low, result_high = wilson_ci(k, n)
+    assert math.isclose(result_low, low, abs_tol=5e-4), (k, n, result_low, low)
+    assert math.isclose(result_high, high, abs_tol=5e-4), (k, n, result_high, high)
+
+
+def test_accuracy_with_wilson_ci_object():
+    res = accuracy_with_wilson_ci(70, 100)
+    assert res.accuracy == 0.7
+    assert 0.0 < res.ci_low < res.ci_high < 1.0
+
+
+def test_invalid_inputs_raise():
+    with pytest.raises(ValueError):
+        accuracy_with_wilson_ci(-1, 10)
+    with pytest.raises(ValueError):
+        accuracy_with_wilson_ci(11, 10)
+
+
+# ---------------------------------------------------------------------------
+# McNemar
+# ---------------------------------------------------------------------------
+
+
+def test_mcnemar_degenerate_returns_p_value_one():
+    a = [True, True, False, False]
+    b = [True, True, False, False]
+    res = mcnemar_test(a, b)
+    assert res.b == 0 and res.c == 0
+    assert res.p_value == 1.0
+    assert res.method == "degenerate"
+
+
+def test_mcnemar_exact_branch_strong_signal():
+    """B = 0, C = 10 → exact two-sided binomial p == 2 * (1/2)**10."""
+
+    a = [True] * 10 + [False] * 10
+    b = [True] * 10 + [True] * 10  # surfsense beats native on the 10 native-wrong
+    res = mcnemar_test(a, b)
+    assert res.b == 0
+    assert res.c == 10
+    assert res.method == "exact"
+    expected = 2 * (0.5 ** 10)
+    assert math.isclose(res.p_value, expected, rel_tol=1e-9)
+
+
+def test_mcnemar_chi_square_approx_for_large_discordant():
+    # Construct b=15, c=5 with continuity-corrected chi^2 = (|10|-1)^2/20 = 4.05.
+    a = [True] * 15 + [False] * 5 + [True] * 30 + [False] * 30
+    b = [False] * 15 + [True] * 5 + [True] * 30 + [False] * 30
+    res = mcnemar_test(a, b)
+    assert res.method == "chi2_cc"
+    assert res.b == 15 and res.c == 5
+    assert math.isclose(res.statistic, ((abs(15 - 5) - 1) ** 2) / 20.0, rel_tol=1e-9)
+    # p ≈ chi2.sf(4.05, df=1) ≈ 0.04417
+    assert 0.04 < res.p_value < 0.05
+
+
+def test_mcnemar_length_mismatch():
+    with pytest.raises(ValueError):
+        mcnemar_test([True], [True, False])
+
+
+# ---------------------------------------------------------------------------
+# Bootstrap
+# ---------------------------------------------------------------------------
+
+
+def test_bootstrap_delta_ci_shape_and_determinism():
+    a = [True, True, False, True, False, False, True, True]
+    b = [True, True, True, True, True, False, True, False]
+    res1 = bootstrap_delta_ci(a, b, n_resamples=500, random_state=42)
+    res2 = bootstrap_delta_ci(a, b, n_resamples=500, random_state=42)
+    assert res1.delta == res2.delta
+    assert res1.ci_low == res2.ci_low
+    assert res1.ci_high == res2.ci_high
+    assert res1.ci_low <= res1.delta <= res1.ci_high
+    assert res1.n_resamples == 500
+
+
+# ---------------------------------------------------------------------------
+# Retrieval
+# ---------------------------------------------------------------------------
+
+
+def test_recall_at_k():
+    retrieved = ["a", "b", "c", "d"]
+    relevant = ["b", "d", "z"]
+    assert recall_at_k(retrieved, relevant, k=2) == pytest.approx(1 / 3)
+    assert recall_at_k(retrieved, relevant, k=4) == pytest.approx(2 / 3)
+
+
+def test_mrr():
+    assert mrr(["a", "b", "c"], ["c"]) == pytest.approx(1 / 3)
+    assert mrr(["x", "y"], ["z"]) == 0.0
+
+
+def test_ndcg_at_k_perfect_order():
+    qrels = {"a": 2, "b": 1}
+    assert ndcg_at_k(["a", "b"], qrels, k=2) == pytest.approx(1.0)
+
+
+def test_ndcg_at_k_irrelevant_first():
+    qrels = {"a": 2, "b": 1}
+    # Wrong order should still be > 0 but < 1
+    val = ndcg_at_k(["c", "a", "b"], qrels, k=3)
+    assert 0 < val < 1
+
+
+def test_score_run_aggregates_across_queries():
+    scores = score_run(
+        per_query_retrieved={"q1": ["a", "b"], "q2": ["x", "y", "z"]},
+        per_query_qrels={"q1": {"a": 1}, "q2": {"z": 2}},
+        ks=(1, 5),
+        ndcg_k=5,
+    )
+    assert scores.n_queries == 2
+    assert scores.recall_at_k[1] == pytest.approx((1 + 0) / 2)  # q1 hits @1, q2 doesn't
+    assert scores.mrr == pytest.approx((1.0 + 1 / 3) / 2)
diff --git a/surfsense_evals/tests/core/test_parse_answer_letter.py b/surfsense_evals/tests/core/test_parse_answer_letter.py
new file mode 100644
index 000000000..5adbf4bc3
--- /dev/null
+++ b/surfsense_evals/tests/core/test_parse_answer_letter.py
@@ -0,0 +1,27 @@
+"""Tests for the MCQ answer-letter extractor."""
+
+from __future__ import annotations
+
+import pytest
+
+from surfsense_evals.core.parse import extract_answer_letter
+from surfsense_evals.core.parse.answer_letter import AnswerLetterResult
+
+
+@pytest.mark.parametrize(
+    "text,expected_letter,expected_strategy",
+    [
+        ('```json\n{"step_by_step_thinking": "...", "answer_choice": "B"}\n```', "B", "json_envelope"),
+        ('Reasoning... {"step_by_step_thinking": "x", "answer_choice": "C"}', "C", "json_envelope"),
+        ("Long reasoning.\nAnswer: D", "D", "answer_line"),
+        ("The correct answer is (A).", "A", "answer_line"),
+        ("Final answer: e", "E", "answer_line"),
+        ("Long reasoning.\n\nB", "B", "bare_letter"),
+        ("Long reasoning.\n\n(C).", "C", "bare_letter"),
+        ("", None, "none"),
+        ("Just narrative without an answer.", None, "none"),
+    ],
+)
+def test_extract_answer_letter(text, expected_letter, expected_strategy):
+    result = extract_answer_letter(text)
+    assert result == AnswerLetterResult(expected_letter, expected_strategy)
diff --git a/surfsense_evals/tests/core/test_parse_citations.py b/surfsense_evals/tests/core/test_parse_citations.py
new file mode 100644
index 000000000..eb444dab2
--- /dev/null
+++ b/surfsense_evals/tests/core/test_parse_citations.py
@@ -0,0 +1,108 @@
+"""Parity tests for the citation regex.
+
+Each row mirrors a case from the canonical TS reference at
+``surfsense_web/lib/citations/citation-parser.ts``. If a future PR
+loosens or tightens the TS regex, these tests will start failing;
+that's the explicit signal to re-port the change.
+"""
+
+from __future__ import annotations
+
+import pytest
+
+from surfsense_evals.core.parse import (
+    CITATION_REGEX,
+    ChunkCitation,
+    UrlCitation,
+    parse_citations,
+)
+
+PARITY_TABLE = [
+    # (input, expected number of matches, expected first-token kind/value)
+    ("Plain text with no citation.", 0, None),
+    (
+        "The patient has fever [citation:42] and cough.",
+        1,
+        ChunkCitation(chunk_id=42, is_docs_chunk=False),
+    ),
+    (
+        "Negative chunk ids work [citation:-7].",
+        1,
+        ChunkCitation(chunk_id=-7, is_docs_chunk=False),
+    ),
+    (
+        "doc-prefix [citation:doc-12].",
+        1,
+        ChunkCitation(chunk_id=12, is_docs_chunk=True),
+    ),
+    (
+        "Multi id [citation:1, doc-2, -3].",
+        3,
+        ChunkCitation(chunk_id=1, is_docs_chunk=False),
+    ),
+    (
+        "URL form [citation:https://x.com/a].",
+        1,
+        UrlCitation(url="https://x.com/a"),
+    ),
+    (
+        "Chinese brackets【citation:5】.",
+        1,
+        ChunkCitation(chunk_id=5, is_docs_chunk=False),
+    ),
+    (
+        "ZWSP-decorated [\u200bcitation:9\u200b].",
+        1,
+        ChunkCitation(chunk_id=9, is_docs_chunk=False),
+    ),
+    (
+        "Whitespace [citation:  doc-100 ] tolerated.",
+        1,
+        ChunkCitation(chunk_id=100, is_docs_chunk=True),
+    ),
+    (
+        # The TS regex's URL char class excludes ']', so a trailing
+        # bracket isn't swallowed.
+        "Two URLs [citation:https://a.io] and [citation:https://b.io].",
+        2,
+        UrlCitation(url="https://a.io"),
+    ),
+    (
+        # Garbled form should match nothing.
+        "Citation-like but wrong [citation:].",
+        0,
+        None,
+    ),
+]
+
+
+@pytest.mark.parametrize("text,n_expected,first", PARITY_TABLE)
+def test_citation_regex_parity(text: str, n_expected: int, first):
+    tokens = parse_citations(text)
+    assert len(tokens) == n_expected, (text, tokens)
+    if first is not None:
+        assert tokens[0] == first, (text, tokens)
+
+
+def test_regex_pattern_matches_ts_source():
+    """Sanity: the compiled pattern carries the exact alternatives the TS source does."""
+
+    pattern = CITATION_REGEX.pattern
+    assert "https?://" in pattern
+    assert "urlcite" in pattern
+    assert "doc-" in pattern
+    assert "\u200B" in pattern
+    assert "【" in pattern and "】" in pattern
+
+
+def test_url_map_resolution():
+    text = "Inline placeholder [citation:urlcite0]."
+    tokens = parse_citations(text, url_map={"urlcite0": "https://resolved.example/x"})
+    assert tokens == [UrlCitation(url="https://resolved.example/x")]
+
+
+def test_url_map_missing_key_drops_token():
+    """Missing urlcite resolution returns no token (TS behaviour)."""
+
+    text = "[citation:urlcite99]"
+    assert parse_citations(text, url_map={}) == []
diff --git a/surfsense_evals/tests/core/test_parse_freeform_answer.py b/surfsense_evals/tests/core/test_parse_freeform_answer.py
new file mode 100644
index 000000000..bdc7d74fc
--- /dev/null
+++ b/surfsense_evals/tests/core/test_parse_freeform_answer.py
@@ -0,0 +1,73 @@
+"""Tests for ``surfsense_evals.core.parse.freeform_answer``."""
+
+from __future__ import annotations
+
+import pytest
+
+from surfsense_evals.core.parse.freeform_answer import extract_freeform_answer
+
+
+class TestExtractFreeformAnswer:
+    def test_empty_string_returns_empty(self) -> None:
+        assert extract_freeform_answer("") == ""
+        assert extract_freeform_answer("   \n\n  ") == ""
+
+    def test_simple_answer_marker(self) -> None:
+        assert extract_freeform_answer("Answer: 42") == "42"
+
+    def test_final_answer_marker(self) -> None:
+        assert extract_freeform_answer("Final answer: Paris") == "Paris"
+
+    def test_the_answer_is_marker(self) -> None:
+        assert extract_freeform_answer("The answer is: not answerable") == "not answerable"
+
+    def test_multiline_picks_last_answer_marker(self) -> None:
+        text = "Let me think...\nAnswer: 5\nAnswer: 7\n"
+        assert extract_freeform_answer(text) == "7"
+
+    def test_falls_back_to_last_nonempty_line(self) -> None:
+        text = "Some thinking here.\n\n42"
+        assert extract_freeform_answer(text) == "42"
+
+    def test_strips_quotes(self) -> None:
+        assert extract_freeform_answer('Answer: "Paris"') == "Paris"
+        assert extract_freeform_answer("Answer: 'Paris'") == "Paris"
+
+    def test_strips_backticks(self) -> None:
+        assert extract_freeform_answer("Answer: `42`") == "42"
+
+    def test_uses_fenced_block_when_no_marker(self) -> None:
+        text = "Here's my response:\n```\nfinal value\n```\n"
+        assert extract_freeform_answer(text) == "final value"
+
+    def test_case_insensitive_markers(self) -> None:
+        assert extract_freeform_answer("ANSWER: yes") == "yes"
+        assert extract_freeform_answer("answer: no") == "no"
+
+    @pytest.mark.parametrize("text,expected", [
+        ("Answer: 1, 2, 3", "1, 2, 3"),
+        ("Answer: 3.14", "3.14"),
+        ("Answer:    spaced   ", "spaced"),
+    ])
+    def test_various_payloads(self, text: str, expected: str) -> None:
+        assert extract_freeform_answer(text) == expected
+
+    def test_inline_answer_after_thinking_trace(self) -> None:
+        # Agent replies sometimes glue their thinking onto the same
+        # line as the final "Answer: ..." marker (no newline before it).
+        # The line-anchored regex misses this; the inline fallback
+        # should still extract the right value.
+        text = (
+            "Need the Charlotte Bronte book title/year and the rank "
+            "for a 128-foot NYC building.Answer: 128th"
+        )
+        assert extract_freeform_answer(text) == "128th"
+
+    def test_inline_picks_last_inline_answer(self) -> None:
+        text = "Thought: maybe Answer: 5 is right? Actually Answer: 7."
+        assert extract_freeform_answer(text) == "7."
+
+    def test_inline_does_not_override_proper_marker(self) -> None:
+        # When a clean line-anchored "Answer: ..." exists, that wins.
+        text = "Some preamble.Answer: 99\nAnswer: 42"
+        assert extract_freeform_answer(text) == "42"
diff --git a/surfsense_evals/tests/core/test_parse_sse.py b/surfsense_evals/tests/core/test_parse_sse.py
new file mode 100644
index 000000000..362717288
--- /dev/null
+++ b/surfsense_evals/tests/core/test_parse_sse.py
@@ -0,0 +1,84 @@
+"""Tests for the SSE consumer."""
+
+from __future__ import annotations
+
+import pytest
+
+from surfsense_evals.core.parse import iter_sse_events
+
+
+async def _alist(it):
+    out = []
+    async for x in it:
+        out.append(x)
+    return out
+
+
+async def _astream(lines):
+    for line in lines:
+        yield line
+
+
+@pytest.mark.asyncio
+async def test_basic_data_frame():
+    events = await _alist(
+        iter_sse_events(_astream([
+            'data: {"type": "text-delta", "delta": "hi"}',
+            "",
+            'data: {"type": "finish"}',
+            "",
+        ]))
+    )
+    assert [e.data for e in events] == [
+        '{"type": "text-delta", "delta": "hi"}',
+        '{"type": "finish"}',
+    ]
+
+
+@pytest.mark.asyncio
+async def test_done_sentinel_passes_through():
+    events = await _alist(
+        iter_sse_events(_astream([
+            "data: [DONE]",
+            "",
+        ]))
+    )
+    assert [e.data for e in events] == ["[DONE]"]
+
+
+@pytest.mark.asyncio
+async def test_multiline_data_joins_with_newline():
+    events = await _alist(
+        iter_sse_events(_astream([
+            "data: line1",
+            "data: line2",
+            "",
+        ]))
+    )
+    assert events[0].data == "line1\nline2"
+
+
+@pytest.mark.asyncio
+async def test_comments_and_other_fields_ignored():
+    events = await _alist(
+        iter_sse_events(_astream([
+            ": heartbeat",
+            "event: foo",
+            "id: 123",
+            "data: payload",
+            "",
+        ]))
+    )
+    assert [e.data for e in events] == ["payload"]
+
+
+@pytest.mark.asyncio
+async def test_handles_missing_trailing_blank():
+    """Some servers omit the final blank line; the consumer should still emit."""
+
+    events = await _alist(
+        iter_sse_events(_astream([
+            "data: only-one",
+        ]))
+    )
+    assert [e.data for e in events] == ["only-one"]
diff --git a/surfsense_evals/tests/core/test_pdf_render.py b/surfsense_evals/tests/core/test_pdf_render.py
new file mode 100644
index 000000000..facdabbe8
--- /dev/null
+++ b/surfsense_evals/tests/core/test_pdf_render.py
@@ -0,0 +1,51 @@
+"""Smoke tests for PDF rendering.
+
+We don't pull a full PDF parser into the test deps; the assertions
+are bytes-level (``%PDF`` magic, deterministic CreationDate scrub).
+"""
+
+from __future__ import annotations
+
+from pathlib import Path
+
+from surfsense_evals.core.pdf import render_pdf, render_text_files_to_pdf
+
+
+def test_render_pdf_writes_pdf_with_magic(tmp_path: Path):
+    out = tmp_path / "out.pdf"
+    rendered = render_pdf(
+        title="Test",
+        sections=[("intro", "Hello world."), ("body", "Line one.\nLine two.")],
+        output_path=out,
+    )
+    assert rendered.path == out
+    assert out.exists()
+    assert out.read_bytes().startswith(b"%PDF-")
+
+
+def test_render_pdf_deterministic_dates(tmp_path: Path):
+    out_a = tmp_path / "a.pdf"
+    out_b = tmp_path / "b.pdf"
+    sections = [("only", "deterministic body content")]
+    render_pdf(title="Det", sections=sections, output_path=out_a)
+    render_pdf(title="Det", sections=sections, output_path=out_b)
+    # CreationDate / ModDate are scrubbed to a fixed value, so the two
+    # files should compare equal (modulo any other internal randomness
+    # — reportlab's basic outputs are deterministic given fixed inputs).
+    assert out_a.read_bytes() == out_b.read_bytes()
+
+
+def test_render_text_files_uses_filename_as_section(tmp_path: Path):
+    files_dir = tmp_path / "src"
+    files_dir.mkdir()
+    (files_dir / "admission_note.txt").write_text("history of present illness", encoding="utf-8")
+    (files_dir / "labs.txt").write_text("Na 138, K 4.0", encoding="utf-8")
+    out = tmp_path / "case.pdf"
+    rendered = render_text_files_to_pdf(
+        title="Case 1",
+        files=[files_dir / "admission_note.txt", files_dir / "labs.txt"],
+        output_path=out,
+    )
+    assert out.exists()
+    # We don't decode the PDF; the n_chars estimate should reflect both inputs.
+    assert rendered.n_chars >= len("history of present illness") + len("Na 138, K 4.0")
diff --git a/surfsense_evals/tests/core/test_pdf_render_with_images.py b/surfsense_evals/tests/core/test_pdf_render_with_images.py
new file mode 100644
index 000000000..c29503bc9
--- /dev/null
+++ b/surfsense_evals/tests/core/test_pdf_render_with_images.py
@@ -0,0 +1,73 @@
+"""Tests for ``render_pdf_with_images`` — covers image embedding +
+deterministic byte output, mirroring ``test_pdf_render.py`` for the
+text-only path.
+"""
+
+from __future__ import annotations
+
+from pathlib import Path
+
+import pytest
+
+from surfsense_evals.core.pdf import PdfImage, render_pdf_with_images
+
+
+@pytest.fixture
+def tiny_png(tmp_path: Path) -> Path:
+    """Generate a real 4x4 PNG via Pillow — embeds cleanly in reportlab.
+
+    Hand-crafted PNG headers tend to fail PIL's strict decoder, so we
+    delegate to Pillow which is already a transitive dep of reportlab.
+    """
+
+    from PIL import Image as PILImage
+
+    p = tmp_path / "pixel.png"
+    PILImage.new("RGB", (4, 4), color=(128, 128, 128)).save(p, format="PNG")
+    return p
+
+
+class TestRenderPdfWithImages:
+    def test_renders_pdf_with_no_images(self, tmp_path: Path) -> None:
+        out = tmp_path / "out.pdf"
+        rendered = render_pdf_with_images(
+            title="Test",
+            sections=[("Heading", "Body text here.", None)],
+            output_path=out,
+        )
+        assert rendered.path == out
+        assert out.exists()
+        assert out.read_bytes().startswith(b"%PDF-")
+
+    def test_renders_pdf_with_one_image(self, tmp_path: Path, tiny_png: Path) -> None:
+        out = tmp_path / "out.pdf"
+        render_pdf_with_images(
+            title="Test",
+            sections=[("Case", "Body text.", [PdfImage(path=tiny_png, caption="A pixel")])],
+            output_path=out,
+        )
+        assert out.exists()
+        assert out.stat().st_size > 200  # not empty
+
+    def test_deterministic_bytes(self, tmp_path: Path, tiny_png: Path) -> None:
+        out_a = tmp_path / "a.pdf"
+        out_b = tmp_path / "b.pdf"
+        sections = [
+            ("Case", "Some text.", [PdfImage(path=tiny_png, caption="cap")]),
+            ("Options", "A) one\nB) two", None),
+        ]
+        render_pdf_with_images(title="Test", sections=sections, output_path=out_a)
+        render_pdf_with_images(title="Test", sections=sections, output_path=out_b)
+        assert out_a.read_bytes() == out_b.read_bytes()
+
+    def test_skips_invalid_image_silently(self, tmp_path: Path) -> None:
+        """A bad image path should not abort the whole PDF render."""
+
+        out = tmp_path / "out.pdf"
+        render_pdf_with_images(
+            title="Test",
+            sections=[("Case", "Text", [PdfImage(path=tmp_path / "nope.jpg", caption="x")])],
+            output_path=out,
+        )
+        assert out.exists()
+        assert out.read_bytes().startswith(b"%PDF-")
diff --git a/surfsense_evals/tests/core/test_provider_openrouter.py b/surfsense_evals/tests/core/test_provider_openrouter.py
new file mode 100644
index 000000000..eb78aa053
--- /dev/null
+++ b/surfsense_evals/tests/core/test_provider_openrouter.py
@@ -0,0 +1,121 @@
+"""respx-mocked tests for the OpenRouter PDF provider."""
+
+from __future__ import annotations
+
+import base64
+import json
+from pathlib import Path
+
+import httpx
+import pytest
+import respx
+
+from surfsense_evals.core.providers.openrouter_pdf import (
+    OpenRouterPdfProvider,
+    PdfEngine,
+)
+
+_BASE = "https://openrouter.test"
+
+
+@pytest.fixture
+def tiny_pdf(tmp_path: Path) -> Path:
+    p = tmp_path / "case.pdf"
+    p.write_bytes(b"%PDF-1.4 minimal content")
+    return p
+
+
+@pytest.mark.asyncio
+@respx.mock(base_url=_BASE)
+async def test_payload_shape_matches_openrouter_docs(respx_mock, tiny_pdf: Path):
+    captured = {}
+
+    def _capture(request):
+        captured["body"] = json.loads(request.content)
+        captured["headers"] = dict(request.headers)
+        return httpx.Response(
+            200,
+            json={
+                "choices": [{
+                    "message": {"content": "Answer: B"},
+                    "finish_reason": "stop",
+                }],
+                "usage": {"prompt_tokens": 10, "completion_tokens": 5, "total_tokens": 15, "cost": 0.0001},
+            },
+        )
+
+    respx_mock.post("/chat/completions").mock(side_effect=_capture)
+
+    provider = OpenRouterPdfProvider(
+        api_key="sk-or-test",
+        base_url=_BASE,
+        model="anthropic/claude-sonnet-4.5",
+        engine=PdfEngine.NATIVE,
+    )
+    response = await provider.complete(prompt="What is the diagnosis?", pdf_path=tiny_pdf)
+    body = captured["body"]
+    assert body["model"] == "anthropic/claude-sonnet-4.5"
+    assert body["plugins"] == [{"id": "file-parser", "pdf": {"engine": "native"}}]
+    user = body["messages"][-1]
+    assert user["role"] == "user"
+    file_part = user["content"][0]
+    assert file_part["type"] == "file"
+    assert file_part["file"]["filename"] == tiny_pdf.name
+    assert file_part["file"]["file_data"].startswith("data:application/pdf;base64,")
+    assert (
+        base64.b64decode(file_part["file"]["file_data"].split(",", 1)[1])
+        == tiny_pdf.read_bytes()  # noqa: ASYNC240 — test fixture, sync read is fine
+    )
+    assert user["content"][1] == {"type": "text", "text": "What is the diagnosis?"}
+    assert captured["headers"]["authorization"] == "Bearer sk-or-test"
+    assert captured["headers"].get("x-title") == "SurfSense-evals"
+
+    assert response.text == "Answer: B"
+    assert response.input_tokens == 10
+    assert response.output_tokens == 5
+    assert response.total_tokens == 15
+    # cost 0.0001 USD == 100 micros
+    assert response.cost_micros == 100
+
+
+@pytest.mark.asyncio
+@respx.mock(base_url=_BASE)
+async def test_chat_array_content_concatenates(respx_mock, tiny_pdf: Path):
+    respx_mock.post("/chat/completions").mock(
+        return_value=httpx.Response(
+            200,
+            json={
+                "choices": [{
+                    "message": {
+                        "content": [
+                            {"type": "text", "text": "Hello "},
+                            {"type": "text", "text": "world"},
+                            {"type": "image_url", "image_url": "ignored"},
+                        ]
+                    }
+                }],
+                "usage": {"prompt_tokens": 1, "completion_tokens": 1},
+            },
+        )
+    )
+    provider = OpenRouterPdfProvider(
+        api_key="sk-or-test", base_url=_BASE, model="x/y"
+    )
+    response = await provider.complete(prompt="hi", pdf_path=tiny_pdf)
+    assert response.text == "Hello world"
+
+
+@pytest.mark.asyncio
+@respx.mock(base_url=_BASE)
+async def test_provider_raises_on_4xx(respx_mock, tiny_pdf: Path):
+    respx_mock.post("/chat/completions").mock(
+        return_value=httpx.Response(429, json={"error": {"message": "rate limited"}})
+    )
+    provider = OpenRouterPdfProvider(api_key="sk-or-test", base_url=_BASE, model="x/y")
+    with pytest.raises(httpx.HTTPStatusError):
+        await provider.complete(prompt="hi", pdf_path=tiny_pdf)
+
+
+def test_missing_api_key_raises():
+    with pytest.raises(ValueError):
+        OpenRouterPdfProvider(api_key="", base_url=_BASE, model="x/y")
diff --git a/surfsense_evals/tests/core/test_registry.py b/surfsense_evals/tests/core/test_registry.py
new file mode 100644
index 000000000..ffdbf2261
--- /dev/null
+++ b/surfsense_evals/tests/core/test_registry.py
@@ -0,0 +1,58 @@
+"""Registry + auto-discovery tests.
+
+* Auto-discovery skips packages starting with ``_`` (so test fixtures
+  don't leak into the production catalogue).
+* Manually importing a ``_demo`` benchmark fires its ``register(...)``
+  call and the CLI sees it.
+"""
+
+from __future__ import annotations
+
+import importlib
+
+from surfsense_evals.core import registry
+
+
+def _force_register_demo() -> None:
+    """Import (or reload) the demo module so its ``register(...)`` runs.
+
+    On a fresh interpreter, ``import_module`` triggers package
+    initialization. After the first call though, the module is cached
+    in ``sys.modules`` and a second ``import_module`` is a no-op — so
+    if a previous test already unregistered the entry, we have to
+    ``reload`` to re-execute the module body.
+    """
+
+    module = importlib.import_module("surfsense_evals.suites._demo.hello")
+    if ("_demo", "hello") not in registry.snapshot():
+        importlib.reload(module)
+
+
+def test_auto_discovery_skips_underscore_prefixed_subpackages():
+    from surfsense_evals.suites import discover_suites
+
+    discovered = discover_suites()
+    assert all(not part.startswith("_") for full in discovered for part in full.split("."))
+    # The medical suite's headline benchmark must always discover.
+    assert any(name.endswith(".medical.medxpertqa") for name in discovered)
+
+
+def test_demo_benchmark_registers_on_explicit_import():
+    _force_register_demo()
+    bench = registry.get("_demo", "hello")
+    assert bench is not None
+    assert bench.name == "hello"
+    assert bench.headline is False
+    # Cleanup so the test is idempotent under repeated runs.
+    registry.unregister("_demo", "hello")
+
+
+def test_register_unregister_roundtrip():
+    # Make sure no stale entry from a prior test in the session.
+    if ("_demo", "hello") in registry.snapshot():
+        registry.unregister("_demo", "hello")
+    snapshot_before = dict(registry.snapshot())
+    _force_register_demo()
+    assert ("_demo", "hello") in registry.snapshot()
+    registry.unregister("_demo", "hello")
+    assert dict(registry.snapshot()) == snapshot_before
diff --git a/surfsense_evals/tests/core/test_scenarios.py b/surfsense_evals/tests/core/test_scenarios.py
new file mode 100644
index 000000000..5e93c266b
--- /dev/null
+++ b/surfsense_evals/tests/core/test_scenarios.py
@@ -0,0 +1,68 @@
+"""Tests for the shared scenario formatter used in head-to-head reports."""
+
+from __future__ import annotations
+
+from surfsense_evals.core.scenarios import format_scenario_md
+
+
+def test_head_to_head_renders_both_arms_same_slug():
+    extra = {
+        "scenario": "head-to-head",
+        "provider_model": "anthropic/claude-sonnet-4.5",
+    }
+    line = format_scenario_md(extra)
+    assert "head-to-head" in line
+    assert "anthropic/claude-sonnet-4.5" in line
+
+
+def test_head_to_head_includes_vision_slug_when_recorded():
+    extra = {
+        "scenario": "head-to-head",
+        "provider_model": "anthropic/claude-sonnet-4.5",
+        "vision_provider_model": "anthropic/claude-sonnet-4.5",
+    }
+    line = format_scenario_md(extra)
+    assert "ingest VLM" in line
+    assert "claude-sonnet-4.5" in line
+
+
+def test_symmetric_cheap_calls_out_native_arm_disadvantage():
+    extra = {
+        "scenario": "symmetric-cheap",
+        "provider_model": "openai/gpt-5.4-mini",
+        "vision_provider_model": "anthropic/claude-sonnet-4.5",
+    }
+    line = format_scenario_md(extra)
+    assert "**symmetric-cheap**" in line
+    assert "gpt-5.4-mini" in line
+    # The "structurally loses" disclaimer must be there so reviewers
+    # don't read this as a fair comparison.
+    assert "structurally loses" in line.lower() or "structurally_loses" in line.lower()
+
+
+def test_cost_arbitrage_distinguishes_native_and_surfsense_slugs():
+    extra = {
+        "scenario": "cost-arbitrage",
+        "provider_model": "openai/gpt-5.4-mini",
+        "native_arm_model": "anthropic/claude-sonnet-4.5",
+        "vision_provider_model": "anthropic/claude-sonnet-4.5",
+    }
+    line = format_scenario_md(extra)
+    assert "**cost-arbitrage**" in line
+    # Both slugs surface; reader can see the asymmetry at a glance.
+    assert "anthropic/claude-sonnet-4.5" in line
+    assert "openai/gpt-5.4-mini" in line
+    assert "fraction of the per-query cost" in line
+
+
+def test_legacy_artifact_without_scenario_renders_as_head_to_head():
+    """Old run_artifact.json files don't have ``scenario`` — must still render."""
+
+    extra = {"provider_model": "anthropic/claude-sonnet-4.5"}
+    line = format_scenario_md(extra)
+    assert "head-to-head" in line
+
+
+def test_none_extra_does_not_crash():
+    line = format_scenario_md(None)
+    assert "head-to-head" in line
diff --git a/surfsense_evals/tests/core/test_vision_llm.py b/surfsense_evals/tests/core/test_vision_llm.py
new file mode 100644
index 000000000..5c3dfd719
--- /dev/null
+++ b/surfsense_evals/tests/core/test_vision_llm.py
@@ -0,0 +1,121 @@
+"""Tests for vision LLM auto-pick + explicit-slug resolution."""
+
+from __future__ import annotations
+
+import pytest
+
+from surfsense_evals.core.clients.search_space import VisionLlmConfigEntry
+from surfsense_evals.core.vision_llm import (
+    RECOMMENDED_VISION_PRIORITY,
+    VisionConfigError,
+    resolve_vision_llm,
+)
+
+
+def _entry(*, id: int, model_name: str, provider: str = "OPENROUTER") -> VisionLlmConfigEntry:
+    return VisionLlmConfigEntry(
+        id=id,
+        name=f"OpenRouter • {model_name}",
+        provider=provider,
+        model_name=model_name,
+        is_auto_mode=False,
+        raw={},
+    )
+
+
+# ---------------------------------------------------------------------------
+# Explicit slug resolution
+# ---------------------------------------------------------------------------
+
+
+def test_explicit_slug_resolves_to_matching_config_id():
+    candidates = [
+        _entry(id=-101, model_name="anthropic/claude-sonnet-4.5"),
+        _entry(id=-102, model_name="openai/gpt-5"),
+    ]
+    resolved = resolve_vision_llm(candidates, explicit_slug="openai/gpt-5")
+    assert resolved.config_id == -102
+    assert resolved.provider_model == "openai/gpt-5"
+    assert resolved.selected_via == "explicit"
+
+
+def test_explicit_slug_with_no_match_raises_with_helpful_listing():
+    candidates = [_entry(id=-101, model_name="anthropic/claude-sonnet-4.5")]
+    with pytest.raises(VisionConfigError) as exc_info:
+        resolve_vision_llm(candidates, explicit_slug="some/missing-slug")
+    msg = str(exc_info.value)
+    assert "some/missing-slug" in msg
+    assert "anthropic/claude-sonnet-4.5" in msg  # surfaced as a sample
+
+
+def test_explicit_slug_skips_non_openrouter_entries():
+    """A YAML BYOK entry with a colliding model_name shouldn't accidentally match."""
+
+    candidates = [
+        _entry(id=42, model_name="openai/gpt-5", provider="OPENAI"),
+        _entry(id=-101, model_name="openai/gpt-5"),
+    ]
+    resolved = resolve_vision_llm(candidates, explicit_slug="openai/gpt-5")
+    assert resolved.config_id == -101  # the OpenRouter one, not the BYOK one
+
+
+# ---------------------------------------------------------------------------
+# Auto-pick by recommended priority
+# ---------------------------------------------------------------------------
+
+
+def test_auto_pick_walks_priority_list_in_order():
+    candidates = [
+        _entry(id=-300, model_name="google/gemini-2.5-pro"),
+        _entry(id=-200, model_name="anthropic/claude-opus-4.7"),
+        _entry(id=-100, model_name="anthropic/claude-sonnet-4.5"),
+    ]
+    resolved = resolve_vision_llm(candidates, explicit_slug=None)
+    # claude-sonnet-4.5 is first in the priority tuple, so it wins.
+    assert resolved.config_id == -100
+    assert resolved.provider_model == "anthropic/claude-sonnet-4.5"
+    assert resolved.selected_via == "auto-priority"
+
+
+def test_auto_pick_skips_to_next_priority_when_first_unavailable():
+    candidates = [
+        _entry(id=-200, model_name="anthropic/claude-opus-4.7"),
+        _entry(id=-300, model_name="google/gemini-2.5-pro"),
+    ]
+    resolved = resolve_vision_llm(candidates, explicit_slug=None)
+    # claude-sonnet-4.5 not registered → claude-opus-4.7 is next in priority.
+    assert resolved.provider_model == "anthropic/claude-opus-4.7"
+    assert resolved.selected_via == "auto-priority"
+
+
+def test_auto_pick_falls_back_to_first_openrouter_when_no_recommended_match():
+    candidates = [
+        _entry(id=-700, model_name="some/exotic-vision-model"),
+        _entry(id=-800, model_name="another/exotic-vision-model"),
+    ]
+    resolved = resolve_vision_llm(candidates, explicit_slug=None)
+    # Neither matches the priority list → first OpenRouter entry wins.
+    assert resolved.config_id == -700
+    assert resolved.selected_via == "auto-fallback"
+
+
+def test_auto_pick_with_zero_openrouter_candidates_raises():
+    candidates: list[VisionLlmConfigEntry] = []
+    with pytest.raises(VisionConfigError) as exc_info:
+        resolve_vision_llm(candidates, explicit_slug=None)
+    assert "vision_enabled: true" in str(exc_info.value)
+
+
+def test_auto_pick_ignores_non_openrouter_entries():
+    candidates = [
+        _entry(id=99, model_name="anthropic/claude-sonnet-4.5", provider="ANTHROPIC"),
+    ]
+    with pytest.raises(VisionConfigError):
+        resolve_vision_llm(candidates, explicit_slug=None)
+
+
+def test_recommended_priority_is_a_stable_public_list():
+    """If you reorder this, update the README's auto-pick claim too."""
+
+    assert RECOMMENDED_VISION_PRIORITY[0] == "anthropic/claude-sonnet-4.5"
+    assert "google/gemini-2.5-pro" in RECOMMENDED_VISION_PRIORITY
diff --git a/surfsense_evals/tests/suites/__init__.py b/surfsense_evals/tests/suites/__init__.py
new file mode 100644
index 000000000..8b1378917
--- /dev/null
+++ b/surfsense_evals/tests/suites/__init__.py
@@ -0,0 +1 @@
+
diff --git a/surfsense_evals/tests/suites/test_crag_dataset.py b/surfsense_evals/tests/suites/test_crag_dataset.py
new file mode 100644
index 000000000..36114b52e
--- /dev/null
+++ b/surfsense_evals/tests/suites/test_crag_dataset.py
@@ -0,0 +1,224 @@
+"""Tests for the CRAG dataset loader (parser + sampling).
+
+The full bz2 download is excluded — these tests synthesise a tiny
+JSONL-bz2 in a tmp dir and verify the parser / stratified-sampler
+produce well-shaped objects.
+"""
+
+from __future__ import annotations
+
+import bz2
+import json
+from pathlib import Path
+
+import pytest
+
+from surfsense_evals.suites.research.crag.dataset import (
+    CragPage,
+    CragQuestion,
+    iter_questions,
+    stratified_sample,
+)
+
+
+def _make_jsonl_bz2(rows: list[dict], tmp_path: Path) -> Path:
+    """Write ``rows`` as one JSON object per line, bz2-compressed."""
+
+    dest = tmp_path / "fake.jsonl.bz2"
+    payload = "\n".join(json.dumps(r) for r in rows).encode("utf-8")
+    with bz2.open(dest, "wb") as fh:
+        fh.write(payload)
+    return dest
+
+
+def _row(
+    *,
+    interaction_id: str,
+    query: str,
+    answer: str,
+    domain: str = "movie",
+    question_type: str = "simple",
+    pages: list[dict] | None = None,
+    alt_ans: list[str] | None = None,
+    popularity: str = "head",
+    static_or_dynamic: str = "static",
+    split: int = 0,
+    query_time: str = "2024-04-01",
+) -> dict:
+    return {
+        "interaction_id": interaction_id,
+        "query_time": query_time,
+        "domain": domain,
+        "question_type": question_type,
+        "static_or_dynamic": static_or_dynamic,
+        "query": query,
+        "answer": answer,
+        "alt_ans": alt_ans or [],
+        "split": split,
+        "popularity": popularity,
+        "search_results": pages or [],
+    }
+
+
+class TestParser:
+    def test_basic_parse(self, tmp_path: Path) -> None:
+        rows = [
+            _row(
+                interaction_id="abc",
+                query="Who directed Inception?",
+                answer="Christopher Nolan",
+                pages=[{
+                    "page_name": "Inception (film)",
+                    "page_url": "https://en.wikipedia.org/wiki/Inception",
+                    "page_snippet": "snippet",
+                    "page_result": "<html>full html</html>",
+                    "page_last_modified": "2024-01-01",
+                }],
+            ),
+        ]
+        path = _make_jsonl_bz2(rows, tmp_path)
+        parsed = iter_questions(path)
+        assert len(parsed) == 1
+        q = parsed[0]
+        assert q.query == "Who directed Inception?"
+        assert q.gold_answer == "Christopher Nolan"
+        assert q.qid == "C00000"
+        assert q.domain == "movie"
+        assert q.question_type == "simple"
+        assert len(q.pages) == 1
+        page = q.pages[0]
+        assert page.page_name == "Inception (film)"
+        assert page.page_url == "https://en.wikipedia.org/wiki/Inception"
+
+    def test_skips_missing_query_or_answer(self, tmp_path: Path) -> None:
+        rows = [
+            _row(interaction_id="1", query="", answer="x"),
+            _row(interaction_id="2", query="ok?", answer=""),
+            _row(interaction_id="3", query="ok?", answer="x"),
+        ]
+        path = _make_jsonl_bz2(rows, tmp_path)
+        parsed = iter_questions(path)
+        assert len(parsed) == 1
+        assert parsed[0].interaction_id == "3"
+
+    def test_skips_empty_pages(self, tmp_path: Path) -> None:
+        rows = [
+            _row(
+                interaction_id="x",
+                query="q?",
+                answer="a",
+                pages=[
+                    {"page_url": "", "page_result": "<html/>"},  # no URL
+                    {"page_url": "https://x.test/", "page_result": ""},  # empty html
+                    {"page_url": "https://y.test/", "page_result": "<html>good</html>"},
+                ],
+            ),
+        ]
+        path = _make_jsonl_bz2(rows, tmp_path)
+        parsed = iter_questions(path)
+        assert len(parsed) == 1
+        assert len(parsed[0].pages) == 1
+        assert parsed[0].pages[0].page_url == "https://y.test/"
+
+    def test_alt_answers_parsed(self, tmp_path: Path) -> None:
+        rows = [
+            _row(interaction_id="z", query="q?", answer="42",
+                 alt_ans=["forty-two", "42.0"]),
+        ]
+        path = _make_jsonl_bz2(rows, tmp_path)
+        parsed = iter_questions(path)
+        assert parsed[0].alt_answers == ["forty-two", "42.0"]
+
+    def test_handles_malformed_line(self, tmp_path: Path) -> None:
+        # Manually construct a bz2 with one valid line and one garbage line.
+        good = json.dumps(_row(interaction_id="ok", query="q?", answer="a"))
+        path = tmp_path / "mixed.jsonl.bz2"
+        with bz2.open(path, "wb") as fh:
+            fh.write(b"not-json{\n")
+            fh.write((good + "\n").encode("utf-8"))
+        parsed = iter_questions(path)
+        # Malformed line is skipped; one good row survives at index 1.
+        assert len(parsed) == 1
+        assert parsed[0].interaction_id == "ok"
+
+
+class TestPageHash:
+    def test_url_hash_stable(self) -> None:
+        a = CragPage(
+            page_name="A", page_url="https://x.test/p?q=1",
+            page_snippet="", page_html="<html/>",
+        )
+        b = CragPage(
+            page_name="B", page_url="https://x.test/p?q=1",
+            page_snippet="", page_html="<html/>",
+        )
+        assert a.url_hash == b.url_hash
+        assert len(a.url_hash) == 12
+
+    def test_url_hash_unique(self) -> None:
+        a = CragPage(
+            page_name="A", page_url="https://x.test/a", page_snippet="", page_html="<html/>",
+        )
+        b = CragPage(
+            page_name="B", page_url="https://x.test/b", page_snippet="", page_html="<html/>",
+        )
+        assert a.url_hash != b.url_hash
+
+
+class TestStratifiedSample:
+    def _make_pool(self) -> list[CragQuestion]:
+        out: list[CragQuestion] = []
+        idx = 0
+        # 30 finance/simple, 20 movie/comparison, 5 sports/multi-hop.
+        for n, domain, qtype in (
+            (30, "finance", "simple"),
+            (20, "movie", "comparison"),
+            (5, "sports", "multi-hop"),
+        ):
+            for _ in range(n):
+                out.append(CragQuestion(
+                    qid=f"C{idx:05d}",
+                    interaction_id=f"i{idx}",
+                    query_time="2024-01-01",
+                    query=f"q{idx}?",
+                    gold_answer="a",
+                    alt_answers=[],
+                    domain=domain,
+                    question_type=qtype,
+                    static_or_dynamic="static",
+                    popularity="head",
+                    split=0,
+                    raw_index=idx,
+                    pages=[],
+                ))
+                idx += 1
+        return out
+
+    def test_sample_smaller_than_pool(self) -> None:
+        pool = self._make_pool()
+        sample = stratified_sample(pool, n=15, seed=7)
+        assert len(sample) == 15
+        # Should pull from all three buckets at least once.
+        domains = {q.domain for q in sample}
+        assert domains == {"finance", "movie", "sports"}
+
+    def test_sample_returns_pool_when_n_geq(self) -> None:
+        pool = self._make_pool()
+        sample = stratified_sample(pool, n=999, seed=1)
+        assert len(sample) == len(pool)
+
+    def test_sample_sorted_by_raw_index(self) -> None:
+        pool = self._make_pool()
+        sample = stratified_sample(pool, n=10, seed=42)
+        assert [q.raw_index for q in sample] == sorted(q.raw_index for q in sample)
+
+    def test_sample_deterministic(self) -> None:
+        pool = self._make_pool()
+        s1 = stratified_sample(pool, n=20, seed=11)
+        s2 = stratified_sample(pool, n=20, seed=11)
+        assert [q.qid for q in s1] == [q.qid for q in s2]
+
+    def test_n_zero_or_negative_returns_pool(self) -> None:
+        pool = self._make_pool()
+        assert len(stratified_sample(pool, n=0)) == len(pool)
+        assert len(stratified_sample(pool, n=-1)) == len(pool)
diff --git a/surfsense_evals/tests/suites/test_crag_dataset_task3.py b/surfsense_evals/tests/suites/test_crag_dataset_task3.py
new file mode 100644
index 000000000..123628350
--- /dev/null
+++ b/surfsense_evals/tests/suites/test_crag_dataset_task3.py
@@ -0,0 +1,259 @@
+"""Unit tests for CRAG Task 3 streaming dataset loader.
+
+We don't (and shouldn't) hit the real 7 GB upstream archive in
+unit tests. Instead we construct tiny tar.bz2 archives split across
+N parts and verify:
+
+* ``_MultiPartReader`` correctly stitches N files together.
+* The streaming path (multi → bz2 → tar → JSONL) yields parsed
+  ``CragQuestion`` rows with the right shape.
+* ``max_questions`` cap is honoured (early break, no greedy read).
+* ``parts_present`` correctly detects missing/empty parts.
+"""
+
+from __future__ import annotations
+
+import bz2
+import io
+import json
+import tarfile
+from pathlib import Path
+
+import pytest
+
+from surfsense_evals.suites.research.crag.dataset_task3 import (
+    _MultiPartReader,
+    iter_questions_task3,
+    parts_present,
+)
+
+
+# ---------------------------------------------------------------------------
+# Fixtures: build a tiny synthetic Task 3 archive
+# ---------------------------------------------------------------------------
+
+
+def _make_jsonl_payload(n_rows: int) -> bytes:
+    rows = []
+    for i in range(n_rows):
+        rows.append({
+            "interaction_id": f"int_{i:04d}",
+            "query_time": "2024-01-01 00:00:00",
+            "domain": ["finance", "music", "movie", "sports", "open"][i % 5],
+            "question_type": ["simple", "comparison", "aggregation", "multi-hop"][i % 4],
+            "static_or_dynamic": "static",
+            "popularity": "head",
+            "split": 0,
+            "query": f"Synthetic CRAG question {i}?",
+            "answer": f"answer-{i}",
+            "alt_ans": [f"alt-{i}-a", f"alt-{i}-b"],
+            "search_results": [
+                {
+                    "page_name": f"Page {j} for q{i}",
+                    "page_url": f"https://example.com/q{i}/p{j}",
+                    "page_snippet": "snippet",
+                    "page_result": f"<html><body><p>q{i} p{j} body</p></body></html>",
+                    "page_last_modified": "",
+                }
+                for j in range(50)
+            ],
+        })
+    return b"\n".join(json.dumps(r).encode("utf-8") for r in rows) + b"\n"
+
+
+def _make_tar_bz2(jsonl_bytes: bytes, *, member_name: str = "data.jsonl") -> bytes:
+    bio = io.BytesIO()
+    with bz2.BZ2File(bio, mode="wb") as bz:
+        with tarfile.open(fileobj=bz, mode="w") as tar:
+            info = tarfile.TarInfo(name=member_name)
+            info.size = len(jsonl_bytes)
+            tar.addfile(info, io.BytesIO(jsonl_bytes))
+    return bio.getvalue()
+
+
+def _make_tar_bz2_multi(shards: list[tuple[str, bytes]]) -> bytes:
+    """Build a tar.bz2 archive containing multiple JSONL shards.
+
+    Mirrors the real CRAG Task 3 layout: one tar with N JSONL members
+    named ``crag_task_3_dev_v4_{i}.jsonl`` (or whatever the caller
+    passes in).
+    """
+
+    bio = io.BytesIO()
+    with bz2.BZ2File(bio, mode="wb") as bz:
+        with tarfile.open(fileobj=bz, mode="w") as tar:
+            for name, payload in shards:
+                info = tarfile.TarInfo(name=name)
+                info.size = len(payload)
+                tar.addfile(info, io.BytesIO(payload))
+    return bio.getvalue()
+
+
+def _split_into_parts(blob: bytes, n_parts: int) -> list[bytes]:
+    """Split byte string into N roughly-equal chunks (last gets remainder)."""
+    chunk = max(1, len(blob) // n_parts)
+    parts = [blob[i * chunk : (i + 1) * chunk] for i in range(n_parts - 1)]
+    parts.append(blob[(n_parts - 1) * chunk :])
+    return parts
+
+
+@pytest.fixture
+def task3_parts_dir(tmp_path: Path) -> Path:
+    """A directory containing a 4-part synthetic CRAG Task 3 archive (12 rows)."""
+    blob = _make_tar_bz2(_make_jsonl_payload(12))
+    parts = _split_into_parts(blob, 4)
+    parts_dir = tmp_path / ".raw_cache"
+    parts_dir.mkdir()
+    for i, b in enumerate(parts, start=1):
+        (parts_dir / f"crag_task_3_dev_v4.tar.bz2.part{i}").write_bytes(b)
+    return parts_dir
+
+
+# ---------------------------------------------------------------------------
+# _MultiPartReader
+# ---------------------------------------------------------------------------
+
+
+class TestMultiPartReader:
+    def test_concatenates_parts_in_order(self, tmp_path: Path) -> None:
+        a = tmp_path / "a"
+        b = tmp_path / "b"
+        c = tmp_path / "c"
+        a.write_bytes(b"hello, ")
+        b.write_bytes(b"streaming ")
+        c.write_bytes(b"world!")
+        with _MultiPartReader([a, b, c]) as r:
+            assert r.read() == b"hello, streaming world!"
+
+    def test_read_n_crosses_part_boundary(self, tmp_path: Path) -> None:
+        a = tmp_path / "a"
+        b = tmp_path / "b"
+        a.write_bytes(b"AAA")
+        b.write_bytes(b"BBBB")
+        with _MultiPartReader([a, b]) as r:
+            # Read 5 bytes — straddles boundary between parts.
+            assert r.read(5) == b"AAABB"
+            assert r.read(5) == b"BB"
+            assert r.read(5) == b""
+
+    def test_close_is_idempotent(self, tmp_path: Path) -> None:
+        a = tmp_path / "a"
+        a.write_bytes(b"x")
+        r = _MultiPartReader([a])
+        r.close()
+        r.close()
+        with pytest.raises(ValueError):
+            r.read(1)
+
+    def test_missing_part_raises(self, tmp_path: Path) -> None:
+        with pytest.raises(FileNotFoundError):
+            _MultiPartReader([tmp_path / "does-not-exist"])
+
+    def test_empty_paths_raises(self) -> None:
+        with pytest.raises(ValueError):
+            _MultiPartReader([])
+
+
+# ---------------------------------------------------------------------------
+# iter_questions_task3
+# ---------------------------------------------------------------------------
+
+
+@pytest.fixture
+def task3_multi_shard_dir(tmp_path: Path) -> Path:
+    """A 4-part archive whose tar contains 3 JSONL shards (4 + 4 + 4 rows)."""
+    payload_a = _make_jsonl_payload(4)
+    payload_b = _make_jsonl_payload(4)
+    payload_c = _make_jsonl_payload(4)
+    blob = _make_tar_bz2_multi([
+        ("crag_task_3_dev_v4_0.jsonl", payload_a),
+        ("crag_task_3_dev_v4_1.jsonl", payload_b),
+        ("crag_task_3_dev_v4_2.jsonl", payload_c),
+    ])
+    parts = _split_into_parts(blob, 4)
+    parts_dir = tmp_path / ".raw_cache"
+    parts_dir.mkdir()
+    for i, b in enumerate(parts, start=1):
+        (parts_dir / f"crag_task_3_dev_v4.tar.bz2.part{i}").write_bytes(b)
+    return parts_dir
+
+
+class TestIterQuestionsTask3:
+    def test_streams_full_archive(self, task3_parts_dir: Path) -> None:
+        questions = iter_questions_task3(task3_parts_dir)
+        assert len(questions) == 12
+        # All questions get the T3_ prefix and 50 pages each.
+        assert all(q.qid.startswith("T3_") for q in questions)
+        assert all(len(q.pages) == 50 for q in questions)
+        # Schema fields preserved.
+        first = questions[0]
+        assert first.query == "Synthetic CRAG question 0?"
+        assert first.gold_answer == "answer-0"
+        assert first.domain == "finance"
+        assert "alt-0-a" in first.alt_answers
+
+    def test_max_questions_caps_early(self, task3_parts_dir: Path) -> None:
+        questions = iter_questions_task3(task3_parts_dir, max_questions=3)
+        assert len(questions) == 3
+        # Sequential indices 0..2 — we don't skip rows.
+        assert [q.raw_index for q in questions] == [0, 1, 2]
+
+    def test_streams_multi_shard_archive(self, task3_multi_shard_dir: Path) -> None:
+        # Three shards × four rows each = twelve rows total.
+        questions = iter_questions_task3(task3_multi_shard_dir)
+        assert len(questions) == 12
+        # raw_index increments monotonically across shards.
+        assert [q.raw_index for q in questions] == list(range(12))
+        # qids are unique and sequential across shards.
+        assert len({q.qid for q in questions}) == 12
+
+    def test_max_questions_short_circuits_first_shard(self, task3_multi_shard_dir: Path) -> None:
+        # Cap < shard size — shouldn't touch shards 1 or 2 at all.
+        questions = iter_questions_task3(task3_multi_shard_dir, max_questions=2)
+        assert len(questions) == 2
+        # Both come from shard 0 (raw_index 0, 1).
+        assert [q.raw_index for q in questions] == [0, 1]
+
+    def test_max_questions_spans_shards(self, task3_multi_shard_dir: Path) -> None:
+        # Cap = 6 → all 4 from shard 0 + first 2 from shard 1.
+        questions = iter_questions_task3(task3_multi_shard_dir, max_questions=6)
+        assert len(questions) == 6
+        assert [q.raw_index for q in questions] == [0, 1, 2, 3, 4, 5]
+
+    def test_raises_when_no_jsonl_member(self, tmp_path: Path) -> None:
+        # Archive containing a non-jsonl member.
+        bio = io.BytesIO()
+        with bz2.BZ2File(bio, mode="wb") as bz:
+            with tarfile.open(fileobj=bz, mode="w") as tar:
+                info = tarfile.TarInfo(name="README.md")
+                payload = b"not jsonl"
+                info.size = len(payload)
+                tar.addfile(info, io.BytesIO(payload))
+        parts_dir = tmp_path / ".raw_cache"
+        parts_dir.mkdir()
+        for i, name in enumerate(
+            ("part1", "part2", "part3", "part4"), start=1,
+        ):
+            half = len(bio.getvalue()) // 4
+            chunk = bio.getvalue()[(i - 1) * half : i * half if i < 4 else len(bio.getvalue())]
+            (parts_dir / f"crag_task_3_dev_v4.tar.bz2.{name}").write_bytes(chunk)
+        with pytest.raises(RuntimeError, match="No JSONL member"):
+            iter_questions_task3(parts_dir)
+
+
+# ---------------------------------------------------------------------------
+# parts_present
+# ---------------------------------------------------------------------------
+
+
+class TestPartsPresent:
+    def test_all_present(self, task3_parts_dir: Path) -> None:
+        assert parts_present(task3_parts_dir) is True
+
+    def test_one_missing(self, task3_parts_dir: Path) -> None:
+        (task3_parts_dir / "crag_task_3_dev_v4.tar.bz2.part2").unlink()
+        assert parts_present(task3_parts_dir) is False
+
+    def test_one_empty(self, task3_parts_dir: Path) -> None:
+        (task3_parts_dir / "crag_task_3_dev_v4.tar.bz2.part3").write_bytes(b"")
+        assert parts_present(task3_parts_dir) is False
diff --git a/surfsense_evals/tests/suites/test_crag_grader.py b/surfsense_evals/tests/suites/test_crag_grader.py
new file mode 100644
index 000000000..93bf6f478
--- /dev/null
+++ b/surfsense_evals/tests/suites/test_crag_grader.py
@@ -0,0 +1,248 @@
+"""Tests for the CRAG 3-class deterministic grader.
+
+The LLM-judge fallback is excluded here (network call); these tests
+exercise the deterministic shortcut + the special-case routing for
+``false_premise`` questions and refusal detection (``I don't know``).
+"""
+
+from __future__ import annotations
+
+import pytest
+
+from surfsense_evals.suites.research.crag.grader import (
+    CragGradeResult,
+    _flags_false_premise,
+    _is_refusal,
+    _maybe_number,
+    _normalise,
+    _whole_word_substring,
+    grade_deterministic,
+)
+
+
+class TestNormalisation:
+    def test_lowercase_and_punct_stripped(self) -> None:
+        assert _normalise("Apple Inc.") == "apple inc"
+
+    def test_articles_removed(self) -> None:
+        assert _normalise("The Apple Watch") == "apple watch"
+
+    def test_empty_returns_empty(self) -> None:
+        assert _normalise("") == ""
+
+
+class TestNumericExtraction:
+    def test_simple_int(self) -> None:
+        assert _maybe_number("42") == 42.0
+
+    def test_int_with_commas(self) -> None:
+        assert _maybe_number("$1,234") == 1234.0
+
+    def test_year_in_sentence(self) -> None:
+        assert _maybe_number("released in 2008") == 2008.0
+
+    def test_word_number(self) -> None:
+        assert _maybe_number("seven") == 7.0
+
+
+class TestWholeWordSubstring:
+    def test_phrase_match(self) -> None:
+        assert _whole_word_substring("the new york yankees", "new york")
+
+    def test_word_boundary_required(self) -> None:
+        assert not _whole_word_substring("yorkshire", "york")
+
+
+class TestRefusalDetection:
+    def test_explicit_idk(self) -> None:
+        assert _is_refusal("Answer: I don't know")
+
+    def test_idk_no_apostrophe(self) -> None:
+        assert _is_refusal("I dont know")
+
+    def test_no_information(self) -> None:
+        assert _is_refusal("There is no information available about this.")
+
+    def test_unable_to_answer(self) -> None:
+        assert _is_refusal("I am unable to answer this question.")
+
+    def test_empty_is_refusal(self) -> None:
+        assert _is_refusal("")
+        assert _is_refusal("   ")
+
+    def test_real_answer_is_not_refusal(self) -> None:
+        assert not _is_refusal("Answer: Apple Inc")
+        assert not _is_refusal("The CEO is Tim Cook.")
+
+
+class TestFalsePremiseDetection:
+    def test_explicit_false_premise(self) -> None:
+        assert _flags_false_premise(
+            "The question contains a false premise; the company never had that product."
+        )
+
+    def test_no_such(self) -> None:
+        assert _flags_false_premise("There is no such album.")
+
+    def test_did_not_happen(self) -> None:
+        assert _flags_false_premise("That event did not happen.")
+
+    def test_does_not_exist(self) -> None:
+        assert _flags_false_premise("That movie does not exist.")
+
+    def test_normal_answer_is_not_premise_flag(self) -> None:
+        assert not _flags_false_premise("Apple, headquartered in Cupertino.")
+
+
+class TestGradeDeterministicHappyPath:
+    def test_exact_match_correct(self) -> None:
+        result = grade_deterministic(pred="Tim Cook", gold="Tim Cook", question_type="simple")
+        assert result.grade == "correct"
+        assert result.score == 1
+        assert result.method == "exact"
+
+    def test_substring_match(self) -> None:
+        result = grade_deterministic(
+            pred="The answer is Tim Cook, CEO of Apple.",
+            gold="Tim Cook",
+            question_type="simple",
+        )
+        assert result.grade == "correct"
+        assert result.method == "substring"
+
+    def test_alt_answer_match(self) -> None:
+        result = grade_deterministic(
+            pred="2,008",
+            gold="two thousand eight",
+            alt_answers=["2008"],
+            question_type="simple",
+        )
+        assert result.grade == "correct"
+        assert result.score == 1
+
+    def test_numeric_within_tolerance(self) -> None:
+        result = grade_deterministic(
+            pred="The revenue was $1,234,000 USD",
+            gold="$1,234,123",
+            question_type="aggregation",
+        )
+        assert result.grade == "correct"
+        assert result.method == "numeric"
+
+    def test_numeric_outside_tolerance(self) -> None:
+        result = grade_deterministic(
+            pred="100",
+            gold="500",
+            question_type="aggregation",
+        )
+        assert result.grade == "incorrect"
+        assert result.score == -1
+
+    def test_numeric_strict_small_currency(self) -> None:
+        # CRAG (unlike FRAMES) does not apply a 0.5 absolute floor —
+        # ``$2.05`` should NOT match ``$2.17`` (≈5.5% off, well over 1%).
+        result = grade_deterministic(
+            pred="$2.05",
+            gold="$2.17",
+            question_type="simple",
+        )
+        # Falls through to lexical_miss (no substring overlap either).
+        assert result.grade == "incorrect"
+        assert result.method == "lexical_miss"
+
+
+class TestGradeDeterministicRefusal:
+    def test_idk_maps_to_missing(self) -> None:
+        result = grade_deterministic(
+            pred="I don't know.", gold="Tim Cook", question_type="simple",
+        )
+        assert result.grade == "missing"
+        assert result.score == 0
+        assert result.method == "refusal"
+
+    def test_empty_pred_maps_to_missing(self) -> None:
+        result = grade_deterministic(pred="", gold="Tim Cook", question_type="simple")
+        assert result.grade == "missing"
+
+    def test_no_information_maps_to_missing(self) -> None:
+        result = grade_deterministic(
+            pred="There is not enough information to answer.",
+            gold="42",
+            question_type="simple",
+        )
+        assert result.grade == "missing"
+
+
+class TestGradeDeterministicFalsePremise:
+    def test_flagging_premise_is_correct(self) -> None:
+        result = grade_deterministic(
+            pred="The question contains a false premise; that movie does not exist.",
+            gold="invalid question",
+            question_type="false_premise",
+        )
+        assert result.grade == "correct"
+        assert result.method == "false_premise_flagged"
+
+    def test_committing_to_false_answer_is_unclear(self) -> None:
+        # Should land in false_premise_unclear → judge fallback territory.
+        result = grade_deterministic(
+            pred="The album was released in 2010.",
+            gold="invalid question",
+            question_type="false_premise",
+        )
+        assert result.grade == "incorrect"
+        assert result.method == "false_premise_unclear"
+
+    def test_idk_on_false_premise_is_missing(self) -> None:
+        # Refusal precedes false-premise routing.
+        result = grade_deterministic(
+            pred="I don't know.",
+            gold="invalid question",
+            question_type="false_premise",
+        )
+        assert result.grade == "missing"
+
+
+class TestGradeDeterministicLexicalMiss:
+    def test_unknown_paraphrase_routes_to_judge(self) -> None:
+        result = grade_deterministic(
+            pred="It is the technology giant in Cupertino.",
+            gold="Apple Inc",
+            question_type="simple",
+        )
+        # Without a judge, we fall through to lexical_miss → incorrect.
+        assert result.grade == "incorrect"
+        assert result.method == "lexical_miss"
+
+    def test_short_pred_no_substring_credit(self) -> None:
+        # Reverse-substring path requires len >= 3 to credit.
+        result = grade_deterministic(
+            pred="JK",
+            gold="JK Rowling",
+            question_type="simple",
+        )
+        assert result.grade == "incorrect"
+
+
+class TestGradeResultShape:
+    def test_to_dict_round_trip(self) -> None:
+        result = CragGradeResult(
+            grade="correct", score=1, method="exact",
+            normalised_pred="x", normalised_gold="x",
+        )
+        d = result.to_dict()
+        assert d["grade"] == "correct"
+        assert d["score"] == 1
+        assert d["method"] == "exact"
+
+    def test_score_matches_grade(self) -> None:
+        # Construct via grader so the score field is populated correctly.
+        for gold, pred, want_grade in (
+            ("hi", "hi", "correct"),
+            ("hi", "I don't know", "missing"),
+            ("hi", "bye", "incorrect"),
+        ):
+            result = grade_deterministic(pred=pred, gold=gold, question_type="simple")
+            assert result.grade == want_grade
+            expected_score = {"correct": 1, "missing": 0, "incorrect": -1}[want_grade]
+            assert result.score == expected_score
diff --git a/surfsense_evals/tests/suites/test_crag_html_extract.py b/surfsense_evals/tests/suites/test_crag_html_extract.py
new file mode 100644
index 000000000..a2b47aa45
--- /dev/null
+++ b/surfsense_evals/tests/suites/test_crag_html_extract.py
@@ -0,0 +1,149 @@
+"""Tests for the CRAG HTML extractor.
+
+We don't network-fetch trafilatura; we just verify the wrapper:
+
+* Strips obvious boilerplate (nav/footer/scripts) out of the result.
+* Falls back to the stdlib stripper on degenerate input.
+* Caps output at the configured ceiling.
+* Always prepends a metadata header (``# title``) when content is
+  produced.
+"""
+
+from __future__ import annotations
+
+import pytest
+
+from surfsense_evals.suites.research.crag.html_extract import (
+    extract_main_content,
+)
+
+
+_RICH_HTML = """\
+<!DOCTYPE html>
+<html>
+<head><title>Apple Q3 Earnings</title>
+<script>const a=1;</script>
+<style>body{font-family:sans;}</style>
+</head>
+<body>
+<nav><a href="/home">Home</a><a href="/about">About</a></nav>
+<header><h1>Tech News Site</h1><p>Subscribe to our newsletter</p></header>
+<main>
+<article>
+  <h1>Apple posts $90B revenue in Q3 2024</h1>
+  <p>Apple Inc. announced its Q3 2024 financial results today, reporting
+  $90 billion in revenue, beating analyst expectations of $87 billion.</p>
+  <p>The company saw growth across iPhone, services, and wearables.
+  CEO Tim Cook attributed the performance to strong demand in emerging
+  markets, particularly India.</p>
+  <h2>Segment breakdown</h2>
+  <ul>
+    <li>iPhone: $45B</li>
+    <li>Services: $24B</li>
+    <li>Mac: $7B</li>
+  </ul>
+</article>
+</main>
+<footer><p>Copyright 2024 Tech News Site. All rights reserved.</p></footer>
+</body></html>
+"""
+
+
+class TestExtractMainContent:
+    def test_extracts_main_article(self) -> None:
+        result = extract_main_content(
+            _RICH_HTML,
+            url="https://example.com/apple",
+            page_name="Apple Q3 Earnings",
+        )
+        assert result.ok
+        assert "Apple" in result.text
+        assert "Q3 2024" in result.text
+        # Header line is prepended.
+        assert result.text.startswith("# Apple Q3 Earnings")
+        assert "Source: https://example.com/apple" in result.text
+
+    def test_strips_boilerplate(self) -> None:
+        result = extract_main_content(
+            _RICH_HTML,
+            url="https://example.com/apple",
+            page_name="Apple Q3 Earnings",
+        )
+        assert result.ok
+        # Boilerplate strings should NOT make it through.
+        assert "Subscribe to our newsletter" not in result.text
+        assert "Copyright 2024 Tech News Site" not in result.text
+        assert "const a=1" not in result.text  # script content
+
+    def test_includes_last_modified_when_provided(self) -> None:
+        result = extract_main_content(
+            _RICH_HTML,
+            url="https://example.com/apple",
+            page_name="Apple Q3 Earnings",
+            last_modified="2024-08-01",
+        )
+        assert "Last modified: 2024-08-01" in result.text
+
+    def test_empty_html_returns_empty_result(self) -> None:
+        result = extract_main_content("", url="https://x.test/")
+        assert not result.ok
+        assert result.method == "empty"
+        assert result.n_chars == 0
+
+    def test_whitespace_only_html_is_empty(self) -> None:
+        result = extract_main_content("   \n   ", url="https://x.test/")
+        assert not result.ok
+
+    def test_garbage_html_falls_back(self) -> None:
+        # Trafilatura should reject this, fallback strip should still yield text.
+        result = extract_main_content(
+            "<<weird>>not a tag>>>The brown fox<<jumped<<",
+            url="https://x.test/garbage",
+            page_name="Garbage",
+        )
+        # Either trafilatura recovers something or fallback_strip does.
+        if result.ok:
+            assert "brown fox" in result.text or "jumped" in result.text
+
+
+class TestFallbackStripper:
+    def test_extract_when_no_clear_main(self) -> None:
+        html = """
+        <html><body>
+        <p>This is content one.</p>
+        <p>This is content two.</p>
+        </body></html>
+        """
+        result = extract_main_content(
+            html, url="https://x.test/", page_name="Title",
+        )
+        assert result.ok
+        assert "content one" in result.text
+        assert "content two" in result.text
+
+    def test_html_entities_decoded(self) -> None:
+        html = """<html><body>
+        <article>
+        <p>Tom &amp; Jerry &mdash; classic cartoon &copy; 1940.</p>
+        <p>It's a story about a cat &lt;Tom&gt; and a mouse &lt;Jerry&gt;.</p>
+        </article>
+        </body></html>"""
+        result = extract_main_content(html, url="https://x.test/")
+        assert result.ok
+        # & should be decoded
+        assert "&amp;" not in result.text
+        assert "Tom" in result.text and "Jerry" in result.text
+
+
+class TestOutputCapping:
+    def test_long_output_is_truncated(self) -> None:
+        # Generate enough content to exceed 200k cap.
+        body = "<p>" + ("hello world " * 50_000) + "</p>"
+        html = f"<html><body><article>{body}</article></body></html>"
+        result = extract_main_content(html, url="https://x.test/", page_name="long")
+        assert result.ok
+        # The body text itself + the metadata header. Truncation marker
+        # appears either at the body limit or before EOF.
+        if "[...truncated...]" in result.text:
+            # The truncation kicked in.
+            assert len(result.text) <= 250_000  # header + 200k cap + slack
diff --git a/surfsense_evals/tests/suites/test_frames_dataset.py b/surfsense_evals/tests/suites/test_frames_dataset.py
new file mode 100644
index 000000000..e79e7db89
--- /dev/null
+++ b/surfsense_evals/tests/suites/test_frames_dataset.py
@@ -0,0 +1,154 @@
+"""Tests for the FRAMES dataset parser.
+
+Network-free: we round-trip a tiny fixture TSV through pandas and
+``load_questions`` to confirm:
+
+* row indices become zero-padded ``Q###`` ids,
+* ``wiki_links`` (Python list literal) is materialised correctly,
+* ``reasoning_types`` is split on the pipe separator,
+* missing Prompt/Answer rows are dropped, and
+* the legacy ``wikipedia_link_*`` per-cell fallback works when
+  ``wiki_links`` is missing/empty.
+"""
+
+from __future__ import annotations
+
+import textwrap
+from pathlib import Path
+
+import pytest
+
+from surfsense_evals.suites.research.frames.dataset import (
+    FramesQuestion,
+    _parse_reasoning_types,
+    _parse_wiki_links,
+    load_questions,
+)
+
+
+# ---------------------------------------------------------------------------
+# Pure-function tests
+# ---------------------------------------------------------------------------
+
+
+class TestParseWikiLinks:
+    def test_python_list_literal(self) -> None:
+        s = "['https://en.wikipedia.org/wiki/A', 'https://en.wikipedia.org/wiki/B']"
+        assert _parse_wiki_links(s) == [
+            "https://en.wikipedia.org/wiki/A",
+            "https://en.wikipedia.org/wiki/B",
+        ]
+
+    def test_none_or_empty(self) -> None:
+        assert _parse_wiki_links(None) == []
+        assert _parse_wiki_links("") == []
+        assert _parse_wiki_links("[]") == []
+
+    def test_unquoted_csv_fallback(self) -> None:
+        # Defensive: non-Python-list strings still split on commas.
+        s = "https://a, https://b"
+        assert _parse_wiki_links(s) == ["https://a", "https://b"]
+
+    def test_already_a_list(self) -> None:
+        assert _parse_wiki_links(["x", "y"]) == ["x", "y"]
+
+
+class TestParseReasoningTypes:
+    def test_pipe_separated(self) -> None:
+        assert _parse_reasoning_types("Numerical reasoning | Multiple constraints") == [
+            "Numerical reasoning",
+            "Multiple constraints",
+        ]
+
+    def test_single_tag(self) -> None:
+        assert _parse_reasoning_types("Tabular reasoning") == ["Tabular reasoning"]
+
+    def test_empty(self) -> None:
+        assert _parse_reasoning_types(None) == []
+        assert _parse_reasoning_types("") == []
+
+
+# ---------------------------------------------------------------------------
+# Round-trip via pandas
+# ---------------------------------------------------------------------------
+
+
+def _write_tsv(path: Path, body: str) -> None:
+    """Helper that writes a tab-separated fixture exactly as the user typed it."""
+
+    path.write_text(textwrap.dedent(body), encoding="utf-8")
+
+
+def test_load_questions_basic(tmp_path: Path) -> None:
+    tsv = tmp_path / "test.tsv"
+    rows = [
+        # Header (first column is unnamed → pandas treats as index)
+        "\tPrompt\tAnswer\twikipedia_link_1\twikipedia_link_2\treasoning_types\twiki_links",
+        # Row 0
+        "0\tWho was the 15th president?\tJames Buchanan\t"
+        "https://en.wikipedia.org/wiki/James_Buchanan\t\t"
+        "Multiple constraints\t"
+        "['https://en.wikipedia.org/wiki/James_Buchanan']",
+        # Row 1
+        "1\tHow many years between A and B?\t87\t"
+        "https://en.wikipedia.org/wiki/A\thttps://en.wikipedia.org/wiki/B\t"
+        "Numerical reasoning | Temporal reasoning\t"
+        "['https://en.wikipedia.org/wiki/A', 'https://en.wikipedia.org/wiki/B']",
+        # Row 2 (intentionally missing Prompt — should be dropped)
+        "2\t\tunused\t\t\t\t",
+    ]
+    tsv.write_text("\n".join(rows) + "\n", encoding="utf-8")
+
+    questions = load_questions(tsv)
+    assert len(questions) == 2
+
+    q0, q1 = questions
+    assert isinstance(q0, FramesQuestion)
+    assert q0.qid == "Q000"
+    assert q0.raw_index == 0
+    assert q0.gold_answer == "James Buchanan"
+    assert q0.wiki_urls == ["https://en.wikipedia.org/wiki/James_Buchanan"]
+    assert q0.reasoning_types == ["Multiple constraints"]
+
+    assert q1.qid == "Q001"
+    assert q1.gold_answer == "87"
+    assert q1.wiki_urls == [
+        "https://en.wikipedia.org/wiki/A",
+        "https://en.wikipedia.org/wiki/B",
+    ]
+    assert q1.reasoning_types == ["Numerical reasoning", "Temporal reasoning"]
+
+
+def test_load_questions_falls_back_to_per_cell_links(tmp_path: Path) -> None:
+    """When ``wiki_links`` is empty, the loader should glue the
+    ``wikipedia_link_*`` cells back together."""
+
+    tsv = tmp_path / "test.tsv"
+    rows = [
+        "\tPrompt\tAnswer\twikipedia_link_1\twikipedia_link_2\treasoning_types\twiki_links",
+        "0\tQ?\tA\t"
+        "https://en.wikipedia.org/wiki/Cell1\thttps://en.wikipedia.org/wiki/Cell2\t"
+        "Numerical reasoning\t",
+    ]
+    tsv.write_text("\n".join(rows) + "\n", encoding="utf-8")
+    questions = load_questions(tsv)
+    assert len(questions) == 1
+    assert questions[0].wiki_urls == [
+        "https://en.wikipedia.org/wiki/Cell1",
+        "https://en.wikipedia.org/wiki/Cell2",
+    ]
+
+
+def test_load_questions_to_dict_round_trip(tmp_path: Path) -> None:
+    tsv = tmp_path / "test.tsv"
+    rows = [
+        "\tPrompt\tAnswer\treasoning_types\twiki_links",
+        "0\tQ?\tParis\tTemporal reasoning\t['https://en.wikipedia.org/wiki/Paris']",
+    ]
+    tsv.write_text("\n".join(rows) + "\n", encoding="utf-8")
+
+    [q] = load_questions(tsv)
+    d = q.to_dict()
+    assert d["qid"] == "Q000"
+    assert d["wiki_urls"] == ["https://en.wikipedia.org/wiki/Paris"]
+    assert d["reasoning_types"] == ["Temporal reasoning"]
diff --git a/surfsense_evals/tests/suites/test_frames_grader.py b/surfsense_evals/tests/suites/test_frames_grader.py
new file mode 100644
index 000000000..e6e38ff8a
--- /dev/null
+++ b/surfsense_evals/tests/suites/test_frames_grader.py
@@ -0,0 +1,160 @@
+"""Tests for the FRAMES grader's deterministic shortcut.
+
+The LLM-judge fallback is excluded here (network call); we just
+confirm the rule-based path picks up obvious correct/incorrect
+cases and routes the ambiguous ones to ``lexical_miss`` so the
+runner knows to consult the judge.
+"""
+
+from __future__ import annotations
+
+import pytest
+
+from surfsense_evals.suites.research.frames.grader import (
+    GradeResult,
+    _maybe_number,
+    _normalise,
+    _whole_word_substring,
+    grade_deterministic,
+)
+
+
+class TestNormalisation:
+    def test_lowercase_and_punct_stripped(self) -> None:
+        assert _normalise("Jane Ballou.") == "jane ballou"
+
+    def test_articles_removed(self) -> None:
+        assert _normalise("The Eiffel Tower") == "eiffel tower"
+
+    def test_whitespace_squashed(self) -> None:
+        assert _normalise("  multi   space\tinput  ") == "multi space input"
+
+    def test_empty_returns_empty(self) -> None:
+        assert _normalise("") == ""
+        assert _normalise(None) == ""  # type: ignore[arg-type]
+
+
+class TestNumericExtraction:
+    def test_simple_int(self) -> None:
+        assert _maybe_number("42") == 42.0
+
+    def test_int_with_commas(self) -> None:
+        assert _maybe_number("1,234") == 1234.0
+
+    def test_year_in_sentence(self) -> None:
+        assert _maybe_number("It was published in 1847.") == 1847.0
+
+    def test_word_number(self) -> None:
+        assert _maybe_number("five") == 5.0
+        assert _maybe_number("Twenty") == 20.0
+
+    def test_no_number_returns_none(self) -> None:
+        assert _maybe_number("Jane Ballou") is None
+        assert _maybe_number("") is None
+
+
+class TestWholeWordSubstring:
+    def test_phrase_match(self) -> None:
+        assert _whole_word_substring("president of the united states", "united states")
+
+    def test_word_boundary_required(self) -> None:
+        # "states" should NOT match inside "statesman"
+        assert not _whole_word_substring("the renowned statesman", "states")
+
+    def test_empty_needle(self) -> None:
+        assert not _whole_word_substring("anything", "")
+
+
+class TestExactMatch:
+    def test_identical(self) -> None:
+        r = grade_deterministic(pred="Jane Ballou", gold="Jane Ballou")
+        assert r.correct is True
+        assert r.method == "exact"
+
+    def test_case_insensitive(self) -> None:
+        r = grade_deterministic(pred="paris", gold="Paris")
+        assert r.correct is True
+        assert r.method == "exact"
+
+    def test_punctuation_ignored(self) -> None:
+        r = grade_deterministic(pred="Jane Ballou.", gold="Jane Ballou")
+        assert r.correct is True
+
+
+class TestNumericPath:
+    def test_int_match(self) -> None:
+        r = grade_deterministic(pred="The answer is 87", gold="87")
+        assert r.correct is True
+        assert r.method == "numeric"
+
+    def test_word_number_matches_digit(self) -> None:
+        r = grade_deterministic(pred="five", gold="5")
+        assert r.correct is True
+        assert r.method == "numeric"
+
+    def test_off_by_more_than_tolerance_fails(self) -> None:
+        r = grade_deterministic(pred="86", gold="87")
+        # 86 vs 87, abs diff = 1, tol = max(0.01*87, 0.5) = 0.87 → fails
+        assert r.correct is False
+        assert r.method == "numeric_miss"
+
+    def test_within_one_percent_passes(self) -> None:
+        r = grade_deterministic(pred="100", gold="101")
+        # 1.0 abs diff, tol = max(0.01*101, 0.5) = 1.01 → passes
+        assert r.correct is True
+
+
+class TestSubstringPath:
+    def test_pred_contains_gold(self) -> None:
+        r = grade_deterministic(
+            pred="The answer is Jane Ballou according to records",
+            gold="Jane Ballou",
+        )
+        assert r.correct is True
+        assert r.method == "substring"
+
+    def test_gold_contains_pred_with_minimum_length(self) -> None:
+        # Gold = "John F Kennedy", pred = "Kennedy" → reverse substring,
+        # ≥3 chars, but the FRAMES style usually accepts this.
+        r = grade_deterministic(pred="Kennedy", gold="John F. Kennedy")
+        assert r.correct is True
+        assert r.method == "substring_reverse"
+
+    def test_too_short_pred_no_reverse_credit(self) -> None:
+        r = grade_deterministic(pred="of", gold="World of Warcraft")
+        # "of" passes length but is a stopword; the article-stripping
+        # normaliser removes it from gold, so substring fails. Either
+        # way, the grader should NOT credit this.
+        assert r.correct is False
+
+
+class TestLexicalMiss:
+    def test_completely_different_pred_falls_through(self) -> None:
+        r = grade_deterministic(pred="London", gold="Paris")
+        assert r.correct is False
+        assert r.method == "lexical_miss"
+
+    def test_empty_pred(self) -> None:
+        r = grade_deterministic(pred="", gold="Paris")
+        assert r.correct is False
+        assert r.method == "empty_pred"
+
+    def test_empty_gold_defensive(self) -> None:
+        r = grade_deterministic(pred="something", gold="")
+        # Defensive guard — gold should never be empty in practice.
+        assert r.correct is False
+        assert r.method == "empty_gold"
+
+
+class TestGradeResultShape:
+    def test_dict_has_all_expected_keys(self) -> None:
+        r = grade_deterministic(pred="Paris", gold="Paris")
+        d = r.to_dict()
+        assert set(d) >= {
+            "correct",
+            "f1",
+            "method",
+            "normalised_pred",
+            "normalised_gold",
+            "judge_rationale",
+        }
diff --git a/surfsense_evals/tests/suites/test_frames_wiki_fetch.py b/surfsense_evals/tests/suites/test_frames_wiki_fetch.py
new file mode 100644
index 000000000..4941756f4
--- /dev/null
+++ b/surfsense_evals/tests/suites/test_frames_wiki_fetch.py
@@ -0,0 +1,112 @@
+"""Tests for the FRAMES Wikipedia fetcher.
+
+We mock the MW API with respx so tests are network-free. Coverage:
+
+* URL → title parsing (percent-encoded, underscores, redirects)
+* Filename safety (slashes, special chars)
+* Cache hit short-circuits the API call
+* Missing pages return ``None`` (not an exception)
+* Successful fetches write ``# Title`` markdown to disk
+"""
+
+from __future__ import annotations
+
+from pathlib import Path
+
+import httpx
+import pytest
+import respx
+
+from surfsense_evals.suites.research.frames.wiki_fetch import (
+    WIKI_API,
+    WikiFetcher,
+    cache_filename_for_title,
+    title_from_url,
+)
+
+
+class TestTitleFromUrl:
+    def test_basic(self) -> None:
+        assert title_from_url("https://en.wikipedia.org/wiki/James_Buchanan") == "James Buchanan"
+
+    def test_percent_encoded(self) -> None:
+        assert (
+            title_from_url("https://en.wikipedia.org/wiki/Charlotte_Bront%C3%AB")
+            == "Charlotte Brontë"
+        )
+
+    def test_query_string_dropped(self) -> None:
+        assert title_from_url("https://en.wikipedia.org/wiki/Foo?action=edit") == "Foo"
+
+    def test_non_wiki_raises(self) -> None:
+        with pytest.raises(ValueError):
+            title_from_url("https://example.com/wiki/Foo")
+
+
+class TestCacheFilename:
+    def test_simple(self) -> None:
+        assert cache_filename_for_title("James Buchanan") == "James_Buchanan.md"
+
+    def test_unicode_replaced_with_underscore(self) -> None:
+        # Brontë's diaeresis is non-ASCII so the regex replaces it with `_`.
+        # The space → `_` happens after the unicode swap, so the final
+        # name has exactly one underscore for the diaeresis. Acceptable:
+        # filenames stay round-trippable as long as the rule is deterministic.
+        assert cache_filename_for_title("Charlotte Brontë") == "Charlotte_Bront_.md"
+
+    def test_slashes_replaced(self) -> None:
+        # Wikipedia titles can contain ``/`` (e.g. "I/O"), which would
+        # break the filesystem layout if not sanitised.
+        assert cache_filename_for_title("I/O") == "I_O.md"
+
+
+@pytest.mark.asyncio
+@respx.mock
+async def test_fetch_success_writes_markdown(tmp_path: Path) -> None:
+    respx.get(WIKI_API).mock(return_value=httpx.Response(
+        200,
+        json={"query": {"pages": [{
+            "pageid": 1,
+            "title": "James Buchanan",
+            "extract": "James Buchanan was the 15th president of the United States.",
+        }]}},
+    ))
+    fetcher = WikiFetcher(cache_dir=tmp_path, rate_limit_rps=100)  # disable throttle
+    article = await fetcher.fetch("https://en.wikipedia.org/wiki/James_Buchanan")
+    assert article is not None
+    assert article.title == "James Buchanan"
+    body = article.markdown_path.read_text(encoding="utf-8")
+    assert body.startswith("# James Buchanan")
+    assert "15th president" in body
+
+
+@pytest.mark.asyncio
+@respx.mock
+async def test_fetch_missing_page_returns_none(tmp_path: Path) -> None:
+    respx.get(WIKI_API).mock(return_value=httpx.Response(
+        200,
+        json={"query": {"pages": [{
+            "title": "DoesNotExist",
+            "missing": True,
+        }]}},
+    ))
+    fetcher = WikiFetcher(cache_dir=tmp_path, rate_limit_rps=100)
+    article = await fetcher.fetch("https://en.wikipedia.org/wiki/DoesNotExist")
+    assert article is None
+    assert not (tmp_path / "DoesNotExist.md").exists()
+
+
+@pytest.mark.asyncio
+@respx.mock
+async def test_fetch_cache_hit_skips_api(tmp_path: Path) -> None:
+    # Pre-populate the cache.
+    cached = tmp_path / cache_filename_for_title("Cached Page")
+    cached.write_text("# Cached Page\n\nfrom disk\n", encoding="utf-8")
+    fetcher = WikiFetcher(cache_dir=tmp_path, rate_limit_rps=100)
+
+    # No respx mock registered; if the fetcher hits the network, respx
+    # would error out (it intercepts everything inside the decorator).
+    article = await fetcher.fetch("https://en.wikipedia.org/wiki/Cached_Page")
+    assert article is not None
+    assert article.markdown_path == cached
+    assert article.markdown_path.read_text(encoding="utf-8").endswith("from disk\n")
diff --git a/surfsense_evals/tests/suites/test_mmlongbench_grader.py b/surfsense_evals/tests/suites/test_mmlongbench_grader.py
new file mode 100644
index 000000000..92cd5f0cb
--- /dev/null
+++ b/surfsense_evals/tests/suites/test_mmlongbench_grader.py
@@ -0,0 +1,129 @@
+"""Tests for the MMLongBench-Doc format-aware grader.
+
+The grader is the critical correctness piece for the open-ended
+benchmark (no MCQ shortcut), so we cover all five formats with
+representative happy-path + edge-case rows.
+"""
+
+from __future__ import annotations
+
+import pytest
+
+from surfsense_evals.suites.multimodal_doc.mmlongbench.grader import grade
+
+
+class TestStrFormat:
+    def test_exact_match(self) -> None:
+        r = grade(pred="Apollo 11", gold="Apollo 11", answer_format="Str")
+        assert r.correct is True
+        assert r.f1 == 1.0
+        assert r.method == "str_norm"
+
+    def test_lowercase_normalised(self) -> None:
+        r = grade(pred="paris", gold="Paris", answer_format="Str")
+        assert r.correct is True
+
+    def test_punctuation_difference_drops_to_substring(self) -> None:
+        # "N.A.S.A." normalises to "n a s a" (whitespace tokens) which
+        # doesn't equal "nasa" — but the F1 token overlap is still 0
+        # because none of the single letters appear standalone in "nasa".
+        # We assert the grader fails closed rather than over-claiming.
+        r = grade(pred="N.A.S.A.", gold="NASA", answer_format="Str")
+        assert r.correct is False  # explicit: this is a failure mode we accept
+
+    def test_substring_credit(self) -> None:
+        r = grade(pred="The answer is Paris.", gold="Paris", answer_format="Str")
+        assert r.correct is True
+
+    def test_completely_wrong(self) -> None:
+        r = grade(pred="London", gold="Paris", answer_format="Str")
+        assert r.correct is False
+        assert r.f1 < 0.5
+
+    def test_empty_pred(self) -> None:
+        r = grade(pred="", gold="Paris", answer_format="Str")
+        assert r.correct is False
+        assert r.f1 == 0.0
+
+
+class TestIntFormat:
+    def test_exact_int(self) -> None:
+        assert grade(pred="42", gold="42", answer_format="Int").correct is True
+
+    def test_int_in_sentence(self) -> None:
+        assert grade(pred="The answer is 42 years.", gold="42", answer_format="Int").correct is True
+
+    def test_int_with_commas(self) -> None:
+        assert grade(pred="1,500", gold="1500", answer_format="Int").correct is True
+
+    def test_wrong_int(self) -> None:
+        assert grade(pred="41", gold="42", answer_format="Int").correct is False
+
+    def test_no_int_in_pred(self) -> None:
+        assert grade(pred="not answerable", gold="42", answer_format="Int").correct is False
+
+
+class TestFloatFormat:
+    def test_exact_float(self) -> None:
+        assert grade(pred="3.14", gold="3.14", answer_format="Float").correct is True
+
+    def test_within_tolerance(self) -> None:
+        # 1% tolerance — 3.14 vs 3.13 is well within.
+        assert grade(pred="3.13", gold="3.14", answer_format="Float").correct is True
+
+    def test_outside_tolerance(self) -> None:
+        assert grade(pred="3.5", gold="3.14", answer_format="Float").correct is False
+
+    def test_european_decimal_comma(self) -> None:
+        # ``3,14`` should parse as 3.14
+        assert grade(pred="3,14", gold="3.14", answer_format="Float").correct is True
+
+    def test_zero_gold_with_small_abs_diff(self) -> None:
+        # Absolute tolerance of 0.01 should kick in for near-zero golds.
+        assert grade(pred="0.005", gold="0", answer_format="Float").correct is True
+
+
+class TestListFormat:
+    def test_exact_set_match(self) -> None:
+        r = grade(pred="apple, banana, cherry", gold="apple, banana, cherry", answer_format="List")
+        assert r.correct is True
+        assert r.f1 == pytest.approx(1.0)
+
+    def test_set_match_different_order(self) -> None:
+        r = grade(pred="cherry, apple, banana", gold="apple, banana, cherry", answer_format="List")
+        assert r.correct is True
+
+    def test_partial_overlap_gives_f1(self) -> None:
+        r = grade(pred="apple, banana", gold="apple, banana, cherry", answer_format="List")
+        assert r.correct is False
+        assert 0.0 < r.f1 < 1.0
+
+    def test_extra_items_lower_precision(self) -> None:
+        r = grade(pred="apple, banana, cherry, date", gold="apple, banana, cherry", answer_format="List")
+        assert 0.0 < r.f1 < 1.0
+        # Recall=1, precision=3/4 → F1 ~= 0.857
+        assert r.f1 == pytest.approx(2 * (3 / 4) * 1 / (3 / 4 + 1), rel=1e-3)
+
+
+class TestNoneFormat:
+    def test_unknown_phrase_credited(self) -> None:
+        for phrase in ("Not answerable", "I cannot answer this.", "No answer", "N/A"):
+            r = grade(pred=phrase, gold="Not answerable", answer_format="None")
+            assert r.correct is True, phrase
+
+    def test_actual_answer_marked_wrong(self) -> None:
+        # The arm hallucinated an answer when it should have said "I don't know".
+        r = grade(pred="The answer is 42.", gold="Not answerable", answer_format="None")
+        assert r.correct is False
+
+
+class TestUnknownFormatFallsBackToStr:
+    def test_blank_format_uses_str_grader(self) -> None:
+        r = grade(pred="Paris", gold="Paris", answer_format="")
+        assert r.correct is True
+        assert r.method == "str_norm"
+
+    def test_garbage_format_uses_str_grader(self) -> None:
+        r = grade(pred="Paris", gold="Paris", answer_format="quux")
+        assert r.correct is True
+        assert r.method == "str_norm"
diff --git a/surfsense_evals/tests/test_integration_smoke.py b/surfsense_evals/tests/test_integration_smoke.py
new file mode 100644
index 000000000..493c04c25
--- /dev/null
+++ b/surfsense_evals/tests/test_integration_smoke.py
@@ -0,0 +1,35 @@
+"""Opt-in integration smoke against ``http://localhost:8000``.
+
+Run with ``pytest -m integration``. Skipped by default. Touches the
+real backend — requires it to be reachable, OPENROUTER_API_KEY
+unrelated, and one credential mode set.
+"""
+
+from __future__ import annotations
+
+import os
+
+import httpx
+import pytest
+
+from surfsense_evals.core.auth import acquire_token, client_with_auth
+from surfsense_evals.core.config import load_config
+
+pytestmark = pytest.mark.integration
+
+
+@pytest.mark.asyncio
+async def test_smoke_against_localhost():
+    if "SURFSENSE_API_BASE" not in os.environ:
+        pytest.skip("SURFSENSE_API_BASE not set; skipping integration smoke")
+    config = load_config()
+    if config.credential_mode() == "none":
+        pytest.skip("No credentials in environment; skipping integration smoke")
+    bundle = await acquire_token(config)
+    async with client_with_auth(config, bundle) as client:
+        response = await client.get(f"{config.surfsense_api_base}/api/v1/global-new-llm-configs")
+        try:
+            response.raise_for_status()
+        except httpx.HTTPStatusError as exc:
+            pytest.fail(f"Backend rejected smoke call: {exc!s}")
+        assert isinstance(response.json(), list)
diff --git a/surfsense_evals/uv.lock b/surfsense_evals/uv.lock
new file mode 100644
index 000000000..6c4fd7283
--- /dev/null
+++ b/surfsense_evals/uv.lock
@@ -0,0 +1,1742 @@
+version = 1
+revision = 1
+requires-python = ">=3.12"
+resolution-markers = [
+    "python_full_version >= '3.14' and sys_platform == 'win32'",
+    "python_full_version >= '3.14' and sys_platform == 'emscripten'",
+    "python_full_version >= '3.14' and sys_platform != 'emscripten' and sys_platform != 'win32'",
+    "python_full_version < '3.14' and sys_platform == 'win32'",
+    "python_full_version < '3.14' and sys_platform == 'emscripten'",
+    "python_full_version < '3.14' and sys_platform != 'emscripten' and sys_platform != 'win32'",
+]
+
+[[package]]
+name = "aiohappyeyeballs"
+version = "2.6.1"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/26/30/f84a107a9c4331c14b2b586036f40965c128aa4fee4dda5d3d51cb14ad54/aiohappyeyeballs-2.6.1.tar.gz", hash = "sha256:c3f9d0113123803ccadfdf3f0faa505bc78e6a72d1cc4806cbd719826e943558", size = 22760 }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/0f/15/5bf3b99495fb160b63f95972b81750f18f7f4e02ad051373b669d17d44f2/aiohappyeyeballs-2.6.1-py3-none-any.whl", hash = "sha256:f349ba8f4b75cb25c99c5c2d84e997e485204d2902a9597802b0371f09331fb8", size = 15265 },
+]
+
+[[package]]
+name = "aiohttp"
+version = "3.13.5"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "aiohappyeyeballs" },
+    { name = "aiosignal" },
+    { name = "attrs" },
+    { name = "frozenlist" },
+    { name = "multidict" },
+    { name = "propcache" },
+    { name = "yarl" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/77/9a/152096d4808df8e4268befa55fba462f440f14beab85e8ad9bf990516918/aiohttp-3.13.5.tar.gz", hash = "sha256:9d98cc980ecc96be6eb4c1994ce35d28d8b1f5e5208a23b421187d1209dbb7d1", size = 7858271 }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/be/6f/353954c29e7dcce7cf00280a02c75f30e133c00793c7a2ed3776d7b2f426/aiohttp-3.13.5-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:023ecba036ddd840b0b19bf195bfae970083fd7024ce1ac22e9bba90464620e9", size = 748876 },
+    { url = "https://files.pythonhosted.org/packages/f5/1b/428a7c64687b3b2e9cd293186695affc0e1e54a445d0361743b231f11066/aiohttp-3.13.5-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:15c933ad7920b7d9a20de151efcd05a6e38302cbf0e10c9b2acb9a42210a2416", size = 499557 },
+    { url = "https://files.pythonhosted.org/packages/29/47/7be41556bfbb6917069d6a6634bb7dd5e163ba445b783a90d40f5ac7e3a7/aiohttp-3.13.5-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:ab2899f9fa2f9f741896ebb6fa07c4c883bfa5c7f2ddd8cf2aafa86fa981b2d2", size = 500258 },
+    { url = "https://files.pythonhosted.org/packages/67/84/c9ecc5828cb0b3695856c07c0a6817a99d51e2473400f705275a2b3d9239/aiohttp-3.13.5-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:a60eaa2d440cd4707696b52e40ed3e2b0f73f65be07fd0ef23b6b539c9c0b0b4", size = 1749199 },
+    { url = "https://files.pythonhosted.org/packages/f0/d3/3c6d610e66b495657622edb6ae7c7fd31b2e9086b4ec50b47897ad6042a9/aiohttp-3.13.5-cp312-cp312-manylinux2014_armv7l.manylinux_2_17_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:55b3bdd3292283295774ab585160c4004f4f2f203946997f49aac032c84649e9", size = 1721013 },
+    { url = "https://files.pythonhosted.org/packages/49/a0/24409c12217456df0bae7babe3b014e460b0b38a8e60753d6cb339f6556d/aiohttp-3.13.5-cp312-cp312-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:c2b2355dc094e5f7d45a7bb262fe7207aa0460b37a0d87027dcf21b5d890e7d5", size = 1781501 },
+    { url = "https://files.pythonhosted.org/packages/98/9d/b65ec649adc5bccc008b0957a9a9c691070aeac4e41cea18559fef49958b/aiohttp-3.13.5-cp312-cp312-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:b38765950832f7d728297689ad78f5f2cf79ff82487131c4d26fe6ceecdc5f8e", size = 1878981 },
+    { url = "https://files.pythonhosted.org/packages/57/d8/8d44036d7eb7b6a8ec4c5494ea0c8c8b94fbc0ed3991c1a7adf230df03bf/aiohttp-3.13.5-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:b18f31b80d5a33661e08c89e202edabf1986e9b49c42b4504371daeaa11b47c1", size = 1767934 },
+    { url = "https://files.pythonhosted.org/packages/31/04/d3f8211f273356f158e3464e9e45484d3fb8c4ce5eb2f6fe9405c3273983/aiohttp-3.13.5-cp312-cp312-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:33add2463dde55c4f2d9635c6ab33ce154e5ecf322bd26d09af95c5f81cfa286", size = 1566671 },
+    { url = "https://files.pythonhosted.org/packages/41/db/073e4ebe00b78e2dfcacff734291651729a62953b48933d765dc513bf798/aiohttp-3.13.5-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:327cc432fdf1356fb4fbc6fe833ad4e9f6aacb71a8acaa5f1855e4b25910e4a9", size = 1705219 },
+    { url = "https://files.pythonhosted.org/packages/48/45/7dfba71a2f9fd97b15c95c06819de7eb38113d2cdb6319669195a7d64270/aiohttp-3.13.5-cp312-cp312-musllinux_1_2_armv7l.whl", hash = "sha256:7c35b0bf0b48a70b4cb4fc5d7bed9b932532728e124874355de1a0af8ec4bc88", size = 1743049 },
+    { url = "https://files.pythonhosted.org/packages/18/71/901db0061e0f717d226386a7f471bb59b19566f2cae5f0d93874b017271f/aiohttp-3.13.5-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:df23d57718f24badef8656c49743e11a89fd6f5358fa8a7b96e728fda2abf7d3", size = 1749557 },
+    { url = "https://files.pythonhosted.org/packages/08/d5/41eebd16066e59cd43728fe74bce953d7402f2b4ddfdfef2c0e9f17ca274/aiohttp-3.13.5-cp312-cp312-musllinux_1_2_riscv64.whl", hash = "sha256:02e048037a6501a5ec1f6fc9736135aec6eb8a004ce48838cb951c515f32c80b", size = 1558931 },
+    { url = "https://files.pythonhosted.org/packages/30/e6/4a799798bf05740e66c3a1161079bda7a3dd8e22ca392481d7a7f9af82a6/aiohttp-3.13.5-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:31cebae8b26f8a615d2b546fee45d5ffb76852ae6450e2a03f42c9102260d6fe", size = 1774125 },
+    { url = "https://files.pythonhosted.org/packages/84/63/7749337c90f92bc2cb18f9560d67aa6258c7060d1397d21529b8004fcf6f/aiohttp-3.13.5-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:888e78eb5ca55a615d285c3c09a7a91b42e9dd6fc699b166ebd5dee87c9ccf14", size = 1732427 },
+    { url = "https://files.pythonhosted.org/packages/98/de/cf2f44ff98d307e72fb97d5f5bbae3bfcb442f0ea9790c0bf5c5c2331404/aiohttp-3.13.5-cp312-cp312-win32.whl", hash = "sha256:8bd3ec6376e68a41f9f95f5ed170e2fcf22d4eb27a1f8cb361d0508f6e0557f3", size = 433534 },
+    { url = "https://files.pythonhosted.org/packages/aa/ca/eadf6f9c8fa5e31d40993e3db153fb5ed0b11008ad5d9de98a95045bed84/aiohttp-3.13.5-cp312-cp312-win_amd64.whl", hash = "sha256:110e448e02c729bcebb18c60b9214a87ba33bac4a9fa5e9a5f139938b56c6cb1", size = 460446 },
+    { url = "https://files.pythonhosted.org/packages/78/e9/d76bf503005709e390122d34e15256b88f7008e246c4bdbe915cd4f1adce/aiohttp-3.13.5-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:a5029cc80718bbd545123cd8fe5d15025eccaaaace5d0eeec6bd556ad6163d61", size = 742930 },
+    { url = "https://files.pythonhosted.org/packages/57/00/4b7b70223deaebd9bb85984d01a764b0d7bd6526fcdc73cca83bcbe7243e/aiohttp-3.13.5-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:4bb6bf5811620003614076bdc807ef3b5e38244f9d25ca5fe888eaccea2a9832", size = 496927 },
+    { url = "https://files.pythonhosted.org/packages/9c/f5/0fb20fb49f8efdcdce6cd8127604ad2c503e754a8f139f5e02b01626523f/aiohttp-3.13.5-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:a84792f8631bf5a94e52d9cc881c0b824ab42717165a5579c760b830d9392ac9", size = 497141 },
+    { url = "https://files.pythonhosted.org/packages/3b/86/b7c870053e36a94e8951b803cb5b909bfbc9b90ca941527f5fcafbf6b0fa/aiohttp-3.13.5-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:57653eac22c6a4c13eb22ecf4d673d64a12f266e72785ab1c8b8e5940d0e8090", size = 1732476 },
+    { url = "https://files.pythonhosted.org/packages/b5/e5/4e161f84f98d80c03a238671b4136e6530453d65262867d989bbe78244d0/aiohttp-3.13.5-cp313-cp313-manylinux2014_armv7l.manylinux_2_17_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:e5e5f7debc7a57af53fdf5c5009f9391d9f4c12867049d509bf7bb164a6e295b", size = 1706507 },
+    { url = "https://files.pythonhosted.org/packages/d4/56/ea11a9f01518bd5a2a2fcee869d248c4b8a0cfa0bb13401574fa31adf4d4/aiohttp-3.13.5-cp313-cp313-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:c719f65bebcdf6716f10e9eff80d27567f7892d8988c06de12bbbd39307c6e3a", size = 1773465 },
+    { url = "https://files.pythonhosted.org/packages/eb/40/333ca27fb74b0383f17c90570c748f7582501507307350a79d9f9f3c6eb1/aiohttp-3.13.5-cp313-cp313-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:d97f93fdae594d886c5a866636397e2bcab146fd7a132fd6bb9ce182224452f8", size = 1873523 },
+    { url = "https://files.pythonhosted.org/packages/f0/d2/e2f77eef1acb7111405433c707dc735e63f67a56e176e72e9e7a2cd3f493/aiohttp-3.13.5-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:3df334e39d4c2f899a914f1dba283c1aadc311790733f705182998c6f7cae665", size = 1754113 },
+    { url = "https://files.pythonhosted.org/packages/fb/56/3f653d7f53c89669301ec9e42c95233e2a0c0a6dd051269e6e678db4fdb0/aiohttp-3.13.5-cp313-cp313-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:fe6970addfea9e5e081401bcbadf865d2b6da045472f58af08427e108d618540", size = 1562351 },
+    { url = "https://files.pythonhosted.org/packages/ec/a6/9b3e91eb8ae791cce4ee736da02211c85c6f835f1bdfac0594a8a3b7018c/aiohttp-3.13.5-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:7becdf835feff2f4f335d7477f121af787e3504b48b449ff737afb35869ba7bb", size = 1693205 },
+    { url = "https://files.pythonhosted.org/packages/98/fc/bfb437a99a2fcebd6b6eaec609571954de2ed424f01c352f4b5504371dd3/aiohttp-3.13.5-cp313-cp313-musllinux_1_2_armv7l.whl", hash = "sha256:676e5651705ad5d8a70aeb8eb6936c436d8ebbd56e63436cb7dd9bb36d2a9a46", size = 1730618 },
+    { url = "https://files.pythonhosted.org/packages/e4/b6/c8534862126191a034f68153194c389addc285a0f1347d85096d349bbc15/aiohttp-3.13.5-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:9b16c653d38eb1a611cc898c41e76859ca27f119d25b53c12875fd0474ae31a8", size = 1745185 },
+    { url = "https://files.pythonhosted.org/packages/0b/93/4ca8ee2ef5236e2707e0fd5fecb10ce214aee1ff4ab307af9c558bda3b37/aiohttp-3.13.5-cp313-cp313-musllinux_1_2_riscv64.whl", hash = "sha256:999802d5fa0389f58decd24b537c54aa63c01c3219ce17d1214cbda3c2b22d2d", size = 1557311 },
+    { url = "https://files.pythonhosted.org/packages/57/ae/76177b15f18c5f5d094f19901d284025db28eccc5ae374d1d254181d33f4/aiohttp-3.13.5-cp313-cp313-musllinux_1_2_s390x.whl", hash = "sha256:ec707059ee75732b1ba130ed5f9580fe10ff75180c812bc267ded039db5128c6", size = 1773147 },
+    { url = "https://files.pythonhosted.org/packages/01/a4/62f05a0a98d88af59d93b7fcac564e5f18f513cb7471696ac286db970d6a/aiohttp-3.13.5-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:2d6d44a5b48132053c2f6cd5c8cb14bc67e99a63594e336b0f2af81e94d5530c", size = 1730356 },
+    { url = "https://files.pythonhosted.org/packages/e4/85/fc8601f59dfa8c9523808281f2da571f8b4699685f9809a228adcc90838d/aiohttp-3.13.5-cp313-cp313-win32.whl", hash = "sha256:329f292ed14d38a6c4c435e465f48bebb47479fd676a0411936cc371643225cc", size = 432637 },
+    { url = "https://files.pythonhosted.org/packages/c0/1b/ac685a8882896acf0f6b31d689e3792199cfe7aba37969fa91da63a7fa27/aiohttp-3.13.5-cp313-cp313-win_amd64.whl", hash = "sha256:69f571de7500e0557801c0b51f4780482c0ec5fe2ac851af5a92cfce1af1cb83", size = 458896 },
+    { url = "https://files.pythonhosted.org/packages/5d/ce/46572759afc859e867a5bc8ec3487315869013f59281ce61764f76d879de/aiohttp-3.13.5-cp314-cp314-macosx_10_13_universal2.whl", hash = "sha256:eb4639f32fd4a9904ab8fb45bf3383ba71137f3d9d4ba25b3b3f3109977c5b8c", size = 745721 },
+    { url = "https://files.pythonhosted.org/packages/13/fe/8a2efd7626dbe6049b2ef8ace18ffda8a4dfcbe1bcff3ac30c0c7575c20b/aiohttp-3.13.5-cp314-cp314-macosx_10_13_x86_64.whl", hash = "sha256:7e5dc4311bd5ac493886c63cbf76ab579dbe4641268e7c74e48e774c74b6f2be", size = 497663 },
+    { url = "https://files.pythonhosted.org/packages/9b/91/cc8cc78a111826c54743d88651e1687008133c37e5ee615fee9b57990fac/aiohttp-3.13.5-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:756c3c304d394977519824449600adaf2be0ccee76d206ee339c5e76b70ded25", size = 499094 },
+    { url = "https://files.pythonhosted.org/packages/0a/33/a8362cb15cf16a3af7e86ed11962d5cd7d59b449202dc576cdc731310bde/aiohttp-3.13.5-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:ecc26751323224cf8186efcf7fbcbc30f4e1d8c7970659daf25ad995e4032a56", size = 1726701 },
+    { url = "https://files.pythonhosted.org/packages/45/0c/c091ac5c3a17114bd76cbf85d674650969ddf93387876cf67f754204bd77/aiohttp-3.13.5-cp314-cp314-manylinux2014_armv7l.manylinux_2_17_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:10a75acfcf794edf9d8db50e5a7ec5fc818b2a8d3f591ce93bc7b1210df016d2", size = 1683360 },
+    { url = "https://files.pythonhosted.org/packages/23/73/bcee1c2b79bc275e964d1446c55c54441a461938e70267c86afaae6fba27/aiohttp-3.13.5-cp314-cp314-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:0f7a18f258d124cd678c5fe072fe4432a4d5232b0657fca7c1847f599233c83a", size = 1773023 },
+    { url = "https://files.pythonhosted.org/packages/c7/ef/720e639df03004fee2d869f771799d8c23046dec47d5b81e396c7cda583a/aiohttp-3.13.5-cp314-cp314-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:df6104c009713d3a89621096f3e3e88cc323fd269dbd7c20afe18535094320be", size = 1853795 },
+    { url = "https://files.pythonhosted.org/packages/bd/c9/989f4034fb46841208de7aeeac2c6d8300745ab4f28c42f629ba77c2d916/aiohttp-3.13.5-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:241a94f7de7c0c3b616627aaad530fe2cb620084a8b144d3be7b6ecfe95bae3b", size = 1730405 },
+    { url = "https://files.pythonhosted.org/packages/ce/75/ee1fd286ca7dc599d824b5651dad7b3be7ff8d9a7e7b3fe9820d9180f7db/aiohttp-3.13.5-cp314-cp314-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:c974fb66180e58709b6fc402846f13791240d180b74de81d23913abe48e96d94", size = 1558082 },
+    { url = "https://files.pythonhosted.org/packages/c3/20/1e9e6650dfc436340116b7aa89ff8cb2bbdf0abc11dfaceaad8f74273a10/aiohttp-3.13.5-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:6e27ea05d184afac78aabbac667450c75e54e35f62238d44463131bd3f96753d", size = 1692346 },
+    { url = "https://files.pythonhosted.org/packages/d8/40/8ebc6658d48ea630ac7903912fe0dd4e262f0e16825aa4c833c56c9f1f56/aiohttp-3.13.5-cp314-cp314-musllinux_1_2_armv7l.whl", hash = "sha256:a79a6d399cef33a11b6f004c67bb07741d91f2be01b8d712d52c75711b1e07c7", size = 1698891 },
+    { url = "https://files.pythonhosted.org/packages/d8/78/ea0ae5ec8ba7a5c10bdd6e318f1ba5e76fcde17db8275188772afc7917a4/aiohttp-3.13.5-cp314-cp314-musllinux_1_2_ppc64le.whl", hash = "sha256:c632ce9c0b534fbe25b52c974515ed674937c5b99f549a92127c85f771a78772", size = 1742113 },
+    { url = "https://files.pythonhosted.org/packages/8a/66/9d308ed71e3f2491be1acb8769d96c6f0c47d92099f3bc9119cada27b357/aiohttp-3.13.5-cp314-cp314-musllinux_1_2_riscv64.whl", hash = "sha256:fceedde51fbd67ee2bcc8c0b33d0126cc8b51ef3bbde2f86662bd6d5a6f10ec5", size = 1553088 },
+    { url = "https://files.pythonhosted.org/packages/da/a6/6cc25ed8dfc6e00c90f5c6d126a98e2cf28957ad06fa1036bd34b6f24a2c/aiohttp-3.13.5-cp314-cp314-musllinux_1_2_s390x.whl", hash = "sha256:f92995dfec9420bb69ae629abf422e516923ba79ba4403bc750d94fb4a6c68c1", size = 1757976 },
+    { url = "https://files.pythonhosted.org/packages/c1/2b/cce5b0ffe0de99c83e5e36d8f828e4161e415660a9f3e58339d07cce3006/aiohttp-3.13.5-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:20ae0ff08b1f2c8788d6fb85afcb798654ae6ba0b747575f8562de738078457b", size = 1712444 },
+    { url = "https://files.pythonhosted.org/packages/6c/cf/9e1795b4160c58d29421eafd1a69c6ce351e2f7c8d3c6b7e4ca44aea1a5b/aiohttp-3.13.5-cp314-cp314-win32.whl", hash = "sha256:b20df693de16f42b2472a9c485e1c948ee55524786a0a34345511afdd22246f3", size = 438128 },
+    { url = "https://files.pythonhosted.org/packages/22/4d/eaedff67fc805aeba4ba746aec891b4b24cebb1a7d078084b6300f79d063/aiohttp-3.13.5-cp314-cp314-win_amd64.whl", hash = "sha256:f85c6f327bf0b8c29da7d93b1cabb6363fb5e4e160a32fa241ed2dce21b73162", size = 464029 },
+    { url = "https://files.pythonhosted.org/packages/79/11/c27d9332ee20d68dd164dc12a6ecdef2e2e35ecc97ed6cf0d2442844624b/aiohttp-3.13.5-cp314-cp314t-macosx_10_13_universal2.whl", hash = "sha256:1efb06900858bb618ff5cee184ae2de5828896c448403d51fb633f09e109be0a", size = 778758 },
+    { url = "https://files.pythonhosted.org/packages/04/fb/377aead2e0a3ba5f09b7624f702a964bdf4f08b5b6728a9799830c80041e/aiohttp-3.13.5-cp314-cp314t-macosx_10_13_x86_64.whl", hash = "sha256:fee86b7c4bd29bdaf0d53d14739b08a106fdda809ca5fe032a15f52fae5fe254", size = 512883 },
+    { url = "https://files.pythonhosted.org/packages/bb/a6/aa109a33671f7a5d3bd78b46da9d852797c5e665bfda7d6b373f56bff2ec/aiohttp-3.13.5-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:20058e23909b9e65f9da62b396b77dfa95965cbe840f8def6e572538b1d32e36", size = 516668 },
+    { url = "https://files.pythonhosted.org/packages/79/b3/ca078f9f2fa9563c36fb8ef89053ea2bb146d6f792c5104574d49d8acb63/aiohttp-3.13.5-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:8cf20a8d6868cb15a73cab329ffc07291ba8c22b1b88176026106ae39aa6df0f", size = 1883461 },
+    { url = "https://files.pythonhosted.org/packages/b7/e3/a7ad633ca1ca497b852233a3cce6906a56c3225fb6d9217b5e5e60b7419d/aiohttp-3.13.5-cp314-cp314t-manylinux2014_armv7l.manylinux_2_17_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:330f5da04c987f1d5bdb8ae189137c77139f36bd1cb23779ca1a354a4b027800", size = 1747661 },
+    { url = "https://files.pythonhosted.org/packages/33/b9/cd6fe579bed34a906d3d783fe60f2fa297ef55b27bb4538438ee49d4dc41/aiohttp-3.13.5-cp314-cp314t-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:6f1cbf0c7926d315c3c26c2da41fd2b5d2fe01ac0e157b78caefc51a782196cf", size = 1863800 },
+    { url = "https://files.pythonhosted.org/packages/c0/3f/2c1e2f5144cefa889c8afd5cf431994c32f3b29da9961698ff4e3811b79a/aiohttp-3.13.5-cp314-cp314t-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:53fc049ed6390d05423ba33103ded7281fe897cf97878f369a527070bd95795b", size = 1958382 },
+    { url = "https://files.pythonhosted.org/packages/66/1d/f31ec3f1013723b3babe3609e7f119c2c2fb6ef33da90061a705ef3e1bc8/aiohttp-3.13.5-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:898703aa2667e3c5ca4c54ca36cd73f58b7a38ef87a5606414799ebce4d3fd3a", size = 1803724 },
+    { url = "https://files.pythonhosted.org/packages/0e/b4/57712dfc6f1542f067daa81eb61da282fab3e6f1966fca25db06c4fc62d5/aiohttp-3.13.5-cp314-cp314t-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:0494a01ca9584eea1e5fbd6d748e61ecff218c51b576ee1999c23db7066417d8", size = 1640027 },
+    { url = "https://files.pythonhosted.org/packages/25/3c/734c878fb43ec083d8e31bf029daae1beafeae582d1b35da234739e82ee7/aiohttp-3.13.5-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:6cf81fe010b8c17b09495cbd15c1d35afbc8fb405c0c9cf4738e5ae3af1d65be", size = 1806644 },
+    { url = "https://files.pythonhosted.org/packages/20/a5/f671e5cbec1c21d044ff3078223f949748f3a7f86b14e34a365d74a5d21f/aiohttp-3.13.5-cp314-cp314t-musllinux_1_2_armv7l.whl", hash = "sha256:c564dd5f09ddc9d8f2c2d0a301cd30a79a2cc1b46dd1a73bef8f0038863d016b", size = 1791630 },
+    { url = "https://files.pythonhosted.org/packages/0b/63/fb8d0ad63a0b8a99be97deac8c04dacf0785721c158bdf23d679a87aa99e/aiohttp-3.13.5-cp314-cp314t-musllinux_1_2_ppc64le.whl", hash = "sha256:2994be9f6e51046c4f864598fd9abeb4fba6e88f0b2152422c9666dcd4aea9c6", size = 1809403 },
+    { url = "https://files.pythonhosted.org/packages/59/0c/bfed7f30662fcf12206481c2aac57dedee43fe1c49275e85b3a1e1742294/aiohttp-3.13.5-cp314-cp314t-musllinux_1_2_riscv64.whl", hash = "sha256:157826e2fa245d2ef46c83ea8a5faf77ca19355d278d425c29fda0beb3318037", size = 1634924 },
+    { url = "https://files.pythonhosted.org/packages/17/d6/fd518d668a09fd5a3319ae5e984d4d80b9a4b3df4e21c52f02251ef5a32e/aiohttp-3.13.5-cp314-cp314t-musllinux_1_2_s390x.whl", hash = "sha256:a8aca50daa9493e9e13c0f566201a9006f080e7c50e5e90d0b06f53146a54500", size = 1836119 },
+    { url = "https://files.pythonhosted.org/packages/78/b7/15fb7a9d52e112a25b621c67b69c167805cb1f2ab8f1708a5c490d1b52fe/aiohttp-3.13.5-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:3b13560160d07e047a93f23aaa30718606493036253d5430887514715b67c9d9", size = 1772072 },
+    { url = "https://files.pythonhosted.org/packages/7e/df/57ba7f0c4a553fc2bd8b6321df236870ec6fd64a2a473a8a13d4f733214e/aiohttp-3.13.5-cp314-cp314t-win32.whl", hash = "sha256:9a0f4474b6ea6818b41f82172d799e4b3d29e22c2c520ce4357856fced9af2f8", size = 471819 },
+    { url = "https://files.pythonhosted.org/packages/62/29/2f8418269e46454a26171bfdd6a055d74febf32234e474930f2f60a17145/aiohttp-3.13.5-cp314-cp314t-win_amd64.whl", hash = "sha256:18a2f6c1182c51baa1d28d68fea51513cb2a76612f038853c0ad3c145423d3d9", size = 505441 },
+]
+
+[[package]]
+name = "aiosignal"
+version = "1.4.0"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "frozenlist" },
+    { name = "typing-extensions", marker = "python_full_version < '3.13'" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/61/62/06741b579156360248d1ec624842ad0edf697050bbaf7c3e46394e106ad1/aiosignal-1.4.0.tar.gz", hash = "sha256:f47eecd9468083c2029cc99945502cb7708b082c232f9aca65da147157b251c7", size = 25007 }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/fb/76/641ae371508676492379f16e2fa48f4e2c11741bd63c48be4b12a6b09cba/aiosignal-1.4.0-py3-none-any.whl", hash = "sha256:053243f8b92b990551949e63930a839ff0cf0b0ebbe0597b0f3fb19e1a0fe82e", size = 7490 },
+]
+
+[[package]]
+name = "annotated-doc"
+version = "0.0.4"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/57/ba/046ceea27344560984e26a590f90bc7f4a75b06701f653222458922b558c/annotated_doc-0.0.4.tar.gz", hash = "sha256:fbcda96e87e9c92ad167c2e53839e57503ecfda18804ea28102353485033faa4", size = 7288 }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/1e/d3/26bf1008eb3d2daa8ef4cacc7f3bfdc11818d111f7e2d0201bc6e3b49d45/annotated_doc-0.0.4-py3-none-any.whl", hash = "sha256:571ac1dc6991c450b25a9c2d84a3705e2ae7a53467b5d111c24fa8baabbed320", size = 5303 },
+]
+
+[[package]]
+name = "annotated-types"
+version = "0.7.0"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/ee/67/531ea369ba64dcff5ec9c3402f9f51bf748cec26dde048a2f973a4eea7f5/annotated_types-0.7.0.tar.gz", hash = "sha256:aff07c09a53a08bc8cfccb9c85b05f1aa9a2a6f23728d790723543408344ce89", size = 16081 }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/78/b6/6307fbef88d9b5ee7421e68d78a9f162e0da4900bc5f5793f6d3d0e34fb8/annotated_types-0.7.0-py3-none-any.whl", hash = "sha256:1f02e8b43a8fbbc3f3e0d4f0f4bfc8131bcb4eebe8849b8e5c773f3a1c582a53", size = 13643 },
+]
+
+[[package]]
+name = "anyio"
+version = "4.13.0"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "idna" },
+    { name = "typing-extensions", marker = "python_full_version < '3.13'" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/19/14/2c5dd9f512b66549ae92767a9c7b330ae88e1932ca57876909410251fe13/anyio-4.13.0.tar.gz", hash = "sha256:334b70e641fd2221c1505b3890c69882fe4a2df910cba14d97019b90b24439dc", size = 231622 }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/da/42/e921fccf5015463e32a3cf6ee7f980a6ed0f395ceeaa45060b61d86486c2/anyio-4.13.0-py3-none-any.whl", hash = "sha256:08b310f9e24a9594186fd75b4f73f4a4152069e3853f1ed8bfbf58369f4ad708", size = 114353 },
+]
+
+[[package]]
+name = "attrs"
+version = "26.1.0"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/9a/8e/82a0fe20a541c03148528be8cac2408564a6c9a0cc7e9171802bc1d26985/attrs-26.1.0.tar.gz", hash = "sha256:d03ceb89cb322a8fd706d4fb91940737b6642aa36998fe130a9bc96c985eff32", size = 952055 }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/64/b4/17d4b0b2a2dc85a6df63d1157e028ed19f90d4cd97c36717afef2bc2f395/attrs-26.1.0-py3-none-any.whl", hash = "sha256:c647aa4a12dfbad9333ca4e71fe62ddc36f4e63b2d260a37a8b83d2f043ac309", size = 67548 },
+]
+
+[[package]]
+name = "certifi"
+version = "2026.4.22"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/25/ee/6caf7a40c36a1220410afe15a1cc64993a1f864871f698c0f93acb72842a/certifi-2026.4.22.tar.gz", hash = "sha256:8d455352a37b71bf76a79caa83a3d6c25afee4a385d632127b6afb3963f1c580", size = 137077 }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/22/30/7cd8fdcdfbc5b869528b079bfb76dcdf6056b1a2097a662e5e8c04f42965/certifi-2026.4.22-py3-none-any.whl", hash = "sha256:3cb2210c8f88ba2318d29b0388d1023c8492ff72ecdde4ebdaddbb13a31b1c4a", size = 135707 },
+]
+
+[[package]]
+name = "charset-normalizer"
+version = "3.4.7"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/e7/a1/67fe25fac3c7642725500a3f6cfe5821ad557c3abb11c9d20d12c7008d3e/charset_normalizer-3.4.7.tar.gz", hash = "sha256:ae89db9e5f98a11a4bf50407d4363e7b09b31e55bc117b4f7d80aab97ba009e5", size = 144271 }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/0c/eb/4fc8d0a7110eb5fc9cc161723a34a8a6c200ce3b4fbf681bc86feee22308/charset_normalizer-3.4.7-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:eca9705049ad3c7345d574e3510665cb2cf844c2f2dcfe675332677f081cbd46", size = 311328 },
+    { url = "https://files.pythonhosted.org/packages/f8/e3/0fadc706008ac9d7b9b5be6dc767c05f9d3e5df51744ce4cc9605de7b9f4/charset_normalizer-3.4.7-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:6178f72c5508bfc5fd446a5905e698c6212932f25bcdd4b47a757a50605a90e2", size = 208061 },
+    { url = "https://files.pythonhosted.org/packages/42/f0/3dd1045c47f4a4604df85ec18ad093912ae1344ac706993aff91d38773a2/charset_normalizer-3.4.7-cp312-cp312-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:e1421b502d83040e6d7fb2fb18dff63957f720da3d77b2fbd3187ceb63755d7b", size = 229031 },
+    { url = "https://files.pythonhosted.org/packages/dc/67/675a46eb016118a2fbde5a277a5d15f4f69d5f3f5f338e5ee2f8948fcf43/charset_normalizer-3.4.7-cp312-cp312-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:edac0f1ab77644605be2cbba52e6b7f630731fc42b34cb0f634be1a6eface56a", size = 225239 },
+    { url = "https://files.pythonhosted.org/packages/4b/f8/d0118a2f5f23b02cd166fa385c60f9b0d4f9194f574e2b31cef350ad7223/charset_normalizer-3.4.7-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:5649fd1c7bade02f320a462fdefd0b4bd3ce036065836d4f42e0de958038e116", size = 216589 },
+    { url = "https://files.pythonhosted.org/packages/b1/f1/6d2b0b261b6c4ceef0fcb0d17a01cc5bc53586c2d4796fa04b5c540bc13d/charset_normalizer-3.4.7-cp312-cp312-manylinux_2_31_armv7l.whl", hash = "sha256:203104ed3e428044fd943bc4bf45fa73c0730391f9621e37fe39ecf477b128cb", size = 202733 },
+    { url = "https://files.pythonhosted.org/packages/6f/c0/7b1f943f7e87cc3db9626ba17807d042c38645f0a1d4415c7a14afb5591f/charset_normalizer-3.4.7-cp312-cp312-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:298930cec56029e05497a76988377cbd7457ba864beeea92ad7e844fe74cd1f1", size = 212652 },
+    { url = "https://files.pythonhosted.org/packages/38/dd/5a9ab159fe45c6e72079398f277b7d2b523e7f716acc489726115a910097/charset_normalizer-3.4.7-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:708838739abf24b2ceb208d0e22403dd018faeef86ddac04319a62ae884c4f15", size = 211229 },
+    { url = "https://files.pythonhosted.org/packages/d5/ff/531a1cad5ca855d1c1a8b69cb71abfd6d85c0291580146fda7c82857caa1/charset_normalizer-3.4.7-cp312-cp312-musllinux_1_2_armv7l.whl", hash = "sha256:0f7eb884681e3938906ed0434f20c63046eacd0111c4ba96f27b76084cd679f5", size = 203552 },
+    { url = "https://files.pythonhosted.org/packages/c1/4c/a5fb52d528a8ca41f7598cb619409ece30a169fbdf9cdce592e53b46c3a6/charset_normalizer-3.4.7-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:4dc1e73c36828f982bfe79fadf5919923f8a6f4df2860804db9a98c48824ce8d", size = 230806 },
+    { url = "https://files.pythonhosted.org/packages/59/7a/071feed8124111a32b316b33ae4de83d36923039ef8cf48120266844285b/charset_normalizer-3.4.7-cp312-cp312-musllinux_1_2_riscv64.whl", hash = "sha256:aed52fea0513bac0ccde438c188c8a471c4e0f457c2dd20cdbf6ea7a450046c7", size = 212316 },
+    { url = "https://files.pythonhosted.org/packages/fd/35/f7dba3994312d7ba508e041eaac39a36b120f32d4c8662b8814dab876431/charset_normalizer-3.4.7-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:fea24543955a6a729c45a73fe90e08c743f0b3334bbf3201e6c4bc1b0c7fa464", size = 227274 },
+    { url = "https://files.pythonhosted.org/packages/8a/2d/a572df5c9204ab7688ec1edc895a73ebded3b023bb07364710b05dd1c9be/charset_normalizer-3.4.7-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:bb6d88045545b26da47aa879dd4a89a71d1dce0f0e549b1abcb31dfe4a8eac49", size = 218468 },
+    { url = "https://files.pythonhosted.org/packages/86/eb/890922a8b03a568ca2f336c36585a4713c55d4d67bf0f0c78924be6315ca/charset_normalizer-3.4.7-cp312-cp312-win32.whl", hash = "sha256:2257141f39fe65a3fdf38aeccae4b953e5f3b3324f4ff0daf9f15b8518666a2c", size = 148460 },
+    { url = "https://files.pythonhosted.org/packages/35/d9/0e7dffa06c5ab081f75b1b786f0aefc88365825dfcd0ac544bdb7b2b6853/charset_normalizer-3.4.7-cp312-cp312-win_amd64.whl", hash = "sha256:5ed6ab538499c8644b8a3e18debabcd7ce684f3fa91cf867521a7a0279cab2d6", size = 159330 },
+    { url = "https://files.pythonhosted.org/packages/9e/5d/481bcc2a7c88ea6b0878c299547843b2521ccbc40980cb406267088bc701/charset_normalizer-3.4.7-cp312-cp312-win_arm64.whl", hash = "sha256:56be790f86bfb2c98fb742ce566dfb4816e5a83384616ab59c49e0604d49c51d", size = 147828 },
+    { url = "https://files.pythonhosted.org/packages/c1/3b/66777e39d3ae1ddc77ee606be4ec6d8cbd4c801f65e5a1b6f2b11b8346dd/charset_normalizer-3.4.7-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:f496c9c3cc02230093d8330875c4c3cdfc3b73612a5fd921c65d39cbcef08063", size = 309627 },
+    { url = "https://files.pythonhosted.org/packages/2e/4e/b7f84e617b4854ade48a1b7915c8ccfadeba444d2a18c291f696e37f0d3b/charset_normalizer-3.4.7-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:0ea948db76d31190bf08bd371623927ee1339d5f2a0b4b1b4a4439a65298703c", size = 207008 },
+    { url = "https://files.pythonhosted.org/packages/c4/bb/ec73c0257c9e11b268f018f068f5d00aa0ef8c8b09f7753ebd5f2880e248/charset_normalizer-3.4.7-cp313-cp313-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:a277ab8928b9f299723bc1a2dabb1265911b1a76341f90a510368ca44ad9ab66", size = 228303 },
+    { url = "https://files.pythonhosted.org/packages/85/fb/32d1f5033484494619f701e719429c69b766bfc4dbc61aa9e9c8c166528b/charset_normalizer-3.4.7-cp313-cp313-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:3bec022aec2c514d9cf199522a802bd007cd588ab17ab2525f20f9c34d067c18", size = 224282 },
+    { url = "https://files.pythonhosted.org/packages/fa/07/330e3a0dda4c404d6da83b327270906e9654a24f6c546dc886a0eb0ffb23/charset_normalizer-3.4.7-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:e044c39e41b92c845bc815e5ae4230804e8e7bc29e399b0437d64222d92809dd", size = 215595 },
+    { url = "https://files.pythonhosted.org/packages/e3/7c/fc890655786e423f02556e0216d4b8c6bcb6bdfa890160dc66bf52dee468/charset_normalizer-3.4.7-cp313-cp313-manylinux_2_31_armv7l.whl", hash = "sha256:f495a1652cf3fbab2eb0639776dad966c2fb874d79d87ca07f9d5f059b8bd215", size = 201986 },
+    { url = "https://files.pythonhosted.org/packages/d8/97/bfb18b3db2aed3b90cf54dc292ad79fdd5ad65c4eae454099475cbeadd0d/charset_normalizer-3.4.7-cp313-cp313-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:e712b419df8ba5e42b226c510472b37bd57b38e897d3eca5e8cfd410a29fa859", size = 211711 },
+    { url = "https://files.pythonhosted.org/packages/6f/a5/a581c13798546a7fd557c82614a5c65a13df2157e9ad6373166d2a3e645d/charset_normalizer-3.4.7-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:7804338df6fcc08105c7745f1502ba68d900f45fd770d5bdd5288ddccb8a42d8", size = 210036 },
+    { url = "https://files.pythonhosted.org/packages/8c/bf/b3ab5bcb478e4193d517644b0fb2bf5497fbceeaa7a1bc0f4d5b50953861/charset_normalizer-3.4.7-cp313-cp313-musllinux_1_2_armv7l.whl", hash = "sha256:481551899c856c704d58119b5025793fa6730adda3571971af568f66d2424bb5", size = 202998 },
+    { url = "https://files.pythonhosted.org/packages/e7/4e/23efd79b65d314fa320ec6017b4b5834d5c12a58ba4610aa353af2e2f577/charset_normalizer-3.4.7-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:f59099f9b66f0d7145115e6f80dd8b1d847176df89b234a5a6b3f00437aa0832", size = 230056 },
+    { url = "https://files.pythonhosted.org/packages/b9/9f/1e1941bc3f0e01df116e68dc37a55c4d249df5e6fa77f008841aef68264f/charset_normalizer-3.4.7-cp313-cp313-musllinux_1_2_riscv64.whl", hash = "sha256:f59ad4c0e8f6bba240a9bb85504faa1ab438237199d4cce5f622761507b8f6a6", size = 211537 },
+    { url = "https://files.pythonhosted.org/packages/80/0f/088cbb3020d44428964a6c97fe1edfb1b9550396bf6d278330281e8b709c/charset_normalizer-3.4.7-cp313-cp313-musllinux_1_2_s390x.whl", hash = "sha256:3dedcc22d73ec993f42055eff4fcfed9318d1eeb9a6606c55892a26964964e48", size = 226176 },
+    { url = "https://files.pythonhosted.org/packages/6a/9f/130394f9bbe06f4f63e22641d32fc9b202b7e251c9aef4db044324dac493/charset_normalizer-3.4.7-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:64f02c6841d7d83f832cd97ccf8eb8a906d06eb95d5276069175c696b024b60a", size = 217723 },
+    { url = "https://files.pythonhosted.org/packages/73/55/c469897448a06e49f8fa03f6caae97074fde823f432a98f979cc42b90e69/charset_normalizer-3.4.7-cp313-cp313-win32.whl", hash = "sha256:4042d5c8f957e15221d423ba781e85d553722fc4113f523f2feb7b188cc34c5e", size = 148085 },
+    { url = "https://files.pythonhosted.org/packages/5d/78/1b74c5bbb3f99b77a1715c91b3e0b5bdb6fe302d95ace4f5b1bec37b0167/charset_normalizer-3.4.7-cp313-cp313-win_amd64.whl", hash = "sha256:3946fa46a0cf3e4c8cb1cc52f56bb536310d34f25f01ca9b6c16afa767dab110", size = 158819 },
+    { url = "https://files.pythonhosted.org/packages/68/86/46bd42279d323deb8687c4a5a811fd548cb7d1de10cf6535d099877a9a9f/charset_normalizer-3.4.7-cp313-cp313-win_arm64.whl", hash = "sha256:80d04837f55fc81da168b98de4f4b797ef007fc8a79ab71c6ec9bc4dd662b15b", size = 147915 },
+    { url = "https://files.pythonhosted.org/packages/97/c8/c67cb8c70e19ef1960b97b22ed2a1567711de46c4ddf19799923adc836c2/charset_normalizer-3.4.7-cp314-cp314-macosx_10_15_universal2.whl", hash = "sha256:c36c333c39be2dbca264d7803333c896ab8fa7d4d6f0ab7edb7dfd7aea6e98c0", size = 309234 },
+    { url = "https://files.pythonhosted.org/packages/99/85/c091fdee33f20de70d6c8b522743b6f831a2f1cd3ff86de4c6a827c48a76/charset_normalizer-3.4.7-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:1c2aed2e5e41f24ea8ef1590b8e848a79b56f3a5564a65ceec43c9d692dc7d8a", size = 208042 },
+    { url = "https://files.pythonhosted.org/packages/87/1c/ab2ce611b984d2fd5d86a5a8a19c1ae26acac6bad967da4967562c75114d/charset_normalizer-3.4.7-cp314-cp314-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:54523e136b8948060c0fa0bc7b1b50c32c186f2fceee897a495406bb6e311d2b", size = 228706 },
+    { url = "https://files.pythonhosted.org/packages/a8/29/2b1d2cb00bf085f59d29eb773ce58ec2d325430f8c216804a0a5cd83cbca/charset_normalizer-3.4.7-cp314-cp314-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:715479b9a2802ecac752a3b0efa2b0b60285cf962ee38414211abdfccc233b41", size = 224727 },
+    { url = "https://files.pythonhosted.org/packages/47/5c/032c2d5a07fe4d4855fea851209cca2b6f03ebeb6d4e3afdb3358386a684/charset_normalizer-3.4.7-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:bd6c2a1c7573c64738d716488d2cdd3c00e340e4835707d8fdb8dc1a66ef164e", size = 215882 },
+    { url = "https://files.pythonhosted.org/packages/2c/c2/356065d5a8b78ed04499cae5f339f091946a6a74f91e03476c33f0ab7100/charset_normalizer-3.4.7-cp314-cp314-manylinux_2_31_armv7l.whl", hash = "sha256:c45e9440fb78f8ddabcf714b68f936737a121355bf59f3907f4e17721b9d1aae", size = 200860 },
+    { url = "https://files.pythonhosted.org/packages/0c/cd/a32a84217ced5039f53b29f460962abb2d4420def55afabe45b1c3c7483d/charset_normalizer-3.4.7-cp314-cp314-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:3534e7dcbdcf757da6b85a0bbf5b6868786d5982dd959b065e65481644817a18", size = 211564 },
+    { url = "https://files.pythonhosted.org/packages/44/86/58e6f13ce26cc3b8f4a36b94a0f22ae2f00a72534520f4ae6857c4b81f89/charset_normalizer-3.4.7-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:e8ac484bf18ce6975760921bb6148041faa8fef0547200386ea0b52b5d27bf7b", size = 211276 },
+    { url = "https://files.pythonhosted.org/packages/8f/fe/d17c32dc72e17e155e06883efa84514ca375f8a528ba2546bee73fc4df81/charset_normalizer-3.4.7-cp314-cp314-musllinux_1_2_armv7l.whl", hash = "sha256:a5fe03b42827c13cdccd08e6c0247b6a6d4b5e3cdc53fd1749f5896adcdc2356", size = 201238 },
+    { url = "https://files.pythonhosted.org/packages/6a/29/f33daa50b06525a237451cdb6c69da366c381a3dadcd833fa5676bc468b3/charset_normalizer-3.4.7-cp314-cp314-musllinux_1_2_ppc64le.whl", hash = "sha256:2d6eb928e13016cea4f1f21d1e10c1cebd5a421bc57ddf5b1142ae3f86824fab", size = 230189 },
+    { url = "https://files.pythonhosted.org/packages/b6/6e/52c84015394a6a0bdcd435210a7e944c5f94ea1055f5cc5d56c5fe368e7b/charset_normalizer-3.4.7-cp314-cp314-musllinux_1_2_riscv64.whl", hash = "sha256:e74327fb75de8986940def6e8dee4f127cc9752bee7355bb323cc5b2659b6d46", size = 211352 },
+    { url = "https://files.pythonhosted.org/packages/8c/d7/4353be581b373033fb9198bf1da3cf8f09c1082561e8e922aa7b39bf9fe8/charset_normalizer-3.4.7-cp314-cp314-musllinux_1_2_s390x.whl", hash = "sha256:d6038d37043bced98a66e68d3aa2b6a35505dc01328cd65217cefe82f25def44", size = 227024 },
+    { url = "https://files.pythonhosted.org/packages/30/45/99d18aa925bd1740098ccd3060e238e21115fffbfdcb8f3ece837d0ace6c/charset_normalizer-3.4.7-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:7579e913a5339fb8fa133f6bbcfd8e6749696206cf05acdbdca71a1b436d8e72", size = 217869 },
+    { url = "https://files.pythonhosted.org/packages/5c/05/5ee478aa53f4bb7996482153d4bfe1b89e0f087f0ab6b294fcf92d595873/charset_normalizer-3.4.7-cp314-cp314-win32.whl", hash = "sha256:5b77459df20e08151cd6f8b9ef8ef1f961ef73d85c21a555c7eed5b79410ec10", size = 148541 },
+    { url = "https://files.pythonhosted.org/packages/48/77/72dcb0921b2ce86420b2d79d454c7022bf5be40202a2a07906b9f2a35c97/charset_normalizer-3.4.7-cp314-cp314-win_amd64.whl", hash = "sha256:92a0a01ead5e668468e952e4238cccd7c537364eb7d851ab144ab6627dbbe12f", size = 159634 },
+    { url = "https://files.pythonhosted.org/packages/c6/a3/c2369911cd72f02386e4e340770f6e158c7980267da16af8f668217abaa0/charset_normalizer-3.4.7-cp314-cp314-win_arm64.whl", hash = "sha256:67f6279d125ca0046a7fd386d01b311c6363844deac3e5b069b514ba3e63c246", size = 148384 },
+    { url = "https://files.pythonhosted.org/packages/94/09/7e8a7f73d24dba1f0035fbbf014d2c36828fc1bf9c88f84093e57d315935/charset_normalizer-3.4.7-cp314-cp314t-macosx_10_15_universal2.whl", hash = "sha256:effc3f449787117233702311a1b7d8f59cba9ced946ba727bdc329ec69028e24", size = 330133 },
+    { url = "https://files.pythonhosted.org/packages/8d/da/96975ddb11f8e977f706f45cddd8540fd8242f71ecdb5d18a80723dcf62c/charset_normalizer-3.4.7-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:fbccdc05410c9ee21bbf16a35f4c1d16123dcdeb8a1d38f33654fa21d0234f79", size = 216257 },
+    { url = "https://files.pythonhosted.org/packages/e5/e8/1d63bf8ef2d388e95c64b2098f45f84758f6d102a087552da1485912637b/charset_normalizer-3.4.7-cp314-cp314t-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:733784b6d6def852c814bce5f318d25da2ee65dd4839a0718641c696e09a2960", size = 234851 },
+    { url = "https://files.pythonhosted.org/packages/9b/40/e5ff04233e70da2681fa43969ad6f66ca5611d7e669be0246c4c7aaf6dc8/charset_normalizer-3.4.7-cp314-cp314t-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:a89c23ef8d2c6b27fd200a42aa4ac72786e7c60d40efdc76e6011260b6e949c4", size = 233393 },
+    { url = "https://files.pythonhosted.org/packages/be/c1/06c6c49d5a5450f76899992f1ee40b41d076aee9279b49cf9974d2f313d5/charset_normalizer-3.4.7-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:6c114670c45346afedc0d947faf3c7f701051d2518b943679c8ff88befe14f8e", size = 223251 },
+    { url = "https://files.pythonhosted.org/packages/2b/9f/f2ff16fb050946169e3e1f82134d107e5d4ae72647ec8a1b1446c148480f/charset_normalizer-3.4.7-cp314-cp314t-manylinux_2_31_armv7l.whl", hash = "sha256:a180c5e59792af262bf263b21a3c49353f25945d8d9f70628e73de370d55e1e1", size = 206609 },
+    { url = "https://files.pythonhosted.org/packages/69/d5/a527c0cd8d64d2eab7459784fb4169a0ac76e5a6fc5237337982fd61347e/charset_normalizer-3.4.7-cp314-cp314t-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:3c9a494bc5ec77d43cea229c4f6db1e4d8fe7e1bbffa8b6f0f0032430ff8ab44", size = 220014 },
+    { url = "https://files.pythonhosted.org/packages/7e/80/8a7b8104a3e203074dc9aa2c613d4b726c0e136bad1cc734594b02867972/charset_normalizer-3.4.7-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:8d828b6667a32a728a1ad1d93957cdf37489c57b97ae6c4de2860fa749b8fc1e", size = 218979 },
+    { url = "https://files.pythonhosted.org/packages/02/9a/b759b503d507f375b2b5c153e4d2ee0a75aa215b7f2489cf314f4541f2c0/charset_normalizer-3.4.7-cp314-cp314t-musllinux_1_2_armv7l.whl", hash = "sha256:cf1493cd8607bec4d8a7b9b004e699fcf8f9103a9284cc94962cb73d20f9d4a3", size = 209238 },
+    { url = "https://files.pythonhosted.org/packages/c2/4e/0f3f5d47b86bdb79256e7290b26ac847a2832d9a4033f7eb2cd4bcf4bb5b/charset_normalizer-3.4.7-cp314-cp314t-musllinux_1_2_ppc64le.whl", hash = "sha256:0c96c3b819b5c3e9e165495db84d41914d6894d55181d2d108cc1a69bfc9cce0", size = 236110 },
+    { url = "https://files.pythonhosted.org/packages/96/23/bce28734eb3ed2c91dcf93abeb8a5cf393a7b2749725030bb630e554fdd8/charset_normalizer-3.4.7-cp314-cp314t-musllinux_1_2_riscv64.whl", hash = "sha256:752a45dc4a6934060b3b0dab47e04edc3326575f82be64bc4fc293914566503e", size = 219824 },
+    { url = "https://files.pythonhosted.org/packages/2c/6f/6e897c6984cc4d41af319b077f2f600fc8214eb2fe2d6bcb79141b882400/charset_normalizer-3.4.7-cp314-cp314t-musllinux_1_2_s390x.whl", hash = "sha256:8778f0c7a52e56f75d12dae53ae320fae900a8b9b4164b981b9c5ce059cd1fcb", size = 233103 },
+    { url = "https://files.pythonhosted.org/packages/76/22/ef7bd0fe480a0ae9b656189ec00744b60933f68b4f42a7bb06589f6f576a/charset_normalizer-3.4.7-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:ce3412fbe1e31eb81ea42f4169ed94861c56e643189e1e75f0041f3fe7020abe", size = 225194 },
+    { url = "https://files.pythonhosted.org/packages/c5/a7/0e0ab3e0b5bc1219bd80a6a0d4d72ca74d9250cb2382b7c699c147e06017/charset_normalizer-3.4.7-cp314-cp314t-win32.whl", hash = "sha256:c03a41a8784091e67a39648f70c5f97b5b6a37f216896d44d2cdcb82615339a0", size = 159827 },
+    { url = "https://files.pythonhosted.org/packages/7a/1d/29d32e0fb40864b1f878c7f5a0b343ae676c6e2b271a2d55cc3a152391da/charset_normalizer-3.4.7-cp314-cp314t-win_amd64.whl", hash = "sha256:03853ed82eeebbce3c2abfdbc98c96dc205f32a79627688ac9a27370ea61a49c", size = 174168 },
+    { url = "https://files.pythonhosted.org/packages/de/32/d92444ad05c7a6e41fb2036749777c163baf7a0301a040cb672d6b2b1ae9/charset_normalizer-3.4.7-cp314-cp314t-win_arm64.whl", hash = "sha256:c35abb8bfff0185efac5878da64c45dafd2b37fb0383add1be155a763c1f083d", size = 153018 },
+    { url = "https://files.pythonhosted.org/packages/db/8f/61959034484a4a7c527811f4721e75d02d653a35afb0b6054474d8185d4c/charset_normalizer-3.4.7-py3-none-any.whl", hash = "sha256:3dce51d0f5e7951f8bb4900c257dad282f49190fdbebecd4ba99bcc41fef404d", size = 61958 },
+]
+
+[[package]]
+name = "click"
+version = "8.3.3"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "colorama", marker = "sys_platform == 'win32'" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/bb/63/f9e1ea081ce35720d8b92acde70daaedace594dc93b693c869e0d5910718/click-8.3.3.tar.gz", hash = "sha256:398329ad4837b2ff7cbe1dd166a4c0f8900c3ca3a218de04466f38f6497f18a2", size = 328061 }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/ae/44/c1221527f6a71a01ec6fbad7fa78f1d50dfa02217385cf0fa3eec7087d59/click-8.3.3-py3-none-any.whl", hash = "sha256:a2bf429bb3033c89fa4936ffb35d5cb471e3719e1f3c8a7c3fff0b8314305613", size = 110502 },
+]
+
+[[package]]
+name = "colorama"
+version = "0.4.6"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/d8/53/6f443c9a4a8358a93a6792e2acffb9d9d5cb0a5cfd8802644b7b1c9a02e4/colorama-0.4.6.tar.gz", hash = "sha256:08695f5cb7ed6e0531a20572697297273c47b8cae5a63ffc6d6ed5c201be6e44", size = 27697 }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/d1/d6/3965ed04c63042e047cb6a3e6ed1a63a35087b6a609aa3a15ed8ac56c221/colorama-0.4.6-py2.py3-none-any.whl", hash = "sha256:4f1d9991f5acc0ca119f9d443620b77f9d6b33703e51011c16baf57afb285fc6", size = 25335 },
+]
+
+[[package]]
+name = "datasets"
+version = "4.8.5"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "dill" },
+    { name = "filelock" },
+    { name = "fsspec", extra = ["http"] },
+    { name = "httpx" },
+    { name = "huggingface-hub" },
+    { name = "multiprocess" },
+    { name = "numpy" },
+    { name = "packaging" },
+    { name = "pandas" },
+    { name = "pyarrow" },
+    { name = "pyyaml" },
+    { name = "requests" },
+    { name = "tqdm" },
+    { name = "xxhash" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/66/34/14cd8e76f907f7d4dca2334cfeec9f81d30fd15c25a015f99aaea694eaed/datasets-4.8.5.tar.gz", hash = "sha256:0f0c1c3d56ffff2c93b2f4c63c95bac94f3d7e8621aea2a2a576275233bba772", size = 605649 }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/65/99/00f3196036501b53032c4b1ab8337a0b978dee832ed276dae3815df4e8b5/datasets-4.8.5-py3-none-any.whl", hash = "sha256:5079900781719c0e063a8efdd2cd95a31ad0c63209178669cd23cf1b926149ff", size = 528973 },
+]
+
+[[package]]
+name = "dill"
+version = "0.4.1"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/81/e1/56027a71e31b02ddc53c7d65b01e68edf64dea2932122fe7746a516f75d5/dill-0.4.1.tar.gz", hash = "sha256:423092df4182177d4d8ba8290c8a5b640c66ab35ec7da59ccfa00f6fa3eea5fa", size = 187315 }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/1e/77/dc8c558f7593132cf8fefec57c4f60c83b16941c574ac5f619abb3ae7933/dill-0.4.1-py3-none-any.whl", hash = "sha256:1e1ce33e978ae97fcfcff5638477032b801c46c7c65cf717f95fbc2248f79a9d", size = 120019 },
+]
+
+[[package]]
+name = "filelock"
+version = "3.29.0"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/b5/fe/997687a931ab51049acce6fa1f23e8f01216374ea81374ddee763c493db5/filelock-3.29.0.tar.gz", hash = "sha256:69974355e960702e789734cb4871f884ea6fe50bd8404051a3530bc07809cf90", size = 57571 }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/81/47/dd9a212ef6e343a6857485ffe25bba537304f1913bdbed446a23f7f592e1/filelock-3.29.0-py3-none-any.whl", hash = "sha256:96f5f6344709aa1572bbf631c640e4ebeeb519e08da902c39a001882f30ac258", size = 39812 },
+]
+
+[[package]]
+name = "frozenlist"
+version = "1.8.0"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/2d/f5/c831fac6cc817d26fd54c7eaccd04ef7e0288806943f7cc5bbf69f3ac1f0/frozenlist-1.8.0.tar.gz", hash = "sha256:3ede829ed8d842f6cd48fc7081d7a41001a56f1f38603f9d49bf3020d59a31ad", size = 45875 }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/69/29/948b9aa87e75820a38650af445d2ef2b6b8a6fab1a23b6bb9e4ef0be2d59/frozenlist-1.8.0-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:78f7b9e5d6f2fdb88cdde9440dc147259b62b9d3b019924def9f6478be254ac1", size = 87782 },
+    { url = "https://files.pythonhosted.org/packages/64/80/4f6e318ee2a7c0750ed724fa33a4bdf1eacdc5a39a7a24e818a773cd91af/frozenlist-1.8.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:229bf37d2e4acdaf808fd3f06e854a4a7a3661e871b10dc1f8f1896a3b05f18b", size = 50594 },
+    { url = "https://files.pythonhosted.org/packages/2b/94/5c8a2b50a496b11dd519f4a24cb5496cf125681dd99e94c604ccdea9419a/frozenlist-1.8.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:f833670942247a14eafbb675458b4e61c82e002a148f49e68257b79296e865c4", size = 50448 },
+    { url = "https://files.pythonhosted.org/packages/6a/bd/d91c5e39f490a49df14320f4e8c80161cfcce09f1e2cde1edd16a551abb3/frozenlist-1.8.0-cp312-cp312-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:494a5952b1c597ba44e0e78113a7266e656b9794eec897b19ead706bd7074383", size = 242411 },
+    { url = "https://files.pythonhosted.org/packages/8f/83/f61505a05109ef3293dfb1ff594d13d64a2324ac3482be2cedc2be818256/frozenlist-1.8.0-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:96f423a119f4777a4a056b66ce11527366a8bb92f54e541ade21f2374433f6d4", size = 243014 },
+    { url = "https://files.pythonhosted.org/packages/d8/cb/cb6c7b0f7d4023ddda30cf56b8b17494eb3a79e3fda666bf735f63118b35/frozenlist-1.8.0-cp312-cp312-manylinux2014_armv7l.manylinux_2_17_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:3462dd9475af2025c31cc61be6652dfa25cbfb56cbbf52f4ccfe029f38decaf8", size = 234909 },
+    { url = "https://files.pythonhosted.org/packages/31/c5/cd7a1f3b8b34af009fb17d4123c5a778b44ae2804e3ad6b86204255f9ec5/frozenlist-1.8.0-cp312-cp312-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:c4c800524c9cd9bac5166cd6f55285957fcfc907db323e193f2afcd4d9abd69b", size = 250049 },
+    { url = "https://files.pythonhosted.org/packages/c0/01/2f95d3b416c584a1e7f0e1d6d31998c4a795f7544069ee2e0962a4b60740/frozenlist-1.8.0-cp312-cp312-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:d6a5df73acd3399d893dafc71663ad22534b5aa4f94e8a2fabfe856c3c1b6a52", size = 256485 },
+    { url = "https://files.pythonhosted.org/packages/ce/03/024bf7720b3abaebcff6d0793d73c154237b85bdf67b7ed55e5e9596dc9a/frozenlist-1.8.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:405e8fe955c2280ce66428b3ca55e12b3c4e9c336fb2103a4937e891c69a4a29", size = 237619 },
+    { url = "https://files.pythonhosted.org/packages/69/fa/f8abdfe7d76b731f5d8bd217827cf6764d4f1d9763407e42717b4bed50a0/frozenlist-1.8.0-cp312-cp312-musllinux_1_2_armv7l.whl", hash = "sha256:908bd3f6439f2fef9e85031b59fd4f1297af54415fb60e4254a95f75b3cab3f3", size = 250320 },
+    { url = "https://files.pythonhosted.org/packages/f5/3c/b051329f718b463b22613e269ad72138cc256c540f78a6de89452803a47d/frozenlist-1.8.0-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:294e487f9ec720bd8ffcebc99d575f7eff3568a08a253d1ee1a0378754b74143", size = 246820 },
+    { url = "https://files.pythonhosted.org/packages/0f/ae/58282e8f98e444b3f4dd42448ff36fa38bef29e40d40f330b22e7108f565/frozenlist-1.8.0-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:74c51543498289c0c43656701be6b077f4b265868fa7f8a8859c197006efb608", size = 250518 },
+    { url = "https://files.pythonhosted.org/packages/8f/96/007e5944694d66123183845a106547a15944fbbb7154788cbf7272789536/frozenlist-1.8.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:776f352e8329135506a1d6bf16ac3f87bc25b28e765949282dcc627af36123aa", size = 239096 },
+    { url = "https://files.pythonhosted.org/packages/66/bb/852b9d6db2fa40be96f29c0d1205c306288f0684df8fd26ca1951d461a56/frozenlist-1.8.0-cp312-cp312-win32.whl", hash = "sha256:433403ae80709741ce34038da08511d4a77062aa924baf411ef73d1146e74faf", size = 39985 },
+    { url = "https://files.pythonhosted.org/packages/b8/af/38e51a553dd66eb064cdf193841f16f077585d4d28394c2fa6235cb41765/frozenlist-1.8.0-cp312-cp312-win_amd64.whl", hash = "sha256:34187385b08f866104f0c0617404c8eb08165ab1272e884abc89c112e9c00746", size = 44591 },
+    { url = "https://files.pythonhosted.org/packages/a7/06/1dc65480ab147339fecc70797e9c2f69d9cea9cf38934ce08df070fdb9cb/frozenlist-1.8.0-cp312-cp312-win_arm64.whl", hash = "sha256:fe3c58d2f5db5fbd18c2987cba06d51b0529f52bc3a6cdc33d3f4eab725104bd", size = 40102 },
+    { url = "https://files.pythonhosted.org/packages/2d/40/0832c31a37d60f60ed79e9dfb5a92e1e2af4f40a16a29abcc7992af9edff/frozenlist-1.8.0-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:8d92f1a84bb12d9e56f818b3a746f3efba93c1b63c8387a73dde655e1e42282a", size = 85717 },
+    { url = "https://files.pythonhosted.org/packages/30/ba/b0b3de23f40bc55a7057bd38434e25c34fa48e17f20ee273bbde5e0650f3/frozenlist-1.8.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:96153e77a591c8adc2ee805756c61f59fef4cf4073a9275ee86fe8cba41241f7", size = 49651 },
+    { url = "https://files.pythonhosted.org/packages/0c/ab/6e5080ee374f875296c4243c381bbdef97a9ac39c6e3ce1d5f7d42cb78d6/frozenlist-1.8.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:f21f00a91358803399890ab167098c131ec2ddd5f8f5fd5fe9c9f2c6fcd91e40", size = 49417 },
+    { url = "https://files.pythonhosted.org/packages/d5/4e/e4691508f9477ce67da2015d8c00acd751e6287739123113a9fca6f1604e/frozenlist-1.8.0-cp313-cp313-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:fb30f9626572a76dfe4293c7194a09fb1fe93ba94c7d4f720dfae3b646b45027", size = 234391 },
+    { url = "https://files.pythonhosted.org/packages/40/76/c202df58e3acdf12969a7895fd6f3bc016c642e6726aa63bd3025e0fc71c/frozenlist-1.8.0-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:eaa352d7047a31d87dafcacbabe89df0aa506abb5b1b85a2fb91bc3faa02d822", size = 233048 },
+    { url = "https://files.pythonhosted.org/packages/f9/c0/8746afb90f17b73ca5979c7a3958116e105ff796e718575175319b5bb4ce/frozenlist-1.8.0-cp313-cp313-manylinux2014_armv7l.manylinux_2_17_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:03ae967b4e297f58f8c774c7eabcce57fe3c2434817d4385c50661845a058121", size = 226549 },
+    { url = "https://files.pythonhosted.org/packages/7e/eb/4c7eefc718ff72f9b6c4893291abaae5fbc0c82226a32dcd8ef4f7a5dbef/frozenlist-1.8.0-cp313-cp313-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:f6292f1de555ffcc675941d65fffffb0a5bcd992905015f85d0592201793e0e5", size = 239833 },
+    { url = "https://files.pythonhosted.org/packages/c2/4e/e5c02187cf704224f8b21bee886f3d713ca379535f16893233b9d672ea71/frozenlist-1.8.0-cp313-cp313-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:29548f9b5b5e3460ce7378144c3010363d8035cea44bc0bf02d57f5a685e084e", size = 245363 },
+    { url = "https://files.pythonhosted.org/packages/1f/96/cb85ec608464472e82ad37a17f844889c36100eed57bea094518bf270692/frozenlist-1.8.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:ec3cc8c5d4084591b4237c0a272cc4f50a5b03396a47d9caaf76f5d7b38a4f11", size = 229314 },
+    { url = "https://files.pythonhosted.org/packages/5d/6f/4ae69c550e4cee66b57887daeebe006fe985917c01d0fff9caab9883f6d0/frozenlist-1.8.0-cp313-cp313-musllinux_1_2_armv7l.whl", hash = "sha256:517279f58009d0b1f2e7c1b130b377a349405da3f7621ed6bfae50b10adf20c1", size = 243365 },
+    { url = "https://files.pythonhosted.org/packages/7a/58/afd56de246cf11780a40a2c28dc7cbabbf06337cc8ddb1c780a2d97e88d8/frozenlist-1.8.0-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:db1e72ede2d0d7ccb213f218df6a078a9c09a7de257c2fe8fcef16d5925230b1", size = 237763 },
+    { url = "https://files.pythonhosted.org/packages/cb/36/cdfaf6ed42e2644740d4a10452d8e97fa1c062e2a8006e4b09f1b5fd7d63/frozenlist-1.8.0-cp313-cp313-musllinux_1_2_s390x.whl", hash = "sha256:b4dec9482a65c54a5044486847b8a66bf10c9cb4926d42927ec4e8fd5db7fed8", size = 240110 },
+    { url = "https://files.pythonhosted.org/packages/03/a8/9ea226fbefad669f11b52e864c55f0bd57d3c8d7eb07e9f2e9a0b39502e1/frozenlist-1.8.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:21900c48ae04d13d416f0e1e0c4d81f7931f73a9dfa0b7a8746fb2fe7dd970ed", size = 233717 },
+    { url = "https://files.pythonhosted.org/packages/1e/0b/1b5531611e83ba7d13ccc9988967ea1b51186af64c42b7a7af465dcc9568/frozenlist-1.8.0-cp313-cp313-win32.whl", hash = "sha256:8b7b94a067d1c504ee0b16def57ad5738701e4ba10cec90529f13fa03c833496", size = 39628 },
+    { url = "https://files.pythonhosted.org/packages/d8/cf/174c91dbc9cc49bc7b7aab74d8b734e974d1faa8f191c74af9b7e80848e6/frozenlist-1.8.0-cp313-cp313-win_amd64.whl", hash = "sha256:878be833caa6a3821caf85eb39c5ba92d28e85df26d57afb06b35b2efd937231", size = 43882 },
+    { url = "https://files.pythonhosted.org/packages/c1/17/502cd212cbfa96eb1388614fe39a3fc9ab87dbbe042b66f97acb57474834/frozenlist-1.8.0-cp313-cp313-win_arm64.whl", hash = "sha256:44389d135b3ff43ba8cc89ff7f51f5a0bb6b63d829c8300f79a2fe4fe61bcc62", size = 39676 },
+    { url = "https://files.pythonhosted.org/packages/d2/5c/3bbfaa920dfab09e76946a5d2833a7cbdf7b9b4a91c714666ac4855b88b4/frozenlist-1.8.0-cp313-cp313t-macosx_10_13_universal2.whl", hash = "sha256:e25ac20a2ef37e91c1b39938b591457666a0fa835c7783c3a8f33ea42870db94", size = 89235 },
+    { url = "https://files.pythonhosted.org/packages/d2/d6/f03961ef72166cec1687e84e8925838442b615bd0b8854b54923ce5b7b8a/frozenlist-1.8.0-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:07cdca25a91a4386d2e76ad992916a85038a9b97561bf7a3fd12d5d9ce31870c", size = 50742 },
+    { url = "https://files.pythonhosted.org/packages/1e/bb/a6d12b7ba4c3337667d0e421f7181c82dda448ce4e7ad7ecd249a16fa806/frozenlist-1.8.0-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:4e0c11f2cc6717e0a741f84a527c52616140741cd812a50422f83dc31749fb52", size = 51725 },
+    { url = "https://files.pythonhosted.org/packages/bc/71/d1fed0ffe2c2ccd70b43714c6cab0f4188f09f8a67a7914a6b46ee30f274/frozenlist-1.8.0-cp313-cp313t-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:b3210649ee28062ea6099cfda39e147fa1bc039583c8ee4481cb7811e2448c51", size = 284533 },
+    { url = "https://files.pythonhosted.org/packages/c9/1f/fb1685a7b009d89f9bf78a42d94461bc06581f6e718c39344754a5d9bada/frozenlist-1.8.0-cp313-cp313t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:581ef5194c48035a7de2aefc72ac6539823bb71508189e5de01d60c9dcd5fa65", size = 292506 },
+    { url = "https://files.pythonhosted.org/packages/e6/3b/b991fe1612703f7e0d05c0cf734c1b77aaf7c7d321df4572e8d36e7048c8/frozenlist-1.8.0-cp313-cp313t-manylinux2014_armv7l.manylinux_2_17_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:3ef2d026f16a2b1866e1d86fc4e1291e1ed8a387b2c333809419a2f8b3a77b82", size = 274161 },
+    { url = "https://files.pythonhosted.org/packages/ca/ec/c5c618767bcdf66e88945ec0157d7f6c4a1322f1473392319b7a2501ded7/frozenlist-1.8.0-cp313-cp313t-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:5500ef82073f599ac84d888e3a8c1f77ac831183244bfd7f11eaa0289fb30714", size = 294676 },
+    { url = "https://files.pythonhosted.org/packages/7c/ce/3934758637d8f8a88d11f0585d6495ef54b2044ed6ec84492a91fa3b27aa/frozenlist-1.8.0-cp313-cp313t-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:50066c3997d0091c411a66e710f4e11752251e6d2d73d70d8d5d4c76442a199d", size = 300638 },
+    { url = "https://files.pythonhosted.org/packages/fc/4f/a7e4d0d467298f42de4b41cbc7ddaf19d3cfeabaf9ff97c20c6c7ee409f9/frozenlist-1.8.0-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:5c1c8e78426e59b3f8005e9b19f6ff46e5845895adbde20ece9218319eca6506", size = 283067 },
+    { url = "https://files.pythonhosted.org/packages/dc/48/c7b163063d55a83772b268e6d1affb960771b0e203b632cfe09522d67ea5/frozenlist-1.8.0-cp313-cp313t-musllinux_1_2_armv7l.whl", hash = "sha256:eefdba20de0d938cec6a89bd4d70f346a03108a19b9df4248d3cf0d88f1b0f51", size = 292101 },
+    { url = "https://files.pythonhosted.org/packages/9f/d0/2366d3c4ecdc2fd391e0afa6e11500bfba0ea772764d631bbf82f0136c9d/frozenlist-1.8.0-cp313-cp313t-musllinux_1_2_ppc64le.whl", hash = "sha256:cf253e0e1c3ceb4aaff6df637ce033ff6535fb8c70a764a8f46aafd3d6ab798e", size = 289901 },
+    { url = "https://files.pythonhosted.org/packages/b8/94/daff920e82c1b70e3618a2ac39fbc01ae3e2ff6124e80739ce5d71c9b920/frozenlist-1.8.0-cp313-cp313t-musllinux_1_2_s390x.whl", hash = "sha256:032efa2674356903cd0261c4317a561a6850f3ac864a63fc1583147fb05a79b0", size = 289395 },
+    { url = "https://files.pythonhosted.org/packages/e3/20/bba307ab4235a09fdcd3cc5508dbabd17c4634a1af4b96e0f69bfe551ebd/frozenlist-1.8.0-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:6da155091429aeba16851ecb10a9104a108bcd32f6c1642867eadaee401c1c41", size = 283659 },
+    { url = "https://files.pythonhosted.org/packages/fd/00/04ca1c3a7a124b6de4f8a9a17cc2fcad138b4608e7a3fc5877804b8715d7/frozenlist-1.8.0-cp313-cp313t-win32.whl", hash = "sha256:0f96534f8bfebc1a394209427d0f8a63d343c9779cda6fc25e8e121b5fd8555b", size = 43492 },
+    { url = "https://files.pythonhosted.org/packages/59/5e/c69f733a86a94ab10f68e496dc6b7e8bc078ebb415281d5698313e3af3a1/frozenlist-1.8.0-cp313-cp313t-win_amd64.whl", hash = "sha256:5d63a068f978fc69421fb0e6eb91a9603187527c86b7cd3f534a5b77a592b888", size = 48034 },
+    { url = "https://files.pythonhosted.org/packages/16/6c/be9d79775d8abe79b05fa6d23da99ad6e7763a1d080fbae7290b286093fd/frozenlist-1.8.0-cp313-cp313t-win_arm64.whl", hash = "sha256:bf0a7e10b077bf5fb9380ad3ae8ce20ef919a6ad93b4552896419ac7e1d8e042", size = 41749 },
+    { url = "https://files.pythonhosted.org/packages/f1/c8/85da824b7e7b9b6e7f7705b2ecaf9591ba6f79c1177f324c2735e41d36a2/frozenlist-1.8.0-cp314-cp314-macosx_10_13_universal2.whl", hash = "sha256:cee686f1f4cadeb2136007ddedd0aaf928ab95216e7691c63e50a8ec066336d0", size = 86127 },
+    { url = "https://files.pythonhosted.org/packages/8e/e8/a1185e236ec66c20afd72399522f142c3724c785789255202d27ae992818/frozenlist-1.8.0-cp314-cp314-macosx_10_13_x86_64.whl", hash = "sha256:119fb2a1bd47307e899c2fac7f28e85b9a543864df47aa7ec9d3c1b4545f096f", size = 49698 },
+    { url = "https://files.pythonhosted.org/packages/a1/93/72b1736d68f03fda5fdf0f2180fb6caaae3894f1b854d006ac61ecc727ee/frozenlist-1.8.0-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:4970ece02dbc8c3a92fcc5228e36a3e933a01a999f7094ff7c23fbd2beeaa67c", size = 49749 },
+    { url = "https://files.pythonhosted.org/packages/a7/b2/fabede9fafd976b991e9f1b9c8c873ed86f202889b864756f240ce6dd855/frozenlist-1.8.0-cp314-cp314-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:cba69cb73723c3f329622e34bdbf5ce1f80c21c290ff04256cff1cd3c2036ed2", size = 231298 },
+    { url = "https://files.pythonhosted.org/packages/3a/3b/d9b1e0b0eed36e70477ffb8360c49c85c8ca8ef9700a4e6711f39a6e8b45/frozenlist-1.8.0-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:778a11b15673f6f1df23d9586f83c4846c471a8af693a22e066508b77d201ec8", size = 232015 },
+    { url = "https://files.pythonhosted.org/packages/dc/94/be719d2766c1138148564a3960fc2c06eb688da592bdc25adcf856101be7/frozenlist-1.8.0-cp314-cp314-manylinux2014_armv7l.manylinux_2_17_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:0325024fe97f94c41c08872db482cf8ac4800d80e79222c6b0b7b162d5b13686", size = 225038 },
+    { url = "https://files.pythonhosted.org/packages/e4/09/6712b6c5465f083f52f50cf74167b92d4ea2f50e46a9eea0523d658454ae/frozenlist-1.8.0-cp314-cp314-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:97260ff46b207a82a7567b581ab4190bd4dfa09f4db8a8b49d1a958f6aa4940e", size = 240130 },
+    { url = "https://files.pythonhosted.org/packages/f8/d4/cd065cdcf21550b54f3ce6a22e143ac9e4836ca42a0de1022da8498eac89/frozenlist-1.8.0-cp314-cp314-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:54b2077180eb7f83dd52c40b2750d0a9f175e06a42e3213ce047219de902717a", size = 242845 },
+    { url = "https://files.pythonhosted.org/packages/62/c3/f57a5c8c70cd1ead3d5d5f776f89d33110b1addae0ab010ad774d9a44fb9/frozenlist-1.8.0-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:2f05983daecab868a31e1da44462873306d3cbfd76d1f0b5b69c473d21dbb128", size = 229131 },
+    { url = "https://files.pythonhosted.org/packages/6c/52/232476fe9cb64f0742f3fde2b7d26c1dac18b6d62071c74d4ded55e0ef94/frozenlist-1.8.0-cp314-cp314-musllinux_1_2_armv7l.whl", hash = "sha256:33f48f51a446114bc5d251fb2954ab0164d5be02ad3382abcbfe07e2531d650f", size = 240542 },
+    { url = "https://files.pythonhosted.org/packages/5f/85/07bf3f5d0fb5414aee5f47d33c6f5c77bfe49aac680bfece33d4fdf6a246/frozenlist-1.8.0-cp314-cp314-musllinux_1_2_ppc64le.whl", hash = "sha256:154e55ec0655291b5dd1b8731c637ecdb50975a2ae70c606d100750a540082f7", size = 237308 },
+    { url = "https://files.pythonhosted.org/packages/11/99/ae3a33d5befd41ac0ca2cc7fd3aa707c9c324de2e89db0e0f45db9a64c26/frozenlist-1.8.0-cp314-cp314-musllinux_1_2_s390x.whl", hash = "sha256:4314debad13beb564b708b4a496020e5306c7333fa9a3ab90374169a20ffab30", size = 238210 },
+    { url = "https://files.pythonhosted.org/packages/b2/60/b1d2da22f4970e7a155f0adde9b1435712ece01b3cd45ba63702aea33938/frozenlist-1.8.0-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:073f8bf8becba60aa931eb3bc420b217bb7d5b8f4750e6f8b3be7f3da85d38b7", size = 231972 },
+    { url = "https://files.pythonhosted.org/packages/3f/ab/945b2f32de889993b9c9133216c068b7fcf257d8595a0ac420ac8677cab0/frozenlist-1.8.0-cp314-cp314-win32.whl", hash = "sha256:bac9c42ba2ac65ddc115d930c78d24ab8d4f465fd3fc473cdedfccadb9429806", size = 40536 },
+    { url = "https://files.pythonhosted.org/packages/59/ad/9caa9b9c836d9ad6f067157a531ac48b7d36499f5036d4141ce78c230b1b/frozenlist-1.8.0-cp314-cp314-win_amd64.whl", hash = "sha256:3e0761f4d1a44f1d1a47996511752cf3dcec5bbdd9cc2b4fe595caf97754b7a0", size = 44330 },
+    { url = "https://files.pythonhosted.org/packages/82/13/e6950121764f2676f43534c555249f57030150260aee9dcf7d64efda11dd/frozenlist-1.8.0-cp314-cp314-win_arm64.whl", hash = "sha256:d1eaff1d00c7751b7c6662e9c5ba6eb2c17a2306ba5e2a37f24ddf3cc953402b", size = 40627 },
+    { url = "https://files.pythonhosted.org/packages/c0/c7/43200656ecc4e02d3f8bc248df68256cd9572b3f0017f0a0c4e93440ae23/frozenlist-1.8.0-cp314-cp314t-macosx_10_13_universal2.whl", hash = "sha256:d3bb933317c52d7ea5004a1c442eef86f426886fba134ef8cf4226ea6ee1821d", size = 89238 },
+    { url = "https://files.pythonhosted.org/packages/d1/29/55c5f0689b9c0fb765055629f472c0de484dcaf0acee2f7707266ae3583c/frozenlist-1.8.0-cp314-cp314t-macosx_10_13_x86_64.whl", hash = "sha256:8009897cdef112072f93a0efdce29cd819e717fd2f649ee3016efd3cd885a7ed", size = 50738 },
+    { url = "https://files.pythonhosted.org/packages/ba/7d/b7282a445956506fa11da8c2db7d276adcbf2b17d8bb8407a47685263f90/frozenlist-1.8.0-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:2c5dcbbc55383e5883246d11fd179782a9d07a986c40f49abe89ddf865913930", size = 51739 },
+    { url = "https://files.pythonhosted.org/packages/62/1c/3d8622e60d0b767a5510d1d3cf21065b9db874696a51ea6d7a43180a259c/frozenlist-1.8.0-cp314-cp314t-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:39ecbc32f1390387d2aa4f5a995e465e9e2f79ba3adcac92d68e3e0afae6657c", size = 284186 },
+    { url = "https://files.pythonhosted.org/packages/2d/14/aa36d5f85a89679a85a1d44cd7a6657e0b1c75f61e7cad987b203d2daca8/frozenlist-1.8.0-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:92db2bf818d5cc8d9c1f1fc56b897662e24ea5adb36ad1f1d82875bd64e03c24", size = 292196 },
+    { url = "https://files.pythonhosted.org/packages/05/23/6bde59eb55abd407d34f77d39a5126fb7b4f109a3f611d3929f14b700c66/frozenlist-1.8.0-cp314-cp314t-manylinux2014_armv7l.manylinux_2_17_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:2dc43a022e555de94c3b68a4ef0b11c4f747d12c024a520c7101709a2144fb37", size = 273830 },
+    { url = "https://files.pythonhosted.org/packages/d2/3f/22cff331bfad7a8afa616289000ba793347fcd7bc275f3b28ecea2a27909/frozenlist-1.8.0-cp314-cp314t-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:cb89a7f2de3602cfed448095bab3f178399646ab7c61454315089787df07733a", size = 294289 },
+    { url = "https://files.pythonhosted.org/packages/a4/89/5b057c799de4838b6c69aa82b79705f2027615e01be996d2486a69ca99c4/frozenlist-1.8.0-cp314-cp314t-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:33139dc858c580ea50e7e60a1b0ea003efa1fd42e6ec7fdbad78fff65fad2fd2", size = 300318 },
+    { url = "https://files.pythonhosted.org/packages/30/de/2c22ab3eb2a8af6d69dc799e48455813bab3690c760de58e1bf43b36da3e/frozenlist-1.8.0-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:168c0969a329b416119507ba30b9ea13688fafffac1b7822802537569a1cb0ef", size = 282814 },
+    { url = "https://files.pythonhosted.org/packages/59/f7/970141a6a8dbd7f556d94977858cfb36fa9b66e0892c6dd780d2219d8cd8/frozenlist-1.8.0-cp314-cp314t-musllinux_1_2_armv7l.whl", hash = "sha256:28bd570e8e189d7f7b001966435f9dac6718324b5be2990ac496cf1ea9ddb7fe", size = 291762 },
+    { url = "https://files.pythonhosted.org/packages/c1/15/ca1adae83a719f82df9116d66f5bb28bb95557b3951903d39135620ef157/frozenlist-1.8.0-cp314-cp314t-musllinux_1_2_ppc64le.whl", hash = "sha256:b2a095d45c5d46e5e79ba1e5b9cb787f541a8dee0433836cea4b96a2c439dcd8", size = 289470 },
+    { url = "https://files.pythonhosted.org/packages/ac/83/dca6dc53bf657d371fbc88ddeb21b79891e747189c5de990b9dfff2ccba1/frozenlist-1.8.0-cp314-cp314t-musllinux_1_2_s390x.whl", hash = "sha256:eab8145831a0d56ec9c4139b6c3e594c7a83c2c8be25d5bcf2d86136a532287a", size = 289042 },
+    { url = "https://files.pythonhosted.org/packages/96/52/abddd34ca99be142f354398700536c5bd315880ed0a213812bc491cff5e4/frozenlist-1.8.0-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:974b28cf63cc99dfb2188d8d222bc6843656188164848c4f679e63dae4b0708e", size = 283148 },
+    { url = "https://files.pythonhosted.org/packages/af/d3/76bd4ed4317e7119c2b7f57c3f6934aba26d277acc6309f873341640e21f/frozenlist-1.8.0-cp314-cp314t-win32.whl", hash = "sha256:342c97bf697ac5480c0a7ec73cd700ecfa5a8a40ac923bd035484616efecc2df", size = 44676 },
+    { url = "https://files.pythonhosted.org/packages/89/76/c615883b7b521ead2944bb3480398cbb07e12b7b4e4d073d3752eb721558/frozenlist-1.8.0-cp314-cp314t-win_amd64.whl", hash = "sha256:06be8f67f39c8b1dc671f5d83aaefd3358ae5cdcf8314552c57e7ed3e6475bdd", size = 49451 },
+    { url = "https://files.pythonhosted.org/packages/e0/a3/5982da14e113d07b325230f95060e2169f5311b1017ea8af2a29b374c289/frozenlist-1.8.0-cp314-cp314t-win_arm64.whl", hash = "sha256:102e6314ca4da683dca92e3b1355490fed5f313b768500084fbe6371fddfdb79", size = 42507 },
+    { url = "https://files.pythonhosted.org/packages/9a/9a/e35b4a917281c0b8419d4207f4334c8e8c5dbf4f3f5f9ada73958d937dcc/frozenlist-1.8.0-py3-none-any.whl", hash = "sha256:0c18a16eab41e82c295618a77502e17b195883241c563b00f0aa5106fc4eaa0d", size = 13409 },
+]
+
+[[package]]
+name = "fsspec"
+version = "2026.2.0"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/51/7c/f60c259dcbf4f0c47cc4ddb8f7720d2dcdc8888c8e5ad84c73ea4531cc5b/fsspec-2026.2.0.tar.gz", hash = "sha256:6544e34b16869f5aacd5b90bdf1a71acb37792ea3ddf6125ee69a22a53fb8bff", size = 313441 }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/e6/ab/fb21f4c939bb440104cc2b396d3be1d9b7a9fd3c6c2a53d98c45b3d7c954/fsspec-2026.2.0-py3-none-any.whl", hash = "sha256:98de475b5cb3bd66bedd5c4679e87b4fdfe1a3bf4d707b151b3c07e58c9a2437", size = 202505 },
+]
+
+[package.optional-dependencies]
+http = [
+    { name = "aiohttp" },
+]
+
+[[package]]
+name = "h11"
+version = "0.16.0"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/01/ee/02a2c011bdab74c6fb3c75474d40b3052059d95df7e73351460c8588d963/h11-0.16.0.tar.gz", hash = "sha256:4e35b956cf45792e4caa5885e69fba00bdbc6ffafbfa020300e549b208ee5ff1", size = 101250 }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/04/4b/29cac41a4d98d144bf5f6d33995617b185d14b22401f75ca86f384e87ff1/h11-0.16.0-py3-none-any.whl", hash = "sha256:63cf8bbe7522de3bf65932fda1d9c2772064ffb3dae62d55932da54b31cb6c86", size = 37515 },
+]
+
+[[package]]
+name = "hf-xet"
+version = "1.5.0"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/74/d8/5c06fc76461418326a7decf8367480c35be11a41fd938633929c60a9ec6b/hf_xet-1.5.0.tar.gz", hash = "sha256:e0fb0a34d9f406eed88233e829a67ec016bec5af19e480eac65a233ea289a948", size = 837196 }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/68/9b/6912c99070915a4f28119e3c5b52a9abd1eec0ad5cb293b8c967a0c6f5a2/hf_xet-1.5.0-cp313-cp313t-macosx_10_12_x86_64.whl", hash = "sha256:7d70fe2ce97b9db73b9c9b9c81fe3693640aec83416a966c446afea54acfae3c", size = 4023383 },
+    { url = "https://files.pythonhosted.org/packages/0f/6d/9563cfde59b5d8128a9c7ec972a087f4c782e4f7bac5a85234edfd5d5e49/hf_xet-1.5.0-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:73a0dae8c71de3b0633a45c73f4a4a5ed09e94b43441d82981a781d4f12baa42", size = 3792751 },
+    { url = "https://files.pythonhosted.org/packages/07/a5/ed5a0cf35b49a0571af5a8f53416dad1877a718c021c9937c3a53cb45781/hf_xet-1.5.0-cp313-cp313t-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:a60290ec57e9b71767fba7c3645ddafdd0759974b540441510c629c6db6db24a", size = 4456058 },
+    { url = "https://files.pythonhosted.org/packages/60/fb/3ae8bf2a7a37a4197d0195d7247fd25b3952e15cb8a599e285dfaa6f52b3/hf_xet-1.5.0-cp313-cp313t-manylinux_2_28_aarch64.whl", hash = "sha256:e5de0f6deada0dada870bb376a11bcd1f08abf3a968a6d118f33e72d1b1eb480", size = 4250783 },
+    { url = "https://files.pythonhosted.org/packages/a2/9b/8bae40d4d91525085137196e84eb0ed49cf65b5e96e5c3ecdadd8bd0fac2/hf_xet-1.5.0-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:c799d49f1a5544a0ef7591c0ee75e0d6b93d6f56dc7a4979f59f7518d2872216", size = 4445594 },
+    { url = "https://files.pythonhosted.org/packages/13/59/c74efbbd4e8728172b2cc72a2bc014d2947a4b7bdced932fbd3f5da1a4e5/hf_xet-1.5.0-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:2baea1b0b989e5c152fe81425f7745ddc8901280ba3d97c98d8cdece7b706c60", size = 4663995 },
+    { url = "https://files.pythonhosted.org/packages/73/32/8e1e0410af64cda9b139d1dcebdc993a8ff9c8c7c0e2696ae356d75ccc0d/hf_xet-1.5.0-cp313-cp313t-win_amd64.whl", hash = "sha256:526345b3ed45f374f6317349df489167606736c876241ba984105afe7fd4839d", size = 3966608 },
+    { url = "https://files.pythonhosted.org/packages/fc/34/a8febc8f4edbea8b3e21b02ebc8b628679b84ba7e45cde624a7736b51500/hf_xet-1.5.0-cp313-cp313t-win_arm64.whl", hash = "sha256:786d28e2eb8315d5035544b9d137b4a842d600c434bb91bf7d0d953cce906ad4", size = 3796946 },
+    { url = "https://files.pythonhosted.org/packages/2a/20/8fc8996afe5815fa1a6be8e9e5c02f24500f409d599e905800d498a4e14d/hf_xet-1.5.0-cp314-cp314t-macosx_10_12_x86_64.whl", hash = "sha256:872d5601e6deea30d15865ede55d29eac6daf5a534ab417b99b6ef6b076dd96c", size = 4023495 },
+    { url = "https://files.pythonhosted.org/packages/32/6a/93d84463c00cecb561a7508aa6303e35ee2894294eac14245526924415fe/hf_xet-1.5.0-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:9929561f5abf4581c8ea79587881dfef6b8abb2a0d8a51915936fc2a614f4e73", size = 3792731 },
+    { url = "https://files.pythonhosted.org/packages/9d/5a/8ec8e0c863b382d00b3c2e2af6ded6b06371be617144a625903a6d562f4b/hf_xet-1.5.0-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:f7b7bbae318e583a86fb21e5a4a175d6721d628a2874f4bd022d0e660c32a682", size = 4456738 },
+    { url = "https://files.pythonhosted.org/packages/c5/ca/f7effa1a67717da2bcc6b6c28f71c6ca648c77acaec4e2c32f40cbe16d85/hf_xet-1.5.0-cp314-cp314t-manylinux_2_28_aarch64.whl", hash = "sha256:cf7b2dc6f31a4ea754bb50f74cde482dcf5d366d184076d8530b9872787f3761", size = 4251622 },
+    { url = "https://files.pythonhosted.org/packages/65/f2/19247dba3e231cf77dec59ddfb878f00057635ff773d099c9b59d37812c3/hf_xet-1.5.0-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:8dbcbab554c9ef158ef2c991545c3e970ddd8cc7acdcd0a78c5a41095dab4ded", size = 4445667 },
+    { url = "https://files.pythonhosted.org/packages/7f/64/6f116801a3bcfb6f59f5c251f48cadc47ea54026441c4a385079286a94fa/hf_xet-1.5.0-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:5906bf7718d3636dc13402914736abe723492cb730f744834f5f5b67d3a12702", size = 4664619 },
+    { url = "https://files.pythonhosted.org/packages/5c/e8/069542d37946ed08669b127e1496fa99e78196d71de8d41eda5e9f1b7a58/hf_xet-1.5.0-cp314-cp314t-win_amd64.whl", hash = "sha256:5f3dc2248fc01cc0a00cd392ab497f1ca373fcbc7e3f2da1f452480b384e839e", size = 3966802 },
+    { url = "https://files.pythonhosted.org/packages/f9/91/fc6fdec27b14d04e88c386ac0a0129732b53fa23f7c4a78f4b83a039c567/hf_xet-1.5.0-cp314-cp314t-win_arm64.whl", hash = "sha256:b285cea1b5bab46b758772716ba8d6854a1a0310fed1c249d678a8b38601e5a0", size = 3797168 },
+    { url = "https://files.pythonhosted.org/packages/3d/fb/69ff198a82cae7eb1a69fb84d93b3a3e4816564d76817fe541ddc96874eb/hf_xet-1.5.0-cp37-abi3-macosx_10_12_x86_64.whl", hash = "sha256:dad0dc84e941b8ba3c860659fe1fdc35c049d47cce293f003287757e971a8f56", size = 4030814 },
+    { url = "https://files.pythonhosted.org/packages/9b/ff/edcc2b40162bef3ff78e14ab637e5f3b89243d6aee72f5949d3bb6a5af83/hf_xet-1.5.0-cp37-abi3-macosx_11_0_arm64.whl", hash = "sha256:fd6e5a9b0fdac4ed03ed45ef79254a655b1aaab514a02202617fbf643f5fdf7a", size = 3798444 },
+    { url = "https://files.pythonhosted.org/packages/49/4d/103f76b04310e5e57656696cc184690d20c466af0bca3ca88f8c8ea5d4f3/hf_xet-1.5.0-cp37-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:3531b1823a0e6d77d80f9ed15ca0e00f0d115094f8ac033d5cae88f4564cc949", size = 4465986 },
+    { url = "https://files.pythonhosted.org/packages/c4/a2/546f47f464737b3edbab6f8ddb57f2599b93d2cbb66f06abb475ccb48651/hf_xet-1.5.0-cp37-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:9a0ee58cd18d5ea799f7ed11290bbccbe56bdd8b1d97ca74b9cc49a3945d7a3b", size = 4259865 },
+    { url = "https://files.pythonhosted.org/packages/95/7f/1be593c1f28613be2e196473481cd81bfc5910795e30a34e8f744f6cac4f/hf_xet-1.5.0-cp37-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:1e60df5a42e9bed8628b6416af2cba4cba57ae9f02de226a06b020d98e1aab18", size = 4459835 },
+    { url = "https://files.pythonhosted.org/packages/aa/b2/703569fc881f3284487e68cda7b42179978480da3c438042a6bbbb4a671c/hf_xet-1.5.0-cp37-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:4b35549ce62601b84da4ff9b24d970032ace3d4430f52d91bcbb26c901d6c690", size = 4672414 },
+    { url = "https://files.pythonhosted.org/packages/af/37/1b6def445c567286b50aa3b33828158e135b1be44938dde59f11382a500c/hf_xet-1.5.0-cp37-abi3-win_amd64.whl", hash = "sha256:2806c7c17b4d23f8d88f7c4814f838c3b6150773fe339c20af23e1cfaf2797e4", size = 3977238 },
+    { url = "https://files.pythonhosted.org/packages/62/94/3b66b148778ee100dcfd69c2ca22b57b41b44d3063ceec934f209e9184ce/hf_xet-1.5.0-cp37-abi3-win_arm64.whl", hash = "sha256:b6c9df403040248c76d808d3e047d64db2d923bae593eb244c41e425cf6cd7be", size = 3806916 },
+]
+
+[[package]]
+name = "httpcore"
+version = "1.0.9"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "certifi" },
+    { name = "h11" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/06/94/82699a10bca87a5556c9c59b5963f2d039dbd239f25bc2a63907a05a14cb/httpcore-1.0.9.tar.gz", hash = "sha256:6e34463af53fd2ab5d807f399a9b45ea31c3dfa2276f15a2c3f00afff6e176e8", size = 85484 }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/7e/f5/f66802a942d491edb555dd61e3a9961140fd64c90bce1eafd741609d334d/httpcore-1.0.9-py3-none-any.whl", hash = "sha256:2d400746a40668fc9dec9810239072b40b4484b640a8c38fd654a024c7a1bf55", size = 78784 },
+]
+
+[[package]]
+name = "httpx"
+version = "0.28.1"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "anyio" },
+    { name = "certifi" },
+    { name = "httpcore" },
+    { name = "idna" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/b1/df/48c586a5fe32a0f01324ee087459e112ebb7224f646c0b5023f5e79e9956/httpx-0.28.1.tar.gz", hash = "sha256:75e98c5f16b0f35b567856f597f06ff2270a374470a5c2392242528e3e3e42fc", size = 141406 }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/2a/39/e50c7c3a983047577ee07d2a9e53faf5a69493943ec3f6a384bdc792deb2/httpx-0.28.1-py3-none-any.whl", hash = "sha256:d909fcccc110f8c7faf814ca82a9a4d816bc5a6dbfea25d6591d6985b8ba59ad", size = 73517 },
+]
+
+[[package]]
+name = "httpx-sse"
+version = "0.4.3"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/0f/4c/751061ffa58615a32c31b2d82e8482be8dd4a89154f003147acee90f2be9/httpx_sse-0.4.3.tar.gz", hash = "sha256:9b1ed0127459a66014aec3c56bebd93da3c1bc8bb6618c8082039a44889a755d", size = 15943 }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/d2/fd/6668e5aec43ab844de6fc74927e155a3b37bf40d7c3790e49fc0406b6578/httpx_sse-0.4.3-py3-none-any.whl", hash = "sha256:0ac1c9fe3c0afad2e0ebb25a934a59f4c7823b60792691f779fad2c5568830fc", size = 8960 },
+]
+
+[[package]]
+name = "huggingface-hub"
+version = "1.14.0"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "filelock" },
+    { name = "fsspec" },
+    { name = "hf-xet", marker = "platform_machine == 'AMD64' or platform_machine == 'aarch64' or platform_machine == 'amd64' or platform_machine == 'arm64' or platform_machine == 'x86_64'" },
+    { name = "httpx" },
+    { name = "packaging" },
+    { name = "pyyaml" },
+    { name = "tqdm" },
+    { name = "typer" },
+    { name = "typing-extensions" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/39/40/43109e943fd718b0ccd0cd61eb4f1c347df22bf81f5874c6f22adf44bcff/huggingface_hub-1.14.0.tar.gz", hash = "sha256:d6d2c9cd6be1d02ae9ec6672d5587d10a427f377db688e82528f426a041622c2", size = 782365 }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/89/a5/33b49ba7bea7c41bb37f74ec0f8beea0831e052330196633fe2c77516ea6/huggingface_hub-1.14.0-py3-none-any.whl", hash = "sha256:efe075535c62e130b30e836b138e13785f6f043d1f0539e0a39aa411a99e90b8", size = 661479 },
+]
+
+[[package]]
+name = "idna"
+version = "3.15"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/82/77/7b3966d0b9d1d31a36ddf1746926a11dface89a83409bf1483f0237aa758/idna-3.15.tar.gz", hash = "sha256:ca962446ea538f7092a95e057da437618e886f4d349216d2b1e294abfdb65fdc", size = 199245 }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/d2/23/408243171aa9aaba178d3e2559159c24c1171a641aa83b67bdd3394ead8e/idna-3.15-py3-none-any.whl", hash = "sha256:048adeaf8c2d788c40fee287673ccaa74c24ffd8dcf09ffa555a2fbb59f10ac8", size = 72340 },
+]
+
+[[package]]
+name = "iniconfig"
+version = "2.3.0"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/72/34/14ca021ce8e5dfedc35312d08ba8bf51fdd999c576889fc2c24cb97f4f10/iniconfig-2.3.0.tar.gz", hash = "sha256:c76315c77db068650d49c5b56314774a7804df16fee4402c1f19d6d15d8c4730", size = 20503 }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/cb/b1/3846dd7f199d53cb17f49cba7e651e9ce294d8497c8c150530ed11865bb8/iniconfig-2.3.0-py3-none-any.whl", hash = "sha256:f631c04d2c48c52b84d0d0549c99ff3859c98df65b3101406327ecc7d53fbf12", size = 7484 },
+]
+
+[[package]]
+name = "joblib"
+version = "1.5.3"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/41/f2/d34e8b3a08a9cc79a50b2208a93dce981fe615b64d5a4d4abee421d898df/joblib-1.5.3.tar.gz", hash = "sha256:8561a3269e6801106863fd0d6d84bb737be9e7631e33aaed3fb9ce5953688da3", size = 331603 }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/7b/91/984aca2ec129e2757d1e4e3c81c3fcda9d0f85b74670a094cc443d9ee949/joblib-1.5.3-py3-none-any.whl", hash = "sha256:5fc3c5039fc5ca8c0276333a188bbd59d6b7ab37fe6632daa76bc7f9ec18e713", size = 309071 },
+]
+
+[[package]]
+name = "markdown-it-py"
+version = "4.2.0"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "mdurl" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/06/ff/7841249c247aa650a76b9ee4bbaeae59370dc8bfd2f6c01f3630c35eb134/markdown_it_py-4.2.0.tar.gz", hash = "sha256:04a21681d6fbb623de53f6f364d352309d4094dd4194040a10fd51833e418d49", size = 82454 }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/b3/81/4da04ced5a082363ecfa159c010d200ecbd959ae410c10c0264a38cac0f5/markdown_it_py-4.2.0-py3-none-any.whl", hash = "sha256:9f7ebbcd14fe59494226453aed97c1070d83f8d24b6fc3a3bcf9a38092641c4a", size = 91687 },
+]
+
+[[package]]
+name = "mdurl"
+version = "0.1.2"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/d6/54/cfe61301667036ec958cb99bd3efefba235e65cdeb9c84d24a8293ba1d90/mdurl-0.1.2.tar.gz", hash = "sha256:bb413d29f5eea38f31dd4754dd7377d4465116fb207585f97bf925588687c1ba", size = 8729 }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/b3/38/89ba8ad64ae25be8de66a6d463314cf1eb366222074cfda9ee839c56a4b4/mdurl-0.1.2-py3-none-any.whl", hash = "sha256:84008a41e51615a49fc9966191ff91509e3c40b939176e643fd50a5c2196b8f8", size = 9979 },
+]
+
+[[package]]
+name = "multidict"
+version = "6.7.1"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/1a/c2/c2d94cbe6ac1753f3fc980da97b3d930efe1da3af3c9f5125354436c073d/multidict-6.7.1.tar.gz", hash = "sha256:ec6652a1bee61c53a3e5776b6049172c53b6aaba34f18c9ad04f82712bac623d", size = 102010 }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/8d/9c/f20e0e2cf80e4b2e4b1c365bf5fe104ee633c751a724246262db8f1a0b13/multidict-6.7.1-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:a90f75c956e32891a4eda3639ce6dd86e87105271f43d43442a3aedf3cddf172", size = 76893 },
+    { url = "https://files.pythonhosted.org/packages/fe/cf/18ef143a81610136d3da8193da9d80bfe1cb548a1e2d1c775f26b23d024a/multidict-6.7.1-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:3fccb473e87eaa1382689053e4a4618e7ba7b9b9b8d6adf2027ee474597128cd", size = 45456 },
+    { url = "https://files.pythonhosted.org/packages/a9/65/1caac9d4cd32e8433908683446eebc953e82d22b03d10d41a5f0fefe991b/multidict-6.7.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:b0fa96985700739c4c7853a43c0b3e169360d6855780021bfc6d0f1ce7c123e7", size = 43872 },
+    { url = "https://files.pythonhosted.org/packages/cf/3b/d6bd75dc4f3ff7c73766e04e705b00ed6dbbaccf670d9e05a12b006f5a21/multidict-6.7.1-cp312-cp312-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:cb2a55f408c3043e42b40cc8eecd575afa27b7e0b956dfb190de0f8499a57a53", size = 251018 },
+    { url = "https://files.pythonhosted.org/packages/fd/80/c959c5933adedb9ac15152e4067c702a808ea183a8b64cf8f31af8ad3155/multidict-6.7.1-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:eb0ce7b2a32d09892b3dd6cc44877a0d02a33241fafca5f25c8b6b62374f8b75", size = 258883 },
+    { url = "https://files.pythonhosted.org/packages/86/85/7ed40adafea3d4f1c8b916e3b5cc3a8e07dfcdcb9cd72800f4ed3ca1b387/multidict-6.7.1-cp312-cp312-manylinux2014_armv7l.manylinux_2_17_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:c3a32d23520ee37bf327d1e1a656fec76a2edd5c038bf43eddfa0572ec49c60b", size = 242413 },
+    { url = "https://files.pythonhosted.org/packages/d2/57/b8565ff533e48595503c785f8361ff9a4fde4d67de25c207cd0ba3befd03/multidict-6.7.1-cp312-cp312-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:9c90fed18bffc0189ba814749fdcc102b536e83a9f738a9003e569acd540a733", size = 268404 },
+    { url = "https://files.pythonhosted.org/packages/e0/50/9810c5c29350f7258180dfdcb2e52783a0632862eb334c4896ac717cebcb/multidict-6.7.1-cp312-cp312-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:da62917e6076f512daccfbbde27f46fed1c98fee202f0559adec8ee0de67f71a", size = 269456 },
+    { url = "https://files.pythonhosted.org/packages/f3/8d/5e5be3ced1d12966fefb5c4ea3b2a5b480afcea36406559442c6e31d4a48/multidict-6.7.1-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:bfde23ef6ed9db7eaee6c37dcec08524cb43903c60b285b172b6c094711b3961", size = 256322 },
+    { url = "https://files.pythonhosted.org/packages/31/6e/d8a26d81ac166a5592782d208dd90dfdc0a7a218adaa52b45a672b46c122/multidict-6.7.1-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:3758692429e4e32f1ba0df23219cd0b4fc0a52f476726fff9337d1a57676a582", size = 253955 },
+    { url = "https://files.pythonhosted.org/packages/59/4c/7c672c8aad41534ba619bcd4ade7a0dc87ed6b8b5c06149b85d3dd03f0cd/multidict-6.7.1-cp312-cp312-musllinux_1_2_armv7l.whl", hash = "sha256:398c1478926eca669f2fd6a5856b6de9c0acf23a2cb59a14c0ba5844fa38077e", size = 251254 },
+    { url = "https://files.pythonhosted.org/packages/7b/bd/84c24de512cbafbdbc39439f74e967f19570ce7924e3007174a29c348916/multidict-6.7.1-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:c102791b1c4f3ab36ce4101154549105a53dc828f016356b3e3bcae2e3a039d3", size = 252059 },
+    { url = "https://files.pythonhosted.org/packages/fa/ba/f5449385510825b73d01c2d4087bf6d2fccc20a2d42ac34df93191d3dd03/multidict-6.7.1-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:a088b62bd733e2ad12c50dad01b7d0166c30287c166e137433d3b410add807a6", size = 263588 },
+    { url = "https://files.pythonhosted.org/packages/d7/11/afc7c677f68f75c84a69fe37184f0f82fce13ce4b92f49f3db280b7e92b3/multidict-6.7.1-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:3d51ff4785d58d3f6c91bdbffcb5e1f7ddfda557727043aa20d20ec4f65e324a", size = 259642 },
+    { url = "https://files.pythonhosted.org/packages/2b/17/ebb9644da78c4ab36403739e0e6e0e30ebb135b9caf3440825001a0bddcb/multidict-6.7.1-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:fc5907494fccf3e7d3f94f95c91d6336b092b5fc83811720fae5e2765890dfba", size = 251377 },
+    { url = "https://files.pythonhosted.org/packages/ca/a4/840f5b97339e27846c46307f2530a2805d9d537d8b8bd416af031cad7fa0/multidict-6.7.1-cp312-cp312-win32.whl", hash = "sha256:28ca5ce2fd9716631133d0e9a9b9a745ad7f60bac2bccafb56aa380fc0b6c511", size = 41887 },
+    { url = "https://files.pythonhosted.org/packages/80/31/0b2517913687895f5904325c2069d6a3b78f66cc641a86a2baf75a05dcbb/multidict-6.7.1-cp312-cp312-win_amd64.whl", hash = "sha256:fcee94dfbd638784645b066074b338bc9cc155d4b4bffa4adce1615c5a426c19", size = 46053 },
+    { url = "https://files.pythonhosted.org/packages/0c/5b/aba28e4ee4006ae4c7df8d327d31025d760ffa992ea23812a601d226e682/multidict-6.7.1-cp312-cp312-win_arm64.whl", hash = "sha256:ba0a9fb644d0c1a2194cf7ffb043bd852cea63a57f66fbd33959f7dae18517bf", size = 43307 },
+    { url = "https://files.pythonhosted.org/packages/f2/22/929c141d6c0dba87d3e1d38fbdf1ba8baba86b7776469f2bc2d3227a1e67/multidict-6.7.1-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:2b41f5fed0ed563624f1c17630cb9941cf2309d4df00e494b551b5f3e3d67a23", size = 76174 },
+    { url = "https://files.pythonhosted.org/packages/c7/75/bc704ae15fee974f8fccd871305e254754167dce5f9e42d88a2def741a1d/multidict-6.7.1-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:84e61e3af5463c19b67ced91f6c634effb89ef8bfc5ca0267f954451ed4bb6a2", size = 45116 },
+    { url = "https://files.pythonhosted.org/packages/79/76/55cd7186f498ed080a18440c9013011eb548f77ae1b297206d030eb1180a/multidict-6.7.1-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:935434b9853c7c112eee7ac891bc4cb86455aa631269ae35442cb316790c1445", size = 43524 },
+    { url = "https://files.pythonhosted.org/packages/e9/3c/414842ef8d5a1628d68edee29ba0e5bcf235dbfb3ccd3ea303a7fe8c72ff/multidict-6.7.1-cp313-cp313-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:432feb25a1cb67fe82a9680b4d65fb542e4635cb3166cd9c01560651ad60f177", size = 249368 },
+    { url = "https://files.pythonhosted.org/packages/f6/32/befed7f74c458b4a525e60519fe8d87eef72bb1e99924fa2b0f9d97a221e/multidict-6.7.1-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:e82d14e3c948952a1a85503817e038cba5905a3352de76b9a465075d072fba23", size = 256952 },
+    { url = "https://files.pythonhosted.org/packages/03/d6/c878a44ba877f366630c860fdf74bfb203c33778f12b6ac274936853c451/multidict-6.7.1-cp313-cp313-manylinux2014_armv7l.manylinux_2_17_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:4cfb48c6ea66c83bcaaf7e4dfa7ec1b6bbcf751b7db85a328902796dfde4c060", size = 240317 },
+    { url = "https://files.pythonhosted.org/packages/68/49/57421b4d7ad2e9e60e25922b08ceb37e077b90444bde6ead629095327a6f/multidict-6.7.1-cp313-cp313-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:1d540e51b7e8e170174555edecddbd5538105443754539193e3e1061864d444d", size = 267132 },
+    { url = "https://files.pythonhosted.org/packages/b7/fe/ec0edd52ddbcea2a2e89e174f0206444a61440b40f39704e64dc807a70bd/multidict-6.7.1-cp313-cp313-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:273d23f4b40f3dce4d6c8a821c741a86dec62cded82e1175ba3d99be128147ed", size = 268140 },
+    { url = "https://files.pythonhosted.org/packages/b0/73/6e1b01cbeb458807aa0831742232dbdd1fa92bfa33f52a3f176b4ff3dc11/multidict-6.7.1-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:9d624335fd4fa1c08a53f8b4be7676ebde19cd092b3895c421045ca87895b429", size = 254277 },
+    { url = "https://files.pythonhosted.org/packages/6a/b2/5fb8c124d7561a4974c342bc8c778b471ebbeb3cc17df696f034a7e9afe7/multidict-6.7.1-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:12fad252f8b267cc75b66e8fc51b3079604e8d43a75428ffe193cd9e2195dfd6", size = 252291 },
+    { url = "https://files.pythonhosted.org/packages/5a/96/51d4e4e06bcce92577fcd488e22600bd38e4fd59c20cb49434d054903bd2/multidict-6.7.1-cp313-cp313-musllinux_1_2_armv7l.whl", hash = "sha256:03ede2a6ffbe8ef936b92cb4529f27f42be7f56afcdab5ab739cd5f27fb1cbf9", size = 250156 },
+    { url = "https://files.pythonhosted.org/packages/db/6b/420e173eec5fba721a50e2a9f89eda89d9c98fded1124f8d5c675f7a0c0f/multidict-6.7.1-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:90efbcf47dbe33dcf643a1e400d67d59abeac5db07dc3f27d6bdeae497a2198c", size = 249742 },
+    { url = "https://files.pythonhosted.org/packages/44/a3/ec5b5bd98f306bc2aa297b8c6f11a46714a56b1e6ef5ebda50a4f5d7c5fb/multidict-6.7.1-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:5c4b9bfc148f5a91be9244d6264c53035c8a0dcd2f51f1c3c6e30e30ebaa1c84", size = 262221 },
+    { url = "https://files.pythonhosted.org/packages/cd/f7/e8c0d0da0cd1e28d10e624604e1a36bcc3353aaebdfdc3a43c72bc683a12/multidict-6.7.1-cp313-cp313-musllinux_1_2_s390x.whl", hash = "sha256:401c5a650f3add2472d1d288c26deebc540f99e2fb83e9525007a74cd2116f1d", size = 258664 },
+    { url = "https://files.pythonhosted.org/packages/52/da/151a44e8016dd33feed44f730bd856a66257c1ee7aed4f44b649fb7edeb3/multidict-6.7.1-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:97891f3b1b3ffbded884e2916cacf3c6fc87b66bb0dde46f7357404750559f33", size = 249490 },
+    { url = "https://files.pythonhosted.org/packages/87/af/a3b86bf9630b732897f6fc3f4c4714b90aa4361983ccbdcd6c0339b21b0c/multidict-6.7.1-cp313-cp313-win32.whl", hash = "sha256:e1c5988359516095535c4301af38d8a8838534158f649c05dd1050222321bcb3", size = 41695 },
+    { url = "https://files.pythonhosted.org/packages/b2/35/e994121b0e90e46134673422dd564623f93304614f5d11886b1b3e06f503/multidict-6.7.1-cp313-cp313-win_amd64.whl", hash = "sha256:960c83bf01a95b12b08fd54324a4eb1d5b52c88932b5cba5d6e712bb3ed12eb5", size = 45884 },
+    { url = "https://files.pythonhosted.org/packages/ca/61/42d3e5dbf661242a69c97ea363f2d7b46c567da8eadef8890022be6e2ab0/multidict-6.7.1-cp313-cp313-win_arm64.whl", hash = "sha256:563fe25c678aaba333d5399408f5ec3c383ca5b663e7f774dd179a520b8144df", size = 43122 },
+    { url = "https://files.pythonhosted.org/packages/6d/b3/e6b21c6c4f314bb956016b0b3ef2162590a529b84cb831c257519e7fde44/multidict-6.7.1-cp313-cp313t-macosx_10_13_universal2.whl", hash = "sha256:c76c4bec1538375dad9d452d246ca5368ad6e1c9039dadcf007ae59c70619ea1", size = 83175 },
+    { url = "https://files.pythonhosted.org/packages/fb/76/23ecd2abfe0957b234f6c960f4ade497f55f2c16aeb684d4ecdbf1c95791/multidict-6.7.1-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:57b46b24b5d5ebcc978da4ec23a819a9402b4228b8a90d9c656422b4bdd8a963", size = 48460 },
+    { url = "https://files.pythonhosted.org/packages/c4/57/a0ed92b23f3a042c36bc4227b72b97eca803f5f1801c1ab77c8a212d455e/multidict-6.7.1-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:e954b24433c768ce78ab7929e84ccf3422e46deb45a4dc9f93438f8217fa2d34", size = 46930 },
+    { url = "https://files.pythonhosted.org/packages/b5/66/02ec7ace29162e447f6382c495dc95826bf931d3818799bbef11e8f7df1a/multidict-6.7.1-cp313-cp313t-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:3bd231490fa7217cc832528e1cd8752a96f0125ddd2b5749390f7c3ec8721b65", size = 242582 },
+    { url = "https://files.pythonhosted.org/packages/58/18/64f5a795e7677670e872673aca234162514696274597b3708b2c0d276cce/multidict-6.7.1-cp313-cp313t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:253282d70d67885a15c8a7716f3a73edf2d635793ceda8173b9ecc21f2fb8292", size = 250031 },
+    { url = "https://files.pythonhosted.org/packages/c8/ed/e192291dbbe51a8290c5686f482084d31bcd9d09af24f63358c3d42fd284/multidict-6.7.1-cp313-cp313t-manylinux2014_armv7l.manylinux_2_17_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:0b4c48648d7649c9335cf1927a8b87fa692de3dcb15faa676c6a6f1f1aabda43", size = 228596 },
+    { url = "https://files.pythonhosted.org/packages/1e/7e/3562a15a60cf747397e7f2180b0a11dc0c38d9175a650e75fa1b4d325e15/multidict-6.7.1-cp313-cp313t-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:98bc624954ec4d2c7cb074b8eefc2b5d0ce7d482e410df446414355d158fe4ca", size = 257492 },
+    { url = "https://files.pythonhosted.org/packages/24/02/7d0f9eae92b5249bb50ac1595b295f10e263dd0078ebb55115c31e0eaccd/multidict-6.7.1-cp313-cp313t-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:1b99af4d9eec0b49927b4402bcbb58dea89d3e0db8806a4086117019939ad3dd", size = 255899 },
+    { url = "https://files.pythonhosted.org/packages/00/e3/9b60ed9e23e64c73a5cde95269ef1330678e9c6e34dd4eb6b431b85b5a10/multidict-6.7.1-cp313-cp313t-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:6aac4f16b472d5b7dc6f66a0d49dd57b0e0902090be16594dc9ebfd3d17c47e7", size = 247970 },
+    { url = "https://files.pythonhosted.org/packages/3e/06/538e58a63ed5cfb0bd4517e346b91da32fde409d839720f664e9a4ae4f9d/multidict-6.7.1-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:21f830fe223215dffd51f538e78c172ed7c7f60c9b96a2bf05c4848ad49921c3", size = 245060 },
+    { url = "https://files.pythonhosted.org/packages/b2/2f/d743a3045a97c895d401e9bd29aaa09b94f5cbdf1bd561609e5a6c431c70/multidict-6.7.1-cp313-cp313t-musllinux_1_2_armv7l.whl", hash = "sha256:f5dd81c45b05518b9aa4da4aa74e1c93d715efa234fd3e8a179df611cc85e5f4", size = 235888 },
+    { url = "https://files.pythonhosted.org/packages/38/83/5a325cac191ab28b63c52f14f1131f3b0a55ba3b9aa65a6d0bf2a9b921a0/multidict-6.7.1-cp313-cp313t-musllinux_1_2_i686.whl", hash = "sha256:eb304767bca2bb92fb9c5bd33cedc95baee5bb5f6c88e63706533a1c06ad08c8", size = 243554 },
+    { url = "https://files.pythonhosted.org/packages/20/1f/9d2327086bd15da2725ef6aae624208e2ef828ed99892b17f60c344e57ed/multidict-6.7.1-cp313-cp313t-musllinux_1_2_ppc64le.whl", hash = "sha256:c9035dde0f916702850ef66460bc4239d89d08df4d02023a5926e7446724212c", size = 252341 },
+    { url = "https://files.pythonhosted.org/packages/e8/2c/2a1aa0280cf579d0f6eed8ee5211c4f1730bd7e06c636ba2ee6aafda302e/multidict-6.7.1-cp313-cp313t-musllinux_1_2_s390x.whl", hash = "sha256:af959b9beeb66c822380f222f0e0a1889331597e81f1ded7f374f3ecb0fd6c52", size = 246391 },
+    { url = "https://files.pythonhosted.org/packages/e5/03/7ca022ffc36c5a3f6e03b179a5ceb829be9da5783e6fe395f347c0794680/multidict-6.7.1-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:41f2952231456154ee479651491e94118229844dd7226541788be783be2b5108", size = 243422 },
+    { url = "https://files.pythonhosted.org/packages/dc/1d/b31650eab6c5778aceed46ba735bd97f7c7d2f54b319fa916c0f96e7805b/multidict-6.7.1-cp313-cp313t-win32.whl", hash = "sha256:df9f19c28adcb40b6aae30bbaa1478c389efd50c28d541d76760199fc1037c32", size = 47770 },
+    { url = "https://files.pythonhosted.org/packages/ac/5b/2d2d1d522e51285bd61b1e20df8f47ae1a9d80839db0b24ea783b3832832/multidict-6.7.1-cp313-cp313t-win_amd64.whl", hash = "sha256:d54ecf9f301853f2c5e802da559604b3e95bb7a3b01a9c295c6ee591b9882de8", size = 53109 },
+    { url = "https://files.pythonhosted.org/packages/3d/a3/cc409ba012c83ca024a308516703cf339bdc4b696195644a7215a5164a24/multidict-6.7.1-cp313-cp313t-win_arm64.whl", hash = "sha256:5a37ca18e360377cfda1d62f5f382ff41f2b8c4ccb329ed974cc2e1643440118", size = 45573 },
+    { url = "https://files.pythonhosted.org/packages/91/cc/db74228a8be41884a567e88a62fd589a913708fcf180d029898c17a9a371/multidict-6.7.1-cp314-cp314-macosx_10_15_universal2.whl", hash = "sha256:8f333ec9c5eb1b7105e3b84b53141e66ca05a19a605368c55450b6ba208cb9ee", size = 75190 },
+    { url = "https://files.pythonhosted.org/packages/d5/22/492f2246bb5b534abd44804292e81eeaf835388901f0c574bac4eeec73c5/multidict-6.7.1-cp314-cp314-macosx_10_15_x86_64.whl", hash = "sha256:a407f13c188f804c759fc6a9f88286a565c242a76b27626594c133b82883b5c2", size = 44486 },
+    { url = "https://files.pythonhosted.org/packages/f1/4f/733c48f270565d78b4544f2baddc2fb2a245e5a8640254b12c36ac7ac68e/multidict-6.7.1-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:0e161ddf326db5577c3a4cc2d8648f81456e8a20d40415541587a71620d7a7d1", size = 43219 },
+    { url = "https://files.pythonhosted.org/packages/24/bb/2c0c2287963f4259c85e8bcbba9182ced8d7fca65c780c38e99e61629d11/multidict-6.7.1-cp314-cp314-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:1e3a8bb24342a8201d178c3b4984c26ba81a577c80d4d525727427460a50c22d", size = 245132 },
+    { url = "https://files.pythonhosted.org/packages/a7/f9/44d4b3064c65079d2467888794dea218d1601898ac50222ab8a9a8094460/multidict-6.7.1-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:97231140a50f5d447d3164f994b86a0bed7cd016e2682f8650d6a9158e14fd31", size = 252420 },
+    { url = "https://files.pythonhosted.org/packages/8b/13/78f7275e73fa17b24c9a51b0bd9d73ba64bb32d0ed51b02a746eb876abe7/multidict-6.7.1-cp314-cp314-manylinux2014_armv7l.manylinux_2_17_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:6b10359683bd8806a200fd2909e7c8ca3a7b24ec1d8132e483d58e791d881048", size = 233510 },
+    { url = "https://files.pythonhosted.org/packages/4b/25/8167187f62ae3cbd52da7893f58cb036b47ea3fb67138787c76800158982/multidict-6.7.1-cp314-cp314-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:283ddac99f7ac25a4acadbf004cb5ae34480bbeb063520f70ce397b281859362", size = 264094 },
+    { url = "https://files.pythonhosted.org/packages/a1/e7/69a3a83b7b030cf283fb06ce074a05a02322359783424d7edf0f15fe5022/multidict-6.7.1-cp314-cp314-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:538cec1e18c067d0e6103aa9a74f9e832904c957adc260e61cd9d8cf0c3b3d37", size = 260786 },
+    { url = "https://files.pythonhosted.org/packages/fe/3b/8ec5074bcfc450fe84273713b4b0a0dd47c0249358f5d82eb8104ffe2520/multidict-6.7.1-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:7eee46ccb30ff48a1e35bb818cc90846c6be2b68240e42a78599166722cea709", size = 248483 },
+    { url = "https://files.pythonhosted.org/packages/48/5a/d5a99e3acbca0e29c5d9cba8f92ceb15dce78bab963b308ae692981e3a5d/multidict-6.7.1-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:fa263a02f4f2dd2d11a7b1bb4362aa7cb1049f84a9235d31adf63f30143469a0", size = 248403 },
+    { url = "https://files.pythonhosted.org/packages/35/48/e58cd31f6c7d5102f2a4bf89f96b9cf7e00b6c6f3d04ecc44417c00a5a3c/multidict-6.7.1-cp314-cp314-musllinux_1_2_armv7l.whl", hash = "sha256:2e1425e2f99ec5bd36c15a01b690a1a2456209c5deed58f95469ffb46039ccbb", size = 240315 },
+    { url = "https://files.pythonhosted.org/packages/94/33/1cd210229559cb90b6786c30676bb0c58249ff42f942765f88793b41fdce/multidict-6.7.1-cp314-cp314-musllinux_1_2_i686.whl", hash = "sha256:497394b3239fc6f0e13a78a3e1b61296e72bf1c5f94b4c4eb80b265c37a131cd", size = 245528 },
+    { url = "https://files.pythonhosted.org/packages/64/f2/6e1107d226278c876c783056b7db43d800bb64c6131cec9c8dfb6903698e/multidict-6.7.1-cp314-cp314-musllinux_1_2_ppc64le.whl", hash = "sha256:233b398c29d3f1b9676b4b6f75c518a06fcb2ea0b925119fb2c1bc35c05e1601", size = 258784 },
+    { url = "https://files.pythonhosted.org/packages/4d/c1/11f664f14d525e4a1b5327a82d4de61a1db604ab34c6603bb3c2cc63ad34/multidict-6.7.1-cp314-cp314-musllinux_1_2_s390x.whl", hash = "sha256:93b1818e4a6e0930454f0f2af7dfce69307ca03cdcfb3739bf4d91241967b6c1", size = 251980 },
+    { url = "https://files.pythonhosted.org/packages/e1/9f/75a9ac888121d0c5bbd4ecf4eead45668b1766f6baabfb3b7f66a410e231/multidict-6.7.1-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:f33dc2a3abe9249ea5d8360f969ec7f4142e7ac45ee7014d8f8d5acddf178b7b", size = 243602 },
+    { url = "https://files.pythonhosted.org/packages/9a/e7/50bf7b004cc8525d80dbbbedfdc7aed3e4c323810890be4413e589074032/multidict-6.7.1-cp314-cp314-win32.whl", hash = "sha256:3ab8b9d8b75aef9df299595d5388b14530839f6422333357af1339443cff777d", size = 40930 },
+    { url = "https://files.pythonhosted.org/packages/e0/bf/52f25716bbe93745595800f36fb17b73711f14da59ed0bb2eba141bc9f0f/multidict-6.7.1-cp314-cp314-win_amd64.whl", hash = "sha256:5e01429a929600e7dab7b166062d9bb54a5eed752384c7384c968c2afab8f50f", size = 45074 },
+    { url = "https://files.pythonhosted.org/packages/97/ab/22803b03285fa3a525f48217963da3a65ae40f6a1b6f6cf2768879e208f9/multidict-6.7.1-cp314-cp314-win_arm64.whl", hash = "sha256:4885cb0e817aef5d00a2e8451d4665c1808378dc27c2705f1bf4ef8505c0d2e5", size = 42471 },
+    { url = "https://files.pythonhosted.org/packages/e0/6d/f9293baa6146ba9507e360ea0292b6422b016907c393e2f63fc40ab7b7b5/multidict-6.7.1-cp314-cp314t-macosx_10_15_universal2.whl", hash = "sha256:0458c978acd8e6ea53c81eefaddbbee9c6c5e591f41b3f5e8e194780fe026581", size = 82401 },
+    { url = "https://files.pythonhosted.org/packages/7a/68/53b5494738d83558d87c3c71a486504d8373421c3e0dbb6d0db48ad42ee0/multidict-6.7.1-cp314-cp314t-macosx_10_15_x86_64.whl", hash = "sha256:c0abd12629b0af3cf590982c0b413b1e7395cd4ec026f30986818ab95bfaa94a", size = 48143 },
+    { url = "https://files.pythonhosted.org/packages/37/e8/5284c53310dcdc99ce5d66563f6e5773531a9b9fe9ec7a615e9bc306b05f/multidict-6.7.1-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:14525a5f61d7d0c94b368a42cff4c9a4e7ba2d52e2672a7b23d84dc86fb02b0c", size = 46507 },
+    { url = "https://files.pythonhosted.org/packages/e4/fc/6800d0e5b3875568b4083ecf5f310dcf91d86d52573160834fb4bfcf5e4f/multidict-6.7.1-cp314-cp314t-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:17307b22c217b4cf05033dabefe68255a534d637c6c9b0cc8382718f87be4262", size = 239358 },
+    { url = "https://files.pythonhosted.org/packages/41/75/4ad0973179361cdf3a113905e6e088173198349131be2b390f9fa4da5fc6/multidict-6.7.1-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:7a7e590ff876a3eaf1c02a4dfe0724b6e69a9e9de6d8f556816f29c496046e59", size = 246884 },
+    { url = "https://files.pythonhosted.org/packages/c3/9c/095bb28b5da139bd41fb9a5d5caff412584f377914bd8787c2aa98717130/multidict-6.7.1-cp314-cp314t-manylinux2014_armv7l.manylinux_2_17_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:5fa6a95dfee63893d80a34758cd0e0c118a30b8dcb46372bf75106c591b77889", size = 225878 },
+    { url = "https://files.pythonhosted.org/packages/07/d0/c0a72000243756e8f5a277b6b514fa005f2c73d481b7d9e47cd4568aa2e4/multidict-6.7.1-cp314-cp314t-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:a0543217a6a017692aa6ae5cc39adb75e587af0f3a82288b1492eb73dd6cc2a4", size = 253542 },
+    { url = "https://files.pythonhosted.org/packages/c0/6b/f69da15289e384ecf2a68837ec8b5ad8c33e973aa18b266f50fe55f24b8c/multidict-6.7.1-cp314-cp314t-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:f99fe611c312b3c1c0ace793f92464d8cd263cc3b26b5721950d977b006b6c4d", size = 252403 },
+    { url = "https://files.pythonhosted.org/packages/a2/76/b9669547afa5a1a25cd93eaca91c0da1c095b06b6d2d8ec25b713588d3a1/multidict-6.7.1-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:9004d8386d133b7e6135679424c91b0b854d2d164af6ea3f289f8f2761064609", size = 244889 },
+    { url = "https://files.pythonhosted.org/packages/7e/a9/a50d2669e506dad33cfc45b5d574a205587b7b8a5f426f2fbb2e90882588/multidict-6.7.1-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:e628ef0e6859ffd8273c69412a2465c4be4a9517d07261b33334b5ec6f3c7489", size = 241982 },
+    { url = "https://files.pythonhosted.org/packages/c5/bb/1609558ad8b456b4827d3c5a5b775c93b87878fd3117ed3db3423dfbce1b/multidict-6.7.1-cp314-cp314t-musllinux_1_2_armv7l.whl", hash = "sha256:841189848ba629c3552035a6a7f5bf3b02eb304e9fea7492ca220a8eda6b0e5c", size = 232415 },
+    { url = "https://files.pythonhosted.org/packages/d8/59/6f61039d2aa9261871e03ab9dc058a550d240f25859b05b67fd70f80d4b3/multidict-6.7.1-cp314-cp314t-musllinux_1_2_i686.whl", hash = "sha256:ce1bbd7d780bb5a0da032e095c951f7014d6b0a205f8318308140f1a6aba159e", size = 240337 },
+    { url = "https://files.pythonhosted.org/packages/a1/29/fdc6a43c203890dc2ae9249971ecd0c41deaedfe00d25cb6564b2edd99eb/multidict-6.7.1-cp314-cp314t-musllinux_1_2_ppc64le.whl", hash = "sha256:b26684587228afed0d50cf804cc71062cc9c1cdf55051c4c6345d372947b268c", size = 248788 },
+    { url = "https://files.pythonhosted.org/packages/a9/14/a153a06101323e4cf086ecee3faadba52ff71633d471f9685c42e3736163/multidict-6.7.1-cp314-cp314t-musllinux_1_2_s390x.whl", hash = "sha256:9f9af11306994335398293f9958071019e3ab95e9a707dc1383a35613f6abcb9", size = 242842 },
+    { url = "https://files.pythonhosted.org/packages/41/5f/604ae839e64a4a6efc80db94465348d3b328ee955e37acb24badbcd24d83/multidict-6.7.1-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:b4938326284c4f1224178a560987b6cf8b4d38458b113d9b8c1db1a836e640a2", size = 240237 },
+    { url = "https://files.pythonhosted.org/packages/5f/60/c3a5187bf66f6fb546ff4ab8fb5a077cbdd832d7b1908d4365c7f74a1917/multidict-6.7.1-cp314-cp314t-win32.whl", hash = "sha256:98655c737850c064a65e006a3df7c997cd3b220be4ec8fe26215760b9697d4d7", size = 48008 },
+    { url = "https://files.pythonhosted.org/packages/0c/f7/addf1087b860ac60e6f382240f64fb99f8bfb532bb06f7c542b83c29ca61/multidict-6.7.1-cp314-cp314t-win_amd64.whl", hash = "sha256:497bde6223c212ba11d462853cfa4f0ae6ef97465033e7dc9940cdb3ab5b48e5", size = 53542 },
+    { url = "https://files.pythonhosted.org/packages/4c/81/4629d0aa32302ef7b2ec65c75a728cc5ff4fa410c50096174c1632e70b3e/multidict-6.7.1-cp314-cp314t-win_arm64.whl", hash = "sha256:2bbd113e0d4af5db41d5ebfe9ccaff89de2120578164f86a5d17d5a576d1e5b2", size = 44719 },
+    { url = "https://files.pythonhosted.org/packages/81/08/7036c080d7117f28a4af526d794aab6a84463126db031b007717c1a6676e/multidict-6.7.1-py3-none-any.whl", hash = "sha256:55d97cc6dae627efa6a6e548885712d4864b81110ac76fa4e534c03819fa4a56", size = 12319 },
+]
+
+[[package]]
+name = "multiprocess"
+version = "0.70.19"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "dill" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/a2/f2/e783ac7f2aeeed14e9e12801f22529cc7e6b7ab80928d6dcce4e9f00922d/multiprocess-0.70.19.tar.gz", hash = "sha256:952021e0e6c55a4a9fe4cd787895b86e239a40e76802a789d6305398d3975897", size = 2079989 }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/e3/45/8004d1e6b9185c1a444d6b55ac5682acf9d98035e54386d967366035a03a/multiprocess-0.70.19-py310-none-any.whl", hash = "sha256:97404393419dcb2a8385910864eedf47a3cadf82c66345b44f036420eb0b5d87", size = 134948 },
+    { url = "https://files.pythonhosted.org/packages/86/c2/dec9722dc3474c164a0b6bcd9a7ed7da542c98af8cabce05374abab35edd/multiprocess-0.70.19-py311-none-any.whl", hash = "sha256:928851ae7973aea4ce0eaf330bbdafb2e01398a91518d5c8818802845564f45c", size = 144457 },
+    { url = "https://files.pythonhosted.org/packages/71/70/38998b950a97ea279e6bd657575d22d1a2047256caf707d9a10fbce4f065/multiprocess-0.70.19-py312-none-any.whl", hash = "sha256:3a56c0e85dd5025161bac5ce138dcac1e49174c7d8e74596537e729fd5c53c28", size = 150281 },
+    { url = "https://files.pythonhosted.org/packages/7f/74/d2c27e03cb84251dfe7249b8e82923643c6d48fa4883b9476b025e7dc7eb/multiprocess-0.70.19-py313-none-any.whl", hash = "sha256:8d5eb4ec5017ba2fab4e34a747c6d2c2b6fecfe9e7236e77988db91580ada952", size = 156414 },
+    { url = "https://files.pythonhosted.org/packages/a0/61/af9115673a5870fd885247e2f1b68c4f1197737da315b520a91c757a861a/multiprocess-0.70.19-py314-none-any.whl", hash = "sha256:e8cc7fbdff15c0613f0a1f1f8744bef961b0a164c0ca29bdff53e9d2d93c5e5f", size = 160318 },
+    { url = "https://files.pythonhosted.org/packages/7e/82/69e539c4c2027f1e1697e09aaa2449243085a0edf81ae2c6341e84d769b6/multiprocess-0.70.19-py39-none-any.whl", hash = "sha256:0d4b4397ed669d371c81dcd1ef33fd384a44d6c3de1bd0ca7ac06d837720d3c5", size = 133477 },
+]
+
+[[package]]
+name = "numpy"
+version = "2.4.4"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/d7/9f/b8cef5bffa569759033adda9481211426f12f53299629b410340795c2514/numpy-2.4.4.tar.gz", hash = "sha256:2d390634c5182175533585cc89f3608a4682ccb173cc9bb940b2881c8d6f8fa0", size = 20731587 }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/28/05/32396bec30fb2263770ee910142f49c1476d08e8ad41abf8403806b520ce/numpy-2.4.4-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:15716cfef24d3a9762e3acdf87e27f58dc823d1348f765bbea6bef8c639bfa1b", size = 16689272 },
+    { url = "https://files.pythonhosted.org/packages/c5/f3/a983d28637bfcd763a9c7aafdb6d5c0ebf3d487d1e1459ffdb57e2f01117/numpy-2.4.4-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:23cbfd4c17357c81021f21540da84ee282b9c8fba38a03b7b9d09ba6b951421e", size = 14699573 },
+    { url = "https://files.pythonhosted.org/packages/9b/fd/e5ecca1e78c05106d98028114f5c00d3eddb41207686b2b7de3e477b0e22/numpy-2.4.4-cp312-cp312-macosx_14_0_arm64.whl", hash = "sha256:8b3b60bb7cba2c8c81837661c488637eee696f59a877788a396d33150c35d842", size = 5204782 },
+    { url = "https://files.pythonhosted.org/packages/de/2f/702a4594413c1a8632092beae8aba00f1d67947389369b3777aed783fdca/numpy-2.4.4-cp312-cp312-macosx_14_0_x86_64.whl", hash = "sha256:e4a010c27ff6f210ff4c6ef34394cd61470d01014439b192ec22552ee867f2a8", size = 6552038 },
+    { url = "https://files.pythonhosted.org/packages/7f/37/eed308a8f56cba4d1fdf467a4fc67ef4ff4bf1c888f5fc980481890104b1/numpy-2.4.4-cp312-cp312-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:f9e75681b59ddaa5e659898085ae0eaea229d054f2ac0c7e563a62205a700121", size = 15670666 },
+    { url = "https://files.pythonhosted.org/packages/0a/0d/0e3ecece05b7a7e87ab9fb587855548da437a061326fff64a223b6dcb78a/numpy-2.4.4-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:81f4a14bee47aec54f883e0cad2d73986640c1590eb9bfaaba7ad17394481e6e", size = 16645480 },
+    { url = "https://files.pythonhosted.org/packages/34/49/f2312c154b82a286758ee2f1743336d50651f8b5195db18cdb63675ff649/numpy-2.4.4-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:62d6b0f03b694173f9fcb1fb317f7222fd0b0b103e784c6549f5e53a27718c44", size = 17020036 },
+    { url = "https://files.pythonhosted.org/packages/7b/e9/736d17bd77f1b0ec4f9901aaec129c00d59f5d84d5e79bba540ef12c2330/numpy-2.4.4-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:fbc356aae7adf9e6336d336b9c8111d390a05df88f1805573ebb0807bd06fd1d", size = 18368643 },
+    { url = "https://files.pythonhosted.org/packages/63/f6/d417977c5f519b17c8a5c3bc9e8304b0908b0e21136fe43bf628a1343914/numpy-2.4.4-cp312-cp312-win32.whl", hash = "sha256:0d35aea54ad1d420c812bfa0385c71cd7cc5bcf7c65fed95fc2cd02fe8c79827", size = 5961117 },
+    { url = "https://files.pythonhosted.org/packages/2d/5b/e1deebf88ff431b01b7406ca3583ab2bbb90972bbe1c568732e49c844f7e/numpy-2.4.4-cp312-cp312-win_amd64.whl", hash = "sha256:b5f0362dc928a6ecd9db58868fca5e48485205e3855957bdedea308f8672ea4a", size = 12320584 },
+    { url = "https://files.pythonhosted.org/packages/58/89/e4e856ac82a68c3ed64486a544977d0e7bdd18b8da75b78a577ca31c4395/numpy-2.4.4-cp312-cp312-win_arm64.whl", hash = "sha256:846300f379b5b12cc769334464656bc882e0735d27d9726568bc932fdc49d5ec", size = 10221450 },
+    { url = "https://files.pythonhosted.org/packages/14/1d/d0a583ce4fefcc3308806a749a536c201ed6b5ad6e1322e227ee4848979d/numpy-2.4.4-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:08f2e31ed5e6f04b118e49821397f12767934cfdd12a1ce86a058f91e004ee50", size = 16684933 },
+    { url = "https://files.pythonhosted.org/packages/c1/62/2b7a48fbb745d344742c0277f01286dead15f3f68e4f359fbfcf7b48f70f/numpy-2.4.4-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:e823b8b6edc81e747526f70f71a9c0a07ac4e7ad13020aa736bb7c9d67196115", size = 14694532 },
+    { url = "https://files.pythonhosted.org/packages/e5/87/499737bfba066b4a3bebff24a8f1c5b2dee410b209bc6668c9be692580f0/numpy-2.4.4-cp313-cp313-macosx_14_0_arm64.whl", hash = "sha256:4a19d9dba1a76618dd86b164d608566f393f8ec6ac7c44f0cc879011c45e65af", size = 5199661 },
+    { url = "https://files.pythonhosted.org/packages/cd/da/464d551604320d1491bc345efed99b4b7034143a85787aab78d5691d5a0e/numpy-2.4.4-cp313-cp313-macosx_14_0_x86_64.whl", hash = "sha256:d2a8490669bfe99a233298348acc2d824d496dee0e66e31b66a6022c2ad74a5c", size = 6547539 },
+    { url = "https://files.pythonhosted.org/packages/7d/90/8d23e3b0dafd024bf31bdec225b3bb5c2dbfa6912f8a53b8659f21216cbf/numpy-2.4.4-cp313-cp313-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:45dbed2ab436a9e826e302fcdcbe9133f9b0006e5af7168afb8963a6520da103", size = 15668806 },
+    { url = "https://files.pythonhosted.org/packages/d1/73/a9d864e42a01896bb5974475438f16086be9ba1f0d19d0bb7a07427c4a8b/numpy-2.4.4-cp313-cp313-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:c901b15172510173f5cb310eae652908340f8dede90fff9e3bf6c0d8dfd92f83", size = 16632682 },
+    { url = "https://files.pythonhosted.org/packages/34/fb/14570d65c3bde4e202a031210475ae9cde9b7686a2e7dc97ee67d2833b35/numpy-2.4.4-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:99d838547ace2c4aace6c4f76e879ddfe02bb58a80c1549928477862b7a6d6ed", size = 17019810 },
+    { url = "https://files.pythonhosted.org/packages/8a/77/2ba9d87081fd41f6d640c83f26fb7351e536b7ce6dd9061b6af5904e8e46/numpy-2.4.4-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:0aec54fd785890ecca25a6003fd9a5aed47ad607bbac5cd64f836ad8666f4959", size = 18357394 },
+    { url = "https://files.pythonhosted.org/packages/a2/23/52666c9a41708b0853fa3b1a12c90da38c507a3074883823126d4e9d5b30/numpy-2.4.4-cp313-cp313-win32.whl", hash = "sha256:07077278157d02f65c43b1b26a3886bce886f95d20aabd11f87932750dfb14ed", size = 5959556 },
+    { url = "https://files.pythonhosted.org/packages/57/fb/48649b4971cde70d817cf97a2a2fdc0b4d8308569f1dd2f2611959d2e0cf/numpy-2.4.4-cp313-cp313-win_amd64.whl", hash = "sha256:5c70f1cc1c4efbe316a572e2d8b9b9cc44e89b95f79ca3331553fbb63716e2bf", size = 12317311 },
+    { url = "https://files.pythonhosted.org/packages/ba/d8/11490cddd564eb4de97b4579ef6bfe6a736cc07e94c1598590ae25415e01/numpy-2.4.4-cp313-cp313-win_arm64.whl", hash = "sha256:ef4059d6e5152fa1a39f888e344c73fdc926e1b2dd58c771d67b0acfbf2aa67d", size = 10222060 },
+    { url = "https://files.pythonhosted.org/packages/99/5d/dab4339177a905aad3e2221c915b35202f1ec30d750dd2e5e9d9a72b804b/numpy-2.4.4-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:4bbc7f303d125971f60ec0aaad5e12c62d0d2c925f0ab1273debd0e4ba37aba5", size = 14822302 },
+    { url = "https://files.pythonhosted.org/packages/eb/e4/0564a65e7d3d97562ed6f9b0fd0fb0a6f559ee444092f105938b50043876/numpy-2.4.4-cp313-cp313t-macosx_14_0_arm64.whl", hash = "sha256:4d6d57903571f86180eb98f8f0c839fa9ebbfb031356d87f1361be91e433f5b7", size = 5327407 },
+    { url = "https://files.pythonhosted.org/packages/29/8d/35a3a6ce5ad371afa58b4700f1c820f8f279948cca32524e0a695b0ded83/numpy-2.4.4-cp313-cp313t-macosx_14_0_x86_64.whl", hash = "sha256:4636de7fd195197b7535f231b5de9e4b36d2c440b6e566d2e4e4746e6af0ca93", size = 6647631 },
+    { url = "https://files.pythonhosted.org/packages/f4/da/477731acbd5a58a946c736edfdabb2ac5b34c3d08d1ba1a7b437fa0884df/numpy-2.4.4-cp313-cp313t-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:ad2e2ef14e0b04e544ea2fa0a36463f847f113d314aa02e5b402fdf910ef309e", size = 15727691 },
+    { url = "https://files.pythonhosted.org/packages/e6/db/338535d9b152beabeb511579598418ba0212ce77cf9718edd70262cc4370/numpy-2.4.4-cp313-cp313t-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:5a285b3b96f951841799528cd1f4f01cd70e7e0204b4abebac9463eecfcf2a40", size = 16681241 },
+    { url = "https://files.pythonhosted.org/packages/e2/a9/ad248e8f58beb7a0219b413c9c7d8151c5d285f7f946c3e26695bdbbe2df/numpy-2.4.4-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:f8474c4241bc18b750be2abea9d7a9ec84f46ef861dbacf86a4f6e043401f79e", size = 17085767 },
+    { url = "https://files.pythonhosted.org/packages/b5/1a/3b88ccd3694681356f70da841630e4725a7264d6a885c8d442a697e1146b/numpy-2.4.4-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:4e874c976154687c1f71715b034739b45c7711bec81db01914770373d125e392", size = 18403169 },
+    { url = "https://files.pythonhosted.org/packages/c2/c9/fcfd5d0639222c6eac7f304829b04892ef51c96a75d479214d77e3ce6e33/numpy-2.4.4-cp313-cp313t-win32.whl", hash = "sha256:9c585a1790d5436a5374bac930dad6ed244c046ed91b2b2a3634eb2971d21008", size = 6083477 },
+    { url = "https://files.pythonhosted.org/packages/d5/e3/3938a61d1c538aaec8ed6fd6323f57b0c2d2d2219512434c5c878db76553/numpy-2.4.4-cp313-cp313t-win_amd64.whl", hash = "sha256:93e15038125dc1e5345d9b5b68aa7f996ec33b98118d18c6ca0d0b7d6198b7e8", size = 12457487 },
+    { url = "https://files.pythonhosted.org/packages/97/6a/7e345032cc60501721ef94e0e30b60f6b0bd601f9174ebd36389a2b86d40/numpy-2.4.4-cp313-cp313t-win_arm64.whl", hash = "sha256:0dfd3f9d3adbe2920b68b5cd3d51444e13a10792ec7154cd0a2f6e74d4ab3233", size = 10292002 },
+    { url = "https://files.pythonhosted.org/packages/6e/06/c54062f85f673dd5c04cbe2f14c3acb8c8b95e3384869bb8cc9bff8cb9df/numpy-2.4.4-cp314-cp314-macosx_10_15_x86_64.whl", hash = "sha256:f169b9a863d34f5d11b8698ead99febeaa17a13ca044961aa8e2662a6c7766a0", size = 16684353 },
+    { url = "https://files.pythonhosted.org/packages/4c/39/8a320264a84404c74cc7e79715de85d6130fa07a0898f67fb5cd5bd79908/numpy-2.4.4-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:2483e4584a1cb3092da4470b38866634bafb223cbcd551ee047633fd2584599a", size = 14704914 },
+    { url = "https://files.pythonhosted.org/packages/91/fb/287076b2614e1d1044235f50f03748f31fa287e3dbe6abeb35cdfa351eca/numpy-2.4.4-cp314-cp314-macosx_14_0_arm64.whl", hash = "sha256:2d19e6e2095506d1736b7d80595e0f252d76b89f5e715c35e06e937679ea7d7a", size = 5210005 },
+    { url = "https://files.pythonhosted.org/packages/63/eb/fcc338595309910de6ecabfcef2419a9ce24399680bfb149421fa2df1280/numpy-2.4.4-cp314-cp314-macosx_14_0_x86_64.whl", hash = "sha256:6a246d5914aa1c820c9443ddcee9c02bec3e203b0c080349533fae17727dfd1b", size = 6544974 },
+    { url = "https://files.pythonhosted.org/packages/44/5d/e7e9044032a716cdfaa3fba27a8e874bf1c5f1912a1ddd4ed071bf8a14a6/numpy-2.4.4-cp314-cp314-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:989824e9faf85f96ec9c7761cd8d29c531ad857bfa1daa930cba85baaecf1a9a", size = 15684591 },
+    { url = "https://files.pythonhosted.org/packages/98/7c/21252050676612625449b4807d6b695b9ce8a7c9e1c197ee6216c8a65c7c/numpy-2.4.4-cp314-cp314-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:27a8d92cd10f1382a67d7cf4db7ce18341b66438bdd9f691d7b0e48d104c2a9d", size = 16637700 },
+    { url = "https://files.pythonhosted.org/packages/b1/29/56d2bbef9465db24ef25393383d761a1af4f446a1df9b8cded4fe3a5a5d7/numpy-2.4.4-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:e44319a2953c738205bf3354537979eaa3998ed673395b964c1176083dd46252", size = 17035781 },
+    { url = "https://files.pythonhosted.org/packages/e3/2b/a35a6d7589d21f44cea7d0a98de5ddcbb3d421b2622a5c96b1edf18707c3/numpy-2.4.4-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:e892aff75639bbef0d2a2cfd55535510df26ff92f63c92cd84ef8d4ba5a5557f", size = 18362959 },
+    { url = "https://files.pythonhosted.org/packages/64/c9/d52ec581f2390e0f5f85cbfd80fb83d965fc15e9f0e1aec2195faa142cde/numpy-2.4.4-cp314-cp314-win32.whl", hash = "sha256:1378871da56ca8943c2ba674530924bb8ca40cd228358a3b5f302ad60cf875fc", size = 6008768 },
+    { url = "https://files.pythonhosted.org/packages/fa/22/4cc31a62a6c7b74a8730e31a4274c5dc80e005751e277a2ce38e675e4923/numpy-2.4.4-cp314-cp314-win_amd64.whl", hash = "sha256:715d1c092715954784bc79e1174fc2a90093dc4dc84ea15eb14dad8abdcdeb74", size = 12449181 },
+    { url = "https://files.pythonhosted.org/packages/70/2e/14cda6f4d8e396c612d1bf97f22958e92148801d7e4f110cabebdc0eef4b/numpy-2.4.4-cp314-cp314-win_arm64.whl", hash = "sha256:2c194dd721e54ecad9ad387c1d35e63dce5c4450c6dc7dd5611283dda239aabb", size = 10496035 },
+    { url = "https://files.pythonhosted.org/packages/b1/e8/8fed8c8d848d7ecea092dc3469643f9d10bc3a134a815a3b033da1d2039b/numpy-2.4.4-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:2aa0613a5177c264ff5921051a5719d20095ea586ca88cc802c5c218d1c67d3e", size = 14824958 },
+    { url = "https://files.pythonhosted.org/packages/05/1a/d8007a5138c179c2bf33ef44503e83d70434d2642877ee8fbb230e7c0548/numpy-2.4.4-cp314-cp314t-macosx_14_0_arm64.whl", hash = "sha256:42c16925aa5a02362f986765f9ebabf20de75cdefdca827d14315c568dcab113", size = 5330020 },
+    { url = "https://files.pythonhosted.org/packages/99/64/ffb99ac6ae93faf117bcbd5c7ba48a7f45364a33e8e458545d3633615dda/numpy-2.4.4-cp314-cp314t-macosx_14_0_x86_64.whl", hash = "sha256:874f200b2a981c647340f841730fc3a2b54c9d940566a3c4149099591e2c4c3d", size = 6650758 },
+    { url = "https://files.pythonhosted.org/packages/6e/6e/795cc078b78a384052e73b2f6281ff7a700e9bf53bcce2ee579d4f6dd879/numpy-2.4.4-cp314-cp314t-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:c9b39d38a9bd2ae1becd7eac1303d031c5c110ad31f2b319c6e7d98b135c934d", size = 15729948 },
+    { url = "https://files.pythonhosted.org/packages/5f/86/2acbda8cc2af5f3d7bfc791192863b9e3e19674da7b5e533fded124d1299/numpy-2.4.4-cp314-cp314t-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:b268594bccac7d7cf5844c7732e3f20c50921d94e36d7ec9b79e9857694b1b2f", size = 16679325 },
+    { url = "https://files.pythonhosted.org/packages/bc/59/cafd83018f4aa55e0ac6fa92aa066c0a1877b77a615ceff1711c260ffae8/numpy-2.4.4-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:ac6b31e35612a26483e20750126d30d0941f949426974cace8e6b5c58a3657b0", size = 17084883 },
+    { url = "https://files.pythonhosted.org/packages/f0/85/a42548db84e65ece46ab2caea3d3f78b416a47af387fcbb47ec28e660dc2/numpy-2.4.4-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:8e3ed142f2728df44263aaf5fb1f5b0b99f4070c553a0d7f033be65338329150", size = 18403474 },
+    { url = "https://files.pythonhosted.org/packages/ed/ad/483d9e262f4b831000062e5d8a45e342166ec8aaa1195264982bca267e62/numpy-2.4.4-cp314-cp314t-win32.whl", hash = "sha256:dddbbd259598d7240b18c9d87c56a9d2fb3b02fe266f49a7c101532e78c1d871", size = 6155500 },
+    { url = "https://files.pythonhosted.org/packages/c7/03/2fc4e14c7bd4ff2964b74ba90ecb8552540b6315f201df70f137faa5c589/numpy-2.4.4-cp314-cp314t-win_amd64.whl", hash = "sha256:a7164afb23be6e37ad90b2f10426149fd75aee07ca55653d2aa41e66c4ef697e", size = 12637755 },
+    { url = "https://files.pythonhosted.org/packages/58/78/548fb8e07b1a341746bfbecb32f2c268470f45fa028aacdbd10d9bc73aab/numpy-2.4.4-cp314-cp314t-win_arm64.whl", hash = "sha256:ba203255017337d39f89bdd58417f03c4426f12beed0440cfd933cb15f8669c7", size = 10566643 },
+]
+
+[[package]]
+name = "packaging"
+version = "26.2"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/d7/f1/e7a6dd94a8d4a5626c03e4e99c87f241ba9e350cd9e6d75123f992427270/packaging-26.2.tar.gz", hash = "sha256:ff452ff5a3e828ce110190feff1178bb1f2ea2281fa2075aadb987c2fb221661", size = 228134 }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/df/b2/87e62e8c3e2f4b32e5fe99e0b86d576da1312593b39f47d8ceef365e95ed/packaging-26.2-py3-none-any.whl", hash = "sha256:5fc45236b9446107ff2415ce77c807cee2862cb6fac22b8a73826d0693b0980e", size = 100195 },
+]
+
+[[package]]
+name = "pandas"
+version = "3.0.3"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "numpy" },
+    { name = "python-dateutil" },
+    { name = "tzdata", marker = "sys_platform == 'emscripten' or sys_platform == 'win32'" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/f8/87/4341c6252d1c47b08768c3d25ac487362bf403f0313ddae4a2a26c9b1b4c/pandas-3.0.3.tar.gz", hash = "sha256:696a4a00a2a2a35d4e5deb3fc946641b96c944f02230e4f76137fe35d806c4fc", size = 4651414 }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/24/f1/392f8c5bfc16f66a0d2d41561c01627c228fe7ed2a0d056ef11315042570/pandas-3.0.3-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:fed2ff7fd9779120e388e285fc029bd5cf9490cdd2e4166a9ee22c0e49a9ab09", size = 10357846 },
+    { url = "https://files.pythonhosted.org/packages/cf/3d/b16412745651e855f357e5e66930248688378853a6e2698a214e331fba1f/pandas-3.0.3-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:b168fc218fd80a6cbdbdbc1a97ddc7889ed057d7eb45f50d866ceab5f39904c4", size = 9899550 },
+    { url = "https://files.pythonhosted.org/packages/31/a8/fa2535168fffcedf67f4f6de28d2dd903a747ca7c8ea6989451aaeb3a92f/pandas-3.0.3-cp312-cp312-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:0383c72c75cdcca61a9e116e611143902dbfd08bff356829c2f6d1cf40a9ca8c", size = 10412965 },
+    { url = "https://files.pythonhosted.org/packages/65/b6/09b01cdbc15224e2850365192d17b7bdebb8bdbd8780ed221fcdf0d9a515/pandas-3.0.3-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:6dc0b3fd2169c9157deed50b4d519553a3655c8c6a96027136d654592be973a9", size = 10894600 },
+    { url = "https://files.pythonhosted.org/packages/c9/a4/2eb28f2fccb4ced4a2c79ab2a5dee9ade1ebf44922ebad6fea158c9f95d4/pandas-3.0.3-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:7e65d5407dc0b394f509699650e4a2ec01c0514f21850f453fa60f3be79a5dbf", size = 11422824 },
+    { url = "https://files.pythonhosted.org/packages/f8/45/830bb57f533a4604b355e07edcb8ea18cf88b5f94e5fca92f27052d7c597/pandas-3.0.3-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:f8894dc474d648fe7b6ff0ca9b0bd73950d19952bc1a6534540762c5d79d305c", size = 11950889 },
+    { url = "https://files.pythonhosted.org/packages/b9/c5/fc1b368f303087d20e8c9bf3d6ceb186263cfac0ade735cd938538bea839/pandas-3.0.3-cp312-cp312-win_amd64.whl", hash = "sha256:c7be265b62cef88e253a941e4698604973736dcfe242fdb5198f0f7bc473cdcc", size = 9755463 },
+    { url = "https://files.pythonhosted.org/packages/86/bd/fda8f9705b1b09c6ebe14bfc0fa0e4ec8584d54ea673628f157ff55131af/pandas-3.0.3-cp312-cp312-win_arm64.whl", hash = "sha256:557409bc4178e70ee8d9ddb494798e51ebf6ea59330f6be22c51bab2a7db6c49", size = 9066158 },
+    { url = "https://files.pythonhosted.org/packages/c5/90/62d8302883c44308c477e222c3daf7c813a34c8e96985882fbd53d964352/pandas-3.0.3-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:67b3b64c11910cfa29f4e94a14d3bff9ee693b6fc76055e7cad549cee0aec5fa", size = 10331071 },
+    { url = "https://files.pythonhosted.org/packages/7f/ae/6a6493c783a101f165e4356953ba3c74d6f77f0042fa7d753da9dfbb640c/pandas-3.0.3-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:39436b377d56d2a2e52d0395bdbee171f01068e99af5250509aceeb929f765c7", size = 9875690 },
+    { url = "https://files.pythonhosted.org/packages/62/7c/5df8e9f56c69a2769fbe9382a5ef8f2658c007e376434e1e2cbb57ad895f/pandas-3.0.3-cp313-cp313-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:d4be06d68f9ddcfc645b87534911da79a8fbffc7573c80e0edcf42a5020624d8", size = 10381634 },
+    { url = "https://files.pythonhosted.org/packages/99/68/1237369725aa617bb358263d535803e3053fdbc593513ec5ed9c9896b5b6/pandas-3.0.3-cp313-cp313-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:a4eeb6830daf35a71cc09649bd823e2b542dac246cdee9614c6e4bd65028cd6a", size = 10891243 },
+    { url = "https://files.pythonhosted.org/packages/25/93/77d108e8af7222b4a503ebde0e30215b1c2e4f8e53a526431890f22d5586/pandas-3.0.3-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:1928e07221f82db493cd4af1e23c1bfca524a19a4699887975bff68f49a72bfb", size = 11388659 },
+    { url = "https://files.pythonhosted.org/packages/d0/bd/eff5b4399f332ac386c853f6cd2bd3fa2ca0061b9f36ecd9c4d7c4265649/pandas-3.0.3-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:51b1fe551acb77dac643c6fda86084d8d446c10fe64b06a9cc29c4cc8540e7f2", size = 11942880 },
+    { url = "https://files.pythonhosted.org/packages/2c/20/559ace4200982c3887d0b86bfd0d856a2143ef8ddab63cc07934951a964c/pandas-3.0.3-cp313-cp313-win_amd64.whl", hash = "sha256:a82d532a3351d435432cd913edbccaf8b8e01d4dd0e5ced5a8d2e8ecd94c7e44", size = 9757091 },
+    { url = "https://files.pythonhosted.org/packages/3a/66/69055a09fe200f29f922a3eeec4804611900b95f52d932ece3393c3c0c19/pandas-3.0.3-cp313-cp313-win_arm64.whl", hash = "sha256:275c14e0fce14a2ec20eee474aecd305478ea3c1e6f6a9d8fe219a165542717e", size = 9057282 },
+    { url = "https://files.pythonhosted.org/packages/57/0e/efe801b0e6811e8e650cd21b7f2608e30f08a7067e2bf6e8752b0d56ee3c/pandas-3.0.3-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:46997386d528eb40376ecd6b033cf4a8a1e5282580f68f43de875b78cba2199d", size = 10767016 },
+    { url = "https://files.pythonhosted.org/packages/ea/dc/eb55135a1d5f0f0519f28da1f609a206d2cad1f9c35c32d51e38dd7261ae/pandas-3.0.3-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:261e308dfb22448384b7580cf719d2f998fe2966c92893c3e77d14008af1f066", size = 10420210 },
+    { url = "https://files.pythonhosted.org/packages/c6/3e/b1d5d955ce33ffecb407465a60bc32769d74fcf68224b7ae67ae11d4dea4/pandas-3.0.3-cp313-cp313t-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:dd1a5d1def6a46002e964510bdc67c368aa0951df5d1d9f8365336f5a1f490cd", size = 10336126 },
+    { url = "https://files.pythonhosted.org/packages/f5/76/a01261711ab60a22d71b862f0de20e4c504bf80457270ad8cb42110f6abc/pandas-3.0.3-cp313-cp313t-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:d72828c20c6d6e83e1e22a6a3b47b326b71664112fa9705dcbccfd7a39b62085", size = 10728051 },
+    { url = "https://files.pythonhosted.org/packages/e9/21/ea191195e587b18cf682e97f433f81b2d0fbe341380e80a3e0d6e4403c8e/pandas-3.0.3-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:d26cbe1fcfc12e8fd900e2454163e466b2d3af84f7c75481df7683ffc073d870", size = 11350796 },
+    { url = "https://files.pythonhosted.org/packages/64/69/f0eaaf54939f0e8c6768fd06be9af2cef9b36048b96dfb9e1b2c685a807e/pandas-3.0.3-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:3e91cec1879ada0624fc3dc9953c5cbd60208e59c0db28f540c5d6d47502422f", size = 11799741 },
+    { url = "https://files.pythonhosted.org/packages/45/a4/865e0e510cae5fc2194de4db28be638952de942571ba9125934fd9c01d47/pandas-3.0.3-cp313-cp313t-win_amd64.whl", hash = "sha256:08d789b41f87e0905880e293cedf6197ce71fe67cc081358b1e148a491b9bd13", size = 10499958 },
+    { url = "https://files.pythonhosted.org/packages/86/54/effdcc3c0ff7a08037889200e148ebe94c16c4f653be078c7b3675955df1/pandas-3.0.3-cp314-cp314-macosx_10_15_x86_64.whl", hash = "sha256:3650109c0f22879df8bd6179ab9ee3d7f1d1d4e7e0094a3f0032d9f51e2e64ac", size = 10336065 },
+    { url = "https://files.pythonhosted.org/packages/68/10/bf2d6738d72748b961a3751ab89522d58c54efc36a8e1a12161216cd45cf/pandas-3.0.3-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:bab900348131a7db1f69a7309ef141fd5680f1487094193bcbbb61791573bf8f", size = 9926101 },
+    { url = "https://files.pythonhosted.org/packages/ae/e9/e35cf11c8a136e757b956f5f0efdcaa50aecde85ea055f1898dfc68262f3/pandas-3.0.3-cp314-cp314-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:ba7e08b9ac1d54569cd1e256e3668975ed624d6826f7b68df0342b012007bddb", size = 10457553 },
+    { url = "https://files.pythonhosted.org/packages/58/3b/1cdec6772bdbaf7b25dab360c59f03cadf05492dd724c6540af905389b07/pandas-3.0.3-cp314-cp314-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:9d71c63ae4ebdbf70209742096f1fc46a83a0613c99d4b23766cced9ff8cd62a", size = 10914065 },
+    { url = "https://files.pythonhosted.org/packages/c4/c2/1ef644445fcd72e3627bceec77e3560636f87ddce4ed841afe76b83b5bf9/pandas-3.0.3-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:e3a2ec42c98ffa2565a67e08e218d06d72576d758d90facb7c00805194d8f360", size = 11459188 },
+    { url = "https://files.pythonhosted.org/packages/7e/49/4d8d4f42cbc9c4adc7a1870f269c02cbd6cd40d059622c06fb298addcbad/pandas-3.0.3-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:335f62418ed562cfc3c49e9e196375c28b729dcef8543abf4f9438e381bf3c76", size = 11982966 },
+    { url = "https://files.pythonhosted.org/packages/38/55/792619469bab9882d8bbd5865d45a72f6478762d04a9af4bf0d08c503e95/pandas-3.0.3-cp314-cp314-win_amd64.whl", hash = "sha256:3c20a521bbb85902f79f7270c80a59e1b5452d96d170c034f207181870f97ac5", size = 9876755 },
+    { url = "https://files.pythonhosted.org/packages/2a/af/33c469653b0ba03b50c3a98192d4c07f0c75c66b263ceb097fce0ee97d31/pandas-3.0.3-cp314-cp314-win_arm64.whl", hash = "sha256:a2d2dff8a04f3917b55ab3910c32990f8ddf7eceba114947838cefa976a68977", size = 9198658 },
+    { url = "https://files.pythonhosted.org/packages/a2/fa/b8c257bd76b8bd060c3a9151c1fca05e9b9c5e3af5d0f549c0356f6d143d/pandas-3.0.3-cp314-cp314t-macosx_10_15_x86_64.whl", hash = "sha256:0d589105b3c14645af1738ff279b2995102d8f7a03b0a66dc8d95550eb513e04", size = 10787242 },
+    { url = "https://files.pythonhosted.org/packages/54/eb/f19206ffb0bf1919002969aa448b4702c6594845156a6f8050674855aac3/pandas-3.0.3-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:13fc1e853d9e04743d11ba75a985ccbc2a317fe07d8af61e445a6fd24dacd6a6", size = 10436369 },
+    { url = "https://files.pythonhosted.org/packages/fd/24/c7c39fb4fe22b71a0c2d78bf0c585c600092d85f94f086d2b3b2f6ca27e2/pandas-3.0.3-cp314-cp314t-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:819959dab7bbd0049c15623fbac4e29a191b9528160a61fb1032242d8ced2d9c", size = 10358306 },
+    { url = "https://files.pythonhosted.org/packages/16/ec/dd2a9eb7fa1204df88c0864164e35b228ac581062ac612ba0a67fd812e4c/pandas-3.0.3-cp314-cp314t-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:60ae316d3fd75d1858d450d0db0103ea2be3e7d4a95ec2f064f7e2ae63f7b028", size = 10758394 },
+    { url = "https://files.pythonhosted.org/packages/95/6e/00c61ea8e85b4f6d8d35e11852a1a4998fc7fafc91c6a602d1cc9c972d64/pandas-3.0.3-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:bd3a518890b400d32f9023722dc9a9a5c969f00b415419a3c06c043f09bb5d7d", size = 11375717 },
+    { url = "https://files.pythonhosted.org/packages/31/89/8fc1c268969fac43688d65fd92e67df24bd128d53cb4d2eee534cd307399/pandas-3.0.3-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:9c39be2d709d01fa972a0cabc522389fceca4f3969332ba25a7d6c5802cf976a", size = 11828897 },
+    { url = "https://files.pythonhosted.org/packages/56/3b/e7d20dea247a3e6dc0bd8a6953854afbedc03951def4e7371e05e7263e25/pandas-3.0.3-cp314-cp314t-win_amd64.whl", hash = "sha256:4db8c527972a821cf5286b40ccc57642a39bc62e62022b42f99f8a67fca8c3a1", size = 10900855 },
+    { url = "https://files.pythonhosted.org/packages/0f/54/68a0978d1ef8502b8492099beaa6e7a0c1b32e3b5d4f677f5810cb08711c/pandas-3.0.3-cp314-cp314t-win_arm64.whl", hash = "sha256:b2c95f8bfc1ee412bf482605d7bfd30c12d1d26bd59fdd91efeef1d4718decb1", size = 9466464 },
+]
+
+[[package]]
+name = "pillow"
+version = "12.2.0"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/8c/21/c2bcdd5906101a30244eaffc1b6e6ce71a31bd0742a01eb89e660ebfac2d/pillow-12.2.0.tar.gz", hash = "sha256:a830b1a40919539d07806aa58e1b114df53ddd43213d9c8b75847eee6c0182b5", size = 46987819 }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/58/be/7482c8a5ebebbc6470b3eb791812fff7d5e0216c2be3827b30b8bb6603ed/pillow-12.2.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:2d192a155bbcec180f8564f693e6fd9bccff5a7af9b32e2e4bf8c9c69dbad6b5", size = 5308279 },
+    { url = "https://files.pythonhosted.org/packages/d8/95/0a351b9289c2b5cbde0bacd4a83ebc44023e835490a727b2a3bd60ddc0f4/pillow-12.2.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:f3f40b3c5a968281fd507d519e444c35f0ff171237f4fdde090dd60699458421", size = 4695490 },
+    { url = "https://files.pythonhosted.org/packages/de/af/4e8e6869cbed569d43c416fad3dc4ecb944cb5d9492defaed89ddd6fe871/pillow-12.2.0-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:03e7e372d5240cc23e9f07deca4d775c0817bffc641b01e9c3af208dbd300987", size = 6284462 },
+    { url = "https://files.pythonhosted.org/packages/e9/9e/c05e19657fd57841e476be1ab46c4d501bffbadbafdc31a6d665f8b737b6/pillow-12.2.0-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:b86024e52a1b269467a802258c25521e6d742349d760728092e1bc2d135b4d76", size = 8094744 },
+    { url = "https://files.pythonhosted.org/packages/2b/54/1789c455ed10176066b6e7e6da1b01e50e36f94ba584dc68d9eebfe9156d/pillow-12.2.0-cp312-cp312-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:7371b48c4fa448d20d2714c9a1f775a81155050d383333e0a6c15b1123dda005", size = 6398371 },
+    { url = "https://files.pythonhosted.org/packages/43/e3/fdc657359e919462369869f1c9f0e973f353f9a9ee295a39b1fea8ee1a77/pillow-12.2.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:62f5409336adb0663b7caa0da5c7d9e7bdbaae9ce761d34669420c2a801b2780", size = 7087215 },
+    { url = "https://files.pythonhosted.org/packages/8b/f8/2f6825e441d5b1959d2ca5adec984210f1ec086435b0ed5f52c19b3b8a6e/pillow-12.2.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:01afa7cf67f74f09523699b4e88c73fb55c13346d212a59a2db1f86b0a63e8c5", size = 6509783 },
+    { url = "https://files.pythonhosted.org/packages/67/f9/029a27095ad20f854f9dba026b3ea6428548316e057e6fc3545409e86651/pillow-12.2.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:fc3d34d4a8fbec3e88a79b92e5465e0f9b842b628675850d860b8bd300b159f5", size = 7212112 },
+    { url = "https://files.pythonhosted.org/packages/be/42/025cfe05d1be22dbfdb4f264fe9de1ccda83f66e4fc3aac94748e784af04/pillow-12.2.0-cp312-cp312-win32.whl", hash = "sha256:58f62cc0f00fd29e64b29f4fd923ffdb3859c9f9e6105bfc37ba1d08994e8940", size = 6378489 },
+    { url = "https://files.pythonhosted.org/packages/5d/7b/25a221d2c761c6a8ae21bfa3874988ff2583e19cf8a27bf2fee358df7942/pillow-12.2.0-cp312-cp312-win_amd64.whl", hash = "sha256:7f84204dee22a783350679a0333981df803dac21a0190d706a50475e361c93f5", size = 7084129 },
+    { url = "https://files.pythonhosted.org/packages/10/e1/542a474affab20fd4a0f1836cb234e8493519da6b76899e30bcc5d990b8b/pillow-12.2.0-cp312-cp312-win_arm64.whl", hash = "sha256:af73337013e0b3b46f175e79492d96845b16126ddf79c438d7ea7ff27783a414", size = 2463612 },
+    { url = "https://files.pythonhosted.org/packages/4a/01/53d10cf0dbad820a8db274d259a37ba50b88b24768ddccec07355382d5ad/pillow-12.2.0-cp313-cp313-ios_13_0_arm64_iphoneos.whl", hash = "sha256:8297651f5b5679c19968abefd6bb84d95fe30ef712eb1b2d9b2d31ca61267f4c", size = 4100837 },
+    { url = "https://files.pythonhosted.org/packages/0f/98/f3a6657ecb698c937f6c76ee564882945f29b79bad496abcba0e84659ec5/pillow-12.2.0-cp313-cp313-ios_13_0_arm64_iphonesimulator.whl", hash = "sha256:50d8520da2a6ce0af445fa6d648c4273c3eeefbc32d7ce049f22e8b5c3daecc2", size = 4176528 },
+    { url = "https://files.pythonhosted.org/packages/69/bc/8986948f05e3ea490b8442ea1c1d4d990b24a7e43d8a51b2c7d8b1dced36/pillow-12.2.0-cp313-cp313-ios_13_0_x86_64_iphonesimulator.whl", hash = "sha256:766cef22385fa1091258ad7e6216792b156dc16d8d3fa607e7545b2b72061f1c", size = 3640401 },
+    { url = "https://files.pythonhosted.org/packages/34/46/6c717baadcd62bc8ed51d238d521ab651eaa74838291bda1f86fe1f864c9/pillow-12.2.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:5d2fd0fa6b5d9d1de415060363433f28da8b1526c1c129020435e186794b3795", size = 5308094 },
+    { url = "https://files.pythonhosted.org/packages/71/43/905a14a8b17fdb1ccb58d282454490662d2cb89a6bfec26af6d3520da5ec/pillow-12.2.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:56b25336f502b6ed02e889f4ece894a72612fe885889a6e8c4c80239ff6e5f5f", size = 4695402 },
+    { url = "https://files.pythonhosted.org/packages/73/dd/42107efcb777b16fa0393317eac58f5b5cf30e8392e266e76e51cff28c3d/pillow-12.2.0-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:f1c943e96e85df3d3478f7b691f229887e143f81fedab9b20205349ab04d73ed", size = 6280005 },
+    { url = "https://files.pythonhosted.org/packages/a8/68/b93e09e5e8549019e61acf49f65b1a8530765a7f812c77a7461bca7e4494/pillow-12.2.0-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:03f6fab9219220f041c74aeaa2939ff0062bd5c364ba9ce037197f4c6d498cd9", size = 8090669 },
+    { url = "https://files.pythonhosted.org/packages/4b/6e/3ccb54ce8ec4ddd1accd2d89004308b7b0b21c4ac3d20fa70af4760a4330/pillow-12.2.0-cp313-cp313-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:5cdfebd752ec52bf5bb4e35d9c64b40826bc5b40a13df7c3cda20a2c03a0f5ed", size = 6395194 },
+    { url = "https://files.pythonhosted.org/packages/67/ee/21d4e8536afd1a328f01b359b4d3997b291ffd35a237c877b331c1c3b71c/pillow-12.2.0-cp313-cp313-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:eedf4b74eda2b5a4b2b2fb4c006d6295df3bf29e459e198c90ea48e130dc75c3", size = 7082423 },
+    { url = "https://files.pythonhosted.org/packages/78/5f/e9f86ab0146464e8c133fe85df987ed9e77e08b29d8d35f9f9f4d6f917ba/pillow-12.2.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:00a2865911330191c0b818c59103b58a5e697cae67042366970a6b6f1b20b7f9", size = 6505667 },
+    { url = "https://files.pythonhosted.org/packages/ed/1e/409007f56a2fdce61584fd3acbc2bbc259857d555196cedcadc68c015c82/pillow-12.2.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:1e1757442ed87f4912397c6d35a0db6a7b52592156014706f17658ff58bbf795", size = 7208580 },
+    { url = "https://files.pythonhosted.org/packages/23/c4/7349421080b12fb35414607b8871e9534546c128a11965fd4a7002ccfbee/pillow-12.2.0-cp313-cp313-win32.whl", hash = "sha256:144748b3af2d1b358d41286056d0003f47cb339b8c43a9ea42f5fea4d8c66b6e", size = 6375896 },
+    { url = "https://files.pythonhosted.org/packages/3f/82/8a3739a5e470b3c6cbb1d21d315800d8e16bff503d1f16b03a4ec3212786/pillow-12.2.0-cp313-cp313-win_amd64.whl", hash = "sha256:390ede346628ccc626e5730107cde16c42d3836b89662a115a921f28440e6a3b", size = 7081266 },
+    { url = "https://files.pythonhosted.org/packages/c3/25/f968f618a062574294592f668218f8af564830ccebdd1fa6200f598e65c5/pillow-12.2.0-cp313-cp313-win_arm64.whl", hash = "sha256:8023abc91fba39036dbce14a7d6535632f99c0b857807cbbbf21ecc9f4717f06", size = 2463508 },
+    { url = "https://files.pythonhosted.org/packages/4d/a4/b342930964e3cb4dce5038ae34b0eab4653334995336cd486c5a8c25a00c/pillow-12.2.0-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:042db20a421b9bafecc4b84a8b6e444686bd9d836c7fd24542db3e7df7baad9b", size = 5309927 },
+    { url = "https://files.pythonhosted.org/packages/9f/de/23198e0a65a9cf06123f5435a5d95cea62a635697f8f03d134d3f3a96151/pillow-12.2.0-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:dd025009355c926a84a612fecf58bb315a3f6814b17ead51a8e48d3823d9087f", size = 4698624 },
+    { url = "https://files.pythonhosted.org/packages/01/a6/1265e977f17d93ea37aa28aa81bad4fa597933879fac2520d24e021c8da3/pillow-12.2.0-cp313-cp313t-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:88ddbc66737e277852913bd1e07c150cc7bb124539f94c4e2df5344494e0a612", size = 6321252 },
+    { url = "https://files.pythonhosted.org/packages/3c/83/5982eb4a285967baa70340320be9f88e57665a387e3a53a7f0db8231a0cd/pillow-12.2.0-cp313-cp313t-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:d362d1878f00c142b7e1a16e6e5e780f02be8195123f164edf7eddd911eefe7c", size = 8126550 },
+    { url = "https://files.pythonhosted.org/packages/4e/48/6ffc514adce69f6050d0753b1a18fd920fce8cac87620d5a31231b04bfc5/pillow-12.2.0-cp313-cp313t-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:2c727a6d53cb0018aadd8018c2b938376af27914a68a492f59dfcaca650d5eea", size = 6433114 },
+    { url = "https://files.pythonhosted.org/packages/36/a3/f9a77144231fb8d40ee27107b4463e205fa4677e2ca2548e14da5cf18dce/pillow-12.2.0-cp313-cp313t-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:efd8c21c98c5cc60653bcb311bef2ce0401642b7ce9d09e03a7da87c878289d4", size = 7115667 },
+    { url = "https://files.pythonhosted.org/packages/c1/fc/ac4ee3041e7d5a565e1c4fd72a113f03b6394cc72ab7089d27608f8aaccb/pillow-12.2.0-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:9f08483a632889536b8139663db60f6724bfcb443c96f1b18855860d7d5c0fd4", size = 6538966 },
+    { url = "https://files.pythonhosted.org/packages/c0/a8/27fb307055087f3668f6d0a8ccb636e7431d56ed0750e07a60547b1e083e/pillow-12.2.0-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:dac8d77255a37e81a2efcbd1fc05f1c15ee82200e6c240d7e127e25e365c39ea", size = 7238241 },
+    { url = "https://files.pythonhosted.org/packages/ad/4b/926ab182c07fccae9fcb120043464e1ff1564775ec8864f21a0ebce6ac25/pillow-12.2.0-cp313-cp313t-win32.whl", hash = "sha256:ee3120ae9dff32f121610bb08e4313be87e03efeadfc6c0d18f89127e24d0c24", size = 6379592 },
+    { url = "https://files.pythonhosted.org/packages/c2/c4/f9e476451a098181b30050cc4c9a3556b64c02cf6497ea421ac047e89e4b/pillow-12.2.0-cp313-cp313t-win_amd64.whl", hash = "sha256:325ca0528c6788d2a6c3d40e3568639398137346c3d6e66bb61db96b96511c98", size = 7085542 },
+    { url = "https://files.pythonhosted.org/packages/00/a4/285f12aeacbe2d6dc36c407dfbbe9e96d4a80b0fb710a337f6d2ad978c75/pillow-12.2.0-cp313-cp313t-win_arm64.whl", hash = "sha256:2e5a76d03a6c6dcef67edabda7a52494afa4035021a79c8558e14af25313d453", size = 2465765 },
+    { url = "https://files.pythonhosted.org/packages/bf/98/4595daa2365416a86cb0d495248a393dfc84e96d62ad080c8546256cb9c0/pillow-12.2.0-cp314-cp314-ios_13_0_arm64_iphoneos.whl", hash = "sha256:3adc9215e8be0448ed6e814966ecf3d9952f0ea40eb14e89a102b87f450660d8", size = 4100848 },
+    { url = "https://files.pythonhosted.org/packages/0b/79/40184d464cf89f6663e18dfcf7ca21aae2491fff1a16127681bf1fa9b8cf/pillow-12.2.0-cp314-cp314-ios_13_0_arm64_iphonesimulator.whl", hash = "sha256:6a9adfc6d24b10f89588096364cc726174118c62130c817c2837c60cf08a392b", size = 4176515 },
+    { url = "https://files.pythonhosted.org/packages/b0/63/703f86fd4c422a9cf722833670f4f71418fb116b2853ff7da722ea43f184/pillow-12.2.0-cp314-cp314-ios_13_0_x86_64_iphonesimulator.whl", hash = "sha256:6a6e67ea2e6feda684ed370f9a1c52e7a243631c025ba42149a2cc5934dec295", size = 3640159 },
+    { url = "https://files.pythonhosted.org/packages/71/e0/fb22f797187d0be2270f83500aab851536101b254bfa1eae10795709d283/pillow-12.2.0-cp314-cp314-macosx_10_15_x86_64.whl", hash = "sha256:2bb4a8d594eacdfc59d9e5ad972aa8afdd48d584ffd5f13a937a664c3e7db0ed", size = 5312185 },
+    { url = "https://files.pythonhosted.org/packages/ba/8c/1a9e46228571de18f8e28f16fabdfc20212a5d019f3e3303452b3f0a580d/pillow-12.2.0-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:80b2da48193b2f33ed0c32c38140f9d3186583ce7d516526d462645fd98660ae", size = 4695386 },
+    { url = "https://files.pythonhosted.org/packages/70/62/98f6b7f0c88b9addd0e87c217ded307b36be024d4ff8869a812b241d1345/pillow-12.2.0-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:22db17c68434de69d8ecfc2fe821569195c0c373b25cccb9cbdacf2c6e53c601", size = 6280384 },
+    { url = "https://files.pythonhosted.org/packages/5e/03/688747d2e91cfbe0e64f316cd2e8005698f76ada3130d0194664174fa5de/pillow-12.2.0-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:7b14cc0106cd9aecda615dd6903840a058b4700fcb817687d0ee4fc8b6e389be", size = 8091599 },
+    { url = "https://files.pythonhosted.org/packages/f6/35/577e22b936fcdd66537329b33af0b4ccfefaeabd8aec04b266528cddb33c/pillow-12.2.0-cp314-cp314-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:8cbeb542b2ebc6fcdacabf8aca8c1a97c9b3ad3927d46b8723f9d4f033288a0f", size = 6396021 },
+    { url = "https://files.pythonhosted.org/packages/11/8d/d2532ad2a603ca2b93ad9f5135732124e57811d0168155852f37fbce2458/pillow-12.2.0-cp314-cp314-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:4bfd07bc812fbd20395212969e41931001fd59eb55a60658b0e5710872e95286", size = 7083360 },
+    { url = "https://files.pythonhosted.org/packages/5e/26/d325f9f56c7e039034897e7380e9cc202b1e368bfd04d4cbe6a441f02885/pillow-12.2.0-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:9aba9a17b623ef750a4d11b742cbafffeb48a869821252b30ee21b5e91392c50", size = 6507628 },
+    { url = "https://files.pythonhosted.org/packages/5f/f7/769d5632ffb0988f1c5e7660b3e731e30f7f8ec4318e94d0a5d674eb65a4/pillow-12.2.0-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:deede7c263feb25dba4e82ea23058a235dcc2fe1f6021025dc71f2b618e26104", size = 7209321 },
+    { url = "https://files.pythonhosted.org/packages/6a/7a/c253e3c645cd47f1aceea6a8bacdba9991bf45bb7dfe927f7c893e89c93c/pillow-12.2.0-cp314-cp314-win32.whl", hash = "sha256:632ff19b2778e43162304d50da0181ce24ac5bb8180122cbe1bf4673428328c7", size = 6479723 },
+    { url = "https://files.pythonhosted.org/packages/cd/8b/601e6566b957ca50e28725cb6c355c59c2c8609751efbecd980db44e0349/pillow-12.2.0-cp314-cp314-win_amd64.whl", hash = "sha256:4e6c62e9d237e9b65fac06857d511e90d8461a32adcc1b9065ea0c0fa3a28150", size = 7217400 },
+    { url = "https://files.pythonhosted.org/packages/d6/94/220e46c73065c3e2951bb91c11a1fb636c8c9ad427ac3ce7d7f3359b9b2f/pillow-12.2.0-cp314-cp314-win_arm64.whl", hash = "sha256:b1c1fbd8a5a1af3412a0810d060a78b5136ec0836c8a4ef9aa11807f2a22f4e1", size = 2554835 },
+    { url = "https://files.pythonhosted.org/packages/b6/ab/1b426a3974cb0e7da5c29ccff4807871d48110933a57207b5a676cccc155/pillow-12.2.0-cp314-cp314t-macosx_10_15_x86_64.whl", hash = "sha256:57850958fe9c751670e49b2cecf6294acc99e562531f4bd317fa5ddee2068463", size = 5314225 },
+    { url = "https://files.pythonhosted.org/packages/19/1e/dce46f371be2438eecfee2a1960ee2a243bbe5e961890146d2dee1ff0f12/pillow-12.2.0-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:d5d38f1411c0ed9f97bcb49b7bd59b6b7c314e0e27420e34d99d844b9ce3b6f3", size = 4698541 },
+    { url = "https://files.pythonhosted.org/packages/55/c3/7fbecf70adb3a0c33b77a300dc52e424dc22ad8cdc06557a2e49523b703d/pillow-12.2.0-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:5c0a9f29ca8e79f09de89293f82fc9b0270bb4af1d58bc98f540cc4aedf03166", size = 6322251 },
+    { url = "https://files.pythonhosted.org/packages/1c/3c/7fbc17cfb7e4fe0ef1642e0abc17fc6c94c9f7a16be41498e12e2ba60408/pillow-12.2.0-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:1610dd6c61621ae1cf811bef44d77e149ce3f7b95afe66a4512f8c59f25d9ebe", size = 8127807 },
+    { url = "https://files.pythonhosted.org/packages/ff/c3/a8ae14d6defd2e448493ff512fae903b1e9bd40b72efb6ec55ce0048c8ce/pillow-12.2.0-cp314-cp314t-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:0a34329707af4f73cf1782a36cd2289c0368880654a2c11f027bcee9052d35dd", size = 6433935 },
+    { url = "https://files.pythonhosted.org/packages/6e/32/2880fb3a074847ac159d8f902cb43278a61e85f681661e7419e6596803ed/pillow-12.2.0-cp314-cp314t-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:8e9c4f5b3c546fa3458a29ab22646c1c6c787ea8f5ef51300e5a60300736905e", size = 7116720 },
+    { url = "https://files.pythonhosted.org/packages/46/87/495cc9c30e0129501643f24d320076f4cc54f718341df18cc70ec94c44e1/pillow-12.2.0-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:fb043ee2f06b41473269765c2feae53fc2e2fbf96e5e22ca94fb5ad677856f06", size = 6540498 },
+    { url = "https://files.pythonhosted.org/packages/18/53/773f5edca692009d883a72211b60fdaf8871cbef075eaa9d577f0a2f989e/pillow-12.2.0-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:f278f034eb75b4e8a13a54a876cc4a5ab39173d2cdd93a638e1b467fc545ac43", size = 7239413 },
+    { url = "https://files.pythonhosted.org/packages/c9/e4/4b64a97d71b2a83158134abbb2f5bd3f8a2ea691361282f010998f339ec7/pillow-12.2.0-cp314-cp314t-win32.whl", hash = "sha256:6bb77b2dcb06b20f9f4b4a8454caa581cd4dd0643a08bacf821216a16d9c8354", size = 6482084 },
+    { url = "https://files.pythonhosted.org/packages/ba/13/306d275efd3a3453f72114b7431c877d10b1154014c1ebbedd067770d629/pillow-12.2.0-cp314-cp314t-win_amd64.whl", hash = "sha256:6562ace0d3fb5f20ed7290f1f929cae41b25ae29528f2af1722966a0a02e2aa1", size = 7225152 },
+    { url = "https://files.pythonhosted.org/packages/ff/6e/cf826fae916b8658848d7b9f38d88da6396895c676e8086fc0988073aaf8/pillow-12.2.0-cp314-cp314t-win_arm64.whl", hash = "sha256:aa88ccfe4e32d362816319ed727a004423aab09c5cea43c01a4b435643fa34eb", size = 2556579 },
+]
+
+[[package]]
+name = "pluggy"
+version = "1.6.0"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/f9/e2/3e91f31a7d2b083fe6ef3fa267035b518369d9511ffab804f839851d2779/pluggy-1.6.0.tar.gz", hash = "sha256:7dcc130b76258d33b90f61b658791dede3486c3e6bfb003ee5c9bfb396dd22f3", size = 69412 }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/54/20/4d324d65cc6d9205fabedc306948156824eb9f0ee1633355a8f7ec5c66bf/pluggy-1.6.0-py3-none-any.whl", hash = "sha256:e920276dd6813095e9377c0bc5566d94c932c33b27a3e3945d8389c374dd4746", size = 20538 },
+]
+
+[[package]]
+name = "propcache"
+version = "0.5.2"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/ec/44/c87281c333769159c50594f22610f77398a47ccbfbbf23074e744e86f87c/propcache-0.5.2.tar.gz", hash = "sha256:01c4fc7480cd0598bb4b57022df55b9ca296da7fc5a8760bd8451a7e63a7d427", size = 50208 }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/4a/cb/e27bc2b2737a0bb49962b275efa051e8f1c35a936df7d5139b6b658b7dc9/propcache-0.5.2-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:806719138ecd720339a12410fb9614ac9b2b2d3a5fdf8235d56981c36f4039ba", size = 95887 },
+    { url = "https://files.pythonhosted.org/packages/e6/13/b8ae04c59392f8d11c6cd9fb4011d1dc7c86b81225c770280300e259ffe1/propcache-0.5.2-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:db2b80ea58eab4f86b2beec3cc8b39e8ff9276ac20e96b7cce43c8ae84cd6b5a", size = 54654 },
+    { url = "https://files.pythonhosted.org/packages/2c/7d/49777a3e20b55863d4794384a38acd460c04157b0a00f8602b0d508b8431/propcache-0.5.2-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:e5cbfac9f61484f7e9f3597775500cd3ebe8274e9b050c38f9525c77c97520bf", size = 55190 },
+    { url = "https://files.pythonhosted.org/packages/44/c7/085d0cd63062e84044e3f05797749c3f8e3938ff3aeb0eb2f69d43fafc91/propcache-0.5.2-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:5dbc581d2814337da56222fab8dc5f161cd798a434e49bac27930aaef798e144", size = 59995 },
+    { url = "https://files.pythonhosted.org/packages/9c/42/32cf8e3009e92b2645cf1e944f701e8ea4e924dffde1ee26db860bcbf7e4/propcache-0.5.2-cp312-cp312-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:857187f381f88c8e2fa2fe56ab94879d011b883d5a2ee5a1b60a8cd2a06846d9", size = 63422 },
+    { url = "https://files.pythonhosted.org/packages/9e/1b/f112433f99fc979431b87a39ef169e3f8df070d99a72792c56d6937ac48b/propcache-0.5.2-cp312-cp312-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:178b4a2cdaac1818e2bf1c5a99b94383fa73ea5382e032a48dec07dc5668dc42", size = 64342 },
+    { url = "https://files.pythonhosted.org/packages/14/15/5574111ae50dd6e879456888c0eadd4c5a869959775854e18e18a6b345f3/propcache-0.5.2-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:6f328175a2cde1f0ff2c4ed8ce968b9dcfb55f3a7153f39e2957ed994da13476", size = 61639 },
+    { url = "https://files.pythonhosted.org/packages/cc/da/4d775080b1490c0ae604acda868bd71aabe3a89ed16f2aa4339eb8a283e7/propcache-0.5.2-cp312-cp312-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:5671d09a36b06d0fd4a3da0fccbcae360e9b1570924171a15e9e0997f0249fba", size = 61588 },
+    { url = "https://files.pythonhosted.org/packages/04/ac/f076982cbe2195ee9cf32de5a1e46951d9fb399fc207f390562dd0fd8fb2/propcache-0.5.2-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:80168e2ebe4d3ec6599d10ad8f520304ae1cad9b6c5a95372aef1b66b7bfb53a", size = 60029 },
+    { url = "https://files.pythonhosted.org/packages/70/60/189be62e0dd898dce3b331e1b8c7a543cd3a405ac0c81fe8ee8a9d5d77e1/propcache-0.5.2-cp312-cp312-musllinux_1_2_armv7l.whl", hash = "sha256:45f11346f884bc47444f6e6647131055844134c3175b629f84952e2b5cd62b64", size = 56774 },
+    { url = "https://files.pythonhosted.org/packages/ea/9e/93377b9c7939c1ffae98f878dee955efadfd638078bc86dbc21f9d52f651/propcache-0.5.2-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:8e778ebd44ef4f66ed60a0416b06b489687db264a9c0b3620362f26489492913", size = 63532 },
+    { url = "https://files.pythonhosted.org/packages/14/f9/590ef6cfb9b8028d516d287812ece32bb0bc5f11fbb9c8bf6b2e6313fec8/propcache-0.5.2-cp312-cp312-musllinux_1_2_riscv64.whl", hash = "sha256:c0cb9ed24c8964e172768d455a38254c2dd8a552905729ce006cad3d3dda59b1", size = 61592 },
+    { url = "https://files.pythonhosted.org/packages/b4/5e/70958b3034c297a630bba2f17ca7abc2d5f39a803ad7e370ab79d1ecd022/propcache-0.5.2-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:1d1ad32d9d4355e2be65574fd0bfd3677e7066b009cd5b9b2dee8aa6a6393b33", size = 64788 },
+    { url = "https://files.pythonhosted.org/packages/12/fd/77fe5936d8c3086ca9048f7f415f122ed82e53884a9ec193646b42deef06/propcache-0.5.2-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:c80f4ba3e8f00189165999a742ee526ebeccedf6c3f7beb0c7df821e9772435a", size = 62514 },
+    { url = "https://files.pythonhosted.org/packages/cf/74/66bd798b5b3be70aa1b391f5cc9d6a0a5532d7fd3b19ec0b213e72e6ad9d/propcache-0.5.2-cp312-cp312-win32.whl", hash = "sha256:8c7972d8f193740d9175f0998ab38717e6cd322d5935c5b0fef8c0d323fd9031", size = 39018 },
+    { url = "https://files.pythonhosted.org/packages/61/7c/5c0d34aa3024694d6dcb9271cdbdd08c4e47c1c0ad95ec7e7bc74cdea145/propcache-0.5.2-cp312-cp312-win_amd64.whl", hash = "sha256:d9ee8826a7d47863a08ac44e1a5f611a462eefc3a194b492da242128bec75b42", size = 42322 },
+    { url = "https://files.pythonhosted.org/packages/4d/91/875812f1a3feb20ceba818ef39fbe4d92f1081e04ac815c822496d0d038b/propcache-0.5.2-cp312-cp312-win_arm64.whl", hash = "sha256:2800a4a8ead6b28cccd1ec54b59346f0def7922ee1c7598e8499c733cfbb7c84", size = 38172 },
+    { url = "https://files.pythonhosted.org/packages/c5/09/f049e45385503fe67db75a6b6186a7b9f0c3930366dc960522c312a825b1/propcache-0.5.2-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:099aaf4b4d1a02265b92a977edf00b5c4f63b3b17ac6de39b0d637c9cac0188a", size = 94457 },
+    { url = "https://files.pythonhosted.org/packages/6b/65/83d1d05655baf63113731bd5a1008435e14f8d1e5a06cbe4ec5b23ad7a31/propcache-0.5.2-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:68ce1c44c7a813a7f71ea04315a8c7b330b63db99d059a797a4651bb6f69f117", size = 53835 },
+    { url = "https://files.pythonhosted.org/packages/a9/12/a6ba6482bb5ea3260c000c9b20881c95fa11c6b30173715668259f844ed7/propcache-0.5.2-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:fc299c129490f55f254cd90be0deca4764e36e9a7c08b4aa588479a3bbed3098", size = 54545 },
+    { url = "https://files.pythonhosted.org/packages/a9/19/7fa086f5764c59ec8a8e157cd93aa8497acc00aba9dcdec56bfffb32602d/propcache-0.5.2-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:a6ae2198be502c10f09b2516e7b5d019816924bc3183a43ce792a7bd6625e6f4", size = 59886 },
+    { url = "https://files.pythonhosted.org/packages/a1/e4/5d7663dc8235956c8f5281698a3af1d351d8820341ddd890f59d9a9127f2/propcache-0.5.2-cp313-cp313-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:6041d31504dc1779d700e1edcfb08eea334b357620b06681a4eabb57a74e574e", size = 63261 },
+    { url = "https://files.pythonhosted.org/packages/4a/4a/15a03adee24d6350da4292caeac44c34c033d2afe5e87eb370f38854560f/propcache-0.5.2-cp313-cp313-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:f7eabc04151c78a9f4d5bbb5f1faf571e4defeb4b585e0fe95b60ff2dbe4d3d7", size = 64184 },
+    { url = "https://files.pythonhosted.org/packages/8b/c6/979176efdaa3d239e36d503d5af63a0a773b36662ed8f52e5b6a6d9fd40e/propcache-0.5.2-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:4db0ba63d693afd40d249bd93f842b5f144f8fcbb83de05660373bcf30517b1d", size = 61534 },
+    { url = "https://files.pythonhosted.org/packages/c8/22/63e8cd1bae4c2d2be6493b6b7d10566ddafad88137cfbc99964a1119853c/propcache-0.5.2-cp313-cp313-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:1dbcf7675229b35d31abb6547d8ebc8c27a830ac3f9a794edff6254873ec7c0a", size = 61500 },
+    { url = "https://files.pythonhosted.org/packages/60/5a/28e5d9acbac1cc9ccb67045e8c1b943aa8d79fdf39c93bd73cacd68008ea/propcache-0.5.2-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:d310c013aad2c72f1c3f2f8dd3279d460a858c551f97aeb8c63e4693cca7b4d2", size = 59994 },
+    { url = "https://files.pythonhosted.org/packages/f3/40/db650677f554a95b9c01a7c9d93d629e93a15562f5deb4573c9ee136fed2/propcache-0.5.2-cp313-cp313-musllinux_1_2_armv7l.whl", hash = "sha256:06187263ddad280d05b4d8a8b3bb7d164cbebd469236544a42e6d9b28ac6a4fa", size = 56884 },
+    { url = "https://files.pythonhosted.org/packages/80/45/70b39b89516ff8b96bf732fa6fded8cef20f293cb1508690101c3c07ec51/propcache-0.5.2-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:3115559b8effafd63b142ea5ed53d63a16ea6469cbc63dce4ee194b42db5d853", size = 63464 },
+    { url = "https://files.pythonhosted.org/packages/f9/e2/fa59d3a89eac5534293124af4f1d0d0ada091ce4a0ab4610ce03fd2bdd8d/propcache-0.5.2-cp313-cp313-musllinux_1_2_riscv64.whl", hash = "sha256:c60462af8e6dc30c35407c7237ea908d777b22862bbee27bc4699c0d8bcdc45a", size = 61588 },
+    { url = "https://files.pythonhosted.org/packages/0b/97/efb547a55c4bc7381cfb202d6a2239ac621045277bc1ea5dfd3a7f0516c0/propcache-0.5.2-cp313-cp313-musllinux_1_2_s390x.whl", hash = "sha256:40314bca9ac559716fe374094fc81c11dcc34b64fd6c585360f5775690505704", size = 64667 },
+    { url = "https://files.pythonhosted.org/packages/92/56/f5c7d9b4b7595d5127da38974d791b2153f3d1eae6c674af3583ace92ad3/propcache-0.5.2-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:cfa21e036ce1e1db2be04ba3b85d2df1bb1702fa01932d984c5464c665228ff4", size = 62463 },
+    { url = "https://files.pythonhosted.org/packages/bd/3b/484a3a65fc9f9f60c41dcd17b428bace5389544e2c680994534a20755066/propcache-0.5.2-cp313-cp313-win32.whl", hash = "sha256:f156a3529f38063b6dbaf356e15602a7f95f8055b1295a438433a6386f10463d", size = 38621 },
+    { url = "https://files.pythonhosted.org/packages/1c/fd/3f0f10dba4dabad3bf53102be007abf55481067952bde0fdddff439e7c61/propcache-0.5.2-cp313-cp313-win_amd64.whl", hash = "sha256:dfed59d0a5aeb01e242e66ff0300bc4a265a7c05f612d30016f0b60b1017d757", size = 41649 },
+    { url = "https://files.pythonhosted.org/packages/90/ec/6ce619cc32bb500a482f811f9cd509368b4e58e638d13f2c68f370d6b475/propcache-0.5.2-cp313-cp313-win_arm64.whl", hash = "sha256:ba338430e87ceb9c8f0cf754de38a9860560261e56c00376debd628698a7364f", size = 37636 },
+    { url = "https://files.pythonhosted.org/packages/1b/82/c1d268bbbf2ef981c5bf0fbbe746db617c66e3bcefe431a1aa8943fbe23a/propcache-0.5.2-cp313-cp313t-macosx_10_13_universal2.whl", hash = "sha256:a592f5f3da71c8691c788c13cb6734b6d17663d2e1cb8caddf0673d01ef8847d", size = 98872 },
+    { url = "https://files.pythonhosted.org/packages/f4/d4/52c871e73e864e6b34c0e2d58ac1ec5ccd149497ddc7ad2137ae98323a35/propcache-0.5.2-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:6a997d0489e9668a384fcfd5061b857aa5361de73191cac204d04b889cfbbafa", size = 56257 },
+    { url = "https://files.pythonhosted.org/packages/67/f0/9b90ca2a210b3d09bcfcd96ecd0f55545c091535abce2a45de2775cfd357/propcache-0.5.2-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:10734b5484ea113152ee25a91dccedf81631791805d2c9ccb054958e51842c94", size = 56696 },
+    { url = "https://files.pythonhosted.org/packages/9d/0e/6e9d4ba07c8e56e21ddec1e75f12148142b21ca83a51871babce095334f4/propcache-0.5.2-cp313-cp313t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:cafca7e56c12bb02ae16d283742bef25a61122e9dab2b5b3f2ccbe589ce32164", size = 62378 },
+    { url = "https://files.pythonhosted.org/packages/65/19/c10badaa463dde8a27ce884f8ee2ec37e6035b7c9f5ff0c8f74f06f08dac/propcache-0.5.2-cp313-cp313t-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:f064f8d2b59177878b7615df1735cd8fe3462ed6be8c7b217d17a276489c2b7f", size = 65283 },
+    { url = "https://files.pythonhosted.org/packages/b0/b6/93bea99ca80e19cef6512a8580e5b7857bbe09422d9daa7fd4ef5723306c/propcache-0.5.2-cp313-cp313t-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:f78abfa8dfc32376fd1aacf597b2f2fbbe0ea751419aee718af5d4f82537ef8c", size = 66616 },
+    { url = "https://files.pythonhosted.org/packages/83/e4/5c7462e50625f051f37fb38b8224f7639f667184bbd34424ec83819bb1b7/propcache-0.5.2-cp313-cp313t-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:f7467da8a9822bf1a55336f877340c5bcbd3c482afc43a99771169f74a26dedc", size = 63773 },
+    { url = "https://files.pythonhosted.org/packages/ca/b6/99238894047b13c823be25027e736626cd414a52a5e30d2c3347c2733529/propcache-0.5.2-cp313-cp313t-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:a6ddc6ac9e25de626c1f129c1b467d7ecd33ce2237d3fd0c4e429feef0a7ee1f", size = 63664 },
+    { url = "https://files.pythonhosted.org/packages/85/1e/a3a1a63116a2b8edb415a8bb9a6f0c34bd03830b1e18e8ce2904e1dc1cf4/propcache-0.5.2-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:2f22cbbac9e26a8e864c0985ff1268d5d939d53d9d9411a9824279097e03a2cb", size = 62643 },
+    { url = "https://files.pythonhosted.org/packages/e4/03/893cf147de2fc6543c5eaa07ad833170e7e2a2385725bbebe8c0503723bb/propcache-0.5.2-cp313-cp313t-musllinux_1_2_armv7l.whl", hash = "sha256:fc76378c62a0f04d0cd82fbb1a2cd2d7e28fcb40d5873f28a6c44e388aaa2751", size = 59595 },
+    { url = "https://files.pythonhosted.org/packages/86/3b/04c1a2e12c57766568ba75ba72b3bf2042818d4c1425fab6fc07155c7cff/propcache-0.5.2-cp313-cp313t-musllinux_1_2_ppc64le.whl", hash = "sha256:acd2c8edba48e31e58a363b8cf4e5c7db3b04b3f9e371f601df30d9b0d244836", size = 65711 },
+    { url = "https://files.pythonhosted.org/packages/1c/34/80f8d0099f8d6bacc4de1624c85672681c8cd1149ca2da0e38fd120b817f/propcache-0.5.2-cp313-cp313t-musllinux_1_2_riscv64.whl", hash = "sha256:452b5065457eb9991ec5eb38ff41d6cd4c991c9ac7c531c4d5849ae473a9a13f", size = 64247 },
+    { url = "https://files.pythonhosted.org/packages/f3/1a/8b08f3a5f1037e9e370c55883ceeeee0f6dd0416fb2d2d67b8bfc91f2a79/propcache-0.5.2-cp313-cp313t-musllinux_1_2_s390x.whl", hash = "sha256:3430bb2bfe1331885c427745a751e774ee679fd4344f80b97bf879815fe8fa55", size = 67102 },
+    { url = "https://files.pythonhosted.org/packages/34/68/8bdb7bb7756d76e005490649d10e4a8369e610c74d619f71e1aedf889e9c/propcache-0.5.2-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:cef6cea3922890dd6c9654971001fa797b526c16ab5e1e46c05fd6f877be7568", size = 64964 },
+    { url = "https://files.pythonhosted.org/packages/0a/aa/50fb0b5d3968b61a510926ff8b8465f1d6e976b3ab74496d7a4b9fc42515/propcache-0.5.2-cp313-cp313t-win32.whl", hash = "sha256:72d61e16dd78228b58c5d47be830ff3da7e5f139abdf0aef9d86cde1c5cf2191", size = 42546 },
+    { url = "https://files.pythonhosted.org/packages/ae/4c/0ddbae64321bd4a95bcbfc19307238016b5b1fee645c84626c8d539e5b74/propcache-0.5.2-cp313-cp313t-win_amd64.whl", hash = "sha256:0958834041a0166d343b8d2cedcd8bcbaeb4fdbe0cf08320c5379f143c3be6e7", size = 46330 },
+    { url = "https://files.pythonhosted.org/packages/00/d9/9cddc8efb78d8af264c5ec9f6d10b62f57c515feda8d321595f56010fb23/propcache-0.5.2-cp313-cp313t-win_arm64.whl", hash = "sha256:6de8bd93ddde9b992cf2b2e0d796d501a19026b5b9fd87356d7d0779531a8d96", size = 40521 },
+    { url = "https://files.pythonhosted.org/packages/e2/ea/23ee535d90ce8bcc465a3028eb3cc0ce3bd1005f4bb27710b30587de798d/propcache-0.5.2-cp314-cp314-macosx_10_15_universal2.whl", hash = "sha256:46088abff4cba581dea21ae0467a480526cb25aa5f3c269e909f800328bc3999", size = 94662 },
+    { url = "https://files.pythonhosted.org/packages/b5/06/c5a52f419b5d8972f8d46a7577476090d8e3263ff589ce40b5ca4968d5be/propcache-0.5.2-cp314-cp314-macosx_10_15_x86_64.whl", hash = "sha256:fc88b26f08d634f7bc819a7852e5214f5802641ab8d9fd5326892292eee1993e", size = 53928 },
+    { url = "https://files.pythonhosted.org/packages/63/b1/4260d67d6bd85e58a66b72d54ce15d5de789b6f3870cc6bedf8ff9667401/propcache-0.5.2-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:97797ebb098e670a2f92dd66f32897e30d7615b14e7f59711de23e30a9072539", size = 54650 },
+    { url = "https://files.pythonhosted.org/packages/70/06/2f46c318e3307cd7a6a7481def374ce838c0fe20084b39dd54b0879d0e99/propcache-0.5.2-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:ba57fffe4ac99c5d30076161b5866336d97600769bad35cc68f7774b15298a4e", size = 59912 },
+    { url = "https://files.pythonhosted.org/packages/4c/29/fe1aebec2ce57ab985a9c382bded1124431f85078113aa222c5d278430d4/propcache-0.5.2-cp314-cp314-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:583c19759d9eec1e5b69e2fbef36a7d9c326041be9746cb822d335c8cedc2979", size = 63300 },
+    { url = "https://files.pythonhosted.org/packages/b4/18/2334b26768b6c82be8c69e83671b767d5ef426aa09b0cba6c2ea47816774/propcache-0.5.2-cp314-cp314-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:d0326e2e5e1f3163fa306c834e48e8d490e5fae607a097a40c0648109b47ba80", size = 64208 },
+    { url = "https://files.pythonhosted.org/packages/2b/76/7f1bfd6afff4c5e38e36a3c6d68eb5f4b7311ea80baf693db78d95b603c4/propcache-0.5.2-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:e00820e192c8dbebcafb383ebbf99030895f09905e7a0eb2e0340a0bcc2bc825", size = 61633 },
+    { url = "https://files.pythonhosted.org/packages/c4/46/b3ff8aba2b4953a3e50de2cf72f1b5748b8eca93b15f3dc2c84339084c09/propcache-0.5.2-cp314-cp314-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:c66afea89b1e43725731d2004732a046fe6fe955d51f952c3e95a7314a284a39", size = 61724 },
+    { url = "https://files.pythonhosted.org/packages/c5/01/814cfcafbcff954f94c01cf30e097ddc88a076b5440fbcf4570753437d40/propcache-0.5.2-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:d4dc37dec6c6cdad0b57881a5658fd14fbf53e333b1a86cf86559f190e1d9ec4", size = 60069 },
+    { url = "https://files.pythonhosted.org/packages/da/68/5c6f7622d510cc666a300687e06fd060c1a43361c0c9b20d284f06d8096a/propcache-0.5.2-cp314-cp314-musllinux_1_2_armv7l.whl", hash = "sha256:5570dbcc97571c15f68068e529c92715a12f8d54030e272d264b377e22bd17a5", size = 57099 },
+    { url = "https://files.pythonhosted.org/packages/55/27/9cb0b4c679124085327957d42521c99dba04c88c90c3e55a6f0b633ebccc/propcache-0.5.2-cp314-cp314-musllinux_1_2_ppc64le.whl", hash = "sha256:f814362777a9f841adddb200ecdf8f5cb1e5a3c4b7a86378edbd6ccb26edd702", size = 63391 },
+    { url = "https://files.pythonhosted.org/packages/f0/9d/7258aaa5bdf60fc6f27591eef6fe52768cb0beda7140be477c8b12c9794a/propcache-0.5.2-cp314-cp314-musllinux_1_2_riscv64.whl", hash = "sha256:196913dea116aeb5a2ba95af4ddcb7ea85559ae07d8eee8751688310d09168c3", size = 61626 },
+    { url = "https://files.pythonhosted.org/packages/8e/0d/41c602003e8a9b16fe1e7eadf62c7bfba9d5474370b24200bf48b315f45f/propcache-0.5.2-cp314-cp314-musllinux_1_2_s390x.whl", hash = "sha256:6e7b8719005dd1175be4ab1cd25e9b98659a5e0347331506ec6760d2773a7fb5", size = 64781 },
+    { url = "https://files.pythonhosted.org/packages/8b/f3/38e66b1856e9bd079deea015bc4a55f7767c0e4db2f7dcf69e7e680ba4ce/propcache-0.5.2-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:51f96d685ab16e88cab128cd37a52c5da540809c8b879fa047731bfcb4ad35a4", size = 62570 },
+    { url = "https://files.pythonhosted.org/packages/95/ca/bbfe9b910ce57dde8bb4876b4520fc02a4e89497c10de26be936758a3aaa/propcache-0.5.2-cp314-cp314-win32.whl", hash = "sha256:cc6fc3cc62e8501d3ed62894425040d2728ecddb1ed072737a5c70bd537aa9f0", size = 39436 },
+    { url = "https://files.pythonhosted.org/packages/61/d2/45c9defbaa1ea297035d9d4cce9e8f80daafbf19319c6007f157c6256ea9/propcache-0.5.2-cp314-cp314-win_amd64.whl", hash = "sha256:81e3a30b0bb60caa22033dd0f8a3618d1d67356212514f62c57db75cb0ef410c", size = 42373 },
+    { url = "https://files.pythonhosted.org/packages/44/68/9ea5103f41d5217d7d6ec24db90018e23aebec070c3f9a6e54d12b841fd8/propcache-0.5.2-cp314-cp314-win_arm64.whl", hash = "sha256:0d2c9bf8528f135dbb805ce027567e09164f7efa51a2be07458a2c0420f292d0", size = 38554 },
+    { url = "https://files.pythonhosted.org/packages/8a/81/fadf555f42d3b762eea8a53950b0489fdc0aa9da5f8ed9e10ce0a4e01b48/propcache-0.5.2-cp314-cp314t-macosx_10_15_universal2.whl", hash = "sha256:4bc8ff1feffc6a61c7002ffe84634c41b822e104990ae009f44a0834430070bb", size = 99395 },
+    { url = "https://files.pythonhosted.org/packages/f5/c9/c61e134a686949cf7971af3a390148b1156f7be81c73bc0cd12c873e2d48/propcache-0.5.2-cp314-cp314t-macosx_10_15_x86_64.whl", hash = "sha256:79aa3ff0a9b566633b642fa9caf7e21ed1c13d6feca718187873f199e1514078", size = 56653 },
+    { url = "https://files.pythonhosted.org/packages/cb/73/daf935ea7048ddd7ec8eec5345b4a40b619d2d178b3c0a0900796bc3c794/propcache-0.5.2-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:1b31822f4474c4036bae62de9402710051d431a606d6a0f907fec79935a071aa", size = 56914 },
+    { url = "https://files.pythonhosted.org/packages/79/9f/aba959b435ea18617edd7cf0a7ad0b9c574b8fc7e3d2cd55fb59cb255d33/propcache-0.5.2-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:13fef48778b5a2a756523fdb781326b028ca75e32858b04f2cdd19f394564917", size = 62567 },
+    { url = "https://files.pythonhosted.org/packages/6c/a1/859942de9a791ff42f6141736f5b37749b8f53e65edfa49638c67dd67e6a/propcache-0.5.2-cp314-cp314t-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:8b73ab70f1a3351fbc71f663b3e645af6dd0329100c353081cf69c37433fc6fe", size = 65542 },
+    { url = "https://files.pythonhosted.org/packages/b5/61/315bc0fd6c0fc7f80a528b8afd209e5fc4a875ea79571b91b8f50f442907/propcache-0.5.2-cp314-cp314t-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:5538d2c13d93e4698af7e092b57bc7298fd35d1d58e656ae18f23ee0d0378e03", size = 66845 },
+    { url = "https://files.pythonhosted.org/packages/47/f7/9f8122e3132e8e354ac41975ef8f1099be7d5a16bc7ae562734e993665c0/propcache-0.5.2-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:cd645f03898405cabe694fb8bc35241e3a9c332ec85627584fe3de201452b335", size = 63985 },
+    { url = "https://files.pythonhosted.org/packages/c8/54/c317819ec157cbf6f35df9df9657a6f82daf34d5faf15948b2f639c2192e/propcache-0.5.2-cp314-cp314t-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:a473b3440261e0c60706e732b2ed2f517857344fc21bf48fdfe211e2d98eb285", size = 63999 },
+    { url = "https://files.pythonhosted.org/packages/5a/56/387e3f7dfce0a9233df41fb888aa1c30222cb4bbbf09537c02dd9bd85fe2/propcache-0.5.2-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:7afa37062e6650640e932e4cc9297d81f9f42d9944029cc386b8247dea4da837", size = 62779 },
+    { url = "https://files.pythonhosted.org/packages/a1/9c/596784cb5824ed61ee960d3f8655a3f0993e107c6e98ab6c818b7fb92ccb/propcache-0.5.2-cp314-cp314t-musllinux_1_2_armv7l.whl", hash = "sha256:8a90efd5777e996e42d568db9ac740b944d691e565cbfd31b2f7832f9184b2b8", size = 59796 },
+    { url = "https://files.pythonhosted.org/packages/c2/3d/1a6cfa1726a48542c1e8784a0761421476a5b68e09b7f36bf95eb954aaba/propcache-0.5.2-cp314-cp314t-musllinux_1_2_ppc64le.whl", hash = "sha256:f19bb891234d72535764d703bfed1153cc34f4214d5bd7150aee1eec9e8f4366", size = 66023 },
+    { url = "https://files.pythonhosted.org/packages/e4/0e/05fd6990369477076e4e280bcb970de760fddf0161a46e988bc95f7940ec/propcache-0.5.2-cp314-cp314t-musllinux_1_2_riscv64.whl", hash = "sha256:32775082acd2d807ee3db715c7770d38767b817870acfa08c29e057f3c4d5b56", size = 64448 },
+    { url = "https://files.pythonhosted.org/packages/cd/86/5f8da315a4309c62c10c0b2516b17492d5d3bbe1bb862b96604db67e2a37/propcache-0.5.2-cp314-cp314t-musllinux_1_2_s390x.whl", hash = "sha256:9282fb1a3bccd038da9f768b927b24a0c753e466c086b7c4f3c6982851eefb2d", size = 67329 },
+    { url = "https://files.pythonhosted.org/packages/da/d3/3368efe79ab21f0cdf86ef49895811c9cc933131d4cde1f28a624e22e712/propcache-0.5.2-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:cc49723e2f60d6b32a0f0b08a3fd6d13203c07f1cd9566cfce0f12a917c967a2", size = 65172 },
+    { url = "https://files.pythonhosted.org/packages/d5/07/127e8b0bacfb325396196f9d976a22453049b89b9b2b08477cc3145faa44/propcache-0.5.2-cp314-cp314t-win32.whl", hash = "sha256:2d7aa89ebca5acc98cba9d1472d976e394782f587bad6661003602a619fd1821", size = 43813 },
+    { url = "https://files.pythonhosted.org/packages/88/fb/46dad6c0ae49ed230ab1b16c890c2b6314e2403e6c412976f4a72d64a527/propcache-0.5.2-cp314-cp314t-win_amd64.whl", hash = "sha256:d447bb0b3054be5818458fbb171208b1d9ff11eba14e18ca18b90cbb45767370", size = 47764 },
+    { url = "https://files.pythonhosted.org/packages/e7/c4/a47d0a63aa309d10d59ede6e9d4cff03a344a79d1f0f4cd0cd74997b53e0/propcache-0.5.2-cp314-cp314t-win_arm64.whl", hash = "sha256:fe67a3d11cd9b4efabfa45c3d00ffba2b26811442a73a581a94b67c2b5faccf6", size = 41140 },
+    { url = "https://files.pythonhosted.org/packages/3a/ed/1cdcab6ba3d6ab7feca11fc14f0eeea80755bb53ef4e892079f31b10a25f/propcache-0.5.2-py3-none-any.whl", hash = "sha256:be1ddfcbb376e3de5d2e2db1d58d6d67463e6b4f9f040c000de8e300295465fe", size = 14036 },
+]
+
+[[package]]
+name = "pyarrow"
+version = "24.0.0"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/91/13/13e1069b351bdc3881266e11147ffccf687505dbb0ea74036237f5d454a5/pyarrow-24.0.0.tar.gz", hash = "sha256:85fe721a14dd823aca09127acbb06c3ca723efbd436c004f16bca601b04dcc83", size = 1180261 }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/b4/a9/9686d9f07837f91f775e8932659192e02c74f9d8920524b480b85212cc68/pyarrow-24.0.0-cp312-cp312-macosx_12_0_arm64.whl", hash = "sha256:6233c9ed9ab9d1db47de57d9753256d9dcffbf42db341576099f0fd9f6bf4810", size = 34981559 },
+    { url = "https://files.pythonhosted.org/packages/80/b6/0ddf0e9b6ead3474ab087ae598c76b031fc45532bf6a63f3a553440fb258/pyarrow-24.0.0-cp312-cp312-macosx_12_0_x86_64.whl", hash = "sha256:f7616236ec1bc2b15bfdec22a71ab38851c86f8f05ff64f379e1278cf20c634a", size = 36663654 },
+    { url = "https://files.pythonhosted.org/packages/7c/3b/926382efe8ce27ba729071d3566ade6dfb86bdf112f366000196b2f5780a/pyarrow-24.0.0-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:1617043b99bd33e5318ae18eb2919af09c71322ef1ca46566cdafc6e6712fb66", size = 45679394 },
+    { url = "https://files.pythonhosted.org/packages/b3/7a/829f7d9dfd37c207206081d6dad474d81dde29952401f07f2ba507814818/pyarrow-24.0.0-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:6165461f55ef6314f026de6638d661188e3455d3ec49834556a0ebbdbace18bb", size = 48863122 },
+    { url = "https://files.pythonhosted.org/packages/5f/e8/f88ce625fe8babaae64e8db2d417c7653adb3019b08aae85c5ed787dc816/pyarrow-24.0.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:3b13dedfe76a0ad2d1d859b0811b53827a4e9d93a0bcb05cf59333ab4980cc7e", size = 49376032 },
+    { url = "https://files.pythonhosted.org/packages/36/7a/82c363caa145fff88fb475da50d3bf52bb024f61917be5424c3392eaf878/pyarrow-24.0.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:25ea65d868eb04015cd18e6df2fbe98f07e5bda2abefabcb88fce39a947716f6", size = 51929490 },
+    { url = "https://files.pythonhosted.org/packages/66/1c/e3e72c8014ad2743ca64a701652c733cc5cbcee15c0463a32a8c55518d9e/pyarrow-24.0.0-cp312-cp312-win_amd64.whl", hash = "sha256:295f0a7f2e242dabd513737cf076007dc5b2d59237e3eca37b05c0c6446f3826", size = 27355660 },
+    { url = "https://files.pythonhosted.org/packages/6f/d3/a1abf004482026ddc17f4503db227787fa3cfe41ec5091ff20e4fea55e57/pyarrow-24.0.0-cp313-cp313-macosx_12_0_arm64.whl", hash = "sha256:02b001b3ed4723caa44f6cd1af2d5c86aa2cf9971dacc2ffa55b21237713dfba", size = 34976759 },
+    { url = "https://files.pythonhosted.org/packages/4f/4a/34f0a36d28a2dd32225301b79daad44e243dc1a2bb77d43b60749be255c4/pyarrow-24.0.0-cp313-cp313-macosx_12_0_x86_64.whl", hash = "sha256:04920d6a71aabd08a0417709efce97d45ea8e6fb733d9ca9ecffb13c67839f68", size = 36658471 },
+    { url = "https://files.pythonhosted.org/packages/1f/78/543b94712ae8bb1a6023bcc1acf1a740fbff8286747c289cd9468fced2a5/pyarrow-24.0.0-cp313-cp313-manylinux_2_28_aarch64.whl", hash = "sha256:a964266397740257f16f7bb2e4f08a0c81454004beab8ff59dd531b73610e9f2", size = 45675981 },
+    { url = "https://files.pythonhosted.org/packages/84/9f/8fb7c222b100d314137fa40ec050de56cd8c6d957d1cfff685ce72f15b17/pyarrow-24.0.0-cp313-cp313-manylinux_2_28_x86_64.whl", hash = "sha256:6f066b179d68c413374294bc1735f68475457c933258df594443bb9d88ddc2a0", size = 48859172 },
+    { url = "https://files.pythonhosted.org/packages/a7/d3/1ea72538e6c8b3b475ed78d1049a2c518e655761ea50fe1171fc855fcab7/pyarrow-24.0.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:1183baeb14c5f587b1ec52831e665718ce632caab84b7cd6b85fd44f96114495", size = 49385733 },
+    { url = "https://files.pythonhosted.org/packages/c3/be/c3d8b06a1ba35f2260f8e1f771abbee7d5e345c0937aab90675706b1690a/pyarrow-24.0.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:806f24b4085453c197a5078218d1ee08783ebbba271badd153d1ae22a3ee804f", size = 51934335 },
+    { url = "https://files.pythonhosted.org/packages/9c/62/89e07a1e7329d2cde3e3c6994ba0839a24977a2beda8be6005ea3d860b99/pyarrow-24.0.0-cp313-cp313-win_amd64.whl", hash = "sha256:e4505fc6583f7b05ab854934896bcac8253b04ac1171a77dfb73efef92076d91", size = 27271748 },
+    { url = "https://files.pythonhosted.org/packages/17/1a/cff3a59f80b5b1658549d46611b67163f65e0664431c076ad728bf9d5af4/pyarrow-24.0.0-cp313-cp313t-macosx_12_0_arm64.whl", hash = "sha256:1a4e45017efbf115032e4475ee876d525e0e36c742214fbe405332480ecd6275", size = 35238554 },
+    { url = "https://files.pythonhosted.org/packages/a8/99/cce0f42a327bfef2c420fb6078a3eb834826e5d6697bf3009fe11d2ad051/pyarrow-24.0.0-cp313-cp313t-macosx_12_0_x86_64.whl", hash = "sha256:7986f1fa71cee060ad00758bcc79d3a93bab8559bf978fab9e53472a2e25a17b", size = 36782301 },
+    { url = "https://files.pythonhosted.org/packages/2a/66/8e560d5ff6793ca29aca213c53eec0dd482dd46cb93b2819e5aab52e4252/pyarrow-24.0.0-cp313-cp313t-manylinux_2_28_aarch64.whl", hash = "sha256:d3e0b61e8efb24ed38898e5cdc5fffa9124be480008d401a1f8071500494ae42", size = 45721929 },
+    { url = "https://files.pythonhosted.org/packages/27/0c/a26e25505d030716e078d9f16eb74973cbf0b33b672884e9f9da1c83b871/pyarrow-24.0.0-cp313-cp313t-manylinux_2_28_x86_64.whl", hash = "sha256:55a3bc1e3df3b5567b7d27ef551b2283f0c68a5e86f1cd56abc569da4f31335b", size = 48825365 },
+    { url = "https://files.pythonhosted.org/packages/5f/eb/771f9ecb0c65e73fe9dccdd1717901b9594f08c4515d000c7c62df573811/pyarrow-24.0.0-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:641f795b361874ac9da5294f8f443dfdbee355cf2bd9e3b8d97aaac2306b9b37", size = 49451819 },
+    { url = "https://files.pythonhosted.org/packages/48/da/61ae89a88732f5a785646f3ec6125dbb640fa98a540eb2b9889caa561403/pyarrow-24.0.0-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:8adc8e6ce5fccf5dc707046ae4914fd537def529709cc0d285d37a7f9cd442ca", size = 51909252 },
+    { url = "https://files.pythonhosted.org/packages/cb/1a/8dd5cafab7b66573fa91c03d06d213356ad4edd71813aa75e08ce2b3a844/pyarrow-24.0.0-cp313-cp313t-win_amd64.whl", hash = "sha256:9b18371ad2f44044b81a8d23bc2d8a9b6a6226dca775e8e16cfee640473d6c5d", size = 27388127 },
+    { url = "https://files.pythonhosted.org/packages/ad/80/d022a34ff05d2cbedd8ccf841fc1f532ecfa9eb5ed1711b56d0e0ea71fc9/pyarrow-24.0.0-cp314-cp314-macosx_12_0_arm64.whl", hash = "sha256:1cc9057f0319e26333b357e17f3c2c022f1a83739b48a88b25bfd5fa2dc18838", size = 35007997 },
+    { url = "https://files.pythonhosted.org/packages/1a/ff/f01485fda6f4e5d441afb8dd5e7681e4db18826c1e271852f5d3957d6a80/pyarrow-24.0.0-cp314-cp314-macosx_12_0_x86_64.whl", hash = "sha256:e6f1278ee4785b6db21229374a1c9e54ec7c549de5d1efc9630b6207de7e170b", size = 36678720 },
+    { url = "https://files.pythonhosted.org/packages/9e/c2/2d2d5fea814237923f71b36495211f20b43a1576f9a4d6da7e751a64ec6f/pyarrow-24.0.0-cp314-cp314-manylinux_2_28_aarch64.whl", hash = "sha256:adbbedc55506cbdabb830890444fb856bfb0060c46c6f8026c6c2f2cf86ae795", size = 45741852 },
+    { url = "https://files.pythonhosted.org/packages/8e/3a/28ba9c1c1ebdbb5f1b94dfebb46f207e52e6a554b7fe4132540fde29a3a0/pyarrow-24.0.0-cp314-cp314-manylinux_2_28_x86_64.whl", hash = "sha256:ae8a1145af31d903fa9bb166824d7abe9b4681a000b0159c9fb99c11bc11ad26", size = 48889852 },
+    { url = "https://files.pythonhosted.org/packages/df/51/4a389acfd31dca009f8fb82d7f510bb4130f2b3a8e18cf00194d0687d8ac/pyarrow-24.0.0-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:d7027eba1df3b2069e2e8d80f644fa0918b68c46432af3d088ddd390d063ecde", size = 49445207 },
+    { url = "https://files.pythonhosted.org/packages/19/4b/0bab2b23d2ae901b1b9a03c0efd4b2d070256f8ce3fc43f6e58c167b2081/pyarrow-24.0.0-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:e56a1ffe9bf7b727432b89104cc0849c21582949dd7bdcb34f17b2001a351a76", size = 51954117 },
+    { url = "https://files.pythonhosted.org/packages/29/88/f4e9145da0417b3d2c12035a8492b35ff4a3dbc653e614fcfb51d9dedb38/pyarrow-24.0.0-cp314-cp314-win_amd64.whl", hash = "sha256:38be1808cdd068605b787e6ca9119b27eb275a0234e50212c3492331680c3b1e", size = 28001155 },
+    { url = "https://files.pythonhosted.org/packages/79/4f/46a49a63f43526da895b1a45bbb51d5baf8e4d77159f8528fc3e5490007f/pyarrow-24.0.0-cp314-cp314t-macosx_12_0_arm64.whl", hash = "sha256:418e48ce50a45a6a6c73c454677203a9c75c966cb1e92ca3370959185f197a05", size = 35250387 },
+    { url = "https://files.pythonhosted.org/packages/a0/da/d5e0cd5ef00796922404806d5f00325cdadc3441ce2c13fe7115f2df9a64/pyarrow-24.0.0-cp314-cp314t-macosx_12_0_x86_64.whl", hash = "sha256:2f16197705a230a78270cdd4ea8a1d57e86b2fdcbc34a1f6aebc72e65c986f9a", size = 36797102 },
+    { url = "https://files.pythonhosted.org/packages/34/c7/5904145b0a593a05236c882933d439b5720f0a145381179063722fbfc123/pyarrow-24.0.0-cp314-cp314t-manylinux_2_28_aarch64.whl", hash = "sha256:fb24ac194bfc5e86839d7dcd52092ee31e5fe6733fe11f5e3b06ef0812b20072", size = 45745118 },
+    { url = "https://files.pythonhosted.org/packages/13/d3/cca42fe166d1c6e4d5b80e530b7949104d10e17508a90ae202dac205ce2a/pyarrow-24.0.0-cp314-cp314t-manylinux_2_28_x86_64.whl", hash = "sha256:9700ebd9a51f5895ce75ff4ac4b3c47a7d4b42bc618be8e713e5d56bacf5f931", size = 48844765 },
+    { url = "https://files.pythonhosted.org/packages/b0/49/942c3b79878ba928324d1e17c274ed84581db8c0a749b24bcf4cbdf15bd3/pyarrow-24.0.0-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:d8ddd2768da81d3ee08cfea9b597f4abb4e8e1dc8ae7e204b608d23a0d3ab699", size = 49471890 },
+    { url = "https://files.pythonhosted.org/packages/76/97/ff71431000a75d84135a1ace5ca4ba11726a231a8007bbb320a4c54075d5/pyarrow-24.0.0-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:61a3d7eaa97a14768b542f3d284dc6400dd2470d9f080708b13cd46b6ae18136", size = 51932250 },
+    { url = "https://files.pythonhosted.org/packages/51/be/6f79d55816d5c22557cf27533543d5d70dfe692adfbee4b99f2760674f38/pyarrow-24.0.0-cp314-cp314t-win_amd64.whl", hash = "sha256:c91d00057f23b8d353039520dc3a6c09d8608164c692e9f59a175a42b2ae0c19", size = 28131282 },
+]
+
+[[package]]
+name = "pydantic"
+version = "2.13.4"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "annotated-types" },
+    { name = "pydantic-core" },
+    { name = "typing-extensions" },
+    { name = "typing-inspection" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/18/a5/b60d21ac674192f8ab0ba4e9fd860690f9b4a6e51ca5df118733b487d8d6/pydantic-2.13.4.tar.gz", hash = "sha256:c40756b57adaa8b1efeeced5c196f3f3b7c435f90e84ea7f443901bec8099ef6", size = 844775 }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/fd/7b/122376b1fd3c62c1ed9dc80c931ace4844b3c55407b6fb2d199377c9736f/pydantic-2.13.4-py3-none-any.whl", hash = "sha256:45a282cde31d808236fd7ea9d919b128653c8b38b393d1c4ab335c62924d9aba", size = 472262 },
+]
+
+[[package]]
+name = "pydantic-core"
+version = "2.46.4"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "typing-extensions" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/9d/56/921726b776ace8d8f5db44c4ef961006580d91dc52b803c489fafd1aa249/pydantic_core-2.46.4.tar.gz", hash = "sha256:62f875393d7f270851f20523dd2e29f082bcc82292d66db2b64ea71f64b6e1c1", size = 471464 }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/ce/8c/af022f0af448d7747c5154288d46b5f2bc5f17366eaa0e23e9aa04d59f3b/pydantic_core-2.46.4-cp312-cp312-macosx_10_12_x86_64.whl", hash = "sha256:3245406455a5d98187ec35530fd772b1d799b26667980872c8d4614991e2c4a2", size = 2106158 },
+    { url = "https://files.pythonhosted.org/packages/19/95/6195171e385007300f0f5574592e467c568becce2d937a0b6804f218bc49/pydantic_core-2.46.4-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:962ccbab7b642487b1d8b7df90ef677e03134cf1fd8880bf698649b22a69371f", size = 1951724 },
+    { url = "https://files.pythonhosted.org/packages/8e/bc/f47d1ff9cbb1620e1b5b697eef06010035735f07820180e74178226b27b3/pydantic_core-2.46.4-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8233f2947cf85404441fd7e0085f53b10c93e0ee78611099b5c7237e36aacbf7", size = 1975742 },
+    { url = "https://files.pythonhosted.org/packages/5b/11/9b9a5b0306345664a2da6410877af6e8082481b5884b3ddd78d47c6013ce/pydantic_core-2.46.4-cp312-cp312-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:3a233125ac121aa3ffba9a2b59edfc4a985a76092dc8279586ab4b71390875e7", size = 2052418 },
+    { url = "https://files.pythonhosted.org/packages/f1/b7/a65fec226f5d78fc39f4a13c4cc0c768c22b113438f60c14adc9d2865038/pydantic_core-2.46.4-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:5b712b53160b79a5850310b912a5ef8e57e56947c8ad690c227f5c9d7e561712", size = 2232274 },
+    { url = "https://files.pythonhosted.org/packages/68/f0/92039db98b907ef49269a8271f67db9cb78ae2fc68062ef7e4e77adb5f61/pydantic_core-2.46.4-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:9401557acd873c3a7f3eb9383edef8ac4968f9510e340f4808d427e75667e7b4", size = 2309940 },
+    { url = "https://files.pythonhosted.org/packages/5f/97/2aab507d3d00ca626e8e57c1eac6a79e4e5fbcc63eb99733ff55d1717f65/pydantic_core-2.46.4-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:926c9541b14b12b1681dca8a0b75feb510b06c6341b70a8e500c2fdcff837cce", size = 2094516 },
+    { url = "https://files.pythonhosted.org/packages/22/37/a8aca44d40d737dde2bc05b3c6c07dff0de07ce6f82e9f3167aeaf4d5dea/pydantic_core-2.46.4-cp312-cp312-manylinux_2_31_riscv64.whl", hash = "sha256:56cb4851bcaf3d117eddcef4fe66afd750a50274b0da8e22be256d10e5611987", size = 2136854 },
+    { url = "https://files.pythonhosted.org/packages/24/99/fcef1b79238c06a8cbec70819ac722ba76e02bc8ada9b0fd66eba40da01b/pydantic_core-2.46.4-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:c68fcd102d71ea85c5b2dfac3f4f8476eff42a9e078fd5faefff6d145063536b", size = 2180306 },
+    { url = "https://files.pythonhosted.org/packages/ae/6c/fc44000918855b42779d007ae63b0532794739027b2f417321cddbc44f6a/pydantic_core-2.46.4-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:b2f69dec1725e79a012d920df1707de5caf7ed5e08f3be4435e25803efc47458", size = 2190044 },
+    { url = "https://files.pythonhosted.org/packages/6b/65/d9cadc9f1920d7a127ad2edba16c1db7916e59719285cd6c94600b0080ba/pydantic_core-2.46.4-cp312-cp312-musllinux_1_1_armv7l.whl", hash = "sha256:8d0820e8192167f80d88d64038e609c31452eeca865b4e1d9950a27a4609b00b", size = 2329133 },
+    { url = "https://files.pythonhosted.org/packages/d0/cf/c873d91679f3a30bcf5e7ac280ce5573483e72295307685120d0d5ad3416/pydantic_core-2.46.4-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:fbdb89b3e1c94a30cc5edfce477c6e6a5dc4d8f84665b455c27582f211a1c72c", size = 2374464 },
+    { url = "https://files.pythonhosted.org/packages/47/bd/6f2fc8188f31bf10590f1e98e7b306336161fac930a8c514cd7bd828c7dc/pydantic_core-2.46.4-cp312-cp312-win32.whl", hash = "sha256:9aa768456404a8bf48a4406685ac2bec8e72b62c69313734fa3b73cf33b3a894", size = 1974823 },
+    { url = "https://files.pythonhosted.org/packages/40/8c/985c1d41ea1107c2534abd9870e4ed5c8e7669b5c308297835c001e7a1c4/pydantic_core-2.46.4-cp312-cp312-win_amd64.whl", hash = "sha256:e9c26f834c65f5752f3f06cb08cb86a913ceb7274d0db6e267808a708b46bc89", size = 2072919 },
+    { url = "https://files.pythonhosted.org/packages/c4/ba/f463d006e0c47373ca7ec5e1a261c59dc01ef4d62b2657af925fb0deee3a/pydantic_core-2.46.4-cp312-cp312-win_arm64.whl", hash = "sha256:4fc73cb559bdb54b1134a706a2802a4cddd27a0633f5abb7e53056268751ac6a", size = 2027604 },
+    { url = "https://files.pythonhosted.org/packages/51/a2/5d30b469c5267a17b39dec53208222f76a8d351dfac4af661888c5aee77d/pydantic_core-2.46.4-cp313-cp313-macosx_10_12_x86_64.whl", hash = "sha256:5d5902252db0d3cedf8d4a1bc68f70eeb430f7e4c7104c8c476753519b423008", size = 2106306 },
+    { url = "https://files.pythonhosted.org/packages/c1/81/4fa520eaffa8bd7d1525e644cd6d39e7d60b1592bc5b516693c7340b50f1/pydantic_core-2.46.4-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:c94f0688e7b8d0a67abf40e57a7eaaecd17cc9586706a31b76c031f63df052b4", size = 1951906 },
+    { url = "https://files.pythonhosted.org/packages/03/d5/fd02da45b659668b05923b17ba3a0100a0a3d5541e3bd8fcc4ecb711309e/pydantic_core-2.46.4-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f027324c56cd5406ca49c124b0db10e56c69064fec039acc571c29020cc87c76", size = 1976802 },
+    { url = "https://files.pythonhosted.org/packages/21/f2/95727e1368be3d3ed485eaab7adbd7dda408f33f7a36e8b48e0144002b91/pydantic_core-2.46.4-cp313-cp313-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:e739fee756ba1010f8bcccb534252e85a35fe45ae92c295a06059ce58b74ccd3", size = 2052446 },
+    { url = "https://files.pythonhosted.org/packages/9c/86/5d99feea3f77c7234b8718075b23db11532773c1a0dbd9b9490215dc2eeb/pydantic_core-2.46.4-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:9d56801be94b86a9da183e5f3766e6310752b99ff647e38b09a9500d88e46e76", size = 2232757 },
+    { url = "https://files.pythonhosted.org/packages/d2/3a/508ac615935ef7588cf6d9e9b91309fdc2da751af865e02a9098de88258c/pydantic_core-2.46.4-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:2412e734dcb48da14d4e4006b82b46b74f2518b8a26ee7e58c6844a6cd6d03c4", size = 2309275 },
+    { url = "https://files.pythonhosted.org/packages/07/f8/41db9de19d7987d6b04715a02b3b40aea467000275d9d758ffaa31af7d50/pydantic_core-2.46.4-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9551187363ffc0de2a00b2e47c25aeaeb1020b69b668762966df15fc5659dd5a", size = 2094467 },
+    { url = "https://files.pythonhosted.org/packages/2c/e2/f35033184cb11d0052daf4416e8e10a502ea2ac006fc4f459aee872727d1/pydantic_core-2.46.4-cp313-cp313-manylinux_2_31_riscv64.whl", hash = "sha256:0186750b482eefa11d7f435892b09c5c606193ef3375bcf94aa00ae6bfb66262", size = 2134417 },
+    { url = "https://files.pythonhosted.org/packages/7e/7b/6ceeb1cc90e193862f444ebe373d8fdf613f0a82572dde03fb10734c6c71/pydantic_core-2.46.4-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:5855698a4856556d86e8e6cd8434bc3ac0314ee8e12089ae0e143f64c6256e4e", size = 2179782 },
+    { url = "https://files.pythonhosted.org/packages/5a/f2/c8d7773ede6af08036423a00ae0ceffce266c3c52a096c435d68c896083f/pydantic_core-2.46.4-cp313-cp313-musllinux_1_1_aarch64.whl", hash = "sha256:cbaf13819775b7f769bf4a1f066cb6df7a28d4480081a589828ef190226881cd", size = 2188782 },
+    { url = "https://files.pythonhosted.org/packages/59/31/0c864784e31f09f05cdd87606f08923b9c9e7f6e51dd27f20f62f975ce9f/pydantic_core-2.46.4-cp313-cp313-musllinux_1_1_armv7l.whl", hash = "sha256:633147d34cf4550417f12e2b1a0383973bdf5cdfde212cb09e9a581cf10820be", size = 2328334 },
+    { url = "https://files.pythonhosted.org/packages/c2/eb/4f6c8a41efa30baa755590f4141abf3a8c370fab610915733e74134a7270/pydantic_core-2.46.4-cp313-cp313-musllinux_1_1_x86_64.whl", hash = "sha256:82cf5301172168103724d49a1444d3378cb20cdee30b116a1bd6031236298a5d", size = 2372986 },
+    { url = "https://files.pythonhosted.org/packages/5b/24/b375a480d53113860c299764bfe9f349a3dc9108b3adc0d7f0d786492ebf/pydantic_core-2.46.4-cp313-cp313-win32.whl", hash = "sha256:9fa8ae11da9e2b3126c6426f147e0fba88d96d65921799bb30c6abd1cb2c97fb", size = 1973693 },
+    { url = "https://files.pythonhosted.org/packages/7e/e8/cff247591966f2d22ec8c003cd7587e27b7ba7b81ab2fb888e3ab75dc285/pydantic_core-2.46.4-cp313-cp313-win_amd64.whl", hash = "sha256:6b3ace8194b0e5204818c92802dcdca7fc6d88aabbb799d7c795540d9cd6d292", size = 2071819 },
+    { url = "https://files.pythonhosted.org/packages/c6/1a/f4aee670d5670e9e148e0c82c7db98d780be566c6e6a97ee8035528ca0b3/pydantic_core-2.46.4-cp313-cp313-win_arm64.whl", hash = "sha256:184c081504d17f1c1066e430e117142b2c77d9448a97f7b65c6ac9fd9aee238d", size = 2027411 },
+    { url = "https://files.pythonhosted.org/packages/8d/74/228a26ddad29c6672b805d9fd78e8d251cd04004fa7eed0e622096cd0250/pydantic_core-2.46.4-cp314-cp314-macosx_10_12_x86_64.whl", hash = "sha256:428e04521a40150c85216fc8b85e8d39fece235a9cf5e383761238c7fa9b96fb", size = 2102079 },
+    { url = "https://files.pythonhosted.org/packages/ad/1f/8970b150a4b4365623ae00fc88603491f763c627311ae8031e3111356d6e/pydantic_core-2.46.4-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:23ace664830ee0bfe014a0c7bc248b1f7f25ed7ad103852c317624a1083af462", size = 1952179 },
+    { url = "https://files.pythonhosted.org/packages/95/30/5211a831ae054928054b2f79731661087a2bc5c01e825c672b3a4a8f1b3e/pydantic_core-2.46.4-cp314-cp314-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ce5c1d2a8b27468f433ca974829c44060b8097eedc39933e3c206a90ee49c4a9", size = 1978926 },
+    { url = "https://files.pythonhosted.org/packages/57/e9/689668733b1eb67adeef047db3c2e8788fcf65a7fd9c9e2b46b7744fe245/pydantic_core-2.46.4-cp314-cp314-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:7283d57845ecf5a163403eb0702dfc220cc4fbdd18919cb5ccea4f95ee1cdab4", size = 2046785 },
+    { url = "https://files.pythonhosted.org/packages/60/d9/6715260422ff50a2109878fd24d948a6c3446bb2664f34ee78cd972b3acd/pydantic_core-2.46.4-cp314-cp314-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:8daafc69c93ee8a0204506a3b6b30f586ef54028f52aeeeb5c4cfc5184fd5914", size = 2228733 },
+    { url = "https://files.pythonhosted.org/packages/18/ae/fdb2f64316afca925640f8e70bb1a564b0ec2721c1389e25b8eb4bf9a299/pydantic_core-2.46.4-cp314-cp314-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:cd2213145bcc2ba85884d0ac63d222fece9209678f77b9b4d76f054c561adb28", size = 2307534 },
+    { url = "https://files.pythonhosted.org/packages/89/1d/8eff589b45bb8190a9d12c49cfad0f176a5cbd1534908a6b5125e2886239/pydantic_core-2.46.4-cp314-cp314-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7a5f930472650a82629163023e630d160863fce524c616f4e5186e5de9d9a49b", size = 2099732 },
+    { url = "https://files.pythonhosted.org/packages/06/d5/ee5a3366637fee41dee51a1fc91562dcf12ddbc68fda34e6b253da2324bb/pydantic_core-2.46.4-cp314-cp314-manylinux_2_31_riscv64.whl", hash = "sha256:c1b3f518abeca3aa13c712fd202306e145abf59a18b094a6bafb2d2bbf59192c", size = 2129627 },
+    { url = "https://files.pythonhosted.org/packages/94/33/2414be571d2c6a6c4d08be21f9292b6d3fdb08949a97b6dfe985017821db/pydantic_core-2.46.4-cp314-cp314-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:1a7dd0b3ee80d90150e3495a3a13ac34dbcbfd4f012996a6a1d8900e91b5c0fb", size = 2179141 },
+    { url = "https://files.pythonhosted.org/packages/7b/79/7daa95be995be0eecc4cf75064cb33f9bbbfe3fe0158caf2f0d4a996a5c7/pydantic_core-2.46.4-cp314-cp314-musllinux_1_1_aarch64.whl", hash = "sha256:3fb702cd90b0446a3a1c5e470bfa0dd23c0233b676a9099ddcc964fa6ca13898", size = 2184325 },
+    { url = "https://files.pythonhosted.org/packages/9f/cb/d0a382f5c0de8a222dc61c65348e0ce831b1f68e0a018450d31c2cace3a5/pydantic_core-2.46.4-cp314-cp314-musllinux_1_1_armv7l.whl", hash = "sha256:b8458003118a712e66286df6a707db01c52c0f52f7db8e4a38f0da1d3b94fc4e", size = 2323990 },
+    { url = "https://files.pythonhosted.org/packages/05/db/d9ba624cc4a5aced1598e88c04fdbd8310c8a69b9d38b9a3d39ce3a61ed7/pydantic_core-2.46.4-cp314-cp314-musllinux_1_1_x86_64.whl", hash = "sha256:372429a130e469c9cd698925ce5fc50940b7a1336b0d82038e63d5bbc4edc519", size = 2369978 },
+    { url = "https://files.pythonhosted.org/packages/f2/20/d15df15ba918c423461905802bfd2981c3af0bfa0e40d05e13edbfa48bc3/pydantic_core-2.46.4-cp314-cp314-win32.whl", hash = "sha256:85bb3611ff1802f3ee7fdd7dbff26b56f343fb432d57a4728fdd49b6ef35e2f4", size = 1966354 },
+    { url = "https://files.pythonhosted.org/packages/fc/b6/6b8de4c0a7d7ab3004c439c80c5c1e0a3e8d78bbae19379b01960383d9e5/pydantic_core-2.46.4-cp314-cp314-win_amd64.whl", hash = "sha256:811ff8e9c313ab425368bcbb36e5c4ebd7108c2bbf4e4089cfbb0b01eff63fac", size = 2072238 },
+    { url = "https://files.pythonhosted.org/packages/32/36/51eb763beec1f4cf59b1db243a7dcc39cbb41230f050a09b9d69faaf0a48/pydantic_core-2.46.4-cp314-cp314-win_arm64.whl", hash = "sha256:bfec22eab3c8cc2ceec0248aec886624116dc079afa027ecc8ad4a7e62010f8a", size = 2018251 },
+    { url = "https://files.pythonhosted.org/packages/e8/91/855af51d625b23aa987116a19e231d2aaef9c4a415273ddc189b79a45fee/pydantic_core-2.46.4-cp314-cp314t-macosx_10_12_x86_64.whl", hash = "sha256:af8244b2bef6aaad6d92cda81372de7f8c8d36c9f0c3ea36e827c60e7d9467a0", size = 2099593 },
+    { url = "https://files.pythonhosted.org/packages/fb/1b/8784a54c65edb5f49f0a14d6977cf1b209bba85a4c77445b255c2de58ab3/pydantic_core-2.46.4-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:5a4330cdbc57162e4b3aa303f588ba752257694c9c9be3e7ebb11b4aca659b5d", size = 1935226 },
+    { url = "https://files.pythonhosted.org/packages/e8/e7/1955d28d1afc56dd4b3ad7cc0cf39df1b9852964cf16e5d13912756d6d6b/pydantic_core-2.46.4-cp314-cp314t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:29c61fc04a3d840155ff08e475a04809278972fe6aef51e2720554e96367e34b", size = 1974605 },
+    { url = "https://files.pythonhosted.org/packages/93/e2/3fedbf0ba7a22850e6e9fd78117f1c0f10f950182344d8a6c535d468fdd8/pydantic_core-2.46.4-cp314-cp314t-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:c50f2528cf200c5eed56faf3f4e22fcd5f38c157a8b78576e6ba3168ec35f000", size = 2030777 },
+    { url = "https://files.pythonhosted.org/packages/f8/61/46be275fcaaba0b4f5b9669dd852267ce1ff616592dccf7a7845588df091/pydantic_core-2.46.4-cp314-cp314t-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:0cbe8b01f948de4286c74cdd6c667aceb38f5c1e26f0693b3983d9d74887c65e", size = 2236641 },
+    { url = "https://files.pythonhosted.org/packages/60/db/12e93e46a8bac9988be3c016860f83293daea8c716c029c9ace279036f2f/pydantic_core-2.46.4-cp314-cp314t-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:617d7e2ca7dcb8c5cf6bcb8c59b8832c94b36196bbf1cbd1bfb56ed341905edd", size = 2286404 },
+    { url = "https://files.pythonhosted.org/packages/e2/4a/4d8b19008f38d31c53b8219cfedc2e3d5de5fe99d90076b7e767de29274f/pydantic_core-2.46.4-cp314-cp314t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7027560ee92211647d0d34e3f7cd6f50da56399d26a9c8ad0da286d3869a53f3", size = 2109219 },
+    { url = "https://files.pythonhosted.org/packages/88/70/3cbc40978fefb7bb09c6708d40d4ad1a5d70fd7213c3d17f971de868ec1f/pydantic_core-2.46.4-cp314-cp314t-manylinux_2_31_riscv64.whl", hash = "sha256:f99626688942fb746e545232e7726926f3be91b5975f8b55327665fafda991c7", size = 2110594 },
+    { url = "https://files.pythonhosted.org/packages/9d/20/b8d36736216e29491125531685b2f9e61aa5b4b2599893f8268551da3338/pydantic_core-2.46.4-cp314-cp314t-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:fc3e9034a63de20e15e8ade85358bc6efc614008cab72898b4b4952bea0509ff", size = 2159542 },
+    { url = "https://files.pythonhosted.org/packages/1d/a2/367df868eb584dacf6bf82a389272406d7178e301c4ac82545ab98bc2dd9/pydantic_core-2.46.4-cp314-cp314t-musllinux_1_1_aarch64.whl", hash = "sha256:97e7cf2be5c77b7d1a9713a05605d49460d02c6078d38d8bef3cbe323c548424", size = 2168146 },
+    { url = "https://files.pythonhosted.org/packages/c1/b8/4460f77f7e201893f649a29ab355dddd3beee8a97bcb1a320db414f9a06e/pydantic_core-2.46.4-cp314-cp314t-musllinux_1_1_armv7l.whl", hash = "sha256:3bf92c5d0e00fefaab325a4d27828fe6b6e2a21848686b5b60d2d9eeb09d76c6", size = 2306309 },
+    { url = "https://files.pythonhosted.org/packages/64/c4/be2639293acd87dc8ddbcec41a73cee9b2ebf996fe6d892a1a74e88ad3f7/pydantic_core-2.46.4-cp314-cp314t-musllinux_1_1_x86_64.whl", hash = "sha256:3ecbc122d18468d06ca279dc26a8c2e2d5acb10943bb35e36ae92096dc3b5565", size = 2369736 },
+    { url = "https://files.pythonhosted.org/packages/30/a6/9f9f380dbb301f67023bf8f707aaa75daadf84f7152d95c410fd7e81d994/pydantic_core-2.46.4-cp314-cp314t-win32.whl", hash = "sha256:e846ae7835bf0703ae43f534ab79a867146dadd59dc9ca5c8b53d5c8f7c9ef02", size = 1955575 },
+    { url = "https://files.pythonhosted.org/packages/40/1f/f1eb9eb350e795d1af8586289746f5c5677d16043040d63710e22abc43c9/pydantic_core-2.46.4-cp314-cp314t-win_amd64.whl", hash = "sha256:2108ba5c1c1eca18030634489dc544844144ee36357f2f9f780b93e7ddbb44b5", size = 2051624 },
+    { url = "https://files.pythonhosted.org/packages/f6/d2/42dd53d0a85c27606f316d3aa5d2869c4e8470a5ed6dec30e4a1abe19192/pydantic_core-2.46.4-cp314-cp314t-win_arm64.whl", hash = "sha256:4fcbe087dbc2068af7eda3aa87634eba216dbda64d1ae73c8684b621d33f6596", size = 2017325 },
+]
+
+[[package]]
+name = "pygments"
+version = "2.20.0"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/c3/b2/bc9c9196916376152d655522fdcebac55e66de6603a76a02bca1b6414f6c/pygments-2.20.0.tar.gz", hash = "sha256:6757cd03768053ff99f3039c1a36d6c0aa0b263438fcab17520b30a303a82b5f", size = 4955991 }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/f4/7e/a72dd26f3b0f4f2bf1dd8923c85f7ceb43172af56d63c7383eb62b332364/pygments-2.20.0-py3-none-any.whl", hash = "sha256:81a9e26dd42fd28a23a2d169d86d7ac03b46e2f8b59ed4698fb4785f946d0176", size = 1231151 },
+]
+
+[[package]]
+name = "pytest"
+version = "9.0.3"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "colorama", marker = "sys_platform == 'win32'" },
+    { name = "iniconfig" },
+    { name = "packaging" },
+    { name = "pluggy" },
+    { name = "pygments" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/7d/0d/549bd94f1a0a402dc8cf64563a117c0f3765662e2e668477624baeec44d5/pytest-9.0.3.tar.gz", hash = "sha256:b86ada508af81d19edeb213c681b1d48246c1a91d304c6c81a427674c17eb91c", size = 1572165 }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/d4/24/a372aaf5c9b7208e7112038812994107bc65a84cd00e0354a88c2c77a617/pytest-9.0.3-py3-none-any.whl", hash = "sha256:2c5efc453d45394fdd706ade797c0a81091eccd1d6e4bccfcd476e2b8e0ab5d9", size = 375249 },
+]
+
+[[package]]
+name = "pytest-asyncio"
+version = "1.3.0"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "pytest" },
+    { name = "typing-extensions", marker = "python_full_version < '3.13'" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/90/2c/8af215c0f776415f3590cac4f9086ccefd6fd463befeae41cd4d3f193e5a/pytest_asyncio-1.3.0.tar.gz", hash = "sha256:d7f52f36d231b80ee124cd216ffb19369aa168fc10095013c6b014a34d3ee9e5", size = 50087 }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/e5/35/f8b19922b6a25bc0880171a2f1a003eaeb93657475193ab516fd87cac9da/pytest_asyncio-1.3.0-py3-none-any.whl", hash = "sha256:611e26147c7f77640e6d0a92a38ed17c3e9848063698d5c93d5aa7aa11cebff5", size = 15075 },
+]
+
+[[package]]
+name = "python-dateutil"
+version = "2.9.0.post0"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "six" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/66/c0/0c8b6ad9f17a802ee498c46e004a0eb49bc148f2fd230864601a86dcf6db/python-dateutil-2.9.0.post0.tar.gz", hash = "sha256:37dd54208da7e1cd875388217d5e00ebd4179249f90fb72437e91a35459a0ad3", size = 342432 }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/ec/57/56b9bcc3c9c6a792fcbaf139543cee77261f3651ca9da0c93f5c1221264b/python_dateutil-2.9.0.post0-py2.py3-none-any.whl", hash = "sha256:a8b2bc7bffae282281c8140a97d3aa9c14da0b136dfe83f850eea9a5f7470427", size = 229892 },
+]
+
+[[package]]
+name = "python-dotenv"
+version = "1.2.2"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/82/ed/0301aeeac3e5353ef3d94b6ec08bbcabd04a72018415dcb29e588514bba8/python_dotenv-1.2.2.tar.gz", hash = "sha256:2c371a91fbd7ba082c2c1dc1f8bf89ca22564a087c2c287cd9b662adde799cf3", size = 50135 }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/0b/d7/1959b9648791274998a9c3526f6d0ec8fd2233e4d4acce81bbae76b44b2a/python_dotenv-1.2.2-py3-none-any.whl", hash = "sha256:1d8214789a24de455a8b8bd8ae6fe3c6b69a5e3d64aa8a8e5d68e694bbcb285a", size = 22101 },
+]
+
+[[package]]
+name = "pyyaml"
+version = "6.0.3"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/05/8e/961c0007c59b8dd7729d542c61a4d537767a59645b82a0b521206e1e25c2/pyyaml-6.0.3.tar.gz", hash = "sha256:d76623373421df22fb4cf8817020cbb7ef15c725b9d5e45f17e189bfc384190f", size = 130960 }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/d1/33/422b98d2195232ca1826284a76852ad5a86fe23e31b009c9886b2d0fb8b2/pyyaml-6.0.3-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:7f047e29dcae44602496db43be01ad42fc6f1cc0d8cd6c83d342306c32270196", size = 182063 },
+    { url = "https://files.pythonhosted.org/packages/89/a0/6cf41a19a1f2f3feab0e9c0b74134aa2ce6849093d5517a0c550fe37a648/pyyaml-6.0.3-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:fc09d0aa354569bc501d4e787133afc08552722d3ab34836a80547331bb5d4a0", size = 173973 },
+    { url = "https://files.pythonhosted.org/packages/ed/23/7a778b6bd0b9a8039df8b1b1d80e2e2ad78aa04171592c8a5c43a56a6af4/pyyaml-6.0.3-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:9149cad251584d5fb4981be1ecde53a1ca46c891a79788c0df828d2f166bda28", size = 775116 },
+    { url = "https://files.pythonhosted.org/packages/65/30/d7353c338e12baef4ecc1b09e877c1970bd3382789c159b4f89d6a70dc09/pyyaml-6.0.3-cp312-cp312-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:5fdec68f91a0c6739b380c83b951e2c72ac0197ace422360e6d5a959d8d97b2c", size = 844011 },
+    { url = "https://files.pythonhosted.org/packages/8b/9d/b3589d3877982d4f2329302ef98a8026e7f4443c765c46cfecc8858c6b4b/pyyaml-6.0.3-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:ba1cc08a7ccde2d2ec775841541641e4548226580ab850948cbfda66a1befcdc", size = 807870 },
+    { url = "https://files.pythonhosted.org/packages/05/c0/b3be26a015601b822b97d9149ff8cb5ead58c66f981e04fedf4e762f4bd4/pyyaml-6.0.3-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:8dc52c23056b9ddd46818a57b78404882310fb473d63f17b07d5c40421e47f8e", size = 761089 },
+    { url = "https://files.pythonhosted.org/packages/be/8e/98435a21d1d4b46590d5459a22d88128103f8da4c2d4cb8f14f2a96504e1/pyyaml-6.0.3-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:41715c910c881bc081f1e8872880d3c650acf13dfa8214bad49ed4cede7c34ea", size = 790181 },
+    { url = "https://files.pythonhosted.org/packages/74/93/7baea19427dcfbe1e5a372d81473250b379f04b1bd3c4c5ff825e2327202/pyyaml-6.0.3-cp312-cp312-win32.whl", hash = "sha256:96b533f0e99f6579b3d4d4995707cf36df9100d67e0c8303a0c55b27b5f99bc5", size = 137658 },
+    { url = "https://files.pythonhosted.org/packages/86/bf/899e81e4cce32febab4fb42bb97dcdf66bc135272882d1987881a4b519e9/pyyaml-6.0.3-cp312-cp312-win_amd64.whl", hash = "sha256:5fcd34e47f6e0b794d17de1b4ff496c00986e1c83f7ab2fb8fcfe9616ff7477b", size = 154003 },
+    { url = "https://files.pythonhosted.org/packages/1a/08/67bd04656199bbb51dbed1439b7f27601dfb576fb864099c7ef0c3e55531/pyyaml-6.0.3-cp312-cp312-win_arm64.whl", hash = "sha256:64386e5e707d03a7e172c0701abfb7e10f0fb753ee1d773128192742712a98fd", size = 140344 },
+    { url = "https://files.pythonhosted.org/packages/d1/11/0fd08f8192109f7169db964b5707a2f1e8b745d4e239b784a5a1dd80d1db/pyyaml-6.0.3-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:8da9669d359f02c0b91ccc01cac4a67f16afec0dac22c2ad09f46bee0697eba8", size = 181669 },
+    { url = "https://files.pythonhosted.org/packages/b1/16/95309993f1d3748cd644e02e38b75d50cbc0d9561d21f390a76242ce073f/pyyaml-6.0.3-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:2283a07e2c21a2aa78d9c4442724ec1eb15f5e42a723b99cb3d822d48f5f7ad1", size = 173252 },
+    { url = "https://files.pythonhosted.org/packages/50/31/b20f376d3f810b9b2371e72ef5adb33879b25edb7a6d072cb7ca0c486398/pyyaml-6.0.3-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:ee2922902c45ae8ccada2c5b501ab86c36525b883eff4255313a253a3160861c", size = 767081 },
+    { url = "https://files.pythonhosted.org/packages/49/1e/a55ca81e949270d5d4432fbbd19dfea5321eda7c41a849d443dc92fd1ff7/pyyaml-6.0.3-cp313-cp313-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:a33284e20b78bd4a18c8c2282d549d10bc8408a2a7ff57653c0cf0b9be0afce5", size = 841159 },
+    { url = "https://files.pythonhosted.org/packages/74/27/e5b8f34d02d9995b80abcef563ea1f8b56d20134d8f4e5e81733b1feceb2/pyyaml-6.0.3-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:0f29edc409a6392443abf94b9cf89ce99889a1dd5376d94316ae5145dfedd5d6", size = 801626 },
+    { url = "https://files.pythonhosted.org/packages/f9/11/ba845c23988798f40e52ba45f34849aa8a1f2d4af4b798588010792ebad6/pyyaml-6.0.3-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:f7057c9a337546edc7973c0d3ba84ddcdf0daa14533c2065749c9075001090e6", size = 753613 },
+    { url = "https://files.pythonhosted.org/packages/3d/e0/7966e1a7bfc0a45bf0a7fb6b98ea03fc9b8d84fa7f2229e9659680b69ee3/pyyaml-6.0.3-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:eda16858a3cab07b80edaf74336ece1f986ba330fdb8ee0d6c0d68fe82bc96be", size = 794115 },
+    { url = "https://files.pythonhosted.org/packages/de/94/980b50a6531b3019e45ddeada0626d45fa85cbe22300844a7983285bed3b/pyyaml-6.0.3-cp313-cp313-win32.whl", hash = "sha256:d0eae10f8159e8fdad514efdc92d74fd8d682c933a6dd088030f3834bc8e6b26", size = 137427 },
+    { url = "https://files.pythonhosted.org/packages/97/c9/39d5b874e8b28845e4ec2202b5da735d0199dbe5b8fb85f91398814a9a46/pyyaml-6.0.3-cp313-cp313-win_amd64.whl", hash = "sha256:79005a0d97d5ddabfeeea4cf676af11e647e41d81c9a7722a193022accdb6b7c", size = 154090 },
+    { url = "https://files.pythonhosted.org/packages/73/e8/2bdf3ca2090f68bb3d75b44da7bbc71843b19c9f2b9cb9b0f4ab7a5a4329/pyyaml-6.0.3-cp313-cp313-win_arm64.whl", hash = "sha256:5498cd1645aa724a7c71c8f378eb29ebe23da2fc0d7a08071d89469bf1d2defb", size = 140246 },
+    { url = "https://files.pythonhosted.org/packages/9d/8c/f4bd7f6465179953d3ac9bc44ac1a8a3e6122cf8ada906b4f96c60172d43/pyyaml-6.0.3-cp314-cp314-macosx_10_13_x86_64.whl", hash = "sha256:8d1fab6bb153a416f9aeb4b8763bc0f22a5586065f86f7664fc23339fc1c1fac", size = 181814 },
+    { url = "https://files.pythonhosted.org/packages/bd/9c/4d95bb87eb2063d20db7b60faa3840c1b18025517ae857371c4dd55a6b3a/pyyaml-6.0.3-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:34d5fcd24b8445fadc33f9cf348c1047101756fd760b4dacb5c3e99755703310", size = 173809 },
+    { url = "https://files.pythonhosted.org/packages/92/b5/47e807c2623074914e29dabd16cbbdd4bf5e9b2db9f8090fa64411fc5382/pyyaml-6.0.3-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:501a031947e3a9025ed4405a168e6ef5ae3126c59f90ce0cd6f2bfc477be31b7", size = 766454 },
+    { url = "https://files.pythonhosted.org/packages/02/9e/e5e9b168be58564121efb3de6859c452fccde0ab093d8438905899a3a483/pyyaml-6.0.3-cp314-cp314-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:b3bc83488de33889877a0f2543ade9f70c67d66d9ebb4ac959502e12de895788", size = 836355 },
+    { url = "https://files.pythonhosted.org/packages/88/f9/16491d7ed2a919954993e48aa941b200f38040928474c9e85ea9e64222c3/pyyaml-6.0.3-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:c458b6d084f9b935061bc36216e8a69a7e293a2f1e68bf956dcd9e6cbcd143f5", size = 794175 },
+    { url = "https://files.pythonhosted.org/packages/dd/3f/5989debef34dc6397317802b527dbbafb2b4760878a53d4166579111411e/pyyaml-6.0.3-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:7c6610def4f163542a622a73fb39f534f8c101d690126992300bf3207eab9764", size = 755228 },
+    { url = "https://files.pythonhosted.org/packages/d7/ce/af88a49043cd2e265be63d083fc75b27b6ed062f5f9fd6cdc223ad62f03e/pyyaml-6.0.3-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:5190d403f121660ce8d1d2c1bb2ef1bd05b5f68533fc5c2ea899bd15f4399b35", size = 789194 },
+    { url = "https://files.pythonhosted.org/packages/23/20/bb6982b26a40bb43951265ba29d4c246ef0ff59c9fdcdf0ed04e0687de4d/pyyaml-6.0.3-cp314-cp314-win_amd64.whl", hash = "sha256:4a2e8cebe2ff6ab7d1050ecd59c25d4c8bd7e6f400f5f82b96557ac0abafd0ac", size = 156429 },
+    { url = "https://files.pythonhosted.org/packages/f4/f4/a4541072bb9422c8a883ab55255f918fa378ecf083f5b85e87fc2b4eda1b/pyyaml-6.0.3-cp314-cp314-win_arm64.whl", hash = "sha256:93dda82c9c22deb0a405ea4dc5f2d0cda384168e466364dec6255b293923b2f3", size = 143912 },
+    { url = "https://files.pythonhosted.org/packages/7c/f9/07dd09ae774e4616edf6cda684ee78f97777bdd15847253637a6f052a62f/pyyaml-6.0.3-cp314-cp314t-macosx_10_13_x86_64.whl", hash = "sha256:02893d100e99e03eda1c8fd5c441d8c60103fd175728e23e431db1b589cf5ab3", size = 189108 },
+    { url = "https://files.pythonhosted.org/packages/4e/78/8d08c9fb7ce09ad8c38ad533c1191cf27f7ae1effe5bb9400a46d9437fcf/pyyaml-6.0.3-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:c1ff362665ae507275af2853520967820d9124984e0f7466736aea23d8611fba", size = 183641 },
+    { url = "https://files.pythonhosted.org/packages/7b/5b/3babb19104a46945cf816d047db2788bcaf8c94527a805610b0289a01c6b/pyyaml-6.0.3-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:6adc77889b628398debc7b65c073bcb99c4a0237b248cacaf3fe8a557563ef6c", size = 831901 },
+    { url = "https://files.pythonhosted.org/packages/8b/cc/dff0684d8dc44da4d22a13f35f073d558c268780ce3c6ba1b87055bb0b87/pyyaml-6.0.3-cp314-cp314t-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:a80cb027f6b349846a3bf6d73b5e95e782175e52f22108cfa17876aaeff93702", size = 861132 },
+    { url = "https://files.pythonhosted.org/packages/b1/5e/f77dc6b9036943e285ba76b49e118d9ea929885becb0a29ba8a7c75e29fe/pyyaml-6.0.3-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:00c4bdeba853cc34e7dd471f16b4114f4162dc03e6b7afcc2128711f0eca823c", size = 839261 },
+    { url = "https://files.pythonhosted.org/packages/ce/88/a9db1376aa2a228197c58b37302f284b5617f56a5d959fd1763fb1675ce6/pyyaml-6.0.3-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:66e1674c3ef6f541c35191caae2d429b967b99e02040f5ba928632d9a7f0f065", size = 805272 },
+    { url = "https://files.pythonhosted.org/packages/da/92/1446574745d74df0c92e6aa4a7b0b3130706a4142b2d1a5869f2eaa423c6/pyyaml-6.0.3-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:16249ee61e95f858e83976573de0f5b2893b3677ba71c9dd36b9cf8be9ac6d65", size = 829923 },
+    { url = "https://files.pythonhosted.org/packages/f0/7a/1c7270340330e575b92f397352af856a8c06f230aa3e76f86b39d01b416a/pyyaml-6.0.3-cp314-cp314t-win_amd64.whl", hash = "sha256:4ad1906908f2f5ae4e5a8ddfce73c320c2a1429ec52eafd27138b7f1cbe341c9", size = 174062 },
+    { url = "https://files.pythonhosted.org/packages/f1/12/de94a39c2ef588c7e6455cfbe7343d3b2dc9d6b6b2f40c4c6565744c873d/pyyaml-6.0.3-cp314-cp314t-win_arm64.whl", hash = "sha256:ebc55a14a21cb14062aa4162f906cd962b28e2e9ea38f9b4391244cd8de4ae0b", size = 149341 },
+]
+
+[[package]]
+name = "reportlab"
+version = "4.5.1"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "charset-normalizer" },
+    { name = "pillow" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/4d/3f/b3861b7e40c9d66f4a04e018958d681d16b948bfd1963c962d43a8c23f66/reportlab-4.5.1.tar.gz", hash = "sha256:9fdf68f4de9171ec66acb4a5feed8f8ca2af43479e707a6fbb0daa75d88e5494", size = 3939748 }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/a7/45/ea7fad10122440de6e845568d106bffdc456ca0e8a1d8ae10b46016087e4/reportlab-4.5.1-py3-none-any.whl", hash = "sha256:06fce8cb56c83307cfa4909cdf4e6a2ddbb44e5d6ef4d2edca896d7e9769f091", size = 1957812 },
+]
+
+[[package]]
+name = "requests"
+version = "2.34.0"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "certifi" },
+    { name = "charset-normalizer" },
+    { name = "idna" },
+    { name = "urllib3" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/43/b8/7a707d60fea4c49094e40262cc0e2ca6c768cca21587e34d3f705afec47e/requests-2.34.0.tar.gz", hash = "sha256:7d62fe92f50eb82c529b0916bb445afa1531a566fc8f35ffdc64446e771b856a", size = 142436 }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/ef/e6/e300fce5fe83c30520607a015dabd985df3251e188d234bfe9492e17a389/requests-2.34.0-py3-none-any.whl", hash = "sha256:917520a21b767485ce7c588f4ebb917c436b24a31231b44228715eaeb5a52c60", size = 73021 },
+]
+
+[[package]]
+name = "respx"
+version = "0.23.1"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "httpx" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/43/98/4e55c9c486404ec12373708d015ebce157966965a5ebe7f28ff2c784d41b/respx-0.23.1.tar.gz", hash = "sha256:242dcc6ce6b5b9bf621f5870c82a63997e8e82bc7c947f9ffe272b8f3dd5a780", size = 29243 }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/1d/4a/221da6ca167db45693d8d26c7dc79ccfc978a440251bf6721c9aaf251ac0/respx-0.23.1-py2.py3-none-any.whl", hash = "sha256:b18004b029935384bccfa6d7d9d74b4ec9af73a081cc28600fffc0447f4b8c1a", size = 25557 },
+]
+
+[[package]]
+name = "rich"
+version = "15.0.0"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "markdown-it-py" },
+    { name = "pygments" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/c0/8f/0722ca900cc807c13a6a0c696dacf35430f72e0ec571c4275d2371fca3e9/rich-15.0.0.tar.gz", hash = "sha256:edd07a4824c6b40189fb7ac9bc4c52536e9780fbbfbddf6f1e2502c31b068c36", size = 230680 }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/82/3b/64d4899d73f91ba49a8c18a8ff3f0ea8f1c1d75481760df8c68ef5235bf5/rich-15.0.0-py3-none-any.whl", hash = "sha256:33bd4ef74232fb73fe9279a257718407f169c09b78a87ad3d296f548e27de0bb", size = 310654 },
+]
+
+[[package]]
+name = "ruff"
+version = "0.15.12"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/99/43/3291f1cc9106f4c63bdce7a8d0df5047fe8422a75b091c16b5e9355e0b11/ruff-0.15.12.tar.gz", hash = "sha256:ecea26adb26b4232c0c2ca19ccbc0083a68344180bba2a600605538ce51a40a6", size = 4643852 }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/c3/6e/e78ffb61d4686f3d96ba3df2c801161843746dcbcbb17a1e927d4829312b/ruff-0.15.12-py3-none-linux_armv6l.whl", hash = "sha256:f86f176e188e94d6bdbc09f09bfd9dc729059ad93d0e7390b5a73efe19f8861c", size = 10640713 },
+    { url = "https://files.pythonhosted.org/packages/ae/08/a317bc231fb9e7b93e4ef3089501e51922ff88d6936ce5cf870c4fe55419/ruff-0.15.12-py3-none-macosx_10_12_x86_64.whl", hash = "sha256:e3bcd123364c3770b8e1b7baaf343cc99a35f197c5c6e8af79015c666c423a6c", size = 11069267 },
+    { url = "https://files.pythonhosted.org/packages/aa/a4/f828e9718d3dce1f5f11c39c4f65afd32783c8b2aebb2e3d259e492c47bd/ruff-0.15.12-py3-none-macosx_11_0_arm64.whl", hash = "sha256:fe87510d000220aa1ed530d4448a7c696a0cae1213e5ec30e5874287b66557b5", size = 10397182 },
+    { url = "https://files.pythonhosted.org/packages/71/e0/3310fc6d1b5e1fdea22bf3b1b807c7e187b581021b0d7d4514cccdb5fb71/ruff-0.15.12-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:84a1630093121375a3e2a95b4a6dc7b59e2b4ee76216e32d81aae550a832d002", size = 10758012 },
+    { url = "https://files.pythonhosted.org/packages/11/c1/a606911aee04c324ddaa883ae418f3569792fd3c4a10c50e0dd0a2311e1e/ruff-0.15.12-py3-none-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:fb129f40f114f089ebe0ca56c0d251cf2061b17651d464bb6478dc01e69f11f5", size = 10447479 },
+    { url = "https://files.pythonhosted.org/packages/9d/68/4201e8444f0894f21ab4aeeaee68aa4f10b51613514a20d80bd628d57e88/ruff-0.15.12-py3-none-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:b0c862b172d695db7598426b8af465e7e9ac00a3ea2a3630ee67eb82e366aaa6", size = 11234040 },
+    { url = "https://files.pythonhosted.org/packages/34/ff/8a6d6cf4ccc23fd67060874e832c18919d1557a0611ebef03fdb01fff11e/ruff-0.15.12-py3-none-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:2849ea9f3484c3aca43a82f484210370319e7170df4dfe4843395ddf6c57bc33", size = 12087377 },
+    { url = "https://files.pythonhosted.org/packages/85/f6/c669cf73f5152f623d34e69866a46d5e6185816b19fcd5b6dd8a2d299922/ruff-0.15.12-py3-none-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:9e77c7e51c07fe396826d5969a5b846d9cd4c402535835fb6e21ce8b28fef847", size = 11367784 },
+    { url = "https://files.pythonhosted.org/packages/e8/39/c61d193b8a1daaa8977f7dea9e8d8ba866e02ea7b65d32f6861693aa4c12/ruff-0.15.12-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:83b2f4f2f3b1026b5fb449b467d9264bf22067b600f7b6f41fc5958909f449d0", size = 11344088 },
+    { url = "https://files.pythonhosted.org/packages/c2/8d/49afab3645e31e12c590acb6d3b5b69d7aab5b81926dbaf7461f9441f37a/ruff-0.15.12-py3-none-manylinux_2_31_riscv64.whl", hash = "sha256:9ba3b8f1afd7e2e43d8943e55f249e13f9682fde09711644a6e7290eb4f3e339", size = 11271770 },
+    { url = "https://files.pythonhosted.org/packages/46/06/33f41fe94403e2b755481cdfb9b7ef3e4e0ed031c4581124658d935d52b4/ruff-0.15.12-py3-none-musllinux_1_2_aarch64.whl", hash = "sha256:e852ba9fdc890655e1d78f2df1499efbe0e54126bd405362154a75e2bde159c5", size = 10719355 },
+    { url = "https://files.pythonhosted.org/packages/0d/59/18aa4e014debbf559670e4048e39260a85c7fcee84acfd761ac01e7b8d35/ruff-0.15.12-py3-none-musllinux_1_2_armv7l.whl", hash = "sha256:dd8aed930da53780d22fc70bdf84452c843cf64f8cb4eb38984319c24c5cd5fd", size = 10462758 },
+    { url = "https://files.pythonhosted.org/packages/25/e7/cc9f16fd0f3b5fddcbd7ec3d6ae30c8f3fde1047f32a4093a98d633c6570/ruff-0.15.12-py3-none-musllinux_1_2_i686.whl", hash = "sha256:01da3988d225628b709493d7dc67c3b9b12c0210016b08690ef9bd27970b262b", size = 10953498 },
+    { url = "https://files.pythonhosted.org/packages/72/7a/a9ba7f98c7a575978698f4230c5e8cc54bbc761af34f560818f933dafa0c/ruff-0.15.12-py3-none-musllinux_1_2_x86_64.whl", hash = "sha256:9cae0f92bd5700d1213188b31cd3bdd2b315361296d10b96b8e2337d3d11f53e", size = 11447765 },
+    { url = "https://files.pythonhosted.org/packages/ea/f9/0ae446942c846b8266059ad8a30702a35afae55f5cdc54c5adf8d7afdc27/ruff-0.15.12-py3-none-win32.whl", hash = "sha256:d0185894e038d7043ba8fd6aee7499ece6462dc0ea9f1e260c7451807c714c20", size = 10657277 },
+    { url = "https://files.pythonhosted.org/packages/33/f1/9614e03e1cdcbf9437570b5400ced8a720b5db22b28d8e0f1bda429f660d/ruff-0.15.12-py3-none-win_amd64.whl", hash = "sha256:c87a162d61ab3adca47c03f7f717c68672edec7d1b5499e652331780fe74950d", size = 11837758 },
+    { url = "https://files.pythonhosted.org/packages/c0/98/6beb4b351e472e5f4c4613f7c35a5290b8be2497e183825310c4c3a3984b/ruff-0.15.12-py3-none-win_arm64.whl", hash = "sha256:a538f7a82d061cee7be55542aca1d86d1393d55d81d4fcc314370f4340930d4f", size = 11120821 },
+]
+
+[[package]]
+name = "scikit-learn"
+version = "1.8.0"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "joblib" },
+    { name = "numpy" },
+    { name = "scipy" },
+    { name = "threadpoolctl" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/0e/d4/40988bf3b8e34feec1d0e6a051446b1f66225f8529b9309becaeef62b6c4/scikit_learn-1.8.0.tar.gz", hash = "sha256:9bccbb3b40e3de10351f8f5068e105d0f4083b1a65fa07b6634fbc401a6287fd", size = 7335585 }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/90/74/e6a7cc4b820e95cc38cf36cd74d5aa2b42e8ffc2d21fe5a9a9c45c1c7630/scikit_learn-1.8.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:5fb63362b5a7ddab88e52b6dbb47dac3fd7dafeee740dc6c8d8a446ddedade8e", size = 8548242 },
+    { url = "https://files.pythonhosted.org/packages/49/d8/9be608c6024d021041c7f0b3928d4749a706f4e2c3832bbede4fb4f58c95/scikit_learn-1.8.0-cp312-cp312-macosx_12_0_arm64.whl", hash = "sha256:5025ce924beccb28298246e589c691fe1b8c1c96507e6d27d12c5fadd85bfd76", size = 8079075 },
+    { url = "https://files.pythonhosted.org/packages/dd/47/f187b4636ff80cc63f21cd40b7b2d177134acaa10f6bb73746130ee8c2e5/scikit_learn-1.8.0-cp312-cp312-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:4496bb2cf7a43ce1a2d7524a79e40bc5da45cf598dbf9545b7e8316ccba47bb4", size = 8660492 },
+    { url = "https://files.pythonhosted.org/packages/97/74/b7a304feb2b49df9fafa9382d4d09061a96ee9a9449a7cbea7988dda0828/scikit_learn-1.8.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:a0bcfe4d0d14aec44921545fd2af2338c7471de9cb701f1da4c9d85906ab847a", size = 8931904 },
+    { url = "https://files.pythonhosted.org/packages/9f/c4/0ab22726a04ede56f689476b760f98f8f46607caecff993017ac1b64aa5d/scikit_learn-1.8.0-cp312-cp312-win_amd64.whl", hash = "sha256:35c007dedb2ffe38fe3ee7d201ebac4a2deccd2408e8621d53067733e3c74809", size = 8019359 },
+    { url = "https://files.pythonhosted.org/packages/24/90/344a67811cfd561d7335c1b96ca21455e7e472d281c3c279c4d3f2300236/scikit_learn-1.8.0-cp312-cp312-win_arm64.whl", hash = "sha256:8c497fff237d7b4e07e9ef1a640887fa4fb765647f86fbe00f969ff6280ce2bb", size = 7641898 },
+    { url = "https://files.pythonhosted.org/packages/03/aa/e22e0768512ce9255eba34775be2e85c2048da73da1193e841707f8f039c/scikit_learn-1.8.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:0d6ae97234d5d7079dc0040990a6f7aeb97cb7fa7e8945f1999a429b23569e0a", size = 8513770 },
+    { url = "https://files.pythonhosted.org/packages/58/37/31b83b2594105f61a381fc74ca19e8780ee923be2d496fcd8d2e1147bd99/scikit_learn-1.8.0-cp313-cp313-macosx_12_0_arm64.whl", hash = "sha256:edec98c5e7c128328124a029bceb09eda2d526997780fef8d65e9a69eead963e", size = 8044458 },
+    { url = "https://files.pythonhosted.org/packages/2d/5a/3f1caed8765f33eabb723596666da4ebbf43d11e96550fb18bdec42b467b/scikit_learn-1.8.0-cp313-cp313-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:74b66d8689d52ed04c271e1329f0c61635bcaf5b926db9b12d58914cdc01fe57", size = 8610341 },
+    { url = "https://files.pythonhosted.org/packages/38/cf/06896db3f71c75902a8e9943b444a56e727418f6b4b4a90c98c934f51ed4/scikit_learn-1.8.0-cp313-cp313-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:8fdf95767f989b0cfedb85f7ed8ca215d4be728031f56ff5a519ee1e3276dc2e", size = 8900022 },
+    { url = "https://files.pythonhosted.org/packages/1c/f9/9b7563caf3ec8873e17a31401858efab6b39a882daf6c1bfa88879c0aa11/scikit_learn-1.8.0-cp313-cp313-win_amd64.whl", hash = "sha256:2de443b9373b3b615aec1bb57f9baa6bb3a9bd093f1269ba95c17d870422b271", size = 7989409 },
+    { url = "https://files.pythonhosted.org/packages/49/bd/1f4001503650e72c4f6009ac0c4413cb17d2d601cef6f71c0453da2732fc/scikit_learn-1.8.0-cp313-cp313-win_arm64.whl", hash = "sha256:eddde82a035681427cbedded4e6eff5e57fa59216c2e3e90b10b19ab1d0a65c3", size = 7619760 },
+    { url = "https://files.pythonhosted.org/packages/d2/7d/a630359fc9dcc95496588c8d8e3245cc8fd81980251079bc09c70d41d951/scikit_learn-1.8.0-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:7cc267b6108f0a1499a734167282c00c4ebf61328566b55ef262d48e9849c735", size = 8826045 },
+    { url = "https://files.pythonhosted.org/packages/cc/56/a0c86f6930cfcd1c7054a2bc417e26960bb88d32444fe7f71d5c2cfae891/scikit_learn-1.8.0-cp313-cp313t-macosx_12_0_arm64.whl", hash = "sha256:fe1c011a640a9f0791146011dfd3c7d9669785f9fed2b2a5f9e207536cf5c2fd", size = 8420324 },
+    { url = "https://files.pythonhosted.org/packages/46/1e/05962ea1cebc1cf3876667ecb14c283ef755bf409993c5946ade3b77e303/scikit_learn-1.8.0-cp313-cp313t-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:72358cce49465d140cc4e7792015bb1f0296a9742d5622c67e31399b75468b9e", size = 8680651 },
+    { url = "https://files.pythonhosted.org/packages/fe/56/a85473cd75f200c9759e3a5f0bcab2d116c92a8a02ee08ccd73b870f8bb4/scikit_learn-1.8.0-cp313-cp313t-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:80832434a6cc114f5219211eec13dcbc16c2bac0e31ef64c6d346cde3cf054cb", size = 8925045 },
+    { url = "https://files.pythonhosted.org/packages/cc/b7/64d8cfa896c64435ae57f4917a548d7ac7a44762ff9802f75a79b77cb633/scikit_learn-1.8.0-cp313-cp313t-win_amd64.whl", hash = "sha256:ee787491dbfe082d9c3013f01f5991658b0f38aa8177e4cd4bf434c58f551702", size = 8507994 },
+    { url = "https://files.pythonhosted.org/packages/5e/37/e192ea709551799379958b4c4771ec507347027bb7c942662c7fbeba31cb/scikit_learn-1.8.0-cp313-cp313t-win_arm64.whl", hash = "sha256:bf97c10a3f5a7543f9b88cbf488d33d175e9146115a451ae34568597ba33dcde", size = 7869518 },
+    { url = "https://files.pythonhosted.org/packages/24/05/1af2c186174cc92dcab2233f327336058c077d38f6fe2aceb08e6ab4d509/scikit_learn-1.8.0-cp314-cp314-macosx_10_15_x86_64.whl", hash = "sha256:c22a2da7a198c28dd1a6e1136f19c830beab7fdca5b3e5c8bba8394f8a5c45b3", size = 8528667 },
+    { url = "https://files.pythonhosted.org/packages/a8/25/01c0af38fe969473fb292bba9dc2b8f9b451f3112ff242c647fee3d0dfe7/scikit_learn-1.8.0-cp314-cp314-macosx_12_0_arm64.whl", hash = "sha256:6b595b07a03069a2b1740dc08c2299993850ea81cce4fe19b2421e0c970de6b7", size = 8066524 },
+    { url = "https://files.pythonhosted.org/packages/be/ce/a0623350aa0b68647333940ee46fe45086c6060ec604874e38e9ab7d8e6c/scikit_learn-1.8.0-cp314-cp314-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:29ffc74089f3d5e87dfca4c2c8450f88bdc61b0fc6ed5d267f3988f19a1309f6", size = 8657133 },
+    { url = "https://files.pythonhosted.org/packages/b8/cb/861b41341d6f1245e6ca80b1c1a8c4dfce43255b03df034429089ca2a2c5/scikit_learn-1.8.0-cp314-cp314-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:fb65db5d7531bccf3a4f6bec3462223bea71384e2cda41da0f10b7c292b9e7c4", size = 8923223 },
+    { url = "https://files.pythonhosted.org/packages/76/18/a8def8f91b18cd1ba6e05dbe02540168cb24d47e8dcf69e8d00b7da42a08/scikit_learn-1.8.0-cp314-cp314-win_amd64.whl", hash = "sha256:56079a99c20d230e873ea40753102102734c5953366972a71d5cb39a32bc40c6", size = 8096518 },
+    { url = "https://files.pythonhosted.org/packages/d1/77/482076a678458307f0deb44e29891d6022617b2a64c840c725495bee343f/scikit_learn-1.8.0-cp314-cp314-win_arm64.whl", hash = "sha256:3bad7565bc9cf37ce19a7c0d107742b320c1285df7aab1a6e2d28780df167242", size = 7754546 },
+    { url = "https://files.pythonhosted.org/packages/2d/d1/ef294ca754826daa043b2a104e59960abfab4cf653891037d19dd5b6f3cf/scikit_learn-1.8.0-cp314-cp314t-macosx_10_15_x86_64.whl", hash = "sha256:4511be56637e46c25721e83d1a9cea9614e7badc7040c4d573d75fbe257d6fd7", size = 8848305 },
+    { url = "https://files.pythonhosted.org/packages/5b/e2/b1f8b05138ee813b8e1a4149f2f0d289547e60851fd1bb268886915adbda/scikit_learn-1.8.0-cp314-cp314t-macosx_12_0_arm64.whl", hash = "sha256:a69525355a641bf8ef136a7fa447672fb54fe8d60cab5538d9eb7c6438543fb9", size = 8432257 },
+    { url = "https://files.pythonhosted.org/packages/26/11/c32b2138a85dcb0c99f6afd13a70a951bfdff8a6ab42d8160522542fb647/scikit_learn-1.8.0-cp314-cp314t-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:c2656924ec73e5939c76ac4c8b026fc203b83d8900362eb2599d8aee80e4880f", size = 8678673 },
+    { url = "https://files.pythonhosted.org/packages/c7/57/51f2384575bdec454f4fe4e7a919d696c9ebce914590abf3e52d47607ab8/scikit_learn-1.8.0-cp314-cp314t-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:15fc3b5d19cc2be65404786857f2e13c70c83dd4782676dd6814e3b89dc8f5b9", size = 8922467 },
+    { url = "https://files.pythonhosted.org/packages/35/4d/748c9e2872637a57981a04adc038dacaa16ba8ca887b23e34953f0b3f742/scikit_learn-1.8.0-cp314-cp314t-win_amd64.whl", hash = "sha256:00d6f1d66fbcf4eba6e356e1420d33cc06c70a45bb1363cd6f6a8e4ebbbdece2", size = 8774395 },
+    { url = "https://files.pythonhosted.org/packages/60/22/d7b2ebe4704a5e50790ba089d5c2ae308ab6bb852719e6c3bd4f04c3a363/scikit_learn-1.8.0-cp314-cp314t-win_arm64.whl", hash = "sha256:f28dd15c6bb0b66ba09728cf09fd8736c304be29409bd8445a080c1280619e8c", size = 8002647 },
+]
+
+[[package]]
+name = "scipy"
+version = "1.17.1"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "numpy" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/7a/97/5a3609c4f8d58b039179648e62dd220f89864f56f7357f5d4f45c29eb2cc/scipy-1.17.1.tar.gz", hash = "sha256:95d8e012d8cb8816c226aef832200b1d45109ed4464303e997c5b13122b297c0", size = 30573822 }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/35/48/b992b488d6f299dbe3f11a20b24d3dda3d46f1a635ede1c46b5b17a7b163/scipy-1.17.1-cp312-cp312-macosx_10_14_x86_64.whl", hash = "sha256:35c3a56d2ef83efc372eaec584314bd0ef2e2f0d2adb21c55e6ad5b344c0dcb8", size = 31610954 },
+    { url = "https://files.pythonhosted.org/packages/b2/02/cf107b01494c19dc100f1d0b7ac3cc08666e96ba2d64db7626066cee895e/scipy-1.17.1-cp312-cp312-macosx_12_0_arm64.whl", hash = "sha256:fcb310ddb270a06114bb64bbe53c94926b943f5b7f0842194d585c65eb4edd76", size = 28172662 },
+    { url = "https://files.pythonhosted.org/packages/cf/a9/599c28631bad314d219cf9ffd40e985b24d603fc8a2f4ccc5ae8419a535b/scipy-1.17.1-cp312-cp312-macosx_14_0_arm64.whl", hash = "sha256:cc90d2e9c7e5c7f1a482c9875007c095c3194b1cfedca3c2f3291cdc2bc7c086", size = 20344366 },
+    { url = "https://files.pythonhosted.org/packages/35/f5/906eda513271c8deb5af284e5ef0206d17a96239af79f9fa0aebfe0e36b4/scipy-1.17.1-cp312-cp312-macosx_14_0_x86_64.whl", hash = "sha256:c80be5ede8f3f8eded4eff73cc99a25c388ce98e555b17d31da05287015ffa5b", size = 22704017 },
+    { url = "https://files.pythonhosted.org/packages/da/34/16f10e3042d2f1d6b66e0428308ab52224b6a23049cb2f5c1756f713815f/scipy-1.17.1-cp312-cp312-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:e19ebea31758fac5893a2ac360fedd00116cbb7628e650842a6691ba7ca28a21", size = 32927842 },
+    { url = "https://files.pythonhosted.org/packages/01/8e/1e35281b8ab6d5d72ebe9911edcdffa3f36b04ed9d51dec6dd140396e220/scipy-1.17.1-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:02ae3b274fde71c5e92ac4d54bc06c42d80e399fec704383dcd99b301df37458", size = 35235890 },
+    { url = "https://files.pythonhosted.org/packages/c5/5c/9d7f4c88bea6e0d5a4f1bc0506a53a00e9fcb198de372bfe4d3652cef482/scipy-1.17.1-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:8a604bae87c6195d8b1045eddece0514d041604b14f2727bbc2b3020172045eb", size = 35003557 },
+    { url = "https://files.pythonhosted.org/packages/65/94/7698add8f276dbab7a9de9fb6b0e02fc13ee61d51c7c3f85ac28b65e1239/scipy-1.17.1-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:f590cd684941912d10becc07325a3eeb77886fe981415660d9265c4c418d0bea", size = 37625856 },
+    { url = "https://files.pythonhosted.org/packages/a2/84/dc08d77fbf3d87d3ee27f6a0c6dcce1de5829a64f2eae85a0ecc1f0daa73/scipy-1.17.1-cp312-cp312-win_amd64.whl", hash = "sha256:41b71f4a3a4cab9d366cd9065b288efc4d4f3c0b37a91a8e0947fb5bd7f31d87", size = 36549682 },
+    { url = "https://files.pythonhosted.org/packages/bc/98/fe9ae9ffb3b54b62559f52dedaebe204b408db8109a8c66fdd04869e6424/scipy-1.17.1-cp312-cp312-win_arm64.whl", hash = "sha256:f4115102802df98b2b0db3cce5cb9b92572633a1197c77b7553e5203f284a5b3", size = 24547340 },
+    { url = "https://files.pythonhosted.org/packages/76/27/07ee1b57b65e92645f219b37148a7e7928b82e2b5dbeccecb4dff7c64f0b/scipy-1.17.1-cp313-cp313-macosx_10_14_x86_64.whl", hash = "sha256:5e3c5c011904115f88a39308379c17f91546f77c1667cea98739fe0fccea804c", size = 31590199 },
+    { url = "https://files.pythonhosted.org/packages/ec/ae/db19f8ab842e9b724bf5dbb7db29302a91f1e55bc4d04b1025d6d605a2c5/scipy-1.17.1-cp313-cp313-macosx_12_0_arm64.whl", hash = "sha256:6fac755ca3d2c3edcb22f479fceaa241704111414831ddd3bc6056e18516892f", size = 28154001 },
+    { url = "https://files.pythonhosted.org/packages/5b/58/3ce96251560107b381cbd6e8413c483bbb1228a6b919fa8652b0d4090e7f/scipy-1.17.1-cp313-cp313-macosx_14_0_arm64.whl", hash = "sha256:7ff200bf9d24f2e4d5dc6ee8c3ac64d739d3a89e2326ba68aaf6c4a2b838fd7d", size = 20325719 },
+    { url = "https://files.pythonhosted.org/packages/b2/83/15087d945e0e4d48ce2377498abf5ad171ae013232ae31d06f336e64c999/scipy-1.17.1-cp313-cp313-macosx_14_0_x86_64.whl", hash = "sha256:4b400bdc6f79fa02a4d86640310dde87a21fba0c979efff5248908c6f15fad1b", size = 22683595 },
+    { url = "https://files.pythonhosted.org/packages/b4/e0/e58fbde4a1a594c8be8114eb4aac1a55bcd6587047efc18a61eb1f5c0d30/scipy-1.17.1-cp313-cp313-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:2b64ca7d4aee0102a97f3ba22124052b4bd2152522355073580bf4845e2550b6", size = 32896429 },
+    { url = "https://files.pythonhosted.org/packages/f5/5f/f17563f28ff03c7b6799c50d01d5d856a1d55f2676f537ca8d28c7f627cd/scipy-1.17.1-cp313-cp313-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:581b2264fc0aa555f3f435a5944da7504ea3a065d7029ad60e7c3d1ae09c5464", size = 35203952 },
+    { url = "https://files.pythonhosted.org/packages/8d/a5/9afd17de24f657fdfe4df9a3f1ea049b39aef7c06000c13db1530d81ccca/scipy-1.17.1-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:beeda3d4ae615106d7094f7e7cef6218392e4465cc95d25f900bebabfded0950", size = 34979063 },
+    { url = "https://files.pythonhosted.org/packages/8b/13/88b1d2384b424bf7c924f2038c1c409f8d88bb2a8d49d097861dd64a57b2/scipy-1.17.1-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:6609bc224e9568f65064cfa72edc0f24ee6655b47575954ec6339534b2798369", size = 37598449 },
+    { url = "https://files.pythonhosted.org/packages/35/e5/d6d0e51fc888f692a35134336866341c08655d92614f492c6860dc45bb2c/scipy-1.17.1-cp313-cp313-win_amd64.whl", hash = "sha256:37425bc9175607b0268f493d79a292c39f9d001a357bebb6b88fdfaff13f6448", size = 36510943 },
+    { url = "https://files.pythonhosted.org/packages/2a/fd/3be73c564e2a01e690e19cc618811540ba5354c67c8680dce3281123fb79/scipy-1.17.1-cp313-cp313-win_arm64.whl", hash = "sha256:5cf36e801231b6a2059bf354720274b7558746f3b1a4efb43fcf557ccd484a87", size = 24545621 },
+    { url = "https://files.pythonhosted.org/packages/6f/6b/17787db8b8114933a66f9dcc479a8272e4b4da75fe03b0c282f7b0ade8cd/scipy-1.17.1-cp313-cp313t-macosx_10_14_x86_64.whl", hash = "sha256:d59c30000a16d8edc7e64152e30220bfbd724c9bbb08368c054e24c651314f0a", size = 31936708 },
+    { url = "https://files.pythonhosted.org/packages/38/2e/524405c2b6392765ab1e2b722a41d5da33dc5c7b7278184a8ad29b6cb206/scipy-1.17.1-cp313-cp313t-macosx_12_0_arm64.whl", hash = "sha256:010f4333c96c9bb1a4516269e33cb5917b08ef2166d5556ca2fd9f082a9e6ea0", size = 28570135 },
+    { url = "https://files.pythonhosted.org/packages/fd/c3/5bd7199f4ea8556c0c8e39f04ccb014ac37d1468e6cfa6a95c6b3562b76e/scipy-1.17.1-cp313-cp313t-macosx_14_0_arm64.whl", hash = "sha256:2ceb2d3e01c5f1d83c4189737a42d9cb2fc38a6eeed225e7515eef71ad301dce", size = 20741977 },
+    { url = "https://files.pythonhosted.org/packages/d9/b8/8ccd9b766ad14c78386599708eb745f6b44f08400a5fd0ade7cf89b6fc93/scipy-1.17.1-cp313-cp313t-macosx_14_0_x86_64.whl", hash = "sha256:844e165636711ef41f80b4103ed234181646b98a53c8f05da12ca5ca289134f6", size = 23029601 },
+    { url = "https://files.pythonhosted.org/packages/6d/a0/3cb6f4d2fb3e17428ad2880333cac878909ad1a89f678527b5328b93c1d4/scipy-1.17.1-cp313-cp313t-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:158dd96d2207e21c966063e1635b1063cd7787b627b6f07305315dd73d9c679e", size = 33019667 },
+    { url = "https://files.pythonhosted.org/packages/f3/c3/2d834a5ac7bf3a0c806ad1508efc02dda3c8c61472a56132d7894c312dea/scipy-1.17.1-cp313-cp313t-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:74cbb80d93260fe2ffa334efa24cb8f2f0f622a9b9febf8b483c0b865bfb3475", size = 35264159 },
+    { url = "https://files.pythonhosted.org/packages/4d/77/d3ed4becfdbd217c52062fafe35a72388d1bd82c2d0ba5ca19d6fcc93e11/scipy-1.17.1-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:dbc12c9f3d185f5c737d801da555fb74b3dcfa1a50b66a1a93e09190f41fab50", size = 35102771 },
+    { url = "https://files.pythonhosted.org/packages/bd/12/d19da97efde68ca1ee5538bb261d5d2c062f0c055575128f11a2730e3ac1/scipy-1.17.1-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:94055a11dfebe37c656e70317e1996dc197e1a15bbcc351bcdd4610e128fe1ca", size = 37665910 },
+    { url = "https://files.pythonhosted.org/packages/06/1c/1172a88d507a4baaf72c5a09bb6c018fe2ae0ab622e5830b703a46cc9e44/scipy-1.17.1-cp313-cp313t-win_amd64.whl", hash = "sha256:e30bdeaa5deed6bc27b4cc490823cd0347d7dae09119b8803ae576ea0ce52e4c", size = 36562980 },
+    { url = "https://files.pythonhosted.org/packages/70/b0/eb757336e5a76dfa7911f63252e3b7d1de00935d7705cf772db5b45ec238/scipy-1.17.1-cp313-cp313t-win_arm64.whl", hash = "sha256:a720477885a9d2411f94a93d16f9d89bad0f28ca23c3f8daa521e2dcc3f44d49", size = 24856543 },
+    { url = "https://files.pythonhosted.org/packages/cf/83/333afb452af6f0fd70414dc04f898647ee1423979ce02efa75c3b0f2c28e/scipy-1.17.1-cp314-cp314-macosx_10_14_x86_64.whl", hash = "sha256:a48a72c77a310327f6a3a920092fa2b8fd03d7deaa60f093038f22d98e096717", size = 31584510 },
+    { url = "https://files.pythonhosted.org/packages/ed/a6/d05a85fd51daeb2e4ea71d102f15b34fedca8e931af02594193ae4fd25f7/scipy-1.17.1-cp314-cp314-macosx_12_0_arm64.whl", hash = "sha256:45abad819184f07240d8a696117a7aacd39787af9e0b719d00285549ed19a1e9", size = 28170131 },
+    { url = "https://files.pythonhosted.org/packages/db/7b/8624a203326675d7746a254083a187398090a179335b2e4a20e2ddc46e83/scipy-1.17.1-cp314-cp314-macosx_14_0_arm64.whl", hash = "sha256:3fd1fcdab3ea951b610dc4cef356d416d5802991e7e32b5254828d342f7b7e0b", size = 20342032 },
+    { url = "https://files.pythonhosted.org/packages/c9/35/2c342897c00775d688d8ff3987aced3426858fd89d5a0e26e020b660b301/scipy-1.17.1-cp314-cp314-macosx_14_0_x86_64.whl", hash = "sha256:7bdf2da170b67fdf10bca777614b1c7d96ae3ca5794fd9587dce41eb2966e866", size = 22678766 },
+    { url = "https://files.pythonhosted.org/packages/ef/f2/7cdb8eb308a1a6ae1e19f945913c82c23c0c442a462a46480ce487fdc0ac/scipy-1.17.1-cp314-cp314-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:adb2642e060a6549c343603a3851ba76ef0b74cc8c079a9a58121c7ec9fe2350", size = 32957007 },
+    { url = "https://files.pythonhosted.org/packages/0b/2e/7eea398450457ecb54e18e9d10110993fa65561c4f3add5e8eccd2b9cd41/scipy-1.17.1-cp314-cp314-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:eee2cfda04c00a857206a4330f0c5e3e56535494e30ca445eb19ec624ae75118", size = 35221333 },
+    { url = "https://files.pythonhosted.org/packages/d9/77/5b8509d03b77f093a0d52e606d3c4f79e8b06d1d38c441dacb1e26cacf46/scipy-1.17.1-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:d2650c1fb97e184d12d8ba010493ee7b322864f7d3d00d3f9bb97d9c21de4068", size = 35042066 },
+    { url = "https://files.pythonhosted.org/packages/f9/df/18f80fb99df40b4070328d5ae5c596f2f00fffb50167e31439e932f29e7d/scipy-1.17.1-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:08b900519463543aa604a06bec02461558a6e1cef8fdbb8098f77a48a83c8118", size = 37612763 },
+    { url = "https://files.pythonhosted.org/packages/4b/39/f0e8ea762a764a9dc52aa7dabcfad51a354819de1f0d4652b6a1122424d6/scipy-1.17.1-cp314-cp314-win_amd64.whl", hash = "sha256:3877ac408e14da24a6196de0ddcace62092bfc12a83823e92e49e40747e52c19", size = 37290984 },
+    { url = "https://files.pythonhosted.org/packages/7c/56/fe201e3b0f93d1a8bcf75d3379affd228a63d7e2d80ab45467a74b494947/scipy-1.17.1-cp314-cp314-win_arm64.whl", hash = "sha256:f8885db0bc2bffa59d5c1b72fad7a6a92d3e80e7257f967dd81abb553a90d293", size = 25192877 },
+    { url = "https://files.pythonhosted.org/packages/96/ad/f8c414e121f82e02d76f310f16db9899c4fcde36710329502a6b2a3c0392/scipy-1.17.1-cp314-cp314t-macosx_10_14_x86_64.whl", hash = "sha256:1cc682cea2ae55524432f3cdff9e9a3be743d52a7443d0cba9017c23c87ae2f6", size = 31949750 },
+    { url = "https://files.pythonhosted.org/packages/7c/b0/c741e8865d61b67c81e255f4f0a832846c064e426636cd7de84e74d209be/scipy-1.17.1-cp314-cp314t-macosx_12_0_arm64.whl", hash = "sha256:2040ad4d1795a0ae89bfc7e8429677f365d45aa9fd5e4587cf1ea737f927b4a1", size = 28585858 },
+    { url = "https://files.pythonhosted.org/packages/ed/1b/3985219c6177866628fa7c2595bfd23f193ceebbe472c98a08824b9466ff/scipy-1.17.1-cp314-cp314t-macosx_14_0_arm64.whl", hash = "sha256:131f5aaea57602008f9822e2115029b55d4b5f7c070287699fe45c661d051e39", size = 20757723 },
+    { url = "https://files.pythonhosted.org/packages/c0/19/2a04aa25050d656d6f7b9e7b685cc83d6957fb101665bfd9369ca6534563/scipy-1.17.1-cp314-cp314t-macosx_14_0_x86_64.whl", hash = "sha256:9cdc1a2fcfd5c52cfb3045feb399f7b3ce822abdde3a193a6b9a60b3cb5854ca", size = 23043098 },
+    { url = "https://files.pythonhosted.org/packages/86/f1/3383beb9b5d0dbddd030335bf8a8b32d4317185efe495374f134d8be6cce/scipy-1.17.1-cp314-cp314t-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:6e3dcd57ab780c741fde8dc68619de988b966db759a3c3152e8e9142c26295ad", size = 33030397 },
+    { url = "https://files.pythonhosted.org/packages/41/68/8f21e8a65a5a03f25a79165ec9d2b28c00e66dc80546cf5eb803aeeff35b/scipy-1.17.1-cp314-cp314t-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:a9956e4d4f4a301ebf6cde39850333a6b6110799d470dbbb1e25326ac447f52a", size = 35281163 },
+    { url = "https://files.pythonhosted.org/packages/84/8d/c8a5e19479554007a5632ed7529e665c315ae7492b4f946b0deb39870e39/scipy-1.17.1-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:a4328d245944d09fd639771de275701ccadf5f781ba0ff092ad141e017eccda4", size = 35116291 },
+    { url = "https://files.pythonhosted.org/packages/52/52/e57eceff0e342a1f50e274264ed47497b59e6a4e3118808ee58ddda7b74a/scipy-1.17.1-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:a77cbd07b940d326d39a1d1b37817e2ee4d79cb30e7338f3d0cddffae70fcaa2", size = 37682317 },
+    { url = "https://files.pythonhosted.org/packages/11/2f/b29eafe4a3fbc3d6de9662b36e028d5f039e72d345e05c250e121a230dd4/scipy-1.17.1-cp314-cp314t-win_amd64.whl", hash = "sha256:eb092099205ef62cd1782b006658db09e2fed75bffcae7cc0d44052d8aa0f484", size = 37345327 },
+    { url = "https://files.pythonhosted.org/packages/07/39/338d9219c4e87f3e708f18857ecd24d22a0c3094752393319553096b98af/scipy-1.17.1-cp314-cp314t-win_arm64.whl", hash = "sha256:200e1050faffacc162be6a486a984a0497866ec54149a01270adc8a59b7c7d21", size = 25489165 },
+]
+
+[[package]]
+name = "shellingham"
+version = "1.5.4"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/58/15/8b3609fd3830ef7b27b655beb4b4e9c62313a4e8da8c676e142cc210d58e/shellingham-1.5.4.tar.gz", hash = "sha256:8dbca0739d487e5bd35ab3ca4b36e11c4078f3a234bfce294b0a0291363404de", size = 10310 }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/e0/f9/0595336914c5619e5f28a1fb793285925a8cd4b432c9da0a987836c7f822/shellingham-1.5.4-py2.py3-none-any.whl", hash = "sha256:7ecfff8f2fd72616f7481040475a65b2bf8af90a56c89140852d1120324e8686", size = 9755 },
+]
+
+[[package]]
+name = "six"
+version = "1.17.0"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/94/e7/b2c673351809dca68a0e064b6af791aa332cf192da575fd474ed7d6f16a2/six-1.17.0.tar.gz", hash = "sha256:ff70335d468e7eb6ec65b95b99d3a2836546063f63acc5171de367e834932a81", size = 34031 }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/b7/ce/149a00dd41f10bc29e5921b496af8b574d8413afcd5e30dfa0ed46c2cc5e/six-1.17.0-py2.py3-none-any.whl", hash = "sha256:4721f391ed90541fddacab5acf947aa0d3dc7d27b2e1e8eda2be8970586c3274", size = 11050 },
+]
+
+[[package]]
+name = "surfsense-evals"
+version = "0.1.0"
+source = { editable = "." }
+dependencies = [
+    { name = "datasets" },
+    { name = "httpx" },
+    { name = "httpx-sse" },
+    { name = "huggingface-hub" },
+    { name = "numpy" },
+    { name = "pillow" },
+    { name = "pyarrow" },
+    { name = "pydantic" },
+    { name = "python-dotenv" },
+    { name = "reportlab" },
+    { name = "rich" },
+    { name = "scikit-learn" },
+    { name = "scipy" },
+    { name = "tqdm" },
+]
+
+[package.optional-dependencies]
+dev = [
+    { name = "pytest" },
+    { name = "pytest-asyncio" },
+    { name = "respx" },
+    { name = "ruff" },
+]
+
+[package.metadata]
+requires-dist = [
+    { name = "datasets", specifier = ">=2.21.0" },
+    { name = "httpx", specifier = ">=0.27.0" },
+    { name = "httpx-sse", specifier = ">=0.4.0" },
+    { name = "huggingface-hub", specifier = ">=0.24.0" },
+    { name = "numpy", specifier = ">=1.26.0" },
+    { name = "pillow", specifier = ">=10.0.0" },
+    { name = "pyarrow", specifier = ">=15.0.0" },
+    { name = "pydantic", specifier = ">=2.6.0" },
+    { name = "pytest", marker = "extra == 'dev'", specifier = ">=8.0.0" },
+    { name = "pytest-asyncio", marker = "extra == 'dev'", specifier = ">=0.23.0" },
+    { name = "python-dotenv", specifier = ">=1.0.0" },
+    { name = "reportlab", specifier = ">=4.0.0" },
+    { name = "respx", marker = "extra == 'dev'", specifier = ">=0.21.0" },
+    { name = "rich", specifier = ">=13.7.0" },
+    { name = "ruff", marker = "extra == 'dev'", specifier = ">=0.5.0" },
+    { name = "scikit-learn", specifier = ">=1.4.0" },
+    { name = "scipy", specifier = ">=1.12.0" },
+    { name = "tqdm", specifier = ">=4.66.0" },
+]
+provides-extras = ["dev"]
+
+[[package]]
+name = "threadpoolctl"
+version = "3.6.0"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/b7/4d/08c89e34946fce2aec4fbb45c9016efd5f4d7f24af8e5d93296e935631d8/threadpoolctl-3.6.0.tar.gz", hash = "sha256:8ab8b4aa3491d812b623328249fab5302a68d2d71745c8a4c719a2fcaba9f44e", size = 21274 }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/32/d5/f9a850d79b0851d1d4ef6456097579a9005b31fea68726a4ae5f2d82ddd9/threadpoolctl-3.6.0-py3-none-any.whl", hash = "sha256:43a0b8fd5a2928500110039e43a5eed8480b918967083ea48dc3ab9f13c4a7fb", size = 18638 },
+]
+
+[[package]]
+name = "tqdm"
+version = "4.67.3"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "colorama", marker = "sys_platform == 'win32'" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/09/a9/6ba95a270c6f1fbcd8dac228323f2777d886cb206987444e4bce66338dd4/tqdm-4.67.3.tar.gz", hash = "sha256:7d825f03f89244ef73f1d4ce193cb1774a8179fd96f31d7e1dcde62092b960bb", size = 169598 }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/16/e1/3079a9ff9b8e11b846c6ac5c8b5bfb7ff225eee721825310c91b3b50304f/tqdm-4.67.3-py3-none-any.whl", hash = "sha256:ee1e4c0e59148062281c49d80b25b67771a127c85fc9676d3be5f243206826bf", size = 78374 },
+]
+
+[[package]]
+name = "typer"
+version = "0.25.1"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "annotated-doc" },
+    { name = "click" },
+    { name = "rich" },
+    { name = "shellingham" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/e4/51/9aed62104cea109b820bbd6c14245af756112017d309da813ef107d42e7e/typer-0.25.1.tar.gz", hash = "sha256:9616eb8853a09ffeabab1698952f33c6f29ffdbceb4eaeecf571880e8d7664cc", size = 122276 }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/3f/f9/2b3ff4e56e5fa7debfaf9eb135d0da96f3e9a1d5b27222223c7296336e5f/typer-0.25.1-py3-none-any.whl", hash = "sha256:75caa44ed46a03fb2dab8808753ffacdbfea88495e74c85a28c5eefcf5f39c89", size = 58409 },
+]
+
+[[package]]
+name = "typing-extensions"
+version = "4.15.0"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/72/94/1a15dd82efb362ac84269196e94cf00f187f7ed21c242792a923cdb1c61f/typing_extensions-4.15.0.tar.gz", hash = "sha256:0cea48d173cc12fa28ecabc3b837ea3cf6f38c6d1136f85cbaaf598984861466", size = 109391 }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/18/67/36e9267722cc04a6b9f15c7f3441c2363321a3ea07da7ae0c0707beb2a9c/typing_extensions-4.15.0-py3-none-any.whl", hash = "sha256:f0fa19c6845758ab08074a0cfa8b7aecb71c999ca73d62883bc25cc018c4e548", size = 44614 },
+]
+
+[[package]]
+name = "typing-inspection"
+version = "0.4.2"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "typing-extensions" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/55/e3/70399cb7dd41c10ac53367ae42139cf4b1ca5f36bb3dc6c9d33acdb43655/typing_inspection-0.4.2.tar.gz", hash = "sha256:ba561c48a67c5958007083d386c3295464928b01faa735ab8547c5692e87f464", size = 75949 }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/dc/9b/47798a6c91d8bdb567fe2698fe81e0c6b7cb7ef4d13da4114b41d239f65d/typing_inspection-0.4.2-py3-none-any.whl", hash = "sha256:4ed1cacbdc298c220f1bd249ed5287caa16f34d44ef4e9c3d0cbad5b521545e7", size = 14611 },
+]
+
+[[package]]
+name = "tzdata"
+version = "2026.2"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/ba/19/1b9b0e29f30c6d35cb345486df41110984ea67ae69dddbc0e8a100999493/tzdata-2026.2.tar.gz", hash = "sha256:9173fde7d80d9018e02a662e168e5a2d04f87c41ea174b139fbef642eda62d10", size = 198254 }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/ce/e4/dccd7f47c4b64213ac01ef921a1337ee6e30e8c6466046018326977efd95/tzdata-2026.2-py2.py3-none-any.whl", hash = "sha256:bbe9af844f658da81a5f95019480da3a89415801f6cc966806612cc7169bffe7", size = 349321 },
+]
+
+[[package]]
+name = "urllib3"
+version = "2.7.0"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/53/0c/06f8b233b8fd13b9e5ee11424ef85419ba0d8ba0b3138bf360be2ff56953/urllib3-2.7.0.tar.gz", hash = "sha256:231e0ec3b63ceb14667c67be60f2f2c40a518cb38b03af60abc813da26505f4c", size = 433602 }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/7f/3e/5db95bcf282c52709639744ca2a8b149baccf648e39c8cc87553df9eae0c/urllib3-2.7.0-py3-none-any.whl", hash = "sha256:9fb4c81ebbb1ce9531cce37674bbc6f1360472bc18ca9a553ede278ef7276897", size = 131087 },
+]
+
+[[package]]
+name = "xxhash"
+version = "3.7.0"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/24/2f/e183a1b407002f5af81822bee18b61cdb94b8670208ef34734d8d2b8ebe9/xxhash-3.7.0.tar.gz", hash = "sha256:6cc4eefbb542a5d6ffd6d70ea9c502957c925e800f998c5630ecc809d6702bae", size = 82022 }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/f2/8a/51a14cdef4728c6c2337db8a7d8704422cc65676d9199d77215464c880af/xxhash-3.7.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:082c87bfdd2b9f457606c7a4a53457f4c4b48b0cdc48de0277f4349d79bb3d7a", size = 33357 },
+    { url = "https://files.pythonhosted.org/packages/b9/1b/0c2c933809421ffd9bf42b59315552c143c755db5d9a816b2f1ae273e884/xxhash-3.7.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:5e7ce913b61f35b0c1c839a49ac9c8e75dd8d860150688aed353b0ce1bf409d8", size = 30869 },
+    { url = "https://files.pythonhosted.org/packages/03/a8/89d5fdd6ee12d70ba99451de46dd0e8010167468dcd913ec855653f4dd50/xxhash-3.7.0-cp312-cp312-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:3beb1de3b1e9694fcdd853e570ee64c631c7062435d2f8c69c1adf809bc086f0", size = 194100 },
+    { url = "https://files.pythonhosted.org/packages/87/ee/2f9f2ed993e77206d1e66991290a1ebe22e843351ca3ebec8e49e01ba186/xxhash-3.7.0-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:f3e7b689c3bce16699efcf736066f5c6cc4472c3840fe4b22bd8279daf4abdac", size = 212977 },
+    { url = "https://files.pythonhosted.org/packages/de/60/5a91644615a9e9d4e42c2e9925f1908e3a24e4e691d9de7340d565bea024/xxhash-3.7.0-cp312-cp312-manylinux2014_armv7l.manylinux_2_17_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:a6545e6b409e3d5cbafc850fb84c55a1ca26ed15a6b11e3bf07a0e0cd84517c8", size = 236373 },
+    { url = "https://files.pythonhosted.org/packages/22/c0/f3a9384eaaed9d14d4d062a5d953aa0da489bfe9747877aa994caa87cd0b/xxhash-3.7.0-cp312-cp312-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:31ab1461c77a11461d703c88eb949e132a1c6515933cf675d97ec680f4bd18de", size = 212229 },
+    { url = "https://files.pythonhosted.org/packages/2e/67/02f07a9fd79726804190f2172c4894c3ed9a4ebccaca05653c84beb58025/xxhash-3.7.0-cp312-cp312-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:7c4d596b7676f811172687ec567cbafb9e4dea2f9be1bbb4f622410cb7f40f40", size = 445462 },
+    { url = "https://files.pythonhosted.org/packages/40/37/558f5a90c0672fc9b4402dc25d87ac5b7406616e8969430c9ca4e52ee74d/xxhash-3.7.0-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:13805f0461cba0a857924e70ff91ae6d52d2598f79a884e788db80532614a4a1", size = 193932 },
+    { url = "https://files.pythonhosted.org/packages/d5/90/aaa09cd58661d32044dbbad7df55bbe22a623032b810e7ed3b8c569a2a6f/xxhash-3.7.0-cp312-cp312-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:1d398f372496152f1c6933a33566373f8d1b37b98b8c9d608fa6edc0976f23b2", size = 284807 },
+    { url = "https://files.pythonhosted.org/packages/d6/f3/53df3719ab127a02c174f0c1c74924fcd110866e89c966bc7909cfa8fa84/xxhash-3.7.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:d610aa62cdb7d4d497740741772a24a794903bf3e79eaa51d2e800082abe11e5", size = 210445 },
+    { url = "https://files.pythonhosted.org/packages/72/33/d219975c0e8b6fa2eb9ccd486fe47e21bf1847985b878dd2fbc3126e0d5c/xxhash-3.7.0-cp312-cp312-musllinux_1_2_armv7l.whl", hash = "sha256:073c23900a9fbf3d26616c17c830db28af9803677cd5b33aea3224d824111514", size = 241273 },
+    { url = "https://files.pythonhosted.org/packages/3e/50/49b1afe610eb3964cedcb90a4d4c3d46a261ee8669cbd4f060652619ae3c/xxhash-3.7.0-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:418a463c3e6a590c0cdc890f8be19adb44a8c8acd175ca5b2a6de77e61d0b386", size = 197950 },
+    { url = "https://files.pythonhosted.org/packages/c6/75/5f42a1a4c78717d906a4b6a140c6dbf837ab1f547a54d23c4e2903310936/xxhash-3.7.0-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:03f8ff4474ee61c845758ce00711d7087a770d77efb36f7e74a6e867301000b8", size = 210709 },
+    { url = "https://files.pythonhosted.org/packages/8a/85/237e446c25abced71e9c53d269f2cef5bab8a82b3f88a12e00c5368e7368/xxhash-3.7.0-cp312-cp312-musllinux_1_2_riscv64.whl", hash = "sha256:44fba4a5f1d179b7ddc7b3dc40f56f9209046421679b57025d4d8821b376fd8d", size = 275345 },
+    { url = "https://files.pythonhosted.org/packages/62/34/c2c26c0a6a9cc739bc2a5f0ae03ba8b87deb12b8bce35f7ac495e790dc6d/xxhash-3.7.0-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:31e3516a0f829d06ded4a2c0f3c7c5561993256bfa1c493975fb9dc7bfa828a1", size = 414056 },
+    { url = "https://files.pythonhosted.org/packages/a0/aa/5c58e9bc8071b8afd8dcf297ff362f723c4892168faba149f19904132bf4/xxhash-3.7.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:b59ee2ac81de57771a09ecad09191e840a1d2fae1ef684208320591055768f83", size = 191485 },
+    { url = "https://files.pythonhosted.org/packages/d4/69/a929cf9d1e2e65a48b818cdce72cb6b69eab2e6877f21436d0a1942aff43/xxhash-3.7.0-cp312-cp312-win32.whl", hash = "sha256:74bbd92f8c7fcc397ba0a11bfdc106bc72ad7f11e3a60277753f87e7532b4d81", size = 30671 },
+    { url = "https://files.pythonhosted.org/packages/b9/1b/104b41a8947f4e1d4a66ce1e628eea752f37d1890bfd7453559ca7a3d950/xxhash-3.7.0-cp312-cp312-win_amd64.whl", hash = "sha256:7bd7bc82dd4f185f28f35193c2e968ef46131628e3cac62f639dadf321cba4d1", size = 31514 },
+    { url = "https://files.pythonhosted.org/packages/98/a0/1fd0ea1f1b886d9e7c73f0397571e22333a7d79e31da6d7127c2a4a71d75/xxhash-3.7.0-cp312-cp312-win_arm64.whl", hash = "sha256:7d7148180ec99ba36585b42c8c5de25e9b40191613bc4be68909b4d25a77a852", size = 27761 },
+    { url = "https://files.pythonhosted.org/packages/c1/ca/d5174b4c36d10f64d4ca7050563138c5a599efb01a765858ddefc9c1202a/xxhash-3.7.0-cp313-cp313-android_21_arm64_v8a.whl", hash = "sha256:4b6d6b33f141158692bd4eafbb96edbc5aa0dabdb593a962db01a91983d4f8fa", size = 36813 },
+    { url = "https://files.pythonhosted.org/packages/41/d0/abc6c9d347ba1f1e1e1d98125d0881a0452c7f9a76a9dd03a7b5d2197f23/xxhash-3.7.0-cp313-cp313-android_21_x86_64.whl", hash = "sha256:845d347df254d6c619f616afa921331bada8614b8d373d58725c663ba97c3605", size = 35121 },
+    { url = "https://files.pythonhosted.org/packages/bf/11/4cc834eb3d79f2f2b3a6ef7324195208bcdfbdcf7534d2b17267aa5f3a8f/xxhash-3.7.0-cp313-cp313-ios_13_0_arm64_iphoneos.whl", hash = "sha256:fddbbb69a6fff4f421e7a0d1fa28f894b20112e9e3fab306af451e2dfd0e459b", size = 29624 },
+    { url = "https://files.pythonhosted.org/packages/23/83/e97d3e7b635fe73a1dfb1e91f805324dd6d930bb42041cbf18f183bc0b6d/xxhash-3.7.0-cp313-cp313-ios_13_0_arm64_iphonesimulator.whl", hash = "sha256:54876a4e45101cec2bf8f31a973cda073a23e2e108538dad224ba07f85f22487", size = 30638 },
+    { url = "https://files.pythonhosted.org/packages/f4/40/d84951d80c35db1f4c40a29a64a8520eea5d56e764c603906b4fe763580f/xxhash-3.7.0-cp313-cp313-ios_13_0_x86_64_iphonesimulator.whl", hash = "sha256:0c72fe9c7e3d6dfd7f1e21e224a877917fa09c465694ba4e06464b9511b65544", size = 33323 },
+    { url = "https://files.pythonhosted.org/packages/89/cc/c7dc6558d97e9ab023f663d69ab28b340ed9bf4d2d94f2c259cf896bb354/xxhash-3.7.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:a6d73a830b17ef49bc04e00182bd839164c1b3c59c127cd7c54fcb10c7ed8ee8", size = 33362 },
+    { url = "https://files.pythonhosted.org/packages/2a/6e/46b84017b1301d54091430353d4ad5901654a3e0871649877a416f7f1644/xxhash-3.7.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:91c3b07cf3362086d8f126c6aecd8e5e9396ad8b2f2219ea7e49a8250c318acd", size = 30874 },
+    { url = "https://files.pythonhosted.org/packages/df/5e/8f9158e3ab906ad3fec51e09b5ea0093e769f12207bfa42a368ca204e7ab/xxhash-3.7.0-cp313-cp313-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:50e879ebbac351c81565ca108db766d7832f5b8b6a5b14b8c0151f7190028e3d", size = 194185 },
+    { url = "https://files.pythonhosted.org/packages/f3/29/a804ded9f5d3d3758292678d23e7528b08fda7b7e750688d08b052322475/xxhash-3.7.0-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:921c14e93817842dd0dd9f372890a0f0c72e534650b6ab13c5be5cd0db11d47e", size = 213033 },
+    { url = "https://files.pythonhosted.org/packages/8b/91/1ce5a7d2fdc975267320e2c78fc1cecfe7ab735ccbcf6993ec5dd541cb2c/xxhash-3.7.0-cp313-cp313-manylinux2014_armv7l.manylinux_2_17_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:e64a7c9d7dfca3e0fafcbc5e455519090706a3e36e95d655cec3e04e79f95aaa", size = 236140 },
+    { url = "https://files.pythonhosted.org/packages/34/04/fd595a4fd8617b05fa27bd9b684ecb4985bfed27917848eea85d54036d06/xxhash-3.7.0-cp313-cp313-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:2220af08163baf5fa36c2b8af079dc2cbe6e66ae061385267f9472362dfd53c6", size = 212291 },
+    { url = "https://files.pythonhosted.org/packages/03/fb/f1a379cbc372ae5b9f4ab36154c48a849ca6ebe3ac477067a57865bf3bc6/xxhash-3.7.0-cp313-cp313-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:f14bb8b22a4a91325813e3d553b8963c10cf8c756cff65ee50c194431296c655", size = 445532 },
+    { url = "https://files.pythonhosted.org/packages/65/59/172424b79f8cfd4b6d8a122b2193e6b8ad4b11f7159bb3b6f9b3191329bb/xxhash-3.7.0-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:496736f86a9bedaf64b0dc70e3539d0766df01c71ea22032698e88f3f04a1ce9", size = 193990 },
+    { url = "https://files.pythonhosted.org/packages/b9/19/aeac22161d953f139f07ba5586cb4a17c5b7b6dff985122803bb12933500/xxhash-3.7.0-cp313-cp313-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:0ff71596bd79816975b3de7130ab1ff4541410285a3c084584eeb1c8239996fd", size = 284876 },
+    { url = "https://files.pythonhosted.org/packages/77/d5/4fd0b59e7a02242953da05ff679fbb961b0a4368eac97a217e11dae110c1/xxhash-3.7.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:1ad86695c19b1d46fe106925db3c7a37f16be37669dcf58dcc70a9dd6e324676", size = 210495 },
+    { url = "https://files.pythonhosted.org/packages/aa/fb/976a3165c728c7faf74aa1b5ab3cf6a85e6d731612894741840524c7d28c/xxhash-3.7.0-cp313-cp313-musllinux_1_2_armv7l.whl", hash = "sha256:970f9f8c50961d639cbd0d988c96f80ddf66006de93641719282c4fe7a87c5e6", size = 241331 },
+    { url = "https://files.pythonhosted.org/packages/4a/2c/6763d5901d53ac9e6ba296e5717ae599025c9d268396e8faa8b4b0a8e0ac/xxhash-3.7.0-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:5886ad85e9e347911783760a1d16cb6b393e8f9e3b52c982568226cb56927bdc", size = 198037 },
+    { url = "https://files.pythonhosted.org/packages/61/2b/876e722d533833f5f9a83473e6ba993e48745701096944e77bbecf29b2c3/xxhash-3.7.0-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:6e934bbae1e0ec74e27d5f0d7f37ef547ce5ff9f0a7e63fb39e559fc99526734", size = 210744 },
+    { url = "https://files.pythonhosted.org/packages/21/e6/d7e7baef7ce24166b4668d3c48557bb35a23b92ecadcac7e7718d099ab69/xxhash-3.7.0-cp313-cp313-musllinux_1_2_riscv64.whl", hash = "sha256:3b6b3d28228af044ebcded71c4a3dd86e1dbd7e2f4645bf40f7b5da65bb5fb5a", size = 275406 },
+    { url = "https://files.pythonhosted.org/packages/92/fe/198b3763b2e01ca908f2154969a2352ec99bda892b574a11a9a151c5ede4/xxhash-3.7.0-cp313-cp313-musllinux_1_2_s390x.whl", hash = "sha256:6be4d70d9ab76c9f324ead9c01af6ff52c324745ea0c3731682a0cf99720f1fe", size = 414125 },
+    { url = "https://files.pythonhosted.org/packages/3a/6d/019a11affd5a5499137cacca53808659964785439855b5aa40dfd3412916/xxhash-3.7.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:151d7520838d4465461a0b7f4ae488b3b00de16183dd3214c1a6b14bf89d7fb6", size = 191555 },
+    { url = "https://files.pythonhosted.org/packages/76/21/b96d58568df2d01533244c3e0e5cbdd0c8b2b25c4bec4d72f19259a292d7/xxhash-3.7.0-cp313-cp313-win32.whl", hash = "sha256:d798c1e291bffb8e37b5bbe0dda77fc767cd19e89cadaf66e6ed5d0ff88c9fe6", size = 30668 },
+    { url = "https://files.pythonhosted.org/packages/99/57/d849a8d3afa1f8f4bc6a831cd89f49f9706fbbad94d2975d6140a171988c/xxhash-3.7.0-cp313-cp313-win_amd64.whl", hash = "sha256:875811ba23c543b1a1c3143c926e43996eb27ebb8f52d3500744aa608c275aed", size = 31524 },
+    { url = "https://files.pythonhosted.org/packages/81/52/bacc753e92dee78b058af8dcef0a50815f5f860986c664a92d75f965b6a5/xxhash-3.7.0-cp313-cp313-win_arm64.whl", hash = "sha256:54a675cb300dda83d71daae2a599389d22db8021a0f8db0dd659e14626eb3ecc", size = 27768 },
+    { url = "https://files.pythonhosted.org/packages/1c/47/ddbd683b7fc7e592c1a8d9d65f73ce9ab513f082b3967eee2baf549b8fc6/xxhash-3.7.0-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:a3b19a42111c4057c1547a4a1396a53961dca576a0f6b82bfa88a2d1561764b2", size = 33576 },
+    { url = "https://files.pythonhosted.org/packages/07/f2/36d3310161db7f72efb4562aadde0ed429f1d0531782dd6345b12d2da527/xxhash-3.7.0-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:8f4608a06e4d61b7a3425665a46d00e0579122e1a2fae97a0c52953a3aad9aa3", size = 31123 },
+    { url = "https://files.pythonhosted.org/packages/0d/3f/75937a5c69556ed213021e43cbedd84c8e0279d0d74e7d41a255d84ba4b1/xxhash-3.7.0-cp313-cp313t-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:ad37c7792479e49cf96c1ab25517d7003fe0d93687a772ba19a097d235bbe41e", size = 196491 },
+    { url = "https://files.pythonhosted.org/packages/22/29/f10d7ff8c7a733d4403a43b9de18c8fabc005f98cec054644f04418659ee/xxhash-3.7.0-cp313-cp313t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:dc026e3b89d98e30a8288c95cb696e77d150b3f0fb7a51f73dcd49ee6b5577fa", size = 215793 },
+    { url = "https://files.pythonhosted.org/packages/8b/fd/778f60aa295f58907938f030a8b514611f391405614a525cccd2ffc00eb5/xxhash-3.7.0-cp313-cp313t-manylinux2014_armv7l.manylinux_2_17_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:c9b31ab1f28b078a6a1ac1a54eb35e7d5390deddd56870d0be3a0a733d1c321c", size = 237993 },
+    { url = "https://files.pythonhosted.org/packages/70/f5/736db5de387b4a540e37a05b84b40dc58a1ce974bfd2b4e5754ce29b68c3/xxhash-3.7.0-cp313-cp313t-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:3bb5fd680c038fd5229e44e9c493782f90df9bef632fd0499d442374688ff70b", size = 214887 },
+    { url = "https://files.pythonhosted.org/packages/4d/aa/09a095f22fdb9a27fbb716841fbff52119721f9ca4261952d07a912f7839/xxhash-3.7.0-cp313-cp313t-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:030c0fd688fce3569fbb49a2feefd4110cbb0b650186fb4610759ecfac677548", size = 448407 },
+    { url = "https://files.pythonhosted.org/packages/74/8a/b745efeeca9e34a91c26fdc97ad8514c43d5a81ac78565cba80a1353870a/xxhash-3.7.0-cp313-cp313t-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:5b1bde10324f4c31812ae0d0502e92d916ae8917cad7209353f122b8b8f610c3", size = 196119 },
+    { url = "https://files.pythonhosted.org/packages/8a/5c/0cfceb024af90c191f665c7933b1f318ee234f4797858383bebd1881d52f/xxhash-3.7.0-cp313-cp313t-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:503722d52a615f2604f5e7611de7d43878df010dc0053094ef91cb9a9ac3d987", size = 286751 },
+    { url = "https://files.pythonhosted.org/packages/0b/0a/0793e405dc3cf8f4ebe2c1acec1e4e4608cd9e7e50ea691dabbc2a95ccbb/xxhash-3.7.0-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:c72500a3b6d6c30ebfc135035bcace9eb5884f2dc220804efcaaba43e9f611dd", size = 212961 },
+    { url = "https://files.pythonhosted.org/packages/0c/7e/721118ffc63bfff94aa565bcf2555a820f9f4bdb0f001e0d609bdfad70de/xxhash-3.7.0-cp313-cp313t-musllinux_1_2_armv7l.whl", hash = "sha256:43475925a766d01ca8cd9a857fd87f3d50406983c8506a4c07c4df12adcc867f", size = 243703 },
+    { url = "https://files.pythonhosted.org/packages/6e/18/16f6267160488b8276fd3d449d425712512add292ba545c1b6946bfdb7dd/xxhash-3.7.0-cp313-cp313t-musllinux_1_2_i686.whl", hash = "sha256:8d09dfd2ab135b985daf868b594315ebe11ad86cd9fea46e6c69f19b28f7d25a", size = 200894 },
+    { url = "https://files.pythonhosted.org/packages/2d/94/80ba841287fd97e3e9cac1d228788c8ef623746f570404961eec748ecb5c/xxhash-3.7.0-cp313-cp313t-musllinux_1_2_ppc64le.whl", hash = "sha256:c50269d0055ac1faecfd559886d2cbe4b730de236585aba0e873f9d9dadbe585", size = 213357 },
+    { url = "https://files.pythonhosted.org/packages/a1/7e/106d4067130c59f1e18a55ffadcd876d8c68534883a1e02685b29d3d8153/xxhash-3.7.0-cp313-cp313t-musllinux_1_2_riscv64.whl", hash = "sha256:1910df4756a5ab58cfad8744fc2d0f23926e3efcc346ee76e87b974abab922f4", size = 277600 },
+    { url = "https://files.pythonhosted.org/packages/c5/86/a081dd30da71d720b2612a792bfd55e45fa9a07ac76a0507f60487473c25/xxhash-3.7.0-cp313-cp313t-musllinux_1_2_s390x.whl", hash = "sha256:d006faf3b491957efcb433489be3c149efe4787b7063d5cddb8ddaefdc60e0c1", size = 416980 },
+    { url = "https://files.pythonhosted.org/packages/35/29/1a95221a029a3c1293773869e1ab47b07cbbdd82444a42809e8c60156626/xxhash-3.7.0-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:abb65b4e947e958f7b3b0d71db3ce447d1bc5f37f5eab871ce7223bda8768a04", size = 193840 },
+    { url = "https://files.pythonhosted.org/packages/c5/e0/db909dd0823285de2286f67e10ee4d81e96ad35d7d8e964ecb07fccd8af9/xxhash-3.7.0-cp313-cp313t-win32.whl", hash = "sha256:178959906cb1716a1ce08e0d69c82886c70a15a6f2790fc084fdd146ca30cd49", size = 30966 },
+    { url = "https://files.pythonhosted.org/packages/7b/ff/d705b15b22f21ee106adce239cb65d35067a158c630b240270f09b17c2e6/xxhash-3.7.0-cp313-cp313t-win_amd64.whl", hash = "sha256:2524a1e20d4c231d13b50f7cf39e44265b055669a64a7a4b9a2a44faa03f19b6", size = 31784 },
+    { url = "https://files.pythonhosted.org/packages/a2/1f/b2cf83c3638fd0588e0b17f22e5a9400bdfb1a3e3755324ac0aee2250b88/xxhash-3.7.0-cp313-cp313t-win_arm64.whl", hash = "sha256:37d994d0ffe81ef087bb330d392caa809bb5853c77e22ea3f71db024a0543dba", size = 27932 },
+    { url = "https://files.pythonhosted.org/packages/0e/cc/431db584f6fbb9312e40a173af027644e5580d39df1f73603cbb9dca4d6b/xxhash-3.7.0-cp314-cp314-android_24_arm64_v8a.whl", hash = "sha256:8c5fcfd806c335bfa2adf1cd0b3110a44fc7b6995c3a648c27489bae85801465", size = 36644 },
+    { url = "https://files.pythonhosted.org/packages/bc/01/255ec513e0a705d1f9a61413e78dfce4e3235203f0ed525a24c2b4b56345/xxhash-3.7.0-cp314-cp314-android_24_x86_64.whl", hash = "sha256:506a0b488f190f0a06769575e30caf71615c898ed93ab18b0dbcb6dec5c3713c", size = 35003 },
+    { url = "https://files.pythonhosted.org/packages/68/70/c55fc33c93445b44d8fc5a17b41ed99e3cebe92bcf8396809e63fc9a1165/xxhash-3.7.0-cp314-cp314-ios_13_0_arm64_iphoneos.whl", hash = "sha256:ec68dbba21532c0173a9872298e65c89749f7c9d21538c3a78b5bb6105871568", size = 29655 },
+    { url = "https://files.pythonhosted.org/packages/c2/72/ff8de73df000d74467d12a59ce6d6e2b2a368b978d41ab7b1fba5ed442be/xxhash-3.7.0-cp314-cp314-ios_13_0_arm64_iphonesimulator.whl", hash = "sha256:fa77e7ec1450d415d20129961814787c9abd9a07f98872f070b1fe96c5084611", size = 30664 },
+    { url = "https://files.pythonhosted.org/packages/b6/91/08416d9bd9bc3bf39d831abe8a5631ac2db5141dfd6fe81c3fe59a1f9264/xxhash-3.7.0-cp314-cp314-ios_13_0_x86_64_iphonesimulator.whl", hash = "sha256:fe32736295ea38e43e7d9424053c8c47c9f64fecfc7c895fb3da9b30b131c9ee", size = 33317 },
+    { url = "https://files.pythonhosted.org/packages/0e/3b/86b1caa4dee10a99f4bf9521e623359341c5e50d05158fa10c275b2bd079/xxhash-3.7.0-cp314-cp314-macosx_10_15_x86_64.whl", hash = "sha256:ab9dd2c83c4bbd63e422181a76f13502d049d3ddcac9a1bdc29196263d692bb8", size = 33457 },
+    { url = "https://files.pythonhosted.org/packages/ed/38/98ea14ad1517e1461292a65906951458d520689782bfbae111050145bdba/xxhash-3.7.0-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:3afec3a336a2286601a437cb07562ab0227685e6fbb9ec17e8c18457ff348ecf", size = 30894 },
+    { url = "https://files.pythonhosted.org/packages/61/a2/074654d0b893606541199993c7db70067d9fc63b748e0d60020a52a1bd36/xxhash-3.7.0-cp314-cp314-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:565df64437a9390f84465dcca33e7377114c7ede8d05cd2cf20081f831ea788e", size = 194409 },
+    { url = "https://files.pythonhosted.org/packages/e2/26/6d2a1afc468189f77ca28c32e1c83e1b9da1178231e05641dbc1b350e332/xxhash-3.7.0-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:12eca820a5d558633d423bf8bb78ce72a55394823f64089247f788a7e0ae691e", size = 213135 },
+    { url = "https://files.pythonhosted.org/packages/8e/0e/d8aecf95e09c42547453137be74d2f7b8b14e08f5177fa2fab6144a19061/xxhash-3.7.0-cp314-cp314-manylinux2014_armv7l.manylinux_2_17_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:f262b8f7599516567e070abf607b9af649052b2c4bd6f9be02b0cb41b7024805", size = 236379 },
+    { url = "https://files.pythonhosted.org/packages/f2/74/8140e8210536b3dd0cc816c4faaeb5ba6e63e8125ab25af4bcddd6a037b3/xxhash-3.7.0-cp314-cp314-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:f1598916cb197681e03e601901e4ab96a9a963de398c59d0964f8a6f44a2b361", size = 212447 },
+    { url = "https://files.pythonhosted.org/packages/a0/d2/462001d2903b4bee5a5689598a0a55e5e7cd1ac7f4247a5545cff10d3ebb/xxhash-3.7.0-cp314-cp314-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:322b2f0622230f526aeb1738149948a7ae357a9e2ceb1383c6fd1fdaecdafa16", size = 445660 },
+    { url = "https://files.pythonhosted.org/packages/23/09/2bd1ed7f8689b20e51727952cac8329d50c694dc32b2eba06ba5bc742b37/xxhash-3.7.0-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:24cc22070880cc57b830a65cde4e65fa884c6d9b28ae4803b5ee05911e7bafba", size = 194076 },
+    { url = "https://files.pythonhosted.org/packages/c9/6e/692302cd0a5f4ac4e6289f37fa888dc2e1e07750b68fe3e4bfe939b8cea3/xxhash-3.7.0-cp314-cp314-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:cb5a888a968b2434abf9ecda357b5d43f10d7b5a6da6fdbbe036208473aff0e2", size = 284990 },
+    { url = "https://files.pythonhosted.org/packages/05/d9/e54b159b3d9df7999d2a7c676ce7b323d1b5588a64f8f51ed8172567bd87/xxhash-3.7.0-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:a999771ff97bec27d18341be4f3a36b163bb1ac41ec17bef6d2dabd84acd33c7", size = 210590 },
+    { url = "https://files.pythonhosted.org/packages/50/93/0e0df1a3a196ced4ca71de76d65ead25d8e87bbfb87b64306ea47a40c00d/xxhash-3.7.0-cp314-cp314-musllinux_1_2_armv7l.whl", hash = "sha256:ed4a6efe2dee1655adb73e7ad40c6aa955a6892422b1e3b95de6a34de56e3cbb", size = 241442 },
+    { url = "https://files.pythonhosted.org/packages/9a/a9/d917a7a814e90b218f8a0d37967105eea91bf752c3303683c99a1f7bfc1f/xxhash-3.7.0-cp314-cp314-musllinux_1_2_i686.whl", hash = "sha256:9fd17f14ac0faa12126c2f9ca774a8cf342957265ec3c8669c144e5e6cdb478c", size = 198356 },
+    { url = "https://files.pythonhosted.org/packages/89/5e/f2ba1877c39469abbefc72991d6ebdcbd4c0880db01ae8cb1f553b0c537d/xxhash-3.7.0-cp314-cp314-musllinux_1_2_ppc64le.whl", hash = "sha256:05fd1254268c59b5cb2a029dfc204275e9fc52de2913f1e53aa8d01442c96b4d", size = 210898 },
+    { url = "https://files.pythonhosted.org/packages/90/c6/be56b58e73de531f39a10de1355bb77ceb663900dc4bf2d6d3002a9c3f9e/xxhash-3.7.0-cp314-cp314-musllinux_1_2_riscv64.whl", hash = "sha256:a2eae53197c6276d5b317f75a1be226bbf440c20b58bf525f36b5d0e1f657ca6", size = 275519 },
+    { url = "https://files.pythonhosted.org/packages/92/e2/17ddc85d5765b9c709f192009ed8f5a1fc876f4eb35bba7c307b5b1169f9/xxhash-3.7.0-cp314-cp314-musllinux_1_2_s390x.whl", hash = "sha256:bfe6f92e3522dcbe8c4281efd74fa7542a336cb00b0e3272c4ec0edabeaeaf67", size = 414191 },
+    { url = "https://files.pythonhosted.org/packages/9c/42/85f5b79f4bf1ec7ba052491164adfd4f4e9519f5dc7246de4fbd64a1bd56/xxhash-3.7.0-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:7ab9a49c410d8c6c786ab99e79c529938d894c01433130353dd0fe999111077a", size = 191604 },
+    { url = "https://files.pythonhosted.org/packages/b8/d0/6127b623aa4cca18d8b7743592b048d689fd6c6e37ff26a22cddf6cd9d7f/xxhash-3.7.0-cp314-cp314-win32.whl", hash = "sha256:040ea63668f9185b92bc74942df09c7e65703deed71431333678fc6e739a9955", size = 31271 },
+    { url = "https://files.pythonhosted.org/packages/64/4f/44fc4788568004c43921701cbc127f48218a1eede2c9aea231115323564d/xxhash-3.7.0-cp314-cp314-win_amd64.whl", hash = "sha256:2a61e2a3fb23c892496d587b470dee7fa1b58b248a187719c65ea8e94ec13257", size = 32284 },
+    { url = "https://files.pythonhosted.org/packages/6d/77/18bb895eb60a49453d16e17d67990e5caff557c78eafc90ad4e2eabf4570/xxhash-3.7.0-cp314-cp314-win_arm64.whl", hash = "sha256:c7741c7524961d8c0cb4d4c21b28957ff731a3fd5b5cd8b856dc80a40e9e5acc", size = 28701 },
+    { url = "https://files.pythonhosted.org/packages/45/a0/46f72244570c550fbbb7db1ef554183dd5ebe9136385f30e032b781ae8f6/xxhash-3.7.0-cp314-cp314t-macosx_10_15_x86_64.whl", hash = "sha256:fc84bf7aa7592f31ec63a3e7b11d624f468a3f19f5238cec7282a42e838ab1d7", size = 33646 },
+    { url = "https://files.pythonhosted.org/packages/4a/3a/453846a7eceea11e75def361eed01ec6a0205b9822c19927ed364ccae7cc/xxhash-3.7.0-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:9f1563fdc8abfc389748e6932c7e4e99c89a53e4ec37d4563c24fc06f5e5644b", size = 31125 },
+    { url = "https://files.pythonhosted.org/packages/bd/3e/49434aba738885d512f9e486db1bdd19db28dfa40372b56da26ef7a4e738/xxhash-3.7.0-cp314-cp314t-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:2d415f18becf6f153046ab6adc97da77e3643a0ee205dae61c4012604113a020", size = 196633 },
+    { url = "https://files.pythonhosted.org/packages/a4/e9/006cb6127baeb9f8abe6d15e62faa01349f09b34e2bfd65175b2422d026b/xxhash-3.7.0-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:bb16aa13ed175bc9be5c2491ba031b85a9b51c4ed90e0b3d4ebe63cf3fb54f8e", size = 215899 },
+    { url = "https://files.pythonhosted.org/packages/27/e4/cc57d72e66df0ae29b914335f1c6dcf61e8f3746ddf0ae3c471aa4f15e00/xxhash-3.7.0-cp314-cp314t-manylinux2014_armv7l.manylinux_2_17_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:f9fd595f1e5941b3d7863e4774e4b30caa6731fc34b9277da032295aa5656ee5", size = 238116 },
+    { url = "https://files.pythonhosted.org/packages/af/78/3531d4a3fd8a0038cc6be1f265a69c1b3587f557a10b677dd736de2202c1/xxhash-3.7.0-cp314-cp314t-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:1295325c5a98d552333fa53dc2b026b0ef0ec9c8e73ca3a952990b4c7d65d459", size = 215012 },
+    { url = "https://files.pythonhosted.org/packages/b4/f6/259fb1eaaec921f59b17203b0daee69829761226d3b980d5191d7723dd83/xxhash-3.7.0-cp314-cp314t-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:3573a651d146912da9daa9e29e5fbc45994420daaa9ef1e2fa5823e1dc485513", size = 448534 },
+    { url = "https://files.pythonhosted.org/packages/7b/16/a66d0eaf6a7e68532c07714361ddc904c663ec940f3b028c1ae4a21a7b9d/xxhash-3.7.0-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:5ec1e080a3d02d94ea9335bfab0e3374b877e25411422c18f51a943fa4b46381", size = 196217 },
+    { url = "https://files.pythonhosted.org/packages/8d/ef/d2efc7fc51756dc52509109d1a25cefc859d74bc4b19a167b12dbd8c2786/xxhash-3.7.0-cp314-cp314t-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:84415265192072d8638a3afc3c1bc5995e310570cd9acb54dc46d3939e364fe0", size = 286906 },
+    { url = "https://files.pythonhosted.org/packages/fc/67/25decd1d4a4018582ec4db2a868a2b7e40640f4adb20dfeb19ac923aa825/xxhash-3.7.0-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:8d4dea659b57443989ef32f4295104fd6912c73d0bf26d1d148bb88a9f159b02", size = 213057 },
+    { url = "https://files.pythonhosted.org/packages/0d/5d/17651eb29d06786cdc40c60ae3d27d645aa5d61d2eca6237a7ba0b94789b/xxhash-3.7.0-cp314-cp314t-musllinux_1_2_armv7l.whl", hash = "sha256:05ece0fe4d9c9c2728912d1981ae1566cfc83a011571b24732cbf76e1fb70dca", size = 243886 },
+    { url = "https://files.pythonhosted.org/packages/8a/d4/174d9cf7502243d586e6a9ae842b1ae23026620995114f85f1380e588bc9/xxhash-3.7.0-cp314-cp314t-musllinux_1_2_i686.whl", hash = "sha256:fd880353cf1ffaf321bc18dd663e111976dbd0d3bbd8a66d58d2b470dfa7f396", size = 201015 },
+    { url = "https://files.pythonhosted.org/packages/91/8c/2254e2d06c3ac5e6fe22eaf3da791b87ea823ae9f2c17b4af66755c5752d/xxhash-3.7.0-cp314-cp314t-musllinux_1_2_ppc64le.whl", hash = "sha256:4e15cc9e2817f6481160f930c62842b3ff419e20e13072bcbab12230943092bc", size = 213457 },
+    { url = "https://files.pythonhosted.org/packages/79/a2/e3daa762545921173e3360f3b4ff7fc63c2d27359f7230ec1a7a74e117f6/xxhash-3.7.0-cp314-cp314t-musllinux_1_2_riscv64.whl", hash = "sha256:90b9d1a8bd37d768ffc92a1f651ec69afc532a96fa1ac2ea7abbed5d630b3237", size = 277738 },
+    { url = "https://files.pythonhosted.org/packages/e1/4c/e186da2c46b87f5204640e008d42730bf3c1ee9f0efb71ae1ebcdfeac681/xxhash-3.7.0-cp314-cp314t-musllinux_1_2_s390x.whl", hash = "sha256:157c49475b34ecea8809e51123d9769a534e139d1247942f7a4bc67710bb2533", size = 417127 },
+    { url = "https://files.pythonhosted.org/packages/17/28/3798e15007a3712d0da3d3fe70f8e11916569858b5cc371053bc26270832/xxhash-3.7.0-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:5a6ddec83325685e729ca119d1f5c518ec39294212ecd770e60693cdc5f7eb79", size = 193962 },
+    { url = "https://files.pythonhosted.org/packages/ad/95/a26baa93b5241fd7630998816a4ec47a5a0bad193b3f8fc8f3593e1a4a67/xxhash-3.7.0-cp314-cp314t-win32.whl", hash = "sha256:a04a6cab47e2166435aaf5b9e5ee41d1532cc8300efdef87f2a4d0acb7db19ed", size = 31643 },
+    { url = "https://files.pythonhosted.org/packages/44/36/5454f13c447e395f9b06a3e91274c59f503d31fad84e1836efe3bdb71f6a/xxhash-3.7.0-cp314-cp314t-win_amd64.whl", hash = "sha256:8653dd7c2eda020545bb2c71c7f7039b53fe7434d0fc1a0a9deb79ab3f1a4fc1", size = 32522 },
+    { url = "https://files.pythonhosted.org/packages/74/35/698e7e3ff38e22992ea24870a511d8762474fb6783627a2910ff22a185c2/xxhash-3.7.0-cp314-cp314t-win_arm64.whl", hash = "sha256:468f0fc114faaa4b36699f8e328bbc3bb11dc418ba94ac52c26dd736d4b6c637", size = 28807 },
+]
+
+[[package]]
+name = "yarl"
+version = "1.23.0"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "idna" },
+    { name = "multidict" },
+    { name = "propcache" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/23/6e/beb1beec874a72f23815c1434518bfc4ed2175065173fb138c3705f658d4/yarl-1.23.0.tar.gz", hash = "sha256:53b1ea6ca88ebd4420379c330aea57e258408dd0df9af0992e5de2078dc9f5d5", size = 194676 }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/88/8a/94615bc31022f711add374097ad4144d569e95ff3c38d39215d07ac153a0/yarl-1.23.0-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:1932b6b8bba8d0160a9d1078aae5838a66039e8832d41d2992daa9a3a08f7860", size = 124737 },
+    { url = "https://files.pythonhosted.org/packages/e3/6f/c6554045d59d64052698add01226bc867b52fe4a12373415d7991fdca95d/yarl-1.23.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:411225bae281f114067578891bc75534cfb3d92a3b4dfef7a6ca78ba354e6069", size = 87029 },
+    { url = "https://files.pythonhosted.org/packages/19/2a/725ecc166d53438bc88f76822ed4b1e3b10756e790bafd7b523fe97c322d/yarl-1.23.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:13a563739ae600a631c36ce096615fe307f131344588b0bc0daec108cdb47b25", size = 86310 },
+    { url = "https://files.pythonhosted.org/packages/99/30/58260ed98e6ff7f90ba84442c1ddd758c9170d70327394a6227b310cd60f/yarl-1.23.0-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:9cbf44c5cb4a7633d078788e1b56387e3d3cf2b8139a3be38040b22d6c3221c8", size = 97587 },
+    { url = "https://files.pythonhosted.org/packages/76/0a/8b08aac08b50682e65759f7f8dde98ae8168f72487e7357a5d684c581ef9/yarl-1.23.0-cp312-cp312-manylinux2014_armv7l.manylinux_2_17_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:53ad387048f6f09a8969631e4de3f1bf70c50e93545d64af4f751b2498755072", size = 92528 },
+    { url = "https://files.pythonhosted.org/packages/52/07/0b7179101fe5f8385ec6c6bb5d0cb9f76bd9fb4a769591ab6fb5cdbfc69a/yarl-1.23.0-cp312-cp312-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:4a59ba56f340334766f3a4442e0efd0af895fae9e2b204741ef885c446b3a1a8", size = 105339 },
+    { url = "https://files.pythonhosted.org/packages/d3/8a/36d82869ab5ec829ca8574dfcb92b51286fcfb1e9c7a73659616362dc880/yarl-1.23.0-cp312-cp312-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:803a3c3ce4acc62eaf01eaca1208dcf0783025ef27572c3336502b9c232005e7", size = 105061 },
+    { url = "https://files.pythonhosted.org/packages/66/3e/868e5c3364b6cee19ff3e1a122194fa4ce51def02c61023970442162859e/yarl-1.23.0-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:a3d2bff8f37f8d0f96c7ec554d16945050d54462d6e95414babaa18bfafc7f51", size = 100132 },
+    { url = "https://files.pythonhosted.org/packages/cf/26/9c89acf82f08a52cb52d6d39454f8d18af15f9d386a23795389d1d423823/yarl-1.23.0-cp312-cp312-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:c75eb09e8d55bceb4367e83496ff8ef2bc7ea6960efb38e978e8073ea59ecb67", size = 99289 },
+    { url = "https://files.pythonhosted.org/packages/6f/54/5b0db00d2cb056922356104468019c0a132e89c8d3ab67d8ede9f4483d2a/yarl-1.23.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:877b0738624280e34c55680d6054a307aa94f7d52fa0e3034a9cc6e790871da7", size = 96950 },
+    { url = "https://files.pythonhosted.org/packages/f6/40/10fa93811fd439341fad7e0718a86aca0de9548023bbb403668d6555acab/yarl-1.23.0-cp312-cp312-musllinux_1_2_armv7l.whl", hash = "sha256:b5405bb8f0e783a988172993cfc627e4d9d00432d6bbac65a923041edacf997d", size = 93960 },
+    { url = "https://files.pythonhosted.org/packages/bc/d2/8ae2e6cd77d0805f4526e30ec43b6f9a3dfc542d401ac4990d178e4bf0cf/yarl-1.23.0-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:1c3a3598a832590c5a3ce56ab5576361b5688c12cb1d39429cf5dba30b510760", size = 104703 },
+    { url = "https://files.pythonhosted.org/packages/2f/0c/b3ceacf82c3fe21183ce35fa2acf5320af003d52bc1fcf5915077681142e/yarl-1.23.0-cp312-cp312-musllinux_1_2_riscv64.whl", hash = "sha256:8419ebd326430d1cbb7efb5292330a2cf39114e82df5cc3d83c9a0d5ebeaf2f2", size = 98325 },
+    { url = "https://files.pythonhosted.org/packages/9d/e0/12900edd28bdab91a69bd2554b85ad7b151f64e8b521fe16f9ad2f56477a/yarl-1.23.0-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:be61f6fff406ca40e3b1d84716fde398fc08bc63dd96d15f3a14230a0973ed86", size = 105067 },
+    { url = "https://files.pythonhosted.org/packages/15/61/74bb1182cf79c9bbe4eb6b1f14a57a22d7a0be5e9cedf8e2d5c2086474c3/yarl-1.23.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:3ceb13c5c858d01321b5d9bb65e4cf37a92169ea470b70fec6f236b2c9dd7e34", size = 100285 },
+    { url = "https://files.pythonhosted.org/packages/69/7f/cd5ef733f2550de6241bd8bd8c3febc78158b9d75f197d9c7baa113436af/yarl-1.23.0-cp312-cp312-win32.whl", hash = "sha256:fffc45637bcd6538de8b85f51e3df3223e4ad89bccbfca0481c08c7fc8b7ed7d", size = 82359 },
+    { url = "https://files.pythonhosted.org/packages/f5/be/25216a49daeeb7af2bec0db22d5e7df08ed1d7c9f65d78b14f3b74fd72fc/yarl-1.23.0-cp312-cp312-win_amd64.whl", hash = "sha256:f69f57305656a4852f2a7203efc661d8c042e6cc67f7acd97d8667fb448a426e", size = 87674 },
+    { url = "https://files.pythonhosted.org/packages/d2/35/aeab955d6c425b227d5b7247eafb24f2653fedc32f95373a001af5dfeb9e/yarl-1.23.0-cp312-cp312-win_arm64.whl", hash = "sha256:6e87a6e8735b44816e7db0b2fbc9686932df473c826b0d9743148432e10bb9b9", size = 81879 },
+    { url = "https://files.pythonhosted.org/packages/9a/4b/a0a6e5d0ee8a2f3a373ddef8a4097d74ac901ac363eea1440464ccbe0898/yarl-1.23.0-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:16c6994ac35c3e74fb0ae93323bf8b9c2a9088d55946109489667c510a7d010e", size = 123796 },
+    { url = "https://files.pythonhosted.org/packages/67/b6/8925d68af039b835ae876db5838e82e76ec87b9782ecc97e192b809c4831/yarl-1.23.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:4a42e651629dafb64fd5b0286a3580613702b5809ad3f24934ea87595804f2c5", size = 86547 },
+    { url = "https://files.pythonhosted.org/packages/ae/50/06d511cc4b8e0360d3c94af051a768e84b755c5eb031b12adaaab6dec6e5/yarl-1.23.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:7c6b9461a2a8b47c65eef63bb1c76a4f1c119618ffa99ea79bc5bb1e46c5821b", size = 85854 },
+    { url = "https://files.pythonhosted.org/packages/c4/f4/4e30b250927ffdab4db70da08b9b8d2194d7c7b400167b8fbeca1e4701ca/yarl-1.23.0-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:2569b67d616eab450d262ca7cb9f9e19d2f718c70a8b88712859359d0ab17035", size = 98351 },
+    { url = "https://files.pythonhosted.org/packages/86/fc/4118c5671ea948208bdb1492d8b76bdf1453d3e73df051f939f563e7dcc5/yarl-1.23.0-cp313-cp313-manylinux2014_armv7l.manylinux_2_17_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:e9d9a4d06d3481eab79803beb4d9bd6f6a8e781ec078ac70d7ef2dcc29d1bea5", size = 92711 },
+    { url = "https://files.pythonhosted.org/packages/56/11/1ed91d42bd9e73c13dc9e7eb0dd92298d75e7ac4dd7f046ad0c472e231cd/yarl-1.23.0-cp313-cp313-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:f514f6474e04179d3d33175ed3f3e31434d3130d42ec153540d5b157deefd735", size = 106014 },
+    { url = "https://files.pythonhosted.org/packages/ce/c9/74e44e056a23fbc33aca71779ef450ca648a5bc472bdad7a82339918f818/yarl-1.23.0-cp313-cp313-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:fda207c815b253e34f7e1909840fd14299567b1c0eb4908f8c2ce01a41265401", size = 105557 },
+    { url = "https://files.pythonhosted.org/packages/66/fe/b1e10b08d287f518994f1e2ff9b6d26f0adeecd8dd7d533b01bab29a3eda/yarl-1.23.0-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:34b6cf500e61c90f305094911f9acc9c86da1a05a7a3f5be9f68817043f486e4", size = 101559 },
+    { url = "https://files.pythonhosted.org/packages/72/59/c5b8d94b14e3d3c2a9c20cb100119fd534ab5a14b93673ab4cc4a4141ea5/yarl-1.23.0-cp313-cp313-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:d7504f2b476d21653e4d143f44a175f7f751cd41233525312696c76aa3dbb23f", size = 100502 },
+    { url = "https://files.pythonhosted.org/packages/77/4f/96976cb54cbfc5c9fd73ed4c51804f92f209481d1fb190981c0f8a07a1d7/yarl-1.23.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:578110dd426f0d209d1509244e6d4a3f1a3e9077655d98c5f22583d63252a08a", size = 98027 },
+    { url = "https://files.pythonhosted.org/packages/63/6e/904c4f476471afdbad6b7e5b70362fb5810e35cd7466529a97322b6f5556/yarl-1.23.0-cp313-cp313-musllinux_1_2_armv7l.whl", hash = "sha256:609d3614d78d74ebe35f54953c5bbd2ac647a7ddb9c30a5d877580f5e86b22f2", size = 95369 },
+    { url = "https://files.pythonhosted.org/packages/9d/40/acfcdb3b5f9d68ef499e39e04d25e141fe90661f9d54114556cf83be8353/yarl-1.23.0-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:4966242ec68afc74c122f8459abd597afd7d8a60dc93d695c1334c5fd25f762f", size = 105565 },
+    { url = "https://files.pythonhosted.org/packages/5e/c6/31e28f3a6ba2869c43d124f37ea5260cac9c9281df803c354b31f4dd1f3c/yarl-1.23.0-cp313-cp313-musllinux_1_2_riscv64.whl", hash = "sha256:e0fd068364a6759bc794459f0a735ab151d11304346332489c7972bacbe9e72b", size = 99813 },
+    { url = "https://files.pythonhosted.org/packages/08/1f/6f65f59e72d54aa467119b63fc0b0b1762eff0232db1f4720cd89e2f4a17/yarl-1.23.0-cp313-cp313-musllinux_1_2_s390x.whl", hash = "sha256:39004f0ad156da43e86aa71f44e033de68a44e5a31fc53507b36dd253970054a", size = 105632 },
+    { url = "https://files.pythonhosted.org/packages/a3/c4/18b178a69935f9e7a338127d5b77d868fdc0f0e49becd286d51b3a18c61d/yarl-1.23.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:e5723c01a56c5028c807c701aa66722916d2747ad737a046853f6c46f4875543", size = 101895 },
+    { url = "https://files.pythonhosted.org/packages/8f/54/f5b870b5505663911dba950a8e4776a0dbd51c9c54c0ae88e823e4b874a0/yarl-1.23.0-cp313-cp313-win32.whl", hash = "sha256:1b6b572edd95b4fa8df75de10b04bc81acc87c1c7d16bcdd2035b09d30acc957", size = 82356 },
+    { url = "https://files.pythonhosted.org/packages/7a/84/266e8da36879c6edcd37b02b547e2d9ecdfea776be49598e75696e3316e1/yarl-1.23.0-cp313-cp313-win_amd64.whl", hash = "sha256:baaf55442359053c7d62f6f8413a62adba3205119bcb6f49594894d8be47e5e3", size = 87515 },
+    { url = "https://files.pythonhosted.org/packages/00/fd/7e1c66efad35e1649114fa13f17485f62881ad58edeeb7f49f8c5e748bf9/yarl-1.23.0-cp313-cp313-win_arm64.whl", hash = "sha256:fb4948814a2a98e3912505f09c9e7493b1506226afb1f881825368d6fb776ee3", size = 81785 },
+    { url = "https://files.pythonhosted.org/packages/9c/fc/119dd07004f17ea43bb91e3ece6587759edd7519d6b086d16bfbd3319982/yarl-1.23.0-cp313-cp313t-macosx_10_13_universal2.whl", hash = "sha256:aecfed0b41aa72b7881712c65cf764e39ce2ec352324f5e0837c7048d9e6daaa", size = 130719 },
+    { url = "https://files.pythonhosted.org/packages/e6/0d/9f2348502fbb3af409e8f47730282cd6bc80dec6630c1e06374d882d6eb2/yarl-1.23.0-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:a41bcf68efd19073376eb8cf948b8d9be0af26256403e512bb18f3966f1f9120", size = 89690 },
+    { url = "https://files.pythonhosted.org/packages/50/93/e88f3c80971b42cfc83f50a51b9d165a1dbf154b97005f2994a79f212a07/yarl-1.23.0-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:cde9a2ecd91668bcb7f077c4966d8ceddb60af01b52e6e3e2680e4cf00ad1a59", size = 89851 },
+    { url = "https://files.pythonhosted.org/packages/1c/07/61c9dd8ba8f86473263b4036f70fb594c09e99c0d9737a799dfd8bc85651/yarl-1.23.0-cp313-cp313t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:5023346c4ee7992febc0068e7593de5fa2bf611848c08404b35ebbb76b1b0512", size = 95874 },
+    { url = "https://files.pythonhosted.org/packages/9e/e9/f9ff8ceefba599eac6abddcfb0b3bee9b9e636e96dbf54342a8577252379/yarl-1.23.0-cp313-cp313t-manylinux2014_armv7l.manylinux_2_17_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:d1009abedb49ae95b136a8904a3f71b342f849ffeced2d3747bf29caeda218c4", size = 88710 },
+    { url = "https://files.pythonhosted.org/packages/eb/78/0231bfcc5d4c8eec220bc2f9ef82cb4566192ea867a7c5b4148f44f6cbcd/yarl-1.23.0-cp313-cp313t-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:a8d00f29b42f534cc8aa3931cfe773b13b23e561e10d2b26f27a8d309b0e82a1", size = 101033 },
+    { url = "https://files.pythonhosted.org/packages/cd/9b/30ea5239a61786f18fd25797151a17fbb3be176977187a48d541b5447dd4/yarl-1.23.0-cp313-cp313t-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:95451e6ce06c3e104556d73b559f5da6c34a069b6b62946d3ad66afcd51642ea", size = 100817 },
+    { url = "https://files.pythonhosted.org/packages/62/e2/a4980481071791bc83bce2b7a1a1f7adcabfa366007518b4b845e92eeee3/yarl-1.23.0-cp313-cp313t-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:531ef597132086b6cf96faa7c6c1dcd0361dd5f1694e5cc30375907b9b7d3ea9", size = 97482 },
+    { url = "https://files.pythonhosted.org/packages/e5/1e/304a00cf5f6100414c4b5a01fc7ff9ee724b62158a08df2f8170dfc72a2d/yarl-1.23.0-cp313-cp313t-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:88f9fb0116fbfcefcab70f85cf4b74a2b6ce5d199c41345296f49d974ddb4123", size = 95949 },
+    { url = "https://files.pythonhosted.org/packages/68/03/093f4055ed4cae649ac53bca3d180bd37102e9e11d048588e9ab0c0108d0/yarl-1.23.0-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:e7b0460976dc75cb87ad9cc1f9899a4b97751e7d4e77ab840fc9b6d377b8fd24", size = 95839 },
+    { url = "https://files.pythonhosted.org/packages/b9/28/4c75ebb108f322aa8f917ae10a8ffa4f07cae10a8a627b64e578617df6a0/yarl-1.23.0-cp313-cp313t-musllinux_1_2_armv7l.whl", hash = "sha256:115136c4a426f9da976187d238e84139ff6b51a20839aa6e3720cd1026d768de", size = 90696 },
+    { url = "https://files.pythonhosted.org/packages/23/9c/42c2e2dd91c1a570402f51bdf066bfdb1241c2240ba001967bad778e77b7/yarl-1.23.0-cp313-cp313t-musllinux_1_2_ppc64le.whl", hash = "sha256:ead11956716a940c1abc816b7df3fa2b84d06eaed8832ca32f5c5e058c65506b", size = 100865 },
+    { url = "https://files.pythonhosted.org/packages/74/05/1bcd60a8a0a914d462c305137246b6f9d167628d73568505fce3f1cb2e65/yarl-1.23.0-cp313-cp313t-musllinux_1_2_riscv64.whl", hash = "sha256:fe8f8f5e70e6dbdfca9882cd9deaac058729bcf323cf7a58660901e55c9c94f6", size = 96234 },
+    { url = "https://files.pythonhosted.org/packages/90/b2/f52381aac396d6778ce516b7bc149c79e65bfc068b5de2857ab69eeea3b7/yarl-1.23.0-cp313-cp313t-musllinux_1_2_s390x.whl", hash = "sha256:a0e317df055958a0c1e79e5d2aa5a5eaa4a6d05a20d4b0c9c3f48918139c9fc6", size = 100295 },
+    { url = "https://files.pythonhosted.org/packages/e5/e8/638bae5bbf1113a659b2435d8895474598afe38b4a837103764f603aba56/yarl-1.23.0-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:6f0fd84de0c957b2d280143522c4f91a73aada1923caee763e24a2b3fda9f8a5", size = 97784 },
+    { url = "https://files.pythonhosted.org/packages/80/25/a3892b46182c586c202629fc2159aa13975d3741d52ebd7347fd501d48d5/yarl-1.23.0-cp313-cp313t-win32.whl", hash = "sha256:93a784271881035ab4406a172edb0faecb6e7d00f4b53dc2f55919d6c9688595", size = 88313 },
+    { url = "https://files.pythonhosted.org/packages/43/68/8c5b36aa5178900b37387937bc2c2fe0e9505537f713495472dcf6f6fccc/yarl-1.23.0-cp313-cp313t-win_amd64.whl", hash = "sha256:dd00607bffbf30250fe108065f07453ec124dbf223420f57f5e749b04295e090", size = 94932 },
+    { url = "https://files.pythonhosted.org/packages/c6/cc/d79ba8292f51f81f4dc533a8ccfb9fc6992cabf0998ed3245de7589dc07c/yarl-1.23.0-cp313-cp313t-win_arm64.whl", hash = "sha256:ac09d42f48f80c9ee1635b2fcaa819496a44502737660d3c0f2ade7526d29144", size = 84786 },
+    { url = "https://files.pythonhosted.org/packages/90/98/b85a038d65d1b92c3903ab89444f48d3cee490a883477b716d7a24b1a78c/yarl-1.23.0-cp314-cp314-macosx_10_15_universal2.whl", hash = "sha256:21d1b7305a71a15b4794b5ff22e8eef96ff4a6d7f9657155e5aa419444b28912", size = 124455 },
+    { url = "https://files.pythonhosted.org/packages/39/54/bc2b45559f86543d163b6e294417a107bb87557609007c007ad889afec18/yarl-1.23.0-cp314-cp314-macosx_10_15_x86_64.whl", hash = "sha256:85610b4f27f69984932a7abbe52703688de3724d9f72bceb1cca667deff27474", size = 86752 },
+    { url = "https://files.pythonhosted.org/packages/24/f9/e8242b68362bffe6fb536c8db5076861466fc780f0f1b479fc4ffbebb128/yarl-1.23.0-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:23f371bd662cf44a7630d4d113101eafc0cfa7518a2760d20760b26021454719", size = 86291 },
+    { url = "https://files.pythonhosted.org/packages/ea/d8/d1cb2378c81dd729e98c716582b1ccb08357e8488e4c24714658cc6630e8/yarl-1.23.0-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:c4a80f77dc1acaaa61f0934176fccca7096d9b1ff08c8ba9cddf5ae034a24319", size = 99026 },
+    { url = "https://files.pythonhosted.org/packages/0a/ff/7196790538f31debe3341283b5b0707e7feb947620fc5e8236ef28d44f72/yarl-1.23.0-cp314-cp314-manylinux2014_armv7l.manylinux_2_17_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:bd654fad46d8d9e823afbb4f87c79160b5a374ed1ff5bde24e542e6ba8f41434", size = 92355 },
+    { url = "https://files.pythonhosted.org/packages/c1/56/25d58c3eddde825890a5fe6aa1866228377354a3c39262235234ab5f616b/yarl-1.23.0-cp314-cp314-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:682bae25f0a0dd23a056739f23a134db9f52a63e2afd6bfb37ddc76292bbd723", size = 106417 },
+    { url = "https://files.pythonhosted.org/packages/51/8a/882c0e7bc8277eb895b31bce0138f51a1ba551fc2e1ec6753ffc1e7c1377/yarl-1.23.0-cp314-cp314-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:a82836cab5f197a0514235aaf7ffccdc886ccdaa2324bc0aafdd4ae898103039", size = 106422 },
+    { url = "https://files.pythonhosted.org/packages/42/2b/fef67d616931055bf3d6764885990a3ac647d68734a2d6a9e1d13de437a2/yarl-1.23.0-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:1c57676bdedc94cd3bc37724cf6f8cd2779f02f6aba48de45feca073e714fe52", size = 101915 },
+    { url = "https://files.pythonhosted.org/packages/18/6a/530e16aebce27c5937920f3431c628a29a4b6b430fab3fd1c117b26ff3f6/yarl-1.23.0-cp314-cp314-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:c7f8dc16c498ff06497c015642333219871effba93e4a2e8604a06264aca5c5c", size = 100690 },
+    { url = "https://files.pythonhosted.org/packages/88/08/93749219179a45e27b036e03260fda05190b911de8e18225c294ac95bbc9/yarl-1.23.0-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:5ee586fb17ff8f90c91cf73c6108a434b02d69925f44f5f8e0d7f2f260607eae", size = 98750 },
+    { url = "https://files.pythonhosted.org/packages/d9/cf/ea424a004969f5d81a362110a6ac1496d79efdc6d50c2c4b2e3ea0fc2519/yarl-1.23.0-cp314-cp314-musllinux_1_2_armv7l.whl", hash = "sha256:17235362f580149742739cc3828b80e24029d08cbb9c4bda0242c7b5bc610a8e", size = 94685 },
+    { url = "https://files.pythonhosted.org/packages/e2/b7/14341481fe568e2b0408bcf1484c652accafe06a0ade9387b5d3fd9df446/yarl-1.23.0-cp314-cp314-musllinux_1_2_ppc64le.whl", hash = "sha256:0793e2bd0cf14234983bbb371591e6bea9e876ddf6896cdcc93450996b0b5c85", size = 106009 },
+    { url = "https://files.pythonhosted.org/packages/0a/e6/5c744a9b54f4e8007ad35bce96fbc9218338e84812d36f3390cea616881a/yarl-1.23.0-cp314-cp314-musllinux_1_2_riscv64.whl", hash = "sha256:3650dc2480f94f7116c364096bc84b1d602f44224ef7d5c7208425915c0475dd", size = 100033 },
+    { url = "https://files.pythonhosted.org/packages/0c/23/e3bfc188d0b400f025bc49d99793d02c9abe15752138dcc27e4eaf0c4a9e/yarl-1.23.0-cp314-cp314-musllinux_1_2_s390x.whl", hash = "sha256:f40e782d49630ad384db66d4d8b73ff4f1b8955dc12e26b09a3e3af064b3b9d6", size = 106483 },
+    { url = "https://files.pythonhosted.org/packages/72/42/f0505f949a90b3f8b7a363d6cbdf398f6e6c58946d85c6d3a3bc70595b26/yarl-1.23.0-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:94f8575fbdf81749008d980c17796097e645574a3b8c28ee313931068dad14fe", size = 102175 },
+    { url = "https://files.pythonhosted.org/packages/aa/65/b39290f1d892a9dd671d1c722014ca062a9c35d60885d57e5375db0404b5/yarl-1.23.0-cp314-cp314-win32.whl", hash = "sha256:c8aa34a5c864db1087d911a0b902d60d203ea3607d91f615acd3f3108ac32169", size = 83871 },
+    { url = "https://files.pythonhosted.org/packages/a9/5b/9b92f54c784c26e2a422e55a8d2607ab15b7ea3349e28359282f84f01d43/yarl-1.23.0-cp314-cp314-win_amd64.whl", hash = "sha256:63e92247f383c85ab00dd0091e8c3fa331a96e865459f5ee80353c70a4a42d70", size = 89093 },
+    { url = "https://files.pythonhosted.org/packages/e0/7d/8a84dc9381fd4412d5e7ff04926f9865f6372b4c2fd91e10092e65d29eb8/yarl-1.23.0-cp314-cp314-win_arm64.whl", hash = "sha256:70efd20be968c76ece7baa8dafe04c5be06abc57f754d6f36f3741f7aa7a208e", size = 83384 },
+    { url = "https://files.pythonhosted.org/packages/dd/8d/d2fad34b1c08aa161b74394183daa7d800141aaaee207317e82c790b418d/yarl-1.23.0-cp314-cp314t-macosx_10_15_universal2.whl", hash = "sha256:9a18d6f9359e45722c064c97464ec883eb0e0366d33eda61cb19a244bf222679", size = 131019 },
+    { url = "https://files.pythonhosted.org/packages/19/ff/33009a39d3ccf4b94d7d7880dfe17fb5816c5a4fe0096d9b56abceea9ac7/yarl-1.23.0-cp314-cp314t-macosx_10_15_x86_64.whl", hash = "sha256:2803ed8b21ca47a43da80a6fd1ed3019d30061f7061daa35ac54f63933409412", size = 89894 },
+    { url = "https://files.pythonhosted.org/packages/0c/f1/dab7ac5e7306fb79c0190766a3c00b4cb8d09a1f390ded68c85a5934faf5/yarl-1.23.0-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:394906945aa8b19fc14a61cf69743a868bb8c465efe85eee687109cc540b98f4", size = 89979 },
+    { url = "https://files.pythonhosted.org/packages/aa/b1/08e95f3caee1fad6e65017b9f26c1d79877b502622d60e517de01e72f95d/yarl-1.23.0-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:71d006bee8397a4a89f469b8deb22469fe7508132d3c17fa6ed871e79832691c", size = 95943 },
+    { url = "https://files.pythonhosted.org/packages/c0/cc/6409f9018864a6aa186c61175b977131f373f1988e198e031236916e87e4/yarl-1.23.0-cp314-cp314t-manylinux2014_armv7l.manylinux_2_17_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:62694e275c93d54f7ccedcfef57d42761b2aad5234b6be1f3e3026cae4001cd4", size = 88786 },
+    { url = "https://files.pythonhosted.org/packages/76/40/cc22d1d7714b717fde2006fad2ced5efe5580606cb059ae42117542122f3/yarl-1.23.0-cp314-cp314t-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:a31de1613658308efdb21ada98cbc86a97c181aa050ba22a808120bb5be3ab94", size = 101307 },
+    { url = "https://files.pythonhosted.org/packages/8f/0d/476c38e85ddb4c6ec6b20b815bdd779aa386a013f3d8b85516feee55c8dc/yarl-1.23.0-cp314-cp314t-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:fb1e8b8d66c278b21d13b0a7ca22c41dd757a7c209c6b12c313e445c31dd3b28", size = 100904 },
+    { url = "https://files.pythonhosted.org/packages/72/32/0abe4a76d59adf2081dcb0397168553ece4616ada1c54d1c49d8936c74f8/yarl-1.23.0-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:50f9d8d531dfb767c565f348f33dd5139a6c43f5cbdf3f67da40d54241df93f6", size = 97728 },
+    { url = "https://files.pythonhosted.org/packages/b7/35/7b30f4810fba112f60f5a43237545867504e15b1c7647a785fbaf588fac2/yarl-1.23.0-cp314-cp314t-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:575aa4405a656e61a540f4a80eaa5260f2a38fff7bfdc4b5f611840d76e9e277", size = 95964 },
+    { url = "https://files.pythonhosted.org/packages/2d/86/ed7a73ab85ef00e8bb70b0cb5421d8a2a625b81a333941a469a6f4022828/yarl-1.23.0-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:041b1a4cefacf65840b4e295c6985f334ba83c30607441ae3cf206a0eed1a2e4", size = 95882 },
+    { url = "https://files.pythonhosted.org/packages/19/90/d56967f61a29d8498efb7afb651e0b2b422a1e9b47b0ab5f4e40a19b699b/yarl-1.23.0-cp314-cp314t-musllinux_1_2_armv7l.whl", hash = "sha256:d38c1e8231722c4ce40d7593f28d92b5fc72f3e9774fe73d7e800ec32299f63a", size = 90797 },
+    { url = "https://files.pythonhosted.org/packages/72/00/8b8f76909259f56647adb1011d7ed8b321bcf97e464515c65016a47ecdf0/yarl-1.23.0-cp314-cp314t-musllinux_1_2_ppc64le.whl", hash = "sha256:d53834e23c015ee83a99377db6e5e37d8484f333edb03bd15b4bc312cc7254fb", size = 101023 },
+    { url = "https://files.pythonhosted.org/packages/ac/e2/cab11b126fb7d440281b7df8e9ddbe4851e70a4dde47a202b6642586b8d9/yarl-1.23.0-cp314-cp314t-musllinux_1_2_riscv64.whl", hash = "sha256:2e27c8841126e017dd2a054a95771569e6070b9ee1b133366d8b31beb5018a41", size = 96227 },
+    { url = "https://files.pythonhosted.org/packages/c2/9b/2c893e16bfc50e6b2edf76c1a9eb6cb0c744346197e74c65e99ad8d634d0/yarl-1.23.0-cp314-cp314t-musllinux_1_2_s390x.whl", hash = "sha256:76855800ac56f878847a09ce6dba727c93ca2d89c9e9d63002d26b916810b0a2", size = 100302 },
+    { url = "https://files.pythonhosted.org/packages/28/ec/5498c4e3a6d5f1003beb23405671c2eb9cdbf3067d1c80f15eeafe301010/yarl-1.23.0-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:e09fd068c2e169a7070d83d3bde728a4d48de0549f975290be3c108c02e499b4", size = 98202 },
+    { url = "https://files.pythonhosted.org/packages/fe/c3/cd737e2d45e70717907f83e146f6949f20cc23cd4bf7b2688727763aa458/yarl-1.23.0-cp314-cp314t-win32.whl", hash = "sha256:73309162a6a571d4cbd3b6a1dcc703c7311843ae0d1578df6f09be4e98df38d4", size = 90558 },
+    { url = "https://files.pythonhosted.org/packages/e1/19/3774d162f6732d1cfb0b47b4140a942a35ca82bb19b6db1f80e9e7bdc8f8/yarl-1.23.0-cp314-cp314t-win_amd64.whl", hash = "sha256:4503053d296bc6e4cbd1fad61cf3b6e33b939886c4f249ba7c78b602214fabe2", size = 97610 },
+    { url = "https://files.pythonhosted.org/packages/51/47/3fa2286c3cb162c71cdb34c4224d5745a1ceceb391b2bd9b19b668a8d724/yarl-1.23.0-cp314-cp314t-win_arm64.whl", hash = "sha256:44bb7bef4ea409384e3f8bc36c063d77ea1b8d4a5b2706956c0d6695f07dcc25", size = 86041 },
+    { url = "https://files.pythonhosted.org/packages/69/68/c8739671f5699c7dc470580a4f821ef37c32c4cb0b047ce223a7f115757f/yarl-1.23.0-py3-none-any.whl", hash = "sha256:a2df6afe50dea8ae15fa34c9f824a3ee958d785fd5d089063d960bae1daa0a3f", size = 48288 },
+]