refactor(tests): Update tests to remove summary references and adjust for embedding errors

2026-07-24 23:41:10 +02:00 · 2026-06-04 01:51:21 +05:30 · 2026-06-04 01:51:21 +05:30 · e588782a9b
commit e588782a9b
parent e4d7b01b09
17 changed files with 69 additions and 148 deletions
--- a/surfsense_evals/README.md
+++ b/surfsense_evals/README.md
@@ -137,15 +137,14 @@ Notes:
 - `--skip-unanswerable` (run) — drop unanswerable questions
 - `--docs <a.pdf>,<b.pdf>` (run) — scope to specific docs

-## Ingestion knobs (vision LLM, processing mode, summarize)
+## Ingestion knobs (vision LLM, processing mode)

-The harness exposes `POST /api/v1/documents/fileupload`'s three knobs on every `ingest` subcommand:
+The harness exposes `POST /api/v1/documents/fileupload`'s ingest knobs on every `ingest` subcommand:

 | Flag pair                                  | Effect                                                                                  |
 |--------------------------------------------|-----------------------------------------------------------------------------------------|
 | `--use-vision-llm` / `--no-vision-llm`     | Walk every embedded image in the PDF and inline image-derived text at the image's position (see below). |
 | `--processing-mode {basic,premium}`        | `premium` carries a 10× page multiplier and routes to a stronger ETL (e.g. LlamaCloud). |
-| `--should-summarize` / `--no-summarize`    | Generate a per-document summary at ingest.                                              |

 The "Default ingest" column in the benchmarks table is what runs if you don't pass any flag. Whatever was actually used is recorded as a `__settings__` header in the doc map (`data/<suite>/maps/<benchmark>_*_map.jsonl`) and as `extra.ingest_settings` in `run_artifact.json`, then surfaced in the report — no need to hunt through CLI history.

--- a/surfsense_evals/src/surfsense_evals/core/ingest_settings.py
+++ b/surfsense_evals/src/surfsense_evals/core/ingest_settings.py
@@ -173,14 +173,14 @@ def add_ingest_settings_args(
    *,
    defaults: IngestSettings,
 ) -> None:
-    """Attach the three ingest-settings flag pairs to ``parser``.
+    """Attach ingest-settings flags to ``parser``.

-    Each bool exposes a mutually exclusive ``--foo`` / ``--no-foo``
-    pair so an operator can flip either direction without restating
-    every flag. Default is ``None`` so that "operator didn't pass the
-    flag" is distinguishable from "operator explicitly passed false"
-    — ``IngestSettings.merge`` then folds in the benchmark default
-    only when the operator was silent.
+    The vision bool exposes a mutually exclusive ``--use-vision-llm`` /
+    ``--no-vision-llm`` pair so an operator can flip either direction
+    without restating every flag. Default is ``None`` so that "operator
+    didn't pass the flag" is distinguishable from "operator explicitly
+    passed false" — ``IngestSettings.merge`` then folds in the benchmark
+    default only when the operator was silent.
    """

    settings_group = parser.add_argument_group(
@@ -276,7 +276,7 @@ def format_ingest_settings_md(settings: Any) -> str:
    mode = settings.get("processing_mode") or "basic"
    return (
        f"- SurfSense ingest settings: vision_llm=`{vision}`, "
-        f"processing_mode=`{mode}`, summarize=`{summarize}`"
+        f"processing_mode=`{mode}`"
    )


--- a/surfsense_evals/tests/core/test_ingest_settings.py
+++ b/surfsense_evals/tests/core/test_ingest_settings.py
@@ -4,7 +4,7 @@ Covers:

 * ``IngestSettings.merge`` honours operator overrides and falls back
  to per-benchmark defaults when the operator is silent.
-* ``add_ingest_settings_args`` exposes the three flag pairs and
+* ``add_ingest_settings_args`` exposes ingest settings flags and
  argparse defaults of ``None`` correctly distinguish "not passed"
  from "explicitly false".
 * ``settings_header_line`` / ``read_settings_header`` round-trip
@@ -116,12 +116,11 @@ class TestMerge:
        assert d == {
            "use_vision_llm": True,
            "processing_mode": "premium",
-            "use_vision_llm": False,
        }

    def test_render_label_format(self) -> None:
        s = IngestSettings(use_vision_llm=True, processing_mode="premium")
-        assert s.render_label() == "vision=on, mode=premium, summarize=on"
+        assert s.render_label() == "vision=on, mode=premium"


 # ---------------------------------------------------------------------------
@@ -145,7 +144,6 @@ class TestAddArgs:
        args = parser.parse_args([])
        assert args.use_vision_llm is None
        assert args.processing_mode is None
-        assert args.use_vision_llm is None

    def test_use_vision_llm_flag(self, parser: argparse.ArgumentParser) -> None:
        args = parser.parse_args(["--use-vision-llm"])
@@ -166,12 +164,6 @@ class TestAddArgs:
        with pytest.raises(SystemExit):
            parser.parse_args(["--processing-mode", "exotic"])

-    def test_summarize_flag_pair(self, parser: argparse.ArgumentParser) -> None:
-        on = parser.parse_args(["--should-summarize"])
-        assert on.use_vision_llm is True
-        off = parser.parse_args(["--no-summarize"])
-        assert off.use_vision_llm is False
-
    def test_vision_flags_mutually_exclusive(
        self, parser: argparse.ArgumentParser
    ) -> None:
@@ -249,19 +241,17 @@ class TestHeader:
 class TestFormatMd:
    def test_full_settings(self) -> None:
        out = format_ingest_settings_md(
-            {"use_vision_llm": True, "processing_mode": "premium", "use_vision_llm": True}
+            {"use_vision_llm": True, "processing_mode": "premium"}
        )
        assert "vision_llm=`on`" in out
        assert "processing_mode=`premium`" in out
-        assert "summarize=`on`" in out

    def test_default_off(self) -> None:
        out = format_ingest_settings_md(
-            {"use_vision_llm": False, "processing_mode": "basic", "use_vision_llm": False}
+            {"use_vision_llm": False, "processing_mode": "basic"}
        )
        assert "vision_llm=`off`" in out
        assert "processing_mode=`basic`" in out
-        assert "summarize=`off`" in out

    def test_missing_returns_re_ingest_hint(self) -> None:
        # Empty dict + None + non-mapping should all degrade gracefully.