From dc6a17930b1028f16554c695030ed683f375d32c Mon Sep 17 00:00:00 2001 From: Anish Sarkar <104695310+AnishSarkar22@users.noreply.github.com> Date: Thu, 4 Jun 2026 00:53:18 +0530 Subject: [PATCH] feat(evals): Remove summary ingest settings from evals --- .../surfsense_evals/core/clients/documents.py | 2 -- .../core/clients/search_space.py | 5 ---- .../surfsense_evals/core/ingest_settings.py | 25 +++---------------- .../suites/medical/cure/ingest.py | 3 +-- .../suites/medical/cure/runner.py | 1 - .../suites/medical/medxpertqa/ingest.py | 1 - .../suites/medical/medxpertqa/runner.py | 1 - .../suites/medical/mirage/ingest.py | 1 - .../suites/medical/mirage/runner.py | 1 - .../multimodal_doc/mmlongbench/ingest.py | 1 - .../multimodal_doc/mmlongbench/runner.py | 1 - .../suites/research/crag/ingest.py | 4 +-- .../suites/research/crag/runner.py | 1 - .../suites/research/frames/ingest.py | 4 +-- .../suites/research/frames/runner.py | 1 - surfsense_evals/tests/core/test_clients.py | 2 +- .../tests/core/test_ingest_settings.py | 24 +++++++++--------- 17 files changed, 19 insertions(+), 59 deletions(-) diff --git a/surfsense_evals/src/surfsense_evals/core/clients/documents.py b/surfsense_evals/src/surfsense_evals/core/clients/documents.py index 02bcf74da..362aae53b 100644 --- a/surfsense_evals/src/surfsense_evals/core/clients/documents.py +++ b/surfsense_evals/src/surfsense_evals/core/clients/documents.py @@ -110,7 +110,6 @@ class DocumentsClient: files: Iterable[Path], *, search_space_id: int, - should_summarize: bool = False, use_vision_llm: bool = False, processing_mode: str = "basic", ) -> FileUploadResult: @@ -149,7 +148,6 @@ class DocumentsClient: f"{self._base}/api/v1/documents/fileupload", data={ "search_space_id": str(search_space_id), - "should_summarize": "true" if should_summarize else "false", "use_vision_llm": "true" if use_vision_llm else "false", "processing_mode": processing_mode, }, diff --git a/surfsense_evals/src/surfsense_evals/core/clients/search_space.py b/surfsense_evals/src/surfsense_evals/core/clients/search_space.py index 37fa69f80..e2d37694d 100644 --- a/surfsense_evals/src/surfsense_evals/core/clients/search_space.py +++ b/surfsense_evals/src/surfsense_evals/core/clients/search_space.py @@ -83,7 +83,6 @@ class LlmPreferences: """ agent_llm_id: int | None - document_summary_llm_id: int | None image_generation_config_id: int | None vision_llm_config_id: int | None agent_llm: dict[str, Any] | None @@ -93,7 +92,6 @@ class LlmPreferences: def from_payload(cls, payload: dict[str, Any]) -> LlmPreferences: return cls( agent_llm_id=payload.get("agent_llm_id"), - document_summary_llm_id=payload.get("document_summary_llm_id"), image_generation_config_id=payload.get("image_generation_config_id"), vision_llm_config_id=payload.get("vision_llm_config_id"), agent_llm=payload.get("agent_llm"), @@ -154,7 +152,6 @@ class SearchSpaceClient: search_space_id: int, *, agent_llm_id: int | None = None, - document_summary_llm_id: int | None = None, image_generation_config_id: int | None = None, vision_llm_config_id: int | None = None, ) -> LlmPreferences: @@ -167,8 +164,6 @@ class SearchSpaceClient: body: dict[str, Any] = {} if agent_llm_id is not None: body["agent_llm_id"] = agent_llm_id - if document_summary_llm_id is not None: - body["document_summary_llm_id"] = document_summary_llm_id if image_generation_config_id is not None: body["image_generation_config_id"] = image_generation_config_id if vision_llm_config_id is not None: diff --git a/surfsense_evals/src/surfsense_evals/core/ingest_settings.py b/surfsense_evals/src/surfsense_evals/core/ingest_settings.py index 5cdece577..6c27abcd5 100644 --- a/surfsense_evals/src/surfsense_evals/core/ingest_settings.py +++ b/surfsense_evals/src/surfsense_evals/core/ingest_settings.py @@ -8,15 +8,13 @@ exactly three knobs (verified at * ``processing_mode`` — ``"basic"`` (default) | ``"premium"`` * ``use_vision_llm`` — ``bool`` (run vision LLM during ingest to extract image content / captions / tables) -* ``should_summarize`` — ``bool`` (generate document summary) This module gives every benchmark a uniform way to: 1. Receive sensible per-benchmark defaults (text-only benchmarks default vision off; image-bearing benchmarks default vision on). 2. Accept CLI overrides (``--use-vision-llm`` / ``--no-vision-llm``, - ``--processing-mode {basic,premium}``, - ``--should-summarize`` / ``--no-summarize``). + ``--processing-mode {basic,premium}``). 3. Persist the *actual* settings used into the doc-map manifest and the run artifact so reports can show "vision=ON, mode=premium → 65% accuracy" head-to-head with "vision=OFF, mode=basic → 52%". @@ -71,13 +69,11 @@ class IngestSettings: use_vision_llm: bool = False processing_mode: str = "basic" - should_summarize: bool = False def to_dict(self) -> dict[str, Any]: return { "use_vision_llm": self.use_vision_llm, "processing_mode": self.processing_mode, - "should_summarize": self.should_summarize, } @classmethod @@ -87,14 +83,13 @@ class IngestSettings: ``opts`` is the kwargs dict built by ``core.cli`` from the argparse namespace (see ``_cmd_ingest`` / ``_cmd_run``). Keys we look for: ``use_vision_llm`` (bool or None), ``processing_mode`` - (str or None), ``should_summarize`` (bool or None). Anything + (str or None). Anything else is ignored so benchmarks can pass through their own opts. """ return cls( use_vision_llm=_coerce_bool(opts.get("use_vision_llm"), defaults.use_vision_llm), processing_mode=_coerce_mode(opts.get("processing_mode"), defaults.processing_mode), - should_summarize=_coerce_bool(opts.get("should_summarize"), defaults.should_summarize), ) def render_label(self) -> str: @@ -102,8 +97,7 @@ class IngestSettings: return ( f"vision={'on' if self.use_vision_llm else 'off'}, " - f"mode={self.processing_mode}, " - f"summarize={'on' if self.should_summarize else 'off'}" + f"mode={self.processing_mode}" ) @@ -217,18 +211,6 @@ def add_ingest_settings_args( f"Default for this benchmark: {defaults.processing_mode!r}." ), ) - _add_bool_pair( - settings_group, - dest="should_summarize", - on_flag="--should-summarize", - off_flag="--no-summarize", - on_help=( - "Have SurfSense generate a document summary at ingest " - f"(default for this benchmark: " - f"{'on' if defaults.should_summarize else 'off'})." - ), - off_help="Skip per-document summary generation.", - ) # --------------------------------------------------------------------------- @@ -292,7 +274,6 @@ def format_ingest_settings_md(settings: Any) -> str: return "- SurfSense ingest settings: (not recorded — re-ingest to capture)" vision = "on" if settings.get("use_vision_llm") else "off" mode = settings.get("processing_mode") or "basic" - summarize = "on" if settings.get("should_summarize") else "off" return ( f"- SurfSense ingest settings: vision_llm=`{vision}`, " f"processing_mode=`{mode}`, summarize=`{summarize}`" diff --git a/surfsense_evals/src/surfsense_evals/suites/medical/cure/ingest.py b/surfsense_evals/src/surfsense_evals/suites/medical/cure/ingest.py index 6eca8810c..275e28ce5 100644 --- a/surfsense_evals/src/surfsense_evals/suites/medical/cure/ingest.py +++ b/surfsense_evals/src/surfsense_evals/suites/medical/cure/ingest.py @@ -160,8 +160,7 @@ async def run_ingest( upload_result = await docs_client.upload( files=[b.path for b in batches], search_space_id=ctx.search_space_id, - should_summarize=settings.should_summarize, - use_vision_llm=settings.use_vision_llm, + use_vision_llm=settings.use_vision_llm, processing_mode=settings.processing_mode, ) new_doc_ids = list(upload_result.document_ids) diff --git a/surfsense_evals/src/surfsense_evals/suites/medical/cure/runner.py b/surfsense_evals/src/surfsense_evals/suites/medical/cure/runner.py index 416912b14..041e0e8b5 100644 --- a/surfsense_evals/src/surfsense_evals/suites/medical/cure/runner.py +++ b/surfsense_evals/src/surfsense_evals/suites/medical/cure/runner.py @@ -63,7 +63,6 @@ _DESCRIPTION = "CUREv1 retrieval (single-arm SurfSense): Recall@k / MRR / nDCG@1 _DEFAULT_INGEST_SETTINGS = IngestSettings( use_vision_llm=False, processing_mode="basic", - should_summarize=False, ) diff --git a/surfsense_evals/src/surfsense_evals/suites/medical/medxpertqa/ingest.py b/surfsense_evals/src/surfsense_evals/suites/medical/medxpertqa/ingest.py index 5293e116f..ff43c7049 100644 --- a/surfsense_evals/src/surfsense_evals/suites/medical/medxpertqa/ingest.py +++ b/surfsense_evals/src/surfsense_evals/suites/medical/medxpertqa/ingest.py @@ -208,7 +208,6 @@ async def _upload_pdfs( result = await docs_client.upload( files=batch, search_space_id=ctx.search_space_id, - should_summarize=settings.should_summarize, use_vision_llm=settings.use_vision_llm, processing_mode=settings.processing_mode, ) diff --git a/surfsense_evals/src/surfsense_evals/suites/medical/medxpertqa/runner.py b/surfsense_evals/src/surfsense_evals/suites/medical/medxpertqa/runner.py index 75646ef32..e1a830138 100644 --- a/surfsense_evals/src/surfsense_evals/suites/medical/medxpertqa/runner.py +++ b/surfsense_evals/src/surfsense_evals/suites/medical/medxpertqa/runner.py @@ -169,7 +169,6 @@ _DESCRIPTION = ( _DEFAULT_INGEST_SETTINGS = IngestSettings( use_vision_llm=True, processing_mode="basic", - should_summarize=False, ) diff --git a/surfsense_evals/src/surfsense_evals/suites/medical/mirage/ingest.py b/surfsense_evals/src/surfsense_evals/suites/medical/mirage/ingest.py index 9769d078b..59006b6c0 100644 --- a/surfsense_evals/src/surfsense_evals/suites/medical/mirage/ingest.py +++ b/surfsense_evals/src/surfsense_evals/suites/medical/mirage/ingest.py @@ -480,7 +480,6 @@ async def run_ingest( upload_result = await docs_client.upload( files=[b.path for b in batches], search_space_id=ctx.search_space_id, - should_summarize=settings.should_summarize, use_vision_llm=settings.use_vision_llm, processing_mode=settings.processing_mode, ) diff --git a/surfsense_evals/src/surfsense_evals/suites/medical/mirage/runner.py b/surfsense_evals/src/surfsense_evals/suites/medical/mirage/runner.py index 0f336c0d5..b01b645a9 100644 --- a/surfsense_evals/src/surfsense_evals/suites/medical/mirage/runner.py +++ b/surfsense_evals/src/surfsense_evals/suites/medical/mirage/runner.py @@ -48,7 +48,6 @@ _DESCRIPTION = "MIRAGE (7,663 medical MCQs) — single-arm SurfSense per-task ac _DEFAULT_INGEST_SETTINGS = IngestSettings( use_vision_llm=False, processing_mode="basic", - should_summarize=False, ) diff --git a/surfsense_evals/src/surfsense_evals/suites/multimodal_doc/mmlongbench/ingest.py b/surfsense_evals/src/surfsense_evals/suites/multimodal_doc/mmlongbench/ingest.py index cf0572df8..15cdbeb77 100644 --- a/surfsense_evals/src/surfsense_evals/suites/multimodal_doc/mmlongbench/ingest.py +++ b/surfsense_evals/src/surfsense_evals/suites/multimodal_doc/mmlongbench/ingest.py @@ -225,7 +225,6 @@ async def _upload_pdfs( result = await docs_client.upload( files=batch, search_space_id=ctx.search_space_id, - should_summarize=settings.should_summarize, use_vision_llm=settings.use_vision_llm, processing_mode=settings.processing_mode, ) diff --git a/surfsense_evals/src/surfsense_evals/suites/multimodal_doc/mmlongbench/runner.py b/surfsense_evals/src/surfsense_evals/suites/multimodal_doc/mmlongbench/runner.py index 0e352d7ae..95a1e15eb 100644 --- a/surfsense_evals/src/surfsense_evals/suites/multimodal_doc/mmlongbench/runner.py +++ b/surfsense_evals/src/surfsense_evals/suites/multimodal_doc/mmlongbench/runner.py @@ -178,7 +178,6 @@ _TEXT_ONLY_HINTS = ("gpt-5.4-mini", "gpt-3.5", "text-only", "instruct-") _DEFAULT_INGEST_SETTINGS = IngestSettings( use_vision_llm=True, processing_mode="basic", - should_summarize=False, ) diff --git a/surfsense_evals/src/surfsense_evals/suites/research/crag/ingest.py b/surfsense_evals/src/surfsense_evals/suites/research/crag/ingest.py index aad6a70bf..4e0c2bdc5 100644 --- a/surfsense_evals/src/surfsense_evals/suites/research/crag/ingest.py +++ b/surfsense_evals/src/surfsense_evals/suites/research/crag/ingest.py @@ -189,7 +189,6 @@ async def _upload_pages( result = await docs_client.upload( files=batch, search_space_id=ctx.search_space_id, - should_summarize=settings.should_summarize, use_vision_llm=settings.use_vision_llm, processing_mode=settings.processing_mode, ) @@ -306,8 +305,7 @@ async def run_ingest( settings = settings or IngestSettings( use_vision_llm=False, processing_mode="basic", - should_summarize=False, - ) + ) bench_dir = ctx.benchmark_data_dir() pages_dir = bench_dir / "pages" raw_cache = bench_dir / ".raw_cache" diff --git a/surfsense_evals/src/surfsense_evals/suites/research/crag/runner.py b/surfsense_evals/src/surfsense_evals/suites/research/crag/runner.py index 710f76744..8b759e0d8 100644 --- a/surfsense_evals/src/surfsense_evals/suites/research/crag/runner.py +++ b/surfsense_evals/src/surfsense_evals/suites/research/crag/runner.py @@ -177,7 +177,6 @@ _DESCRIPTION = ( _DEFAULT_INGEST_SETTINGS = IngestSettings( use_vision_llm=False, processing_mode="basic", - should_summarize=False, ) diff --git a/surfsense_evals/src/surfsense_evals/suites/research/frames/ingest.py b/surfsense_evals/src/surfsense_evals/suites/research/frames/ingest.py index 9780be4ed..98e035f28 100644 --- a/surfsense_evals/src/surfsense_evals/suites/research/frames/ingest.py +++ b/surfsense_evals/src/surfsense_evals/suites/research/frames/ingest.py @@ -136,7 +136,6 @@ async def _upload_markdowns( result = await docs_client.upload( files=batch, search_space_id=ctx.search_space_id, - should_summarize=settings.should_summarize, use_vision_llm=settings.use_vision_llm, processing_mode=settings.processing_mode, ) @@ -240,8 +239,7 @@ async def run_ingest( settings = settings or IngestSettings( use_vision_llm=False, processing_mode="basic", - should_summarize=False, - ) + ) bench_dir = ctx.benchmark_data_dir() wiki_cache = bench_dir / "wiki" wiki_cache.mkdir(parents=True, exist_ok=True) diff --git a/surfsense_evals/src/surfsense_evals/suites/research/frames/runner.py b/surfsense_evals/src/surfsense_evals/suites/research/frames/runner.py index a8dde0dd2..9c0e16b00 100644 --- a/surfsense_evals/src/surfsense_evals/suites/research/frames/runner.py +++ b/surfsense_evals/src/surfsense_evals/suites/research/frames/runner.py @@ -153,7 +153,6 @@ _DESCRIPTION = ( _DEFAULT_INGEST_SETTINGS = IngestSettings( use_vision_llm=False, processing_mode="basic", - should_summarize=False, ) diff --git a/surfsense_evals/tests/core/test_clients.py b/surfsense_evals/tests/core/test_clients.py index 9e2c4ad75..611408703 100644 --- a/surfsense_evals/tests/core/test_clients.py +++ b/surfsense_evals/tests/core/test_clients.py @@ -69,7 +69,7 @@ async def test_set_llm_preferences_partial_update(respx_mock, http): 200, json={ "agent_llm_id": -10042, - "document_summary_llm_id": None, + "agent_llm_id": None, "image_generation_config_id": None, "vision_llm_config_id": None, "agent_llm": { diff --git a/surfsense_evals/tests/core/test_ingest_settings.py b/surfsense_evals/tests/core/test_ingest_settings.py index acfac57a6..afbfc709d 100644 --- a/surfsense_evals/tests/core/test_ingest_settings.py +++ b/surfsense_evals/tests/core/test_ingest_settings.py @@ -40,7 +40,7 @@ from surfsense_evals.core.ingest_settings import ( class TestMerge: def test_silent_operator_uses_defaults(self) -> None: - defaults = IngestSettings(use_vision_llm=True, processing_mode="basic", should_summarize=True) + defaults = IngestSettings(use_vision_llm=True, processing_mode="basic") merged = IngestSettings.merge(defaults, {}) assert merged == defaults @@ -111,16 +111,16 @@ class TestMerge: assert merged.processing_mode == "basic" def test_to_dict_round_trips(self) -> None: - s = IngestSettings(use_vision_llm=True, processing_mode="premium", should_summarize=False) + s = IngestSettings(use_vision_llm=True, processing_mode="premium") d = s.to_dict() assert d == { "use_vision_llm": True, "processing_mode": "premium", - "should_summarize": False, + "use_vision_llm": False, } def test_render_label_format(self) -> None: - s = IngestSettings(use_vision_llm=True, processing_mode="premium", should_summarize=True) + s = IngestSettings(use_vision_llm=True, processing_mode="premium") assert s.render_label() == "vision=on, mode=premium, summarize=on" @@ -136,7 +136,7 @@ class TestAddArgs: add_ingest_settings_args( p, defaults=IngestSettings( - use_vision_llm=False, processing_mode="basic", should_summarize=False + use_vision_llm=False, processing_mode="basic" ), ) return p @@ -145,7 +145,7 @@ class TestAddArgs: args = parser.parse_args([]) assert args.use_vision_llm is None assert args.processing_mode is None - assert args.should_summarize is None + assert args.use_vision_llm is None def test_use_vision_llm_flag(self, parser: argparse.ArgumentParser) -> None: args = parser.parse_args(["--use-vision-llm"]) @@ -168,9 +168,9 @@ class TestAddArgs: def test_summarize_flag_pair(self, parser: argparse.ArgumentParser) -> None: on = parser.parse_args(["--should-summarize"]) - assert on.should_summarize is True + assert on.use_vision_llm is True off = parser.parse_args(["--no-summarize"]) - assert off.should_summarize is False + assert off.use_vision_llm is False def test_vision_flags_mutually_exclusive( self, parser: argparse.ArgumentParser @@ -185,11 +185,11 @@ class TestAddArgs: ["--use-vision-llm", "--processing-mode", "premium"] ) defaults = IngestSettings( - use_vision_llm=False, processing_mode="basic", should_summarize=False + use_vision_llm=False, processing_mode="basic" ) merged = IngestSettings.merge(defaults, vars(args)) assert merged == IngestSettings( - use_vision_llm=True, processing_mode="premium", should_summarize=False + use_vision_llm=True, processing_mode="premium" ) @@ -249,7 +249,7 @@ class TestHeader: class TestFormatMd: def test_full_settings(self) -> None: out = format_ingest_settings_md( - {"use_vision_llm": True, "processing_mode": "premium", "should_summarize": True} + {"use_vision_llm": True, "processing_mode": "premium", "use_vision_llm": True} ) assert "vision_llm=`on`" in out assert "processing_mode=`premium`" in out @@ -257,7 +257,7 @@ class TestFormatMd: def test_default_off(self) -> None: out = format_ingest_settings_md( - {"use_vision_llm": False, "processing_mode": "basic", "should_summarize": False} + {"use_vision_llm": False, "processing_mode": "basic", "use_vision_llm": False} ) assert "vision_llm=`off`" in out assert "processing_mode=`basic`" in out