From ce1e90386fac8437a9a855e458d54b2e212e1424 Mon Sep 17 00:00:00 2001
From: CREDO23 <bakerathierry@gmail.com>
Date: Fri, 12 Jun 2026 11:50:51 +0200
Subject: [PATCH] refactor(etl-cache): extract pure cacheability gate

---
 .../etl_pipeline/cache/cached_extraction.py   | 14 ++++------
 .../app/etl_pipeline/cache/eligibility.py     | 28 +++++++++++++++++++
 .../etl_pipeline/cache/schemas/parse_key.py   |  2 +-
 3 files changed, 35 insertions(+), 9 deletions(-)
 create mode 100644 surfsense_backend/app/etl_pipeline/cache/eligibility.py

diff --git a/surfsense_backend/app/etl_pipeline/cache/cached_extraction.py b/surfsense_backend/app/etl_pipeline/cache/cached_extraction.py
index 5348c5f4b..dba4b44da 100644
--- a/surfsense_backend/app/etl_pipeline/cache/cached_extraction.py
+++ b/surfsense_backend/app/etl_pipeline/cache/cached_extraction.py
@@ -7,12 +7,12 @@ import hashlib
 import logging
 
 from app.config import config
+from app.etl_pipeline.cache.eligibility import is_parse_cacheable
 from app.etl_pipeline.cache.schemas import ParseKey
 from app.etl_pipeline.cache.service import EtlCacheService
 from app.etl_pipeline.cache.settings import load_etl_cache_settings
 from app.etl_pipeline.etl_document import EtlRequest, EtlResult
 from app.etl_pipeline.etl_pipeline_service import EtlPipelineService
-from app.etl_pipeline.file_classifier import FileCategory, classify_file
 
 logger = logging.getLogger(__name__)
 
@@ -25,13 +25,11 @@ async def extract_with_cache(
     """Drop-in for ``EtlPipelineService.extract`` that reuses prior parser output."""
     settings = load_etl_cache_settings()
 
-    # Vision-LLM appends model-generated content not captured by the key, so its
-    # output must not be shared with plain parses (and vice versa): bypass cache.
-    cacheable = (
-        settings.enabled
-        and vision_llm is None
-        and bool(config.ETL_SERVICE)
-        and classify_file(request.filename) == FileCategory.DOCUMENT
+    cacheable = is_parse_cacheable(
+        filename=request.filename,
+        etl_service=config.ETL_SERVICE,
+        cache_enabled=settings.enabled,
+        has_vision_llm=vision_llm is not None,
     )
     if not cacheable:
         return await EtlPipelineService(vision_llm=vision_llm).extract(request)
diff --git a/surfsense_backend/app/etl_pipeline/cache/eligibility.py b/surfsense_backend/app/etl_pipeline/cache/eligibility.py
new file mode 100644
index 000000000..18f096218
--- /dev/null
+++ b/surfsense_backend/app/etl_pipeline/cache/eligibility.py
@@ -0,0 +1,28 @@
+"""Gating rule: may this upload be served from / written to the parse cache?"""
+
+from __future__ import annotations
+
+from app.etl_pipeline.file_classifier import FileCategory, classify_file
+
+
+def is_parse_cacheable(
+    *,
+    filename: str,
+    etl_service: str | None,
+    cache_enabled: bool,
+    has_vision_llm: bool,
+) -> bool:
+    """Return whether this upload may be read from / written to the parse cache.
+
+    Only deterministic document parses are shareable across workspaces: the
+    cache must be enabled, no vision LLM in use, an ETL service configured, and
+    ``filename`` must classify as a document. Non-document categories (plaintext,
+    audio, images, direct-convert) are cheap or parser-agnostic; handled outside.
+    """
+    if not cache_enabled:
+        return False
+    if has_vision_llm:  # model-generated content is not captured by the cache key
+        return False
+    if not etl_service:  # no document parser to key against
+        return False
+    return classify_file(filename) == FileCategory.DOCUMENT
diff --git a/surfsense_backend/app/etl_pipeline/cache/schemas/parse_key.py b/surfsense_backend/app/etl_pipeline/cache/schemas/parse_key.py
index 65e7b08a5..88133a418 100644
--- a/surfsense_backend/app/etl_pipeline/cache/schemas/parse_key.py
+++ b/surfsense_backend/app/etl_pipeline/cache/schemas/parse_key.py
@@ -15,7 +15,7 @@ class ParseKey:
     @classmethod
     def for_document(
         cls, source_sha256: str, *, etl_service: str, mode: str, version: int
-    ) -> "ParseKey":
+    ) -> ParseKey:
         return cls(
             source_sha256=source_sha256,
             etl_service=etl_service,