refactor(etl-cache): extract pure cacheability gate

2026-06-12 20:45:20 +02:00 · 2026-06-12 11:50:51 +02:00 · 2026-06-12 11:50:51 +02:00 · ce1e90386f
commit ce1e90386f
parent 5af594c405
3 changed files with 35 additions and 9 deletions
--- a/surfsense_backend/app/etl_pipeline/cache/cached_extraction.py
+++ b/surfsense_backend/app/etl_pipeline/cache/cached_extraction.py
@ -7,12 +7,12 @@ import hashlib
 import logging

 from app.config import config
+from app.etl_pipeline.cache.eligibility import is_parse_cacheable
 from app.etl_pipeline.cache.schemas import ParseKey
 from app.etl_pipeline.cache.service import EtlCacheService
 from app.etl_pipeline.cache.settings import load_etl_cache_settings
 from app.etl_pipeline.etl_document import EtlRequest, EtlResult
 from app.etl_pipeline.etl_pipeline_service import EtlPipelineService
-from app.etl_pipeline.file_classifier import FileCategory, classify_file

 logger = logging.getLogger(__name__)

@ -25,13 +25,11 @@ async def extract_with_cache(
    """Drop-in for ``EtlPipelineService.extract`` that reuses prior parser output."""
    settings = load_etl_cache_settings()

-    # Vision-LLM appends model-generated content not captured by the key, so its
-    # output must not be shared with plain parses (and vice versa): bypass cache.
-    cacheable = (
-        settings.enabled
-        and vision_llm is None
-        and bool(config.ETL_SERVICE)
-        and classify_file(request.filename) == FileCategory.DOCUMENT
+    cacheable = is_parse_cacheable(
+        filename=request.filename,
+        etl_service=config.ETL_SERVICE,
+        cache_enabled=settings.enabled,
+        has_vision_llm=vision_llm is not None,
    )
    if not cacheable:
        return await EtlPipelineService(vision_llm=vision_llm).extract(request)
--- a/surfsense_backend/app/etl_pipeline/cache/eligibility.py
+++ b/surfsense_backend/app/etl_pipeline/cache/eligibility.py
@ -0,0 +1,28 @@
+"""Gating rule: may this upload be served from / written to the parse cache?"""
+
+from __future__ import annotations
+
+from app.etl_pipeline.file_classifier import FileCategory, classify_file
+
+
+def is_parse_cacheable(
+    *,
+    filename: str,
+    etl_service: str | None,
+    cache_enabled: bool,
+    has_vision_llm: bool,
+) -> bool:
+    """Decide whether an upload may be read from / written to the parse cache.
+
+    Bypassed when the cache is off, when a vision LLM is attached (its
+    model-generated additions are not captured by the cache key), or when no
+    ETL service is set, since there is then no document parser to key against.
+    Non-document categories (plaintext, audio, images, direct-convert) are
+    cheap or parser-agnostic and are handled outside the cache.
+    """
+    bypass = (not cache_enabled) or has_vision_llm or (not etl_service)
+    if bypass:
+        return False
+
+    category = classify_file(filename)
+    return category == FileCategory.DOCUMENT
--- a/surfsense_backend/app/etl_pipeline/cache/schemas/parse_key.py
+++ b/surfsense_backend/app/etl_pipeline/cache/schemas/parse_key.py
@ -15,7 +15,7 @@ class ParseKey:
    @classmethod
    def for_document(
        cls, source_sha256: str, *, etl_service: str, mode: str, version: int
-    ) -> "ParseKey":
+    ) -> ParseKey:
        return cls(
            source_sha256=source_sha256,
            etl_service=etl_service,