From ce1e90386fac8437a9a855e458d54b2e212e1424 Mon Sep 17 00:00:00 2001 From: CREDO23 Date: Fri, 12 Jun 2026 11:50:51 +0200 Subject: [PATCH] refactor(etl-cache): extract pure cacheability gate --- .../etl_pipeline/cache/cached_extraction.py | 14 ++++------ .../app/etl_pipeline/cache/eligibility.py | 28 +++++++++++++++++++ .../etl_pipeline/cache/schemas/parse_key.py | 2 +- 3 files changed, 35 insertions(+), 9 deletions(-) create mode 100644 surfsense_backend/app/etl_pipeline/cache/eligibility.py diff --git a/surfsense_backend/app/etl_pipeline/cache/cached_extraction.py b/surfsense_backend/app/etl_pipeline/cache/cached_extraction.py index 5348c5f4b..dba4b44da 100644 --- a/surfsense_backend/app/etl_pipeline/cache/cached_extraction.py +++ b/surfsense_backend/app/etl_pipeline/cache/cached_extraction.py @@ -7,12 +7,12 @@ import hashlib import logging from app.config import config +from app.etl_pipeline.cache.eligibility import is_parse_cacheable from app.etl_pipeline.cache.schemas import ParseKey from app.etl_pipeline.cache.service import EtlCacheService from app.etl_pipeline.cache.settings import load_etl_cache_settings from app.etl_pipeline.etl_document import EtlRequest, EtlResult from app.etl_pipeline.etl_pipeline_service import EtlPipelineService -from app.etl_pipeline.file_classifier import FileCategory, classify_file logger = logging.getLogger(__name__) @@ -25,13 +25,11 @@ async def extract_with_cache( """Drop-in for ``EtlPipelineService.extract`` that reuses prior parser output.""" settings = load_etl_cache_settings() - # Vision-LLM appends model-generated content not captured by the key, so its - # output must not be shared with plain parses (and vice versa): bypass cache. - cacheable = ( - settings.enabled - and vision_llm is None - and bool(config.ETL_SERVICE) - and classify_file(request.filename) == FileCategory.DOCUMENT + cacheable = is_parse_cacheable( + filename=request.filename, + etl_service=config.ETL_SERVICE, + cache_enabled=settings.enabled, + has_vision_llm=vision_llm is not None, ) if not cacheable: return await EtlPipelineService(vision_llm=vision_llm).extract(request) diff --git a/surfsense_backend/app/etl_pipeline/cache/eligibility.py b/surfsense_backend/app/etl_pipeline/cache/eligibility.py new file mode 100644 index 000000000..18f096218 --- /dev/null +++ b/surfsense_backend/app/etl_pipeline/cache/eligibility.py @@ -0,0 +1,28 @@ +"""Gating rule: may this upload be served from / written to the parse cache?""" + +from __future__ import annotations + +from app.etl_pipeline.file_classifier import FileCategory, classify_file + + +def is_parse_cacheable( + *, + filename: str, + etl_service: str | None, + cache_enabled: bool, + has_vision_llm: bool, +) -> bool: + """Only deterministic document parses are shareable across workspaces. + + Vision-LLM runs append model-generated content not captured by the cache key, + and a missing ETL service means there is no document parser to key against -- + both bypass the cache. Non-document categories (plaintext, audio, images, + direct-convert) are cheap or parser-agnostic and are handled outside it. + """ + if not cache_enabled: + return False + if has_vision_llm: + return False + if not etl_service: + return False + return classify_file(filename) == FileCategory.DOCUMENT diff --git a/surfsense_backend/app/etl_pipeline/cache/schemas/parse_key.py b/surfsense_backend/app/etl_pipeline/cache/schemas/parse_key.py index 65e7b08a5..88133a418 100644 --- a/surfsense_backend/app/etl_pipeline/cache/schemas/parse_key.py +++ b/surfsense_backend/app/etl_pipeline/cache/schemas/parse_key.py @@ -15,7 +15,7 @@ class ParseKey: @classmethod def for_document( cls, source_sha256: str, *, etl_service: str, mode: str, version: int - ) -> "ParseKey": + ) -> ParseKey: return cls( source_sha256=source_sha256, etl_service=etl_service,