refactor(etl-cache): extract pure cacheability gate

This commit is contained in:
CREDO23 2026-06-12 11:50:51 +02:00
parent 5af594c405
commit ce1e90386f
3 changed files with 35 additions and 9 deletions

View file

@ -7,12 +7,12 @@ import hashlib
import logging
from app.config import config
from app.etl_pipeline.cache.eligibility import is_parse_cacheable
from app.etl_pipeline.cache.schemas import ParseKey
from app.etl_pipeline.cache.service import EtlCacheService
from app.etl_pipeline.cache.settings import load_etl_cache_settings
from app.etl_pipeline.etl_document import EtlRequest, EtlResult
from app.etl_pipeline.etl_pipeline_service import EtlPipelineService
from app.etl_pipeline.file_classifier import FileCategory, classify_file
logger = logging.getLogger(__name__)
@ -25,13 +25,11 @@ async def extract_with_cache(
"""Drop-in for ``EtlPipelineService.extract`` that reuses prior parser output."""
settings = load_etl_cache_settings()
# Vision-LLM appends model-generated content not captured by the key, so its
# output must not be shared with plain parses (and vice versa): bypass cache.
cacheable = (
settings.enabled
and vision_llm is None
and bool(config.ETL_SERVICE)
and classify_file(request.filename) == FileCategory.DOCUMENT
cacheable = is_parse_cacheable(
filename=request.filename,
etl_service=config.ETL_SERVICE,
cache_enabled=settings.enabled,
has_vision_llm=vision_llm is not None,
)
if not cacheable:
return await EtlPipelineService(vision_llm=vision_llm).extract(request)

View file

@ -0,0 +1,28 @@
"""Gating rule: may this upload be served from / written to the parse cache?"""
from __future__ import annotations
from app.etl_pipeline.file_classifier import FileCategory, classify_file
def is_parse_cacheable(
*,
filename: str,
etl_service: str | None,
cache_enabled: bool,
has_vision_llm: bool,
) -> bool:
"""Only deterministic document parses are shareable across workspaces.
Vision-LLM runs append model-generated content not captured by the cache key,
and a missing ETL service means there is no document parser to key against --
both bypass the cache. Non-document categories (plaintext, audio, images,
direct-convert) are cheap or parser-agnostic and are handled outside it.
"""
if not cache_enabled:
return False
if has_vision_llm:
return False
if not etl_service:
return False
return classify_file(filename) == FileCategory.DOCUMENT

View file

@ -15,7 +15,7 @@ class ParseKey:
@classmethod
def for_document(
cls, source_sha256: str, *, etl_service: str, mode: str, version: int
) -> "ParseKey":
) -> ParseKey:
return cls(
source_sha256=source_sha256,
etl_service=etl_service,