mirror of
https://github.com/MODSetter/SurfSense.git
synced 2026-06-12 20:45:20 +02:00
refactor(etl-cache): extract pure cacheability gate
This commit is contained in:
parent
5af594c405
commit
ce1e90386f
3 changed files with 35 additions and 9 deletions
|
|
@ -7,12 +7,12 @@ import hashlib
|
|||
import logging
|
||||
|
||||
from app.config import config
|
||||
from app.etl_pipeline.cache.eligibility import is_parse_cacheable
|
||||
from app.etl_pipeline.cache.schemas import ParseKey
|
||||
from app.etl_pipeline.cache.service import EtlCacheService
|
||||
from app.etl_pipeline.cache.settings import load_etl_cache_settings
|
||||
from app.etl_pipeline.etl_document import EtlRequest, EtlResult
|
||||
from app.etl_pipeline.etl_pipeline_service import EtlPipelineService
|
||||
from app.etl_pipeline.file_classifier import FileCategory, classify_file
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
|
@ -25,13 +25,11 @@ async def extract_with_cache(
|
|||
"""Drop-in for ``EtlPipelineService.extract`` that reuses prior parser output."""
|
||||
settings = load_etl_cache_settings()
|
||||
|
||||
# Vision-LLM appends model-generated content not captured by the key, so its
|
||||
# output must not be shared with plain parses (and vice versa): bypass cache.
|
||||
cacheable = (
|
||||
settings.enabled
|
||||
and vision_llm is None
|
||||
and bool(config.ETL_SERVICE)
|
||||
and classify_file(request.filename) == FileCategory.DOCUMENT
|
||||
cacheable = is_parse_cacheable(
|
||||
filename=request.filename,
|
||||
etl_service=config.ETL_SERVICE,
|
||||
cache_enabled=settings.enabled,
|
||||
has_vision_llm=vision_llm is not None,
|
||||
)
|
||||
if not cacheable:
|
||||
return await EtlPipelineService(vision_llm=vision_llm).extract(request)
|
||||
|
|
|
|||
28
surfsense_backend/app/etl_pipeline/cache/eligibility.py
vendored
Normal file
28
surfsense_backend/app/etl_pipeline/cache/eligibility.py
vendored
Normal file
|
|
@ -0,0 +1,28 @@
|
|||
"""Gating rule: may this upload be served from / written to the parse cache?"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from app.etl_pipeline.file_classifier import FileCategory, classify_file
|
||||
|
||||
|
||||
def is_parse_cacheable(
    *,
    filename: str,
    etl_service: str | None,
    cache_enabled: bool,
    has_vision_llm: bool,
) -> bool:
    """Decide whether an upload may be read from / written to the parse cache.

    The cache is bypassed (``False``) when any of the following holds:

    * caching is switched off (``cache_enabled`` is falsy);
    * a vision LLM is in play -- its runs append model-generated content
      that the cache key does not capture;
    * no ETL service is configured (``etl_service`` is ``None`` or empty),
      so there is no document parser to key against.

    Otherwise the result depends solely on the file's category: only files
    that ``classify_file`` reports as ``FileCategory.DOCUMENT`` are
    cacheable. Other categories (plaintext, audio, images, direct-convert)
    are handled outside the cache.
    """
    # Checked in this order so classification only runs when none of the
    # bypass conditions applies.
    must_bypass = (not cache_enabled) or has_vision_llm or (not etl_service)
    if must_bypass:
        return False

    category = classify_file(filename)
    return category == FileCategory.DOCUMENT
|
||||
|
|
@ -15,7 +15,7 @@ class ParseKey:
|
|||
@classmethod
|
||||
def for_document(
|
||||
cls, source_sha256: str, *, etl_service: str, mode: str, version: int
|
||||
) -> "ParseKey":
|
||||
) -> ParseKey:
|
||||
return cls(
|
||||
source_sha256=source_sha256,
|
||||
etl_service=etl_service,
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue