SurfSense/surfsense_backend/app/etl_pipeline/cache/eligibility.py

"""Gating rule: may this upload be served from / written to the parse cache?"""

from __future__ import annotations

from app.etl_pipeline.file_classifier import FileCategory, classify_file


def is_parse_cacheable(
    *,
    filename: str,
    etl_service: str | None,
    cache_enabled: bool,
    has_vision_llm: bool,
) -> bool:
    """Only deterministic document parses are shareable across workspaces.

    Vision-LLM runs append model-generated content not captured by the cache key,
    and a missing ETL service means there is no document parser to key against --
    both bypass the cache. Non-document categories (plaintext, audio, images,
    direct-convert) are cheap or parser-agnostic and are handled outside it.
    """
    if not cache_enabled:
        return False
    if has_vision_llm:
        return False
    if not etl_service:
        return False
    return classify_file(filename) == FileCategory.DOCUMENT