test(etl-cache): cover content-addressing dedup and key shape

This commit is contained in:
CREDO23 2026-06-12 11:50:52 +02:00
parent ce1e90386f
commit dddacbe762
2 changed files with 98 additions and 0 deletions

View file

@ -0,0 +1,28 @@
"""Stub the cache package __init__s so unit tests import only pure leaf modules.
The real ``cache``/``storage``/``eviction``/``persistence`` __init__s eagerly
import the facade, file storage, Celery, and ``app.db`` -- none of which a pure
unit test should need. Turning those packages into bare namespace packages lets
``from app.etl_pipeline.cache.<pkg>.<leaf> import ...`` resolve the leaf module
without running the heavy __init__. ``schemas`` is left real (it is pure).
"""
import sys
import types
from pathlib import Path
# On-disk directory of the real ``app.etl_pipeline.cache`` package.
# ``parents[4]`` is the ancestor four levels above this file's own directory --
# presumably the project root that contains ``app/``; re-check the index if this
# file is ever moved to a different depth.
_CACHE_DIR = Path(__file__).resolve().parents[4] / "app" / "etl_pipeline" / "cache"
def _stub_namespace_package(dotted: str, fs_dir: Path) -> None:
    """Register an empty stand-in package named *dotted* in ``sys.modules``.

    The stand-in is a bare module object whose ``__path__`` points at
    *fs_dir*, so the import system treats it as a package and looks for
    submodules in that directory without executing the package's real
    ``__init__``.

    If *dotted* is already present in ``sys.modules`` the existing entry is
    left untouched.
    """
    if dotted not in sys.modules:
        stub = types.ModuleType(dotted)
        # A module with ``__path__`` is a package as far as imports go.
        stub.__path__ = [str(fs_dir)]
        stub.__package__ = dotted
        sys.modules[dotted] = stub
# Register the stub packages as soon as this module is imported.
# NOTE(review): the module docstring also names ``persistence``, but no stub is
# registered for it here -- confirm that omission is intentional.
_stub_namespace_package(dotted="app.etl_pipeline.cache", fs_dir=_CACHE_DIR)
_stub_namespace_package(
    dotted="app.etl_pipeline.cache.storage",
    fs_dir=_CACHE_DIR.joinpath("storage"),
)
_stub_namespace_package(
    dotted="app.etl_pipeline.cache.eviction",
    fs_dir=_CACHE_DIR.joinpath("eviction"),
)

View file

@ -0,0 +1,70 @@
"""Content-addressing: equal (bytes + recipe) must map to one storage location.
This is the dedup guarantee the whole cache rests on -- two users uploading the
same file under the same parser settings have to land on the same object key, and
any change to bytes or recipe has to land somewhere else.
"""
from __future__ import annotations
import pytest
from app.etl_pipeline.cache.schemas import ParseKey
from app.etl_pipeline.cache.storage.object_keys import (
CACHE_PREFIX,
build_parse_object_key,
)
# Module-level mark: pytest applies ``unit`` to every test defined in this file.
pytestmark = pytest.mark.unit
def _key(**overrides) -> ParseKey:
    """Build a ``ParseKey`` from a fixed baseline, with per-test overrides.

    Any of ``source_sha256``, ``etl_service``, ``mode`` or ``version`` may be
    passed as a keyword to replace the baseline value; fields not overridden
    keep their defaults so each test only spells out what it varies.
    """
    fields = {
        "source_sha256": "a" * 64,
        "etl_service": "LLAMACLOUD",
        "mode": "basic",
        "version": 1,
        **overrides,
    }
    return ParseKey.for_document(
        fields["source_sha256"],
        etl_service=fields["etl_service"],
        mode=fields["mode"],
        version=fields["version"],
    )
def test_same_bytes_and_recipe_produce_the_same_object_key():
    """Two independently built but equal keys must map to one object key."""
    first = build_parse_object_key(_key())
    second = build_parse_object_key(_key())
    assert first == second
def test_different_bytes_produce_different_object_keys():
    """Changing only the source hash must change the object key."""
    key_for_a = build_parse_object_key(_key(source_sha256="a" * 64))
    key_for_b = build_parse_object_key(_key(source_sha256="b" * 64))
    assert key_for_a != key_for_b
@pytest.mark.parametrize(
    ("field", "value"),
    [
        ("etl_service", "DOCLING"),
        ("mode", "premium"),
        ("version", 2),
    ],
)
def test_any_recipe_change_produces_a_different_object_key(field, value):
    """Varying one recipe field at a time must yield a distinct object key."""
    # The recipe (parser / mode / version) is part of the cache identity:
    # identical bytes under a different recipe must trigger a fresh parse
    # rather than reuse the stored result, so the keys may never collide.
    baseline = build_parse_object_key(_key())
    variant = build_parse_object_key(_key(**{field: value}))
    assert baseline != variant
def test_object_key_is_prefixed_and_sharded_by_source_hash():
    """Pin the exact layout: ``<prefix>/<sha256>/<service>.<mode>.v<n>.md``."""
    # The layout is an operational contract: the dedicated top-level prefix
    # keeps cache blobs apart from the normal store, and the per-sha directory
    # collects every recipe variant of the same file in one place.
    key = _key()
    actual = build_parse_object_key(key)
    expected = f"{CACHE_PREFIX}/{key.source_sha256}/LLAMACLOUD.basic.v1.md"
    assert actual == expected