mirror of
https://github.com/MODSetter/SurfSense.git
synced 2026-06-12 20:45:20 +02:00
test(etl-cache): cover content-addressing dedup and key shape
This commit is contained in:
parent
ce1e90386f
commit
dddacbe762
2 changed files with 98 additions and 0 deletions
28
surfsense_backend/tests/unit/etl_pipeline/cache/conftest.py
vendored
Normal file
28
surfsense_backend/tests/unit/etl_pipeline/cache/conftest.py
vendored
Normal file
|
|
@ -0,0 +1,28 @@
|
|||
"""Stub the cache package __init__s so unit tests import only pure leaf modules.
|
||||
|
||||
The real ``cache``/``storage``/``eviction``/``persistence`` __init__s eagerly
|
||||
import the facade, file storage, Celery, and ``app.db`` -- none of which a pure
|
||||
unit test should need. Turning those packages into bare namespace packages lets
|
||||
``from app.etl_pipeline.cache.<pkg>.<leaf> import ...`` resolve the leaf module
|
||||
without running the heavy __init__. ``schemas`` is left real (it is pure).
|
||||
"""
|
||||
|
||||
import sys
|
||||
import types
|
||||
from pathlib import Path
|
||||
|
||||
# Directory of the real ``app.etl_pipeline.cache`` package on disk. ``parents[4]``
# climbs from this conftest (tests/unit/etl_pipeline/cache/) to the directory
# that holds both ``tests`` and ``app``.
_CACHE_DIR = (
    Path(__file__).resolve().parents[4].joinpath("app", "etl_pipeline", "cache")
)
|
||||
|
||||
|
||||
def _stub_namespace_package(dotted: str, fs_dir: Path) -> None:
    """Register an empty stand-in package under ``dotted`` in ``sys.modules``.

    The stand-in is a bare module object whose ``__path__`` points at
    ``fs_dir``, so submodules can still be located there while the package's
    own ``__init__`` is never executed.

    Args:
        dotted: Fully qualified package name to register.
        fs_dir: Directory used as the single entry of the stub's ``__path__``.

    Returns:
        None. If ``dotted`` is already present in ``sys.modules`` the existing
        entry is left untouched.
    """
    if dotted not in sys.modules:
        stub = types.ModuleType(dotted)
        stub.__package__ = dotted
        # A module counts as a package only when it has ``__path__``.
        stub.__path__ = [str(fs_dir)]
        sys.modules[dotted] = stub
|
||||
|
||||
|
||||
# Replace each package whose real __init__ pulls in heavy dependencies with a
# bare stub, parent first. ``persistence`` is stubbed as well: the module
# docstring lists it among the packages with an eager __init__, so leaving it
# real would run that __init__ on the first import from the package.
_stub_namespace_package("app.etl_pipeline.cache", _CACHE_DIR)
_stub_namespace_package("app.etl_pipeline.cache.storage", _CACHE_DIR / "storage")
_stub_namespace_package("app.etl_pipeline.cache.eviction", _CACHE_DIR / "eviction")
_stub_namespace_package(
    "app.etl_pipeline.cache.persistence", _CACHE_DIR / "persistence"
)
|
||||
70
surfsense_backend/tests/unit/etl_pipeline/cache/test_parse_key.py
vendored
Normal file
70
surfsense_backend/tests/unit/etl_pipeline/cache/test_parse_key.py
vendored
Normal file
|
|
@ -0,0 +1,70 @@
|
|||
"""Content-addressing: equal (bytes + recipe) must map to one storage location.
|
||||
|
||||
This is the dedup guarantee the whole cache rests on -- two users uploading the
|
||||
same file under the same parser settings have to land on the same object key, and
|
||||
any change to bytes or recipe has to land somewhere else.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import pytest
|
||||
|
||||
from app.etl_pipeline.cache.schemas import ParseKey
|
||||
from app.etl_pipeline.cache.storage.object_keys import (
|
||||
CACHE_PREFIX,
|
||||
build_parse_object_key,
|
||||
)
|
||||
|
||||
# Module-level ``pytestmark``: applies the ``unit`` marker to every test here.
pytestmark = pytest.mark.unit
|
||||
|
||||
|
||||
def _key(**overrides) -> ParseKey:
    """Build a ``ParseKey`` from a default recipe with selected fields replaced.

    Defaults are a ``source_sha256`` of 64 ``"a"`` characters, ``etl_service``
    ``"LLAMACLOUD"``, ``mode`` ``"basic"`` and ``version`` ``1``.

    Args:
        **overrides: Replacement values for any of the four fields above.

    Returns:
        Whatever ``ParseKey.for_document`` returns for the merged fields.

    Raises:
        TypeError: If an override names a field this helper does not know.
    """
    base = {
        "source_sha256": "a" * 64,
        "etl_service": "LLAMACLOUD",
        "mode": "basic",
        "version": 1,
    }
    # A misspelled override would otherwise be dropped silently and produce the
    # default key, so a comparison test would be checking the wrong thing.
    unknown = sorted(set(overrides) - set(base))
    if unknown:
        raise TypeError(f"_key() got unexpected override(s): {', '.join(unknown)}")
    base.update(overrides)
    return ParseKey.for_document(
        base["source_sha256"],
        etl_service=base["etl_service"],
        mode=base["mode"],
        version=base["version"],
    )
|
||||
|
||||
|
||||
def test_same_bytes_and_recipe_produce_the_same_object_key():
    """Two separately built keys with equal fields yield one object key."""
    first = build_parse_object_key(_key())
    second = build_parse_object_key(_key())
    assert first == second
|
||||
|
||||
|
||||
def test_different_bytes_produce_different_object_keys():
    """Changing only the source hash must change the object key."""
    key_for_a = build_parse_object_key(_key(source_sha256="a" * 64))
    key_for_b = build_parse_object_key(_key(source_sha256="b" * 64))
    assert key_for_a != key_for_b
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "field, value",
    [
        ("etl_service", "DOCLING"),
        ("mode", "premium"),
        ("version", 2),
    ],
)
def test_any_recipe_change_produces_a_different_object_key(field, value):
    """Overriding one recipe field at a time must change the object key."""
    # The recipe (parser/mode/version) is part of the identity: identical bytes
    # under a different recipe have to be parsed again rather than reused.
    baseline = build_parse_object_key(_key())
    changed = build_parse_object_key(_key(**{field: value}))
    assert baseline != changed
|
||||
|
||||
|
||||
def test_object_key_is_prefixed_and_sharded_by_source_hash():
    """Pin the exact layout: prefix, source-hash directory, recipe file name."""
    # The layout is an operational contract: the dedicated top-level prefix
    # separates cache blobs from the normal store, and the per-sha directory
    # keeps all recipe variants of a single file side by side.
    key = _key()
    actual = build_parse_object_key(key)
    expected = f"{CACHE_PREFIX}/{key.source_sha256}/LLAMACLOUD.basic.v1.md"
    assert actual == expected
|
||||
Loading…
Add table
Add a link
Reference in a new issue