test: add document hashing unit tests and clean up conftest mocks

This commit is contained in:
CREDO23 2026-02-24 22:48:40 +02:00
parent d5e10bd8f9
commit a0134a5830
4 changed files with 79 additions and 50 deletions

View file

@ -1,49 +1,3 @@
from unittest.mock import AsyncMock, MagicMock
import pytest
# Length of each fake embedding vector produced by the mock embedding model.
# Kept tiny so tests stay cheap; the real model uses 768+ dimensions.
_EMBEDDING_DIM = 4
@pytest.fixture
def mock_session() -> AsyncMock:
    """Return an ``AsyncMock`` standing in for a database session.

    ``add`` is replaced with a plain ``MagicMock`` because it is synchronous
    in real SQLAlchemy; the remaining session methods listed below are
    explicitly set to fresh ``AsyncMock`` instances so they can be awaited.
    """
    db_session = AsyncMock()
    db_session.add = MagicMock()  # synchronous in real SQLAlchemy

    awaitable_methods = (
        "execute",
        "scalar",
        "scalars",
        "flush",
        "commit",
        "rollback",
        "refresh",
    )
    for method_name in awaitable_methods:
        setattr(db_session, method_name, AsyncMock())

    return db_session
@pytest.fixture
def mock_llm() -> AsyncMock:
    """Return an ``AsyncMock`` LLM whose ``ainvoke`` yields a canned reply.

    Awaiting ``ainvoke`` returns a ``MagicMock`` whose ``content`` attribute
    is the fixed string ``"Mocked summary."``.
    """
    canned_reply = MagicMock(content="Mocked summary.")
    fake_llm = AsyncMock()
    fake_llm.ainvoke = AsyncMock(return_value=canned_reply)
    return fake_llm
@pytest.fixture
def mock_embedding_model() -> MagicMock:
    """Return a ``MagicMock`` embedding model with a deterministic ``embed``.

    ``embed(texts)`` returns one vector per input text; every vector is
    ``_EMBEDDING_DIM`` copies of ``0.1``.
    """

    def _constant_vectors(texts):
        # One identical, fixed-size vector for each text that was passed in.
        return [[0.1] * _EMBEDDING_DIM for _ in texts]

    fake_model = MagicMock()
    fake_model.embed = MagicMock(side_effect=_constant_vectors)
    return fake_model
@pytest.fixture
def mock_chunker() -> MagicMock:
    """Return a ``MagicMock`` chunker whose ``chunk`` yields two fixed chunks."""
    canned_chunks = ["chunk one", "chunk two"]
    fake_chunker = MagicMock()
    fake_chunker.chunk = MagicMock(return_value=canned_chunks)
    return fake_chunker
@pytest.fixture
def mock_code_chunker() -> MagicMock:
    """Return a ``MagicMock`` code chunker whose ``chunk`` yields two fixed chunks."""
    fixed_chunks = ["chunk one", "chunk two"]
    fake_code_chunker = MagicMock()
    fake_code_chunker.chunk = MagicMock(return_value=fixed_chunks)
    return fake_code_chunker
# No fixtures needed for unit tests yet.
# Unit tests cover pure functions and value objects with no dependencies.
# External-boundary mocks (llm, embedding_model) live in tests/integration/conftest.py.

View file

@ -0,0 +1,42 @@
import pytest
from app.db import DocumentType
from app.indexing_pipeline.document_hashing import compute_content_hash, compute_unique_identifier_hash
# Module-level pytest marker: applies the `unit` mark to every test in this file.
pytestmark = pytest.mark.unit
def test_different_unique_id_produces_different_hash(make_connector_document):
    """Documents built with different ``unique_id`` values get different identifier hashes."""
    first_doc, second_doc = (
        make_connector_document(unique_id=uid) for uid in ("id-001", "id-002")
    )

    first_hash = compute_unique_identifier_hash(first_doc)
    second_hash = compute_unique_identifier_hash(second_doc)

    assert first_hash != second_hash
def test_different_search_space_produces_different_identifier_hash(make_connector_document):
    """Documents built with different ``search_space_id`` values get different identifier hashes."""
    space_one_doc, space_two_doc = (
        make_connector_document(search_space_id=space_id) for space_id in (1, 2)
    )

    space_one_hash = compute_unique_identifier_hash(space_one_doc)
    space_two_hash = compute_unique_identifier_hash(space_two_doc)

    assert space_one_hash != space_two_hash
def test_different_document_type_produces_different_identifier_hash(make_connector_document):
    """Documents built with different ``document_type`` values get different identifier hashes."""
    clickup_doc = make_connector_document(
        document_type=DocumentType.CLICKUP_CONNECTOR,
    )
    notion_doc = make_connector_document(
        document_type=DocumentType.NOTION_CONNECTOR,
    )

    clickup_hash = compute_unique_identifier_hash(clickup_doc)
    notion_hash = compute_unique_identifier_hash(notion_doc)

    assert clickup_hash != notion_hash
def test_same_content_same_space_produces_same_content_hash(make_connector_document):
    """Two documents with identical markdown and search space share a content hash."""
    shared_kwargs = {"source_markdown": "Hello world", "search_space_id": 1}
    original_doc = make_connector_document(**shared_kwargs)
    duplicate_doc = make_connector_document(**shared_kwargs)

    original_hash = compute_content_hash(original_doc)
    duplicate_hash = compute_content_hash(duplicate_doc)

    assert original_hash == duplicate_hash
def test_same_content_different_space_produces_different_content_hash(make_connector_document):
    """Identical markdown in different search spaces yields different content hashes."""
    markdown = "Hello world"
    space_one_doc = make_connector_document(source_markdown=markdown, search_space_id=1)
    space_two_doc = make_connector_document(source_markdown=markdown, search_space_id=2)

    space_one_hash = compute_content_hash(space_one_doc)
    space_two_hash = compute_content_hash(space_two_doc)

    assert space_one_hash != space_two_hash
def test_different_content_produces_different_content_hash(make_connector_document):
    """Documents built with different ``source_markdown`` get different content hashes."""
    before_doc, after_doc = (
        make_connector_document(source_markdown=markdown)
        for markdown in ("Original content", "Updated content")
    )

    before_hash = compute_content_hash(before_doc)
    after_hash = compute_content_hash(after_doc)

    assert before_hash != after_hash