diff --git a/surfsense_backend/app/indexing_pipeline/document_hashing.py b/surfsense_backend/app/indexing_pipeline/document_hashing.py new file mode 100644 index 000000000..6b352a953 --- /dev/null +++ b/surfsense_backend/app/indexing_pipeline/document_hashing.py @@ -0,0 +1,15 @@ +import hashlib + +from app.indexing_pipeline.connector_document import ConnectorDocument + + +def compute_unique_identifier_hash(doc: ConnectorDocument) -> str: + """Return a SHA-256 hex digest identifying the document by its type, connector unique_id, and search space.""" + combined = f"{doc.document_type.value}:{doc.unique_id}:{doc.search_space_id}" + return hashlib.sha256(combined.encode("utf-8")).hexdigest() + + +def compute_content_hash(doc: ConnectorDocument) -> str: + """Return a SHA-256 hex digest of the document's source markdown, scoped to its search space.""" + combined = f"{doc.search_space_id}:{doc.source_markdown}" + return hashlib.sha256(combined.encode("utf-8")).hexdigest() diff --git a/surfsense_backend/tests/integration/conftest.py b/surfsense_backend/tests/integration/conftest.py index f36649c5b..3fd4e1a31 100644 --- a/surfsense_backend/tests/integration/conftest.py +++ b/surfsense_backend/tests/integration/conftest.py @@ -1,13 +1,17 @@ import os +from unittest.mock import AsyncMock, MagicMock +import pytest import pytest_asyncio from sqlalchemy import text -from sqlalchemy.ext.asyncio import AsyncSession, async_sessionmaker, create_async_engine +from sqlalchemy.ext.asyncio import AsyncSession, create_async_engine from sqlalchemy.pool import NullPool from app.db import Base +_EMBEDDING_DIM = 4  # keep vectors tiny; real model uses 768+ + _DEFAULT_TEST_DB = "postgresql+asyncpg://postgres:postgres@localhost:5432/surfsense_test" TEST_DATABASE_URL = os.environ.get("TEST_DATABASE_URL", _DEFAULT_TEST_DB) @@ -44,3 +48,19 @@ async def db_session(async_engine) -> AsyncSession: ) as session: yield session await transaction.rollback() + + +@pytest.fixture +def mock_llm() -> AsyncMock: + llm = AsyncMock() + llm.ainvoke = AsyncMock(return_value=MagicMock(content="Mocked summary.")) + return llm + + +@pytest.fixture +def mock_embedding_model() -> MagicMock: + model = MagicMock() + model.embed 
= MagicMock( + side_effect=lambda texts: [[0.1] * _EMBEDDING_DIM for _ in texts] + ) + return model diff --git a/surfsense_backend/tests/unit/conftest.py b/surfsense_backend/tests/unit/conftest.py index 98fcfc147..4e5c81bcb 100644 --- a/surfsense_backend/tests/unit/conftest.py +++ b/surfsense_backend/tests/unit/conftest.py @@ -1,49 +1,3 @@ -from unittest.mock import AsyncMock, MagicMock - -import pytest - -_EMBEDDING_DIM = 4  # keep vectors tiny in tests; real model uses 768+ - - -@pytest.fixture -def mock_session() -> AsyncMock: - session = AsyncMock() - session.add = MagicMock()  # synchronous in real SQLAlchemy - session.execute = AsyncMock() - session.scalar = AsyncMock() - session.scalars = AsyncMock() - session.flush = AsyncMock() - session.commit = AsyncMock() - session.rollback = AsyncMock() - session.refresh = AsyncMock() - return session - - -@pytest.fixture -def mock_llm() -> AsyncMock: - llm = AsyncMock() - llm.ainvoke = AsyncMock(return_value=MagicMock(content="Mocked summary.")) - return llm - - -@pytest.fixture -def mock_embedding_model() -> MagicMock: - model = MagicMock() - model.embed = MagicMock( - side_effect=lambda texts: [[0.1] * _EMBEDDING_DIM for _ in texts] - ) - return model - - -@pytest.fixture -def mock_chunker() -> MagicMock: - chunker = MagicMock() - chunker.chunk = MagicMock(return_value=["chunk one", "chunk two"]) - return chunker - - -@pytest.fixture -def mock_code_chunker() -> MagicMock: - chunker = MagicMock() - chunker.chunk = MagicMock(return_value=["chunk one", "chunk two"]) - return chunker +# No project fixtures are defined in this conftest; external-boundary mocks (llm, embedding_model) live in tests/integration/conftest.py. +# Unit tests cover pure functions and value objects with no external dependencies. +# NOTE(review): tests/unit/indexing_pipeline/test_document_hashing.py requests a make_connector_document fixture — confirm a parent conftest (e.g. tests/conftest.py) provides it, otherwise collection fails. 
diff --git a/surfsense_backend/tests/unit/indexing_pipeline/test_document_hashing.py b/surfsense_backend/tests/unit/indexing_pipeline/test_document_hashing.py new file mode 100644 index 000000000..c8e2e97e9 --- /dev/null +++ b/surfsense_backend/tests/unit/indexing_pipeline/test_document_hashing.py @@ -0,0 +1,42 @@ +import pytest + +from app.db import DocumentType +from app.indexing_pipeline.document_hashing import compute_content_hash, compute_unique_identifier_hash + +pytestmark = pytest.mark.unit + + +def test_different_unique_id_produces_different_hash(make_connector_document): + doc_a = make_connector_document(unique_id="id-001") + doc_b = make_connector_document(unique_id="id-002") + assert compute_unique_identifier_hash(doc_a) != compute_unique_identifier_hash(doc_b) + + +def test_different_search_space_produces_different_identifier_hash(make_connector_document): + doc_a = make_connector_document(search_space_id=1) + doc_b = make_connector_document(search_space_id=2) + assert compute_unique_identifier_hash(doc_a) != compute_unique_identifier_hash(doc_b) + + +def test_different_document_type_produces_different_identifier_hash(make_connector_document): + doc_a = make_connector_document(document_type=DocumentType.CLICKUP_CONNECTOR) + doc_b = make_connector_document(document_type=DocumentType.NOTION_CONNECTOR) + assert compute_unique_identifier_hash(doc_a) != compute_unique_identifier_hash(doc_b) + + +def test_same_content_same_space_produces_same_content_hash(make_connector_document): + doc_a = make_connector_document(source_markdown="Hello world", search_space_id=1) + doc_b = make_connector_document(source_markdown="Hello world", search_space_id=1) + assert compute_content_hash(doc_a) == compute_content_hash(doc_b) + + +def test_same_content_different_space_produces_different_content_hash(make_connector_document): + doc_a = make_connector_document(source_markdown="Hello world", search_space_id=1) + doc_b = make_connector_document(source_markdown="Hello world", 
search_space_id=2) + assert compute_content_hash(doc_a) != compute_content_hash(doc_b) + + +def test_different_content_produces_different_content_hash(make_connector_document): + doc_a = make_connector_document(source_markdown="Original content") + doc_b = make_connector_document(source_markdown="Updated content") + assert compute_content_hash(doc_a) != compute_content_hash(doc_b)