mirror of
https://github.com/MODSetter/SurfSense.git
synced 2026-04-29 10:56:24 +02:00
test: add document hashing unit tests and clean up conftest mocks
This commit is contained in:
parent
d5e10bd8f9
commit
a0134a5830
4 changed files with 79 additions and 50 deletions
13
surfsense_backend/app/indexing_pipeline/document_hashing.py
Normal file
13
surfsense_backend/app/indexing_pipeline/document_hashing.py
Normal file
|
|
@ -0,0 +1,13 @@
|
||||||
|
import hashlib
|
||||||
|
|
||||||
|
from app.indexing_pipeline.connector_document import ConnectorDocument
|
||||||
|
|
||||||
|
|
||||||
|
def compute_unique_identifier_hash(doc: ConnectorDocument) -> str:
    """Return a SHA-256 hex digest that uniquely identifies *doc*.

    The key joins the connector document type, the connector-provided
    unique id, and the owning search space id, so the same upstream
    document indexed into two different spaces hashes differently.
    """
    key = ":".join(
        (str(doc.document_type.value), str(doc.unique_id), str(doc.search_space_id))
    )
    return hashlib.sha256(key.encode("utf-8")).hexdigest()
|
||||||
|
|
||||||
|
|
||||||
|
def compute_content_hash(doc: ConnectorDocument) -> str:
    """Return a SHA-256 hex digest of *doc*'s markdown content.

    The search space id is prefixed into the hashed payload so identical
    content in two spaces yields two distinct content hashes.
    """
    payload = f"{doc.search_space_id}:{doc.source_markdown}"
    digest = hashlib.sha256(payload.encode("utf-8"))
    return digest.hexdigest()
|
||||||
|
|
@ -1,13 +1,17 @@
|
||||||
|
|
||||||
import os
|
import os
|
||||||
|
from unittest.mock import AsyncMock, MagicMock
|
||||||
|
|
||||||
|
import pytest
|
||||||
import pytest_asyncio
|
import pytest_asyncio
|
||||||
from sqlalchemy import text
|
from sqlalchemy import text
|
||||||
from sqlalchemy.ext.asyncio import AsyncSession, async_sessionmaker, create_async_engine
|
from sqlalchemy.ext.asyncio import AsyncSession, create_async_engine
|
||||||
from sqlalchemy.pool import NullPool
|
from sqlalchemy.pool import NullPool
|
||||||
|
|
||||||
from app.db import Base
|
from app.db import Base
|
||||||
|
|
||||||
|
_EMBEDDING_DIM = 4 # keep vectors tiny; real model uses 768+
|
||||||
|
|
||||||
_DEFAULT_TEST_DB = "postgresql+asyncpg://postgres:postgres@localhost:5432/surfsense_test"
|
_DEFAULT_TEST_DB = "postgresql+asyncpg://postgres:postgres@localhost:5432/surfsense_test"
|
||||||
TEST_DATABASE_URL = os.environ.get("TEST_DATABASE_URL", _DEFAULT_TEST_DB)
|
TEST_DATABASE_URL = os.environ.get("TEST_DATABASE_URL", _DEFAULT_TEST_DB)
|
||||||
|
|
||||||
|
|
@ -44,3 +48,19 @@ async def db_session(async_engine) -> AsyncSession:
|
||||||
) as session:
|
) as session:
|
||||||
yield session
|
yield session
|
||||||
await transaction.rollback()
|
await transaction.rollback()
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture
def mock_llm() -> AsyncMock:
    """Async LLM stand-in: ``ainvoke`` always resolves to a fixed summary message."""
    fake_llm = AsyncMock()
    fake_llm.ainvoke = AsyncMock(
        return_value=MagicMock(content="Mocked summary.")
    )
    return fake_llm
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture
def mock_embedding_model() -> MagicMock:
    """Embedding-model stub producing one tiny constant vector per input text."""
    stub = MagicMock()

    def _fake_embed(texts):
        # One _EMBEDDING_DIM-wide constant vector per input, order preserved.
        return [[0.1] * _EMBEDDING_DIM for _ in texts]

    stub.embed = MagicMock(side_effect=_fake_embed)
    return stub
|
||||||
|
|
|
||||||
|
|
@ -1,49 +1,3 @@
|
||||||
from unittest.mock import AsyncMock, MagicMock
|
# No fixtures needed for unit tests yet.
|
||||||
|
# Unit tests cover pure functions and value objects with no dependencies.
|
||||||
import pytest
|
# External-boundary mocks (llm, embedding_model) live in tests/integration/conftest.py.
|
||||||
|
|
||||||
_EMBEDDING_DIM = 4 # keep vectors tiny in tests; real model uses 768+
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture
def mock_session() -> AsyncMock:
    """Fully mocked AsyncSession: awaitable methods are AsyncMocks, ``add`` stays sync."""
    session = AsyncMock()
    session.add = MagicMock()  # synchronous in real SQLAlchemy
    # All of these are coroutine methods on a real AsyncSession.
    for name in (
        "execute",
        "scalar",
        "scalars",
        "flush",
        "commit",
        "rollback",
        "refresh",
    ):
        setattr(session, name, AsyncMock())
    return session
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture
def mock_llm() -> AsyncMock:
    """LLM double whose async ``ainvoke`` returns a canned summary response."""
    stub = AsyncMock()
    canned_response = MagicMock(content="Mocked summary.")
    stub.ainvoke = AsyncMock(return_value=canned_response)
    return stub
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture
def mock_embedding_model() -> MagicMock:
    """Embedding stub: ``embed`` maps each input text to a small constant vector."""
    embedder = MagicMock()
    embedder.embed = MagicMock(
        side_effect=lambda texts: [[0.1] * _EMBEDDING_DIM for _ in texts]
    )
    return embedder
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture
def mock_chunker() -> MagicMock:
    """Chunker double that always splits input into the same two chunks."""
    fake = MagicMock()
    fake.chunk = MagicMock(return_value=["chunk one", "chunk two"])
    return fake
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture
def mock_code_chunker() -> MagicMock:
    """Code-chunker double mirroring ``mock_chunker``: fixed two-chunk output."""
    fake_code_chunker = MagicMock()
    fake_code_chunker.chunk = MagicMock(return_value=["chunk one", "chunk two"])
    return fake_code_chunker
|
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,42 @@
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
from app.db import DocumentType
|
||||||
|
from app.indexing_pipeline.document_hashing import compute_content_hash, compute_unique_identifier_hash
|
||||||
|
|
||||||
|
pytestmark = pytest.mark.unit
|
||||||
|
|
||||||
|
|
||||||
|
def test_different_unique_id_produces_different_hash(make_connector_document):
    """Distinct connector unique ids must yield distinct identifier hashes."""
    first = make_connector_document(unique_id="id-001")
    second = make_connector_document(unique_id="id-002")
    hashes = {
        compute_unique_identifier_hash(first),
        compute_unique_identifier_hash(second),
    }
    assert len(hashes) == 2
|
||||||
|
|
||||||
|
|
||||||
|
def test_different_search_space_produces_different_identifier_hash(make_connector_document):
    """The same document in two search spaces must hash to two identifiers."""
    in_space_one = make_connector_document(search_space_id=1)
    in_space_two = make_connector_document(search_space_id=2)
    assert compute_unique_identifier_hash(in_space_one) != compute_unique_identifier_hash(
        in_space_two
    )
|
||||||
|
|
||||||
|
|
||||||
|
def test_different_document_type_produces_different_identifier_hash(make_connector_document):
    """Documents from different connectors must never collide on identifier hash."""
    clickup_doc = make_connector_document(document_type=DocumentType.CLICKUP_CONNECTOR)
    notion_doc = make_connector_document(document_type=DocumentType.NOTION_CONNECTOR)
    assert compute_unique_identifier_hash(clickup_doc) != compute_unique_identifier_hash(
        notion_doc
    )
|
||||||
|
|
||||||
|
|
||||||
|
def test_same_content_same_space_produces_same_content_hash(make_connector_document):
    """Identical markdown in the same space is a content-hash match (dedup case)."""
    original = make_connector_document(source_markdown="Hello world", search_space_id=1)
    duplicate = make_connector_document(source_markdown="Hello world", search_space_id=1)
    assert compute_content_hash(original) == compute_content_hash(duplicate)
|
||||||
|
|
||||||
|
|
||||||
|
def test_same_content_different_space_produces_different_content_hash(make_connector_document):
    """Identical markdown across spaces must NOT dedupe: content hash is space-scoped."""
    space_one_doc = make_connector_document(source_markdown="Hello world", search_space_id=1)
    space_two_doc = make_connector_document(source_markdown="Hello world", search_space_id=2)
    hashes = {compute_content_hash(space_one_doc), compute_content_hash(space_two_doc)}
    assert len(hashes) == 2
|
||||||
|
|
||||||
|
|
||||||
|
def test_different_content_produces_different_content_hash(make_connector_document):
    """Edited markdown must change the content hash so re-indexing is triggered."""
    before_edit = make_connector_document(source_markdown="Original content")
    after_edit = make_connector_document(source_markdown="Updated content")
    assert compute_content_hash(before_edit) != compute_content_hash(after_edit)
|
||||||
Loading…
Add table
Add a link
Reference in a new issue