mirror of
https://github.com/MODSetter/SurfSense.git
synced 2026-04-26 09:16:22 +02:00
feat: made agent file system optimized
This commit is contained in:
parent
ee0b59c0fa
commit
2cc2d339e6
67 changed files with 8011 additions and 5591 deletions
|
|
@ -0,0 +1,131 @@
|
|||
"""Unit tests for IndexingPipelineService.create_placeholder_documents."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from unittest.mock import AsyncMock, MagicMock
|
||||
|
||||
import pytest
|
||||
from sqlalchemy.exc import IntegrityError
|
||||
|
||||
from app.db import DocumentStatus, DocumentType
|
||||
from app.indexing_pipeline.document_hashing import compute_identifier_hash
|
||||
from app.indexing_pipeline.indexing_pipeline_service import (
|
||||
IndexingPipelineService,
|
||||
PlaceholderInfo,
|
||||
)
|
||||
|
||||
pytestmark = pytest.mark.unit
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Helpers
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def _make_placeholder(**overrides) -> PlaceholderInfo:
    """Build a PlaceholderInfo with test defaults; keyword args override fields."""
    params = {
        "title": "Test Doc",
        "document_type": DocumentType.GOOGLE_DRIVE_FILE,
        "unique_id": "file-001",
        "search_space_id": 1,
        "connector_id": 42,
        "created_by_id": "00000000-0000-0000-0000-000000000001",
        **overrides,
    }
    return PlaceholderInfo(**params)
|
||||
|
||||
|
||||
def _uid_hash(p: PlaceholderInfo) -> str:
    """Derive the unique-identifier hash the service would compute for *p*."""
    doc_type_value = p.document_type.value
    return compute_identifier_hash(doc_type_value, p.unique_id, p.search_space_id)
|
||||
|
||||
|
||||
def _session_with_existing_hashes(existing: set[str] | None = None):
|
||||
"""Build an AsyncMock session whose batch-query returns *existing* hashes."""
|
||||
session = AsyncMock()
|
||||
result = MagicMock()
|
||||
result.scalars.return_value.all.return_value = list(existing or [])
|
||||
session.execute = AsyncMock(return_value=result)
|
||||
session.add = MagicMock()
|
||||
return session
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Tests
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
async def test_empty_input_returns_zero_without_db_calls():
    """An empty placeholder list short-circuits: no query, no commit, count 0."""
    db = AsyncMock()
    service = IndexingPipelineService(db)

    created = await service.create_placeholder_documents([])

    assert created == 0
    db.execute.assert_not_awaited()
    db.commit.assert_not_awaited()
|
||||
|
||||
|
||||
async def test_creates_documents_with_pending_status_and_commits():
    """A brand-new placeholder becomes a PENDING document and is committed."""
    db = _session_with_existing_hashes(set())
    service = IndexingPipelineService(db)
    placeholder = _make_placeholder(title="My File", unique_id="file-abc")

    created = await service.create_placeholder_documents([placeholder])

    assert created == 1
    db.add.assert_called_once()

    # Inspect the ORM object handed to session.add().
    added_doc = db.add.call_args[0][0]
    assert added_doc.title == "My File"
    assert added_doc.document_type == DocumentType.GOOGLE_DRIVE_FILE
    assert added_doc.content == "Pending..."
    assert DocumentStatus.is_state(added_doc.status, DocumentStatus.PENDING)
    assert added_doc.search_space_id == 1
    assert added_doc.connector_id == 42

    db.commit.assert_awaited_once()
|
||||
|
||||
|
||||
async def test_existing_documents_are_skipped():
    """Placeholders whose unique_identifier_hash already exists are not re-created."""
    already_indexed = _make_placeholder(unique_id="already-there")
    fresh = _make_placeholder(unique_id="brand-new")

    db = _session_with_existing_hashes({_uid_hash(already_indexed)})
    service = IndexingPipelineService(db)

    created = await service.create_placeholder_documents([already_indexed, fresh])

    # Only the unseen placeholder should have produced a document.
    assert created == 1
    added_doc = db.add.call_args[0][0]
    assert added_doc.unique_identifier_hash == _uid_hash(fresh)
|
||||
|
||||
|
||||
async def test_duplicate_unique_ids_within_input_are_deduped():
    """Same unique_id passed twice only produces one placeholder."""
    first = _make_placeholder(unique_id="dup-id", title="First")
    second = _make_placeholder(unique_id="dup-id", title="Second")

    db = _session_with_existing_hashes(set())
    service = IndexingPipelineService(db)

    created = await service.create_placeholder_documents([first, second])

    assert created == 1
    db.add.assert_called_once()
|
||||
|
||||
|
||||
async def test_integrity_error_on_commit_returns_zero():
    """IntegrityError during commit (race condition) is swallowed gracefully."""
    db = _session_with_existing_hashes(set())
    db.commit = AsyncMock(side_effect=IntegrityError("dup", {}, None))
    service = IndexingPipelineService(db)

    created = await service.create_placeholder_documents([_make_placeholder()])

    # The failed insert counts as zero and the transaction is rolled back.
    assert created == 0
    db.rollback.assert_awaited_once()
|
||||
|
|
@ -19,9 +19,7 @@ def pipeline(mock_session):
|
|||
return IndexingPipelineService(mock_session)
|
||||
|
||||
|
||||
async def test_calls_prepare_then_index_per_document(
|
||||
pipeline, make_connector_document
|
||||
):
|
||||
async def test_calls_prepare_then_index_per_document(pipeline, make_connector_document):
|
||||
"""index_batch calls prepare_for_indexing, then index() for each returned doc."""
|
||||
doc1 = make_connector_document(
|
||||
document_type=DocumentType.GOOGLE_GMAIL_CONNECTOR,
|
||||
|
|
|
|||
|
|
@ -1,5 +1,5 @@
|
|||
import asyncio
|
||||
from unittest.mock import AsyncMock, MagicMock, patch
|
||||
from unittest.mock import AsyncMock, MagicMock
|
||||
|
||||
import pytest
|
||||
|
||||
|
|
@ -57,7 +57,9 @@ async def test_index_calls_embed_and_chunk_via_to_thread(
|
|||
"app.indexing_pipeline.indexing_pipeline_service.chunk_text",
|
||||
mock_chunk,
|
||||
)
|
||||
mock_embed = MagicMock(side_effect=lambda texts: [[0.1] * _EMBEDDING_DIM for _ in texts])
|
||||
mock_embed = MagicMock(
|
||||
side_effect=lambda texts: [[0.1] * _EMBEDDING_DIM for _ in texts]
|
||||
)
|
||||
mock_embed.__name__ = "embed_texts"
|
||||
monkeypatch.setattr(
|
||||
"app.indexing_pipeline.indexing_pipeline_service.embed_texts",
|
||||
|
|
|
|||
|
|
@ -0,0 +1,110 @@
|
|||
"""Unit tests for the duplicate-content safety logic in prepare_for_indexing.
|
||||
|
||||
Verifies that when an existing document's updated content matches another
|
||||
document's content_hash, the system marks it as failed (for placeholders)
|
||||
or leaves it untouched (for ready documents) — never deletes.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from unittest.mock import AsyncMock, MagicMock
|
||||
|
||||
import pytest
|
||||
|
||||
from app.db import Document, DocumentStatus, DocumentType
|
||||
from app.indexing_pipeline.connector_document import ConnectorDocument
|
||||
from app.indexing_pipeline.document_hashing import (
|
||||
compute_unique_identifier_hash,
|
||||
)
|
||||
from app.indexing_pipeline.indexing_pipeline_service import IndexingPipelineService
|
||||
|
||||
pytestmark = pytest.mark.unit
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Helpers
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def _make_connector_doc(**overrides) -> ConnectorDocument:
    """Build a ConnectorDocument with test defaults; keyword args override fields."""
    params = {
        "title": "Test Doc",
        "source_markdown": "## Some new content",
        "unique_id": "file-001",
        "document_type": DocumentType.GOOGLE_DRIVE_FILE,
        "search_space_id": 1,
        "connector_id": 42,
        "created_by_id": "00000000-0000-0000-0000-000000000001",
        **overrides,
    }
    return ConnectorDocument(**params)
|
||||
|
||||
|
||||
def _make_existing_doc(connector_doc: ConnectorDocument, *, status: dict) -> MagicMock:
    """Build a MagicMock that looks like an ORM Document with given status."""
    fake = MagicMock(spec=Document)
    fake.id = 999
    fake.title = connector_doc.title
    fake.status = status
    # Deliberately different from the new content's hash so an update is triggered.
    fake.content_hash = "old-placeholder-content-hash"
    fake.unique_identifier_hash = compute_unique_identifier_hash(connector_doc)
    return fake
|
||||
|
||||
|
||||
def _mock_session_for_dedup(existing_doc, *, has_duplicate: bool):
|
||||
"""Build a session whose sequential execute() calls return:
|
||||
|
||||
1. The *existing_doc* for the unique_identifier_hash lookup.
|
||||
2. A row (or None) for the duplicate content_hash check.
|
||||
"""
|
||||
session = AsyncMock()
|
||||
|
||||
existing_result = MagicMock()
|
||||
existing_result.scalars.return_value.first.return_value = existing_doc
|
||||
|
||||
dup_result = MagicMock()
|
||||
dup_result.scalars.return_value.first.return_value = 42 if has_duplicate else None
|
||||
|
||||
session.execute = AsyncMock(side_effect=[existing_result, dup_result])
|
||||
session.add = MagicMock()
|
||||
return session
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Tests
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
async def test_pending_placeholder_with_duplicate_content_is_marked_failed():
    """A placeholder (pending) whose updated content duplicates another doc
    must be marked as FAILED — never deleted."""
    cdoc = _make_connector_doc(source_markdown="## Shared content")
    placeholder_doc = _make_existing_doc(cdoc, status=DocumentStatus.pending())

    db = _mock_session_for_dedup(placeholder_doc, has_duplicate=True)
    service = IndexingPipelineService(db)

    prepared = await service.prepare_for_indexing([cdoc])

    assert prepared == [], "duplicate should not be returned for indexing"

    # The placeholder is transitioned to FAILED with an explanatory reason.
    assert DocumentStatus.is_state(placeholder_doc.status, DocumentStatus.FAILED)
    assert "Duplicate content" in placeholder_doc.status.get("reason", "")
    db.delete.assert_not_called()
|
||||
|
||||
|
||||
async def test_ready_document_with_duplicate_content_is_left_untouched():
    """A READY document whose updated content duplicates another doc
    must be left completely untouched — not failed, not deleted."""
    cdoc = _make_connector_doc(source_markdown="## Shared content")
    ready_doc = _make_existing_doc(cdoc, status=DocumentStatus.ready())

    db = _mock_session_for_dedup(ready_doc, has_duplicate=True)
    service = IndexingPipelineService(db)

    prepared = await service.prepare_for_indexing([cdoc])

    assert prepared == [], "duplicate should not be returned for indexing"

    # Status stays READY and no deletion is attempted.
    assert DocumentStatus.is_state(ready_doc.status, DocumentStatus.READY)
    db.delete.assert_not_called()
|
||||
Loading…
Add table
Add a link
Reference in a new issue