feat: optimize agent file system

This commit is contained in:
DESKTOP-RTLN3BA\$punk 2026-03-28 16:39:46 -07:00
parent ee0b59c0fa
commit 2cc2d339e6
67 changed files with 8011 additions and 5591 deletions

View file

@ -0,0 +1,131 @@
"""Unit tests for IndexingPipelineService.create_placeholder_documents."""
from __future__ import annotations
from unittest.mock import AsyncMock, MagicMock
import pytest
from sqlalchemy.exc import IntegrityError
from app.db import DocumentStatus, DocumentType
from app.indexing_pipeline.document_hashing import compute_identifier_hash
from app.indexing_pipeline.indexing_pipeline_service import (
IndexingPipelineService,
PlaceholderInfo,
)
pytestmark = pytest.mark.unit
# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------
def _make_placeholder(**overrides) -> PlaceholderInfo:
    """Build a PlaceholderInfo with test defaults; keyword args override any field."""
    base = {
        "title": "Test Doc",
        "document_type": DocumentType.GOOGLE_DRIVE_FILE,
        "unique_id": "file-001",
        "search_space_id": 1,
        "connector_id": 42,
        "created_by_id": "00000000-0000-0000-0000-000000000001",
    }
    # Right-hand operand of | wins, so overrides take precedence over defaults.
    return PlaceholderInfo(**(base | overrides))
def _uid_hash(p: PlaceholderInfo) -> str:
    """Return the identifier hash the pipeline computes for placeholder *p*."""
    doc_type = p.document_type.value
    return compute_identifier_hash(doc_type, p.unique_id, p.search_space_id)
def _session_with_existing_hashes(existing: set[str] | None = None):
"""Build an AsyncMock session whose batch-query returns *existing* hashes."""
session = AsyncMock()
result = MagicMock()
result.scalars.return_value.all.return_value = list(existing or [])
session.execute = AsyncMock(return_value=result)
session.add = MagicMock()
return session
# ---------------------------------------------------------------------------
# Tests
# ---------------------------------------------------------------------------
async def test_empty_input_returns_zero_without_db_calls():
    """An empty placeholder list short-circuits: no query, no commit, count 0."""
    session = AsyncMock()
    service = IndexingPipelineService(session)

    created = await service.create_placeholder_documents([])

    assert created == 0
    session.execute.assert_not_awaited()
    session.commit.assert_not_awaited()
async def test_creates_documents_with_pending_status_and_commits():
    """A brand-new placeholder is added as a PENDING doc and the session commits."""
    session = _session_with_existing_hashes(set())
    service = IndexingPipelineService(session)
    placeholder = _make_placeholder(title="My File", unique_id="file-abc")

    created = await service.create_placeholder_documents([placeholder])

    assert created == 1
    session.add.assert_called_once()
    (added_doc,), _kwargs = session.add.call_args
    assert added_doc.title == "My File"
    assert added_doc.document_type == DocumentType.GOOGLE_DRIVE_FILE
    assert added_doc.content == "Pending..."
    assert DocumentStatus.is_state(added_doc.status, DocumentStatus.PENDING)
    assert added_doc.search_space_id == 1
    assert added_doc.connector_id == 42
    session.commit.assert_awaited_once()
async def test_existing_documents_are_skipped():
    """Placeholders already present (by unique_identifier_hash) are not re-created."""
    already_indexed = _make_placeholder(unique_id="already-there")
    fresh = _make_placeholder(unique_id="brand-new")
    session = _session_with_existing_hashes({_uid_hash(already_indexed)})
    service = IndexingPipelineService(session)

    created = await service.create_placeholder_documents([already_indexed, fresh])

    assert created == 1
    # Only the new placeholder should have reached session.add.
    (added_doc,), _kwargs = session.add.call_args
    assert added_doc.unique_identifier_hash == _uid_hash(fresh)
async def test_duplicate_unique_ids_within_input_are_deduped():
    """Two placeholders sharing a unique_id collapse to a single document."""
    first = _make_placeholder(unique_id="dup-id", title="First")
    second = _make_placeholder(unique_id="dup-id", title="Second")
    session = _session_with_existing_hashes(set())
    service = IndexingPipelineService(session)

    created = await service.create_placeholder_documents([first, second])

    assert created == 1
    session.add.assert_called_once()
async def test_integrity_error_on_commit_returns_zero():
    """A commit-time IntegrityError (lost race) rolls back and reports zero created."""
    session = _session_with_existing_hashes(set())
    session.commit = AsyncMock(side_effect=IntegrityError("dup", {}, None))
    service = IndexingPipelineService(session)

    created = await service.create_placeholder_documents([_make_placeholder()])

    assert created == 0
    session.rollback.assert_awaited_once()

View file

@ -19,9 +19,7 @@ def pipeline(mock_session):
return IndexingPipelineService(mock_session)
async def test_calls_prepare_then_index_per_document(
pipeline, make_connector_document
):
async def test_calls_prepare_then_index_per_document(pipeline, make_connector_document):
"""index_batch calls prepare_for_indexing, then index() for each returned doc."""
doc1 = make_connector_document(
document_type=DocumentType.GOOGLE_GMAIL_CONNECTOR,

View file

@ -1,5 +1,5 @@
import asyncio
from unittest.mock import AsyncMock, MagicMock, patch
from unittest.mock import AsyncMock, MagicMock
import pytest
@ -57,7 +57,9 @@ async def test_index_calls_embed_and_chunk_via_to_thread(
"app.indexing_pipeline.indexing_pipeline_service.chunk_text",
mock_chunk,
)
mock_embed = MagicMock(side_effect=lambda texts: [[0.1] * _EMBEDDING_DIM for _ in texts])
mock_embed = MagicMock(
side_effect=lambda texts: [[0.1] * _EMBEDDING_DIM for _ in texts]
)
mock_embed.__name__ = "embed_texts"
monkeypatch.setattr(
"app.indexing_pipeline.indexing_pipeline_service.embed_texts",

View file

@ -0,0 +1,110 @@
"""Unit tests for the duplicate-content safety logic in prepare_for_indexing.
Verifies that when an existing document's updated content matches another
document's content_hash, the system marks it as failed (for placeholders)
or leaves it untouched (for ready documents) — it never deletes.
"""
from __future__ import annotations
from unittest.mock import AsyncMock, MagicMock
import pytest
from app.db import Document, DocumentStatus, DocumentType
from app.indexing_pipeline.connector_document import ConnectorDocument
from app.indexing_pipeline.document_hashing import (
compute_unique_identifier_hash,
)
from app.indexing_pipeline.indexing_pipeline_service import IndexingPipelineService
pytestmark = pytest.mark.unit
# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------
def _make_connector_doc(**overrides) -> ConnectorDocument:
    """Build a ConnectorDocument with test defaults; keyword args override any field."""
    base = {
        "title": "Test Doc",
        "source_markdown": "## Some new content",
        "unique_id": "file-001",
        "document_type": DocumentType.GOOGLE_DRIVE_FILE,
        "search_space_id": 1,
        "connector_id": 42,
        "created_by_id": "00000000-0000-0000-0000-000000000001",
    }
    # Right-hand operand of | wins, so overrides take precedence over defaults.
    return ConnectorDocument(**(base | overrides))
def _make_existing_doc(connector_doc: ConnectorDocument, *, status: dict) -> MagicMock:
    """Fake an ORM Document whose identity hash matches *connector_doc*.

    The content_hash is a stale placeholder value so the updated content
    always differs from what is stored; *status* sets the document state.
    """
    orm_doc = MagicMock(spec=Document)
    orm_doc.id = 999
    orm_doc.status = status
    orm_doc.title = connector_doc.title
    orm_doc.content_hash = "old-placeholder-content-hash"
    orm_doc.unique_identifier_hash = compute_unique_identifier_hash(connector_doc)
    return orm_doc
def _mock_session_for_dedup(existing_doc, *, has_duplicate: bool):
"""Build a session whose sequential execute() calls return:
1. The *existing_doc* for the unique_identifier_hash lookup.
2. A row (or None) for the duplicate content_hash check.
"""
session = AsyncMock()
existing_result = MagicMock()
existing_result.scalars.return_value.first.return_value = existing_doc
dup_result = MagicMock()
dup_result.scalars.return_value.first.return_value = 42 if has_duplicate else None
session.execute = AsyncMock(side_effect=[existing_result, dup_result])
session.add = MagicMock()
return session
# ---------------------------------------------------------------------------
# Tests
# ---------------------------------------------------------------------------
async def test_pending_placeholder_with_duplicate_content_is_marked_failed():
    """A pending placeholder whose new content duplicates another doc is
    flipped to FAILED with a reason — never deleted."""
    cdoc = _make_connector_doc(source_markdown="## Shared content")
    placeholder = _make_existing_doc(cdoc, status=DocumentStatus.pending())
    session = _mock_session_for_dedup(placeholder, has_duplicate=True)
    service = IndexingPipelineService(session)

    prepared = await service.prepare_for_indexing([cdoc])

    assert prepared == [], "duplicate should not be returned for indexing"
    assert DocumentStatus.is_state(placeholder.status, DocumentStatus.FAILED)
    assert "Duplicate content" in placeholder.status.get("reason", "")
    session.delete.assert_not_called()
async def test_ready_document_with_duplicate_content_is_left_untouched():
    """A READY document whose new content duplicates another doc stays READY —
    neither failed nor deleted."""
    cdoc = _make_connector_doc(source_markdown="## Shared content")
    ready_doc = _make_existing_doc(cdoc, status=DocumentStatus.ready())
    session = _mock_session_for_dedup(ready_doc, has_duplicate=True)
    service = IndexingPipelineService(session)

    prepared = await service.prepare_for_indexing([cdoc])

    assert prepared == [], "duplicate should not be returned for indexing"
    assert DocumentStatus.is_state(ready_doc.status, DocumentStatus.READY)
    session.delete.assert_not_called()