mirror of
https://github.com/MODSetter/SurfSense.git
synced 2026-05-25 19:15:18 +02:00
feat: made agent file system optimized
This commit is contained in:
parent
ee0b59c0fa
commit
2cc2d339e6
67 changed files with 8011 additions and 5591 deletions
|
|
@ -1,15 +1,23 @@
|
|||
import asyncio
|
||||
import contextlib
|
||||
import hashlib
|
||||
import logging
|
||||
import time
|
||||
from collections.abc import Awaitable, Callable
|
||||
from dataclasses import dataclass, field
|
||||
from datetime import UTC, datetime
|
||||
|
||||
from sqlalchemy import delete, select
|
||||
from sqlalchemy.exc import IntegrityError
|
||||
from sqlalchemy.ext.asyncio import AsyncSession
|
||||
|
||||
from app.db import NATIVE_TO_LEGACY_DOCTYPE, Chunk, Document, DocumentStatus
|
||||
from app.db import (
|
||||
NATIVE_TO_LEGACY_DOCTYPE,
|
||||
Chunk,
|
||||
Document,
|
||||
DocumentStatus,
|
||||
DocumentType,
|
||||
)
|
||||
from app.indexing_pipeline.connector_document import ConnectorDocument
|
||||
from app.indexing_pipeline.document_chunker import chunk_text
|
||||
from app.indexing_pipeline.document_embedder import embed_texts
|
||||
|
|
@ -52,12 +60,114 @@ from app.indexing_pipeline.pipeline_logger import (
|
|||
from app.utils.perf import get_perf_logger
|
||||
|
||||
|
||||
@dataclass
class PlaceholderInfo:
    """Bare-minimum description of a document row to insert ahead of indexing.

    A placeholder is written as soon as an item is discovered, before any
    content has been extracted, so that it shows up in the UI (through Zero
    sync) without waiting for the rest of the pipeline.
    """

    # Title stored on the placeholder row.
    title: str
    # Document type; its ``.value`` is one input of the unique-identifier hash.
    document_type: DocumentType
    # Source-side identifier, hashed together with the type and search space.
    unique_id: str
    # Search space the placeholder row is created in.
    search_space_id: int
    # Connector the row is attributed to; ``None`` is accepted.
    connector_id: int | None
    # Stored as the row's ``created_by_id``.
    created_by_id: str
    # Stored as the row's ``document_metadata``; every instance gets its own
    # empty dict unless one is supplied.
    metadata: dict = field(default_factory=dict)
|
||||
|
||||
|
||||
class IndexingPipelineService:
|
||||
"""Single pipeline for indexing connector documents. All connectors use this service."""
|
||||
|
||||
def __init__(self, session: AsyncSession) -> None:
|
||||
self.session = session
|
||||
|
||||
async def create_placeholder_documents(
|
||||
self, placeholders: list[PlaceholderInfo]
|
||||
) -> int:
|
||||
"""Create placeholder document rows with pending status for instant UI feedback.
|
||||
|
||||
These rows appear immediately in the UI via Zero sync. They are later
|
||||
updated by prepare_for_indexing() when actual content is available.
|
||||
|
||||
Returns the number of placeholders successfully created.
|
||||
Failures are logged but never block the main indexing flow.
|
||||
|
||||
NOTE: This method commits on ``self.session`` so the rows become
|
||||
visible to Zero sync immediately. Any pending ORM mutations on the
|
||||
session are committed together, which is consistent with how other
|
||||
mid-flow commits work in the indexing codebase (e.g. rename-only
|
||||
updates in ``_should_skip_file``, ``migrate_legacy_docs``).
|
||||
"""
|
||||
if not placeholders:
|
||||
return 0
|
||||
|
||||
_logger = logging.getLogger(__name__)
|
||||
|
||||
uid_hashes: dict[str, PlaceholderInfo] = {}
|
||||
for p in placeholders:
|
||||
try:
|
||||
uid_hash = compute_identifier_hash(
|
||||
p.document_type.value, p.unique_id, p.search_space_id
|
||||
)
|
||||
uid_hashes.setdefault(uid_hash, p)
|
||||
except Exception:
|
||||
_logger.debug(
|
||||
"Skipping placeholder hash for %s", p.unique_id, exc_info=True
|
||||
)
|
||||
|
||||
if not uid_hashes:
|
||||
return 0
|
||||
|
||||
result = await self.session.execute(
|
||||
select(Document.unique_identifier_hash).where(
|
||||
Document.unique_identifier_hash.in_(list(uid_hashes.keys()))
|
||||
)
|
||||
)
|
||||
existing_hashes: set[str] = set(result.scalars().all())
|
||||
|
||||
created = 0
|
||||
for uid_hash, p in uid_hashes.items():
|
||||
if uid_hash in existing_hashes:
|
||||
continue
|
||||
try:
|
||||
content_hash = hashlib.sha256(
|
||||
f"placeholder:{uid_hash}".encode()
|
||||
).hexdigest()
|
||||
|
||||
document = Document(
|
||||
title=p.title,
|
||||
document_type=p.document_type,
|
||||
content="Pending...",
|
||||
content_hash=content_hash,
|
||||
unique_identifier_hash=uid_hash,
|
||||
document_metadata=p.metadata or {},
|
||||
search_space_id=p.search_space_id,
|
||||
connector_id=p.connector_id,
|
||||
created_by_id=p.created_by_id,
|
||||
updated_at=datetime.now(UTC),
|
||||
status=DocumentStatus.pending(),
|
||||
)
|
||||
self.session.add(document)
|
||||
created += 1
|
||||
except Exception:
|
||||
_logger.debug("Skipping placeholder for %s", p.unique_id, exc_info=True)
|
||||
|
||||
if created > 0:
|
||||
try:
|
||||
await self.session.commit()
|
||||
_logger.info(
|
||||
"Created %d placeholder document(s) for instant UI feedback",
|
||||
created,
|
||||
)
|
||||
except IntegrityError:
|
||||
await self.session.rollback()
|
||||
_logger.debug("Placeholder commit failed (race condition), continuing")
|
||||
created = 0
|
||||
|
||||
return created
|
||||
|
||||
async def migrate_legacy_docs(
|
||||
self, connector_docs: list[ConnectorDocument]
|
||||
) -> None:
|
||||
|
|
@ -77,9 +187,7 @@ class IndexingPipelineService:
|
|||
legacy_type, doc.unique_id, doc.search_space_id
|
||||
)
|
||||
result = await self.session.execute(
|
||||
select(Document).filter(
|
||||
Document.unique_identifier_hash == legacy_hash
|
||||
)
|
||||
select(Document).filter(Document.unique_identifier_hash == legacy_hash)
|
||||
)
|
||||
existing = result.scalars().first()
|
||||
if existing is None:
|
||||
|
|
@ -101,9 +209,7 @@ class IndexingPipelineService:
|
|||
Indexers that need heartbeat callbacks or custom per-document logic
|
||||
should call prepare_for_indexing() + index() directly instead.
|
||||
"""
|
||||
doc_map = {
|
||||
compute_unique_identifier_hash(cd): cd for cd in connector_docs
|
||||
}
|
||||
doc_map = {compute_unique_identifier_hash(cd): cd for cd in connector_docs}
|
||||
documents = await self.prepare_for_indexing(connector_docs)
|
||||
results: list[Document] = []
|
||||
for document in documents:
|
||||
|
|
@ -166,6 +272,21 @@ class IndexingPipelineService:
|
|||
log_document_requeued(ctx)
|
||||
continue
|
||||
|
||||
dup_check = await self.session.execute(
|
||||
select(Document.id).filter(
|
||||
Document.content_hash == content_hash,
|
||||
Document.id != existing.id,
|
||||
)
|
||||
)
|
||||
if dup_check.scalars().first() is not None:
|
||||
if not DocumentStatus.is_state(
|
||||
existing.status, DocumentStatus.READY
|
||||
):
|
||||
existing.status = DocumentStatus.failed(
|
||||
"Duplicate content — already indexed by another document"
|
||||
)
|
||||
continue
|
||||
|
||||
existing.title = connector_doc.title
|
||||
existing.content_hash = content_hash
|
||||
existing.source_markdown = connector_doc.source_markdown
|
||||
|
|
@ -349,9 +470,7 @@ class IndexingPipelineService:
|
|||
perf = get_perf_logger()
|
||||
t_total = time.perf_counter()
|
||||
|
||||
doc_map = {
|
||||
compute_unique_identifier_hash(cd): cd for cd in connector_docs
|
||||
}
|
||||
doc_map = {compute_unique_identifier_hash(cd): cd for cd in connector_docs}
|
||||
documents = await self.prepare_for_indexing(connector_docs)
|
||||
|
||||
if not documents:
|
||||
|
|
@ -383,9 +502,7 @@ class IndexingPipelineService:
|
|||
session_maker = get_celery_session_maker()
|
||||
async with session_maker() as isolated_session:
|
||||
try:
|
||||
refetched = await isolated_session.get(
|
||||
Document, document.id
|
||||
)
|
||||
refetched = await isolated_session.get(Document, document.id)
|
||||
if refetched is None:
|
||||
async with lock:
|
||||
failed_count += 1
|
||||
|
|
@ -393,9 +510,7 @@ class IndexingPipelineService:
|
|||
|
||||
llm = await get_llm(isolated_session)
|
||||
iso_pipeline = IndexingPipelineService(isolated_session)
|
||||
result = await iso_pipeline.index(
|
||||
refetched, connector_doc, llm
|
||||
)
|
||||
result = await iso_pipeline.index(refetched, connector_doc, llm)
|
||||
|
||||
async with lock:
|
||||
if DocumentStatus.is_state(
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue