mirror of
https://github.com/MODSetter/SurfSense.git
synced 2026-04-25 08:46:22 +02:00
feat: implement page limit estimation and enforcement in file-based connector indexers
- Added a static method `estimate_pages_from_metadata` to `PageLimitService` for estimating page counts based on file metadata.
- Integrated page limit checks in Google Drive, Dropbox, and OneDrive indexers to prevent exceeding user quotas during file indexing.
- Updated relevant indexing methods to utilize the new page estimation logic and enforce limits accordingly.
- Enhanced tests for page limit functionality, ensuring accurate estimation and enforcement across different file types.
This commit is contained in:
parent
c1c4c534c0
commit
ce40da80ea
8 changed files with 1041 additions and 157 deletions
|
|
@ -3,6 +3,7 @@
|
|||
Prerequisites: PostgreSQL + pgvector only.
|
||||
|
||||
External system boundaries are mocked:
|
||||
- ETL parsing — LlamaParse (external API) and Docling (heavy library)
|
||||
- LLM summarization, text embedding, text chunking (external APIs)
|
||||
- Redis heartbeat (external infrastructure)
|
||||
- Task dispatch is swapped via DI (InlineTaskDispatcher)
|
||||
|
|
@ -11,6 +12,7 @@ External system boundaries are mocked:
|
|||
from __future__ import annotations
|
||||
|
||||
import contextlib
|
||||
import os
|
||||
from collections.abc import AsyncGenerator
|
||||
from unittest.mock import AsyncMock, MagicMock
|
||||
|
||||
|
|
@ -298,3 +300,64 @@ def _mock_redis_heartbeat(monkeypatch):
|
|||
"app.tasks.celery_tasks.document_tasks._run_heartbeat_loop",
|
||||
AsyncMock(),
|
||||
)
|
||||
|
||||
|
||||
@pytest.fixture(autouse=True)
def _mock_etl_parsing(monkeypatch):
    """Swap the ETL parsers (LlamaParse and Docling) for in-process fakes.

    Every fake hands back the same canned markdown. A zero-byte input file
    makes each fake raise ``RuntimeError``, so tests that cover the failure
    path still observe an error instead of a silent success.
    """
    canned_markdown = "# Mocked Document\n\nThis is mocked ETL content."

    def _reject_empty(file_path: str) -> None:
        """Raise ``RuntimeError`` when the file at *file_path* has size zero."""
        if not os.path.getsize(file_path):
            raise RuntimeError(f"Cannot parse empty file: {file_path}")

    # -- LlamaParse fakes (external API) ------------------------------------

    class _StubMarkdownDoc:
        """Minimal document object exposing only a ``text`` attribute."""

        def __init__(self, text: str):
            self.text = text

    class _StubLlamaParseResult:
        """Parse result whose markdown export is a single canned document."""

        async def aget_markdown_documents(self, *, split_by_page=False):
            # split_by_page is accepted but ignored: one document either way.
            return [_StubMarkdownDoc(canned_markdown)]

    async def _llamacloud_stub(**kwargs):
        # Only the "file_path" keyword is read; other keywords are ignored.
        _reject_empty(kwargs["file_path"])
        return _StubLlamaParseResult()

    # -- Docling fakes (heavy library boundary) -----------------------------

    async def _docling_stub(file_path: str, filename: str):
        _reject_empty(file_path)
        return canned_markdown

    class _StubDoclingResult:
        """Conversion result exposing ``document.export_to_markdown()``."""

        class document:
            @staticmethod
            def export_to_markdown():
                return canned_markdown

    class _StubDocumentConverter:
        """Converter whose ``convert`` rejects empty files, else returns a stub."""

        def convert(self, file_path):
            _reject_empty(file_path)
            return _StubDoclingResult()

    # Apply the patches in a fixed order: LlamaParse first, then the two
    # Docling entry points.
    replacements = (
        (
            "app.tasks.document_processors.file_processors.parse_with_llamacloud_retry",
            _llamacloud_stub,
        ),
        (
            "app.tasks.document_processors.file_processors.parse_with_docling",
            _docling_stub,
        ),
        (
            "docling.document_converter.DocumentConverter",
            _StubDocumentConverter,
        ),
    )
    for target, fake in replacements:
        monkeypatch.setattr(target, fake)
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue