feat: implement page limit estimation and enforcement in file-based connector indexers

- Added a static method `estimate_pages_from_metadata` to `PageLimitService` for estimating page counts based on file metadata.
- Integrated page limit checks in Google Drive, Dropbox, and OneDrive indexers to prevent exceeding user quotas during file indexing.
- Updated relevant indexing methods to utilize the new page estimation logic and enforce limits accordingly.
- Enhanced tests for page limit functionality, ensuring accurate estimation and enforcement across different file types.
This commit is contained in:
Anish Sarkar 2026-04-04 02:51:28 +05:30
parent c1c4c534c0
commit ce40da80ea
8 changed files with 1041 additions and 157 deletions

View file

@ -3,6 +3,7 @@
Prerequisites: PostgreSQL + pgvector only.
External system boundaries are mocked:
- ETL parsing: LlamaParse (external API) and Docling (heavy library)
- LLM summarization, text embedding, text chunking (external APIs)
- Redis heartbeat (external infrastructure)
- Task dispatch is swapped via DI (InlineTaskDispatcher)
@ -11,6 +12,7 @@ External system boundaries are mocked:
from __future__ import annotations
import contextlib
import os
from collections.abc import AsyncGenerator
from unittest.mock import AsyncMock, MagicMock
@ -298,3 +300,64 @@ def _mock_redis_heartbeat(monkeypatch):
"app.tasks.celery_tasks.document_tasks._run_heartbeat_loop",
AsyncMock(),
)
@pytest.fixture(autouse=True)
def _mock_etl_parsing(monkeypatch):
    """Swap the ETL parsing entry points for lightweight in-memory fakes.

    LlamaParse (external API) and Docling (heavy library) are treated as
    external boundaries, so three attributes are patched for every test:

    - ``parse_with_llamacloud_retry`` in the file processors module
    - ``parse_with_docling`` in the file processors module
    - ``docling.document_converter.DocumentConverter``

    Every fake yields the same canned markdown. Before doing so it checks
    the size of the file it was given and raises ``RuntimeError`` for a
    zero-byte file, so tests that cover failure paths still observe an
    error instead of a silent success.
    """
    canned_markdown = "# Mocked Document\n\nThis is mocked ETL content."

    def _ensure_not_empty(file_path: str) -> None:
        # Shared guard: every fake below rejects zero-byte input up front.
        size_in_bytes = os.path.getsize(file_path)
        if size_in_bytes == 0:
            raise RuntimeError(f"Cannot parse empty file: {file_path}")

    # -- LlamaParse fakes (external API) --------------------------------

    class _StubMarkdownDoc:
        """Minimal stand-in exposing only the ``text`` attribute."""

        def __init__(self, text: str):
            self.text = text

    class _StubLlamaParseResult:
        """Result object whose async accessor returns one canned document."""

        async def aget_markdown_documents(self, *, split_by_page=False):
            # ``split_by_page`` is accepted but ignored: always one document.
            documents = [_StubMarkdownDoc(canned_markdown)]
            return documents

    async def _stub_llamacloud_parse(**kwargs):
        # Accepts arbitrary keyword arguments; only ``file_path`` is read.
        _ensure_not_empty(kwargs["file_path"])
        return _StubLlamaParseResult()

    # -- Docling fakes (heavy library boundary) -------------------------

    async def _stub_docling_parse(file_path: str, filename: str):
        # ``filename`` is part of the signature but is not used here.
        _ensure_not_empty(file_path)
        return canned_markdown

    class _StubDoclingResult:
        """Conversion result exposing ``document.export_to_markdown()``."""

        class document:
            @staticmethod
            def export_to_markdown():
                return canned_markdown

    class _StubDocumentConverter:
        """Converter whose ``convert`` returns the canned result object."""

        def convert(self, file_path):
            _ensure_not_empty(file_path)
            return _StubDoclingResult()

    # Patches are applied in this fixed order: LlamaParse, Docling helper,
    # then the Docling converter class.
    patch_table = (
        (
            "app.tasks.document_processors.file_processors.parse_with_llamacloud_retry",
            _stub_llamacloud_parse,
        ),
        (
            "app.tasks.document_processors.file_processors.parse_with_docling",
            _stub_docling_parse,
        ),
        (
            "docling.document_converter.DocumentConverter",
            _StubDocumentConverter,
        ),
    )
    for dotted_target, replacement in patch_table:
        monkeypatch.setattr(dotted_target, replacement)