feat: implement page limit estimation and enforcement in file-based connector indexers

- Added a static method `estimate_pages_from_metadata` to `PageLimitService` for estimating page counts based on file metadata.
- Integrated page limit checks in Google Drive, Dropbox, and OneDrive indexers to prevent exceeding user quotas during file indexing.
- Updated relevant indexing methods to utilize the new page estimation logic and enforce limits accordingly.
- Enhanced tests for page limit functionality, ensuring accurate estimation and enforcement across different file types.
This commit is contained in:
Anish Sarkar 2026-04-04 02:51:28 +05:30
parent c1c4c534c0
commit ce40da80ea
8 changed files with 1041 additions and 157 deletions

View file

@ -3,7 +3,7 @@ Service for managing user page limits for ETL services.
"""
import os
from pathlib import Path
from pathlib import Path, PurePosixPath
from sqlalchemy import select
from sqlalchemy.ext.asyncio import AsyncSession
@ -223,10 +223,91 @@ class PageLimitService:
# Estimate ~2000 characters per page
return max(1, content_length // 2000)
@staticmethod
def estimate_pages_from_metadata(
file_name_or_ext: str, file_size: int | str | None = None
) -> int:
"""Size-based page estimation from file name/extension and byte size.
Pure function no file I/O, no database access. Used by cloud
connectors (which only have API metadata) and as the internal
fallback for :meth:`estimate_pages_before_processing`.
``file_name_or_ext`` can be a full filename (``"report.pdf"``) or
a bare extension (``".pdf"``). ``file_size`` may be an int, a
stringified int from a cloud API, or *None*.
"""
if file_size is not None:
try:
file_size = int(file_size)
except (ValueError, TypeError):
file_size = 0
else:
file_size = 0
if file_size <= 0:
return 1
ext = PurePosixPath(file_name_or_ext).suffix.lower() if file_name_or_ext else ""
if not ext and file_name_or_ext.startswith("."):
ext = file_name_or_ext.lower()
file_ext = ext
if file_ext == ".pdf":
return max(1, file_size // (100 * 1024))
if file_ext in {
".doc", ".docx", ".docm", ".dot", ".dotm",
".odt", ".ott", ".sxw", ".stw", ".uot",
".rtf", ".pages", ".wpd", ".wps",
".abw", ".zabw", ".cwk", ".hwp", ".lwp",
".mcw", ".mw", ".sdw", ".vor",
}:
return max(1, file_size // (50 * 1024))
if file_ext in {
".ppt", ".pptx", ".pptm", ".pot", ".potx",
".odp", ".otp", ".sxi", ".sti", ".uop",
".key", ".sda", ".sdd", ".sdp",
}:
return max(1, file_size // (200 * 1024))
if file_ext in {
".xls", ".xlsx", ".xlsm", ".xlsb", ".xlw", ".xlr",
".ods", ".ots", ".fods", ".numbers",
".123", ".wk1", ".wk2", ".wk3", ".wk4", ".wks",
".wb1", ".wb2", ".wb3", ".wq1", ".wq2",
".csv", ".tsv", ".slk", ".sylk", ".dif", ".dbf",
".prn", ".qpw", ".602", ".et", ".eth",
}:
return max(1, file_size // (100 * 1024))
if file_ext in {".epub"}:
return max(1, file_size // (50 * 1024))
if file_ext in {".txt", ".log", ".md", ".markdown", ".htm", ".html", ".xml"}:
return max(1, file_size // 3000)
if file_ext in {
".jpg", ".jpeg", ".png", ".gif", ".bmp", ".tiff",
".webp", ".svg", ".cgm", ".odg", ".pbd",
}:
return 1
if file_ext in {".mp3", ".m4a", ".wav", ".mpga"}:
return max(1, file_size // (1024 * 1024))
if file_ext in {".mp4", ".mpeg", ".webm"}:
return max(1, file_size // (5 * 1024 * 1024))
return max(1, file_size // (80 * 1024))
def estimate_pages_before_processing(self, file_path: str) -> int:
"""
Estimate page count from file before processing (to avoid unnecessary API calls).
This is called BEFORE sending to ETL services to prevent cost on rejected files.
Estimate page count from a local file before processing.
For PDFs, attempts to read the actual page count via pypdf.
For everything else, delegates to :meth:`estimate_pages_from_metadata`.
Args:
file_path: Path to the file
@ -240,7 +321,6 @@ class PageLimitService:
file_ext = Path(file_path).suffix.lower()
file_size = os.path.getsize(file_path)
# PDF files - try to get actual page count
if file_ext == ".pdf":
try:
import pypdf
@ -249,153 +329,6 @@ class PageLimitService:
pdf_reader = pypdf.PdfReader(f)
return len(pdf_reader.pages)
except Exception:
# If PDF reading fails, fall back to size estimation
# Typical PDF: ~100KB per page (conservative estimate)
return max(1, file_size // (100 * 1024))
pass # fall through to size-based estimation
# Word Processing Documents
# Microsoft Word, LibreOffice Writer, WordPerfect, Pages, etc.
elif file_ext in [
".doc",
".docx",
".docm",
".dot",
".dotm", # Microsoft Word
".odt",
".ott",
".sxw",
".stw",
".uot", # OpenDocument/StarOffice Writer
".rtf", # Rich Text Format
".pages", # Apple Pages
".wpd",
".wps", # WordPerfect, Microsoft Works
".abw",
".zabw", # AbiWord
".cwk",
".hwp",
".lwp",
".mcw",
".mw",
".sdw",
".vor", # Other word processors
]:
# Typical word document: ~50KB per page (conservative)
return max(1, file_size // (50 * 1024))
# Presentation Documents
# PowerPoint, Impress, Keynote, etc.
elif file_ext in [
".ppt",
".pptx",
".pptm",
".pot",
".potx", # Microsoft PowerPoint
".odp",
".otp",
".sxi",
".sti",
".uop", # OpenDocument/StarOffice Impress
".key", # Apple Keynote
".sda",
".sdd",
".sdp", # StarOffice Draw/Impress
]:
# Typical presentation: ~200KB per slide (conservative)
return max(1, file_size // (200 * 1024))
# Spreadsheet Documents
# Excel, Calc, Numbers, Lotus, etc.
elif file_ext in [
".xls",
".xlsx",
".xlsm",
".xlsb",
".xlw",
".xlr", # Microsoft Excel
".ods",
".ots",
".fods", # OpenDocument Spreadsheet
".numbers", # Apple Numbers
".123",
".wk1",
".wk2",
".wk3",
".wk4",
".wks", # Lotus 1-2-3
".wb1",
".wb2",
".wb3",
".wq1",
".wq2", # Quattro Pro
".csv",
".tsv",
".slk",
".sylk",
".dif",
".dbf",
".prn",
".qpw", # Data formats
".602",
".et",
".eth", # Other spreadsheets
]:
# Spreadsheets typically have 1 sheet = 1 page for ETL
# Conservative: ~100KB per sheet
return max(1, file_size // (100 * 1024))
# E-books
elif file_ext in [".epub"]:
# E-books vary widely, estimate by size
# Typical e-book: ~50KB per page
return max(1, file_size // (50 * 1024))
# Plain Text and Markup Files
elif file_ext in [
".txt",
".log", # Plain text
".md",
".markdown", # Markdown
".htm",
".html",
".xml", # Markup
]:
# Plain text: ~3000 bytes per page
return max(1, file_size // 3000)
# Image Files
# Each image is typically processed as 1 page
elif file_ext in [
".jpg",
".jpeg", # JPEG
".png", # PNG
".gif", # GIF
".bmp", # Bitmap
".tiff", # TIFF
".webp", # WebP
".svg", # SVG
".cgm", # Computer Graphics Metafile
".odg",
".pbd", # OpenDocument Graphics
]:
# Each image = 1 page
return 1
# Audio Files (transcription = typically 1 page per minute)
# Note: These should be handled by audio transcription flow, not ETL
elif file_ext in [".mp3", ".m4a", ".wav", ".mpga"]:
# Audio files: estimate based on duration
# Fallback: ~1MB per minute of audio, 1 page per minute transcript
return max(1, file_size // (1024 * 1024))
# Video Files (typically not processed for pages, but just in case)
elif file_ext in [".mp4", ".mpeg", ".webm"]:
# Video files: very rough estimate
# Typically wouldn't be page-based, but use conservative estimate
return max(1, file_size // (5 * 1024 * 1024))
# Other/Unknown Document Types
else:
# Conservative estimate: ~80KB per page
# This catches: .sgl, .sxg, .uof, .uos1, .uos2, .web, and any future formats
return max(1, file_size // (80 * 1024))
return self.estimate_pages_from_metadata(file_ext, file_size)

View file

@ -4,7 +4,6 @@ Base functionality and shared imports for connector indexers.
import logging
from datetime import UTC, datetime, timedelta
from sqlalchemy.ext.asyncio import AsyncSession
from sqlalchemy.future import select

View file

@ -28,6 +28,7 @@ from app.indexing_pipeline.connector_document import ConnectorDocument
from app.indexing_pipeline.document_hashing import compute_identifier_hash
from app.indexing_pipeline.indexing_pipeline_service import IndexingPipelineService
from app.services.llm_service import get_user_long_context_llm
from app.services.page_limit_service import PageLimitService
from app.services.task_logging_service import TaskLoggingService
from app.tasks.connector_indexers.base import (
check_document_by_unique_identifier,
@ -278,6 +279,12 @@ async def _index_full_scan(
},
)
page_limit_service = PageLimitService(session)
pages_used, pages_limit = await page_limit_service.get_page_usage(user_id)
remaining_quota = pages_limit - pages_used
batch_estimated_pages = 0
page_limit_reached = False
renamed_count = 0
skipped = 0
files_to_download: list[dict] = []
@ -307,6 +314,21 @@ async def _index_full_scan(
elif skip_item(file):
skipped += 1
continue
file_pages = PageLimitService.estimate_pages_from_metadata(
file.get("name", ""), file.get("size")
)
if batch_estimated_pages + file_pages > remaining_quota:
if not page_limit_reached:
logger.warning(
"Page limit reached during Dropbox full scan, "
"skipping remaining files"
)
page_limit_reached = True
skipped += 1
continue
batch_estimated_pages += file_pages
files_to_download.append(file)
batch_indexed, failed = await _download_and_index(
@ -320,6 +342,14 @@ async def _index_full_scan(
on_heartbeat=on_heartbeat_callback,
)
if batch_indexed > 0 and files_to_download and batch_estimated_pages > 0:
pages_to_deduct = max(
1, batch_estimated_pages * batch_indexed // len(files_to_download)
)
await page_limit_service.update_page_usage(
user_id, pages_to_deduct, allow_exceed=True
)
indexed = renamed_count + batch_indexed
logger.info(
f"Full scan complete: {indexed} indexed, {skipped} skipped, {failed} failed"
@ -340,6 +370,11 @@ async def _index_selected_files(
on_heartbeat: HeartbeatCallbackType | None = None,
) -> tuple[int, int, list[str]]:
"""Index user-selected files using the parallel pipeline."""
page_limit_service = PageLimitService(session)
pages_used, pages_limit = await page_limit_service.get_page_usage(user_id)
remaining_quota = pages_limit - pages_used
batch_estimated_pages = 0
files_to_download: list[dict] = []
errors: list[str] = []
renamed_count = 0
@ -364,6 +399,15 @@ async def _index_selected_files(
skipped += 1
continue
file_pages = PageLimitService.estimate_pages_from_metadata(
file.get("name", ""), file.get("size")
)
if batch_estimated_pages + file_pages > remaining_quota:
display = file_name or file_path
errors.append(f"File '{display}': page limit would be exceeded")
continue
batch_estimated_pages += file_pages
files_to_download.append(file)
batch_indexed, _failed = await _download_and_index(
@ -377,6 +421,14 @@ async def _index_selected_files(
on_heartbeat=on_heartbeat,
)
if batch_indexed > 0 and files_to_download and batch_estimated_pages > 0:
pages_to_deduct = max(
1, batch_estimated_pages * batch_indexed // len(files_to_download)
)
await page_limit_service.update_page_usage(
user_id, pages_to_deduct, allow_exceed=True
)
return renamed_count + batch_indexed, skipped, errors

View file

@ -34,6 +34,7 @@ from app.indexing_pipeline.indexing_pipeline_service import (
PlaceholderInfo,
)
from app.services.llm_service import get_user_long_context_llm
from app.services.page_limit_service import PageLimitService
from app.services.task_logging_service import TaskLoggingService
from app.tasks.connector_indexers.base import (
check_document_by_unique_identifier,
@ -327,6 +328,12 @@ async def _process_single_file(
return 1, 0, 0
return 0, 1, 0
page_limit_service = PageLimitService(session)
estimated_pages = PageLimitService.estimate_pages_from_metadata(
file_name, file.get("size")
)
await page_limit_service.check_page_limit(user_id, estimated_pages)
markdown, drive_metadata, error = await download_and_extract_content(
drive_client, file
)
@ -363,6 +370,9 @@ async def _process_single_file(
)
await pipeline.index(document, connector_doc, user_llm)
await page_limit_service.update_page_usage(
user_id, estimated_pages, allow_exceed=True
)
logger.info(f"Successfully indexed Google Drive file: {file_name}")
return 1, 0, 0
@ -466,6 +476,11 @@ async def _index_selected_files(
Returns (indexed_count, skipped_count, errors).
"""
page_limit_service = PageLimitService(session)
pages_used, pages_limit = await page_limit_service.get_page_usage(user_id)
remaining_quota = pages_limit - pages_used
batch_estimated_pages = 0
files_to_download: list[dict] = []
errors: list[str] = []
renamed_count = 0
@ -486,6 +501,15 @@ async def _index_selected_files(
skipped += 1
continue
file_pages = PageLimitService.estimate_pages_from_metadata(
file.get("name", ""), file.get("size")
)
if batch_estimated_pages + file_pages > remaining_quota:
display = file_name or file_id
errors.append(f"File '{display}': page limit would be exceeded")
continue
batch_estimated_pages += file_pages
files_to_download.append(file)
await _create_drive_placeholders(
@ -507,6 +531,14 @@ async def _index_selected_files(
on_heartbeat=on_heartbeat,
)
if batch_indexed > 0 and files_to_download and batch_estimated_pages > 0:
pages_to_deduct = max(
1, batch_estimated_pages * batch_indexed // len(files_to_download)
)
await page_limit_service.update_page_usage(
user_id, pages_to_deduct, allow_exceed=True
)
return renamed_count + batch_indexed, skipped, errors
@ -545,6 +577,12 @@ async def _index_full_scan(
# ------------------------------------------------------------------
# Phase 1 (serial): collect files, run skip checks, track renames
# ------------------------------------------------------------------
page_limit_service = PageLimitService(session)
pages_used, pages_limit = await page_limit_service.get_page_usage(user_id)
remaining_quota = pages_limit - pages_used
batch_estimated_pages = 0
page_limit_reached = False
renamed_count = 0
skipped = 0
files_processed = 0
@ -593,6 +631,20 @@ async def _index_full_scan(
skipped += 1
continue
file_pages = PageLimitService.estimate_pages_from_metadata(
file.get("name", ""), file.get("size")
)
if batch_estimated_pages + file_pages > remaining_quota:
if not page_limit_reached:
logger.warning(
"Page limit reached during Google Drive full scan, "
"skipping remaining files"
)
page_limit_reached = True
skipped += 1
continue
batch_estimated_pages += file_pages
files_to_download.append(file)
page_token = next_token
@ -636,6 +688,14 @@ async def _index_full_scan(
on_heartbeat=on_heartbeat_callback,
)
if batch_indexed > 0 and files_to_download and batch_estimated_pages > 0:
pages_to_deduct = max(
1, batch_estimated_pages * batch_indexed // len(files_to_download)
)
await page_limit_service.update_page_usage(
user_id, pages_to_deduct, allow_exceed=True
)
indexed = renamed_count + batch_indexed
logger.info(
f"Full scan complete: {indexed} indexed, {skipped} skipped, {failed} failed"
@ -686,6 +746,12 @@ async def _index_with_delta_sync(
# ------------------------------------------------------------------
# Phase 1 (serial): handle removals, collect files for download
# ------------------------------------------------------------------
page_limit_service = PageLimitService(session)
pages_used, pages_limit = await page_limit_service.get_page_usage(user_id)
remaining_quota = pages_limit - pages_used
batch_estimated_pages = 0
page_limit_reached = False
renamed_count = 0
skipped = 0
files_to_download: list[dict] = []
@ -715,6 +781,20 @@ async def _index_with_delta_sync(
skipped += 1
continue
file_pages = PageLimitService.estimate_pages_from_metadata(
file.get("name", ""), file.get("size")
)
if batch_estimated_pages + file_pages > remaining_quota:
if not page_limit_reached:
logger.warning(
"Page limit reached during Google Drive delta sync, "
"skipping remaining files"
)
page_limit_reached = True
skipped += 1
continue
batch_estimated_pages += file_pages
files_to_download.append(file)
# ------------------------------------------------------------------
@ -742,6 +822,14 @@ async def _index_with_delta_sync(
on_heartbeat=on_heartbeat_callback,
)
if batch_indexed > 0 and files_to_download and batch_estimated_pages > 0:
pages_to_deduct = max(
1, batch_estimated_pages * batch_indexed // len(files_to_download)
)
await page_limit_service.update_page_usage(
user_id, pages_to_deduct, allow_exceed=True
)
indexed = renamed_count + batch_indexed
logger.info(
f"Delta sync complete: {indexed} indexed, {skipped} skipped, {failed} failed"

View file

@ -28,6 +28,7 @@ from app.indexing_pipeline.connector_document import ConnectorDocument
from app.indexing_pipeline.document_hashing import compute_identifier_hash
from app.indexing_pipeline.indexing_pipeline_service import IndexingPipelineService
from app.services.llm_service import get_user_long_context_llm
from app.services.page_limit_service import PageLimitService
from app.services.task_logging_service import TaskLoggingService
from app.tasks.connector_indexers.base import (
check_document_by_unique_identifier,
@ -291,6 +292,11 @@ async def _index_selected_files(
on_heartbeat: HeartbeatCallbackType | None = None,
) -> tuple[int, int, list[str]]:
"""Index user-selected files using the parallel pipeline."""
page_limit_service = PageLimitService(session)
pages_used, pages_limit = await page_limit_service.get_page_usage(user_id)
remaining_quota = pages_limit - pages_used
batch_estimated_pages = 0
files_to_download: list[dict] = []
errors: list[str] = []
renamed_count = 0
@ -311,6 +317,15 @@ async def _index_selected_files(
skipped += 1
continue
file_pages = PageLimitService.estimate_pages_from_metadata(
file.get("name", ""), file.get("size")
)
if batch_estimated_pages + file_pages > remaining_quota:
display = file_name or file_id
errors.append(f"File '{display}': page limit would be exceeded")
continue
batch_estimated_pages += file_pages
files_to_download.append(file)
batch_indexed, _failed = await _download_and_index(
@ -324,6 +339,14 @@ async def _index_selected_files(
on_heartbeat=on_heartbeat,
)
if batch_indexed > 0 and files_to_download and batch_estimated_pages > 0:
pages_to_deduct = max(
1, batch_estimated_pages * batch_indexed // len(files_to_download)
)
await page_limit_service.update_page_usage(
user_id, pages_to_deduct, allow_exceed=True
)
return renamed_count + batch_indexed, skipped, errors
@ -358,6 +381,12 @@ async def _index_full_scan(
},
)
page_limit_service = PageLimitService(session)
pages_used, pages_limit = await page_limit_service.get_page_usage(user_id)
remaining_quota = pages_limit - pages_used
batch_estimated_pages = 0
page_limit_reached = False
renamed_count = 0
skipped = 0
files_to_download: list[dict] = []
@ -383,6 +412,21 @@ async def _index_full_scan(
else:
skipped += 1
continue
file_pages = PageLimitService.estimate_pages_from_metadata(
file.get("name", ""), file.get("size")
)
if batch_estimated_pages + file_pages > remaining_quota:
if not page_limit_reached:
logger.warning(
"Page limit reached during OneDrive full scan, "
"skipping remaining files"
)
page_limit_reached = True
skipped += 1
continue
batch_estimated_pages += file_pages
files_to_download.append(file)
batch_indexed, failed = await _download_and_index(
@ -396,6 +440,14 @@ async def _index_full_scan(
on_heartbeat=on_heartbeat_callback,
)
if batch_indexed > 0 and files_to_download and batch_estimated_pages > 0:
pages_to_deduct = max(
1, batch_estimated_pages * batch_indexed // len(files_to_download)
)
await page_limit_service.update_page_usage(
user_id, pages_to_deduct, allow_exceed=True
)
indexed = renamed_count + batch_indexed
logger.info(
f"Full scan complete: {indexed} indexed, {skipped} skipped, {failed} failed"
@ -441,6 +493,12 @@ async def _index_with_delta_sync(
logger.info(f"Processing {len(changes)} delta changes")
page_limit_service = PageLimitService(session)
pages_used, pages_limit = await page_limit_service.get_page_usage(user_id)
remaining_quota = pages_limit - pages_used
batch_estimated_pages = 0
page_limit_reached = False
renamed_count = 0
skipped = 0
files_to_download: list[dict] = []
@ -471,6 +529,20 @@ async def _index_with_delta_sync(
skipped += 1
continue
file_pages = PageLimitService.estimate_pages_from_metadata(
change.get("name", ""), change.get("size")
)
if batch_estimated_pages + file_pages > remaining_quota:
if not page_limit_reached:
logger.warning(
"Page limit reached during OneDrive delta sync, "
"skipping remaining files"
)
page_limit_reached = True
skipped += 1
continue
batch_estimated_pages += file_pages
files_to_download.append(change)
batch_indexed, failed = await _download_and_index(
@ -484,6 +556,14 @@ async def _index_with_delta_sync(
on_heartbeat=on_heartbeat_callback,
)
if batch_indexed > 0 and files_to_download and batch_estimated_pages > 0:
pages_to_deduct = max(
1, batch_estimated_pages * batch_indexed // len(files_to_download)
)
await page_limit_service.update_page_usage(
user_id, pages_to_deduct, allow_exceed=True
)
indexed = renamed_count + batch_indexed
logger.info(
f"Delta sync complete: {indexed} indexed, {skipped} skipped, {failed} failed"