feat: implement page limit estimation and enforcement in file-based connector indexers

- Added a static method `estimate_pages_from_metadata` to `PageLimitService` for estimating page counts from file metadata.
- Integrated page limit checks into the Google Drive, Dropbox, and OneDrive indexers to prevent exceeding user quotas during file indexing.
- Updated the relevant indexing methods to use the new page estimation logic and enforce limits accordingly.
- Enhanced tests for the page limit functionality, ensuring accurate estimation and enforcement across different file types.
2026-06-20 21:18:13 +02:00 · 2026-04-04 02:51:28 +05:30 · 2026-04-04 02:51:28 +05:30 · ce40da80ea
commit ce40da80ea
parent c1c4c534c0
8 changed files with 1041 additions and 157 deletions
--- a/surfsense_backend/app/services/page_limit_service.py
+++ b/surfsense_backend/app/services/page_limit_service.py
@ -3,7 +3,7 @@ Service for managing user page limits for ETL services.
 """

 import os
-from pathlib import Path
+from pathlib import Path, PurePosixPath

 from sqlalchemy import select
 from sqlalchemy.ext.asyncio import AsyncSession
@ -223,10 +223,91 @@ class PageLimitService:
        # Estimate ~2000 characters per page
        return max(1, content_length // 2000)

+    @staticmethod
+    def estimate_pages_from_metadata(
+        file_name_or_ext: str, file_size: int | str | None = None
+    ) -> int:
+        """Size-based page estimation from file name/extension and byte size.
+
+        Pure function — no file I/O, no database access.  Used by cloud
+        connectors (which only have API metadata) and as the internal
+        fallback for :meth:`estimate_pages_before_processing`.
+
+        ``file_name_or_ext`` can be a full filename (``"report.pdf"``) or
+        a bare extension (``".pdf"``).  ``file_size`` may be an int, a
+        stringified int from a cloud API, or *None*.
+        """
+        if file_size is not None:
+            try:
+                file_size = int(file_size)
+            except (ValueError, TypeError):
+                file_size = 0  # unparsable size is treated as unknown
+        else:
+            file_size = 0  # no size reported
+
+        if file_size <= 0:
+            return 1  # no usable size: fall back to the one-page minimum
+
+        ext = PurePosixPath(file_name_or_ext).suffix.lower() if file_name_or_ext else ""
+        if not ext and file_name_or_ext.startswith("."):
+            ext = file_name_or_ext.lower()  # bare ".ext": pathlib reports no suffix
+        file_ext = ext
+
+        if file_ext == ".pdf":
+            return max(1, file_size // (100 * 1024))  # ~100 KiB per PDF page
+
+        if file_ext in {  # word-processing documents
+            ".doc", ".docx", ".docm", ".dot", ".dotm",
+            ".odt", ".ott", ".sxw", ".stw", ".uot",
+            ".rtf", ".pages", ".wpd", ".wps",
+            ".abw", ".zabw", ".cwk", ".hwp", ".lwp",
+            ".mcw", ".mw", ".sdw", ".vor",
+        }:
+            return max(1, file_size // (50 * 1024))  # ~50 KiB per page
+
+        if file_ext in {  # presentations
+            ".ppt", ".pptx", ".pptm", ".pot", ".potx",
+            ".odp", ".otp", ".sxi", ".sti", ".uop",
+            ".key", ".sda", ".sdd", ".sdp",
+        }:
+            return max(1, file_size // (200 * 1024))  # ~200 KiB per slide
+
+        if file_ext in {  # spreadsheets and tabular data formats
+            ".xls", ".xlsx", ".xlsm", ".xlsb", ".xlw", ".xlr",
+            ".ods", ".ots", ".fods", ".numbers",
+            ".123", ".wk1", ".wk2", ".wk3", ".wk4", ".wks",
+            ".wb1", ".wb2", ".wb3", ".wq1", ".wq2",
+            ".csv", ".tsv", ".slk", ".sylk", ".dif", ".dbf",
+            ".prn", ".qpw", ".602", ".et", ".eth",
+        }:
+            return max(1, file_size // (100 * 1024))  # ~100 KiB per sheet
+
+        if file_ext in {".epub"}:
+            return max(1, file_size // (50 * 1024))  # e-books: ~50 KiB per page
+
+        if file_ext in {".txt", ".log", ".md", ".markdown", ".htm", ".html", ".xml"}:
+            return max(1, file_size // 3000)  # text/markup: ~3000 bytes per page
+
+        if file_ext in {  # images and vector graphics
+            ".jpg", ".jpeg", ".png", ".gif", ".bmp", ".tiff",
+            ".webp", ".svg", ".cgm", ".odg", ".pbd",
+        }:
+            return 1  # each image counts as one page regardless of size
+
+        if file_ext in {".mp3", ".m4a", ".wav", ".mpga"}:
+            return max(1, file_size // (1024 * 1024))  # audio: 1 page per MiB
+
+        if file_ext in {".mp4", ".mpeg", ".webm"}:
+            return max(1, file_size // (5 * 1024 * 1024))  # video: 1 page per 5 MiB
+
+        return max(1, file_size // (80 * 1024))  # unknown types: ~80 KiB per page
+
    def estimate_pages_before_processing(self, file_path: str) -> int:
        """
-        Estimate page count from file before processing (to avoid unnecessary API calls).
-        This is called BEFORE sending to ETL services to prevent cost on rejected files.
+        Estimate page count from a local file before processing.
+
+        For PDFs, attempts to read the actual page count via pypdf.
+        For everything else, delegates to :meth:`estimate_pages_from_metadata`.

        Args:
            file_path: Path to the file
@ -240,7 +321,6 @@ class PageLimitService:
        file_ext = Path(file_path).suffix.lower()
        file_size = os.path.getsize(file_path)

-        # PDF files - try to get actual page count
        if file_ext == ".pdf":
            try:
                import pypdf
@ -249,153 +329,6 @@ class PageLimitService:
                    pdf_reader = pypdf.PdfReader(f)
                    return len(pdf_reader.pages)
            except Exception:
-                # If PDF reading fails, fall back to size estimation
-                # Typical PDF: ~100KB per page (conservative estimate)
-                return max(1, file_size // (100 * 1024))
+                pass  # fall through to size-based estimation

-        # Word Processing Documents
-        # Microsoft Word, LibreOffice Writer, WordPerfect, Pages, etc.
-        elif file_ext in [
-            ".doc",
-            ".docx",
-            ".docm",
-            ".dot",
-            ".dotm",  # Microsoft Word
-            ".odt",
-            ".ott",
-            ".sxw",
-            ".stw",
-            ".uot",  # OpenDocument/StarOffice Writer
-            ".rtf",  # Rich Text Format
-            ".pages",  # Apple Pages
-            ".wpd",
-            ".wps",  # WordPerfect, Microsoft Works
-            ".abw",
-            ".zabw",  # AbiWord
-            ".cwk",
-            ".hwp",
-            ".lwp",
-            ".mcw",
-            ".mw",
-            ".sdw",
-            ".vor",  # Other word processors
-        ]:
-            # Typical word document: ~50KB per page (conservative)
-            return max(1, file_size // (50 * 1024))
-
-        # Presentation Documents
-        # PowerPoint, Impress, Keynote, etc.
-        elif file_ext in [
-            ".ppt",
-            ".pptx",
-            ".pptm",
-            ".pot",
-            ".potx",  # Microsoft PowerPoint
-            ".odp",
-            ".otp",
-            ".sxi",
-            ".sti",
-            ".uop",  # OpenDocument/StarOffice Impress
-            ".key",  # Apple Keynote
-            ".sda",
-            ".sdd",
-            ".sdp",  # StarOffice Draw/Impress
-        ]:
-            # Typical presentation: ~200KB per slide (conservative)
-            return max(1, file_size // (200 * 1024))
-
-        # Spreadsheet Documents
-        # Excel, Calc, Numbers, Lotus, etc.
-        elif file_ext in [
-            ".xls",
-            ".xlsx",
-            ".xlsm",
-            ".xlsb",
-            ".xlw",
-            ".xlr",  # Microsoft Excel
-            ".ods",
-            ".ots",
-            ".fods",  # OpenDocument Spreadsheet
-            ".numbers",  # Apple Numbers
-            ".123",
-            ".wk1",
-            ".wk2",
-            ".wk3",
-            ".wk4",
-            ".wks",  # Lotus 1-2-3
-            ".wb1",
-            ".wb2",
-            ".wb3",
-            ".wq1",
-            ".wq2",  # Quattro Pro
-            ".csv",
-            ".tsv",
-            ".slk",
-            ".sylk",
-            ".dif",
-            ".dbf",
-            ".prn",
-            ".qpw",  # Data formats
-            ".602",
-            ".et",
-            ".eth",  # Other spreadsheets
-        ]:
-            # Spreadsheets typically have 1 sheet = 1 page for ETL
-            # Conservative: ~100KB per sheet
-            return max(1, file_size // (100 * 1024))
-
-        # E-books
-        elif file_ext in [".epub"]:
-            # E-books vary widely, estimate by size
-            # Typical e-book: ~50KB per page
-            return max(1, file_size // (50 * 1024))
-
-        # Plain Text and Markup Files
-        elif file_ext in [
-            ".txt",
-            ".log",  # Plain text
-            ".md",
-            ".markdown",  # Markdown
-            ".htm",
-            ".html",
-            ".xml",  # Markup
-        ]:
-            # Plain text: ~3000 bytes per page
-            return max(1, file_size // 3000)
-
-        # Image Files
-        # Each image is typically processed as 1 page
-        elif file_ext in [
-            ".jpg",
-            ".jpeg",  # JPEG
-            ".png",  # PNG
-            ".gif",  # GIF
-            ".bmp",  # Bitmap
-            ".tiff",  # TIFF
-            ".webp",  # WebP
-            ".svg",  # SVG
-            ".cgm",  # Computer Graphics Metafile
-            ".odg",
-            ".pbd",  # OpenDocument Graphics
-        ]:
-            # Each image = 1 page
-            return 1
-
-        # Audio Files (transcription = typically 1 page per minute)
-        # Note: These should be handled by audio transcription flow, not ETL
-        elif file_ext in [".mp3", ".m4a", ".wav", ".mpga"]:
-            # Audio files: estimate based on duration
-            # Fallback: ~1MB per minute of audio, 1 page per minute transcript
-            return max(1, file_size // (1024 * 1024))
-
-        # Video Files (typically not processed for pages, but just in case)
-        elif file_ext in [".mp4", ".mpeg", ".webm"]:
-            # Video files: very rough estimate
-            # Typically wouldn't be page-based, but use conservative estimate
-            return max(1, file_size // (5 * 1024 * 1024))
-
-        # Other/Unknown Document Types
-        else:
-            # Conservative estimate: ~80KB per page
-            # This catches: .sgl, .sxg, .uof, .uos1, .uos2, .web, and any future formats
-            return max(1, file_size // (80 * 1024))
+        return self.estimate_pages_from_metadata(file_ext, file_size)
--- a/surfsense_backend/app/tasks/connector_indexers/base.py
+++ b/surfsense_backend/app/tasks/connector_indexers/base.py
@ -4,7 +4,6 @@ Base functionality and shared imports for connector indexers.

 import logging
 from datetime import UTC, datetime, timedelta
-
 from sqlalchemy.ext.asyncio import AsyncSession
 from sqlalchemy.future import select

--- a/surfsense_backend/app/tasks/connector_indexers/dropbox_indexer.py
+++ b/surfsense_backend/app/tasks/connector_indexers/dropbox_indexer.py
@ -28,6 +28,7 @@ from app.indexing_pipeline.connector_document import ConnectorDocument
 from app.indexing_pipeline.document_hashing import compute_identifier_hash
 from app.indexing_pipeline.indexing_pipeline_service import IndexingPipelineService
 from app.services.llm_service import get_user_long_context_llm
+from app.services.page_limit_service import PageLimitService
 from app.services.task_logging_service import TaskLoggingService
 from app.tasks.connector_indexers.base import (
    check_document_by_unique_identifier,
@ -278,6 +279,12 @@ async def _index_full_scan(
        },
    )

+    page_limit_service = PageLimitService(session)
+    pages_used, pages_limit = await page_limit_service.get_page_usage(user_id)
+    remaining_quota = pages_limit - pages_used
+    batch_estimated_pages = 0
+    page_limit_reached = False
+
    renamed_count = 0
    skipped = 0
    files_to_download: list[dict] = []
@ -307,6 +314,21 @@ async def _index_full_scan(
        elif skip_item(file):
            skipped += 1
            continue
+
+        file_pages = PageLimitService.estimate_pages_from_metadata(
+            file.get("name", ""), file.get("size")
+        )
+        if batch_estimated_pages + file_pages > remaining_quota:
+            if not page_limit_reached:
+                logger.warning(
+                    "Page limit reached during Dropbox full scan, "
+                    "skipping remaining files"
+                )
+                page_limit_reached = True
+            skipped += 1
+            continue
+
+        batch_estimated_pages += file_pages
        files_to_download.append(file)

    batch_indexed, failed = await _download_and_index(
@ -320,6 +342,14 @@ async def _index_full_scan(
        on_heartbeat=on_heartbeat_callback,
    )

+    if batch_indexed > 0 and files_to_download and batch_estimated_pages > 0:
+        pages_to_deduct = max(
+            1, batch_estimated_pages * batch_indexed // len(files_to_download)
+        )
+        await page_limit_service.update_page_usage(
+            user_id, pages_to_deduct, allow_exceed=True
+        )
+
    indexed = renamed_count + batch_indexed
    logger.info(
        f"Full scan complete: {indexed} indexed, {skipped} skipped, {failed} failed"
@ -340,6 +370,11 @@ async def _index_selected_files(
    on_heartbeat: HeartbeatCallbackType | None = None,
 ) -> tuple[int, int, list[str]]:
    """Index user-selected files using the parallel pipeline."""
+    page_limit_service = PageLimitService(session)
+    pages_used, pages_limit = await page_limit_service.get_page_usage(user_id)
+    remaining_quota = pages_limit - pages_used
+    batch_estimated_pages = 0
+
    files_to_download: list[dict] = []
    errors: list[str] = []
    renamed_count = 0
@ -364,6 +399,15 @@ async def _index_selected_files(
            skipped += 1
            continue

+        file_pages = PageLimitService.estimate_pages_from_metadata(
+            file.get("name", ""), file.get("size")
+        )
+        if batch_estimated_pages + file_pages > remaining_quota:
+            display = file_name or file_path
+            errors.append(f"File '{display}': page limit would be exceeded")
+            continue
+
+        batch_estimated_pages += file_pages
        files_to_download.append(file)

    batch_indexed, _failed = await _download_and_index(
@ -377,6 +421,14 @@ async def _index_selected_files(
        on_heartbeat=on_heartbeat,
    )

+    if batch_indexed > 0 and files_to_download and batch_estimated_pages > 0:
+        pages_to_deduct = max(
+            1, batch_estimated_pages * batch_indexed // len(files_to_download)
+        )
+        await page_limit_service.update_page_usage(
+            user_id, pages_to_deduct, allow_exceed=True
+        )
+
    return renamed_count + batch_indexed, skipped, errors


--- a/surfsense_backend/app/tasks/connector_indexers/google_drive_indexer.py
+++ b/surfsense_backend/app/tasks/connector_indexers/google_drive_indexer.py
@ -34,6 +34,7 @@ from app.indexing_pipeline.indexing_pipeline_service import (
    PlaceholderInfo,
 )
 from app.services.llm_service import get_user_long_context_llm
+from app.services.page_limit_service import PageLimitService
 from app.services.task_logging_service import TaskLoggingService
 from app.tasks.connector_indexers.base import (
    check_document_by_unique_identifier,
@ -327,6 +328,12 @@ async def _process_single_file(
                return 1, 0, 0
            return 0, 1, 0

+        page_limit_service = PageLimitService(session)
+        estimated_pages = PageLimitService.estimate_pages_from_metadata(
+            file_name, file.get("size")
+        )
+        await page_limit_service.check_page_limit(user_id, estimated_pages)
+
        markdown, drive_metadata, error = await download_and_extract_content(
            drive_client, file
        )
@ -363,6 +370,9 @@ async def _process_single_file(
            )
            await pipeline.index(document, connector_doc, user_llm)

+        await page_limit_service.update_page_usage(
+            user_id, estimated_pages, allow_exceed=True
+        )
        logger.info(f"Successfully indexed Google Drive file: {file_name}")
        return 1, 0, 0

@ -466,6 +476,11 @@ async def _index_selected_files(

    Returns (indexed_count, skipped_count, errors).
    """
+    page_limit_service = PageLimitService(session)
+    pages_used, pages_limit = await page_limit_service.get_page_usage(user_id)
+    remaining_quota = pages_limit - pages_used
+    batch_estimated_pages = 0
+
    files_to_download: list[dict] = []
    errors: list[str] = []
    renamed_count = 0
@ -486,6 +501,15 @@ async def _index_selected_files(
                skipped += 1
            continue

+        file_pages = PageLimitService.estimate_pages_from_metadata(
+            file.get("name", ""), file.get("size")
+        )
+        if batch_estimated_pages + file_pages > remaining_quota:
+            display = file_name or file_id
+            errors.append(f"File '{display}': page limit would be exceeded")
+            continue
+
+        batch_estimated_pages += file_pages
        files_to_download.append(file)

    await _create_drive_placeholders(
@ -507,6 +531,14 @@ async def _index_selected_files(
        on_heartbeat=on_heartbeat,
    )

+    if batch_indexed > 0 and files_to_download and batch_estimated_pages > 0:
+        pages_to_deduct = max(
+            1, batch_estimated_pages * batch_indexed // len(files_to_download)
+        )
+        await page_limit_service.update_page_usage(
+            user_id, pages_to_deduct, allow_exceed=True
+        )
+
    return renamed_count + batch_indexed, skipped, errors


@ -545,6 +577,12 @@ async def _index_full_scan(
    # ------------------------------------------------------------------
    # Phase 1 (serial): collect files, run skip checks, track renames
    # ------------------------------------------------------------------
+    page_limit_service = PageLimitService(session)
+    pages_used, pages_limit = await page_limit_service.get_page_usage(user_id)
+    remaining_quota = pages_limit - pages_used
+    batch_estimated_pages = 0
+    page_limit_reached = False
+
    renamed_count = 0
    skipped = 0
    files_processed = 0
@ -593,6 +631,20 @@ async def _index_full_scan(
                        skipped += 1
                    continue

+                file_pages = PageLimitService.estimate_pages_from_metadata(
+                    file.get("name", ""), file.get("size")
+                )
+                if batch_estimated_pages + file_pages > remaining_quota:
+                    if not page_limit_reached:
+                        logger.warning(
+                            "Page limit reached during Google Drive full scan, "
+                            "skipping remaining files"
+                        )
+                        page_limit_reached = True
+                    skipped += 1
+                    continue
+
+                batch_estimated_pages += file_pages
                files_to_download.append(file)

            page_token = next_token
@ -636,6 +688,14 @@ async def _index_full_scan(
        on_heartbeat=on_heartbeat_callback,
    )

+    if batch_indexed > 0 and files_to_download and batch_estimated_pages > 0:
+        pages_to_deduct = max(
+            1, batch_estimated_pages * batch_indexed // len(files_to_download)
+        )
+        await page_limit_service.update_page_usage(
+            user_id, pages_to_deduct, allow_exceed=True
+        )
+
    indexed = renamed_count + batch_indexed
    logger.info(
        f"Full scan complete: {indexed} indexed, {skipped} skipped, {failed} failed"
@ -686,6 +746,12 @@ async def _index_with_delta_sync(
    # ------------------------------------------------------------------
    # Phase 1 (serial): handle removals, collect files for download
    # ------------------------------------------------------------------
+    page_limit_service = PageLimitService(session)
+    pages_used, pages_limit = await page_limit_service.get_page_usage(user_id)
+    remaining_quota = pages_limit - pages_used
+    batch_estimated_pages = 0
+    page_limit_reached = False
+
    renamed_count = 0
    skipped = 0
    files_to_download: list[dict] = []
@ -715,6 +781,20 @@ async def _index_with_delta_sync(
                skipped += 1
            continue

+        file_pages = PageLimitService.estimate_pages_from_metadata(
+            file.get("name", ""), file.get("size")
+        )
+        if batch_estimated_pages + file_pages > remaining_quota:
+            if not page_limit_reached:
+                logger.warning(
+                    "Page limit reached during Google Drive delta sync, "
+                    "skipping remaining files"
+                )
+                page_limit_reached = True
+            skipped += 1
+            continue
+
+        batch_estimated_pages += file_pages
        files_to_download.append(file)

    # ------------------------------------------------------------------
@ -742,6 +822,14 @@ async def _index_with_delta_sync(
        on_heartbeat=on_heartbeat_callback,
    )

+    if batch_indexed > 0 and files_to_download and batch_estimated_pages > 0:
+        pages_to_deduct = max(
+            1, batch_estimated_pages * batch_indexed // len(files_to_download)
+        )
+        await page_limit_service.update_page_usage(
+            user_id, pages_to_deduct, allow_exceed=True
+        )
+
    indexed = renamed_count + batch_indexed
    logger.info(
        f"Delta sync complete: {indexed} indexed, {skipped} skipped, {failed} failed"
--- a/surfsense_backend/app/tasks/connector_indexers/onedrive_indexer.py
+++ b/surfsense_backend/app/tasks/connector_indexers/onedrive_indexer.py
@ -28,6 +28,7 @@ from app.indexing_pipeline.connector_document import ConnectorDocument
 from app.indexing_pipeline.document_hashing import compute_identifier_hash
 from app.indexing_pipeline.indexing_pipeline_service import IndexingPipelineService
 from app.services.llm_service import get_user_long_context_llm
+from app.services.page_limit_service import PageLimitService
 from app.services.task_logging_service import TaskLoggingService
 from app.tasks.connector_indexers.base import (
    check_document_by_unique_identifier,
@ -291,6 +292,11 @@ async def _index_selected_files(
    on_heartbeat: HeartbeatCallbackType | None = None,
 ) -> tuple[int, int, list[str]]:
    """Index user-selected files using the parallel pipeline."""
+    page_limit_service = PageLimitService(session)
+    pages_used, pages_limit = await page_limit_service.get_page_usage(user_id)
+    remaining_quota = pages_limit - pages_used
+    batch_estimated_pages = 0
+
    files_to_download: list[dict] = []
    errors: list[str] = []
    renamed_count = 0
@ -311,6 +317,15 @@ async def _index_selected_files(
                skipped += 1
            continue

+        file_pages = PageLimitService.estimate_pages_from_metadata(
+            file.get("name", ""), file.get("size")
+        )
+        if batch_estimated_pages + file_pages > remaining_quota:
+            display = file_name or file_id
+            errors.append(f"File '{display}': page limit would be exceeded")
+            continue
+
+        batch_estimated_pages += file_pages
        files_to_download.append(file)

    batch_indexed, _failed = await _download_and_index(
@ -324,6 +339,14 @@ async def _index_selected_files(
        on_heartbeat=on_heartbeat,
    )

+    if batch_indexed > 0 and files_to_download and batch_estimated_pages > 0:
+        pages_to_deduct = max(
+            1, batch_estimated_pages * batch_indexed // len(files_to_download)
+        )
+        await page_limit_service.update_page_usage(
+            user_id, pages_to_deduct, allow_exceed=True
+        )
+
    return renamed_count + batch_indexed, skipped, errors


@ -358,6 +381,12 @@ async def _index_full_scan(
        },
    )

+    page_limit_service = PageLimitService(session)
+    pages_used, pages_limit = await page_limit_service.get_page_usage(user_id)
+    remaining_quota = pages_limit - pages_used
+    batch_estimated_pages = 0
+    page_limit_reached = False
+
    renamed_count = 0
    skipped = 0
    files_to_download: list[dict] = []
@ -383,6 +412,21 @@ async def _index_full_scan(
            else:
                skipped += 1
            continue
+
+        file_pages = PageLimitService.estimate_pages_from_metadata(
+            file.get("name", ""), file.get("size")
+        )
+        if batch_estimated_pages + file_pages > remaining_quota:
+            if not page_limit_reached:
+                logger.warning(
+                    "Page limit reached during OneDrive full scan, "
+                    "skipping remaining files"
+                )
+                page_limit_reached = True
+            skipped += 1
+            continue
+
+        batch_estimated_pages += file_pages
        files_to_download.append(file)

    batch_indexed, failed = await _download_and_index(
@ -396,6 +440,14 @@ async def _index_full_scan(
        on_heartbeat=on_heartbeat_callback,
    )

+    if batch_indexed > 0 and files_to_download and batch_estimated_pages > 0:
+        pages_to_deduct = max(
+            1, batch_estimated_pages * batch_indexed // len(files_to_download)
+        )
+        await page_limit_service.update_page_usage(
+            user_id, pages_to_deduct, allow_exceed=True
+        )
+
    indexed = renamed_count + batch_indexed
    logger.info(
        f"Full scan complete: {indexed} indexed, {skipped} skipped, {failed} failed"
@ -441,6 +493,12 @@ async def _index_with_delta_sync(

    logger.info(f"Processing {len(changes)} delta changes")

+    page_limit_service = PageLimitService(session)
+    pages_used, pages_limit = await page_limit_service.get_page_usage(user_id)
+    remaining_quota = pages_limit - pages_used
+    batch_estimated_pages = 0
+    page_limit_reached = False
+
    renamed_count = 0
    skipped = 0
    files_to_download: list[dict] = []
@ -471,6 +529,20 @@ async def _index_with_delta_sync(
                skipped += 1
            continue

+        file_pages = PageLimitService.estimate_pages_from_metadata(
+            change.get("name", ""), change.get("size")
+        )
+        if batch_estimated_pages + file_pages > remaining_quota:
+            if not page_limit_reached:
+                logger.warning(
+                    "Page limit reached during OneDrive delta sync, "
+                    "skipping remaining files"
+                )
+                page_limit_reached = True
+            skipped += 1
+            continue
+
+        batch_estimated_pages += file_pages
        files_to_download.append(change)

    batch_indexed, failed = await _download_and_index(
@ -484,6 +556,14 @@ async def _index_with_delta_sync(
        on_heartbeat=on_heartbeat_callback,
    )

+    if batch_indexed > 0 and files_to_download and batch_estimated_pages > 0:
+        pages_to_deduct = max(
+            1, batch_estimated_pages * batch_indexed // len(files_to_download)
+        )
+        await page_limit_service.update_page_usage(
+            user_id, pages_to_deduct, allow_exceed=True
+        )
+
    indexed = renamed_count + batch_indexed
    logger.info(
        f"Delta sync complete: {indexed} indexed, {skipped} skipped, {failed} failed"