""" Service for charging the unified credit wallet for ETL document processing. Replaces the legacy ``PageLimitService`` page-quota model. Page counts are still estimated the same way; they are now converted to USD micro-credits (``config.MICROS_PER_PAGE`` per page, times a per-mode multiplier) and debited from ``user.credit_micros_balance``. When ``config.ETL_CREDIT_BILLING_ENABLED`` is False (the default for self-hosted / OSS installs) every check/charge is a no-op, preserving the prior effectively-unlimited ETL behaviour. """ import os from pathlib import Path, PurePosixPath from sqlalchemy import select from sqlalchemy.ext.asyncio import AsyncSession from app.config import config class InsufficientCreditsError(Exception): """Raised when a user lacks enough credit to process a document.""" def __init__( self, message: str = "Insufficient credits to process this document. " "Add more credits to continue.", balance_micros: int = 0, required_micros: int = 0, ): self.balance_micros = balance_micros self.required_micros = required_micros super().__init__(message) class EtlCreditService: """Checks and charges the credit wallet for ETL page processing.""" def __init__(self, session: AsyncSession): self.session = session @staticmethod def billing_enabled() -> bool: return config.ETL_CREDIT_BILLING_ENABLED @staticmethod def pages_to_micros(pages: int, multiplier: int = 1) -> int: """Convert a (multiplied) page count to USD micro-credits.""" return int(pages) * int(multiplier) * config.MICROS_PER_PAGE async def get_available_micros(self, user_id: str) -> int | None: """Return spendable credit in micro-USD (``balance - reserved``). Returns ``None`` when ETL billing is disabled, which callers treat as "unlimited" (no batch skipping, no blocking). """ if not config.ETL_CREDIT_BILLING_ENABLED: return None from app.db import User result = await self.session.execute( select(User.credit_micros_balance, User.credit_micros_reserved).where( User.id == user_id ) ) row = result.first() if not row: raise ValueError(f"User with ID {user_id} not found") balance, reserved = row return balance - reserved async def check_credits( self, user_id: str, estimated_pages: int = 1, multiplier: int = 1 ) -> None: """Raise :class:`InsufficientCreditsError` if the user can't afford to process ``estimated_pages`` (times ``multiplier``). No-op when ETL billing is disabled. """ if not config.ETL_CREDIT_BILLING_ENABLED: return required = self.pages_to_micros(estimated_pages, multiplier) available = await self.get_available_micros(user_id) if available is None: return if required > available: raise InsufficientCreditsError( message=( "Processing this document would exceed your available " f"credit. Available: ${available / 1_000_000:.2f}. " f"This document costs about ${required / 1_000_000:.2f} " f"({estimated_pages} page(s)). Add more credits to continue." ), balance_micros=available, required_micros=required, ) async def charge_credits( self, user_id: str, pages: int, multiplier: int = 1 ) -> int | None: """Debit the credit wallet after successful processing. The balance may dip slightly negative when the actual page count exceeds the pre-check estimate (the document is already processed), mirroring the prior ``allow_exceed=True`` semantics. Returns the new balance in micros, or ``None`` when billing is disabled. """ if not config.ETL_CREDIT_BILLING_ENABLED: return None from app.db import User result = await self.session.execute(select(User).where(User.id == user_id)) user = result.unique().scalar_one_or_none() if not user: raise ValueError(f"User with ID {user_id} not found") cost = self.pages_to_micros(pages, multiplier) user.credit_micros_balance -= cost await self.session.commit() await self.session.refresh(user) # Best-effort: fire an auto-reload check if the balance dropped low. try: from app.services.auto_reload_service import maybe_trigger_auto_reload await maybe_trigger_auto_reload(user_id) except Exception: pass return user.credit_micros_balance def estimate_pages_from_elements(self, elements: list) -> int: """ Estimate page count from document elements (for Unstructured). Args: elements: List of document elements Returns: Estimated number of pages """ # For Unstructured, we can count unique page numbers in metadata # or estimate based on content length page_numbers = set() for element in elements: # Try to get page number from metadata if hasattr(element, "metadata") and element.metadata: page_num = element.metadata.get("page_number") if page_num is not None: page_numbers.add(page_num) # If we found page numbers in metadata, use that count if page_numbers: return len(page_numbers) # Otherwise, estimate: assume ~2000 chars per page total_content_length = sum( len(element.page_content) if hasattr(element, "page_content") else 0 for element in elements ) estimated_pages = max(1, total_content_length // 2000) return estimated_pages def estimate_pages_from_markdown(self, markdown_documents: list) -> int: """ Estimate page count from markdown documents (for LlamaCloud). Args: markdown_documents: List of markdown document objects Returns: Estimated number of pages """ # For LlamaCloud, if split_by_page=True was used, each doc is a page # Otherwise, estimate based on content length if not markdown_documents: return 1 # Check if documents have page metadata total_pages = 0 for doc in markdown_documents: if hasattr(doc, "metadata") and doc.metadata: # If metadata contains page info, use it page_num = doc.metadata.get("page", doc.metadata.get("page_number")) if page_num is not None: total_pages += 1 continue # Otherwise estimate from content length content_length = len(doc.text) if hasattr(doc, "text") else 0 estimated = max(1, content_length // 2000) total_pages += estimated return max(1, total_pages) def estimate_pages_from_content_length(self, content_length: int) -> int: """ Estimate page count from content length (for Docling). Args: content_length: Length of the document content Returns: Estimated number of pages """ # Estimate ~2000 characters per page return max(1, content_length // 2000) @staticmethod def estimate_pages_from_metadata( file_name_or_ext: str, file_size: int | str | None = None ) -> int: """Size-based page estimation from file name/extension and byte size. Pure function — no file I/O, no database access. Used by cloud connectors (which only have API metadata) and as the internal fallback for :meth:`estimate_pages_before_processing`. ``file_name_or_ext`` can be a full filename (``"report.pdf"``) or a bare extension (``".pdf"``). ``file_size`` may be an int, a stringified int from a cloud API, or *None*. """ if file_size is not None: try: file_size = int(file_size) except (ValueError, TypeError): file_size = 0 else: file_size = 0 if file_size <= 0: return 1 ext = PurePosixPath(file_name_or_ext).suffix.lower() if file_name_or_ext else "" if not ext and file_name_or_ext.startswith("."): ext = file_name_or_ext.lower() file_ext = ext if file_ext == ".pdf": return max(1, file_size // (100 * 1024)) if file_ext in { ".doc", ".docx", ".docm", ".dot", ".dotm", ".odt", ".ott", ".sxw", ".stw", ".uot", ".rtf", ".pages", ".wpd", ".wps", ".abw", ".zabw", ".cwk", ".hwp", ".lwp", ".mcw", ".mw", ".sdw", ".vor", }: return max(1, file_size // (50 * 1024)) if file_ext in { ".ppt", ".pptx", ".pptm", ".pot", ".potx", ".odp", ".otp", ".sxi", ".sti", ".uop", ".key", ".sda", ".sdd", ".sdp", }: return max(1, file_size // (200 * 1024)) if file_ext in { ".xls", ".xlsx", ".xlsm", ".xlsb", ".xlw", ".xlr", ".ods", ".ots", ".fods", ".numbers", ".123", ".wk1", ".wk2", ".wk3", ".wk4", ".wks", ".wb1", ".wb2", ".wb3", ".wq1", ".wq2", ".csv", ".tsv", ".slk", ".sylk", ".dif", ".dbf", ".prn", ".qpw", ".602", ".et", ".eth", }: return max(1, file_size // (100 * 1024)) if file_ext in {".epub"}: return max(1, file_size // (50 * 1024)) if file_ext in {".txt", ".log", ".md", ".markdown", ".htm", ".html", ".xml"}: return max(1, file_size // 3000) if file_ext in { ".jpg", ".jpeg", ".png", ".gif", ".bmp", ".tiff", ".webp", ".svg", ".cgm", ".odg", ".pbd", }: return 1 if file_ext in {".mp3", ".m4a", ".wav", ".mpga"}: return max(1, file_size // (1024 * 1024)) if file_ext in {".mp4", ".mpeg", ".webm"}: return max(1, file_size // (5 * 1024 * 1024)) return max(1, file_size // (80 * 1024)) def estimate_pages_before_processing(self, file_path: str) -> int: """ Estimate page count from a local file before processing. For PDFs, attempts to read the actual page count via pypdf. For everything else, delegates to :meth:`estimate_pages_from_metadata`. Args: file_path: Path to the file Returns: Estimated number of pages """ if not os.path.exists(file_path): raise ValueError(f"File not found: {file_path}") file_ext = Path(file_path).suffix.lower() file_size = os.path.getsize(file_path) if file_ext == ".pdf": try: import pypdf with open(file_path, "rb") as f: pdf_reader = pypdf.PdfReader(f) return len(pdf_reader.pages) except Exception: pass # fall through to size-based estimation return self.estimate_pages_from_metadata(file_ext, file_size)