diff --git a/surfsense_backend/alembic/versions/116_create_zero_publication.py b/surfsense_backend/alembic/versions/116_create_zero_publication.py index 8f0d7b5d3..ff74952a9 100644 --- a/surfsense_backend/alembic/versions/116_create_zero_publication.py +++ b/surfsense_backend/alembic/versions/116_create_zero_publication.py @@ -42,9 +42,7 @@ def upgrade() -> None: if not exists: table_list = ", ".join(TABLES) conn.execute( - sa.text( - f"CREATE PUBLICATION {PUBLICATION_NAME} FOR TABLE {table_list}" - ) + sa.text(f"CREATE PUBLICATION {PUBLICATION_NAME} FOR TABLE {table_list}") ) diff --git a/surfsense_backend/alembic/versions/117_optimize_zero_publication_column_lists.py b/surfsense_backend/alembic/versions/117_optimize_zero_publication_column_lists.py new file mode 100644 index 000000000..3c2d34c76 --- /dev/null +++ b/surfsense_backend/alembic/versions/117_optimize_zero_publication_column_lists.py @@ -0,0 +1,102 @@ +"""optimize zero_publication with column lists + +Recreates the zero_publication using column lists for the documents +table so that large text columns (content, source_markdown, +blocknote_document, etc.) are excluded from WAL replication. +This prevents RangeError: Invalid string length in zero-cache's +change-streamer when documents have very large content. + +Also resets REPLICA IDENTITY to DEFAULT on tables that had it set +to FULL for the old Electric SQL setup (migration 66/75/76). +With DEFAULT (primary-key) identity, column-list publications +only need to include the PK — not every column. + +After running this migration you MUST: + 1. Stop zero-cache + 2. Delete / reset the zero-cache data volume + 3. 
Restart zero-cache (it will do a fresh initial sync) + +Revision ID: 117 +Revises: 116 +""" + +from collections.abc import Sequence + +import sqlalchemy as sa + +from alembic import op + +revision: str = "117" +down_revision: str | None = "116" +branch_labels: str | Sequence[str] | None = None +depends_on: str | Sequence[str] | None = None + +PUBLICATION_NAME = "zero_publication" + +TABLES_WITH_FULL_IDENTITY = [ + "documents", + "notifications", + "search_source_connectors", + "new_chat_messages", + "chat_comments", + "chat_session_state", +] + +DOCUMENT_COLS = [ + "id", + "title", + "document_type", + "search_space_id", + "folder_id", + "created_by_id", + "status", + "created_at", + "updated_at", +] + +PUBLICATION_DDL_FULL = f"""\ +CREATE PUBLICATION {PUBLICATION_NAME} FOR TABLE + notifications, documents, folders, + search_source_connectors, new_chat_messages, + chat_comments, chat_session_state +""" + + +def upgrade() -> None: + conn = op.get_bind() + + for tbl in TABLES_WITH_FULL_IDENTITY: + conn.execute(sa.text(f'ALTER TABLE "{tbl}" REPLICA IDENTITY DEFAULT')) + + conn.execute(sa.text(f"DROP PUBLICATION IF EXISTS {PUBLICATION_NAME}")) + + has_zero_ver = conn.execute( + sa.text( + "SELECT 1 FROM information_schema.columns " + "WHERE table_name = 'documents' AND column_name = '_0_version'" + ) + ).fetchone() + + cols = DOCUMENT_COLS + (['"_0_version"'] if has_zero_ver else []) + col_list = ", ".join(cols) + + conn.execute( + sa.text( + f"CREATE PUBLICATION {PUBLICATION_NAME} FOR TABLE " + f"notifications, " + f"documents ({col_list}), " + f"folders, " + f"search_source_connectors, " + f"new_chat_messages, " + f"chat_comments, " + f"chat_session_state" + ) + ) + + +def downgrade() -> None: + conn = op.get_bind() + conn.execute(sa.text(f"DROP PUBLICATION IF EXISTS {PUBLICATION_NAME}")) + conn.execute(sa.text(PUBLICATION_DDL_FULL)) + for tbl in TABLES_WITH_FULL_IDENTITY: + conn.execute(sa.text(f'ALTER TABLE "{tbl}" REPLICA IDENTITY FULL')) diff --git 
a/surfsense_backend/app/agents/new_chat/chat_deepagent.py b/surfsense_backend/app/agents/new_chat/chat_deepagent.py index ccc06f272..fc1e80d28 100644 --- a/surfsense_backend/app/agents/new_chat/chat_deepagent.py +++ b/surfsense_backend/app/agents/new_chat/chat_deepagent.py @@ -159,6 +159,7 @@ async def create_surfsense_deep_agent( additional_tools: Sequence[BaseTool] | None = None, firecrawl_api_key: str | None = None, thread_visibility: ChatVisibility | None = None, + mentioned_document_ids: list[int] | None = None, ): """ Create a SurfSense deep agent with configurable tools and prompts. @@ -451,6 +452,7 @@ async def create_surfsense_deep_agent( search_space_id=search_space_id, available_connectors=available_connectors, available_document_types=available_document_types, + mentioned_document_ids=mentioned_document_ids, ), SurfSenseFilesystemMiddleware( search_space_id=search_space_id, diff --git a/surfsense_backend/app/agents/new_chat/middleware/filesystem.py b/surfsense_backend/app/agents/new_chat/middleware/filesystem.py index 41b24f88b..d7697ef15 100644 --- a/surfsense_backend/app/agents/new_chat/middleware/filesystem.py +++ b/surfsense_backend/app/agents/new_chat/middleware/filesystem.py @@ -66,6 +66,16 @@ the ``, identify chunks marked `matched="true"`, then use those sections instead of reading the entire file sequentially. Use `` values as citation IDs in your answers. + +## User-Mentioned Documents + +When the `ls` output tags a file with `[MENTIONED BY USER — read deeply]`, +the user **explicitly selected** that document. These files are your highest- +priority sources: +1. **Always read them thoroughly** — scan the full ``, then read + all major sections, not just matched chunks. +2. **Prefer their content** over other search results when answering. +3. **Cite from them first** whenever applicable. 
""" # ============================================================================= diff --git a/surfsense_backend/app/agents/new_chat/middleware/knowledge_search.py b/surfsense_backend/app/agents/new_chat/middleware/knowledge_search.py index 3728f229c..7b0dd2f71 100644 --- a/surfsense_backend/app/agents/new_chat/middleware/knowledge_search.py +++ b/surfsense_backend/app/agents/new_chat/middleware/knowledge_search.py @@ -28,7 +28,13 @@ from sqlalchemy import select from sqlalchemy.ext.asyncio import AsyncSession from app.agents.new_chat.utils import parse_date_or_datetime, resolve_date_range -from app.db import NATIVE_TO_LEGACY_DOCTYPE, Document, Folder, shielded_async_session +from app.db import ( + NATIVE_TO_LEGACY_DOCTYPE, + Chunk, + Document, + Folder, + shielded_async_session, +) from app.retriever.chunks_hybrid_search import ChucksHybridSearchRetriever from app.utils.document_converters import embed_texts from app.utils.perf import get_perf_logger @@ -430,21 +436,36 @@ async def _get_folder_paths( def _build_synthetic_ls( existing_files: dict[str, Any] | None, new_files: dict[str, Any], + *, + mentioned_paths: set[str] | None = None, ) -> tuple[AIMessage, ToolMessage]: """Build a synthetic ls("/documents") tool-call + result for the LLM context. - Paths are listed with *new* (rank-ordered) files first, then existing files - that were already in state from prior turns. + Mentioned files are listed first. A separate header tells the LLM which + files the user explicitly selected; the path list itself stays clean so + paths can be passed directly to ``read_file`` without stripping tags. 
""" + _mentioned = mentioned_paths or set() merged: dict[str, Any] = {**(existing_files or {}), **new_files} doc_paths = [ p for p, v in merged.items() if p.startswith("/documents/") and v is not None ] new_set = set(new_files) - new_paths = [p for p in doc_paths if p in new_set] + mentioned_list = [p for p in doc_paths if p in _mentioned] + new_non_mentioned = [p for p in doc_paths if p in new_set and p not in _mentioned] old_paths = [p for p in doc_paths if p not in new_set] - ordered = new_paths + old_paths + ordered = mentioned_list + new_non_mentioned + old_paths + + parts: list[str] = [] + if mentioned_list: + parts.append( + "USER-MENTIONED documents (read these thoroughly before answering):" + ) + for p in mentioned_list: + parts.append(f" {p}") + parts.append("") + parts.append(str(ordered) if ordered else "No documents found.") tool_call_id = f"auto_ls_{uuid.uuid4().hex[:12]}" ai_msg = AIMessage( @@ -452,7 +473,7 @@ def _build_synthetic_ls( tool_calls=[{"name": "ls", "args": {"path": "/documents"}, "id": tool_call_id}], ) tool_msg = ToolMessage( - content=str(ordered) if ordered else "No documents found.", + content="\n".join(parts), tool_call_id=tool_call_id, ) return ai_msg, tool_msg @@ -524,12 +545,92 @@ async def search_knowledge_base( return results[:top_k] +async def fetch_mentioned_documents( + *, + document_ids: list[int], + search_space_id: int, +) -> list[dict[str, Any]]: + """Fetch explicitly mentioned documents with *all* their chunks. + + Returns the same dict structure as ``search_knowledge_base`` so results + can be merged directly into ``build_scoped_filesystem``. Unlike search + results, every chunk is included (no top-K limiting) and none are marked + as ``matched`` since the entire document is relevant by virtue of the + user's explicit mention. 
+ """ + if not document_ids: + return [] + + async with shielded_async_session() as session: + doc_result = await session.execute( + select(Document).where( + Document.id.in_(document_ids), + Document.search_space_id == search_space_id, + ) + ) + docs = {doc.id: doc for doc in doc_result.scalars().all()} + + if not docs: + return [] + + chunk_result = await session.execute( + select(Chunk.id, Chunk.content, Chunk.document_id) + .where(Chunk.document_id.in_(list(docs.keys()))) + .order_by(Chunk.document_id, Chunk.id) + ) + chunks_by_doc: dict[int, list[dict[str, Any]]] = {doc_id: [] for doc_id in docs} + for row in chunk_result.all(): + if row.document_id in chunks_by_doc: + chunks_by_doc[row.document_id].append( + {"chunk_id": row.id, "content": row.content} + ) + + results: list[dict[str, Any]] = [] + for doc_id in document_ids: + doc = docs.get(doc_id) + if doc is None: + continue + metadata = doc.document_metadata or {} + results.append( + { + "document_id": doc.id, + "content": "", + "score": 1.0, + "chunks": chunks_by_doc.get(doc.id, []), + "matched_chunk_ids": [], + "document": { + "id": doc.id, + "title": doc.title, + "document_type": ( + doc.document_type.value + if getattr(doc, "document_type", None) + else None + ), + "metadata": metadata, + }, + "source": ( + doc.document_type.value + if getattr(doc, "document_type", None) + else None + ), + "_user_mentioned": True, + } + ) + return results + + async def build_scoped_filesystem( *, documents: Sequence[dict[str, Any]], search_space_id: int, -) -> dict[str, dict[str, str]]: - """Build a StateBackend-compatible files dict from search results.""" +) -> tuple[dict[str, dict[str, str]], dict[int, str]]: + """Build a StateBackend-compatible files dict from search results. + + Returns ``(files, doc_id_to_path)`` so callers can reliably map a + document id back to its filesystem path without guessing by title. 
+ Paths are collision-proof: when two documents resolve to the same + path the doc-id is appended to disambiguate. + """ async with shielded_async_session() as session: folder_paths = await _get_folder_paths(session, search_space_id) doc_ids = [ @@ -551,6 +652,7 @@ async def build_scoped_filesystem( } files: dict[str, dict[str, str]] = {} + doc_id_to_path: dict[int, str] = {} for document in documents: doc_meta = document.get("document") or {} title = str(doc_meta.get("title") or "untitled") @@ -559,6 +661,9 @@ async def build_scoped_filesystem( base_folder = folder_paths.get(folder_id, "/documents") file_name = _safe_filename(title) path = f"{base_folder}/{file_name}" + if path in files: + stem = file_name.removesuffix(".xml") + path = f"{base_folder}/{stem} ({doc_id}).xml" matched_ids = set(document.get("matched_chunk_ids") or []) xml_content = _build_document_xml(document, matched_chunk_ids=matched_ids) files[path] = { @@ -567,7 +672,9 @@ async def build_scoped_filesystem( "created_at": "", "modified_at": "", } - return files + if isinstance(doc_id, int): + doc_id_to_path[doc_id] = path + return files, doc_id_to_path class KnowledgeBaseSearchMiddleware(AgentMiddleware): # type: ignore[type-arg] @@ -583,12 +690,14 @@ class KnowledgeBaseSearchMiddleware(AgentMiddleware): # type: ignore[type-arg] available_connectors: list[str] | None = None, available_document_types: list[str] | None = None, top_k: int = 10, + mentioned_document_ids: list[int] | None = None, ) -> None: self.llm = llm self.search_space_id = search_space_id self.available_connectors = available_connectors self.available_document_types = available_document_types self.top_k = top_k + self.mentioned_document_ids = mentioned_document_ids or [] async def _plan_search_inputs( self, @@ -680,6 +789,18 @@ class KnowledgeBaseSearchMiddleware(AgentMiddleware): # type: ignore[type-arg] user_text=user_text, ) + # --- 1. 
Fetch mentioned documents (user-selected, all chunks) --- + mentioned_results: list[dict[str, Any]] = [] + if self.mentioned_document_ids: + mentioned_results = await fetch_mentioned_documents( + document_ids=self.mentioned_document_ids, + search_space_id=self.search_space_id, + ) + # Clear after first turn so they are not re-fetched on subsequent + # messages within the same agent instance. + self.mentioned_document_ids = [] + + # --- 2. Run KB hybrid search --- search_results = await search_knowledge_base( query=planned_query, search_space_id=self.search_space_id, @@ -689,19 +810,50 @@ class KnowledgeBaseSearchMiddleware(AgentMiddleware): # type: ignore[type-arg] start_date=start_date, end_date=end_date, ) - new_files = await build_scoped_filesystem( - documents=search_results, + + # --- 3. Merge: mentioned first, then search (dedup by doc id) --- + seen_doc_ids: set[int] = set() + merged: list[dict[str, Any]] = [] + for doc in mentioned_results: + doc_id = (doc.get("document") or {}).get("id") + if doc_id is not None: + seen_doc_ids.add(doc_id) + merged.append(doc) + for doc in search_results: + doc_id = (doc.get("document") or {}).get("id") + if doc_id is not None and doc_id in seen_doc_ids: + continue + merged.append(doc) + + # --- 4. Build scoped filesystem --- + new_files, doc_id_to_path = await build_scoped_filesystem( + documents=merged, search_space_id=self.search_space_id, ) - ai_msg, tool_msg = _build_synthetic_ls(existing_files, new_files) + # Identify which paths belong to user-mentioned documents using + # the authoritative doc_id -> path mapping (no title guessing). 
+ mentioned_doc_ids = { + (d.get("document") or {}).get("id") for d in mentioned_results + } + mentioned_paths = { + doc_id_to_path[did] for did in mentioned_doc_ids if did in doc_id_to_path + } + + ai_msg, tool_msg = _build_synthetic_ls( + existing_files, + new_files, + mentioned_paths=mentioned_paths, + ) if t0 is not None: _perf_log.info( - "[kb_fs_middleware] completed in %.3fs query=%r optimized=%r new_files=%d total=%d", + "[kb_fs_middleware] completed in %.3fs query=%r optimized=%r " + "mentioned=%d new_files=%d total=%d", asyncio.get_event_loop().time() - t0, user_text[:80], planned_query[:120], + len(mentioned_results), len(new_files), len(new_files) + len(existing_files or {}), ) diff --git a/surfsense_backend/app/routes/documents_routes.py b/surfsense_backend/app/routes/documents_routes.py index 6e69218f1..f53c81bb6 100644 --- a/surfsense_backend/app/routes/documents_routes.py +++ b/surfsense_backend/app/routes/documents_routes.py @@ -1,7 +1,7 @@ # Force asyncio to use standard event loop before unstructured imports import asyncio -from fastapi import APIRouter, Depends, Form, HTTPException, UploadFile +from fastapi import APIRouter, Depends, Form, HTTPException, Query, UploadFile from sqlalchemy.ext.asyncio import AsyncSession from sqlalchemy.future import select from sqlalchemy.orm import selectinload @@ -17,6 +17,7 @@ from app.db import ( get_async_session, ) from app.schemas import ( + ChunkRead, DocumentRead, DocumentsCreate, DocumentStatusBatchResponse, @@ -45,9 +46,7 @@ os.environ["UNSTRUCTURED_HAS_PATCHED_LOOP"] = "1" router = APIRouter() -MAX_FILES_PER_UPLOAD = 10 -MAX_FILE_SIZE_BYTES = 50 * 1024 * 1024 # 50 MB per file -MAX_TOTAL_SIZE_BYTES = 200 * 1024 * 1024 # 200 MB total +MAX_FILE_SIZE_BYTES = 500 * 1024 * 1024 # 500 MB per file @router.post("/documents") @@ -156,13 +155,6 @@ async def create_documents_file_upload( if not files: raise HTTPException(status_code=400, detail="No files provided") - if len(files) > MAX_FILES_PER_UPLOAD: - raise 
HTTPException( - status_code=413, - detail=f"Too many files. Maximum {MAX_FILES_PER_UPLOAD} files per upload.", - ) - - total_size = 0 for file in files: file_size = file.size or 0 if file_size > MAX_FILE_SIZE_BYTES: @@ -171,14 +163,6 @@ async def create_documents_file_upload( detail=f"File '{file.filename}' ({file_size / (1024 * 1024):.1f} MB) " f"exceeds the {MAX_FILE_SIZE_BYTES // (1024 * 1024)} MB per-file limit.", ) - total_size += file_size - - if total_size > MAX_TOTAL_SIZE_BYTES: - raise HTTPException( - status_code=413, - detail=f"Total upload size ({total_size / (1024 * 1024):.1f} MB) " - f"exceeds the {MAX_TOTAL_SIZE_BYTES // (1024 * 1024)} MB limit.", - ) # ===== Read all files concurrently to avoid blocking the event loop ===== async def _read_and_save(file: UploadFile) -> tuple[str, str, int]: @@ -206,16 +190,6 @@ async def create_documents_file_upload( saved_files = await asyncio.gather(*(_read_and_save(f) for f in files)) - actual_total_size = sum(size for _, _, size in saved_files) - if actual_total_size > MAX_TOTAL_SIZE_BYTES: - for temp_path, _, _ in saved_files: - os.unlink(temp_path) - raise HTTPException( - status_code=413, - detail=f"Total upload size ({actual_total_size / (1024 * 1024):.1f} MB) " - f"exceeds the {MAX_TOTAL_SIZE_BYTES // (1024 * 1024)} MB limit.", - ) - # ===== PHASE 1: Create pending documents for all files ===== created_documents: list[Document] = [] files_to_process: list[tuple[Document, str, str]] = [] @@ -451,13 +425,15 @@ async def read_documents( reason=doc.status.get("reason"), ) + raw_content = doc.content or "" api_documents.append( DocumentRead( id=doc.id, title=doc.title, document_type=doc.document_type, document_metadata=doc.document_metadata, - content=doc.content, + content="", + content_preview=raw_content[:300], content_hash=doc.content_hash, unique_identifier_hash=doc.unique_identifier_hash, created_at=doc.created_at, @@ -609,13 +585,15 @@ async def search_documents( reason=doc.status.get("reason"), ) + 
raw_content = doc.content or "" api_documents.append( DocumentRead( id=doc.id, title=doc.title, document_type=doc.document_type, document_metadata=doc.document_metadata, - content=doc.content, + content="", + content_preview=raw_content[:300], content_hash=doc.content_hash, unique_identifier_hash=doc.unique_identifier_hash, created_at=doc.created_at, @@ -884,16 +862,19 @@ async def get_document_type_counts( @router.get("/documents/by-chunk/{chunk_id}", response_model=DocumentWithChunksRead) async def get_document_by_chunk_id( chunk_id: int, + chunk_window: int = Query( + 5, ge=0, description="Number of chunks before/after the cited chunk to include" + ), session: AsyncSession = Depends(get_async_session), user: User = Depends(current_active_user), ): """ - Retrieves a document based on a chunk ID, including all its chunks ordered by creation time. - Requires DOCUMENTS_READ permission for the search space. - The document's embedding and chunk embeddings are excluded from the response. + Retrieves a document based on a chunk ID, including a window of chunks around the cited one. + Uses SQL-level pagination to avoid loading all chunks into memory. 
""" try: - # First, get the chunk and verify it exists + from sqlalchemy import and_, func, or_ + chunk_result = await session.execute(select(Chunk).filter(Chunk.id == chunk_id)) chunk = chunk_result.scalars().first() @@ -902,11 +883,8 @@ async def get_document_by_chunk_id( status_code=404, detail=f"Chunk with id {chunk_id} not found" ) - # Get the associated document document_result = await session.execute( - select(Document) - .options(selectinload(Document.chunks)) - .filter(Document.id == chunk.document_id) + select(Document).filter(Document.id == chunk.document_id) ) document = document_result.scalars().first() @@ -916,7 +894,6 @@ async def get_document_by_chunk_id( detail="Document not found", ) - # Check permission for the search space await check_permission( session, user, @@ -925,10 +902,38 @@ async def get_document_by_chunk_id( "You don't have permission to read documents in this search space", ) - # Sort chunks by creation time - sorted_chunks = sorted(document.chunks, key=lambda x: x.created_at) + total_result = await session.execute( + select(func.count()) + .select_from(Chunk) + .filter(Chunk.document_id == document.id) + ) + total_chunks = total_result.scalar() or 0 + + cited_idx_result = await session.execute( + select(func.count()) + .select_from(Chunk) + .filter( + Chunk.document_id == document.id, + or_( + Chunk.created_at < chunk.created_at, + and_(Chunk.created_at == chunk.created_at, Chunk.id < chunk.id), + ), + ) + ) + cited_idx = cited_idx_result.scalar() or 0 + + start = max(0, cited_idx - chunk_window) + end = min(total_chunks, cited_idx + chunk_window + 1) + + windowed_result = await session.execute( + select(Chunk) + .filter(Chunk.document_id == document.id) + .order_by(Chunk.created_at, Chunk.id) + .offset(start) + .limit(end - start) + ) + windowed_chunks = windowed_result.scalars().all() - # Return the document with its chunks return DocumentWithChunksRead( id=document.id, title=document.title, @@ -940,7 +945,9 @@ async def 
get_document_by_chunk_id( created_at=document.created_at, updated_at=document.updated_at, search_space_id=document.search_space_id, - chunks=sorted_chunks, + chunks=windowed_chunks, + total_chunks=total_chunks, + chunk_start_index=start, ) except HTTPException: raise @@ -950,6 +957,75 @@ async def get_document_by_chunk_id( ) from e +@router.get( + "/documents/{document_id}/chunks", + response_model=PaginatedResponse[ChunkRead], +) +async def get_document_chunks_paginated( + document_id: int, + page: int = Query(0, ge=0), + page_size: int = Query(20, ge=1, le=100), + start_offset: int | None = Query( + None, ge=0, description="Direct offset; overrides page * page_size" + ), + session: AsyncSession = Depends(get_async_session), + user: User = Depends(current_active_user), +): + """ + Paginated chunk loading for a document. + Supports both page-based and offset-based access. + """ + try: + from sqlalchemy import func + + doc_result = await session.execute( + select(Document).filter(Document.id == document_id) + ) + document = doc_result.scalars().first() + + if not document: + raise HTTPException(status_code=404, detail="Document not found") + + await check_permission( + session, + user, + document.search_space_id, + Permission.DOCUMENTS_READ.value, + "You don't have permission to read documents in this search space", + ) + + total_result = await session.execute( + select(func.count()) + .select_from(Chunk) + .filter(Chunk.document_id == document_id) + ) + total = total_result.scalar() or 0 + + offset = start_offset if start_offset is not None else page * page_size + chunks_result = await session.execute( + select(Chunk) + .filter(Chunk.document_id == document_id) + .order_by(Chunk.created_at, Chunk.id) + .offset(offset) + .limit(page_size) + ) + chunks = chunks_result.scalars().all() + + return PaginatedResponse( + items=chunks, + total=total, + page=offset // page_size if page_size else page, + page_size=page_size, + has_more=(offset + len(chunks)) < total, + ) + 
except HTTPException: + raise + except Exception as e: + raise HTTPException( + status_code=500, detail=f"Failed to fetch chunks: {e!s}" + ) from e + + @router.get("/documents/{document_id}", response_model=DocumentRead) async def read_document( document_id: int, @@ -980,13 +1056,14 @@ async def read_document( "You don't have permission to read documents in this search space", ) - # Convert database object to API-friendly format + raw_content = document.content or "" return DocumentRead( id=document.id, title=document.title, document_type=document.document_type, document_metadata=document.document_metadata, - content=document.content, + content=raw_content, + content_preview=raw_content[:300], content_hash=document.content_hash, unique_identifier_hash=document.unique_identifier_hash, created_at=document.created_at, diff --git a/surfsense_backend/app/routes/editor_routes.py b/surfsense_backend/app/routes/editor_routes.py index f54f18def..09a35c619 100644 --- a/surfsense_backend/app/routes/editor_routes.py +++ b/surfsense_backend/app/routes/editor_routes.py @@ -15,11 +15,10 @@ import pypandoc import typst from fastapi import APIRouter, Depends, HTTPException, Query from fastapi.responses import StreamingResponse -from sqlalchemy import select +from sqlalchemy import func, select from sqlalchemy.ext.asyncio import AsyncSession -from sqlalchemy.orm import selectinload -from app.db import Document, DocumentType, Permission, User, get_async_session +from app.db import Chunk, Document, DocumentType, Permission, User, get_async_session from app.routes.reports_routes import ( _FILE_EXTENSIONS, _MEDIA_TYPES, @@ -44,6 +43,9 @@ router = APIRouter() async def get_editor_content( search_space_id: int, document_id: int, + max_length: int | None = Query( + None, description="Truncate source_markdown to this many characters" + ), session: AsyncSession = Depends(get_async_session), user: User = Depends(current_active_user), ): @@ -65,9 +67,7 @@ async def get_editor_content( ) result 
= await session.execute( - select(Document) - .options(selectinload(Document.chunks)) - .filter( + select(Document).filter( Document.id == document_id, Document.search_space_id == search_space_id, ) @@ -77,62 +77,63 @@ async def get_editor_content( if not document: raise HTTPException(status_code=404, detail="Document not found") - # Priority 1: Return source_markdown if it exists (check `is not None` to allow empty strings) - if document.source_markdown is not None: + count_result = await session.execute( + select(func.count()).select_from(Chunk).filter(Chunk.document_id == document_id) + ) + chunk_count = count_result.scalar() or 0 + + def _build_response(md: str) -> dict: + size_bytes = len(md.encode("utf-8")) + truncated = False + output_md = md + if max_length is not None and size_bytes > max_length: + output_md = md[:max_length] + truncated = True return { "document_id": document.id, "title": document.title, "document_type": document.document_type.value, - "source_markdown": document.source_markdown, + "source_markdown": output_md, + "content_size_bytes": size_bytes, + "chunk_count": chunk_count, + "truncated": truncated, "updated_at": document.updated_at.isoformat() if document.updated_at else None, } - # Priority 2: Lazy-migrate from blocknote_document (pure Python, no external deps) + if document.source_markdown is not None: + return _build_response(document.source_markdown) + if document.blocknote_document: from app.utils.blocknote_to_markdown import blocknote_to_markdown markdown = blocknote_to_markdown(document.blocknote_document) if markdown: - # Persist the migration so we don't repeat it document.source_markdown = markdown await session.commit() - return { - "document_id": document.id, - "title": document.title, - "document_type": document.document_type.value, - "source_markdown": markdown, - "updated_at": document.updated_at.isoformat() - if document.updated_at - else None, - } + return _build_response(markdown) - # Priority 3: For NOTE type with no 
content, return empty markdown if document.document_type == DocumentType.NOTE: empty_markdown = "" document.source_markdown = empty_markdown await session.commit() - return { - "document_id": document.id, - "title": document.title, - "document_type": document.document_type.value, - "source_markdown": empty_markdown, - "updated_at": document.updated_at.isoformat() - if document.updated_at - else None, - } + return _build_response(empty_markdown) - # Priority 4: Reconstruct from chunks - chunks = sorted(document.chunks, key=lambda c: c.id) + chunk_contents_result = await session.execute( + select(Chunk.content) + .filter(Chunk.document_id == document_id) + .order_by(Chunk.id) + ) + chunk_contents = chunk_contents_result.scalars().all() - if not chunks: + if not chunk_contents: raise HTTPException( status_code=400, detail="This document has no content and cannot be edited. Please re-upload to enable editing.", ) - markdown_content = "\n\n".join(chunk.content for chunk in chunks) + markdown_content = "\n\n".join(chunk_contents) if not markdown_content.strip(): raise HTTPException( @@ -140,17 +141,77 @@ async def get_editor_content( detail="This document has empty content and cannot be edited.", ) - # Persist the lazy migration document.source_markdown = markdown_content await session.commit() - return { - "document_id": document.id, - "title": document.title, - "document_type": document.document_type.value, - "source_markdown": markdown_content, - "updated_at": document.updated_at.isoformat() if document.updated_at else None, - } + return _build_response(markdown_content) + + +@router.get( + "/search-spaces/{search_space_id}/documents/{document_id}/download-markdown" +) +async def download_document_markdown( + search_space_id: int, + document_id: int, + session: AsyncSession = Depends(get_async_session), + user: User = Depends(current_active_user), +): + """ + Download the full document content as a .md file. + Reconstructs markdown from source_markdown or chunks. 
+ """ + await check_permission( + session, + user, + search_space_id, + Permission.DOCUMENTS_READ.value, + "You don't have permission to read documents in this search space", + ) + + result = await session.execute( + select(Document).filter( + Document.id == document_id, + Document.search_space_id == search_space_id, + ) + ) + document = result.scalars().first() + + if not document: + raise HTTPException(status_code=404, detail="Document not found") + + markdown: str | None = document.source_markdown + if markdown is None and document.blocknote_document: + from app.utils.blocknote_to_markdown import blocknote_to_markdown + + markdown = blocknote_to_markdown(document.blocknote_document) + if markdown is None: + chunk_contents_result = await session.execute( + select(Chunk.content) + .filter(Chunk.document_id == document_id) + .order_by(Chunk.id) + ) + chunk_contents = chunk_contents_result.scalars().all() + if chunk_contents: + markdown = "\n\n".join(chunk_contents) + + if not markdown or not markdown.strip(): + raise HTTPException( + status_code=400, detail="Document has no content to download" + ) + + safe_title = ( + "".join( + c if c.isalnum() or c in " -_" else "_" + for c in (document.title or "document") + ).strip()[:80] + or "document" + ) + + return StreamingResponse( + io.BytesIO(markdown.encode("utf-8")), + media_type="text/markdown; charset=utf-8", + headers={"Content-Disposition": f'attachment; filename="{safe_title}.md"'}, + ) @router.post("/search-spaces/{search_space_id}/documents/{document_id}/save") @@ -258,9 +319,7 @@ async def export_document( ) result = await session.execute( - select(Document) - .options(selectinload(Document.chunks)) - .filter( + select(Document).filter( Document.id == document_id, Document.search_space_id == search_space_id, ) @@ -269,16 +328,20 @@ async def export_document( if not document: raise HTTPException(status_code=404, detail="Document not found") - # Resolve markdown content (same priority as editor-content 
endpoint) markdown_content: str | None = document.source_markdown if markdown_content is None and document.blocknote_document: from app.utils.blocknote_to_markdown import blocknote_to_markdown markdown_content = blocknote_to_markdown(document.blocknote_document) if markdown_content is None: - chunks = sorted(document.chunks, key=lambda c: c.id) - if chunks: - markdown_content = "\n\n".join(chunk.content for chunk in chunks) + chunk_contents_result = await session.execute( + select(Chunk.content) + .filter(Chunk.document_id == document_id) + .order_by(Chunk.id) + ) + chunk_contents = chunk_contents_result.scalars().all() + if chunk_contents: + markdown_content = "\n\n".join(chunk_contents) if not markdown_content or not markdown_content.strip(): raise HTTPException(status_code=400, detail="Document has no content to export") diff --git a/surfsense_backend/app/schemas/documents.py b/surfsense_backend/app/schemas/documents.py index c022a09d2..49d2836b2 100644 --- a/surfsense_backend/app/schemas/documents.py +++ b/surfsense_backend/app/schemas/documents.py @@ -53,25 +53,26 @@ class DocumentRead(BaseModel): title: str document_type: DocumentType document_metadata: dict - content: str # Changed to string to match frontend + content: str = "" + content_preview: str = "" content_hash: str unique_identifier_hash: str | None created_at: datetime updated_at: datetime | None search_space_id: int folder_id: int | None = None - created_by_id: UUID | None = None # User who created/uploaded this document + created_by_id: UUID | None = None created_by_name: str | None = None created_by_email: str | None = None - status: DocumentStatusSchema | None = ( - None # Processing status (ready, processing, failed) - ) + status: DocumentStatusSchema | None = None model_config = ConfigDict(from_attributes=True) class DocumentWithChunksRead(DocumentRead): chunks: list[ChunkRead] = [] + total_chunks: int = 0 + chunk_start_index: int = 0 model_config = ConfigDict(from_attributes=True) diff --git 
a/surfsense_backend/app/tasks/chat/stream_new_chat.py b/surfsense_backend/app/tasks/chat/stream_new_chat.py index 7c1e3b7ea..5ff907459 100644 --- a/surfsense_backend/app/tasks/chat/stream_new_chat.py +++ b/surfsense_backend/app/tasks/chat/stream_new_chat.py @@ -39,7 +39,6 @@ from app.agents.new_chat.llm_config import ( ) from app.db import ( ChatVisibility, - Document, NewChatMessage, NewChatThread, Report, @@ -63,74 +62,6 @@ _perf_log = get_perf_logger() _background_tasks: set[asyncio.Task] = set() -def format_mentioned_documents_as_context(documents: list[Document]) -> str: - """ - Format mentioned documents as context for the agent. - - Uses the same XML structure as knowledge_base.format_documents_for_context - to ensure citations work properly with chunk IDs. - """ - if not documents: - return "" - - context_parts = [""] - context_parts.append( - "The user has explicitly mentioned the following documents from their knowledge base. " - "These documents are directly relevant to the query and should be prioritized as primary sources. " - "Use [citation:CHUNK_ID] format for citations (e.g., [citation:123])." 
- ) - context_parts.append("") - - for doc in documents: - # Build metadata JSON - metadata = doc.document_metadata or {} - metadata_json = json.dumps(metadata, ensure_ascii=False) - - # Get URL from metadata - url = ( - metadata.get("url") - or metadata.get("source") - or metadata.get("page_url") - or "" - ) - - context_parts.append("") - context_parts.append("") - context_parts.append(f" {doc.id}") - context_parts.append( - f" {doc.document_type.value}" - ) - context_parts.append(f" <![CDATA[{doc.title}]]>") - context_parts.append(f" ") - context_parts.append( - f" " - ) - context_parts.append("") - context_parts.append("") - context_parts.append("") - - # Use chunks if available (preferred for proper citations) - if hasattr(doc, "chunks") and doc.chunks: - for chunk in doc.chunks: - context_parts.append( - f" " - ) - else: - # Fallback to document content if chunks not loaded - # Use document ID as chunk ID prefix for consistency - context_parts.append( - f" " - ) - - context_parts.append("") - context_parts.append("") - context_parts.append("") - - context_parts.append("") - - return "\n".join(context_parts) - - def format_mentioned_surfsense_docs_as_context( documents: list[SurfsenseDocsDocument], ) -> str: @@ -1317,6 +1248,7 @@ async def stream_new_chat( firecrawl_api_key=firecrawl_api_key, thread_visibility=visibility, disabled_tools=disabled_tools, + mentioned_document_ids=mentioned_document_ids, ) _perf_log.info( "[stream_new_chat] Agent created in %.3fs", time.perf_counter() - _t0 @@ -1340,18 +1272,9 @@ async def stream_new_chat( thread.needs_history_bootstrap = False await session.commit() - # Fetch mentioned documents if any (with chunks for proper citations) - mentioned_documents: list[Document] = [] - if mentioned_document_ids: - result = await session.execute( - select(Document) - .options(selectinload(Document.chunks)) - .filter( - Document.id.in_(mentioned_document_ids), - Document.search_space_id == search_space_id, - ) - ) - mentioned_documents = 
list(result.scalars().all()) + # Mentioned KB documents are now handled by KnowledgeBaseSearchMiddleware + # which merges them into the scoped filesystem with full document + # structure. Only SurfSense docs and report context are inlined here. # Fetch mentioned SurfSense docs if any mentioned_surfsense_docs: list[SurfsenseDocsDocument] = [] @@ -1379,15 +1302,10 @@ async def stream_new_chat( ) recent_reports = list(recent_reports_result.scalars().all()) - # Format the user query with context (mentioned documents + SurfSense docs) + # Format the user query with context (SurfSense docs + reports only) final_query = user_query context_parts = [] - if mentioned_documents: - context_parts.append( - format_mentioned_documents_as_context(mentioned_documents) - ) - if mentioned_surfsense_docs: context_parts.append( format_mentioned_surfsense_docs_as_context(mentioned_surfsense_docs) @@ -1479,7 +1397,7 @@ async def stream_new_chat( yield streaming_service.format_start_step() # Initial thinking step - analyzing the request - if mentioned_documents or mentioned_surfsense_docs: + if mentioned_surfsense_docs: initial_title = "Analyzing referenced content" action_verb = "Analyzing" else: @@ -1490,18 +1408,6 @@ async def stream_new_chat( query_text = user_query[:80] + ("..." if len(user_query) > 80 else "") processing_parts.append(query_text) - if mentioned_documents: - doc_names = [] - for doc in mentioned_documents: - title = doc.title - if len(title) > 30: - title = title[:27] + "..." - doc_names.append(title) - if len(doc_names) == 1: - processing_parts.append(f"[{doc_names[0]}]") - else: - processing_parts.append(f"[{len(doc_names)} documents]") - if mentioned_surfsense_docs: doc_names = [] for doc in mentioned_surfsense_docs: @@ -1527,7 +1433,7 @@ async def stream_new_chat( # These ORM objects (with eagerly-loaded chunks) can be very large. # They're only needed to build context strings already copied into # final_query / langchain_messages — release them before streaming. 
- del mentioned_documents, mentioned_surfsense_docs, recent_reports + del mentioned_surfsense_docs, recent_reports del langchain_messages, final_query # Check if this is the first assistant response so we can generate diff --git a/surfsense_backend/app/tasks/document_processors/__init__.py b/surfsense_backend/app/tasks/document_processors/__init__.py index e70c41cb4..2b5690d02 100644 --- a/surfsense_backend/app/tasks/document_processors/__init__.py +++ b/surfsense_backend/app/tasks/document_processors/__init__.py @@ -12,16 +12,14 @@ Available processors: - YouTube processor: Process YouTube videos and extract transcripts """ -# URL crawler # Extension processor -from .extension_processor import add_extension_received_document - -# File processors -from .file_processors import ( +# File processors (backward-compatible re-exports from _save) +from ._save import ( add_received_file_document_using_docling, add_received_file_document_using_llamacloud, add_received_file_document_using_unstructured, ) +from .extension_processor import add_extension_received_document # Markdown processor from .markdown_processor import add_received_markdown_file_document @@ -32,9 +30,9 @@ from .youtube_processor import add_youtube_video_document __all__ = [ # Extension processing "add_extension_received_document", + # File processing with different ETL services "add_received_file_document_using_docling", "add_received_file_document_using_llamacloud", - # File processing with different ETL services "add_received_file_document_using_unstructured", # Markdown file processing "add_received_markdown_file_document", diff --git a/surfsense_backend/app/tasks/document_processors/_constants.py b/surfsense_backend/app/tasks/document_processors/_constants.py new file mode 100644 index 000000000..f74d7acce --- /dev/null +++ b/surfsense_backend/app/tasks/document_processors/_constants.py @@ -0,0 +1,74 @@ +""" +Constants for file document processing. 
+ +Centralizes file type classification, LlamaCloud retry configuration, +and timeout calculation parameters. +""" + +import ssl +from enum import Enum + +import httpx + +# --------------------------------------------------------------------------- +# File type classification +# --------------------------------------------------------------------------- + +MARKDOWN_EXTENSIONS = (".md", ".markdown", ".txt") +AUDIO_EXTENSIONS = (".mp3", ".mp4", ".mpeg", ".mpga", ".m4a", ".wav", ".webm") +DIRECT_CONVERT_EXTENSIONS = (".csv", ".tsv", ".html", ".htm") + + +class FileCategory(Enum): + MARKDOWN = "markdown" + AUDIO = "audio" + DIRECT_CONVERT = "direct_convert" + DOCUMENT = "document" + + +def classify_file(filename: str) -> FileCategory: + """Classify a file by its extension into a processing category.""" + lower = filename.lower() + if lower.endswith(MARKDOWN_EXTENSIONS): + return FileCategory.MARKDOWN + if lower.endswith(AUDIO_EXTENSIONS): + return FileCategory.AUDIO + if lower.endswith(DIRECT_CONVERT_EXTENSIONS): + return FileCategory.DIRECT_CONVERT + return FileCategory.DOCUMENT + + +# --------------------------------------------------------------------------- +# LlamaCloud retry configuration +# --------------------------------------------------------------------------- + +LLAMACLOUD_MAX_RETRIES = 5 +LLAMACLOUD_BASE_DELAY = 10 # seconds (exponential backoff base) +LLAMACLOUD_MAX_DELAY = 120 # max delay between retries (2 minutes) +LLAMACLOUD_RETRYABLE_EXCEPTIONS = ( + ssl.SSLError, + httpx.ConnectError, + httpx.ConnectTimeout, + httpx.ReadError, + httpx.ReadTimeout, + httpx.WriteError, + httpx.WriteTimeout, + httpx.RemoteProtocolError, + httpx.LocalProtocolError, + ConnectionError, + ConnectionResetError, + TimeoutError, + OSError, +) + +# --------------------------------------------------------------------------- +# Timeout calculation constants +# --------------------------------------------------------------------------- + +UPLOAD_BYTES_PER_SECOND_SLOW = ( + 100 * 
1024 +) # 100 KB/s (conservative for slow connections) +MIN_UPLOAD_TIMEOUT = 120 # Minimum 2 minutes for any file +MAX_UPLOAD_TIMEOUT = 1800 # Maximum 30 minutes for very large files +BASE_JOB_TIMEOUT = 600 # 10 minutes base for job processing +PER_PAGE_JOB_TIMEOUT = 60 # 1 minute per page for processing diff --git a/surfsense_backend/app/tasks/document_processors/_direct_converters.py b/surfsense_backend/app/tasks/document_processors/_direct_converters.py new file mode 100644 index 000000000..b1a69ef4f --- /dev/null +++ b/surfsense_backend/app/tasks/document_processors/_direct_converters.py @@ -0,0 +1,90 @@ +""" +Lossless file-to-markdown converters for text-based formats. + +These converters handle file types that can be faithfully represented as +markdown without any external ETL/OCR service: + +- CSV / TSV → markdown table (stdlib ``csv``) +- HTML / HTM → markdown (``markdownify``) +""" + +from __future__ import annotations + +import csv +from collections.abc import Callable +from pathlib import Path + +from markdownify import markdownify + +# The stdlib csv module defaults to a 128 KB field-size limit which is too +# small for real-world exports (e.g. chat logs, CRM dumps). We raise it once +# at import time so every csv.reader call in this module can handle large fields. +csv.field_size_limit(2**31 - 1) + + +def _escape_pipe(cell: str) -> str: + """Escape literal pipe characters inside a markdown table cell.""" + return cell.replace("|", "\\|") + + +def csv_to_markdown(file_path: str, *, delimiter: str = ",") -> str: + """Convert a CSV (or TSV) file to a markdown table. + + The first row is treated as the header. An empty file returns an + empty string so the caller can decide how to handle it. 
+ """ + with open(file_path, encoding="utf-8", newline="") as fh: + reader = csv.reader(fh, delimiter=delimiter) + rows = list(reader) + + if not rows: + return "" + + header, *body = rows + col_count = len(header) + + lines: list[str] = [] + + header_cells = [_escape_pipe(c.strip()) for c in header] + lines.append("| " + " | ".join(header_cells) + " |") + lines.append("| " + " | ".join(["---"] * col_count) + " |") + + for row in body: + padded = row + [""] * (col_count - len(row)) + cells = [_escape_pipe(c.strip()) for c in padded[:col_count]] + lines.append("| " + " | ".join(cells) + " |") + + return "\n".join(lines) + "\n" + + +def tsv_to_markdown(file_path: str) -> str: + """Convert a TSV file to a markdown table.""" + return csv_to_markdown(file_path, delimiter="\t") + + +def html_to_markdown(file_path: str) -> str: + """Convert an HTML file to markdown via ``markdownify``.""" + html = Path(file_path).read_text(encoding="utf-8") + return markdownify(html).strip() + + +_CONVERTER_MAP: dict[str, Callable[..., str]] = { + ".csv": csv_to_markdown, + ".tsv": tsv_to_markdown, + ".html": html_to_markdown, + ".htm": html_to_markdown, +} + + +def convert_file_directly(file_path: str, filename: str) -> str: + """Dispatch to the appropriate lossless converter based on file extension. + + Raises ``ValueError`` if the extension is not supported. + """ + suffix = Path(filename).suffix.lower() + converter = _CONVERTER_MAP.get(suffix) + if converter is None: + raise ValueError( + f"No direct converter for extension '{suffix}' (file: {filename})" + ) + return converter(file_path) diff --git a/surfsense_backend/app/tasks/document_processors/_etl.py b/surfsense_backend/app/tasks/document_processors/_etl.py new file mode 100644 index 000000000..cc3a8b1ac --- /dev/null +++ b/surfsense_backend/app/tasks/document_processors/_etl.py @@ -0,0 +1,209 @@ +""" +ETL parsing strategies for different document processing services. 
+ +Provides parse functions for Unstructured, LlamaCloud, and Docling, along with +LlamaCloud retry logic and dynamic timeout calculations. +""" + +import asyncio +import logging +import os +import random +import warnings +from logging import ERROR, getLogger + +import httpx + +from app.config import config as app_config +from app.db import Log +from app.services.task_logging_service import TaskLoggingService + +from ._constants import ( + LLAMACLOUD_BASE_DELAY, + LLAMACLOUD_MAX_DELAY, + LLAMACLOUD_MAX_RETRIES, + LLAMACLOUD_RETRYABLE_EXCEPTIONS, + PER_PAGE_JOB_TIMEOUT, +) +from ._helpers import calculate_job_timeout, calculate_upload_timeout + +# --------------------------------------------------------------------------- +# LlamaCloud parsing with retry +# --------------------------------------------------------------------------- + + +async def parse_with_llamacloud_retry( + file_path: str, + estimated_pages: int, + task_logger: TaskLoggingService | None = None, + log_entry: Log | None = None, +): + """ + Parse a file with LlamaCloud with retry logic for transient SSL/connection errors. + + Uses dynamic timeout calculations based on file size and page count to handle + very large files reliably. 
+ + Returns: + LlamaParse result object + + Raises: + Exception: If all retries fail + """ + from llama_cloud_services import LlamaParse + from llama_cloud_services.parse.utils import ResultType + + file_size_bytes = os.path.getsize(file_path) + file_size_mb = file_size_bytes / (1024 * 1024) + + upload_timeout = calculate_upload_timeout(file_size_bytes) + job_timeout = calculate_job_timeout(estimated_pages, file_size_bytes) + + custom_timeout = httpx.Timeout( + connect=120.0, + read=upload_timeout, + write=upload_timeout, + pool=120.0, + ) + + logging.info( + f"LlamaCloud upload configured: file_size={file_size_mb:.1f}MB, " + f"pages={estimated_pages}, upload_timeout={upload_timeout:.0f}s, " + f"job_timeout={job_timeout:.0f}s" + ) + + last_exception = None + attempt_errors: list[str] = [] + + for attempt in range(1, LLAMACLOUD_MAX_RETRIES + 1): + try: + async with httpx.AsyncClient(timeout=custom_timeout) as custom_client: + parser = LlamaParse( + api_key=app_config.LLAMA_CLOUD_API_KEY, + num_workers=1, + verbose=True, + language="en", + result_type=ResultType.MD, + max_timeout=int(max(2000, job_timeout + upload_timeout)), + job_timeout_in_seconds=job_timeout, + job_timeout_extra_time_per_page_in_seconds=PER_PAGE_JOB_TIMEOUT, + custom_client=custom_client, + ) + result = await parser.aparse(file_path) + + if attempt > 1: + logging.info( + f"LlamaCloud upload succeeded on attempt {attempt} after " + f"{len(attempt_errors)} failures" + ) + return result + + except LLAMACLOUD_RETRYABLE_EXCEPTIONS as e: + last_exception = e + error_type = type(e).__name__ + error_msg = str(e)[:200] + attempt_errors.append(f"Attempt {attempt}: {error_type} - {error_msg}") + + if attempt < LLAMACLOUD_MAX_RETRIES: + base_delay = min( + LLAMACLOUD_BASE_DELAY * (2 ** (attempt - 1)), + LLAMACLOUD_MAX_DELAY, + ) + jitter = base_delay * 0.25 * (2 * random.random() - 1) + delay = base_delay + jitter + + if task_logger and log_entry: + await task_logger.log_task_progress( + log_entry, + 
f"LlamaCloud upload failed " + f"(attempt {attempt}/{LLAMACLOUD_MAX_RETRIES}), " + f"retrying in {delay:.0f}s", + { + "error_type": error_type, + "error_message": error_msg, + "attempt": attempt, + "retry_delay": delay, + "file_size_mb": round(file_size_mb, 1), + "upload_timeout": upload_timeout, + }, + ) + else: + logging.warning( + f"LlamaCloud upload failed " + f"(attempt {attempt}/{LLAMACLOUD_MAX_RETRIES}): " + f"{error_type}. File: {file_size_mb:.1f}MB. " + f"Retrying in {delay:.0f}s..." + ) + + await asyncio.sleep(delay) + else: + logging.error( + f"LlamaCloud upload failed after {LLAMACLOUD_MAX_RETRIES} " + f"attempts. File size: {file_size_mb:.1f}MB, " + f"Pages: {estimated_pages}. " + f"Errors: {'; '.join(attempt_errors)}" + ) + + except Exception: + raise + + raise last_exception or RuntimeError( + f"LlamaCloud parsing failed after {LLAMACLOUD_MAX_RETRIES} retries. " + f"File size: {file_size_mb:.1f}MB" + ) + + +# --------------------------------------------------------------------------- +# Per-service parse functions +# --------------------------------------------------------------------------- + + +async def parse_with_unstructured(file_path: str): + """ + Parse a file using the Unstructured ETL service. + + Returns: + List of LangChain Document elements. + """ + from langchain_unstructured import UnstructuredLoader + + loader = UnstructuredLoader( + file_path, + mode="elements", + post_processors=[], + languages=["eng"], + include_orig_elements=False, + include_metadata=False, + strategy="auto", + ) + return await loader.aload() + + +async def parse_with_docling(file_path: str, filename: str) -> str: + """ + Parse a file using the Docling ETL service (via the Docling service wrapper). + + Returns: + Markdown content string. 
+ """ + from app.services.docling_service import create_docling_service + + docling_service = create_docling_service() + + pdfminer_logger = getLogger("pdfminer") + original_level = pdfminer_logger.level + + with warnings.catch_warnings(): + warnings.filterwarnings("ignore", category=UserWarning, module="pdfminer") + warnings.filterwarnings( + "ignore", message=".*Cannot set gray non-stroke color.*" + ) + warnings.filterwarnings("ignore", message=".*invalid float value.*") + pdfminer_logger.setLevel(ERROR) + + try: + result = await docling_service.process_document(file_path, filename) + finally: + pdfminer_logger.setLevel(original_level) + + return result["content"] diff --git a/surfsense_backend/app/tasks/document_processors/_helpers.py b/surfsense_backend/app/tasks/document_processors/_helpers.py new file mode 100644 index 000000000..7ac05932c --- /dev/null +++ b/surfsense_backend/app/tasks/document_processors/_helpers.py @@ -0,0 +1,218 @@ +""" +Document helper functions for deduplication, migration, and connector updates. + +Provides reusable logic shared across file processors and ETL strategies. +""" + +import logging + +from sqlalchemy.ext.asyncio import AsyncSession + +from app.db import Document, DocumentStatus, DocumentType +from app.utils.document_converters import generate_unique_identifier_hash + +from ._constants import ( + BASE_JOB_TIMEOUT, + MAX_UPLOAD_TIMEOUT, + MIN_UPLOAD_TIMEOUT, + PER_PAGE_JOB_TIMEOUT, + UPLOAD_BYTES_PER_SECOND_SLOW, +) +from .base import ( + check_document_by_unique_identifier, + check_duplicate_document, +) + +# --------------------------------------------------------------------------- +# Unique identifier helpers +# --------------------------------------------------------------------------- + + +def get_google_drive_unique_identifier( + connector: dict | None, + filename: str, + search_space_id: int, +) -> tuple[str, str | None]: + """ + Get unique identifier hash, using file_id for Google Drive (stable across renames). 
+ + Returns: + Tuple of (primary_hash, legacy_hash or None). + For Google Drive: (file_id-based hash, filename-based hash for migration). + For other sources: (filename-based hash, None). + """ + if connector and connector.get("type") == DocumentType.GOOGLE_DRIVE_FILE: + metadata = connector.get("metadata", {}) + file_id = metadata.get("google_drive_file_id") + + if file_id: + primary_hash = generate_unique_identifier_hash( + DocumentType.GOOGLE_DRIVE_FILE, file_id, search_space_id + ) + legacy_hash = generate_unique_identifier_hash( + DocumentType.GOOGLE_DRIVE_FILE, filename, search_space_id + ) + return primary_hash, legacy_hash + + primary_hash = generate_unique_identifier_hash( + DocumentType.FILE, filename, search_space_id + ) + return primary_hash, None + + +# --------------------------------------------------------------------------- +# Document deduplication and migration +# --------------------------------------------------------------------------- + + +async def handle_existing_document_update( + session: AsyncSession, + existing_document: Document, + content_hash: str, + connector: dict | None, + filename: str, + primary_hash: str, +) -> tuple[bool, Document | None]: + """ + Handle update logic for an existing document. 
+ + Returns: + Tuple of (should_skip_processing, document_to_return): + - (True, document): Content unchanged, return existing document + - (False, None): Content changed, needs re-processing + """ + if existing_document.unique_identifier_hash != primary_hash: + existing_document.unique_identifier_hash = primary_hash + logging.info(f"Migrated document to file_id-based identifier: {filename}") + + if existing_document.content_hash == content_hash: + if connector and connector.get("type") == DocumentType.GOOGLE_DRIVE_FILE: + connector_metadata = connector.get("metadata", {}) + new_name = connector_metadata.get("google_drive_file_name") + doc_metadata = existing_document.document_metadata or {} + old_name = doc_metadata.get("FILE_NAME") or doc_metadata.get( + "google_drive_file_name" + ) + + if new_name and old_name and old_name != new_name: + from sqlalchemy.orm.attributes import flag_modified + + existing_document.title = new_name + if not existing_document.document_metadata: + existing_document.document_metadata = {} + existing_document.document_metadata["FILE_NAME"] = new_name + existing_document.document_metadata["google_drive_file_name"] = new_name + flag_modified(existing_document, "document_metadata") + await session.commit() + logging.info( + f"File renamed in Google Drive: '{old_name}' → '{new_name}' " + f"(no re-processing needed)" + ) + + logging.info(f"Document for file {filename} unchanged. Skipping.") + return True, existing_document + + # Content has changed — guard against content_hash collision before + # expensive ETL processing. + collision_doc = await check_duplicate_document(session, content_hash) + if collision_doc and collision_doc.id != existing_document.id: + logging.warning( + "Content-hash collision for %s: identical content exists in " + "document #%s (%s). 
Skipping re-processing.", + filename, + collision_doc.id, + collision_doc.document_type, + ) + if DocumentStatus.is_state( + existing_document.status, DocumentStatus.PENDING + ) or DocumentStatus.is_state( + existing_document.status, DocumentStatus.PROCESSING + ): + await session.delete(existing_document) + await session.commit() + return True, None + + return True, existing_document + + logging.info(f"Content changed for file {filename}. Updating document.") + return False, None + + +async def find_existing_document_with_migration( + session: AsyncSession, + primary_hash: str, + legacy_hash: str | None, + content_hash: str | None = None, +) -> Document | None: + """ + Find existing document, checking primary hash, legacy hash, and content_hash. + + Supports migration from filename-based to file_id-based hashing for + Google Drive files, with content_hash fallback for cross-source dedup. + """ + existing_document = await check_document_by_unique_identifier(session, primary_hash) + + if not existing_document and legacy_hash: + existing_document = await check_document_by_unique_identifier( + session, legacy_hash + ) + if existing_document: + logging.info( + "Found legacy document (filename-based hash), " + "will migrate to file_id-based hash" + ) + + if not existing_document and content_hash: + existing_document = await check_duplicate_document(session, content_hash) + if existing_document: + logging.info( + f"Found duplicate content from different source (content_hash match). 
" + f"Original document ID: {existing_document.id}, " + f"type: {existing_document.document_type}" + ) + + return existing_document + + +# --------------------------------------------------------------------------- +# Connector helpers +# --------------------------------------------------------------------------- + + +async def update_document_from_connector( + document: Document | None, + connector: dict | None, + session: AsyncSession, +) -> None: + """Update document type, metadata, and connector_id from connector info.""" + if not document or not connector: + return + if "type" in connector: + document.document_type = connector["type"] + if "metadata" in connector: + if not document.document_metadata: + document.document_metadata = connector["metadata"] + else: + merged = {**document.document_metadata, **connector["metadata"]} + document.document_metadata = merged + if "connector_id" in connector: + document.connector_id = connector["connector_id"] + await session.commit() + + +# --------------------------------------------------------------------------- +# Timeout calculations +# --------------------------------------------------------------------------- + + +def calculate_upload_timeout(file_size_bytes: int) -> float: + """Calculate upload timeout based on file size (conservative for slow connections).""" + estimated_time = (file_size_bytes / UPLOAD_BYTES_PER_SECOND_SLOW) * 1.5 + return max(MIN_UPLOAD_TIMEOUT, min(estimated_time, MAX_UPLOAD_TIMEOUT)) + + +def calculate_job_timeout(estimated_pages: int, file_size_bytes: int) -> float: + """Calculate job processing timeout based on page count and file size.""" + page_based_timeout = BASE_JOB_TIMEOUT + (estimated_pages * PER_PAGE_JOB_TIMEOUT) + size_based_timeout = BASE_JOB_TIMEOUT + (file_size_bytes / (10 * 1024 * 1024)) * 60 + return max(page_based_timeout, size_based_timeout) diff --git a/surfsense_backend/app/tasks/document_processors/_save.py b/surfsense_backend/app/tasks/document_processors/_save.py new 
file mode 100644 index 000000000..5088ad004 --- /dev/null +++ b/surfsense_backend/app/tasks/document_processors/_save.py @@ -0,0 +1,285 @@ +""" +Unified document save/update logic for file processors. + +Replaces the three nearly-identical ``add_received_file_document_using_*`` +functions with a single ``save_file_document`` function plus thin wrappers +for backward compatibility. +""" + +import logging + +from langchain_core.documents import Document as LangChainDocument +from sqlalchemy.exc import SQLAlchemyError +from sqlalchemy.ext.asyncio import AsyncSession + +from app.db import Document, DocumentStatus, DocumentType +from app.services.llm_service import get_user_long_context_llm +from app.utils.document_converters import ( + create_document_chunks, + embed_text, + generate_content_hash, + generate_document_summary, +) + +from ._helpers import ( + find_existing_document_with_migration, + get_google_drive_unique_identifier, + handle_existing_document_update, +) +from .base import get_current_timestamp, safe_set_chunks + +# --------------------------------------------------------------------------- +# Summary generation +# --------------------------------------------------------------------------- + + +async def _generate_summary( + markdown_content: str, + file_name: str, + etl_service: str, + user_llm, + enable_summary: bool, +) -> tuple[str, list[float]]: + """ + Generate a document summary and embedding. + + Docling uses its own large-document summary strategy; other ETL services + use the standard ``generate_document_summary`` helper. 
+ """ + if not enable_summary: + summary = f"File: {file_name}\n\n{markdown_content[:4000]}" + return summary, embed_text(summary) + + if etl_service == "DOCLING": + from app.services.docling_service import create_docling_service + + docling_service = create_docling_service() + summary_text = await docling_service.process_large_document_summary( + content=markdown_content, llm=user_llm, document_title=file_name + ) + + meta = { + "file_name": file_name, + "etl_service": etl_service, + "document_type": "File Document", + } + parts = ["# DOCUMENT METADATA"] + for key, value in meta.items(): + if value: + formatted_key = key.replace("_", " ").title() + parts.append(f"**{formatted_key}:** {value}") + + enhanced = "\n".join(parts) + "\n\n# DOCUMENT SUMMARY\n\n" + summary_text + return enhanced, embed_text(enhanced) + + # Standard summary (Unstructured / LlamaCloud / others) + meta = { + "file_name": file_name, + "etl_service": etl_service, + "document_type": "File Document", + } + return await generate_document_summary(markdown_content, user_llm, meta) + + +# --------------------------------------------------------------------------- +# Unified save function +# --------------------------------------------------------------------------- + + +async def save_file_document( + session: AsyncSession, + file_name: str, + markdown_content: str, + search_space_id: int, + user_id: str, + etl_service: str, + connector: dict | None = None, + enable_summary: bool = True, +) -> Document | None: + """ + Process and store a file document with deduplication and migration support. + + Handles both creating new documents and updating existing ones. This is + the single implementation behind the per-ETL-service wrapper functions. 
+ + Args: + session: Database session + file_name: Name of the processed file + markdown_content: Markdown content to store + search_space_id: ID of the search space + user_id: ID of the user + etl_service: Name of the ETL service (UNSTRUCTURED, LLAMACLOUD, DOCLING) + connector: Optional connector info for Google Drive files + enable_summary: Whether to generate an AI summary + + Returns: + Document object if successful, None if duplicate detected + """ + try: + primary_hash, legacy_hash = get_google_drive_unique_identifier( + connector, file_name, search_space_id + ) + content_hash = generate_content_hash(markdown_content, search_space_id) + + existing_document = await find_existing_document_with_migration( + session, primary_hash, legacy_hash, content_hash + ) + + if existing_document: + should_skip, doc = await handle_existing_document_update( + session, + existing_document, + content_hash, + connector, + file_name, + primary_hash, + ) + if should_skip: + return doc + + user_llm = await get_user_long_context_llm(session, user_id, search_space_id) + if not user_llm: + raise RuntimeError( + f"No long context LLM configured for user {user_id} " + f"in search space {search_space_id}" + ) + + summary_content, summary_embedding = await _generate_summary( + markdown_content, file_name, etl_service, user_llm, enable_summary + ) + chunks = await create_document_chunks(markdown_content) + doc_metadata = {"FILE_NAME": file_name, "ETL_SERVICE": etl_service} + + if existing_document: + existing_document.title = file_name + existing_document.content = summary_content + existing_document.content_hash = content_hash + existing_document.embedding = summary_embedding + existing_document.document_metadata = doc_metadata + await safe_set_chunks(session, existing_document, chunks) + existing_document.source_markdown = markdown_content + existing_document.content_needs_reindexing = False + existing_document.updated_at = get_current_timestamp() + existing_document.status = 
DocumentStatus.ready() + + await session.commit() + await session.refresh(existing_document) + return existing_document + + doc_type = DocumentType.FILE + if connector and connector.get("type") == DocumentType.GOOGLE_DRIVE_FILE: + doc_type = DocumentType.GOOGLE_DRIVE_FILE + + document = Document( + search_space_id=search_space_id, + title=file_name, + document_type=doc_type, + document_metadata=doc_metadata, + content=summary_content, + embedding=summary_embedding, + chunks=chunks, + content_hash=content_hash, + unique_identifier_hash=primary_hash, + source_markdown=markdown_content, + content_needs_reindexing=False, + updated_at=get_current_timestamp(), + created_by_id=user_id, + connector_id=connector.get("connector_id") if connector else None, + status=DocumentStatus.ready(), + ) + session.add(document) + await session.commit() + await session.refresh(document) + return document + + except SQLAlchemyError as db_error: + await session.rollback() + if "ix_documents_content_hash" in str(db_error): + logging.warning( + "content_hash collision during commit for %s (%s). 
# ---------------------------------------------------------------------------
# Backward-compatible wrapper functions
# ---------------------------------------------------------------------------


async def add_received_file_document_using_unstructured(
    session: AsyncSession,
    file_name: str,
    unstructured_processed_elements: list[LangChainDocument],
    search_space_id: int,
    user_id: str,
    connector: dict | None = None,
    enable_summary: bool = True,
) -> Document | None:
    """Convert Unstructured elements to markdown and store them as a document.

    Args:
        session: Database session, forwarded to ``save_file_document``.
        file_name: Name of the processed file.
        unstructured_processed_elements: Elements produced by Unstructured;
            rendered to markdown before saving.
        search_space_id: ID of the search space.
        user_id: ID of the user.
        connector: Optional connector info, forwarded unchanged.
        enable_summary: Forwarded unchanged to ``save_file_document``.

    Returns:
        Whatever ``save_file_document`` returns for the "UNSTRUCTURED" service.
    """
    # Kept as a function-scope import exactly like the original; the reason is
    # not visible from this module (presumably an import cycle or import cost —
    # confirm before hoisting it to the top of the file).
    from app.utils.document_converters import convert_document_to_markdown

    rendered_markdown = await convert_document_to_markdown(
        unstructured_processed_elements
    )
    stored_document = await save_file_document(
        session,
        file_name,
        rendered_markdown,
        search_space_id,
        user_id,
        "UNSTRUCTURED",
        connector,
        enable_summary,
    )
    return stored_document


async def add_received_file_document_using_llamacloud(
    session: AsyncSession,
    file_name: str,
    llamacloud_markdown_document: str,
    search_space_id: int,
    user_id: str,
    connector: dict | None = None,
    enable_summary: bool = True,
) -> Document | None:
    """Store markdown that LlamaCloud already produced for a file.

    The markdown is passed through untouched; this wrapper only tags the call
    with the "LLAMACLOUD" service name.

    Returns:
        Whatever ``save_file_document`` returns for the "LLAMACLOUD" service.
    """
    stored_document = await save_file_document(
        session,
        file_name,
        llamacloud_markdown_document,
        search_space_id,
        user_id,
        "LLAMACLOUD",
        connector,
        enable_summary,
    )
    return stored_document


async def add_received_file_document_using_docling(
    session: AsyncSession,
    file_name: str,
    docling_markdown_document: str,
    search_space_id: int,
    user_id: str,
    connector: dict | None = None,
    enable_summary: bool = True,
) -> Document | None:
    """Store markdown that Docling already produced for a file.

    The markdown is passed through untouched; this wrapper only tags the call
    with the "DOCLING" service name.

    Returns:
        Whatever ``save_file_document`` returns for the "DOCLING" service.
    """
    stored_document = await save_file_document(
        session,
        file_name,
        docling_markdown_document,
        search_space_id,
        user_id,
        "DOCLING",
        connector,
        enable_summary,
    )
    return stored_document
import TaskLoggingService -from app.utils.document_converters import ( - convert_document_to_markdown, - create_document_chunks, - embed_text, - generate_content_hash, - generate_document_summary, - generate_unique_identifier_hash, -) -from .base import ( - check_document_by_unique_identifier, - check_duplicate_document, - get_current_timestamp, - safe_set_chunks, +from ._constants import FileCategory, classify_file +from ._direct_converters import convert_file_directly +from ._etl import ( + parse_with_docling, + parse_with_llamacloud_retry, + parse_with_unstructured, +) +from ._helpers import update_document_from_connector +from ._save import ( + add_received_file_document_using_docling, + add_received_file_document_using_llamacloud, + add_received_file_document_using_unstructured, + save_file_document, ) from .markdown_processor import add_received_markdown_file_document -# Constants for LlamaCloud retry configuration -LLAMACLOUD_MAX_RETRIES = 5 # Increased from 3 for large file resilience -LLAMACLOUD_BASE_DELAY = 10 # Base delay in seconds for exponential backoff -LLAMACLOUD_MAX_DELAY = 120 # Maximum delay between retries (2 minutes) -LLAMACLOUD_RETRYABLE_EXCEPTIONS = ( - ssl.SSLError, - httpx.ConnectError, - httpx.ConnectTimeout, - httpx.ReadTimeout, - httpx.WriteTimeout, - httpx.RemoteProtocolError, - httpx.LocalProtocolError, - ConnectionError, - ConnectionResetError, - TimeoutError, - OSError, # Catches various network-level errors -) - -# Timeout calculation constants -UPLOAD_BYTES_PER_SECOND_SLOW = ( - 100 * 1024 -) # 100 KB/s (conservative for slow connections) -MIN_UPLOAD_TIMEOUT = 120 # Minimum 2 minutes for any file -MAX_UPLOAD_TIMEOUT = 1800 # Maximum 30 minutes for very large files -BASE_JOB_TIMEOUT = 600 # 10 minutes base for job processing -PER_PAGE_JOB_TIMEOUT = 60 # 1 minute per page for processing +# Re-export public API so existing ``from file_processors import …`` keeps working. 
+__all__ = [ + "add_received_file_document_using_docling", + "add_received_file_document_using_llamacloud", + "add_received_file_document_using_unstructured", + "parse_with_llamacloud_retry", + "process_file_in_background", + "process_file_in_background_with_document", + "save_file_document", +] -def get_google_drive_unique_identifier( - connector: dict | None, - filename: str, - search_space_id: int, -) -> tuple[str, str | None]: - """ - Get unique identifier hash for a file, with special handling for Google Drive. - - For Google Drive files, uses file_id as the unique identifier (doesn't change on rename). - For other files, uses filename. - - Args: - connector: Optional connector info dict with type and metadata - filename: The filename (used for non-Google Drive files or as fallback) - search_space_id: The search space ID - - Returns: - Tuple of (primary_hash, legacy_hash or None) - - For Google Drive: (file_id_based_hash, filename_based_hash for migration) - - For other sources: (filename_based_hash, None) - """ - if connector and connector.get("type") == DocumentType.GOOGLE_DRIVE_FILE: - metadata = connector.get("metadata", {}) - file_id = metadata.get("google_drive_file_id") - - if file_id: - # New method: use file_id as unique identifier (doesn't change on rename) - primary_hash = generate_unique_identifier_hash( - DocumentType.GOOGLE_DRIVE_FILE, file_id, search_space_id - ) - # Legacy method: for backward compatibility with existing documents - # that were indexed with filename-based hash - legacy_hash = generate_unique_identifier_hash( - DocumentType.GOOGLE_DRIVE_FILE, filename, search_space_id - ) - return primary_hash, legacy_hash - - # For non-Google Drive files, use filename as before - primary_hash = generate_unique_identifier_hash( - DocumentType.FILE, filename, search_space_id - ) - return primary_hash, None +# --------------------------------------------------------------------------- +# Processing context (bundles parameters shared across handler 
@dataclass
class _ProcessingContext:
    """Bundle of the parameters shared by the file-processing handlers.

    ``enable_summary`` is not a constructor argument: it is derived from
    ``connector`` right after initialisation (see ``__post_init__``).
    """

    session: AsyncSession
    file_path: str
    filename: str
    search_space_id: int
    user_id: str
    task_logger: TaskLoggingService
    log_entry: Log
    # Optional connector info dict; its "enable_summary" key (when present)
    # drives the derived flag below.
    connector: dict | None = None
    # Optional notification; ``None`` means no notification is attached.
    notification: Notification | None = None
    # Derived value, excluded from ``__init__`` and filled in ``__post_init__``.
    enable_summary: bool = field(init=False)

    def __post_init__(self) -> None:
        """Derive ``enable_summary`` from the connector, defaulting to True."""
        if self.connector:
            # A non-empty connector may override the flag; a missing key still
            # means "summaries enabled".
            self.enable_summary = self.connector.get("enable_summary", True)
        else:
            # No connector (``None`` or an empty dict): summaries stay enabled.
            self.enable_summary = True
document_metadata - ) - else: - summary_content = f"File: {file_name}\n\n{file_in_markdown[:4000]}" - summary_embedding = embed_text(summary_content) - - # Process chunks - chunks = await create_document_chunks(file_in_markdown) - - # Update or create document - if existing_document: - # Update existing document - existing_document.title = file_name - existing_document.content = summary_content - existing_document.content_hash = content_hash - existing_document.embedding = summary_embedding - existing_document.document_metadata = { - "FILE_NAME": file_name, - "ETL_SERVICE": "UNSTRUCTURED", - } - await safe_set_chunks(session, existing_document, chunks) - existing_document.source_markdown = file_in_markdown - existing_document.content_needs_reindexing = False - existing_document.updated_at = get_current_timestamp() - existing_document.status = DocumentStatus.ready() - - await session.commit() - await session.refresh(existing_document) - document = existing_document - else: - # Create new document - doc_type = DocumentType.FILE - if connector and connector.get("type") == DocumentType.GOOGLE_DRIVE_FILE: - doc_type = DocumentType.GOOGLE_DRIVE_FILE - - document = Document( - search_space_id=search_space_id, - title=file_name, - document_type=doc_type, - document_metadata={ - "FILE_NAME": file_name, - "ETL_SERVICE": "UNSTRUCTURED", - }, - content=summary_content, - embedding=summary_embedding, - chunks=chunks, - content_hash=content_hash, - unique_identifier_hash=primary_hash, - source_markdown=file_in_markdown, - content_needs_reindexing=False, - updated_at=get_current_timestamp(), - created_by_id=user_id, - connector_id=connector.get("connector_id") if connector else None, - status=DocumentStatus.ready(), - ) - - session.add(document) - await session.commit() - await session.refresh(document) - - return document - except SQLAlchemyError as db_error: - await session.rollback() - if "ix_documents_content_hash" in str(db_error): - logging.warning( - "content_hash 
collision during commit for %s (Unstructured). Skipping.", - file_name, - ) - return None - raise db_error - except Exception as e: - await session.rollback() - raise RuntimeError(f"Failed to process file document: {e!s}") from e +# --------------------------------------------------------------------------- +# Notification helper +# --------------------------------------------------------------------------- -async def add_received_file_document_using_llamacloud( - session: AsyncSession, - file_name: str, - llamacloud_markdown_document: str, - search_space_id: int, - user_id: str, - connector: dict | None = None, - enable_summary: bool = True, -) -> Document | None: - """ - Process and store document content parsed by LlamaCloud. - - Args: - session: Database session - file_name: Name of the processed file - llamacloud_markdown_document: Markdown content from LlamaCloud parsing - search_space_id: ID of the search space - user_id: ID of the user - connector: Optional connector info for Google Drive files - - Returns: - Document object if successful, None if failed - """ - try: - # Combine all markdown documents into one - file_in_markdown = llamacloud_markdown_document - - # Generate unique identifier hash (uses file_id for Google Drive, filename for others) - primary_hash, legacy_hash = get_google_drive_unique_identifier( - connector, file_name, search_space_id - ) - - # Generate content hash - content_hash = generate_content_hash(file_in_markdown, search_space_id) - - # Check if document exists (with migration support for Google Drive and content_hash fallback) - existing_document = await find_existing_document_with_migration( - session, primary_hash, legacy_hash, content_hash - ) - - if existing_document: - # Handle existing document (rename detection, content change check) - should_skip, doc = await handle_existing_document_update( - session, - existing_document, - content_hash, - connector, - file_name, - primary_hash, - ) - if should_skip: - return doc - # 
Content changed - continue to update - - # Get user's long context LLM (needed for both create and update) - user_llm = await get_user_long_context_llm(session, user_id, search_space_id) - if not user_llm: - raise RuntimeError( - f"No long context LLM configured for user {user_id} in search space {search_space_id}" - ) - - # Generate summary with metadata - document_metadata = { - "file_name": file_name, - "etl_service": "LLAMACLOUD", - "document_type": "File Document", - } - if enable_summary: - summary_content, summary_embedding = await generate_document_summary( - file_in_markdown, user_llm, document_metadata - ) - else: - summary_content = f"File: {file_name}\n\n{file_in_markdown[:4000]}" - summary_embedding = embed_text(summary_content) - - # Process chunks - chunks = await create_document_chunks(file_in_markdown) - - # Update or create document - if existing_document: - existing_document.title = file_name - existing_document.content = summary_content - existing_document.content_hash = content_hash - existing_document.embedding = summary_embedding - existing_document.document_metadata = { - "FILE_NAME": file_name, - "ETL_SERVICE": "LLAMACLOUD", - } - await safe_set_chunks(session, existing_document, chunks) - existing_document.source_markdown = file_in_markdown - existing_document.content_needs_reindexing = False - existing_document.updated_at = get_current_timestamp() - existing_document.status = DocumentStatus.ready() - - await session.commit() - await session.refresh(existing_document) - document = existing_document - else: - doc_type = DocumentType.FILE - if connector and connector.get("type") == DocumentType.GOOGLE_DRIVE_FILE: - doc_type = DocumentType.GOOGLE_DRIVE_FILE - - document = Document( - search_space_id=search_space_id, - title=file_name, - document_type=doc_type, - document_metadata={ - "FILE_NAME": file_name, - "ETL_SERVICE": "LLAMACLOUD", - }, - content=summary_content, - embedding=summary_embedding, - chunks=chunks, - 
content_hash=content_hash, - unique_identifier_hash=primary_hash, - source_markdown=file_in_markdown, - content_needs_reindexing=False, - updated_at=get_current_timestamp(), - created_by_id=user_id, - connector_id=connector.get("connector_id") if connector else None, - status=DocumentStatus.ready(), - ) - - session.add(document) - await session.commit() - await session.refresh(document) - - return document - except SQLAlchemyError as db_error: - await session.rollback() - if "ix_documents_content_hash" in str(db_error): - logging.warning( - "content_hash collision during commit for %s (LlamaCloud). Skipping.", - file_name, - ) - return None - raise db_error - except Exception as e: - await session.rollback() - raise RuntimeError( - f"Failed to process file document using LlamaCloud: {e!s}" - ) from e - - -async def add_received_file_document_using_docling( - session: AsyncSession, - file_name: str, - docling_markdown_document: str, - search_space_id: int, - user_id: str, - connector: dict | None = None, - enable_summary: bool = True, -) -> Document | None: - """ - Process and store document content parsed by Docling. 
- - Args: - session: Database session - file_name: Name of the processed file - docling_markdown_document: Markdown content from Docling parsing - search_space_id: ID of the search space - user_id: ID of the user - connector: Optional connector info for Google Drive files - - Returns: - Document object if successful, None if failed - """ - try: - file_in_markdown = docling_markdown_document - - # Generate unique identifier hash (uses file_id for Google Drive, filename for others) - primary_hash, legacy_hash = get_google_drive_unique_identifier( - connector, file_name, search_space_id - ) - - # Generate content hash - content_hash = generate_content_hash(file_in_markdown, search_space_id) - - # Check if document exists (with migration support for Google Drive and content_hash fallback) - existing_document = await find_existing_document_with_migration( - session, primary_hash, legacy_hash, content_hash - ) - - if existing_document: - # Handle existing document (rename detection, content change check) - should_skip, doc = await handle_existing_document_update( - session, - existing_document, - content_hash, - connector, - file_name, - primary_hash, - ) - if should_skip: - return doc - # Content changed - continue to update - - # Get user's long context LLM (needed for both create and update) - user_llm = await get_user_long_context_llm(session, user_id, search_space_id) - if not user_llm: - raise RuntimeError( - f"No long context LLM configured for user {user_id} in search_space {search_space_id}" - ) - - if enable_summary: - from app.services.docling_service import create_docling_service - - docling_service = create_docling_service() - - summary_content = await docling_service.process_large_document_summary( - content=file_in_markdown, llm=user_llm, document_title=file_name - ) - - document_metadata = { - "file_name": file_name, - "etl_service": "DOCLING", - "document_type": "File Document", - } - metadata_parts = ["# DOCUMENT METADATA"] - for key, value in 
document_metadata.items(): - if value: - formatted_key = key.replace("_", " ").title() - metadata_parts.append(f"**{formatted_key}:** {value}") - - metadata_section = "\n".join(metadata_parts) - enhanced_summary_content = ( - f"{metadata_section}\n\n# DOCUMENT SUMMARY\n\n{summary_content}" - ) - else: - enhanced_summary_content = f"File: {file_name}\n\n{file_in_markdown[:4000]}" - - summary_embedding = embed_text(enhanced_summary_content) - - # Process chunks - chunks = await create_document_chunks(file_in_markdown) - - # Update or create document - if existing_document: - # Update existing document - existing_document.title = file_name - existing_document.content = enhanced_summary_content - existing_document.content_hash = content_hash - existing_document.embedding = summary_embedding - existing_document.document_metadata = { - "FILE_NAME": file_name, - "ETL_SERVICE": "DOCLING", - } - await safe_set_chunks(session, existing_document, chunks) - existing_document.source_markdown = file_in_markdown - existing_document.content_needs_reindexing = False - existing_document.updated_at = get_current_timestamp() - existing_document.status = DocumentStatus.ready() # Mark as ready - - await session.commit() - await session.refresh(existing_document) - document = existing_document - else: - # Create new document - # Determine document type based on connector - doc_type = DocumentType.FILE - if connector and connector.get("type") == DocumentType.GOOGLE_DRIVE_FILE: - doc_type = DocumentType.GOOGLE_DRIVE_FILE - - document = Document( - search_space_id=search_space_id, - title=file_name, - document_type=doc_type, - document_metadata={ - "FILE_NAME": file_name, - "ETL_SERVICE": "DOCLING", - }, - content=enhanced_summary_content, - embedding=summary_embedding, - chunks=chunks, - content_hash=content_hash, - unique_identifier_hash=primary_hash, - source_markdown=file_in_markdown, - content_needs_reindexing=False, - updated_at=get_current_timestamp(), - created_by_id=user_id, - 
connector_id=connector.get("connector_id") if connector else None, - status=DocumentStatus.ready(), # Mark as ready - ) - - session.add(document) - await session.commit() - await session.refresh(document) - - return document - except SQLAlchemyError as db_error: - await session.rollback() - if "ix_documents_content_hash" in str(db_error): - logging.warning( - "content_hash collision during commit for %s (Docling). Skipping.", - file_name, - ) - return None - raise db_error - except Exception as e: - await session.rollback() - raise RuntimeError( - f"Failed to process file document using Docling: {e!s}" - ) from e - - -async def _update_document_from_connector( - document: Document | None, connector: dict | None, session: AsyncSession +async def _notify( + ctx: _ProcessingContext, + stage: str, + stage_message: str | None = None, + **kwargs, ) -> None: - """Helper to update document type, metadata, and connector_id from connector info.""" - if document and connector: - if "type" in connector: - document.document_type = connector["type"] - if "metadata" in connector: - # Merge with existing document_metadata (the actual column name) - if not document.document_metadata: - document.document_metadata = connector["metadata"] - else: - # Expand existing metadata with connector metadata - merged = {**document.document_metadata, **connector["metadata"]} - document.document_metadata = merged - # Set connector_id if provided for de-indexing support - if "connector_id" in connector: - document.connector_id = connector["connector_id"] - await session.commit() + """Send a processing-progress notification if one is attached.""" + if not ctx.notification: + return + await NotificationService.document_processing.notify_processing_progress( + ctx.session, + ctx.notification, + stage=stage, + stage_message=stage_message, + **kwargs, + ) + + +# --------------------------------------------------------------------------- +# Page-limit helpers +# 
--------------------------------------------------------------------------- + + +def _estimate_pages_safe(page_limit_service, file_path: str) -> int: + """Estimate page count with a file-size fallback.""" + try: + return page_limit_service.estimate_pages_before_processing(file_path) + except Exception: + file_size = os.path.getsize(file_path) + return max(1, file_size // (80 * 1024)) + + +async def _log_page_divergence( + task_logger: TaskLoggingService, + log_entry: Log, + filename: str, + estimated: int, + actual: int, + final: int, +) -> None: + """Log a warning when the actual page count far exceeds the pre-estimate.""" + if actual > estimated * 1.5: + await task_logger.log_task_progress( + log_entry, + f"Actual page count higher than estimate: {filename}", + { + "estimated_before": estimated, + "actual_pages": actual, + "using_count": final, + }, + ) + + +# =================================================================== +# Handlers for process_file_in_background (legacy / connector path) +# =================================================================== + + +async def _process_markdown_upload(ctx: _ProcessingContext) -> Document | None: + """Read a markdown / text file and create or update a document.""" + await _notify(ctx, "parsing", "Reading file") + await ctx.task_logger.log_task_progress( + ctx.log_entry, + f"Processing markdown/text file: {ctx.filename}", + {"file_type": "markdown", "processing_stage": "reading_file"}, + ) + + with open(ctx.file_path, encoding="utf-8") as f: + markdown_content = f.read() + + with contextlib.suppress(Exception): + os.unlink(ctx.file_path) + + await _notify(ctx, "chunking") + await ctx.task_logger.log_task_progress( + ctx.log_entry, + f"Creating document from markdown content: {ctx.filename}", + { + "processing_stage": "creating_document", + "content_length": len(markdown_content), + }, + ) + + result = await add_received_markdown_file_document( + ctx.session, + ctx.filename, + markdown_content, + 
ctx.search_space_id, + ctx.user_id, + ctx.connector, + ) + if ctx.connector: + await update_document_from_connector(result, ctx.connector, ctx.session) + + if result: + await ctx.task_logger.log_task_success( + ctx.log_entry, + f"Successfully processed markdown file: {ctx.filename}", + { + "document_id": result.id, + "content_hash": result.content_hash, + "file_type": "markdown", + }, + ) + else: + await ctx.task_logger.log_task_success( + ctx.log_entry, + f"Markdown file already exists (duplicate): {ctx.filename}", + {"duplicate_detected": True, "file_type": "markdown"}, + ) + return result + + +async def _process_direct_convert_upload(ctx: _ProcessingContext) -> Document | None: + """Convert a text-based file (csv/tsv/html) to markdown without ETL.""" + await _notify(ctx, "parsing", "Converting file") + await ctx.task_logger.log_task_progress( + ctx.log_entry, + f"Direct-converting file to markdown: {ctx.filename}", + {"file_type": "direct_convert", "processing_stage": "converting"}, + ) + + markdown_content = convert_file_directly(ctx.file_path, ctx.filename) + + with contextlib.suppress(Exception): + os.unlink(ctx.file_path) + + await _notify(ctx, "chunking") + await ctx.task_logger.log_task_progress( + ctx.log_entry, + f"Creating document from converted content: {ctx.filename}", + { + "processing_stage": "creating_document", + "content_length": len(markdown_content), + }, + ) + + result = await add_received_markdown_file_document( + ctx.session, + ctx.filename, + markdown_content, + ctx.search_space_id, + ctx.user_id, + ctx.connector, + ) + if ctx.connector: + await update_document_from_connector(result, ctx.connector, ctx.session) + + if result: + await ctx.task_logger.log_task_success( + ctx.log_entry, + f"Successfully direct-converted file: {ctx.filename}", + { + "document_id": result.id, + "content_hash": result.content_hash, + "file_type": "direct_convert", + }, + ) + else: + await ctx.task_logger.log_task_success( + ctx.log_entry, + f"Direct-converted 
file already exists (duplicate): {ctx.filename}", + {"duplicate_detected": True, "file_type": "direct_convert"}, + ) + return result + + +async def _process_audio_upload(ctx: _ProcessingContext) -> Document | None: + """Transcribe an audio file and create or update a document.""" + await _notify(ctx, "parsing", "Transcribing audio") + await ctx.task_logger.log_task_progress( + ctx.log_entry, + f"Processing audio file for transcription: {ctx.filename}", + {"file_type": "audio", "processing_stage": "starting_transcription"}, + ) + + stt_service_type = ( + "local" + if app_config.STT_SERVICE and app_config.STT_SERVICE.startswith("local/") + else "external" + ) + + if stt_service_type == "local": + from app.services.stt_service import stt_service + + try: + stt_result = stt_service.transcribe_file(ctx.file_path) + transcribed_text = stt_result.get("text", "") + if not transcribed_text: + raise ValueError("Transcription returned empty text") + transcribed_text = ( + f"# Transcription of {ctx.filename}\n\n{transcribed_text}" + ) + except Exception as e: + raise HTTPException( + status_code=422, + detail=f"Failed to transcribe audio file {ctx.filename}: {e!s}", + ) from e + + await ctx.task_logger.log_task_progress( + ctx.log_entry, + f"Local STT transcription completed: {ctx.filename}", + { + "processing_stage": "local_transcription_complete", + "language": stt_result.get("language"), + "confidence": stt_result.get("language_probability"), + "duration": stt_result.get("duration"), + }, + ) + else: + from litellm import atranscription + + with open(ctx.file_path, "rb") as audio_file: + transcription_kwargs: dict = { + "model": app_config.STT_SERVICE, + "file": audio_file, + "api_key": app_config.STT_SERVICE_API_KEY, + } + if app_config.STT_SERVICE_API_BASE: + transcription_kwargs["api_base"] = app_config.STT_SERVICE_API_BASE + + transcription_response = await atranscription(**transcription_kwargs) + transcribed_text = transcription_response.get("text", "") + if not 
transcribed_text: + raise ValueError("Transcription returned empty text") + + transcribed_text = f"# Transcription of {ctx.filename}\n\n{transcribed_text}" + + await ctx.task_logger.log_task_progress( + ctx.log_entry, + f"Transcription completed, creating document: {ctx.filename}", + { + "processing_stage": "transcription_complete", + "transcript_length": len(transcribed_text), + }, + ) + + await _notify(ctx, "chunking") + + with contextlib.suppress(Exception): + os.unlink(ctx.file_path) + + result = await add_received_markdown_file_document( + ctx.session, + ctx.filename, + transcribed_text, + ctx.search_space_id, + ctx.user_id, + ctx.connector, + ) + if ctx.connector: + await update_document_from_connector(result, ctx.connector, ctx.session) + + if result: + await ctx.task_logger.log_task_success( + ctx.log_entry, + f"Successfully transcribed and processed audio file: {ctx.filename}", + { + "document_id": result.id, + "content_hash": result.content_hash, + "file_type": "audio", + "transcript_length": len(transcribed_text), + "stt_service": stt_service_type, + }, + ) + else: + await ctx.task_logger.log_task_success( + ctx.log_entry, + f"Audio file transcript already exists (duplicate): {ctx.filename}", + {"duplicate_detected": True, "file_type": "audio"}, + ) + return result + + +# --------------------------------------------------------------------------- +# Document file processing (ETL service dispatch) +# --------------------------------------------------------------------------- + + +async def _etl_unstructured( + ctx: _ProcessingContext, + page_limit_service, + estimated_pages: int, +) -> Document | None: + """Parse and save via the Unstructured ETL service.""" + await _notify(ctx, "parsing", "Extracting content") + await ctx.task_logger.log_task_progress( + ctx.log_entry, + f"Processing file with Unstructured ETL: {ctx.filename}", + { + "file_type": "document", + "etl_service": "UNSTRUCTURED", + "processing_stage": "loading", + }, + ) + + docs = await 
parse_with_unstructured(ctx.file_path) + + await _notify(ctx, "chunking", chunks_count=len(docs)) + await ctx.task_logger.log_task_progress( + ctx.log_entry, + f"Unstructured ETL completed, creating document: {ctx.filename}", + {"processing_stage": "etl_complete", "elements_count": len(docs)}, + ) + + actual_pages = page_limit_service.estimate_pages_from_elements(docs) + final_pages = max(estimated_pages, actual_pages) + await _log_page_divergence( + ctx.task_logger, + ctx.log_entry, + ctx.filename, + estimated_pages, + actual_pages, + final_pages, + ) + + with contextlib.suppress(Exception): + os.unlink(ctx.file_path) + + result = await add_received_file_document_using_unstructured( + ctx.session, + ctx.filename, + docs, + ctx.search_space_id, + ctx.user_id, + ctx.connector, + enable_summary=ctx.enable_summary, + ) + if ctx.connector: + await update_document_from_connector(result, ctx.connector, ctx.session) + + if result: + await page_limit_service.update_page_usage( + ctx.user_id, final_pages, allow_exceed=True + ) + await ctx.task_logger.log_task_success( + ctx.log_entry, + f"Successfully processed file with Unstructured: {ctx.filename}", + { + "document_id": result.id, + "content_hash": result.content_hash, + "file_type": "document", + "etl_service": "UNSTRUCTURED", + "pages_processed": final_pages, + }, + ) + else: + await ctx.task_logger.log_task_success( + ctx.log_entry, + f"Document already exists (duplicate): {ctx.filename}", + { + "duplicate_detected": True, + "file_type": "document", + "etl_service": "UNSTRUCTURED", + }, + ) + return result + + +async def _etl_llamacloud( + ctx: _ProcessingContext, + page_limit_service, + estimated_pages: int, +) -> Document | None: + """Parse and save via the LlamaCloud ETL service.""" + await _notify(ctx, "parsing", "Extracting content") + await ctx.task_logger.log_task_progress( + ctx.log_entry, + f"Processing file with LlamaCloud ETL: {ctx.filename}", + { + "file_type": "document", + "etl_service": "LLAMACLOUD", + 
"processing_stage": "parsing", + "estimated_pages": estimated_pages, + }, + ) + + raw_result = await parse_with_llamacloud_retry( + file_path=ctx.file_path, + estimated_pages=estimated_pages, + task_logger=ctx.task_logger, + log_entry=ctx.log_entry, + ) + + with contextlib.suppress(Exception): + os.unlink(ctx.file_path) + + markdown_documents = await raw_result.aget_markdown_documents(split_by_page=False) + + await _notify(ctx, "chunking", chunks_count=len(markdown_documents)) + await ctx.task_logger.log_task_progress( + ctx.log_entry, + f"LlamaCloud parsing completed, creating documents: {ctx.filename}", + { + "processing_stage": "parsing_complete", + "documents_count": len(markdown_documents), + }, + ) + + if not markdown_documents: + await ctx.task_logger.log_task_failure( + ctx.log_entry, + f"LlamaCloud parsing returned no documents: {ctx.filename}", + "ETL service returned empty document list", + {"error_type": "EmptyDocumentList", "etl_service": "LLAMACLOUD"}, + ) + raise ValueError(f"LlamaCloud parsing returned no documents for {ctx.filename}") + + actual_pages = page_limit_service.estimate_pages_from_markdown(markdown_documents) + final_pages = max(estimated_pages, actual_pages) + await _log_page_divergence( + ctx.task_logger, + ctx.log_entry, + ctx.filename, + estimated_pages, + actual_pages, + final_pages, + ) + + any_created = False + last_doc: Document | None = None + + for doc in markdown_documents: + doc_result = await add_received_file_document_using_llamacloud( + ctx.session, + ctx.filename, + llamacloud_markdown_document=doc.text, + search_space_id=ctx.search_space_id, + user_id=ctx.user_id, + connector=ctx.connector, + enable_summary=ctx.enable_summary, + ) + if doc_result: + any_created = True + last_doc = doc_result + + if any_created: + await page_limit_service.update_page_usage( + ctx.user_id, final_pages, allow_exceed=True + ) + if ctx.connector: + await update_document_from_connector(last_doc, ctx.connector, ctx.session) + await 
ctx.task_logger.log_task_success( + ctx.log_entry, + f"Successfully processed file with LlamaCloud: {ctx.filename}", + { + "document_id": last_doc.id, + "content_hash": last_doc.content_hash, + "file_type": "document", + "etl_service": "LLAMACLOUD", + "pages_processed": final_pages, + "documents_count": len(markdown_documents), + }, + ) + return last_doc + + await ctx.task_logger.log_task_success( + ctx.log_entry, + f"Document already exists (duplicate): {ctx.filename}", + { + "duplicate_detected": True, + "file_type": "document", + "etl_service": "LLAMACLOUD", + "documents_count": len(markdown_documents), + }, + ) + return None + + +async def _etl_docling( + ctx: _ProcessingContext, + page_limit_service, + estimated_pages: int, +) -> Document | None: + """Parse and save via the Docling ETL service.""" + await _notify(ctx, "parsing", "Extracting content") + await ctx.task_logger.log_task_progress( + ctx.log_entry, + f"Processing file with Docling ETL: {ctx.filename}", + { + "file_type": "document", + "etl_service": "DOCLING", + "processing_stage": "parsing", + }, + ) + + content = await parse_with_docling(ctx.file_path, ctx.filename) + + with contextlib.suppress(Exception): + os.unlink(ctx.file_path) + + await ctx.task_logger.log_task_progress( + ctx.log_entry, + f"Docling parsing completed, creating document: {ctx.filename}", + {"processing_stage": "parsing_complete", "content_length": len(content)}, + ) + + actual_pages = page_limit_service.estimate_pages_from_content_length(len(content)) + final_pages = max(estimated_pages, actual_pages) + await _log_page_divergence( + ctx.task_logger, + ctx.log_entry, + ctx.filename, + estimated_pages, + actual_pages, + final_pages, + ) + + await _notify(ctx, "chunking") + + result = await add_received_file_document_using_docling( + ctx.session, + ctx.filename, + docling_markdown_document=content, + search_space_id=ctx.search_space_id, + user_id=ctx.user_id, + connector=ctx.connector, + enable_summary=ctx.enable_summary, + ) + 
+ if result: + await page_limit_service.update_page_usage( + ctx.user_id, final_pages, allow_exceed=True + ) + if ctx.connector: + await update_document_from_connector(result, ctx.connector, ctx.session) + await ctx.task_logger.log_task_success( + ctx.log_entry, + f"Successfully processed file with Docling: {ctx.filename}", + { + "document_id": result.id, + "content_hash": result.content_hash, + "file_type": "document", + "etl_service": "DOCLING", + "pages_processed": final_pages, + }, + ) + else: + await ctx.task_logger.log_task_success( + ctx.log_entry, + f"Document already exists (duplicate): {ctx.filename}", + { + "duplicate_detected": True, + "file_type": "document", + "etl_service": "DOCLING", + }, + ) + return result + + +async def _process_document_upload(ctx: _ProcessingContext) -> Document | None: + """Route a document file to the configured ETL service.""" + from app.services.page_limit_service import PageLimitExceededError, PageLimitService + + page_limit_service = PageLimitService(ctx.session) + estimated_pages = _estimate_pages_safe(page_limit_service, ctx.file_path) + + await ctx.task_logger.log_task_progress( + ctx.log_entry, + f"Estimated {estimated_pages} pages for file: {ctx.filename}", + {"estimated_pages": estimated_pages, "file_type": "document"}, + ) + + try: + await page_limit_service.check_page_limit(ctx.user_id, estimated_pages) + except PageLimitExceededError as e: + await ctx.task_logger.log_task_failure( + ctx.log_entry, + f"Page limit exceeded before processing: {ctx.filename}", + str(e), + { + "error_type": "PageLimitExceeded", + "pages_used": e.pages_used, + "pages_limit": e.pages_limit, + "estimated_pages": estimated_pages, + }, + ) + with contextlib.suppress(Exception): + os.unlink(ctx.file_path) + raise HTTPException(status_code=403, detail=str(e)) from e + + etl_dispatch = { + "UNSTRUCTURED": _etl_unstructured, + "LLAMACLOUD": _etl_llamacloud, + "DOCLING": _etl_docling, + } + handler = etl_dispatch.get(app_config.ETL_SERVICE) + 
if handler is None: + raise RuntimeError(f"Unknown ETL_SERVICE: {app_config.ETL_SERVICE}") + + return await handler(ctx, page_limit_service, estimated_pages) + + +# =================================================================== +# Public orchestrators +# =================================================================== async def process_file_in_background( @@ -910,726 +690,35 @@ async def process_file_in_background( session: AsyncSession, task_logger: TaskLoggingService, log_entry: Log, - connector: dict - | None = None, # Optional: {"type": "GOOGLE_DRIVE_FILE", "metadata": {...}} - notification: Notification - | None = None, # Optional notification for progress updates + connector: dict | None = None, + notification: Notification | None = None, ) -> Document | None: + ctx = _ProcessingContext( + session=session, + file_path=file_path, + filename=filename, + search_space_id=search_space_id, + user_id=user_id, + task_logger=task_logger, + log_entry=log_entry, + connector=connector, + notification=notification, + ) + try: - # Check if the file is a markdown or text file - if filename.lower().endswith((".md", ".markdown", ".txt")): - # Update notification: parsing stage - if notification: - await ( - NotificationService.document_processing.notify_processing_progress( - session, - notification, - stage="parsing", - stage_message="Reading file", - ) - ) + category = classify_file(filename) - await task_logger.log_task_progress( - log_entry, - f"Processing markdown/text file: {filename}", - {"file_type": "markdown", "processing_stage": "reading_file"}, - ) + if category == FileCategory.MARKDOWN: + return await _process_markdown_upload(ctx) + if category == FileCategory.DIRECT_CONVERT: + return await _process_direct_convert_upload(ctx) + if category == FileCategory.AUDIO: + return await _process_audio_upload(ctx) + return await _process_document_upload(ctx) - # For markdown files, read the content directly - with open(file_path, encoding="utf-8") as f: - 
markdown_content = f.read() - - # Clean up the temp file - import os - - try: - os.unlink(file_path) - except Exception as e: - print("Error deleting temp file", e) - pass - - # Update notification: chunking stage - if notification: - await ( - NotificationService.document_processing.notify_processing_progress( - session, notification, stage="chunking" - ) - ) - - await task_logger.log_task_progress( - log_entry, - f"Creating document from markdown content: {filename}", - { - "processing_stage": "creating_document", - "content_length": len(markdown_content), - }, - ) - - # Process markdown directly through specialized function - result = await add_received_markdown_file_document( - session, filename, markdown_content, search_space_id, user_id, connector - ) - - if connector: - await _update_document_from_connector(result, connector, session) - - if result: - await task_logger.log_task_success( - log_entry, - f"Successfully processed markdown file: {filename}", - { - "document_id": result.id, - "content_hash": result.content_hash, - "file_type": "markdown", - }, - ) - return result - else: - await task_logger.log_task_success( - log_entry, - f"Markdown file already exists (duplicate): {filename}", - {"duplicate_detected": True, "file_type": "markdown"}, - ) - return None - - # Check if the file is an audio file - elif filename.lower().endswith( - (".mp3", ".mp4", ".mpeg", ".mpga", ".m4a", ".wav", ".webm") - ): - # Update notification: parsing stage (transcription) - if notification: - await ( - NotificationService.document_processing.notify_processing_progress( - session, - notification, - stage="parsing", - stage_message="Transcribing audio", - ) - ) - - await task_logger.log_task_progress( - log_entry, - f"Processing audio file for transcription: {filename}", - {"file_type": "audio", "processing_stage": "starting_transcription"}, - ) - - # Determine STT service type - stt_service_type = ( - "local" - if app_config.STT_SERVICE - and 
app_config.STT_SERVICE.startswith("local/") - else "external" - ) - - # Check if using local STT service - if stt_service_type == "local": - # Use local Faster-Whisper for transcription - from app.services.stt_service import stt_service - - try: - result = stt_service.transcribe_file(file_path) - transcribed_text = result.get("text", "") - - if not transcribed_text: - raise ValueError("Transcription returned empty text") - - # Add metadata about the transcription - transcribed_text = ( - f"# Transcription of {filename}\n\n{transcribed_text}" - ) - except Exception as e: - raise HTTPException( - status_code=422, - detail=f"Failed to transcribe audio file {filename}: {e!s}", - ) from e - - await task_logger.log_task_progress( - log_entry, - f"Local STT transcription completed: {filename}", - { - "processing_stage": "local_transcription_complete", - "language": result.get("language"), - "confidence": result.get("language_probability"), - "duration": result.get("duration"), - }, - ) - else: - # Use LiteLLM for audio transcription - with open(file_path, "rb") as audio_file: - transcription_kwargs = { - "model": app_config.STT_SERVICE, - "file": audio_file, - "api_key": app_config.STT_SERVICE_API_KEY, - } - if app_config.STT_SERVICE_API_BASE: - transcription_kwargs["api_base"] = ( - app_config.STT_SERVICE_API_BASE - ) - - transcription_response = await atranscription( - **transcription_kwargs - ) - - # Extract the transcribed text - transcribed_text = transcription_response.get("text", "") - - if not transcribed_text: - raise ValueError("Transcription returned empty text") - - # Add metadata about the transcription - transcribed_text = ( - f"# Transcription of {filename}\n\n{transcribed_text}" - ) - - await task_logger.log_task_progress( - log_entry, - f"Transcription completed, creating document: {filename}", - { - "processing_stage": "transcription_complete", - "transcript_length": len(transcribed_text), - }, - ) - - # Update notification: chunking stage - if 
notification: - await ( - NotificationService.document_processing.notify_processing_progress( - session, notification, stage="chunking" - ) - ) - - # Clean up the temp file - try: - os.unlink(file_path) - except Exception as e: - print("Error deleting temp file", e) - pass - - # Process transcription as markdown document - result = await add_received_markdown_file_document( - session, filename, transcribed_text, search_space_id, user_id, connector - ) - - if connector: - await _update_document_from_connector(result, connector, session) - - if result: - await task_logger.log_task_success( - log_entry, - f"Successfully transcribed and processed audio file: {filename}", - { - "document_id": result.id, - "content_hash": result.content_hash, - "file_type": "audio", - "transcript_length": len(transcribed_text), - "stt_service": stt_service_type, - }, - ) - return result - else: - await task_logger.log_task_success( - log_entry, - f"Audio file transcript already exists (duplicate): {filename}", - {"duplicate_detected": True, "file_type": "audio"}, - ) - return None - - else: - # Import page limit service - from app.services.page_limit_service import ( - PageLimitExceededError, - PageLimitService, - ) - - # Initialize page limit service - page_limit_service = PageLimitService(session) - - # CRITICAL: Estimate page count BEFORE making expensive ETL API calls - # This prevents users from incurring costs on files that would exceed their limit - try: - estimated_pages_before = ( - page_limit_service.estimate_pages_before_processing(file_path) - ) - except Exception: - # If estimation fails, use a conservative estimate based on file size - import os - - file_size = os.path.getsize(file_path) - estimated_pages_before = max( - 1, file_size // (80 * 1024) - ) # ~80KB per page - - await task_logger.log_task_progress( - log_entry, - f"Estimated {estimated_pages_before} pages for file: {filename}", - { - "estimated_pages": estimated_pages_before, - "file_type": "document", - }, - ) - 
- # Check page limit BEFORE calling ETL service to avoid unnecessary costs - try: - await page_limit_service.check_page_limit( - user_id, estimated_pages_before - ) - except PageLimitExceededError as e: - await task_logger.log_task_failure( - log_entry, - f"Page limit exceeded before processing: {filename}", - str(e), - { - "error_type": "PageLimitExceeded", - "pages_used": e.pages_used, - "pages_limit": e.pages_limit, - "estimated_pages": estimated_pages_before, - }, - ) - # Clean up the temp file - import os - - with contextlib.suppress(Exception): - os.unlink(file_path) - - raise HTTPException( - status_code=403, - detail=str(e), - ) from e - - if app_config.ETL_SERVICE == "UNSTRUCTURED": - # Update notification: parsing stage - if notification: - await NotificationService.document_processing.notify_processing_progress( - session, - notification, - stage="parsing", - stage_message="Extracting content", - ) - - await task_logger.log_task_progress( - log_entry, - f"Processing file with Unstructured ETL: {filename}", - { - "file_type": "document", - "etl_service": "UNSTRUCTURED", - "processing_stage": "loading", - }, - ) - - from langchain_unstructured import UnstructuredLoader - - # Process the file - loader = UnstructuredLoader( - file_path, - mode="elements", - post_processors=[], - languages=["eng"], - include_orig_elements=False, - include_metadata=False, - strategy="auto", - ) - - docs = await loader.aload() - - # Update notification: chunking stage - if notification: - await NotificationService.document_processing.notify_processing_progress( - session, notification, stage="chunking", chunks_count=len(docs) - ) - - await task_logger.log_task_progress( - log_entry, - f"Unstructured ETL completed, creating document: {filename}", - {"processing_stage": "etl_complete", "elements_count": len(docs)}, - ) - - # Verify actual page count from parsed documents - actual_pages = page_limit_service.estimate_pages_from_elements(docs) - - # Use the higher of the two 
estimates for safety (in case pre-estimate was too low) - final_page_count = max(estimated_pages_before, actual_pages) - - # If actual is significantly higher than estimate, log a warning - if actual_pages > estimated_pages_before * 1.5: - await task_logger.log_task_progress( - log_entry, - f"Actual page count higher than estimate: {filename}", - { - "estimated_before": estimated_pages_before, - "actual_pages": actual_pages, - "using_count": final_page_count, - }, - ) - - # Clean up the temp file - import os - - try: - os.unlink(file_path) - except Exception as e: - print("Error deleting temp file", e) - pass - - enable_summary = ( - connector.get("enable_summary", True) if connector else True - ) - result = await add_received_file_document_using_unstructured( - session, - filename, - docs, - search_space_id, - user_id, - connector, - enable_summary=enable_summary, - ) - - if connector: - await _update_document_from_connector(result, connector, session) - - if result: - # Update page usage after successful processing - # allow_exceed=True because document was already created after passing initial check - await page_limit_service.update_page_usage( - user_id, final_page_count, allow_exceed=True - ) - - await task_logger.log_task_success( - log_entry, - f"Successfully processed file with Unstructured: {filename}", - { - "document_id": result.id, - "content_hash": result.content_hash, - "file_type": "document", - "etl_service": "UNSTRUCTURED", - "pages_processed": final_page_count, - }, - ) - return result - else: - await task_logger.log_task_success( - log_entry, - f"Document already exists (duplicate): {filename}", - { - "duplicate_detected": True, - "file_type": "document", - "etl_service": "UNSTRUCTURED", - }, - ) - return None - - elif app_config.ETL_SERVICE == "LLAMACLOUD": - # Update notification: parsing stage - if notification: - await NotificationService.document_processing.notify_processing_progress( - session, - notification, - stage="parsing", - 
stage_message="Extracting content", - ) - - await task_logger.log_task_progress( - log_entry, - f"Processing file with LlamaCloud ETL: {filename}", - { - "file_type": "document", - "etl_service": "LLAMACLOUD", - "processing_stage": "parsing", - "estimated_pages": estimated_pages_before, - }, - ) - - # Parse file with retry logic for SSL/connection errors (common with large files) - result = await parse_with_llamacloud_retry( - file_path=file_path, - estimated_pages=estimated_pages_before, - task_logger=task_logger, - log_entry=log_entry, - ) - - # Clean up the temp file - import os - - try: - os.unlink(file_path) - except Exception as e: - print("Error deleting temp file", e) - pass - - # Get markdown documents from the result - markdown_documents = await result.aget_markdown_documents( - split_by_page=False - ) - - # Update notification: chunking stage - if notification: - await NotificationService.document_processing.notify_processing_progress( - session, - notification, - stage="chunking", - chunks_count=len(markdown_documents), - ) - - await task_logger.log_task_progress( - log_entry, - f"LlamaCloud parsing completed, creating documents: {filename}", - { - "processing_stage": "parsing_complete", - "documents_count": len(markdown_documents), - }, - ) - - # Check if LlamaCloud returned any documents - if not markdown_documents or len(markdown_documents) == 0: - await task_logger.log_task_failure( - log_entry, - f"LlamaCloud parsing returned no documents: {filename}", - "ETL service returned empty document list", - { - "error_type": "EmptyDocumentList", - "etl_service": "LLAMACLOUD", - }, - ) - raise ValueError( - f"LlamaCloud parsing returned no documents for {filename}" - ) - - # Verify actual page count from parsed markdown documents - actual_pages = page_limit_service.estimate_pages_from_markdown( - markdown_documents - ) - - # Use the higher of the two estimates for safety (in case pre-estimate was too low) - final_page_count = max(estimated_pages_before, 
actual_pages) - - # If actual is significantly higher than estimate, log a warning - if actual_pages > estimated_pages_before * 1.5: - await task_logger.log_task_progress( - log_entry, - f"Actual page count higher than estimate: {filename}", - { - "estimated_before": estimated_pages_before, - "actual_pages": actual_pages, - "using_count": final_page_count, - }, - ) - - # Track if any document was successfully created (not a duplicate) - any_doc_created = False - last_created_doc = None - - for doc in markdown_documents: - # Extract text content from the markdown documents - markdown_content = doc.text - - enable_summary = ( - connector.get("enable_summary", True) if connector else True - ) - doc_result = await add_received_file_document_using_llamacloud( - session, - filename, - llamacloud_markdown_document=markdown_content, - search_space_id=search_space_id, - user_id=user_id, - connector=connector, - enable_summary=enable_summary, - ) - - # Track if this document was successfully created - if doc_result: - any_doc_created = True - last_created_doc = doc_result - - # Update page usage once after processing all documents - # Only update if at least one document was created (not all duplicates) - if any_doc_created: - # Update page usage after successful processing - # allow_exceed=True because document was already created after passing initial check - await page_limit_service.update_page_usage( - user_id, final_page_count, allow_exceed=True - ) - - if connector: - await _update_document_from_connector( - last_created_doc, connector, session - ) - - await task_logger.log_task_success( - log_entry, - f"Successfully processed file with LlamaCloud: {filename}", - { - "document_id": last_created_doc.id, - "content_hash": last_created_doc.content_hash, - "file_type": "document", - "etl_service": "LLAMACLOUD", - "pages_processed": final_page_count, - "documents_count": len(markdown_documents), - }, - ) - return last_created_doc - else: - # All documents were duplicates 
(markdown_documents was not empty, but all returned None) - await task_logger.log_task_success( - log_entry, - f"Document already exists (duplicate): {filename}", - { - "duplicate_detected": True, - "file_type": "document", - "etl_service": "LLAMACLOUD", - "documents_count": len(markdown_documents), - }, - ) - return None - - elif app_config.ETL_SERVICE == "DOCLING": - # Update notification: parsing stage - if notification: - await NotificationService.document_processing.notify_processing_progress( - session, - notification, - stage="parsing", - stage_message="Extracting content", - ) - - await task_logger.log_task_progress( - log_entry, - f"Processing file with Docling ETL: {filename}", - { - "file_type": "document", - "etl_service": "DOCLING", - "processing_stage": "parsing", - }, - ) - - # Use Docling service for document processing - from app.services.docling_service import create_docling_service - - # Create Docling service - docling_service = create_docling_service() - - # Suppress pdfminer warnings that can cause processing to hang - # These warnings are harmless but can spam logs and potentially halt processing - # Suppress both Python warnings and logging warnings from pdfminer - pdfminer_logger = getLogger("pdfminer") - original_level = pdfminer_logger.level - - with warnings.catch_warnings(): - warnings.filterwarnings( - "ignore", category=UserWarning, module="pdfminer" - ) - warnings.filterwarnings( - "ignore", - message=".*Cannot set gray non-stroke color.*", - ) - warnings.filterwarnings("ignore", message=".*invalid float value.*") - - # Temporarily suppress pdfminer logging warnings - pdfminer_logger.setLevel(ERROR) - - try: - # Process the document - result = await docling_service.process_document( - file_path, filename - ) - finally: - # Restore original logging level - pdfminer_logger.setLevel(original_level) - - # Clean up the temp file - import os - - try: - os.unlink(file_path) - except Exception as e: - print("Error deleting temp file", e) - 
pass - - await task_logger.log_task_progress( - log_entry, - f"Docling parsing completed, creating document: {filename}", - { - "processing_stage": "parsing_complete", - "content_length": len(result["content"]), - }, - ) - - # Verify actual page count from content length - actual_pages = page_limit_service.estimate_pages_from_content_length( - len(result["content"]) - ) - - # Use the higher of the two estimates for safety (in case pre-estimate was too low) - final_page_count = max(estimated_pages_before, actual_pages) - - # If actual is significantly higher than estimate, log a warning - if actual_pages > estimated_pages_before * 1.5: - await task_logger.log_task_progress( - log_entry, - f"Actual page count higher than estimate: {filename}", - { - "estimated_before": estimated_pages_before, - "actual_pages": actual_pages, - "using_count": final_page_count, - }, - ) - - # Update notification: chunking stage - if notification: - await NotificationService.document_processing.notify_processing_progress( - session, notification, stage="chunking" - ) - - enable_summary = ( - connector.get("enable_summary", True) if connector else True - ) - doc_result = await add_received_file_document_using_docling( - session, - filename, - docling_markdown_document=result["content"], - search_space_id=search_space_id, - user_id=user_id, - connector=connector, - enable_summary=enable_summary, - ) - - if doc_result: - # Update page usage after successful processing - # allow_exceed=True because document was already created after passing initial check - await page_limit_service.update_page_usage( - user_id, final_page_count, allow_exceed=True - ) - - if connector: - await _update_document_from_connector( - doc_result, connector, session - ) - - await task_logger.log_task_success( - log_entry, - f"Successfully processed file with Docling: {filename}", - { - "document_id": doc_result.id, - "content_hash": doc_result.content_hash, - "file_type": "document", - "etl_service": "DOCLING", - 
"pages_processed": final_page_count, - }, - ) - return doc_result - else: - await task_logger.log_task_success( - log_entry, - f"Document already exists (duplicate): {filename}", - { - "duplicate_detected": True, - "file_type": "document", - "etl_service": "DOCLING", - }, - ) - return None except Exception as e: await session.rollback() - # For page limit errors, use the detailed message from the exception from app.services.page_limit_service import PageLimitExceededError if isinstance(e, PageLimitExceededError): @@ -1645,10 +734,225 @@ async def process_file_in_background( str(e), {"error_type": type(e).__name__, "filename": filename}, ) - import logging - logging.error(f"Error processing file in background: {error_message}") - raise # Re-raise so the wrapper can also handle it + raise + + +# =================================================================== +# 2-phase handler (process_file_in_background_with_document) +# =================================================================== + + +async def _extract_file_content( + file_path: str, + filename: str, + session: AsyncSession, + user_id: str, + task_logger: TaskLoggingService, + log_entry: Log, + notification: Notification | None, +) -> tuple[str, str]: + """ + Extract markdown content from a file regardless of type. + + Returns: + Tuple of (markdown_content, etl_service_name). 
+ """ + category = classify_file(filename) + + if category == FileCategory.MARKDOWN: + if notification: + await NotificationService.document_processing.notify_processing_progress( + session, + notification, + stage="parsing", + stage_message="Reading file", + ) + await task_logger.log_task_progress( + log_entry, + f"Processing markdown/text file: {filename}", + {"file_type": "markdown", "processing_stage": "reading_file"}, + ) + with open(file_path, encoding="utf-8") as f: + content = f.read() + with contextlib.suppress(Exception): + os.unlink(file_path) + return content, "MARKDOWN" + + if category == FileCategory.DIRECT_CONVERT: + if notification: + await NotificationService.document_processing.notify_processing_progress( + session, + notification, + stage="parsing", + stage_message="Converting file", + ) + await task_logger.log_task_progress( + log_entry, + f"Direct-converting file to markdown: {filename}", + {"file_type": "direct_convert", "processing_stage": "converting"}, + ) + content = convert_file_directly(file_path, filename) + with contextlib.suppress(Exception): + os.unlink(file_path) + return content, "DIRECT_CONVERT" + + if category == FileCategory.AUDIO: + if notification: + await NotificationService.document_processing.notify_processing_progress( + session, + notification, + stage="parsing", + stage_message="Transcribing audio", + ) + await task_logger.log_task_progress( + log_entry, + f"Processing audio file for transcription: {filename}", + {"file_type": "audio", "processing_stage": "starting_transcription"}, + ) + transcribed_text = await _transcribe_audio(file_path, filename) + with contextlib.suppress(Exception): + os.unlink(file_path) + return transcribed_text, "AUDIO_TRANSCRIPTION" + + # Document file — use ETL service + return await _extract_document_content( + file_path, + filename, + session, + user_id, + task_logger, + log_entry, + notification, + ) + + +async def _transcribe_audio(file_path: str, filename: str) -> str: + """Transcribe an 
audio file and return formatted markdown text.""" + stt_service_type = ( + "local" + if app_config.STT_SERVICE and app_config.STT_SERVICE.startswith("local/") + else "external" + ) + + if stt_service_type == "local": + from app.services.stt_service import stt_service + + result = stt_service.transcribe_file(file_path) + text = result.get("text", "") + if not text: + raise ValueError("Transcription returned empty text") + else: + from litellm import atranscription + + with open(file_path, "rb") as audio_file: + kwargs: dict = { + "model": app_config.STT_SERVICE, + "file": audio_file, + "api_key": app_config.STT_SERVICE_API_KEY, + } + if app_config.STT_SERVICE_API_BASE: + kwargs["api_base"] = app_config.STT_SERVICE_API_BASE + response = await atranscription(**kwargs) + text = response.get("text", "") + if not text: + raise ValueError("Transcription returned empty text") + + return f"# Transcription of {filename}\n\n{text}" + + +async def _extract_document_content( + file_path: str, + filename: str, + session: AsyncSession, + user_id: str, + task_logger: TaskLoggingService, + log_entry: Log, + notification: Notification | None, +) -> tuple[str, str]: + """ + Parse a document file via the configured ETL service. + + Returns: + Tuple of (markdown_content, etl_service_name). 
+ """ + from app.services.page_limit_service import PageLimitService + + page_limit_service = PageLimitService(session) + + try: + estimated_pages = page_limit_service.estimate_pages_before_processing(file_path) + except Exception: + file_size = os.path.getsize(file_path) + estimated_pages = max(1, file_size // (80 * 1024)) + + await page_limit_service.check_page_limit(user_id, estimated_pages) + + etl_service = app_config.ETL_SERVICE + markdown_content: str | None = None + + if notification: + await NotificationService.document_processing.notify_processing_progress( + session, + notification, + stage="parsing", + stage_message="Extracting content", + ) + + if etl_service == "UNSTRUCTURED": + from app.utils.document_converters import convert_document_to_markdown + + docs = await parse_with_unstructured(file_path) + markdown_content = await convert_document_to_markdown(docs) + actual_pages = page_limit_service.estimate_pages_from_elements(docs) + final_pages = max(estimated_pages, actual_pages) + await page_limit_service.update_page_usage( + user_id, final_pages, allow_exceed=True + ) + + elif etl_service == "LLAMACLOUD": + raw_result = await parse_with_llamacloud_retry( + file_path=file_path, + estimated_pages=estimated_pages, + task_logger=task_logger, + log_entry=log_entry, + ) + markdown_documents = await raw_result.aget_markdown_documents( + split_by_page=False + ) + if not markdown_documents: + raise RuntimeError(f"LlamaCloud parsing returned no documents: {filename}") + markdown_content = markdown_documents[0].text + await page_limit_service.update_page_usage( + user_id, estimated_pages, allow_exceed=True + ) + + elif etl_service == "DOCLING": + getLogger("docling.pipeline.base_pipeline").setLevel(ERROR) + getLogger("docling.document_converter").setLevel(ERROR) + getLogger("docling_core.transforms.chunker.hierarchical_chunker").setLevel( + ERROR + ) + + from docling.document_converter import DocumentConverter + + converter = DocumentConverter() + result = 
converter.convert(file_path) + markdown_content = result.document.export_to_markdown() + await page_limit_service.update_page_usage( + user_id, estimated_pages, allow_exceed=True + ) + + else: + raise RuntimeError(f"Unknown ETL_SERVICE: {etl_service}") + + with contextlib.suppress(Exception): + os.unlink(file_path) + + if not markdown_content: + raise RuntimeError(f"Failed to extract content from file: {filename}") + + return markdown_content, etl_service async def process_file_in_background_with_document( @@ -1667,272 +971,50 @@ async def process_file_in_background_with_document( """ Process file and update existing pending document (2-phase pattern). - This function is Phase 2 of the real-time document status updates: - - Phase 1 (API): Created document with pending status - - Phase 2 (this): Process file and update document to ready/failed - - The document already exists with pending status. This function: - 1. Parses the file content (markdown, audio, or ETL services) - 2. Updates the document with content, embeddings, and chunks - 3. Sets status to 'ready' on success - - Args: - document: Existing document with pending status - file_path: Path to the uploaded file - filename: Original filename - search_space_id: ID of the search space - user_id: ID of the user - session: Database session - task_logger: Task logging service - log_entry: Log entry for this task - connector: Optional connector info for Google Drive files - notification: Optional notification for progress updates - - Returns: - Updated Document object if successful, None if duplicate content detected + Phase 1 (API layer): Created document with pending status. + Phase 2 (this function): Process file and update document to ready/failed. 
""" - import os - - from app.config import config as app_config + from app.indexing_pipeline.adapters.file_upload_adapter import ( + UploadDocumentAdapter, + ) from app.services.llm_service import get_user_long_context_llm + from app.utils.document_converters import generate_content_hash + + from .base import check_duplicate_document doc_id = document.id try: - markdown_content = None - etl_service = None - - # ===== STEP 1: Parse file content based on type ===== - - # Check if the file is a markdown or text file - if filename.lower().endswith((".md", ".markdown", ".txt")): - # Update notification: parsing stage - if notification: - await ( - NotificationService.document_processing.notify_processing_progress( - session, - notification, - stage="parsing", - stage_message="Reading file", - ) - ) - - await task_logger.log_task_progress( - log_entry, - f"Processing markdown/text file: {filename}", - {"file_type": "markdown", "processing_stage": "reading_file"}, - ) - - # Read markdown content directly - with open(file_path, encoding="utf-8") as f: - markdown_content = f.read() - etl_service = "MARKDOWN" - - # Clean up temp file - with contextlib.suppress(Exception): - os.unlink(file_path) - - # Check if the file is an audio file - elif filename.lower().endswith( - (".mp3", ".mp4", ".mpeg", ".mpga", ".m4a", ".wav", ".webm") - ): - # Update notification: parsing stage (transcription) - if notification: - await ( - NotificationService.document_processing.notify_processing_progress( - session, - notification, - stage="parsing", - stage_message="Transcribing audio", - ) - ) - - await task_logger.log_task_progress( - log_entry, - f"Processing audio file for transcription: {filename}", - {"file_type": "audio", "processing_stage": "starting_transcription"}, - ) - - # Transcribe audio - stt_service_type = ( - "local" - if app_config.STT_SERVICE - and app_config.STT_SERVICE.startswith("local/") - else "external" - ) - - if stt_service_type == "local": - from 
app.services.stt_service import stt_service - - result = stt_service.transcribe_file(file_path) - transcribed_text = result.get("text", "") - if not transcribed_text: - raise ValueError("Transcription returned empty text") - markdown_content = ( - f"# Transcription of {filename}\n\n{transcribed_text}" - ) - else: - with open(file_path, "rb") as audio_file: - transcription_kwargs = { - "model": app_config.STT_SERVICE, - "file": audio_file, - "api_key": app_config.STT_SERVICE_API_KEY, - } - if app_config.STT_SERVICE_API_BASE: - transcription_kwargs["api_base"] = ( - app_config.STT_SERVICE_API_BASE - ) - transcription_response = await atranscription( - **transcription_kwargs - ) - transcribed_text = transcription_response.get("text", "") - if not transcribed_text: - raise ValueError("Transcription returned empty text") - markdown_content = ( - f"# Transcription of {filename}\n\n{transcribed_text}" - ) - - etl_service = "AUDIO_TRANSCRIPTION" - # Clean up temp file - with contextlib.suppress(Exception): - os.unlink(file_path) - - else: - # Document files - use ETL service - from app.services.page_limit_service import ( - PageLimitExceededError, - PageLimitService, - ) - - page_limit_service = PageLimitService(session) - - # Estimate page count - try: - estimated_pages = page_limit_service.estimate_pages_before_processing( - file_path - ) - except Exception: - file_size = os.path.getsize(file_path) - estimated_pages = max(1, file_size // (80 * 1024)) - - # Check page limit - await page_limit_service.check_page_limit(user_id, estimated_pages) - - if app_config.ETL_SERVICE == "UNSTRUCTURED": - if notification: - await NotificationService.document_processing.notify_processing_progress( - session, - notification, - stage="parsing", - stage_message="Extracting content", - ) - - from langchain_unstructured import UnstructuredLoader - - loader = UnstructuredLoader( - file_path, - mode="elements", - post_processors=[], - languages=["eng"], - include_orig_elements=False, - 
include_metadata=False, - strategy="auto", - ) - docs = await loader.aload() - markdown_content = await convert_document_to_markdown(docs) - actual_pages = page_limit_service.estimate_pages_from_elements(docs) - final_page_count = max(estimated_pages, actual_pages) - etl_service = "UNSTRUCTURED" - - # Update page usage - await page_limit_service.update_page_usage( - user_id, final_page_count, allow_exceed=True - ) - - elif app_config.ETL_SERVICE == "LLAMACLOUD": - if notification: - await NotificationService.document_processing.notify_processing_progress( - session, - notification, - stage="parsing", - stage_message="Extracting content", - ) - - result = await parse_with_llamacloud_retry( - file_path=file_path, - estimated_pages=estimated_pages, - task_logger=task_logger, - log_entry=log_entry, - ) - markdown_documents = await result.aget_markdown_documents( - split_by_page=False - ) - if not markdown_documents: - raise RuntimeError( - f"LlamaCloud parsing returned no documents: {filename}" - ) - markdown_content = markdown_documents[0].text - etl_service = "LLAMACLOUD" - - # Update page usage - await page_limit_service.update_page_usage( - user_id, estimated_pages, allow_exceed=True - ) - - elif app_config.ETL_SERVICE == "DOCLING": - if notification: - await NotificationService.document_processing.notify_processing_progress( - session, - notification, - stage="parsing", - stage_message="Extracting content", - ) - - # Suppress logging during Docling import - getLogger("docling.pipeline.base_pipeline").setLevel(ERROR) - getLogger("docling.document_converter").setLevel(ERROR) - getLogger( - "docling_core.transforms.chunker.hierarchical_chunker" - ).setLevel(ERROR) - - from docling.document_converter import DocumentConverter - - converter = DocumentConverter() - result = converter.convert(file_path) - markdown_content = result.document.export_to_markdown() - etl_service = "DOCLING" - - # Update page usage - await page_limit_service.update_page_usage( - user_id, 
estimated_pages, allow_exceed=True - ) - - else: - raise RuntimeError(f"Unknown ETL_SERVICE: {app_config.ETL_SERVICE}") - - # Clean up temp file - with contextlib.suppress(Exception): - os.unlink(file_path) + # Step 1: extract content + markdown_content, etl_service = await _extract_file_content( + file_path, + filename, + session, + user_id, + task_logger, + log_entry, + notification, + ) if not markdown_content: raise RuntimeError(f"Failed to extract content from file: {filename}") - # ===== STEP 2: Check for duplicate content ===== + # Step 2: duplicate check content_hash = generate_content_hash(markdown_content, search_space_id) - existing_by_content = await check_duplicate_document(session, content_hash) if existing_by_content and existing_by_content.id != doc_id: - # Duplicate content found - mark this document as failed logging.info( f"Duplicate content detected for {filename}, " f"matches document {existing_by_content.id}" ) return None - # ===== STEP 3+4: Index via pipeline ===== + # Step 3: index via pipeline if notification: await NotificationService.document_processing.notify_processing_progress( - session, notification, stage="chunking" + session, + notification, + stage="chunking", ) user_llm = await get_user_long_context_llm(session, user_id, search_space_id) @@ -1957,7 +1039,6 @@ async def process_file_in_background_with_document( "file_type": etl_service, }, ) - return document except Exception as e: diff --git a/surfsense_backend/app/tasks/document_processors/markdown_processor.py b/surfsense_backend/app/tasks/document_processors/markdown_processor.py index 2fb711bf8..0ff340c0e 100644 --- a/surfsense_backend/app/tasks/document_processors/markdown_processor.py +++ b/surfsense_backend/app/tasks/document_processors/markdown_processor.py @@ -14,88 +14,19 @@ from app.utils.document_converters import ( create_document_chunks, generate_content_hash, generate_document_summary, - generate_unique_identifier_hash, ) +from ._helpers import ( + 
find_existing_document_with_migration, + get_google_drive_unique_identifier, +) from .base import ( - check_document_by_unique_identifier, check_duplicate_document, get_current_timestamp, safe_set_chunks, ) -def _get_google_drive_unique_identifier( - connector: dict | None, - filename: str, - search_space_id: int, -) -> tuple[str, str | None]: - """ - Get unique identifier hash for a file, with special handling for Google Drive. - - For Google Drive files, uses file_id as the unique identifier (doesn't change on rename). - For other files, uses filename. - - Args: - connector: Optional connector info dict with type and metadata - filename: The filename (used for non-Google Drive files or as fallback) - search_space_id: The search space ID - - Returns: - Tuple of (primary_hash, legacy_hash or None) - """ - if connector and connector.get("type") == DocumentType.GOOGLE_DRIVE_FILE: - metadata = connector.get("metadata", {}) - file_id = metadata.get("google_drive_file_id") - - if file_id: - primary_hash = generate_unique_identifier_hash( - DocumentType.GOOGLE_DRIVE_FILE, file_id, search_space_id - ) - legacy_hash = generate_unique_identifier_hash( - DocumentType.GOOGLE_DRIVE_FILE, filename, search_space_id - ) - return primary_hash, legacy_hash - - primary_hash = generate_unique_identifier_hash( - DocumentType.FILE, filename, search_space_id - ) - return primary_hash, None - - -async def _find_existing_document_with_migration( - session: AsyncSession, - primary_hash: str, - legacy_hash: str | None, - content_hash: str | None = None, -) -> Document | None: - """ - Find existing document, checking both new hash and legacy hash for migration, - with fallback to content_hash for cross-source deduplication. 
- """ - existing_document = await check_document_by_unique_identifier(session, primary_hash) - - if not existing_document and legacy_hash: - existing_document = await check_document_by_unique_identifier( - session, legacy_hash - ) - if existing_document: - logging.info( - "Found legacy document (filename-based hash), will migrate to file_id-based hash" - ) - - # Fallback: check by content_hash to catch duplicates from different sources - if not existing_document and content_hash: - existing_document = await check_duplicate_document(session, content_hash) - if existing_document: - logging.info( - f"Found duplicate content from different source (content_hash match). " - f"Original document ID: {existing_document.id}, type: {existing_document.document_type}" - ) - - return existing_document - - async def _handle_existing_document_update( session: AsyncSession, existing_document: Document, @@ -224,7 +155,7 @@ async def add_received_markdown_file_document( try: # Generate unique identifier hash (uses file_id for Google Drive, filename for others) - primary_hash, legacy_hash = _get_google_drive_unique_identifier( + primary_hash, legacy_hash = get_google_drive_unique_identifier( connector, file_name, search_space_id ) @@ -232,7 +163,7 @@ async def add_received_markdown_file_document( content_hash = generate_content_hash(file_in_markdown, search_space_id) # Check if document exists (with migration support for Google Drive and content_hash fallback) - existing_document = await _find_existing_document_with_migration( + existing_document = await find_existing_document_with_migration( session, primary_hash, legacy_hash, content_hash ) diff --git a/surfsense_backend/tests/integration/document_upload/test_upload_limits.py b/surfsense_backend/tests/integration/document_upload/test_upload_limits.py index a8dab43f0..a56398baa 100644 --- a/surfsense_backend/tests/integration/document_upload/test_upload_limits.py +++ 
b/surfsense_backend/tests/integration/document_upload/test_upload_limits.py @@ -2,12 +2,11 @@ Integration tests for backend file upload limit enforcement. These tests verify that the API rejects uploads that exceed: - - Max files per upload (10) - - Max per-file size (50 MB) - - Max total upload size (200 MB) + - Max per-file size (500 MB) -The limits mirror the frontend's DocumentUploadTab.tsx constants and are -enforced server-side to protect against direct API calls. +No file count or total size limits are enforced — the frontend batches +uploads in groups of 5 and there is no cap on how many files a user can +upload in a single session. Prerequisites: - PostgreSQL + pgvector @@ -24,60 +23,12 @@ pytestmark = pytest.mark.integration # --------------------------------------------------------------------------- -# Test A: File count limit -# --------------------------------------------------------------------------- - - -class TestFileCountLimit: - """Uploading more than 10 files in a single request should be rejected.""" - - async def test_11_files_returns_413( - self, - client: httpx.AsyncClient, - headers: dict[str, str], - search_space_id: int, - ): - files = [ - ("files", (f"file_{i}.txt", io.BytesIO(b"test content"), "text/plain")) - for i in range(11) - ] - resp = await client.post( - "/api/v1/documents/fileupload", - headers=headers, - files=files, - data={"search_space_id": str(search_space_id)}, - ) - assert resp.status_code == 413 - assert "too many files" in resp.json()["detail"].lower() - - async def test_10_files_accepted( - self, - client: httpx.AsyncClient, - headers: dict[str, str], - search_space_id: int, - cleanup_doc_ids: list[int], - ): - files = [ - ("files", (f"file_{i}.txt", io.BytesIO(b"test content"), "text/plain")) - for i in range(10) - ] - resp = await client.post( - "/api/v1/documents/fileupload", - headers=headers, - files=files, - data={"search_space_id": str(search_space_id)}, - ) - assert resp.status_code == 200 - 
cleanup_doc_ids.extend(resp.json().get("document_ids", [])) - - -# --------------------------------------------------------------------------- -# Test B: Per-file size limit +# Test: Per-file size limit (500 MB) # --------------------------------------------------------------------------- class TestPerFileSizeLimit: - """A single file exceeding 50 MB should be rejected.""" + """A single file exceeding 500 MB should be rejected.""" async def test_oversized_file_returns_413( self, @@ -85,7 +36,7 @@ class TestPerFileSizeLimit: headers: dict[str, str], search_space_id: int, ): - oversized = io.BytesIO(b"\x00" * (50 * 1024 * 1024 + 1)) + oversized = io.BytesIO(b"\x00" * (500 * 1024 * 1024 + 1)) resp = await client.post( "/api/v1/documents/fileupload", headers=headers, @@ -102,11 +53,11 @@ class TestPerFileSizeLimit: search_space_id: int, cleanup_doc_ids: list[int], ): - at_limit = io.BytesIO(b"\x00" * (50 * 1024 * 1024)) + at_limit = io.BytesIO(b"\x00" * (500 * 1024 * 1024)) resp = await client.post( "/api/v1/documents/fileupload", headers=headers, - files=[("files", ("exact50mb.txt", at_limit, "text/plain"))], + files=[("files", ("exact500mb.txt", at_limit, "text/plain"))], data={"search_space_id": str(search_space_id)}, ) assert resp.status_code == 200 @@ -114,26 +65,23 @@ class TestPerFileSizeLimit: # --------------------------------------------------------------------------- -# Test C: Total upload size limit +# Test: Multiple files accepted without count limit # --------------------------------------------------------------------------- -class TestTotalSizeLimit: - """Multiple files whose combined size exceeds 200 MB should be rejected.""" +class TestNoFileCountLimit: + """Many files in a single request should be accepted.""" - async def test_total_size_over_200mb_returns_413( + async def test_many_files_accepted( self, client: httpx.AsyncClient, headers: dict[str, str], search_space_id: int, + cleanup_doc_ids: list[int], ): - chunk_size = 45 * 1024 * 1024 # 45 MB 
each files = [ - ( - "files", - (f"chunk_{i}.txt", io.BytesIO(b"\x00" * chunk_size), "text/plain"), - ) - for i in range(5) # 5 x 45 MB = 225 MB > 200 MB + ("files", (f"file_{i}.txt", io.BytesIO(b"test content"), "text/plain")) + for i in range(20) ] resp = await client.post( "/api/v1/documents/fileupload", @@ -141,5 +89,5 @@ class TestTotalSizeLimit: files=files, data={"search_space_id": str(search_space_id)}, ) - assert resp.status_code == 413 - assert "total upload size" in resp.json()["detail"].lower() + assert resp.status_code == 200 + cleanup_doc_ids.extend(resp.json().get("document_ids", [])) diff --git a/surfsense_backend/tests/unit/middleware/test_knowledge_search.py b/surfsense_backend/tests/unit/middleware/test_knowledge_search.py index 163dd0d1d..a8cf5c93b 100644 --- a/surfsense_backend/tests/unit/middleware/test_knowledge_search.py +++ b/surfsense_backend/tests/unit/middleware/test_knowledge_search.py @@ -248,7 +248,7 @@ class TestKnowledgeBaseSearchMiddlewarePlanner: return [] async def fake_build_scoped_filesystem(**kwargs): - return {} + return {}, {} monkeypatch.setattr( "app.agents.new_chat.middleware.knowledge_search.search_knowledge_base", @@ -298,7 +298,7 @@ class TestKnowledgeBaseSearchMiddlewarePlanner: return [] async def fake_build_scoped_filesystem(**kwargs): - return {} + return {}, {} monkeypatch.setattr( "app.agents.new_chat.middleware.knowledge_search.search_knowledge_base", @@ -334,7 +334,7 @@ class TestKnowledgeBaseSearchMiddlewarePlanner: return [] async def fake_build_scoped_filesystem(**kwargs): - return {} + return {}, {} monkeypatch.setattr( "app.agents.new_chat.middleware.knowledge_search.search_knowledge_base", diff --git a/surfsense_web/app/dashboard/[search_space_id]/documents/(manage)/components/DocumentsTableShell.tsx b/surfsense_web/app/dashboard/[search_space_id]/documents/(manage)/components/DocumentsTableShell.tsx index 4e0c36267..1c246ed71 100644 --- 
a/surfsense_web/app/dashboard/[search_space_id]/documents/(manage)/components/DocumentsTableShell.tsx +++ b/surfsense_web/app/dashboard/[search_space_id]/documents/(manage)/components/DocumentsTableShell.tsx @@ -329,14 +329,15 @@ export function DocumentsTableShell({ const handleViewDocument = useCallback(async (doc: Document) => { setViewingDoc(doc); - if (doc.content) { - setViewingContent(doc.content); + const preview = doc.content_preview || doc.content; + if (preview) { + setViewingContent(preview); return; } setViewingLoading(true); try { const fullDoc = await documentsApiService.getDocument({ id: doc.id }); - setViewingContent(fullDoc.content); + setViewingContent(fullDoc.content_preview || fullDoc.content); } catch (err) { console.error("[DocumentsTableShell] Failed to fetch document content:", err); setViewingContent("Failed to load document content."); @@ -946,13 +947,36 @@ export function DocumentsTableShell({ WebkitMaskImage: `linear-gradient(to bottom, ${previewScrollPos === "top" ? "black" : "transparent"}, black 16px, black calc(100% - 16px), ${previewScrollPos === "bottom" ? "black" : "transparent"})`, }} > - {viewingLoading ? ( -
- -
- ) : ( - - )} + {viewingLoading ? ( +
+ +
+ ) : ( + <> + + {viewingDoc && ( +
+ +
+ )} + + )} diff --git a/surfsense_web/app/dashboard/[search_space_id]/documents/(manage)/components/types.ts b/surfsense_web/app/dashboard/[search_space_id]/documents/(manage)/components/types.ts index d87f7374b..88914bd4f 100644 --- a/surfsense_web/app/dashboard/[search_space_id]/documents/(manage)/components/types.ts +++ b/surfsense_web/app/dashboard/[search_space_id]/documents/(manage)/components/types.ts @@ -9,9 +9,9 @@ export type Document = { id: number; title: string; document_type: DocumentType; - // Optional: Only needed when viewing document details (lazy loaded) document_metadata?: any; content?: string; + content_preview?: string; created_at: string; search_space_id: number; created_by_id?: string | null; diff --git a/surfsense_web/components/editor-panel/editor-panel.tsx b/surfsense_web/components/editor-panel/editor-panel.tsx index 3ea36f800..4b7079aef 100644 --- a/surfsense_web/components/editor-panel/editor-panel.tsx +++ b/surfsense_web/components/editor-panel/editor-panel.tsx @@ -1,12 +1,13 @@ "use client"; import { useAtomValue, useSetAtom } from "jotai"; -import { AlertCircle, XIcon } from "lucide-react"; +import { AlertCircle, Download, FileText, Loader2, XIcon } from "lucide-react"; import dynamic from "next/dynamic"; import { useCallback, useEffect, useRef, useState } from "react"; import { toast } from "sonner"; import { closeEditorPanelAtom, editorPanelAtom } from "@/atoms/editor/editor-panel.atom"; import { MarkdownViewer } from "@/components/markdown-viewer"; +import { Alert, AlertDescription } from "@/components/ui/alert"; import { Button } from "@/components/ui/button"; import { Drawer, DrawerContent, DrawerHandle, DrawerTitle } from "@/components/ui/drawer"; import { Skeleton } from "@/components/ui/skeleton"; @@ -18,11 +19,16 @@ const PlateEditor = dynamic( { ssr: false, loading: () => } ); +const LARGE_DOCUMENT_THRESHOLD = 2 * 1024 * 1024; // 2MB + interface EditorContent { document_id: number; title: string; document_type?: string; 
source_markdown: string; + content_size_bytes?: number; + chunk_count?: number; + truncated?: boolean; } const EDITABLE_DOCUMENT_TYPES = new Set(["FILE", "NOTE"]); @@ -62,6 +68,7 @@ export function EditorPanelContent({ const [isLoading, setIsLoading] = useState(true); const [error, setError] = useState(null); const [saving, setSaving] = useState(false); + const [downloading, setDownloading] = useState(false); const [editedMarkdown, setEditedMarkdown] = useState(null); const markdownRef = useRef(""); @@ -69,6 +76,8 @@ export function EditorPanelContent({ const changeCountRef = useRef(0); const [displayTitle, setDisplayTitle] = useState(title || "Untitled"); + const isLargeDocument = (editorDoc?.content_size_bytes ?? 0) > LARGE_DOCUMENT_THRESHOLD; + useEffect(() => { let cancelled = false; setIsLoading(true); @@ -86,10 +95,12 @@ export function EditorPanelContent({ } try { - const response = await authenticatedFetch( - `${process.env.NEXT_PUBLIC_FASTAPI_BACKEND_URL}/api/v1/search-spaces/${searchSpaceId}/documents/${documentId}/editor-content`, - { method: "GET" } + const url = new URL( + `${process.env.NEXT_PUBLIC_FASTAPI_BACKEND_URL}/api/v1/search-spaces/${searchSpaceId}/documents/${documentId}/editor-content` ); + url.searchParams.set("max_length", String(LARGE_DOCUMENT_THRESHOLD)); + + const response = await authenticatedFetch(url.toString(), { method: "GET" }); if (cancelled) return; @@ -175,7 +186,7 @@ export function EditorPanelContent({ }, [documentId, searchSpaceId]); const isEditableType = editorDoc - ? EDITABLE_DOCUMENT_TYPES.has(editorDoc.document_type ?? "") + ? EDITABLE_DOCUMENT_TYPES.has(editorDoc.document_type ?? "") && !isLargeDocument : false; return ( @@ -206,6 +217,57 @@ export function EditorPanelContent({

{error || "An unknown error occurred"}

+ ) : isLargeDocument ? ( +
+ + + + + This document is too large for the editor ({Math.round((editorDoc.content_size_bytes ?? 0) / 1024 / 1024)}MB, {editorDoc.chunk_count ?? 0} chunks). Showing a preview below. + + + + + +
) : isEditableType ? ( (null); const [isEditing, setIsEditing] = useState(false); const [saving, setSaving] = useState(false); + const [downloading, setDownloading] = useState(false); const [editedMarkdown, setEditedMarkdown] = useState(null); const markdownRef = useRef(""); const initialLoadDone = useRef(false); const changeCountRef = useRef(0); + const isLargeDocument = (doc?.content_size_bytes ?? 0) > LARGE_DOCUMENT_THRESHOLD; + useEffect(() => { let cancelled = false; setIsLoading(true); @@ -72,10 +81,12 @@ export function DocumentTabContent({ documentId, searchSpaceId, title }: Documen } try { - const response = await authenticatedFetch( - `${process.env.NEXT_PUBLIC_FASTAPI_BACKEND_URL}/api/v1/search-spaces/${searchSpaceId}/documents/${documentId}/editor-content`, - { method: "GET" } + const url = new URL( + `${process.env.NEXT_PUBLIC_FASTAPI_BACKEND_URL}/api/v1/search-spaces/${searchSpaceId}/documents/${documentId}/editor-content` ); + url.searchParams.set("max_length", String(LARGE_DOCUMENT_THRESHOLD)); + + const response = await authenticatedFetch(url.toString(), { method: "GET" }); if (cancelled) return; @@ -173,9 +184,9 @@ export function DocumentTabContent({ documentId, searchSpaceId, title }: Documen ); } - const isEditable = EDITABLE_DOCUMENT_TYPES.has(doc.document_type ?? ""); + const isEditable = EDITABLE_DOCUMENT_TYPES.has(doc.document_type ?? "") && !isLargeDocument; - if (isEditing) { + if (isEditing && !isLargeDocument) { return (
@@ -236,7 +247,60 @@ export function DocumentTabContent({ documentId, searchSpaceId, title }: Documen
- + {isLargeDocument ? ( + <> + + + + + This document is too large for the editor ({Math.round((doc.content_size_bytes ?? 0) / 1024 / 1024)}MB, {doc.chunk_count ?? 0} chunks). Showing a preview below. + + + + + + + ) : ( + + )}
diff --git a/surfsense_web/components/markdown-viewer.tsx b/surfsense_web/components/markdown-viewer.tsx index e22df8998..abd999301 100644 --- a/surfsense_web/components/markdown-viewer.tsx +++ b/surfsense_web/components/markdown-viewer.tsx @@ -15,6 +15,7 @@ const math = createMathPlugin({ interface MarkdownViewerProps { content: string; className?: string; + maxLength?: number; } /** @@ -79,8 +80,10 @@ function convertLatexDelimiters(content: string): string { return content; } -export function MarkdownViewer({ content, className }: MarkdownViewerProps) { - const processedContent = convertLatexDelimiters(stripOuterMarkdownFence(content)); +export function MarkdownViewer({ content, className, maxLength }: MarkdownViewerProps) { + const isTruncated = maxLength != null && content.length > maxLength; + const displayContent = isTruncated ? content.slice(0, maxLength) : content; + const processedContent = convertLatexDelimiters(stripOuterMarkdownFence(displayContent)); const components: StreamdownProps["components"] = { p: ({ children, ...props }) => (

@@ -171,6 +174,11 @@ export function MarkdownViewer({ content, className }: MarkdownViewerProps) { > {processedContent} + {isTruncated && ( +

+ Content truncated ({Math.round(content.length / 1024)}KB total). Showing first {Math.round(maxLength / 1024)}KB. +

+ )} ); } diff --git a/surfsense_web/components/new-chat/source-detail-panel.tsx b/surfsense_web/components/new-chat/source-detail-panel.tsx index b02b2e217..c17616c53 100644 --- a/surfsense_web/components/new-chat/source-detail-panel.tsx +++ b/surfsense_web/components/new-chat/source-detail-panel.tsx @@ -1,7 +1,7 @@ "use client"; import { useQuery } from "@tanstack/react-query"; -import { BookOpen, ChevronDown, ExternalLink, FileText, Hash, Sparkles, X } from "lucide-react"; +import { BookOpen, ChevronDown, ChevronUp, ExternalLink, FileText, Hash, Loader2, Sparkles, X } from "lucide-react"; import { AnimatePresence, motion, useReducedMotion } from "motion/react"; import { useTranslations } from "next-intl"; import type React from "react"; @@ -10,7 +10,6 @@ import { createPortal } from "react-dom"; import { MarkdownViewer } from "@/components/markdown-viewer"; import { Badge } from "@/components/ui/badge"; import { Button } from "@/components/ui/button"; -import { Collapsible, CollapsibleContent, CollapsibleTrigger } from "@/components/ui/collapsible"; import { ScrollArea } from "@/components/ui/scroll-area"; import { Spinner } from "@/components/ui/spinner"; import type { @@ -48,7 +47,8 @@ const formatDocumentType = (type: string) => { // which break auto-scroll functionality interface ChunkCardProps { chunk: { id: number; content: string }; - index: number; + localIndex: number; + chunkNumber: number; totalChunks: number; isCited: boolean; isActive: boolean; @@ -56,11 +56,11 @@ interface ChunkCardProps { } const ChunkCard = memo( - forwardRef(({ chunk, index, totalChunks, isCited }, ref) => { + forwardRef(({ chunk, localIndex, chunkNumber, totalChunks, isCited }, ref) => { return (
- {/* Cited indicator glow effect */} {isCited &&
} - {/* Header */}
- {index + 1} + {chunkNumber}
- of {totalChunks} chunks + Chunk {chunkNumber} of {totalChunks}
{isCited && ( @@ -94,9 +92,8 @@ const ChunkCard = memo( )}
- {/* Content */}
- +
); @@ -118,7 +115,6 @@ export function SourceDetailPanel({ const t = useTranslations("dashboard"); const scrollAreaRef = useRef(null); const hasScrolledRef = useRef(false); // Use ref to avoid stale closures - const [summaryOpen, setSummaryOpen] = useState(false); const [activeChunkIndex, setActiveChunkIndex] = useState(null); const [mounted, setMounted] = useState(false); const [_hasScrolledToCited, setHasScrolledToCited] = useState(false); @@ -140,20 +136,88 @@ export function SourceDetailPanel({ if (isDocsChunk) { return documentsApiService.getSurfsenseDocByChunk(chunkId); } - return documentsApiService.getDocumentByChunk({ chunk_id: chunkId }); + return documentsApiService.getDocumentByChunk({ chunk_id: chunkId, chunk_window: 5 }); }, enabled: !!chunkId && open, staleTime: 5 * 60 * 1000, }); + const totalChunks = (documentData && "total_chunks" in documentData) + ? (documentData.total_chunks ?? documentData.chunks.length) + : (documentData?.chunks?.length ?? 0); + const [beforeChunks, setBeforeChunks] = useState>([]); + const [afterChunks, setAfterChunks] = useState>([]); + const [loadingBefore, setLoadingBefore] = useState(false); + const [loadingAfter, setLoadingAfter] = useState(false); + + useEffect(() => { + setBeforeChunks([]); + setAfterChunks([]); + }, [chunkId, open]); + + const chunkStartIndex = (documentData && "chunk_start_index" in documentData) + ? (documentData.chunk_start_index ?? 0) : 0; + const initialChunks = documentData?.chunks ?? 
[]; + const allChunks = [...beforeChunks, ...initialChunks, ...afterChunks]; + const absoluteStart = chunkStartIndex - beforeChunks.length; + const absoluteEnd = chunkStartIndex + initialChunks.length + afterChunks.length; + const canLoadBefore = absoluteStart > 0; + const canLoadAfter = absoluteEnd < totalChunks; + + const EXPAND_SIZE = 10; + + const loadBefore = useCallback(async () => { + if (!documentData || !("search_space_id" in documentData) || !canLoadBefore) return; + setLoadingBefore(true); + try { + const count = Math.min(EXPAND_SIZE, absoluteStart); + const result = await documentsApiService.getDocumentChunks({ + document_id: documentData.id, + page: 0, + page_size: count, + start_offset: absoluteStart - count, + }); + const existingIds = new Set(allChunks.map(c => c.id)); + const newChunks = result.items + .filter(c => !existingIds.has(c.id)) + .map(c => ({ id: c.id, content: c.content, created_at: c.created_at })); + setBeforeChunks(prev => [...newChunks, ...prev]); + } catch (err) { + console.error("Failed to load earlier chunks:", err); + } finally { + setLoadingBefore(false); + } + }, [documentData, absoluteStart, canLoadBefore, allChunks]); + + const loadAfter = useCallback(async () => { + if (!documentData || !("search_space_id" in documentData) || !canLoadAfter) return; + setLoadingAfter(true); + try { + const result = await documentsApiService.getDocumentChunks({ + document_id: documentData.id, + page: 0, + page_size: EXPAND_SIZE, + start_offset: absoluteEnd, + }); + const existingIds = new Set(allChunks.map(c => c.id)); + const newChunks = result.items + .filter(c => !existingIds.has(c.id)) + .map(c => ({ id: c.id, content: c.content, created_at: c.created_at })); + setAfterChunks(prev => [...prev, ...newChunks]); + } catch (err) { + console.error("Failed to load later chunks:", err); + } finally { + setLoadingAfter(false); + } + }, [documentData, absoluteEnd, canLoadAfter, allChunks]); + const isDirectRenderSource = sourceType === 
"TAVILY_API" || sourceType === "LINKUP_API" || sourceType === "SEARXNG_API" || sourceType === "BAIDU_SEARCH_API"; - // Find cited chunk index - const citedChunkIndex = documentData?.chunks?.findIndex((chunk) => chunk.id === chunkId) ?? -1; + const citedChunkIndex = allChunks.findIndex((chunk) => chunk.id === chunkId); // Simple scroll function that scrolls to a chunk by index const scrollToChunkByIndex = useCallback( @@ -336,12 +400,12 @@ export function SourceDetailPanel({ {documentData && "document_type" in documentData ? formatDocumentType(documentData.document_type) : sourceType && formatDocumentType(sourceType)} - {documentData?.chunks && ( - - • {documentData.chunks.length} chunk - {documentData.chunks.length !== 1 ? "s" : ""} - - )} + {totalChunks > 0 && ( + + • {totalChunks} chunk{totalChunks !== 1 ? "s" : ""} + {allChunks.length < totalChunks && ` (showing ${allChunks.length})`} + + )}

@@ -450,7 +514,7 @@ export function SourceDetailPanel({ {!isDirectRenderSource && documentData && (
{/* Chunk Navigation Sidebar */} - {documentData.chunks.length > 1 && ( + {allChunks.length > 1 && (
- {documentData.chunks.map((chunk, idx) => { + {allChunks.map((chunk, idx) => { + const absNum = absoluteStart + idx + 1; const isCited = chunk.id === chunkId; const isActive = activeChunkIndex === idx; return ( @@ -478,9 +543,9 @@ export function SourceDetailPanel({ ? "bg-muted text-foreground" : "bg-muted/50 text-muted-foreground hover:bg-muted hover:text-foreground" )} - title={isCited ? `Chunk ${idx + 1} (Cited)` : `Chunk ${idx + 1}`} + title={isCited ? `Chunk ${absNum} (Cited)` : `Chunk ${absNum}`} > - {idx + 1} + {absNum} {isCited && ( @@ -524,44 +589,11 @@ export function SourceDetailPanel({ )} - {/* Summary Collapsible */} - {documentData.content && ( - - - - - - Document Summary - - - - - - - - - - - - - )} - {/* Chunks Header */} -
+

- Content Chunks + Chunks {absoluteStart + 1}–{absoluteEnd} of {totalChunks}

{citedChunkIndex !== -1 && ( +
+ )} + {/* Chunks */}
- {documentData.chunks.map((chunk, idx) => { + {allChunks.map((chunk, idx) => { const isCited = chunk.id === chunkId; + const chunkNumber = absoluteStart + idx + 1; return ( 30} + disableLayoutAnimation={allChunks.length > 30} /> ); })}
+ + {/* Load Later */} + {canLoadAfter && ( +
+ +
+ )}
diff --git a/surfsense_web/components/sources/DocumentUploadTab.tsx b/surfsense_web/components/sources/DocumentUploadTab.tsx index 6817b19db..faa042d8e 100644 --- a/surfsense_web/components/sources/DocumentUploadTab.tsx +++ b/surfsense_web/components/sources/DocumentUploadTab.tsx @@ -1,10 +1,10 @@ "use client"; import { useAtom } from "jotai"; -import { CheckCircle2, FileType, Info, Upload, X } from "lucide-react"; +import { CheckCircle2, FileType, FolderOpen, Info, Upload, X } from "lucide-react"; import { useTranslations } from "next-intl"; -import { useCallback, useMemo, useRef, useState } from "react"; +import { type ChangeEvent, useCallback, useMemo, useRef, useState } from "react"; import { useDropzone } from "react-dropzone"; import { toast } from "sonner"; import { uploadDocumentMutationAtom } from "@/atoms/documents/document-mutation.atoms"; @@ -51,6 +51,7 @@ const commonTypes = { "application/vnd.openxmlformats-officedocument.presentationml.presentation": [".pptx"], "text/html": [".html", ".htm"], "text/csv": [".csv"], + "text/tab-separated-values": [".tsv"], "image/jpeg": [".jpg", ".jpeg"], "image/png": [".png"], "image/bmp": [".bmp"], @@ -76,7 +77,6 @@ const FILE_TYPE_CONFIG: Record> = { "application/rtf": [".rtf"], "application/xml": [".xml"], "application/epub+zip": [".epub"], - "text/tab-separated-values": [".tsv"], "text/html": [".html", ".htm", ".web"], "image/gif": [".gif"], "image/svg+xml": [".svg"], @@ -102,7 +102,6 @@ const FILE_TYPE_CONFIG: Record> = { "application/vnd.ms-powerpoint": [".ppt"], "text/x-rst": [".rst"], "application/rtf": [".rtf"], - "text/tab-separated-values": [".tsv"], "application/vnd.ms-excel": [".xls"], "application/xml": [".xml"], ...audioFileTypes, @@ -116,10 +115,8 @@ interface FileWithId { const cardClass = "border border-border bg-slate-400/5 dark:bg-white/5"; -// Upload limits — files are sent in batches of 5 to avoid proxy timeouts -const MAX_FILES = 50; -const MAX_TOTAL_SIZE_MB = 200; -const MAX_TOTAL_SIZE_BYTES = 
MAX_TOTAL_SIZE_MB * 1024 * 1024; +const MAX_FILE_SIZE_MB = 500; +const MAX_FILE_SIZE_BYTES = MAX_FILE_SIZE_MB * 1024 * 1024; export function DocumentUploadTab({ searchSpaceId, @@ -134,6 +131,7 @@ export function DocumentUploadTab({ const [uploadDocumentMutation] = useAtom(uploadDocumentMutationAtom); const { mutate: uploadDocuments, isPending: isUploading } = uploadDocumentMutation; const fileInputRef = useRef(null); + const folderInputRef = useRef(null); const acceptedFileTypes = useMemo(() => { const etlService = process.env.NEXT_PUBLIC_ETL_SERVICE; @@ -145,49 +143,76 @@ export function DocumentUploadTab({ [acceptedFileTypes] ); - const onDrop = useCallback( - (acceptedFiles: File[]) => { + const supportedExtensionsSet = useMemo( + () => new Set(supportedExtensions.map((ext) => ext.toLowerCase())), + [supportedExtensions] + ); + + const addFiles = useCallback( + (incoming: File[]) => { + const oversized = incoming.filter((f) => f.size > MAX_FILE_SIZE_BYTES); + if (oversized.length > 0) { + toast.error(t("file_too_large"), { + description: t("file_too_large_desc", { + name: oversized[0].name, + maxMB: MAX_FILE_SIZE_MB, + }), + }); + } + const valid = incoming.filter((f) => f.size <= MAX_FILE_SIZE_BYTES); + if (valid.length === 0) return; + setFiles((prev) => { - const newEntries = acceptedFiles.map((f) => ({ + const newEntries = valid.map((f) => ({ id: crypto.randomUUID?.() ?? 
`file-${Date.now()}-${Math.random().toString(36)}`, file: f, })); - const newFiles = [...prev, ...newEntries]; - - if (newFiles.length > MAX_FILES) { - toast.error(t("max_files_exceeded"), { - description: t("max_files_exceeded_desc", { max: MAX_FILES }), - }); - return prev; - } - - const newTotalSize = newFiles.reduce((sum, entry) => sum + entry.file.size, 0); - if (newTotalSize > MAX_TOTAL_SIZE_BYTES) { - toast.error(t("max_size_exceeded"), { - description: t("max_size_exceeded_desc", { max: MAX_TOTAL_SIZE_MB }), - }); - return prev; - } - - return newFiles; + return [...prev, ...newEntries]; }); }, [t] ); + const onDrop = useCallback( + (acceptedFiles: File[]) => { + addFiles(acceptedFiles); + }, + [addFiles] + ); + const { getRootProps, getInputProps, isDragActive } = useDropzone({ onDrop, accept: acceptedFileTypes, - maxSize: 50 * 1024 * 1024, // 50MB per file + maxSize: MAX_FILE_SIZE_BYTES, noClick: false, - disabled: files.length >= MAX_FILES, }); - // Handle file input click to prevent event bubbling that might reopen dialog const handleFileInputClick = useCallback((e: React.MouseEvent) => { e.stopPropagation(); }, []); + const handleFolderChange = useCallback( + (e: ChangeEvent) => { + const fileList = e.target.files; + if (!fileList || fileList.length === 0) return; + + const folderFiles = Array.from(fileList).filter((f) => { + const ext = f.name.includes(".") ? 
`.${f.name.split(".").pop()?.toLowerCase()}` : ""; + return ext !== "" && supportedExtensionsSet.has(ext); + }); + + if (folderFiles.length === 0) { + toast.error(t("no_supported_files_in_folder")); + e.target.value = ""; + return; + } + + addFiles(folderFiles); + e.target.value = ""; + }, + [addFiles, supportedExtensionsSet, t] + ); + const formatFileSize = (bytes: number) => { if (bytes === 0) return "0 Bytes"; const k = 1024; @@ -198,15 +223,6 @@ export function DocumentUploadTab({ const totalFileSize = files.reduce((total, entry) => total + entry.file.size, 0); - // Check if limits are reached - const isFileCountLimitReached = files.length >= MAX_FILES; - const isSizeLimitReached = totalFileSize >= MAX_TOTAL_SIZE_BYTES; - const remainingFiles = MAX_FILES - files.length; - const remainingSizeMB = Math.max( - 0, - (MAX_TOTAL_SIZE_BYTES - totalFileSize) / (1024 * 1024) - ).toFixed(1); - // Track accordion state changes const handleAccordionChange = useCallback( (value: string) => { @@ -257,11 +273,21 @@ export function DocumentUploadTab({ - {t("file_size_limit")}{" "} - {t("upload_limits", { maxFiles: MAX_FILES, maxSizeMB: MAX_TOTAL_SIZE_MB })} + {t("file_size_limit", { maxMB: MAX_FILE_SIZE_MB })}{" "} + {t("upload_limits")} + {/* Hidden folder input */} + )} + /> +
@@ -269,11 +295,7 @@ export function DocumentUploadTab({
- {isFileCountLimitReached ? ( -
- -
-

- {t("file_limit_reached")} -

-

- {t("file_limit_reached_desc", { max: MAX_FILES })} -

-
-
- ) : isDragActive ? ( + {isDragActive ? (

{t("drop_files")}

@@ -305,29 +315,35 @@ export function DocumentUploadTab({

{t("drag_drop")}

{t("or_browse")}

- {files.length > 0 && ( -

- {t("remaining_capacity", { files: remainingFiles, sizeMB: remainingSizeMB })} -

- )} -
- )} - {!isFileCountLimitReached && ( -
-
)} +
+ + +
diff --git a/surfsense_web/contracts/types/document.types.ts b/surfsense_web/contracts/types/document.types.ts index 1a3326bae..f5431aecb 100644 --- a/surfsense_web/contracts/types/document.types.ts +++ b/surfsense_web/contracts/types/document.types.ts @@ -39,6 +39,7 @@ export const document = z.object({ document_type: documentTypeEnum, document_metadata: z.record(z.string(), z.any()), content: z.string(), + content_preview: z.string().optional().default(""), content_hash: z.string(), unique_identifier_hash: z.string().nullable(), created_at: z.string(), @@ -69,6 +70,8 @@ export const documentWithChunks = document.extend({ created_at: z.string(), }) ), + total_chunks: z.number().optional().default(0), + chunk_start_index: z.number().optional().default(0), }); /** @@ -243,10 +246,36 @@ export const getDocumentTypeCountsResponse = z.record(z.string(), z.number()); */ export const getDocumentByChunkRequest = z.object({ chunk_id: z.number(), + chunk_window: z.number().optional(), }); export const getDocumentByChunkResponse = documentWithChunks; +/** + * Get paginated chunks for a document + */ +export const getDocumentChunksRequest = z.object({ + document_id: z.number(), + page: z.number().optional().default(0), + page_size: z.number().optional().default(20), + start_offset: z.number().optional(), +}); + +export const chunkRead = z.object({ + id: z.number(), + content: z.string(), + document_id: z.number(), + created_at: z.string(), +}); + +export const getDocumentChunksResponse = z.object({ + items: z.array(chunkRead), + total: z.number(), + page: z.number(), + page_size: z.number(), + has_more: z.boolean(), +}); + /** * Get Surfsense docs by chunk */ @@ -328,3 +357,6 @@ export type GetSurfsenseDocsByChunkRequest = z.infer; export type GetSurfsenseDocsRequest = z.infer; export type GetSurfsenseDocsResponse = z.infer; +export type GetDocumentChunksRequest = z.infer; +export type GetDocumentChunksResponse = z.infer; +export type ChunkRead = z.infer; diff --git 
a/surfsense_web/lib/apis/documents-api.service.ts b/surfsense_web/lib/apis/documents-api.service.ts index 14a247032..71fa58852 100644 --- a/surfsense_web/lib/apis/documents-api.service.ts +++ b/surfsense_web/lib/apis/documents-api.service.ts @@ -6,6 +6,7 @@ import { deleteDocumentRequest, deleteDocumentResponse, type GetDocumentByChunkRequest, + type GetDocumentChunksRequest, type GetDocumentRequest, type GetDocumentsRequest, type GetDocumentsStatusRequest, @@ -13,6 +14,8 @@ import { type GetSurfsenseDocsRequest, getDocumentByChunkRequest, getDocumentByChunkResponse, + getDocumentChunksRequest, + getDocumentChunksResponse, getDocumentRequest, getDocumentResponse, getDocumentsRequest, @@ -295,23 +298,52 @@ class DocumentsApiService { }; /** - * Get document by chunk ID (includes all chunks) + * Get document by chunk ID (includes a window of chunks around the cited one) */ getDocumentByChunk = async (request: GetDocumentByChunkRequest) => { - // Validate the request const parsedRequest = getDocumentByChunkRequest.safeParse(request); if (!parsedRequest.success) { console.error("Invalid request:", parsedRequest.error); - // Format a user friendly error message const errorMessage = parsedRequest.error.issues.map((issue) => issue.message).join(", "); throw new ValidationError(`Invalid request: ${errorMessage}`); } + const params = new URLSearchParams(); + if (request.chunk_window != null) { + params.set("chunk_window", String(request.chunk_window)); + } + const qs = params.toString(); + const url = `/api/v1/documents/by-chunk/${request.chunk_id}${qs ? 
`?${qs}` : ""}`; + + return baseApiService.get(url, getDocumentByChunkResponse); + }; + + /** + * Get paginated chunks for a document + */ + getDocumentChunks = async (request: GetDocumentChunksRequest) => { + const parsedRequest = getDocumentChunksRequest.safeParse(request); + + if (!parsedRequest.success) { + console.error("Invalid request:", parsedRequest.error); + + const errorMessage = parsedRequest.error.issues.map((issue) => issue.message).join(", "); + throw new ValidationError(`Invalid request: ${errorMessage}`); + } + + const params = new URLSearchParams({ + page: String(parsedRequest.data.page), + page_size: String(parsedRequest.data.page_size), + }); + if (parsedRequest.data.start_offset != null) { + params.set("start_offset", String(parsedRequest.data.start_offset)); + } + return baseApiService.get( - `/api/v1/documents/by-chunk/${request.chunk_id}`, - getDocumentByChunkResponse + `/api/v1/documents/${parsedRequest.data.document_id}/chunks?${params}`, + getDocumentChunksResponse ); }; diff --git a/surfsense_web/messages/en.json b/surfsense_web/messages/en.json index 53f80ea5f..cacaec557 100644 --- a/surfsense_web/messages/en.json +++ b/surfsense_web/messages/en.json @@ -376,12 +376,13 @@ "upload_documents": { "title": "Upload Documents", "subtitle": "Upload your files to make them searchable and accessible through AI-powered conversations.", - "file_size_limit": "Maximum file size: 50MB per file.", - "upload_limits": "Upload limit: {maxFiles} files, {maxSizeMB}MB total.", - "drop_files": "Drop files here", - "drag_drop": "Drag & drop files here", - "or_browse": "or click to browse", + "file_size_limit": "Maximum file size: {maxMB}MB per file.", + "upload_limits": "Upload files or entire folders", + "drop_files": "Drop files or folders here", + "drag_drop": "Drag & drop files or folders here", + "or_browse": "or click to browse files and folders", "browse_files": "Browse Files", + "browse_folder": "Browse Folder", "selected_files": "Selected Files 
({count})", "total_size": "Total size", "clear_all": "Clear all", @@ -394,13 +395,9 @@ "upload_error_desc": "Error uploading files", "supported_file_types": "Supported File Types", "file_types_desc": "These file types are supported based on your current ETL service configuration.", - "max_files_exceeded": "File Limit Exceeded", - "max_files_exceeded_desc": "You can upload a maximum of {max} files at a time.", - "max_size_exceeded": "Size Limit Exceeded", - "max_size_exceeded_desc": "Total file size cannot exceed {max}MB.", - "file_limit_reached": "Maximum Files Reached", - "file_limit_reached_desc": "Remove some files to add more (max {max} files).", - "remaining_capacity": "{files} files remaining • {sizeMB}MB available" + "file_too_large": "File Too Large", + "file_too_large_desc": "\"{name}\" exceeds the {maxMB}MB per-file limit.", + "no_supported_files_in_folder": "No supported file types found in the selected folder." }, "add_webpage": { "title": "Add Webpages for Crawling", diff --git a/surfsense_web/messages/es.json b/surfsense_web/messages/es.json index 36e627295..7670e76df 100644 --- a/surfsense_web/messages/es.json +++ b/surfsense_web/messages/es.json @@ -376,12 +376,13 @@ "upload_documents": { "title": "Subir documentos", "subtitle": "Sube tus archivos para hacerlos buscables y accesibles a través de conversaciones con IA.", - "file_size_limit": "Tamaño máximo de archivo: 50 MB por archivo.", - "upload_limits": "Límite de subida: {maxFiles} archivos, {maxSizeMB} MB en total.", - "drop_files": "Suelta los archivos aquí", - "drag_drop": "Arrastra y suelta archivos aquí", - "or_browse": "o haz clic para explorar", + "file_size_limit": "Tamaño máximo de archivo: {maxMB} MB por archivo.", + "upload_limits": "Sube archivos o carpetas enteras", + "drop_files": "Suelta archivos o carpetas aquí", + "drag_drop": "Arrastra y suelta archivos o carpetas aquí", + "or_browse": "o haz clic para explorar archivos y carpetas", "browse_files": "Explorar archivos", + 
"browse_folder": "Explorar carpeta", "selected_files": "Archivos seleccionados ({count})", "total_size": "Tamaño total", "clear_all": "Limpiar todo", @@ -394,13 +395,9 @@ "upload_error_desc": "Error al subir archivos", "supported_file_types": "Tipos de archivo soportados", "file_types_desc": "Estos tipos de archivo son soportados según la configuración actual de tu servicio ETL.", - "max_files_exceeded": "Límite de archivos excedido", - "max_files_exceeded_desc": "Puedes subir un máximo de {max} archivos a la vez.", - "max_size_exceeded": "Límite de tamaño excedido", - "max_size_exceeded_desc": "El tamaño total de los archivos no puede exceder {max} MB.", - "file_limit_reached": "Máximo de archivos alcanzado", - "file_limit_reached_desc": "Elimina algunos archivos para agregar más (máximo {max} archivos).", - "remaining_capacity": "{files} archivos restantes • {sizeMB} MB disponibles" + "file_too_large": "Archivo demasiado grande", + "file_too_large_desc": "\"{name}\" excede el límite de {maxMB} MB por archivo.", + "no_supported_files_in_folder": "No se encontraron tipos de archivo compatibles en la carpeta seleccionada." 
}, "add_webpage": { "title": "Agregar páginas web para rastreo", diff --git a/surfsense_web/messages/hi.json b/surfsense_web/messages/hi.json index fd51acdc2..cbcff0b30 100644 --- a/surfsense_web/messages/hi.json +++ b/surfsense_web/messages/hi.json @@ -376,12 +376,13 @@ "upload_documents": { "title": "दस्तावेज़ अपलोड करें", "subtitle": "AI-संचालित बातचीत के माध्यम से अपनी फ़ाइलों को खोजने योग्य और सुलभ बनाने के लिए अपलोड करें।", - "file_size_limit": "अधिकतम फ़ाइल आकार: प्रति फ़ाइल 50MB।", - "upload_limits": "अपलोड सीमा: {maxFiles} फ़ाइलें, कुल {maxSizeMB}MB।", - "drop_files": "फ़ाइलें यहां छोड़ें", - "drag_drop": "फ़ाइलें यहां खींचें और छोड़ें", - "or_browse": "या ब्राउज़ करने के लिए क्लिक करें", + "file_size_limit": "अधिकतम फ़ाइल आकार: प्रति फ़ाइल {maxMB}MB।", + "upload_limits": "फ़ाइलें या पूरे फ़ोल्डर अपलोड करें", + "drop_files": "फ़ाइलें या फ़ोल्डर यहां छोड़ें", + "drag_drop": "फ़ाइलें या फ़ोल्डर यहां खींचें और छोड़ें", + "or_browse": "या फ़ाइलें और फ़ोल्डर ब्राउज़ करने के लिए क्लिक करें", "browse_files": "फ़ाइलें ब्राउज़ करें", + "browse_folder": "फ़ोल्डर ब्राउज़ करें", "selected_files": "चयनित फ़ाइलें ({count})", "total_size": "कुल आकार", "clear_all": "सभी साफ करें", @@ -394,13 +395,9 @@ "upload_error_desc": "फ़ाइलें अपलोड करने में त्रुटि", "supported_file_types": "समर्थित फ़ाइल प्रकार", "file_types_desc": "ये फ़ाइल प्रकार आपकी वर्तमान ETL सेवा कॉन्फ़िगरेशन के आधार पर समर्थित हैं।", - "max_files_exceeded": "फ़ाइल सीमा पार हो गई", - "max_files_exceeded_desc": "आप एक बार में अधिकतम {max} फ़ाइलें अपलोड कर सकते हैं।", - "max_size_exceeded": "आकार सीमा पार हो गई", - "max_size_exceeded_desc": "कुल फ़ाइल आकार {max}MB से अधिक नहीं हो सकता।", - "file_limit_reached": "अधिकतम फ़ाइलें पहुंच गई", - "file_limit_reached_desc": "और जोड़ने के लिए कुछ फ़ाइलें हटाएं (अधिकतम {max} फ़ाइलें)।", - "remaining_capacity": "{files} फ़ाइलें शेष • {sizeMB}MB उपलब्ध" + "file_too_large": "फ़ाइल बहुत बड़ी है", + "file_too_large_desc": "\"{name}\" प्रति फ़ाइल {maxMB}MB की सीमा से अधिक है।", 
+ "no_supported_files_in_folder": "चयनित फ़ोल्डर में कोई समर्थित फ़ाइल प्रकार नहीं मिला।" }, "add_webpage": { "title": "क्रॉलिंग के लिए वेबपेज जोड़ें", diff --git a/surfsense_web/messages/pt.json b/surfsense_web/messages/pt.json index e26499f90..ec72ef0da 100644 --- a/surfsense_web/messages/pt.json +++ b/surfsense_web/messages/pt.json @@ -376,12 +376,13 @@ "upload_documents": { "title": "Enviar documentos", "subtitle": "Envie seus arquivos para torná-los pesquisáveis e acessíveis através de conversas com IA.", - "file_size_limit": "Tamanho máximo do arquivo: 50 MB por arquivo.", - "upload_limits": "Limite de envio: {maxFiles} arquivos, {maxSizeMB} MB no total.", - "drop_files": "Solte os arquivos aqui", - "drag_drop": "Arraste e solte arquivos aqui", - "or_browse": "ou clique para navegar", + "file_size_limit": "Tamanho máximo do arquivo: {maxMB} MB por arquivo.", + "upload_limits": "Envie arquivos ou pastas inteiras", + "drop_files": "Solte arquivos ou pastas aqui", + "drag_drop": "Arraste e solte arquivos ou pastas aqui", + "or_browse": "ou clique para navegar arquivos e pastas", "browse_files": "Navegar arquivos", + "browse_folder": "Navegar pasta", "selected_files": "Arquivos selecionados ({count})", "total_size": "Tamanho total", "clear_all": "Limpar tudo", @@ -394,13 +395,9 @@ "upload_error_desc": "Erro ao enviar arquivos", "supported_file_types": "Tipos de arquivo suportados", "file_types_desc": "Estes tipos de arquivo são suportados com base na configuração atual do seu serviço ETL.", - "max_files_exceeded": "Limite de arquivos excedido", - "max_files_exceeded_desc": "Você pode enviar no máximo {max} arquivos de uma vez.", - "max_size_exceeded": "Limite de tamanho excedido", - "max_size_exceeded_desc": "O tamanho total dos arquivos não pode exceder {max} MB.", - "file_limit_reached": "Máximo de arquivos atingido", - "file_limit_reached_desc": "Remova alguns arquivos para adicionar mais (máximo {max} arquivos).", - "remaining_capacity": "{files} arquivos 
restantes • {sizeMB} MB disponíveis" + "file_too_large": "Arquivo muito grande", + "file_too_large_desc": "\"{name}\" excede o limite de {maxMB} MB por arquivo.", + "no_supported_files_in_folder": "Nenhum tipo de arquivo suportado encontrado na pasta selecionada." }, "add_webpage": { "title": "Adicionar páginas web para rastreamento", diff --git a/surfsense_web/messages/zh.json b/surfsense_web/messages/zh.json index 819432410..db634dfd9 100644 --- a/surfsense_web/messages/zh.json +++ b/surfsense_web/messages/zh.json @@ -360,12 +360,13 @@ "upload_documents": { "title": "上传文档", "subtitle": "上传您的文件,使其可通过 AI 对话进行搜索和访问。", - "file_size_limit": "最大文件大小:每个文件 50MB。", - "upload_limits": "上传限制:最多 {maxFiles} 个文件,总大小不超过 {maxSizeMB}MB。", - "drop_files": "放下文件到这里", - "drag_drop": "拖放文件到这里", - "or_browse": "或点击浏览", + "file_size_limit": "最大文件大小:每个文件 {maxMB}MB。", + "upload_limits": "上传文件或整个文件夹", + "drop_files": "将文件或文件夹拖放到此处", + "drag_drop": "将文件或文件夹拖放到此处", + "or_browse": "或点击浏览文件和文件夹", "browse_files": "浏览文件", + "browse_folder": "浏览文件夹", "selected_files": "已选择的文件 ({count})", "total_size": "总大小", "clear_all": "全部清除", @@ -378,13 +379,9 @@ "upload_error_desc": "上传文件时出错", "supported_file_types": "支持的文件类型", "file_types_desc": "根据您当前的 ETL 服务配置支持这些文件类型。", - "max_files_exceeded": "超过文件数量限制", - "max_files_exceeded_desc": "一次最多只能上传 {max} 个文件。", - "max_size_exceeded": "超过文件大小限制", - "max_size_exceeded_desc": "文件总大小不能超过 {max}MB。", - "file_limit_reached": "已达到最大文件数量", - "file_limit_reached_desc": "移除一些文件以添加更多(最多 {max} 个文件)。", - "remaining_capacity": "剩余 {files} 个文件名额 • 可用 {sizeMB}MB" + "file_too_large": "文件过大", + "file_too_large_desc": "\"{name}\" 超过了每个文件 {maxMB}MB 的限制。", + "no_supported_files_in_folder": "所选文件夹中没有找到支持的文件类型。" }, "add_webpage": { "title": "添加网页爬取",