diff --git a/surfsense_backend/alembic/versions/116_create_zero_publication.py b/surfsense_backend/alembic/versions/116_create_zero_publication.py
index 8f0d7b5d3..ff74952a9 100644
--- a/surfsense_backend/alembic/versions/116_create_zero_publication.py
+++ b/surfsense_backend/alembic/versions/116_create_zero_publication.py
@@ -42,9 +42,7 @@ def upgrade() -> None:
if not exists:
table_list = ", ".join(TABLES)
conn.execute(
- sa.text(
- f"CREATE PUBLICATION {PUBLICATION_NAME} FOR TABLE {table_list}"
- )
+ sa.text(f"CREATE PUBLICATION {PUBLICATION_NAME} FOR TABLE {table_list}")
)
diff --git a/surfsense_backend/alembic/versions/117_optimize_zero_publication_column_lists.py b/surfsense_backend/alembic/versions/117_optimize_zero_publication_column_lists.py
new file mode 100644
index 000000000..3c2d34c76
--- /dev/null
+++ b/surfsense_backend/alembic/versions/117_optimize_zero_publication_column_lists.py
@@ -0,0 +1,102 @@
+"""optimize zero_publication with column lists
+
+Recreates the zero_publication using column lists for the documents
+table so that large text columns (content, source_markdown,
+blocknote_document, etc.) are excluded from WAL replication.
+This prevents RangeError: Invalid string length in zero-cache's
+change-streamer when documents have very large content.
+
+Also resets REPLICA IDENTITY to DEFAULT on tables that had it set
+to FULL for the old Electric SQL setup (migration 66/75/76).
+With DEFAULT (primary-key) identity, column-list publications
+only need to include the PK — not every column.
+
+After running this migration you MUST:
+ 1. Stop zero-cache
+ 2. Delete / reset the zero-cache data volume
+ 3. Restart zero-cache (it will do a fresh initial sync)
+
+Revision ID: 117
+Revises: 116
+"""
+
+from collections.abc import Sequence
+
+import sqlalchemy as sa
+
+from alembic import op
+
+revision: str = "117"
+down_revision: str | None = "116"
+branch_labels: str | Sequence[str] | None = None
+depends_on: str | Sequence[str] | None = None
+
+PUBLICATION_NAME = "zero_publication"
+
+# Tables whose REPLICA IDENTITY is set to DEFAULT by upgrade() and back to
+# FULL by downgrade().
+TABLES_WITH_FULL_IDENTITY = [
+ "documents",
+ "notifications",
+ "search_source_connectors",
+ "new_chat_messages",
+ "chat_comments",
+ "chat_session_state",
+]
+
+# Columns of ``documents`` placed in the publication's column list by
+# upgrade(); upgrade() appends "_0_version" when that column exists.
+DOCUMENT_COLS = [
+ "id",
+ "title",
+ "document_type",
+ "search_space_id",
+ "folder_id",
+ "created_by_id",
+ "status",
+ "created_at",
+ "updated_at",
+]
+
+# Publication DDL without any column list; executed by downgrade().
+PUBLICATION_DDL_FULL = f"""\
+CREATE PUBLICATION {PUBLICATION_NAME} FOR TABLE
+ notifications, documents, folders,
+ search_source_connectors, new_chat_messages,
+ chat_comments, chat_session_state
+"""
+
+
+def upgrade() -> None:
+ """Recreate the publication with a column list for ``documents``.
+
+ Steps, in order:
+ 1. Set REPLICA IDENTITY DEFAULT on every table in
+ ``TABLES_WITH_FULL_IDENTITY``.
+ 2. Drop the existing publication, if any.
+ 3. Recreate it, publishing only ``DOCUMENT_COLS`` (plus ``_0_version``
+ when that column exists) for ``documents``; the other tables are
+ listed without a column list.
+ """
+ conn = op.get_bind()
+
+ for tbl in TABLES_WITH_FULL_IDENTITY:
+ conn.execute(sa.text(f'ALTER TABLE "{tbl}" REPLICA IDENTITY DEFAULT'))
+
+ conn.execute(sa.text(f"DROP PUBLICATION IF EXISTS {PUBLICATION_NAME}"))
+
+ # Probe the relation that the unqualified name ``documents`` resolves to.
+ # Filtering information_schema.columns on table_name alone would also match
+ # a same-named table in another schema and could report a column that the
+ # published table does not have, making CREATE PUBLICATION fail.
+ has_zero_ver = conn.execute(
+ sa.text(
+ "SELECT 1 FROM pg_attribute "
+ "WHERE attrelid = to_regclass('documents') "
+ "AND attname = '_0_version' AND NOT attisdropped"
+ )
+ ).fetchone()
+
+ cols = DOCUMENT_COLS + (['"_0_version"'] if has_zero_ver else [])
+ col_list = ", ".join(cols)
+
+ conn.execute(
+ sa.text(
+ f"CREATE PUBLICATION {PUBLICATION_NAME} FOR TABLE "
+ f"notifications, "
+ f"documents ({col_list}), "
+ f"folders, "
+ f"search_source_connectors, "
+ f"new_chat_messages, "
+ f"chat_comments, "
+ f"chat_session_state"
+ )
+ )
+
+
+def downgrade() -> None:
+ """Undo upgrade(): publish full tables again and restore REPLICA IDENTITY FULL.
+
+ Drops the publication if it exists, recreates it from
+ ``PUBLICATION_DDL_FULL`` (no column lists), then sets REPLICA IDENTITY
+ FULL on every table in ``TABLES_WITH_FULL_IDENTITY``.
+ """
+ conn = op.get_bind()
+ conn.execute(sa.text(f"DROP PUBLICATION IF EXISTS {PUBLICATION_NAME}"))
+ conn.execute(sa.text(PUBLICATION_DDL_FULL))
+ for tbl in TABLES_WITH_FULL_IDENTITY:
+ conn.execute(sa.text(f'ALTER TABLE "{tbl}" REPLICA IDENTITY FULL'))
diff --git a/surfsense_backend/app/agents/new_chat/chat_deepagent.py b/surfsense_backend/app/agents/new_chat/chat_deepagent.py
index ccc06f272..fc1e80d28 100644
--- a/surfsense_backend/app/agents/new_chat/chat_deepagent.py
+++ b/surfsense_backend/app/agents/new_chat/chat_deepagent.py
@@ -159,6 +159,7 @@ async def create_surfsense_deep_agent(
additional_tools: Sequence[BaseTool] | None = None,
firecrawl_api_key: str | None = None,
thread_visibility: ChatVisibility | None = None,
+ mentioned_document_ids: list[int] | None = None,
):
"""
Create a SurfSense deep agent with configurable tools and prompts.
@@ -451,6 +452,7 @@ async def create_surfsense_deep_agent(
search_space_id=search_space_id,
available_connectors=available_connectors,
available_document_types=available_document_types,
+ mentioned_document_ids=mentioned_document_ids,
),
SurfSenseFilesystemMiddleware(
search_space_id=search_space_id,
diff --git a/surfsense_backend/app/agents/new_chat/middleware/filesystem.py b/surfsense_backend/app/agents/new_chat/middleware/filesystem.py
index 41b24f88b..d7697ef15 100644
--- a/surfsense_backend/app/agents/new_chat/middleware/filesystem.py
+++ b/surfsense_backend/app/agents/new_chat/middleware/filesystem.py
@@ -66,6 +66,16 @@ the ``, identify chunks marked `matched="true"`, then use
those sections instead of reading the entire file sequentially.
Use `<chunk id='...'>` values as citation IDs in your answers.
+
+## User-Mentioned Documents
+
+When the `ls` output starts with a `USER-MENTIONED documents` header, the
+user **explicitly selected** the files listed under it. These files are your
+highest-priority sources:
+1. **Always read them thoroughly** — scan the full `<chunk_index>`, then read
+ all major sections, not just matched chunks.
+2. **Prefer their content** over other search results when answering.
+3. **Cite from them first** whenever applicable.
"""
# =============================================================================
diff --git a/surfsense_backend/app/agents/new_chat/middleware/knowledge_search.py b/surfsense_backend/app/agents/new_chat/middleware/knowledge_search.py
index 3728f229c..7b0dd2f71 100644
--- a/surfsense_backend/app/agents/new_chat/middleware/knowledge_search.py
+++ b/surfsense_backend/app/agents/new_chat/middleware/knowledge_search.py
@@ -28,7 +28,13 @@ from sqlalchemy import select
from sqlalchemy.ext.asyncio import AsyncSession
from app.agents.new_chat.utils import parse_date_or_datetime, resolve_date_range
-from app.db import NATIVE_TO_LEGACY_DOCTYPE, Document, Folder, shielded_async_session
+from app.db import (
+ NATIVE_TO_LEGACY_DOCTYPE,
+ Chunk,
+ Document,
+ Folder,
+ shielded_async_session,
+)
from app.retriever.chunks_hybrid_search import ChucksHybridSearchRetriever
from app.utils.document_converters import embed_texts
from app.utils.perf import get_perf_logger
@@ -430,21 +436,36 @@ async def _get_folder_paths(
def _build_synthetic_ls(
existing_files: dict[str, Any] | None,
new_files: dict[str, Any],
+ *,
+ mentioned_paths: set[str] | None = None,
) -> tuple[AIMessage, ToolMessage]:
"""Build a synthetic ls("/documents") tool-call + result for the LLM context.
- Paths are listed with *new* (rank-ordered) files first, then existing files
- that were already in state from prior turns.
+ Mentioned files are listed first. A separate header tells the LLM which
+ files the user explicitly selected; the path list itself stays clean so
+ paths can be passed directly to ``read_file`` without stripping tags.
"""
+ _mentioned = mentioned_paths or set()
merged: dict[str, Any] = {**(existing_files or {}), **new_files}
doc_paths = [
p for p, v in merged.items() if p.startswith("/documents/") and v is not None
]
new_set = set(new_files)
- new_paths = [p for p in doc_paths if p in new_set]
+ mentioned_list = [p for p in doc_paths if p in _mentioned]
+ new_non_mentioned = [p for p in doc_paths if p in new_set and p not in _mentioned]
old_paths = [p for p in doc_paths if p not in new_set]
- ordered = new_paths + old_paths
+ ordered = mentioned_list + new_non_mentioned + old_paths
+
+ parts: list[str] = []
+ if mentioned_list:
+ parts.append(
+ "USER-MENTIONED documents (read these thoroughly before answering):"
+ )
+ for p in mentioned_list:
+ parts.append(f" {p}")
+ parts.append("")
+ parts.append(str(ordered) if ordered else "No documents found.")
tool_call_id = f"auto_ls_{uuid.uuid4().hex[:12]}"
ai_msg = AIMessage(
@@ -452,7 +473,7 @@ def _build_synthetic_ls(
tool_calls=[{"name": "ls", "args": {"path": "/documents"}, "id": tool_call_id}],
)
tool_msg = ToolMessage(
- content=str(ordered) if ordered else "No documents found.",
+ content="\n".join(parts),
tool_call_id=tool_call_id,
)
return ai_msg, tool_msg
@@ -524,12 +545,92 @@ async def search_knowledge_base(
return results[:top_k]
+async def fetch_mentioned_documents(
+ *,
+ document_ids: list[int],
+ search_space_id: int,
+) -> list[dict[str, Any]]:
+ """Fetch explicitly mentioned documents with *all* their chunks.
+
+ Returns the same dict structure as ``search_knowledge_base`` so results
+ can be merged directly into ``build_scoped_filesystem``. Unlike search
+ results, every chunk is included (no top-K limiting) and none are marked
+ as ``matched`` since the entire document is relevant by virtue of the
+ user's explicit mention.
+
+ Args:
+ document_ids: Ids of the documents the user selected. Duplicates are
+ collapsed; the first occurrence fixes the result order.
+ search_space_id: Only documents in this search space are returned;
+ ids that are missing or belong elsewhere are skipped.
+
+ Returns:
+ One result dict per found document, in ``document_ids`` order, each
+ flagged with ``"_user_mentioned": True``. Empty when ``document_ids``
+ is empty or nothing matches.
+ """
+ if not document_ids:
+ return []
+
+ async with shielded_async_session() as session:
+ doc_result = await session.execute(
+ select(Document).where(
+ Document.id.in_(document_ids),
+ Document.search_space_id == search_space_id,
+ )
+ )
+ docs = {doc.id: doc for doc in doc_result.scalars().all()}
+
+ if not docs:
+ return []
+
+ chunk_result = await session.execute(
+ select(Chunk.id, Chunk.content, Chunk.document_id)
+ .where(Chunk.document_id.in_(list(docs.keys())))
+ .order_by(Chunk.document_id, Chunk.id)
+ )
+ chunks_by_doc: dict[int, list[dict[str, Any]]] = {doc_id: [] for doc_id in docs}
+ for row in chunk_result.all():
+ if row.document_id in chunks_by_doc:
+ chunks_by_doc[row.document_id].append(
+ {"chunk_id": row.id, "content": row.content}
+ )
+
+ results: list[dict[str, Any]] = []
+ # dict.fromkeys drops repeated ids while keeping first-seen order, so a
+ # document mentioned twice is not emitted (and later written) twice.
+ for doc_id in dict.fromkeys(document_ids):
+ doc = docs.get(doc_id)
+ if doc is None:
+ continue
+ metadata = doc.document_metadata or {}
+ results.append(
+ {
+ "document_id": doc.id,
+ "content": "",
+ "score": 1.0,
+ "chunks": chunks_by_doc.get(doc.id, []),
+ "matched_chunk_ids": [],
+ "document": {
+ "id": doc.id,
+ "title": doc.title,
+ "document_type": (
+ doc.document_type.value
+ if getattr(doc, "document_type", None)
+ else None
+ ),
+ "metadata": metadata,
+ },
+ "source": (
+ doc.document_type.value
+ if getattr(doc, "document_type", None)
+ else None
+ ),
+ "_user_mentioned": True,
+ }
+ )
+ return results
+
+
async def build_scoped_filesystem(
*,
documents: Sequence[dict[str, Any]],
search_space_id: int,
-) -> dict[str, dict[str, str]]:
- """Build a StateBackend-compatible files dict from search results."""
+) -> tuple[dict[str, dict[str, str]], dict[int, str]]:
+ """Build a StateBackend-compatible files dict from search results.
+
+ Returns ``(files, doc_id_to_path)`` so callers can reliably map a
+ document id back to its filesystem path without guessing by title.
+ Paths are de-duplicated within a single call: when two documents resolve
+ to the same path, the doc-id is appended to the later one.
+ """
async with shielded_async_session() as session:
folder_paths = await _get_folder_paths(session, search_space_id)
doc_ids = [
@@ -551,6 +652,7 @@ async def build_scoped_filesystem(
}
files: dict[str, dict[str, str]] = {}
+ doc_id_to_path: dict[int, str] = {}
for document in documents:
doc_meta = document.get("document") or {}
title = str(doc_meta.get("title") or "untitled")
@@ -559,6 +661,9 @@ async def build_scoped_filesystem(
base_folder = folder_paths.get(folder_id, "/documents")
file_name = _safe_filename(title)
path = f"{base_folder}/{file_name}"
+ if path in files:
+ stem = file_name.removesuffix(".xml")
+ path = f"{base_folder}/{stem} ({doc_id}).xml"
matched_ids = set(document.get("matched_chunk_ids") or [])
xml_content = _build_document_xml(document, matched_chunk_ids=matched_ids)
files[path] = {
@@ -567,7 +672,9 @@ async def build_scoped_filesystem(
"created_at": "",
"modified_at": "",
}
- return files
+ if isinstance(doc_id, int):
+ doc_id_to_path[doc_id] = path
+ return files, doc_id_to_path
class KnowledgeBaseSearchMiddleware(AgentMiddleware): # type: ignore[type-arg]
@@ -583,12 +690,14 @@ class KnowledgeBaseSearchMiddleware(AgentMiddleware): # type: ignore[type-arg]
available_connectors: list[str] | None = None,
available_document_types: list[str] | None = None,
top_k: int = 10,
+ mentioned_document_ids: list[int] | None = None,
) -> None:
self.llm = llm
self.search_space_id = search_space_id
self.available_connectors = available_connectors
self.available_document_types = available_document_types
self.top_k = top_k
+ self.mentioned_document_ids = mentioned_document_ids or []
async def _plan_search_inputs(
self,
@@ -680,6 +789,18 @@ class KnowledgeBaseSearchMiddleware(AgentMiddleware): # type: ignore[type-arg]
user_text=user_text,
)
+ # --- 1. Fetch mentioned documents (user-selected, all chunks) ---
+ mentioned_results: list[dict[str, Any]] = []
+ if self.mentioned_document_ids:
+ mentioned_results = await fetch_mentioned_documents(
+ document_ids=self.mentioned_document_ids,
+ search_space_id=self.search_space_id,
+ )
+ # Clear after first turn so they are not re-fetched on subsequent
+ # messages within the same agent instance.
+ self.mentioned_document_ids = []
+
+ # --- 2. Run KB hybrid search ---
search_results = await search_knowledge_base(
query=planned_query,
search_space_id=self.search_space_id,
@@ -689,19 +810,50 @@ class KnowledgeBaseSearchMiddleware(AgentMiddleware): # type: ignore[type-arg]
start_date=start_date,
end_date=end_date,
)
- new_files = await build_scoped_filesystem(
- documents=search_results,
+
+ # --- 3. Merge: mentioned first, then search (dedup by doc id) ---
+ seen_doc_ids: set[int] = set()
+ merged: list[dict[str, Any]] = []
+ for doc in mentioned_results:
+ doc_id = (doc.get("document") or {}).get("id")
+ if doc_id is not None:
+ seen_doc_ids.add(doc_id)
+ merged.append(doc)
+ for doc in search_results:
+ doc_id = (doc.get("document") or {}).get("id")
+ if doc_id is not None and doc_id in seen_doc_ids:
+ continue
+ merged.append(doc)
+
+ # --- 4. Build scoped filesystem ---
+ new_files, doc_id_to_path = await build_scoped_filesystem(
+ documents=merged,
search_space_id=self.search_space_id,
)
- ai_msg, tool_msg = _build_synthetic_ls(existing_files, new_files)
+ # Identify which paths belong to user-mentioned documents using
+ # the authoritative doc_id -> path mapping (no title guessing).
+ mentioned_doc_ids = {
+ (d.get("document") or {}).get("id") for d in mentioned_results
+ }
+ mentioned_paths = {
+ doc_id_to_path[did] for did in mentioned_doc_ids if did in doc_id_to_path
+ }
+
+ ai_msg, tool_msg = _build_synthetic_ls(
+ existing_files,
+ new_files,
+ mentioned_paths=mentioned_paths,
+ )
if t0 is not None:
_perf_log.info(
- "[kb_fs_middleware] completed in %.3fs query=%r optimized=%r new_files=%d total=%d",
+ "[kb_fs_middleware] completed in %.3fs query=%r optimized=%r "
+ "mentioned=%d new_files=%d total=%d",
asyncio.get_event_loop().time() - t0,
user_text[:80],
planned_query[:120],
+ len(mentioned_results),
len(new_files),
len(new_files) + len(existing_files or {}),
)
diff --git a/surfsense_backend/app/routes/documents_routes.py b/surfsense_backend/app/routes/documents_routes.py
index 6e69218f1..f53c81bb6 100644
--- a/surfsense_backend/app/routes/documents_routes.py
+++ b/surfsense_backend/app/routes/documents_routes.py
@@ -1,7 +1,7 @@
# Force asyncio to use standard event loop before unstructured imports
import asyncio
-from fastapi import APIRouter, Depends, Form, HTTPException, UploadFile
+from fastapi import APIRouter, Depends, Form, HTTPException, Query, UploadFile
from sqlalchemy.ext.asyncio import AsyncSession
from sqlalchemy.future import select
from sqlalchemy.orm import selectinload
@@ -17,6 +17,7 @@ from app.db import (
get_async_session,
)
from app.schemas import (
+ ChunkRead,
DocumentRead,
DocumentsCreate,
DocumentStatusBatchResponse,
@@ -45,9 +46,7 @@ os.environ["UNSTRUCTURED_HAS_PATCHED_LOOP"] = "1"
router = APIRouter()
-MAX_FILES_PER_UPLOAD = 10
-MAX_FILE_SIZE_BYTES = 50 * 1024 * 1024 # 50 MB per file
-MAX_TOTAL_SIZE_BYTES = 200 * 1024 * 1024 # 200 MB total
+MAX_FILE_SIZE_BYTES = 500 * 1024 * 1024 # 500 MB per file
@router.post("/documents")
@@ -156,13 +155,6 @@ async def create_documents_file_upload(
if not files:
raise HTTPException(status_code=400, detail="No files provided")
- if len(files) > MAX_FILES_PER_UPLOAD:
- raise HTTPException(
- status_code=413,
- detail=f"Too many files. Maximum {MAX_FILES_PER_UPLOAD} files per upload.",
- )
-
- total_size = 0
for file in files:
file_size = file.size or 0
if file_size > MAX_FILE_SIZE_BYTES:
@@ -171,14 +163,6 @@ async def create_documents_file_upload(
detail=f"File '{file.filename}' ({file_size / (1024 * 1024):.1f} MB) "
f"exceeds the {MAX_FILE_SIZE_BYTES // (1024 * 1024)} MB per-file limit.",
)
- total_size += file_size
-
- if total_size > MAX_TOTAL_SIZE_BYTES:
- raise HTTPException(
- status_code=413,
- detail=f"Total upload size ({total_size / (1024 * 1024):.1f} MB) "
- f"exceeds the {MAX_TOTAL_SIZE_BYTES // (1024 * 1024)} MB limit.",
- )
# ===== Read all files concurrently to avoid blocking the event loop =====
async def _read_and_save(file: UploadFile) -> tuple[str, str, int]:
@@ -206,16 +190,6 @@ async def create_documents_file_upload(
saved_files = await asyncio.gather(*(_read_and_save(f) for f in files))
- actual_total_size = sum(size for _, _, size in saved_files)
- if actual_total_size > MAX_TOTAL_SIZE_BYTES:
- for temp_path, _, _ in saved_files:
- os.unlink(temp_path)
- raise HTTPException(
- status_code=413,
- detail=f"Total upload size ({actual_total_size / (1024 * 1024):.1f} MB) "
- f"exceeds the {MAX_TOTAL_SIZE_BYTES // (1024 * 1024)} MB limit.",
- )
-
# ===== PHASE 1: Create pending documents for all files =====
created_documents: list[Document] = []
files_to_process: list[tuple[Document, str, str]] = []
@@ -451,13 +425,15 @@ async def read_documents(
reason=doc.status.get("reason"),
)
+ raw_content = doc.content or ""
api_documents.append(
DocumentRead(
id=doc.id,
title=doc.title,
document_type=doc.document_type,
document_metadata=doc.document_metadata,
- content=doc.content,
+ content="",
+ content_preview=raw_content[:300],
content_hash=doc.content_hash,
unique_identifier_hash=doc.unique_identifier_hash,
created_at=doc.created_at,
@@ -609,13 +585,15 @@ async def search_documents(
reason=doc.status.get("reason"),
)
+ raw_content = doc.content or ""
api_documents.append(
DocumentRead(
id=doc.id,
title=doc.title,
document_type=doc.document_type,
document_metadata=doc.document_metadata,
- content=doc.content,
+ content="",
+ content_preview=raw_content[:300],
content_hash=doc.content_hash,
unique_identifier_hash=doc.unique_identifier_hash,
created_at=doc.created_at,
@@ -884,16 +862,19 @@ async def get_document_type_counts(
@router.get("/documents/by-chunk/{chunk_id}", response_model=DocumentWithChunksRead)
async def get_document_by_chunk_id(
chunk_id: int,
+ chunk_window: int = Query(
+ 5, ge=0, description="Number of chunks before/after the cited chunk to include"
+ ),
session: AsyncSession = Depends(get_async_session),
user: User = Depends(current_active_user),
):
"""
- Retrieves a document based on a chunk ID, including all its chunks ordered by creation time.
- Requires DOCUMENTS_READ permission for the search space.
- The document's embedding and chunk embeddings are excluded from the response.
+ Retrieves a document based on a chunk ID, including a window of chunks around the cited one.
+ Uses SQL-level pagination to avoid loading all chunks into memory.
"""
try:
- # First, get the chunk and verify it exists
+ from sqlalchemy import and_, func, or_
+
chunk_result = await session.execute(select(Chunk).filter(Chunk.id == chunk_id))
chunk = chunk_result.scalars().first()
@@ -902,11 +883,8 @@ async def get_document_by_chunk_id(
status_code=404, detail=f"Chunk with id {chunk_id} not found"
)
- # Get the associated document
document_result = await session.execute(
- select(Document)
- .options(selectinload(Document.chunks))
- .filter(Document.id == chunk.document_id)
+ select(Document).filter(Document.id == chunk.document_id)
)
document = document_result.scalars().first()
@@ -916,7 +894,6 @@ async def get_document_by_chunk_id(
detail="Document not found",
)
- # Check permission for the search space
await check_permission(
session,
user,
@@ -925,10 +902,38 @@ async def get_document_by_chunk_id(
"You don't have permission to read documents in this search space",
)
- # Sort chunks by creation time
- sorted_chunks = sorted(document.chunks, key=lambda x: x.created_at)
+ total_result = await session.execute(
+ select(func.count())
+ .select_from(Chunk)
+ .filter(Chunk.document_id == document.id)
+ )
+ total_chunks = total_result.scalar() or 0
+
+ cited_idx_result = await session.execute(
+ select(func.count())
+ .select_from(Chunk)
+ .filter(
+ Chunk.document_id == document.id,
+ or_(
+ Chunk.created_at < chunk.created_at,
+ and_(Chunk.created_at == chunk.created_at, Chunk.id < chunk.id),
+ ),
+ )
+ )
+ cited_idx = cited_idx_result.scalar() or 0
+
+ start = max(0, cited_idx - chunk_window)
+ end = min(total_chunks, cited_idx + chunk_window + 1)
+
+ windowed_result = await session.execute(
+ select(Chunk)
+ .filter(Chunk.document_id == document.id)
+ .order_by(Chunk.created_at, Chunk.id)
+ .offset(start)
+ .limit(end - start)
+ )
+ windowed_chunks = windowed_result.scalars().all()
- # Return the document with its chunks
return DocumentWithChunksRead(
id=document.id,
title=document.title,
@@ -940,7 +945,9 @@ async def get_document_by_chunk_id(
created_at=document.created_at,
updated_at=document.updated_at,
search_space_id=document.search_space_id,
- chunks=sorted_chunks,
+ chunks=windowed_chunks,
+ total_chunks=total_chunks,
+ chunk_start_index=start,
)
except HTTPException:
raise
@@ -950,6 +957,75 @@ async def get_document_by_chunk_id(
) from e
+@router.get(
+ "/documents/{document_id}/chunks",
+ response_model=PaginatedResponse[ChunkRead],
+)
+async def get_document_chunks_paginated(
+ document_id: int,
+ page: int = Query(0, ge=0),
+ page_size: int = Query(20, ge=1, le=100),
+ start_offset: int | None = Query(
+ None, ge=0, description="Direct offset; overrides page * page_size"
+ ),
+ session: AsyncSession = Depends(get_async_session),
+ user: User = Depends(current_active_user),
+):
+ """
+ Paginated chunk loading for a document.
+ Supports both page-based and offset-based access.
+
+ Chunks are ordered by ``created_at`` and then ``id``. The effective
+ offset is ``start_offset`` when supplied, otherwise ``page * page_size``;
+ at most ``page_size`` chunks are returned.
+
+ Raises:
+ HTTPException: 404 when the document does not exist; 500 wrapping any
+ other exception. HTTPExceptions raised inside the body (including
+ any raised by ``check_permission``) are re-raised unchanged.
+ """
+ try:
+ from sqlalchemy import func
+
+ doc_result = await session.execute(
+ select(Document).filter(Document.id == document_id)
+ )
+ document = doc_result.scalars().first()
+
+ if not document:
+ raise HTTPException(status_code=404, detail="Document not found")
+
+ # Permission is checked against the search space that owns the document.
+ await check_permission(
+ session,
+ user,
+ document.search_space_id,
+ Permission.DOCUMENTS_READ.value,
+ "You don't have permission to read documents in this search space",
+ )
+
+ # Total chunk count for the document, independent of the requested page.
+ total_result = await session.execute(
+ select(func.count())
+ .select_from(Chunk)
+ .filter(Chunk.document_id == document_id)
+ )
+ total = total_result.scalar() or 0
+
+ # An explicit start_offset takes precedence over page-based addressing.
+ offset = start_offset if start_offset is not None else page * page_size
+ chunks_result = await session.execute(
+ select(Chunk)
+ .filter(Chunk.document_id == document_id)
+ .order_by(Chunk.created_at, Chunk.id)
+ .offset(offset)
+ .limit(page_size)
+ )
+ chunks = chunks_result.scalars().all()
+
+ # The reported page is derived from the effective offset, so it can
+ # differ from the ``page`` argument when start_offset was used.
+ return PaginatedResponse(
+ items=chunks,
+ total=total,
+ page=offset // page_size if page_size else page,
+ page_size=page_size,
+ has_more=(offset + len(chunks)) < total,
+ )
+ except HTTPException:
+ raise
+ except Exception as e:
+ raise HTTPException(
+ status_code=500, detail=f"Failed to fetch chunks: {e!s}"
+ ) from e
+
+
@router.get("/documents/{document_id}", response_model=DocumentRead)
async def read_document(
document_id: int,
@@ -980,13 +1056,14 @@ async def read_document(
"You don't have permission to read documents in this search space",
)
- # Convert database object to API-friendly format
+ raw_content = document.content or ""
return DocumentRead(
id=document.id,
title=document.title,
document_type=document.document_type,
document_metadata=document.document_metadata,
- content=document.content,
+ content=raw_content,
+ content_preview=raw_content[:300],
content_hash=document.content_hash,
unique_identifier_hash=document.unique_identifier_hash,
created_at=document.created_at,
diff --git a/surfsense_backend/app/routes/editor_routes.py b/surfsense_backend/app/routes/editor_routes.py
index f54f18def..09a35c619 100644
--- a/surfsense_backend/app/routes/editor_routes.py
+++ b/surfsense_backend/app/routes/editor_routes.py
@@ -15,11 +15,10 @@ import pypandoc
import typst
from fastapi import APIRouter, Depends, HTTPException, Query
from fastapi.responses import StreamingResponse
-from sqlalchemy import select
+from sqlalchemy import func, select
from sqlalchemy.ext.asyncio import AsyncSession
-from sqlalchemy.orm import selectinload
-from app.db import Document, DocumentType, Permission, User, get_async_session
+from app.db import Chunk, Document, DocumentType, Permission, User, get_async_session
from app.routes.reports_routes import (
_FILE_EXTENSIONS,
_MEDIA_TYPES,
@@ -44,6 +43,9 @@ router = APIRouter()
async def get_editor_content(
search_space_id: int,
document_id: int,
+ max_length: int | None = Query(
+ None, description="Truncate source_markdown to this many characters"
+ ),
session: AsyncSession = Depends(get_async_session),
user: User = Depends(current_active_user),
):
@@ -65,9 +67,7 @@ async def get_editor_content(
)
result = await session.execute(
- select(Document)
- .options(selectinload(Document.chunks))
- .filter(
+ select(Document).filter(
Document.id == document_id,
Document.search_space_id == search_space_id,
)
@@ -77,62 +77,63 @@ async def get_editor_content(
if not document:
raise HTTPException(status_code=404, detail="Document not found")
- # Priority 1: Return source_markdown if it exists (check `is not None` to allow empty strings)
- if document.source_markdown is not None:
+ count_result = await session.execute(
+ select(func.count()).select_from(Chunk).filter(Chunk.document_id == document_id)
+ )
+ chunk_count = count_result.scalar() or 0
+
+ def _build_response(md: str) -> dict:
+ """Build the editor-content payload for ``md``.
+
+ ``content_size_bytes`` is the UTF-8 size of the full, untruncated
+ markdown. ``source_markdown`` is cut to ``max_length`` characters when
+ a limit was requested and the text is longer; ``truncated`` is True
+ only when characters were actually dropped.
+ """
+ size_bytes = len(md.encode("utf-8"))
+ truncated = False
+ output_md = md
+ # max_length is a character budget, so compare it with the character
+ # count (not the UTF-8 byte size); a negative value is treated as 0
+ # rather than slicing from the end of the text.
+ char_limit = None if max_length is None else max(max_length, 0)
+ if char_limit is not None and len(md) > char_limit:
+ output_md = md[:char_limit]
+ truncated = True
return {
"document_id": document.id,
"title": document.title,
"document_type": document.document_type.value,
- "source_markdown": document.source_markdown,
+ "source_markdown": output_md,
+ "content_size_bytes": size_bytes,
+ "chunk_count": chunk_count,
+ "truncated": truncated,
"updated_at": document.updated_at.isoformat()
if document.updated_at
else None,
}
- # Priority 2: Lazy-migrate from blocknote_document (pure Python, no external deps)
+ if document.source_markdown is not None:
+ return _build_response(document.source_markdown)
+
if document.blocknote_document:
from app.utils.blocknote_to_markdown import blocknote_to_markdown
markdown = blocknote_to_markdown(document.blocknote_document)
if markdown:
- # Persist the migration so we don't repeat it
document.source_markdown = markdown
await session.commit()
- return {
- "document_id": document.id,
- "title": document.title,
- "document_type": document.document_type.value,
- "source_markdown": markdown,
- "updated_at": document.updated_at.isoformat()
- if document.updated_at
- else None,
- }
+ return _build_response(markdown)
- # Priority 3: For NOTE type with no content, return empty markdown
if document.document_type == DocumentType.NOTE:
empty_markdown = ""
document.source_markdown = empty_markdown
await session.commit()
- return {
- "document_id": document.id,
- "title": document.title,
- "document_type": document.document_type.value,
- "source_markdown": empty_markdown,
- "updated_at": document.updated_at.isoformat()
- if document.updated_at
- else None,
- }
+ return _build_response(empty_markdown)
- # Priority 4: Reconstruct from chunks
- chunks = sorted(document.chunks, key=lambda c: c.id)
+ chunk_contents_result = await session.execute(
+ select(Chunk.content)
+ .filter(Chunk.document_id == document_id)
+ .order_by(Chunk.id)
+ )
+ chunk_contents = chunk_contents_result.scalars().all()
- if not chunks:
+ if not chunk_contents:
raise HTTPException(
status_code=400,
detail="This document has no content and cannot be edited. Please re-upload to enable editing.",
)
- markdown_content = "\n\n".join(chunk.content for chunk in chunks)
+ markdown_content = "\n\n".join(chunk_contents)
if not markdown_content.strip():
raise HTTPException(
@@ -140,17 +141,77 @@ async def get_editor_content(
detail="This document has empty content and cannot be edited.",
)
- # Persist the lazy migration
document.source_markdown = markdown_content
await session.commit()
- return {
- "document_id": document.id,
- "title": document.title,
- "document_type": document.document_type.value,
- "source_markdown": markdown_content,
- "updated_at": document.updated_at.isoformat() if document.updated_at else None,
- }
+ return _build_response(markdown_content)
+
+
+@router.get(
+ "/search-spaces/{search_space_id}/documents/{document_id}/download-markdown"
+)
+async def download_document_markdown(
+ search_space_id: int,
+ document_id: int,
+ session: AsyncSession = Depends(get_async_session),
+ user: User = Depends(current_active_user),
+):
+ """
+ Download the full document content as a .md file.
+
+ The markdown is taken from ``source_markdown``; if that is None it is
+ converted from ``blocknote_document``; if still None it is rebuilt by
+ joining the document's chunk contents in chunk-id order.
+
+ Raises:
+ HTTPException: 404 when the document is not in this search space;
+ 400 when no non-blank markdown could be produced.
+ """
+ from urllib.parse import quote
+
+ await check_permission(
+ session,
+ user,
+ search_space_id,
+ Permission.DOCUMENTS_READ.value,
+ "You don't have permission to read documents in this search space",
+ )
+
+ result = await session.execute(
+ select(Document).filter(
+ Document.id == document_id,
+ Document.search_space_id == search_space_id,
+ )
+ )
+ document = result.scalars().first()
+
+ if not document:
+ raise HTTPException(status_code=404, detail="Document not found")
+
+ markdown: str | None = document.source_markdown
+ if markdown is None and document.blocknote_document:
+ from app.utils.blocknote_to_markdown import blocknote_to_markdown
+
+ markdown = blocknote_to_markdown(document.blocknote_document)
+ if markdown is None:
+ chunk_contents_result = await session.execute(
+ select(Chunk.content)
+ .filter(Chunk.document_id == document_id)
+ .order_by(Chunk.id)
+ )
+ chunk_contents = chunk_contents_result.scalars().all()
+ if chunk_contents:
+ markdown = "\n\n".join(chunk_contents)
+
+ if not markdown or not markdown.strip():
+ raise HTTPException(
+ status_code=400, detail="Document has no content to download"
+ )
+
+ safe_title = (
+ "".join(
+ c if c.isalnum() or c in " -_" else "_"
+ for c in (document.title or "document")
+ ).strip()[:80]
+ or "document"
+ )
+ # str.isalnum() accepts non-ASCII letters and digits, but Starlette encodes
+ # header values as latin-1, so such a title would fail while the response
+ # is being built. Send an ASCII-only fallback in ``filename`` and the real
+ # title percent-encoded in ``filename*`` (RFC 5987 / RFC 6266).
+ ascii_title = "".join(c if c.isascii() else "_" for c in safe_title)
+ content_disposition = (
+ f'attachment; filename="{ascii_title}.md"; '
+ f"filename*=UTF-8''{quote(safe_title)}.md"
+ )
+
+ return StreamingResponse(
+ io.BytesIO(markdown.encode("utf-8")),
+ media_type="text/markdown; charset=utf-8",
+ headers={"Content-Disposition": content_disposition},
+ )
@router.post("/search-spaces/{search_space_id}/documents/{document_id}/save")
@@ -258,9 +319,7 @@ async def export_document(
)
result = await session.execute(
- select(Document)
- .options(selectinload(Document.chunks))
- .filter(
+ select(Document).filter(
Document.id == document_id,
Document.search_space_id == search_space_id,
)
@@ -269,16 +328,20 @@ async def export_document(
if not document:
raise HTTPException(status_code=404, detail="Document not found")
- # Resolve markdown content (same priority as editor-content endpoint)
markdown_content: str | None = document.source_markdown
if markdown_content is None and document.blocknote_document:
from app.utils.blocknote_to_markdown import blocknote_to_markdown
markdown_content = blocknote_to_markdown(document.blocknote_document)
if markdown_content is None:
- chunks = sorted(document.chunks, key=lambda c: c.id)
- if chunks:
- markdown_content = "\n\n".join(chunk.content for chunk in chunks)
+ chunk_contents_result = await session.execute(
+ select(Chunk.content)
+ .filter(Chunk.document_id == document_id)
+ .order_by(Chunk.id)
+ )
+ chunk_contents = chunk_contents_result.scalars().all()
+ if chunk_contents:
+ markdown_content = "\n\n".join(chunk_contents)
if not markdown_content or not markdown_content.strip():
raise HTTPException(status_code=400, detail="Document has no content to export")
diff --git a/surfsense_backend/app/schemas/documents.py b/surfsense_backend/app/schemas/documents.py
index c022a09d2..49d2836b2 100644
--- a/surfsense_backend/app/schemas/documents.py
+++ b/surfsense_backend/app/schemas/documents.py
@@ -53,25 +53,26 @@ class DocumentRead(BaseModel):
title: str
document_type: DocumentType
document_metadata: dict
- content: str # Changed to string to match frontend
+ content: str = ""
+ content_preview: str = ""
content_hash: str
unique_identifier_hash: str | None
created_at: datetime
updated_at: datetime | None
search_space_id: int
folder_id: int | None = None
- created_by_id: UUID | None = None # User who created/uploaded this document
+ created_by_id: UUID | None = None
created_by_name: str | None = None
created_by_email: str | None = None
- status: DocumentStatusSchema | None = (
- None # Processing status (ready, processing, failed)
- )
+ status: DocumentStatusSchema | None = None
model_config = ConfigDict(from_attributes=True)
class DocumentWithChunksRead(DocumentRead):
chunks: list[ChunkRead] = []
+ total_chunks: int = 0
+ chunk_start_index: int = 0
model_config = ConfigDict(from_attributes=True)
diff --git a/surfsense_backend/app/tasks/chat/stream_new_chat.py b/surfsense_backend/app/tasks/chat/stream_new_chat.py
index 7c1e3b7ea..5ff907459 100644
--- a/surfsense_backend/app/tasks/chat/stream_new_chat.py
+++ b/surfsense_backend/app/tasks/chat/stream_new_chat.py
@@ -39,7 +39,6 @@ from app.agents.new_chat.llm_config import (
)
from app.db import (
ChatVisibility,
- Document,
NewChatMessage,
NewChatThread,
Report,
@@ -63,74 +62,6 @@ _perf_log = get_perf_logger()
_background_tasks: set[asyncio.Task] = set()
-def format_mentioned_documents_as_context(documents: list[Document]) -> str:
- """
- Format mentioned documents as context for the agent.
-
- Uses the same XML structure as knowledge_base.format_documents_for_context
- to ensure citations work properly with chunk IDs.
- """
- if not documents:
- return ""
-
- context_parts = [""]
- context_parts.append(
- "The user has explicitly mentioned the following documents from their knowledge base. "
- "These documents are directly relevant to the query and should be prioritized as primary sources. "
- "Use [citation:CHUNK_ID] format for citations (e.g., [citation:123])."
- )
- context_parts.append("")
-
- for doc in documents:
- # Build metadata JSON
- metadata = doc.document_metadata or {}
- metadata_json = json.dumps(metadata, ensure_ascii=False)
-
- # Get URL from metadata
- url = (
- metadata.get("url")
- or metadata.get("source")
- or metadata.get("page_url")
- or ""
- )
-
- context_parts.append("")
- context_parts.append("")
- context_parts.append(f" {doc.id} ")
- context_parts.append(
- f" {doc.document_type.value} "
- )
- context_parts.append(f" ")
- context_parts.append(f" ")
- context_parts.append(
- f" "
- )
- context_parts.append(" ")
- context_parts.append("")
- context_parts.append("")
-
- # Use chunks if available (preferred for proper citations)
- if hasattr(doc, "chunks") and doc.chunks:
- for chunk in doc.chunks:
- context_parts.append(
- f" "
- )
- else:
- # Fallback to document content if chunks not loaded
- # Use document ID as chunk ID prefix for consistency
- context_parts.append(
- f" "
- )
-
- context_parts.append(" ")
- context_parts.append(" ")
- context_parts.append("")
-
- context_parts.append(" ")
-
- return "\n".join(context_parts)
-
-
def format_mentioned_surfsense_docs_as_context(
documents: list[SurfsenseDocsDocument],
) -> str:
@@ -1317,6 +1248,7 @@ async def stream_new_chat(
firecrawl_api_key=firecrawl_api_key,
thread_visibility=visibility,
disabled_tools=disabled_tools,
+ mentioned_document_ids=mentioned_document_ids,
)
_perf_log.info(
"[stream_new_chat] Agent created in %.3fs", time.perf_counter() - _t0
@@ -1340,18 +1272,9 @@ async def stream_new_chat(
thread.needs_history_bootstrap = False
await session.commit()
- # Fetch mentioned documents if any (with chunks for proper citations)
- mentioned_documents: list[Document] = []
- if mentioned_document_ids:
- result = await session.execute(
- select(Document)
- .options(selectinload(Document.chunks))
- .filter(
- Document.id.in_(mentioned_document_ids),
- Document.search_space_id == search_space_id,
- )
- )
- mentioned_documents = list(result.scalars().all())
+ # Mentioned KB documents are now handled by KnowledgeBaseSearchMiddleware
+ # which merges them into the scoped filesystem with full document
+ # structure. Only SurfSense docs and report context are inlined here.
# Fetch mentioned SurfSense docs if any
mentioned_surfsense_docs: list[SurfsenseDocsDocument] = []
@@ -1379,15 +1302,10 @@ async def stream_new_chat(
)
recent_reports = list(recent_reports_result.scalars().all())
- # Format the user query with context (mentioned documents + SurfSense docs)
+ # Format the user query with context (SurfSense docs + reports only)
final_query = user_query
context_parts = []
- if mentioned_documents:
- context_parts.append(
- format_mentioned_documents_as_context(mentioned_documents)
- )
-
if mentioned_surfsense_docs:
context_parts.append(
format_mentioned_surfsense_docs_as_context(mentioned_surfsense_docs)
@@ -1479,7 +1397,7 @@ async def stream_new_chat(
yield streaming_service.format_start_step()
# Initial thinking step - analyzing the request
- if mentioned_documents or mentioned_surfsense_docs:
+ if mentioned_surfsense_docs:
initial_title = "Analyzing referenced content"
action_verb = "Analyzing"
else:
@@ -1490,18 +1408,6 @@ async def stream_new_chat(
query_text = user_query[:80] + ("..." if len(user_query) > 80 else "")
processing_parts.append(query_text)
- if mentioned_documents:
- doc_names = []
- for doc in mentioned_documents:
- title = doc.title
- if len(title) > 30:
- title = title[:27] + "..."
- doc_names.append(title)
- if len(doc_names) == 1:
- processing_parts.append(f"[{doc_names[0]}]")
- else:
- processing_parts.append(f"[{len(doc_names)} documents]")
-
if mentioned_surfsense_docs:
doc_names = []
for doc in mentioned_surfsense_docs:
@@ -1527,7 +1433,7 @@ async def stream_new_chat(
# These ORM objects (with eagerly-loaded chunks) can be very large.
# They're only needed to build context strings already copied into
# final_query / langchain_messages — release them before streaming.
- del mentioned_documents, mentioned_surfsense_docs, recent_reports
+ del mentioned_surfsense_docs, recent_reports
del langchain_messages, final_query
# Check if this is the first assistant response so we can generate
diff --git a/surfsense_backend/app/tasks/document_processors/__init__.py b/surfsense_backend/app/tasks/document_processors/__init__.py
index e70c41cb4..2b5690d02 100644
--- a/surfsense_backend/app/tasks/document_processors/__init__.py
+++ b/surfsense_backend/app/tasks/document_processors/__init__.py
@@ -12,16 +12,14 @@ Available processors:
- YouTube processor: Process YouTube videos and extract transcripts
"""
-# URL crawler
# Extension processor
-from .extension_processor import add_extension_received_document
-
-# File processors
-from .file_processors import (
+# File processors (backward-compatible re-exports from _save)
+from ._save import (
add_received_file_document_using_docling,
add_received_file_document_using_llamacloud,
add_received_file_document_using_unstructured,
)
+from .extension_processor import add_extension_received_document
# Markdown processor
from .markdown_processor import add_received_markdown_file_document
@@ -32,9 +30,9 @@ from .youtube_processor import add_youtube_video_document
__all__ = [
# Extension processing
"add_extension_received_document",
+ # File processing with different ETL services
"add_received_file_document_using_docling",
"add_received_file_document_using_llamacloud",
- # File processing with different ETL services
"add_received_file_document_using_unstructured",
# Markdown file processing
"add_received_markdown_file_document",
diff --git a/surfsense_backend/app/tasks/document_processors/_constants.py b/surfsense_backend/app/tasks/document_processors/_constants.py
new file mode 100644
index 000000000..f74d7acce
--- /dev/null
+++ b/surfsense_backend/app/tasks/document_processors/_constants.py
@@ -0,0 +1,74 @@
+"""
+Constants for file document processing.
+
+Centralizes file type classification, LlamaCloud retry configuration,
+and timeout calculation parameters.
+"""
+
+import ssl
+from enum import Enum
+
+import httpx
+
+# ---------------------------------------------------------------------------
+# File type classification
+# ---------------------------------------------------------------------------
+
# Extension groups, lower-case and including the leading dot. Each tuple is
# passed straight to ``str.endswith``, which accepts a tuple of suffixes.
MARKDOWN_EXTENSIONS = (".md", ".markdown", ".txt")
AUDIO_EXTENSIONS = (".mp3", ".mp4", ".mpeg", ".mpga", ".m4a", ".wav", ".webm")
DIRECT_CONVERT_EXTENSIONS = (".csv", ".tsv", ".html", ".htm")


class FileCategory(Enum):
    """Processing category a file is assigned to by ``classify_file``."""

    MARKDOWN = "markdown"
    AUDIO = "audio"
    DIRECT_CONVERT = "direct_convert"
    DOCUMENT = "document"


def classify_file(filename: str) -> FileCategory:
    """Return the processing category implied by ``filename``'s extension.

    Matching is case-insensitive. A name that matches none of the known
    extension groups is classified as ``FileCategory.DOCUMENT``.
    """
    normalized = filename.lower()
    # Checked in this order; the first group whose suffix matches wins.
    routing = (
        (MARKDOWN_EXTENSIONS, FileCategory.MARKDOWN),
        (AUDIO_EXTENSIONS, FileCategory.AUDIO),
        (DIRECT_CONVERT_EXTENSIONS, FileCategory.DIRECT_CONVERT),
    )
    for extensions, category in routing:
        if normalized.endswith(extensions):
            return category
    return FileCategory.DOCUMENT
+
+
+# ---------------------------------------------------------------------------
+# LlamaCloud retry configuration
+# ---------------------------------------------------------------------------
+
+LLAMACLOUD_MAX_RETRIES = 5
+LLAMACLOUD_BASE_DELAY = 10 # seconds (exponential backoff base)
+LLAMACLOUD_MAX_DELAY = 120 # max delay between retries (2 minutes)
+LLAMACLOUD_RETRYABLE_EXCEPTIONS = (
+ ssl.SSLError,
+ httpx.ConnectError,
+ httpx.ConnectTimeout,
+ httpx.ReadError,
+ httpx.ReadTimeout,
+ httpx.WriteError,
+ httpx.WriteTimeout,
+ httpx.RemoteProtocolError,
+ httpx.LocalProtocolError,
+ ConnectionError,
+ ConnectionResetError,
+ TimeoutError,
+ OSError,
+)
+
+# ---------------------------------------------------------------------------
+# Timeout calculation constants
+# ---------------------------------------------------------------------------
+
+UPLOAD_BYTES_PER_SECOND_SLOW = (
+ 100 * 1024
+) # 100 KB/s (conservative for slow connections)
+MIN_UPLOAD_TIMEOUT = 120 # Minimum 2 minutes for any file
+MAX_UPLOAD_TIMEOUT = 1800 # Maximum 30 minutes for very large files
+BASE_JOB_TIMEOUT = 600 # 10 minutes base for job processing
+PER_PAGE_JOB_TIMEOUT = 60 # 1 minute per page for processing
diff --git a/surfsense_backend/app/tasks/document_processors/_direct_converters.py b/surfsense_backend/app/tasks/document_processors/_direct_converters.py
new file mode 100644
index 000000000..b1a69ef4f
--- /dev/null
+++ b/surfsense_backend/app/tasks/document_processors/_direct_converters.py
@@ -0,0 +1,90 @@
+"""
+Lossless file-to-markdown converters for text-based formats.
+
+These converters handle file types that can be faithfully represented as
+markdown without any external ETL/OCR service:
+
+- CSV / TSV → markdown table (stdlib ``csv``)
+- HTML / HTM → markdown (``markdownify``)
+"""
+
+from __future__ import annotations
+
+import csv
+from collections.abc import Callable
+from pathlib import Path
+
+from markdownify import markdownify
+
+# The stdlib csv module defaults to a 128 KB field-size limit which is too
+# small for real-world exports (e.g. chat logs, CRM dumps). We raise it once
+# at import time so every csv.reader call in this module can handle large fields.
+csv.field_size_limit(2**31 - 1)
+
+
def _escape_pipe(cell: str) -> str:
    """Escape literal pipe characters inside a markdown table cell."""
    return cell.replace("|", "\\|")


def _format_cell(cell: str) -> str:
    """Render one CSV field as a single-line markdown table cell.

    Surrounding whitespace is trimmed, pipes are escaped, and embedded line
    breaks (legal inside quoted CSV fields) become ``<br>`` so the cell
    cannot terminate its table row early.
    """
    text = _escape_pipe(cell.strip())
    # Normalise CRLF / CR to LF first so each line break yields one <br>.
    return text.replace("\r\n", "\n").replace("\r", "\n").replace("\n", "<br>")


def csv_to_markdown(file_path: str, *, delimiter: str = ",") -> str:
    """Convert a CSV (or TSV) file to a markdown table.

    The first non-blank row is treated as the header. Blank lines carry no
    record and are skipped. The table is as wide as the widest row, so a
    row with more fields than the header keeps all of them (the header is
    padded with empty cells) and shorter rows are padded on the right.

    Args:
        file_path: Path of the delimited text file, read as UTF-8.
        delimiter: Field separator handed to ``csv.reader``.

    Returns:
        The markdown table, terminated by a newline, or an empty string
        when the file contains no records so the caller can decide how to
        handle it.
    """
    # ``utf-8-sig`` drops the byte-order mark that spreadsheet exports
    # commonly prepend; BOM-less UTF-8 decodes exactly as with ``utf-8``.
    with open(file_path, encoding="utf-8-sig", newline="") as fh:
        # csv.reader yields [] for a blank line; those are not records.
        rows = [row for row in csv.reader(fh, delimiter=delimiter) if row]

    if not rows:
        return ""

    # Size the table to the widest row so no field is ever dropped.
    col_count = max(len(row) for row in rows)

    def render_row(row: list[str]) -> str:
        padded = row + [""] * (col_count - len(row))
        return "| " + " | ".join(_format_cell(c) for c in padded) + " |"

    header, *body = rows
    lines: list[str] = [
        render_row(header),
        "| " + " | ".join(["---"] * col_count) + " |",
    ]
    lines.extend(render_row(row) for row in body)

    return "\n".join(lines) + "\n"
+
+
def tsv_to_markdown(file_path: str) -> str:
    """Render a tab-separated file as a markdown table.

    Thin wrapper around ``csv_to_markdown`` with the delimiter fixed to a
    tab character.
    """
    tab = "\t"
    return csv_to_markdown(file_path, delimiter=tab)
+
+
def html_to_markdown(file_path: str) -> str:
    """Read an HTML file as UTF-8 and return its ``markdownify`` rendering.

    Leading and trailing whitespace is stripped from the converted text.
    """
    source = Path(file_path)
    raw_html = source.read_text(encoding="utf-8")
    converted = markdownify(raw_html)
    return converted.strip()
+
+
# Lower-case file suffix (with leading dot) -> converter taking a file path.
_CONVERTER_MAP: dict[str, Callable[..., str]] = {
    ".csv": csv_to_markdown,
    ".tsv": tsv_to_markdown,
    ".html": html_to_markdown,
    ".htm": html_to_markdown,
}


def convert_file_directly(file_path: str, filename: str) -> str:
    """Convert ``file_path`` to markdown with the converter for ``filename``.

    The converter is selected by the lower-cased suffix of ``filename``
    (not of ``file_path``).

    Raises:
        ValueError: If no converter is registered for that suffix.
    """
    suffix = Path(filename).suffix.lower()
    converter = _CONVERTER_MAP.get(suffix)
    if converter is not None:
        return converter(file_path)
    raise ValueError(
        f"No direct converter for extension '{suffix}' (file: {filename})"
    )
diff --git a/surfsense_backend/app/tasks/document_processors/_etl.py b/surfsense_backend/app/tasks/document_processors/_etl.py
new file mode 100644
index 000000000..cc3a8b1ac
--- /dev/null
+++ b/surfsense_backend/app/tasks/document_processors/_etl.py
@@ -0,0 +1,209 @@
+"""
+ETL parsing strategies for different document processing services.
+
+Provides parse functions for Unstructured, LlamaCloud, and Docling, along with
+LlamaCloud retry logic and dynamic timeout calculations.
+"""
+
+import asyncio
+import logging
+import os
+import random
+import warnings
+from logging import ERROR, getLogger
+
+import httpx
+
+from app.config import config as app_config
+from app.db import Log
+from app.services.task_logging_service import TaskLoggingService
+
+from ._constants import (
+ LLAMACLOUD_BASE_DELAY,
+ LLAMACLOUD_MAX_DELAY,
+ LLAMACLOUD_MAX_RETRIES,
+ LLAMACLOUD_RETRYABLE_EXCEPTIONS,
+ PER_PAGE_JOB_TIMEOUT,
+)
+from ._helpers import calculate_job_timeout, calculate_upload_timeout
+
+# ---------------------------------------------------------------------------
+# LlamaCloud parsing with retry
+# ---------------------------------------------------------------------------
+
+
async def parse_with_llamacloud_retry(
    file_path: str,
    estimated_pages: int,
    task_logger: TaskLoggingService | None = None,
    log_entry: Log | None = None,
):
    """
    Parse ``file_path`` with LlamaCloud, retrying on transient transport errors.

    Upload and job timeouts are derived from the file size and the estimated
    page count. Every attempt uses a fresh ``httpx.AsyncClient``. Errors
    listed in ``LLAMACLOUD_RETRYABLE_EXCEPTIONS`` are retried, for at most
    ``LLAMACLOUD_MAX_RETRIES`` attempts, with exponential backoff capped at
    ``LLAMACLOUD_MAX_DELAY`` and +/-25% random jitter. Any other exception
    propagates immediately.

    Args:
        file_path: Path of the file to parse.
        estimated_pages: Page-count estimate used to size the job timeout.
        task_logger: Optional task logger. When given together with
            ``log_entry``, retry notices go through it instead of the
            ``logging`` module.
        log_entry: Log entry passed to ``task_logger.log_task_progress``.

    Returns:
        LlamaParse result object

    Raises:
        Exception: The last retryable error once every attempt has failed.
    """
    from llama_cloud_services import LlamaParse
    from llama_cloud_services.parse.utils import ResultType

    file_size_bytes = os.path.getsize(file_path)
    file_size_mb = file_size_bytes / (1024 * 1024)

    upload_timeout = calculate_upload_timeout(file_size_bytes)
    job_timeout = calculate_job_timeout(estimated_pages, file_size_bytes)

    # Read and write share the upload timeout; connect and pool are fixed.
    custom_timeout = httpx.Timeout(
        connect=120.0,
        read=upload_timeout,
        write=upload_timeout,
        pool=120.0,
    )

    logging.info(
        f"LlamaCloud upload configured: file_size={file_size_mb:.1f}MB, "
        f"pages={estimated_pages}, upload_timeout={upload_timeout:.0f}s, "
        f"job_timeout={job_timeout:.0f}s"
    )

    last_exception = None
    attempt_errors: list[str] = []

    for attempt in range(1, LLAMACLOUD_MAX_RETRIES + 1):
        try:
            async with httpx.AsyncClient(timeout=custom_timeout) as custom_client:
                parser = LlamaParse(
                    api_key=app_config.LLAMA_CLOUD_API_KEY,
                    num_workers=1,
                    verbose=True,
                    language="en",
                    result_type=ResultType.MD,
                    max_timeout=int(max(2000, job_timeout + upload_timeout)),
                    job_timeout_in_seconds=job_timeout,
                    job_timeout_extra_time_per_page_in_seconds=PER_PAGE_JOB_TIMEOUT,
                    custom_client=custom_client,
                )
                result = await parser.aparse(file_path)

                if attempt > 1:
                    logging.info(
                        f"LlamaCloud upload succeeded on attempt {attempt} after "
                        f"{len(attempt_errors)} failures"
                    )
                return result

        except LLAMACLOUD_RETRYABLE_EXCEPTIONS as e:
            last_exception = e
            error_type = type(e).__name__
            error_msg = str(e)[:200]
            attempt_errors.append(f"Attempt {attempt}: {error_type} - {error_msg}")

            # Final attempt: report the whole failure history and fall
            # through to the raise below the loop.
            if attempt >= LLAMACLOUD_MAX_RETRIES:
                logging.error(
                    f"LlamaCloud upload failed after {LLAMACLOUD_MAX_RETRIES} "
                    f"attempts. File size: {file_size_mb:.1f}MB, "
                    f"Pages: {estimated_pages}. "
                    f"Errors: {'; '.join(attempt_errors)}"
                )
                break

            # Exponential backoff, capped, with +/-25% jitter.
            base_delay = min(
                LLAMACLOUD_BASE_DELAY * (2 ** (attempt - 1)),
                LLAMACLOUD_MAX_DELAY,
            )
            jitter = base_delay * 0.25 * (2 * random.random() - 1)
            delay = base_delay + jitter

            if task_logger and log_entry:
                await task_logger.log_task_progress(
                    log_entry,
                    f"LlamaCloud upload failed "
                    f"(attempt {attempt}/{LLAMACLOUD_MAX_RETRIES}), "
                    f"retrying in {delay:.0f}s",
                    {
                        "error_type": error_type,
                        "error_message": error_msg,
                        "attempt": attempt,
                        "retry_delay": delay,
                        "file_size_mb": round(file_size_mb, 1),
                        "upload_timeout": upload_timeout,
                    },
                )
            else:
                logging.warning(
                    f"LlamaCloud upload failed "
                    f"(attempt {attempt}/{LLAMACLOUD_MAX_RETRIES}): "
                    f"{error_type}. File: {file_size_mb:.1f}MB. "
                    f"Retrying in {delay:.0f}s..."
                )

            await asyncio.sleep(delay)

    raise last_exception or RuntimeError(
        f"LlamaCloud parsing failed after {LLAMACLOUD_MAX_RETRIES} retries. "
        f"File size: {file_size_mb:.1f}MB"
    )
+
+
+# ---------------------------------------------------------------------------
+# Per-service parse functions
+# ---------------------------------------------------------------------------
+
+
async def parse_with_unstructured(file_path: str):
    """
    Load ``file_path`` through ``UnstructuredLoader`` in ``elements`` mode.

    Returns:
        List of LangChain Document elements.
    """
    from langchain_unstructured import UnstructuredLoader

    loader_options = {
        "mode": "elements",
        "post_processors": [],
        "languages": ["eng"],
        "include_orig_elements": False,
        "include_metadata": False,
        "strategy": "auto",
    }
    loader = UnstructuredLoader(file_path, **loader_options)
    elements = await loader.aload()
    return elements
+
+
async def parse_with_docling(file_path: str, filename: str) -> str:
    """
    Run the Docling service wrapper on a file and return its markdown.

    While the document is processed, selected pdfminer warnings are ignored
    and the ``pdfminer`` logger is raised to ``ERROR``; the logger's previous
    level is restored afterwards, even if processing fails.

    Returns:
        Markdown content string.
    """
    from app.services.docling_service import create_docling_service

    docling_service = create_docling_service()

    pdfminer_logger = getLogger("pdfminer")
    original_level = pdfminer_logger.level

    ignored_messages = (
        ".*Cannot set gray non-stroke color.*",
        ".*invalid float value.*",
    )

    with warnings.catch_warnings():
        warnings.filterwarnings("ignore", category=UserWarning, module="pdfminer")
        for pattern in ignored_messages:
            warnings.filterwarnings("ignore", message=pattern)
        pdfminer_logger.setLevel(ERROR)

        try:
            result = await docling_service.process_document(file_path, filename)
        finally:
            pdfminer_logger.setLevel(original_level)

    return result["content"]
diff --git a/surfsense_backend/app/tasks/document_processors/_helpers.py b/surfsense_backend/app/tasks/document_processors/_helpers.py
new file mode 100644
index 000000000..7ac05932c
--- /dev/null
+++ b/surfsense_backend/app/tasks/document_processors/_helpers.py
@@ -0,0 +1,218 @@
+"""
+Document helper functions for deduplication, migration, and connector updates.
+
+Provides reusable logic shared across file processors and ETL strategies.
+"""
+
+import logging
+
+from sqlalchemy.ext.asyncio import AsyncSession
+
+from app.db import Document, DocumentStatus, DocumentType
+from app.utils.document_converters import generate_unique_identifier_hash
+
+from ._constants import (
+ BASE_JOB_TIMEOUT,
+ MAX_UPLOAD_TIMEOUT,
+ MIN_UPLOAD_TIMEOUT,
+ PER_PAGE_JOB_TIMEOUT,
+ UPLOAD_BYTES_PER_SECOND_SLOW,
+)
+from .base import (
+ check_document_by_unique_identifier,
+ check_duplicate_document,
+)
+
+# ---------------------------------------------------------------------------
+# Unique identifier helpers
+# ---------------------------------------------------------------------------
+
+
def get_google_drive_unique_identifier(
    connector: dict | None,
    filename: str,
    search_space_id: int,
) -> tuple[str, str | None]:
    """
    Build the unique-identifier hash(es) used to look up a file's document.

    A Google Drive connector that carries a ``google_drive_file_id`` is
    hashed on that id, with a second filename-based hash returned for
    migration lookups. Everything else is hashed on the filename as a plain
    ``DocumentType.FILE``.

    Returns:
        Tuple of (primary_hash, legacy_hash or None).
        For Google Drive: (file_id-based hash, filename-based hash).
        For other sources: (filename-based hash, None).
    """
    drive_type = DocumentType.GOOGLE_DRIVE_FILE
    file_id = None
    if connector and connector.get("type") == drive_type:
        file_id = connector.get("metadata", {}).get("google_drive_file_id")

    if not file_id:
        # Not a Drive file, or the Drive metadata has no file id.
        fallback_hash = generate_unique_identifier_hash(
            DocumentType.FILE, filename, search_space_id
        )
        return fallback_hash, None

    id_based = generate_unique_identifier_hash(drive_type, file_id, search_space_id)
    name_based = generate_unique_identifier_hash(drive_type, filename, search_space_id)
    return id_based, name_based
+
+
+# ---------------------------------------------------------------------------
+# Document deduplication and migration
+# ---------------------------------------------------------------------------
+
+
async def handle_existing_document_update(
    session: AsyncSession,
    existing_document: Document,
    content_hash: str,
    connector: dict | None,
    filename: str,
    primary_hash: str,
) -> tuple[bool, Document | None]:
    """
    Decide whether an already-stored document needs re-processing.

    The document's ``unique_identifier_hash`` is first aligned with
    ``primary_hash``. If the content hash is unchanged, a Google Drive
    rename (when detected) is applied to the title and metadata and the
    document is returned as-is. If the content changed but another document
    already holds the same content hash, re-processing is skipped.

    Returns:
        Tuple of (should_skip_processing, document_to_return):
        - (True, document): Content unchanged, or identical content already
          exists in another document; return the existing document.
        - (True, None): Identical content exists elsewhere and this document
          was still pending/processing, so it was deleted.
        - (False, None): Content changed, needs re-processing.
    """
    # NOTE(review): this assignment is not committed here unless one of the
    # branches below commits; confirm the caller commits the session.
    if existing_document.unique_identifier_hash != primary_hash:
        existing_document.unique_identifier_hash = primary_hash
        logging.info(f"Migrated document to file_id-based identifier: {filename}")

    if existing_document.content_hash != content_hash:
        # Content has changed: check for a content_hash collision before
        # any expensive ETL processing is started.
        collision_doc = await check_duplicate_document(session, content_hash)
        if not collision_doc or collision_doc.id == existing_document.id:
            logging.info(f"Content changed for file {filename}. Updating document.")
            return False, None

        logging.warning(
            "Content-hash collision for %s: identical content exists in "
            "document #%s (%s). Skipping re-processing.",
            filename,
            collision_doc.id,
            collision_doc.document_type,
        )
        still_in_flight = DocumentStatus.is_state(
            existing_document.status, DocumentStatus.PENDING
        ) or DocumentStatus.is_state(
            existing_document.status, DocumentStatus.PROCESSING
        )
        if not still_in_flight:
            return True, existing_document

        await session.delete(existing_document)
        await session.commit()
        return True, None

    # Content is unchanged from here on.
    if connector and connector.get("type") == DocumentType.GOOGLE_DRIVE_FILE:
        connector_metadata = connector.get("metadata", {})
        new_name = connector_metadata.get("google_drive_file_name")
        doc_metadata = existing_document.document_metadata or {}
        old_name = doc_metadata.get("FILE_NAME") or doc_metadata.get(
            "google_drive_file_name"
        )

        if new_name and old_name and old_name != new_name:
            from sqlalchemy.orm.attributes import flag_modified

            existing_document.title = new_name
            if not existing_document.document_metadata:
                existing_document.document_metadata = {}
            existing_document.document_metadata["FILE_NAME"] = new_name
            existing_document.document_metadata["google_drive_file_name"] = new_name
            # The metadata dict is mutated in place, so flag it explicitly.
            flag_modified(existing_document, "document_metadata")
            await session.commit()
            logging.info(
                f"File renamed in Google Drive: '{old_name}' → '{new_name}' "
                f"(no re-processing needed)"
            )

    logging.info(f"Document for file {filename} unchanged. Skipping.")
    return True, existing_document
+
+
async def find_existing_document_with_migration(
    session: AsyncSession,
    primary_hash: str,
    legacy_hash: str | None,
    content_hash: str | None = None,
) -> Document | None:
    """
    Look up an existing document by successively weaker keys.

    Lookups run in this order and stop at the first hit: the primary
    unique-identifier hash, the legacy (filename-based) hash if one is
    given, then the content hash if one is given.
    """
    found = await check_document_by_unique_identifier(session, primary_hash)
    if found:
        return found

    if legacy_hash:
        found = await check_document_by_unique_identifier(session, legacy_hash)
        if found:
            logging.info(
                "Found legacy document (filename-based hash), "
                "will migrate to file_id-based hash"
            )
            return found

    if content_hash:
        found = await check_duplicate_document(session, content_hash)
        if found:
            logging.info(
                f"Found duplicate content from different source (content_hash match). "
                f"Original document ID: {found.id}, "
                f"type: {found.document_type}"
            )

    return found
+
+
+# ---------------------------------------------------------------------------
+# Connector helpers
+# ---------------------------------------------------------------------------
+
+
async def update_document_from_connector(
    document: Document | None,
    connector: dict | None,
    session: AsyncSession,
) -> None:
    """Copy ``type``, ``metadata`` and ``connector_id`` from ``connector``.

    Does nothing when either ``document`` or ``connector`` is missing/empty.
    Connector metadata is merged over any existing document metadata
    (connector keys win). The session is committed afterwards.
    """
    if not (document and connector):
        return

    if "type" in connector:
        document.document_type = connector["type"]

    if "metadata" in connector:
        incoming = connector["metadata"]
        current = document.document_metadata
        # A fresh dict is assigned when merging rather than mutating in place.
        document.document_metadata = {**current, **incoming} if current else incoming

    if "connector_id" in connector:
        document.connector_id = connector["connector_id"]

    await session.commit()
+
+
+# ---------------------------------------------------------------------------
+# Timeout calculations
+# ---------------------------------------------------------------------------
+
+
def calculate_upload_timeout(file_size_bytes: int) -> float:
    """Return an upload timeout in seconds scaled to the file size.

    The transfer time at ``UPLOAD_BYTES_PER_SECOND_SLOW`` is padded by 50%
    and then clamped to ``[MIN_UPLOAD_TIMEOUT, MAX_UPLOAD_TIMEOUT]``.
    """
    seconds_at_slow_rate = file_size_bytes / UPLOAD_BYTES_PER_SECOND_SLOW
    padded = seconds_at_slow_rate * 1.5
    capped = min(padded, MAX_UPLOAD_TIMEOUT)
    return max(MIN_UPLOAD_TIMEOUT, capped)
+
+
def calculate_job_timeout(estimated_pages: int, file_size_bytes: int) -> float:
    """Return a job timeout in seconds: the larger of two estimates.

    One estimate adds ``PER_PAGE_JOB_TIMEOUT`` per page to
    ``BASE_JOB_TIMEOUT``; the other adds 60 seconds per 10 MiB of file size
    to the same base.
    """
    ten_mib = 10 * 1024 * 1024
    from_pages = BASE_JOB_TIMEOUT + (estimated_pages * PER_PAGE_JOB_TIMEOUT)
    from_size = BASE_JOB_TIMEOUT + (file_size_bytes / ten_mib) * 60
    return max(from_pages, from_size)
diff --git a/surfsense_backend/app/tasks/document_processors/_save.py b/surfsense_backend/app/tasks/document_processors/_save.py
new file mode 100644
index 000000000..5088ad004
--- /dev/null
+++ b/surfsense_backend/app/tasks/document_processors/_save.py
@@ -0,0 +1,285 @@
+"""
+Unified document save/update logic for file processors.
+
+Replaces the three nearly-identical ``add_received_file_document_using_*``
+functions with a single ``save_file_document`` function plus thin wrappers
+for backward compatibility.
+"""
+
+import logging
+
+from langchain_core.documents import Document as LangChainDocument
+from sqlalchemy.exc import SQLAlchemyError
+from sqlalchemy.ext.asyncio import AsyncSession
+
+from app.db import Document, DocumentStatus, DocumentType
+from app.services.llm_service import get_user_long_context_llm
+from app.utils.document_converters import (
+ create_document_chunks,
+ embed_text,
+ generate_content_hash,
+ generate_document_summary,
+)
+
+from ._helpers import (
+ find_existing_document_with_migration,
+ get_google_drive_unique_identifier,
+ handle_existing_document_update,
+)
+from .base import get_current_timestamp, safe_set_chunks
+
+# ---------------------------------------------------------------------------
+# Summary generation
+# ---------------------------------------------------------------------------
+
+
+async def _generate_summary(
+    markdown_content: str,
+    file_name: str,
+    etl_service: str,
+    user_llm,
+    enable_summary: bool,
+) -> tuple[str, list[float]]:
+    """
+    Produce the summary text for a file together with its embedding.
+
+    Three strategies are used:
+
+    - summaries disabled: the file name plus the first 4000 characters of
+      the markdown, embedded with ``embed_text``;
+    - ``etl_service == "DOCLING"``: the Docling service's large-document
+      summary, prefixed with a metadata header and embedded with ``embed_text``;
+    - anything else: the standard ``generate_document_summary`` helper.
+
+    Args:
+        markdown_content: Markdown content of the file
+        file_name: Name of the processed file
+        etl_service: Name of the ETL service that produced the markdown
+        user_llm: LLM handed to the summary helpers
+        enable_summary: Whether to generate an AI summary
+
+    Returns:
+        Tuple of (summary text, summary embedding)
+    """
+    if not enable_summary:
+        preview = f"File: {file_name}\n\n{markdown_content[:4000]}"
+        return preview, embed_text(preview)
+
+    # The same metadata feeds both the standard helper and the Docling header.
+    meta = {
+        "file_name": file_name,
+        "etl_service": etl_service,
+        "document_type": "File Document",
+    }
+
+    if etl_service != "DOCLING":
+        # Standard summary (Unstructured / LlamaCloud / others)
+        return await generate_document_summary(markdown_content, user_llm, meta)
+
+    # Imported here rather than at module level, as in the original code.
+    from app.services.docling_service import create_docling_service
+
+    summary_text = await create_docling_service().process_large_document_summary(
+        content=markdown_content, llm=user_llm, document_title=file_name
+    )
+
+    # One "**Key:** value" line per non-empty metadata entry.
+    header_lines = ["# DOCUMENT METADATA"] + [
+        f"**{key.replace('_', ' ').title()}:** {value}"
+        for key, value in meta.items()
+        if value
+    ]
+    enhanced = "\n".join(header_lines) + "\n\n# DOCUMENT SUMMARY\n\n" + summary_text
+    return enhanced, embed_text(enhanced)
+
+
+# ---------------------------------------------------------------------------
+# Unified save function
+# ---------------------------------------------------------------------------
+
+
+async def save_file_document(
+    session: AsyncSession,
+    file_name: str,
+    markdown_content: str,
+    search_space_id: int,
+    user_id: str,
+    etl_service: str,
+    connector: dict | None = None,
+    enable_summary: bool = True,
+) -> Document | None:
+    """
+    Create or refresh the stored document for a processed file.
+
+    The file is looked up through ``find_existing_document_with_migration``
+    using its identifier hashes and its content hash. When a match exists and
+    ``handle_existing_document_update`` reports that processing can be
+    skipped, that helper's result is returned unchanged. Otherwise a summary,
+    its embedding and the chunks are generated and either written onto the
+    matched document or used to build a new one. This is the single
+    implementation behind the per-ETL-service wrapper functions.
+
+    Args:
+        session: Database session
+        file_name: Name of the processed file
+        markdown_content: Markdown content to store
+        search_space_id: ID of the search space
+        user_id: ID of the user
+        etl_service: Name of the ETL service (UNSTRUCTURED, LLAMACLOUD, DOCLING)
+        connector: Optional connector info for Google Drive files
+        enable_summary: Whether to generate an AI summary
+
+    Returns:
+        The saved Document; the value returned by
+        ``handle_existing_document_update`` when processing is skipped; or
+        None when the commit fails with an error mentioning
+        ``ix_documents_content_hash``.
+
+    Raises:
+        SQLAlchemyError: For any other database error (after rollback).
+        RuntimeError: For any other failure, including a missing long
+            context LLM (after rollback, chained to the original error).
+    """
+    try:
+        primary_hash, legacy_hash = get_google_drive_unique_identifier(
+            connector, file_name, search_space_id
+        )
+        content_hash = generate_content_hash(markdown_content, search_space_id)
+
+        existing = await find_existing_document_with_migration(
+            session, primary_hash, legacy_hash, content_hash
+        )
+        if existing:
+            skip_processing, skipped_result = await handle_existing_document_update(
+                session,
+                existing,
+                content_hash,
+                connector,
+                file_name,
+                primary_hash,
+            )
+            if skip_processing:
+                return skipped_result
+
+        # The LLM is resolved for both the create and the update path.
+        user_llm = await get_user_long_context_llm(session, user_id, search_space_id)
+        if not user_llm:
+            raise RuntimeError(
+                f"No long context LLM configured for user {user_id} "
+                f"in search space {search_space_id}"
+            )
+
+        summary_content, summary_embedding = await _generate_summary(
+            markdown_content, file_name, etl_service, user_llm, enable_summary
+        )
+        chunks = await create_document_chunks(markdown_content)
+        doc_metadata = {"FILE_NAME": file_name, "ETL_SERVICE": etl_service}
+
+        if not existing:
+            # No stored document matched: build a brand-new row.
+            doc_type = (
+                DocumentType.GOOGLE_DRIVE_FILE
+                if connector
+                and connector.get("type") == DocumentType.GOOGLE_DRIVE_FILE
+                else DocumentType.FILE
+            )
+            new_document = Document(
+                search_space_id=search_space_id,
+                title=file_name,
+                document_type=doc_type,
+                document_metadata=doc_metadata,
+                content=summary_content,
+                embedding=summary_embedding,
+                chunks=chunks,
+                content_hash=content_hash,
+                unique_identifier_hash=primary_hash,
+                source_markdown=markdown_content,
+                content_needs_reindexing=False,
+                updated_at=get_current_timestamp(),
+                created_by_id=user_id,
+                connector_id=connector.get("connector_id") if connector else None,
+                status=DocumentStatus.ready(),
+            )
+            session.add(new_document)
+            await session.commit()
+            await session.refresh(new_document)
+            return new_document
+
+        # A stored document matched but must be re-processed: overwrite it.
+        # The assignment order around safe_set_chunks is kept as in the
+        # original code.
+        existing.title = file_name
+        existing.content = summary_content
+        existing.content_hash = content_hash
+        existing.embedding = summary_embedding
+        existing.document_metadata = doc_metadata
+        await safe_set_chunks(session, existing, chunks)
+        existing.source_markdown = markdown_content
+        existing.content_needs_reindexing = False
+        existing.updated_at = get_current_timestamp()
+        existing.status = DocumentStatus.ready()
+
+        await session.commit()
+        await session.refresh(existing)
+        return existing
+
+    except SQLAlchemyError as db_error:
+        await session.rollback()
+        if "ix_documents_content_hash" not in str(db_error):
+            raise db_error
+        # The error text names the content-hash index: treat it as a
+        # duplicate and skip instead of failing.
+        logging.warning(
+            "content_hash collision during commit for %s (%s). Skipping.",
+            file_name,
+            etl_service,
+        )
+        return None
+    except Exception as e:
+        await session.rollback()
+        raise RuntimeError(
+            f"Failed to process file document using {etl_service}: {e!s}"
+        ) from e
+
+
+# ---------------------------------------------------------------------------
+# Backward-compatible wrapper functions
+# ---------------------------------------------------------------------------
+
+
+async def add_received_file_document_using_unstructured(
+    session: AsyncSession,
+    file_name: str,
+    unstructured_processed_elements: list[LangChainDocument],
+    search_space_id: int,
+    user_id: str,
+    connector: dict | None = None,
+    enable_summary: bool = True,
+) -> Document | None:
+    """
+    Store a file parsed by the Unstructured service.
+
+    The parsed elements are converted to markdown and then handed to
+    ``save_file_document`` with ``etl_service="UNSTRUCTURED"``.
+
+    Args:
+        session: Database session
+        file_name: Name of the processed file
+        unstructured_processed_elements: Processed elements from Unstructured
+        search_space_id: ID of the search space
+        user_id: ID of the user
+        connector: Optional connector info for Google Drive files
+        enable_summary: Whether to generate an AI summary
+
+    Returns:
+        Whatever ``save_file_document`` returns
+    """
+    from app.utils.document_converters import convert_document_to_markdown
+
+    markdown = await convert_document_to_markdown(unstructured_processed_elements)
+    return await save_file_document(
+        session=session,
+        file_name=file_name,
+        markdown_content=markdown,
+        search_space_id=search_space_id,
+        user_id=user_id,
+        etl_service="UNSTRUCTURED",
+        connector=connector,
+        enable_summary=enable_summary,
+    )
+
+
+async def add_received_file_document_using_llamacloud(
+    session: AsyncSession,
+    file_name: str,
+    llamacloud_markdown_document: str,
+    search_space_id: int,
+    user_id: str,
+    connector: dict | None = None,
+    enable_summary: bool = True,
+) -> Document | None:
+    """
+    Store markdown content parsed by LlamaCloud.
+
+    Thin wrapper that forwards to ``save_file_document`` with
+    ``etl_service="LLAMACLOUD"``.
+
+    Args:
+        session: Database session
+        file_name: Name of the processed file
+        llamacloud_markdown_document: Markdown content from LlamaCloud parsing
+        search_space_id: ID of the search space
+        user_id: ID of the user
+        connector: Optional connector info for Google Drive files
+        enable_summary: Whether to generate an AI summary
+
+    Returns:
+        Whatever ``save_file_document`` returns
+    """
+    return await save_file_document(
+        session=session,
+        file_name=file_name,
+        markdown_content=llamacloud_markdown_document,
+        search_space_id=search_space_id,
+        user_id=user_id,
+        etl_service="LLAMACLOUD",
+        connector=connector,
+        enable_summary=enable_summary,
+    )
+
+
+async def add_received_file_document_using_docling(
+    session: AsyncSession,
+    file_name: str,
+    docling_markdown_document: str,
+    search_space_id: int,
+    user_id: str,
+    connector: dict | None = None,
+    enable_summary: bool = True,
+) -> Document | None:
+    """
+    Store markdown content parsed by Docling.
+
+    Thin wrapper that forwards to ``save_file_document`` with
+    ``etl_service="DOCLING"``.
+
+    Args:
+        session: Database session
+        file_name: Name of the processed file
+        docling_markdown_document: Markdown content from Docling parsing
+        search_space_id: ID of the search space
+        user_id: ID of the user
+        connector: Optional connector info for Google Drive files
+        enable_summary: Whether to generate an AI summary
+
+    Returns:
+        Whatever ``save_file_document`` returns
+    """
+    return await save_file_document(
+        session=session,
+        file_name=file_name,
+        markdown_content=docling_markdown_document,
+        search_space_id=search_space_id,
+        user_id=user_id,
+        etl_service="DOCLING",
+        connector=connector,
+        enable_summary=enable_summary,
+    )
diff --git a/surfsense_backend/app/tasks/document_processors/file_processors.py b/surfsense_backend/app/tasks/document_processors/file_processors.py
index 6c0ae1870..0c1cad52d 100644
--- a/surfsense_backend/app/tasks/document_processors/file_processors.py
+++ b/surfsense_backend/app/tasks/document_processors/file_processors.py
@@ -1,905 +1,685 @@
"""
-File document processors for different ETL services (Unstructured, LlamaCloud, Docling).
+File document processors orchestrating content extraction and indexing.
+
+This module is the public entry point for file processing. It delegates to
+specialised sub-modules that each own a single concern:
+
+- ``_constants`` — file type classification and configuration constants
+- ``_helpers`` — document deduplication, migration, connector helpers
+- ``_direct_converters`` — lossless file-to-markdown for csv/tsv/html
+- ``_etl`` — ETL parsing strategies (Unstructured, LlamaCloud, Docling)
+- ``_save`` — unified document creation / update logic
"""
-import asyncio
+from __future__ import annotations
+
import contextlib
import logging
-import ssl
-import warnings
+import os
+from dataclasses import dataclass, field
from logging import ERROR, getLogger
-import httpx
from fastapi import HTTPException
-from langchain_core.documents import Document as LangChainDocument
-from litellm import atranscription
-from sqlalchemy.exc import SQLAlchemyError
from sqlalchemy.ext.asyncio import AsyncSession
from app.config import config as app_config
-from app.db import Document, DocumentStatus, DocumentType, Log, Notification
-from app.indexing_pipeline.adapters.file_upload_adapter import UploadDocumentAdapter
-from app.services.llm_service import get_user_long_context_llm
+from app.db import Document, Log, Notification
from app.services.notification_service import NotificationService
from app.services.task_logging_service import TaskLoggingService
-from app.utils.document_converters import (
- convert_document_to_markdown,
- create_document_chunks,
- embed_text,
- generate_content_hash,
- generate_document_summary,
- generate_unique_identifier_hash,
-)
-from .base import (
- check_document_by_unique_identifier,
- check_duplicate_document,
- get_current_timestamp,
- safe_set_chunks,
+from ._constants import FileCategory, classify_file
+from ._direct_converters import convert_file_directly
+from ._etl import (
+ parse_with_docling,
+ parse_with_llamacloud_retry,
+ parse_with_unstructured,
+)
+from ._helpers import update_document_from_connector
+from ._save import (
+ add_received_file_document_using_docling,
+ add_received_file_document_using_llamacloud,
+ add_received_file_document_using_unstructured,
+ save_file_document,
)
from .markdown_processor import add_received_markdown_file_document
-# Constants for LlamaCloud retry configuration
-LLAMACLOUD_MAX_RETRIES = 5 # Increased from 3 for large file resilience
-LLAMACLOUD_BASE_DELAY = 10 # Base delay in seconds for exponential backoff
-LLAMACLOUD_MAX_DELAY = 120 # Maximum delay between retries (2 minutes)
-LLAMACLOUD_RETRYABLE_EXCEPTIONS = (
- ssl.SSLError,
- httpx.ConnectError,
- httpx.ConnectTimeout,
- httpx.ReadTimeout,
- httpx.WriteTimeout,
- httpx.RemoteProtocolError,
- httpx.LocalProtocolError,
- ConnectionError,
- ConnectionResetError,
- TimeoutError,
- OSError, # Catches various network-level errors
-)
-
-# Timeout calculation constants
-UPLOAD_BYTES_PER_SECOND_SLOW = (
- 100 * 1024
-) # 100 KB/s (conservative for slow connections)
-MIN_UPLOAD_TIMEOUT = 120 # Minimum 2 minutes for any file
-MAX_UPLOAD_TIMEOUT = 1800 # Maximum 30 minutes for very large files
-BASE_JOB_TIMEOUT = 600 # 10 minutes base for job processing
-PER_PAGE_JOB_TIMEOUT = 60 # 1 minute per page for processing
+# Re-export public API so existing ``from file_processors import …`` keeps working.
+# The ``add_received_file_document_using_*`` wrappers and ``save_file_document``
+# are imported from ``._save`` above; ``parse_with_llamacloud_retry`` comes from
+# ``._etl``. The two ``process_file_in_background*`` names are not imported
+# above, so they are presumably defined later in this module (TODO confirm).
+__all__ = [
+ "add_received_file_document_using_docling",
+ "add_received_file_document_using_llamacloud",
+ "add_received_file_document_using_unstructured",
+ "parse_with_llamacloud_retry",
+ "process_file_in_background",
+ "process_file_in_background_with_document",
+ "save_file_document",
+]
-def get_google_drive_unique_identifier(
- connector: dict | None,
- filename: str,
- search_space_id: int,
-) -> tuple[str, str | None]:
- """
- Get unique identifier hash for a file, with special handling for Google Drive.
-
- For Google Drive files, uses file_id as the unique identifier (doesn't change on rename).
- For other files, uses filename.
-
- Args:
- connector: Optional connector info dict with type and metadata
- filename: The filename (used for non-Google Drive files or as fallback)
- search_space_id: The search space ID
-
- Returns:
- Tuple of (primary_hash, legacy_hash or None)
- - For Google Drive: (file_id_based_hash, filename_based_hash for migration)
- - For other sources: (filename_based_hash, None)
- """
- if connector and connector.get("type") == DocumentType.GOOGLE_DRIVE_FILE:
- metadata = connector.get("metadata", {})
- file_id = metadata.get("google_drive_file_id")
-
- if file_id:
- # New method: use file_id as unique identifier (doesn't change on rename)
- primary_hash = generate_unique_identifier_hash(
- DocumentType.GOOGLE_DRIVE_FILE, file_id, search_space_id
- )
- # Legacy method: for backward compatibility with existing documents
- # that were indexed with filename-based hash
- legacy_hash = generate_unique_identifier_hash(
- DocumentType.GOOGLE_DRIVE_FILE, filename, search_space_id
- )
- return primary_hash, legacy_hash
-
- # For non-Google Drive files, use filename as before
- primary_hash = generate_unique_identifier_hash(
- DocumentType.FILE, filename, search_space_id
- )
- return primary_hash, None
+# ---------------------------------------------------------------------------
+# Processing context (bundles parameters shared across handler functions)
+# ---------------------------------------------------------------------------
-async def handle_existing_document_update(
- session: AsyncSession,
- existing_document: Document,
- content_hash: str,
- connector: dict | None,
- filename: str,
- primary_hash: str,
-) -> tuple[bool, Document | None]:
- """
- Handle update logic for an existing document.
+@dataclass
+class _ProcessingContext:
+ session: AsyncSession
+ file_path: str
+ filename: str
+ search_space_id: int
+ user_id: str
+ task_logger: TaskLoggingService
+ log_entry: Log
+ connector: dict | None = None
+ notification: Notification | None = None
+ enable_summary: bool = field(init=False)
- Args:
- session: Database session
- existing_document: The existing document found in database
- content_hash: Hash of the new content
- connector: Optional connector info
- filename: Current filename
- primary_hash: The primary hash (file_id based for Google Drive)
-
- Returns:
- Tuple of (should_skip_processing, document_to_return)
- - (True, document): Content unchanged, just return existing document
- - (False, None): Content changed, need to re-process
- """
- # Check if this document needs hash migration (found via legacy hash)
- if existing_document.unique_identifier_hash != primary_hash:
- existing_document.unique_identifier_hash = primary_hash
- logging.info(f"Migrated document to file_id-based identifier: {filename}")
-
- # Check if content has changed
- if existing_document.content_hash == content_hash:
- # Content unchanged - check if we need to update metadata (e.g., filename changed)
- if connector and connector.get("type") == DocumentType.GOOGLE_DRIVE_FILE:
- connector_metadata = connector.get("metadata", {})
- new_name = connector_metadata.get("google_drive_file_name")
- # Check both possible keys for old name (FILE_NAME is used in stored documents)
- doc_metadata = existing_document.document_metadata or {}
- old_name = doc_metadata.get("FILE_NAME") or doc_metadata.get(
- "google_drive_file_name"
- )
-
- if new_name and old_name and old_name != new_name:
- # File was renamed - update title and metadata, skip expensive processing
- from sqlalchemy.orm.attributes import flag_modified
-
- existing_document.title = new_name
- if not existing_document.document_metadata:
- existing_document.document_metadata = {}
- existing_document.document_metadata["FILE_NAME"] = new_name
- existing_document.document_metadata["google_drive_file_name"] = new_name
- flag_modified(existing_document, "document_metadata")
- await session.commit()
- logging.info(
- f"File renamed in Google Drive: '{old_name}' → '{new_name}' (no re-processing needed)"
- )
-
- logging.info(f"Document for file {filename} unchanged. Skipping.")
- return True, existing_document
- else:
- # Content has changed — guard against content_hash collision before
- # expensive ETL processing. A collision means the exact same content
- # already lives in a *different* document (e.g. a manual upload of the
- # same file). Proceeding would trigger a unique-constraint violation
- # on ix_documents_content_hash.
- collision_doc = await check_duplicate_document(session, content_hash)
- if collision_doc and collision_doc.id != existing_document.id:
- logging.warning(
- "Content-hash collision for %s: identical content exists in "
- "document #%s (%s). Skipping re-processing.",
- filename,
- collision_doc.id,
- collision_doc.document_type,
- )
- if DocumentStatus.is_state(
- existing_document.status, DocumentStatus.PENDING
- ) or DocumentStatus.is_state(
- existing_document.status, DocumentStatus.PROCESSING
- ):
- # Pending/processing doc has no real content yet — remove it
- # so the UI doesn't show a contentless entry.
- await session.delete(existing_document)
- await session.commit()
- return True, None
-
- # Document already has valid content — keep it as-is.
- return True, existing_document
-
- logging.info(f"Content changed for file {filename}. Updating document.")
- return False, None
-
-
-async def find_existing_document_with_migration(
- session: AsyncSession,
- primary_hash: str,
- legacy_hash: str | None,
- content_hash: str | None = None,
-) -> Document | None:
- """
- Find existing document, checking both new hash and legacy hash for migration,
- with fallback to content_hash for cross-source deduplication.
-
- Args:
- session: Database session
- primary_hash: The primary hash (file_id based for Google Drive)
- legacy_hash: The legacy hash (filename based) for migration, or None
- content_hash: The content hash for fallback deduplication, or None
-
- Returns:
- Existing document if found, None otherwise
- """
- # First check with primary hash (new method)
- existing_document = await check_document_by_unique_identifier(session, primary_hash)
-
- # If not found and we have a legacy hash, check with that (migration path)
- if not existing_document and legacy_hash:
- existing_document = await check_document_by_unique_identifier(
- session, legacy_hash
- )
- if existing_document:
- logging.info(
- "Found legacy document (filename-based hash), will migrate to file_id-based hash"
- )
-
- # Fallback: check by content_hash to catch duplicates from different sources
- # This prevents unique constraint violations when the same content exists
- # under a different unique_identifier (e.g., manual upload vs Google Drive)
- if not existing_document and content_hash:
- existing_document = await check_duplicate_document(session, content_hash)
- if existing_document:
- logging.info(
- f"Found duplicate content from different source (content_hash match). "
- f"Original document ID: {existing_document.id}, type: {existing_document.document_type}"
- )
-
- return existing_document
-
-
-def calculate_upload_timeout(file_size_bytes: int) -> float:
- """
- Calculate appropriate upload timeout based on file size.
-
- Assumes a conservative slow connection speed to handle worst-case scenarios.
-
- Args:
- file_size_bytes: Size of the file in bytes
-
- Returns:
- Timeout in seconds
- """
- # Calculate time needed at slow connection speed
- # Add 50% buffer for network variability and SSL overhead
- estimated_time = (file_size_bytes / UPLOAD_BYTES_PER_SECOND_SLOW) * 1.5
-
- # Clamp to reasonable bounds
- return max(MIN_UPLOAD_TIMEOUT, min(estimated_time, MAX_UPLOAD_TIMEOUT))
-
-
-def calculate_job_timeout(estimated_pages: int, file_size_bytes: int) -> float:
- """
- Calculate job processing timeout based on page count and file size.
-
- Args:
- estimated_pages: Estimated number of pages
- file_size_bytes: Size of the file in bytes
-
- Returns:
- Timeout in seconds
- """
- # Base timeout + time per page
- page_based_timeout = BASE_JOB_TIMEOUT + (estimated_pages * PER_PAGE_JOB_TIMEOUT)
-
- # Also consider file size (large images take longer to process)
- # ~1 minute per 10MB of file size
- size_based_timeout = BASE_JOB_TIMEOUT + (file_size_bytes / (10 * 1024 * 1024)) * 60
-
- # Use the larger of the two estimates
- return max(page_based_timeout, size_based_timeout)
-
-
-async def parse_with_llamacloud_retry(
- file_path: str,
- estimated_pages: int,
- task_logger: TaskLoggingService | None = None,
- log_entry: Log | None = None,
-):
- """
- Parse a file with LlamaCloud with retry logic for transient SSL/connection errors.
-
- Uses dynamic timeout calculations based on file size and page count to handle
- very large files reliably.
-
- Args:
- file_path: Path to the file to parse
- estimated_pages: Estimated number of pages for timeout calculation
- task_logger: Optional task logger for progress updates
- log_entry: Optional log entry for progress updates
-
- Returns:
- LlamaParse result object
-
- Raises:
- Exception: If all retries fail
- """
- import os
- import random
-
- from llama_cloud_services import LlamaParse
- from llama_cloud_services.parse.utils import ResultType
-
- # Get file size for timeout calculations
- file_size_bytes = os.path.getsize(file_path)
- file_size_mb = file_size_bytes / (1024 * 1024)
-
- # Calculate dynamic timeouts based on file size and page count
- upload_timeout = calculate_upload_timeout(file_size_bytes)
- job_timeout = calculate_job_timeout(estimated_pages, file_size_bytes)
-
- # HTTP client timeouts - scaled based on file size
- # Write timeout is critical for large file uploads
- custom_timeout = httpx.Timeout(
- connect=120.0, # 2 minutes to establish connection (handles slow DNS, etc.)
- read=upload_timeout, # Dynamic based on file size
- write=upload_timeout, # Dynamic based on file size (upload time)
- pool=120.0, # 2 minutes to acquire connection from pool
- )
-
- logging.info(
- f"LlamaCloud upload configured: file_size={file_size_mb:.1f}MB, "
- f"pages={estimated_pages}, upload_timeout={upload_timeout:.0f}s, "
- f"job_timeout={job_timeout:.0f}s"
- )
-
- last_exception = None
- attempt_errors = []
-
- for attempt in range(1, LLAMACLOUD_MAX_RETRIES + 1):
- try:
- # Create a fresh httpx client for each attempt
- async with httpx.AsyncClient(timeout=custom_timeout) as custom_client:
- # Create LlamaParse parser instance with optimized settings
- parser = LlamaParse(
- api_key=app_config.LLAMA_CLOUD_API_KEY,
- num_workers=1, # Use single worker for file processing
- verbose=True,
- language="en",
- result_type=ResultType.MD,
- # Timeout settings for large files
- max_timeout=int(max(2000, job_timeout + upload_timeout)),
- job_timeout_in_seconds=job_timeout,
- job_timeout_extra_time_per_page_in_seconds=PER_PAGE_JOB_TIMEOUT,
- # Use our custom client with larger timeouts
- custom_client=custom_client,
- )
-
- # Parse the file asynchronously
- result = await parser.aparse(file_path)
-
- # Success - log if we had previous failures
- if attempt > 1:
- logging.info(
- f"LlamaCloud upload succeeded on attempt {attempt} after "
- f"{len(attempt_errors)} failures"
- )
-
- return result
-
- except LLAMACLOUD_RETRYABLE_EXCEPTIONS as e:
- last_exception = e
- error_type = type(e).__name__
- error_msg = str(e)[:200]
- attempt_errors.append(f"Attempt {attempt}: {error_type} - {error_msg}")
-
- if attempt < LLAMACLOUD_MAX_RETRIES:
- # Calculate exponential backoff with jitter
- # Base delay doubles each attempt, capped at max delay
- base_delay = min(
- LLAMACLOUD_BASE_DELAY * (2 ** (attempt - 1)), LLAMACLOUD_MAX_DELAY
- )
- # Add random jitter (±25%) to prevent thundering herd
- jitter = base_delay * 0.25 * (2 * random.random() - 1)
- delay = base_delay + jitter
-
- if task_logger and log_entry:
- await task_logger.log_task_progress(
- log_entry,
- f"LlamaCloud upload failed (attempt {attempt}/{LLAMACLOUD_MAX_RETRIES}), retrying in {delay:.0f}s",
- {
- "error_type": error_type,
- "error_message": error_msg,
- "attempt": attempt,
- "retry_delay": delay,
- "file_size_mb": round(file_size_mb, 1),
- "upload_timeout": upload_timeout,
- },
- )
- else:
- logging.warning(
- f"LlamaCloud upload failed (attempt {attempt}/{LLAMACLOUD_MAX_RETRIES}): "
- f"{error_type}. File: {file_size_mb:.1f}MB. Retrying in {delay:.0f}s..."
- )
-
- await asyncio.sleep(delay)
- else:
- logging.error(
- f"LlamaCloud upload failed after {LLAMACLOUD_MAX_RETRIES} attempts. "
- f"File size: {file_size_mb:.1f}MB, Pages: {estimated_pages}. "
- f"Errors: {'; '.join(attempt_errors)}"
- )
-
- except Exception:
- # Non-retryable exception, raise immediately
- raise
-
- # All retries exhausted
- raise last_exception or RuntimeError(
- f"LlamaCloud parsing failed after {LLAMACLOUD_MAX_RETRIES} retries. "
- f"File size: {file_size_mb:.1f}MB"
- )
-
-
-async def add_received_file_document_using_unstructured(
- session: AsyncSession,
- file_name: str,
- unstructured_processed_elements: list[LangChainDocument],
- search_space_id: int,
- user_id: str,
- connector: dict | None = None,
- enable_summary: bool = True,
-) -> Document | None:
- """
- Process and store a file document using Unstructured service.
-
- Args:
- session: Database session
- file_name: Name of the processed file
- unstructured_processed_elements: Processed elements from Unstructured
- search_space_id: ID of the search space
- user_id: ID of the user
- connector: Optional connector info for Google Drive files
-
- Returns:
- Document object if successful, None if failed
- """
- try:
- file_in_markdown = await convert_document_to_markdown(
- unstructured_processed_elements
+ def __post_init__(self) -> None:
+ self.enable_summary = (
+ self.connector.get("enable_summary", True) if self.connector else True
)
- # Generate unique identifier hash (uses file_id for Google Drive, filename for others)
- primary_hash, legacy_hash = get_google_drive_unique_identifier(
- connector, file_name, search_space_id
- )
- # Generate content hash
- content_hash = generate_content_hash(file_in_markdown, search_space_id)
-
- # Check if document exists (with migration support for Google Drive and content_hash fallback)
- existing_document = await find_existing_document_with_migration(
- session, primary_hash, legacy_hash, content_hash
- )
-
- if existing_document:
- # Handle existing document (rename detection, content change check)
- should_skip, doc = await handle_existing_document_update(
- session,
- existing_document,
- content_hash,
- connector,
- file_name,
- primary_hash,
- )
- if should_skip:
- return doc
- # Content changed - continue to update
-
- # Get user's long context LLM (needed for both create and update)
- user_llm = await get_user_long_context_llm(session, user_id, search_space_id)
- if not user_llm:
- raise RuntimeError(
- f"No long context LLM configured for user {user_id} in search space {search_space_id}"
- )
-
- # Generate summary with metadata
- document_metadata = {
- "file_name": file_name,
- "etl_service": "UNSTRUCTURED",
- "document_type": "File Document",
- }
- if enable_summary:
- summary_content, summary_embedding = await generate_document_summary(
- file_in_markdown, user_llm, document_metadata
- )
- else:
- summary_content = f"File: {file_name}\n\n{file_in_markdown[:4000]}"
- summary_embedding = embed_text(summary_content)
-
- # Process chunks
- chunks = await create_document_chunks(file_in_markdown)
-
- # Update or create document
- if existing_document:
- # Update existing document
- existing_document.title = file_name
- existing_document.content = summary_content
- existing_document.content_hash = content_hash
- existing_document.embedding = summary_embedding
- existing_document.document_metadata = {
- "FILE_NAME": file_name,
- "ETL_SERVICE": "UNSTRUCTURED",
- }
- await safe_set_chunks(session, existing_document, chunks)
- existing_document.source_markdown = file_in_markdown
- existing_document.content_needs_reindexing = False
- existing_document.updated_at = get_current_timestamp()
- existing_document.status = DocumentStatus.ready()
-
- await session.commit()
- await session.refresh(existing_document)
- document = existing_document
- else:
- # Create new document
- doc_type = DocumentType.FILE
- if connector and connector.get("type") == DocumentType.GOOGLE_DRIVE_FILE:
- doc_type = DocumentType.GOOGLE_DRIVE_FILE
-
- document = Document(
- search_space_id=search_space_id,
- title=file_name,
- document_type=doc_type,
- document_metadata={
- "FILE_NAME": file_name,
- "ETL_SERVICE": "UNSTRUCTURED",
- },
- content=summary_content,
- embedding=summary_embedding,
- chunks=chunks,
- content_hash=content_hash,
- unique_identifier_hash=primary_hash,
- source_markdown=file_in_markdown,
- content_needs_reindexing=False,
- updated_at=get_current_timestamp(),
- created_by_id=user_id,
- connector_id=connector.get("connector_id") if connector else None,
- status=DocumentStatus.ready(),
- )
-
- session.add(document)
- await session.commit()
- await session.refresh(document)
-
- return document
- except SQLAlchemyError as db_error:
- await session.rollback()
- if "ix_documents_content_hash" in str(db_error):
- logging.warning(
- "content_hash collision during commit for %s (Unstructured). Skipping.",
- file_name,
- )
- return None
- raise db_error
- except Exception as e:
- await session.rollback()
- raise RuntimeError(f"Failed to process file document: {e!s}") from e
+# ---------------------------------------------------------------------------
+# Notification helper
+# ---------------------------------------------------------------------------
-async def add_received_file_document_using_llamacloud(
- session: AsyncSession,
- file_name: str,
- llamacloud_markdown_document: str,
- search_space_id: int,
- user_id: str,
- connector: dict | None = None,
- enable_summary: bool = True,
-) -> Document | None:
- """
- Process and store document content parsed by LlamaCloud.
-
- Args:
- session: Database session
- file_name: Name of the processed file
- llamacloud_markdown_document: Markdown content from LlamaCloud parsing
- search_space_id: ID of the search space
- user_id: ID of the user
- connector: Optional connector info for Google Drive files
-
- Returns:
- Document object if successful, None if failed
- """
- try:
- # Combine all markdown documents into one
- file_in_markdown = llamacloud_markdown_document
-
- # Generate unique identifier hash (uses file_id for Google Drive, filename for others)
- primary_hash, legacy_hash = get_google_drive_unique_identifier(
- connector, file_name, search_space_id
- )
-
- # Generate content hash
- content_hash = generate_content_hash(file_in_markdown, search_space_id)
-
- # Check if document exists (with migration support for Google Drive and content_hash fallback)
- existing_document = await find_existing_document_with_migration(
- session, primary_hash, legacy_hash, content_hash
- )
-
- if existing_document:
- # Handle existing document (rename detection, content change check)
- should_skip, doc = await handle_existing_document_update(
- session,
- existing_document,
- content_hash,
- connector,
- file_name,
- primary_hash,
- )
- if should_skip:
- return doc
- # Content changed - continue to update
-
- # Get user's long context LLM (needed for both create and update)
- user_llm = await get_user_long_context_llm(session, user_id, search_space_id)
- if not user_llm:
- raise RuntimeError(
- f"No long context LLM configured for user {user_id} in search space {search_space_id}"
- )
-
- # Generate summary with metadata
- document_metadata = {
- "file_name": file_name,
- "etl_service": "LLAMACLOUD",
- "document_type": "File Document",
- }
- if enable_summary:
- summary_content, summary_embedding = await generate_document_summary(
- file_in_markdown, user_llm, document_metadata
- )
- else:
- summary_content = f"File: {file_name}\n\n{file_in_markdown[:4000]}"
- summary_embedding = embed_text(summary_content)
-
- # Process chunks
- chunks = await create_document_chunks(file_in_markdown)
-
- # Update or create document
- if existing_document:
- existing_document.title = file_name
- existing_document.content = summary_content
- existing_document.content_hash = content_hash
- existing_document.embedding = summary_embedding
- existing_document.document_metadata = {
- "FILE_NAME": file_name,
- "ETL_SERVICE": "LLAMACLOUD",
- }
- await safe_set_chunks(session, existing_document, chunks)
- existing_document.source_markdown = file_in_markdown
- existing_document.content_needs_reindexing = False
- existing_document.updated_at = get_current_timestamp()
- existing_document.status = DocumentStatus.ready()
-
- await session.commit()
- await session.refresh(existing_document)
- document = existing_document
- else:
- doc_type = DocumentType.FILE
- if connector and connector.get("type") == DocumentType.GOOGLE_DRIVE_FILE:
- doc_type = DocumentType.GOOGLE_DRIVE_FILE
-
- document = Document(
- search_space_id=search_space_id,
- title=file_name,
- document_type=doc_type,
- document_metadata={
- "FILE_NAME": file_name,
- "ETL_SERVICE": "LLAMACLOUD",
- },
- content=summary_content,
- embedding=summary_embedding,
- chunks=chunks,
- content_hash=content_hash,
- unique_identifier_hash=primary_hash,
- source_markdown=file_in_markdown,
- content_needs_reindexing=False,
- updated_at=get_current_timestamp(),
- created_by_id=user_id,
- connector_id=connector.get("connector_id") if connector else None,
- status=DocumentStatus.ready(),
- )
-
- session.add(document)
- await session.commit()
- await session.refresh(document)
-
- return document
- except SQLAlchemyError as db_error:
- await session.rollback()
- if "ix_documents_content_hash" in str(db_error):
- logging.warning(
- "content_hash collision during commit for %s (LlamaCloud). Skipping.",
- file_name,
- )
- return None
- raise db_error
- except Exception as e:
- await session.rollback()
- raise RuntimeError(
- f"Failed to process file document using LlamaCloud: {e!s}"
- ) from e
-
-
-async def add_received_file_document_using_docling(
- session: AsyncSession,
- file_name: str,
- docling_markdown_document: str,
- search_space_id: int,
- user_id: str,
- connector: dict | None = None,
- enable_summary: bool = True,
-) -> Document | None:
- """
- Process and store document content parsed by Docling.
-
- Args:
- session: Database session
- file_name: Name of the processed file
- docling_markdown_document: Markdown content from Docling parsing
- search_space_id: ID of the search space
- user_id: ID of the user
- connector: Optional connector info for Google Drive files
-
- Returns:
- Document object if successful, None if failed
- """
- try:
- file_in_markdown = docling_markdown_document
-
- # Generate unique identifier hash (uses file_id for Google Drive, filename for others)
- primary_hash, legacy_hash = get_google_drive_unique_identifier(
- connector, file_name, search_space_id
- )
-
- # Generate content hash
- content_hash = generate_content_hash(file_in_markdown, search_space_id)
-
- # Check if document exists (with migration support for Google Drive and content_hash fallback)
- existing_document = await find_existing_document_with_migration(
- session, primary_hash, legacy_hash, content_hash
- )
-
- if existing_document:
- # Handle existing document (rename detection, content change check)
- should_skip, doc = await handle_existing_document_update(
- session,
- existing_document,
- content_hash,
- connector,
- file_name,
- primary_hash,
- )
- if should_skip:
- return doc
- # Content changed - continue to update
-
- # Get user's long context LLM (needed for both create and update)
- user_llm = await get_user_long_context_llm(session, user_id, search_space_id)
- if not user_llm:
- raise RuntimeError(
- f"No long context LLM configured for user {user_id} in search_space {search_space_id}"
- )
-
- if enable_summary:
- from app.services.docling_service import create_docling_service
-
- docling_service = create_docling_service()
-
- summary_content = await docling_service.process_large_document_summary(
- content=file_in_markdown, llm=user_llm, document_title=file_name
- )
-
- document_metadata = {
- "file_name": file_name,
- "etl_service": "DOCLING",
- "document_type": "File Document",
- }
- metadata_parts = ["# DOCUMENT METADATA"]
- for key, value in document_metadata.items():
- if value:
- formatted_key = key.replace("_", " ").title()
- metadata_parts.append(f"**{formatted_key}:** {value}")
-
- metadata_section = "\n".join(metadata_parts)
- enhanced_summary_content = (
- f"{metadata_section}\n\n# DOCUMENT SUMMARY\n\n{summary_content}"
- )
- else:
- enhanced_summary_content = f"File: {file_name}\n\n{file_in_markdown[:4000]}"
-
- summary_embedding = embed_text(enhanced_summary_content)
-
- # Process chunks
- chunks = await create_document_chunks(file_in_markdown)
-
- # Update or create document
- if existing_document:
- # Update existing document
- existing_document.title = file_name
- existing_document.content = enhanced_summary_content
- existing_document.content_hash = content_hash
- existing_document.embedding = summary_embedding
- existing_document.document_metadata = {
- "FILE_NAME": file_name,
- "ETL_SERVICE": "DOCLING",
- }
- await safe_set_chunks(session, existing_document, chunks)
- existing_document.source_markdown = file_in_markdown
- existing_document.content_needs_reindexing = False
- existing_document.updated_at = get_current_timestamp()
- existing_document.status = DocumentStatus.ready() # Mark as ready
-
- await session.commit()
- await session.refresh(existing_document)
- document = existing_document
- else:
- # Create new document
- # Determine document type based on connector
- doc_type = DocumentType.FILE
- if connector and connector.get("type") == DocumentType.GOOGLE_DRIVE_FILE:
- doc_type = DocumentType.GOOGLE_DRIVE_FILE
-
- document = Document(
- search_space_id=search_space_id,
- title=file_name,
- document_type=doc_type,
- document_metadata={
- "FILE_NAME": file_name,
- "ETL_SERVICE": "DOCLING",
- },
- content=enhanced_summary_content,
- embedding=summary_embedding,
- chunks=chunks,
- content_hash=content_hash,
- unique_identifier_hash=primary_hash,
- source_markdown=file_in_markdown,
- content_needs_reindexing=False,
- updated_at=get_current_timestamp(),
- created_by_id=user_id,
- connector_id=connector.get("connector_id") if connector else None,
- status=DocumentStatus.ready(), # Mark as ready
- )
-
- session.add(document)
- await session.commit()
- await session.refresh(document)
-
- return document
- except SQLAlchemyError as db_error:
- await session.rollback()
- if "ix_documents_content_hash" in str(db_error):
- logging.warning(
- "content_hash collision during commit for %s (Docling). Skipping.",
- file_name,
- )
- return None
- raise db_error
- except Exception as e:
- await session.rollback()
- raise RuntimeError(
- f"Failed to process file document using Docling: {e!s}"
- ) from e
-
-
-async def _update_document_from_connector(
- document: Document | None, connector: dict | None, session: AsyncSession
+async def _notify(
+ ctx: _ProcessingContext,
+ stage: str,
+ stage_message: str | None = None,
+ **kwargs,
) -> None:
- """Helper to update document type, metadata, and connector_id from connector info."""
- if document and connector:
- if "type" in connector:
- document.document_type = connector["type"]
- if "metadata" in connector:
- # Merge with existing document_metadata (the actual column name)
- if not document.document_metadata:
- document.document_metadata = connector["metadata"]
- else:
- # Expand existing metadata with connector metadata
- merged = {**document.document_metadata, **connector["metadata"]}
- document.document_metadata = merged
- # Set connector_id if provided for de-indexing support
- if "connector_id" in connector:
- document.connector_id = connector["connector_id"]
- await session.commit()
+ """Send a processing-progress notification if one is attached.
+
+ Does nothing when ``ctx.notification`` is falsy. Otherwise forwards
+ ``ctx.session``, ``ctx.notification``, ``stage``, ``stage_message`` and
+ any extra keyword arguments to
+ ``NotificationService.document_processing.notify_processing_progress``.
+
+ Args:
+ ctx: Processing context carrying the session and optional notification.
+ stage: Stage name passed through as ``stage=``.
+ stage_message: Optional message passed through as ``stage_message=``.
+ **kwargs: Extra keyword arguments passed through unchanged
+ (callers in this file use ``chunks_count``).
+ """
+ # Guard clause: uploads without a notification object skip the call entirely.
+ if not ctx.notification:
+ return
+ await NotificationService.document_processing.notify_processing_progress(
+ ctx.session,
+ ctx.notification,
+ stage=stage,
+ stage_message=stage_message,
+ **kwargs,
+ )
+
+
+# ---------------------------------------------------------------------------
+# Page-limit helpers
+# ---------------------------------------------------------------------------
+
+
+def _estimate_pages_safe(page_limit_service, file_path: str) -> int:
+ """Estimate page count with a file-size fallback.
+
+ Returns the value of
+ ``page_limit_service.estimate_pages_before_processing(file_path)``. If that
+ call raises any ``Exception``, the estimate falls back to the file size
+ integer-divided by 80 * 1024 bytes, with a minimum of 1.
+
+ Errors raised by ``os.path.getsize`` in the fallback are not caught here.
+ """
+ try:
+ return page_limit_service.estimate_pages_before_processing(file_path)
+ except Exception:
+ # Fallback heuristic: one page per 80 * 1024 bytes, never less than 1.
+ file_size = os.path.getsize(file_path)
+ return max(1, file_size // (80 * 1024))
+
+
+async def _log_page_divergence(
+ task_logger: TaskLoggingService,
+ log_entry: Log,
+ filename: str,
+ estimated: int,
+ actual: int,
+ final: int,
+) -> None:
+ """Record a progress entry when the actual page count far exceeds the pre-estimate.
+
+ The entry is written through ``task_logger.log_task_progress`` only when
+ ``actual`` is greater than 1.5 times ``estimated``; otherwise nothing is
+ logged.
+
+ Args:
+ task_logger: Logger used to write the progress entry.
+ log_entry: Log entry the progress message is attached to.
+ filename: File name included in the message text.
+ estimated: Page estimate made before processing (``estimated_before``).
+ actual: Page count derived after parsing (``actual_pages``).
+ final: Page count the caller will use (``using_count``).
+ """
+ # Strictly greater than 1.5x; an exact 1.5x ratio does not log.
+ if actual > estimated * 1.5:
+ await task_logger.log_task_progress(
+ log_entry,
+ f"Actual page count higher than estimate: {filename}",
+ {
+ "estimated_before": estimated,
+ "actual_pages": actual,
+ "using_count": final,
+ },
+ )
+
+
+# ===================================================================
+# Handlers for process_file_in_background (legacy / connector path)
+# ===================================================================
+
+
+async def _process_markdown_upload(ctx: _ProcessingContext) -> Document | None:
+ """Read a markdown / text file and create or update a document.
+
+ Reads ``ctx.file_path`` as UTF-8 text, removes the file on a best-effort
+ basis, and passes the text to ``add_received_markdown_file_document``.
+ When ``ctx.connector`` is set, ``update_document_from_connector`` is then
+ called with the result.
+
+ Returns:
+ The value returned by ``add_received_markdown_file_document``. A falsy
+ value is logged as a duplicate (still via ``log_task_success``).
+ """
+ await _notify(ctx, "parsing", "Reading file")
+ await ctx.task_logger.log_task_progress(
+ ctx.log_entry,
+ f"Processing markdown/text file: {ctx.filename}",
+ {"file_type": "markdown", "processing_stage": "reading_file"},
+ )
+
+ with open(ctx.file_path, encoding="utf-8") as f:
+ markdown_content = f.read()
+
+ # Best-effort cleanup: any exception from unlink is suppressed.
+ with contextlib.suppress(Exception):
+ os.unlink(ctx.file_path)
+
+ await _notify(ctx, "chunking")
+ await ctx.task_logger.log_task_progress(
+ ctx.log_entry,
+ f"Creating document from markdown content: {ctx.filename}",
+ {
+ "processing_stage": "creating_document",
+ "content_length": len(markdown_content),
+ },
+ )
+
+ result = await add_received_markdown_file_document(
+ ctx.session,
+ ctx.filename,
+ markdown_content,
+ ctx.search_space_id,
+ ctx.user_id,
+ ctx.connector,
+ )
+ # Called even when result may be None.
+ # NOTE(review): presumably the helper ignores a None document - confirm.
+ if ctx.connector:
+ await update_document_from_connector(result, ctx.connector, ctx.session)
+
+ if result:
+ await ctx.task_logger.log_task_success(
+ ctx.log_entry,
+ f"Successfully processed markdown file: {ctx.filename}",
+ {
+ "document_id": result.id,
+ "content_hash": result.content_hash,
+ "file_type": "markdown",
+ },
+ )
+ else:
+ # A falsy result is reported as a duplicate, not as a failure.
+ await ctx.task_logger.log_task_success(
+ ctx.log_entry,
+ f"Markdown file already exists (duplicate): {ctx.filename}",
+ {"duplicate_detected": True, "file_type": "markdown"},
+ )
+ return result
+
+
+async def _process_direct_convert_upload(ctx: _ProcessingContext) -> Document | None:
+ """Convert a text-based file (csv/tsv/html) to markdown without ETL.
+
+ Calls ``convert_file_directly`` on ``ctx.file_path``, removes the file on
+ a best-effort basis, and stores the converted text through
+ ``add_received_markdown_file_document``. When ``ctx.connector`` is set,
+ ``update_document_from_connector`` is then called with the result.
+
+ Returns:
+ The value returned by ``add_received_markdown_file_document``. A falsy
+ value is logged as a duplicate (still via ``log_task_success``).
+ """
+ await _notify(ctx, "parsing", "Converting file")
+ await ctx.task_logger.log_task_progress(
+ ctx.log_entry,
+ f"Direct-converting file to markdown: {ctx.filename}",
+ {"file_type": "direct_convert", "processing_stage": "converting"},
+ )
+
+ # Conversion runs before cleanup, so the file still exists at this point.
+ markdown_content = convert_file_directly(ctx.file_path, ctx.filename)
+
+ # Best-effort cleanup: any exception from unlink is suppressed.
+ with contextlib.suppress(Exception):
+ os.unlink(ctx.file_path)
+
+ await _notify(ctx, "chunking")
+ await ctx.task_logger.log_task_progress(
+ ctx.log_entry,
+ f"Creating document from converted content: {ctx.filename}",
+ {
+ "processing_stage": "creating_document",
+ "content_length": len(markdown_content),
+ },
+ )
+
+ result = await add_received_markdown_file_document(
+ ctx.session,
+ ctx.filename,
+ markdown_content,
+ ctx.search_space_id,
+ ctx.user_id,
+ ctx.connector,
+ )
+ # Called even when result may be None.
+ # NOTE(review): presumably the helper ignores a None document - confirm.
+ if ctx.connector:
+ await update_document_from_connector(result, ctx.connector, ctx.session)
+
+ if result:
+ await ctx.task_logger.log_task_success(
+ ctx.log_entry,
+ f"Successfully direct-converted file: {ctx.filename}",
+ {
+ "document_id": result.id,
+ "content_hash": result.content_hash,
+ "file_type": "direct_convert",
+ },
+ )
+ else:
+ # A falsy result is reported as a duplicate, not as a failure.
+ await ctx.task_logger.log_task_success(
+ ctx.log_entry,
+ f"Direct-converted file already exists (duplicate): {ctx.filename}",
+ {"duplicate_detected": True, "file_type": "direct_convert"},
+ )
+ return result
+
+
+async def _process_audio_upload(ctx: _ProcessingContext) -> Document | None:
+ """Transcribe an audio file and create or update a document.
+
+ Uses the local ``stt_service`` when ``app_config.STT_SERVICE`` starts with
+ ``"local/"``, otherwise ``litellm.atranscription``. The transcript is
+ prefixed with a ``# Transcription of <filename>`` heading and stored
+ through ``add_received_markdown_file_document``.
+
+ Returns:
+ The value returned by ``add_received_markdown_file_document``. A falsy
+ value is logged as a duplicate (still via ``log_task_success``).
+
+ Raises:
+ HTTPException: Status 422 when the local transcription raises or
+ returns empty text.
+ ValueError: When the external transcription returns empty text.
+ """
+ await _notify(ctx, "parsing", "Transcribing audio")
+ await ctx.task_logger.log_task_progress(
+ ctx.log_entry,
+ f"Processing audio file for transcription: {ctx.filename}",
+ {"file_type": "audio", "processing_stage": "starting_transcription"},
+ )
+
+ # "local" only when STT_SERVICE is set and carries the "local/" prefix.
+ stt_service_type = (
+ "local"
+ if app_config.STT_SERVICE and app_config.STT_SERVICE.startswith("local/")
+ else "external"
+ )
+
+ if stt_service_type == "local":
+ from app.services.stt_service import stt_service
+
+ try:
+ # NOTE(review): transcribe_file is called without await, so it
+ # presumably runs synchronously inside this coroutine - confirm.
+ stt_result = stt_service.transcribe_file(ctx.file_path)
+ transcribed_text = stt_result.get("text", "")
+ if not transcribed_text:
+ raise ValueError("Transcription returned empty text")
+ transcribed_text = (
+ f"# Transcription of {ctx.filename}\n\n{transcribed_text}"
+ )
+ except Exception as e:
+ # Any local failure, including the empty-text ValueError above,
+ # is re-raised as HTTP 422.
+ raise HTTPException(
+ status_code=422,
+ detail=f"Failed to transcribe audio file {ctx.filename}: {e!s}",
+ ) from e
+
+ await ctx.task_logger.log_task_progress(
+ ctx.log_entry,
+ f"Local STT transcription completed: {ctx.filename}",
+ {
+ "processing_stage": "local_transcription_complete",
+ "language": stt_result.get("language"),
+ "confidence": stt_result.get("language_probability"),
+ "duration": stt_result.get("duration"),
+ },
+ )
+ else:
+ from litellm import atranscription
+
+ with open(ctx.file_path, "rb") as audio_file:
+ transcription_kwargs: dict = {
+ "model": app_config.STT_SERVICE,
+ "file": audio_file,
+ "api_key": app_config.STT_SERVICE_API_KEY,
+ }
+ # api_base is only sent when configured.
+ if app_config.STT_SERVICE_API_BASE:
+ transcription_kwargs["api_base"] = app_config.STT_SERVICE_API_BASE
+
+ transcription_response = await atranscription(**transcription_kwargs)
+ transcribed_text = transcription_response.get("text", "")
+ # Unlike the local path, this ValueError is not wrapped in HTTPException.
+ if not transcribed_text:
+ raise ValueError("Transcription returned empty text")
+
+ transcribed_text = f"# Transcription of {ctx.filename}\n\n{transcribed_text}"
+
+ await ctx.task_logger.log_task_progress(
+ ctx.log_entry,
+ f"Transcription completed, creating document: {ctx.filename}",
+ {
+ "processing_stage": "transcription_complete",
+ "transcript_length": len(transcribed_text),
+ },
+ )
+
+ await _notify(ctx, "chunking")
+
+ # Best-effort cleanup: any exception from unlink is suppressed.
+ with contextlib.suppress(Exception):
+ os.unlink(ctx.file_path)
+
+ result = await add_received_markdown_file_document(
+ ctx.session,
+ ctx.filename,
+ transcribed_text,
+ ctx.search_space_id,
+ ctx.user_id,
+ ctx.connector,
+ )
+ # Called even when result may be None.
+ # NOTE(review): presumably the helper ignores a None document - confirm.
+ if ctx.connector:
+ await update_document_from_connector(result, ctx.connector, ctx.session)
+
+ if result:
+ await ctx.task_logger.log_task_success(
+ ctx.log_entry,
+ f"Successfully transcribed and processed audio file: {ctx.filename}",
+ {
+ "document_id": result.id,
+ "content_hash": result.content_hash,
+ "file_type": "audio",
+ "transcript_length": len(transcribed_text),
+ "stt_service": stt_service_type,
+ },
+ )
+ else:
+ # A falsy result is reported as a duplicate, not as a failure.
+ await ctx.task_logger.log_task_success(
+ ctx.log_entry,
+ f"Audio file transcript already exists (duplicate): {ctx.filename}",
+ {"duplicate_detected": True, "file_type": "audio"},
+ )
+ return result
+
+
+# ---------------------------------------------------------------------------
+# Document file processing (ETL service dispatch)
+# ---------------------------------------------------------------------------
+
+
+async def _etl_unstructured(
+ ctx: _ProcessingContext,
+ page_limit_service,
+ estimated_pages: int,
+) -> Document | None:
+ """Parse and save via the Unstructured ETL service.
+
+ Parses ``ctx.file_path`` with ``parse_with_unstructured``, derives a page
+ count from the parsed elements, removes the file on a best-effort basis,
+ and stores the result through
+ ``add_received_file_document_using_unstructured``.
+
+ Args:
+ ctx: Processing context for this upload.
+ page_limit_service: Service used to estimate pages and record usage.
+ estimated_pages: Page estimate made before parsing.
+
+ Returns:
+ The value returned by
+ ``add_received_file_document_using_unstructured``. A falsy value is
+ logged as a duplicate and no page usage is recorded.
+ """
+ await _notify(ctx, "parsing", "Extracting content")
+ await ctx.task_logger.log_task_progress(
+ ctx.log_entry,
+ f"Processing file with Unstructured ETL: {ctx.filename}",
+ {
+ "file_type": "document",
+ "etl_service": "UNSTRUCTURED",
+ "processing_stage": "loading",
+ },
+ )
+
+ docs = await parse_with_unstructured(ctx.file_path)
+
+ await _notify(ctx, "chunking", chunks_count=len(docs))
+ await ctx.task_logger.log_task_progress(
+ ctx.log_entry,
+ f"Unstructured ETL completed, creating document: {ctx.filename}",
+ {"processing_stage": "etl_complete", "elements_count": len(docs)},
+ )
+
+ # Use the larger of the pre-estimate and the post-parse count.
+ actual_pages = page_limit_service.estimate_pages_from_elements(docs)
+ final_pages = max(estimated_pages, actual_pages)
+ await _log_page_divergence(
+ ctx.task_logger,
+ ctx.log_entry,
+ ctx.filename,
+ estimated_pages,
+ actual_pages,
+ final_pages,
+ )
+
+ # Best-effort cleanup: any exception from unlink is suppressed.
+ with contextlib.suppress(Exception):
+ os.unlink(ctx.file_path)
+
+ result = await add_received_file_document_using_unstructured(
+ ctx.session,
+ ctx.filename,
+ docs,
+ ctx.search_space_id,
+ ctx.user_id,
+ ctx.connector,
+ enable_summary=ctx.enable_summary,
+ )
+ # Called even when result may be None.
+ # NOTE(review): presumably the helper ignores a None document - confirm.
+ if ctx.connector:
+ await update_document_from_connector(result, ctx.connector, ctx.session)
+
+ if result:
+ # Page usage is recorded only when a document was produced.
+ # NOTE(review): allow_exceed=True presumably lets usage pass the limit
+ # after the pre-check - confirm against PageLimitService.
+ await page_limit_service.update_page_usage(
+ ctx.user_id, final_pages, allow_exceed=True
+ )
+ await ctx.task_logger.log_task_success(
+ ctx.log_entry,
+ f"Successfully processed file with Unstructured: {ctx.filename}",
+ {
+ "document_id": result.id,
+ "content_hash": result.content_hash,
+ "file_type": "document",
+ "etl_service": "UNSTRUCTURED",
+ "pages_processed": final_pages,
+ },
+ )
+ else:
+ # A falsy result is reported as a duplicate, not as a failure.
+ await ctx.task_logger.log_task_success(
+ ctx.log_entry,
+ f"Document already exists (duplicate): {ctx.filename}",
+ {
+ "duplicate_detected": True,
+ "file_type": "document",
+ "etl_service": "UNSTRUCTURED",
+ },
+ )
+ return result
+
+
+async def _etl_llamacloud(
+ ctx: _ProcessingContext,
+ page_limit_service,
+ estimated_pages: int,
+) -> Document | None:
+ """Parse and save via the LlamaCloud ETL service.
+
+ Parses ``ctx.file_path`` with ``parse_with_llamacloud_retry``, removes the
+ file on a best-effort basis, then stores each returned markdown document
+ through ``add_received_file_document_using_llamacloud`` under the same
+ ``ctx.filename``.
+
+ Args:
+ ctx: Processing context for this upload.
+ page_limit_service: Service used to estimate pages and record usage.
+ estimated_pages: Page estimate made before parsing.
+
+ Returns:
+ The last truthy document returned while storing, or ``None`` when
+ every store call returned a falsy value (logged as a duplicate).
+
+ Raises:
+ ValueError: When LlamaCloud returns no markdown documents.
+ """
+ await _notify(ctx, "parsing", "Extracting content")
+ await ctx.task_logger.log_task_progress(
+ ctx.log_entry,
+ f"Processing file with LlamaCloud ETL: {ctx.filename}",
+ {
+ "file_type": "document",
+ "etl_service": "LLAMACLOUD",
+ "processing_stage": "parsing",
+ "estimated_pages": estimated_pages,
+ },
+ )
+
+ raw_result = await parse_with_llamacloud_retry(
+ file_path=ctx.file_path,
+ estimated_pages=estimated_pages,
+ task_logger=ctx.task_logger,
+ log_entry=ctx.log_entry,
+ )
+
+ # The file is removed before the markdown is extracted from raw_result.
+ with contextlib.suppress(Exception):
+ os.unlink(ctx.file_path)
+
+ markdown_documents = await raw_result.aget_markdown_documents(split_by_page=False)
+
+ await _notify(ctx, "chunking", chunks_count=len(markdown_documents))
+ await ctx.task_logger.log_task_progress(
+ ctx.log_entry,
+ f"LlamaCloud parsing completed, creating documents: {ctx.filename}",
+ {
+ "processing_stage": "parsing_complete",
+ "documents_count": len(markdown_documents),
+ },
+ )
+
+ # An empty result is logged as a failure and then raised.
+ if not markdown_documents:
+ await ctx.task_logger.log_task_failure(
+ ctx.log_entry,
+ f"LlamaCloud parsing returned no documents: {ctx.filename}",
+ "ETL service returned empty document list",
+ {"error_type": "EmptyDocumentList", "etl_service": "LLAMACLOUD"},
+ )
+ raise ValueError(f"LlamaCloud parsing returned no documents for {ctx.filename}")
+
+ # Use the larger of the pre-estimate and the post-parse count.
+ actual_pages = page_limit_service.estimate_pages_from_markdown(markdown_documents)
+ final_pages = max(estimated_pages, actual_pages)
+ await _log_page_divergence(
+ ctx.task_logger,
+ ctx.log_entry,
+ ctx.filename,
+ estimated_pages,
+ actual_pages,
+ final_pages,
+ )
+
+ any_created = False
+ last_doc: Document | None = None
+
+ # Falsy results are skipped; last_doc keeps the most recent truthy one.
+ for doc in markdown_documents:
+ doc_result = await add_received_file_document_using_llamacloud(
+ ctx.session,
+ ctx.filename,
+ llamacloud_markdown_document=doc.text,
+ search_space_id=ctx.search_space_id,
+ user_id=ctx.user_id,
+ connector=ctx.connector,
+ enable_summary=ctx.enable_summary,
+ )
+ if doc_result:
+ any_created = True
+ last_doc = doc_result
+
+ if any_created:
+ # Usage is recorded once per file, not once per stored document.
+ await page_limit_service.update_page_usage(
+ ctx.user_id, final_pages, allow_exceed=True
+ )
+ # Connector info is applied to the last stored document only.
+ if ctx.connector:
+ await update_document_from_connector(last_doc, ctx.connector, ctx.session)
+ await ctx.task_logger.log_task_success(
+ ctx.log_entry,
+ f"Successfully processed file with LlamaCloud: {ctx.filename}",
+ {
+ "document_id": last_doc.id,
+ "content_hash": last_doc.content_hash,
+ "file_type": "document",
+ "etl_service": "LLAMACLOUD",
+ "pages_processed": final_pages,
+ "documents_count": len(markdown_documents),
+ },
+ )
+ return last_doc
+
+ # Nothing was stored: reported as a duplicate, not as a failure.
+ await ctx.task_logger.log_task_success(
+ ctx.log_entry,
+ f"Document already exists (duplicate): {ctx.filename}",
+ {
+ "duplicate_detected": True,
+ "file_type": "document",
+ "etl_service": "LLAMACLOUD",
+ "documents_count": len(markdown_documents),
+ },
+ )
+ return None
+
+
+async def _etl_docling(
+ ctx: _ProcessingContext,
+ page_limit_service,
+ estimated_pages: int,
+) -> Document | None:
+ """Parse and save via the Docling ETL service.
+
+ Parses ``ctx.file_path`` with ``parse_with_docling``, removes the file on
+ a best-effort basis, derives a page count from the content length, and
+ stores the content through ``add_received_file_document_using_docling``.
+
+ Args:
+ ctx: Processing context for this upload.
+ page_limit_service: Service used to estimate pages and record usage.
+ estimated_pages: Page estimate made before parsing.
+
+ Returns:
+ The value returned by ``add_received_file_document_using_docling``.
+ A falsy value is logged as a duplicate and no page usage is recorded.
+ """
+ await _notify(ctx, "parsing", "Extracting content")
+ await ctx.task_logger.log_task_progress(
+ ctx.log_entry,
+ f"Processing file with Docling ETL: {ctx.filename}",
+ {
+ "file_type": "document",
+ "etl_service": "DOCLING",
+ "processing_stage": "parsing",
+ },
+ )
+
+ content = await parse_with_docling(ctx.file_path, ctx.filename)
+
+ # Best-effort cleanup: any exception from unlink is suppressed.
+ with contextlib.suppress(Exception):
+ os.unlink(ctx.file_path)
+
+ await ctx.task_logger.log_task_progress(
+ ctx.log_entry,
+ f"Docling parsing completed, creating document: {ctx.filename}",
+ {"processing_stage": "parsing_complete", "content_length": len(content)},
+ )
+
+ # Use the larger of the pre-estimate and the post-parse count.
+ actual_pages = page_limit_service.estimate_pages_from_content_length(len(content))
+ final_pages = max(estimated_pages, actual_pages)
+ await _log_page_divergence(
+ ctx.task_logger,
+ ctx.log_entry,
+ ctx.filename,
+ estimated_pages,
+ actual_pages,
+ final_pages,
+ )
+
+ await _notify(ctx, "chunking")
+
+ result = await add_received_file_document_using_docling(
+ ctx.session,
+ ctx.filename,
+ docling_markdown_document=content,
+ search_space_id=ctx.search_space_id,
+ user_id=ctx.user_id,
+ connector=ctx.connector,
+ enable_summary=ctx.enable_summary,
+ )
+
+ if result:
+ # Page usage and connector info are applied only when a document exists.
+ await page_limit_service.update_page_usage(
+ ctx.user_id, final_pages, allow_exceed=True
+ )
+ if ctx.connector:
+ await update_document_from_connector(result, ctx.connector, ctx.session)
+ await ctx.task_logger.log_task_success(
+ ctx.log_entry,
+ f"Successfully processed file with Docling: {ctx.filename}",
+ {
+ "document_id": result.id,
+ "content_hash": result.content_hash,
+ "file_type": "document",
+ "etl_service": "DOCLING",
+ "pages_processed": final_pages,
+ },
+ )
+ else:
+ # A falsy result is reported as a duplicate, not as a failure.
+ await ctx.task_logger.log_task_success(
+ ctx.log_entry,
+ f"Document already exists (duplicate): {ctx.filename}",
+ {
+ "duplicate_detected": True,
+ "file_type": "document",
+ "etl_service": "DOCLING",
+ },
+ )
+ return result
+
+
+async def _process_document_upload(ctx: _ProcessingContext) -> Document | None:
+ """Route a document file to the configured ETL service.
+
+ Estimates the page count, checks it against the user's page limit, and
+ then calls the handler registered for ``app_config.ETL_SERVICE``.
+
+ Returns:
+ Whatever the selected ETL handler returns.
+
+ Raises:
+ HTTPException: Status 403 when ``check_page_limit`` raises
+ ``PageLimitExceededError``; the file is removed first.
+ RuntimeError: When ``app_config.ETL_SERVICE`` has no registered handler.
+ """
+ from app.services.page_limit_service import PageLimitExceededError, PageLimitService
+
+ page_limit_service = PageLimitService(ctx.session)
+ estimated_pages = _estimate_pages_safe(page_limit_service, ctx.file_path)
+
+ await ctx.task_logger.log_task_progress(
+ ctx.log_entry,
+ f"Estimated {estimated_pages} pages for file: {ctx.filename}",
+ {"estimated_pages": estimated_pages, "file_type": "document"},
+ )
+
+ # The limit is checked before any ETL handler is invoked.
+ try:
+ await page_limit_service.check_page_limit(ctx.user_id, estimated_pages)
+ except PageLimitExceededError as e:
+ await ctx.task_logger.log_task_failure(
+ ctx.log_entry,
+ f"Page limit exceeded before processing: {ctx.filename}",
+ str(e),
+ {
+ "error_type": "PageLimitExceeded",
+ "pages_used": e.pages_used,
+ "pages_limit": e.pages_limit,
+ "estimated_pages": estimated_pages,
+ },
+ )
+ # Best-effort cleanup before rejecting the upload.
+ with contextlib.suppress(Exception):
+ os.unlink(ctx.file_path)
+ raise HTTPException(status_code=403, detail=str(e)) from e
+
+ # Dispatch table keyed by the configured ETL service name.
+ etl_dispatch = {
+ "UNSTRUCTURED": _etl_unstructured,
+ "LLAMACLOUD": _etl_llamacloud,
+ "DOCLING": _etl_docling,
+ }
+ handler = etl_dispatch.get(app_config.ETL_SERVICE)
+ if handler is None:
+ raise RuntimeError(f"Unknown ETL_SERVICE: {app_config.ETL_SERVICE}")
+
+ return await handler(ctx, page_limit_service, estimated_pages)
+
+
+# ===================================================================
+# Public orchestrators
+# ===================================================================
async def process_file_in_background(
@@ -910,726 +690,35 @@ async def process_file_in_background(
session: AsyncSession,
task_logger: TaskLoggingService,
log_entry: Log,
- connector: dict
- | None = None, # Optional: {"type": "GOOGLE_DRIVE_FILE", "metadata": {...}}
- notification: Notification
- | None = None, # Optional notification for progress updates
+ connector: dict | None = None,
+ notification: Notification | None = None,
) -> Document | None:
+ ctx = _ProcessingContext(
+ session=session,
+ file_path=file_path,
+ filename=filename,
+ search_space_id=search_space_id,
+ user_id=user_id,
+ task_logger=task_logger,
+ log_entry=log_entry,
+ connector=connector,
+ notification=notification,
+ )
+
try:
- # Check if the file is a markdown or text file
- if filename.lower().endswith((".md", ".markdown", ".txt")):
- # Update notification: parsing stage
- if notification:
- await (
- NotificationService.document_processing.notify_processing_progress(
- session,
- notification,
- stage="parsing",
- stage_message="Reading file",
- )
- )
+ category = classify_file(filename)
- await task_logger.log_task_progress(
- log_entry,
- f"Processing markdown/text file: {filename}",
- {"file_type": "markdown", "processing_stage": "reading_file"},
- )
+ if category == FileCategory.MARKDOWN:
+ return await _process_markdown_upload(ctx)
+ if category == FileCategory.DIRECT_CONVERT:
+ return await _process_direct_convert_upload(ctx)
+ if category == FileCategory.AUDIO:
+ return await _process_audio_upload(ctx)
+ return await _process_document_upload(ctx)
- # For markdown files, read the content directly
- with open(file_path, encoding="utf-8") as f:
- markdown_content = f.read()
-
- # Clean up the temp file
- import os
-
- try:
- os.unlink(file_path)
- except Exception as e:
- print("Error deleting temp file", e)
- pass
-
- # Update notification: chunking stage
- if notification:
- await (
- NotificationService.document_processing.notify_processing_progress(
- session, notification, stage="chunking"
- )
- )
-
- await task_logger.log_task_progress(
- log_entry,
- f"Creating document from markdown content: {filename}",
- {
- "processing_stage": "creating_document",
- "content_length": len(markdown_content),
- },
- )
-
- # Process markdown directly through specialized function
- result = await add_received_markdown_file_document(
- session, filename, markdown_content, search_space_id, user_id, connector
- )
-
- if connector:
- await _update_document_from_connector(result, connector, session)
-
- if result:
- await task_logger.log_task_success(
- log_entry,
- f"Successfully processed markdown file: {filename}",
- {
- "document_id": result.id,
- "content_hash": result.content_hash,
- "file_type": "markdown",
- },
- )
- return result
- else:
- await task_logger.log_task_success(
- log_entry,
- f"Markdown file already exists (duplicate): {filename}",
- {"duplicate_detected": True, "file_type": "markdown"},
- )
- return None
-
- # Check if the file is an audio file
- elif filename.lower().endswith(
- (".mp3", ".mp4", ".mpeg", ".mpga", ".m4a", ".wav", ".webm")
- ):
- # Update notification: parsing stage (transcription)
- if notification:
- await (
- NotificationService.document_processing.notify_processing_progress(
- session,
- notification,
- stage="parsing",
- stage_message="Transcribing audio",
- )
- )
-
- await task_logger.log_task_progress(
- log_entry,
- f"Processing audio file for transcription: {filename}",
- {"file_type": "audio", "processing_stage": "starting_transcription"},
- )
-
- # Determine STT service type
- stt_service_type = (
- "local"
- if app_config.STT_SERVICE
- and app_config.STT_SERVICE.startswith("local/")
- else "external"
- )
-
- # Check if using local STT service
- if stt_service_type == "local":
- # Use local Faster-Whisper for transcription
- from app.services.stt_service import stt_service
-
- try:
- result = stt_service.transcribe_file(file_path)
- transcribed_text = result.get("text", "")
-
- if not transcribed_text:
- raise ValueError("Transcription returned empty text")
-
- # Add metadata about the transcription
- transcribed_text = (
- f"# Transcription of {filename}\n\n{transcribed_text}"
- )
- except Exception as e:
- raise HTTPException(
- status_code=422,
- detail=f"Failed to transcribe audio file {filename}: {e!s}",
- ) from e
-
- await task_logger.log_task_progress(
- log_entry,
- f"Local STT transcription completed: {filename}",
- {
- "processing_stage": "local_transcription_complete",
- "language": result.get("language"),
- "confidence": result.get("language_probability"),
- "duration": result.get("duration"),
- },
- )
- else:
- # Use LiteLLM for audio transcription
- with open(file_path, "rb") as audio_file:
- transcription_kwargs = {
- "model": app_config.STT_SERVICE,
- "file": audio_file,
- "api_key": app_config.STT_SERVICE_API_KEY,
- }
- if app_config.STT_SERVICE_API_BASE:
- transcription_kwargs["api_base"] = (
- app_config.STT_SERVICE_API_BASE
- )
-
- transcription_response = await atranscription(
- **transcription_kwargs
- )
-
- # Extract the transcribed text
- transcribed_text = transcription_response.get("text", "")
-
- if not transcribed_text:
- raise ValueError("Transcription returned empty text")
-
- # Add metadata about the transcription
- transcribed_text = (
- f"# Transcription of {filename}\n\n{transcribed_text}"
- )
-
- await task_logger.log_task_progress(
- log_entry,
- f"Transcription completed, creating document: {filename}",
- {
- "processing_stage": "transcription_complete",
- "transcript_length": len(transcribed_text),
- },
- )
-
- # Update notification: chunking stage
- if notification:
- await (
- NotificationService.document_processing.notify_processing_progress(
- session, notification, stage="chunking"
- )
- )
-
- # Clean up the temp file
- try:
- os.unlink(file_path)
- except Exception as e:
- print("Error deleting temp file", e)
- pass
-
- # Process transcription as markdown document
- result = await add_received_markdown_file_document(
- session, filename, transcribed_text, search_space_id, user_id, connector
- )
-
- if connector:
- await _update_document_from_connector(result, connector, session)
-
- if result:
- await task_logger.log_task_success(
- log_entry,
- f"Successfully transcribed and processed audio file: {filename}",
- {
- "document_id": result.id,
- "content_hash": result.content_hash,
- "file_type": "audio",
- "transcript_length": len(transcribed_text),
- "stt_service": stt_service_type,
- },
- )
- return result
- else:
- await task_logger.log_task_success(
- log_entry,
- f"Audio file transcript already exists (duplicate): {filename}",
- {"duplicate_detected": True, "file_type": "audio"},
- )
- return None
-
- else:
- # Import page limit service
- from app.services.page_limit_service import (
- PageLimitExceededError,
- PageLimitService,
- )
-
- # Initialize page limit service
- page_limit_service = PageLimitService(session)
-
- # CRITICAL: Estimate page count BEFORE making expensive ETL API calls
- # This prevents users from incurring costs on files that would exceed their limit
- try:
- estimated_pages_before = (
- page_limit_service.estimate_pages_before_processing(file_path)
- )
- except Exception:
- # If estimation fails, use a conservative estimate based on file size
- import os
-
- file_size = os.path.getsize(file_path)
- estimated_pages_before = max(
- 1, file_size // (80 * 1024)
- ) # ~80KB per page
-
- await task_logger.log_task_progress(
- log_entry,
- f"Estimated {estimated_pages_before} pages for file: {filename}",
- {
- "estimated_pages": estimated_pages_before,
- "file_type": "document",
- },
- )
-
- # Check page limit BEFORE calling ETL service to avoid unnecessary costs
- try:
- await page_limit_service.check_page_limit(
- user_id, estimated_pages_before
- )
- except PageLimitExceededError as e:
- await task_logger.log_task_failure(
- log_entry,
- f"Page limit exceeded before processing: {filename}",
- str(e),
- {
- "error_type": "PageLimitExceeded",
- "pages_used": e.pages_used,
- "pages_limit": e.pages_limit,
- "estimated_pages": estimated_pages_before,
- },
- )
- # Clean up the temp file
- import os
-
- with contextlib.suppress(Exception):
- os.unlink(file_path)
-
- raise HTTPException(
- status_code=403,
- detail=str(e),
- ) from e
-
- if app_config.ETL_SERVICE == "UNSTRUCTURED":
- # Update notification: parsing stage
- if notification:
- await NotificationService.document_processing.notify_processing_progress(
- session,
- notification,
- stage="parsing",
- stage_message="Extracting content",
- )
-
- await task_logger.log_task_progress(
- log_entry,
- f"Processing file with Unstructured ETL: {filename}",
- {
- "file_type": "document",
- "etl_service": "UNSTRUCTURED",
- "processing_stage": "loading",
- },
- )
-
- from langchain_unstructured import UnstructuredLoader
-
- # Process the file
- loader = UnstructuredLoader(
- file_path,
- mode="elements",
- post_processors=[],
- languages=["eng"],
- include_orig_elements=False,
- include_metadata=False,
- strategy="auto",
- )
-
- docs = await loader.aload()
-
- # Update notification: chunking stage
- if notification:
- await NotificationService.document_processing.notify_processing_progress(
- session, notification, stage="chunking", chunks_count=len(docs)
- )
-
- await task_logger.log_task_progress(
- log_entry,
- f"Unstructured ETL completed, creating document: {filename}",
- {"processing_stage": "etl_complete", "elements_count": len(docs)},
- )
-
- # Verify actual page count from parsed documents
- actual_pages = page_limit_service.estimate_pages_from_elements(docs)
-
- # Use the higher of the two estimates for safety (in case pre-estimate was too low)
- final_page_count = max(estimated_pages_before, actual_pages)
-
- # If actual is significantly higher than estimate, log a warning
- if actual_pages > estimated_pages_before * 1.5:
- await task_logger.log_task_progress(
- log_entry,
- f"Actual page count higher than estimate: {filename}",
- {
- "estimated_before": estimated_pages_before,
- "actual_pages": actual_pages,
- "using_count": final_page_count,
- },
- )
-
- # Clean up the temp file
- import os
-
- try:
- os.unlink(file_path)
- except Exception as e:
- print("Error deleting temp file", e)
- pass
-
- enable_summary = (
- connector.get("enable_summary", True) if connector else True
- )
- result = await add_received_file_document_using_unstructured(
- session,
- filename,
- docs,
- search_space_id,
- user_id,
- connector,
- enable_summary=enable_summary,
- )
-
- if connector:
- await _update_document_from_connector(result, connector, session)
-
- if result:
- # Update page usage after successful processing
- # allow_exceed=True because document was already created after passing initial check
- await page_limit_service.update_page_usage(
- user_id, final_page_count, allow_exceed=True
- )
-
- await task_logger.log_task_success(
- log_entry,
- f"Successfully processed file with Unstructured: {filename}",
- {
- "document_id": result.id,
- "content_hash": result.content_hash,
- "file_type": "document",
- "etl_service": "UNSTRUCTURED",
- "pages_processed": final_page_count,
- },
- )
- return result
- else:
- await task_logger.log_task_success(
- log_entry,
- f"Document already exists (duplicate): {filename}",
- {
- "duplicate_detected": True,
- "file_type": "document",
- "etl_service": "UNSTRUCTURED",
- },
- )
- return None
-
- elif app_config.ETL_SERVICE == "LLAMACLOUD":
- # Update notification: parsing stage
- if notification:
- await NotificationService.document_processing.notify_processing_progress(
- session,
- notification,
- stage="parsing",
- stage_message="Extracting content",
- )
-
- await task_logger.log_task_progress(
- log_entry,
- f"Processing file with LlamaCloud ETL: {filename}",
- {
- "file_type": "document",
- "etl_service": "LLAMACLOUD",
- "processing_stage": "parsing",
- "estimated_pages": estimated_pages_before,
- },
- )
-
- # Parse file with retry logic for SSL/connection errors (common with large files)
- result = await parse_with_llamacloud_retry(
- file_path=file_path,
- estimated_pages=estimated_pages_before,
- task_logger=task_logger,
- log_entry=log_entry,
- )
-
- # Clean up the temp file
- import os
-
- try:
- os.unlink(file_path)
- except Exception as e:
- print("Error deleting temp file", e)
- pass
-
- # Get markdown documents from the result
- markdown_documents = await result.aget_markdown_documents(
- split_by_page=False
- )
-
- # Update notification: chunking stage
- if notification:
- await NotificationService.document_processing.notify_processing_progress(
- session,
- notification,
- stage="chunking",
- chunks_count=len(markdown_documents),
- )
-
- await task_logger.log_task_progress(
- log_entry,
- f"LlamaCloud parsing completed, creating documents: {filename}",
- {
- "processing_stage": "parsing_complete",
- "documents_count": len(markdown_documents),
- },
- )
-
- # Check if LlamaCloud returned any documents
- if not markdown_documents or len(markdown_documents) == 0:
- await task_logger.log_task_failure(
- log_entry,
- f"LlamaCloud parsing returned no documents: {filename}",
- "ETL service returned empty document list",
- {
- "error_type": "EmptyDocumentList",
- "etl_service": "LLAMACLOUD",
- },
- )
- raise ValueError(
- f"LlamaCloud parsing returned no documents for {filename}"
- )
-
- # Verify actual page count from parsed markdown documents
- actual_pages = page_limit_service.estimate_pages_from_markdown(
- markdown_documents
- )
-
- # Use the higher of the two estimates for safety (in case pre-estimate was too low)
- final_page_count = max(estimated_pages_before, actual_pages)
-
- # If actual is significantly higher than estimate, log a warning
- if actual_pages > estimated_pages_before * 1.5:
- await task_logger.log_task_progress(
- log_entry,
- f"Actual page count higher than estimate: {filename}",
- {
- "estimated_before": estimated_pages_before,
- "actual_pages": actual_pages,
- "using_count": final_page_count,
- },
- )
-
- # Track if any document was successfully created (not a duplicate)
- any_doc_created = False
- last_created_doc = None
-
- for doc in markdown_documents:
- # Extract text content from the markdown documents
- markdown_content = doc.text
-
- enable_summary = (
- connector.get("enable_summary", True) if connector else True
- )
- doc_result = await add_received_file_document_using_llamacloud(
- session,
- filename,
- llamacloud_markdown_document=markdown_content,
- search_space_id=search_space_id,
- user_id=user_id,
- connector=connector,
- enable_summary=enable_summary,
- )
-
- # Track if this document was successfully created
- if doc_result:
- any_doc_created = True
- last_created_doc = doc_result
-
- # Update page usage once after processing all documents
- # Only update if at least one document was created (not all duplicates)
- if any_doc_created:
- # Update page usage after successful processing
- # allow_exceed=True because document was already created after passing initial check
- await page_limit_service.update_page_usage(
- user_id, final_page_count, allow_exceed=True
- )
-
- if connector:
- await _update_document_from_connector(
- last_created_doc, connector, session
- )
-
- await task_logger.log_task_success(
- log_entry,
- f"Successfully processed file with LlamaCloud: {filename}",
- {
- "document_id": last_created_doc.id,
- "content_hash": last_created_doc.content_hash,
- "file_type": "document",
- "etl_service": "LLAMACLOUD",
- "pages_processed": final_page_count,
- "documents_count": len(markdown_documents),
- },
- )
- return last_created_doc
- else:
- # All documents were duplicates (markdown_documents was not empty, but all returned None)
- await task_logger.log_task_success(
- log_entry,
- f"Document already exists (duplicate): {filename}",
- {
- "duplicate_detected": True,
- "file_type": "document",
- "etl_service": "LLAMACLOUD",
- "documents_count": len(markdown_documents),
- },
- )
- return None
-
- elif app_config.ETL_SERVICE == "DOCLING":
- # Update notification: parsing stage
- if notification:
- await NotificationService.document_processing.notify_processing_progress(
- session,
- notification,
- stage="parsing",
- stage_message="Extracting content",
- )
-
- await task_logger.log_task_progress(
- log_entry,
- f"Processing file with Docling ETL: {filename}",
- {
- "file_type": "document",
- "etl_service": "DOCLING",
- "processing_stage": "parsing",
- },
- )
-
- # Use Docling service for document processing
- from app.services.docling_service import create_docling_service
-
- # Create Docling service
- docling_service = create_docling_service()
-
- # Suppress pdfminer warnings that can cause processing to hang
- # These warnings are harmless but can spam logs and potentially halt processing
- # Suppress both Python warnings and logging warnings from pdfminer
- pdfminer_logger = getLogger("pdfminer")
- original_level = pdfminer_logger.level
-
- with warnings.catch_warnings():
- warnings.filterwarnings(
- "ignore", category=UserWarning, module="pdfminer"
- )
- warnings.filterwarnings(
- "ignore",
- message=".*Cannot set gray non-stroke color.*",
- )
- warnings.filterwarnings("ignore", message=".*invalid float value.*")
-
- # Temporarily suppress pdfminer logging warnings
- pdfminer_logger.setLevel(ERROR)
-
- try:
- # Process the document
- result = await docling_service.process_document(
- file_path, filename
- )
- finally:
- # Restore original logging level
- pdfminer_logger.setLevel(original_level)
-
- # Clean up the temp file
- import os
-
- try:
- os.unlink(file_path)
- except Exception as e:
- print("Error deleting temp file", e)
- pass
-
- await task_logger.log_task_progress(
- log_entry,
- f"Docling parsing completed, creating document: {filename}",
- {
- "processing_stage": "parsing_complete",
- "content_length": len(result["content"]),
- },
- )
-
- # Verify actual page count from content length
- actual_pages = page_limit_service.estimate_pages_from_content_length(
- len(result["content"])
- )
-
- # Use the higher of the two estimates for safety (in case pre-estimate was too low)
- final_page_count = max(estimated_pages_before, actual_pages)
-
- # If actual is significantly higher than estimate, log a warning
- if actual_pages > estimated_pages_before * 1.5:
- await task_logger.log_task_progress(
- log_entry,
- f"Actual page count higher than estimate: {filename}",
- {
- "estimated_before": estimated_pages_before,
- "actual_pages": actual_pages,
- "using_count": final_page_count,
- },
- )
-
- # Update notification: chunking stage
- if notification:
- await NotificationService.document_processing.notify_processing_progress(
- session, notification, stage="chunking"
- )
-
- enable_summary = (
- connector.get("enable_summary", True) if connector else True
- )
- doc_result = await add_received_file_document_using_docling(
- session,
- filename,
- docling_markdown_document=result["content"],
- search_space_id=search_space_id,
- user_id=user_id,
- connector=connector,
- enable_summary=enable_summary,
- )
-
- if doc_result:
- # Update page usage after successful processing
- # allow_exceed=True because document was already created after passing initial check
- await page_limit_service.update_page_usage(
- user_id, final_page_count, allow_exceed=True
- )
-
- if connector:
- await _update_document_from_connector(
- doc_result, connector, session
- )
-
- await task_logger.log_task_success(
- log_entry,
- f"Successfully processed file with Docling: {filename}",
- {
- "document_id": doc_result.id,
- "content_hash": doc_result.content_hash,
- "file_type": "document",
- "etl_service": "DOCLING",
- "pages_processed": final_page_count,
- },
- )
- return doc_result
- else:
- await task_logger.log_task_success(
- log_entry,
- f"Document already exists (duplicate): {filename}",
- {
- "duplicate_detected": True,
- "file_type": "document",
- "etl_service": "DOCLING",
- },
- )
- return None
except Exception as e:
await session.rollback()
- # For page limit errors, use the detailed message from the exception
from app.services.page_limit_service import PageLimitExceededError
if isinstance(e, PageLimitExceededError):
@@ -1645,10 +734,225 @@ async def process_file_in_background(
str(e),
{"error_type": type(e).__name__, "filename": filename},
)
- import logging
-
logging.error(f"Error processing file in background: {error_message}")
- raise # Re-raise so the wrapper can also handle it
+ raise
+
+
+# ===================================================================
+# 2-phase handler (process_file_in_background_with_document)
+# ===================================================================
+
+
+async def _extract_file_content(
+    file_path: str,
+    filename: str,
+    session: AsyncSession,
+    user_id: str,
+    task_logger: TaskLoggingService,
+    log_entry: Log,
+    notification: Notification | None,
+) -> tuple[str, str]:
+    """
+    Turn an uploaded file into markdown, dispatching on its category.
+
+    Markdown/text files are read as UTF-8, direct-convert files go through
+    convert_file_directly, audio files are transcribed, and every other
+    file is handed to _extract_document_content (ETL service path).
+    For the first three categories the temp file at file_path is removed
+    once extraction has succeeded; errors from that removal are ignored.
+    If extraction raises, the temp file is left in place.
+
+    Returns:
+        Tuple of (markdown_content, etl_service_name).
+    """
+
+    async def _report(stage_message: str, log_message: str, details: dict) -> None:
+        # Push the "parsing" stage to the notification (if any), then log it.
+        if notification:
+            await NotificationService.document_processing.notify_processing_progress(
+                session,
+                notification,
+                stage="parsing",
+                stage_message=stage_message,
+            )
+        await task_logger.log_task_progress(log_entry, log_message, details)
+
+    def _remove_temp_file() -> None:
+        # Best-effort cleanup: a failed delete must not fail the extraction.
+        with contextlib.suppress(Exception):
+            os.unlink(file_path)
+
+    category = classify_file(filename)
+
+    if category == FileCategory.MARKDOWN:
+        await _report(
+            "Reading file",
+            f"Processing markdown/text file: {filename}",
+            {"file_type": "markdown", "processing_stage": "reading_file"},
+        )
+        with open(file_path, encoding="utf-8") as f:
+            content = f.read()
+        _remove_temp_file()
+        return content, "MARKDOWN"
+
+    if category == FileCategory.DIRECT_CONVERT:
+        await _report(
+            "Converting file",
+            f"Direct-converting file to markdown: {filename}",
+            {"file_type": "direct_convert", "processing_stage": "converting"},
+        )
+        content = convert_file_directly(file_path, filename)
+        _remove_temp_file()
+        return content, "DIRECT_CONVERT"
+
+    if category == FileCategory.AUDIO:
+        await _report(
+            "Transcribing audio",
+            f"Processing audio file for transcription: {filename}",
+            {"file_type": "audio", "processing_stage": "starting_transcription"},
+        )
+        transcribed_text = await _transcribe_audio(file_path, filename)
+        _remove_temp_file()
+        return transcribed_text, "AUDIO_TRANSCRIPTION"
+
+    # Document file — use ETL service
+    return await _extract_document_content(
+        file_path,
+        filename,
+        session,
+        user_id,
+        task_logger,
+        log_entry,
+        notification,
+    )
+
+
+async def _transcribe_audio(file_path: str, filename: str) -> str:
+    """
+    Transcribe an audio file and wrap the transcript in a markdown heading.
+
+    A model name starting with "local/" in STT_SERVICE selects the app's
+    stt_service; any other value is sent through litellm's atranscription.
+
+    Raises:
+        ValueError: If the transcription result contains no text.
+    """
+    model_name = app_config.STT_SERVICE
+    use_local_stt = bool(model_name) and model_name.startswith("local/")
+
+    if use_local_stt:
+        from app.services.stt_service import stt_service
+
+        transcript = stt_service.transcribe_file(file_path).get("text", "")
+    else:
+        from litellm import atranscription
+
+        with open(file_path, "rb") as audio_file:
+            request_args: dict = {"model": model_name, "file": audio_file}
+            request_args["api_key"] = app_config.STT_SERVICE_API_KEY
+            if app_config.STT_SERVICE_API_BASE:
+                request_args["api_base"] = app_config.STT_SERVICE_API_BASE
+            response = await atranscription(**request_args)
+        transcript = response.get("text", "")
+
+    if not transcript:
+        raise ValueError("Transcription returned empty text")
+
+    return f"# Transcription of {filename}\n\n{transcript}"
+
+
+async def _extract_document_content(
+    file_path: str,
+    filename: str,
+    session: AsyncSession,
+    user_id: str,
+    task_logger: TaskLoggingService,
+    log_entry: Log,
+    notification: Notification | None,
+) -> tuple[str, str]:
+    """
+    Convert a document to markdown with the ETL service named in app config.
+
+    The page limit is checked against a pre-parse estimate before any parser
+    runs. After parsing, page usage is recorded with allow_exceed=True and
+    the temp file at file_path is deleted (deletion errors are ignored).
+
+    Returns:
+        Tuple of (markdown_content, etl_service_name).
+
+    Raises:
+        RuntimeError: On an unknown ETL_SERVICE or when nothing was extracted.
+    """
+    from app.services.page_limit_service import PageLimitService
+
+    page_limits = PageLimitService(session)
+    try:
+        estimated_pages = page_limits.estimate_pages_before_processing(file_path)
+    except Exception:
+        # Estimator failed: assume 80 KiB per page, with a minimum of one page.
+        estimated_pages = max(1, os.path.getsize(file_path) // (80 * 1024))
+
+    await page_limits.check_page_limit(user_id, estimated_pages)
+
+    etl_service = app_config.ETL_SERVICE
+    markdown_content: str | None = None
+
+    if notification:
+        await NotificationService.document_processing.notify_processing_progress(
+            session,
+            notification,
+            stage="parsing",
+            stage_message="Extracting content",
+        )
+
+    if etl_service == "UNSTRUCTURED":
+        from app.utils.document_converters import convert_document_to_markdown
+
+        elements = await parse_with_unstructured(file_path)
+        markdown_content = await convert_document_to_markdown(elements)
+        parsed_pages = page_limits.estimate_pages_from_elements(elements)
+        pages_to_record = max(estimated_pages, parsed_pages)
+
+    elif etl_service == "LLAMACLOUD":
+        parsed = await parse_with_llamacloud_retry(
+            file_path=file_path,
+            estimated_pages=estimated_pages,
+            task_logger=task_logger,
+            log_entry=log_entry,
+        )
+        markdown_documents = await parsed.aget_markdown_documents(split_by_page=False)
+        if not markdown_documents:
+            raise RuntimeError(f"LlamaCloud parsing returned no documents: {filename}")
+        # Only the first returned document is used.
+        markdown_content = markdown_documents[0].text
+        pages_to_record = estimated_pages
+
+    elif etl_service == "DOCLING":
+        for logger_name in (
+            "docling.pipeline.base_pipeline",
+            "docling.document_converter",
+            "docling_core.transforms.chunker.hierarchical_chunker",
+        ):
+            getLogger(logger_name).setLevel(ERROR)
+
+        from docling.document_converter import DocumentConverter
+
+        conversion = DocumentConverter().convert(file_path)
+        markdown_content = conversion.document.export_to_markdown()
+        pages_to_record = estimated_pages
+
+    else:
+        raise RuntimeError(f"Unknown ETL_SERVICE: {etl_service}")
+
+    await page_limits.update_page_usage(user_id, pages_to_record, allow_exceed=True)
+
+    with contextlib.suppress(Exception):
+        os.unlink(file_path)
+
+    if not markdown_content:
+        raise RuntimeError(f"Failed to extract content from file: {filename}")
+
+    return markdown_content, etl_service
async def process_file_in_background_with_document(
@@ -1667,272 +971,50 @@ async def process_file_in_background_with_document(
"""
Process file and update existing pending document (2-phase pattern).
- This function is Phase 2 of the real-time document status updates:
- - Phase 1 (API): Created document with pending status
- - Phase 2 (this): Process file and update document to ready/failed
-
- The document already exists with pending status. This function:
- 1. Parses the file content (markdown, audio, or ETL services)
- 2. Updates the document with content, embeddings, and chunks
- 3. Sets status to 'ready' on success
-
- Args:
- document: Existing document with pending status
- file_path: Path to the uploaded file
- filename: Original filename
- search_space_id: ID of the search space
- user_id: ID of the user
- session: Database session
- task_logger: Task logging service
- log_entry: Log entry for this task
- connector: Optional connector info for Google Drive files
- notification: Optional notification for progress updates
-
- Returns:
- Updated Document object if successful, None if duplicate content detected
+ Phase 1 (API layer): Created document with pending status.
+ Phase 2 (this function): Process file and update document to ready/failed.
"""
- import os
-
- from app.config import config as app_config
+ from app.indexing_pipeline.adapters.file_upload_adapter import (
+ UploadDocumentAdapter,
+ )
from app.services.llm_service import get_user_long_context_llm
+ from app.utils.document_converters import generate_content_hash
+
+ from .base import check_duplicate_document
doc_id = document.id
try:
- markdown_content = None
- etl_service = None
-
- # ===== STEP 1: Parse file content based on type =====
-
- # Check if the file is a markdown or text file
- if filename.lower().endswith((".md", ".markdown", ".txt")):
- # Update notification: parsing stage
- if notification:
- await (
- NotificationService.document_processing.notify_processing_progress(
- session,
- notification,
- stage="parsing",
- stage_message="Reading file",
- )
- )
-
- await task_logger.log_task_progress(
- log_entry,
- f"Processing markdown/text file: {filename}",
- {"file_type": "markdown", "processing_stage": "reading_file"},
- )
-
- # Read markdown content directly
- with open(file_path, encoding="utf-8") as f:
- markdown_content = f.read()
- etl_service = "MARKDOWN"
-
- # Clean up temp file
- with contextlib.suppress(Exception):
- os.unlink(file_path)
-
- # Check if the file is an audio file
- elif filename.lower().endswith(
- (".mp3", ".mp4", ".mpeg", ".mpga", ".m4a", ".wav", ".webm")
- ):
- # Update notification: parsing stage (transcription)
- if notification:
- await (
- NotificationService.document_processing.notify_processing_progress(
- session,
- notification,
- stage="parsing",
- stage_message="Transcribing audio",
- )
- )
-
- await task_logger.log_task_progress(
- log_entry,
- f"Processing audio file for transcription: {filename}",
- {"file_type": "audio", "processing_stage": "starting_transcription"},
- )
-
- # Transcribe audio
- stt_service_type = (
- "local"
- if app_config.STT_SERVICE
- and app_config.STT_SERVICE.startswith("local/")
- else "external"
- )
-
- if stt_service_type == "local":
- from app.services.stt_service import stt_service
-
- result = stt_service.transcribe_file(file_path)
- transcribed_text = result.get("text", "")
- if not transcribed_text:
- raise ValueError("Transcription returned empty text")
- markdown_content = (
- f"# Transcription of {filename}\n\n{transcribed_text}"
- )
- else:
- with open(file_path, "rb") as audio_file:
- transcription_kwargs = {
- "model": app_config.STT_SERVICE,
- "file": audio_file,
- "api_key": app_config.STT_SERVICE_API_KEY,
- }
- if app_config.STT_SERVICE_API_BASE:
- transcription_kwargs["api_base"] = (
- app_config.STT_SERVICE_API_BASE
- )
- transcription_response = await atranscription(
- **transcription_kwargs
- )
- transcribed_text = transcription_response.get("text", "")
- if not transcribed_text:
- raise ValueError("Transcription returned empty text")
- markdown_content = (
- f"# Transcription of {filename}\n\n{transcribed_text}"
- )
-
- etl_service = "AUDIO_TRANSCRIPTION"
- # Clean up temp file
- with contextlib.suppress(Exception):
- os.unlink(file_path)
-
- else:
- # Document files - use ETL service
- from app.services.page_limit_service import (
- PageLimitExceededError,
- PageLimitService,
- )
-
- page_limit_service = PageLimitService(session)
-
- # Estimate page count
- try:
- estimated_pages = page_limit_service.estimate_pages_before_processing(
- file_path
- )
- except Exception:
- file_size = os.path.getsize(file_path)
- estimated_pages = max(1, file_size // (80 * 1024))
-
- # Check page limit
- await page_limit_service.check_page_limit(user_id, estimated_pages)
-
- if app_config.ETL_SERVICE == "UNSTRUCTURED":
- if notification:
- await NotificationService.document_processing.notify_processing_progress(
- session,
- notification,
- stage="parsing",
- stage_message="Extracting content",
- )
-
- from langchain_unstructured import UnstructuredLoader
-
- loader = UnstructuredLoader(
- file_path,
- mode="elements",
- post_processors=[],
- languages=["eng"],
- include_orig_elements=False,
- include_metadata=False,
- strategy="auto",
- )
- docs = await loader.aload()
- markdown_content = await convert_document_to_markdown(docs)
- actual_pages = page_limit_service.estimate_pages_from_elements(docs)
- final_page_count = max(estimated_pages, actual_pages)
- etl_service = "UNSTRUCTURED"
-
- # Update page usage
- await page_limit_service.update_page_usage(
- user_id, final_page_count, allow_exceed=True
- )
-
- elif app_config.ETL_SERVICE == "LLAMACLOUD":
- if notification:
- await NotificationService.document_processing.notify_processing_progress(
- session,
- notification,
- stage="parsing",
- stage_message="Extracting content",
- )
-
- result = await parse_with_llamacloud_retry(
- file_path=file_path,
- estimated_pages=estimated_pages,
- task_logger=task_logger,
- log_entry=log_entry,
- )
- markdown_documents = await result.aget_markdown_documents(
- split_by_page=False
- )
- if not markdown_documents:
- raise RuntimeError(
- f"LlamaCloud parsing returned no documents: {filename}"
- )
- markdown_content = markdown_documents[0].text
- etl_service = "LLAMACLOUD"
-
- # Update page usage
- await page_limit_service.update_page_usage(
- user_id, estimated_pages, allow_exceed=True
- )
-
- elif app_config.ETL_SERVICE == "DOCLING":
- if notification:
- await NotificationService.document_processing.notify_processing_progress(
- session,
- notification,
- stage="parsing",
- stage_message="Extracting content",
- )
-
- # Suppress logging during Docling import
- getLogger("docling.pipeline.base_pipeline").setLevel(ERROR)
- getLogger("docling.document_converter").setLevel(ERROR)
- getLogger(
- "docling_core.transforms.chunker.hierarchical_chunker"
- ).setLevel(ERROR)
-
- from docling.document_converter import DocumentConverter
-
- converter = DocumentConverter()
- result = converter.convert(file_path)
- markdown_content = result.document.export_to_markdown()
- etl_service = "DOCLING"
-
- # Update page usage
- await page_limit_service.update_page_usage(
- user_id, estimated_pages, allow_exceed=True
- )
-
- else:
- raise RuntimeError(f"Unknown ETL_SERVICE: {app_config.ETL_SERVICE}")
-
- # Clean up temp file
- with contextlib.suppress(Exception):
- os.unlink(file_path)
+ # Step 1: extract content
+ markdown_content, etl_service = await _extract_file_content(
+ file_path,
+ filename,
+ session,
+ user_id,
+ task_logger,
+ log_entry,
+ notification,
+ )
if not markdown_content:
raise RuntimeError(f"Failed to extract content from file: {filename}")
- # ===== STEP 2: Check for duplicate content =====
+ # Step 2: duplicate check
content_hash = generate_content_hash(markdown_content, search_space_id)
-
existing_by_content = await check_duplicate_document(session, content_hash)
if existing_by_content and existing_by_content.id != doc_id:
- # Duplicate content found - mark this document as failed
logging.info(
f"Duplicate content detected for {filename}, "
f"matches document {existing_by_content.id}"
)
return None
- # ===== STEP 3+4: Index via pipeline =====
+ # Step 3: index via pipeline
if notification:
await NotificationService.document_processing.notify_processing_progress(
- session, notification, stage="chunking"
+ session,
+ notification,
+ stage="chunking",
)
user_llm = await get_user_long_context_llm(session, user_id, search_space_id)
@@ -1957,7 +1039,6 @@ async def process_file_in_background_with_document(
"file_type": etl_service,
},
)
-
return document
except Exception as e:
diff --git a/surfsense_backend/app/tasks/document_processors/markdown_processor.py b/surfsense_backend/app/tasks/document_processors/markdown_processor.py
index 2fb711bf8..0ff340c0e 100644
--- a/surfsense_backend/app/tasks/document_processors/markdown_processor.py
+++ b/surfsense_backend/app/tasks/document_processors/markdown_processor.py
@@ -14,88 +14,19 @@ from app.utils.document_converters import (
create_document_chunks,
generate_content_hash,
generate_document_summary,
- generate_unique_identifier_hash,
)
+from ._helpers import (
+ find_existing_document_with_migration,
+ get_google_drive_unique_identifier,
+)
from .base import (
- check_document_by_unique_identifier,
check_duplicate_document,
get_current_timestamp,
safe_set_chunks,
)
-def _get_google_drive_unique_identifier(
- connector: dict | None,
- filename: str,
- search_space_id: int,
-) -> tuple[str, str | None]:
- """
- Get unique identifier hash for a file, with special handling for Google Drive.
-
- For Google Drive files, uses file_id as the unique identifier (doesn't change on rename).
- For other files, uses filename.
-
- Args:
- connector: Optional connector info dict with type and metadata
- filename: The filename (used for non-Google Drive files or as fallback)
- search_space_id: The search space ID
-
- Returns:
- Tuple of (primary_hash, legacy_hash or None)
- """
- if connector and connector.get("type") == DocumentType.GOOGLE_DRIVE_FILE:
- metadata = connector.get("metadata", {})
- file_id = metadata.get("google_drive_file_id")
-
- if file_id:
- primary_hash = generate_unique_identifier_hash(
- DocumentType.GOOGLE_DRIVE_FILE, file_id, search_space_id
- )
- legacy_hash = generate_unique_identifier_hash(
- DocumentType.GOOGLE_DRIVE_FILE, filename, search_space_id
- )
- return primary_hash, legacy_hash
-
- primary_hash = generate_unique_identifier_hash(
- DocumentType.FILE, filename, search_space_id
- )
- return primary_hash, None
-
-
-async def _find_existing_document_with_migration(
- session: AsyncSession,
- primary_hash: str,
- legacy_hash: str | None,
- content_hash: str | None = None,
-) -> Document | None:
- """
- Find existing document, checking both new hash and legacy hash for migration,
- with fallback to content_hash for cross-source deduplication.
- """
- existing_document = await check_document_by_unique_identifier(session, primary_hash)
-
- if not existing_document and legacy_hash:
- existing_document = await check_document_by_unique_identifier(
- session, legacy_hash
- )
- if existing_document:
- logging.info(
- "Found legacy document (filename-based hash), will migrate to file_id-based hash"
- )
-
- # Fallback: check by content_hash to catch duplicates from different sources
- if not existing_document and content_hash:
- existing_document = await check_duplicate_document(session, content_hash)
- if existing_document:
- logging.info(
- f"Found duplicate content from different source (content_hash match). "
- f"Original document ID: {existing_document.id}, type: {existing_document.document_type}"
- )
-
- return existing_document
-
-
async def _handle_existing_document_update(
session: AsyncSession,
existing_document: Document,
@@ -224,7 +155,7 @@ async def add_received_markdown_file_document(
try:
# Generate unique identifier hash (uses file_id for Google Drive, filename for others)
- primary_hash, legacy_hash = _get_google_drive_unique_identifier(
+ primary_hash, legacy_hash = get_google_drive_unique_identifier(
connector, file_name, search_space_id
)
@@ -232,7 +163,7 @@ async def add_received_markdown_file_document(
content_hash = generate_content_hash(file_in_markdown, search_space_id)
# Check if document exists (with migration support for Google Drive and content_hash fallback)
- existing_document = await _find_existing_document_with_migration(
+ existing_document = await find_existing_document_with_migration(
session, primary_hash, legacy_hash, content_hash
)
diff --git a/surfsense_backend/tests/integration/document_upload/test_upload_limits.py b/surfsense_backend/tests/integration/document_upload/test_upload_limits.py
index a8dab43f0..a56398baa 100644
--- a/surfsense_backend/tests/integration/document_upload/test_upload_limits.py
+++ b/surfsense_backend/tests/integration/document_upload/test_upload_limits.py
@@ -2,12 +2,11 @@
Integration tests for backend file upload limit enforcement.
These tests verify that the API rejects uploads that exceed:
- - Max files per upload (10)
- - Max per-file size (50 MB)
- - Max total upload size (200 MB)
+ - Max per-file size (500 MB)
-The limits mirror the frontend's DocumentUploadTab.tsx constants and are
-enforced server-side to protect against direct API calls.
+No file count or total size limits are enforced — the frontend batches
+uploads in groups of 5 and there is no cap on how many files a user can
+upload in a single session.
Prerequisites:
- PostgreSQL + pgvector
@@ -24,60 +23,12 @@ pytestmark = pytest.mark.integration
# ---------------------------------------------------------------------------
-# Test A: File count limit
-# ---------------------------------------------------------------------------
-
-
-class TestFileCountLimit:
- """Uploading more than 10 files in a single request should be rejected."""
-
- async def test_11_files_returns_413(
- self,
- client: httpx.AsyncClient,
- headers: dict[str, str],
- search_space_id: int,
- ):
- files = [
- ("files", (f"file_{i}.txt", io.BytesIO(b"test content"), "text/plain"))
- for i in range(11)
- ]
- resp = await client.post(
- "/api/v1/documents/fileupload",
- headers=headers,
- files=files,
- data={"search_space_id": str(search_space_id)},
- )
- assert resp.status_code == 413
- assert "too many files" in resp.json()["detail"].lower()
-
- async def test_10_files_accepted(
- self,
- client: httpx.AsyncClient,
- headers: dict[str, str],
- search_space_id: int,
- cleanup_doc_ids: list[int],
- ):
- files = [
- ("files", (f"file_{i}.txt", io.BytesIO(b"test content"), "text/plain"))
- for i in range(10)
- ]
- resp = await client.post(
- "/api/v1/documents/fileupload",
- headers=headers,
- files=files,
- data={"search_space_id": str(search_space_id)},
- )
- assert resp.status_code == 200
- cleanup_doc_ids.extend(resp.json().get("document_ids", []))
-
-
-# ---------------------------------------------------------------------------
-# Test B: Per-file size limit
+# Test: Per-file size limit (500 MB)
# ---------------------------------------------------------------------------
class TestPerFileSizeLimit:
- """A single file exceeding 50 MB should be rejected."""
+ """A single file exceeding 500 MB should be rejected."""
async def test_oversized_file_returns_413(
self,
@@ -85,7 +36,7 @@ class TestPerFileSizeLimit:
headers: dict[str, str],
search_space_id: int,
):
- oversized = io.BytesIO(b"\x00" * (50 * 1024 * 1024 + 1))
+ oversized = io.BytesIO(b"\x00" * (500 * 1024 * 1024 + 1))
resp = await client.post(
"/api/v1/documents/fileupload",
headers=headers,
@@ -102,11 +53,11 @@ class TestPerFileSizeLimit:
search_space_id: int,
cleanup_doc_ids: list[int],
):
- at_limit = io.BytesIO(b"\x00" * (50 * 1024 * 1024))
+ at_limit = io.BytesIO(b"\x00" * (500 * 1024 * 1024))
resp = await client.post(
"/api/v1/documents/fileupload",
headers=headers,
- files=[("files", ("exact50mb.txt", at_limit, "text/plain"))],
+ files=[("files", ("exact500mb.txt", at_limit, "text/plain"))],
data={"search_space_id": str(search_space_id)},
)
assert resp.status_code == 200
@@ -114,26 +65,23 @@ class TestPerFileSizeLimit:
# ---------------------------------------------------------------------------
-# Test C: Total upload size limit
+# Test: Multiple files accepted without count limit
# ---------------------------------------------------------------------------
-class TestTotalSizeLimit:
- """Multiple files whose combined size exceeds 200 MB should be rejected."""
+class TestNoFileCountLimit:
+ """Many files in a single request should be accepted."""
- async def test_total_size_over_200mb_returns_413(
+ async def test_many_files_accepted(
self,
client: httpx.AsyncClient,
headers: dict[str, str],
search_space_id: int,
+ cleanup_doc_ids: list[int],
):
- chunk_size = 45 * 1024 * 1024 # 45 MB each
files = [
- (
- "files",
- (f"chunk_{i}.txt", io.BytesIO(b"\x00" * chunk_size), "text/plain"),
- )
- for i in range(5) # 5 x 45 MB = 225 MB > 200 MB
+ ("files", (f"file_{i}.txt", io.BytesIO(b"test content"), "text/plain"))
+ for i in range(20)
]
resp = await client.post(
"/api/v1/documents/fileupload",
@@ -141,5 +89,5 @@ class TestTotalSizeLimit:
files=files,
data={"search_space_id": str(search_space_id)},
)
- assert resp.status_code == 413
- assert "total upload size" in resp.json()["detail"].lower()
+ assert resp.status_code == 200
+ cleanup_doc_ids.extend(resp.json().get("document_ids", []))
diff --git a/surfsense_backend/tests/unit/middleware/test_knowledge_search.py b/surfsense_backend/tests/unit/middleware/test_knowledge_search.py
index 163dd0d1d..a8cf5c93b 100644
--- a/surfsense_backend/tests/unit/middleware/test_knowledge_search.py
+++ b/surfsense_backend/tests/unit/middleware/test_knowledge_search.py
@@ -248,7 +248,7 @@ class TestKnowledgeBaseSearchMiddlewarePlanner:
return []
async def fake_build_scoped_filesystem(**kwargs):
- return {}
+ return {}, {}
monkeypatch.setattr(
"app.agents.new_chat.middleware.knowledge_search.search_knowledge_base",
@@ -298,7 +298,7 @@ class TestKnowledgeBaseSearchMiddlewarePlanner:
return []
async def fake_build_scoped_filesystem(**kwargs):
- return {}
+ return {}, {}
monkeypatch.setattr(
"app.agents.new_chat.middleware.knowledge_search.search_knowledge_base",
@@ -334,7 +334,7 @@ class TestKnowledgeBaseSearchMiddlewarePlanner:
return []
async def fake_build_scoped_filesystem(**kwargs):
- return {}
+ return {}, {}
monkeypatch.setattr(
"app.agents.new_chat.middleware.knowledge_search.search_knowledge_base",
diff --git a/surfsense_web/app/dashboard/[search_space_id]/documents/(manage)/components/DocumentsTableShell.tsx b/surfsense_web/app/dashboard/[search_space_id]/documents/(manage)/components/DocumentsTableShell.tsx
index 4e0c36267..1c246ed71 100644
--- a/surfsense_web/app/dashboard/[search_space_id]/documents/(manage)/components/DocumentsTableShell.tsx
+++ b/surfsense_web/app/dashboard/[search_space_id]/documents/(manage)/components/DocumentsTableShell.tsx
@@ -329,14 +329,15 @@ export function DocumentsTableShell({
const handleViewDocument = useCallback(async (doc: Document) => {
setViewingDoc(doc);
- if (doc.content) {
- setViewingContent(doc.content);
+ const preview = doc.content_preview || doc.content;
+ if (preview) {
+ setViewingContent(preview);
return;
}
setViewingLoading(true);
try {
const fullDoc = await documentsApiService.getDocument({ id: doc.id });
- setViewingContent(fullDoc.content);
+ setViewingContent(fullDoc.content_preview || fullDoc.content);
} catch (err) {
console.error("[DocumentsTableShell] Failed to fetch document content:", err);
setViewingContent("Failed to load document content.");
@@ -946,13 +947,36 @@ export function DocumentsTableShell({
WebkitMaskImage: `linear-gradient(to bottom, ${previewScrollPos === "top" ? "black" : "transparent"}, black 16px, black calc(100% - 16px), ${previewScrollPos === "bottom" ? "black" : "transparent"})`,
}}
>
- {viewingLoading ? (
-
-
-
- ) : (
-
- )}
+ {viewingLoading ? (
+
+
+
+ ) : (
+ <>
+
+ {viewingDoc && (
+
+ {
+ if (viewingDoc) {
+ openEditor({
+ documentId: viewingDoc.id,
+ searchSpaceId: Number(searchSpaceId),
+ title: viewingDoc.title,
+ });
+ handleCloseViewer();
+ }
+ }}
+ >
+
+ View full document
+
+
+ )}
+ >
+ )}
diff --git a/surfsense_web/app/dashboard/[search_space_id]/documents/(manage)/components/types.ts b/surfsense_web/app/dashboard/[search_space_id]/documents/(manage)/components/types.ts
index d87f7374b..88914bd4f 100644
--- a/surfsense_web/app/dashboard/[search_space_id]/documents/(manage)/components/types.ts
+++ b/surfsense_web/app/dashboard/[search_space_id]/documents/(manage)/components/types.ts
@@ -9,9 +9,9 @@ export type Document = {
id: number;
title: string;
document_type: DocumentType;
- // Optional: Only needed when viewing document details (lazy loaded)
document_metadata?: any;
content?: string;
+ content_preview?: string;
created_at: string;
search_space_id: number;
created_by_id?: string | null;
diff --git a/surfsense_web/components/editor-panel/editor-panel.tsx b/surfsense_web/components/editor-panel/editor-panel.tsx
index 3ea36f800..4b7079aef 100644
--- a/surfsense_web/components/editor-panel/editor-panel.tsx
+++ b/surfsense_web/components/editor-panel/editor-panel.tsx
@@ -1,12 +1,13 @@
"use client";
import { useAtomValue, useSetAtom } from "jotai";
-import { AlertCircle, XIcon } from "lucide-react";
+import { AlertCircle, Download, FileText, Loader2, XIcon } from "lucide-react";
import dynamic from "next/dynamic";
import { useCallback, useEffect, useRef, useState } from "react";
import { toast } from "sonner";
import { closeEditorPanelAtom, editorPanelAtom } from "@/atoms/editor/editor-panel.atom";
import { MarkdownViewer } from "@/components/markdown-viewer";
+import { Alert, AlertDescription } from "@/components/ui/alert";
import { Button } from "@/components/ui/button";
import { Drawer, DrawerContent, DrawerHandle, DrawerTitle } from "@/components/ui/drawer";
import { Skeleton } from "@/components/ui/skeleton";
@@ -18,11 +19,16 @@ const PlateEditor = dynamic(
{ ssr: false, loading: () => }
);
+const LARGE_DOCUMENT_THRESHOLD = 2 * 1024 * 1024; // 2MB
+
interface EditorContent {
document_id: number;
title: string;
document_type?: string;
source_markdown: string;
+ content_size_bytes?: number;
+ chunk_count?: number;
+ truncated?: boolean;
}
const EDITABLE_DOCUMENT_TYPES = new Set(["FILE", "NOTE"]);
@@ -62,6 +68,7 @@ export function EditorPanelContent({
const [isLoading, setIsLoading] = useState(true);
const [error, setError] = useState(null);
const [saving, setSaving] = useState(false);
+ const [downloading, setDownloading] = useState(false);
const [editedMarkdown, setEditedMarkdown] = useState(null);
const markdownRef = useRef("");
@@ -69,6 +76,8 @@ export function EditorPanelContent({
const changeCountRef = useRef(0);
const [displayTitle, setDisplayTitle] = useState(title || "Untitled");
+ const isLargeDocument = (editorDoc?.content_size_bytes ?? 0) > LARGE_DOCUMENT_THRESHOLD;
+
useEffect(() => {
let cancelled = false;
setIsLoading(true);
@@ -86,10 +95,12 @@ export function EditorPanelContent({
}
try {
- const response = await authenticatedFetch(
- `${process.env.NEXT_PUBLIC_FASTAPI_BACKEND_URL}/api/v1/search-spaces/${searchSpaceId}/documents/${documentId}/editor-content`,
- { method: "GET" }
+ const url = new URL(
+ `${process.env.NEXT_PUBLIC_FASTAPI_BACKEND_URL}/api/v1/search-spaces/${searchSpaceId}/documents/${documentId}/editor-content`
);
+ url.searchParams.set("max_length", String(LARGE_DOCUMENT_THRESHOLD));
+
+ const response = await authenticatedFetch(url.toString(), { method: "GET" });
if (cancelled) return;
@@ -175,7 +186,7 @@ export function EditorPanelContent({
}, [documentId, searchSpaceId]);
const isEditableType = editorDoc
- ? EDITABLE_DOCUMENT_TYPES.has(editorDoc.document_type ?? "")
+ ? EDITABLE_DOCUMENT_TYPES.has(editorDoc.document_type ?? "") && !isLargeDocument
: false;
return (
@@ -206,6 +217,57 @@ export function EditorPanelContent({
{error || "An unknown error occurred"}
+ ) : isLargeDocument ? (
+
+
+
+
+
+ This document is too large for the editor ({Math.round((editorDoc.content_size_bytes ?? 0) / 1024 / 1024)}MB, {editorDoc.chunk_count ?? 0} chunks). Showing a preview below.
+
+ {
+ setDownloading(true);
+ try {
+ const response = await authenticatedFetch(
+ `${process.env.NEXT_PUBLIC_FASTAPI_BACKEND_URL}/api/v1/search-spaces/${searchSpaceId}/documents/${documentId}/download-markdown`,
+ { method: "GET" }
+ );
+ if (!response.ok) throw new Error("Download failed");
+ const blob = await response.blob();
+ const url = URL.createObjectURL(blob);
+ const a = document.createElement("a");
+ a.href = url;
+ const disposition = response.headers.get("content-disposition");
+ const match = disposition?.match(/filename="(.+)"/);
+ a.download = match?.[1] ?? `${editorDoc.title || "document"}.md`;
+ document.body.appendChild(a);
+ a.click();
+ a.remove();
+ URL.revokeObjectURL(url);
+ toast.success("Download started");
+ } catch {
+ toast.error("Failed to download document");
+ } finally {
+ setDownloading(false);
+ }
+ }}
+ >
+ {downloading ? (
+
+ ) : (
+
+ )}
+ {downloading ? "Preparing..." : "Download .md"}
+
+
+
+
+
) : isEditableType ? (
(null);
const [isEditing, setIsEditing] = useState(false);
const [saving, setSaving] = useState(false);
+ const [downloading, setDownloading] = useState(false);
const [editedMarkdown, setEditedMarkdown] = useState(null);
const markdownRef = useRef("");
const initialLoadDone = useRef(false);
const changeCountRef = useRef(0);
+ const isLargeDocument = (doc?.content_size_bytes ?? 0) > LARGE_DOCUMENT_THRESHOLD;
+
useEffect(() => {
let cancelled = false;
setIsLoading(true);
@@ -72,10 +81,12 @@ export function DocumentTabContent({ documentId, searchSpaceId, title }: Documen
}
try {
- const response = await authenticatedFetch(
- `${process.env.NEXT_PUBLIC_FASTAPI_BACKEND_URL}/api/v1/search-spaces/${searchSpaceId}/documents/${documentId}/editor-content`,
- { method: "GET" }
+ const url = new URL(
+ `${process.env.NEXT_PUBLIC_FASTAPI_BACKEND_URL}/api/v1/search-spaces/${searchSpaceId}/documents/${documentId}/editor-content`
);
+ url.searchParams.set("max_length", String(LARGE_DOCUMENT_THRESHOLD));
+
+ const response = await authenticatedFetch(url.toString(), { method: "GET" });
if (cancelled) return;
@@ -173,9 +184,9 @@ export function DocumentTabContent({ documentId, searchSpaceId, title }: Documen
);
}
- const isEditable = EDITABLE_DOCUMENT_TYPES.has(doc.document_type ?? "");
+ const isEditable = EDITABLE_DOCUMENT_TYPES.has(doc.document_type ?? "") && !isLargeDocument;
- if (isEditing) {
+ if (isEditing && !isLargeDocument) {
return (
@@ -236,7 +247,60 @@ export function DocumentTabContent({ documentId, searchSpaceId, title }: Documen
-
+ {isLargeDocument ? (
+ <>
+
+
+
+
+ This document is too large for the editor ({Math.round((doc.content_size_bytes ?? 0) / 1024 / 1024)}MB, {doc.chunk_count ?? 0} chunks). Showing a preview below.
+
+ {
+ setDownloading(true);
+ try {
+ const response = await authenticatedFetch(
+ `${process.env.NEXT_PUBLIC_FASTAPI_BACKEND_URL}/api/v1/search-spaces/${searchSpaceId}/documents/${documentId}/download-markdown`,
+ { method: "GET" }
+ );
+ if (!response.ok) throw new Error("Download failed");
+ const blob = await response.blob();
+ const url = URL.createObjectURL(blob);
+ const a = document.createElement("a");
+ a.href = url;
+ const disposition = response.headers.get("content-disposition");
+ const match = disposition?.match(/filename="(.+)"/);
+ a.download = match?.[1] ?? `${doc.title || "document"}.md`;
+ document.body.appendChild(a);
+ a.click();
+ a.remove();
+ URL.revokeObjectURL(url);
+ toast.success("Download started");
+ } catch {
+ toast.error("Failed to download document");
+ } finally {
+ setDownloading(false);
+ }
+ }}
+ >
+ {downloading ? (
+
+ ) : (
+
+ )}
+ {downloading ? "Preparing..." : "Download .md"}
+
+
+
+
+ >
+ ) : (
+
+ )}
diff --git a/surfsense_web/components/markdown-viewer.tsx b/surfsense_web/components/markdown-viewer.tsx
index e22df8998..abd999301 100644
--- a/surfsense_web/components/markdown-viewer.tsx
+++ b/surfsense_web/components/markdown-viewer.tsx
@@ -15,6 +15,7 @@ const math = createMathPlugin({
interface MarkdownViewerProps {
content: string;
className?: string;
+ maxLength?: number;
}
/**
@@ -79,8 +80,10 @@ function convertLatexDelimiters(content: string): string {
return content;
}
-export function MarkdownViewer({ content, className }: MarkdownViewerProps) {
- const processedContent = convertLatexDelimiters(stripOuterMarkdownFence(content));
+export function MarkdownViewer({ content, className, maxLength }: MarkdownViewerProps) {
+ const isTruncated = maxLength != null && content.length > maxLength;
+ const displayContent = isTruncated ? content.slice(0, maxLength) : content;
+ const processedContent = convertLatexDelimiters(stripOuterMarkdownFence(displayContent));
const components: StreamdownProps["components"] = {
p: ({ children, ...props }) => (
@@ -171,6 +174,11 @@ export function MarkdownViewer({ content, className }: MarkdownViewerProps) {
>
{processedContent}
+ {isTruncated && (
+
+ Content truncated ({Math.round(content.length / 1024)}KB total). Showing first {Math.round(maxLength / 1024)}KB.
+
+ )}
);
}
diff --git a/surfsense_web/components/new-chat/source-detail-panel.tsx b/surfsense_web/components/new-chat/source-detail-panel.tsx
index b02b2e217..c17616c53 100644
--- a/surfsense_web/components/new-chat/source-detail-panel.tsx
+++ b/surfsense_web/components/new-chat/source-detail-panel.tsx
@@ -1,7 +1,7 @@
"use client";
import { useQuery } from "@tanstack/react-query";
-import { BookOpen, ChevronDown, ExternalLink, FileText, Hash, Sparkles, X } from "lucide-react";
+import { BookOpen, ChevronDown, ChevronUp, ExternalLink, FileText, Hash, Loader2, Sparkles, X } from "lucide-react";
import { AnimatePresence, motion, useReducedMotion } from "motion/react";
import { useTranslations } from "next-intl";
import type React from "react";
@@ -10,7 +10,6 @@ import { createPortal } from "react-dom";
import { MarkdownViewer } from "@/components/markdown-viewer";
import { Badge } from "@/components/ui/badge";
import { Button } from "@/components/ui/button";
-import { Collapsible, CollapsibleContent, CollapsibleTrigger } from "@/components/ui/collapsible";
import { ScrollArea } from "@/components/ui/scroll-area";
import { Spinner } from "@/components/ui/spinner";
import type {
@@ -48,7 +47,8 @@ const formatDocumentType = (type: string) => {
// which break auto-scroll functionality
interface ChunkCardProps {
chunk: { id: number; content: string };
- index: number;
+ localIndex: number;
+ chunkNumber: number;
totalChunks: number;
isCited: boolean;
isActive: boolean;
@@ -56,11 +56,11 @@ interface ChunkCardProps {
}
const ChunkCard = memo(
- forwardRef(({ chunk, index, totalChunks, isCited }, ref) => {
+ forwardRef(({ chunk, localIndex, chunkNumber, totalChunks, isCited }, ref) => {
return (
- {/* Cited indicator glow effect */}
{isCited &&
}
- {/* Header */}
- {index + 1}
+ {chunkNumber}
-
of {totalChunks} chunks
+
Chunk {chunkNumber} of {totalChunks}
{isCited && (
@@ -94,9 +92,8 @@ const ChunkCard = memo(
)}
- {/* Content */}
-
+
);
@@ -118,7 +115,6 @@ export function SourceDetailPanel({
const t = useTranslations("dashboard");
const scrollAreaRef = useRef(null);
const hasScrolledRef = useRef(false); // Use ref to avoid stale closures
- const [summaryOpen, setSummaryOpen] = useState(false);
const [activeChunkIndex, setActiveChunkIndex] = useState(null);
const [mounted, setMounted] = useState(false);
const [_hasScrolledToCited, setHasScrolledToCited] = useState(false);
@@ -140,20 +136,88 @@ export function SourceDetailPanel({
if (isDocsChunk) {
return documentsApiService.getSurfsenseDocByChunk(chunkId);
}
- return documentsApiService.getDocumentByChunk({ chunk_id: chunkId });
+ return documentsApiService.getDocumentByChunk({ chunk_id: chunkId, chunk_window: 5 });
},
enabled: !!chunkId && open,
staleTime: 5 * 60 * 1000,
});
+ const totalChunks = (documentData && "total_chunks" in documentData)
+ ? (documentData.total_chunks ?? documentData.chunks.length)
+ : (documentData?.chunks?.length ?? 0);
+ const [beforeChunks, setBeforeChunks] = useState>([]);
+ const [afterChunks, setAfterChunks] = useState>([]);
+ const [loadingBefore, setLoadingBefore] = useState(false);
+ const [loadingAfter, setLoadingAfter] = useState(false);
+
+ useEffect(() => {
+ setBeforeChunks([]);
+ setAfterChunks([]);
+ }, [chunkId, open]);
+
+ const chunkStartIndex = (documentData && "chunk_start_index" in documentData)
+ ? (documentData.chunk_start_index ?? 0) : 0;
+ const initialChunks = documentData?.chunks ?? [];
+ const allChunks = [...beforeChunks, ...initialChunks, ...afterChunks];
+ const absoluteStart = chunkStartIndex - beforeChunks.length;
+ const absoluteEnd = chunkStartIndex + initialChunks.length + afterChunks.length;
+ const canLoadBefore = absoluteStart > 0;
+ const canLoadAfter = absoluteEnd < totalChunks;
+
+ const EXPAND_SIZE = 10;
+
+ const loadBefore = useCallback(async () => {
+ if (!documentData || !("search_space_id" in documentData) || !canLoadBefore) return;
+ setLoadingBefore(true);
+ try {
+ const count = Math.min(EXPAND_SIZE, absoluteStart);
+ const result = await documentsApiService.getDocumentChunks({
+ document_id: documentData.id,
+ page: 0,
+ page_size: count,
+ start_offset: absoluteStart - count,
+ });
+ const existingIds = new Set(allChunks.map(c => c.id));
+ const newChunks = result.items
+ .filter(c => !existingIds.has(c.id))
+ .map(c => ({ id: c.id, content: c.content, created_at: c.created_at }));
+ setBeforeChunks(prev => [...newChunks, ...prev]);
+ } catch (err) {
+ console.error("Failed to load earlier chunks:", err);
+ } finally {
+ setLoadingBefore(false);
+ }
+ }, [documentData, absoluteStart, canLoadBefore, allChunks]);
+
+ const loadAfter = useCallback(async () => {
+ if (!documentData || !("search_space_id" in documentData) || !canLoadAfter) return;
+ setLoadingAfter(true);
+ try {
+ const result = await documentsApiService.getDocumentChunks({
+ document_id: documentData.id,
+ page: 0,
+ page_size: EXPAND_SIZE,
+ start_offset: absoluteEnd,
+ });
+ const existingIds = new Set(allChunks.map(c => c.id));
+ const newChunks = result.items
+ .filter(c => !existingIds.has(c.id))
+ .map(c => ({ id: c.id, content: c.content, created_at: c.created_at }));
+ setAfterChunks(prev => [...prev, ...newChunks]);
+ } catch (err) {
+ console.error("Failed to load later chunks:", err);
+ } finally {
+ setLoadingAfter(false);
+ }
+ }, [documentData, absoluteEnd, canLoadAfter, allChunks]);
+
const isDirectRenderSource =
sourceType === "TAVILY_API" ||
sourceType === "LINKUP_API" ||
sourceType === "SEARXNG_API" ||
sourceType === "BAIDU_SEARCH_API";
- // Find cited chunk index
- const citedChunkIndex = documentData?.chunks?.findIndex((chunk) => chunk.id === chunkId) ?? -1;
+ const citedChunkIndex = allChunks.findIndex((chunk) => chunk.id === chunkId);
// Simple scroll function that scrolls to a chunk by index
const scrollToChunkByIndex = useCallback(
@@ -336,12 +400,12 @@ export function SourceDetailPanel({
{documentData && "document_type" in documentData
? formatDocumentType(documentData.document_type)
: sourceType && formatDocumentType(sourceType)}
- {documentData?.chunks && (
-
- • {documentData.chunks.length} chunk
- {documentData.chunks.length !== 1 ? "s" : ""}
-
- )}
+ {totalChunks > 0 && (
+
+ • {totalChunks} chunk{totalChunks !== 1 ? "s" : ""}
+ {allChunks.length < totalChunks && ` (showing ${allChunks.length})`}
+
+ )}
@@ -450,7 +514,7 @@ export function SourceDetailPanel({
{!isDirectRenderSource && documentData && (
{/* Chunk Navigation Sidebar */}
- {documentData.chunks.length > 1 && (
+ {allChunks.length > 1 && (
- {documentData.chunks.map((chunk, idx) => {
+ {allChunks.map((chunk, idx) => {
+ const absNum = absoluteStart + idx + 1;
const isCited = chunk.id === chunkId;
const isActive = activeChunkIndex === idx;
return (
@@ -478,9 +543,9 @@ export function SourceDetailPanel({
? "bg-muted text-foreground"
: "bg-muted/50 text-muted-foreground hover:bg-muted hover:text-foreground"
)}
- title={isCited ? `Chunk ${idx + 1} (Cited)` : `Chunk ${idx + 1}`}
+ title={isCited ? `Chunk ${absNum} (Cited)` : `Chunk ${absNum}`}
>
- {idx + 1}
+ {absNum}
{isCited && (
@@ -524,44 +589,11 @@ export function SourceDetailPanel({
)}
- {/* Summary Collapsible */}
- {documentData.content && (
-
-
-
-
-
- Document Summary
-
-
-
-
-
-
-
-
-
-
-
-
- )}
-
{/* Chunks Header */}
-
+
- Content Chunks
+ Chunks {absoluteStart + 1}–{absoluteEnd} of {totalChunks}
{citedChunkIndex !== -1 && (
+ {/* Load Earlier */}
+ {canLoadBefore && (
+
+
+ {loadingBefore ? (
+
+ ) : (
+
+ )}
+ {loadingBefore
+ ? "Loading..."
+ : `Load ${Math.min(EXPAND_SIZE, absoluteStart)} earlier chunks`}
+
+
+ )}
+
{/* Chunks */}
- {documentData.chunks.map((chunk, idx) => {
+ {allChunks.map((chunk, idx) => {
const isCited = chunk.id === chunkId;
+ const chunkNumber = absoluteStart + idx + 1;
return (
30}
+ disableLayoutAnimation={allChunks.length > 30}
/>
);
})}
+
+ {/* Load Later */}
+ {canLoadAfter && (
+
+
+ {loadingAfter ? (
+
+ ) : (
+
+ )}
+ {loadingAfter
+ ? "Loading..."
+ : `Load ${Math.min(EXPAND_SIZE, totalChunks - absoluteEnd)} later chunks`}
+
+
+ )}
diff --git a/surfsense_web/components/sources/DocumentUploadTab.tsx b/surfsense_web/components/sources/DocumentUploadTab.tsx
index 6817b19db..faa042d8e 100644
--- a/surfsense_web/components/sources/DocumentUploadTab.tsx
+++ b/surfsense_web/components/sources/DocumentUploadTab.tsx
@@ -1,10 +1,10 @@
"use client";
import { useAtom } from "jotai";
-import { CheckCircle2, FileType, Info, Upload, X } from "lucide-react";
+import { CheckCircle2, FileType, FolderOpen, Info, Upload, X } from "lucide-react";
import { useTranslations } from "next-intl";
-import { useCallback, useMemo, useRef, useState } from "react";
+import { type ChangeEvent, useCallback, useMemo, useRef, useState } from "react";
import { useDropzone } from "react-dropzone";
import { toast } from "sonner";
import { uploadDocumentMutationAtom } from "@/atoms/documents/document-mutation.atoms";
@@ -51,6 +51,7 @@ const commonTypes = {
"application/vnd.openxmlformats-officedocument.presentationml.presentation": [".pptx"],
"text/html": [".html", ".htm"],
"text/csv": [".csv"],
+ "text/tab-separated-values": [".tsv"],
"image/jpeg": [".jpg", ".jpeg"],
"image/png": [".png"],
"image/bmp": [".bmp"],
@@ -76,7 +77,6 @@ const FILE_TYPE_CONFIG: Record> = {
"application/rtf": [".rtf"],
"application/xml": [".xml"],
"application/epub+zip": [".epub"],
- "text/tab-separated-values": [".tsv"],
"text/html": [".html", ".htm", ".web"],
"image/gif": [".gif"],
"image/svg+xml": [".svg"],
@@ -102,7 +102,6 @@ const FILE_TYPE_CONFIG: Record> = {
"application/vnd.ms-powerpoint": [".ppt"],
"text/x-rst": [".rst"],
"application/rtf": [".rtf"],
- "text/tab-separated-values": [".tsv"],
"application/vnd.ms-excel": [".xls"],
"application/xml": [".xml"],
...audioFileTypes,
@@ -116,10 +115,8 @@ interface FileWithId {
const cardClass = "border border-border bg-slate-400/5 dark:bg-white/5";
-// Upload limits — files are sent in batches of 5 to avoid proxy timeouts
-const MAX_FILES = 50;
-const MAX_TOTAL_SIZE_MB = 200;
-const MAX_TOTAL_SIZE_BYTES = MAX_TOTAL_SIZE_MB * 1024 * 1024;
+const MAX_FILE_SIZE_MB = 500;
+const MAX_FILE_SIZE_BYTES = MAX_FILE_SIZE_MB * 1024 * 1024;
export function DocumentUploadTab({
searchSpaceId,
@@ -134,6 +131,7 @@ export function DocumentUploadTab({
const [uploadDocumentMutation] = useAtom(uploadDocumentMutationAtom);
const { mutate: uploadDocuments, isPending: isUploading } = uploadDocumentMutation;
const fileInputRef = useRef(null);
+ const folderInputRef = useRef(null);
const acceptedFileTypes = useMemo(() => {
const etlService = process.env.NEXT_PUBLIC_ETL_SERVICE;
@@ -145,49 +143,76 @@ export function DocumentUploadTab({
[acceptedFileTypes]
);
- const onDrop = useCallback(
- (acceptedFiles: File[]) => {
+ const supportedExtensionsSet = useMemo(
+ () => new Set(supportedExtensions.map((ext) => ext.toLowerCase())),
+ [supportedExtensions]
+ );
+
+ const addFiles = useCallback(
+ (incoming: File[]) => {
+ const oversized = incoming.filter((f) => f.size > MAX_FILE_SIZE_BYTES);
+ if (oversized.length > 0) {
+ toast.error(t("file_too_large"), {
+ description: t("file_too_large_desc", {
+ name: oversized[0].name,
+ maxMB: MAX_FILE_SIZE_MB,
+ }),
+ });
+ }
+ const valid = incoming.filter((f) => f.size <= MAX_FILE_SIZE_BYTES);
+ if (valid.length === 0) return;
+
setFiles((prev) => {
- const newEntries = acceptedFiles.map((f) => ({
+ const newEntries = valid.map((f) => ({
id: crypto.randomUUID?.() ?? `file-${Date.now()}-${Math.random().toString(36)}`,
file: f,
}));
- const newFiles = [...prev, ...newEntries];
-
- if (newFiles.length > MAX_FILES) {
- toast.error(t("max_files_exceeded"), {
- description: t("max_files_exceeded_desc", { max: MAX_FILES }),
- });
- return prev;
- }
-
- const newTotalSize = newFiles.reduce((sum, entry) => sum + entry.file.size, 0);
- if (newTotalSize > MAX_TOTAL_SIZE_BYTES) {
- toast.error(t("max_size_exceeded"), {
- description: t("max_size_exceeded_desc", { max: MAX_TOTAL_SIZE_MB }),
- });
- return prev;
- }
-
- return newFiles;
+ return [...prev, ...newEntries];
});
},
[t]
);
+ const onDrop = useCallback(
+ (acceptedFiles: File[]) => {
+ addFiles(acceptedFiles);
+ },
+ [addFiles]
+ );
+
const { getRootProps, getInputProps, isDragActive } = useDropzone({
onDrop,
accept: acceptedFileTypes,
- maxSize: 50 * 1024 * 1024, // 50MB per file
+ maxSize: MAX_FILE_SIZE_BYTES,
noClick: false,
- disabled: files.length >= MAX_FILES,
});
- // Handle file input click to prevent event bubbling that might reopen dialog
const handleFileInputClick = useCallback((e: React.MouseEvent) => {
e.stopPropagation();
}, []);
+ const handleFolderChange = useCallback(
+ (e: ChangeEvent) => {
+ const fileList = e.target.files;
+ if (!fileList || fileList.length === 0) return;
+
+ const folderFiles = Array.from(fileList).filter((f) => {
+ const ext = f.name.includes(".") ? `.${f.name.split(".").pop()?.toLowerCase()}` : "";
+ return ext !== "" && supportedExtensionsSet.has(ext);
+ });
+
+ if (folderFiles.length === 0) {
+ toast.error(t("no_supported_files_in_folder"));
+ e.target.value = "";
+ return;
+ }
+
+ addFiles(folderFiles);
+ e.target.value = "";
+ },
+ [addFiles, supportedExtensionsSet, t]
+ );
+
const formatFileSize = (bytes: number) => {
if (bytes === 0) return "0 Bytes";
const k = 1024;
@@ -198,15 +223,6 @@ export function DocumentUploadTab({
const totalFileSize = files.reduce((total, entry) => total + entry.file.size, 0);
- // Check if limits are reached
- const isFileCountLimitReached = files.length >= MAX_FILES;
- const isSizeLimitReached = totalFileSize >= MAX_TOTAL_SIZE_BYTES;
- const remainingFiles = MAX_FILES - files.length;
- const remainingSizeMB = Math.max(
- 0,
- (MAX_TOTAL_SIZE_BYTES - totalFileSize) / (1024 * 1024)
- ).toFixed(1);
-
// Track accordion state changes
const handleAccordionChange = useCallback(
(value: string) => {
@@ -257,11 +273,21 @@ export function DocumentUploadTab({
- {t("file_size_limit")}{" "}
- {t("upload_limits", { maxFiles: MAX_FILES, maxSizeMB: MAX_TOTAL_SIZE_MB })}
+ {t("file_size_limit", { maxMB: MAX_FILE_SIZE_MB })}{" "}
+ {t("upload_limits")}
+ {/* Hidden folder input */}
+ )}
+ />
+
@@ -269,11 +295,7 @@ export function DocumentUploadTab({
- {isFileCountLimitReached ? (
-
-
-
-
- {t("file_limit_reached")}
-
-
- {t("file_limit_reached_desc", { max: MAX_FILES })}
-
-
-
- ) : isDragActive ? (
+ {isDragActive ? (
{t("drop_files")}
@@ -305,29 +315,35 @@ export function DocumentUploadTab({
{t("drag_drop")}
{t("or_browse")}
- {files.length > 0 && (
-
- {t("remaining_capacity", { files: remainingFiles, sizeMB: remainingSizeMB })}
-
- )}
-
- )}
- {!isFileCountLimitReached && (
-
- {
- e.stopPropagation();
- e.preventDefault();
- fileInputRef.current?.click();
- }}
- >
- {t("browse_files")}
-
)}
+
+ {
+ e.stopPropagation();
+ e.preventDefault();
+ fileInputRef.current?.click();
+ }}
+ >
+ {t("browse_files")}
+
+ {
+ e.stopPropagation();
+ e.preventDefault();
+ folderInputRef.current?.click();
+ }}
+ >
+
+ {t("browse_folder")}
+
+
diff --git a/surfsense_web/contracts/types/document.types.ts b/surfsense_web/contracts/types/document.types.ts
index 1a3326bae..f5431aecb 100644
--- a/surfsense_web/contracts/types/document.types.ts
+++ b/surfsense_web/contracts/types/document.types.ts
@@ -39,6 +39,7 @@ export const document = z.object({
document_type: documentTypeEnum,
document_metadata: z.record(z.string(), z.any()),
content: z.string(),
+ content_preview: z.string().optional().default(""),
content_hash: z.string(),
unique_identifier_hash: z.string().nullable(),
created_at: z.string(),
@@ -69,6 +70,8 @@ export const documentWithChunks = document.extend({
created_at: z.string(),
})
),
+ total_chunks: z.number().optional().default(0),
+ chunk_start_index: z.number().optional().default(0),
});
/**
@@ -243,10 +246,36 @@ export const getDocumentTypeCountsResponse = z.record(z.string(), z.number());
*/
export const getDocumentByChunkRequest = z.object({
chunk_id: z.number(),
+ chunk_window: z.number().optional(),
});
export const getDocumentByChunkResponse = documentWithChunks;
+/**
+ * Get paginated chunks for a document
+ */
+export const getDocumentChunksRequest = z.object({
+ document_id: z.number(),
+ page: z.number().optional().default(0), // defaults to 0 when omitted
+ page_size: z.number().optional().default(20), // defaults to 20 when omitted
+ start_offset: z.number().optional(), // no default; NOTE(review): exact semantics are defined by the backend endpoint — confirm
+});
+
+export const chunkRead = z.object({ // shape of one element of getDocumentChunksResponse.items
+ id: z.number(),
+ content: z.string(),
+ document_id: z.number(),
+ created_at: z.string(), // validated as a plain string; no date parsing happens in this schema
+});
+
+export const getDocumentChunksResponse = z.object({
+ items: z.array(chunkRead), // each element is validated against chunkRead
+ total: z.number(), // NOTE(review): presumably the document's total chunk count — confirm against the backend
+ page: z.number(),
+ page_size: z.number(),
+ has_more: z.boolean(), // NOTE(review): presumably true when further pages exist — confirm against the backend
+});
+
/**
* Get Surfsense docs by chunk
*/
@@ -328,3 +357,6 @@ export type GetSurfsenseDocsByChunkRequest = z.infer;
export type GetSurfsenseDocsRequest = z.infer;
export type GetSurfsenseDocsResponse = z.infer;
+export type GetDocumentChunksRequest = z.infer<typeof getDocumentChunksRequest>;
+export type GetDocumentChunksResponse = z.infer<typeof getDocumentChunksResponse>;
+export type ChunkRead = z.infer<typeof chunkRead>;
diff --git a/surfsense_web/lib/apis/documents-api.service.ts b/surfsense_web/lib/apis/documents-api.service.ts
index 14a247032..71fa58852 100644
--- a/surfsense_web/lib/apis/documents-api.service.ts
+++ b/surfsense_web/lib/apis/documents-api.service.ts
@@ -6,6 +6,7 @@ import {
deleteDocumentRequest,
deleteDocumentResponse,
type GetDocumentByChunkRequest,
+ type GetDocumentChunksRequest,
type GetDocumentRequest,
type GetDocumentsRequest,
type GetDocumentsStatusRequest,
@@ -13,6 +14,8 @@ import {
type GetSurfsenseDocsRequest,
getDocumentByChunkRequest,
getDocumentByChunkResponse,
+ getDocumentChunksRequest,
+ getDocumentChunksResponse,
getDocumentRequest,
getDocumentResponse,
getDocumentsRequest,
@@ -295,23 +298,52 @@ class DocumentsApiService {
};
/**
- * Get document by chunk ID (includes all chunks)
+ * Get document by chunk ID (includes a window of chunks around the cited one)
*/
getDocumentByChunk = async (request: GetDocumentByChunkRequest) => {
- // Validate the request
const parsedRequest = getDocumentByChunkRequest.safeParse(request);
if (!parsedRequest.success) {
console.error("Invalid request:", parsedRequest.error);
- // Format a user friendly error message
const errorMessage = parsedRequest.error.issues.map((issue) => issue.message).join(", ");
throw new ValidationError(`Invalid request: ${errorMessage}`);
}
+ const params = new URLSearchParams(); // built from the validated data rather than the raw argument
+ if (parsedRequest.data.chunk_window != null) { // chunk_window is optional; omit the query parameter when absent
+ params.set("chunk_window", String(parsedRequest.data.chunk_window));
+ }
+ const qs = params.toString();
+ const url = `/api/v1/documents/by-chunk/${parsedRequest.data.chunk_id}${qs ? `?${qs}` : ""}`;
+
+ return baseApiService.get(url, getDocumentByChunkResponse);
+ };
+
+ /**
+ * Get paginated chunks for a document
+ */
+ getDocumentChunks = async (request: GetDocumentChunksRequest) => { // validates the request, then issues a GET for one page of the document's chunks
+ const parsedRequest = getDocumentChunksRequest.safeParse(request);
+
+ if (!parsedRequest.success) {
+ console.error("Invalid request:", parsedRequest.error);
+
+ const errorMessage = parsedRequest.error.issues.map((issue) => issue.message).join(", ");
+ throw new ValidationError(`Invalid request: ${errorMessage}`);
+ }
+
+ const params = new URLSearchParams({ // page and page_size are always present here: the request schema supplies defaults
+ page: String(parsedRequest.data.page),
+ page_size: String(parsedRequest.data.page_size),
+ });
+ if (parsedRequest.data.start_offset != null) { // start_offset has no default, so it is sent only when the caller provided it
+ params.set("start_offset", String(parsedRequest.data.start_offset));
+ }
+
return baseApiService.get(
- `/api/v1/documents/by-chunk/${request.chunk_id}`,
- getDocumentByChunkResponse
+ `/api/v1/documents/${parsedRequest.data.document_id}/chunks?${params}`,
+ getDocumentChunksResponse
);
};
diff --git a/surfsense_web/messages/en.json b/surfsense_web/messages/en.json
index 53f80ea5f..cacaec557 100644
--- a/surfsense_web/messages/en.json
+++ b/surfsense_web/messages/en.json
@@ -376,12 +376,13 @@
"upload_documents": {
"title": "Upload Documents",
"subtitle": "Upload your files to make them searchable and accessible through AI-powered conversations.",
- "file_size_limit": "Maximum file size: 50MB per file.",
- "upload_limits": "Upload limit: {maxFiles} files, {maxSizeMB}MB total.",
- "drop_files": "Drop files here",
- "drag_drop": "Drag & drop files here",
- "or_browse": "or click to browse",
+ "file_size_limit": "Maximum file size: {maxMB}MB per file.",
+ "upload_limits": "Upload files or entire folders",
+ "drop_files": "Drop files or folders here",
+ "drag_drop": "Drag & drop files or folders here",
+ "or_browse": "or click to browse files and folders",
"browse_files": "Browse Files",
+ "browse_folder": "Browse Folder",
"selected_files": "Selected Files ({count})",
"total_size": "Total size",
"clear_all": "Clear all",
@@ -394,13 +395,9 @@
"upload_error_desc": "Error uploading files",
"supported_file_types": "Supported File Types",
"file_types_desc": "These file types are supported based on your current ETL service configuration.",
- "max_files_exceeded": "File Limit Exceeded",
- "max_files_exceeded_desc": "You can upload a maximum of {max} files at a time.",
- "max_size_exceeded": "Size Limit Exceeded",
- "max_size_exceeded_desc": "Total file size cannot exceed {max}MB.",
- "file_limit_reached": "Maximum Files Reached",
- "file_limit_reached_desc": "Remove some files to add more (max {max} files).",
- "remaining_capacity": "{files} files remaining • {sizeMB}MB available"
+ "file_too_large": "File Too Large",
+ "file_too_large_desc": "\"{name}\" exceeds the {maxMB}MB per-file limit.",
+ "no_supported_files_in_folder": "No supported file types found in the selected folder."
},
"add_webpage": {
"title": "Add Webpages for Crawling",
diff --git a/surfsense_web/messages/es.json b/surfsense_web/messages/es.json
index 36e627295..7670e76df 100644
--- a/surfsense_web/messages/es.json
+++ b/surfsense_web/messages/es.json
@@ -376,12 +376,13 @@
"upload_documents": {
"title": "Subir documentos",
"subtitle": "Sube tus archivos para hacerlos buscables y accesibles a través de conversaciones con IA.",
- "file_size_limit": "Tamaño máximo de archivo: 50 MB por archivo.",
- "upload_limits": "Límite de subida: {maxFiles} archivos, {maxSizeMB} MB en total.",
- "drop_files": "Suelta los archivos aquí",
- "drag_drop": "Arrastra y suelta archivos aquí",
- "or_browse": "o haz clic para explorar",
+ "file_size_limit": "Tamaño máximo de archivo: {maxMB} MB por archivo.",
+ "upload_limits": "Sube archivos o carpetas enteras",
+ "drop_files": "Suelta archivos o carpetas aquí",
+ "drag_drop": "Arrastra y suelta archivos o carpetas aquí",
+ "or_browse": "o haz clic para explorar archivos y carpetas",
"browse_files": "Explorar archivos",
+ "browse_folder": "Explorar carpeta",
"selected_files": "Archivos seleccionados ({count})",
"total_size": "Tamaño total",
"clear_all": "Limpiar todo",
@@ -394,13 +395,9 @@
"upload_error_desc": "Error al subir archivos",
"supported_file_types": "Tipos de archivo soportados",
"file_types_desc": "Estos tipos de archivo son soportados según la configuración actual de tu servicio ETL.",
- "max_files_exceeded": "Límite de archivos excedido",
- "max_files_exceeded_desc": "Puedes subir un máximo de {max} archivos a la vez.",
- "max_size_exceeded": "Límite de tamaño excedido",
- "max_size_exceeded_desc": "El tamaño total de los archivos no puede exceder {max} MB.",
- "file_limit_reached": "Máximo de archivos alcanzado",
- "file_limit_reached_desc": "Elimina algunos archivos para agregar más (máximo {max} archivos).",
- "remaining_capacity": "{files} archivos restantes • {sizeMB} MB disponibles"
+ "file_too_large": "Archivo demasiado grande",
+ "file_too_large_desc": "\"{name}\" excede el límite de {maxMB} MB por archivo.",
+ "no_supported_files_in_folder": "No se encontraron tipos de archivo compatibles en la carpeta seleccionada."
},
"add_webpage": {
"title": "Agregar páginas web para rastreo",
diff --git a/surfsense_web/messages/hi.json b/surfsense_web/messages/hi.json
index fd51acdc2..cbcff0b30 100644
--- a/surfsense_web/messages/hi.json
+++ b/surfsense_web/messages/hi.json
@@ -376,12 +376,13 @@
"upload_documents": {
"title": "दस्तावेज़ अपलोड करें",
"subtitle": "AI-संचालित बातचीत के माध्यम से अपनी फ़ाइलों को खोजने योग्य और सुलभ बनाने के लिए अपलोड करें।",
- "file_size_limit": "अधिकतम फ़ाइल आकार: प्रति फ़ाइल 50MB।",
- "upload_limits": "अपलोड सीमा: {maxFiles} फ़ाइलें, कुल {maxSizeMB}MB।",
- "drop_files": "फ़ाइलें यहां छोड़ें",
- "drag_drop": "फ़ाइलें यहां खींचें और छोड़ें",
- "or_browse": "या ब्राउज़ करने के लिए क्लिक करें",
+ "file_size_limit": "अधिकतम फ़ाइल आकार: प्रति फ़ाइल {maxMB}MB।",
+ "upload_limits": "फ़ाइलें या पूरे फ़ोल्डर अपलोड करें",
+ "drop_files": "फ़ाइलें या फ़ोल्डर यहां छोड़ें",
+ "drag_drop": "फ़ाइलें या फ़ोल्डर यहां खींचें और छोड़ें",
+ "or_browse": "या फ़ाइलें और फ़ोल्डर ब्राउज़ करने के लिए क्लिक करें",
"browse_files": "फ़ाइलें ब्राउज़ करें",
+ "browse_folder": "फ़ोल्डर ब्राउज़ करें",
"selected_files": "चयनित फ़ाइलें ({count})",
"total_size": "कुल आकार",
"clear_all": "सभी साफ करें",
@@ -394,13 +395,9 @@
"upload_error_desc": "फ़ाइलें अपलोड करने में त्रुटि",
"supported_file_types": "समर्थित फ़ाइल प्रकार",
"file_types_desc": "ये फ़ाइल प्रकार आपकी वर्तमान ETL सेवा कॉन्फ़िगरेशन के आधार पर समर्थित हैं।",
- "max_files_exceeded": "फ़ाइल सीमा पार हो गई",
- "max_files_exceeded_desc": "आप एक बार में अधिकतम {max} फ़ाइलें अपलोड कर सकते हैं।",
- "max_size_exceeded": "आकार सीमा पार हो गई",
- "max_size_exceeded_desc": "कुल फ़ाइल आकार {max}MB से अधिक नहीं हो सकता।",
- "file_limit_reached": "अधिकतम फ़ाइलें पहुंच गई",
- "file_limit_reached_desc": "और जोड़ने के लिए कुछ फ़ाइलें हटाएं (अधिकतम {max} फ़ाइलें)।",
- "remaining_capacity": "{files} फ़ाइलें शेष • {sizeMB}MB उपलब्ध"
+ "file_too_large": "फ़ाइल बहुत बड़ी है",
+ "file_too_large_desc": "\"{name}\" प्रति फ़ाइल {maxMB}MB की सीमा से अधिक है।",
+ "no_supported_files_in_folder": "चयनित फ़ोल्डर में कोई समर्थित फ़ाइल प्रकार नहीं मिला।"
},
"add_webpage": {
"title": "क्रॉलिंग के लिए वेबपेज जोड़ें",
diff --git a/surfsense_web/messages/pt.json b/surfsense_web/messages/pt.json
index e26499f90..ec72ef0da 100644
--- a/surfsense_web/messages/pt.json
+++ b/surfsense_web/messages/pt.json
@@ -376,12 +376,13 @@
"upload_documents": {
"title": "Enviar documentos",
"subtitle": "Envie seus arquivos para torná-los pesquisáveis e acessíveis através de conversas com IA.",
- "file_size_limit": "Tamanho máximo do arquivo: 50 MB por arquivo.",
- "upload_limits": "Limite de envio: {maxFiles} arquivos, {maxSizeMB} MB no total.",
- "drop_files": "Solte os arquivos aqui",
- "drag_drop": "Arraste e solte arquivos aqui",
- "or_browse": "ou clique para navegar",
+ "file_size_limit": "Tamanho máximo do arquivo: {maxMB} MB por arquivo.",
+ "upload_limits": "Envie arquivos ou pastas inteiras",
+ "drop_files": "Solte arquivos ou pastas aqui",
+ "drag_drop": "Arraste e solte arquivos ou pastas aqui",
+ "or_browse": "ou clique para navegar arquivos e pastas",
"browse_files": "Navegar arquivos",
+ "browse_folder": "Navegar pasta",
"selected_files": "Arquivos selecionados ({count})",
"total_size": "Tamanho total",
"clear_all": "Limpar tudo",
@@ -394,13 +395,9 @@
"upload_error_desc": "Erro ao enviar arquivos",
"supported_file_types": "Tipos de arquivo suportados",
"file_types_desc": "Estes tipos de arquivo são suportados com base na configuração atual do seu serviço ETL.",
- "max_files_exceeded": "Limite de arquivos excedido",
- "max_files_exceeded_desc": "Você pode enviar no máximo {max} arquivos de uma vez.",
- "max_size_exceeded": "Limite de tamanho excedido",
- "max_size_exceeded_desc": "O tamanho total dos arquivos não pode exceder {max} MB.",
- "file_limit_reached": "Máximo de arquivos atingido",
- "file_limit_reached_desc": "Remova alguns arquivos para adicionar mais (máximo {max} arquivos).",
- "remaining_capacity": "{files} arquivos restantes • {sizeMB} MB disponíveis"
+ "file_too_large": "Arquivo muito grande",
+ "file_too_large_desc": "\"{name}\" excede o limite de {maxMB} MB por arquivo.",
+ "no_supported_files_in_folder": "Nenhum tipo de arquivo suportado encontrado na pasta selecionada."
},
"add_webpage": {
"title": "Adicionar páginas web para rastreamento",
diff --git a/surfsense_web/messages/zh.json b/surfsense_web/messages/zh.json
index 819432410..db634dfd9 100644
--- a/surfsense_web/messages/zh.json
+++ b/surfsense_web/messages/zh.json
@@ -360,12 +360,13 @@
"upload_documents": {
"title": "上传文档",
"subtitle": "上传您的文件,使其可通过 AI 对话进行搜索和访问。",
- "file_size_limit": "最大文件大小:每个文件 50MB。",
- "upload_limits": "上传限制:最多 {maxFiles} 个文件,总大小不超过 {maxSizeMB}MB。",
- "drop_files": "放下文件到这里",
- "drag_drop": "拖放文件到这里",
- "or_browse": "或点击浏览",
+ "file_size_limit": "最大文件大小:每个文件 {maxMB}MB。",
+ "upload_limits": "上传文件或整个文件夹",
+ "drop_files": "将文件或文件夹拖放到此处",
+ "drag_drop": "将文件或文件夹拖放到此处",
+ "or_browse": "或点击浏览文件和文件夹",
"browse_files": "浏览文件",
+ "browse_folder": "浏览文件夹",
"selected_files": "已选择的文件 ({count})",
"total_size": "总大小",
"clear_all": "全部清除",
@@ -378,13 +379,9 @@
"upload_error_desc": "上传文件时出错",
"supported_file_types": "支持的文件类型",
"file_types_desc": "根据您当前的 ETL 服务配置支持这些文件类型。",
- "max_files_exceeded": "超过文件数量限制",
- "max_files_exceeded_desc": "一次最多只能上传 {max} 个文件。",
- "max_size_exceeded": "超过文件大小限制",
- "max_size_exceeded_desc": "文件总大小不能超过 {max}MB。",
- "file_limit_reached": "已达到最大文件数量",
- "file_limit_reached_desc": "移除一些文件以添加更多(最多 {max} 个文件)。",
- "remaining_capacity": "剩余 {files} 个文件名额 • 可用 {sizeMB}MB"
+ "file_too_large": "文件过大",
+ "file_too_large_desc": "\"{name}\" 超过了每个文件 {maxMB}MB 的限制。",
+ "no_supported_files_in_folder": "所选文件夹中没有找到支持的文件类型。"
},
"add_webpage": {
"title": "添加网页爬取",