mirror of
https://github.com/MODSetter/SurfSense.git
synced 2026-06-06 20:15:17 +02:00
refactor: streamline document upload limits and enhance handling of mentioned documents
- Updated maximum file size limit to 500 MB per file. - Removed restrictions on the number of files per upload and total upload size. - Enhanced handling of user-mentioned documents in the knowledge base search middleware. - Improved document reading and processing logic to accommodate new features and optimizations.
This commit is contained in:
parent
6727266107
commit
62e698d8aa
33 changed files with 2889 additions and 2443 deletions
|
|
@ -42,9 +42,7 @@ def upgrade() -> None:
|
|||
if not exists:
|
||||
table_list = ", ".join(TABLES)
|
||||
conn.execute(
|
||||
sa.text(
|
||||
f"CREATE PUBLICATION {PUBLICATION_NAME} FOR TABLE {table_list}"
|
||||
)
|
||||
sa.text(f"CREATE PUBLICATION {PUBLICATION_NAME} FOR TABLE {table_list}")
|
||||
)
|
||||
|
||||
|
||||
|
|
|
|||
|
|
@ -0,0 +1,102 @@
|
|||
"""optimize zero_publication with column lists
|
||||
|
||||
Recreates the zero_publication using column lists for the documents
|
||||
table so that large text columns (content, source_markdown,
|
||||
blocknote_document, etc.) are excluded from WAL replication.
|
||||
This prevents RangeError: Invalid string length in zero-cache's
|
||||
change-streamer when documents have very large content.
|
||||
|
||||
Also resets REPLICA IDENTITY to DEFAULT on tables that had it set
|
||||
to FULL for the old Electric SQL setup (migration 66/75/76).
|
||||
With DEFAULT (primary-key) identity, column-list publications
|
||||
only need to include the PK — not every column.
|
||||
|
||||
After running this migration you MUST:
|
||||
1. Stop zero-cache
|
||||
2. Delete / reset the zero-cache data volume
|
||||
3. Restart zero-cache (it will do a fresh initial sync)
|
||||
|
||||
Revision ID: 117
|
||||
Revises: 116
|
||||
"""
|
||||
|
||||
from collections.abc import Sequence
|
||||
|
||||
import sqlalchemy as sa
|
||||
|
||||
from alembic import op
|
||||
|
||||
revision: str = "117"
|
||||
down_revision: str | None = "116"
|
||||
branch_labels: str | Sequence[str] | None = None
|
||||
depends_on: str | Sequence[str] | None = None
|
||||
|
||||
PUBLICATION_NAME = "zero_publication"
|
||||
|
||||
TABLES_WITH_FULL_IDENTITY = [
|
||||
"documents",
|
||||
"notifications",
|
||||
"search_source_connectors",
|
||||
"new_chat_messages",
|
||||
"chat_comments",
|
||||
"chat_session_state",
|
||||
]
|
||||
|
||||
DOCUMENT_COLS = [
|
||||
"id",
|
||||
"title",
|
||||
"document_type",
|
||||
"search_space_id",
|
||||
"folder_id",
|
||||
"created_by_id",
|
||||
"status",
|
||||
"created_at",
|
||||
"updated_at",
|
||||
]
|
||||
|
||||
PUBLICATION_DDL_FULL = f"""\
|
||||
CREATE PUBLICATION {PUBLICATION_NAME} FOR TABLE
|
||||
notifications, documents, folders,
|
||||
search_source_connectors, new_chat_messages,
|
||||
chat_comments, chat_session_state
|
||||
"""
|
||||
|
||||
|
||||
def upgrade() -> None:
    """Recreate the publication with a column list for ``documents``.

    Steps, in order:

    1. Set ``REPLICA IDENTITY DEFAULT`` on every table in
       ``TABLES_WITH_FULL_IDENTITY``.
    2. Drop the publication if it exists.
    3. Check whether ``documents`` has a ``_0_version`` column and, if so,
       append it to the published column list.
    4. Create the publication, restricting ``documents`` to that column
       list and publishing the remaining tables in full.
    """
    conn = op.get_bind()

    # Identity must be DEFAULT first: per this migration's module notes,
    # a column-list publication then only has to include the primary key.
    for tbl in TABLES_WITH_FULL_IDENTITY:
        conn.execute(sa.text(f'ALTER TABLE "{tbl}" REPLICA IDENTITY DEFAULT'))

    conn.execute(sa.text(f"DROP PUBLICATION IF EXISTS {PUBLICATION_NAME}"))

    # Scope the lookup to the current schema. Without the schema filter a
    # ``documents`` table in some other schema that happens to carry a
    # ``_0_version`` column would make this check succeed and the
    # CREATE PUBLICATION below fail on a non-existent column.
    has_zero_ver = conn.execute(
        sa.text(
            "SELECT 1 FROM information_schema.columns "
            "WHERE table_schema = current_schema() "
            "AND table_name = 'documents' AND column_name = '_0_version'"
        )
    ).fetchone()

    cols = DOCUMENT_COLS + (['"_0_version"'] if has_zero_ver else [])
    col_list = ", ".join(cols)

    conn.execute(
        sa.text(
            f"CREATE PUBLICATION {PUBLICATION_NAME} FOR TABLE "
            f"notifications, "
            f"documents ({col_list}), "
            f"folders, "
            f"search_source_connectors, "
            f"new_chat_messages, "
            f"chat_comments, "
            f"chat_session_state"
        )
    )
|
||||
|
||||
|
||||
def downgrade() -> None:
    """Restore the whole-table publication and REPLICA IDENTITY FULL.

    Executes, in order: drop the publication if present, recreate it from
    ``PUBLICATION_DDL_FULL``, then set ``REPLICA IDENTITY FULL`` on each
    table in ``TABLES_WITH_FULL_IDENTITY``.
    """
    bind = op.get_bind()

    statements = [
        f"DROP PUBLICATION IF EXISTS {PUBLICATION_NAME}",
        PUBLICATION_DDL_FULL,
    ]
    statements.extend(
        f'ALTER TABLE "{tbl}" REPLICA IDENTITY FULL'
        for tbl in TABLES_WITH_FULL_IDENTITY
    )

    for statement in statements:
        bind.execute(sa.text(statement))
|
||||
|
|
@ -159,6 +159,7 @@ async def create_surfsense_deep_agent(
|
|||
additional_tools: Sequence[BaseTool] | None = None,
|
||||
firecrawl_api_key: str | None = None,
|
||||
thread_visibility: ChatVisibility | None = None,
|
||||
mentioned_document_ids: list[int] | None = None,
|
||||
):
|
||||
"""
|
||||
Create a SurfSense deep agent with configurable tools and prompts.
|
||||
|
|
@ -451,6 +452,7 @@ async def create_surfsense_deep_agent(
|
|||
search_space_id=search_space_id,
|
||||
available_connectors=available_connectors,
|
||||
available_document_types=available_document_types,
|
||||
mentioned_document_ids=mentioned_document_ids,
|
||||
),
|
||||
SurfSenseFilesystemMiddleware(
|
||||
search_space_id=search_space_id,
|
||||
|
|
|
|||
|
|
@ -66,6 +66,16 @@ the `<chunk_index>`, identify chunks marked `matched="true"`, then use
|
|||
those sections instead of reading the entire file sequentially.
|
||||
|
||||
Use `<chunk id='...'>` values as citation IDs in your answers.
|
||||
|
||||
## User-Mentioned Documents
|
||||
|
||||
When the `ls` output lists a file under the `USER-MENTIONED documents` header,
|
||||
the user **explicitly selected** that document. These files are your highest-
|
||||
priority sources:
|
||||
1. **Always read them thoroughly** — scan the full `<chunk_index>`, then read
|
||||
all major sections, not just matched chunks.
|
||||
2. **Prefer their content** over other search results when answering.
|
||||
3. **Cite from them first** whenever applicable.
|
||||
"""
|
||||
|
||||
# =============================================================================
|
||||
|
|
|
|||
|
|
@ -28,7 +28,13 @@ from sqlalchemy import select
|
|||
from sqlalchemy.ext.asyncio import AsyncSession
|
||||
|
||||
from app.agents.new_chat.utils import parse_date_or_datetime, resolve_date_range
|
||||
from app.db import NATIVE_TO_LEGACY_DOCTYPE, Document, Folder, shielded_async_session
|
||||
from app.db import (
|
||||
NATIVE_TO_LEGACY_DOCTYPE,
|
||||
Chunk,
|
||||
Document,
|
||||
Folder,
|
||||
shielded_async_session,
|
||||
)
|
||||
from app.retriever.chunks_hybrid_search import ChucksHybridSearchRetriever
|
||||
from app.utils.document_converters import embed_texts
|
||||
from app.utils.perf import get_perf_logger
|
||||
|
|
@ -430,21 +436,36 @@ async def _get_folder_paths(
|
|||
def _build_synthetic_ls(
|
||||
existing_files: dict[str, Any] | None,
|
||||
new_files: dict[str, Any],
|
||||
*,
|
||||
mentioned_paths: set[str] | None = None,
|
||||
) -> tuple[AIMessage, ToolMessage]:
|
||||
"""Build a synthetic ls("/documents") tool-call + result for the LLM context.
|
||||
|
||||
Paths are listed with *new* (rank-ordered) files first, then existing files
|
||||
that were already in state from prior turns.
|
||||
Mentioned files are listed first. A separate header tells the LLM which
|
||||
files the user explicitly selected; the path list itself stays clean so
|
||||
paths can be passed directly to ``read_file`` without stripping tags.
|
||||
"""
|
||||
_mentioned = mentioned_paths or set()
|
||||
merged: dict[str, Any] = {**(existing_files or {}), **new_files}
|
||||
doc_paths = [
|
||||
p for p, v in merged.items() if p.startswith("/documents/") and v is not None
|
||||
]
|
||||
|
||||
new_set = set(new_files)
|
||||
new_paths = [p for p in doc_paths if p in new_set]
|
||||
mentioned_list = [p for p in doc_paths if p in _mentioned]
|
||||
new_non_mentioned = [p for p in doc_paths if p in new_set and p not in _mentioned]
|
||||
old_paths = [p for p in doc_paths if p not in new_set]
|
||||
ordered = new_paths + old_paths
|
||||
ordered = mentioned_list + new_non_mentioned + old_paths
|
||||
|
||||
parts: list[str] = []
|
||||
if mentioned_list:
|
||||
parts.append(
|
||||
"USER-MENTIONED documents (read these thoroughly before answering):"
|
||||
)
|
||||
for p in mentioned_list:
|
||||
parts.append(f" {p}")
|
||||
parts.append("")
|
||||
parts.append(str(ordered) if ordered else "No documents found.")
|
||||
|
||||
tool_call_id = f"auto_ls_{uuid.uuid4().hex[:12]}"
|
||||
ai_msg = AIMessage(
|
||||
|
|
@ -452,7 +473,7 @@ def _build_synthetic_ls(
|
|||
tool_calls=[{"name": "ls", "args": {"path": "/documents"}, "id": tool_call_id}],
|
||||
)
|
||||
tool_msg = ToolMessage(
|
||||
content=str(ordered) if ordered else "No documents found.",
|
||||
content="\n".join(parts),
|
||||
tool_call_id=tool_call_id,
|
||||
)
|
||||
return ai_msg, tool_msg
|
||||
|
|
@ -524,12 +545,92 @@ async def search_knowledge_base(
|
|||
return results[:top_k]
|
||||
|
||||
|
||||
async def fetch_mentioned_documents(
    *,
    document_ids: list[int],
    search_space_id: int,
) -> list[dict[str, Any]]:
    """Fetch explicitly mentioned documents with *all* their chunks.

    Returns the same dict structure as ``search_knowledge_base`` so results
    can be merged directly into ``build_scoped_filesystem``. Unlike search
    results, every chunk is included (no top-K limiting) and none are marked
    as ``matched`` since the entire document is relevant by virtue of the
    user's explicit mention.

    Args:
        document_ids: Ids of the documents the user mentioned. Duplicates
            are ignored; the first occurrence decides the output order.
        search_space_id: Only documents in this search space are returned.

    Returns:
        One result dict per found document, in the order of first
        appearance in ``document_ids``. Ids with no matching row in the
        search space are silently skipped. Each dict carries
        ``"_user_mentioned": True``.
    """
    if not document_ids:
        return []

    # De-duplicate while keeping the caller's order: a repeated id would
    # otherwise yield the same document twice in the result list.
    unique_ids = list(dict.fromkeys(document_ids))

    async with shielded_async_session() as session:
        doc_result = await session.execute(
            select(Document).where(
                Document.id.in_(unique_ids),
                Document.search_space_id == search_space_id,
            )
        )
        docs = {doc.id: doc for doc in doc_result.scalars().all()}

        if not docs:
            return []

        chunk_result = await session.execute(
            select(Chunk.id, Chunk.content, Chunk.document_id)
            .where(Chunk.document_id.in_(list(docs)))
            .order_by(Chunk.document_id, Chunk.id)
        )
        chunks_by_doc: dict[int, list[dict[str, Any]]] = {
            doc_id: [] for doc_id in docs
        }
        for row in chunk_result.all():
            if row.document_id in chunks_by_doc:
                chunks_by_doc[row.document_id].append(
                    {"chunk_id": row.id, "content": row.content}
                )

        results: list[dict[str, Any]] = []
        for doc_id in unique_ids:
            doc = docs.get(doc_id)
            if doc is None:
                continue
            metadata = doc.document_metadata or {}
            # Same value feeds both "document.document_type" and "source".
            doc_type = (
                doc.document_type.value
                if getattr(doc, "document_type", None)
                else None
            )
            results.append(
                {
                    "document_id": doc.id,
                    "content": "",
                    "score": 1.0,
                    "chunks": chunks_by_doc.get(doc.id, []),
                    "matched_chunk_ids": [],
                    "document": {
                        "id": doc.id,
                        "title": doc.title,
                        "document_type": doc_type,
                        "metadata": metadata,
                    },
                    "source": doc_type,
                    "_user_mentioned": True,
                }
            )
        return results
|
||||
|
||||
|
||||
async def build_scoped_filesystem(
|
||||
*,
|
||||
documents: Sequence[dict[str, Any]],
|
||||
search_space_id: int,
|
||||
) -> dict[str, dict[str, str]]:
|
||||
"""Build a StateBackend-compatible files dict from search results."""
|
||||
) -> tuple[dict[str, dict[str, str]], dict[int, str]]:
|
||||
"""Build a StateBackend-compatible files dict from search results.
|
||||
|
||||
Returns ``(files, doc_id_to_path)`` so callers can reliably map a
|
||||
document id back to its filesystem path without guessing by title.
|
||||
Paths are collision-proof: when two documents resolve to the same
|
||||
path the doc-id is appended to disambiguate.
|
||||
"""
|
||||
async with shielded_async_session() as session:
|
||||
folder_paths = await _get_folder_paths(session, search_space_id)
|
||||
doc_ids = [
|
||||
|
|
@ -551,6 +652,7 @@ async def build_scoped_filesystem(
|
|||
}
|
||||
|
||||
files: dict[str, dict[str, str]] = {}
|
||||
doc_id_to_path: dict[int, str] = {}
|
||||
for document in documents:
|
||||
doc_meta = document.get("document") or {}
|
||||
title = str(doc_meta.get("title") or "untitled")
|
||||
|
|
@ -559,6 +661,9 @@ async def build_scoped_filesystem(
|
|||
base_folder = folder_paths.get(folder_id, "/documents")
|
||||
file_name = _safe_filename(title)
|
||||
path = f"{base_folder}/{file_name}"
|
||||
if path in files:
|
||||
stem = file_name.removesuffix(".xml")
|
||||
path = f"{base_folder}/{stem} ({doc_id}).xml"
|
||||
matched_ids = set(document.get("matched_chunk_ids") or [])
|
||||
xml_content = _build_document_xml(document, matched_chunk_ids=matched_ids)
|
||||
files[path] = {
|
||||
|
|
@ -567,7 +672,9 @@ async def build_scoped_filesystem(
|
|||
"created_at": "",
|
||||
"modified_at": "",
|
||||
}
|
||||
return files
|
||||
if isinstance(doc_id, int):
|
||||
doc_id_to_path[doc_id] = path
|
||||
return files, doc_id_to_path
|
||||
|
||||
|
||||
class KnowledgeBaseSearchMiddleware(AgentMiddleware): # type: ignore[type-arg]
|
||||
|
|
@ -583,12 +690,14 @@ class KnowledgeBaseSearchMiddleware(AgentMiddleware): # type: ignore[type-arg]
|
|||
available_connectors: list[str] | None = None,
|
||||
available_document_types: list[str] | None = None,
|
||||
top_k: int = 10,
|
||||
mentioned_document_ids: list[int] | None = None,
|
||||
) -> None:
|
||||
self.llm = llm
|
||||
self.search_space_id = search_space_id
|
||||
self.available_connectors = available_connectors
|
||||
self.available_document_types = available_document_types
|
||||
self.top_k = top_k
|
||||
self.mentioned_document_ids = mentioned_document_ids or []
|
||||
|
||||
async def _plan_search_inputs(
|
||||
self,
|
||||
|
|
@ -680,6 +789,18 @@ class KnowledgeBaseSearchMiddleware(AgentMiddleware): # type: ignore[type-arg]
|
|||
user_text=user_text,
|
||||
)
|
||||
|
||||
# --- 1. Fetch mentioned documents (user-selected, all chunks) ---
|
||||
mentioned_results: list[dict[str, Any]] = []
|
||||
if self.mentioned_document_ids:
|
||||
mentioned_results = await fetch_mentioned_documents(
|
||||
document_ids=self.mentioned_document_ids,
|
||||
search_space_id=self.search_space_id,
|
||||
)
|
||||
# Clear after first turn so they are not re-fetched on subsequent
|
||||
# messages within the same agent instance.
|
||||
self.mentioned_document_ids = []
|
||||
|
||||
# --- 2. Run KB hybrid search ---
|
||||
search_results = await search_knowledge_base(
|
||||
query=planned_query,
|
||||
search_space_id=self.search_space_id,
|
||||
|
|
@ -689,19 +810,50 @@ class KnowledgeBaseSearchMiddleware(AgentMiddleware): # type: ignore[type-arg]
|
|||
start_date=start_date,
|
||||
end_date=end_date,
|
||||
)
|
||||
new_files = await build_scoped_filesystem(
|
||||
documents=search_results,
|
||||
|
||||
# --- 3. Merge: mentioned first, then search (dedup by doc id) ---
|
||||
seen_doc_ids: set[int] = set()
|
||||
merged: list[dict[str, Any]] = []
|
||||
for doc in mentioned_results:
|
||||
doc_id = (doc.get("document") or {}).get("id")
|
||||
if doc_id is not None:
|
||||
seen_doc_ids.add(doc_id)
|
||||
merged.append(doc)
|
||||
for doc in search_results:
|
||||
doc_id = (doc.get("document") or {}).get("id")
|
||||
if doc_id is not None and doc_id in seen_doc_ids:
|
||||
continue
|
||||
merged.append(doc)
|
||||
|
||||
# --- 4. Build scoped filesystem ---
|
||||
new_files, doc_id_to_path = await build_scoped_filesystem(
|
||||
documents=merged,
|
||||
search_space_id=self.search_space_id,
|
||||
)
|
||||
|
||||
ai_msg, tool_msg = _build_synthetic_ls(existing_files, new_files)
|
||||
# Identify which paths belong to user-mentioned documents using
|
||||
# the authoritative doc_id -> path mapping (no title guessing).
|
||||
mentioned_doc_ids = {
|
||||
(d.get("document") or {}).get("id") for d in mentioned_results
|
||||
}
|
||||
mentioned_paths = {
|
||||
doc_id_to_path[did] for did in mentioned_doc_ids if did in doc_id_to_path
|
||||
}
|
||||
|
||||
ai_msg, tool_msg = _build_synthetic_ls(
|
||||
existing_files,
|
||||
new_files,
|
||||
mentioned_paths=mentioned_paths,
|
||||
)
|
||||
|
||||
if t0 is not None:
|
||||
_perf_log.info(
|
||||
"[kb_fs_middleware] completed in %.3fs query=%r optimized=%r new_files=%d total=%d",
|
||||
"[kb_fs_middleware] completed in %.3fs query=%r optimized=%r "
|
||||
"mentioned=%d new_files=%d total=%d",
|
||||
asyncio.get_event_loop().time() - t0,
|
||||
user_text[:80],
|
||||
planned_query[:120],
|
||||
len(mentioned_results),
|
||||
len(new_files),
|
||||
len(new_files) + len(existing_files or {}),
|
||||
)
|
||||
|
|
|
|||
|
|
@ -1,7 +1,7 @@
|
|||
# Force asyncio to use standard event loop before unstructured imports
|
||||
import asyncio
|
||||
|
||||
from fastapi import APIRouter, Depends, Form, HTTPException, UploadFile
|
||||
from fastapi import APIRouter, Depends, Form, HTTPException, Query, UploadFile
|
||||
from sqlalchemy.ext.asyncio import AsyncSession
|
||||
from sqlalchemy.future import select
|
||||
from sqlalchemy.orm import selectinload
|
||||
|
|
@ -17,6 +17,7 @@ from app.db import (
|
|||
get_async_session,
|
||||
)
|
||||
from app.schemas import (
|
||||
ChunkRead,
|
||||
DocumentRead,
|
||||
DocumentsCreate,
|
||||
DocumentStatusBatchResponse,
|
||||
|
|
@ -45,9 +46,7 @@ os.environ["UNSTRUCTURED_HAS_PATCHED_LOOP"] = "1"
|
|||
|
||||
router = APIRouter()
|
||||
|
||||
MAX_FILES_PER_UPLOAD = 10
|
||||
MAX_FILE_SIZE_BYTES = 50 * 1024 * 1024 # 50 MB per file
|
||||
MAX_TOTAL_SIZE_BYTES = 200 * 1024 * 1024 # 200 MB total
|
||||
MAX_FILE_SIZE_BYTES = 500 * 1024 * 1024 # 500 MB per file
|
||||
|
||||
|
||||
@router.post("/documents")
|
||||
|
|
@ -156,13 +155,6 @@ async def create_documents_file_upload(
|
|||
if not files:
|
||||
raise HTTPException(status_code=400, detail="No files provided")
|
||||
|
||||
if len(files) > MAX_FILES_PER_UPLOAD:
|
||||
raise HTTPException(
|
||||
status_code=413,
|
||||
detail=f"Too many files. Maximum {MAX_FILES_PER_UPLOAD} files per upload.",
|
||||
)
|
||||
|
||||
total_size = 0
|
||||
for file in files:
|
||||
file_size = file.size or 0
|
||||
if file_size > MAX_FILE_SIZE_BYTES:
|
||||
|
|
@ -171,14 +163,6 @@ async def create_documents_file_upload(
|
|||
detail=f"File '{file.filename}' ({file_size / (1024 * 1024):.1f} MB) "
|
||||
f"exceeds the {MAX_FILE_SIZE_BYTES // (1024 * 1024)} MB per-file limit.",
|
||||
)
|
||||
total_size += file_size
|
||||
|
||||
if total_size > MAX_TOTAL_SIZE_BYTES:
|
||||
raise HTTPException(
|
||||
status_code=413,
|
||||
detail=f"Total upload size ({total_size / (1024 * 1024):.1f} MB) "
|
||||
f"exceeds the {MAX_TOTAL_SIZE_BYTES // (1024 * 1024)} MB limit.",
|
||||
)
|
||||
|
||||
# ===== Read all files concurrently to avoid blocking the event loop =====
|
||||
async def _read_and_save(file: UploadFile) -> tuple[str, str, int]:
|
||||
|
|
@ -206,16 +190,6 @@ async def create_documents_file_upload(
|
|||
|
||||
saved_files = await asyncio.gather(*(_read_and_save(f) for f in files))
|
||||
|
||||
actual_total_size = sum(size for _, _, size in saved_files)
|
||||
if actual_total_size > MAX_TOTAL_SIZE_BYTES:
|
||||
for temp_path, _, _ in saved_files:
|
||||
os.unlink(temp_path)
|
||||
raise HTTPException(
|
||||
status_code=413,
|
||||
detail=f"Total upload size ({actual_total_size / (1024 * 1024):.1f} MB) "
|
||||
f"exceeds the {MAX_TOTAL_SIZE_BYTES // (1024 * 1024)} MB limit.",
|
||||
)
|
||||
|
||||
# ===== PHASE 1: Create pending documents for all files =====
|
||||
created_documents: list[Document] = []
|
||||
files_to_process: list[tuple[Document, str, str]] = []
|
||||
|
|
@ -451,13 +425,15 @@ async def read_documents(
|
|||
reason=doc.status.get("reason"),
|
||||
)
|
||||
|
||||
raw_content = doc.content or ""
|
||||
api_documents.append(
|
||||
DocumentRead(
|
||||
id=doc.id,
|
||||
title=doc.title,
|
||||
document_type=doc.document_type,
|
||||
document_metadata=doc.document_metadata,
|
||||
content=doc.content,
|
||||
content="",
|
||||
content_preview=raw_content[:300],
|
||||
content_hash=doc.content_hash,
|
||||
unique_identifier_hash=doc.unique_identifier_hash,
|
||||
created_at=doc.created_at,
|
||||
|
|
@ -609,13 +585,15 @@ async def search_documents(
|
|||
reason=doc.status.get("reason"),
|
||||
)
|
||||
|
||||
raw_content = doc.content or ""
|
||||
api_documents.append(
|
||||
DocumentRead(
|
||||
id=doc.id,
|
||||
title=doc.title,
|
||||
document_type=doc.document_type,
|
||||
document_metadata=doc.document_metadata,
|
||||
content=doc.content,
|
||||
content="",
|
||||
content_preview=raw_content[:300],
|
||||
content_hash=doc.content_hash,
|
||||
unique_identifier_hash=doc.unique_identifier_hash,
|
||||
created_at=doc.created_at,
|
||||
|
|
@ -884,16 +862,19 @@ async def get_document_type_counts(
|
|||
@router.get("/documents/by-chunk/{chunk_id}", response_model=DocumentWithChunksRead)
|
||||
async def get_document_by_chunk_id(
|
||||
chunk_id: int,
|
||||
chunk_window: int = Query(
|
||||
5, ge=0, description="Number of chunks before/after the cited chunk to include"
|
||||
),
|
||||
session: AsyncSession = Depends(get_async_session),
|
||||
user: User = Depends(current_active_user),
|
||||
):
|
||||
"""
|
||||
Retrieves a document based on a chunk ID, including all its chunks ordered by creation time.
|
||||
Requires DOCUMENTS_READ permission for the search space.
|
||||
The document's embedding and chunk embeddings are excluded from the response.
|
||||
Retrieves a document based on a chunk ID, including a window of chunks around the cited one.
|
||||
Uses SQL-level pagination to avoid loading all chunks into memory.
|
||||
"""
|
||||
try:
|
||||
# First, get the chunk and verify it exists
|
||||
from sqlalchemy import and_, func, or_
|
||||
|
||||
chunk_result = await session.execute(select(Chunk).filter(Chunk.id == chunk_id))
|
||||
chunk = chunk_result.scalars().first()
|
||||
|
||||
|
|
@ -902,11 +883,8 @@ async def get_document_by_chunk_id(
|
|||
status_code=404, detail=f"Chunk with id {chunk_id} not found"
|
||||
)
|
||||
|
||||
# Get the associated document
|
||||
document_result = await session.execute(
|
||||
select(Document)
|
||||
.options(selectinload(Document.chunks))
|
||||
.filter(Document.id == chunk.document_id)
|
||||
select(Document).filter(Document.id == chunk.document_id)
|
||||
)
|
||||
document = document_result.scalars().first()
|
||||
|
||||
|
|
@ -916,7 +894,6 @@ async def get_document_by_chunk_id(
|
|||
detail="Document not found",
|
||||
)
|
||||
|
||||
# Check permission for the search space
|
||||
await check_permission(
|
||||
session,
|
||||
user,
|
||||
|
|
@ -925,10 +902,38 @@ async def get_document_by_chunk_id(
|
|||
"You don't have permission to read documents in this search space",
|
||||
)
|
||||
|
||||
# Sort chunks by creation time
|
||||
sorted_chunks = sorted(document.chunks, key=lambda x: x.created_at)
|
||||
total_result = await session.execute(
|
||||
select(func.count())
|
||||
.select_from(Chunk)
|
||||
.filter(Chunk.document_id == document.id)
|
||||
)
|
||||
total_chunks = total_result.scalar() or 0
|
||||
|
||||
cited_idx_result = await session.execute(
|
||||
select(func.count())
|
||||
.select_from(Chunk)
|
||||
.filter(
|
||||
Chunk.document_id == document.id,
|
||||
or_(
|
||||
Chunk.created_at < chunk.created_at,
|
||||
and_(Chunk.created_at == chunk.created_at, Chunk.id < chunk.id),
|
||||
),
|
||||
)
|
||||
)
|
||||
cited_idx = cited_idx_result.scalar() or 0
|
||||
|
||||
start = max(0, cited_idx - chunk_window)
|
||||
end = min(total_chunks, cited_idx + chunk_window + 1)
|
||||
|
||||
windowed_result = await session.execute(
|
||||
select(Chunk)
|
||||
.filter(Chunk.document_id == document.id)
|
||||
.order_by(Chunk.created_at, Chunk.id)
|
||||
.offset(start)
|
||||
.limit(end - start)
|
||||
)
|
||||
windowed_chunks = windowed_result.scalars().all()
|
||||
|
||||
# Return the document with its chunks
|
||||
return DocumentWithChunksRead(
|
||||
id=document.id,
|
||||
title=document.title,
|
||||
|
|
@ -940,7 +945,9 @@ async def get_document_by_chunk_id(
|
|||
created_at=document.created_at,
|
||||
updated_at=document.updated_at,
|
||||
search_space_id=document.search_space_id,
|
||||
chunks=sorted_chunks,
|
||||
chunks=windowed_chunks,
|
||||
total_chunks=total_chunks,
|
||||
chunk_start_index=start,
|
||||
)
|
||||
except HTTPException:
|
||||
raise
|
||||
|
|
@ -950,6 +957,75 @@ async def get_document_by_chunk_id(
|
|||
) from e
|
||||
|
||||
|
||||
@router.get(
    "/documents/{document_id}/chunks",
    response_model=PaginatedResponse[ChunkRead],
)
async def get_document_chunks_paginated(
    document_id: int,
    page: int = Query(0, ge=0),
    page_size: int = Query(20, ge=1, le=100),
    start_offset: int | None = Query(
        None, ge=0, description="Direct offset; overrides page * page_size"
    ),
    session: AsyncSession = Depends(get_async_session),
    user: User = Depends(current_active_user),
):
    """
    Return one page of a document's chunks.

    The starting row is ``start_offset`` when given, otherwise
    ``page * page_size``. Chunks are ordered by ``created_at`` then ``id``.
    Responds 404 when the document does not exist; the caller's
    DOCUMENTS_READ permission on the document's search space is checked
    before any chunk is read. Unexpected errors are reported as 500.
    """
    try:
        from sqlalchemy import func

        lookup = await session.execute(
            select(Document).filter(Document.id == document_id)
        )
        document = lookup.scalars().first()
        if not document:
            raise HTTPException(status_code=404, detail="Document not found")

        await check_permission(
            session,
            user,
            document.search_space_id,
            Permission.DOCUMENTS_READ.value,
            "You don't have permission to read documents in this search space",
        )

        count_stmt = (
            select(func.count())
            .select_from(Chunk)
            .filter(Chunk.document_id == document_id)
        )
        total = (await session.execute(count_stmt)).scalar() or 0

        # An explicit start_offset wins over page-based addressing.
        if start_offset is None:
            offset = page * page_size
        else:
            offset = start_offset

        page_stmt = (
            select(Chunk)
            .filter(Chunk.document_id == document_id)
            .order_by(Chunk.created_at, Chunk.id)
            .offset(offset)
            .limit(page_size)
        )
        chunks = (await session.execute(page_stmt)).scalars().all()

        return PaginatedResponse(
            items=chunks,
            total=total,
            page=offset // page_size if page_size else page,
            page_size=page_size,
            has_more=(offset + len(chunks)) < total,
        )
    except HTTPException:
        # Deliberate HTTP errors (404 / permission) pass through untouched.
        raise
    except Exception as exc:
        raise HTTPException(
            status_code=500, detail=f"Failed to fetch chunks: {exc!s}"
        ) from exc
|
||||
|
||||
|
||||
@router.get("/documents/{document_id}", response_model=DocumentRead)
|
||||
async def read_document(
|
||||
document_id: int,
|
||||
|
|
@ -980,13 +1056,14 @@ async def read_document(
|
|||
"You don't have permission to read documents in this search space",
|
||||
)
|
||||
|
||||
# Convert database object to API-friendly format
|
||||
raw_content = document.content or ""
|
||||
return DocumentRead(
|
||||
id=document.id,
|
||||
title=document.title,
|
||||
document_type=document.document_type,
|
||||
document_metadata=document.document_metadata,
|
||||
content=document.content,
|
||||
content=raw_content,
|
||||
content_preview=raw_content[:300],
|
||||
content_hash=document.content_hash,
|
||||
unique_identifier_hash=document.unique_identifier_hash,
|
||||
created_at=document.created_at,
|
||||
|
|
|
|||
|
|
@ -15,11 +15,10 @@ import pypandoc
|
|||
import typst
|
||||
from fastapi import APIRouter, Depends, HTTPException, Query
|
||||
from fastapi.responses import StreamingResponse
|
||||
from sqlalchemy import select
|
||||
from sqlalchemy import func, select
|
||||
from sqlalchemy.ext.asyncio import AsyncSession
|
||||
from sqlalchemy.orm import selectinload
|
||||
|
||||
from app.db import Document, DocumentType, Permission, User, get_async_session
|
||||
from app.db import Chunk, Document, DocumentType, Permission, User, get_async_session
|
||||
from app.routes.reports_routes import (
|
||||
_FILE_EXTENSIONS,
|
||||
_MEDIA_TYPES,
|
||||
|
|
@ -44,6 +43,9 @@ router = APIRouter()
|
|||
async def get_editor_content(
|
||||
search_space_id: int,
|
||||
document_id: int,
|
||||
max_length: int | None = Query(
|
||||
None, description="Truncate source_markdown to this many characters"
|
||||
),
|
||||
session: AsyncSession = Depends(get_async_session),
|
||||
user: User = Depends(current_active_user),
|
||||
):
|
||||
|
|
@ -65,9 +67,7 @@ async def get_editor_content(
|
|||
)
|
||||
|
||||
result = await session.execute(
|
||||
select(Document)
|
||||
.options(selectinload(Document.chunks))
|
||||
.filter(
|
||||
select(Document).filter(
|
||||
Document.id == document_id,
|
||||
Document.search_space_id == search_space_id,
|
||||
)
|
||||
|
|
@ -77,62 +77,63 @@ async def get_editor_content(
|
|||
if not document:
|
||||
raise HTTPException(status_code=404, detail="Document not found")
|
||||
|
||||
# Priority 1: Return source_markdown if it exists (check `is not None` to allow empty strings)
|
||||
if document.source_markdown is not None:
|
||||
count_result = await session.execute(
|
||||
select(func.count()).select_from(Chunk).filter(Chunk.document_id == document_id)
|
||||
)
|
||||
chunk_count = count_result.scalar() or 0
|
||||
|
||||
def _build_response(md: str) -> dict:
    """Build the editor-content payload for *md*.

    Reads ``document``, ``chunk_count`` and ``max_length`` from the
    enclosing scope.

    ``content_size_bytes`` is always the UTF-8 size of the full,
    untruncated markdown. When ``max_length`` is set and *md* has more
    characters than that, ``source_markdown`` is cut to ``max_length``
    characters and ``truncated`` is True.
    """
    size_bytes = len(md.encode("utf-8"))
    truncated = False
    output_md = md
    # max_length is a character count and the slice below is by character,
    # so the guard must compare characters as well. Comparing the UTF-8
    # byte size flagged multi-byte text as truncated even when the slice
    # removed nothing.
    if max_length is not None and len(md) > max_length:
        # Clamp at zero: a negative slice bound would cut from the end
        # instead of limiting the length.
        output_md = md[: max(max_length, 0)]
        truncated = True
    return {
        "document_id": document.id,
        "title": document.title,
        "document_type": document.document_type.value,
        "source_markdown": output_md,
        "content_size_bytes": size_bytes,
        "chunk_count": chunk_count,
        "truncated": truncated,
        "updated_at": document.updated_at.isoformat()
        if document.updated_at
        else None,
    }
|
||||
|
||||
# Priority 2: Lazy-migrate from blocknote_document (pure Python, no external deps)
|
||||
if document.source_markdown is not None:
|
||||
return _build_response(document.source_markdown)
|
||||
|
||||
if document.blocknote_document:
|
||||
from app.utils.blocknote_to_markdown import blocknote_to_markdown
|
||||
|
||||
markdown = blocknote_to_markdown(document.blocknote_document)
|
||||
if markdown:
|
||||
# Persist the migration so we don't repeat it
|
||||
document.source_markdown = markdown
|
||||
await session.commit()
|
||||
return {
|
||||
"document_id": document.id,
|
||||
"title": document.title,
|
||||
"document_type": document.document_type.value,
|
||||
"source_markdown": markdown,
|
||||
"updated_at": document.updated_at.isoformat()
|
||||
if document.updated_at
|
||||
else None,
|
||||
}
|
||||
return _build_response(markdown)
|
||||
|
||||
# Priority 3: For NOTE type with no content, return empty markdown
|
||||
if document.document_type == DocumentType.NOTE:
|
||||
empty_markdown = ""
|
||||
document.source_markdown = empty_markdown
|
||||
await session.commit()
|
||||
return {
|
||||
"document_id": document.id,
|
||||
"title": document.title,
|
||||
"document_type": document.document_type.value,
|
||||
"source_markdown": empty_markdown,
|
||||
"updated_at": document.updated_at.isoformat()
|
||||
if document.updated_at
|
||||
else None,
|
||||
}
|
||||
return _build_response(empty_markdown)
|
||||
|
||||
# Priority 4: Reconstruct from chunks
|
||||
chunks = sorted(document.chunks, key=lambda c: c.id)
|
||||
chunk_contents_result = await session.execute(
|
||||
select(Chunk.content)
|
||||
.filter(Chunk.document_id == document_id)
|
||||
.order_by(Chunk.id)
|
||||
)
|
||||
chunk_contents = chunk_contents_result.scalars().all()
|
||||
|
||||
if not chunks:
|
||||
if not chunk_contents:
|
||||
raise HTTPException(
|
||||
status_code=400,
|
||||
detail="This document has no content and cannot be edited. Please re-upload to enable editing.",
|
||||
)
|
||||
|
||||
markdown_content = "\n\n".join(chunk.content for chunk in chunks)
|
||||
markdown_content = "\n\n".join(chunk_contents)
|
||||
|
||||
if not markdown_content.strip():
|
||||
raise HTTPException(
|
||||
|
|
@ -140,17 +141,77 @@ async def get_editor_content(
|
|||
detail="This document has empty content and cannot be edited.",
|
||||
)
|
||||
|
||||
# Persist the lazy migration
|
||||
document.source_markdown = markdown_content
|
||||
await session.commit()
|
||||
|
||||
return {
|
||||
"document_id": document.id,
|
||||
"title": document.title,
|
||||
"document_type": document.document_type.value,
|
||||
"source_markdown": markdown_content,
|
||||
"updated_at": document.updated_at.isoformat() if document.updated_at else None,
|
||||
}
|
||||
return _build_response(markdown_content)
|
||||
|
||||
|
||||
@router.get(
    "/search-spaces/{search_space_id}/documents/{document_id}/download-markdown"
)
async def download_document_markdown(
    search_space_id: int,
    document_id: int,
    session: AsyncSession = Depends(get_async_session),
    user: User = Depends(current_active_user),
):
    """
    Download the full document content as a .md file.

    Markdown is resolved in priority order: ``source_markdown``, then a
    conversion of ``blocknote_document``, then the document's chunk contents
    (ordered by chunk id) joined with blank lines.

    Raises:
        HTTPException: 404 when no document with this id exists in the
            search space; 400 when no non-blank markdown could be resolved.
    """
    from urllib.parse import quote

    await check_permission(
        session,
        user,
        search_space_id,
        Permission.DOCUMENTS_READ.value,
        "You don't have permission to read documents in this search space",
    )

    result = await session.execute(
        select(Document).filter(
            Document.id == document_id,
            Document.search_space_id == search_space_id,
        )
    )
    document = result.scalars().first()

    if not document:
        raise HTTPException(status_code=404, detail="Document not found")

    markdown: str | None = document.source_markdown
    if markdown is None and document.blocknote_document:
        from app.utils.blocknote_to_markdown import blocknote_to_markdown

        markdown = blocknote_to_markdown(document.blocknote_document)
    if markdown is None:
        # Select only the content column so full Chunk rows are not loaded.
        chunk_contents_result = await session.execute(
            select(Chunk.content)
            .filter(Chunk.document_id == document_id)
            .order_by(Chunk.id)
        )
        chunk_contents = chunk_contents_result.scalars().all()
        if chunk_contents:
            markdown = "\n\n".join(chunk_contents)

    if not markdown or not markdown.strip():
        raise HTTPException(
            status_code=400, detail="Document has no content to download"
        )

    # Keep letters/digits plus " -_"; everything else becomes "_".
    safe_title = (
        "".join(
            c if c.isalnum() or c in " -_" else "_"
            for c in (document.title or "document")
        ).strip()[:80]
        or "document"
    )

    # str.isalnum() also accepts non-ASCII letters, but HTTP header values
    # must be latin-1 encodable, so a title such as "文档" would make the
    # response fail while its headers are encoded. ASCII titles keep the
    # exact header they always had; other titles get an ASCII fallback plus
    # an RFC 6266 ``filename*`` parameter carrying the real UTF-8 name.
    if safe_title.isascii():
        content_disposition = f'attachment; filename="{safe_title}.md"'
    else:
        ascii_title = "".join(c if c.isascii() else "_" for c in safe_title)
        content_disposition = (
            f'attachment; filename="{ascii_title}.md"; '
            f"filename*=UTF-8''{quote(safe_title)}.md"
        )

    return StreamingResponse(
        io.BytesIO(markdown.encode("utf-8")),
        media_type="text/markdown; charset=utf-8",
        headers={"Content-Disposition": content_disposition},
    )
|
||||
|
||||
|
||||
@router.post("/search-spaces/{search_space_id}/documents/{document_id}/save")
|
||||
|
|
@ -258,9 +319,7 @@ async def export_document(
|
|||
)
|
||||
|
||||
result = await session.execute(
|
||||
select(Document)
|
||||
.options(selectinload(Document.chunks))
|
||||
.filter(
|
||||
select(Document).filter(
|
||||
Document.id == document_id,
|
||||
Document.search_space_id == search_space_id,
|
||||
)
|
||||
|
|
@ -269,16 +328,20 @@ async def export_document(
|
|||
if not document:
|
||||
raise HTTPException(status_code=404, detail="Document not found")
|
||||
|
||||
# Resolve markdown content (same priority as editor-content endpoint)
|
||||
markdown_content: str | None = document.source_markdown
|
||||
if markdown_content is None and document.blocknote_document:
|
||||
from app.utils.blocknote_to_markdown import blocknote_to_markdown
|
||||
|
||||
markdown_content = blocknote_to_markdown(document.blocknote_document)
|
||||
if markdown_content is None:
|
||||
chunks = sorted(document.chunks, key=lambda c: c.id)
|
||||
if chunks:
|
||||
markdown_content = "\n\n".join(chunk.content for chunk in chunks)
|
||||
chunk_contents_result = await session.execute(
|
||||
select(Chunk.content)
|
||||
.filter(Chunk.document_id == document_id)
|
||||
.order_by(Chunk.id)
|
||||
)
|
||||
chunk_contents = chunk_contents_result.scalars().all()
|
||||
if chunk_contents:
|
||||
markdown_content = "\n\n".join(chunk_contents)
|
||||
|
||||
if not markdown_content or not markdown_content.strip():
|
||||
raise HTTPException(status_code=400, detail="Document has no content to export")
|
||||
|
|
|
|||
|
|
@ -53,25 +53,26 @@ class DocumentRead(BaseModel):
|
|||
title: str
|
||||
document_type: DocumentType
|
||||
document_metadata: dict
|
||||
content: str # Changed to string to match frontend
|
||||
content: str = ""
|
||||
content_preview: str = ""
|
||||
content_hash: str
|
||||
unique_identifier_hash: str | None
|
||||
created_at: datetime
|
||||
updated_at: datetime | None
|
||||
search_space_id: int
|
||||
folder_id: int | None = None
|
||||
created_by_id: UUID | None = None # User who created/uploaded this document
|
||||
created_by_id: UUID | None = None
|
||||
created_by_name: str | None = None
|
||||
created_by_email: str | None = None
|
||||
status: DocumentStatusSchema | None = (
|
||||
None # Processing status (ready, processing, failed)
|
||||
)
|
||||
status: DocumentStatusSchema | None = None
|
||||
|
||||
model_config = ConfigDict(from_attributes=True)
|
||||
|
||||
|
||||
class DocumentWithChunksRead(DocumentRead):
    """Read schema that extends ``DocumentRead`` with chunk data.

    NOTE(review): ``total_chunks`` and ``chunk_start_index`` look like paging
    information for ``chunks`` — confirm against the route that builds this
    schema.
    """

    # Chunk payloads attached to the document; defaults to an empty list.
    chunks: list[ChunkRead] = []
    total_chunks: int = 0
    chunk_start_index: int = 0

    # from_attributes lets the model be validated from attribute-bearing
    # objects rather than only from dicts.
    model_config = ConfigDict(from_attributes=True)
|
||||
|
||||
|
|
|
|||
|
|
@ -39,7 +39,6 @@ from app.agents.new_chat.llm_config import (
|
|||
)
|
||||
from app.db import (
|
||||
ChatVisibility,
|
||||
Document,
|
||||
NewChatMessage,
|
||||
NewChatThread,
|
||||
Report,
|
||||
|
|
@ -63,74 +62,6 @@ _perf_log = get_perf_logger()
|
|||
_background_tasks: set[asyncio.Task] = set()
|
||||
|
||||
|
||||
def format_mentioned_documents_as_context(documents: list[Document]) -> str:
|
||||
"""
|
||||
Format mentioned documents as context for the agent.
|
||||
|
||||
Uses the same XML structure as knowledge_base.format_documents_for_context
|
||||
to ensure citations work properly with chunk IDs.
|
||||
"""
|
||||
if not documents:
|
||||
return ""
|
||||
|
||||
context_parts = ["<mentioned_documents>"]
|
||||
context_parts.append(
|
||||
"The user has explicitly mentioned the following documents from their knowledge base. "
|
||||
"These documents are directly relevant to the query and should be prioritized as primary sources. "
|
||||
"Use [citation:CHUNK_ID] format for citations (e.g., [citation:123])."
|
||||
)
|
||||
context_parts.append("")
|
||||
|
||||
for doc in documents:
|
||||
# Build metadata JSON
|
||||
metadata = doc.document_metadata or {}
|
||||
metadata_json = json.dumps(metadata, ensure_ascii=False)
|
||||
|
||||
# Get URL from metadata
|
||||
url = (
|
||||
metadata.get("url")
|
||||
or metadata.get("source")
|
||||
or metadata.get("page_url")
|
||||
or ""
|
||||
)
|
||||
|
||||
context_parts.append("<document>")
|
||||
context_parts.append("<document_metadata>")
|
||||
context_parts.append(f" <document_id>{doc.id}</document_id>")
|
||||
context_parts.append(
|
||||
f" <document_type>{doc.document_type.value}</document_type>"
|
||||
)
|
||||
context_parts.append(f" <title><![CDATA[{doc.title}]]></title>")
|
||||
context_parts.append(f" <url><![CDATA[{url}]]></url>")
|
||||
context_parts.append(
|
||||
f" <metadata_json><![CDATA[{metadata_json}]]></metadata_json>"
|
||||
)
|
||||
context_parts.append("</document_metadata>")
|
||||
context_parts.append("")
|
||||
context_parts.append("<document_content>")
|
||||
|
||||
# Use chunks if available (preferred for proper citations)
|
||||
if hasattr(doc, "chunks") and doc.chunks:
|
||||
for chunk in doc.chunks:
|
||||
context_parts.append(
|
||||
f" <chunk id='{chunk.id}'><![CDATA[{chunk.content}]]></chunk>"
|
||||
)
|
||||
else:
|
||||
# Fallback to document content if chunks not loaded
|
||||
# Use document ID as chunk ID prefix for consistency
|
||||
context_parts.append(
|
||||
f" <chunk id='{doc.id}'><![CDATA[{doc.content}]]></chunk>"
|
||||
)
|
||||
|
||||
context_parts.append("</document_content>")
|
||||
context_parts.append("</document>")
|
||||
context_parts.append("")
|
||||
|
||||
context_parts.append("</mentioned_documents>")
|
||||
|
||||
return "\n".join(context_parts)
|
||||
|
||||
|
||||
def format_mentioned_surfsense_docs_as_context(
|
||||
documents: list[SurfsenseDocsDocument],
|
||||
) -> str:
|
||||
|
|
@ -1317,6 +1248,7 @@ async def stream_new_chat(
|
|||
firecrawl_api_key=firecrawl_api_key,
|
||||
thread_visibility=visibility,
|
||||
disabled_tools=disabled_tools,
|
||||
mentioned_document_ids=mentioned_document_ids,
|
||||
)
|
||||
_perf_log.info(
|
||||
"[stream_new_chat] Agent created in %.3fs", time.perf_counter() - _t0
|
||||
|
|
@ -1340,18 +1272,9 @@ async def stream_new_chat(
|
|||
thread.needs_history_bootstrap = False
|
||||
await session.commit()
|
||||
|
||||
# Fetch mentioned documents if any (with chunks for proper citations)
|
||||
mentioned_documents: list[Document] = []
|
||||
if mentioned_document_ids:
|
||||
result = await session.execute(
|
||||
select(Document)
|
||||
.options(selectinload(Document.chunks))
|
||||
.filter(
|
||||
Document.id.in_(mentioned_document_ids),
|
||||
Document.search_space_id == search_space_id,
|
||||
)
|
||||
)
|
||||
mentioned_documents = list(result.scalars().all())
|
||||
# Mentioned KB documents are now handled by KnowledgeBaseSearchMiddleware
|
||||
# which merges them into the scoped filesystem with full document
|
||||
# structure. Only SurfSense docs and report context are inlined here.
|
||||
|
||||
# Fetch mentioned SurfSense docs if any
|
||||
mentioned_surfsense_docs: list[SurfsenseDocsDocument] = []
|
||||
|
|
@ -1379,15 +1302,10 @@ async def stream_new_chat(
|
|||
)
|
||||
recent_reports = list(recent_reports_result.scalars().all())
|
||||
|
||||
# Format the user query with context (mentioned documents + SurfSense docs)
|
||||
# Format the user query with context (SurfSense docs + reports only)
|
||||
final_query = user_query
|
||||
context_parts = []
|
||||
|
||||
if mentioned_documents:
|
||||
context_parts.append(
|
||||
format_mentioned_documents_as_context(mentioned_documents)
|
||||
)
|
||||
|
||||
if mentioned_surfsense_docs:
|
||||
context_parts.append(
|
||||
format_mentioned_surfsense_docs_as_context(mentioned_surfsense_docs)
|
||||
|
|
@ -1479,7 +1397,7 @@ async def stream_new_chat(
|
|||
yield streaming_service.format_start_step()
|
||||
|
||||
# Initial thinking step - analyzing the request
|
||||
if mentioned_documents or mentioned_surfsense_docs:
|
||||
if mentioned_surfsense_docs:
|
||||
initial_title = "Analyzing referenced content"
|
||||
action_verb = "Analyzing"
|
||||
else:
|
||||
|
|
@ -1490,18 +1408,6 @@ async def stream_new_chat(
|
|||
query_text = user_query[:80] + ("..." if len(user_query) > 80 else "")
|
||||
processing_parts.append(query_text)
|
||||
|
||||
if mentioned_documents:
|
||||
doc_names = []
|
||||
for doc in mentioned_documents:
|
||||
title = doc.title
|
||||
if len(title) > 30:
|
||||
title = title[:27] + "..."
|
||||
doc_names.append(title)
|
||||
if len(doc_names) == 1:
|
||||
processing_parts.append(f"[{doc_names[0]}]")
|
||||
else:
|
||||
processing_parts.append(f"[{len(doc_names)} documents]")
|
||||
|
||||
if mentioned_surfsense_docs:
|
||||
doc_names = []
|
||||
for doc in mentioned_surfsense_docs:
|
||||
|
|
@ -1527,7 +1433,7 @@ async def stream_new_chat(
|
|||
# These ORM objects (with eagerly-loaded chunks) can be very large.
|
||||
# They're only needed to build context strings already copied into
|
||||
# final_query / langchain_messages — release them before streaming.
|
||||
del mentioned_documents, mentioned_surfsense_docs, recent_reports
|
||||
del mentioned_surfsense_docs, recent_reports
|
||||
del langchain_messages, final_query
|
||||
|
||||
# Check if this is the first assistant response so we can generate
|
||||
|
|
|
|||
|
|
@ -12,16 +12,14 @@ Available processors:
|
|||
- YouTube processor: Process YouTube videos and extract transcripts
|
||||
"""
|
||||
|
||||
# URL crawler
|
||||
# Extension processor
|
||||
from .extension_processor import add_extension_received_document
|
||||
|
||||
# File processors
|
||||
from .file_processors import (
|
||||
# File processors (backward-compatible re-exports from _save)
|
||||
from ._save import (
|
||||
add_received_file_document_using_docling,
|
||||
add_received_file_document_using_llamacloud,
|
||||
add_received_file_document_using_unstructured,
|
||||
)
|
||||
from .extension_processor import add_extension_received_document
|
||||
|
||||
# Markdown processor
|
||||
from .markdown_processor import add_received_markdown_file_document
|
||||
|
|
@ -32,9 +30,9 @@ from .youtube_processor import add_youtube_video_document
|
|||
__all__ = [
|
||||
# Extension processing
|
||||
"add_extension_received_document",
|
||||
# File processing with different ETL services
|
||||
"add_received_file_document_using_docling",
|
||||
"add_received_file_document_using_llamacloud",
|
||||
# File processing with different ETL services
|
||||
"add_received_file_document_using_unstructured",
|
||||
# Markdown file processing
|
||||
"add_received_markdown_file_document",
|
||||
|
|
|
|||
|
|
@ -0,0 +1,74 @@
|
|||
"""
|
||||
Constants for file document processing.
|
||||
|
||||
Centralizes file type classification, LlamaCloud retry configuration,
|
||||
and timeout calculation parameters.
|
||||
"""
|
||||
|
||||
import ssl
|
||||
from enum import Enum
|
||||
|
||||
import httpx
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# File type classification
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
# Lower-case filename suffixes recognised by classify_file, one tuple per
# processing category.
MARKDOWN_EXTENSIONS = (".md", ".markdown", ".txt")
AUDIO_EXTENSIONS = (".mp3", ".mp4", ".mpeg", ".mpga", ".m4a", ".wav", ".webm")
DIRECT_CONVERT_EXTENSIONS = (".csv", ".tsv", ".html", ".htm")


class FileCategory(Enum):
    """Processing category assigned to a file by ``classify_file``."""

    MARKDOWN = "markdown"
    AUDIO = "audio"
    DIRECT_CONVERT = "direct_convert"
    DOCUMENT = "document"


def classify_file(filename: str) -> FileCategory:
    """Return the processing category implied by *filename*'s suffix.

    Matching is case-insensitive and is done with ``str.endswith`` against
    the extension tuples above, in the order markdown, audio, direct-convert.
    Any name that matches none of them is classified as ``DOCUMENT``.
    """
    normalized = filename.lower()
    ordered_rules = (
        (MARKDOWN_EXTENSIONS, FileCategory.MARKDOWN),
        (AUDIO_EXTENSIONS, FileCategory.AUDIO),
        (DIRECT_CONVERT_EXTENSIONS, FileCategory.DIRECT_CONVERT),
    )
    for extensions, category in ordered_rules:
        if normalized.endswith(extensions):
            return category
    return FileCategory.DOCUMENT
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# LlamaCloud retry configuration
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
LLAMACLOUD_MAX_RETRIES = 5
LLAMACLOUD_BASE_DELAY = 10  # seconds (exponential backoff base)
LLAMACLOUD_MAX_DELAY = 120  # max delay between retries (2 minutes)
# Exception types that are treated as retryable (SSL, httpx transport and
# generic connection/timeout failures).
# NOTE(review): OSError is the base class of ssl.SSLError, ConnectionError,
# ConnectionResetError and TimeoutError, so listing it makes those entries
# redundant and also matches non-network OS errors such as PermissionError —
# confirm that retrying those is intended.
LLAMACLOUD_RETRYABLE_EXCEPTIONS = (
    ssl.SSLError,
    httpx.ConnectError,
    httpx.ConnectTimeout,
    httpx.ReadError,
    httpx.ReadTimeout,
    httpx.WriteError,
    httpx.WriteTimeout,
    httpx.RemoteProtocolError,
    httpx.LocalProtocolError,
    ConnectionError,
    ConnectionResetError,
    TimeoutError,
    OSError,
)
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Timeout calculation constants
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
# Assumed upload throughput, in bytes per second.
UPLOAD_BYTES_PER_SECOND_SLOW = (
    100 * 1024
)  # 100 KB/s (conservative for slow connections)
# All timeout values below are expressed in seconds.
MIN_UPLOAD_TIMEOUT = 120  # Minimum 2 minutes for any file
MAX_UPLOAD_TIMEOUT = 1800  # Maximum 30 minutes for very large files
BASE_JOB_TIMEOUT = 600  # 10 minutes base for job processing
PER_PAGE_JOB_TIMEOUT = 60  # 1 minute per page for processing
|
||||
|
|
@ -0,0 +1,90 @@
|
|||
"""
|
||||
Lossless file-to-markdown converters for text-based formats.
|
||||
|
||||
These converters handle file types that can be faithfully represented as
|
||||
markdown without any external ETL/OCR service:
|
||||
|
||||
- CSV / TSV → markdown table (stdlib ``csv``)
|
||||
- HTML / HTM → markdown (``markdownify``)
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import csv
|
||||
from collections.abc import Callable
|
||||
from pathlib import Path
|
||||
|
||||
from markdownify import markdownify
|
||||
|
||||
# The stdlib csv module defaults to a 128 KB field-size limit which is too
|
||||
# small for real-world exports (e.g. chat logs, CRM dumps). We raise it once
|
||||
# at import time so every csv.reader call in this module can handle large fields.
|
||||
csv.field_size_limit(2**31 - 1)
|
||||
|
||||
|
||||
def _escape_pipe(cell: str) -> str:
    """Escape literal pipe characters inside a markdown table cell."""
    return cell.replace("|", "\\|")


def _format_cell(cell: str) -> str:
    """Render one CSV field as a single-line markdown table cell.

    Surrounding whitespace is stripped, embedded line breaks become ``<br>``
    (a raw newline would terminate the table row) and pipes are escaped.
    """
    single_line = (
        cell.strip().replace("\r\n", "\n").replace("\r", "\n").replace("\n", "<br>")
    )
    return _escape_pipe(single_line)


def _render_row(row: list[str], col_count: int) -> str:
    """Render *row* as a markdown table line padded to *col_count* cells."""
    padded = row + [""] * (col_count - len(row))
    return "| " + " | ".join(_format_cell(c) for c in padded) + " |"


def csv_to_markdown(file_path: str, *, delimiter: str = ",") -> str:
    """Convert a CSV (or TSV) file to a markdown table.

    The first row is treated as the header. The table is as wide as the
    widest row, so cells in rows longer than the header are kept (under
    empty header cells) and shorter rows are padded with empty cells.

    Args:
        file_path: Path of the UTF-8 encoded file to read (a leading BOM,
            as written by spreadsheet exports, is ignored).
        delimiter: Field separator passed to ``csv.reader``.

    Returns:
        The markdown table terminated by a newline, or an empty string when
        the file contains no fields at all, so the caller can decide how to
        handle it.
    """
    # "utf-8-sig" reads plain UTF-8 too, but drops a leading BOM that would
    # otherwise become part of the first header cell.
    with open(file_path, encoding="utf-8-sig", newline="") as fh:
        rows = list(csv.reader(fh, delimiter=delimiter))

    if not rows:
        return ""

    col_count = max(len(row) for row in rows)
    if col_count == 0:
        # Only blank lines: there is nothing to put in a table.
        return ""

    header, *body = rows

    lines: list[str] = [
        _render_row(header, col_count),
        "| " + " | ".join(["---"] * col_count) + " |",
    ]
    lines.extend(_render_row(row, col_count) for row in body)

    return "\n".join(lines) + "\n"


def tsv_to_markdown(file_path: str) -> str:
    """Convert a TSV file to a markdown table."""
    return csv_to_markdown(file_path, delimiter="\t")
|
||||
|
||||
|
||||
def html_to_markdown(file_path: str) -> str:
    """Convert an HTML file to markdown via ``markdownify``.

    The file is read as UTF-8 and the converted text is returned with
    leading and trailing whitespace stripped.
    """
    # "utf-8-sig" reads plain UTF-8 as well, but drops a leading BOM; U+FEFF
    # is not whitespace, so the final strip() would not remove it.
    html = Path(file_path).read_text(encoding="utf-8-sig")
    return markdownify(html).strip()
|
||||
|
||||
|
||||
# Lower-case file suffix -> converter taking the path of the file on disk.
_CONVERTER_MAP: dict[str, Callable[..., str]] = {
    ".csv": csv_to_markdown,
    ".tsv": tsv_to_markdown,
    ".html": html_to_markdown,
    ".htm": html_to_markdown,
}


def convert_file_directly(file_path: str, filename: str) -> str:
    """Convert the file at *file_path* with the converter for its extension.

    The extension is taken from *filename* (not from *file_path*) and is
    matched case-insensitively against ``_CONVERTER_MAP``.

    Raises:
        ValueError: If no converter is registered for the extension.
    """
    extension = Path(filename).suffix.lower()
    if extension not in _CONVERTER_MAP:
        raise ValueError(
            f"No direct converter for extension '{extension}' (file: {filename})"
        )
    convert = _CONVERTER_MAP[extension]
    return convert(file_path)
|
||||
209
surfsense_backend/app/tasks/document_processors/_etl.py
Normal file
209
surfsense_backend/app/tasks/document_processors/_etl.py
Normal file
|
|
@ -0,0 +1,209 @@
|
|||
"""
|
||||
ETL parsing strategies for different document processing services.
|
||||
|
||||
Provides parse functions for Unstructured, LlamaCloud, and Docling, along with
|
||||
LlamaCloud retry logic and dynamic timeout calculations.
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import logging
|
||||
import os
|
||||
import random
|
||||
import warnings
|
||||
from logging import ERROR, getLogger
|
||||
|
||||
import httpx
|
||||
|
||||
from app.config import config as app_config
|
||||
from app.db import Log
|
||||
from app.services.task_logging_service import TaskLoggingService
|
||||
|
||||
from ._constants import (
|
||||
LLAMACLOUD_BASE_DELAY,
|
||||
LLAMACLOUD_MAX_DELAY,
|
||||
LLAMACLOUD_MAX_RETRIES,
|
||||
LLAMACLOUD_RETRYABLE_EXCEPTIONS,
|
||||
PER_PAGE_JOB_TIMEOUT,
|
||||
)
|
||||
from ._helpers import calculate_job_timeout, calculate_upload_timeout
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# LlamaCloud parsing with retry
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
async def parse_with_llamacloud_retry(
    file_path: str,
    estimated_pages: int,
    task_logger: TaskLoggingService | None = None,
    log_entry: Log | None = None,
):
    """
    Parse a file with LlamaCloud, retrying on transient SSL/connection errors.

    Upload and job timeouts are derived from the file size and the estimated
    page count. Each attempt uses a fresh ``httpx.AsyncClient``; between
    attempts the function sleeps for an exponentially growing delay (capped
    at ``LLAMACLOUD_MAX_DELAY``) with +/-25% random jitter.

    Args:
        file_path: Path of the file to parse.
        estimated_pages: Page estimate used for the job timeout.
        task_logger: Optional task logger; retries are reported through it
            only when ``log_entry`` is also given, otherwise a warning is
            written to the standard log.
        log_entry: Log entry that retry progress is attached to.

    Returns:
        The value returned by ``LlamaParse.aparse``.

    Raises:
        Exception: The last retryable exception once
            ``LLAMACLOUD_MAX_RETRIES`` attempts have failed. Exceptions that
            are not in ``LLAMACLOUD_RETRYABLE_EXCEPTIONS`` propagate
            immediately without a retry.
    """
    from llama_cloud_services import LlamaParse
    from llama_cloud_services.parse.utils import ResultType

    size_in_bytes = os.path.getsize(file_path)
    size_in_mb = size_in_bytes / (1024 * 1024)

    upload_timeout = calculate_upload_timeout(size_in_bytes)
    job_timeout = calculate_job_timeout(estimated_pages, size_in_bytes)

    client_timeout = httpx.Timeout(
        connect=120.0,
        read=upload_timeout,
        write=upload_timeout,
        pool=120.0,
    )

    logging.info(
        f"LlamaCloud upload configured: file_size={size_in_mb:.1f}MB, "
        f"pages={estimated_pages}, upload_timeout={upload_timeout:.0f}s, "
        f"job_timeout={job_timeout:.0f}s"
    )

    failures: list[str] = []
    final_error = None

    for attempt in range(1, LLAMACLOUD_MAX_RETRIES + 1):
        try:
            async with httpx.AsyncClient(timeout=client_timeout) as http_client:
                llama_parser = LlamaParse(
                    api_key=app_config.LLAMA_CLOUD_API_KEY,
                    num_workers=1,
                    verbose=True,
                    language="en",
                    result_type=ResultType.MD,
                    max_timeout=int(max(2000, job_timeout + upload_timeout)),
                    job_timeout_in_seconds=job_timeout,
                    job_timeout_extra_time_per_page_in_seconds=PER_PAGE_JOB_TIMEOUT,
                    custom_client=http_client,
                )
                parsed = await llama_parser.aparse(file_path)

                if attempt > 1:
                    logging.info(
                        f"LlamaCloud upload succeeded on attempt {attempt} after "
                        f"{len(failures)} failures"
                    )
                return parsed

        except LLAMACLOUD_RETRYABLE_EXCEPTIONS as exc:
            final_error = exc
            exc_name = type(exc).__name__
            exc_text = str(exc)[:200]  # keep log entries bounded
            failures.append(f"Attempt {attempt}: {exc_name} - {exc_text}")

            # Out of attempts: report every failure and fall through to the
            # final raise below.
            if attempt >= LLAMACLOUD_MAX_RETRIES:
                logging.error(
                    f"LlamaCloud upload failed after {LLAMACLOUD_MAX_RETRIES} "
                    f"attempts. File size: {size_in_mb:.1f}MB, "
                    f"Pages: {estimated_pages}. "
                    f"Errors: {'; '.join(failures)}"
                )
                break

            # Exponential backoff capped at the maximum, with +/-25% jitter.
            backoff = min(
                LLAMACLOUD_BASE_DELAY * (2 ** (attempt - 1)),
                LLAMACLOUD_MAX_DELAY,
            )
            wait_seconds = backoff + backoff * 0.25 * (2 * random.random() - 1)

            if not (task_logger and log_entry):
                logging.warning(
                    f"LlamaCloud upload failed "
                    f"(attempt {attempt}/{LLAMACLOUD_MAX_RETRIES}): "
                    f"{exc_name}. File: {size_in_mb:.1f}MB. "
                    f"Retrying in {wait_seconds:.0f}s..."
                )
            else:
                retry_details = {
                    "error_type": exc_name,
                    "error_message": exc_text,
                    "attempt": attempt,
                    "retry_delay": wait_seconds,
                    "file_size_mb": round(size_in_mb, 1),
                    "upload_timeout": upload_timeout,
                }
                await task_logger.log_task_progress(
                    log_entry,
                    f"LlamaCloud upload failed "
                    f"(attempt {attempt}/{LLAMACLOUD_MAX_RETRIES}), "
                    f"retrying in {wait_seconds:.0f}s",
                    retry_details,
                )

            await asyncio.sleep(wait_seconds)

    raise final_error or RuntimeError(
        f"LlamaCloud parsing failed after {LLAMACLOUD_MAX_RETRIES} retries. "
        f"File size: {size_in_mb:.1f}MB"
    )
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Per-service parse functions
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
async def parse_with_unstructured(file_path: str):
    """
    Parse a file using the Unstructured ETL service.

    The loader runs in ``elements`` mode with the ``auto`` strategy, English
    as the only language, no post-processors, and without original elements
    or metadata.

    Returns:
        Whatever ``UnstructuredLoader.aload()`` yields — described by the
        original author as a list of LangChain Document elements.
    """
    from langchain_unstructured import UnstructuredLoader

    loader_options = {
        "mode": "elements",
        "post_processors": [],
        "languages": ["eng"],
        "include_orig_elements": False,
        "include_metadata": False,
        "strategy": "auto",
    }
    unstructured_loader = UnstructuredLoader(file_path, **loader_options)
    elements = await unstructured_loader.aload()
    return elements
|
||||
|
||||
|
||||
async def parse_with_docling(file_path: str, filename: str) -> str:
    """
    Parse a file using the Docling ETL service (via the Docling service wrapper).

    While the document is processed, pdfminer warnings are filtered out and
    the ``pdfminer`` logger is raised to ``ERROR``; the logger's previous
    level is restored afterwards, even when processing fails.

    Returns:
        The ``"content"`` entry of the service result (markdown string).
    """
    from app.services.docling_service import create_docling_service

    service = create_docling_service()

    noisy_logger = getLogger("pdfminer")
    saved_level = noisy_logger.level
    ignored_messages = (
        ".*Cannot set gray non-stroke color.*",
        ".*invalid float value.*",
    )

    with warnings.catch_warnings():
        warnings.filterwarnings("ignore", category=UserWarning, module="pdfminer")
        for pattern in ignored_messages:
            warnings.filterwarnings("ignore", message=pattern)
        noisy_logger.setLevel(ERROR)

        try:
            parsed = await service.process_document(file_path, filename)
        finally:
            noisy_logger.setLevel(saved_level)

    return parsed["content"]
|
||||
218
surfsense_backend/app/tasks/document_processors/_helpers.py
Normal file
218
surfsense_backend/app/tasks/document_processors/_helpers.py
Normal file
|
|
@ -0,0 +1,218 @@
|
|||
"""
|
||||
Document helper functions for deduplication, migration, and connector updates.
|
||||
|
||||
Provides reusable logic shared across file processors and ETL strategies.
|
||||
"""
|
||||
|
||||
import logging
|
||||
|
||||
from sqlalchemy.ext.asyncio import AsyncSession
|
||||
|
||||
from app.db import Document, DocumentStatus, DocumentType
|
||||
from app.utils.document_converters import generate_unique_identifier_hash
|
||||
|
||||
from ._constants import (
|
||||
BASE_JOB_TIMEOUT,
|
||||
MAX_UPLOAD_TIMEOUT,
|
||||
MIN_UPLOAD_TIMEOUT,
|
||||
PER_PAGE_JOB_TIMEOUT,
|
||||
UPLOAD_BYTES_PER_SECOND_SLOW,
|
||||
)
|
||||
from .base import (
|
||||
check_document_by_unique_identifier,
|
||||
check_duplicate_document,
|
||||
)
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Unique identifier helpers
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def get_google_drive_unique_identifier(
    connector: dict | None,
    filename: str,
    search_space_id: int,
) -> tuple[str, str | None]:
    """
    Build the unique-identifier hash(es) used to look a file up.

    A Google Drive connector that carries a ``google_drive_file_id`` in its
    metadata is hashed by that file id, which stays stable across renames.

    Returns:
        ``(primary_hash, legacy_hash)``.
        Google Drive with a file id: the file_id-based hash plus the
        filename-based Google Drive hash (kept for migration).
        Anything else: the filename-based ``FILE`` hash and ``None``.
    """
    is_drive_file = (
        bool(connector) and connector.get("type") == DocumentType.GOOGLE_DRIVE_FILE
    )
    drive_file_id = (
        connector.get("metadata", {}).get("google_drive_file_id")
        if is_drive_file
        else None
    )

    # No usable Drive file id: identify the document by filename only.
    if not drive_file_id:
        filename_hash = generate_unique_identifier_hash(
            DocumentType.FILE, filename, search_space_id
        )
        return filename_hash, None

    id_based_hash = generate_unique_identifier_hash(
        DocumentType.GOOGLE_DRIVE_FILE, drive_file_id, search_space_id
    )
    name_based_hash = generate_unique_identifier_hash(
        DocumentType.GOOGLE_DRIVE_FILE, filename, search_space_id
    )
    return id_based_hash, name_based_hash
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Document deduplication and migration
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
async def handle_existing_document_update(
    session: AsyncSession,
    existing_document: Document,
    content_hash: str,
    connector: dict | None,
    filename: str,
    primary_hash: str,
) -> tuple[bool, Document | None]:
    """
    Handle update logic for an existing document.

    Args:
        session: Database session used for the collision lookup, deletes
            and commits.
        existing_document: The stored document matched for this file.
        content_hash: Hash of the incoming content.
        connector: Optional connector info; only its ``type`` and
            ``metadata`` entries are read here.
        filename: Name used in log messages.
        primary_hash: Identifier hash the document should carry.

    Returns:
        Tuple of (should_skip_processing, document_to_return):
        - (True, document): Content unchanged, return existing document
        - (True, document): Content changed but identical content already
          exists in another document; re-processing is skipped
        - (True, None): Same collision, but the existing document was still
          pending/processing and has been deleted
        - (False, None): Content changed, needs re-processing
    """
    # Move the document onto the primary identifier hash (in memory only;
    # no commit is issued at this point).
    if existing_document.unique_identifier_hash != primary_hash:
        existing_document.unique_identifier_hash = primary_hash
        logging.info(f"Migrated document to file_id-based identifier: {filename}")

    if existing_document.content_hash == content_hash:
        # Same content: at most the Google Drive file name may have changed.
        if connector and connector.get("type") == DocumentType.GOOGLE_DRIVE_FILE:
            connector_metadata = connector.get("metadata", {})
            new_name = connector_metadata.get("google_drive_file_name")
            doc_metadata = existing_document.document_metadata or {}
            old_name = doc_metadata.get("FILE_NAME") or doc_metadata.get(
                "google_drive_file_name"
            )

            if new_name and old_name and old_name != new_name:
                from sqlalchemy.orm.attributes import flag_modified

                existing_document.title = new_name
                if not existing_document.document_metadata:
                    existing_document.document_metadata = {}
                existing_document.document_metadata["FILE_NAME"] = new_name
                existing_document.document_metadata["google_drive_file_name"] = new_name
                # The dict is mutated in place, so the column has to be
                # flagged explicitly for the change to be written.
                flag_modified(existing_document, "document_metadata")
                await session.commit()
                logging.info(
                    f"File renamed in Google Drive: '{old_name}' → '{new_name}' "
                    f"(no re-processing needed)"
                )

        logging.info(f"Document for file {filename} unchanged. Skipping.")
        return True, existing_document

    # Content has changed — guard against content_hash collision before
    # expensive ETL processing.
    collision_doc = await check_duplicate_document(session, content_hash)
    if collision_doc and collision_doc.id != existing_document.id:
        logging.warning(
            "Content-hash collision for %s: identical content exists in "
            "document #%s (%s). Skipping re-processing.",
            filename,
            collision_doc.id,
            collision_doc.document_type,
        )
        # A document that is still pending/processing is removed instead of
        # being kept next to the document that already holds this content.
        if DocumentStatus.is_state(
            existing_document.status, DocumentStatus.PENDING
        ) or DocumentStatus.is_state(
            existing_document.status, DocumentStatus.PROCESSING
        ):
            await session.delete(existing_document)
            await session.commit()
            return True, None

        return True, existing_document

    logging.info(f"Content changed for file {filename}. Updating document.")
    return False, None
|
||||
|
||||
|
||||
async def find_existing_document_with_migration(
    session: AsyncSession,
    primary_hash: str,
    legacy_hash: str | None,
    content_hash: str | None = None,
) -> Document | None:
    """
    Look up an already-stored document using up to three keys, in order.

    1. ``primary_hash`` via ``check_document_by_unique_identifier``.
    2. ``legacy_hash`` (if given) via the same lookup; a hit is logged as a
       legacy, filename-based match that will be migrated.
    3. ``content_hash`` (if given) via ``check_duplicate_document``; a hit is
       logged as duplicate content coming from a different source.

    Returns the first match found, or the (falsy) result of the last lookup
    performed when nothing matches.
    """
    match = await check_document_by_unique_identifier(session, primary_hash)
    if match:
        return match

    if legacy_hash:
        match = await check_document_by_unique_identifier(session, legacy_hash)
        if match:
            logging.info(
                "Found legacy document (filename-based hash), "
                "will migrate to file_id-based hash"
            )
            return match

    if content_hash:
        match = await check_duplicate_document(session, content_hash)
        if match:
            logging.info(
                f"Found duplicate content from different source (content_hash match). "
                f"Original document ID: {match.id}, "
                f"type: {match.document_type}"
            )

    return match
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Connector helpers
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
async def update_document_from_connector(
    document: Document | None,
    connector: dict | None,
    session: AsyncSession,
) -> None:
    """
    Copy connector-provided fields onto ``document`` and commit the session.

    Does nothing (and does not commit) when either ``document`` or
    ``connector`` is missing/empty. Otherwise, for each of the keys
    ``"type"``, ``"metadata"`` and ``"connector_id"`` present in
    ``connector``, the matching document attribute is updated; connector
    metadata is merged over any existing document metadata, with connector
    values winning on key conflicts.
    """
    if not (document and connector):
        return

    if "type" in connector:
        document.document_type = connector["type"]

    if "metadata" in connector:
        incoming = connector["metadata"]
        current = document.document_metadata
        # With no existing metadata the connector dict is assigned as-is;
        # otherwise a new merged dict is built (connector keys take priority).
        document.document_metadata = {**current, **incoming} if current else incoming

    if "connector_id" in connector:
        document.connector_id = connector["connector_id"]

    await session.commit()
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Timeout calculations
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def calculate_upload_timeout(file_size_bytes: int) -> float:
    """
    Derive an upload timeout from the file size.

    The transfer time at ``UPLOAD_BYTES_PER_SECOND_SLOW`` is padded by a 1.5x
    safety factor and then clamped to the
    ``[MIN_UPLOAD_TIMEOUT, MAX_UPLOAD_TIMEOUT]`` range.
    """
    transfer_time = file_size_bytes / UPLOAD_BYTES_PER_SECOND_SLOW
    padded = transfer_time * 1.5
    capped = min(padded, MAX_UPLOAD_TIMEOUT)
    return max(MIN_UPLOAD_TIMEOUT, capped)
|
||||
|
||||
|
||||
def calculate_job_timeout(estimated_pages: int, file_size_bytes: int) -> float:
    """
    Derive a processing-job timeout from page count and file size.

    Two estimates are computed on top of ``BASE_JOB_TIMEOUT``: one adding
    ``PER_PAGE_JOB_TIMEOUT`` per estimated page, the other adding 60 per
    10 MiB of file size. The larger of the two is returned.
    """
    ten_mib = 10 * 1024 * 1024
    by_pages = BASE_JOB_TIMEOUT + (estimated_pages * PER_PAGE_JOB_TIMEOUT)
    by_size = BASE_JOB_TIMEOUT + (file_size_bytes / ten_mib) * 60
    return max(by_pages, by_size)
|
||||
285
surfsense_backend/app/tasks/document_processors/_save.py
Normal file
285
surfsense_backend/app/tasks/document_processors/_save.py
Normal file
|
|
@ -0,0 +1,285 @@
|
|||
"""
|
||||
Unified document save/update logic for file processors.
|
||||
|
||||
Replaces the three nearly-identical ``add_received_file_document_using_*``
|
||||
functions with a single ``save_file_document`` function plus thin wrappers
|
||||
for backward compatibility.
|
||||
"""
|
||||
|
||||
import logging
|
||||
|
||||
from langchain_core.documents import Document as LangChainDocument
|
||||
from sqlalchemy.exc import SQLAlchemyError
|
||||
from sqlalchemy.ext.asyncio import AsyncSession
|
||||
|
||||
from app.db import Document, DocumentStatus, DocumentType
|
||||
from app.services.llm_service import get_user_long_context_llm
|
||||
from app.utils.document_converters import (
|
||||
create_document_chunks,
|
||||
embed_text,
|
||||
generate_content_hash,
|
||||
generate_document_summary,
|
||||
)
|
||||
|
||||
from ._helpers import (
|
||||
find_existing_document_with_migration,
|
||||
get_google_drive_unique_identifier,
|
||||
handle_existing_document_update,
|
||||
)
|
||||
from .base import get_current_timestamp, safe_set_chunks
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Summary generation
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
async def _generate_summary(
    markdown_content: str,
    file_name: str,
    etl_service: str,
    user_llm,
    enable_summary: bool,
) -> tuple[str, list[float]]:
    """
    Build the summary text for a file document together with its embedding.

    * ``enable_summary`` false: no LLM call; the summary is the file name
      followed by the first 4000 characters of the markdown.
    * ``etl_service == "DOCLING"``: the Docling service produces the summary
      text, which is prefixed here with a small metadata header.
    * any other ETL service: delegated to ``generate_document_summary``.

    Returns:
        ``(summary_text, embedding)``.
    """
    if not enable_summary:
        plain = f"File: {file_name}\n\n{markdown_content[:4000]}"
        return plain, embed_text(plain)

    doc_meta = {
        "file_name": file_name,
        "etl_service": etl_service,
        "document_type": "File Document",
    }

    # Standard summary (Unstructured / LlamaCloud / others)
    if etl_service != "DOCLING":
        return await generate_document_summary(markdown_content, user_llm, doc_meta)

    # Imported lazily so the Docling service is only loaded when it is used.
    from app.services.docling_service import create_docling_service

    service = create_docling_service()
    summary_text = await service.process_large_document_summary(
        content=markdown_content, llm=user_llm, document_title=file_name
    )

    # Metadata header: one "**Title Cased Key:** value" line per non-empty entry.
    header_lines = ["# DOCUMENT METADATA"]
    header_lines.extend(
        f"**{key.replace('_', ' ').title()}:** {value}"
        for key, value in doc_meta.items()
        if value
    )

    enhanced = "\n".join(header_lines) + "\n\n# DOCUMENT SUMMARY\n\n" + summary_text
    return enhanced, embed_text(enhanced)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Unified save function
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
async def save_file_document(
    session: AsyncSession,
    file_name: str,
    markdown_content: str,
    search_space_id: int,
    user_id: str,
    etl_service: str,
    connector: dict | None = None,
    enable_summary: bool = True,
) -> Document | None:
    """
    Process and store a file document with deduplication and migration support.

    Handles both creating new documents and updating existing ones. This is
    the single implementation behind the per-ETL-service wrapper functions.

    Flow:
        1. Compute the identifier hashes and the content hash.
        2. Look for an existing document (primary hash, legacy hash, then
           content hash).
        3. If one exists, let ``handle_existing_document_update`` decide
           whether processing can be skipped.
        4. Otherwise generate summary + embedding + chunks and either update
           the existing row in place or insert a new ``Document``.

    Args:
        session: Database session
        file_name: Name of the processed file
        markdown_content: Markdown content to store
        search_space_id: ID of the search space
        user_id: ID of the user
        etl_service: Name of the ETL service (UNSTRUCTURED, LLAMACLOUD, DOCLING)
        connector: Optional connector info for Google Drive files
        enable_summary: Whether to generate an AI summary

    Returns:
        The created or updated Document; whatever document (possibly None)
        ``handle_existing_document_update`` returns when it signals a skip;
        or None when the commit fails on the ``ix_documents_content_hash``
        index.

    Raises:
        SQLAlchemyError: Database errors other than the content-hash index
            collision, re-raised after rollback.
        RuntimeError: Any other failure, wrapped with the ETL service name.
            This includes the "no long context LLM" error raised below,
            which is caught by the generic handler and re-wrapped.
    """
    try:
        # Returns a (primary, legacy) pair; the legacy hash may be None.
        primary_hash, legacy_hash = get_google_drive_unique_identifier(
            connector, file_name, search_space_id
        )
        content_hash = generate_content_hash(markdown_content, search_space_id)

        existing_document = await find_existing_document_with_migration(
            session, primary_hash, legacy_hash, content_hash
        )

        if existing_document:
            # The helper returns (should_skip, doc); when should_skip is true
            # its doc (which may be None) is returned to the caller as-is.
            should_skip, doc = await handle_existing_document_update(
                session,
                existing_document,
                content_hash,
                connector,
                file_name,
                primary_hash,
            )
            if should_skip:
                return doc

        # NOTE(review): an LLM is required even when enable_summary is False,
        # although _generate_summary appears not to use it in that case.
        # Confirm this gating is intentional.
        user_llm = await get_user_long_context_llm(session, user_id, search_space_id)
        if not user_llm:
            raise RuntimeError(
                f"No long context LLM configured for user {user_id} "
                f"in search space {search_space_id}"
            )

        summary_content, summary_embedding = await _generate_summary(
            markdown_content, file_name, etl_service, user_llm, enable_summary
        )
        chunks = await create_document_chunks(markdown_content)
        doc_metadata = {"FILE_NAME": file_name, "ETL_SERVICE": etl_service}

        if existing_document:
            # Update path: overwrite the stored row in place.
            existing_document.title = file_name
            existing_document.content = summary_content
            existing_document.content_hash = content_hash
            existing_document.embedding = summary_embedding
            # NOTE(review): this replaces the whole metadata dict, so any
            # previously stored keys beyond FILE_NAME / ETL_SERVICE are
            # dropped. Confirm that is intended.
            existing_document.document_metadata = doc_metadata
            await safe_set_chunks(session, existing_document, chunks)
            existing_document.source_markdown = markdown_content
            existing_document.content_needs_reindexing = False
            existing_document.updated_at = get_current_timestamp()
            existing_document.status = DocumentStatus.ready()

            await session.commit()
            await session.refresh(existing_document)
            return existing_document

        # Create path: plain FILE unless the connector marks it as Google Drive.
        doc_type = DocumentType.FILE
        if connector and connector.get("type") == DocumentType.GOOGLE_DRIVE_FILE:
            doc_type = DocumentType.GOOGLE_DRIVE_FILE

        document = Document(
            search_space_id=search_space_id,
            title=file_name,
            document_type=doc_type,
            document_metadata=doc_metadata,
            content=summary_content,
            embedding=summary_embedding,
            chunks=chunks,
            content_hash=content_hash,
            unique_identifier_hash=primary_hash,
            source_markdown=markdown_content,
            content_needs_reindexing=False,
            updated_at=get_current_timestamp(),
            created_by_id=user_id,
            connector_id=connector.get("connector_id") if connector else None,
            status=DocumentStatus.ready(),
        )
        session.add(document)
        await session.commit()
        await session.refresh(document)
        return document

    except SQLAlchemyError as db_error:
        await session.rollback()
        # A failure that mentions the content-hash index is treated as a
        # duplicate and reported as "skipped" (None), not as an error.
        if "ix_documents_content_hash" in str(db_error):
            logging.warning(
                "content_hash collision during commit for %s (%s). Skipping.",
                file_name,
                etl_service,
            )
            return None
        raise db_error
    except Exception as e:
        await session.rollback()
        raise RuntimeError(
            f"Failed to process file document using {etl_service}: {e!s}"
        ) from e
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Backward-compatible wrapper functions
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
async def add_received_file_document_using_unstructured(
    session: AsyncSession,
    file_name: str,
    unstructured_processed_elements: list[LangChainDocument],
    search_space_id: int,
    user_id: str,
    connector: dict | None = None,
    enable_summary: bool = True,
) -> Document | None:
    """
    Store a file parsed by the Unstructured service.

    Converts the Unstructured elements to markdown, then delegates to
    ``save_file_document`` with the ETL service set to ``"UNSTRUCTURED"``.
    """
    from app.utils.document_converters import convert_document_to_markdown

    markdown = await convert_document_to_markdown(unstructured_processed_elements)
    return await save_file_document(
        session=session,
        file_name=file_name,
        markdown_content=markdown,
        search_space_id=search_space_id,
        user_id=user_id,
        etl_service="UNSTRUCTURED",
        connector=connector,
        enable_summary=enable_summary,
    )
|
||||
|
||||
|
||||
async def add_received_file_document_using_llamacloud(
    session: AsyncSession,
    file_name: str,
    llamacloud_markdown_document: str,
    search_space_id: int,
    user_id: str,
    connector: dict | None = None,
    enable_summary: bool = True,
) -> Document | None:
    """
    Store markdown produced by LlamaCloud.

    Thin wrapper that delegates to ``save_file_document`` with the ETL
    service set to ``"LLAMACLOUD"``.
    """
    return await save_file_document(
        session=session,
        file_name=file_name,
        markdown_content=llamacloud_markdown_document,
        search_space_id=search_space_id,
        user_id=user_id,
        etl_service="LLAMACLOUD",
        connector=connector,
        enable_summary=enable_summary,
    )
|
||||
|
||||
|
||||
async def add_received_file_document_using_docling(
    session: AsyncSession,
    file_name: str,
    docling_markdown_document: str,
    search_space_id: int,
    user_id: str,
    connector: dict | None = None,
    enable_summary: bool = True,
) -> Document | None:
    """
    Store markdown produced by Docling.

    Thin wrapper that delegates to ``save_file_document`` with the ETL
    service set to ``"DOCLING"``.
    """
    return await save_file_document(
        session=session,
        file_name=file_name,
        markdown_content=docling_markdown_document,
        search_space_id=search_space_id,
        user_id=user_id,
        etl_service="DOCLING",
        connector=connector,
        enable_summary=enable_summary,
    )
|
||||
File diff suppressed because it is too large
Load diff
|
|
@ -14,88 +14,19 @@ from app.utils.document_converters import (
|
|||
create_document_chunks,
|
||||
generate_content_hash,
|
||||
generate_document_summary,
|
||||
generate_unique_identifier_hash,
|
||||
)
|
||||
|
||||
from ._helpers import (
|
||||
find_existing_document_with_migration,
|
||||
get_google_drive_unique_identifier,
|
||||
)
|
||||
from .base import (
|
||||
check_document_by_unique_identifier,
|
||||
check_duplicate_document,
|
||||
get_current_timestamp,
|
||||
safe_set_chunks,
|
||||
)
|
||||
|
||||
|
||||
def _get_google_drive_unique_identifier(
|
||||
connector: dict | None,
|
||||
filename: str,
|
||||
search_space_id: int,
|
||||
) -> tuple[str, str | None]:
|
||||
"""
|
||||
Get unique identifier hash for a file, with special handling for Google Drive.
|
||||
|
||||
For Google Drive files, uses file_id as the unique identifier (doesn't change on rename).
|
||||
For other files, uses filename.
|
||||
|
||||
Args:
|
||||
connector: Optional connector info dict with type and metadata
|
||||
filename: The filename (used for non-Google Drive files or as fallback)
|
||||
search_space_id: The search space ID
|
||||
|
||||
Returns:
|
||||
Tuple of (primary_hash, legacy_hash or None)
|
||||
"""
|
||||
if connector and connector.get("type") == DocumentType.GOOGLE_DRIVE_FILE:
|
||||
metadata = connector.get("metadata", {})
|
||||
file_id = metadata.get("google_drive_file_id")
|
||||
|
||||
if file_id:
|
||||
primary_hash = generate_unique_identifier_hash(
|
||||
DocumentType.GOOGLE_DRIVE_FILE, file_id, search_space_id
|
||||
)
|
||||
legacy_hash = generate_unique_identifier_hash(
|
||||
DocumentType.GOOGLE_DRIVE_FILE, filename, search_space_id
|
||||
)
|
||||
return primary_hash, legacy_hash
|
||||
|
||||
primary_hash = generate_unique_identifier_hash(
|
||||
DocumentType.FILE, filename, search_space_id
|
||||
)
|
||||
return primary_hash, None
|
||||
|
||||
|
||||
async def _find_existing_document_with_migration(
|
||||
session: AsyncSession,
|
||||
primary_hash: str,
|
||||
legacy_hash: str | None,
|
||||
content_hash: str | None = None,
|
||||
) -> Document | None:
|
||||
"""
|
||||
Find existing document, checking both new hash and legacy hash for migration,
|
||||
with fallback to content_hash for cross-source deduplication.
|
||||
"""
|
||||
existing_document = await check_document_by_unique_identifier(session, primary_hash)
|
||||
|
||||
if not existing_document and legacy_hash:
|
||||
existing_document = await check_document_by_unique_identifier(
|
||||
session, legacy_hash
|
||||
)
|
||||
if existing_document:
|
||||
logging.info(
|
||||
"Found legacy document (filename-based hash), will migrate to file_id-based hash"
|
||||
)
|
||||
|
||||
# Fallback: check by content_hash to catch duplicates from different sources
|
||||
if not existing_document and content_hash:
|
||||
existing_document = await check_duplicate_document(session, content_hash)
|
||||
if existing_document:
|
||||
logging.info(
|
||||
f"Found duplicate content from different source (content_hash match). "
|
||||
f"Original document ID: {existing_document.id}, type: {existing_document.document_type}"
|
||||
)
|
||||
|
||||
return existing_document
|
||||
|
||||
|
||||
async def _handle_existing_document_update(
|
||||
session: AsyncSession,
|
||||
existing_document: Document,
|
||||
|
|
@ -224,7 +155,7 @@ async def add_received_markdown_file_document(
|
|||
|
||||
try:
|
||||
# Generate unique identifier hash (uses file_id for Google Drive, filename for others)
|
||||
primary_hash, legacy_hash = _get_google_drive_unique_identifier(
|
||||
primary_hash, legacy_hash = get_google_drive_unique_identifier(
|
||||
connector, file_name, search_space_id
|
||||
)
|
||||
|
||||
|
|
@ -232,7 +163,7 @@ async def add_received_markdown_file_document(
|
|||
content_hash = generate_content_hash(file_in_markdown, search_space_id)
|
||||
|
||||
# Check if document exists (with migration support for Google Drive and content_hash fallback)
|
||||
existing_document = await _find_existing_document_with_migration(
|
||||
existing_document = await find_existing_document_with_migration(
|
||||
session, primary_hash, legacy_hash, content_hash
|
||||
)
|
||||
|
||||
|
|
|
|||
|
|
@ -2,12 +2,11 @@
|
|||
Integration tests for backend file upload limit enforcement.
|
||||
|
||||
These tests verify that the API rejects uploads that exceed:
|
||||
- Max files per upload (10)
|
||||
- Max per-file size (50 MB)
|
||||
- Max total upload size (200 MB)
|
||||
- Max per-file size (500 MB)
|
||||
|
||||
The limits mirror the frontend's DocumentUploadTab.tsx constants and are
|
||||
enforced server-side to protect against direct API calls.
|
||||
No file count or total size limits are enforced — the frontend batches
|
||||
uploads in groups of 5 and there is no cap on how many files a user can
|
||||
upload in a single session.
|
||||
|
||||
Prerequisites:
|
||||
- PostgreSQL + pgvector
|
||||
|
|
@ -24,60 +23,12 @@ pytestmark = pytest.mark.integration
|
|||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Test A: File count limit
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
class TestFileCountLimit:
|
||||
"""Uploading more than 10 files in a single request should be rejected."""
|
||||
|
||||
async def test_11_files_returns_413(
|
||||
self,
|
||||
client: httpx.AsyncClient,
|
||||
headers: dict[str, str],
|
||||
search_space_id: int,
|
||||
):
|
||||
files = [
|
||||
("files", (f"file_{i}.txt", io.BytesIO(b"test content"), "text/plain"))
|
||||
for i in range(11)
|
||||
]
|
||||
resp = await client.post(
|
||||
"/api/v1/documents/fileupload",
|
||||
headers=headers,
|
||||
files=files,
|
||||
data={"search_space_id": str(search_space_id)},
|
||||
)
|
||||
assert resp.status_code == 413
|
||||
assert "too many files" in resp.json()["detail"].lower()
|
||||
|
||||
async def test_10_files_accepted(
|
||||
self,
|
||||
client: httpx.AsyncClient,
|
||||
headers: dict[str, str],
|
||||
search_space_id: int,
|
||||
cleanup_doc_ids: list[int],
|
||||
):
|
||||
files = [
|
||||
("files", (f"file_{i}.txt", io.BytesIO(b"test content"), "text/plain"))
|
||||
for i in range(10)
|
||||
]
|
||||
resp = await client.post(
|
||||
"/api/v1/documents/fileupload",
|
||||
headers=headers,
|
||||
files=files,
|
||||
data={"search_space_id": str(search_space_id)},
|
||||
)
|
||||
assert resp.status_code == 200
|
||||
cleanup_doc_ids.extend(resp.json().get("document_ids", []))
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Test B: Per-file size limit
|
||||
# Test: Per-file size limit (500 MB)
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
class TestPerFileSizeLimit:
|
||||
"""A single file exceeding 50 MB should be rejected."""
|
||||
"""A single file exceeding 500 MB should be rejected."""
|
||||
|
||||
async def test_oversized_file_returns_413(
|
||||
self,
|
||||
|
|
@ -85,7 +36,7 @@ class TestPerFileSizeLimit:
|
|||
headers: dict[str, str],
|
||||
search_space_id: int,
|
||||
):
|
||||
oversized = io.BytesIO(b"\x00" * (50 * 1024 * 1024 + 1))
|
||||
oversized = io.BytesIO(b"\x00" * (500 * 1024 * 1024 + 1))
|
||||
resp = await client.post(
|
||||
"/api/v1/documents/fileupload",
|
||||
headers=headers,
|
||||
|
|
@ -102,11 +53,11 @@ class TestPerFileSizeLimit:
|
|||
search_space_id: int,
|
||||
cleanup_doc_ids: list[int],
|
||||
):
|
||||
at_limit = io.BytesIO(b"\x00" * (50 * 1024 * 1024))
|
||||
at_limit = io.BytesIO(b"\x00" * (500 * 1024 * 1024))
|
||||
resp = await client.post(
|
||||
"/api/v1/documents/fileupload",
|
||||
headers=headers,
|
||||
files=[("files", ("exact50mb.txt", at_limit, "text/plain"))],
|
||||
files=[("files", ("exact500mb.txt", at_limit, "text/plain"))],
|
||||
data={"search_space_id": str(search_space_id)},
|
||||
)
|
||||
assert resp.status_code == 200
|
||||
|
|
@ -114,26 +65,23 @@ class TestPerFileSizeLimit:
|
|||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Test C: Total upload size limit
|
||||
# Test: Multiple files accepted without count limit
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
class TestTotalSizeLimit:
|
||||
"""Multiple files whose combined size exceeds 200 MB should be rejected."""
|
||||
class TestNoFileCountLimit:
|
||||
"""Many files in a single request should be accepted."""
|
||||
|
||||
async def test_total_size_over_200mb_returns_413(
|
||||
async def test_many_files_accepted(
|
||||
self,
|
||||
client: httpx.AsyncClient,
|
||||
headers: dict[str, str],
|
||||
search_space_id: int,
|
||||
cleanup_doc_ids: list[int],
|
||||
):
|
||||
chunk_size = 45 * 1024 * 1024 # 45 MB each
|
||||
files = [
|
||||
(
|
||||
"files",
|
||||
(f"chunk_{i}.txt", io.BytesIO(b"\x00" * chunk_size), "text/plain"),
|
||||
)
|
||||
for i in range(5) # 5 x 45 MB = 225 MB > 200 MB
|
||||
("files", (f"file_{i}.txt", io.BytesIO(b"test content"), "text/plain"))
|
||||
for i in range(20)
|
||||
]
|
||||
resp = await client.post(
|
||||
"/api/v1/documents/fileupload",
|
||||
|
|
@ -141,5 +89,5 @@ class TestTotalSizeLimit:
|
|||
files=files,
|
||||
data={"search_space_id": str(search_space_id)},
|
||||
)
|
||||
assert resp.status_code == 413
|
||||
assert "total upload size" in resp.json()["detail"].lower()
|
||||
assert resp.status_code == 200
|
||||
cleanup_doc_ids.extend(resp.json().get("document_ids", []))
|
||||
|
|
|
|||
|
|
@ -248,7 +248,7 @@ class TestKnowledgeBaseSearchMiddlewarePlanner:
|
|||
return []
|
||||
|
||||
async def fake_build_scoped_filesystem(**kwargs):
|
||||
return {}
|
||||
return {}, {}
|
||||
|
||||
monkeypatch.setattr(
|
||||
"app.agents.new_chat.middleware.knowledge_search.search_knowledge_base",
|
||||
|
|
@ -298,7 +298,7 @@ class TestKnowledgeBaseSearchMiddlewarePlanner:
|
|||
return []
|
||||
|
||||
async def fake_build_scoped_filesystem(**kwargs):
|
||||
return {}
|
||||
return {}, {}
|
||||
|
||||
monkeypatch.setattr(
|
||||
"app.agents.new_chat.middleware.knowledge_search.search_knowledge_base",
|
||||
|
|
@ -334,7 +334,7 @@ class TestKnowledgeBaseSearchMiddlewarePlanner:
|
|||
return []
|
||||
|
||||
async def fake_build_scoped_filesystem(**kwargs):
|
||||
return {}
|
||||
return {}, {}
|
||||
|
||||
monkeypatch.setattr(
|
||||
"app.agents.new_chat.middleware.knowledge_search.search_knowledge_base",
|
||||
|
|
|
|||
|
|
@ -329,14 +329,15 @@ export function DocumentsTableShell({
|
|||
|
||||
const handleViewDocument = useCallback(async (doc: Document) => {
|
||||
setViewingDoc(doc);
|
||||
if (doc.content) {
|
||||
setViewingContent(doc.content);
|
||||
const preview = doc.content_preview || doc.content;
|
||||
if (preview) {
|
||||
setViewingContent(preview);
|
||||
return;
|
||||
}
|
||||
setViewingLoading(true);
|
||||
try {
|
||||
const fullDoc = await documentsApiService.getDocument({ id: doc.id });
|
||||
setViewingContent(fullDoc.content);
|
||||
setViewingContent(fullDoc.content_preview || fullDoc.content);
|
||||
} catch (err) {
|
||||
console.error("[DocumentsTableShell] Failed to fetch document content:", err);
|
||||
setViewingContent("Failed to load document content.");
|
||||
|
|
@ -946,13 +947,36 @@ export function DocumentsTableShell({
|
|||
WebkitMaskImage: `linear-gradient(to bottom, ${previewScrollPos === "top" ? "black" : "transparent"}, black 16px, black calc(100% - 16px), ${previewScrollPos === "bottom" ? "black" : "transparent"})`,
|
||||
}}
|
||||
>
|
||||
{viewingLoading ? (
|
||||
<div className="flex items-center justify-center py-12">
|
||||
<Spinner size="lg" className="text-muted-foreground" />
|
||||
</div>
|
||||
) : (
|
||||
<MarkdownViewer content={viewingContent} />
|
||||
)}
|
||||
{viewingLoading ? (
|
||||
<div className="flex items-center justify-center py-12">
|
||||
<Spinner size="lg" className="text-muted-foreground" />
|
||||
</div>
|
||||
) : (
|
||||
<>
|
||||
<MarkdownViewer content={viewingContent} maxLength={50_000} />
|
||||
{viewingDoc && (
|
||||
<div className="mt-4 flex justify-center">
|
||||
<Button
|
||||
variant="outline"
|
||||
size="sm"
|
||||
onClick={() => {
|
||||
if (viewingDoc) {
|
||||
openEditor({
|
||||
documentId: viewingDoc.id,
|
||||
searchSpaceId: Number(searchSpaceId),
|
||||
title: viewingDoc.title,
|
||||
});
|
||||
handleCloseViewer();
|
||||
}
|
||||
}}
|
||||
>
|
||||
<Eye className="h-3.5 w-3.5 mr-1.5" />
|
||||
View full document
|
||||
</Button>
|
||||
</div>
|
||||
)}
|
||||
</>
|
||||
)}
|
||||
</div>
|
||||
</DrawerContent>
|
||||
</Drawer>
|
||||
|
|
|
|||
|
|
@ -9,9 +9,9 @@ export type Document = {
|
|||
id: number;
|
||||
title: string;
|
||||
document_type: DocumentType;
|
||||
// Optional: Only needed when viewing document details (lazy loaded)
|
||||
document_metadata?: any;
|
||||
content?: string;
|
||||
content_preview?: string;
|
||||
created_at: string;
|
||||
search_space_id: number;
|
||||
created_by_id?: string | null;
|
||||
|
|
|
|||
|
|
@ -1,12 +1,13 @@
|
|||
"use client";
|
||||
|
||||
import { useAtomValue, useSetAtom } from "jotai";
|
||||
import { AlertCircle, XIcon } from "lucide-react";
|
||||
import { AlertCircle, Download, FileText, Loader2, XIcon } from "lucide-react";
|
||||
import dynamic from "next/dynamic";
|
||||
import { useCallback, useEffect, useRef, useState } from "react";
|
||||
import { toast } from "sonner";
|
||||
import { closeEditorPanelAtom, editorPanelAtom } from "@/atoms/editor/editor-panel.atom";
|
||||
import { MarkdownViewer } from "@/components/markdown-viewer";
|
||||
import { Alert, AlertDescription } from "@/components/ui/alert";
|
||||
import { Button } from "@/components/ui/button";
|
||||
import { Drawer, DrawerContent, DrawerHandle, DrawerTitle } from "@/components/ui/drawer";
|
||||
import { Skeleton } from "@/components/ui/skeleton";
|
||||
|
|
@ -18,11 +19,16 @@ const PlateEditor = dynamic(
|
|||
{ ssr: false, loading: () => <Skeleton className="h-64 w-full" /> }
|
||||
);
|
||||
|
||||
const LARGE_DOCUMENT_THRESHOLD = 2 * 1024 * 1024; // 2MB
|
||||
|
||||
interface EditorContent {
|
||||
document_id: number;
|
||||
title: string;
|
||||
document_type?: string;
|
||||
source_markdown: string;
|
||||
content_size_bytes?: number;
|
||||
chunk_count?: number;
|
||||
truncated?: boolean;
|
||||
}
|
||||
|
||||
const EDITABLE_DOCUMENT_TYPES = new Set(["FILE", "NOTE"]);
|
||||
|
|
@ -62,6 +68,7 @@ export function EditorPanelContent({
|
|||
const [isLoading, setIsLoading] = useState(true);
|
||||
const [error, setError] = useState<string | null>(null);
|
||||
const [saving, setSaving] = useState(false);
|
||||
const [downloading, setDownloading] = useState(false);
|
||||
|
||||
const [editedMarkdown, setEditedMarkdown] = useState<string | null>(null);
|
||||
const markdownRef = useRef<string>("");
|
||||
|
|
@ -69,6 +76,8 @@ export function EditorPanelContent({
|
|||
const changeCountRef = useRef(0);
|
||||
const [displayTitle, setDisplayTitle] = useState(title || "Untitled");
|
||||
|
||||
const isLargeDocument = (editorDoc?.content_size_bytes ?? 0) > LARGE_DOCUMENT_THRESHOLD;
|
||||
|
||||
useEffect(() => {
|
||||
let cancelled = false;
|
||||
setIsLoading(true);
|
||||
|
|
@ -86,10 +95,12 @@ export function EditorPanelContent({
|
|||
}
|
||||
|
||||
try {
|
||||
const response = await authenticatedFetch(
|
||||
`${process.env.NEXT_PUBLIC_FASTAPI_BACKEND_URL}/api/v1/search-spaces/${searchSpaceId}/documents/${documentId}/editor-content`,
|
||||
{ method: "GET" }
|
||||
const url = new URL(
|
||||
`${process.env.NEXT_PUBLIC_FASTAPI_BACKEND_URL}/api/v1/search-spaces/${searchSpaceId}/documents/${documentId}/editor-content`
|
||||
);
|
||||
url.searchParams.set("max_length", String(LARGE_DOCUMENT_THRESHOLD));
|
||||
|
||||
const response = await authenticatedFetch(url.toString(), { method: "GET" });
|
||||
|
||||
if (cancelled) return;
|
||||
|
||||
|
|
@ -175,7 +186,7 @@ export function EditorPanelContent({
|
|||
}, [documentId, searchSpaceId]);
|
||||
|
||||
const isEditableType = editorDoc
|
||||
? EDITABLE_DOCUMENT_TYPES.has(editorDoc.document_type ?? "")
|
||||
? EDITABLE_DOCUMENT_TYPES.has(editorDoc.document_type ?? "") && !isLargeDocument
|
||||
: false;
|
||||
|
||||
return (
|
||||
|
|
@ -206,6 +217,57 @@ export function EditorPanelContent({
|
|||
<p className="text-sm text-red-500 mt-1">{error || "An unknown error occurred"}</p>
|
||||
</div>
|
||||
</div>
|
||||
) : isLargeDocument ? (
|
||||
<div className="h-full overflow-y-auto px-5 py-4">
|
||||
<Alert className="mb-4">
|
||||
<FileText className="size-4" />
|
||||
<AlertDescription className="flex items-center justify-between gap-4">
|
||||
<span>
|
||||
This document is too large for the editor ({Math.round((editorDoc.content_size_bytes ?? 0) / 1024 / 1024)}MB, {editorDoc.chunk_count ?? 0} chunks). Showing a preview below.
|
||||
</span>
|
||||
<Button
|
||||
variant="outline"
|
||||
size="sm"
|
||||
className="shrink-0 gap-1.5"
|
||||
disabled={downloading}
|
||||
onClick={async () => {
|
||||
setDownloading(true);
|
||||
try {
|
||||
const response = await authenticatedFetch(
|
||||
`${process.env.NEXT_PUBLIC_FASTAPI_BACKEND_URL}/api/v1/search-spaces/${searchSpaceId}/documents/${documentId}/download-markdown`,
|
||||
{ method: "GET" }
|
||||
);
|
||||
if (!response.ok) throw new Error("Download failed");
|
||||
const blob = await response.blob();
|
||||
const url = URL.createObjectURL(blob);
|
||||
const a = document.createElement("a");
|
||||
a.href = url;
|
||||
const disposition = response.headers.get("content-disposition");
|
||||
const match = disposition?.match(/filename="(.+)"/);
|
||||
a.download = match?.[1] ?? `${editorDoc.title || "document"}.md`;
|
||||
document.body.appendChild(a);
|
||||
a.click();
|
||||
a.remove();
|
||||
URL.revokeObjectURL(url);
|
||||
toast.success("Download started");
|
||||
} catch {
|
||||
toast.error("Failed to download document");
|
||||
} finally {
|
||||
setDownloading(false);
|
||||
}
|
||||
}}
|
||||
>
|
||||
{downloading ? (
|
||||
<Loader2 className="size-3.5 animate-spin" />
|
||||
) : (
|
||||
<Download className="size-3.5" />
|
||||
)}
|
||||
{downloading ? "Preparing..." : "Download .md"}
|
||||
</Button>
|
||||
</AlertDescription>
|
||||
</Alert>
|
||||
<MarkdownViewer content={editorDoc.source_markdown} />
|
||||
</div>
|
||||
) : isEditableType ? (
|
||||
<PlateEditor
|
||||
key={documentId}
|
||||
|
|
|
|||
|
|
@ -1,18 +1,24 @@
|
|||
"use client";
|
||||
|
||||
import { AlertCircle, Pencil } from "lucide-react";
|
||||
import { AlertCircle, Download, FileText, Loader2, Pencil } from "lucide-react";
|
||||
import { useCallback, useEffect, useRef, useState } from "react";
|
||||
import { toast } from "sonner";
|
||||
import { PlateEditor } from "@/components/editor/plate-editor";
|
||||
import { MarkdownViewer } from "@/components/markdown-viewer";
|
||||
import { Alert, AlertDescription } from "@/components/ui/alert";
|
||||
import { Button } from "@/components/ui/button";
|
||||
import { authenticatedFetch, getBearerToken, redirectToLogin } from "@/lib/auth-utils";
|
||||
|
||||
const LARGE_DOCUMENT_THRESHOLD = 2 * 1024 * 1024; // 2MB
|
||||
|
||||
/**
 * Shape of the payload returned by the document `editor-content` endpoint
 * as consumed by this component.
 */
interface DocumentContent {
	/** Numeric identifier of the document. */
	document_id: number;
	/** Display title; also used as the fallback download file name. */
	title: string;
	/** Document type; checked against EDITABLE_DOCUMENT_TYPES to decide editability. */
	document_type?: string;
	/** Markdown body rendered by the viewer / editor. */
	source_markdown: string;
	/** Content size in bytes; compared against LARGE_DOCUMENT_THRESHOLD. */
	content_size_bytes?: number;
	/** Number of chunks, shown in the "too large" notice. */
	chunk_count?: number;
	// NOTE(review): presumably set when the backend cut `source_markdown`
	// down to the requested `max_length` — confirm against the API.
	truncated?: boolean;
}
|
||||
|
||||
function DocumentSkeleton() {
|
||||
|
|
@ -49,11 +55,14 @@ export function DocumentTabContent({ documentId, searchSpaceId, title }: Documen
|
|||
const [error, setError] = useState<string | null>(null);
|
||||
const [isEditing, setIsEditing] = useState(false);
|
||||
const [saving, setSaving] = useState(false);
|
||||
const [downloading, setDownloading] = useState(false);
|
||||
const [editedMarkdown, setEditedMarkdown] = useState<string | null>(null);
|
||||
const markdownRef = useRef<string>("");
|
||||
const initialLoadDone = useRef(false);
|
||||
const changeCountRef = useRef(0);
|
||||
|
||||
const isLargeDocument = (doc?.content_size_bytes ?? 0) > LARGE_DOCUMENT_THRESHOLD;
|
||||
|
||||
useEffect(() => {
|
||||
let cancelled = false;
|
||||
setIsLoading(true);
|
||||
|
|
@ -72,10 +81,12 @@ export function DocumentTabContent({ documentId, searchSpaceId, title }: Documen
|
|||
}
|
||||
|
||||
try {
|
||||
const response = await authenticatedFetch(
|
||||
`${process.env.NEXT_PUBLIC_FASTAPI_BACKEND_URL}/api/v1/search-spaces/${searchSpaceId}/documents/${documentId}/editor-content`,
|
||||
{ method: "GET" }
|
||||
const url = new URL(
|
||||
`${process.env.NEXT_PUBLIC_FASTAPI_BACKEND_URL}/api/v1/search-spaces/${searchSpaceId}/documents/${documentId}/editor-content`
|
||||
);
|
||||
url.searchParams.set("max_length", String(LARGE_DOCUMENT_THRESHOLD));
|
||||
|
||||
const response = await authenticatedFetch(url.toString(), { method: "GET" });
|
||||
|
||||
if (cancelled) return;
|
||||
|
||||
|
|
@ -173,9 +184,9 @@ export function DocumentTabContent({ documentId, searchSpaceId, title }: Documen
|
|||
);
|
||||
}
|
||||
|
||||
const isEditable = EDITABLE_DOCUMENT_TYPES.has(doc.document_type ?? "");
|
||||
const isEditable = EDITABLE_DOCUMENT_TYPES.has(doc.document_type ?? "") && !isLargeDocument;
|
||||
|
||||
if (isEditing) {
|
||||
if (isEditing && !isLargeDocument) {
|
||||
return (
|
||||
<div className="flex flex-col h-full overflow-hidden">
|
||||
<div className="flex items-center justify-between px-6 py-3 border-b shrink-0">
|
||||
|
|
@ -236,7 +247,60 @@ export function DocumentTabContent({ documentId, searchSpaceId, title }: Documen
|
|||
</div>
|
||||
<div className="flex-1 overflow-auto">
|
||||
<div className="max-w-4xl mx-auto px-6 py-6">
|
||||
<MarkdownViewer content={doc.source_markdown} />
|
||||
{isLargeDocument ? (
|
||||
<>
|
||||
<Alert className="mb-4">
|
||||
<FileText className="size-4" />
|
||||
<AlertDescription className="flex items-center justify-between gap-4">
|
||||
<span>
|
||||
This document is too large for the editor ({Math.round((doc.content_size_bytes ?? 0) / 1024 / 1024)}MB, {doc.chunk_count ?? 0} chunks). Showing a preview below.
|
||||
</span>
|
||||
<Button
|
||||
variant="outline"
|
||||
size="sm"
|
||||
className="shrink-0 gap-1.5"
|
||||
disabled={downloading}
|
||||
onClick={async () => {
|
||||
setDownloading(true);
|
||||
try {
|
||||
const response = await authenticatedFetch(
|
||||
`${process.env.NEXT_PUBLIC_FASTAPI_BACKEND_URL}/api/v1/search-spaces/${searchSpaceId}/documents/${documentId}/download-markdown`,
|
||||
{ method: "GET" }
|
||||
);
|
||||
if (!response.ok) throw new Error("Download failed");
|
||||
const blob = await response.blob();
|
||||
const url = URL.createObjectURL(blob);
|
||||
const a = document.createElement("a");
|
||||
a.href = url;
|
||||
const disposition = response.headers.get("content-disposition");
|
||||
const match = disposition?.match(/filename="(.+)"/);
|
||||
a.download = match?.[1] ?? `${doc.title || "document"}.md`;
|
||||
document.body.appendChild(a);
|
||||
a.click();
|
||||
a.remove();
|
||||
URL.revokeObjectURL(url);
|
||||
toast.success("Download started");
|
||||
} catch {
|
||||
toast.error("Failed to download document");
|
||||
} finally {
|
||||
setDownloading(false);
|
||||
}
|
||||
}}
|
||||
>
|
||||
{downloading ? (
|
||||
<Loader2 className="size-3.5 animate-spin" />
|
||||
) : (
|
||||
<Download className="size-3.5" />
|
||||
)}
|
||||
{downloading ? "Preparing..." : "Download .md"}
|
||||
</Button>
|
||||
</AlertDescription>
|
||||
</Alert>
|
||||
<MarkdownViewer content={doc.source_markdown} />
|
||||
</>
|
||||
) : (
|
||||
<MarkdownViewer content={doc.source_markdown} />
|
||||
)}
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
|
|
|
|||
|
|
@ -15,6 +15,7 @@ const math = createMathPlugin({
|
|||
/** Props accepted by the MarkdownViewer component. */
interface MarkdownViewerProps {
	/** Raw markdown text to render. */
	content: string;
	/** Optional extra class name(s). */
	className?: string;
	/**
	 * Optional cap on the number of characters rendered. When `content` is
	 * longer, only the first `maxLength` characters are shown together with
	 * a truncation notice.
	 */
	maxLength?: number;
}
|
||||
|
||||
/**
|
||||
|
|
@ -79,8 +80,10 @@ function convertLatexDelimiters(content: string): string {
|
|||
return content;
|
||||
}
|
||||
|
||||
export function MarkdownViewer({ content, className }: MarkdownViewerProps) {
|
||||
const processedContent = convertLatexDelimiters(stripOuterMarkdownFence(content));
|
||||
export function MarkdownViewer({ content, className, maxLength }: MarkdownViewerProps) {
|
||||
const isTruncated = maxLength != null && content.length > maxLength;
|
||||
const displayContent = isTruncated ? content.slice(0, maxLength) : content;
|
||||
const processedContent = convertLatexDelimiters(stripOuterMarkdownFence(displayContent));
|
||||
const components: StreamdownProps["components"] = {
|
||||
p: ({ children, ...props }) => (
|
||||
<p className="my-2" {...props}>
|
||||
|
|
@ -171,6 +174,11 @@ export function MarkdownViewer({ content, className }: MarkdownViewerProps) {
|
|||
>
|
||||
{processedContent}
|
||||
</Streamdown>
|
||||
{isTruncated && (
|
||||
<p className="mt-4 text-sm text-muted-foreground italic">
|
||||
Content truncated ({Math.round(content.length / 1024)}KB total). Showing first {Math.round(maxLength / 1024)}KB.
|
||||
</p>
|
||||
)}
|
||||
</div>
|
||||
);
|
||||
}
|
||||
|
|
|
|||
|
|
@ -1,7 +1,7 @@
|
|||
"use client";
|
||||
|
||||
import { useQuery } from "@tanstack/react-query";
|
||||
import { BookOpen, ChevronDown, ExternalLink, FileText, Hash, Sparkles, X } from "lucide-react";
|
||||
import { BookOpen, ChevronDown, ChevronUp, ExternalLink, FileText, Hash, Loader2, Sparkles, X } from "lucide-react";
|
||||
import { AnimatePresence, motion, useReducedMotion } from "motion/react";
|
||||
import { useTranslations } from "next-intl";
|
||||
import type React from "react";
|
||||
|
|
@ -10,7 +10,6 @@ import { createPortal } from "react-dom";
|
|||
import { MarkdownViewer } from "@/components/markdown-viewer";
|
||||
import { Badge } from "@/components/ui/badge";
|
||||
import { Button } from "@/components/ui/button";
|
||||
import { Collapsible, CollapsibleContent, CollapsibleTrigger } from "@/components/ui/collapsible";
|
||||
import { ScrollArea } from "@/components/ui/scroll-area";
|
||||
import { Spinner } from "@/components/ui/spinner";
|
||||
import type {
|
||||
|
|
@ -48,7 +47,8 @@ const formatDocumentType = (type: string) => {
|
|||
// which break auto-scroll functionality
|
||||
interface ChunkCardProps {
|
||||
chunk: { id: number; content: string };
|
||||
index: number;
|
||||
localIndex: number;
|
||||
chunkNumber: number;
|
||||
totalChunks: number;
|
||||
isCited: boolean;
|
||||
isActive: boolean;
|
||||
|
|
@ -56,11 +56,11 @@ interface ChunkCardProps {
|
|||
}
|
||||
|
||||
const ChunkCard = memo(
|
||||
forwardRef<HTMLDivElement, ChunkCardProps>(({ chunk, index, totalChunks, isCited }, ref) => {
|
||||
forwardRef<HTMLDivElement, ChunkCardProps>(({ chunk, localIndex, chunkNumber, totalChunks, isCited }, ref) => {
|
||||
return (
|
||||
<div
|
||||
ref={ref}
|
||||
data-chunk-index={index}
|
||||
data-chunk-index={localIndex}
|
||||
className={cn(
|
||||
"group relative rounded-2xl border-2 transition-all duration-300",
|
||||
isCited
|
||||
|
|
@ -68,10 +68,8 @@ const ChunkCard = memo(
|
|||
: "bg-card border-border/50 hover:border-border hover:shadow-md"
|
||||
)}
|
||||
>
|
||||
{/* Cited indicator glow effect */}
|
||||
{isCited && <div className="absolute inset-0 rounded-2xl bg-primary/5 blur-xl -z-10" />}
|
||||
|
||||
{/* Header */}
|
||||
<div className="flex items-center justify-between px-5 py-4 border-b border-border/50">
|
||||
<div className="flex items-center gap-3">
|
||||
<div
|
||||
|
|
@ -82,9 +80,9 @@ const ChunkCard = memo(
|
|||
: "bg-muted text-muted-foreground group-hover:bg-muted/80"
|
||||
)}
|
||||
>
|
||||
{index + 1}
|
||||
{chunkNumber}
|
||||
</div>
|
||||
<span className="text-sm text-muted-foreground">of {totalChunks} chunks</span>
|
||||
<span className="text-sm text-muted-foreground">Chunk {chunkNumber} of {totalChunks}</span>
|
||||
</div>
|
||||
{isCited && (
|
||||
<Badge variant="default" className="gap-1.5 px-3 py-1">
|
||||
|
|
@ -94,9 +92,8 @@ const ChunkCard = memo(
|
|||
)}
|
||||
</div>
|
||||
|
||||
{/* Content */}
|
||||
<div className="p-5 overflow-hidden">
|
||||
<MarkdownViewer content={chunk.content} />
|
||||
<MarkdownViewer content={chunk.content} maxLength={100_000} />
|
||||
</div>
|
||||
</div>
|
||||
);
|
||||
|
|
@ -118,7 +115,6 @@ export function SourceDetailPanel({
|
|||
const t = useTranslations("dashboard");
|
||||
const scrollAreaRef = useRef<HTMLDivElement>(null);
|
||||
const hasScrolledRef = useRef(false); // Use ref to avoid stale closures
|
||||
const [summaryOpen, setSummaryOpen] = useState(false);
|
||||
const [activeChunkIndex, setActiveChunkIndex] = useState<number | null>(null);
|
||||
const [mounted, setMounted] = useState(false);
|
||||
const [_hasScrolledToCited, setHasScrolledToCited] = useState(false);
|
||||
|
|
@ -140,20 +136,88 @@ export function SourceDetailPanel({
|
|||
if (isDocsChunk) {
|
||||
return documentsApiService.getSurfsenseDocByChunk(chunkId);
|
||||
}
|
||||
return documentsApiService.getDocumentByChunk({ chunk_id: chunkId });
|
||||
return documentsApiService.getDocumentByChunk({ chunk_id: chunkId, chunk_window: 5 });
|
||||
},
|
||||
enabled: !!chunkId && open,
|
||||
staleTime: 5 * 60 * 1000,
|
||||
});
|
||||
|
||||
const totalChunks = (documentData && "total_chunks" in documentData)
|
||||
? (documentData.total_chunks ?? documentData.chunks.length)
|
||||
: (documentData?.chunks?.length ?? 0);
|
||||
const [beforeChunks, setBeforeChunks] = useState<Array<{ id: number; content: string; created_at: string }>>([]);
|
||||
const [afterChunks, setAfterChunks] = useState<Array<{ id: number; content: string; created_at: string }>>([]);
|
||||
const [loadingBefore, setLoadingBefore] = useState(false);
|
||||
const [loadingAfter, setLoadingAfter] = useState(false);
|
||||
|
||||
useEffect(() => {
|
||||
setBeforeChunks([]);
|
||||
setAfterChunks([]);
|
||||
}, [chunkId, open]);
|
||||
|
||||
const chunkStartIndex = (documentData && "chunk_start_index" in documentData)
|
||||
? (documentData.chunk_start_index ?? 0) : 0;
|
||||
const initialChunks = documentData?.chunks ?? [];
|
||||
const allChunks = [...beforeChunks, ...initialChunks, ...afterChunks];
|
||||
const absoluteStart = chunkStartIndex - beforeChunks.length;
|
||||
const absoluteEnd = chunkStartIndex + initialChunks.length + afterChunks.length;
|
||||
const canLoadBefore = absoluteStart > 0;
|
||||
const canLoadAfter = absoluteEnd < totalChunks;
|
||||
|
||||
const EXPAND_SIZE = 10;
|
||||
|
||||
// Fetches up to EXPAND_SIZE chunks that sit immediately before the currently
// displayed window and prepends them to `beforeChunks`.
const loadBefore = useCallback(async () => {
	// Expansion is only offered for payloads carrying `search_space_id`
	// (presumably regular documents rather than Surfsense docs — confirm).
	if (!documentData || !("search_space_id" in documentData) || !canLoadBefore) return;
	setLoadingBefore(true);
	try {
		// Never ask for more chunks than exist ahead of the window.
		const count = Math.min(EXPAND_SIZE, absoluteStart);
		const result = await documentsApiService.getDocumentChunks({
			document_id: documentData.id,
			page: 0,
			page_size: count,
			start_offset: absoluteStart - count,
		});
		// De-duplicate inside the updater so the check runs against the latest
		// `beforeChunks` instead of a snapshot taken before the await. This also
		// keeps the per-render `allChunks` array out of the dependency list,
		// which previously forced the callback to be recreated on every render.
		setBeforeChunks((prev) => {
			const knownIds = new Set(
				[...prev, ...initialChunks, ...afterChunks].map((c) => c.id)
			);
			const fresh = result.items
				.filter((c) => !knownIds.has(c.id))
				.map((c) => ({ id: c.id, content: c.content, created_at: c.created_at }));
			return [...fresh, ...prev];
		});
	} catch (err) {
		// Best-effort: the panel keeps showing what is already loaded.
		console.error("Failed to load earlier chunks:", err);
	} finally {
		setLoadingBefore(false);
	}
}, [documentData, absoluteStart, canLoadBefore, initialChunks, afterChunks]);
|
||||
|
||||
// Fetches the next EXPAND_SIZE chunks that follow the currently displayed
// window and appends them to `afterChunks`.
const loadAfter = useCallback(async () => {
	// Expansion is only offered for payloads carrying `search_space_id`
	// (presumably regular documents rather than Surfsense docs — confirm).
	if (!documentData || !("search_space_id" in documentData) || !canLoadAfter) return;
	setLoadingAfter(true);
	try {
		const result = await documentsApiService.getDocumentChunks({
			document_id: documentData.id,
			page: 0,
			page_size: EXPAND_SIZE,
			// `absoluteEnd` is the index just past the last chunk on screen.
			start_offset: absoluteEnd,
		});
		// De-duplicate inside the updater so the check runs against the latest
		// `afterChunks` instead of a snapshot taken before the await. This also
		// keeps the per-render `allChunks` array out of the dependency list,
		// which previously forced the callback to be recreated on every render.
		setAfterChunks((prev) => {
			const knownIds = new Set(
				[...beforeChunks, ...initialChunks, ...prev].map((c) => c.id)
			);
			const fresh = result.items
				.filter((c) => !knownIds.has(c.id))
				.map((c) => ({ id: c.id, content: c.content, created_at: c.created_at }));
			return [...prev, ...fresh];
		});
	} catch (err) {
		// Best-effort: the panel keeps showing what is already loaded.
		console.error("Failed to load later chunks:", err);
	} finally {
		setLoadingAfter(false);
	}
}, [documentData, absoluteEnd, canLoadAfter, beforeChunks, initialChunks]);
|
||||
|
||||
const isDirectRenderSource =
|
||||
sourceType === "TAVILY_API" ||
|
||||
sourceType === "LINKUP_API" ||
|
||||
sourceType === "SEARXNG_API" ||
|
||||
sourceType === "BAIDU_SEARCH_API";
|
||||
|
||||
// Find cited chunk index
|
||||
const citedChunkIndex = documentData?.chunks?.findIndex((chunk) => chunk.id === chunkId) ?? -1;
|
||||
const citedChunkIndex = allChunks.findIndex((chunk) => chunk.id === chunkId);
|
||||
|
||||
// Simple scroll function that scrolls to a chunk by index
|
||||
const scrollToChunkByIndex = useCallback(
|
||||
|
|
@ -336,12 +400,12 @@ export function SourceDetailPanel({
|
|||
{documentData && "document_type" in documentData
|
||||
? formatDocumentType(documentData.document_type)
|
||||
: sourceType && formatDocumentType(sourceType)}
|
||||
{documentData?.chunks && (
|
||||
<span className="ml-2">
|
||||
• {documentData.chunks.length} chunk
|
||||
{documentData.chunks.length !== 1 ? "s" : ""}
|
||||
</span>
|
||||
)}
|
||||
{totalChunks > 0 && (
|
||||
<span className="ml-2">
|
||||
• {totalChunks} chunk{totalChunks !== 1 ? "s" : ""}
|
||||
{allChunks.length < totalChunks && ` (showing ${allChunks.length})`}
|
||||
</span>
|
||||
)}
|
||||
</p>
|
||||
</div>
|
||||
<div className="flex items-center gap-3 shrink-0">
|
||||
|
|
@ -450,7 +514,7 @@ export function SourceDetailPanel({
|
|||
{!isDirectRenderSource && documentData && (
|
||||
<div className="flex-1 flex overflow-hidden">
|
||||
{/* Chunk Navigation Sidebar */}
|
||||
{documentData.chunks.length > 1 && (
|
||||
{allChunks.length > 1 && (
|
||||
<motion.div
|
||||
initial={{ opacity: 0, x: -20 }}
|
||||
animate={{ opacity: 1, x: 0 }}
|
||||
|
|
@ -459,7 +523,8 @@ export function SourceDetailPanel({
|
|||
>
|
||||
<ScrollArea className="flex-1 h-full">
|
||||
<div className="p-2 pt-3 flex flex-col gap-1.5">
|
||||
{documentData.chunks.map((chunk, idx) => {
|
||||
{allChunks.map((chunk, idx) => {
|
||||
const absNum = absoluteStart + idx + 1;
|
||||
const isCited = chunk.id === chunkId;
|
||||
const isActive = activeChunkIndex === idx;
|
||||
return (
|
||||
|
|
@ -478,9 +543,9 @@ export function SourceDetailPanel({
|
|||
? "bg-muted text-foreground"
|
||||
: "bg-muted/50 text-muted-foreground hover:bg-muted hover:text-foreground"
|
||||
)}
|
||||
title={isCited ? `Chunk ${idx + 1} (Cited)` : `Chunk ${idx + 1}`}
|
||||
title={isCited ? `Chunk ${absNum} (Cited)` : `Chunk ${absNum}`}
|
||||
>
|
||||
{idx + 1}
|
||||
{absNum}
|
||||
{isCited && (
|
||||
<span className="absolute -top-1.5 -right-1.5 flex items-center justify-center w-4 h-4 bg-primary rounded-full border-2 border-background shadow-sm">
|
||||
<Sparkles className="h-2.5 w-2.5 text-primary-foreground" />
|
||||
|
|
@ -524,44 +589,11 @@ export function SourceDetailPanel({
|
|||
</motion.div>
|
||||
)}
|
||||
|
||||
{/* Summary Collapsible */}
|
||||
{documentData.content && (
|
||||
<motion.div
|
||||
initial={{ opacity: 0, y: 10 }}
|
||||
animate={{ opacity: 1, y: 0 }}
|
||||
transition={{ delay: 0.15 }}
|
||||
>
|
||||
<Collapsible open={summaryOpen} onOpenChange={setSummaryOpen}>
|
||||
<CollapsibleTrigger className="w-full flex items-center justify-between p-5 rounded-2xl bg-linear-to-r from-muted/50 to-muted/30 border hover:from-muted/70 hover:to-muted/50 transition-all duration-200">
|
||||
<span className="font-semibold flex items-center gap-2">
|
||||
<BookOpen className="h-4 w-4" />
|
||||
Document Summary
|
||||
</span>
|
||||
<motion.div
|
||||
animate={{ rotate: summaryOpen ? 180 : 0 }}
|
||||
transition={{ duration: 0.2 }}
|
||||
>
|
||||
<ChevronDown className="h-5 w-5 text-muted-foreground" />
|
||||
</motion.div>
|
||||
</CollapsibleTrigger>
|
||||
<CollapsibleContent>
|
||||
<motion.div
|
||||
initial={{ opacity: 0 }}
|
||||
animate={{ opacity: 1 }}
|
||||
className="mt-3 p-5 bg-muted/20 rounded-2xl border"
|
||||
>
|
||||
<MarkdownViewer content={documentData.content} />
|
||||
</motion.div>
|
||||
</CollapsibleContent>
|
||||
</Collapsible>
|
||||
</motion.div>
|
||||
)}
|
||||
|
||||
{/* Chunks Header */}
|
||||
<div className="flex items-center justify-between pt-4">
|
||||
<div className="flex items-center justify-between pt-2">
|
||||
<h3 className="text-sm font-semibold text-muted-foreground uppercase tracking-wider flex items-center gap-2">
|
||||
<Hash className="h-4 w-4" />
|
||||
Content Chunks
|
||||
Chunks {absoluteStart + 1}–{absoluteEnd} of {totalChunks}
|
||||
</h3>
|
||||
{citedChunkIndex !== -1 && (
|
||||
<Button
|
||||
|
|
@ -576,24 +608,70 @@ export function SourceDetailPanel({
|
|||
)}
|
||||
</div>
|
||||
|
||||
{/* Load Earlier */}
|
||||
{canLoadBefore && (
|
||||
<div className="flex items-center justify-center">
|
||||
<Button
|
||||
variant="outline"
|
||||
size="sm"
|
||||
onClick={loadBefore}
|
||||
disabled={loadingBefore}
|
||||
className="gap-2"
|
||||
>
|
||||
{loadingBefore ? (
|
||||
<Loader2 className="h-3.5 w-3.5 animate-spin" />
|
||||
) : (
|
||||
<ChevronUp className="h-3.5 w-3.5" />
|
||||
)}
|
||||
{loadingBefore
|
||||
? "Loading..."
|
||||
: `Load ${Math.min(EXPAND_SIZE, absoluteStart)} earlier chunks`}
|
||||
</Button>
|
||||
</div>
|
||||
)}
|
||||
|
||||
{/* Chunks */}
|
||||
<div className="space-y-4">
|
||||
{documentData.chunks.map((chunk, idx) => {
|
||||
{allChunks.map((chunk, idx) => {
|
||||
const isCited = chunk.id === chunkId;
|
||||
const chunkNumber = absoluteStart + idx + 1;
|
||||
return (
|
||||
<ChunkCard
|
||||
key={chunk.id}
|
||||
ref={isCited ? citedChunkRefCallback : undefined}
|
||||
chunk={chunk}
|
||||
index={idx}
|
||||
totalChunks={documentData.chunks.length}
|
||||
localIndex={idx}
|
||||
chunkNumber={chunkNumber}
|
||||
totalChunks={totalChunks}
|
||||
isCited={isCited}
|
||||
isActive={activeChunkIndex === idx}
|
||||
disableLayoutAnimation={documentData.chunks.length > 30}
|
||||
disableLayoutAnimation={allChunks.length > 30}
|
||||
/>
|
||||
);
|
||||
})}
|
||||
</div>
|
||||
|
||||
{/* Load Later */}
|
||||
{canLoadAfter && (
|
||||
<div className="flex items-center justify-center py-3">
|
||||
<Button
|
||||
variant="outline"
|
||||
size="sm"
|
||||
onClick={loadAfter}
|
||||
disabled={loadingAfter}
|
||||
className="gap-2"
|
||||
>
|
||||
{loadingAfter ? (
|
||||
<Loader2 className="h-3.5 w-3.5 animate-spin" />
|
||||
) : (
|
||||
<ChevronDown className="h-3.5 w-3.5" />
|
||||
)}
|
||||
{loadingAfter
|
||||
? "Loading..."
|
||||
: `Load ${Math.min(EXPAND_SIZE, totalChunks - absoluteEnd)} later chunks`}
|
||||
</Button>
|
||||
</div>
|
||||
)}
|
||||
</div>
|
||||
</ScrollArea>
|
||||
</div>
|
||||
|
|
|
|||
|
|
@ -1,10 +1,10 @@
|
|||
"use client";
|
||||
|
||||
import { useAtom } from "jotai";
|
||||
import { CheckCircle2, FileType, Info, Upload, X } from "lucide-react";
|
||||
import { CheckCircle2, FileType, FolderOpen, Info, Upload, X } from "lucide-react";
|
||||
|
||||
import { useTranslations } from "next-intl";
|
||||
import { useCallback, useMemo, useRef, useState } from "react";
|
||||
import { type ChangeEvent, useCallback, useMemo, useRef, useState } from "react";
|
||||
import { useDropzone } from "react-dropzone";
|
||||
import { toast } from "sonner";
|
||||
import { uploadDocumentMutationAtom } from "@/atoms/documents/document-mutation.atoms";
|
||||
|
|
@ -51,6 +51,7 @@ const commonTypes = {
|
|||
"application/vnd.openxmlformats-officedocument.presentationml.presentation": [".pptx"],
|
||||
"text/html": [".html", ".htm"],
|
||||
"text/csv": [".csv"],
|
||||
"text/tab-separated-values": [".tsv"],
|
||||
"image/jpeg": [".jpg", ".jpeg"],
|
||||
"image/png": [".png"],
|
||||
"image/bmp": [".bmp"],
|
||||
|
|
@ -76,7 +77,6 @@ const FILE_TYPE_CONFIG: Record<string, Record<string, string[]>> = {
|
|||
"application/rtf": [".rtf"],
|
||||
"application/xml": [".xml"],
|
||||
"application/epub+zip": [".epub"],
|
||||
"text/tab-separated-values": [".tsv"],
|
||||
"text/html": [".html", ".htm", ".web"],
|
||||
"image/gif": [".gif"],
|
||||
"image/svg+xml": [".svg"],
|
||||
|
|
@ -102,7 +102,6 @@ const FILE_TYPE_CONFIG: Record<string, Record<string, string[]>> = {
|
|||
"application/vnd.ms-powerpoint": [".ppt"],
|
||||
"text/x-rst": [".rst"],
|
||||
"application/rtf": [".rtf"],
|
||||
"text/tab-separated-values": [".tsv"],
|
||||
"application/vnd.ms-excel": [".xls"],
|
||||
"application/xml": [".xml"],
|
||||
...audioFileTypes,
|
||||
|
|
@ -116,10 +115,8 @@ interface FileWithId {
|
|||
|
||||
const cardClass = "border border-border bg-slate-400/5 dark:bg-white/5";
|
||||
|
||||
// Upload limits — files are sent in batches of 5 to avoid proxy timeouts
|
||||
const MAX_FILES = 50;
|
||||
const MAX_TOTAL_SIZE_MB = 200;
|
||||
const MAX_TOTAL_SIZE_BYTES = MAX_TOTAL_SIZE_MB * 1024 * 1024;
|
||||
const MAX_FILE_SIZE_MB = 500;
|
||||
const MAX_FILE_SIZE_BYTES = MAX_FILE_SIZE_MB * 1024 * 1024;
|
||||
|
||||
export function DocumentUploadTab({
|
||||
searchSpaceId,
|
||||
|
|
@ -134,6 +131,7 @@ export function DocumentUploadTab({
|
|||
const [uploadDocumentMutation] = useAtom(uploadDocumentMutationAtom);
|
||||
const { mutate: uploadDocuments, isPending: isUploading } = uploadDocumentMutation;
|
||||
const fileInputRef = useRef<HTMLInputElement>(null);
|
||||
const folderInputRef = useRef<HTMLInputElement>(null);
|
||||
|
||||
const acceptedFileTypes = useMemo(() => {
|
||||
const etlService = process.env.NEXT_PUBLIC_ETL_SERVICE;
|
||||
|
|
@ -145,49 +143,76 @@ export function DocumentUploadTab({
|
|||
[acceptedFileTypes]
|
||||
);
|
||||
|
||||
const onDrop = useCallback(
|
||||
(acceptedFiles: File[]) => {
|
||||
// Lower-cased set of the supported extensions, used for membership checks
// when filtering the files of a selected folder.
const supportedExtensionsSet = useMemo(() => {
	const lowered = supportedExtensions.map((extension) => extension.toLowerCase());
	return new Set(lowered);
}, [supportedExtensions]);
|
||||
|
||||
const addFiles = useCallback(
|
||||
(incoming: File[]) => {
|
||||
const oversized = incoming.filter((f) => f.size > MAX_FILE_SIZE_BYTES);
|
||||
if (oversized.length > 0) {
|
||||
toast.error(t("file_too_large"), {
|
||||
description: t("file_too_large_desc", {
|
||||
name: oversized[0].name,
|
||||
maxMB: MAX_FILE_SIZE_MB,
|
||||
}),
|
||||
});
|
||||
}
|
||||
const valid = incoming.filter((f) => f.size <= MAX_FILE_SIZE_BYTES);
|
||||
if (valid.length === 0) return;
|
||||
|
||||
setFiles((prev) => {
|
||||
const newEntries = acceptedFiles.map((f) => ({
|
||||
const newEntries = valid.map((f) => ({
|
||||
id: crypto.randomUUID?.() ?? `file-${Date.now()}-${Math.random().toString(36)}`,
|
||||
file: f,
|
||||
}));
|
||||
const newFiles = [...prev, ...newEntries];
|
||||
|
||||
if (newFiles.length > MAX_FILES) {
|
||||
toast.error(t("max_files_exceeded"), {
|
||||
description: t("max_files_exceeded_desc", { max: MAX_FILES }),
|
||||
});
|
||||
return prev;
|
||||
}
|
||||
|
||||
const newTotalSize = newFiles.reduce((sum, entry) => sum + entry.file.size, 0);
|
||||
if (newTotalSize > MAX_TOTAL_SIZE_BYTES) {
|
||||
toast.error(t("max_size_exceeded"), {
|
||||
description: t("max_size_exceeded_desc", { max: MAX_TOTAL_SIZE_MB }),
|
||||
});
|
||||
return prev;
|
||||
}
|
||||
|
||||
return newFiles;
|
||||
return [...prev, ...newEntries];
|
||||
});
|
||||
},
|
||||
[t]
|
||||
);
|
||||
|
||||
// Dropzone callback: hands the accepted files to the shared addFiles pipeline.
const onDrop = useCallback(
	function handleDrop(droppedFiles: File[]) {
		addFiles(droppedFiles);
	},
	[addFiles]
);
|
||||
|
||||
const { getRootProps, getInputProps, isDragActive } = useDropzone({
|
||||
onDrop,
|
||||
accept: acceptedFileTypes,
|
||||
maxSize: 50 * 1024 * 1024, // 50MB per file
|
||||
maxSize: MAX_FILE_SIZE_BYTES,
|
||||
noClick: false,
|
||||
disabled: files.length >= MAX_FILES,
|
||||
});
|
||||
|
||||
// Handle file input click to prevent event bubbling that might reopen dialog
|
||||
const handleFileInputClick = useCallback((e: React.MouseEvent<HTMLInputElement>) => {
|
||||
e.stopPropagation();
|
||||
}, []);
|
||||
|
||||
// Change handler for the hidden folder <input>: keeps only the files whose
// extension is in `supportedExtensionsSet` and forwards them to addFiles.
const handleFolderChange = useCallback(
	(e: ChangeEvent<HTMLInputElement>) => {
		const input = e.target;
		const picked = input.files;
		if (!picked || picked.length === 0) return;

		const supported: File[] = [];
		for (const candidate of Array.from(picked)) {
			// Names without a dot have no extension and can never match.
			if (!candidate.name.includes(".")) continue;
			const extension = `.${candidate.name.split(".").pop()?.toLowerCase()}`;
			if (supportedExtensionsSet.has(extension)) {
				supported.push(candidate);
			}
		}

		if (supported.length === 0) {
			toast.error(t("no_supported_files_in_folder"));
		} else {
			addFiles(supported);
		}

		// Reset the input value (presumably so that choosing the same folder
		// again still triggers a change event).
		input.value = "";
	},
	[addFiles, supportedExtensionsSet, t]
);
|
||||
|
||||
const formatFileSize = (bytes: number) => {
|
||||
if (bytes === 0) return "0 Bytes";
|
||||
const k = 1024;
|
||||
|
|
@ -198,15 +223,6 @@ export function DocumentUploadTab({
|
|||
|
||||
const totalFileSize = files.reduce((total, entry) => total + entry.file.size, 0);
|
||||
|
||||
// Check if limits are reached
|
||||
const isFileCountLimitReached = files.length >= MAX_FILES;
|
||||
const isSizeLimitReached = totalFileSize >= MAX_TOTAL_SIZE_BYTES;
|
||||
const remainingFiles = MAX_FILES - files.length;
|
||||
const remainingSizeMB = Math.max(
|
||||
0,
|
||||
(MAX_TOTAL_SIZE_BYTES - totalFileSize) / (1024 * 1024)
|
||||
).toFixed(1);
|
||||
|
||||
// Track accordion state changes
|
||||
const handleAccordionChange = useCallback(
|
||||
(value: string) => {
|
||||
|
|
@ -257,11 +273,21 @@ export function DocumentUploadTab({
|
|||
<Alert className="border border-border bg-slate-400/5 dark:bg-white/5">
|
||||
<Info className="h-4 w-4 shrink-0 mt-0.5" />
|
||||
<AlertDescription className="text-xs sm:text-sm leading-relaxed pt-0.5">
|
||||
{t("file_size_limit")}{" "}
|
||||
{t("upload_limits", { maxFiles: MAX_FILES, maxSizeMB: MAX_TOTAL_SIZE_MB })}
|
||||
{t("file_size_limit", { maxMB: MAX_FILE_SIZE_MB })}{" "}
|
||||
{t("upload_limits")}
|
||||
</AlertDescription>
|
||||
</Alert>
|
||||
|
||||
{/* Hidden folder input */}
|
||||
<input
|
||||
ref={folderInputRef}
|
||||
type="file"
|
||||
className="hidden"
|
||||
onChange={handleFolderChange}
|
||||
multiple
|
||||
{...({ webkitdirectory: "", directory: "" } as React.InputHTMLAttributes<HTMLInputElement>)}
|
||||
/>
|
||||
|
||||
<Card className={`relative overflow-hidden ${cardClass}`}>
|
||||
<div className="absolute inset-0 [mask-image:radial-gradient(ellipse_at_center,white,transparent)] opacity-30">
|
||||
<GridPattern />
|
||||
|
|
@ -269,11 +295,7 @@ export function DocumentUploadTab({
|
|||
<CardContent className="p-4 sm:p-10 relative z-10">
|
||||
<div
|
||||
{...getRootProps()}
|
||||
className={`flex flex-col items-center justify-center min-h-[200px] sm:min-h-[300px] border-2 border-dashed rounded-lg transition-colors ${
|
||||
isFileCountLimitReached || isSizeLimitReached
|
||||
? "border-destructive/50 bg-destructive/5 cursor-not-allowed"
|
||||
: "border-border hover:border-primary/50 cursor-pointer"
|
||||
}`}
|
||||
className="flex flex-col items-center justify-center min-h-[200px] sm:min-h-[300px] border-2 border-dashed rounded-lg transition-colors border-border hover:border-primary/50 cursor-pointer"
|
||||
>
|
||||
<input
|
||||
{...getInputProps()}
|
||||
|
|
@ -281,19 +303,7 @@ export function DocumentUploadTab({
|
|||
className="hidden"
|
||||
onClick={handleFileInputClick}
|
||||
/>
|
||||
{isFileCountLimitReached ? (
|
||||
<div className="flex flex-col items-center gap-2 sm:gap-4 text-center px-4">
|
||||
<Upload className="h-8 w-8 sm:h-12 sm:w-12 text-destructive/70" />
|
||||
<div>
|
||||
<p className="text-sm sm:text-lg font-medium text-destructive">
|
||||
{t("file_limit_reached")}
|
||||
</p>
|
||||
<p className="text-xs sm:text-sm text-muted-foreground mt-1">
|
||||
{t("file_limit_reached_desc", { max: MAX_FILES })}
|
||||
</p>
|
||||
</div>
|
||||
</div>
|
||||
) : isDragActive ? (
|
||||
{isDragActive ? (
|
||||
<div className="flex flex-col items-center gap-2 sm:gap-4">
|
||||
<Upload className="h-8 w-8 sm:h-12 sm:w-12 text-primary" />
|
||||
<p className="text-sm sm:text-lg font-medium text-primary">{t("drop_files")}</p>
|
||||
|
|
@ -305,29 +315,35 @@ export function DocumentUploadTab({
|
|||
<p className="text-sm sm:text-lg font-medium">{t("drag_drop")}</p>
|
||||
<p className="text-xs sm:text-sm text-muted-foreground mt-1">{t("or_browse")}</p>
|
||||
</div>
|
||||
{files.length > 0 && (
|
||||
<p className="text-xs text-muted-foreground">
|
||||
{t("remaining_capacity", { files: remainingFiles, sizeMB: remainingSizeMB })}
|
||||
</p>
|
||||
)}
|
||||
</div>
|
||||
)}
|
||||
{!isFileCountLimitReached && (
|
||||
<div className="mt-2 sm:mt-4">
|
||||
<Button
|
||||
variant="secondary"
|
||||
size="sm"
|
||||
className="text-xs sm:text-sm"
|
||||
onClick={(e) => {
|
||||
e.stopPropagation();
|
||||
e.preventDefault();
|
||||
fileInputRef.current?.click();
|
||||
}}
|
||||
>
|
||||
{t("browse_files")}
|
||||
</Button>
|
||||
</div>
|
||||
)}
|
||||
<div className="mt-2 sm:mt-4 flex gap-2">
|
||||
<Button
|
||||
variant="secondary"
|
||||
size="sm"
|
||||
className="text-xs sm:text-sm"
|
||||
onClick={(e) => {
|
||||
e.stopPropagation();
|
||||
e.preventDefault();
|
||||
fileInputRef.current?.click();
|
||||
}}
|
||||
>
|
||||
{t("browse_files")}
|
||||
</Button>
|
||||
<Button
|
||||
variant="outline"
|
||||
size="sm"
|
||||
className="text-xs sm:text-sm"
|
||||
onClick={(e) => {
|
||||
e.stopPropagation();
|
||||
e.preventDefault();
|
||||
folderInputRef.current?.click();
|
||||
}}
|
||||
>
|
||||
<FolderOpen className="h-4 w-4 mr-1.5" />
|
||||
{t("browse_folder")}
|
||||
</Button>
|
||||
</div>
|
||||
</div>
|
||||
</CardContent>
|
||||
</Card>
|
||||
|
|
|
|||
|
|
@ -39,6 +39,7 @@ export const document = z.object({
|
|||
document_type: documentTypeEnum,
|
||||
document_metadata: z.record(z.string(), z.any()),
|
||||
content: z.string(),
|
||||
content_preview: z.string().optional().default(""),
|
||||
content_hash: z.string(),
|
||||
unique_identifier_hash: z.string().nullable(),
|
||||
created_at: z.string(),
|
||||
|
|
@ -69,6 +70,8 @@ export const documentWithChunks = document.extend({
|
|||
created_at: z.string(),
|
||||
})
|
||||
),
|
||||
total_chunks: z.number().optional().default(0),
|
||||
chunk_start_index: z.number().optional().default(0),
|
||||
});
|
||||
|
||||
/**
|
||||
|
|
@ -243,10 +246,36 @@ export const getDocumentTypeCountsResponse = z.record(z.string(), z.number());
|
|||
*/
|
||||
// Request schema for fetching a document through one of its chunks.
export const getDocumentByChunkRequest = z.object({
|
||||
// Numeric chunk ID (required).
chunk_id: z.number(),
|
||||
// Optional, no default. Presumably how many neighbouring chunks to return around
// chunk_id — TODO confirm against the backend endpoint.
chunk_window: z.number().optional(),
|
||||
});
|
||||
|
||||
// The by-chunk response reuses the documentWithChunks schema as-is.
export const getDocumentByChunkResponse = documentWithChunks;
|
||||
|
||||
/**
|
||||
* Get paginated chunks for a document
|
||||
*/
|
||||
// Request schema for listing a document's chunks page by page.
export const getDocumentChunksRequest = z.object({
|
||||
// Numeric document ID (required).
document_id: z.number(),
|
||||
// Page number; falls back to 0 when omitted.
page: z.number().optional().default(0),
|
||||
// Page size; falls back to 20 when omitted.
page_size: z.number().optional().default(20),
|
||||
// Optional, no default. Presumably a starting chunk offset — TODO confirm
// the exact semantics against the backend.
start_offset: z.number().optional(),
|
||||
});
|
||||
|
||||
// Schema for a single chunk; used as the element type of getDocumentChunksResponse.items.
export const chunkRead = z.object({
|
||||
id: z.number(),
|
||||
content: z.string(),
|
||||
// Numeric ID of a document — presumably the chunk's parent (verify against the API).
document_id: z.number(),
|
||||
// Validated only as a string; presumably an ISO timestamp — TODO confirm.
created_at: z.string(),
|
||||
});
|
||||
|
||||
// Paginated response envelope for the document-chunks listing.
export const getDocumentChunksResponse = z.object({
|
||||
// Chunks on this page, each validated with chunkRead.
items: z.array(chunkRead),
|
||||
// Presumably the total number of chunks for the document — TODO confirm.
total: z.number(),
|
||||
page: z.number(),
|
||||
page_size: z.number(),
|
||||
// Presumably true when further pages exist after this one — TODO confirm.
has_more: z.boolean(),
|
||||
});
|
||||
|
||||
/**
|
||||
* Get Surfsense docs by chunk
|
||||
*/
|
||||
|
|
@ -328,3 +357,6 @@ export type GetSurfsenseDocsByChunkRequest = z.infer<typeof getSurfsenseDocsByCh
|
|||
// Static types derived from the zod schemas via z.infer, so they stay in sync with validation.
export type GetSurfsenseDocsByChunkResponse = z.infer<typeof getSurfsenseDocsByChunkResponse>;
|
||||
export type GetSurfsenseDocsRequest = z.infer<typeof getSurfsenseDocsRequest>;
|
||||
export type GetSurfsenseDocsResponse = z.infer<typeof getSurfsenseDocsResponse>;
|
||||
// Types for the paginated document-chunks schemas (request, response, single chunk).
export type GetDocumentChunksRequest = z.infer<typeof getDocumentChunksRequest>;
|
||||
export type GetDocumentChunksResponse = z.infer<typeof getDocumentChunksResponse>;
|
||||
export type ChunkRead = z.infer<typeof chunkRead>;
|
||||
|
|
|
|||
|
|
@ -6,6 +6,7 @@ import {
|
|||
deleteDocumentRequest,
|
||||
deleteDocumentResponse,
|
||||
type GetDocumentByChunkRequest,
|
||||
type GetDocumentChunksRequest,
|
||||
type GetDocumentRequest,
|
||||
type GetDocumentsRequest,
|
||||
type GetDocumentsStatusRequest,
|
||||
|
|
@ -13,6 +14,8 @@ import {
|
|||
type GetSurfsenseDocsRequest,
|
||||
getDocumentByChunkRequest,
|
||||
getDocumentByChunkResponse,
|
||||
getDocumentChunksRequest,
|
||||
getDocumentChunksResponse,
|
||||
getDocumentRequest,
|
||||
getDocumentResponse,
|
||||
getDocumentsRequest,
|
||||
|
|
@ -295,23 +298,52 @@ class DocumentsApiService {
|
|||
};
|
||||
|
||||
/**
 * Get document by chunk ID (includes a window of chunks around the cited one).
 *
 * Validates the request with `getDocumentByChunkRequest`, then issues a GET to
 * `/api/v1/documents/by-chunk/{chunk_id}`. `chunk_window` is appended as a query
 * parameter only when it is provided.
 *
 * @param request - Chunk ID plus an optional `chunk_window`.
 * @returns Whatever `baseApiService.get` returns when called with the URL and
 *   `getDocumentByChunkResponse` (presumably the response validated against that schema).
 * @throws ValidationError when the request does not satisfy the schema.
 */
getDocumentByChunk = async (request: GetDocumentByChunkRequest) => {
	// Validate the request
	const parsedRequest = getDocumentByChunkRequest.safeParse(request);

	if (!parsedRequest.success) {
		console.error("Invalid request:", parsedRequest.error);

		// Format a user friendly error message
		const errorMessage = parsedRequest.error.issues.map((issue) => issue.message).join(", ");
		throw new ValidationError(`Invalid request: ${errorMessage}`);
	}

	// Build the URL from the validated payload rather than the raw argument,
	// the same way getDocumentChunks consumes its parsed request.
	const { chunk_id, chunk_window } = parsedRequest.data;

	const params = new URLSearchParams();
	if (chunk_window != null) {
		params.set("chunk_window", String(chunk_window));
	}
	const qs = params.toString();
	// Only add "?" when there is at least one query parameter.
	const url = `/api/v1/documents/by-chunk/${chunk_id}${qs ? `?${qs}` : ""}`;

	return baseApiService.get(url, getDocumentByChunkResponse);
};
|
||||
|
||||
/**
 * Get paginated chunks for a document.
 *
 * Validates the request with `getDocumentChunksRequest` and issues a GET to
 * `/api/v1/documents/{document_id}/chunks` with `page` and `page_size` as query
 * parameters; `start_offset` is added only when it is provided.
 *
 * @param request - Document ID and pagination options.
 * @returns Whatever `baseApiService.get` returns when called with the URL and
 *   `getDocumentChunksResponse` (presumably the response validated against that schema).
 * @throws ValidationError when the request does not satisfy the schema.
 */
getDocumentChunks = async (request: GetDocumentChunksRequest) => {
	const parsedRequest = getDocumentChunksRequest.safeParse(request);

	if (!parsedRequest.success) {
		console.error("Invalid request:", parsedRequest.error);

		const errorMessage = parsedRequest.error.issues.map((issue) => issue.message).join(", ");
		throw new ValidationError(`Invalid request: ${errorMessage}`);
	}

	// Read from the parsed data so the schema defaults for page / page_size apply.
	const { document_id, page, page_size, start_offset } = parsedRequest.data;

	const params = new URLSearchParams({
		page: String(page),
		page_size: String(page_size),
	});
	if (start_offset != null) {
		params.set("start_offset", String(start_offset));
	}

	// Only the chunks endpoint and its response schema belong here; the by-chunk
	// URL and getDocumentByChunkResponse are part of getDocumentByChunk.
	return baseApiService.get(
		`/api/v1/documents/${document_id}/chunks?${params}`,
		getDocumentChunksResponse
	);
};
|
||||
|
||||
|
|
|
|||
|
|
@ -376,12 +376,13 @@
|
|||
"upload_documents": {
|
||||
"title": "Upload Documents",
|
||||
"subtitle": "Upload your files to make them searchable and accessible through AI-powered conversations.",
|
||||
"file_size_limit": "Maximum file size: 50MB per file.",
|
||||
"upload_limits": "Upload limit: {maxFiles} files, {maxSizeMB}MB total.",
|
||||
"drop_files": "Drop files here",
|
||||
"drag_drop": "Drag & drop files here",
|
||||
"or_browse": "or click to browse",
|
||||
"file_size_limit": "Maximum file size: {maxMB}MB per file.",
|
||||
"upload_limits": "Upload files or entire folders",
|
||||
"drop_files": "Drop files or folders here",
|
||||
"drag_drop": "Drag & drop files or folders here",
|
||||
"or_browse": "or click to browse files and folders",
|
||||
"browse_files": "Browse Files",
|
||||
"browse_folder": "Browse Folder",
|
||||
"selected_files": "Selected Files ({count})",
|
||||
"total_size": "Total size",
|
||||
"clear_all": "Clear all",
|
||||
|
|
@ -394,13 +395,9 @@
|
|||
"upload_error_desc": "Error uploading files",
|
||||
"supported_file_types": "Supported File Types",
|
||||
"file_types_desc": "These file types are supported based on your current ETL service configuration.",
|
||||
"max_files_exceeded": "File Limit Exceeded",
|
||||
"max_files_exceeded_desc": "You can upload a maximum of {max} files at a time.",
|
||||
"max_size_exceeded": "Size Limit Exceeded",
|
||||
"max_size_exceeded_desc": "Total file size cannot exceed {max}MB.",
|
||||
"file_limit_reached": "Maximum Files Reached",
|
||||
"file_limit_reached_desc": "Remove some files to add more (max {max} files).",
|
||||
"remaining_capacity": "{files} files remaining • {sizeMB}MB available"
|
||||
"file_too_large": "File Too Large",
|
||||
"file_too_large_desc": "\"{name}\" exceeds the {maxMB}MB per-file limit.",
|
||||
"no_supported_files_in_folder": "No supported file types found in the selected folder."
|
||||
},
|
||||
"add_webpage": {
|
||||
"title": "Add Webpages for Crawling",
|
||||
|
|
|
|||
|
|
@ -376,12 +376,13 @@
|
|||
"upload_documents": {
|
||||
"title": "Subir documentos",
|
||||
"subtitle": "Sube tus archivos para hacerlos buscables y accesibles a través de conversaciones con IA.",
|
||||
"file_size_limit": "Tamaño máximo de archivo: 50 MB por archivo.",
|
||||
"upload_limits": "Límite de subida: {maxFiles} archivos, {maxSizeMB} MB en total.",
|
||||
"drop_files": "Suelta los archivos aquí",
|
||||
"drag_drop": "Arrastra y suelta archivos aquí",
|
||||
"or_browse": "o haz clic para explorar",
|
||||
"file_size_limit": "Tamaño máximo de archivo: {maxMB} MB por archivo.",
|
||||
"upload_limits": "Sube archivos o carpetas enteras",
|
||||
"drop_files": "Suelta archivos o carpetas aquí",
|
||||
"drag_drop": "Arrastra y suelta archivos o carpetas aquí",
|
||||
"or_browse": "o haz clic para explorar archivos y carpetas",
|
||||
"browse_files": "Explorar archivos",
|
||||
"browse_folder": "Explorar carpeta",
|
||||
"selected_files": "Archivos seleccionados ({count})",
|
||||
"total_size": "Tamaño total",
|
||||
"clear_all": "Limpiar todo",
|
||||
|
|
@ -394,13 +395,9 @@
|
|||
"upload_error_desc": "Error al subir archivos",
|
||||
"supported_file_types": "Tipos de archivo soportados",
|
||||
"file_types_desc": "Estos tipos de archivo son soportados según la configuración actual de tu servicio ETL.",
|
||||
"max_files_exceeded": "Límite de archivos excedido",
|
||||
"max_files_exceeded_desc": "Puedes subir un máximo de {max} archivos a la vez.",
|
||||
"max_size_exceeded": "Límite de tamaño excedido",
|
||||
"max_size_exceeded_desc": "El tamaño total de los archivos no puede exceder {max} MB.",
|
||||
"file_limit_reached": "Máximo de archivos alcanzado",
|
||||
"file_limit_reached_desc": "Elimina algunos archivos para agregar más (máximo {max} archivos).",
|
||||
"remaining_capacity": "{files} archivos restantes • {sizeMB} MB disponibles"
|
||||
"file_too_large": "Archivo demasiado grande",
|
||||
"file_too_large_desc": "\"{name}\" excede el límite de {maxMB} MB por archivo.",
|
||||
"no_supported_files_in_folder": "No se encontraron tipos de archivo compatibles en la carpeta seleccionada."
|
||||
},
|
||||
"add_webpage": {
|
||||
"title": "Agregar páginas web para rastreo",
|
||||
|
|
|
|||
|
|
@ -376,12 +376,13 @@
|
|||
"upload_documents": {
|
||||
"title": "दस्तावेज़ अपलोड करें",
|
||||
"subtitle": "AI-संचालित बातचीत के माध्यम से अपनी फ़ाइलों को खोजने योग्य और सुलभ बनाने के लिए अपलोड करें।",
|
||||
"file_size_limit": "अधिकतम फ़ाइल आकार: प्रति फ़ाइल 50MB।",
|
||||
"upload_limits": "अपलोड सीमा: {maxFiles} फ़ाइलें, कुल {maxSizeMB}MB।",
|
||||
"drop_files": "फ़ाइलें यहां छोड़ें",
|
||||
"drag_drop": "फ़ाइलें यहां खींचें और छोड़ें",
|
||||
"or_browse": "या ब्राउज़ करने के लिए क्लिक करें",
|
||||
"file_size_limit": "अधिकतम फ़ाइल आकार: प्रति फ़ाइल {maxMB}MB।",
|
||||
"upload_limits": "फ़ाइलें या पूरे फ़ोल्डर अपलोड करें",
|
||||
"drop_files": "फ़ाइलें या फ़ोल्डर यहां छोड़ें",
|
||||
"drag_drop": "फ़ाइलें या फ़ोल्डर यहां खींचें और छोड़ें",
|
||||
"or_browse": "या फ़ाइलें और फ़ोल्डर ब्राउज़ करने के लिए क्लिक करें",
|
||||
"browse_files": "फ़ाइलें ब्राउज़ करें",
|
||||
"browse_folder": "फ़ोल्डर ब्राउज़ करें",
|
||||
"selected_files": "चयनित फ़ाइलें ({count})",
|
||||
"total_size": "कुल आकार",
|
||||
"clear_all": "सभी साफ करें",
|
||||
|
|
@ -394,13 +395,9 @@
|
|||
"upload_error_desc": "फ़ाइलें अपलोड करने में त्रुटि",
|
||||
"supported_file_types": "समर्थित फ़ाइल प्रकार",
|
||||
"file_types_desc": "ये फ़ाइल प्रकार आपकी वर्तमान ETL सेवा कॉन्फ़िगरेशन के आधार पर समर्थित हैं।",
|
||||
"max_files_exceeded": "फ़ाइल सीमा पार हो गई",
|
||||
"max_files_exceeded_desc": "आप एक बार में अधिकतम {max} फ़ाइलें अपलोड कर सकते हैं।",
|
||||
"max_size_exceeded": "आकार सीमा पार हो गई",
|
||||
"max_size_exceeded_desc": "कुल फ़ाइल आकार {max}MB से अधिक नहीं हो सकता।",
|
||||
"file_limit_reached": "अधिकतम फ़ाइलें पहुंच गई",
|
||||
"file_limit_reached_desc": "और जोड़ने के लिए कुछ फ़ाइलें हटाएं (अधिकतम {max} फ़ाइलें)।",
|
||||
"remaining_capacity": "{files} फ़ाइलें शेष • {sizeMB}MB उपलब्ध"
|
||||
"file_too_large": "फ़ाइल बहुत बड़ी है",
|
||||
"file_too_large_desc": "\"{name}\" प्रति फ़ाइल {maxMB}MB की सीमा से अधिक है।",
|
||||
"no_supported_files_in_folder": "चयनित फ़ोल्डर में कोई समर्थित फ़ाइल प्रकार नहीं मिला।"
|
||||
},
|
||||
"add_webpage": {
|
||||
"title": "क्रॉलिंग के लिए वेबपेज जोड़ें",
|
||||
|
|
|
|||
|
|
@ -376,12 +376,13 @@
|
|||
"upload_documents": {
|
||||
"title": "Enviar documentos",
|
||||
"subtitle": "Envie seus arquivos para torná-los pesquisáveis e acessíveis através de conversas com IA.",
|
||||
"file_size_limit": "Tamanho máximo do arquivo: 50 MB por arquivo.",
|
||||
"upload_limits": "Limite de envio: {maxFiles} arquivos, {maxSizeMB} MB no total.",
|
||||
"drop_files": "Solte os arquivos aqui",
|
||||
"drag_drop": "Arraste e solte arquivos aqui",
|
||||
"or_browse": "ou clique para navegar",
|
||||
"file_size_limit": "Tamanho máximo do arquivo: {maxMB} MB por arquivo.",
|
||||
"upload_limits": "Envie arquivos ou pastas inteiras",
|
||||
"drop_files": "Solte arquivos ou pastas aqui",
|
||||
"drag_drop": "Arraste e solte arquivos ou pastas aqui",
|
||||
"or_browse": "ou clique para navegar arquivos e pastas",
|
||||
"browse_files": "Navegar arquivos",
|
||||
"browse_folder": "Navegar pasta",
|
||||
"selected_files": "Arquivos selecionados ({count})",
|
||||
"total_size": "Tamanho total",
|
||||
"clear_all": "Limpar tudo",
|
||||
|
|
@ -394,13 +395,9 @@
|
|||
"upload_error_desc": "Erro ao enviar arquivos",
|
||||
"supported_file_types": "Tipos de arquivo suportados",
|
||||
"file_types_desc": "Estes tipos de arquivo são suportados com base na configuração atual do seu serviço ETL.",
|
||||
"max_files_exceeded": "Limite de arquivos excedido",
|
||||
"max_files_exceeded_desc": "Você pode enviar no máximo {max} arquivos de uma vez.",
|
||||
"max_size_exceeded": "Limite de tamanho excedido",
|
||||
"max_size_exceeded_desc": "O tamanho total dos arquivos não pode exceder {max} MB.",
|
||||
"file_limit_reached": "Máximo de arquivos atingido",
|
||||
"file_limit_reached_desc": "Remova alguns arquivos para adicionar mais (máximo {max} arquivos).",
|
||||
"remaining_capacity": "{files} arquivos restantes • {sizeMB} MB disponíveis"
|
||||
"file_too_large": "Arquivo muito grande",
|
||||
"file_too_large_desc": "\"{name}\" excede o limite de {maxMB} MB por arquivo.",
|
||||
"no_supported_files_in_folder": "Nenhum tipo de arquivo suportado encontrado na pasta selecionada."
|
||||
},
|
||||
"add_webpage": {
|
||||
"title": "Adicionar páginas web para rastreamento",
|
||||
|
|
|
|||
|
|
@ -360,12 +360,13 @@
|
|||
"upload_documents": {
|
||||
"title": "上传文档",
|
||||
"subtitle": "上传您的文件,使其可通过 AI 对话进行搜索和访问。",
|
||||
"file_size_limit": "最大文件大小:每个文件 50MB。",
|
||||
"upload_limits": "上传限制:最多 {maxFiles} 个文件,总大小不超过 {maxSizeMB}MB。",
|
||||
"drop_files": "放下文件到这里",
|
||||
"drag_drop": "拖放文件到这里",
|
||||
"or_browse": "或点击浏览",
|
||||
"file_size_limit": "最大文件大小:每个文件 {maxMB}MB。",
|
||||
"upload_limits": "上传文件或整个文件夹",
|
||||
"drop_files": "将文件或文件夹拖放到此处",
|
||||
"drag_drop": "将文件或文件夹拖放到此处",
|
||||
"or_browse": "或点击浏览文件和文件夹",
|
||||
"browse_files": "浏览文件",
|
||||
"browse_folder": "浏览文件夹",
|
||||
"selected_files": "已选择的文件 ({count})",
|
||||
"total_size": "总大小",
|
||||
"clear_all": "全部清除",
|
||||
|
|
@ -378,13 +379,9 @@
|
|||
"upload_error_desc": "上传文件时出错",
|
||||
"supported_file_types": "支持的文件类型",
|
||||
"file_types_desc": "根据您当前的 ETL 服务配置支持这些文件类型。",
|
||||
"max_files_exceeded": "超过文件数量限制",
|
||||
"max_files_exceeded_desc": "一次最多只能上传 {max} 个文件。",
|
||||
"max_size_exceeded": "超过文件大小限制",
|
||||
"max_size_exceeded_desc": "文件总大小不能超过 {max}MB。",
|
||||
"file_limit_reached": "已达到最大文件数量",
|
||||
"file_limit_reached_desc": "移除一些文件以添加更多(最多 {max} 个文件)。",
|
||||
"remaining_capacity": "剩余 {files} 个文件名额 • 可用 {sizeMB}MB"
|
||||
"file_too_large": "文件过大",
|
||||
"file_too_large_desc": "\"{name}\" 超过了每个文件 {maxMB}MB 的限制。",
|
||||
"no_supported_files_in_folder": "所选文件夹中没有找到支持的文件类型。"
|
||||
},
|
||||
"add_webpage": {
|
||||
"title": "添加网页爬取",
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue