mirror of https://github.com/MODSetter/SurfSense.git
synced 2026-04-26 17:26:23 +02:00
refactor: streamline document upload limits and enhance handling of mentioned documents
- Raised the maximum file size limit to 500 MB per file.
- Removed the restrictions on the number of files per upload and on total upload size.
- Improved handling of user-mentioned documents in the knowledge base search middleware.
- Reworked document reading and processing logic to support the new features and optimizations.
parent 6727266107
commit 62e698d8aa

33 changed files with 2889 additions and 2443 deletions
```diff
@@ -1,7 +1,7 @@
 # Force asyncio to use standard event loop before unstructured imports
 import asyncio
 
-from fastapi import APIRouter, Depends, Form, HTTPException, UploadFile
+from fastapi import APIRouter, Depends, Form, HTTPException, Query, UploadFile
 from sqlalchemy.ext.asyncio import AsyncSession
 from sqlalchemy.future import select
 from sqlalchemy.orm import selectinload
```
```diff
@@ -17,6 +17,7 @@ from app.db import (
     get_async_session,
 )
 from app.schemas import (
+    ChunkRead,
     DocumentRead,
     DocumentsCreate,
     DocumentStatusBatchResponse,
```
```diff
@@ -45,9 +46,7 @@ os.environ["UNSTRUCTURED_HAS_PATCHED_LOOP"] = "1"
 
 router = APIRouter()
 
-MAX_FILES_PER_UPLOAD = 10
-MAX_FILE_SIZE_BYTES = 50 * 1024 * 1024  # 50 MB per file
-MAX_TOTAL_SIZE_BYTES = 200 * 1024 * 1024  # 200 MB total
+MAX_FILE_SIZE_BYTES = 500 * 1024 * 1024  # 500 MB per file
 
 
 @router.post("/documents")
```
```diff
@@ -156,13 +155,6 @@ async def create_documents_file_upload(
     if not files:
         raise HTTPException(status_code=400, detail="No files provided")
 
-    if len(files) > MAX_FILES_PER_UPLOAD:
-        raise HTTPException(
-            status_code=413,
-            detail=f"Too many files. Maximum {MAX_FILES_PER_UPLOAD} files per upload.",
-        )
-
-    total_size = 0
     for file in files:
         file_size = file.size or 0
         if file_size > MAX_FILE_SIZE_BYTES:
```
```diff
@@ -171,14 +163,6 @@ async def create_documents_file_upload(
                 detail=f"File '{file.filename}' ({file_size / (1024 * 1024):.1f} MB) "
                 f"exceeds the {MAX_FILE_SIZE_BYTES // (1024 * 1024)} MB per-file limit.",
             )
-        total_size += file_size
-
-    if total_size > MAX_TOTAL_SIZE_BYTES:
-        raise HTTPException(
-            status_code=413,
-            detail=f"Total upload size ({total_size / (1024 * 1024):.1f} MB) "
-            f"exceeds the {MAX_TOTAL_SIZE_BYTES // (1024 * 1024)} MB limit.",
-        )
 
     # ===== Read all files concurrently to avoid blocking the event loop =====
     async def _read_and_save(file: UploadFile) -> tuple[str, str, int]:
```
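With both removals applied, upload validation reduces to the empty-check and the per-file cap. A minimal sketch of the surviving logic — the `validate_files` wrapper is hypothetical; `MAX_FILE_SIZE_BYTES` and the error wiring come straight from the diff:

```python
from fastapi import HTTPException, UploadFile

MAX_FILE_SIZE_BYTES = 500 * 1024 * 1024  # 500 MB per file

def validate_files(files: list[UploadFile]) -> None:
    # Hypothetical wrapper around the checks that remain after this commit.
    if not files:
        raise HTTPException(status_code=400, detail="No files provided")
    for file in files:
        file_size = file.size or 0  # UploadFile.size may be None
        if file_size > MAX_FILE_SIZE_BYTES:
            raise HTTPException(
                status_code=413,
                detail=f"File '{file.filename}' ({file_size / (1024 * 1024):.1f} MB) "
                f"exceeds the {MAX_FILE_SIZE_BYTES // (1024 * 1024)} MB per-file limit.",
            )
```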
```diff
@@ -206,16 +190,6 @@ async def create_documents_file_upload(
 
     saved_files = await asyncio.gather(*(_read_and_save(f) for f in files))
 
-    actual_total_size = sum(size for _, _, size in saved_files)
-    if actual_total_size > MAX_TOTAL_SIZE_BYTES:
-        for temp_path, _, _ in saved_files:
-            os.unlink(temp_path)
-        raise HTTPException(
-            status_code=413,
-            detail=f"Total upload size ({actual_total_size / (1024 * 1024):.1f} MB) "
-            f"exceeds the {MAX_TOTAL_SIZE_BYTES // (1024 * 1024)} MB limit.",
-        )
-
     # ===== PHASE 1: Create pending documents for all files =====
     created_documents: list[Document] = []
     files_to_process: list[tuple[Document, str, str]] = []
```
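The `asyncio.gather` call fans out one coroutine per file so slow reads do not serialize on the event loop; the deleted block was the only post-read total-size rollback, so after this commit the gathered results go straight to the pending-document phase. The `_read_and_save` body is elided from this diff; a stand-in sketch of the pattern (temp-file handling here is illustrative, not SurfSense's actual I/O):

```python
import asyncio
import tempfile

from fastapi import UploadFile

async def _read_and_save_sketch(file: UploadFile) -> tuple[str, str, int]:
    # Illustrative stand-in: the real _read_and_save body is not shown above.
    data = await file.read()  # UploadFile.read() is a coroutine
    with tempfile.NamedTemporaryFile(delete=False) as tmp:
        tmp.write(data)
    return tmp.name, file.filename or "unnamed", len(data)

async def read_all(files: list[UploadFile]) -> list[tuple[str, str, int]]:
    # One task per file; the awaits interleave instead of running sequentially.
    return await asyncio.gather(*(_read_and_save_sketch(f) for f in files))
```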
```diff
@@ -451,13 +425,15 @@ async def read_documents(
                 reason=doc.status.get("reason"),
             )
 
+        raw_content = doc.content or ""
         api_documents.append(
             DocumentRead(
                 id=doc.id,
                 title=doc.title,
                 document_type=doc.document_type,
                 document_metadata=doc.document_metadata,
-                content=doc.content,
+                content="",
+                content_preview=raw_content[:300],
                 content_hash=doc.content_hash,
                 unique_identifier_hash=doc.unique_identifier_hash,
                 created_at=doc.created_at,
```
```diff
@@ -609,13 +585,15 @@ async def search_documents(
                 reason=doc.status.get("reason"),
             )
 
+        raw_content = doc.content or ""
         api_documents.append(
            DocumentRead(
                 id=doc.id,
                 title=doc.title,
                 document_type=doc.document_type,
                 document_metadata=doc.document_metadata,
-                content=doc.content,
+                content="",
+                content_preview=raw_content[:300],
                 content_hash=doc.content_hash,
                 unique_identifier_hash=doc.unique_identifier_hash,
                 created_at=doc.created_at,
```
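Both list endpoints (`read_documents` and `search_documents`) now ship an empty `content` plus a 300-character `content_preview`, which keeps list payloads small; the full text remains available only from the single-document route (final hunk). A hypothetical client flow under that shape — `httpx`, the base URL, and the bare-array response shape are all assumptions, not part of the diff:

```python
import asyncio

import httpx  # assumed client library

async def main() -> None:
    async with httpx.AsyncClient(base_url="http://localhost:8000") as client:
        docs = (await client.get("/documents")).json()  # response shape assumed
        for d in docs:
            print(d["title"], "->", d["content_preview"][:60])  # content is now ""
        # Full text requires the per-document route:
        full = (await client.get(f"/documents/{docs[0]['id']}")).json()
        print("full length:", len(full["content"]))

asyncio.run(main())
```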
```diff
@@ -884,16 +862,19 @@ async def get_document_type_counts(
 @router.get("/documents/by-chunk/{chunk_id}", response_model=DocumentWithChunksRead)
 async def get_document_by_chunk_id(
     chunk_id: int,
+    chunk_window: int = Query(
+        5, ge=0, description="Number of chunks before/after the cited chunk to include"
+    ),
     session: AsyncSession = Depends(get_async_session),
     user: User = Depends(current_active_user),
 ):
     """
-    Retrieves a document based on a chunk ID, including all its chunks ordered by creation time.
+    Retrieves a document based on a chunk ID, including a window of chunks around the cited one.
     Requires DOCUMENTS_READ permission for the search space.
-    The document's embedding and chunk embeddings are excluded from the response.
+    Uses SQL-level pagination to avoid loading all chunks into memory.
     """
     try:
-        # First, get the chunk and verify it exists
+        from sqlalchemy import and_, func, or_
 
         chunk_result = await session.execute(select(Chunk).filter(Chunk.id == chunk_id))
         chunk = chunk_result.scalars().first()
```
```diff
@@ -902,11 +883,8 @@ async def get_document_by_chunk_id(
                 status_code=404, detail=f"Chunk with id {chunk_id} not found"
             )
 
-        # Get the associated document
         document_result = await session.execute(
-            select(Document)
-            .options(selectinload(Document.chunks))
-            .filter(Document.id == chunk.document_id)
+            select(Document).filter(Document.id == chunk.document_id)
         )
         document = document_result.scalars().first()
 
```
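Dropping `selectinload(Document.chunks)` means the document row is fetched alone instead of eagerly pulling every chunk; the windowing hunk below fetches only the rows it needs. That window relies on a deterministic `(created_at, id)` ordering. Expressed as a plain-Python predicate, "strictly before the cited chunk" is just a lexicographic tuple comparison:

```python
# "Strictly before" under the (created_at, id) order used by the new queries:
def is_before(a, b) -> bool:
    return (a.created_at, a.id) < (b.created_at, b.id)

# which expands to exactly the or_/and_ filter in the windowing hunk:
#   a.created_at < b.created_at
#   OR (a.created_at == b.created_at AND a.id < b.id)
```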
```diff
@@ -916,7 +894,6 @@ async def get_document_by_chunk_id(
                 detail="Document not found",
             )
 
-        # Check permission for the search space
         await check_permission(
             session,
             user,
```
```diff
@@ -925,10 +902,38 @@ async def get_document_by_chunk_id(
             "You don't have permission to read documents in this search space",
         )
 
-        # Sort chunks by creation time
-        sorted_chunks = sorted(document.chunks, key=lambda x: x.created_at)
+        total_result = await session.execute(
+            select(func.count())
+            .select_from(Chunk)
+            .filter(Chunk.document_id == document.id)
+        )
+        total_chunks = total_result.scalar() or 0
+
+        cited_idx_result = await session.execute(
+            select(func.count())
+            .select_from(Chunk)
+            .filter(
+                Chunk.document_id == document.id,
+                or_(
+                    Chunk.created_at < chunk.created_at,
+                    and_(Chunk.created_at == chunk.created_at, Chunk.id < chunk.id),
+                ),
+            )
+        )
+        cited_idx = cited_idx_result.scalar() or 0
+
+        start = max(0, cited_idx - chunk_window)
+        end = min(total_chunks, cited_idx + chunk_window + 1)
+
+        windowed_result = await session.execute(
+            select(Chunk)
+            .filter(Chunk.document_id == document.id)
+            .order_by(Chunk.created_at, Chunk.id)
+            .offset(start)
+            .limit(end - start)
+        )
+        windowed_chunks = windowed_result.scalars().all()
 
         # Return the document with its chunks
         return DocumentWithChunksRead(
             id=document.id,
             title=document.title,
```
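Counting the chunks that sort strictly before the cited one yields its zero-based index, so the window becomes plain offset/limit arithmetic. A worked example with the default `chunk_window=5` (values illustrative):

```python
# Illustrative values: cited chunk is index 2 in a 20-chunk document.
chunk_window, cited_idx, total_chunks = 5, 2, 20

start = max(0, cited_idx - chunk_window)               # 0 (clamped at the front)
end = min(total_chunks, cited_idx + chunk_window + 1)  # 8

assert (start, end) == (0, 8)
assert end - start == 8  # fewer than the full 2*5 + 1 = 11 chunks near an edge
```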
```diff
@@ -940,7 +945,9 @@ async def get_document_by_chunk_id(
             created_at=document.created_at,
             updated_at=document.updated_at,
             search_space_id=document.search_space_id,
-            chunks=sorted_chunks,
+            chunks=windowed_chunks,
+            total_chunks=total_chunks,
+            chunk_start_index=start,
         )
     except HTTPException:
         raise
```
```diff
@@ -950,6 +957,75 @@ async def get_document_by_chunk_id(
         ) from e
 
 
+@router.get(
+    "/documents/{document_id}/chunks",
+    response_model=PaginatedResponse[ChunkRead],
+)
+async def get_document_chunks_paginated(
+    document_id: int,
+    page: int = Query(0, ge=0),
+    page_size: int = Query(20, ge=1, le=100),
+    start_offset: int | None = Query(
+        None, ge=0, description="Direct offset; overrides page * page_size"
+    ),
+    session: AsyncSession = Depends(get_async_session),
+    user: User = Depends(current_active_user),
+):
+    """
+    Paginated chunk loading for a document.
+    Supports both page-based and offset-based access.
+    """
+    try:
+        from sqlalchemy import func
+
+        doc_result = await session.execute(
+            select(Document).filter(Document.id == document_id)
+        )
+        document = doc_result.scalars().first()
+
+        if not document:
+            raise HTTPException(status_code=404, detail="Document not found")
+
+        await check_permission(
+            session,
+            user,
+            document.search_space_id,
+            Permission.DOCUMENTS_READ.value,
+            "You don't have permission to read documents in this search space",
+        )
+
+        total_result = await session.execute(
+            select(func.count())
+            .select_from(Chunk)
+            .filter(Chunk.document_id == document_id)
+        )
+        total = total_result.scalar() or 0
+
+        offset = start_offset if start_offset is not None else page * page_size
+        chunks_result = await session.execute(
+            select(Chunk)
+            .filter(Chunk.document_id == document_id)
+            .order_by(Chunk.created_at, Chunk.id)
+            .offset(offset)
+            .limit(page_size)
+        )
+        chunks = chunks_result.scalars().all()
+
+        return PaginatedResponse(
+            items=chunks,
+            total=total,
+            page=offset // page_size if page_size else page,
+            page_size=page_size,
+            has_more=(offset + len(chunks)) < total,
+        )
+    except HTTPException:
+        raise
+    except Exception as e:
+        raise HTTPException(
+            status_code=500, detail=f"Failed to fetch chunks: {e!s}"
+        ) from e
+
+
 @router.get("/documents/{document_id}", response_model=DocumentRead)
 async def read_document(
     document_id: int,
```
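The new endpoint accepts either `page`/`page_size` or a direct `start_offset`, with the offset taking precedence; `has_more` is simple arithmetic over the returned slice. A quick check of that logic in isolation (values illustrative):

```python
def page_math(total: int, page: int, page_size: int, start_offset: int | None):
    # Mirrors the offset / has_more arithmetic from the endpoint above.
    offset = start_offset if start_offset is not None else page * page_size
    fetched = max(0, min(page_size, total - offset))  # rows a LIMIT query returns
    return offset, (offset + fetched) < total

assert page_math(total=45, page=2, page_size=20, start_offset=None) == (40, False)
assert page_math(total=45, page=0, page_size=20, start_offset=7) == (7, True)
```

Presumably `start_offset` is meant to pair with the `chunk_start_index` returned by the by-chunk route, letting a client page outward from a citation.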
```diff
@@ -980,13 +1056,14 @@ async def read_document(
             "You don't have permission to read documents in this search space",
         )
 
-        # Convert database object to API-friendly format
+        raw_content = document.content or ""
         return DocumentRead(
             id=document.id,
             title=document.title,
             document_type=document.document_type,
             document_metadata=document.document_metadata,
-            content=document.content,
+            content=raw_content,
+            content_preview=raw_content[:300],
             content_hash=document.content_hash,
             unique_identifier_hash=document.unique_identifier_hash,
             created_at=document.created_at,
```