2025-07-24 14:43:48 -07:00
|
|
|
# Force asyncio to use standard event loop before unstructured imports
|
|
|
|
|
import asyncio
|
|
|
|
|
|
2025-10-20 00:30:00 -07:00
|
|
|
from fastapi import APIRouter, Depends, Form, HTTPException, UploadFile
|
2025-03-14 18:53:14 -07:00
|
|
|
from sqlalchemy.ext.asyncio import AsyncSession
|
|
|
|
|
from sqlalchemy.future import select
|
2025-08-23 18:48:18 -07:00
|
|
|
from sqlalchemy.orm import selectinload
|
2025-07-24 14:43:48 -07:00
|
|
|
|
2025-08-23 18:48:18 -07:00
|
|
|
from app.db import (
|
|
|
|
|
Chunk,
|
|
|
|
|
Document,
|
|
|
|
|
DocumentType,
|
2025-11-27 22:45:04 -08:00
|
|
|
Permission,
|
2025-08-23 18:48:18 -07:00
|
|
|
SearchSpace,
|
2025-11-27 22:45:04 -08:00
|
|
|
SearchSpaceMembership,
|
2025-08-23 18:48:18 -07:00
|
|
|
User,
|
|
|
|
|
get_async_session,
|
|
|
|
|
)
|
|
|
|
|
from app.schemas import (
|
|
|
|
|
DocumentRead,
|
|
|
|
|
DocumentsCreate,
|
2026-01-17 20:45:10 +05:30
|
|
|
DocumentTitleRead,
|
|
|
|
|
DocumentTitleSearchResponse,
|
2025-08-23 18:48:18 -07:00
|
|
|
DocumentUpdate,
|
|
|
|
|
DocumentWithChunksRead,
|
2025-10-01 13:05:22 -07:00
|
|
|
PaginatedResponse,
|
2025-08-23 18:48:18 -07:00
|
|
|
)
|
2025-03-14 18:53:14 -07:00
|
|
|
from app.users import current_active_user
|
2025-11-27 22:45:04 -08:00
|
|
|
from app.utils.rbac import check_permission
|
2025-07-21 06:19:37 -07:00
|
|
|
|
2025-03-20 18:52:06 -07:00
|
|
|
# Install the stock asyncio policy before any loop-patching library
# (see the "unstructured" note above) gets a chance to import.
try:
    asyncio.set_event_loop_policy(asyncio.DefaultEventLoopPolicy())
except RuntimeError as e:
    # Non-fatal: keep whatever policy is already installed.
    print("Error setting event loop policy", e)

import os

# Presumably consumed by `unstructured` so it skips its own event-loop
# patching — TODO confirm against the library's loop-detection code.
os.environ["UNSTRUCTURED_HAS_PATCHED_LOOP"] = "1"


router = APIRouter()
|
|
|
|
|
|
2025-05-13 21:13:53 -07:00
|
|
|
|
2025-10-31 01:33:01 -07:00
|
|
|
@router.post("/documents")
async def create_documents(
    request: DocumentsCreate,
    session: AsyncSession = Depends(get_async_session),
    user: User = Depends(current_active_user),
):
    """
    Create new documents by dispatching background processing tasks.

    Requires DOCUMENTS_CREATE permission.
    """
    try:
        # Caller must be allowed to create documents in the target search space.
        await check_permission(
            session,
            user,
            request.search_space_id,
            Permission.DOCUMENTS_CREATE.value,
            "You don't have permission to create documents in this search space",
        )

        user_id = str(user.id)

        if request.document_type == DocumentType.EXTENSION:
            from app.tasks.celery_tasks.document_tasks import (
                process_extension_document_task,
            )

            for doc in request.content:
                meta = doc.metadata
                # Celery needs a plain serializable payload, so flatten the
                # incoming model into a dict before dispatching.
                payload = {
                    "metadata": {
                        "VisitedWebPageTitle": meta.VisitedWebPageTitle,
                        "VisitedWebPageURL": meta.VisitedWebPageURL,
                        "BrowsingSessionId": meta.BrowsingSessionId,
                        "VisitedWebPageDateWithTimeInISOString": meta.VisitedWebPageDateWithTimeInISOString,
                        "VisitedWebPageVisitDurationInMilliseconds": meta.VisitedWebPageVisitDurationInMilliseconds,
                        "VisitedWebPageReffererURL": meta.VisitedWebPageReffererURL,
                    },
                    "pageContent": doc.pageContent,
                }
                process_extension_document_task.delay(
                    payload, request.search_space_id, user_id
                )
        elif request.document_type == DocumentType.YOUTUBE_VIDEO:
            from app.tasks.celery_tasks.document_tasks import process_youtube_video_task

            for video_url in request.content:
                process_youtube_video_task.delay(
                    video_url, request.search_space_id, user_id
                )
        else:
            raise HTTPException(status_code=400, detail="Invalid document type")

        await session.commit()
        return {"message": "Documents processed successfully"}
    except HTTPException:
        raise
    except Exception as e:
        await session.rollback()
        raise HTTPException(
            status_code=500, detail=f"Failed to process documents: {e!s}"
        ) from e
|
2025-03-14 18:53:14 -07:00
|
|
|
|
2025-05-13 21:13:53 -07:00
|
|
|
|
2025-03-14 18:53:14 -07:00
|
|
|
@router.post("/documents/fileupload")
async def create_documents_file_upload(
    files: list[UploadFile],
    search_space_id: int = Form(...),
    session: AsyncSession = Depends(get_async_session),
    user: User = Depends(current_active_user),
):
    """
    Upload files as documents.

    Each file is staged to a temporary path and handed off to a Celery task
    for background processing. Requires DOCUMENTS_CREATE permission.

    Raises:
        HTTPException: 400 if no files are provided, 422 if a file cannot be
            staged/dispatched, 500 on unexpected errors.
    """
    try:
        # Check permission
        await check_permission(
            session,
            user,
            search_space_id,
            Permission.DOCUMENTS_CREATE.value,
            "You don't have permission to create documents in this search space",
        )

        if not files:
            raise HTTPException(status_code=400, detail="No files provided")

        # Loop-invariant imports hoisted out of the per-file loop
        # (`os` is already imported at module level).
        import tempfile

        from app.tasks.celery_tasks.document_tasks import process_file_upload_task

        for file in files:
            temp_path = None
            try:
                # Save file to a temporary location to avoid stream issues.
                # delete=False: the background task takes over the file —
                # presumably it removes it when done (confirm in the task).
                with tempfile.NamedTemporaryFile(
                    delete=False, suffix=os.path.splitext(file.filename)[1]
                ) as temp_file:
                    temp_path = temp_file.name
                    # Write the upload directly while the handle is open,
                    # instead of closing and reopening the path.
                    temp_file.write(await file.read())

                process_file_upload_task.delay(
                    temp_path, file.filename, search_space_id, str(user.id)
                )
            except Exception as e:
                # Best-effort cleanup so a failed stage/dispatch doesn't
                # leak the temp file.
                if temp_path is not None:
                    try:
                        os.remove(temp_path)
                    except OSError:
                        pass
                raise HTTPException(
                    status_code=422,
                    detail=f"Failed to process file {file.filename}: {e!s}",
                ) from e

        await session.commit()
        return {"message": "Files uploaded for processing"}
    except HTTPException:
        raise
    except Exception as e:
        await session.rollback()
        raise HTTPException(
            status_code=500, detail=f"Failed to upload files: {e!s}"
        ) from e
|
2025-03-14 18:53:14 -07:00
|
|
|
|
2025-03-26 20:02:53 -07:00
|
|
|
|
2025-10-31 01:33:01 -07:00
|
|
|
@router.get("/documents", response_model=PaginatedResponse[DocumentRead])
async def read_documents(
    skip: int | None = None,
    page: int | None = None,
    page_size: int = 50,
    search_space_id: int | None = None,
    document_types: str | None = None,
    session: AsyncSession = Depends(get_async_session),
    user: User = Depends(current_active_user),
):
    """
    List documents the user has access to, with optional filtering and pagination.

    Requires DOCUMENTS_READ permission for the search space(s).

    Args:
        skip: Absolute offset from the start; takes precedence over 'page'.
        page: Zero-based page index, used when 'skip' is not provided.
        page_size: Items per page (default 50); -1 returns everything after the offset.
        search_space_id: Restrict results to one search space if given.
        document_types: Comma-separated document types to filter by
            (e.g., "EXTENSION,FILE,SLACK_CONNECTOR").
        session: Database session (injected).
        user: Current authenticated user (injected).

    Returns:
        PaginatedResponse[DocumentRead]: Page of documents visible to the user.

    Notes:
        - Results are scoped to search spaces the user has membership in.
    """
    try:
        from sqlalchemy import func

        if search_space_id is not None:
            # Explicit search space: verify read access first.
            await check_permission(
                session,
                user,
                search_space_id,
                Permission.DOCUMENTS_READ.value,
                "You don't have permission to read documents in this search space",
            )
            space_filter = Document.search_space_id == search_space_id
            query = select(Document).filter(space_filter)
            count_query = (
                select(func.count()).select_from(Document).filter(space_filter)
            )
        else:
            # No space given: scope to every search space the user belongs to.
            member_filter = SearchSpaceMembership.user_id == user.id
            query = (
                select(Document)
                .join(SearchSpace)
                .join(SearchSpaceMembership)
                .filter(member_filter)
            )
            count_query = (
                select(func.count())
                .select_from(Document)
                .join(SearchSpace)
                .join(SearchSpaceMembership)
                .filter(member_filter)
            )

        # Optional type filter parsed from a comma-separated string.
        if document_types is not None and document_types.strip():
            requested_types = [
                t.strip() for t in document_types.split(",") if t.strip()
            ]
            if requested_types:
                query = query.filter(Document.document_type.in_(requested_types))
                count_query = count_query.filter(
                    Document.document_type.in_(requested_types)
                )

        total = (await session.execute(count_query)).scalar() or 0

        # 'skip' wins over 'page' when both are supplied.
        if skip is not None:
            offset = skip
        elif page is not None:
            offset = page * page_size
        else:
            offset = 0

        # page_size == -1 means "no limit": return everything after the offset.
        paged = query.offset(offset)
        if page_size != -1:
            paged = paged.limit(page_size)
        db_documents = (await session.execute(paged)).scalars().all()

        # Shape ORM rows into the API schema.
        api_documents = [
            DocumentRead(
                id=doc.id,
                title=doc.title,
                document_type=doc.document_type,
                document_metadata=doc.document_metadata,
                content=doc.content,
                content_hash=doc.content_hash,
                unique_identifier_hash=doc.unique_identifier_hash,
                created_at=doc.created_at,
                updated_at=doc.updated_at,
                search_space_id=doc.search_space_id,
            )
            for doc in db_documents
        ]

        actual_page = (
            page if page is not None else (offset // page_size if page_size > 0 else 0)
        )
        has_more = (offset + len(api_documents)) < total if page_size > 0 else False

        return PaginatedResponse(
            items=api_documents,
            total=total,
            page=actual_page,
            page_size=page_size,
            has_more=has_more,
        )
    except HTTPException:
        raise
    except Exception as e:
        raise HTTPException(
            status_code=500, detail=f"Failed to fetch documents: {e!s}"
        ) from e
|
2025-05-13 21:13:53 -07:00
|
|
|
|
2025-07-24 14:43:48 -07:00
|
|
|
|
2025-10-31 01:33:01 -07:00
|
|
|
@router.get("/documents/search", response_model=PaginatedResponse[DocumentRead])
async def search_documents(
    title: str,
    skip: int | None = None,
    page: int | None = None,
    page_size: int = 50,
    search_space_id: int | None = None,
    document_types: str | None = None,
    session: AsyncSession = Depends(get_async_session),
    user: User = Depends(current_active_user),
):
    """
    Search documents by title substring, optionally filtered by search space
    and document types.

    Requires DOCUMENTS_READ permission for the search space(s).

    Args:
        title: Case-insensitive substring matched against document titles. Required.
        skip: Absolute offset from the start; takes precedence over 'page'.
        page: Zero-based page index, used when 'skip' is not provided.
        page_size: Items per page; -1 returns everything after the offset. Default: 50.
        search_space_id: Restrict results to one search space if given.
        document_types: Comma-separated document types to filter by
            (e.g., "EXTENSION,FILE,SLACK_CONNECTOR").
        session: Database session (injected).
        user: Current authenticated user (injected).

    Returns:
        PaginatedResponse[DocumentRead]: Matching documents, paginated.

    Notes:
        - Title matching uses ILIKE (case-insensitive).
    """
    try:
        from sqlalchemy import func

        if search_space_id is not None:
            # Explicit search space: verify read access first.
            await check_permission(
                session,
                user,
                search_space_id,
                Permission.DOCUMENTS_READ.value,
                "You don't have permission to read documents in this search space",
            )
            space_filter = Document.search_space_id == search_space_id
            query = select(Document).filter(space_filter)
            count_query = (
                select(func.count()).select_from(Document).filter(space_filter)
            )
        else:
            # No space given: scope to every search space the user belongs to.
            member_filter = SearchSpaceMembership.user_id == user.id
            query = (
                select(Document)
                .join(SearchSpace)
                .join(SearchSpaceMembership)
                .filter(member_filter)
            )
            count_query = (
                select(func.count())
                .select_from(Document)
                .join(SearchSpace)
                .join(SearchSpaceMembership)
                .filter(member_filter)
            )

        # Case-insensitive title substring match on both queries.
        title_filter = Document.title.ilike(f"%{title}%")
        query = query.filter(title_filter)
        count_query = count_query.filter(title_filter)

        # Optional type filter parsed from a comma-separated string.
        if document_types is not None and document_types.strip():
            requested_types = [
                t.strip() for t in document_types.split(",") if t.strip()
            ]
            if requested_types:
                query = query.filter(Document.document_type.in_(requested_types))
                count_query = count_query.filter(
                    Document.document_type.in_(requested_types)
                )

        total = (await session.execute(count_query)).scalar() or 0

        # 'skip' wins over 'page' when both are supplied.
        if skip is not None:
            offset = skip
        elif page is not None:
            offset = page * page_size
        else:
            offset = 0

        # page_size == -1 means "no limit": return everything after the offset.
        paged = query.offset(offset)
        if page_size != -1:
            paged = paged.limit(page_size)
        db_documents = (await session.execute(paged)).scalars().all()

        # Shape ORM rows into the API schema.
        api_documents = [
            DocumentRead(
                id=doc.id,
                title=doc.title,
                document_type=doc.document_type,
                document_metadata=doc.document_metadata,
                content=doc.content,
                content_hash=doc.content_hash,
                unique_identifier_hash=doc.unique_identifier_hash,
                created_at=doc.created_at,
                updated_at=doc.updated_at,
                search_space_id=doc.search_space_id,
            )
            for doc in db_documents
        ]

        actual_page = (
            page if page is not None else (offset // page_size if page_size > 0 else 0)
        )
        has_more = (offset + len(api_documents)) < total if page_size > 0 else False

        return PaginatedResponse(
            items=api_documents,
            total=total,
            page=actual_page,
            page_size=page_size,
            has_more=has_more,
        )
    except HTTPException:
        raise
    except Exception as e:
        raise HTTPException(
            status_code=500, detail=f"Failed to search documents: {e!s}"
        ) from e
|
|
|
|
|
|
|
|
|
|
|
2026-01-17 20:45:10 +05:30
|
|
|
@router.get("/documents/search/titles", response_model=DocumentTitleSearchResponse)
async def search_document_titles(
    search_space_id: int,
    title: str = "",
    page: int = 0,
    page_size: int = 20,
    session: AsyncSession = Depends(get_async_session),
    user: User = Depends(current_active_user),
):
    """
    Lightweight document title search optimized for mention picker (@mentions).

    Returns only id, title, and document_type — no content or metadata.
    Uses pg_trgm fuzzy matching with similarity scoring for typo tolerance;
    results are ranked by trigram similarity.

    Args:
        search_space_id: The search space to search in. Required.
        title: Search query (case-insensitive). If empty or < 2 chars,
            recent documents are returned instead.
        page: Zero-based page index. Default: 0.
        page_size: Number of items per page. Default: 20.
        session: Database session (injected).
        user: Current authenticated user (injected).

    Returns:
        DocumentTitleSearchResponse: Lightweight list with has_more flag
        (no total count is computed).
    """
    from sqlalchemy import desc, func, or_

    try:
        # Caller must be able to read documents in this search space.
        await check_permission(
            session,
            user,
            search_space_id,
            Permission.DOCUMENTS_READ.value,
            "You don't have permission to read documents in this search space",
        )

        # Only the columns the mention picker needs.
        stmt = select(
            Document.id,
            Document.title,
            Document.document_type,
        ).filter(Document.search_space_id == search_space_id)

        needle = title.strip()
        if len(needle) < 2:
            # Query too short for meaningful matching: newest documents first.
            stmt = stmt.order_by(Document.updated_at.desc().nullslast())
        else:
            # Match either by trigram similarity (handles typos) or by plain
            # substring (ILIKE), then rank by similarity with an alphabetical
            # tiebreaker. 0.3 ≈ 30% trigram overlap; lower = fuzzier.
            min_similarity = 0.3
            score = func.similarity(Document.title, needle)
            stmt = stmt.filter(
                or_(
                    score > min_similarity,
                    Document.title.ilike(f"%{needle}%"),
                )
            ).order_by(
                desc(score),
                Document.title,  # Alphabetical tiebreaker
            )

        # Fetch one extra row to learn whether another page exists,
        # avoiding a separate COUNT query.
        start = page * page_size
        fetched = (
            await session.execute(stmt.offset(start).limit(page_size + 1))
        ).all()

        has_more = len(fetched) > page_size
        page_rows = fetched[:page_size]  # Trim the probe row if present

        results = []
        for row in page_rows:
            results.append(
                DocumentTitleRead(
                    id=row.id,
                    title=row.title,
                    document_type=row.document_type,
                )
            )

        return DocumentTitleSearchResponse(
            items=results,
            has_more=has_more,
        )
    except HTTPException:
        raise
    except Exception as e:
        raise HTTPException(
            status_code=500, detail=f"Failed to search document titles: {e!s}"
        ) from e
|
|
|
|
|
|
|
|
|
|
|
2025-10-31 01:33:01 -07:00
|
|
|
@router.get("/documents/type-counts")
async def get_document_type_counts(
    search_space_id: int | None = None,
    session: AsyncSession = Depends(get_async_session),
    user: User = Depends(current_active_user),
):
    """
    Get counts of documents by type for search spaces the user has access to.

    Requires DOCUMENTS_READ permission for the search space(s).

    Args:
        search_space_id: Restrict counts to one search space if given.
        session: Database session (injected).
        user: Current authenticated user (injected).

    Returns:
        Dict mapping document types to their counts.
    """
    try:
        from sqlalchemy import func

        counted = select(Document.document_type, func.count(Document.id))

        if search_space_id is not None:
            # Explicit search space: verify read access first.
            await check_permission(
                session,
                user,
                search_space_id,
                Permission.DOCUMENTS_READ.value,
                "You don't have permission to read documents in this search space",
            )
            query = counted.filter(
                Document.search_space_id == search_space_id
            ).group_by(Document.document_type)
        else:
            # No space given: aggregate over every search space the user belongs to.
            query = (
                counted.join(SearchSpace)
                .join(SearchSpaceMembership)
                .filter(SearchSpaceMembership.user_id == user.id)
                .group_by(Document.document_type)
            )

        rows = (await session.execute(query)).all()
        return dict(rows)
    except HTTPException:
        raise
    except Exception as e:
        raise HTTPException(
            status_code=500, detail=f"Failed to fetch document type counts: {e!s}"
        ) from e
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@router.get("/documents/by-chunk/{chunk_id}", response_model=DocumentWithChunksRead)
async def get_document_by_chunk_id(
    chunk_id: int,
    session: AsyncSession = Depends(get_async_session),
    user: User = Depends(current_active_user),
):
    """
    Retrieve the document owning a chunk, including all of its chunks ordered
    by creation time.

    Requires DOCUMENTS_READ permission for the search space.
    Document and chunk embeddings are excluded from the response schema.
    """
    try:
        # Resolve the chunk first; it points at the owning document.
        chunk_lookup = await session.execute(
            select(Chunk).filter(Chunk.id == chunk_id)
        )
        chunk = chunk_lookup.scalars().first()
        if chunk is None:
            raise HTTPException(
                status_code=404, detail=f"Chunk with id {chunk_id} not found"
            )

        # Load the document with its chunks eagerly to avoid lazy-load I/O.
        doc_lookup = await session.execute(
            select(Document)
            .options(selectinload(Document.chunks))
            .filter(Document.id == chunk.document_id)
        )
        document = doc_lookup.scalars().first()
        if document is None:
            raise HTTPException(
                status_code=404,
                detail="Document not found",
            )

        # Caller must be able to read documents in the owning search space.
        await check_permission(
            session,
            user,
            document.search_space_id,
            Permission.DOCUMENTS_READ.value,
            "You don't have permission to read documents in this search space",
        )

        # Present chunks in creation order.
        ordered_chunks = sorted(document.chunks, key=lambda c: c.created_at)

        return DocumentWithChunksRead(
            id=document.id,
            title=document.title,
            document_type=document.document_type,
            document_metadata=document.document_metadata,
            content=document.content,
            content_hash=document.content_hash,
            unique_identifier_hash=document.unique_identifier_hash,
            created_at=document.created_at,
            updated_at=document.updated_at,
            search_space_id=document.search_space_id,
            chunks=ordered_chunks,
        )
    except HTTPException:
        raise
    except Exception as e:
        raise HTTPException(
            status_code=500, detail=f"Failed to retrieve document: {e!s}"
        ) from e
|
|
|
|
|
|
|
|
|
|
|
2025-08-23 18:48:18 -07:00
|
|
|
@router.get("/documents/{document_id}", response_model=DocumentRead)
async def read_document(
    document_id: int,
    session: AsyncSession = Depends(get_async_session),
    user: User = Depends(current_active_user),
):
    """
    Get a specific document by ID.

    Requires DOCUMENTS_READ permission for the search space.
    """
    try:
        lookup = await session.execute(
            select(Document).filter(Document.id == document_id)
        )
        document = lookup.scalars().first()
        if document is None:
            raise HTTPException(
                status_code=404, detail=f"Document with id {document_id} not found"
            )

        # Caller must be able to read documents in the owning search space.
        await check_permission(
            session,
            user,
            document.search_space_id,
            Permission.DOCUMENTS_READ.value,
            "You don't have permission to read documents in this search space",
        )

        # Shape the ORM row into the API schema.
        return DocumentRead(
            id=document.id,
            title=document.title,
            document_type=document.document_type,
            document_metadata=document.document_metadata,
            content=document.content,
            content_hash=document.content_hash,
            unique_identifier_hash=document.unique_identifier_hash,
            created_at=document.created_at,
            updated_at=document.updated_at,
            search_space_id=document.search_space_id,
        )
    except HTTPException:
        raise
    except Exception as e:
        raise HTTPException(
            status_code=500, detail=f"Failed to fetch document: {e!s}"
        ) from e
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@router.put("/documents/{document_id}", response_model=DocumentRead)
async def update_document(
    document_id: int,
    document_update: DocumentUpdate,
    session: AsyncSession = Depends(get_async_session),
    user: User = Depends(current_active_user),
):
    """
    Update a document.

    Requires DOCUMENTS_UPDATE permission for the search space.
    """
    try:
        lookup = await session.execute(
            select(Document).filter(Document.id == document_id)
        )
        db_document = lookup.scalars().first()
        if db_document is None:
            raise HTTPException(
                status_code=404, detail=f"Document with id {document_id} not found"
            )

        # Caller must be able to update documents in the owning search space.
        await check_permission(
            session,
            user,
            db_document.search_space_id,
            Permission.DOCUMENTS_UPDATE.value,
            "You don't have permission to update documents in this search space",
        )

        # Apply only the fields the client actually supplied.
        for field, new_value in document_update.model_dump(
            exclude_unset=True
        ).items():
            setattr(db_document, field, new_value)

        await session.commit()
        await session.refresh(db_document)

        # Shape the refreshed ORM row into the API schema.
        return DocumentRead(
            id=db_document.id,
            title=db_document.title,
            document_type=db_document.document_type,
            document_metadata=db_document.document_metadata,
            content=db_document.content,
            content_hash=db_document.content_hash,
            unique_identifier_hash=db_document.unique_identifier_hash,
            created_at=db_document.created_at,
            updated_at=db_document.updated_at,
            search_space_id=db_document.search_space_id,
        )
    except HTTPException:
        raise
    except Exception as e:
        await session.rollback()
        raise HTTPException(
            status_code=500, detail=f"Failed to update document: {e!s}"
        ) from e
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@router.delete("/documents/{document_id}", response_model=dict)
async def delete_document(
    document_id: int,
    session: AsyncSession = Depends(get_async_session),
    user: User = Depends(current_active_user),
):
    """
    Delete a document.

    Requires DOCUMENTS_DELETE permission for the search space.
    """
    try:
        lookup = await session.execute(
            select(Document).filter(Document.id == document_id)
        )
        document = lookup.scalars().first()
        if document is None:
            raise HTTPException(
                status_code=404, detail=f"Document with id {document_id} not found"
            )

        # Caller must be able to delete documents in the owning search space.
        await check_permission(
            session,
            user,
            document.search_space_id,
            Permission.DOCUMENTS_DELETE.value,
            "You don't have permission to delete documents in this search space",
        )

        await session.delete(document)
        await session.commit()
        return {"message": "Document deleted successfully"}
    except HTTPException:
        raise
    except Exception as e:
        await session.rollback()
        raise HTTPException(
            status_code=500, detail=f"Failed to delete document: {e!s}"
        ) from e
|