SurfSense/surfsense_backend/app/schemas/documents.py

from datetime import datetime
from typing import TypeVar
from uuid import UUID

from pydantic import BaseModel, ConfigDict

from app.db import DocumentType

from .chunks import ChunkRead

# NOTE(review): PaginatedResponse declares its own type parameter inline
# (``class PaginatedResponse[T]``), so none of the code visible in this module
# reads this TypeVar - check for external importers before removing it.
T = TypeVar("T")


class ExtensionDocumentMetadata(BaseModel):
    """Metadata attached to one page entry of an extension document.

    Every field is a required string. No aliases are declared here, so the
    PascalCase attribute names double as the payload keys; renaming any of
    them (including the ``Refferer`` spelling) would change the accepted
    input shape.
    """

    # NOTE(review): presumably assigned by the browser extension - confirm.
    BrowsingSessionId: str
    VisitedWebPageURL: str
    VisitedWebPageTitle: str
    # Carried as a plain string; no date parsing or validation happens here.
    VisitedWebPageDateWithTimeInISOString: str
    VisitedWebPageReffererURL: str
    # Typed as str even though the name suggests a millisecond count.
    VisitedWebPageVisitDurationInMilliseconds: str


class ExtensionDocumentContent(BaseModel):
    """One content entry: page text paired with its ``ExtensionDocumentMetadata``."""

    metadata: ExtensionDocumentMetadata
    # The camelCase name is kept on purpose, hence the N815 lint suppression.
    pageContent: str  # noqa: N815


class DocumentBase(BaseModel):
    """Fields shared by the document create and update payloads."""

    document_type: DocumentType
    # Accepted shapes: a list of extension content entries, a list of
    # strings, or a single plain string.
    content: list[ExtensionDocumentContent] | list[str] | str
    search_space_id: int


class DocumentsCreate(DocumentBase):
    """Creation payload; declares no fields beyond those of ``DocumentBase``."""


class DocumentUpdate(DocumentBase):
    """Update payload; declares no fields beyond those of ``DocumentBase``."""


class DocumentStatusSchema(BaseModel):
    """Document processing status.

    ``state`` is a free-form string at the schema level: the values listed
    in the inline comment are not enforced by any validation declared here.
    """

    state: str  # "ready", "processing", "failed"
    # Optional and defaults to None.
    # NOTE(review): presumably an explanation for the current state - confirm.
    reason: str | None = None


class DocumentRead(BaseModel):
    """Read-side schema for a document.

    ``from_attributes`` is enabled, so instances can be validated from
    attribute-bearing objects as well as from mappings.
    """

    model_config = ConfigDict(from_attributes=True)

    id: int
    title: str
    document_type: DocumentType
    document_metadata: dict
    # Plain string, to match what the frontend expects.
    content: str
    content_hash: str
    # Nullable but still required: no default is declared.
    unique_identifier_hash: str | None
    created_at: datetime
    # Nullable but still required: no default is declared.
    updated_at: datetime | None
    search_space_id: int
    # User who created/uploaded this document.
    created_by_id: UUID | None = None
    # Display name or email of the user who created this document.
    created_by_name: str | None = None
    # Processing status (ready, processing, failed).
    status: DocumentStatusSchema | None = None


class DocumentWithChunksRead(DocumentRead):
    """``DocumentRead`` extended with the document's chunks."""

    # Restates the setting that DocumentRead already configures.
    model_config = ConfigDict(from_attributes=True)

    # Optional; an empty list is used when no chunks are supplied.
    chunks: list[ChunkRead] = []


class PaginatedResponse[T](BaseModel):
    """Generic page-of-results envelope, parameterised by the item type ``T``.

    ``T`` is declared inline in the class header. All fields are required.
    """

    items: list[T]
    # NOTE(review): presumably the match count across all pages - confirm.
    total: int
    # NOTE(review): whether numbering is 0- or 1-based is not visible here.
    page: int
    page_size: int
    has_more: bool


class DocumentTitleRead(BaseModel):
    """Minimal document projection for the mention picker.

    Only the essential fields are exposed: id, title and document type.
    """

    model_config = ConfigDict(from_attributes=True)

    id: int
    title: str
    document_type: DocumentType


class DocumentTitleSearchResponse(BaseModel):
    """Response for document title search - optimized for typeahead.

    Carries only the matching titles and a flag; there are no total or page
    counters on this schema.
    """

    items: list[DocumentTitleRead]
    # NOTE(review): presumably True when more matches exist beyond `items`.
    has_more: bool


class DocumentStatusItemRead(BaseModel):
    """Per-document entry of a batch status poll.

    A lightweight payload: identifying fields plus a required ``status``.
    """

    model_config = ConfigDict(from_attributes=True)

    id: int
    title: str
    document_type: DocumentType
    status: DocumentStatusSchema


class DocumentStatusBatchResponse(BaseModel):
    """Batch status response for a set of document IDs.

    Wraps the per-document entries in a single required ``items`` list.
    """

    items: list[DocumentStatusItemRead]