Incremental / large document loading (#659)

Tech spec

BlobStore (trustgraph-flow/trustgraph/librarian/blob_store.py):
- get_stream() - yields document content in chunks for streaming retrieval
- create_multipart_upload() - initializes S3 multipart upload, returns upload_id
- upload_part() - uploads a single part, returns etag
- complete_multipart_upload() - finalizes upload with part etags
- abort_multipart_upload() - cancels and cleans up

Cassandra schema (trustgraph-flow/trustgraph/tables/library.py):
- New upload_session table with 24-hour TTL
- Index on user for listing sessions
- Prepared statements for all operations
- Methods: create_upload_session(), get_upload_session(), update_upload_session_chunk(), delete_upload_session(), list_upload_sessions()

Other changes:
- Schema extended with UploadSession, UploadProgress, and new request/response fields
- Librarian methods: begin_upload, upload_chunk, complete_upload, abort_upload, get_upload_status, list_uploads
- Service routing for all new operations
- Python SDK with transparent chunked upload:
  - add_document() auto-switches to chunked for files > 10MB
  - Progress callback support (on_progress)
  - get_pending_uploads(), get_upload_status(), abort_upload(), resume_upload()
- Document table: Added parent_id and document_type columns with index
- Document schema (knowledge/document.py): Added document_id field for streaming retrieval
- Librarian operations:
  - add-child-document for extracted PDF pages
  - list-children to get child documents
  - stream-document for chunked content retrieval
  - Cascade delete removes children when parent is deleted
  - list-documents filters children by default
- PDF decoder (decoding/pdf/pdf_decoder.py): Updated to stream large documents from librarian API to temp file
- Librarian service (librarian/service.py): Sends document_id instead of content for large PDFs (>2MB)
- Deprecated tools (load_pdf.py, load_text.py): Added deprecation warnings directing users to tg-add-library-document + tg-start-library-processing

Remove load_pdf and load_text utils

Move chunker/librarian comms to base class

Updating tests
2026-05-18 20:05:13 +02:00 · 2026-03-04 16:57:58 +00:00 · 2026-03-04 16:57:58 +00:00 · a630e143ef
commit a630e143ef
parent a38ca9474f
21 changed files with 3164 additions and 650 deletions
--- a/trustgraph-base/trustgraph/schema/services/library.py
+++ b/trustgraph-base/trustgraph/schema/services/library.py
@@ -49,6 +49,36 @@ from ..core.metadata import Metadata
 #   <- (processing_metadata[])
 #   <- (error)

+# begin-upload
+#   -> (document_metadata, total_size, chunk_size)
+#   <- (upload_id, chunk_size, total_chunks)
+#   <- (error)
+
+# upload-chunk
+#   -> (upload_id, chunk_index, content)
+#   <- (upload_id, chunk_index, chunks_received, total_chunks, bytes_received, total_bytes)
+#   <- (error)
+
+# complete-upload
+#   -> (upload_id)
+#   <- (document_id, object_id)
+#   <- (error)
+
+# abort-upload
+#   -> (upload_id)
+#   <- ()
+#   <- (error)
+
+# get-upload-status
+#   -> (upload_id)
+#   <- (upload_id, state, chunks_received, missing_chunks, total_chunks, bytes_received, total_bytes)
+#   <- (error)
+
+# list-uploads
+#   -> (user)
+#   <- (uploads[])
+#   <- (error)
+
@dataclass
 class DocumentMetadata:
    id: str = ""
@@ -59,6 +89,9 @@ class DocumentMetadata:
    metadata: list[Triple] = field(default_factory=list)
    user: str = ""
    tags: list[str] = field(default_factory=list)
+    # Child document support
+    parent_id: str = ""  # Empty for top-level docs, set for children
+    document_type: str = "source"  # "source" or "extracted"

@dataclass
 class ProcessingMetadata:
@@ -76,11 +109,33 @@ class Criteria:
    value: str = ""
    operator: str = ""

+@dataclass
+class UploadProgress:
+    """Progress information for chunked uploads."""
+    upload_id: str = ""  # Same field name as the upload_id returned by begin-upload
+    chunks_received: int = 0  # Chunk counter; presumably chunks accepted so far — confirm
+    total_chunks: int = 0  # Presumably the chunk count reported by begin-upload — confirm
+    bytes_received: int = 0  # Byte counter; presumably bytes accepted so far — confirm
+    total_bytes: int = 0  # Presumably the full upload size in bytes — confirm
+
+@dataclass
+class UploadSession:
+    """Information about an in-progress upload."""
+    upload_id: str = ""  # Same field name as the upload_id returned by begin-upload
+    document_id: str = ""  # Presumably the id of the document being uploaded — confirm
+    document_metadata_json: str = ""  # JSON-encoded DocumentMetadata
+    total_size: int = 0  # Presumably bytes, matching begin-upload's total_size — confirm
+    chunk_size: int = 0  # Presumably bytes per chunk, matching begin-upload — confirm
+    total_chunks: int = 0  # Presumably the expected number of chunks — confirm
+    chunks_received: int = 0  # Presumably chunks accepted so far — confirm
+    created_at: str = ""  # Stored as a string; timestamp format not shown here — confirm
+
@dataclass
 class LibrarianRequest:
    # add-document, remove-document, update-document, get-document-metadata,
    # get-document-content, add-processing, remove-processing, list-documents,
-    # list-processing
+    # list-processing, begin-upload, upload-chunk, complete-upload, abort-upload,
+    # get-upload-status, list-uploads
    operation: str = ""

    # add-document, remove-document, update-document, get-document-metadata,
@@ -90,16 +145,16 @@ class LibrarianRequest:
    # add-processing, remove-processing
    processing_id: str = ""

-    # add-document, update-document
+    # add-document, update-document, begin-upload
    document_metadata: DocumentMetadata | None = None

    # add-processing
    processing_metadata: ProcessingMetadata | None = None

-    # add-document
+    # add-document, upload-chunk
    content: bytes = b""

-    # list-documents, list-processing
+    # list-documents, list-processing, list-uploads
    user: str = ""

    # list-documents?, list-processing?
@@ -108,6 +163,19 @@ class LibrarianRequest:
    #
    criteria: list[Criteria] = field(default_factory=list)

+    # begin-upload
+    total_size: int = 0
+    chunk_size: int = 0
+
+    # upload-chunk, complete-upload, abort-upload, get-upload-status
+    upload_id: str = ""
+
+    # upload-chunk, stream-document
+    chunk_index: int = 0
+
+    # list-documents - whether to include child documents (default False)
+    include_children: bool = False
+
@dataclass
 class LibrarianResponse:
    error: Error | None = None
@@ -116,6 +184,29 @@ class LibrarianResponse:
    document_metadatas: list[DocumentMetadata] = field(default_factory=list)
    processing_metadatas: list[ProcessingMetadata] = field(default_factory=list)

+    # begin-upload response
+    upload_id: str = ""
+    chunk_size: int = 0
+    total_chunks: int = 0
+
+    # upload-chunk response
+    chunk_index: int = 0
+    chunks_received: int = 0
+    bytes_received: int = 0
+    total_bytes: int = 0
+
+    # complete-upload response
+    document_id: str = ""
+    object_id: str = ""
+
+    # get-upload-status response
+    upload_state: str = ""  # "in-progress", "completed", "expired"
+    received_chunks: list[int] = field(default_factory=list)
+    missing_chunks: list[int] = field(default_factory=list)
+
+    # list-uploads response
+    upload_sessions: list[UploadSession] = field(default_factory=list)
+
 # FIXME: Is this right?  Using persistence on librarian so that
 # message chunking works