Incremental / large document loading (#659)

Tech spec

BlobStore (trustgraph-flow/trustgraph/librarian/blob_store.py):
- get_stream() - yields document content in chunks for streaming retrieval
- create_multipart_upload() - initializes S3 multipart upload, returns
  upload_id
- upload_part() - uploads a single part, returns etag
- complete_multipart_upload() - finalizes upload with part etags
- abort_multipart_upload() - cancels and cleans up
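
These calls mirror S3's native multipart API one-for-one. A minimal sketch of
the flow, assuming a plain boto3 client and illustrative bucket/key names
(the real BlobStore wraps this behind its own interface):

    import boto3

    s3 = boto3.client("s3")
    BUCKET, KEY = "library", "doc-123"          # hypothetical names
    chunks = [b"part one...", b"part two..."]   # stand-in for streamed input

    # create_multipart_upload() -> upload_id
    upload_id = s3.create_multipart_upload(Bucket=BUCKET, Key=KEY)["UploadId"]

    try:
        # upload_part() -> etag; note S3 requires parts >= 5MB except the last
        parts = []
        for n, chunk in enumerate(chunks, start=1):
            resp = s3.upload_part(Bucket=BUCKET, Key=KEY, UploadId=upload_id,
                                  PartNumber=n, Body=chunk)
            parts.append({"PartNumber": n, "ETag": resp["ETag"]})

        # complete_multipart_upload() stitches the parts into one object
        s3.complete_multipart_upload(Bucket=BUCKET, Key=KEY, UploadId=upload_id,
                                     MultipartUpload={"Parts": parts})
    except Exception:
        # abort_multipart_upload() frees the stored parts on failure
        s3.abort_multipart_upload(Bucket=BUCKET, Key=KEY, UploadId=upload_id)
        raise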

Cassandra schema (trustgraph-flow/trustgraph/tables/library.py):
- New upload_session table with 24-hour TTL
- Index on user for listing sessions
- Prepared statements for all operations
- Methods: create_upload_session(), get_upload_session(),
  update_upload_session_chunk(), delete_upload_session(),
  list_upload_sessions()
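
A sketch of the session bookkeeping using the DataStax cassandra-driver;
column names here are assumptions, not the exact schema in tables/library.py:

    from cassandra.cluster import Cluster

    session = Cluster(["localhost"]).connect("librarian")  # assumed keyspace

    # 24-hour TTL: abandoned sessions expire without a reaper job
    session.execute("""
        CREATE TABLE IF NOT EXISTS upload_session (
            upload_id text PRIMARY KEY,
            user text,
            object_key text,
            parts map<int, text>,   -- part number -> S3 etag
            created timestamp
        ) WITH default_time_to_live = 86400
    """)

    # Secondary index so list_upload_sessions() can filter by user
    session.execute("CREATE INDEX IF NOT EXISTS ON upload_session (user)")

    # Prepared statements parse once, execute many times
    insert = session.prepare(
        "INSERT INTO upload_session (upload_id, user, object_key, created) "
        "VALUES (?, ?, ?, toTimestamp(now()))")
    session.execute(insert, ("u-1", "alice", "doc-123"))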

Chunked upload API:
- Schema extended with UploadSession, UploadProgress, and new
  request/response fields
- Librarian methods: begin_upload, upload_chunk, complete_upload,
  abort_upload, get_upload_status, list_uploads
- Service routing for all new operations
- Python SDK with transparent chunked upload:
  - add_document() auto-switches to chunked for files > 10MB
  - Progress callback support (on_progress)
  - get_pending_uploads(), get_upload_status(), abort_upload(),
    resume_upload()
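
From the caller's side the chunking is invisible. A hypothetical usage
sketch, assuming `librarian` is an SDK client handle; the method names come
from the list above, but the exact signatures are assumptions:

    def on_progress(sent, total):
        print(f"uploaded {sent}/{total} bytes")

    # Under 10MB this is a single request; at 10MB and above the SDK
    # transparently drives begin-upload / upload-chunk / complete-upload.
    with open("big.pdf", "rb") as f:
        librarian.add_document(id="doc-123", document=f.read(),
                               on_progress=on_progress)

    # Sessions survive a crashed client (24-hour TTL), so a later run
    # can inspect and resume them:
    for upload in librarian.get_pending_uploads():
        librarian.resume_upload(upload)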

Child documents and streaming:
- Document table: Added parent_id and document_type columns with index
- Document schema (knowledge/document.py): Added document_id field for
  streaming retrieval
- Librarian operations:
  - add-child-document for extracted PDF pages
  - list-children to get child documents
  - stream-document for chunked content retrieval
  - Cascade delete removes children when parent is deleted
  - list-documents filters children by default
- PDF decoder (decoding/pdf/pdf_decoder.py): Updated to stream large
  documents from the librarian API to a temp file (a sketch follows
  this list)
- Librarian service (librarian/service.py): Sends document_id instead of
  content for large PDFs (>2MB)
- Deprecated tools (load_pdf.py, load_text.py): Added deprecation
  warnings directing users to tg-add-library-document +
  tg-start-library-processing
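
A rough shape of the decoder-side retrieval, assuming stream-document is
surfaced to the decoder as an async chunk iterator (names here are
illustrative):

    import tempfile

    async def fetch_to_tempfile(librarian, document_id):
        # Spool chunks to disk so a multi-hundred-MB PDF never has to
        # sit in memory in one piece
        with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as tmp:
            async for chunk in librarian.stream_document(document_id):
                tmp.write(chunk)
            return tmp.name  # decoder parses from this path, then unlinks it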

Remove load_pdf and load_text utils

Move chunker/librarian comms to base class

Update tests
cybermaggedon, 2026-03-04 16:57:58 +00:00 (committed via GitHub)
parent a38ca9474f
commit a630e143ef
21 changed files with 3164 additions and 650 deletions


@@ -271,6 +271,9 @@ class Processor(AsyncProcessor):
             pass
 
+    # Threshold for sending document_id instead of inline content (2MB)
+    STREAMING_THRESHOLD = 2 * 1024 * 1024
+
     async def load_document(self, document, processing, content):
 
         logger.debug("Ready for document processing...")
@@ -292,26 +295,57 @@ class Processor(AsyncProcessor):
             q = flow["interfaces"][kind]
 
             if kind == "text-load":
 
-                doc = TextDocument(
-                    metadata = Metadata(
-                        id = document.id,
-                        metadata = document.metadata,
-                        user = processing.user,
-                        collection = processing.collection
-                    ),
-                    text = content,
-                )
+                # For large text documents, send document_id for streaming retrieval
+                if len(content) >= self.STREAMING_THRESHOLD:
+                    logger.info(f"Text document {document.id} is large ({len(content)} bytes), "
+                                f"sending document_id for streaming retrieval")
+                    doc = TextDocument(
+                        metadata = Metadata(
+                            id = document.id,
+                            metadata = document.metadata,
+                            user = processing.user,
+                            collection = processing.collection
+                        ),
+                        document_id = document.id,
+                        text = b"",  # Empty, receiver will fetch via librarian
+                    )
+                else:
+                    doc = TextDocument(
+                        metadata = Metadata(
+                            id = document.id,
+                            metadata = document.metadata,
+                            user = processing.user,
+                            collection = processing.collection
+                        ),
+                        text = content,
+                    )
 
                 schema = TextDocument
 
             else:
 
-                doc = Document(
-                    metadata = Metadata(
-                        id = document.id,
-                        metadata = document.metadata,
-                        user = processing.user,
-                        collection = processing.collection
-                    ),
-                    data = base64.b64encode(content).decode("utf-8")
-                )
+                # For large PDF documents, send document_id for streaming retrieval
+                # instead of embedding the entire content in the message
+                if len(content) >= self.STREAMING_THRESHOLD:
+                    logger.info(f"Document {document.id} is large ({len(content)} bytes), "
+                                f"sending document_id for streaming retrieval")
+                    doc = Document(
+                        metadata = Metadata(
+                            id = document.id,
+                            metadata = document.metadata,
+                            user = processing.user,
+                            collection = processing.collection
+                        ),
+                        document_id = document.id,
+                        data = b"",  # Empty data, receiver will fetch via API
+                    )
+                else:
+                    doc = Document(
+                        metadata = Metadata(
+                            id = document.id,
+                            metadata = document.metadata,
+                            user = processing.user,
+                            collection = processing.collection
+                        ),
+                        data = base64.b64encode(content).decode("utf-8")
+                    )
 
                 schema = Document
 
             logger.debug(f"Submitting to queue {q}...")
@@ -361,6 +395,17 @@ class Processor(AsyncProcessor):
             "remove-processing": self.librarian.remove_processing,
             "list-documents": self.librarian.list_documents,
             "list-processing": self.librarian.list_processing,
+            # Chunked upload operations
+            "begin-upload": self.librarian.begin_upload,
+            "upload-chunk": self.librarian.upload_chunk,
+            "complete-upload": self.librarian.complete_upload,
+            "abort-upload": self.librarian.abort_upload,
+            "get-upload-status": self.librarian.get_upload_status,
+            "list-uploads": self.librarian.list_uploads,
+            # Child document and streaming operations
+            "add-child-document": self.librarian.add_child_document,
+            "list-children": self.librarian.list_children,
+            "stream-document": self.librarian.stream_document,
         }
 
         if v.operation not in impls: