Incremental / large document loading (#659)

Tech spec

BlobStore (trustgraph-flow/trustgraph/librarian/blob_store.py):
- get_stream() - yields document content in chunks for streaming retrieval
- create_multipart_upload() - initializes S3 multipart upload, returns upload_id
- upload_part() - uploads a single part, returns etag
- complete_multipart_upload() - finalizes upload with part etags
- abort_multipart_upload() - cancels and cleans up

Cassandra schema (trustgraph-flow/trustgraph/tables/library.py):
- New upload_session table with 24-hour TTL
- Index on user for listing sessions
- Prepared statements for all operations
- Methods: create_upload_session(), get_upload_session(), update_upload_session_chunk(), delete_upload_session(), list_upload_sessions()

- Schema extended with UploadSession, UploadProgress, and new request/response fields
- Librarian methods: begin_upload, upload_chunk, complete_upload, abort_upload, get_upload_status, list_uploads
- Service routing for all new operations
- Python SDK with transparent chunked upload:
- add_document() auto-switches to chunked for files > 10MB
- Progress callback support (on_progress)
- get_pending_uploads(), get_upload_status(), abort_upload(), resume_upload()
- Document table: Added parent_id and document_type columns with index
- Document schema (knowledge/document.py): Added document_id field for streaming retrieval
- Librarian operations:
- add-child-document for extracted PDF pages
- list-children to get child documents
- stream-document for chunked content retrieval
- Cascade delete removes children when parent is deleted
- list-documents filters children by default
- PDF decoder (decoding/pdf/pdf_decoder.py): Updated to stream large documents from librarian API to temp file
- Librarian service (librarian/service.py): Sends document_id instead of content for large PDFs (>2MB)
- Deprecated tools (load_pdf.py, load_text.py): Added deprecation warnings directing users to tg-add-library-document + tg-start-library-processing

Remove load_pdf and load_text utils

Move chunker/librarian comms to base class

Updating tests
2026-07-02 02:58:10 +02:00 · 2026-03-04 16:57:58 +00:00 · 2026-03-04 16:57:58 +00:00 · a630e143ef
commit a630e143ef
parent a38ca9474f
21 changed files with 3164 additions and 650 deletions
--- a/trustgraph-flow/trustgraph/librarian/blob_store.py
+++ b/trustgraph-flow/trustgraph/librarian/blob_store.py
@ -3,9 +3,12 @@ from .. knowledge import hash
 from .. exceptions import RequestError

 from minio import Minio
+from minio.datatypes import Part
 import time
 import io
 import logging
+from typing import Iterator, List, Tuple
+from uuid import UUID

 # Module logger
 logger = logging.getLogger(__name__)
@ -78,3 +81,141 @@ class BlobStore:

        return resp.read()

+    def get_stream(self, object_id, chunk_size: int = 1024 * 1024) -> Iterator[bytes]:
+        """
+        Yield the stored document piece by piece.
+
+        The object is fetched from the bucket under the key
+        "doc/<object_id>" and read chunk_size bytes at a time, so the
+        caller never has to hold the whole document in memory.
+
+        Args:
+            object_id: Identifier of the stored object; its string form is
+                appended to "doc/" to build the object key
+            chunk_size: Number of bytes requested per read (default 1MB)
+
+        Yields:
+            Successive non-empty chunks of the document content
+        """
+        object_name = "doc/" + str(object_id)
+        resp = self.client.get_object(
+            bucket_name=self.bucket_name,
+            object_name=object_name,
+        )
+
+        try:
+            # An empty read marks the end of the object
+            while chunk := resp.read(chunk_size):
+                yield chunk
+        finally:
+            # Executed on exhaustion, on error, and when the consumer
+            # closes the generator before the end of the object
+            resp.close()
+            resp.release_conn()
+
+        logger.debug("Stream complete")
+
+    def create_multipart_upload(self, object_id: UUID, kind: str) -> str:
+        """
+        Start a multipart upload for a new document object.
+
+        Args:
+            object_id: The UUID for the new object; the object key is
+                "doc/" followed by its string form
+            kind: MIME type of the document, sent as the Content-Type header
+
+        Returns:
+            The upload_id identifying this multipart upload session
+        """
+        # NOTE: this goes through a private (underscore-prefixed) method of
+        # the minio client
+        upload_id = self.client._create_multipart_upload(
+            bucket_name=self.bucket_name,
+            object_name=f"doc/{object_id}",
+            headers={"Content-Type": kind},
+        )
+        logger.info(f"Created multipart upload {upload_id} for {object_id}")
+
+        return upload_id
+
+    def upload_part(
+        self,
+        object_id: UUID,
+        upload_id: str,
+        part_number: int,
+        data: bytes
+    ) -> str:
+        """
+        Send one part of an ongoing multipart upload to the object store.
+
+        Args:
+            object_id: The UUID of the object being uploaded
+            upload_id: The upload_id returned by create_multipart_upload
+            part_number: Part number (1-indexed, as per S3 spec)
+            data: The bytes making up this part
+
+        Returns:
+            The ETag for this part (needed for complete_multipart_upload)
+        """
+        # Content-Length is passed explicitly as the byte length of the part
+        part_headers = {"Content-Length": str(len(data))}
+
+        # NOTE: this goes through a private (underscore-prefixed) method of
+        # the minio client
+        etag = self.client._upload_part(
+            bucket_name=self.bucket_name,
+            object_name=f"doc/{object_id}",
+            data=data,
+            headers=part_headers,
+            upload_id=upload_id,
+            part_number=part_number,
+        )
+        logger.debug(f"Uploaded part {part_number} for {object_id}, etag={etag}")
+
+        return etag
+
+    def complete_multipart_upload(
+        self,
+        object_id: UUID,
+        upload_id: str,
+        parts: List[Tuple[int, str]]
+    ) -> None:
+        """
+        Finish a multipart upload so the parts become the final object.
+
+        The parts are assembled by the object store itself; only the list
+        of part numbers and ETags is sent from here.
+
+        Args:
+            object_id: The UUID of the object
+            upload_id: The upload_id returned by create_multipart_upload
+            parts: List of (part_number, etag) tuples in order
+        """
+        # The minio client expects Part objects rather than plain tuples
+        part_objects = []
+        for part_number, etag in parts:
+            part_objects.append(Part(part_number, etag))
+
+        # NOTE: this goes through a private (underscore-prefixed) method of
+        # the minio client
+        self.client._complete_multipart_upload(
+            bucket_name=self.bucket_name,
+            object_name=f"doc/{object_id}",
+            upload_id=upload_id,
+            parts=part_objects,
+        )
+
+        logger.info(f"Completed multipart upload for {object_id}")
+
+    def abort_multipart_upload(self, object_id: UUID, upload_id: str) -> None:
+        """
+        Cancel a multipart upload so its already-uploaded parts are dropped.
+
+        Args:
+            object_id: The UUID of the object
+            upload_id: The upload_id returned by create_multipart_upload
+        """
+        # NOTE: this goes through a private (underscore-prefixed) method of
+        # the minio client
+        self.client._abort_multipart_upload(
+            bucket_name=self.bucket_name,
+            object_name=f"doc/{object_id}",
+            upload_id=upload_id,
+        )
+
+        logger.info(f"Aborted multipart upload {upload_id} for {object_id}")
+
--- a/trustgraph-flow/trustgraph/librarian/librarian.py
+++ b/trustgraph-flow/trustgraph/librarian/librarian.py
@ -1,17 +1,24 @@

 from .. schema import LibrarianRequest, LibrarianResponse, Error, Triple
+from .. schema import UploadSession
 from .. knowledge import hash
 from .. exceptions import RequestError
 from .. tables.library import LibraryTableStore
 from . blob_store import BlobStore
 import base64
+import json
 import logging
+import math
+import time

 import uuid

 # Module logger
 logger = logging.getLogger(__name__)

+# Default chunk size for multipart uploads (5MB - S3 minimum)
+DEFAULT_CHUNK_SIZE = 5 * 1024 * 1024
+
 class Librarian:

    def __init__(
@ -66,13 +73,7 @@ class Librarian:

        logger.debug("Add complete")

-        return LibrarianResponse(
-            error = None,
-            document_metadata = None,
-            content = None,
-            document_metadatas = None,
-            processing_metadatas = None,
-        )
+        return LibrarianResponse()

    async def remove_document(self, request):

@ -84,6 +85,21 @@ class Librarian:
        ):
            raise RuntimeError("Document does not exist")

+        # First, cascade delete all child documents
+        children = await self.table_store.list_children(request.document_id)
+        for child in children:
+            logger.debug(f"Cascade deleting child document {child.id}")
+            try:
+                child_object_id = await self.table_store.get_document_object_id(
+                    child.user,
+                    child.id
+                )
+                await self.blob_store.remove(child_object_id)
+                await self.table_store.remove_document(child.user, child.id)
+            except Exception as e:
+                logger.warning(f"Failed to delete child document {child.id}: {e}")
+
+        # Now remove the parent document
        object_id = await self.table_store.get_document_object_id(
            request.user,
            request.document_id
@ -100,13 +116,7 @@ class Librarian:

        logger.debug("Remove complete")

-        return LibrarianResponse(
-            error = None,
-            document_metadata = None,
-            content = None,
-            document_metadatas = None,
-            processing_metadatas = None,
-        )
+        return LibrarianResponse()

    async def update_document(self, request):

@ -124,13 +134,7 @@ class Librarian:

        logger.debug("Update complete")

-        return LibrarianResponse(
-            error = None,
-            document_metadata = None,
-            content = None,
-            document_metadatas = None,
-            processing_metadatas = None,
-        )
+        return LibrarianResponse()

    async def get_document_metadata(self, request):

@ -147,8 +151,6 @@ class Librarian:
            error = None,
            document_metadata = doc,
            content = None,
-            document_metadatas = None,
-            processing_metadatas = None,
        )

    async def get_document_content(self, request):
@ -170,8 +172,6 @@ class Librarian:
            error = None,
            document_metadata = None,
            content = base64.b64encode(content),
-            document_metadatas = None,
-            processing_metadatas = None,
        )

    async def add_processing(self, request):
@ -217,13 +217,7 @@ class Librarian:

        logger.debug("Add complete")

-        return LibrarianResponse(
-            error = None,
-            document_metadata = None,
-            content = None,
-            document_metadatas = None,
-            processing_metadatas = None,
-        )
+        return LibrarianResponse()

    async def remove_processing(self, request):

@ -243,24 +237,22 @@ class Librarian:

        logger.debug("Remove complete")

-        return LibrarianResponse(
-            error = None,
-            document_metadata = None,
-            content = None,
-            document_metadatas = None,
-            processing_metadatas = None,
-        )
+        return LibrarianResponse()

    async def list_documents(self, request):

        docs = await self.table_store.list_documents(request.user)

+        # Filter out child documents by default unless include_children is True
+        include_children = getattr(request, 'include_children', False)
+        if not include_children:
+            docs = [
+                doc for doc in docs
+                if not doc.parent_id  # Only include top-level documents
+            ]
+
        return LibrarianResponse(
-            error = None,
-            document_metadata = None,
-            content = None,
            document_metadatas = docs,
-            processing_metadatas = None,
        )

    async def list_processing(self, request):
@ -268,10 +260,438 @@ class Librarian:
        procs = await self.table_store.list_processing(request.user)

        return LibrarianResponse(
-            error = None,
-            document_metadata = None,
-            content = None,
-            document_metadatas = None,
            processing_metadatas = procs,
        )

+    # Chunked upload operations
+
+    async def begin_upload(self, request):
+        """
+        Initialize a chunked upload session.
+
+        Creates an S3 multipart upload and stores session state in Cassandra.
+        If the session cannot be stored, the multipart upload is aborted
+        again, because without a session nothing could later complete or
+        abort it.
+
+        Args:
+            request: Request carrying document_metadata (id, time, kind,
+                title, comments, user, tags), total_size and chunk_size
+
+        Returns:
+            LibrarianResponse with upload_id, the effective chunk_size and
+            total_chunks
+
+        Raises:
+            RequestError: If the document kind is not supported, the
+                document already exists, or total_size is not positive
+        """
+        logger.info(f"Beginning chunked upload for document {request.document_metadata.id}")
+
+        if request.document_metadata.kind not in ("text/plain", "application/pdf"):
+            raise RequestError(
+                "Invalid document kind: " + request.document_metadata.kind
+            )
+
+        if await self.table_store.document_exists(
+                request.document_metadata.user,
+                request.document_metadata.id
+        ):
+            raise RequestError("Document already exists")
+
+        # Validate sizes
+        total_size = request.total_size
+        if total_size <= 0:
+            raise RequestError("total_size must be positive")
+
+        # Use the requested chunk size, but never less than the default
+        # (5MB - S3 minimum); unset or non-positive values fall back to it
+        chunk_size = max(request.chunk_size, DEFAULT_CHUNK_SIZE)
+
+        # Calculate total chunks (the last chunk may be shorter)
+        total_chunks = math.ceil(total_size / chunk_size)
+
+        # Generate IDs
+        upload_id = str(uuid.uuid4())
+        object_id = uuid.uuid4()
+
+        # Create S3 multipart upload
+        s3_upload_id = self.blob_store.create_multipart_upload(
+            object_id, request.document_metadata.kind
+        )
+
+        # Serialize document metadata for storage
+        doc_meta_json = json.dumps({
+            "id": request.document_metadata.id,
+            "time": request.document_metadata.time,
+            "kind": request.document_metadata.kind,
+            "title": request.document_metadata.title,
+            "comments": request.document_metadata.comments,
+            "user": request.document_metadata.user,
+            "tags": request.document_metadata.tags,
+        })
+
+        # Store session in Cassandra
+        try:
+            await self.table_store.create_upload_session(
+                upload_id=upload_id,
+                user=request.document_metadata.user,
+                document_id=request.document_metadata.id,
+                document_metadata=doc_meta_json,
+                s3_upload_id=s3_upload_id,
+                object_id=object_id,
+                total_size=total_size,
+                chunk_size=chunk_size,
+                total_chunks=total_chunks,
+            )
+        except Exception:
+            # No session was recorded, so the multipart upload would be
+            # unreachable; clean it up on a best-effort basis and re-raise
+            # the original failure
+            try:
+                self.blob_store.abort_multipart_upload(
+                    object_id=object_id,
+                    upload_id=s3_upload_id,
+                )
+            except Exception as e:
+                logger.warning(
+                    f"Failed to abort multipart upload {s3_upload_id}: {e}"
+                )
+            raise
+
+        logger.info(f"Created upload session {upload_id} with {total_chunks} chunks")
+
+        return LibrarianResponse(
+            error=None,
+            upload_id=upload_id,
+            chunk_size=chunk_size,
+            total_chunks=total_chunks,
+        )
+
+    async def upload_chunk(self, request):
+        """
+        Upload a single chunk of a document.
+
+        Forwards the chunk to S3 and updates session state.
+
+        Args:
+            request: Request carrying upload_id, user, chunk_index
+                (0-indexed) and the base64-encoded chunk content
+
+        Returns:
+            LibrarianResponse with the progress of the upload: number of
+            distinct chunks received so far (including this one), total
+            chunks, and bytes received / total bytes
+
+        Raises:
+            RequestError: If the session does not exist, belongs to another
+                user, or chunk_index is out of range
+        """
+        logger.debug(f"Uploading chunk {request.chunk_index} for upload {request.upload_id}")
+
+        # Get session
+        session = await self.table_store.get_upload_session(request.upload_id)
+        if session is None:
+            raise RequestError("Upload session not found or expired")
+
+        # Validate ownership
+        if session["user"] != request.user:
+            raise RequestError("Not authorized to upload to this session")
+
+        # Validate chunk index
+        if request.chunk_index < 0 or request.chunk_index >= session["total_chunks"]:
+            raise RequestError(
+                f"Invalid chunk index {request.chunk_index}, "
+                f"must be 0-{session['total_chunks']-1}"
+            )
+
+        # Decode content
+        content = base64.b64decode(request.content)
+
+        # Upload to S3 (part numbers are 1-indexed in S3)
+        part_number = request.chunk_index + 1
+        etag = self.blob_store.upload_part(
+            object_id=session["object_id"],
+            upload_id=session["s3_upload_id"],
+            part_number=part_number,
+            data=content,
+        )
+
+        # Update session with chunk info
+        await self.table_store.update_upload_session_chunk(
+            upload_id=request.upload_id,
+            chunk_index=request.chunk_index,
+            etag=etag,
+        )
+
+        # Calculate progress. The session was read before this chunk was
+        # recorded, so add the current index to the set of known chunk
+        # indices; a re-sent chunk is already in the set and is therefore
+        # counted only once.
+        received_indices = set(session["chunks_received"] or ())
+        received_indices.add(request.chunk_index)
+        num_chunks_received = len(received_indices)
+
+        # Estimate from the chunk count; capped because the last chunk is
+        # potentially smaller than chunk_size
+        bytes_received = min(
+            num_chunks_received * session["chunk_size"],
+            session["total_size"],
+        )
+
+        logger.debug(f"Chunk {request.chunk_index} uploaded, {num_chunks_received}/{session['total_chunks']} complete")
+
+        return LibrarianResponse(
+            error=None,
+            upload_id=request.upload_id,
+            chunk_index=request.chunk_index,
+            chunks_received=num_chunks_received,
+            total_chunks=session["total_chunks"],
+            bytes_received=bytes_received,
+            total_bytes=session["total_size"],
+        )
+
+    async def complete_upload(self, request):
+        """
+        Turn a fully uploaded chunk session into a library document.
+
+        Once every chunk of the session is present, the S3 multipart upload
+        is completed, the document metadata saved at begin_upload time is
+        written to the document table, and the session is deleted.
+
+        Raises:
+            RequestError: If the session does not exist, belongs to another
+                user, or chunks are still missing
+        """
+        upload_id = request.upload_id
+        logger.info(f"Completing upload {upload_id}")
+
+        # Look up the session
+        session = await self.table_store.get_upload_session(upload_id)
+        if session is None:
+            raise RequestError("Upload session not found or expired")
+
+        # Only the owner may complete the upload
+        if session["user"] != request.user:
+            raise RequestError("Not authorized to complete this upload")
+
+        # Every chunk must have arrived; report up to ten missing indices
+        chunks_received = session["chunks_received"]
+        total_chunks = session["total_chunks"]
+        if len(chunks_received) != total_chunks:
+            missing = [
+                i for i in range(total_chunks)
+                if i not in chunks_received
+            ]
+            raise RequestError(
+                f"Missing chunks: {missing[:10]}{'...' if len(missing) > 10 else ''}"
+            )
+
+        # Parts list for S3 in ascending order; S3 part numbers are
+        # 1-indexed while chunk indices start at 0
+        parts = []
+        for chunk_index, etag in sorted(chunks_received.items()):
+            parts.append((chunk_index + 1, etag))
+
+        object_id = session["object_id"]
+
+        # Complete S3 multipart upload
+        self.blob_store.complete_multipart_upload(
+            object_id=object_id,
+            upload_id=session["s3_upload_id"],
+            parts=parts,
+        )
+
+        # Restore the document metadata stored as JSON in the session
+        stored = json.loads(session["document_metadata"])
+
+        from .. schema import DocumentMetadata
+        doc_metadata = DocumentMetadata(
+            id=stored["id"],
+            time=stored.get("time", int(time.time())),
+            kind=stored["kind"],
+            title=stored.get("title", ""),
+            comments=stored.get("comments", ""),
+            user=stored["user"],
+            tags=stored.get("tags", []),
+            metadata=[],  # Triples not supported in chunked upload yet
+        )
+
+        # Register the document, then drop the now-finished session
+        await self.table_store.add_document(doc_metadata, object_id)
+        await self.table_store.delete_upload_session(upload_id)
+
+        logger.info(f"Upload {upload_id} completed, document {doc_metadata.id} created")
+
+        return LibrarianResponse(
+            error=None,
+            document_id=doc_metadata.id,
+            object_id=str(object_id),
+        )
+
+    async def abort_upload(self, request):
+        """
+        Abandon a chunked upload: abort the S3 multipart upload and delete
+        the session.
+
+        Raises:
+            RequestError: If the session does not exist or belongs to
+                another user
+        """
+        upload_id = request.upload_id
+        logger.info(f"Aborting upload {upload_id}")
+
+        # Look up the session
+        session = await self.table_store.get_upload_session(upload_id)
+        if session is None:
+            raise RequestError("Upload session not found or expired")
+
+        # Only the owner may abort the upload
+        if session["user"] != request.user:
+            raise RequestError("Not authorized to abort this upload")
+
+        # Object store first, then the session record in Cassandra
+        self.blob_store.abort_multipart_upload(
+            object_id=session["object_id"],
+            upload_id=session["s3_upload_id"],
+        )
+        await self.table_store.delete_upload_session(upload_id)
+
+        logger.info(f"Upload {upload_id} aborted")
+
+        return LibrarianResponse(error=None)
+
+    async def get_upload_status(self, request):
+        """
+        Report how far a chunked upload has progressed.
+
+        A session that cannot be found is reported with upload_state
+        "expired" rather than as an error; otherwise the response lists the
+        received and missing chunk indices together with byte counts.
+
+        Raises:
+            RequestError: If the session belongs to another user
+        """
+        upload_id = request.upload_id
+        logger.debug(f"Getting status for upload {upload_id}")
+
+        # Look up the session
+        session = await self.table_store.get_upload_session(upload_id)
+        if session is None:
+            return LibrarianResponse(
+                error=None,
+                upload_id=upload_id,
+                upload_state="expired",
+            )
+
+        # Only the owner may inspect the upload
+        if session["user"] != request.user:
+            raise RequestError("Not authorized to view this upload")
+
+        chunks_received = session["chunks_received"]
+        total_chunks = session["total_chunks"]
+        total_size = session["total_size"]
+
+        received_list = sorted(chunks_received.keys())
+        missing_list = [
+            index for index in range(total_chunks)
+            if index not in chunks_received
+        ]
+
+        # Chunk count times chunk size, capped at the declared total size
+        bytes_received = min(
+            len(chunks_received) * session["chunk_size"],
+            total_size,
+        )
+
+        return LibrarianResponse(
+            error=None,
+            upload_id=upload_id,
+            upload_state="in-progress",
+            received_chunks=received_list,
+            missing_chunks=missing_list,
+            chunks_received=len(chunks_received),
+            total_chunks=total_chunks,
+            bytes_received=bytes_received,
+            total_bytes=total_size,
+        )
+
+    async def list_uploads(self, request):
+        """
+        Return the upload sessions stored for the requesting user.
+
+        Each session record from the table store is converted into an
+        UploadSession schema object.
+        """
+        logger.debug(f"Listing uploads for user {request.user}")
+
+        sessions = await self.table_store.list_upload_sessions(request.user)
+
+        upload_sessions = []
+        for s in sessions:
+            upload_sessions.append(
+                UploadSession(
+                    upload_id=s["upload_id"],
+                    document_id=s["document_id"],
+                    # Optional fields fall back to an empty string
+                    document_metadata_json=s.get("document_metadata", ""),
+                    total_size=s["total_size"],
+                    chunk_size=s["chunk_size"],
+                    total_chunks=s["total_chunks"],
+                    chunks_received=s["chunks_received"],
+                    created_at=str(s.get("created_at", "")),
+                )
+            )
+
+        return LibrarianResponse(
+            error=None,
+            upload_sessions=upload_sessions,
+        )
+
+    # Child document operations
+
+    async def add_child_document(self, request):
+        """
+        Store a document that is derived from an existing parent document.
+
+        Child documents are typically extracted content (e.g., pages from a
+        PDF). The request metadata must name the parent via parent_id; its
+        document_type is overwritten with "extracted" before it is stored.
+
+        Raises:
+            RequestError: If parent_id is missing, the parent does not
+                exist, or a document with the same id already exists
+        """
+        meta = request.document_metadata
+
+        logger.info(f"Adding child document {meta.id} "
+                   f"for parent {meta.parent_id}")
+
+        if not meta.parent_id:
+            raise RequestError("parent_id is required for child documents")
+
+        # The parent has to exist for the same user
+        parent_exists = await self.table_store.document_exists(
+            meta.user, meta.parent_id
+        )
+        if not parent_exists:
+            raise RequestError(
+                f"Parent document {meta.parent_id} does not exist"
+            )
+
+        # The child id must not be taken yet
+        if await self.table_store.document_exists(meta.user, meta.id):
+            raise RequestError("Document already exists")
+
+        # Children are always marked as extracted content
+        meta.document_type = "extracted"
+
+        # Fresh object ID for the blob
+        object_id = uuid.uuid4()
+
+        logger.debug("Adding blob...")
+
+        await self.blob_store.add(
+            object_id, base64.b64decode(request.content), meta.kind
+        )
+
+        logger.debug("Adding to table...")
+
+        await self.table_store.add_document(meta, object_id)
+
+        logger.debug("Add child document complete")
+
+        return LibrarianResponse(
+            error=None,
+            document_id=meta.id,
+        )
+
+    async def list_children(self, request):
+        """
+        Return the child documents recorded for the parent document named
+        by request.document_id.
+        """
+        parent_id = request.document_id
+        logger.debug(f"Listing children for parent {parent_id}")
+
+        children = await self.table_store.list_children(parent_id)
+
+        return LibrarianResponse(error=None, document_metadatas=children)
+
+    async def stream_document(self, request):
+        """
+        Stream document content in chunks.
+
+        This operation returns document content in smaller chunks, allowing
+        memory-efficient processing of large documents on the client side.
+        The response includes chunk information for reassembly.
+
+        Note: This operation returns a single chunk at a time. Clients should
+        call repeatedly with increasing chunk_index until all chunks are received.
+
+        Args:
+            request: Request carrying user, document_id, chunk_index
+                (0-indexed) and chunk_size (non-positive selects 1MB)
+
+        Returns:
+            LibrarianResponse with the base64-encoded chunk, its index, the
+            total number of chunks, the end offset of this chunk
+            (bytes_received) and the document size (total_bytes)
+
+        Raises:
+            RequestError: If chunk_index is negative or not below the
+                number of chunks in the document
+        """
+        logger.debug(f"Streaming document {request.document_id}, chunk {request.chunk_index}")
+
+        object_id = await self.table_store.get_document_object_id(
+            request.user,
+            request.document_id
+        )
+
+        # Default chunk size of 1MB
+        chunk_size = request.chunk_size if request.chunk_size > 0 else 1024 * 1024
+
+        # Get the full content and slice out the requested chunk
+        # Note: This is a simple implementation. For true streaming, we'd need
+        # range requests on the object storage.
+        content = await self.blob_store.get(object_id)
+        total_size = len(content)
+        total_chunks = math.ceil(total_size / chunk_size)
+
+        # A negative index would turn into a negative slice offset and
+        # silently return the wrong bytes, so reject it like an index past
+        # the end
+        if request.chunk_index < 0 or request.chunk_index >= total_chunks:
+            raise RequestError(
+                f"Invalid chunk index {request.chunk_index}, "
+                f"document has {total_chunks} chunks"
+            )
+
+        start = request.chunk_index * chunk_size
+        end = min(start + chunk_size, total_size)
+        chunk_content = content[start:end]
+
+        logger.debug(f"Returning chunk {request.chunk_index}/{total_chunks}, "
+                    f"bytes {start}-{end} of {total_size}")
+
+        return LibrarianResponse(
+            error=None,
+            content=base64.b64encode(chunk_content),
+            chunk_index=request.chunk_index,
+            chunks_received=1,  # Using as "current chunk" indicator
+            total_chunks=total_chunks,
+            bytes_received=end,
+            total_bytes=total_size,
+        )
+
--- a/trustgraph-flow/trustgraph/librarian/service.py
+++ b/trustgraph-flow/trustgraph/librarian/service.py
@ -271,6 +271,9 @@ class Processor(AsyncProcessor):

        pass

+    # Threshold for sending document_id instead of inline content (2MB)
+    STREAMING_THRESHOLD = 2 * 1024 * 1024
+
    async def load_document(self, document, processing, content):

        logger.debug("Ready for document processing...")
@ -292,26 +295,57 @@ class Processor(AsyncProcessor):
        q = flow["interfaces"][kind]

        if kind == "text-load":
-            doc = TextDocument(
-                metadata = Metadata(
-                    id = document.id,
-                    metadata = document.metadata,
-                    user = processing.user,
-                    collection = processing.collection
-                ),
-                text = content,
-            )
+            # For large text documents, send document_id for streaming retrieval
+            if len(content) >= self.STREAMING_THRESHOLD:
+                logger.info(f"Text document {document.id} is large ({len(content)} bytes), "
+                           f"sending document_id for streaming retrieval")
+                doc = TextDocument(
+                    metadata = Metadata(
+                        id = document.id,
+                        metadata = document.metadata,
+                        user = processing.user,
+                        collection = processing.collection
+                    ),
+                    document_id = document.id,
+                    text = b"",  # Empty, receiver will fetch via librarian
+                )
+            else:
+                doc = TextDocument(
+                    metadata = Metadata(
+                        id = document.id,
+                        metadata = document.metadata,
+                        user = processing.user,
+                        collection = processing.collection
+                    ),
+                    text = content,
+                )
            schema = TextDocument
        else:
-            doc = Document(
-                metadata = Metadata(
-                    id = document.id,
-                    metadata = document.metadata,
-                    user = processing.user,
-                    collection = processing.collection
-                ),
-                data = base64.b64encode(content).decode("utf-8")
-            )
+            # For large PDF documents, send document_id for streaming retrieval
+            # instead of embedding the entire content in the message
+            if len(content) >= self.STREAMING_THRESHOLD:
+                logger.info(f"Document {document.id} is large ({len(content)} bytes), "
+                           f"sending document_id for streaming retrieval")
+                doc = Document(
+                    metadata = Metadata(
+                        id = document.id,
+                        metadata = document.metadata,
+                        user = processing.user,
+                        collection = processing.collection
+                    ),
+                    document_id = document.id,
+                    data = b"",  # Empty data, receiver will fetch via API
+                )
+            else:
+                doc = Document(
+                    metadata = Metadata(
+                        id = document.id,
+                        metadata = document.metadata,
+                        user = processing.user,
+                        collection = processing.collection
+                    ),
+                    data = base64.b64encode(content).decode("utf-8")
+                )
            schema = Document

        logger.debug(f"Submitting to queue {q}...")
@ -361,6 +395,17 @@ class Processor(AsyncProcessor):
            "remove-processing": self.librarian.remove_processing,
            "list-documents": self.librarian.list_documents,
            "list-processing": self.librarian.list_processing,
+            # Chunked upload operations
+            "begin-upload": self.librarian.begin_upload,
+            "upload-chunk": self.librarian.upload_chunk,
+            "complete-upload": self.librarian.complete_upload,
+            "abort-upload": self.librarian.abort_upload,
+            "get-upload-status": self.librarian.get_upload_status,
+            "list-uploads": self.librarian.list_uploads,
+            # Child document and streaming operations
+            "add-child-document": self.librarian.add_child_document,
+            "list-children": self.librarian.list_children,
+            "stream-document": self.librarian.stream_document,
        }

        if v.operation not in impls: