mirror of
https://github.com/trustgraph-ai/trustgraph.git
synced 2026-04-26 17:06:22 +02:00
Incremental / large document loading (#659)
Tech spec
BlobStore (trustgraph-flow/trustgraph/librarian/blob_store.py):
- get_stream() - yields document content in chunks for streaming retrieval
- create_multipart_upload() - initializes S3 multipart upload, returns
upload_id
- upload_part() - uploads a single part, returns etag
- complete_multipart_upload() - finalizes upload with part etags
- abort_multipart_upload() - cancels and cleans up
Cassandra schema (trustgraph-flow/trustgraph/tables/library.py):
- New upload_session table with 24-hour TTL
- Index on user for listing sessions
- Prepared statements for all operations
- Methods: create_upload_session(), get_upload_session(),
update_upload_session_chunk(), delete_upload_session(),
list_upload_sessions()
- Schema extended with UploadSession, UploadProgress, and new
request/response fields
- Librarian methods: begin_upload, upload_chunk, complete_upload,
abort_upload, get_upload_status, list_uploads
- Service routing for all new operations
- Python SDK with transparent chunked upload:
- add_document() auto-switches to chunked for files > 10MB
- Progress callback support (on_progress)
- get_pending_uploads(), get_upload_status(), abort_upload(),
resume_upload()
- Document table: Added parent_id and document_type columns with index
- Document schema (knowledge/document.py): Added document_id field for
streaming retrieval
- Librarian operations:
- add-child-document for extracted PDF pages
- list-children to get child documents
- stream-document for chunked content retrieval
- Cascade delete removes children when parent is deleted
- list-documents filters children by default
- PDF decoder (decoding/pdf/pdf_decoder.py): Updated to stream large
documents from librarian API to temp file
- Librarian service (librarian/service.py): Sends document_id instead of
content for large PDFs (>2MB)
- Deprecated tools (load_pdf.py, load_text.py): Added deprecation
warnings directing users to tg-add-library-document +
tg-start-library-processing
Remove load_pdf and load_text utils
Move chunker/librarian comms to base class
Updating tests
This commit is contained in:
parent
a38ca9474f
commit
a630e143ef
21 changed files with 3164 additions and 650 deletions
|
|
@ -6,6 +6,7 @@ including document storage, metadata management, and processing workflow coordin
|
|||
"""
|
||||
|
||||
import datetime
|
||||
import math
|
||||
import time
|
||||
import base64
|
||||
import logging
|
||||
|
|
@ -17,6 +18,13 @@ from . exceptions import *
|
|||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Threshold for switching to chunked upload (2MB)
|
||||
# Lower threshold provides progress feedback and resumability on slower connections
|
||||
CHUNKED_UPLOAD_THRESHOLD = 2 * 1024 * 1024
|
||||
|
||||
# Default chunk size (5MB - S3 multipart minimum)
|
||||
DEFAULT_CHUNK_SIZE = 5 * 1024 * 1024
|
||||
|
||||
|
||||
def to_value(x):
|
||||
"""Convert wire format to Uri or Literal."""
|
||||
|
|
@ -67,13 +75,14 @@ class Library:
|
|||
|
||||
def add_document(
|
||||
self, document, id, metadata, user, title, comments,
|
||||
kind="text/plain", tags=[],
|
||||
kind="text/plain", tags=[], on_progress=None,
|
||||
):
|
||||
"""
|
||||
Add a document to the library.
|
||||
|
||||
Stores a document with associated metadata in the library for
|
||||
retrieval and processing.
|
||||
retrieval and processing. For large documents (> 10MB), automatically
|
||||
uses chunked upload for better reliability and progress tracking.
|
||||
|
||||
Args:
|
||||
document: Document content as bytes
|
||||
|
|
@ -84,6 +93,7 @@ class Library:
|
|||
comments: Document description or comments
|
||||
kind: MIME type of the document (default: "text/plain")
|
||||
tags: List of tags for categorization (default: [])
|
||||
on_progress: Optional callback(bytes_sent, total_bytes) for progress updates
|
||||
|
||||
Returns:
|
||||
dict: Response from the add operation
|
||||
|
|
@ -107,6 +117,22 @@ class Library:
|
|||
kind="application/pdf",
|
||||
tags=["research", "physics"]
|
||||
)
|
||||
|
||||
# Add a large document with progress tracking
|
||||
def progress(sent, total):
|
||||
print(f"Uploaded {sent}/{total} bytes ({100*sent//total}%)")
|
||||
|
||||
with open("large_document.pdf", "rb") as f:
|
||||
library.add_document(
|
||||
document=f.read(),
|
||||
id="large-doc-001",
|
||||
metadata=[],
|
||||
user="trustgraph",
|
||||
title="Large Document",
|
||||
comments="A very large document",
|
||||
kind="application/pdf",
|
||||
on_progress=progress
|
||||
)
|
||||
```
|
||||
"""
|
||||
|
||||
|
|
@ -124,6 +150,21 @@ class Library:
|
|||
if not title: title = ""
|
||||
if not comments: comments = ""
|
||||
|
||||
# Check if we should use chunked upload
|
||||
if len(document) >= CHUNKED_UPLOAD_THRESHOLD:
|
||||
return self._add_document_chunked(
|
||||
document=document,
|
||||
id=id,
|
||||
metadata=metadata,
|
||||
user=user,
|
||||
title=title,
|
||||
comments=comments,
|
||||
kind=kind,
|
||||
tags=tags,
|
||||
on_progress=on_progress,
|
||||
)
|
||||
|
||||
# Small document: use single operation (existing behavior)
|
||||
triples = []
|
||||
|
||||
def emit(t):
|
||||
|
|
@ -167,14 +208,111 @@ class Library:
|
|||
|
||||
return self.request(input)
|
||||
|
||||
def get_documents(self, user):
|
||||
def _add_document_chunked(
|
||||
self, document, id, metadata, user, title, comments,
|
||||
kind, tags, on_progress=None,
|
||||
):
|
||||
"""
|
||||
Add a large document using chunked upload.
|
||||
|
||||
Internal method that handles multipart upload for large documents.
|
||||
"""
|
||||
total_size = len(document)
|
||||
chunk_size = DEFAULT_CHUNK_SIZE
|
||||
|
||||
logger.info(f"Starting chunked upload for document {id} ({total_size} bytes)")
|
||||
|
||||
# Begin upload session
|
||||
begin_request = {
|
||||
"operation": "begin-upload",
|
||||
"document-metadata": {
|
||||
"id": id,
|
||||
"time": int(time.time()),
|
||||
"kind": kind,
|
||||
"title": title,
|
||||
"comments": comments,
|
||||
"user": user,
|
||||
"tags": tags,
|
||||
},
|
||||
"total-size": total_size,
|
||||
"chunk-size": chunk_size,
|
||||
}
|
||||
|
||||
begin_response = self.request(begin_request)
|
||||
|
||||
upload_id = begin_response.get("upload-id")
|
||||
if not upload_id:
|
||||
raise RuntimeError("Failed to begin upload: no upload_id returned")
|
||||
|
||||
actual_chunk_size = begin_response.get("chunk-size", chunk_size)
|
||||
total_chunks = begin_response.get("total-chunks", math.ceil(total_size / actual_chunk_size))
|
||||
|
||||
logger.info(f"Upload session {upload_id} created, {total_chunks} chunks")
|
||||
|
||||
try:
|
||||
# Upload chunks
|
||||
bytes_sent = 0
|
||||
for chunk_index in range(total_chunks):
|
||||
start = chunk_index * actual_chunk_size
|
||||
end = min(start + actual_chunk_size, total_size)
|
||||
chunk_data = document[start:end]
|
||||
|
||||
chunk_request = {
|
||||
"operation": "upload-chunk",
|
||||
"upload-id": upload_id,
|
||||
"chunk-index": chunk_index,
|
||||
"content": base64.b64encode(chunk_data).decode("utf-8"),
|
||||
"user": user,
|
||||
}
|
||||
|
||||
chunk_response = self.request(chunk_request)
|
||||
|
||||
bytes_sent = end
|
||||
|
||||
# Call progress callback if provided
|
||||
if on_progress:
|
||||
on_progress(bytes_sent, total_size)
|
||||
|
||||
logger.debug(f"Chunk {chunk_index + 1}/{total_chunks} uploaded")
|
||||
|
||||
# Complete upload
|
||||
complete_request = {
|
||||
"operation": "complete-upload",
|
||||
"upload-id": upload_id,
|
||||
"user": user,
|
||||
}
|
||||
|
||||
complete_response = self.request(complete_request)
|
||||
|
||||
logger.info(f"Chunked upload completed for document {id}")
|
||||
|
||||
return complete_response
|
||||
|
||||
except Exception as e:
|
||||
# Try to abort on failure
|
||||
logger.error(f"Chunked upload failed: {e}")
|
||||
try:
|
||||
abort_request = {
|
||||
"operation": "abort-upload",
|
||||
"upload-id": upload_id,
|
||||
"user": user,
|
||||
}
|
||||
self.request(abort_request)
|
||||
logger.info(f"Aborted failed upload {upload_id}")
|
||||
except Exception as abort_error:
|
||||
logger.warning(f"Failed to abort upload: {abort_error}")
|
||||
raise
|
||||
|
||||
def get_documents(self, user, include_children=False):
|
||||
"""
|
||||
List all documents for a user.
|
||||
|
||||
Retrieves metadata for all documents owned by the specified user.
|
||||
By default, only returns top-level documents (not child/extracted documents).
|
||||
|
||||
Args:
|
||||
user: User identifier
|
||||
include_children: If True, also include child documents (default: False)
|
||||
|
||||
Returns:
|
||||
list[DocumentMetadata]: List of document metadata objects
|
||||
|
|
@ -185,18 +323,24 @@ class Library:
|
|||
Example:
|
||||
```python
|
||||
library = api.library()
|
||||
|
||||
# Get only top-level documents
|
||||
docs = library.get_documents(user="trustgraph")
|
||||
|
||||
for doc in docs:
|
||||
print(f"{doc.id}: {doc.title} ({doc.kind})")
|
||||
print(f" Uploaded: {doc.time}")
|
||||
print(f" Tags: {', '.join(doc.tags)}")
|
||||
|
||||
# Get all documents including extracted pages
|
||||
all_docs = library.get_documents(user="trustgraph", include_children=True)
|
||||
```
|
||||
"""
|
||||
|
||||
input = {
|
||||
"operation": "list-documents",
|
||||
"user": user,
|
||||
"include-children": include_children,
|
||||
}
|
||||
|
||||
object = self.request(input)
|
||||
|
|
@ -218,7 +362,9 @@ class Library:
|
|||
for w in v["metadata"]
|
||||
],
|
||||
user = v["user"],
|
||||
tags = v["tags"]
|
||||
tags = v["tags"],
|
||||
parent_id = v.get("parent-id", ""),
|
||||
document_type = v.get("document-type", "source"),
|
||||
)
|
||||
for v in object["document-metadatas"]
|
||||
]
|
||||
|
|
@ -261,7 +407,7 @@ class Library:
|
|||
doc = object["document-metadata"]
|
||||
|
||||
try:
|
||||
DocumentMetadata(
|
||||
return DocumentMetadata(
|
||||
id = doc["id"],
|
||||
time = datetime.datetime.fromtimestamp(doc["time"]),
|
||||
kind = doc["kind"],
|
||||
|
|
@ -276,7 +422,9 @@ class Library:
|
|||
for w in doc["metadata"]
|
||||
],
|
||||
user = doc["user"],
|
||||
tags = doc["tags"]
|
||||
tags = doc["tags"],
|
||||
parent_id = doc.get("parent-id", ""),
|
||||
document_type = doc.get("document-type", "source"),
|
||||
)
|
||||
except Exception as e:
|
||||
logger.error("Failed to parse document response", exc_info=True)
|
||||
|
|
@ -535,3 +683,447 @@ class Library:
|
|||
logger.error("Failed to parse processing list response", exc_info=True)
|
||||
raise ProtocolException(f"Response not formatted correctly")
|
||||
|
||||
# Chunked upload management methods
|
||||
|
||||
def get_pending_uploads(self, user):
    """
    List all pending (in-progress) uploads for a user.

    Retrieves information about chunked uploads that have been started
    but not yet completed.

    Args:
        user: User identifier

    Returns:
        list[dict]: List of pending upload information

    Example:
        ```python
        library = api.library()
        pending = library.get_pending_uploads(user="trustgraph")

        for upload in pending:
            print(f"Upload {upload['upload_id']}:")
            print(f"  Document: {upload['document_id']}")
            print(f"  Progress: {upload['chunks_received']}/{upload['total_chunks']}")
        ```
    """
    # Ask the librarian for every open upload session owned by this user.
    response = self.request({
        "operation": "list-uploads",
        "user": user,
    })

    # A missing key means there are no sessions; normalise to [].
    return response.get("upload-sessions", [])
|
||||
|
||||
def get_upload_status(self, upload_id, user):
    """
    Get the status of a specific upload.

    Retrieves detailed status information about a chunked upload,
    including which chunks have been received and which are missing.

    Args:
        upload_id: Upload session identifier
        user: User identifier

    Returns:
        dict: Upload status information including:
            - upload_id: The upload session ID
            - state: "in-progress", "completed", or "expired"
            - chunks_received: Number of chunks received
            - total_chunks: Total number of chunks expected
            - received_chunks: List of received chunk indices
            - missing_chunks: List of missing chunk indices
            - bytes_received: Total bytes received
            - total_bytes: Total expected bytes

    Example:
        ```python
        library = api.library()
        status = library.get_upload_status(
            upload_id="abc-123",
            user="trustgraph"
        )

        if status['state'] == 'in-progress':
            print(f"Missing chunks: {status['missing_chunks']}")
        ```
    """
    payload = {
        "operation": "get-upload-status",
        "upload-id": upload_id,
        "user": user,
    }
    # The service reply is returned to the caller as-is.
    return self.request(payload)
|
||||
|
||||
def abort_upload(self, upload_id, user):
    """
    Abort an in-progress upload.

    Cancels a chunked upload and cleans up any uploaded chunks.

    Args:
        upload_id: Upload session identifier
        user: User identifier

    Returns:
        dict: Empty response on success

    Example:
        ```python
        library = api.library()
        library.abort_upload(upload_id="abc-123", user="trustgraph")
        ```
    """
    # Single fire-and-return call; the server performs the cleanup.
    payload = {
        "operation": "abort-upload",
        "upload-id": upload_id,
        "user": user,
    }
    return self.request(payload)
|
||||
|
||||
def resume_upload(self, upload_id, document, user, on_progress=None):
    """
    Resume an interrupted upload.

    Continues a chunked upload that was previously interrupted,
    uploading only the missing chunks.

    Args:
        upload_id: Upload session identifier to resume
        document: Complete document content as bytes
        user: User identifier
        on_progress: Optional callback(bytes_sent, total_bytes) for progress updates

    Returns:
        dict: Response from completing the upload

    Raises:
        RuntimeError: If the upload session has expired

    Example:
        ```python
        library = api.library()

        # Check what's missing
        status = library.get_upload_status(
            upload_id="abc-123",
            user="trustgraph"
        )

        if status.get('upload-state') == 'in-progress':
            # Resume with the same document
            with open("large_document.pdf", "rb") as f:
                library.resume_upload(
                    upload_id="abc-123",
                    document=f.read(),
                    user="trustgraph"
                )
        ```
    """
    # Get current status
    status = self.get_upload_status(upload_id, user)

    if status.get("upload-state") == "expired":
        raise RuntimeError("Upload session has expired, please start a new upload")

    if status.get("upload-state") == "completed":
        return {"message": "Upload already completed"}

    missing_chunks = status.get("missing-chunks", [])
    total_chunks = status.get("total-chunks", 0)
    total_bytes = status.get("total-bytes", len(document))

    # Prefer the chunk size the session was created with.  Deriving it
    # as total_bytes // total_chunks is only correct when the document
    # divides evenly into chunks (e.g. a 12-byte doc in 5-byte chunks
    # has 3 chunks, and 12 // 3 == 4 would corrupt every chunk offset).
    chunk_size = status.get("chunk-size") or (
        total_bytes // total_chunks if total_chunks > 0 else DEFAULT_CHUNK_SIZE
    )

    logger.info(f"Resuming upload {upload_id}, {len(missing_chunks)} chunks remaining")

    already_uploaded = total_chunks - len(missing_chunks)

    # Upload missing chunks
    for sent_count, chunk_index in enumerate(missing_chunks, start=1):
        start = chunk_index * chunk_size
        end = min(start + chunk_size, len(document))

        self.request({
            "operation": "upload-chunk",
            "upload-id": upload_id,
            "chunk-index": chunk_index,
            "content": base64.b64encode(document[start:end]).decode("utf-8"),
            "user": user,
        })

        if on_progress:
            # Estimate progress including previously uploaded chunks.
            bytes_sent = min((already_uploaded + sent_count) * chunk_size, total_bytes)
            on_progress(bytes_sent, total_bytes)

        logger.debug(f"Resumed chunk {chunk_index}")

    # Complete upload
    return self.request({
        "operation": "complete-upload",
        "upload-id": upload_id,
        "user": user,
    })
|
||||
|
||||
# Child document methods
|
||||
|
||||
def add_child_document(
    self, document, id, parent_id, user, title, comments,
    kind="text/plain", tags=None, metadata=None,
):
    """
    Add a child document linked to a parent document.

    Child documents are typically extracted content (e.g., pages from a PDF).
    They are automatically marked with document_type="extracted" and linked
    to their parent via parent_id.

    Args:
        document: Document content as bytes
        id: Document identifier (auto-generated if None)
        parent_id: Parent document identifier (required)
        user: User/owner identifier
        title: Document title
        comments: Document description or comments
        kind: MIME type of the document (default: "text/plain")
        tags: List of tags for categorization (default: [])
        metadata: Optional metadata as list of Triple objects

    Returns:
        dict: Response from the add operation

    Raises:
        RuntimeError: If parent_id is not provided

    Example:
        ```python
        library = api.library()

        # Add extracted page from a PDF
        library.add_child_document(
            document=page_text.encode('utf-8'),
            id="doc-123-page-1",
            parent_id="doc-123",
            user="trustgraph",
            title="Page 1 of Research Paper",
            comments="First page extracted from PDF",
            kind="text/plain",
            tags=["extracted", "page"]
        )
        ```
    """
    if not parent_id:
        raise RuntimeError("parent_id is required for child documents")

    if id is None:
        # NOTE(review): hash() of bytes is salted per process, so
        # auto-generated ids are not stable across runs — confirm intended.
        id = hash(document)

    if not title:
        title = ""
    if not comments:
        comments = ""
    # tags defaults to None rather than a mutable [] default argument;
    # normalise here so callers see the same behaviour as before.
    if tags is None:
        tags = []

    triples = []
    if metadata:
        if isinstance(metadata, list):
            triples = [
                {
                    "s": from_value(t.s),
                    "p": from_value(t.p),
                    "o": from_value(t.o),
                }
                for t in metadata
            ]

    input = {
        "operation": "add-child-document",
        "document-metadata": {
            "id": id,
            "time": int(time.time()),
            "kind": kind,
            "title": title,
            "comments": comments,
            "metadata": triples,
            "user": user,
            "tags": tags,
            "parent-id": parent_id,
            "document-type": "extracted",
        },
        "content": base64.b64encode(document).decode("utf-8"),
    }

    return self.request(input)
|
||||
|
||||
def list_children(self, document_id, user):
    """
    List all child documents for a given parent document.

    Args:
        document_id: Parent document identifier
        user: User identifier

    Returns:
        list[DocumentMetadata]: List of child document metadata objects

    Raises:
        ProtocolException: If the response cannot be parsed

    Example:
        ```python
        library = api.library()
        children = library.list_children(
            document_id="doc-123",
            user="trustgraph"
        )

        for child in children:
            print(f"{child.id}: {child.title}")
        ```
    """
    response = self.request({
        "operation": "list-children",
        "document-id": document_id,
        "user": user,
    })

    try:
        children = []
        for entry in response.get("document-metadatas", []):
            # Wire triples -> Triple objects with decoded s/p/o values.
            triples = [
                Triple(
                    s=to_value(t["s"]),
                    p=to_value(t["p"]),
                    o=to_value(t["o"])
                )
                for t in entry.get("metadata", [])
            ]
            children.append(DocumentMetadata(
                id=entry["id"],
                time=datetime.datetime.fromtimestamp(entry["time"]),
                kind=entry["kind"],
                title=entry["title"],
                comments=entry.get("comments", ""),
                metadata=triples,
                user=entry["user"],
                tags=entry.get("tags", []),
                parent_id=entry.get("parent-id", ""),
                document_type=entry.get("document-type", "source"),
            ))
        return children
    except Exception:
        logger.error("Failed to parse children response", exc_info=True)
        raise ProtocolException("Response not formatted correctly")
|
||||
|
||||
def get_document_content(self, user, id):
    """
    Get the content of a document.

    Retrieves the full content of a document as bytes.

    Args:
        user: User identifier
        id: Document identifier

    Returns:
        bytes: Document content

    Example:
        ```python
        library = api.library()
        content = library.get_document_content(
            user="trustgraph",
            id="doc-123"
        )

        # Write to file
        with open("output.pdf", "wb") as f:
            f.write(content)
        ```
    """
    response = self.request({
        "operation": "get-document-content",
        "user": user,
        "document-id": id,
    })

    # Content travels base64-encoded; a missing key decodes to b"".
    encoded = response.get("content", "")
    return base64.b64decode(encoded)
|
||||
|
||||
def stream_document_to_file(self, user, id, file_path, chunk_size=1024*1024, on_progress=None):
    """
    Stream document content to a file.

    Downloads document content in chunks and writes directly to a file,
    enabling memory-efficient handling of large documents.

    Args:
        user: User identifier
        id: Document identifier
        file_path: Path to write the document content
        chunk_size: Size of each chunk to download (default 1MB)
        on_progress: Optional callback(bytes_received, total_bytes) for progress updates

    Returns:
        int: Total bytes written

    Example:
        ```python
        library = api.library()

        def progress(received, total):
            print(f"Downloaded {received}/{total} bytes")

        library.stream_document_to_file(
            user="trustgraph",
            id="large-doc-123",
            file_path="/tmp/document.pdf",
            on_progress=progress
        )
        ```
    """
    written = 0
    index = 0

    with open(file_path, "wb") as out:
        while True:
            reply = self.request({
                "operation": "stream-document",
                "user": user,
                "document-id": id,
                "chunk-index": index,
                "chunk-size": chunk_size,
            })

            piece = base64.b64decode(reply.get("content", ""))

            # An empty chunk means the server has nothing more to send.
            if not piece:
                break

            out.write(piece)
            written += len(piece)

            expected_chunks = reply.get("total-chunks", 1)
            expected_bytes = reply.get("total-bytes", written)

            if on_progress:
                on_progress(written, expected_bytes)

            # Stop once the final chunk has been consumed.
            if index >= expected_chunks - 1:
                break

            index += 1

    return written
|
||||
|
||||
|
|
|
|||
|
|
@ -64,6 +64,8 @@ class DocumentMetadata:
|
|||
metadata: List of RDF triples providing structured metadata
|
||||
user: User/owner identifier
|
||||
tags: List of tags for categorization
|
||||
parent_id: Parent document ID for child documents (empty for top-level docs)
|
||||
document_type: "source" for uploaded documents, "extracted" for derived content
|
||||
"""
|
||||
id : str
|
||||
time : datetime.datetime
|
||||
|
|
@ -73,6 +75,8 @@ class DocumentMetadata:
|
|||
metadata : List[Triple]
|
||||
user : str
|
||||
tags : List[str]
|
||||
parent_id : str = ""
|
||||
document_type : str = "source"
|
||||
|
||||
@dataclasses.dataclass
|
||||
class ProcessingMetadata:
|
||||
|
|
|
|||
|
|
@ -1,20 +1,37 @@
|
|||
"""
|
||||
Base chunking service that provides parameter specification functionality
|
||||
for chunk-size and chunk-overlap parameters
|
||||
for chunk-size and chunk-overlap parameters, and librarian client for
|
||||
fetching large document content.
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import base64
|
||||
import logging
|
||||
import uuid
|
||||
|
||||
from .flow_processor import FlowProcessor
|
||||
from .parameter_spec import ParameterSpec
|
||||
from .consumer import Consumer
|
||||
from .producer import Producer
|
||||
from .metrics import ConsumerMetrics, ProducerMetrics
|
||||
|
||||
from ..schema import LibrarianRequest, LibrarianResponse
|
||||
from ..schema import librarian_request_queue, librarian_response_queue
|
||||
|
||||
# Module logger
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
default_librarian_request_queue = librarian_request_queue
|
||||
default_librarian_response_queue = librarian_response_queue
|
||||
|
||||
|
||||
class ChunkingService(FlowProcessor):
|
||||
"""Base service for chunking processors with parameter specification support"""
|
||||
|
||||
def __init__(self, **params):
|
||||
|
||||
id = params.get("id", "chunker")
|
||||
|
||||
# Call parent constructor
|
||||
super(ChunkingService, self).__init__(**params)
|
||||
|
||||
|
|
@ -27,8 +44,122 @@ class ChunkingService(FlowProcessor):
|
|||
ParameterSpec(name="chunk-overlap")
|
||||
)
|
||||
|
||||
# Librarian client for fetching document content
|
||||
librarian_request_q = params.get(
|
||||
"librarian_request_queue", default_librarian_request_queue
|
||||
)
|
||||
librarian_response_q = params.get(
|
||||
"librarian_response_queue", default_librarian_response_queue
|
||||
)
|
||||
|
||||
librarian_request_metrics = ProducerMetrics(
|
||||
processor=id, flow=None, name="librarian-request"
|
||||
)
|
||||
|
||||
self.librarian_request_producer = Producer(
|
||||
backend=self.pubsub,
|
||||
topic=librarian_request_q,
|
||||
schema=LibrarianRequest,
|
||||
metrics=librarian_request_metrics,
|
||||
)
|
||||
|
||||
librarian_response_metrics = ConsumerMetrics(
|
||||
processor=id, flow=None, name="librarian-response"
|
||||
)
|
||||
|
||||
self.librarian_response_consumer = Consumer(
|
||||
taskgroup=self.taskgroup,
|
||||
backend=self.pubsub,
|
||||
flow=None,
|
||||
topic=librarian_response_q,
|
||||
subscriber=f"{id}-librarian",
|
||||
schema=LibrarianResponse,
|
||||
handler=self.on_librarian_response,
|
||||
metrics=librarian_response_metrics,
|
||||
)
|
||||
|
||||
# Pending librarian requests: request_id -> asyncio.Future
|
||||
self.pending_requests = {}
|
||||
|
||||
logger.debug("ChunkingService initialized with parameter specifications")
|
||||
|
||||
async def start(self):
    """Start the service: parent plumbing first, then librarian comms.

    The parent start() runs before the librarian producer/consumer so
    core processor state exists when librarian traffic begins.
    """
    await super(ChunkingService, self).start()
    await self.librarian_request_producer.start()
    await self.librarian_response_consumer.start()
|
||||
|
||||
async def on_librarian_response(self, msg, consumer, flow):
    """Handle responses from the librarian service.

    Resolves the pending future registered under the message's
    correlation id; unknown or missing ids are logged and dropped.
    """
    payload = msg.value()
    correlation_id = msg.properties().get("id")

    pending = (
        self.pending_requests.pop(correlation_id, None)
        if correlation_id else None
    )

    if pending is not None:
        pending.set_result(payload)
    else:
        logger.warning(f"Received unexpected librarian response: {correlation_id}")
|
||||
|
||||
async def fetch_document_content(self, document_id, user, timeout=120):
    """
    Fetch document content from librarian via Pulsar.

    Sends a get-document-content request tagged with a fresh correlation
    id, then waits for on_librarian_response to resolve the matching
    future.

    Args:
        document_id: Identifier of the document to fetch
        user: User identifier
        timeout: Seconds to wait for the librarian response (default 120)

    Returns:
        The `content` field of the librarian response.

    Raises:
        RuntimeError: On a librarian-reported error or on timeout.
    """
    request_id = str(uuid.uuid4())

    request = LibrarianRequest(
        operation="get-document-content",
        document_id=document_id,
        user=user,
    )

    # Register the future before sending so the response handler can
    # always find it, even if the reply arrives immediately.
    future = asyncio.get_running_loop().create_future()
    self.pending_requests[request_id] = future

    try:
        # Send request
        await self.librarian_request_producer.send(
            request, properties={"id": request_id}
        )

        # Wait for response
        response = await asyncio.wait_for(future, timeout=timeout)

        if response.error:
            raise RuntimeError(
                f"Librarian error: {response.error.type}: {response.error.message}"
            )

        return response.content

    except asyncio.TimeoutError:
        raise RuntimeError(f"Timeout fetching document {document_id}")
    finally:
        # Clean up on every exit path — previously a send() failure or a
        # librarian error left the future in pending_requests forever.
        # pop() with a default is harmless when the handler already
        # removed the entry on the success path.
        self.pending_requests.pop(request_id, None)
|
||||
|
||||
async def get_document_text(self, doc):
    """
    Get text content from a TextDocument, fetching from librarian if needed.

    Args:
        doc: TextDocument with either inline text or document_id

    Returns:
        str: The document text content
    """
    # Inline text present (or no document_id to fetch): decode directly.
    if not doc.document_id or doc.text:
        return doc.text.decode("utf-8")

    logger.info(f"Fetching document {doc.document_id} from librarian...")
    payload = await self.fetch_document_content(
        document_id=doc.document_id,
        user=doc.metadata.user,
    )
    # Content is base64 encoded; it may arrive as str or bytes.
    if isinstance(payload, str):
        payload = payload.encode('utf-8')
    text = base64.b64decode(payload).decode("utf-8")
    logger.info(f"Fetched {len(text)} characters from librarian")
    return text
|
||||
|
||||
async def chunk_document(self, msg, consumer, flow, default_chunk_size, default_chunk_overlap):
|
||||
"""
|
||||
Extract chunk parameters from flow and return effective values
|
||||
|
|
@ -59,4 +190,16 @@ class ChunkingService(FlowProcessor):
|
|||
@staticmethod
def add_args(parser):
    """Add chunking service arguments to parser"""
    FlowProcessor.add_args(parser)

    # Both librarian queues share the same argument shape.
    for label, flag, fallback in (
        ("request", '--librarian-request-queue', default_librarian_request_queue),
        ("response", '--librarian-response-queue', default_librarian_response_queue),
    ):
        parser.add_argument(
            flag,
            default=fallback,
            help=f'Librarian {label} queue (default: {fallback})',
        )
|
||||
|
|
@ -44,14 +44,21 @@ class LibraryRequestTranslator(MessageTranslator):
|
|||
|
||||
return LibrarianRequest(
|
||||
operation=data.get("operation"),
|
||||
document_id=data.get("document-id"),
|
||||
processing_id=data.get("processing-id"),
|
||||
document_id=data.get("document-id", ""),
|
||||
processing_id=data.get("processing-id", ""),
|
||||
document_metadata=doc_metadata,
|
||||
processing_metadata=proc_metadata,
|
||||
content=content,
|
||||
user=data.get("user"),
|
||||
collection=data.get("collection"),
|
||||
criteria=criteria
|
||||
user=data.get("user", ""),
|
||||
collection=data.get("collection", ""),
|
||||
criteria=criteria,
|
||||
# Chunked upload fields
|
||||
total_size=data.get("total-size", 0),
|
||||
chunk_size=data.get("chunk-size", 0),
|
||||
upload_id=data.get("upload-id", ""),
|
||||
chunk_index=data.get("chunk-index", 0),
|
||||
# List documents filtering
|
||||
include_children=data.get("include-children", False),
|
||||
)
|
||||
|
||||
def from_pulsar(self, obj: LibrarianRequest) -> Dict[str, Any]:
|
||||
|
|
@ -98,25 +105,71 @@ class LibraryResponseTranslator(MessageTranslator):
|
|||
|
||||
def from_pulsar(self, obj: LibrarianResponse) -> Dict[str, Any]:
|
||||
result = {}
|
||||
|
||||
|
||||
if obj.error:
|
||||
result["error"] = {
|
||||
"type": obj.error.type,
|
||||
"message": obj.error.message,
|
||||
}
|
||||
|
||||
if obj.document_metadata:
|
||||
result["document-metadata"] = self.doc_metadata_translator.from_pulsar(obj.document_metadata)
|
||||
|
||||
|
||||
if obj.content:
|
||||
result["content"] = obj.content.decode("utf-8") if isinstance(obj.content, bytes) else obj.content
|
||||
|
||||
|
||||
if obj.document_metadatas is not None:
|
||||
result["document-metadatas"] = [
|
||||
self.doc_metadata_translator.from_pulsar(dm)
|
||||
for dm in obj.document_metadatas
|
||||
]
|
||||
|
||||
|
||||
if obj.processing_metadatas is not None:
|
||||
result["processing-metadatas"] = [
|
||||
self.proc_metadata_translator.from_pulsar(pm)
|
||||
for pm in obj.processing_metadatas
|
||||
]
|
||||
|
||||
|
||||
# Chunked upload response fields
|
||||
if obj.upload_id:
|
||||
result["upload-id"] = obj.upload_id
|
||||
if obj.chunk_size:
|
||||
result["chunk-size"] = obj.chunk_size
|
||||
if obj.total_chunks:
|
||||
result["total-chunks"] = obj.total_chunks
|
||||
if obj.chunk_index:
|
||||
result["chunk-index"] = obj.chunk_index
|
||||
if obj.chunks_received:
|
||||
result["chunks-received"] = obj.chunks_received
|
||||
if obj.bytes_received:
|
||||
result["bytes-received"] = obj.bytes_received
|
||||
if obj.total_bytes:
|
||||
result["total-bytes"] = obj.total_bytes
|
||||
if obj.document_id:
|
||||
result["document-id"] = obj.document_id
|
||||
if obj.object_id:
|
||||
result["object-id"] = obj.object_id
|
||||
if obj.upload_state:
|
||||
result["upload-state"] = obj.upload_state
|
||||
if obj.received_chunks:
|
||||
result["received-chunks"] = obj.received_chunks
|
||||
if obj.missing_chunks:
|
||||
result["missing-chunks"] = obj.missing_chunks
|
||||
if obj.upload_sessions:
|
||||
result["upload-sessions"] = [
|
||||
{
|
||||
"upload-id": s.upload_id,
|
||||
"document-id": s.document_id,
|
||||
"document-metadata-json": s.document_metadata_json,
|
||||
"total-size": s.total_size,
|
||||
"chunk-size": s.chunk_size,
|
||||
"total-chunks": s.total_chunks,
|
||||
"chunks-received": s.chunks_received,
|
||||
"created-at": s.created_at,
|
||||
}
|
||||
for s in obj.upload_sessions
|
||||
]
|
||||
|
||||
return result
|
||||
|
||||
def from_response_with_completion(self, obj: LibrarianResponse) -> Tuple[Dict[str, Any], bool]:
|
||||
|
|
|
|||
|
|
@ -20,12 +20,14 @@ class DocumentMetadataTranslator(Translator):
|
|||
comments=data.get("comments"),
|
||||
metadata=self.subgraph_translator.to_pulsar(metadata) if metadata is not None else [],
|
||||
user=data.get("user"),
|
||||
tags=data.get("tags")
|
||||
tags=data.get("tags"),
|
||||
parent_id=data.get("parent-id", ""),
|
||||
document_type=data.get("document-type", "source"),
|
||||
)
|
||||
|
||||
def from_pulsar(self, obj: DocumentMetadata) -> Dict[str, Any]:
|
||||
result = {}
|
||||
|
||||
|
||||
if obj.id:
|
||||
result["id"] = obj.id
|
||||
if obj.time:
|
||||
|
|
@ -42,7 +44,11 @@ class DocumentMetadataTranslator(Translator):
|
|||
result["user"] = obj.user
|
||||
if obj.tags is not None:
|
||||
result["tags"] = obj.tags
|
||||
|
||||
if obj.parent_id:
|
||||
result["parent-id"] = obj.parent_id
|
||||
if obj.document_type:
|
||||
result["document-type"] = obj.document_type
|
||||
|
||||
return result
|
||||
|
||||
|
||||
|
|
|
|||
|
|
@ -10,6 +10,9 @@ from ..core.topic import topic
|
|||
class Document:
|
||||
metadata: Metadata | None = None
|
||||
data: bytes = b""
|
||||
# For large document streaming: if document_id is set, the receiver should
|
||||
# fetch content from librarian instead of using inline data
|
||||
document_id: str = ""
|
||||
|
||||
############################################################################
|
||||
|
||||
|
|
@ -19,6 +22,9 @@ class Document:
|
|||
class TextDocument:
|
||||
metadata: Metadata | None = None
|
||||
text: bytes = b""
|
||||
# For large document streaming: if document_id is set, the receiver should
|
||||
# fetch content from librarian instead of using inline text
|
||||
document_id: str = ""
|
||||
|
||||
############################################################################
|
||||
|
||||
|
|
|
|||
|
|
@ -49,6 +49,36 @@ from ..core.metadata import Metadata
|
|||
# <- (processing_metadata[])
|
||||
# <- (error)
|
||||
|
||||
# begin-upload
|
||||
# -> (document_metadata, total_size, chunk_size)
|
||||
# <- (upload_id, chunk_size, total_chunks)
|
||||
# <- (error)
|
||||
|
||||
# upload-chunk
|
||||
# -> (upload_id, chunk_index, content)
|
||||
# <- (upload_id, chunk_index, chunks_received, total_chunks, bytes_received, total_bytes)
|
||||
# <- (error)
|
||||
|
||||
# complete-upload
|
||||
# -> (upload_id)
|
||||
# <- (document_id, object_id)
|
||||
# <- (error)
|
||||
|
||||
# abort-upload
|
||||
# -> (upload_id)
|
||||
# <- ()
|
||||
# <- (error)
|
||||
|
||||
# get-upload-status
|
||||
# -> (upload_id)
|
||||
# <- (upload_id, state, chunks_received, missing_chunks, total_chunks, bytes_received, total_bytes)
|
||||
# <- (error)
|
||||
|
||||
# list-uploads
|
||||
# -> (user)
|
||||
# <- (uploads[])
|
||||
# <- (error)
|
||||
|
||||
@dataclass
|
||||
class DocumentMetadata:
|
||||
id: str = ""
|
||||
|
|
@ -59,6 +89,9 @@ class DocumentMetadata:
|
|||
metadata: list[Triple] = field(default_factory=list)
|
||||
user: str = ""
|
||||
tags: list[str] = field(default_factory=list)
|
||||
# Child document support
|
||||
parent_id: str = "" # Empty for top-level docs, set for children
|
||||
document_type: str = "source" # "source" or "extracted"
|
||||
|
||||
@dataclass
|
||||
class ProcessingMetadata:
|
||||
|
|
@ -76,11 +109,33 @@ class Criteria:
|
|||
value: str = ""
|
||||
operator: str = ""
|
||||
|
||||
@dataclass
|
||||
class UploadProgress:
|
||||
"""Progress information for chunked uploads."""
|
||||
upload_id: str = ""
|
||||
chunks_received: int = 0
|
||||
total_chunks: int = 0
|
||||
bytes_received: int = 0
|
||||
total_bytes: int = 0
|
||||
|
||||
@dataclass
|
||||
class UploadSession:
|
||||
"""Information about an in-progress upload."""
|
||||
upload_id: str = ""
|
||||
document_id: str = ""
|
||||
document_metadata_json: str = "" # JSON-encoded DocumentMetadata
|
||||
total_size: int = 0
|
||||
chunk_size: int = 0
|
||||
total_chunks: int = 0
|
||||
chunks_received: int = 0
|
||||
created_at: str = ""
|
||||
|
||||
@dataclass
|
||||
class LibrarianRequest:
|
||||
# add-document, remove-document, update-document, get-document-metadata,
|
||||
# get-document-content, add-processing, remove-processing, list-documents,
|
||||
# list-processing
|
||||
# list-processing, begin-upload, upload-chunk, complete-upload, abort-upload,
|
||||
# get-upload-status, list-uploads
|
||||
operation: str = ""
|
||||
|
||||
# add-document, remove-document, update-document, get-document-metadata,
|
||||
|
|
@ -90,16 +145,16 @@ class LibrarianRequest:
|
|||
# add-processing, remove-processing
|
||||
processing_id: str = ""
|
||||
|
||||
# add-document, update-document
|
||||
# add-document, update-document, begin-upload
|
||||
document_metadata: DocumentMetadata | None = None
|
||||
|
||||
# add-processing
|
||||
processing_metadata: ProcessingMetadata | None = None
|
||||
|
||||
# add-document
|
||||
# add-document, upload-chunk
|
||||
content: bytes = b""
|
||||
|
||||
# list-documents, list-processing
|
||||
# list-documents, list-processing, list-uploads
|
||||
user: str = ""
|
||||
|
||||
# list-documents?, list-processing?
|
||||
|
|
@ -108,6 +163,19 @@ class LibrarianRequest:
|
|||
#
|
||||
criteria: list[Criteria] = field(default_factory=list)
|
||||
|
||||
# begin-upload
|
||||
total_size: int = 0
|
||||
chunk_size: int = 0
|
||||
|
||||
# upload-chunk, complete-upload, abort-upload, get-upload-status
|
||||
upload_id: str = ""
|
||||
|
||||
# upload-chunk, stream-document
|
||||
chunk_index: int = 0
|
||||
|
||||
# list-documents - whether to include child documents (default False)
|
||||
include_children: bool = False
|
||||
|
||||
@dataclass
|
||||
class LibrarianResponse:
|
||||
error: Error | None = None
|
||||
|
|
@ -116,6 +184,29 @@ class LibrarianResponse:
|
|||
document_metadatas: list[DocumentMetadata] = field(default_factory=list)
|
||||
processing_metadatas: list[ProcessingMetadata] = field(default_factory=list)
|
||||
|
||||
# begin-upload response
|
||||
upload_id: str = ""
|
||||
chunk_size: int = 0
|
||||
total_chunks: int = 0
|
||||
|
||||
# upload-chunk response
|
||||
chunk_index: int = 0
|
||||
chunks_received: int = 0
|
||||
bytes_received: int = 0
|
||||
total_bytes: int = 0
|
||||
|
||||
# complete-upload response
|
||||
document_id: str = ""
|
||||
object_id: str = ""
|
||||
|
||||
# get-upload-status response
|
||||
upload_state: str = "" # "in-progress", "completed", "expired"
|
||||
received_chunks: list[int] = field(default_factory=list)
|
||||
missing_chunks: list[int] = field(default_factory=list)
|
||||
|
||||
# list-uploads response
|
||||
upload_sessions: list[UploadSession] = field(default_factory=list)
|
||||
|
||||
# FIXME: Is this right? Using persistence on librarian so that
|
||||
# message chunking works
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue