Incremental / large document loading (#659)

Tech spec

BlobStore (trustgraph-flow/trustgraph/librarian/blob_store.py):
- get_stream() - yields document content in chunks for streaming retrieval
- create_multipart_upload() - initializes S3 multipart upload, returns upload_id
- upload_part() - uploads a single part, returns etag
- complete_multipart_upload() - finalizes upload with part etags
- abort_multipart_upload() - cancels and cleans up

Cassandra schema (trustgraph-flow/trustgraph/tables/library.py):
- New upload_session table with 24-hour TTL
- Index on user for listing sessions
- Prepared statements for all operations
- Methods: create_upload_session(), get_upload_session(), update_upload_session_chunk(), delete_upload_session(), list_upload_sessions()

- Schema extended with UploadSession, UploadProgress, and new request/response fields
- Librarian methods: begin_upload, upload_chunk, complete_upload, abort_upload, get_upload_status, list_uploads
- Service routing for all new operations
- Python SDK with transparent chunked upload:
  - add_document() auto-switches to chunked for files > 10MB
  - Progress callback support (on_progress)
  - get_pending_uploads(), get_upload_status(), abort_upload(), resume_upload()
- Document table: Added parent_id and document_type columns with index
- Document schema (knowledge/document.py): Added document_id field for streaming retrieval
- Librarian operations:
  - add-child-document for extracted PDF pages
  - list-children to get child documents
  - stream-document for chunked content retrieval
  - Cascade delete removes children when parent is deleted
  - list-documents filters children by default
- PDF decoder (decoding/pdf/pdf_decoder.py): Updated to stream large documents from librarian API to temp file
- Librarian service (librarian/service.py): Sends document_id instead of content for large PDFs (>2MB)
- Deprecated tools (load_pdf.py, load_text.py): Added deprecation warnings directing users to tg-add-library-document + tg-start-library-processing

Remove load_pdf and load_text utils

Move chunker/librarian comms to base class

Updating tests
2026-07-01 17:39:39 +02:00 · 2026-03-04 16:57:58 +00:00 · 2026-03-04 16:57:58 +00:00 · a630e143ef
commit a630e143ef
parent a38ca9474f
21 changed files with 3164 additions and 650 deletions
--- a/trustgraph-base/trustgraph/messaging/translators/library.py
+++ b/trustgraph-base/trustgraph/messaging/translators/library.py
@@ -44,14 +44,21 @@ class LibraryRequestTranslator(MessageTranslator):
        
        return LibrarianRequest(
            operation=data.get("operation"),
-            document_id=data.get("document-id"),
-            processing_id=data.get("processing-id"),
+            document_id=data.get("document-id", ""),
+            processing_id=data.get("processing-id", ""),
            document_metadata=doc_metadata,
            processing_metadata=proc_metadata,
            content=content,
-            user=data.get("user"),
-            collection=data.get("collection"),
-            criteria=criteria
+            user=data.get("user", ""),
+            collection=data.get("collection", ""),
+            criteria=criteria,
+            # Chunked upload fields
+            total_size=data.get("total-size", 0),
+            chunk_size=data.get("chunk-size", 0),
+            upload_id=data.get("upload-id", ""),
+            chunk_index=data.get("chunk-index", 0),
+            # List documents filtering
+            include_children=data.get("include-children", False),
        )
    
    def from_pulsar(self, obj: LibrarianRequest) -> Dict[str, Any]:
@@ -98,25 +105,71 @@ class LibraryResponseTranslator(MessageTranslator):
    
    def from_pulsar(self, obj: LibrarianResponse) -> Dict[str, Any]:
        result = {}
-        
+
+        if obj.error:
+            result["error"] = {
+                "type": obj.error.type,
+                "message": obj.error.message,
+            }
+
        if obj.document_metadata:
            result["document-metadata"] = self.doc_metadata_translator.from_pulsar(obj.document_metadata)
-        
+
        if obj.content:
            result["content"] = obj.content.decode("utf-8") if isinstance(obj.content, bytes) else obj.content
-        
+
        if obj.document_metadatas is not None:
            result["document-metadatas"] = [
                self.doc_metadata_translator.from_pulsar(dm)
                for dm in obj.document_metadatas
            ]
-        
+
        if obj.processing_metadatas is not None:
            result["processing-metadatas"] = [
                self.proc_metadata_translator.from_pulsar(pm)
                for pm in obj.processing_metadatas
            ]
-        
+
+        # Chunked upload response fields
+        if obj.upload_id:
+            result["upload-id"] = obj.upload_id
+        if obj.chunk_size:
+            result["chunk-size"] = obj.chunk_size
+        if obj.total_chunks:
+            result["total-chunks"] = obj.total_chunks
+        if obj.chunk_index:
+            result["chunk-index"] = obj.chunk_index
+        if obj.chunks_received:
+            result["chunks-received"] = obj.chunks_received
+        if obj.bytes_received:
+            result["bytes-received"] = obj.bytes_received
+        if obj.total_bytes:
+            result["total-bytes"] = obj.total_bytes
+        if obj.document_id:
+            result["document-id"] = obj.document_id
+        if obj.object_id:
+            result["object-id"] = obj.object_id
+        if obj.upload_state:
+            result["upload-state"] = obj.upload_state
+        if obj.received_chunks:
+            result["received-chunks"] = obj.received_chunks
+        if obj.missing_chunks:
+            result["missing-chunks"] = obj.missing_chunks
+        if obj.upload_sessions:
+            result["upload-sessions"] = [
+                {
+                    "upload-id": s.upload_id,
+                    "document-id": s.document_id,
+                    "document-metadata-json": s.document_metadata_json,
+                    "total-size": s.total_size,
+                    "chunk-size": s.chunk_size,
+                    "total-chunks": s.total_chunks,
+                    "chunks-received": s.chunks_received,
+                    "created-at": s.created_at,
+                }
+                for s in obj.upload_sessions
+            ]
+
        return result
    
    def from_response_with_completion(self, obj: LibrarianResponse) -> Tuple[Dict[str, Any], bool]:
--- a/trustgraph-base/trustgraph/messaging/translators/metadata.py
+++ b/trustgraph-base/trustgraph/messaging/translators/metadata.py
@@ -20,12 +20,14 @@ class DocumentMetadataTranslator(Translator):
            comments=data.get("comments"),
            metadata=self.subgraph_translator.to_pulsar(metadata) if metadata is not None else [],
            user=data.get("user"),
-            tags=data.get("tags")
+            tags=data.get("tags"),
+            parent_id=data.get("parent-id", ""),
+            document_type=data.get("document-type", "source"),
        )
    
    def from_pulsar(self, obj: DocumentMetadata) -> Dict[str, Any]:
        result = {}
-        
+
        if obj.id:
            result["id"] = obj.id
        if obj.time:
@@ -42,7 +44,11 @@ class DocumentMetadataTranslator(Translator):
            result["user"] = obj.user
        if obj.tags is not None:
            result["tags"] = obj.tags
-            
+        if obj.parent_id:
+            result["parent-id"] = obj.parent_id
+        if obj.document_type:
+            result["document-type"] = obj.document_type
+
        return result