Incremental / large document loading (#659)

Tech spec

BlobStore (trustgraph-flow/trustgraph/librarian/blob_store.py):
- get_stream() - yields document content in chunks for streaming retrieval
- create_multipart_upload() - initializes S3 multipart upload, returns upload_id
- upload_part() - uploads a single part, returns etag
- complete_multipart_upload() - finalizes upload with part etags
- abort_multipart_upload() - cancels and cleans up

Cassandra schema (trustgraph-flow/trustgraph/tables/library.py):
- New upload_session table with 24-hour TTL
- Index on user for listing sessions
- Prepared statements for all operations
- Methods: create_upload_session(), get_upload_session(), update_upload_session_chunk(), delete_upload_session(), list_upload_sessions()
- Schema extended with UploadSession, UploadProgress, and new request/response fields
- Librarian methods: begin_upload, upload_chunk, complete_upload, abort_upload, get_upload_status, list_uploads
- Service routing for all new operations
- Python SDK with transparent chunked upload:
- add_document() auto-switches to chunked for files > 10MB
- Progress callback support (on_progress)
- get_pending_uploads(), get_upload_status(), abort_upload(), resume_upload()
- Document table: Added parent_id and document_type columns with index
- Document schema (knowledge/document.py): Added document_id field for streaming retrieval
- Librarian operations:
- add-child-document for extracted PDF pages
- list-children to get child documents
- stream-document for chunked content retrieval
- Cascade delete removes children when parent is deleted
- list-documents filters children by default
- PDF decoder (decoding/pdf/pdf_decoder.py): Updated to stream large documents from librarian API to temp file
- Librarian service (librarian/service.py): Sends document_id instead of content for large PDFs (>2MB)
- Deprecated tools (load_pdf.py, load_text.py): Added deprecation warnings directing users to tg-add-library-document + tg-start-library-processing

Remove load_pdf and load_text utils

Move chunker/librarian comms to base class

Updating tests
2026-04-25 08:26:21 +02:00 · 2026-03-04 16:57:58 +00:00 · 2026-03-04 16:57:58 +00:00 · a630e143ef
commit a630e143ef
parent a38ca9474f
21 changed files with 3164 additions and 650 deletions
--- a/trustgraph-cli/trustgraph/cli/load_pdf.py
+++ b/trustgraph-cli/trustgraph/cli/load_pdf.py
@@ -1,200 +0,0 @@
-"""
-Loads a PDF document into TrustGraph processing by directing to
-the pdf-decoder queue.
-Consider using tg-add-library-document to load
-a document, followed by tg-start-library-processing to initiate processing.
-"""
-
-import hashlib
-import argparse
-import os
-import time
-import uuid
-
-from trustgraph.api import Api
-from trustgraph.knowledge import hash, to_uri
-from trustgraph.knowledge import PREF_PUBEV, PREF_DOC, PREF_ORG
-from trustgraph.knowledge import Organization, PublicationEvent
-from trustgraph.knowledge import DigitalDocument
-
-default_url = os.getenv("TRUSTGRAPH_URL", 'http://localhost:8088/')
-default_user = 'trustgraph'
-default_collection = 'default'
-
-class Loader:
-
-    def __init__(
-            self,
-            url,
-            flow_id,
-            user,
-            collection,
-            metadata,
-    ):
-
-        self.api = Api(url).flow().id(flow_id)
-
-        self.user = user
-        self.collection = collection
-        self.metadata = metadata
-
-    def load(self, files):
-
-        for file in files:
-            self.load_file(file)
-
-    def load_file(self, file):
-
-        try:
-
-            path = file
-            data = open(path, "rb").read()
-
-            # Create a SHA256 hash from the data
-            id = hash(data)
-
-            id = to_uri(PREF_DOC, id)
-
-            self.metadata.id = id
-
-            self.api.load_document(
-                document=data, id=id, metadata=self.metadata, 
-                user=self.user,
-                collection=self.collection,
-            )
-
-            print(f"{file}: Loaded successfully.")
-
-        except Exception as e:
-            print(f"{file}: Failed: {str(e)}", flush=True)
-            raise e
-
-def main():
-
-    parser = argparse.ArgumentParser(
-        prog='tg-load-pdf',
-        description=__doc__,
-    )
-
-    parser.add_argument(
-        '-u', '--url',
-        default=default_url,
-        help=f'API URL (default: {default_url})',
-    )
-
-    parser.add_argument(
-        '-f', '--flow-id',
-        default="default",
-        help=f'Flow ID (default: default)'
-    )
-
-    parser.add_argument(
-        '-U', '--user',
-        default=default_user,
-        help=f'User ID (default: {default_user})'
-    )
-
-    parser.add_argument(
-        '-C', '--collection',
-        default=default_collection,
-        help=f'Collection ID (default: {default_collection})'
-    )
-
-    parser.add_argument(
-        '--name', help=f'Document name'
-    )
-
-    parser.add_argument(
-        '--description', help=f'Document description'
-    )
-
-    parser.add_argument(
-        '--copyright-notice', help=f'Copyright notice'
-    )
-
-    parser.add_argument(
-        '--copyright-holder', help=f'Copyright holder'
-    )
-
-    parser.add_argument(
-        '--copyright-year', help=f'Copyright year'
-    )
-
-    parser.add_argument(
-        '--license', help=f'Copyright license'
-    )
-
-    parser.add_argument(
-        '--publication-organization', help=f'Publication organization'
-    )
-
-    parser.add_argument(
-        '--publication-description', help=f'Publication description'
-    )
-
-    parser.add_argument(
-        '--publication-date', help=f'Publication date'
-    )
-
-    parser.add_argument(
-        '--document-url', help=f'Document URL'
-    )
-
-    parser.add_argument(
-        '--keyword', nargs='+', help=f'Keyword'
-    )
-
-    parser.add_argument(
-        '--identifier', '--id', help=f'Document ID'
-    )
-
-    parser.add_argument(
-        'files', nargs='+',
-        help=f'File to load'
-    )
-
-    args = parser.parse_args()
-
-    try:
-
-        document = DigitalDocument(
-            id,
-            name=args.name,
-            description=args.description,
-            copyright_notice=args.copyright_notice,
-            copyright_holder=args.copyright_holder,
-            copyright_year=args.copyright_year,
-            license=args.license,
-            url=args.document_url,
-            keywords=args.keyword,
-        )
-
-        if args.publication_organization:
-            org = Organization(
-                id=to_uri(PREF_ORG, hash(args.publication_organization)),
-                name=args.publication_organization,
-            )
-            document.publication = PublicationEvent(
-                id = to_uri(PREF_PUBEV, str(uuid.uuid4())),
-                organization=org,
-                description=args.publication_description,
-                start_date=args.publication_date,
-                end_date=args.publication_date,
-            )
-
-        p = Loader(
-            url=args.url,
-            flow_id = args.flow_id,
-            user=args.user,
-            collection=args.collection,
-            metadata=document,
-        )
-
-        p.load(args.files)
-
-    except Exception as e:
-
-        print("Exception:", e, flush=True)
-
-if __name__ == "__main__":
-    main()
--- a/trustgraph-cli/trustgraph/cli/load_text.py
+++ b/trustgraph-cli/trustgraph/cli/load_text.py
@@ -1,205 +0,0 @@
-"""
-Loads a text document into TrustGraph processing by directing to a text
-loader queue.
-Consider using tg-add-library-document to load
-a document, followed by tg-start-library-processing to initiate processing.
-"""
-
-import pulsar
-from pulsar.schema import JsonSchema
-import hashlib
-import argparse
-import os
-import time
-import uuid
-
-from trustgraph.api import Api
-from trustgraph.knowledge import hash, to_uri
-from trustgraph.knowledge import PREF_PUBEV, PREF_DOC, PREF_ORG
-from trustgraph.knowledge import Organization, PublicationEvent
-from trustgraph.knowledge import DigitalDocument
-
-default_url = os.getenv("TRUSTGRAPH_URL", 'http://localhost:8088/')
-default_user = 'trustgraph'
-default_collection = 'default'
-
-class Loader:
-
-    def __init__(
-            self,
-            url,
-            flow_id,
-            user,
-            collection,
-            metadata,
-    ):
-
-        self.api = Api(url).flow().id(flow_id)
-
-        self.user = user
-        self.collection = collection
-        self.metadata = metadata
-
-    def load(self, files):
-
-        for file in files:
-            self.load_file(file)
-
-    def load_file(self, file):
-
-        try:
-
-            path = file
-            data = open(path, "rb").read()
-
-            # Create a SHA256 hash from the data
-            id = hash(data)
-
-            id = to_uri(PREF_DOC, id)
-
-            self.metadata.id = id
-
-            self.api.load_text(
-                text=data, id=id, metadata=self.metadata, 
-                user=self.user,
-                collection=self.collection,
-            )
-
-            print(f"{file}: Loaded successfully.")
-
-        except Exception as e:
-            print(f"{file}: Failed: {str(e)}", flush=True)
-            raise e
-
-def main():
-
-    parser = argparse.ArgumentParser(
-        prog='tg-load-text',
-        description=__doc__,
-    )
-
-    parser.add_argument(
-        '-u', '--url',
-        default=default_url,
-        help=f'API URL (default: {default_url})',
-    )
-
-    parser.add_argument(
-        '-f', '--flow-id',
-        default="default",
-        help=f'Flow ID (default: default)'
-    )
-
-    parser.add_argument(
-        '-U', '--user',
-        default=default_user,
-        help=f'User ID (default: {default_user})'
-    )
-
-    parser.add_argument(
-        '-C', '--collection',
-        default=default_collection,
-        help=f'Collection ID (default: {default_collection})'
-    )
-
-    parser.add_argument(
-        '--name', help=f'Document name'
-    )
-
-    parser.add_argument(
-        '--description', help=f'Document description'
-    )
-
-    parser.add_argument(
-        '--copyright-notice', help=f'Copyright notice'
-    )
-
-    parser.add_argument(
-        '--copyright-holder', help=f'Copyright holder'
-    )
-
-    parser.add_argument(
-        '--copyright-year', help=f'Copyright year'
-    )
-
-    parser.add_argument(
-        '--license', help=f'Copyright license'
-    )
-
-    parser.add_argument(
-        '--publication-organization', help=f'Publication organization'
-    )
-
-    parser.add_argument(
-        '--publication-description', help=f'Publication description'
-    )
-
-    parser.add_argument(
-        '--publication-date', help=f'Publication date'
-    )
-
-    parser.add_argument(
-        '--document-url', help=f'Document URL'
-    )
-
-    parser.add_argument(
-        '--keyword', nargs='+', help=f'Keyword'
-    )
-
-    parser.add_argument(
-        '--identifier', '--id', help=f'Document ID'
-    )
-
-    parser.add_argument(
-        'files', nargs='+',
-        help=f'File to load'
-    )
-
-    args = parser.parse_args()
-
-
-    try:
-
-        document = DigitalDocument(
-            id,
-            name=args.name,
-            description=args.description,
-            copyright_notice=args.copyright_notice,
-            copyright_holder=args.copyright_holder,
-            copyright_year=args.copyright_year,
-            license=args.license,
-            url=args.document_url,
-            keywords=args.keyword,
-        )
-
-        if args.publication_organization:
-            org = Organization(
-                id=to_uri(PREF_ORG, hash(args.publication_organization)),
-                name=args.publication_organization,
-            )
-            document.publication = PublicationEvent(
-                id = to_uri(PREF_PUBEV, str(uuid.uuid4())),
-                organization=org,
-                description=args.publication_description,
-                start_date=args.publication_date,
-                end_date=args.publication_date,
-            )
-
-        p = Loader(
-            url = args.url,
-            flow_id = args.flow_id,
-            user = args.user,
-            collection = args.collection,
-            metadata = document,
-        )
-
-        p.load(args.files)
-
-        print("All done.")
-
-    except Exception as e:
-
-        print("Exception:", e, flush=True)
-
-if __name__ == "__main__":
-    main()