Incremental / large document loading (#659)

Tech spec

BlobStore (trustgraph-flow/trustgraph/librarian/blob_store.py):
- get_stream() - yields document content in chunks for streaming retrieval
- create_multipart_upload() - initializes S3 multipart upload, returns
  upload_id
- upload_part() - uploads a single part, returns etag
- complete_multipart_upload() - finalizes upload with part etags
- abort_multipart_upload() - cancels and cleans up

Cassandra schema (trustgraph-flow/trustgraph/tables/library.py):
- New upload_session table with 24-hour TTL
- Index on user for listing sessions
- Prepared statements for all operations
- Methods: create_upload_session(), get_upload_session(),
  update_upload_session_chunk(), delete_upload_session(),
  list_upload_sessions()

- Schema extended with UploadSession, UploadProgress, and new
  request/response fields
- Librarian methods: begin_upload, upload_chunk, complete_upload,
  abort_upload, get_upload_status, list_uploads
- Service routing for all new operations
- Python SDK with transparent chunked upload:
  - add_document() auto-switches to chunked upload for files > 10MB
  - Progress callback support (on_progress)
  - get_pending_uploads(), get_upload_status(), abort_upload(),
    resume_upload()

- Document table: Added parent_id and document_type columns with index
- Document schema (knowledge/document.py): Added document_id field for
  streaming retrieval
- Librarian operations:
  - add-child-document for extracted PDF pages
  - list-children to get child documents
  - stream-document for chunked content retrieval
  - Cascade delete removes children when parent is deleted
  - list-documents excludes child documents by default (opt in with
    include-children)
- PDF decoder (decoding/pdf/pdf_decoder.py): Updated to stream large
  documents from librarian API to temp file
- Librarian service (librarian/service.py): Sends document_id instead of
  content for large PDFs (>2MB)
- Deprecated tools (load_pdf.py, load_text.py): Added deprecation
  warnings directing users to tg-add-library-document +
  tg-start-library-processing

Remove load_pdf and load_text utils

Move chunker/librarian comms to base class

Update tests
This commit is contained in:
cybermaggedon 2026-03-04 16:57:58 +00:00 committed by GitHub
parent a38ca9474f
commit a630e143ef
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
21 changed files with 3164 additions and 650 deletions

View file

@ -44,14 +44,21 @@ class LibraryRequestTranslator(MessageTranslator):
return LibrarianRequest(
operation=data.get("operation"),
document_id=data.get("document-id"),
processing_id=data.get("processing-id"),
document_id=data.get("document-id", ""),
processing_id=data.get("processing-id", ""),
document_metadata=doc_metadata,
processing_metadata=proc_metadata,
content=content,
user=data.get("user"),
collection=data.get("collection"),
criteria=criteria
user=data.get("user", ""),
collection=data.get("collection", ""),
criteria=criteria,
# Chunked upload fields
total_size=data.get("total-size", 0),
chunk_size=data.get("chunk-size", 0),
upload_id=data.get("upload-id", ""),
chunk_index=data.get("chunk-index", 0),
# List documents filtering
include_children=data.get("include-children", False),
)
def from_pulsar(self, obj: LibrarianRequest) -> Dict[str, Any]:
@ -98,25 +105,71 @@ class LibraryResponseTranslator(MessageTranslator):
def from_pulsar(self, obj: LibrarianResponse) -> Dict[str, Any]:
# Convert a LibrarianResponse into a plain dict keyed by kebab-case names.
# Most fields are copied only when truthy, so empty strings, empty lists
# and zero values are left out of the returned dict.
result = {}
# Error details, reduced to their type and message.
if obj.error:
result["error"] = {
"type": obj.error.type,
"message": obj.error.message,
}
if obj.document_metadata:
result["document-metadata"] = self.doc_metadata_translator.from_pulsar(obj.document_metadata)
# Bytes content is decoded as UTF-8; any other content value is passed
# through unchanged.
if obj.content:
result["content"] = obj.content.decode("utf-8") if isinstance(obj.content, bytes) else obj.content
# The two metadata lists are emitted whenever they are not None, so an
# empty list is preserved in the output.
if obj.document_metadatas is not None:
result["document-metadatas"] = [
self.doc_metadata_translator.from_pulsar(dm)
for dm in obj.document_metadatas
]
if obj.processing_metadatas is not None:
result["processing-metadatas"] = [
self.proc_metadata_translator.from_pulsar(pm)
for pm in obj.processing_metadatas
]
# Chunked upload response fields
# NOTE(review): these truthiness checks drop zero values, e.g. a
# chunk-index of 0 is omitted from the result; confirm that consumers
# treat a missing key as 0.
if obj.upload_id:
result["upload-id"] = obj.upload_id
if obj.chunk_size:
result["chunk-size"] = obj.chunk_size
if obj.total_chunks:
result["total-chunks"] = obj.total_chunks
if obj.chunk_index:
result["chunk-index"] = obj.chunk_index
if obj.chunks_received:
result["chunks-received"] = obj.chunks_received
if obj.bytes_received:
result["bytes-received"] = obj.bytes_received
if obj.total_bytes:
result["total-bytes"] = obj.total_bytes
if obj.document_id:
result["document-id"] = obj.document_id
if obj.object_id:
result["object-id"] = obj.object_id
if obj.upload_state:
result["upload-state"] = obj.upload_state
if obj.received_chunks:
result["received-chunks"] = obj.received_chunks
if obj.missing_chunks:
result["missing-chunks"] = obj.missing_chunks
# One dict per upload session; the key is omitted entirely when
# upload_sessions is empty or unset.
if obj.upload_sessions:
result["upload-sessions"] = [
{
"upload-id": s.upload_id,
"document-id": s.document_id,
"document-metadata-json": s.document_metadata_json,
"total-size": s.total_size,
"chunk-size": s.chunk_size,
"total-chunks": s.total_chunks,
"chunks-received": s.chunks_received,
"created-at": s.created_at,
}
for s in obj.upload_sessions
]
return result
def from_response_with_completion(self, obj: LibrarianResponse) -> Tuple[Dict[str, Any], bool]:

View file

@ -20,12 +20,14 @@ class DocumentMetadataTranslator(Translator):
comments=data.get("comments"),
metadata=self.subgraph_translator.to_pulsar(metadata) if metadata is not None else [],
user=data.get("user"),
tags=data.get("tags")
tags=data.get("tags"),
parent_id=data.get("parent-id", ""),
document_type=data.get("document-type", "source"),
)
def from_pulsar(self, obj: DocumentMetadata) -> Dict[str, Any]:
result = {}
if obj.id:
result["id"] = obj.id
if obj.time:
@ -42,7 +44,11 @@ class DocumentMetadataTranslator(Translator):
result["user"] = obj.user
if obj.tags is not None:
result["tags"] = obj.tags
if obj.parent_id:
result["parent-id"] = obj.parent_id
if obj.document_type:
result["document-type"] = obj.document_type
return result