Incremental / large document loading (#659)

Tech spec

BlobStore (trustgraph-flow/trustgraph/librarian/blob_store.py):
- get_stream() - yields document content in chunks for streaming retrieval
- create_multipart_upload() - initializes S3 multipart upload, returns
  upload_id
- upload_part() - uploads a single part, returns etag
- complete_multipart_upload() - finalizes upload with part etags
- abort_multipart_upload() - cancels and cleans up

Cassandra schema (trustgraph-flow/trustgraph/tables/library.py):
- New upload_session table with 24-hour TTL
- Index on user for listing sessions
- Prepared statements for all operations
- Methods: create_upload_session(), get_upload_session(),
  update_upload_session_chunk(), delete_upload_session(),
  list_upload_sessions()

- Schema extended with UploadSession, UploadProgress, and new
  request/response fields
- Librarian methods: begin_upload, upload_chunk, complete_upload,
  abort_upload, get_upload_status, list_uploads
- Service routing for all new operations
- Python SDK with transparent chunked upload:
  - add_document() auto-switches to chunked for files > 10MB
  - Progress callback support (on_progress)
  - get_pending_uploads(), get_upload_status(), abort_upload(),
    resume_upload()

- Document table: Added parent_id and document_type columns with index
- Document schema (knowledge/document.py): Added document_id field for
  streaming retrieval
- Librarian operations:
  - add-child-document for extracted PDF pages
  - list-children to get child documents
  - stream-document for chunked content retrieval
  - Cascade delete removes children when parent is deleted
  - list-documents filters children by default
- PDF decoder (decoding/pdf/pdf_decoder.py): Updated to stream large
  documents from librarian API to temp file
- Librarian service (librarian/service.py): Sends document_id instead of
  inline content for large PDF and text documents (>= 2MB)
- Deprecated tools (load_pdf.py, load_text.py): Added deprecation
  warnings directing users to tg-add-library-document +
  tg-start-library-processing

Remove load_pdf and load_text utils

Move chunker/librarian comms to base class

Updating tests
This commit is contained in:
cybermaggedon 2026-03-04 16:57:58 +00:00 committed by GitHub
parent a38ca9474f
commit a630e143ef
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
21 changed files with 3164 additions and 650 deletions

View file

@ -3,9 +3,12 @@ from .. knowledge import hash
from .. exceptions import RequestError
from minio import Minio
from minio.datatypes import Part
import time
import io
import logging
from typing import Iterator, List, Tuple
from uuid import UUID
# Module logger
logger = logging.getLogger(__name__)
@ -78,3 +81,141 @@ class BlobStore:
return resp.read()
def get_stream(self, object_id, chunk_size: int = 1024 * 1024) -> Iterator[bytes]:
    """
    Lazily read a stored document, yielding it piece by piece.

    The object is fetched from the bucket under the name
    "doc/<object_id>" and read chunk_size bytes at a time, so the
    caller can process the content without holding the whole document
    in memory.

    Args:
        object_id: Identifier of the document object; it is stringified
            to build the object name
        chunk_size: Number of bytes requested per read (default 1MB)

    Yields:
        Successive non-empty chunks of the document content
    """
    response = self.client.get_object(
        bucket_name=self.bucket_name,
        object_name="doc/" + str(object_id),
    )

    try:
        piece = response.read(chunk_size)
        # An empty read marks the end of the object.
        while piece:
            yield piece
            piece = response.read(chunk_size)
    finally:
        # Runs even when the consumer stops iterating early or a read
        # fails, so the response is always closed and released.
        response.close()
        response.release_conn()

    logger.debug("Stream complete")
def create_multipart_upload(self, object_id: UUID, kind: str) -> str:
    """
    Open a new multipart upload for a document object.

    Args:
        object_id: The UUID for the new object; stored as "doc/<object_id>"
        kind: MIME type of the document, sent as the Content-Type header

    Returns:
        The upload id handed back by the storage client for this
        multipart upload session
    """
    # Multipart initiation goes through the client's internal
    # (underscore-prefixed) method.
    s3_upload_id = self.client._create_multipart_upload(
        bucket_name=self.bucket_name,
        object_name="doc/" + str(object_id),
        headers={"Content-Type": kind},
    )

    logger.info(f"Created multipart upload {s3_upload_id} for {object_id}")

    return s3_upload_id
def upload_part(
    self,
    object_id: UUID,
    upload_id: str,
    part_number: int,
    data: bytes
) -> str:
    """
    Send one part of an open multipart upload to object storage.

    Args:
        object_id: The UUID of the object being uploaded
        upload_id: The upload id returned by create_multipart_upload
        part_number: Part number (1-indexed, as per S3 spec)
        data: The bytes that make up this part

    Returns:
        The ETag reported for this part; it must be supplied again to
        complete_multipart_upload
    """
    target_name = "doc/" + str(object_id)

    # The part length is passed explicitly as a Content-Length header.
    length_header = {"Content-Length": str(len(data))}

    part_etag = self.client._upload_part(
        bucket_name=self.bucket_name,
        object_name=target_name,
        data=data,
        headers=length_header,
        upload_id=upload_id,
        part_number=part_number,
    )

    logger.debug(f"Uploaded part {part_number} for {object_id}, etag={part_etag}")

    return part_etag
def complete_multipart_upload(
    self,
    object_id: UUID,
    upload_id: str,
    parts: List[Tuple[int, str]]
) -> None:
    """
    Finish a multipart upload so the uploaded parts become one object.

    Only part numbers and etags are sent here; the part data itself was
    already transferred by upload_part.

    Args:
        object_id: The UUID of the object
        upload_id: The upload id returned by create_multipart_upload
        parts: List of (part_number, etag) tuples in order
    """
    # The client call takes minio Part objects rather than plain tuples.
    minio_parts = []
    for number, part_etag in parts:
        minio_parts.append(Part(number, part_etag))

    self.client._complete_multipart_upload(
        bucket_name=self.bucket_name,
        object_name="doc/" + str(object_id),
        upload_id=upload_id,
        parts=minio_parts,
    )

    logger.info(f"Completed multipart upload for {object_id}")
def abort_multipart_upload(self, object_id: UUID, upload_id: str) -> None:
    """
    Cancel an open multipart upload for a document object.

    Args:
        object_id: The UUID of the object
        upload_id: The upload id returned by create_multipart_upload
    """
    target_name = "doc/" + str(object_id)

    self.client._abort_multipart_upload(
        bucket_name=self.bucket_name,
        object_name=target_name,
        upload_id=upload_id,
    )

    logger.info(f"Aborted multipart upload {upload_id} for {object_id}")

View file

@ -1,17 +1,24 @@
from .. schema import LibrarianRequest, LibrarianResponse, Error, Triple
from .. schema import UploadSession
from .. knowledge import hash
from .. exceptions import RequestError
from .. tables.library import LibraryTableStore
from . blob_store import BlobStore
import base64
import json
import logging
import math
import time
import uuid
# Module logger
logger = logging.getLogger(__name__)
# Default chunk size for multipart uploads (5MB - S3 minimum)
DEFAULT_CHUNK_SIZE = 5 * 1024 * 1024
class Librarian:
def __init__(
@ -66,13 +73,7 @@ class Librarian:
logger.debug("Add complete")
return LibrarianResponse(
error = None,
document_metadata = None,
content = None,
document_metadatas = None,
processing_metadatas = None,
)
return LibrarianResponse()
async def remove_document(self, request):
@ -84,6 +85,21 @@ class Librarian:
):
raise RuntimeError("Document does not exist")
# First, cascade delete all child documents
children = await self.table_store.list_children(request.document_id)
for child in children:
logger.debug(f"Cascade deleting child document {child.id}")
try:
child_object_id = await self.table_store.get_document_object_id(
child.user,
child.id
)
await self.blob_store.remove(child_object_id)
await self.table_store.remove_document(child.user, child.id)
except Exception as e:
logger.warning(f"Failed to delete child document {child.id}: {e}")
# Now remove the parent document
object_id = await self.table_store.get_document_object_id(
request.user,
request.document_id
@ -100,13 +116,7 @@ class Librarian:
logger.debug("Remove complete")
return LibrarianResponse(
error = None,
document_metadata = None,
content = None,
document_metadatas = None,
processing_metadatas = None,
)
return LibrarianResponse()
async def update_document(self, request):
@ -124,13 +134,7 @@ class Librarian:
logger.debug("Update complete")
return LibrarianResponse(
error = None,
document_metadata = None,
content = None,
document_metadatas = None,
processing_metadatas = None,
)
return LibrarianResponse()
async def get_document_metadata(self, request):
@ -147,8 +151,6 @@ class Librarian:
error = None,
document_metadata = doc,
content = None,
document_metadatas = None,
processing_metadatas = None,
)
async def get_document_content(self, request):
@ -170,8 +172,6 @@ class Librarian:
error = None,
document_metadata = None,
content = base64.b64encode(content),
document_metadatas = None,
processing_metadatas = None,
)
async def add_processing(self, request):
@ -217,13 +217,7 @@ class Librarian:
logger.debug("Add complete")
return LibrarianResponse(
error = None,
document_metadata = None,
content = None,
document_metadatas = None,
processing_metadatas = None,
)
return LibrarianResponse()
async def remove_processing(self, request):
@ -243,24 +237,22 @@ class Librarian:
logger.debug("Remove complete")
return LibrarianResponse(
error = None,
document_metadata = None,
content = None,
document_metadatas = None,
processing_metadatas = None,
)
return LibrarianResponse()
async def list_documents(self, request):
    """
    List the documents belonging to the requesting user.

    Child documents (those with a truthy parent_id) are left out unless
    the request carries a truthy include_children attribute.

    Returns:
        LibrarianResponse with the documents in document_metadatas
    """
    documents = await self.table_store.list_documents(request.user)

    # Requests without an include_children attribute behave as if it
    # were False.
    if not getattr(request, 'include_children', False):
        top_level = []
        for document in documents:
            if not document.parent_id:
                top_level.append(document)
        documents = top_level

    return LibrarianResponse(
        error = None,
        document_metadata = None,
        content = None,
        document_metadatas = documents,
        processing_metadatas = None,
    )
async def list_processing(self, request):
@ -268,10 +260,438 @@ class Librarian:
procs = await self.table_store.list_processing(request.user)
return LibrarianResponse(
error = None,
document_metadata = None,
content = None,
document_metadatas = None,
processing_metadatas = procs,
)
# Chunked upload operations
async def begin_upload(self, request):
    """
    Start a chunked upload session for a new document.

    Validates the request, opens a multipart upload in the blob store
    and records the session (including the serialized document
    metadata) in the table store.

    Returns:
        LibrarianResponse carrying the new upload_id, the chunk_size
        that will be used, and the resulting total_chunks

    Raises:
        RequestError: If the document kind is unsupported, the document
            already exists, or total_size is not positive
    """
    meta = request.document_metadata

    logger.info(f"Beginning chunked upload for document {meta.id}")

    if meta.kind not in ("text/plain", "application/pdf"):
        raise RequestError("Invalid document kind: " + meta.kind)

    if await self.table_store.document_exists(meta.user, meta.id):
        raise RequestError("Document already exists")

    total_size = request.total_size
    if total_size <= 0:
        raise RequestError("total_size must be positive")

    # Unset (non-positive) or too-small chunk sizes are raised to the
    # default, which is the 5MB S3 minimum.
    chunk_size = max(request.chunk_size, DEFAULT_CHUNK_SIZE)

    total_chunks = math.ceil(total_size / chunk_size)

    # upload_id names the session; object_id names the blob being built.
    upload_id = str(uuid.uuid4())
    object_id = uuid.uuid4()

    s3_upload_id = self.blob_store.create_multipart_upload(
        object_id, meta.kind
    )

    # The metadata is stored as JSON so complete_upload can rebuild it.
    stored_fields = ("id", "time", "kind", "title", "comments", "user", "tags")
    doc_meta_json = json.dumps({
        field: getattr(meta, field) for field in stored_fields
    })

    await self.table_store.create_upload_session(
        upload_id=upload_id,
        user=meta.user,
        document_id=meta.id,
        document_metadata=doc_meta_json,
        s3_upload_id=s3_upload_id,
        object_id=object_id,
        total_size=total_size,
        chunk_size=chunk_size,
        total_chunks=total_chunks,
    )

    logger.info(f"Created upload session {upload_id} with {total_chunks} chunks")

    return LibrarianResponse(
        error=None,
        upload_id=upload_id,
        chunk_size=chunk_size,
        total_chunks=total_chunks,
    )
async def upload_chunk(self, request):
    """
    Store one chunk of a chunked upload and report progress.

    The chunk is base64-decoded, forwarded to the blob store as a
    multipart part, and its etag is recorded against the upload session.

    Args:
        request: Request carrying upload_id, user, chunk_index (0-based)
            and base64-encoded content

    Returns:
        LibrarianResponse with the chunk index just stored and the
        progress counters chunks_received, total_chunks, bytes_received
        and total_bytes

    Raises:
        RequestError: If the session is missing or expired, belongs to
            a different user, or chunk_index is out of range
    """
    logger.debug(f"Uploading chunk {request.chunk_index} for upload {request.upload_id}")

    session = await self.table_store.get_upload_session(request.upload_id)
    if session is None:
        raise RequestError("Upload session not found or expired")

    # Only the user who opened the session may add chunks to it.
    if session["user"] != request.user:
        raise RequestError("Not authorized to upload to this session")

    total_chunks = session["total_chunks"]
    if request.chunk_index < 0 or request.chunk_index >= total_chunks:
        raise RequestError(
            f"Invalid chunk index {request.chunk_index}, "
            f"must be 0-{total_chunks-1}"
        )

    content = base64.b64decode(request.content)

    # Chunk indexes are 0-based; S3 part numbers are 1-indexed.
    etag = self.blob_store.upload_part(
        object_id=session["object_id"],
        upload_id=session["s3_upload_id"],
        part_number=request.chunk_index + 1,
        data=content,
    )

    await self.table_store.update_upload_session_chunk(
        upload_id=request.upload_id,
        chunk_index=request.chunk_index,
        etag=etag,
    )

    # The session was read before the update above, so fold this chunk
    # into that snapshot. A re-sent chunk is already present and must not
    # be counted again, so the count is simply the size of the map.
    chunks_received = session["chunks_received"]
    if request.chunk_index not in chunks_received:
        chunks_received[request.chunk_index] = etag
    num_chunks_received = len(chunks_received)

    # Byte progress is estimated as chunks * chunk_size and capped at
    # total_size, since the last chunk may be smaller than chunk_size.
    bytes_received = min(
        num_chunks_received * session["chunk_size"],
        session["total_size"],
    )

    logger.debug(f"Chunk {request.chunk_index} uploaded, {num_chunks_received}/{total_chunks} complete")

    return LibrarianResponse(
        error=None,
        upload_id=request.upload_id,
        chunk_index=request.chunk_index,
        chunks_received=num_chunks_received,
        total_chunks=total_chunks,
        bytes_received=bytes_received,
        total_bytes=session["total_size"],
    )
async def complete_upload(self, request):
    """
    Finish a chunked upload and register the resulting document.

    Checks that every chunk has arrived, completes the multipart upload
    in the blob store, rebuilds the document metadata saved by
    begin_upload, adds the document to the table store and finally
    deletes the upload session.

    Returns:
        LibrarianResponse with the new document_id and its object_id
        (as a string)

    Raises:
        RequestError: If the session is missing or expired, belongs to
            a different user, or chunks are still missing
    """
    logger.info(f"Completing upload {request.upload_id}")

    session = await self.table_store.get_upload_session(request.upload_id)
    if session is None:
        raise RequestError("Upload session not found or expired")

    if session["user"] != request.user:
        raise RequestError("Not authorized to complete this upload")

    chunks_received = session["chunks_received"]
    if len(chunks_received) != session["total_chunks"]:
        missing = []
        for index in range(session["total_chunks"]):
            if index not in chunks_received:
                missing.append(index)
        # Only the first ten missing indexes are reported.
        ellipsis = '...' if len(missing) > 10 else ''
        raise RequestError(
            f"Missing chunks: {missing[:10]}{ellipsis}"
        )

    # Parts go to S3 ordered by chunk index, shifted to 1-based numbers.
    parts = []
    for chunk_index, etag in sorted(chunks_received.items()):
        parts.append((chunk_index + 1, etag))

    self.blob_store.complete_multipart_upload(
        object_id=session["object_id"],
        upload_id=session["s3_upload_id"],
        parts=parts,
    )

    stored = json.loads(session["document_metadata"])

    from .. schema import DocumentMetadata

    doc_metadata = DocumentMetadata(
        id=stored["id"],
        time=stored.get("time", int(time.time())),
        kind=stored["kind"],
        title=stored.get("title", ""),
        comments=stored.get("comments", ""),
        user=stored["user"],
        tags=stored.get("tags", []),
        metadata=[],  # Triples not supported in chunked upload yet
    )

    await self.table_store.add_document(doc_metadata, session["object_id"])

    # The session is removed only after the document has been recorded.
    await self.table_store.delete_upload_session(request.upload_id)

    logger.info(f"Upload {request.upload_id} completed, document {doc_metadata.id} created")

    return LibrarianResponse(
        error=None,
        document_id=doc_metadata.id,
        object_id=str(session["object_id"]),
    )
async def abort_upload(self, request):
    """
    Abandon a chunked upload.

    Aborts the multipart upload in the blob store and then deletes the
    upload session from the table store.

    Raises:
        RequestError: If the session is missing or expired, or belongs
            to a different user
    """
    upload_id = request.upload_id

    logger.info(f"Aborting upload {upload_id}")

    session = await self.table_store.get_upload_session(upload_id)
    if session is None:
        raise RequestError("Upload session not found or expired")

    if session["user"] != request.user:
        raise RequestError("Not authorized to abort this upload")

    # Blob store first, then the session record.
    self.blob_store.abort_multipart_upload(
        object_id=session["object_id"],
        upload_id=session["s3_upload_id"],
    )
    await self.table_store.delete_upload_session(upload_id)

    logger.info(f"Upload {upload_id} aborted")

    return LibrarianResponse(error=None)
async def get_upload_status(self, request):
    """
    Report how far an upload session has progressed.

    A session that cannot be found is reported with upload_state
    "expired" rather than raising. Otherwise the response lists the
    received and missing chunk indexes along with chunk and byte
    counters, with upload_state "in-progress".

    Raises:
        RequestError: If the session belongs to a different user
    """
    logger.debug(f"Getting status for upload {request.upload_id}")

    session = await self.table_store.get_upload_session(request.upload_id)

    if session is None:
        return LibrarianResponse(
            error=None,
            upload_id=request.upload_id,
            upload_state="expired",
        )

    if session["user"] != request.user:
        raise RequestError("Not authorized to view this upload")

    chunks_received = session["chunks_received"]

    received_list = sorted(chunks_received.keys())
    missing_list = []
    for index in range(session["total_chunks"]):
        if index not in chunks_received:
            missing_list.append(index)

    # Estimated as chunks * chunk_size, capped at the declared total.
    bytes_received = min(
        len(chunks_received) * session["chunk_size"],
        session["total_size"],
    )

    return LibrarianResponse(
        error=None,
        upload_id=request.upload_id,
        upload_state="in-progress",
        received_chunks=received_list,
        missing_chunks=missing_list,
        chunks_received=len(chunks_received),
        total_chunks=session["total_chunks"],
        bytes_received=bytes_received,
        total_bytes=session["total_size"],
    )
async def list_uploads(self, request):
    """
    Return the upload sessions recorded for the requesting user.

    Each stored session is converted to an UploadSession; a missing
    document_metadata or created_at value becomes an empty string.
    """
    logger.debug(f"Listing uploads for user {request.user}")

    sessions = await self.table_store.list_upload_sessions(request.user)

    upload_sessions = []
    for entry in sessions:
        upload_sessions.append(
            UploadSession(
                upload_id=entry["upload_id"],
                document_id=entry["document_id"],
                document_metadata_json=entry.get("document_metadata", ""),
                total_size=entry["total_size"],
                chunk_size=entry["chunk_size"],
                total_chunks=entry["total_chunks"],
                chunks_received=entry["chunks_received"],
                created_at=str(entry.get("created_at", "")),
            )
        )

    return LibrarianResponse(
        error=None,
        upload_sessions=upload_sessions,
    )
# Child document operations
async def add_child_document(self, request):
    """
    Store a document that is derived from an existing parent document.

    Child documents are typically extracted content (e.g., pages from a
    PDF). The request's metadata must name a parent_id that exists for
    the same user; document_type is forced to "extracted" before the
    content is written to the blob store and the metadata to the table
    store.

    Returns:
        LibrarianResponse carrying the child's document_id

    Raises:
        RequestError: If parent_id is missing, the parent does not
            exist, or a document with the child's id already exists
    """
    meta = request.document_metadata

    logger.info(f"Adding child document {meta.id} "
                f"for parent {meta.parent_id}")

    if not meta.parent_id:
        raise RequestError("parent_id is required for child documents")

    parent_exists = await self.table_store.document_exists(
        meta.user, meta.parent_id
    )
    if not parent_exists:
        raise RequestError(
            f"Parent document {meta.parent_id} does not exist"
        )

    if await self.table_store.document_exists(meta.user, meta.id):
        raise RequestError("Document already exists")

    # Whatever the caller sent, a child is always typed "extracted".
    meta.document_type = "extracted"

    object_id = uuid.uuid4()

    logger.debug("Adding blob...")

    await self.blob_store.add(
        object_id, base64.b64decode(request.content), meta.kind
    )

    logger.debug("Adding to table...")

    await self.table_store.add_document(meta, object_id)

    logger.debug("Add child document complete")

    return LibrarianResponse(
        error=None,
        document_id=meta.id,
    )
async def list_children(self, request):
    """
    Return the child documents recorded for request.document_id.

    The children are looked up in the table store by parent id and
    returned in the response's document_metadatas field.
    """
    parent_id = request.document_id

    logger.debug(f"Listing children for parent {parent_id}")

    return LibrarianResponse(
        error=None,
        document_metadatas=await self.table_store.list_children(parent_id),
    )
async def stream_document(self, request):
    """
    Return one chunk of a stored document.

    Each call returns a single chunk; clients call repeatedly with an
    increasing chunk_index until total_chunks chunks have been received.

    Note: the whole object is fetched from the blob store on every call
    and then sliced, so this keeps responses small but does not reduce
    the memory used by this service. True streaming would need range
    requests on the object storage.

    Args:
        request: Request carrying user, document_id, chunk_index
            (0-based) and an optional chunk_size (a non-positive value
            selects the 1MB default)

    Returns:
        LibrarianResponse with the base64-encoded chunk in content, the
        chunk_index, total_chunks, bytes_received (offset just past the
        end of this chunk) and total_bytes

    Raises:
        RequestError: If chunk_index is negative or not below the
            number of chunks in the document
    """
    logger.debug(f"Streaming document {request.document_id}, chunk {request.chunk_index}")

    object_id = await self.table_store.get_document_object_id(
        request.user,
        request.document_id
    )

    # Default chunk size of 1MB
    chunk_size = request.chunk_size if request.chunk_size > 0 else 1024 * 1024

    content = await self.blob_store.get(object_id)
    total_size = len(content)
    total_chunks = math.ceil(total_size / chunk_size)

    # Negative indexes are rejected as well: they would otherwise turn
    # into a negative slice start and silently return wrong or empty data.
    if request.chunk_index < 0 or request.chunk_index >= total_chunks:
        raise RequestError(
            f"Invalid chunk index {request.chunk_index}, "
            f"document has {total_chunks} chunks"
        )

    start = request.chunk_index * chunk_size
    # The final chunk may be shorter than chunk_size.
    end = min(start + chunk_size, total_size)
    chunk_content = content[start:end]

    logger.debug(f"Returning chunk {request.chunk_index}/{total_chunks}, "
                 f"bytes {start}-{end} of {total_size}")

    return LibrarianResponse(
        error=None,
        content=base64.b64encode(chunk_content),
        chunk_index=request.chunk_index,
        chunks_received=1,  # Using as "current chunk" indicator
        total_chunks=total_chunks,
        bytes_received=end,
        total_bytes=total_size,
    )

View file

@ -271,6 +271,9 @@ class Processor(AsyncProcessor):
pass
# Threshold for sending document_id instead of inline content (2MB)
STREAMING_THRESHOLD = 2 * 1024 * 1024
async def load_document(self, document, processing, content):
logger.debug("Ready for document processing...")
@ -292,26 +295,57 @@ class Processor(AsyncProcessor):
q = flow["interfaces"][kind]
if kind == "text-load":
doc = TextDocument(
metadata = Metadata(
id = document.id,
metadata = document.metadata,
user = processing.user,
collection = processing.collection
),
text = content,
)
# For large text documents, send document_id for streaming retrieval
if len(content) >= self.STREAMING_THRESHOLD:
logger.info(f"Text document {document.id} is large ({len(content)} bytes), "
f"sending document_id for streaming retrieval")
doc = TextDocument(
metadata = Metadata(
id = document.id,
metadata = document.metadata,
user = processing.user,
collection = processing.collection
),
document_id = document.id,
text = b"", # Empty, receiver will fetch via librarian
)
else:
doc = TextDocument(
metadata = Metadata(
id = document.id,
metadata = document.metadata,
user = processing.user,
collection = processing.collection
),
text = content,
)
schema = TextDocument
else:
doc = Document(
metadata = Metadata(
id = document.id,
metadata = document.metadata,
user = processing.user,
collection = processing.collection
),
data = base64.b64encode(content).decode("utf-8")
)
# For large PDF documents, send document_id for streaming retrieval
# instead of embedding the entire content in the message
if len(content) >= self.STREAMING_THRESHOLD:
logger.info(f"Document {document.id} is large ({len(content)} bytes), "
f"sending document_id for streaming retrieval")
doc = Document(
metadata = Metadata(
id = document.id,
metadata = document.metadata,
user = processing.user,
collection = processing.collection
),
document_id = document.id,
data = b"", # Empty data, receiver will fetch via API
)
else:
doc = Document(
metadata = Metadata(
id = document.id,
metadata = document.metadata,
user = processing.user,
collection = processing.collection
),
data = base64.b64encode(content).decode("utf-8")
)
schema = Document
logger.debug(f"Submitting to queue {q}...")
@ -361,6 +395,17 @@ class Processor(AsyncProcessor):
"remove-processing": self.librarian.remove_processing,
"list-documents": self.librarian.list_documents,
"list-processing": self.librarian.list_processing,
# Chunked upload operations
"begin-upload": self.librarian.begin_upload,
"upload-chunk": self.librarian.upload_chunk,
"complete-upload": self.librarian.complete_upload,
"abort-upload": self.librarian.abort_upload,
"get-upload-status": self.librarian.get_upload_status,
"list-uploads": self.librarian.list_uploads,
# Child document and streaming operations
"add-child-document": self.librarian.add_child_document,
"list-children": self.librarian.list_children,
"stream-document": self.librarian.stream_document,
}
if v.operation not in impls: