mirror of
https://github.com/trustgraph-ai/trustgraph.git
synced 2026-07-02 02:58:10 +02:00
Incremental / large document loading (#659)
Tech spec
BlobStore (trustgraph-flow/trustgraph/librarian/blob_store.py):
- get_stream() - yields document content in chunks for streaming retrieval
- create_multipart_upload() - initializes S3 multipart upload, returns
upload_id
- upload_part() - uploads a single part, returns etag
- complete_multipart_upload() - finalizes upload with part etags
- abort_multipart_upload() - cancels and cleans up
Cassandra schema (trustgraph-flow/trustgraph/tables/library.py):
- New upload_session table with 24-hour TTL
- Index on user for listing sessions
- Prepared statements for all operations
- Methods: create_upload_session(), get_upload_session(),
update_upload_session_chunk(), delete_upload_session(),
list_upload_sessions()
- Schema extended with UploadSession, UploadProgress, and new
request/response fields
- Librarian methods: begin_upload, upload_chunk, complete_upload,
abort_upload, get_upload_status, list_uploads
- Service routing for all new operations
- Python SDK with transparent chunked upload:
- add_document() auto-switches to chunked for files > 10MB
- Progress callback support (on_progress)
- get_pending_uploads(), get_upload_status(), abort_upload(),
resume_upload()
- Document table: Added parent_id and document_type columns with index
- Document schema (knowledge/document.py): Added document_id field for
streaming retrieval
- Librarian operations:
- add-child-document for extracted PDF pages
- list-children to get child documents
- stream-document for chunked content retrieval
- Cascade delete removes children when parent is deleted
- list-documents filters children by default
- PDF decoder (decoding/pdf/pdf_decoder.py): Updated to stream large
documents from librarian API to temp file
- Librarian service (librarian/service.py): Sends document_id instead of
inline content for large text and PDF documents (>= 2MB)
- Deprecated tools (load_pdf.py, load_text.py): Added deprecation
warnings directing users to tg-add-library-document +
tg-start-library-processing
Remove load_pdf and load_text utils
Move chunker/librarian comms to base class
Updating tests
This commit is contained in:
parent
a38ca9474f
commit
a630e143ef
21 changed files with 3164 additions and 650 deletions
|
|
@ -3,9 +3,12 @@ from .. knowledge import hash
|
|||
from .. exceptions import RequestError
|
||||
|
||||
from minio import Minio
|
||||
from minio.datatypes import Part
|
||||
import time
|
||||
import io
|
||||
import logging
|
||||
from typing import Iterator, List, Tuple
|
||||
from uuid import UUID
|
||||
|
||||
# Module logger
|
||||
logger = logging.getLogger(__name__)
|
||||
|
|
@ -78,3 +81,141 @@ class BlobStore:
|
|||
|
||||
return resp.read()
|
||||
|
||||
def get_stream(self, object_id, chunk_size: int = 1024 * 1024) -> Iterator[bytes]:
    """
    Lazily read a stored document, one chunk at a time.

    The object is fetched from the bucket under the "doc/" prefix and
    handed back piece by piece, so callers never need to hold the whole
    document in memory.

    Args:
        object_id: The UUID of the document object
        chunk_size: Maximum number of bytes per yielded chunk (default 1MB)

    Yields:
        Consecutive, non-empty chunks of the document content as bytes
    """
    response = self.client.get_object(
        bucket_name=self.bucket_name,
        object_name=f"doc/{object_id!s}",
    )

    try:
        # An empty read marks the end of the object.
        while chunk := response.read(chunk_size):
            yield chunk
    finally:
        # Runs on exhaustion, on error, and when the consumer closes the
        # generator early.
        response.close()
        response.release_conn()

    logger.debug("Stream complete")
|
||||
|
||||
def create_multipart_upload(self, object_id: UUID, kind: str) -> str:
    """
    Start a multipart upload for a new document object.

    Args:
        object_id: The UUID for the new object
        kind: MIME type of the document, sent as the Content-Type header

    Returns:
        The S3 upload_id identifying this multipart upload session
    """
    # The client's underscore-prefixed (non-public) helper is called
    # directly to obtain the upload id.
    upload_id = self.client._create_multipart_upload(
        bucket_name=self.bucket_name,
        object_name=f"doc/{object_id!s}",
        headers={"Content-Type": kind},
    )

    logger.info(f"Created multipart upload {upload_id} for {object_id}")

    return upload_id
|
||||
|
||||
def upload_part(
    self,
    object_id: UUID,
    upload_id: str,
    part_number: int,
    data: bytes
) -> str:
    """
    Send one part of an in-progress multipart upload.

    Args:
        object_id: The UUID of the object being uploaded
        upload_id: The S3 upload_id from create_multipart_upload
        part_number: Part number (1-indexed, as per S3 spec)
        data: The bytes making up this part

    Returns:
        The ETag for this part (needed for complete_multipart_upload)
    """
    target = f"doc/{object_id!s}"
    size_header = {"Content-Length": str(len(data))}

    etag = self.client._upload_part(
        bucket_name=self.bucket_name,
        object_name=target,
        data=data,
        headers=size_header,
        upload_id=upload_id,
        part_number=part_number,
    )

    logger.debug(f"Uploaded part {part_number} for {object_id}, etag={etag}")

    return etag
|
||||
|
||||
def complete_multipart_upload(
    self,
    object_id: UUID,
    upload_id: str,
    parts: List[Tuple[int, str]]
) -> None:
    """
    Finish a multipart upload so the parts become the final object.

    The parts are assembled by the object store itself; no part data
    passes through this client at this stage.

    Args:
        object_id: The UUID of the object
        upload_id: The S3 upload_id from create_multipart_upload
        parts: List of (part_number, etag) tuples in order
    """
    # The minio client expects Part objects rather than plain tuples.
    completed_parts = []
    for part_number, etag in parts:
        completed_parts.append(Part(part_number, etag))

    self.client._complete_multipart_upload(
        bucket_name=self.bucket_name,
        object_name=f"doc/{object_id!s}",
        upload_id=upload_id,
        parts=completed_parts,
    )

    logger.info(f"Completed multipart upload for {object_id}")
|
||||
|
||||
def abort_multipart_upload(self, object_id: UUID, upload_id: str) -> None:
    """
    Cancel a multipart upload so its already-uploaded parts are discarded.

    Args:
        object_id: The UUID of the object
        upload_id: The S3 upload_id from create_multipart_upload
    """
    target = f"doc/{object_id!s}"

    self.client._abort_multipart_upload(
        bucket_name=self.bucket_name,
        object_name=target,
        upload_id=upload_id,
    )

    logger.info(f"Aborted multipart upload {upload_id} for {object_id}")
|
||||
|
||||
|
|
|
|||
|
|
@ -1,17 +1,24 @@
|
|||
|
||||
from .. schema import LibrarianRequest, LibrarianResponse, Error, Triple
|
||||
from .. schema import UploadSession
|
||||
from .. knowledge import hash
|
||||
from .. exceptions import RequestError
|
||||
from .. tables.library import LibraryTableStore
|
||||
from . blob_store import BlobStore
|
||||
import base64
|
||||
import json
|
||||
import logging
|
||||
import math
|
||||
import time
|
||||
|
||||
import uuid
|
||||
|
||||
# Module logger
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Default chunk size for multipart uploads (5MB - S3 minimum)
|
||||
DEFAULT_CHUNK_SIZE = 5 * 1024 * 1024
|
||||
|
||||
class Librarian:
|
||||
|
||||
def __init__(
|
||||
|
|
@ -66,13 +73,7 @@ class Librarian:
|
|||
|
||||
logger.debug("Add complete")
|
||||
|
||||
return LibrarianResponse(
|
||||
error = None,
|
||||
document_metadata = None,
|
||||
content = None,
|
||||
document_metadatas = None,
|
||||
processing_metadatas = None,
|
||||
)
|
||||
return LibrarianResponse()
|
||||
|
||||
async def remove_document(self, request):
|
||||
|
||||
|
|
@ -84,6 +85,21 @@ class Librarian:
|
|||
):
|
||||
raise RuntimeError("Document does not exist")
|
||||
|
||||
# First, cascade delete all child documents
|
||||
children = await self.table_store.list_children(request.document_id)
|
||||
for child in children:
|
||||
logger.debug(f"Cascade deleting child document {child.id}")
|
||||
try:
|
||||
child_object_id = await self.table_store.get_document_object_id(
|
||||
child.user,
|
||||
child.id
|
||||
)
|
||||
await self.blob_store.remove(child_object_id)
|
||||
await self.table_store.remove_document(child.user, child.id)
|
||||
except Exception as e:
|
||||
logger.warning(f"Failed to delete child document {child.id}: {e}")
|
||||
|
||||
# Now remove the parent document
|
||||
object_id = await self.table_store.get_document_object_id(
|
||||
request.user,
|
||||
request.document_id
|
||||
|
|
@ -100,13 +116,7 @@ class Librarian:
|
|||
|
||||
logger.debug("Remove complete")
|
||||
|
||||
return LibrarianResponse(
|
||||
error = None,
|
||||
document_metadata = None,
|
||||
content = None,
|
||||
document_metadatas = None,
|
||||
processing_metadatas = None,
|
||||
)
|
||||
return LibrarianResponse()
|
||||
|
||||
async def update_document(self, request):
|
||||
|
||||
|
|
@ -124,13 +134,7 @@ class Librarian:
|
|||
|
||||
logger.debug("Update complete")
|
||||
|
||||
return LibrarianResponse(
|
||||
error = None,
|
||||
document_metadata = None,
|
||||
content = None,
|
||||
document_metadatas = None,
|
||||
processing_metadatas = None,
|
||||
)
|
||||
return LibrarianResponse()
|
||||
|
||||
async def get_document_metadata(self, request):
|
||||
|
||||
|
|
@ -147,8 +151,6 @@ class Librarian:
|
|||
error = None,
|
||||
document_metadata = doc,
|
||||
content = None,
|
||||
document_metadatas = None,
|
||||
processing_metadatas = None,
|
||||
)
|
||||
|
||||
async def get_document_content(self, request):
|
||||
|
|
@ -170,8 +172,6 @@ class Librarian:
|
|||
error = None,
|
||||
document_metadata = None,
|
||||
content = base64.b64encode(content),
|
||||
document_metadatas = None,
|
||||
processing_metadatas = None,
|
||||
)
|
||||
|
||||
async def add_processing(self, request):
|
||||
|
|
@ -217,13 +217,7 @@ class Librarian:
|
|||
|
||||
logger.debug("Add complete")
|
||||
|
||||
return LibrarianResponse(
|
||||
error = None,
|
||||
document_metadata = None,
|
||||
content = None,
|
||||
document_metadatas = None,
|
||||
processing_metadatas = None,
|
||||
)
|
||||
return LibrarianResponse()
|
||||
|
||||
async def remove_processing(self, request):
|
||||
|
||||
|
|
@ -243,24 +237,22 @@ class Librarian:
|
|||
|
||||
logger.debug("Remove complete")
|
||||
|
||||
return LibrarianResponse(
|
||||
error = None,
|
||||
document_metadata = None,
|
||||
content = None,
|
||||
document_metadatas = None,
|
||||
processing_metadatas = None,
|
||||
)
|
||||
return LibrarianResponse()
|
||||
|
||||
async def list_documents(self, request):
|
||||
|
||||
docs = await self.table_store.list_documents(request.user)
|
||||
|
||||
# Filter out child documents by default unless include_children is True
|
||||
include_children = getattr(request, 'include_children', False)
|
||||
if not include_children:
|
||||
docs = [
|
||||
doc for doc in docs
|
||||
if not doc.parent_id # Only include top-level documents
|
||||
]
|
||||
|
||||
return LibrarianResponse(
|
||||
error = None,
|
||||
document_metadata = None,
|
||||
content = None,
|
||||
document_metadatas = docs,
|
||||
processing_metadatas = None,
|
||||
)
|
||||
|
||||
async def list_processing(self, request):
|
||||
|
|
@ -268,10 +260,438 @@ class Librarian:
|
|||
procs = await self.table_store.list_processing(request.user)
|
||||
|
||||
return LibrarianResponse(
|
||||
error = None,
|
||||
document_metadata = None,
|
||||
content = None,
|
||||
document_metadatas = None,
|
||||
processing_metadatas = procs,
|
||||
)
|
||||
|
||||
# Chunked upload operations
|
||||
|
||||
async def begin_upload(self, request):
    """
    Initialize a chunked upload session.

    Validates the request, creates an S3 multipart upload and stores the
    session state in Cassandra. If the session cannot be stored, the
    multipart upload is aborted again so no orphaned upload is left in
    the blob store.

    Args:
        request: Librarian request carrying document_metadata, total_size
            and an optional chunk_size (values below the 5MB minimum are
            raised to DEFAULT_CHUNK_SIZE).

    Returns:
        LibrarianResponse with the new upload_id, the effective
        chunk_size and the number of chunks the client must send.

    Raises:
        RequestError: If the document kind is unsupported, the document
            already exists, total_size is not positive, or the upload
            would need more parts than S3 allows.
    """
    meta = request.document_metadata

    logger.info(f"Beginning chunked upload for document {meta.id}")

    if meta.kind not in ("text/plain", "application/pdf"):
        raise RequestError(
            "Invalid document kind: " + meta.kind
        )

    if await self.table_store.document_exists(meta.user, meta.id):
        raise RequestError("Document already exists")

    # Validate sizes
    total_size = request.total_size
    if total_size <= 0:
        raise RequestError("total_size must be positive")

    # Use the provided chunk size, but never less than the S3 minimum
    # part size (which is also the default).
    chunk_size = max(request.chunk_size, DEFAULT_CHUNK_SIZE)

    # Calculate total chunks
    total_chunks = math.ceil(total_size / chunk_size)

    # S3 multipart uploads accept at most 10,000 parts. Reject here rather
    # than failing after the client has already sent most of the data.
    max_parts = 10000
    if total_chunks > max_parts:
        raise RequestError(
            f"Upload would need {total_chunks} chunks, more than the "
            f"{max_parts} parts allowed; use a larger chunk_size"
        )

    # Generate IDs
    upload_id = str(uuid.uuid4())
    object_id = uuid.uuid4()

    # Create S3 multipart upload
    s3_upload_id = self.blob_store.create_multipart_upload(
        object_id, meta.kind
    )

    # Serialize document metadata for storage
    doc_meta_json = json.dumps({
        "id": meta.id,
        "time": meta.time,
        "kind": meta.kind,
        "title": meta.title,
        "comments": meta.comments,
        "user": meta.user,
        "tags": meta.tags,
    })

    # Store session in Cassandra
    try:
        await self.table_store.create_upload_session(
            upload_id=upload_id,
            user=meta.user,
            document_id=meta.id,
            document_metadata=doc_meta_json,
            s3_upload_id=s3_upload_id,
            object_id=object_id,
            total_size=total_size,
            chunk_size=chunk_size,
            total_chunks=total_chunks,
        )
    except Exception:
        # Without a session row nothing can later complete or abort the
        # multipart upload, so clean it up now (best effort) and re-raise
        # the original failure.
        try:
            self.blob_store.abort_multipart_upload(
                object_id=object_id,
                upload_id=s3_upload_id,
            )
        except Exception as abort_error:
            logger.warning(
                f"Failed to abort multipart upload {s3_upload_id}: "
                f"{abort_error}"
            )
        raise

    logger.info(f"Created upload session {upload_id} with {total_chunks} chunks")

    return LibrarianResponse(
        error=None,
        upload_id=upload_id,
        chunk_size=chunk_size,
        total_chunks=total_chunks,
    )
|
||||
|
||||
async def upload_chunk(self, request):
    """
    Upload a single chunk of a document.

    Forwards the chunk to S3 as one multipart part, records its ETag in
    the upload session and reports progress.

    Args:
        request: Librarian request carrying upload_id, user, chunk_index
            (0-based) and base64-encoded content.

    Returns:
        LibrarianResponse with the chunk index just stored, the number of
        distinct chunks received so far, and byte progress.

    Raises:
        RequestError: If the session is missing or expired, belongs to
            another user, or chunk_index is out of range.
    """
    logger.debug(f"Uploading chunk {request.chunk_index} for upload {request.upload_id}")

    # Get session
    session = await self.table_store.get_upload_session(request.upload_id)
    if session is None:
        raise RequestError("Upload session not found or expired")

    # Validate ownership
    if session["user"] != request.user:
        raise RequestError("Not authorized to upload to this session")

    total_chunks = session["total_chunks"]
    chunk_size = session["chunk_size"]
    total_size = session["total_size"]

    # Validate chunk index
    if request.chunk_index < 0 or request.chunk_index >= total_chunks:
        raise RequestError(
            f"Invalid chunk index {request.chunk_index}, "
            f"must be 0-{total_chunks-1}"
        )

    # Decode content
    content = base64.b64decode(request.content)

    # Upload to S3 (part numbers are 1-indexed in S3)
    part_number = request.chunk_index + 1
    etag = self.blob_store.upload_part(
        object_id=session["object_id"],
        upload_id=session["s3_upload_id"],
        part_number=part_number,
        data=content,
    )

    # Update session with chunk info
    await self.table_store.update_upload_session_chunk(
        upload_id=request.upload_id,
        chunk_index=request.chunk_index,
        etag=etag,
    )

    # Calculate progress. The session was read before this chunk was
    # recorded, so add the current index to the set of known indexes; a
    # re-uploaded chunk is therefore counted only once.
    # NOTE(review): "or ()" guards against the store returning None for
    # an empty map - confirm against get_upload_session.
    received = set(session["chunks_received"] or ())
    received.add(request.chunk_index)
    num_chunks_received = len(received)

    # Every chunk is chunk_size bytes except the final one, which holds
    # whatever remains of total_size.
    bytes_received = num_chunks_received * chunk_size
    if (total_chunks - 1) in received:
        bytes_received -= total_chunks * chunk_size - total_size
    bytes_received = min(bytes_received, total_size)

    logger.debug(f"Chunk {request.chunk_index} uploaded, {num_chunks_received}/{session['total_chunks']} complete")

    return LibrarianResponse(
        error=None,
        upload_id=request.upload_id,
        chunk_index=request.chunk_index,
        chunks_received=num_chunks_received,
        total_chunks=total_chunks,
        bytes_received=bytes_received,
        total_bytes=total_size,
    )
|
||||
|
||||
async def complete_upload(self, request):
    """
    Finalize a chunked upload and create the document.

    Checks that every chunk has arrived, completes the S3 multipart
    upload, registers the document built from the metadata saved in the
    session, and finally removes the session.

    Raises:
        RequestError: If the session is missing or expired, belongs to
            another user, or chunks are still missing.
    """
    logger.info(f"Completing upload {request.upload_id}")

    # Get session
    session = await self.table_store.get_upload_session(request.upload_id)
    if session is None:
        raise RequestError("Upload session not found or expired")

    # Validate ownership
    if session["user"] != request.user:
        raise RequestError("Not authorized to complete this upload")

    # Verify all chunks received
    chunks_received = session["chunks_received"]
    if len(chunks_received) != session["total_chunks"]:
        missing = []
        for index in range(session["total_chunks"]):
            if index not in chunks_received:
                missing.append(index)
        # Only the first ten missing indexes are reported.
        raise RequestError(
            f"Missing chunks: {missing[:10]}{'...' if len(missing) > 10 else ''}"
        )

    # Build parts list for S3, ordered by chunk index; S3 part numbers
    # are 1-indexed.
    parts = []
    for chunk_index, etag in sorted(chunks_received.items()):
        parts.append((chunk_index + 1, etag))

    # Complete S3 multipart upload
    stored_object_id = session["object_id"]
    self.blob_store.complete_multipart_upload(
        object_id=stored_object_id,
        upload_id=session["s3_upload_id"],
        parts=parts,
    )

    # Rebuild the document metadata saved when the upload began
    saved = json.loads(session["document_metadata"])

    from .. schema import DocumentMetadata
    doc_metadata = DocumentMetadata(
        id=saved["id"],
        time=saved.get("time", int(time.time())),
        kind=saved["kind"],
        title=saved.get("title", ""),
        comments=saved.get("comments", ""),
        user=saved["user"],
        tags=saved.get("tags", []),
        metadata=[],  # Triples not supported in chunked upload yet
    )

    # Add document to table
    await self.table_store.add_document(doc_metadata, session["object_id"])

    # Delete upload session
    await self.table_store.delete_upload_session(request.upload_id)

    logger.info(f"Upload {request.upload_id} completed, document {doc_metadata.id} created")

    return LibrarianResponse(
        error=None,
        document_id=doc_metadata.id,
        object_id=str(session["object_id"]),
    )
|
||||
|
||||
async def abort_upload(self, request):
    """
    Cancel a chunked upload and clean up resources.

    Aborts the S3 multipart upload first, then deletes the session row.

    Raises:
        RequestError: If the session is missing or expired, or belongs
            to another user.
    """
    target_upload = request.upload_id

    logger.info(f"Aborting upload {request.upload_id}")

    # Get session
    session = await self.table_store.get_upload_session(target_upload)
    if session is None:
        raise RequestError("Upload session not found or expired")

    # Validate ownership
    if session["user"] != request.user:
        raise RequestError("Not authorized to abort this upload")

    # Abort S3 multipart upload
    self.blob_store.abort_multipart_upload(
        object_id=session["object_id"],
        upload_id=session["s3_upload_id"],
    )

    # Delete session from Cassandra
    await self.table_store.delete_upload_session(request.upload_id)

    logger.info(f"Upload {request.upload_id} aborted")

    return LibrarianResponse(error=None)
|
||||
|
||||
async def get_upload_status(self, request):
    """
    Get the status of an in-progress upload.

    Args:
        request: Librarian request carrying upload_id and user.

    Returns:
        LibrarianResponse with upload_state "expired" when no session is
        found, otherwise "in-progress" together with the received and
        missing chunk indexes and chunk/byte progress.

    Raises:
        RequestError: If the session belongs to another user.
    """
    logger.debug(f"Getting status for upload {request.upload_id}")

    # Get session
    session = await self.table_store.get_upload_session(request.upload_id)
    if session is None:
        return LibrarianResponse(
            error=None,
            upload_id=request.upload_id,
            upload_state="expired",
        )

    # Validate ownership
    if session["user"] != request.user:
        raise RequestError("Not authorized to view this upload")

    total_chunks = session["total_chunks"]
    chunk_size = session["chunk_size"]
    total_size = session["total_size"]

    # NOTE(review): "or {}" guards against the store returning None for
    # an empty map - confirm against get_upload_session.
    chunks_received = session["chunks_received"] or {}

    received_list = sorted(chunks_received)
    missing_list = [
        i for i in range(total_chunks)
        if i not in chunks_received
    ]

    # Every chunk is chunk_size bytes except the final one, which holds
    # whatever remains of total_size. Only discount the shortfall when
    # the final chunk is actually among those received.
    bytes_received = len(chunks_received) * chunk_size
    if (total_chunks - 1) in chunks_received:
        bytes_received -= total_chunks * chunk_size - total_size
    bytes_received = min(bytes_received, total_size)

    return LibrarianResponse(
        error=None,
        upload_id=request.upload_id,
        upload_state="in-progress",
        received_chunks=received_list,
        missing_chunks=missing_list,
        chunks_received=len(chunks_received),
        total_chunks=total_chunks,
        bytes_received=bytes_received,
        total_bytes=total_size,
    )
|
||||
|
||||
async def list_uploads(self, request):
    """
    List all in-progress uploads for a user.

    Each session record from the table store is converted into an
    UploadSession schema object.
    """
    logger.debug(f"Listing uploads for user {request.user}")

    sessions = await self.table_store.list_upload_sessions(request.user)

    upload_sessions = []
    for record in sessions:
        upload_sessions.append(
            UploadSession(
                upload_id=record["upload_id"],
                document_id=record["document_id"],
                document_metadata_json=record.get("document_metadata", ""),
                total_size=record["total_size"],
                chunk_size=record["chunk_size"],
                total_chunks=record["total_chunks"],
                chunks_received=record["chunks_received"],
                created_at=str(record.get("created_at", "")),
            )
        )

    return LibrarianResponse(
        error=None,
        upload_sessions=upload_sessions,
    )
|
||||
|
||||
# Child document operations
|
||||
|
||||
async def add_child_document(self, request):
    """
    Store a document that is linked to an existing parent document.

    Child documents are typically extracted content (e.g., pages from a
    PDF). The parent is identified by parent_id, and document_type is
    always forced to "extracted" before the document is stored.

    Raises:
        RequestError: If parent_id is missing, the parent does not exist,
            or a document with the same id already exists.
    """
    meta = request.document_metadata

    logger.info(f"Adding child document {meta.id} for parent {meta.parent_id}")

    if not meta.parent_id:
        raise RequestError("parent_id is required for child documents")

    # Verify parent exists
    parent_found = await self.table_store.document_exists(
        meta.user, meta.parent_id
    )
    if not parent_found:
        raise RequestError(
            f"Parent document {meta.parent_id} does not exist"
        )

    already_stored = await self.table_store.document_exists(
        meta.user, meta.id
    )
    if already_stored:
        raise RequestError("Document already exists")

    # Ensure document_type is set to "extracted"
    meta.document_type = "extracted"

    # Create object ID for blob
    object_id = uuid.uuid4()

    logger.debug("Adding blob...")

    payload = base64.b64decode(request.content)
    await self.blob_store.add(object_id, payload, meta.kind)

    logger.debug("Adding to table...")

    await self.table_store.add_document(meta, object_id)

    logger.debug("Add child document complete")

    return LibrarianResponse(
        error=None,
        document_id=meta.id,
    )
|
||||
|
||||
async def list_children(self, request):
    """
    Return the child documents recorded for a parent document.

    The parent is identified by request.document_id; the result is
    returned in the response's document_metadatas field.
    """
    parent_id = request.document_id

    logger.debug(f"Listing children for parent {request.document_id}")

    child_documents = await self.table_store.list_children(parent_id)

    return LibrarianResponse(
        error=None,
        document_metadatas=child_documents,
    )
|
||||
|
||||
async def stream_document(self, request):
    """
    Stream document content in chunks.

    This operation returns document content in smaller chunks, allowing
    memory-efficient processing of large documents on the client side.
    The response includes chunk information for reassembly.

    Note: This operation returns a single chunk at a time. Clients should
    call repeatedly with increasing chunk_index until all chunks are
    received.

    Args:
        request: Librarian request carrying user, document_id, a 0-based
            chunk_index and an optional chunk_size (defaults to 1MB when
            not positive).

    Returns:
        LibrarianResponse with the base64-encoded chunk, its index, the
        total number of chunks, the end offset of this chunk
        (bytes_received) and the document size (total_bytes).

    Raises:
        RequestError: If chunk_index is negative or beyond the last
            chunk. An empty document has zero chunks, so every index is
            rejected for it.
    """
    logger.debug(f"Streaming document {request.document_id}, chunk {request.chunk_index}")

    object_id = await self.table_store.get_document_object_id(
        request.user,
        request.document_id
    )

    # Default chunk size of 1MB
    chunk_size = request.chunk_size if request.chunk_size > 0 else 1024 * 1024

    # Get the full content and slice out the requested chunk
    # Note: This is a simple implementation. For true streaming, we'd need
    # range requests on the object storage.
    content = await self.blob_store.get(object_id)
    total_size = len(content)
    total_chunks = math.ceil(total_size / chunk_size)

    # A negative index must be rejected explicitly: it would otherwise
    # produce a negative slice start and silently return wrong data.
    if request.chunk_index < 0 or request.chunk_index >= total_chunks:
        raise RequestError(
            f"Invalid chunk index {request.chunk_index}, "
            f"document has {total_chunks} chunks"
        )

    start = request.chunk_index * chunk_size
    end = min(start + chunk_size, total_size)
    chunk_content = content[start:end]

    logger.debug(f"Returning chunk {request.chunk_index}/{total_chunks}, "
                 f"bytes {start}-{end} of {total_size}")

    return LibrarianResponse(
        error=None,
        content=base64.b64encode(chunk_content),
        chunk_index=request.chunk_index,
        chunks_received=1,  # Using as "current chunk" indicator
        total_chunks=total_chunks,
        bytes_received=end,
        total_bytes=total_size,
    )
|
||||
|
||||
|
|
|
|||
|
|
@ -271,6 +271,9 @@ class Processor(AsyncProcessor):
|
|||
|
||||
pass
|
||||
|
||||
# Threshold for sending document_id instead of inline content (2MB)
|
||||
STREAMING_THRESHOLD = 2 * 1024 * 1024
|
||||
|
||||
async def load_document(self, document, processing, content):
|
||||
|
||||
logger.debug("Ready for document processing...")
|
||||
|
|
@ -292,26 +295,57 @@ class Processor(AsyncProcessor):
|
|||
q = flow["interfaces"][kind]
|
||||
|
||||
if kind == "text-load":
|
||||
doc = TextDocument(
|
||||
metadata = Metadata(
|
||||
id = document.id,
|
||||
metadata = document.metadata,
|
||||
user = processing.user,
|
||||
collection = processing.collection
|
||||
),
|
||||
text = content,
|
||||
)
|
||||
# For large text documents, send document_id for streaming retrieval
|
||||
if len(content) >= self.STREAMING_THRESHOLD:
|
||||
logger.info(f"Text document {document.id} is large ({len(content)} bytes), "
|
||||
f"sending document_id for streaming retrieval")
|
||||
doc = TextDocument(
|
||||
metadata = Metadata(
|
||||
id = document.id,
|
||||
metadata = document.metadata,
|
||||
user = processing.user,
|
||||
collection = processing.collection
|
||||
),
|
||||
document_id = document.id,
|
||||
text = b"", # Empty, receiver will fetch via librarian
|
||||
)
|
||||
else:
|
||||
doc = TextDocument(
|
||||
metadata = Metadata(
|
||||
id = document.id,
|
||||
metadata = document.metadata,
|
||||
user = processing.user,
|
||||
collection = processing.collection
|
||||
),
|
||||
text = content,
|
||||
)
|
||||
schema = TextDocument
|
||||
else:
|
||||
doc = Document(
|
||||
metadata = Metadata(
|
||||
id = document.id,
|
||||
metadata = document.metadata,
|
||||
user = processing.user,
|
||||
collection = processing.collection
|
||||
),
|
||||
data = base64.b64encode(content).decode("utf-8")
|
||||
)
|
||||
# For large PDF documents, send document_id for streaming retrieval
|
||||
# instead of embedding the entire content in the message
|
||||
if len(content) >= self.STREAMING_THRESHOLD:
|
||||
logger.info(f"Document {document.id} is large ({len(content)} bytes), "
|
||||
f"sending document_id for streaming retrieval")
|
||||
doc = Document(
|
||||
metadata = Metadata(
|
||||
id = document.id,
|
||||
metadata = document.metadata,
|
||||
user = processing.user,
|
||||
collection = processing.collection
|
||||
),
|
||||
document_id = document.id,
|
||||
data = b"", # Empty data, receiver will fetch via API
|
||||
)
|
||||
else:
|
||||
doc = Document(
|
||||
metadata = Metadata(
|
||||
id = document.id,
|
||||
metadata = document.metadata,
|
||||
user = processing.user,
|
||||
collection = processing.collection
|
||||
),
|
||||
data = base64.b64encode(content).decode("utf-8")
|
||||
)
|
||||
schema = Document
|
||||
|
||||
logger.debug(f"Submitting to queue {q}...")
|
||||
|
|
@ -361,6 +395,17 @@ class Processor(AsyncProcessor):
|
|||
"remove-processing": self.librarian.remove_processing,
|
||||
"list-documents": self.librarian.list_documents,
|
||||
"list-processing": self.librarian.list_processing,
|
||||
# Chunked upload operations
|
||||
"begin-upload": self.librarian.begin_upload,
|
||||
"upload-chunk": self.librarian.upload_chunk,
|
||||
"complete-upload": self.librarian.complete_upload,
|
||||
"abort-upload": self.librarian.abort_upload,
|
||||
"get-upload-status": self.librarian.get_upload_status,
|
||||
"list-uploads": self.librarian.list_uploads,
|
||||
# Child document and streaming operations
|
||||
"add-child-document": self.librarian.add_child_document,
|
||||
"list-children": self.librarian.list_children,
|
||||
"stream-document": self.librarian.stream_document,
|
||||
}
|
||||
|
||||
if v.operation not in impls:
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue