mirror of
https://github.com/trustgraph-ai/trustgraph.git
synced 2026-04-26 17:06:22 +02:00
Incremental / large document loading (#659)
Tech spec
BlobStore (trustgraph-flow/trustgraph/librarian/blob_store.py):
- get_stream() - yields document content in chunks for streaming retrieval
- create_multipart_upload() - initializes S3 multipart upload, returns
upload_id
- upload_part() - uploads a single part, returns etag
- complete_multipart_upload() - finalizes upload with part etags
- abort_multipart_upload() - cancels and cleans up
Cassandra schema (trustgraph-flow/trustgraph/tables/library.py):
- New upload_session table with 24-hour TTL
- Index on user for listing sessions
- Prepared statements for all operations
- Methods: create_upload_session(), get_upload_session(),
update_upload_session_chunk(), delete_upload_session(),
list_upload_sessions()
- Schema extended with UploadSession, UploadProgress, and new
request/response fields
- Librarian methods: begin_upload, upload_chunk, complete_upload,
abort_upload, get_upload_status, list_uploads
- Service routing for all new operations
- Python SDK with transparent chunked upload:
- add_document() auto-switches to chunked for files > 10MB
- Progress callback support (on_progress)
- get_pending_uploads(), get_upload_status(), abort_upload(),
resume_upload()
- Document table: Added parent_id and document_type columns with index
- Document schema (knowledge/document.py): Added document_id field for
streaming retrieval
- Librarian operations:
- add-child-document for extracted PDF pages
- list-children to get child documents
- stream-document for chunked content retrieval
- Cascade delete removes children when parent is deleted
- list-documents filters children by default
- PDF decoder (decoding/pdf/pdf_decoder.py): Updated to stream large
documents from librarian API to temp file
- Librarian service (librarian/service.py): Sends document_id instead of
content for large PDFs (>2MB)
- Deprecated tools (load_pdf.py, load_text.py): Added deprecation
warnings directing users to tg-add-library-document +
tg-start-library-processing
Remove load_pdf and load_text utils
Move chunker/librarian comms to base class
Updating tests
This commit is contained in:
parent
a38ca9474f
commit
a630e143ef
21 changed files with 3164 additions and 650 deletions
|
|
@ -6,6 +6,7 @@ including document storage, metadata management, and processing workflow coordin
|
|||
"""
|
||||
|
||||
import datetime
|
||||
import math
|
||||
import time
|
||||
import base64
|
||||
import logging
|
||||
|
|
@ -17,6 +18,13 @@ from . exceptions import *
|
|||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Threshold for switching to chunked upload (2MB)
|
||||
# Lower threshold provides progress feedback and resumability on slower connections
|
||||
CHUNKED_UPLOAD_THRESHOLD = 2 * 1024 * 1024
|
||||
|
||||
# Default chunk size (5MB - S3 multipart minimum)
|
||||
DEFAULT_CHUNK_SIZE = 5 * 1024 * 1024
|
||||
|
||||
|
||||
def to_value(x):
|
||||
"""Convert wire format to Uri or Literal."""
|
||||
|
|
@ -67,13 +75,14 @@ class Library:
|
|||
|
||||
def add_document(
|
||||
self, document, id, metadata, user, title, comments,
|
||||
kind="text/plain", tags=[],
|
||||
kind="text/plain", tags=[], on_progress=None,
|
||||
):
|
||||
"""
|
||||
Add a document to the library.
|
||||
|
||||
Stores a document with associated metadata in the library for
|
||||
retrieval and processing.
|
||||
retrieval and processing. For large documents (> 10MB), automatically
|
||||
uses chunked upload for better reliability and progress tracking.
|
||||
|
||||
Args:
|
||||
document: Document content as bytes
|
||||
|
|
@ -84,6 +93,7 @@ class Library:
|
|||
comments: Document description or comments
|
||||
kind: MIME type of the document (default: "text/plain")
|
||||
tags: List of tags for categorization (default: [])
|
||||
on_progress: Optional callback(bytes_sent, total_bytes) for progress updates
|
||||
|
||||
Returns:
|
||||
dict: Response from the add operation
|
||||
|
|
@ -107,6 +117,22 @@ class Library:
|
|||
kind="application/pdf",
|
||||
tags=["research", "physics"]
|
||||
)
|
||||
|
||||
# Add a large document with progress tracking
|
||||
def progress(sent, total):
|
||||
print(f"Uploaded {sent}/{total} bytes ({100*sent//total}%)")
|
||||
|
||||
with open("large_document.pdf", "rb") as f:
|
||||
library.add_document(
|
||||
document=f.read(),
|
||||
id="large-doc-001",
|
||||
metadata=[],
|
||||
user="trustgraph",
|
||||
title="Large Document",
|
||||
comments="A very large document",
|
||||
kind="application/pdf",
|
||||
on_progress=progress
|
||||
)
|
||||
```
|
||||
"""
|
||||
|
||||
|
|
@ -124,6 +150,21 @@ class Library:
|
|||
if not title: title = ""
|
||||
if not comments: comments = ""
|
||||
|
||||
# Check if we should use chunked upload
|
||||
if len(document) >= CHUNKED_UPLOAD_THRESHOLD:
|
||||
return self._add_document_chunked(
|
||||
document=document,
|
||||
id=id,
|
||||
metadata=metadata,
|
||||
user=user,
|
||||
title=title,
|
||||
comments=comments,
|
||||
kind=kind,
|
||||
tags=tags,
|
||||
on_progress=on_progress,
|
||||
)
|
||||
|
||||
# Small document: use single operation (existing behavior)
|
||||
triples = []
|
||||
|
||||
def emit(t):
|
||||
|
|
@ -167,14 +208,111 @@ class Library:
|
|||
|
||||
return self.request(input)
|
||||
|
||||
def get_documents(self, user):
|
||||
def _add_document_chunked(
|
||||
self, document, id, metadata, user, title, comments,
|
||||
kind, tags, on_progress=None,
|
||||
):
|
||||
"""
|
||||
Add a large document using chunked upload.
|
||||
|
||||
Internal method that handles multipart upload for large documents.
|
||||
"""
|
||||
total_size = len(document)
|
||||
chunk_size = DEFAULT_CHUNK_SIZE
|
||||
|
||||
logger.info(f"Starting chunked upload for document {id} ({total_size} bytes)")
|
||||
|
||||
# Begin upload session
|
||||
begin_request = {
|
||||
"operation": "begin-upload",
|
||||
"document-metadata": {
|
||||
"id": id,
|
||||
"time": int(time.time()),
|
||||
"kind": kind,
|
||||
"title": title,
|
||||
"comments": comments,
|
||||
"user": user,
|
||||
"tags": tags,
|
||||
},
|
||||
"total-size": total_size,
|
||||
"chunk-size": chunk_size,
|
||||
}
|
||||
|
||||
begin_response = self.request(begin_request)
|
||||
|
||||
upload_id = begin_response.get("upload-id")
|
||||
if not upload_id:
|
||||
raise RuntimeError("Failed to begin upload: no upload_id returned")
|
||||
|
||||
actual_chunk_size = begin_response.get("chunk-size", chunk_size)
|
||||
total_chunks = begin_response.get("total-chunks", math.ceil(total_size / actual_chunk_size))
|
||||
|
||||
logger.info(f"Upload session {upload_id} created, {total_chunks} chunks")
|
||||
|
||||
try:
|
||||
# Upload chunks
|
||||
bytes_sent = 0
|
||||
for chunk_index in range(total_chunks):
|
||||
start = chunk_index * actual_chunk_size
|
||||
end = min(start + actual_chunk_size, total_size)
|
||||
chunk_data = document[start:end]
|
||||
|
||||
chunk_request = {
|
||||
"operation": "upload-chunk",
|
||||
"upload-id": upload_id,
|
||||
"chunk-index": chunk_index,
|
||||
"content": base64.b64encode(chunk_data).decode("utf-8"),
|
||||
"user": user,
|
||||
}
|
||||
|
||||
chunk_response = self.request(chunk_request)
|
||||
|
||||
bytes_sent = end
|
||||
|
||||
# Call progress callback if provided
|
||||
if on_progress:
|
||||
on_progress(bytes_sent, total_size)
|
||||
|
||||
logger.debug(f"Chunk {chunk_index + 1}/{total_chunks} uploaded")
|
||||
|
||||
# Complete upload
|
||||
complete_request = {
|
||||
"operation": "complete-upload",
|
||||
"upload-id": upload_id,
|
||||
"user": user,
|
||||
}
|
||||
|
||||
complete_response = self.request(complete_request)
|
||||
|
||||
logger.info(f"Chunked upload completed for document {id}")
|
||||
|
||||
return complete_response
|
||||
|
||||
except Exception as e:
|
||||
# Try to abort on failure
|
||||
logger.error(f"Chunked upload failed: {e}")
|
||||
try:
|
||||
abort_request = {
|
||||
"operation": "abort-upload",
|
||||
"upload-id": upload_id,
|
||||
"user": user,
|
||||
}
|
||||
self.request(abort_request)
|
||||
logger.info(f"Aborted failed upload {upload_id}")
|
||||
except Exception as abort_error:
|
||||
logger.warning(f"Failed to abort upload: {abort_error}")
|
||||
raise
|
||||
|
||||
def get_documents(self, user, include_children=False):
|
||||
"""
|
||||
List all documents for a user.
|
||||
|
||||
Retrieves metadata for all documents owned by the specified user.
|
||||
By default, only returns top-level documents (not child/extracted documents).
|
||||
|
||||
Args:
|
||||
user: User identifier
|
||||
include_children: If True, also include child documents (default: False)
|
||||
|
||||
Returns:
|
||||
list[DocumentMetadata]: List of document metadata objects
|
||||
|
|
@ -185,18 +323,24 @@ class Library:
|
|||
Example:
|
||||
```python
|
||||
library = api.library()
|
||||
|
||||
# Get only top-level documents
|
||||
docs = library.get_documents(user="trustgraph")
|
||||
|
||||
for doc in docs:
|
||||
print(f"{doc.id}: {doc.title} ({doc.kind})")
|
||||
print(f" Uploaded: {doc.time}")
|
||||
print(f" Tags: {', '.join(doc.tags)}")
|
||||
|
||||
# Get all documents including extracted pages
|
||||
all_docs = library.get_documents(user="trustgraph", include_children=True)
|
||||
```
|
||||
"""
|
||||
|
||||
input = {
|
||||
"operation": "list-documents",
|
||||
"user": user,
|
||||
"include-children": include_children,
|
||||
}
|
||||
|
||||
object = self.request(input)
|
||||
|
|
@ -218,7 +362,9 @@ class Library:
|
|||
for w in v["metadata"]
|
||||
],
|
||||
user = v["user"],
|
||||
tags = v["tags"]
|
||||
tags = v["tags"],
|
||||
parent_id = v.get("parent-id", ""),
|
||||
document_type = v.get("document-type", "source"),
|
||||
)
|
||||
for v in object["document-metadatas"]
|
||||
]
|
||||
|
|
@ -261,7 +407,7 @@ class Library:
|
|||
doc = object["document-metadata"]
|
||||
|
||||
try:
|
||||
DocumentMetadata(
|
||||
return DocumentMetadata(
|
||||
id = doc["id"],
|
||||
time = datetime.datetime.fromtimestamp(doc["time"]),
|
||||
kind = doc["kind"],
|
||||
|
|
@ -276,7 +422,9 @@ class Library:
|
|||
for w in doc["metadata"]
|
||||
],
|
||||
user = doc["user"],
|
||||
tags = doc["tags"]
|
||||
tags = doc["tags"],
|
||||
parent_id = doc.get("parent-id", ""),
|
||||
document_type = doc.get("document-type", "source"),
|
||||
)
|
||||
except Exception as e:
|
||||
logger.error("Failed to parse document response", exc_info=True)
|
||||
|
|
@ -535,3 +683,447 @@ class Library:
|
|||
logger.error("Failed to parse processing list response", exc_info=True)
|
||||
raise ProtocolException(f"Response not formatted correctly")
|
||||
|
||||
# Chunked upload management methods
|
||||
|
||||
def get_pending_uploads(self, user):
    """
    List all pending (in-progress) uploads for a user.

    Retrieves information about chunked uploads that have been started
    but not yet completed.

    Args:
        user: User identifier

    Returns:
        list[dict]: List of pending upload information

    Example:
        ```python
        library = api.library()
        pending = library.get_pending_uploads(user="trustgraph")

        for upload in pending:
            print(f"Upload {upload['upload_id']}:")
            print(f"  Document: {upload['document_id']}")
            print(f"  Progress: {upload['chunks_received']}/{upload['total_chunks']}")
        ```
    """
    # Ask the librarian for every open upload session owned by this user.
    response = self.request({
        "operation": "list-uploads",
        "user": user,
    })

    # A missing key means there are no sessions; normalise to [].
    return response.get("upload-sessions", [])
|
||||
|
||||
def get_upload_status(self, upload_id, user):
    """
    Get the status of a specific upload.

    Retrieves detailed status information about a chunked upload,
    including which chunks have been received and which are missing.

    Args:
        upload_id: Upload session identifier
        user: User identifier

    Returns:
        dict: Upload status information including:
            - upload_id: The upload session ID
            - state: "in-progress", "completed", or "expired"
            - chunks_received: Number of chunks received
            - total_chunks: Total number of chunks expected
            - received_chunks: List of received chunk indices
            - missing_chunks: List of missing chunk indices
            - bytes_received: Total bytes received
            - total_bytes: Total expected bytes

    Example:
        ```python
        library = api.library()
        status = library.get_upload_status(
            upload_id="abc-123",
            user="trustgraph"
        )

        if status['state'] == 'in-progress':
            print(f"Missing chunks: {status['missing_chunks']}")
        ```
    """
    payload = {
        "operation": "get-upload-status",
        "upload-id": upload_id,
        "user": user,
    }
    # The service reply is returned to the caller as-is.
    return self.request(payload)
|
||||
|
||||
def abort_upload(self, upload_id, user):
    """
    Abort an in-progress upload.

    Cancels a chunked upload and cleans up any uploaded chunks.

    Args:
        upload_id: Upload session identifier
        user: User identifier

    Returns:
        dict: Empty response on success

    Example:
        ```python
        library = api.library()
        library.abort_upload(upload_id="abc-123", user="trustgraph")
        ```
    """
    # Single fire-and-return call; the server performs the cleanup.
    payload = {
        "operation": "abort-upload",
        "upload-id": upload_id,
        "user": user,
    }
    return self.request(payload)
|
||||
|
||||
def resume_upload(self, upload_id, document, user, on_progress=None):
    """
    Resume an interrupted upload.

    Continues a chunked upload that was previously interrupted,
    uploading only the missing chunks.

    Args:
        upload_id: Upload session identifier to resume
        document: Complete document content as bytes
        user: User identifier
        on_progress: Optional callback(bytes_sent, total_bytes) for progress updates

    Returns:
        dict: Response from completing the upload

    Raises:
        RuntimeError: If the upload session has expired

    Example:
        ```python
        library = api.library()

        # Check what's missing
        status = library.get_upload_status(
            upload_id="abc-123",
            user="trustgraph"
        )

        if status.get('upload-state') == 'in-progress':
            # Resume with the same document
            with open("large_document.pdf", "rb") as f:
                library.resume_upload(
                    upload_id="abc-123",
                    document=f.read(),
                    user="trustgraph"
                )
        ```
    """
    # Get current status
    status = self.get_upload_status(upload_id, user)

    if status.get("upload-state") == "expired":
        raise RuntimeError("Upload session has expired, please start a new upload")

    if status.get("upload-state") == "completed":
        return {"message": "Upload already completed"}

    missing_chunks = status.get("missing-chunks", [])
    total_chunks = status.get("total-chunks", 0)
    total_bytes = status.get("total-bytes", len(document))

    # Prefer the chunk size the session was created with.  Deriving it
    # as total_bytes // total_chunks is only correct when the document
    # divides evenly into chunks (e.g. a 12-byte doc in 5-byte chunks
    # has 3 chunks, and 12 // 3 == 4 would corrupt every chunk offset).
    chunk_size = status.get("chunk-size") or (
        total_bytes // total_chunks if total_chunks > 0 else DEFAULT_CHUNK_SIZE
    )

    logger.info(f"Resuming upload {upload_id}, {len(missing_chunks)} chunks remaining")

    already_uploaded = total_chunks - len(missing_chunks)

    # Upload missing chunks
    for sent_count, chunk_index in enumerate(missing_chunks, start=1):
        start = chunk_index * chunk_size
        end = min(start + chunk_size, len(document))

        self.request({
            "operation": "upload-chunk",
            "upload-id": upload_id,
            "chunk-index": chunk_index,
            "content": base64.b64encode(document[start:end]).decode("utf-8"),
            "user": user,
        })

        if on_progress:
            # Estimate progress including previously uploaded chunks.
            bytes_sent = min((already_uploaded + sent_count) * chunk_size, total_bytes)
            on_progress(bytes_sent, total_bytes)

        logger.debug(f"Resumed chunk {chunk_index}")

    # Complete upload
    return self.request({
        "operation": "complete-upload",
        "upload-id": upload_id,
        "user": user,
    })
|
||||
|
||||
# Child document methods
|
||||
|
||||
def add_child_document(
    self, document, id, parent_id, user, title, comments,
    kind="text/plain", tags=None, metadata=None,
):
    """
    Add a child document linked to a parent document.

    Child documents are typically extracted content (e.g., pages from a PDF).
    They are automatically marked with document_type="extracted" and linked
    to their parent via parent_id.

    Args:
        document: Document content as bytes
        id: Document identifier (auto-generated if None)
        parent_id: Parent document identifier (required)
        user: User/owner identifier
        title: Document title
        comments: Document description or comments
        kind: MIME type of the document (default: "text/plain")
        tags: List of tags for categorization (default: [])
        metadata: Optional metadata as list of Triple objects

    Returns:
        dict: Response from the add operation

    Raises:
        RuntimeError: If parent_id is not provided

    Example:
        ```python
        library = api.library()

        # Add extracted page from a PDF
        library.add_child_document(
            document=page_text.encode('utf-8'),
            id="doc-123-page-1",
            parent_id="doc-123",
            user="trustgraph",
            title="Page 1 of Research Paper",
            comments="First page extracted from PDF",
            kind="text/plain",
            tags=["extracted", "page"]
        )
        ```
    """
    if not parent_id:
        raise RuntimeError("parent_id is required for child documents")

    if id is None:
        # NOTE(review): hash() of bytes is salted per process, so
        # auto-generated ids are not stable across runs — confirm intended.
        id = hash(document)

    if not title:
        title = ""
    if not comments:
        comments = ""
    # tags defaults to None rather than a mutable [] default argument;
    # normalise here so callers see the same behaviour as before.
    if tags is None:
        tags = []

    triples = []
    if metadata:
        if isinstance(metadata, list):
            triples = [
                {
                    "s": from_value(t.s),
                    "p": from_value(t.p),
                    "o": from_value(t.o),
                }
                for t in metadata
            ]

    input = {
        "operation": "add-child-document",
        "document-metadata": {
            "id": id,
            "time": int(time.time()),
            "kind": kind,
            "title": title,
            "comments": comments,
            "metadata": triples,
            "user": user,
            "tags": tags,
            "parent-id": parent_id,
            "document-type": "extracted",
        },
        "content": base64.b64encode(document).decode("utf-8"),
    }

    return self.request(input)
|
||||
|
||||
def list_children(self, document_id, user):
    """
    List all child documents for a given parent document.

    Args:
        document_id: Parent document identifier
        user: User identifier

    Returns:
        list[DocumentMetadata]: List of child document metadata objects

    Raises:
        ProtocolException: If the response cannot be parsed

    Example:
        ```python
        library = api.library()
        children = library.list_children(
            document_id="doc-123",
            user="trustgraph"
        )

        for child in children:
            print(f"{child.id}: {child.title}")
        ```
    """
    response = self.request({
        "operation": "list-children",
        "document-id": document_id,
        "user": user,
    })

    try:
        children = []
        for entry in response.get("document-metadatas", []):
            # Wire triples -> Triple objects with decoded s/p/o values.
            triples = [
                Triple(
                    s=to_value(t["s"]),
                    p=to_value(t["p"]),
                    o=to_value(t["o"])
                )
                for t in entry.get("metadata", [])
            ]
            children.append(DocumentMetadata(
                id=entry["id"],
                time=datetime.datetime.fromtimestamp(entry["time"]),
                kind=entry["kind"],
                title=entry["title"],
                comments=entry.get("comments", ""),
                metadata=triples,
                user=entry["user"],
                tags=entry.get("tags", []),
                parent_id=entry.get("parent-id", ""),
                document_type=entry.get("document-type", "source"),
            ))
        return children
    except Exception:
        logger.error("Failed to parse children response", exc_info=True)
        raise ProtocolException("Response not formatted correctly")
|
||||
|
||||
def get_document_content(self, user, id):
    """
    Get the content of a document.

    Retrieves the full content of a document as bytes.

    Args:
        user: User identifier
        id: Document identifier

    Returns:
        bytes: Document content

    Example:
        ```python
        library = api.library()
        content = library.get_document_content(
            user="trustgraph",
            id="doc-123"
        )

        # Write to file
        with open("output.pdf", "wb") as f:
            f.write(content)
        ```
    """
    response = self.request({
        "operation": "get-document-content",
        "user": user,
        "document-id": id,
    })

    # Content travels base64-encoded; a missing key decodes to b"".
    encoded = response.get("content", "")
    return base64.b64decode(encoded)
|
||||
|
||||
def stream_document_to_file(self, user, id, file_path, chunk_size=1024*1024, on_progress=None):
    """
    Stream document content to a file.

    Downloads document content in chunks and writes directly to a file,
    enabling memory-efficient handling of large documents.

    Args:
        user: User identifier
        id: Document identifier
        file_path: Path to write the document content
        chunk_size: Size of each chunk to download (default 1MB)
        on_progress: Optional callback(bytes_received, total_bytes) for progress updates

    Returns:
        int: Total bytes written

    Example:
        ```python
        library = api.library()

        def progress(received, total):
            print(f"Downloaded {received}/{total} bytes")

        library.stream_document_to_file(
            user="trustgraph",
            id="large-doc-123",
            file_path="/tmp/document.pdf",
            on_progress=progress
        )
        ```
    """
    written = 0
    index = 0

    with open(file_path, "wb") as out:
        while True:
            reply = self.request({
                "operation": "stream-document",
                "user": user,
                "document-id": id,
                "chunk-index": index,
                "chunk-size": chunk_size,
            })

            piece = base64.b64decode(reply.get("content", ""))

            # An empty chunk means the server has nothing more to send.
            if not piece:
                break

            out.write(piece)
            written += len(piece)

            expected_chunks = reply.get("total-chunks", 1)
            expected_bytes = reply.get("total-bytes", written)

            if on_progress:
                on_progress(written, expected_bytes)

            # Stop once the final chunk has been consumed.
            if index >= expected_chunks - 1:
                break

            index += 1

    return written
|
||||
|
||||
|
|
|
|||
|
|
@ -64,6 +64,8 @@ class DocumentMetadata:
|
|||
metadata: List of RDF triples providing structured metadata
|
||||
user: User/owner identifier
|
||||
tags: List of tags for categorization
|
||||
parent_id: Parent document ID for child documents (empty for top-level docs)
|
||||
document_type: "source" for uploaded documents, "extracted" for derived content
|
||||
"""
|
||||
id : str
|
||||
time : datetime.datetime
|
||||
|
|
@ -73,6 +75,8 @@ class DocumentMetadata:
|
|||
metadata : List[Triple]
|
||||
user : str
|
||||
tags : List[str]
|
||||
parent_id : str = ""
|
||||
document_type : str = "source"
|
||||
|
||||
@dataclasses.dataclass
|
||||
class ProcessingMetadata:
|
||||
|
|
|
|||
|
|
@ -1,20 +1,37 @@
|
|||
"""
|
||||
Base chunking service that provides parameter specification functionality
|
||||
for chunk-size and chunk-overlap parameters
|
||||
for chunk-size and chunk-overlap parameters, and librarian client for
|
||||
fetching large document content.
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import base64
|
||||
import logging
|
||||
import uuid
|
||||
|
||||
from .flow_processor import FlowProcessor
|
||||
from .parameter_spec import ParameterSpec
|
||||
from .consumer import Consumer
|
||||
from .producer import Producer
|
||||
from .metrics import ConsumerMetrics, ProducerMetrics
|
||||
|
||||
from ..schema import LibrarianRequest, LibrarianResponse
|
||||
from ..schema import librarian_request_queue, librarian_response_queue
|
||||
|
||||
# Module logger
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
default_librarian_request_queue = librarian_request_queue
|
||||
default_librarian_response_queue = librarian_response_queue
|
||||
|
||||
|
||||
class ChunkingService(FlowProcessor):
|
||||
"""Base service for chunking processors with parameter specification support"""
|
||||
|
||||
def __init__(self, **params):
|
||||
|
||||
id = params.get("id", "chunker")
|
||||
|
||||
# Call parent constructor
|
||||
super(ChunkingService, self).__init__(**params)
|
||||
|
||||
|
|
@ -27,8 +44,122 @@ class ChunkingService(FlowProcessor):
|
|||
ParameterSpec(name="chunk-overlap")
|
||||
)
|
||||
|
||||
# Librarian client for fetching document content
|
||||
librarian_request_q = params.get(
|
||||
"librarian_request_queue", default_librarian_request_queue
|
||||
)
|
||||
librarian_response_q = params.get(
|
||||
"librarian_response_queue", default_librarian_response_queue
|
||||
)
|
||||
|
||||
librarian_request_metrics = ProducerMetrics(
|
||||
processor=id, flow=None, name="librarian-request"
|
||||
)
|
||||
|
||||
self.librarian_request_producer = Producer(
|
||||
backend=self.pubsub,
|
||||
topic=librarian_request_q,
|
||||
schema=LibrarianRequest,
|
||||
metrics=librarian_request_metrics,
|
||||
)
|
||||
|
||||
librarian_response_metrics = ConsumerMetrics(
|
||||
processor=id, flow=None, name="librarian-response"
|
||||
)
|
||||
|
||||
self.librarian_response_consumer = Consumer(
|
||||
taskgroup=self.taskgroup,
|
||||
backend=self.pubsub,
|
||||
flow=None,
|
||||
topic=librarian_response_q,
|
||||
subscriber=f"{id}-librarian",
|
||||
schema=LibrarianResponse,
|
||||
handler=self.on_librarian_response,
|
||||
metrics=librarian_response_metrics,
|
||||
)
|
||||
|
||||
# Pending librarian requests: request_id -> asyncio.Future
|
||||
self.pending_requests = {}
|
||||
|
||||
logger.debug("ChunkingService initialized with parameter specifications")
|
||||
|
||||
async def start(self):
    """Start the service: parent plumbing first, then librarian comms.

    The parent start() runs before the librarian producer/consumer so
    core processor state exists when librarian traffic begins.
    """
    await super(ChunkingService, self).start()
    await self.librarian_request_producer.start()
    await self.librarian_response_consumer.start()
|
||||
|
||||
async def on_librarian_response(self, msg, consumer, flow):
    """Handle responses from the librarian service.

    Resolves the pending future registered under the message's
    correlation id; unknown or missing ids are logged and dropped.
    """
    payload = msg.value()
    correlation_id = msg.properties().get("id")

    pending = (
        self.pending_requests.pop(correlation_id, None)
        if correlation_id else None
    )

    if pending is not None:
        pending.set_result(payload)
    else:
        logger.warning(f"Received unexpected librarian response: {correlation_id}")
|
||||
|
||||
async def fetch_document_content(self, document_id, user, timeout=120):
    """
    Fetch document content from librarian via Pulsar.

    Sends a get-document-content request tagged with a fresh correlation
    id, then waits for on_librarian_response to resolve the matching
    future.

    Args:
        document_id: Identifier of the document to fetch
        user: User identifier
        timeout: Seconds to wait for the librarian response (default 120)

    Returns:
        The `content` field of the librarian response.

    Raises:
        RuntimeError: On a librarian-reported error or on timeout.
    """
    request_id = str(uuid.uuid4())

    request = LibrarianRequest(
        operation="get-document-content",
        document_id=document_id,
        user=user,
    )

    # Register the future before sending so the response handler can
    # always find it, even if the reply arrives immediately.
    future = asyncio.get_running_loop().create_future()
    self.pending_requests[request_id] = future

    try:
        # Send request
        await self.librarian_request_producer.send(
            request, properties={"id": request_id}
        )

        # Wait for response
        response = await asyncio.wait_for(future, timeout=timeout)

        if response.error:
            raise RuntimeError(
                f"Librarian error: {response.error.type}: {response.error.message}"
            )

        return response.content

    except asyncio.TimeoutError:
        raise RuntimeError(f"Timeout fetching document {document_id}")
    finally:
        # Clean up on every exit path — previously a send() failure or a
        # librarian error left the future in pending_requests forever.
        # pop() with a default is harmless when the handler already
        # removed the entry on the success path.
        self.pending_requests.pop(request_id, None)
|
||||
|
||||
async def get_document_text(self, doc):
    """
    Get text content from a TextDocument, fetching from librarian if needed.

    Args:
        doc: TextDocument with either inline text or document_id

    Returns:
        str: The document text content
    """
    # Inline text present (or no document_id to fetch): decode directly.
    if not doc.document_id or doc.text:
        return doc.text.decode("utf-8")

    logger.info(f"Fetching document {doc.document_id} from librarian...")
    payload = await self.fetch_document_content(
        document_id=doc.document_id,
        user=doc.metadata.user,
    )
    # Content is base64 encoded; it may arrive as str or bytes.
    if isinstance(payload, str):
        payload = payload.encode('utf-8')
    text = base64.b64decode(payload).decode("utf-8")
    logger.info(f"Fetched {len(text)} characters from librarian")
    return text
|
||||
|
||||
async def chunk_document(self, msg, consumer, flow, default_chunk_size, default_chunk_overlap):
|
||||
"""
|
||||
Extract chunk parameters from flow and return effective values
|
||||
|
|
@ -59,4 +190,16 @@ class ChunkingService(FlowProcessor):
|
|||
@staticmethod
def add_args(parser):
    """Add chunking service arguments to parser"""
    FlowProcessor.add_args(parser)

    # Both librarian queues share the same argument shape.
    for label, flag, fallback in (
        ("request", '--librarian-request-queue', default_librarian_request_queue),
        ("response", '--librarian-response-queue', default_librarian_response_queue),
    ):
        parser.add_argument(
            flag,
            default=fallback,
            help=f'Librarian {label} queue (default: {fallback})',
        )
|
||||
|
|
@ -44,14 +44,21 @@ class LibraryRequestTranslator(MessageTranslator):
|
|||
|
||||
return LibrarianRequest(
|
||||
operation=data.get("operation"),
|
||||
document_id=data.get("document-id"),
|
||||
processing_id=data.get("processing-id"),
|
||||
document_id=data.get("document-id", ""),
|
||||
processing_id=data.get("processing-id", ""),
|
||||
document_metadata=doc_metadata,
|
||||
processing_metadata=proc_metadata,
|
||||
content=content,
|
||||
user=data.get("user"),
|
||||
collection=data.get("collection"),
|
||||
criteria=criteria
|
||||
user=data.get("user", ""),
|
||||
collection=data.get("collection", ""),
|
||||
criteria=criteria,
|
||||
# Chunked upload fields
|
||||
total_size=data.get("total-size", 0),
|
||||
chunk_size=data.get("chunk-size", 0),
|
||||
upload_id=data.get("upload-id", ""),
|
||||
chunk_index=data.get("chunk-index", 0),
|
||||
# List documents filtering
|
||||
include_children=data.get("include-children", False),
|
||||
)
|
||||
|
||||
def from_pulsar(self, obj: LibrarianRequest) -> Dict[str, Any]:
|
||||
|
|
@ -98,25 +105,71 @@ class LibraryResponseTranslator(MessageTranslator):
|
|||
|
||||
def from_pulsar(self, obj: LibrarianResponse) -> Dict[str, Any]:
|
||||
result = {}
|
||||
|
||||
|
||||
if obj.error:
|
||||
result["error"] = {
|
||||
"type": obj.error.type,
|
||||
"message": obj.error.message,
|
||||
}
|
||||
|
||||
if obj.document_metadata:
|
||||
result["document-metadata"] = self.doc_metadata_translator.from_pulsar(obj.document_metadata)
|
||||
|
||||
|
||||
if obj.content:
|
||||
result["content"] = obj.content.decode("utf-8") if isinstance(obj.content, bytes) else obj.content
|
||||
|
||||
|
||||
if obj.document_metadatas is not None:
|
||||
result["document-metadatas"] = [
|
||||
self.doc_metadata_translator.from_pulsar(dm)
|
||||
for dm in obj.document_metadatas
|
||||
]
|
||||
|
||||
|
||||
if obj.processing_metadatas is not None:
|
||||
result["processing-metadatas"] = [
|
||||
self.proc_metadata_translator.from_pulsar(pm)
|
||||
for pm in obj.processing_metadatas
|
||||
]
|
||||
|
||||
|
||||
# Chunked upload response fields
|
||||
if obj.upload_id:
|
||||
result["upload-id"] = obj.upload_id
|
||||
if obj.chunk_size:
|
||||
result["chunk-size"] = obj.chunk_size
|
||||
if obj.total_chunks:
|
||||
result["total-chunks"] = obj.total_chunks
|
||||
if obj.chunk_index:
|
||||
result["chunk-index"] = obj.chunk_index
|
||||
if obj.chunks_received:
|
||||
result["chunks-received"] = obj.chunks_received
|
||||
if obj.bytes_received:
|
||||
result["bytes-received"] = obj.bytes_received
|
||||
if obj.total_bytes:
|
||||
result["total-bytes"] = obj.total_bytes
|
||||
if obj.document_id:
|
||||
result["document-id"] = obj.document_id
|
||||
if obj.object_id:
|
||||
result["object-id"] = obj.object_id
|
||||
if obj.upload_state:
|
||||
result["upload-state"] = obj.upload_state
|
||||
if obj.received_chunks:
|
||||
result["received-chunks"] = obj.received_chunks
|
||||
if obj.missing_chunks:
|
||||
result["missing-chunks"] = obj.missing_chunks
|
||||
if obj.upload_sessions:
|
||||
result["upload-sessions"] = [
|
||||
{
|
||||
"upload-id": s.upload_id,
|
||||
"document-id": s.document_id,
|
||||
"document-metadata-json": s.document_metadata_json,
|
||||
"total-size": s.total_size,
|
||||
"chunk-size": s.chunk_size,
|
||||
"total-chunks": s.total_chunks,
|
||||
"chunks-received": s.chunks_received,
|
||||
"created-at": s.created_at,
|
||||
}
|
||||
for s in obj.upload_sessions
|
||||
]
|
||||
|
||||
return result
|
||||
|
||||
def from_response_with_completion(self, obj: LibrarianResponse) -> Tuple[Dict[str, Any], bool]:
|
||||
|
|
|
|||
|
|
@ -20,12 +20,14 @@ class DocumentMetadataTranslator(Translator):
|
|||
comments=data.get("comments"),
|
||||
metadata=self.subgraph_translator.to_pulsar(metadata) if metadata is not None else [],
|
||||
user=data.get("user"),
|
||||
tags=data.get("tags")
|
||||
tags=data.get("tags"),
|
||||
parent_id=data.get("parent-id", ""),
|
||||
document_type=data.get("document-type", "source"),
|
||||
)
|
||||
|
||||
def from_pulsar(self, obj: DocumentMetadata) -> Dict[str, Any]:
|
||||
result = {}
|
||||
|
||||
|
||||
if obj.id:
|
||||
result["id"] = obj.id
|
||||
if obj.time:
|
||||
|
|
@ -42,7 +44,11 @@ class DocumentMetadataTranslator(Translator):
|
|||
result["user"] = obj.user
|
||||
if obj.tags is not None:
|
||||
result["tags"] = obj.tags
|
||||
|
||||
if obj.parent_id:
|
||||
result["parent-id"] = obj.parent_id
|
||||
if obj.document_type:
|
||||
result["document-type"] = obj.document_type
|
||||
|
||||
return result
|
||||
|
||||
|
||||
|
|
|
|||
|
|
@ -10,6 +10,9 @@ from ..core.topic import topic
|
|||
class Document:
|
||||
metadata: Metadata | None = None
|
||||
data: bytes = b""
|
||||
# For large document streaming: if document_id is set, the receiver should
|
||||
# fetch content from librarian instead of using inline data
|
||||
document_id: str = ""
|
||||
|
||||
############################################################################
|
||||
|
||||
|
|
@ -19,6 +22,9 @@ class Document:
|
|||
class TextDocument:
|
||||
metadata: Metadata | None = None
|
||||
text: bytes = b""
|
||||
# For large document streaming: if document_id is set, the receiver should
|
||||
# fetch content from librarian instead of using inline text
|
||||
document_id: str = ""
|
||||
|
||||
############################################################################
|
||||
|
||||
|
|
|
|||
|
|
@ -49,6 +49,36 @@ from ..core.metadata import Metadata
|
|||
# <- (processing_metadata[])
|
||||
# <- (error)
|
||||
|
||||
# begin-upload
|
||||
# -> (document_metadata, total_size, chunk_size)
|
||||
# <- (upload_id, chunk_size, total_chunks)
|
||||
# <- (error)
|
||||
|
||||
# upload-chunk
|
||||
# -> (upload_id, chunk_index, content)
|
||||
# <- (upload_id, chunk_index, chunks_received, total_chunks, bytes_received, total_bytes)
|
||||
# <- (error)
|
||||
|
||||
# complete-upload
|
||||
# -> (upload_id)
|
||||
# <- (document_id, object_id)
|
||||
# <- (error)
|
||||
|
||||
# abort-upload
|
||||
# -> (upload_id)
|
||||
# <- ()
|
||||
# <- (error)
|
||||
|
||||
# get-upload-status
|
||||
# -> (upload_id)
|
||||
# <- (upload_id, state, chunks_received, missing_chunks, total_chunks, bytes_received, total_bytes)
|
||||
# <- (error)
|
||||
|
||||
# list-uploads
|
||||
# -> (user)
|
||||
# <- (uploads[])
|
||||
# <- (error)
|
||||
|
||||
@dataclass
|
||||
class DocumentMetadata:
|
||||
id: str = ""
|
||||
|
|
@ -59,6 +89,9 @@ class DocumentMetadata:
|
|||
metadata: list[Triple] = field(default_factory=list)
|
||||
user: str = ""
|
||||
tags: list[str] = field(default_factory=list)
|
||||
# Child document support
|
||||
parent_id: str = "" # Empty for top-level docs, set for children
|
||||
document_type: str = "source" # "source" or "extracted"
|
||||
|
||||
@dataclass
|
||||
class ProcessingMetadata:
|
||||
|
|
@ -76,11 +109,33 @@ class Criteria:
|
|||
value: str = ""
|
||||
operator: str = ""
|
||||
|
||||
@dataclass
|
||||
class UploadProgress:
|
||||
"""Progress information for chunked uploads."""
|
||||
upload_id: str = ""
|
||||
chunks_received: int = 0
|
||||
total_chunks: int = 0
|
||||
bytes_received: int = 0
|
||||
total_bytes: int = 0
|
||||
|
||||
@dataclass
|
||||
class UploadSession:
|
||||
"""Information about an in-progress upload."""
|
||||
upload_id: str = ""
|
||||
document_id: str = ""
|
||||
document_metadata_json: str = "" # JSON-encoded DocumentMetadata
|
||||
total_size: int = 0
|
||||
chunk_size: int = 0
|
||||
total_chunks: int = 0
|
||||
chunks_received: int = 0
|
||||
created_at: str = ""
|
||||
|
||||
@dataclass
|
||||
class LibrarianRequest:
|
||||
# add-document, remove-document, update-document, get-document-metadata,
|
||||
# get-document-content, add-processing, remove-processing, list-documents,
|
||||
# list-processing
|
||||
# list-processing, begin-upload, upload-chunk, complete-upload, abort-upload,
|
||||
# get-upload-status, list-uploads
|
||||
operation: str = ""
|
||||
|
||||
# add-document, remove-document, update-document, get-document-metadata,
|
||||
|
|
@ -90,16 +145,16 @@ class LibrarianRequest:
|
|||
# add-processing, remove-processing
|
||||
processing_id: str = ""
|
||||
|
||||
# add-document, update-document
|
||||
# add-document, update-document, begin-upload
|
||||
document_metadata: DocumentMetadata | None = None
|
||||
|
||||
# add-processing
|
||||
processing_metadata: ProcessingMetadata | None = None
|
||||
|
||||
# add-document
|
||||
# add-document, upload-chunk
|
||||
content: bytes = b""
|
||||
|
||||
# list-documents, list-processing
|
||||
# list-documents, list-processing, list-uploads
|
||||
user: str = ""
|
||||
|
||||
# list-documents?, list-processing?
|
||||
|
|
@ -108,6 +163,19 @@ class LibrarianRequest:
|
|||
#
|
||||
criteria: list[Criteria] = field(default_factory=list)
|
||||
|
||||
# begin-upload
|
||||
total_size: int = 0
|
||||
chunk_size: int = 0
|
||||
|
||||
# upload-chunk, complete-upload, abort-upload, get-upload-status
|
||||
upload_id: str = ""
|
||||
|
||||
# upload-chunk, stream-document
|
||||
chunk_index: int = 0
|
||||
|
||||
# list-documents - whether to include child documents (default False)
|
||||
include_children: bool = False
|
||||
|
||||
@dataclass
|
||||
class LibrarianResponse:
|
||||
error: Error | None = None
|
||||
|
|
@ -116,6 +184,29 @@ class LibrarianResponse:
|
|||
document_metadatas: list[DocumentMetadata] = field(default_factory=list)
|
||||
processing_metadatas: list[ProcessingMetadata] = field(default_factory=list)
|
||||
|
||||
# begin-upload response
|
||||
upload_id: str = ""
|
||||
chunk_size: int = 0
|
||||
total_chunks: int = 0
|
||||
|
||||
# upload-chunk response
|
||||
chunk_index: int = 0
|
||||
chunks_received: int = 0
|
||||
bytes_received: int = 0
|
||||
total_bytes: int = 0
|
||||
|
||||
# complete-upload response
|
||||
document_id: str = ""
|
||||
object_id: str = ""
|
||||
|
||||
# get-upload-status response
|
||||
upload_state: str = "" # "in-progress", "completed", "expired"
|
||||
received_chunks: list[int] = field(default_factory=list)
|
||||
missing_chunks: list[int] = field(default_factory=list)
|
||||
|
||||
# list-uploads response
|
||||
upload_sessions: list[UploadSession] = field(default_factory=list)
|
||||
|
||||
# FIXME: Is this right? Using persistence on librarian so that
|
||||
# message chunking works
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue