mirror of
https://github.com/trustgraph-ai/trustgraph.git
synced 2026-07-01 17:39:39 +02:00
Incremental / large document loading (#659)
Tech spec
BlobStore (trustgraph-flow/trustgraph/librarian/blob_store.py):
- get_stream() - yields document content in chunks for streaming retrieval
- create_multipart_upload() - initializes an S3 multipart upload and returns
the upload_id
- upload_part() - uploads a single part, returns etag
- complete_multipart_upload() - finalizes upload with part etags
- abort_multipart_upload() - cancels and cleans up
Cassandra schema (trustgraph-flow/trustgraph/tables/library.py):
- New upload_session table with 24-hour TTL
- Index on user for listing sessions
- Prepared statements for all operations
- Methods: create_upload_session(), get_upload_session(),
update_upload_session_chunk(), delete_upload_session(),
list_upload_sessions()
- Schema extended with UploadSession, UploadProgress, and new
request/response fields
- Librarian methods: begin_upload, upload_chunk, complete_upload,
abort_upload, get_upload_status, list_uploads
- Service routing for all new operations
- Python SDK with transparent chunked upload:
- add_document() auto-switches to chunked upload for files larger than 10MB
- Progress callback support (on_progress)
- get_pending_uploads(), get_upload_status(), abort_upload(),
resume_upload()
- Document table: Added parent_id and document_type columns with index
- Document schema (knowledge/document.py): Added document_id field for
streaming retrieval
- Librarian operations:
- add-child-document for extracted PDF pages
- list-children to get child documents
- stream-document for chunked content retrieval
- Cascade delete: deleting a parent document also removes its child documents
- list-documents excludes child documents by default (set include-children to return them)
- PDF decoder (decoding/pdf/pdf_decoder.py): Updated to stream large
documents from the librarian API to a temp file
- Librarian service (librarian/service.py): Sends document_id instead of
content for large PDFs (>2MB)
- Deprecated tools (load_pdf.py, load_text.py): Added deprecation
warnings directing users to tg-add-library-document +
tg-start-library-processing
Remove load_pdf and load_text utils
Move chunker/librarian comms to base class
Update tests
This commit is contained in:
parent
a38ca9474f
commit
a630e143ef
21 changed files with 3164 additions and 650 deletions
|
|
@ -44,14 +44,21 @@ class LibraryRequestTranslator(MessageTranslator):
|
|||
|
||||
return LibrarianRequest(
|
||||
operation=data.get("operation"),
|
||||
document_id=data.get("document-id"),
|
||||
processing_id=data.get("processing-id"),
|
||||
document_id=data.get("document-id", ""),
|
||||
processing_id=data.get("processing-id", ""),
|
||||
document_metadata=doc_metadata,
|
||||
processing_metadata=proc_metadata,
|
||||
content=content,
|
||||
user=data.get("user"),
|
||||
collection=data.get("collection"),
|
||||
criteria=criteria
|
||||
user=data.get("user", ""),
|
||||
collection=data.get("collection", ""),
|
||||
criteria=criteria,
|
||||
# Chunked upload fields
|
||||
total_size=data.get("total-size", 0),
|
||||
chunk_size=data.get("chunk-size", 0),
|
||||
upload_id=data.get("upload-id", ""),
|
||||
chunk_index=data.get("chunk-index", 0),
|
||||
# List documents filtering
|
||||
include_children=data.get("include-children", False),
|
||||
)
|
||||
|
||||
def from_pulsar(self, obj: LibrarianRequest) -> Dict[str, Any]:
|
||||
|
|
@ -98,25 +105,71 @@ class LibraryResponseTranslator(MessageTranslator):
|
|||
|
||||
def from_pulsar(self, obj: LibrarianResponse) -> Dict[str, Any]:
|
||||
result = {}
|
||||
|
||||
|
||||
if obj.error:
|
||||
result["error"] = {
|
||||
"type": obj.error.type,
|
||||
"message": obj.error.message,
|
||||
}
|
||||
|
||||
if obj.document_metadata:
|
||||
result["document-metadata"] = self.doc_metadata_translator.from_pulsar(obj.document_metadata)
|
||||
|
||||
|
||||
if obj.content:
|
||||
result["content"] = obj.content.decode("utf-8") if isinstance(obj.content, bytes) else obj.content
|
||||
|
||||
|
||||
if obj.document_metadatas is not None:
|
||||
result["document-metadatas"] = [
|
||||
self.doc_metadata_translator.from_pulsar(dm)
|
||||
for dm in obj.document_metadatas
|
||||
]
|
||||
|
||||
|
||||
if obj.processing_metadatas is not None:
|
||||
result["processing-metadatas"] = [
|
||||
self.proc_metadata_translator.from_pulsar(pm)
|
||||
for pm in obj.processing_metadatas
|
||||
]
|
||||
|
||||
|
||||
# Chunked upload response fields
|
||||
if obj.upload_id:
|
||||
result["upload-id"] = obj.upload_id
|
||||
if obj.chunk_size:
|
||||
result["chunk-size"] = obj.chunk_size
|
||||
if obj.total_chunks:
|
||||
result["total-chunks"] = obj.total_chunks
|
||||
if obj.chunk_index:
|
||||
result["chunk-index"] = obj.chunk_index
|
||||
if obj.chunks_received:
|
||||
result["chunks-received"] = obj.chunks_received
|
||||
if obj.bytes_received:
|
||||
result["bytes-received"] = obj.bytes_received
|
||||
if obj.total_bytes:
|
||||
result["total-bytes"] = obj.total_bytes
|
||||
if obj.document_id:
|
||||
result["document-id"] = obj.document_id
|
||||
if obj.object_id:
|
||||
result["object-id"] = obj.object_id
|
||||
if obj.upload_state:
|
||||
result["upload-state"] = obj.upload_state
|
||||
if obj.received_chunks:
|
||||
result["received-chunks"] = obj.received_chunks
|
||||
if obj.missing_chunks:
|
||||
result["missing-chunks"] = obj.missing_chunks
|
||||
if obj.upload_sessions:
|
||||
result["upload-sessions"] = [
|
||||
{
|
||||
"upload-id": s.upload_id,
|
||||
"document-id": s.document_id,
|
||||
"document-metadata-json": s.document_metadata_json,
|
||||
"total-size": s.total_size,
|
||||
"chunk-size": s.chunk_size,
|
||||
"total-chunks": s.total_chunks,
|
||||
"chunks-received": s.chunks_received,
|
||||
"created-at": s.created_at,
|
||||
}
|
||||
for s in obj.upload_sessions
|
||||
]
|
||||
|
||||
return result
|
||||
|
||||
def from_response_with_completion(self, obj: LibrarianResponse) -> Tuple[Dict[str, Any], bool]:
|
||||
|
|
|
|||
|
|
@ -20,12 +20,14 @@ class DocumentMetadataTranslator(Translator):
|
|||
comments=data.get("comments"),
|
||||
metadata=self.subgraph_translator.to_pulsar(metadata) if metadata is not None else [],
|
||||
user=data.get("user"),
|
||||
tags=data.get("tags")
|
||||
tags=data.get("tags"),
|
||||
parent_id=data.get("parent-id", ""),
|
||||
document_type=data.get("document-type", "source"),
|
||||
)
|
||||
|
||||
def from_pulsar(self, obj: DocumentMetadata) -> Dict[str, Any]:
|
||||
result = {}
|
||||
|
||||
|
||||
if obj.id:
|
||||
result["id"] = obj.id
|
||||
if obj.time:
|
||||
|
|
@ -42,7 +44,11 @@ class DocumentMetadataTranslator(Translator):
|
|||
result["user"] = obj.user
|
||||
if obj.tags is not None:
|
||||
result["tags"] = obj.tags
|
||||
|
||||
if obj.parent_id:
|
||||
result["parent-id"] = obj.parent_id
|
||||
if obj.document_type:
|
||||
result["document-type"] = obj.document_type
|
||||
|
||||
return result
|
||||
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue