mirror of
https://github.com/trustgraph-ai/trustgraph.git
synced 2026-05-18 20:05:13 +02:00
Incremental / large document loading (#659)
Tech spec
BlobStore (trustgraph-flow/trustgraph/librarian/blob_store.py):
- get_stream() - yields document content in chunks for streaming retrieval
- create_multipart_upload() - initializes S3 multipart upload, returns
upload_id
- upload_part() - uploads a single part, returns etag
- complete_multipart_upload() - finalizes upload with part etags
- abort_multipart_upload() - cancels and cleans up
Cassandra schema (trustgraph-flow/trustgraph/tables/library.py):
- New upload_session table with 24-hour TTL
- Index on user for listing sessions
- Prepared statements for all operations
- Methods: create_upload_session(), get_upload_session(),
update_upload_session_chunk(), delete_upload_session(),
list_upload_sessions()
- Schema extended with UploadSession, UploadProgress, and new
request/response fields
- Librarian methods: begin_upload, upload_chunk, complete_upload,
abort_upload, get_upload_status, list_uploads
- Service routing for all new operations
- Python SDK with transparent chunked upload:
- add_document() auto-switches to chunked upload for files > 10MB
- Progress callback support (on_progress)
- get_pending_uploads(), get_upload_status(), abort_upload(),
resume_upload()
- Document table: Added parent_id and document_type columns with index
- Document schema (knowledge/document.py): Added document_id field for
streaming retrieval
- Librarian operations:
- add-child-document for extracted PDF pages
- list-children to get child documents
- stream-document for chunked content retrieval
- Cascade delete removes children when parent is deleted
- list-documents filters children by default
- PDF decoder (decoding/pdf/pdf_decoder.py): Updated to stream large
documents from librarian API to temp file
- Librarian service (librarian/service.py): Sends document_id instead of
content for large PDFs (>2MB)
- Deprecated tools (load_pdf.py, load_text.py): Added deprecation
warnings directing users to tg-add-library-document +
tg-start-library-processing
Remove load_pdf and load_text utils
Move chunker/librarian comms to base class
Updating tests
This commit is contained in:
parent
a38ca9474f
commit
a630e143ef
21 changed files with 3164 additions and 650 deletions
|
|
@ -49,6 +49,36 @@ from ..core.metadata import Metadata
|
|||
# <- (processing_metadata[])
|
||||
# <- (error)
|
||||
|
||||
# begin-upload
|
||||
# -> (document_metadata, total_size, chunk_size)
|
||||
# <- (upload_id, chunk_size, total_chunks)
|
||||
# <- (error)
|
||||
|
||||
# upload-chunk
|
||||
# -> (upload_id, chunk_index, content)
|
||||
# <- (upload_id, chunk_index, chunks_received, total_chunks, bytes_received, total_bytes)
|
||||
# <- (error)
|
||||
|
||||
# complete-upload
|
||||
# -> (upload_id)
|
||||
# <- (document_id, object_id)
|
||||
# <- (error)
|
||||
|
||||
# abort-upload
|
||||
# -> (upload_id)
|
||||
# <- ()
|
||||
# <- (error)
|
||||
|
||||
# get-upload-status
|
||||
# -> (upload_id)
|
||||
# <- (upload_id, upload_state, chunks_received, received_chunks, missing_chunks, total_chunks, bytes_received, total_bytes)
|
||||
# <- (error)
|
||||
|
||||
# list-uploads
|
||||
# -> (user)
|
||||
# <- (uploads[])
|
||||
# <- (error)
|
||||
|
||||
@dataclass
|
||||
class DocumentMetadata:
|
||||
id: str = ""
|
||||
|
|
@ -59,6 +89,9 @@ class DocumentMetadata:
|
|||
metadata: list[Triple] = field(default_factory=list)
|
||||
user: str = ""
|
||||
tags: list[str] = field(default_factory=list)
|
||||
# Child document support
|
||||
parent_id: str = "" # Empty for top-level docs, set for children
|
||||
document_type: str = "source" # "source" or "extracted"
|
||||
|
||||
@dataclass
|
||||
class ProcessingMetadata:
|
||||
|
|
@ -76,11 +109,33 @@ class Criteria:
|
|||
value: str = ""
|
||||
operator: str = ""
|
||||
|
||||
@dataclass
|
||||
class UploadProgress:
|
||||
"""Progress information for chunked uploads."""
|
||||
upload_id: str = ""
|
||||
chunks_received: int = 0
|
||||
total_chunks: int = 0
|
||||
bytes_received: int = 0
|
||||
total_bytes: int = 0
|
||||
|
||||
@dataclass
|
||||
class UploadSession:
|
||||
"""Information about an in-progress upload."""
|
||||
upload_id: str = ""
|
||||
document_id: str = ""
|
||||
document_metadata_json: str = "" # JSON-encoded DocumentMetadata
|
||||
total_size: int = 0
|
||||
chunk_size: int = 0
|
||||
total_chunks: int = 0
|
||||
chunks_received: int = 0
|
||||
created_at: str = ""
|
||||
|
||||
@dataclass
|
||||
class LibrarianRequest:
|
||||
# add-document, remove-document, update-document, get-document-metadata,
|
||||
# get-document-content, add-processing, remove-processing, list-documents,
|
||||
# list-processing
|
||||
# list-processing, begin-upload, upload-chunk, complete-upload, abort-upload,
|
||||
# get-upload-status, list-uploads
|
||||
operation: str = ""
|
||||
|
||||
# add-document, remove-document, update-document, get-document-metadata,
|
||||
|
|
@ -90,16 +145,16 @@ class LibrarianRequest:
|
|||
# add-processing, remove-processing
|
||||
processing_id: str = ""
|
||||
|
||||
# add-document, update-document
|
||||
# add-document, update-document, begin-upload
|
||||
document_metadata: DocumentMetadata | None = None
|
||||
|
||||
# add-processing
|
||||
processing_metadata: ProcessingMetadata | None = None
|
||||
|
||||
# add-document
|
||||
# add-document, upload-chunk
|
||||
content: bytes = b""
|
||||
|
||||
# list-documents, list-processing
|
||||
# list-documents, list-processing, list-uploads
|
||||
user: str = ""
|
||||
|
||||
# list-documents?, list-processing?
|
||||
|
|
@ -108,6 +163,19 @@ class LibrarianRequest:
|
|||
#
|
||||
criteria: list[Criteria] = field(default_factory=list)
|
||||
|
||||
# begin-upload
|
||||
total_size: int = 0
|
||||
chunk_size: int = 0
|
||||
|
||||
# upload-chunk, complete-upload, abort-upload, get-upload-status
|
||||
upload_id: str = ""
|
||||
|
||||
# upload-chunk, stream-document
|
||||
chunk_index: int = 0
|
||||
|
||||
# list-documents - whether to include child documents (default False)
|
||||
include_children: bool = False
|
||||
|
||||
@dataclass
|
||||
class LibrarianResponse:
|
||||
error: Error | None = None
|
||||
|
|
@ -116,6 +184,29 @@ class LibrarianResponse:
|
|||
document_metadatas: list[DocumentMetadata] = field(default_factory=list)
|
||||
processing_metadatas: list[ProcessingMetadata] = field(default_factory=list)
|
||||
|
||||
# begin-upload response
|
||||
upload_id: str = ""
|
||||
chunk_size: int = 0
|
||||
total_chunks: int = 0
|
||||
|
||||
# upload-chunk response
|
||||
chunk_index: int = 0
|
||||
chunks_received: int = 0
|
||||
bytes_received: int = 0
|
||||
total_bytes: int = 0
|
||||
|
||||
# complete-upload response
|
||||
document_id: str = ""
|
||||
object_id: str = ""
|
||||
|
||||
# get-upload-status response
|
||||
upload_state: str = "" # "in-progress", "completed", "expired"
|
||||
received_chunks: list[int] = field(default_factory=list)
|
||||
missing_chunks: list[int] = field(default_factory=list)
|
||||
|
||||
# list-uploads response
|
||||
upload_sessions: list[UploadSession] = field(default_factory=list)
|
||||
|
||||
# FIXME: Is this right? Using persistence on librarian so that
|
||||
# message chunking works
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue