2025-12-17 21:40:43 +00:00
|
|
|
from dataclasses import dataclass, field
|
2025-08-04 21:42:57 +01:00
|
|
|
from ..core.primitives import Triple, Error
|
Pub/sub abstraction: decouple from Pulsar (#751)
Remove Pulsar-specific concepts from application code so that
the pub/sub backend is swappable via configuration.
Rename translators:
- to_pulsar/from_pulsar → decode/encode across all translator
classes, dispatch handlers, and tests (55+ files)
- from_response_with_completion → encode_with_completion
- Remove pulsar.schema.Record from translator base class
Queue naming (CLASS:TOPICSPACE:TOPIC):
- Replace topic() helper with queue() using new format:
flow:tg:name, request:tg:name, response:tg:name, state:tg:name
- Queue class implies persistence/TTL (no QoS in names)
- Update Pulsar backend map_topic() to parse new format
- Librarian queues use flow class (persistent, for chunking)
- Config push uses state class (persistent, last-value)
- Remove 15 dead topic imports from schema files
- Update init_trustgraph.py namespace: config → state
Confine Pulsar to pulsar_backend.py:
- Delete legacy PulsarClient class from pubsub.py
- Move add_args to add_pubsub_args() with standalone flag
for CLI tools (defaults to localhost)
- PulsarBackendConsumer.receive() catches _pulsar.Timeout,
raises standard TimeoutError
- Remove Pulsar imports from: async_processor, flow_processor,
log_level, all 11 client files, 4 storage writers, gateway
service, gateway config receiver
- Remove log_level/LoggerLevel from client API
- Rewrite tg-monitor-prompts to use backend abstraction
- Update tg-dump-queues to use add_pubsub_args
Also: pubsub-abstraction.md tech spec covering problem statement,
design goals, as-is requirements, candidate broker assessment,
approach, and implementation order.
2026-04-01 20:16:53 +01:00
|
|
|
from ..core.topic import queue
|
2025-08-04 21:42:57 +01:00
|
|
|
from ..core.metadata import Metadata
|
2025-12-17 21:40:43 +00:00
|
|
|
# Note: Document imports will be updated after knowledge schemas are converted
|
2025-02-11 16:01:03 +00:00
|
|
|
|
2025-05-04 22:26:19 +01:00
|
|
|
# add-document
|
|
|
|
|
# -> (document_id, document_metadata, content)
|
2025-03-11 16:52:59 +00:00
|
|
|
# <- ()
|
|
|
|
|
# <- (error)
|
|
|
|
|
|
2025-05-04 22:26:19 +01:00
|
|
|
# remove-document
|
|
|
|
|
# -> (document_id)
|
|
|
|
|
# <- ()
|
|
|
|
|
# <- (error)
|
|
|
|
|
|
|
|
|
|
# update-document
|
|
|
|
|
# -> (document_id, document_metadata)
|
|
|
|
|
# <- ()
|
|
|
|
|
# <- (error)
|
|
|
|
|
|
|
|
|
|
# get-document-metadata
|
|
|
|
|
# -> (document_id)
|
|
|
|
|
# <- (document_metadata)
|
|
|
|
|
# <- (error)
|
|
|
|
|
|
2026-04-02 12:47:16 +01:00
|
|
|
# get-document-content [DEPRECATED — use stream-document instead]
|
2025-05-04 22:26:19 +01:00
|
|
|
# -> (document_id)
|
|
|
|
|
# <- (content)
|
|
|
|
|
# <- (error)
|
2026-04-02 12:47:16 +01:00
|
|
|
# NOTE: Returns entire document in a single message. Fails for documents
|
|
|
|
|
# exceeding the broker's max message size. Use stream-document which
|
|
|
|
|
# returns content in chunks.
|
2025-05-04 22:26:19 +01:00
|
|
|
|
|
|
|
|
# add-processing
|
|
|
|
|
# -> (processing_id, processing_metadata)
|
|
|
|
|
# <- ()
|
|
|
|
|
# <- (error)
|
|
|
|
|
|
|
|
|
|
# remove-processing
|
|
|
|
|
# -> (processing_id)
|
|
|
|
|
# <- ()
|
|
|
|
|
# <- (error)
|
|
|
|
|
|
|
|
|
|
# list-documents
|
Per-workspace queue routing for workspace-scoped services (#862)
Workspace identity is now determined by queue infrastructure instead of
message body fields, closing a privilege-escalation vector where a caller
could spoof workspace in the request payload.
- Add WorkspaceProcessor base class: discovers workspaces from config at
startup, creates per-workspace consumers (queue:workspace), and manages
consumer lifecycle on workspace create/delete events
- Roll out to librarian, flow-svc, knowledge cores, and config-svc
- Config service gets a dual-queue regime: a system queue for
cross-workspace ops (getvalues-all-ws, bootstrapper writes to
__workspaces__) and per-workspace queues for tenant-scoped ops, with
workspace discovery from its own Cassandra store
- Remove workspace field from request schemas (FlowRequest,
LibrarianRequest, KnowledgeRequest, CollectionManagementRequest) and
from DocumentMetadata / ProcessingMetadata — table stores now accept
workspace as an explicit parameter
- Strip workspace encode/decode from all message translators and gateway
serializers
- Gateway enforces workspace existence: reject requests targeting
non-existent workspaces instead of routing to queues with no consumer
- Config service provisions new workspaces from __template__ on creation
- Add workspace lifecycle hooks to AsyncProcessor so any processor can
react to workspace create/delete without subclassing WorkspaceProcessor
2026-05-04 10:30:03 +01:00
|
|
|
# -> (collection?)
|
2025-05-04 22:26:19 +01:00
|
|
|
# <- (document_metadata[])
|
2025-03-11 16:52:59 +00:00
|
|
|
# <- (error)
|
|
|
|
|
|
2025-05-04 22:26:19 +01:00
|
|
|
# list-processing
|
Per-workspace queue routing for workspace-scoped services (#862)
Workspace identity is now determined by queue infrastructure instead of
message body fields, closing a privilege-escalation vector where a caller
could spoof workspace in the request payload.
- Add WorkspaceProcessor base class: discovers workspaces from config at
startup, creates per-workspace consumers (queue:workspace), and manages
consumer lifecycle on workspace create/delete events
- Roll out to librarian, flow-svc, knowledge cores, and config-svc
- Config service gets a dual-queue regime: a system queue for
cross-workspace ops (getvalues-all-ws, bootstrapper writes to
__workspaces__) and per-workspace queues for tenant-scoped ops, with
workspace discovery from its own Cassandra store
- Remove workspace field from request schemas (FlowRequest,
LibrarianRequest, KnowledgeRequest, CollectionManagementRequest) and
from DocumentMetadata / ProcessingMetadata — table stores now accept
workspace as an explicit parameter
- Strip workspace encode/decode from all message translators and gateway
serializers
- Gateway enforces workspace existence: reject requests targeting
non-existent workspaces instead of routing to queues with no consumer
- Config service provisions new workspaces from __template__ on creation
- Add workspace lifecycle hooks to AsyncProcessor so any processor can
react to workspace create/delete without subclassing WorkspaceProcessor
2026-05-04 10:30:03 +01:00
|
|
|
# -> (collection?)
|
2025-05-04 22:26:19 +01:00
|
|
|
# <- (processing_metadata[])
|
|
|
|
|
# <- (error)
|
|
|
|
|
|
Incremental / large document loading (#659)
Tech spec
BlobStore (trustgraph-flow/trustgraph/librarian/blob_store.py):
- get_stream() - yields document content in chunks for streaming retrieval
- create_multipart_upload() - initializes S3 multipart upload, returns
upload_id
- upload_part() - uploads a single part, returns etag
- complete_multipart_upload() - finalizes upload with part etags
- abort_multipart_upload() - cancels and cleans up
Cassandra schema (trustgraph-flow/trustgraph/tables/library.py):
- New upload_session table with 24-hour TTL
- Index on user for listing sessions
- Prepared statements for all operations
- Methods: create_upload_session(), get_upload_session(),
update_upload_session_chunk(), delete_upload_session(),
list_upload_sessions()
- Schema extended with UploadSession, UploadProgress, and new
request/response fields
- Librarian methods: begin_upload, upload_chunk, complete_upload,
abort_upload, get_upload_status, list_uploads
- Service routing for all new operations
- Python SDK with transparent chunked upload:
- add_document() auto-switches to chunked for files > 10MB
- Progress callback support (on_progress)
- get_pending_uploads(), get_upload_status(), abort_upload(),
resume_upload()
- Document table: Added parent_id and document_type columns with index
- Document schema (knowledge/document.py): Added document_id field for
streaming retrieval
- Librarian operations:
- add-child-document for extracted PDF pages
- list-children to get child documents
- stream-document for chunked content retrieval
- Cascade delete removes children when parent is deleted
- list-documents filters children by default
- PDF decoder (decoding/pdf/pdf_decoder.py): Updated to stream large
documents from librarian API to temp file
- Librarian service (librarian/service.py): Sends document_id instead of
content for large PDFs (>2MB)
- Deprecated tools (load_pdf.py, load_text.py): Added deprecation
warnings directing users to tg-add-library-document +
tg-start-library-processing
Remove load_pdf and load_text utils
Move chunker/librarian comms to base class
Updating tests
2026-03-04 16:57:58 +00:00
|
|
|
# begin-upload
|
|
|
|
|
# -> (document_metadata, total_size, chunk_size)
|
|
|
|
|
# <- (upload_id, chunk_size, total_chunks)
|
|
|
|
|
# <- (error)
|
|
|
|
|
|
|
|
|
|
# upload-chunk
|
|
|
|
|
# -> (upload_id, chunk_index, content)
|
|
|
|
|
# <- (upload_id, chunk_index, chunks_received, total_chunks, bytes_received, total_bytes)
|
|
|
|
|
# <- (error)
|
|
|
|
|
|
|
|
|
|
# complete-upload
|
|
|
|
|
# -> (upload_id)
|
|
|
|
|
# <- (document_id, object_id)
|
|
|
|
|
# <- (error)
|
|
|
|
|
|
|
|
|
|
# abort-upload
|
|
|
|
|
# -> (upload_id)
|
|
|
|
|
# <- ()
|
|
|
|
|
# <- (error)
|
|
|
|
|
|
|
|
|
|
# get-upload-status
|
|
|
|
|
# -> (upload_id)
|
|
|
|
|
# <- (upload_id, state, chunks_received, missing_chunks, total_chunks, bytes_received, total_bytes)
|
|
|
|
|
# <- (error)
|
|
|
|
|
|
|
|
|
|
# list-uploads
|
Per-workspace queue routing for workspace-scoped services (#862)
Workspace identity is now determined by queue infrastructure instead of
message body fields, closing a privilege-escalation vector where a caller
could spoof workspace in the request payload.
- Add WorkspaceProcessor base class: discovers workspaces from config at
startup, creates per-workspace consumers (queue:workspace), and manages
consumer lifecycle on workspace create/delete events
- Roll out to librarian, flow-svc, knowledge cores, and config-svc
- Config service gets a dual-queue regime: a system queue for
cross-workspace ops (getvalues-all-ws, bootstrapper writes to
__workspaces__) and per-workspace queues for tenant-scoped ops, with
workspace discovery from its own Cassandra store
- Remove workspace field from request schemas (FlowRequest,
LibrarianRequest, KnowledgeRequest, CollectionManagementRequest) and
from DocumentMetadata / ProcessingMetadata — table stores now accept
workspace as an explicit parameter
- Strip workspace encode/decode from all message translators and gateway
serializers
- Gateway enforces workspace existence: reject requests targeting
non-existent workspaces instead of routing to queues with no consumer
- Config service provisions new workspaces from __template__ on creation
- Add workspace lifecycle hooks to AsyncProcessor so any processor can
react to workspace create/delete without subclassing WorkspaceProcessor
2026-05-04 10:30:03 +01:00
|
|
|
# -> ()
|
Incremental / large document loading (#659)
Tech spec
BlobStore (trustgraph-flow/trustgraph/librarian/blob_store.py):
- get_stream() - yields document content in chunks for streaming retrieval
- create_multipart_upload() - initializes S3 multipart upload, returns
upload_id
- upload_part() - uploads a single part, returns etag
- complete_multipart_upload() - finalizes upload with part etags
- abort_multipart_upload() - cancels and cleans up
Cassandra schema (trustgraph-flow/trustgraph/tables/library.py):
- New upload_session table with 24-hour TTL
- Index on user for listing sessions
- Prepared statements for all operations
- Methods: create_upload_session(), get_upload_session(),
update_upload_session_chunk(), delete_upload_session(),
list_upload_sessions()
- Schema extended with UploadSession, UploadProgress, and new
request/response fields
- Librarian methods: begin_upload, upload_chunk, complete_upload,
abort_upload, get_upload_status, list_uploads
- Service routing for all new operations
- Python SDK with transparent chunked upload:
- add_document() auto-switches to chunked for files > 10MB
- Progress callback support (on_progress)
- get_pending_uploads(), get_upload_status(), abort_upload(),
resume_upload()
- Document table: Added parent_id and document_type columns with index
- Document schema (knowledge/document.py): Added document_id field for
streaming retrieval
- Librarian operations:
- add-child-document for extracted PDF pages
- list-children to get child documents
- stream-document for chunked content retrieval
- Cascade delete removes children when parent is deleted
- list-documents filters children by default
- PDF decoder (decoding/pdf/pdf_decoder.py): Updated to stream large
documents from librarian API to temp file
- Librarian service (librarian/service.py): Sends document_id instead of
content for large PDFs (>2MB)
- Deprecated tools (load_pdf.py, load_text.py): Added deprecation
warnings directing users to tg-add-library-document +
tg-start-library-processing
Remove load_pdf and load_text utils
Move chunker/librarian comms to base class
Updating tests
2026-03-04 16:57:58 +00:00
|
|
|
# <- (uploads[])
|
|
|
|
|
# <- (error)
|
|
|
|
|
|
2025-12-17 21:40:43 +00:00
|
|
|
@dataclass
class DocumentMetadata:
    """Metadata describing a single document.

    Carried in ``LibrarianRequest.document_metadata`` and returned in
    ``LibrarianResponse.document_metadata`` / ``document_metadatas``.
    Every field has a default, so an empty instance can be constructed
    and populated afterwards.
    """

    id: str = ""
    # NOTE(review): units/epoch of `time` are not visible here - confirm
    time: int = 0
    # NOTE(review): presumably the content type of the document - confirm
    kind: str = ""
    title: str = ""
    comments: str = ""
    # Triples attached to the document; a fresh list per instance
    metadata: list[Triple] = field(default_factory=list)
    tags: list[str] = field(default_factory=list)

    # Child document support
    parent_id: str = ""  # Empty for top-level docs, set for children

    # Document type vocabulary:
    #   "source"    - original uploaded document
    #   "page"      - page extracted from source (e.g., PDF page)
    #   "chunk"     - text chunk derived from page or source
    #   "extracted" - legacy value, kept for backwards compatibility
    document_type: str = "source"
|
2025-12-17 21:40:43 +00:00
|
|
|
|
|
|
|
|
@dataclass
class ProcessingMetadata:
    """Metadata describing one processing record for a document.

    Carried in ``LibrarianRequest.processing_metadata`` (add-processing)
    and returned in ``LibrarianResponse.processing_metadatas``
    (list-processing).  Every field has a default.
    """

    id: str = ""
    # Identifier of the document this processing record refers to
    document_id: str = ""
    # NOTE(review): units/epoch of `time` are not visible here - confirm
    time: int = 0
    flow: str = ""
    collection: str = ""
    # A fresh list per instance (never shared between instances)
    tags: list[str] = field(default_factory=list)
|
|
|
|
|
|
|
|
|
|
@dataclass
class Criteria:
    """A single key / value / operator triple.

    Used as the element type of ``LibrarianRequest.criteria``.
    NOTE(review): the set of accepted operator strings is not defined in
    this file - confirm against the service that consumes it.
    """

    key: str = ""
    value: str = ""
    operator: str = ""
|
|
|
|
|
|
Incremental / large document loading (#659)
Tech spec
BlobStore (trustgraph-flow/trustgraph/librarian/blob_store.py):
- get_stream() - yields document content in chunks for streaming retrieval
- create_multipart_upload() - initializes S3 multipart upload, returns
upload_id
- upload_part() - uploads a single part, returns etag
- complete_multipart_upload() - finalizes upload with part etags
- abort_multipart_upload() - cancels and cleans up
Cassandra schema (trustgraph-flow/trustgraph/tables/library.py):
- New upload_session table with 24-hour TTL
- Index on user for listing sessions
- Prepared statements for all operations
- Methods: create_upload_session(), get_upload_session(),
update_upload_session_chunk(), delete_upload_session(),
list_upload_sessions()
- Schema extended with UploadSession, UploadProgress, and new
request/response fields
- Librarian methods: begin_upload, upload_chunk, complete_upload,
abort_upload, get_upload_status, list_uploads
- Service routing for all new operations
- Python SDK with transparent chunked upload:
- add_document() auto-switches to chunked for files > 10MB
- Progress callback support (on_progress)
- get_pending_uploads(), get_upload_status(), abort_upload(),
resume_upload()
- Document table: Added parent_id and document_type columns with index
- Document schema (knowledge/document.py): Added document_id field for
streaming retrieval
- Librarian operations:
- add-child-document for extracted PDF pages
- list-children to get child documents
- stream-document for chunked content retrieval
- Cascade delete removes children when parent is deleted
- list-documents filters children by default
- PDF decoder (decoding/pdf/pdf_decoder.py): Updated to stream large
documents from librarian API to temp file
- Librarian service (librarian/service.py): Sends document_id instead of
content for large PDFs (>2MB)
- Deprecated tools (load_pdf.py, load_text.py): Added deprecation
warnings directing users to tg-add-library-document +
tg-start-library-processing
Remove load_pdf and load_text utils
Move chunker/librarian comms to base class
Updating tests
2026-03-04 16:57:58 +00:00
|
|
|
@dataclass
class UploadProgress:
    """Progress information for chunked uploads."""

    # Identifier of the upload this progress report belongs to
    upload_id: str = ""
    # Chunk counters: received so far / expected in total
    chunks_received: int = 0
    total_chunks: int = 0
    # Byte counters: received so far / expected in total
    bytes_received: int = 0
    total_bytes: int = 0
|
|
|
|
|
|
|
|
|
|
@dataclass
class UploadSession:
    """Information about an in-progress upload."""

    upload_id: str = ""
    document_id: str = ""
    document_metadata_json: str = ""  # JSON-encoded DocumentMetadata
    # Sizes as declared for the upload
    total_size: int = 0
    chunk_size: int = 0
    # Chunk counters: expected in total / received so far
    total_chunks: int = 0
    chunks_received: int = 0
    # NOTE(review): timestamp carried as a string; format not shown here
    created_at: str = ""
|
|
|
|
|
|
2025-12-17 21:40:43 +00:00
|
|
|
@dataclass
class LibrarianRequest:
    """Request message for the librarian service.

    A single message type serves every operation: ``operation`` selects
    the operation and the remaining fields carry its arguments.  The
    comment above each field lists the operations that use it; fields an
    operation does not use keep their defaults.
    """

    # add-document, remove-document, update-document, get-document-metadata,
    # get-document-content, add-processing, remove-processing, list-documents,
    # list-processing, begin-upload, upload-chunk, complete-upload, abort-upload,
    # get-upload-status, list-uploads
    operation: str = ""

    # add-document, remove-document, update-document, get-document-metadata,
    # get-document-content
    document_id: str = ""

    # add-processing, remove-processing
    processing_id: str = ""

    # add-document, update-document, begin-upload
    document_metadata: DocumentMetadata | None = None

    # add-processing
    processing_metadata: ProcessingMetadata | None = None

    # add-document, upload-chunk
    content: bytes = b""

    # list-documents?, list-processing?
    collection: str = ""

    # NOTE(review): no operation is documented for this field - presumably
    # filter criteria for the list operations; confirm
    criteria: list[Criteria] = field(default_factory=list)

    # begin-upload
    total_size: int = 0
    chunk_size: int = 0

    # upload-chunk, complete-upload, abort-upload, get-upload-status
    upload_id: str = ""

    # upload-chunk, stream-document
    chunk_index: int = 0

    # list-documents - whether to include child documents (default False)
    include_children: bool = False
|
|
|
|
|
|
2025-12-17 21:40:43 +00:00
|
|
|
@dataclass
class LibrarianResponse:
    """Response message from the librarian service.

    A single message type serves every operation.  ``error`` is set when
    the operation failed; otherwise the fields relevant to the requested
    operation are populated and the rest keep their defaults.
    """

    error: Error | None = None
    document_metadata: DocumentMetadata | None = None
    content: bytes = b""
    document_metadatas: list[DocumentMetadata] = field(default_factory=list)
    processing_metadatas: list[ProcessingMetadata] = field(default_factory=list)

    # begin-upload response
    upload_id: str = ""
    chunk_size: int = 0
    total_chunks: int = 0

    # upload-chunk response
    chunk_index: int = 0
    chunks_received: int = 0
    bytes_received: int = 0
    total_bytes: int = 0

    # complete-upload response
    document_id: str = ""
    object_id: str = ""

    # get-upload-status response
    upload_state: str = ""  # "in-progress", "completed", "expired"
    received_chunks: list[int] = field(default_factory=list)
    missing_chunks: list[int] = field(default_factory=list)

    # list-uploads response
    upload_sessions: list[UploadSession] = field(default_factory=list)

    # Protocol flag: True if this is the final response for a request.
    # Default True since most operations are single request/response.
    # Only stream-document sets False for intermediate chunks.
    is_final: bool = True
|
2026-03-09 12:36:10 +00:00
|
|
|
|
2025-05-06 00:28:20 +01:00
|
|
|
# FIXME: Is this right? Using persistence on librarian so that
|
|
|
|
|
# message chunking works
|
|
|
|
|
|
2026-04-02 12:47:16 +01:00
|
|
|
# Queue identifiers for librarian requests and responses, built with the
# project's queue() helper using the 'request' and 'response' classes.
librarian_request_queue = queue('librarian', cls='request')
librarian_response_queue = queue('librarian', cls='response')
|