Mirror of https://github.com/trustgraph-ai/trustgraph.git
Tech spec
BlobStore (trustgraph-flow/trustgraph/librarian/blob_store.py):
- get_stream() - yields document content in chunks for streaming retrieval
- create_multipart_upload() - initializes an S3 multipart upload, returns an upload_id
- upload_part() - uploads a single part, returns its etag
- complete_multipart_upload() - finalizes the upload using the part etags
- abort_multipart_upload() - cancels and cleans up (a usage sketch of the full flow follows this list)
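As a rough sketch of how these methods fit together (the endpoint, credentials, and the read_chunks helper are illustrative; the import path is inferred from the file location above):

    import uuid

    from trustgraph.librarian.blob_store import BlobStore

    def read_chunks(path, size=5 * 1024 * 1024):
        # S3 requires every part except the last to be at least 5 MiB.
        with open(path, "rb") as f:
            while chunk := f.read(size):
                yield chunk

    store = BlobStore(
        endpoint="minio:9000",          # placeholder connection details
        access_key="minio",
        secret_key="minio-secret",
        bucket_name="library",
    )

    object_id = uuid.uuid4()
    upload_id = store.create_multipart_upload(object_id, "application/pdf")

    parts = []
    try:
        # Each chunk becomes one S3 part; part numbers are 1-indexed.
        for number, chunk in enumerate(read_chunks("big.pdf"), start=1):
            etag = store.upload_part(object_id, upload_id, number, chunk)
            parts.append((number, etag))
        store.complete_multipart_upload(object_id, upload_id, parts)
    except Exception:
        # On failure, abort so S3 discards the partially uploaded parts.
        store.abort_multipart_upload(object_id, upload_id)
        raise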
Cassandra schema (trustgraph-flow/trustgraph/tables/library.py):
- New upload_session table with a 24-hour TTL (schema sketch below)
- Index on user for listing sessions
- Prepared statements for all operations
- Methods: create_upload_session(), get_upload_session(),
  update_upload_session_chunk(), delete_upload_session(),
  list_upload_sessions()
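A minimal sketch of what such a table could look like; the keyspace name and column layout here are assumptions, and the authoritative schema lives in tables/library.py:

    from cassandra.cluster import Cluster

    session = Cluster(["cassandra"]).connect("librarian")

    # 86400 seconds = the 24-hour TTL on upload sessions.
    session.execute("""
        CREATE TABLE IF NOT EXISTS upload_session (
            upload_id uuid PRIMARY KEY,
            user text,
            document_id uuid,
            kind text,
            chunks_received int,
            created timestamp
        ) WITH default_time_to_live = 86400
    """)

    # Secondary index so sessions can be listed per user.
    session.execute("CREATE INDEX IF NOT EXISTS ON upload_session (user)")

    # Prepared statement reused for every chunk update.
    update_chunk = session.prepare(
        "UPDATE upload_session SET chunks_received = ? WHERE upload_id = ?"
    )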
- Schema extended with UploadSession, UploadProgress, and new
request/response fields
- Librarian methods: begin_upload, upload_chunk, complete_upload,
abort_upload, get_upload_status, list_uploads
- Service routing for all new operations
- Python SDK with transparent chunked upload (usage sketch below):
  - add_document() auto-switches to chunked upload for files > 10MB
  - Progress callback support (on_progress)
  - get_pending_uploads(), get_upload_status(), abort_upload(),
    resume_upload()
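A sketch of the SDK surface as a caller might use it. Only the method names come from the list above; how the client object is obtained, and any keyword or attribute names beyond those methods, are assumptions:

    def upload_with_progress(librarian, path):
        # add_document() switches to the chunked path automatically for
        # files over 10MB; small files go through the single-shot path.
        def on_progress(sent, total):
            print(f"{sent}/{total} bytes uploaded")

        return librarian.add_document(path, on_progress=on_progress)

    def recover_uploads(librarian):
        # After an interrupted run, pending sessions can be resumed or
        # discarded rather than restarted from scratch.
        for upload in librarian.get_pending_uploads():
            print("pending upload:", librarian.get_upload_status(upload))
            librarian.resume_upload(upload)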
- Document table: Added parent_id and document_type columns with index
- Document schema (knowledge/document.py): Added document_id field for
streaming retrieval
- Librarian operations:
- add-child-document for extracted PDF pages
- list-children to get child documents
- stream-document for chunked content retrieval
- Cascade delete removes children when parent is deleted
- list-documents filters out child documents by default
- PDF decoder (decoding/pdf/pdf_decoder.py): Updated to stream large
  documents from the librarian API to a temp file (see the sketch after this list)
- Librarian service (librarian/service.py): Sends document_id instead of
  content for large PDFs (>2MB)
- Deprecated tools (load_pdf.py, load_text.py): Added deprecation
warnings directing users to tg-add-library-document +
tg-start-library-processing
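A sketch of the decoder-side streaming step. The stream-document operation and the document_id field come from the spec above; the stream_document() call signature and the librarian client object are assumptions:

    import tempfile

    async def stream_pdf_to_tempfile(librarian, document_id):
        # Spool the blob to disk so a large PDF never has to be held in
        # memory in one piece before the usual PDF machinery opens it.
        with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as tmp:
            async for chunk in librarian.stream_document(document_id):
                tmp.write(chunk)
            return tmp.name

When an incoming message carries a document_id rather than inline content (the >2MB case), the decoder would fetch the blob this way and then open the temp file as it previously opened the in-message content.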
Remove load_pdf and load_text utils
Move chunker/librarian comms to base class
Updating tests
from .. schema import LibrarianRequest, LibrarianResponse, Error
from .. knowledge import hash
from .. exceptions import RequestError

from minio import Minio
from minio.datatypes import Part
import time
import io
import logging
from typing import Iterator, List, Tuple
from uuid import UUID

# Module logger
logger = logging.getLogger(__name__)

class BlobStore:

    def __init__(
            self,
            endpoint, access_key, secret_key, bucket_name,
            use_ssl=False, region=None,
    ):

        self.client = Minio(
            endpoint = endpoint,
            access_key = access_key,
            secret_key = secret_key,
            secure = use_ssl,
            region = region,
        )

        self.bucket_name = bucket_name

        protocol = "https" if use_ssl else "http"
        logger.info(f"Connected to S3-compatible storage at {protocol}://{endpoint}")

        self.ensure_bucket()

    def ensure_bucket(self):

        # Make the bucket if it doesn't exist.
        found = self.client.bucket_exists(bucket_name=self.bucket_name)
        if not found:
            self.client.make_bucket(bucket_name=self.bucket_name)
            logger.info(f"Created bucket {self.bucket_name}")
        else:
            logger.debug(f"Bucket {self.bucket_name} already exists")

    async def add(self, object_id, blob, kind):

        # FIXME: Loop retry
        self.client.put_object(
            bucket_name = self.bucket_name,
            object_name = "doc/" + str(object_id),
            length = len(blob),
            data = io.BytesIO(blob),
            content_type = kind,
        )

        logger.debug("Add blob complete")

    async def remove(self, object_id):

        # FIXME: Loop retry
        self.client.remove_object(
            bucket_name = self.bucket_name,
            object_name = "doc/" + str(object_id),
        )

        logger.debug("Remove blob complete")

    async def get(self, object_id):

        # FIXME: Loop retry
        resp = self.client.get_object(
            bucket_name = self.bucket_name,
            object_name = "doc/" + str(object_id),
        )

        return resp.read()

    def get_stream(self, object_id, chunk_size: int = 1024 * 1024) -> Iterator[bytes]:
        """
        Stream document content in chunks.

        Yields chunks of the document, allowing processing without loading
        the entire document into memory.

        Args:
            object_id: The UUID of the document object
            chunk_size: Size of each chunk in bytes (default 1MB)

        Yields:
            Chunks of document content as bytes
        """
        resp = self.client.get_object(
            bucket_name=self.bucket_name,
            object_name="doc/" + str(object_id),
        )

        try:
            while True:
                chunk = resp.read(chunk_size)
                if not chunk:
                    break
                yield chunk
        finally:
            resp.close()
            resp.release_conn()

        logger.debug("Stream complete")

    def create_multipart_upload(self, object_id: UUID, kind: str) -> str:
        """
        Initialize a multipart upload.

        Args:
            object_id: The UUID for the new object
            kind: MIME type of the document

        Returns:
            The S3 upload_id for this multipart upload session
        """
        object_name = "doc/" + str(object_id)

        # Use minio's internal method to create multipart upload
        upload_id = self.client._create_multipart_upload(
            bucket_name=self.bucket_name,
            object_name=object_name,
            headers={"Content-Type": kind},
        )

        logger.info(f"Created multipart upload {upload_id} for {object_id}")
        return upload_id

    def upload_part(
            self,
            object_id: UUID,
            upload_id: str,
            part_number: int,
            data: bytes,
    ) -> str:
        """
        Upload a single part of a multipart upload.

        Args:
            object_id: The UUID of the object being uploaded
            upload_id: The S3 upload_id from create_multipart_upload
            part_number: Part number (1-indexed, as per S3 spec)
            data: The chunk data to upload

        Returns:
            The ETag for this part (needed for complete_multipart_upload)
        """
        object_name = "doc/" + str(object_id)

        etag = self.client._upload_part(
            bucket_name=self.bucket_name,
            object_name=object_name,
            data=data,
            headers={"Content-Length": str(len(data))},
            upload_id=upload_id,
            part_number=part_number,
        )

        logger.debug(f"Uploaded part {part_number} for {object_id}, etag={etag}")
        return etag

    def complete_multipart_upload(
            self,
            object_id: UUID,
            upload_id: str,
            parts: List[Tuple[int, str]],
    ) -> None:
        """
        Complete a multipart upload, assembling all parts into the final object.

        S3 coalesces the parts server-side - no data transfer through this client.

        Args:
            object_id: The UUID of the object
            upload_id: The S3 upload_id from create_multipart_upload
            parts: List of (part_number, etag) tuples in order
        """
        object_name = "doc/" + str(object_id)

        # Convert to Part objects as expected by minio
        part_objects = [
            Part(part_number, etag)
            for part_number, etag in parts
        ]

        self.client._complete_multipart_upload(
            bucket_name=self.bucket_name,
            object_name=object_name,
            upload_id=upload_id,
            parts=part_objects,
        )

        logger.info(f"Completed multipart upload for {object_id}")

    def abort_multipart_upload(self, object_id: UUID, upload_id: str) -> None:
        """
        Abort a multipart upload, cleaning up any uploaded parts.

        Args:
            object_id: The UUID of the object
            upload_id: The S3 upload_id from create_multipart_upload
        """
        object_name = "doc/" + str(object_id)

        self.client._abort_multipart_upload(
            bucket_name=self.bucket_name,
            object_name=object_name,
            upload_id=upload_id,
        )

        logger.info(f"Aborted multipart upload {upload_id} for {object_id}")
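For reference, a short consumer of get_stream() from the class above; the endpoint, credentials, and document id are placeholders:

    from uuid import uuid4

    store = BlobStore(
        endpoint="localhost:9000",      # placeholder endpoint/credentials
        access_key="minio",
        secret_key="minio-secret",
        bucket_name="library",
    )

    doc_id = uuid4()   # in practice, the id of a previously stored document

    # Copy a stored document to disk one 1MB chunk at a time.
    with open("/tmp/copy.pdf", "wb") as out:
        for chunk in store.get_stream(doc_id, chunk_size=1024 * 1024):
            out.write(chunk)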