mirror of
https://github.com/trustgraph-ai/trustgraph.git
synced 2026-06-10 07:15:13 +02:00
fix: large document handling and Cassandra query pagination (#969)
- Paginate heavy Cassandra reads (triples, graph/document embeddings) using synchronous session.execute() in run_in_executor with fetch_size paging, preventing a hang while materializing large result sets
- Fix document stream endpoint to use workspace-scoped librarian queues
- Add decoder error handling for PDF/OCR/unstructured processors
- Add WebSocket mux guards for missing auth fields
- Add null check in librarian document streaming
- Rewrite get_document_content CLI to stream via librarian
- Add Poppler dependency to unstructured container
This commit is contained in:
parent
7e1fb76bc9
commit
6b1dd16f9f
11 changed files with 166 additions and 74 deletions
|
|
@@ -5,7 +5,7 @@ Gets document content from the library by document ID.
|
|||
import argparse
|
||||
import os
|
||||
import sys
|
||||
from trustgraph.api import Api
|
||||
import requests
|
||||
|
||||
default_url = os.getenv("TRUSTGRAPH_URL", 'http://localhost:8088/')
|
||||
default_token = os.getenv("TRUSTGRAPH_TOKEN", None)
|
||||
|
|
@@ -13,15 +13,29 @@ default_workspace = os.getenv("TRUSTGRAPH_WORKSPACE", "default")
|
|||
|
||||
def get_content(url, document_id, output_file, token=None, workspace="default"):
|
||||
|
||||
api = Api(url, token=token, workspace=workspace).library()
|
||||
stream_url = url.rstrip("/") + "/api/v1/document-stream"
|
||||
|
||||
content = api.get_document_content(id=document_id)
|
||||
params = {
|
||||
"document-id": document_id,
|
||||
"workspace": workspace,
|
||||
}
|
||||
|
||||
headers = {}
|
||||
if token:
|
||||
headers["Authorization"] = f"Bearer {token}"
|
||||
|
||||
resp = requests.get(stream_url, params=params, headers=headers, stream=True)
|
||||
resp.raise_for_status()
|
||||
|
||||
if output_file:
|
||||
total = 0
|
||||
with open(output_file, 'wb') as f:
|
||||
f.write(content)
|
||||
print(f"Written {len(content)} bytes to {output_file}")
|
||||
for chunk in resp.iter_content(chunk_size=65536):
|
||||
f.write(chunk)
|
||||
total += len(chunk)
|
||||
print(f"Written {total} bytes to {output_file}")
|
||||
else:
|
||||
content = resp.content
|
||||
try:
|
||||
text = content.decode('utf-8')
|
||||
print(text)
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue