trustgraph/trustgraph-cli/trustgraph/cli/get_document_content.py
Cyber MacGeddon c3ce07d6f0 fix: large document handling and Cassandra query pagination
- Paginate heavy Cassandra reads (triples, graph/document embeddings)
  using synchronous session.execute() in run_in_executor with fetch_size
  paging, preventing materialization hang on large result sets
- Fix document stream endpoint to use workspace-scoped librarian queues
- Add decoder error handling for PDF/OCR/unstructured processors
- Add WebSocket mux guards for missing auth fields
- Add null check in librarian document streaming
- Rewrite get_document_content CLI to stream via librarian
- Add Poppler dependency to unstructured container
2026-06-01 22:38:43 +01:00

99 lines
2.4 KiB
Python

"""
Gets document content from the library by document ID.
"""
import argparse
import os
import sys
import requests
# Connection defaults for the CLI; each can be overridden via the environment.
default_url = os.environ.get("TRUSTGRAPH_URL", "http://localhost:8088/")
default_token = os.environ.get("TRUSTGRAPH_TOKEN")  # None when unset
default_workspace = os.environ.get("TRUSTGRAPH_WORKSPACE", "default")
def get_content(url, document_id, output_file, token=None, workspace="default"):
    """
    Fetch a document's content from the document-stream endpoint.

    Issues a GET to ``<url>/api/v1/document-stream`` with the document ID
    and workspace as query parameters.  When ``output_file`` is given, the
    response body is written to that file in 64 KiB chunks and a byte count
    is printed.  Otherwise the whole body is read into memory and written
    to stdout: as text if it decodes as UTF-8, as raw bytes if it does not.

    Args:
        url: Base API URL; trailing slashes are stripped before the
            endpoint path is appended.
        document_id: Value sent as the ``document-id`` query parameter.
        output_file: Path of the file to write, or a falsy value to write
            to stdout instead.
        token: Optional bearer token; when set it is sent in the
            ``Authorization`` header.
        workspace: Value sent as the ``workspace`` query parameter.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        OSError: If the output file cannot be opened or written.
    """
    stream_url = url.rstrip("/") + "/api/v1/document-stream"

    params = {
        "document-id": document_id,
        "workspace": workspace,
    }

    headers = {}
    if token:
        headers["Authorization"] = f"Bearer {token}"

    # With stream=True the connection is only released once the body has
    # been fully consumed or the response is closed.  The context manager
    # guarantees the close even when a write or read fails part-way.
    with requests.get(
        stream_url, params=params, headers=headers, stream=True,
    ) as resp:

        resp.raise_for_status()

        if output_file:
            total = 0
            with open(output_file, 'wb') as f:
                for chunk in resp.iter_content(chunk_size=65536):
                    f.write(chunk)
                    total += len(chunk)
            print(f"Written {total} bytes to {output_file}")
            return

        # stdout path: the body is needed in full to decide text vs binary.
        content = resp.content

    try:
        text = content.decode('utf-8')
    except UnicodeDecodeError:
        # Not valid UTF-8: note it on stderr and emit the raw bytes
        # through the binary layer of stdout.
        print(f"Binary content: {len(content)} bytes", file=sys.stderr)
        sys.stdout.buffer.write(content)
    else:
        print(text)
def main():
    """
    Command-line entry point for ``tg-get-document-content``.

    Parses the API URL, token, workspace, optional output file and the
    document ID from the command line, then retrieves the document via
    ``get_content``.  Any failure is reported on stderr and the process
    exits with status 1.
    """
    parser = argparse.ArgumentParser(
        prog='tg-get-document-content',
        description=__doc__,
    )

    parser.add_argument(
        '-u', '--api-url',
        default=default_url,
        help=f'API URL (default: {default_url})',
    )

    parser.add_argument(
        '-t', '--token',
        default=default_token,
        help='Authentication token (default: $TRUSTGRAPH_TOKEN)',
    )

    parser.add_argument(
        '-w', '--workspace',
        default=default_workspace,
        help=f'Workspace (default: {default_workspace})',
    )

    parser.add_argument(
        '-o', '--output',
        default=None,
        help='Output file (default: stdout)'
    )

    parser.add_argument(
        'document_id',
        help='Document ID (IRI) to retrieve',
    )

    args = parser.parse_args()

    try:
        get_content(
            url=args.api_url,
            document_id=args.document_id,
            output_file=args.output,
            token=args.token,
            workspace=args.workspace,
        )
    except Exception as e:
        # Document content may be going to stdout, so the error goes to
        # stderr to keep it out of piped/redirected output.  A non-zero
        # exit status lets calling scripts detect the failure.
        print("Exception:", e, file=sys.stderr, flush=True)
        sys.exit(1)
# Run the CLI only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()