mirror of
https://github.com/trustgraph-ai/trustgraph.git
synced 2026-06-10 23:35:14 +02:00
fix: large document handling and Cassandra query pagination (#969)
- Paginate heavy Cassandra reads (triples, graph/document embeddings) using synchronous session.execute() in run_in_executor with fetch_size paging, preventing a hang while materializing large result sets
- Fix document stream endpoint to use workspace-scoped librarian queues
- Add decoder error handling for PDF/OCR/unstructured processors
- Add WebSocket mux guards for missing auth fields
- Add null check in librarian document streaming
- Rewrite get_document_content CLI to stream via librarian
- Add Poppler dependency to unstructured container
This commit is contained in:
parent
7e1fb76bc9
commit
6b1dd16f9f
11 changed files with 166 additions and 74 deletions
|
|
@@ -418,7 +418,14 @@ class Processor(FlowProcessor):
|
|||
doc_uri_str = document_uri(source_doc_id)
|
||||
|
||||
# Extract elements using unstructured
|
||||
elements = self.extract_elements(blob, mime_type)
|
||||
try:
|
||||
elements = self.extract_elements(blob, mime_type)
|
||||
except Exception as e:
|
||||
logger.error(
|
||||
f"Failed to extract elements from {source_doc_id}: "
|
||||
f"{type(e).__name__}: {e}"
|
||||
)
|
||||
return
|
||||
|
||||
if not elements:
|
||||
logger.warning("No elements extracted from document")
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue