fix: large document handling and Cassandra query pagination (#969)

- Paginate heavy Cassandra reads (triples, graph/document embeddings)
  using synchronous session.execute() in run_in_executor with fetch_size
  paging, preventing materialization hang on large result sets
- Fix document stream endpoint to use workspace-scoped librarian queues
- Add decoder error handling for PDF/OCR/unstructured processors
- Add WebSocket mux guards for missing auth fields
- Add null check in librarian document streaming
- Rewrite get_document_content CLI to stream via librarian
- Add Poppler dependency to unstructured container
This commit is contained in:
cybermaggedon 2026-06-01 22:39:30 +01:00 committed by GitHub
parent 7e1fb76bc9
commit 6b1dd16f9f
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
11 changed files with 166 additions and 74 deletions

View file

@ -219,7 +219,14 @@ class Processor(FlowProcessor):
source_doc_id = v.document_id or v.metadata.id
# Run OCR, get per-page markdown
pages = self.ocr(blob)
try:
pages = self.ocr(blob)
except Exception as e:
logger.error(
f"Failed to decode PDF {source_doc_id}: "
f"{type(e).__name__}: {e}"
)
return
for markdown, page_num in pages:

View file

@ -129,7 +129,15 @@ class Processor(FlowProcessor):
)
PyPDFLoader = _cls
loader = PyPDFLoader(temp_path)
pages = loader.load()
try:
pages = loader.load()
except Exception as e:
source_doc_id = v.document_id or v.metadata.id
logger.error(
f"Failed to decode PDF {source_doc_id}: "
f"{type(e).__name__}: {e}"
)
return
# Get the source document ID
source_doc_id = v.document_id or v.metadata.id

View file

@ -3,6 +3,7 @@ import asyncio
import uuid
import logging
from . librarian import LibrarianRequestor
from ... schema import librarian_request_queue, librarian_response_queue
# Module logger
logger = logging.getLogger(__name__)
@ -23,10 +24,13 @@ class DocumentStreamExport:
response = await ok()
uid = str(uuid.uuid4())
lr = LibrarianRequestor(
backend=self.backend,
consumer="api-gateway-doc-stream-" + str(uuid.uuid4()),
subscriber="api-gateway-doc-stream-" + str(uuid.uuid4()),
consumer="api-gateway-doc-stream-" + uid,
subscriber="api-gateway-doc-stream-" + uid,
request_queue=f"{librarian_request_queue}:{workspace}",
response_queue=f"{librarian_response_queue}:{workspace}",
)
try:

View file

@ -288,6 +288,8 @@ class Mux:
await self.maybe_tidy_workers(workers)
async def responder(resp, fin):
if self.ws is None:
return
await self.ws.send_json({
"id": id,
"response": resp,
@ -321,6 +323,8 @@ class Mux:
)
except Exception as e:
if self.ws is None:
return
await self.ws.send_json({
"id": id,
"error": {"message": str(e), "type": "error"},

View file

@ -162,6 +162,9 @@ class Librarian:
request.document_id
)
if object_id is None:
raise RequestError(f"Document not found: {request.document_id}")
content = await self.blob_store.get(
object_id
)

View file

@ -27,6 +27,8 @@ Notes:
import asyncio
from cassandra.query import SimpleStatement
async def async_execute(session, query, parameters=None):
"""Execute a CQL statement asynchronously.
@ -76,3 +78,36 @@ def _set_result_if_pending(fut, result):
def _set_exception_if_pending(fut, exc):
if not fut.done():
fut.set_exception(exc)
async def async_execute_paged(session, query, parameters=None, fetch_size=100):
"""Execute a CQL query with page-by-page iteration.
Uses synchronous session.execute() inside run_in_executor so that
the driver's ResultSet paging works correctly without materialising
the entire result set in memory.
Yields one page of rows at a time (as a list).
"""
loop = asyncio.get_running_loop()
if isinstance(query, str):
stmt = SimpleStatement(query, fetch_size=fetch_size)
else:
stmt = query
stmt.fetch_size = fetch_size
def _fetch_all_pages():
pages = []
result_set = session.execute(stmt, parameters)
while True:
pages.append(list(result_set.current_rows))
if result_set.has_more_pages:
result_set.fetch_next_page()
else:
break
return pages
return await loop.run_in_executor(
None, _fetch_all_pages
)

View file

@ -5,7 +5,7 @@ from .. schema import DocumentEmbeddings, ChunkEmbeddings
from cassandra.cluster import Cluster
from . cassandra_async import async_execute
from . cassandra_async import async_execute, async_execute_paged
def term_to_tuple(term):
@ -398,7 +398,7 @@ class KnowledgeTableStore:
logger.debug("Get triples...")
try:
rows = await async_execute(
pages = await async_execute_paged(
self.cassandra,
self.get_triples_stmt,
(workspace, document_id),
@ -407,29 +407,30 @@ class KnowledgeTableStore:
logger.error("Exception occurred", exc_info=True)
raise
for row in rows:
for page in pages:
for row in page:
if row[3]:
triples = [
Triple(
s = tuple_to_term(elt[0], elt[1]),
p = tuple_to_term(elt[2], elt[3]),
o = tuple_to_term(elt[4], elt[5]),
if row[3]:
triples = [
Triple(
s = tuple_to_term(elt[0], elt[1]),
p = tuple_to_term(elt[2], elt[3]),
o = tuple_to_term(elt[4], elt[5]),
)
for elt in row[3]
]
else:
triples = []
await receiver(
Triples(
metadata = Metadata(
id = document_id,
collection = "default",
),
triples = triples
)
for elt in row[3]
]
else:
triples = []
await receiver(
Triples(
metadata = Metadata(
id = document_id,
collection = "default", # FIXME: What to put here?
),
triples = triples
)
)
logger.debug("Done")
@ -438,7 +439,7 @@ class KnowledgeTableStore:
logger.debug("Get GE...")
try:
rows = await async_execute(
pages = await async_execute_paged(
self.cassandra,
self.get_graph_embeddings_stmt,
(workspace, document_id),
@ -447,28 +448,29 @@ class KnowledgeTableStore:
logger.error("Exception occurred", exc_info=True)
raise
for row in rows:
for page in pages:
for row in page:
if row[3]:
entities = [
EntityEmbeddings(
entity = tuple_to_term(ent[0][0], ent[0][1]),
vector = ent[1]
if row[3]:
entities = [
EntityEmbeddings(
entity = tuple_to_term(ent[0][0], ent[0][1]),
vector = ent[1]
)
for ent in row[3]
]
else:
entities = []
await receiver(
GraphEmbeddings(
metadata = Metadata(
id = document_id,
collection = "default",
),
entities = entities
)
for ent in row[3]
]
else:
entities = []
await receiver(
GraphEmbeddings(
metadata = Metadata(
id = document_id,
collection = "default", # FIXME: What to put here?
),
entities = entities
)
)
logger.debug("Done")
@ -477,7 +479,7 @@ class KnowledgeTableStore:
logger.debug("Get DE...")
try:
rows = await async_execute(
pages = await async_execute_paged(
self.cassandra,
self.get_document_embeddings_stmt,
(workspace, document_id),
@ -486,28 +488,29 @@ class KnowledgeTableStore:
logger.error("Exception occurred", exc_info=True)
raise
for row in rows:
for page in pages:
for row in page:
if row[3]:
chunks = [
ChunkEmbeddings(
chunk_id=ch[0],
vector=ch[1],
if row[3]:
chunks = [
ChunkEmbeddings(
chunk_id=ch[0],
vector=ch[1],
)
for ch in row[3]
]
else:
chunks = []
await receiver(
DocumentEmbeddings(
metadata = Metadata(
id = document_id,
collection = "default",
),
chunks = chunks
)
for ch in row[3]
]
else:
chunks = []
await receiver(
DocumentEmbeddings(
metadata = Metadata(
id = document_id,
collection = "default",
),
chunks = chunks
)
)
logger.debug("Done")