mirror of
https://github.com/trustgraph-ai/trustgraph.git
synced 2026-06-10 07:15:13 +02:00
fix: large document handling and Cassandra query pagination (#969)
- Paginate heavy Cassandra reads (triples, graph/document embeddings) using synchronous session.execute() in run_in_executor with fetch_size paging, preventing materialization hang on large result sets - Fix document stream endpoint to use workspace-scoped librarian queues - Add decoder error handling for PDF/OCR/unstructured processors - Add WebSocket mux guards for missing auth fields - Add null check in librarian document streaming - Rewrite get_document_content CLI to stream via librarian - Add Poppler dependency to unstructured container
This commit is contained in:
parent
7e1fb76bc9
commit
6b1dd16f9f
11 changed files with 166 additions and 74 deletions
|
|
@ -7,7 +7,7 @@ FROM docker.io/fedora:42 AS base
|
|||
|
||||
ENV PIP_BREAK_SYSTEM_PACKAGES=1
|
||||
|
||||
RUN dnf install -y python3.13 libxcb mesa-libGL && \
|
||||
RUN dnf install -y python3.13 libxcb mesa-libGL poppler-utils && \
|
||||
alternatives --install /usr/bin/python python /usr/bin/python3.13 1 && \
|
||||
python -m ensurepip --upgrade && \
|
||||
pip3 install --no-cache-dir --upgrade 'pip>=26.0' 'setuptools>=78.1.1' && \
|
||||
|
|
|
|||
|
|
@ -5,7 +5,7 @@ Gets document content from the library by document ID.
|
|||
import argparse
|
||||
import os
|
||||
import sys
|
||||
from trustgraph.api import Api
|
||||
import requests
|
||||
|
||||
default_url = os.getenv("TRUSTGRAPH_URL", 'http://localhost:8088/')
|
||||
default_token = os.getenv("TRUSTGRAPH_TOKEN", None)
|
||||
|
|
@ -13,15 +13,29 @@ default_workspace = os.getenv("TRUSTGRAPH_WORKSPACE", "default")
|
|||
|
||||
def get_content(url, document_id, output_file, token=None, workspace="default"):
|
||||
|
||||
api = Api(url, token=token, workspace=workspace).library()
|
||||
stream_url = url.rstrip("/") + "/api/v1/document-stream"
|
||||
|
||||
content = api.get_document_content(id=document_id)
|
||||
params = {
|
||||
"document-id": document_id,
|
||||
"workspace": workspace,
|
||||
}
|
||||
|
||||
headers = {}
|
||||
if token:
|
||||
headers["Authorization"] = f"Bearer {token}"
|
||||
|
||||
resp = requests.get(stream_url, params=params, headers=headers, stream=True)
|
||||
resp.raise_for_status()
|
||||
|
||||
if output_file:
|
||||
total = 0
|
||||
with open(output_file, 'wb') as f:
|
||||
f.write(content)
|
||||
print(f"Written {len(content)} bytes to {output_file}")
|
||||
for chunk in resp.iter_content(chunk_size=65536):
|
||||
f.write(chunk)
|
||||
total += len(chunk)
|
||||
print(f"Written {total} bytes to {output_file}")
|
||||
else:
|
||||
content = resp.content
|
||||
try:
|
||||
text = content.decode('utf-8')
|
||||
print(text)
|
||||
|
|
|
|||
|
|
@ -219,7 +219,14 @@ class Processor(FlowProcessor):
|
|||
source_doc_id = v.document_id or v.metadata.id
|
||||
|
||||
# Run OCR, get per-page markdown
|
||||
pages = self.ocr(blob)
|
||||
try:
|
||||
pages = self.ocr(blob)
|
||||
except Exception as e:
|
||||
logger.error(
|
||||
f"Failed to decode PDF {source_doc_id}: "
|
||||
f"{type(e).__name__}: {e}"
|
||||
)
|
||||
return
|
||||
|
||||
for markdown, page_num in pages:
|
||||
|
||||
|
|
|
|||
|
|
@ -129,7 +129,15 @@ class Processor(FlowProcessor):
|
|||
)
|
||||
PyPDFLoader = _cls
|
||||
loader = PyPDFLoader(temp_path)
|
||||
pages = loader.load()
|
||||
try:
|
||||
pages = loader.load()
|
||||
except Exception as e:
|
||||
source_doc_id = v.document_id or v.metadata.id
|
||||
logger.error(
|
||||
f"Failed to decode PDF {source_doc_id}: "
|
||||
f"{type(e).__name__}: {e}"
|
||||
)
|
||||
return
|
||||
|
||||
# Get the source document ID
|
||||
source_doc_id = v.document_id or v.metadata.id
|
||||
|
|
|
|||
|
|
@ -3,6 +3,7 @@ import asyncio
|
|||
import uuid
|
||||
import logging
|
||||
from . librarian import LibrarianRequestor
|
||||
from ... schema import librarian_request_queue, librarian_response_queue
|
||||
|
||||
# Module logger
|
||||
logger = logging.getLogger(__name__)
|
||||
|
|
@ -23,10 +24,13 @@ class DocumentStreamExport:
|
|||
|
||||
response = await ok()
|
||||
|
||||
uid = str(uuid.uuid4())
|
||||
lr = LibrarianRequestor(
|
||||
backend=self.backend,
|
||||
consumer="api-gateway-doc-stream-" + str(uuid.uuid4()),
|
||||
subscriber="api-gateway-doc-stream-" + str(uuid.uuid4()),
|
||||
consumer="api-gateway-doc-stream-" + uid,
|
||||
subscriber="api-gateway-doc-stream-" + uid,
|
||||
request_queue=f"{librarian_request_queue}:{workspace}",
|
||||
response_queue=f"{librarian_response_queue}:{workspace}",
|
||||
)
|
||||
|
||||
try:
|
||||
|
|
|
|||
|
|
@ -288,6 +288,8 @@ class Mux:
|
|||
await self.maybe_tidy_workers(workers)
|
||||
|
||||
async def responder(resp, fin):
|
||||
if self.ws is None:
|
||||
return
|
||||
await self.ws.send_json({
|
||||
"id": id,
|
||||
"response": resp,
|
||||
|
|
@ -321,6 +323,8 @@ class Mux:
|
|||
)
|
||||
|
||||
except Exception as e:
|
||||
if self.ws is None:
|
||||
return
|
||||
await self.ws.send_json({
|
||||
"id": id,
|
||||
"error": {"message": str(e), "type": "error"},
|
||||
|
|
|
|||
|
|
@ -162,6 +162,9 @@ class Librarian:
|
|||
request.document_id
|
||||
)
|
||||
|
||||
if object_id is None:
|
||||
raise RequestError(f"Document not found: {request.document_id}")
|
||||
|
||||
content = await self.blob_store.get(
|
||||
object_id
|
||||
)
|
||||
|
|
|
|||
|
|
@ -27,6 +27,8 @@ Notes:
|
|||
|
||||
import asyncio
|
||||
|
||||
from cassandra.query import SimpleStatement
|
||||
|
||||
|
||||
async def async_execute(session, query, parameters=None):
|
||||
"""Execute a CQL statement asynchronously.
|
||||
|
|
@ -76,3 +78,36 @@ def _set_result_if_pending(fut, result):
|
|||
def _set_exception_if_pending(fut, exc):
|
||||
if not fut.done():
|
||||
fut.set_exception(exc)
|
||||
|
||||
|
||||
async def async_execute_paged(session, query, parameters=None, fetch_size=100):
|
||||
"""Execute a CQL query with page-by-page iteration.
|
||||
|
||||
Uses synchronous session.execute() inside run_in_executor so that
|
||||
the driver's ResultSet paging works correctly without materialising
|
||||
the entire result set in memory.
|
||||
|
||||
Yields one page of rows at a time (as a list).
|
||||
"""
|
||||
loop = asyncio.get_running_loop()
|
||||
|
||||
if isinstance(query, str):
|
||||
stmt = SimpleStatement(query, fetch_size=fetch_size)
|
||||
else:
|
||||
stmt = query
|
||||
stmt.fetch_size = fetch_size
|
||||
|
||||
def _fetch_all_pages():
|
||||
pages = []
|
||||
result_set = session.execute(stmt, parameters)
|
||||
while True:
|
||||
pages.append(list(result_set.current_rows))
|
||||
if result_set.has_more_pages:
|
||||
result_set.fetch_next_page()
|
||||
else:
|
||||
break
|
||||
return pages
|
||||
|
||||
return await loop.run_in_executor(
|
||||
None, _fetch_all_pages
|
||||
)
|
||||
|
|
|
|||
|
|
@ -5,7 +5,7 @@ from .. schema import DocumentEmbeddings, ChunkEmbeddings
|
|||
|
||||
from cassandra.cluster import Cluster
|
||||
|
||||
from . cassandra_async import async_execute
|
||||
from . cassandra_async import async_execute, async_execute_paged
|
||||
|
||||
|
||||
def term_to_tuple(term):
|
||||
|
|
@ -398,7 +398,7 @@ class KnowledgeTableStore:
|
|||
logger.debug("Get triples...")
|
||||
|
||||
try:
|
||||
rows = await async_execute(
|
||||
pages = await async_execute_paged(
|
||||
self.cassandra,
|
||||
self.get_triples_stmt,
|
||||
(workspace, document_id),
|
||||
|
|
@ -407,29 +407,30 @@ class KnowledgeTableStore:
|
|||
logger.error("Exception occurred", exc_info=True)
|
||||
raise
|
||||
|
||||
for row in rows:
|
||||
for page in pages:
|
||||
for row in page:
|
||||
|
||||
if row[3]:
|
||||
triples = [
|
||||
Triple(
|
||||
s = tuple_to_term(elt[0], elt[1]),
|
||||
p = tuple_to_term(elt[2], elt[3]),
|
||||
o = tuple_to_term(elt[4], elt[5]),
|
||||
if row[3]:
|
||||
triples = [
|
||||
Triple(
|
||||
s = tuple_to_term(elt[0], elt[1]),
|
||||
p = tuple_to_term(elt[2], elt[3]),
|
||||
o = tuple_to_term(elt[4], elt[5]),
|
||||
)
|
||||
for elt in row[3]
|
||||
]
|
||||
else:
|
||||
triples = []
|
||||
|
||||
await receiver(
|
||||
Triples(
|
||||
metadata = Metadata(
|
||||
id = document_id,
|
||||
collection = "default",
|
||||
),
|
||||
triples = triples
|
||||
)
|
||||
for elt in row[3]
|
||||
]
|
||||
else:
|
||||
triples = []
|
||||
|
||||
await receiver(
|
||||
Triples(
|
||||
metadata = Metadata(
|
||||
id = document_id,
|
||||
collection = "default", # FIXME: What to put here?
|
||||
),
|
||||
triples = triples
|
||||
)
|
||||
)
|
||||
|
||||
logger.debug("Done")
|
||||
|
||||
|
|
@ -438,7 +439,7 @@ class KnowledgeTableStore:
|
|||
logger.debug("Get GE...")
|
||||
|
||||
try:
|
||||
rows = await async_execute(
|
||||
pages = await async_execute_paged(
|
||||
self.cassandra,
|
||||
self.get_graph_embeddings_stmt,
|
||||
(workspace, document_id),
|
||||
|
|
@ -447,28 +448,29 @@ class KnowledgeTableStore:
|
|||
logger.error("Exception occurred", exc_info=True)
|
||||
raise
|
||||
|
||||
for row in rows:
|
||||
for page in pages:
|
||||
for row in page:
|
||||
|
||||
if row[3]:
|
||||
entities = [
|
||||
EntityEmbeddings(
|
||||
entity = tuple_to_term(ent[0][0], ent[0][1]),
|
||||
vector = ent[1]
|
||||
if row[3]:
|
||||
entities = [
|
||||
EntityEmbeddings(
|
||||
entity = tuple_to_term(ent[0][0], ent[0][1]),
|
||||
vector = ent[1]
|
||||
)
|
||||
for ent in row[3]
|
||||
]
|
||||
else:
|
||||
entities = []
|
||||
|
||||
await receiver(
|
||||
GraphEmbeddings(
|
||||
metadata = Metadata(
|
||||
id = document_id,
|
||||
collection = "default",
|
||||
),
|
||||
entities = entities
|
||||
)
|
||||
for ent in row[3]
|
||||
]
|
||||
else:
|
||||
entities = []
|
||||
|
||||
await receiver(
|
||||
GraphEmbeddings(
|
||||
metadata = Metadata(
|
||||
id = document_id,
|
||||
collection = "default", # FIXME: What to put here?
|
||||
),
|
||||
entities = entities
|
||||
)
|
||||
)
|
||||
|
||||
logger.debug("Done")
|
||||
|
||||
|
|
@ -477,7 +479,7 @@ class KnowledgeTableStore:
|
|||
logger.debug("Get DE...")
|
||||
|
||||
try:
|
||||
rows = await async_execute(
|
||||
pages = await async_execute_paged(
|
||||
self.cassandra,
|
||||
self.get_document_embeddings_stmt,
|
||||
(workspace, document_id),
|
||||
|
|
@ -486,28 +488,29 @@ class KnowledgeTableStore:
|
|||
logger.error("Exception occurred", exc_info=True)
|
||||
raise
|
||||
|
||||
for row in rows:
|
||||
for page in pages:
|
||||
for row in page:
|
||||
|
||||
if row[3]:
|
||||
chunks = [
|
||||
ChunkEmbeddings(
|
||||
chunk_id=ch[0],
|
||||
vector=ch[1],
|
||||
if row[3]:
|
||||
chunks = [
|
||||
ChunkEmbeddings(
|
||||
chunk_id=ch[0],
|
||||
vector=ch[1],
|
||||
)
|
||||
for ch in row[3]
|
||||
]
|
||||
else:
|
||||
chunks = []
|
||||
|
||||
await receiver(
|
||||
DocumentEmbeddings(
|
||||
metadata = Metadata(
|
||||
id = document_id,
|
||||
collection = "default",
|
||||
),
|
||||
chunks = chunks
|
||||
)
|
||||
for ch in row[3]
|
||||
]
|
||||
else:
|
||||
chunks = []
|
||||
|
||||
await receiver(
|
||||
DocumentEmbeddings(
|
||||
metadata = Metadata(
|
||||
id = document_id,
|
||||
collection = "default",
|
||||
),
|
||||
chunks = chunks
|
||||
)
|
||||
)
|
||||
|
||||
logger.debug("Done")
|
||||
|
||||
|
|
|
|||
|
|
@ -107,7 +107,14 @@ class Processor(FlowProcessor):
|
|||
# Get the source document ID
|
||||
source_doc_id = v.document_id or v.metadata.id
|
||||
|
||||
pages = convert_from_bytes(blob)
|
||||
try:
|
||||
pages = convert_from_bytes(blob)
|
||||
except Exception as e:
|
||||
logger.error(
|
||||
f"Failed to decode PDF {source_doc_id}: "
|
||||
f"{type(e).__name__}: {e}"
|
||||
)
|
||||
return
|
||||
|
||||
for ix, page in enumerate(pages):
|
||||
|
||||
|
|
|
|||
|
|
@ -418,7 +418,14 @@ class Processor(FlowProcessor):
|
|||
doc_uri_str = document_uri(source_doc_id)
|
||||
|
||||
# Extract elements using unstructured
|
||||
elements = self.extract_elements(blob, mime_type)
|
||||
try:
|
||||
elements = self.extract_elements(blob, mime_type)
|
||||
except Exception as e:
|
||||
logger.error(
|
||||
f"Failed to extract elements from {source_doc_id}: "
|
||||
f"{type(e).__name__}: {e}"
|
||||
)
|
||||
return
|
||||
|
||||
if not elements:
|
||||
logger.warning("No elements extracted from document")
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue