mirror of
https://github.com/trustgraph-ai/trustgraph.git
synced 2026-06-17 10:55:14 +02:00
fix: large document handling and Cassandra query pagination (#969)
- Paginate heavy Cassandra reads (triples, graph/document embeddings) using synchronous session.execute() in run_in_executor with fetch_size paging, preventing materialization hang on large result sets - Fix document stream endpoint to use workspace-scoped librarian queues - Add decoder error handling for PDF/OCR/unstructured processors - Add WebSocket mux guards for missing auth fields - Add null check in librarian document streaming - Rewrite get_document_content CLI to stream via librarian - Add Poppler dependency to unstructured container
This commit is contained in:
parent
7e1fb76bc9
commit
6b1dd16f9f
11 changed files with 166 additions and 74 deletions
|
|
@ -7,7 +7,7 @@ FROM docker.io/fedora:42 AS base
|
||||||
|
|
||||||
ENV PIP_BREAK_SYSTEM_PACKAGES=1
|
ENV PIP_BREAK_SYSTEM_PACKAGES=1
|
||||||
|
|
||||||
RUN dnf install -y python3.13 libxcb mesa-libGL && \
|
RUN dnf install -y python3.13 libxcb mesa-libGL poppler-utils && \
|
||||||
alternatives --install /usr/bin/python python /usr/bin/python3.13 1 && \
|
alternatives --install /usr/bin/python python /usr/bin/python3.13 1 && \
|
||||||
python -m ensurepip --upgrade && \
|
python -m ensurepip --upgrade && \
|
||||||
pip3 install --no-cache-dir --upgrade 'pip>=26.0' 'setuptools>=78.1.1' && \
|
pip3 install --no-cache-dir --upgrade 'pip>=26.0' 'setuptools>=78.1.1' && \
|
||||||
|
|
|
||||||
|
|
@ -5,7 +5,7 @@ Gets document content from the library by document ID.
|
||||||
import argparse
|
import argparse
|
||||||
import os
|
import os
|
||||||
import sys
|
import sys
|
||||||
from trustgraph.api import Api
|
import requests
|
||||||
|
|
||||||
default_url = os.getenv("TRUSTGRAPH_URL", 'http://localhost:8088/')
|
default_url = os.getenv("TRUSTGRAPH_URL", 'http://localhost:8088/')
|
||||||
default_token = os.getenv("TRUSTGRAPH_TOKEN", None)
|
default_token = os.getenv("TRUSTGRAPH_TOKEN", None)
|
||||||
|
|
@ -13,15 +13,29 @@ default_workspace = os.getenv("TRUSTGRAPH_WORKSPACE", "default")
|
||||||
|
|
||||||
def get_content(url, document_id, output_file, token=None, workspace="default"):
|
def get_content(url, document_id, output_file, token=None, workspace="default"):
|
||||||
|
|
||||||
api = Api(url, token=token, workspace=workspace).library()
|
stream_url = url.rstrip("/") + "/api/v1/document-stream"
|
||||||
|
|
||||||
content = api.get_document_content(id=document_id)
|
params = {
|
||||||
|
"document-id": document_id,
|
||||||
|
"workspace": workspace,
|
||||||
|
}
|
||||||
|
|
||||||
|
headers = {}
|
||||||
|
if token:
|
||||||
|
headers["Authorization"] = f"Bearer {token}"
|
||||||
|
|
||||||
|
resp = requests.get(stream_url, params=params, headers=headers, stream=True)
|
||||||
|
resp.raise_for_status()
|
||||||
|
|
||||||
if output_file:
|
if output_file:
|
||||||
|
total = 0
|
||||||
with open(output_file, 'wb') as f:
|
with open(output_file, 'wb') as f:
|
||||||
f.write(content)
|
for chunk in resp.iter_content(chunk_size=65536):
|
||||||
print(f"Written {len(content)} bytes to {output_file}")
|
f.write(chunk)
|
||||||
|
total += len(chunk)
|
||||||
|
print(f"Written {total} bytes to {output_file}")
|
||||||
else:
|
else:
|
||||||
|
content = resp.content
|
||||||
try:
|
try:
|
||||||
text = content.decode('utf-8')
|
text = content.decode('utf-8')
|
||||||
print(text)
|
print(text)
|
||||||
|
|
|
||||||
|
|
@ -219,7 +219,14 @@ class Processor(FlowProcessor):
|
||||||
source_doc_id = v.document_id or v.metadata.id
|
source_doc_id = v.document_id or v.metadata.id
|
||||||
|
|
||||||
# Run OCR, get per-page markdown
|
# Run OCR, get per-page markdown
|
||||||
pages = self.ocr(blob)
|
try:
|
||||||
|
pages = self.ocr(blob)
|
||||||
|
except Exception as e:
|
||||||
|
logger.error(
|
||||||
|
f"Failed to decode PDF {source_doc_id}: "
|
||||||
|
f"{type(e).__name__}: {e}"
|
||||||
|
)
|
||||||
|
return
|
||||||
|
|
||||||
for markdown, page_num in pages:
|
for markdown, page_num in pages:
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -129,7 +129,15 @@ class Processor(FlowProcessor):
|
||||||
)
|
)
|
||||||
PyPDFLoader = _cls
|
PyPDFLoader = _cls
|
||||||
loader = PyPDFLoader(temp_path)
|
loader = PyPDFLoader(temp_path)
|
||||||
pages = loader.load()
|
try:
|
||||||
|
pages = loader.load()
|
||||||
|
except Exception as e:
|
||||||
|
source_doc_id = v.document_id or v.metadata.id
|
||||||
|
logger.error(
|
||||||
|
f"Failed to decode PDF {source_doc_id}: "
|
||||||
|
f"{type(e).__name__}: {e}"
|
||||||
|
)
|
||||||
|
return
|
||||||
|
|
||||||
# Get the source document ID
|
# Get the source document ID
|
||||||
source_doc_id = v.document_id or v.metadata.id
|
source_doc_id = v.document_id or v.metadata.id
|
||||||
|
|
|
||||||
|
|
@ -3,6 +3,7 @@ import asyncio
|
||||||
import uuid
|
import uuid
|
||||||
import logging
|
import logging
|
||||||
from . librarian import LibrarianRequestor
|
from . librarian import LibrarianRequestor
|
||||||
|
from ... schema import librarian_request_queue, librarian_response_queue
|
||||||
|
|
||||||
# Module logger
|
# Module logger
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
@ -23,10 +24,13 @@ class DocumentStreamExport:
|
||||||
|
|
||||||
response = await ok()
|
response = await ok()
|
||||||
|
|
||||||
|
uid = str(uuid.uuid4())
|
||||||
lr = LibrarianRequestor(
|
lr = LibrarianRequestor(
|
||||||
backend=self.backend,
|
backend=self.backend,
|
||||||
consumer="api-gateway-doc-stream-" + str(uuid.uuid4()),
|
consumer="api-gateway-doc-stream-" + uid,
|
||||||
subscriber="api-gateway-doc-stream-" + str(uuid.uuid4()),
|
subscriber="api-gateway-doc-stream-" + uid,
|
||||||
|
request_queue=f"{librarian_request_queue}:{workspace}",
|
||||||
|
response_queue=f"{librarian_response_queue}:{workspace}",
|
||||||
)
|
)
|
||||||
|
|
||||||
try:
|
try:
|
||||||
|
|
|
||||||
|
|
@ -288,6 +288,8 @@ class Mux:
|
||||||
await self.maybe_tidy_workers(workers)
|
await self.maybe_tidy_workers(workers)
|
||||||
|
|
||||||
async def responder(resp, fin):
|
async def responder(resp, fin):
|
||||||
|
if self.ws is None:
|
||||||
|
return
|
||||||
await self.ws.send_json({
|
await self.ws.send_json({
|
||||||
"id": id,
|
"id": id,
|
||||||
"response": resp,
|
"response": resp,
|
||||||
|
|
@ -321,6 +323,8 @@ class Mux:
|
||||||
)
|
)
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
|
if self.ws is None:
|
||||||
|
return
|
||||||
await self.ws.send_json({
|
await self.ws.send_json({
|
||||||
"id": id,
|
"id": id,
|
||||||
"error": {"message": str(e), "type": "error"},
|
"error": {"message": str(e), "type": "error"},
|
||||||
|
|
|
||||||
|
|
@ -162,6 +162,9 @@ class Librarian:
|
||||||
request.document_id
|
request.document_id
|
||||||
)
|
)
|
||||||
|
|
||||||
|
if object_id is None:
|
||||||
|
raise RequestError(f"Document not found: {request.document_id}")
|
||||||
|
|
||||||
content = await self.blob_store.get(
|
content = await self.blob_store.get(
|
||||||
object_id
|
object_id
|
||||||
)
|
)
|
||||||
|
|
|
||||||
|
|
@ -27,6 +27,8 @@ Notes:
|
||||||
|
|
||||||
import asyncio
|
import asyncio
|
||||||
|
|
||||||
|
from cassandra.query import SimpleStatement
|
||||||
|
|
||||||
|
|
||||||
async def async_execute(session, query, parameters=None):
|
async def async_execute(session, query, parameters=None):
|
||||||
"""Execute a CQL statement asynchronously.
|
"""Execute a CQL statement asynchronously.
|
||||||
|
|
@ -76,3 +78,36 @@ def _set_result_if_pending(fut, result):
|
||||||
def _set_exception_if_pending(fut, exc):
|
def _set_exception_if_pending(fut, exc):
|
||||||
if not fut.done():
|
if not fut.done():
|
||||||
fut.set_exception(exc)
|
fut.set_exception(exc)
|
||||||
|
|
||||||
|
|
||||||
|
async def async_execute_paged(session, query, parameters=None, fetch_size=100):
|
||||||
|
"""Execute a CQL query with page-by-page iteration.
|
||||||
|
|
||||||
|
Uses synchronous session.execute() inside run_in_executor so that
|
||||||
|
the driver's ResultSet paging works correctly without materialising
|
||||||
|
the entire result set in memory.
|
||||||
|
|
||||||
|
Yields one page of rows at a time (as a list).
|
||||||
|
"""
|
||||||
|
loop = asyncio.get_running_loop()
|
||||||
|
|
||||||
|
if isinstance(query, str):
|
||||||
|
stmt = SimpleStatement(query, fetch_size=fetch_size)
|
||||||
|
else:
|
||||||
|
stmt = query
|
||||||
|
stmt.fetch_size = fetch_size
|
||||||
|
|
||||||
|
def _fetch_all_pages():
|
||||||
|
pages = []
|
||||||
|
result_set = session.execute(stmt, parameters)
|
||||||
|
while True:
|
||||||
|
pages.append(list(result_set.current_rows))
|
||||||
|
if result_set.has_more_pages:
|
||||||
|
result_set.fetch_next_page()
|
||||||
|
else:
|
||||||
|
break
|
||||||
|
return pages
|
||||||
|
|
||||||
|
return await loop.run_in_executor(
|
||||||
|
None, _fetch_all_pages
|
||||||
|
)
|
||||||
|
|
|
||||||
|
|
@ -5,7 +5,7 @@ from .. schema import DocumentEmbeddings, ChunkEmbeddings
|
||||||
|
|
||||||
from cassandra.cluster import Cluster
|
from cassandra.cluster import Cluster
|
||||||
|
|
||||||
from . cassandra_async import async_execute
|
from . cassandra_async import async_execute, async_execute_paged
|
||||||
|
|
||||||
|
|
||||||
def term_to_tuple(term):
|
def term_to_tuple(term):
|
||||||
|
|
@ -398,7 +398,7 @@ class KnowledgeTableStore:
|
||||||
logger.debug("Get triples...")
|
logger.debug("Get triples...")
|
||||||
|
|
||||||
try:
|
try:
|
||||||
rows = await async_execute(
|
pages = await async_execute_paged(
|
||||||
self.cassandra,
|
self.cassandra,
|
||||||
self.get_triples_stmt,
|
self.get_triples_stmt,
|
||||||
(workspace, document_id),
|
(workspace, document_id),
|
||||||
|
|
@ -407,29 +407,30 @@ class KnowledgeTableStore:
|
||||||
logger.error("Exception occurred", exc_info=True)
|
logger.error("Exception occurred", exc_info=True)
|
||||||
raise
|
raise
|
||||||
|
|
||||||
for row in rows:
|
for page in pages:
|
||||||
|
for row in page:
|
||||||
|
|
||||||
if row[3]:
|
if row[3]:
|
||||||
triples = [
|
triples = [
|
||||||
Triple(
|
Triple(
|
||||||
s = tuple_to_term(elt[0], elt[1]),
|
s = tuple_to_term(elt[0], elt[1]),
|
||||||
p = tuple_to_term(elt[2], elt[3]),
|
p = tuple_to_term(elt[2], elt[3]),
|
||||||
o = tuple_to_term(elt[4], elt[5]),
|
o = tuple_to_term(elt[4], elt[5]),
|
||||||
|
)
|
||||||
|
for elt in row[3]
|
||||||
|
]
|
||||||
|
else:
|
||||||
|
triples = []
|
||||||
|
|
||||||
|
await receiver(
|
||||||
|
Triples(
|
||||||
|
metadata = Metadata(
|
||||||
|
id = document_id,
|
||||||
|
collection = "default",
|
||||||
|
),
|
||||||
|
triples = triples
|
||||||
)
|
)
|
||||||
for elt in row[3]
|
|
||||||
]
|
|
||||||
else:
|
|
||||||
triples = []
|
|
||||||
|
|
||||||
await receiver(
|
|
||||||
Triples(
|
|
||||||
metadata = Metadata(
|
|
||||||
id = document_id,
|
|
||||||
collection = "default", # FIXME: What to put here?
|
|
||||||
),
|
|
||||||
triples = triples
|
|
||||||
)
|
)
|
||||||
)
|
|
||||||
|
|
||||||
logger.debug("Done")
|
logger.debug("Done")
|
||||||
|
|
||||||
|
|
@ -438,7 +439,7 @@ class KnowledgeTableStore:
|
||||||
logger.debug("Get GE...")
|
logger.debug("Get GE...")
|
||||||
|
|
||||||
try:
|
try:
|
||||||
rows = await async_execute(
|
pages = await async_execute_paged(
|
||||||
self.cassandra,
|
self.cassandra,
|
||||||
self.get_graph_embeddings_stmt,
|
self.get_graph_embeddings_stmt,
|
||||||
(workspace, document_id),
|
(workspace, document_id),
|
||||||
|
|
@ -447,28 +448,29 @@ class KnowledgeTableStore:
|
||||||
logger.error("Exception occurred", exc_info=True)
|
logger.error("Exception occurred", exc_info=True)
|
||||||
raise
|
raise
|
||||||
|
|
||||||
for row in rows:
|
for page in pages:
|
||||||
|
for row in page:
|
||||||
|
|
||||||
if row[3]:
|
if row[3]:
|
||||||
entities = [
|
entities = [
|
||||||
EntityEmbeddings(
|
EntityEmbeddings(
|
||||||
entity = tuple_to_term(ent[0][0], ent[0][1]),
|
entity = tuple_to_term(ent[0][0], ent[0][1]),
|
||||||
vector = ent[1]
|
vector = ent[1]
|
||||||
|
)
|
||||||
|
for ent in row[3]
|
||||||
|
]
|
||||||
|
else:
|
||||||
|
entities = []
|
||||||
|
|
||||||
|
await receiver(
|
||||||
|
GraphEmbeddings(
|
||||||
|
metadata = Metadata(
|
||||||
|
id = document_id,
|
||||||
|
collection = "default",
|
||||||
|
),
|
||||||
|
entities = entities
|
||||||
)
|
)
|
||||||
for ent in row[3]
|
|
||||||
]
|
|
||||||
else:
|
|
||||||
entities = []
|
|
||||||
|
|
||||||
await receiver(
|
|
||||||
GraphEmbeddings(
|
|
||||||
metadata = Metadata(
|
|
||||||
id = document_id,
|
|
||||||
collection = "default", # FIXME: What to put here?
|
|
||||||
),
|
|
||||||
entities = entities
|
|
||||||
)
|
)
|
||||||
)
|
|
||||||
|
|
||||||
logger.debug("Done")
|
logger.debug("Done")
|
||||||
|
|
||||||
|
|
@ -477,7 +479,7 @@ class KnowledgeTableStore:
|
||||||
logger.debug("Get DE...")
|
logger.debug("Get DE...")
|
||||||
|
|
||||||
try:
|
try:
|
||||||
rows = await async_execute(
|
pages = await async_execute_paged(
|
||||||
self.cassandra,
|
self.cassandra,
|
||||||
self.get_document_embeddings_stmt,
|
self.get_document_embeddings_stmt,
|
||||||
(workspace, document_id),
|
(workspace, document_id),
|
||||||
|
|
@ -486,28 +488,29 @@ class KnowledgeTableStore:
|
||||||
logger.error("Exception occurred", exc_info=True)
|
logger.error("Exception occurred", exc_info=True)
|
||||||
raise
|
raise
|
||||||
|
|
||||||
for row in rows:
|
for page in pages:
|
||||||
|
for row in page:
|
||||||
|
|
||||||
if row[3]:
|
if row[3]:
|
||||||
chunks = [
|
chunks = [
|
||||||
ChunkEmbeddings(
|
ChunkEmbeddings(
|
||||||
chunk_id=ch[0],
|
chunk_id=ch[0],
|
||||||
vector=ch[1],
|
vector=ch[1],
|
||||||
|
)
|
||||||
|
for ch in row[3]
|
||||||
|
]
|
||||||
|
else:
|
||||||
|
chunks = []
|
||||||
|
|
||||||
|
await receiver(
|
||||||
|
DocumentEmbeddings(
|
||||||
|
metadata = Metadata(
|
||||||
|
id = document_id,
|
||||||
|
collection = "default",
|
||||||
|
),
|
||||||
|
chunks = chunks
|
||||||
)
|
)
|
||||||
for ch in row[3]
|
|
||||||
]
|
|
||||||
else:
|
|
||||||
chunks = []
|
|
||||||
|
|
||||||
await receiver(
|
|
||||||
DocumentEmbeddings(
|
|
||||||
metadata = Metadata(
|
|
||||||
id = document_id,
|
|
||||||
collection = "default",
|
|
||||||
),
|
|
||||||
chunks = chunks
|
|
||||||
)
|
)
|
||||||
)
|
|
||||||
|
|
||||||
logger.debug("Done")
|
logger.debug("Done")
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -107,7 +107,14 @@ class Processor(FlowProcessor):
|
||||||
# Get the source document ID
|
# Get the source document ID
|
||||||
source_doc_id = v.document_id or v.metadata.id
|
source_doc_id = v.document_id or v.metadata.id
|
||||||
|
|
||||||
pages = convert_from_bytes(blob)
|
try:
|
||||||
|
pages = convert_from_bytes(blob)
|
||||||
|
except Exception as e:
|
||||||
|
logger.error(
|
||||||
|
f"Failed to decode PDF {source_doc_id}: "
|
||||||
|
f"{type(e).__name__}: {e}"
|
||||||
|
)
|
||||||
|
return
|
||||||
|
|
||||||
for ix, page in enumerate(pages):
|
for ix, page in enumerate(pages):
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -418,7 +418,14 @@ class Processor(FlowProcessor):
|
||||||
doc_uri_str = document_uri(source_doc_id)
|
doc_uri_str = document_uri(source_doc_id)
|
||||||
|
|
||||||
# Extract elements using unstructured
|
# Extract elements using unstructured
|
||||||
elements = self.extract_elements(blob, mime_type)
|
try:
|
||||||
|
elements = self.extract_elements(blob, mime_type)
|
||||||
|
except Exception as e:
|
||||||
|
logger.error(
|
||||||
|
f"Failed to extract elements from {source_doc_id}: "
|
||||||
|
f"{type(e).__name__}: {e}"
|
||||||
|
)
|
||||||
|
return
|
||||||
|
|
||||||
if not elements:
|
if not elements:
|
||||||
logger.warning("No elements extracted from document")
|
logger.warning("No elements extracted from document")
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue