Add universal document decoder with multi-format support (#705)

Add universal document decoder with multi-format support
using 'unstructured'.

New universal decoder service powered by the unstructured
library, handling DOCX, XLSX, PPTX, HTML, Markdown, CSV, RTF,
ODT, EPUB and more through a single service. Tables are preserved
as HTML markup for better downstream extraction. Images are
stored in the librarian but excluded from the text
pipeline. Configurable section grouping strategies
(whole-document, heading, element-type, count, size) for non-page
formats. Page-based formats (PDF, PPTX, XLSX) are automatically
grouped by page.

All four decoders (PDF, Mistral OCR, Tesseract OCR, universal)
now share the "document-decoder" ident so they are
interchangeable. PDF-only decoders fetch document metadata to
check the MIME type and gracefully skip unsupported formats.

Librarian changes: removed MIME type whitelist validation so any
document format can be ingested. Simplified routing so text/plain
goes to text-load and everything else goes to document-load.
Removed dual inline/streaming data paths — documents always use
document_id for content retrieval.

New provenance entity types (tg:Section, tg:Image) and metadata
predicates (tg:elementTypes, tg:tableCount, tg:imageCount) for
richer explainability.

Universal decoder is in its own package (trustgraph-unstructured)
and container image (trustgraph-unstructured).
This commit is contained in:
cybermaggedon 2026-03-23 12:56:35 +00:00 committed by GitHub
parent 4609424afe
commit 5c6fe90fe2
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
25 changed files with 2247 additions and 79 deletions

View file

@ -37,7 +37,7 @@ logger = logging.getLogger(__name__)
COMPONENT_NAME = "mistral-ocr-decoder"
COMPONENT_VERSION = "1.0.0"
default_ident = "pdf-decoder"
default_ident = "document-decoder"
default_api_key = os.getenv("MISTRAL_TOKEN")
default_librarian_request_queue = librarian_request_queue
@ -165,6 +165,39 @@ class Processor(FlowProcessor):
else:
logger.warning(f"Received unexpected librarian response: {request_id}")
async def fetch_document_metadata(self, document_id, user, timeout=120):
    """
    Fetch document metadata from librarian via Pulsar.

    Sends a "get-document-metadata" request tagged with a fresh request
    id and waits for the matching response future to be resolved.

    Args:
        document_id: Identifier of the document to look up.
        user: User the document belongs to.
        timeout: Seconds to wait for the librarian response.

    Returns:
        The ``document_metadata`` field of the librarian response.

    Raises:
        RuntimeError: If the librarian reports an error or no response
            arrives within ``timeout`` seconds.
    """
    request_id = str(uuid.uuid4())

    request = LibrarianRequest(
        operation="get-document-metadata",
        document_id=document_id,
        user=user,
    )

    # We are inside a coroutine, so a running loop is guaranteed.
    future = asyncio.get_running_loop().create_future()

    # Register the future under the request id so the response can be
    # matched to it (presumably by the librarian response handler).
    self.pending_requests[request_id] = future

    try:
        await self.librarian_request_producer.send(
            request, properties={"id": request_id}
        )
        response = await asyncio.wait_for(future, timeout=timeout)
    except asyncio.TimeoutError:
        raise RuntimeError(f"Timeout fetching metadata for {document_id}")
    finally:
        # Always drop the entry: a failed send or a cancellation must not
        # leave a stale future behind. Harmless if already removed.
        self.pending_requests.pop(request_id, None)

    if response.error:
        raise RuntimeError(
            f"Librarian error: {response.error.type}: {response.error.message}"
        )

    return response.document_metadata
async def fetch_document_content(self, document_id, user, timeout=120):
"""
Fetch document content from librarian via Pulsar.
@ -326,6 +359,20 @@ class Processor(FlowProcessor):
logger.info(f"Decoding {v.metadata.id}...")
# Check MIME type if fetching from librarian
if v.document_id:
doc_meta = await self.fetch_document_metadata(
document_id=v.document_id,
user=v.metadata.user,
)
if doc_meta and doc_meta.kind and doc_meta.kind != "application/pdf":
logger.error(
f"Unsupported MIME type: {doc_meta.kind}. "
f"Mistral OCR decoder only handles application/pdf. "
f"Ignoring document {v.metadata.id}."
)
return
# Get PDF content - fetch from librarian or use inline data
if v.document_id:
logger.info(f"Fetching document {v.document_id} from librarian...")

View file

@ -34,7 +34,7 @@ COMPONENT_VERSION = "1.0.0"
# Module logger
logger = logging.getLogger(__name__)
default_ident = "pdf-decoder"
default_ident = "document-decoder"
default_librarian_request_queue = librarian_request_queue
default_librarian_response_queue = librarian_response_queue
@ -129,6 +129,39 @@ class Processor(FlowProcessor):
else:
logger.warning(f"Received unexpected librarian response: {request_id}")
async def fetch_document_metadata(self, document_id, user, timeout=120):
    """
    Fetch document metadata from librarian via Pulsar.

    Sends a "get-document-metadata" request tagged with a fresh request
    id and waits for the matching response future to be resolved.

    Args:
        document_id: Identifier of the document to look up.
        user: User the document belongs to.
        timeout: Seconds to wait for the librarian response.

    Returns:
        The ``document_metadata`` field of the librarian response.

    Raises:
        RuntimeError: If the librarian reports an error or no response
            arrives within ``timeout`` seconds.
    """
    request_id = str(uuid.uuid4())

    request = LibrarianRequest(
        operation="get-document-metadata",
        document_id=document_id,
        user=user,
    )

    # We are inside a coroutine, so a running loop is guaranteed.
    future = asyncio.get_running_loop().create_future()

    # Register the future under the request id so the response can be
    # matched to it (presumably by the librarian response handler).
    self.pending_requests[request_id] = future

    try:
        await self.librarian_request_producer.send(
            request, properties={"id": request_id}
        )
        response = await asyncio.wait_for(future, timeout=timeout)
    except asyncio.TimeoutError:
        raise RuntimeError(f"Timeout fetching metadata for {document_id}")
    finally:
        # Always drop the entry: a failed send or a cancellation must not
        # leave a stale future behind. Harmless if already removed.
        self.pending_requests.pop(request_id, None)

    if response.error:
        raise RuntimeError(
            f"Librarian error: {response.error.type}: {response.error.message}"
        )

    return response.document_metadata
async def fetch_document_content(self, document_id, user, timeout=120):
"""
Fetch document content from librarian via Pulsar.
@ -233,6 +266,20 @@ class Processor(FlowProcessor):
logger.info(f"Decoding PDF {v.metadata.id}...")
# Check MIME type if fetching from librarian
if v.document_id:
doc_meta = await self.fetch_document_metadata(
document_id=v.document_id,
user=v.metadata.user,
)
if doc_meta and doc_meta.kind and doc_meta.kind != "application/pdf":
logger.error(
f"Unsupported MIME type: {doc_meta.kind}. "
f"PDF decoder only handles application/pdf. "
f"Ignoring document {v.metadata.id}."
)
return
with tempfile.NamedTemporaryFile(delete_on_close=False, suffix='.pdf') as fp:
temp_path = fp.name

View file

@ -44,12 +44,8 @@ class Librarian:
async def add_document(self, request):
if request.document_metadata.kind not in (
"text/plain", "application/pdf"
):
raise RequestError(
"Invalid document kind: " + request.document_metadata.kind
)
if not request.document_metadata.kind:
raise RequestError("Document kind (MIME type) is required")
if await self.table_store.document_exists(
request.document_metadata.user,
@ -276,10 +272,8 @@ class Librarian:
"""
logger.info(f"Beginning chunked upload for document {request.document_metadata.id}")
if request.document_metadata.kind not in ("text/plain", "application/pdf"):
raise RequestError(
"Invalid document kind: " + request.document_metadata.kind
)
if not request.document_metadata.kind:
raise RequestError("Document kind (MIME type) is required")
if await self.table_store.document_exists(
request.document_metadata.user,

View file

@ -284,7 +284,6 @@ class Processor(AsyncProcessor):
pass
# Threshold for sending document_id instead of inline content (2MB)
STREAMING_THRESHOLD = 2 * 1024 * 1024
async def emit_document_provenance(self, document, processing, triples_queue):
"""
@ -360,10 +359,8 @@ class Processor(AsyncProcessor):
if document.kind == "text/plain":
kind = "text-load"
elif document.kind == "application/pdf":
kind = "document-load"
else:
raise RuntimeError("Document with a MIME type I don't know")
kind = "document-load"
q = flow["interfaces"][kind]
@ -374,57 +371,28 @@ class Processor(AsyncProcessor):
)
if kind == "text-load":
# For large text documents, send document_id for streaming retrieval
if len(content) >= self.STREAMING_THRESHOLD:
logger.info(f"Text document {document.id} is large ({len(content)} bytes), "
f"sending document_id for streaming retrieval")
doc = TextDocument(
metadata = Metadata(
id = document.id,
root = document.id,
user = processing.user,
collection = processing.collection
),
document_id = document.id,
text = b"", # Empty, receiver will fetch via librarian
)
else:
doc = TextDocument(
metadata = Metadata(
id = document.id,
root = document.id,
user = processing.user,
collection = processing.collection
),
text = content,
)
doc = TextDocument(
metadata = Metadata(
id = document.id,
root = document.id,
user = processing.user,
collection = processing.collection
),
document_id = document.id,
text = b"",
)
schema = TextDocument
else:
# For large PDF documents, send document_id for streaming retrieval
# instead of embedding the entire content in the message
if len(content) >= self.STREAMING_THRESHOLD:
logger.info(f"Document {document.id} is large ({len(content)} bytes), "
f"sending document_id for streaming retrieval")
doc = Document(
metadata = Metadata(
id = document.id,
root = document.id,
user = processing.user,
collection = processing.collection
),
document_id = document.id,
data = b"", # Empty data, receiver will fetch via API
)
else:
doc = Document(
metadata = Metadata(
id = document.id,
root = document.id,
user = processing.user,
collection = processing.collection
),
data = base64.b64encode(content).decode("utf-8")
)
doc = Document(
metadata = Metadata(
id = document.id,
root = document.id,
user = processing.user,
collection = processing.collection
),
document_id = document.id,
data = b"",
)
schema = Document
logger.debug(f"Submitting to queue {q}...")