Add universal document decoder with multi-format support (#705)

Add universal document decoder with multi-format support using 'unstructured'. New universal decoder service powered by the unstructured library, handling DOCX, XLSX, PPTX, HTML, Markdown, CSV, RTF, ODT, EPUB and more through a single service. Tables are preserved as HTML markup for better downstream extraction. Images are stored in the librarian but excluded from the text pipeline. Configurable section grouping strategies (whole-document, heading, element-type, count, size) for non-page formats. Page-based formats (PDF, PPTX, XLSX) are automatically grouped by page. All four decoders (PDF, Mistral OCR, Tesseract OCR, universal) now share the "document-decoder" ident so they are interchangeable. PDF-only decoders fetch document metadata to check MIME type and gracefully skip unsupported formats. Librarian changes: removed MIME type whitelist validation so any document format can be ingested. Simplified routing so text/plain goes to text-load and everything else goes to document-load. Removed dual inline/streaming data paths — documents always use document_id for content retrieval. New provenance entity types (tg:Section, tg:Image) and metadata predicates (tg:elementTypes, tg:tableCount, tg:imageCount) for richer explainability. Universal decoder is in its own package (trustgraph-unstructured) and container image (trustgraph-unstructured).
2026-06-09 06:45:13 +02:00 · 2026-03-23 12:56:35 +00:00 · 2026-03-23 12:56:35 +00:00 · 5c6fe90fe2
commit 5c6fe90fe2
parent 4609424afe
25 changed files with 2247 additions and 79 deletions
--- a/trustgraph-flow/trustgraph/decoding/mistral_ocr/processor.py
+++ b/trustgraph-flow/trustgraph/decoding/mistral_ocr/processor.py
@ -37,7 +37,7 @@ logger = logging.getLogger(__name__)
 COMPONENT_NAME = "mistral-ocr-decoder"
 COMPONENT_VERSION = "1.0.0"

-default_ident = "pdf-decoder"
+default_ident = "document-decoder"
 default_api_key = os.getenv("MISTRAL_TOKEN")

 default_librarian_request_queue = librarian_request_queue
@ -165,6 +165,39 @@ class Processor(FlowProcessor):
        else:
            logger.warning(f"Received unexpected librarian response: {request_id}")

+    async def fetch_document_metadata(self, document_id, user, timeout=120):
+        """
+        Fetch document metadata from librarian via Pulsar.
+
+        Raises RuntimeError on a librarian error response or on timeout.
+        """
+        request_id = str(uuid.uuid4())
+        request = LibrarianRequest(
+            operation="get-document-metadata",
+            document_id=document_id,
+            user=user,
+        )
+        future = asyncio.get_running_loop().create_future()
+        self.pending_requests[request_id] = future
+
+        try:
+            await self.librarian_request_producer.send(
+                request, properties={"id": request_id}
+            )
+            response = await asyncio.wait_for(future, timeout=timeout)
+        except asyncio.TimeoutError:
+            raise RuntimeError(f"Timeout fetching metadata for {document_id}")
+        finally:
+            # Drop the entry on every exit path (send failure, cancel, timeout)
+            self.pending_requests.pop(request_id, None)
+
+        if response.error:
+            raise RuntimeError(
+                f"Librarian error: {response.error.type}: {response.error.message}"
+            )
+
+        return response.document_metadata
+
    async def fetch_document_content(self, document_id, user, timeout=120):
        """
        Fetch document content from librarian via Pulsar.
@ -326,6 +359,20 @@ class Processor(FlowProcessor):

        logger.info(f"Decoding {v.metadata.id}...")

+        # Check MIME type if fetching from librarian
+        if v.document_id:
+            doc_meta = await self.fetch_document_metadata(
+                document_id=v.document_id,
+                user=v.metadata.user,
+            )
+            if doc_meta and doc_meta.kind and doc_meta.kind != "application/pdf":
+                logger.error(
+                    f"Unsupported MIME type: {doc_meta.kind}. "
+                    f"Mistral OCR decoder only handles application/pdf. "
+                    f"Ignoring document {v.metadata.id}."
+                )
+                return
+
        # Get PDF content - fetch from librarian or use inline data
        if v.document_id:
            logger.info(f"Fetching document {v.document_id} from librarian...")
--- a/trustgraph-flow/trustgraph/decoding/pdf/pdf_decoder.py
+++ b/trustgraph-flow/trustgraph/decoding/pdf/pdf_decoder.py
@ -34,7 +34,7 @@ COMPONENT_VERSION = "1.0.0"
 # Module logger
 logger = logging.getLogger(__name__)

-default_ident = "pdf-decoder"
+default_ident = "document-decoder"

 default_librarian_request_queue = librarian_request_queue
 default_librarian_response_queue = librarian_response_queue
@ -129,6 +129,39 @@ class Processor(FlowProcessor):
        else:
            logger.warning(f"Received unexpected librarian response: {request_id}")

+    async def fetch_document_metadata(self, document_id, user, timeout=120):
+        """
+        Fetch document metadata from librarian via Pulsar.
+
+        Raises RuntimeError on a librarian error response or on timeout.
+        """
+        request_id = str(uuid.uuid4())
+        request = LibrarianRequest(
+            operation="get-document-metadata",
+            document_id=document_id,
+            user=user,
+        )
+        future = asyncio.get_running_loop().create_future()
+        self.pending_requests[request_id] = future
+
+        try:
+            await self.librarian_request_producer.send(
+                request, properties={"id": request_id}
+            )
+            response = await asyncio.wait_for(future, timeout=timeout)
+        except asyncio.TimeoutError:
+            raise RuntimeError(f"Timeout fetching metadata for {document_id}")
+        finally:
+            # Drop the entry on every exit path (send failure, cancel, timeout)
+            self.pending_requests.pop(request_id, None)
+
+        if response.error:
+            raise RuntimeError(
+                f"Librarian error: {response.error.type}: {response.error.message}"
+            )
+
+        return response.document_metadata
+
    async def fetch_document_content(self, document_id, user, timeout=120):
        """
        Fetch document content from librarian via Pulsar.
@ -233,6 +266,20 @@ class Processor(FlowProcessor):

        logger.info(f"Decoding PDF {v.metadata.id}...")

+        # Check MIME type if fetching from librarian
+        if v.document_id:
+            doc_meta = await self.fetch_document_metadata(
+                document_id=v.document_id,
+                user=v.metadata.user,
+            )
+            if doc_meta and doc_meta.kind and doc_meta.kind != "application/pdf":
+                logger.error(
+                    f"Unsupported MIME type: {doc_meta.kind}. "
+                    f"PDF decoder only handles application/pdf. "
+                    f"Ignoring document {v.metadata.id}."
+                )
+                return
+
        with tempfile.NamedTemporaryFile(delete_on_close=False, suffix='.pdf') as fp:
            temp_path = fp.name

--- a/trustgraph-flow/trustgraph/librarian/librarian.py
+++ b/trustgraph-flow/trustgraph/librarian/librarian.py
@ -44,12 +44,8 @@ class Librarian:

    async def add_document(self, request):

-        if request.document_metadata.kind not in (
-                "text/plain", "application/pdf"
-        ):
-            raise RequestError(
-                "Invalid document kind: " + request.document_metadata.kind
-            )
+        if not request.document_metadata.kind:
+            raise RequestError("Document kind (MIME type) is required")

        if await self.table_store.document_exists(
                request.document_metadata.user,
@ -276,10 +272,8 @@ class Librarian:
        """
        logger.info(f"Beginning chunked upload for document {request.document_metadata.id}")

-        if request.document_metadata.kind not in ("text/plain", "application/pdf"):
-            raise RequestError(
-                "Invalid document kind: " + request.document_metadata.kind
-            )
+        if not request.document_metadata.kind:
+            raise RequestError("Document kind (MIME type) is required")

        if await self.table_store.document_exists(
                request.document_metadata.user,
--- a/trustgraph-flow/trustgraph/librarian/service.py
+++ b/trustgraph-flow/trustgraph/librarian/service.py
@ -284,7 +284,6 @@ class Processor(AsyncProcessor):
        pass

    # Threshold for sending document_id instead of inline content (2MB)
-    STREAMING_THRESHOLD = 2 * 1024 * 1024

    async def emit_document_provenance(self, document, processing, triples_queue):
        """
@ -360,10 +359,8 @@ class Processor(AsyncProcessor):

        if document.kind == "text/plain":
            kind = "text-load"
-        elif document.kind == "application/pdf":
-            kind = "document-load"
        else:
-            raise RuntimeError("Document with a MIME type I don't know")
+            kind = "document-load"

        q = flow["interfaces"][kind]

@ -374,57 +371,28 @@ class Processor(AsyncProcessor):
            )

        if kind == "text-load":
-            # For large text documents, send document_id for streaming retrieval
-            if len(content) >= self.STREAMING_THRESHOLD:
-                logger.info(f"Text document {document.id} is large ({len(content)} bytes), "
-                           f"sending document_id for streaming retrieval")
-                doc = TextDocument(
-                    metadata = Metadata(
-                        id = document.id,
-                        root = document.id,
-                        user = processing.user,
-                        collection = processing.collection
-                    ),
-                    document_id = document.id,
-                    text = b"",  # Empty, receiver will fetch via librarian
-                )
-            else:
-                doc = TextDocument(
-                    metadata = Metadata(
-                        id = document.id,
-                        root = document.id,
-                        user = processing.user,
-                        collection = processing.collection
-                    ),
-                    text = content,
-                )
+            doc = TextDocument(
+                metadata = Metadata(
+                    id = document.id,
+                    root = document.id,
+                    user = processing.user,
+                    collection = processing.collection
+                ),
+                document_id = document.id,
+                text = b"",
+            )
            schema = TextDocument
        else:
-            # For large PDF documents, send document_id for streaming retrieval
-            # instead of embedding the entire content in the message
-            if len(content) >= self.STREAMING_THRESHOLD:
-                logger.info(f"Document {document.id} is large ({len(content)} bytes), "
-                           f"sending document_id for streaming retrieval")
-                doc = Document(
-                    metadata = Metadata(
-                        id = document.id,
-                        root = document.id,
-                        user = processing.user,
-                        collection = processing.collection
-                    ),
-                    document_id = document.id,
-                    data = b"",  # Empty data, receiver will fetch via API
-                )
-            else:
-                doc = Document(
-                    metadata = Metadata(
-                        id = document.id,
-                        root = document.id,
-                        user = processing.user,
-                        collection = processing.collection
-                    ),
-                    data = base64.b64encode(content).decode("utf-8")
-                )
+            doc = Document(
+                metadata = Metadata(
+                    id = document.id,
+                    root = document.id,
+                    user = processing.user,
+                    collection = processing.collection
+                ),
+                document_id = document.id,
+                data = b"",
+            )
            schema = Document

        logger.debug(f"Submitting to queue {q}...")