Add universal document decoder with multi-format support (#705)

Add a universal document decoder with multi-format support, built on the `unstructured` library. The new universal decoder service handles DOCX, XLSX, PPTX, HTML, Markdown, CSV, RTF, ODT, EPUB and more through a single service. Tables are preserved as HTML markup for better downstream extraction. Images are stored in the librarian but excluded from the text pipeline. Section grouping strategies (whole-document, heading, element-type, count, size) are configurable for non-page formats; page-based formats (PDF, PPTX, XLSX) are automatically grouped by page. All four decoders (PDF, Mistral OCR, Tesseract OCR, universal) now share the "document-decoder" ident so they are interchangeable. PDF-only decoders fetch document metadata to check the MIME type and gracefully skip unsupported formats. Librarian changes: the MIME type whitelist validation was removed so any document format can be ingested; routing was simplified so text/plain goes to text-load and everything else goes to document-load; and the dual inline/streaming data paths were removed — documents always use document_id for content retrieval. New provenance entity types (tg:Section, tg:Image) and metadata predicates (tg:elementTypes, tg:tableCount, tg:imageCount) provide richer explainability. The universal decoder ships in its own package (trustgraph-unstructured) and container image (trustgraph-unstructured).
2026-04-25 08:26:21 +02:00 · 2026-03-23 12:56:35 +00:00 · 2026-03-23 12:56:35 +00:00 · 5c6fe90fe2
commit 5c6fe90fe2
parent 4609424afe
25 changed files with 2247 additions and 79 deletions
--- a/trustgraph-base/trustgraph/provenance/__init__.py
+++ b/trustgraph-base/trustgraph/provenance/__init__.py
@@ -35,7 +35,9 @@ from . uris import (
    TRUSTGRAPH_BASE,
    document_uri,
    page_uri,
+    section_uri,
    chunk_uri,
+    image_uri,
    activity_uri,
    subgraph_uri,
    agent_uri,
@@ -74,8 +76,10 @@ from . namespaces import (
    TG_CHUNK_SIZE, TG_CHUNK_OVERLAP, TG_COMPONENT_VERSION,
    TG_LLM_MODEL, TG_ONTOLOGY, TG_EMBEDDING_MODEL,
    TG_SOURCE_TEXT, TG_SOURCE_CHAR_OFFSET, TG_SOURCE_CHAR_LENGTH,
+    TG_ELEMENT_TYPES, TG_TABLE_COUNT, TG_IMAGE_COUNT,
    # Extraction provenance entity types
-    TG_DOCUMENT_TYPE, TG_PAGE_TYPE, TG_CHUNK_TYPE, TG_SUBGRAPH_TYPE,
+    TG_DOCUMENT_TYPE, TG_PAGE_TYPE, TG_SECTION_TYPE, TG_CHUNK_TYPE,
+    TG_IMAGE_TYPE, TG_SUBGRAPH_TYPE,
    # Query-time provenance predicates (GraphRAG)
    TG_QUERY, TG_CONCEPT, TG_ENTITY,
    TG_EDGE_COUNT, TG_SELECTED_EDGE, TG_REASONING,
@@ -137,7 +141,9 @@ __all__ = [
    "TRUSTGRAPH_BASE",
    "document_uri",
    "page_uri",
+    "section_uri",
    "chunk_uri",
+    "image_uri",
    "activity_uri",
    "subgraph_uri",
    "agent_uri",
@@ -169,8 +175,10 @@ __all__ = [
    "TG_CHUNK_SIZE", "TG_CHUNK_OVERLAP", "TG_COMPONENT_VERSION",
    "TG_LLM_MODEL", "TG_ONTOLOGY", "TG_EMBEDDING_MODEL",
    "TG_SOURCE_TEXT", "TG_SOURCE_CHAR_OFFSET", "TG_SOURCE_CHAR_LENGTH",
+    "TG_ELEMENT_TYPES", "TG_TABLE_COUNT", "TG_IMAGE_COUNT",
    # Extraction provenance entity types
-    "TG_DOCUMENT_TYPE", "TG_PAGE_TYPE", "TG_CHUNK_TYPE", "TG_SUBGRAPH_TYPE",
+    "TG_DOCUMENT_TYPE", "TG_PAGE_TYPE", "TG_SECTION_TYPE",
+    "TG_CHUNK_TYPE", "TG_IMAGE_TYPE", "TG_SUBGRAPH_TYPE",
    # Query-time provenance predicates (GraphRAG)
    "TG_QUERY", "TG_CONCEPT", "TG_ENTITY",
    "TG_EDGE_COUNT", "TG_SELECTED_EDGE", "TG_REASONING",
--- a/trustgraph-base/trustgraph/provenance/namespaces.py
+++ b/trustgraph-base/trustgraph/provenance/namespaces.py
@@ -75,9 +75,16 @@ TG_SELECTED_CHUNK = TG + "selectedChunk"
 # Extraction provenance entity types
 TG_DOCUMENT_TYPE = TG + "Document"
 TG_PAGE_TYPE = TG + "Page"
+TG_SECTION_TYPE = TG + "Section"
 TG_CHUNK_TYPE = TG + "Chunk"
+TG_IMAGE_TYPE = TG + "Image"
 TG_SUBGRAPH_TYPE = TG + "Subgraph"

+# Universal decoder metadata predicates
+TG_ELEMENT_TYPES = TG + "elementTypes"
+TG_TABLE_COUNT = TG + "tableCount"
+TG_IMAGE_COUNT = TG + "imageCount"
+
 # Explainability entity types (shared)
 TG_QUESTION = TG + "Question"
 TG_GROUNDING = TG + "Grounding"
--- a/trustgraph-base/trustgraph/provenance/triples.py
+++ b/trustgraph-base/trustgraph/provenance/triples.py
@@ -18,7 +18,10 @@ from . namespaces import (
    TG_CHUNK_SIZE, TG_CHUNK_OVERLAP, TG_COMPONENT_VERSION,
    TG_LLM_MODEL, TG_ONTOLOGY, TG_CONTAINS,
    # Extraction provenance entity types
-    TG_DOCUMENT_TYPE, TG_PAGE_TYPE, TG_CHUNK_TYPE, TG_SUBGRAPH_TYPE,
+    TG_DOCUMENT_TYPE, TG_PAGE_TYPE, TG_SECTION_TYPE, TG_CHUNK_TYPE,
+    TG_IMAGE_TYPE, TG_SUBGRAPH_TYPE,
+    # Universal decoder metadata predicates
+    TG_ELEMENT_TYPES, TG_TABLE_COUNT, TG_IMAGE_COUNT,
    # Query-time provenance predicates (GraphRAG)
    TG_QUERY, TG_CONCEPT, TG_ENTITY,
    TG_EDGE_COUNT, TG_SELECTED_EDGE, TG_EDGE, TG_REASONING,
@@ -129,15 +132,22 @@ def derived_entity_triples(
    component_version: str,
    label: Optional[str] = None,
    page_number: Optional[int] = None,
+    section: bool = False,
+    image: bool = False,
    chunk_index: Optional[int] = None,
    char_offset: Optional[int] = None,
    char_length: Optional[int] = None,
    chunk_size: Optional[int] = None,
    chunk_overlap: Optional[int] = None,
+    mime_type: Optional[str] = None,
+    element_types: Optional[str] = None,
+    table_count: Optional[int] = None,
+    image_count: Optional[int] = None,
    timestamp: Optional[str] = None,
 ) -> List[Triple]:
    """
-    Build triples for a derived entity (page or chunk) with full PROV-O provenance.
+    Build triples for a derived entity (page, section, chunk, or image)
+    with full PROV-O provenance.

    Creates:
    - Entity declaration
@@ -146,17 +156,23 @@ def derived_entity_triples(
    - Agent for the component

    Args:
-        entity_uri: URI of the derived entity (page or chunk)
+        entity_uri: URI of the derived entity
        parent_uri: URI of the parent entity
        component_name: Name of TG component (e.g., "pdf-extractor", "chunker")
        component_version: Version of the component
        label: Human-readable label
        page_number: Page number (for pages)
+        section: True if this is a document section (non-page format)
+        image: True if this is an image entity
        chunk_index: Chunk index (for chunks)
-        char_offset: Character offset in parent (for chunks)
-        char_length: Character length (for chunks)
+        char_offset: Character offset in parent
+        char_length: Character length
        chunk_size: Configured chunk size (for chunking activity)
        chunk_overlap: Configured chunk overlap (for chunking activity)
+        mime_type: Source document MIME type
+        element_types: Comma-separated unstructured element categories
+        table_count: Number of tables in this page/section
+        image_count: Number of images in this page/section
        timestamp: ISO timestamp (defaults to now)

    Returns:
@@ -169,7 +185,11 @@ def derived_entity_triples(
    agt_uri = agent_uri(component_name)

    # Determine specific type from parameters
-    if page_number is not None:
+    if image:
+        specific_type = TG_IMAGE_TYPE
+    elif section:
+        specific_type = TG_SECTION_TYPE
+    elif page_number is not None:
        specific_type = TG_PAGE_TYPE
    elif chunk_index is not None:
        specific_type = TG_CHUNK_TYPE
@@ -225,6 +245,18 @@ def derived_entity_triples(
    if chunk_overlap is not None:
        triples.append(_triple(act_uri, TG_CHUNK_OVERLAP, _literal(chunk_overlap)))

+    if mime_type:
+        triples.append(_triple(entity_uri, TG_MIME_TYPE, _literal(mime_type)))
+
+    if element_types:
+        triples.append(_triple(entity_uri, TG_ELEMENT_TYPES, _literal(element_types)))
+
+    if table_count is not None:
+        triples.append(_triple(entity_uri, TG_TABLE_COUNT, _literal(table_count)))
+
+    if image_count is not None:
+        triples.append(_triple(entity_uri, TG_IMAGE_COUNT, _literal(image_count)))
+
    return triples


--- a/trustgraph-base/trustgraph/provenance/uris.py
+++ b/trustgraph-base/trustgraph/provenance/uris.py
@@ -5,7 +5,9 @@ Document IDs are externally provided (e.g., https://trustgraph.ai/doc/abc123).
 Child entities (pages, chunks) use UUID-based URNs:
 - Document:  {doc_iri} (as provided, not generated here)
 - Page:      urn:page:{uuid}
+- Section:   urn:section:{uuid}
 - Chunk:     urn:chunk:{uuid}
+- Image:     urn:image:{uuid}
 - Activity:  https://trustgraph.ai/activity/{uuid}
 - Subgraph:  https://trustgraph.ai/subgraph/{uuid}
 """
@@ -32,11 +34,21 @@ def page_uri() -> str:
    return f"urn:page:{uuid.uuid4()}"


+def section_uri() -> str:
+    """Generate a unique URI for a document section."""
+    return f"urn:section:{uuid.uuid4()}"
+
+
 def chunk_uri() -> str:
    """Generate a unique URI for a chunk."""
    return f"urn:chunk:{uuid.uuid4()}"


+def image_uri() -> str:
+    """Generate a unique URI for an image."""
+    return f"urn:image:{uuid.uuid4()}"
+
+
 def activity_uri(activity_id: str = None) -> str:
    """Generate URI for a PROV-O activity. Auto-generates UUID if not provided."""
    if activity_id is None: