Add universal document decoder with multi-format support (#705)

Add universal document decoder with multi-format support
using 'unstructured'.

New universal decoder service powered by the unstructured
library, handling DOCX, XLSX, PPTX, HTML, Markdown, CSV, RTF,
ODT, EPUB and more through a single service. Tables are preserved
as HTML markup for better downstream extraction. Images are
stored in the librarian but excluded from the text
pipeline. Configurable section grouping strategies
(whole-document, heading, element-type, count, size) for non-page
formats. Page-based formats (PDF, PPTX, XLSX) are automatically
grouped by page.

All four decoders (PDF, Mistral OCR, Tesseract OCR, universal)
now share the "document-decoder" ident so they are
interchangeable.  PDF-only decoders fetch document metadata to
check MIME type and gracefully skip unsupported formats.

Librarian changes: removed MIME type whitelist validation so any
document format can be ingested. Simplified routing so text/plain
goes to text-load and everything else goes to document-load.
Removed dual inline/streaming data paths — documents always use
document_id for content retrieval.

New provenance entity types (tg:Section, tg:Image) and metadata
predicates (tg:elementTypes, tg:tableCount, tg:imageCount) for
richer explainability.

Universal decoder is in its own package (trustgraph-unstructured)
and container image (trustgraph-unstructured).
This commit is contained in:
cybermaggedon 2026-03-23 12:56:35 +00:00 committed by GitHub
parent 4609424afe
commit 5c6fe90fe2
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
25 changed files with 2247 additions and 79 deletions

View file

@ -35,7 +35,9 @@ from . uris import (
TRUSTGRAPH_BASE,
document_uri,
page_uri,
section_uri,
chunk_uri,
image_uri,
activity_uri,
subgraph_uri,
agent_uri,
@ -74,8 +76,10 @@ from . namespaces import (
TG_CHUNK_SIZE, TG_CHUNK_OVERLAP, TG_COMPONENT_VERSION,
TG_LLM_MODEL, TG_ONTOLOGY, TG_EMBEDDING_MODEL,
TG_SOURCE_TEXT, TG_SOURCE_CHAR_OFFSET, TG_SOURCE_CHAR_LENGTH,
TG_ELEMENT_TYPES, TG_TABLE_COUNT, TG_IMAGE_COUNT,
# Extraction provenance entity types
TG_DOCUMENT_TYPE, TG_PAGE_TYPE, TG_CHUNK_TYPE, TG_SUBGRAPH_TYPE,
TG_DOCUMENT_TYPE, TG_PAGE_TYPE, TG_SECTION_TYPE, TG_CHUNK_TYPE,
TG_IMAGE_TYPE, TG_SUBGRAPH_TYPE,
# Query-time provenance predicates (GraphRAG)
TG_QUERY, TG_CONCEPT, TG_ENTITY,
TG_EDGE_COUNT, TG_SELECTED_EDGE, TG_REASONING,
@ -137,7 +141,9 @@ __all__ = [
"TRUSTGRAPH_BASE",
"document_uri",
"page_uri",
"section_uri",
"chunk_uri",
"image_uri",
"activity_uri",
"subgraph_uri",
"agent_uri",
@ -169,8 +175,10 @@ __all__ = [
"TG_CHUNK_SIZE", "TG_CHUNK_OVERLAP", "TG_COMPONENT_VERSION",
"TG_LLM_MODEL", "TG_ONTOLOGY", "TG_EMBEDDING_MODEL",
"TG_SOURCE_TEXT", "TG_SOURCE_CHAR_OFFSET", "TG_SOURCE_CHAR_LENGTH",
"TG_ELEMENT_TYPES", "TG_TABLE_COUNT", "TG_IMAGE_COUNT",
# Extraction provenance entity types
"TG_DOCUMENT_TYPE", "TG_PAGE_TYPE", "TG_CHUNK_TYPE", "TG_SUBGRAPH_TYPE",
"TG_DOCUMENT_TYPE", "TG_PAGE_TYPE", "TG_SECTION_TYPE",
"TG_CHUNK_TYPE", "TG_IMAGE_TYPE", "TG_SUBGRAPH_TYPE",
# Query-time provenance predicates (GraphRAG)
"TG_QUERY", "TG_CONCEPT", "TG_ENTITY",
"TG_EDGE_COUNT", "TG_SELECTED_EDGE", "TG_REASONING",

View file

@ -75,9 +75,16 @@ TG_SELECTED_CHUNK = TG + "selectedChunk"
# Extraction provenance entity types
# (tg:Section and tg:Image were added for the universal decoder)
TG_DOCUMENT_TYPE = TG + "Document"
TG_PAGE_TYPE = TG + "Page"
# A document section, used for non-page formats
TG_SECTION_TYPE = TG + "Section"
TG_CHUNK_TYPE = TG + "Chunk"
# An image entity
TG_IMAGE_TYPE = TG + "Image"
TG_SUBGRAPH_TYPE = TG + "Subgraph"
# Universal decoder metadata predicates
# (set on a page/section entity to describe its content)
# Comma-separated unstructured element categories
TG_ELEMENT_TYPES = TG + "elementTypes"
# Number of tables in the page/section
TG_TABLE_COUNT = TG + "tableCount"
# Number of images in the page/section
TG_IMAGE_COUNT = TG + "imageCount"
# Explainability entity types (shared)
TG_QUESTION = TG + "Question"
TG_GROUNDING = TG + "Grounding"

View file

@ -18,7 +18,10 @@ from . namespaces import (
TG_CHUNK_SIZE, TG_CHUNK_OVERLAP, TG_COMPONENT_VERSION,
TG_LLM_MODEL, TG_ONTOLOGY, TG_CONTAINS,
# Extraction provenance entity types
TG_DOCUMENT_TYPE, TG_PAGE_TYPE, TG_CHUNK_TYPE, TG_SUBGRAPH_TYPE,
TG_DOCUMENT_TYPE, TG_PAGE_TYPE, TG_SECTION_TYPE, TG_CHUNK_TYPE,
TG_IMAGE_TYPE, TG_SUBGRAPH_TYPE,
# Universal decoder metadata predicates
TG_ELEMENT_TYPES, TG_TABLE_COUNT, TG_IMAGE_COUNT,
# Query-time provenance predicates (GraphRAG)
TG_QUERY, TG_CONCEPT, TG_ENTITY,
TG_EDGE_COUNT, TG_SELECTED_EDGE, TG_EDGE, TG_REASONING,
@ -129,15 +132,22 @@ def derived_entity_triples(
component_version: str,
label: Optional[str] = None,
page_number: Optional[int] = None,
section: bool = False,
image: bool = False,
chunk_index: Optional[int] = None,
char_offset: Optional[int] = None,
char_length: Optional[int] = None,
chunk_size: Optional[int] = None,
chunk_overlap: Optional[int] = None,
mime_type: Optional[str] = None,
element_types: Optional[str] = None,
table_count: Optional[int] = None,
image_count: Optional[int] = None,
timestamp: Optional[str] = None,
) -> List[Triple]:
"""
Build triples for a derived entity (page or chunk) with full PROV-O provenance.
Build triples for a derived entity (page, section, chunk, or image)
with full PROV-O provenance.
Creates:
- Entity declaration
@ -146,17 +156,23 @@ def derived_entity_triples(
- Agent for the component
Args:
entity_uri: URI of the derived entity (page or chunk)
entity_uri: URI of the derived entity
parent_uri: URI of the parent entity
component_name: Name of TG component (e.g., "pdf-extractor", "chunker")
component_version: Version of the component
label: Human-readable label
page_number: Page number (for pages)
section: True if this is a document section (non-page format)
image: True if this is an image entity
chunk_index: Chunk index (for chunks)
char_offset: Character offset in parent (for chunks)
char_length: Character length (for chunks)
char_offset: Character offset in parent
char_length: Character length
chunk_size: Configured chunk size (for chunking activity)
chunk_overlap: Configured chunk overlap (for chunking activity)
mime_type: Source document MIME type
element_types: Comma-separated unstructured element categories
table_count: Number of tables in this page/section
image_count: Number of images in this page/section
timestamp: ISO timestamp (defaults to now)
Returns:
@ -169,7 +185,11 @@ def derived_entity_triples(
agt_uri = agent_uri(component_name)
# Determine specific type from parameters
if page_number is not None:
if image:
specific_type = TG_IMAGE_TYPE
elif section:
specific_type = TG_SECTION_TYPE
elif page_number is not None:
specific_type = TG_PAGE_TYPE
elif chunk_index is not None:
specific_type = TG_CHUNK_TYPE
@ -225,6 +245,18 @@ def derived_entity_triples(
if chunk_overlap is not None:
triples.append(_triple(act_uri, TG_CHUNK_OVERLAP, _literal(chunk_overlap)))
if mime_type:
triples.append(_triple(entity_uri, TG_MIME_TYPE, _literal(mime_type)))
if element_types:
triples.append(_triple(entity_uri, TG_ELEMENT_TYPES, _literal(element_types)))
if table_count is not None:
triples.append(_triple(entity_uri, TG_TABLE_COUNT, _literal(table_count)))
if image_count is not None:
triples.append(_triple(entity_uri, TG_IMAGE_COUNT, _literal(image_count)))
return triples

View file

@ -5,7 +5,9 @@ Document IDs are externally provided (e.g., https://trustgraph.ai/doc/abc123).
Child entities (pages, sections, chunks, images) use UUID-based URNs:
- Document: {doc_iri} (as provided, not generated here)
- Page: urn:page:{uuid}
- Section: urn:section:{uuid}
- Chunk: urn:chunk:{uuid}
- Image: urn:image:{uuid}
- Activity: https://trustgraph.ai/activity/{uuid}
- Subgraph: https://trustgraph.ai/subgraph/{uuid}
"""
@ -32,11 +34,21 @@ def page_uri() -> str:
return f"urn:page:{uuid.uuid4()}"
def section_uri() -> str:
    """Return a new ``urn:section:<uuid4>`` identifier for a document section.

    Every call draws a fresh random UUID, so no two calls return the
    same URI.
    """
    fresh_id = uuid.uuid4()
    return "urn:section:" + str(fresh_id)
def chunk_uri() -> str:
    """Return a new ``urn:chunk:<uuid4>`` identifier for a chunk.

    Every call draws a fresh random UUID, so no two calls return the
    same URI.
    """
    fresh_id = uuid.uuid4()
    return "urn:chunk:" + str(fresh_id)
def image_uri() -> str:
    """Return a new ``urn:image:<uuid4>`` identifier for an image.

    Every call draws a fresh random UUID, so no two calls return the
    same URI.
    """
    fresh_id = uuid.uuid4()
    return "urn:image:" + str(fresh_id)
def activity_uri(activity_id: str = None) -> str:
"""Generate URI for a PROV-O activity. Auto-generates UUID if not provided."""
if activity_id is None: