mirror of
https://github.com/trustgraph-ai/trustgraph.git
synced 2026-04-25 08:26:21 +02:00
Add universal document decoder with multi-format support (#705)
Add universal document decoder with multi-format support using 'unstructured'. New universal decoder service powered by the unstructured library, handling DOCX, XLSX, PPTX, HTML, Markdown, CSV, RTF, ODT, EPUB and more through a single service. Tables are preserved as HTML markup for better downstream extraction. Images are stored in the librarian but excluded from the text pipeline. Configurable section grouping strategies (whole-document, heading, element-type, count, size) for non-page formats. Page-based formats (PDF, PPTX, XLSX) are automatically grouped by page. All four decoders (PDF, Mistral OCR, Tesseract OCR, universal) now share the "document-decoder" ident so they are interchangeable. PDF-only decoders fetch document metadata to check MIME type and gracefully skip unsupported formats. Librarian changes: removed MIME type whitelist validation so any document format can be ingested. Simplified routing so text/plain goes to text-load and everything else goes to document-load. Removed dual inline/streaming data paths — documents always use document_id for content retrieval. New provenance entity types (tg:Section, tg:Image) and metadata predicates (tg:elementTypes, tg:tableCount, tg:imageCount) for richer explainability. Universal decoder is in its own package (trustgraph-unstructured) and container image (trustgraph-unstructured).
This commit is contained in:
parent
4609424afe
commit
5c6fe90fe2
25 changed files with 2247 additions and 79 deletions
|
|
@ -35,7 +35,9 @@ from . uris import (
|
|||
TRUSTGRAPH_BASE,
|
||||
document_uri,
|
||||
page_uri,
|
||||
section_uri,
|
||||
chunk_uri,
|
||||
image_uri,
|
||||
activity_uri,
|
||||
subgraph_uri,
|
||||
agent_uri,
|
||||
|
|
@ -74,8 +76,10 @@ from . namespaces import (
|
|||
TG_CHUNK_SIZE, TG_CHUNK_OVERLAP, TG_COMPONENT_VERSION,
|
||||
TG_LLM_MODEL, TG_ONTOLOGY, TG_EMBEDDING_MODEL,
|
||||
TG_SOURCE_TEXT, TG_SOURCE_CHAR_OFFSET, TG_SOURCE_CHAR_LENGTH,
|
||||
TG_ELEMENT_TYPES, TG_TABLE_COUNT, TG_IMAGE_COUNT,
|
||||
# Extraction provenance entity types
|
||||
TG_DOCUMENT_TYPE, TG_PAGE_TYPE, TG_CHUNK_TYPE, TG_SUBGRAPH_TYPE,
|
||||
TG_DOCUMENT_TYPE, TG_PAGE_TYPE, TG_SECTION_TYPE, TG_CHUNK_TYPE,
|
||||
TG_IMAGE_TYPE, TG_SUBGRAPH_TYPE,
|
||||
# Query-time provenance predicates (GraphRAG)
|
||||
TG_QUERY, TG_CONCEPT, TG_ENTITY,
|
||||
TG_EDGE_COUNT, TG_SELECTED_EDGE, TG_REASONING,
|
||||
|
|
@ -137,7 +141,9 @@ __all__ = [
|
|||
"TRUSTGRAPH_BASE",
|
||||
"document_uri",
|
||||
"page_uri",
|
||||
"section_uri",
|
||||
"chunk_uri",
|
||||
"image_uri",
|
||||
"activity_uri",
|
||||
"subgraph_uri",
|
||||
"agent_uri",
|
||||
|
|
@ -169,8 +175,10 @@ __all__ = [
|
|||
"TG_CHUNK_SIZE", "TG_CHUNK_OVERLAP", "TG_COMPONENT_VERSION",
|
||||
"TG_LLM_MODEL", "TG_ONTOLOGY", "TG_EMBEDDING_MODEL",
|
||||
"TG_SOURCE_TEXT", "TG_SOURCE_CHAR_OFFSET", "TG_SOURCE_CHAR_LENGTH",
|
||||
"TG_ELEMENT_TYPES", "TG_TABLE_COUNT", "TG_IMAGE_COUNT",
|
||||
# Extraction provenance entity types
|
||||
"TG_DOCUMENT_TYPE", "TG_PAGE_TYPE", "TG_CHUNK_TYPE", "TG_SUBGRAPH_TYPE",
|
||||
"TG_DOCUMENT_TYPE", "TG_PAGE_TYPE", "TG_SECTION_TYPE",
|
||||
"TG_CHUNK_TYPE", "TG_IMAGE_TYPE", "TG_SUBGRAPH_TYPE",
|
||||
# Query-time provenance predicates (GraphRAG)
|
||||
"TG_QUERY", "TG_CONCEPT", "TG_ENTITY",
|
||||
"TG_EDGE_COUNT", "TG_SELECTED_EDGE", "TG_REASONING",
|
||||
|
|
|
|||
|
|
@ -75,9 +75,16 @@ TG_SELECTED_CHUNK = TG + "selectedChunk"
|
|||
# Extraction provenance entity types
|
||||
TG_DOCUMENT_TYPE = TG + "Document"
|
||||
TG_PAGE_TYPE = TG + "Page"
|
||||
TG_SECTION_TYPE = TG + "Section"
|
||||
TG_CHUNK_TYPE = TG + "Chunk"
|
||||
TG_IMAGE_TYPE = TG + "Image"
|
||||
TG_SUBGRAPH_TYPE = TG + "Subgraph"
|
||||
|
||||
# Universal decoder metadata predicates
|
||||
TG_ELEMENT_TYPES = TG + "elementTypes"
|
||||
TG_TABLE_COUNT = TG + "tableCount"
|
||||
TG_IMAGE_COUNT = TG + "imageCount"
|
||||
|
||||
# Explainability entity types (shared)
|
||||
TG_QUESTION = TG + "Question"
|
||||
TG_GROUNDING = TG + "Grounding"
|
||||
|
|
|
|||
|
|
@ -18,7 +18,10 @@ from . namespaces import (
|
|||
TG_CHUNK_SIZE, TG_CHUNK_OVERLAP, TG_COMPONENT_VERSION,
|
||||
TG_LLM_MODEL, TG_ONTOLOGY, TG_CONTAINS,
|
||||
# Extraction provenance entity types
|
||||
TG_DOCUMENT_TYPE, TG_PAGE_TYPE, TG_CHUNK_TYPE, TG_SUBGRAPH_TYPE,
|
||||
TG_DOCUMENT_TYPE, TG_PAGE_TYPE, TG_SECTION_TYPE, TG_CHUNK_TYPE,
|
||||
TG_IMAGE_TYPE, TG_SUBGRAPH_TYPE,
|
||||
# Universal decoder metadata predicates
|
||||
TG_ELEMENT_TYPES, TG_TABLE_COUNT, TG_IMAGE_COUNT,
|
||||
# Query-time provenance predicates (GraphRAG)
|
||||
TG_QUERY, TG_CONCEPT, TG_ENTITY,
|
||||
TG_EDGE_COUNT, TG_SELECTED_EDGE, TG_EDGE, TG_REASONING,
|
||||
|
|
@ -129,15 +132,22 @@ def derived_entity_triples(
|
|||
component_version: str,
|
||||
label: Optional[str] = None,
|
||||
page_number: Optional[int] = None,
|
||||
section: bool = False,
|
||||
image: bool = False,
|
||||
chunk_index: Optional[int] = None,
|
||||
char_offset: Optional[int] = None,
|
||||
char_length: Optional[int] = None,
|
||||
chunk_size: Optional[int] = None,
|
||||
chunk_overlap: Optional[int] = None,
|
||||
mime_type: Optional[str] = None,
|
||||
element_types: Optional[str] = None,
|
||||
table_count: Optional[int] = None,
|
||||
image_count: Optional[int] = None,
|
||||
timestamp: Optional[str] = None,
|
||||
) -> List[Triple]:
|
||||
"""
|
||||
Build triples for a derived entity (page or chunk) with full PROV-O provenance.
|
||||
Build triples for a derived entity (page, section, chunk, or image)
|
||||
with full PROV-O provenance.
|
||||
|
||||
Creates:
|
||||
- Entity declaration
|
||||
|
|
@ -146,17 +156,23 @@ def derived_entity_triples(
|
|||
- Agent for the component
|
||||
|
||||
Args:
|
||||
entity_uri: URI of the derived entity (page or chunk)
|
||||
entity_uri: URI of the derived entity
|
||||
parent_uri: URI of the parent entity
|
||||
component_name: Name of TG component (e.g., "pdf-extractor", "chunker")
|
||||
component_version: Version of the component
|
||||
label: Human-readable label
|
||||
page_number: Page number (for pages)
|
||||
section: True if this is a document section (non-page format)
|
||||
image: True if this is an image entity
|
||||
chunk_index: Chunk index (for chunks)
|
||||
char_offset: Character offset in parent (for chunks)
|
||||
char_length: Character length (for chunks)
|
||||
char_offset: Character offset in parent
|
||||
char_length: Character length
|
||||
chunk_size: Configured chunk size (for chunking activity)
|
||||
chunk_overlap: Configured chunk overlap (for chunking activity)
|
||||
mime_type: Source document MIME type
|
||||
element_types: Comma-separated element categories reported by the 'unstructured' library
|
||||
table_count: Number of tables in this page/section
|
||||
image_count: Number of images in this page/section
|
||||
timestamp: ISO timestamp (defaults to now)
|
||||
|
||||
Returns:
|
||||
|
|
@ -169,7 +185,11 @@ def derived_entity_triples(
|
|||
agt_uri = agent_uri(component_name)
|
||||
|
||||
# Determine specific type from parameters
|
||||
if page_number is not None:
|
||||
if image:
|
||||
specific_type = TG_IMAGE_TYPE
|
||||
elif section:
|
||||
specific_type = TG_SECTION_TYPE
|
||||
elif page_number is not None:
|
||||
specific_type = TG_PAGE_TYPE
|
||||
elif chunk_index is not None:
|
||||
specific_type = TG_CHUNK_TYPE
|
||||
|
|
@ -225,6 +245,18 @@ def derived_entity_triples(
|
|||
if chunk_overlap is not None:
|
||||
triples.append(_triple(act_uri, TG_CHUNK_OVERLAP, _literal(chunk_overlap)))
|
||||
|
||||
if mime_type:
|
||||
triples.append(_triple(entity_uri, TG_MIME_TYPE, _literal(mime_type)))
|
||||
|
||||
if element_types:
|
||||
triples.append(_triple(entity_uri, TG_ELEMENT_TYPES, _literal(element_types)))
|
||||
|
||||
if table_count is not None:
|
||||
triples.append(_triple(entity_uri, TG_TABLE_COUNT, _literal(table_count)))
|
||||
|
||||
if image_count is not None:
|
||||
triples.append(_triple(entity_uri, TG_IMAGE_COUNT, _literal(image_count)))
|
||||
|
||||
return triples
|
||||
|
||||
|
||||
|
|
|
|||
|
|
@ -5,7 +5,9 @@ Document IDs are externally provided (e.g., https://trustgraph.ai/doc/abc123).
|
|||
Child entities (pages, sections, chunks, images) use UUID-based URNs:
|
||||
- Document: {doc_iri} (as provided, not generated here)
|
||||
- Page: urn:page:{uuid}
|
||||
- Section: urn:section:{uuid}
|
||||
- Chunk: urn:chunk:{uuid}
|
||||
- Image: urn:image:{uuid}
|
||||
- Activity: https://trustgraph.ai/activity/{uuid}
|
||||
- Subgraph: https://trustgraph.ai/subgraph/{uuid}
|
||||
"""
|
||||
|
|
@ -32,11 +34,21 @@ def page_uri() -> str:
|
|||
return f"urn:page:{uuid.uuid4()}"
|
||||
|
||||
|
||||
def section_uri() -> str:
    """Generate a unique URI for a document section.

    Returns:
        A URN of the form ``urn:section:{uuid}``. The UUID part comes from
        ``uuid.uuid4()``, so every call yields a different value.
    """
    return f"urn:section:{uuid.uuid4()}"
|
||||
|
||||
|
||||
def chunk_uri() -> str:
    """Generate a unique URI for a chunk.

    Returns:
        A URN of the form ``urn:chunk:{uuid}``. The UUID part comes from
        ``uuid.uuid4()``, so every call yields a different value.
    """
    return f"urn:chunk:{uuid.uuid4()}"
|
||||
|
||||
|
||||
def image_uri() -> str:
    """Generate a unique URI for an image.

    Returns:
        A URN of the form ``urn:image:{uuid}``. The UUID part comes from
        ``uuid.uuid4()``, so every call yields a different value.
    """
    return f"urn:image:{uuid.uuid4()}"
|
||||
|
||||
|
||||
def activity_uri(activity_id: str = None) -> str:
|
||||
"""Generate URI for a PROV-O activity. Auto-generates UUID if not provided."""
|
||||
if activity_id is None:
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue