mirror of
https://github.com/trustgraph-ai/trustgraph.git
synced 2026-04-25 08:26:21 +02:00
Add universal document decoder with multi-format support (#705)
Add universal document decoder with multi-format support using 'unstructured'. New universal decoder service powered by the unstructured library, handling DOCX, XLSX, PPTX, HTML, Markdown, CSV, RTF, ODT, EPUB and more through a single service. Tables are preserved as HTML markup for better downstream extraction. Images are stored in the librarian but excluded from the text pipeline. Configurable section grouping strategies (whole-document, heading, element-type, count, size) for non-page formats. Page-based formats (PDF, PPTX, XLSX) are automatically grouped by page. All four decoders (PDF, Mistral OCR, Tesseract OCR, universal) now share the "document-decoder" ident so they are interchangeable. PDF-only decoders fetch document metadata to check MIME type and gracefully skip unsupported formats. Librarian changes: removed MIME type whitelist validation so any document format can be ingested. Simplified routing so text/plain goes to text-load and everything else goes to document-load. Removed dual inline/streaming data paths — documents always use document_id for content retrieval. New provenance entity types (tg:Section, tg:Image) and metadata predicates (tg:elementTypes, tg:tableCount, tg:imageCount) for richer explainability. Universal decoder is in its own package (trustgraph-unstructured) and container image (trustgraph-unstructured).
This commit is contained in:
parent
4609424afe
commit
5c6fe90fe2
25 changed files with 2247 additions and 79 deletions
34
trustgraph-unstructured/pyproject.toml
Normal file
34
trustgraph-unstructured/pyproject.toml
Normal file
|
|
@ -0,0 +1,34 @@
|
|||
[build-system]
|
||||
requires = ["setuptools>=61.0", "wheel"]
|
||||
build-backend = "setuptools.build_meta"
|
||||
|
||||
[project]
|
||||
name = "trustgraph-unstructured"
|
||||
dynamic = ["version"]
|
||||
authors = [{name = "trustgraph.ai", email = "security@trustgraph.ai"}]
|
||||
description = "TrustGraph provides a means to run a pipeline of flexible AI processing components in a flexible means to achieve a processing pipeline."
|
||||
readme = "README.md"
|
||||
requires-python = ">=3.8"
|
||||
dependencies = [
|
||||
"trustgraph-base>=2.2,<2.3",
|
||||
"pulsar-client",
|
||||
"prometheus-client",
|
||||
"python-magic",
|
||||
"unstructured[csv,docx,epub,md,odt,pptx,rst,rtf,tsv,xlsx]",
|
||||
]
|
||||
classifiers = [
|
||||
"Programming Language :: Python :: 3",
|
||||
"Operating System :: OS Independent",
|
||||
]
|
||||
|
||||
[project.urls]
|
||||
Homepage = "https://github.com/trustgraph-ai/trustgraph"
|
||||
|
||||
[project.scripts]
|
||||
universal-decoder = "trustgraph.decoding.universal:run"
|
||||
|
||||
[tool.setuptools.packages.find]
|
||||
include = ["trustgraph*"]
|
||||
|
||||
[tool.setuptools.dynamic]
|
||||
version = {attr = "trustgraph.unstructured_version.__version__"}
|
||||
Loading…
Add table
Add a link
Reference in a new issue