# Mirror of https://github.com/trustgraph-ai/trustgraph.git
# Synced 2026-04-25 08:26:21 +02:00
#
# Add universal document decoder with multi-format support using
# 'unstructured'. New universal decoder service powered by the unstructured
# library, handling DOCX, XLSX, PPTX, HTML, Markdown, CSV, RTF, ODT, EPUB and
# more through a single service. Tables are preserved as HTML markup for
# better downstream extraction. Images are stored in the librarian but
# excluded from the text pipeline. Configurable section grouping strategies
# (whole-document, heading, element-type, count, size) for non-page formats.
# Page-based formats (PDF, PPTX, XLSX) are automatically grouped by page. All
# four decoders (PDF, Mistral OCR, Tesseract OCR, universal) now share the
# "document-decoder" ident so they are interchangeable. PDF-only decoders
# fetch document metadata to check MIME type and gracefully skip unsupported
# formats. Librarian changes: removed MIME type whitelist validation so any
# document format can be ingested. Simplified routing so text/plain goes to
# text-load and everything else goes to document-load. Removed dual
# inline/streaming data paths — documents always use document_id for content
# retrieval. New provenance entity types (tg:Section, tg:Image) and metadata
# predicates (tg:elementTypes, tg:tableCount, tg:imageCount) for richer
# explainability. Universal decoder is in its own package
# (trustgraph-unstructured) and container image (trustgraph-unstructured).
#
# 34 lines, 1,017 B, TOML
# Packaging metadata for the trustgraph-unstructured distribution, which
# installs the `universal-decoder` command.

[build-system]
requires = ["setuptools>=61.0", "wheel"]
build-backend = "setuptools.build_meta"

[project]
name = "trustgraph-unstructured"
# The version is not hard-coded here; setuptools reads it from the attribute
# named under [tool.setuptools.dynamic].
dynamic = ["version"]
authors = [{name = "trustgraph.ai", email = "security@trustgraph.ai"}]
description = "TrustGraph universal document decoder: extracts text from DOCX, XLSX, PPTX, HTML, Markdown, CSV, RTF, ODT, EPUB and other formats using the unstructured library."
readme = "README.md"
# NOTE(review): confirm that every dependency below still supports
# Python 3.8; none of them carries a lower bound that would enforce it.
requires-python = ">=3.8"
dependencies = [
    # Restricted to the 2.2.x series of the shared base package.
    "trustgraph-base>=2.2,<2.3",
    "pulsar-client",
    "prometheus-client",
    # Presumably used for MIME type detection — verify against the decoder.
    "python-magic",
    # One extra per document format this package asks unstructured to support.
    "unstructured[csv,docx,epub,md,odt,pptx,rst,rtf,tsv,xlsx]",
]
classifiers = [
    "Programming Language :: Python :: 3",
    "Operating System :: OS Independent",
]

[project.urls]
Homepage = "https://github.com/trustgraph-ai/trustgraph"

[project.scripts]
# Console command `universal-decoder` calls `run` in trustgraph.decoding.universal.
universal-decoder = "trustgraph.decoding.universal:run"

[tool.setuptools.packages.find]
# Only packages whose name matches `trustgraph*` are included in the build.
include = ["trustgraph*"]

[tool.setuptools.dynamic]
version = {attr = "trustgraph.unstructured_version.__version__"}