mirror of
https://github.com/trustgraph-ai/trustgraph.git
synced 2026-04-25 00:16:23 +02:00
Add universal document decoder with multi-format support (#705)
Add a universal document decoder with multi-format support, built on the 'unstructured' library. The new universal decoder service handles DOCX, XLSX, PPTX, HTML, Markdown, CSV, RTF, ODT, EPUB and more through a single service. Tables are preserved as HTML markup for better downstream extraction. Images are stored in the librarian but excluded from the text pipeline. Section grouping strategies (whole-document, heading, element-type, count, size) are configurable for non-page formats. Page-based formats (PDF, PPTX, XLSX) are automatically grouped by page. All four decoders (PDF, Mistral OCR, Tesseract OCR, universal) now share the "document-decoder" ident so they are interchangeable. PDF-only decoders fetch document metadata to check the MIME type and gracefully skip unsupported formats. Librarian changes: removed the MIME type whitelist validation so any document format can be ingested. Simplified routing so text/plain goes to text-load and everything else goes to document-load. Removed the dual inline/streaming data paths — documents always use document_id for content retrieval. Added new provenance entity types (tg:Section, tg:Image) and metadata predicates (tg:elementTypes, tg:tableCount, tg:imageCount) for richer explainability. The universal decoder lives in its own package (trustgraph-unstructured) and container image (trustgraph-unstructured).
This commit is contained in:
parent
4609424afe
commit
5c6fe90fe2
25 changed files with 2247 additions and 79 deletions
14
Makefile
14
Makefile
|
|
@ -17,6 +17,7 @@ wheels:
|
|||
pip3 wheel --no-deps --wheel-dir dist trustgraph-embeddings-hf/
|
||||
pip3 wheel --no-deps --wheel-dir dist trustgraph-cli/
|
||||
pip3 wheel --no-deps --wheel-dir dist trustgraph-ocr/
|
||||
pip3 wheel --no-deps --wheel-dir dist trustgraph-unstructured/
|
||||
pip3 wheel --no-deps --wheel-dir dist trustgraph-mcp/
|
||||
|
||||
packages: update-package-versions
|
||||
|
|
@ -29,6 +30,7 @@ packages: update-package-versions
|
|||
cd trustgraph-embeddings-hf && python -m build --sdist --outdir ../dist/
|
||||
cd trustgraph-cli && python -m build --sdist --outdir ../dist/
|
||||
cd trustgraph-ocr && python -m build --sdist --outdir ../dist/
|
||||
cd trustgraph-unstructured && python -m build --sdist --outdir ../dist/
|
||||
cd trustgraph-mcp && python -m build --sdist --outdir ../dist/
|
||||
|
||||
pypi-upload:
|
||||
|
|
@ -46,6 +48,7 @@ update-package-versions:
|
|||
echo __version__ = \"${VERSION}\" > trustgraph-embeddings-hf/trustgraph/embeddings_hf_version.py
|
||||
echo __version__ = \"${VERSION}\" > trustgraph-cli/trustgraph/cli_version.py
|
||||
echo __version__ = \"${VERSION}\" > trustgraph-ocr/trustgraph/ocr_version.py
|
||||
echo __version__ = \"${VERSION}\" > trustgraph-unstructured/trustgraph/unstructured_version.py
|
||||
echo __version__ = \"${VERSION}\" > trustgraph/trustgraph/trustgraph_version.py
|
||||
echo __version__ = \"${VERSION}\" > trustgraph-mcp/trustgraph/mcp_version.py
|
||||
|
||||
|
|
@ -64,6 +67,8 @@ containers: FORCE
|
|||
-t ${CONTAINER_BASE}/trustgraph-hf:${VERSION} .
|
||||
${DOCKER} build -f containers/Containerfile.ocr \
|
||||
-t ${CONTAINER_BASE}/trustgraph-ocr:${VERSION} .
|
||||
${DOCKER} build -f containers/Containerfile.unstructured \
|
||||
-t ${CONTAINER_BASE}/trustgraph-unstructured:${VERSION} .
|
||||
${DOCKER} build -f containers/Containerfile.mcp \
|
||||
-t ${CONTAINER_BASE}/trustgraph-mcp:${VERSION} .
|
||||
|
||||
|
|
@ -72,6 +77,8 @@ some-containers:
|
|||
-t ${CONTAINER_BASE}/trustgraph-base:${VERSION} .
|
||||
${DOCKER} build -f containers/Containerfile.flow \
|
||||
-t ${CONTAINER_BASE}/trustgraph-flow:${VERSION} .
|
||||
${DOCKER} build -f containers/Containerfile.unstructured \
|
||||
-t ${CONTAINER_BASE}/trustgraph-unstructured:${VERSION} .
|
||||
# ${DOCKER} build -f containers/Containerfile.vertexai \
|
||||
# -t ${CONTAINER_BASE}/trustgraph-vertexai:${VERSION} .
|
||||
# ${DOCKER} build -f containers/Containerfile.mcp \
|
||||
|
|
@ -98,6 +105,7 @@ push:
|
|||
${DOCKER} push ${CONTAINER_BASE}/trustgraph-vertexai:${VERSION}
|
||||
${DOCKER} push ${CONTAINER_BASE}/trustgraph-hf:${VERSION}
|
||||
${DOCKER} push ${CONTAINER_BASE}/trustgraph-ocr:${VERSION}
|
||||
${DOCKER} push ${CONTAINER_BASE}/trustgraph-unstructured:${VERSION}
|
||||
${DOCKER} push ${CONTAINER_BASE}/trustgraph-mcp:${VERSION}
|
||||
|
||||
# Individual container build targets
|
||||
|
|
@ -119,6 +127,9 @@ container-trustgraph-hf: update-package-versions
|
|||
container-trustgraph-ocr: update-package-versions
|
||||
${DOCKER} build -f containers/Containerfile.ocr -t ${CONTAINER_BASE}/trustgraph-ocr:${VERSION} .
|
||||
|
||||
container-trustgraph-unstructured: update-package-versions
|
||||
${DOCKER} build -f containers/Containerfile.unstructured -t ${CONTAINER_BASE}/trustgraph-unstructured:${VERSION} .
|
||||
|
||||
container-trustgraph-mcp: update-package-versions
|
||||
${DOCKER} build -f containers/Containerfile.mcp -t ${CONTAINER_BASE}/trustgraph-mcp:${VERSION} .
|
||||
|
||||
|
|
@ -141,6 +152,9 @@ push-trustgraph-hf:
|
|||
push-trustgraph-ocr:
|
||||
${DOCKER} push ${CONTAINER_BASE}/trustgraph-ocr:${VERSION}
|
||||
|
||||
push-trustgraph-unstructured:
|
||||
${DOCKER} push ${CONTAINER_BASE}/trustgraph-unstructured:${VERSION}
|
||||
|
||||
push-trustgraph-mcp:
|
||||
${DOCKER} push ${CONTAINER_BASE}/trustgraph-mcp:${VERSION}
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue