mirror of
https://github.com/trustgraph-ai/trustgraph.git
synced 2026-07-01 17:39:39 +02:00
Add universal document decoder with multi-format support (#705)
Add universal document decoder with multi-format support using 'unstructured'. New universal decoder service powered by the unstructured library, handling DOCX, XLSX, PPTX, HTML, Markdown, CSV, RTF, ODT, EPUB and more through a single service. Tables are preserved as HTML markup for better downstream extraction. Images are stored in the librarian but excluded from the text pipeline. Configurable section grouping strategies (whole-document, heading, element-type, count, size) for non-page formats. Page-based formats (PDF, PPTX, XLSX) are automatically grouped by page. All four decoders (PDF, Mistral OCR, Tesseract OCR, universal) now share the "document-decoder" ident so they are interchangeable. PDF-only decoders fetch document metadata to check MIME type and gracefully skip unsupported formats. Librarian changes: removed MIME type whitelist validation so any document format can be ingested. Simplified routing so text/plain goes to text-load and everything else goes to document-load. Removed dual inline/streaming data paths — documents always use document_id for content retrieval. New provenance entity types (tg:Section, tg:Image) and metadata predicates (tg:elementTypes, tg:tableCount, tg:imageCount) for richer explainability. Universal decoder is in its own package (trustgraph-unstructured) and container image (trustgraph-unstructured).
This commit is contained in:
parent
4609424afe
commit
5c6fe90fe2
25 changed files with 2247 additions and 79 deletions
|
|
@ -44,12 +44,8 @@ class Librarian:
|
|||
|
||||
async def add_document(self, request):
|
||||
|
||||
if request.document_metadata.kind not in (
|
||||
"text/plain", "application/pdf"
|
||||
):
|
||||
raise RequestError(
|
||||
"Invalid document kind: " + request.document_metadata.kind
|
||||
)
|
||||
if not request.document_metadata.kind:
|
||||
raise RequestError("Document kind (MIME type) is required")
|
||||
|
||||
if await self.table_store.document_exists(
|
||||
request.document_metadata.user,
|
||||
|
|
@ -276,10 +272,8 @@ class Librarian:
|
|||
"""
|
||||
logger.info(f"Beginning chunked upload for document {request.document_metadata.id}")
|
||||
|
||||
if request.document_metadata.kind not in ("text/plain", "application/pdf"):
|
||||
raise RequestError(
|
||||
"Invalid document kind: " + request.document_metadata.kind
|
||||
)
|
||||
if not request.document_metadata.kind:
|
||||
raise RequestError("Document kind (MIME type) is required")
|
||||
|
||||
if await self.table_store.document_exists(
|
||||
request.document_metadata.user,
|
||||
|
|
|
|||
|
|
@ -284,7 +284,6 @@ class Processor(AsyncProcessor):
|
|||
pass
|
||||
|
||||
# Threshold for sending document_id instead of inline content (2MB)
|
||||
STREAMING_THRESHOLD = 2 * 1024 * 1024
|
||||
|
||||
async def emit_document_provenance(self, document, processing, triples_queue):
|
||||
"""
|
||||
|
|
@ -360,10 +359,8 @@ class Processor(AsyncProcessor):
|
|||
|
||||
if document.kind == "text/plain":
|
||||
kind = "text-load"
|
||||
elif document.kind == "application/pdf":
|
||||
kind = "document-load"
|
||||
else:
|
||||
raise RuntimeError("Document with a MIME type I don't know")
|
||||
kind = "document-load"
|
||||
|
||||
q = flow["interfaces"][kind]
|
||||
|
||||
|
|
@ -374,57 +371,28 @@ class Processor(AsyncProcessor):
|
|||
)
|
||||
|
||||
if kind == "text-load":
|
||||
# For large text documents, send document_id for streaming retrieval
|
||||
if len(content) >= self.STREAMING_THRESHOLD:
|
||||
logger.info(f"Text document {document.id} is large ({len(content)} bytes), "
|
||||
f"sending document_id for streaming retrieval")
|
||||
doc = TextDocument(
|
||||
metadata = Metadata(
|
||||
id = document.id,
|
||||
root = document.id,
|
||||
user = processing.user,
|
||||
collection = processing.collection
|
||||
),
|
||||
document_id = document.id,
|
||||
text = b"", # Empty, receiver will fetch via librarian
|
||||
)
|
||||
else:
|
||||
doc = TextDocument(
|
||||
metadata = Metadata(
|
||||
id = document.id,
|
||||
root = document.id,
|
||||
user = processing.user,
|
||||
collection = processing.collection
|
||||
),
|
||||
text = content,
|
||||
)
|
||||
doc = TextDocument(
|
||||
metadata = Metadata(
|
||||
id = document.id,
|
||||
root = document.id,
|
||||
user = processing.user,
|
||||
collection = processing.collection
|
||||
),
|
||||
document_id = document.id,
|
||||
text = b"",
|
||||
)
|
||||
schema = TextDocument
|
||||
else:
|
||||
# For large PDF documents, send document_id for streaming retrieval
|
||||
# instead of embedding the entire content in the message
|
||||
if len(content) >= self.STREAMING_THRESHOLD:
|
||||
logger.info(f"Document {document.id} is large ({len(content)} bytes), "
|
||||
f"sending document_id for streaming retrieval")
|
||||
doc = Document(
|
||||
metadata = Metadata(
|
||||
id = document.id,
|
||||
root = document.id,
|
||||
user = processing.user,
|
||||
collection = processing.collection
|
||||
),
|
||||
document_id = document.id,
|
||||
data = b"", # Empty data, receiver will fetch via API
|
||||
)
|
||||
else:
|
||||
doc = Document(
|
||||
metadata = Metadata(
|
||||
id = document.id,
|
||||
root = document.id,
|
||||
user = processing.user,
|
||||
collection = processing.collection
|
||||
),
|
||||
data = base64.b64encode(content).decode("utf-8")
|
||||
)
|
||||
doc = Document(
|
||||
metadata = Metadata(
|
||||
id = document.id,
|
||||
root = document.id,
|
||||
user = processing.user,
|
||||
collection = processing.collection
|
||||
),
|
||||
document_id = document.id,
|
||||
data = b"",
|
||||
)
|
||||
schema = Document
|
||||
|
||||
logger.debug(f"Submitting to queue {q}...")
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue