mirror of
https://github.com/trustgraph-ai/trustgraph.git
synced 2026-04-25 00:16:23 +02:00
Add universal document decoder with multi-format support (#705)
Add universal document decoder with multi-format support using 'unstructured'. New universal decoder service powered by the unstructured library, handling DOCX, XLSX, PPTX, HTML, Markdown, CSV, RTF, ODT, EPUB and more through a single service. Tables are preserved as HTML markup for better downstream extraction. Images are stored in the librarian but excluded from the text pipeline. Configurable section grouping strategies (whole-document, heading, element-type, count, size) for non-page formats. Page-based formats (PDF, PPTX, XLSX) are automatically grouped by page. All four decoders (PDF, Mistral OCR, Tesseract OCR, universal) now share the "document-decoder" ident so they are interchangeable. PDF-only decoders fetch document metadata to check MIME type and gracefully skip unsupported formats. Librarian changes: removed MIME type whitelist validation so any document format can be ingested. Simplified routing so text/plain goes to text-load and everything else goes to document-load. Removed dual inline/streaming data paths — documents always use document_id for content retrieval. New provenance entity types (tg:Section, tg:Image) and metadata predicates (tg:elementTypes, tg:tableCount, tg:imageCount) for richer explainability. Universal decoder is in its own package (trustgraph-unstructured) and container image (trustgraph-unstructured).
This commit is contained in:
parent
4609424afe
commit
5c6fe90fe2
25 changed files with 2247 additions and 79 deletions
34
trustgraph-unstructured/pyproject.toml
Normal file
34
trustgraph-unstructured/pyproject.toml
Normal file
|
|
@ -0,0 +1,34 @@
|
|||
[build-system]
|
||||
requires = ["setuptools>=61.0", "wheel"]
|
||||
build-backend = "setuptools.build_meta"
|
||||
|
||||
[project]
|
||||
name = "trustgraph-unstructured"
|
||||
dynamic = ["version"]
|
||||
authors = [{name = "trustgraph.ai", email = "security@trustgraph.ai"}]
|
||||
description = "TrustGraph provides a flexible means to run a pipeline of AI processing components."
|
||||
readme = "README.md"
|
||||
requires-python = ">=3.8"
|
||||
dependencies = [
|
||||
"trustgraph-base>=2.2,<2.3",
|
||||
"pulsar-client",
|
||||
"prometheus-client",
|
||||
"python-magic",
|
||||
"unstructured[csv,docx,epub,md,odt,pptx,rst,rtf,tsv,xlsx]",
|
||||
]
|
||||
classifiers = [
|
||||
"Programming Language :: Python :: 3",
|
||||
"Operating System :: OS Independent",
|
||||
]
|
||||
|
||||
[project.urls]
|
||||
Homepage = "https://github.com/trustgraph-ai/trustgraph"
|
||||
|
||||
[project.scripts]
|
||||
universal-decoder = "trustgraph.decoding.universal:run"
|
||||
|
||||
[tool.setuptools.packages.find]
|
||||
include = ["trustgraph*"]
|
||||
|
||||
[tool.setuptools.dynamic]
|
||||
version = {attr = "trustgraph.unstructured_version.__version__"}
|
||||
|
|
@ -0,0 +1,2 @@
|
|||
|
||||
from . processor import *
|
||||
|
|
@ -0,0 +1,6 @@
|
|||
#!/usr/bin/env python3

"""
Package entry point: delegates to the universal decoder's ``run``
function when executed with ``python -m``.
"""

from . processor import run

if __name__ == '__main__':
    run()
|
||||
|
|
@ -0,0 +1,714 @@
|
|||
|
||||
"""
|
||||
Universal document decoder powered by the unstructured library.
|
||||
|
||||
Accepts documents in any common format (PDF, DOCX, XLSX, HTML, Markdown,
|
||||
plain text, PPTX, etc.) on input, outputs pages or sections as text
|
||||
as separate output objects.
|
||||
|
||||
Supports both inline document data and fetching from librarian via Pulsar
|
||||
for large documents. Fetches document metadata from the librarian to
|
||||
determine mime type for format detection.
|
||||
|
||||
Tables are preserved as HTML markup for better downstream extraction.
|
||||
Images are stored in the librarian but not sent to the text pipeline.
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import base64
|
||||
import logging
|
||||
import magic
|
||||
import tempfile
|
||||
import os
|
||||
import uuid
|
||||
|
||||
from unstructured.partition.auto import partition
|
||||
|
||||
from ... schema import Document, TextDocument, Metadata
|
||||
from ... schema import LibrarianRequest, LibrarianResponse, DocumentMetadata
|
||||
from ... schema import librarian_request_queue, librarian_response_queue
|
||||
from ... schema import Triples
|
||||
from ... base import FlowProcessor, ConsumerSpec, ProducerSpec
|
||||
from ... base import Consumer, Producer, ConsumerMetrics, ProducerMetrics
|
||||
|
||||
from ... provenance import (
|
||||
document_uri, page_uri as make_page_uri,
|
||||
section_uri as make_section_uri, image_uri as make_image_uri,
|
||||
derived_entity_triples, set_graph, GRAPH_SOURCE,
|
||||
)
|
||||
|
||||
from . strategies import get_strategy
|
||||
|
||||
# Component identification stamped into provenance triples
COMPONENT_NAME = "universal-decoder"
COMPONENT_VERSION = "1.0.0"

# Module logger
logger = logging.getLogger(__name__)

# NOTE(review): presumably shared with the other decoder services so they
# are interchangeable — confirm against the other decoder packages.
default_ident = "document-decoder"

# Default Pulsar queues for the librarian request/response client
default_librarian_request_queue = librarian_request_queue
default_librarian_response_queue = librarian_response_queue

# Mime type to unstructured content_type mapping
# unstructured auto-detects most formats, but we pass the hint when available
MIME_EXTENSIONS = {
    "application/pdf": ".pdf",
    "application/vnd.openxmlformats-officedocument.wordprocessingml.document": ".docx",
    "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet": ".xlsx",
    "application/vnd.ms-excel": ".xls",
    "application/vnd.openxmlformats-officedocument.presentationml.presentation": ".pptx",
    "text/html": ".html",
    "text/markdown": ".md",
    "text/plain": ".txt",
    "text/csv": ".csv",
    "text/tab-separated-values": ".tsv",
    "application/rtf": ".rtf",
    "text/x-rst": ".rst",
    "application/vnd.oasis.opendocument.text": ".odt",
}

# Formats that have natural page boundaries; these are grouped by page
# number rather than by the configurable section strategy
PAGE_BASED_FORMATS = {
    "application/pdf",
    "application/vnd.openxmlformats-officedocument.presentationml.presentation",
    "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
    "application/vnd.ms-excel",
}
||||
|
||||
def assemble_section_text(elements):
    """
    Assemble text from a list of unstructured elements.

    - Text elements: plain text, joined with double newlines
    - Table elements: HTML table markup from metadata.text_as_html,
      falling back to the element's plain text when no HTML is available
    - Image elements: skipped (stored separately, not in text output)

    Args:
        elements: Iterable of unstructured Element objects; each exposes
            a `category` string, a `text` attribute and, for tables,
            optionally `metadata.text_as_html`.

    Returns:
        tuple: (assembled_text, element_types_set, table_count, image_count)
    """
    parts = []
    element_types = set()
    table_count = 0
    image_count = 0

    for el in elements:
        category = getattr(el, 'category', 'UncategorizedText')
        element_types.add(category)

        if category == 'Image':
            image_count += 1
            continue  # Images are NOT included in text output

        if category == 'Table':
            table_count += 1
            # Prefer the HTML representation for tables: it preserves
            # structure for downstream extraction.
            html = getattr(el.metadata, 'text_as_html', None) \
                if hasattr(el, 'metadata') else None
            if html:
                parts.append(html)
                continue

        # Plain-text path: non-table elements, and tables without an
        # HTML rendering (previously this fallback was duplicated in
        # both branches).
        text = getattr(el, 'text', '') or ''
        if text:
            parts.append(text)

    return '\n\n'.join(parts), element_types, table_count, image_count
|
||||
|
||||
|
||||
class Processor(FlowProcessor):
    """
    Flow processor decoding documents of many formats into text
    sections via the `unstructured` library.

    Consumes Document messages, partitions the raw bytes into elements,
    groups them by page (page-based formats) or by a configurable
    section strategy, stores each group in the librarian as a child
    document, emits provenance triples, and sends one TextDocument per
    group downstream.
    """

    def __init__(self, **params):
        """
        Configure partitioning/sectioning options, register the
        input/output/triples flow specifications, and set up the
        librarian request/response client.
        """

        id = params.get("id", default_ident)

        # Partitioning / sectioning configuration (see add_args)
        self.partition_strategy = params.get("strategy", "auto")
        self.languages = params.get("languages", "eng").split(",")
        self.section_strategy_name = params.get(
            "section_strategy", "whole-document"
        )
        self.section_element_count = params.get("section_element_count", 20)
        self.section_max_size = params.get("section_max_size", 4000)
        self.section_within_pages = params.get("section_within_pages", False)

        # Raises ValueError early for an unknown strategy name
        self.section_strategy = get_strategy(self.section_strategy_name)

        super(Processor, self).__init__(
            **params | {
                "id": id,
            }
        )

        self.register_specification(
            ConsumerSpec(
                name="input",
                schema=Document,
                handler=self.on_message,
            )
        )

        self.register_specification(
            ProducerSpec(
                name="output",
                schema=TextDocument,
            )
        )

        self.register_specification(
            ProducerSpec(
                name="triples",
                schema=Triples,
            )
        )

        # Librarian client for fetching/storing document content
        librarian_request_q = params.get(
            "librarian_request_queue", default_librarian_request_queue
        )
        librarian_response_q = params.get(
            "librarian_response_queue", default_librarian_response_queue
        )

        librarian_request_metrics = ProducerMetrics(
            processor=id, flow=None, name="librarian-request"
        )

        self.librarian_request_producer = Producer(
            backend=self.pubsub,
            topic=librarian_request_q,
            schema=LibrarianRequest,
            metrics=librarian_request_metrics,
        )

        librarian_response_metrics = ConsumerMetrics(
            processor=id, flow=None, name="librarian-response"
        )

        self.librarian_response_consumer = Consumer(
            taskgroup=self.taskgroup,
            backend=self.pubsub,
            flow=None,
            topic=librarian_response_q,
            subscriber=f"{id}-librarian",
            schema=LibrarianResponse,
            handler=self.on_librarian_response,
            metrics=librarian_response_metrics,
        )

        # Pending librarian requests: request_id -> asyncio.Future
        self.pending_requests = {}

        logger.info("Universal decoder initialized")

    async def start(self):
        """Start the base processor, then the librarian producer/consumer."""
        await super(Processor, self).start()
        await self.librarian_request_producer.start()
        await self.librarian_response_consumer.start()

    async def on_librarian_response(self, msg, consumer, flow):
        """Handle responses from the librarian service."""
        response = msg.value()
        # Correlation id set by _librarian_request on the outgoing message
        request_id = msg.properties().get("id")

        if request_id and request_id in self.pending_requests:
            future = self.pending_requests.pop(request_id)
            future.set_result(response)
        else:
            # Other decoder instances share the response topic, so
            # unmatched ids can occur — presumably benign; verify.
            logger.warning(
                f"Received unexpected librarian response: {request_id}"
            )

    async def _librarian_request(self, request, timeout=120):
        """Send a request to the librarian and wait for response."""
        request_id = str(uuid.uuid4())

        # Register the future BEFORE sending so a fast response can't race us
        # NOTE(review): asyncio.get_event_loop() is deprecated since
        # Python 3.10 — prefer asyncio.get_running_loop(); confirm the
        # supported interpreter range before changing.
        future = asyncio.get_event_loop().create_future()
        self.pending_requests[request_id] = future

        try:
            await self.librarian_request_producer.send(
                request, properties={"id": request_id}
            )
            response = await asyncio.wait_for(future, timeout=timeout)

            if response.error:
                raise RuntimeError(
                    f"Librarian error: {response.error.type}: "
                    f"{response.error.message}"
                )

            return response

        except asyncio.TimeoutError:
            # Drop the orphaned future so the table doesn't grow
            self.pending_requests.pop(request_id, None)
            raise RuntimeError("Timeout waiting for librarian response")

    async def fetch_document_metadata(self, document_id, user):
        """Fetch document metadata from the librarian."""
        request = LibrarianRequest(
            operation="get-document-metadata",
            document_id=document_id,
            user=user,
        )
        response = await self._librarian_request(request)
        return response.document_metadata

    async def fetch_document_content(self, document_id, user):
        """Fetch document content from the librarian."""
        request = LibrarianRequest(
            operation="get-document-content",
            document_id=document_id,
            user=user,
        )
        response = await self._librarian_request(request)
        return response.content

    async def save_child_document(self, doc_id, parent_id, user, content,
                                  document_type="page", title=None,
                                  kind="text/plain"):
        """
        Save a child document (page/section/image) to the librarian.

        Content may be str or bytes; it is base64-encoded for transport.
        Returns the stored document id.
        """
        if isinstance(content, str):
            content = content.encode("utf-8")

        doc_metadata = DocumentMetadata(
            id=doc_id,
            user=user,
            kind=kind,
            title=title or doc_id,
            parent_id=parent_id,
            document_type=document_type,
        )

        request = LibrarianRequest(
            operation="add-child-document",
            document_metadata=doc_metadata,
            content=base64.b64encode(content).decode("utf-8"),
        )

        await self._librarian_request(request)
        return doc_id

    def extract_elements(self, blob, mime_type=None):
        """
        Extract elements from a document using unstructured.

        Args:
            blob: Raw document bytes
            mime_type: Optional mime type hint

        Returns:
            List of unstructured Element objects
        """
        # Determine file extension for unstructured; the extension is
        # the format hint partition() uses
        suffix = MIME_EXTENSIONS.get(mime_type, "") if mime_type else ""
        if not suffix:
            suffix = ".bin"

        # delete=False because partition() reopens the file by name;
        # cleanup happens in the finally block below
        with tempfile.NamedTemporaryFile(
            delete=False, suffix=suffix
        ) as fp:
            fp.write(blob)
            temp_path = fp.name

        try:
            kwargs = {
                "filename": temp_path,
                "strategy": self.partition_strategy,
                "languages": self.languages,
            }

            # For hi_res strategy, request image extraction
            if self.partition_strategy == "hi_res":
                kwargs["extract_image_block_to_payload"] = True

            elements = partition(**kwargs)

            logger.info(
                f"Extracted {len(elements)} elements "
                f"(strategy: {self.partition_strategy})"
            )

            return elements

        finally:
            try:
                os.unlink(temp_path)
            except OSError:
                pass

    def group_by_page(self, elements):
        """
        Group elements by page number.

        Elements without a page number are assigned to page 1.

        Returns list of (page_number, elements) tuples, sorted by page.
        """
        pages = {}

        for el in elements:
            page_num = getattr(
                el.metadata, 'page_number', None
            ) if hasattr(el, 'metadata') else None
            if page_num is None:
                page_num = 1
            if page_num not in pages:
                pages[page_num] = []
            pages[page_num].append(el)

        return sorted(pages.items())

    async def emit_section(self, elements, parent_doc_id, doc_uri_str,
                           metadata, flow, mime_type=None,
                           page_number=None, section_index=None):
        """
        Process a group of elements as a page or section.

        Assembles text, saves to librarian, emits provenance, sends
        TextDocument downstream. Returns the entity URI, or None when
        the assembled text is empty.
        """
        text, element_types, table_count, image_count = (
            assemble_section_text(elements)
        )

        if not text.strip():
            logger.debug("Skipping empty section")
            return None

        # page_number given => this group is a page; otherwise a section
        is_page = page_number is not None
        char_length = len(text)

        if is_page:
            entity_uri = make_page_uri()
            label = f"Page {page_number}"
        else:
            entity_uri = make_section_uri()
            label = f"Section {section_index}" if section_index else "Section"

        doc_id = entity_uri
        page_content = text.encode("utf-8")

        # Save to librarian
        await self.save_child_document(
            doc_id=doc_id,
            parent_id=parent_doc_id,
            user=metadata.user,
            content=page_content,
            document_type="page" if is_page else "section",
            title=label,
        )

        # Emit provenance triples
        element_types_str = ",".join(sorted(element_types)) if element_types else None

        prov_triples = derived_entity_triples(
            entity_uri=entity_uri,
            parent_uri=doc_uri_str,
            component_name=COMPONENT_NAME,
            component_version=COMPONENT_VERSION,
            label=label,
            page_number=page_number,
            section=not is_page,
            char_length=char_length,
            mime_type=mime_type,
            element_types=element_types_str,
            table_count=table_count if table_count > 0 else None,
            image_count=image_count if image_count > 0 else None,
        )

        await flow("triples").send(Triples(
            metadata=Metadata(
                id=entity_uri,
                root=metadata.root,
                user=metadata.user,
                collection=metadata.collection,
            ),
            triples=set_graph(prov_triples, GRAPH_SOURCE),
        ))

        # Send TextDocument downstream (chunker will fetch from librarian)
        r = TextDocument(
            metadata=Metadata(
                id=entity_uri,
                root=metadata.root,
                user=metadata.user,
                collection=metadata.collection,
            ),
            document_id=doc_id,
            # Empty payload by design: content is retrieved by document_id
            text=b"",
        )

        await flow("output").send(r)

        return entity_uri

    async def emit_image(self, element, parent_uri, parent_doc_id,
                         metadata, flow, mime_type=None, page_number=None):
        """
        Store an image element in the librarian with provenance.

        Images are stored but NOT sent downstream to the text pipeline.
        """
        img_uri = make_image_uri()

        # Get image data (only present when extraction was requested,
        # i.e. the hi_res strategy)
        img_data = None
        if hasattr(element, 'metadata'):
            img_data = getattr(element.metadata, 'image_base64', None)

        if not img_data:
            # No image payload available, just record provenance
            logger.debug("Image element without payload, recording provenance only")
            img_content = b""
            img_kind = "image/unknown"
        else:
            if isinstance(img_data, str):
                img_content = base64.b64decode(img_data)
            else:
                img_content = img_data
            img_kind = "image/png"  # unstructured typically extracts as PNG

        # Save to librarian (skipped for payload-less images)
        if img_content:
            await self.save_child_document(
                doc_id=img_uri,
                parent_id=parent_doc_id,
                user=metadata.user,
                content=img_content,
                document_type="image",
                title=f"Image from page {page_number}" if page_number else "Image",
                kind=img_kind,
            )

        # Emit provenance triples
        prov_triples = derived_entity_triples(
            entity_uri=img_uri,
            parent_uri=parent_uri,
            component_name=COMPONENT_NAME,
            component_version=COMPONENT_VERSION,
            label=f"Image from page {page_number}" if page_number else "Image",
            image=True,
            page_number=page_number,
            mime_type=mime_type,
        )

        await flow("triples").send(Triples(
            metadata=Metadata(
                id=img_uri,
                root=metadata.root,
                user=metadata.user,
                collection=metadata.collection,
            ),
            triples=set_graph(prov_triples, GRAPH_SOURCE),
        ))

    async def on_message(self, msg, consumer, flow):
        """
        Decode one incoming Document: fetch/decode the raw bytes,
        partition into elements, and emit pages or sections plus any
        extracted images.
        """

        logger.debug("Document message received")

        v = msg.value()

        logger.info(f"Decoding {v.metadata.id}...")

        # Determine content and mime type
        mime_type = None

        if v.document_id:
            # Librarian path: fetch metadata then content
            logger.info(
                f"Fetching document {v.document_id} from librarian..."
            )

            doc_meta = await self.fetch_document_metadata(
                document_id=v.document_id,
                user=v.metadata.user,
            )
            # "kind" carries the document's mime type
            mime_type = doc_meta.kind if doc_meta else None

            content = await self.fetch_document_content(
                document_id=v.document_id,
                user=v.metadata.user,
            )

            if isinstance(content, str):
                content = content.encode('utf-8')
            blob = base64.b64decode(content)

            logger.info(
                f"Fetched {len(blob)} bytes, mime: {mime_type}"
            )
        else:
            # Inline path: detect format from content
            blob = base64.b64decode(v.data)
            try:
                mime_type = magic.from_buffer(blob, mime=True)
                logger.info(f"Detected mime type: {mime_type}")
            except Exception as e:
                # Best effort: unstructured can still auto-detect
                logger.warning(f"Could not detect mime type: {e}")

        # Get the source document ID
        source_doc_id = v.document_id or v.metadata.id
        doc_uri_str = document_uri(source_doc_id)

        # Extract elements using unstructured
        elements = self.extract_elements(blob, mime_type)

        if not elements:
            logger.warning("No elements extracted from document")
            return

        # Determine if this is a page-based format
        is_page_based = mime_type in PAGE_BASED_FORMATS if mime_type else False

        # Also check if elements actually have page numbers
        if not is_page_based:
            has_pages = any(
                getattr(el.metadata, 'page_number', None) is not None
                for el in elements
                if hasattr(el, 'metadata')
            )
            if has_pages:
                is_page_based = True

        if is_page_based:
            # Group by page
            page_groups = self.group_by_page(elements)

            for page_num, page_elements in page_groups:

                # Extract and store images separately
                image_elements = [
                    el for el in page_elements
                    if getattr(el, 'category', '') == 'Image'
                ]
                text_elements = [
                    el for el in page_elements
                    if getattr(el, 'category', '') != 'Image'
                ]

                # Emit the page as a text section
                page_uri_str = await self.emit_section(
                    text_elements, source_doc_id, doc_uri_str,
                    v.metadata, flow,
                    mime_type=mime_type, page_number=page_num,
                )

                # Store images (not sent to text pipeline); parent is the
                # page when it produced text, else the document itself
                for img_el in image_elements:
                    await self.emit_image(
                        img_el,
                        page_uri_str or doc_uri_str,
                        source_doc_id,
                        v.metadata, flow,
                        mime_type=mime_type, page_number=page_num,
                    )

        else:
            # Non-page format: use section strategy

            # Separate images from text elements
            image_elements = [
                el for el in elements
                if getattr(el, 'category', '') == 'Image'
            ]
            text_elements = [
                el for el in elements
                if getattr(el, 'category', '') != 'Image'
            ]

            # Apply section strategy to text elements; each strategy
            # picks the kwargs it needs and ignores the rest
            strategy_kwargs = {
                'element_count': self.section_element_count,
                'max_size': self.section_max_size,
            }
            groups = self.section_strategy(text_elements, **strategy_kwargs)

            for idx, group in enumerate(groups):
                # 1-based section numbering for labels/provenance
                section_idx = idx + 1

                await self.emit_section(
                    group, source_doc_id, doc_uri_str,
                    v.metadata, flow,
                    mime_type=mime_type, section_index=section_idx,
                )

            # Store images (not sent to text pipeline)
            for img_el in image_elements:
                await self.emit_image(
                    img_el, doc_uri_str, source_doc_id,
                    v.metadata, flow,
                    mime_type=mime_type,
                )

        logger.info("Document decoding complete")

    @staticmethod
    def add_args(parser):
        """Register this processor's command-line arguments."""

        FlowProcessor.add_args(parser)

        parser.add_argument(
            '--strategy',
            default='auto',
            choices=['auto', 'hi_res', 'fast'],
            help='Partitioning strategy (default: auto)',
        )

        parser.add_argument(
            '--languages',
            default='eng',
            help='Comma-separated OCR language codes (default: eng)',
        )

        parser.add_argument(
            '--section-strategy',
            default='whole-document',
            choices=[
                'whole-document', 'heading', 'element-type', 'count', 'size'
            ],
            help='Section grouping strategy for non-page formats '
                 '(default: whole-document)',
        )

        parser.add_argument(
            '--section-element-count',
            type=int,
            default=20,
            help='Elements per section for count strategy (default: 20)',
        )

        parser.add_argument(
            '--section-max-size',
            type=int,
            default=4000,
            help='Max chars per section for size strategy (default: 4000)',
        )

        parser.add_argument(
            '--section-within-pages',
            action='store_true',
            default=False,
            help='Apply section strategy within pages too (default: false)',
        )

        parser.add_argument(
            '--librarian-request-queue',
            default=default_librarian_request_queue,
            help=f'Librarian request queue '
                 f'(default: {default_librarian_request_queue})',
        )

        parser.add_argument(
            '--librarian-response-queue',
            default=default_librarian_response_queue,
            help=f'Librarian response queue '
                 f'(default: {default_librarian_response_queue})',
        )
||||
|
||||
def run():
    """Launch the Processor service under the default ident."""
    Processor.launch(default_ident, __doc__)
|
||||
|
|
@ -0,0 +1,171 @@
|
|||
|
||||
"""
|
||||
Section grouping strategies for the universal document decoder.
|
||||
|
||||
Each strategy takes a list of unstructured elements and returns a list
|
||||
of element groups. Each group becomes one TextDocument output.
|
||||
"""
|
||||
|
||||
import logging
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def group_whole_document(elements, **kwargs):
    """
    Emit the entire document as one section; any further splitting is
    the downstream chunker's job. An empty element list yields no
    sections at all.
    """
    return [elements] if elements else []
|
||||
|
||||
|
||||
def group_by_heading(elements, **kwargs):
    """
    Split at heading elements (Title category).

    Each section is a heading plus all content until the next heading.
    Falls back to whole-document if no headings are found.
    """
    if not elements:
        return []

    # Without any Title elements there is nothing to split on
    if all(getattr(el, 'category', '') != 'Title' for el in elements):
        logger.debug("No headings found, falling back to whole-document")
        return group_whole_document(elements)

    sections = []
    pending = []

    for item in elements:
        # A Title starts a new section, unless nothing has accumulated
        # yet (leading content stays with the first heading's section)
        if getattr(item, 'category', '') == 'Title' and pending:
            sections.append(pending)
            pending = []
        pending.append(item)

    if pending:
        sections.append(pending)

    return sections
|
||||
|
||||
|
||||
def group_by_element_type(elements, **kwargs):
    """
    Split on transitions between narrative text and tables.

    Consecutive elements of the same broad category (table vs
    non-table) stay in the same group.
    """
    if not elements:
        return []

    sections = []
    run = []
    prev_is_table = None

    for item in elements:
        item_is_table = getattr(item, 'category', '') == 'Table'
        # A category flip (after the first element) closes the run
        if prev_is_table is not None and item_is_table != prev_is_table:
            sections.append(run)
            run = []
        run.append(item)
        prev_is_table = item_is_table

    if run:
        sections.append(run)

    return sections
|
||||
|
||||
|
||||
def group_by_count(elements, element_count=20, **kwargs):
    """
    Group a fixed number of elements per section.

    Args:
        elements: List of unstructured elements
        element_count: Number of elements per group (default: 20)
    """
    if not elements:
        return []

    # Slice into consecutive chunks; the last chunk may be short
    return [
        elements[start:start + element_count]
        for start in range(0, len(elements), element_count)
    ]
|
||||
|
||||
|
||||
def group_by_size(elements, max_size=4000, **kwargs):
    """
    Accumulate elements until a character limit is reached.

    Respects element boundaries — never splits mid-element. If a
    single element exceeds the limit, it becomes its own section.

    Args:
        elements: List of unstructured elements
        max_size: Max characters per section (default: 4000)
    """
    if not elements:
        return []

    sections = []
    bucket = []
    bucket_chars = 0

    for item in elements:
        length = len(getattr(item, 'text', '') or '')

        # Close the bucket when adding this element would overflow it;
        # an empty bucket always accepts the element (oversized element
        # becomes its own section)
        if bucket and bucket_chars + length > max_size:
            sections.append(bucket)
            bucket, bucket_chars = [], 0

        bucket.append(item)
        bucket_chars += length

    if bucket:
        sections.append(bucket)

    return sections
|
||||
|
||||
|
||||
# Strategy registry: maps the --section-strategy CLI names to their
# grouping functions; get_strategy() resolves names against this table.
STRATEGIES = {
    'whole-document': group_whole_document,
    'heading': group_by_heading,
    'element-type': group_by_element_type,
    'count': group_by_count,
    'size': group_by_size,
}
|
||||
|
||||
|
||||
def get_strategy(name):
    """
    Look up a section grouping strategy by name.

    Args:
        name: Strategy name (whole-document, heading, element-type, count, size)

    Returns:
        Strategy function

    Raises:
        ValueError: If strategy name is not recognized
    """
    if name in STRATEGIES:
        return STRATEGIES[name]

    raise ValueError(
        f"Unknown section strategy: {name}. "
        f"Available: {', '.join(STRATEGIES.keys())}"
    )
|
||||
Loading…
Add table
Add a link
Reference in a new issue