mirror of
https://github.com/trustgraph-ai/trustgraph.git
synced 2026-05-01 11:26:22 +02:00
Use UUID-based URNs for page and chunk IDs (#703)
Page and chunk document IDs were deterministic ({doc_id}/p{num},
{doc_id}/p{num}/c{num}), causing "Document already exists" errors
when reprocessing documents through different flows. Content may
differ between runs due to different parameters or extractors, so
deterministic IDs are incorrect.
Pages now use urn:page:{uuid}, chunks use
urn:chunk:{uuid}. Parent-child relationships are tracked via
librarian metadata and provenance triples.
Also brings Mistral OCR and Tesseract OCR decoders up to parity
with the PDF decoder: librarian fetch/save support, per-page
output with unique IDs, and provenance triple emission. Fixes
Mistral OCR bug where only the first 5 pages were processed.
This commit is contained in:
parent
1a7b654bd3
commit
96fd1eab15
10 changed files with 694 additions and 286 deletions
|
|
@ -9,14 +9,14 @@ Provides helpers for:
|
|||
Usage example:
|
||||
|
||||
from trustgraph.provenance import (
|
||||
document_uri, page_uri, chunk_uri_from_page,
|
||||
document_uri, page_uri, chunk_uri,
|
||||
document_triples, derived_entity_triples,
|
||||
get_vocabulary_triples,
|
||||
)
|
||||
|
||||
# Generate URIs
|
||||
doc_uri = document_uri("my-doc-123")
|
||||
page_uri = page_uri("my-doc-123", page_number=1)
|
||||
pg_uri = page_uri()
|
||||
|
||||
# Build provenance triples
|
||||
triples = document_triples(
|
||||
|
|
@ -35,8 +35,7 @@ from . uris import (
|
|||
TRUSTGRAPH_BASE,
|
||||
document_uri,
|
||||
page_uri,
|
||||
chunk_uri_from_page,
|
||||
chunk_uri_from_doc,
|
||||
chunk_uri,
|
||||
activity_uri,
|
||||
subgraph_uri,
|
||||
agent_uri,
|
||||
|
|
@ -138,8 +137,7 @@ __all__ = [
|
|||
"TRUSTGRAPH_BASE",
|
||||
"document_uri",
|
||||
"page_uri",
|
||||
"chunk_uri_from_page",
|
||||
"chunk_uri_from_doc",
|
||||
"chunk_uri",
|
||||
"activity_uri",
|
||||
"subgraph_uri",
|
||||
"agent_uri",
|
||||
|
|
|
|||
|
|
@ -1,12 +1,11 @@
|
|||
"""
|
||||
URI generation for provenance entities.
|
||||
|
||||
Document IDs are already IRIs (e.g., https://trustgraph.ai/doc/abc123).
|
||||
Child entities (pages, chunks) append path segments to the parent IRI:
|
||||
- Document: {doc_iri} (as provided)
|
||||
- Page: {doc_iri}/p{page_number}
|
||||
- Chunk: {page_iri}/c{chunk_index} (from page)
|
||||
{doc_iri}/c{chunk_index} (from text doc)
|
||||
Document IDs are externally provided (e.g., https://trustgraph.ai/doc/abc123).
|
||||
Child entities (pages, chunks) use UUID-based URNs:
|
||||
- Document: {doc_iri} (as provided, not generated here)
|
||||
- Page: urn:page:{uuid}
|
||||
- Chunk: urn:chunk:{uuid}
|
||||
- Activity: https://trustgraph.ai/activity/{uuid}
|
||||
- Subgraph: https://trustgraph.ai/subgraph/{uuid}
|
||||
"""
|
||||
|
|
@ -28,19 +27,14 @@ def document_uri(doc_iri: str) -> str:
|
|||
return doc_iri
|
||||
|
||||
|
||||
def page_uri(doc_iri: str, page_number: int) -> str:
|
||||
"""Generate URI for a page by appending to document IRI."""
|
||||
return f"{doc_iri}/p{page_number}"
|
||||
def page_uri() -> str:
|
||||
"""Generate a unique URI for a page."""
|
||||
return f"urn:page:{uuid.uuid4()}"
|
||||
|
||||
|
||||
def chunk_uri_from_page(doc_iri: str, page_number: int, chunk_index: int) -> str:
|
||||
"""Generate URI for a chunk extracted from a page."""
|
||||
return f"{doc_iri}/p{page_number}/c{chunk_index}"
|
||||
|
||||
|
||||
def chunk_uri_from_doc(doc_iri: str, chunk_index: int) -> str:
|
||||
"""Generate URI for a chunk extracted directly from a text document."""
|
||||
return f"{doc_iri}/c{chunk_index}"
|
||||
def chunk_uri() -> str:
|
||||
"""Generate a unique URI for a chunk."""
|
||||
return f"urn:chunk:{uuid.uuid4()}"
|
||||
|
||||
|
||||
def activity_uri(activity_id: str = None) -> str:
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue