mirror of
https://github.com/trustgraph-ai/trustgraph.git
synced 2026-04-25 00:16:23 +02:00
Extract-time provenance (#661)
1. Shared Provenance Module - URI generators, namespace constants,
triple builders, vocabulary bootstrap
2. Librarian - Emits document metadata to graph on processing
initiation (vocabulary bootstrap + PROV-O triples)
3. PDF Extractor - Saves pages as child documents, emits parent-child
provenance edges, forwards page IDs
4. Chunker - Saves chunks as child documents, emits provenance edges,
forwards chunk ID + content
5. Knowledge Extractors (both definitions and relationships):
- Link entities to chunks via SUBJECT_OF (not top-level document)
- Removed duplicate metadata emission (now handled by librarian)
- Get chunk_doc_id and chunk_uri from incoming Chunk message
6. Embedding Provenance:
- EntityContext schema has chunk_id field
- EntityEmbeddings schema has chunk_id field
- Definitions extractor sets chunk_id when creating EntityContext
- Graph embeddings processor passes chunk_id through to
EntityEmbeddings
Provenance Flow:
Document → Page (PDF) → Chunk → Extracted Facts/Embeddings
Each of Document, Page, and Chunk is stored in the librarian and mirrored
to the graph; extracted facts and embeddings carry a chunk_id reference
back to their source chunk.
Each artifact is stored in librarian with parent-child linking, and PROV-O
edges are emitted to the knowledge graph for full traceability from any
extracted fact back to its source document.
Also, updating tests
This commit is contained in:
parent
d8f0a576af
commit
cd5580be59
20 changed files with 1601 additions and 59 deletions
|
|
@ -15,7 +15,7 @@ from .consumer import Consumer
|
|||
from .producer import Producer
|
||||
from .metrics import ConsumerMetrics, ProducerMetrics
|
||||
|
||||
from ..schema import LibrarianRequest, LibrarianResponse
|
||||
from ..schema import LibrarianRequest, LibrarianResponse, DocumentMetadata
|
||||
from ..schema import librarian_request_queue, librarian_response_queue
|
||||
|
||||
# Module logger
|
||||
|
|
@ -135,6 +135,67 @@ class ChunkingService(FlowProcessor):
|
|||
self.pending_requests.pop(request_id, None)
|
||||
raise RuntimeError(f"Timeout fetching document {document_id}")
|
||||
|
||||
async def save_child_document(self, doc_id, parent_id, user, content,
                              document_type="chunk", title=None, timeout=120,
                              kind="text/plain"):
    """
    Save a child document (chunk) to the librarian.

    Args:
        doc_id: ID for the new child document
        parent_id: ID of the parent document
        user: User ID
        content: Document content (bytes or str)
        document_type: Type of document ("chunk", etc.)
        title: Optional title (defaults to doc_id)
        timeout: Request timeout in seconds
        kind: MIME type recorded for the child document
            (defaults to "text/plain", the previous hard-coded value)

    Returns:
        The document ID on success

    Raises:
        RuntimeError: if the librarian reports an error or the request
            times out
    """
    request_id = str(uuid.uuid4())

    # Content travels base64-encoded, so normalise str input to bytes first.
    if isinstance(content, str):
        content = content.encode("utf-8")

    doc_metadata = DocumentMetadata(
        id=doc_id,
        user=user,
        kind=kind,
        title=title or doc_id,
        parent_id=parent_id,
        document_type=document_type,
    )

    request = LibrarianRequest(
        operation="add-child-document",
        document_metadata=doc_metadata,
        content=base64.b64encode(content).decode("utf-8"),
    )

    # Correlate the eventual response with this request via a future keyed
    # by request_id; the response consumer resolves the future.
    future = asyncio.get_event_loop().create_future()
    self.pending_requests[request_id] = future

    try:
        # Send request
        await self.librarian_request_producer.send(
            request, properties={"id": request_id}
        )

        # Wait for response
        response = await asyncio.wait_for(future, timeout=timeout)

        if response.error:
            raise RuntimeError(
                f"Librarian error saving chunk: {response.error.type}: {response.error.message}"
            )

        return doc_id

    except asyncio.TimeoutError:
        raise RuntimeError(f"Timeout saving chunk {doc_id}")

    finally:
        # Previously only the timeout path cleaned up, so an error response
        # or a failed send() leaked the correlation entry. Pop on every exit
        # path; pop() with a default tolerates the response handler having
        # already removed it.
        self.pending_requests.pop(request_id, None)
|
||||
|
||||
async def get_document_text(self, doc):
|
||||
"""
|
||||
Get text content from a TextDocument, fetching from librarian if needed.
|
||||
|
|
|
|||
110
trustgraph-base/trustgraph/provenance/__init__.py
Normal file
110
trustgraph-base/trustgraph/provenance/__init__.py
Normal file
|
|
@ -0,0 +1,110 @@
|
|||
"""
|
||||
Provenance module for extraction-time provenance support.
|
||||
|
||||
Provides helpers for:
|
||||
- URI generation for documents, pages, chunks, activities, statements
|
||||
- PROV-O triple building for provenance metadata
|
||||
- Vocabulary bootstrap for per-collection initialization
|
||||
|
||||
Usage example:
|
||||
|
||||
from trustgraph.provenance import (
|
||||
document_uri, page_uri, chunk_uri_from_page,
|
||||
document_triples, derived_entity_triples,
|
||||
get_vocabulary_triples,
|
||||
)
|
||||
|
||||
# Generate URIs
|
||||
doc_uri = document_uri("my-doc-123")
|
||||
page_uri = page_uri("my-doc-123", page_number=1)
|
||||
|
||||
# Build provenance triples
|
||||
triples = document_triples(
|
||||
doc_uri,
|
||||
title="My Document",
|
||||
mime_type="application/pdf",
|
||||
page_count=10,
|
||||
)
|
||||
|
||||
# Get vocabulary bootstrap triples (once per collection)
|
||||
vocab_triples = get_vocabulary_triples()
|
||||
"""
|
||||
|
||||
# URI generation
|
||||
from . uris import (
|
||||
TRUSTGRAPH_BASE,
|
||||
document_uri,
|
||||
page_uri,
|
||||
chunk_uri_from_page,
|
||||
chunk_uri_from_doc,
|
||||
activity_uri,
|
||||
statement_uri,
|
||||
agent_uri,
|
||||
)
|
||||
|
||||
# Namespace constants
|
||||
from . namespaces import (
|
||||
# PROV-O
|
||||
PROV, PROV_ENTITY, PROV_ACTIVITY, PROV_AGENT,
|
||||
PROV_WAS_DERIVED_FROM, PROV_WAS_GENERATED_BY,
|
||||
PROV_USED, PROV_WAS_ASSOCIATED_WITH, PROV_STARTED_AT_TIME,
|
||||
# Dublin Core
|
||||
DC, DC_TITLE, DC_SOURCE, DC_DATE, DC_CREATOR,
|
||||
# RDF/RDFS
|
||||
RDF, RDF_TYPE, RDFS, RDFS_LABEL,
|
||||
# TrustGraph
|
||||
TG, TG_REIFIES, TG_PAGE_COUNT, TG_MIME_TYPE, TG_PAGE_NUMBER,
|
||||
TG_CHUNK_INDEX, TG_CHAR_OFFSET, TG_CHAR_LENGTH,
|
||||
TG_CHUNK_SIZE, TG_CHUNK_OVERLAP, TG_COMPONENT_VERSION,
|
||||
TG_LLM_MODEL, TG_ONTOLOGY, TG_EMBEDDING_MODEL,
|
||||
TG_SOURCE_TEXT, TG_SOURCE_CHAR_OFFSET, TG_SOURCE_CHAR_LENGTH,
|
||||
)
|
||||
|
||||
# Triple builders
|
||||
from . triples import (
|
||||
document_triples,
|
||||
derived_entity_triples,
|
||||
triple_provenance_triples,
|
||||
)
|
||||
|
||||
# Vocabulary bootstrap
|
||||
from . vocabulary import (
|
||||
get_vocabulary_triples,
|
||||
PROV_CLASS_LABELS,
|
||||
PROV_PREDICATE_LABELS,
|
||||
DC_PREDICATE_LABELS,
|
||||
TG_PREDICATE_LABELS,
|
||||
)
|
||||
|
||||
__all__ = [
|
||||
# URIs
|
||||
"TRUSTGRAPH_BASE",
|
||||
"document_uri",
|
||||
"page_uri",
|
||||
"chunk_uri_from_page",
|
||||
"chunk_uri_from_doc",
|
||||
"activity_uri",
|
||||
"statement_uri",
|
||||
"agent_uri",
|
||||
# Namespaces
|
||||
"PROV", "PROV_ENTITY", "PROV_ACTIVITY", "PROV_AGENT",
|
||||
"PROV_WAS_DERIVED_FROM", "PROV_WAS_GENERATED_BY",
|
||||
"PROV_USED", "PROV_WAS_ASSOCIATED_WITH", "PROV_STARTED_AT_TIME",
|
||||
"DC", "DC_TITLE", "DC_SOURCE", "DC_DATE", "DC_CREATOR",
|
||||
"RDF", "RDF_TYPE", "RDFS", "RDFS_LABEL",
|
||||
"TG", "TG_REIFIES", "TG_PAGE_COUNT", "TG_MIME_TYPE", "TG_PAGE_NUMBER",
|
||||
"TG_CHUNK_INDEX", "TG_CHAR_OFFSET", "TG_CHAR_LENGTH",
|
||||
"TG_CHUNK_SIZE", "TG_CHUNK_OVERLAP", "TG_COMPONENT_VERSION",
|
||||
"TG_LLM_MODEL", "TG_ONTOLOGY", "TG_EMBEDDING_MODEL",
|
||||
"TG_SOURCE_TEXT", "TG_SOURCE_CHAR_OFFSET", "TG_SOURCE_CHAR_LENGTH",
|
||||
# Triple builders
|
||||
"document_triples",
|
||||
"derived_entity_triples",
|
||||
"triple_provenance_triples",
|
||||
# Vocabulary
|
||||
"get_vocabulary_triples",
|
||||
"PROV_CLASS_LABELS",
|
||||
"PROV_PREDICATE_LABELS",
|
||||
"DC_PREDICATE_LABELS",
|
||||
"TG_PREDICATE_LABELS",
|
||||
]
|
||||
48
trustgraph-base/trustgraph/provenance/namespaces.py
Normal file
48
trustgraph-base/trustgraph/provenance/namespaces.py
Normal file
|
|
@ -0,0 +1,48 @@
|
|||
"""
|
||||
RDF namespace constants for provenance.
|
||||
|
||||
Includes PROV-O, Dublin Core, and TrustGraph namespace URIs.
|
||||
"""
|
||||
|
||||
# PROV-O namespace (W3C Provenance Ontology)
|
||||
PROV = "http://www.w3.org/ns/prov#"
|
||||
PROV_ENTITY = PROV + "Entity"
|
||||
PROV_ACTIVITY = PROV + "Activity"
|
||||
PROV_AGENT = PROV + "Agent"
|
||||
PROV_WAS_DERIVED_FROM = PROV + "wasDerivedFrom"
|
||||
PROV_WAS_GENERATED_BY = PROV + "wasGeneratedBy"
|
||||
PROV_USED = PROV + "used"
|
||||
PROV_WAS_ASSOCIATED_WITH = PROV + "wasAssociatedWith"
|
||||
PROV_STARTED_AT_TIME = PROV + "startedAtTime"
|
||||
|
||||
# Dublin Core namespace
|
||||
DC = "http://purl.org/dc/elements/1.1/"
|
||||
DC_TITLE = DC + "title"
|
||||
DC_SOURCE = DC + "source"
|
||||
DC_DATE = DC + "date"
|
||||
DC_CREATOR = DC + "creator"
|
||||
|
||||
# RDF/RDFS namespace (also in rdf.py, but included here for completeness)
|
||||
RDF = "http://www.w3.org/1999/02/22-rdf-syntax-ns#"
|
||||
RDF_TYPE = RDF + "type"
|
||||
RDFS = "http://www.w3.org/2000/01/rdf-schema#"
|
||||
RDFS_LABEL = RDFS + "label"
|
||||
|
||||
# TrustGraph namespace for custom predicates
|
||||
TG = "https://trustgraph.ai/ns/"
|
||||
TG_REIFIES = TG + "reifies"
|
||||
TG_PAGE_COUNT = TG + "pageCount"
|
||||
TG_MIME_TYPE = TG + "mimeType"
|
||||
TG_PAGE_NUMBER = TG + "pageNumber"
|
||||
TG_CHUNK_INDEX = TG + "chunkIndex"
|
||||
TG_CHAR_OFFSET = TG + "charOffset"
|
||||
TG_CHAR_LENGTH = TG + "charLength"
|
||||
TG_CHUNK_SIZE = TG + "chunkSize"
|
||||
TG_CHUNK_OVERLAP = TG + "chunkOverlap"
|
||||
TG_COMPONENT_VERSION = TG + "componentVersion"
|
||||
TG_LLM_MODEL = TG + "llmModel"
|
||||
TG_ONTOLOGY = TG + "ontology"
|
||||
TG_EMBEDDING_MODEL = TG + "embeddingModel"
|
||||
TG_SOURCE_TEXT = TG + "sourceText"
|
||||
TG_SOURCE_CHAR_OFFSET = TG + "sourceCharOffset"
|
||||
TG_SOURCE_CHAR_LENGTH = TG + "sourceCharLength"
|
||||
251
trustgraph-base/trustgraph/provenance/triples.py
Normal file
251
trustgraph-base/trustgraph/provenance/triples.py
Normal file
|
|
@ -0,0 +1,251 @@
|
|||
"""
|
||||
Helper functions to build PROV-O triples for extraction-time provenance.
|
||||
"""
|
||||
|
||||
from datetime import datetime
|
||||
from typing import List, Optional
|
||||
|
||||
from .. schema import Triple, Term, IRI, LITERAL
|
||||
|
||||
from . namespaces import (
|
||||
RDF_TYPE, RDFS_LABEL,
|
||||
PROV_ENTITY, PROV_ACTIVITY, PROV_AGENT,
|
||||
PROV_WAS_DERIVED_FROM, PROV_WAS_GENERATED_BY,
|
||||
PROV_USED, PROV_WAS_ASSOCIATED_WITH, PROV_STARTED_AT_TIME,
|
||||
DC_TITLE, DC_SOURCE, DC_DATE, DC_CREATOR,
|
||||
TG_PAGE_COUNT, TG_MIME_TYPE, TG_PAGE_NUMBER,
|
||||
TG_CHUNK_INDEX, TG_CHAR_OFFSET, TG_CHAR_LENGTH,
|
||||
TG_CHUNK_SIZE, TG_CHUNK_OVERLAP, TG_COMPONENT_VERSION,
|
||||
TG_LLM_MODEL, TG_ONTOLOGY, TG_REIFIES,
|
||||
)
|
||||
|
||||
from . uris import activity_uri, agent_uri
|
||||
|
||||
|
||||
def _iri(uri: str) -> Term:
    """Wrap *uri* in an IRI-typed Term."""
    term = Term(type=IRI, iri=uri)
    return term
|
||||
|
||||
|
||||
def _literal(value) -> Term:
    """Wrap *value* (stringified) in a literal-typed Term."""
    term = Term(type=LITERAL, value=str(value))
    return term
|
||||
|
||||
|
||||
def _triple(s: str, p: str, o_term: Term) -> Triple:
    """Assemble a Triple whose subject and predicate are IRIs."""
    subject = _iri(s)
    predicate = _iri(p)
    return Triple(s=subject, p=predicate, o=o_term)
|
||||
|
||||
|
||||
def document_triples(
    doc_uri: str,
    title: Optional[str] = None,
    source: Optional[str] = None,
    date: Optional[str] = None,
    creator: Optional[str] = None,
    page_count: Optional[int] = None,
    mime_type: Optional[str] = None,
) -> List[Triple]:
    """
    Build triples describing a source document entity.

    The document is always declared a prov:Entity; every other triple is
    emitted only when the corresponding argument is supplied.

    Args:
        doc_uri: The document URI (from uris.document_uri)
        title: Document title (emitted as both dc:title and rdfs:label)
        source: Source URL/path (emitted as an IRI, not a literal)
        date: Document date
        creator: Author/creator
        page_count: Number of pages (for PDFs)
        mime_type: MIME type

    Returns:
        List of Triple objects
    """
    out: List[Triple] = [_triple(doc_uri, RDF_TYPE, _iri(PROV_ENTITY))]

    if title:
        out.extend([
            _triple(doc_uri, DC_TITLE, _literal(title)),
            _triple(doc_uri, RDFS_LABEL, _literal(title)),
        ])

    if source:
        # Source is a URL/path, so it is recorded as an IRI.
        out.append(_triple(doc_uri, DC_SOURCE, _iri(source)))

    # Literal-valued Dublin Core fields share one shape.
    for predicate, value in ((DC_DATE, date), (DC_CREATOR, creator)):
        if value:
            out.append(_triple(doc_uri, predicate, _literal(value)))

    # page_count of 0 is meaningful, so test against None, not truthiness.
    if page_count is not None:
        out.append(_triple(doc_uri, TG_PAGE_COUNT, _literal(page_count)))

    if mime_type:
        out.append(_triple(doc_uri, TG_MIME_TYPE, _literal(mime_type)))

    return out
|
||||
|
||||
|
||||
def derived_entity_triples(
    entity_uri: str,
    parent_uri: str,
    component_name: str,
    component_version: str,
    label: Optional[str] = None,
    page_number: Optional[int] = None,
    chunk_index: Optional[int] = None,
    char_offset: Optional[int] = None,
    char_length: Optional[int] = None,
    chunk_size: Optional[int] = None,
    chunk_overlap: Optional[int] = None,
    timestamp: Optional[str] = None,
) -> List[Triple]:
    """
    Build triples for a derived entity (page or chunk) with full PROV-O provenance.

    Creates:
    - Entity declaration
    - wasDerivedFrom relationship to parent
    - Activity for the extraction
    - Agent for the component

    Args:
        entity_uri: URI of the derived entity (page or chunk)
        parent_uri: URI of the parent entity
        component_name: Name of TG component (e.g., "pdf-extractor", "chunker")
        component_version: Version of the component
        label: Human-readable label
        page_number: Page number (for pages)
        chunk_index: Chunk index (for chunks)
        char_offset: Character offset in parent (for chunks)
        char_length: Character length (for chunks)
        chunk_size: Configured chunk size (for chunking activity)
        chunk_overlap: Configured chunk overlap (for chunking activity)
        timestamp: ISO timestamp (defaults to now, UTC, "Z"-suffixed)

    Returns:
        List of Triple objects
    """
    if timestamp is None:
        # datetime.utcnow() is deprecated since Python 3.12 and yields a
        # naive value; use an aware UTC timestamp instead, while keeping
        # the trailing-"Z" wire format the previous code emitted.
        from datetime import timezone
        timestamp = datetime.now(timezone.utc).isoformat().replace(
            "+00:00", "Z")

    # Each call mints a fresh activity; the agent URI is stable per component.
    act_uri = activity_uri()
    agt_uri = agent_uri(component_name)

    triples = [
        # Entity declaration
        _triple(entity_uri, RDF_TYPE, _iri(PROV_ENTITY)),

        # Derivation from parent
        _triple(entity_uri, PROV_WAS_DERIVED_FROM, _iri(parent_uri)),

        # Generation by activity
        _triple(entity_uri, PROV_WAS_GENERATED_BY, _iri(act_uri)),

        # Activity declaration
        _triple(act_uri, RDF_TYPE, _iri(PROV_ACTIVITY)),
        _triple(act_uri, PROV_USED, _iri(parent_uri)),
        _triple(act_uri, PROV_WAS_ASSOCIATED_WITH, _iri(agt_uri)),
        _triple(act_uri, PROV_STARTED_AT_TIME, _literal(timestamp)),
        _triple(act_uri, TG_COMPONENT_VERSION, _literal(component_version)),

        # Agent declaration
        _triple(agt_uri, RDF_TYPE, _iri(PROV_AGENT)),
        _triple(agt_uri, RDFS_LABEL, _literal(component_name)),
    ]

    if label:
        triples.append(_triple(entity_uri, RDFS_LABEL, _literal(label)))

    # Positional metadata: 0 is a valid value, so compare against None.
    if page_number is not None:
        triples.append(_triple(entity_uri, TG_PAGE_NUMBER, _literal(page_number)))

    if chunk_index is not None:
        triples.append(_triple(entity_uri, TG_CHUNK_INDEX, _literal(chunk_index)))

    if char_offset is not None:
        triples.append(_triple(entity_uri, TG_CHAR_OFFSET, _literal(char_offset)))

    if char_length is not None:
        triples.append(_triple(entity_uri, TG_CHAR_LENGTH, _literal(char_length)))

    # Chunking configuration attaches to the activity, not the entity.
    if chunk_size is not None:
        triples.append(_triple(act_uri, TG_CHUNK_SIZE, _literal(chunk_size)))

    if chunk_overlap is not None:
        triples.append(_triple(act_uri, TG_CHUNK_OVERLAP, _literal(chunk_overlap)))

    return triples
|
||||
|
||||
|
||||
def triple_provenance_triples(
    stmt_uri: str,
    subject_uri: str,
    predicate_uri: str,
    object_term: Term,
    chunk_uri: str,
    component_name: str,
    component_version: str,
    llm_model: Optional[str] = None,
    ontology_uri: Optional[str] = None,
    timestamp: Optional[str] = None,
) -> List[Triple]:
    """
    Build provenance triples for an extracted knowledge triple using reification.

    Creates:
    - Statement object that reifies the triple
    - wasDerivedFrom link to source chunk
    - Activity and agent metadata

    Args:
        stmt_uri: URI for the reified statement
        subject_uri: Subject of the extracted triple
        predicate_uri: Predicate of the extracted triple
        object_term: Object of the extracted triple (Term)
        chunk_uri: URI of source chunk
        component_name: Name of extractor component
        component_version: Version of the component
        llm_model: LLM model used for extraction
        ontology_uri: Ontology URI used for extraction
        timestamp: ISO timestamp (defaults to now, UTC, "Z"-suffixed)

    Returns:
        List of Triple objects for the provenance (not the triple itself)
    """
    if timestamp is None:
        # datetime.utcnow() is deprecated since Python 3.12 and yields a
        # naive value; use an aware UTC timestamp instead, while keeping
        # the trailing-"Z" wire format the previous code emitted.
        from datetime import timezone
        timestamp = datetime.now(timezone.utc).isoformat().replace(
            "+00:00", "Z")

    act_uri = activity_uri()
    agt_uri = agent_uri(component_name)

    # Note: The actual reification (tg:reifies pointing at the edge) requires
    # RDF 1.2 triple term support. This builds the surrounding provenance.
    # The actual reification link must be handled by the knowledge extractor
    # using the graph store's reification API. For that reason subject_uri,
    # predicate_uri and object_term are accepted but not emitted here.

    triples = [
        # Statement provenance
        _triple(stmt_uri, PROV_WAS_DERIVED_FROM, _iri(chunk_uri)),
        _triple(stmt_uri, PROV_WAS_GENERATED_BY, _iri(act_uri)),

        # Activity
        _triple(act_uri, RDF_TYPE, _iri(PROV_ACTIVITY)),
        _triple(act_uri, PROV_USED, _iri(chunk_uri)),
        _triple(act_uri, PROV_WAS_ASSOCIATED_WITH, _iri(agt_uri)),
        _triple(act_uri, PROV_STARTED_AT_TIME, _literal(timestamp)),
        _triple(act_uri, TG_COMPONENT_VERSION, _literal(component_version)),

        # Agent
        _triple(agt_uri, RDF_TYPE, _iri(PROV_AGENT)),
        _triple(agt_uri, RDFS_LABEL, _literal(component_name)),
    ]

    if llm_model:
        triples.append(_triple(act_uri, TG_LLM_MODEL, _literal(llm_model)))

    # The ontology is itself a resource, so link it as an IRI.
    if ontology_uri:
        triples.append(_triple(act_uri, TG_ONTOLOGY, _iri(ontology_uri)))

    return triples
|
||||
61
trustgraph-base/trustgraph/provenance/uris.py
Normal file
61
trustgraph-base/trustgraph/provenance/uris.py
Normal file
|
|
@ -0,0 +1,61 @@
|
|||
"""
|
||||
URI generation for provenance entities.
|
||||
|
||||
URI patterns:
|
||||
- Document: https://trustgraph.ai/doc/{doc_id}
|
||||
- Page: https://trustgraph.ai/page/{doc_id}/p{page_number}
|
||||
- Chunk: https://trustgraph.ai/chunk/{doc_id}/p{page}/c{chunk} (from page)
|
||||
https://trustgraph.ai/chunk/{doc_id}/c{chunk} (from text doc)
|
||||
- Activity: https://trustgraph.ai/activity/{uuid}
|
||||
- Statement: https://trustgraph.ai/stmt/{uuid}
|
||||
"""
|
||||
|
||||
import uuid
|
||||
import urllib.parse
|
||||
|
||||
# Base URI prefix
|
||||
TRUSTGRAPH_BASE = "https://trustgraph.ai"
|
||||
|
||||
|
||||
def _encode_id(id_str: str) -> str:
|
||||
"""URL-encode an ID component for safe inclusion in URIs."""
|
||||
return urllib.parse.quote(str(id_str), safe='')
|
||||
|
||||
|
||||
def document_uri(doc_id: str) -> str:
|
||||
"""Generate URI for a source document."""
|
||||
return f"{TRUSTGRAPH_BASE}/doc/{_encode_id(doc_id)}"
|
||||
|
||||
|
||||
def page_uri(doc_id: str, page_number: int) -> str:
|
||||
"""Generate URI for a page extracted from a document."""
|
||||
return f"{TRUSTGRAPH_BASE}/page/{_encode_id(doc_id)}/p{page_number}"
|
||||
|
||||
|
||||
def chunk_uri_from_page(doc_id: str, page_number: int, chunk_index: int) -> str:
|
||||
"""Generate URI for a chunk extracted from a page."""
|
||||
return f"{TRUSTGRAPH_BASE}/chunk/{_encode_id(doc_id)}/p{page_number}/c{chunk_index}"
|
||||
|
||||
|
||||
def chunk_uri_from_doc(doc_id: str, chunk_index: int) -> str:
|
||||
"""Generate URI for a chunk extracted directly from a text document."""
|
||||
return f"{TRUSTGRAPH_BASE}/chunk/{_encode_id(doc_id)}/c{chunk_index}"
|
||||
|
||||
|
||||
def activity_uri(activity_id: str = None) -> str:
|
||||
"""Generate URI for a PROV-O activity. Auto-generates UUID if not provided."""
|
||||
if activity_id is None:
|
||||
activity_id = str(uuid.uuid4())
|
||||
return f"{TRUSTGRAPH_BASE}/activity/{_encode_id(activity_id)}"
|
||||
|
||||
|
||||
def statement_uri(stmt_id: str = None) -> str:
|
||||
"""Generate URI for a reified statement. Auto-generates UUID if not provided."""
|
||||
if stmt_id is None:
|
||||
stmt_id = str(uuid.uuid4())
|
||||
return f"{TRUSTGRAPH_BASE}/stmt/{_encode_id(stmt_id)}"
|
||||
|
||||
|
||||
def agent_uri(component_name: str) -> str:
|
||||
"""Generate URI for a TrustGraph component agent."""
|
||||
return f"{TRUSTGRAPH_BASE}/agent/{_encode_id(component_name)}"
|
||||
101
trustgraph-base/trustgraph/provenance/vocabulary.py
Normal file
101
trustgraph-base/trustgraph/provenance/vocabulary.py
Normal file
|
|
@ -0,0 +1,101 @@
|
|||
"""
|
||||
Vocabulary bootstrap for provenance.
|
||||
|
||||
The knowledge graph is ontology-neutral and initializes empty. When writing
|
||||
PROV-O provenance data to a collection for the first time, the vocabulary
|
||||
must be bootstrapped with RDF labels for all classes and predicates.
|
||||
"""
|
||||
|
||||
from typing import List
|
||||
|
||||
from .. schema import Triple, Term, IRI, LITERAL
|
||||
|
||||
from . namespaces import (
|
||||
RDFS_LABEL,
|
||||
PROV_ENTITY, PROV_ACTIVITY, PROV_AGENT,
|
||||
PROV_WAS_DERIVED_FROM, PROV_WAS_GENERATED_BY,
|
||||
PROV_USED, PROV_WAS_ASSOCIATED_WITH, PROV_STARTED_AT_TIME,
|
||||
DC_TITLE, DC_SOURCE, DC_DATE, DC_CREATOR,
|
||||
TG_REIFIES, TG_PAGE_COUNT, TG_MIME_TYPE, TG_PAGE_NUMBER,
|
||||
TG_CHUNK_INDEX, TG_CHAR_OFFSET, TG_CHAR_LENGTH,
|
||||
TG_CHUNK_SIZE, TG_CHUNK_OVERLAP, TG_COMPONENT_VERSION,
|
||||
TG_LLM_MODEL, TG_ONTOLOGY, TG_EMBEDDING_MODEL,
|
||||
TG_SOURCE_TEXT, TG_SOURCE_CHAR_OFFSET, TG_SOURCE_CHAR_LENGTH,
|
||||
)
|
||||
|
||||
|
||||
def _label_triple(uri: str, label: str) -> Triple:
|
||||
"""Create a label triple for a URI."""
|
||||
return Triple(
|
||||
s=Term(type=IRI, iri=uri),
|
||||
p=Term(type=IRI, iri=RDFS_LABEL),
|
||||
o=Term(type=LITERAL, value=label),
|
||||
)
|
||||
|
||||
|
||||
# PROV-O class labels
|
||||
PROV_CLASS_LABELS = [
|
||||
_label_triple(PROV_ENTITY, "Entity"),
|
||||
_label_triple(PROV_ACTIVITY, "Activity"),
|
||||
_label_triple(PROV_AGENT, "Agent"),
|
||||
]
|
||||
|
||||
# PROV-O predicate labels
|
||||
PROV_PREDICATE_LABELS = [
|
||||
_label_triple(PROV_WAS_DERIVED_FROM, "was derived from"),
|
||||
_label_triple(PROV_WAS_GENERATED_BY, "was generated by"),
|
||||
_label_triple(PROV_USED, "used"),
|
||||
_label_triple(PROV_WAS_ASSOCIATED_WITH, "was associated with"),
|
||||
_label_triple(PROV_STARTED_AT_TIME, "started at"),
|
||||
]
|
||||
|
||||
# Dublin Core predicate labels
|
||||
DC_PREDICATE_LABELS = [
|
||||
_label_triple(DC_TITLE, "title"),
|
||||
_label_triple(DC_SOURCE, "source"),
|
||||
_label_triple(DC_DATE, "date"),
|
||||
_label_triple(DC_CREATOR, "creator"),
|
||||
]
|
||||
|
||||
# TrustGraph predicate labels
|
||||
TG_PREDICATE_LABELS = [
|
||||
_label_triple(TG_REIFIES, "reifies"),
|
||||
_label_triple(TG_PAGE_COUNT, "page count"),
|
||||
_label_triple(TG_MIME_TYPE, "MIME type"),
|
||||
_label_triple(TG_PAGE_NUMBER, "page number"),
|
||||
_label_triple(TG_CHUNK_INDEX, "chunk index"),
|
||||
_label_triple(TG_CHAR_OFFSET, "character offset"),
|
||||
_label_triple(TG_CHAR_LENGTH, "character length"),
|
||||
_label_triple(TG_CHUNK_SIZE, "chunk size"),
|
||||
_label_triple(TG_CHUNK_OVERLAP, "chunk overlap"),
|
||||
_label_triple(TG_COMPONENT_VERSION, "component version"),
|
||||
_label_triple(TG_LLM_MODEL, "LLM model"),
|
||||
_label_triple(TG_ONTOLOGY, "ontology"),
|
||||
_label_triple(TG_EMBEDDING_MODEL, "embedding model"),
|
||||
_label_triple(TG_SOURCE_TEXT, "source text"),
|
||||
_label_triple(TG_SOURCE_CHAR_OFFSET, "source character offset"),
|
||||
_label_triple(TG_SOURCE_CHAR_LENGTH, "source character length"),
|
||||
]
|
||||
|
||||
|
||||
def get_vocabulary_triples() -> List[Triple]:
|
||||
"""
|
||||
Get all vocabulary bootstrap triples.
|
||||
|
||||
Returns a list of triples that define labels for all PROV-O classes,
|
||||
PROV-O predicates, Dublin Core predicates, and TrustGraph predicates
|
||||
used in extraction-time provenance.
|
||||
|
||||
This should be emitted to the knowledge graph once per collection
|
||||
before any provenance data is written. The operation is idempotent -
|
||||
re-emitting the same triples is harmless.
|
||||
|
||||
Returns:
|
||||
List of Triple objects defining vocabulary labels
|
||||
"""
|
||||
return (
|
||||
PROV_CLASS_LABELS +
|
||||
PROV_PREDICATE_LABELS +
|
||||
DC_PREDICATE_LABELS +
|
||||
TG_PREDICATE_LABELS
|
||||
)
|
||||
|
|
@ -34,5 +34,9 @@ class TextDocument:
|
|||
class Chunk:
|
||||
metadata: Metadata | None = None
|
||||
chunk: bytes = b""
|
||||
# For provenance: document_id of this chunk in librarian
|
||||
# Post-chunker optimization: both document_id AND chunk content are included
|
||||
# so downstream processors have the ID for provenance and content to work with
|
||||
document_id: str = ""
|
||||
|
||||
############################################################################
|
||||
|
|
|
|||
|
|
@ -12,6 +12,8 @@ from ..core.topic import topic
|
|||
class EntityEmbeddings:
|
||||
entity: Term | None = None
|
||||
vectors: list[list[float]] = field(default_factory=list)
|
||||
# Provenance: which chunk this embedding was derived from
|
||||
chunk_id: str = ""
|
||||
|
||||
# This is a 'batching' mechanism for the above data
|
||||
@dataclass
|
||||
|
|
|
|||
|
|
@ -12,6 +12,8 @@ from ..core.topic import topic
|
|||
class EntityContext:
|
||||
entity: Term | None = None
|
||||
context: str = ""
|
||||
# Provenance: which chunk this entity context was derived from
|
||||
chunk_id: str = ""
|
||||
|
||||
# This is a 'batching' mechanism for the above data
|
||||
@dataclass
|
||||
|
|
|
|||
|
|
@ -91,7 +91,12 @@ class DocumentMetadata:
|
|||
tags: list[str] = field(default_factory=list)
|
||||
# Child document support
|
||||
parent_id: str = "" # Empty for top-level docs, set for children
|
||||
document_type: str = "source" # "source" or "extracted"
|
||||
# Document type vocabulary:
|
||||
# "source" - original uploaded document
|
||||
# "page" - page extracted from source (e.g., PDF page)
|
||||
# "chunk" - text chunk derived from page or source
|
||||
# "extracted" - legacy value, kept for backwards compatibility
|
||||
document_type: str = "source"
|
||||
|
||||
@dataclass
|
||||
class ProcessingMetadata:
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue