Extract-time provenance (#661)

1. Shared Provenance Module - URI generators, namespace constants,
   triple builders, vocabulary bootstrap
2. Librarian - Emits document metadata to graph on processing
   initiation (vocabulary bootstrap + PROV-O triples)
3. PDF Extractor - Saves pages as child documents, emits parent-child
   provenance edges, forwards page IDs
4. Chunker - Saves chunks as child documents, emits provenance edges,
   forwards chunk ID + content
5. Knowledge Extractors (both definitions and relationships):
   - Link entities to chunks via SUBJECT_OF (not top-level document)
   - Removed duplicate metadata emission (now handled by librarian)
   - Get chunk_doc_id and chunk_uri from incoming Chunk message
6. Embedding Provenance:
   - EntityContext schema has chunk_id field
   - EntityEmbeddings schema has chunk_id field
   - Definitions extractor sets chunk_id when creating EntityContext
   - Graph embeddings processor passes chunk_id through to
     EntityEmbeddings

Provenance Flow:
Document → Page (PDF) → Chunk → Extracted Facts/Embeddings
    ↓           ↓          ↓              ↓
  librarian  librarian  librarian    (chunk_id reference)
  + graph    + graph    + graph

Each artifact is stored in librarian with parent-child linking, and PROV-O
edges are emitted to the knowledge graph for full traceability from any
extracted fact back to its source document.

Also, updating tests
This commit is contained in:
cybermaggedon 2026-03-05 18:36:10 +00:00 committed by GitHub
parent d8f0a576af
commit cd5580be59
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
20 changed files with 1601 additions and 59 deletions

View file

@ -0,0 +1,251 @@
"""
Helper functions to build PROV-O triples for extraction-time provenance.
"""
from datetime import datetime
from typing import List, Optional
from .. schema import Triple, Term, IRI, LITERAL
from . namespaces import (
RDF_TYPE, RDFS_LABEL,
PROV_ENTITY, PROV_ACTIVITY, PROV_AGENT,
PROV_WAS_DERIVED_FROM, PROV_WAS_GENERATED_BY,
PROV_USED, PROV_WAS_ASSOCIATED_WITH, PROV_STARTED_AT_TIME,
DC_TITLE, DC_SOURCE, DC_DATE, DC_CREATOR,
TG_PAGE_COUNT, TG_MIME_TYPE, TG_PAGE_NUMBER,
TG_CHUNK_INDEX, TG_CHAR_OFFSET, TG_CHAR_LENGTH,
TG_CHUNK_SIZE, TG_CHUNK_OVERLAP, TG_COMPONENT_VERSION,
TG_LLM_MODEL, TG_ONTOLOGY, TG_REIFIES,
)
from . uris import activity_uri, agent_uri
def _iri(uri: str) -> Term:
"""Create an IRI term."""
return Term(type=IRI, iri=uri)
def _literal(value) -> Term:
"""Create a literal term."""
return Term(type=LITERAL, value=str(value))
def _triple(s: str, p: str, o_term: Term) -> Triple:
"""Create a triple with IRI subject and predicate."""
return Triple(s=_iri(s), p=_iri(p), o=o_term)
def document_triples(
doc_uri: str,
title: Optional[str] = None,
source: Optional[str] = None,
date: Optional[str] = None,
creator: Optional[str] = None,
page_count: Optional[int] = None,
mime_type: Optional[str] = None,
) -> List[Triple]:
"""
Build triples for a source document entity.
Args:
doc_uri: The document URI (from uris.document_uri)
title: Document title
source: Source URL/path
date: Document date
creator: Author/creator
page_count: Number of pages (for PDFs)
mime_type: MIME type
Returns:
List of Triple objects
"""
triples = [
_triple(doc_uri, RDF_TYPE, _iri(PROV_ENTITY)),
]
if title:
triples.append(_triple(doc_uri, DC_TITLE, _literal(title)))
triples.append(_triple(doc_uri, RDFS_LABEL, _literal(title)))
if source:
triples.append(_triple(doc_uri, DC_SOURCE, _iri(source)))
if date:
triples.append(_triple(doc_uri, DC_DATE, _literal(date)))
if creator:
triples.append(_triple(doc_uri, DC_CREATOR, _literal(creator)))
if page_count is not None:
triples.append(_triple(doc_uri, TG_PAGE_COUNT, _literal(page_count)))
if mime_type:
triples.append(_triple(doc_uri, TG_MIME_TYPE, _literal(mime_type)))
return triples
def derived_entity_triples(
entity_uri: str,
parent_uri: str,
component_name: str,
component_version: str,
label: Optional[str] = None,
page_number: Optional[int] = None,
chunk_index: Optional[int] = None,
char_offset: Optional[int] = None,
char_length: Optional[int] = None,
chunk_size: Optional[int] = None,
chunk_overlap: Optional[int] = None,
timestamp: Optional[str] = None,
) -> List[Triple]:
"""
Build triples for a derived entity (page or chunk) with full PROV-O provenance.
Creates:
- Entity declaration
- wasDerivedFrom relationship to parent
- Activity for the extraction
- Agent for the component
Args:
entity_uri: URI of the derived entity (page or chunk)
parent_uri: URI of the parent entity
component_name: Name of TG component (e.g., "pdf-extractor", "chunker")
component_version: Version of the component
label: Human-readable label
page_number: Page number (for pages)
chunk_index: Chunk index (for chunks)
char_offset: Character offset in parent (for chunks)
char_length: Character length (for chunks)
chunk_size: Configured chunk size (for chunking activity)
chunk_overlap: Configured chunk overlap (for chunking activity)
timestamp: ISO timestamp (defaults to now)
Returns:
List of Triple objects
"""
if timestamp is None:
timestamp = datetime.utcnow().isoformat() + "Z"
act_uri = activity_uri()
agt_uri = agent_uri(component_name)
triples = [
# Entity declaration
_triple(entity_uri, RDF_TYPE, _iri(PROV_ENTITY)),
# Derivation from parent
_triple(entity_uri, PROV_WAS_DERIVED_FROM, _iri(parent_uri)),
# Generation by activity
_triple(entity_uri, PROV_WAS_GENERATED_BY, _iri(act_uri)),
# Activity declaration
_triple(act_uri, RDF_TYPE, _iri(PROV_ACTIVITY)),
_triple(act_uri, PROV_USED, _iri(parent_uri)),
_triple(act_uri, PROV_WAS_ASSOCIATED_WITH, _iri(agt_uri)),
_triple(act_uri, PROV_STARTED_AT_TIME, _literal(timestamp)),
_triple(act_uri, TG_COMPONENT_VERSION, _literal(component_version)),
# Agent declaration
_triple(agt_uri, RDF_TYPE, _iri(PROV_AGENT)),
_triple(agt_uri, RDFS_LABEL, _literal(component_name)),
]
if label:
triples.append(_triple(entity_uri, RDFS_LABEL, _literal(label)))
if page_number is not None:
triples.append(_triple(entity_uri, TG_PAGE_NUMBER, _literal(page_number)))
if chunk_index is not None:
triples.append(_triple(entity_uri, TG_CHUNK_INDEX, _literal(chunk_index)))
if char_offset is not None:
triples.append(_triple(entity_uri, TG_CHAR_OFFSET, _literal(char_offset)))
if char_length is not None:
triples.append(_triple(entity_uri, TG_CHAR_LENGTH, _literal(char_length)))
if chunk_size is not None:
triples.append(_triple(act_uri, TG_CHUNK_SIZE, _literal(chunk_size)))
if chunk_overlap is not None:
triples.append(_triple(act_uri, TG_CHUNK_OVERLAP, _literal(chunk_overlap)))
return triples
def triple_provenance_triples(
stmt_uri: str,
subject_uri: str,
predicate_uri: str,
object_term: Term,
chunk_uri: str,
component_name: str,
component_version: str,
llm_model: Optional[str] = None,
ontology_uri: Optional[str] = None,
timestamp: Optional[str] = None,
) -> List[Triple]:
"""
Build provenance triples for an extracted knowledge triple using reification.
Creates:
- Statement object that reifies the triple
- wasDerivedFrom link to source chunk
- Activity and agent metadata
Args:
stmt_uri: URI for the reified statement
subject_uri: Subject of the extracted triple
predicate_uri: Predicate of the extracted triple
object_term: Object of the extracted triple (Term)
chunk_uri: URI of source chunk
component_name: Name of extractor component
component_version: Version of the component
llm_model: LLM model used for extraction
ontology_uri: Ontology URI used for extraction
timestamp: ISO timestamp
Returns:
List of Triple objects for the provenance (not the triple itself)
"""
if timestamp is None:
timestamp = datetime.utcnow().isoformat() + "Z"
act_uri = activity_uri()
agt_uri = agent_uri(component_name)
# Note: The actual reification (tg:reifies pointing at the edge) requires
# RDF 1.2 triple term support. This builds the surrounding provenance.
# The actual reification link must be handled by the knowledge extractor
# using the graph store's reification API.
triples = [
# Statement provenance
_triple(stmt_uri, PROV_WAS_DERIVED_FROM, _iri(chunk_uri)),
_triple(stmt_uri, PROV_WAS_GENERATED_BY, _iri(act_uri)),
# Activity
_triple(act_uri, RDF_TYPE, _iri(PROV_ACTIVITY)),
_triple(act_uri, PROV_USED, _iri(chunk_uri)),
_triple(act_uri, PROV_WAS_ASSOCIATED_WITH, _iri(agt_uri)),
_triple(act_uri, PROV_STARTED_AT_TIME, _literal(timestamp)),
_triple(act_uri, TG_COMPONENT_VERSION, _literal(component_version)),
# Agent
_triple(agt_uri, RDF_TYPE, _iri(PROV_AGENT)),
_triple(agt_uri, RDFS_LABEL, _literal(component_name)),
]
if llm_model:
triples.append(_triple(act_uri, TG_LLM_MODEL, _literal(llm_model)))
if ontology_uri:
triples.append(_triple(act_uri, TG_ONTOLOGY, _iri(ontology_uri)))
return triples