trustgraph/trustgraph-base/trustgraph/provenance/vocabulary.py
cybermaggedon cd5580be59
Extract-time provenance (#661)
1. Shared Provenance Module - URI generators, namespace constants,
   triple builders, vocabulary bootstrap
2. Librarian - Emits document metadata to graph on processing
   initiation (vocabulary bootstrap + PROV-O triples)
3. PDF Extractor - Saves pages as child documents, emits parent-child
   provenance edges, forwards page IDs
4. Chunker - Saves chunks as child documents, emits provenance edges,
   forwards chunk ID + content
5. Knowledge Extractors (both definitions and relationships):
   - Link entities to chunks via SUBJECT_OF (not top-level document)
   - Removed duplicate metadata emission (now handled by librarian)
   - Get chunk_doc_id and chunk_uri from incoming Chunk message
6. Embedding Provenance:
   - EntityContext schema has chunk_id field
   - EntityEmbeddings schema has chunk_id field
   - Definitions extractor sets chunk_id when creating EntityContext
   - Graph embeddings processor passes chunk_id through to
     EntityEmbeddings

Provenance Flow:
Document → Page (PDF) → Chunk → Extracted Facts/Embeddings
    ↓           ↓          ↓              ↓
  librarian  librarian  librarian    (chunk_id reference)
  + graph    + graph    + graph

Each artifact is stored in librarian with parent-child linking, and PROV-O
edges are emitted to the knowledge graph for full traceability from any
extracted fact back to its source document.

Also updates the tests.
2026-03-05 18:36:10 +00:00

101 lines
3.4 KiB
Python

"""
Vocabulary bootstrap for provenance.
The knowledge graph is ontology-neutral and initializes empty. When writing
PROV-O provenance data to a collection for the first time, the vocabulary
must be bootstrapped with RDF labels for all classes and predicates.
"""
from typing import List
from .. schema import Triple, Term, IRI, LITERAL
from . namespaces import (
RDFS_LABEL,
PROV_ENTITY, PROV_ACTIVITY, PROV_AGENT,
PROV_WAS_DERIVED_FROM, PROV_WAS_GENERATED_BY,
PROV_USED, PROV_WAS_ASSOCIATED_WITH, PROV_STARTED_AT_TIME,
DC_TITLE, DC_SOURCE, DC_DATE, DC_CREATOR,
TG_REIFIES, TG_PAGE_COUNT, TG_MIME_TYPE, TG_PAGE_NUMBER,
TG_CHUNK_INDEX, TG_CHAR_OFFSET, TG_CHAR_LENGTH,
TG_CHUNK_SIZE, TG_CHUNK_OVERLAP, TG_COMPONENT_VERSION,
TG_LLM_MODEL, TG_ONTOLOGY, TG_EMBEDDING_MODEL,
TG_SOURCE_TEXT, TG_SOURCE_CHAR_OFFSET, TG_SOURCE_CHAR_LENGTH,
)
def _label_triple(uri: str, label: str) -> Triple:
    """
    Build the triple that attaches a human-readable label to a URI.

    Args:
        uri: IRI of the class or predicate being labelled (subject).
        label: Label text, stored as a literal (object).

    Returns:
        A Triple whose predicate is RDFS_LABEL.
    """
    subject = Term(type=IRI, iri=uri)
    predicate = Term(type=IRI, iri=RDFS_LABEL)
    literal = Term(type=LITERAL, value=label)
    return Triple(s=subject, p=predicate, o=literal)
# Label triples for the PROV-O classes, one per (class URI, label) pair.
PROV_CLASS_LABELS = [
    _label_triple(class_uri, text)
    for class_uri, text in (
        (PROV_ENTITY, "Entity"),
        (PROV_ACTIVITY, "Activity"),
        (PROV_AGENT, "Agent"),
    )
]
# Label triples for the PROV-O predicates, one per (predicate URI, label) pair.
PROV_PREDICATE_LABELS = [
    _label_triple(predicate_uri, text)
    for predicate_uri, text in (
        (PROV_WAS_DERIVED_FROM, "was derived from"),
        (PROV_WAS_GENERATED_BY, "was generated by"),
        (PROV_USED, "used"),
        (PROV_WAS_ASSOCIATED_WITH, "was associated with"),
        (PROV_STARTED_AT_TIME, "started at"),
    )
]
# Label triples for the Dublin Core predicates, one per (URI, label) pair.
DC_PREDICATE_LABELS = [
    _label_triple(predicate_uri, text)
    for predicate_uri, text in (
        (DC_TITLE, "title"),
        (DC_SOURCE, "source"),
        (DC_DATE, "date"),
        (DC_CREATOR, "creator"),
    )
]
# Label triples for the TrustGraph (TG_*) predicates, one per
# (URI, label) pair. Order matches the order the triples are emitted in.
TG_PREDICATE_LABELS = [
    _label_triple(predicate_uri, text)
    for predicate_uri, text in (
        (TG_REIFIES, "reifies"),
        (TG_PAGE_COUNT, "page count"),
        (TG_MIME_TYPE, "MIME type"),
        (TG_PAGE_NUMBER, "page number"),
        (TG_CHUNK_INDEX, "chunk index"),
        (TG_CHAR_OFFSET, "character offset"),
        (TG_CHAR_LENGTH, "character length"),
        (TG_CHUNK_SIZE, "chunk size"),
        (TG_CHUNK_OVERLAP, "chunk overlap"),
        (TG_COMPONENT_VERSION, "component version"),
        (TG_LLM_MODEL, "LLM model"),
        (TG_ONTOLOGY, "ontology"),
        (TG_EMBEDDING_MODEL, "embedding model"),
        (TG_SOURCE_TEXT, "source text"),
        (TG_SOURCE_CHAR_OFFSET, "source character offset"),
        (TG_SOURCE_CHAR_LENGTH, "source character length"),
    )
]
def get_vocabulary_triples() -> List[Triple]:
    """
    Collect every vocabulary bootstrap triple into a single list.

    The result holds the label triples for the PROV-O classes, the PROV-O
    predicates, the Dublin Core predicates and the TrustGraph predicates
    used in extraction-time provenance, in that order.

    Emit these to the knowledge graph once per collection, before any
    provenance data is written. The operation is idempotent - re-emitting
    the same triples is harmless.

    Returns:
        A newly built list of Triple objects defining vocabulary labels.
        The list itself is fresh on every call; the Triple objects in it
        are the ones held by the module-level label lists.
    """
    label_groups = (
        PROV_CLASS_LABELS,
        PROV_PREDICATE_LABELS,
        DC_PREDICATE_LABELS,
        TG_PREDICATE_LABELS,
    )

    # Build a new list so callers never get a reference to a module-level one.
    vocabulary: List[Triple] = []
    for group in label_groups:
        vocabulary.extend(group)
    return vocabulary