mirror of
https://github.com/trustgraph-ai/trustgraph.git
synced 2026-04-25 08:26:21 +02:00
1. Shared Provenance Module - URI generators, namespace constants,
triple builders, vocabulary bootstrap
2. Librarian - Emits document metadata to graph on processing
initiation (vocabulary bootstrap + PROV-O triples)
3. PDF Extractor - Saves pages as child documents, emits parent-child
provenance edges, forwards page IDs
4. Chunker - Saves chunks as child documents, emits provenance edges,
forwards chunk ID + content
5. Knowledge Extractors (both definitions and relationships):
- Link entities to chunks via SUBJECT_OF (not top-level document)
- Removed duplicate metadata emission (now handled by librarian)
- Get chunk_doc_id and chunk_uri from incoming Chunk message
6. Embedding Provenance:
- EntityContext schema has chunk_id field
- EntityEmbeddings schema has chunk_id field
- Definitions extractor sets chunk_id when creating EntityContext
- Graph embeddings processor passes chunk_id through to
EntityEmbeddings
Provenance Flow:
Document → Page (PDF) → Chunk → Extracted Facts/Embeddings
    ↓           ↓          ↓                ↓
librarian   librarian  librarian   (chunk_id reference)
 + graph     + graph    + graph
Each artifact is stored in librarian with parent-child linking, and PROV-O
edges are emitted to the knowledge graph for full traceability from any
extracted fact back to its source document.
The tests are also updated to match.
101 lines
3.4 KiB
Python
101 lines
3.4 KiB
Python
"""
Vocabulary bootstrap for provenance.

The knowledge graph is ontology-neutral and initializes empty. When writing
PROV-O provenance data to a collection for the first time, the vocabulary
must be bootstrapped with RDF labels for all classes and predicates.
"""
|
|
|
|
from typing import List
|
|
|
|
from .. schema import Triple, Term, IRI, LITERAL
|
|
|
|
from . namespaces import (
|
|
RDFS_LABEL,
|
|
PROV_ENTITY, PROV_ACTIVITY, PROV_AGENT,
|
|
PROV_WAS_DERIVED_FROM, PROV_WAS_GENERATED_BY,
|
|
PROV_USED, PROV_WAS_ASSOCIATED_WITH, PROV_STARTED_AT_TIME,
|
|
DC_TITLE, DC_SOURCE, DC_DATE, DC_CREATOR,
|
|
TG_REIFIES, TG_PAGE_COUNT, TG_MIME_TYPE, TG_PAGE_NUMBER,
|
|
TG_CHUNK_INDEX, TG_CHAR_OFFSET, TG_CHAR_LENGTH,
|
|
TG_CHUNK_SIZE, TG_CHUNK_OVERLAP, TG_COMPONENT_VERSION,
|
|
TG_LLM_MODEL, TG_ONTOLOGY, TG_EMBEDDING_MODEL,
|
|
TG_SOURCE_TEXT, TG_SOURCE_CHAR_OFFSET, TG_SOURCE_CHAR_LENGTH,
|
|
)
|
|
|
|
|
|
def _label_triple(uri: str, label: str) -> Triple:
    """Build the triple that attaches a text label to ``uri``.

    Args:
        uri: IRI of the resource being labelled (used as the subject).
        label: Text stored as the literal object of the triple.

    Returns:
        A Triple whose subject is an IRI term for ``uri``, whose predicate
        is an IRI term for ``RDFS_LABEL``, and whose object is a literal
        term holding ``label``.
    """
    subject = Term(type=IRI, iri=uri)
    predicate = Term(type=IRI, iri=RDFS_LABEL)
    literal = Term(type=LITERAL, value=label)
    return Triple(s=subject, p=predicate, o=literal)
|
|
|
|
|
|
# Label triples for the PROV-O classes, one per (class URI, label) pair.
PROV_CLASS_LABELS = [
    _label_triple(class_uri, text)
    for class_uri, text in (
        (PROV_ENTITY, "Entity"),
        (PROV_ACTIVITY, "Activity"),
        (PROV_AGENT, "Agent"),
    )
]
|
|
|
|
# Label triples for the PROV-O predicates, one per (predicate URI, label) pair.
PROV_PREDICATE_LABELS = [
    _label_triple(predicate_uri, text)
    for predicate_uri, text in (
        (PROV_WAS_DERIVED_FROM, "was derived from"),
        (PROV_WAS_GENERATED_BY, "was generated by"),
        (PROV_USED, "used"),
        (PROV_WAS_ASSOCIATED_WITH, "was associated with"),
        (PROV_STARTED_AT_TIME, "started at"),
    )
]
|
|
|
|
# Label triples for the Dublin Core predicates, one per (URI, label) pair.
DC_PREDICATE_LABELS = [
    _label_triple(predicate_uri, text)
    for predicate_uri, text in (
        (DC_TITLE, "title"),
        (DC_SOURCE, "source"),
        (DC_DATE, "date"),
        (DC_CREATOR, "creator"),
    )
]
|
|
|
|
# Label triples for the TrustGraph predicates, one per (URI, label) pair.
TG_PREDICATE_LABELS = [
    _label_triple(predicate_uri, text)
    for predicate_uri, text in (
        (TG_REIFIES, "reifies"),
        (TG_PAGE_COUNT, "page count"),
        (TG_MIME_TYPE, "MIME type"),
        (TG_PAGE_NUMBER, "page number"),
        (TG_CHUNK_INDEX, "chunk index"),
        (TG_CHAR_OFFSET, "character offset"),
        (TG_CHAR_LENGTH, "character length"),
        (TG_CHUNK_SIZE, "chunk size"),
        (TG_CHUNK_OVERLAP, "chunk overlap"),
        (TG_COMPONENT_VERSION, "component version"),
        (TG_LLM_MODEL, "LLM model"),
        (TG_ONTOLOGY, "ontology"),
        (TG_EMBEDDING_MODEL, "embedding model"),
        (TG_SOURCE_TEXT, "source text"),
        (TG_SOURCE_CHAR_OFFSET, "source character offset"),
        (TG_SOURCE_CHAR_LENGTH, "source character length"),
    )
]
|
|
|
|
|
|
def get_vocabulary_triples() -> List[Triple]:
    """
    Collect every vocabulary bootstrap triple into one list.

    The result is the concatenation, in this order, of the label triples
    for PROV-O classes, PROV-O predicates, Dublin Core predicates, and
    TrustGraph predicates used in extraction-time provenance.

    The triples are intended to be emitted to the knowledge graph once per
    collection, before any provenance data is written. Emitting them again
    is meant to be harmless, so callers may treat the operation as
    idempotent.

    Returns:
        A new list of Triple objects defining vocabulary labels
    """
    label_groups = (
        PROV_CLASS_LABELS,
        PROV_PREDICATE_LABELS,
        DC_PREDICATE_LABELS,
        TG_PREDICATE_LABELS,
    )

    # Build a fresh list on each call so the module-level groups are never
    # handed out (and mutated) directly.
    vocabulary: List[Triple] = []
    for group in label_groups:
        vocabulary.extend(group)
    return vocabulary
|