trustgraph/trustgraph-base/trustgraph/provenance/vocabulary.py
cybermaggedon 01cc8dbc64
feat: replace LLM edge scoring with cross-encoder reranker in GraphRAG (#1005)
Replace the three-prompt LLM scoring pipeline (kg-edge-scoring,
kg-edge-reasoning, kg-edge-selection) with a cross-encoder reranker
service backed by FlashRank. The new hop_and_filter() method performs
iterative graph traversal with semantic scoring at each hop, replacing
the previous follow_edges/get_subgraph approach.

- Add reranker service (trustgraph-base client/service, FlashRank processor)
- Add gateway dispatch for reranker via API and WebSocket
- Rewrite GraphRAG pipeline: hop_and_filter() with per-hop cross-encoder scoring
- Remove kg_prompt() and edge_score_limit from prompt client
- Update provenance: add tg:EdgeSelection type, tg:concept, tg:score predicates
- Update CLIs (tg-invoke-graph-rag, tg-show-explain-trace) for new metadata
- Add tg-invoke-reranker CLI tool
- Add tech spec and UX developer guidance
- Update all unit and integration tests
2026-06-30 14:36:37 +01:00

149 lines
5.1 KiB
Python

"""
Vocabulary bootstrap for provenance.
The knowledge graph is ontology-neutral and initializes empty. When writing
PROV-O provenance data to a collection for the first time, the vocabulary
must be bootstrapped with RDF labels for all classes and predicates.
"""
from typing import List
from .. schema import Triple, Term, IRI, LITERAL
from . namespaces import (
RDFS_LABEL,
PROV_ENTITY, PROV_ACTIVITY, PROV_AGENT,
PROV_WAS_DERIVED_FROM, PROV_WAS_GENERATED_BY,
PROV_USED, PROV_WAS_ASSOCIATED_WITH, PROV_STARTED_AT_TIME,
DC_TITLE, DC_SOURCE, DC_DATE, DC_CREATOR,
SCHEMA_DIGITAL_DOCUMENT, SCHEMA_DESCRIPTION,
SCHEMA_KEYWORDS, SCHEMA_NAME,
SKOS_DEFINITION,
TG_CONTAINS, TG_PAGE_COUNT, TG_MIME_TYPE, TG_PAGE_NUMBER,
TG_CHUNK_INDEX, TG_CHAR_OFFSET, TG_CHAR_LENGTH,
TG_CHUNK_SIZE, TG_CHUNK_OVERLAP, TG_COMPONENT_VERSION,
TG_LLM_MODEL, TG_ONTOLOGY, TG_EMBEDDING_MODEL,
TG_SOURCE_TEXT, TG_SOURCE_CHAR_OFFSET, TG_SOURCE_CHAR_LENGTH,
TG_DOCUMENT_TYPE, TG_PAGE_TYPE, TG_CHUNK_TYPE, TG_SUBGRAPH_TYPE,
TG_CONCEPT, TG_ENTITY, TG_GROUNDING,
TG_ANSWER_TYPE, TG_REFLECTION_TYPE, TG_THOUGHT_TYPE, TG_OBSERVATION_TYPE,
TG_DECOMPOSITION, TG_FINDING, TG_PLAN_TYPE, TG_STEP_RESULT,
TG_SUBAGENT_GOAL, TG_PLAN_STEP,
TG_EDGE_SELECTION, TG_SCORE,
)
def _label_triple(uri: str, label: str) -> Triple:
    """
    Build the label statement for a single vocabulary term.

    Args:
        uri: IRI of the class or predicate being labelled.
        label: Human-readable text to attach to that IRI.

    Returns:
        A Triple whose subject is the IRI term for ``uri``, whose
        predicate is the IRI term for RDFS_LABEL, and whose object is a
        literal term carrying ``label``.
    """
    subject = Term(type=IRI, iri=uri)
    predicate = Term(type=IRI, iri=RDFS_LABEL)
    text = Term(type=LITERAL, value=label)
    return Triple(s=subject, p=predicate, o=text)
# Labels for the PROV-O classes (Entity, Activity, Agent).
# Each (IRI, text) pair below becomes one label triple, in order.
PROV_CLASS_LABELS = [
    _label_triple(class_iri, text)
    for class_iri, text in (
        (PROV_ENTITY, "Entity"),
        (PROV_ACTIVITY, "Activity"),
        (PROV_AGENT, "Agent"),
    )
]
# Labels for the PROV-O predicates used by this module.
# Each (IRI, text) pair below becomes one label triple, in order.
PROV_PREDICATE_LABELS = [
    _label_triple(predicate_iri, text)
    for predicate_iri, text in (
        (PROV_WAS_DERIVED_FROM, "was derived from"),
        (PROV_WAS_GENERATED_BY, "was generated by"),
        (PROV_USED, "used"),
        (PROV_WAS_ASSOCIATED_WITH, "was associated with"),
        (PROV_STARTED_AT_TIME, "started at"),
    )
]
# Labels for the Dublin Core predicates (title, source, date, creator).
# Each (IRI, text) pair below becomes one label triple, in order.
DC_PREDICATE_LABELS = [
    _label_triple(predicate_iri, text)
    for predicate_iri, text in (
        (DC_TITLE, "title"),
        (DC_SOURCE, "source"),
        (DC_DATE, "date"),
        (DC_CREATOR, "creator"),
    )
]
# Labels for the Schema.org terms: one class-style entry
# ("Digital Document") followed by three property-style entries.
SCHEMA_LABELS = [
    _label_triple(term_iri, text)
    for term_iri, text in (
        (SCHEMA_DIGITAL_DOCUMENT, "Digital Document"),
        (SCHEMA_DESCRIPTION, "description"),
        (SCHEMA_KEYWORDS, "keywords"),
        (SCHEMA_NAME, "name"),
    )
]
# Label for the single SKOS term used here (definition).
SKOS_LABELS = [
    _label_triple(uri=SKOS_DEFINITION, label="definition"),
]
# Labels for TrustGraph-specific classes: the document/page/chunk/subgraph
# types plus grounding, answer, reflection, thought, observation,
# decomposition, finding, plan, step-result and edge-selection types.
# Each (IRI, text) pair below becomes one label triple, in order.
TG_CLASS_LABELS = [
    _label_triple(class_iri, text)
    for class_iri, text in (
        (TG_DOCUMENT_TYPE, "Document"),
        (TG_PAGE_TYPE, "Page"),
        (TG_CHUNK_TYPE, "Chunk"),
        (TG_SUBGRAPH_TYPE, "Subgraph"),
        (TG_GROUNDING, "Grounding"),
        (TG_ANSWER_TYPE, "Answer"),
        (TG_REFLECTION_TYPE, "Reflection"),
        (TG_THOUGHT_TYPE, "Thought"),
        (TG_OBSERVATION_TYPE, "Observation"),
        (TG_DECOMPOSITION, "Decomposition"),
        (TG_FINDING, "Finding"),
        (TG_PLAN_TYPE, "Plan"),
        (TG_STEP_RESULT, "Step Result"),
        (TG_EDGE_SELECTION, "Edge Selection"),
    )
]
# Labels for TrustGraph-specific predicates.
# Every entry is an (IRI, text) pair unpacked straight into _label_triple,
# so the resulting list keeps the order written here.
TG_PREDICATE_LABELS = [
    _label_triple(*entry)
    for entry in (
        (TG_CONTAINS, "contains"),
        (TG_PAGE_COUNT, "page count"),
        (TG_MIME_TYPE, "MIME type"),
        (TG_PAGE_NUMBER, "page number"),
        (TG_CHUNK_INDEX, "chunk index"),
        (TG_CHAR_OFFSET, "character offset"),
        (TG_CHAR_LENGTH, "character length"),
        (TG_CHUNK_SIZE, "chunk size"),
        (TG_CHUNK_OVERLAP, "chunk overlap"),
        (TG_COMPONENT_VERSION, "component version"),
        (TG_LLM_MODEL, "LLM model"),
        (TG_ONTOLOGY, "ontology"),
        (TG_EMBEDDING_MODEL, "embedding model"),
        (TG_SOURCE_TEXT, "source text"),
        (TG_SOURCE_CHAR_OFFSET, "source character offset"),
        (TG_SOURCE_CHAR_LENGTH, "source character length"),
        (TG_CONCEPT, "concept"),
        (TG_ENTITY, "entity"),
        (TG_SUBAGENT_GOAL, "subagent goal"),
        (TG_PLAN_STEP, "plan step"),
        (TG_SCORE, "score"),
    )
]
def get_vocabulary_triples() -> List[Triple]:
    """
    Collect every vocabulary bootstrap triple into one list.

    The result is the concatenation, in this order, of the label triples
    for: PROV-O classes, PROV-O predicates, Dublin Core predicates,
    Schema.org terms, SKOS terms, TrustGraph classes and TrustGraph
    predicates.

    This function only assembles the list; it performs no writes. The
    module is designed for these triples to be emitted to the knowledge
    graph once per collection, before any provenance data is written.
    Re-emitting the same label triples is expected to be harmless
    (a property of the target store, not something enforced here).

    Returns:
        A newly built list of Triple objects defining vocabulary labels.
        The list itself is fresh on every call; the Triple objects in it
        are the shared module-level instances.
    """
    label_groups = (
        PROV_CLASS_LABELS,
        PROV_PREDICATE_LABELS,
        DC_PREDICATE_LABELS,
        SCHEMA_LABELS,
        SKOS_LABELS,
        TG_CLASS_LABELS,
        TG_PREDICATE_LABELS,
    )

    # Accumulate into a new list so the module-level groups stay untouched.
    vocabulary: List[Triple] = []
    for group in label_groups:
        vocabulary.extend(group)
    return vocabulary