mirror of
https://github.com/trustgraph-ai/trustgraph.git
synced 2026-07-02 22:41:01 +02:00
Replace the three-prompt LLM scoring pipeline (kg-edge-scoring, kg-edge-reasoning, kg-edge-selection) with a cross-encoder reranker service backed by FlashRank. The new hop_and_filter() method performs iterative graph traversal with semantic scoring at each hop, replacing the previous follow_edges/get_subgraph approach.

- Add reranker service (trustgraph-base client/service, FlashRank processor)
- Add gateway dispatch for reranker via API and WebSocket
- Rewrite GraphRAG pipeline: hop_and_filter() with per-hop cross-encoder scoring
- Remove kg_prompt() and edge_score_limit from prompt client
- Update provenance: add tg:EdgeSelection type, tg:concept, tg:score predicates
- Update CLIs (tg-invoke-graph-rag, tg-show-explain-trace) for new metadata
- Add tg-invoke-reranker CLI tool
- Add tech spec and UX developer guidance
- Update all unit and integration tests
149 lines
5.1 KiB
Python
149 lines
5.1 KiB
Python
"""
|
|
Vocabulary bootstrap for provenance.
|
|
|
|
The knowledge graph is ontology-neutral and initializes empty. When writing
|
|
PROV-O provenance data to a collection for the first time, the vocabulary
|
|
must be bootstrapped with RDF labels for all classes and predicates.
|
|
"""
|
|
|
|
from typing import List
|
|
|
|
from .. schema import Triple, Term, IRI, LITERAL
|
|
|
|
from . namespaces import (
|
|
RDFS_LABEL,
|
|
PROV_ENTITY, PROV_ACTIVITY, PROV_AGENT,
|
|
PROV_WAS_DERIVED_FROM, PROV_WAS_GENERATED_BY,
|
|
PROV_USED, PROV_WAS_ASSOCIATED_WITH, PROV_STARTED_AT_TIME,
|
|
DC_TITLE, DC_SOURCE, DC_DATE, DC_CREATOR,
|
|
SCHEMA_DIGITAL_DOCUMENT, SCHEMA_DESCRIPTION,
|
|
SCHEMA_KEYWORDS, SCHEMA_NAME,
|
|
SKOS_DEFINITION,
|
|
TG_CONTAINS, TG_PAGE_COUNT, TG_MIME_TYPE, TG_PAGE_NUMBER,
|
|
TG_CHUNK_INDEX, TG_CHAR_OFFSET, TG_CHAR_LENGTH,
|
|
TG_CHUNK_SIZE, TG_CHUNK_OVERLAP, TG_COMPONENT_VERSION,
|
|
TG_LLM_MODEL, TG_ONTOLOGY, TG_EMBEDDING_MODEL,
|
|
TG_SOURCE_TEXT, TG_SOURCE_CHAR_OFFSET, TG_SOURCE_CHAR_LENGTH,
|
|
TG_DOCUMENT_TYPE, TG_PAGE_TYPE, TG_CHUNK_TYPE, TG_SUBGRAPH_TYPE,
|
|
TG_CONCEPT, TG_ENTITY, TG_GROUNDING,
|
|
TG_ANSWER_TYPE, TG_REFLECTION_TYPE, TG_THOUGHT_TYPE, TG_OBSERVATION_TYPE,
|
|
TG_DECOMPOSITION, TG_FINDING, TG_PLAN_TYPE, TG_STEP_RESULT,
|
|
TG_SUBAGENT_GOAL, TG_PLAN_STEP,
|
|
TG_EDGE_SELECTION, TG_SCORE,
|
|
)
|
|
|
|
|
|
def _label_triple(uri: str, label: str) -> Triple:
    """Build the label triple for a vocabulary URI.

    Args:
        uri: IRI of the class or predicate being labelled.
        label: Human-readable label text.

    Returns:
        A Triple whose subject is the IRI term for ``uri``, whose predicate
        is the IRI term for ``RDFS_LABEL``, and whose object is a literal
        term carrying ``label``.
    """
    subject = Term(type=IRI, iri=uri)
    predicate = Term(type=IRI, iri=RDFS_LABEL)
    literal = Term(type=LITERAL, value=label)
    return Triple(s=subject, p=predicate, o=literal)
|
|
|
|
|
|
# Label triples for the PROV-O classes, one (URI, label) pair per class.
PROV_CLASS_LABELS = [
    _label_triple(class_uri, text)
    for class_uri, text in (
        (PROV_ENTITY, "Entity"),
        (PROV_ACTIVITY, "Activity"),
        (PROV_AGENT, "Agent"),
    )
]
|
|
|
|
# Label triples for the PROV-O predicates, one (URI, label) pair each.
PROV_PREDICATE_LABELS = [
    _label_triple(predicate_uri, text)
    for predicate_uri, text in (
        (PROV_WAS_DERIVED_FROM, "was derived from"),
        (PROV_WAS_GENERATED_BY, "was generated by"),
        (PROV_USED, "used"),
        (PROV_WAS_ASSOCIATED_WITH, "was associated with"),
        (PROV_STARTED_AT_TIME, "started at"),
    )
]
|
|
|
|
# Label triples for the Dublin Core predicates, one (URI, label) pair each.
DC_PREDICATE_LABELS = [
    _label_triple(predicate_uri, text)
    for predicate_uri, text in (
        (DC_TITLE, "title"),
        (DC_SOURCE, "source"),
        (DC_DATE, "date"),
        (DC_CREATOR, "creator"),
    )
]
|
|
|
|
# Label triples for the Schema.org terms, one (URI, label) pair each.
SCHEMA_LABELS = [
    _label_triple(term_uri, text)
    for term_uri, text in (
        (SCHEMA_DIGITAL_DOCUMENT, "Digital Document"),
        (SCHEMA_DESCRIPTION, "description"),
        (SCHEMA_KEYWORDS, "keywords"),
        (SCHEMA_NAME, "name"),
    )
]
|
|
|
|
# Label triples for the SKOS terms (currently a single entry).
SKOS_LABELS = [
    _label_triple(term_uri, text)
    for term_uri, text in (
        (SKOS_DEFINITION, "definition"),
    )
]
|
|
|
|
# Label triples for the TrustGraph classes, one (URI, label) pair per class.
TG_CLASS_LABELS = [
    _label_triple(class_uri, text)
    for class_uri, text in (
        (TG_DOCUMENT_TYPE, "Document"),
        (TG_PAGE_TYPE, "Page"),
        (TG_CHUNK_TYPE, "Chunk"),
        (TG_SUBGRAPH_TYPE, "Subgraph"),
        (TG_GROUNDING, "Grounding"),
        (TG_ANSWER_TYPE, "Answer"),
        (TG_REFLECTION_TYPE, "Reflection"),
        (TG_THOUGHT_TYPE, "Thought"),
        (TG_OBSERVATION_TYPE, "Observation"),
        (TG_DECOMPOSITION, "Decomposition"),
        (TG_FINDING, "Finding"),
        (TG_PLAN_TYPE, "Plan"),
        (TG_STEP_RESULT, "Step Result"),
        (TG_EDGE_SELECTION, "Edge Selection"),
    )
]
|
|
|
|
# Label triples for the TrustGraph predicates, one (URI, label) pair each.
TG_PREDICATE_LABELS = [
    _label_triple(predicate_uri, text)
    for predicate_uri, text in (
        (TG_CONTAINS, "contains"),
        (TG_PAGE_COUNT, "page count"),
        (TG_MIME_TYPE, "MIME type"),
        (TG_PAGE_NUMBER, "page number"),
        (TG_CHUNK_INDEX, "chunk index"),
        (TG_CHAR_OFFSET, "character offset"),
        (TG_CHAR_LENGTH, "character length"),
        (TG_CHUNK_SIZE, "chunk size"),
        (TG_CHUNK_OVERLAP, "chunk overlap"),
        (TG_COMPONENT_VERSION, "component version"),
        (TG_LLM_MODEL, "LLM model"),
        (TG_ONTOLOGY, "ontology"),
        (TG_EMBEDDING_MODEL, "embedding model"),
        (TG_SOURCE_TEXT, "source text"),
        (TG_SOURCE_CHAR_OFFSET, "source character offset"),
        (TG_SOURCE_CHAR_LENGTH, "source character length"),
        (TG_CONCEPT, "concept"),
        (TG_ENTITY, "entity"),
        (TG_SUBAGENT_GOAL, "subagent goal"),
        (TG_PLAN_STEP, "plan step"),
        (TG_SCORE, "score"),
    )
]
|
|
|
|
|
|
def get_vocabulary_triples() -> List[Triple]:
    """
    Collect every vocabulary bootstrap triple into a single list.

    The result concatenates, in this order, the label triples for the
    PROV-O classes, PROV-O predicates, Dublin Core predicates, Schema.org
    terms, SKOS terms, TrustGraph classes and TrustGraph predicates.

    These triples are intended to be emitted to the knowledge graph once
    per collection, before any provenance data is written. Re-emitting
    the same triples is expected to be harmless.

    Returns:
        A newly built list of Triple objects defining vocabulary labels;
        the module-level label lists themselves are left untouched.
    """
    # Order matters: it mirrors the order callers have always received.
    label_groups = (
        PROV_CLASS_LABELS,
        PROV_PREDICATE_LABELS,
        DC_PREDICATE_LABELS,
        SCHEMA_LABELS,
        SKOS_LABELS,
        TG_CLASS_LABELS,
        TG_PREDICATE_LABELS,
    )

    triples: List[Triple] = []
    for group in label_groups:
        triples.extend(group)
    return triples
|