mirror of
https://github.com/trustgraph-ai/trustgraph.git
synced 2026-07-01 01:19:38 +02:00
Replace the three-prompt LLM scoring pipeline (kg-edge-scoring, kg-edge-reasoning, kg-edge-selection) with a cross-encoder reranker service backed by FlashRank. The new hop_and_filter() method performs iterative graph traversal with semantic scoring at each hop, replacing the previous follow_edges/get_subgraph approach.

- Add reranker service (trustgraph-base client/service, FlashRank processor)
- Add gateway dispatch for reranker via API and WebSocket
- Rewrite GraphRAG pipeline: hop_and_filter() with per-hop cross-encoder scoring
- Remove kg_prompt() and edge_score_limit from prompt client
- Update provenance: add tg:EdgeSelection type, tg:concept, tg:score predicates
- Update CLIs (tg-invoke-graph-rag, tg-show-explain-trace) for new metadata
- Add tg-invoke-reranker CLI tool
- Add tech spec and UX developer guidance
- Update all unit and integration tests
143 lines
5.6 KiB
Python
143 lines
5.6 KiB
Python
"""
RDF namespace constants for provenance.

Includes PROV-O, Dublin Core, RDF/RDFS, Schema.org, SKOS, and TrustGraph
namespace URIs, plus the named graph URIs used for RDF datasets.
"""

# PROV-O namespace (W3C Provenance Ontology)
# PROV is the namespace prefix; every PROV_* constant below is the full IRI
# obtained by appending a term's local name to that prefix.
PROV = "http://www.w3.org/ns/prov#"
PROV_ENTITY = PROV + "Entity"
PROV_ACTIVITY = PROV + "Activity"
PROV_AGENT = PROV + "Agent"
PROV_WAS_DERIVED_FROM = PROV + "wasDerivedFrom"
PROV_WAS_GENERATED_BY = PROV + "wasGeneratedBy"
PROV_USED = PROV + "used"
PROV_WAS_ASSOCIATED_WITH = PROV + "wasAssociatedWith"
PROV_STARTED_AT_TIME = PROV + "startedAtTime"

# Dublin Core namespace
# DC is the namespace prefix; each DC_* constant is the prefix plus a term's
# local name.
DC = "http://purl.org/dc/elements/1.1/"
DC_TITLE = DC + "title"
DC_SOURCE = DC + "source"
DC_DATE = DC + "date"
DC_CREATOR = DC + "creator"

# RDF/RDFS namespace (also in rdf.py, but included here for completeness)
# RDF and RDFS are namespace prefixes; RDF_TYPE and RDFS_LABEL are full IRIs
# built by appending the local name to the matching prefix.
RDF = "http://www.w3.org/1999/02/22-rdf-syntax-ns#"
RDF_TYPE = RDF + "type"
RDFS = "http://www.w3.org/2000/01/rdf-schema#"
RDFS_LABEL = RDFS + "label"

# Schema.org namespace
# SCHEMA is the namespace prefix; each SCHEMA_* constant is the prefix plus a
# term's local name.
SCHEMA = "https://schema.org/"
SCHEMA_DIGITAL_DOCUMENT = SCHEMA + "DigitalDocument"
SCHEMA_DESCRIPTION = SCHEMA + "description"
SCHEMA_KEYWORDS = SCHEMA + "keywords"
SCHEMA_NAME = SCHEMA + "name"

# SKOS namespace
# SKOS is the namespace prefix; SKOS_DEFINITION is the prefix plus the local
# name "definition".
SKOS = "http://www.w3.org/2004/02/skos/core#"
SKOS_DEFINITION = SKOS + "definition"

# TrustGraph namespace for custom predicates
# TG is the namespace prefix; every TG_* constant below is the prefix plus a
# local name. Local names are case-sensitive, and several pairs differ only
# by case or by a _TYPE suffix on the Python name (TG_DOCUMENT "document" vs
# TG_DOCUMENT_TYPE "Document", TG_THOUGHT "thought" vs TG_THOUGHT_TYPE
# "Thought", TG_OBSERVATION "observation" vs TG_OBSERVATION_TYPE
# "Observation"), so take care to pick the intended one.
TG = "https://trustgraph.ai/ns/"
TG_CONTAINS = TG + "contains"
TG_PAGE_COUNT = TG + "pageCount"
TG_MIME_TYPE = TG + "mimeType"
TG_PAGE_NUMBER = TG + "pageNumber"
TG_CHUNK_INDEX = TG + "chunkIndex"
TG_CHAR_OFFSET = TG + "charOffset"
TG_CHAR_LENGTH = TG + "charLength"
TG_CHUNK_SIZE = TG + "chunkSize"
TG_CHUNK_OVERLAP = TG + "chunkOverlap"
TG_COMPONENT_VERSION = TG + "componentVersion"
TG_LLM_MODEL = TG + "llmModel"
TG_ONTOLOGY = TG + "ontology"
TG_EMBEDDING_MODEL = TG + "embeddingModel"
TG_SOURCE_TEXT = TG + "sourceText"
TG_SOURCE_CHAR_OFFSET = TG + "sourceCharOffset"
TG_SOURCE_CHAR_LENGTH = TG + "sourceCharLength"

# Query-time provenance predicates (GraphRAG)
TG_QUERY = TG + "query"
TG_CONCEPT = TG + "concept"
TG_ENTITY = TG + "entity"
TG_EDGE_COUNT = TG + "edgeCount"
TG_SELECTED_EDGE = TG + "selectedEdge"
TG_EDGE = TG + "edge"
TG_REASONING = TG + "reasoning"
TG_SCORE = TG + "score"
TG_DOCUMENT = TG + "document"  # Reference to document in librarian

# Edge selection entity type (cross-encoder scored edge in Focus)
TG_EDGE_SELECTION = TG + "EdgeSelection"

# Query-time provenance predicates (DocumentRAG)
TG_CHUNK_COUNT = TG + "chunkCount"
TG_SELECTED_CHUNK = TG + "selectedChunk"

# Extraction provenance entity types
TG_DOCUMENT_TYPE = TG + "Document"
TG_PAGE_TYPE = TG + "Page"
TG_SECTION_TYPE = TG + "Section"
TG_CHUNK_TYPE = TG + "Chunk"
TG_IMAGE_TYPE = TG + "Image"
TG_SUBGRAPH_TYPE = TG + "Subgraph"

# Universal decoder metadata predicates
TG_ELEMENT_TYPES = TG + "elementTypes"
TG_TABLE_COUNT = TG + "tableCount"
TG_IMAGE_COUNT = TG + "imageCount"

# Explainability entity types (shared)
TG_QUESTION = TG + "Question"
TG_GROUNDING = TG + "Grounding"
TG_EXPLORATION = TG + "Exploration"
TG_FOCUS = TG + "Focus"
TG_SYNTHESIS = TG + "Synthesis"
TG_ANALYSIS = TG + "Analysis"
TG_CONCLUSION = TG + "Conclusion"

# Orchestrator entity types
TG_DECOMPOSITION = TG + "Decomposition"  # Supervisor decomposed into sub-goals
TG_FINDING = TG + "Finding"  # Subagent result
TG_PLAN_TYPE = TG + "Plan"  # Plan-then-execute plan
TG_STEP_RESULT = TG + "StepResult"  # Plan step result

# Unifying types for answer and intermediate commentary
TG_ANSWER_TYPE = TG + "Answer"  # Final answer (Synthesis, Conclusion, Finding, StepResult)
TG_REFLECTION_TYPE = TG + "Reflection"  # Intermediate commentary (Thought, Observation)
TG_THOUGHT_TYPE = TG + "Thought"  # Agent reasoning
TG_OBSERVATION_TYPE = TG + "Observation"  # Agent tool result
TG_TOOL_USE = TG + "ToolUse"  # Analysis+ToolUse mixin

# Question subtypes (to distinguish retrieval mechanism)
TG_GRAPH_RAG_QUESTION = TG + "GraphRagQuestion"
TG_DOC_RAG_QUESTION = TG + "DocRagQuestion"
TG_AGENT_QUESTION = TG + "AgentQuestion"

# Agent provenance predicates
# NOTE: TG_PATTERN_DECISION and TG_ERROR_TYPE in this group are types
# (capitalised local names), not predicates, as their inline comments say.
TG_THOUGHT = TG + "thought"  # Links iteration to thought sub-entity
TG_ACTION = TG + "action"
TG_ARGUMENTS = TG + "arguments"
TG_OBSERVATION = TG + "observation"  # Links iteration to observation sub-entity
TG_SUBAGENT_GOAL = TG + "subagentGoal"  # Goal string on Decomposition/Finding
TG_PLAN_STEP = TG + "planStep"  # Step goal string on Plan/StepResult
TG_TOOL_CANDIDATE = TG + "toolCandidate"  # Tool name on Analysis events
TG_TERMINATION_REASON = TG + "terminationReason"  # Why the agent loop stopped
TG_STEP_NUMBER = TG + "stepNumber"  # Explicit step counter on iteration events
TG_PATTERN_DECISION = TG + "PatternDecision"  # Meta-router routing decision entity type
TG_PATTERN = TG + "pattern"  # Selected execution pattern
TG_TASK_TYPE = TG + "taskType"  # Identified task type
TG_LLM_DURATION_MS = TG + "llmDurationMs"  # LLM call duration in milliseconds
TG_TOOL_DURATION_MS = TG + "toolDurationMs"  # Tool execution duration in milliseconds
TG_TOOL_ERROR = TG + "toolError"  # Error message from a failed tool execution
TG_ERROR_TYPE = TG + "Error"  # Mixin type for failure events
TG_IN_TOKEN = TG + "inToken"  # Input token count for an LLM call
TG_OUT_TOKEN = TG + "outToken"  # Output token count for an LLM call

# Named graph URIs for RDF datasets
# These separate different types of data while keeping them in the same collection
# The default graph is identified by the empty string rather than a URN.
GRAPH_DEFAULT = ""  # Core knowledge facts (triples extracted from documents)
GRAPH_SOURCE = "urn:graph:source"  # Extraction provenance (which document/chunk a triple came from)
GRAPH_RETRIEVAL = "urn:graph:retrieval"  # Query-time explainability (question, exploration, focus, synthesis)