Enhance retrieval pipelines: 4-stage GraphRAG, DocRAG grounding (#697)

Enhance retrieval pipelines: 4-stage GraphRAG, DocRAG grounding,
consistent PROV-O

GraphRAG:
- Split retrieval into 4 prompt stages: extract-concepts,
  kg-edge-scoring,
  kg-edge-reasoning, kg-synthesis (was single-stage)
- Add concept extraction (grounding) for per-concept embedding
- Filter main query to default graph, ignoring
  provenance/explainability edges
- Add source document edges to knowledge graph

DocumentRAG:
- Add grounding step with concept extraction, matching GraphRAG's
  pattern:
  Question → Grounding → Exploration → Synthesis
- Per-concept embedding and chunk retrieval with deduplication

Cross-pipeline:
- Make PROV-O derivation links consistent: wasGeneratedBy for first
  entity from Activity, wasDerivedFrom for entity-to-entity chains
- Update CLIs (tg-invoke-agent, tg-invoke-graph-rag,
  tg-invoke-document-rag) for new explainability structure
- Fix all affected unit and integration tests
This commit is contained in:
cybermaggedon 2026-03-16 12:12:13 +00:00 committed by GitHub
parent 29b4300808
commit a115ec06ab
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
25 changed files with 1537 additions and 1008 deletions

View file

@ -75,9 +75,11 @@ from .explainability import (
ExplainabilityClient,
ExplainEntity,
Question,
Grounding,
Exploration,
Focus,
Synthesis,
Reflection,
Analysis,
Conclusion,
EdgeSelection,

View file

@ -18,25 +18,28 @@ TG_EDGE_COUNT = TG + "edgeCount"
TG_SELECTED_EDGE = TG + "selectedEdge"
TG_EDGE = TG + "edge"
TG_REASONING = TG + "reasoning"
TG_CONTENT = TG + "content"
TG_DOCUMENT = TG + "document"
TG_CONCEPT = TG + "concept"
TG_ENTITY = TG + "entity"
TG_CHUNK_COUNT = TG + "chunkCount"
TG_SELECTED_CHUNK = TG + "selectedChunk"
TG_THOUGHT = TG + "thought"
TG_ACTION = TG + "action"
TG_ARGUMENTS = TG + "arguments"
TG_OBSERVATION = TG + "observation"
TG_ANSWER = TG + "answer"
TG_THOUGHT_DOCUMENT = TG + "thoughtDocument"
TG_OBSERVATION_DOCUMENT = TG + "observationDocument"
# Entity types
TG_QUESTION = TG + "Question"
TG_GROUNDING = TG + "Grounding"
TG_EXPLORATION = TG + "Exploration"
TG_FOCUS = TG + "Focus"
TG_SYNTHESIS = TG + "Synthesis"
TG_ANALYSIS = TG + "Analysis"
TG_CONCLUSION = TG + "Conclusion"
TG_ANSWER_TYPE = TG + "Answer"
TG_REFLECTION_TYPE = TG + "Reflection"
TG_THOUGHT_TYPE = TG + "Thought"
TG_OBSERVATION_TYPE = TG + "Observation"
TG_GRAPH_RAG_QUESTION = TG + "GraphRagQuestion"
TG_DOC_RAG_QUESTION = TG + "DocRagQuestion"
TG_AGENT_QUESTION = TG + "AgentQuestion"
@ -73,12 +76,16 @@ class ExplainEntity:
if TG_GRAPH_RAG_QUESTION in types or TG_DOC_RAG_QUESTION in types or TG_AGENT_QUESTION in types:
return Question.from_triples(uri, triples, types)
elif TG_GROUNDING in types:
return Grounding.from_triples(uri, triples)
elif TG_EXPLORATION in types:
return Exploration.from_triples(uri, triples)
elif TG_FOCUS in types:
return Focus.from_triples(uri, triples)
elif TG_SYNTHESIS in types:
return Synthesis.from_triples(uri, triples)
elif TG_REFLECTION_TYPE in types:
return Reflection.from_triples(uri, triples)
elif TG_ANALYSIS in types:
return Analysis.from_triples(uri, triples)
elif TG_CONCLUSION in types:
@ -124,16 +131,38 @@ class Question(ExplainEntity):
)
@dataclass
class Grounding(ExplainEntity):
    """Grounding entity - the concept decomposition of a query."""

    # Concept strings collected from tg:concept triples.
    concepts: List[str] = field(default_factory=list)

    @classmethod
    def from_triples(cls, uri: str, triples: List[Tuple[str, str, Any]]) -> "Grounding":
        """Build a Grounding from its triples, gathering every tg:concept object."""
        extracted = [obj for _, pred, obj in triples if pred == TG_CONCEPT]
        return cls(
            uri=uri,
            entity_type="grounding",
            concepts=extracted,
        )
@dataclass
class Exploration(ExplainEntity):
"""Exploration entity - edges/chunks retrieved from the knowledge store."""
edge_count: int = 0
chunk_count: int = 0
entities: List[str] = field(default_factory=list)
@classmethod
def from_triples(cls, uri: str, triples: List[Tuple[str, str, Any]]) -> "Exploration":
edge_count = 0
chunk_count = 0
entities = []
for s, p, o in triples:
if p == TG_EDGE_COUNT:
@ -146,12 +175,15 @@ class Exploration(ExplainEntity):
chunk_count = int(o)
except (ValueError, TypeError):
pass
elif p == TG_ENTITY:
entities.append(o)
return cls(
uri=uri,
entity_type="exploration",
edge_count=edge_count,
chunk_count=chunk_count
chunk_count=chunk_count,
entities=entities
)
@ -180,94 +212,104 @@ class Focus(ExplainEntity):
@dataclass
class Synthesis(ExplainEntity):
"""Synthesis entity - the final answer."""
# NOTE(review): this span is a rendered diff hunk with indentation stripped;
# it interleaves removed and added lines (see the duplicated TG_DOCUMENT
# checks below). Reconcile against the real source before relying on it.
content: str = ""
document_uri: str = "" # Reference to librarian document
@classmethod
def from_triples(cls, uri: str, triples: List[Tuple[str, str, Any]]) -> "Synthesis":
# Scan the triples for content / document predicates.
content = ""
document_uri = ""
for s, p, o in triples:
if p == TG_CONTENT:
content = o
# NOTE(review): the 'elif' line is the pre-change version and the plain
# 'if' line the post-change version — only one belongs in the actual file.
elif p == TG_DOCUMENT:
if p == TG_DOCUMENT:
document_uri = o
return cls(
uri=uri,
entity_type="synthesis",
content=content,
document_uri=document_uri
)
@dataclass
class Reflection(ExplainEntity):
    """Reflection entity - intermediate commentary (Thought or Observation)."""

    # Librarian document holding the reflection text.
    document_uri: str = ""
    # "thought" or "observation", derived from the rdf:type triples.
    reflection_type: str = ""

    @classmethod
    def from_triples(cls, uri: str, triples: List[Tuple[str, str, Any]]) -> "Reflection":
        """Build a Reflection, classifying it by rdf:type and picking up its document."""
        rdf_types = {obj for _, pred, obj in triples if pred == RDF_TYPE}
        if TG_THOUGHT_TYPE in rdf_types:
            kind = "thought"
        elif TG_OBSERVATION_TYPE in rdf_types:
            kind = "observation"
        else:
            kind = ""
        doc = ""
        # Last tg:document triple wins, matching the original scan.
        for _, pred, obj in triples:
            if pred == TG_DOCUMENT:
                doc = obj
        return cls(
            uri=uri,
            entity_type="reflection",
            document_uri=doc,
            reflection_type=kind,
        )
@dataclass
class Analysis(ExplainEntity):
"""Analysis entity - one think/act/observe cycle (Agent only)."""
# NOTE(review): rendered diff hunk — old fields (thought/observation text and
# *_document_uri) and new fields (thought_uri/observation_uri sub-entity URIs)
# both appear here; only one set belongs in the actual file.
thought: str = ""
action: str = ""
arguments: str = "" # JSON string
observation: str = ""
thought_document_uri: str = "" # Reference to thought in librarian
observation_document_uri: str = "" # Reference to observation in librarian
thought_uri: str = "" # URI of thought sub-entity
observation_uri: str = "" # URI of observation sub-entity
@classmethod
def from_triples(cls, uri: str, triples: List[Tuple[str, str, Any]]) -> "Analysis":
thought = ""
action = ""
arguments = ""
observation = ""
thought_document_uri = ""
observation_document_uri = ""
thought_uri = ""
observation_uri = ""
for s, p, o in triples:
# NOTE(review): the branches below interleave the pre-change chain
# (TG_THOUGHT -> thought text) with the post-change chain
# (TG_THOUGHT -> thought_uri); reconcile against the real source.
if p == TG_THOUGHT:
thought = o
elif p == TG_ACTION:
if p == TG_ACTION:
action = o
elif p == TG_ARGUMENTS:
arguments = o
elif p == TG_THOUGHT:
thought_uri = o
elif p == TG_OBSERVATION:
observation = o
elif p == TG_THOUGHT_DOCUMENT:
thought_document_uri = o
elif p == TG_OBSERVATION_DOCUMENT:
observation_document_uri = o
observation_uri = o
return cls(
uri=uri,
entity_type="analysis",
thought=thought,
action=action,
arguments=arguments,
observation=observation,
# NOTE(review): old kwargs (document URIs) and new kwargs (sub-entity
# URIs) are both shown; the missing comma after the next line is a
# diff-rendering artifact, not valid Python.
thought_document_uri=thought_document_uri,
observation_document_uri=observation_document_uri
thought_uri=thought_uri,
observation_uri=observation_uri
)
@dataclass
class Conclusion(ExplainEntity):
"""Conclusion entity - final answer (Agent only)."""
# NOTE(review): rendered diff hunk; the inline 'answer' handling is the
# pre-change version, the document-only handling the post-change version.
answer: str = ""
document_uri: str = "" # Reference to librarian document
@classmethod
def from_triples(cls, uri: str, triples: List[Tuple[str, str, Any]]) -> "Conclusion":
answer = ""
document_uri = ""
for s, p, o in triples:
if p == TG_ANSWER:
answer = o
# NOTE(review): 'elif' is the old line, the plain 'if' the new one —
# only one belongs in the actual file.
elif p == TG_DOCUMENT:
if p == TG_DOCUMENT:
document_uri = o
return cls(
uri=uri,
entity_type="conclusion",
answer=answer,
document_uri=document_uri
)
@ -543,42 +585,29 @@ class ExplainabilityClient:
o_label = self.resolve_label(edge.get("o", ""), user, collection)
return (s_label, p_label, o_label)
def fetch_synthesis_content(
def fetch_document_content(
self,
synthesis: Synthesis,
document_uri: str,
api: Any,
user: Optional[str] = None,
max_content: int = 10000
) -> str:
"""
Fetch the content for a Synthesis entity.
If synthesis has inline content, returns that.
If synthesis has a document_uri, fetches from librarian with retry.
Fetch content from the librarian by document URI.
Args:
synthesis: The Synthesis entity
document_uri: The document URI in the librarian
api: TrustGraph Api instance for librarian access
user: User identifier for librarian
max_content: Maximum content length to return
Returns:
The synthesis content as a string
The document content as a string
"""
# If inline content exists, use it
if synthesis.content:
if len(synthesis.content) > max_content:
return synthesis.content[:max_content] + "... [truncated]"
return synthesis.content
# Otherwise fetch from librarian
if not synthesis.document_uri:
if not document_uri:
return ""
# Extract document ID from URI (e.g., "urn:document:abc123" -> "abc123")
doc_id = synthesis.document_uri
if doc_id.startswith("urn:document:"):
doc_id = doc_id[len("urn:document:"):]
doc_id = document_uri
# Retry fetching from librarian for eventual consistency
for attempt in range(self.max_retries):
@ -603,129 +632,6 @@ class ExplainabilityClient:
return ""
# NOTE(review): this method is shown as a deletion in this commit (the hunk
# header records 129 removed lines); its role is replaced by the generic
# fetch_document_content.
def fetch_conclusion_content(
self,
conclusion: Conclusion,
api: Any,
user: Optional[str] = None,
max_content: int = 10000
) -> str:
"""
Fetch the content for a Conclusion entity (Agent final answer).
If conclusion has inline answer, returns that.
If conclusion has a document_uri, fetches from librarian with retry.
Args:
conclusion: The Conclusion entity
api: TrustGraph Api instance for librarian access
user: User identifier for librarian
max_content: Maximum content length to return
Returns:
The conclusion answer as a string
"""
# If inline answer exists, use it
if conclusion.answer:
if len(conclusion.answer) > max_content:
return conclusion.answer[:max_content] + "... [truncated]"
return conclusion.answer
# Otherwise fetch from librarian
if not conclusion.document_uri:
return ""
# Use document URI directly (it's already a full URN)
doc_id = conclusion.document_uri
# Retry fetching from librarian for eventual consistency
for attempt in range(self.max_retries):
try:
library = api.library()
content_bytes = library.get_document_content(user=user, id=doc_id)
# Decode as text
try:
content = content_bytes.decode('utf-8')
if len(content) > max_content:
return content[:max_content] + "... [truncated]"
return content
except UnicodeDecodeError:
# Non-text payload: report its size rather than raising.
return f"[Binary: {len(content_bytes)} bytes]"
except Exception as e:
# Retry transient failures; surface the error text on the last attempt.
if attempt < self.max_retries - 1:
time.sleep(self.retry_delay)
continue
return f"[Error fetching content: {e}]"
return ""
# NOTE(review): this method is shown as a deletion in this commit; the new
# Analysis structure references thought/observation sub-entities instead.
def fetch_analysis_content(
self,
analysis: Analysis,
api: Any,
user: Optional[str] = None,
max_content: int = 10000
) -> None:
"""
Fetch thought and observation content for an Analysis entity.
If analysis has inline content, uses that.
If analysis has document URIs, fetches from librarian with retry.
Modifies the analysis object in place.
Args:
analysis: The Analysis entity (modified in place)
api: TrustGraph Api instance for librarian access
user: User identifier for librarian
max_content: Maximum content length to return
"""
# Fetch thought if needed
if not analysis.thought and analysis.thought_document_uri:
doc_id = analysis.thought_document_uri
# Retry loop for eventual consistency of librarian writes.
for attempt in range(self.max_retries):
try:
library = api.library()
content_bytes = library.get_document_content(user=user, id=doc_id)
try:
content = content_bytes.decode('utf-8')
if len(content) > max_content:
analysis.thought = content[:max_content] + "... [truncated]"
else:
analysis.thought = content
break
except UnicodeDecodeError:
# Non-text payload: record its size instead of raising.
analysis.thought = f"[Binary: {len(content_bytes)} bytes]"
break
except Exception as e:
if attempt < self.max_retries - 1:
time.sleep(self.retry_delay)
continue
# Last attempt: store the error text in place of the thought.
analysis.thought = f"[Error fetching thought: {e}]"
# Fetch observation if needed (same retry pattern as the thought above)
if not analysis.observation and analysis.observation_document_uri:
doc_id = analysis.observation_document_uri
for attempt in range(self.max_retries):
try:
library = api.library()
content_bytes = library.get_document_content(user=user, id=doc_id)
try:
content = content_bytes.decode('utf-8')
if len(content) > max_content:
analysis.observation = content[:max_content] + "... [truncated]"
else:
analysis.observation = content
break
except UnicodeDecodeError:
analysis.observation = f"[Binary: {len(content_bytes)} bytes]"
break
except Exception as e:
if attempt < self.max_retries - 1:
time.sleep(self.retry_delay)
continue
analysis.observation = f"[Error fetching observation: {e}]"
def fetch_graphrag_trace(
self,
@ -739,7 +645,7 @@ class ExplainabilityClient:
"""
Fetch the complete GraphRAG trace starting from a question URI.
Follows the provenance chain: Question -> Exploration -> Focus -> Synthesis
Follows the provenance chain: Question -> Grounding -> Exploration -> Focus -> Synthesis
Args:
question_uri: The question entity URI
@ -750,13 +656,14 @@ class ExplainabilityClient:
max_content: Maximum content length for synthesis
Returns:
Dict with question, exploration, focus, synthesis entities
Dict with question, grounding, exploration, focus, synthesis entities
"""
if graph is None:
graph = "urn:graph:retrieval"
trace = {
"question": None,
"grounding": None,
"exploration": None,
"focus": None,
"synthesis": None,
@ -768,8 +675,8 @@ class ExplainabilityClient:
return trace
trace["question"] = question
# Find exploration: ?exploration prov:wasGeneratedBy question_uri
exploration_triples = self.flow.triples_query(
# Find grounding: ?grounding prov:wasGeneratedBy question_uri
grounding_triples = self.flow.triples_query(
p=PROV_WAS_GENERATED_BY,
o=question_uri,
g=graph,
@ -778,6 +685,30 @@ class ExplainabilityClient:
limit=10
)
if grounding_triples:
grounding_uris = [
extract_term_value(t.get("s", {}))
for t in grounding_triples
]
for gnd_uri in grounding_uris:
grounding = self.fetch_entity(gnd_uri, graph, user, collection)
if isinstance(grounding, Grounding):
trace["grounding"] = grounding
break
if not trace["grounding"]:
return trace
# Find exploration: ?exploration prov:wasDerivedFrom grounding_uri
exploration_triples = self.flow.triples_query(
p=PROV_WAS_DERIVED_FROM,
o=trace["grounding"].uri,
g=graph,
user=user,
collection=collection,
limit=10
)
if exploration_triples:
exploration_uris = [
extract_term_value(t.get("s", {}))
@ -834,11 +765,6 @@ class ExplainabilityClient:
for synth_uri in synthesis_uris:
synthesis = self.fetch_entity(synth_uri, graph, user, collection)
if isinstance(synthesis, Synthesis):
# Fetch content if needed
if api and not synthesis.content and synthesis.document_uri:
synthesis.content = self.fetch_synthesis_content(
synthesis, api, user, max_content
)
trace["synthesis"] = synthesis
break
@ -928,11 +854,6 @@ class ExplainabilityClient:
for synth_uri in synthesis_uris:
synthesis = self.fetch_entity(synth_uri, graph, user, collection)
if isinstance(synthesis, Synthesis):
# Fetch content if needed
if api and not synthesis.content and synthesis.document_uri:
synthesis.content = self.fetch_synthesis_content(
synthesis, api, user, max_content
)
trace["synthesis"] = synthesis
break
@ -978,20 +899,43 @@ class ExplainabilityClient:
return trace
trace["question"] = question
# Follow the chain of wasDerivedFrom
# Follow the chain: wasGeneratedBy for first hop, wasDerivedFrom after
current_uri = session_uri
is_first = True
max_iterations = 50 # Safety limit
for _ in range(max_iterations):
# Find entity derived from current
derived_triples = self.flow.triples_query(
p=PROV_WAS_DERIVED_FROM,
o=current_uri,
g=graph,
user=user,
collection=collection,
limit=10
)
# First hop uses wasGeneratedBy (entity←activity),
# subsequent hops use wasDerivedFrom (entity←entity)
if is_first:
derived_triples = self.flow.triples_query(
p=PROV_WAS_GENERATED_BY,
o=current_uri,
g=graph,
user=user,
collection=collection,
limit=10
)
# Fall back to wasDerivedFrom for backwards compatibility
if not derived_triples:
derived_triples = self.flow.triples_query(
p=PROV_WAS_DERIVED_FROM,
o=current_uri,
g=graph,
user=user,
collection=collection,
limit=10
)
is_first = False
else:
derived_triples = self.flow.triples_query(
p=PROV_WAS_DERIVED_FROM,
o=current_uri,
g=graph,
user=user,
collection=collection,
limit=10
)
if not derived_triples:
break
@ -1003,19 +947,9 @@ class ExplainabilityClient:
entity = self.fetch_entity(derived_uri, graph, user, collection)
if isinstance(entity, Analysis):
# Fetch thought/observation content from librarian if needed
if api:
self.fetch_analysis_content(
entity, api, user=user, max_content=max_content
)
trace["iterations"].append(entity)
current_uri = derived_uri
elif isinstance(entity, Conclusion):
# Fetch answer content from librarian if needed
if api and not entity.answer and entity.document_uri:
entity.answer = self.fetch_conclusion_content(
entity, api, user=user, max_content=max_content
)
trace["conclusion"] = entity
break
else:

View file

@ -1,6 +1,6 @@
from . request_response_spec import RequestResponse, RequestResponseSpec
from .. schema import TriplesQueryRequest, TriplesQueryResponse, Term, IRI, LITERAL
from .. schema import TriplesQueryRequest, TriplesQueryResponse, Term, IRI, LITERAL, TRIPLE
from .. knowledge import Uri, Literal
@ -22,9 +22,11 @@ def to_value(x):
def from_value(x):
"""Convert Uri, Literal, or string to schema Term."""
"""Convert Uri, Literal, string, or Term to schema Term."""
if x is None:
return None
if isinstance(x, Term):
return x
if isinstance(x, Uri):
return Term(type=IRI, iri=str(x))
elif isinstance(x, Literal):
@ -41,7 +43,7 @@ def from_value(x):
class TriplesClient(RequestResponse):
async def query(self, s=None, p=None, o=None, limit=20,
user="trustgraph", collection="default",
timeout=30):
timeout=30, g=None):
resp = await self.request(
TriplesQueryRequest(
@ -51,6 +53,7 @@ class TriplesClient(RequestResponse):
limit = limit,
user = user,
collection = collection,
g = g,
),
timeout=timeout
)
@ -68,7 +71,7 @@ class TriplesClient(RequestResponse):
async def query_stream(self, s=None, p=None, o=None, limit=20,
user="trustgraph", collection="default",
batch_size=20, timeout=30,
batch_callback=None):
batch_callback=None, g=None):
"""
Streaming triple query - calls callback for each batch as it arrives.
@ -80,6 +83,8 @@ class TriplesClient(RequestResponse):
batch_size: Triples per batch
timeout: Request timeout in seconds
batch_callback: Async callback(batch, is_final) called for each batch
g: Graph filter. ""=default graph only, None=all graphs,
or a specific graph IRI.
Returns:
List[Triple]: All triples (flattened) if no callback provided
@ -112,6 +117,7 @@ class TriplesClient(RequestResponse):
collection=collection,
streaming=True,
batch_size=batch_size,
g=g,
),
timeout=timeout,
recipient=recipient,

View file

@ -84,6 +84,7 @@ class GraphRagRequestTranslator(MessageTranslator):
triple_limit=int(data.get("triple-limit", 30)),
max_subgraph_size=int(data.get("max-subgraph-size", 1000)),
max_path_length=int(data.get("max-path-length", 2)),
edge_limit=int(data.get("edge-limit", 25)),
streaming=data.get("streaming", False)
)
@ -96,6 +97,7 @@ class GraphRagRequestTranslator(MessageTranslator):
"triple-limit": obj.triple_limit,
"max-subgraph-size": obj.max_subgraph_size,
"max-path-length": obj.max_path_length,
"edge-limit": obj.edge_limit,
"streaming": getattr(obj, "streaming", False)
}

View file

@ -42,15 +42,19 @@ from . uris import (
agent_uri,
# Query-time provenance URIs (GraphRAG)
question_uri,
grounding_uri,
exploration_uri,
focus_uri,
synthesis_uri,
# Agent provenance URIs
agent_session_uri,
agent_iteration_uri,
agent_thought_uri,
agent_observation_uri,
agent_final_uri,
# Document RAG provenance URIs
docrag_question_uri,
docrag_grounding_uri,
docrag_exploration_uri,
docrag_synthesis_uri,
)
@ -74,18 +78,19 @@ from . namespaces import (
# Extraction provenance entity types
TG_DOCUMENT_TYPE, TG_PAGE_TYPE, TG_CHUNK_TYPE, TG_SUBGRAPH_TYPE,
# Query-time provenance predicates (GraphRAG)
TG_QUERY, TG_EDGE_COUNT, TG_SELECTED_EDGE, TG_REASONING, TG_CONTENT,
TG_QUERY, TG_CONCEPT, TG_ENTITY,
TG_EDGE_COUNT, TG_SELECTED_EDGE, TG_REASONING,
# Query-time provenance predicates (DocumentRAG)
TG_CHUNK_COUNT, TG_SELECTED_CHUNK,
# Explainability entity types
TG_QUESTION, TG_EXPLORATION, TG_FOCUS, TG_SYNTHESIS,
TG_QUESTION, TG_GROUNDING, TG_EXPLORATION, TG_FOCUS, TG_SYNTHESIS,
TG_ANALYSIS, TG_CONCLUSION,
# Unifying types
TG_ANSWER_TYPE, TG_REFLECTION_TYPE, TG_THOUGHT_TYPE, TG_OBSERVATION_TYPE,
# Question subtypes (to distinguish retrieval mechanism)
TG_GRAPH_RAG_QUESTION, TG_DOC_RAG_QUESTION, TG_AGENT_QUESTION,
# Agent provenance predicates
TG_THOUGHT, TG_ACTION, TG_ARGUMENTS, TG_OBSERVATION, TG_ANSWER,
# Agent document references
TG_THOUGHT_DOCUMENT, TG_OBSERVATION_DOCUMENT,
TG_THOUGHT, TG_ACTION, TG_ARGUMENTS, TG_OBSERVATION,
# Document reference predicate
TG_DOCUMENT,
# Named graphs
@ -99,6 +104,7 @@ from . triples import (
subgraph_provenance_triples,
# Query-time provenance triple builders (GraphRAG)
question_triples,
grounding_triples,
exploration_triples,
focus_triples,
synthesis_triples,
@ -139,15 +145,19 @@ __all__ = [
"agent_uri",
# Query-time provenance URIs
"question_uri",
"grounding_uri",
"exploration_uri",
"focus_uri",
"synthesis_uri",
# Agent provenance URIs
"agent_session_uri",
"agent_iteration_uri",
"agent_thought_uri",
"agent_observation_uri",
"agent_final_uri",
# Document RAG provenance URIs
"docrag_question_uri",
"docrag_grounding_uri",
"docrag_exploration_uri",
"docrag_synthesis_uri",
# Namespaces
@ -164,18 +174,19 @@ __all__ = [
# Extraction provenance entity types
"TG_DOCUMENT_TYPE", "TG_PAGE_TYPE", "TG_CHUNK_TYPE", "TG_SUBGRAPH_TYPE",
# Query-time provenance predicates (GraphRAG)
"TG_QUERY", "TG_EDGE_COUNT", "TG_SELECTED_EDGE", "TG_REASONING", "TG_CONTENT",
"TG_QUERY", "TG_CONCEPT", "TG_ENTITY",
"TG_EDGE_COUNT", "TG_SELECTED_EDGE", "TG_REASONING",
# Query-time provenance predicates (DocumentRAG)
"TG_CHUNK_COUNT", "TG_SELECTED_CHUNK",
# Explainability entity types
"TG_QUESTION", "TG_EXPLORATION", "TG_FOCUS", "TG_SYNTHESIS",
"TG_QUESTION", "TG_GROUNDING", "TG_EXPLORATION", "TG_FOCUS", "TG_SYNTHESIS",
"TG_ANALYSIS", "TG_CONCLUSION",
# Unifying types
"TG_ANSWER_TYPE", "TG_REFLECTION_TYPE", "TG_THOUGHT_TYPE", "TG_OBSERVATION_TYPE",
# Question subtypes
"TG_GRAPH_RAG_QUESTION", "TG_DOC_RAG_QUESTION", "TG_AGENT_QUESTION",
# Agent provenance predicates
"TG_THOUGHT", "TG_ACTION", "TG_ARGUMENTS", "TG_OBSERVATION", "TG_ANSWER",
# Agent document references
"TG_THOUGHT_DOCUMENT", "TG_OBSERVATION_DOCUMENT",
"TG_THOUGHT", "TG_ACTION", "TG_ARGUMENTS", "TG_OBSERVATION",
# Document reference predicate
"TG_DOCUMENT",
# Named graphs
@ -186,6 +197,7 @@ __all__ = [
"subgraph_provenance_triples",
# Query-time provenance triple builders (GraphRAG)
"question_triples",
"grounding_triples",
"exploration_triples",
"focus_triples",
"synthesis_triples",

View file

@ -15,10 +15,11 @@ from .. schema import Triple, Term, IRI, LITERAL
from . namespaces import (
RDF_TYPE, RDFS_LABEL,
PROV_ACTIVITY, PROV_ENTITY, PROV_WAS_DERIVED_FROM, PROV_STARTED_AT_TIME,
TG_QUERY, TG_THOUGHT, TG_ACTION, TG_ARGUMENTS, TG_OBSERVATION, TG_ANSWER,
PROV_ACTIVITY, PROV_ENTITY, PROV_WAS_DERIVED_FROM,
PROV_WAS_GENERATED_BY, PROV_STARTED_AT_TIME,
TG_QUERY, TG_THOUGHT, TG_ACTION, TG_ARGUMENTS, TG_OBSERVATION,
TG_QUESTION, TG_ANALYSIS, TG_CONCLUSION, TG_DOCUMENT,
TG_THOUGHT_DOCUMENT, TG_OBSERVATION_DOCUMENT,
TG_ANSWER_TYPE, TG_REFLECTION_TYPE, TG_THOUGHT_TYPE, TG_OBSERVATION_TYPE,
TG_AGENT_QUESTION,
)
@ -73,12 +74,13 @@ def agent_session_triples(
def agent_iteration_triples(
iteration_uri: str,
parent_uri: str,
thought: str = "",
question_uri: Optional[str] = None,
previous_uri: Optional[str] = None,
action: str = "",
arguments: Dict[str, Any] = None,
observation: str = "",
thought_uri: Optional[str] = None,
thought_document_id: Optional[str] = None,
observation_uri: Optional[str] = None,
observation_document_id: Optional[str] = None,
) -> List[Triple]:
"""
@ -86,19 +88,22 @@ def agent_iteration_triples(
Creates:
- Entity declaration with tg:Analysis type
- wasDerivedFrom link to parent (previous iteration or session)
- Thought, action, arguments, and observation data
- Document references for thought/observation when stored in librarian
- wasGeneratedBy link to question (if first iteration)
- wasDerivedFrom link to previous iteration (if not first)
- Action and arguments metadata
- Thought sub-entity (tg:Reflection, tg:Thought) with librarian document
- Observation sub-entity (tg:Reflection, tg:Observation) with librarian document
Args:
iteration_uri: URI of this iteration (from agent_iteration_uri)
parent_uri: URI of the parent (previous iteration or session)
thought: The agent's reasoning/thought (used if thought_document_id not provided)
question_uri: URI of the question activity (for first iteration)
previous_uri: URI of the previous iteration (for subsequent iterations)
action: The tool/action name
arguments: Arguments passed to the tool (will be JSON-encoded)
observation: The result/observation from the tool (used if observation_document_id not provided)
thought_document_id: Optional document URI for thought in librarian (preferred)
observation_document_id: Optional document URI for observation in librarian (preferred)
thought_uri: URI for the thought sub-entity
thought_document_id: Document URI for thought in librarian
observation_uri: URI for the observation sub-entity
observation_document_id: Document URI for observation in librarian
Returns:
List of Triple objects
@ -110,45 +115,70 @@ def agent_iteration_triples(
_triple(iteration_uri, RDF_TYPE, _iri(PROV_ENTITY)),
_triple(iteration_uri, RDF_TYPE, _iri(TG_ANALYSIS)),
_triple(iteration_uri, RDFS_LABEL, _literal(f"Analysis: {action}")),
_triple(iteration_uri, PROV_WAS_DERIVED_FROM, _iri(parent_uri)),
_triple(iteration_uri, TG_ACTION, _literal(action)),
_triple(iteration_uri, TG_ARGUMENTS, _literal(json.dumps(arguments))),
]
# Thought: use document reference or inline
if thought_document_id:
triples.append(_triple(iteration_uri, TG_THOUGHT_DOCUMENT, _iri(thought_document_id)))
elif thought:
triples.append(_triple(iteration_uri, TG_THOUGHT, _literal(thought)))
if question_uri:
triples.append(
_triple(iteration_uri, PROV_WAS_GENERATED_BY, _iri(question_uri))
)
elif previous_uri:
triples.append(
_triple(iteration_uri, PROV_WAS_DERIVED_FROM, _iri(previous_uri))
)
# Observation: use document reference or inline
if observation_document_id:
triples.append(_triple(iteration_uri, TG_OBSERVATION_DOCUMENT, _iri(observation_document_id)))
elif observation:
triples.append(_triple(iteration_uri, TG_OBSERVATION, _literal(observation)))
# Thought sub-entity
if thought_uri:
triples.extend([
_triple(iteration_uri, TG_THOUGHT, _iri(thought_uri)),
_triple(thought_uri, RDF_TYPE, _iri(TG_REFLECTION_TYPE)),
_triple(thought_uri, RDF_TYPE, _iri(TG_THOUGHT_TYPE)),
_triple(thought_uri, RDFS_LABEL, _literal("Thought")),
_triple(thought_uri, PROV_WAS_GENERATED_BY, _iri(iteration_uri)),
])
if thought_document_id:
triples.append(
_triple(thought_uri, TG_DOCUMENT, _iri(thought_document_id))
)
# Observation sub-entity
if observation_uri:
triples.extend([
_triple(iteration_uri, TG_OBSERVATION, _iri(observation_uri)),
_triple(observation_uri, RDF_TYPE, _iri(TG_REFLECTION_TYPE)),
_triple(observation_uri, RDF_TYPE, _iri(TG_OBSERVATION_TYPE)),
_triple(observation_uri, RDFS_LABEL, _literal("Observation")),
_triple(observation_uri, PROV_WAS_GENERATED_BY, _iri(iteration_uri)),
])
if observation_document_id:
triples.append(
_triple(observation_uri, TG_DOCUMENT, _iri(observation_document_id))
)
return triples
def agent_final_triples(
final_uri: str,
parent_uri: str,
answer: str = "",
question_uri: Optional[str] = None,
previous_uri: Optional[str] = None,
document_id: Optional[str] = None,
) -> List[Triple]:
"""
Build triples for an agent final answer (Conclusion).
Creates:
- Entity declaration with tg:Conclusion type
- wasDerivedFrom link to parent (last iteration or session)
- Either document reference (if document_id provided) or inline answer
- Entity declaration with tg:Conclusion and tg:Answer types
- wasGeneratedBy link to question (if no iterations)
- wasDerivedFrom link to last iteration (if iterations exist)
- Document reference to librarian
Args:
final_uri: URI of the final answer (from agent_final_uri)
parent_uri: URI of the parent (last iteration or session if no iterations)
answer: The final answer text (used if document_id not provided)
document_id: Optional document URI in librarian (preferred)
question_uri: URI of the question activity (if no iterations)
previous_uri: URI of the last iteration (if iterations exist)
document_id: Librarian document ID for the answer content
Returns:
List of Triple objects
@ -156,15 +186,20 @@ def agent_final_triples(
triples = [
_triple(final_uri, RDF_TYPE, _iri(PROV_ENTITY)),
_triple(final_uri, RDF_TYPE, _iri(TG_CONCLUSION)),
_triple(final_uri, RDF_TYPE, _iri(TG_ANSWER_TYPE)),
_triple(final_uri, RDFS_LABEL, _literal("Conclusion")),
_triple(final_uri, PROV_WAS_DERIVED_FROM, _iri(parent_uri)),
]
if question_uri:
triples.append(
_triple(final_uri, PROV_WAS_GENERATED_BY, _iri(question_uri))
)
elif previous_uri:
triples.append(
_triple(final_uri, PROV_WAS_DERIVED_FROM, _iri(previous_uri))
)
if document_id:
# Store reference to document in librarian (as IRI)
triples.append(_triple(final_uri, TG_DOCUMENT, _iri(document_id)))
elif answer:
# Fallback: store inline answer
triples.append(_triple(final_uri, TG_ANSWER, _literal(answer)))
return triples

View file

@ -60,11 +60,12 @@ TG_SOURCE_CHAR_LENGTH = TG + "sourceCharLength"
# Query-time provenance predicates (GraphRAG)
TG_QUERY = TG + "query"
TG_CONCEPT = TG + "concept"
TG_ENTITY = TG + "entity"
TG_EDGE_COUNT = TG + "edgeCount"
TG_SELECTED_EDGE = TG + "selectedEdge"
TG_EDGE = TG + "edge"
TG_REASONING = TG + "reasoning"
TG_CONTENT = TG + "content"
TG_DOCUMENT = TG + "document" # Reference to document in librarian
# Query-time provenance predicates (DocumentRAG)
@ -79,27 +80,29 @@ TG_SUBGRAPH_TYPE = TG + "Subgraph"
# Explainability entity types (shared)
TG_QUESTION = TG + "Question"
TG_GROUNDING = TG + "Grounding"
TG_EXPLORATION = TG + "Exploration"
TG_FOCUS = TG + "Focus"
TG_SYNTHESIS = TG + "Synthesis"
TG_ANALYSIS = TG + "Analysis"
TG_CONCLUSION = TG + "Conclusion"
# Unifying types for answer and intermediate commentary
TG_ANSWER_TYPE = TG + "Answer" # Final answer (Synthesis, Conclusion)
TG_REFLECTION_TYPE = TG + "Reflection" # Intermediate commentary (Thought, Observation)
TG_THOUGHT_TYPE = TG + "Thought" # Agent reasoning
TG_OBSERVATION_TYPE = TG + "Observation" # Agent tool result
# Question subtypes (to distinguish retrieval mechanism)
TG_GRAPH_RAG_QUESTION = TG + "GraphRagQuestion"
TG_DOC_RAG_QUESTION = TG + "DocRagQuestion"
TG_AGENT_QUESTION = TG + "AgentQuestion"
# Agent provenance predicates
TG_THOUGHT = TG + "thought"
TG_THOUGHT = TG + "thought" # Links iteration to thought sub-entity
TG_ACTION = TG + "action"
TG_ARGUMENTS = TG + "arguments"
TG_OBSERVATION = TG + "observation"
TG_ANSWER = TG + "answer"
# Agent document references (for librarian storage)
TG_THOUGHT_DOCUMENT = TG + "thoughtDocument"
TG_OBSERVATION_DOCUMENT = TG + "observationDocument"
TG_OBSERVATION = TG + "observation" # Links iteration to observation sub-entity
# Named graph URIs for RDF datasets
# These separate different types of data while keeping them in the same collection

View file

@ -20,12 +20,15 @@ from . namespaces import (
# Extraction provenance entity types
TG_DOCUMENT_TYPE, TG_PAGE_TYPE, TG_CHUNK_TYPE, TG_SUBGRAPH_TYPE,
# Query-time provenance predicates (GraphRAG)
TG_QUERY, TG_EDGE_COUNT, TG_SELECTED_EDGE, TG_EDGE, TG_REASONING, TG_CONTENT,
TG_QUERY, TG_CONCEPT, TG_ENTITY,
TG_EDGE_COUNT, TG_SELECTED_EDGE, TG_EDGE, TG_REASONING,
TG_DOCUMENT,
# Query-time provenance predicates (DocumentRAG)
TG_CHUNK_COUNT, TG_SELECTED_CHUNK,
# Explainability entity types
TG_QUESTION, TG_EXPLORATION, TG_FOCUS, TG_SYNTHESIS,
TG_QUESTION, TG_GROUNDING, TG_EXPLORATION, TG_FOCUS, TG_SYNTHESIS,
# Unifying types
TG_ANSWER_TYPE,
# Question subtypes
TG_GRAPH_RAG_QUESTION, TG_DOC_RAG_QUESTION,
)
@ -347,35 +350,78 @@ def question_triples(
]
def grounding_triples(
    grounding_uri: str,
    question_uri: str,
    concepts: List[str],
) -> List[Triple]:
    """
    Build triples for a grounding entity (concept decomposition of query).

    Creates:
    - Entity declaration for grounding
    - wasGeneratedBy link to question
    - Concept literals for each extracted concept

    Args:
        grounding_uri: URI of the grounding entity (from grounding_uri)
        question_uri: URI of the parent question
        concepts: List of concept strings extracted from the query

    Returns:
        List of Triple objects
    """
    # Fixed declaration/provenance triples for the grounding entity.
    base = [
        _triple(grounding_uri, RDF_TYPE, _iri(PROV_ENTITY)),
        _triple(grounding_uri, RDF_TYPE, _iri(TG_GROUNDING)),
        _triple(grounding_uri, RDFS_LABEL, _literal("Grounding")),
        _triple(grounding_uri, PROV_WAS_GENERATED_BY, _iri(question_uri)),
    ]
    # One tg:concept literal per extracted concept string.
    concept_triples = [
        _triple(grounding_uri, TG_CONCEPT, _literal(concept))
        for concept in concepts
    ]
    return base + concept_triples
def exploration_triples(
    exploration_uri: str,
    question_uri: str,
    grounding_uri: str,
    edge_count: int,
    entities: Optional[List[str]] = None,
) -> List[Triple]:
    """
    Build triples for an exploration entity (all edges retrieved from subgraph).

    Creates:
    - Entity declaration for exploration
    - wasGeneratedBy link to question
    - wasDerivedFrom link to grounding
    - Edge count metadata
    - Entity IRIs for each seed entity

    Args:
        exploration_uri: URI of the exploration entity (from exploration_uri)
        question_uri: URI of the parent question
        grounding_uri: URI of the parent grounding entity
        edge_count: Number of edges retrieved
        entities: Optional list of seed entity URIs

    Returns:
        List of Triple objects
    """
    triples = [
        _triple(exploration_uri, RDF_TYPE, _iri(PROV_ENTITY)),
        _triple(exploration_uri, RDF_TYPE, _iri(TG_EXPLORATION)),
        _triple(exploration_uri, RDFS_LABEL, _literal("Exploration")),
        _triple(exploration_uri, PROV_WAS_GENERATED_BY, _iri(question_uri)),
        _triple(exploration_uri, PROV_WAS_DERIVED_FROM, _iri(grounding_uri)),
        _triple(exploration_uri, TG_EDGE_COUNT, _literal(edge_count)),
    ]
    # Seed entities are recorded as IRIs so they resolve back to graph nodes.
    if entities:
        for entity in entities:
            triples.append(_triple(exploration_uri, TG_ENTITY, _iri(entity)))
    return triples
def _quoted_triple(s: str, p: str, o: str) -> Term:
"""Create a quoted triple term (RDF-star) from string values."""
@ -454,22 +500,20 @@ def focus_triples(
def synthesis_triples(
    synthesis_uri: str,
    focus_uri: str,
    document_id: Optional[str] = None,
) -> List[Triple]:
    """
    Build triples for a synthesis entity (final answer).

    Creates:
    - Entity declaration for synthesis with tg:Answer type
    - wasDerivedFrom link to focus
    - Document reference to librarian

    Args:
        synthesis_uri: URI of the synthesis entity (from synthesis_uri)
        focus_uri: URI of the parent focus entity
        document_id: Librarian document ID for the answer content

    Returns:
        List of Triple objects
    """
    triples = [
        _triple(synthesis_uri, RDF_TYPE, _iri(PROV_ENTITY)),
        _triple(synthesis_uri, RDF_TYPE, _iri(TG_SYNTHESIS)),
        # tg:Answer is the unifying "final answer" type across pipelines.
        _triple(synthesis_uri, RDF_TYPE, _iri(TG_ANSWER_TYPE)),
        _triple(synthesis_uri, RDFS_LABEL, _literal("Synthesis")),
        _triple(synthesis_uri, PROV_WAS_DERIVED_FROM, _iri(focus_uri)),
    ]
    # Answer content lives in the librarian; store a reference as an IRI.
    if document_id:
        triples.append(_triple(synthesis_uri, TG_DOCUMENT, _iri(document_id)))
    return triples
@ -533,7 +574,7 @@ def docrag_question_triples(
def docrag_exploration_triples(
exploration_uri: str,
question_uri: str,
grounding_uri: str,
chunk_count: int,
chunk_ids: Optional[List[str]] = None,
) -> List[Triple]:
@ -542,12 +583,12 @@ def docrag_exploration_triples(
Creates:
- Entity declaration with tg:Exploration type
- wasGeneratedBy link to question
- wasDerivedFrom link to grounding
- Chunk count and optional chunk references
Args:
exploration_uri: URI of the exploration entity
question_uri: URI of the parent question
grounding_uri: URI of the parent grounding entity
chunk_count: Number of chunks retrieved
chunk_ids: Optional list of chunk URIs/IDs
@ -558,7 +599,7 @@ def docrag_exploration_triples(
_triple(exploration_uri, RDF_TYPE, _iri(PROV_ENTITY)),
_triple(exploration_uri, RDF_TYPE, _iri(TG_EXPLORATION)),
_triple(exploration_uri, RDFS_LABEL, _literal("Exploration")),
_triple(exploration_uri, PROV_WAS_GENERATED_BY, _iri(question_uri)),
_triple(exploration_uri, PROV_WAS_DERIVED_FROM, _iri(grounding_uri)),
_triple(exploration_uri, TG_CHUNK_COUNT, _literal(chunk_count)),
]
@ -573,22 +614,20 @@ def docrag_exploration_triples(
def docrag_synthesis_triples(
    synthesis_uri: str,
    exploration_uri: str,
    document_id: Optional[str] = None,
) -> List[Triple]:
    """
    Build triples for a document RAG synthesis entity (final answer).

    Creates:
    - Entity declaration with tg:Synthesis and tg:Answer types
    - wasDerivedFrom link to exploration (skips focus step)
    - Document reference to librarian

    Args:
        synthesis_uri: URI of the synthesis entity
        exploration_uri: URI of the parent exploration entity
        document_id: Librarian document ID for the answer content

    Returns:
        List of Triple objects
    """
    triples = [
        _triple(synthesis_uri, RDF_TYPE, _iri(PROV_ENTITY)),
        _triple(synthesis_uri, RDF_TYPE, _iri(TG_SYNTHESIS)),
        # tg:Answer is the unifying "final answer" type across pipelines.
        _triple(synthesis_uri, RDF_TYPE, _iri(TG_ANSWER_TYPE)),
        _triple(synthesis_uri, RDFS_LABEL, _literal("Synthesis")),
        # DocumentRAG derives directly from exploration; there is no focus step.
        _triple(synthesis_uri, PROV_WAS_DERIVED_FROM, _iri(exploration_uri)),
    ]
    # Answer content lives in the librarian; store a reference as an IRI.
    if document_id:
        triples.append(_triple(synthesis_uri, TG_DOCUMENT, _iri(document_id)))
    return triples

View file

@ -68,6 +68,7 @@ def agent_uri(component_name: str) -> str:
#
# Terminology:
# Question - What was asked, the anchor for everything
# Grounding - Decomposing the question into concepts
# Exploration - Casting wide, what do we know about this space
# Focus - Closing down, what's actually relevant here
# Synthesis - Weaving the relevant pieces into an answer
@ -87,6 +88,19 @@ def question_uri(session_id: str = None) -> str:
return f"urn:trustgraph:question:{session_id}"
def grounding_uri(session_id: str) -> str:
    """
    Generate URI for a grounding entity (concept decomposition of query).

    Args:
        session_id: The session UUID (same as question_uri).

    Returns:
        URN in format: urn:trustgraph:prov:grounding:{uuid}
    """
    return "urn:trustgraph:prov:grounding:" + session_id
def exploration_uri(session_id: str) -> str:
"""
Generate URI for an exploration entity (edges retrieved from subgraph).
@ -173,6 +187,34 @@ def agent_iteration_uri(session_id: str, iteration_num: int) -> str:
return f"urn:trustgraph:agent:{session_id}/i{iteration_num}"
def agent_thought_uri(session_id: str, iteration_num: int) -> str:
    """
    Generate URI for an agent thought sub-entity.

    Args:
        session_id: The session UUID.
        iteration_num: 1-based iteration number.

    Returns:
        URN in format: urn:trustgraph:agent:{uuid}/i{num}/thought
    """
    return "urn:trustgraph:agent:{}/i{}/thought".format(session_id, iteration_num)
def agent_observation_uri(session_id: str, iteration_num: int) -> str:
    """
    Generate URI for an agent observation sub-entity.

    Args:
        session_id: The session UUID.
        iteration_num: 1-based iteration number.

    Returns:
        URN in format: urn:trustgraph:agent:{uuid}/i{num}/observation
    """
    return "urn:trustgraph:agent:{}/i{}/observation".format(
        session_id, iteration_num,
    )
def agent_final_uri(session_id: str) -> str:
"""
Generate URI for an agent final answer.
@ -205,6 +247,19 @@ def docrag_question_uri(session_id: str = None) -> str:
return f"urn:trustgraph:docrag:{session_id}"
def docrag_grounding_uri(session_id: str) -> str:
    """
    Generate URI for a document RAG grounding entity (concept decomposition).

    Args:
        session_id: The session UUID.

    Returns:
        URN in format: urn:trustgraph:docrag:{uuid}/grounding
    """
    return "urn:trustgraph:docrag:" + session_id + "/grounding"
def docrag_exploration_uri(session_id: str) -> str:
"""
Generate URI for a document RAG exploration entity (chunks retrieved).

View file

@ -25,6 +25,8 @@ from . namespaces import (
TG_LLM_MODEL, TG_ONTOLOGY, TG_EMBEDDING_MODEL,
TG_SOURCE_TEXT, TG_SOURCE_CHAR_OFFSET, TG_SOURCE_CHAR_LENGTH,
TG_DOCUMENT_TYPE, TG_PAGE_TYPE, TG_CHUNK_TYPE, TG_SUBGRAPH_TYPE,
TG_CONCEPT, TG_ENTITY, TG_GROUNDING,
TG_ANSWER_TYPE, TG_REFLECTION_TYPE, TG_THOUGHT_TYPE, TG_OBSERVATION_TYPE,
)
@ -80,6 +82,11 @@ TG_CLASS_LABELS = [
_label_triple(TG_PAGE_TYPE, "Page"),
_label_triple(TG_CHUNK_TYPE, "Chunk"),
_label_triple(TG_SUBGRAPH_TYPE, "Subgraph"),
_label_triple(TG_GROUNDING, "Grounding"),
_label_triple(TG_ANSWER_TYPE, "Answer"),
_label_triple(TG_REFLECTION_TYPE, "Reflection"),
_label_triple(TG_THOUGHT_TYPE, "Thought"),
_label_triple(TG_OBSERVATION_TYPE, "Observation"),
]
# TrustGraph predicate labels
@ -100,6 +107,8 @@ TG_PREDICATE_LABELS = [
_label_triple(TG_SOURCE_TEXT, "source text"),
_label_triple(TG_SOURCE_CHAR_OFFSET, "source character offset"),
_label_triple(TG_SOURCE_CHAR_LENGTH, "source character length"),
_label_triple(TG_CONCEPT, "concept"),
_label_triple(TG_ENTITY, "entity"),
]

View file

@ -15,6 +15,7 @@ class GraphRagQuery:
triple_limit: int = 0
max_subgraph_size: int = 0
max_path_length: int = 0
edge_limit: int = 0
streaming: bool = False
@dataclass