Terminology Rename, and named-graphs for explainability (#682)

Terminology Rename, and named-graphs for explainability data

Changed terminology:
  - session -> question
  - retrieval -> exploration
  - selection -> focus
  - answer -> synthesis

- uris.py: Renamed query_session_uri → question_uri,
  retrieval_uri → exploration_uri, selection_uri → focus_uri,
  answer_uri → synthesis_uri
- triples.py: Renamed corresponding triple generation functions with
  updated labels ("GraphRAG question", "Exploration", "Focus",
  "Synthesis")
- namespaces.py: Added named graph constants GRAPH_DEFAULT,
  GRAPH_SOURCE, GRAPH_RETRIEVAL
- __init__.py: Updated exports
- graph_rag.py: Updated to use new terminology
- invoke_graph_rag.py: Updated CLI to display new stage names
  (Question, Exploration, Focus, Synthesis)

Query-Time Explainability → Named Graph
- triples.py: Added set_graph() helper function to set named graph
  on triples
- graph_rag.py: All explainability triples now use GRAPH_RETRIEVAL
  named graph
- rag.py: Explainability triples stored in user's collection (not
  separate collection) with named graph

Extraction Provenance → Named Graph
- relationships/extract.py: Provenance triples use GRAPH_SOURCE
  named graph
- definitions/extract.py: Provenance triples use GRAPH_SOURCE
  named graph
- chunker.py: Provenance triples use GRAPH_SOURCE named graph
- pdf_decoder.py: Provenance triples use GRAPH_SOURCE named graph

CLI Updates
- show_graph.py: Added -g/--graph option to filter by named graph and
  --show-graph to display graph column

Also:
- Fix knowledge core schemas
This commit is contained in:
cybermaggedon 2026-03-10 14:35:21 +00:00 committed by GitHub
parent 57eda65674
commit e1bc4c04a4
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
17 changed files with 279 additions and 180 deletions

View file

@ -168,7 +168,7 @@ class TestGraphRagIntegration:
assert isinstance(response, str) assert isinstance(response, str)
assert "machine learning" in response.lower() assert "machine learning" in response.lower()
# Verify provenance was emitted in real-time (4 events: session, retrieval, selection, answer) # Verify provenance was emitted in real-time (4 events: question, exploration, focus, synthesis)
assert len(provenance_events) == 4 assert len(provenance_events) == 4
for triples, prov_id in provenance_events: for triples, prov_id in provenance_events:
assert isinstance(triples, list) assert isinstance(triples, list)

View file

@ -644,7 +644,7 @@ class TestQuery:
# Verify response text # Verify response text
assert response == expected_response assert response == expected_response
# Verify provenance was emitted incrementally (4 events: session, retrieval, selection, answer) # Verify provenance was emitted incrementally (4 events: question, exploration, focus, synthesis)
assert len(provenance_events) == 4 assert len(provenance_events) == 4
# Verify each event has triples and a URN # Verify each event has triples and a URN
@ -653,11 +653,11 @@ class TestQuery:
assert len(triples) > 0 assert len(triples) > 0
assert prov_id.startswith("urn:trustgraph:") assert prov_id.startswith("urn:trustgraph:")
# Verify order: session, retrieval, selection, answer # Verify order: question, exploration, focus, synthesis
assert "session" in provenance_events[0][1] assert "question" in provenance_events[0][1]
assert "retrieval" in provenance_events[1][1] assert "exploration" in provenance_events[1][1]
assert "selection" in provenance_events[2][1] assert "focus" in provenance_events[2][1]
assert "answer" in provenance_events[3][1] assert "synthesis" in provenance_events[3][1]
finally: finally:
# Restore original methods # Restore original methods

View file

@ -104,10 +104,10 @@ class GraphRagResponseTranslator(MessageTranslator):
if explain_id: if explain_id:
result["explain_id"] = explain_id result["explain_id"] = explain_id
# Include explain_collection for explain messages # Include explain_graph for explain messages (named graph filter)
explain_collection = getattr(obj, "explain_collection", None) explain_graph = getattr(obj, "explain_graph", None)
if explain_collection: if explain_graph is not None:
result["explain_collection"] = explain_collection result["explain_graph"] = explain_graph
# Include end_of_stream flag (LLM stream complete) # Include end_of_stream flag (LLM stream complete)
result["end_of_stream"] = getattr(obj, "end_of_stream", False) result["end_of_stream"] = getattr(obj, "end_of_stream", False)

View file

@ -41,10 +41,10 @@ from . uris import (
statement_uri, statement_uri,
agent_uri, agent_uri,
# Query-time provenance URIs # Query-time provenance URIs
query_session_uri, question_uri,
retrieval_uri, exploration_uri,
selection_uri, focus_uri,
answer_uri, synthesis_uri,
) )
# Namespace constants # Namespace constants
@ -65,6 +65,8 @@ from . namespaces import (
TG_SOURCE_TEXT, TG_SOURCE_CHAR_OFFSET, TG_SOURCE_CHAR_LENGTH, TG_SOURCE_TEXT, TG_SOURCE_CHAR_OFFSET, TG_SOURCE_CHAR_LENGTH,
# Query-time provenance predicates # Query-time provenance predicates
TG_QUERY, TG_EDGE_COUNT, TG_SELECTED_EDGE, TG_REASONING, TG_CONTENT, TG_QUERY, TG_EDGE_COUNT, TG_SELECTED_EDGE, TG_REASONING, TG_CONTENT,
# Named graphs
GRAPH_DEFAULT, GRAPH_SOURCE, GRAPH_RETRIEVAL,
) )
# Triple builders # Triple builders
@ -73,10 +75,12 @@ from . triples import (
derived_entity_triples, derived_entity_triples,
triple_provenance_triples, triple_provenance_triples,
# Query-time provenance triple builders # Query-time provenance triple builders
query_session_triples, question_triples,
retrieval_triples, exploration_triples,
selection_triples, focus_triples,
answer_triples, synthesis_triples,
# Utility
set_graph,
) )
# Vocabulary bootstrap # Vocabulary bootstrap
@ -99,10 +103,10 @@ __all__ = [
"statement_uri", "statement_uri",
"agent_uri", "agent_uri",
# Query-time provenance URIs # Query-time provenance URIs
"query_session_uri", "question_uri",
"retrieval_uri", "exploration_uri",
"selection_uri", "focus_uri",
"answer_uri", "synthesis_uri",
# Namespaces # Namespaces
"PROV", "PROV_ENTITY", "PROV_ACTIVITY", "PROV_AGENT", "PROV", "PROV_ENTITY", "PROV_ACTIVITY", "PROV_AGENT",
"PROV_WAS_DERIVED_FROM", "PROV_WAS_GENERATED_BY", "PROV_WAS_DERIVED_FROM", "PROV_WAS_GENERATED_BY",
@ -116,15 +120,19 @@ __all__ = [
"TG_SOURCE_TEXT", "TG_SOURCE_CHAR_OFFSET", "TG_SOURCE_CHAR_LENGTH", "TG_SOURCE_TEXT", "TG_SOURCE_CHAR_OFFSET", "TG_SOURCE_CHAR_LENGTH",
# Query-time provenance predicates # Query-time provenance predicates
"TG_QUERY", "TG_EDGE_COUNT", "TG_SELECTED_EDGE", "TG_REASONING", "TG_CONTENT", "TG_QUERY", "TG_EDGE_COUNT", "TG_SELECTED_EDGE", "TG_REASONING", "TG_CONTENT",
# Named graphs
"GRAPH_DEFAULT", "GRAPH_SOURCE", "GRAPH_RETRIEVAL",
# Triple builders # Triple builders
"document_triples", "document_triples",
"derived_entity_triples", "derived_entity_triples",
"triple_provenance_triples", "triple_provenance_triples",
# Query-time provenance triple builders # Query-time provenance triple builders
"query_session_triples", "question_triples",
"retrieval_triples", "exploration_triples",
"selection_triples", "focus_triples",
"answer_triples", "synthesis_triples",
# Utility
"set_graph",
# Vocabulary # Vocabulary
"get_vocabulary_triples", "get_vocabulary_triples",
"PROV_CLASS_LABELS", "PROV_CLASS_LABELS",

View file

@ -67,3 +67,9 @@ TG_EDGE = TG + "edge"
TG_REASONING = TG + "reasoning" TG_REASONING = TG + "reasoning"
TG_CONTENT = TG + "content" TG_CONTENT = TG + "content"
TG_DOCUMENT = TG + "document" # Reference to document in librarian TG_DOCUMENT = TG + "document" # Reference to document in librarian
# Named graph URIs for RDF datasets
# These separate different types of data while keeping them in the same collection
GRAPH_DEFAULT = "" # Core knowledge facts (triples extracted from documents)
GRAPH_SOURCE = "urn:graph:source" # Extraction provenance (which document/chunk a triple came from)
GRAPH_RETRIEVAL = "urn:graph:retrieval" # Query-time explainability (question, exploration, focus, synthesis)

View file

@ -25,6 +25,26 @@ from . namespaces import (
from . uris import activity_uri, agent_uri, edge_selection_uri from . uris import activity_uri, agent_uri, edge_selection_uri
def set_graph(triples: List[Triple], graph: str) -> List[Triple]:
"""
Set the named graph on a list of triples.
This creates new Triple objects with the graph field set,
leaving the original triples unchanged.
Args:
triples: List of Triple objects
graph: Named graph URI (e.g., "urn:graph:retrieval")
Returns:
List of Triple objects with graph field set
"""
return [
Triple(s=t.s, p=t.p, o=t.o, g=graph)
for t in triples
]
def _iri(uri: str) -> Term: def _iri(uri: str) -> Term:
"""Create an IRI term.""" """Create an IRI term."""
return Term(type=IRI, iri=uri) return Term(type=IRI, iri=uri)
@ -258,21 +278,27 @@ def triple_provenance_triples(
# Query-time provenance triple builders # Query-time provenance triple builders
#
# Terminology:
# Question - What was asked, the anchor for everything
# Exploration - Casting wide, what do we know about this space
# Focus - Closing down, what's actually relevant here
# Synthesis - Weaving the relevant pieces into an answer
def query_session_triples( def question_triples(
session_uri: str, question_uri: str,
query: str, query: str,
timestamp: Optional[str] = None, timestamp: Optional[str] = None,
) -> List[Triple]: ) -> List[Triple]:
""" """
Build triples for a query session activity. Build triples for a question activity.
Creates: Creates:
- Activity declaration for the query session - Activity declaration for the question
- Query text and timestamp - Query text and timestamp
Args: Args:
session_uri: URI of the session (from query_session_uri) question_uri: URI of the question (from question_uri)
query: The user's query text query: The user's query text
timestamp: ISO timestamp (defaults to now) timestamp: ISO timestamp (defaults to now)
@ -283,39 +309,39 @@ def query_session_triples(
timestamp = datetime.utcnow().isoformat() + "Z" timestamp = datetime.utcnow().isoformat() + "Z"
return [ return [
_triple(session_uri, RDF_TYPE, _iri(PROV_ACTIVITY)), _triple(question_uri, RDF_TYPE, _iri(PROV_ACTIVITY)),
_triple(session_uri, RDFS_LABEL, _literal("GraphRAG query session")), _triple(question_uri, RDFS_LABEL, _literal("GraphRAG question")),
_triple(session_uri, PROV_STARTED_AT_TIME, _literal(timestamp)), _triple(question_uri, PROV_STARTED_AT_TIME, _literal(timestamp)),
_triple(session_uri, TG_QUERY, _literal(query)), _triple(question_uri, TG_QUERY, _literal(query)),
] ]
def retrieval_triples( def exploration_triples(
retrieval_uri: str, exploration_uri: str,
session_uri: str, question_uri: str,
edge_count: int, edge_count: int,
) -> List[Triple]: ) -> List[Triple]:
""" """
Build triples for a retrieval entity (all edges retrieved from subgraph). Build triples for an exploration entity (all edges retrieved from subgraph).
Creates: Creates:
- Entity declaration for retrieval - Entity declaration for exploration
- wasGeneratedBy link to session - wasGeneratedBy link to question
- Edge count metadata - Edge count metadata
Args: Args:
retrieval_uri: URI of the retrieval entity (from retrieval_uri) exploration_uri: URI of the exploration entity (from exploration_uri)
session_uri: URI of the parent session question_uri: URI of the parent question
edge_count: Number of edges retrieved edge_count: Number of edges retrieved
Returns: Returns:
List of Triple objects List of Triple objects
""" """
return [ return [
_triple(retrieval_uri, RDF_TYPE, _iri(PROV_ENTITY)), _triple(exploration_uri, RDF_TYPE, _iri(PROV_ENTITY)),
_triple(retrieval_uri, RDFS_LABEL, _literal("Retrieved edges")), _triple(exploration_uri, RDFS_LABEL, _literal("Exploration")),
_triple(retrieval_uri, PROV_WAS_GENERATED_BY, _iri(session_uri)), _triple(exploration_uri, PROV_WAS_GENERATED_BY, _iri(question_uri)),
_triple(retrieval_uri, TG_EDGE_COUNT, _literal(edge_count)), _triple(exploration_uri, TG_EDGE_COUNT, _literal(edge_count)),
] ]
@ -327,28 +353,28 @@ def _quoted_triple(s: str, p: str, o: str) -> Term:
) )
def selection_triples( def focus_triples(
selection_uri: str, focus_uri: str,
retrieval_uri: str, exploration_uri: str,
selected_edges_with_reasoning: List[dict], selected_edges_with_reasoning: List[dict],
session_id: str = "", session_id: str = "",
) -> List[Triple]: ) -> List[Triple]:
""" """
Build triples for a selection entity (selected edges with reasoning). Build triples for a focus entity (selected edges with reasoning).
Creates: Creates:
- Entity declaration for selection - Entity declaration for focus
- wasDerivedFrom link to retrieval - wasDerivedFrom link to exploration
- For each selected edge: an edge selection entity with quoted triple and reasoning - For each selected edge: an edge selection entity with quoted triple and reasoning
Structure: Structure:
<selection> tg:selectedEdge <edge_sel_1> . <focus> tg:selectedEdge <edge_sel_1> .
<edge_sel_1> tg:edge << <s> <p> <o> >> . <edge_sel_1> tg:edge << <s> <p> <o> >> .
<edge_sel_1> tg:reasoning "reason" . <edge_sel_1> tg:reasoning "reason" .
Args: Args:
selection_uri: URI of the selection entity (from selection_uri) focus_uri: URI of the focus entity (from focus_uri)
retrieval_uri: URI of the parent retrieval entity exploration_uri: URI of the parent exploration entity
selected_edges_with_reasoning: List of dicts with 'edge' (s,p,o tuple) and 'reasoning' selected_edges_with_reasoning: List of dicts with 'edge' (s,p,o tuple) and 'reasoning'
session_id: Session UUID for generating edge selection URIs session_id: Session UUID for generating edge selection URIs
@ -356,9 +382,9 @@ def selection_triples(
List of Triple objects List of Triple objects
""" """
triples = [ triples = [
_triple(selection_uri, RDF_TYPE, _iri(PROV_ENTITY)), _triple(focus_uri, RDF_TYPE, _iri(PROV_ENTITY)),
_triple(selection_uri, RDFS_LABEL, _literal("Selected edges")), _triple(focus_uri, RDFS_LABEL, _literal("Focus")),
_triple(selection_uri, PROV_WAS_DERIVED_FROM, _iri(retrieval_uri)), _triple(focus_uri, PROV_WAS_DERIVED_FROM, _iri(exploration_uri)),
] ]
# Add each selected edge with its reasoning via intermediate entity # Add each selected edge with its reasoning via intermediate entity
@ -372,9 +398,9 @@ def selection_triples(
# Create intermediate entity for this edge selection # Create intermediate entity for this edge selection
edge_sel_uri = edge_selection_uri(session_id, idx) edge_sel_uri = edge_selection_uri(session_id, idx)
# Link selection to edge selection entity # Link focus to edge selection entity
triples.append( triples.append(
_triple(selection_uri, TG_SELECTED_EDGE, _iri(edge_sel_uri)) _triple(focus_uri, TG_SELECTED_EDGE, _iri(edge_sel_uri))
) )
# Attach quoted triple to edge selection entity # Attach quoted triple to edge selection entity
@ -392,23 +418,23 @@ def selection_triples(
return triples return triples
def answer_triples( def synthesis_triples(
answer_uri: str, synthesis_uri: str,
selection_uri: str, focus_uri: str,
answer_text: str = "", answer_text: str = "",
document_id: Optional[str] = None, document_id: Optional[str] = None,
) -> List[Triple]: ) -> List[Triple]:
""" """
Build triples for an answer entity (final synthesis text). Build triples for a synthesis entity (final answer text).
Creates: Creates:
- Entity declaration for answer - Entity declaration for synthesis
- wasDerivedFrom link to selection - wasDerivedFrom link to focus
- Either document reference (if document_id provided) or inline content - Either document reference (if document_id provided) or inline content
Args: Args:
answer_uri: URI of the answer entity (from answer_uri) synthesis_uri: URI of the synthesis entity (from synthesis_uri)
selection_uri: URI of the parent selection entity focus_uri: URI of the parent focus entity
answer_text: The synthesized answer text (used if no document_id) answer_text: The synthesized answer text (used if no document_id)
document_id: Optional librarian document ID (preferred over inline content) document_id: Optional librarian document ID (preferred over inline content)
@ -416,16 +442,16 @@ def answer_triples(
List of Triple objects List of Triple objects
""" """
triples = [ triples = [
_triple(answer_uri, RDF_TYPE, _iri(PROV_ENTITY)), _triple(synthesis_uri, RDF_TYPE, _iri(PROV_ENTITY)),
_triple(answer_uri, RDFS_LABEL, _literal("GraphRAG answer")), _triple(synthesis_uri, RDFS_LABEL, _literal("Synthesis")),
_triple(answer_uri, PROV_WAS_DERIVED_FROM, _iri(selection_uri)), _triple(synthesis_uri, PROV_WAS_DERIVED_FROM, _iri(focus_uri)),
] ]
if document_id: if document_id:
# Store reference to document in librarian (as IRI) # Store reference to document in librarian (as IRI)
triples.append(_triple(answer_uri, TG_DOCUMENT, _iri(document_id))) triples.append(_triple(synthesis_uri, TG_DOCUMENT, _iri(document_id)))
elif answer_text: elif answer_text:
# Fallback: store inline content # Fallback: store inline content
triples.append(_triple(answer_uri, TG_CONTENT, _literal(answer_text))) triples.append(_triple(synthesis_uri, TG_CONTENT, _literal(answer_text)))
return triples return triples

View file

@ -65,59 +65,65 @@ def agent_uri(component_name: str) -> str:
# Query-time provenance URIs # Query-time provenance URIs
# These URIs use the urn:trustgraph: namespace to distinguish query-time # These URIs use the urn:trustgraph: namespace to distinguish query-time
# provenance from extraction-time provenance (which uses https://trustgraph.ai/) # provenance from extraction-time provenance (which uses https://trustgraph.ai/)
#
# Terminology:
# Question - What was asked, the anchor for everything
# Exploration - Casting wide, what do we know about this space
# Focus - Closing down, what's actually relevant here
# Synthesis - Weaving the relevant pieces into an answer
def query_session_uri(session_id: str = None) -> str: def question_uri(session_id: str = None) -> str:
""" """
Generate URI for a query session activity. Generate URI for a question activity.
Args: Args:
session_id: Optional UUID string. Auto-generates if not provided. session_id: Optional UUID string. Auto-generates if not provided.
Returns: Returns:
URN in format: urn:trustgraph:session:{uuid} URN in format: urn:trustgraph:question:{uuid}
""" """
if session_id is None: if session_id is None:
session_id = str(uuid.uuid4()) session_id = str(uuid.uuid4())
return f"urn:trustgraph:session:{session_id}" return f"urn:trustgraph:question:{session_id}"
def retrieval_uri(session_id: str) -> str: def exploration_uri(session_id: str) -> str:
""" """
Generate URI for a retrieval entity (edges retrieved from subgraph). Generate URI for an exploration entity (edges retrieved from subgraph).
Args: Args:
session_id: The session UUID (same as query_session_uri). session_id: The session UUID (same as question_uri).
Returns: Returns:
URN in format: urn:trustgraph:prov:retrieval:{uuid} URN in format: urn:trustgraph:prov:exploration:{uuid}
""" """
return f"urn:trustgraph:prov:retrieval:{session_id}" return f"urn:trustgraph:prov:exploration:{session_id}"
def selection_uri(session_id: str) -> str: def focus_uri(session_id: str) -> str:
""" """
Generate URI for a selection entity (selected edges with reasoning). Generate URI for a focus entity (selected edges with reasoning).
Args: Args:
session_id: The session UUID (same as query_session_uri). session_id: The session UUID (same as question_uri).
Returns: Returns:
URN in format: urn:trustgraph:prov:selection:{uuid} URN in format: urn:trustgraph:prov:focus:{uuid}
""" """
return f"urn:trustgraph:prov:selection:{session_id}" return f"urn:trustgraph:prov:focus:{session_id}"
def answer_uri(session_id: str) -> str: def synthesis_uri(session_id: str) -> str:
""" """
Generate URI for an answer entity (final synthesis text). Generate URI for a synthesis entity (final answer text).
Args: Args:
session_id: The session UUID (same as query_session_uri). session_id: The session UUID (same as question_uri).
Returns: Returns:
URN in format: urn:trustgraph:prov:answer:{uuid} URN in format: urn:trustgraph:prov:synthesis:{uuid}
""" """
return f"urn:trustgraph:prov:answer:{session_id}" return f"urn:trustgraph:prov:synthesis:{session_id}"
def edge_selection_uri(session_id: str, edge_index: int) -> str: def edge_selection_uri(session_id: str, edge_index: int) -> str:

View file

@ -22,8 +22,8 @@ class GraphRagResponse:
error: Error | None = None error: Error | None = None
response: str = "" response: str = ""
end_of_stream: bool = False # LLM response stream complete end_of_stream: bool = False # LLM response stream complete
explain_id: str | None = None # Single explain URI (announced as created) explain_id: str | None = None # Single explain URI (announced as created)
explain_collection: str | None = None # Collection where explain was stored explain_graph: str | None = None # Named graph where explain was stored (e.g., urn:graph:retrieval)
message_type: str = "" # "chunk" or "explain" message_type: str = "" # "chunk" or "explain"
end_of_session: bool = False # Entire session complete end_of_session: bool = False # Entire session complete

View file

@ -36,14 +36,14 @@ RDFS_LABEL = "http://www.w3.org/2000/01/rdf-schema#label"
def _get_event_type(prov_id): def _get_event_type(prov_id):
"""Extract event type from provenance_id""" """Extract event type from provenance_id"""
if "session" in prov_id: if "question" in prov_id:
return "session" return "question"
elif "retrieval" in prov_id: elif "exploration" in prov_id:
return "retrieval" return "exploration"
elif "selection" in prov_id: elif "focus" in prov_id:
return "selection" return "focus"
elif "answer" in prov_id: elif "synthesis" in prov_id:
return "answer" return "synthesis"
return "provenance" return "provenance"
@ -51,7 +51,7 @@ def _format_provenance_details(event_type, triples):
"""Format provenance details based on event type and triples""" """Format provenance details based on event type and triples"""
lines = [] lines = []
if event_type == "session": if event_type == "question":
# Show query and timestamp # Show query and timestamp
for s, p, o in triples: for s, p, o in triples:
if p == TG_QUERY: if p == TG_QUERY:
@ -59,32 +59,32 @@ def _format_provenance_details(event_type, triples):
elif p == PROV_STARTED_AT_TIME: elif p == PROV_STARTED_AT_TIME:
lines.append(f" Time: {o}") lines.append(f" Time: {o}")
elif event_type == "retrieval": elif event_type == "exploration":
# Show edge count # Show edge count
for s, p, o in triples: for s, p, o in triples:
if p == TG_EDGE_COUNT: if p == TG_EDGE_COUNT:
lines.append(f" Edges retrieved: {o}") lines.append(f" Edges explored: {o}")
elif event_type == "selection": elif event_type == "focus":
# For selection, just count edge selection URIs # For focus, just count edge selection URIs
# The actual edge details are fetched separately via edge_selections parameter # The actual edge details are fetched separately via edge_selections parameter
edge_sel_uris = [] edge_sel_uris = []
for s, p, o in triples: for s, p, o in triples:
if p == TG_SELECTED_EDGE: if p == TG_SELECTED_EDGE:
edge_sel_uris.append(o) edge_sel_uris.append(o)
if edge_sel_uris: if edge_sel_uris:
lines.append(f" Selected {len(edge_sel_uris)} edge(s)") lines.append(f" Focused on {len(edge_sel_uris)} edge(s)")
elif event_type == "answer": elif event_type == "synthesis":
# Show content length (not full content - it's already streamed) # Show content length (not full content - it's already streamed)
for s, p, o in triples: for s, p, o in triples:
if p == TG_CONTENT: if p == TG_CONTENT:
lines.append(f" Answer length: {len(o)} chars") lines.append(f" Synthesis length: {len(o)} chars")
return lines return lines
async def _query_triples_once(ws_url, flow_id, prov_id, user, collection, debug=False): async def _query_triples_once(ws_url, flow_id, prov_id, user, collection, graph=None, debug=False):
"""Query triples for a provenance node (single attempt)""" """Query triples for a provenance node (single attempt)"""
request = { request = {
"id": "triples-request", "id": "triples-request",
@ -97,6 +97,9 @@ async def _query_triples_once(ws_url, flow_id, prov_id, user, collection, debug=
"limit": 100 "limit": 100
} }
} }
# Add graph filter if specified (for named graph queries)
if graph is not None:
request["request"]["g"] = graph
if debug: if debug:
print(f" [debug] querying triples for s={prov_id}", file=sys.stderr) print(f" [debug] querying triples for s={prov_id}", file=sys.stderr)
@ -155,10 +158,10 @@ async def _query_triples_once(ws_url, flow_id, prov_id, user, collection, debug=
return triples return triples
async def _query_triples(ws_url, flow_id, prov_id, user, collection, max_retries=5, retry_delay=0.2, debug=False): async def _query_triples(ws_url, flow_id, prov_id, user, collection, graph=None, max_retries=5, retry_delay=0.2, debug=False):
"""Query triples for a provenance node with retries for race condition""" """Query triples for a provenance node with retries for race condition"""
for attempt in range(max_retries): for attempt in range(max_retries):
triples = await _query_triples_once(ws_url, flow_id, prov_id, user, collection, debug) triples = await _query_triples_once(ws_url, flow_id, prov_id, user, collection, graph=graph, debug=debug)
if triples: if triples:
return triples return triples
# Wait before retry if empty (triples may not be stored yet) # Wait before retry if empty (triples may not be stored yet)
@ -515,14 +518,14 @@ async def _question_explainable(
if message_type == "explain": if message_type == "explain":
# Display explain event with details # Display explain event with details
explain_id = resp.get("explain_id", "") explain_id = resp.get("explain_id", "")
explain_collection = resp.get("explain_collection", "explainability") explain_graph = resp.get("explain_graph") # Named graph (e.g., urn:graph:retrieval)
if explain_id: if explain_id:
event_type = _get_event_type(explain_id) event_type = _get_event_type(explain_id)
print(f"\n [{event_type}] {explain_id}", file=sys.stderr) print(f"\n [{event_type}] {explain_id}", file=sys.stderr)
# Query triples for this explain node (using explain collection from event) # Query triples for this explain node (using named graph filter)
triples = await _query_triples( triples = await _query_triples(
ws_url, flow_id, explain_id, user, explain_collection, debug=debug ws_url, flow_id, explain_id, user, collection, graph=explain_graph, debug=debug
) )
# Format and display details # Format and display details
@ -530,17 +533,17 @@ async def _question_explainable(
for line in details: for line in details:
print(line, file=sys.stderr) print(line, file=sys.stderr)
# For selection events, query each edge selection for details # For focus events, query each edge selection for details
if event_type == "selection": if event_type == "focus":
for s, p, o in triples: for s, p, o in triples:
if debug: if debug:
print(f" [debug] triple: p={p}, o={o}, o_type={type(o).__name__}", file=sys.stderr) print(f" [debug] triple: p={p}, o={o}, o_type={type(o).__name__}", file=sys.stderr)
if p == TG_SELECTED_EDGE and isinstance(o, str): if p == TG_SELECTED_EDGE and isinstance(o, str):
if debug: if debug:
print(f" [debug] querying edge selection: {o}", file=sys.stderr) print(f" [debug] querying edge selection: {o}", file=sys.stderr)
# Query the edge selection entity (using explain collection from event) # Query the edge selection entity (using named graph filter)
edge_triples = await _query_triples( edge_triples = await _query_triples(
ws_url, flow_id, o, user, explain_collection, debug=debug ws_url, flow_id, o, user, collection, graph=explain_graph, debug=debug
) )
if debug: if debug:
print(f" [debug] got {len(edge_triples)} edge triples", file=sys.stderr) print(f" [debug] got {len(edge_triples)} edge triples", file=sys.stderr)
@ -743,7 +746,7 @@ def main():
parser.add_argument( parser.add_argument(
'-x', '--explainable', '-x', '--explainable',
action='store_true', action='store_true',
help='Show provenance events for explainability (implies streaming)' help='Show provenance events: Question, Exploration, Focus, Synthesis (implies streaming)'
) )
parser.add_argument( parser.add_argument(

View file

@ -1,6 +1,11 @@
""" """
Connects to the graph query service and dumps all graph edges. Connects to the graph query service and dumps all graph edges.
Uses streaming mode for lower time-to-first-result and reduced memory overhead. Uses streaming mode for lower time-to-first-result and reduced memory overhead.
Named graphs:
- Default graph (empty): Core knowledge facts
- urn:graph:source: Extraction provenance (document/chunk sources)
- urn:graph:retrieval: Query-time explainability (question, exploration, focus, synthesis)
""" """
import argparse import argparse
@ -12,7 +17,13 @@ default_user = 'trustgraph'
default_collection = 'default' default_collection = 'default'
default_token = os.getenv("TRUSTGRAPH_TOKEN", None) default_token = os.getenv("TRUSTGRAPH_TOKEN", None)
def show_graph(url, flow_id, user, collection, limit, batch_size, token=None): # Named graph constants for convenience
GRAPH_DEFAULT = ""
GRAPH_SOURCE = "urn:graph:source"
GRAPH_RETRIEVAL = "urn:graph:retrieval"
def show_graph(url, flow_id, user, collection, limit, batch_size, graph=None, show_graph_column=False, token=None):
socket = Api(url, token=token).socket() socket = Api(url, token=token).socket()
flow = socket.flow(flow_id) flow = socket.flow(flow_id)
@ -22,6 +33,7 @@ def show_graph(url, flow_id, user, collection, limit, batch_size, token=None):
user=user, user=user,
collection=collection, collection=collection,
s=None, p=None, o=None, s=None, p=None, o=None,
g=graph, # Filter by named graph (None = all graphs)
limit=limit, limit=limit,
batch_size=batch_size, batch_size=batch_size,
): ):
@ -29,11 +41,16 @@ def show_graph(url, flow_id, user, collection, limit, batch_size, token=None):
s = triple.get("s", {}) s = triple.get("s", {})
p = triple.get("p", {}) p = triple.get("p", {})
o = triple.get("o", {}) o = triple.get("o", {})
g = triple.get("g") # Named graph (None = default graph)
# Format terms for display # Format terms for display
s_str = s.get("v", s.get("i", str(s))) s_str = s.get("v", s.get("i", str(s)))
p_str = p.get("v", p.get("i", str(p))) p_str = p.get("v", p.get("i", str(p)))
o_str = o.get("v", o.get("i", str(o))) o_str = o.get("v", o.get("i", str(o)))
print(s_str, p_str, o_str) if show_graph_column:
g_str = g if g else "(default)"
print(f"[{g_str}]", s_str, p_str, o_str)
else:
print(s_str, p_str, o_str)
finally: finally:
socket.close() socket.close()
@ -88,8 +105,25 @@ def main():
help='Triples per streaming batch (default: 20)', help='Triples per streaming batch (default: 20)',
) )
parser.add_argument(
'-g', '--graph',
default=None,
help='Filter by named graph (e.g., urn:graph:source, urn:graph:retrieval). Use "" for default graph only.',
)
parser.add_argument(
'--show-graph',
action='store_true',
help='Show graph column in output',
)
args = parser.parse_args() args = parser.parse_args()
# Handle empty string for default graph filter
graph = args.graph
if graph == '""' or graph == "''":
graph = "" # Filter to default graph only
try: try:
show_graph( show_graph(
@ -99,6 +133,8 @@ def main():
collection = args.collection, collection = args.collection,
limit = args.limit, limit = args.limit,
batch_size = args.batch_size, batch_size = args.batch_size,
graph = graph,
show_graph_column = args.show_graph,
token = args.token, token = args.token,
) )

View file

@ -14,6 +14,7 @@ from ... base import ChunkingService, ConsumerSpec, ProducerSpec
from ... provenance import ( from ... provenance import (
page_uri, chunk_uri_from_page, chunk_uri_from_doc, page_uri, chunk_uri_from_page, chunk_uri_from_doc,
derived_entity_triples, document_uri, derived_entity_triples, document_uri,
set_graph, GRAPH_SOURCE,
) )
# Component identification for provenance # Component identification for provenance
@ -160,7 +161,7 @@ class Processor(ChunkingService):
title=f"Chunk {chunk_index}", title=f"Chunk {chunk_index}",
) )
# Emit provenance triples # Emit provenance triples (stored in source graph for separation from core knowledge)
prov_triples = derived_entity_triples( prov_triples = derived_entity_triples(
entity_uri=chunk_uri, entity_uri=chunk_uri,
parent_uri=parent_uri, parent_uri=parent_uri,
@ -181,7 +182,7 @@ class Processor(ChunkingService):
user=v.metadata.user, user=v.metadata.user,
collection=v.metadata.collection, collection=v.metadata.collection,
), ),
triples=prov_triples, triples=set_graph(prov_triples, GRAPH_SOURCE),
)) ))
# Forward chunk ID + content (post-chunker optimization) # Forward chunk ID + content (post-chunker optimization)

View file

@ -24,6 +24,7 @@ from ... base import Consumer, Producer, ConsumerMetrics, ProducerMetrics
from ... provenance import ( from ... provenance import (
document_uri, page_uri, derived_entity_triples, document_uri, page_uri, derived_entity_triples,
set_graph, GRAPH_SOURCE,
) )
# Component identification for provenance # Component identification for provenance
@ -285,7 +286,7 @@ class Processor(FlowProcessor):
title=f"Page {page_num}", title=f"Page {page_num}",
) )
# Emit provenance triples # Emit provenance triples (stored in source graph for separation from core knowledge)
doc_uri = document_uri(source_doc_id) doc_uri = document_uri(source_doc_id)
pg_uri = page_uri(source_doc_id, page_num) pg_uri = page_uri(source_doc_id, page_num)
@ -305,7 +306,7 @@ class Processor(FlowProcessor):
user=v.metadata.user, user=v.metadata.user,
collection=v.metadata.collection, collection=v.metadata.collection,
), ),
triples=prov_triples, triples=set_graph(prov_triples, GRAPH_SOURCE),
)) ))
# Forward page document ID to chunker # Forward page document ID to chunker

View file

@ -20,7 +20,7 @@ from .... rdf import TRUSTGRAPH_ENTITIES, DEFINITION, RDF_LABEL, SUBJECT_OF
from .... base import FlowProcessor, ConsumerSpec, ProducerSpec from .... base import FlowProcessor, ConsumerSpec, ProducerSpec
from .... base import PromptClientSpec, ParameterSpec from .... base import PromptClientSpec, ParameterSpec
from .... provenance import statement_uri, triple_provenance_triples from .... provenance import statement_uri, triple_provenance_triples, set_graph, GRAPH_SOURCE
from .... flow_version import __version__ as COMPONENT_VERSION from .... flow_version import __version__ as COMPONENT_VERSION
DEFINITION_VALUE = Term(type=IRI, iri=DEFINITION) DEFINITION_VALUE = Term(type=IRI, iri=DEFINITION)
@ -175,6 +175,7 @@ class Processor(FlowProcessor):
triples.append(definition_triple) triples.append(definition_triple)
# Generate provenance for the definition triple (reification) # Generate provenance for the definition triple (reification)
# Provenance triples go in the source graph for separation from core knowledge
stmt_uri = statement_uri() stmt_uri = statement_uri()
prov_triples = triple_provenance_triples( prov_triples = triple_provenance_triples(
stmt_uri=stmt_uri, stmt_uri=stmt_uri,
@ -185,7 +186,7 @@ class Processor(FlowProcessor):
llm_model=llm_model, llm_model=llm_model,
ontology_uri=ontology_uri, ontology_uri=ontology_uri,
) )
triples.extend(prov_triples) triples.extend(set_graph(prov_triples, GRAPH_SOURCE))
# Link entity to chunk (not top-level document) # Link entity to chunk (not top-level document)
triples.append(Triple( triples.append(Triple(

View file

@ -20,7 +20,7 @@ from .... rdf import RDF_LABEL, TRUSTGRAPH_ENTITIES, SUBJECT_OF
from .... base import FlowProcessor, ConsumerSpec, ProducerSpec from .... base import FlowProcessor, ConsumerSpec, ProducerSpec
from .... base import PromptClientSpec, ParameterSpec from .... base import PromptClientSpec, ParameterSpec
from .... provenance import statement_uri, triple_provenance_triples from .... provenance import statement_uri, triple_provenance_triples, set_graph, GRAPH_SOURCE
from .... flow_version import __version__ as COMPONENT_VERSION from .... flow_version import __version__ as COMPONENT_VERSION
RDF_LABEL_VALUE = Term(type=IRI, iri=RDF_LABEL) RDF_LABEL_VALUE = Term(type=IRI, iri=RDF_LABEL)
@ -162,6 +162,7 @@ class Processor(FlowProcessor):
triples.append(relationship_triple) triples.append(relationship_triple)
# Generate provenance for the relationship triple (reification) # Generate provenance for the relationship triple (reification)
# Provenance triples go in the source graph for separation from core knowledge
stmt_uri = statement_uri() stmt_uri = statement_uri()
prov_triples = triple_provenance_triples( prov_triples = triple_provenance_triples(
stmt_uri=stmt_uri, stmt_uri=stmt_uri,
@ -172,7 +173,7 @@ class Processor(FlowProcessor):
llm_model=llm_model, llm_model=llm_model,
ontology_uri=ontology_uri, ontology_uri=ontology_uri,
) )
triples.extend(prov_triples) triples.extend(set_graph(prov_triples, GRAPH_SOURCE))
# Label for s # Label for s
triples.append(Triple( triples.append(Triple(

View file

@ -12,14 +12,16 @@ from ... schema import IRI, LITERAL
# Provenance imports # Provenance imports
from trustgraph.provenance import ( from trustgraph.provenance import (
query_session_uri, question_uri,
retrieval_uri as make_retrieval_uri, exploration_uri as make_exploration_uri,
selection_uri as make_selection_uri, focus_uri as make_focus_uri,
answer_uri as make_answer_uri, synthesis_uri as make_synthesis_uri,
query_session_triples, question_triples,
retrieval_triples, exploration_triples,
selection_triples, focus_triples,
answer_triples, synthesis_triples,
set_graph,
GRAPH_RETRIEVAL,
) )
# Module logger # Module logger
@ -396,17 +398,20 @@ class GraphRag:
# Generate explainability URIs upfront # Generate explainability URIs upfront
session_id = str(uuid.uuid4()) session_id = str(uuid.uuid4())
session_uri = query_session_uri(session_id) q_uri = question_uri(session_id)
ret_uri = make_retrieval_uri(session_id) exp_uri = make_exploration_uri(session_id)
sel_uri = make_selection_uri(session_id) foc_uri = make_focus_uri(session_id)
ans_uri = make_answer_uri(session_id) syn_uri = make_synthesis_uri(session_id)
timestamp = datetime.utcnow().isoformat() + "Z" timestamp = datetime.utcnow().isoformat() + "Z"
# Emit session explainability immediately # Emit question explainability immediately
if explain_callback: if explain_callback:
session_triples = query_session_triples(session_uri, query, timestamp) q_triples = set_graph(
await explain_callback(session_triples, session_uri) question_triples(q_uri, query, timestamp),
GRAPH_RETRIEVAL
)
await explain_callback(q_triples, q_uri)
q = Query( q = Query(
rag = self, user = user, collection = collection, rag = self, user = user, collection = collection,
@ -418,10 +423,13 @@ class GraphRag:
kg, uri_map = await q.get_labelgraph(query) kg, uri_map = await q.get_labelgraph(query)
# Emit retrieval explain after graph retrieval completes # Emit exploration explain after graph retrieval completes
if explain_callback: if explain_callback:
ret_triples = retrieval_triples(ret_uri, session_uri, len(kg)) exp_triples = set_graph(
await explain_callback(ret_triples, ret_uri) exploration_triples(exp_uri, q_uri, len(kg)),
GRAPH_RETRIEVAL
)
await explain_callback(exp_triples, exp_uri)
if self.verbose: if self.verbose:
logger.debug("Invoking LLM...") logger.debug("Invoking LLM...")
@ -511,12 +519,15 @@ class GraphRag:
if self.verbose: if self.verbose:
logger.debug(f"Filtered to {len(selected_edges)} edges") logger.debug(f"Filtered to {len(selected_edges)} edges")
# Emit selection explain after edge selection completes # Emit focus explain after edge selection completes
if explain_callback: if explain_callback:
sel_triples = selection_triples( foc_triples = set_graph(
sel_uri, ret_uri, selected_edges_with_reasoning, session_id focus_triples(
foc_uri, exp_uri, selected_edges_with_reasoning, session_id
),
GRAPH_RETRIEVAL
) )
await explain_callback(sel_triples, sel_uri) await explain_callback(foc_triples, foc_uri)
# Step 2: Synthesis - LLM generates answer from selected edges only # Step 2: Synthesis - LLM generates answer from selected edges only
selected_edge_dicts = [ selected_edge_dicts = [
@ -554,30 +565,33 @@ class GraphRag:
if self.verbose: if self.verbose:
logger.debug("Query processing complete") logger.debug("Query processing complete")
# Emit answer explain after synthesis completes # Emit synthesis explain after synthesis completes
if explain_callback: if explain_callback:
answer_doc_id = None synthesis_doc_id = None
answer_text = resp if resp else "" answer_text = resp if resp else ""
# Save answer to librarian if callback provided # Save answer to librarian if callback provided
if save_answer_callback and answer_text: if save_answer_callback and answer_text:
# Generate document ID as URN matching query-time provenance format # Generate document ID as URN matching query-time provenance format
answer_doc_id = f"urn:trustgraph:answer:{session_id}" synthesis_doc_id = f"urn:trustgraph:synthesis:{session_id}"
try: try:
await save_answer_callback(answer_doc_id, answer_text) await save_answer_callback(synthesis_doc_id, answer_text)
if self.verbose: if self.verbose:
logger.debug(f"Saved answer to librarian: {answer_doc_id}") logger.debug(f"Saved answer to librarian: {synthesis_doc_id}")
except Exception as e: except Exception as e:
logger.warning(f"Failed to save answer to librarian: {e}") logger.warning(f"Failed to save answer to librarian: {e}")
answer_doc_id = None # Fall back to inline content synthesis_doc_id = None # Fall back to inline content
# Generate triples with document reference or inline content # Generate triples with document reference or inline content
ans_triples = answer_triples( syn_triples = set_graph(
ans_uri, sel_uri, synthesis_triples(
answer_text="" if answer_doc_id else answer_text, syn_uri, foc_uri,
document_id=answer_doc_id, answer_text="" if synthesis_doc_id else answer_text,
document_id=synthesis_doc_id,
),
GRAPH_RETRIEVAL
) )
await explain_callback(ans_triples, ans_uri) await explain_callback(syn_triples, syn_uri)
if self.verbose: if self.verbose:
logger.debug(f"Emitted explain for session {session_id}") logger.debug(f"Emitted explain for session {session_id}")

View file

@ -13,6 +13,7 @@ from ... schema import GraphRagQuery, GraphRagResponse, Error
from ... schema import Triples, Metadata from ... schema import Triples, Metadata
from ... schema import LibrarianRequest, LibrarianResponse, DocumentMetadata from ... schema import LibrarianRequest, LibrarianResponse, DocumentMetadata
from ... schema import librarian_request_queue, librarian_response_queue from ... schema import librarian_request_queue, librarian_response_queue
from ... provenance import GRAPH_RETRIEVAL
from . graph_rag import GraphRag from . graph_rag import GraphRag
from ... base import FlowProcessor, ConsumerSpec, ProducerSpec from ... base import FlowProcessor, ConsumerSpec, ProducerSpec
from ... base import PromptClientSpec, EmbeddingsClientSpec from ... base import PromptClientSpec, EmbeddingsClientSpec
@ -38,7 +39,6 @@ class Processor(FlowProcessor):
triple_limit = params.get("triple_limit", 30) triple_limit = params.get("triple_limit", 30)
max_subgraph_size = params.get("max_subgraph_size", 150) max_subgraph_size = params.get("max_subgraph_size", 150)
max_path_length = params.get("max_path_length", 2) max_path_length = params.get("max_path_length", 2)
explainability_collection = params.get("explainability_collection", "explainability")
super(Processor, self).__init__( super(Processor, self).__init__(
**params | { **params | {
@ -48,7 +48,6 @@ class Processor(FlowProcessor):
"triple_limit": triple_limit, "triple_limit": triple_limit,
"max_subgraph_size": max_subgraph_size, "max_subgraph_size": max_subgraph_size,
"max_path_length": max_path_length, "max_path_length": max_path_length,
"explainability_collection": explainability_collection,
} }
) )
@ -56,7 +55,6 @@ class Processor(FlowProcessor):
self.default_triple_limit = triple_limit self.default_triple_limit = triple_limit
self.default_max_subgraph_size = max_subgraph_size self.default_max_subgraph_size = max_subgraph_size
self.default_max_path_length = max_path_length self.default_max_path_length = max_path_length
self.explainability_collection = explainability_collection
# CRITICAL SECURITY: NEVER share data between users or collections # CRITICAL SECURITY: NEVER share data between users or collections
# Each user/collection combination MUST have isolated data access # Each user/collection combination MUST have isolated data access
@ -239,24 +237,25 @@ class Processor(FlowProcessor):
explainability_refs_emitted = [] explainability_refs_emitted = []
# Real-time explainability callback - emits triples and IDs as they're generated # Real-time explainability callback - emits triples and IDs as they're generated
# Triples are stored in the user's collection with a named graph (urn:graph:retrieval)
async def send_explainability(triples, explain_id): async def send_explainability(triples, explain_id):
# Send triples to explainability queue # Send triples to explainability queue - stores in same collection with named graph
await flow("explainability").send(Triples( await flow("explainability").send(Triples(
metadata=Metadata( metadata=Metadata(
id=explain_id, id=explain_id,
metadata=[], metadata=[],
user=v.user, user=v.user,
collection=self.explainability_collection, collection=v.collection, # Store in user's collection, not separate explainability collection
), ),
triples=triples, triples=triples,
)) ))
# Send explain ID and collection to response queue # Send explain ID and graph to response queue
await flow("response").send( await flow("response").send(
GraphRagResponse( GraphRagResponse(
message_type="explain", message_type="explain",
explain_id=explain_id, explain_id=explain_id,
explain_collection=self.explainability_collection, explain_graph=GRAPH_RETRIEVAL,
), ),
properties={"id": id} properties={"id": id}
) )
@ -424,11 +423,8 @@ class Processor(FlowProcessor):
help=f'Default max path length (default: 2)' help=f'Default max path length (default: 2)'
) )
parser.add_argument( # Note: Explainability triples are now stored in the user's collection
'--explainability-collection', # with the named graph urn:graph:retrieval (no separate collection needed)
default='explainability',
help=f'Collection for storing explainability triples (default: explainability)'
)
def run(): def run():

View file

@ -114,7 +114,7 @@ class KnowledgeTableStore:
entity_embeddings list< entity_embeddings list<
tuple< tuple<
tuple<text, boolean>, tuple<text, boolean>,
list<list<double>> list<double>
> >
>, >,
PRIMARY KEY ((user, document_id), id) PRIMARY KEY ((user, document_id), id)
@ -140,7 +140,7 @@ class KnowledgeTableStore:
chunks list< chunks list<
tuple< tuple<
blob, blob,
list<list<double>> list<double>
> >
>, >,
PRIMARY KEY ((user, document_id), id) PRIMARY KEY ((user, document_id), id)