Terminology Rename, and named-graphs for explainability (#682)

Terminology Rename, and named-graphs for explainability data

Changed terminology:
  - session -> question
  - retrieval -> exploration
  - selection -> focus
  - answer -> synthesis

- uris.py: Renamed query_session_uri → question_uri,
  retrieval_uri → exploration_uri, selection_uri → focus_uri,
  answer_uri → synthesis_uri
- triples.py: Renamed corresponding triple generation functions with
  updated labels ("GraphRAG question", "Exploration", "Focus",
  "Synthesis")
- namespaces.py: Added named graph constants GRAPH_DEFAULT,
  GRAPH_SOURCE, GRAPH_RETRIEVAL
- __init__.py: Updated exports
- graph_rag.py: Updated to use new terminology
- invoke_graph_rag.py: Updated CLI to display new stage names
  (Question, Exploration, Focus, Synthesis)

Query-Time Explainability → Named Graph
- triples.py: Added set_graph() helper function to set named graph
  on triples
- graph_rag.py: All explainability triples now use GRAPH_RETRIEVAL
  named graph
- rag.py: Explainability triples stored in user's collection (not
  separate collection) with named graph

Extraction Provenance → Named Graph
- relationships/extract.py: Provenance triples use GRAPH_SOURCE
  named graph
- definitions/extract.py: Provenance triples use GRAPH_SOURCE
  named graph
- chunker.py: Provenance triples use GRAPH_SOURCE named graph
- pdf_decoder.py: Provenance triples use GRAPH_SOURCE named graph

CLI Updates
- show_graph.py: Added -g/--graph option to filter by named graph and
  --show-graph to display graph column

Also:
- Fix knowledge core schemas
This commit is contained in:
cybermaggedon 2026-03-10 14:35:21 +00:00 committed by GitHub
parent 57eda65674
commit e1bc4c04a4
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
17 changed files with 279 additions and 180 deletions

View file

@ -36,14 +36,14 @@ RDFS_LABEL = "http://www.w3.org/2000/01/rdf-schema#label"
def _get_event_type(prov_id):
"""Extract event type from provenance_id"""
if "session" in prov_id:
return "session"
elif "retrieval" in prov_id:
return "retrieval"
elif "selection" in prov_id:
return "selection"
elif "answer" in prov_id:
return "answer"
if "question" in prov_id:
return "question"
elif "exploration" in prov_id:
return "exploration"
elif "focus" in prov_id:
return "focus"
elif "synthesis" in prov_id:
return "synthesis"
return "provenance"
@ -51,7 +51,7 @@ def _format_provenance_details(event_type, triples):
"""Format provenance details based on event type and triples"""
lines = []
if event_type == "session":
if event_type == "question":
# Show query and timestamp
for s, p, o in triples:
if p == TG_QUERY:
@ -59,32 +59,32 @@ def _format_provenance_details(event_type, triples):
elif p == PROV_STARTED_AT_TIME:
lines.append(f" Time: {o}")
elif event_type == "retrieval":
elif event_type == "exploration":
# Show edge count
for s, p, o in triples:
if p == TG_EDGE_COUNT:
lines.append(f" Edges retrieved: {o}")
lines.append(f" Edges explored: {o}")
elif event_type == "selection":
# For selection, just count edge selection URIs
elif event_type == "focus":
# For focus, just count edge selection URIs
# The actual edge details are fetched separately via edge_selections parameter
edge_sel_uris = []
for s, p, o in triples:
if p == TG_SELECTED_EDGE:
edge_sel_uris.append(o)
if edge_sel_uris:
lines.append(f" Selected {len(edge_sel_uris)} edge(s)")
lines.append(f" Focused on {len(edge_sel_uris)} edge(s)")
elif event_type == "answer":
elif event_type == "synthesis":
# Show content length (not full content - it's already streamed)
for s, p, o in triples:
if p == TG_CONTENT:
lines.append(f" Answer length: {len(o)} chars")
lines.append(f" Synthesis length: {len(o)} chars")
return lines
async def _query_triples_once(ws_url, flow_id, prov_id, user, collection, debug=False):
async def _query_triples_once(ws_url, flow_id, prov_id, user, collection, graph=None, debug=False):
"""Query triples for a provenance node (single attempt)"""
request = {
"id": "triples-request",
@ -97,6 +97,9 @@ async def _query_triples_once(ws_url, flow_id, prov_id, user, collection, debug=
"limit": 100
}
}
# Add graph filter if specified (for named graph queries)
if graph is not None:
request["request"]["g"] = graph
if debug:
print(f" [debug] querying triples for s={prov_id}", file=sys.stderr)
@ -155,10 +158,10 @@ async def _query_triples_once(ws_url, flow_id, prov_id, user, collection, debug=
return triples
async def _query_triples(ws_url, flow_id, prov_id, user, collection, max_retries=5, retry_delay=0.2, debug=False):
async def _query_triples(ws_url, flow_id, prov_id, user, collection, graph=None, max_retries=5, retry_delay=0.2, debug=False):
"""Query triples for a provenance node with retries for race condition"""
for attempt in range(max_retries):
triples = await _query_triples_once(ws_url, flow_id, prov_id, user, collection, debug)
triples = await _query_triples_once(ws_url, flow_id, prov_id, user, collection, graph=graph, debug=debug)
if triples:
return triples
# Wait before retry if empty (triples may not be stored yet)
@ -515,14 +518,14 @@ async def _question_explainable(
if message_type == "explain":
# Display explain event with details
explain_id = resp.get("explain_id", "")
explain_collection = resp.get("explain_collection", "explainability")
explain_graph = resp.get("explain_graph") # Named graph (e.g., urn:graph:retrieval)
if explain_id:
event_type = _get_event_type(explain_id)
print(f"\n [{event_type}] {explain_id}", file=sys.stderr)
# Query triples for this explain node (using explain collection from event)
# Query triples for this explain node (using named graph filter)
triples = await _query_triples(
ws_url, flow_id, explain_id, user, explain_collection, debug=debug
ws_url, flow_id, explain_id, user, collection, graph=explain_graph, debug=debug
)
# Format and display details
@ -530,17 +533,17 @@ async def _question_explainable(
for line in details:
print(line, file=sys.stderr)
# For selection events, query each edge selection for details
if event_type == "selection":
# For focus events, query each edge selection for details
if event_type == "focus":
for s, p, o in triples:
if debug:
print(f" [debug] triple: p={p}, o={o}, o_type={type(o).__name__}", file=sys.stderr)
if p == TG_SELECTED_EDGE and isinstance(o, str):
if debug:
print(f" [debug] querying edge selection: {o}", file=sys.stderr)
# Query the edge selection entity (using explain collection from event)
# Query the edge selection entity (using named graph filter)
edge_triples = await _query_triples(
ws_url, flow_id, o, user, explain_collection, debug=debug
ws_url, flow_id, o, user, collection, graph=explain_graph, debug=debug
)
if debug:
print(f" [debug] got {len(edge_triples)} edge triples", file=sys.stderr)
@ -743,7 +746,7 @@ def main():
parser.add_argument(
'-x', '--explainable',
action='store_true',
help='Show provenance events for explainability (implies streaming)'
help='Show provenance events: Question, Exploration, Focus, Synthesis (implies streaming)'
)
parser.add_argument(

View file

@ -1,6 +1,11 @@
"""
Connects to the graph query service and dumps all graph edges.
Uses streaming mode for lower time-to-first-result and reduced memory overhead.
Named graphs:
- Default graph (empty): Core knowledge facts
- urn:graph:source: Extraction provenance (document/chunk sources)
- urn:graph:retrieval: Query-time explainability (question, exploration, focus, synthesis)
"""
import argparse
@ -12,7 +17,13 @@ default_user = 'trustgraph'
default_collection = 'default'
default_token = os.getenv("TRUSTGRAPH_TOKEN", None)
def show_graph(url, flow_id, user, collection, limit, batch_size, token=None):
# Named graph constants for convenience
GRAPH_DEFAULT = ""
GRAPH_SOURCE = "urn:graph:source"
GRAPH_RETRIEVAL = "urn:graph:retrieval"
def show_graph(url, flow_id, user, collection, limit, batch_size, graph=None, show_graph_column=False, token=None):
socket = Api(url, token=token).socket()
flow = socket.flow(flow_id)
@ -22,6 +33,7 @@ def show_graph(url, flow_id, user, collection, limit, batch_size, token=None):
user=user,
collection=collection,
s=None, p=None, o=None,
g=graph, # Filter by named graph (None = all graphs)
limit=limit,
batch_size=batch_size,
):
@ -29,11 +41,16 @@ def show_graph(url, flow_id, user, collection, limit, batch_size, token=None):
s = triple.get("s", {})
p = triple.get("p", {})
o = triple.get("o", {})
g = triple.get("g") # Named graph (None = default graph)
# Format terms for display
s_str = s.get("v", s.get("i", str(s)))
p_str = p.get("v", p.get("i", str(p)))
o_str = o.get("v", o.get("i", str(o)))
print(s_str, p_str, o_str)
if show_graph_column:
g_str = g if g else "(default)"
print(f"[{g_str}]", s_str, p_str, o_str)
else:
print(s_str, p_str, o_str)
finally:
socket.close()
@ -88,8 +105,25 @@ def main():
help='Triples per streaming batch (default: 20)',
)
parser.add_argument(
'-g', '--graph',
default=None,
help='Filter by named graph (e.g., urn:graph:source, urn:graph:retrieval). Use "" for default graph only.',
)
parser.add_argument(
'--show-graph',
action='store_true',
help='Show graph column in output',
)
args = parser.parse_args()
# Handle empty string for default graph filter
graph = args.graph
if graph == '""' or graph == "''":
graph = "" # Filter to default graph only
try:
show_graph(
@ -99,6 +133,8 @@ def main():
collection = args.collection,
limit = args.limit,
batch_size = args.batch_size,
graph = graph,
show_graph_column = args.show_graph,
token = args.token,
)