mirror of
https://github.com/trustgraph-ai/trustgraph.git
synced 2026-04-25 00:16:23 +02:00
Add explainability CLI tools for debugging provenance data
- tg-show-document-hierarchy: Display document → page → chunk → edge hierarchy by traversing prov:wasDerivedFrom relationships
- tg-list-explain-traces: List all GraphRAG sessions with questions and timestamps from the retrieval graph
- tg-show-explain-trace: Show full explainability cascade for a GraphRAG session (question → exploration → focus → synthesis)
These tools query the source and retrieval graphs to help debug and explore provenance/explainability data stored during document processing and GraphRAG queries.
431 lines
13 KiB
Python
431 lines
13 KiB
Python
"""
|
|
Show document hierarchy: Document -> Pages -> Chunks -> Edges.
|
|
|
|
Given a document ID, traverses and displays all derived entities
|
|
(pages, chunks, extracted edges) using prov:wasDerivedFrom relationships.
|
|
|
|
Examples:
|
|
tg-show-document-hierarchy -U trustgraph -C default "urn:trustgraph:doc:abc123"
|
|
tg-show-document-hierarchy --show-content --max-content 500 "urn:trustgraph:doc:abc123"
|
|
"""
|
|
|
|
import argparse
|
|
import json
|
|
import os
|
|
import sys
|
|
from trustgraph.api import Api
|
|
|
|
# Connection defaults. The URL and token may be overridden through the
# TRUSTGRAPH_URL / TRUSTGRAPH_TOKEN environment variables.
default_url = os.environ.get("TRUSTGRAPH_URL", 'http://localhost:8088/')
default_token = os.environ.get("TRUSTGRAPH_TOKEN")
default_user = 'trustgraph'
default_collection = 'default'

# Predicate IRIs matched against query results.
PROV_WAS_DERIVED_FROM = "http://www.w3.org/ns/prov#wasDerivedFrom"
RDFS_LABEL = "http://www.w3.org/2000/01/rdf-schema#label"
RDF_TYPE = "http://www.w3.org/1999/02/22-rdf-syntax-ns#type"
TG = "https://trustgraph.ai/ns/"
TG_REIFIES = f"{TG}reifies"
DC_TITLE = "http://purl.org/dc/terms/title"
DC_FORMAT = "http://purl.org/dc/terms/format"

# Named graph passed as ``g`` on every triples query made by this tool.
SOURCE_GRAPH = "urn:graph:source"
|
|
|
|
|
|
def query_triples(socket, flow_id, user, collection, s=None, p=None, o=None, g=None, limit=1000):
    """Run a non-streaming "triples" request and return the decoded matches.

    Args:
        socket: Object exposing ``_send_request_sync``; the request is sent
            through it with ``streaming_raw=True``.
        flow_id: Flow identifier forwarded with the request.
        user: Value for the request's ``user`` field.
        collection: Value for the request's ``collection`` field.
        s: Optional subject filter, sent as an IRI term.
        p: Optional predicate filter, sent as an IRI term.
        o: Optional object filter. A string starting with ``http://``,
            ``https://`` or ``urn:`` is sent as an IRI term; any other string
            is sent as a ``{"t": "l", "v": ...}`` value term; a dict is sent
            unchanged as an already-encoded term.
        g: Optional graph filter, sent as given.
        limit: Value for the request's ``limit`` field.

    Returns:
        A list of ``(subject, predicate, object)`` tuples whose members have
        been decoded with ``extract_value``.  Any exception raised while
        querying or decoding is reported on stderr, and whatever had been
        collected up to that point is returned.
    """
    request = {"user": user, "collection": collection, "limit": limit, "streaming": False}

    if s is not None:
        request["s"] = {"t": "i", "i": s}
    if p is not None:
        request["p"] = {"t": "i", "i": p}

    # ``None`` is neither a dict nor a str, so it falls through untouched.
    if isinstance(o, dict):
        request["o"] = o
    elif isinstance(o, str):
        if o.startswith(("http://", "https://", "urn:")):
            request["o"] = {"t": "i", "i": o}
        else:
            request["o"] = {"t": "l", "v": o}

    if g is not None:
        request["g"] = g

    results = []
    try:
        for reply in socket._send_request_sync("triples", flow_id, request, streaming_raw=True):
            # A dict reply carries its payload under "response" or "triples";
            # anything else is treated as the payload itself.
            if isinstance(reply, dict):
                batch = reply.get("response", reply.get("triples", []))
            else:
                batch = reply

            # Normalise a lone item (or an empty payload) into a list.
            if not isinstance(batch, list):
                batch = [batch] if batch else []

            for item in batch:
                results.append(
                    tuple(extract_value(item.get(part, {})) for part in ("s", "p", "o"))
                )
    except Exception as e:
        print(f"Error querying triples: {e}", file=sys.stderr)

    return results
|
|
|
|
|
|
def extract_value(term):
    """Decode a term dict into a plain Python value.

    The term's kind is read from ``"t"`` (or ``"type"``):

    * ``"i"`` -> the value under ``"i"`` (or ``"iri"``).
    * ``"l"`` -> the value under ``"v"`` (or ``"value"``).
    * ``"t"`` -> a quoted triple, read from ``"tr"`` (or ``"triple"``) and
      returned as a dict with decoded ``"s"``, ``"p"`` and ``"o"`` entries.

    A term without a recognised kind yields its ``"i"`` or ``"v"`` entry if
    present, otherwise ``str(term)``.  A falsy term yields ``""``.
    """
    if not term:
        return ""

    kind = term.get("t") or term.get("type")

    if kind == "i":
        return term.get("i") or term.get("iri", "")

    if kind == "l":
        return term.get("v") or term.get("value", "")

    if kind == "t":
        quoted = term.get("tr") or term.get("triple", {})
        return {part: extract_value(quoted.get(part, {})) for part in ("s", "p", "o")}

    # Untyped term: fall back to whichever raw value key is present.
    for key in ("i", "v"):
        if key in term:
            return term[key]

    return str(term)
|
|
|
|
|
|
def get_node_metadata(socket, flow_id, user, collection, node_uri):
    """Collect label, type, title and format for ``node_uri``.

    Queries the source graph for every triple whose subject is ``node_uri``
    and copies the objects of the rdfs:label, rdf:type, dc:title and
    dc:format predicates into the result.

    Returns:
        A dict that always contains ``"uri"`` and may contain ``"label"``,
        ``"type"``, ``"title"`` and ``"format"``.  When a predicate occurs
        more than once, the last value returned by the query wins.
    """
    # (predicate IRI, metadata key) pairs; compared with == so that a
    # non-string predicate simply fails to match.
    wanted = (
        (RDFS_LABEL, "label"),
        (RDF_TYPE, "type"),
        (DC_TITLE, "title"),
        (DC_FORMAT, "format"),
    )

    metadata = {"uri": node_uri}
    found = query_triples(socket, flow_id, user, collection, s=node_uri, g=SOURCE_GRAPH)

    for _, predicate, value in found:
        for iri, key in wanted:
            if predicate == iri:
                metadata[key] = value
                break

    return metadata
|
|
|
|
|
|
def get_children(socket, flow_id, user, collection, parent_uri):
    """Return the subjects that are prov:wasDerivedFrom ``parent_uri``.

    The lookup is restricted to the source graph; the subjects are returned
    in the order the query produced them.
    """
    derived = query_triples(
        socket,
        flow_id,
        user,
        collection,
        p=PROV_WAS_DERIVED_FROM,
        o=parent_uri,
        g=SOURCE_GRAPH,
    )
    return [subject for subject, _, _ in derived]
|
|
|
|
|
|
def get_edges_from_chunk(socket, flow_id, user, collection, chunk_uri):
    """Return the quoted triples reified by statements derived from a chunk.

    Two steps, both against the source graph:

    1. find every ``?stmt`` with ``?stmt prov:wasDerivedFrom chunk_uri``;
    2. for each ``?stmt``, fetch its ``tg:reifies`` objects (one extra query
       per statement).

    Only objects that decoded to a dict (i.e. quoted triples) are kept.
    """
    statements = query_triples(
        socket,
        flow_id,
        user,
        collection,
        p=PROV_WAS_DERIVED_FROM,
        o=chunk_uri,
        g=SOURCE_GRAPH,
    )

    edges = []
    for statement in statements:
        reified = query_triples(
            socket,
            flow_id,
            user,
            collection,
            s=statement[0],
            p=TG_REIFIES,
            g=SOURCE_GRAPH,
        )
        edges.extend(target for _, _, target in reified if isinstance(target, dict))

    return edges
|
|
|
|
|
|
def get_document_content(api, user, doc_id, max_content):
    """Fetch a document through ``api.library()`` and return it as display text.

    Args:
        api: Object whose ``library()`` result provides
            ``get_document_content(user=..., id=...)``.
        user: User the document is fetched for.
        doc_id: Identifier of the document to fetch.
        max_content: Maximum number of characters to return before the text
            is cut and suffixed with ``"... [truncated]"``.

    Returns:
        The UTF-8 decoded text (possibly truncated), a ``"[Binary: N bytes]"``
        placeholder when the payload is not valid UTF-8, or an
        ``"[Error fetching content: ...]"`` placeholder when anything else
        goes wrong.  This function never raises.
    """
    try:
        raw = api.library().get_document_content(user=user, id=doc_id)

        try:
            decoded = raw.decode('utf-8')
        except UnicodeDecodeError:
            return f"[Binary: {len(raw)} bytes]"

        if len(decoded) <= max_content:
            return decoded
        return decoded[:max_content] + "... [truncated]"
    except Exception as e:
        return f"[Error fetching content: {e}]"
|
|
|
|
|
|
def classify_uri(uri):
    """Guess whether a URI names a document, page or chunk.

    The checks run in this order and the first match wins:

    1. the text after the last ``"/c"`` is all digits -> ``"chunk"``
    2. the text after the last ``"/p"`` starts with a run of digits and
       contains nothing but digits and ``"/"`` (e.g. ``/p1`` or ``/p1/``)
       -> ``"page"``
    3. the lower-cased URI contains ``"chunk"``, ``"page"`` or ``"doc"``
       -> ``"chunk"``, ``"page"`` or ``"document"`` respectively

    Args:
        uri: The value to classify.

    Returns:
        ``"chunk"``, ``"page"``, ``"document"`` or ``"unknown"``.  Anything
        that is not a string is ``"unknown"``.
    """
    if not isinstance(uri, str):
        return "unknown"

    # rpartition yields an empty separator when the marker is absent, so
    # ``found`` doubles as the "marker is present" check.
    _, found, chunk_tail = uri.rpartition("/c")
    if found and chunk_tail.isdigit():
        return "chunk"

    _, found, page_tail = uri.rpartition("/p")
    if (
        found
        and page_tail.replace("/", "").isdigit()
        and page_tail.split("/")[0].isdigit()
    ):
        return "page"

    # Keyword fallback; order matters because the first hit is returned.
    lowered = uri.lower()
    for keyword, kind in (("chunk", "chunk"), ("page", "page"), ("doc", "document")):
        if keyword in lowered:
            return kind

    return "unknown"
|
|
|
|
|
|
def build_hierarchy(socket, flow_id, user, collection, root_uri, api=None, show_content=False, max_content=200, visited=None):
    """Recursively assemble the provenance tree rooted at ``root_uri``.

    Args:
        socket: Socket handed to the triple-query helpers.
        flow_id: Flow identifier forwarded to every query.
        user: User forwarded to every query and content fetch.
        collection: Collection forwarded to every query.
        root_uri: URI of the node to expand.
        api: API object used for content fetches; content is only fetched
            when this is truthy and ``show_content`` is set.
        show_content: Whether to attach a ``"content"`` entry to nodes.
        max_content: Truncation length passed to the content fetch.
        visited: Set of URIs already expanded, shared across the recursion
            so that a node is never expanded twice.

    Returns:
        A dict with ``"uri"``, ``"type"``, ``"metadata"``, ``"children"``
        (sorted by URI), ``"edges"`` and optionally ``"content"``; or
        ``None`` when ``root_uri`` was already visited.
    """
    seen = set() if visited is None else visited

    # Cycle guard: each URI is expanded at most once per traversal.
    if root_uri in seen:
        return None
    seen.add(root_uri)

    metadata = get_node_metadata(socket, flow_id, user, collection, root_uri)
    kind = classify_uri(root_uri)

    node = {
        "uri": root_uri,
        "type": kind,
        "metadata": metadata,
        "children": [],
        "edges": [],
    }

    if show_content and api:
        content = get_document_content(api, user, root_uri, max_content)
        if content:
            node["content"] = content

    for child_uri in get_children(socket, flow_id, user, collection, root_uri):
        # Only pages, chunks and unclassified nodes are descended into;
        # children classified as "document" are left out.
        if classify_uri(child_uri) not in ("page", "chunk", "unknown"):
            continue

        subtree = build_hierarchy(
            socket,
            flow_id,
            user,
            collection,
            child_uri,
            api=api,
            show_content=show_content,
            max_content=max_content,
            visited=seen,
        )
        if subtree:
            node["children"].append(subtree)

    # Extracted edges are only looked up for chunk nodes.
    if kind == "chunk":
        node["edges"] = get_edges_from_chunk(socket, flow_id, user, collection, root_uri)

    # Stable ordering keeps the output reproducible between runs.
    node["children"].sort(key=lambda child: child.get("uri", ""))

    return node
|
|
|
|
|
|
def _shorten_edge_term(value):
    """Return a compact display form of one component of an edge."""
    if isinstance(value, dict):
        # A component may itself be a quoted triple; render it recursively.
        return format_edge(value)
    if isinstance(value, str) and "/" in value:
        return value.split("/")[-1]
    return value


def format_edge(edge):
    """Format an edge (quoted triple) for display.

    Args:
        edge: Usually a dict with ``"s"``, ``"p"`` and ``"o"`` entries; a
            missing entry is shown as ``"?"``.

    Returns:
        ``"(s, p, o)"`` where every string component containing ``"/"`` is
        reduced to the text after its last ``"/"`` and every dict component
        (a nested quoted triple) is formatted recursively.  Components of
        any other type are shown as-is; previously such a component raised
        ``AttributeError`` whenever its ``str()`` contained ``"/"``.  A
        non-dict ``edge`` is returned as ``str(edge)``.
    """
    if not isinstance(edge, dict):
        return str(edge)

    s_short, p_short, o_short = (
        _shorten_edge_term(edge.get(part, "?")) for part in ("s", "p", "o")
    )
    return f"({s_short}, {p_short}, {o_short})"
|
|
|
|
|
|
def print_tree(node, prefix="", is_last=True, show_content=False):
    """Print ``node`` and its descendants to stdout as an indented tree.

    Args:
        node: Hierarchy dict with ``"uri"``, ``"type"``, ``"metadata"``,
            ``"children"``, ``"edges"`` and optionally ``"content"``.
        prefix: Text printed before this node's connector.  An empty prefix
            marks the root, which is printed as a header (full URI, plus
            title and format when known) instead of a branch.
        is_last: Whether this node is the last entry under its parent; this
            selects the connector and the guide drawn for its descendants.
        show_content: When true, the first non-blank line among the first
            three lines of ``node["content"]`` is shown, cut to 80
            characters.
    """
    connector = "└── " if is_last else "├── "
    # Guides are padded to the connector's width (4 columns) so that nested
    # entries line up under their parent's label at every depth.
    continuation = "    " if is_last else "│   "

    uri = node.get("uri", "")
    node_type = node.get("type", "unknown")
    metadata = node.get("metadata", {})

    label = metadata.get("label") or metadata.get("title") or uri.split("/")[-1]
    type_str = node_type.capitalize()

    if prefix:
        print(f"{prefix}{connector}{type_str}: {label}")
    else:
        print(f"{type_str}: {uri}")
        if metadata.get("title"):
            print(f" Title: \"{metadata['title']}\"")
        if metadata.get("format"):
            print(f" Type: {metadata['format']}")

    new_prefix = prefix + continuation if prefix else " "

    if show_content and "content" in node:
        for line in node["content"].split("\n")[:3]:
            if line.strip():
                truncated = line[:80] + "..." if len(line) > 80 else line
                print(f"{new_prefix}Content: \"{truncated}\"")
                break

    edges = node.get("edges", [])
    children = node.get("children", [])

    # Edges are listed before children, so an edge only closes the branch
    # when there are no children after it.
    last_index = len(edges) + len(children) - 1
    for index, edge in enumerate(edges):
        edge_connector = "└── " if index == last_index else "├── "
        print(f"{new_prefix}{edge_connector}Edge: {format_edge(edge)}")

    for index, child in enumerate(children):
        print_tree(child, new_prefix, index == len(children) - 1, show_content)
|
|
|
|
|
|
def print_json(node):
    """Write ``node`` to stdout as JSON indented by two spaces."""
    rendered = json.dumps(node, indent=2)
    print(rendered)
|
|
|
|
|
|
def main():
    """Command-line entry point for tg-show-document-hierarchy.

    Parses the arguments, builds the hierarchy for the requested document
    over an API socket and prints it as a tree or as JSON.  The socket is
    always closed once it has been opened.  Exits with status 1 when no
    hierarchy is produced or when any exception is raised; the reason is
    written to stderr.
    """
    parser = argparse.ArgumentParser(
        prog='tg-show-document-hierarchy',
        description=__doc__,
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )

    # (flags, keyword options) pairs, registered in this order.
    argument_specs = [
        (
            ('document_id',),
            {'help': 'Document URI to show hierarchy for'},
        ),
        (
            ('-u', '--api-url'),
            {'default': default_url, 'help': f'API URL (default: {default_url})'},
        ),
        (
            ('-t', '--token'),
            {'default': default_token, 'help': 'Auth token (default: $TRUSTGRAPH_TOKEN)'},
        ),
        (
            ('-U', '--user'),
            {'default': default_user, 'help': f'User ID (default: {default_user})'},
        ),
        (
            ('-C', '--collection'),
            {'default': default_collection, 'help': f'Collection (default: {default_collection})'},
        ),
        (
            ('-f', '--flow-id'),
            {'default': 'default', 'help': 'Flow ID (default: default)'},
        ),
        (
            ('--show-content',),
            {'action': 'store_true', 'help': 'Include blob/document content'},
        ),
        (
            ('--max-content',),
            {'type': int, 'default': 200, 'help': 'Max chars to display per blob (default: 200)'},
        ),
        (
            ('--format',),
            {'choices': ['tree', 'json'], 'default': 'tree', 'help': 'Output format: tree (default), json'},
        ),
    ]
    for flags, options in argument_specs:
        parser.add_argument(*flags, **options)

    args = parser.parse_args()

    try:
        api = Api(args.api_url, token=args.token)
        socket = api.socket()

        try:
            hierarchy = build_hierarchy(
                socket=socket,
                flow_id=args.flow_id,
                user=args.user,
                collection=args.collection,
                root_uri=args.document_id,
                # The API handle is only passed on when content was requested.
                api=api if args.show_content else None,
                show_content=args.show_content,
                max_content=args.max_content,
            )

            if hierarchy is None:
                print(f"No data found for document: {args.document_id}", file=sys.stderr)
                sys.exit(1)

            if args.format != 'json':
                print_tree(hierarchy, show_content=args.show_content)
            else:
                print_json(hierarchy)

        finally:
            socket.close()

    except Exception as e:
        print(f"Error: {e}", file=sys.stderr)
        sys.exit(1)


if __name__ == "__main__":
    main()
|