trustgraph/trustgraph-base/trustgraph/provenance/uris.py
cybermaggedon a115ec06ab
Enhance retrieval pipelines: 4-stage GraphRAG, DocRAG grounding (#697)
Enhance retrieval pipelines: 4-stage GraphRAG, DocRAG grounding,
consistent PROV-O

GraphRAG:
- Split retrieval into 4 prompt stages: extract-concepts,
  kg-edge-scoring,
  kg-edge-reasoning, kg-synthesis (was single-stage)
- Add concept extraction (grounding) for per-concept embedding
- Filter main query to default graph, ignoring
  provenance/explainability edges
- Add source document edges to knowledge graph

DocumentRAG:
- Add grounding step with concept extraction, matching GraphRAG's
  pattern:
  Question → Grounding → Exploration → Synthesis
- Per-concept embedding and chunk retrieval with deduplication

Cross-pipeline:
- Make PROV-O derivation links consistent: wasGeneratedBy for first
  entity from Activity, wasDerivedFrom for entity-to-entity chains
- Update CLIs (tg-invoke-agent, tg-invoke-graph-rag,
  tg-invoke-document-rag) for new explainability structure
- Fix all affected unit and integration tests
2026-03-16 12:12:13 +00:00

286 lines
8.1 KiB
Python

"""
URI generation for provenance entities.
Document IDs are already IRIs (e.g., https://trustgraph.ai/doc/abc123).
Child entities (pages, chunks) append path segments to the parent IRI:
- Document: {doc_iri} (as provided)
- Page: {doc_iri}/p{page_number}
- Chunk: {page_iri}/c{chunk_index} (from page), or
  {doc_iri}/c{chunk_index} (from text doc)
- Activity: https://trustgraph.ai/activity/{uuid}
- Subgraph: https://trustgraph.ai/subgraph/{uuid}
"""
import urllib.parse
import uuid
from typing import Optional
# Base URI prefix for the https-style URIs generated in this module.
# It is the root under which activity_uri, subgraph_uri and agent_uri
# place their /activity/, /subgraph/ and /agent/ paths.
TRUSTGRAPH_BASE = "https://trustgraph.ai"
def _encode_id(id_str: str) -> str:
    """
    Percent-encode an ID so it can be embedded as one URI component.

    Args:
        id_str: The identifier to encode. It is coerced with ``str()``
            before quoting.

    Returns:
        The quoted identifier. ``safe=''`` is passed to the quoting
        function, so ``/`` is escaped as well.
    """
    as_text = str(id_str)
    return urllib.parse.quote(as_text, safe='')
def document_uri(doc_iri: str) -> str:
    """
    Return the URI of a document.

    Args:
        doc_iri: The document IRI, expected to already be a full URI.

    Returns:
        ``doc_iri`` unchanged; no encoding or normalisation is applied.
    """
    return doc_iri
def page_uri(doc_iri: str, page_number: int) -> str:
    """
    Build the URI of a page as a child path of its document IRI.

    Args:
        doc_iri: IRI of the parent document.
        page_number: Page number to embed in the path segment.

    Returns:
        URI in format: {doc_iri}/p{page_number}
    """
    page_segment = f"p{page_number}"
    return f"{doc_iri}/{page_segment}"
def chunk_uri_from_page(doc_iri: str, page_number: int, chunk_index: int) -> str:
    """
    Build the URI of a chunk that was extracted from a page.

    Args:
        doc_iri: IRI of the document that owns the page.
        page_number: Page number the chunk came from.
        chunk_index: Index of the chunk within that page.

    Returns:
        URI in format: {doc_iri}/p{page_number}/c{chunk_index}
    """
    # The chunk hangs off the page IRI, which itself hangs off the document.
    page_iri = f"{doc_iri}/p{page_number}"
    return f"{page_iri}/c{chunk_index}"
def chunk_uri_from_doc(doc_iri: str, chunk_index: int) -> str:
    """
    Build the URI of a chunk taken directly from a text document.

    Args:
        doc_iri: IRI of the parent document.
        chunk_index: Index of the chunk within the document.

    Returns:
        URI in format: {doc_iri}/c{chunk_index}
    """
    chunk_segment = f"c{chunk_index}"
    return f"{doc_iri}/{chunk_segment}"
def activity_uri(activity_id: Optional[str] = None) -> str:
    """
    Generate URI for a PROV-O activity.

    Args:
        activity_id: Optional activity identifier. When omitted (None), a
            random UUID4 string is generated.

    Returns:
        URI in format: {TRUSTGRAPH_BASE}/activity/{id}, where the id is
        URL-encoded via _encode_id.
    """
    if activity_id is None:
        activity_id = str(uuid.uuid4())
    return f"{TRUSTGRAPH_BASE}/activity/{_encode_id(activity_id)}"
def subgraph_uri(subgraph_id: Optional[str] = None) -> str:
    """
    Generate URI for an extraction subgraph.

    Args:
        subgraph_id: Optional subgraph identifier. When omitted (None), a
            random UUID4 string is generated.

    Returns:
        URI in format: {TRUSTGRAPH_BASE}/subgraph/{id}, where the id is
        URL-encoded via _encode_id.
    """
    if subgraph_id is None:
        subgraph_id = str(uuid.uuid4())
    return f"{TRUSTGRAPH_BASE}/subgraph/{_encode_id(subgraph_id)}"
def agent_uri(component_name: str) -> str:
    """
    Build the URI identifying a TrustGraph component agent.

    Args:
        component_name: Name of the component; it is URL-encoded via
            _encode_id before being placed in the path.

    Returns:
        URI in format: {TRUSTGRAPH_BASE}/agent/{encoded component_name}
    """
    encoded_name = _encode_id(component_name)
    return f"{TRUSTGRAPH_BASE}/agent/{encoded_name}"
# Query-time provenance URIs
# These URIs use the urn:trustgraph: namespace to distinguish query-time
# provenance from extraction-time provenance (which uses https://trustgraph.ai/)
#
# Terminology:
# Question - What was asked, the anchor for everything
# Grounding - Decomposing the question into concepts
# Exploration - Casting wide, what do we know about this space
# Focus - Closing down, what's actually relevant here
# Synthesis - Weaving the relevant pieces into an answer
def question_uri(session_id: Optional[str] = None) -> str:
    """
    Generate URI for a question activity.

    Args:
        session_id: Optional UUID string. When omitted (None), a random
            UUID4 string is generated. The value is interpolated verbatim
            (it is not URL-encoded).

    Returns:
        URN in format: urn:trustgraph:question:{uuid}
    """
    if session_id is None:
        session_id = str(uuid.uuid4())
    return f"urn:trustgraph:question:{session_id}"
def grounding_uri(session_id: str) -> str:
    """
    Build the URN of a grounding entity (the concept decomposition of a query).

    Args:
        session_id: Session UUID, the same one used for question_uri.
            It is interpolated verbatim.

    Returns:
        URN in format: urn:trustgraph:prov:grounding:{uuid}
    """
    namespace = "urn:trustgraph:prov:grounding:"
    return f"{namespace}{session_id}"
def exploration_uri(session_id: str) -> str:
    """
    Build the URN of an exploration entity (edges retrieved from the subgraph).

    Args:
        session_id: Session UUID, the same one used for question_uri.
            It is interpolated verbatim.

    Returns:
        URN in format: urn:trustgraph:prov:exploration:{uuid}
    """
    namespace = "urn:trustgraph:prov:exploration:"
    return f"{namespace}{session_id}"
def focus_uri(session_id: str) -> str:
    """
    Build the URN of a focus entity (the selected edges with their reasoning).

    Args:
        session_id: Session UUID, the same one used for question_uri.
            It is interpolated verbatim.

    Returns:
        URN in format: urn:trustgraph:prov:focus:{uuid}
    """
    namespace = "urn:trustgraph:prov:focus:"
    return f"{namespace}{session_id}"
def synthesis_uri(session_id: str) -> str:
    """
    Build the URN of a synthesis entity (the final answer text).

    Args:
        session_id: Session UUID, the same one used for question_uri.
            It is interpolated verbatim.

    Returns:
        URN in format: urn:trustgraph:prov:synthesis:{uuid}
    """
    namespace = "urn:trustgraph:prov:synthesis:"
    return f"{namespace}{session_id}"
def edge_selection_uri(session_id: str, edge_index: int) -> str:
    """
    Build the URN of one edge selection item (links an edge to its reasoning).

    Args:
        session_id: The session UUID. It is interpolated verbatim.
        edge_index: Position of this edge in the selection (0-based).

    Returns:
        URN in format: urn:trustgraph:prov:edge:{uuid}:{index}
    """
    edge_namespace = "urn:trustgraph:prov:edge:"
    return f"{edge_namespace}{session_id}:{edge_index}"
# Agent provenance URIs
# These URIs use the urn:trustgraph:agent: namespace to distinguish agent
# provenance from GraphRAG question provenance
def agent_session_uri(session_id: Optional[str] = None) -> str:
    """
    Generate URI for an agent session.

    Args:
        session_id: Optional UUID string. When omitted (None), a random
            UUID4 string is generated. The value is interpolated verbatim
            (it is not URL-encoded).

    Returns:
        URN in format: urn:trustgraph:agent:{uuid}
    """
    if session_id is None:
        session_id = str(uuid.uuid4())
    return f"urn:trustgraph:agent:{session_id}"
def agent_iteration_uri(session_id: str, iteration_num: int) -> str:
    """
    Build the URN of a single agent iteration.

    Args:
        session_id: The session UUID. It is interpolated verbatim.
        iteration_num: Iteration number (documented as 1-based).

    Returns:
        URN in format: urn:trustgraph:agent:{uuid}/i{num}
    """
    session_urn = f"urn:trustgraph:agent:{session_id}"
    return f"{session_urn}/i{iteration_num}"
def agent_thought_uri(session_id: str, iteration_num: int) -> str:
    """
    Build the URN of the thought sub-entity of an agent iteration.

    Args:
        session_id: The session UUID. It is interpolated verbatim.
        iteration_num: Iteration number (documented as 1-based).

    Returns:
        URN in format: urn:trustgraph:agent:{uuid}/i{num}/thought
    """
    iteration_urn = f"urn:trustgraph:agent:{session_id}/i{iteration_num}"
    return f"{iteration_urn}/thought"
def agent_observation_uri(session_id: str, iteration_num: int) -> str:
    """
    Build the URN of the observation sub-entity of an agent iteration.

    Args:
        session_id: The session UUID. It is interpolated verbatim.
        iteration_num: Iteration number (documented as 1-based).

    Returns:
        URN in format: urn:trustgraph:agent:{uuid}/i{num}/observation
    """
    iteration_urn = f"urn:trustgraph:agent:{session_id}/i{iteration_num}"
    return f"{iteration_urn}/observation"
def agent_final_uri(session_id: str) -> str:
    """
    Build the URN of an agent's final answer.

    Args:
        session_id: The session UUID. It is interpolated verbatim.

    Returns:
        URN in format: urn:trustgraph:agent:{uuid}/final
    """
    session_urn = f"urn:trustgraph:agent:{session_id}"
    return f"{session_urn}/final"
# Document RAG provenance URIs
# These URIs use the urn:trustgraph:docrag: namespace to distinguish
# document RAG provenance from graph RAG provenance
def docrag_question_uri(session_id: Optional[str] = None) -> str:
    """
    Generate URI for a document RAG question activity.

    Args:
        session_id: Optional UUID string. When omitted (None), a random
            UUID4 string is generated. The value is interpolated verbatim
            (it is not URL-encoded).

    Returns:
        URN in format: urn:trustgraph:docrag:{uuid}
    """
    if session_id is None:
        session_id = str(uuid.uuid4())
    return f"urn:trustgraph:docrag:{session_id}"
def docrag_grounding_uri(session_id: str) -> str:
    """
    Build the URN of a document RAG grounding entity (concept decomposition).

    Args:
        session_id: The session UUID. It is interpolated verbatim.

    Returns:
        URN in format: urn:trustgraph:docrag:{uuid}/grounding
    """
    session_urn = f"urn:trustgraph:docrag:{session_id}"
    return f"{session_urn}/grounding"
def docrag_exploration_uri(session_id: str) -> str:
    """
    Build the URN of a document RAG exploration entity (the chunks retrieved).

    Args:
        session_id: The session UUID. It is interpolated verbatim.

    Returns:
        URN in format: urn:trustgraph:docrag:{uuid}/exploration
    """
    session_urn = f"urn:trustgraph:docrag:{session_id}"
    return f"{session_urn}/exploration"
def docrag_synthesis_uri(session_id: str) -> str:
    """
    Build the URN of a document RAG synthesis entity (the final answer).

    Args:
        session_id: The session UUID. It is interpolated verbatim.

    Returns:
        URN in format: urn:trustgraph:docrag:{uuid}/synthesis
    """
    session_urn = f"urn:trustgraph:docrag:{session_id}"
    return f"{session_urn}/synthesis"