Enhance retrieval pipelines: 4-stage GraphRAG, DocRAG grounding (#697)

Enhance retrieval pipelines: 4-stage GraphRAG, DocRAG grounding,
consistent PROV-O

GraphRAG:
- Split retrieval into 4 prompt stages: extract-concepts,
  kg-edge-scoring,
  kg-edge-reasoning, kg-synthesis (was single-stage)
- Add concept extraction (grounding) for per-concept embedding
- Filter main query to default graph, ignoring
  provenance/explainability edges
- Add source document edges to knowledge graph

DocumentRAG:
- Add grounding step with concept extraction, matching GraphRAG's
  pattern:
  Question → Grounding → Exploration → Synthesis
- Per-concept embedding and chunk retrieval with deduplication

Cross-pipeline:
- Make PROV-O derivation links consistent: wasGeneratedBy for first
  entity from Activity, wasDerivedFrom for entity-to-entity chains
- Update CLIs (tg-invoke-agent, tg-invoke-graph-rag,
  tg-invoke-document-rag) for new explainability structure
- Fix all affected unit and integration tests
This commit is contained in:
cybermaggedon 2026-03-16 12:12:13 +00:00 committed by GitHub
parent 29b4300808
commit a115ec06ab
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
25 changed files with 1537 additions and 1008 deletions

View file

@ -20,12 +20,15 @@ from . namespaces import (
# Extraction provenance entity types
TG_DOCUMENT_TYPE, TG_PAGE_TYPE, TG_CHUNK_TYPE, TG_SUBGRAPH_TYPE,
# Query-time provenance predicates (GraphRAG)
TG_QUERY, TG_EDGE_COUNT, TG_SELECTED_EDGE, TG_EDGE, TG_REASONING, TG_CONTENT,
TG_QUERY, TG_CONCEPT, TG_ENTITY,
TG_EDGE_COUNT, TG_SELECTED_EDGE, TG_EDGE, TG_REASONING,
TG_DOCUMENT,
# Query-time provenance predicates (DocumentRAG)
TG_CHUNK_COUNT, TG_SELECTED_CHUNK,
# Explainability entity types
TG_QUESTION, TG_EXPLORATION, TG_FOCUS, TG_SYNTHESIS,
TG_QUESTION, TG_GROUNDING, TG_EXPLORATION, TG_FOCUS, TG_SYNTHESIS,
# Unifying types
TG_ANSWER_TYPE,
# Question subtypes
TG_GRAPH_RAG_QUESTION, TG_DOC_RAG_QUESTION,
)
@ -347,35 +350,78 @@ def question_triples(
]
def grounding_triples(
grounding_uri: str,
question_uri: str,
concepts: List[str],
) -> List[Triple]:
"""
Build triples for a grounding entity (concept decomposition of query).
Creates:
- Entity declaration for grounding
- wasGeneratedBy link to question
- Concept literals for each extracted concept
Args:
grounding_uri: URI of the grounding entity (from grounding_uri)
question_uri: URI of the parent question
concepts: List of concept strings extracted from the query
Returns:
List of Triple objects
"""
triples = [
_triple(grounding_uri, RDF_TYPE, _iri(PROV_ENTITY)),
_triple(grounding_uri, RDF_TYPE, _iri(TG_GROUNDING)),
_triple(grounding_uri, RDFS_LABEL, _literal("Grounding")),
_triple(grounding_uri, PROV_WAS_GENERATED_BY, _iri(question_uri)),
]
for concept in concepts:
triples.append(_triple(grounding_uri, TG_CONCEPT, _literal(concept)))
return triples
def exploration_triples(
exploration_uri: str,
question_uri: str,
grounding_uri: str,
edge_count: int,
entities: Optional[List[str]] = None,
) -> List[Triple]:
"""
Build triples for an exploration entity (all edges retrieved from subgraph).
Creates:
- Entity declaration for exploration
- wasGeneratedBy link to question
- wasDerivedFrom link to grounding
- Edge count metadata
- Entity IRIs for each seed entity
Args:
exploration_uri: URI of the exploration entity (from exploration_uri)
question_uri: URI of the parent question
grounding_uri: URI of the parent grounding entity
edge_count: Number of edges retrieved
entities: Optional list of seed entity URIs
Returns:
List of Triple objects
"""
return [
triples = [
_triple(exploration_uri, RDF_TYPE, _iri(PROV_ENTITY)),
_triple(exploration_uri, RDF_TYPE, _iri(TG_EXPLORATION)),
_triple(exploration_uri, RDFS_LABEL, _literal("Exploration")),
_triple(exploration_uri, PROV_WAS_GENERATED_BY, _iri(question_uri)),
_triple(exploration_uri, PROV_WAS_DERIVED_FROM, _iri(grounding_uri)),
_triple(exploration_uri, TG_EDGE_COUNT, _literal(edge_count)),
]
if entities:
for entity in entities:
triples.append(_triple(exploration_uri, TG_ENTITY, _iri(entity)))
return triples
def _quoted_triple(s: str, p: str, o: str) -> Term:
"""Create a quoted triple term (RDF-star) from string values."""
@ -454,22 +500,20 @@ def focus_triples(
def synthesis_triples(
synthesis_uri: str,
focus_uri: str,
answer_text: str = "",
document_id: Optional[str] = None,
) -> List[Triple]:
"""
Build triples for a synthesis entity (final answer text).
Build triples for a synthesis entity (final answer).
Creates:
- Entity declaration for synthesis
- Entity declaration for synthesis with tg:Answer type
- wasDerivedFrom link to focus
- Either document reference (if document_id provided) or inline content
- Document reference to librarian
Args:
synthesis_uri: URI of the synthesis entity (from synthesis_uri)
focus_uri: URI of the parent focus entity
answer_text: The synthesized answer text (used if no document_id)
document_id: Optional librarian document ID (preferred over inline content)
document_id: Librarian document ID for the answer content
Returns:
List of Triple objects
@ -477,16 +521,13 @@ def synthesis_triples(
triples = [
_triple(synthesis_uri, RDF_TYPE, _iri(PROV_ENTITY)),
_triple(synthesis_uri, RDF_TYPE, _iri(TG_SYNTHESIS)),
_triple(synthesis_uri, RDF_TYPE, _iri(TG_ANSWER_TYPE)),
_triple(synthesis_uri, RDFS_LABEL, _literal("Synthesis")),
_triple(synthesis_uri, PROV_WAS_DERIVED_FROM, _iri(focus_uri)),
]
if document_id:
# Store reference to document in librarian (as IRI)
triples.append(_triple(synthesis_uri, TG_DOCUMENT, _iri(document_id)))
elif answer_text:
# Fallback: store inline content
triples.append(_triple(synthesis_uri, TG_CONTENT, _literal(answer_text)))
return triples
@ -533,7 +574,7 @@ def docrag_question_triples(
def docrag_exploration_triples(
exploration_uri: str,
question_uri: str,
grounding_uri: str,
chunk_count: int,
chunk_ids: Optional[List[str]] = None,
) -> List[Triple]:
@ -542,12 +583,12 @@ def docrag_exploration_triples(
Creates:
- Entity declaration with tg:Exploration type
- wasGeneratedBy link to question
- wasDerivedFrom link to grounding
- Chunk count and optional chunk references
Args:
exploration_uri: URI of the exploration entity
question_uri: URI of the parent question
grounding_uri: URI of the parent grounding entity
chunk_count: Number of chunks retrieved
chunk_ids: Optional list of chunk URIs/IDs
@ -558,7 +599,7 @@ def docrag_exploration_triples(
_triple(exploration_uri, RDF_TYPE, _iri(PROV_ENTITY)),
_triple(exploration_uri, RDF_TYPE, _iri(TG_EXPLORATION)),
_triple(exploration_uri, RDFS_LABEL, _literal("Exploration")),
_triple(exploration_uri, PROV_WAS_GENERATED_BY, _iri(question_uri)),
_triple(exploration_uri, PROV_WAS_DERIVED_FROM, _iri(grounding_uri)),
_triple(exploration_uri, TG_CHUNK_COUNT, _literal(chunk_count)),
]
@ -573,22 +614,20 @@ def docrag_exploration_triples(
def docrag_synthesis_triples(
synthesis_uri: str,
exploration_uri: str,
answer_text: str = "",
document_id: Optional[str] = None,
) -> List[Triple]:
"""
Build triples for a document RAG synthesis entity (final answer).
Creates:
- Entity declaration with tg:Synthesis type
- Entity declaration with tg:Synthesis and tg:Answer types
- wasDerivedFrom link to exploration (skips focus step)
- Either document reference or inline content
- Document reference to librarian
Args:
synthesis_uri: URI of the synthesis entity
exploration_uri: URI of the parent exploration entity
answer_text: The synthesized answer text (used if no document_id)
document_id: Optional librarian document ID (preferred over inline content)
document_id: Librarian document ID for the answer content
Returns:
List of Triple objects
@ -596,13 +635,12 @@ def docrag_synthesis_triples(
triples = [
_triple(synthesis_uri, RDF_TYPE, _iri(PROV_ENTITY)),
_triple(synthesis_uri, RDF_TYPE, _iri(TG_SYNTHESIS)),
_triple(synthesis_uri, RDF_TYPE, _iri(TG_ANSWER_TYPE)),
_triple(synthesis_uri, RDFS_LABEL, _literal("Synthesis")),
_triple(synthesis_uri, PROV_WAS_DERIVED_FROM, _iri(exploration_uri)),
]
if document_id:
triples.append(_triple(synthesis_uri, TG_DOCUMENT, _iri(document_id)))
elif answer_text:
triples.append(_triple(synthesis_uri, TG_CONTENT, _literal(answer_text)))
return triples