Enhance retrieval pipelines: 4-stage GraphRAG, DocRAG grounding (#697)

Enhance retrieval pipelines: 4-stage GraphRAG, DocRAG grounding, consistent PROV-O

GraphRAG:
- Split retrieval into 4 prompt stages: extract-concepts, kg-edge-scoring, kg-edge-reasoning, kg-synthesis (was single-stage)
- Add concept extraction (grounding) for per-concept embedding
- Filter main query to default graph, ignoring provenance/explainability edges
- Add source document edges to knowledge graph

DocumentRAG:
- Add grounding step with concept extraction, matching GraphRAG's pattern: Question → Grounding → Exploration → Synthesis
- Per-concept embedding and chunk retrieval with deduplication

Cross-pipeline:
- Make PROV-O derivation links consistent: wasGeneratedBy for first entity from Activity, wasDerivedFrom for entity-to-entity chains
- Update CLIs (tg-invoke-agent, tg-invoke-graph-rag, tg-invoke-document-rag) for new explainability structure
- Fix all affected unit and integration tests
2026-04-25 08:26:21 +02:00 · 2026-03-16 12:12:13 +00:00 · 2026-03-16 12:12:13 +00:00 · a115ec06ab
commit a115ec06ab
parent 29b4300808
25 changed files with 1537 additions and 1008 deletions
--- a/trustgraph-flow/trustgraph/retrieval/graph_rag/rag.py
+++ b/trustgraph-flow/trustgraph/retrieval/graph_rag/rag.py
@@ -39,6 +39,7 @@ class Processor(FlowProcessor):
        triple_limit = params.get("triple_limit", 30)
        max_subgraph_size = params.get("max_subgraph_size", 150)
        max_path_length = params.get("max_path_length", 2)
+        edge_limit = params.get("edge_limit", 25)

        super(Processor, self).__init__(
            **params | {
@@ -48,6 +49,7 @@ class Processor(FlowProcessor):
                "triple_limit": triple_limit,
                "max_subgraph_size": max_subgraph_size,
                "max_path_length": max_path_length,
+                "edge_limit": edge_limit,
            }
        )

@@ -55,6 +57,7 @@ class Processor(FlowProcessor):
        self.default_triple_limit = triple_limit
        self.default_max_subgraph_size = max_subgraph_size
        self.default_max_path_length = max_path_length
+        self.default_edge_limit = edge_limit

        # CRITICAL SECURITY: NEVER share data between users or collections
        # Each user/collection combination MUST have isolated data access
@@ -292,6 +295,11 @@ class Processor(FlowProcessor):
            else:
                max_path_length = self.default_max_path_length

+            if v.edge_limit:
+                edge_limit = v.edge_limit
+            else:
+                edge_limit = self.default_edge_limit
+
            # Callback to save answer content to librarian
            async def save_answer(doc_id, answer_text):
                await self.save_answer_content(
@@ -322,6 +330,7 @@ class Processor(FlowProcessor):
                    entity_limit = entity_limit, triple_limit = triple_limit,
                    max_subgraph_size = max_subgraph_size,
                    max_path_length = max_path_length,
+                    edge_limit = edge_limit,
                    streaming = True,
                    chunk_callback = send_chunk,
                    explain_callback = send_explainability,
@@ -335,6 +344,7 @@ class Processor(FlowProcessor):
                    entity_limit = entity_limit, triple_limit = triple_limit,
                    max_subgraph_size = max_subgraph_size,
                    max_path_length = max_path_length,
+                    edge_limit = edge_limit,
                    explain_callback = send_explainability,
                    save_answer_callback = save_answer,
                )