Add semantic pre-filter for GraphRAG edge scoring (#702)

Embed edge descriptions and compute cosine similarity against grounding
concepts to reduce the number of edges sent to expensive LLM scoring.
Controlled by the edge_score_limit parameter (default 30); the pre-filter is
skipped when the edge count is already below the limit.

Also plumbs edge_score_limit and edge_limit parameters end-to-end:
- CLI args (--edge-score-limit, --edge-limit) in both invoke and service
- Socket client: fix parameter mapping to use hyphenated wire-format keys
- Flow API, message translator, gateway all pass through correctly
- Explainable code path (_question_explainable_api) now forwards all params
- Default edge_score_limit changed from 50 to 30 based on typical subgraph
  sizes
This commit is contained in:
cybermaggedon 2026-03-21 20:06:29 +00:00 committed by GitHub
parent bc68738c37
commit 1a7b654bd3
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
7 changed files with 166 additions and 20 deletions

View file

@ -39,6 +39,7 @@ class Processor(FlowProcessor):
triple_limit = params.get("triple_limit", 30)
max_subgraph_size = params.get("max_subgraph_size", 150)
max_path_length = params.get("max_path_length", 2)
edge_score_limit = params.get("edge_score_limit", 30)
edge_limit = params.get("edge_limit", 25)
super(Processor, self).__init__(
@ -49,6 +50,7 @@ class Processor(FlowProcessor):
"triple_limit": triple_limit,
"max_subgraph_size": max_subgraph_size,
"max_path_length": max_path_length,
"edge_score_limit": edge_score_limit,
"edge_limit": edge_limit,
}
)
@ -57,6 +59,7 @@ class Processor(FlowProcessor):
self.default_triple_limit = triple_limit
self.default_max_subgraph_size = max_subgraph_size
self.default_max_path_length = max_path_length
self.default_edge_score_limit = edge_score_limit
self.default_edge_limit = edge_limit
# CRITICAL SECURITY: NEVER share data between users or collections
@ -295,6 +298,11 @@ class Processor(FlowProcessor):
else:
max_path_length = self.default_max_path_length
if v.edge_score_limit:
edge_score_limit = v.edge_score_limit
else:
edge_score_limit = self.default_edge_score_limit
if v.edge_limit:
edge_limit = v.edge_limit
else:
@ -330,6 +338,7 @@ class Processor(FlowProcessor):
entity_limit = entity_limit, triple_limit = triple_limit,
max_subgraph_size = max_subgraph_size,
max_path_length = max_path_length,
edge_score_limit = edge_score_limit,
edge_limit = edge_limit,
streaming = True,
chunk_callback = send_chunk,
@ -344,6 +353,7 @@ class Processor(FlowProcessor):
entity_limit = entity_limit, triple_limit = triple_limit,
max_subgraph_size = max_subgraph_size,
max_path_length = max_path_length,
edge_score_limit = edge_score_limit,
edge_limit = edge_limit,
explain_callback = send_explainability,
save_answer_callback = save_answer,
@ -432,6 +442,20 @@ class Processor(FlowProcessor):
help=f'Default max path length (default: 2)'
)
parser.add_argument(
'--edge-score-limit',
type=int,
default=30,
help=f'Semantic pre-filter limit before LLM scoring (default: 30)'
)
parser.add_argument(
'--edge-limit',
type=int,
default=25,
help=f'Max edges after LLM scoring (default: 25)'
)
# Note: Explainability triples are now stored in the user's collection
# with the named graph urn:graph:retrieval (no separate collection needed)