feat: direction-aware reranker text in GraphRAG hop-and-filter (#1016)

The reranker document text now reflects the traversal direction, showing only the new information relative to the frontier entity: - From S (subject is frontier): text = "{predicate} {object}" - From O (object is frontier): text = "{subject} {predicate}" - From P (predicate is frontier): text = "{subject} {object}" This eliminates duplicate reranker texts when traversing inward from shared object nodes (e.g. 18 CPUs all producing identical "hasSubcategory Processors" text when the subject was dropped). execute_batch_triple_queries now returns (triple, direction) tuples so hop_and_filter can select the appropriate text format. Updates tech spec to document the direction-aware approach. Adds unit tests for direction tracking and reranker text construction.
2026-07-03 15:01:00 +02:00 · 2026-07-02 21:14:47 +01:00 · 2026-07-02 21:14:47 +01:00 · db7fdbc652
commit db7fdbc652
parent 9cf7dcb578
4 changed files with 502 additions and 19 deletions
--- a/trustgraph-flow/trustgraph/retrieval/graph_rag/graph_rag.py
+++ b/trustgraph-flow/trustgraph/retrieval/graph_rag/graph_rag.py
@ -241,38 +241,56 @@ class Query:
        self.rag.label_cache.put(cache_key, label)
        return label

+    FROM_S = "from_s"
+    FROM_P = "from_p"
+    FROM_O = "from_o"
+
    async def execute_batch_triple_queries(self, entities, limit_per_entity):
-        """Execute triple queries for multiple entities concurrently."""
+        """Execute triple queries for multiple entities concurrently.
+
+        Returns a list of (triple, direction) tuples where direction
+        indicates which position the frontier entity occupied.
+        """
        tasks = []
+        directions = []

        for entity in entities:
-            tasks.extend([
+            tasks.append(
                self.rag.triples_client.query_stream(
                    s=entity, p=None, o=None,
                    limit=limit_per_entity,
                    collection=self.collection,
                    batch_size=20, g="",
                ),
+            )
+            directions.append(self.FROM_S)
+
+            tasks.append(
                self.rag.triples_client.query_stream(
                    s=None, p=entity, o=None,
                    limit=limit_per_entity,
                    collection=self.collection,
                    batch_size=20, g="",
                ),
+            )
+            directions.append(self.FROM_P)
+
+            tasks.append(
                self.rag.triples_client.query_stream(
                    s=None, p=None, o=entity,
                    limit=limit_per_entity,
                    collection=self.collection,
                    batch_size=20, g="",
-                )
-            ])
+                ),
+            )
+            directions.append(self.FROM_O)

        results = await asyncio.gather(*tasks, return_exceptions=True)

        all_triples = []
-        for result in results:
+        for direction, result in zip(directions, results):
            if not isinstance(result, Exception) and result is not None:
-                all_triples.extend(result)
+                all_triples.extend((triple, direction) for triple in result)

        return all_triples

@ -325,7 +343,8 @@ class Query:
            # Deduplicate and filter already-seen edges
            hop_triples = []
            hop_term_map = {}
-            for triple in triples:
+            hop_directions = {}
+            for triple, direction in triples:
                triple_tuple = (str(triple.s), str(triple.p), str(triple.o))
                if triple_tuple[1] == LABEL:
                    continue
@ -336,6 +355,7 @@ class Query:
                hop_term_map[triple_tuple] = (
                    to_term(triple.s), to_term(triple.p), to_term(triple.o),
                )
+                hop_directions[triple_tuple] = direction

            if not hop_triples:
                visited_entities.update(frontier)
@ -361,7 +381,10 @@ class Query:
                else:
                    label_map[entity] = entity

-            # Build labeled edges and documents for cross-encoder
+            # Build labeled edges and documents for cross-encoder.
+            # The reranker text highlights the NEW information relative
+            # to the traversal direction: arriving from S means p,o are
+            # new; from O means s,p are new; from P means s,o are new.
            labeled_hop = []
            for s, p, o in hop_triples:
                ls = label_map.get(s, s)
@ -369,10 +392,18 @@ class Query:
                lo = label_map.get(o, o)
                labeled_hop.append((ls, lp, lo))

-            documents = [
-                {"id": str(i), "text": f"{lp} {lo}"}
-                for i, (ls, lp, lo) in enumerate(labeled_hop)
-            ]
+            documents = []
+            for i, (triple_tuple, (ls, lp, lo)) in enumerate(
+                zip(hop_triples, labeled_hop)
+            ):
+                direction = hop_directions[triple_tuple]
+                if direction == self.FROM_S:
+                    text = f"{lp} {lo}"
+                elif direction == self.FROM_O:
+                    text = f"{ls} {lp}"
+                else:
+                    text = f"{ls} {lo}"
+                documents.append({"id": str(i), "text": text})

            queries = [
                {"id": str(i), "text": c}