fix: prevent duplicate edges in Qdrant and fix sigma floor check

Duplicate edges root cause — a critical logic bug in QdrantStore: boost_edges() and add_contradicts_edge() always created new points with random UUIDs instead of reusing the existing point IDs. Each call on an already-stored edge therefore added a duplicate point rather than updating it in place, causing 190 duplicate edge rows for 73 unique edges. Sigma fix — compute_sigma() checked the total node count against the floor (311 >= 200, so the check passed) even though the largest connected component (CC) had only 2 nodes. It now checks the size of the largest CC, which is the correct criterion for Humphries-Gurney sigma validity.
2026-05-16 12:02:49 +02:00 · 2026-05-16 12:02:49 +02:00 · e2883cceaa
commit e2883cceaa
parent 7eea3ced28
2 changed files with 40 additions and 14 deletions
--- a/src/iai_mcp/qdrant_store.py
+++ b/src/iai_mcp/qdrant_store.py
@ -934,13 +934,15 @@ class QdrantStore:
        if not coalesced:
            return {}

-        # Fetch existing edges from metadata collection
+        # Fetch existing edges from metadata collection.
+        # existing_map: edge_key -> (point_id, weight) so we can update
+        # in-place by reusing the existing point ID (avoids duplicates).
        all_edges = self._scroll_all(METADATA_TABLE, table_filter=EDGES_TABLE)
-        existing_map: dict[tuple[str, str, str], float] = {}
+        existing_map: dict[tuple[str, str, str], tuple[str, float]] = {}
        for point in all_edges:
            p = point.payload
            edge_key = (p.get("src", ""), p.get("dst", ""), p.get("edge_type", ""))
-            existing_map[edge_key] = float(p.get("weight", 0.0))
+            existing_map[edge_key] = (str(point.id), float(p.get("weight", 0.0)))

        now = datetime.now(timezone.utc).isoformat()
        points_to_upsert: list[PointStruct] = []
@ -949,13 +951,15 @@ class QdrantStore:
        for (src_str, dst_str), accum_delta in coalesced.items():
            edge_key = (src_str, dst_str, edge_type)
            if edge_key in existing_map:
-                nw = existing_map[edge_key] + accum_delta
+                nw = existing_map[edge_key][1] + accum_delta
+                point_id = existing_map[edge_key][0]  # reuse existing point ID
            else:
                nw = accum_delta
+                point_id = str(uuid4())  # new edge → new ID

-            # Create payload-only point (use UUID string for Qdrant compatibility)
+            # Create payload-only point (reuses existing ID to update in-place)
            points_to_upsert.append(PointStruct(
-                id=str(uuid4()),
+                id=point_id,
                vector={},
                payload={
                    "table": EDGES_TABLE,
@ -992,17 +996,33 @@ class QdrantStore:
        return self.boost_edges([pair], delta=delta, edge_type=edge_type)

    def add_contradicts_edge(self, original: UUID, new_id: UUID) -> None:
-        """Add a contradicts edge in the metadata collection (table=edges)."""
+        """Add or update a contradicts edge in the metadata collection (table=edges).
+
+        Reuses existing point ID if the edge already exists to avoid duplicates.
+        """
+        src_str = str(original)
+        dst_str = str(new_id)
+        edge_key = (src_str, dst_str, "contradicts")
+
+        # Check if edge already exists
+        all_edges = self._scroll_all(METADATA_TABLE, table_filter=EDGES_TABLE)
+        point_id = str(uuid4())  # default: new edge
+        for point in all_edges:
+            p = point.payload
+            if (p.get("src"), p.get("dst"), p.get("edge_type")) == edge_key:
+                point_id = str(point.id)
+                break
+
        self._client.upsert(
            collection_name=METADATA_TABLE,
            points=[PointStruct(
-                id=str(uuid4()),
+                id=point_id,
                vector={},
                payload={
                    "table": EDGES_TABLE,
                    "group_id": self._group_id,
-                    "src": str(original),
-                    "dst": str(new_id),
+                    "src": src_str,
+                    "dst": dst_str,
                    "edge_type": "contradicts",
                    "weight": 1.0,
                    "updated_at": datetime.now(timezone.utc).isoformat(),
--- a/src/iai_mcp/sigma.py
+++ b/src/iai_mcp/sigma.py
@ -151,11 +151,17 @@ def fast_sigma(
 def compute_sigma(graph: "nx.Graph", *, seed: int = 42) -> Optional[float]:
    """D-SIGMA-01: sigma at N>=SIGMA_N_FLOOR; otherwise None.

-    Returns None for graphs with fewer than SIGMA_N_FLOOR nodes -- below
-    that threshold, the random-graph baselines are too noisy to interpret
-    (Humphries-Gurney 2008).
+    Returns None for graphs whose largest connected component has fewer
+    than SIGMA_N_FLOOR nodes -- below that threshold, the random-graph
+    baselines are too noisy to interpret (Humphries-Gurney 2008).
+
+    This checks the largest CC rather than total node count because a
+    graph with many isolated nodes (e.g. 311 nodes, 310 components) would
+    otherwise pass the floor check while the actual connected subgraph is
+    too small for meaningful sigma computation.
    """
-    if graph.number_of_nodes() < SIGMA_N_FLOOR:
+    g_cc = _largest_cc(graph)
+    if g_cc.number_of_nodes() < SIGMA_N_FLOOR:
        return None
    sigma_val, *_ = fast_sigma(graph, seed=seed)
    if isinstance(sigma_val, float) and math.isnan(sigma_val):