fix: prevent duplicate edges in Qdrant and fix sigma floor check

Duplicate edges root cause — critical logic bug in QdrantStore:
boost_edges() and add_contradicts_edge() always created new points with
random UUIDs instead of reusing existing point IDs. Each call therefore
added a duplicate point rather than updating the existing one in place,
causing 190 duplicate edge rows for 73 unique edges.

Sigma fix — compute_sigma() compared the total node count against the
floor (311 >= 200), even though the largest connected component (CC) had
only 2 nodes. It now checks the size of the largest CC, which is the
correct criterion for Humphries-Gurney sigma validity.
This commit is contained in:
Apunkt 2026-05-16 12:02:49 +02:00
parent 7eea3ced28
commit e2883cceaa
No known key found for this signature in database
2 changed files with 40 additions and 14 deletions

View file

@ -934,13 +934,15 @@ class QdrantStore:
if not coalesced:
return {}
# Fetch existing edges from metadata collection
# Fetch existing edges from metadata collection.
# existing_map: edge_key -> (point_id, weight) so we can update
# in-place by reusing the existing point ID (avoids duplicates).
all_edges = self._scroll_all(METADATA_TABLE, table_filter=EDGES_TABLE)
existing_map: dict[tuple[str, str, str], float] = {}
existing_map: dict[tuple[str, str, str], tuple[str, float]] = {}
for point in all_edges:
p = point.payload
edge_key = (p.get("src", ""), p.get("dst", ""), p.get("edge_type", ""))
existing_map[edge_key] = float(p.get("weight", 0.0))
existing_map[edge_key] = (str(point.id), float(p.get("weight", 0.0)))
now = datetime.now(timezone.utc).isoformat()
points_to_upsert: list[PointStruct] = []
@ -949,13 +951,15 @@ class QdrantStore:
for (src_str, dst_str), accum_delta in coalesced.items():
edge_key = (src_str, dst_str, edge_type)
if edge_key in existing_map:
nw = existing_map[edge_key] + accum_delta
nw = existing_map[edge_key][1] + accum_delta
point_id = existing_map[edge_key][0] # reuse existing point ID
else:
nw = accum_delta
point_id = str(uuid4()) # new edge → new ID
# Create payload-only point (use UUID string for Qdrant compatibility)
# Create payload-only point (reuses existing ID to update in-place)
points_to_upsert.append(PointStruct(
id=str(uuid4()),
id=point_id,
vector={},
payload={
"table": EDGES_TABLE,
@ -992,17 +996,33 @@ class QdrantStore:
return self.boost_edges([pair], delta=delta, edge_type=edge_type)
def add_contradicts_edge(self, original: UUID, new_id: UUID) -> None:
"""Add a contradicts edge in the metadata collection (table=edges)."""
"""Add or update a contradicts edge in the metadata collection (table=edges).
Reuses existing point ID if the edge already exists to avoid duplicates.
"""
src_str = str(original)
dst_str = str(new_id)
edge_key = (src_str, dst_str, "contradicts")
# Check if edge already exists
all_edges = self._scroll_all(METADATA_TABLE, table_filter=EDGES_TABLE)
point_id = str(uuid4()) # default: new edge
for point in all_edges:
p = point.payload
if (p.get("src"), p.get("dst"), p.get("edge_type")) == edge_key:
point_id = str(point.id)
break
self._client.upsert(
collection_name=METADATA_TABLE,
points=[PointStruct(
id=str(uuid4()),
id=point_id,
vector={},
payload={
"table": EDGES_TABLE,
"group_id": self._group_id,
"src": str(original),
"dst": str(new_id),
"src": src_str,
"dst": dst_str,
"edge_type": "contradicts",
"weight": 1.0,
"updated_at": datetime.now(timezone.utc).isoformat(),

View file

@ -151,11 +151,17 @@ def fast_sigma(
def compute_sigma(graph: "nx.Graph", *, seed: int = 42) -> Optional[float]:
"""D-SIGMA-01: sigma at N>=SIGMA_N_FLOOR; otherwise None.
Returns None for graphs with fewer than SIGMA_N_FLOOR nodes -- below
that threshold, the random-graph baselines are too noisy to interpret
(Humphries-Gurney 2008).
Returns None for graphs whose largest connected component has fewer
than SIGMA_N_FLOOR nodes -- below that threshold, the random-graph
baselines are too noisy to interpret (Humphries-Gurney 2008).
This checks the largest CC rather than total node count because a
graph with many isolated nodes (e.g. 311 nodes, 310 components) would
otherwise pass the floor check while the actual connected subgraph is
too small for meaningful sigma computation.
"""
if graph.number_of_nodes() < SIGMA_N_FLOOR:
g_cc = _largest_cc(graph)
if g_cc.number_of_nodes() < SIGMA_N_FLOOR:
return None
sigma_val, *_ = fast_sigma(graph, seed=seed)
if isinstance(sigma_val, float) and math.isnan(sigma_val):