fix: prevent duplicate edges in Qdrant and fix sigma floor check
Duplicate edges, root cause: a critical logic bug in QdrantStore. boost_edges() and add_contradicts_edge() always created new points with random UUIDs instead of reusing existing point IDs, so each boost call added a duplicate point rather than updating in place, causing 190 duplicate edge rows for 73 unique edges. Sigma fix: compute_sigma() checked the total node count (311 >= 200 floor), but the largest connected component had only 2 nodes. It now checks the size of the largest connected component, which is the correct semantic for Humphries-Gurney sigma validity.
This commit is contained in:
parent
7eea3ced28
commit
e2883cceaa
2 changed files with 40 additions and 14 deletions
|
|
@ -934,13 +934,15 @@ class QdrantStore:
|
|||
if not coalesced:
|
||||
return {}
|
||||
|
||||
# Fetch existing edges from metadata collection
|
||||
# Fetch existing edges from metadata collection.
|
||||
# existing_map: edge_key -> (point_id, weight) so we can update
|
||||
# in-place by reusing the existing point ID (avoids duplicates).
|
||||
all_edges = self._scroll_all(METADATA_TABLE, table_filter=EDGES_TABLE)
|
||||
existing_map: dict[tuple[str, str, str], float] = {}
|
||||
existing_map: dict[tuple[str, str, str], tuple[str, float]] = {}
|
||||
for point in all_edges:
|
||||
p = point.payload
|
||||
edge_key = (p.get("src", ""), p.get("dst", ""), p.get("edge_type", ""))
|
||||
existing_map[edge_key] = float(p.get("weight", 0.0))
|
||||
existing_map[edge_key] = (str(point.id), float(p.get("weight", 0.0)))
|
||||
|
||||
now = datetime.now(timezone.utc).isoformat()
|
||||
points_to_upsert: list[PointStruct] = []
|
||||
|
|
@ -949,13 +951,15 @@ class QdrantStore:
|
|||
for (src_str, dst_str), accum_delta in coalesced.items():
|
||||
edge_key = (src_str, dst_str, edge_type)
|
||||
if edge_key in existing_map:
|
||||
nw = existing_map[edge_key] + accum_delta
|
||||
nw = existing_map[edge_key][1] + accum_delta
|
||||
point_id = existing_map[edge_key][0] # reuse existing point ID
|
||||
else:
|
||||
nw = accum_delta
|
||||
point_id = str(uuid4()) # new edge → new ID
|
||||
|
||||
# Create payload-only point (use UUID string for Qdrant compatibility)
|
||||
# Create payload-only point (reuses existing ID to update in-place)
|
||||
points_to_upsert.append(PointStruct(
|
||||
id=str(uuid4()),
|
||||
id=point_id,
|
||||
vector={},
|
||||
payload={
|
||||
"table": EDGES_TABLE,
|
||||
|
|
@ -992,17 +996,33 @@ class QdrantStore:
|
|||
return self.boost_edges([pair], delta=delta, edge_type=edge_type)
|
||||
|
||||
def add_contradicts_edge(self, original: UUID, new_id: UUID) -> None:
|
||||
"""Add a contradicts edge in the metadata collection (table=edges)."""
|
||||
"""Add or update a contradicts edge in the metadata collection (table=edges).
|
||||
|
||||
Reuses existing point ID if the edge already exists to avoid duplicates.
|
||||
"""
|
||||
src_str = str(original)
|
||||
dst_str = str(new_id)
|
||||
edge_key = (src_str, dst_str, "contradicts")
|
||||
|
||||
# Check if edge already exists
|
||||
all_edges = self._scroll_all(METADATA_TABLE, table_filter=EDGES_TABLE)
|
||||
point_id = str(uuid4()) # default: new edge
|
||||
for point in all_edges:
|
||||
p = point.payload
|
||||
if (p.get("src"), p.get("dst"), p.get("edge_type")) == edge_key:
|
||||
point_id = str(point.id)
|
||||
break
|
||||
|
||||
self._client.upsert(
|
||||
collection_name=METADATA_TABLE,
|
||||
points=[PointStruct(
|
||||
id=str(uuid4()),
|
||||
id=point_id,
|
||||
vector={},
|
||||
payload={
|
||||
"table": EDGES_TABLE,
|
||||
"group_id": self._group_id,
|
||||
"src": str(original),
|
||||
"dst": str(new_id),
|
||||
"src": src_str,
|
||||
"dst": dst_str,
|
||||
"edge_type": "contradicts",
|
||||
"weight": 1.0,
|
||||
"updated_at": datetime.now(timezone.utc).isoformat(),
|
||||
|
|
|
|||
|
|
@ -151,11 +151,17 @@ def fast_sigma(
|
|||
def compute_sigma(graph: "nx.Graph", *, seed: int = 42) -> Optional[float]:
|
||||
"""D-SIGMA-01: sigma at N>=SIGMA_N_FLOOR; otherwise None.
|
||||
|
||||
Returns None for graphs with fewer than SIGMA_N_FLOOR nodes -- below
|
||||
that threshold, the random-graph baselines are too noisy to interpret
|
||||
(Humphries-Gurney 2008).
|
||||
Returns None for graphs whose largest connected component has fewer
|
||||
than SIGMA_N_FLOOR nodes -- below that threshold, the random-graph
|
||||
baselines are too noisy to interpret (Humphries-Gurney 2008).
|
||||
|
||||
This checks the largest CC rather than total node count because a
|
||||
graph with many isolated nodes (e.g. 311 nodes, 310 components) would
|
||||
otherwise pass the floor check while the actual connected subgraph is
|
||||
too small for meaningful sigma computation.
|
||||
"""
|
||||
if graph.number_of_nodes() < SIGMA_N_FLOOR:
|
||||
g_cc = _largest_cc(graph)
|
||||
if g_cc.number_of_nodes() < SIGMA_N_FLOOR:
|
||||
return None
|
||||
sigma_val, *_ = fast_sigma(graph, seed=seed)
|
||||
if isinstance(sigma_val, float) and math.isnan(sigma_val):
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue