fix: prevent duplicate edges in Qdrant and fix sigma floor check
Duplicate edges root cause — critical logic bug in QdrantStore: boost_edges() and add_contradicts_edge() always created new points with random UUIDs instead of reusing existing point IDs. Each boost call therefore added a duplicate point rather than updating the existing one in place, causing 190 duplicate edge rows for 73 unique edges. Sigma fix — compute_sigma() checked the total node count (311 >= 200 floor) even though the largest connected component had only 2 nodes. compute_sigma() now checks the size of the largest connected component (CC), which is the correct semantics for Humphries-Gurney sigma validity.
This commit is contained in:
parent
7eea3ced28
commit
e2883cceaa
2 changed files with 40 additions and 14 deletions
|
|
@ -934,13 +934,15 @@ class QdrantStore:
|
||||||
if not coalesced:
|
if not coalesced:
|
||||||
return {}
|
return {}
|
||||||
|
|
||||||
# Fetch existing edges from metadata collection
|
# Fetch existing edges from metadata collection.
|
||||||
|
# existing_map: edge_key -> (point_id, weight) so we can update
|
||||||
|
# in-place by reusing the existing point ID (avoids duplicates).
|
||||||
all_edges = self._scroll_all(METADATA_TABLE, table_filter=EDGES_TABLE)
|
all_edges = self._scroll_all(METADATA_TABLE, table_filter=EDGES_TABLE)
|
||||||
existing_map: dict[tuple[str, str, str], float] = {}
|
existing_map: dict[tuple[str, str, str], tuple[str, float]] = {}
|
||||||
for point in all_edges:
|
for point in all_edges:
|
||||||
p = point.payload
|
p = point.payload
|
||||||
edge_key = (p.get("src", ""), p.get("dst", ""), p.get("edge_type", ""))
|
edge_key = (p.get("src", ""), p.get("dst", ""), p.get("edge_type", ""))
|
||||||
existing_map[edge_key] = float(p.get("weight", 0.0))
|
existing_map[edge_key] = (str(point.id), float(p.get("weight", 0.0)))
|
||||||
|
|
||||||
now = datetime.now(timezone.utc).isoformat()
|
now = datetime.now(timezone.utc).isoformat()
|
||||||
points_to_upsert: list[PointStruct] = []
|
points_to_upsert: list[PointStruct] = []
|
||||||
|
|
@ -949,13 +951,15 @@ class QdrantStore:
|
||||||
for (src_str, dst_str), accum_delta in coalesced.items():
|
for (src_str, dst_str), accum_delta in coalesced.items():
|
||||||
edge_key = (src_str, dst_str, edge_type)
|
edge_key = (src_str, dst_str, edge_type)
|
||||||
if edge_key in existing_map:
|
if edge_key in existing_map:
|
||||||
nw = existing_map[edge_key] + accum_delta
|
nw = existing_map[edge_key][1] + accum_delta
|
||||||
|
point_id = existing_map[edge_key][0] # reuse existing point ID
|
||||||
else:
|
else:
|
||||||
nw = accum_delta
|
nw = accum_delta
|
||||||
|
point_id = str(uuid4()) # new edge → new ID
|
||||||
|
|
||||||
# Create payload-only point (use UUID string for Qdrant compatibility)
|
# Create payload-only point (reuses existing ID to update in-place)
|
||||||
points_to_upsert.append(PointStruct(
|
points_to_upsert.append(PointStruct(
|
||||||
id=str(uuid4()),
|
id=point_id,
|
||||||
vector={},
|
vector={},
|
||||||
payload={
|
payload={
|
||||||
"table": EDGES_TABLE,
|
"table": EDGES_TABLE,
|
||||||
|
|
@ -992,17 +996,33 @@ class QdrantStore:
|
||||||
return self.boost_edges([pair], delta=delta, edge_type=edge_type)
|
return self.boost_edges([pair], delta=delta, edge_type=edge_type)
|
||||||
|
|
||||||
def add_contradicts_edge(self, original: UUID, new_id: UUID) -> None:
|
def add_contradicts_edge(self, original: UUID, new_id: UUID) -> None:
|
||||||
"""Add a contradicts edge in the metadata collection (table=edges)."""
|
"""Add or update a contradicts edge in the metadata collection (table=edges).
|
||||||
|
|
||||||
|
Reuses existing point ID if the edge already exists to avoid duplicates.
|
||||||
|
"""
|
||||||
|
src_str = str(original)
|
||||||
|
dst_str = str(new_id)
|
||||||
|
edge_key = (src_str, dst_str, "contradicts")
|
||||||
|
|
||||||
|
# Check if edge already exists
|
||||||
|
all_edges = self._scroll_all(METADATA_TABLE, table_filter=EDGES_TABLE)
|
||||||
|
point_id = str(uuid4()) # default: new edge
|
||||||
|
for point in all_edges:
|
||||||
|
p = point.payload
|
||||||
|
if (p.get("src"), p.get("dst"), p.get("edge_type")) == edge_key:
|
||||||
|
point_id = str(point.id)
|
||||||
|
break
|
||||||
|
|
||||||
self._client.upsert(
|
self._client.upsert(
|
||||||
collection_name=METADATA_TABLE,
|
collection_name=METADATA_TABLE,
|
||||||
points=[PointStruct(
|
points=[PointStruct(
|
||||||
id=str(uuid4()),
|
id=point_id,
|
||||||
vector={},
|
vector={},
|
||||||
payload={
|
payload={
|
||||||
"table": EDGES_TABLE,
|
"table": EDGES_TABLE,
|
||||||
"group_id": self._group_id,
|
"group_id": self._group_id,
|
||||||
"src": str(original),
|
"src": src_str,
|
||||||
"dst": str(new_id),
|
"dst": dst_str,
|
||||||
"edge_type": "contradicts",
|
"edge_type": "contradicts",
|
||||||
"weight": 1.0,
|
"weight": 1.0,
|
||||||
"updated_at": datetime.now(timezone.utc).isoformat(),
|
"updated_at": datetime.now(timezone.utc).isoformat(),
|
||||||
|
|
|
||||||
|
|
@ -151,11 +151,17 @@ def fast_sigma(
|
||||||
def compute_sigma(graph: "nx.Graph", *, seed: int = 42) -> Optional[float]:
|
def compute_sigma(graph: "nx.Graph", *, seed: int = 42) -> Optional[float]:
|
||||||
"""D-SIGMA-01: sigma at N>=SIGMA_N_FLOOR; otherwise None.
|
"""D-SIGMA-01: sigma at N>=SIGMA_N_FLOOR; otherwise None.
|
||||||
|
|
||||||
Returns None for graphs with fewer than SIGMA_N_FLOOR nodes -- below
|
Returns None for graphs whose largest connected component has fewer
|
||||||
that threshold, the random-graph baselines are too noisy to interpret
|
than SIGMA_N_FLOOR nodes -- below that threshold, the random-graph
|
||||||
(Humphries-Gurney 2008).
|
baselines are too noisy to interpret (Humphries-Gurney 2008).
|
||||||
|
|
||||||
|
This checks the largest CC rather than total node count because a
|
||||||
|
graph with many isolated nodes (e.g. 311 nodes, 310 components) would
|
||||||
|
otherwise pass the floor check while the actual connected subgraph is
|
||||||
|
too small for meaningful sigma computation.
|
||||||
"""
|
"""
|
||||||
if graph.number_of_nodes() < SIGMA_N_FLOOR:
|
g_cc = _largest_cc(graph)
|
||||||
|
if g_cc.number_of_nodes() < SIGMA_N_FLOOR:
|
||||||
return None
|
return None
|
||||||
sigma_val, *_ = fast_sigma(graph, seed=seed)
|
sigma_val, *_ = fast_sigma(graph, seed=seed)
|
||||||
if isinstance(sigma_val, float) and math.isnan(sigma_val):
|
if isinstance(sigma_val, float) and math.isnan(sigma_val):
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue