2026-02-12 05:02:09 -06:00
|
|
|
//! Find Duplicates Tool
|
|
|
|
|
//!
|
|
|
|
|
//! Detects duplicate and near-duplicate memory clusters using
|
|
|
|
|
//! cosine similarity on stored embeddings. Uses union-find for
|
|
|
|
|
//! efficient clustering.
|
|
|
|
|
|
|
|
|
|
use serde::Deserialize;
|
|
|
|
|
use serde_json::Value;
|
|
|
|
|
use std::collections::HashMap;
|
|
|
|
|
use std::sync::Arc;
|
|
|
|
|
use tokio::sync::Mutex;
|
|
|
|
|
|
|
|
|
|
use vestige_core::Storage;
|
|
|
|
|
#[cfg(all(feature = "embeddings", feature = "vector-search"))]
|
|
|
|
|
use vestige_core::cosine_similarity;
|
|
|
|
|
|
|
|
|
|
/// Input schema for find_duplicates tool
|
|
|
|
|
pub fn schema() -> Value {
|
|
|
|
|
serde_json::json!({
|
|
|
|
|
"type": "object",
|
|
|
|
|
"properties": {
|
|
|
|
|
"similarity_threshold": {
|
|
|
|
|
"type": "number",
|
|
|
|
|
"description": "Minimum cosine similarity to consider as duplicate (0.0-1.0, default: 0.80)",
|
|
|
|
|
"default": 0.80,
|
|
|
|
|
"minimum": 0.5,
|
|
|
|
|
"maximum": 1.0
|
|
|
|
|
},
|
|
|
|
|
"limit": {
|
|
|
|
|
"type": "integer",
|
|
|
|
|
"description": "Maximum number of duplicate clusters to return (default: 20)",
|
|
|
|
|
"default": 20,
|
|
|
|
|
"minimum": 1,
|
|
|
|
|
"maximum": 100
|
|
|
|
|
},
|
|
|
|
|
"tags": {
|
|
|
|
|
"type": "array",
|
|
|
|
|
"items": { "type": "string" },
|
|
|
|
|
"description": "Optional: only check memories with these tags (ANY match)"
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
})
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
#[derive(Debug, Deserialize)]
|
|
|
|
|
#[serde(rename_all = "camelCase")]
|
|
|
|
|
struct DedupArgs {
|
|
|
|
|
similarity_threshold: Option<f64>,
|
|
|
|
|
limit: Option<usize>,
|
|
|
|
|
tags: Option<Vec<String>>,
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/// Disjoint-set forest used to merge similar items into clusters.
///
/// Elements are the indices `0..n`; each starts out in its own singleton set.
struct UnionFind {
    /// `parent[i]` is the parent of element `i`; a root is its own parent.
    parent: Vec<usize>,
    /// Per-root rank, consulted when merging to decide which root survives.
    rank: Vec<usize>,
}

impl UnionFind {
    /// Creates `n` singleton sets, one per index in `0..n`.
    fn new(n: usize) -> Self {
        let parent: Vec<usize> = (0..n).collect();
        let rank = vec![0; n];
        Self { parent, rank }
    }

    /// Returns the representative (root) of the set containing `x`.
    ///
    /// Every element visited on the way up is re-pointed directly at the
    /// root (path compression). Panics if `x` is out of range.
    fn find(&mut self, x: usize) -> usize {
        // First pass: climb to the root.
        let mut root = x;
        while self.parent[root] != root {
            root = self.parent[root];
        }

        // Second pass: attach every node on the path straight to the root.
        let mut node = x;
        while node != root {
            let next = self.parent[node];
            self.parent[node] = root;
            node = next;
        }

        root
    }

    /// Merges the sets containing `x` and `y`; a no-op if they already coincide.
    ///
    /// The root with the smaller rank is attached beneath the other. On a tie
    /// the root of `x` survives and its rank grows by one.
    fn union(&mut self, x: usize, y: usize) {
        let (root_x, root_y) = (self.find(x), self.find(y));
        if root_x == root_y {
            return;
        }

        match self.rank[root_x].cmp(&self.rank[root_y]) {
            std::cmp::Ordering::Less => self.parent[root_x] = root_y,
            std::cmp::Ordering::Greater => self.parent[root_y] = root_x,
            std::cmp::Ordering::Equal => {
                self.parent[root_y] = root_x;
                self.rank[root_x] += 1;
            }
        }
    }
}
|
|
|
|
|
|
|
|
|
|
pub async fn execute(
|
|
|
|
|
storage: &Arc<Mutex<Storage>>,
|
|
|
|
|
args: Option<Value>,
|
|
|
|
|
) -> Result<Value, String> {
|
|
|
|
|
let args: DedupArgs = match args {
|
|
|
|
|
Some(v) => serde_json::from_value(v).map_err(|e| format!("Invalid arguments: {}", e))?,
|
|
|
|
|
None => DedupArgs {
|
|
|
|
|
similarity_threshold: None,
|
|
|
|
|
limit: None,
|
|
|
|
|
tags: None,
|
|
|
|
|
},
|
|
|
|
|
};
|
|
|
|
|
|
|
|
|
|
let threshold = args.similarity_threshold.unwrap_or(0.80) as f32;
|
|
|
|
|
let limit = args.limit.unwrap_or(20);
|
|
|
|
|
let tag_filter = args.tags.unwrap_or_default();
|
|
|
|
|
|
|
|
|
|
#[cfg(all(feature = "embeddings", feature = "vector-search"))]
|
|
|
|
|
{
|
|
|
|
|
let storage = storage.lock().await;
|
|
|
|
|
|
|
|
|
|
// Load all embeddings
|
|
|
|
|
let all_embeddings = storage
|
|
|
|
|
.get_all_embeddings()
|
|
|
|
|
.map_err(|e| format!("Failed to load embeddings: {}", e))?;
|
|
|
|
|
|
|
|
|
|
if all_embeddings.is_empty() {
|
|
|
|
|
return Ok(serde_json::json!({
|
|
|
|
|
"clusters": [],
|
|
|
|
|
"totalMemories": 0,
|
|
|
|
|
"totalWithEmbeddings": 0,
|
|
|
|
|
"message": "No embeddings found. Run consolidation first."
|
|
|
|
|
}));
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// Load nodes for metadata (content preview, retention, tags)
|
|
|
|
|
let mut all_nodes = Vec::new();
|
|
|
|
|
let mut offset = 0;
|
|
|
|
|
loop {
|
|
|
|
|
let batch = storage
|
|
|
|
|
.get_all_nodes(500, offset)
|
|
|
|
|
.map_err(|e| format!("Failed to load nodes: {}", e))?;
|
|
|
|
|
let batch_len = batch.len();
|
|
|
|
|
all_nodes.extend(batch);
|
|
|
|
|
if batch_len < 500 {
|
|
|
|
|
break;
|
|
|
|
|
}
|
|
|
|
|
offset += 500;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// Build node lookup
|
|
|
|
|
let node_map: HashMap<String, &vestige_core::KnowledgeNode> =
|
|
|
|
|
all_nodes.iter().map(|n| (n.id.clone(), n)).collect();
|
|
|
|
|
|
|
|
|
|
// Filter by tags if specified
|
|
|
|
|
let filtered_embeddings: Vec<(usize, &String, &Vec<f32>)> = all_embeddings
|
|
|
|
|
.iter()
|
|
|
|
|
.enumerate()
|
|
|
|
|
.filter(|(_, (id, _))| {
|
|
|
|
|
if tag_filter.is_empty() {
|
|
|
|
|
return true;
|
|
|
|
|
}
|
|
|
|
|
if let Some(node) = node_map.get(id) {
|
|
|
|
|
tag_filter.iter().any(|t| node.tags.contains(t))
|
|
|
|
|
} else {
|
|
|
|
|
false
|
|
|
|
|
}
|
|
|
|
|
})
|
|
|
|
|
.map(|(i, (id, vec))| (i, id, vec))
|
|
|
|
|
.collect();
|
|
|
|
|
|
|
|
|
|
let n = filtered_embeddings.len();
|
|
|
|
|
|
|
|
|
|
if n > 2000 {
|
|
|
|
|
return Ok(serde_json::json!({
|
|
|
|
|
"warning": format!("Too many memories to scan ({} with embeddings). Filter by tags to reduce scope.", n),
|
|
|
|
|
"totalMemories": all_nodes.len(),
|
|
|
|
|
"totalWithEmbeddings": n
|
|
|
|
|
}));
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// O(n^2) pairwise similarity + union-find clustering
|
|
|
|
|
let mut uf = UnionFind::new(n);
|
|
|
|
|
let mut similarities: Vec<(usize, usize, f32)> = Vec::new();
|
|
|
|
|
|
|
|
|
|
for i in 0..n {
|
|
|
|
|
for j in (i + 1)..n {
|
chore: license AGPL-3.0, zero clippy warnings, CHANGELOG through v1.6.0
License:
- Replace MIT/Apache-2.0 with AGPL-3.0-only across all crates and npm packages
- Replace LICENSE file with official GNU AGPL-3.0 text
- Remove LICENSE-MIT and LICENSE-APACHE
Code quality:
- Fix all 44 clippy warnings (zero remaining)
- Collapsible if statements, redundant closures, manual Option::map
- Remove duplicate #[allow(dead_code)] attributes in deprecated tool modules
- Add Default impl for CognitiveEngine
- Replace manual sort_by with sort_by_key
Documentation:
- Update CHANGELOG with v1.2.0, v1.3.0, v1.5.0, v1.6.0 entries
- Update README with v1.6.0 highlights and accurate stats (52K lines, 1100+ tests)
- Add fastembed-rs/ to .gitignore
- Add fastembed-rs to workspace exclude
1115 tests passing, zero warnings, RUSTFLAGS="-Dwarnings" clean.
2026-02-19 03:00:39 -06:00
|
|
|
let sim = cosine_similarity(filtered_embeddings[i].2, filtered_embeddings[j].2);
|
2026-02-12 05:02:09 -06:00
|
|
|
if sim >= threshold {
|
|
|
|
|
uf.union(i, j);
|
|
|
|
|
similarities.push((i, j, sim));
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// Group into clusters
|
|
|
|
|
let mut cluster_map: HashMap<usize, Vec<usize>> = HashMap::new();
|
|
|
|
|
for i in 0..n {
|
|
|
|
|
let root = uf.find(i);
|
|
|
|
|
cluster_map.entry(root).or_default().push(i);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// Only keep clusters with >1 member, sorted by size descending
|
|
|
|
|
let mut clusters: Vec<Vec<usize>> = cluster_map
|
|
|
|
|
.into_values()
|
|
|
|
|
.filter(|c| c.len() > 1)
|
|
|
|
|
.collect();
|
chore: license AGPL-3.0, zero clippy warnings, CHANGELOG through v1.6.0
License:
- Replace MIT/Apache-2.0 with AGPL-3.0-only across all crates and npm packages
- Replace LICENSE file with official GNU AGPL-3.0 text
- Remove LICENSE-MIT and LICENSE-APACHE
Code quality:
- Fix all 44 clippy warnings (zero remaining)
- Collapsible if statements, redundant closures, manual Option::map
- Remove duplicate #[allow(dead_code)] attributes in deprecated tool modules
- Add Default impl for CognitiveEngine
- Replace manual sort_by with sort_by_key
Documentation:
- Update CHANGELOG with v1.2.0, v1.3.0, v1.5.0, v1.6.0 entries
- Update README with v1.6.0 highlights and accurate stats (52K lines, 1100+ tests)
- Add fastembed-rs/ to .gitignore
- Add fastembed-rs to workspace exclude
1115 tests passing, zero warnings, RUSTFLAGS="-Dwarnings" clean.
2026-02-19 03:00:39 -06:00
|
|
|
clusters.sort_by_key(|b| std::cmp::Reverse(b.len()));
|
2026-02-12 05:02:09 -06:00
|
|
|
clusters.truncate(limit);
|
|
|
|
|
|
|
|
|
|
// Build similarity lookup for formatting
|
|
|
|
|
let mut sim_lookup: HashMap<(usize, usize), f32> = HashMap::new();
|
|
|
|
|
for &(i, j, sim) in &similarities {
|
|
|
|
|
sim_lookup.insert((i, j), sim);
|
|
|
|
|
sim_lookup.insert((j, i), sim);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// Format output
|
|
|
|
|
let cluster_results: Vec<Value> = clusters
|
|
|
|
|
.iter()
|
|
|
|
|
.enumerate()
|
|
|
|
|
.map(|(ci, members)| {
|
|
|
|
|
let anchor = members[0];
|
|
|
|
|
let member_results: Vec<Value> = members
|
|
|
|
|
.iter()
|
|
|
|
|
.map(|&idx| {
|
|
|
|
|
let id = &filtered_embeddings[idx].1;
|
|
|
|
|
let node = node_map.get(id.as_str());
|
|
|
|
|
let content_preview = node
|
|
|
|
|
.map(|n| {
|
|
|
|
|
let c = n.content.replace('\n', " ");
|
|
|
|
|
if c.len() > 120 {
|
|
|
|
|
format!("{}...", &c[..120])
|
|
|
|
|
} else {
|
|
|
|
|
c
|
|
|
|
|
}
|
|
|
|
|
})
|
|
|
|
|
.unwrap_or_default();
|
|
|
|
|
|
|
|
|
|
let sim_to_anchor = if idx == anchor {
|
|
|
|
|
1.0
|
|
|
|
|
} else {
|
|
|
|
|
sim_lookup
|
|
|
|
|
.get(&(anchor, idx))
|
|
|
|
|
.copied()
|
|
|
|
|
.unwrap_or(0.0)
|
|
|
|
|
};
|
|
|
|
|
|
|
|
|
|
serde_json::json!({
|
|
|
|
|
"id": id,
|
|
|
|
|
"contentPreview": content_preview,
|
|
|
|
|
"retention": node.map(|n| n.retention_strength).unwrap_or(0.0),
|
|
|
|
|
"createdAt": node.map(|n| n.created_at.to_rfc3339()).unwrap_or_default(),
|
|
|
|
|
"tags": node.map(|n| &n.tags).unwrap_or(&vec![]),
|
|
|
|
|
"similarityToAnchor": format!("{:.3}", sim_to_anchor)
|
|
|
|
|
})
|
|
|
|
|
})
|
|
|
|
|
.collect();
|
|
|
|
|
|
|
|
|
|
serde_json::json!({
|
|
|
|
|
"clusterId": ci,
|
|
|
|
|
"size": members.len(),
|
|
|
|
|
"members": member_results,
|
|
|
|
|
"suggestedAction": if members.len() > 3 { "review" } else { "merge" }
|
|
|
|
|
})
|
|
|
|
|
})
|
|
|
|
|
.collect();
|
|
|
|
|
|
|
|
|
|
Ok(serde_json::json!({
|
|
|
|
|
"clusters": cluster_results,
|
|
|
|
|
"totalClusters": cluster_results.len(),
|
|
|
|
|
"totalMemories": all_nodes.len(),
|
|
|
|
|
"totalWithEmbeddings": n,
|
|
|
|
|
"threshold": threshold,
|
|
|
|
|
"pairsChecked": n * (n - 1) / 2
|
|
|
|
|
}))
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
#[cfg(not(all(feature = "embeddings", feature = "vector-search")))]
|
|
|
|
|
{
|
|
|
|
|
Ok(serde_json::json!({
|
|
|
|
|
"error": "Embeddings feature not enabled. Cannot compute similarities.",
|
|
|
|
|
"clusters": []
|
|
|
|
|
}))
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
#[cfg(test)]
mod tests {
    use super::*;

    /// The schema is an object schema that exposes the similarity threshold property.
    #[test]
    fn test_schema() {
        let tool_schema = schema();
        assert_eq!(tool_schema["type"], "object");

        let threshold_prop = &tool_schema["properties"]["similarity_threshold"];
        assert!(threshold_prop.is_object());
    }

    /// Unions chain transitively; untouched elements stay in their own set.
    #[test]
    fn test_union_find() {
        let mut sets = UnionFind::new(5);
        for (a, b) in [(0, 1), (2, 3), (1, 3)] {
            sets.union(a, b);
        }

        let root_of_zero = sets.find(0);
        assert_eq!(root_of_zero, sets.find(3));
        assert_ne!(sets.find(0), sets.find(4));
    }

    /// Running the tool against a freshly created database succeeds.
    #[tokio::test]
    async fn test_empty_storage() {
        let tmp = tempfile::TempDir::new().unwrap();
        let db_path = tmp.path().join("test.db");
        let shared = Arc::new(Mutex::new(Storage::new(Some(db_path)).unwrap()));

        let outcome = execute(&shared, None).await;
        assert!(outcome.is_ok());
    }
}
|