feat: Vestige v1.3.0 — importance scoring, session checkpoints, duplicate detection

3 new MCP tools (16 → 19 total):
- importance_score: 4-channel neuroscience importance scoring (novelty/arousal/reward/attention)
- session_checkpoint: batch smart_ingest up to 20 items with PE Gating
- find_duplicates: cosine similarity clustering with union-find for dedup

CLI: vestige ingest command for memory ingestion via command line
Core: made get_node_embedding public, added get_all_embeddings for dedup scanning

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Sam Valladares 2026-02-12 05:02:09 -06:00
parent 5cca386d6b
commit 04a3062328
9 changed files with 848 additions and 5 deletions

View file

@ -0,0 +1,240 @@
//! Session Checkpoint Tool
//!
//! Batch smart_ingest for session-end saves. Accepts up to 20 items
//! in a single call, routing each through Prediction Error Gating.
use serde::Deserialize;
use serde_json::Value;
use std::sync::Arc;
use tokio::sync::Mutex;
use vestige_core::{IngestInput, Storage};
/// Input schema for session_checkpoint tool
/// JSON Schema for the `session_checkpoint` tool input.
///
/// Mirrors the runtime validation in `execute`: a required `items` array
/// (capped at 20 entries) where each entry requires `content` and may carry
/// `tags`, `node_type`, and `source`.
pub fn schema() -> Value {
    // Schema for a single checkpoint entry, composed into the top-level
    // object below to keep the nesting readable.
    let item_schema = serde_json::json!({
        "type": "object",
        "properties": {
            "content": {
                "type": "string",
                "description": "The content to remember"
            },
            "tags": {
                "type": "array",
                "items": { "type": "string" },
                "description": "Tags for categorization"
            },
            "node_type": {
                "type": "string",
                "description": "Type: fact, concept, event, person, place, note, pattern, decision",
                "default": "fact"
            },
            "source": {
                "type": "string",
                "description": "Source reference"
            }
        },
        "required": ["content"]
    });
    serde_json::json!({
        "type": "object",
        "properties": {
            "items": {
                "type": "array",
                "description": "Array of items to save (max 20). Each goes through Prediction Error Gating.",
                "maxItems": 20,
                "items": item_schema
            }
        },
        "required": ["items"]
    })
}
/// Deserialized top-level arguments for the `session_checkpoint` tool call.
#[derive(Debug, Deserialize)]
struct CheckpointArgs {
    // Batch of items to ingest; validated non-empty and <= 20 in `execute`.
    items: Vec<CheckpointItem>,
}
/// One entry in a checkpoint batch.
///
/// The published schema (`schema()`) advertises the snake_case key
/// `node_type`, while `rename_all = "camelCase"` makes serde expect
/// `nodeType`. The `alias` accepts the snake_case spelling as well, so
/// callers that follow the schema are not silently rejected.
#[derive(Debug, Deserialize)]
#[serde(rename_all = "camelCase")]
struct CheckpointItem {
    /// Required content to remember; blank content is skipped in `execute`.
    content: String,
    /// Optional categorization tags.
    tags: Option<Vec<String>>,
    /// Optional node type; defaults to "fact" at ingest time.
    #[serde(alias = "node_type")]
    node_type: Option<String>,
    /// Optional source reference.
    source: Option<String>,
}
/// Execute the `session_checkpoint` tool: batch-ingest a session's items.
///
/// Validates the batch (non-empty, at most 20 entries), then ingests each
/// item sequentially under a single storage lock. With the `embeddings` +
/// `vector-search` features enabled each item goes through `smart_ingest`
/// (Prediction Error Gating); otherwise plain `ingest` is used.
///
/// Returns a JSON object with a per-item `results` array and a `summary`
/// of created/updated/skipped/error counts; `success` is true only when
/// no item failed.
///
/// # Errors
/// Returns `Err` on missing/unparsable arguments, an empty `items` array,
/// or more than 20 items. Per-item ingest failures are reported inside the
/// result list, not as a top-level error.
pub async fn execute(
    storage: &Arc<Mutex<Storage>>,
    args: Option<Value>,
) -> Result<Value, String> {
    let args: CheckpointArgs = match args {
        Some(v) => serde_json::from_value(v).map_err(|e| format!("Invalid arguments: {}", e))?,
        None => return Err("Missing arguments".to_string()),
    };
    if args.items.is_empty() {
        return Err("Items array cannot be empty".to_string());
    }
    if args.items.len() > 20 {
        return Err("Maximum 20 items per checkpoint".to_string());
    }
    // One lock held for the whole batch; items are processed in order.
    let mut storage = storage.lock().await;
    let mut results = Vec::new();
    let mut created = 0u32;
    let mut updated = 0u32;
    let mut skipped = 0u32;
    let mut errors = 0u32;
    for (i, item) in args.items.into_iter().enumerate() {
        // Blank content is skipped (reported per item), not a hard error,
        // so one bad entry never aborts the rest of the batch.
        if item.content.trim().is_empty() {
            results.push(serde_json::json!({
                "index": i,
                "status": "skipped",
                "reason": "Empty content"
            }));
            skipped += 1;
            continue;
        }
        let input = IngestInput {
            content: item.content,
            node_type: item.node_type.unwrap_or_else(|| "fact".to_string()),
            source: item.source,
            // Neutral sentiment: checkpoints carry no affect signal here.
            sentiment_score: 0.0,
            sentiment_magnitude: 0.0,
            tags: item.tags.unwrap_or_default(),
            valid_from: None,
            valid_until: None,
        };
        #[cfg(all(feature = "embeddings", feature = "vector-search"))]
        {
            match storage.smart_ingest(input) {
                Ok(result) => {
                    // Map PE-gating decisions onto created/updated buckets.
                    // NOTE(review): decision strings assumed to match
                    // vestige-core's smart_ingest output — confirm there.
                    match result.decision.as_str() {
                        "create" | "supersede" | "replace" => created += 1,
                        "update" | "reinforce" | "merge" | "add_context" => updated += 1,
                        _ => created += 1,
                    }
                    results.push(serde_json::json!({
                        "index": i,
                        "status": "saved",
                        "decision": result.decision,
                        "nodeId": result.node.id,
                        "similarity": result.similarity,
                        "reason": result.reason
                    }));
                }
                Err(e) => {
                    errors += 1;
                    results.push(serde_json::json!({
                        "index": i,
                        "status": "error",
                        "reason": e.to_string()
                    }));
                }
            }
        }
        #[cfg(not(all(feature = "embeddings", feature = "vector-search")))]
        {
            // Fallback path without PE gating: every item is a plain create.
            match storage.ingest(input) {
                Ok(node) => {
                    created += 1;
                    results.push(serde_json::json!({
                        "index": i,
                        "status": "saved",
                        "decision": "create",
                        "nodeId": node.id,
                        "reason": "Embeddings not available - used regular ingest"
                    }));
                }
                Err(e) => {
                    errors += 1;
                    results.push(serde_json::json!({
                        "index": i,
                        "status": "error",
                        "reason": e.to_string()
                    }));
                }
            }
        }
    }
    Ok(serde_json::json!({
        "success": errors == 0,
        "summary": {
            "total": results.len(),
            "created": created,
            "updated": updated,
            "skipped": skipped,
            "errors": errors
        },
        "results": results
    }))
}
#[cfg(test)]
mod tests {
    use super::*;
    use tempfile::TempDir;
    /// Fresh storage in a temp dir; the TempDir is returned so the
    /// directory lives for the duration of the test.
    async fn test_storage() -> (Arc<Mutex<Storage>>, TempDir) {
        let dir = TempDir::new().unwrap();
        let storage = Storage::new(Some(dir.path().join("test.db"))).unwrap();
        (Arc::new(Mutex::new(storage)), dir)
    }
    // Schema sanity: object type with an `items` property.
    #[test]
    fn test_schema_has_required_fields() {
        let schema = schema();
        assert_eq!(schema["type"], "object");
        assert!(schema["properties"]["items"].is_object());
    }
    // An empty items array is rejected with a top-level error.
    #[tokio::test]
    async fn test_empty_items_fails() {
        let (storage, _dir) = test_storage().await;
        let result = execute(&storage, Some(serde_json::json!({ "items": [] }))).await;
        assert!(result.is_err());
    }
    // Two valid items produce a summary total of 2.
    #[tokio::test]
    async fn test_batch_ingest() {
        let (storage, _dir) = test_storage().await;
        let result = execute(
            &storage,
            Some(serde_json::json!({
                "items": [
                    { "content": "First checkpoint item", "tags": ["test"] },
                    { "content": "Second checkpoint item", "tags": ["test"] }
                ]
            })),
        )
        .await;
        assert!(result.is_ok());
        let value = result.unwrap();
        assert_eq!(value["summary"]["total"], 2);
    }
    // Blank content is counted as skipped, not as an error.
    #[tokio::test]
    async fn test_skips_empty_content() {
        let (storage, _dir) = test_storage().await;
        let result = execute(
            &storage,
            Some(serde_json::json!({
                "items": [
                    { "content": "Valid item" },
                    { "content": "" },
                    { "content": "Another valid item" }
                ]
            })),
        )
        .await;
        assert!(result.is_ok());
        let value = result.unwrap();
        assert_eq!(value["summary"]["skipped"], 1);
    }
}

View file

@ -0,0 +1,307 @@
//! Find Duplicates Tool
//!
//! Detects duplicate and near-duplicate memory clusters using
//! cosine similarity on stored embeddings. Uses union-find for
//! efficient clustering.
use serde::Deserialize;
use serde_json::Value;
use std::collections::HashMap;
use std::sync::Arc;
use tokio::sync::Mutex;
use vestige_core::Storage;
#[cfg(all(feature = "embeddings", feature = "vector-search"))]
use vestige_core::cosine_similarity;
/// Input schema for find_duplicates tool
/// JSON Schema for the `find_duplicates` tool input: an optional cosine
/// similarity threshold (0.5-1.0, default 0.80), a cluster limit
/// (1-100, default 20), and an optional ANY-match tag filter. No
/// properties are required; all have defaults.
pub fn schema() -> Value {
    serde_json::json!({
        "type": "object",
        "properties": {
            "similarity_threshold": {
                "type": "number",
                "description": "Minimum cosine similarity to consider as duplicate (0.0-1.0, default: 0.80)",
                "default": 0.80,
                "minimum": 0.5,
                "maximum": 1.0
            },
            "limit": {
                "type": "integer",
                "description": "Maximum number of duplicate clusters to return (default: 20)",
                "default": 20,
                "minimum": 1,
                "maximum": 100
            },
            "tags": {
                "type": "array",
                "items": { "type": "string" },
                "description": "Optional: only check memories with these tags (ANY match)"
            }
        }
    })
}
/// Deserialized arguments for the `find_duplicates` tool.
///
/// The published schema advertises the snake_case key
/// `similarity_threshold`, but `rename_all = "camelCase"` makes serde
/// expect `similarityThreshold`; the `alias` accepts the schema's
/// snake_case spelling as well so schema-following callers work.
#[derive(Debug, Deserialize)]
#[serde(rename_all = "camelCase")]
struct DedupArgs {
    /// Minimum cosine similarity to treat a pair as duplicates (default 0.80).
    #[serde(alias = "similarity_threshold")]
    similarity_threshold: Option<f64>,
    /// Maximum number of clusters to return (default 20).
    limit: Option<usize>,
    /// Optional ANY-match tag filter restricting which memories are scanned.
    tags: Option<Vec<String>>,
}
/// Disjoint-set (union-find) over indices `0..n`, used to group memories
/// into duplicate clusters. Uses path compression and union by rank.
struct UnionFind {
    parent: Vec<usize>,
    rank: Vec<usize>,
}
impl UnionFind {
    /// Create `n` singleton sets, each element its own root.
    fn new(n: usize) -> Self {
        UnionFind {
            parent: (0..n).collect(),
            rank: vec![0; n],
        }
    }
    /// Return the root of `x`'s set, compressing the path as we go.
    fn find(&mut self, x: usize) -> usize {
        // First pass: walk up to the root.
        let mut root = x;
        while self.parent[root] != root {
            root = self.parent[root];
        }
        // Second pass: point every node on the path directly at the root.
        let mut cur = x;
        while self.parent[cur] != root {
            let next = self.parent[cur];
            self.parent[cur] = root;
            cur = next;
        }
        root
    }
    /// Merge the sets containing `x` and `y` (no-op if already merged),
    /// attaching the lower-rank root under the higher-rank one.
    fn union(&mut self, x: usize, y: usize) {
        let (rx, ry) = (self.find(x), self.find(y));
        if rx == ry {
            return;
        }
        use std::cmp::Ordering;
        match self.rank[rx].cmp(&self.rank[ry]) {
            Ordering::Less => self.parent[rx] = ry,
            Ordering::Greater => self.parent[ry] = rx,
            Ordering::Equal => {
                self.parent[ry] = rx;
                self.rank[rx] += 1;
            }
        }
    }
}
/// Execute the `find_duplicates` tool.
///
/// Loads every stored embedding, optionally filters by tags (ANY match),
/// runs an O(n^2) pairwise cosine-similarity scan, and clusters matches
/// with union-find. Clusters with more than one member are returned,
/// largest first, capped at `limit`. Refuses to scan more than 2000
/// embeddings (the pairwise pass would be too expensive) and asks the
/// caller to narrow with tags instead.
///
/// # Errors
/// Returns `Err` on unparsable arguments or storage failures while
/// loading embeddings/nodes. Missing arguments fall back to defaults.
pub async fn execute(
    storage: &Arc<Mutex<Storage>>,
    args: Option<Value>,
) -> Result<Value, String> {
    let args: DedupArgs = match args {
        Some(v) => serde_json::from_value(v).map_err(|e| format!("Invalid arguments: {}", e))?,
        // No arguments at all means "use all defaults".
        None => DedupArgs {
            similarity_threshold: None,
            limit: None,
            tags: None,
        },
    };
    let threshold = args.similarity_threshold.unwrap_or(0.80) as f32;
    let limit = args.limit.unwrap_or(20);
    let tag_filter = args.tags.unwrap_or_default();
    #[cfg(all(feature = "embeddings", feature = "vector-search"))]
    {
        let storage = storage.lock().await;
        // Load all embeddings up front; (id, vector) pairs.
        let all_embeddings = storage
            .get_all_embeddings()
            .map_err(|e| format!("Failed to load embeddings: {}", e))?;
        if all_embeddings.is_empty() {
            return Ok(serde_json::json!({
                "clusters": [],
                "totalMemories": 0,
                "totalWithEmbeddings": 0,
                "message": "No embeddings found. Run consolidation first."
            }));
        }
        // Page through all nodes for metadata (content preview, retention, tags).
        let mut all_nodes = Vec::new();
        let mut offset = 0;
        loop {
            let batch = storage
                .get_all_nodes(500, offset)
                .map_err(|e| format!("Failed to load nodes: {}", e))?;
            let batch_len = batch.len();
            all_nodes.extend(batch);
            if batch_len < 500 {
                break;
            }
            offset += 500;
        }
        // Node lookup by id for cluster formatting below.
        let node_map: HashMap<String, &vestige_core::KnowledgeNode> =
            all_nodes.iter().map(|n| (n.id.clone(), n)).collect();
        // Filter by tags if specified (ANY-match; embeddings without a
        // corresponding node are dropped when a filter is active).
        let filtered_embeddings: Vec<(usize, &String, &Vec<f32>)> = all_embeddings
            .iter()
            .enumerate()
            .filter(|(_, (id, _))| {
                if tag_filter.is_empty() {
                    return true;
                }
                if let Some(node) = node_map.get(id) {
                    tag_filter.iter().any(|t| node.tags.contains(t))
                } else {
                    false
                }
            })
            .map(|(i, (id, vec))| (i, id, vec))
            .collect();
        let n = filtered_embeddings.len();
        if n > 2000 {
            return Ok(serde_json::json!({
                "warning": format!("Too many memories to scan ({} with embeddings). Filter by tags to reduce scope.", n),
                "totalMemories": all_nodes.len(),
                "totalWithEmbeddings": n
            }));
        }
        // O(n^2) pairwise similarity + union-find clustering.
        let mut uf = UnionFind::new(n);
        let mut similarities: Vec<(usize, usize, f32)> = Vec::new();
        for i in 0..n {
            for j in (i + 1)..n {
                let sim = cosine_similarity(&filtered_embeddings[i].2, &filtered_embeddings[j].2);
                if sim >= threshold {
                    uf.union(i, j);
                    similarities.push((i, j, sim));
                }
            }
        }
        // Group members by their set root.
        let mut cluster_map: HashMap<usize, Vec<usize>> = HashMap::new();
        for i in 0..n {
            let root = uf.find(i);
            cluster_map.entry(root).or_default().push(i);
        }
        // Only keep clusters with >1 member, sorted by size descending.
        let mut clusters: Vec<Vec<usize>> = cluster_map
            .into_values()
            .filter(|c| c.len() > 1)
            .collect();
        clusters.sort_by(|a, b| b.len().cmp(&a.len()));
        clusters.truncate(limit);
        // Symmetric similarity lookup for formatting.
        let mut sim_lookup: HashMap<(usize, usize), f32> = HashMap::new();
        for &(i, j, sim) in &similarities {
            sim_lookup.insert((i, j), sim);
            sim_lookup.insert((j, i), sim);
        }
        // Format output: each cluster's first member acts as the anchor.
        let cluster_results: Vec<Value> = clusters
            .iter()
            .enumerate()
            .map(|(ci, members)| {
                let anchor = members[0];
                let member_results: Vec<Value> = members
                    .iter()
                    .map(|&idx| {
                        let id = &filtered_embeddings[idx].1;
                        let node = node_map.get(id.as_str());
                        let content_preview = node
                            .map(|n| {
                                let c = n.content.replace('\n', " ");
                                // Truncate on char boundaries: byte-slicing
                                // `&c[..120]` panics when byte 120 falls
                                // inside a multi-byte UTF-8 character.
                                if c.chars().count() > 120 {
                                    let head: String = c.chars().take(120).collect();
                                    format!("{}...", head)
                                } else {
                                    c
                                }
                            })
                            .unwrap_or_default();
                        let sim_to_anchor = if idx == anchor {
                            1.0
                        } else {
                            // Pairs below threshold were never recorded;
                            // report 0.0 for those.
                            sim_lookup
                                .get(&(anchor, idx))
                                .copied()
                                .unwrap_or(0.0)
                        };
                        serde_json::json!({
                            "id": id,
                            "contentPreview": content_preview,
                            "retention": node.map(|n| n.retention_strength).unwrap_or(0.0),
                            "createdAt": node.map(|n| n.created_at.to_rfc3339()).unwrap_or_default(),
                            "tags": node.map(|n| &n.tags).unwrap_or(&vec![]),
                            "similarityToAnchor": format!("{:.3}", sim_to_anchor)
                        })
                    })
                    .collect();
                serde_json::json!({
                    "clusterId": ci,
                    "size": members.len(),
                    "members": member_results,
                    "suggestedAction": if members.len() > 3 { "review" } else { "merge" }
                })
            })
            .collect();
        Ok(serde_json::json!({
            "clusters": cluster_results,
            "totalClusters": cluster_results.len(),
            "totalMemories": all_nodes.len(),
            "totalWithEmbeddings": n,
            "threshold": threshold,
            // saturating_sub: the tag filter can leave n == 0, where
            // `n * (n - 1)` would underflow (panic in debug builds).
            "pairsChecked": n * n.saturating_sub(1) / 2
        }))
    }
    #[cfg(not(all(feature = "embeddings", feature = "vector-search")))]
    {
        Ok(serde_json::json!({
            "error": "Embeddings feature not enabled. Cannot compute similarities.",
            "clusters": []
        }))
    }
}
#[cfg(test)]
mod tests {
    use super::*;
    // Schema sanity: object type with a similarity_threshold property.
    #[test]
    fn test_schema() {
        let schema = schema();
        assert_eq!(schema["type"], "object");
        assert!(schema["properties"]["similarity_threshold"].is_object());
    }
    // Two pairs merged by a bridging union end up in one set; an
    // untouched element stays in its own set.
    #[test]
    fn test_union_find() {
        let mut uf = UnionFind::new(5);
        uf.union(0, 1);
        uf.union(2, 3);
        uf.union(1, 3);
        assert_eq!(uf.find(0), uf.find(3));
        assert_ne!(uf.find(0), uf.find(4));
    }
    // No arguments + empty storage should succeed (defaults, no clusters).
    #[tokio::test]
    async fn test_empty_storage() {
        let dir = tempfile::TempDir::new().unwrap();
        let storage = Storage::new(Some(dir.path().join("test.db"))).unwrap();
        let storage = Arc::new(Mutex::new(storage));
        let result = execute(&storage, None).await;
        assert!(result.is_ok());
    }
}

View file

@ -0,0 +1,140 @@
//! Importance Score Tool
//!
//! Exposes the 4-channel importance signaling system as an MCP tool.
//! Wraps ImportanceSignals::compute_importance() from vestige-core's
//! neuroscience module (dopamine/norepinephrine/acetylcholine/serotonin model).
use serde::Deserialize;
use serde_json::Value;
use std::sync::Arc;
use tokio::sync::Mutex;
use vestige_core::{ImportanceContext, ImportanceSignals, Storage};
/// Input schema for importance_score tool
/// JSON Schema for the `importance_score` tool input: required `content`
/// plus optional `context_topics` and `project` that seed the scoring
/// context.
pub fn schema() -> Value {
    serde_json::json!({
        "type": "object",
        "properties": {
            "content": {
                "type": "string",
                "description": "The content to score for importance"
            },
            "context_topics": {
                "type": "array",
                "items": { "type": "string" },
                "description": "Optional topics for novelty detection context"
            },
            "project": {
                "type": "string",
                "description": "Optional project/codebase name for context"
            }
        },
        "required": ["content"]
    })
}
/// Deserialized arguments for the `importance_score` tool.
///
/// The published schema advertises the snake_case key `context_topics`,
/// but `rename_all = "camelCase"` makes serde expect `contextTopics`;
/// the `alias` accepts the schema's snake_case spelling as well so
/// schema-following callers work.
#[derive(Debug, Deserialize)]
#[serde(rename_all = "camelCase")]
struct ImportanceArgs {
    /// Required content to score; rejected in `execute` when blank.
    content: String,
    /// Optional topics fed into the scoring context.
    #[serde(alias = "context_topics")]
    context_topics: Option<Vec<String>>,
    /// Optional project/codebase name fed into the scoring context.
    project: Option<String>,
}
/// Execute the `importance_score` tool.
///
/// Scores `content` with vestige-core's 4-channel importance model and
/// returns the composite score, per-channel values
/// (novelty/arousal/reward/attention), the encoding boost, consolidation
/// priority, the channel weights used, and per-channel explanations.
/// Storage is never consulted; the handle exists only for signature
/// parity with the other tool handlers.
///
/// # Errors
/// Returns `Err` on missing/unparsable arguments or empty content.
pub async fn execute(
    _storage: &Arc<Mutex<Storage>>,
    args: Option<Value>,
) -> Result<Value, String> {
    let args: ImportanceArgs = match args {
        Some(v) => serde_json::from_value(v).map_err(|e| format!("Invalid arguments: {}", e))?,
        None => return Err("Missing arguments".to_string()),
    };
    if args.content.trim().is_empty() {
        return Err("Content cannot be empty".to_string());
    }
    let signals = ImportanceSignals::new();
    // Build the scoring context, layering in optional project/topic hints.
    // NOTE(review): `current()` presumably captures ambient (e.g. time)
    // context — confirm semantics in vestige-core.
    let mut context = ImportanceContext::current();
    if let Some(project) = args.project {
        context = context.with_project(project);
    }
    if let Some(topics) = args.context_topics {
        context = context.with_tags(topics);
    }
    let score = signals.compute_importance(&args.content, &context);
    // Explanations are Debug-formatted because their concrete types are
    // opaque here; priority likewise uses its Debug representation.
    Ok(serde_json::json!({
        "composite": score.composite,
        "channels": {
            "novelty": score.novelty,
            "arousal": score.arousal,
            "reward": score.reward,
            "attention": score.attention
        },
        "encodingBoost": score.encoding_boost,
        "consolidationPriority": format!("{:?}", score.consolidation_priority),
        "weightsUsed": {
            "novelty": score.weights_used.novelty,
            "arousal": score.weights_used.arousal,
            "reward": score.weights_used.reward,
            "attention": score.weights_used.attention
        },
        "explanations": {
            "novelty": score.novelty_explanation.as_ref().map(|e| format!("{:?}", e)),
            "arousal": score.arousal_explanation.as_ref().map(|e| format!("{:?}", e)),
            "reward": score.reward_explanation.as_ref().map(|e| format!("{:?}", e)),
            "attention": score.attention_explanation.as_ref().map(|e| format!("{:?}", e))
        },
        "summary": score.summary(),
        "dominantSignal": score.dominant_signal()
    }))
}
#[cfg(test)]
mod tests {
    use super::*;
    use tempfile::TempDir;
    /// Fresh storage in a unique temp dir. The previous fixed
    /// `/tmp/test_importance*.db` paths collide across parallel runs and
    /// leak state between runs; TempDir matches the sibling tool tests
    /// and is cleaned up on drop.
    async fn test_storage() -> (Arc<Mutex<Storage>>, TempDir) {
        let dir = TempDir::new().unwrap();
        let storage = Storage::new(Some(dir.path().join("test.db"))).unwrap();
        (Arc::new(Mutex::new(storage)), dir)
    }
    // Schema sanity: content is present and required.
    #[test]
    fn test_schema_has_required_fields() {
        let schema = schema();
        assert_eq!(schema["type"], "object");
        assert!(schema["properties"]["content"].is_object());
        assert!(schema["required"]
            .as_array()
            .unwrap()
            .contains(&serde_json::json!("content")));
    }
    // Blank content is rejected with a top-level error.
    #[tokio::test]
    async fn test_empty_content_fails() {
        let (storage, _dir) = test_storage().await;
        let result = execute(&storage, Some(serde_json::json!({ "content": "" }))).await;
        assert!(result.is_err());
    }
    // A normal request yields numeric channel scores and a dominant signal.
    #[tokio::test]
    async fn test_basic_importance_score() {
        let (storage, _dir) = test_storage().await;
        let result = execute(
            &storage,
            Some(serde_json::json!({
                "content": "CRITICAL: Production database migration failed with data loss!"
            })),
        )
        .await;
        assert!(result.is_ok());
        let value = result.unwrap();
        assert!(value["composite"].as_f64().is_some());
        assert!(value["channels"]["novelty"].as_f64().is_some());
        assert!(value["channels"]["arousal"].as_f64().is_some());
        assert!(value["dominantSignal"].is_string());
    }
}

View file

@ -21,6 +21,11 @@ pub mod timeline;
// v1.2: Maintenance tools
pub mod maintenance;
// v1.3: Auto-save and dedup tools
pub mod checkpoint;
pub mod dedup;
pub mod importance;
// Deprecated tools - kept for internal backwards compatibility
// These modules are intentionally unused in the public API
#[allow(dead_code)]