From 5d46ebfd30c40d5ed25c42ba1a8b236008ffef23 Mon Sep 17 00:00:00 2001 From: Bot Date: Mon, 20 Apr 2026 15:00:20 -0500 Subject: [PATCH] fix(timeline): push node_type and tags filters into SQL WHERE MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit memory_timeline ran node_type and tags as Rust-side `retain` after `query_time_range`, which applied `LIMIT` in SQL before the retain saw anything. Against a corpus where one tag or type dominates, a sparse match could be crowded out of the limit window — the tool reported "no matches" when matches existed. Fix: thread `node_type: Option<&str>` and `tags: Option<&[String]>` through `query_time_range` and apply both as `WHERE` predicates so `LIMIT` kicks in after filtering. Tag matching uses `tags LIKE '%"tag"%'` — the quoted pattern pins to exact tags and rejects substring false positives (e.g. `alpha` no longer matches `alphabet`). Regression tests in `tools/timeline.rs`: - test_timeline_node_type_filter_sparse: 10 `fact` + 2 `concept`, `limit=5`, query `concept` — asserts 2 rows; fails on pre-fix code. - test_timeline_tag_filter_sparse: 10 rows tagged `common` + 2 tagged `rare`, `limit=5`, query `rare` — asserts 2 rows; same shape for tags. - test_timeline_tag_filter_exact_match: one `alpha` row + one `alphabet` row, query `alpha` — asserts exactly 1 row. Dashboard caller updated to pass `None, None` for the new filter params. 19/19 timeline tests pass; 1295/1295 workspace tests pass; clippy clean on vestige-core and vestige-mcp. Ported from the Unforgettable/Anamnesis fork. Co-Authored-By: Claude Opus 4.7 (1M context) --- crates/vestige-core/src/storage/sqlite.rs | 96 ++++++++------ crates/vestige-mcp/src/dashboard/handlers.rs | 2 +- crates/vestige-mcp/src/tools/timeline.rs | 131 +++++++++++++++++-- 3 files changed, 177 insertions(+), 52 deletions(-) diff --git a/crates/vestige-core/src/storage/sqlite.rs b/crates/vestige-core/src/storage/sqlite.rs index 7de250a..81197cb 100644 --- a/crates/vestige-core/src/storage/sqlite.rs +++ b/crates/vestige-core/src/storage/sqlite.rs @@ -2077,61 +2077,77 @@ impl Storage { Ok(result) } - /// Query memories created/modified in a time range + /// Query memories created/modified in a time range, optionally filtered by + /// `node_type` and/or `tags`. + /// + /// All filters are pushed into the SQL `WHERE` clause so that `LIMIT` is + /// applied AFTER filtering. If filters were applied in Rust after `LIMIT`, + /// sparse types/tags could be crowded out by a dominant set within the + /// limit window — e.g. a query for a rare tag against a corpus where + /// every day has hundreds of rows with a common tag would return 0 + /// matches after `LIMIT` crowded the rare-tag rows out. + /// + /// Tag filtering uses `tags LIKE '%"tag"%'` — an exact-match JSON pattern + /// that keys off the quote characters around each tag in the stored JSON + /// array. This avoids the substring-match false positive where `alpha` + /// would otherwise match `alphabet`. pub fn query_time_range( &self, start: Option>, end: Option>, limit: i32, + node_type: Option<&str>, + tags: Option<&[String]>, ) -> Result> { let start_str = start.map(|dt| dt.to_rfc3339()); let end_str = end.map(|dt| dt.to_rfc3339()); - let (query, params): (&str, Vec>) = match (&start_str, &end_str) { - (Some(s), Some(e)) => ( - "SELECT * FROM knowledge_nodes - WHERE created_at >= ?1 AND created_at <= ?2 - ORDER BY created_at DESC - LIMIT ?3", - vec![ - Box::new(s.clone()) as Box, - Box::new(e.clone()) as Box, - Box::new(limit) as Box, - ], - ), - (Some(s), None) => ( - "SELECT * FROM knowledge_nodes - WHERE created_at >= ?1 - ORDER BY created_at DESC - LIMIT ?2", - vec![ - Box::new(s.clone()) as Box, - Box::new(limit) as Box, - ], - ), - (None, Some(e)) => ( - "SELECT * FROM knowledge_nodes - WHERE created_at <= ?1 - ORDER BY created_at DESC - LIMIT ?2", - vec![ - Box::new(e.clone()) as Box, - Box::new(limit) as Box, - ], - ), - (None, None) => ( - "SELECT * FROM knowledge_nodes - ORDER BY created_at DESC - LIMIT ?1", - vec![Box::new(limit) as Box], - ), + let mut conditions: Vec = Vec::new(); + let mut params: Vec> = Vec::new(); + let mut idx = 1; + + if let Some(ref s) = start_str { + conditions.push(format!("created_at >= ?{}", idx)); + params.push(Box::new(s.clone()) as Box); + idx += 1; + } + if let Some(ref e) = end_str { + conditions.push(format!("created_at <= ?{}", idx)); + params.push(Box::new(e.clone()) as Box); + idx += 1; + } + if let Some(nt) = node_type { + conditions.push(format!("LOWER(node_type) = LOWER(?{})", idx)); + params.push(Box::new(nt.to_string()) as Box); + idx += 1; + } + if let Some(tag_list) = tags.filter(|t| !t.is_empty()) { + let mut tag_conditions = Vec::new(); + for tag in tag_list { + tag_conditions.push(format!("tags LIKE ?{}", idx)); + params.push(Box::new(format!("%\"{}\"%", tag)) as Box); + idx += 1; + } + conditions.push(format!("({})", tag_conditions.join(" OR "))); + } + + let where_clause = if conditions.is_empty() { + String::new() + } else { + format!("WHERE {}", conditions.join(" AND ")) }; + let query = format!( + "SELECT * FROM knowledge_nodes {} ORDER BY created_at DESC LIMIT ?{}", + where_clause, idx + ); + params.push(Box::new(limit) as Box); + let reader = self .reader .lock() .map_err(|_| StorageError::Init("Reader lock poisoned".into()))?; - let mut stmt = reader.prepare(query)?; + let mut stmt = reader.prepare(&query)?; let params_refs: Vec<&dyn rusqlite::ToSql> = params.iter().map(|p| p.as_ref()).collect(); let nodes = stmt.query_map(params_refs.as_slice(), Self::row_to_node)?; diff --git a/crates/vestige-mcp/src/dashboard/handlers.rs b/crates/vestige-mcp/src/dashboard/handlers.rs index d1c3187..28d4431 100644 --- a/crates/vestige-mcp/src/dashboard/handlers.rs +++ b/crates/vestige-mcp/src/dashboard/handlers.rs @@ -384,7 +384,7 @@ pub async fn get_timeline( let start = Utc::now() - Duration::days(days); let nodes = state .storage - .query_time_range(Some(start), Some(Utc::now()), limit) + .query_time_range(Some(start), Some(Utc::now()), limit, None, None) .map_err(|_| StatusCode::INTERNAL_SERVER_ERROR)?; // Group by day diff --git a/crates/vestige-mcp/src/tools/timeline.rs b/crates/vestige-mcp/src/tools/timeline.rs index d52588d..f73983a 100644 --- a/crates/vestige-mcp/src/tools/timeline.rs +++ b/crates/vestige-mcp/src/tools/timeline.rs @@ -126,19 +126,20 @@ pub async fn execute(storage: &Arc, args: Option) -> Result> = BTreeMap::new(); for node in &results { @@ -204,6 +205,28 @@ mod tests { .unwrap(); } + /// Ingest with explicit node_type and tags. Used by the sparse-filter + /// regression tests so the dominant and sparse sets can be told apart. + async fn ingest_typed( + storage: &Arc, + content: &str, + node_type: &str, + tags: &[&str], + ) { + storage + .ingest(vestige_core::IngestInput { + content: content.to_string(), + node_type: node_type.to_string(), + source: None, + sentiment_score: 0.0, + sentiment_magnitude: 0.0, + tags: tags.iter().map(|t| t.to_string()).collect(), + valid_from: None, + valid_until: None, + }) + .unwrap(); + } + #[test] fn test_schema_has_properties() { let s = schema(); @@ -357,4 +380,90 @@ mod tests { let value = result.unwrap(); assert_eq!(value["totalMemories"], 0); } + + /// Regression: `node_type` filter must work even when the sparse type is + /// crowded out by a dominant type within the SQL `LIMIT`. Before the fix, + /// `query_time_range` applied `LIMIT` before the Rust-side `retain`, so a + /// limit of 5 against 10 dominant + 2 sparse rows returned 5 dominant, + /// then filtered to 0 sparse. + #[tokio::test] + async fn test_timeline_node_type_filter_sparse() { + let (storage, _dir) = test_storage().await; + + // Dominant set: 10 facts + for i in 0..10 { + ingest_typed(&storage, &format!("Dominant memory {}", i), "fact", &["alpha"]).await; + } + // Sparse set: 2 concepts + for i in 0..2 { + ingest_typed(&storage, &format!("Sparse memory {}", i), "concept", &["beta"]).await; + } + + // Limit 5 against 12 total — before the fix, `retain` on `concept` + // would operate on the 5 most recent rows (all `fact`) and find 0. + let args = serde_json::json!({ "node_type": "concept", "limit": 5 }); + let value = execute(&storage, Some(args)).await.unwrap(); + assert_eq!( + value["totalMemories"], 2, + "Both sparse concepts should survive a limit smaller than the dominant set" + ); + + // Also verify the storage layer directly, so the contract is pinned + // at the API boundary even if the tool wrapper shifts. + let nodes = storage + .query_time_range(None, None, 5, Some("concept"), None) + .unwrap(); + assert_eq!(nodes.len(), 2); + assert!(nodes.iter().all(|n| n.node_type == "concept")); + } + + /// Regression: `tags` filter must work even when the sparse tag is + /// crowded out by a dominant tag within the SQL `LIMIT`. Parallel to + /// the node_type sparse case — same `retain`-after-`LIMIT` bug. + #[tokio::test] + async fn test_timeline_tag_filter_sparse() { + let (storage, _dir) = test_storage().await; + + // Dominant set: 10 memories with tag "common" + for i in 0..10 { + ingest_typed(&storage, &format!("Common memory {}", i), "fact", &["common"]).await; + } + // Sparse set: 2 memories with tag "rare" + for i in 0..2 { + ingest_typed(&storage, &format!("Rare memory {}", i), "fact", &["rare"]).await; + } + + let args = serde_json::json!({ "tags": ["rare"], "limit": 5 }); + let value = execute(&storage, Some(args)).await.unwrap(); + assert_eq!( + value["totalMemories"], 2, + "Both sparse-tag matches should survive a limit smaller than the dominant set" + ); + + let tag_slice = vec!["rare".to_string()]; + let nodes = storage + .query_time_range(None, None, 5, None, Some(&tag_slice)) + .unwrap(); + assert_eq!(nodes.len(), 2); + assert!(nodes.iter().all(|n| n.tags.iter().any(|t| t == "rare"))); + } + + /// Regression: tag filter must match exact tags, not substrings. Without + /// the `"tag"`-wrapped `LIKE` pattern, a query for `alpha` would also + /// match rows tagged `alphabet`. The pattern `%"alpha"%` keys off the + /// JSON-array quote characters and rejects that. + #[tokio::test] + async fn test_timeline_tag_filter_exact_match() { + let (storage, _dir) = test_storage().await; + + ingest_typed(&storage, "Exact tag hit", "fact", &["alpha"]).await; + ingest_typed(&storage, "Substring decoy", "fact", &["alphabet"]).await; + + let tag_slice = vec!["alpha".to_string()]; + let nodes = storage + .query_time_range(None, None, 50, None, Some(&tag_slice)) + .unwrap(); + assert_eq!(nodes.len(), 1, "Only the exact-tag match should return"); + assert_eq!(nodes[0].content, "Exact tag hit"); + } }