Merge pull request #37 from Unforgettable-93/unforgettable-timeline-tags-filter

fix(timeline): push node_type and tags filters into SQL WHERE
2026-04-25 00:36:22 +02:00 · 2026-04-20 18:21:58 -05:00 · 2026-04-20 18:21:58 -05:00 · 2391acf480
commit 2391acf480
parent 30d92b5371 5d46ebfd30
3 changed files with 177 additions and 52 deletions
--- a/crates/vestige-core/src/storage/sqlite.rs
+++ b/crates/vestige-core/src/storage/sqlite.rs
@ -2077,61 +2077,77 @@ impl Storage {
        Ok(result)
    }

-    /// Query memories created/modified in a time range
+    /// Query memories created/modified in a time range, optionally filtered by
+    /// `node_type` and/or `tags`.
+    ///
+    /// All filters are pushed into the SQL `WHERE` clause so that `LIMIT` is
+    /// applied AFTER filtering. If filters were applied in Rust after `LIMIT`,
+    /// sparse types/tags could be crowded out by a dominant set within the
+    /// limit window — e.g. a query for a rare tag against a corpus where
+    /// every day has hundreds of rows with a common tag would return 0
+    /// matches after `LIMIT` crowded the rare-tag rows out.
+    ///
+    /// Tag filtering uses `tags LIKE '%"tag"%'` — an exact-match JSON pattern
+    /// that keys off the quote characters around each tag in the stored JSON
+    /// array. This avoids the substring-match false positive where `alpha`
+    /// would otherwise match `alphabet`.
    pub fn query_time_range(
        &self,
        start: Option<DateTime<Utc>>,
        end: Option<DateTime<Utc>>,
        limit: i32,
+        node_type: Option<&str>,
+        tags: Option<&[String]>,
    ) -> Result<Vec<KnowledgeNode>> {
        let start_str = start.map(|dt| dt.to_rfc3339());
        let end_str = end.map(|dt| dt.to_rfc3339());

-        let (query, params): (&str, Vec<Box<dyn rusqlite::ToSql>>) = match (&start_str, &end_str) {
-            (Some(s), Some(e)) => (
-                "SELECT * FROM knowledge_nodes
-                 WHERE created_at >= ?1 AND created_at <= ?2
-                 ORDER BY created_at DESC
-                 LIMIT ?3",
-                vec![
-                    Box::new(s.clone()) as Box<dyn rusqlite::ToSql>,
-                    Box::new(e.clone()) as Box<dyn rusqlite::ToSql>,
-                    Box::new(limit) as Box<dyn rusqlite::ToSql>,
-                ],
-            ),
-            (Some(s), None) => (
-                "SELECT * FROM knowledge_nodes
-                 WHERE created_at >= ?1
-                 ORDER BY created_at DESC
-                 LIMIT ?2",
-                vec![
-                    Box::new(s.clone()) as Box<dyn rusqlite::ToSql>,
-                    Box::new(limit) as Box<dyn rusqlite::ToSql>,
-                ],
-            ),
-            (None, Some(e)) => (
-                "SELECT * FROM knowledge_nodes
-                 WHERE created_at <= ?1
-                 ORDER BY created_at DESC
-                 LIMIT ?2",
-                vec![
-                    Box::new(e.clone()) as Box<dyn rusqlite::ToSql>,
-                    Box::new(limit) as Box<dyn rusqlite::ToSql>,
-                ],
-            ),
-            (None, None) => (
-                "SELECT * FROM knowledge_nodes
-                 ORDER BY created_at DESC
-                 LIMIT ?1",
-                vec![Box::new(limit) as Box<dyn rusqlite::ToSql>],
-            ),
+        let mut conditions: Vec<String> = Vec::new();
+        let mut params: Vec<Box<dyn rusqlite::ToSql>> = Vec::new();
+        let mut idx = 1;
+
+        if let Some(ref s) = start_str {
+            conditions.push(format!("created_at >= ?{}", idx));
+            params.push(Box::new(s.clone()) as Box<dyn rusqlite::ToSql>);
+            idx += 1;
+        }
+        if let Some(ref e) = end_str {
+            conditions.push(format!("created_at <= ?{}", idx));
+            params.push(Box::new(e.clone()) as Box<dyn rusqlite::ToSql>);
+            idx += 1;
+        }
+        if let Some(nt) = node_type {
+            conditions.push(format!("LOWER(node_type) = LOWER(?{})", idx));
+            params.push(Box::new(nt.to_string()) as Box<dyn rusqlite::ToSql>);
+            idx += 1;
+        }
+        if let Some(tag_list) = tags.filter(|t| !t.is_empty()) {
+            let mut tag_conditions = Vec::new();
+            for tag in tag_list {
+                tag_conditions.push(format!("tags LIKE ?{}", idx));
+                params.push(Box::new(format!("%\"{}\"%", tag)) as Box<dyn rusqlite::ToSql>);
+                idx += 1;
+            }
+            conditions.push(format!("({})", tag_conditions.join(" OR ")));
+        }
+
+        let where_clause = if conditions.is_empty() {
+            String::new()
+        } else {
+            format!("WHERE {}", conditions.join(" AND "))
        };

+        let query = format!(
+            "SELECT * FROM knowledge_nodes {} ORDER BY created_at DESC LIMIT ?{}",
+            where_clause, idx
+        );
+        params.push(Box::new(limit) as Box<dyn rusqlite::ToSql>);
+
        let reader = self
            .reader
            .lock()
            .map_err(|_| StorageError::Init("Reader lock poisoned".into()))?;
-        let mut stmt = reader.prepare(query)?;
+        let mut stmt = reader.prepare(&query)?;
        let params_refs: Vec<&dyn rusqlite::ToSql> = params.iter().map(|p| p.as_ref()).collect();
        let nodes = stmt.query_map(params_refs.as_slice(), Self::row_to_node)?;

--- a/crates/vestige-mcp/src/dashboard/handlers.rs
+++ b/crates/vestige-mcp/src/dashboard/handlers.rs
@ -384,7 +384,7 @@ pub async fn get_timeline(
    let start = Utc::now() - Duration::days(days);
    let nodes = state
        .storage
-        .query_time_range(Some(start), Some(Utc::now()), limit)
+        .query_time_range(Some(start), Some(Utc::now()), limit, None, None)
        .map_err(|_| StatusCode::INTERNAL_SERVER_ERROR)?;

    // Group by day
--- a/crates/vestige-mcp/src/tools/timeline.rs
+++ b/crates/vestige-mcp/src/tools/timeline.rs
@ -126,19 +126,20 @@ pub async fn execute(storage: &Arc<Storage>, args: Option<Value>) -> Result<Valu

    let limit = args.limit.unwrap_or(50).clamp(1, 200);

-    // Query memories in time range
-    let mut results = storage
-        .query_time_range(start, end, limit)
+    // Query memories in time range with filters pushed into SQL. Rust-side
+    // `retain` after `LIMIT` was unsafe for sparse types/tags — a dominant
+    // set could crowd the sparse matches out of the limit window and leave
+    // the retain with 0 rows to keep.
+    let results = storage
+        .query_time_range(
+            start,
+            end,
+            limit,
+            args.node_type.as_deref(),
+            args.tags.as_deref(),
+        )
        .map_err(|e| e.to_string())?;

-    // Post-query filters
-    if let Some(ref node_type) = args.node_type {
-        results.retain(|n| n.node_type == *node_type);
-    }
-    if let Some(tags) = args.tags.as_ref().filter(|t| !t.is_empty()) {
-        results.retain(|n| tags.iter().any(|t| n.tags.contains(t)));
-    }
-
    // Group by day
    let mut by_day: BTreeMap<NaiveDate, Vec<Value>> = BTreeMap::new();
    for node in &results {
@ -204,6 +205,28 @@ mod tests {
            .unwrap();
    }

+    /// Ingest with explicit node_type and tags. Used by the sparse-filter
+    /// regression tests so the dominant and sparse sets can be told apart.
+    async fn ingest_typed(
+        storage: &Arc<Storage>,
+        content: &str,
+        node_type: &str,
+        tags: &[&str],
+    ) {
+        storage
+            .ingest(vestige_core::IngestInput {
+                content: content.to_string(),
+                node_type: node_type.to_string(),
+                source: None,
+                sentiment_score: 0.0,
+                sentiment_magnitude: 0.0,
+                tags: tags.iter().map(|t| t.to_string()).collect(),
+                valid_from: None,
+                valid_until: None,
+            })
+            .unwrap();
+    }
+
    #[test]
    fn test_schema_has_properties() {
        let s = schema();
@ -357,4 +380,90 @@ mod tests {
        let value = result.unwrap();
        assert_eq!(value["totalMemories"], 0);
    }
+
+    /// Regression: `node_type` filter must work even when the sparse type is
+    /// crowded out by a dominant type within the SQL `LIMIT`. Before the fix,
+    /// `query_time_range` applied `LIMIT` before the Rust-side `retain`, so a
+    /// limit of 5 against 10 dominant + 2 sparse rows returned 5 dominant,
+    /// then filtered to 0 sparse.
+    #[tokio::test]
+    async fn test_timeline_node_type_filter_sparse() {
+        let (storage, _dir) = test_storage().await;
+
+        // Dominant set: 10 facts
+        for i in 0..10 {
+            ingest_typed(&storage, &format!("Dominant memory {}", i), "fact", &["alpha"]).await;
+        }
+        // Sparse set: 2 concepts
+        for i in 0..2 {
+            ingest_typed(&storage, &format!("Sparse memory {}", i), "concept", &["beta"]).await;
+        }
+
+        // Limit 5 against 12 total — before the fix, `retain` on `concept`
+        // would operate on the 5 most recent rows (all `fact`) and find 0.
+        let args = serde_json::json!({ "node_type": "concept", "limit": 5 });
+        let value = execute(&storage, Some(args)).await.unwrap();
+        assert_eq!(
+            value["totalMemories"], 2,
+            "Both sparse concepts should survive a limit smaller than the dominant set"
+        );
+
+        // Also verify the storage layer directly, so the contract is pinned
+        // at the API boundary even if the tool wrapper shifts.
+        let nodes = storage
+            .query_time_range(None, None, 5, Some("concept"), None)
+            .unwrap();
+        assert_eq!(nodes.len(), 2);
+        assert!(nodes.iter().all(|n| n.node_type == "concept"));
+    }
+
+    /// Regression: `tags` filter must work even when the sparse tag is
+    /// crowded out by a dominant tag within the SQL `LIMIT`. Parallel to
+    /// the node_type sparse case — same `retain`-after-`LIMIT` bug.
+    #[tokio::test]
+    async fn test_timeline_tag_filter_sparse() {
+        let (storage, _dir) = test_storage().await;
+
+        // Dominant set: 10 memories with tag "common"
+        for i in 0..10 {
+            ingest_typed(&storage, &format!("Common memory {}", i), "fact", &["common"]).await;
+        }
+        // Sparse set: 2 memories with tag "rare"
+        for i in 0..2 {
+            ingest_typed(&storage, &format!("Rare memory {}", i), "fact", &["rare"]).await;
+        }
+
+        let args = serde_json::json!({ "tags": ["rare"], "limit": 5 });
+        let value = execute(&storage, Some(args)).await.unwrap();
+        assert_eq!(
+            value["totalMemories"], 2,
+            "Both sparse-tag matches should survive a limit smaller than the dominant set"
+        );
+
+        let tag_slice = vec!["rare".to_string()];
+        let nodes = storage
+            .query_time_range(None, None, 5, None, Some(&tag_slice))
+            .unwrap();
+        assert_eq!(nodes.len(), 2);
+        assert!(nodes.iter().all(|n| n.tags.iter().any(|t| t == "rare")));
+    }
+
+    /// Regression: tag filter must match exact tags, not substrings. Without
+    /// the `"tag"`-wrapped `LIKE` pattern, a query for `alpha` would also
+    /// match rows tagged `alphabet`. The pattern `%"alpha"%` keys off the
+    /// JSON-array quote characters and rejects that.
+    #[tokio::test]
+    async fn test_timeline_tag_filter_exact_match() {
+        let (storage, _dir) = test_storage().await;
+
+        ingest_typed(&storage, "Exact tag hit", "fact", &["alpha"]).await;
+        ingest_typed(&storage, "Substring decoy", "fact", &["alphabet"]).await;
+
+        let tag_slice = vec!["alpha".to_string()];
+        let nodes = storage
+            .query_time_range(None, None, 50, None, Some(&tag_slice))
+            .unwrap();
+        assert_eq!(nodes.len(), 1, "Only the exact-tag match should return");
+        assert_eq!(nodes[0].content, "Exact tag hit");
+    }
 }