mirror of
https://github.com/samvallad33/vestige.git
synced 2026-04-25 00:36:22 +02:00
fix(timeline): push node_type and tags filters into SQL WHERE
memory_timeline ran node_type and tags as Rust-side `retain` after `query_time_range`, which applied `LIMIT` in SQL before the retain saw anything. Against a corpus where one tag or type dominates, a sparse match could be crowded out of the limit window — the tool reported "no matches" when matches existed. Fix: thread `node_type: Option<&str>` and `tags: Option<&[String]>` through `query_time_range` and apply both as `WHERE` predicates so `LIMIT` kicks in after filtering. Tag matching uses `tags LIKE '%"tag"%'` — the quoted pattern pins to exact tags and rejects substring false positives (e.g. `alpha` no longer matches `alphabet`). Regression tests in `tools/timeline.rs`: - test_timeline_node_type_filter_sparse: 10 `fact` + 2 `concept`, `limit=5`, query `concept` — asserts 2 rows; fails on pre-fix code. - test_timeline_tag_filter_sparse: 10 rows tagged `common` + 2 tagged `rare`, `limit=5`, query `rare` — asserts 2 rows; same shape for tags. - test_timeline_tag_filter_exact_match: one `alpha` row + one `alphabet` row, query `alpha` — asserts exactly 1 row. Dashboard caller updated to pass `None, None` for the new filter params. 19/19 timeline tests pass; 1295/1295 workspace tests pass; clippy clean on vestige-core and vestige-mcp. Ported from the Unforgettable/Anamnesis fork. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
30d92b5371
commit
5d46ebfd30
3 changed files with 177 additions and 52 deletions
|
|
@ -2077,61 +2077,77 @@ impl Storage {
|
|||
Ok(result)
|
||||
}
|
||||
|
||||
/// Query memories created/modified in a time range
|
||||
/// Query memories created/modified in a time range, optionally filtered by
|
||||
/// `node_type` and/or `tags`.
|
||||
///
|
||||
/// All filters are pushed into the SQL `WHERE` clause so that `LIMIT` is
|
||||
/// applied AFTER filtering. If filters were applied in Rust after `LIMIT`,
|
||||
/// sparse types/tags could be crowded out by a dominant set within the
|
||||
/// limit window — e.g. a query for a rare tag against a corpus where
|
||||
/// every day has hundreds of rows with a common tag would return 0
|
||||
/// matches after `LIMIT` crowded the rare-tag rows out.
|
||||
///
|
||||
/// Tag filtering uses `tags LIKE '%"tag"%'` — an exact-match JSON pattern
|
||||
/// that keys off the quote characters around each tag in the stored JSON
|
||||
/// array. This avoids the substring-match false positive where `alpha`
|
||||
/// would otherwise match `alphabet`.
|
||||
pub fn query_time_range(
|
||||
&self,
|
||||
start: Option<DateTime<Utc>>,
|
||||
end: Option<DateTime<Utc>>,
|
||||
limit: i32,
|
||||
node_type: Option<&str>,
|
||||
tags: Option<&[String]>,
|
||||
) -> Result<Vec<KnowledgeNode>> {
|
||||
let start_str = start.map(|dt| dt.to_rfc3339());
|
||||
let end_str = end.map(|dt| dt.to_rfc3339());
|
||||
|
||||
let (query, params): (&str, Vec<Box<dyn rusqlite::ToSql>>) = match (&start_str, &end_str) {
|
||||
(Some(s), Some(e)) => (
|
||||
"SELECT * FROM knowledge_nodes
|
||||
WHERE created_at >= ?1 AND created_at <= ?2
|
||||
ORDER BY created_at DESC
|
||||
LIMIT ?3",
|
||||
vec![
|
||||
Box::new(s.clone()) as Box<dyn rusqlite::ToSql>,
|
||||
Box::new(e.clone()) as Box<dyn rusqlite::ToSql>,
|
||||
Box::new(limit) as Box<dyn rusqlite::ToSql>,
|
||||
],
|
||||
),
|
||||
(Some(s), None) => (
|
||||
"SELECT * FROM knowledge_nodes
|
||||
WHERE created_at >= ?1
|
||||
ORDER BY created_at DESC
|
||||
LIMIT ?2",
|
||||
vec![
|
||||
Box::new(s.clone()) as Box<dyn rusqlite::ToSql>,
|
||||
Box::new(limit) as Box<dyn rusqlite::ToSql>,
|
||||
],
|
||||
),
|
||||
(None, Some(e)) => (
|
||||
"SELECT * FROM knowledge_nodes
|
||||
WHERE created_at <= ?1
|
||||
ORDER BY created_at DESC
|
||||
LIMIT ?2",
|
||||
vec![
|
||||
Box::new(e.clone()) as Box<dyn rusqlite::ToSql>,
|
||||
Box::new(limit) as Box<dyn rusqlite::ToSql>,
|
||||
],
|
||||
),
|
||||
(None, None) => (
|
||||
"SELECT * FROM knowledge_nodes
|
||||
ORDER BY created_at DESC
|
||||
LIMIT ?1",
|
||||
vec![Box::new(limit) as Box<dyn rusqlite::ToSql>],
|
||||
),
|
||||
let mut conditions: Vec<String> = Vec::new();
|
||||
let mut params: Vec<Box<dyn rusqlite::ToSql>> = Vec::new();
|
||||
let mut idx = 1;
|
||||
|
||||
if let Some(ref s) = start_str {
|
||||
conditions.push(format!("created_at >= ?{}", idx));
|
||||
params.push(Box::new(s.clone()) as Box<dyn rusqlite::ToSql>);
|
||||
idx += 1;
|
||||
}
|
||||
if let Some(ref e) = end_str {
|
||||
conditions.push(format!("created_at <= ?{}", idx));
|
||||
params.push(Box::new(e.clone()) as Box<dyn rusqlite::ToSql>);
|
||||
idx += 1;
|
||||
}
|
||||
if let Some(nt) = node_type {
|
||||
conditions.push(format!("LOWER(node_type) = LOWER(?{})", idx));
|
||||
params.push(Box::new(nt.to_string()) as Box<dyn rusqlite::ToSql>);
|
||||
idx += 1;
|
||||
}
|
||||
if let Some(tag_list) = tags.filter(|t| !t.is_empty()) {
|
||||
let mut tag_conditions = Vec::new();
|
||||
for tag in tag_list {
|
||||
tag_conditions.push(format!("tags LIKE ?{}", idx));
|
||||
params.push(Box::new(format!("%\"{}\"%", tag)) as Box<dyn rusqlite::ToSql>);
|
||||
idx += 1;
|
||||
}
|
||||
conditions.push(format!("({})", tag_conditions.join(" OR ")));
|
||||
}
|
||||
|
||||
let where_clause = if conditions.is_empty() {
|
||||
String::new()
|
||||
} else {
|
||||
format!("WHERE {}", conditions.join(" AND "))
|
||||
};
|
||||
|
||||
let query = format!(
|
||||
"SELECT * FROM knowledge_nodes {} ORDER BY created_at DESC LIMIT ?{}",
|
||||
where_clause, idx
|
||||
);
|
||||
params.push(Box::new(limit) as Box<dyn rusqlite::ToSql>);
|
||||
|
||||
let reader = self
|
||||
.reader
|
||||
.lock()
|
||||
.map_err(|_| StorageError::Init("Reader lock poisoned".into()))?;
|
||||
let mut stmt = reader.prepare(query)?;
|
||||
let mut stmt = reader.prepare(&query)?;
|
||||
let params_refs: Vec<&dyn rusqlite::ToSql> = params.iter().map(|p| p.as_ref()).collect();
|
||||
let nodes = stmt.query_map(params_refs.as_slice(), Self::row_to_node)?;
|
||||
|
||||
|
|
|
|||
|
|
@ -384,7 +384,7 @@ pub async fn get_timeline(
|
|||
let start = Utc::now() - Duration::days(days);
|
||||
let nodes = state
|
||||
.storage
|
||||
.query_time_range(Some(start), Some(Utc::now()), limit)
|
||||
.query_time_range(Some(start), Some(Utc::now()), limit, None, None)
|
||||
.map_err(|_| StatusCode::INTERNAL_SERVER_ERROR)?;
|
||||
|
||||
// Group by day
|
||||
|
|
|
|||
|
|
@ -126,19 +126,20 @@ pub async fn execute(storage: &Arc<Storage>, args: Option<Value>) -> Result<Valu
|
|||
|
||||
let limit = args.limit.unwrap_or(50).clamp(1, 200);
|
||||
|
||||
// Query memories in time range
|
||||
let mut results = storage
|
||||
.query_time_range(start, end, limit)
|
||||
// Query memories in time range with filters pushed into SQL. Rust-side
|
||||
// `retain` after `LIMIT` was unsafe for sparse types/tags — a dominant
|
||||
// set could crowd the sparse matches out of the limit window and leave
|
||||
// the retain with 0 rows to keep.
|
||||
let results = storage
|
||||
.query_time_range(
|
||||
start,
|
||||
end,
|
||||
limit,
|
||||
args.node_type.as_deref(),
|
||||
args.tags.as_deref(),
|
||||
)
|
||||
.map_err(|e| e.to_string())?;
|
||||
|
||||
// Post-query filters
|
||||
if let Some(ref node_type) = args.node_type {
|
||||
results.retain(|n| n.node_type == *node_type);
|
||||
}
|
||||
if let Some(tags) = args.tags.as_ref().filter(|t| !t.is_empty()) {
|
||||
results.retain(|n| tags.iter().any(|t| n.tags.contains(t)));
|
||||
}
|
||||
|
||||
// Group by day
|
||||
let mut by_day: BTreeMap<NaiveDate, Vec<Value>> = BTreeMap::new();
|
||||
for node in &results {
|
||||
|
|
@ -204,6 +205,28 @@ mod tests {
|
|||
.unwrap();
|
||||
}
|
||||
|
||||
/// Ingest with explicit node_type and tags. Used by the sparse-filter
|
||||
/// regression tests so the dominant and sparse sets can be told apart.
|
||||
async fn ingest_typed(
|
||||
storage: &Arc<Storage>,
|
||||
content: &str,
|
||||
node_type: &str,
|
||||
tags: &[&str],
|
||||
) {
|
||||
storage
|
||||
.ingest(vestige_core::IngestInput {
|
||||
content: content.to_string(),
|
||||
node_type: node_type.to_string(),
|
||||
source: None,
|
||||
sentiment_score: 0.0,
|
||||
sentiment_magnitude: 0.0,
|
||||
tags: tags.iter().map(|t| t.to_string()).collect(),
|
||||
valid_from: None,
|
||||
valid_until: None,
|
||||
})
|
||||
.unwrap();
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_schema_has_properties() {
|
||||
let s = schema();
|
||||
|
|
@ -357,4 +380,90 @@ mod tests {
|
|||
let value = result.unwrap();
|
||||
assert_eq!(value["totalMemories"], 0);
|
||||
}
|
||||
|
||||
/// Regression: `node_type` filter must work even when the sparse type is
|
||||
/// crowded out by a dominant type within the SQL `LIMIT`. Before the fix,
|
||||
/// `query_time_range` applied `LIMIT` before the Rust-side `retain`, so a
|
||||
/// limit of 5 against 10 dominant + 2 sparse rows returned 5 dominant,
|
||||
/// then filtered to 0 sparse.
|
||||
#[tokio::test]
|
||||
async fn test_timeline_node_type_filter_sparse() {
|
||||
let (storage, _dir) = test_storage().await;
|
||||
|
||||
// Dominant set: 10 facts
|
||||
for i in 0..10 {
|
||||
ingest_typed(&storage, &format!("Dominant memory {}", i), "fact", &["alpha"]).await;
|
||||
}
|
||||
// Sparse set: 2 concepts
|
||||
for i in 0..2 {
|
||||
ingest_typed(&storage, &format!("Sparse memory {}", i), "concept", &["beta"]).await;
|
||||
}
|
||||
|
||||
// Limit 5 against 12 total — before the fix, `retain` on `concept`
|
||||
// would operate on the 5 most recent rows (all `fact`) and find 0.
|
||||
let args = serde_json::json!({ "node_type": "concept", "limit": 5 });
|
||||
let value = execute(&storage, Some(args)).await.unwrap();
|
||||
assert_eq!(
|
||||
value["totalMemories"], 2,
|
||||
"Both sparse concepts should survive a limit smaller than the dominant set"
|
||||
);
|
||||
|
||||
// Also verify the storage layer directly, so the contract is pinned
|
||||
// at the API boundary even if the tool wrapper shifts.
|
||||
let nodes = storage
|
||||
.query_time_range(None, None, 5, Some("concept"), None)
|
||||
.unwrap();
|
||||
assert_eq!(nodes.len(), 2);
|
||||
assert!(nodes.iter().all(|n| n.node_type == "concept"));
|
||||
}
|
||||
|
||||
/// Regression: `tags` filter must work even when the sparse tag is
|
||||
/// crowded out by a dominant tag within the SQL `LIMIT`. Parallel to
|
||||
/// the node_type sparse case — same `retain`-after-`LIMIT` bug.
|
||||
#[tokio::test]
|
||||
async fn test_timeline_tag_filter_sparse() {
|
||||
let (storage, _dir) = test_storage().await;
|
||||
|
||||
// Dominant set: 10 memories with tag "common"
|
||||
for i in 0..10 {
|
||||
ingest_typed(&storage, &format!("Common memory {}", i), "fact", &["common"]).await;
|
||||
}
|
||||
// Sparse set: 2 memories with tag "rare"
|
||||
for i in 0..2 {
|
||||
ingest_typed(&storage, &format!("Rare memory {}", i), "fact", &["rare"]).await;
|
||||
}
|
||||
|
||||
let args = serde_json::json!({ "tags": ["rare"], "limit": 5 });
|
||||
let value = execute(&storage, Some(args)).await.unwrap();
|
||||
assert_eq!(
|
||||
value["totalMemories"], 2,
|
||||
"Both sparse-tag matches should survive a limit smaller than the dominant set"
|
||||
);
|
||||
|
||||
let tag_slice = vec!["rare".to_string()];
|
||||
let nodes = storage
|
||||
.query_time_range(None, None, 5, None, Some(&tag_slice))
|
||||
.unwrap();
|
||||
assert_eq!(nodes.len(), 2);
|
||||
assert!(nodes.iter().all(|n| n.tags.iter().any(|t| t == "rare")));
|
||||
}
|
||||
|
||||
/// Regression: tag filter must match exact tags, not substrings. Without
|
||||
/// the `"tag"`-wrapped `LIKE` pattern, a query for `alpha` would also
|
||||
/// match rows tagged `alphabet`. The pattern `%"alpha"%` keys off the
|
||||
/// JSON-array quote characters and rejects that.
|
||||
#[tokio::test]
|
||||
async fn test_timeline_tag_filter_exact_match() {
|
||||
let (storage, _dir) = test_storage().await;
|
||||
|
||||
ingest_typed(&storage, "Exact tag hit", "fact", &["alpha"]).await;
|
||||
ingest_typed(&storage, "Substring decoy", "fact", &["alphabet"]).await;
|
||||
|
||||
let tag_slice = vec!["alpha".to_string()];
|
||||
let nodes = storage
|
||||
.query_time_range(None, None, 50, None, Some(&tag_slice))
|
||||
.unwrap();
|
||||
assert_eq!(nodes.len(), 1, "Only the exact-tag match should return");
|
||||
assert_eq!(nodes[0].content, "Exact tag hit");
|
||||
}
|
||||
}
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue