From 9c022a0f542790c1c6eb4ef4fb0dcefaf2bbc83f Mon Sep 17 00:00:00 2001 From: NoahToKnow <124620394+NoahToKnow@users.noreply.github.com> Date: Thu, 9 Apr 2026 20:09:56 -0600 Subject: [PATCH] fix(deep_reference): incorporate query relevance into recommended/confidence MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The Stage 8 `recommended` selector and the evidence sort both rank by FSRS-6 trust only, discarding the `combined_score` signal that the upstream hybrid_search + cross-encoder reranker just computed. Confidence is then derived from `recommended.trust + evidence_count`, neither of which moves with the query — so any query against the same corpus returns the same primary memory and the same confidence score. Empirical reproduction (15 deep_reference probes against an 11-memory corpus, 9 with a unique correct answer + 6 with no relevant memories): - Distinct primary memories returned : 1 / 15 - Confidence values returned : 1 distinct (0.82 for all) - Ground-truth accuracy on specific queries : 1 / 9 (11.1%) The single hit is coincidental: the always-returned memory happened to be the correct answer for one query. Random guessing across the 11-memory corpus would be ~9% baseline, so the tool is performing at random. Fix --- Replace trust-only ranking at three sites with a 50/50 composite of combined_score (query relevance) and FSRS-6 trust: let composite = |s: &ScoredMemory| s.combined_score as f64 * 0.5 + s.trust * 0.5; Used in: - cross_reference.rs:573 — `recommended` max_by - cross_reference.rs:589 — `non_superseded` evidence sort_by - cross_reference.rs:622 — `base_confidence` formula The 50/50 weighting is a design choice — see PR body for the knob to tweak if a different blend is preferred. The pre-existing updated_at tiebreaker is preserved. Tests ----- Two regression tests, both verified to FAIL on `main` and PASS with the fix via negative control (temporarily set the composite weights to 1.0 trust + 0.0 relevance and confirmed both tests fail again): - test_recommended_uses_query_relevance_not_just_trust Two-memory corpus, ingested in order so the off-topic memory wins the trust tiebreaker. Query targets the on-topic memory. The fix ensures `recommended` is the on-topic one. - test_confidence_varies_with_query_relevance Single-memory corpus. Identical execute() calls with a relevant query and an irrelevant query. The fix ensures the relevant query produces higher confidence. Full crate suite: 410 / 410 passing (was 408 + 2 new). Out of scope ------------ While running the live MCP probes I observed two further inconsistencies in `cross_reference.rs` that I cannot reproduce in cargo test (the synthetic test environment with mock embeddings does not trigger the required combined_score > 0.2 floor condition): - The `effective_sim` floor at line 551 fabricates contradictions between memories with no real topical overlap when one contains a CORRECTION_SIGNALS keyword. - The Stage 5 `contradictions` field (strict) and the Stage 7 `pair_relations` feeding the reasoning text (loose, post-floor) disagree, producing responses where `reasoning` claims N contradictions while `contradictions` is empty and `status` is "resolved". I have empirical data for both from live MCP usage but no reproducible cargo test, so they are intentionally not addressed in this PR. Happy to file them as a separate issue with the raw probe data if useful. --- .../vestige-mcp/src/tools/cross_reference.rs | 155 +++++++++++++++++- 1 file changed, 146 insertions(+), 9 deletions(-) diff --git a/crates/vestige-mcp/src/tools/cross_reference.rs b/crates/vestige-mcp/src/tools/cross_reference.rs index 6cfc3f5..257122b 100644 --- a/crates/vestige-mcp/src/tools/cross_reference.rs +++ b/crates/vestige-mcp/src/tools/cross_reference.rs @@ -568,21 +568,33 @@ pub async fn execute( // ==================================================================== // STAGE 8: Synthesis + Reasoning Chain Generation // ==================================================================== - // Find the recommended answer: highest trust, not superseded, most recent - let recommended = scored.iter() + // Composite score: half query relevance (combined_score from + // hybrid_search + reranker) and half FSRS-6 trust. Both signals belong + // in the recommended pick — relevance picks the right *topic*, trust + // picks the most reliable variant within that topic. + let composite = |s: &ScoredMemory| s.combined_score as f64 * 0.5 + s.trust * 0.5; + + // Find the recommended answer: highest composite, not superseded, most recent + let recommended = scored + .iter() .filter(|s| !superseded_ids.contains(&s.id)) .max_by(|a, b| { - // Primary: trust. Secondary: date. - a.trust.partial_cmp(&b.trust) + composite(a) + .partial_cmp(&composite(b)) .unwrap_or(std::cmp::Ordering::Equal) .then_with(|| a.updated_at.cmp(&b.updated_at)) }); - // Build evidence list (top memories by trust, not superseded) - let mut non_superseded: Vec<&ScoredMemory> = scored.iter() + // Build evidence list (top memories by composite, not superseded) + let mut non_superseded: Vec<&ScoredMemory> = scored + .iter() .filter(|s| !superseded_ids.contains(&s.id)) .collect(); - non_superseded.sort_by(|a, b| b.trust.partial_cmp(&a.trust).unwrap_or(std::cmp::Ordering::Equal)); + non_superseded.sort_by(|a, b| { + composite(b) + .partial_cmp(&composite(a)) + .unwrap_or(std::cmp::Ordering::Equal) + }); let evidence: Vec = non_superseded.iter() .take(10) .enumerate() @@ -605,8 +617,10 @@ pub async fn execute( .collect(); evolution.truncate(15); // cap timeline length - // Confidence scoring - let base_confidence = recommended.map(|r| r.trust).unwrap_or(0.0); + // Confidence scoring: derived from the same composite as `recommended`, + // so confidence actually moves with query relevance instead of being a + // function of trust + corpus size alone. + let base_confidence = recommended.map(composite).unwrap_or(0.0); let agreement_boost = (evidence.len() as f64 * 0.03).min(0.2); let contradiction_penalty = contradictions.len() as f64 * 0.1; let confidence = (base_confidence + agreement_boost - contradiction_penalty).clamp(0.0, 1.0); @@ -685,6 +699,129 @@ pub async fn execute( #[cfg(test)] mod tests { use super::*; + use crate::cognitive::CognitiveEngine; + use std::sync::Arc; + use tempfile::TempDir; + use tokio::sync::Mutex; + use vestige_core::Storage; + + fn test_cognitive() -> Arc> { + Arc::new(Mutex::new(CognitiveEngine::new())) + } + + async fn test_storage() -> (Arc, TempDir) { + let dir = TempDir::new().unwrap(); + let storage = Storage::new(Some(dir.path().join("test.db"))).unwrap(); + (Arc::new(storage), dir) + } + + async fn ingest_one(storage: &Arc, content: &str, tags: &[&str]) -> String { + storage + .ingest(vestige_core::IngestInput { + content: content.to_string(), + node_type: "fact".to_string(), + source: None, + sentiment_score: 0.0, + sentiment_magnitude: 0.0, + tags: tags.iter().map(|s| s.to_string()).collect(), + valid_from: None, + valid_until: None, + }) + .unwrap() + .id + } + + // ======================================================================== + // BUG A: `recommended` is picked by FSRS trust only, ignoring query relevance. + // ======================================================================== + #[tokio::test] + async fn test_recommended_uses_query_relevance_not_just_trust() { + let (storage, _dir) = test_storage().await; + + let id_a = ingest_one( + &storage, + "PostgreSQL connection pooling with pgbouncer transaction mode \ + requires careful tuning of max_connections and pool_mode settings.", + &["postgres", "database"], + ) + .await; + + tokio::time::sleep(std::time::Duration::from_millis(10)).await; + + let _id_b = ingest_one( + &storage, + "Making sourdough bread requires a mature starter, long bulk \ + fermentation, and attention to dough hydration levels.", + &["baking", "bread"], + ) + .await; + + let args = serde_json::json!({ + "query": "PostgreSQL connection pooling pgbouncer max_connections" + }); + let result = execute(&storage, &test_cognitive(), Some(args)) + .await + .expect("execute should succeed"); + + assert_eq!( + result["recommended"]["memory_id"].as_str(), + Some(id_a.as_str()), + "Expected recommended={} (matches query). Got {:?}. \ + Root cause: lines 565-572 select `recommended` by trust only, \ + discarding the combined_score signal from hybrid_search + reranker.", + id_a, + result["recommended"]["memory_id"] + ); + } + + // ======================================================================== + // Confidence sanity: must vary with query relevance. + // ======================================================================== + #[tokio::test] + async fn test_confidence_varies_with_query_relevance() { + let (storage, _dir) = test_storage().await; + + ingest_one( + &storage, + "The Borrow Checker enforces Rust's ownership rules at compile time, \ + preventing data races and use-after-free without a garbage collector.", + &["rust"], + ) + .await; + + let relevant = execute( + &storage, + &test_cognitive(), + Some(serde_json::json!({ + "query": "Rust borrow checker ownership compile time" + })), + ) + .await + .expect("execute should succeed"); + + let irrelevant = execute( + &storage, + &test_cognitive(), + Some(serde_json::json!({ + "query": "18th century Dutch maritime trade routes" + })), + ) + .await + .expect("execute should succeed"); + + let rel_conf = relevant["confidence"].as_f64().unwrap_or(0.0); + let irr_conf = irrelevant["confidence"].as_f64().unwrap_or(0.0); + + assert!( + rel_conf > irr_conf, + "Confidence should be higher for a relevant query. Got \ + relevant={}, irrelevant={}. Currently `confidence` derives from \ + recommended.trust + evidence count (lines 602-605), both \ + invariant under query changes.", + rel_conf, + irr_conf + ); + } #[test] fn test_schema_structure() {