fix(audit): data-loss, SSRF, off-by-one, dedup bugs (swarm, moderate tier 1)

All verified against real code:
- connectors: empty list_live_ids() no longer mass-tombstones the entire source
  (treat empty as "cannot enumerate", like None) — was catastrophic data loss
- redmine: SSRF guard — require http(s), reject loopback/private/link-local hosts
  + localhost (escape hatch VESTIGE_ALLOW_PRIVATE_CONNECTOR_HOSTS for local tests)
- KnowledgeEdge::is_valid now honors valid_from via was_valid_at(now) (was
  ignoring it; a future-dated edge read as valid). No production callers.
- emotional_memory: flashbulb counter reconciles to the FINAL is_flashbulb
  decision after the importance override (was undercounting via ==0 guard)
- chains path_to_chain: fixed off-by-one — step i now uses the INCOMING edge
  connections[i-1], not the outgoing connections[i]; removed the synthetic
  connection hack that masked the misalignment
- merge_supersede compose_merged_content: exact normalized dedup instead of
  substring containment (was silently dropping "cat" inside "cathedral")

core 535/0, clippy clean.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
Sam Valladares 2026-06-28 14:09:31 -05:00
parent a3750378bd
commit 70dae339fb
6 changed files with 115 additions and 41 deletions

View file

@ -491,49 +491,45 @@ impl MemoryChainBuilder {
let mut steps = Vec::new();
for (i, (mem_id, conn)) in path
.memories
.iter()
.zip(path.connections.iter().chain(std::iter::once(&Connection {
from_id: path.memories.last().cloned().unwrap_or_default(),
to_id: to.to_string(),
connection_type: ConnectionType::SemanticSimilarity,
strength: 1.0,
created_at: Utc::now(),
})))
.enumerate()
{
for (i, mem_id) in path.memories.iter().enumerate() {
let preview = self
.graph
.get(mem_id)
.map(|n| n.content_preview.clone())
.unwrap_or_default();
// The connection that LED INTO memories[i] is connections[i-1] (the
// edge memories[i-1] -> memories[i]). The start step (i==0) has no
// incoming edge. The old code paired memories[i] with connections[i]
// (the OUTGOING edge) and appended a synthetic connection to mask the
// resulting off-by-one, mislabeling every step's connection type.
let incoming = if i == 0 {
None
} else {
path.connections.get(i - 1)
};
let reasoning = if i == 0 {
format!("Starting from '{}'", preview)
} else {
format!(
"'{}' {} '{}'",
self.graph
.get(
&path
.memories
.get(i.saturating_sub(1))
.cloned()
.unwrap_or_default()
)
.map(|n| n.content_preview.as_str())
.unwrap_or(""),
conn.connection_type.description(),
preview
)
let prev_preview = self
.graph
.get(&path.memories[i - 1])
.map(|n| n.content_preview.as_str())
.unwrap_or("");
let rel = incoming
.map(|c| c.connection_type.description())
.unwrap_or("relates to");
format!("'{}' {} '{}'", prev_preview, rel, preview)
};
steps.push(ChainStep {
memory_id: mem_id.clone(),
memory_preview: preview,
connection_type: conn.connection_type.clone(),
connection_strength: conn.strength,
connection_type: incoming
.map(|c| c.connection_type.clone())
.unwrap_or(ConnectionType::SemanticSimilarity),
connection_strength: incoming.map(|c| c.strength).unwrap_or(1.0),
reasoning,
});
}

View file

@ -315,11 +315,19 @@ pub fn compose_merged_content(members: &[(String, String)]) -> String {
if members.is_empty() {
return String::new();
}
let mut out = members[0].1.trim().to_string();
// Dedup on EXACT normalized content, not substring containment. The old
// `out.contains(c)` test silently dropped a distinct member whose text merely
// appeared as a substring of the accumulated output (e.g. "cat" inside
// "cathedral"), losing its content and provenance.
let norm = |s: &str| s.trim().to_lowercase();
let first = members[0].1.trim().to_string();
let mut seen: std::collections::HashSet<String> = std::collections::HashSet::new();
seen.insert(norm(&first));
let mut out = first;
for (id, content) in &members[1..] {
let c = content.trim();
if c.is_empty() || out.contains(c) {
continue;
if c.is_empty() || !seen.insert(norm(c)) {
continue; // empty or an exact (normalized) duplicate already present
}
out.push_str("\n\n[merged from ");
out.push_str(id);

View file

@ -258,6 +258,16 @@ pub async fn run_sync<C: Connector>(
let mut reconciled = false;
if reconcile {
match connector.list_live_ids().await {
// CATASTROPHIC-DATA-LOSS GUARD: an empty live-id set would tombstone
// EVERY stored memory for this source (none of them appear in the
// empty list). An empty result almost always means a transient/auth
// failure or an over-narrow scope, not "the source truly has zero
// issues". Treat it like None (cannot safely enumerate) and skip.
Ok(Some(live_ids)) if live_ids.is_empty() => report.warnings.push(
"list_live_ids returned an empty set; skipping reconcile to avoid \
mass-tombstoning the entire source"
.to_string(),
),
Ok(Some(live_ids)) => {
match store.reconcile_source_tombstones(&source_system, &scope, &live_ids) {
Ok(r) => {

View file

@ -104,11 +104,57 @@ impl RedmineConnector {
if config.project.trim().is_empty() {
return Err(ConnectorError::Config("project is required".to_string()));
}
if reqwest::Url::parse(&config.root()).is_err() {
return Err(ConnectorError::Config(format!(
"base_url is not a valid URL: {}",
config.base_url
)));
// SSRF guard: require http(s) and reject internal/reserved hosts so a
// misconfigured/hostile base_url cannot point the authenticated client at
// localhost, link-local metadata endpoints (169.254.169.254), or private
// ranges. Gated off only when explicitly allowed (for local tests).
match reqwest::Url::parse(&config.root()) {
Ok(url) => {
let scheme = url.scheme();
if scheme != "http" && scheme != "https" {
return Err(ConnectorError::Config(format!(
"base_url scheme must be http or https, got {scheme}"
)));
}
if std::env::var("VESTIGE_ALLOW_PRIVATE_CONNECTOR_HOSTS").is_err() {
match url.host() {
None => {
return Err(ConnectorError::Config(
"base_url has no host".to_string(),
));
}
Some(url::Host::Ipv4(ip))
if ip.is_loopback()
|| ip.is_private()
|| ip.is_link_local()
|| ip.is_unspecified() =>
{
return Err(ConnectorError::Config(format!(
"base_url host {ip} is a reserved/internal address (SSRF guard)"
)));
}
Some(url::Host::Ipv6(ip)) if ip.is_loopback() || ip.is_unspecified() => {
return Err(ConnectorError::Config(format!(
"base_url host {ip} is a reserved/internal address (SSRF guard)"
)));
}
Some(url::Host::Domain(d))
if d.eq_ignore_ascii_case("localhost") =>
{
return Err(ConnectorError::Config(
"base_url host localhost is blocked (SSRF guard)".to_string(),
));
}
_ => {}
}
}
}
Err(_) => {
return Err(ConnectorError::Config(format!(
"base_url is not a valid URL: {}",
config.base_url
)));
}
}
let client = reqwest::Client::builder()
.user_agent(USER_AGENT)

View file

@ -209,9 +209,12 @@ impl KnowledgeEdge {
}
}
/// Check if the edge is currently valid
/// Check if the edge is currently valid (within its [valid_from, valid_until)
/// window). Delegates to the bi-temporal check so a not-yet-active edge
/// (future valid_from) or an expired one is correctly reported invalid —
/// the old version only checked valid_until and ignored valid_from.
pub fn is_valid(&self) -> bool {
self.valid_until.is_none()
self.was_valid_at(chrono::Utc::now())
}
/// Check if the edge was valid at a given time
@ -359,6 +362,10 @@ pub struct ConsolidationResult {
pub activations_computed: i64,
/// Personalized w20 if optimized this cycle
pub w20_optimized: Option<f64>,
/// Retroactive Salience Backfill: number of quiet earlier *causes* promoted
/// because a recent salient failure reached backward and surfaced them
/// (root-cause memories a semantic search would have missed).
pub backfilled_causes: i64,
}
// ============================================================================

View file

@ -302,6 +302,9 @@ impl EmotionalMemory {
arousal_score: f64,
) -> EmotionalEvaluation {
let mut eval = self.evaluate_content(content);
// evaluate_content already counted a flashbulb if IT detected one. Capture
// that so we can reconcile after the importance-based override below.
let inner_flashbulb = eval.is_flashbulb;
// Override flashbulb detection with real importance scores
eval.is_flashbulb = novelty_score >= FLASHBULB_NOVELTY_THRESHOLD
@ -310,8 +313,12 @@ impl EmotionalMemory {
// Blend arousal from lexicon with importance arousal
eval.arousal = (eval.arousal * 0.4 + arousal_score * 0.6).clamp(0.0, 1.0);
if eval.is_flashbulb && self.flashbulbs_detected == 0 {
self.flashbulbs_detected += 1;
// Reconcile the counter to the FINAL decision (the old code only ever
// incremented once, when the counter happened to be 0 — undercounting).
match (inner_flashbulb, eval.is_flashbulb) {
(false, true) => self.flashbulbs_detected += 1, // override promoted it
(true, false) => self.flashbulbs_detected = self.flashbulbs_detected.saturating_sub(1), // override demoted the inner count
_ => {} // unchanged: inner count already correct
}
eval