diff --git a/crates/vestige-core/src/lib.rs b/crates/vestige-core/src/lib.rs index 640ba4a..08ce090 100644 --- a/crates/vestige-core/src/lib.rs +++ b/crates/vestige-core/src/lib.rs @@ -127,9 +127,11 @@ pub use memory::{ MemorySystem, NodeType, RecallInput, + SchemaIntrospection, SearchMode, SearchResult, SimilarityResult, + TableIntrospection, TemporalRange, }; diff --git a/crates/vestige-core/src/memory/mod.rs b/crates/vestige-core/src/memory/mod.rs index e8c3f32..8cd618e 100644 --- a/crates/vestige-core/src/memory/mod.rs +++ b/crates/vestige-core/src/memory/mod.rs @@ -276,6 +276,59 @@ impl Default for MemoryStats { } } +// ============================================================================ +// SCHEMA INTROSPECTION (v2.1.24+: surfaces DB shape to MCP consumers) +// ============================================================================ + +/// A single SQLite table's introspected shape: name, row count, column list. +/// +/// Returned as part of `SchemaIntrospection` from `Storage::schema_introspection()`. +/// Consumers needing more depth (e.g. per-column NULL counts) should request +/// targeted methods rather than expecting this struct to grow unboundedly — +/// the row + column shape covered here is the 80% case for audit / migration +/// guard scripts. +#[derive(Debug, Clone, Serialize, Deserialize)] +#[serde(rename_all = "camelCase")] +pub struct TableIntrospection { + /// SQLite table name. + pub name: String, + /// Row count. + pub rows: i64, + /// Column names in declaration order. + pub columns: Vec, +} + +/// Result of `Storage::schema_introspection()`. Snapshots the schema version, +/// migration timestamp, and a row/column view of every user-data table. +/// +/// Motivation: external consumers (audit scripts, migration guards, downstream +/// upgrade scripts) currently must read SQLite directly to learn the schema +/// version and table shape, which couples them to internal layout. 
This struct +/// gives them a first-class MCP-callable surface. The list of tables walked is +/// intentionally the same canonical set used elsewhere in storage (the user- +/// data tables) so the surface stays stable across migrations. +#[derive(Debug, Clone, Serialize, Deserialize, Default)] +#[serde(rename_all = "camelCase")] +pub struct SchemaIntrospection { + /// Current schema version (highest applied migration; matches the + /// `schema_version` table's MAX(version)). + pub schema_version: u32, + /// When the current schema version was applied (RFC3339), if known. + pub schema_version_applied_at: Option>, + /// Per-table introspection rows. + pub tables: Vec, + /// Total number of `knowledge_nodes` rows that have no matching row in + /// `node_embeddings`. Convenience field for embedding-coverage audits; + /// equivalent to (knowledge_nodes.rows − rows in `node_embeddings` joined to + /// knowledge_nodes), so consumers don't have to compute it themselves. + pub embedding_null_count: i64, + /// Active embedding model name (mirrors `MemoryStats.active_embedding_model`). + /// Useful when an audit script wants schema_version + active model in one call. + pub active_embedding_model: Option, + /// Embedding dimensions for the active model, if known. + pub active_embedding_dimensions: Option, +} + // ============================================================================ // CONSOLIDATION RESULT // ============================================================================ diff --git a/crates/vestige-core/src/storage/sqlite.rs b/crates/vestige-core/src/storage/sqlite.rs index e94a27d..c4e8f2b 100644 --- a/crates/vestige-core/src/storage/sqlite.rs +++ b/crates/vestige-core/src/storage/sqlite.rs @@ -1890,6 +1890,103 @@ impl Storage { }) } + /// Introspect the live SQLite schema: schema version + per-table row/column + /// shape + embedding-coverage convenience fields.
+ /// + /// This is the v2.1.24+ replacement for the direct-SQLite reads that + /// audit scripts and migration guards previously had to perform. The set + /// of tables walked matches `PORTABLE_USER_DATA_TABLES` — the same + /// canonical set used by portable export / import — so the surface stays + /// stable across migrations rather than chasing arbitrary + /// `sqlite_master` rows. + /// + /// Cost: O(N_tables) `COUNT(*)` queries + one PRAGMA per table. Negligible + /// at the table cardinalities Vestige carries (~15 tables, all indexed). + /// Safe to call on every MCP `system_status` invocation when the flag is + /// set; callers wanting to limit cost should leave the flag off (default). + pub fn schema_introspection(&self) -> Result { + let reader = self + .reader + .lock() + .map_err(|_| StorageError::Init("Reader lock poisoned".into()))?; + + let schema_version = Self::current_schema_version(&reader)?; + + // schema_version rows carry (version PK + applied_at TEXT). Read the + // applied_at for the current version row: no row => None via `.optional()`; other errors propagate via `?`. + // NOTE(review): legacy databases may have skipped the applied_at fill on early upgrades — confirm a NULL value is tolerated here. + let applied_at_str: Option = reader + .query_row( + "SELECT applied_at FROM schema_version WHERE version = ?1", + params![schema_version as i64], + |row| row.get(0), + ) + .optional()?; + let schema_version_applied_at = applied_at_str.and_then(|s| { + // The migration scripts use `datetime('now')` which yields + // SQLite's "YYYY-MM-DD HH:MM:SS" UTC form (NOT RFC3339). + // Try the SQLite form first, fall back to RFC3339 for any + // future migrations that switch. + chrono::NaiveDateTime::parse_from_str(&s, "%Y-%m-%d %H:%M:%S") + .map(|naive| naive.and_utc()) + .or_else(|_| { + DateTime::parse_from_rfc3339(&s) + .map(|dt| dt.with_timezone(&Utc)) + }) + .ok() + }); + + let mut tables = Vec::with_capacity(PORTABLE_USER_DATA_TABLES.len()); + for table_name in PORTABLE_USER_DATA_TABLES { + if Self::table_exists(&reader, table_name)?
{ + let rows = Self::table_row_count(&reader, table_name)?; + let columns = Self::table_columns(&reader, table_name)?; + tables.push(crate::TableIntrospection { + name: (*table_name).to_string(), + rows, + columns, + }); + } + } + + // Convenience: embedding-coverage NULL count. Defined as the number + // of knowledge_nodes with NO matching row in node_embeddings. This is + // distinct from `nodes_with_embeddings` in MemoryStats (which uses + // the `has_embedding` column flag); we compute the join-based truth + // here so audit scripts can detect drift between the flag and the + // node_embeddings table. NOTE(review): a failed query is reported as 0 via unwrap_or, not as an error. + let embedding_null_count: i64 = reader + .query_row( + "SELECT COUNT(*) FROM knowledge_nodes kn + WHERE NOT EXISTS ( + SELECT 1 FROM node_embeddings ne WHERE ne.node_id = kn.id + )", + [], + |row| row.get(0), + ) + .unwrap_or(0); + + #[cfg(feature = "embeddings")] + let active_embedding_model = Some(self.embedding_service.model_name().to_string()); + #[cfg(not(feature = "embeddings"))] + let active_embedding_model: Option = None; + + #[cfg(feature = "embeddings")] + let active_embedding_dimensions: Option = + Some(self.embedding_service.dimensions() as u32); + #[cfg(not(feature = "embeddings"))] + let active_embedding_dimensions: Option = None; + + Ok(crate::SchemaIntrospection { + schema_version, + schema_version_applied_at, + tables, + embedding_null_count, + active_embedding_model, + active_embedding_dimensions, + }) + } + /// Delete a node pub fn delete_node(&self, id: &str) -> Result { let mut writer = self diff --git a/crates/vestige-mcp/src/tools/maintenance.rs b/crates/vestige-mcp/src/tools/maintenance.rs index 82f5374..9dfb50e 100644 --- a/crates/vestige-mcp/src/tools/maintenance.rs +++ b/crates/vestige-mcp/src/tools/maintenance.rs @@ -105,10 +105,24 @@ pub fn gc_schema() -> Value { pub fn system_status_schema() -> Value { serde_json::json!({ "type": "object", - "properties": {} + "properties": { + "schema_introspection": { + "type": "boolean",
"description": "When true, extends the response with a 'schema' block carrying the SQLite schema version, per-table row counts + column lists, and embedding-coverage convenience fields. Default: false (response shape unchanged). Use this for audit / migration-guard / downstream-upgrade scripts that otherwise have to read SQLite directly.", + "default": false + } + } }) } +/// Arguments for the system_status tool. All optional. +#[derive(Debug, Default, Deserialize)] +#[serde(rename_all = "camelCase")] +struct SystemStatusArgs { + #[serde(alias = "schema_introspection")] + schema_introspection: Option, +} + // ============================================================================ // EXECUTE FUNCTIONS // ============================================================================ @@ -117,11 +131,24 @@ pub fn system_status_schema() -> Value { /// /// Returns system health status, full statistics, FSRS preview, /// cognitive module health, state distribution, and actionable recommendations. +/// +/// v2.1.24+: when `schema_introspection: true` is passed, the response +/// additionally carries a `schema` block with the live SQLite schema version, +/// per-table row counts + column lists, and embedding-coverage convenience +/// fields. Default off; response shape unchanged when omitted. pub async fn execute_system_status( storage: &Arc, cognitive: &Arc>, - _args: Option, + args: Option, ) -> Result { + // Parse arguments (all optional, including the args envelope itself). 
+ let parsed: SystemStatusArgs = match args { + Some(v) => serde_json::from_value(v) + .map_err(|e| format!("Invalid arguments: {}", e))?, + None => SystemStatusArgs::default(), + }; + let include_schema = parsed.schema_introspection.unwrap_or(false); + let stats = storage.get_stats().map_err(|e| e.to_string())?; // === Health assessment === @@ -259,7 +286,7 @@ pub async fn execute_system_status( }; let last_backup = storage.last_backup_timestamp(); - Ok(serde_json::json!({ + let mut response = serde_json::json!({ "tool": "system_status", // Health "status": status, @@ -299,7 +326,34 @@ pub async fn execute_system_status( "lastBackupTimestamp": last_backup.map(|dt| dt.to_rfc3339()), "lastConsolidationTimestamp": last_consolidation.map(|dt| dt.to_rfc3339()), }, - })) + }); + + // v2.1.24+: optional schema introspection block. Default off; response + // shape unchanged when omitted. + if include_schema { + let intro = storage.schema_introspection().map_err(|e| e.to_string())?; + let tables_json: Vec = intro + .tables + .iter() + .map(|t| { + serde_json::json!({ + "name": t.name, + "rows": t.rows, + "columns": t.columns, + }) + }) + .collect(); + response["schema"] = serde_json::json!({ + "schemaVersion": intro.schema_version, + "schemaVersionAppliedAt": intro.schema_version_applied_at.map(|dt| dt.to_rfc3339()), + "tables": tables_json, + "embeddingNullCount": intro.embedding_null_count, + "activeEmbeddingModel": intro.active_embedding_model, + "activeEmbeddingDimensions": intro.active_embedding_dimensions, + }); + } + + Ok(response) } /// Consolidate tool @@ -792,6 +846,163 @@ mod tests { assert!(triggers["lastDreamTimestamp"].is_null()); } + // ======================================================================== + // SCHEMA INTROSPECTION TESTS (PR2) + // ======================================================================== + + #[test] + fn test_system_status_schema_has_schema_introspection_flag() { + let schema = system_status_schema(); + let props = 
&schema["properties"]; + let flag = &props["schema_introspection"]; + assert!(flag.is_object(), "schema_introspection property must exist"); + assert_eq!(flag["type"], "boolean"); + assert_eq!(flag["default"], false); + // Top-level required must NOT include this — flag is opt-in. + let required = schema.get("required"); + if let Some(req) = required { + let req_arr = req.as_array().unwrap(); + assert!(!req_arr.contains(&serde_json::json!("schema_introspection"))); + } + } + + #[tokio::test] + async fn test_system_status_without_schema_flag_omits_schema_block() { + // Backwards-compat: when the flag is not set (or false), the response + // shape is unchanged — no `schema` key. + let (storage, _dir) = test_storage().await; + let result = execute_system_status(&storage, &test_cognitive(), None).await; + assert!(result.is_ok()); + let value = result.unwrap(); + assert!( + value.get("schema").is_none(), + "schema block must NOT be present when flag is unset, got {:?}", + value.get("schema") + ); + + // Explicit false → still no schema block. + let result = execute_system_status( + &storage, + &test_cognitive(), + Some(serde_json::json!({ "schema_introspection": false })), + ) + .await; + assert!(result.is_ok()); + let value = result.unwrap(); + assert!(value.get("schema").is_none()); + } + + #[tokio::test] + async fn test_system_status_with_schema_flag_emits_schema_block() { + let (storage, _dir) = test_storage().await; + storage + .ingest(vestige_core::IngestInput { + content: "Schema introspection seed memory".to_string(), + node_type: "fact".to_string(), + source: None, + sentiment_score: 0.0, + sentiment_magnitude: 0.0, + tags: vec!["schema-test".to_string()], + valid_from: None, + valid_until: None, + }) + .unwrap(); + + let result = execute_system_status( + &storage, + &test_cognitive(), + Some(serde_json::json!({ "schema_introspection": true })), + ) + .await; + assert!(result.is_ok(), "{:?}", result); + let value = result.unwrap(); + + // Shape assertions. 
+ let schema_block = value + .get("schema") + .expect("schema block must be present when flag is true"); + assert!(schema_block.is_object()); + assert!( + schema_block["schemaVersion"].is_number(), + "schemaVersion must be a number, got {:?}", + schema_block["schemaVersion"] + ); + // Schema version should be >= 13 (V13 is the highest landed migration + // at the time this PR was authored). + let v = schema_block["schemaVersion"].as_u64().unwrap(); + assert!(v >= 13, "expected schema_version >= 13, got {}", v); + + // tables should be a non-empty array of {name, rows, columns}. + let tables = schema_block["tables"].as_array().unwrap(); + assert!(!tables.is_empty(), "expected at least one table"); + let kn = tables + .iter() + .find(|t| t["name"] == "knowledge_nodes") + .expect("knowledge_nodes table must be present"); + assert_eq!(kn["rows"], 1, "ingested exactly one memory"); + let cols = kn["columns"].as_array().unwrap(); + assert!(!cols.is_empty(), "knowledge_nodes must have columns"); + // The id column is universally present. + let col_names: Vec<&str> = cols.iter().filter_map(|c| c.as_str()).collect(); + assert!( + col_names.contains(&"id"), + "knowledge_nodes.id must be in columns list: {:?}", + col_names + ); + + // Convenience fields. + assert!(schema_block["embeddingNullCount"].is_number()); + // activeEmbeddingModel may be null if the `embeddings` feature is + // not enabled in the test build; just check the key exists. + assert!(schema_block.get("activeEmbeddingModel").is_some()); + assert!(schema_block.get("activeEmbeddingDimensions").is_some()); + } + + #[tokio::test] + async fn test_system_status_camelcase_alias() { + // Accept both `schema_introspection` (snake) and `schemaIntrospection` + // (camel) per the #[serde(rename_all = "camelCase")] + alias attr. 
+ let (storage, _dir) = test_storage().await; + let result = execute_system_status( + &storage, + &test_cognitive(), + Some(serde_json::json!({ "schemaIntrospection": true })), + ) + .await; + assert!(result.is_ok(), "{:?}", result); + let value = result.unwrap(); + assert!( + value.get("schema").is_some(), + "camelCase form must also trigger schema block" + ); + } + + #[test] + fn test_storage_schema_introspection_method() { + // Direct test on the Storage method, independent of the MCP layer. + let dir = TempDir::new().unwrap(); + let storage = Storage::new(Some(dir.path().join("test.db"))).unwrap(); + let intro = storage + .schema_introspection() + .expect("schema_introspection must succeed on a fresh DB"); + + // Schema version pulled from the schema_version table. + assert!( + intro.schema_version >= 13, + "fresh DB should be at schema_version >= 13, got {}", + intro.schema_version + ); + // At least one walked table should exist. + assert!( + !intro.tables.is_empty(), + "expected at least one user-data table" + ); + // Empty DB → no knowledge_nodes rows → embedding_null_count == 0 (nothing to + // count). Once we ingest, it should be > 0 (no embeddings generated + // in tests by default). + assert_eq!(intro.embedding_null_count, 0); + } + #[tokio::test] async fn test_portable_export_writes_archive_to_storage_exports_dir() { let (storage, _dir) = test_storage().await;