diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 533a46a..ae12427 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -59,9 +59,9 @@ jobs: - os: macos-latest target: aarch64-apple-darwin cargo_flags: "" - - os: macos-14 - target: x86_64-apple-darwin - cargo_flags: "--no-default-features" + # x86_64-apple-darwin dropped: ort-sys has no prebuilt ONNX Runtime + # binaries for Intel Mac, and the codebase requires embeddings. + # Apple began phasing out Intel Macs in 2020. Build from source if needed. - os: ubuntu-latest target: x86_64-unknown-linux-gnu cargo_flags: "" diff --git a/crates/vestige-core/src/fts.rs b/crates/vestige-core/src/fts.rs new file mode 100644 index 0000000..c3d0752 --- /dev/null +++ b/crates/vestige-core/src/fts.rs @@ -0,0 +1,107 @@ +//! FTS5 Query Sanitization +//! +//! Always-available utilities for SQLite FTS5 full-text search. +//! Separated from the `search` module (which requires the `vector-search` feature) +//! because FTS5 keyword search is a core capability that works without embeddings. 
/// FTS5 keywords that are stripped from user input when they appear as
/// standalone words (compared ASCII case-insensitively).
const FTS5_OPERATORS: &[&str] = &["OR", "AND", "NOT", "NEAR"];

/// Maximum number of characters (not bytes) of user input that is considered.
const MAX_QUERY_CHARS: usize = 1000;

/// Returns `true` for punctuation that is replaced by a space before the
/// query is tokenised: `* : ^ - " ( ) { } [ ]`.
fn is_fts5_special(c: char) -> bool {
    matches!(
        c,
        '*' | ':' | '^' | '-' | '"' | '(' | ')' | '{' | '}' | '[' | ']'
    )
}

/// Returns `true` when `term` is exactly one of [`FTS5_OPERATORS`], ignoring
/// ASCII case. Words that merely contain an operator ("oregon") do not match.
fn is_fts5_operator(term: &str) -> bool {
    FTS5_OPERATORS
        .iter()
        .any(|op| op.eq_ignore_ascii_case(term))
}

/// Sanitize input for FTS5 `MATCH` queries.
///
/// The input is processed in four steps:
///
/// 1. Only the first [`MAX_QUERY_CHARS`] characters are kept. The cut is made
///    on `char` boundaries, so multi-byte UTF-8 input is never split.
/// 2. The special characters `* : ^ - " ( ) { } [ ]` are replaced by spaces.
/// 3. The text is split on Unicode whitespace and every word that equals
///    `OR`, `AND`, `NOT` or `NEAR` (in any ASCII case) is dropped. Filtering
///    word-by-word removes *every* occurrence, including repeated operators
///    (`a OR OR b`), mixed-case spellings (`a Or b`), operators delimited by
///    tabs or newlines, and input that consists of an operator only.
/// 4. The remaining words are joined with single spaces and wrapped in double
///    quotes so the result is a single quoted phrase.
///
/// # Returns
///
/// The quoted phrase, or `""` (an empty quoted phrase) when nothing is left
/// after sanitization.
pub fn sanitize_fts5_query(query: &str) -> String {
    // Truncate and neutralise special characters in a single pass.
    let cleaned: String = query
        .chars()
        .take(MAX_QUERY_CHARS)
        .map(|c| if is_fts5_special(c) { ' ' } else { c })
        .collect();

    // `split_whitespace` also collapses runs of whitespace and trims both ends.
    let terms: Vec<&str> = cleaned
        .split_whitespace()
        .filter(|term| !is_fts5_operator(term))
        .collect();

    if terms.is_empty() {
        // Safe default: an empty phrase instead of an empty MATCH expression.
        return "\"\"".to_string();
    }

    // Quote the result so it is treated as one literal phrase.
    format!("\"{}\"", terms.join(" "))
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_sanitize_fts5_query_basic() {
        assert_eq!(sanitize_fts5_query("hello world"), "\"hello world\"");
    }

    #[test]
    fn test_sanitize_fts5_query_operators() {
        assert_eq!(sanitize_fts5_query("hello OR world"), "\"hello world\"");
        assert_eq!(sanitize_fts5_query("hello AND world"), "\"hello world\"");
        assert_eq!(sanitize_fts5_query("NOT hello"), "\"hello\"");
    }

    #[test]
    fn test_sanitize_fts5_query_repeated_and_mixed_case_operators() {
        assert_eq!(sanitize_fts5_query("a OR OR b"), "\"a b\"");
        assert_eq!(sanitize_fts5_query("a Or b"), "\"a b\"");
        assert_eq!(sanitize_fts5_query("a\tAND\nb"), "\"a b\"");
        assert_eq!(sanitize_fts5_query("NEAR"), "\"\"");
    }

    #[test]
    fn test_sanitize_fts5_query_special_chars() {
        assert_eq!(sanitize_fts5_query("hello* world"), "\"hello world\"");
        assert_eq!(
            sanitize_fts5_query("content:secret"),
            "\"content secret\""
        );
        assert_eq!(sanitize_fts5_query("^boost"), "\"boost\"");
    }

    #[test]
    fn test_sanitize_fts5_query_empty() {
        assert_eq!(sanitize_fts5_query(""), "\"\"");
        assert_eq!(sanitize_fts5_query("   "), "\"\"");
        assert_eq!(sanitize_fts5_query("* : ^"), "\"\"");
    }

    #[test]
    fn test_sanitize_fts5_query_length_limit() {
        let long_query = "a".repeat(2000);
        let sanitized = sanitize_fts5_query(&long_query);
        assert!(sanitized.len() <= 1004);
    }
}
-// ============================================================================ -// FTS5 QUERY SANITIZATION -// ============================================================================ - -/// Dangerous FTS5 operators that could be used for injection or DoS -const FTS5_OPERATORS: &[&str] = &["OR", "AND", "NOT", "NEAR"]; - -/// Sanitize input for FTS5 MATCH queries -/// -/// Prevents: -/// - Boolean operator injection (OR, AND, NOT, NEAR) -/// - Column targeting attacks (content:secret) -/// - Prefix/suffix wildcards for data extraction -/// - DoS via complex query patterns -pub fn sanitize_fts5_query(query: &str) -> String { - // Limit query length to prevent DoS (char-aware to avoid UTF-8 boundary issues) - let limited: String = query.chars().take(1000).collect(); - - // Remove FTS5 special characters and operators - let mut sanitized = limited.to_string(); - - // Remove special characters: * : ^ - " ( ) - sanitized = sanitized - .chars() - .map(|c| match c { - '*' | ':' | '^' | '-' | '"' | '(' | ')' | '{' | '}' | '[' | ']' => ' ', - _ => c, - }) - .collect(); - - // Remove FTS5 boolean operators (case-insensitive) - for op in FTS5_OPERATORS { - // Use word boundary replacement to avoid partial matches - let pattern = format!(" {} ", op); - sanitized = sanitized.replace(&pattern, " "); - sanitized = sanitized.replace(&pattern.to_lowercase(), " "); - - // Handle operators at start/end (using char-aware operations) - let upper = sanitized.to_uppercase(); - let start_pattern = format!("{} ", op); - if upper.starts_with(&start_pattern) { - sanitized = sanitized.chars().skip(op.len()).collect(); - } - let end_pattern = format!(" {}", op); - if upper.ends_with(&end_pattern) { - let char_count = sanitized.chars().count(); - sanitized = sanitized.chars().take(char_count.saturating_sub(op.len())).collect(); - } - } - - // Collapse multiple spaces and trim - let sanitized = sanitized.split_whitespace().collect::>().join(" "); - - // If empty after sanitization, return a 
safe default - if sanitized.is_empty() { - return "\"\"".to_string(); // Empty phrase - matches nothing safely - } - - // Wrap in quotes to treat as literal phrase search - format!("\"{}\"", sanitized) -} +// Re-export from the always-available fts module +pub use crate::fts::sanitize_fts5_query; // ============================================================================ // KEYWORD SEARCHER @@ -197,38 +138,7 @@ impl KeywordSearcher { mod tests { use super::*; - #[test] - fn test_sanitize_fts5_query_basic() { - assert_eq!(sanitize_fts5_query("hello world"), "\"hello world\""); - } - - #[test] - fn test_sanitize_fts5_query_operators() { - assert_eq!(sanitize_fts5_query("hello OR world"), "\"hello world\""); - assert_eq!(sanitize_fts5_query("hello AND world"), "\"hello world\""); - assert_eq!(sanitize_fts5_query("NOT hello"), "\"hello\""); - } - - #[test] - fn test_sanitize_fts5_query_special_chars() { - assert_eq!(sanitize_fts5_query("hello* world"), "\"hello world\""); - assert_eq!(sanitize_fts5_query("content:secret"), "\"content secret\""); - assert_eq!(sanitize_fts5_query("^boost"), "\"boost\""); - } - - #[test] - fn test_sanitize_fts5_query_empty() { - assert_eq!(sanitize_fts5_query(""), "\"\""); - assert_eq!(sanitize_fts5_query(" "), "\"\""); - assert_eq!(sanitize_fts5_query("* : ^"), "\"\""); - } - - #[test] - fn test_sanitize_fts5_query_length_limit() { - let long_query = "a".repeat(2000); - let sanitized = sanitize_fts5_query(&long_query); - assert!(sanitized.len() <= 1004); - } + // FTS5 sanitization tests are in crate::fts::tests #[test] fn test_tokenize() { diff --git a/crates/vestige-core/src/storage/sqlite.rs b/crates/vestige-core/src/storage/sqlite.rs index 52f4c50..370d455 100644 --- a/crates/vestige-core/src/storage/sqlite.rs +++ b/crates/vestige-core/src/storage/sqlite.rs @@ -4,8 +4,10 @@ use chrono::{DateTime, Duration, Utc}; use directories::ProjectDirs; +#[cfg(feature = "embeddings")] use lru::LruCache; use rusqlite::{params, Connection, 
OptionalExtension}; +#[cfg(feature = "embeddings")] use std::num::NonZeroUsize; use std::path::PathBuf; use std::sync::Mutex; @@ -16,10 +18,12 @@ use crate::fsrs::{ FSRSScheduler, FSRSState, LearningState, Rating, }; use crate::memory::{ - ConsolidationResult, EmbeddingResult, IngestInput, KnowledgeNode, MatchType, MemoryStats, - RecallInput, SearchMode, SearchResult, SimilarityResult, + ConsolidationResult, IngestInput, KnowledgeNode, MemoryStats, + RecallInput, SearchMode, }; -use crate::search::sanitize_fts5_query; +#[cfg(all(feature = "embeddings", feature = "vector-search"))] +use crate::memory::{EmbeddingResult, MatchType, SearchResult, SimilarityResult}; +use crate::fts::sanitize_fts5_query; #[cfg(feature = "embeddings")] use crate::embeddings::{matryoshka_truncate, Embedding, EmbeddingService, EMBEDDING_DIMENSIONS}; diff --git a/crates/vestige-mcp/Cargo.toml b/crates/vestige-mcp/Cargo.toml index 510e6cd..86d8d74 100644 --- a/crates/vestige-mcp/Cargo.toml +++ b/crates/vestige-mcp/Cargo.toml @@ -32,7 +32,7 @@ path = "src/bin/cli.rs" # ============================================================================ # Includes: FSRS-6, spreading activation, synaptic tagging, hippocampal indexing, # memory states, context memory, importance signals, dreams, and more -vestige-core = { version = "2.0.1", path = "../vestige-core" } +vestige-core = { version = "2.0.1", path = "../vestige-core", default-features = false, features = ["bundled-sqlite"] } # ============================================================================ # MCP Server Dependencies