mirror of
https://github.com/samvallad33/vestige.git
synced 2026-05-09 15:52:37 +02:00
fix: drop Intel Mac CI target, fix feature-gate dependency chain
ort-sys v2.0.0-rc.11 has no prebuilt ONNX Runtime binaries for x86_64-apple-darwin, and vestige-mcp requires embeddings to compile. - Remove x86_64-apple-darwin from CI release matrix (discontinued 2020) - Fix vestige-mcp Cargo.toml: add default-features=false to vestige-core dep - Extract sanitize_fts5_query to always-available fts.rs module - Gate embeddings-only imports in storage/sqlite.rs behind #[cfg] Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
parent
de19ed8dd9
commit
070889ef26
6 changed files with 123 additions and 101 deletions
107
crates/vestige-core/src/fts.rs
Normal file
107
crates/vestige-core/src/fts.rs
Normal file
|
|
@ -0,0 +1,107 @@
|
|||
//! FTS5 Query Sanitization
|
||||
//!
|
||||
//! Always-available utilities for SQLite FTS5 full-text search.
|
||||
//! Separated from the `search` module (which requires the `vector-search` feature)
|
||||
//! because FTS5 keyword search is a core capability that works without embeddings.
|
||||
|
||||
/// FTS5 keywords that act as query operators when they appear as bare tokens.
///
/// Compared against user tokens ASCII case-insensitively by
/// [`sanitize_fts5_query`].
const FTS5_OPERATORS: &[&str] = &["OR", "AND", "NOT", "NEAR"];

/// Sanitize user input for use in an FTS5 `MATCH` expression.
///
/// The input is processed in four steps:
///
/// 1. Only the first 1000 characters are considered (counted in `char`s, so
///    a multi-byte character is never split).
/// 2. The characters `* : ^ - " ( ) { } [ ]` are replaced by spaces, which
///    removes prefix wildcards, column filters (`content:secret`), boost
///    markers, grouping and embedded quotes.
/// 3. The text is split on whitespace and every token equal to one of
///    [`FTS5_OPERATORS`] (ignoring ASCII case) is dropped, wherever it occurs:
///    start, middle, end, repeated, or delimited by tabs/newlines.
/// 4. The remaining tokens are joined with single spaces and wrapped in double
///    quotes so the result is a single literal phrase.
///
/// # Returns
///
/// A double-quoted phrase. If nothing survives sanitization (empty input,
/// only special characters, or only operator keywords) the empty phrase
/// `""` is returned.
pub fn sanitize_fts5_query(query: &str) -> String {
    // Upper bound on the amount of user input processed (DoS guard).
    const MAX_QUERY_CHARS: usize = 1000;

    // Truncate and neutralise FTS5 syntax characters in a single pass.
    let cleaned: String = query
        .chars()
        .take(MAX_QUERY_CHARS)
        .map(|c| match c {
            '*' | ':' | '^' | '-' | '"' | '(' | ')' | '{' | '}' | '[' | ']' => ' ',
            _ => c,
        })
        .collect();

    // Token-level filtering gives real word boundaries: "android" or "nearby"
    // are kept, while "Or", "OR OR" and "\tAND\t" are all removed. Substring
    // replacement on " OP " misses mixed case, non-space whitespace and
    // back-to-back operators.
    let terms: Vec<&str> = cleaned
        .split_whitespace()
        .filter(|term| !FTS5_OPERATORS.iter().any(|op| term.eq_ignore_ascii_case(op)))
        .collect();

    if terms.is_empty() {
        return "\"\"".to_string(); // Empty phrase - matches nothing safely
    }

    // Wrap in quotes to treat the whole query as one literal phrase.
    format!("\"{}\"", terms.join(" "))
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    /// Runs every `(input, expected)` pair through the sanitizer and checks
    /// the exact output.
    fn assert_sanitized(cases: &[(&str, &str)]) {
        for (input, expected) in cases {
            assert_eq!(sanitize_fts5_query(input), *expected, "input: {:?}", input);
        }
    }

    /// Plain words only gain the surrounding phrase quotes.
    #[test]
    fn test_sanitize_fts5_query_basic() {
        assert_sanitized(&[("hello world", "\"hello world\"")]);
    }

    /// Boolean operator keywords are removed from the query.
    #[test]
    fn test_sanitize_fts5_query_operators() {
        assert_sanitized(&[
            ("hello OR world", "\"hello world\""),
            ("hello AND world", "\"hello world\""),
            ("NOT hello", "\"hello\""),
        ]);
    }

    /// Wildcard, column-filter and boost characters become separators.
    #[test]
    fn test_sanitize_fts5_query_special_chars() {
        assert_sanitized(&[
            ("hello* world", "\"hello world\""),
            ("content:secret", "\"content secret\""),
            ("^boost", "\"boost\""),
        ]);
    }

    /// Inputs with no surviving terms collapse to the empty phrase.
    #[test]
    fn test_sanitize_fts5_query_empty() {
        assert_sanitized(&[("", "\"\""), (" ", "\"\""), ("* : ^", "\"\"")]);
    }

    /// Oversized input is truncated before being quoted.
    #[test]
    fn test_sanitize_fts5_query_length_limit() {
        let oversized: String = std::iter::repeat('a').take(2000).collect();
        let output = sanitize_fts5_query(&oversized);
        assert!(output.len() <= 1004);
    }
}
|
||||
|
|
@ -82,6 +82,7 @@
|
|||
|
||||
pub mod consolidation;
|
||||
pub mod fsrs;
|
||||
pub mod fts;
|
||||
pub mod memory;
|
||||
pub mod storage;
|
||||
|
||||
|
|
|
|||
|
|
@ -1,69 +1,10 @@
|
|||
//! Keyword Search (BM25/FTS5)
|
||||
//!
|
||||
//! Provides keyword-based search using SQLite FTS5.
|
||||
//! Includes query sanitization for security.
|
||||
//! Query sanitization lives in `crate::fts` (always available, even without vector-search).
|
||||
|
||||
// ============================================================================
|
||||
// FTS5 QUERY SANITIZATION
|
||||
// ============================================================================
|
||||
|
||||
/// FTS5 keywords that act as query operators when they appear as bare tokens.
///
/// Compared against user tokens ASCII case-insensitively by
/// [`sanitize_fts5_query`].
const FTS5_OPERATORS: &[&str] = &["OR", "AND", "NOT", "NEAR"];

/// Sanitize user input for use in an FTS5 `MATCH` expression.
///
/// The input is processed in four steps:
///
/// 1. Only the first 1000 characters are considered (counted in `char`s, so
///    a multi-byte character is never split).
/// 2. The characters `* : ^ - " ( ) { } [ ]` are replaced by spaces, which
///    removes prefix wildcards, column filters (`content:secret`), boost
///    markers, grouping and embedded quotes.
/// 3. The text is split on whitespace and every token equal to one of
///    [`FTS5_OPERATORS`] (ignoring ASCII case) is dropped, wherever it occurs:
///    start, middle, end, repeated, or delimited by tabs/newlines.
/// 4. The remaining tokens are joined with single spaces and wrapped in double
///    quotes so the result is a single literal phrase.
///
/// # Returns
///
/// A double-quoted phrase. If nothing survives sanitization (empty input,
/// only special characters, or only operator keywords) the empty phrase
/// `""` is returned.
pub fn sanitize_fts5_query(query: &str) -> String {
    // Upper bound on the amount of user input processed (DoS guard).
    const MAX_QUERY_CHARS: usize = 1000;

    // Truncate and neutralise FTS5 syntax characters in a single pass.
    let cleaned: String = query
        .chars()
        .take(MAX_QUERY_CHARS)
        .map(|c| match c {
            '*' | ':' | '^' | '-' | '"' | '(' | ')' | '{' | '}' | '[' | ']' => ' ',
            _ => c,
        })
        .collect();

    // Token-level filtering gives real word boundaries: "android" or "nearby"
    // are kept, while "Or", "OR OR" and "\tAND\t" are all removed. Substring
    // replacement on " OP " misses mixed case, non-space whitespace and
    // back-to-back operators.
    let terms: Vec<&str> = cleaned
        .split_whitespace()
        .filter(|term| !FTS5_OPERATORS.iter().any(|op| term.eq_ignore_ascii_case(op)))
        .collect();

    if terms.is_empty() {
        return "\"\"".to_string(); // Empty phrase - matches nothing safely
    }

    // Wrap in quotes to treat the whole query as one literal phrase.
    format!("\"{}\"", terms.join(" "))
}
|
||||
// Re-export from the always-available fts module
|
||||
pub use crate::fts::sanitize_fts5_query;
|
||||
|
||||
// ============================================================================
|
||||
// KEYWORD SEARCHER
|
||||
|
|
@ -197,38 +138,7 @@ impl KeywordSearcher {
|
|||
mod tests {
|
||||
use super::*;
|
||||
|
||||
/// Plain words only gain the surrounding phrase quotes.
#[test]
fn test_sanitize_fts5_query_basic() {
    let sanitized = sanitize_fts5_query("hello world");
    assert_eq!(sanitized, "\"hello world\"");
}
|
||||
|
||||
/// Boolean operator keywords are removed from the query.
#[test]
fn test_sanitize_fts5_query_operators() {
    let cases = [
        ("hello OR world", "\"hello world\""),
        ("hello AND world", "\"hello world\""),
        ("NOT hello", "\"hello\""),
    ];
    for (input, expected) in cases.iter() {
        assert_eq!(sanitize_fts5_query(input), *expected);
    }
}
|
||||
|
||||
/// Wildcard, column-filter and boost characters become separators.
#[test]
fn test_sanitize_fts5_query_special_chars() {
    let cases = [
        ("hello* world", "\"hello world\""),
        ("content:secret", "\"content secret\""),
        ("^boost", "\"boost\""),
    ];
    for (input, expected) in cases.iter() {
        assert_eq!(sanitize_fts5_query(input), *expected);
    }
}
|
||||
|
||||
/// Inputs with no surviving terms collapse to the empty phrase.
#[test]
fn test_sanitize_fts5_query_empty() {
    let empty_phrase = "\"\"";
    for input in ["", " ", "* : ^"].iter() {
        assert_eq!(sanitize_fts5_query(input), empty_phrase);
    }
}
|
||||
|
||||
/// Oversized input is truncated before being quoted.
#[test]
fn test_sanitize_fts5_query_length_limit() {
    let oversized: String = std::iter::repeat('a').take(2000).collect();
    let output = sanitize_fts5_query(&oversized);
    assert!(output.len() <= 1004);
}
|
||||
// FTS5 sanitization tests are in crate::fts::tests
|
||||
|
||||
#[test]
|
||||
fn test_tokenize() {
|
||||
|
|
|
|||
|
|
@ -4,8 +4,10 @@
|
|||
|
||||
use chrono::{DateTime, Duration, Utc};
|
||||
use directories::ProjectDirs;
|
||||
#[cfg(feature = "embeddings")]
|
||||
use lru::LruCache;
|
||||
use rusqlite::{params, Connection, OptionalExtension};
|
||||
#[cfg(feature = "embeddings")]
|
||||
use std::num::NonZeroUsize;
|
||||
use std::path::PathBuf;
|
||||
use std::sync::Mutex;
|
||||
|
|
@ -16,10 +18,12 @@ use crate::fsrs::{
|
|||
FSRSScheduler, FSRSState, LearningState, Rating,
|
||||
};
|
||||
use crate::memory::{
|
||||
ConsolidationResult, EmbeddingResult, IngestInput, KnowledgeNode, MatchType, MemoryStats,
|
||||
RecallInput, SearchMode, SearchResult, SimilarityResult,
|
||||
ConsolidationResult, IngestInput, KnowledgeNode, MemoryStats,
|
||||
RecallInput, SearchMode,
|
||||
};
|
||||
use crate::search::sanitize_fts5_query;
|
||||
#[cfg(all(feature = "embeddings", feature = "vector-search"))]
|
||||
use crate::memory::{EmbeddingResult, MatchType, SearchResult, SimilarityResult};
|
||||
use crate::fts::sanitize_fts5_query;
|
||||
|
||||
#[cfg(feature = "embeddings")]
|
||||
use crate::embeddings::{matryoshka_truncate, Embedding, EmbeddingService, EMBEDDING_DIMENSIONS};
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue