mirror of
https://github.com/elicpeter/nyx.git
synced 2026-06-09 19:45:13 +02:00
fix(db): fast-fail Indexer::init on non-SQLite files via magic-header preflight
This commit is contained in:
parent
946cb6a9bc
commit
8abb023dd0
11 changed files with 648 additions and 17 deletions
|
|
@ -19,6 +19,7 @@ pub mod index {
|
|||
use r2d2_sqlite::SqliteConnectionManager;
|
||||
use rusqlite::{Connection, OpenFlags, OptionalExtension, params};
|
||||
use std::fs;
|
||||
use std::io::Read;
|
||||
use std::ops::Deref;
|
||||
use std::path::{Path, PathBuf};
|
||||
use std::str::FromStr;
|
||||
|
|
@ -332,9 +333,62 @@ pub mod index {
|
|||
project: String,
|
||||
}
|
||||
|
||||
/// SQLite database files start with this 16-byte ASCII magic.
|
||||
const SQLITE_MAGIC: &[u8; 16] = b"SQLite format 3\0";
|
||||
|
||||
/// Reject obviously non-SQLite files before handing them to the
|
||||
/// connection pool, where the same rejection costs minutes instead of
|
||||
/// microseconds on some corruption shapes.
|
||||
///
|
||||
/// Returns `Ok(())` when:
|
||||
/// * the file does not exist (the pool will `CREATE` it),
|
||||
/// * the file is zero-length (SQLite treats this as a fresh DB),
|
||||
/// * the first 16 bytes match the SQLite magic header,
|
||||
/// * the file is shorter than the magic but non-empty (extremely
|
||||
/// unusual; we defer to SQLite rather than gating arbitrarily).
|
||||
///
|
||||
/// Returns `Err(NyxError::Sql(...))` carrying `SQLITE_NOTADB` when the
|
||||
/// header is present but does not match.
|
||||
fn preflight_header(database_path: &Path) -> NyxResult<()> {
|
||||
let Ok(meta) = fs::metadata(database_path) else {
|
||||
return Ok(());
|
||||
};
|
||||
if !meta.is_file() {
|
||||
return Ok(());
|
||||
}
|
||||
if meta.len() < SQLITE_MAGIC.len() as u64 {
|
||||
return Ok(());
|
||||
}
|
||||
let mut head = [0u8; 16];
|
||||
let mut f = fs::File::open(database_path)?;
|
||||
f.read_exact(&mut head)?;
|
||||
if &head != SQLITE_MAGIC {
|
||||
return Err(NyxError::Sql(rusqlite::Error::SqliteFailure(
|
||||
rusqlite::ffi::Error::new(rusqlite::ffi::SQLITE_NOTADB),
|
||||
Some(format!(
|
||||
"file at {} is not a SQLite database (header magic mismatch)",
|
||||
database_path.display(),
|
||||
)),
|
||||
)));
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
impl Indexer {
|
||||
pub fn init(database_path: &Path) -> NyxResult<Arc<Pool<SqliteConnectionManager>>> {
|
||||
let _span = tracing::info_span!("db_init", path = %database_path.display()).entered();
|
||||
|
||||
// Fast-fail when the existing file is clearly not a SQLite
// database. Without this guard, certain corruption shapes (most
// notably a header overwritten with arbitrary bytes) can keep
// SQLite busy for 150-200 seconds inside the PRAGMA / schema
// execution below before it surfaces SQLITE_NOTADB or
// SQLITE_CORRUPT.
// The preflight only compares the 16-byte magic header: a
// zero-length file is treated as a fresh DB by SQLite, so we only
// validate when the file is large enough to hold the magic. A
// truncated header (shorter than the magic) and mid-page damage
// that preserves the magic are NOT caught here and are still left
// to SQLite.
|
||||
preflight_header(database_path)?;
|
||||
|
||||
// NO_MUTEX is safe because r2d2 ensures each pooled connection
|
||||
// is only ever used by one thread at a time. Combined with WAL
|
||||
// mode this allows concurrent readers + a single writer without
|
||||
|
|
|
|||
|
|
@ -492,9 +492,41 @@ fn entry_kind_from_summary(_kind: &crate::entry_points::EntryKind) -> EntryKind
|
|||
|
||||
// ── Helpers ──────────────────────────────────────────────────────────────────
|
||||
|
||||
/// Resolve the language for a finding path using extension first, then a
|
||||
/// shebang / content sniff against the first 200 bytes of the file.
|
||||
///
|
||||
/// Phase 02 widens this resolver beyond `Lang::from_extension` so that
|
||||
/// extensionless CLI entry points and idiomatic non-canonical extensions
|
||||
/// (`.cjs`, `.mts`, `.pyi`, …) no longer cause `SpecDerivationFailed`. File
|
||||
/// I/O is best-effort: an unreadable / absent file falls through to the
|
||||
/// extension-only path so callers in tests that pass synthetic paths still
|
||||
/// resolve when the extension is well-known.
|
||||
fn lang_from_path(path: &str) -> Option<Lang> {
|
||||
let ext = Path::new(path).extension().and_then(|e| e.to_str()).unwrap_or("");
|
||||
Lang::from_extension(ext)
|
||||
let p = Path::new(path);
|
||||
if let Some(ext) = p.extension().and_then(|e| e.to_str()) {
|
||||
if let Some(lang) = Lang::from_extension(ext) {
|
||||
return Some(lang);
|
||||
}
|
||||
}
|
||||
// Fall back to a shebang / content sniff over the file head.
|
||||
let head = read_file_head(p, 200);
|
||||
if head.is_empty() {
|
||||
return None;
|
||||
}
|
||||
Lang::from_path_or_content(p, &head)
|
||||
}
|
||||
|
||||
/// Read at most `cap` bytes from the start of `path`.
///
/// Best-effort by design: if the file cannot be opened the result is empty,
/// and a read error is ignored (whatever was read before the error is kept).
/// The verifier never wants a missing file to abort spec derivation —
/// callers downstream already gate on `Lang` being `Some`.
fn read_file_head(path: &Path, cap: usize) -> Vec<u8> {
    use std::io::Read;

    let mut head = Vec::with_capacity(cap);
    if let Ok(file) = std::fs::File::open(path) {
        // The result is deliberately discarded; see the doc comment.
        let _ = file.take(cap as u64).read_to_end(&mut head);
    }
    head
}
|
||||
|
||||
/// Return the first non-empty `function` annotation found on any flow step.
|
||||
|
|
|
|||
|
|
@ -12,6 +12,7 @@
|
|||
|
||||
use serde::{Deserialize, Serialize};
|
||||
use std::fmt;
|
||||
use std::path::Path;
|
||||
|
||||
/// Supported source-code languages.
|
||||
///
|
||||
|
|
@ -59,23 +60,71 @@ impl Lang {
|
|||
///
|
||||
/// Mirrors the extension→language mapping in `ast::lang_for_path()` so that
|
||||
/// callers outside `ast` can obtain a `Lang` from a path without needing a
|
||||
/// `FuncSummary`.
|
||||
/// `FuncSummary`. Match is case-insensitive (ASCII).
|
||||
///
|
||||
/// Extension coverage is intentionally broader than the tree-sitter loader
|
||||
/// in `ast::lang_for_path` because this function is consumed by the
|
||||
/// dynamic verifier, which must classify *every* finding-bearing path so
|
||||
/// that spec derivation does not collapse on idiomatic file extensions
|
||||
/// like `.cjs`, `.mts`, `.pyi`, or `.kts`. JVM-family `.kt` / `.kts` map
|
||||
/// to [`Lang::Java`] because the spec/toolchain layer is JVM-aware even
|
||||
/// where the tree-sitter grammar is not.
|
||||
pub fn from_extension(ext: &str) -> Option<Lang> {
|
||||
match ext {
|
||||
let lower = ext.to_ascii_lowercase();
|
||||
match lower.as_str() {
|
||||
"rs" => Some(Lang::Rust),
|
||||
"c" => Some(Lang::C),
|
||||
"cpp" => Some(Lang::Cpp),
|
||||
"java" => Some(Lang::Java),
|
||||
"cpp" | "cc" | "cxx" | "c++" | "hpp" | "hxx" | "hh" | "h++" => Some(Lang::Cpp),
|
||||
// Java family. `.kt` / `.kts` are Kotlin (JVM); the dynamic spec
|
||||
// layer treats them as Java for toolchain selection purposes.
|
||||
"java" | "kt" | "kts" => Some(Lang::Java),
|
||||
"go" => Some(Lang::Go),
|
||||
"php" => Some(Lang::Php),
|
||||
"py" => Some(Lang::Python),
|
||||
"ts" => Some(Lang::TypeScript),
|
||||
"js" => Some(Lang::JavaScript),
|
||||
// `.pyi` are Python stub files; spec derivation accepts them so
|
||||
// typed-stub-only entry points still register a language.
|
||||
"py" | "pyi" => Some(Lang::Python),
|
||||
// `.mts` / `.cts` are TypeScript module-form (ES module / CommonJS).
|
||||
"ts" | "tsx" | "mts" | "cts" => Some(Lang::TypeScript),
|
||||
// `.mjs` / `.cjs` are JavaScript module-form. `.jsx` is React JSX.
|
||||
"js" | "jsx" | "mjs" | "cjs" => Some(Lang::JavaScript),
|
||||
"rb" => Some(Lang::Ruby),
|
||||
_ => None,
|
||||
}
|
||||
}
|
||||
|
||||
/// Probe a path's language using extension first, then a shebang line on
|
||||
/// `head_bytes`, then a content-byte heuristic on the first 200 bytes.
|
||||
///
|
||||
/// `head_bytes` should be the first N bytes of the file (200 is plenty;
|
||||
/// callers may pass more). Empty / unreadable files return `None`.
|
||||
///
|
||||
/// Order:
|
||||
/// 1. [`Lang::from_extension`] on the path's extension — fast path.
|
||||
/// 2. Shebang inspection. Common interpreter aliases are recognised:
|
||||
/// `python` / `python3` → [`Lang::Python`], `node` / `nodejs` / `deno`
|
||||
/// / `bun` → [`Lang::JavaScript`], `ruby` → [`Lang::Ruby`], `php` →
|
||||
/// [`Lang::Php`]. `/usr/bin/env <interp>` and direct
|
||||
/// `/usr/bin/<interp>` paths both work.
|
||||
/// 3. Content-byte syntactic sniff: line-prefix matches on the first 200
|
||||
/// bytes (`<?php`, `package main`, Java `package …;`, `fn main`, etc.).
|
||||
/// The sniff stands in for a full tree-sitter parse — it is cheaper
|
||||
/// and covers the verifier's failure modes without paying the cost of
|
||||
/// loading every grammar for every extensionless file.
|
||||
///
|
||||
/// Used by [`crate::dynamic::spec`] so spec derivation no longer rejects
|
||||
/// CLI entry points and other extensionless / non-canonical files.
|
||||
pub fn from_path_or_content(path: &Path, head_bytes: &[u8]) -> Option<Lang> {
|
||||
if let Some(ext) = path.extension().and_then(|e| e.to_str()) {
|
||||
if let Some(lang) = Self::from_extension(ext) {
|
||||
return Some(lang);
|
||||
}
|
||||
}
|
||||
if let Some(lang) = lang_from_shebang(head_bytes) {
|
||||
return Some(lang);
|
||||
}
|
||||
sniff_content_lang(head_bytes)
|
||||
}
|
||||
|
||||
/// Canonical slug string for this language.
|
||||
pub fn as_str(&self) -> &'static str {
|
||||
match self {
|
||||
|
|
@ -288,5 +337,113 @@ pub fn namespace_with_package(
|
|||
}
|
||||
}
|
||||
|
||||
/// Maximum bytes of `head_bytes` consulted by the shebang / content sniff.
|
||||
/// Larger reads are tolerated — the helpers truncate internally.
|
||||
const SNIFF_HEAD_LIMIT: usize = 200;
|
||||
|
||||
/// Parse a `#!` shebang line and map the interpreter name to a `Lang`.
|
||||
///
|
||||
/// Handles `/usr/bin/env <interp>` (with optional `-S` / `-i` flags),
|
||||
/// direct `/usr/bin/<interp>`, and bare `<interp>` forms. Trailing version
|
||||
/// digits (`python3`, `python3.11`) are stripped so the lookup matches the
|
||||
/// base interpreter. Returns `None` for non-Nyx-supported interpreters
|
||||
/// (`bash`, `sh`, `perl`, …).
|
||||
fn lang_from_shebang(head: &[u8]) -> Option<Lang> {
|
||||
if !head.starts_with(b"#!") {
|
||||
return None;
|
||||
}
|
||||
let cap = head.len().min(SNIFF_HEAD_LIMIT);
|
||||
let line_end = head[..cap]
|
||||
.iter()
|
||||
.position(|&b| b == b'\n')
|
||||
.unwrap_or(cap);
|
||||
let line = std::str::from_utf8(&head[..line_end]).ok()?;
|
||||
let line = line.trim_end_matches('\r').trim();
|
||||
let rest = line.strip_prefix("#!")?.trim();
|
||||
|
||||
let mut tokens = rest.split_whitespace();
|
||||
let first = tokens.next()?;
|
||||
let interpreter = if first.ends_with("/env") || first == "env" {
|
||||
// Skip env's own options (e.g. `-S`, `-i`, `--split-string`).
|
||||
tokens.find(|t| !t.starts_with('-'))?
|
||||
} else {
|
||||
first.rsplit('/').next()?
|
||||
};
|
||||
|
||||
let base: String = interpreter
|
||||
.chars()
|
||||
.take_while(|c| c.is_ascii_alphabetic())
|
||||
.collect();
|
||||
match base.as_str() {
|
||||
"python" => Some(Lang::Python),
|
||||
"node" | "nodejs" | "deno" | "bun" => Some(Lang::JavaScript),
|
||||
"ts" | "tsx" => Some(Lang::TypeScript),
|
||||
"ruby" => Some(Lang::Ruby),
|
||||
"php" => Some(Lang::Php),
|
||||
_ => None,
|
||||
}
|
||||
}
|
||||
|
||||
/// Lightweight syntactic sniff over the first 200 bytes of a file.
|
||||
///
|
||||
/// Skips a leading shebang line (callers already tried it), then inspects up
|
||||
/// to ~20 head lines for unambiguous language tokens. Returns `None` if
|
||||
/// nothing convinces; the verifier's caller will record `LangUnsupported`
|
||||
/// rather than misclassify.
|
||||
fn sniff_content_lang(head: &[u8]) -> Option<Lang> {
|
||||
if head.is_empty() {
|
||||
return None;
|
||||
}
|
||||
let cap = head.len().min(SNIFF_HEAD_LIMIT);
|
||||
let text = std::str::from_utf8(&head[..cap]).ok()?;
|
||||
let body = match (text.starts_with("#!"), text.find('\n')) {
|
||||
(true, Some(i)) => &text[i + 1..],
|
||||
_ => text,
|
||||
};
|
||||
|
||||
for raw in body.lines().take(20) {
|
||||
let line = raw.trim_start();
|
||||
if line.is_empty() {
|
||||
continue;
|
||||
}
|
||||
if line.starts_with("<?php") {
|
||||
return Some(Lang::Php);
|
||||
}
|
||||
if line.starts_with("package main") {
|
||||
return Some(Lang::Go);
|
||||
}
|
||||
// Java `package foo.bar;` always ends with a semicolon.
|
||||
if line.starts_with("package ") && line.trim_end().ends_with(';') {
|
||||
return Some(Lang::Java);
|
||||
}
|
||||
if line.starts_with("import java.") || line.starts_with("public class ") {
|
||||
return Some(Lang::Java);
|
||||
}
|
||||
if line.starts_with("from __future__")
|
||||
|| line.starts_with("from typing ")
|
||||
|| (line.starts_with("def ") && line.contains(':'))
|
||||
{
|
||||
return Some(Lang::Python);
|
||||
}
|
||||
if line.starts_with("fn main") || line.starts_with("use std::") {
|
||||
return Some(Lang::Rust);
|
||||
}
|
||||
if line.starts_with("func ") && line.contains('(') {
|
||||
return Some(Lang::Go);
|
||||
}
|
||||
if line.starts_with("require ") || line.starts_with("require_relative ") {
|
||||
return Some(Lang::Ruby);
|
||||
}
|
||||
if line.starts_with("function ")
|
||||
|| line.starts_with("const ")
|
||||
|| line.starts_with("import {")
|
||||
|| line.starts_with("export ")
|
||||
{
|
||||
return Some(Lang::JavaScript);
|
||||
}
|
||||
}
|
||||
None
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests;
|
||||
|
|
|
|||
|
|
@ -203,3 +203,138 @@ fn normalize_fallback_on_mismatch() {
|
|||
"/other/path/lib.rs"
|
||||
);
|
||||
}
|
||||
|
||||
// ── Phase 02: extension + shebang + content sniff ──────────────────────────
|
||||
|
||||
use std::path::Path;
|
||||
|
||||
#[test]
fn from_extension_accepts_phase02_additions() {
    // Every extension introduced in Phase 02 must map to its documented
    // language.
    let expected = [
        ("cjs", Lang::JavaScript),
        ("mjs", Lang::JavaScript),
        ("jsx", Lang::JavaScript),
        ("mts", Lang::TypeScript),
        ("cts", Lang::TypeScript),
        ("tsx", Lang::TypeScript),
        ("pyi", Lang::Python),
        ("kt", Lang::Java),
        ("kts", Lang::Java),
        // C++ inventory extended in Phase 01 / ast.rs: keep the helper aligned.
        ("cc", Lang::Cpp),
        ("hpp", Lang::Cpp),
    ];
    for (ext, lang) in expected {
        assert_eq!(Lang::from_extension(ext), Some(lang), "extension {ext:?}");
    }
}

#[test]
fn from_extension_is_case_insensitive() {
    // Real-world filesystems mix case (especially on Windows / macOS).
    let expected = [
        ("PY", Lang::Python),
        ("Java", Lang::Java),
        ("JSX", Lang::JavaScript),
    ];
    for (ext, lang) in expected {
        assert_eq!(Lang::from_extension(ext), Some(lang), "extension {ext:?}");
    }
}

#[test]
fn from_path_or_content_extension_wins() {
    // A recognised extension takes precedence even over a contradicting
    // shebang — file-format ground truth beats hand-edited interpreter hints.
    assert_eq!(
        Lang::from_path_or_content(
            Path::new("/tmp/script.py"),
            b"#!/usr/bin/env node\nprint('hi')\n",
        ),
        Some(Lang::Python)
    );
}

#[test]
fn from_path_or_content_shebang_python_env() {
    assert_eq!(
        Lang::from_path_or_content(
            Path::new("/tmp/runme"),
            b"#!/usr/bin/env python3\nimport os\n",
        ),
        Some(Lang::Python)
    );
}

#[test]
fn from_path_or_content_shebang_node_direct() {
    assert_eq!(
        Lang::from_path_or_content(
            Path::new("/tmp/runme"),
            b"#!/usr/local/bin/node\nconsole.log(1)\n",
        ),
        Some(Lang::JavaScript)
    );
}

#[test]
fn from_path_or_content_shebang_ruby_direct() {
    assert_eq!(
        Lang::from_path_or_content(Path::new("/tmp/runme"), b"#!/usr/bin/ruby\nputs 1\n"),
        Some(Lang::Ruby)
    );
}

#[test]
fn from_path_or_content_shebang_php() {
    assert_eq!(
        Lang::from_path_or_content(
            Path::new("/tmp/runme"),
            b"#!/usr/bin/env php\n<?php echo 1;\n",
        ),
        Some(Lang::Php)
    );
}

#[test]
fn from_path_or_content_shebang_with_env_dash_flag() {
    // `env -S` is the portable trick for passing args; the real interpreter
    // is the first non-flag token after env.
    assert_eq!(
        Lang::from_path_or_content(
            Path::new("/tmp/runme"),
            b"#!/usr/bin/env -S python3 -u\nimport sys\n",
        ),
        Some(Lang::Python)
    );
}

#[test]
fn from_path_or_content_shebang_unknown_interpreter_falls_through_to_sniff() {
    // bash isn't a supported language, so the shebang step yields None; the
    // `<?php` opener in the body must still be found by the content sniff.
    assert_eq!(
        Lang::from_path_or_content(
            Path::new("/tmp/runme"),
            b"#!/bin/bash\n<?php echo 1; ?>\n",
        ),
        Some(Lang::Php)
    );
}

#[test]
fn from_path_or_content_content_sniff_php() {
    assert_eq!(
        Lang::from_path_or_content(Path::new("/tmp/runme"), b"<?php echo 'hi'; ?>"),
        Some(Lang::Php)
    );
}

#[test]
fn from_path_or_content_content_sniff_go_package_main() {
    assert_eq!(
        Lang::from_path_or_content(
            Path::new("/tmp/runme"),
            b"package main\n\nimport \"fmt\"\n",
        ),
        Some(Lang::Go)
    );
}

#[test]
fn from_path_or_content_content_sniff_java_package_semicolon() {
    assert_eq!(
        Lang::from_path_or_content(
            Path::new("/tmp/runme"),
            b"package com.example.app;\n\npublic class Main {}\n",
        ),
        Some(Lang::Java)
    );
}

#[test]
fn from_path_or_content_content_sniff_python_def() {
    assert_eq!(
        Lang::from_path_or_content(
            Path::new("/tmp/runme"),
            b"\"\"\"docstring\"\"\"\n\ndef handle(x):\n return x\n",
        ),
        Some(Lang::Python)
    );
}

#[test]
fn from_path_or_content_content_sniff_rust_use_std() {
    assert_eq!(
        Lang::from_path_or_content(
            Path::new("/tmp/runme"),
            b"use std::path::Path;\n\nfn main() {}\n",
        ),
        Some(Lang::Rust)
    );
}

#[test]
fn from_path_or_content_returns_none_when_nothing_matches() {
    assert_eq!(
        Lang::from_path_or_content(Path::new("/tmp/runme.weird"), b"plain text data"),
        None
    );
}

#[test]
fn from_path_or_content_empty_head_with_unknown_extension_returns_none() {
    assert_eq!(
        Lang::from_path_or_content(Path::new("/tmp/runme"), b""),
        None
    );
}
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue