mirror of
https://github.com/elicpeter/nyx.git
synced 2026-06-09 19:45:13 +02:00
fix(db): fast-fail Indexer::init on non-SQLite files via magic-header preflight
This commit is contained in:
parent
946cb6a9bc
commit
8abb023dd0
11 changed files with 648 additions and 17 deletions
|
|
@ -19,6 +19,7 @@ pub mod index {
|
|||
use r2d2_sqlite::SqliteConnectionManager;
|
||||
use rusqlite::{Connection, OpenFlags, OptionalExtension, params};
|
||||
use std::fs;
|
||||
use std::io::Read;
|
||||
use std::ops::Deref;
|
||||
use std::path::{Path, PathBuf};
|
||||
use std::str::FromStr;
|
||||
|
|
@ -332,9 +333,62 @@ pub mod index {
|
|||
project: String,
|
||||
}
|
||||
|
||||
/// SQLite database files start with this 16-byte ASCII magic.
|
||||
const SQLITE_MAGIC: &[u8; 16] = b"SQLite format 3\0";
|
||||
|
||||
/// Reject obviously non-SQLite files before handing them to the
|
||||
/// connection pool, where the same rejection costs minutes instead of
|
||||
/// microseconds on some corruption shapes.
|
||||
///
|
||||
/// Returns `Ok(())` when:
|
||||
/// * the file does not exist (the pool will `CREATE` it),
|
||||
/// * the file is zero-length (SQLite treats this as a fresh DB),
|
||||
/// * the first 16 bytes match the SQLite magic header,
|
||||
/// * the file is shorter than the magic but non-empty (extremely
|
||||
/// unusual; we defer to SQLite rather than gating arbitrarily).
|
||||
///
|
||||
/// Returns `Err(NyxError::Sql(...))` carrying `SQLITE_NOTADB` when the
|
||||
/// header is present but does not match.
|
||||
fn preflight_header(database_path: &Path) -> NyxResult<()> {
|
||||
let Ok(meta) = fs::metadata(database_path) else {
|
||||
return Ok(());
|
||||
};
|
||||
if !meta.is_file() {
|
||||
return Ok(());
|
||||
}
|
||||
if meta.len() < SQLITE_MAGIC.len() as u64 {
|
||||
return Ok(());
|
||||
}
|
||||
let mut head = [0u8; 16];
|
||||
let mut f = fs::File::open(database_path)?;
|
||||
f.read_exact(&mut head)?;
|
||||
if &head != SQLITE_MAGIC {
|
||||
return Err(NyxError::Sql(rusqlite::Error::SqliteFailure(
|
||||
rusqlite::ffi::Error::new(rusqlite::ffi::SQLITE_NOTADB),
|
||||
Some(format!(
|
||||
"file at {} is not a SQLite database (header magic mismatch)",
|
||||
database_path.display(),
|
||||
)),
|
||||
)));
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
impl Indexer {
|
||||
pub fn init(database_path: &Path) -> NyxResult<Arc<Pool<SqliteConnectionManager>>> {
|
||||
let _span = tracing::info_span!("db_init", path = %database_path.display()).entered();
|
||||
|
||||
// Fast-fail when the existing file is clearly not a SQLite
// database, i.e. it is at least 16 bytes long and does not start
// with the SQLite magic header. Without this guard, certain
// corruption shapes (e.g. a header overwritten with arbitrary
// bytes) can keep SQLite busy for 150-200 seconds inside the
// PRAGMA / schema execution below before it surfaces
// SQLITE_NOTADB or SQLITE_CORRUPT.
// Files shorter than the 16-byte magic (including a zero-length
// file, which SQLite treats as a fresh DB) and damage that
// preserves the magic are not detected here; they are still left
// to SQLite.
|
||||
preflight_header(database_path)?;
|
||||
|
||||
// NO_MUTEX is safe because r2d2 ensures each pooled connection
|
||||
// is only ever used by one thread at a time. Combined with WAL
|
||||
// mode this allows concurrent readers + a single writer without
|
||||
|
|
|
|||
|
|
@ -492,9 +492,41 @@ fn entry_kind_from_summary(_kind: &crate::entry_points::EntryKind) -> EntryKind
|
|||
|
||||
// ── Helpers ──────────────────────────────────────────────────────────────────
|
||||
|
||||
/// Resolve the language for a finding path using extension first, then a
|
||||
/// shebang / content sniff against the first 200 bytes of the file.
|
||||
///
|
||||
/// Phase 02 widens this resolver beyond `Lang::from_extension` so that
|
||||
/// extensionless CLI entry points and idiomatic non-canonical extensions
|
||||
/// (`.cjs`, `.mts`, `.pyi`, …) no longer cause `SpecDerivationFailed`. File
|
||||
/// I/O is best-effort: an unreadable / absent file falls through to the
|
||||
/// extension-only path so callers in tests that pass synthetic paths still
|
||||
/// resolve when the extension is well-known.
|
||||
fn lang_from_path(path: &str) -> Option<Lang> {
|
||||
let ext = Path::new(path).extension().and_then(|e| e.to_str()).unwrap_or("");
|
||||
Lang::from_extension(ext)
|
||||
let p = Path::new(path);
|
||||
if let Some(ext) = p.extension().and_then(|e| e.to_str()) {
|
||||
if let Some(lang) = Lang::from_extension(ext) {
|
||||
return Some(lang);
|
||||
}
|
||||
}
|
||||
// Fall back to a shebang / content sniff over the file head.
|
||||
let head = read_file_head(p, 200);
|
||||
if head.is_empty() {
|
||||
return None;
|
||||
}
|
||||
Lang::from_path_or_content(p, &head)
|
||||
}
|
||||
|
||||
/// Best-effort read of the first `cap` bytes of the file at `path`.
///
/// Never fails: if the file cannot be opened the result is empty, and if a
/// read error occurs part-way through, whatever was read before the error
/// is returned. Callers decide what an empty or short head means.
fn read_file_head(path: &Path, cap: usize) -> Vec<u8> {
    use std::io::Read;

    let mut head = Vec::with_capacity(cap);
    if let Ok(file) = std::fs::File::open(path) {
        // The read result is deliberately ignored; `head` keeps any bytes
        // that arrived before a failure.
        let _ = file.take(cap as u64).read_to_end(&mut head);
    }
    head
}
|
||||
|
||||
/// Return the first non-empty `function` annotation found on any flow step.
|
||||
|
|
|
|||
|
|
@ -12,6 +12,7 @@
|
|||
|
||||
use serde::{Deserialize, Serialize};
|
||||
use std::fmt;
|
||||
use std::path::Path;
|
||||
|
||||
/// Supported source-code languages.
|
||||
///
|
||||
|
|
@ -59,23 +60,71 @@ impl Lang {
|
|||
///
|
||||
/// Mirrors the extension→language mapping in `ast::lang_for_path()` so that
|
||||
/// callers outside `ast` can obtain a `Lang` from a path without needing a
|
||||
/// `FuncSummary`. Match is case-insensitive (ASCII).
|
||||
///
|
||||
/// Extension coverage is intentionally broader than the tree-sitter loader
|
||||
/// in `ast::lang_for_path` because this function is consumed by the
|
||||
/// dynamic verifier, which must classify *every* finding-bearing path so
|
||||
/// that spec derivation does not collapse on idiomatic file extensions
|
||||
/// like `.cjs`, `.mts`, `.pyi`, or `.kts`. JVM-family `.kt` / `.kts` map
|
||||
/// to [`Lang::Java`] because the spec/toolchain layer is JVM-aware even
|
||||
/// where the tree-sitter grammar is not.
|
||||
pub fn from_extension(ext: &str) -> Option<Lang> {
|
||||
match ext {
|
||||
let lower = ext.to_ascii_lowercase();
|
||||
match lower.as_str() {
|
||||
"rs" => Some(Lang::Rust),
|
||||
"c" => Some(Lang::C),
|
||||
"cpp" => Some(Lang::Cpp),
|
||||
"java" => Some(Lang::Java),
|
||||
"cpp" | "cc" | "cxx" | "c++" | "hpp" | "hxx" | "hh" | "h++" => Some(Lang::Cpp),
|
||||
// Java family. `.kt` / `.kts` are Kotlin (JVM); the dynamic spec
|
||||
// layer treats them as Java for toolchain selection purposes.
|
||||
"java" | "kt" | "kts" => Some(Lang::Java),
|
||||
"go" => Some(Lang::Go),
|
||||
"php" => Some(Lang::Php),
|
||||
"py" => Some(Lang::Python),
|
||||
"ts" => Some(Lang::TypeScript),
|
||||
"js" => Some(Lang::JavaScript),
|
||||
// `.pyi` are Python stub files; spec derivation accepts them so
|
||||
// typed-stub-only entry points still register a language.
|
||||
"py" | "pyi" => Some(Lang::Python),
|
||||
// `.mts` / `.cts` are TypeScript module-form (ES module / CommonJS).
|
||||
"ts" | "tsx" | "mts" | "cts" => Some(Lang::TypeScript),
|
||||
// `.mjs` / `.cjs` are JavaScript module-form. `.jsx` is React JSX.
|
||||
"js" | "jsx" | "mjs" | "cjs" => Some(Lang::JavaScript),
|
||||
"rb" => Some(Lang::Ruby),
|
||||
_ => None,
|
||||
}
|
||||
}
|
||||
|
||||
/// Probe a path's language using extension first, then a shebang line on
|
||||
/// `head_bytes`, then a content-byte heuristic on the first 200 bytes.
|
||||
///
|
||||
/// `head_bytes` should be the first N bytes of the file (200 is plenty;
|
||||
/// callers may pass more). Empty / unreadable files return `None`.
|
||||
///
|
||||
/// Order:
|
||||
/// 1. [`Lang::from_extension`] on the path's extension — fast path.
|
||||
/// 2. Shebang inspection. Common interpreter aliases are recognised:
|
||||
/// `python` / `python3` → [`Lang::Python`], `node` / `nodejs` / `deno`
|
||||
/// / `bun` → [`Lang::JavaScript`], `ruby` → [`Lang::Ruby`], `php` →
|
||||
/// [`Lang::Php`]. `/usr/bin/env <interp>` and direct
|
||||
/// `/usr/bin/<interp>` paths both work.
|
||||
/// 3. Content-byte syntactic sniff: line-prefix matches on the first 200
|
||||
/// bytes (`<?php`, `package main`, Java `package …;`, `fn main`, etc.).
|
||||
/// The sniff stands in for a full tree-sitter parse — it is cheaper
|
||||
/// and covers the verifier's failure modes without paying the cost of
|
||||
/// loading every grammar for every extensionless file.
|
||||
///
|
||||
/// Used by [`crate::dynamic::spec`] so spec derivation no longer rejects
|
||||
/// CLI entry points and other extensionless / non-canonical files.
|
||||
pub fn from_path_or_content(path: &Path, head_bytes: &[u8]) -> Option<Lang> {
|
||||
if let Some(ext) = path.extension().and_then(|e| e.to_str()) {
|
||||
if let Some(lang) = Self::from_extension(ext) {
|
||||
return Some(lang);
|
||||
}
|
||||
}
|
||||
if let Some(lang) = lang_from_shebang(head_bytes) {
|
||||
return Some(lang);
|
||||
}
|
||||
sniff_content_lang(head_bytes)
|
||||
}
|
||||
|
||||
/// Canonical slug string for this language.
|
||||
pub fn as_str(&self) -> &'static str {
|
||||
match self {
|
||||
|
|
@ -288,5 +337,113 @@ pub fn namespace_with_package(
|
|||
}
|
||||
}
|
||||
|
||||
/// Maximum bytes of `head_bytes` consulted by the shebang / content sniff.
|
||||
/// Larger reads are tolerated — the helpers truncate internally.
|
||||
const SNIFF_HEAD_LIMIT: usize = 200;
|
||||
|
||||
/// Parse a `#!` shebang line and map the interpreter name to a `Lang`.
|
||||
///
|
||||
/// Handles `/usr/bin/env <interp>` (with optional `-S` / `-i` flags),
|
||||
/// direct `/usr/bin/<interp>`, and bare `<interp>` forms. Trailing version
|
||||
/// digits (`python3`, `python3.11`) are stripped so the lookup matches the
|
||||
/// base interpreter. Returns `None` for non-Nyx-supported interpreters
|
||||
/// (`bash`, `sh`, `perl`, …).
|
||||
fn lang_from_shebang(head: &[u8]) -> Option<Lang> {
|
||||
if !head.starts_with(b"#!") {
|
||||
return None;
|
||||
}
|
||||
let cap = head.len().min(SNIFF_HEAD_LIMIT);
|
||||
let line_end = head[..cap]
|
||||
.iter()
|
||||
.position(|&b| b == b'\n')
|
||||
.unwrap_or(cap);
|
||||
let line = std::str::from_utf8(&head[..line_end]).ok()?;
|
||||
let line = line.trim_end_matches('\r').trim();
|
||||
let rest = line.strip_prefix("#!")?.trim();
|
||||
|
||||
let mut tokens = rest.split_whitespace();
|
||||
let first = tokens.next()?;
|
||||
let interpreter = if first.ends_with("/env") || first == "env" {
|
||||
// Skip env's own options (e.g. `-S`, `-i`, `--split-string`).
|
||||
tokens.find(|t| !t.starts_with('-'))?
|
||||
} else {
|
||||
first.rsplit('/').next()?
|
||||
};
|
||||
|
||||
let base: String = interpreter
|
||||
.chars()
|
||||
.take_while(|c| c.is_ascii_alphabetic())
|
||||
.collect();
|
||||
match base.as_str() {
|
||||
"python" => Some(Lang::Python),
|
||||
"node" | "nodejs" | "deno" | "bun" => Some(Lang::JavaScript),
|
||||
"ts" | "tsx" => Some(Lang::TypeScript),
|
||||
"ruby" => Some(Lang::Ruby),
|
||||
"php" => Some(Lang::Php),
|
||||
_ => None,
|
||||
}
|
||||
}
|
||||
|
||||
/// Lightweight syntactic sniff over the first 200 bytes of a file.
|
||||
///
|
||||
/// Skips a leading shebang line (callers already tried it), then inspects up
|
||||
/// to ~20 head lines for unambiguous language tokens. Returns `None` if
|
||||
/// nothing convinces; the verifier's caller will record `LangUnsupported`
|
||||
/// rather than misclassify.
|
||||
fn sniff_content_lang(head: &[u8]) -> Option<Lang> {
|
||||
if head.is_empty() {
|
||||
return None;
|
||||
}
|
||||
let cap = head.len().min(SNIFF_HEAD_LIMIT);
|
||||
let text = std::str::from_utf8(&head[..cap]).ok()?;
|
||||
let body = match (text.starts_with("#!"), text.find('\n')) {
|
||||
(true, Some(i)) => &text[i + 1..],
|
||||
_ => text,
|
||||
};
|
||||
|
||||
for raw in body.lines().take(20) {
|
||||
let line = raw.trim_start();
|
||||
if line.is_empty() {
|
||||
continue;
|
||||
}
|
||||
if line.starts_with("<?php") {
|
||||
return Some(Lang::Php);
|
||||
}
|
||||
if line.starts_with("package main") {
|
||||
return Some(Lang::Go);
|
||||
}
|
||||
// Java `package foo.bar;` always ends with a semicolon.
|
||||
if line.starts_with("package ") && line.trim_end().ends_with(';') {
|
||||
return Some(Lang::Java);
|
||||
}
|
||||
if line.starts_with("import java.") || line.starts_with("public class ") {
|
||||
return Some(Lang::Java);
|
||||
}
|
||||
if line.starts_with("from __future__")
|
||||
|| line.starts_with("from typing ")
|
||||
|| (line.starts_with("def ") && line.contains(':'))
|
||||
{
|
||||
return Some(Lang::Python);
|
||||
}
|
||||
if line.starts_with("fn main") || line.starts_with("use std::") {
|
||||
return Some(Lang::Rust);
|
||||
}
|
||||
if line.starts_with("func ") && line.contains('(') {
|
||||
return Some(Lang::Go);
|
||||
}
|
||||
if line.starts_with("require ") || line.starts_with("require_relative ") {
|
||||
return Some(Lang::Ruby);
|
||||
}
|
||||
if line.starts_with("function ")
|
||||
|| line.starts_with("const ")
|
||||
|| line.starts_with("import {")
|
||||
|| line.starts_with("export ")
|
||||
{
|
||||
return Some(Lang::JavaScript);
|
||||
}
|
||||
}
|
||||
None
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests;
|
||||
|
|
|
|||
|
|
@ -203,3 +203,138 @@ fn normalize_fallback_on_mismatch() {
|
|||
"/other/path/lib.rs"
|
||||
);
|
||||
}
|
||||
|
||||
// ── Phase 02: extension + shebang + content sniff ──────────────────────────
|
||||
|
||||
use std::path::Path;
|
||||
|
||||
#[test]
fn from_extension_accepts_phase02_additions() {
    // Every extension added in Phase 02, paired with its documented language.
    // The last two rows are the C++ inventory extended in Phase 01 / ast.rs,
    // kept here so the helper stays aligned.
    let expectations = [
        ("cjs", Lang::JavaScript),
        ("mjs", Lang::JavaScript),
        ("jsx", Lang::JavaScript),
        ("mts", Lang::TypeScript),
        ("cts", Lang::TypeScript),
        ("tsx", Lang::TypeScript),
        ("pyi", Lang::Python),
        ("kt", Lang::Java),
        ("kts", Lang::Java),
        ("cc", Lang::Cpp),
        ("hpp", Lang::Cpp),
    ];
    for (extension, expected) in expectations {
        assert_eq!(Lang::from_extension(extension), Some(expected));
    }
}
|
||||
|
||||
#[test]
fn from_extension_is_case_insensitive() {
    // Extensions arrive in mixed case in practice (notably from Windows and
    // macOS checkouts); upper- and title-case spellings must still resolve.
    let expectations = [
        ("PY", Lang::Python),
        ("Java", Lang::Java),
        ("JSX", Lang::JavaScript),
    ];
    for (extension, expected) in expectations {
        assert_eq!(Lang::from_extension(extension), Some(expected));
    }
}
|
||||
|
||||
#[test]
fn from_path_or_content_extension_wins() {
    // The `.py` extension has to outrank the contradicting `node` shebang:
    // the file's declared format is trusted over a hand-edited interpreter
    // line.
    let misleading_head = b"#!/usr/bin/env node\nprint('hi')\n";
    let detected = Lang::from_path_or_content(Path::new("/tmp/script.py"), misleading_head);
    assert_eq!(detected, Some(Lang::Python));
}
|
||||
|
||||
#[test]
fn from_path_or_content_shebang_python_env() {
    // Extensionless path; the `env python3` shebang is the only signal.
    let detected = Lang::from_path_or_content(
        Path::new("/tmp/runme"),
        b"#!/usr/bin/env python3\nimport os\n",
    );
    assert_eq!(detected, Some(Lang::Python));
}
|
||||
|
||||
#[test]
fn from_path_or_content_shebang_node_direct() {
    // Interpreter named by absolute path, without going through `env`.
    let detected = Lang::from_path_or_content(
        Path::new("/tmp/runme"),
        b"#!/usr/local/bin/node\nconsole.log(1)\n",
    );
    assert_eq!(detected, Some(Lang::JavaScript));
}
|
||||
|
||||
#[test]
fn from_path_or_content_shebang_ruby_direct() {
    // Direct `/usr/bin/ruby` shebang on an extensionless path.
    let detected =
        Lang::from_path_or_content(Path::new("/tmp/runme"), b"#!/usr/bin/ruby\nputs 1\n");
    assert_eq!(detected, Some(Lang::Ruby));
}
|
||||
|
||||
#[test]
fn from_path_or_content_shebang_php() {
    // `env php` shebang followed by a PHP opener; either signal says PHP.
    let detected = Lang::from_path_or_content(
        Path::new("/tmp/runme"),
        b"#!/usr/bin/env php\n<?php echo 1;\n",
    );
    assert_eq!(detected, Some(Lang::Php));
}
|
||||
|
||||
#[test]
fn from_path_or_content_shebang_with_env_dash_flag() {
    // `env -S` is the portable way to pass interpreter arguments, so the
    // real interpreter is the first token after env's own flag.
    let detected = Lang::from_path_or_content(
        Path::new("/tmp/runme"),
        b"#!/usr/bin/env -S python3 -u\nimport sys\n",
    );
    assert_eq!(detected, Some(Lang::Python));
}
|
||||
|
||||
#[test]
fn from_path_or_content_shebang_unknown_interpreter_falls_through_to_sniff() {
    // `bash` is not a supported language, so the shebang probe yields
    // nothing; the `<?php` opener on the next line must then be caught by
    // the content sniff.
    let detected = Lang::from_path_or_content(
        Path::new("/tmp/runme"),
        b"#!/bin/bash\n<?php echo 1; ?>\n",
    );
    assert_eq!(detected, Some(Lang::Php));
}
|
||||
|
||||
#[test]
fn from_path_or_content_content_sniff_php() {
    // No extension, no shebang: only the `<?php` opener identifies the file.
    let detected = Lang::from_path_or_content(Path::new("/tmp/runme"), b"<?php echo 'hi'; ?>");
    assert_eq!(detected, Some(Lang::Php));
}
|
||||
|
||||
#[test]
fn from_path_or_content_content_sniff_go_package_main() {
    // A bare `package main` clause is the Go signal.
    let detected = Lang::from_path_or_content(
        Path::new("/tmp/runme"),
        b"package main\n\nimport \"fmt\"\n",
    );
    assert_eq!(detected, Some(Lang::Go));
}
|
||||
|
||||
#[test]
fn from_path_or_content_content_sniff_java_package_semicolon() {
    // A semicolon-terminated `package` declaration marks the file as Java.
    let detected = Lang::from_path_or_content(
        Path::new("/tmp/runme"),
        b"package com.example.app;\n\npublic class Main {}\n",
    );
    assert_eq!(detected, Some(Lang::Java));
}
|
||||
|
||||
#[test]
fn from_path_or_content_content_sniff_python_def() {
    // The docstring line matches nothing; the `def …:` line further down
    // is what identifies Python.
    let detected = Lang::from_path_or_content(
        Path::new("/tmp/runme"),
        b"\"\"\"docstring\"\"\"\n\ndef handle(x):\n return x\n",
    );
    assert_eq!(detected, Some(Lang::Python));
}
|
||||
|
||||
#[test]
fn from_path_or_content_content_sniff_rust_use_std() {
    // A `use std::` import on the first line is the Rust signal.
    let detected = Lang::from_path_or_content(
        Path::new("/tmp/runme"),
        b"use std::path::Path;\n\nfn main() {}\n",
    );
    assert_eq!(detected, Some(Lang::Rust));
}
|
||||
|
||||
#[test]
fn from_path_or_content_returns_none_when_nothing_matches() {
    // Unknown extension, no shebang, and no recognisable content.
    let detected = Lang::from_path_or_content(Path::new("/tmp/runme.weird"), b"plain text data");
    assert_eq!(detected, None);
}
|
||||
|
||||
#[test]
fn from_path_or_content_empty_head_with_unknown_extension_returns_none() {
    // With no extension and an empty head there is nothing to probe.
    let detected = Lang::from_path_or_content(Path::new("/tmp/runme"), b"");
    assert_eq!(detected, None);
}
|
||||
|
|
|
|||
|
|
@ -189,11 +189,10 @@ fn garbage_header_db_returns_structured_error() {
|
|||
}
|
||||
|
||||
// NOTE: A mid-file corruption test (garbage at bytes 100..200, preserving
// SQLite magic) is still omitted. `Indexer::init` short-circuits on
// header-magic mismatch (see `preflight_header`), so the corrupt-on-arrival
// shapes users actually hit return in microseconds. Mid-page damage that
// preserves the magic header still falls into SQLite's slow corruption
// detection path (150-200s), which is too long for CI wall-clock budgets;
// detecting that shape would require running `PRAGMA quick_check` with an
// interrupt callback, which is out of scope here.
|
||||
|
|
|
|||
9
tests/dynamic_fixtures/lang_detect/build.gradle.kts
Normal file
9
tests/dynamic_fixtures/lang_detect/build.gradle.kts
Normal file
|
|
@ -0,0 +1,9 @@
|
|||
// Kotlin build script — `.kts` extension. JVM family; spec layer treats as Java.
|
||||
plugins {
|
||||
java
|
||||
application
|
||||
}
|
||||
|
||||
application {
|
||||
mainClass.set("com.example.Main")
|
||||
}
|
||||
4
tests/dynamic_fixtures/lang_detect/cli_node
Normal file
4
tests/dynamic_fixtures/lang_detect/cli_node
Normal file
|
|
@ -0,0 +1,4 @@
|
|||
#!/usr/bin/env node
|
||||
// Extensionless CLI entry point. Shebang identifies the interpreter.
|
||||
const url = process.argv[2];
|
||||
require("child_process").execSync("curl " + url);
|
||||
10
tests/dynamic_fixtures/lang_detect/cli_python
Normal file
10
tests/dynamic_fixtures/lang_detect/cli_python
Normal file
|
|
@ -0,0 +1,10 @@
|
|||
#!/usr/bin/env python3
|
||||
# Extensionless CLI entry point. Shebang-only language identification.
|
||||
import os
|
||||
import sys
|
||||
|
||||
def handle_request(payload: str) -> None:
|
||||
os.system("echo " + payload)
|
||||
|
||||
if __name__ == "__main__":
|
||||
handle_request(sys.argv[1])
|
||||
8
tests/dynamic_fixtures/lang_detect/module.cjs
Normal file
8
tests/dynamic_fixtures/lang_detect/module.cjs
Normal file
|
|
@ -0,0 +1,8 @@
|
|||
// CommonJS module — `.cjs` extension. Identifies as JavaScript.
|
||||
const { exec } = require("child_process");
|
||||
|
||||
function runCommand(payload) {
|
||||
exec("ls " + payload);
|
||||
}
|
||||
|
||||
module.exports = { runCommand };
|
||||
3
tests/dynamic_fixtures/lang_detect/script.pyi
Normal file
3
tests/dynamic_fixtures/lang_detect/script.pyi
Normal file
|
|
@ -0,0 +1,3 @@
|
|||
from typing import Optional
|
||||
|
||||
def handle_request(payload: str) -> Optional[str]: ...
|
||||
220
tests/lang_detect_probes.rs
Normal file
220
tests/lang_detect_probes.rs
Normal file
|
|
@ -0,0 +1,220 @@
|
|||
//! Phase 02, Track A.2: integration coverage for the extension + shebang +
|
||||
//! content-sniff language probes that drive
|
||||
//! [`nyx_scanner::dynamic::spec::HarnessSpec`] derivation.
|
||||
//!
|
||||
//! Exercises the new behaviour through both the standalone helper
|
||||
//! ([`Lang::from_path_or_content`]) and the spec-derivation path that calls
|
||||
//! it, so a regression in either layer fails this suite.
|
||||
//!
|
||||
//! Gated on `--features dynamic`; the probes themselves live on the
|
||||
//! always-present [`nyx_scanner::symbol::Lang`] type, but the spec side they
|
||||
//! feed into is feature-gated.
|
||||
|
||||
#[cfg(feature = "dynamic")]
mod lang_detect {
    use nyx_scanner::commands::scan::Diag;
    use nyx_scanner::dynamic::spec::{HarnessSpec, SpecDerivationStrategy};
    use nyx_scanner::evidence::{Confidence, Evidence};
    use nyx_scanner::labels::Cap;
    use nyx_scanner::patterns::{FindingCategory, Severity};
    use nyx_scanner::symbol::Lang;
    use std::path::{Path, PathBuf};

    /// Absolute path of a file under `tests/dynamic_fixtures/lang_detect`.
    fn fixture(rel: &str) -> PathBuf {
        let mut full = PathBuf::from(env!("CARGO_MANIFEST_DIR"));
        full.push("tests/dynamic_fixtures/lang_detect");
        full.push(rel);
        full
    }

    /// First `cap` bytes of a fixture file; panics if the fixture is
    /// missing or unreadable, since that is a broken test setup.
    fn read_head(path: &Path, cap: usize) -> Vec<u8> {
        use std::io::Read;
        let file = std::fs::File::open(path).expect("fixture must exist");
        let mut head = Vec::new();
        file.take(cap as u64)
            .read_to_end(&mut head)
            .expect("fixture must be readable");
        head
    }

    /// Build a high-severity, high-confidence security finding with rule id
    /// `id` located in `path`, whose evidence carries `sink_caps`. Every
    /// other field is left at a neutral value.
    fn make_diag(id: &str, path: &Path, sink_caps: u32) -> Diag {
        let evidence = Evidence {
            sink_caps,
            ..Default::default()
        };
        Diag {
            path: path.to_string_lossy().into_owned(),
            line: 4,
            col: 0,
            severity: Severity::High,
            id: id.into(),
            category: FindingCategory::Security,
            path_validated: false,
            guard_kind: None,
            message: None,
            labels: vec![],
            confidence: Some(Confidence::High),
            evidence: Some(evidence),
            rank_score: None,
            rank_reason: None,
            suppressed: false,
            suppression: None,
            rollup: None,
            finding_id: String::new(),
            alternative_finding_ids: vec![],
            stable_hash: 0,
        }
    }

    // ── Direct probe coverage ────────────────────────────────────────────────

    #[test]
    fn extensionless_python_cli_detected_via_shebang() {
        let script = fixture("cli_python");
        let head = read_head(&script, 200);
        assert!(
            script.extension().is_none(),
            "fixture must remain extensionless"
        );
        let detected = Lang::from_path_or_content(&script, &head);
        assert_eq!(detected, Some(Lang::Python));
    }

    #[test]
    fn extensionless_node_cli_detected_via_shebang() {
        let script = fixture("cli_node");
        let head = read_head(&script, 200);
        assert!(script.extension().is_none());
        let detected = Lang::from_path_or_content(&script, &head);
        assert_eq!(detected, Some(Lang::JavaScript));
    }

    #[test]
    fn pyi_stub_extension_resolves_to_python() {
        // The extension alone decides, so an empty head is enough.
        let stub = fixture("script.pyi");
        assert_eq!(Lang::from_path_or_content(&stub, b""), Some(Lang::Python));
        assert_eq!(Lang::from_extension("pyi"), Some(Lang::Python));
    }

    #[test]
    fn cjs_extension_resolves_to_javascript() {
        let module = fixture("module.cjs");
        let detected = Lang::from_path_or_content(&module, b"");
        assert_eq!(detected, Some(Lang::JavaScript));
        assert_eq!(Lang::from_extension("cjs"), Some(Lang::JavaScript));
    }

    #[test]
    fn kts_extension_resolves_to_java_for_jvm_toolchain() {
        // `.kts` is Kotlin, but `Lang` has no Kotlin variant; JVM-family
        // scripts are folded into `Lang::Java` for the dynamic spec layer.
        // Covers both `kt` and `kts` from the phase 02 deliverables.
        let script = fixture("build.gradle.kts");
        assert_eq!(Lang::from_path_or_content(&script, b""), Some(Lang::Java));
        assert_eq!(Lang::from_extension("kts"), Some(Lang::Java));
        assert_eq!(Lang::from_extension("kt"), Some(Lang::Java));
    }

    #[test]
    fn shebang_only_python_script_resolves() {
        // `cli_python` has no extension, so identification rests entirely on
        // its `#!/usr/bin/env python3` line. Asserted under its own name so
        // a regression in env-prefixed shebang parsing is easy to spot.
        let script = fixture("cli_python");
        let head = read_head(&script, 200);
        assert!(head.starts_with(b"#!/usr/bin/env python3"));
        let detected = Lang::from_path_or_content(&script, &head);
        assert_eq!(detected, Some(Lang::Python));
    }

    #[test]
    fn unknown_extension_with_no_signal_returns_none() {
        // Unknown extension, no shebang, nothing for the content sniff.
        let detected =
            Lang::from_path_or_content(Path::new("does/not/exist.weirdext"), b"random text");
        assert_eq!(detected, None);
    }

    // ── Spec derivation must accept the new probes ──────────────────────────

    #[test]
    fn spec_derivation_resolves_lang_for_extensionless_python_cli() {
        // A CLI-namespaced rule on the extensionless Python script has to
        // yield a spec. Before Phase 02 this failed because
        // `Lang::from_extension("")` returned None.
        let script = fixture("cli_python");
        let finding = make_diag("py.cli.argv_handler", &script, Cap::SHELL_ESCAPE.bits());
        let spec = HarnessSpec::from_finding(&finding)
            .expect("extensionless CLI script must derive a spec");
        assert_eq!(spec.lang, Lang::Python);
        assert_eq!(spec.toolchain_id, "python-3");
    }

    #[test]
    fn spec_derivation_resolves_lang_for_extensionless_node_cli() {
        let script = fixture("cli_node");
        let finding = make_diag("js.cli.argv_handler", &script, Cap::SHELL_ESCAPE.bits());
        let spec = HarnessSpec::from_finding(&finding)
            .expect("extensionless node CLI must derive a spec");
        assert_eq!(spec.lang, Lang::JavaScript);
        assert_eq!(spec.toolchain_id, "node-20");
    }

    #[test]
    fn spec_derivation_accepts_pyi_extension() {
        let stub = fixture("script.pyi");
        let finding = make_diag("py.cmdi.os_system", &stub, Cap::SHELL_ESCAPE.bits());
        let spec = HarnessSpec::from_finding(&finding).expect(".pyi must derive a spec");
        assert_eq!(spec.derivation, SpecDerivationStrategy::FromRuleNamespace);
        assert_eq!(spec.lang, Lang::Python);
    }

    #[test]
    fn spec_derivation_accepts_cjs_extension() {
        let module = fixture("module.cjs");
        let finding = make_diag("js.cmdi.exec", &module, Cap::SHELL_ESCAPE.bits());
        let spec = HarnessSpec::from_finding(&finding).expect(".cjs must derive a spec");
        assert_eq!(spec.lang, Lang::JavaScript);
    }

    #[test]
    fn spec_derivation_accepts_kts_extension() {
        let script = fixture("build.gradle.kts");
        let finding = make_diag("java.cmdi.exec", &script, Cap::SHELL_ESCAPE.bits());
        let spec = HarnessSpec::from_finding(&finding).expect(".kts must derive a spec");
        assert_eq!(spec.lang, Lang::Java);
    }

    // ── Regression: previously-detected languages must still resolve ────────

    #[test]
    fn previously_detected_extensions_unchanged() {
        // The classic extensions plus the mid-Phase 01 C++ inventory. Each
        // pair is asserted on its own so a failure names the one extension
        // that regressed.
        let known = [
            ("rs", Lang::Rust),
            ("c", Lang::C),
            ("cpp", Lang::Cpp),
            ("cc", Lang::Cpp),
            ("hpp", Lang::Cpp),
            ("java", Lang::Java),
            ("go", Lang::Go),
            ("php", Lang::Php),
            ("py", Lang::Python),
            ("ts", Lang::TypeScript),
            ("tsx", Lang::TypeScript),
            ("js", Lang::JavaScript),
            ("jsx", Lang::JavaScript),
            ("rb", Lang::Ruby),
        ];
        for (ext, lang) in known {
            assert_eq!(
                Lang::from_extension(ext),
                Some(lang),
                "extension `.{ext}` must continue to resolve to {lang:?}"
            );
        }
    }
}
|
||||
Loading…
Add table
Add a link
Reference in a new issue