fix(db): fast-fail Indexer::init on non-SQLite files via magic-header preflight

This commit is contained in:
elipeter 2026-05-13 17:22:50 -04:00
parent 946cb6a9bc
commit 8abb023dd0
11 changed files with 648 additions and 17 deletions

View file

@ -19,6 +19,7 @@ pub mod index {
use r2d2_sqlite::SqliteConnectionManager;
use rusqlite::{Connection, OpenFlags, OptionalExtension, params};
use std::fs;
use std::io::Read;
use std::ops::Deref;
use std::path::{Path, PathBuf};
use std::str::FromStr;
@ -332,9 +333,62 @@ pub mod index {
project: String,
}
/// SQLite database files start with this 16-byte ASCII magic.
const SQLITE_MAGIC: &[u8; 16] = b"SQLite format 3\0";
/// Reject obviously non-SQLite files before handing them to the
/// connection pool, where the same rejection costs minutes instead of
/// microseconds on some corruption shapes.
///
/// Returns `Ok(())` (i.e. "let SQLite decide") when:
/// * the path cannot be stat'ed or opened because it does not exist
///   (the pool will `CREATE` it),
/// * the path is not a regular file,
/// * the file is zero-length (SQLite treats this as a fresh DB),
/// * the file holds fewer than 16 bytes, whether seen via its metadata
///   or because it shrank before the header could be read,
/// * the first 16 bytes match the SQLite magic header.
///
/// # Errors
///
/// * `NyxError::Sql(...)` carrying `SQLITE_NOTADB` when a full 16-byte
///   header is present but does not match the magic.
/// * The converted `std::io::Error` when the file exists but cannot be
///   opened or read for any other reason (e.g. permissions).
fn preflight_header(database_path: &Path) -> NyxResult<()> {
    // Any metadata failure (typically "not found") is deferred to SQLite.
    let Ok(meta) = fs::metadata(database_path) else {
        return Ok(());
    };
    // Too short to hold the magic (or not a regular file): nothing to check.
    if !meta.is_file() || meta.len() < SQLITE_MAGIC.len() as u64 {
        return Ok(());
    }

    // The file may disappear between the stat above and this open; that is
    // the same "does not exist" case, not an error.
    let mut file = match fs::File::open(database_path) {
        Ok(file) => file,
        Err(err) if err.kind() == std::io::ErrorKind::NotFound => return Ok(()),
        Err(err) => return Err(err.into()),
    };

    // Likewise the file may have been truncated since the stat; a short
    // read means "shorter than the magic", which is deferred to SQLite.
    let mut head = [0u8; 16];
    match file.read_exact(&mut head) {
        Ok(()) => {}
        Err(err) if err.kind() == std::io::ErrorKind::UnexpectedEof => return Ok(()),
        Err(err) => return Err(err.into()),
    }

    if &head != SQLITE_MAGIC {
        return Err(NyxError::Sql(rusqlite::Error::SqliteFailure(
            rusqlite::ffi::Error::new(rusqlite::ffi::SQLITE_NOTADB),
            Some(format!(
                "file at {} is not a SQLite database (header magic mismatch)",
                database_path.display(),
            )),
        )));
    }
    Ok(())
}
impl Indexer {
pub fn init(database_path: &Path) -> NyxResult<Arc<Pool<SqliteConnectionManager>>> {
let _span = tracing::info_span!("db_init", path = %database_path.display()).entered();
// Fast-fail when the existing file is clearly not a SQLite
// database. Without this guard, a file whose 16-byte header has
// been overwritten with arbitrary bytes can keep SQLite busy for
// 150-200 seconds inside the PRAGMA / schema execution below
// before it surfaces SQLITE_NOTADB. The guard is deliberately
// narrow: files shorter than the 16-byte magic (including the
// zero-length file SQLite treats as a fresh DB) and damage that
// preserves the magic (e.g. mid-page corruption) still go
// through SQLite's own, slower detection.
preflight_header(database_path)?;
// NO_MUTEX is safe because r2d2 ensures each pooled connection
// is only ever used by one thread at a time. Combined with WAL
// mode this allows concurrent readers + a single writer without

View file

@ -492,9 +492,41 @@ fn entry_kind_from_summary(_kind: &crate::entry_points::EntryKind) -> EntryKind
// ── Helpers ──────────────────────────────────────────────────────────────────
/// Resolve the language for a finding path using extension first, then a
/// shebang / content sniff against the first 200 bytes of the file.
///
/// Phase 02 widens this resolver beyond `Lang::from_extension` so that
/// extensionless CLI entry points and idiomatic non-canonical extensions
/// (`.cjs`, `.mts`, `.pyi`, …) no longer cause `SpecDerivationFailed`. File
/// I/O is best-effort: an unreadable / absent file falls through to the
/// extension-only path so callers in tests that pass synthetic paths still
/// resolve when the extension is well-known.
fn lang_from_path(path: &str) -> Option<Lang> {
// NOTE(review): the next two lines look like the pre-change body that the
// diff view shows next to its replacement — confirm before applying.
let ext = Path::new(path).extension().and_then(|e| e.to_str()).unwrap_or("");
Lang::from_extension(ext)
let p = Path::new(path);
// Fast path: a recognised extension resolves without touching the filesystem.
if let Some(ext) = p.extension().and_then(|e| e.to_str()) {
if let Some(lang) = Lang::from_extension(ext) {
return Some(lang);
}
}
// Fall back to a shebang / content sniff over the file head.
let head = read_file_head(p, 200);
// An empty head (missing, unreadable, or zero-length file) carries no signal.
if head.is_empty() {
return None;
}
// `from_path_or_content` re-checks the extension first; it cannot match at
// this point, so the result comes from the shebang or the content sniff.
Lang::from_path_or_content(p, &head)
}
/// Read up to `cap` bytes from the start of `path`.
///
/// Best-effort by design: if the file cannot be opened, or a read fails part
/// way through, the result is an empty buffer rather than an error or a
/// partial prefix. The verifier never wants a missing file to abort spec
/// derivation — callers downstream already gate on `Lang` being `Some`.
///
/// Returns at most `cap` bytes; fewer when the file is shorter than `cap`.
fn read_file_head(path: &Path, cap: usize) -> Vec<u8> {
    use std::io::Read;

    // Open first so a missing file costs no allocation.
    let Ok(file) = std::fs::File::open(path) else {
        return Vec::new();
    };
    let mut buf = Vec::with_capacity(cap);
    // `read_to_end` may have appended some bytes before failing; drop them so
    // the "empty on any I/O error" contract holds.
    if file.take(cap as u64).read_to_end(&mut buf).is_err() {
        buf.clear();
    }
    buf
}
/// Return the first non-empty `function` annotation found on any flow step.

View file

@ -12,6 +12,7 @@
use serde::{Deserialize, Serialize};
use std::fmt;
use std::path::Path;
/// Supported source-code languages.
///
@ -59,23 +60,71 @@ impl Lang {
///
/// Mirrors the extension→language mapping in `ast::lang_for_path()` so that
/// callers outside `ast` can obtain a `Lang` from a path without needing a
/// `FuncSummary`.
/// `FuncSummary`. Match is case-insensitive (ASCII).
///
/// Extension coverage is intentionally broader than the tree-sitter loader
/// in `ast::lang_for_path` because this function is consumed by the
/// dynamic verifier, which must classify *every* finding-bearing path so
/// that spec derivation does not collapse on idiomatic file extensions
/// like `.cjs`, `.mts`, `.pyi`, or `.kts`. JVM-family `.kt` / `.kts` map
/// to [`Lang::Java`] because the spec/toolchain layer is JVM-aware even
/// where the tree-sitter grammar is not.
pub fn from_extension(ext: &str) -> Option<Lang> {
// NOTE(review): this span appears to interleave the pre-change arms
// (`match ext`, single-extension arms) with their replacements as rendered
// by the diff view — confirm which lines are live before applying.
match ext {
// Lowercase once so every arm below only needs the lowercase spelling.
let lower = ext.to_ascii_lowercase();
match lower.as_str() {
"rs" => Some(Lang::Rust),
"c" => Some(Lang::C),
"cpp" => Some(Lang::Cpp),
"java" => Some(Lang::Java),
// C++ sources and headers under their common alternative spellings.
"cpp" | "cc" | "cxx" | "c++" | "hpp" | "hxx" | "hh" | "h++" => Some(Lang::Cpp),
// Java family. `.kt` / `.kts` are Kotlin (JVM); the dynamic spec
// layer treats them as Java for toolchain selection purposes.
"java" | "kt" | "kts" => Some(Lang::Java),
"go" => Some(Lang::Go),
"php" => Some(Lang::Php),
"py" => Some(Lang::Python),
"ts" => Some(Lang::TypeScript),
"js" => Some(Lang::JavaScript),
// `.pyi` are Python stub files; spec derivation accepts them so
// typed-stub-only entry points still register a language.
"py" | "pyi" => Some(Lang::Python),
// `.mts` / `.cts` are TypeScript module-form (ES module / CommonJS).
"ts" | "tsx" | "mts" | "cts" => Some(Lang::TypeScript),
// `.mjs` / `.cjs` are JavaScript module-form. `.jsx` is React JSX.
"js" | "jsx" | "mjs" | "cjs" => Some(Lang::JavaScript),
"rb" => Some(Lang::Ruby),
// Anything else is unknown to this mapping.
_ => None,
}
}
/// Resolve a language from a path plus the first bytes of its content.
///
/// `head_bytes` is the leading slice of the file; 200 bytes is enough, and
/// longer slices are accepted. The probes run in order and the first hit
/// wins:
///
/// 1. [`Lang::from_extension`] on the path's extension. A recognised
///    extension wins outright, even when `head_bytes` is empty or carries a
///    conflicting shebang.
/// 2. The shebang line, if any. Interpreter aliases are recognised
///    (`python` / `python3` → [`Lang::Python`], `node` / `nodejs` / `deno` /
///    `bun` → [`Lang::JavaScript`], `ruby` → [`Lang::Ruby`], `php` →
///    [`Lang::Php`]) in both `/usr/bin/env <interp>` and direct
///    `/usr/bin/<interp>` form.
/// 3. A line-prefix content sniff over the head (`<?php`, `package main`,
///    Java `package …;`, `fn main`, …). This stands in for a full
///    tree-sitter parse so extensionless files do not pay for loading every
///    grammar.
///
/// Returns `None` when no probe matches — in particular when the extension
/// is unrecognised and `head_bytes` is empty.
///
/// Used by [`crate::dynamic::spec`] so spec derivation no longer rejects
/// CLI entry points and other extensionless / non-canonical files.
pub fn from_path_or_content(path: &Path, head_bytes: &[u8]) -> Option<Lang> {
    path.extension()
        .and_then(|e| e.to_str())
        .and_then(Self::from_extension)
        .or_else(|| lang_from_shebang(head_bytes))
        .or_else(|| sniff_content_lang(head_bytes))
}
/// Canonical slug string for this language.
pub fn as_str(&self) -> &'static str {
match self {
@ -288,5 +337,113 @@ pub fn namespace_with_package(
}
}
/// Maximum bytes of `head_bytes` consulted by the shebang / content sniff.
/// Larger reads are tolerated — the helpers truncate internally.
const SNIFF_HEAD_LIMIT: usize = 200;
/// Parse a `#!` shebang line and map the interpreter name to a [`Lang`].
///
/// Recognised shapes:
/// * `#!/usr/bin/env <interp>` — `env` options (`-S`, `-i`,
///   `--split-string`, …) and `NAME=value` assignments that precede the
///   interpreter are skipped;
/// * `#!/usr/bin/<interp>` — the final path component is used (this also
///   applies to an absolute interpreter path given to `env`);
/// * `#!<interp>` — a bare interpreter name.
///
/// Only the leading ASCII-alphabetic run of the interpreter name is
/// compared, so version suffixes (`python3`, `python3.11`) resolve to the
/// base interpreter.
///
/// Returns `None` when `head` does not start with `#!`, when the first line
/// (within the first [`SNIFF_HEAD_LIMIT`] bytes) is not valid UTF-8, or when
/// the interpreter is not one Nyx supports (`bash`, `sh`, `perl`, …).
///
/// Limitation: `env` options that take a separate argument (e.g. `-u NAME`)
/// are not understood; the argument is then taken as the interpreter.
fn lang_from_shebang(head: &[u8]) -> Option<Lang> {
    if !head.starts_with(b"#!") {
        return None;
    }
    // Only the first line inside the sniff window is ever decoded, so binary
    // content after the shebang cannot make this fail.
    let window = &head[..head.len().min(SNIFF_HEAD_LIMIT)];
    let first_line = window.split(|&b| b == b'\n').next()?;
    let line = std::str::from_utf8(first_line).ok()?;
    let rest = line.trim().strip_prefix("#!")?.trim();

    let mut tokens = rest.split_whitespace();
    let first = tokens.next()?;
    let command = if first == "env" || first.ends_with("/env") {
        // Skip env's own options and any `NAME=value` assignments; the first
        // remaining token is the command env will run.
        tokens.find(|t| !t.starts_with('-') && !t.contains('='))?
    } else {
        first
    };
    // Reduce `/usr/bin/python3` to `python3`.
    let interpreter = command.rsplit('/').next()?;
    // Reduce `python3.11` to `python`: keep the leading alphabetic run.
    let base = interpreter
        .split(|c: char| !c.is_ascii_alphabetic())
        .next()
        .unwrap_or("");

    match base {
        "python" => Some(Lang::Python),
        "node" | "nodejs" | "deno" | "bun" => Some(Lang::JavaScript),
        "ts" | "tsx" => Some(Lang::TypeScript),
        "ruby" => Some(Lang::Ruby),
        "php" => Some(Lang::Php),
        _ => None,
    }
}
/// Lightweight syntactic sniff over the first [`SNIFF_HEAD_LIMIT`] bytes of a
/// file.
///
/// Skips a leading shebang line (callers already tried it), then inspects up
/// to 20 head lines for line-prefix tokens that identify a language. The
/// first matching line decides; blank lines are ignored.
///
/// The head is usually a fixed-size prefix of a larger file, so the cut may
/// land inside a multi-byte UTF-8 character. That incomplete trailing
/// sequence is dropped and the valid prefix is sniffed. Content that is
/// invalid UTF-8 anywhere else (i.e. binary data) yields `None`.
///
/// Returns `None` if nothing convinces; the verifier's caller will record
/// `LangUnsupported` rather than misclassify.
fn sniff_content_lang(head: &[u8]) -> Option<Lang> {
    if head.is_empty() {
        return None;
    }
    let window = &head[..head.len().min(SNIFF_HEAD_LIMIT)];
    let text = match std::str::from_utf8(window) {
        Ok(text) => text,
        // `error_len() == None` means the input merely ended in the middle
        // of a character: keep everything before it.
        Err(err) if err.error_len().is_none() => {
            std::str::from_utf8(&window[..err.valid_up_to()]).ok()?
        }
        Err(_) => return None,
    };
    // Drop the shebang line when there is one followed by more content.
    let body = match (text.starts_with("#!"), text.find('\n')) {
        (true, Some(i)) => &text[i + 1..],
        _ => text,
    };
    // NOTE(review): several prefixes below (`function `, `const `, `export `)
    // also occur in other languages such as shell or PHP; the order of the
    // checks is what keeps the more specific tokens ahead of them.
    for raw in body.lines().take(20) {
        let line = raw.trim_start();
        if line.is_empty() {
            continue;
        }
        if line.starts_with("<?php") {
            return Some(Lang::Php);
        }
        if line.starts_with("package main") {
            return Some(Lang::Go);
        }
        // Java `package foo.bar;` always ends with a semicolon.
        if line.starts_with("package ") && line.trim_end().ends_with(';') {
            return Some(Lang::Java);
        }
        if line.starts_with("import java.") || line.starts_with("public class ") {
            return Some(Lang::Java);
        }
        if line.starts_with("from __future__")
            || line.starts_with("from typing ")
            || (line.starts_with("def ") && line.contains(':'))
        {
            return Some(Lang::Python);
        }
        if line.starts_with("fn main") || line.starts_with("use std::") {
            return Some(Lang::Rust);
        }
        if line.starts_with("func ") && line.contains('(') {
            return Some(Lang::Go);
        }
        if line.starts_with("require ") || line.starts_with("require_relative ") {
            return Some(Lang::Ruby);
        }
        if line.starts_with("function ")
            || line.starts_with("const ")
            || line.starts_with("import {")
            || line.starts_with("export ")
        {
            return Some(Lang::JavaScript);
        }
    }
    None
}
#[cfg(test)]
mod tests;

View file

@ -203,3 +203,138 @@ fn normalize_fallback_on_mismatch() {
"/other/path/lib.rs"
);
}
// ── Phase 02: extension + shebang + content sniff ──────────────────────────
use std::path::Path;
// Pins the extension → language mapping for the module-form JS/TS
// extensions, Python stubs, Kotlin, and two of the extra C++ spellings.
#[test]
fn from_extension_accepts_phase02_additions() {
// Each of the new extensions must round-trip to the documented language.
assert_eq!(Lang::from_extension("cjs"), Some(Lang::JavaScript));
assert_eq!(Lang::from_extension("mjs"), Some(Lang::JavaScript));
assert_eq!(Lang::from_extension("jsx"), Some(Lang::JavaScript));
assert_eq!(Lang::from_extension("mts"), Some(Lang::TypeScript));
assert_eq!(Lang::from_extension("cts"), Some(Lang::TypeScript));
assert_eq!(Lang::from_extension("tsx"), Some(Lang::TypeScript));
assert_eq!(Lang::from_extension("pyi"), Some(Lang::Python));
assert_eq!(Lang::from_extension("kt"), Some(Lang::Java));
assert_eq!(Lang::from_extension("kts"), Some(Lang::Java));
// C++ inventory extended in Phase 01 / ast.rs: keep the helper aligned.
assert_eq!(Lang::from_extension("cc"), Some(Lang::Cpp));
assert_eq!(Lang::from_extension("hpp"), Some(Lang::Cpp));
}
// Upper-case and mixed-case extensions must resolve like their lowercase form.
#[test]
fn from_extension_is_case_insensitive() {
// Real-world filesystems mix case (especially on Windows / macOS).
assert_eq!(Lang::from_extension("PY"), Some(Lang::Python));
assert_eq!(Lang::from_extension("Java"), Some(Lang::Java));
assert_eq!(Lang::from_extension("JSX"), Some(Lang::JavaScript));
}
// Probe order: a recognised extension is consulted before the shebang.
#[test]
fn from_path_or_content_extension_wins() {
// Even with a misleading shebang the explicit extension must take
// precedence — file-format ground truth beats hand-edited interpreter
// hints.
let head = b"#!/usr/bin/env node\nprint('hi')\n";
let path = Path::new("/tmp/script.py");
assert_eq!(Lang::from_path_or_content(path, head), Some(Lang::Python));
}
// Extensionless path + `env python3` shebang: the version digit is ignored.
#[test]
fn from_path_or_content_shebang_python_env() {
let head = b"#!/usr/bin/env python3\nimport os\n";
let path = Path::new("/tmp/runme");
assert_eq!(Lang::from_path_or_content(path, head), Some(Lang::Python));
}
// Direct interpreter path (no `env`): the last path component is matched.
#[test]
fn from_path_or_content_shebang_node_direct() {
let head = b"#!/usr/local/bin/node\nconsole.log(1)\n";
let path = Path::new("/tmp/runme");
assert_eq!(
Lang::from_path_or_content(path, head),
Some(Lang::JavaScript)
);
}
// Direct `/usr/bin/ruby` shebang on an extensionless path resolves to Ruby.
#[test]
fn from_path_or_content_shebang_ruby_direct() {
let head = b"#!/usr/bin/ruby\nputs 1\n";
let path = Path::new("/tmp/runme");
assert_eq!(Lang::from_path_or_content(path, head), Some(Lang::Ruby));
}
// `env php` shebang resolves to PHP before the content sniff is reached.
#[test]
fn from_path_or_content_shebang_php() {
let head = b"#!/usr/bin/env php\n<?php echo 1;\n";
let path = Path::new("/tmp/runme");
assert_eq!(Lang::from_path_or_content(path, head), Some(Lang::Php));
}
// `env` options starting with `-` must be skipped when locating the interpreter.
#[test]
fn from_path_or_content_shebang_with_env_dash_flag() {
// `env -S` is the portable trick for passing args; the second token after
// env is the real interpreter.
let head = b"#!/usr/bin/env -S python3 -u\nimport sys\n";
let path = Path::new("/tmp/runme");
assert_eq!(Lang::from_path_or_content(path, head), Some(Lang::Python));
}
// An unsupported shebang must not end the probe: the content sniff still runs
// on the lines after it.
#[test]
fn from_path_or_content_shebang_unknown_interpreter_falls_through_to_sniff() {
// bash isn't a supported language — shebang returns None — and the
// body's `<?php` opener should still be picked up by the content sniff.
let head = b"#!/bin/bash\n<?php echo 1; ?>\n";
let path = Path::new("/tmp/runme");
assert_eq!(Lang::from_path_or_content(path, head), Some(Lang::Php));
}
// No extension, no shebang: a leading `<?php` is enough for the content sniff.
#[test]
fn from_path_or_content_content_sniff_php() {
let head = b"<?php echo 'hi'; ?>";
let path = Path::new("/tmp/runme");
assert_eq!(Lang::from_path_or_content(path, head), Some(Lang::Php));
}
// `package main` (no semicolon) is classified as Go, not Java.
#[test]
fn from_path_or_content_content_sniff_go_package_main() {
let head = b"package main\n\nimport \"fmt\"\n";
let path = Path::new("/tmp/runme");
assert_eq!(Lang::from_path_or_content(path, head), Some(Lang::Go));
}
// A `package …;` line ending in a semicolon is classified as Java.
#[test]
fn from_path_or_content_content_sniff_java_package_semicolon() {
let head = b"package com.example.app;\n\npublic class Main {}\n";
let path = Path::new("/tmp/runme");
assert_eq!(Lang::from_path_or_content(path, head), Some(Lang::Java));
}
// The sniff scans past non-matching lines (docstring, blank) to reach `def …:`.
#[test]
fn from_path_or_content_content_sniff_python_def() {
let head = b"\"\"\"docstring\"\"\"\n\ndef handle(x):\n return x\n";
let path = Path::new("/tmp/runme");
assert_eq!(Lang::from_path_or_content(path, head), Some(Lang::Python));
}
// A leading `use std::` line is classified as Rust.
#[test]
fn from_path_or_content_content_sniff_rust_use_std() {
let head = b"use std::path::Path;\n\nfn main() {}\n";
let path = Path::new("/tmp/runme");
assert_eq!(Lang::from_path_or_content(path, head), Some(Lang::Rust));
}
// Unknown extension, no shebang, no recognisable token: every probe misses.
#[test]
fn from_path_or_content_returns_none_when_nothing_matches() {
let path = Path::new("/tmp/runme.weird");
assert_eq!(Lang::from_path_or_content(path, b"plain text data"), None);
}
// With no extension and an empty head there is nothing to probe.
#[test]
fn from_path_or_content_empty_head_with_unknown_extension_returns_none() {
let path = Path::new("/tmp/runme");
assert_eq!(Lang::from_path_or_content(path, b""), None);
}

View file

@ -189,11 +189,10 @@ fn garbage_header_db_returns_structured_error() {
}
// NOTE: A mid-file corruption test (garbage at bytes 100..200, preserving
// SQLite magic) was attempted and is deliberately omitted. That shape
// triggers a slow corruption-detection path in SQLite where `Indexer::init`
// takes 150-200 seconds before returning, unsuitable for CI wall-clock
// budgets. The two tests above already cover the "corrupt-on-arrival"
// cases that users actually hit (crash-truncated file, deliberate clobber).
// A follow-up should either short-circuit `PRAGMA integrity_check` up
// front or wrap the init path in a timeout so mid-page corruption
// also fails fast.
// SQLite magic) is still omitted. `Indexer::init` short-circuits on
// header-magic mismatch (see `preflight_header`), so the corrupt-on-arrival
// shapes users actually hit return in microseconds. Mid-page damage that
// preserves the magic header still falls into SQLite's slow corruption
// detection path (150-200s), which is too long for CI wall-clock budgets;
// detecting that shape would require running `PRAGMA quick_check` with an
// interrupt callback, which is out of scope here.

View file

@ -0,0 +1,9 @@
// Kotlin build script — `.kts` extension. JVM family; spec layer treats as Java.
plugins {
java
application
}
application {
mainClass.set("com.example.Main")
}

View file

@ -0,0 +1,4 @@
#!/usr/bin/env node
// Extensionless CLI entry point. Shebang identifies the interpreter.
const url = process.argv[2];
require("child_process").execSync("curl " + url);

View file

@ -0,0 +1,10 @@
#!/usr/bin/env python3
# Extensionless CLI entry point. Shebang-only language identification.
import os
import sys
def handle_request(payload: str) -> None:
os.system("echo " + payload)
if __name__ == "__main__":
handle_request(sys.argv[1])

View file

@ -0,0 +1,8 @@
// CommonJS module — `.cjs` extension. Identifies as JavaScript.
const { exec } = require("child_process");
function runCommand(payload) {
exec("ls " + payload);
}
module.exports = { runCommand };

View file

@ -0,0 +1,3 @@
from typing import Optional
def handle_request(payload: str) -> Optional[str]: ...

220
tests/lang_detect_probes.rs Normal file
View file

@ -0,0 +1,220 @@
//! Phase 02, Track A.2: integration coverage for the extension + shebang +
//! content-sniff language probes that drive
//! [`nyx_scanner::dynamic::spec::HarnessSpec`] derivation.
//!
//! Exercises the new behaviour through both the standalone helper
//! ([`Lang::from_path_or_content`]) and the spec-derivation path that calls
//! it, so a regression in either layer fails this suite.
//!
//! Gated on `--features dynamic`; the probes themselves live on the
//! always-present [`nyx_scanner::symbol::Lang`] type, but the spec side they
//! feed into is feature-gated.
#[cfg(feature = "dynamic")]
mod lang_detect {
use nyx_scanner::commands::scan::Diag;
use nyx_scanner::dynamic::spec::{HarnessSpec, SpecDerivationStrategy};
use nyx_scanner::evidence::{Confidence, Evidence};
use nyx_scanner::labels::Cap;
use nyx_scanner::patterns::{FindingCategory, Severity};
use nyx_scanner::symbol::Lang;
use std::path::{Path, PathBuf};
/// Absolute path of the fixture `rel`, resolved under
/// `tests/dynamic_fixtures/lang_detect` in the crate's manifest directory.
fn fixture(rel: &str) -> PathBuf {
Path::new(env!("CARGO_MANIFEST_DIR"))
.join("tests/dynamic_fixtures/lang_detect")
.join(rel)
}
/// Read at most `cap` bytes from the start of `path`.
///
/// Panics when the file cannot be opened or read: a missing fixture is a
/// broken test setup, not a condition to tolerate.
fn read_head(path: &Path, cap: usize) -> Vec<u8> {
use std::io::Read;
let mut buf = Vec::new();
let f = std::fs::File::open(path).expect("fixture must exist");
f.take(cap as u64)
.read_to_end(&mut buf)
.expect("fixture must be readable");
buf
}
/// Build a `Diag` for rule `id` at `path` whose evidence carries `sink_caps`.
///
/// Only those three inputs vary; every other field is a fixed placeholder
/// (line 4, column 0, high severity / confidence, not suppressed).
fn make_diag(id: &str, path: &Path, sink_caps: u32) -> Diag {
Diag {
path: path.to_string_lossy().into_owned(),
line: 4,
col: 0,
severity: Severity::High,
id: id.into(),
category: FindingCategory::Security,
path_validated: false,
guard_kind: None,
message: None,
labels: vec![],
confidence: Some(Confidence::High),
evidence: Some(Evidence {
sink_caps,
..Default::default()
}),
rank_score: None,
rank_reason: None,
suppressed: false,
suppression: None,
rollup: None,
finding_id: String::new(),
alternative_finding_ids: vec![],
stable_hash: 0,
}
}
// ── Direct probe coverage ────────────────────────────────────────────────
// Extensionless Python fixture: the language comes from the file head alone.
#[test]
fn extensionless_python_cli_detected_via_shebang() {
let path = fixture("cli_python");
let head = read_head(&path, 200);
assert!(
path.extension().is_none(),
"fixture must remain extensionless"
);
assert_eq!(Lang::from_path_or_content(&path, &head), Some(Lang::Python));
}
// Extensionless Node fixture: the language comes from the file head alone.
#[test]
fn extensionless_node_cli_detected_via_shebang() {
let path = fixture("cli_node");
let head = read_head(&path, 200);
assert!(path.extension().is_none());
assert_eq!(
Lang::from_path_or_content(&path, &head),
Some(Lang::JavaScript)
);
}
// `.pyi` resolves through the extension alone, even with an empty head.
#[test]
fn pyi_stub_extension_resolves_to_python() {
let path = fixture("script.pyi");
// No file head needed; extension wins.
assert_eq!(Lang::from_path_or_content(&path, b""), Some(Lang::Python));
assert_eq!(Lang::from_extension("pyi"), Some(Lang::Python));
}
// `.cjs` resolves through the extension alone, even with an empty head.
#[test]
fn cjs_extension_resolves_to_javascript() {
let path = fixture("module.cjs");
assert_eq!(
Lang::from_path_or_content(&path, b""),
Some(Lang::JavaScript)
);
assert_eq!(Lang::from_extension("cjs"), Some(Lang::JavaScript));
}
#[test]
fn kts_extension_resolves_to_java_for_jvm_toolchain() {
// `.kts` is Kotlin source. The 10-language `Lang` enum has no Kotlin
// variant, so JVM-family scripts fold into `Lang::Java` for the
// dynamic spec layer. This covers the `kt` / `kts` extensions called
// out in the phase 02 deliverables.
let path = fixture("build.gradle.kts");
assert_eq!(Lang::from_path_or_content(&path, b""), Some(Lang::Java));
assert_eq!(Lang::from_extension("kts"), Some(Lang::Java));
assert_eq!(Lang::from_extension("kt"), Some(Lang::Java));
}
#[test]
fn shebang_only_python_script_resolves() {
// `cli_python` is the canonical "shebang-only" entry point: no
// extension, identification depends entirely on `#!/usr/bin/env
// python3`. Re-asserting separately so a regression that breaks
// env-prefixed shebang parsing fails its own test name.
let path = fixture("cli_python");
let head = read_head(&path, 200);
assert!(head.starts_with(b"#!/usr/bin/env python3"));
assert_eq!(Lang::from_path_or_content(&path, &head), Some(Lang::Python));
}
#[test]
fn unknown_extension_with_no_signal_returns_none() {
// Extension unknown, no shebang, no content sniff hits → None.
// The path is never opened: the head is passed in directly.
let path = Path::new("does/not/exist.weirdext");
assert_eq!(Lang::from_path_or_content(path, b"random text"), None);
}
// ── Spec derivation must accept the new probes ──────────────────────────
#[test]
fn spec_derivation_resolves_lang_for_extensionless_python_cli() {
// A CLI-namespaced rule against the extensionless Python script must
// derive a spec (FromCallgraphEntry strategy) — pre-Phase 02 this
// failed because `Lang::from_extension("")` returned None.
let path = fixture("cli_python");
let diag = make_diag("py.cli.argv_handler", &path, Cap::SHELL_ESCAPE.bits());
let spec =
HarnessSpec::from_finding(&diag).expect("extensionless CLI script must derive a spec");
assert_eq!(spec.lang, Lang::Python);
assert_eq!(spec.toolchain_id, "python-3");
}
// Same as above for the extensionless Node fixture and the Node toolchain id.
#[test]
fn spec_derivation_resolves_lang_for_extensionless_node_cli() {
let path = fixture("cli_node");
let diag = make_diag("js.cli.argv_handler", &path, Cap::SHELL_ESCAPE.bits());
let spec =
HarnessSpec::from_finding(&diag).expect("extensionless node CLI must derive a spec");
assert_eq!(spec.lang, Lang::JavaScript);
assert_eq!(spec.toolchain_id, "node-20");
}
// A `.pyi` finding must derive a spec, here via the rule-namespace strategy.
#[test]
fn spec_derivation_accepts_pyi_extension() {
let path = fixture("script.pyi");
let diag = make_diag("py.cmdi.os_system", &path, Cap::SHELL_ESCAPE.bits());
let spec = HarnessSpec::from_finding(&diag).expect(".pyi must derive a spec");
assert_eq!(spec.derivation, SpecDerivationStrategy::FromRuleNamespace);
assert_eq!(spec.lang, Lang::Python);
}
// A `.cjs` finding must derive a JavaScript spec.
#[test]
fn spec_derivation_accepts_cjs_extension() {
let path = fixture("module.cjs");
let diag = make_diag("js.cmdi.exec", &path, Cap::SHELL_ESCAPE.bits());
let spec = HarnessSpec::from_finding(&diag).expect(".cjs must derive a spec");
assert_eq!(spec.lang, Lang::JavaScript);
}
// A `.kts` finding must derive a spec whose language is Java.
#[test]
fn spec_derivation_accepts_kts_extension() {
let path = fixture("build.gradle.kts");
let diag = make_diag("java.cmdi.exec", &path, Cap::SHELL_ESCAPE.bits());
let spec = HarnessSpec::from_finding(&diag).expect(".kts must derive a spec");
assert_eq!(spec.lang, Lang::Java);
}
// ── Regression: previously-detected languages must still resolve ────────
#[test]
fn previously_detected_extensions_unchanged() {
// The classic 10 extensions plus the mid-Phase 01 inventory of
// C++ extensions — one assertion each so a regression fails on a
// single extension, not the whole batch.
for (ext, lang) in [
("rs", Lang::Rust),
("c", Lang::C),
("cpp", Lang::Cpp),
("cc", Lang::Cpp),
("hpp", Lang::Cpp),
("java", Lang::Java),
("go", Lang::Go),
("php", Lang::Php),
("py", Lang::Python),
("ts", Lang::TypeScript),
("tsx", Lang::TypeScript),
("js", Lang::JavaScript),
("jsx", Lang::JavaScript),
("rb", Lang::Ruby),
] {
assert_eq!(
Lang::from_extension(ext),
Some(lang),
"extension `.{ext}` must continue to resolve to {lang:?}"
);
}
}
}