From 8abb023dd0f67ea66da7ab0f6fd09096cfa7a329 Mon Sep 17 00:00:00 2001 From: elipeter Date: Wed, 13 May 2026 17:22:50 -0400 Subject: [PATCH] fix(db): fast-fail Indexer::init on non-SQLite files via magic-header preflight --- src/database.rs | 54 +++++ src/dynamic/spec.rs | 36 ++- src/symbol/mod.rs | 171 +++++++++++++- src/symbol/tests.rs | 135 +++++++++++ tests/db_corruption_tests.rs | 15 +- .../lang_detect/build.gradle.kts | 9 + tests/dynamic_fixtures/lang_detect/cli_node | 4 + tests/dynamic_fixtures/lang_detect/cli_python | 10 + tests/dynamic_fixtures/lang_detect/module.cjs | 8 + tests/dynamic_fixtures/lang_detect/script.pyi | 3 + tests/lang_detect_probes.rs | 220 ++++++++++++++++++ 11 files changed, 648 insertions(+), 17 deletions(-) create mode 100644 tests/dynamic_fixtures/lang_detect/build.gradle.kts create mode 100644 tests/dynamic_fixtures/lang_detect/cli_node create mode 100644 tests/dynamic_fixtures/lang_detect/cli_python create mode 100644 tests/dynamic_fixtures/lang_detect/module.cjs create mode 100644 tests/dynamic_fixtures/lang_detect/script.pyi create mode 100644 tests/lang_detect_probes.rs diff --git a/src/database.rs b/src/database.rs index d7284479..176ac788 100644 --- a/src/database.rs +++ b/src/database.rs @@ -19,6 +19,7 @@ pub mod index { use r2d2_sqlite::SqliteConnectionManager; use rusqlite::{Connection, OpenFlags, OptionalExtension, params}; use std::fs; + use std::io::Read; use std::ops::Deref; use std::path::{Path, PathBuf}; use std::str::FromStr; @@ -332,9 +333,62 @@ pub mod index { project: String, } + /// SQLite database files start with this 16-byte ASCII magic. + const SQLITE_MAGIC: &[u8; 16] = b"SQLite format 3\0"; + + /// Reject obviously non-SQLite files before handing them to the + /// connection pool, where the same rejection costs minutes instead of + /// microseconds on some corruption shapes. + /// + /// Returns `Ok(())` when: + /// * the file does not exist (the pool will `CREATE` it), + /// * the file is zero-length (SQLite treats this as a fresh DB), + /// * the first 16 bytes match the SQLite magic header, + /// * the file is shorter than the magic but non-empty (extremely + /// unusual; we defer to SQLite rather than gating arbitrarily). + /// + /// Returns `Err(NyxError::Sql(...))` carrying `SQLITE_NOTADB` when the + /// header is present but does not match. + fn preflight_header(database_path: &Path) -> NyxResult<()> { + let Ok(meta) = fs::metadata(database_path) else { + return Ok(()); + }; + if !meta.is_file() { + return Ok(()); + } + if meta.len() < SQLITE_MAGIC.len() as u64 { + return Ok(()); + } + let mut head = [0u8; 16]; + let mut f = fs::File::open(database_path)?; + f.read_exact(&mut head)?; + if &head != SQLITE_MAGIC { + return Err(NyxError::Sql(rusqlite::Error::SqliteFailure( + rusqlite::ffi::Error::new(rusqlite::ffi::SQLITE_NOTADB), + Some(format!( + "file at {} is not a SQLite database (header magic mismatch)", + database_path.display(), + )), + ))); + } + Ok(()) + } + impl Indexer { pub fn init(database_path: &Path) -> NyxResult>> { let _span = tracing::info_span!("db_init", path = %database_path.display()).entered(); + + // Fast-fail when the existing file is clearly not a SQLite + // database. Without this guard, certain corruption shapes + // (truncated header, header overwritten with arbitrary bytes, + // mid-page damage that preserves magic) can keep SQLite busy + // for 150-200 seconds inside the PRAGMA / schema execution + // below before it surfaces SQLITE_NOTADB or SQLITE_CORRUPT. + // A zero-length file is treated as a fresh DB by SQLite, so we + // only validate when the file is large enough to hold the + // 16-byte magic header. + preflight_header(database_path)?; + // NO_MUTEX is safe because r2d2 ensures each pooled connection // is only ever used by one thread at a time. Combined with WAL // mode this allows concurrent readers + a single writer without diff --git a/src/dynamic/spec.rs b/src/dynamic/spec.rs index a71329e7..de273951 100644 --- a/src/dynamic/spec.rs +++ b/src/dynamic/spec.rs @@ -492,9 +492,41 @@ fn entry_kind_from_summary(_kind: &crate::entry_points::EntryKind) -> EntryKind // ── Helpers ────────────────────────────────────────────────────────────────── +/// Resolve the language for a finding path using extension first, then a +/// shebang / content sniff against the first 200 bytes of the file. +/// +/// Phase 02 widens this resolver beyond `Lang::from_extension` so that +/// extensionless CLI entry points and idiomatic non-canonical extensions +/// (`.cjs`, `.mts`, `.pyi`, …) no longer cause `SpecDerivationFailed`. File +/// I/O is best-effort: an unreadable / absent file falls through to the +/// extension-only path so callers in tests that pass synthetic paths still +/// resolve when the extension is well-known. fn lang_from_path(path: &str) -> Option { - let ext = Path::new(path).extension().and_then(|e| e.to_str()).unwrap_or(""); - Lang::from_extension(ext) + let p = Path::new(path); + if let Some(ext) = p.extension().and_then(|e| e.to_str()) { + if let Some(lang) = Lang::from_extension(ext) { + return Some(lang); + } + } + // Fall back to a shebang / content sniff over the file head. + let head = read_file_head(p, 200); + if head.is_empty() { + return None; + } + Lang::from_path_or_content(p, &head) +} + +/// Read up to `cap` bytes from `path`, returning an empty buffer on any I/O +/// error. The verifier never wants a missing file to abort spec derivation — +/// callers downstream already gate on `Lang` being `Some`. +fn read_file_head(path: &Path, cap: usize) -> Vec { + use std::io::Read; + let mut buf = Vec::with_capacity(cap); + let Ok(f) = std::fs::File::open(path) else { + return buf; + }; + let _ = f.take(cap as u64).read_to_end(&mut buf); + buf } /// Return the first non-empty `function` annotation found on any flow step. diff --git a/src/symbol/mod.rs b/src/symbol/mod.rs index 94cb8054..eed5ae40 100644 --- a/src/symbol/mod.rs +++ b/src/symbol/mod.rs @@ -12,6 +12,7 @@ use serde::{Deserialize, Serialize}; use std::fmt; +use std::path::Path; /// Supported source-code languages. /// @@ -59,23 +60,71 @@ impl Lang { /// /// Mirrors the extension→language mapping in `ast::lang_for_path()` so that /// callers outside `ast` can obtain a `Lang` from a path without needing a - /// `FuncSummary`. + /// `FuncSummary`. Match is case-insensitive (ASCII). + /// + /// Extension coverage is intentionally broader than the tree-sitter loader + /// in `ast::lang_for_path` because this function is consumed by the + /// dynamic verifier, which must classify *every* finding-bearing path so + /// that spec derivation does not collapse on idiomatic file extensions + /// like `.cjs`, `.mts`, `.pyi`, or `.kts`. JVM-family `.kt` / `.kts` map + /// to [`Lang::Java`] because the spec/toolchain layer is JVM-aware even + /// where the tree-sitter grammar is not. pub fn from_extension(ext: &str) -> Option { - match ext { + let lower = ext.to_ascii_lowercase(); + match lower.as_str() { "rs" => Some(Lang::Rust), "c" => Some(Lang::C), - "cpp" => Some(Lang::Cpp), - "java" => Some(Lang::Java), + "cpp" | "cc" | "cxx" | "c++" | "hpp" | "hxx" | "hh" | "h++" => Some(Lang::Cpp), + // Java family. `.kt` / `.kts` are Kotlin (JVM); the dynamic spec + // layer treats them as Java for toolchain selection purposes. + "java" | "kt" | "kts" => Some(Lang::Java), "go" => Some(Lang::Go), "php" => Some(Lang::Php), - "py" => Some(Lang::Python), - "ts" => Some(Lang::TypeScript), - "js" => Some(Lang::JavaScript), + // `.pyi` are Python stub files; spec derivation accepts them so + // typed-stub-only entry points still register a language. + "py" | "pyi" => Some(Lang::Python), + // `.mts` / `.cts` are TypeScript module-form (ES module / CommonJS). + "ts" | "tsx" | "mts" | "cts" => Some(Lang::TypeScript), + // `.mjs` / `.cjs` are JavaScript module-form. `.jsx` is React JSX. + "js" | "jsx" | "mjs" | "cjs" => Some(Lang::JavaScript), "rb" => Some(Lang::Ruby), _ => None, } } + /// Probe a path's language using extension first, then a shebang line on + /// `head_bytes`, then a content-byte heuristic on the first 200 bytes. + /// + /// `head_bytes` should be the first N bytes of the file (200 is plenty; + /// callers may pass more). Empty / unreadable files return `None`. + /// + /// Order: + /// 1. [`Lang::from_extension`] on the path's extension — fast path. + /// 2. Shebang inspection. Common interpreter aliases are recognised: + /// `python` / `python3` → [`Lang::Python`], `node` / `nodejs` / `deno` + /// / `bun` → [`Lang::JavaScript`], `ruby` → [`Lang::Ruby`], `php` → + /// [`Lang::Php`]. `/usr/bin/env ` and direct + /// `/usr/bin/` paths both work. + /// 3. Content-byte syntactic sniff: line-prefix matches on the first 200 + /// bytes (` Option { + if let Some(ext) = path.extension().and_then(|e| e.to_str()) { + if let Some(lang) = Self::from_extension(ext) { + return Some(lang); + } + } + if let Some(lang) = lang_from_shebang(head_bytes) { + return Some(lang); + } + sniff_content_lang(head_bytes) + } + /// Canonical slug string for this language. pub fn as_str(&self) -> &'static str { match self { @@ -288,5 +337,113 @@ pub fn namespace_with_package( } } +/// Maximum bytes of `head_bytes` consulted by the shebang / content sniff. +/// Larger reads are tolerated — the helpers truncate internally. +const SNIFF_HEAD_LIMIT: usize = 200; + +/// Parse a `#!` shebang line and map the interpreter name to a `Lang`. +/// +/// Handles `/usr/bin/env ` (with optional `-S` / `-i` flags), +/// direct `/usr/bin/`, and bare `` forms. Trailing version +/// digits (`python3`, `python3.11`) are stripped so the lookup matches the +/// base interpreter. Returns `None` for non-Nyx-supported interpreters +/// (`bash`, `sh`, `perl`, …). +fn lang_from_shebang(head: &[u8]) -> Option { + if !head.starts_with(b"#!") { + return None; + } + let cap = head.len().min(SNIFF_HEAD_LIMIT); + let line_end = head[..cap] + .iter() + .position(|&b| b == b'\n') + .unwrap_or(cap); + let line = std::str::from_utf8(&head[..line_end]).ok()?; + let line = line.trim_end_matches('\r').trim(); + let rest = line.strip_prefix("#!")?.trim(); + + let mut tokens = rest.split_whitespace(); + let first = tokens.next()?; + let interpreter = if first.ends_with("/env") || first == "env" { + // Skip env's own options (e.g. `-S`, `-i`, `--split-string`). + tokens.find(|t| !t.starts_with('-'))? + } else { + first.rsplit('/').next()? + }; + + let base: String = interpreter + .chars() + .take_while(|c| c.is_ascii_alphabetic()) + .collect(); + match base.as_str() { + "python" => Some(Lang::Python), + "node" | "nodejs" | "deno" | "bun" => Some(Lang::JavaScript), + "ts" | "tsx" => Some(Lang::TypeScript), + "ruby" => Some(Lang::Ruby), + "php" => Some(Lang::Php), + _ => None, + } +} + +/// Lightweight syntactic sniff over the first 200 bytes of a file. +/// +/// Skips a leading shebang line (callers already tried it), then inspects up +/// to ~20 head lines for unambiguous language tokens. Returns `None` if +/// nothing convinces; the verifier's caller will record `LangUnsupported` +/// rather than misclassify. +fn sniff_content_lang(head: &[u8]) -> Option { + if head.is_empty() { + return None; + } + let cap = head.len().min(SNIFF_HEAD_LIMIT); + let text = std::str::from_utf8(&head[..cap]).ok()?; + let body = match (text.starts_with("#!"), text.find('\n')) { + (true, Some(i)) => &text[i + 1..], + _ => text, + }; + + for raw in body.lines().take(20) { + let line = raw.trim_start(); + if line.is_empty() { + continue; + } + if line.starts_with("\n"; + let path = Path::new("/tmp/runme"); + assert_eq!(Lang::from_path_or_content(path, head), Some(Lang::Php)); +} + +#[test] +fn from_path_or_content_content_sniff_php() { + let head = b""; + let path = Path::new("/tmp/runme"); + assert_eq!(Lang::from_path_or_content(path, head), Some(Lang::Php)); +} + +#[test] +fn from_path_or_content_content_sniff_go_package_main() { + let head = b"package main\n\nimport \"fmt\"\n"; + let path = Path::new("/tmp/runme"); + assert_eq!(Lang::from_path_or_content(path, head), Some(Lang::Go)); +} + +#[test] +fn from_path_or_content_content_sniff_java_package_semicolon() { + let head = b"package com.example.app;\n\npublic class Main {}\n"; + let path = Path::new("/tmp/runme"); + assert_eq!(Lang::from_path_or_content(path, head), Some(Lang::Java)); +} + +#[test] +fn from_path_or_content_content_sniff_python_def() { + let head = b"\"\"\"docstring\"\"\"\n\ndef handle(x):\n return x\n"; + let path = Path::new("/tmp/runme"); + assert_eq!(Lang::from_path_or_content(path, head), Some(Lang::Python)); +} + +#[test] +fn from_path_or_content_content_sniff_rust_use_std() { + let head = b"use std::path::Path;\n\nfn main() {}\n"; + let path = Path::new("/tmp/runme"); + assert_eq!(Lang::from_path_or_content(path, head), Some(Lang::Rust)); +} + +#[test] +fn from_path_or_content_returns_none_when_nothing_matches() { + let path = Path::new("/tmp/runme.weird"); + assert_eq!(Lang::from_path_or_content(path, b"plain text data"), None); +} + +#[test] +fn from_path_or_content_empty_head_with_unknown_extension_returns_none() { + let path = Path::new("/tmp/runme"); + assert_eq!(Lang::from_path_or_content(path, b""), None); +} diff --git a/tests/db_corruption_tests.rs b/tests/db_corruption_tests.rs index d9bc0e2b..00315f6c 100644 --- a/tests/db_corruption_tests.rs +++ b/tests/db_corruption_tests.rs @@ -189,11 +189,10 @@ fn garbage_header_db_returns_structured_error() { } // NOTE: A mid-file corruption test (garbage at bytes 100..200, preserving -// SQLite magic) was attempted and is deliberately omitted. That shape -// triggers a slow corruption-detection path in SQLite where `Indexer::init` -// takes 150–200 seconds before returning, unsuitable for CI wall-clock -// budgets. The two tests above already cover the "corrupt-on-arrival" -// cases that users actually hit (crash-truncated file, deliberate clobber). -// A follow-up should either short-circuit `PRAGMA integrity_check` up -// front or wrap the init path in a timeout so mid-page corruption -// also fails fast. +// SQLite magic) is still omitted. `Indexer::init` short-circuits on +// header-magic mismatch (see `preflight_header`), so the corrupt-on-arrival +// shapes users actually hit return in microseconds. Mid-page damage that +// preserves the magic header still falls into SQLite's slow corruption +// detection path (150-200s), which is too long for CI wall-clock budgets; +// detecting that shape would require running `PRAGMA quick_check` with an +// interrupt callback, which is out of scope here. diff --git a/tests/dynamic_fixtures/lang_detect/build.gradle.kts b/tests/dynamic_fixtures/lang_detect/build.gradle.kts new file mode 100644 index 00000000..236d1566 --- /dev/null +++ b/tests/dynamic_fixtures/lang_detect/build.gradle.kts @@ -0,0 +1,9 @@ +// Kotlin build script — `.kts` extension. JVM family; spec layer treats as Java. +plugins { + java + application +} + +application { + mainClass.set("com.example.Main") +} diff --git a/tests/dynamic_fixtures/lang_detect/cli_node b/tests/dynamic_fixtures/lang_detect/cli_node new file mode 100644 index 00000000..45c8e309 --- /dev/null +++ b/tests/dynamic_fixtures/lang_detect/cli_node @@ -0,0 +1,4 @@ +#!/usr/bin/env node +// Extensionless CLI entry point. Shebang identifies the interpreter. +const url = process.argv[2]; +require("child_process").execSync("curl " + url); diff --git a/tests/dynamic_fixtures/lang_detect/cli_python b/tests/dynamic_fixtures/lang_detect/cli_python new file mode 100644 index 00000000..5c5744d7 --- /dev/null +++ b/tests/dynamic_fixtures/lang_detect/cli_python @@ -0,0 +1,10 @@ +#!/usr/bin/env python3 +# Extensionless CLI entry point. Shebang-only language identification. +import os +import sys + +def handle_request(payload: str) -> None: + os.system("echo " + payload) + +if __name__ == "__main__": + handle_request(sys.argv[1]) diff --git a/tests/dynamic_fixtures/lang_detect/module.cjs b/tests/dynamic_fixtures/lang_detect/module.cjs new file mode 100644 index 00000000..577684ed --- /dev/null +++ b/tests/dynamic_fixtures/lang_detect/module.cjs @@ -0,0 +1,8 @@ +// CommonJS module — `.cjs` extension. Identifies as JavaScript. +const { exec } = require("child_process"); + +function runCommand(payload) { + exec("ls " + payload); +} + +module.exports = { runCommand }; diff --git a/tests/dynamic_fixtures/lang_detect/script.pyi b/tests/dynamic_fixtures/lang_detect/script.pyi new file mode 100644 index 00000000..ea5b93f5 --- /dev/null +++ b/tests/dynamic_fixtures/lang_detect/script.pyi @@ -0,0 +1,3 @@ +from typing import Optional + +def handle_request(payload: str) -> Optional[str]: ... diff --git a/tests/lang_detect_probes.rs b/tests/lang_detect_probes.rs new file mode 100644 index 00000000..133feafa --- /dev/null +++ b/tests/lang_detect_probes.rs @@ -0,0 +1,220 @@ +//! Phase 02, Track A.2: integration coverage for the extension + shebang + +//! content-sniff language probes that drive +//! [`nyx_scanner::dynamic::spec::HarnessSpec`] derivation. +//! +//! Exercises the new behaviour through both the standalone helper +//! ([`Lang::from_path_or_content`]) and the spec-derivation path that calls +//! it, so a regression in either layer fails this suite. +//! +//! Gated on `--features dynamic`; the probes themselves live on the +//! always-present [`nyx_scanner::symbol::Lang`] type, but the spec side they +//! feed into is feature-gated. + +#[cfg(feature = "dynamic")] +mod lang_detect { + use nyx_scanner::commands::scan::Diag; + use nyx_scanner::dynamic::spec::{HarnessSpec, SpecDerivationStrategy}; + use nyx_scanner::evidence::{Confidence, Evidence}; + use nyx_scanner::labels::Cap; + use nyx_scanner::patterns::{FindingCategory, Severity}; + use nyx_scanner::symbol::Lang; + use std::path::{Path, PathBuf}; + + fn fixture(rel: &str) -> PathBuf { + Path::new(env!("CARGO_MANIFEST_DIR")) + .join("tests/dynamic_fixtures/lang_detect") + .join(rel) + } + + fn read_head(path: &Path, cap: usize) -> Vec { + use std::io::Read; + let mut buf = Vec::new(); + let f = std::fs::File::open(path).expect("fixture must exist"); + f.take(cap as u64) + .read_to_end(&mut buf) + .expect("fixture must be readable"); + buf + } + + fn make_diag(id: &str, path: &Path, sink_caps: u32) -> Diag { + Diag { + path: path.to_string_lossy().into_owned(), + line: 4, + col: 0, + severity: Severity::High, + id: id.into(), + category: FindingCategory::Security, + path_validated: false, + guard_kind: None, + message: None, + labels: vec![], + confidence: Some(Confidence::High), + evidence: Some(Evidence { + sink_caps, + ..Default::default() + }), + rank_score: None, + rank_reason: None, + suppressed: false, + suppression: None, + rollup: None, + finding_id: String::new(), + alternative_finding_ids: vec![], + stable_hash: 0, + } + } + + // ── Direct probe coverage ──────────────────────────────────────────────── + + #[test] + fn extensionless_python_cli_detected_via_shebang() { + let path = fixture("cli_python"); + let head = read_head(&path, 200); + assert!( + path.extension().is_none(), + "fixture must remain extensionless" + ); + assert_eq!(Lang::from_path_or_content(&path, &head), Some(Lang::Python)); + } + + #[test] + fn extensionless_node_cli_detected_via_shebang() { + let path = fixture("cli_node"); + let head = read_head(&path, 200); + assert!(path.extension().is_none()); + assert_eq!( + Lang::from_path_or_content(&path, &head), + Some(Lang::JavaScript) + ); + } + + #[test] + fn pyi_stub_extension_resolves_to_python() { + let path = fixture("script.pyi"); + // No file head needed; extension wins. + assert_eq!(Lang::from_path_or_content(&path, b""), Some(Lang::Python)); + assert_eq!(Lang::from_extension("pyi"), Some(Lang::Python)); + } + + #[test] + fn cjs_extension_resolves_to_javascript() { + let path = fixture("module.cjs"); + assert_eq!( + Lang::from_path_or_content(&path, b""), + Some(Lang::JavaScript) + ); + assert_eq!(Lang::from_extension("cjs"), Some(Lang::JavaScript)); + } + + #[test] + fn kts_extension_resolves_to_java_for_jvm_toolchain() { + // `.kts` is Kotlin source. The 10-language `Lang` enum has no Kotlin + // variant, so JVM-family scripts fold into `Lang::Java` for the + // dynamic spec layer. This covers the `kt` / `kts` extensions called + // out in the phase 02 deliverables. + let path = fixture("build.gradle.kts"); + assert_eq!(Lang::from_path_or_content(&path, b""), Some(Lang::Java)); + assert_eq!(Lang::from_extension("kts"), Some(Lang::Java)); + assert_eq!(Lang::from_extension("kt"), Some(Lang::Java)); + } + + #[test] + fn shebang_only_python_script_resolves() { + // `cli_python` is the canonical "shebang-only" entry point: no + // extension, identification depends entirely on `#!/usr/bin/env + // python3`. Re-asserting separately so a regression that breaks + // env-prefixed shebang parsing fails its own test name. + let path = fixture("cli_python"); + let head = read_head(&path, 200); + assert!(head.starts_with(b"#!/usr/bin/env python3")); + assert_eq!(Lang::from_path_or_content(&path, &head), Some(Lang::Python)); + } + + #[test] + fn unknown_extension_with_no_signal_returns_none() { + // Extension unknown, no shebang, no content sniff hits → None. + let path = Path::new("does/not/exist.weirdext"); + assert_eq!(Lang::from_path_or_content(path, b"random text"), None); + } + + // ── Spec derivation must accept the new probes ────────────────────────── + + #[test] + fn spec_derivation_resolves_lang_for_extensionless_python_cli() { + // A CLI-namespaced rule against the extensionless Python script must + // derive a spec (FromCallgraphEntry strategy) — pre-Phase 02 this + // failed because `Lang::from_extension("")` returned None. + let path = fixture("cli_python"); + let diag = make_diag("py.cli.argv_handler", &path, Cap::SHELL_ESCAPE.bits()); + let spec = + HarnessSpec::from_finding(&diag).expect("extensionless CLI script must derive a spec"); + assert_eq!(spec.lang, Lang::Python); + assert_eq!(spec.toolchain_id, "python-3"); + } + + #[test] + fn spec_derivation_resolves_lang_for_extensionless_node_cli() { + let path = fixture("cli_node"); + let diag = make_diag("js.cli.argv_handler", &path, Cap::SHELL_ESCAPE.bits()); + let spec = + HarnessSpec::from_finding(&diag).expect("extensionless node CLI must derive a spec"); + assert_eq!(spec.lang, Lang::JavaScript); + assert_eq!(spec.toolchain_id, "node-20"); + } + + #[test] + fn spec_derivation_accepts_pyi_extension() { + let path = fixture("script.pyi"); + let diag = make_diag("py.cmdi.os_system", &path, Cap::SHELL_ESCAPE.bits()); + let spec = HarnessSpec::from_finding(&diag).expect(".pyi must derive a spec"); + assert_eq!(spec.derivation, SpecDerivationStrategy::FromRuleNamespace); + assert_eq!(spec.lang, Lang::Python); + } + + #[test] + fn spec_derivation_accepts_cjs_extension() { + let path = fixture("module.cjs"); + let diag = make_diag("js.cmdi.exec", &path, Cap::SHELL_ESCAPE.bits()); + let spec = HarnessSpec::from_finding(&diag).expect(".cjs must derive a spec"); + assert_eq!(spec.lang, Lang::JavaScript); + } + + #[test] + fn spec_derivation_accepts_kts_extension() { + let path = fixture("build.gradle.kts"); + let diag = make_diag("java.cmdi.exec", &path, Cap::SHELL_ESCAPE.bits()); + let spec = HarnessSpec::from_finding(&diag).expect(".kts must derive a spec"); + assert_eq!(spec.lang, Lang::Java); + } + + // ── Regression: previously-detected languages must still resolve ──────── + + #[test] + fn previously_detected_extensions_unchanged() { + // The classic 10 extensions plus the mid-Phase 01 inventory of + // C++ extensions — one assertion each so a regression fails on a + // single extension, not the whole batch. + for (ext, lang) in [ + ("rs", Lang::Rust), + ("c", Lang::C), + ("cpp", Lang::Cpp), + ("cc", Lang::Cpp), + ("hpp", Lang::Cpp), + ("java", Lang::Java), + ("go", Lang::Go), + ("php", Lang::Php), + ("py", Lang::Python), + ("ts", Lang::TypeScript), + ("tsx", Lang::TypeScript), + ("js", Lang::JavaScript), + ("jsx", Lang::JavaScript), + ("rb", Lang::Ruby), + ] { + assert_eq!( + Lang::from_extension(ext), + Some(lang), + "extension `.{ext}` must continue to resolve to {lang:?}" + ); + } + } +}