Python FP and docs updates (#58)

* refactor: Update comments for clarity and add expectations.json files for performance metrics
* feat: Implement FP guard for JS/TS local-collection receivers to suppress missing ownership checks
* feat: Enhance Rust parameter handling to classify local collections and prevent false ownership checks
* refactor: Simplify code formatting for better readability in multiple files
* refactor: Improve UTF-8 sequence length handling and enhance clarity in loop iteration
* feat: Update Java and Python patterns to include new security rules
* refactor: Improve comment clarity and consistency across multiple Rust files
* refactor: Simplify code formatting for improved readability in integration tests and module files
* refactor: Improve comment formatting and enhance clarity in assertions across multiple files
2026-06-12 19:55:14 +02:00 · 2026-04-29 19:53:34 -04:00 · 2026-04-29 19:53:34 -04:00 · a438886217
commit a438886217
parent 4db0805de6
291 changed files with 9485 additions and 3851 deletions
--- a/src/utils/analysis_options.rs
+++ b/src/utils/analysis_options.rs
@ -27,7 +27,7 @@ pub const DEFAULT_PARSE_TIMEOUT_MS: u64 = 10_000;
 /// value.  Raised from the historical `4` to `32` so realistic codebases
 /// with wide joins (many param sources, deep helper chains) no longer
 /// silently drop origin attribution.  Tunable via
-/// [`AnalysisOptions::max_origins`] — see
+/// [`AnalysisOptions::max_origins`]; see
 /// `src/taint/ssa_transfer/state.rs::effective_max_origins`.
 pub const DEFAULT_MAX_ORIGINS: u32 = 32;

@ -38,11 +38,11 @@ pub const DEFAULT_MAX_ORIGINS: u32 = 32;
 pub const MIN_MAX_ORIGINS: u32 = 1;

 /// Default upper bound on the number of abstract heap objects tracked per
-/// intra-procedural points-to set.  Set to `32` — high enough that
+/// intra-procedural points-to set.  Set to `32`, high enough that
 /// realistic factory/builder/DI patterns (routine 10–30 allocation sites
 /// aliased into one variable) stay precise, low enough to keep
 /// `HeapState` join/clone cost bounded in the worklist.  Tunable via
-/// [`AnalysisOptions::max_pointsto`] — see
+/// [`AnalysisOptions::max_pointsto`]; see
 /// `src/ssa/heap.rs::effective_max_pointsto`.
 pub const DEFAULT_MAX_POINTSTO: u32 = 32;

@ -152,7 +152,7 @@ impl Default for AnalysisOptions {
 /// (notably `nyx serve`, which resolves the engine profile per scan
 /// request) can replace the installed options between scans via
 /// [`reinstall`].  Within a single scan run, engine toggles must not
-/// change mid-flight — the caller is responsible for that invariant
+/// change mid-flight; the caller is responsible for that invariant
 /// (`JobManager`'s single-scan guarantee provides it in the server).
 static RUNTIME: RwLock<Option<AnalysisOptions>> = RwLock::new(None);

@ -174,7 +174,7 @@ pub fn install(opts: AnalysisOptions) -> bool {
 /// server's scan thread, which re-resolves the engine profile from each
 /// incoming request; `install`'s first-wins semantics would otherwise
 /// pin the first scan's choice for the lifetime of the server.  Callers
-/// must ensure no scan is concurrently reading `current()` — in practice
+/// must ensure no scan is concurrently reading `current()`; in practice
 /// this means calling `reinstall` before the scan's rayon pool starts.
 pub fn reinstall(opts: AnalysisOptions) {
    *RUNTIME.write().expect("analysis options RwLock poisoned") = Some(opts);
--- a/src/utils/config.rs
+++ b/src/utils/config.rs
@ -315,8 +315,8 @@ pub struct OutputConfig {
    /// When `true`, findings whose engine provenance notes include any
    /// `OverReport` (widening) or `Bail` (lowering/parse failure)
    /// direction are filtered out before output.  `UnderReport`
-    /// findings — where the result set is a lower bound but each
-    /// emitted flow is still real — are kept.
+    /// findings, where the result set is a lower bound but each
+    /// emitted flow is still real, are kept.
    ///
    /// Surfaced via `--require-converged`; intended for strict CI
    /// gating where a finding from capped analysis is worse than no
@ -644,7 +644,7 @@ impl Default for RunsConfig {
    }
 }

-/// A named scan profile — a partial overlay of scan-related settings.
+/// A named scan profile, a partial overlay of scan-related settings.
 /// All fields are `Option<T>`: `None` means "don't override".
 #[derive(Debug, Serialize, Deserialize, Clone, Default)]
 #[serde(default)]
@ -715,7 +715,7 @@ pub struct Config {
    pub server: ServerConfig,
    pub runs: RunsConfig,
    pub profiles: HashMap<String, ScanProfile>,
-    /// Detected frameworks for the current project — set by the scan pipeline,
+    /// Detected frameworks for the current project, set by the scan pipeline,
    /// not persisted to config files.
    #[serde(skip)]
    pub framework_ctx: Option<crate::utils::project::FrameworkContext>,
--- a/src/utils/ext.rs
+++ b/src/utils/ext.rs
@ -5,7 +5,7 @@ pub fn lowercase_ext(path: &std::path::Path) -> Option<&'static str> {
        // Real-world C++ codebases overwhelmingly use `.cc` / `.cxx` /
        // `.hpp` / `.hh` / `.h++` rather than the `.cpp` synthetic-fixture
        // extension.  All map to the same tree-sitter-cpp grammar.  `.h`
-        // is intentionally NOT mapped — it's also valid C and
+        // is intentionally NOT mapped: it's also valid C and
        // disambiguating without a build system is brittle.
        "cpp" | "c++" | "cc" | "cxx" | "hpp" | "hxx" | "hh" | "h++" => Some("cpp"),
        "java" => Some("java"),
--- a/src/utils/project.rs
+++ b/src/utils/project.rs
@ -84,7 +84,7 @@ fn read_bounded(path: &Path) -> Option<String> {
 ///
 /// Intentionally a coarse byte-level substring check against the quoted module
 /// specifier (e.g. `'fastify'`, `"github.com/labstack/echo/v4"`,
-/// `'sinatra'`). Only the first 8 KiB of the file are inspected — imports /
+/// `'sinatra'`). Only the first 8 KiB of the file are inspected; imports /
 /// requires live at the top. Returns an empty list for languages without a
 /// framework detection policy here.
 pub fn detect_in_file_frameworks(bytes: &[u8], lang_slug: &str) -> Vec<DetectedFramework> {
@ -147,7 +147,7 @@ pub fn detect_frameworks(root: &Path) -> FrameworkContext {
    // ── Node.js (package.json) ──
    if let Some(content) = read_bounded(&root.join("package.json")) {
        // Crude substring search in the "dependencies" block area.
-        // Good enough for detection — no JSON parsing overhead.
+        // Good enough for detection, no JSON parsing overhead.
        if content.contains("\"express\"") {
            fws.push(DetectedFramework::Express);
        }
--- a/src/utils/query_cache.rs
+++ b/src/utils/query_cache.rs
@ -23,7 +23,7 @@ static CACHE: LazyLock<RwLock<HashMap<&'static str, QuerySet>>> =
 /// patterns for the language are cached normally. A language with an
 /// all-malformed pattern slice yields an empty cache entry.
 ///
-/// Lock poisoning on the shared cache is recovered transparently — a
+/// Lock poisoning on the shared cache is recovered transparently; a
 /// panic in another thread must not brick pattern loading process-wide.
 pub fn for_lang(lang: &'static str, ts_lang: Language) -> std::sync::Arc<Vec<CompiledQuery>> {
    // fast path
@ -31,7 +31,7 @@ pub fn for_lang(lang: &'static str, ts_lang: Language) -> std::sync::Arc<Vec<Com
        return v.clone();
    }

-    // slow path — compile
+    // slow path, compile
    let patterns = patterns::load(lang);
    let compiled: Vec<_> = patterns
        .into_iter()
--- a/src/utils/snippet.rs
+++ b/src/utils/snippet.rs
@ -1,19 +1,47 @@
-//! Source-line snippet extraction for diagnostics.
+//! UTF-8-safe truncation for diagnostic strings.
 //!
-//! Both [`crate::ast`] (per-finding evidence) and [`crate::summary`]
-//! (cross-file `SinkSite`) need to grab the source line containing a
-//! given byte offset, trim it, and cap it at a fixed character budget.
-//! The two callers used to carry private copies of this routine; the
-//! truncation step performed a raw byte slice (`&trimmed[..MAX]`) which
-//! panics whenever the cap lands inside a multi-byte UTF-8 character.
-//! Real-world Ruby/JS test suites with Cyrillic / CJK / emoji string
-//! literals tripped this on `mastodon`, `discourse`, and `gitlabhq`.
+//! Two related shapes live here:
 //!
-//! This shared helper truncates at the nearest preceding char
-//! boundary, so any UTF-8 input is safe.
+//! 1. [`line_snippet`]: extracts the trimmed source line containing
+//!    a byte offset, capped at ~120 bytes.  Used by [`crate::ast`]
+//!    (per-finding evidence) and [`crate::summary`] (cross-file
+//!    `SinkSite`).
+//! 2. [`truncate_at_char_boundary`], the underlying primitive: cap a
+//!    string at `max_bytes`, rounded down to the nearest UTF-8 char
+//!    boundary.
+//!
+//! Both arose from the same family of panics: real-world Ruby/JS/Go
+//! test suites carry literal Cyrillic / CJK / emoji / Devanagari /
+//! Gurmukhi inside string and regex constants.  Naive
+//! `&s[..MAX].to_string()` truncation panics whenever the cap lands
+//! inside a multi-byte UTF-8 sequence, killing the rayon worker that
+//! happens to lower that file.  Earlier sessions fixed `line_snippet`
+//! (mastodon / discourse / gitlabhq, Cyrillic in RSpec strings); the
+//! gogs scan still tripped because the CFG condition-text path
+//! (`src/cfg/conditions.rs`, `src/cfg/mod.rs`) carried a third copy
+//! of the same byte-slice idiom.  The Gurmukhi `'ਖ'` regex literal in
+//! gogs's localised Gherkin keyword list lands byte 256 mid-character
+//! and panics.  Centralising the safe-truncation primitive prevents
+//! the next bytes-vs-chars site from re-introducing the same bug.

 const MAX_SNIPPET_BYTES: usize = 120;

+/// Truncate `s` to at most `max_bytes` bytes, rounding the cut point
+/// down to the nearest UTF-8 character boundary so the returned slice
+/// is always valid UTF-8.  When `s.len() <= max_bytes` the slice is
+/// returned unchanged.  When `max_bytes == 0` an empty slice is
+/// returned.  Never panics on multi-byte input.
+pub fn truncate_at_char_boundary(s: &str, max_bytes: usize) -> &str {
+    if s.len() <= max_bytes {
+        return s;
+    }
+    let mut end = max_bytes;
+    while end > 0 && !s.is_char_boundary(end) {
+        end -= 1;
+    }
+    &s[..end]
+}
+
 /// Extract the trimmed source line containing `byte_offset`, capped
 /// at ~120 bytes (rounded down to the nearest UTF-8 char boundary).
 /// Returns `None` when the offset is out of range or the line is
@ -36,11 +64,10 @@ pub fn line_snippet(src: &[u8], byte_offset: usize) -> Option<String> {
        return None;
    }
    if trimmed.len() > MAX_SNIPPET_BYTES {
-        let mut end = MAX_SNIPPET_BYTES;
-        while end > 0 && !trimmed.is_char_boundary(end) {
-            end -= 1;
-        }
-        Some(format!("{}...", &trimmed[..end]))
+        Some(format!(
+            "{}...",
+            truncate_at_char_boundary(trimmed, MAX_SNIPPET_BYTES)
+        ))
    } else {
        Some(trimmed.to_string())
    }
@ -48,7 +75,51 @@ pub fn line_snippet(src: &[u8], byte_offset: usize) -> Option<String> {

 #[cfg(test)]
 mod tests {
-    use super::line_snippet;
+    use super::{line_snippet, truncate_at_char_boundary};
+
+    #[test]
+    fn truncate_short_string_unchanged() {
+        assert_eq!(truncate_at_char_boundary("hello", 10), "hello");
+        assert_eq!(truncate_at_char_boundary("", 10), "");
+    }
+
+    #[test]
+    fn truncate_zero_max_returns_empty() {
+        assert_eq!(truncate_at_char_boundary("hello", 0), "");
+        assert_eq!(truncate_at_char_boundary("ਖਖਖ", 0), "");
+    }
+
+    #[test]
+    fn truncate_ascii_clean_at_byte_max() {
+        assert_eq!(truncate_at_char_boundary("hello world", 5), "hello");
+    }
+
+    #[test]
+    fn truncate_inside_multibyte_rounds_down() {
+        // 'ਖ' (Gurmukhi LETTER KHA, U+0A16) is 3 bytes in UTF-8.
+        // Build a string where byte 5 lands inside the 'ਖ'.
+        let s = "abcdਖef";
+        // bytes: 0..4 = "abcd", 4..7 = 'ਖ', 7.. = "ef"
+        // Truncating at 5 must not panic; result is "abcd".
+        assert_eq!(truncate_at_char_boundary(s, 5), "abcd");
+        assert_eq!(truncate_at_char_boundary(s, 6), "abcd");
+        assert_eq!(truncate_at_char_boundary(s, 7), "abcdਖ");
+    }
+
+    #[test]
+    fn truncate_devanagari_gherkin_regex_literal() {
+        // Reproduces the gogs panic shape: long regex string that
+        // contains CJK / Hangul / Thai / Kannada / Telugu / Gurmukhi
+        // keywords with byte 256 landing mid-character.
+        let regex_body = "stream.match(/(機能|功能|フィーチャ|기능|โครงหลัก|ความสามารถ|ความต้องการทางธุรกิจ|ಹೆಚ್ಚಳ|గుణము|ਮੁਹਾਂਦਰਾ|ਨਕਸ਼ ਨੁਹਾਰ|".to_string();
+        assert!(regex_body.len() > 256);
+        // Must not panic.
+        let truncated = truncate_at_char_boundary(&regex_body, 256);
+        // Must be valid UTF-8 (it's already a `&str`, but the cut point
+        // landing on a boundary is the actual property under test).
+        assert!(regex_body.is_char_boundary(truncated.len()));
+        assert!(truncated.len() <= 256);
+    }

    #[test]
    fn ascii_short_line_returned_verbatim() {