nyx/src/utils/snippet.rs

//! Source-line snippet extraction for diagnostics.
//!
//! Both [`crate::ast`] (per-finding evidence) and [`crate::summary`]
//! (cross-file `SinkSite`) need to grab the source line containing a
//! given byte offset, trim it, and cap it at a fixed byte budget.
//! The two callers used to carry private copies of this routine; the
//! truncation step performed a raw byte slice (`&trimmed[..MAX]`) which
//! panics whenever the cap lands inside a multi-byte UTF-8 character.
//! Real-world Ruby/JS test suites with Cyrillic / CJK / emoji string
//! literals tripped this on `mastodon`, `discourse`, and `gitlabhq`.
//!
//! This shared helper truncates at the nearest preceding char
//! boundary, so any UTF-8 input is safe.

/// Upper bound, in bytes, on the line text kept in front of the `...`
/// truncation marker.
const MAX_SNIPPET_BYTES: usize = 120;

/// Return the source line that contains `byte_offset`, with leading and
/// trailing whitespace removed.
///
/// Lines whose trimmed text exceeds [`MAX_SNIPPET_BYTES`] bytes are cut
/// at the closest UTF-8 char boundary at or below that limit and get
/// `...` appended; shorter lines are returned unchanged.
///
/// Yields `None` when `byte_offset` is not a valid index into `src`,
/// when the line is not valid UTF-8, or when nothing is left after
/// trimming.
pub fn line_snippet(src: &[u8], byte_offset: usize) -> Option<String> {
    let (before, from_offset) = if byte_offset < src.len() {
        src.split_at(byte_offset)
    } else {
        return None;
    };

    // The line begins just past the last newline preceding the offset
    // (or at the start of the buffer when there is none).
    let start = match before.iter().rposition(|&b| b == b'\n') {
        Some(newline) => newline + 1,
        None => 0,
    };
    // ...and ends at the first newline at or after the offset (or at
    // the end of the buffer).
    let stop = match from_offset.iter().position(|&b| b == b'\n') {
        Some(rel) => byte_offset + rel,
        None => src.len(),
    };

    let text = std::str::from_utf8(&src[start..stop]).ok()?.trim();
    if text.is_empty() {
        return None;
    }
    if text.len() <= MAX_SNIPPET_BYTES {
        return Some(text.to_owned());
    }

    // Slicing a `str` in the middle of a multi-byte character panics, so
    // back off from the cap until we sit on a char boundary.  Index 0 is
    // always a boundary, which makes the fallback unreachable in practice.
    let cut = (0..=MAX_SNIPPET_BYTES)
        .rev()
        .find(|&idx| text.is_char_boundary(idx))
        .unwrap_or(0);

    let mut snippet = String::with_capacity(cut + 3);
    snippet.push_str(&text[..cut]);
    snippet.push_str("...");
    Some(snippet)
}

#[cfg(test)]
mod tests {
    //! Unit tests for [`line_snippet`].
    //!
    //! The truncation tests pin the exact output and additionally assert
    //! that byte 120 of the fixture is *not* a char boundary, so they fail
    //! (by panicking) if the helper ever regresses to a raw byte slice.

    use super::line_snippet;

    /// Lines under the cap come back trimmed but otherwise untouched.
    #[test]
    fn ascii_short_line_returned_verbatim() {
        let src = b"let x = 1;\nlet y = 2;\n";
        assert_eq!(line_snippet(src, 0).as_deref(), Some("let x = 1;"));
        assert_eq!(line_snippet(src, 11).as_deref(), Some("let y = 2;"));
    }

    /// A whitespace-only line yields no snippet.
    #[test]
    fn blank_line_returns_none() {
        let src = b"x\n   \n";
        assert_eq!(line_snippet(src, 2), None);
    }

    /// Offsets at or beyond the end of the buffer yield no snippet.
    #[test]
    fn out_of_range_returns_none() {
        let src = b"abc";
        assert_eq!(line_snippet(src, 10), None);
        // One past the last byte is already out of range.
        assert_eq!(line_snippet(src, src.len()), None);
    }

    /// A line of exactly 120 bytes fits the budget and gets no ellipsis.
    #[test]
    fn line_of_exactly_120_bytes_is_not_truncated() {
        let line = "x".repeat(120);
        assert_eq!(line_snippet(line.as_bytes(), 0), Some(line.clone()));
    }

    /// Over-long ASCII lines keep their first 120 bytes plus `...`.
    #[test]
    fn long_ascii_line_truncated_at_120_with_ellipsis() {
        let long = "x".repeat(200);
        let out = line_snippet(long.as_bytes(), 0).unwrap();
        assert_eq!(out, format!("{}...", "x".repeat(120))); // 120 + "..."
        assert_eq!(out.len(), 123);
    }

    #[test]
    fn long_line_with_multibyte_char_at_boundary_does_not_panic() {
        // Cyrillic chars are 2 bytes each.  The ASCII head is exactly 119
        // bytes (20-byte literal + 99 filler bytes), so the first Cyrillic
        // char occupies bytes 119..121 and byte 120 lands inside it.  This
        // is the regression shape that crashed mastodon/discourse/gitlabhq
        // scans.
        let head = format!("expect(text).to eq('{}", "a".repeat(99));
        assert_eq!(head.len(), 119);
        let line = format!("{}тест огромный текст ' * 50)", head);
        // Guard the fixture itself: the cap must fall mid-character,
        // otherwise this test would pass even with a raw byte slice.
        assert!(line.len() > 120);
        assert!(!line.is_char_boundary(120));

        let out = line_snippet(line.as_bytes(), 0).unwrap();
        // Truncation backs off to the boundary at byte 119.
        assert_eq!(out, format!("{}...", head));
    }

    #[test]
    fn truncation_at_emoji_boundary_safe() {
        // 4-byte emoji starting at byte 118, so byte 120 is mid-emoji.
        let head = "x".repeat(118);
        let line = format!("{}🦀🦀🦀🦀🦀", head); // 4 bytes each
        assert!(line.len() > 120);
        assert!(!line.is_char_boundary(120));

        let out = line_snippet(line.as_bytes(), 0).unwrap();
        // Truncation backs off to the boundary at byte 118.
        assert_eq!(out, format!("{}...", head));
    }

    /// The offset selects the line it falls in, not the first line.
    #[test]
    fn picks_correct_line_for_offset_in_middle() {
        let src = b"first\nsecond line here\nthird\n";
        // Offset 6 is the 's' of "second".
        assert_eq!(line_snippet(src, 6).as_deref(), Some("second line here"));
    }
}
Prerelease cleanup (#46) * feat: Add const_bound_vars tracking to prevent false positives in ownership checks * feat: Introduce field interner and typed bounded vars for enhanced type tracking * feat: Add typed_call_receivers and typed_bounded_dto_fields for enhanced type tracking * feat: Centralize method name extraction with bare_method_name helper * feat: Implement Phase-6 hierarchy fan-out for runtime virtual dispatch * feat: Enhance C++ taint tracking with additional container operations and inline method resolution * feat: Introduce field-sensitive points-to analysis for enhanced resource tracking * feat: Implement Pointer-Phase 6 subscript handling for enhanced container analysis * test: Add comprehensive tests for JavaScript control flow constructs and lattice operations * docs: Update advanced analysis documentation with field-sensitive points-to and hierarchy fan-out details * test: Add comprehensive tests for lattice algebra laws and SSA edge cases * feat: Add destructured session user handling and safe user ID access patterns * feat: Implement row-population reverse-walk for enhanced authorization checks * feat: Enhance authorization checks with local alias chain for self-actor types * feat: Introduce ActiveRecord query safety checks and enhance snippet extraction * feat: Implement chained method call inner-gate rebinding for SSRF prevention * feat: Add observability and error modules, enhance debug functionality, and implement theme context * feat: Remove Auth Analysis page and update navigation to redirect to Explorer * feat: Optimize SSA lowering by sharing results between taint engine and artifact extractor * feat: Optimize SSA lowering by sharing results between taint engine and artifact extractor * feat: Reset path-safe-suppressed spans before lowering to maintain analysis integrity * fix(ssa): ungate debug_assert_bfs_ordering for release-tests build The helper at src/ssa/lower.rs was gated `#[cfg(debug_assertions)]` while the unit test at the 
bottom of the file was gated only `#[cfg(test)]`. Since `cfg(test)` is set in release builds with `--tests` but `cfg(debug_assertions)` is not, `cargo build --release --tests` failed with E0425. Removing the gate fixes the build; the body is `debug_assert!` only, so the helper is free in release. Also drop the gate at the call site to avoid a `dead_code` warning when the lib is built without `--tests`. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com> * test(closure-capture): flip JS/TS fixtures to required-finding The JS and TS closure-capture fixtures pinned the old broken behaviour via `forbidden_findings: [{ "id_prefix": "taint-" }]`. The engine now correctly traces taint through the closure boundary (env source captured by an arrow function, sunk via `child_process.exec` inside the body), so the formerly-forbidden finding is a true positive. Match the Python sibling's shape — `required_findings` with `id_prefix` + `min_count` plus a small `noise_budget` — and rewrite the companion READMEs and the phase8_fragility_tests doc-comments from "known gap" to "regression guard". 
Verified: - cargo test --release --test phase8_fragility_tests → 8/8 pass - cargo test --release --lib bfs_assertion → pass - corpus benchmark F1 = 0.9976 (TP=205, FP=1, FN=0) — unchanged Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com> * feat: Add OWASP mapping and baseline mutation hooks for enhanced security analysis * feat: Introduce health module and enhance health score computation with calibration tests * feat: Add expectations configuration and cleanup .gitignore for log files * feat: Implement theme selection and enhance settings panel for triage sync * feat: Suppress false positives for strcpy calls with literal sources in AST * feat: Update analyse_function_ssa to return body CFG for accurate analysis * feat: Add bug report and feature request templates for improved issue tracking * feat: removed dev scripts * feat: update README.md for clarity and consistency in fixture descriptions * feat: removed dev docs * feat: clean up error handling and UI elements for improved user experience * feat: adjust button sizes in HeaderBar for better UI consistency * feat: enhance taint analysis with additional context for sanitizer and taint findings * cargo fmt * prettier * refactor: simplify conditional checks and improve code readability in AST and screenshot capture scripts * feat: add script to frame PNG screenshots with brand gradient * feat: add fuzzing support with new targets and CI workflows * refactor: streamline match expressions and improve formatting in CLI and output handling * feat: enhance configuration display with detailed output options * feat: stage demo configuration for improved CLI screenshot output * feat: expose merge_configs function for user-configurable settings * refactor: simplify code structure and improve readability in config handling * refactor: improve descriptions for vulnerability patterns in various languages * feat: update MIT License section with additional usage details and copyright information * feat: 
update screenshots * refactor: update build process and paths for frontend assets * feat: add cross-file taint fuzzing target and supporting dictionary * refactor: clean up formatting and comments in fuzz configuration and example files * refactor: remove outdated comments and clean up CI configuration files * chore: update changelog dates and improve formatting in documentation * refactor: update Cargo.toml and CI configuration for improved packaging and build process * refactor: enhance quote-stripping logic to prevent panics and add regression tests --------- Co-authored-by: Claude Opus 4.7 (1M context) <noreply@anthropic.com> 2026-04-29 00:58:38 -04:00			`//! Source-line snippet extraction for diagnostics.`
			`//!`
			//! Both [`crate::ast`] (per-finding evidence) and [`crate::summary`]
			//! (cross-file `SinkSite`) need to grab the source line containing a
			`//! given byte offset, trim it, and cap it at a fixed character budget.`
			`//! The two callers used to carry private copies of this routine; the`
			//! truncation step performed a raw byte slice (`&trimmed[..MAX]`) which
			`//! panics whenever the cap lands inside a multi-byte UTF-8 character.`
			`//! Real-world Ruby/JS test suites with Cyrillic / CJK / emoji string`
			//! literals tripped this on `mastodon`, `discourse`, and `gitlabhq`.
			`//!`
			`//! This shared helper truncates at the nearest preceding char`
			`//! boundary, so any UTF-8 input is safe.`

			`const MAX_SNIPPET_BYTES: usize = 120;`

			/// Extract the trimmed source line containing `byte_offset`, capped
			`/// at ~120 bytes (rounded down to the nearest UTF-8 char boundary).`
			/// Returns `None` when the offset is out of range or the line is
			`/// blank after trimming.`
			`pub fn line_snippet(src: &[u8], byte_offset: usize) -> Option<String> {`
			`if byte_offset >= src.len() {`
			`return None;`
			`}`
			`let line_start = src[..byte_offset]`
			`.iter()`
			`.rposition(\|&b\| b == b'\n')`
			`.map_or(0, \|p\| p + 1);`
			`let line_end = src[byte_offset..]`
			`.iter()`
			`.position(\|&b\| b == b'\n')`
			`.map_or(src.len(), \|p\| byte_offset + p);`
			`let line = std::str::from_utf8(&src[line_start..line_end]).ok()?;`
			`let trimmed = line.trim();`
			`if trimmed.is_empty() {`
			`return None;`
			`}`
			`if trimmed.len() > MAX_SNIPPET_BYTES {`
			`let mut end = MAX_SNIPPET_BYTES;`
			`while end > 0 && !trimmed.is_char_boundary(end) {`
			`end -= 1;`
			`}`
			`Some(format!("{}...", &trimmed[..end]))`
			`} else {`
			`Some(trimmed.to_string())`
			`}`
			`}`

			`#[cfg(test)]`
			`mod tests {`
			`use super::line_snippet;`

			`#[test]`
			`fn ascii_short_line_returned_verbatim() {`
			`let src = b"let x = 1;\nlet y = 2;\n";`
			`assert_eq!(line_snippet(src, 0).as_deref(), Some("let x = 1;"));`
			`assert_eq!(line_snippet(src, 11).as_deref(), Some("let y = 2;"));`
			`}`

			`#[test]`
			`fn blank_line_returns_none() {`
			`let src = b"x\n \n";`
			`assert_eq!(line_snippet(src, 2), None);`
			`}`

			`#[test]`
			`fn out_of_range_returns_none() {`
			`let src = b"abc";`
			`assert_eq!(line_snippet(src, 10), None);`
			`}`

			`#[test]`
			`fn long_ascii_line_truncated_at_120_with_ellipsis() {`
			`let long = "x".repeat(200);`
			`let src = long.as_bytes();`
			`let out = line_snippet(src, 0).unwrap();`
			`assert!(out.ends_with("..."));`
			`assert_eq!(out.len(), 123); // 120 + "..."`
			`}`

			`#[test]`
			`fn long_line_with_multibyte_char_at_boundary_does_not_panic() {`
			`// Cyrillic chars are 2 bytes each; build a string where byte`
			`// 120 lands inside a 2-byte sequence. This is the regression`
			`// shape that crashed mastodon/discourse/gitlabhq scans.`
			`let prefix = "a".repeat(119);`
			`let line = format!("expect(text).to eq('{}тест огромный текст ' * 50)", prefix);`
			`// Pad to ensure the line is > 120 bytes.`
			`let line = format!("{} {}", line, "тест ".repeat(50));`
			`let src = line.as_bytes();`
			`let out = line_snippet(src, 0).unwrap();`
			`assert!(out.ends_with("..."));`
			`// Truncation must produce valid UTF-8 (no panic, no replacement).`
			`assert!(std::str::from_utf8(out.as_bytes()).is_ok());`
			`// And the prefix preceding "..." must end on a char boundary.`
			`let stripped = out.strip_suffix("...").unwrap();`
			`assert!(stripped.is_char_boundary(stripped.len()));`
			`}`

			`#[test]`
			`fn truncation_at_emoji_boundary_safe() {`
			`// 4-byte emoji. Build line so byte 120 lands inside the emoji.`
			`let mut line = "x".repeat(118);`
			`line.push_str("🦀🦀🦀🦀🦀"); // 4 bytes each`
			`// Repeat to ensure > 120 bytes and the 120th byte is mid-emoji.`
			`let src = line.as_bytes();`
			`assert!(src.len() > 120);`
			`let out = line_snippet(src, 0).unwrap();`
			`assert!(std::str::from_utf8(out.as_bytes()).is_ok());`
			`assert!(out.ends_with("..."));`
			`}`

			`#[test]`
			`fn picks_correct_line_for_offset_in_middle() {`
			`let src = b"first\nsecond line here\nthird\n";`
			`// Offset 6 is the 's' of "second".`
			`assert_eq!(line_snippet(src, 6).as_deref(), Some("second line here"));`
			`}`
			`}`