mirror of
https://github.com/elicpeter/nyx.git
synced 2026-06-12 19:55:14 +02:00
Python fp and docs updates (#58)
* refactor: Update comments for clarity and add expectations.json files for performance metrics * feat: Implement FP guard for JS/TS local-collection receivers to suppress missing ownership checks * feat: Enhance Rust parameter handling to classify local collections and prevent false ownership checks * refactor: Simplify code formatting for better readability in multiple files * refactor: Improve UTF-8 sequence length handling and enhance clarity in loop iteration * feat: Update Java and Python patterns to include new security rules * refactor: Improve comment clarity and consistency across multiple Rust files * refactor: Simplify code formatting for improved readability in integration tests and module files * refactor: Improve comment formatting and enhance clarity in assertions across multiple files
This commit is contained in:
parent
4db0805de6
commit
a438886217
291 changed files with 9485 additions and 3851 deletions
|
|
@ -27,7 +27,7 @@ pub const DEFAULT_PARSE_TIMEOUT_MS: u64 = 10_000;
|
|||
/// value. Raised from the historical `4` to `32` so realistic codebases
|
||||
/// with wide joins (many param sources, deep helper chains) no longer
|
||||
/// silently drop origin attribution. Tunable via
|
||||
/// [`AnalysisOptions::max_origins`] — see
|
||||
/// [`AnalysisOptions::max_origins`], see
|
||||
/// `src/taint/ssa_transfer/state.rs::effective_max_origins`.
|
||||
pub const DEFAULT_MAX_ORIGINS: u32 = 32;
|
||||
|
||||
|
|
@ -38,11 +38,11 @@ pub const DEFAULT_MAX_ORIGINS: u32 = 32;
|
|||
pub const MIN_MAX_ORIGINS: u32 = 1;
|
||||
|
||||
/// Default upper bound on the number of abstract heap objects tracked per
|
||||
/// intra-procedural points-to set. Set to `32` — high enough that
|
||||
/// intra-procedural points-to set. Set to `32`, high enough that
|
||||
/// realistic factory/builder/DI patterns (routine 10–30 allocation sites
|
||||
/// aliased into one variable) stay precise, low enough to keep
|
||||
/// `HeapState` join/clone cost bounded in the worklist. Tunable via
|
||||
/// [`AnalysisOptions::max_pointsto`] — see
|
||||
/// [`AnalysisOptions::max_pointsto`], see
|
||||
/// `src/ssa/heap.rs::effective_max_pointsto`.
|
||||
pub const DEFAULT_MAX_POINTSTO: u32 = 32;
|
||||
|
||||
|
|
@ -152,7 +152,7 @@ impl Default for AnalysisOptions {
|
|||
/// (notably `nyx serve`, which resolves the engine profile per scan
|
||||
/// request) can replace the installed options between scans via
|
||||
/// [`reinstall`]. Within a single scan run, engine toggles must not
|
||||
/// change mid-flight — the caller is responsible for that invariant
|
||||
/// change mid-flight, the caller is responsible for that invariant
|
||||
/// (`JobManager`'s single-scan guarantee provides it in the server).
|
||||
static RUNTIME: RwLock<Option<AnalysisOptions>> = RwLock::new(None);
|
||||
|
||||
|
|
@ -174,7 +174,7 @@ pub fn install(opts: AnalysisOptions) -> bool {
|
|||
/// server's scan thread, which re-resolves the engine profile from each
|
||||
/// incoming request; `install`'s first-wins semantics would otherwise
|
||||
/// pin the first scan's choice for the lifetime of the server. Callers
|
||||
/// must ensure no scan is concurrently reading `current()` — in practice
|
||||
/// must ensure no scan is concurrently reading `current()`, in practice
|
||||
/// this means calling `reinstall` before the scan's rayon pool starts.
|
||||
pub fn reinstall(opts: AnalysisOptions) {
|
||||
*RUNTIME.write().expect("analysis options RwLock poisoned") = Some(opts);
|
||||
|
|
|
|||
|
|
@ -315,8 +315,8 @@ pub struct OutputConfig {
|
|||
/// When `true`, findings whose engine provenance notes include any
|
||||
/// `OverReport` (widening) or `Bail` (lowering/parse failure)
|
||||
/// direction are filtered out before output. `UnderReport`
|
||||
/// findings — where the result set is a lower bound but each
|
||||
/// emitted flow is still real — are kept.
|
||||
/// findings, where the result set is a lower bound but each
|
||||
/// emitted flow is still real, are kept.
|
||||
///
|
||||
/// Surfaced via `--require-converged`; intended for strict CI
|
||||
/// gating where a finding from capped analysis is worse than no
|
||||
|
|
@ -644,7 +644,7 @@ impl Default for RunsConfig {
|
|||
}
|
||||
}
|
||||
|
||||
/// A named scan profile — a partial overlay of scan-related settings.
|
||||
/// A named scan profile, a partial overlay of scan-related settings.
|
||||
/// All fields are `Option<T>`: `None` means "don't override".
|
||||
#[derive(Debug, Serialize, Deserialize, Clone, Default)]
|
||||
#[serde(default)]
|
||||
|
|
@ -715,7 +715,7 @@ pub struct Config {
|
|||
pub server: ServerConfig,
|
||||
pub runs: RunsConfig,
|
||||
pub profiles: HashMap<String, ScanProfile>,
|
||||
/// Detected frameworks for the current project — set by the scan pipeline,
|
||||
/// Detected frameworks for the current project, set by the scan pipeline,
|
||||
/// not persisted to config files.
|
||||
#[serde(skip)]
|
||||
pub framework_ctx: Option<crate::utils::project::FrameworkContext>,
|
||||
|
|
|
|||
|
|
@ -5,7 +5,7 @@ pub fn lowercase_ext(path: &std::path::Path) -> Option<&'static str> {
|
|||
// Real-world C++ codebases overwhelmingly use `.cc` / `.cxx` /
|
||||
// `.hpp` / `.hh` / `.h++` rather than the `.cpp` synthetic-fixture
|
||||
// extension. All map to the same tree-sitter-cpp grammar. `.h`
|
||||
// is intentionally NOT mapped — it's also valid C and
|
||||
// is intentionally NOT mapped, it's also valid C and
|
||||
// disambiguating without a build system is brittle.
|
||||
"cpp" | "c++" | "cc" | "cxx" | "hpp" | "hxx" | "hh" | "h++" => Some("cpp"),
|
||||
"java" => Some("java"),
|
||||
|
|
|
|||
|
|
@ -84,7 +84,7 @@ fn read_bounded(path: &Path) -> Option<String> {
|
|||
///
|
||||
/// Intentionally a coarse byte-level substring check against the quoted module
|
||||
/// specifier (e.g. `'fastify'`, `"github.com/labstack/echo/v4"`,
|
||||
/// `'sinatra'`). Only the first 8 KiB of the file are inspected — imports /
|
||||
/// `'sinatra'`). Only the first 8 KiB of the file are inspected, imports /
|
||||
/// requires live at the top. Returns an empty list for languages without a
|
||||
/// framework detection policy here.
|
||||
pub fn detect_in_file_frameworks(bytes: &[u8], lang_slug: &str) -> Vec<DetectedFramework> {
|
||||
|
|
@ -147,7 +147,7 @@ pub fn detect_frameworks(root: &Path) -> FrameworkContext {
|
|||
// ── Node.js (package.json) ──
|
||||
if let Some(content) = read_bounded(&root.join("package.json")) {
|
||||
// Crude substring search in the "dependencies" block area.
|
||||
// Good enough for detection — no JSON parsing overhead.
|
||||
// Good enough for detection, no JSON parsing overhead.
|
||||
if content.contains("\"express\"") {
|
||||
fws.push(DetectedFramework::Express);
|
||||
}
|
||||
|
|
|
|||
|
|
@ -23,7 +23,7 @@ static CACHE: LazyLock<RwLock<HashMap<&'static str, QuerySet>>> =
|
|||
/// patterns for the language are cached normally. A language with an
|
||||
/// all-malformed pattern slice yields an empty cache entry.
|
||||
///
|
||||
/// Lock poisoning on the shared cache is recovered transparently — a
|
||||
/// Lock poisoning on the shared cache is recovered transparently, a
|
||||
/// panic in another thread must not brick pattern loading process-wide.
|
||||
pub fn for_lang(lang: &'static str, ts_lang: Language) -> std::sync::Arc<Vec<CompiledQuery>> {
|
||||
// fast path
|
||||
|
|
@ -31,7 +31,7 @@ pub fn for_lang(lang: &'static str, ts_lang: Language) -> std::sync::Arc<Vec<Com
|
|||
return v.clone();
|
||||
}
|
||||
|
||||
// slow path — compile
|
||||
// slow path, compile
|
||||
let patterns = patterns::load(lang);
|
||||
let compiled: Vec<_> = patterns
|
||||
.into_iter()
|
||||
|
|
|
|||
|
|
@ -1,19 +1,47 @@
|
|||
//! Source-line snippet extraction for diagnostics.
|
||||
//! UTF-8-safe truncation for diagnostic strings.
|
||||
//!
|
||||
//! Both [`crate::ast`] (per-finding evidence) and [`crate::summary`]
|
||||
//! (cross-file `SinkSite`) need to grab the source line containing a
|
||||
//! given byte offset, trim it, and cap it at a fixed character budget.
|
||||
//! The two callers used to carry private copies of this routine; the
|
||||
//! truncation step performed a raw byte slice (`&trimmed[..MAX]`) which
|
||||
//! panics whenever the cap lands inside a multi-byte UTF-8 character.
|
||||
//! Real-world Ruby/JS test suites with Cyrillic / CJK / emoji string
|
||||
//! literals tripped this on `mastodon`, `discourse`, and `gitlabhq`.
|
||||
//! Two related shapes live here:
|
||||
//!
|
||||
//! This shared helper truncates at the nearest preceding char
|
||||
//! boundary, so any UTF-8 input is safe.
|
||||
//! 1. [`line_snippet`], extracts the trimmed source line containing
|
||||
//! a byte offset, capped at ~120 bytes. Used by [`crate::ast`]
|
||||
//! (per-finding evidence) and [`crate::summary`] (cross-file
|
||||
//! `SinkSite`).
|
||||
//! 2. [`truncate_at_char_boundary`], the underlying primitive: cap a
|
||||
//! string at `max_bytes`, rounded down to the nearest UTF-8 char
|
||||
//! boundary.
|
||||
//!
|
||||
//! Both arose from the same family of panics: real-world Ruby/JS/Go
|
||||
//! test suites carry literal Cyrillic / CJK / emoji / Devanagari /
|
||||
//! Gurmukhi inside string and regex constants. Naive
|
||||
//! `&s[..MAX].to_string()` truncation panics whenever the cap lands
|
||||
//! inside a multi-byte UTF-8 sequence, killing the rayon worker that
|
||||
//! happens to lower that file. Earlier sessions fixed `line_snippet`
|
||||
//! (mastodon / discourse / gitlabhq, Cyrillic in RSpec strings); the
|
||||
//! gogs scan still tripped because the CFG condition-text path
|
||||
//! (`src/cfg/conditions.rs`, `src/cfg/mod.rs`) carried a third copy
|
||||
//! of the same byte-slice idiom. The Gurmukhi `'ਖ'` regex literal in
|
||||
//! gogs's localised Gherkin keyword list lands byte 256 mid-character
|
||||
//! and panics. Centralising the safe-truncation primitive prevents
|
||||
//! the next bytes-vs-chars site from re-introducing the same bug.
|
||||
|
||||
const MAX_SNIPPET_BYTES: usize = 120;
|
||||
|
||||
/// Truncate `s` to at most `max_bytes` bytes, rounding the cut point
|
||||
/// down to the nearest UTF-8 character boundary so the returned slice
|
||||
/// is always valid UTF-8. When `s.len() <= max_bytes` the slice is
|
||||
/// returned unchanged. When `max_bytes == 0` an empty slice is
|
||||
/// returned. Never panics on multi-byte input.
|
||||
pub fn truncate_at_char_boundary(s: &str, max_bytes: usize) -> &str {
|
||||
if s.len() <= max_bytes {
|
||||
return s;
|
||||
}
|
||||
let mut end = max_bytes;
|
||||
while end > 0 && !s.is_char_boundary(end) {
|
||||
end -= 1;
|
||||
}
|
||||
&s[..end]
|
||||
}
|
||||
|
||||
/// Extract the trimmed source line containing `byte_offset`, capped
|
||||
/// at ~120 bytes (rounded down to the nearest UTF-8 char boundary).
|
||||
/// Returns `None` when the offset is out of range or the line is
|
||||
|
|
@ -36,11 +64,10 @@ pub fn line_snippet(src: &[u8], byte_offset: usize) -> Option<String> {
|
|||
return None;
|
||||
}
|
||||
if trimmed.len() > MAX_SNIPPET_BYTES {
|
||||
let mut end = MAX_SNIPPET_BYTES;
|
||||
while end > 0 && !trimmed.is_char_boundary(end) {
|
||||
end -= 1;
|
||||
}
|
||||
Some(format!("{}...", &trimmed[..end]))
|
||||
Some(format!(
|
||||
"{}...",
|
||||
truncate_at_char_boundary(trimmed, MAX_SNIPPET_BYTES)
|
||||
))
|
||||
} else {
|
||||
Some(trimmed.to_string())
|
||||
}
|
||||
|
|
@ -48,7 +75,51 @@ pub fn line_snippet(src: &[u8], byte_offset: usize) -> Option<String> {
|
|||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::line_snippet;
|
||||
use super::{line_snippet, truncate_at_char_boundary};
|
||||
|
||||
#[test]
|
||||
fn truncate_short_string_unchanged() {
|
||||
assert_eq!(truncate_at_char_boundary("hello", 10), "hello");
|
||||
assert_eq!(truncate_at_char_boundary("", 10), "");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn truncate_zero_max_returns_empty() {
|
||||
assert_eq!(truncate_at_char_boundary("hello", 0), "");
|
||||
assert_eq!(truncate_at_char_boundary("ਖਖਖ", 0), "");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn truncate_ascii_clean_at_byte_max() {
|
||||
assert_eq!(truncate_at_char_boundary("hello world", 5), "hello");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn truncate_inside_multibyte_rounds_down() {
|
||||
// 'ਖ' (Gurmukhi LETTER KHA, U+0A16) is 3 bytes in UTF-8.
|
||||
// Build a string where byte 5 lands inside the 'ਖ'.
|
||||
let s = "abcdਖef";
|
||||
// bytes: 0..4 = "abcd", 4..7 = 'ਖ', 7.. = "ef"
|
||||
// Truncating at 5 must not panic; result is "abcd".
|
||||
assert_eq!(truncate_at_char_boundary(s, 5), "abcd");
|
||||
assert_eq!(truncate_at_char_boundary(s, 6), "abcd");
|
||||
assert_eq!(truncate_at_char_boundary(s, 7), "abcdਖ");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn truncate_devanagari_gherkin_regex_literal() {
|
||||
// Reproduces the gogs panic shape: long regex string that
|
||||
// contains Devanagari / Gurmukhi / CJK / Thai keywords with
|
||||
// byte 256 landing mid-character.
|
||||
let regex_body = "stream.match(/(機能|功能|フィーチャ|기능|โครงหลัก|ความสามารถ|ความต้องการทางธุรกิจ|ಹೆಚ್ಚಳ|గుణము|ਮੁਹਾਂਦਰਾ|ਨਕਸ਼ ਨੁਹਾਰ|".to_string();
|
||||
assert!(regex_body.len() > 256);
|
||||
// Must not panic.
|
||||
let truncated = truncate_at_char_boundary(&regex_body, 256);
|
||||
// Must be valid UTF-8 (it's already a `&str`, but the cut point
|
||||
// landing on a boundary is the actual property under test).
|
||||
assert!(regex_body.is_char_boundary(truncated.len()));
|
||||
assert!(truncated.len() <= 256);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn ascii_short_line_returned_verbatim() {
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue