Python FP and docs updates (#58)

* refactor: Update comments for clarity and add expectations.json files for performance metrics

* feat: Implement FP guard for JS/TS local-collection receivers to suppress missing ownership checks

* feat: Enhance Rust parameter handling to classify local collections and prevent false ownership checks

* refactor: Simplify code formatting for better readability in multiple files

* refactor: Improve UTF-8 sequence length handling and enhance clarity in loop iteration

* feat: Update Java and Python patterns to include new security rules

* refactor: Improve comment clarity and consistency across multiple Rust files

* refactor: Simplify code formatting for improved readability in integration tests and module files

* refactor: Improve comment formatting and enhance clarity in assertions across multiple files
This commit is contained in:
Eli Peter 2026-04-29 19:53:34 -04:00 committed by GitHub
parent 4db0805de6
commit a438886217
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
291 changed files with 9485 additions and 3851 deletions

View file

@ -27,7 +27,7 @@ pub const DEFAULT_PARSE_TIMEOUT_MS: u64 = 10_000;
/// value. Raised from the historical `4` to `32` so realistic codebases
/// with wide joins (many param sources, deep helper chains) no longer
/// silently drop origin attribution. Tunable via
/// [`AnalysisOptions::max_origins`] see
/// [`AnalysisOptions::max_origins`], see
/// `src/taint/ssa_transfer/state.rs::effective_max_origins`.
pub const DEFAULT_MAX_ORIGINS: u32 = 32;
@ -38,11 +38,11 @@ pub const DEFAULT_MAX_ORIGINS: u32 = 32;
pub const MIN_MAX_ORIGINS: u32 = 1;
/// Default upper bound on the number of abstract heap objects tracked per
/// intra-procedural points-to set. Set to `32` high enough that
/// intra-procedural points-to set. Set to `32`, high enough that
/// realistic factory/builder/DI patterns (routine 10-30 allocation sites
/// aliased into one variable) stay precise, low enough to keep
/// `HeapState` join/clone cost bounded in the worklist. Tunable via
/// [`AnalysisOptions::max_pointsto`] see
/// [`AnalysisOptions::max_pointsto`], see
/// `src/ssa/heap.rs::effective_max_pointsto`.
pub const DEFAULT_MAX_POINTSTO: u32 = 32;
@ -152,7 +152,7 @@ impl Default for AnalysisOptions {
/// (notably `nyx serve`, which resolves the engine profile per scan
/// request) can replace the installed options between scans via
/// [`reinstall`]. Within a single scan run, engine toggles must not
/// change mid-flight the caller is responsible for that invariant
/// change mid-flight; the caller is responsible for that invariant
/// (`JobManager`'s single-scan guarantee provides it in the server).
static RUNTIME: RwLock<Option<AnalysisOptions>> = RwLock::new(None);
@ -174,7 +174,7 @@ pub fn install(opts: AnalysisOptions) -> bool {
/// server's scan thread, which re-resolves the engine profile from each
/// incoming request; `install`'s first-wins semantics would otherwise
/// pin the first scan's choice for the lifetime of the server. Callers
/// must ensure no scan is concurrently reading `current()` in practice
/// must ensure no scan is concurrently reading `current()`; in practice
/// this means calling `reinstall` before the scan's rayon pool starts.
pub fn reinstall(opts: AnalysisOptions) {
*RUNTIME.write().expect("analysis options RwLock poisoned") = Some(opts);

View file

@ -315,8 +315,8 @@ pub struct OutputConfig {
/// When `true`, findings whose engine provenance notes include any
/// `OverReport` (widening) or `Bail` (lowering/parse failure)
/// direction are filtered out before output. `UnderReport`
/// findings where the result set is a lower bound but each
/// emitted flow is still real are kept.
/// findings, where the result set is a lower bound but each
/// emitted flow is still real, are kept.
///
/// Surfaced via `--require-converged`; intended for strict CI
/// gating where a finding from capped analysis is worse than no
@ -644,7 +644,7 @@ impl Default for RunsConfig {
}
}
/// A named scan profile a partial overlay of scan-related settings.
/// A named scan profile, a partial overlay of scan-related settings.
/// All fields are `Option<T>`: `None` means "don't override".
#[derive(Debug, Serialize, Deserialize, Clone, Default)]
#[serde(default)]
@ -715,7 +715,7 @@ pub struct Config {
pub server: ServerConfig,
pub runs: RunsConfig,
pub profiles: HashMap<String, ScanProfile>,
/// Detected frameworks for the current project set by the scan pipeline,
/// Detected frameworks for the current project, set by the scan pipeline,
/// not persisted to config files.
#[serde(skip)]
pub framework_ctx: Option<crate::utils::project::FrameworkContext>,

View file

@ -5,7 +5,7 @@ pub fn lowercase_ext(path: &std::path::Path) -> Option<&'static str> {
// Real-world C++ codebases overwhelmingly use `.cc` / `.cxx` /
// `.hpp` / `.hh` / `.h++` rather than the `.cpp` synthetic-fixture
// extension. All map to the same tree-sitter-cpp grammar. `.h`
// is intentionally NOT mapped it's also valid C and
// is intentionally NOT mapped; it's also valid C and
// disambiguating without a build system is brittle.
"cpp" | "c++" | "cc" | "cxx" | "hpp" | "hxx" | "hh" | "h++" => Some("cpp"),
"java" => Some("java"),

View file

@ -84,7 +84,7 @@ fn read_bounded(path: &Path) -> Option<String> {
///
/// Intentionally a coarse byte-level substring check against the quoted module
/// specifier (e.g. `'fastify'`, `"github.com/labstack/echo/v4"`,
/// `'sinatra'`). Only the first 8 KiB of the file are inspected imports /
/// `'sinatra'`). Only the first 8 KiB of the file are inspected, imports /
/// requires live at the top. Returns an empty list for languages without a
/// framework detection policy here.
pub fn detect_in_file_frameworks(bytes: &[u8], lang_slug: &str) -> Vec<DetectedFramework> {
@ -147,7 +147,7 @@ pub fn detect_frameworks(root: &Path) -> FrameworkContext {
// ── Node.js (package.json) ──
if let Some(content) = read_bounded(&root.join("package.json")) {
// Crude substring search in the "dependencies" block area.
// Good enough for detection no JSON parsing overhead.
// Good enough for detection, no JSON parsing overhead.
if content.contains("\"express\"") {
fws.push(DetectedFramework::Express);
}

View file

@ -23,7 +23,7 @@ static CACHE: LazyLock<RwLock<HashMap<&'static str, QuerySet>>> =
/// patterns for the language are cached normally. A language with an
/// all-malformed pattern slice yields an empty cache entry.
///
/// Lock poisoning on the shared cache is recovered transparently a
/// Lock poisoning on the shared cache is recovered transparently; a
/// panic in another thread must not brick pattern loading process-wide.
pub fn for_lang(lang: &'static str, ts_lang: Language) -> std::sync::Arc<Vec<CompiledQuery>> {
// fast path
@ -31,7 +31,7 @@ pub fn for_lang(lang: &'static str, ts_lang: Language) -> std::sync::Arc<Vec<Com
return v.clone();
}
// slow path compile
// slow path, compile
let patterns = patterns::load(lang);
let compiled: Vec<_> = patterns
.into_iter()

View file

@ -1,19 +1,47 @@
//! Source-line snippet extraction for diagnostics.
//! UTF-8-safe truncation for diagnostic strings.
//!
//! Both [`crate::ast`] (per-finding evidence) and [`crate::summary`]
//! (cross-file `SinkSite`) need to grab the source line containing a
//! given byte offset, trim it, and cap it at a fixed character budget.
//! The two callers used to carry private copies of this routine; the
//! truncation step performed a raw byte slice (`&trimmed[..MAX]`) which
//! panics whenever the cap lands inside a multi-byte UTF-8 character.
//! Real-world Ruby/JS test suites with Cyrillic / CJK / emoji string
//! literals tripped this on `mastodon`, `discourse`, and `gitlabhq`.
//! Two related shapes live here:
//!
//! This shared helper truncates at the nearest preceding char
//! boundary, so any UTF-8 input is safe.
//! 1. [`line_snippet`] extracts the trimmed source line containing
//! a byte offset, capped at ~120 bytes. Used by [`crate::ast`]
//! (per-finding evidence) and [`crate::summary`] (cross-file
//! `SinkSite`).
//! 2. [`truncate_at_char_boundary`], the underlying primitive: cap a
//! string at `max_bytes`, rounded down to the nearest UTF-8 char
//! boundary.
//!
//! Both arose from the same family of panics: real-world Ruby/JS/Go
//! test suites carry literal Cyrillic / CJK / emoji / Devanagari /
//! Gurmukhi inside string and regex constants. Naive
//! `&s[..MAX].to_string()` truncation panics whenever the cap lands
//! inside a multi-byte UTF-8 sequence, killing the rayon worker that
//! happens to lower that file. Earlier sessions fixed `line_snippet`
//! (mastodon / discourse / gitlabhq, Cyrillic in RSpec strings); the
//! gogs scan still tripped because the CFG condition-text path
//! (`src/cfg/conditions.rs`, `src/cfg/mod.rs`) carried a third copy
//! of the same byte-slice idiom. The Gurmukhi `'ਖ'` regex literal in
//! gogs's localised Gherkin keyword list lands byte 256 mid-character
//! and panics. Centralising the safe-truncation primitive prevents
//! the next bytes-vs-chars site from re-introducing the same bug.
const MAX_SNIPPET_BYTES: usize = 120;
/// Clip `s` so that it occupies no more than `max_bytes` bytes without
/// ever splitting a UTF-8 sequence.
///
/// * If `s` already fits (`s.len() <= max_bytes`) it is handed back as is.
/// * Otherwise the cut position starts at `max_bytes` and moves left
///   until it sits on a character boundary; the prefix up to that
///   position is returned.
/// * `max_bytes == 0` yields the empty string.
///
/// The result borrows from `s`, and multi-byte input cannot trigger a
/// slicing panic.
pub fn truncate_at_char_boundary(s: &str, max_bytes: usize) -> &str {
    if max_bytes >= s.len() {
        return s;
    }
    // Offset 0 is a boundary of every string, so the backwards search
    // always succeeds; `unwrap_or(0)` only spells out that fallback.
    let cut = (0..=max_bytes)
        .rev()
        .find(|&idx| s.is_char_boundary(idx))
        .unwrap_or(0);
    &s[..cut]
}
/// Extract the trimmed source line containing `byte_offset`, capped
/// at ~120 bytes (rounded down to the nearest UTF-8 char boundary).
/// Returns `None` when the offset is out of range or the line is
@ -36,11 +64,10 @@ pub fn line_snippet(src: &[u8], byte_offset: usize) -> Option<String> {
return None;
}
if trimmed.len() > MAX_SNIPPET_BYTES {
let mut end = MAX_SNIPPET_BYTES;
while end > 0 && !trimmed.is_char_boundary(end) {
end -= 1;
}
Some(format!("{}...", &trimmed[..end]))
Some(format!(
"{}...",
truncate_at_char_boundary(trimmed, MAX_SNIPPET_BYTES)
))
} else {
Some(trimmed.to_string())
}
@ -48,7 +75,51 @@ pub fn line_snippet(src: &[u8], byte_offset: usize) -> Option<String> {
#[cfg(test)]
mod tests {
use super::line_snippet;
use super::{line_snippet, truncate_at_char_boundary};
#[test]
fn truncate_short_string_unchanged() {
    // Inputs that already fit in the budget (including the empty
    // string) must come back exactly as they went in.
    for input in ["hello", ""] {
        assert_eq!(truncate_at_char_boundary(input, 10), input);
    }
}
#[test]
fn truncate_zero_max_returns_empty() {
    // A zero-byte budget leaves nothing, for ASCII and multi-byte
    // input alike.
    for input in ["hello", "ਖਖਖ"] {
        assert!(truncate_at_char_boundary(input, 0).is_empty());
    }
}
#[test]
fn truncate_ascii_clean_at_byte_max() {
    // In pure ASCII every byte offset is a boundary, so the cut lands
    // exactly on the requested budget.
    let clipped = truncate_at_char_boundary("hello world", 5);
    assert_eq!(clipped, "hello");
}
#[test]
fn truncate_inside_multibyte_rounds_down() {
    // Layout of the sample: bytes 0..4 hold "abcd", bytes 4..7 hold the
    // three-byte 'ਖ' (Gurmukhi LETTER KHA, U+0A16), and "ef" follows.
    let sample = "abcdਖef";
    // Budgets 5 and 6 fall inside 'ਖ' and must round down to "abcd"
    // without panicking; budget 7 ends exactly after the character.
    let cases = [(5, "abcd"), (6, "abcd"), (7, "abcdਖ")];
    for (budget, expected) in cases {
        assert_eq!(truncate_at_char_boundary(sample, budget), expected);
    }
}
#[test]
fn truncate_devanagari_gherkin_regex_literal() {
    // Mirrors the gogs panic shape: a long regex string of localised
    // keywords (CJK, Hangul, Thai, Kannada, Telugu and Gurmukhi in this
    // sample) in which byte 256 lands in the middle of a character.
    const LIMIT: usize = 256;
    let literal = String::from("stream.match(/(機能|功能|フィーチャ|기능|โครงหลัก|ความสามารถ|ความต้องการทางธุรกิจ|ಹೆಚ್ಚಳ|గుణము|ਮੁਹਾਂਦਰਾ|ਨਕਸ਼ ਨੁਹਾਰ|");
    assert!(literal.len() > LIMIT);
    // The call itself must not panic.
    let clipped = truncate_at_char_boundary(&literal, LIMIT);
    // `clipped` is a `&str` and therefore valid UTF-8 already; the
    // property actually under test is that the cut sits on a character
    // boundary of the source and stays within the limit.
    assert!(literal.is_char_boundary(clipped.len()));
    assert!(clipped.len() <= LIMIT);
}
#[test]
fn ascii_short_line_returned_verbatim() {