nyx/src/labels/cpp.rs
Eli Peter 58f1794a4e
Added Cap::DATA_EXFIL and taint fp and fn fixes on real repos (#59)
* feat: Enhance data exfiltration detection with source sensitivity gating for cookies and headers

* feat: Implement cross-file data exfiltration detection with parameter-specific gate filters

* feat: Add calibration tests and refine DATA_EXFIL severity scoring logic

* feat: Introduce per-detector configuration for data exfiltration suppression

* feat: Enhance DATA_EXFIL findings with destination field tracking in diagnostics and SARIF output

* feat: Add tainted body and URL handling for data exfiltration detection

* feat: Add integration tests and fixtures for DATA_EXFIL and SSRF detection in Go

* feat: Add Java integration tests and fixtures for DATA_EXFIL detection across multiple HTTP clients

* feat: Add synthetic externals handling for closure-captured variables in SSA

* feat: Implement closure-based suppression for resource leak findings

* feat: Add regression guards for shell-injection and taint propagation in for-of destructure patterns

* feat: Implement constructor cap narrowing for data exfiltration detection in HTTP request builders

* feat: Add gated sinks for data exfiltration detection in C and C++ using curl_easy_setopt

* feat: Implement DATA_EXFIL cap parity for backwards analysis and add integration tests

* feat: Add data exfiltration sinks for various languages and enhance documentation

* refactor: Simplify formatting and improve readability in various files

* refactor: Improve readability by simplifying conditional statements and adding clippy linting

* docs: Update CHANGELOG and comments for data exfiltration features and configuration

* docs: Clarify configuration instructions for data exfiltration trusted destinations

* docs: Enhance comments for evidence routing logic in data exfiltration
2026-05-01 10:59:52 -04:00

205 lines
7.5 KiB
Rust

use crate::labels::{Cap, DataLabel, GateActivation, Kind, LabelRule, ParamConfig, SinkGate};
use phf::{Map, phf_map};
/// Label rules for C++: taint sources, sanitizers, and sinks.
///
/// Every rule in this table sets `case_sensitive: false`. The order of the
/// rules and of the matchers inside each rule is kept stable on purpose.
pub static RULES: &[LabelRule] = &[
    // ═══════════ Sources ═══════════
    // Environment lookup.
    LabelRule {
        matchers: &["getenv"],
        label: DataLabel::Source(Cap::all()),
        case_sensitive: false,
    },
    // Standard-input / stream readers.
    LabelRule {
        matchers: &["std::cin", "std::getline", "fgets", "scanf", "gets"],
        label: DataLabel::Source(Cap::all()),
        case_sensitive: false,
    },
    // Data arriving over the network.
    LabelRule {
        matchers: &["recv", "recvfrom"],
        label: DataLabel::Source(Cap::all()),
        case_sensitive: false,
    },
    // ═════════ Sanitizers ══════════
    // `sanitize_*` prefix: treated as a developer-asserted, general-purpose
    // sanitizer, so it clears the whole cap mask. Unless a more specific
    // sanitizer rule with a narrower cap exists, the function is assumed to
    // cover every taint cap that passes through it. Clearing only one cap
    // (say HTML_ESCAPE) would under-clear and yield false positives as soon
    // as the downstream sink uses another cap (e.g. FMT_STRING via printf),
    // which is the common situation in C/C++ code.
    LabelRule {
        matchers: &["sanitize_"],
        label: DataLabel::Sanitizer(Cap::all()),
        case_sensitive: false,
    },
    // Numeric conversions, C++ STL spelling. The whole `std::sto*` family
    // (64-bit `*ll` / `*ull` and `*ld` included) yields an integral or
    // floating value, so string-injection caps stop applying downstream.
    LabelRule {
        matchers: &[
            "std::stoi", "std::stol", "std::stoll", "std::stoul", "std::stoull",
            "std::stof", "std::stod", "std::stold",
        ],
        label: DataLabel::Sanitizer(Cap::all()),
        case_sensitive: false,
    },
    // Numeric conversions, C-stdlib spelling (still usable from C++). The
    // caller gets a parsed number rather than the original string, which
    // clears the string-injection caps downstream.
    LabelRule {
        matchers: &[
            "atoi",
            "atol",
            "atoll",
            "atof",
            "strtol",
            "strtoul",
            "strtoll",
            "strtoull",
        ],
        label: DataLabel::Sanitizer(Cap::all()),
        case_sensitive: false,
    },
    // ════════════ Sinks ════════════
    // Process spawning / shell execution.
    LabelRule {
        matchers: &[
            "system",
            "popen",
            "execl",
            "execlp",
            "execle",
            "execve",
            "execvp",
        ],
        label: DataLabel::Sink(Cap::SHELL_ESCAPE),
        case_sensitive: false,
    },
    // Buffer-writing string routines.
    // NOTE(review): these carry HTML_ESCAPE rather than a dedicated cap;
    // confirm that is the intended classification.
    LabelRule {
        matchers: &["sprintf", "strcpy", "strcat"],
        label: DataLabel::Sink(Cap::HTML_ESCAPE),
        case_sensitive: false,
    },
    // Format-string consumers.
    LabelRule {
        matchers: &["printf", "fprintf"],
        label: DataLabel::Sink(Cap::FMT_STRING),
        case_sensitive: false,
    },
    // File opening.
    LabelRule {
        matchers: &["fopen", "open"],
        label: DataLabel::Sink(Cap::FILE_IO),
        case_sensitive: false,
    },
    // Outbound network requests / connections.
    LabelRule {
        matchers: &["curl_easy_perform", "connect"],
        label: DataLabel::Sink(Cap::SSRF),
        case_sensitive: false,
    },
];
/// Argument-gated sinks for C++.
///
/// This mirrors the C gate set. A call to `curl_easy_setopt` whose arg 1 is
/// `CURLOPT_POSTFIELDS` or `CURLOPT_COPYPOSTFIELDS` binds the request body
/// passed at arg 2, so that payload argument is labelled a
/// `Cap::DATA_EXFIL` sink.
///
/// Activation by identifier relies on the macro-arg fallback in
/// `cfg::mod::classify_gated_sink` for `lang == "cpp" / "c++"`.
///
/// Modern C++ HTTP wrappers (cpr, Boost.Beast) sit on top of libcurl or talk
/// to the socket directly. Their ergonomic surfaces differ enough that
/// per-library gates are deferred to a corpus-driven follow-up.
pub static GATED_SINKS: &[SinkGate] = &[
    SinkGate {
        callee_matcher: "curl_easy_setopt",
        // The option selector is the second argument (index 1).
        arg_index: 1,
        dangerous_values: &["CURLOPT_POSTFIELDS", "CURLOPT_COPYPOSTFIELDS"],
        dangerous_prefixes: &[],
        label: DataLabel::Sink(Cap::DATA_EXFIL),
        case_sensitive: true,
        // The request body is the third argument (index 2).
        payload_args: &[2],
        keyword_name: None,
        dangerous_kwargs: &[],
        activation: GateActivation::ValueMatch,
    },
];
/// Maps C++ syntax-node kind names to the analyzer's [`Kind`] categories.
///
/// Entry order is kept as-is; only the layout and comments differ.
pub static KINDS: Map<&'static str, Kind> = phf_map! {
    // ── control flow ──
    "if_statement" => Kind::If,
    "while_statement" => Kind::While,
    "for_statement" => Kind::For,
    "for_range_loop" => Kind::For,
    // do/while shares the `While` kind.
    "do_statement" => Kind::While,
    "switch_statement" => Kind::Switch,
    "case_statement" => Kind::Block,
    "labeled_statement" => Kind::Block,
    "return_statement" => Kind::Return,
    "throw_statement" => Kind::Throw,
    "break_statement" => Kind::Break,
    "continue_statement" => Kind::Continue,

    // ── structure ──
    "translation_unit" => Kind::SourceFile,
    "compound_statement" => Kind::Block,
    "else_clause" => Kind::Block,
    "function_definition" => Kind::Function,
    "try_statement" => Kind::Try,
    "catch_clause" => Kind::Block,
    "lambda_expression" => Kind::Function,

    // Namespace bodies and C++ class bodies are walked as ordinary Blocks.
    // That lets the CFG builder reach the function_definitions / lambdas
    // nested inside and extract each as its own body. If these entries were
    // missing, a `class_specifier` / `struct_specifier` would hit the generic
    // `_ =>` arm in `build_sub`, which records a leaf `Seq` node without
    // walking the body; inline member-function definitions (and methods of
    // nested classes) would then be dropped silently.
    "declaration_list" => Kind::Block,
    "field_declaration_list" => Kind::Block,
    "class_specifier" => Kind::Block,
    "struct_specifier" => Kind::Block,
    "union_specifier" => Kind::Block,
    "enum_specifier" => Kind::Block,
    "template_declaration" => Kind::Block,
    "linkage_specification" => Kind::Block,

    // ── data flow ──
    "call_expression" => Kind::CallFn,
    "new_expression" => Kind::CallFn,
    "delete_expression" => Kind::CallFn,
    "assignment_expression" => Kind::Assignment,
    "declaration" => Kind::CallWrapper,
    "expression_statement" => Kind::CallWrapper,

    // ── trivia ──
    "comment" => Kind::Trivia,
    ";" => Kind::Trivia,
    "," => Kind::Trivia,
    "(" => Kind::Trivia,
    ")" => Kind::Trivia,
    "{" => Kind::Trivia,
    "}" => Kind::Trivia,
    "\n" => Kind::Trivia,
    "preproc_include" => Kind::Trivia,
    "preproc_def" => Kind::Trivia,
    "using_declaration" => Kind::Trivia,

    // Structural, not trivia: a namespace is walked as a Block like the
    // class/struct entries above. It stays last to keep the entry order.
    "namespace_definition" => Kind::Block,
};
/// Parameter-extraction settings for C++ function definitions.
///
/// NOTE(review): the strings look like syntax-tree field names and node
/// kinds from the C++ grammar; confirm against the code that consumes
/// `ParamConfig`.
pub static PARAM_CONFIG: ParamConfig = ParamConfig {
    // Field that holds the parameter list.
    params_field: "parameters",
    // Node kinds counted as a single parameter.
    param_node_kinds: &["parameter_declaration"],
    // No node kind is registered as a self/receiver parameter for C++.
    self_param_kinds: &[],
    // Fields listed for finding a parameter's identifier.
    ident_fields: &["declarator", "name"],
};
/// Output-parameter source positions for known C++ APIs (benchmark-driven).
///
/// Each entry pairs a callee name with the zero-based argument indices that
/// receive the incoming data.
pub static OUTPUT_PARAM_SOURCES: &[(&str, &[usize])] = &[
    // std::getline(stream, str): `str` receives the input.
    ("getline", &[1]),
    ("std::getline", &[1]),
    // fgets / gets: the destination buffer is the first argument.
    ("fgets", &[0]),
    ("gets", &[0]),
    // recv / recvfrom: the buffer follows the socket descriptor.
    ("recv", &[1]),
    ("recvfrom", &[1]),
];
/// Argument-to-argument taint propagation for known C++ functions.
///
/// For each callee, taint on any `from_args` index is carried over to every
/// `to_args` index (indices are zero-based).
pub static ARG_PROPAGATIONS: &[super::ArgPropagation] = &[
    // inet_pton(af, src, dst): arg 1 flows into arg 2.
    super::ArgPropagation {
        callee: "inet_pton",
        from_args: &[1],
        to_args: &[2],
    },
    // inet_aton(cp, inp): arg 0 flows into arg 1.
    super::ArgPropagation {
        callee: "inet_aton",
        from_args: &[0],
        to_args: &[1],
    },
];