use super::*; use crate::cfg::FuncSummaries; use crate::interop::InteropEdge; use crate::labels::Cap; use crate::symbol::FuncKey; #[test] fn env_to_arg_is_flagged() { use crate::cfg::build_cfg; use tree_sitter::Language; let src = br#" use std::env; use std::process::Command; fn main() { let x = env::var("DANGEROUS_ARG").unwrap(); Command::new("sh").arg(x).status().unwrap(); }"#; let mut parser = tree_sitter::Parser::new(); parser .set_language(&Language::from(tree_sitter_rust::LANGUAGE)) .unwrap(); let tree = parser.parse(src as &[u8], None).unwrap(); let (cfg, entry, summaries) = build_cfg(&tree, src, "rust", "test.rs", None); let findings = analyse_file(&cfg, entry, &summaries, None, Lang::Rust, "test.rs", &[]); assert_eq!(findings.len(), 1); // exactly one unsanitised Source→Sink } #[test] fn taint_through_if_else() { use crate::cfg::build_cfg; use tree_sitter::Language; let src = br#" use std::env; use std::process::Command; fn main() { let x = env::var("DANGEROUS").unwrap(); let safe = html_escape::encode_safe(&x); if x.len() > 5 { Command::new("sh").arg(&x).status().unwrap(); // UNSAFE } else { Command::new("sh").arg(&safe).status().unwrap(); // SAFE } }"#; let mut parser = tree_sitter::Parser::new(); parser .set_language(&Language::from(tree_sitter_rust::LANGUAGE)) .unwrap(); let tree = parser.parse(src as &[u8], None).unwrap(); let (cfg, entry, summaries) = build_cfg(&tree, src, "rust", "test.rs", None); let findings = analyse_file(&cfg, entry, &summaries, None, Lang::Rust, "test.rs", &[]); // Both branches have findings: the true branch uses unsanitized `x`, // the else branch uses `safe` which was sanitized with HTML_ESCAPE // but the sink requires SHELL_ESCAPE (wrong sanitizer → still tainted). assert_eq!(findings.len(), 2); } #[test] fn taint_through_while_loop() { use crate::cfg::build_cfg; use tree_sitter::Language; let src = br#" use std::{env, process::Command}; fn main() { let mut x = env::var("DANGEROUS").unwrap(); while x.len() < 100 { // Loop header (Loop) x.push_str("a"); } Command::new("sh").arg(x).status().unwrap(); // Should be flagged }"#; let mut parser = tree_sitter::Parser::new(); parser .set_language(&Language::from(tree_sitter_rust::LANGUAGE)) .unwrap(); let tree = parser.parse(src as &[u8], None).unwrap(); let (cfg, entry, summaries) = build_cfg(&tree, src, "rust", "test.rs", None); let findings = analyse_file(&cfg, entry, &summaries, None, Lang::Rust, "test.rs", &[]); assert_eq!(findings.len(), 1); } #[test] fn taint_killed_by_matching_sanitizer() { use crate::cfg::build_cfg; use tree_sitter::Language; // shell_escape sanitizer strips SHELL_ESCAPE → Command sink checks // SHELL_ESCAPE → the matching bit is gone → no finding. let src = br#" use std::{env, process::Command}; fn main() { let x = env::var("DANGEROUS").unwrap(); let clean = shell_escape::unix::escape(&x); Command::new("sh").arg(clean).status().unwrap(); }"#; let mut parser = tree_sitter::Parser::new(); parser .set_language(&Language::from(tree_sitter_rust::LANGUAGE)) .unwrap(); let tree = parser.parse(src as &[u8], None).unwrap(); let (cfg, entry, summaries) = build_cfg(&tree, src, "rust", "test.rs", None); let findings = analyse_file(&cfg, entry, &summaries, None, Lang::Rust, "test.rs", &[]); assert!( findings.is_empty(), "matching sanitizer should kill the taint" ); } #[test] fn wrong_sanitizer_preserves_taint() { use crate::cfg::build_cfg; use tree_sitter::Language; // html_escape sanitizer strips HTML_ESCAPE, but Command sink checks // SHELL_ESCAPE → the wrong bit was stripped → finding persists. let src = br#" use std::{env, process::Command}; fn main() { let x = env::var("DANGEROUS").unwrap(); let clean = html_escape::encode_safe(&x); Command::new("sh").arg(clean).status().unwrap(); }"#; let mut parser = tree_sitter::Parser::new(); parser .set_language(&Language::from(tree_sitter_rust::LANGUAGE)) .unwrap(); let tree = parser.parse(src as &[u8], None).unwrap(); let (cfg, entry, summaries) = build_cfg(&tree, src, "rust", "test.rs", None); let findings = analyse_file(&cfg, entry, &summaries, None, Lang::Rust, "test.rs", &[]); assert_eq!( findings.len(), 1, "wrong sanitizer should NOT kill the taint" ); } #[test] fn taint_breaks_out_of_loop() { use crate::cfg::build_cfg; use tree_sitter::Language; let src = br#" use std::{env, process::Command}; fn main() { loop { let x = env::var("DANGEROUS").unwrap(); Command::new("sh").arg(&x).status().unwrap(); // vulnerable break; } }"#; let mut parser = tree_sitter::Parser::new(); parser .set_language(&Language::from(tree_sitter_rust::LANGUAGE)) .unwrap(); let tree = parser.parse(src as &[u8], None).unwrap(); let (cfg, entry, summaries) = build_cfg(&tree, src, "rust", "test.rs", None); let findings = analyse_file(&cfg, entry, &summaries, None, Lang::Rust, "test.rs", &[]); assert_eq!(findings.len(), 1); } #[test] fn test_two_sources_one_sanitised() { use crate::cfg::build_cfg; use tree_sitter::Language; // Two env sources, one properly sanitised with the MATCHING sanitiser. // x → unsanitised → Command = FINDING // y → shell_escape → Command = safe let src = br#" use std::{env, process::Command}; fn main() { let x = env::var("DANGEROUS").unwrap(); let y = env::var("ANOTHER").unwrap(); let clean = shell_escape::unix::escape(&y); Command::new("sh").arg(x).status().unwrap(); Command::new("sh").arg(clean).status().unwrap(); }"#; let mut parser = tree_sitter::Parser::new(); parser .set_language(&Language::from(tree_sitter_rust::LANGUAGE)) .unwrap(); let tree = parser.parse(src as &[u8], None).unwrap(); let (cfg, entry, summaries) = build_cfg(&tree, src, "rust", "test.rs", None); let findings = analyse_file(&cfg, entry, &summaries, None, Lang::Rust, "test.rs", &[]); assert_eq!( findings.len(), 1, "only the unsanitised source should be flagged" ); } #[test] fn test_two_sources_wrong_sanitiser_both_flagged() { use crate::cfg::build_cfg; use tree_sitter::Language; // Two env sources, one "sanitised" with the WRONG sanitiser. // x → unsanitised → Command = FINDING // y → html_escape → Command = FINDING (wrong sanitiser for shell sink) let src = br#" use std::{env, process::Command}; fn main() { let x = env::var("DANGEROUS").unwrap(); let y = env::var("ANOTHER").unwrap(); let clean = html_escape::encode_safe(&y); Command::new("sh").arg(x).status().unwrap(); Command::new("sh").arg(clean).status().unwrap(); }"#; let mut parser = tree_sitter::Parser::new(); parser .set_language(&Language::from(tree_sitter_rust::LANGUAGE)) .unwrap(); let tree = parser.parse(src as &[u8], None).unwrap(); let (cfg, entry, summaries) = build_cfg(&tree, src, "rust", "test.rs", None); let findings = analyse_file(&cfg, entry, &summaries, None, Lang::Rust, "test.rs", &[]); assert_eq!( findings.len(), 2, "both should be flagged — wrong sanitiser" ); } #[test] fn test_should_not_panic_on_empty_function() { use crate::cfg::build_cfg; use tree_sitter::Language; let src = br#" use std::{env, process::Command}; fn f() { if cond() { return; } do_something(); }"#; let mut parser = tree_sitter::Parser::new(); parser .set_language(&Language::from(tree_sitter_rust::LANGUAGE)) .unwrap(); let tree = parser.parse(src as &[u8], None).unwrap(); let (cfg, entry, summaries) = build_cfg(&tree, src, "rust", "test.rs", None); let findings = analyse_file(&cfg, entry, &summaries, None, Lang::Rust, "test.rs", &[]); assert!(findings.is_empty()); } #[test] fn cross_file_source_resolved_via_global_summaries() { use crate::summary::FuncSummary; // Simulate file B calling `get_dangerous()` which is defined in file A. // File A's summary says get_dangerous is a Source(all). let src = br#" use std::process::Command; fn main() { let x = get_dangerous(); Command::new("sh").arg(x).status().unwrap(); }"#; let (cfg, entry, local_summaries) = parse_rust(src); // Build global summaries as if file A exported get_dangerous let mut global = GlobalSummaries::new(); let key = FuncKey { lang: Lang::Rust, namespace: "file_a.rs".into(), name: "get_dangerous".into(), arity: Some(0), }; global.insert( key, FuncSummary { name: "get_dangerous".into(), file_path: "file_a.rs".into(), lang: "rust".into(), param_count: 0, param_names: vec![], source_caps: Cap::all().bits(), sanitizer_caps: 0, sink_caps: 0, propagates_taint: false, tainted_sink_params: vec![], callees: vec![], }, ); let findings = analyse_file( &cfg, entry, &local_summaries, Some(&global), Lang::Rust, "test.rs", &[], ); assert_eq!(findings.len(), 1, "cross-file source should be detected"); } #[test] fn cross_file_sanitizer_resolved_via_global_summaries() { use crate::summary::FuncSummary; // File B gets tainted data and passes it through `my_sanitize()` from file A. let src = br#" use std::{env, process::Command}; fn main() { let x = env::var("DANGEROUS").unwrap(); let clean = my_sanitize(x); Command::new("sh").arg(clean).status().unwrap(); }"#; let (cfg, entry, local_summaries) = parse_rust(src); let mut global = GlobalSummaries::new(); let key = FuncKey { lang: Lang::Rust, namespace: "file_a.rs".into(), name: "my_sanitize".into(), arity: Some(1), }; global.insert( key, FuncSummary { name: "my_sanitize".into(), file_path: "file_a.rs".into(), lang: "rust".into(), param_count: 1, param_names: vec!["input".into()], source_caps: 0, sanitizer_caps: Cap::all().bits(), sink_caps: 0, propagates_taint: true, tainted_sink_params: vec![], callees: vec![], }, ); let findings = analyse_file( &cfg, entry, &local_summaries, Some(&global), Lang::Rust, "test.rs", &[], ); assert!( findings.is_empty(), "cross-file sanitizer should neutralise taint" ); } // ───────────────────────────────────────────────────────────────────────────── // Shared test helpers // ───────────────────────────────────────────────────────────────────────────── /// Parse Rust source bytes → (cfg, entry, local_summaries) fn parse_rust(src: &[u8]) -> (Cfg, NodeIndex, FuncSummaries) { use crate::cfg::build_cfg; use tree_sitter::Language; let mut parser = tree_sitter::Parser::new(); parser .set_language(&Language::from(tree_sitter_rust::LANGUAGE)) .unwrap(); let tree = parser.parse(src, None).unwrap(); build_cfg(&tree, src, "rust", "test.rs", None) } /// Parse Rust source bytes, build CFG, and export cross-file summaries. fn extract_summaries_from_bytes(src: &[u8], path: &str) -> Vec { use crate::cfg::export_summaries; let (_, _, local) = parse_rust(src); export_summaries(&local, path, "rust") } #[test] fn cross_file_sink_resolved_via_global_summaries() { use crate::summary::FuncSummary; // File B calls `dangerous_exec(x)` from file A which is a sink. let src = br#" use std::env; fn main() { let x = env::var("INPUT").unwrap(); dangerous_exec(x); }"#; let (cfg, entry, local_summaries) = parse_rust(src); let mut global = GlobalSummaries::new(); let key = FuncKey { lang: Lang::Rust, namespace: "file_a.rs".into(), name: "dangerous_exec".into(), arity: Some(1), }; global.insert( key, FuncSummary { name: "dangerous_exec".into(), file_path: "file_a.rs".into(), lang: "rust".into(), param_count: 1, param_names: vec!["cmd".into()], source_caps: 0, sanitizer_caps: 0, sink_caps: Cap::SHELL_ESCAPE.bits(), propagates_taint: false, tainted_sink_params: vec![0], callees: vec!["Command::new".into()], }, ); let findings = analyse_file( &cfg, entry, &local_summaries, Some(&global), Lang::Rust, "test.rs", &[], ); assert_eq!(findings.len(), 1, "cross-file sink should be detected"); } // ───────────────────────────────────────────────────────────────────────────── // Multi-file integration tests (real parsing, full pass-1 → pass-2 pipeline) // ───────────────────────────────────────────────────────────────────────────── #[test] fn multi_file_source_to_sink_detected() { use crate::summary::merge_summaries; // File A: defines get_dangerous() which calls env::var (a source). let lib_src = br#" use std::env; fn get_dangerous() -> String { env::var("SECRET").unwrap() } "#; // File B: calls get_dangerous() then passes result to Command (a sink). let caller_src = br#" use std::process::Command; fn main() { let x = get_dangerous(); Command::new("sh").arg(x).status().unwrap(); } "#; let summaries = extract_summaries_from_bytes(lib_src, "lib.rs"); let global = merge_summaries(summaries, None); let (cfg, entry, local) = parse_rust(caller_src); let findings = analyse_file( &cfg, entry, &local, Some(&global), Lang::Rust, "test.rs", &[], ); assert_eq!( findings.len(), 1, "cross-file source → inline sink should produce 1 finding" ); } #[test] fn multi_file_sanitizer_neutralises_cross_file_source() { use crate::summary::merge_summaries; // File A: source + matching shell sanitizer. // NOTE: function name avoids `sanitize_` prefix which triggers // the inline HTML sanitizer label rule. let lib_src = br#" use std::env; fn get_input() -> String { env::var("INPUT").unwrap() } fn clean_shell(s: &str) -> String { shell_escape::unix::escape(s).to_string() } "#; // File B: source → clean_shell → shell sink. let caller_src = br#" use std::process::Command; fn main() { let x = get_input(); let clean = clean_shell(&x); Command::new("sh").arg(clean).status().unwrap(); } "#; let summaries = extract_summaries_from_bytes(lib_src, "lib.rs"); let global = merge_summaries(summaries, None); let (cfg, entry, local) = parse_rust(caller_src); let findings = analyse_file( &cfg, entry, &local, Some(&global), Lang::Rust, "test.rs", &[], ); assert!( findings.is_empty(), "matching cross-file sanitizer should neutralise taint, got {} findings", findings.len() ); } #[test] fn multi_file_wrong_sanitizer_preserves_taint() { use crate::summary::merge_summaries; // File A: source + HTML sanitizer (wrong for shell sink). let lib_src = br#" use std::env; fn get_input() -> String { env::var("INPUT").unwrap() } fn clean_html(s: &str) -> String { html_escape::encode_safe(s).to_string() } "#; // File B: source → HTML sanitize → shell sink → should still flag. let caller_src = br#" use std::process::Command; fn main() { let x = get_input(); let clean = clean_html(&x); Command::new("sh").arg(clean).status().unwrap(); } "#; let summaries = extract_summaries_from_bytes(lib_src, "lib.rs"); let global = merge_summaries(summaries, None); let (cfg, entry, local) = parse_rust(caller_src); let findings = analyse_file( &cfg, entry, &local, Some(&global), Lang::Rust, "test.rs", &[], ); assert_eq!( findings.len(), 1, "wrong sanitizer (HTML for shell sink) should NOT neutralise taint" ); } #[test] fn multi_file_sink_in_another_file() { use crate::summary::merge_summaries; // File A: defines exec_cmd() which internally calls Command::new (a sink). let lib_src = br#" use std::process::Command; fn exec_cmd(cmd: &str) { Command::new("sh").arg(cmd).status().unwrap(); } "#; // File B: env::var → exec_cmd() — sink is cross-file. let caller_src = br#" use std::env; fn main() { let x = env::var("DANGEROUS").unwrap(); exec_cmd(&x); } "#; let summaries = extract_summaries_from_bytes(lib_src, "lib.rs"); let global = merge_summaries(summaries, None); let (cfg, entry, local) = parse_rust(caller_src); let findings = analyse_file( &cfg, entry, &local, Some(&global), Lang::Rust, "test.rs", &[], ); assert_eq!(findings.len(), 1, "cross-file sink should be detected"); } #[test] fn multi_file_passthrough_preserves_taint() { use crate::summary::FuncSummary; // identity() just returns its argument — it propagates taint but has no // source/sanitizer/sink caps of its own. let mut global = GlobalSummaries::new(); let key = FuncKey { lang: Lang::Rust, namespace: "lib.rs".into(), name: "identity".into(), arity: Some(1), }; global.insert( key, FuncSummary { name: "identity".into(), file_path: "lib.rs".into(), lang: "rust".into(), param_count: 1, param_names: vec!["s".into()], source_caps: 0, sanitizer_caps: 0, sink_caps: 0, propagates_taint: true, tainted_sink_params: vec![], callees: vec![], }, ); let caller_src = br#" use std::{env, process::Command}; fn main() { let x = env::var("DANGEROUS").unwrap(); let y = identity(&x); Command::new("sh").arg(y).status().unwrap(); } "#; let (cfg, entry, local) = parse_rust(caller_src); let findings = analyse_file( &cfg, entry, &local, Some(&global), Lang::Rust, "test.rs", &[], ); assert_eq!( findings.len(), 1, "taint should propagate through passthrough function" ); } #[test] fn multi_file_chain_source_sanitize_sink_across_files() { use crate::summary::merge_summaries; // Library file defines all three roles: source, sanitizer, sink. let lib_src = br#" use std::env; use std::process::Command; fn get_input() -> String { env::var("INPUT").unwrap() } fn clean_shell(s: &str) -> String { shell_escape::unix::escape(s).to_string() } fn exec_cmd(cmd: &str) { Command::new("sh").arg(cmd).status().unwrap(); } "#; // Caller: source → correct sanitizer → sink. let caller_src = br#" fn main() { let x = get_input(); let clean = clean_shell(&x); exec_cmd(&clean); } "#; let summaries = extract_summaries_from_bytes(lib_src, "lib.rs"); let global = merge_summaries(summaries, None); let (cfg, entry, local) = parse_rust(caller_src); let findings = analyse_file( &cfg, entry, &local, Some(&global), Lang::Rust, "test.rs", &[], ); assert!( findings.is_empty(), "source → matching sanitizer → sink should produce 0 findings, got {}", findings.len() ); } // ───────────────────────────────────────────────────────────────────────────── // Edge-case unit tests // ───────────────────────────────────────────────────────────────────────────── #[test] fn sanitizer_strips_only_matching_bits() { // Source(ALL) → shell_escape → sink_html (HTML sink). // shell_escape strips SHELL_ESCAPE but not HTML_ESCAPE. // sink_html is an HTML sink — HTML_ESCAPE bit is still set → 1 finding. let src = br#" use std::env; fn sink_html(s: &str) {} fn main() { let x = env::var("DANGEROUS").unwrap(); let clean = shell_escape::unix::escape(&x); sink_html(&clean); } "#; let (cfg, entry, summaries) = parse_rust(src); let findings = analyse_file(&cfg, entry, &summaries, None, Lang::Rust, "test.rs", &[]); assert_eq!( findings.len(), 1, "shell sanitizer should NOT strip HTML_ESCAPE bit; HTML sink should still fire" ); } #[test] fn multiple_sanitizers_strip_all_bits() { // Source → shell_escape → html_escape → Command (shell sink). // shell_escape strips SHELL_ESCAPE; html_escape strips HTML_ESCAPE. // After both, the remaining taint bits relevant to SHELL_ESCAPE are gone. let src = br#" use std::{env, process::Command}; fn main() { let x = env::var("DANGEROUS").unwrap(); let a = shell_escape::unix::escape(&x); let b = html_escape::encode_safe(&a); Command::new("sh").arg(b).status().unwrap(); } "#; let (cfg, entry, summaries) = parse_rust(src); let findings = analyse_file(&cfg, entry, &summaries, None, Lang::Rust, "test.rs", &[]); assert!( findings.is_empty(), "both sanitizers together should strip all relevant bits" ); } #[test] fn taint_through_variable_reassignment() { let src = br#" use std::{env, process::Command}; fn main() { let x = env::var("DANGEROUS").unwrap(); let y = x; Command::new("sh").arg(y).status().unwrap(); } "#; let (cfg, entry, summaries) = parse_rust(src); let findings = analyse_file(&cfg, entry, &summaries, None, Lang::Rust, "test.rs", &[]); assert_eq!( findings.len(), 1, "taint should flow through simple variable reassignment" ); } #[test] fn untainted_variable_at_sink_is_safe() { // A string literal (not from a source) passed to Command — no finding. let src = br#" use std::process::Command; fn main() { let x = "harmless"; Command::new("sh").arg(x).status().unwrap(); } "#; let (cfg, entry, summaries) = parse_rust(src); let findings = analyse_file(&cfg, entry, &summaries, None, Lang::Rust, "test.rs", &[]); assert!( findings.is_empty(), "untainted literal should not trigger a finding" ); } #[test] fn local_summary_takes_precedence_over_global() { use crate::summary::FuncSummary; // The caller file defines my_func locally as a source. // Global says my_func is a sanitizer. // Local should win → finding expected. let caller_src = br#" use std::{env, process::Command}; fn my_func() -> String { env::var("SECRET").unwrap() } fn main() { let x = my_func(); Command::new("sh").arg(x).status().unwrap(); } "#; let mut global = GlobalSummaries::new(); let key = FuncKey { lang: Lang::Rust, namespace: "other.rs".into(), name: "my_func".into(), arity: Some(0), }; global.insert( key, FuncSummary { name: "my_func".into(), file_path: "other.rs".into(), lang: "rust".into(), param_count: 0, param_names: vec![], source_caps: 0, sanitizer_caps: Cap::all().bits(), sink_caps: 0, propagates_taint: true, tainted_sink_params: vec![], callees: vec![], }, ); let (cfg, entry, local) = parse_rust(caller_src); let findings = analyse_file( &cfg, entry, &local, Some(&global), Lang::Rust, "test.rs", &[], ); assert_eq!( findings.len(), 1, "local summary (source) should take precedence over global (sanitizer)" ); } #[test] fn empty_global_summaries_same_as_none() { let src = br#" use std::{env, process::Command}; fn main() { let x = env::var("DANGEROUS").unwrap(); Command::new("sh").arg(x).status().unwrap(); } "#; let (cfg, entry, summaries) = parse_rust(src); let findings_none = analyse_file(&cfg, entry, &summaries, None, Lang::Rust, "test.rs", &[]); let empty = GlobalSummaries::new(); let findings_empty = analyse_file( &cfg, entry, &summaries, Some(&empty), Lang::Rust, "test.rs", &[], ); assert_eq!( findings_none.len(), findings_empty.len(), "empty GlobalSummaries should behave identically to None" ); } #[test] fn taint_not_introduced_by_non_source_function() { // Call an unknown function (no summary anywhere), assign to var, pass to sink. // Unknown calls should NOT introduce taint. let src = br#" use std::process::Command; fn main() { let x = totally_unknown_func(); Command::new("sh").arg(x).status().unwrap(); } "#; let (cfg, entry, summaries) = parse_rust(src); let findings = analyse_file(&cfg, entry, &summaries, None, Lang::Rust, "test.rs", &[]); assert!( findings.is_empty(), "unknown function call should not introduce taint" ); } #[test] fn source_and_sink_on_same_function() { use crate::summary::FuncSummary; // Cross-file function that is both source AND sink. // Tainted arg hits sink → 1 finding. let mut global = GlobalSummaries::new(); let key = FuncKey { lang: Lang::Rust, namespace: "lib.rs".into(), name: "source_and_sink".into(), arity: Some(1), }; global.insert( key, FuncSummary { name: "source_and_sink".into(), file_path: "lib.rs".into(), lang: "rust".into(), param_count: 1, param_names: vec!["input".into()], source_caps: Cap::all().bits(), sanitizer_caps: 0, sink_caps: Cap::SHELL_ESCAPE.bits(), propagates_taint: false, tainted_sink_params: vec![0], callees: vec![], }, ); // Pass tainted data from env::var into source_and_sink. let src = br#" use std::env; fn main() { let x = env::var("DANGEROUS").unwrap(); source_and_sink(x); } "#; let (cfg, entry, local) = parse_rust(src); let findings = analyse_file( &cfg, entry, &local, Some(&global), Lang::Rust, "test.rs", &[], ); assert_eq!( findings.len(), 1, "function that is both source and sink should detect tainted arg as finding" ); } #[test] fn multiple_cross_file_sources_one_sanitised() { use crate::summary::FuncSummary; let mut global = GlobalSummaries::new(); // Two cross-file sources let key1 = FuncKey { lang: Lang::Rust, namespace: "lib.rs".into(), name: "get_secret".into(), arity: Some(0), }; global.insert( key1, FuncSummary { name: "get_secret".into(), file_path: "lib.rs".into(), lang: "rust".into(), param_count: 0, param_names: vec![], source_caps: Cap::all().bits(), sanitizer_caps: 0, sink_caps: 0, propagates_taint: false, tainted_sink_params: vec![], callees: vec![], }, ); let key2 = FuncKey { lang: Lang::Rust, namespace: "lib.rs".into(), name: "get_other_secret".into(), arity: Some(0), }; global.insert( key2, FuncSummary { name: "get_other_secret".into(), file_path: "lib.rs".into(), lang: "rust".into(), param_count: 0, param_names: vec![], source_caps: Cap::all().bits(), sanitizer_caps: 0, sink_caps: 0, propagates_taint: false, tainted_sink_params: vec![], callees: vec![], }, ); // One source sanitised, one not. let src = br#" use std::process::Command; fn main() { let a = get_secret(); let b = get_other_secret(); let clean_a = shell_escape::unix::escape(&a); Command::new("sh").arg(clean_a).status().unwrap(); Command::new("sh").arg(b).status().unwrap(); } "#; let (cfg, entry, local) = parse_rust(src); let findings = analyse_file( &cfg, entry, &local, Some(&global), Lang::Rust, "test.rs", &[], ); assert_eq!( findings.len(), 1, "only the unsanitised cross-file source should produce a finding" ); } // ───────────────────────────────────────────────────────────────────────────── // Multi-language helpers and tests // ───────────────────────────────────────────────────────────────────────────── /// Parse source bytes for any supported language → (cfg, entry, local_summaries) fn parse_lang( src: &[u8], slug: &str, ts_lang: tree_sitter::Language, ) -> (Cfg, NodeIndex, FuncSummaries) { use crate::cfg::build_cfg; let mut parser = tree_sitter::Parser::new(); parser.set_language(&ts_lang).unwrap(); let tree = parser.parse(src, None).unwrap(); let ext = match slug { "rust" => "test.rs", "javascript" => "test.js", "typescript" => "test.ts", "python" => "test.py", "go" => "test.go", "java" => "test.java", "c" => "test.c", "cpp" => "test.cpp", "php" => "test.php", "ruby" => "test.rb", _ => "test.txt", }; build_cfg(&tree, src, slug, ext, None) } #[test] fn js_source_to_sink() { let src = b"function main() {\n let x = document.location();\n eval(x);\n}\n"; let lang = tree_sitter::Language::from(tree_sitter_javascript::LANGUAGE); let (cfg, entry, summaries) = parse_lang(src, "javascript", lang); let findings = analyse_file( &cfg, entry, &summaries, None, Lang::JavaScript, "test.js", &[], ); assert_eq!( findings.len(), 1, "JS: source->sink should produce 1 finding" ); } #[test] fn ts_source_to_sink() { let src = b"function main() {\n let x = document.location();\n eval(x);\n}\n"; let lang = tree_sitter::Language::from(tree_sitter_typescript::LANGUAGE_TYPESCRIPT); let (cfg, entry, summaries) = parse_lang(src, "typescript", lang); let findings = analyse_file( &cfg, entry, &summaries, None, Lang::TypeScript, "test.ts", &[], ); assert_eq!( findings.len(), 1, "TS: source->sink should produce 1 finding" ); } #[test] fn python_source_to_sink() { let src = b"def main():\n x = os.getenv(\"SECRET\")\n os.system(x)\n"; let lang = tree_sitter::Language::from(tree_sitter_python::LANGUAGE); let (cfg, entry, summaries) = parse_lang(src, "python", lang); let findings = analyse_file(&cfg, entry, &summaries, None, Lang::Python, "test.py", &[]); assert_eq!( findings.len(), 1, "Python: source->sink should produce 1 finding" ); } #[test] fn go_source_to_sink() { let src = b"package main\n\nfunc main() {\n\tx := os.Getenv(\"SECRET\")\n\texec.Command(x)\n}\n"; let lang = tree_sitter::Language::from(tree_sitter_go::LANGUAGE); let (cfg, entry, summaries) = parse_lang(src, "go", lang); let findings = analyse_file(&cfg, entry, &summaries, None, Lang::Go, "test.go", &[]); assert_eq!( findings.len(), 1, "Go: source->sink should produce 1 finding" ); } #[test] fn java_source_to_sink() { let src = b"class Main {\n void main() {\n String x = System.getenv(\"SECRET\");\n Runtime.exec(x);\n }\n}\n"; let lang = tree_sitter::Language::from(tree_sitter_java::LANGUAGE); let (cfg, entry, summaries) = parse_lang(src, "java", lang); let findings = analyse_file(&cfg, entry, &summaries, None, Lang::Java, "test.java", &[]); assert_eq!( findings.len(), 1, "Java: source->sink should produce 1 finding" ); } #[test] fn c_source_to_sink() { let src = b"void main() {\n char* x = getenv(\"SECRET\");\n system(x);\n}\n"; let lang = tree_sitter::Language::from(tree_sitter_c::LANGUAGE); let (cfg, entry, summaries) = parse_lang(src, "c", lang); let findings = analyse_file(&cfg, entry, &summaries, None, Lang::C, "test.c", &[]); assert_eq!( findings.len(), 1, "C: source->sink should produce 1 finding" ); } #[test] fn cpp_source_to_sink() { let src = b"void main() {\n char* x = getenv(\"SECRET\");\n system(x);\n}\n"; let lang = tree_sitter::Language::from(tree_sitter_cpp::LANGUAGE); let (cfg, entry, summaries) = parse_lang(src, "cpp", lang); let findings = analyse_file(&cfg, entry, &summaries, None, Lang::Cpp, "test.cpp", &[]); assert_eq!( findings.len(), 1, "C++: source->sink should produce 1 finding" ); } #[test] fn php_source_to_sink() { let src = b""; let lang = tree_sitter::Language::from(tree_sitter_php::LANGUAGE_PHP); let (cfg, entry, summaries) = parse_lang(src, "php", lang); let findings = analyse_file(&cfg, entry, &summaries, None, Lang::Php, "test.php", &[]); assert_eq!( findings.len(), 1, "PHP: source->sink should produce 1 finding" ); } #[test] fn ruby_source_to_sink() { let src = b"def main\n x = gets()\n system(x)\nend\n"; let lang = tree_sitter::Language::from(tree_sitter_ruby::LANGUAGE); let (cfg, entry, summaries) = parse_lang(src, "ruby", lang); let findings = analyse_file(&cfg, entry, &summaries, None, Lang::Ruby, "test.rb", &[]); assert_eq!( findings.len(), 1, "Ruby: source->sink should produce 1 finding" ); } // ───────────────────────────────────────────────────────────────────────────── // Cross-language multi-file tests // ───────────────────────────────────────────────────────────────────────────── // // Cross-language resolution now requires explicit InteropEdge declarations. // Without an edge, functions from different languages are never resolved — // this prevents false positives from name collisions across languages. /// Extract cross-file summaries from any language's source bytes. fn extract_lang_summaries( src: &[u8], slug: &str, ts_lang: tree_sitter::Language, path: &str, ) -> Vec { use crate::cfg::export_summaries; let (_, _, local) = parse_lang(src, slug, ts_lang); export_summaries(&local, path, slug) } // ── Scenario 1: Python source function → JavaScript sink via interop ───── #[test] fn cross_lang_python_source_to_js_sink_via_interop() { use crate::interop::CallSiteKey; use crate::summary::merge_summaries; let py_src = b"def get_input():\n x = os.getenv(\"SECRET\")\n return x\n"; let py_lang = tree_sitter::Language::from(tree_sitter_python::LANGUAGE); let py_summaries = extract_lang_summaries(py_src, "python", py_lang, "lib.py"); let global = merge_summaries(py_summaries, None); // JavaScript file calls get_input() and passes to eval() let js_src = b"function main() {\n let x = get_input();\n eval(x);\n}\n"; let js_lang = tree_sitter::Language::from(tree_sitter_javascript::LANGUAGE); let (cfg, entry, local) = parse_lang(js_src, "javascript", js_lang); // Without interop: no cross-lang resolution let findings = analyse_file( &cfg, entry, &local, Some(&global), Lang::JavaScript, "main.js", &[], ); assert!(findings.is_empty(), "No cross-lang without interop edge"); // With interop edge let edges = vec![InteropEdge { from: CallSiteKey { caller_lang: Lang::JavaScript, caller_namespace: "main.js".into(), caller_func: "main".into(), callee_symbol: "get_input".into(), ordinal: 0, }, to: FuncKey { lang: Lang::Python, namespace: "lib.py".into(), name: "get_input".into(), arity: Some(0), }, arg_map: vec![], ret_taints: true, }]; let findings = analyse_file( &cfg, entry, &local, Some(&global), Lang::JavaScript, "main.js", &edges, ); assert_eq!( findings.len(), 1, "Python source → JS sink via interop edge" ); } // ── Scenario 2: Go source function → Python sink via interop ───────────── #[test] fn cross_lang_go_source_to_python_sink_via_interop() { use crate::interop::CallSiteKey; use crate::summary::merge_summaries; let go_src = b"package main\n\nfunc fetch_env() string {\n\tx := os.Getenv(\"SECRET\")\n\treturn x\n}\n"; let go_lang = tree_sitter::Language::from(tree_sitter_go::LANGUAGE); let go_summaries = extract_lang_summaries(go_src, "go", go_lang, "lib.go"); let global = merge_summaries(go_summaries, None); let py_src = b"def main():\n x = fetch_env()\n os.system(x)\n"; let py_lang = tree_sitter::Language::from(tree_sitter_python::LANGUAGE); let (cfg, entry, local) = parse_lang(py_src, "python", py_lang); // Without interop: no findings let findings = analyse_file( &cfg, entry, &local, Some(&global), Lang::Python, "main.py", &[], ); assert!(findings.is_empty(), "No cross-lang without interop"); // With interop let edges = vec![InteropEdge { from: CallSiteKey { caller_lang: Lang::Python, caller_namespace: "main.py".into(), caller_func: "main".into(), callee_symbol: "fetch_env".into(), ordinal: 0, }, to: FuncKey { lang: Lang::Go, namespace: "lib.go".into(), name: "fetch_env".into(), arity: Some(0), }, arg_map: vec![], ret_taints: true, }]; let findings = analyse_file( &cfg, entry, &local, Some(&global), Lang::Python, "main.py", &edges, ); assert_eq!(findings.len(), 1, "Go source → Python sink via interop"); } // ── Scenario 3: Rust sanitizer applied in JavaScript context via interop ── #[test] fn cross_lang_rust_sanitizer_in_js_via_interop() { use crate::interop::CallSiteKey; use crate::summary::merge_summaries; let rs_src = br#" fn clean_shell(s: &str) -> String { shell_escape::unix::escape(s).to_string() } "#; let rs_lang = tree_sitter::Language::from(tree_sitter_rust::LANGUAGE); let rs_summaries = extract_lang_summaries(rs_src, "rust", rs_lang, "lib.rs"); let global = merge_summaries(rs_summaries, None); // JS: source → Rust sanitizer → shell sink let js_src = b"function main() {\n let x = document.location();\n let y = clean_shell(x);\n eval(y);\n}\n"; let js_lang = tree_sitter::Language::from(tree_sitter_javascript::LANGUAGE); let (cfg, entry, local) = parse_lang(js_src, "javascript", js_lang); let edges = vec![InteropEdge { from: CallSiteKey { caller_lang: Lang::JavaScript, caller_namespace: "main.js".into(), caller_func: "main".into(), callee_symbol: "clean_shell".into(), ordinal: 0, }, to: FuncKey { lang: Lang::Rust, namespace: "lib.rs".into(), name: "clean_shell".into(), arity: Some(1), }, arg_map: vec![], ret_taints: true, }]; let findings = analyse_file( &cfg, entry, &local, Some(&global), Lang::JavaScript, "main.js", &edges, ); assert!( findings.is_empty(), "Rust SHELL_ESCAPE sanitizer should neutralise taint via interop" ); } // ── Scenario 4: C sink function called from Java via interop ───────────── #[test] fn cross_lang_c_sink_called_from_java_via_interop() { use crate::interop::CallSiteKey; use crate::summary::merge_summaries; let c_src = b"void run_cmd(char* cmd) {\n system(cmd);\n}\n"; let c_lang = tree_sitter::Language::from(tree_sitter_c::LANGUAGE); let c_summaries = extract_lang_summaries(c_src, "c", c_lang, "native.c"); let global = merge_summaries(c_summaries, None); let java_src = b"class Main {\n void main() {\n String x = System.getenv(\"INPUT\");\n run_cmd(x);\n }\n}\n"; let java_lang = tree_sitter::Language::from(tree_sitter_java::LANGUAGE); let (cfg, entry, local) = parse_lang(java_src, "java", java_lang); let edges = vec![InteropEdge { from: CallSiteKey { caller_lang: Lang::Java, caller_namespace: "Main.java".into(), caller_func: "main".into(), callee_symbol: "run_cmd".into(), ordinal: 0, }, to: FuncKey { lang: Lang::C, namespace: "native.c".into(), name: "run_cmd".into(), arity: Some(0), // C param extraction yields 0 (pre-existing limitation) }, arg_map: vec![], ret_taints: false, }]; let findings = analyse_file( &cfg, entry, &local, Some(&global), Lang::Java, "Main.java", &edges, ); assert_eq!(findings.len(), 1, "Java source → C sink via interop"); } // ── Scenario 5: Multi-language summary merge with interop ──────────────── #[test] fn cross_lang_three_languages_merged_summaries_via_interop() { use crate::interop::CallSiteKey; use crate::summary::merge_summaries; // Python: source function let py_src = b"def get_secret():\n x = os.getenv(\"SECRET\")\n return x\n"; let py_lang = tree_sitter::Language::from(tree_sitter_python::LANGUAGE); let py_sums = extract_lang_summaries(py_src, "python", py_lang, "source.py"); // C: sink function let c_src = b"void run_dangerous(char* cmd) {\n system(cmd);\n}\n"; let c_lang = tree_sitter::Language::from(tree_sitter_c::LANGUAGE); let c_sums = extract_lang_summaries(c_src, "c", c_lang, "native.c"); // Rust: sanitizer function let rs_src = br#" fn make_safe(s: &str) -> String { shell_escape::unix::escape(s).to_string() } "#; let rs_lang = tree_sitter::Language::from(tree_sitter_rust::LANGUAGE); let rs_sums = extract_lang_summaries(rs_src, "rust", rs_lang, "lib.rs"); let all_sums: Vec<_> = py_sums.into_iter().chain(c_sums).chain(rs_sums).collect(); let global = merge_summaries(all_sums, None); // Go caller: source → sanitizer → sink (all cross-language) let go_src = b"package main\n\nfunc main() {\n\tx := get_secret()\n\ty := make_safe(x)\n\trun_dangerous(y)\n}\n"; let go_lang = tree_sitter::Language::from(tree_sitter_go::LANGUAGE); let (cfg, entry, local) = parse_lang(go_src, "go", go_lang); let edges = vec![ InteropEdge { from: CallSiteKey { caller_lang: Lang::Go, caller_namespace: "main.go".into(), caller_func: "main".into(), callee_symbol: "get_secret".into(), ordinal: 0, }, to: FuncKey { lang: Lang::Python, namespace: "source.py".into(), name: "get_secret".into(), arity: Some(0), }, arg_map: vec![], ret_taints: true, }, InteropEdge { from: CallSiteKey { caller_lang: Lang::Go, caller_namespace: "main.go".into(), caller_func: "main".into(), callee_symbol: "make_safe".into(), ordinal: 0, }, to: FuncKey { lang: Lang::Rust, namespace: "lib.rs".into(), name: "make_safe".into(), arity: Some(1), }, arg_map: vec![], ret_taints: true, }, InteropEdge { from: CallSiteKey { caller_lang: Lang::Go, caller_namespace: "main.go".into(), caller_func: "main".into(), callee_symbol: "run_dangerous".into(), ordinal: 0, }, to: FuncKey { lang: Lang::C, namespace: "native.c".into(), name: "run_dangerous".into(), arity: Some(0), // C param extraction yields 0 (pre-existing limitation) }, arg_map: vec![], ret_taints: false, }, ]; let findings = analyse_file( &cfg, entry, &local, Some(&global), Lang::Go, "main.go", &edges, ); assert!( findings.is_empty(), "source(Py) → sanitizer(Rs) → sink(C) via interop should be safe; got {} findings", findings.len() ); } // ── Scenario 6: Same flow without sanitizer should flag via interop ────── #[test] fn cross_lang_three_languages_unsanitised_via_interop() { use crate::interop::CallSiteKey; use crate::summary::merge_summaries; let py_src = b"def get_secret():\n x = os.getenv(\"SECRET\")\n return x\n"; let py_lang = tree_sitter::Language::from(tree_sitter_python::LANGUAGE); let py_sums = extract_lang_summaries(py_src, "python", py_lang, "source.py"); let c_src = b"void run_dangerous(char* cmd) {\n system(cmd);\n}\n"; let c_lang = tree_sitter::Language::from(tree_sitter_c::LANGUAGE); let c_sums = extract_lang_summaries(c_src, "c", c_lang, "native.c"); let all_sums: Vec<_> = py_sums.into_iter().chain(c_sums).collect(); let global = merge_summaries(all_sums, None); // Go caller: source → sink directly (no sanitizer) let go_src = b"package main\n\nfunc main() {\n\tx := get_secret()\n\trun_dangerous(x)\n}\n"; let go_lang = tree_sitter::Language::from(tree_sitter_go::LANGUAGE); let (cfg, entry, local) = parse_lang(go_src, "go", go_lang); let edges = vec![ InteropEdge { from: CallSiteKey { caller_lang: Lang::Go, caller_namespace: "main.go".into(), caller_func: "main".into(), callee_symbol: "get_secret".into(), ordinal: 0, }, to: FuncKey { lang: Lang::Python, namespace: "source.py".into(), name: "get_secret".into(), arity: Some(0), }, arg_map: vec![], ret_taints: true, }, InteropEdge { from: CallSiteKey { caller_lang: Lang::Go, caller_namespace: "main.go".into(), caller_func: "main".into(), callee_symbol: "run_dangerous".into(), ordinal: 0, }, to: FuncKey { lang: Lang::C, namespace: "native.c".into(), name: "run_dangerous".into(), arity: Some(0), // C param extraction yields 0 (pre-existing limitation) }, arg_map: vec![], ret_taints: false, }, ]; let findings = analyse_file( &cfg, entry, &local, Some(&global), Lang::Go, "main.go", &edges, ); assert_eq!( findings.len(), 1, "source(Py) → sink(C) without sanitizer via interop" ); } // ── Scenario 7: Name collision across languages stays separate ─────────── #[test] fn cross_lang_name_collision_stays_separate() { use crate::summary::merge_summaries; // Python version: source let py_src = b"def process_data():\n x = os.getenv(\"DATA\")\n return x\n"; let py_lang = tree_sitter::Language::from(tree_sitter_python::LANGUAGE); let py_sums = extract_lang_summaries(py_src, "python", py_lang, "handler.py"); // C version: benign passthrough (constructed manually) let c_summary = crate::summary::FuncSummary { name: "process_data".into(), file_path: "handler.c".into(), lang: "c".into(), param_count: 1, param_names: vec!["s".into()], source_caps: 0, sanitizer_caps: 0, sink_caps: 0, propagates_taint: true, tainted_sink_params: vec![], callees: vec![], }; let all_sums: Vec<_> = py_sums .into_iter() .chain(std::iter::once(c_summary)) .collect(); let global = merge_summaries(all_sums, None); // Verify they are stored under different FuncKeys let py_matches = global.lookup_same_lang(Lang::Python, "process_data"); let c_matches = global.lookup_same_lang(Lang::C, "process_data"); assert_eq!(py_matches.len(), 1, "Python version stored separately"); assert_eq!(c_matches.len(), 1, "C version stored separately"); // Python's source_caps should NOT bleed into C assert!(py_matches[0].1.source_caps != 0, "Python has source caps"); assert_eq!( c_matches[0].1.source_caps, 0, "C should NOT get Python's source caps" ); } // ── Scenario 8: Ruby passthrough in JS via interop ─────────────────────── #[test] fn cross_lang_ruby_passthrough_in_js_via_interop() { use crate::interop::CallSiteKey; use crate::summary::FuncSummary; let mut global = GlobalSummaries::new(); let key = FuncKey { lang: Lang::Ruby, namespace: "helper.rb".into(), name: "transform".into(), arity: Some(1), }; global.insert( key.clone(), FuncSummary { name: "transform".into(), file_path: "helper.rb".into(), lang: "ruby".into(), param_count: 1, param_names: vec!["data".into()], source_caps: 0, sanitizer_caps: 0, sink_caps: 0, propagates_taint: true, tainted_sink_params: vec![], callees: vec![], }, ); let js_src = b"function main() {\n let x = document.location();\n let y = transform(x);\n eval(y);\n}\n"; let js_lang = tree_sitter::Language::from(tree_sitter_javascript::LANGUAGE); let (cfg, entry, local) = parse_lang(js_src, "javascript", js_lang); let edges = vec![InteropEdge { from: CallSiteKey { caller_lang: Lang::JavaScript, caller_namespace: "main.js".into(), caller_func: "main".into(), callee_symbol: "transform".into(), ordinal: 0, }, to: key, arg_map: vec![], ret_taints: true, }]; let findings = analyse_file( &cfg, entry, &local, Some(&global), Lang::JavaScript, "main.js", &edges, ); assert_eq!( findings.len(), 1, "taint should propagate through cross-lang passthrough via interop" ); } // ── Scenario 9: PHP source → Go sink via interop ───────────────────────── #[test] fn cross_lang_php_source_to_go_sink_via_interop() { use crate::interop::CallSiteKey; use crate::summary::{FuncSummary, merge_summaries}; let php_summary = FuncSummary { name: "read_input".into(), file_path: "input.php".into(), lang: "php".into(), param_count: 0, param_names: vec![], source_caps: Cap::all().bits(), sanitizer_caps: 0, sink_caps: 0, propagates_taint: false, tainted_sink_params: vec![], callees: vec!["file_get_contents".into()], }; let global = merge_summaries(vec![php_summary], None); let go_src = b"package main\n\nfunc main() {\n\tx := read_input()\n\texec.Command(x)\n}\n"; let go_lang = tree_sitter::Language::from(tree_sitter_go::LANGUAGE); let (cfg, entry, local) = parse_lang(go_src, "go", go_lang); let edges = vec![InteropEdge { from: CallSiteKey { caller_lang: Lang::Go, caller_namespace: "main.go".into(), caller_func: "main".into(), callee_symbol: "read_input".into(), ordinal: 0, }, to: FuncKey { lang: Lang::Php, namespace: "input.php".into(), name: "read_input".into(), arity: Some(0), }, arg_map: vec![], ret_taints: true, }]; let findings = analyse_file( &cfg, entry, &local, Some(&global), Lang::Go, "main.go", &edges, ); assert_eq!(findings.len(), 1, "PHP source → Go sink via interop"); } // ── Scenario 10: Wrong sanitizer caps still wrong across languages ─────── #[test] fn cross_lang_wrong_sanitizer_still_flags_via_interop() { use crate::interop::CallSiteKey; use crate::summary::FuncSummary; let mut global = GlobalSummaries::new(); let key = FuncKey { lang: Lang::Python, namespace: "sanitizers.py".into(), name: "html_clean".into(), arity: Some(1), }; global.insert( key.clone(), FuncSummary { name: "html_clean".into(), file_path: "sanitizers.py".into(), lang: "python".into(), param_count: 1, param_names: vec!["text".into()], source_caps: 0, sanitizer_caps: Cap::HTML_ESCAPE.bits(), sink_caps: 0, propagates_taint: true, tainted_sink_params: vec![], callees: vec![], }, ); // JS: source → Python HTML sanitizer → shell sink let js_src = b"function main() {\n let x = document.location();\n let y = html_clean(x);\n eval(y);\n}\n"; let js_lang = tree_sitter::Language::from(tree_sitter_javascript::LANGUAGE); let (cfg, entry, local) = parse_lang(js_src, "javascript", js_lang); let edges = vec![InteropEdge { from: CallSiteKey { caller_lang: Lang::JavaScript, caller_namespace: "main.js".into(), caller_func: "main".into(), callee_symbol: "html_clean".into(), ordinal: 0, }, to: key, arg_map: vec![], ret_taints: true, }]; let findings = analyse_file( &cfg, entry, &local, Some(&global), Lang::JavaScript, "main.js", &edges, ); assert_eq!( findings.len(), 1, "wrong cross-language sanitizer should NOT neutralise" ); } // ── Scenario 11: Summary lang field preserved (different FuncKeys) ─────── #[test] fn cross_lang_summary_preserves_lang_metadata() { use crate::summary::merge_summaries; let py_summary = crate::summary::FuncSummary { name: "helper".into(), file_path: "lib.py".into(), lang: "python".into(), param_count: 0, param_names: vec![], source_caps: Cap::all().bits(), sanitizer_caps: 0, sink_caps: 0, propagates_taint: false, tainted_sink_params: vec![], callees: vec![], }; let js_summary = crate::summary::FuncSummary { name: "helper".into(), file_path: "lib.js".into(), lang: "javascript".into(), param_count: 1, param_names: vec!["x".into()], source_caps: 0, sanitizer_caps: 0, sink_caps: Cap::SHELL_ESCAPE.bits(), propagates_taint: true, tainted_sink_params: vec![0], callees: vec![], }; let global = merge_summaries(vec![py_summary, js_summary], None); // They are now separate entries — not merged let py_matches = global.lookup_same_lang(Lang::Python, "helper"); let js_matches = global.lookup_same_lang(Lang::JavaScript, "helper"); assert_eq!(py_matches.len(), 1, "Python helper stored separately"); assert_eq!(js_matches.len(), 1, "JS helper stored separately"); assert!( py_matches[0].1.source_caps != 0, "Python source caps preserved" ); assert!(js_matches[0].1.sink_caps != 0, "JS sink caps preserved"); assert!( js_matches[0].1.propagates_taint, "JS propagates_taint preserved" ); } // ── Scenario 12: Full pipeline Python lib + JS caller via interop ──────── #[test] fn cross_lang_full_pipeline_python_lib_js_caller_via_interop() { use crate::interop::CallSiteKey; use crate::summary::merge_summaries; // Python library: defines dangerous_query() that reads from os.getenv let py_src = b"def dangerous_query():\n x = os.getenv(\"SQL\")\n return x\n"; let py_lang = tree_sitter::Language::from(tree_sitter_python::LANGUAGE); let py_sums = extract_lang_summaries(py_src, "python", py_lang, "db.py"); // JavaScript library: defines run_query() that calls eval (a sink) let js_lib_src = b"function run_query(q) {\n eval(q);\n}\n"; let js_lang = tree_sitter::Language::from(tree_sitter_javascript::LANGUAGE); let js_sums = extract_lang_summaries(js_lib_src, "javascript", js_lang, "db.js"); let all_sums: Vec<_> = py_sums.into_iter().chain(js_sums).collect(); let global = merge_summaries(all_sums, None); // Go caller: dangerous_query() → run_query() let go_src = b"package main\n\nfunc main() {\n\tq := dangerous_query()\n\trun_query(q)\n}\n"; let go_lang = tree_sitter::Language::from(tree_sitter_go::LANGUAGE); let (cfg, entry, local) = parse_lang(go_src, "go", go_lang); let edges = vec![ InteropEdge { from: CallSiteKey { caller_lang: Lang::Go, caller_namespace: "main.go".into(), caller_func: "main".into(), callee_symbol: "dangerous_query".into(), ordinal: 0, }, to: FuncKey { lang: Lang::Python, namespace: "db.py".into(), name: "dangerous_query".into(), arity: Some(0), }, arg_map: vec![], ret_taints: true, }, InteropEdge { from: CallSiteKey { caller_lang: Lang::Go, caller_namespace: "main.go".into(), caller_func: "main".into(), callee_symbol: "run_query".into(), ordinal: 0, }, to: FuncKey { lang: Lang::JavaScript, namespace: "db.js".into(), name: "run_query".into(), arity: Some(1), }, arg_map: vec![], ret_taints: false, }, ]; let findings = analyse_file( &cfg, entry, &local, Some(&global), Lang::Go, "main.go", &edges, ); assert_eq!( findings.len(), 1, "Python source → JS sink via Go caller via interop" ); } // ── New tests: ambiguous resolution, interop edge specificity ──────────── #[test] fn ambiguous_resolution_returns_none() { use crate::summary::FuncSummary; // Two same-lang functions, same name + arity, different namespaces let mut global = GlobalSummaries::new(); for ns in &["a.rs", "b.rs"] { let key = FuncKey { lang: Lang::Rust, namespace: (*ns).to_string(), name: "helper".into(), arity: Some(0), }; global.insert( key, FuncSummary { name: "helper".into(), file_path: (*ns).to_string(), lang: "rust".into(), param_count: 0, param_names: vec![], source_caps: Cap::all().bits(), sanitizer_caps: 0, sink_caps: 0, propagates_taint: false, tainted_sink_params: vec![], callees: vec![], }, ); } // Caller from c.rs calls helper() — ambiguous (two matches, neither is caller's namespace) let src = br#" use std::process::Command; fn main() { let x = helper(); Command::new("sh").arg(x).status().unwrap(); } "#; let (cfg, entry, local) = parse_rust(src); let findings = analyse_file(&cfg, entry, &local, Some(&global), Lang::Rust, "c.rs", &[]); // Ambiguous resolution returns None → no source → no finding assert!( findings.is_empty(), "ambiguous resolution (two namespaces) should return None → no finding" ); } #[test] fn exact_namespace_match_wins() { use crate::summary::FuncSummary; // Same name in two namespaces, but one matches caller's namespace let mut global = GlobalSummaries::new(); // test.rs version: source let key_local = FuncKey { lang: Lang::Rust, namespace: "test.rs".into(), name: "helper".into(), arity: Some(0), }; global.insert( key_local, FuncSummary { name: "helper".into(), file_path: "test.rs".into(), lang: "rust".into(), param_count: 0, param_names: vec![], source_caps: Cap::all().bits(), sanitizer_caps: 0, sink_caps: 0, propagates_taint: false, tainted_sink_params: vec![], callees: vec![], }, ); // other.rs version: no caps let key_other = FuncKey { lang: Lang::Rust, namespace: "other.rs".into(), name: "helper".into(), arity: Some(0), }; global.insert( key_other, FuncSummary { name: "helper".into(), file_path: "other.rs".into(), lang: "rust".into(), param_count: 0, param_names: vec![], source_caps: 0, sanitizer_caps: 0, sink_caps: 0, propagates_taint: false, tainted_sink_params: vec![], callees: vec![], }, ); let src = br#" use std::process::Command; fn main() { let x = helper(); Command::new("sh").arg(x).status().unwrap(); } "#; let (cfg, entry, local) = parse_rust(src); // caller_namespace = "test.rs" matches the source version let findings = analyse_file( &cfg, entry, &local, Some(&global), Lang::Rust, "test.rs", &[], ); assert_eq!( findings.len(), 1, "exact namespace match should resolve to the source version" ); } #[test] fn interop_edge_wrong_caller_lang_no_match() { use crate::interop::CallSiteKey; use crate::summary::FuncSummary; let mut global = GlobalSummaries::new(); let key = FuncKey { lang: Lang::Python, namespace: "lib.py".into(), name: "get_data".into(), arity: Some(0), }; global.insert( key.clone(), FuncSummary { name: "get_data".into(), file_path: "lib.py".into(), lang: "python".into(), param_count: 0, param_names: vec![], source_caps: Cap::all().bits(), sanitizer_caps: 0, sink_caps: 0, propagates_taint: false, tainted_sink_params: vec![], callees: vec![], }, ); // Edge specifies Python caller, but we're calling from JavaScript let edges = vec![InteropEdge { from: CallSiteKey { caller_lang: Lang::Python, // wrong! caller_namespace: "main.js".into(), caller_func: "main".into(), callee_symbol: "get_data".into(), ordinal: 0, }, to: key, arg_map: vec![], ret_taints: true, }]; let js_src = b"function main() {\n let x = get_data();\n eval(x);\n}\n"; let js_lang = tree_sitter::Language::from(tree_sitter_javascript::LANGUAGE); let (cfg, entry, local) = parse_lang(js_src, "javascript", js_lang); let findings = analyse_file( &cfg, entry, &local, Some(&global), Lang::JavaScript, "main.js", &edges, ); assert!( findings.is_empty(), "Edge for wrong caller_lang should not match" ); } #[test] fn return_call_recognized_as_source() { use crate::cfg::{build_cfg, export_summaries}; use tree_sitter::Language; // fn foo() -> String { env::var("X").unwrap() } // The return statement contains a call to env::var which should be // recognized as a source after the return-call fix. let src = br#" use std::env; fn foo() -> String { env::var("X").unwrap() } "#; let mut parser = tree_sitter::Parser::new(); parser .set_language(&Language::from(tree_sitter_rust::LANGUAGE)) .unwrap(); let tree = parser.parse(src as &[u8], None).unwrap(); let (_, _, summaries) = build_cfg(&tree, src, "rust", "test.rs", None); let exported = export_summaries(&summaries, "test.rs", "rust"); let foo = exported .iter() .find(|s| s.name == "foo") .expect("foo should exist"); assert!( foo.source_caps != 0, "foo() should have source_caps set because env::var is called inside return" ); } // ─── Path-sensitive analysis tests ─────────────────────────────────────────── #[test] fn validate_and_early_return() { use crate::cfg::build_cfg; use tree_sitter::Language; // Validate before use: if validation fails, early return. // The sink after the guard is on the "validated" path. // // The CFG creates a synthetic pass-through node for the false path // with an explicit False edge from the If node. BFS reaches the // sink via: cond → (False) → pass-through → (Seq) → sink. // The predicate on the False edge records that `!validate(&x)` was // false (i.e. validation passed), so the sink is path-guarded. let src = br#" use std::env; use std::process::Command; fn main() { let x = env::var("INPUT").unwrap(); if !validate(&x) { return; } Command::new("sh").arg(x).status().unwrap(); }"#; let mut parser = tree_sitter::Parser::new(); parser .set_language(&Language::from(tree_sitter_rust::LANGUAGE)) .unwrap(); let tree = parser.parse(src as &[u8], None).unwrap(); let (cfg, entry, summaries) = build_cfg(&tree, src, "rust", "test.rs", None); let findings = analyse_file(&cfg, entry, &summaries, None, Lang::Rust, "test.rs", &[]); // Taint still flows (validate doesn't kill taint), but the finding // should be annotated as path_validated because the false path // (validation passed) has a ValidationCall predicate with polarity=true. assert_eq!(findings.len(), 1, "should still detect the taint flow"); assert!( findings[0].path_validated, "finding should be marked as path_validated (early-return guard detected)" ); assert_eq!( findings[0].guard_kind, Some(PredicateKind::ValidationCall), "guard_kind should be ValidationCall" ); } #[test] fn validate_in_if_else_path_validated() { use crate::cfg::build_cfg; use tree_sitter::Language; // If/else where the True branch (validation passed) contains the sink. // This IS detectable because the If node has genuine True/False branches. let src = br#" use std::env; use std::process::Command; fn main() { let x = env::var("INPUT").unwrap(); if validate(&x) { Command::new("sh").arg(&x).status().unwrap(); } else { println!("invalid input"); } }"#; let mut parser = tree_sitter::Parser::new(); parser .set_language(&Language::from(tree_sitter_rust::LANGUAGE)) .unwrap(); let tree = parser.parse(src as &[u8], None).unwrap(); let (cfg, entry, summaries) = build_cfg(&tree, src, "rust", "test.rs", None); let findings = analyse_file(&cfg, entry, &summaries, None, Lang::Rust, "test.rs", &[]); assert_eq!(findings.len(), 1, "should detect the taint flow"); assert!( findings[0].path_validated, "finding should be path_validated (sink in validated branch)" ); assert_eq!( findings[0].guard_kind, Some(PredicateKind::ValidationCall), "guard_kind should be ValidationCall" ); } #[test] fn sink_on_failed_validation_branch() { use crate::cfg::build_cfg; use tree_sitter::Language; // Sink is in the failed-validation branch (negated condition, false edge). let src = br#" use std::env; use std::process::Command; fn main() { let x = env::var("INPUT").unwrap(); if !validate(&x) { Command::new("sh").arg(&x).status().unwrap(); } }"#; let mut parser = tree_sitter::Parser::new(); parser .set_language(&Language::from(tree_sitter_rust::LANGUAGE)) .unwrap(); let tree = parser.parse(src as &[u8], None).unwrap(); let (cfg, entry, summaries) = build_cfg(&tree, src, "rust", "test.rs", None); let findings = analyse_file(&cfg, entry, &summaries, None, Lang::Rust, "test.rs", &[]); assert_eq!(findings.len(), 1, "should detect taint flow to sink"); assert!( !findings[0].path_validated, "finding should NOT be path_validated (sink is in failed-validation branch)" ); } #[test] fn contradictory_null_check_pruned() { use crate::cfg::build_cfg; use tree_sitter::Language; // Inner branch is infeasible: if x.is_none() then x cannot also be is_none(). // After early return on is_none(), the fall-through path has polarity=false // for NullCheck. The inner `if x.is_none()` True branch has polarity=true — // contradiction. let src = br#" use std::env; use std::process::Command; fn main() { let x = env::var("INPUT").ok(); if x.is_none() { return; } if x.is_none() { Command::new("sh").arg("dangerous").status().unwrap(); } }"#; let mut parser = tree_sitter::Parser::new(); parser .set_language(&Language::from(tree_sitter_rust::LANGUAGE)) .unwrap(); let tree = parser.parse(src as &[u8], None).unwrap(); let (cfg, entry, summaries) = build_cfg(&tree, src, "rust", "test.rs", None); let findings = analyse_file(&cfg, entry, &summaries, None, Lang::Rust, "test.rs", &[]); // The inner branch is infeasible, and the arg "dangerous" is a string // literal (not tainted), so there should be no findings. assert!( findings.is_empty(), "inner branch is infeasible — should produce no findings (got {})", findings.len() ); } #[test] fn sanitize_one_branch_no_regression() { use crate::cfg::build_cfg; use tree_sitter::Language; // Same as existing taint_through_if_else: sanitized in one branch, not in the other. // Verify the finding count stays at 1 (no regression from path sensitivity). let src = br#" use std::env; use std::process::Command; fn main() { let x = env::var("DANGEROUS").unwrap(); let safe = html_escape::encode_safe(&x); if x.len() > 5 { Command::new("sh").arg(&x).status().unwrap(); // UNSAFE } else { Command::new("sh").arg(&safe).status().unwrap(); // SAFE } }"#; let mut parser = tree_sitter::Parser::new(); parser .set_language(&Language::from(tree_sitter_rust::LANGUAGE)) .unwrap(); let tree = parser.parse(src as &[u8], None).unwrap(); let (cfg, entry, summaries) = build_cfg(&tree, src, "rust", "test.rs", None); let findings = analyse_file(&cfg, entry, &summaries, None, Lang::Rust, "test.rs", &[]); // Both branches produce findings: the true branch uses unsanitized `x`, // the else branch uses `safe` (HTML_ESCAPE sanitizer vs SHELL_ESCAPE sink). // Previously only 1 finding because else_clause was silently dropped from CFG. assert_eq!( findings.len(), 2, "two findings expected (both branches reach sink with wrong/no sanitizer)" ); } #[test] fn path_state_budget_graceful() { use crate::cfg::build_cfg; use tree_sitter::Language; // Deeply nested ifs with a sink at the innermost level. // PathState should truncate gracefully after MAX_PATH_PREDICATES. let src = br#" use std::env; use std::process::Command; fn main() { let x = env::var("INPUT").unwrap(); if x.len() > 1 { if x.len() > 2 { if x.len() > 3 { if x.len() > 4 { if x.len() > 5 { if x.len() > 6 { if x.len() > 7 { if x.len() > 8 { if x.len() > 9 { Command::new("sh").arg(&x).status().unwrap(); } } } } } } } } } }"#; let mut parser = tree_sitter::Parser::new(); parser .set_language(&Language::from(tree_sitter_rust::LANGUAGE)) .unwrap(); let tree = parser.parse(src as &[u8], None).unwrap(); let (cfg, entry, summaries) = build_cfg(&tree, src, "rust", "test.rs", None); let findings = analyse_file(&cfg, entry, &summaries, None, Lang::Rust, "test.rs", &[]); // Should still detect the flow — truncation shouldn't cause false negatives. assert_eq!( findings.len(), 1, "should detect taint flow even with truncated PathState" ); } #[test] fn unknown_predicate_not_pruned() { use crate::cfg::build_cfg; use tree_sitter::Language; // Comparison predicates are NOT in the contradiction whitelist, so even // seemingly contradictory comparisons should not be pruned. let src = br#" use std::env; use std::process::Command; fn main() { let x = env::var("INPUT").unwrap(); if x.len() > 5 { return; } if x.len() > 5 { Command::new("sh").arg(&x).status().unwrap(); } }"#; let mut parser = tree_sitter::Parser::new(); parser .set_language(&Language::from(tree_sitter_rust::LANGUAGE)) .unwrap(); let tree = parser.parse(src as &[u8], None).unwrap(); let (cfg, entry, summaries) = build_cfg(&tree, src, "rust", "test.rs", None); let findings = analyse_file(&cfg, entry, &summaries, None, Lang::Rust, "test.rs", &[]); // Comparison is not in the whitelist — the path should NOT be pruned. assert_eq!( findings.len(), 1, "Comparison predicate should not cause contradiction pruning" ); } #[test] fn multi_var_predicate_not_pruned() { use crate::cfg::build_cfg; use tree_sitter::Language; // Multi-variable conditions should never be pruned for contradiction, // even if the kind is in the whitelist. let src = br#" use std::env; use std::process::Command; fn main() { let x = env::var("INPUT").unwrap(); let y = env::var("OTHER").ok(); if y.is_none() { return; } if y.is_none() { Command::new("sh").arg(&x).status().unwrap(); } }"#; let mut parser = tree_sitter::Parser::new(); parser .set_language(&Language::from(tree_sitter_rust::LANGUAGE)) .unwrap(); let tree = parser.parse(src as &[u8], None).unwrap(); let (cfg, entry, summaries) = build_cfg(&tree, src, "rust", "test.rs", None); let findings = analyse_file(&cfg, entry, &summaries, None, Lang::Rust, "test.rs", &[]); // Note: y.is_none() condition references `y` and `is_none` — two idents. // Wait, `is_none` is a method — collect_idents finds `y` and `is_none` as // separate identifiers. That makes it multi-var, so contradiction should // NOT fire. However, the actual behavior depends on how many idents // collect_idents extracts from `y.is_none()`. If it returns ["y", "is_none"], // then the predicate has 2 vars → multi-var → not pruned → finding exists. assert!( !findings.is_empty(), "multi-var predicate should not be pruned; flow should be detected" ); }