nyx/tests/calibration_data_exfil.rs

//! Calibration tests for `taint-data-exfiltration` severity, confidence,
//! and rank scoring.
//!
//! These tests pin the calibration described in `docs/detectors.md` so any
//! future change to the scoring path either preserves the documented tier
//! relationships or breaks a test deliberately.
//!
//! What is checked here:
//!
//!   * Cookie source + Confirmed symbolic verdict produces High severity
//!     (cookies carry session / credential material and are treated as
//!     Secret-tier for the leak class).
//!   * Env source + Confirmed verdict produces High severity (same
//!     reasoning, env vars carry credential material).
//!   * Header / FileSystem / Database / CaughtException sources downgrade
//!     to Medium severity even with a Confirmed verdict — they are
//!     Sensitive but not credential-grade secrets.
//!   * No symbolic verdict (or `Inconclusive` / `NotAttempted`) → Low
//!     confidence (the instruction's "Inconclusive" tier; the
//!     `Confidence` enum has no separate Inconclusive variant so it
//!     floors to Low).
//!   * Opaque body (Confirmed but with empty witness) → Medium
//!     confidence; the abstract domain still produced a corroboration
//!     signal even if the witness string is bare.
//!   * `path_validated=true` drops a confidence tier (Medium → Low).
//!   * On the same source, DATA_EXFIL ranks strictly below SSRF (the
//!     taint-class bonus is +7 for data-exfil vs +10 for the generic
//!     `taint-unsanitised-flow`).

use nyx_scanner::commands::scan::Diag;
use nyx_scanner::evidence::{
    Confidence, Evidence, SpanEvidence, SymbolicVerdict, Verdict, compute_confidence,
};
use nyx_scanner::labels::SourceKind;
use nyx_scanner::patterns::{FindingCategory, Severity};
use nyx_scanner::rank::compute_attack_rank;

fn make_evidence(source_kind: SourceKind, verdict: Option<Verdict>) -> Evidence {
    Evidence {
        source: Some(SpanEvidence {
            path: "src/leak.js".into(),
            line: 1,
            col: 1,
            kind: "source".into(),
            snippet: Some("req.cookies.session".into()),
        }),
        sink: Some(SpanEvidence {
            path: "src/leak.js".into(),
            line: 5,
            col: 5,
            kind: "sink".into(),
            snippet: Some("fetch('/endpoint', { body: payload })".into()),
        }),
        source_kind: Some(source_kind),
        hop_count: Some(1),
        cap_specificity: Some(1),
        symbolic: verdict.map(|v| SymbolicVerdict {
            verdict: v,
            constraints_checked: 0,
            paths_explored: 1,
            // For Confirmed cases use the strong-witness phrasing so the
            // test exercises the same code path that real symex output
            // takes (see `compute_taint_confidence` for the analogous
            // witness-strength branch).
            witness: matches!(v, Verdict::Confirmed)
                .then(|| "tainted cookie flows to fetch body".into()),
            interproc_call_chains: vec![],
            cutoff_notes: vec![],
        }),
        ..Default::default()
    }
}

fn make_diag(
    rule_id: &str,
    severity: Severity,
    source_kind: SourceKind,
    verdict: Option<Verdict>,
    path_validated: bool,
) -> Diag {
    Diag {
        path: "src/leak.js".into(),
        line: 5,
        col: 5,
        severity,
        id: rule_id.into(),
        category: FindingCategory::Security,
        path_validated,
        guard_kind: if path_validated {
            Some("Validation".into())
        } else {
            None
        },
        message: None,
        labels: vec![],
        confidence: None,
        evidence: Some(make_evidence(source_kind, verdict)),
        rank_score: None,
        rank_reason: None,
        suppressed: false,
        suppression: None,
        rollup: None,
        finding_id: String::new(),
        alternative_finding_ids: vec![],
        stable_hash: 0,
    }
}

// ── Calibration fixture 1: Cookie source, Confirmed verdict ─────────────

#[test]
fn cookie_source_with_confirmed_verdict_is_high_medium() {
    // Severity: cookies are Secret-tier for DATA_EXFIL → High.
    // Confidence: Confirmed verdict on a Sensitive source → Medium (the
    // routing caps at Medium even with a strong witness; see
    // `compute_data_exfil_confidence`).
    let diag = make_diag(
        "taint-data-exfiltration (source 1:1)",
        Severity::High,
        SourceKind::Cookie,
        Some(Verdict::Confirmed),
        false,
    );

    let confidence = compute_confidence(&diag);
    assert_eq!(
        confidence,
        Confidence::Medium,
        "Cookie + Confirmed → Medium (DATA_EXFIL cap), got {confidence:?}"
    );
}

// ── Calibration fixture 2: Env source, Confirmed verdict ────────────────

#[test]
fn env_source_with_confirmed_verdict_is_high_medium() {
    // Env vars carry credential / config material and are treated as
    // Secret-tier alongside cookies.
    let diag = make_diag(
        "taint-data-exfiltration (source 1:1)",
        Severity::High,
        SourceKind::EnvironmentConfig,
        Some(Verdict::Confirmed),
        false,
    );

    let confidence = compute_confidence(&diag);
    assert_eq!(
        confidence,
        Confidence::Medium,
        "Env + Confirmed → Medium, got {confidence:?}"
    );
}

// ── Calibration fixture 3: Header source, opaque body (no verdict) ──────

#[test]
fn header_source_without_symex_is_medium_low() {
    // Header is Sensitive but not credential-grade; severity downgrades
    // to Medium.  No symbolic verdict → confidence Low (the "Inconclusive
    // when no symex verdict" tier from the instruction).
    let diag = make_diag(
        "taint-data-exfiltration (source 1:1)",
        Severity::Medium,
        SourceKind::Header,
        None,
        false,
    );

    let confidence = compute_confidence(&diag);
    assert_eq!(
        confidence,
        Confidence::Low,
        "Header + no verdict → Low, got {confidence:?}"
    );
}

// ── Calibration fixture 4: guarded path drops a tier ────────────────────

#[test]
fn guarded_path_drops_confidence_tier() {
    // Cookie + Confirmed would normally yield Medium confidence; the
    // path-validated flag drops it one step to Low.  Without the guard
    // the same diag must score Medium (asserted alongside to lock in
    // the delta, not just the floor).
    let unguarded = make_diag(
        "taint-data-exfiltration (source 1:1)",
        Severity::High,
        SourceKind::Cookie,
        Some(Verdict::Confirmed),
        false,
    );
    let guarded = make_diag(
        "taint-data-exfiltration (source 1:1)",
        Severity::High,
        SourceKind::Cookie,
        Some(Verdict::Confirmed),
        true,
    );

    assert_eq!(compute_confidence(&unguarded), Confidence::Medium);
    assert_eq!(
        compute_confidence(&guarded),
        Confidence::Low,
        "guarded DATA_EXFIL path must drop one confidence tier"
    );
}

// ── Calibration fixture 5: ranking — DATA_EXFIL below SSRF on same source

#[test]
fn data_exfil_ranks_below_ssrf_on_same_source() {
    // Cookie source flowing to `fetch` could fire either DATA_EXFIL (body
    // arg) or SSRF / generic taint (URL arg).  On the same severity tier
    // SSRF must outrank DATA_EXFIL because the analysis-kind bonus is +10
    // for `taint-unsanitised-flow` and +7 for `taint-data-exfiltration`.
    let exfil = make_diag(
        "taint-data-exfiltration (source 1:1)",
        Severity::High,
        SourceKind::Cookie,
        Some(Verdict::Confirmed),
        false,
    );
    let ssrf = make_diag(
        "taint-unsanitised-flow (source 1:1)",
        Severity::High,
        SourceKind::Cookie,
        Some(Verdict::Confirmed),
        false,
    );

    let exfil_score = compute_attack_rank(&exfil).score;
    let ssrf_score = compute_attack_rank(&ssrf).score;
    assert!(
        ssrf_score > exfil_score,
        "SSRF score ({ssrf_score}) must outrank DATA_EXFIL score \
         ({exfil_score}) on the same source"
    );
    // The delta is exactly the analysis-kind bonus difference (+3) — pin
    // it so accidental drift trips the test rather than silently moving
    // both bonuses in lock-step.
    assert!(
        (ssrf_score - exfil_score - 3.0).abs() < 0.001,
        "SSRF − DATA_EXFIL should equal the analysis-kind bonus delta \
         (+3); got {} ({} − {})",
        ssrf_score - exfil_score,
        ssrf_score,
        exfil_score,
    );
}

// ── Calibration fixture 6: DATA_EXFIL above AST patterns ────────────────

#[test]
fn data_exfil_ranks_above_ast_pattern() {
    // The instruction mandates DATA_EXFIL sit above informational AST
    // patterns.  Use a Medium DATA_EXFIL (header source) vs a Low AST
    // pattern (the typical AST-only banned-API match) to lock the
    // ordering in even at the weaker end of the DATA_EXFIL spectrum.
    let medium_exfil = make_diag(
        "taint-data-exfiltration (source 1:1)",
        Severity::Medium,
        SourceKind::Header,
        Some(Verdict::Confirmed),
        false,
    );
    let mut ast_pattern = make_diag(
        "js.code_exec.eval",
        Severity::Low,
        SourceKind::Unknown,
        None,
        false,
    );
    // AST patterns don't carry taint evidence; clear it so the ranker
    // takes the AST-only branch.
    ast_pattern.evidence = None;

    let exfil_score = compute_attack_rank(&medium_exfil).score;
    let ast_score = compute_attack_rank(&ast_pattern).score;
    assert!(
        exfil_score > ast_score,
        "DATA_EXFIL ({exfil_score}) must outrank AST pattern ({ast_score})"
    );
}
-												Added Cap::DATA_EXFIL and taint fp and fn fixes on real repos (#59)

* feat: Enhance data exfiltration detection with source sensitivity gating for cookies and headers

* feat: Implement cross-file data exfiltration detection with parameter-specific gate filters

* feat: Add calibration tests and refine DATA_EXFIL severity scoring logic

* feat: Introduce per-detector configuration for data exfiltration suppression

* feat: Enhance DATA_EXFIL findings with destination field tracking in diagnostics and SARIF output

* feat: Add tainted body and URL handling for data exfiltration detection

* feat: Add integration tests and fixtures for DATA_EXFIL and SSRF detection in Go

* feat: Add Java integration tests and fixtures for DATA_EXFIL detection across multiple HTTP clients

* feat: Add synthetic externals handling for closure-captured variables in SSA

* feat: Implement closure-based suppression for resource leak findings

* feat: Add regression guards for shell-injection and taint propagation in for-of destructure patterns

* feat: Implement constructor cap narrowing for data exfiltration detection in HTTP request builders

* feat: Add gated sinks for data exfiltration detection in C and C++ using curl_easy_setopt

* feat: Implement DATA_EXFIL cap parity for backwards analysis and add integration tests

* feat: Add data exfiltration sinks for various languages and enhance documentation

* refactor: Simplify formatting and improve readability in various files

* refactor: Improve readability by simplifying conditional statements and adding clippy linting

* docs: Update CHANGELOG and comments for data exfiltration features and configuration

* docs: Clarify configuration instructions for data exfiltration trusted destinations

* docs: Enhance comments for evidence routing logic in data exfiltration
											
										
										
											2026-05-01 10:59:52 -04:00
+								//! Calibration tests for `taint-data-exfiltration` severity, confidence,
 								//! and rank scoring.
 								//!
 								//! These tests pin the calibration described in `docs/detectors.md` so any
 								//! future change to the scoring path either preserves the documented tier
 								//! relationships or breaks a test deliberately.
 								//!
 								//! What is checked here:
 								//!
 								//!   * Cookie source + Confirmed symbolic verdict produces High severity
 								//!     (cookies carry session / credential material and are treated as
 								//!     Secret-tier for the leak class).
 								//!   * Env source + Confirmed verdict produces High severity (same
 								//!     reasoning, env vars carry credential material).
 								//!   * Header / FileSystem / Database / CaughtException sources downgrade
 								//!     to Medium severity even with a Confirmed verdict — they are
 								//!     Sensitive but not credential-grade secrets.
 								//!   * No symbolic verdict (or `Inconclusive` / `NotAttempted`) → Low
 								//!     confidence (the instruction's "Inconclusive" tier; the
 								//!     `Confidence` enum has no separate Inconclusive variant so it
 								//!     floors to Low).
 								//!   * Opaque body (Confirmed but with empty witness) → Medium
 								//!     confidence; the abstract domain still produced a corroboration
 								//!     signal even if the witness string is bare.
 								//!   * `path_validated=true` drops a confidence tier (Medium → Low).
 								//!   * On the same source, DATA_EXFIL ranks strictly below SSRF (the
 								//!     taint-class bonus is +7 for data-exfil vs +10 for the generic
 								//!     `taint-unsanitised-flow`).
 								use nyx_scanner::commands::scan::Diag;
 								use nyx_scanner::evidence::{
 								    Confidence, Evidence, SpanEvidence, SymbolicVerdict, Verdict, compute_confidence,
 								};
 								use nyx_scanner::labels::SourceKind;
 								use nyx_scanner::patterns::{FindingCategory, Severity};
 								use nyx_scanner::rank::compute_attack_rank;
 								fn make_evidence(source_kind: SourceKind, verdict: Option<Verdict>) -> Evidence {
 								    Evidence {
 								        source: Some(SpanEvidence {
 								            path: "src/leak.js".into(),
 								            line: 1,
 								            col: 1,
 								            kind: "source".into(),
 								            snippet: Some("req.cookies.session".into()),
 								        }),
 								        sink: Some(SpanEvidence {
 								            path: "src/leak.js".into(),
 								            line: 5,
 								            col: 5,
 								            kind: "sink".into(),
 								            snippet: Some("fetch('/endpoint', { body: payload })".into()),
 								        }),
 								        source_kind: Some(source_kind),
 								        hop_count: Some(1),
 								        cap_specificity: Some(1),
 								        symbolic: verdict.map(|v| SymbolicVerdict {
 								            verdict: v,
 								            constraints_checked: 0,
 								            paths_explored: 1,
 								            // For Confirmed cases use the strong-witness phrasing so the
 								            // test exercises the same code path that real symex output
 								            // takes (see `compute_taint_confidence` for the analogous
 								            // witness-strength branch).
 								            witness: matches!(v, Verdict::Confirmed)
 								                .then(|| "tainted cookie flows to fetch body".into()),
 								            interproc_call_chains: vec![],
 								            cutoff_notes: vec![],
 								        }),
 								        ..Default::default()
 								    }
 								}
 								fn make_diag(
 								    rule_id: &str,
 								    severity: Severity,
 								    source_kind: SourceKind,
 								    verdict: Option<Verdict>,
 								    path_validated: bool,
 								) -> Diag {
 								    Diag {
 								        path: "src/leak.js".into(),
 								        line: 5,
 								        col: 5,
 								        severity,
 								        id: rule_id.into(),
 								        category: FindingCategory::Security,
 								        path_validated,
 								        guard_kind: if path_validated {
 								            Some("Validation".into())
 								        } else {
 								            None
 								        },
 								        message: None,
 								        labels: vec![],
 								        confidence: None,
 								        evidence: Some(make_evidence(source_kind, verdict)),
 								        rank_score: None,
 								        rank_reason: None,
 								        suppressed: false,
 								        suppression: None,
 								        rollup: None,
 								        finding_id: String::new(),
 								        alternative_finding_ids: vec![],
-												[pitboss] phase 01: M1 — Spec extraction + `--verify` plumbing (no sandbox)

											
										
										
											2026-05-11 21:19:03 -04:00
+								        stable_hash: 0,
-												Added Cap::DATA_EXFIL and taint fp and fn fixes on real repos (#59)

* feat: Enhance data exfiltration detection with source sensitivity gating for cookies and headers

* feat: Implement cross-file data exfiltration detection with parameter-specific gate filters

* feat: Add calibration tests and refine DATA_EXFIL severity scoring logic

* feat: Introduce per-detector configuration for data exfiltration suppression

* feat: Enhance DATA_EXFIL findings with destination field tracking in diagnostics and SARIF output

* feat: Add tainted body and URL handling for data exfiltration detection

* feat: Add integration tests and fixtures for DATA_EXFIL and SSRF detection in Go

* feat: Add Java integration tests and fixtures for DATA_EXFIL detection across multiple HTTP clients

* feat: Add synthetic externals handling for closure-captured variables in SSA

* feat: Implement closure-based suppression for resource leak findings

* feat: Add regression guards for shell-injection and taint propagation in for-of destructure patterns

* feat: Implement constructor cap narrowing for data exfiltration detection in HTTP request builders

* feat: Add gated sinks for data exfiltration detection in C and C++ using curl_easy_setopt

* feat: Implement DATA_EXFIL cap parity for backwards analysis and add integration tests

* feat: Add data exfiltration sinks for various languages and enhance documentation

* refactor: Simplify formatting and improve readability in various files

* refactor: Improve readability by simplifying conditional statements and adding clippy linting

* docs: Update CHANGELOG and comments for data exfiltration features and configuration

* docs: Clarify configuration instructions for data exfiltration trusted destinations

* docs: Enhance comments for evidence routing logic in data exfiltration
											
										
										
											2026-05-01 10:59:52 -04:00
+								    }
 								}
 								// ── Calibration fixture 1: Cookie source, Confirmed verdict ─────────────
 								#[test]
 								fn cookie_source_with_confirmed_verdict_is_high_medium() {
 								    // Severity: cookies are Secret-tier for DATA_EXFIL → High.
 								    // Confidence: Confirmed verdict on a Sensitive source → Medium (the
 								    // routing caps at Medium even with a strong witness; see
 								    // `compute_data_exfil_confidence`).
 								    let diag = make_diag(
 								        "taint-data-exfiltration (source 1:1)",
 								        Severity::High,
 								        SourceKind::Cookie,
 								        Some(Verdict::Confirmed),
 								        false,
 								    );
 								    let confidence = compute_confidence(&diag);
 								    assert_eq!(
 								        confidence,
 								        Confidence::Medium,
 								        "Cookie + Confirmed → Medium (DATA_EXFIL cap), got {confidence:?}"
 								    );
 								}
 								// ── Calibration fixture 2: Env source, Confirmed verdict ────────────────
 								#[test]
 								fn env_source_with_confirmed_verdict_is_high_medium() {
 								    // Env vars carry credential / config material and are treated as
 								    // Secret-tier alongside cookies.
 								    let diag = make_diag(
 								        "taint-data-exfiltration (source 1:1)",
 								        Severity::High,
 								        SourceKind::EnvironmentConfig,
 								        Some(Verdict::Confirmed),
 								        false,
 								    );
 								    let confidence = compute_confidence(&diag);
 								    assert_eq!(
 								        confidence,
 								        Confidence::Medium,
 								        "Env + Confirmed → Medium, got {confidence:?}"
 								    );
 								}
 								// ── Calibration fixture 3: Header source, opaque body (no verdict) ──────
 								#[test]
 								fn header_source_without_symex_is_medium_low() {
 								    // Header is Sensitive but not credential-grade; severity downgrades
 								    // to Medium.  No symbolic verdict → confidence Low (the "Inconclusive
 								    // when no symex verdict" tier from the instruction).
 								    let diag = make_diag(
 								        "taint-data-exfiltration (source 1:1)",
 								        Severity::Medium,
 								        SourceKind::Header,
 								        None,
 								        false,
 								    );
 								    let confidence = compute_confidence(&diag);
 								    assert_eq!(
 								        confidence,
 								        Confidence::Low,
 								        "Header + no verdict → Low, got {confidence:?}"
 								    );
 								}
 								// ── Calibration fixture 4: guarded path drops a tier ────────────────────
 								#[test]
 								fn guarded_path_drops_confidence_tier() {
 								    // Cookie + Confirmed would normally yield Medium confidence; the
 								    // path-validated flag drops it one step to Low.  Without the guard
 								    // the same diag must score Medium (asserted alongside to lock in
 								    // the delta, not just the floor).
 								    let unguarded = make_diag(
 								        "taint-data-exfiltration (source 1:1)",
 								        Severity::High,
 								        SourceKind::Cookie,
 								        Some(Verdict::Confirmed),
 								        false,
 								    );
 								    let guarded = make_diag(
 								        "taint-data-exfiltration (source 1:1)",
 								        Severity::High,
 								        SourceKind::Cookie,
 								        Some(Verdict::Confirmed),
 								        true,
 								    );
 								    assert_eq!(compute_confidence(&unguarded), Confidence::Medium);
 								    assert_eq!(
 								        compute_confidence(&guarded),
 								        Confidence::Low,
 								        "guarded DATA_EXFIL path must drop one confidence tier"
 								    );
 								}
 								// ── Calibration fixture 5: ranking — DATA_EXFIL below SSRF on same source
 								#[test]
 								fn data_exfil_ranks_below_ssrf_on_same_source() {
 								    // Cookie source flowing to `fetch` could fire either DATA_EXFIL (body
 								    // arg) or SSRF / generic taint (URL arg).  On the same severity tier
 								    // SSRF must outrank DATA_EXFIL because the analysis-kind bonus is +10
 								    // for `taint-unsanitised-flow` and +7 for `taint-data-exfiltration`.
 								    let exfil = make_diag(
 								        "taint-data-exfiltration (source 1:1)",
 								        Severity::High,
 								        SourceKind::Cookie,
 								        Some(Verdict::Confirmed),
 								        false,
 								    );
 								    let ssrf = make_diag(
 								        "taint-unsanitised-flow (source 1:1)",
 								        Severity::High,
 								        SourceKind::Cookie,
 								        Some(Verdict::Confirmed),
 								        false,
 								    );
 								    let exfil_score = compute_attack_rank(&exfil).score;
 								    let ssrf_score = compute_attack_rank(&ssrf).score;
 								    assert!(
 								        ssrf_score > exfil_score,
 								        "SSRF score ({ssrf_score}) must outrank DATA_EXFIL score \
 								         ({exfil_score}) on the same source"
 								    );
 								    // The delta is exactly the analysis-kind bonus difference (+3) — pin
 								    // it so accidental drift trips the test rather than silently moving
 								    // both bonuses in lock-step.
 								    assert!(
 								        (ssrf_score - exfil_score - 3.0).abs() < 0.001,
 								        "SSRF − DATA_EXFIL should equal the analysis-kind bonus delta \
 								         (+3); got {} ({} − {})",
 								        ssrf_score - exfil_score,
 								        ssrf_score,
 								        exfil_score,
 								    );
 								}
 								// ── Calibration fixture 6: DATA_EXFIL above AST patterns ────────────────
 								#[test]
 								fn data_exfil_ranks_above_ast_pattern() {
 								    // The instruction mandates DATA_EXFIL sit above informational AST
 								    // patterns.  Use a Medium DATA_EXFIL (header source) vs a Low AST
 								    // pattern (the typical AST-only banned-API match) to lock the
 								    // ordering in even at the weaker end of the DATA_EXFIL spectrum.
 								    let medium_exfil = make_diag(
 								        "taint-data-exfiltration (source 1:1)",
 								        Severity::Medium,
 								        SourceKind::Header,
 								        Some(Verdict::Confirmed),
 								        false,
 								    );
 								    let mut ast_pattern = make_diag(
 								        "js.code_exec.eval",
 								        Severity::Low,
 								        SourceKind::Unknown,
 								        None,
 								        false,
 								    );
 								    // AST patterns don't carry taint evidence; clear it so the ranker
 								    // takes the AST-only branch.
 								    ast_pattern.evidence = None;
 								    let exfil_score = compute_attack_rank(&medium_exfil).score;
 								    let ast_score = compute_attack_rank(&ast_pattern).score;
 								    assert!(
 								        exfil_score > ast_score,
 								        "DATA_EXFIL ({exfil_score}) must outrank AST pattern ({ast_score})"
 								    );
 								}