// nyx/src/auth_analysis/extract/mod.rs

use super::config::AuthAnalysisRules;
use super::model::{AuthorizationModel, CallSite};
use crate::utils::project::{FrameworkContext, rust_file_imports_web_framework};
use std::collections::HashMap;
use std::path::Path;
use tree_sitter::Tree;

pub mod actix_web;
pub mod axum;
pub mod common;
pub mod django;
pub mod echo;
pub mod express;
pub mod fastify;
pub mod flask;
pub mod gin;
pub mod koa;
pub mod rails;
pub mod rocket;
pub mod sinatra;
pub mod spring;

/// A framework-specific pass that contributes to an
/// [`AuthorizationModel`].
///
/// [`extract_authorization_model`] holds a fixed list of implementors
/// and, for each file, runs every one whose
/// [`supports`](AuthExtractor::supports) returns `true`.
pub trait AuthExtractor {
    /// Returns true when this extractor should run for a file of
    /// language `lang`, given the optional project-level
    /// `framework_ctx`.  The orchestrator only calls
    /// [`extract`](AuthExtractor::extract) on extractors for which this
    /// holds.
    fn supports(&self, lang: &str, framework_ctx: Option<&FrameworkContext>) -> bool;

    /// Returns true when this extractor expects the orchestrator to
    /// have already populated `model.units` with one
    /// `AnalysisUnitKind::Function` entry per top-level function /
    /// method via [`common::collect_top_level_units`].  Defaults to
    /// `true`; framework extractors that build their own unit set
    /// (Spring, Rails) override to `false` so the orchestrator skips
    /// the shared collection pass when only those extractors match.
    fn requires_top_level_units(&self) -> bool {
        true
    }

    /// Runs this extractor over the parsed `tree` and records its
    /// findings by mutating `model` in place.
    ///
    /// `model` is shared across all matching extractors for the file,
    /// so it may already carry units and routes from the shared
    /// collection pass or from extractors that ran earlier.
    ///
    /// * `bytes` ─ presumably the source text `tree` was parsed from
    ///   (TODO confirm against callers).
    /// * `path` ─ presumably the path of the file being analysed
    ///   (TODO confirm against callers).
    /// * `rules` ─ the auth-analysis rule configuration in effect.
    fn extract(
        &self,
        tree: &Tree,
        bytes: &[u8],
        path: &Path,
        rules: &AuthAnalysisRules,
        model: &mut AuthorizationModel,
    );
}

pub fn extract_authorization_model(
    lang: &str,
    framework_ctx: Option<&FrameworkContext>,
    tree: &Tree,
    bytes: &[u8],
    path: &Path,
    rules: &AuthAnalysisRules,
    cross_file_router_deps: Option<&HashMap<String, Vec<(CallSite, bool)>>>,
) -> AuthorizationModel {
    let extractors: [&dyn AuthExtractor; 13] = [
        &express::ExpressExtractor,
        &koa::KoaExtractor,
        &fastify::FastifyExtractor,
        &gin::GinExtractor,
        &echo::EchoExtractor,
        &flask::FlaskExtractor,
        &django::DjangoExtractor,
        &spring::SpringExtractor,
        &rails::RailsExtractor,
        &sinatra::SinatraExtractor,
        &axum::AxumExtractor,
        &actix_web::ActixWebExtractor,
        &rocket::RocketExtractor,
    ];
    let mut model = AuthorizationModel {
        lang: lang.to_string(),
        ..Default::default()
    };
    // Pre-populate the cross-file router-dep map BEFORE extractors run.
    // FlaskExtractor reads `model.cross_file_router_deps` and merges the
    // resolved deps into its local router-deps map at extraction time,
    // so per-route auth attribution sees both the local-file
    // `dependencies=[Security(...)]` declarations and the cross-file
    // lift from `<parent>.include_router(<this_file>.<router>, ...)`
    // edges visible elsewhere in the project.  Empty / `None` for every
    // non-Python language and for files with no matching child edges.
    if let Some(deps) = cross_file_router_deps {
        model.cross_file_router_deps = deps.clone();
    }

    // **Hoist `collect_top_level_units` out of the per-extractor loop.**
    // For multi-extractor languages (Go: gin+echo, JS/TS: express+koa+
    // fastify, Python: flask+django, Rust: axum+actix_web+rocket, Ruby:
    // sinatra) the legacy code re-walked the entire AST and rebuilt the
    // `Function`-kind unit set per extractor (then deduped by span).
    // `collect_top_level_units` was the dominant cost in
    // `extract_authorization_model` (46% of total wall-clock on the
    // mattermost/server/channels/app subtree, 2026-05-04 profile).
    //
    // After the hoist each extractor receives a `&mut model` that
    // already carries the shared unit set; framework-specific work
    // (route detection, middleware injection, typed-extractor guards)
    // augments and promotes those units in place via the existing
    // `attach_route_handler` "promote-or-create" path.
    //
    // Spring + Rails build their own unit set (`maybe_collect_controller`
    // / Rails' `collect_nodes`), so they opt out via
    // `requires_top_level_units = false`; the shared pass runs only
    // when at least one matching extractor needs it.
    let any_requires_units = extractors
        .iter()
        .any(|e| e.supports(lang, framework_ctx) && e.requires_top_level_units());
    if any_requires_units {
        common::collect_top_level_units(tree.root_node(), bytes, rules, &mut model);
    }

    for extractor in extractors {
        if extractor.supports(lang, framework_ctx) {
            extractor.extract(tree, bytes, path, rules, &mut model);
        }
    }

    // Per-language web-framework signal used to gate the param-name arm
    // of `unit_has_user_input_evidence`.  Combines the project-root
    // manifest detection (`framework_ctx`) with a per-file `use`/`import`
    // check, so a single file in a workspace whose root manifest does
    // not name a web framework can still opt back in by directly
    // importing one (e.g. `crates/collab/src/rpc.rs` in zed: workspace
    // root has no axum, but the file uses `axum::Router`).
    //
    // Three-valued: `Some(true)` keeps step 3 firing, `Some(false)`
    // suppresses it, `None` means no detection ran ─ behavior unchanged.
    model.lang_web_framework_signal = compute_web_framework_signal(lang, framework_ctx, bytes);

    // **Dedup units by span across extractors.**  Multiple extractors
    // (e.g. Flask + Django on a Python file) each call
    // `collect_top_level_units`, producing one unit per top-level
    // function.  When one extractor also recognises a route on that
    // function and promotes its copy to `RouteHandler` (with injected
    // middleware auth checks), the *other* extractor's untouched
    // `Function` copy still runs through `check_ownership_gaps` and
    // emits the FP from a unit that never saw the middleware-derived
    // auth check.
    //
    // This step keeps a single canonical unit per source span,
    // preferring `RouteHandler` over `Function`, merging auth_checks
    // and folding operation lists conservatively.  Route registrations
    // are remapped to the surviving unit index.
    deduplicate_units_by_span(&mut model);

    model
}

/// Derive the per-file web-framework signal that gates the param-name
/// arm of `unit_has_user_input_evidence`.
///
/// Only Rust files (`lang` of `"rust"` or `"rs"`) ever get a
/// non-`None` value.  Rust auth analysis is the largest source of
/// internal-helper FPs in non-web crates (zed's GUI / editor crates);
/// the other languages have handler-classification policies that
/// already filter effectively, so they stay on `None` (fall-through to
/// the param-name heuristic) until each has been validated.
///
/// The result is three-valued:
/// * `Some(true)` ─ the project root manifest names a Rust web
///   framework (axum / actix_web / rocket), OR this file imports one
///   directly.  Param-name evidence stays on.
/// * `Some(false)` ─ the project root manifest was inspected
///   (Cargo.toml exists), named no Rust web framework, AND this file
///   does not import one either.  Param-name evidence is suppressed:
///   the project has no HTTP boundary in Rust.
/// * `None` ─ no detection ran (no `framework_ctx`, no Cargo.toml
///   inspected).  Behavior unchanged.
fn compute_web_framework_signal(
    lang: &str,
    framework_ctx: Option<&FrameworkContext>,
    bytes: &[u8],
) -> Option<bool> {
    if lang != "rust" && lang != "rs" {
        return None;
    }

    // What the project-level manifest inspection concluded, if it ran.
    let manifest_verdict = match framework_ctx {
        Some(ctx) => ctx.lang_has_web_framework("rust"),
        None => None,
    };

    match manifest_verdict {
        // The manifest already names a framework; no need to scan the file.
        Some(true) => Some(true),
        // The manifest said "no" or was never inspected: fall back to
        // the file's own imports.  A file that uses an axum /
        // actix_web / rocket symbol directly counts as a handler file
        // even when the workspace-root Cargo.toml does not list the
        // crate (real example: zed's `crates/collab/src/rpc.rs`).
        _ if rust_file_imports_web_framework(bytes) => Some(true),
        // No file-level evidence either.  This is `Some(false)` only
        // when a manifest was actually inspected; single-file scans
        // without project context yield `None` and keep prior behavior.
        no_file_evidence => no_file_evidence,
    }
}

/// Collapses units that share a source span into one canonical unit.
///
/// For every distinct `span` in `model.units` exactly one unit
/// survives: the first-seen `RouteHandler`, or, when the span has no
/// `RouteHandler`, the first-seen unit.  Auth checks carried by the
/// discarded copies are appended to the survivor, skipping any whose
/// `(span, callee)` pair is already present.  Nothing else is taken
/// from the discarded copies; in particular operations are not merged.
///
/// Surviving units keep their relative order.  Every `route.unit_idx`
/// is rewritten to the survivor's new position, including routes that
/// pointed at a discarded copy (for example the second of two
/// `RouteHandler` units on the same span).  Without that, such a route
/// would keep an index into the old, longer vector and end up
/// referring to the wrong unit or past the end.  A `unit_idx` that was
/// already out of range is left untouched.
fn deduplicate_units_by_span(model: &mut AuthorizationModel) {
    use crate::auth_analysis::model::{AnalysisUnit, AnalysisUnitKind, AuthCheck};
    use std::collections::HashMap;

    // Pass 1: choose the winning old index for each span.  The first
    // unit seen holds the slot unless a later `RouteHandler` displaces
    // a non-`RouteHandler` incumbent.
    let mut winner_by_span: HashMap<(usize, usize), usize> =
        HashMap::with_capacity(model.units.len());
    for (idx, unit) in model.units.iter().enumerate() {
        match winner_by_span.get(&unit.span).copied() {
            None => {
                winner_by_span.insert(unit.span, idx);
            }
            Some(incumbent) => {
                let upgrades = model.units[incumbent].kind != AnalysisUnitKind::RouteHandler
                    && unit.kind == AnalysisUnitKind::RouteHandler;
                if upgrades {
                    winner_by_span.insert(unit.span, idx);
                }
            }
        }
    }

    // Pass 2: resolve every old index to its winner's old index.  Pass
    // 1 recorded every span, so the fallback to `idx` is defensive only.
    let winner_of: Vec<usize> = model
        .units
        .iter()
        .enumerate()
        .map(|(idx, unit)| winner_by_span.get(&unit.span).copied().unwrap_or(idx))
        .collect();

    // Pass 3: build the old-idx → new-idx remap for ALL units.  Winners
    // are numbered first, in original order; losers then inherit their
    // winner's number.  Two sweeps are needed because a winner can sit
    // after its loser (a `Function` followed by its `RouteHandler`).
    let mut new_idx_for_old = vec![0usize; winner_of.len()];
    let mut survivor_count = 0usize;
    for (old_idx, &winner) in winner_of.iter().enumerate() {
        if winner == old_idx {
            new_idx_for_old[old_idx] = survivor_count;
            survivor_count += 1;
        }
    }
    for (old_idx, &winner) in winner_of.iter().enumerate() {
        if winner != old_idx {
            new_idx_for_old[old_idx] = new_idx_for_old[winner];
        }
    }

    // Pass 4: move the winners into the new layout (no cloning) and set
    // aside each loser's auth checks, tagged with the new index of the
    // winner that will receive them.
    let mut surviving: Vec<AnalysisUnit> = Vec::with_capacity(survivor_count);
    let mut loser_checks: Vec<(usize, Vec<AuthCheck>)> = Vec::new();
    for (old_idx, mut unit) in std::mem::take(&mut model.units).into_iter().enumerate() {
        if winner_of[old_idx] == old_idx {
            surviving.push(unit);
        } else {
            let checks = std::mem::take(&mut unit.auth_checks);
            if !checks.is_empty() {
                loser_checks.push((new_idx_for_old[old_idx], checks));
            }
        }
    }

    // Pass 5: fold the set-aside checks into their winners, in original
    // loser order, deduping by (span, callee).  Operations are not
    // merged; the winner's own list is kept as-is.
    for (winner_idx, checks) in loser_checks {
        let merged = &mut surviving[winner_idx].auth_checks;
        for check in checks {
            let already_present = merged
                .iter()
                .any(|existing| existing.span == check.span && existing.callee == check.callee);
            if !already_present {
                merged.push(check);
            }
        }
    }

    model.units = surviving;

    // Re-point routes at the surviving units.  The remap covers losers
    // as well as winners, so no route is left holding a stale index.
    for route in &mut model.routes {
        if let Some(&new_idx) = new_idx_for_old.get(route.unit_idx) {
            route.unit_idx = new_idx;
        }
    }
}