[pitboss] phase 21: Track F.1 — SurfaceMap module + Python/Flask vertical

This commit is contained in:
pitboss 2026-05-15 12:33:10 -05:00
parent f8bff38217
commit c03326a658
9 changed files with 1396 additions and 1 deletions

View file

@ -2126,6 +2126,7 @@ pub(crate) fn scan_filesystem_with_observer(
);
}
let pass2_start = std::time::Instant::now();
let mut gs = global_summaries;
let mut diags: Vec<Diag> = {
let _span = tracing::info_span!("pass2_analysis", files = all_paths.len()).entered();
let pb = make_progress_bar(
@ -2156,7 +2157,6 @@ pub(crate) fn scan_filesystem_with_observer(
);
}
let mut gs = global_summaries;
let total_batches = batches.len() as u64 + u64::from(!orphans.is_empty());
if let Some(p) = progress {
p.set_batches_total(total_batches);
@ -2177,6 +2177,20 @@ pub(crate) fn scan_filesystem_with_observer(
result
};
tracing::info!(diags = diags.len(), "pass 2 complete");
// Phase 21: build the SurfaceMap from the post-pass-2 view.
// No persistence here; the index-backed path persists into the
// `surface_map` SQLite table. Errors here are swallowed: the
// surface map is an additive Phase F deliverable, not a gate.
let _surface_map = crate::surface::build::build_surface_map(
&crate::surface::build::SurfaceBuildInputs {
files: &all_paths,
scan_root: Some(root),
global_summaries: &gs,
call_graph: &call_graph,
config: cfg,
},
);
if let Some(p) = progress {
p.record_pass2_ms(pass2_start.elapsed().as_millis() as u64);
}
@ -2987,6 +3001,34 @@ pub fn scan_with_index_parallel_observer(
let mut diags = topo_diags;
// Phase 21: build + persist the SurfaceMap from the post-pass-2
// view. Errors here are logged but not propagated — the surface
// map is an additive Phase F deliverable, not a scan gate. That
// applies to opening the indexer as well as to the write itself,
// so neither step may use `?`.
{
    let surface_map = crate::surface::build::build_surface_map(
        &crate::surface::build::SurfaceBuildInputs {
            files: &files,
            scan_root: Some(scan_root),
            global_summaries: &global_summaries,
            call_graph: &call_graph,
            config: cfg,
        },
    );
    match Indexer::from_pool(project, &pool) {
        Ok(mut idx) => {
            if let Err(e) = idx.replace_surface_map(&surface_map) {
                tracing::warn!("failed to persist surface_map: {e}");
            } else if let Some(l) = logs {
                l.info(
                    format!(
                        "Surface map: {} nodes, {} edges",
                        surface_map.node_count(),
                        surface_map.edge_count()
                    ),
                    None,
                );
            }
        }
        // Previously `?` here aborted the whole scan on an indexer
        // failure, contradicting the best-effort contract above.
        Err(e) => tracing::warn!("failed to open index for surface_map: {e}"),
    }
}
// NOTE: Taint-mode output is *not* filtered here. `run_rules_on_bytes`
// already gates AST queries and auth analyses behind `mode == Full`, so
// Taint-mode raw output is exactly the set of diagnostics the analysis

View file

@ -228,6 +228,15 @@ pub mod index {
CREATE INDEX IF NOT EXISTS idx_dynamic_verdict_cache_spec_hash
ON dynamic_verdict_cache(spec_hash);
-- Phase 21: persisted attack-surface map. One row per project.
-- Stored as canonical JSON so the round-trip is byte-identical
-- across rescans (see `SurfaceMap::to_json`).
CREATE TABLE IF NOT EXISTS surface_map (
project TEXT PRIMARY KEY,
map_json BLOB NOT NULL,
updated_at INTEGER NOT NULL
);
-- Indexes on (project, file_path) for the per-file replace_* paths.
-- Without these, every DELETE WHERE project=? AND file_path=? does a
-- full table scan, which dominates indexing time as the cache grows.
@ -547,6 +556,22 @@ pub mod index {
conn.execute_batch(SCHEMA)?;
}
// Phase 21: ensure the `surface_map` table exists on
// DBs created before this table was introduced.
let surface_exists: bool = conn
.query_row(
"SELECT 1 FROM sqlite_master
WHERE type = 'table' AND name = 'surface_map'",
[],
|_| Ok(true),
)
.optional()?
.unwrap_or(false);
if !surface_exists {
tracing::info!("creating surface_map table");
conn.execute_batch(SCHEMA)?;
}
// Schema version check: invalidate cached summary tables
// when the on-disk artefact layout has changed in an
// incompatible way, independently of the engine version.
@ -1882,6 +1907,63 @@ pub mod index {
Ok(out)
}
/// Store `map` as this project's surface map.
///
/// The `surface_map` table keeps a single row per project, so any
/// earlier row for the same project is overwritten
/// (`INSERT OR REPLACE`). Serialisation runs on a private clone of
/// `map`, so the caller's value is never modified. `updated_at` is
/// the current wall-clock time in whole seconds since the Unix
/// epoch.
///
/// # Errors
///
/// Fails when the system clock is before the Unix epoch, when
/// `to_json` fails (reported as `surface map serialise: …`), or when
/// the SQLite write fails.
pub fn replace_surface_map(
    &mut self,
    map: &crate::surface::SurfaceMap,
) -> NyxResult<()> {
    let updated_at = SystemTime::now().duration_since(UNIX_EPOCH)?.as_secs() as i64;
    // Work on a scratch copy: `to_json` is called on a mutable binding.
    let mut scratch = map.clone();
    let serialised = scratch.to_json();
    let payload =
        serialised.map_err(|e| NyxError::Msg(format!("surface map serialise: {e}")))?;
    self.c().execute(
        "INSERT OR REPLACE INTO surface_map (project, map_json, updated_at)
VALUES (?1, ?2, ?3)",
        params![self.project, payload, updated_at],
    )?;
    Ok(())
}
/// Load the persisted [`crate::surface::SurfaceMap`] for this
/// project, or `None` when no map has been written.
///
/// The raw row is fetched through [`Self::load_surface_map_bytes`]
/// so both loaders share a single query and cannot drift apart.
///
/// # Errors
///
/// Fails when the SQLite read fails or when the stored bytes cannot
/// be decoded (reported as `surface map deserialise: …`).
pub fn load_surface_map(&self) -> NyxResult<Option<crate::surface::SurfaceMap>> {
    let Some(bytes) = self.load_surface_map_bytes()? else {
        return Ok(None);
    };
    let map = crate::surface::SurfaceMap::from_json(&bytes)
        .map_err(|e| NyxError::Msg(format!("surface map deserialise: {e}")))?;
    Ok(Some(map))
}
/// Fetch the stored `map_json` column for this project exactly as it
/// sits in SQLite, without decoding it.
///
/// Yields `None` when the project has no `surface_map` row. The
/// round-trip parity tests use this to compare on-disk bytes across
/// rescans.
pub fn load_surface_map_bytes(&self) -> NyxResult<Option<Vec<u8>>> {
    Ok(self
        .c()
        .query_row(
            "SELECT map_json FROM surface_map WHERE project = ?1",
            params![self.project],
            |stored| stored.get::<_, Vec<u8>>(0),
        )
        .optional()?)
}
/// Remove a file and all derived persisted state for this project.
///
/// This deletes the file row, issues, and all persisted summary rows so

View file

@ -121,6 +121,7 @@ pub mod ssa;
pub mod state;
pub mod summary;
pub mod suppress;
pub mod surface;
pub mod symbol;
pub mod symex;
pub mod taint;

163
src/surface/build.rs Normal file
View file

@ -0,0 +1,163 @@
//! Top-level [`SurfaceMap`] builder.
//!
//! Takes the post-pass-2 [`GlobalSummaries`] + [`CallGraph`] (reserved
//! for Phase 22's call-graph reachability; Phase 21 does not read
//! them yet) and the project's file list for the per-language
//! framework probes. Phase 21 only invokes the Python + Flask probe;
//! Phase 22 wires the remaining language probes through
//! [`crate::surface::lang`].
//!
//! Build steps (Phase 21):
//!
//! 1. For every Python file, parse it once and invoke
//! [`crate::surface::lang::python_flask::detect_flask_routes`].
//! 2. Collect the resulting [`SurfaceNode::EntryPoint`] nodes.
//! 3. Canonicalise the map (sort nodes + edges, dedup edges) so two
//! runs over the same source produce byte-identical JSON.
use crate::callgraph::CallGraph;
use crate::summary::GlobalSummaries;
use crate::surface::{SurfaceMap, lang::python_flask};
use crate::utils::config::Config;
use std::path::{Path, PathBuf};
/// Inputs to [`build_surface_map`]. Wrapped in a struct so the
/// downstream Phase 22 work (additional probes, call-graph-derived
/// `Reaches` edges, label-rule data-source nodes) can extend the
/// signature without touching every caller.
pub struct SurfaceBuildInputs<'a> {
/// Candidate files. [`build_surface_map`] reads only the entries
/// whose extension is `py` or `pyi` and skips everything else.
pub files: &'a [PathBuf],
/// Passed through unchanged to the Flask probe, which uses it to
/// turn each file path into a project-relative one; `None` is
/// forwarded as-is.
pub scan_root: Option<&'a Path>,
/// Not read by the Phase 21 builder; carried for Phase 22.
pub global_summaries: &'a GlobalSummaries,
/// Not read by the Phase 21 builder; carried for Phase 22.
pub call_graph: &'a CallGraph,
/// Not read by the Phase 21 builder; carried for Phase 22.
pub config: &'a Config,
}
/// Assemble the [`SurfaceMap`] for the files listed in `inputs`.
///
/// Phase 21 covers Python / Flask only: every `.py` / `.pyi` file is
/// read, parsed with the tree-sitter Python grammar, and handed to
/// the Flask probe; the nodes it reports are appended to the map,
/// which is canonicalised before being returned.
///
/// The build is best-effort. A file that cannot be read or parsed is
/// skipped silently, and if the Python grammar cannot be installed
/// on the parser an empty map is returned straight away. Inputs with
/// no recognised entry-points therefore yield an empty map.
pub fn build_surface_map(inputs: &SurfaceBuildInputs<'_>) -> SurfaceMap {
    let mut surface = SurfaceMap::new();

    // One parser is reused for every Python file. Phase 22 probes
    // will dispatch on file extension at this point.
    let mut parser = tree_sitter::Parser::new();
    let language_ready = parser
        .set_language(&tree_sitter_python::LANGUAGE.into())
        .is_ok();
    if !language_ready {
        return surface;
    }

    for file in inputs.files.iter().filter(|candidate| is_python_file(candidate)) {
        let Ok(source) = std::fs::read(file) else {
            continue;
        };
        let Some(tree) = parser.parse(&source, None) else {
            continue;
        };
        surface.nodes.extend(python_flask::detect_flask_routes(
            &tree,
            &source,
            file,
            inputs.scan_root,
        ));
    }

    // Summaries, call graph and config are deliberately untouched in
    // Phase 21; they are part of the inputs so Phase 22 can start
    // using them (`DangerousLocal` nodes, `Reaches` edges) without a
    // signature change at every call site.
    let _ = inputs.global_summaries;
    let _ = inputs.call_graph;
    let _ = inputs.config;

    surface.canonicalize();
    surface
}
/// `true` when `path` has the extension `py` or `pyi`.
///
/// The comparison is exact (case-sensitive); a path with no
/// extension, or with one that is not valid UTF-8, is rejected.
fn is_python_file(path: &Path) -> bool {
    path.extension()
        .and_then(|ext| ext.to_str())
        .is_some_and(|ext| ext == "py" || ext == "pyi")
}
// Unit tests for `build_surface_map`: one with no input files and
// one that writes a small Flask app to a temporary directory.
#[cfg(test)]
mod tests {
use super::*;
use crate::entry_points::HttpMethod;
use std::fs;
use tempfile::tempdir;
// An empty file list must yield a map with no nodes and no edges.
#[test]
fn empty_inputs_produce_empty_map() {
let dir = tempdir().unwrap();
let cfg = Config::default();
let gs = GlobalSummaries::new();
// Hand-built empty call graph; the Phase 21 builder never reads it.
let cg = CallGraph {
graph: petgraph::graph::DiGraph::new(),
index: Default::default(),
unresolved_not_found: vec![],
unresolved_ambiguous: vec![],
};
let files: Vec<PathBuf> = vec![];
let inputs = SurfaceBuildInputs {
files: &files,
scan_root: Some(dir.path()),
global_summaries: &gs,
call_graph: &cg,
config: &cfg,
};
let map = build_surface_map(&inputs);
assert_eq!(map.node_count(), 0);
assert_eq!(map.edge_count(), 0);
}
// A file with one `@app.route` and one `@app.post` handler must
// produce two entry-points, one GET and one POST.
#[test]
fn flask_file_produces_entry_points() {
let dir = tempdir().unwrap();
let py = dir.path().join("app.py");
fs::write(
&py,
r#"
from flask import Flask
app = Flask(__name__)
@app.route("/")
def index():
return "hi"
@app.post("/submit")
def submit():
return "ok"
"#,
)
.unwrap();
let cfg = Config::default();
let gs = GlobalSummaries::new();
let cg = CallGraph {
graph: petgraph::graph::DiGraph::new(),
index: Default::default(),
unresolved_not_found: vec![],
unresolved_ambiguous: vec![],
};
let files = vec![py.clone()];
let inputs = SurfaceBuildInputs {
files: &files,
scan_root: Some(dir.path()),
global_summaries: &gs,
call_graph: &cg,
config: &cfg,
};
let map = build_surface_map(&inputs);
assert_eq!(map.node_count(), 2);
// Order-independent check: only membership of each method is asserted.
let methods: Vec<HttpMethod> = map.entry_points().map(|ep| ep.method).collect();
assert!(methods.contains(&HttpMethod::GET));
assert!(methods.contains(&HttpMethod::POST));
}
}

107
src/surface/graph.rs Normal file
View file

@ -0,0 +1,107 @@
//! petgraph-backed read-only view over a [`SurfaceMap`].
//!
//! The on-disk shape is two parallel `Vec`s (deterministic ordering,
//! byte-identical JSON), but downstream consumers — the Track G chain
//! composer, the `nyx surface` CLI walker — want graph queries:
//! neighbours, reachability, topological order. [`petgraph_view`]
//! constructs a `DiGraph<NodeRef<'_>, EdgeRef<'_>>` on demand without
//! cloning the underlying nodes or edges.
use super::{EdgeKind, SurfaceEdge, SurfaceMap, SurfaceNode};
use petgraph::graph::{DiGraph, NodeIndex};
use std::collections::HashMap;
/// Borrowed handle to one [`SurfaceNode`] inside the petgraph view.
#[derive(Debug, Clone, Copy)]
pub struct NodeRef<'a> {
/// Position of the node in `SurfaceMap::nodes`, as assigned by
/// [`petgraph_view`] while enumerating the map.
pub idx: u32,
/// The borrowed node itself; nothing is cloned.
pub node: &'a SurfaceNode,
}
/// Borrowed handle to one [`SurfaceEdge`] inside the petgraph view.
#[derive(Debug, Clone, Copy)]
pub struct EdgeRef<'a> {
/// The borrowed edge from `SurfaceMap::edges`.
pub edge: &'a SurfaceEdge,
}
impl<'a> EdgeRef<'a> {
/// Shortcut for the [`EdgeKind`] of the underlying edge.
pub fn kind(&self) -> EdgeKind {
self.edge.kind
}
}
/// Build a borrowed petgraph over `map`.
///
/// Nodes are added in `map.nodes` order and edges in `map.edges`
/// order, each weight borrowing from `map` rather than cloning. The
/// returned `lookup` translates a surface index (`u32` position in
/// `map.nodes`) into the matching petgraph [`NodeIndex`]. An edge
/// whose `from` or `to` does not name an existing node is left out
/// of the view.
pub fn petgraph_view(map: &SurfaceMap) -> SurfaceGraphView<'_> {
    let mut graph: DiGraph<NodeRef<'_>, EdgeRef<'_>> = DiGraph::new();
    let mut lookup: HashMap<u32, NodeIndex> = HashMap::with_capacity(map.nodes.len());

    for (position, node) in map.nodes.iter().enumerate() {
        let idx = position as u32;
        let handle = graph.add_node(NodeRef { idx, node });
        lookup.insert(idx, handle);
    }

    for edge in &map.edges {
        let Some(&source) = lookup.get(&edge.from) else {
            continue;
        };
        let Some(&target) = lookup.get(&edge.to) else {
            continue;
        };
        graph.add_edge(source, target, EdgeRef { edge });
    }

    SurfaceGraphView { graph, lookup }
}
/// petgraph view returned by [`petgraph_view`].
pub struct SurfaceGraphView<'a> {
/// Directed graph whose node and edge weights borrow from the
/// originating `SurfaceMap`.
pub graph: DiGraph<NodeRef<'a>, EdgeRef<'a>>,
/// Surface index (position in `SurfaceMap::nodes`) to petgraph
/// [`NodeIndex`].
pub lookup: HashMap<u32, NodeIndex>,
}
impl<'a> SurfaceGraphView<'a> {
/// Resolve a surface index back to its petgraph [`NodeIndex`].
/// Returns `None` when `surface_idx` is not present in `lookup`.
pub fn node_index(&self, surface_idx: u32) -> Option<NodeIndex> {
self.lookup.get(&surface_idx).copied()
}
}
// Unit test for `petgraph_view`.
#[cfg(test)]
mod tests {
use super::*;
use crate::entry_points::HttpMethod;
use crate::surface::{EntryPoint, Framework, SourceLocation};
// Two nodes joined by one `Calls` edge must appear in the view with
// the same counts, and the edge must connect the looked-up indices.
#[test]
fn petgraph_view_preserves_indices() {
let mut m = SurfaceMap::new();
m.nodes.push(SurfaceNode::EntryPoint(EntryPoint {
location: SourceLocation::new("a.py", 1, 1),
framework: Framework::Flask,
method: HttpMethod::GET,
route: "/a".into(),
handler_name: "h".into(),
handler_location: SourceLocation::new("a.py", 2, 1),
auth_required: false,
}));
m.nodes.push(SurfaceNode::EntryPoint(EntryPoint {
location: SourceLocation::new("b.py", 1, 1),
framework: Framework::Flask,
method: HttpMethod::POST,
route: "/b".into(),
handler_name: "h".into(),
handler_location: SourceLocation::new("b.py", 2, 1),
auth_required: false,
}));
m.edges.push(SurfaceEdge {
from: 0,
to: 1,
kind: EdgeKind::Calls,
});
let view = petgraph_view(&m);
assert_eq!(view.graph.node_count(), 2);
assert_eq!(view.graph.edge_count(), 1);
let n0 = view.node_index(0).unwrap();
let n1 = view.node_index(1).unwrap();
assert!(view.graph.find_edge(n0, n1).is_some());
}
}

6
src/surface/lang/mod.rs Normal file
View file

@ -0,0 +1,6 @@
//! Per-language framework probes. Phase 21 ships Python + Flask;
//! Phase 22 generalises to FastAPI / Django, Java Spring / JAX-RS,
//! Ruby Rails / Sinatra, Go net/http / gin, Rust axum / actix /
//! rocket, JS/TS Express + Next.js.
pub mod python_flask;

View file

@ -0,0 +1,413 @@
//! Python + Flask framework probe.
//!
//! Walks a parsed Python file looking for the four canonical Flask
//! route shapes:
//!
//! * `@app.route("/path", methods=[...])`
//! * `@app.get("/path")` / `.post(...)` / etc. (Flask ≥ 2.0)
//! * `@bp.route("/path", methods=[...])` on a `Blueprint`
//! * `@bp.get("/path")` / `.post(...)` / etc.
//!
//! `auth_required` is inferred from the decorator stack: any decorator
//! whose textual representation matches one of [`AUTH_DECORATORS`] is
//! treated as an auth boundary on the following route. This catches
//! the canonical `@login_required` (Flask-Login), `@auth_required`
//! (custom guards), and `@jwt_required` / `@jwt_required()` (Flask-JWT
//! and -JWT-Extended).
use crate::entry_points::HttpMethod;
use crate::surface::{
EntryPoint, Framework, SourceLocation, SurfaceNode, relative_path_string,
};
use std::path::Path;
use tree_sitter::{Node, Tree};
/// Decorator names that mark a route as requiring authentication.
/// Matched against the *leaf* of the decorator expression — i.e. the
/// text after the last `.` — so `@login_required`,
/// `@auth.login_required`, and `@flask_login.login_required` all
/// match, with or without a trailing call such as `@jwt_required()`.
/// The comparison is ASCII case-insensitive on the whole leaf, so
/// `Login_Required` matches but `loginrequired` does not.
pub const AUTH_DECORATORS: &[&str] = &[
"login_required",
"auth_required",
"jwt_required",
"token_required",
"requires_auth",
"authenticated",
"require_login",
];
/// Collect the Flask routes declared in one parsed Python file.
///
/// Every decorated function in `tree` is inspected. Each decorator
/// accepted by `flask_route_decorator` (an `@<receiver>.route(...)`
/// or `@<receiver>.<verb>(...)` call on a Flask-looking receiver)
/// produces one [`SurfaceNode::EntryPoint`], so a function carrying
/// several route decorators yields several nodes.
///
/// * `location` is the 1-based line/column of the decorator.
/// * `handler_location` is the 1-based line/column of the function
///   definition; `handler_name` is empty if the name cannot be read.
/// * `auth_required` is `true` when *any* decorator on the function
///   is an auth marker, whatever its position in the stack.
///
/// `scan_root` is forwarded to `relative_path_string` to turn `path`
/// into the file string recorded on every location.
pub fn detect_flask_routes(
    tree: &Tree,
    bytes: &[u8],
    path: &Path,
    scan_root: Option<&Path>,
) -> Vec<SurfaceNode> {
    let file_rel = relative_path_string(path, scan_root);
    let mut out = Vec::new();
    walk_decorated(tree.root_node(), bytes, &mut |func_node, decorators| {
        // The auth flag is a property of the whole decorator stack,
        // so it is computed once and shared by every route on it.
        let auth_required = decorators
            .iter()
            .any(|candidate| decorator_is_auth_marker(*candidate, bytes));
        // tree-sitter positions are 0-based; stored locations are 1-based.
        let locate = |row: usize, column: usize| {
            SourceLocation::new(file_rel.clone(), (row + 1) as u32, (column + 1) as u32)
        };
        let routes = decorators.iter().filter_map(|dec| {
            let (method, route) = flask_route_decorator(*dec, bytes)?;
            let decorator_at = dec.start_position();
            let handler_at = func_node.start_position();
            Some(SurfaceNode::EntryPoint(EntryPoint {
                location: locate(decorator_at.row, decorator_at.column),
                framework: Framework::Flask,
                method,
                route,
                handler_name: function_name(*func_node, bytes).unwrap_or_default(),
                handler_location: locate(handler_at.row, handler_at.column),
                auth_required,
            }))
        });
        out.extend(routes);
    });
    out
}
/// Depth-first walk over `root` that calls `visit` for every
/// decorated function.
///
/// For each `decorated_definition` node the direct `decorator`
/// children are gathered in source order and, if the node also has a
/// `function_definition` child, `visit` receives that function plus
/// the decorators. Decorated definitions without a function child
/// and undecorated functions never reach `visit`. The walk then
/// descends into every child, so nested definitions are covered too.
fn walk_decorated<'tree, F>(root: Node<'tree>, bytes: &[u8], visit: &mut F)
where
    F: FnMut(&Node<'tree>, &[Node<'tree>]),
{
    if root.kind() == "decorated_definition" {
        let mut stack: Vec<Node<'tree>> = Vec::new();
        let mut target: Option<Node<'tree>> = None;
        let mut cursor = root.walk();
        for child in root.children(&mut cursor) {
            let kind = child.kind();
            if kind == "decorator" {
                stack.push(child);
            } else if kind == "function_definition" {
                target = Some(child);
            }
        }
        if let Some(function) = target {
            visit(&function, &stack);
        }
    }

    let mut cursor = root.walk();
    for child in root.children(&mut cursor) {
        walk_decorated(child, bytes, visit);
    }
}
/// Decide whether `decorator` declares a Flask route and, if so,
/// return its `(method, path)`.
///
/// Only call-form decorators on an attribute (`@recv.name(...)`)
/// whose receiver passes `receiver_is_flask` qualify:
///
/// * `name == "route"`: the method is the first recognised entry of
///   the `methods=` kwarg, or `GET` when there is none.
/// * any other `name`: accepted only if `HttpMethod::from_ident`
///   recognises it, and that is the method.
///
/// The path is the first string-literal argument, or an empty string
/// when the call has none.
fn flask_route_decorator(decorator: Node, bytes: &[u8]) -> Option<(HttpMethod, String)> {
    let mut cursor = decorator.walk();
    // First child that is neither the `@` token nor a comment.
    let expr = decorator
        .children(&mut cursor)
        .find(|child| !matches!(child.kind(), "@" | "comment"))?;
    if expr.kind() != "call" {
        return None;
    }
    let callee = expr.child_by_field_name("function")?;
    let args = expr.child_by_field_name("arguments");
    if callee.kind() != "attribute" {
        return None;
    }
    let receiver = callee.child_by_field_name("object")?;
    if !receiver_is_flask(receiver, bytes) {
        return None;
    }
    let verb = callee
        .child_by_field_name("attribute")?
        .utf8_text(bytes)
        .ok()?;
    let route_path = args
        .and_then(|list| first_string_arg(list, bytes))
        .unwrap_or_default();
    let method = if verb == "route" {
        args.and_then(|list| extract_first_method(list, bytes))
            .unwrap_or(HttpMethod::GET)
    } else {
        HttpMethod::from_ident(verb)?
    };
    Some((method, route_path))
}
/// Heuristic: does the decorator receiver look like a Flask app or
/// Blueprint binding?
///
/// * identifier, or the trailing segment of an attribute: accepted
///   when its ASCII-lowercased name is `app`, `bp` or `blueprint`,
///   or ends with `_app`, `_bp`, `blueprint` or `api`;
/// * call expression: accepted when the last dotted segment of the
///   callee is exactly `Flask` or `Blueprint`;
/// * anything else: rejected.
fn receiver_is_flask(object: Node, bytes: &[u8]) -> bool {
    const EXACT_NAMES: [&str; 3] = ["app", "bp", "blueprint"];
    const NAME_SUFFIXES: [&str; 4] = ["_app", "_bp", "blueprint", "api"];

    let looks_like_flask = |name: &str| {
        let lower = name.to_ascii_lowercase();
        EXACT_NAMES.contains(&lower.as_str())
            || NAME_SUFFIXES.iter().any(|suffix| lower.ends_with(*suffix))
    };

    let binding_name = match object.kind() {
        "identifier" => object.utf8_text(bytes).ok(),
        "attribute" => object
            .child_by_field_name("attribute")
            .and_then(|segment| segment.utf8_text(bytes).ok()),
        "call" => {
            // Inline construction such as `Blueprint("name", __name__)`.
            let callee_text = object
                .child_by_field_name("function")
                .and_then(|callee| callee.utf8_text(bytes).ok());
            return callee_text.is_some_and(|text| {
                let leaf = text.rsplit('.').next().unwrap_or(text).trim();
                leaf == "Flask" || leaf == "Blueprint"
            });
        }
        _ => return false,
    };
    binding_name.is_some_and(looks_like_flask)
}
/// Return the text of the first direct `string` child of an
/// `argument_list` node, or `None` when there is none.
///
/// This is how the route path is read from
/// `@app.route("/path", ...)`.
fn first_string_arg(args: Node, bytes: &[u8]) -> Option<String> {
    let mut cursor = args.walk();
    let literal = args
        .children(&mut cursor)
        .find(|arg| arg.kind() == "string");
    literal.map(|node| string_literal_text(node, bytes))
}
/// Return the content of a Python string literal with its prefix
/// letters and quote delimiters removed.
///
/// Handles every prefix letter Python allows (`b`, `r`, `f`, `u` in
/// either case and in any combination) and both single- and
/// triple-quoted forms. Exactly one matching pair of delimiters is
/// removed, so quote characters that belong to the content survive
/// (`"'x'"` yields `'x'`). Escape sequences are not decoded.
///
/// When the text does not have a recognisable `<delim>…<delim>`
/// shape, or is not valid UTF-8, the function degrades to trimming
/// any leading/trailing quote characters from whatever text it has.
fn string_literal_text(node: Node, bytes: &[u8]) -> String {
    let raw = node.utf8_text(bytes).unwrap_or("").trim();
    // Prefix letters can only occur before the opening quote, so it
    // is safe to drop them greedily from the front.
    let body = raw.trim_start_matches(['b', 'B', 'r', 'R', 'f', 'F', 'u', 'U']);
    // Triple-quoted delimiters are tried first so `"""x"""` is not
    // mistaken for a single-quoted literal containing quotes.
    for delim in ["\"\"\"", "'''", "\"", "'"] {
        let inner = body
            .strip_prefix(delim)
            .and_then(|rest| rest.strip_suffix(delim));
        if let Some(inner) = inner {
            return inner.to_string();
        }
    }
    body.trim_start_matches(['\'', '"'])
        .trim_end_matches(['\'', '"'])
        .to_string()
}
/// Extract the first HTTP method named in a `methods=[...]` kwarg.
///
/// Returns `None` when the argument list has no `methods` keyword or
/// when none of the string literals directly inside its value is
/// recognised by `HttpMethod::from_ident`. The first recognised
/// method wins; multi-method routes are recorded under it (Flask
/// itself runs the same handler for every listed method).
///
/// A keyword argument whose `name` or `value` field is missing (as
/// can happen in partially parsed source) is skipped rather than
/// ending the search, so a later well-formed `methods=` kwarg is
/// still found.
fn extract_first_method(args: Node, bytes: &[u8]) -> Option<HttpMethod> {
    let mut cursor = args.walk();
    for arg in args.children(&mut cursor) {
        if arg.kind() != "keyword_argument" {
            continue;
        }
        let is_methods_kwarg = arg
            .child_by_field_name("name")
            .and_then(|name| name.utf8_text(bytes).ok())
            .is_some_and(|name| name == "methods");
        if !is_methods_kwarg {
            continue;
        }
        let Some(value) = arg.child_by_field_name("value") else {
            continue;
        };
        let mut value_cursor = value.walk();
        for element in value.children(&mut value_cursor) {
            if element.kind() != "string" {
                continue;
            }
            let text = string_literal_text(element, bytes);
            if let Some(method) = HttpMethod::from_ident(&text) {
                return Some(method);
            }
        }
    }
    None
}
/// Report whether `decorator` is an authentication guard.
///
/// The decorator expression (or, for call-form decorators such as
/// `@jwt_required()`, its callee) is reduced to the text after the
/// last `.` and compared ASCII case-insensitively against
/// [`AUTH_DECORATORS`].
fn decorator_is_auth_marker(decorator: Node, bytes: &[u8]) -> bool {
    let mut cursor = decorator.walk();
    // First child that is neither the `@` token nor a comment.
    let expr = decorator
        .children(&mut cursor)
        .find(|child| !matches!(child.kind(), "@" | "comment"));
    let target = match expr {
        None => return false,
        Some(call) if call.kind() == "call" => call.child_by_field_name("function"),
        Some(plain) => Some(plain),
    };
    let Some(text) = target.and_then(|node| node.utf8_text(bytes).ok()) else {
        return false;
    };
    let leaf = text.rsplit('.').next().unwrap_or(text).trim();
    AUTH_DECORATORS
        .iter()
        .any(|marker| leaf.eq_ignore_ascii_case(marker))
}
/// Text of the `name` field of a `function_definition` node, or
/// `None` when the field is absent or is not valid UTF-8.
fn function_name(func: Node, bytes: &[u8]) -> Option<String> {
    func.child_by_field_name("name")
        .and_then(|ident| ident.utf8_text(bytes).ok())
        .map(String::from)
}
// Unit tests for the Flask probe. Each test parses an inline Python
// snippet and inspects the entry-points `detect_flask_routes` reports.
#[cfg(test)]
mod tests {
use super::*;
use std::path::PathBuf;
// Parse `src` with the tree-sitter Python grammar and return the tree
// together with an owned copy of the source bytes.
fn parse(src: &str) -> (Tree, Vec<u8>) {
let mut parser = tree_sitter::Parser::new();
parser
.set_language(&tree_sitter_python::LANGUAGE.into())
.unwrap();
let tree = parser.parse(src, None).unwrap();
(tree, src.as_bytes().to_vec())
}
// Run the probe on `src` as if it were `app.py`, with no scan root.
fn detect(src: &str) -> Vec<SurfaceNode> {
let (tree, bytes) = parse(src);
detect_flask_routes(&tree, &bytes, &PathBuf::from("app.py"), None)
}
// `@app.route` without `methods=` defaults to GET and is not auth-gated.
#[test]
fn detects_basic_route() {
let src = r#"
from flask import Flask
app = Flask(__name__)
@app.route("/hello")
def hello():
return "hi"
"#;
let nodes = detect(src);
assert_eq!(nodes.len(), 1);
if let SurfaceNode::EntryPoint(ep) = &nodes[0] {
assert_eq!(ep.route, "/hello");
assert_eq!(ep.method, HttpMethod::GET);
assert_eq!(ep.handler_name, "hello");
assert!(!ep.auth_required);
} else {
panic!("not an EntryPoint");
}
}
// The `methods=[...]` kwarg overrides the GET default.
#[test]
fn detects_methods_kwarg() {
let src = r#"
from flask import Flask
app = Flask(__name__)
@app.route("/submit", methods=["POST"])
def submit():
return "ok"
"#;
let nodes = detect(src);
let ep = match &nodes[0] {
SurfaceNode::EntryPoint(ep) => ep,
_ => panic!("not an EntryPoint"),
};
assert_eq!(ep.method, HttpMethod::POST);
}
// The verb shorthand (`@app.post`) maps straight to its method.
#[test]
fn detects_verb_decorator() {
let src = r#"
from flask import Flask
app = Flask(__name__)
@app.post("/users")
def create():
return "ok"
"#;
let nodes = detect(src);
let ep = match &nodes[0] {
SurfaceNode::EntryPoint(ep) => ep,
_ => panic!("not an EntryPoint"),
};
assert_eq!(ep.method, HttpMethod::POST);
}
// A `bp` receiver is accepted as a Blueprint binding.
#[test]
fn detects_blueprint() {
let src = r#"
from flask import Blueprint
bp = Blueprint("admin", __name__)
@bp.get("/admin")
def admin():
return "secret"
"#;
let nodes = detect(src);
let ep = match &nodes[0] {
SurfaceNode::EntryPoint(ep) => ep,
_ => panic!("not an EntryPoint"),
};
assert_eq!(ep.route, "/admin");
}
// `@login_required` anywhere in the stack sets `auth_required`.
#[test]
fn detects_auth_decorator() {
let src = r#"
from flask import Flask
from flask_login import login_required
app = Flask(__name__)
@app.route("/secret")
@login_required
def secret():
return "shh"
"#;
let nodes = detect(src);
let ep = match &nodes[0] {
SurfaceNode::EntryPoint(ep) => ep,
_ => panic!("not an EntryPoint"),
};
assert!(ep.auth_required);
}
// A verb decorator on a receiver outside the allowlist is ignored.
#[test]
fn rejects_non_flask_receiver() {
let src = r#"
client = requests.Session()
@client.get("/whatever")
def x():
pass
"#;
let nodes = detect(src);
// `client` does not match the Flask receiver allowlist.
assert!(nodes.is_empty());
}
}

398
src/surface/mod.rs Normal file
View file

@ -0,0 +1,398 @@
//! Phase 21 — attack-surface map.
//!
//! The `SurfaceMap` graph names the externally-reachable shape of the
//! project under analysis: HTTP route entry-points (Flask, FastAPI,
//! Spring, Express, …), the data stores they read/write, the external
//! services they talk to, and the local sinks they ultimately reach.
//!
//! Track G's chain composer walks this graph to translate findings into
//! cross-feature attack chains, and the `nyx surface` CLI prints a
//! human-readable tree from it. Phase 21 ships the graph types plus
//! the first framework probe (Python + Flask); Phase 22 generalises the
//! probe to the remaining languages and Phase 23 wires the CLI.
//!
//! Storage shape: a flat `Vec<SurfaceNode>` sorted by [`SourceLocation`]
//! and a flat `Vec<SurfaceEdge>` sorted by `(from_idx, to_idx, kind)`.
//! Both vectors are byte-deterministic, so two scans of the same source
//! produce byte-identical JSON when round-tripped through SQLite. See
//! [`graph::petgraph_view`] for a petgraph-backed view used by the
//! chain composer.
use crate::entry_points::HttpMethod;
use serde::{Deserialize, Serialize};
use std::collections::BTreeMap;
use std::path::Path;
pub mod build;
pub mod graph;
pub mod lang;
/// Stable source location used as the primary key for every
/// [`SurfaceNode`]. `file` is a project-relative POSIX path so the
/// SurfaceMap is portable across machines; `line` and `col` are
/// 1-indexed. Ordering is `(file, line, col)` lexicographic, matching
/// the determinism the rest of the analyser uses for spans.
#[derive(Debug, Clone, PartialEq, Eq, Hash, PartialOrd, Ord, Serialize, Deserialize)]
pub struct SourceLocation {
// Field order matters: the derived `Ord` compares `file`, then
// `line`, then `col`.
/// Path of the file the location points into.
pub file: String,
/// 1-indexed line number.
pub line: u32,
/// 1-indexed column number.
pub col: u32,
}
impl SourceLocation {
/// Build a location from anything convertible into a `String` path
/// plus a line and column. No validation is performed on the values.
pub fn new(file: impl Into<String>, line: u32, col: u32) -> Self {
Self {
file: file.into(),
line,
col,
}
}
}
/// Web-framework tag attached to every [`EntryPoint`]. The set is
/// fixed in Phase 21 + 22 and matches the set of framework probes
/// planned behind [`lang`]; in Phase 21 only the Flask probe exists,
/// so `Flask` is the only variant actually produced. New frameworks
/// land as new variants.
///
/// Serialised in snake_case (`#[serde(rename_all = "snake_case")]`).
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum Framework {
Flask,
FastApi,
Django,
Express,
Spring,
JaxRs,
Rails,
Sinatra,
Axum,
Actix,
Rocket,
NetHttp,
Gin,
NextAppRouter,
NextServerAction,
}
/// HTTP-handler entry-point recognised by a framework probe.
///
/// Every node carries the route's declared path string, HTTP method,
/// and a resolved handler [`SourceLocation`] pointing at the function
/// definition. `auth_required` is `true` when the decorator stack
/// (or framework equivalent) contains an auth guard the probe was
/// able to identify; Phase 21 recognises Flask decorators such as
/// `@login_required`, `@auth_required`, and `@jwt_required`.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct EntryPoint {
/// Where the route is declared. The Flask probe records the start
/// of the route decorator here.
pub location: SourceLocation,
/// Framework whose probe produced this entry.
pub framework: Framework,
/// HTTP method. For Flask `route(...)` decorators this is the first
/// recognised `methods=` entry, or GET when there is none.
pub method: HttpMethod,
/// Declared route path. The Flask probe stores an empty string when
/// the decorator has no string-literal argument.
pub route: String,
/// Handler function name; empty when the probe could not read it.
pub handler_name: String,
/// Start of the handler's function definition.
pub handler_location: SourceLocation,
/// Whether an auth guard was found on the handler.
pub auth_required: bool,
}
/// Persistent data store reachable from the surface — SQL database,
/// key-value store, document DB, blob store. Phase 22 populates this
/// from label-rule data-source matches and ORM-receiver type facts;
/// Phase 21 ships the type for forward-compat only and emits no
/// `DataStore` nodes.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct DataStore {
/// Source position the node is keyed on.
pub location: SourceLocation,
/// Category of store.
pub kind: DataStoreKind,
/// Free-form label; together with `kind` it disambiguates nodes
/// that share a location.
pub label: String,
}
/// Category of a [`DataStore`]. Serialised in snake_case.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum DataStoreKind {
Sql,
KeyValue,
Document,
BlobStore,
Filesystem,
Unknown,
}
/// External service the surface talks to over a network — third-party
/// HTTP API, message broker, search index. Phase 22 fills this in;
/// Phase 21 ships the type.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct ExternalService {
/// Source position the node is keyed on.
pub location: SourceLocation,
/// Category of service.
pub kind: ExternalServiceKind,
/// Free-form label; together with `kind` it disambiguates nodes
/// that share a location.
pub label: String,
}
/// Category of an [`ExternalService`]. Serialised in snake_case.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum ExternalServiceKind {
HttpApi,
MessageBroker,
SearchIndex,
AuthProvider,
Unknown,
}
/// Local sink with no externally observable side-effect — `eval`,
/// `pickle.loads`, `subprocess.Popen`, raw SQL execute, etc. Phase 22
/// fills this in from the existing label-rule registry; Phase 21
/// ships the type.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct DangerousLocal {
/// Source position the node is keyed on.
pub location: SourceLocation,
/// Name of the sink function.
pub function_name: String,
/// Capability bitmask for the sink.
// NOTE(review): bit meanings are defined outside this file —
// presumably the label-rule capability flags; confirm.
pub cap_bits: u32,
}
/// A node in the [`SurfaceMap`]. Every variant carries a
/// [`SourceLocation`] so the surface ordering is total and stable.
///
/// Serialised with an internal `node` tag holding the snake_case
/// variant name (`#[serde(tag = "node", rename_all = "snake_case")]`).
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
#[serde(tag = "node", rename_all = "snake_case")]
pub enum SurfaceNode {
/// HTTP route handler found by a framework probe.
EntryPoint(EntryPoint),
/// Persistent store; not emitted in Phase 21.
DataStore(DataStore),
/// Networked dependency; not emitted in Phase 21.
ExternalService(ExternalService),
/// Dangerous local sink; not emitted in Phase 21.
DangerousLocal(DangerousLocal),
}
impl SurfaceNode {
pub fn location(&self) -> &SourceLocation {
match self {
SurfaceNode::EntryPoint(n) => &n.location,
SurfaceNode::DataStore(n) => &n.location,
SurfaceNode::ExternalService(n) => &n.location,
SurfaceNode::DangerousLocal(n) => &n.location,
}
}
/// Discriminator used as a secondary sort key so two nodes that
/// happen to share a [`SourceLocation`] (e.g. multiple route
/// decorators on one function) keep a deterministic relative
/// order. Returns the variant index in the enum declaration.
fn kind_ordinal(&self) -> u8 {
match self {
SurfaceNode::EntryPoint(_) => 0,
SurfaceNode::DataStore(_) => 1,
SurfaceNode::ExternalService(_) => 2,
SurfaceNode::DangerousLocal(_) => 3,
}
}
/// Tertiary sort key used to disambiguate nodes that share both
/// [`SourceLocation`] and kind — e.g. a single Flask function with
/// two `@app.route(...)` decorators ending up at the same handler
/// location.
fn dedup_tag(&self) -> String {
match self {
SurfaceNode::EntryPoint(n) => format!("{:?}:{:?}:{}", n.framework, n.method, n.route),
SurfaceNode::DataStore(n) => format!("{:?}:{}", n.kind, n.label),
SurfaceNode::ExternalService(n) => format!("{:?}:{}", n.kind, n.label),
SurfaceNode::DangerousLocal(n) => format!("{}:{:#x}", n.function_name, n.cap_bits),
}
}
}
/// Semantic kind of an edge in the [`SurfaceMap`]. Encodes the
/// seven edge classes the chain composer walks; persistence is via
/// JSON so adding a variant is a non-breaking schema change as long
/// as the SQLite-level migration drops the old surface_map rows.
///
/// Serialised in snake_case. The derived `Ord` follows declaration
/// order, so reordering variants changes how edges sort.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Ord, PartialOrd, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum EdgeKind {
/// Caller → callee. Wraps the call-graph edge so consumers do
/// not have to consult [`crate::callgraph::CallGraph`] directly.
Calls,
/// Function or entry-point reads from a data store / external
/// service.
ReadsFrom,
/// Function or entry-point writes to a data store.
WritesTo,
/// Function or entry-point sends a request to an external
/// service.
TalksTo,
/// Entry-point reaches a dangerous-local sink through some
/// transitive call chain.
Reaches,
/// Entry-point triggers a side-effecting action (job, email,
/// webhook) other than a direct call.
Triggers,
/// Entry-point gates downstream access on a successful auth
/// check. The `from` is the auth-check node, the `to` is the
/// entry-point.
AuthRequiredOn,
}
/// A single edge in the [`SurfaceMap`]. `from` and `to` are indices
/// into [`SurfaceMap::nodes`]; the surface ordering keeps these
/// stable across rescans.
///
/// [`SurfaceMap::canonicalize`] rewrites both indices whenever it reorders
/// the node list. The derived `Ord` compares `from`, then `to`, then `kind`,
/// which is the order edges end up in after canonicalisation.
#[derive(Debug, Clone, PartialEq, Eq, Hash, Ord, PartialOrd, Serialize, Deserialize)]
pub struct SurfaceEdge {
    /// Index of the source node in [`SurfaceMap::nodes`].
    pub from: u32,
    /// Index of the target node in [`SurfaceMap::nodes`].
    pub to: u32,
    /// What the relationship between the two nodes means.
    pub kind: EdgeKind,
}
/// The attack-surface graph for a project. Stored as two flat `Vec`s —
/// the nodes, and edges that refer to nodes by index — so that, once
/// canonicalised, JSON serialisation is byte-deterministic and SQLite
/// round-trips are stable.
#[derive(Debug, Clone, Default, PartialEq, Eq, Serialize, Deserialize)]
pub struct SurfaceMap {
    /// Every node in the graph. [`SurfaceEdge`] indices point into this list.
    pub nodes: Vec<SurfaceNode>,
    /// Directed, typed edges between entries of `nodes`.
    pub edges: Vec<SurfaceEdge>,
}
impl SurfaceMap {
    /// Construct an empty map (no nodes, no edges).
    pub fn new() -> Self {
        Self::default()
    }

    /// Total node count. Cheap.
    pub fn node_count(&self) -> usize {
        self.nodes.len()
    }

    /// Total edge count. Cheap.
    pub fn edge_count(&self) -> usize {
        self.edges.len()
    }

    /// Return the first entry-point node, in current node order, whose
    /// method and route both equal the arguments. `route` is compared as an
    /// exact string.
    ///
    /// Linear scan; the SurfaceMap is small (one node per route + store +
    /// service + sink) so this is fine in practice.
    pub fn entry_for_route(&self, method: HttpMethod, route: &str) -> Option<&EntryPoint> {
        self.entry_points()
            .find(|ep| ep.method == method && ep.route == route)
    }

    /// Iterate over every entry-point node in surface order, skipping all
    /// other node kinds.
    pub fn entry_points(&self) -> impl Iterator<Item = &EntryPoint> {
        self.nodes.iter().filter_map(|n| match n {
            SurfaceNode::EntryPoint(ep) => Some(ep),
            _ => None,
        })
    }

    /// Sort nodes by `(SourceLocation, kind_ordinal, dedup_tag)`, rewrite
    /// every edge's `from`/`to` to the new node positions, then sort and
    /// de-duplicate the edges.
    ///
    /// After [`canonicalize`] + `serde_json::to_vec`, two maps holding the
    /// same nodes and edges are byte-identical regardless of insertion order,
    /// provided no two distinct nodes share the full sort key. The node sort
    /// is stable, so nodes with equal keys keep their relative insertion
    /// order, and nodes themselves are never de-duplicated.
    ///
    /// Edge endpoints that do not refer to an existing node are left as-is.
    ///
    /// [`canonicalize`]: SurfaceMap::canonicalize
    pub fn canonicalize(&mut self) {
        // Take ownership so nodes can be *moved* into their final slots
        // rather than deep-cloned.
        let old_nodes = std::mem::take(&mut self.nodes);

        // `order[new_idx]` is the pre-sort index of the node that belongs at
        // `new_idx`. The cached-key sort builds each key (and its
        // `dedup_tag` string) once per node instead of once per comparison;
        // like `sort_by`, it is stable.
        let mut order: Vec<usize> = (0..old_nodes.len()).collect();
        order.sort_by_cached_key(|&old_idx| {
            let node = &old_nodes[old_idx];
            (node.location(), node.kind_ordinal(), node.dedup_tag())
        });

        // old index -> new index. Edge endpoints are `u32` by design, so the
        // casts only truncate for maps with more than `u32::MAX` nodes.
        let remap: BTreeMap<u32, u32> = order
            .iter()
            .enumerate()
            .map(|(new_idx, &old_idx)| (old_idx as u32, new_idx as u32))
            .collect();

        // Apply the permutation by moving each node out of its old slot.
        let mut slots: Vec<Option<SurfaceNode>> = old_nodes.into_iter().map(Some).collect();
        self.nodes = order
            .iter()
            .map(|&old_idx| {
                slots[old_idx]
                    .take()
                    .expect("sort order is a permutation of the node indices")
            })
            .collect();

        for edge in &mut self.edges {
            if let Some(&new_from) = remap.get(&edge.from) {
                edge.from = new_from;
            }
            if let Some(&new_to) = remap.get(&edge.to) {
                edge.to = new_to;
            }
        }

        // Equal edges are indistinguishable, so an unstable sort yields the
        // same result as a stable one.
        self.edges.sort_unstable();
        self.edges.dedup();
    }

    /// Serialize to deterministic JSON. The map is canonicalised in place
    /// first (hence `&mut self`); structurally identical maps emit
    /// byte-identical JSON.
    ///
    /// # Errors
    ///
    /// Returns whatever error `serde_json::to_vec` reports.
    pub fn to_json(&mut self) -> serde_json::Result<Vec<u8>> {
        self.canonicalize();
        serde_json::to_vec(self)
    }

    /// Deserialize from JSON. Does not canonicalise; the producer is
    /// responsible for emitting a canonicalised payload.
    ///
    /// # Errors
    ///
    /// Returns the `serde_json` error when `bytes` is not a valid
    /// serialised map.
    pub fn from_json(bytes: &[u8]) -> serde_json::Result<Self> {
        serde_json::from_slice(bytes)
    }
}
/// Render `path` as a forward-slash path string, relative to `scan_root`
/// when possible.
///
/// When `scan_root` is given and is a component-wise prefix of `path`, the
/// prefix is stripped; otherwise (no root, or the file lies outside it) the
/// whole path is used. In every case non-UTF-8 bytes are replaced lossily and
/// each backslash is rewritten to `/`, so the result is not strictly verbatim.
/// A path equal to the root yields an empty string.
pub fn relative_path_string(path: &Path, scan_root: Option<&Path>) -> String {
    let shown = scan_root
        .and_then(|root| path.strip_prefix(root).ok())
        .unwrap_or(path);
    shown.to_string_lossy().replace('\\', "/")
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Shorthand for building a [`SourceLocation`].
    fn loc(file: &str, line: u32, col: u32) -> SourceLocation {
        SourceLocation::new(file, line, col)
    }

    /// Build a Flask entry-point node at `file:line`, column 1, whose handler
    /// `h` sits on the following line and requires no auth.
    fn ep(file: &str, line: u32, route: &str, method: HttpMethod) -> SurfaceNode {
        let entry = EntryPoint {
            location: loc(file, line, 1),
            framework: Framework::Flask,
            method,
            route: route.into(),
            handler_name: "h".into(),
            handler_location: loc(file, line + 1, 1),
            auth_required: false,
        };
        SurfaceNode::EntryPoint(entry)
    }

    /// Nodes inserted out of order come back sorted by location, and the one
    /// edge follows its endpoints to their new indices.
    #[test]
    fn canonicalize_sorts_nodes_and_remaps_edges() {
        let mut map = SurfaceMap {
            nodes: vec![
                ep("b.py", 10, "/b", HttpMethod::GET),
                ep("a.py", 5, "/a", HttpMethod::GET),
            ],
            edges: vec![SurfaceEdge {
                from: 0,
                to: 1,
                kind: EdgeKind::Calls,
            }],
        };

        map.canonicalize();

        assert_eq!(map.nodes[0].location().file, "a.py");
        assert_eq!(map.nodes[1].location().file, "b.py");
        // The edge started as b.py (old 0) -> a.py (old 1); after the sort
        // b.py sits at index 1 and a.py at index 0.
        let SurfaceEdge { from, to, .. } = &map.edges[0];
        assert_eq!(*from, 1);
        assert_eq!(*to, 0);
    }

    /// Serialise -> deserialise -> serialise yields the same bytes.
    #[test]
    fn json_round_trip_byte_identical() {
        let mut original = SurfaceMap {
            nodes: vec![
                ep("a.py", 1, "/a", HttpMethod::GET),
                ep("b.py", 2, "/b", HttpMethod::POST),
            ],
            edges: vec![SurfaceEdge {
                from: 0,
                to: 1,
                kind: EdgeKind::Calls,
            }],
        };

        let first_pass = original.to_json().unwrap();
        let mut reloaded = SurfaceMap::from_json(&first_pass).unwrap();
        let second_pass = reloaded.to_json().unwrap();

        assert_eq!(first_pass, second_pass);
    }
}

183
tests/surface_flask.rs Normal file
View file

@ -0,0 +1,183 @@
//! Phase 21 — `SurfaceMap` Python + Flask vertical.
//!
//! Five-route Flask fixture exercising:
//!
//! * `@app.route("/", methods=["GET"])` explicit GET via methods kwarg
//! * `@app.route("/submit", methods=["POST"])` POST via methods kwarg
//! * `@app.get("/users")` verb decorator
//! * `@bp.post("/admin")` Blueprint receiver
//! * `@app.route("/secret")` + `@login_required` auth-guarded, default GET
//!
//! Asserts every route node appears with the correct `method`, `route`,
//! `auth_required`, and `handler_name`. Round-trips the surface map
//! through SQLite and confirms the byte representation is identical to
//! the in-memory canonical JSON.
use nyx_scanner::commands::index::build_index;
use nyx_scanner::commands::scan::scan_with_index_parallel;
use nyx_scanner::database::index::Indexer;
use nyx_scanner::entry_points::HttpMethod;
use nyx_scanner::surface::{Framework, SurfaceMap, SurfaceNode};
use nyx_scanner::utils::config::{AnalysisMode, Config};
use std::path::Path;
use std::sync::Arc;
/// Scanner configuration shared by the tests in this file: the default
/// config switched to full analysis, with VCS-ignore handling off, one
/// worker thread, a batch size of 8 and a channel multiplier of 1.
fn test_cfg() -> Config {
    let mut config = Config::default();

    // Scanner behaviour.
    config.scanner.mode = AnalysisMode::Full;
    config.scanner.require_git_to_read_vcsignore = false;
    config.scanner.read_vcsignore = false;

    // Performance knobs — presumably pinned low to keep runs small and
    // repeatable; confirm before relaxing.
    config.performance.channel_multiplier = 1;
    config.performance.batch_size = 8;
    config.performance.worker_threads = Some(1);

    config
}
/// Flask application source that `seed_flask_fixture` writes to `app.py`.
///
/// It declares five routes: two via `@app.route` with a `methods` kwarg, one
/// via the `@app.get` verb decorator, one via `@bp.post` on a Blueprint, and
/// one bare `@app.route` combined with `@login_required`.
///
/// NOTE(review): `@login_required` is listed *above* `@app.route("/secret")`.
/// In a running Flask app the route decorator would then register the
/// undecorated function, leaving the view unguarded (Flask's docs say
/// `route()` must be the outermost decorator). The test below expects
/// `auth_required == true` for this route — confirm the scanner is meant to
/// treat this ordering as guarded rather than flag it.
const FIVE_ROUTE_FIXTURE: &str = r#"
from flask import Flask, Blueprint
from flask_login import login_required
app = Flask(__name__)
bp = Blueprint("admin", __name__)
@app.route("/", methods=["GET"])
def index():
return "home"
@app.route("/submit", methods=["POST"])
def submit():
return "ok"
@app.get("/users")
def list_users():
return "users"
@bp.post("/admin")
def admin_create():
return "created"
@login_required
@app.route("/secret")
def secret():
return "shh"
"#;
/// Write the five-route Flask fixture to `<root>/app.py`, panicking if the
/// file cannot be written.
fn seed_flask_fixture(root: &Path) {
    let target = root.join("app.py");
    std::fs::write(target, FIVE_ROUTE_FIXTURE).unwrap();
}
/// Index and scan the five-route fixture, load the persisted surface map,
/// and check each route's method, handler name, framework and auth flag.
#[test]
fn surface_map_captures_five_flask_routes() {
    let cfg = test_cfg();

    let project = tempfile::tempdir().unwrap();
    seed_flask_fixture(project.path());
    let db_dir = tempfile::tempdir().unwrap();
    let db_path = db_dir.path().join("surface.sqlite");

    build_index("surface", project.path(), &db_path, &cfg, false)
        .expect("build_index on flask fixture should succeed");

    let pool = Indexer::init(&db_path).expect("re-init pool");
    let _ = scan_with_index_parallel("surface", Arc::clone(&pool), &cfg, false, project.path())
        .expect("indexed scan should succeed");

    let idx = Indexer::from_pool("surface", &pool).expect("from_pool");
    let map = idx
        .load_surface_map()
        .expect("load_surface_map ok")
        .expect("surface map persisted after scan");

    let entries: Vec<_> = map.entry_points().collect();
    assert_eq!(
        entries.len(),
        5,
        "expected five Flask routes, got {entries:#?}",
    );

    // (method, route, handler name, auth required)
    let expected = [
        (HttpMethod::GET, "/", "index", false),
        (HttpMethod::POST, "/submit", "submit", false),
        (HttpMethod::GET, "/users", "list_users", false),
        (HttpMethod::POST, "/admin", "admin_create", false),
        (HttpMethod::GET, "/secret", "secret", true),
    ];

    for (method, route, handler, auth) in expected {
        let ep = match map.entry_for_route(method, route) {
            Some(found) => found,
            None => panic!("missing route {method:?} {route}; map = {entries:#?}"),
        };

        assert_eq!(ep.framework, Framework::Flask, "framework mismatch on {route}");
        assert_eq!(ep.handler_name, handler, "handler mismatch on {route}");
        assert_eq!(
            ep.auth_required, auth,
            "auth mismatch on {route} (got {})",
            ep.auth_required
        );
        // The handler must be located in the fixture file itself.
        assert!(
            ep.handler_location.file.ends_with("app.py"),
            "handler location not in app.py: {:?}",
            ep.handler_location.file
        );
    }
}
/// Scan the same fixture twice against one database and check that the
/// persisted surface-map bytes are stable across rescans and survive an
/// in-memory deserialise -> canonicalise -> serialise round trip.
#[test]
fn surface_map_round_trips_byte_identical_through_sqlite() {
    let cfg = test_cfg();

    let project = tempfile::tempdir().unwrap();
    seed_flask_fixture(project.path());
    let db_dir = tempfile::tempdir().unwrap();
    let db_path = db_dir.path().join("rt.sqlite");

    build_index("rt", project.path(), &db_path, &cfg, false).expect("first build_index");
    let pool = Indexer::init(&db_path).expect("first pool");

    // First scan: persist and read back the raw bytes.
    let _ = scan_with_index_parallel("rt", Arc::clone(&pool), &cfg, false, project.path())
        .expect("first scan");
    let first_indexer = Indexer::from_pool("rt", &pool).expect("first from_pool");
    let bytes_first = first_indexer
        .load_surface_map_bytes()
        .expect("load bytes 1")
        .expect("surface map persisted 1");
    drop(first_indexer);

    // Second scan against the same DB with unchanged sources: the stored
    // canonical surface map must come out byte-for-byte the same.
    let _ = scan_with_index_parallel("rt", Arc::clone(&pool), &cfg, false, project.path())
        .expect("second scan");
    let second_indexer = Indexer::from_pool("rt", &pool).expect("second from_pool");
    let bytes_second = second_indexer
        .load_surface_map_bytes()
        .expect("load bytes 2")
        .expect("surface map persisted 2");
    assert_eq!(
        bytes_first, bytes_second,
        "surface_map JSON must be byte-identical across rescans"
    );

    // In-memory round trip: deserialise, canonicalise, serialise again.
    // The result must reproduce the on-disk bytes exactly.
    let mut map = SurfaceMap::from_json(&bytes_first).expect("from_json");
    let bytes_after_round_trip = map.to_json().expect("to_json");
    assert_eq!(
        bytes_first, bytes_after_round_trip,
        "canonical JSON must match round-tripped JSON"
    );

    // Light sanity check: the deserialised map still names all five fixture
    // routes, i.e. persistence did not lose any entry-point node.
    let entries: Vec<&str> = map
        .nodes
        .iter()
        .filter_map(|node| {
            if let SurfaceNode::EntryPoint(ep) = node {
                Some(ep.route.as_str())
            } else {
                None
            }
        })
        .collect();
    for route in ["/", "/submit", "/users", "/admin", "/secret"] {
        assert!(
            entries.iter().any(|seen| *seen == route),
            "route {route} missing after round trip; got {entries:?}",
        );
    }
}