nyx/src/dynamic/environment.rs

1307 lines
48 KiB
Rust

//! Project dependency capture + workdir staging (Phase 09 — Track D.1 + D.2).
//!
//! [`capture_project_dependencies`] reads the user's project root and
//! produces a [`CapturedDeps`] record describing every artifact the
//! harness will need at runtime — toolchain pin, direct imports of the
//! entry file, web framework signal, and local config files reachable
//! from the entry point. [`stage_workdir`] then materialises a minimal
//! copy of those artifacts into the per-spec workdir so the sandboxed
//! harness can `import flask` (or its per-language equivalent) inside an
//! offline sandbox without leaking the whole project tree across the
//! filesystem boundary.
//!
//! The lang-specific manifest (`requirements.txt`, `package.json`,
//! `Cargo.toml`, …) is then synthesised by the per-language emitter via
//! [`crate::dynamic::lang::LangEmitter::materialize_runtime`] from the
//! [`Environment`] handed back by `stage_workdir`.
//!
//! ## Scope
//!
//! - Direct imports of the spec's entry file (line-based textual scan of the
//! file's leading bytes for `import` / `require` / `use` statements —
//! transitive imports are deferred to a future phase).
//! - Framework deps inferred from [`crate::utils::project::detect_frameworks`].
//! - Local config files reachable from the entry point's directory
//! (`config.yaml`, `config.yml`, `.env`, `appsettings.json`, plus the
//! toolchain-resolver-recognised manifest itself).
//! - Source files reached via reverse callgraph closure from the sink's
//! enclosing function. Bounded by [`MAX_WORKDIR_BYTES`] so a
//! pathological closure does not copy the entire repository.
//!
//! The staged workdir is intentionally minimalist: every file copied has
//! to either be the entry, a dep manifest, a config file, or an in-closure
//! source file. The 10 MiB ceiling protects against runaway full-tree
//! copy regressions called out in the Phase 09 acceptance.
use crate::callgraph::{callers_of, CallGraph};
use crate::dynamic::spec::HarnessSpec;
use crate::dynamic::toolchain::{self, ToolchainResolution};
use crate::summary::GlobalSummaries;
use crate::symbol::{FuncKey, Lang};
use crate::utils::project::{detect_frameworks, DetectedFramework};
use std::collections::HashSet;
use std::io;
use std::path::{Path, PathBuf};
// ── Phase 11 — Track D.4: deterministic secret derivation ────────────────────
/// Marker prefix carried by every derived secret.
///
/// [`derive_secret`] prepends this to the truncated digest so that a value
/// leaking out of a harness is immediately recognisable as a Nyx stub
/// rather than a real credential.
pub const SECRET_VALUE_PREFIX: &str = "nyx-stub-";
/// Deterministic stand-in for a secret environment variable.
///
/// Built by [`derive_secret`] from `BLAKE3(spec_hash || '|' || env_var_name)`
/// and prefixed with [`SECRET_VALUE_PREFIX`]. Because the inputs fully
/// determine the value, two harness invocations under the same
/// [`HarnessSpec`] observe identical credentials — but never the user's
/// real secret.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct SecretValue(String);

impl SecretValue {
    /// Borrow the raw value, ready to drop into `env`.
    pub fn as_str(&self) -> &str {
        self.0.as_str()
    }

    /// Consume the wrapper and hand back the owned string.
    pub fn into_string(self) -> String {
        let SecretValue(inner) = self;
        inner
    }
}

impl std::fmt::Display for SecretValue {
    /// Writes the raw value verbatim; formatter width / fill flags are not
    /// applied because the string is emitted via `write_str`.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        f.write_str(self.as_str())
    }
}
/// Derive the deterministic placeholder for `env_var_name` under `spec_hash`.
///
/// The digest is `BLAKE3(spec_hash || '|' || env_var_name)`; its first 32
/// hex characters (128 bits) are appended to [`SECRET_VALUE_PREFIX`],
/// giving `"nyx-stub-{hex}"`. The `|` separator keeps
/// `("abc", "DEF")` and `("abcDEF", "")` from hashing the same byte stream.
///
/// Truncating to 32 hex characters keeps the value short enough to sit
/// comfortably in URLs, JSON config blobs, and POSIX argv without
/// inflating the env footprint.
pub fn derive_secret(spec_hash: &str, env_var_name: &str) -> SecretValue {
    // Number of leading hex digits of the digest that end up in the value.
    const HEX_DIGITS: usize = 32;

    let mut hasher = blake3::Hasher::new();
    hasher
        .update(spec_hash.as_bytes())
        .update(b"|")
        .update(env_var_name.as_bytes());
    let digest_hex = hasher.finalize().to_hex();
    let truncated = &digest_hex.as_str()[..HEX_DIGITS];

    SecretValue([SECRET_VALUE_PREFIX, truncated].concat())
}
/// Collect the env-var names that `entry_file` references literally.
///
/// The file is matched against the standard env access spellings of
/// `lang`:
///
/// | Lang | Patterns |
/// |---|---|
/// | Python | `os.environ.get("X")`, `os.environ["X"]`, `os.getenv("X")` |
/// | JS/TS | `process.env.X`, `process.env["X"]` |
/// | Java | `System.getenv("X")` |
/// | Rust | `std::env::var("X")`, `env::var("X")` |
/// | Go | `os.Getenv("X")`, `os.LookupEnv("X")` |
/// | PHP | `getenv("X")`, `$_ENV["X"]`, `$_SERVER["X"]` |
/// | Ruby | `ENV["X"]`, `ENV.fetch("X")` |
/// | C/C++ | `getenv("X")` |
///
/// This is a plain substring scan over whatever `read_bounded` returns
/// (presumably capped at [`IMPORT_SCAN_LIMIT`] — confirm against
/// `read_bounded`). There is no AST: a call with a non-literal argument
/// such as `os.environ.get(some_var)` is intentionally skipped, so only
/// literal references populate the secret bag.
///
/// Names are de-duplicated. The result is ordered pattern by pattern, and
/// within one pattern by position in the file. An unreadable or non-UTF-8
/// file yields an empty vector.
pub fn extract_env_var_references(entry_file: &Path, lang: Lang) -> Vec<String> {
    let raw = match read_bounded(entry_file) {
        None => return Vec::new(),
        Some(buf) => buf,
    };
    let text = match std::str::from_utf8(&raw) {
        Err(_) => return Vec::new(),
        Ok(decoded) => decoded,
    };

    let needles: &[&str] = match lang {
        Lang::Python => &[
            "os.environ.get(",
            "os.environ[",
            "os.getenv(",
            "environ.get(",
            "environ[",
            "getenv(",
        ],
        Lang::JavaScript | Lang::TypeScript => &["process.env.", "process.env["],
        Lang::Java => &["System.getenv(", "getenv("],
        Lang::Rust => &["std::env::var(", "env::var(", "env::var_os(", "std::env::var_os("],
        Lang::Go => &["os.Getenv(", "os.LookupEnv("],
        Lang::Php => &["getenv(", "$_ENV[", "$_SERVER["],
        Lang::Ruby => &["ENV[", "ENV.fetch(", "ENV.fetch "],
        Lang::C | Lang::Cpp => &["getenv("],
    };
    let is_js = matches!(lang, Lang::JavaScript | Lang::TypeScript);

    let mut names: Vec<String> = Vec::new();
    let mut already_seen: HashSet<String> = HashSet::new();
    for &needle in needles {
        // `process.env.FOO` names the variable as a bare identifier; every
        // other spelling passes it as a quoted literal.
        let bare_identifier = is_js && needle == "process.env.";
        for (pos, _) in text.match_indices(needle) {
            let tail = &text[pos + needle.len()..];
            let candidate = if bare_identifier {
                extract_identifier_name(tail)
            } else {
                extract_quoted_arg(tail)
            };
            match candidate {
                Some(name) if is_env_var_name(&name) => {
                    if already_seen.insert(name.clone()) {
                        names.push(name);
                    }
                }
                _ => {}
            }
        }
    }
    names
}
/// Pull a quoted literal argument off the front of `s`.
///
/// Leading spaces and tabs are skipped. The next character must be a
/// quote — `"`, `'`, or a backtick — and the literal runs up to the next
/// occurrence of that same quote character.
///
/// Returns `None` when the first non-blank character is not a quote (the
/// argument is dynamic, so the scanner deliberately skips it), when a
/// newline appears before the closing quote, or when the closing quote is
/// missing.
fn extract_quoted_arg(s: &str) -> Option<String> {
    let rest = s.trim_start_matches(|c: char| c == ' ' || c == '\t');
    let mut chars = rest.chars();
    let quote = chars
        .next()
        .filter(|c| matches!(*c, '"' | '\'' | '`'))?;
    let body = chars.as_str();

    // Whichever comes first decides the outcome: closing quote or newline.
    let stop = body.find(|c: char| c == quote || c == '\n')?;
    if body[stop..].starts_with('\n') {
        return None;
    }
    Some(body[..stop].to_owned())
}
/// Read a bare identifier (e.g. `FOO` in `process.env.FOO`) off the front
/// of `s`.
///
/// The identifier is the longest leading run of ASCII letters, digits, and
/// underscores. Returns `None` when that run is empty.
fn extract_identifier_name(s: &str) -> Option<String> {
    let len = s
        .bytes()
        .take_while(|b| b.is_ascii_alphanumeric() || *b == b'_')
        .count();
    if len == 0 {
        None
    } else {
        // Every counted byte is ASCII, so `len` is a char boundary.
        Some(s[..len].to_owned())
    }
}
/// Permissive shape check for an env-var name.
///
/// Accepts a leading ASCII letter or underscore followed by any number of
/// `[A-Za-z0-9_]`. The empty string is rejected. This filters out
/// blatantly bogus parses (e.g. when the quoted scanner picks up `{`).
fn is_env_var_name(s: &str) -> bool {
    let is_tail_byte = |b: &u8| b.is_ascii_alphanumeric() || *b == b'_';
    match s.as_bytes().split_first() {
        None => false,
        Some((head, tail)) => {
            (head.is_ascii_alphabetic() || *head == b'_') && tail.iter().all(is_tail_byte)
        }
    }
}
/// Assemble the per-spec secret bag.
///
/// Every env var that [`extract_env_var_references`] finds in `entry_file`
/// becomes one `(name, derive_secret(spec_hash, name))` pair.
///
/// The pairs keep the order produced by [`extract_env_var_references`],
/// so two runs against the same inputs produce byte-identical env
/// layouts.
pub fn build_secret_bag(
    entry_file: &Path,
    lang: Lang,
    spec_hash: &str,
) -> Vec<(String, String)> {
    extract_env_var_references(entry_file, lang)
        .into_iter()
        .map(|name| {
            let stub = derive_secret(spec_hash, &name).into_string();
            (name, stub)
        })
        .collect()
}
/// Hard upper bound (10 MiB) on the bytes a staged workdir may consume
/// after `stage_workdir` returns.
///
/// Phase 09 acceptance pins this value so a pathological full-tree copy
/// regression is caught at the test boundary rather than ballooning the
/// sandbox into the user's whole repo. Enforcement happens per file in
/// `copy_into_workdir`, which refuses a copy that would push the running
/// total past this limit.
pub const MAX_WORKDIR_BYTES: u64 = 10 * 1024 * 1024;
/// Bytes scanned for `import` / `require` / `use` statements when the
/// per-language extractor is asked to enumerate the entry file's direct
/// dependencies (64 KiB).
///
/// 64 KiB covers every reasonable header / preamble; we intentionally do
/// not walk the whole file because the import shape almost always lives at
/// the top.
///
/// NOTE(review): the constant is not referenced in this part of the file;
/// presumably `read_bounded` applies it — confirm there.
const IMPORT_SCAN_LIMIT: usize = 64 * 1024;
/// Names of common config files reachable from the entry point.
///
/// `collect_config_files` probes each name with `dir.join(name).is_file()`
/// in the project root and, when the entry file sits in a different
/// directory beneath the root, in the entry file's own directory. It never
/// recurses into subdirectories — that's intentional: the harness boots
/// from `workdir/` and a config file is only picked up when it sits
/// directly in one of those two directories.
const CONFIG_FILE_CANDIDATES: &[&str] = &[
"config.yaml",
"config.yml",
".env",
"appsettings.json",
"settings.json",
"config.toml",
"config.json",
];
/// Per-language manifest files (lockfile + manifest pair) recognised by
/// the toolchain resolver.
///
/// When present at `project_root`, these are copied verbatim into the
/// staged workdir so the build sandbox sees the user's pinned dependency
/// set. Order is significant only insofar as the first name that exists on
/// disk becomes [`CapturedDeps::lockfile`].
const MANIFEST_FILES_BY_LANG: &[(Lang, &[&str])] = &[
(Lang::Python, &["requirements.txt", "pyproject.toml", "Pipfile", "Pipfile.lock"]),
(Lang::JavaScript, &["package.json", "package-lock.json", "yarn.lock", "pnpm-lock.yaml"]),
(Lang::TypeScript, &["package.json", "package-lock.json", "yarn.lock", "tsconfig.json"]),
(Lang::Rust, &["Cargo.toml", "Cargo.lock"]),
(Lang::Go, &["go.mod", "go.sum"]),
(Lang::Java, &["pom.xml", "build.gradle", "build.gradle.kts"]),
(Lang::Php, &["composer.json", "composer.lock"]),
(Lang::Ruby, &["Gemfile", "Gemfile.lock"]),
(Lang::C, &["Makefile", "CMakeLists.txt"]),
(Lang::Cpp, &["Makefile", "CMakeLists.txt"]),
];
/// Static-analysis output captured from the project, ready to be staged
/// into the harness workdir.
///
/// Returned by [`capture_project_dependencies`] and consumed by
/// [`stage_workdir`]. The struct deliberately separates *capture* (read
/// the project tree, no writes) from *staging* (write the workdir), so a
/// future phase can persist `CapturedDeps` to disk and re-stage without
/// re-walking the source.
#[derive(Debug, Clone)]
pub struct CapturedDeps {
/// Project root used as the read anchor, stored exactly as it was passed
/// to the capture call.
pub project_root: PathBuf,
/// Path to the entry file: the spec's `entry_file` when that is already
/// absolute, otherwise `project_root` joined with it.
pub entry_file: PathBuf,
/// Resolved language toolchain pin for the spec's language.
pub toolchain: ToolchainResolution,
/// Package / module names imported by [`Self::entry_file`], as found by
/// the per-language import scanner (de-duplicated, in file order).
///
/// The per-language `materialize_runtime` impl pins each entry to the
/// project's framework version when possible, or to a known-good
/// recent version otherwise.
pub direct_deps: Vec<String>,
/// Web frameworks detected from the project root. Surfaced as a
/// separate field (rather than folded into `direct_deps`) so the
/// emitters can decide whether to pin to a specific framework
/// version even when the entry file imports the framework
/// transitively.
pub frameworks: Vec<DetectedFramework>,
/// Three-valued lang-has-framework signal: the result of calling
/// `lang_has_web_framework` on the detected framework context with the
/// spec language's slug.
pub framework_signal: Option<bool>,
/// Paths of the local config files that exist in the project root or in
/// the entry file's directory. Each is copied verbatim into the workdir
/// during [`stage_workdir`].
pub config_files: Vec<PathBuf>,
/// Source files reachable from the sink's enclosing function via
/// reverse callgraph edges.
///
/// NOTE(review): the capture docs say this degrades to just the entry
/// file when summaries / callgraph are absent — confirm against
/// `compute_source_closure`.
pub source_closure: Vec<PathBuf>,
/// Manifest files (lockfile + project manifest pair) that exist directly
/// under `project_root` for the spec's language, in the order listed in
/// [`MANIFEST_FILES_BY_LANG`].
pub manifests: Vec<PathBuf>,
/// First recognised manifest file (== `manifests[0]` when present).
/// Used by the per-language emitter as the canonical lockfile when
/// synthesising the staged manifest.
pub lockfile: Option<PathBuf>,
}
/// Runtime environment handle owned by the staging step.
///
/// Holds everything the per-language `materialize_runtime` impl needs to
/// emit a pinned manifest, plus the workdir handle so the staged paths
/// resolve correctly. Construction is owned by [`stage_workdir`] and its
/// siblings, all of which funnel into [`stage_workdir_full`].
#[derive(Debug, Clone)]
pub struct Environment {
/// Hash of the originating spec, copied here so the emitter can include
/// it in the manifest comment header for forensic traceability. Empty
/// when staging went through the legacy [`stage_workdir`] entry point.
pub spec_hash: String,
/// Path of the workdir that was just staged, stored as passed to the
/// staging call.
pub workdir: PathBuf,
/// Location inside the workdir of the first manifest that was copied
/// (e.g. `workdir/requirements.txt`). `None` when no manifest was
/// staged.
pub lockfile: Option<PathBuf>,
/// Files materialised into the workdir, as paths *relative* to the
/// workdir root (e.g. `"src/handler.py"`). Despite the name this also
/// lists the staged manifests and config files: every successful copy
/// made during staging is recorded here.
pub staged_sources: Vec<PathBuf>,
/// Environment variables the harness should set before invoking the
/// entry point. Populated by [`build_secret_bag`] during
/// [`stage_workdir_full`] (Phase 11 — Track D.4) with deterministic
/// stub values for every env var the entry file literally
/// references. Phase 10 stub endpoints (SQL DB path, HTTP origin
/// URL, etc.) are layered on top by the verifier via
/// [`crate::dynamic::sandbox::SandboxOptions::extra_env`].
pub env_vars: Vec<(String, String)>,
/// Stub registry handles. Reserved for the Phase 10 stub-injection
/// layer; [`stage_workdir_full`] always leaves this empty.
pub stub_handles: Vec<String>,
/// Language-toolchain pin cloned from [`CapturedDeps::toolchain`] so the
/// emitter does not need both inputs.
pub toolchain: ToolchainResolution,
/// Direct deps the entry imports, cloned from
/// [`CapturedDeps::direct_deps`].
pub direct_deps: Vec<String>,
/// Frameworks detected in the project root, cloned from
/// [`CapturedDeps::frameworks`].
pub frameworks: Vec<DetectedFramework>,
/// Language the environment was staged for: either passed explicitly to
/// [`stage_workdir_full`] or guessed from the toolchain id by the
/// convenience wrappers.
pub lang: Lang,
}
/// Manifest / lockfile artifacts the harness build needs next to the
/// generated source. Returned by
/// [`crate::dynamic::lang::LangEmitter::materialize_runtime`].
///
/// Shaped like [`crate::dynamic::lang::HarnessSource::extra_files`] so the
/// harness staging path can write the manifest through the existing
/// extra-files loop.
#[derive(Debug, Clone, Default)]
pub struct RuntimeArtifacts {
    /// `(relative_path, contents)` pairs written under `Environment::workdir`.
    pub files: Vec<(String, String)>,
}

impl RuntimeArtifacts {
    /// Create an empty artifact set.
    pub fn new() -> Self {
        RuntimeArtifacts { files: Vec::new() }
    }

    /// Append one `(rel_path, content)` artifact to the end of the list.
    pub fn push(&mut self, rel_path: impl Into<String>, content: impl Into<String>) {
        let entry: (String, String) = (rel_path.into(), content.into());
        self.files.push(entry);
    }
}
/// Walk the user's project tree to assemble the runtime dependencies the
/// harness needs.
///
/// Reads only — never writes. The returned [`CapturedDeps`] is the
/// single input to [`stage_workdir`], which is the sole owner of the
/// workdir filesystem mutations.
///
/// Always returns a populated record: missing inputs are best-effort and
/// fall back to defaults (system toolchain, empty deps). The function
/// never fails — every failure mode (manifest unreadable, entry file
/// missing) is folded into the returned record.
pub fn capture_project_dependencies(project_root: &Path, spec: &HarnessSpec) -> CapturedDeps {
capture_project_dependencies_with_context(project_root, spec, None, None)
}
/// Context-aware variant of [`capture_project_dependencies`] that can
/// consult the whole-program [`CallGraph`] and [`GlobalSummaries`].
///
/// Both optional inputs are forwarded to `compute_source_closure`, which
/// fills [`CapturedDeps::source_closure`]. Per the design, supplying both
/// yields the reverse-edge closure from the sink's enclosing function,
/// while omitting either shrinks the closure to the entry file alone —
/// staging still works for the simple case but cross-file helpers are not
/// copied across.
///
/// Every other field is derived from `project_root` and `spec` only:
/// the resolved entry path, the toolchain pin, the entry file's direct
/// imports, detected frameworks, nearby config files, and the manifests
/// for the spec's language (the first of which becomes the lockfile).
pub fn capture_project_dependencies_with_context(
    project_root: &Path,
    spec: &HarnessSpec,
    summaries: Option<&GlobalSummaries>,
    callgraph: Option<&CallGraph>,
) -> CapturedDeps {
    let lang = spec.lang;

    let entry_file = resolve_under_root(project_root, &spec.entry_file);
    let toolchain = resolve_toolchain_for_lang(lang, project_root);
    let direct_deps = extract_direct_deps(&entry_file, lang);

    let detected = detect_frameworks(project_root);
    let frameworks = detected.frameworks.clone();
    let framework_signal = detected.lang_has_web_framework(framework_slug_for_lang(lang));

    let config_files = collect_config_files(&entry_file, project_root);
    let manifests = collect_manifest_files(lang, project_root);
    // The first manifest found doubles as the canonical lockfile.
    let lockfile = manifests.first().cloned();
    let source_closure =
        compute_source_closure(&entry_file, project_root, spec, summaries, callgraph);

    CapturedDeps {
        project_root: project_root.to_path_buf(),
        entry_file,
        toolchain,
        direct_deps,
        frameworks,
        framework_signal,
        config_files,
        source_closure,
        manifests,
        lockfile,
    }
}
/// Materialise a minimal copy of the project into `workdir`.
///
/// Writes (in order):
/// 1. The entry file itself (under its source-tree-relative path so
/// relative `from .x import y` works inside the workdir).
/// 2. Every file in `captured.source_closure`, preserving the
/// `project_root`-relative layout.
/// 3. Every manifest file in `captured.manifests`.
/// 4. Every local config file in `captured.config_files`.
///
/// Each write checks the running workdir size against
/// [`MAX_WORKDIR_BYTES`] and stops early on overflow; the function
/// returns `io::ErrorKind::FileTooLarge` in that case so the caller can
/// surface a `Inconclusive(WorkdirOverflow)` verdict in a future phase.
///
/// The returned [`Environment`] is the sole handle subsequent emitters
/// consult; callers must not assume the workdir is otherwise mutated
/// outside of this function (the harness builder still writes the
/// generated source via [`crate::dynamic::harness::build`]).
pub fn stage_workdir(captured: &CapturedDeps, workdir: &Path) -> io::Result<Environment> {
let lang = guess_lang_for_toolchain(&captured.toolchain.toolchain_id);
stage_workdir_full(captured, workdir, "", lang)
}
/// Same as [`stage_workdir`], except that the caller threads the
/// originating spec hash into the resulting [`Environment`].
///
/// The language is still guessed from the captured toolchain id; use
/// [`stage_workdir_full`] to pin it explicitly.
pub fn stage_workdir_with_spec_hash(
    captured: &CapturedDeps,
    workdir: &Path,
    spec_hash: &str,
) -> io::Result<Environment> {
    stage_workdir_full(
        captured,
        workdir,
        spec_hash,
        guess_lang_for_toolchain(&captured.toolchain.toolchain_id),
    )
}
/// Fully parameterised staging routine behind [`stage_workdir`]: the caller
/// pins the [`Environment`]'s [`Lang`] explicitly instead of having it
/// guessed from the toolchain id. Used by the integration tests and by
/// harness staging plumbing that already has a [`HarnessSpec`] in scope.
///
/// Creates `workdir` if needed, then copies — in order — the entry file,
/// the remaining source-closure files, the manifests, and the config
/// files. Files that no longer exist are skipped. Source-closure and
/// manifest files outside `project_root` are skipped too; the entry file
/// and config files fall back to their bare file name in that case.
///
/// # Errors
///
/// Propagates any error from directory creation or from
/// `copy_into_workdir`, including the size-limit error raised when the
/// staged bytes would exceed [`MAX_WORKDIR_BYTES`].
pub fn stage_workdir_full(
    captured: &CapturedDeps,
    workdir: &Path,
    spec_hash: &str,
    lang: Lang,
) -> io::Result<Environment> {
    std::fs::create_dir_all(workdir)?;

    let root: &Path = captured.project_root.as_path();
    let entry: &Path = captured.entry_file.as_path();
    let mut total_bytes: u64 = 0;
    let mut staged: Vec<PathBuf> = Vec::new();

    // 1. Entry file — keep the project-relative layout when the entry
    //    lives under the project root, otherwise use just its file name.
    if entry.exists() {
        let rel = match rel_under_root(entry, root) {
            Some(inside) => inside,
            None => PathBuf::from(entry.file_name().unwrap_or_default()),
        };
        total_bytes = copy_into_workdir(entry, workdir, &rel, total_bytes, &mut staged)?;
    }

    // 2. Source closure — every other reachable file that still exists
    //    and sits under the project root.
    let closure_files = captured
        .source_closure
        .iter()
        .filter(|candidate| candidate.as_path() != entry && candidate.exists());
    for src in closure_files {
        if let Some(rel) = rel_under_root(src, root) {
            total_bytes = copy_into_workdir(src, workdir, &rel, total_bytes, &mut staged)?;
        }
    }

    // 3. Manifests (project-relative). The first one copied becomes the
    //    environment's lockfile.
    let mut staged_lockfile: Option<PathBuf> = None;
    for manifest in captured.manifests.iter().filter(|m| m.exists()) {
        if let Some(rel) = rel_under_root(manifest, root) {
            total_bytes =
                copy_into_workdir(manifest, workdir, &rel, total_bytes, &mut staged)?;
            if staged_lockfile.is_none() {
                staged_lockfile = Some(workdir.join(&rel));
            }
        }
    }

    // 4. Config files — project-relative layout, or the bare file name
    //    when the file is not under the project root.
    for cfg in captured.config_files.iter().filter(|c| c.exists()) {
        let rel = rel_under_root(cfg, root)
            .unwrap_or_else(|| PathBuf::from(cfg.file_name().unwrap_or_default()));
        total_bytes = copy_into_workdir(cfg, workdir, &rel, total_bytes, &mut staged)?;
    }

    // Phase 11 — Track D.4: populate the per-spec secret bag for every
    // env var the entry file literally references. `spec_hash` is empty
    // for the legacy [`stage_workdir`] entry point; the derived values are
    // still deterministic per env-var name in that case, but two distinct
    // specs would then share them. Callers with a real spec hash should
    // use [`stage_workdir_full`] / [`stage_workdir_with_spec_hash`].
    let env_vars = build_secret_bag(entry, lang, spec_hash);

    Ok(Environment {
        spec_hash: spec_hash.to_owned(),
        workdir: workdir.to_path_buf(),
        lockfile: staged_lockfile,
        staged_sources: staged,
        env_vars,
        stub_handles: Vec::new(),
        toolchain: captured.toolchain.clone(),
        direct_deps: captured.direct_deps.clone(),
        frameworks: captured.frameworks.clone(),
        lang,
    })
}
/// Best-effort mapping from a toolchain id to a [`Lang`].
///
/// The id is first reduced to a language slug by prefix, then handed to
/// `Lang::from_slug`. Anything that does not resolve falls back to
/// [`Lang::Python`].
fn guess_lang_for_toolchain(toolchain_id: &str) -> Lang {
    let slug = framework_slug_for_lang_for_toolchain(toolchain_id);
    let parsed = Lang::from_slug(slug);
    parsed.unwrap_or(Lang::Python)
}
// ── Helpers ──────────────────────────────────────────────────────────────────
/// Copy `src` to `workdir/rel`, enforcing the workdir size budget.
///
/// `running_bytes` is the number of bytes staged so far; the return value
/// is the new running total. On success `rel` is appended to `staged`.
///
/// A source whose metadata cannot be read is skipped silently: the
/// running total comes back unchanged and nothing is copied.
///
/// # Errors
///
/// Returns an `io::ErrorKind::Other` error, before touching the
/// filesystem, when adding the file would push the total past
/// [`MAX_WORKDIR_BYTES`]. Failures creating the destination directory or
/// copying the file are propagated.
fn copy_into_workdir(
    src: &Path,
    workdir: &Path,
    rel: &Path,
    running_bytes: u64,
    staged: &mut Vec<PathBuf>,
) -> io::Result<u64> {
    let size = match std::fs::metadata(src) {
        Err(_) => return Ok(running_bytes),
        Ok(meta) => meta.len(),
    };

    let projected_total = running_bytes.saturating_add(size);
    if projected_total > MAX_WORKDIR_BYTES {
        let message = format!(
            "staged workdir would exceed {} bytes (next file `{}` = {} bytes)",
            MAX_WORKDIR_BYTES,
            rel.display(),
            size
        );
        return Err(io::Error::new(io::ErrorKind::Other, message));
    }

    let dest = workdir.join(rel);
    match dest.parent() {
        Some(dir) => std::fs::create_dir_all(dir)?,
        None => {}
    }
    std::fs::copy(src, &dest)?;
    staged.push(rel.to_path_buf());
    Ok(projected_total)
}
/// Turn the spec's `entry_file` string into a path.
///
/// An absolute `entry_file` is returned unchanged; a relative one is
/// joined onto `project_root`. No filesystem access takes place.
fn resolve_under_root(project_root: &Path, entry_file: &str) -> PathBuf {
    let candidate = Path::new(entry_file);
    if candidate.is_absolute() {
        candidate.to_path_buf()
    } else {
        project_root.join(candidate)
    }
}
/// Express `path` relative to `root`.
///
/// Both paths are canonicalised first; when canonicalisation fails (for
/// instance because the path does not exist) the path is used as given.
/// Returns `None` when `path` does not lie under `root`.
fn rel_under_root(path: &Path, root: &Path) -> Option<PathBuf> {
    let canonical_or_raw =
        |p: &Path| -> PathBuf { p.canonicalize().unwrap_or_else(|_| p.to_path_buf()) };

    let resolved_path = canonical_or_raw(path);
    let resolved_root = canonical_or_raw(root);
    match resolved_path.strip_prefix(&resolved_root) {
        Ok(relative) => Some(relative.to_path_buf()),
        Err(_) => None,
    }
}
/// Dispatch to the toolchain resolver that matches `lang`.
///
/// Ruby, C, and C++ are handed to the Python resolver; no dedicated
/// resolver is called for them here.
fn resolve_toolchain_for_lang(lang: Lang, project_root: &Path) -> ToolchainResolution {
    match lang {
        Lang::JavaScript | Lang::TypeScript => toolchain::resolve_node(project_root),
        Lang::Rust => toolchain::resolve_rust(project_root),
        Lang::Go => toolchain::resolve_go(project_root),
        Lang::Java => toolchain::resolve_java(project_root),
        Lang::Php => toolchain::resolve_php(project_root),
        Lang::Python => toolchain::resolve_python(project_root),
        // Fallback for the remaining languages.
        Lang::Ruby | Lang::C | Lang::Cpp => toolchain::resolve_python(project_root),
    }
}
/// Lower-case slug under which `lang` is looked up in the detected
/// framework context.
fn framework_slug_for_lang(lang: Lang) -> &'static str {
    match lang {
        Lang::C => "c",
        Lang::Cpp => "cpp",
        Lang::Go => "go",
        Lang::Java => "java",
        Lang::JavaScript => "javascript",
        Lang::Php => "php",
        Lang::Python => "python",
        Lang::Ruby => "ruby",
        Lang::Rust => "rust",
        Lang::TypeScript => "typescript",
    }
}
/// Map a toolchain id to a language slug by prefix.
///
/// The prefixes are tried in table order and the first match wins. An id
/// that matches none of them maps to `"python"`.
fn framework_slug_for_lang_for_toolchain(toolchain_id: &str) -> &'static str {
    // (toolchain-id prefix, language slug), checked top to bottom.
    const PREFIX_TO_SLUG: [(&str, &str); 6] = [
        ("python", "python"),
        ("node", "javascript"),
        ("rust", "rust"),
        ("go", "go"),
        ("java", "java"),
        ("php", "php"),
    ];

    PREFIX_TO_SLUG
        .iter()
        .find(|(prefix, _)| toolchain_id.starts_with(*prefix))
        .map(|(_, slug)| *slug)
        .unwrap_or("python")
}
/// Find the well-known config files that sit next to the project root or
/// next to the entry file.
///
/// The project root is always searched. The entry file's directory is
/// searched as well when it differs from the root and lies beneath it.
/// Within each directory the names of [`CONFIG_FILE_CANDIDATES`] are
/// probed in order; only regular files are kept and no path is returned
/// twice.
fn collect_config_files(entry_file: &Path, project_root: &Path) -> Vec<PathBuf> {
    let entry_dir = entry_file
        .parent()
        .filter(|dir| *dir != project_root && dir.starts_with(project_root));
    let search_dirs = std::iter::once(project_root).chain(entry_dir);

    let mut visited: HashSet<PathBuf> = HashSet::new();
    let mut found: Vec<PathBuf> = Vec::new();
    for dir in search_dirs {
        for name in CONFIG_FILE_CANDIDATES {
            let candidate = dir.join(name);
            if candidate.is_file() && visited.insert(candidate.clone()) {
                found.push(candidate);
            }
        }
    }
    found
}
/// List the manifest files for `lang` that exist directly under
/// `project_root`.
///
/// The candidate names come from [`MANIFEST_FILES_BY_LANG`] and the result
/// keeps that table's order. A language without a table entry yields an
/// empty list.
fn collect_manifest_files(lang: Lang, project_root: &Path) -> Vec<PathBuf> {
    let candidates: &[&str] = MANIFEST_FILES_BY_LANG
        .iter()
        .find_map(|(table_lang, files)| if *table_lang == lang { Some(*files) } else { None })
        .unwrap_or(&[]);

    candidates
        .iter()
        .map(|name| project_root.join(name))
        .filter(|path| path.is_file())
        .collect()
}
/// Scan `entry_file` for import statements and return the package names
/// they mention.
///
/// The bytes returned by `read_bounded` are decoded as UTF-8 and handed to
/// the extractor for `lang`; every language has its own extractor. An
/// unreadable or non-UTF-8 file produces an empty list rather than an
/// error, so staging proceeds with no imports.
pub(crate) fn extract_direct_deps(entry_file: &Path, lang: Lang) -> Vec<String> {
    let raw = match read_bounded(entry_file) {
        None => return Vec::new(),
        Some(buf) => buf,
    };
    match std::str::from_utf8(&raw) {
        Err(_) => Vec::new(),
        Ok(head) => match lang {
            Lang::Python => extract_python_imports(head),
            Lang::JavaScript | Lang::TypeScript => extract_js_imports(head),
            Lang::Ruby => extract_ruby_imports(head),
            Lang::Php => extract_php_imports(head),
            Lang::Go => extract_go_imports(head),
            Lang::Java => extract_java_imports(head),
            Lang::Rust => extract_rust_imports(head),
            Lang::C | Lang::Cpp => extract_c_includes(head),
        },
    }
}
/// Extract the top-level package names imported by Python `source`.
///
/// Line-based scan; recognises
///
/// - `from X.Y import Z` → `X` (relative imports such as `from . import x`
///   or `from .pkg import y` are skipped),
/// - `import X.Y` / `import X.Y as Z` → `X`,
/// - `import X, Y as Z, W.V` → `X`, `Y`, `W` (every comma-separated clause
///   contributes, not just the first).
///
/// Comment lines and trailing `# …` comments are ignored, and extra
/// whitespace after the keyword is tolerated. Names are de-duplicated and
/// returned in order of first appearance.
fn extract_python_imports(source: &str) -> Vec<String> {
    /// Leading identifier of a dotted module path (`a.b.c` → `a`); also
    /// drops trailing punctuation such as `;`.
    fn top_level_package(module: &str) -> &str {
        let end = module
            .find(|c: char| !(c.is_alphanumeric() || c == '_'))
            .unwrap_or(module.len());
        &module[..end]
    }

    let mut out: Vec<String> = Vec::new();
    let mut seen: HashSet<String> = HashSet::new();
    for line in source.lines() {
        let line = line.trim_start();
        if line.is_empty() || line.starts_with('#') {
            continue;
        }
        // Drop a trailing comment so it cannot be mistaken for a clause.
        let code = line.split('#').next().unwrap_or(line);

        let modules: Vec<&str> = if let Some(rest) = code.strip_prefix("from ") {
            rest.split_whitespace().next().into_iter().collect()
        } else if let Some(rest) = code.strip_prefix("import ") {
            // Each clause is `module` or `module as alias`; keep the module.
            rest.split(',')
                .filter_map(|clause| clause.split_whitespace().next())
                .collect()
        } else {
            continue;
        };

        for module in modules {
            // A leading dot marks a relative (project-local) import.
            if module.starts_with('.') {
                continue;
            }
            let package = top_level_package(module);
            if !package.is_empty() && !seen.contains(package) {
                seen.insert(package.to_owned());
                out.push(package.to_owned());
            }
        }
    }
    out
}
/// Extract the package names imported by JavaScript / TypeScript `source`.
///
/// Line-based scan; on every line it looks for
///
/// - `… from 'pkg'` (ES module import / re-export),
/// - `require('pkg')` (CommonJS, including `import x = require('pkg')`),
/// - `import 'pkg'` (side-effect import; a trailing `;` or comment after
///   the literal is ignored).
///
/// Specifiers that are empty or start with `.` or `/` are local paths and
/// are skipped. A scoped package keeps its `@scope/name` prefix; any other
/// specifier is reduced to its first path segment (`lodash/fp` →
/// `lodash`). Names are de-duplicated and returned in order of first
/// appearance.
fn extract_js_imports(source: &str) -> Vec<String> {
    /// Contents of the first quoted literal in `text`, if it is closed on
    /// the same line by the same quote character.
    fn first_quoted(text: &str) -> Option<&str> {
        let open = text.find(['\'', '"', '`'])?;
        let quote = text[open..].chars().next()?;
        let body = &text[open + 1..];
        let close = body.find(quote)?;
        Some(&body[..close])
    }

    /// Canonical package name for a module specifier, or `None` for local
    /// paths and empty specifiers.
    fn canonical_package(specifier: &str) -> Option<String> {
        if specifier.is_empty() || specifier.starts_with('.') || specifier.starts_with('/') {
            return None;
        }
        if specifier.starts_with('@') {
            // Scoped pkg (`@scope/name`) keeps the full two-segment prefix.
            let mut parts = specifier.splitn(3, '/');
            return Some(match (parts.next(), parts.next()) {
                (Some(scope), Some(name)) => format!("{}/{}", scope, name),
                _ => specifier.to_owned(),
            });
        }
        Some(specifier.split('/').next().unwrap_or(specifier).to_owned())
    }

    let mut out: Vec<String> = Vec::new();
    let mut seen: HashSet<String> = HashSet::new();
    for line in source.lines() {
        let line = line.trim_start();

        // `import x from 'pkg'`
        let from_spec = line
            .find("from ")
            .and_then(|idx| first_quoted(&line[idx + "from ".len()..]));
        // `require('pkg')`
        let require_spec = line
            .find("require(")
            .and_then(|idx| first_quoted(&line[idx + "require(".len()..]));
        // Side-effect import: `import 'pkg'`. Only a line whose `import`
        // keyword is followed directly by a string literal qualifies.
        let bare_spec = line
            .strip_prefix("import ")
            .map(str::trim_start)
            .filter(|rest| rest.starts_with(['\'', '"', '`']))
            .and_then(first_quoted);

        for specifier in [from_spec, require_spec, bare_spec].iter().flatten() {
            if let Some(package) = canonical_package(specifier) {
                if !seen.contains(&package) {
                    seen.insert(package.clone());
                    out.push(package);
                }
            }
        }
    }
    out
}
/// Extract the names pulled in by `require` / `require_relative` lines of
/// Ruby `source`.
///
/// Only lines that start (after indentation) with `require ` or
/// `require_relative ` are considered. The argument is trimmed, stripped
/// of surrounding single or double quotes, and reduced to its first `/`
/// segment (`net/http` → `net`). Empty arguments are skipped; names are
/// de-duplicated and returned in order of first appearance.
fn extract_ruby_imports(source: &str) -> Vec<String> {
    let mut seen: HashSet<String> = HashSet::new();
    source
        .lines()
        .filter_map(|raw| {
            let line = raw.trim_start();
            let argument = line
                .strip_prefix("require_relative ")
                .or_else(|| line.strip_prefix("require "))?;
            let literal = argument
                .trim()
                .trim_matches(|c: char| c == '\'' || c == '"');
            if literal.is_empty() {
                return None;
            }
            Some(literal.split('/').next().unwrap_or(literal).to_owned())
        })
        .filter(|package| seen.insert(package.clone()))
        .collect()
}
/// Extract the names referenced by `use` / `require_once` / `require` /
/// `include` lines of PHP `source`.
///
/// Only lines that start (after indentation) with one of those keywords
/// followed by a space are considered. The remainder is trimmed, loses
/// trailing `;` characters and surrounding quotes, and is cut at the first
/// namespace separator (`Foo\Bar` → `Foo`). Empty results are skipped;
/// names are de-duplicated and returned in order of first appearance.
fn extract_php_imports(source: &str) -> Vec<String> {
    // Keywords that introduce an import-like statement, tried in order.
    const KEYWORDS: [&str; 4] = ["use ", "require_once ", "require ", "include "];

    let mut seen: HashSet<String> = HashSet::new();
    source
        .lines()
        .filter_map(|raw| {
            let line = raw.trim_start();
            let argument = KEYWORDS
                .iter()
                .find_map(|keyword| line.strip_prefix(*keyword))?;
            let target = argument
                .trim()
                .trim_end_matches(';')
                .trim_matches(|c: char| c == '\'' || c == '"');
            if target.is_empty() {
                return None;
            }
            Some(target.split('\\').next().unwrap_or(target).to_owned())
        })
        .filter(|package| seen.insert(package.clone()))
        .collect()
}
/// Extract the import paths declared by Go `source`.
///
/// Handles both the single-line form (`import "fmt"`,
/// `import alias "pkg"`) and the parenthesised block form. In either form
/// the import path is the first quoted (`"…"` or backtick) literal on the
/// line, so an alias in front of it and a trailing `// comment` behind it
/// are ignored. Comment-only lines contribute nothing. Paths are
/// de-duplicated and returned in order of first appearance.
fn extract_go_imports(source: &str) -> Vec<String> {
    /// Contents of the first string literal in `spec`, if it is closed on
    /// the same line by the same quote character.
    fn quoted_path(spec: &str) -> Option<&str> {
        let open = spec.find(['"', '`'])?;
        let quote = spec[open..].chars().next()?;
        let body = &spec[open + 1..];
        let close = body.find(quote)?;
        Some(&body[..close])
    }

    let mut out: Vec<String> = Vec::new();
    let mut seen: HashSet<String> = HashSet::new();
    let mut in_block = false;
    for line in source.lines() {
        let line = line.trim();
        // A comment line never declares an import, inside a block or not.
        if line.starts_with("//") {
            continue;
        }

        let spec = if in_block {
            if line.starts_with(')') {
                in_block = false;
                continue;
            }
            line
        } else {
            let rest = match line.strip_prefix("import") {
                Some(rest) => rest,
                None => continue,
            };
            // Require a separator so identifiers such as `importer` do not
            // count as the keyword.
            if !rest.starts_with([' ', '\t', '(']) {
                continue;
            }
            let rest = rest.trim_start();
            match rest.strip_prefix('(') {
                Some(inner) => {
                    // The block stays open unless it is closed on this line.
                    in_block = !inner.contains(')');
                    inner
                }
                None => rest,
            }
        };

        if let Some(path) = quoted_path(spec) {
            if !path.is_empty() && !seen.contains(path) {
                seen.insert(path.to_owned());
                out.push(path.to_owned());
            }
        }
    }
    out
}
/// Collects the top-level package names imported by Java `source`.
///
/// Only lines that start (after leading whitespace) with `import ` are
/// considered. The optional `static` modifier is skipped, the trailing `;`
/// is removed, and the first dotted segment of what remains is recorded:
/// `import java.util.List;` and `import static java.lang.Math.max;` both
/// yield `java`. Results are deduplicated in first-seen order.
fn extract_java_imports(source: &str) -> Vec<String> {
    let mut out: Vec<String> = Vec::new();
    let mut seen: HashSet<String> = HashSet::new();
    for line in source.lines() {
        let line = line.trim_start();
        let Some(rest) = line.strip_prefix("import ") else {
            continue;
        };
        let rest = rest.trim_start();
        // `import static pkg.Type.member;` — the modifier is not part of the name.
        let rest = rest.strip_prefix("static ").unwrap_or(rest);
        let trimmed = rest.trim().trim_end_matches(';');
        if trimmed.is_empty() {
            continue;
        }
        // Top-level Java package = first dotted segment.
        let pkg = trimmed.split('.').next().unwrap_or(trimmed);
        if pkg.is_empty() {
            continue;
        }
        if !seen.contains(pkg) {
            seen.insert(pkg.to_owned());
            out.push(pkg.to_owned());
        }
    }
    out
}
/// Collects the crate names referenced by `use` and `extern crate` lines in
/// Rust `source`.
///
/// Only lines that start (after leading whitespace) with `use ` or
/// `extern crate ` are considered. The crate name is the first `::` segment
/// of the path, cut at the first space or comma so `extern crate libc as c;`
/// yields `libc`. A leading `::` (global path, `use ::serde::Serialize;`) is
/// skipped before the first segment is taken. The path keywords `self`,
/// `super` and `crate` are project-internal and are not reported. Results
/// are deduplicated in first-seen order.
fn extract_rust_imports(source: &str) -> Vec<String> {
    let mut out: Vec<String> = Vec::new();
    let mut seen: HashSet<String> = HashSet::new();
    for line in source.lines() {
        let line = line.trim_start();
        let Some(rest) = line
            .strip_prefix("use ")
            .or_else(|| line.strip_prefix("extern crate "))
        else {
            continue;
        };
        let trimmed = rest.trim().trim_end_matches(';');
        // `use ::serde::X;` — without this the first `::` segment is empty.
        let trimmed = trimmed.strip_prefix("::").unwrap_or(trimmed);
        if trimmed.is_empty() {
            continue;
        }
        let crate_name = trimmed
            .split("::")
            .next()
            .unwrap_or(trimmed)
            .split([' ', ','])
            .next()
            .unwrap_or(trimmed);
        if crate_name.is_empty() || matches!(crate_name, "self" | "super" | "crate") {
            continue;
        }
        if !seen.contains(crate_name) {
            seen.insert(crate_name.to_owned());
            out.push(crate_name.to_owned());
        }
    }
    out
}
/// Collects the header names referenced by `#include` directives in C/C++
/// `source`.
///
/// Only lines that start (after leading whitespace) with `#include` are
/// considered. For the `<header>` and `"header"` forms the name is the text
/// between the opening delimiter and its matching close, so anything after
/// the directive (such as a trailing comment) is ignored; an unterminated
/// delimiter takes the rest of the line. Any other operand (for example a
/// macro name) is recorded as written, minus trailing `>` / `"` characters.
/// Results are deduplicated in first-seen order.
fn extract_c_includes(source: &str) -> Vec<String> {
    let mut out: Vec<String> = Vec::new();
    let mut seen: HashSet<String> = HashSet::new();
    for line in source.lines() {
        let line = line.trim_start();
        let Some(rest) = line.strip_prefix("#include") else {
            continue;
        };
        let rest = rest.trim();
        let header = if let Some(body) = rest.strip_prefix('<') {
            body.split('>').next().unwrap_or(body)
        } else if let Some(body) = rest.strip_prefix('"') {
            body.split('"').next().unwrap_or(body)
        } else {
            rest.trim_end_matches('>').trim_end_matches('"')
        };
        if header.is_empty() {
            continue;
        }
        if !seen.contains(header) {
            seen.insert(header.to_owned());
            out.push(header.to_owned());
        }
    }
    out
}
/// Reads the beginning of the file at `path`, stopping after at most
/// `IMPORT_SCAN_LIMIT` bytes.
///
/// Returns `None` when the file cannot be opened or the read fails;
/// otherwise returns the bytes read (possibly fewer than the limit).
fn read_bounded(path: &Path) -> Option<Vec<u8>> {
    use std::io::Read;
    let handle = std::fs::File::open(path).ok()?;
    // `take` caps the number of bytes `read_to_end` may pull from the file.
    let mut limited = std::io::BufReader::new(handle).take(IMPORT_SCAN_LIMIT as u64);
    let mut bytes: Vec<u8> = Vec::new();
    match limited.read_to_end(&mut bytes) {
        Ok(_) => Some(bytes),
        Err(_) => None,
    }
}
/// Reverse-edge callgraph closure for the spec's sink file.
///
/// The result always starts with `entry_file`. When both `summaries` and
/// `callgraph` are available, the walk is seeded with every [`FuncKey`] whose
/// namespace resolves to the spec's sink file, then repeatedly follows
/// [`callers_of`] edges outward. The file of every function visited
/// (its namespace resolved under `project_root`) is appended once, in
/// first-visited order. When either input is missing, only the entry file is
/// returned.
///
/// The walk uses a stack (most recently discovered caller first) and does
/// not stop at the entry file; it ends when no unvisited callers remain or
/// after `MAX_STEPS` functions have been processed, whichever comes first.
fn compute_source_closure(
    entry_file: &Path,
    project_root: &Path,
    spec: &HarnessSpec,
    summaries: Option<&GlobalSummaries>,
    callgraph: Option<&CallGraph>,
) -> Vec<PathBuf> {
    /// Upper bound on the number of functions expanded during the walk.
    const MAX_STEPS: usize = 256;

    let mut out: Vec<PathBuf> = Vec::new();
    let mut seen: HashSet<PathBuf> = HashSet::new();
    // Appends `p` only the first time it is encountered, preserving order.
    let push = |p: PathBuf, out: &mut Vec<PathBuf>, seen: &mut HashSet<PathBuf>| {
        if !seen.contains(&p) {
            seen.insert(p.clone());
            out.push(p);
        }
    };
    push(entry_file.to_path_buf(), &mut out, &mut seen);
    let (Some(gs), Some(cg)) = (summaries, callgraph) else {
        return out;
    };
    let sink_file_abs = resolve_under_root(project_root, &spec.sink_file);
    // Seed: every FuncKey whose namespace is the sink file.
    let mut frontier: Vec<FuncKey> = gs
        .iter()
        .filter_map(|(k, _)| {
            let ns_abs = resolve_under_root(project_root, &k.namespace);
            if paths_equal(&ns_abs, &sink_file_abs) {
                Some(k.clone())
            } else {
                None
            }
        })
        .collect();
    let mut visited: HashSet<FuncKey> = frontier.iter().cloned().collect();
    let mut steps: usize = 0;
    while let Some(callee) = frontier.pop() {
        // `>=` so that exactly MAX_STEPS functions are expanded at most.
        if steps >= MAX_STEPS {
            break;
        }
        steps += 1;
        let ns_abs = resolve_under_root(project_root, &callee.namespace);
        push(ns_abs, &mut out, &mut seen);
        for caller in callers_of(cg, &callee) {
            if visited.contains(&caller) {
                continue;
            }
            visited.insert(caller.clone());
            frontier.push(caller);
        }
    }
    out
}
/// Reports whether `a` and `b` name the same path.
///
/// Both paths are canonicalized first; when both succeed the canonical forms
/// are compared. If either canonicalization fails (for example because the
/// path does not exist), the paths are compared as given.
fn paths_equal(a: &Path, b: &Path) -> bool {
    if let (Ok(left), Ok(right)) = (a.canonicalize(), b.canonicalize()) {
        return left == right;
    }
    a == b
}
// Unit tests for the import extractors, dependency capture and workdir
// staging. Filesystem-backed tests build their fixture project inside a
// fresh `TempDir`.
#[cfg(test)]
mod tests {
use super::*;
use crate::dynamic::spec::{EntryKind, HarnessSpec, PayloadSlot, SpecDerivationStrategy};
use crate::labels::Cap;
use std::fs;
use tempfile::TempDir;
// Builds a `HarnessSpec` with fixed placeholder values. `entry_file` is
// used for both the entry file and the sink file; `lang` is passed through.
fn fake_spec(entry_file: &str, lang: Lang) -> HarnessSpec {
HarnessSpec {
finding_id: "0000000000000001".into(),
entry_file: entry_file.into(),
entry_name: "handler".into(),
entry_kind: EntryKind::Function,
lang,
toolchain_id: "python-3.11".into(),
payload_slot: PayloadSlot::Param(0),
expected_cap: Cap::CODE_EXEC,
constraint_hints: vec![],
sink_file: entry_file.into(),
sink_line: 10,
spec_hash: "test0000abcd1234".into(),
derivation: SpecDerivationStrategy::FromFlowSteps,
stubs_required: vec![],
}
}
// `from x import y`, `import x`, `import x as y` and dotted module paths
// must all reduce to the top-level package name, reported once.
#[test]
fn extract_python_imports_picks_top_level_pkg() {
let src = r#"
from flask import Flask, request
import os
import sqlalchemy
import pandas as pd
from sqlalchemy.orm import sessionmaker
"#;
let deps = extract_python_imports(src);
assert!(deps.contains(&"flask".to_owned()));
assert!(deps.contains(&"os".to_owned()));
assert!(deps.contains(&"sqlalchemy".to_owned()));
assert!(deps.contains(&"pandas".to_owned()));
// sqlalchemy.orm is deduped to "sqlalchemy".
assert_eq!(deps.iter().filter(|d| *d == "sqlalchemy").count(), 1);
}
// ES-module imports, CommonJS `require`, and scoped packages (`@scope/name`)
// are all captured; a side-effect import of a relative path is not.
#[test]
fn extract_js_imports_handles_scoped_pkg() {
let src = r#"
import express from 'express';
const helmet = require("helmet");
import { Router } from '@koa/router';
import './local-thing';
"#;
let deps = extract_js_imports(src);
assert!(deps.contains(&"express".to_owned()));
assert!(deps.contains(&"helmet".to_owned()));
assert!(deps.contains(&"@koa/router".to_owned()));
// Relative imports are skipped.
assert!(!deps.iter().any(|d| d.starts_with('.')));
}
// `use` and `extern crate` lines yield crate names; `crate::` paths do not.
#[test]
fn extract_rust_imports_collects_crates() {
let src = "use serde::Deserialize;\nuse tokio::net::TcpListener;\nextern crate libc;\nuse crate::foo::bar;\n";
let deps = extract_rust_imports(src);
assert!(deps.contains(&"serde".to_owned()));
assert!(deps.contains(&"tokio".to_owned()));
assert!(deps.contains(&"libc".to_owned()));
// Project-internal references skipped.
assert!(!deps.contains(&"crate".to_owned()));
}
// Covers a single-line import, a grouped `import (...)` block, and an
// aliased entry inside the block.
#[test]
fn extract_go_imports_handles_block_and_single() {
let src = "package main\nimport \"fmt\"\nimport (\n\t\"net/http\"\n\t alias \"github.com/gin-gonic/gin\"\n)\n";
let deps = extract_go_imports(src);
assert!(deps.contains(&"fmt".to_owned()));
assert!(deps.contains(&"net/http".to_owned()));
assert!(deps.contains(&"github.com/gin-gonic/gin".to_owned()));
}
// With an empty project root (the entry file does not exist) capture must
// still succeed and report no deps, no frameworks and no lockfile.
#[test]
fn capture_returns_default_when_root_empty() {
let tmp = TempDir::new().unwrap();
let root = tmp.path();
let spec = fake_spec("app.py", Lang::Python);
let captured = capture_project_dependencies(root, &spec);
assert!(captured.direct_deps.is_empty());
assert!(captured.frameworks.is_empty());
assert!(captured.lockfile.is_none());
// The expected id differs from the spec's "python-3.11", so it is not
// copied from the spec. NOTE(review): presumably the toolchain
// resolver's default for Python — confirm.
assert_eq!(captured.toolchain.toolchain_id, "python-3");
}
// A project with an entry file and a `requirements.txt` must surface the
// entry's imports, the Flask framework signal and a lockfile.
#[test]
fn capture_picks_up_python_imports_and_frameworks() {
let tmp = TempDir::new().unwrap();
let root = tmp.path();
fs::write(
root.join("app.py"),
"from flask import Flask, request\nimport os\nimport requests\n",
)
.unwrap();
fs::write(root.join("requirements.txt"), "Flask==2.3.0\nrequests>=2.28\n").unwrap();
let spec = fake_spec("app.py", Lang::Python);
let captured = capture_project_dependencies(root, &spec);
assert!(captured.direct_deps.contains(&"flask".to_owned()));
assert!(captured.direct_deps.contains(&"requests".to_owned()));
assert!(captured.frameworks.contains(&DetectedFramework::Flask));
assert!(captured.lockfile.is_some());
}
// Staging into a separate directory must copy the entry file and the
// manifest, and carry the supplied spec hash into the environment.
#[test]
fn stage_workdir_copies_entry_and_manifest() {
let tmp = TempDir::new().unwrap();
let root = tmp.path();
fs::write(root.join("app.py"), "from flask import Flask\n").unwrap();
fs::write(root.join("requirements.txt"), "Flask\n").unwrap();
let spec = fake_spec("app.py", Lang::Python);
let captured = capture_project_dependencies(root, &spec);
let stage = TempDir::new().unwrap();
let env = stage_workdir_with_spec_hash(&captured, stage.path(), "deadbeef").unwrap();
assert!(env.workdir.join("app.py").is_file());
assert!(env.workdir.join("requirements.txt").is_file());
assert_eq!(env.spec_hash, "deadbeef");
assert!(env.lockfile.is_some());
}
// An entry file one byte larger than `MAX_WORKDIR_BYTES` must make staging
// fail with an error whose message contains "exceed".
#[test]
fn stage_workdir_respects_max_size() {
let tmp = TempDir::new().unwrap();
let root = tmp.path();
// Write a single source over the budget. The copy must error.
let big = vec![b'x'; (MAX_WORKDIR_BYTES + 1) as usize];
fs::write(root.join("app.py"), &big).unwrap();
let spec = fake_spec("app.py", Lang::Python);
let captured = capture_project_dependencies(root, &spec);
let stage = TempDir::new().unwrap();
let err = stage_workdir(&captured, stage.path()).unwrap_err();
assert!(err.to_string().contains("exceed"));
}
// Two config-like files sit next to the entry; capture must report exactly
// two config files. Only the count is asserted — presumably these are
// `config.yaml` and `.env`.
#[test]
fn config_files_picked_up_when_present() {
let tmp = TempDir::new().unwrap();
let root = tmp.path();
fs::write(root.join("app.py"), "from flask import Flask\n").unwrap();
fs::write(root.join("config.yaml"), "debug: true\n").unwrap();
fs::write(root.join(".env"), "FLASK_DEBUG=1\n").unwrap();
let spec = fake_spec("app.py", Lang::Python);
let captured = capture_project_dependencies(root, &spec);
assert_eq!(captured.config_files.len(), 2);
}
}