mirror of
https://github.com/elicpeter/nyx.git
synced 2026-06-21 20:18:06 +02:00
[pitboss/grind] deferred session-0003 (20260516T052512Z-20f8)
This commit is contained in:
parent
282acddbbf
commit
678f0f5d48
35 changed files with 737 additions and 109 deletions
|
|
@ -516,6 +516,26 @@ pub enum ReplayResult {
|
|||
},
|
||||
}
|
||||
|
||||
/// Tri-state map of [`ReplayResult`] onto the eval-corpus
|
||||
/// `VerifyResult::replay_stable` field shape.
|
||||
///
|
||||
/// * `Some(true)` — replay matched the recorded outcome.
|
||||
/// * `Some(false)` — replay diverged or aborted in a way that the M7
|
||||
/// Gate-5 inversion treats as instability.
|
||||
/// * `None` — replay was not informative (toolchain mismatched, docker
|
||||
/// unavailable, or the bundle had no `reproduce.sh`). The corpus
|
||||
/// tabulator treats `None` as "no signal" and excludes the row from
|
||||
/// the per-cell `stable_replays` numerator.
|
||||
pub fn replay_stability(result: &ReplayResult) -> Option<bool> {
|
||||
match result {
|
||||
ReplayResult::Pass => Some(true),
|
||||
ReplayResult::Mismatch | ReplayResult::UnexpectedError { .. } => Some(false),
|
||||
ReplayResult::DockerUnavailable
|
||||
| ReplayResult::ToolchainMismatch
|
||||
| ReplayResult::ScriptInvocationFailed { .. } => None,
|
||||
}
|
||||
}
|
||||
|
||||
/// Phase 28 — Track H.3. Run `reproduce.sh` in `bundle_root` and map the
|
||||
/// shell exit code into a [`ReplayResult`].
|
||||
///
|
||||
|
|
@ -648,6 +668,8 @@ mod tests {
|
|||
}],
|
||||
toolchain_match: Some("exact".into()),
|
||||
differential: None,
|
||||
replay_stable: None,
|
||||
wrong: None,
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -780,6 +802,28 @@ mod tests {
|
|||
}
|
||||
}
|
||||
|
||||
#[test]
fn replay_stability_maps_to_eval_corpus_tristate() {
    // Expected tri-state per variant. `Pass` is the only stable outcome;
    // anything that looks like instability maps to `Some(false)`; the
    // infra-blocked variants carry no signal (`None`), so a row that never
    // had a chance to replay does not inflate the per-cell
    // `stable_replays` denominator.
    let cases = [
        ("Pass", ReplayResult::Pass, Some(true)),
        ("Mismatch", ReplayResult::Mismatch, Some(false)),
        (
            "UnexpectedError",
            ReplayResult::UnexpectedError { exit_code: 9 },
            Some(false),
        ),
        ("DockerUnavailable", ReplayResult::DockerUnavailable, None),
        ("ToolchainMismatch", ReplayResult::ToolchainMismatch, None),
        (
            "ScriptInvocationFailed",
            ReplayResult::ScriptInvocationFailed {
                message: "missing".into(),
            },
            None,
        ),
    ];

    for (label, outcome, expected) in cases.iter() {
        assert_eq!(
            replay_stability(outcome),
            *expected,
            "unexpected mapping for {label}"
        );
    }
}
|
||||
|
||||
#[test]
|
||||
fn replay_bundle_reports_missing_script() {
|
||||
let dir = TempDir::new().unwrap();
|
||||
|
|
|
|||
|
|
@ -502,6 +502,51 @@ pub fn read_events(path: &Path) -> Result<Vec<serde_json::Value>, TelemetryReadE
|
|||
Ok(out)
|
||||
}
|
||||
|
||||
/// Scan the `verify_feedback` records in an events log for the given
|
||||
/// finding id and return the matching `VerifyResult::wrong` value.
|
||||
///
|
||||
/// * `Some(true)` — most-recent feedback for this finding was
|
||||
/// `wrong:<reason>`.
|
||||
/// * `Some(false)` — most-recent feedback was `right`.
|
||||
/// * `None` — no feedback recorded for this finding.
|
||||
///
|
||||
/// Multiple records for the same finding collapse to the **last** one
|
||||
/// in file order: callers run `nyx verify-feedback` more than once when
|
||||
/// they correct an earlier judgment, and the latest reading is the
|
||||
/// authoritative one. The events log is read via the raw JSONL path
|
||||
/// (NOT [`read_events`]) because `verify_feedback` rows were written
|
||||
/// before the `schema_version`-envelope migration and may legitimately
|
||||
/// pre-date the schema bump; a missing `schema_version` here is not
|
||||
/// fatal.
|
||||
pub fn feedback_wrong_for_finding(path: &Path, finding_id: &str) -> Option<bool> {
|
||||
let file = std::fs::File::open(path).ok()?;
|
||||
let reader = BufReader::new(file);
|
||||
let mut latest: Option<bool> = None;
|
||||
for line in reader.lines().map_while(Result::ok) {
|
||||
if line.trim().is_empty() {
|
||||
continue;
|
||||
}
|
||||
let Ok(value) = serde_json::from_str::<serde_json::Value>(&line) else {
|
||||
continue;
|
||||
};
|
||||
if value.get("event").and_then(|v| v.as_str()) != Some("verify_feedback") {
|
||||
continue;
|
||||
}
|
||||
if value.get("finding_id").and_then(|v| v.as_str()) != Some(finding_id) {
|
||||
continue;
|
||||
}
|
||||
let Some(feedback) = value.get("feedback").and_then(|v| v.as_str()) else {
|
||||
continue;
|
||||
};
|
||||
if feedback.starts_with("wrong:") || feedback == "wrong" {
|
||||
latest = Some(true);
|
||||
} else if feedback == "right" {
|
||||
latest = Some(false);
|
||||
}
|
||||
}
|
||||
latest
|
||||
}
|
||||
|
||||
// ── Rank delta telemetry ──────────────────────────────────────────────────────
|
||||
|
||||
/// One telemetry event per ranked finding that carries a dynamic verdict delta.
|
||||
|
|
@ -598,6 +643,44 @@ mod tests {
|
|||
}
|
||||
}
|
||||
|
||||
#[test]
fn feedback_wrong_for_finding_returns_latest_record() {
    use std::io::Write;

    // `abc1` is judged twice (first `wrong:sample`, later `right`) with an
    // unrelated `abc2` record in between; the later `abc1` reading must
    // win. The final row is a plain `verify` event, not feedback, and
    // must be ignored.
    let records = [
        r#"{"event":"verify_feedback","finding_id":"abc1","feedback":"wrong:sample"}"#,
        r#"{"event":"verify_feedback","finding_id":"abc2","feedback":"wrong:other"}"#,
        r#"{"event":"verify_feedback","finding_id":"abc1","feedback":"right"}"#,
        r#"{"event":"verify","finding_id":"abc1"}"#,
    ];

    let tmp = TempDir::new().unwrap();
    let events_path = tmp.path().join("events.jsonl");
    let mut sink = std::fs::File::create(&events_path).unwrap();
    for record in records {
        writeln!(sink, "{record}").unwrap();
    }
    sink.flush().unwrap();

    let lookup = |id: &str| feedback_wrong_for_finding(&events_path, id);
    assert_eq!(lookup("abc1"), Some(false));
    assert_eq!(lookup("abc2"), Some(true));
    assert_eq!(lookup("missing"), None);
}
|
||||
|
||||
#[test]
fn feedback_wrong_for_finding_tolerates_missing_file() {
    // The path lives inside a fresh temp dir but is never created, so the
    // lookup has no file to open and must report "no feedback" rather
    // than panic.
    let tmp = TempDir::new().unwrap();
    let absent = tmp.path().join("nonexistent.jsonl");
    let outcome = feedback_wrong_for_finding(&absent, "abc1");
    assert_eq!(outcome, None);
}
|
||||
|
||||
#[test]
|
||||
fn emit_writes_valid_json() {
|
||||
let dir = TempDir::new().unwrap();
|
||||
|
|
|
|||
|
|
@ -286,6 +286,8 @@ fn entry_kind_unsupported_verdict(
|
|||
attempts: vec![],
|
||||
toolchain_match: None,
|
||||
differential: None,
|
||||
replay_stable: None,
|
||||
wrong: None,
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -328,6 +330,8 @@ fn spec_derivation_failed_verdict(
|
|||
attempts: vec![],
|
||||
toolchain_match: None,
|
||||
differential: None,
|
||||
replay_stable: None,
|
||||
wrong: None,
|
||||
};
|
||||
}
|
||||
|
||||
|
|
@ -344,6 +348,8 @@ fn spec_derivation_failed_verdict(
|
|||
attempts: vec![],
|
||||
toolchain_match: None,
|
||||
differential: None,
|
||||
replay_stable: None,
|
||||
wrong: None,
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -449,6 +455,8 @@ pub fn verify_finding(diag: &Diag, opts: &VerifyOptions) -> VerifyResult {
|
|||
attempts: vec![],
|
||||
toolchain_match: None,
|
||||
differential: None,
|
||||
replay_stable: None,
|
||||
wrong: None,
|
||||
};
|
||||
}
|
||||
|
||||
|
|
@ -531,6 +539,8 @@ pub fn verify_finding(diag: &Diag, opts: &VerifyOptions) -> VerifyResult {
|
|||
attempts: vec![],
|
||||
toolchain_match: None,
|
||||
differential: None,
|
||||
replay_stable: None,
|
||||
wrong: None,
|
||||
};
|
||||
}
|
||||
|
||||
|
|
@ -559,6 +569,8 @@ pub fn verify_finding(diag: &Diag, opts: &VerifyOptions) -> VerifyResult {
|
|||
attempts: vec![],
|
||||
toolchain_match: None,
|
||||
differential: None,
|
||||
replay_stable: None,
|
||||
wrong: None,
|
||||
};
|
||||
}
|
||||
}
|
||||
|
|
@ -734,6 +746,8 @@ fn build_verdict(
|
|||
attempts: attempts.clone(),
|
||||
toolchain_match: Some(toolchain_match.to_owned()),
|
||||
differential: run.differential.clone(),
|
||||
replay_stable: None,
|
||||
wrong: None,
|
||||
},
|
||||
&run.harness_source,
|
||||
&run.entry_source,
|
||||
|
|
@ -754,6 +768,8 @@ fn build_verdict(
|
|||
attempts,
|
||||
toolchain_match: Some(toolchain_match.to_owned()),
|
||||
differential: run.differential,
|
||||
replay_stable: None,
|
||||
wrong: None,
|
||||
};
|
||||
}
|
||||
|
||||
|
|
@ -767,6 +783,8 @@ fn build_verdict(
|
|||
attempts,
|
||||
toolchain_match: Some(toolchain_match.to_owned()),
|
||||
differential: run.differential,
|
||||
replay_stable: None,
|
||||
wrong: None,
|
||||
}
|
||||
} else if run.unrelated_crash {
|
||||
// Phase 08 §C.4: the harness crashed but the death
|
||||
|
|
@ -786,6 +804,8 @@ fn build_verdict(
|
|||
attempts,
|
||||
toolchain_match: Some(toolchain_match.to_owned()),
|
||||
differential: None,
|
||||
replay_stable: None,
|
||||
wrong: None,
|
||||
}
|
||||
} else if run.no_benign_control {
|
||||
// Phase 07 §4.1: vuln oracle + sink-hit fired but the
|
||||
|
|
@ -804,6 +824,8 @@ fn build_verdict(
|
|||
attempts,
|
||||
toolchain_match: Some(toolchain_match.to_owned()),
|
||||
differential: None,
|
||||
replay_stable: None,
|
||||
wrong: None,
|
||||
}
|
||||
} else if let Some(d) = run.differential.as_ref() {
|
||||
// Differential ran but didn't produce `Confirmed`. Map
|
||||
|
|
@ -825,6 +847,8 @@ fn build_verdict(
|
|||
attempts,
|
||||
toolchain_match: Some(toolchain_match.to_owned()),
|
||||
differential: run.differential,
|
||||
replay_stable: None,
|
||||
wrong: None,
|
||||
}
|
||||
}
|
||||
crate::evidence::DifferentialVerdict::ReversedDifferential => {
|
||||
|
|
@ -842,6 +866,8 @@ fn build_verdict(
|
|||
attempts,
|
||||
toolchain_match: Some(toolchain_match.to_owned()),
|
||||
differential: run.differential,
|
||||
replay_stable: None,
|
||||
wrong: None,
|
||||
}
|
||||
}
|
||||
crate::evidence::DifferentialVerdict::Confirmed
|
||||
|
|
@ -855,6 +881,8 @@ fn build_verdict(
|
|||
attempts,
|
||||
toolchain_match: Some(toolchain_match.to_owned()),
|
||||
differential: run.differential,
|
||||
replay_stable: None,
|
||||
wrong: None,
|
||||
},
|
||||
}
|
||||
} else if run.oracle_collision {
|
||||
|
|
@ -871,6 +899,8 @@ fn build_verdict(
|
|||
attempts,
|
||||
toolchain_match: Some(toolchain_match.to_owned()),
|
||||
differential: None,
|
||||
replay_stable: None,
|
||||
wrong: None,
|
||||
}
|
||||
} else {
|
||||
VerifyResult {
|
||||
|
|
@ -883,6 +913,8 @@ fn build_verdict(
|
|||
attempts,
|
||||
toolchain_match: Some(toolchain_match.to_owned()),
|
||||
differential: None,
|
||||
replay_stable: None,
|
||||
wrong: None,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -896,6 +928,8 @@ fn build_verdict(
|
|||
attempts: vec![],
|
||||
toolchain_match: None,
|
||||
differential: None,
|
||||
replay_stable: None,
|
||||
wrong: None,
|
||||
},
|
||||
Err(RunError::Harness(e)) => {
|
||||
// Defence-in-depth residual for `EntryKindUnsupported` from the
|
||||
|
|
@ -939,6 +973,8 @@ fn build_verdict(
|
|||
attempts: vec![],
|
||||
toolchain_match: None,
|
||||
differential: None,
|
||||
replay_stable: None,
|
||||
wrong: None,
|
||||
}
|
||||
}
|
||||
Err(RunError::BuildFailed { stderr, attempts: build_att }) => VerifyResult {
|
||||
|
|
@ -951,6 +987,8 @@ fn build_verdict(
|
|||
attempts: vec![],
|
||||
toolchain_match: None,
|
||||
differential: None,
|
||||
replay_stable: None,
|
||||
wrong: None,
|
||||
},
|
||||
Err(RunError::Sandbox(e)) => VerifyResult {
|
||||
finding_id: finding_id.to_owned(),
|
||||
|
|
@ -962,6 +1000,8 @@ fn build_verdict(
|
|||
attempts: vec![],
|
||||
toolchain_match: None,
|
||||
differential: None,
|
||||
replay_stable: None,
|
||||
wrong: None,
|
||||
},
|
||||
}
|
||||
}
|
||||
|
|
@ -1041,6 +1081,8 @@ mod tests {
|
|||
attempts: vec![],
|
||||
toolchain_match: Some("exact".to_owned()),
|
||||
differential: None,
|
||||
replay_stable: None,
|
||||
wrong: None,
|
||||
};
|
||||
|
||||
// Insert.
|
||||
|
|
@ -1090,6 +1132,8 @@ mod tests {
|
|||
attempts: vec![],
|
||||
toolchain_match: Some("exact".to_owned()),
|
||||
differential: None,
|
||||
replay_stable: None,
|
||||
wrong: None,
|
||||
};
|
||||
|
||||
insert_verdict_cache(&db_path, "spec_aaa", "hash_xyz", "", "python-3.11", &result);
|
||||
|
|
@ -1125,6 +1169,8 @@ mod tests {
|
|||
attempts: vec![],
|
||||
toolchain_match: None,
|
||||
differential: None,
|
||||
replay_stable: None,
|
||||
wrong: None,
|
||||
};
|
||||
insert_verdict_cache(db_path, "spec", "hash", "", "python-3", &result);
|
||||
assert!(!db_path.exists(), "insert must not create a new DB");
|
||||
|
|
@ -1179,6 +1225,8 @@ mod tests {
|
|||
attempts: vec![],
|
||||
toolchain_match: Some("exact".to_owned()),
|
||||
differential: None,
|
||||
replay_stable: None,
|
||||
wrong: None,
|
||||
};
|
||||
|
||||
// Insert directly with the old corpus_version bypassing the helper.
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue