[pitboss/grind] deferred session-0003 (20260516T052512Z-20f8)

This commit is contained in:
pitboss 2026-05-16 02:26:41 -05:00
parent 282acddbbf
commit 678f0f5d48
35 changed files with 737 additions and 109 deletions

View file

@ -516,6 +516,26 @@ pub enum ReplayResult {
},
}
/// Project a [`ReplayResult`] onto the three-valued shape stored in the
/// eval-corpus `VerifyResult::replay_stable` field.
///
/// * [`ReplayResult::Pass`] becomes `Some(true)`: the replay matched the
///   recorded outcome.
/// * [`ReplayResult::Mismatch`] and [`ReplayResult::UnexpectedError`] become
///   `Some(false)`: the replay diverged or aborted in a way that the M7
///   Gate-5 inversion treats as instability.
/// * [`ReplayResult::DockerUnavailable`], [`ReplayResult::ToolchainMismatch`]
///   and [`ReplayResult::ScriptInvocationFailed`] become `None`: the replay
///   was not informative (docker unavailable, toolchain mismatched, or the
///   bundle had no `reproduce.sh`). The corpus tabulator treats `None` as
///   "no signal" and leaves the row out of the per-cell `stable_replays`
///   tally.
///
/// The match has no catch-all arm, so a newly added variant fails to compile
/// until it is classified here.
pub fn replay_stability(result: &ReplayResult) -> Option<bool> {
    match result {
        // Infrastructure-blocked outcomes: the replay never produced a signal.
        ReplayResult::DockerUnavailable => None,
        ReplayResult::ToolchainMismatch => None,
        ReplayResult::ScriptInvocationFailed { .. } => None,
        // The replay ran (or aborted) and did not reproduce the recording.
        ReplayResult::Mismatch => Some(false),
        ReplayResult::UnexpectedError { .. } => Some(false),
        // The replay reproduced the recorded outcome.
        ReplayResult::Pass => Some(true),
    }
}
/// Phase 28 — Track H.3. Run `reproduce.sh` in `bundle_root` and map the
/// shell exit code into a [`ReplayResult`].
///
@ -648,6 +668,8 @@ mod tests {
}],
toolchain_match: Some("exact".into()),
differential: None,
replay_stable: None,
wrong: None,
}
}
@ -780,6 +802,28 @@ mod tests {
}
}
#[test]
fn replay_stability_maps_to_eval_corpus_tristate() {
    // One row per `ReplayResult` variant, paired with the tri-state the
    // eval-corpus tabulator expects: `Pass` is stable, outcomes that look
    // like instability are unstable, and infra-blocked variants carry no
    // signal (`None`) so a row that never had a chance to replay does not
    // skew the per-cell `stable_replays` figure.
    let cases = [
        ("pass", ReplayResult::Pass, Some(true)),
        ("mismatch", ReplayResult::Mismatch, Some(false)),
        (
            "unexpected error",
            ReplayResult::UnexpectedError { exit_code: 9 },
            Some(false),
        ),
        ("docker unavailable", ReplayResult::DockerUnavailable, None),
        ("toolchain mismatch", ReplayResult::ToolchainMismatch, None),
        (
            "script invocation failed",
            ReplayResult::ScriptInvocationFailed {
                message: "missing".into(),
            },
            None,
        ),
    ];
    for (label, outcome, expected) in cases {
        assert_eq!(replay_stability(&outcome), expected, "variant: {label}");
    }
}
#[test]
fn replay_bundle_reports_missing_script() {
let dir = TempDir::new().unwrap();

View file

@ -502,6 +502,51 @@ pub fn read_events(path: &Path) -> Result<Vec<serde_json::Value>, TelemetryReadE
Ok(out)
}
/// Scan the `verify_feedback` records in an events log for the given
/// finding id and return the matching `VerifyResult::wrong` value.
///
/// * `Some(true)` — most-recent feedback for this finding was `wrong` or
///   `wrong:<reason>`.
/// * `Some(false)` — most-recent feedback was `right`.
/// * `None` — no recognised feedback recorded for this finding, or the
///   log file could not be opened.
///
/// Multiple records for the same finding collapse to the **last** one
/// in file order: callers run `nyx verify-feedback` more than once when
/// they correct an earlier judgment, and the latest reading is the
/// authoritative one. The events log is read via the raw JSONL path
/// (NOT [`read_events`]) because `verify_feedback` rows were written
/// before the `schema_version`-envelope migration and may legitimately
/// pre-date the schema bump; a missing `schema_version` here is not
/// fatal.
///
/// The scan is best-effort. Blank lines, lines that are not valid JSON
/// (including lines that are not valid UTF-8), rows for other events or
/// other findings, and rows whose `feedback` is missing, not a string, or
/// not one of the recognised values are all skipped without affecting the
/// result. An I/O error while reading ends the scan and whatever was
/// collected up to that point is returned.
pub fn feedback_wrong_for_finding(path: &Path, finding_id: &str) -> Option<bool> {
    let file = std::fs::File::open(path).ok()?;
    let reader = BufReader::new(file);
    let mut latest: Option<bool> = None;
    // Split on raw bytes instead of `lines()`: `lines()` reports a line that
    // is not valid UTF-8 as an error, which would end the scan early and hide
    // every later record. Parsing the bytes directly turns such a line into
    // an ordinary JSON parse failure that is skipped like any other bad row.
    for raw in reader.split(b'\n').map_while(Result::ok) {
        if raw.iter().all(u8::is_ascii_whitespace) {
            continue;
        }
        let Ok(value) = serde_json::from_slice::<serde_json::Value>(&raw) else {
            continue;
        };
        if value.get("event").and_then(|v| v.as_str()) != Some("verify_feedback") {
            continue;
        }
        if value.get("finding_id").and_then(|v| v.as_str()) != Some(finding_id) {
            continue;
        }
        match value.get("feedback").and_then(|v| v.as_str()) {
            Some(text) if text == "wrong" || text.starts_with("wrong:") => {
                latest = Some(true);
            }
            Some("right") => latest = Some(false),
            // Missing, non-string, or unrecognised feedback: keep the
            // previous reading.
            _ => {}
        }
    }
    latest
}
// ── Rank delta telemetry ──────────────────────────────────────────────────────
/// One telemetry event per ranked finding that carries a dynamic verdict delta.
@ -598,6 +643,44 @@ mod tests {
}
}
#[test]
fn feedback_wrong_for_finding_returns_latest_record() {
    let dir = TempDir::new().unwrap();
    let log = dir.path().join("events.jsonl");
    // Fixture rows, in file order:
    //   1. `abc1` judged wrong,
    //   2. `abc2` judged wrong (a different finding, must not leak into `abc1`),
    //   3. `abc1` corrected to right (the latest record for `abc1` wins),
    //   4. a non-feedback `verify` row for `abc1`, which must be ignored.
    let rows = [
        r#"{"event":"verify_feedback","finding_id":"abc1","feedback":"wrong:sample"}"#,
        r#"{"event":"verify_feedback","finding_id":"abc2","feedback":"wrong:other"}"#,
        r#"{"event":"verify_feedback","finding_id":"abc1","feedback":"right"}"#,
        r#"{"event":"verify","finding_id":"abc1"}"#,
    ];
    let mut contents = rows.join("\n");
    contents.push('\n');
    std::fs::write(&log, contents).unwrap();

    let verdict_for = |id: &str| feedback_wrong_for_finding(&log, id);
    assert_eq!(verdict_for("abc1"), Some(false));
    assert_eq!(verdict_for("abc2"), Some(true));
    assert_eq!(verdict_for("missing"), None);
}
#[test]
fn feedback_wrong_for_finding_tolerates_missing_file() {
    // The path points inside a fresh temp dir but is never created, so the
    // lookup must report "no feedback" rather than panic.
    let scratch = TempDir::new().unwrap();
    let absent = scratch.path().join("nonexistent.jsonl");
    let verdict = feedback_wrong_for_finding(&absent, "abc1");
    assert_eq!(verdict, None);
}
#[test]
fn emit_writes_valid_json() {
let dir = TempDir::new().unwrap();

View file

@ -286,6 +286,8 @@ fn entry_kind_unsupported_verdict(
attempts: vec![],
toolchain_match: None,
differential: None,
replay_stable: None,
wrong: None,
}
}
@ -328,6 +330,8 @@ fn spec_derivation_failed_verdict(
attempts: vec![],
toolchain_match: None,
differential: None,
replay_stable: None,
wrong: None,
};
}
@ -344,6 +348,8 @@ fn spec_derivation_failed_verdict(
attempts: vec![],
toolchain_match: None,
differential: None,
replay_stable: None,
wrong: None,
}
}
@ -449,6 +455,8 @@ pub fn verify_finding(diag: &Diag, opts: &VerifyOptions) -> VerifyResult {
attempts: vec![],
toolchain_match: None,
differential: None,
replay_stable: None,
wrong: None,
};
}
@ -531,6 +539,8 @@ pub fn verify_finding(diag: &Diag, opts: &VerifyOptions) -> VerifyResult {
attempts: vec![],
toolchain_match: None,
differential: None,
replay_stable: None,
wrong: None,
};
}
@ -559,6 +569,8 @@ pub fn verify_finding(diag: &Diag, opts: &VerifyOptions) -> VerifyResult {
attempts: vec![],
toolchain_match: None,
differential: None,
replay_stable: None,
wrong: None,
};
}
}
@ -734,6 +746,8 @@ fn build_verdict(
attempts: attempts.clone(),
toolchain_match: Some(toolchain_match.to_owned()),
differential: run.differential.clone(),
replay_stable: None,
wrong: None,
},
&run.harness_source,
&run.entry_source,
@ -754,6 +768,8 @@ fn build_verdict(
attempts,
toolchain_match: Some(toolchain_match.to_owned()),
differential: run.differential,
replay_stable: None,
wrong: None,
};
}
@ -767,6 +783,8 @@ fn build_verdict(
attempts,
toolchain_match: Some(toolchain_match.to_owned()),
differential: run.differential,
replay_stable: None,
wrong: None,
}
} else if run.unrelated_crash {
// Phase 08 §C.4: the harness crashed but the death
@ -786,6 +804,8 @@ fn build_verdict(
attempts,
toolchain_match: Some(toolchain_match.to_owned()),
differential: None,
replay_stable: None,
wrong: None,
}
} else if run.no_benign_control {
// Phase 07 §4.1: vuln oracle + sink-hit fired but the
@ -804,6 +824,8 @@ fn build_verdict(
attempts,
toolchain_match: Some(toolchain_match.to_owned()),
differential: None,
replay_stable: None,
wrong: None,
}
} else if let Some(d) = run.differential.as_ref() {
// Differential ran but didn't produce `Confirmed`. Map
@ -825,6 +847,8 @@ fn build_verdict(
attempts,
toolchain_match: Some(toolchain_match.to_owned()),
differential: run.differential,
replay_stable: None,
wrong: None,
}
}
crate::evidence::DifferentialVerdict::ReversedDifferential => {
@ -842,6 +866,8 @@ fn build_verdict(
attempts,
toolchain_match: Some(toolchain_match.to_owned()),
differential: run.differential,
replay_stable: None,
wrong: None,
}
}
crate::evidence::DifferentialVerdict::Confirmed
@ -855,6 +881,8 @@ fn build_verdict(
attempts,
toolchain_match: Some(toolchain_match.to_owned()),
differential: run.differential,
replay_stable: None,
wrong: None,
},
}
} else if run.oracle_collision {
@ -871,6 +899,8 @@ fn build_verdict(
attempts,
toolchain_match: Some(toolchain_match.to_owned()),
differential: None,
replay_stable: None,
wrong: None,
}
} else {
VerifyResult {
@ -883,6 +913,8 @@ fn build_verdict(
attempts,
toolchain_match: Some(toolchain_match.to_owned()),
differential: None,
replay_stable: None,
wrong: None,
}
}
}
@ -896,6 +928,8 @@ fn build_verdict(
attempts: vec![],
toolchain_match: None,
differential: None,
replay_stable: None,
wrong: None,
},
Err(RunError::Harness(e)) => {
// Defence-in-depth residual for `EntryKindUnsupported` from the
@ -939,6 +973,8 @@ fn build_verdict(
attempts: vec![],
toolchain_match: None,
differential: None,
replay_stable: None,
wrong: None,
}
}
Err(RunError::BuildFailed { stderr, attempts: build_att }) => VerifyResult {
@ -951,6 +987,8 @@ fn build_verdict(
attempts: vec![],
toolchain_match: None,
differential: None,
replay_stable: None,
wrong: None,
},
Err(RunError::Sandbox(e)) => VerifyResult {
finding_id: finding_id.to_owned(),
@ -962,6 +1000,8 @@ fn build_verdict(
attempts: vec![],
toolchain_match: None,
differential: None,
replay_stable: None,
wrong: None,
},
}
}
@ -1041,6 +1081,8 @@ mod tests {
attempts: vec![],
toolchain_match: Some("exact".to_owned()),
differential: None,
replay_stable: None,
wrong: None,
};
// Insert.
@ -1090,6 +1132,8 @@ mod tests {
attempts: vec![],
toolchain_match: Some("exact".to_owned()),
differential: None,
replay_stable: None,
wrong: None,
};
insert_verdict_cache(&db_path, "spec_aaa", "hash_xyz", "", "python-3.11", &result);
@ -1125,6 +1169,8 @@ mod tests {
attempts: vec![],
toolchain_match: None,
differential: None,
replay_stable: None,
wrong: None,
};
insert_verdict_cache(db_path, "spec", "hash", "", "python-3", &result);
assert!(!db_path.exists(), "insert must not create a new DB");
@ -1179,6 +1225,8 @@ mod tests {
attempts: vec![],
toolchain_match: Some("exact".to_owned()),
differential: None,
replay_stable: None,
wrong: None,
};
// Insert directly with the old corpus_version bypassing the helper.