Critical bug fixes and recall improvements (#68)

2026-07-24 21:41:02 +02:00 · 2026-05-11 12:42:39 -04:00 · 2026-05-11 12:42:39 -04:00 · 55247b7fcd
commit 55247b7fcd
parent 7d0e7320e2
352 changed files with 60069 additions and 900 deletions
--- a/scripts/validate_recall.sh
+++ b/scripts/validate_recall.sh
@ -0,0 +1,316 @@
+#!/usr/bin/env bash
+# validate_recall.sh — run `nyx scan --format json` against a real OSS
+# checkout and diff the result against a frozen baseline.
+#
+# Phase 11 of the JS/TS recall-gap engine plan owns the JS targets.
+# Phase 17 adds cross-language targets (php/java/python/rust/go/ruby)
+# under `tests/recall_targets/xlang/<lang>/<target>.json`. JS-era
+# baselines stay at `tests/recall_targets/<target>.json` for backwards
+# compatibility.
+#
+# Baseline files were relocated out of `.pitboss/` per the Phase 01
+# precedent — pitboss implementer agents must not write under
+# `.pitboss/`.
+#
+# Usage:
+#   scripts/validate_recall.sh <target> <clone_path> [--capture]
+#   scripts/validate_recall.sh --lang <lang> <target> <clone_path> [--capture]
+#   scripts/validate_recall.sh [--lang <lang>] <target> --from-snapshot <prior_run.json>
+#
+#   <lang>        php | java | python | rust | go | ruby
+#                 Selects the per-language target set under
+#                 `tests/recall_targets/xlang/<lang>/`.
+#   <target>      Without --lang: cal_com | vercel_commerce |
+#                 shadcn_examples | blitz_apps (Phase 11 JS targets).
+#                 With --lang: any baseline shipped under
+#                 `tests/recall_targets/xlang/<lang>/<target>.json`.
+#   <clone_path>  path to a local clone of the OSS repo (omitted when
+#                 --from-snapshot is supplied).
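+#   --capture     overwrite the baseline with the current scan output.
+#                 Findings whose `(rule_id, path_suffix, line)` tuple
+#                 already carries a verdict other than `needs_review`
+#                 in the prior baseline keep that verdict and note; all
+#                 other findings are marked `needs_review`. Use this
+#                 when the baseline file is a placeholder or when an
+#                 intentional recall lift is being frozen.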
+#   --from-snapshot <path>
+#                 skip the scan; load the `findings` array of the JSON
+#                 file at `<path>` (a previously captured baseline) as
+#                 the current finding set and diff it against
+#                 `<target>`'s baseline. Mutually exclusive with
+#                 --capture.
+#
+# Default mode (no --capture, no --from-snapshot) loads the baseline,
+# re-scans the clone, and prints `{ added, removed, unchanged }`
+# finding counts per rule_id. Findings are matched on the tuple
+# `(rule_id, path_suffix, line)`; `path_suffix` is the clone-relative
+# path so the diff is robust against absolute-path differences.
+#
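+# Dependencies: bash, jq, and the standard find / sort / paste / mktemp /
+# date / basename / dirname utilities. Scan mode also needs a `nyx`
+# binary; --capture uses git to record the clone's commit and falls back
+# to "unknown" when that fails.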
+
+set -euo pipefail
+
+# Emit the command-line synopsis on stderr, then exit with status 2.
+usage() {
+    local prog
+    # Resolve the script name once; `|| true` keeps the synopsis printing
+    # even if basename itself fails.
+    prog="$(basename "$0")" || true
+    cat >&2 <<EOF
+usage: ${prog} <target> <clone_path> [--capture]
+       ${prog} --lang <lang> <target> <clone_path> [--capture]
+       ${prog} [--lang <lang>] <target> --from-snapshot <prior_run.json>
+
+  lang             php | java | python | rust | go | ruby
+  target           JS targets: cal_com | vercel_commerce | shadcn_examples |
+                   blitz_apps. With --lang: any name shipped under
+                   tests/recall_targets/xlang/<lang>/.
+  clone_path       path to local checkout of the target repo (omitted with
+                   --from-snapshot)
+  --capture        overwrite the baseline JSON with the current scan output
+  --from-snapshot  diff a previously captured baseline JSON against <target>
+                   without rescanning; mutually exclusive with --capture.
+EOF
+    exit 2
+}
+
+# Parsed command-line state.
+LANG_FLAG=""      # value of --lang (empty selects the JS target set)
+SNAPSHOT=""       # value of --from-snapshot (empty means a scan is run)
+CAPTURE=0         # set to 1 by --capture
+POSITIONAL=()     # non-flag words, in order: <target> [<clone_path>]
+
+# Consume arguments left to right. Flags may appear before, between, or
+# after the positional words; both `--flag value` and `--flag=value` are
+# accepted for the two value-taking flags.
+while (( $# > 0 )); do
+    case "$1" in
+        -h|--help)
+            usage
+            ;;
+        --capture)
+            CAPTURE=1
+            shift
+            ;;
+        --lang=*)
+            LANG_FLAG="${1#--lang=}"
+            shift
+            ;;
+        --lang)
+            if (( $# < 2 )); then
+                echo "--lang requires an argument" >&2
+                usage
+            fi
+            LANG_FLAG="$2"
+            shift 2
+            ;;
+        --from-snapshot=*)
+            SNAPSHOT="${1#--from-snapshot=}"
+            shift
+            ;;
+        --from-snapshot)
+            if (( $# < 2 )); then
+                echo "--from-snapshot requires a path argument" >&2
+                usage
+            fi
+            SNAPSHOT="$2"
+            shift 2
+            ;;
+        --*)
+            # Any other double-dash word is rejected rather than being
+            # treated as a positional argument.
+            echo "unknown flag: $1" >&2
+            usage
+            ;;
+        *)
+            POSITIONAL+=("$1")
+            shift
+            ;;
+    esac
+done
+
+# --capture rewrites the baseline from a fresh scan, while --from-snapshot
+# never scans, so the two cannot be combined.
+if (( CAPTURE == 1 )) && [[ -n "$SNAPSHOT" ]]; then
+    echo "--capture and --from-snapshot are mutually exclusive" >&2
+    usage
+fi
+
+# Snapshot mode needs only <target>; scan mode also needs <clone_path>.
+if [[ -n "$SNAPSHOT" ]]; then
+    REQUIRED_ARGS=1
+else
+    REQUIRED_ARGS=2
+fi
+if (( ${#POSITIONAL[@]} < REQUIRED_ARGS )); then
+    usage
+fi
+
+TARGET="${POSITIONAL[0]}"
+CLONE_PATH=""
+if [[ -z "$SNAPSHOT" ]]; then
+    CLONE_PATH="${POSITIONAL[1]}"
+fi
+
+# Absolute path of the repository root (the parent of this script's
+# directory).
+REPO_ROOT="$(cd "$(dirname "$0")/.." && pwd)"
+
+# Resolve BASELINE. With --lang the baseline lives in the per-language
+# directory and <target> is any file name shipped there; without it,
+# <target> must be one of the fixed JS targets.
+if [ -n "$LANG_FLAG" ]; then
+    XLANG_DIR="$REPO_ROOT/tests/recall_targets/xlang/${LANG_FLAG}"
+    if [ ! -d "$XLANG_DIR" ]; then
+        # List the language directories that do exist as a hint. With
+        # `set -o pipefail` a failing find (for example when the xlang
+        # root itself is missing) fails the whole substitution, and
+        # `set -e` would then abort right here without printing anything.
+        # `|| true` keeps going so the "none" fallback below is reachable.
+        AVAILABLE="$(
+            find "$REPO_ROOT/tests/recall_targets/xlang" \
+                -mindepth 1 -maxdepth 1 -type d -exec basename {} \; 2>/dev/null \
+                | sort | paste -sd ' ' -
+        )" || true
+        echo "unknown lang: $LANG_FLAG (available: ${AVAILABLE:-none})" >&2
+        usage
+    fi
+    BASELINE="$XLANG_DIR/${TARGET}.json"
+else
+    case "$TARGET" in
+        cal_com|vercel_commerce|shadcn_examples|blitz_apps) ;;
+        *) echo "unknown target: $TARGET (use --lang for cross-lang targets)" >&2; usage ;;
+    esac
+    BASELINE="$REPO_ROOT/tests/recall_targets/${TARGET}.json"
+fi
+
+# Validate the input that belongs to the selected mode: the snapshot file
+# in snapshot mode, the clone directory otherwise.
+if [[ -z "$SNAPSHOT" ]]; then
+    [[ -d "$CLONE_PATH" ]] || {
+        echo "clone path is not a directory: $CLONE_PATH" >&2
+        exit 1
+    }
+else
+    [[ -f "$SNAPSHOT" ]] || {
+        echo "snapshot file not found: $SNAPSHOT" >&2
+        exit 1
+    }
+fi
+
+# jq is used by every mode of this script.
+command -v jq >/dev/null 2>&1 || {
+    echo "jq is required but not installed" >&2
+    exit 1
+}
+
+# The baseline must already exist, even for --capture, which reads the
+# prior baseline before overwriting it.
+[[ -f "$BASELINE" ]] || {
+    echo "baseline not found: $BASELINE" >&2
+    exit 1
+}
+
+# Locate the nyx binary. Lookup order: $NYX_BIN (when set and executable),
+# the release build, the debug build, then whatever `nyx` is on PATH.
+# Skipped under --from-snapshot since no scan is performed.
+if [ -z "$SNAPSHOT" ]; then
+    # An explicit NYX_BIN that cannot be executed is almost certainly a
+    # typo or a stale path. Say so instead of silently scanning with a
+    # different binary, then continue with the normal fallback chain.
+    if [ -n "${NYX_BIN:-}" ] && [ ! -x "$NYX_BIN" ]; then
+        echo "warning: NYX_BIN is set but not executable, ignoring: $NYX_BIN" >&2
+    fi
+    if [ -n "${NYX_BIN:-}" ] && [ -x "$NYX_BIN" ]; then
+        NYX="$NYX_BIN"
+    elif [ -x "$REPO_ROOT/target/release/nyx" ]; then
+        NYX="$REPO_ROOT/target/release/nyx"
+    elif [ -x "$REPO_ROOT/target/debug/nyx" ]; then
+        NYX="$REPO_ROOT/target/debug/nyx"
+    elif command -v nyx >/dev/null 2>&1; then
+        NYX="$(command -v nyx)"
+    else
+        echo "nyx binary not found; build with 'cargo build --release' first" >&2
+        exit 1
+    fi
+fi
+
+# Build CURRENT: a JSON array of `{rule_id, path_suffix, line, severity}`
+# objects, taken either from a snapshot file or from a fresh scan.
+if [ -n "$SNAPSHOT" ]; then
+    CLONE_ABS=""
+    if [ -n "$LANG_FLAG" ]; then
+        echo "[validate_recall] lang=$LANG_FLAG target=$TARGET snapshot=$SNAPSHOT" >&2
+    else
+        echo "[validate_recall] target=$TARGET snapshot=$SNAPSHOT" >&2
+    fi
+    echo "[validate_recall] baseline=$BASELINE (no scan; --from-snapshot)" >&2
+
+    # Snapshot mode: load the previously captured baseline JSON's
+    # `findings` array verbatim. Both snapshot and baseline are stored
+    # in the same diff-tuple shape (`rule_id` / `path_suffix` / `line`)
+    # so no path normalization is needed.
+    CURRENT="$(jq '.findings // [] | [ .[] | {
+        rule_id: (.rule_id // ""),
+        path_suffix: (.path_suffix // ""),
+        line: (.line // 0),
+        severity: (.severity // "Unknown")
+    } ]' "$SNAPSHOT")"
+else
+    CLONE_ABS="$(cd "$CLONE_PATH" && pwd)"
+    # The template is quoted: with --lang, <target> is not restricted to a
+    # fixed list, so an unquoted expansion could word-split or glob.
+    TMP_OUT="$(mktemp -t "nyx_recall_${TARGET}.XXXXXX.json")"
+    # Remove the raw scan output on every exit path.
+    trap 'rm -f "$TMP_OUT"' EXIT
+
+    if [ -n "$LANG_FLAG" ]; then
+        echo "[validate_recall] lang=$LANG_FLAG target=$TARGET clone=$CLONE_ABS" >&2
+    else
+        echo "[validate_recall] target=$TARGET clone=$CLONE_ABS" >&2
+    fi
+    echo "[validate_recall] nyx=$NYX baseline=$BASELINE" >&2
+    echo "[validate_recall] scanning..." >&2
+
+    "$NYX" scan "$CLONE_ABS" --format json --index off >"$TMP_OUT"
+
+    # Strip the clone-absolute prefix off each finding's path so the diff
+    # tuple `(rule_id, path_suffix, line)` is portable across machines.
+    # Also drop the trailing ` (source N:M)` suffix on `id` so taint
+    # findings group under their canonical rule_id.
+    CURRENT="$(jq --arg root "$CLONE_ABS/" '
+        [ .[] | {
+            rule_id: ((.id // "") | sub(" \\(source [^)]*\\)$"; "")),
+            path_suffix: ((.path // "") | ltrimstr($root)),
+            line: (.line // 0),
+            severity: (.severity // "Unknown")
+        } ]
+    ' "$TMP_OUT")"
+fi
+
+# Capture mode: rewrite the baseline from CURRENT, then stop.
+if (( CAPTURE == 1 )); then
+    # Commit of the scanned clone; "unknown" is appended when the
+    # cd/git chain fails (e.g. the clone is not a git checkout).
+    PIN="$(
+        cd "$CLONE_ABS" && git log -1 --format=%H 2>/dev/null || echo "unknown"
+    )"
+    # Preserve any verdict / note labels from the prior baseline whose
+    # (rule_id, path_suffix, line) tuple still appears in the current
+    # scan. New findings get the placeholder verdict; vanished findings
+    # are dropped.
+    PRIOR_FINDINGS="$(jq '.findings // []' "$BASELINE")"
+    UPDATED="$(jq --arg pin "$PIN" \
+                  --arg captured_on "$(date -u +%Y-%m-%d)" \
+                  --argjson prior "$PRIOR_FINDINGS" \
+                  --argjson findings "$CURRENT" \
+                  '
+                  def key(f): [f.rule_id, f.path_suffix, f.line];
+                  def prior_idx:
+                      reduce $prior[] as $f ({}; .[(key($f) | tojson)] = $f);
+                  prior_idx as $pidx
+                  | . + {
+                        captured_against: ("real-scan @ " + $pin),
+                        captured_on: $captured_on,
+                        pinned_commit: $pin,
+                        findings: ($findings | map(
+                            . as $f
+                            | ($pidx[(key($f) | tojson)] // null) as $prev
+                            | if $prev != null and ($prev.verdict // "needs_review") != "needs_review"
+                              then . + {
+                                  verdict: $prev.verdict,
+                                  note: ($prev.note // "carried from prior baseline")
+                              }
+                              else . + {
+                                  verdict: "needs_review",
+                                  note: "captured by validate_recall.sh --capture"
+                              }
+                              end
+                        ))
+                    }
+                  ' "$BASELINE")"
+    # The new document is fully built in memory before the baseline file
+    # is truncated and rewritten.
+    printf '%s\n' "$UPDATED" >"$BASELINE"
+    # Summary counts for the log line: findings written, and how many of
+    # them carry a verdict other than the placeholder.
+    WRITTEN="$(jq 'length' <<<"$CURRENT")"
+    KEPT_LABELS="$(jq '[.findings[] | select((.verdict // "needs_review") != "needs_review")] | length' <<<"$UPDATED")"
+    echo "[validate_recall] wrote $WRITTEN findings to $BASELINE (preserved $KEPT_LABELS prior verdicts)" >&2
+    exit 0
+fi
+
+# Diff mode: compare current scan to baseline.
+# Reached only when --capture was not given (capture mode exits earlier).
+# The baseline's `findings` array is read; a missing key is treated as an
+# empty list.
+BASELINE_FINDINGS="$(jq '.findings // []' "$BASELINE")"
+
+# Both finding lists are indexed by the JSON-encoded tuple
+# `[rule_id, path_suffix, line]`, so findings sharing a tuple collapse to
+# one entry. Keys only in the current set are "added", keys only in the
+# baseline are "removed", and the remaining current keys are "unchanged".
+# Each bucket is reported as a per-rule_id count object plus a total.
+DIFF_REPORT="$(jq -n \
+    --argjson cur "$CURRENT" \
+    --argjson base "$BASELINE_FINDINGS" '
+    def key(f): [f.rule_id, f.path_suffix, f.line];
+
+    def index_set(arr):
+        reduce arr[] as $f ({}; .[(key($f) | tojson)] = $f);
+
+    (index_set($cur))   as $cidx
+    | (index_set($base)) as $bidx
+    | ($cidx | keys_unsorted) as $ckeys
+    | ($bidx | keys_unsorted) as $bkeys
+    | ($ckeys - $bkeys) as $added_keys
+    | ($bkeys - $ckeys) as $removed_keys
+    | ($ckeys - $added_keys) as $unchanged_keys
+    | def by_rule(keys; idx):
+        keys
+        | map(idx[.])
+        | group_by(.rule_id)
+        | map({(.[0].rule_id): length}) | add // {};
+
+    {
+        added:     by_rule($added_keys; $cidx),
+        removed:   by_rule($removed_keys; $bidx),
+        unchanged: by_rule($unchanged_keys; $cidx),
+        added_total:     ($added_keys | length),
+        removed_total:   ($removed_keys | length),
+        unchanged_total: ($unchanged_keys | length)
+    }
+')"
+
+# The report is the script's only stdout output; all progress and error
+# messages go to stderr.
+printf '%s\n' "$DIFF_REPORT"