mirror of
https://github.com/elicpeter/nyx.git
synced 2026-06-15 20:05:13 +02:00
feat(dynamic): enhance corpus sync script with improved payload parsing, registry checks, and expanded validation logic
This commit is contained in:
parent
467d41dcfb
commit
8ee6e3af7c
22 changed files with 810 additions and 230 deletions
|
|
@ -1,84 +1,106 @@
|
|||
#!/usr/bin/env python3
|
||||
# Usage: python3 scripts/check_corpus_sync.py
|
||||
# Run from repo root or any subdirectory; the script relocates to repo root.
|
||||
# Exits 0 if src/dynamic/corpus.rs and scripts/corpus_dashboard.py agree on
|
||||
# CORPUS_VERSION and all payload labels. Exits 1 on any divergence.
|
||||
# Exits 0 if scripts/corpus_dashboard.py reads the same CORPUS_VERSION and
|
||||
# payload identities as the canonical Rust registry.
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import os
|
||||
import re
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
# ── locate repo root (parent of the scripts/ dir this file lives in) ─────────
|
||||
|
||||
SCRIPT_DIR = Path(__file__).resolve().parent
|
||||
REPO_ROOT = SCRIPT_DIR.parent
|
||||
os.chdir(REPO_ROOT)
|
||||
|
||||
sys.path.insert(0, str(SCRIPT_DIR))
|
||||
import corpus_dashboard # noqa: E402
|
||||
|
||||
CORPUS_RS = REPO_ROOT / "src" / "dynamic" / "corpus.rs"
|
||||
DASHBOARD_PY = REPO_ROOT / "scripts" / "corpus_dashboard.py"
|
||||
CORPUS_DIR = REPO_ROOT / "src" / "dynamic" / "corpus"
|
||||
|
||||
# ── parse helpers ─────────────────────────────────────────────────────────────
|
||||
|
||||
def parse_corpus_rs(path: Path):
|
||||
def parse_corpus_rs_version(path: Path) -> int | None:
|
||||
text = path.read_text(encoding="utf-8")
|
||||
version_match = re.search(r'pub const CORPUS_VERSION:\s*u32\s*=\s*(\d+);', text)
|
||||
version = int(version_match.group(1)) if version_match else None
|
||||
labels = set(re.findall(r'label:\s*"([^"]+)"', text))
|
||||
return version, labels
|
||||
version_match = re.search(r"pub const CORPUS_VERSION:\s*u32\s*=\s*(\d+);", text)
|
||||
return int(version_match.group(1)) if version_match else None
|
||||
|
||||
def parse_dashboard_py(path: Path):
|
||||
text = path.read_text(encoding="utf-8")
|
||||
version_match = re.search(r'CORPUS_VERSION\s*=\s*(\d+)', text)
|
||||
version = int(version_match.group(1)) if version_match else None
|
||||
labels = set(re.findall(r'label="([^"]+)"', text))
|
||||
return version, labels
|
||||
|
||||
# ── main ──────────────────────────────────────────────────────────────────────
|
||||
def payload_identities(payloads: list[corpus_dashboard.PayloadEntry]) -> set[tuple[str, str, str]]:
    """Collect the ``(cap, lang, label)`` triple of every payload into a set.

    Duplicate triples collapse, so the result can be smaller than ``payloads``.
    """
    identities: set[tuple[str, str, str]] = set()
    for entry in payloads:
        identities.add((entry.cap, entry.lang, entry.label))
    return identities
|
||||
|
||||
|
||||
def count_raw_payload_blocks(path: Path = CORPUS_DIR) -> int:
    """Count ``CuratedPayload {`` openers in every ``*.rs`` file below ``path``.

    Files named ``audit.rs``, ``mod.rs`` and ``registry.rs`` are not scanned.
    The count is purely textual: each regex hit is one block.
    """
    skipped_names = {"audit.rs", "mod.rs", "registry.rs"}
    block_opener = re.compile(r"\bCuratedPayload\s*\{")
    return sum(
        len(block_opener.findall(rust_file.read_text(encoding="utf-8")))
        for rust_file in path.rglob("*.rs")
        if rust_file.name not in skipped_names
    )
|
||||
|
||||
|
||||
def fmt_identity(identity: tuple[str, str, str]) -> str:
    """Render a ``(cap, lang, label)`` identity as ``cap/lang/label``."""
    cap, lang, label = identity
    return "{}/{}/{}".format(cap, lang, label)
|
||||
|
||||
|
||||
def main() -> int:
|
||||
rs_version, rs_labels = parse_corpus_rs(CORPUS_RS)
|
||||
py_version, py_labels = parse_dashboard_py(DASHBOARD_PY)
|
||||
rs_version = parse_corpus_rs_version(CORPUS_RS)
|
||||
dashboard_version = corpus_dashboard.CORPUS_VERSION
|
||||
registry_payloads = corpus_dashboard.load_payloads()
|
||||
raw_payload_count = count_raw_payload_blocks()
|
||||
|
||||
ok = True
|
||||
|
||||
# version check
|
||||
if rs_version is None:
|
||||
print("ERROR: CORPUS_VERSION not found in corpus.rs")
|
||||
ok = False
|
||||
if py_version is None:
|
||||
print("ERROR: CORPUS_VERSION not found in corpus_dashboard.py")
|
||||
elif rs_version == dashboard_version:
|
||||
print(f"CORPUS_VERSION: {rs_version} [match]")
|
||||
else:
|
||||
print(
|
||||
"CORPUS_VERSION mismatch: "
|
||||
f"corpus.rs={rs_version} corpus_dashboard.py={dashboard_version}"
|
||||
)
|
||||
ok = False
|
||||
if rs_version is not None and py_version is not None:
|
||||
if rs_version == py_version:
|
||||
print(f"CORPUS_VERSION: {rs_version} [match]")
|
||||
else:
|
||||
print(f"CORPUS_VERSION mismatch: corpus.rs={rs_version} corpus_dashboard.py={py_version}")
|
||||
ok = False
|
||||
|
||||
# label check
|
||||
only_in_rs = rs_labels - py_labels
|
||||
only_in_py = py_labels - rs_labels
|
||||
shared = rs_labels & py_labels
|
||||
registry_ids = payload_identities(registry_payloads)
|
||||
dashboard_ids = payload_identities(corpus_dashboard.PAYLOADS)
|
||||
only_in_registry = registry_ids - dashboard_ids
|
||||
only_in_dashboard = dashboard_ids - registry_ids
|
||||
shared = registry_ids & dashboard_ids
|
||||
|
||||
print(f"Labels in both: {len(shared)}")
|
||||
if only_in_rs:
|
||||
print(f"Labels only in corpus.rs: {len(only_in_rs)}")
|
||||
for lbl in sorted(only_in_rs):
|
||||
print(f" + {lbl}")
|
||||
print(f"Payload identities in both: {len(shared)}")
|
||||
if only_in_registry:
|
||||
print(f"Payload identities only in Rust registry: {len(only_in_registry)}")
|
||||
for identity in sorted(only_in_registry):
|
||||
print(f" + {fmt_identity(identity)}")
|
||||
ok = False
|
||||
if only_in_py:
|
||||
print(f"Labels only in corpus_dashboard.py: {len(only_in_py)}")
|
||||
for lbl in sorted(only_in_py):
|
||||
print(f" - {lbl}")
|
||||
if only_in_dashboard:
|
||||
print(f"Payload identities only in dashboard: {len(only_in_dashboard)}")
|
||||
for identity in sorted(only_in_dashboard):
|
||||
print(f" - {fmt_identity(identity)}")
|
||||
ok = False
|
||||
|
||||
if len(corpus_dashboard.PAYLOADS) == raw_payload_count:
|
||||
print(f"CuratedPayload blocks covered: {raw_payload_count} [match]")
|
||||
else:
|
||||
print(
|
||||
"CuratedPayload block count mismatch: "
|
||||
f"source_tree={raw_payload_count} dashboard={len(corpus_dashboard.PAYLOADS)}"
|
||||
)
|
||||
ok = False
|
||||
|
||||
if ok:
|
||||
print("Corpus sync: OK")
|
||||
return 0
|
||||
else:
|
||||
print("Corpus sync: FAIL — update corpus_dashboard.py to match corpus.rs")
|
||||
return 1
|
||||
|
||||
print("Corpus sync: FAIL - update corpus_dashboard.py to match the Rust registry")
|
||||
return 1
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
sys.exit(main())
|
||||
|
|
|
|||
|
|
@ -1,5 +1,5 @@
|
|||
#!/usr/bin/env python3
|
||||
"""Corpus health report for src/dynamic/corpus.rs.
|
||||
"""Corpus health report for the Rust dynamic payload registry.
|
||||
|
||||
Produces:
|
||||
- Per-cap coverage table (payload count, benign controls, OOB slots)
|
||||
|
|
@ -7,28 +7,43 @@ Produces:
|
|||
- CVE reference count
|
||||
- Marker collision audit
|
||||
|
||||
Exit code 0 = healthy. Non-zero = collision or missing coverage.
|
||||
Exit code 0 = healthy. Non-zero = collision or missing coverage.
|
||||
|
||||
Usage:
|
||||
python3 scripts/corpus_dashboard.py [--repro-dir REPRO_DIR] [--json]
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import ast
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
import sys
|
||||
from dataclasses import dataclass, field
|
||||
from pathlib import Path
|
||||
from typing import Optional
|
||||
|
||||
# ── Payload table (mirrors src/dynamic/corpus.rs) ────────────────────────────
|
||||
# Manually synced; CI should flag drift via cargo test no_marker_collisions.
|
||||
SCRIPT_DIR = Path(__file__).resolve().parent
|
||||
REPO_ROOT = SCRIPT_DIR.parent
|
||||
CORPUS_RS = REPO_ROOT / "src" / "dynamic" / "corpus.rs"
|
||||
CORPUS_DIR = REPO_ROOT / "src" / "dynamic" / "corpus"
|
||||
REGISTRY_RS = CORPUS_DIR / "registry.rs"
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class RegistryEntry:
    """Immutable record of one payload-module row read from the Rust registry."""

    # Capability name captured from a `Cap::<NAME>` token.
    cap: str
    # Language name captured from a `Lang::<Name>` token.
    lang: str
    # Rust module path that precedes `::PAYLOADS` in the registry row.
    module_path: str
    # File the module path is mapped to underneath the corpus directory.
    source_path: Path
|
||||
|
||||
CORPUS_VERSION = 6
|
||||
|
||||
@dataclass
|
||||
class PayloadEntry:
|
||||
cap: str
|
||||
lang: str
|
||||
label: str
|
||||
bytes_repr: str
|
||||
oracle_kind: str
|
||||
|
|
@ -39,132 +54,402 @@ class PayloadEntry:
|
|||
deprecated_at_corpus_version: Optional[int]
|
||||
fixture_paths: list[str]
|
||||
oob_nonce_slot: bool
|
||||
source_path: str
|
||||
cve_refs: list[str] = field(default_factory=list)
|
||||
|
||||
PAYLOADS: list[PayloadEntry] = [
|
||||
# ── SQL_QUERY ──────────────────────────────────────────────────────────────
|
||||
PayloadEntry(
|
||||
cap="SQL_QUERY", label="sqli-tautology",
|
||||
bytes_repr="' OR '1'='1", oracle_kind="OutputContains",
|
||||
oracle_value="NYX_SQL_CONFIRMED", is_benign=False,
|
||||
provenance="Curated", since_corpus_version=1,
|
||||
deprecated_at_corpus_version=None,
|
||||
fixture_paths=["tests/benchmark/corpus/rust/sqli/sqli_rusqlite_format.rs"],
|
||||
oob_nonce_slot=False,
|
||||
),
|
||||
PayloadEntry(
|
||||
cap="SQL_QUERY", label="sqli-union-nyx",
|
||||
bytes_repr="' UNION SELECT 'NYX_SQL_CONFIRMED'--",
|
||||
oracle_kind="OutputContains", oracle_value="NYX_SQL_CONFIRMED",
|
||||
is_benign=False, provenance="Curated", since_corpus_version=1,
|
||||
deprecated_at_corpus_version=None,
|
||||
fixture_paths=["tests/benchmark/corpus/rust/sqli/sqli_rusqlite_format.rs"],
|
||||
oob_nonce_slot=False,
|
||||
),
|
||||
# ── CODE_EXEC ──────────────────────────────────────────────────────────────
|
||||
PayloadEntry(
|
||||
cap="CODE_EXEC", label="cmdi-echo-marker",
|
||||
bytes_repr="; echo NYX_PWN_CMDI",
|
||||
oracle_kind="OutputContains", oracle_value="NYX_PWN_CMDI",
|
||||
is_benign=False, provenance="Curated", since_corpus_version=1,
|
||||
deprecated_at_corpus_version=None,
|
||||
fixture_paths=[
|
||||
"tests/benchmark/corpus/rust/cmdi/cmdi_command.rs",
|
||||
"tests/benchmark/corpus/rust/cmdi/cmdi_args.rs",
|
||||
],
|
||||
oob_nonce_slot=False,
|
||||
),
|
||||
# ── FILE_IO ────────────────────────────────────────────────────────────────
|
||||
PayloadEntry(
|
||||
cap="FILE_IO", label="path-traversal-passwd",
|
||||
bytes_repr="../../../../etc/passwd",
|
||||
oracle_kind="OutputContains", oracle_value="root:",
|
||||
is_benign=False, provenance="Curated", since_corpus_version=1,
|
||||
deprecated_at_corpus_version=None,
|
||||
fixture_paths=[
|
||||
"tests/benchmark/corpus/rust/path_traversal/path_file_open.rs",
|
||||
"tests/benchmark/corpus/rust/path_traversal/path_read.rs",
|
||||
],
|
||||
oob_nonce_slot=False,
|
||||
),
|
||||
PayloadEntry(
|
||||
cap="FILE_IO", label="path-traversal-benign",
|
||||
bytes_repr="benign_safe_file_that_does_not_exist_NYX_BENIGN",
|
||||
oracle_kind="OutputContains", oracle_value="root:",
|
||||
is_benign=True, provenance="Curated", since_corpus_version=1,
|
||||
deprecated_at_corpus_version=None,
|
||||
fixture_paths=["tests/benchmark/corpus/rust/path_traversal/path_file_open.rs"],
|
||||
oob_nonce_slot=False,
|
||||
),
|
||||
# ── SSRF ───────────────────────────────────────────────────────────────────
|
||||
PayloadEntry(
|
||||
cap="SSRF", label="ssrf-file-scheme",
|
||||
bytes_repr="file:///etc/passwd",
|
||||
oracle_kind="OutputContains", oracle_value="daemon:",
|
||||
is_benign=False, provenance="Curated", since_corpus_version=1,
|
||||
deprecated_at_corpus_version=None,
|
||||
fixture_paths=["tests/benchmark/corpus/rust/ssrf/ssrf_reqwest.rs"],
|
||||
oob_nonce_slot=False,
|
||||
),
|
||||
PayloadEntry(
|
||||
cap="SSRF", label="ssrf-oob-nonce",
|
||||
bytes_repr="<OOB URL generated at runtime>",
|
||||
oracle_kind="OobCallback", oracle_value="host=127.0.0.1",
|
||||
is_benign=False, provenance="Curated", since_corpus_version=2,
|
||||
deprecated_at_corpus_version=None,
|
||||
fixture_paths=["tests/benchmark/corpus/rust/ssrf/ssrf_reqwest.rs"],
|
||||
oob_nonce_slot=True,
|
||||
),
|
||||
# ── HTML_ESCAPE ────────────────────────────────────────────────────────────
|
||||
PayloadEntry(
|
||||
cap="HTML_ESCAPE", label="xss-script-marker",
|
||||
bytes_repr="<script>NYX_XSS_CONFIRMED</script>",
|
||||
oracle_kind="OutputContains",
|
||||
oracle_value="<script>NYX_XSS_CONFIRMED</script>",
|
||||
is_benign=False, provenance="Curated", since_corpus_version=1,
|
||||
deprecated_at_corpus_version=None,
|
||||
fixture_paths=["tests/benchmark/corpus/rust/xss/axum_html/main.rs"],
|
||||
oob_nonce_slot=False,
|
||||
),
|
||||
PayloadEntry(
|
||||
cap="HTML_ESCAPE", label="xss-benign-text",
|
||||
bytes_repr="Hello World",
|
||||
oracle_kind="OutputContains",
|
||||
oracle_value="<script>NYX_XSS_CONFIRMED</script>",
|
||||
is_benign=True, provenance="Curated", since_corpus_version=1,
|
||||
deprecated_at_corpus_version=None,
|
||||
fixture_paths=["tests/benchmark/corpus/rust/xss/axum_html/main.rs"],
|
||||
oob_nonce_slot=False,
|
||||
),
|
||||
]
|
||||
|
||||
ALL_CAPS = ["SQL_QUERY", "CODE_EXEC", "FILE_IO", "SSRF", "HTML_ESCAPE"]
|
||||
# Rust source helpers ---------------------------------------------------------
|
||||
|
||||
|
||||
# ── Marker collision audit ────────────────────────────────────────────────────
|
||||
def load_corpus_version(path: Path = CORPUS_RS) -> int:
    """Read the ``pub const CORPUS_VERSION: u32 = N;`` value from ``path``.

    Raises:
        ValueError: if the constant declaration is not present in the file.
    """
    source = path.read_text(encoding="utf-8")
    found = re.search(r"pub const CORPUS_VERSION:\s*u32\s*=\s*(\d+);", source)
    if found is None:
        raise ValueError(f"CORPUS_VERSION not found in {path}")
    return int(found.group(1))
|
||||
|
||||
def audit_marker_collisions() -> list[tuple[str, str, str]]:
|
||||
collisions = []
|
||||
for p in PAYLOADS:
|
||||
if p.is_benign or p.oracle_kind != "OutputContains":
|
||||
|
||||
def parse_registry_entries(path: Path = REGISTRY_RS) -> list[RegistryEntry]:
    """Extract every ``(Cap::X, Lang::Y, module::PAYLOADS)`` row from ``path``.

    Each row becomes a ``RegistryEntry`` whose ``source_path`` is the module
    path with ``::`` turned into ``/`` and ``.rs`` appended, rooted at
    ``CORPUS_DIR``.

    Raises:
        ValueError: if the file contains no matching row at all.
    """
    row_pattern = re.compile(
        r"\(\s*Cap::([A-Z0-9_]+)\s*,\s*Lang::([A-Za-z0-9_]+)\s*,"
        r"\s*([A-Za-z0-9_:]+)::PAYLOADS\s*,?\s*\)",
        re.DOTALL,
    )
    registry_source = path.read_text(encoding="utf-8")
    rows = [
        RegistryEntry(
            cap,
            lang,
            module_path,
            CORPUS_DIR / f"{module_path.replace('::', '/')}.rs",
        )
        for cap, lang, module_path in row_pattern.findall(registry_source)
    ]
    if not rows:
        raise ValueError(f"No registry entries found in {path}")
    return rows
|
||||
|
||||
|
||||
def _raw_string_bounds(text: str, index: int) -> Optional[tuple[int, int, int]]:
|
||||
if text.startswith("br", index):
|
||||
marker_index = index + 2
|
||||
elif text.startswith("r", index):
|
||||
marker_index = index + 1
|
||||
else:
|
||||
return None
|
||||
|
||||
cursor = marker_index
|
||||
while cursor < len(text) and text[cursor] == "#":
|
||||
cursor += 1
|
||||
if cursor >= len(text) or text[cursor] != '"':
|
||||
return None
|
||||
|
||||
hashes = text[marker_index:cursor]
|
||||
body_start = cursor + 1
|
||||
terminator = '"' + hashes
|
||||
body_end = text.find(terminator, body_start)
|
||||
if body_end < 0:
|
||||
raise ValueError("unterminated Rust raw string literal")
|
||||
return body_start, body_end, body_end + len(terminator)
|
||||
|
||||
|
||||
def _quoted_literal_end(text: str, index: int) -> Optional[int]:
|
||||
raw = _raw_string_bounds(text, index)
|
||||
if raw:
|
||||
return raw[2]
|
||||
|
||||
if text.startswith('b"', index):
|
||||
quote = '"'
|
||||
cursor = index + 2
|
||||
elif text[index:index + 1] == '"':
|
||||
quote = '"'
|
||||
cursor = index + 1
|
||||
elif (
|
||||
text[index:index + 1] == "'"
|
||||
and index + 1 < len(text)
|
||||
and not (text[index + 1].isalpha() or text[index + 1] == "_")
|
||||
):
|
||||
quote = "'"
|
||||
cursor = index + 1
|
||||
else:
|
||||
return None
|
||||
|
||||
while cursor < len(text):
|
||||
char = text[cursor]
|
||||
if char == "\\":
|
||||
cursor += 2
|
||||
continue
|
||||
marker = p.oracle_value or ""
|
||||
for other in PAYLOADS:
|
||||
if other.cap == p.cap:
|
||||
if char == quote:
|
||||
return cursor + 1
|
||||
cursor += 1
|
||||
raise ValueError("unterminated Rust quoted literal")
|
||||
|
||||
|
||||
def _skip_ignored(text: str, index: int) -> int:
|
||||
if text.startswith("//", index):
|
||||
newline = text.find("\n", index + 2)
|
||||
return len(text) if newline < 0 else newline + 1
|
||||
|
||||
if text.startswith("/*", index):
|
||||
depth = 1
|
||||
cursor = index + 2
|
||||
while cursor < len(text) and depth:
|
||||
if text.startswith("/*", cursor):
|
||||
depth += 1
|
||||
cursor += 2
|
||||
elif text.startswith("*/", cursor):
|
||||
depth -= 1
|
||||
cursor += 2
|
||||
else:
|
||||
cursor += 1
|
||||
if depth:
|
||||
raise ValueError("unterminated Rust block comment")
|
||||
return cursor
|
||||
|
||||
literal_end = _quoted_literal_end(text, index)
|
||||
return literal_end if literal_end is not None else index
|
||||
|
||||
|
||||
def _find_matching(text: str, open_index: int, open_char: str, close_char: str) -> int:
    """Return the index of the ``close_char`` that balances ``open_index``.

    Scanning begins right after ``open_index``. Comments and quoted literals
    are jumped over with ``_skip_ignored`` so delimiters inside them do not
    affect the nesting depth.

    Raises:
        ValueError: if the text ends before the depth returns to zero.
    """
    nesting = 1
    pos = open_index + 1
    end = len(text)
    while pos < end:
        resume_at = _skip_ignored(text, pos)
        if resume_at != pos:
            pos = resume_at
            continue

        current = text[pos]
        if current == open_char:
            nesting += 1
        elif current == close_char:
            nesting -= 1
            if nesting == 0:
                return pos
        pos += 1
    raise ValueError(f"unterminated {open_char}{close_char} block")
|
||||
|
||||
|
||||
def _payload_blocks(text: str) -> list[str]:
|
||||
blocks: list[str] = []
|
||||
for match in re.finditer(r"\bCuratedPayload\s*\{", text):
|
||||
open_index = match.end() - 1
|
||||
close_index = _find_matching(text, open_index, "{", "}")
|
||||
blocks.append(text[open_index + 1:close_index])
|
||||
return blocks
|
||||
|
||||
|
||||
def _add_field(segment: str, fields: dict[str, str]) -> None:
|
||||
match = re.search(r"(^|\n)\s*([A-Za-z_][A-Za-z0-9_]*)\s*:", segment)
|
||||
if not match:
|
||||
return
|
||||
fields[match.group(2)] = segment[match.end():].strip()
|
||||
|
||||
|
||||
def _split_top_level_fields(block: str) -> dict[str, str]:
    """Split a struct-literal body into ``{field_name: raw_value_text}``.

    The body is cut at commas that sit outside every ``{}``, ``[]`` and ``()``
    pair. Comments and quoted literals are skipped via ``_skip_ignored`` so
    punctuation inside them is ignored. Each piece is handed to ``_add_field``.
    """
    closer_to_opener = {"}": "{", "]": "[", ")": "("}
    depth = {"{": 0, "[": 0, "(": 0}  # tracked per delimiter kind
    fields: dict[str, str] = {}
    segment_start = 0
    pos = 0

    while pos < len(block):
        resume_at = _skip_ignored(block, pos)
        if resume_at != pos:
            pos = resume_at
            continue

        current = block[pos]
        if current in depth:
            depth[current] += 1
        elif current in closer_to_opener:
            depth[closer_to_opener[current]] -= 1
        elif current == "," and not any(depth.values()):
            _add_field(block[segment_start:pos], fields)
            segment_start = pos + 1
        pos += 1

    # Whatever follows the last top-level comma is the final field.
    _add_field(block[segment_start:], fields)
    return fields
|
||||
|
||||
|
||||
def _parse_rust_string_literal(text: str, index: int) -> Optional[tuple[str, int]]:
|
||||
raw = _raw_string_bounds(text, index)
|
||||
if raw:
|
||||
body_start, body_end, literal_end = raw
|
||||
return text[body_start:body_end], literal_end
|
||||
|
||||
if text.startswith('b"', index):
|
||||
cursor = index + 2
|
||||
elif text[index:index + 1] == '"':
|
||||
cursor = index + 1
|
||||
else:
|
||||
return None
|
||||
|
||||
while cursor < len(text):
|
||||
char = text[cursor]
|
||||
if char == "\\":
|
||||
cursor += 2
|
||||
continue
|
||||
if char == '"':
|
||||
literal = text[index:cursor + 1]
|
||||
value = ast.literal_eval(literal)
|
||||
if isinstance(value, bytes):
|
||||
return value.decode("latin-1"), cursor + 1
|
||||
return str(value), cursor + 1
|
||||
cursor += 1
|
||||
raise ValueError("unterminated Rust string literal")
|
||||
|
||||
|
||||
def _rust_string_literals(expr: str) -> list[str]:
|
||||
strings: list[str] = []
|
||||
cursor = 0
|
||||
while cursor < len(expr):
|
||||
if expr.startswith("//", cursor) or expr.startswith("/*", cursor):
|
||||
cursor = _skip_ignored(expr, cursor)
|
||||
continue
|
||||
|
||||
parsed = _parse_rust_string_literal(expr, cursor)
|
||||
if parsed:
|
||||
value, cursor = parsed
|
||||
strings.append(value)
|
||||
continue
|
||||
|
||||
cursor += 1
|
||||
return strings
|
||||
|
||||
|
||||
def _parse_string_constants(text: str) -> dict[str, str]:
|
||||
constants: dict[str, str] = {}
|
||||
pattern = re.compile(r"(?:pub\s+)?const\s+([A-Z][A-Z0-9_]*):\s*&str\s*=\s*([^;]+);")
|
||||
for match in pattern.finditer(text):
|
||||
strings = _rust_string_literals(match.group(2))
|
||||
if strings:
|
||||
constants[match.group(1)] = strings[0]
|
||||
return constants
|
||||
|
||||
|
||||
def _required(fields: dict[str, str], name: str, source_path: Path) -> str:
|
||||
if name not in fields:
|
||||
rel = source_path.relative_to(REPO_ROOT)
|
||||
raise ValueError(f"missing field {name!r} in payload from {rel}")
|
||||
return fields[name]
|
||||
|
||||
|
||||
def _string_expr(expr: str, constants: dict[str, str]) -> str:
|
||||
expr = expr.strip()
|
||||
if expr in constants:
|
||||
return constants[expr]
|
||||
strings = _rust_string_literals(expr)
|
||||
if strings:
|
||||
return strings[0]
|
||||
return expr
|
||||
|
||||
|
||||
def _bool_expr(expr: str) -> bool:
|
||||
value = expr.strip()
|
||||
if value == "true":
|
||||
return True
|
||||
if value == "false":
|
||||
return False
|
||||
raise ValueError(f"expected Rust bool literal, got {value!r}")
|
||||
|
||||
|
||||
def _int_expr(expr: str) -> int:
|
||||
match = re.search(r"\d+", expr)
|
||||
if not match:
|
||||
raise ValueError(f"expected integer literal, got {expr!r}")
|
||||
return int(match.group(0))
|
||||
|
||||
|
||||
def _optional_int_expr(expr: str) -> Optional[int]:
|
||||
expr = expr.strip()
|
||||
if expr == "None":
|
||||
return None
|
||||
match = re.fullmatch(r"Some\(\s*(\d+)\s*\)", expr)
|
||||
if match:
|
||||
return int(match.group(1))
|
||||
raise ValueError(f"expected Rust Option<u32> literal, got {expr!r}")
|
||||
|
||||
|
||||
def _oracle_expr(expr: str, constants: dict[str, str]) -> tuple[str, Optional[str]]:
|
||||
expr = expr.strip()
|
||||
if expr.startswith("Oracle::OutputContains"):
|
||||
open_index = expr.find("(")
|
||||
close_index = _find_matching(expr, open_index, "(", ")")
|
||||
marker = _string_expr(expr[open_index + 1:close_index], constants)
|
||||
return "OutputContains", marker
|
||||
|
||||
if expr.startswith("Oracle::OobCallback"):
|
||||
strings = _rust_string_literals(expr)
|
||||
return "OobCallback", f"host={strings[0]}" if strings else None
|
||||
|
||||
if expr.startswith("Oracle::SinkCrash"):
|
||||
return "SinkCrash", "signals=all"
|
||||
|
||||
if expr.startswith("Oracle::SinkProbe"):
|
||||
predicates = list(dict.fromkeys(re.findall(r"ProbePredicate::([A-Za-z0-9_]+)", expr)))
|
||||
return "SinkProbe", ",".join(predicates) if predicates else None
|
||||
|
||||
return expr.split("{", 1)[0].split("(", 1)[0].strip(), None
|
||||
|
||||
|
||||
def _payload_from_block(
    entry: RegistryEntry,
    block: str,
    constants: dict[str, str],
) -> PayloadEntry:
    """Build a ``PayloadEntry`` from one ``CuratedPayload { … }`` body.

    ``cap`` and ``lang`` come from the registry ``entry``. Every other value
    is parsed from the block's fields, and all of them are required.
    ``cve_refs`` are the sorted distinct ``CVE-YYYY-NNNN`` ids found anywhere
    in the block text.

    Raises:
        ValueError: if a required field is missing or has an unexpected form.
    """
    fields = _split_top_level_fields(block)
    source_path = entry.source_path

    def field_text(name: str) -> str:
        return _required(fields, name, source_path)

    # Evaluated in the same order as the dataclass is populated, so the first
    # missing field reported stays the same.
    oracle_kind, oracle_value = _oracle_expr(field_text("oracle"), constants)
    rel_source = str(source_path.relative_to(REPO_ROOT))
    label = _string_expr(field_text("label"), constants)
    bytes_repr = _string_expr(field_text("bytes"), constants)
    is_benign = _bool_expr(field_text("is_benign"))
    provenance = field_text("provenance").strip().removeprefix("PayloadProvenance::")
    since_version = _int_expr(field_text("since_corpus_version"))
    deprecated_at = _optional_int_expr(field_text("deprecated_at_corpus_version"))
    fixture_paths = _rust_string_literals(field_text("fixture_paths"))
    oob_nonce_slot = _bool_expr(field_text("oob_nonce_slot"))
    cve_refs = sorted(set(re.findall(r"CVE-\d{4}-\d{4,7}", block)))

    return PayloadEntry(
        cap=entry.cap,
        lang=entry.lang,
        label=label,
        bytes_repr=bytes_repr,
        oracle_kind=oracle_kind,
        oracle_value=oracle_value,
        is_benign=is_benign,
        provenance=provenance,
        since_corpus_version=since_version,
        deprecated_at_corpus_version=deprecated_at,
        fixture_paths=fixture_paths,
        oob_nonce_slot=oob_nonce_slot,
        source_path=rel_source,
        cve_refs=cve_refs,
    )
|
||||
|
||||
|
||||
def load_payloads() -> list[PayloadEntry]:
    """Parse every payload declared by the Rust registry.

    For each registry entry the referenced source file is read, its string
    constants are collected, and each ``CuratedPayload`` block becomes a
    ``PayloadEntry``.

    Raises:
        FileNotFoundError: if a registry entry points at a missing file.
        ValueError: if a referenced file contains no ``CuratedPayload`` block.
    """
    collected: list[PayloadEntry] = []
    for entry in parse_registry_entries():
        payload_file = entry.source_path
        if not payload_file.exists():
            rel = payload_file.relative_to(REPO_ROOT)
            raise FileNotFoundError(f"registry entry points at missing payload file: {rel}")

        source = payload_file.read_text(encoding="utf-8")
        constants = _parse_string_constants(source)
        blocks = _payload_blocks(source)
        if not blocks:
            rel = payload_file.relative_to(REPO_ROOT)
            raise ValueError(f"no CuratedPayload entries found in {rel}")

        collected.extend(_payload_from_block(entry, block, constants) for block in blocks)

    return collected
|
||||
|
||||
|
||||
CORPUS_VERSION = load_corpus_version()
|
||||
PAYLOADS: list[PayloadEntry] = load_payloads()
|
||||
ALL_CAPS = list(dict.fromkeys(p.cap for p in PAYLOADS))
|
||||
|
||||
|
||||
# Marker collision audit ------------------------------------------------------
|
||||
|
||||
|
||||
def audit_marker_collisions(payloads: list[PayloadEntry] = PAYLOADS) -> list[tuple[str, str, str]]:
    """Find oracle markers that also occur in another capability's payload bytes.

    Only non-benign ``OutputContains`` payloads with a non-empty marker are
    audited. Each is compared against payloads of a different ``cap`` that are
    neither benign nor OOB-nonce slots.

    Returns:
        One ``(cap, label, other_cap)`` tuple per collision.
    """
    collisions = []
    for payload in payloads:
        if payload.is_benign or payload.oracle_kind != "OutputContains":
            continue
        marker = payload.oracle_value or ""
        if not marker:
            continue

        for other in payloads:
            if other.cap == payload.cap:
                continue
            if other.is_benign or other.oob_nonce_slot:
                continue
            if marker in other.bytes_repr:
                # Exactly one record per collision, keyed on the audited
                # payload. The loop variable here is `payload`, not `p`.
                collisions.append((payload.cap, payload.label, other.cap))
    return collisions
|
||||
|
||||
|
||||
# ── Coverage table ────────────────────────────────────────────────────────────
|
||||
# Coverage table --------------------------------------------------------------
|
||||
|
||||
def build_coverage_table() -> dict:
|
||||
|
||||
def build_coverage_table(payloads: list[PayloadEntry] = PAYLOADS) -> dict:
|
||||
result = {}
|
||||
for cap in ALL_CAPS:
|
||||
cap_payloads = [p for p in PAYLOADS if p.cap == cap]
|
||||
cap_payloads = [payload for payload in payloads if payload.cap == cap]
|
||||
result[cap] = {
|
||||
"total": len(cap_payloads),
|
||||
"vuln": sum(1 for p in cap_payloads if not p.is_benign),
|
||||
|
|
@ -176,7 +461,8 @@ def build_coverage_table() -> dict:
|
|||
return result
|
||||
|
||||
|
||||
# ── Repro artifact timestamps ─────────────────────────────────────────────────
|
||||
# Repro artifact timestamps ---------------------------------------------------
|
||||
|
||||
|
||||
def scan_last_confirmed(repro_dir: Path) -> dict[str, str]:
|
||||
"""Return {payload_label: iso_timestamp} from repro artifact metadata."""
|
||||
|
|
@ -189,7 +475,6 @@ def scan_last_confirmed(repro_dir: Path) -> dict[str, str]:
|
|||
label = data.get("payload_label", "")
|
||||
ts = data.get("confirmed_at", "")
|
||||
if label and ts:
|
||||
# Keep most recent.
|
||||
if label not in timestamps or ts > timestamps[label]:
|
||||
timestamps[label] = ts
|
||||
except (json.JSONDecodeError, KeyError):
|
||||
|
|
@ -197,30 +482,30 @@ def scan_last_confirmed(repro_dir: Path) -> dict[str, str]:
|
|||
return timestamps
|
||||
|
||||
|
||||
# ── fuzz-discovered count ─────────────────────────────────────────────────────
|
||||
# fuzz-discovered count -------------------------------------------------------
|
||||
|
||||
|
||||
def count_discovered(discovered_dir: Path) -> int:
    """Count files below ``discovered_dir``, ignoring ``*.json`` and ``.gitkeep``.

    Returns 0 when the directory does not exist.
    """
    if not discovered_dir.exists():
        return 0
    pending = 0
    for candidate in discovered_dir.rglob("*"):
        if not candidate.is_file():
            continue
        if candidate.name.endswith(".json") or candidate.name == ".gitkeep":
            continue
        pending += 1
    return pending
|
||||
|
||||
|
||||
# ── Main ──────────────────────────────────────────────────────────────────────
|
||||
|
||||
def main() -> int:
|
||||
parser = argparse.ArgumentParser(description="Nyx corpus health dashboard")
|
||||
parser.add_argument("--repro-dir", default="repro", help="Path to repro artifacts")
|
||||
parser.add_argument("--discovered-dir", default="fuzz-discovered",
|
||||
help="Path to fuzz-discovered/ directory")
|
||||
parser.add_argument(
|
||||
"--discovered-dir",
|
||||
default="fuzz-discovered",
|
||||
help="Path to fuzz-discovered/ directory",
|
||||
)
|
||||
parser.add_argument("--json", action="store_true", help="Output JSON instead of text")
|
||||
args = parser.parse_args()
|
||||
|
||||
# Change to repo root (parent of scripts/).
|
||||
repo_root = Path(__file__).parent.parent
|
||||
os.chdir(repo_root)
|
||||
os.chdir(REPO_ROOT)
|
||||
|
||||
collisions = audit_marker_collisions()
|
||||
coverage = build_coverage_table()
|
||||
|
|
@ -229,10 +514,12 @@ def main() -> int:
|
|||
|
||||
report = {
|
||||
"corpus_version": CORPUS_VERSION,
|
||||
"registry_entries": len(parse_registry_entries()),
|
||||
"total_payloads": len(PAYLOADS),
|
||||
"coverage": coverage,
|
||||
"marker_collisions": collisions,
|
||||
"last_confirmed": timestamps,
|
||||
"cve_reference_count": sum(len(p.cve_refs) for p in PAYLOADS),
|
||||
"fuzz_discovered_pending": discovered_count,
|
||||
"healthy": len(collisions) == 0,
|
||||
}
|
||||
|
|
@ -241,44 +528,41 @@ def main() -> int:
|
|||
print(json.dumps(report, indent=2))
|
||||
return 0 if report["healthy"] else 1
|
||||
|
||||
# Text output.
|
||||
print(f"Nyx Corpus Dashboard (corpus_version={CORPUS_VERSION})")
|
||||
print("=" * 60)
|
||||
print()
|
||||
|
||||
# Coverage table.
|
||||
print("Per-cap coverage:")
|
||||
hdr = f" {'Cap':<18} {'Total':>5} {'Vuln':>5} {'Benign':>6} {'OOB':>4} {'Fixtures':>8}"
|
||||
hdr = f" {'Cap':<22} {'Total':>5} {'Vuln':>5} {'Benign':>6} {'OOB':>4} {'Fixtures':>8}"
|
||||
print(hdr)
|
||||
print(" " + "-" * 52)
|
||||
print(" " + "-" * 56)
|
||||
for cap, info in coverage.items():
|
||||
fixture_ok = "ok" if info["has_fixture_paths"] else "MISSING"
|
||||
print(
|
||||
f" {cap:<18} {info['total']:>5} {info['vuln']:>5} "
|
||||
f" {cap:<22} {info['total']:>5} {info['vuln']:>5} "
|
||||
f"{info['benign']:>6} {info['oob_slots']:>4} {fixture_ok:>8}"
|
||||
)
|
||||
print()
|
||||
|
||||
# Last confirmed timestamps.
|
||||
if timestamps:
|
||||
print("Last confirmed timestamps:")
|
||||
for label, ts in sorted(timestamps.items()):
|
||||
print(f" {label:<35} {ts}")
|
||||
print()
|
||||
|
||||
# fuzz-discovered pending.
|
||||
print(f"Registry entries: {report['registry_entries']}")
|
||||
print(f"CVE references: {report['cve_reference_count']}")
|
||||
print(f"Fuzz-discovered pending promotion: {discovered_count}")
|
||||
print()
|
||||
|
||||
# Marker collisions.
|
||||
if collisions:
|
||||
print("FAIL: Marker collisions detected (§16.3):")
|
||||
print("FAIL: Marker collisions detected (section 16.3):")
|
||||
for cap, label, other_cap in collisions:
|
||||
print(f" {cap}/{label} marker appears in {other_cap} payload bytes")
|
||||
return 1
|
||||
else:
|
||||
print("OK: No marker collisions detected.")
|
||||
return 0
|
||||
|
||||
print("OK: No marker collisions detected.")
|
||||
return 0
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue