feat(dynamic): enhance corpus sync script with improved payload parsing, registry checks, and expanded validation logic

This commit is contained in:
elipeter 2026-06-01 22:51:05 -05:00
parent 467d41dcfb
commit 8ee6e3af7c
22 changed files with 810 additions and 230 deletions

View file

@ -1,84 +1,106 @@
#!/usr/bin/env python3
# Usage: python3 scripts/check_corpus_sync.py
# Run from repo root or any subdirectory; the script relocates to repo root.
# Exits 0 if scripts/corpus_dashboard.py reads the same CORPUS_VERSION as
# src/dynamic/corpus.rs and the same payload identities as the canonical Rust
# registry. Exits 1 on any divergence.
from __future__ import annotations
import os
import re
import sys
from pathlib import Path
# ── locate repo root (parent of the scripts/ dir this file lives in) ─────────
SCRIPT_DIR = Path(__file__).resolve().parent
REPO_ROOT = SCRIPT_DIR.parent
# Work from the repo root so the script behaves the same when launched from
# any subdirectory (see the usage note at the top of the file).
os.chdir(REPO_ROOT)
# Put this script's directory first on sys.path so the import below resolves
# to the corpus_dashboard module that sits next to this file.
sys.path.insert(0, str(SCRIPT_DIR))
import corpus_dashboard # noqa: E402
# Rust source file that is searched for the CORPUS_VERSION constant.
CORPUS_RS = REPO_ROOT / "src" / "dynamic" / "corpus.rs"
# Path of the dashboard script whose data is compared against the Rust side.
DASHBOARD_PY = REPO_ROOT / "scripts" / "corpus_dashboard.py"
# Directory tree scanned for raw CuratedPayload blocks.
CORPUS_DIR = REPO_ROOT / "src" / "dynamic" / "corpus"
# ── parse helpers ─────────────────────────────────────────────────────────────
def _corpus_version_from_text(text: str) -> int | None:
    """Return the integer from ``pub const CORPUS_VERSION: u32 = N;`` or None."""
    version_match = re.search(r"pub const CORPUS_VERSION:\s*u32\s*=\s*(\d+);", text)
    return int(version_match.group(1)) if version_match else None


def parse_corpus_rs(path: Path):
    """Read a Rust corpus file and return ``(version, labels)``.

    Args:
        path: Rust source file to read (decoded as UTF-8).

    Returns:
        A tuple of the CORPUS_VERSION value (``None`` when the constant is
        not found) and the set of every string captured by ``label: "..."``.
    """
    text = path.read_text(encoding="utf-8")
    version = _corpus_version_from_text(text)
    labels = set(re.findall(r'label:\s*"([^"]+)"', text))
    return version, labels


def parse_corpus_rs_version(path: Path) -> int | None:
    """Return the CORPUS_VERSION declared in a Rust corpus file.

    Args:
        path: Rust source file to read (decoded as UTF-8).

    Returns:
        The declared version as an int, or ``None`` when no
        ``pub const CORPUS_VERSION: u32 = N;`` declaration is present.
    """
    text = path.read_text(encoding="utf-8")
    return _corpus_version_from_text(text)
def parse_dashboard_py(path: Path):
    """Read the dashboard source and return ``(version, labels)``.

    Args:
        path: Python source file to read (decoded as UTF-8).

    Returns:
        A tuple of the integer assigned to ``CORPUS_VERSION`` (``None`` when
        no such assignment is found) and the set of every string captured by
        ``label="..."``.
    """
    text = path.read_text(encoding="utf-8")
    # Anchor to the start of a line (allowing indentation) so a commented-out
    # assignment or a longer name such as LEGACY_CORPUS_VERSION is not
    # mistaken for the real constant.
    version_match = re.search(
        r'^[ \t]*CORPUS_VERSION\s*=\s*(\d+)', text, re.MULTILINE
    )
    version = int(version_match.group(1)) if version_match else None
    labels = set(re.findall(r'label="([^"]+)"', text))
    return version, labels
# ── main ──────────────────────────────────────────────────────────────────────
def payload_identities(payloads: list[corpus_dashboard.PayloadEntry]) -> set[tuple[str, str, str]]:
    """Collapse payload entries into a set of ``(cap, lang, label)`` triples.

    Entries that share all three attributes map to the same triple, so the
    result can be smaller than the input.
    """
    identities = set()
    for entry in payloads:
        identities.add((entry.cap, entry.lang, entry.label))
    return identities
def count_raw_payload_blocks(path: Path = CORPUS_DIR) -> int:
    """Count ``CuratedPayload {`` openers in the Rust files under *path*.

    Every ``*.rs`` file found recursively is read as UTF-8, except files
    named audit.rs, mod.rs or registry.rs. The count is purely textual:
    each regex match adds one, wherever in the file it appears.
    """
    skipped_names = {"audit.rs", "mod.rs", "registry.rs"}
    opener = r"\bCuratedPayload\s*\{"
    return sum(
        len(re.findall(opener, rust_file.read_text(encoding="utf-8")))
        for rust_file in path.rglob("*.rs")
        if rust_file.name not in skipped_names
    )
def fmt_identity(identity: tuple[str, str, str]) -> str:
    """Render a ``(cap, lang, label)`` triple as ``cap/lang/label``."""
    (cap, lang, label) = identity
    rendered = f"{cap}/{lang}/{label}"
    return rendered
def main() -> int:
    """Compare the dashboard's corpus data with the Rust sources.

    Three checks are run and every failure is reported before exiting:

    1. CORPUS_VERSION in corpus.rs equals ``corpus_dashboard.CORPUS_VERSION``.
    2. The ``(cap, lang, label)`` identities from
       ``corpus_dashboard.load_payloads()`` equal those of
       ``corpus_dashboard.PAYLOADS``.
    3. ``len(corpus_dashboard.PAYLOADS)`` equals the number of raw
       ``CuratedPayload {`` blocks counted in the corpus source tree.

    Returns:
        0 when every check passes, 1 otherwise.
    """
    rs_version = parse_corpus_rs_version(CORPUS_RS)
    dashboard_version = corpus_dashboard.CORPUS_VERSION
    registry_payloads = corpus_dashboard.load_payloads()
    raw_payload_count = count_raw_payload_blocks()

    ok = True

    # version check
    if rs_version is None:
        print("ERROR: CORPUS_VERSION not found in corpus.rs")
        ok = False
    elif rs_version == dashboard_version:
        print(f"CORPUS_VERSION: {rs_version} [match]")
    else:
        print(
            "CORPUS_VERSION mismatch: "
            f"corpus.rs={rs_version} corpus_dashboard.py={dashboard_version}"
        )
        ok = False

    # payload identity check
    registry_ids = payload_identities(registry_payloads)
    dashboard_ids = payload_identities(corpus_dashboard.PAYLOADS)
    only_in_registry = registry_ids - dashboard_ids
    only_in_dashboard = dashboard_ids - registry_ids
    shared = registry_ids & dashboard_ids

    print(f"Payload identities in both: {len(shared)}")
    if only_in_registry:
        print(f"Payload identities only in Rust registry: {len(only_in_registry)}")
        for identity in sorted(only_in_registry):
            print(f"  + {fmt_identity(identity)}")
        ok = False
    if only_in_dashboard:
        print(f"Payload identities only in dashboard: {len(only_in_dashboard)}")
        for identity in sorted(only_in_dashboard):
            print(f"  - {fmt_identity(identity)}")
        ok = False

    # The identity sets above collapse duplicates, so also compare raw counts.
    if len(corpus_dashboard.PAYLOADS) == raw_payload_count:
        print(f"CuratedPayload blocks covered: {raw_payload_count} [match]")
    else:
        print(
            "CuratedPayload block count mismatch: "
            f"source_tree={raw_payload_count} dashboard={len(corpus_dashboard.PAYLOADS)}"
        )
        ok = False

    if ok:
        print("Corpus sync: OK")
        return 0
    print("Corpus sync: FAIL - update corpus_dashboard.py to match the Rust registry")
    return 1


if __name__ == "__main__":
    sys.exit(main())