llm-circuit-finder/compare_eval.py

#!/usr/bin/env python3
"""
Compare lm_eval results across multiple runs.
Reads the results JSON files from lm_eval output directories and
prints a side-by-side comparison table.
Usage:
python compare_eval.py ./eval_base ./eval_rys_balanced ./eval_rys_triple
python compare_eval.py ./eval_*
"""
import argparse
import json
import sys
from pathlib import Path


def load_results(folder: str) -> dict:
    """Load lm_eval results from an output folder or a single results JSON file."""
    folder = Path(folder)
    # lm_eval writes its results to a JSON file inside the output folder.
    # The filename varies, so search for it.
    candidates = list(folder.glob("**/*results*.json"))
    if not candidates:
        # Fall back to treating the argument itself as a JSON file.
        if folder.suffix == ".json" and folder.is_file():
            with open(folder) as f:
                return json.load(f)
        print(f"WARNING: No results JSON found in {folder}", file=sys.stderr)
        return {}
    # Use the most recently modified results file.
    results_file = max(candidates, key=lambda p: p.stat().st_mtime)
    with open(results_file) as f:
        return json.load(f)
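
# Illustrative only: the folder layout below is an assumption about how recent
# lm-evaluation-harness versions save results under --output_path (a per-model
# subdirectory containing a results_<timestamp>.json); older versions differ,
# which is why load_results() searches recursively for any "*results*.json".
#
#   eval_base/
#       <model_name>/
#           results_2025-01-01T00-00-00.000000.json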


def extract_metrics(data: dict) -> dict:
    """Extract per-task metrics from the lm_eval results format."""
    metrics = {}
    results = data.get("results", {})
    known_filters = (",none", ",flexible-extract", ",strict-match", ",get-answer")
    for task_name, task_data in results.items():
        for key, value in task_data.items():
            if not key.endswith(known_filters):
                continue
            # Keys use the "metric_name,filter_name" format.
            metric, _, filter_name = key.rpartition(",")
            if not isinstance(value, (int, float)) or metric == "alias":
                continue
            label = f"{task_name}|{metric}|{filter_name}"
            metrics[label] = {
                "task": task_name,
                "metric": metric,
                "filter": filter_name,
                "value": value,
            }
    return metrics
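
# Rough sketch of the shape extract_metrics() assumes, with made-up numbers
# (real key names vary by task and lm_eval version; treat this as illustrative):
#
#   data = {"results": {"gsm8k": {"alias": "gsm8k",
#                                 "exact_match,strict-match": 0.57,
#                                 "exact_match_stderr,strict-match": 0.01}}}
#   extract_metrics(data)
#   # -> {"gsm8k|exact_match|strict-match":
#   #         {"task": "gsm8k", "metric": "exact_match",
#   #          "filter": "strict-match", "value": 0.57}, ...}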


def get_display_name(label: str, metric_info: dict) -> str:
    """Create a readable display name from metric info."""
    task = metric_info["task"]
    metric = metric_info["metric"]
    filt = metric_info["filter"]
    # Shorten common task names.
    task = task.replace("bbh_cot_fewshot_", "bbh/")
    if filt and filt not in ("none", "get-answer"):
        return f"{task} [{filt}]"
    return f"{task} [{metric}]"


def compare(folders: list[str], names: list[str] | None = None):
    """Compare results across folders."""
    if names is None:
        names = [Path(f).name for f in folders]
    # Width needed for the longest run name.
    max_name_len = max(len(n) for n in names)

    # Load all results.
    all_metrics = {}
    for i, folder in enumerate(folders):
        data = load_results(folder)
        metrics = extract_metrics(data)
        all_metrics[names[i]] = metrics

    # Collect all unique metric labels across runs.
    all_labels = set()
    for metrics in all_metrics.values():
        all_labels.update(metrics.keys())
    # Sort labels by task name.
    sorted_labels = sorted(all_labels)

    # Print header.
    col_width = 10
    delta_col = len("Δ(last-first)")  # width of the delta column
    name_col = max(45, max_name_len)
    header = f"{'Metric':<{name_col}}"
    for name in names:
        header += f" {name:>{col_width}}"
    if len(names) > 1:
        header += f" {'Δ(last-first)':>{delta_col}}"
    print()
    print("=" * len(header))
    print("lm_eval Results Comparison")
    print("=" * len(header))
    print(header)
    print("-" * len(header))

    prev_task = None
    for label in sorted_labels:
        # Skip stderr and alias entries.
        if "stderr" in label.lower() or "alias" in label.lower():
            continue
        # Get display info from the first run that has this metric.
        metric_info = None
        for metrics in all_metrics.values():
            if label in metrics:
                metric_info = metrics[label]
                break
        if metric_info is None:
            continue
        display = get_display_name(label, metric_info)

        # Blank separator line between tasks.
        current_task = metric_info["task"]
        if prev_task and current_task != prev_task:
            print()
        prev_task = current_task

        row = f"{display:<{name_col}}"
        values = []
        for name in names:
            metrics = all_metrics[name]
            if label in metrics:
                val = metrics[label]["value"]
                values.append(val)
                if isinstance(val, float):
                    row += f" {val:>{col_width}.4f}"
                else:
                    row += f" {val:>{col_width}}"
            else:
                values.append(None)
                row += f" {'---':>{col_width}}"

        # Delta column (last run minus first run).
        if len(names) > 1 and values[0] is not None and values[-1] is not None:
            delta = values[-1] - values[0]
            if delta == 0:
                row += f" {'0':>{delta_col}}"
            else:
                row += f" {delta:>+{delta_col}.4f}"
        elif len(names) > 1:
            row += f" {'---':>{delta_col}}"
        print(row)
    print("=" * len(header))

    # Summary: average across all (non-stderr) metrics per run.
    print(f"\n{'Average (all metrics)':<{name_col}}", end="")
    avgs = []
    for name in names:
        metrics = all_metrics[name]
        vals = [m["value"] for m in metrics.values()
                if isinstance(m["value"], (int, float)) and "stderr" not in m["metric"]]
        if vals:
            avg = sum(vals) / len(vals)
            avgs.append(avg)
            print(f" {avg:>{col_width}.4f}", end="")
        else:
            avgs.append(None)
            print(f" {'---':>{col_width}}", end="")
    if len(avgs) > 1 and avgs[0] is not None and avgs[-1] is not None:
        delta = avgs[-1] - avgs[0]
        print(f" {delta:>+{delta_col}.4f}", end="")
    print()


def main():
    parser = argparse.ArgumentParser(
        description="Compare lm_eval results across runs",
        epilog="Example: python compare_eval.py ./eval_base ./eval_rys",
    )
    parser.add_argument("folders", nargs="+",
                        help="lm_eval output directories or JSON files")
    parser.add_argument("--names", nargs="+",
                        help="Display names for each run (default: folder names)")
    args = parser.parse_args()
    names = args.names if args.names else None
    compare(args.folders, names)


if __name__ == "__main__":
    main()
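
# Example invocation with explicit display names (the paths and names here are
# placeholders, not runs that ship with this repository):
#   python compare_eval.py ./eval_base ./eval_pruned --names base pruned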