# Mirror of https://github.com/alainnothere/llm-circuit-finder.git
# Synced 2026-04-24 20:56:21 +02:00
#!/usr/bin/env python3
|
|
"""
|
|
Compare lm_eval results across multiple runs.
|
|
|
|
Reads the results JSON files from lm_eval output directories and
|
|
prints a side-by-side comparison table.
|
|
|
|
Usage:
|
|
python compare_eval.py ./eval_base ./eval_rys_balanced ./eval_rys_triple
|
|
python compare_eval.py ./eval_*
|
|
"""
|
|
|
|
import argparse
|
|
import json
|
|
import sys
|
|
from pathlib import Path
|
|
|
|
|
|
def load_results(folder: str) -> dict:
    """Load lm_eval results from an output folder (or a results JSON file).

    Searches *folder* recursively for files matching ``*results*.json`` and
    loads the most recently modified match.  If *folder* is itself a
    ``.json`` file, it is loaded directly.

    Args:
        folder: Path to an lm_eval output directory or a results JSON file.

    Returns:
        The parsed results dict, or an empty dict (with a warning on
        stderr) when nothing is found or the JSON cannot be parsed.
    """
    path = Path(folder)

    # A direct path to a JSON file short-circuits the directory search
    # (and avoids globbing a non-directory).
    if path.suffix == '.json' and path.is_file():
        try:
            with open(path) as f:
                return json.load(f)
        except json.JSONDecodeError as e:
            print(f"WARNING: Could not parse {path}: {e}", file=sys.stderr)
            return {}

    # lm_eval saves results in a JSON file inside the folder.
    # The filename varies between versions, so find it by pattern.
    candidates = list(path.glob("**/*results*.json"))
    if not candidates:
        print(f"WARNING: No results JSON found in {path}", file=sys.stderr)
        return {}

    # Several runs may have written here; use the most recent one.
    results_file = max(candidates, key=lambda p: p.stat().st_mtime)
    try:
        with open(results_file) as f:
            return json.load(f)
    except json.JSONDecodeError as e:
        print(f"WARNING: Could not parse {results_file}: {e}", file=sys.stderr)
        return {}
|
|
|
|
|
|
def extract_metrics(data: dict) -> dict:
    """Extract task metrics from the lm_eval results format.

    lm_eval stores per-task metric keys as ``"metric_name,filter_name"``.
    Only numeric values with a recognized filter suffix are kept; the
    ``alias`` pseudo-metric is skipped.

    Args:
        data: Parsed lm_eval results JSON (expects a top-level "results"
            mapping of task name -> metric dict).

    Returns:
        Mapping of ``"task|metric|filter"`` labels to dicts with
        ``task``, ``metric``, ``filter`` and ``value`` keys.
    """
    # Filter suffixes emitted by lm_eval that we know how to display.
    known_suffixes = (",none", ",flexible-extract", ",strict-match", ",get-answer")

    metrics = {}
    for task_name, task_data in data.get("results", {}).items():
        for key, value in task_data.items():
            if not key.endswith(known_suffixes):
                continue
            # Split "metric_name,filter_name" on the last comma.
            metric, _, filter_name = key.rpartition(",")
            if metric == "alias" or not isinstance(value, (int, float)):
                continue
            metrics[f"{task_name}|{metric}|{filter_name}"] = {
                "task": task_name,
                "metric": metric,
                "filter": filter_name,
                "value": value,
            }
    return metrics
|
|
|
|
|
|
def get_display_name(label: str, metric_info: dict) -> str:
    """Create a readable display name from metric info.

    Shows ``task [filter]`` when a meaningful filter is present,
    otherwise ``task [metric]``.  Common long task prefixes are
    shortened for table readability.
    """
    # Shorten common task names.
    task = metric_info["task"].replace("bbh_cot_fewshot_", "bbh/")
    filt = metric_info["filter"]

    if filt in ("", "none", "get-answer"):
        # No informative filter -> fall back to the metric name.
        return f"{task} [{metric_info['metric']}]"
    return f"{task} [{filt}]"
|
|
|
|
|
|
def compare(folders: list[str], names: list[str] | None = None):
    """Compare lm_eval results across output folders.

    Prints a side-by-side table (one column per run) plus a delta column
    (last run minus first run) when more than one run is given, followed
    by a per-run average over all non-stderr metrics.

    Args:
        folders: lm_eval output directories or results JSON files.
        names: Optional display names, one per folder; defaults to the
            folder basenames.

    Raises:
        ValueError: if ``names`` is given but its length does not match
            ``folders``.
    """
    if names is None:
        names = [Path(f).name for f in folders]
    elif len(names) != len(folders):
        raise ValueError(f"Expected {len(folders)} names, got {len(names)}")

    # Load and flatten the results of every run.
    all_metrics = {}
    for name, folder in zip(names, folders):
        all_metrics[name] = extract_metrics(load_results(folder))

    # Union of metric labels across runs; sorting groups tasks together.
    sorted_labels = sorted({label for m in all_metrics.values() for label in m})

    # Table geometry: fixed value columns, metric column wide enough for
    # the longest run name (min 45 to fit typical task names).
    col_width = 10
    name_col = max(45, max(len(n) for n in names))

    header = f"{'Metric':<{name_col}}"
    for name in names:
        header += f" {name:>{col_width}}"
    if len(names) > 1:
        header += f" {'Δ(last-first)':>{col_width+2}}"

    print()
    print("=" * len(header))
    print("lm_eval Results Comparison")
    print("=" * len(header))
    print(header)
    print("-" * len(header))

    prev_task = None
    for label in sorted_labels:
        # Skip stderr and alias entries entirely (before any formatting work).
        if "stderr" in label.lower() or "alias" in label.lower():
            continue

        # Display info comes from the first run that has this metric.
        metric_info = next(
            (m[label] for m in all_metrics.values() if label in m), None
        )
        if metric_info is None:
            continue

        display = get_display_name(label, metric_info)

        # Blank line between tasks for readability.
        current_task = metric_info["task"]
        if prev_task and current_task != prev_task:
            print()
        prev_task = current_task

        row = f"{display:<{name_col}}"

        values = []
        for name in names:
            metrics = all_metrics[name]
            if label in metrics:
                val = metrics[label]["value"]
                values.append(val)
                if isinstance(val, float):
                    row += f" {val:>{col_width}.4f}"
                else:
                    row += f" {val:>{col_width}}"
            else:
                # Run is missing this metric; keep columns aligned.
                values.append(None)
                row += f" {'---':>{col_width}}"

        # Delta column (last run minus first run).
        if len(names) > 1 and values[0] is not None and values[-1] is not None:
            delta = values[-1] - values[0]
            if delta:
                row += f" {delta:>+{col_width}.4f}"
            else:
                row += f" {'0':>{col_width+1}}"
        elif len(names) > 1:
            row += f" {'---':>{col_width+1}}"

        print(row)

    print("=" * len(header))

    # Summary: average across all (non-stderr) metrics per run.
    print(f"\n{'Average (all metrics)':<{name_col}}", end="")
    avgs = []
    for name in names:
        vals = [
            m["value"]
            for m in all_metrics[name].values()
            if isinstance(m["value"], (int, float)) and "stderr" not in m["metric"]
        ]
        if vals:
            avg = sum(vals) / len(vals)
            avgs.append(avg)
            print(f" {avg:>{col_width}.4f}", end="")
        else:
            avgs.append(None)
            print(f" {'---':>{col_width}}", end="")

    if len(avgs) > 1 and avgs[0] is not None and avgs[-1] is not None:
        print(f" {avgs[-1] - avgs[0]:>+{col_width}.4f}", end="")
    print()
|
|
|
|
|
|
def main():
    """Parse command-line arguments and run the comparison."""
    parser = argparse.ArgumentParser(
        description="Compare lm_eval results across runs",
        epilog="Example: python compare_eval.py ./eval_base ./eval_rys",
    )
    parser.add_argument(
        "folders",
        nargs="+",
        help="lm_eval output directories or JSON files",
    )
    parser.add_argument(
        "--names",
        nargs="+",
        help="Display names for each run (default: folder names)",
    )
    args = parser.parse_args()

    # An absent --names is None; pass it through so compare() falls back
    # to folder basenames.
    compare(args.folders, args.names or None)
|
|
|
|
|
|
# Script entry point.
if __name__ == "__main__":
    main()
|