llm-circuit-finder/sweep.py

#!/usr/bin/env python3
"""
RYS Layer Duplication Sweep

Orchestrates the search for optimal layer duplication configuration:
1. Generate modified GGUF with duplicated layers
2. Start llama-server with the modified model
3. Run math, EQ, and reasoning probes
4. Score and record results
5. Print live results table
6. Kill server, repeat

Usage:
    python sweep.py \
        --model /path/to/model.gguf \
        --llama-server /path/to/llama-server \
        --tmpdir /dev/shm/rys \
        --results results.jsonl

The sweep strategy:
    Pass 1: 8-layer blocks at stride 4 across the middle
    Pass 2: Refine within the hot zone with smaller blocks
"""

import argparse
import json
import os
import signal
import subprocess
import sys
import time
from datetime import datetime
from pathlib import Path

import requests

from gguf_surgery import duplicate_layers
from math_probe import MATH_QUESTIONS, score_math_response
from eq_probe import EQ_SCENARIOS, build_eq_prompt, parse_eq_response, score_eq_response
from reasoning_probe import REASONING_QUESTIONS, score_reasoning_response


# Server config
DEFAULT_PORT = 8099  # default value of the --port CLI option
SERVER_STARTUP_TIMEOUT = 120  # seconds wait_for_server() polls /health by default
REQUEST_TIMEOUT = 60  # seconds per completion request in query_model()


def wait_for_server(port: int, timeout: int = SERVER_STARTUP_TIMEOUT) -> bool:
    """Poll llama-server's ``/health`` endpoint until it reports ready.

    Args:
        port: Local TCP port the server was started on.
        timeout: Maximum number of seconds to keep polling.

    Returns:
        True as soon as ``/health`` answers HTTP 200 with a JSON object whose
        ``status`` field equals ``"ok"``; False if that has not happened once
        ``timeout`` seconds have elapsed.
    """
    url = f"http://127.0.0.1:{port}/health"
    # Monotonic clock so a wall-clock adjustment cannot stretch or cut the wait.
    deadline = time.monotonic() + timeout
    while time.monotonic() < deadline:
        try:
            r = requests.get(url, timeout=2)
            if r.status_code == 200:
                body = r.json()
                if isinstance(body, dict) and body.get("status") == "ok":
                    return True
        except (requests.RequestException, ValueError):
            # Any transport error or a non-JSON body just means "not ready
            # yet"; keep polling instead of aborting the whole sweep.
            pass
        time.sleep(1)
    return False


def start_server(llama_server_path: str, model_path: str, port: int,
                 extra_args: list[str] = None) -> subprocess.Popen:
    """Start llama-server and return the process handle."""
    cmd = [
        llama_server_path,
        "-m", model_path,
        "--port", str(port),
        "-c", "4096",           # small context for probe eval
        "-ngl", "99",           # offload all layers to GPU
        "--flash-attn", "on",
        "--cache-type-k", "q8_0",
        "--cache-type-v", "q8_0",
        "--no-warmup",
        "-np", "1",             # single slot
    ]
    if extra_args:
        cmd.extend(extra_args)

    print(f"  [CMD] {' '.join(cmd)}", flush=True)

    # Let server output go to a log file so we can debug without pipe deadlocks
    log_path = Path(f"/tmp/rys_server_{port}.log")
    log_file = open(log_path, "w")
    proc = subprocess.Popen(cmd, stdout=log_file, stderr=subprocess.STDOUT)
    proc._log_file = log_file  # keep reference so it doesn't get GC'd
    proc._log_path = log_path
    print(f"  [PID] Server started as PID {proc.pid}, log: {log_path}", flush=True)
    return proc


def stop_server(proc: subprocess.Popen):
    """Shut the server process down and release its log file handle.

    A process that is still running is first asked to terminate; if it has
    not exited within 10 seconds it is killed and reaped. A process that has
    already exited is left alone. In either case the ``_log_file`` handle, if
    one is attached to ``proc``, is closed afterwards.
    """
    still_running = proc.poll() is None
    if still_running:
        proc.terminate()
        try:
            proc.wait(timeout=10)
        except subprocess.TimeoutExpired:
            # Polite shutdown timed out: force it and reap the child.
            proc.kill()
            proc.wait()

    # The handle is only present on processes created by start_server.
    if hasattr(proc, '_log_file'):
        proc._log_file.close()


def dump_server_log(proc: subprocess.Popen, tail_lines: int = 30):
    """Print the last ``tail_lines`` lines of the server log to stderr.

    Does nothing when ``proc`` has no ``_log_path`` attribute or the file
    does not exist. Otherwise a header, the tail of the log (each line
    prefixed with ``"  | "``), and a footer are written to stderr.

    Args:
        proc: Process handle carrying the ``_log_path`` attribute.
        tail_lines: How many trailing lines to show; zero or a negative
            value shows no log lines.
    """
    log_path = getattr(proc, '_log_path', None)
    if log_path is None or not log_path.exists():
        return

    # This is a debugging aid: undecodable bytes in the server output must
    # not turn a startup failure into a UnicodeDecodeError.
    lines = log_path.read_text(errors="replace").splitlines()
    # lines[-0:] would be the *whole* list, so handle non-positive counts.
    tail = lines[-tail_lines:] if tail_lines > 0 else []

    print(f"  --- Server log (last {tail_lines} lines) ---", file=sys.stderr)
    for line in tail:
        print(f"  | {line}", file=sys.stderr)
    print("  --- End server log ---", file=sys.stderr)


def query_model(prompt: str, port: int, max_tokens: int = 64) -> str | None:
    """Send a single-turn chat completion request to llama-server.

    Args:
        prompt: Text sent as the sole user message.
        port: Local TCP port the server is listening on.
        max_tokens: Value of the request's ``max_tokens`` field.

    Returns:
        The ``content`` of the first choice's message, or None when the
        request fails, the server answers with a non-200 status, or the
        response body does not have the expected shape. Every failure is
        reported as a warning on stderr.
    """
    url = f"http://127.0.0.1:{port}/v1/chat/completions"

    payload = {
        "model": "test",
        "messages": [
            {"role": "user", "content": prompt}
        ],
        "max_tokens": max_tokens,
        "temperature": 0.0,
    }

    try:
        r = requests.post(url, json=payload, timeout=REQUEST_TIMEOUT)
    except requests.RequestException as e:
        # Covers connection errors and timeouts as well as less common
        # transport failures, so one bad request cannot abort the sweep.
        print(f"  [WARN] Request failed: {e}", file=sys.stderr)
        return None

    if r.status_code != 200:
        print(f"  [WARN] Server returned {r.status_code}", file=sys.stderr)
        return None

    try:
        return r.json()["choices"][0]["message"]["content"]
    except (ValueError, KeyError, IndexError, TypeError) as e:
        # A 200 with an unexpected body is treated like any other failed
        # request: the caller scores it as a miss.
        print(f"  [WARN] Malformed completion response: {e!r}", file=sys.stderr)
        return None


def run_math_probe(port: int) -> float:
    """Ask every entry of MATH_QUESTIONS and return the mean score.

    Each question is sent with a 48-token budget. A question whose request
    yields no response contributes 0.0 to the mean instead of being dropped.
    Returns 0.0 when there are no questions.
    """
    per_question = []
    for prompt, expected in MATH_QUESTIONS:
        reply = query_model(prompt, port, max_tokens=48)
        if reply is None:
            # Failed request: count it as a miss.
            per_question.append(0.0)
            continue
        per_question.append(score_math_response(expected, reply))

    if not per_question:
        return 0.0
    return sum(per_question) / len(per_question)


def run_eq_probe(port: int) -> float:
    """Run every entry of EQ_SCENARIOS and return the mean score.

    For each scenario the prompt is built with ``build_eq_prompt``, sent with
    a 48-token budget, parsed against the number of emotions in the scenario
    and scored against the scenario's reference. A scenario whose request
    yields no response contributes 0.0. Returns 0.0 when there are no
    scenarios.
    """
    per_scenario = []
    for item in EQ_SCENARIOS:
        reply = query_model(build_eq_prompt(item), port, max_tokens=48)
        if reply is None:
            # Failed request: count it as a miss.
            per_scenario.append(0.0)
            continue
        n_emotions = len(item["emotions"])
        guess = parse_eq_response(reply, n_emotions)
        per_scenario.append(score_eq_response(item["reference"], guess))

    if not per_scenario:
        return 0.0
    return sum(per_scenario) / len(per_scenario)


def run_reasoning_probe(port: int) -> dict:
    """Run all reasoning questions, return scores by category and overall.

    Each question's ``prompt`` is sent with a 512-token budget and grouped by
    its ``type``. A question whose request yields no response is scored 0.0,
    matching how the math and EQ probes treat failed requests.

    Returns:
        ``{"categories": {type: mean score}, "overall": mean over all
        questions}``; ``overall`` is 0.0 when there are no questions.
    """
    by_category = {}
    for q in REASONING_QUESTIONS:
        response = query_model(q["prompt"], port, max_tokens=512)
        if response is None:
            # Never hand None to the scorer; a failed request is a miss.
            score = 0.0
        else:
            score = score_reasoning_response(q, response)
        by_category.setdefault(q["type"], []).append(score)

    # Per-category averages (every list holds at least one score)
    cat_scores = {
        cat: sum(scores) / len(scores)
        for cat, scores in by_category.items()
    }

    # Overall reasoning score: mean over every question, not over categories
    all_scores = [s for scores in by_category.values() for s in scores]
    overall = sum(all_scores) / len(all_scores) if all_scores else 0.0

    return {"categories": cat_scores, "overall": overall}


def run_evaluation(port: int) -> dict:
    """Run the math, EQ and reasoning probes against the server on ``port``.

    Returns a dict with ``math_score``, ``eq_score``, ``reasoning_score``
    (the reasoning probe's overall value) and ``reasoning_cats`` (its
    per-category values).
    """
    # Probes run in this fixed order: math, then EQ, then reasoning.
    math_avg = run_math_probe(port)
    eq_avg = run_eq_probe(port)
    reasoning_summary = run_reasoning_probe(port)

    summary = {}
    summary["math_score"] = math_avg
    summary["eq_score"] = eq_avg
    summary["reasoning_score"] = reasoning_summary["overall"]
    summary["reasoning_cats"] = reasoning_summary["categories"]
    return summary


def print_results_table(results: list[dict], baseline: dict | None = None):
    """Print a live-updating results table."""
    print("\n" + "=" * 105)
    print(f"{'Config':>12} {'Layers':>8} {'Math':>8} {'EQ':>8} {'Reason':>8} "
          f"{'Math Δ':>8} {'EQ Δ':>8} {'Reas Δ':>8} {'Combined Δ':>11}")
    print("-" * 105)

    if baseline:
        brs = baseline.get('reasoning_score', 0)
        print(f"{'BASELINE':>12} {'0':>8} "
              f"{baseline['math_score']:>8.4f} {baseline['eq_score']:>8.2f} {brs:>8.2%} "
              f"{'---':>8} {'---':>8} {'---':>8} {'---':>11}")
        print("-" * 105)

    for r in results:
        config = f"({r['dup_start']},{r['dup_end']})"
        n_dup = r['dup_end'] - r['dup_start']
        rs = r.get('reasoning_score', 0)

        if baseline:
            math_delta = r['math_score'] - baseline['math_score']
            eq_delta = r['eq_score'] - baseline['eq_score']
            reas_delta = rs - baseline.get('reasoning_score', 0)
            # Combined: weight EQ and reasoning more than math
            combined = eq_delta + (reas_delta * 100)
            math_d = f"{math_delta:>+8.4f}"
            eq_d = f"{eq_delta:>+8.2f}"
            reas_d = f"{reas_delta:>+8.2%}"
            comb_d = f"{combined:>+11.2f}"
        else:
            math_d = eq_d = reas_d = comb_d = "---"

        print(f"{config:>12} {n_dup:>8} "
              f"{r['math_score']:>8.4f} {r['eq_score']:>8.2f} {rs:>8.2%} "
              f"{math_d} {eq_d} {reas_d} {comb_d}")

    print("=" * 105)
    sys.stdout.flush()


def generate_sweep_configs(n_layers: int, block_sizes: list[int],
                           start_min: int = 4, start_max: int = None,
                           stride: int = 4) -> list[tuple[int, int]]:
    """
    Generate (dup_start, dup_end) configs for the sweep.

    Args:
        n_layers: Total layers in the model
        block_sizes: List of block sizes to try (e.g., [8])
        start_min: Earliest layer to start duplication
        start_max: Latest layer to start (default: n_layers - max(block_sizes) - 4)
        stride: Step between start positions
    """
    if start_max is None:
        start_max = n_layers - max(block_sizes) - 4

    configs = []
    for bs in block_sizes:
        for start in range(start_min, start_max + 1, stride):
            end = start + bs
            if end <= n_layers:
                configs.append((start, end))

    return configs


def main():
    """Command-line entry point: run the layer-duplication sweep.

    Steps:
        1. Parse arguments and load any entries already in the results file,
           so an interrupted sweep can be resumed.
        2. Unless skipped or already recorded, evaluate the unmodified model
           and append it to the results file as the baseline.
        3. Read the layer count from the model's GGUF metadata and build the
           list of (dup_start, dup_end) configs, dropping those already done.
        4. For each config: write a modified GGUF, start a server on it, run
           the probes, append the entry to the results file and reprint the
           table. The server is stopped and the modified GGUF removed after
           every config, including when generation or startup fails.

    Exits with status 1 if the server cannot be started for the baseline.
    """
    parser = argparse.ArgumentParser(description="RYS Layer Duplication Sweep")
    parser.add_argument("--model", required=True, help="Path to input GGUF model")
    parser.add_argument("--llama-server", required=True, help="Path to llama-server binary")
    parser.add_argument("--tmpdir", default="/dev/shm/rys",
                        help="Temp directory for modified GGUFs (use tmpfs/RAM)")
    parser.add_argument("--results", default="rys_results.jsonl",
                        help="Output results file (JSONL)")
    parser.add_argument("--port", type=int, default=DEFAULT_PORT)
    parser.add_argument("--block-sizes", type=int, nargs="+", default=[8],
                        help="Block sizes to sweep (default: 8)")
    parser.add_argument("--stride", type=int, default=4,
                        help="Stride between start positions (default: 4)")
    parser.add_argument("--start-min", type=int, default=4,
                        help="Earliest layer to start duplication")
    parser.add_argument("--start-max", type=int, default=None,
                        help="Latest layer to start duplication")
    parser.add_argument("--skip-baseline", action="store_true",
                        help="Skip baseline run (use if already in results)")
    parser.add_argument("--server-args", nargs=argparse.REMAINDER, default=[],
                        help="Extra args to pass to llama-server (must be last)")
    args = parser.parse_args()

    model_path = Path(args.model).resolve()
    tmpdir = Path(args.tmpdir)
    tmpdir.mkdir(parents=True, exist_ok=True)

    results_path = Path(args.results)
    results = []
    baseline = None

    # Load existing results if resuming
    if results_path.exists():
        with open(results_path) as f:
            for line in f:
                line = line.strip()
                if line:
                    entry = json.loads(line)
                    if entry.get("is_baseline"):
                        baseline = entry
                    else:
                        results.append(entry)
        print(f"Loaded {len(results)} existing results + baseline={baseline is not None}")

    # Run baseline (unmodified model)
    if not args.skip_baseline and baseline is None:
        print("\n>>> Running BASELINE evaluation...")
        proc = start_server(args.llama_server, str(model_path), args.port, args.server_args)
        try:
            if not wait_for_server(args.port):
                print("ERROR: Server failed to start for baseline", file=sys.stderr)
                dump_server_log(proc)
                # SystemExit still runs the finally block below, which stops
                # the server; no separate stop_server call is needed here.
                sys.exit(1)

            print("  Server ready. Running probes...")
            eval_result = run_evaluation(args.port)
            baseline = {
                "is_baseline": True,
                "dup_start": -1,
                "dup_end": -1,
                "math_score": eval_result["math_score"],
                "eq_score": eval_result["eq_score"],
                "reasoning_score": eval_result["reasoning_score"],
                "reasoning_cats": eval_result.get("reasoning_cats", {}),
                "timestamp": datetime.now().isoformat(),
            }

            with open(results_path, "a") as f:
                f.write(json.dumps(baseline) + "\n")

            brs = baseline['reasoning_score']
            print(f"  Baseline: math={baseline['math_score']:.4f} eq={baseline['eq_score']:.2f} reasoning={brs:.2%}")
        finally:
            stop_server(proc)

    # Get model layer count from the GGUF metadata
    from gguf import GGUFReader
    reader = GGUFReader(str(model_path), 'r')
    arch_field = reader.get_field('general.architecture')
    arch = arch_field.contents()
    block_count_field = reader.get_field(f'{arch}.block_count')
    n_layers = block_count_field.contents()
    print(f"\nModel: {model_path.name}")
    print(f"Architecture: {arch}, Layers: {n_layers}")

    # Generate sweep configurations
    configs = generate_sweep_configs(
        n_layers=n_layers,
        block_sizes=args.block_sizes,
        start_min=args.start_min,
        start_max=args.start_max,
        stride=args.stride,
    )

    # Filter out already-completed configs
    done = {(r["dup_start"], r["dup_end"]) for r in results}
    configs = [(s, e) for s, e in configs if (s, e) not in done]

    print(f"Configs to test: {len(configs)}")
    if configs:
        print(f"  Range: ({configs[0][0]},{configs[0][1]}) to ({configs[-1][0]},{configs[-1][1]})")

    print_results_table(results, baseline)

    for idx, (dup_start, dup_end) in enumerate(configs):
        n_dup = dup_end - dup_start
        config_str = f"({dup_start},{dup_end})"
        print(f"\n>>> [{idx+1}/{len(configs)}] Testing config {config_str} "
              f"(+{n_dup} layers)...")

        modified_path = tmpdir / f"rys_{dup_start}_{dup_end}.gguf"
        proc = None

        # One try/finally spans generation and evaluation so that every exit
        # path (including `continue`) stops the server and removes the
        # modified GGUF, even a partially written one.
        try:
            # Generate modified GGUF
            print("  Generating modified GGUF...")
            try:
                duplicate_layers(
                    str(model_path), str(modified_path),
                    dup_start, dup_end, verbose=False
                )
            except Exception as e:
                print(f"  ERROR generating GGUF: {e}", file=sys.stderr)
                continue

            # Start server with modified model
            print("  Starting server...")
            proc = start_server(
                args.llama_server, str(modified_path), args.port, args.server_args
            )

            if not wait_for_server(args.port):
                print(f"  ERROR: Server failed to start for {config_str}", file=sys.stderr)
                dump_server_log(proc)
                print("  Check server log above for details.", file=sys.stderr)
                continue

            print("  Server ready. Running probes...")
            eval_result = run_evaluation(args.port)

            entry = {
                "dup_start": dup_start,
                "dup_end": dup_end,
                "n_dup_layers": n_dup,
                "math_score": eval_result["math_score"],
                "eq_score": eval_result["eq_score"],
                "reasoning_score": eval_result["reasoning_score"],
                "reasoning_cats": eval_result.get("reasoning_cats", {}),
                "timestamp": datetime.now().isoformat(),
            }

            results.append(entry)

            # Append to results file
            with open(results_path, "a") as f:
                f.write(json.dumps(entry) + "\n")

            print_results_table(results, baseline)

        finally:
            if proc is not None:
                stop_server(proc)

            # Clean up modified GGUF to free tmpfs space
            if modified_path.exists():
                modified_path.unlink()
                print(f"  Cleaned up {modified_path.name}")

    print("\n\nSweep complete!")
    print_results_table(results, baseline)

# Run the sweep only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()