Add files via upload

This commit is contained in:
alainnothere 2026-03-18 21:06:28 +00:00 committed by GitHub
parent 9ddf9fafd9
commit 8b68afbd3e
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
9 changed files with 2148 additions and 0 deletions

216
compare_eval.py Normal file
View file

@ -0,0 +1,216 @@
#!/usr/bin/env python3
"""
Compare lm_eval results across multiple runs.
Reads the results JSON files from lm_eval output directories and
prints a side-by-side comparison table.
Usage:
python compare_eval.py ./eval_base ./eval_rys_balanced ./eval_rys_triple
python compare_eval.py ./eval_*
"""
import argparse
import json
import sys
from pathlib import Path
def load_results(folder: str) -> dict:
    """Load lm_eval results JSON from an output folder.

    Searches recursively for ``*results*.json`` and loads the most recently
    modified match. A direct path to a ``.json`` file is also accepted.
    Returns {} (with a warning on stderr) when nothing is found.
    """
    root = Path(folder)
    # lm_eval's results filename varies between versions, so search for it.
    found = list(root.glob("**/*results*.json"))
    if not found:
        # Maybe the caller passed the JSON file itself rather than a folder.
        if root.suffix == '.json' and root.is_file():
            with open(root) as fh:
                return json.load(fh)
        print(f"WARNING: No results JSON found in {root}", file=sys.stderr)
        return {}
    # Several matches: the newest one wins.
    newest = max(found, key=lambda hit: hit.stat().st_mtime)
    with open(newest) as fh:
        return json.load(fh)
def extract_metrics(data: dict) -> dict:
    """Extract per-task metric values from the lm_eval results format.

    lm_eval task results use keys shaped like "metric_name,filter_name"
    (e.g. "acc,none", "exact_match,strict-match").

    Returns a dict mapping "task|metric|filter" labels to records of the
    form {"task", "metric", "filter", "value"}. The "alias" pseudo-metric
    and non-numeric values are skipped.
    """
    # Only these filter suffixes mark real metric entries.
    filter_suffixes = (",none", ",flexible-extract", ",strict-match", ",get-answer")
    metrics = {}
    results = data.get("results", {})
    for task_name, task_data in results.items():
        for key, value in task_data.items():
            if not key.endswith(filter_suffixes):
                continue
            # Parse "metric_name,filter_name" format.
            metric, _, filter_name = key.rpartition(",")
            # Skip the alias pseudo-metric and anything non-numeric.
            if metric == "alias" or not isinstance(value, (int, float)):
                continue
            label = f"{task_name}|{metric}|{filter_name}"
            metrics[label] = {
                "task": task_name,
                "metric": metric,
                "filter": filter_name,
                "value": value,
            }
    return metrics
def get_display_name(label: str, metric_info: dict) -> str:
    """Create a readable row label from a metric record."""
    # Shorten common task-name prefixes.
    task = metric_info["task"].replace("bbh_cot_fewshot_", "bbh/")
    filt = metric_info["filter"]
    # Show the filter when it is informative, otherwise the metric name.
    if filt and filt not in ("none", "get-answer"):
        return f"{task} [{filt}]"
    return f"{task} [{metric_info['metric']}]"
def compare(folders: list[str], names: list[str] | None = None):
    """Print a side-by-side comparison table of lm_eval results.

    Args:
        folders: lm_eval output directories (or result JSON files) to compare.
        names: display name per run; defaults to the folder basenames.

    Prints one row per (task, metric, filter) label, a delta column
    (last run minus first run) when comparing more than one run, and a
    final all-metric average per run.
    """
    if names is None:
        names = [Path(f).name for f in folders]
    # The label column adapts to the longest run name (min width 45).
    max_name_len = max(len(n) for n in names)
    # Load all results
    all_metrics = {}
    for i, folder in enumerate(folders):
        data = load_results(folder)
        metrics = extract_metrics(data)
        all_metrics[names[i]] = metrics
    # Collect all unique metric labels (a run may be missing some)
    all_labels = set()
    for metrics in all_metrics.values():
        all_labels.update(metrics.keys())
    # Sort labels by task name
    sorted_labels = sorted(all_labels)
    # Print header
    col_width = 10
    name_col = max(45, max_name_len)
    header = f"{'Metric':<{name_col}}"
    for name in names:
        header += f" {name:>{col_width}}"
    if len(names) > 1:
        header += f" {'Δ(last-first)':>{col_width+2}}"
    print()
    print("=" * len(header))
    print("lm_eval Results Comparison")
    print("=" * len(header))
    print(header)
    print("-" * len(header))
    prev_task = None
    for label in sorted_labels:
        # Skip stderr and alias entries
        if "stderr" in label.lower() or "alias" in label.lower():
            continue
        # Get display info from the first run that has this metric
        metric_info = None
        for metrics in all_metrics.values():
            if label in metrics:
                metric_info = metrics[label]
                break
        if metric_info is None:
            continue
        display = get_display_name(label, metric_info)
        # Blank separator line between tasks
        current_task = metric_info["task"]
        if prev_task and current_task != prev_task:
            print()
        prev_task = current_task
        row = f"{display:<{name_col}}"
        values = []
        for name in names:
            metrics = all_metrics[name]
            if label in metrics:
                val = metrics[label]["value"]
                values.append(val)
                if isinstance(val, float):
                    row += f" {val:>{col_width}.4f}"
                else:
                    row += f" {val:>{col_width}}"
            else:
                # Run is missing this metric entirely.
                values.append(None)
                row += f" {'---':>{col_width}}"
        # Delta column (last - first), signed so regressions stand out
        if len(names) > 1 and values[0] is not None and values[-1] is not None:
            delta = values[-1] - values[0]
            if delta != 0:
                row += f" {delta:>+{col_width}.4f}"
            else:
                row += f" {'0':>{col_width+1}}"
        elif len(names) > 1:
            row += f" {'---':>{col_width+1}}"
        print(row)
    print("=" * len(header))
    # Summary: average across all metrics per run (stderr entries excluded)
    print(f"\n{'Average (all metrics)':<{name_col}}", end="")
    avgs = []
    for name in names:
        metrics = all_metrics[name]
        vals = [m["value"] for m in metrics.values()
                if isinstance(m["value"], (int, float)) and "stderr" not in m["metric"]]
        if vals:
            avg = sum(vals) / len(vals)
            avgs.append(avg)
            print(f" {avg:>{col_width}.4f}", end="")
        else:
            avgs.append(None)
            print(f" {'---':>{col_width}}", end="")
    if len(avgs) > 1 and avgs[0] is not None and avgs[-1] is not None:
        delta = avgs[-1] - avgs[0]
        print(f" {delta:>+{col_width}.4f}", end="")
    print()
def main():
    """CLI entry point: parse arguments and run the comparison."""
    parser = argparse.ArgumentParser(
        description="Compare lm_eval results across runs",
        epilog="Example: python compare_eval.py ./eval_base ./eval_rys",
    )
    parser.add_argument("folders", nargs="+", help="lm_eval output directories or JSON files")
    parser.add_argument("--names", nargs="+", help="Display names for each run (default: folder names)")
    args = parser.parse_args()
    # An absent --names falls back to folder basenames inside compare().
    compare(args.folders, args.names if args.names else None)


if __name__ == "__main__":
    main()

267
eq_probe.py Normal file
View file

@ -0,0 +1,267 @@
#!/usr/bin/env python3
"""
EQ Probe for RYS Layer Duplication Sweep
Simplified EQ-Bench-style scenarios. Each presents a social situation
and asks the model to rate the intensity of 4 emotions (0-10).
Scored by comparing to reference answers using EQ-Bench v2 scoring.
The reference answers were crafted to represent reasonable human consensus.
What matters for the sweep is relative performance between configs,
not absolute calibration.
"""
import json
import re
import math
# Each scenario: situation text, 4 emotions to rate, reference scores (0-10)
EQ_SCENARIOS = [
{
"id": "eq01",
"situation": (
"David just found out his closest friend has been spreading rumors about him "
"at work for months. When confronted, the friend laughed it off and said "
"\"everyone does it, don't be so sensitive.\" David has been quiet all evening."
),
"emotions": ["Anger", "Betrayal", "Sadness", "Confusion"],
"reference": [7, 9, 6, 4]
},
{
"id": "eq02",
"situation": (
"Maria spent six months preparing a presentation for the board. During the "
"meeting, her manager took credit for all her ideas and presented them as his "
"own. Afterward, he thanked her for 'helping out with the slides.'"
),
"emotions": ["Frustration", "Resentment", "Humiliation", "Determination"],
"reference": [8, 8, 5, 6]
},
{
"id": "eq03",
"situation": (
"After 20 years of marriage, Tom's wife tells him she wants a divorce. "
"She says she still cares about him but hasn't been happy for years. "
"Tom had no idea anything was wrong."
),
"emotions": ["Shock", "Grief", "Self-doubt", "Denial"],
"reference": [9, 7, 6, 7]
},
{
"id": "eq04",
"situation": (
"A young doctor must tell parents that their child's surgery was successful "
"but there will be permanent limitations. The parents are already celebrating "
"in the waiting room."
),
"emotions": ["Relief", "Anxiety", "Empathy", "Professional_detachment"],
"reference": [4, 7, 8, 5]
},
{
"id": "eq05",
"situation": (
"Chen receives a prestigious award at a ceremony. As he walks to the stage, "
"he sees his estranged father in the audience - the man who abandoned the "
"family when Chen was twelve."
),
"emotions": ["Pride", "Anger", "Longing", "Anxiety"],
"reference": [7, 5, 6, 6]
},
{
"id": "eq06",
"situation": (
"A retired teacher learns that a former student, who she failed years ago "
"and who dropped out, has become extremely successful. The student publicly "
"credits 'proving my teacher wrong' as their motivation."
),
"emotions": ["Guilt", "Pride", "Defensiveness", "Amusement"],
"reference": [5, 4, 6, 3]
},
{
"id": "eq07",
"situation": (
"Sophie finds out she's been accepted to her dream university on the same "
"day her best friend receives a rejection from the same school. Her friend "
"calls to congratulate her, voice cracking."
),
"emotions": ["Joy", "Guilt", "Empathy", "Awkwardness"],
"reference": [7, 6, 8, 7]
},
{
"id": "eq08",
"situation": (
"A firefighter rescues a child from a burning building. Weeks later, he "
"wakes up screaming from nightmares about the ones he couldn't save in "
"a previous fire. His partner asks if he's okay."
),
"emotions": ["Satisfaction", "Trauma", "Vulnerability", "Shame"],
"reference": [3, 8, 7, 5]
},
{
"id": "eq09",
"situation": (
"An elderly woman's family surprises her with a birthday party. She smiles "
"and thanks everyone, but keeps glancing at an empty chair - where her "
"late husband always sat."
),
"emotions": ["Gratitude", "Grief", "Loneliness", "Warmth"],
"reference": [7, 7, 6, 6]
},
{
"id": "eq10",
"situation": (
"A manager must lay off a team member who is also a close friend and a "
"single parent. The company requires it due to budget cuts. HR is waiting "
"for the paperwork."
),
"emotions": ["Guilt", "Dread", "Helplessness", "Resentment"],
"reference": [8, 8, 7, 5]
},
{
"id": "eq11",
"situation": (
"James finds old love letters in the attic from his wife, written to "
"someone else before they met. The letters are passionate and describe "
"a depth of feeling he's not sure she's ever expressed toward him."
),
"emotions": ["Jealousy", "Insecurity", "Curiosity", "Sadness"],
"reference": [6, 7, 5, 4]
},
{
"id": "eq12",
"situation": (
"A teenager confesses to her mother that she's been self-harming. The mother, "
"who is a psychologist, realizes she completely missed the signs despite her "
"professional training."
),
"emotions": ["Fear", "Guilt", "Love", "Self_criticism"],
"reference": [8, 8, 9, 7]
},
{
"id": "eq13",
"situation": (
"A war veteran returns home after two years to find his dog waiting on the "
"porch, much older and thinner. The dog recognizes him immediately and "
"limps over, tail wagging."
),
"emotions": ["Joy", "Guilt", "Tenderness", "Sorrow"],
"reference": [8, 5, 9, 4]
},
{
"id": "eq14",
"situation": (
"During a job interview, the candidate realizes the interviewer is someone "
"she bullied badly in high school. The interviewer clearly recognizes her "
"but proceeds professionally."
),
"emotions": ["Shame", "Anxiety", "Admiration", "Regret"],
"reference": [7, 8, 4, 7]
},
{
"id": "eq15",
"situation": (
"A father watches his daughter's wedding, knowing he has a terminal diagnosis "
"he hasn't shared with the family. He chose to wait until after the wedding "
"to tell them."
),
"emotions": ["Joy", "Grief", "Protectiveness", "Isolation"],
"reference": [6, 8, 8, 7]
},
{
"id": "eq16",
"situation": (
"Two siblings meet for the first time as adults after being separated in "
"foster care as children. They look alike but have lived completely different "
"lives. One is wealthy, the other struggles financially."
),
"emotions": ["Wonder", "Resentment", "Hope", "Grief"],
"reference": [7, 3, 7, 6]
},
]
def build_eq_prompt(scenario: dict) -> str:
    """Render one EQ scenario as a self-contained rating prompt."""
    listed = ", ".join(scenario["emotions"])
    # Four blank-line-separated sections: instructions, situation,
    # emotion list, and the strict answer-format requirement.
    sections = [
        "Read the following situation and rate the emotional intensity that "
        "the main character is likely feeling for each of the listed emotions. "
        "Rate each emotion from 0 (not feeling it at all) to 10 (extremely intense).",
        f"Situation: {scenario['situation']}",
        f"Rate these emotions: {listed}",
        "Respond ONLY with the four numbers separated by commas, in the same order. "
        "Example format: 5, 3, 8, 2\n"
        "Do not include any other text.",
    ]
    return "\n\n".join(sections)
def parse_eq_response(response: str, n_emotions: int = 4) -> list[float] | None:
"""Extract emotion ratings from model response."""
# Try to find comma-separated numbers
numbers = re.findall(r'(\d+(?:\.\d+)?)', response)
if len(numbers) < n_emotions:
return None
try:
# Take the first n_emotions numbers found
ratings = [float(numbers[i]) for i in range(n_emotions)]
# Clamp to 0-10
ratings = [max(0.0, min(10.0, r)) for r in ratings]
return ratings
except (ValueError, IndexError):
return None
def score_eq_response(reference: list[int], predicted: list[float]) -> float:
    """EQ-Bench v2 style scoring for one scenario.

    Small gaps (<= 4) from the reference are softened quadratically
    (gap^2 / 4, which meets the linear branch continuously at gap=4);
    larger gaps count 1:1. Returns 100 for a perfect match and 0 for a
    maximally wrong or unparseable answer.
    """
    if predicted is None or len(predicted) != len(reference):
        return 0.0
    worst = 10.0 * len(reference)  # every emotion off by the full 10 points
    penalty = 0.0
    for want, got in zip(reference, predicted):
        gap = abs(want - got)
        penalty += (gap ** 2) / 4.0 if gap <= 4 else gap
    return max(0.0, 100.0 * (1.0 - penalty / worst))
# Convenience: (scenario, rendered prompt) pairs, precomputed at import time.
EQ_PROMPTS = [(s, build_eq_prompt(s)) for s in EQ_SCENARIOS]
if __name__ == "__main__":
    # Self-check: dump every scenario, then sanity-test the scorer on
    # hand-picked predictions of increasing error.
    print(f"EQ Probe: {len(EQ_SCENARIOS)} scenarios")
    print("=" * 60)
    for scenario in EQ_SCENARIOS:
        print(f"\n[{scenario['id']}] Emotions: {scenario['emotions']}")
        print(f" Reference: {scenario['reference']}")
        prompt = build_eq_prompt(scenario)
        print(f" Prompt length: {len(prompt)} chars")
    print("\n\nScoring tests:")
    reference = [7, 9, 6, 4]
    cases = [
        ("Perfect match", [7, 9, 6, 4]),
        ("All off by 1", [8, 8, 7, 5]),
        ("All off by 3", [4, 6, 3, 1]),
        ("All off by 5", [2, 4, 1, 0]),
        ("Worst case", [0, 0, 0, 10]),
        ("Unparseable", None),
    ]
    for label, guess in cases:
        print(f" {label}: {score_eq_response(reference, guess):.1f}")

221
gguf_surgery.py Normal file
View file

@ -0,0 +1,221 @@
#!/usr/bin/env python3
"""
GGUF Layer Duplication Surgery
Reads a GGUF model file, duplicates transformer layers i..j-1 so they
execute twice in the forward pass, and writes a new GGUF with the
modified layer structure.
For a model with N layers, configuration (i, j) produces:
layers 0..j-1, then layers i..j-1 again, then layers j..N-1
Total layers: N + (j - i)
Tensor naming convention: blk.{layer_idx}.{tensor_name}
Non-block tensors (token_embd, output_norm, output) are copied as-is.
"""
import argparse
import re
import sys
from pathlib import Path
import numpy as np
from tqdm import tqdm
import gguf
from gguf import GGUFReader, GGUFWriter, GGUFValueType
BLK_PATTERN = re.compile(r'^blk\.(\d+)\.(.+)$')
def get_field_value(reader: GGUFReader, key: str):
    """Return the decoded contents of metadata field ``key``, or None if absent."""
    field = reader.get_field(key)
    return None if field is None else field.contents()
def duplicate_layers(input_path: str, output_path: str, dup_start: int, dup_end: int, verbose: bool = False):
    """
    Create a new GGUF with layers dup_start..dup_end-1 duplicated.

    The new layer order is:
      original layers 0..dup_end-1,
      duplicates of dup_start..dup_end-1 (renumbered),
      original layers dup_end..N-1 (renumbered).

    Args:
        input_path: source GGUF file.
        output_path: destination GGUF file.
        dup_start: first layer of the duplicated block (inclusive).
        dup_end: one past the last duplicated layer (exclusive).
        verbose: print the layer mapping and size details.

    Raises:
        ValueError: if architecture/block-count metadata is missing or the
            (dup_start, dup_end) range is invalid for this model.
    """
    reader = GGUFReader(input_path, 'r')
    arch = get_field_value(reader, gguf.Keys.General.ARCHITECTURE)
    if arch is None:
        raise ValueError("Could not read architecture from GGUF")
    # Per-architecture metadata key, e.g. "llama.block_count".
    block_count_key = f'{arch}.block_count'
    orig_block_count = get_field_value(reader, block_count_key)
    if orig_block_count is None:
        raise ValueError(f"Could not read {block_count_key} from GGUF")
    n_dup = dup_end - dup_start
    new_block_count = orig_block_count + n_dup
    if verbose:
        print(f"Architecture: {arch}")
        print(f"Original layers: {orig_block_count}")
        print(f"Duplicating layers {dup_start}..{dup_end - 1} ({n_dup} layers)")
        print(f"New layer count: {new_block_count}")
    if dup_start < 0 or dup_end > orig_block_count or dup_start >= dup_end:
        raise ValueError(
            f"Invalid duplication range ({dup_start}, {dup_end}) "
            f"for model with {orig_block_count} layers"
        )
    # Build layer mapping: new_idx -> original_layer_idx
    # Phase 1: original 0..dup_end-1 keep their indices
    # Phase 2: duplicates of dup_start..dup_end-1 get indices dup_end..dup_end+n_dup-1
    # Phase 3: original dup_end..N-1 shift up by n_dup
    layer_map = {}
    for orig_idx in range(dup_end):
        layer_map[orig_idx] = orig_idx
    for k in range(n_dup):
        layer_map[dup_end + k] = dup_start + k
    for orig_idx in range(dup_end, orig_block_count):
        layer_map[orig_idx + n_dup] = orig_idx
    if verbose:
        print("Layer mapping (new -> orig):")
        for new_idx in sorted(layer_map.keys()):
            tag = " [DUP]" if (dup_end <= new_idx < dup_end + n_dup) else ""
            print(f" new {new_idx:3d} -> orig {layer_map[new_idx]:3d}{tag}")
    # Create writer, carrying over endianness and (if present) alignment.
    writer = GGUFWriter(output_path, arch=arch, endianess=reader.endianess)
    alignment = get_field_value(reader, gguf.Keys.General.ALIGNMENT)
    if alignment is not None:
        writer.data_alignment = alignment
    # Copy metadata, overriding block_count. The architecture key is skipped
    # (GGUFWriter wrote it already), as are GGUF.* housekeeping fields.
    for field in reader.fields.values():
        if field.name == gguf.Keys.General.ARCHITECTURE or field.name.startswith('GGUF.'):
            continue
        val_type = field.types[0]
        # For array fields the element type is the last entry in field.types.
        sub_type = field.types[-1] if val_type == GGUFValueType.ARRAY else None
        if field.name == block_count_key:
            writer.add_key_value(field.name, new_block_count, val_type)
            if verbose:
                print(f"Modified {field.name}: {orig_block_count} -> {new_block_count}")
        else:
            val = field.contents()
            if val is not None:
                writer.add_key_value(field.name, val, val_type, sub_type=sub_type)
    # Organize tensors: per-layer "blk.N.*" tensors vs everything else.
    non_block_tensors = []
    block_tensors = {}  # orig_layer_idx -> [(suffix, tensor), ...]
    for tensor in reader.tensors:
        match = BLK_PATTERN.match(tensor.name)
        if match:
            layer_idx = int(match.group(1))
            suffix = match.group(2)
            if layer_idx not in block_tensors:
                block_tensors[layer_idx] = []
            block_tensors[layer_idx].append((suffix, tensor))
        else:
            non_block_tensors.append(tensor)
    # Split non-block tensors into pre-block and post-block.
    # NOTE(review): classification is by substring — anything with "output"
    # in its name (output_norm, output) is written after the blocks.
    pre_block = []
    post_block = []
    for t in non_block_tensors:
        if 'output' in t.name:
            post_block.append(t)
        else:
            pre_block.append(t)
    # Register tensor infos in final file order and build the write queue.
    total_bytes = 0
    block_write_order = []  # (new_name, original_tensor)
    for tensor in pre_block:
        writer.add_tensor_info(
            tensor.name, tensor.data.shape, tensor.data.dtype,
            tensor.data.nbytes, tensor.tensor_type
        )
        total_bytes += tensor.n_bytes
    for new_idx in range(new_block_count):
        orig_idx = layer_map[new_idx]
        if orig_idx not in block_tensors:
            print(f"WARNING: No tensors for original layer {orig_idx}", file=sys.stderr)
            continue
        for suffix, tensor in block_tensors[orig_idx]:
            # Duplicated layers are re-registered under their new block index.
            new_name = f"blk.{new_idx}.{suffix}"
            writer.add_tensor_info(
                new_name, tensor.data.shape, tensor.data.dtype,
                tensor.data.nbytes, tensor.tensor_type
            )
            total_bytes += tensor.n_bytes
            block_write_order.append((new_name, tensor))
    for tensor in post_block:
        writer.add_tensor_info(
            tensor.name, tensor.data.shape, tensor.data.dtype,
            tensor.data.nbytes, tensor.tensor_type
        )
        total_bytes += tensor.n_bytes
    # Write the file: header, KV metadata, tensor infos, then raw tensor
    # data in the same order the infos were registered above.
    writer.write_header_to_file()
    writer.write_kv_data_to_file()
    writer.write_ti_data_to_file()
    bar = tqdm(desc="Writing GGUF", total=total_bytes, unit="B", unit_scale=True)
    for tensor in pre_block:
        writer.write_tensor_data(tensor.data)
        bar.update(tensor.n_bytes)
    for _, tensor in block_write_order:
        writer.write_tensor_data(tensor.data)
        bar.update(tensor.n_bytes)
    for tensor in post_block:
        writer.write_tensor_data(tensor.data)
        bar.update(tensor.n_bytes)
    bar.close()
    writer.close()
    if verbose:
        print(f"Done. Written to {output_path}")
        out_size = Path(output_path).stat().st_size / (1024**3)
        print(f"Output size: {out_size:.2f} GiB")
def main():
    """CLI wrapper around duplicate_layers()."""
    cli = argparse.ArgumentParser(
        description="Duplicate layers in a GGUF model (RYS method)"
    )
    cli.add_argument("input", help="Input GGUF file path")
    cli.add_argument("output", help="Output GGUF file path")
    cli.add_argument("-i", "--dup-start", type=int, required=True,
                     help="First layer to duplicate (inclusive)")
    cli.add_argument("-j", "--dup-end", type=int, required=True,
                     help="Last layer to duplicate (exclusive)")
    cli.add_argument("-v", "--verbose", action="store_true")
    opts = cli.parse_args()
    duplicate_layers(opts.input, opts.output, opts.dup_start, opts.dup_end, opts.verbose)


if __name__ == "__main__":
    main()

248
layer_path.py Normal file
View file

@ -0,0 +1,248 @@
#!/usr/bin/env python3
"""
Explicit Layer Path GGUF Surgery
You provide the exact sequence of layers the model should execute.
No ambiguous range notation — just list the layers.
Examples:
# Normal 40-layer model (identity, for testing)
python layer_path.py model.gguf out.gguf -p 0,1,2,...,39
# Duplicate layers 13-16 once (same as RYS with i=13,j=17)
python layer_path.py model.gguf out.gguf -p 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,13,14,15,16,17,18,...,39
# Repeat layer 13 four times
python layer_path.py model.gguf out.gguf -p 0,1,...,12,13,13,13,13,14,15,...,39
# Triple-pass layers 13-16
python layer_path.py model.gguf out.gguf -p 0,1,...,16,13,14,15,16,13,14,15,16,17,...,39
# Shorthand: use .. to fill in sequential ranges
python layer_path.py model.gguf out.gguf -p 0..16,13,14,15,16,13,14,15,16,17..39
Usage:
python layer_path.py input.gguf output.gguf -p "0..16,13,14,15,16,17..39" -v
"""
import argparse
import re
import sys
from pathlib import Path
import numpy as np
from tqdm import tqdm
import gguf
from gguf import GGUFReader, GGUFWriter, GGUFValueType
BLK_PATTERN = re.compile(r'^blk\.(\d+)\.(.+)$')
def get_field_value(reader, key):
    # Decode one metadata field, tolerating its absence.
    field = reader.get_field(key)
    return field.contents() if field is not None else None
def parse_layer_path(path_str: str) -> list[int]:
    """
    Parse a layer path string into a list of layer indices.

    Supports:
      - Individual numbers: 0,1,2,13,13,14
      - Ranges with ..: 0..16 expands to 0,1,2,...,16 (inclusive)
      - Mixed: 0..12,13,13,13,14..39

    Whitespace is ignored; empty segments (stray commas) are skipped.

    Raises:
        ValueError: on malformed segments or descending ranges.
    """
    result: list[int] = []
    for token in path_str.replace(' ', '').split(','):
        token = token.strip()
        if not token:
            continue
        if '..' not in token:
            result.append(int(token))
            continue
        # Inclusive range: start..end
        bounds = token.split('..')
        if len(bounds) != 2:
            raise ValueError(f"Invalid range: '{token}'. Use 'start..end'")
        lo, hi = int(bounds[0]), int(bounds[1])
        if lo > hi:
            raise ValueError(f"Invalid range: {lo}..{hi} (start > end)")
        result.extend(range(lo, hi + 1))
    return result
def build_gguf_from_path(input_path: str, output_path: str,
                         layer_path: list[int], verbose: bool = False):
    """
    Create a new GGUF where the forward pass follows the given layer path.

    Args:
        input_path: source GGUF file.
        output_path: destination GGUF file.
        layer_path: exact sequence of original layer indices to execute;
            its length becomes the new block count.
        verbose: print the path, repeat counts, and output size.

    Raises:
        ValueError: if a path index is out of range or a referenced layer
            has no tensors in the source file.
    """
    reader = GGUFReader(input_path, 'r')
    arch = get_field_value(reader, gguf.Keys.General.ARCHITECTURE)
    block_count_key = f'{arch}.block_count'
    orig_block_count = get_field_value(reader, block_count_key)
    # NOTE(review): unlike gguf_surgery.py, arch / orig_block_count are not
    # checked for None here — a malformed file fails below with a TypeError
    # rather than a clear error. Confirm whether that is acceptable.
    # Validate all layer indices
    for idx in layer_path:
        if idx < 0 or idx >= orig_block_count:
            raise ValueError(
                f"Layer {idx} out of range (model has {orig_block_count} layers, 0..{orig_block_count-1})"
            )
    new_block_count = len(layer_path)
    if verbose:
        print(f"Architecture: {arch}")
        print(f"Original layers: {orig_block_count}")
        print(f"New layer count: {new_block_count}")
        print(f"Layer path: {layer_path}")
        # Show which layers are repeated
        from collections import Counter
        counts = Counter(layer_path)
        repeated = {k: v for k, v in counts.items() if v > 1}
        if repeated:
            print(f"Repeated layers: {dict(sorted(repeated.items()))}")
        else:
            print("No repeated layers (just a reorder)")
    # layer_map: new_position -> original_layer_index
    layer_map = {new_idx: orig_idx for new_idx, orig_idx in enumerate(layer_path)}
    # Create writer, carrying over endianness and (if present) alignment.
    writer = GGUFWriter(output_path, arch=arch, endianess=reader.endianess)
    alignment = get_field_value(reader, gguf.Keys.General.ALIGNMENT)
    if alignment is not None:
        writer.data_alignment = alignment
    # Copy metadata, override block count. Skip the architecture key
    # (already written by GGUFWriter) and GGUF.* housekeeping fields.
    for field in reader.fields.values():
        if field.name == gguf.Keys.General.ARCHITECTURE or field.name.startswith('GGUF.'):
            continue
        val_type = field.types[0]
        # For array fields the element type is the last entry in field.types.
        sub_type = field.types[-1] if val_type == GGUFValueType.ARRAY else None
        if field.name == block_count_key:
            writer.add_key_value(field.name, new_block_count, val_type)
        else:
            val = field.contents()
            if val is not None:
                writer.add_key_value(field.name, val, val_type, sub_type=sub_type)
    # Organize tensors: per-layer "blk.N.*" tensors vs everything else.
    non_block_tensors = []
    block_tensors = {}
    for tensor in reader.tensors:
        match = BLK_PATTERN.match(tensor.name)
        if match:
            layer_idx = int(match.group(1))
            suffix = match.group(2)
            if layer_idx not in block_tensors:
                block_tensors[layer_idx] = []
            block_tensors[layer_idx].append((suffix, tensor))
        else:
            non_block_tensors.append(tensor)
    # By-substring split: tensors with "output" in the name (output_norm,
    # output) go after the blocks; the rest (token_embd, ...) before.
    pre_block = [t for t in non_block_tensors if 'output' not in t.name]
    post_block = [t for t in non_block_tensors if 'output' in t.name]
    # Register tensor infos in final file order and build the write queue.
    total_bytes = 0
    block_write_order = []
    for tensor in pre_block:
        writer.add_tensor_info(tensor.name, tensor.data.shape, tensor.data.dtype,
                               tensor.data.nbytes, tensor.tensor_type)
        total_bytes += tensor.n_bytes
    for new_idx in range(new_block_count):
        orig_idx = layer_map[new_idx]
        if orig_idx not in block_tensors:
            raise ValueError(f"No tensors found for original layer {orig_idx}")
        for suffix, tensor in block_tensors[orig_idx]:
            # Each path position gets the source layer's tensors under a new index.
            new_name = f"blk.{new_idx}.{suffix}"
            writer.add_tensor_info(new_name, tensor.data.shape, tensor.data.dtype,
                                   tensor.data.nbytes, tensor.tensor_type)
            total_bytes += tensor.n_bytes
            block_write_order.append(tensor)
    for tensor in post_block:
        writer.add_tensor_info(tensor.name, tensor.data.shape, tensor.data.dtype,
                               tensor.data.nbytes, tensor.tensor_type)
        total_bytes += tensor.n_bytes
    # Write the file: header, KV metadata, tensor infos, then tensor data
    # in the same order the infos were registered above.
    writer.write_header_to_file()
    writer.write_kv_data_to_file()
    writer.write_ti_data_to_file()
    bar = tqdm(desc="Writing GGUF", total=total_bytes, unit="B", unit_scale=True)
    for tensor in pre_block:
        writer.write_tensor_data(tensor.data)
        bar.update(tensor.n_bytes)
    for tensor in block_write_order:
        writer.write_tensor_data(tensor.data)
        bar.update(tensor.n_bytes)
    for tensor in post_block:
        writer.write_tensor_data(tensor.data)
        bar.update(tensor.n_bytes)
    bar.close()
    writer.close()
    if verbose:
        out_size = Path(output_path).stat().st_size / (1024**3)
        print(f"Done. Output: {out_size:.2f} GiB")
def main():
    """CLI entry point: parse the layer path and build the new GGUF."""
    cli = argparse.ArgumentParser(
        description="Build GGUF with explicit layer execution path",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
# Duplicate layers 13-16 once (RYS style)
%(prog)s model.gguf out.gguf -p "0..16,13,14,15,16,17..39"
# Triple-pass layers 13-16
%(prog)s model.gguf out.gguf -p "0..16,13,14,15,16,13,14,15,16,17..39"
# Repeat just layer 13 four times
%(prog)s model.gguf out.gguf -p "0..12,13,13,13,13,14..39"
# Skip layer 5 entirely
%(prog)s model.gguf out.gguf -p "0..4,6..39"
""",
    )
    cli.add_argument("input", help="Input GGUF file")
    cli.add_argument("output", help="Output GGUF file")
    cli.add_argument("-p", "--path", required=True,
                     help="Layer execution path (e.g. '0..16,13,14,15,16,17..39')")
    cli.add_argument("-v", "--verbose", action="store_true")
    opts = cli.parse_args()
    resolved = parse_layer_path(opts.path)
    print(f"Model: {opts.input}")
    print(f"Output: {opts.output}")
    print(f"Layer path ({len(resolved)} layers): {resolved}")
    build_gguf_from_path(opts.input, opts.output, resolved, opts.verbose)


if __name__ == "__main__":
    main()

155
math_probe.py Normal file
View file

@ -0,0 +1,155 @@
#!/usr/bin/env python3
"""
Math Probe for RYS Layer Duplication Sweep
Hard arithmetic questions where the model must guess the answer
without chain-of-thought. Scored with Ng's partial-credit function.
"""
import json
import math
import random
def calculate_score(actual, estimate):
    """
    Ng's partial-credit scoring function for numeric guesses.

    The shorter number is right-padded with zeros to the longer one's
    length, the relative difference is penalized, and a correction factor
    discounts answers with the wrong number of digits.

    Args:
        actual: the true answer (anything int() accepts).
        estimate: the model's guessed value.

    Returns:
        A score in [0, 1]: 1.0 for an exact match, 0.0 for unparseable
        or maximally wrong guesses. Negative inputs are not meaningfully
        handled (the probe only generates positive answers).
    """
    try:
        actual_str = str(int(actual))
        estimate_str = str(int(estimate))
    except (ValueError, TypeError, OverflowError):
        # TypeError added: e.g. estimate=None previously crashed here.
        return 0.0
    max_length = max(len(actual_str), len(estimate_str))
    actual_padded = actual_str.ljust(max_length, "0")
    estimate_padded = estimate_str.ljust(max_length, "0")
    padding_size = max_length - min(len(actual_str), len(estimate_str))
    actual_int = int(actual_padded)
    estimate_int = int(estimate_padded)
    if max(actual_int, estimate_int) == 0:
        # Both values are zero: that is an exact match, not a failure.
        # (Previously this returned 0.0, scoring a correct guess of 0
        # as a total miss; it also guards the division below.)
        return 1.0
    relative_diff = abs(actual_int - estimate_int) / max(actual_int, estimate_int)
    correction_factor = 1 - (padding_size / max_length)
    score = (1 - relative_diff) * correction_factor
    return max(0.0, min(score, 1.0))
def generate_math_questions(seed=42):
"""
Generate hard arithmetic questions with known answers.
Mix of operations to test different numeric intuition.
Returns list of (question_text, correct_answer) tuples.
"""
rng = random.Random(seed)
questions = []
# Cube roots of large numbers (compute perfect cubes, ask for root)
for _ in range(4):
root = rng.randint(20000, 50000)
cube = root ** 3
questions.append((
f"What is the cube root of {cube}? "
f"Answer with just the number, no explanation.",
root
))
# Large multiplications
for _ in range(4):
a = rng.randint(100000, 999999)
b = rng.randint(100000, 999999)
product = a * b
questions.append((
f"What is {a} multiplied by {b}? "
f"Answer with just the number, no explanation.",
product
))
# Square roots of large numbers (perfect squares)
for _ in range(4):
root = rng.randint(50000, 200000)
square = root ** 2
questions.append((
f"What is the square root of {square}? "
f"Answer with just the number, no explanation.",
root
))
# Mixed: cube root multiplied by a number
for _ in range(4):
root = rng.randint(100, 999)
cube = root ** 3
multiplier = rng.randint(10, 99)
answer = root * multiplier
questions.append((
f"What is the cube root of {cube}, multiplied by {multiplier}? "
f"Answer with just the number, no explanation.",
answer
))
return questions
def parse_number_from_response(response: str) -> int | None:
"""
Extract the first integer from a model response.
Handles common LLM quirks: commas in numbers, trailing text, etc.
"""
import re
# Clean up common formatting
text = response.strip()
# Try to find a number (possibly with commas)
# Match negative or positive integers, possibly with commas
patterns = [
r'[-+]?[\d,]+', # numbers with optional commas
]
for pattern in patterns:
matches = re.findall(pattern, text)
if matches:
# Take the first/longest match
num_str = max(matches, key=len)
num_str = num_str.replace(',', '')
try:
return int(num_str)
except ValueError:
continue
return None
def score_math_response(question_answer: int, response: str) -> float:
    """Parse the model's reply and score it against the known answer."""
    guess = parse_number_from_response(response)
    # An unparseable reply scores zero outright.
    return 0.0 if guess is None else calculate_score(question_answer, guess)
# Pre-generated questions (fixed seed) so every sweep config sees the
# identical question set.
MATH_QUESTIONS = generate_math_questions(seed=42)
if __name__ == "__main__":
    # Self-check: dump the generated questions, then sanity-check the
    # scorer on hand-picked (truth, guess) pairs.
    print("Math Probe Questions:")
    print("=" * 60)
    for idx, (question, answer) in enumerate(MATH_QUESTIONS):
        print(f"\n[{idx+1}] {question}")
        print(f" Answer: {answer}")
    print("\n\nScoring tests:")
    checks = [
        ("Exact match", 4302459, 4302459),
        ("Missing digit", 4302459, 430245),
        ("One digit off", 123456789, 123356789),
        ("Way off", 4302459, 9999999),
        ("Zero vs nonzero", 4302459, 0),
    ]
    for label, truth, guess in checks:
        print(f" {label}: {calculate_score(truth, guess):.4f}")

198
multi_repeat.py Normal file
View file

@ -0,0 +1,198 @@
#!/usr/bin/env python3
"""
Multi-repeat GGUF surgery.
Duplicates a block of layers N times total (default 3 = original + 2 copies).
For (i=13, j=17, repeats=3) on a 40-layer model:
0..16, 13..16, 13..16, 17..39 = 48 layers total
The block 13-16 executes 3 times.
Usage:
python multi_repeat.py \
/path/to/model.gguf \
/dev/shm/rys/triple_13_17.gguf \
-i 13 -j 17 -n 3 -v
"""
import argparse
import re
import sys
from pathlib import Path
import numpy as np
from tqdm import tqdm
import gguf
from gguf import GGUFReader, GGUFWriter, GGUFValueType
# Matches per-layer tensor names like "blk.13.attn_q.weight",
# capturing (layer index, remaining suffix).
BLK_PATTERN = re.compile(r'^blk\.(\d+)\.(.+)$')
def get_field_value(reader, key):
    """Return the decoded value of a GGUF metadata field.

    Returns None when the key is absent from the file's metadata.
    """
    field = reader.get_field(key)
    return None if field is None else field.contents()
def multi_repeat_layers(input_path, output_path, dup_start, dup_end, n_repeats, verbose=False):
    """Rewrite a GGUF so the layer block [dup_start, dup_end) executes
    n_repeats times in sequence.

    The same source tensors are written once per scheduled position under
    new "blk.<idx>." names, and the architecture's block_count metadata is
    raised to match, so llama.cpp runs the block repeatedly without any
    weight modification.

    Args:
        input_path: Source GGUF file path.
        output_path: Destination GGUF file path (overwritten).
        dup_start: First layer of the repeated block (inclusive).
        dup_end: End of the repeated block (exclusive).
        n_repeats: Total times the block executes (1 = plain copy).
        verbose: Print layout and size details.
    """
    reader = GGUFReader(input_path, 'r')
    arch = get_field_value(reader, gguf.Keys.General.ARCHITECTURE)
    block_count_key = f'{arch}.block_count'
    orig_block_count = get_field_value(reader, block_count_key)
    n_block = dup_end - dup_start
    extra_copies = n_repeats - 1 # original pass counts as 1
    new_block_count = orig_block_count + (n_block * extra_copies)
    if verbose:
        print(f"Architecture: {arch}")
        print(f"Original layers: {orig_block_count}")
        print(f"Block: layers {dup_start}..{dup_end - 1} ({n_block} layers)")
        print(f"Repeats: {n_repeats}x (original + {extra_copies} copies)")
        print(f"New layer count: {new_block_count}")
    # Build layer map: new layer index -> original layer index to copy from.
    # Phase 1: original 0..dup_end-1
    # Phase 2..N: copies of dup_start..dup_end-1
    # Phase last: original dup_end..orig-1 (shifted)
    layer_map = {}
    # Phase 1: original layers up to dup_end
    for idx in range(dup_end):
        layer_map[idx] = idx
    # Phase 2+: extra copies
    offset = dup_end
    for copy in range(extra_copies):
        for k in range(n_block):
            layer_map[offset + k] = dup_start + k
        offset += n_block
    # Phase last: remaining original layers shifted
    for orig_idx in range(dup_end, orig_block_count):
        layer_map[orig_idx + (n_block * extra_copies)] = orig_idx
    assert len(layer_map) == new_block_count
    if verbose:
        # Print the execution path compressed into contiguous runs,
        # e.g. [0..16] [13..16] [13..16] [17..39].
        path = [layer_map[i] for i in range(new_block_count)]
        print(f"Execution path ({len(path)} layers):")
        i = 0
        while i < len(path):
            run_start = path[i]
            run_end = run_start
            j = i + 1
            while j < len(path) and path[j] == run_end + 1:
                run_end = path[j]
                j += 1
            if run_start == run_end:
                print(f" [{run_start}]")
            else:
                print(f" [{run_start}..{run_end}]")
            i = j
    # Create writer
    writer = GGUFWriter(output_path, arch=arch, endianess=reader.endianess)
    alignment = get_field_value(reader, gguf.Keys.General.ALIGNMENT)
    if alignment is not None:
        writer.data_alignment = alignment
    # Copy metadata. Architecture is skipped because GGUFWriter already
    # wrote it via the arch= constructor argument; 'GGUF.' keys are the
    # reader's internal bookkeeping fields, not real metadata.
    for field in reader.fields.values():
        if field.name == gguf.Keys.General.ARCHITECTURE or field.name.startswith('GGUF.'):
            continue
        val_type = field.types[0]
        sub_type = field.types[-1] if val_type == GGUFValueType.ARRAY else None
        if field.name == block_count_key:
            # The one metadata value we actually change.
            writer.add_key_value(field.name, new_block_count, val_type)
        else:
            val = field.contents()
            if val is not None:
                writer.add_key_value(field.name, val, val_type, sub_type=sub_type)
    # Organize tensors: per-layer ("blk.N.*") vs everything else
    # (embeddings, output head, norms).
    non_block_tensors = []
    block_tensors = {}
    for tensor in reader.tensors:
        match = BLK_PATTERN.match(tensor.name)
        if match:
            layer_idx = int(match.group(1))
            suffix = match.group(2)
            if layer_idx not in block_tensors:
                block_tensors[layer_idx] = []
            block_tensors[layer_idx].append((suffix, tensor))
        else:
            non_block_tensors.append(tensor)
    # Keep rough file order: embedding-ish tensors first, output-ish last.
    pre_block = [t for t in non_block_tensors if 'output' not in t.name]
    post_block = [t for t in non_block_tensors if 'output' in t.name]
    total_bytes = 0
    block_write_order = []
    # NOTE(review): tensor.data.shape is the shape as stored by GGUFReader;
    # passing it straight to add_tensor_info assumes the writer expects the
    # same dimension order — confirm against the installed gguf-py version.
    for tensor in pre_block:
        writer.add_tensor_info(tensor.name, tensor.data.shape, tensor.data.dtype,
                               tensor.data.nbytes, tensor.tensor_type)
        total_bytes += tensor.n_bytes
    for new_idx in range(new_block_count):
        orig_idx = layer_map[new_idx]
        for suffix, tensor in block_tensors[orig_idx]:
            new_name = f"blk.{new_idx}.{suffix}"
            writer.add_tensor_info(new_name, tensor.data.shape, tensor.data.dtype,
                                   tensor.data.nbytes, tensor.tensor_type)
            total_bytes += tensor.n_bytes
            block_write_order.append(tensor)
    for tensor in post_block:
        writer.add_tensor_info(tensor.name, tensor.data.shape, tensor.data.dtype,
                               tensor.data.nbytes, tensor.tensor_type)
        total_bytes += tensor.n_bytes
    # Write header/metadata/tensor-info first, then tensor data in the
    # exact order the infos were registered above.
    writer.write_header_to_file()
    writer.write_kv_data_to_file()
    writer.write_ti_data_to_file()
    bar = tqdm(desc="Writing GGUF", total=total_bytes, unit="B", unit_scale=True)
    for tensor in pre_block:
        writer.write_tensor_data(tensor.data)
        bar.update(tensor.n_bytes)
    for tensor in block_write_order:
        # Repeated layers write the same source tensor data multiple times.
        writer.write_tensor_data(tensor.data)
        bar.update(tensor.n_bytes)
    for tensor in post_block:
        writer.write_tensor_data(tensor.data)
        bar.update(tensor.n_bytes)
    bar.close()
    writer.close()
    if verbose:
        size = Path(output_path).stat().st_size / (1024**3)
        print(f"Written: {output_path} ({size:.2f} GiB)")
def main():
    """CLI entry point: parse arguments and run the multi-repeat surgery."""
    ap = argparse.ArgumentParser(description="Multi-repeat layer duplication")
    ap.add_argument("input", help="Input GGUF")
    ap.add_argument("output", help="Output GGUF")
    ap.add_argument("-i", "--dup-start", type=int, required=True)
    ap.add_argument("-j", "--dup-end", type=int, required=True)
    ap.add_argument("-n", "--repeats", type=int, default=3,
                    help="Total times the block executes (default: 3)")
    ap.add_argument("-v", "--verbose", action="store_true")
    opts = ap.parse_args()
    multi_repeat_layers(opts.input, opts.output,
                        dup_start=opts.dup_start, dup_end=opts.dup_end,
                        n_repeats=opts.repeats, verbose=opts.verbose)
if __name__ == "__main__":
    main()

282
reasoning_probe.py Normal file
View file

@ -0,0 +1,282 @@
#!/usr/bin/env python3
"""
Reasoning Probe for RYS Layer Duplication Sweep
BBH-style questions: causal judgement, date understanding, logical
deduction, navigation, plus basic math word problems.
All answers are short (Yes/No, a letter, or a number).
"""
import re
# ─── Causal Judgement (Yes/No) ─────────────────────────────────────
# Each entry is {"prompt": str, "answer": str}. Answers are stored in
# lowercase; scoring normalizes case before comparison.
CAUSAL_QUESTIONS = [
    {
        "prompt": (
            "How would a typical person answer each of the following questions about causation?\n"
            "Frank T., had an ongoing dispute with his neighbor over a stretch of land and one day "
            "decided to shoot his neighbor in the body. Frank T. had no experience with guns, his "
            "hand slipped on the barrel of the gun, and the shot went wild. Nonetheless, the bullet "
            "bounced off a large boulder several feet away and hit the neighbor's body, causing "
            "significant injury. Did Frank T. intentionally shoot his neighbor in the body?\n"
            "Options:\n- Yes\n- No\n\n"
            "Think step by step, then answer Yes or No."
        ),
        "answer": "no",
    },
    {
        "prompt": (
            "How would a typical person answer each of the following questions about causation?\n"
            "Suzy and Billy are working on a project that is very important for our nation's security. "
            "The boss tells them both: \"Be sure that you are here at exactly 9 am. It is absolutely "
            "essential that you arrive at that time.\" Both Billy and Suzy arrive at 9 am. As it happens, "
            "there was a motion detector installed in the room where they arrived. The motion detector "
            "was set up to be triggered if at least one person appeared in the room at the same time. "
            "So the motion detector went off. Did Billy cause the motion detector to go off?\n"
            "Options:\n- Yes\n- No\n\n"
            "Think step by step, then answer Yes or No."
        ),
        "answer": "yes",
    },
    {
        "prompt": (
            "How would a typical person answer each of the following questions about causation?\n"
            "George and his sister Lena reunite at their parents' house for Thanksgiving. Whereas "
            "George just got into medical school, Lena is unhappy in her marriage and recently lost "
            "her job. Over the course of the day, George and Lena get into a number of heated arguments. "
            "Later in the afternoon they play a game of darts. They split the first two games, and the "
            "third game is close until the end. Who will win comes down to George's last shot. If he "
            "hits a high point region, he wins; if he hits a low point region, Lena wins. George thinks "
            "of the difficult time Lena is having, and he really wants to let her win. He aims the dart "
            "at the low point region. He sets up his shot and the dart lands in the low point region. "
            "After his shot, Lena wins the game and is very happy. Did George hit the low point region "
            "intentionally?\n"
            "Options:\n- Yes\n- No\n\n"
            "Think step by step, then answer Yes or No."
        ),
        "answer": "yes",
    },
]
# ─── Date Understanding (multiple choice letter) ──────────────────
# Answers are lowercase option letters matching one of the listed choices.
DATE_QUESTIONS = [
    {
        "prompt": (
            "Today is Christmas Eve of 1937. What is the date 10 days ago in MM/DD/YYYY?\n"
            "Options:\n"
            "(A) 12/14/2026\n(B) 12/14/1950\n(C) 12/14/2007\n"
            "(D) 12/14/1937\n(E) 07/14/1938\n(F) 12/14/1988\n\n"
            "Think step by step, then give your answer as a single letter."
        ),
        "answer": "d",
    },
    {
        "prompt": (
            "Tomorrow is 11/12/2019. What is the date one year ago from today in MM/DD/YYYY?\n"
            "Options:\n"
            "(A) 09/04/2018\n(B) 11/11/2018\n(C) 08/25/2018\n"
            "(D) 11/02/2018\n(E) 11/04/2018\n\n"
            "Think step by step, then give your answer as a single letter."
        ),
        "answer": "b",
    },
    {
        "prompt": (
            "Jane and John married on Jan 2, 1958. It is their 5-year anniversary today. "
            "What is the date tomorrow in MM/DD/YYYY?\n"
            "Options:\n"
            "(A) 01/11/1961\n(B) 01/03/1963\n(C) 01/18/1961\n"
            "(D) 10/14/1960\n(E) 01/03/1982\n(F) 12/03/1960\n\n"
            "Think step by step, then give your answer as a single letter."
        ),
        "answer": "b",
    },
]
# ─── Logical Deduction (multiple choice letter) ───────────────────
# Three-object ordering puzzles; answers are lowercase option letters.
LOGIC_QUESTIONS = [
    {
        "prompt": (
            "The following paragraphs each describe a set of three objects arranged in a fixed order. "
            "The statements are logically consistent within each paragraph.\n"
            "In a golf tournament, there were three golfers: Amy, Eli, and Eve. "
            "Eve finished above Amy. Eli finished below Amy.\n"
            "Options:\n"
            "(A) Amy finished last\n(B) Eli finished last\n(C) Eve finished last\n\n"
            "Think step by step, then give your answer as a single letter."
        ),
        "answer": "b",
    },
    {
        "prompt": (
            "The following paragraphs each describe a set of three objects arranged in a fixed order. "
            "The statements are logically consistent within each paragraph.\n"
            "On a shelf, there are three books: a white book, a green book, and an orange book. "
            "The green book is to the right of the white book. The orange book is the rightmost.\n"
            "Options:\n"
            "(A) The white book is the leftmost\n(B) The green book is the leftmost\n"
            "(C) The orange book is the leftmost\n\n"
            "Think step by step, then give your answer as a single letter."
        ),
        "answer": "a",
    },
    {
        "prompt": (
            "The following paragraphs each describe a set of three objects arranged in a fixed order. "
            "The statements are logically consistent within each paragraph.\n"
            "On a shelf, there are three books: a red book, a gray book, and a white book. "
            "The white book is to the left of the gray book. The red book is the second from the left.\n"
            "Options:\n"
            "(A) The red book is the leftmost\n(B) The gray book is the leftmost\n"
            "(C) The white book is the leftmost\n\n"
            "Think step by step, then give your answer as a single letter."
        ),
        "answer": "c",
    },
]
# ─── Navigation (Yes/No) ──────────────────────────────────────────
# "Return to start?" path-following puzzles; answers stored lowercase.
NAV_QUESTIONS = [
    {
        "prompt": (
            "If you follow these instructions, do you return to the starting point? "
            "Turn left. Turn around. Turn left. Take 7 steps. Take 2 steps. Take 4 steps. Take 8 steps.\n"
            "Options:\n- Yes\n- No\n\n"
            "Think step by step, then answer Yes or No."
        ),
        "answer": "no",
    },
    {
        "prompt": (
            "If you follow these instructions, do you return to the starting point? "
            "Turn around. Take 1 step. Take 6 steps. Turn around. Take 6 steps. Take 9 steps. Take 1 step.\n"
            "Options:\n- Yes\n- No\n\n"
            "Think step by step, then answer Yes or No."
        ),
        "answer": "no",
    },
    {
        "prompt": (
            "If you follow these instructions, do you return to the starting point? "
            "Always face forward. Take 2 steps right. Take 9 steps left. Take 7 steps right.\n"
            "Options:\n- Yes\n- No\n\n"
            "Think step by step, then answer Yes or No."
        ),
        "answer": "yes",
    },
]
# ─── GSM8K Word Problems (number answer) ──────────────────────────
# Answers are digit strings, matched exactly after normalization.
GSM_QUESTIONS = [
    {
        "prompt": (
            "There are 15 trees in the grove. Grove workers will plant trees in the grove today. "
            "After they are done, there will be 21 trees. How many trees did the grove workers plant today?\n\n"
            "Solve step by step. End with 'The answer is [NUMBER]'."
        ),
        "answer": "6",
    },
    {
        "prompt": (
            "If there are 3 cars in the parking lot and 2 more cars arrive, how many cars are in the parking lot?\n\n"
            "Solve step by step. End with 'The answer is [NUMBER]'."
        ),
        "answer": "5",
    },
    {
        "prompt": (
            "Leah had 32 chocolates and her sister had 42. If they ate 35, how many pieces do they have left in total?\n\n"
            "Solve step by step. End with 'The answer is [NUMBER]'."
        ),
        "answer": "39",
    },
    {
        "prompt": (
            "Jason had 20 lollipops. He gave Denny some lollipops. Now Jason has 12 lollipops. "
            "How many lollipops did Jason give to Denny?\n\n"
            "Solve step by step. End with 'The answer is [NUMBER]'."
        ),
        "answer": "8",
    },
    {
        "prompt": (
            "Shawn has five toys. For Christmas, he got two toys each from his mom and dad. "
            "How many toys does he have now?\n\n"
            "Solve step by step. End with 'The answer is [NUMBER]'."
        ),
        "answer": "9",
    },
]
# ─── All questions with types ─────────────────────────────────────
# Flatten every category bank into one list, tagging each question dict
# with its category name under the "type" key (order preserved).
ALL_REASONING = [
    {**question, "type": category}
    for category, bank in [
        ("causal", CAUSAL_QUESTIONS),
        ("date", DATE_QUESTIONS),
        ("logic", LOGIC_QUESTIONS),
        ("nav", NAV_QUESTIONS),
        ("gsm", GSM_QUESTIONS),
    ]
    for question in bank
]
# Alias for sweep.py import
REASONING_QUESTIONS = ALL_REASONING
def extract_final_answer(response: str) -> str:
    """Extract the final answer token from a chain-of-thought response.

    Tries, in order:
      1. An explicit "the answer is X" phrase, where X runs up to
         punctuation, a newline, or end-of-string. (The end-of-string
         alternative matters: prompts ask models to end with exactly
         "The answer is [NUMBER]", which has no trailing punctuation.)
      2. A standalone Yes/No word on the last line.
      3. A standalone option letter A-F on the last line. Word boundaries
         prevent the capital A in words like "Answer:" from being
         mistaken for option A.
    Falls back to the stripped last line of the response.
    """
    match = re.search(r'the answer is\s+(.+?)(?:[\.\!\n\r]|$)', response, re.IGNORECASE)
    if match:
        return match.group(1).strip()
    last_line = response.split('\n')[-1]
    match = re.search(r'\b(yes|no)\b', last_line, re.IGNORECASE)
    if match:
        return match.group(1).strip()
    match = re.search(r'\(?\b([A-F])\b\)?', last_line)
    if match:
        return match.group(1).strip()
    return response.strip().split('\n')[-1].strip()
def score_reasoning_response(question: dict, response: str) -> float:
    """Score a reasoning question. Returns 0 or 1.

    The extracted answer and the reference are normalized (lowercased,
    non-alphanumerics stripped) before exact comparison. For Yes/No
    questions there is a lenient fallback on the raw last line: credit is
    given when the expected word appears and the line is not ambiguous
    about yes vs. no.
    """
    if response is None:
        return 0.0
    expected = re.sub(r'[^a-z0-9]', '', question["answer"].lower().strip())
    got = re.sub(r'[^a-z0-9]', '', extract_final_answer(response).lower().strip())
    if got == expected:
        return 1.0
    if expected in ("yes", "no"):
        tail = response.strip().split('\n')[-1].lower()
        if expected in tail and (expected == "yes") != ("no" in tail):
            return 1.0
    return 0.0
if __name__ == "__main__":
    # Manual smoke test: show per-category question counts and a couple of
    # truncated sample prompts with their reference answers.
    print(f"Reasoning Probe: {len(ALL_REASONING)} questions")
    print(f" Causal: {len(CAUSAL_QUESTIONS)}")
    print(f" Date: {len(DATE_QUESTIONS)}")
    print(f" Logic: {len(LOGIC_QUESTIONS)}")
    print(f" Nav: {len(NAV_QUESTIONS)}")
    print(f" GSM: {len(GSM_QUESTIONS)}")
    print("\nSample questions:")
    for q in ALL_REASONING[:2]:
        print(f"\n [{q['type']}] answer={q['answer']}")
        print(f" {q['prompt'][:80]}...")

447
sweep.py Normal file
View file

@ -0,0 +1,447 @@
#!/usr/bin/env python3
"""
RYS Layer Duplication Sweep
Orchestrates the search for optimal layer duplication configuration:
1. Generate modified GGUF with duplicated layers
2. Start llama-server with the modified model
3. Run math + EQ probes
4. Score and record results
5. Print live results table
6. Kill server, repeat
Usage:
python sweep.py \
--model /path/to/model.gguf \
--llama-server /path/to/llama-server \
--tmpdir /dev/shm/rys \
--results results.jsonl
The sweep strategy:
Pass 1: 8-layer blocks at stride 4 across the middle
Pass 2: Refine within the hot zone with smaller blocks
"""
import argparse
import json
import os
import signal
import subprocess
import sys
import time
from datetime import datetime
from pathlib import Path
import requests
from gguf_surgery import duplicate_layers
from math_probe import MATH_QUESTIONS, score_math_response
from eq_probe import EQ_SCENARIOS, build_eq_prompt, parse_eq_response, score_eq_response
from reasoning_probe import REASONING_QUESTIONS, score_reasoning_response
# Server config
DEFAULT_PORT = 8099  # local llama-server port used for all probe requests
SERVER_STARTUP_TIMEOUT = 120 # seconds to wait for /health to report ready
REQUEST_TIMEOUT = 60 # seconds per completion request
def wait_for_server(port: int, timeout: int = SERVER_STARTUP_TIMEOUT) -> bool:
    """Poll llama-server's /health endpoint until it reports ready.

    Args:
        port: Local port the server listens on.
        timeout: Maximum seconds to keep polling.

    Returns:
        True once /health answers HTTP 200 with {"status": "ok"},
        False if the timeout elapses first.
    """
    url = f"http://127.0.0.1:{port}/health"
    start = time.time()
    while time.time() - start < timeout:
        try:
            r = requests.get(url, timeout=2)
            if r.status_code == 200:
                # A starting server may answer with a non-JSON or partial
                # body; r.json() raising is treated as "not ready yet"
                # (ValueError covers JSON decode errors) instead of
                # crashing the sweep.
                data = r.json()
                if data.get("status") == "ok":
                    return True
        except (requests.RequestException, ValueError):
            # RequestException is the base class covering ConnectionError,
            # Timeout, and other transport failures.
            pass
        time.sleep(1)
    return False
def start_server(llama_server_path: str, model_path: str, port: int,
                 extra_args: list[str] | None = None) -> subprocess.Popen:
    """Start llama-server and return the process handle.

    Child stdout/stderr are redirected to /tmp/rys_server_<port>.log so
    long runs never block on a full pipe. The open log handle and its path
    are stashed on the Popen object as private attributes, read later by
    dump_server_log() and closed by stop_server().

    Args:
        llama_server_path: Path to the llama-server binary.
        model_path: GGUF model to load.
        port: Port for the server to listen on.
        extra_args: Additional CLI args appended to the command line.

    Returns:
        The running server process (still starting up; see wait_for_server).
    """
    cmd = [
        llama_server_path,
        "-m", model_path,
        "--port", str(port),
        "-c", "4096", # small context for probe eval
        "-ngl", "99", # offload all layers to GPU
        "--flash-attn", "on",
        "--cache-type-k", "q8_0",
        "--cache-type-v", "q8_0",
        "--no-warmup",
        "-np", "1", # single slot
    ]
    if extra_args:
        cmd.extend(extra_args)
    print(f" [CMD] {' '.join(cmd)}", flush=True)
    # Let server output go to a log file so we can debug without pipe deadlocks
    log_file = open(log_path := Path(f"/tmp/rys_server_{port}.log"), "w") if False else None
    log_path = Path(f"/tmp/rys_server_{port}.log")
    log_file = open(log_path, "w")
    proc = subprocess.Popen(cmd, stdout=log_file, stderr=subprocess.STDOUT)
    proc._log_file = log_file # keep reference so it doesn't get GC'd
    proc._log_path = log_path
    print(f" [PID] Server started as PID {proc.pid}, log: {log_path}", flush=True)
    return proc
def stop_server(proc: subprocess.Popen):
    """Terminate the server, escalating to SIGKILL if SIGTERM is ignored
    for 10 seconds, then close the log file handle attached by
    start_server (if any)."""
    if proc.poll() is None:
        proc.terminate()
        try:
            proc.wait(timeout=10)
        except subprocess.TimeoutExpired:
            proc.kill()
            proc.wait()
    # Close the log file (absent when the caller built the Popen directly)
    log_handle = getattr(proc, '_log_file', None)
    if log_handle is not None:
        log_handle.close()
def dump_server_log(proc: subprocess.Popen, tail_lines: int = 30):
    """Print the last *tail_lines* lines of the server's log file to
    stderr for debugging. Silently does nothing when the process has no
    recorded log path or the file is gone."""
    log_path = getattr(proc, '_log_path', None)
    if log_path is None or not log_path.exists():
        return
    lines = log_path.read_text().splitlines()
    print(f" --- Server log (last {tail_lines} lines) ---", file=sys.stderr)
    for line in lines[-tail_lines:]:
        print(f" | {line}", file=sys.stderr)
    print(f" --- End server log ---", file=sys.stderr)
def query_model(prompt: str, port: int, max_tokens: int = 64) -> str | None:
    """Send one chat completion request to llama-server.

    Args:
        prompt: User message content.
        port: llama-server port.
        max_tokens: Completion length cap (temperature fixed at 0 for
            reproducible probing).

    Returns:
        The assistant message content, or None on any transport or
        protocol failure (a warning is printed to stderr either way).
    """
    url = f"http://127.0.0.1:{port}/v1/chat/completions"
    payload = {
        "model": "test",
        "messages": [
            {"role": "user", "content": prompt}
        ],
        "max_tokens": max_tokens,
        "temperature": 0.0,
    }
    try:
        r = requests.post(url, json=payload, timeout=REQUEST_TIMEOUT)
        if r.status_code == 200:
            data = r.json()
            return data["choices"][0]["message"]["content"]
        else:
            print(f" [WARN] Server returned {r.status_code}", file=sys.stderr)
            return None
    except (requests.RequestException, ValueError, KeyError, IndexError) as e:
        # RequestException: base class for ConnectionError/Timeout and other
        # transport errors. ValueError: malformed JSON body. KeyError/
        # IndexError: unexpected response schema. Previously any of the
        # latter three would crash the whole sweep mid-run.
        print(f" [WARN] Request failed: {e}", file=sys.stderr)
        return None
def run_math_probe(port: int) -> float:
    """Run every math question and return the mean score (0-1).

    A failed request (None reply) counts as a zero score rather than
    being dropped from the average.
    """
    scores = []
    for question, answer in MATH_QUESTIONS:
        reply = query_model(question, port, max_tokens=48)
        scores.append(score_math_response(answer, reply) if reply is not None else 0.0)
    return sum(scores) / len(scores) if scores else 0.0
def run_eq_probe(port: int) -> float:
    """Run every EQ scenario and return the mean score (0-100).

    A failed request (None reply) counts as zero instead of being dropped.
    """
    totals = []
    for scenario in EQ_SCENARIOS:
        reply = query_model(build_eq_prompt(scenario), port, max_tokens=48)
        if reply is None:
            totals.append(0.0)
            continue
        predicted = parse_eq_response(reply, len(scenario["emotions"]))
        totals.append(score_eq_response(scenario["reference"], predicted))
    return sum(totals) / len(totals) if totals else 0.0
def run_reasoning_probe(port: int) -> dict:
    """Run every reasoning question.

    Returns:
        {"categories": {category: mean score}, "overall": mean over all
        questions}, with scores in 0-1. Failed requests score 0 via
        score_reasoning_response's None handling.
    """
    by_category: dict[str, list[float]] = {}
    for q in REASONING_QUESTIONS:
        reply = query_model(q["prompt"], port, max_tokens=512)
        by_category.setdefault(q["type"], []).append(
            score_reasoning_response(q, reply))
    cat_scores = {cat: (sum(vals) / len(vals) if vals else 0.0)
                  for cat, vals in by_category.items()}
    all_scores = [s for vals in by_category.values() for s in vals]
    overall = sum(all_scores) / len(all_scores) if all_scores else 0.0
    return {"categories": cat_scores, "overall": overall}
def run_evaluation(port: int) -> dict:
    """Run the math, EQ, and reasoning probes (in that order) against the
    server on *port* and bundle all scores into a single flat dict."""
    math_result = run_math_probe(port)
    eq_result = run_eq_probe(port)
    reasoning_result = run_reasoning_probe(port)
    return {
        "math_score": math_result,
        "eq_score": eq_result,
        "reasoning_score": reasoning_result["overall"],
        "reasoning_cats": reasoning_result["categories"],
    }
def print_results_table(results: list[dict], baseline: dict | None = None):
    """Print a live-updating results table.

    Deltas are relative to the baseline row. The "Combined Δ" column is
    eq_delta + reasoning_delta*100 (reasoning is a 0-1 fraction, scaled to
    match EQ's 0-100 range); the math delta is shown but deliberately not
    part of the combined score.
    """
    print("\n" + "=" * 105)
    print(f"{'Config':>12} {'Layers':>8} {'Math':>8} {'EQ':>8} {'Reason':>8} "
          f"{'Math Δ':>8} {'EQ Δ':>8} {'Reas Δ':>8} {'Combined Δ':>11}")
    print("-" * 105)
    if baseline:
        # .get with default 0 tolerates older result rows recorded before
        # the reasoning probe existed.
        brs = baseline.get('reasoning_score', 0)
        print(f"{'BASELINE':>12} {'0':>8} "
              f"{baseline['math_score']:>8.4f} {baseline['eq_score']:>8.2f} {brs:>8.2%} "
              f"{'---':>8} {'---':>8} {'---':>8} {'---':>11}")
        print("-" * 105)
    for r in results:
        config = f"({r['dup_start']},{r['dup_end']})"
        n_dup = r['dup_end'] - r['dup_start']
        rs = r.get('reasoning_score', 0)
        if baseline:
            math_delta = r['math_score'] - baseline['math_score']
            eq_delta = r['eq_score'] - baseline['eq_score']
            reas_delta = rs - baseline.get('reasoning_score', 0)
            # Combined: weight EQ and reasoning more than math
            combined = eq_delta + (reas_delta * 100)
            math_d = f"{math_delta:>+8.4f}"
            eq_d = f"{eq_delta:>+8.2f}"
            reas_d = f"{reas_delta:>+8.2%}"
            comb_d = f"{combined:>+11.2f}"
        else:
            # Without a baseline there is nothing to diff against.
            math_d = eq_d = reas_d = comb_d = "---"
        print(f"{config:>12} {n_dup:>8} "
              f"{r['math_score']:>8.4f} {r['eq_score']:>8.2f} {rs:>8.2%} "
              f"{math_d} {eq_d} {reas_d} {comb_d}")
    print("=" * 105)
    sys.stdout.flush()
def generate_sweep_configs(n_layers: int, block_sizes: list[int],
start_min: int = 4, start_max: int = None,
stride: int = 4) -> list[tuple[int, int]]:
"""
Generate (dup_start, dup_end) configs for the sweep.
Args:
n_layers: Total layers in the model
block_sizes: List of block sizes to try (e.g., [8])
start_min: Earliest layer to start duplication
start_max: Latest layer to start (default: n_layers - max(block_sizes) - 4)
stride: Step between start positions
"""
if start_max is None:
start_max = n_layers - max(block_sizes) - 4
configs = []
for bs in block_sizes:
for start in range(start_min, start_max + 1, stride):
end = start + bs
if end <= n_layers:
configs.append((start, end))
return configs
def main():
    """Drive the sweep: run a baseline eval on the unmodified model, then
    for each (dup_start, dup_end) config generate a modified GGUF, serve
    it, probe it, record the scores, and clean up. Results append to a
    JSONL file so an interrupted sweep resumes where it left off."""
    parser = argparse.ArgumentParser(description="RYS Layer Duplication Sweep")
    parser.add_argument("--model", required=True, help="Path to input GGUF model")
    parser.add_argument("--llama-server", required=True, help="Path to llama-server binary")
    parser.add_argument("--tmpdir", default="/dev/shm/rys",
                        help="Temp directory for modified GGUFs (use tmpfs/RAM)")
    parser.add_argument("--results", default="rys_results.jsonl",
                        help="Output results file (JSONL)")
    parser.add_argument("--port", type=int, default=DEFAULT_PORT)
    parser.add_argument("--block-sizes", type=int, nargs="+", default=[8],
                        help="Block sizes to sweep (default: 8)")
    parser.add_argument("--stride", type=int, default=4,
                        help="Stride between start positions (default: 4)")
    parser.add_argument("--start-min", type=int, default=4,
                        help="Earliest layer to start duplication")
    parser.add_argument("--start-max", type=int, default=None,
                        help="Latest layer to start duplication")
    parser.add_argument("--skip-baseline", action="store_true",
                        help="Skip baseline run (use if already in results)")
    # NOTE(review): REMAINDER swallows everything after it, so --server-args
    # really must be the last flag on the command line.
    parser.add_argument("--server-args", nargs=argparse.REMAINDER, default=[],
                        help="Extra args to pass to llama-server (must be last)")
    args = parser.parse_args()
    model_path = Path(args.model).resolve()
    tmpdir = Path(args.tmpdir)
    tmpdir.mkdir(parents=True, exist_ok=True)
    results_path = Path(args.results)
    results = []
    baseline = None
    # Load existing results if resuming
    if results_path.exists():
        with open(results_path) as f:
            for line in f:
                line = line.strip()
                if line:
                    entry = json.loads(line)
                    if entry.get("is_baseline"):
                        baseline = entry
                    else:
                        results.append(entry)
        print(f"Loaded {len(results)} existing results + baseline={baseline is not None}")
    # Run baseline (unmodified model)
    if not args.skip_baseline and baseline is None:
        print("\n>>> Running BASELINE evaluation...")
        proc = start_server(args.llama_server, str(model_path), args.port, args.server_args)
        try:
            if not wait_for_server(args.port):
                # Without a baseline every later delta is meaningless, so
                # a failed baseline server aborts the whole sweep.
                print("ERROR: Server failed to start for baseline", file=sys.stderr)
                dump_server_log(proc)
                stop_server(proc)
                sys.exit(1)
            print(" Server ready. Running probes...")
            eval_result = run_evaluation(args.port)
            # dup_start/dup_end of -1 mark the baseline row in the JSONL.
            baseline = {
                "is_baseline": True,
                "dup_start": -1,
                "dup_end": -1,
                "math_score": eval_result["math_score"],
                "eq_score": eval_result["eq_score"],
                "reasoning_score": eval_result["reasoning_score"],
                "reasoning_cats": eval_result.get("reasoning_cats", {}),
                "timestamp": datetime.now().isoformat(),
            }
            with open(results_path, "a") as f:
                f.write(json.dumps(baseline) + "\n")
            brs = baseline['reasoning_score']
            print(f" Baseline: math={baseline['math_score']:.4f} eq={baseline['eq_score']:.2f} reasoning={brs:.2%}")
        finally:
            stop_server(proc)
    # Get model layer count from the GGUF metadata
    from gguf import GGUFReader
    reader = GGUFReader(str(model_path), 'r')
    arch_field = reader.get_field('general.architecture')
    arch = arch_field.contents()
    block_count_field = reader.get_field(f'{arch}.block_count')
    n_layers = block_count_field.contents()
    print(f"\nModel: {model_path.name}")
    print(f"Architecture: {arch}, Layers: {n_layers}")
    # Generate sweep configurations
    configs = generate_sweep_configs(
        n_layers=n_layers,
        block_sizes=args.block_sizes,
        start_min=args.start_min,
        start_max=args.start_max,
        stride=args.stride,
    )
    # Filter out already-completed configs
    done = {(r["dup_start"], r["dup_end"]) for r in results}
    configs = [(s, e) for s, e in configs if (s, e) not in done]
    print(f"Configs to test: {len(configs)}")
    if configs:
        print(f" Range: ({configs[0][0]},{configs[0][1]}) to ({configs[-1][0]},{configs[-1][1]})")
    print_results_table(results, baseline)
    for idx, (dup_start, dup_end) in enumerate(configs):
        n_dup = dup_end - dup_start
        config_str = f"({dup_start},{dup_end})"
        print(f"\n>>> [{idx+1}/{len(configs)}] Testing config {config_str} "
              f"(+{n_dup} layers)...")
        # Generate modified GGUF
        modified_path = tmpdir / f"rys_{dup_start}_{dup_end}.gguf"
        print(f" Generating modified GGUF...")
        try:
            duplicate_layers(
                str(model_path), str(modified_path),
                dup_start, dup_end, verbose=False
            )
        except Exception as e:
            # A failed surgery only skips this config; the sweep goes on.
            print(f" ERROR generating GGUF: {e}", file=sys.stderr)
            continue
        # Start server with modified model
        print(f" Starting server...")
        proc = start_server(
            args.llama_server, str(modified_path), args.port, args.server_args
        )
        try:
            if not wait_for_server(args.port):
                # Some duplication configs may simply not load; log and
                # move on to the next config.
                print(f" ERROR: Server failed to start for {config_str}", file=sys.stderr)
                dump_server_log(proc)
                print(f" Check server log above for details.", file=sys.stderr)
                continue
            print(f" Server ready. Running probes...")
            eval_result = run_evaluation(args.port)
            entry = {
                "dup_start": dup_start,
                "dup_end": dup_end,
                "n_dup_layers": n_dup,
                "math_score": eval_result["math_score"],
                "eq_score": eval_result["eq_score"],
                "reasoning_score": eval_result["reasoning_score"],
                "reasoning_cats": eval_result.get("reasoning_cats", {}),
                "timestamp": datetime.now().isoformat(),
            }
            results.append(entry)
            # Append to results file
            with open(results_path, "a") as f:
                f.write(json.dumps(entry) + "\n")
            print_results_table(results, baseline)
        finally:
            stop_server(proc)
            # Clean up modified GGUF to free tmpfs space
            if modified_path.exists():
                modified_path.unlink()
                print(f" Cleaned up {modified_path.name}")
    print("\n\nSweep complete!")
    print_results_table(results, baseline)
if __name__ == "__main__":
    main()

114
visualize.py Normal file
View file

@ -0,0 +1,114 @@
#!/usr/bin/env python3
"""
Visualize RYS sweep results.
Reads the JSONL output from sweep.py, prints ranked table and bar chart.
"""
import json
import sys
def load_results(path: str):
    """Parse a sweep results JSONL file.

    Returns:
        (baseline, results): the entry flagged "is_baseline" (None if
        absent; the last one wins if several exist) and the list of every
        other entry in file order. Blank lines are skipped.
    """
    with open(path) as fh:
        records = [json.loads(text) for text in map(str.strip, fh) if text]
    baseline = None
    results = []
    for record in records:
        if record.get("is_baseline"):
            baseline = record
        else:
            results.append(record)
    return baseline, results
def print_ranked(baseline, results):
    """Print results ranked by combined delta (best first).

    NOTE: mutates each entry in *results*, attaching "math_delta",
    "eq_delta" and "combined" keys. Here combined = math_delta*100 +
    eq_delta — a different weighting than sweep.py's live table, which
    combines EQ with the reasoning delta instead.
    """
    if not baseline:
        print("No baseline found in results!")
        return
    for r in results:
        math_delta = r["math_score"] - baseline["math_score"]
        eq_delta = r["eq_score"] - baseline["eq_score"]
        r["math_delta"] = math_delta
        r["eq_delta"] = eq_delta
        r["combined"] = (math_delta * 100) + eq_delta
    ranked = sorted(results, key=lambda x: x["combined"], reverse=True)
    print(f"\nBaseline: math={baseline['math_score']:.4f} eq={baseline['eq_score']:.2f}")
    print()
    print(f"{'Rank':>4} {'Config':>12} {'Dup':>4} "
          f"{'Math':>8} {'EQ':>8} "
          f"{'Math Δ':>9} {'EQ Δ':>8} {'Combined':>10}")
    print("-" * 80)
    for i, r in enumerate(ranked):
        config = f"({r['dup_start']},{r['dup_end']})"
        n_dup = r['dup_end'] - r['dup_start']
        # Row marker: "+" improved over baseline, "!" clearly degraded
        # (more than 5 combined points lost), " " roughly neutral.
        if r["combined"] > 0:
            marker = "+"
        elif r["combined"] < -5:
            marker = "!"
        else:
            marker = " "
        print(f"{i+1:>4} {config:>12} {n_dup:>4} "
              f"{r['math_score']:>8.4f} {r['eq_score']:>8.2f} "
              f"{r['math_delta']:>+9.4f} {r['eq_delta']:>+8.2f} "
              f"{r['combined']:>+10.2f} {marker}")
    if ranked:
        best = ranked[0]
        worst = ranked[-1]
        print()
        print(f"Best: ({best['dup_start']},{best['dup_end']}) combined={best['combined']:+.2f}")
        print(f"Worst: ({worst['dup_start']},{worst['dup_end']}) combined={worst['combined']:+.2f}")
def print_bar_chart(baseline, results):
    """Print a horizontal bar chart of combined deltas, sorted by start
    position.

    Recomputes and attaches "combined" (math_delta*100 + eq_delta) on each
    entry. Bars scale so the largest |combined| spans `half` columns:
    '#' to the right of the baseline axis '|' for positive deltas, '='
    to its left for negative ones.
    """
    if not baseline or not results:
        return
    for r in results:
        math_delta = r["math_score"] - baseline["math_score"]
        eq_delta = r["eq_score"] - baseline["eq_score"]
        r["combined"] = (math_delta * 100) + eq_delta
    ordered = sorted(results, key=lambda x: x["dup_start"])
    max_val = max(abs(r["combined"]) for r in ordered)
    if max_val == 0:
        # All deltas zero: avoid division by zero; bars all render empty.
        max_val = 1
    half = 20  # columns on each side of the baseline axis
    print(f"\nCombined delta (baseline = |):")
    print(f"{'Config':>12} {'negative':<{half}}|{'positive':<{half}}")
    for r in ordered:
        config = f"({r['dup_start']},{r['dup_end']})"
        val = r["combined"]
        bar_len = int(abs(val) / max_val * half)
        if val >= 0:
            bar = " " * half + "|" + "#" * bar_len
        else:
            # Right-align the negative bar so it ends at the axis.
            pad = half - bar_len
            bar = " " * pad + "=" * bar_len + "|"
        print(f"{config:>12} {bar} {val:+.2f}")
if __name__ == "__main__":
    # CLI: python visualize.py <results.jsonl>
    if len(sys.argv) < 2:
        print(f"Usage: {sys.argv[0]} <results.jsonl>")
        sys.exit(1)
    baseline, results = load_results(sys.argv[1])
    print_ranked(baseline, results)
    print_bar_chart(baseline, results)