diff --git a/compare_eval.py b/compare_eval.py new file mode 100644 index 0000000..e6176e1 --- /dev/null +++ b/compare_eval.py @@ -0,0 +1,216 @@ +#!/usr/bin/env python3 +""" +Compare lm_eval results across multiple runs. + +Reads the results JSON files from lm_eval output directories and +prints a side-by-side comparison table. + +Usage: + python compare_eval.py ./eval_base ./eval_rys_balanced ./eval_rys_triple + python compare_eval.py ./eval_* +""" + +import argparse +import json +import sys +from pathlib import Path + + +def load_results(folder: str) -> dict: + """Load lm_eval results from an output folder.""" + folder = Path(folder) + + # lm_eval saves results in a JSON file inside the folder + # The filename varies, so find it + candidates = list(folder.glob("**/*results*.json")) + if not candidates: + # Try the folder itself as a JSON file + if folder.suffix == '.json' and folder.is_file(): + with open(folder) as f: + return json.load(f) + print(f"WARNING: No results JSON found in {folder}", file=sys.stderr) + return {} + + # Use the most recent one + results_file = max(candidates, key=lambda p: p.stat().st_mtime) + + with open(results_file) as f: + data = json.load(f) + + return data + + +def extract_metrics(data: dict) -> dict: + """Extract task metrics from lm_eval results format.""" + metrics = {} + + results = data.get("results", {}) + for task_name, task_data in results.items(): + for key, value in task_data.items(): + if key.endswith(",none") or key.endswith(",flexible-extract") or key.endswith(",strict-match") or key.endswith(",get-answer"): + # Parse "metric_name,filter_name" format + parts = key.rsplit(",", 1) + metric = parts[0] + filter_name = parts[1] if len(parts) > 1 else "" + + if isinstance(value, (int, float)): + display_name = f"{task_name}" + if filter_name and filter_name != "none": + display_name += f" ({filter_name})" + if metric not in ("alias",): + label = f"{task_name}|{metric}|{filter_name}" + metrics[label] = { + "task": task_name, 
+ "metric": metric, + "filter": filter_name, + "value": value, + } + + return metrics + + +def get_display_name(label: str, metric_info: dict) -> str: + """Create a readable display name from metric info.""" + task = metric_info["task"] + metric = metric_info["metric"] + filt = metric_info["filter"] + + # Shorten common task names + task = task.replace("bbh_cot_fewshot_", "bbh/") + + if filt and filt not in ("none", "get-answer"): + return f"{task} [{filt}]" + return f"{task} [{metric}]" + + +def compare(folders: list[str], names: list[str] = None): + """Compare results across folders.""" + if names is None: + names = [Path(f).name for f in folders] + + # Pad names to equal length + max_name_len = max(len(n) for n in names) + + # Load all results + all_metrics = {} + for i, folder in enumerate(folders): + data = load_results(folder) + metrics = extract_metrics(data) + all_metrics[names[i]] = metrics + + # Collect all unique metric labels + all_labels = set() + for metrics in all_metrics.values(): + all_labels.update(metrics.keys()) + + # Sort labels by task name + sorted_labels = sorted(all_labels) + + # Print header + col_width = 10 + name_col = max(45, max_name_len) + + header = f"{'Metric':<{name_col}}" + for name in names: + header += f" {name:>{col_width}}" + if len(names) > 1: + header += f" {'Δ(last-first)':>{col_width+2}}" + + print() + print("=" * len(header)) + print("lm_eval Results Comparison") + print("=" * len(header)) + print(header) + print("-" * len(header)) + + prev_task = None + for label in sorted_labels: + # Get display info from first run that has this metric + metric_info = None + for metrics in all_metrics.values(): + if label in metrics: + metric_info = metrics[label] + break + if metric_info is None: + continue + + display = get_display_name(label, metric_info) + + # Skip stderr and alias entries + if "stderr" in label.lower() or "alias" in label.lower(): + continue + + # Add separator between tasks + current_task = metric_info["task"] + 
if prev_task and current_task != prev_task: + print() + prev_task = current_task + + row = f"{display:<{name_col}}" + + values = [] + for name in names: + metrics = all_metrics[name] + if label in metrics: + val = metrics[label]["value"] + values.append(val) + if isinstance(val, float): + row += f" {val:>{col_width}.4f}" + else: + row += f" {val:>{col_width}}" + else: + values.append(None) + row += f" {'---':>{col_width}}" + + # Delta column (last - first) + if len(names) > 1 and values[0] is not None and values[-1] is not None: + delta = values[-1] - values[0] + if delta > 0: + row += f" {delta:>+{col_width}.4f}" + elif delta < 0: + row += f" {delta:>+{col_width}.4f}" + else: + row += f" {'0':>{col_width+1}}" + elif len(names) > 1: + row += f" {'---':>{col_width+1}}" + + print(row) + + print("=" * len(header)) + + # Summary: average across all metrics per run + print(f"\n{'Average (all metrics)':<{name_col}}", end="") + avgs = [] + for name in names: + metrics = all_metrics[name] + vals = [m["value"] for m in metrics.values() + if isinstance(m["value"], (int, float)) and "stderr" not in m["metric"]] + if vals: + avg = sum(vals) / len(vals) + avgs.append(avg) + print(f" {avg:>{col_width}.4f}", end="") + else: + avgs.append(None) + print(f" {'---':>{col_width}}", end="") + + if len(avgs) > 1 and avgs[0] is not None and avgs[-1] is not None: + delta = avgs[-1] - avgs[0] + print(f" {delta:>+{col_width}.4f}", end="") + print() + + +def main(): + parser = argparse.ArgumentParser( + description="Compare lm_eval results across runs", + epilog="Example: python compare_eval.py ./eval_base ./eval_rys" + ) + parser.add_argument("folders", nargs="+", help="lm_eval output directories or JSON files") + parser.add_argument("--names", nargs="+", help="Display names for each run (default: folder names)") + args = parser.parse_args() + + names = args.names if args.names else None + compare(args.folders, names) + + +if __name__ == "__main__": + main() diff --git a/eq_probe.py 
b/eq_probe.py new file mode 100644 index 0000000..1dd1329 --- /dev/null +++ b/eq_probe.py @@ -0,0 +1,267 @@ +#!/usr/bin/env python3 +""" +EQ Probe for RYS Layer Duplication Sweep + +Simplified EQ-Bench-style scenarios. Each presents a social situation +and asks the model to rate the intensity of 4 emotions (0-10). +Scored by comparing to reference answers using EQ-Bench v2 scoring. + +The reference answers were crafted to represent reasonable human consensus. +What matters for the sweep is relative performance between configs, +not absolute calibration. +""" + +import json +import re +import math + + +# Each scenario: situation text, 4 emotions to rate, reference scores (0-10) +EQ_SCENARIOS = [ + { + "id": "eq01", + "situation": ( + "David just found out his closest friend has been spreading rumors about him " + "at work for months. When confronted, the friend laughed it off and said " + "\"everyone does it, don't be so sensitive.\" David has been quiet all evening." + ), + "emotions": ["Anger", "Betrayal", "Sadness", "Confusion"], + "reference": [7, 9, 6, 4] + }, + { + "id": "eq02", + "situation": ( + "Maria spent six months preparing a presentation for the board. During the " + "meeting, her manager took credit for all her ideas and presented them as his " + "own. Afterward, he thanked her for 'helping out with the slides.'" + ), + "emotions": ["Frustration", "Resentment", "Humiliation", "Determination"], + "reference": [8, 8, 5, 6] + }, + { + "id": "eq03", + "situation": ( + "After 20 years of marriage, Tom's wife tells him she wants a divorce. " + "She says she still cares about him but hasn't been happy for years. " + "Tom had no idea anything was wrong." + ), + "emotions": ["Shock", "Grief", "Self-doubt", "Denial"], + "reference": [9, 7, 6, 7] + }, + { + "id": "eq04", + "situation": ( + "A young doctor must tell parents that their child's surgery was successful " + "but there will be permanent limitations. 
The parents are already celebrating " + "in the waiting room." + ), + "emotions": ["Relief", "Anxiety", "Empathy", "Professional_detachment"], + "reference": [4, 7, 8, 5] + }, + { + "id": "eq05", + "situation": ( + "Chen receives a prestigious award at a ceremony. As he walks to the stage, " + "he sees his estranged father in the audience - the man who abandoned the " + "family when Chen was twelve." + ), + "emotions": ["Pride", "Anger", "Longing", "Anxiety"], + "reference": [7, 5, 6, 6] + }, + { + "id": "eq06", + "situation": ( + "A retired teacher learns that a former student, who she failed years ago " + "and who dropped out, has become extremely successful. The student publicly " + "credits 'proving my teacher wrong' as their motivation." + ), + "emotions": ["Guilt", "Pride", "Defensiveness", "Amusement"], + "reference": [5, 4, 6, 3] + }, + { + "id": "eq07", + "situation": ( + "Sophie finds out she's been accepted to her dream university on the same " + "day her best friend receives a rejection from the same school. Her friend " + "calls to congratulate her, voice cracking." + ), + "emotions": ["Joy", "Guilt", "Empathy", "Awkwardness"], + "reference": [7, 6, 8, 7] + }, + { + "id": "eq08", + "situation": ( + "A firefighter rescues a child from a burning building. Weeks later, he " + "wakes up screaming from nightmares about the ones he couldn't save in " + "a previous fire. His partner asks if he's okay." + ), + "emotions": ["Satisfaction", "Trauma", "Vulnerability", "Shame"], + "reference": [3, 8, 7, 5] + }, + { + "id": "eq09", + "situation": ( + "An elderly woman's family surprises her with a birthday party. She smiles " + "and thanks everyone, but keeps glancing at an empty chair - where her " + "late husband always sat." + ), + "emotions": ["Gratitude", "Grief", "Loneliness", "Warmth"], + "reference": [7, 7, 6, 6] + }, + { + "id": "eq10", + "situation": ( + "A manager must lay off a team member who is also a close friend and a " + "single parent. 
The company requires it due to budget cuts. HR is waiting " + "for the paperwork." + ), + "emotions": ["Guilt", "Dread", "Helplessness", "Resentment"], + "reference": [8, 8, 7, 5] + }, + { + "id": "eq11", + "situation": ( + "James finds old love letters in the attic from his wife, written to " + "someone else before they met. The letters are passionate and describe " + "a depth of feeling he's not sure she's ever expressed toward him." + ), + "emotions": ["Jealousy", "Insecurity", "Curiosity", "Sadness"], + "reference": [6, 7, 5, 4] + }, + { + "id": "eq12", + "situation": ( + "A teenager confesses to her mother that she's been self-harming. The mother, " + "who is a psychologist, realizes she completely missed the signs despite her " + "professional training." + ), + "emotions": ["Fear", "Guilt", "Love", "Self_criticism"], + "reference": [8, 8, 9, 7] + }, + { + "id": "eq13", + "situation": ( + "A war veteran returns home after two years to find his dog waiting on the " + "porch, much older and thinner. The dog recognizes him immediately and " + "limps over, tail wagging." + ), + "emotions": ["Joy", "Guilt", "Tenderness", "Sorrow"], + "reference": [8, 5, 9, 4] + }, + { + "id": "eq14", + "situation": ( + "During a job interview, the candidate realizes the interviewer is someone " + "she bullied badly in high school. The interviewer clearly recognizes her " + "but proceeds professionally." + ), + "emotions": ["Shame", "Anxiety", "Admiration", "Regret"], + "reference": [7, 8, 4, 7] + }, + { + "id": "eq15", + "situation": ( + "A father watches his daughter's wedding, knowing he has a terminal diagnosis " + "he hasn't shared with the family. He chose to wait until after the wedding " + "to tell them." + ), + "emotions": ["Joy", "Grief", "Protectiveness", "Isolation"], + "reference": [6, 8, 8, 7] + }, + { + "id": "eq16", + "situation": ( + "Two siblings meet for the first time as adults after being separated in " + "foster care as children. 
They look alike but have lived completely different " + "lives. One is wealthy, the other struggles financially." + ), + "emotions": ["Wonder", "Resentment", "Hope", "Grief"], + "reference": [7, 3, 7, 6] + }, +] + + +def build_eq_prompt(scenario: dict) -> str: + """Build the prompt for a single EQ scenario.""" + emotions_str = ", ".join(scenario["emotions"]) + return ( + f"Read the following situation and rate the emotional intensity that " + f"the main character is likely feeling for each of the listed emotions. " + f"Rate each emotion from 0 (not feeling it at all) to 10 (extremely intense).\n\n" + f"Situation: {scenario['situation']}\n\n" + f"Rate these emotions: {emotions_str}\n\n" + f"Respond ONLY with the four numbers separated by commas, in the same order. " + f"Example format: 5, 3, 8, 2\n" + f"Do not include any other text." + ) + + +def parse_eq_response(response: str, n_emotions: int = 4) -> list[float] | None: + """Extract emotion ratings from model response.""" + # Try to find comma-separated numbers + numbers = re.findall(r'(\d+(?:\.\d+)?)', response) + + if len(numbers) < n_emotions: + return None + + try: + # Take the first n_emotions numbers found + ratings = [float(numbers[i]) for i in range(n_emotions)] + # Clamp to 0-10 + ratings = [max(0.0, min(10.0, r)) for r in ratings] + return ratings + except (ValueError, IndexError): + return None + + +def score_eq_response(reference: list[int], predicted: list[float]) -> float: + """ + EQ-Bench v2 style scoring. + Differences 1-4 from reference are scaled down on a curve. + Differences 5-10 count 1:1. + Score 0 = random, 100 = perfect match. 
+ """ + if predicted is None or len(predicted) != len(reference): + return 0.0 + + total_penalty = 0.0 + max_possible_penalty = 10.0 * len(reference) # worst case: all off by 10 + + for ref, pred in zip(reference, predicted): + diff = abs(ref - pred) + # Scale down small differences (EQ-Bench v2 approach) + if diff <= 4: + # Quadratic scaling: diff^2 / 4 so diff=4 -> penalty=4 + penalty = (diff ** 2) / 4.0 + else: + # Linear for larger diffs, continuous at diff=4 (penalty=4) + penalty = diff + total_penalty += penalty + + # Normalize: 0 penalty = score 100, max penalty = score ~0 + score = max(0.0, 100.0 * (1.0 - total_penalty / max_possible_penalty)) + return score + + +# Convenience +EQ_PROMPTS = [(s, build_eq_prompt(s)) for s in EQ_SCENARIOS] + + +if __name__ == "__main__": + print(f"EQ Probe: {len(EQ_SCENARIOS)} scenarios") + print("=" * 60) + + for i, scenario in enumerate(EQ_SCENARIOS): + print(f"\n[{scenario['id']}] Emotions: {scenario['emotions']}") + print(f" Reference: {scenario['reference']}") + prompt = build_eq_prompt(scenario) + print(f" Prompt length: {len(prompt)} chars") + + # Test scoring + print("\n\nScoring tests:") + print(f" Perfect match: {score_eq_response([7, 9, 6, 4], [7, 9, 6, 4]):.1f}") + print(f" All off by 1: {score_eq_response([7, 9, 6, 4], [8, 8, 7, 5]):.1f}") + print(f" All off by 3: {score_eq_response([7, 9, 6, 4], [4, 6, 3, 1]):.1f}") + print(f" All off by 5: {score_eq_response([7, 9, 6, 4], [2, 4, 1, 0]):.1f}") + print(f" Worst case: {score_eq_response([7, 9, 6, 4], [0, 0, 0, 10]):.1f}") + print(f" Unparseable: {score_eq_response([7, 9, 6, 4], None):.1f}") diff --git a/gguf_surgery.py b/gguf_surgery.py new file mode 100644 index 0000000..3940df7 --- /dev/null +++ b/gguf_surgery.py @@ -0,0 +1,221 @@ +#!/usr/bin/env python3 +""" +GGUF Layer Duplication Surgery + +Reads a GGUF model file, duplicates transformer layers i..j-1 so they +execute twice in the forward pass, and writes a new GGUF with the +modified layer structure. 
#!/usr/bin/env python3
"""
GGUF Layer Duplication Surgery

Reads a GGUF model file, duplicates transformer layers i..j-1 so they
execute twice in the forward pass, and writes a new GGUF with the
modified layer structure.

For a model with N layers, configuration (i, j) produces:
    layers 0..j-1, then layers i..j-1 again, then layers j..N-1
    Total layers: N + (j - i)

Tensor naming convention: blk.{layer_idx}.{tensor_name}
Non-block tensors (token_embd, output_norm, output) are copied as-is.
"""

import argparse
import re
import sys
from pathlib import Path

import numpy as np
from tqdm import tqdm

import gguf
from gguf import GGUFReader, GGUFWriter, GGUFValueType


# Matches "blk.<layer_idx>.<tensor_suffix>" tensor names.
BLK_PATTERN = re.compile(r'^blk\.(\d+)\.(.+)$')


def get_field_value(reader: GGUFReader, key: str):
    """Extract a scalar value from a reader field; None if the key is absent."""
    field = reader.get_field(key)
    if field is None:
        return None
    return field.contents()


def duplicate_layers(input_path: str, output_path: str, dup_start: int, dup_end: int, verbose: bool = False):
    """
    Create a new GGUF with layers dup_start..dup_end-1 duplicated.

    The new layer order is:
        Original layers 0..dup_end-1
        Duplicated layers dup_start..dup_end-1 (renumbered)
        Original layers dup_end..N-1 (renumbered)

    Raises ValueError on unreadable metadata or an invalid (dup_start, dup_end)
    range.
    """
    reader = GGUFReader(input_path, 'r')

    arch = get_field_value(reader, gguf.Keys.General.ARCHITECTURE)
    if arch is None:
        raise ValueError("Could not read architecture from GGUF")

    block_count_key = f'{arch}.block_count'
    orig_block_count = get_field_value(reader, block_count_key)
    if orig_block_count is None:
        raise ValueError(f"Could not read {block_count_key} from GGUF")

    # Fix: validate the range up front. Previously this check ran only after
    # n_dup/new_block_count had already been computed and printed.
    if dup_start < 0 or dup_end > orig_block_count or dup_start >= dup_end:
        raise ValueError(
            f"Invalid duplication range ({dup_start}, {dup_end}) "
            f"for model with {orig_block_count} layers"
        )

    n_dup = dup_end - dup_start
    new_block_count = orig_block_count + n_dup

    if verbose:
        print(f"Architecture: {arch}")
        print(f"Original layers: {orig_block_count}")
        print(f"Duplicating layers {dup_start}..{dup_end - 1} ({n_dup} layers)")
        print(f"New layer count: {new_block_count}")

    # Build layer mapping: new_idx -> original_layer_idx
    # Phase 1: original 0..dup_end-1 keep their indices
    # Phase 2: duplicates of dup_start..dup_end-1 get indices dup_end..dup_end+n_dup-1
    # Phase 3: original dup_end..N-1 shift up by n_dup
    layer_map = {}

    for orig_idx in range(dup_end):
        layer_map[orig_idx] = orig_idx

    for k in range(n_dup):
        layer_map[dup_end + k] = dup_start + k

    for orig_idx in range(dup_end, orig_block_count):
        layer_map[orig_idx + n_dup] = orig_idx

    if verbose:
        print("Layer mapping (new -> orig):")
        for new_idx in sorted(layer_map.keys()):
            tag = " [DUP]" if (dup_end <= new_idx < dup_end + n_dup) else ""
            print(f"  new {new_idx:3d} -> orig {layer_map[new_idx]:3d}{tag}")

    # Create writer. GGUFWriter emits the architecture key itself, which is
    # why that field is skipped in the metadata copy below.
    writer = GGUFWriter(output_path, arch=arch, endianess=reader.endianess)

    alignment = get_field_value(reader, gguf.Keys.General.ALIGNMENT)
    if alignment is not None:
        writer.data_alignment = alignment

    # Copy metadata, overriding block_count
    for field in reader.fields.values():
        if field.name == gguf.Keys.General.ARCHITECTURE or field.name.startswith('GGUF.'):
            continue

        val_type = field.types[0]
        sub_type = field.types[-1] if val_type == GGUFValueType.ARRAY else None

        if field.name == block_count_key:
            writer.add_key_value(field.name, new_block_count, val_type)
            if verbose:
                print(f"Modified {field.name}: {orig_block_count} -> {new_block_count}")
        else:
            val = field.contents()
            if val is not None:
                writer.add_key_value(field.name, val, val_type, sub_type=sub_type)

    # Organize tensors by type
    non_block_tensors = []
    block_tensors = {}  # orig_layer_idx -> [(suffix, tensor), ...]

    for tensor in reader.tensors:
        match = BLK_PATTERN.match(tensor.name)
        if match:
            layer_idx = int(match.group(1))
            suffix = match.group(2)
            if layer_idx not in block_tensors:
                block_tensors[layer_idx] = []
            block_tensors[layer_idx].append((suffix, tensor))
        else:
            non_block_tensors.append(tensor)

    # Split non-block tensors into pre-block and post-block so the on-disk
    # order mirrors the forward pass (embeddings, blocks, output head).
    pre_block = []
    post_block = []
    for t in non_block_tensors:
        if 'output' in t.name:
            post_block.append(t)
        else:
            pre_block.append(t)

    # Add tensor infos in order and build write queue
    total_bytes = 0
    block_write_order = []  # (new_name, original_tensor)

    for tensor in pre_block:
        writer.add_tensor_info(
            tensor.name, tensor.data.shape, tensor.data.dtype,
            tensor.data.nbytes, tensor.tensor_type
        )
        total_bytes += tensor.n_bytes

    for new_idx in range(new_block_count):
        orig_idx = layer_map[new_idx]
        if orig_idx not in block_tensors:
            print(f"WARNING: No tensors for original layer {orig_idx}", file=sys.stderr)
            continue
        for suffix, tensor in block_tensors[orig_idx]:
            new_name = f"blk.{new_idx}.{suffix}"
            writer.add_tensor_info(
                new_name, tensor.data.shape, tensor.data.dtype,
                tensor.data.nbytes, tensor.tensor_type
            )
            total_bytes += tensor.n_bytes
            block_write_order.append((new_name, tensor))

    for tensor in post_block:
        writer.add_tensor_info(
            tensor.name, tensor.data.shape, tensor.data.dtype,
            tensor.data.nbytes, tensor.tensor_type
        )
        total_bytes += tensor.n_bytes

    # Write file: header, metadata KVs, tensor infos, then raw tensor data
    # in exactly the order the infos were declared.
    writer.write_header_to_file()
    writer.write_kv_data_to_file()
    writer.write_ti_data_to_file()

    bar = tqdm(desc="Writing GGUF", total=total_bytes, unit="B", unit_scale=True)

    for tensor in pre_block:
        writer.write_tensor_data(tensor.data)
        bar.update(tensor.n_bytes)

    for _, tensor in block_write_order:
        writer.write_tensor_data(tensor.data)
        bar.update(tensor.n_bytes)

    for tensor in post_block:
        writer.write_tensor_data(tensor.data)
        bar.update(tensor.n_bytes)

    bar.close()
    writer.close()

    if verbose:
        print(f"Done. Written to {output_path}")
        out_size = Path(output_path).stat().st_size / (1024**3)
        print(f"Output size: {out_size:.2f} GiB")


def main():
    """CLI entry point: parse arguments and run the layer surgery."""
    parser = argparse.ArgumentParser(
        description="Duplicate layers in a GGUF model (RYS method)"
    )
    parser.add_argument("input", help="Input GGUF file path")
    parser.add_argument("output", help="Output GGUF file path")
    parser.add_argument("-i", "--dup-start", type=int, required=True,
                        help="First layer to duplicate (inclusive)")
    parser.add_argument("-j", "--dup-end", type=int, required=True,
                        help="Last layer to duplicate (exclusive)")
    parser.add_argument("-v", "--verbose", action="store_true")
    args = parser.parse_args()

    duplicate_layers(args.input, args.output, args.dup_start, args.dup_end, args.verbose)


if __name__ == "__main__":
    main()
#!/usr/bin/env python3
"""
Explicit Layer Path GGUF Surgery

You provide the exact sequence of layers the model should execute.
No ambiguous range notation — just list the layers.

Shorthand: use '..' to fill in sequential (inclusive) ranges, e.g.
"0..16,13,14,15,16,17..39" duplicates layers 13-16 once.

Usage:
    python layer_path.py input.gguf output.gguf -p "0..16,13,14,15,16,17..39" -v
"""

import argparse
import re
import sys
from pathlib import Path

import numpy as np
from tqdm import tqdm

import gguf
from gguf import GGUFReader, GGUFWriter, GGUFValueType


# Matches "blk.<layer_idx>.<tensor_suffix>" tensor names.
BLK_PATTERN = re.compile(r'^blk\.(\d+)\.(.+)$')


def get_field_value(reader, key):
    """Extract a scalar value from a reader field; None if the key is absent."""
    field = reader.get_field(key)
    if field is None:
        return None
    return field.contents()


def parse_layer_path(path_str: str) -> list[int]:
    """
    Parse a layer path string into a list of layer indices.

    Supports:
      - Individual numbers: 0,1,2,13,13,14
      - Ranges with ..:     0..16 expands to 0,1,2,...,16 (inclusive)
      - Mixed:              0..12,13,13,13,14..39

    Whitespace is ignored. Raises ValueError on malformed ranges.
    """
    path_str = path_str.replace(' ', '')
    layers = []

    for part in path_str.split(','):
        part = part.strip()  # also drops tabs/newlines, not just spaces
        if not part:
            continue

        if '..' in part:
            # Range: start..end (inclusive)
            pieces = part.split('..')
            if len(pieces) != 2:
                raise ValueError(f"Invalid range: '{part}'. Use 'start..end'")
            start = int(pieces[0])
            end = int(pieces[1])
            if start > end:
                raise ValueError(f"Invalid range: {start}..{end} (start > end)")
            layers.extend(range(start, end + 1))
        else:
            layers.append(int(part))

    return layers


def build_gguf_from_path(input_path: str, output_path: str,
                         layer_path: list[int], verbose: bool = False):
    """
    Create a new GGUF where the forward pass follows the given layer path.

    Raises ValueError on unreadable metadata, out-of-range layer indices,
    or a referenced layer with no tensors.
    """
    reader = GGUFReader(input_path, 'r')

    arch = get_field_value(reader, gguf.Keys.General.ARCHITECTURE)
    if arch is None:
        # Fix: consistent with gguf_surgery.py — fail loudly instead of
        # silently building a bogus 'None.block_count' key.
        raise ValueError("Could not read architecture from GGUF")

    block_count_key = f'{arch}.block_count'
    orig_block_count = get_field_value(reader, block_count_key)
    if orig_block_count is None:
        raise ValueError(f"Could not read {block_count_key} from GGUF")

    # Validate all layer indices before touching the writer
    for idx in layer_path:
        if idx < 0 or idx >= orig_block_count:
            raise ValueError(
                f"Layer {idx} out of range (model has {orig_block_count} layers, 0..{orig_block_count-1})"
            )

    new_block_count = len(layer_path)

    if verbose:
        print(f"Architecture: {arch}")
        print(f"Original layers: {orig_block_count}")
        print(f"New layer count: {new_block_count}")
        print(f"Layer path: {layer_path}")

        # Show which layers are repeated
        from collections import Counter
        counts = Counter(layer_path)
        repeated = {k: v for k, v in counts.items() if v > 1}
        if repeated:
            print(f"Repeated layers: {dict(sorted(repeated.items()))}")
        else:
            print("No repeated layers (just a reorder)")

    # layer_map: new_position -> original_layer_index
    layer_map = {new_idx: orig_idx for new_idx, orig_idx in enumerate(layer_path)}

    # Create writer. GGUFWriter emits the architecture key itself, which is
    # why that field is skipped in the metadata copy below.
    writer = GGUFWriter(output_path, arch=arch, endianess=reader.endianess)

    alignment = get_field_value(reader, gguf.Keys.General.ALIGNMENT)
    if alignment is not None:
        writer.data_alignment = alignment

    # Copy metadata, override block count
    for field in reader.fields.values():
        if field.name == gguf.Keys.General.ARCHITECTURE or field.name.startswith('GGUF.'):
            continue
        val_type = field.types[0]
        sub_type = field.types[-1] if val_type == GGUFValueType.ARRAY else None
        if field.name == block_count_key:
            writer.add_key_value(field.name, new_block_count, val_type)
        else:
            val = field.contents()
            if val is not None:
                writer.add_key_value(field.name, val, val_type, sub_type=sub_type)

    # Organize tensors by layer
    non_block_tensors = []
    block_tensors = {}  # orig_layer_idx -> [(suffix, tensor), ...]

    for tensor in reader.tensors:
        match = BLK_PATTERN.match(tensor.name)
        if match:
            layer_idx = int(match.group(1))
            suffix = match.group(2)
            if layer_idx not in block_tensors:
                block_tensors[layer_idx] = []
            block_tensors[layer_idx].append((suffix, tensor))
        else:
            non_block_tensors.append(tensor)

    # Pre-block (embeddings) vs post-block (output head / final norm)
    pre_block = [t for t in non_block_tensors if 'output' not in t.name]
    post_block = [t for t in non_block_tensors if 'output' in t.name]

    # Add tensor infos and build write order
    total_bytes = 0
    block_write_order = []

    for tensor in pre_block:
        writer.add_tensor_info(tensor.name, tensor.data.shape, tensor.data.dtype,
                               tensor.data.nbytes, tensor.tensor_type)
        total_bytes += tensor.n_bytes

    for new_idx in range(new_block_count):
        orig_idx = layer_map[new_idx]
        if orig_idx not in block_tensors:
            raise ValueError(f"No tensors found for original layer {orig_idx}")
        for suffix, tensor in block_tensors[orig_idx]:
            new_name = f"blk.{new_idx}.{suffix}"
            writer.add_tensor_info(new_name, tensor.data.shape, tensor.data.dtype,
                                   tensor.data.nbytes, tensor.tensor_type)
            total_bytes += tensor.n_bytes
            block_write_order.append(tensor)

    for tensor in post_block:
        writer.add_tensor_info(tensor.name, tensor.data.shape, tensor.data.dtype,
                               tensor.data.nbytes, tensor.tensor_type)
        total_bytes += tensor.n_bytes

    # Write: header, metadata KVs, tensor infos, then raw tensor data in the
    # same order the infos were declared.
    writer.write_header_to_file()
    writer.write_kv_data_to_file()
    writer.write_ti_data_to_file()

    bar = tqdm(desc="Writing GGUF", total=total_bytes, unit="B", unit_scale=True)

    for tensor in pre_block:
        writer.write_tensor_data(tensor.data)
        bar.update(tensor.n_bytes)

    for tensor in block_write_order:
        writer.write_tensor_data(tensor.data)
        bar.update(tensor.n_bytes)

    for tensor in post_block:
        writer.write_tensor_data(tensor.data)
        bar.update(tensor.n_bytes)

    bar.close()
    writer.close()

    if verbose:
        out_size = Path(output_path).stat().st_size / (1024**3)
        print(f"Done. Output: {out_size:.2f} GiB")


def main():
    """CLI entry point: parse the layer path and build the new GGUF."""
    parser = argparse.ArgumentParser(
        description="Build GGUF with explicit layer execution path",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  # Duplicate layers 13-16 once (RYS style)
  %(prog)s model.gguf out.gguf -p "0..16,13,14,15,16,17..39"

  # Triple-pass layers 13-16
  %(prog)s model.gguf out.gguf -p "0..16,13,14,15,16,13,14,15,16,17..39"

  # Repeat just layer 13 four times
  %(prog)s model.gguf out.gguf -p "0..12,13,13,13,13,14..39"

  # Skip layer 5 entirely
  %(prog)s model.gguf out.gguf -p "0..4,6..39"
        """
    )
    parser.add_argument("input", help="Input GGUF file")
    parser.add_argument("output", help="Output GGUF file")
    parser.add_argument("-p", "--path", required=True,
                        help="Layer execution path (e.g. '0..16,13,14,15,16,17..39')")
    parser.add_argument("-v", "--verbose", action="store_true")
    args = parser.parse_args()

    layer_path = parse_layer_path(args.path)
    print(f"Model: {args.input}")
    print(f"Output: {args.output}")
    print(f"Layer path ({len(layer_path)} layers): {layer_path}")

    build_gguf_from_path(args.input, args.output, layer_path, args.verbose)


if __name__ == "__main__":
    main()
+ """ + try: + actual_str = str(int(actual)) + estimate_str = str(int(estimate)) + except (ValueError, OverflowError): + return 0.0 + + max_length = max(len(actual_str), len(estimate_str)) + actual_padded = actual_str.ljust(max_length, "0") + estimate_padded = estimate_str.ljust(max_length, "0") + padding_size = max_length - min(len(actual_str), len(estimate_str)) + + actual_int = int(actual_padded) + estimate_int = int(estimate_padded) + + if max(actual_int, estimate_int) == 0: + return 0.0 + + relative_diff = abs(actual_int - estimate_int) / max(actual_int, estimate_int) + correction_factor = 1 - (padding_size / max_length) + score = (1 - relative_diff) * correction_factor + + return max(0.0, min(score, 1.0)) + + +def generate_math_questions(seed=42): + """ + Generate hard arithmetic questions with known answers. + Mix of operations to test different numeric intuition. + Returns list of (question_text, correct_answer) tuples. + """ + rng = random.Random(seed) + questions = [] + + # Cube roots of large numbers (compute perfect cubes, ask for root) + for _ in range(4): + root = rng.randint(20000, 50000) + cube = root ** 3 + questions.append(( + f"What is the cube root of {cube}? " + f"Answer with just the number, no explanation.", + root + )) + + # Large multiplications + for _ in range(4): + a = rng.randint(100000, 999999) + b = rng.randint(100000, 999999) + product = a * b + questions.append(( + f"What is {a} multiplied by {b}? " + f"Answer with just the number, no explanation.", + product + )) + + # Square roots of large numbers (perfect squares) + for _ in range(4): + root = rng.randint(50000, 200000) + square = root ** 2 + questions.append(( + f"What is the square root of {square}? 
" + f"Answer with just the number, no explanation.", + root + )) + + # Mixed: cube root multiplied by a number + for _ in range(4): + root = rng.randint(100, 999) + cube = root ** 3 + multiplier = rng.randint(10, 99) + answer = root * multiplier + questions.append(( + f"What is the cube root of {cube}, multiplied by {multiplier}? " + f"Answer with just the number, no explanation.", + answer + )) + + return questions + + +def parse_number_from_response(response: str) -> int | None: + """ + Extract the first integer from a model response. + Handles common LLM quirks: commas in numbers, trailing text, etc. + """ + import re + + # Clean up common formatting + text = response.strip() + + # Try to find a number (possibly with commas) + # Match negative or positive integers, possibly with commas + patterns = [ + r'[-+]?[\d,]+', # numbers with optional commas + ] + + for pattern in patterns: + matches = re.findall(pattern, text) + if matches: + # Take the first/longest match + num_str = max(matches, key=len) + num_str = num_str.replace(',', '') + try: + return int(num_str) + except ValueError: + continue + + return None + + +def score_math_response(question_answer: int, response: str) -> float: + """Score a single math response.""" + parsed = parse_number_from_response(response) + if parsed is None: + return 0.0 + return calculate_score(question_answer, parsed) + + +# Pre-generated questions for consistency across runs +MATH_QUESTIONS = generate_math_questions(seed=42) + + +if __name__ == "__main__": + # Print the questions and answers for verification + print("Math Probe Questions:") + print("=" * 60) + for i, (q, a) in enumerate(MATH_QUESTIONS): + print(f"\n[{i+1}] {q}") + print(f" Answer: {a}") + + # Test the scoring function + print("\n\nScoring tests:") + print(f" Exact match: {calculate_score(4302459, 4302459):.4f}") + print(f" Missing digit: {calculate_score(4302459, 430245):.4f}") + print(f" One digit off: {calculate_score(123456789, 123356789):.4f}") + print(f" 
Way off: {calculate_score(4302459, 9999999):.4f}") + print(f" Zero vs nonzero: {calculate_score(4302459, 0):.4f}") diff --git a/multi_repeat.py b/multi_repeat.py new file mode 100644 index 0000000..654fdf6 --- /dev/null +++ b/multi_repeat.py @@ -0,0 +1,198 @@ +#!/usr/bin/env python3 +""" +Multi-repeat GGUF surgery. + +Duplicates a block of layers N times total (default 3 = original + 2 copies). + +For (i=13, j=17, repeats=3) on a 40-layer model: + 0..16, 13..16, 13..16, 17..39 = 48 layers total + The block 13-16 executes 3 times. + +Usage: + python multi_repeat.py \ + /path/to/model.gguf \ + /dev/shm/rys/triple_13_17.gguf \ + -i 13 -j 17 -n 3 -v +""" + +import argparse +import re +import sys +from pathlib import Path + +import numpy as np +from tqdm import tqdm + +import gguf +from gguf import GGUFReader, GGUFWriter, GGUFValueType + +BLK_PATTERN = re.compile(r'^blk\.(\d+)\.(.+)$') + + +def get_field_value(reader, key): + field = reader.get_field(key) + if field is None: + return None + return field.contents() + + +def multi_repeat_layers(input_path, output_path, dup_start, dup_end, n_repeats, verbose=False): + reader = GGUFReader(input_path, 'r') + + arch = get_field_value(reader, gguf.Keys.General.ARCHITECTURE) + block_count_key = f'{arch}.block_count' + orig_block_count = get_field_value(reader, block_count_key) + + n_block = dup_end - dup_start + extra_copies = n_repeats - 1 # original pass counts as 1 + new_block_count = orig_block_count + (n_block * extra_copies) + + if verbose: + print(f"Architecture: {arch}") + print(f"Original layers: {orig_block_count}") + print(f"Block: layers {dup_start}..{dup_end - 1} ({n_block} layers)") + print(f"Repeats: {n_repeats}x (original + {extra_copies} copies)") + print(f"New layer count: {new_block_count}") + + # Build layer map + # Phase 1: original 0..dup_end-1 + # Phase 2..N: copies of dup_start..dup_end-1 + # Phase last: original dup_end..orig-1 (shifted) + layer_map = {} + + # Phase 1: original layers up to dup_end + 
for idx in range(dup_end): + layer_map[idx] = idx + + # Phase 2+: extra copies + offset = dup_end + for copy in range(extra_copies): + for k in range(n_block): + layer_map[offset + k] = dup_start + k + offset += n_block + + # Phase last: remaining original layers shifted + for orig_idx in range(dup_end, orig_block_count): + layer_map[orig_idx + (n_block * extra_copies)] = orig_idx + + assert len(layer_map) == new_block_count + + if verbose: + path = [layer_map[i] for i in range(new_block_count)] + print(f"Execution path ({len(path)} layers):") + i = 0 + while i < len(path): + run_start = path[i] + run_end = run_start + j = i + 1 + while j < len(path) and path[j] == run_end + 1: + run_end = path[j] + j += 1 + if run_start == run_end: + print(f" [{run_start}]") + else: + print(f" [{run_start}..{run_end}]") + i = j + + # Create writer + writer = GGUFWriter(output_path, arch=arch, endianess=reader.endianess) + + alignment = get_field_value(reader, gguf.Keys.General.ALIGNMENT) + if alignment is not None: + writer.data_alignment = alignment + + # Copy metadata + for field in reader.fields.values(): + if field.name == gguf.Keys.General.ARCHITECTURE or field.name.startswith('GGUF.'): + continue + val_type = field.types[0] + sub_type = field.types[-1] if val_type == GGUFValueType.ARRAY else None + if field.name == block_count_key: + writer.add_key_value(field.name, new_block_count, val_type) + else: + val = field.contents() + if val is not None: + writer.add_key_value(field.name, val, val_type, sub_type=sub_type) + + # Organize tensors + non_block_tensors = [] + block_tensors = {} + for tensor in reader.tensors: + match = BLK_PATTERN.match(tensor.name) + if match: + layer_idx = int(match.group(1)) + suffix = match.group(2) + if layer_idx not in block_tensors: + block_tensors[layer_idx] = [] + block_tensors[layer_idx].append((suffix, tensor)) + else: + non_block_tensors.append(tensor) + + pre_block = [t for t in non_block_tensors if 'output' not in t.name] + post_block = [t 
for t in non_block_tensors if 'output' in t.name] + + total_bytes = 0 + block_write_order = [] + + for tensor in pre_block: + writer.add_tensor_info(tensor.name, tensor.data.shape, tensor.data.dtype, + tensor.data.nbytes, tensor.tensor_type) + total_bytes += tensor.n_bytes + + for new_idx in range(new_block_count): + orig_idx = layer_map[new_idx] + for suffix, tensor in block_tensors[orig_idx]: + new_name = f"blk.{new_idx}.{suffix}" + writer.add_tensor_info(new_name, tensor.data.shape, tensor.data.dtype, + tensor.data.nbytes, tensor.tensor_type) + total_bytes += tensor.n_bytes + block_write_order.append(tensor) + + for tensor in post_block: + writer.add_tensor_info(tensor.name, tensor.data.shape, tensor.data.dtype, + tensor.data.nbytes, tensor.tensor_type) + total_bytes += tensor.n_bytes + + # Write + writer.write_header_to_file() + writer.write_kv_data_to_file() + writer.write_ti_data_to_file() + + bar = tqdm(desc="Writing GGUF", total=total_bytes, unit="B", unit_scale=True) + + for tensor in pre_block: + writer.write_tensor_data(tensor.data) + bar.update(tensor.n_bytes) + for tensor in block_write_order: + writer.write_tensor_data(tensor.data) + bar.update(tensor.n_bytes) + for tensor in post_block: + writer.write_tensor_data(tensor.data) + bar.update(tensor.n_bytes) + + bar.close() + writer.close() + + if verbose: + size = Path(output_path).stat().st_size / (1024**3) + print(f"Written: {output_path} ({size:.2f} GiB)") + + +def main(): + parser = argparse.ArgumentParser(description="Multi-repeat layer duplication") + parser.add_argument("input", help="Input GGUF") + parser.add_argument("output", help="Output GGUF") + parser.add_argument("-i", "--dup-start", type=int, required=True) + parser.add_argument("-j", "--dup-end", type=int, required=True) + parser.add_argument("-n", "--repeats", type=int, default=3, + help="Total times the block executes (default: 3)") + parser.add_argument("-v", "--verbose", action="store_true") + args = parser.parse_args() + + 
multi_repeat_layers(args.input, args.output, + args.dup_start, args.dup_end, + args.repeats, args.verbose) + + +if __name__ == "__main__": + main() diff --git a/reasoning_probe.py b/reasoning_probe.py new file mode 100644 index 0000000..4188760 --- /dev/null +++ b/reasoning_probe.py @@ -0,0 +1,282 @@ +#!/usr/bin/env python3 +""" +Reasoning Probe for RYS Layer Duplication Sweep + +BBH-style questions: causal judgement, date understanding, logical +deduction, navigation, plus basic math word problems. +All answers are short (Yes/No, a letter, or a number). +""" + +import re + + +# ─── Causal Judgement (Yes/No) ───────────────────────────────────── + +CAUSAL_QUESTIONS = [ + { + "prompt": ( + "How would a typical person answer each of the following questions about causation?\n" + "Frank T., had an ongoing dispute with his neighbor over a stretch of land and one day " + "decided to shoot his neighbor in the body. Frank T. had no experience with guns, his " + "hand slipped on the barrel of the gun, and the shot went wild. Nonetheless, the bullet " + "bounced off a large boulder several feet away and hit the neighbor's body, causing " + "significant injury. Did Frank T. intentionally shoot his neighbor in the body?\n" + "Options:\n- Yes\n- No\n\n" + "Think step by step, then answer Yes or No." + ), + "answer": "no", + }, + { + "prompt": ( + "How would a typical person answer each of the following questions about causation?\n" + "Suzy and Billy are working on a project that is very important for our nation's security. " + "The boss tells them both: \"Be sure that you are here at exactly 9 am. It is absolutely " + "essential that you arrive at that time.\" Both Billy and Suzy arrive at 9 am. As it happens, " + "there was a motion detector installed in the room where they arrived. The motion detector " + "was set up to be triggered if at least one person appeared in the room at the same time. " + "So the motion detector went off. 
Did Billy cause the motion detector to go off?\n" + "Options:\n- Yes\n- No\n\n" + "Think step by step, then answer Yes or No." + ), + "answer": "yes", + }, + { + "prompt": ( + "How would a typical person answer each of the following questions about causation?\n" + "George and his sister Lena reunite at their parents' house for Thanksgiving. Whereas " + "George just got into medical school, Lena is unhappy in her marriage and recently lost " + "her job. Over the course of the day, George and Lena get into a number of heated arguments. " + "Later in the afternoon they play a game of darts. They split the first two games, and the " + "third game is close until the end. Who will win comes down to George's last shot. If he " + "hits a high point region, he wins; if he hits a low point region, Lena wins. George thinks " + "of the difficult time Lena is having, and he really wants to let her win. He aims the dart " + "at the low point region. He sets up his shot and the dart lands in the low point region. " + "After his shot, Lena wins the game and is very happy. Did George hit the low point region " + "intentionally?\n" + "Options:\n- Yes\n- No\n\n" + "Think step by step, then answer Yes or No." + ), + "answer": "yes", + }, +] + +# ─── Date Understanding (multiple choice letter) ────────────────── + +DATE_QUESTIONS = [ + { + "prompt": ( + "Today is Christmas Eve of 1937. What is the date 10 days ago in MM/DD/YYYY?\n" + "Options:\n" + "(A) 12/14/2026\n(B) 12/14/1950\n(C) 12/14/2007\n" + "(D) 12/14/1937\n(E) 07/14/1938\n(F) 12/14/1988\n\n" + "Think step by step, then give your answer as a single letter." + ), + "answer": "d", + }, + { + "prompt": ( + "Tomorrow is 11/12/2019. What is the date one year ago from today in MM/DD/YYYY?\n" + "Options:\n" + "(A) 09/04/2018\n(B) 11/11/2018\n(C) 08/25/2018\n" + "(D) 11/02/2018\n(E) 11/04/2018\n\n" + "Think step by step, then give your answer as a single letter." 
+ ), + "answer": "b", + }, + { + "prompt": ( + "Jane and John married on Jan 2, 1958. It is their 5-year anniversary today. " + "What is the date tomorrow in MM/DD/YYYY?\n" + "Options:\n" + "(A) 01/11/1961\n(B) 01/03/1963\n(C) 01/18/1961\n" + "(D) 10/14/1960\n(E) 01/03/1982\n(F) 12/03/1960\n\n" + "Think step by step, then give your answer as a single letter." + ), + "answer": "b", + }, +] + +# ─── Logical Deduction (multiple choice letter) ─────────────────── + +LOGIC_QUESTIONS = [ + { + "prompt": ( + "The following paragraphs each describe a set of three objects arranged in a fixed order. " + "The statements are logically consistent within each paragraph.\n" + "In a golf tournament, there were three golfers: Amy, Eli, and Eve. " + "Eve finished above Amy. Eli finished below Amy.\n" + "Options:\n" + "(A) Amy finished last\n(B) Eli finished last\n(C) Eve finished last\n\n" + "Think step by step, then give your answer as a single letter." + ), + "answer": "b", + }, + { + "prompt": ( + "The following paragraphs each describe a set of three objects arranged in a fixed order. " + "The statements are logically consistent within each paragraph.\n" + "On a shelf, there are three books: a white book, a green book, and an orange book. " + "The green book is to the right of the white book. The orange book is the rightmost.\n" + "Options:\n" + "(A) The white book is the leftmost\n(B) The green book is the leftmost\n" + "(C) The orange book is the leftmost\n\n" + "Think step by step, then give your answer as a single letter." + ), + "answer": "a", + }, + { + "prompt": ( + "The following paragraphs each describe a set of three objects arranged in a fixed order. " + "The statements are logically consistent within each paragraph.\n" + "On a shelf, there are three books: a red book, a gray book, and a white book. " + "The white book is to the left of the gray book. 
The red book is the second from the left.\n" + "Options:\n" + "(A) The red book is the leftmost\n(B) The gray book is the leftmost\n" + "(C) The white book is the leftmost\n\n" + "Think step by step, then give your answer as a single letter." + ), + "answer": "c", + }, +] + +# ─── Navigation (Yes/No) ────────────────────────────────────────── + +NAV_QUESTIONS = [ + { + "prompt": ( + "If you follow these instructions, do you return to the starting point? " + "Turn left. Turn around. Turn left. Take 7 steps. Take 2 steps. Take 4 steps. Take 8 steps.\n" + "Options:\n- Yes\n- No\n\n" + "Think step by step, then answer Yes or No." + ), + "answer": "no", + }, + { + "prompt": ( + "If you follow these instructions, do you return to the starting point? " + "Turn around. Take 1 step. Take 6 steps. Turn around. Take 6 steps. Take 9 steps. Take 1 step.\n" + "Options:\n- Yes\n- No\n\n" + "Think step by step, then answer Yes or No." + ), + "answer": "no", + }, + { + "prompt": ( + "If you follow these instructions, do you return to the starting point? " + "Always face forward. Take 2 steps right. Take 9 steps left. Take 7 steps right.\n" + "Options:\n- Yes\n- No\n\n" + "Think step by step, then answer Yes or No." + ), + "answer": "yes", + }, +] + +# ─── GSM8K Word Problems (number answer) ────────────────────────── + +GSM_QUESTIONS = [ + { + "prompt": ( + "There are 15 trees in the grove. Grove workers will plant trees in the grove today. " + "After they are done, there will be 21 trees. How many trees did the grove workers plant today?\n\n" + "Solve step by step. End with 'The answer is [NUMBER]'." + ), + "answer": "6", + }, + { + "prompt": ( + "If there are 3 cars in the parking lot and 2 more cars arrive, how many cars are in the parking lot?\n\n" + "Solve step by step. End with 'The answer is [NUMBER]'." + ), + "answer": "5", + }, + { + "prompt": ( + "Leah had 32 chocolates and her sister had 42. 
If they ate 35, how many pieces do they have left in total?\n\n"
+            "Solve step by step. End with 'The answer is [NUMBER]'."
+        ),
+        "answer": "39",
+    },
+    {
+        "prompt": (
+            "Jason had 20 lollipops. He gave Denny some lollipops. Now Jason has 12 lollipops. "
+            "How many lollipops did Jason give to Denny?\n\n"
+            "Solve step by step. End with 'The answer is [NUMBER]'."
+        ),
+        "answer": "8",
+    },
+    {
+        "prompt": (
+            "Shawn has five toys. For Christmas, he got two toys each from his mom and dad. "
+            "How many toys does he have now?\n\n"
+            "Solve step by step. End with 'The answer is [NUMBER]'."
+        ),
+        "answer": "9",
+    },
+]
+
+# ─── All questions with types ─────────────────────────────────────
+
+ALL_REASONING = []
+for q in CAUSAL_QUESTIONS:
+    ALL_REASONING.append({**q, "type": "causal"})
+for q in DATE_QUESTIONS:
+    ALL_REASONING.append({**q, "type": "date"})
+for q in LOGIC_QUESTIONS:
+    ALL_REASONING.append({**q, "type": "logic"})
+for q in NAV_QUESTIONS:
+    ALL_REASONING.append({**q, "type": "nav"})
+for q in GSM_QUESTIONS:
+    ALL_REASONING.append({**q, "type": "gsm"})
+
+# Alias for sweep.py import
+REASONING_QUESTIONS = ALL_REASONING
+
+
+def extract_final_answer(response: str) -> str:
+    """Extract the final answer from a CoT response."""
+    match = re.search(r'the answer is\s+(.+?)(?:[\.\!\n\r]|$)', response, re.IGNORECASE)
+    if match:
+        return match.group(1).strip()
+    match = re.search(r'\b(yes|no)\b', response.split('\n')[-1], re.IGNORECASE)
+    if match:
+        return match.group(1).strip()
+    match = re.search(r'(?<![A-Za-z])([A-F])(?![A-Za-z])', response.split('\n')[-1])
+    if match:
+        return match.group(1).strip()
+    return response.strip().split('\n')[-1].strip()
+
+
+def score_reasoning_response(question: dict, response: str) -> float:
+    """Score a reasoning question. 
Returns 0 or 1."""
+    if response is None:
+        return 0.0
+
+    final = extract_final_answer(response).lower().strip()
+    correct = question["answer"].lower().strip()
+
+    final = re.sub(r'[^a-z0-9]', '', final)
+    correct = re.sub(r'[^a-z0-9]', '', correct)
+
+    if final == correct:
+        return 1.0
+
+    if correct in ("yes", "no"):
+        last_line = response.strip().split('\n')[-1].lower()
+        if re.search(rf'\b{correct}\b', last_line) and (correct == "yes") != bool(re.search(r'\bno\b', last_line)):
+            return 1.0
+
+    return 0.0
+
+
+if __name__ == "__main__":
+    print(f"Reasoning Probe: {len(ALL_REASONING)} questions")
+    print(f"  Causal: {len(CAUSAL_QUESTIONS)}")
+    print(f"  Date: {len(DATE_QUESTIONS)}")
+    print(f"  Logic: {len(LOGIC_QUESTIONS)}")
+    print(f"  Nav: {len(NAV_QUESTIONS)}")
+    print(f"  GSM: {len(GSM_QUESTIONS)}")
+
+    print("\nSample questions:")
+    for q in ALL_REASONING[:2]:
+        print(f"\n  [{q['type']}] answer={q['answer']}")
+        print(f"    {q['prompt'][:80]}...")
diff --git a/sweep.py b/sweep.py
new file mode 100644
index 0000000..83c13ec
--- /dev/null
+++ b/sweep.py
@@ -0,0 +1,447 @@
+#!/usr/bin/env python3
+"""
+RYS Layer Duplication Sweep
+
+Orchestrates the search for optimal layer duplication configuration:
+1. Generate modified GGUF with duplicated layers
+2. Start llama-server with the modified model
+3. Run math + EQ probes
+4. Score and record results
+5. Print live results table
+6. 
Kill server, repeat + +Usage: + python sweep.py \ + --model /path/to/model.gguf \ + --llama-server /path/to/llama-server \ + --tmpdir /dev/shm/rys \ + --results results.jsonl + +The sweep strategy: + Pass 1: 8-layer blocks at stride 4 across the middle + Pass 2: Refine within the hot zone with smaller blocks +""" + +import argparse +import json +import os +import signal +import subprocess +import sys +import time +from datetime import datetime +from pathlib import Path + +import requests + +from gguf_surgery import duplicate_layers +from math_probe import MATH_QUESTIONS, score_math_response +from eq_probe import EQ_SCENARIOS, build_eq_prompt, parse_eq_response, score_eq_response +from reasoning_probe import REASONING_QUESTIONS, score_reasoning_response + + +# Server config +DEFAULT_PORT = 8099 +SERVER_STARTUP_TIMEOUT = 120 # seconds +REQUEST_TIMEOUT = 60 # seconds per completion + + +def wait_for_server(port: int, timeout: int = SERVER_STARTUP_TIMEOUT) -> bool: + """Wait for llama-server to be ready.""" + url = f"http://127.0.0.1:{port}/health" + start = time.time() + while time.time() - start < timeout: + try: + r = requests.get(url, timeout=2) + if r.status_code == 200: + data = r.json() + if data.get("status") == "ok": + return True + except (requests.ConnectionError, requests.Timeout): + pass + time.sleep(1) + return False + + +def start_server(llama_server_path: str, model_path: str, port: int, + extra_args: list[str] = None) -> subprocess.Popen: + """Start llama-server and return the process handle.""" + cmd = [ + llama_server_path, + "-m", model_path, + "--port", str(port), + "-c", "4096", # small context for probe eval + "-ngl", "99", # offload all layers to GPU + "--flash-attn", "on", + "--cache-type-k", "q8_0", + "--cache-type-v", "q8_0", + "--no-warmup", + "-np", "1", # single slot + ] + if extra_args: + cmd.extend(extra_args) + + print(f" [CMD] {' '.join(cmd)}", flush=True) + + # Let server output go to a log file so we can debug without pipe deadlocks 
+ log_path = Path(f"/tmp/rys_server_{port}.log") + log_file = open(log_path, "w") + proc = subprocess.Popen(cmd, stdout=log_file, stderr=subprocess.STDOUT) + proc._log_file = log_file # keep reference so it doesn't get GC'd + proc._log_path = log_path + print(f" [PID] Server started as PID {proc.pid}, log: {log_path}", flush=True) + return proc + + +def stop_server(proc: subprocess.Popen): + """Gracefully stop the server.""" + if proc.poll() is None: + proc.terminate() + try: + proc.wait(timeout=10) + except subprocess.TimeoutExpired: + proc.kill() + proc.wait() + # Close the log file + if hasattr(proc, '_log_file'): + proc._log_file.close() + + +def dump_server_log(proc: subprocess.Popen, tail_lines: int = 30): + """Print the last N lines of the server log for debugging.""" + if hasattr(proc, '_log_path') and proc._log_path.exists(): + lines = proc._log_path.read_text().splitlines() + print(f" --- Server log (last {tail_lines} lines) ---", file=sys.stderr) + for line in lines[-tail_lines:]: + print(f" | {line}", file=sys.stderr) + print(f" --- End server log ---", file=sys.stderr) + + +def query_model(prompt: str, port: int, max_tokens: int = 64) -> str | None: + """Send a completion request to llama-server.""" + url = f"http://127.0.0.1:{port}/v1/chat/completions" + + payload = { + "model": "test", + "messages": [ + {"role": "user", "content": prompt} + ], + "max_tokens": max_tokens, + "temperature": 0.0, + } + + try: + r = requests.post(url, json=payload, timeout=REQUEST_TIMEOUT) + if r.status_code == 200: + data = r.json() + return data["choices"][0]["message"]["content"] + else: + print(f" [WARN] Server returned {r.status_code}", file=sys.stderr) + return None + except (requests.ConnectionError, requests.Timeout) as e: + print(f" [WARN] Request failed: {e}", file=sys.stderr) + return None + + +def run_math_probe(port: int) -> float: + """Run all math questions and return average score (0-1).""" + scores = [] + for question, answer in MATH_QUESTIONS: + response 
= query_model(question, port, max_tokens=48) + if response is not None: + score = score_math_response(answer, response) + scores.append(score) + else: + scores.append(0.0) + return sum(scores) / len(scores) if scores else 0.0 + + +def run_eq_probe(port: int) -> float: + """Run all EQ scenarios and return average score (0-100).""" + scores = [] + for scenario in EQ_SCENARIOS: + prompt = build_eq_prompt(scenario) + response = query_model(prompt, port, max_tokens=48) + if response is not None: + predicted = parse_eq_response(response, len(scenario["emotions"])) + score = score_eq_response(scenario["reference"], predicted) + scores.append(score) + else: + scores.append(0.0) + return sum(scores) / len(scores) if scores else 0.0 + + +def run_reasoning_probe(port: int) -> dict: + """Run all reasoning questions, return scores by category and overall.""" + by_category = {} + for q in REASONING_QUESTIONS: + cat = q["type"] + if cat not in by_category: + by_category[cat] = [] + response = query_model(q["prompt"], port, max_tokens=512) + score = score_reasoning_response(q, response) + by_category[cat].append(score) + + # Per-category averages + cat_scores = {} + for cat, scores in by_category.items(): + cat_scores[cat] = sum(scores) / len(scores) if scores else 0.0 + + # Overall reasoning score (0-1) + all_scores = [s for scores in by_category.values() for s in scores] + overall = sum(all_scores) / len(all_scores) if all_scores else 0.0 + + return {"categories": cat_scores, "overall": overall} + + +def run_evaluation(port: int) -> dict: + """Run all probes and return results.""" + math_score = run_math_probe(port) + eq_score = run_eq_probe(port) + reasoning = run_reasoning_probe(port) + return { + "math_score": math_score, + "eq_score": eq_score, + "reasoning_score": reasoning["overall"], + "reasoning_cats": reasoning["categories"], + } + + +def print_results_table(results: list[dict], baseline: dict | None = None): + """Print a live-updating results table.""" + print("\n" + 
"=" * 105) + print(f"{'Config':>12} {'Layers':>8} {'Math':>8} {'EQ':>8} {'Reason':>8} " + f"{'Math Δ':>8} {'EQ Δ':>8} {'Reas Δ':>8} {'Combined Δ':>11}") + print("-" * 105) + + if baseline: + brs = baseline.get('reasoning_score', 0) + print(f"{'BASELINE':>12} {'0':>8} " + f"{baseline['math_score']:>8.4f} {baseline['eq_score']:>8.2f} {brs:>8.2%} " + f"{'---':>8} {'---':>8} {'---':>8} {'---':>11}") + print("-" * 105) + + for r in results: + config = f"({r['dup_start']},{r['dup_end']})" + n_dup = r['dup_end'] - r['dup_start'] + rs = r.get('reasoning_score', 0) + + if baseline: + math_delta = r['math_score'] - baseline['math_score'] + eq_delta = r['eq_score'] - baseline['eq_score'] + reas_delta = rs - baseline.get('reasoning_score', 0) + # Combined: weight EQ and reasoning more than math + combined = eq_delta + (reas_delta * 100) + math_d = f"{math_delta:>+8.4f}" + eq_d = f"{eq_delta:>+8.2f}" + reas_d = f"{reas_delta:>+8.2%}" + comb_d = f"{combined:>+11.2f}" + else: + math_d = eq_d = reas_d = comb_d = "---" + + print(f"{config:>12} {n_dup:>8} " + f"{r['math_score']:>8.4f} {r['eq_score']:>8.2f} {rs:>8.2%} " + f"{math_d} {eq_d} {reas_d} {comb_d}") + + print("=" * 105) + sys.stdout.flush() + + +def generate_sweep_configs(n_layers: int, block_sizes: list[int], + start_min: int = 4, start_max: int = None, + stride: int = 4) -> list[tuple[int, int]]: + """ + Generate (dup_start, dup_end) configs for the sweep. 
+ + Args: + n_layers: Total layers in the model + block_sizes: List of block sizes to try (e.g., [8]) + start_min: Earliest layer to start duplication + start_max: Latest layer to start (default: n_layers - max(block_sizes) - 4) + stride: Step between start positions + """ + if start_max is None: + start_max = n_layers - max(block_sizes) - 4 + + configs = [] + for bs in block_sizes: + for start in range(start_min, start_max + 1, stride): + end = start + bs + if end <= n_layers: + configs.append((start, end)) + + return configs + + +def main(): + parser = argparse.ArgumentParser(description="RYS Layer Duplication Sweep") + parser.add_argument("--model", required=True, help="Path to input GGUF model") + parser.add_argument("--llama-server", required=True, help="Path to llama-server binary") + parser.add_argument("--tmpdir", default="/dev/shm/rys", + help="Temp directory for modified GGUFs (use tmpfs/RAM)") + parser.add_argument("--results", default="rys_results.jsonl", + help="Output results file (JSONL)") + parser.add_argument("--port", type=int, default=DEFAULT_PORT) + parser.add_argument("--block-sizes", type=int, nargs="+", default=[8], + help="Block sizes to sweep (default: 8)") + parser.add_argument("--stride", type=int, default=4, + help="Stride between start positions (default: 4)") + parser.add_argument("--start-min", type=int, default=4, + help="Earliest layer to start duplication") + parser.add_argument("--start-max", type=int, default=None, + help="Latest layer to start duplication") + parser.add_argument("--skip-baseline", action="store_true", + help="Skip baseline run (use if already in results)") + parser.add_argument("--server-args", nargs=argparse.REMAINDER, default=[], + help="Extra args to pass to llama-server (must be last)") + args = parser.parse_args() + + model_path = Path(args.model).resolve() + tmpdir = Path(args.tmpdir) + tmpdir.mkdir(parents=True, exist_ok=True) + + results_path = Path(args.results) + results = [] + baseline = None + + # 
Load existing results if resuming + if results_path.exists(): + with open(results_path) as f: + for line in f: + line = line.strip() + if line: + entry = json.loads(line) + if entry.get("is_baseline"): + baseline = entry + else: + results.append(entry) + print(f"Loaded {len(results)} existing results + baseline={baseline is not None}") + + # Run baseline (unmodified model) + if not args.skip_baseline and baseline is None: + print("\n>>> Running BASELINE evaluation...") + proc = start_server(args.llama_server, str(model_path), args.port, args.server_args) + try: + if not wait_for_server(args.port): + print("ERROR: Server failed to start for baseline", file=sys.stderr) + dump_server_log(proc) + stop_server(proc) + sys.exit(1) + + print(" Server ready. Running probes...") + eval_result = run_evaluation(args.port) + baseline = { + "is_baseline": True, + "dup_start": -1, + "dup_end": -1, + "math_score": eval_result["math_score"], + "eq_score": eval_result["eq_score"], + "reasoning_score": eval_result["reasoning_score"], + "reasoning_cats": eval_result.get("reasoning_cats", {}), + "timestamp": datetime.now().isoformat(), + } + + with open(results_path, "a") as f: + f.write(json.dumps(baseline) + "\n") + + brs = baseline['reasoning_score'] + print(f" Baseline: math={baseline['math_score']:.4f} eq={baseline['eq_score']:.2f} reasoning={brs:.2%}") + finally: + stop_server(proc) + + # Get model layer count from the GGUF metadata + from gguf import GGUFReader + reader = GGUFReader(str(model_path), 'r') + arch_field = reader.get_field('general.architecture') + arch = arch_field.contents() + block_count_field = reader.get_field(f'{arch}.block_count') + n_layers = block_count_field.contents() + print(f"\nModel: {model_path.name}") + print(f"Architecture: {arch}, Layers: {n_layers}") + + # Generate sweep configurations + configs = generate_sweep_configs( + n_layers=n_layers, + block_sizes=args.block_sizes, + start_min=args.start_min, + start_max=args.start_max, + 
stride=args.stride, + ) + + # Filter out already-completed configs + done = {(r["dup_start"], r["dup_end"]) for r in results} + configs = [(s, e) for s, e in configs if (s, e) not in done] + + print(f"Configs to test: {len(configs)}") + if configs: + print(f" Range: ({configs[0][0]},{configs[0][1]}) to ({configs[-1][0]},{configs[-1][1]})") + + print_results_table(results, baseline) + + for idx, (dup_start, dup_end) in enumerate(configs): + n_dup = dup_end - dup_start + config_str = f"({dup_start},{dup_end})" + print(f"\n>>> [{idx+1}/{len(configs)}] Testing config {config_str} " + f"(+{n_dup} layers)...") + + # Generate modified GGUF + modified_path = tmpdir / f"rys_{dup_start}_{dup_end}.gguf" + print(f" Generating modified GGUF...") + try: + duplicate_layers( + str(model_path), str(modified_path), + dup_start, dup_end, verbose=False + ) + except Exception as e: + print(f" ERROR generating GGUF: {e}", file=sys.stderr) + continue + + # Start server with modified model + print(f" Starting server...") + proc = start_server( + args.llama_server, str(modified_path), args.port, args.server_args + ) + + try: + if not wait_for_server(args.port): + print(f" ERROR: Server failed to start for {config_str}", file=sys.stderr) + dump_server_log(proc) + print(f" Check server log above for details.", file=sys.stderr) + continue + + print(f" Server ready. 
Running probes...") + eval_result = run_evaluation(args.port) + + entry = { + "dup_start": dup_start, + "dup_end": dup_end, + "n_dup_layers": n_dup, + "math_score": eval_result["math_score"], + "eq_score": eval_result["eq_score"], + "reasoning_score": eval_result["reasoning_score"], + "reasoning_cats": eval_result.get("reasoning_cats", {}), + "timestamp": datetime.now().isoformat(), + } + + results.append(entry) + + # Append to results file + with open(results_path, "a") as f: + f.write(json.dumps(entry) + "\n") + + print_results_table(results, baseline) + + finally: + stop_server(proc) + + # Clean up modified GGUF to free tmpfs space + if modified_path.exists(): + modified_path.unlink() + print(f" Cleaned up {modified_path.name}") + + print("\n\nSweep complete!") + print_results_table(results, baseline) + + +if __name__ == "__main__": + main() diff --git a/visualize.py b/visualize.py new file mode 100644 index 0000000..570420d --- /dev/null +++ b/visualize.py @@ -0,0 +1,114 @@ +#!/usr/bin/env python3 +""" +Visualize RYS sweep results. +Reads the JSONL output from sweep.py, prints ranked table and bar chart. 
+""" + +import json +import sys + + +def load_results(path: str): + baseline = None + results = [] + with open(path) as f: + for line in f: + line = line.strip() + if not line: + continue + entry = json.loads(line) + if entry.get("is_baseline"): + baseline = entry + else: + results.append(entry) + return baseline, results + + +def print_ranked(baseline, results): + """Print results ranked by combined delta.""" + if not baseline: + print("No baseline found in results!") + return + + for r in results: + math_delta = r["math_score"] - baseline["math_score"] + eq_delta = r["eq_score"] - baseline["eq_score"] + r["math_delta"] = math_delta + r["eq_delta"] = eq_delta + r["combined"] = (math_delta * 100) + eq_delta + + ranked = sorted(results, key=lambda x: x["combined"], reverse=True) + + print(f"\nBaseline: math={baseline['math_score']:.4f} eq={baseline['eq_score']:.2f}") + print() + print(f"{'Rank':>4} {'Config':>12} {'Dup':>4} " + f"{'Math':>8} {'EQ':>8} " + f"{'Math Δ':>9} {'EQ Δ':>8} {'Combined':>10}") + print("-" * 80) + + for i, r in enumerate(ranked): + config = f"({r['dup_start']},{r['dup_end']})" + n_dup = r['dup_end'] - r['dup_start'] + + if r["combined"] > 0: + marker = "+" + elif r["combined"] < -5: + marker = "!" 
+ else: + marker = " " + + print(f"{i+1:>4} {config:>12} {n_dup:>4} " + f"{r['math_score']:>8.4f} {r['eq_score']:>8.2f} " + f"{r['math_delta']:>+9.4f} {r['eq_delta']:>+8.2f} " + f"{r['combined']:>+10.2f} {marker}") + + if ranked: + best = ranked[0] + worst = ranked[-1] + print() + print(f"Best: ({best['dup_start']},{best['dup_end']}) combined={best['combined']:+.2f}") + print(f"Worst: ({worst['dup_start']},{worst['dup_end']}) combined={worst['combined']:+.2f}") + + +def print_bar_chart(baseline, results): + """Print a horizontal bar chart sorted by start position.""" + if not baseline or not results: + return + + for r in results: + math_delta = r["math_score"] - baseline["math_score"] + eq_delta = r["eq_score"] - baseline["eq_score"] + r["combined"] = (math_delta * 100) + eq_delta + + ordered = sorted(results, key=lambda x: x["dup_start"]) + + max_val = max(abs(r["combined"]) for r in ordered) + if max_val == 0: + max_val = 1 + + half = 20 + print(f"\nCombined delta (baseline = |):") + print(f"{'Config':>12} {'negative':<{half}}|{'positive':<{half}}") + + for r in ordered: + config = f"({r['dup_start']},{r['dup_end']})" + val = r["combined"] + bar_len = int(abs(val) / max_val * half) + + if val >= 0: + bar = " " * half + "|" + "#" * bar_len + else: + pad = half - bar_len + bar = " " * pad + "=" * bar_len + "|" + + print(f"{config:>12} {bar} {val:+.2f}") + + +if __name__ == "__main__": + if len(sys.argv) < 2: + print(f"Usage: {sys.argv[0]} ") + sys.exit(1) + + baseline, results = load_results(sys.argv[1]) + print_ranked(baseline, results) + print_bar_chart(baseline, results)