Add files via upload

This commit is contained in:
alainnothere 2026-03-18 21:06:28 +00:00 committed by GitHub
parent 9ddf9fafd9
commit 8b68afbd3e
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
9 changed files with 2148 additions and 0 deletions

216
compare_eval.py Normal file
View file

@ -0,0 +1,216 @@
#!/usr/bin/env python3
"""
Compare lm_eval results across multiple runs.
Reads the results JSON files from lm_eval output directories and
prints a side-by-side comparison table.
Usage:
python compare_eval.py ./eval_base ./eval_rys_balanced ./eval_rys_triple
python compare_eval.py ./eval_*
"""
import argparse
import json
import sys
from pathlib import Path
def load_results(folder: str) -> dict:
    """Load lm_eval results JSON from an output folder.

    Searches recursively for ``*results*.json`` and loads the most recently
    modified match. A direct path to a ``.json`` file is also accepted.
    Returns {} (with a warning on stderr) when nothing is found.
    """
    root = Path(folder)
    # lm_eval's results filename varies between versions, so search for it.
    found = list(root.glob("**/*results*.json"))
    if not found:
        # Maybe the caller passed the JSON file itself rather than a folder.
        if root.suffix == '.json' and root.is_file():
            with open(root) as fh:
                return json.load(fh)
        print(f"WARNING: No results JSON found in {root}", file=sys.stderr)
        return {}
    # Several matches: the newest one wins.
    newest = max(found, key=lambda hit: hit.stat().st_mtime)
    with open(newest) as fh:
        return json.load(fh)
def extract_metrics(data: dict) -> dict:
    """Extract per-task metric values from the lm_eval results format.

    lm_eval task results use keys shaped like "metric_name,filter_name"
    (e.g. "acc,none", "exact_match,strict-match").

    Returns a dict mapping "task|metric|filter" labels to records of the
    form {"task", "metric", "filter", "value"}. The "alias" pseudo-metric
    and non-numeric values are skipped.
    """
    # Only these filter suffixes mark real metric entries.
    filter_suffixes = (",none", ",flexible-extract", ",strict-match", ",get-answer")
    metrics = {}
    results = data.get("results", {})
    for task_name, task_data in results.items():
        for key, value in task_data.items():
            if not key.endswith(filter_suffixes):
                continue
            # Parse "metric_name,filter_name" format.
            metric, _, filter_name = key.rpartition(",")
            # Skip the alias pseudo-metric and anything non-numeric.
            if metric == "alias" or not isinstance(value, (int, float)):
                continue
            label = f"{task_name}|{metric}|{filter_name}"
            metrics[label] = {
                "task": task_name,
                "metric": metric,
                "filter": filter_name,
                "value": value,
            }
    return metrics
def get_display_name(label: str, metric_info: dict) -> str:
    """Create a readable row label from a metric record."""
    # Shorten common task-name prefixes.
    task = metric_info["task"].replace("bbh_cot_fewshot_", "bbh/")
    filt = metric_info["filter"]
    # Show the filter when it is informative, otherwise the metric name.
    if filt and filt not in ("none", "get-answer"):
        return f"{task} [{filt}]"
    return f"{task} [{metric_info['metric']}]"
def compare(folders: list[str], names: list[str] | None = None):
    """Print a side-by-side comparison table of lm_eval results.

    Args:
        folders: lm_eval output directories (or result JSON files) to compare.
        names: display name per run; defaults to the folder basenames.

    Prints one row per (task, metric, filter) label, a delta column
    (last run minus first run) when comparing more than one run, and a
    final all-metric average per run.
    """
    if names is None:
        names = [Path(f).name for f in folders]
    # The label column adapts to the longest run name (min width 45).
    max_name_len = max(len(n) for n in names)
    # Load all results
    all_metrics = {}
    for i, folder in enumerate(folders):
        data = load_results(folder)
        metrics = extract_metrics(data)
        all_metrics[names[i]] = metrics
    # Collect all unique metric labels (a run may be missing some)
    all_labels = set()
    for metrics in all_metrics.values():
        all_labels.update(metrics.keys())
    # Sort labels by task name
    sorted_labels = sorted(all_labels)
    # Print header
    col_width = 10
    name_col = max(45, max_name_len)
    header = f"{'Metric':<{name_col}}"
    for name in names:
        header += f" {name:>{col_width}}"
    if len(names) > 1:
        header += f" {'Δ(last-first)':>{col_width+2}}"
    print()
    print("=" * len(header))
    print("lm_eval Results Comparison")
    print("=" * len(header))
    print(header)
    print("-" * len(header))
    prev_task = None
    for label in sorted_labels:
        # Skip stderr and alias entries
        if "stderr" in label.lower() or "alias" in label.lower():
            continue
        # Get display info from the first run that has this metric
        metric_info = None
        for metrics in all_metrics.values():
            if label in metrics:
                metric_info = metrics[label]
                break
        if metric_info is None:
            continue
        display = get_display_name(label, metric_info)
        # Blank separator line between tasks
        current_task = metric_info["task"]
        if prev_task and current_task != prev_task:
            print()
        prev_task = current_task
        row = f"{display:<{name_col}}"
        values = []
        for name in names:
            metrics = all_metrics[name]
            if label in metrics:
                val = metrics[label]["value"]
                values.append(val)
                if isinstance(val, float):
                    row += f" {val:>{col_width}.4f}"
                else:
                    row += f" {val:>{col_width}}"
            else:
                # Run is missing this metric entirely.
                values.append(None)
                row += f" {'---':>{col_width}}"
        # Delta column (last - first), signed so regressions stand out
        if len(names) > 1 and values[0] is not None and values[-1] is not None:
            delta = values[-1] - values[0]
            if delta != 0:
                row += f" {delta:>+{col_width}.4f}"
            else:
                row += f" {'0':>{col_width+1}}"
        elif len(names) > 1:
            row += f" {'---':>{col_width+1}}"
        print(row)
    print("=" * len(header))
    # Summary: average across all metrics per run (stderr entries excluded)
    print(f"\n{'Average (all metrics)':<{name_col}}", end="")
    avgs = []
    for name in names:
        metrics = all_metrics[name]
        vals = [m["value"] for m in metrics.values()
                if isinstance(m["value"], (int, float)) and "stderr" not in m["metric"]]
        if vals:
            avg = sum(vals) / len(vals)
            avgs.append(avg)
            print(f" {avg:>{col_width}.4f}", end="")
        else:
            avgs.append(None)
            print(f" {'---':>{col_width}}", end="")
    if len(avgs) > 1 and avgs[0] is not None and avgs[-1] is not None:
        delta = avgs[-1] - avgs[0]
        print(f" {delta:>+{col_width}.4f}", end="")
    print()
def main():
    """CLI entry point: parse arguments and run the comparison."""
    parser = argparse.ArgumentParser(
        description="Compare lm_eval results across runs",
        epilog="Example: python compare_eval.py ./eval_base ./eval_rys",
    )
    parser.add_argument("folders", nargs="+", help="lm_eval output directories or JSON files")
    parser.add_argument("--names", nargs="+", help="Display names for each run (default: folder names)")
    args = parser.parse_args()
    # An absent --names falls back to folder basenames inside compare().
    compare(args.folders, args.names if args.names else None)


if __name__ == "__main__":
    main()

267
eq_probe.py Normal file
View file

@ -0,0 +1,267 @@
#!/usr/bin/env python3
"""
EQ Probe for RYS Layer Duplication Sweep
Simplified EQ-Bench-style scenarios. Each presents a social situation
and asks the model to rate the intensity of 4 emotions (0-10).
Scored by comparing to reference answers using EQ-Bench v2 scoring.
The reference answers were crafted to represent reasonable human consensus.
What matters for the sweep is relative performance between configs,
not absolute calibration.
"""
import json
import re
import math
# Each scenario: situation text, 4 emotions to rate, reference scores (0-10)
EQ_SCENARIOS = [
{
"id": "eq01",
"situation": (
"David just found out his closest friend has been spreading rumors about him "
"at work for months. When confronted, the friend laughed it off and said "
"\"everyone does it, don't be so sensitive.\" David has been quiet all evening."
),
"emotions": ["Anger", "Betrayal", "Sadness", "Confusion"],
"reference": [7, 9, 6, 4]
},
{
"id": "eq02",
"situation": (
"Maria spent six months preparing a presentation for the board. During the "
"meeting, her manager took credit for all her ideas and presented them as his "
"own. Afterward, he thanked her for 'helping out with the slides.'"
),
"emotions": ["Frustration", "Resentment", "Humiliation", "Determination"],
"reference": [8, 8, 5, 6]
},
{
"id": "eq03",
"situation": (
"After 20 years of marriage, Tom's wife tells him she wants a divorce. "
"She says she still cares about him but hasn't been happy for years. "
"Tom had no idea anything was wrong."
),
"emotions": ["Shock", "Grief", "Self-doubt", "Denial"],
"reference": [9, 7, 6, 7]
},
{
"id": "eq04",
"situation": (
"A young doctor must tell parents that their child's surgery was successful "
"but there will be permanent limitations. The parents are already celebrating "
"in the waiting room."
),
"emotions": ["Relief", "Anxiety", "Empathy", "Professional_detachment"],
"reference": [4, 7, 8, 5]
},
{
"id": "eq05",
"situation": (
"Chen receives a prestigious award at a ceremony. As he walks to the stage, "
"he sees his estranged father in the audience - the man who abandoned the "
"family when Chen was twelve."
),
"emotions": ["Pride", "Anger", "Longing", "Anxiety"],
"reference": [7, 5, 6, 6]
},
{
"id": "eq06",
"situation": (
"A retired teacher learns that a former student, who she failed years ago "
"and who dropped out, has become extremely successful. The student publicly "
"credits 'proving my teacher wrong' as their motivation."
),
"emotions": ["Guilt", "Pride", "Defensiveness", "Amusement"],
"reference": [5, 4, 6, 3]
},
{
"id": "eq07",
"situation": (
"Sophie finds out she's been accepted to her dream university on the same "
"day her best friend receives a rejection from the same school. Her friend "
"calls to congratulate her, voice cracking."
),
"emotions": ["Joy", "Guilt", "Empathy", "Awkwardness"],
"reference": [7, 6, 8, 7]
},
{
"id": "eq08",
"situation": (
"A firefighter rescues a child from a burning building. Weeks later, he "
"wakes up screaming from nightmares about the ones he couldn't save in "
"a previous fire. His partner asks if he's okay."
),
"emotions": ["Satisfaction", "Trauma", "Vulnerability", "Shame"],
"reference": [3, 8, 7, 5]
},
{
"id": "eq09",
"situation": (
"An elderly woman's family surprises her with a birthday party. She smiles "
"and thanks everyone, but keeps glancing at an empty chair - where her "
"late husband always sat."
),
"emotions": ["Gratitude", "Grief", "Loneliness", "Warmth"],
"reference": [7, 7, 6, 6]
},
{
"id": "eq10",
"situation": (
"A manager must lay off a team member who is also a close friend and a "
"single parent. The company requires it due to budget cuts. HR is waiting "
"for the paperwork."
),
"emotions": ["Guilt", "Dread", "Helplessness", "Resentment"],
"reference": [8, 8, 7, 5]
},
{
"id": "eq11",
"situation": (
"James finds old love letters in the attic from his wife, written to "
"someone else before they met. The letters are passionate and describe "
"a depth of feeling he's not sure she's ever expressed toward him."
),
"emotions": ["Jealousy", "Insecurity", "Curiosity", "Sadness"],
"reference": [6, 7, 5, 4]
},
{
"id": "eq12",
"situation": (
"A teenager confesses to her mother that she's been self-harming. The mother, "
"who is a psychologist, realizes she completely missed the signs despite her "
"professional training."
),
"emotions": ["Fear", "Guilt", "Love", "Self_criticism"],
"reference": [8, 8, 9, 7]
},
{
"id": "eq13",
"situation": (
"A war veteran returns home after two years to find his dog waiting on the "
"porch, much older and thinner. The dog recognizes him immediately and "
"limps over, tail wagging."
),
"emotions": ["Joy", "Guilt", "Tenderness", "Sorrow"],
"reference": [8, 5, 9, 4]
},
{
"id": "eq14",
"situation": (
"During a job interview, the candidate realizes the interviewer is someone "
"she bullied badly in high school. The interviewer clearly recognizes her "
"but proceeds professionally."
),
"emotions": ["Shame", "Anxiety", "Admiration", "Regret"],
"reference": [7, 8, 4, 7]
},
{
"id": "eq15",
"situation": (
"A father watches his daughter's wedding, knowing he has a terminal diagnosis "
"he hasn't shared with the family. He chose to wait until after the wedding "
"to tell them."
),
"emotions": ["Joy", "Grief", "Protectiveness", "Isolation"],
"reference": [6, 8, 8, 7]
},
{
"id": "eq16",
"situation": (
"Two siblings meet for the first time as adults after being separated in "
"foster care as children. They look alike but have lived completely different "
"lives. One is wealthy, the other struggles financially."
),
"emotions": ["Wonder", "Resentment", "Hope", "Grief"],
"reference": [7, 3, 7, 6]
},
]
def build_eq_prompt(scenario: dict) -> str:
    """Render one EQ scenario as a self-contained rating prompt."""
    listed = ", ".join(scenario["emotions"])
    # Four blank-line-separated sections: instructions, situation,
    # emotion list, and the strict answer-format requirement.
    sections = [
        "Read the following situation and rate the emotional intensity that "
        "the main character is likely feeling for each of the listed emotions. "
        "Rate each emotion from 0 (not feeling it at all) to 10 (extremely intense).",
        f"Situation: {scenario['situation']}",
        f"Rate these emotions: {listed}",
        "Respond ONLY with the four numbers separated by commas, in the same order. "
        "Example format: 5, 3, 8, 2\n"
        "Do not include any other text.",
    ]
    return "\n\n".join(sections)
def parse_eq_response(response: str, n_emotions: int = 4) -> list[float] | None:
"""Extract emotion ratings from model response."""
# Try to find comma-separated numbers
numbers = re.findall(r'(\d+(?:\.\d+)?)', response)
if len(numbers) < n_emotions:
return None
try:
# Take the first n_emotions numbers found
ratings = [float(numbers[i]) for i in range(n_emotions)]
# Clamp to 0-10
ratings = [max(0.0, min(10.0, r)) for r in ratings]
return ratings
except (ValueError, IndexError):
return None
def score_eq_response(reference: list[int], predicted: list[float]) -> float:
    """EQ-Bench v2 style scoring for one scenario.

    Small gaps (<= 4) from the reference are softened quadratically
    (gap^2 / 4, which meets the linear branch continuously at gap=4);
    larger gaps count 1:1. Returns 100 for a perfect match and 0 for a
    maximally wrong or unparseable answer.
    """
    if predicted is None or len(predicted) != len(reference):
        return 0.0
    worst = 10.0 * len(reference)  # every emotion off by the full 10 points
    penalty = 0.0
    for want, got in zip(reference, predicted):
        gap = abs(want - got)
        penalty += (gap ** 2) / 4.0 if gap <= 4 else gap
    return max(0.0, 100.0 * (1.0 - penalty / worst))
# Convenience: (scenario, rendered prompt) pairs, precomputed at import time.
EQ_PROMPTS = [(s, build_eq_prompt(s)) for s in EQ_SCENARIOS]
if __name__ == "__main__":
    # Self-check: dump every scenario, then sanity-test the scorer on
    # hand-picked predictions of increasing error.
    print(f"EQ Probe: {len(EQ_SCENARIOS)} scenarios")
    print("=" * 60)
    for scenario in EQ_SCENARIOS:
        print(f"\n[{scenario['id']}] Emotions: {scenario['emotions']}")
        print(f" Reference: {scenario['reference']}")
        prompt = build_eq_prompt(scenario)
        print(f" Prompt length: {len(prompt)} chars")
    print("\n\nScoring tests:")
    reference = [7, 9, 6, 4]
    cases = [
        ("Perfect match", [7, 9, 6, 4]),
        ("All off by 1", [8, 8, 7, 5]),
        ("All off by 3", [4, 6, 3, 1]),
        ("All off by 5", [2, 4, 1, 0]),
        ("Worst case", [0, 0, 0, 10]),
        ("Unparseable", None),
    ]
    for label, guess in cases:
        print(f" {label}: {score_eq_response(reference, guess):.1f}")

221
gguf_surgery.py Normal file
View file

@ -0,0 +1,221 @@
#!/usr/bin/env python3
"""
GGUF Layer Duplication Surgery
Reads a GGUF model file, duplicates transformer layers i..j-1 so they
execute twice in the forward pass, and writes a new GGUF with the
modified layer structure.
For a model with N layers, configuration (i, j) produces:
layers 0..j-1, then layers i..j-1 again, then layers j..N-1
Total layers: N + (j - i)
Tensor naming convention: blk.{layer_idx}.{tensor_name}
Non-block tensors (token_embd, output_norm, output) are copied as-is.
"""
import argparse
import re
import sys
from pathlib import Path
import numpy as np
from tqdm import tqdm
import gguf
from gguf import GGUFReader, GGUFWriter, GGUFValueType
BLK_PATTERN = re.compile(r'^blk\.(\d+)\.(.+)$')
def get_field_value(reader: GGUFReader, key: str):
    """Return the decoded contents of metadata field ``key``, or None if absent."""
    field = reader.get_field(key)
    return None if field is None else field.contents()
def duplicate_layers(input_path: str, output_path: str, dup_start: int, dup_end: int, verbose: bool = False):
    """
    Create a new GGUF with layers dup_start..dup_end-1 duplicated.

    The new layer order is:
      original layers 0..dup_end-1,
      duplicates of dup_start..dup_end-1 (renumbered),
      original layers dup_end..N-1 (renumbered).

    Args:
        input_path: source GGUF file.
        output_path: destination GGUF file.
        dup_start: first layer of the duplicated block (inclusive).
        dup_end: one past the last duplicated layer (exclusive).
        verbose: print the layer mapping and size details.

    Raises:
        ValueError: if architecture/block-count metadata is missing or the
            (dup_start, dup_end) range is invalid for this model.
    """
    reader = GGUFReader(input_path, 'r')
    arch = get_field_value(reader, gguf.Keys.General.ARCHITECTURE)
    if arch is None:
        raise ValueError("Could not read architecture from GGUF")
    # Per-architecture metadata key, e.g. "llama.block_count".
    block_count_key = f'{arch}.block_count'
    orig_block_count = get_field_value(reader, block_count_key)
    if orig_block_count is None:
        raise ValueError(f"Could not read {block_count_key} from GGUF")
    n_dup = dup_end - dup_start
    new_block_count = orig_block_count + n_dup
    if verbose:
        print(f"Architecture: {arch}")
        print(f"Original layers: {orig_block_count}")
        print(f"Duplicating layers {dup_start}..{dup_end - 1} ({n_dup} layers)")
        print(f"New layer count: {new_block_count}")
    if dup_start < 0 or dup_end > orig_block_count or dup_start >= dup_end:
        raise ValueError(
            f"Invalid duplication range ({dup_start}, {dup_end}) "
            f"for model with {orig_block_count} layers"
        )
    # Build layer mapping: new_idx -> original_layer_idx
    # Phase 1: original 0..dup_end-1 keep their indices
    # Phase 2: duplicates of dup_start..dup_end-1 get indices dup_end..dup_end+n_dup-1
    # Phase 3: original dup_end..N-1 shift up by n_dup
    layer_map = {}
    for orig_idx in range(dup_end):
        layer_map[orig_idx] = orig_idx
    for k in range(n_dup):
        layer_map[dup_end + k] = dup_start + k
    for orig_idx in range(dup_end, orig_block_count):
        layer_map[orig_idx + n_dup] = orig_idx
    if verbose:
        print("Layer mapping (new -> orig):")
        for new_idx in sorted(layer_map.keys()):
            tag = " [DUP]" if (dup_end <= new_idx < dup_end + n_dup) else ""
            print(f" new {new_idx:3d} -> orig {layer_map[new_idx]:3d}{tag}")
    # Create writer, carrying over endianness and (if present) alignment.
    writer = GGUFWriter(output_path, arch=arch, endianess=reader.endianess)
    alignment = get_field_value(reader, gguf.Keys.General.ALIGNMENT)
    if alignment is not None:
        writer.data_alignment = alignment
    # Copy metadata, overriding block_count. The architecture key is skipped
    # (GGUFWriter wrote it already), as are GGUF.* housekeeping fields.
    for field in reader.fields.values():
        if field.name == gguf.Keys.General.ARCHITECTURE or field.name.startswith('GGUF.'):
            continue
        val_type = field.types[0]
        # For array fields the element type is the last entry in field.types.
        sub_type = field.types[-1] if val_type == GGUFValueType.ARRAY else None
        if field.name == block_count_key:
            writer.add_key_value(field.name, new_block_count, val_type)
            if verbose:
                print(f"Modified {field.name}: {orig_block_count} -> {new_block_count}")
        else:
            val = field.contents()
            if val is not None:
                writer.add_key_value(field.name, val, val_type, sub_type=sub_type)
    # Organize tensors: per-layer "blk.N.*" tensors vs everything else.
    non_block_tensors = []
    block_tensors = {}  # orig_layer_idx -> [(suffix, tensor), ...]
    for tensor in reader.tensors:
        match = BLK_PATTERN.match(tensor.name)
        if match:
            layer_idx = int(match.group(1))
            suffix = match.group(2)
            if layer_idx not in block_tensors:
                block_tensors[layer_idx] = []
            block_tensors[layer_idx].append((suffix, tensor))
        else:
            non_block_tensors.append(tensor)
    # Split non-block tensors into pre-block and post-block.
    # NOTE(review): classification is by substring — anything with "output"
    # in its name (output_norm, output) is written after the blocks.
    pre_block = []
    post_block = []
    for t in non_block_tensors:
        if 'output' in t.name:
            post_block.append(t)
        else:
            pre_block.append(t)
    # Register tensor infos in final file order and build the write queue.
    total_bytes = 0
    block_write_order = []  # (new_name, original_tensor)
    for tensor in pre_block:
        writer.add_tensor_info(
            tensor.name, tensor.data.shape, tensor.data.dtype,
            tensor.data.nbytes, tensor.tensor_type
        )
        total_bytes += tensor.n_bytes
    for new_idx in range(new_block_count):
        orig_idx = layer_map[new_idx]
        if orig_idx not in block_tensors:
            print(f"WARNING: No tensors for original layer {orig_idx}", file=sys.stderr)
            continue
        for suffix, tensor in block_tensors[orig_idx]:
            # Duplicated layers are re-registered under their new block index.
            new_name = f"blk.{new_idx}.{suffix}"
            writer.add_tensor_info(
                new_name, tensor.data.shape, tensor.data.dtype,
                tensor.data.nbytes, tensor.tensor_type
            )
            total_bytes += tensor.n_bytes
            block_write_order.append((new_name, tensor))
    for tensor in post_block:
        writer.add_tensor_info(
            tensor.name, tensor.data.shape, tensor.data.dtype,
            tensor.data.nbytes, tensor.tensor_type
        )
        total_bytes += tensor.n_bytes
    # Write the file: header, KV metadata, tensor infos, then raw tensor
    # data in the same order the infos were registered above.
    writer.write_header_to_file()
    writer.write_kv_data_to_file()
    writer.write_ti_data_to_file()
    bar = tqdm(desc="Writing GGUF", total=total_bytes, unit="B", unit_scale=True)
    for tensor in pre_block:
        writer.write_tensor_data(tensor.data)
        bar.update(tensor.n_bytes)
    for _, tensor in block_write_order:
        writer.write_tensor_data(tensor.data)
        bar.update(tensor.n_bytes)
    for tensor in post_block:
        writer.write_tensor_data(tensor.data)
        bar.update(tensor.n_bytes)
    bar.close()
    writer.close()
    if verbose:
        print(f"Done. Written to {output_path}")
        out_size = Path(output_path).stat().st_size / (1024**3)
        print(f"Output size: {out_size:.2f} GiB")
def main():
    """CLI wrapper around duplicate_layers()."""
    cli = argparse.ArgumentParser(
        description="Duplicate layers in a GGUF model (RYS method)"
    )
    cli.add_argument("input", help="Input GGUF file path")
    cli.add_argument("output", help="Output GGUF file path")
    cli.add_argument("-i", "--dup-start", type=int, required=True,
                     help="First layer to duplicate (inclusive)")
    cli.add_argument("-j", "--dup-end", type=int, required=True,
                     help="Last layer to duplicate (exclusive)")
    cli.add_argument("-v", "--verbose", action="store_true")
    opts = cli.parse_args()
    duplicate_layers(opts.input, opts.output, opts.dup_start, opts.dup_end, opts.verbose)


if __name__ == "__main__":
    main()

248
layer_path.py Normal file
View file

@ -0,0 +1,248 @@
#!/usr/bin/env python3
"""
Explicit Layer Path GGUF Surgery
You provide the exact sequence of layers the model should execute.
No ambiguous range notation — just list the layers.
Examples:
# Normal 40-layer model (identity, for testing)
python layer_path.py model.gguf out.gguf -p 0,1,2,...,39
# Duplicate layers 13-16 once (same as RYS with i=13,j=17)
python layer_path.py model.gguf out.gguf -p 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,13,14,15,16,17,18,...,39
# Repeat layer 13 four times
python layer_path.py model.gguf out.gguf -p 0,1,...,12,13,13,13,13,14,15,...,39
# Triple-pass layers 13-16
python layer_path.py model.gguf out.gguf -p 0,1,...,16,13,14,15,16,13,14,15,16,17,...,39
# Shorthand: use .. to fill in sequential ranges
python layer_path.py model.gguf out.gguf -p 0..16,13,14,15,16,13,14,15,16,17..39
Usage:
python layer_path.py input.gguf output.gguf -p "0..16,13,14,15,16,17..39" -v
"""
import argparse
import re
import sys
from pathlib import Path
import numpy as np
from tqdm import tqdm
import gguf
from gguf import GGUFReader, GGUFWriter, GGUFValueType
BLK_PATTERN = re.compile(r'^blk\.(\d+)\.(.+)$')
def get_field_value(reader, key):
    # Decode one metadata field, tolerating its absence.
    field = reader.get_field(key)
    return field.contents() if field is not None else None
def parse_layer_path(path_str: str) -> list[int]:
    """
    Parse a layer path string into a list of layer indices.

    Supports:
      - Individual numbers: 0,1,2,13,13,14
      - Ranges with ..: 0..16 expands to 0,1,2,...,16 (inclusive)
      - Mixed: 0..12,13,13,13,14..39

    Whitespace is ignored; empty segments (stray commas) are skipped.

    Raises:
        ValueError: on malformed segments or descending ranges.
    """
    result: list[int] = []
    for token in path_str.replace(' ', '').split(','):
        token = token.strip()
        if not token:
            continue
        if '..' not in token:
            result.append(int(token))
            continue
        # Inclusive range: start..end
        bounds = token.split('..')
        if len(bounds) != 2:
            raise ValueError(f"Invalid range: '{token}'. Use 'start..end'")
        lo, hi = int(bounds[0]), int(bounds[1])
        if lo > hi:
            raise ValueError(f"Invalid range: {lo}..{hi} (start > end)")
        result.extend(range(lo, hi + 1))
    return result
def build_gguf_from_path(input_path: str, output_path: str,
                         layer_path: list[int], verbose: bool = False):
    """
    Create a new GGUF where the forward pass follows the given layer path.

    Args:
        input_path: source GGUF file.
        output_path: destination GGUF file.
        layer_path: exact sequence of original layer indices to execute;
            its length becomes the new block count.
        verbose: print the path, repeat counts, and output size.

    Raises:
        ValueError: if a path index is out of range or a referenced layer
            has no tensors in the source file.
    """
    reader = GGUFReader(input_path, 'r')
    arch = get_field_value(reader, gguf.Keys.General.ARCHITECTURE)
    block_count_key = f'{arch}.block_count'
    orig_block_count = get_field_value(reader, block_count_key)
    # NOTE(review): unlike gguf_surgery.py, arch / orig_block_count are not
    # checked for None here — a malformed file fails below with a TypeError
    # rather than a clear error. Confirm whether that is acceptable.
    # Validate all layer indices
    for idx in layer_path:
        if idx < 0 or idx >= orig_block_count:
            raise ValueError(
                f"Layer {idx} out of range (model has {orig_block_count} layers, 0..{orig_block_count-1})"
            )
    new_block_count = len(layer_path)
    if verbose:
        print(f"Architecture: {arch}")
        print(f"Original layers: {orig_block_count}")
        print(f"New layer count: {new_block_count}")
        print(f"Layer path: {layer_path}")
        # Show which layers are repeated
        from collections import Counter
        counts = Counter(layer_path)
        repeated = {k: v for k, v in counts.items() if v > 1}
        if repeated:
            print(f"Repeated layers: {dict(sorted(repeated.items()))}")
        else:
            print("No repeated layers (just a reorder)")
    # layer_map: new_position -> original_layer_index
    layer_map = {new_idx: orig_idx for new_idx, orig_idx in enumerate(layer_path)}
    # Create writer, carrying over endianness and (if present) alignment.
    writer = GGUFWriter(output_path, arch=arch, endianess=reader.endianess)
    alignment = get_field_value(reader, gguf.Keys.General.ALIGNMENT)
    if alignment is not None:
        writer.data_alignment = alignment
    # Copy metadata, override block count. Skip the architecture key
    # (already written by GGUFWriter) and GGUF.* housekeeping fields.
    for field in reader.fields.values():
        if field.name == gguf.Keys.General.ARCHITECTURE or field.name.startswith('GGUF.'):
            continue
        val_type = field.types[0]
        # For array fields the element type is the last entry in field.types.
        sub_type = field.types[-1] if val_type == GGUFValueType.ARRAY else None
        if field.name == block_count_key:
            writer.add_key_value(field.name, new_block_count, val_type)
        else:
            val = field.contents()
            if val is not None:
                writer.add_key_value(field.name, val, val_type, sub_type=sub_type)
    # Organize tensors: per-layer "blk.N.*" tensors vs everything else.
    non_block_tensors = []
    block_tensors = {}
    for tensor in reader.tensors:
        match = BLK_PATTERN.match(tensor.name)
        if match:
            layer_idx = int(match.group(1))
            suffix = match.group(2)
            if layer_idx not in block_tensors:
                block_tensors[layer_idx] = []
            block_tensors[layer_idx].append((suffix, tensor))
        else:
            non_block_tensors.append(tensor)
    # By-substring split: tensors with "output" in the name (output_norm,
    # output) go after the blocks; the rest (token_embd, ...) before.
    pre_block = [t for t in non_block_tensors if 'output' not in t.name]
    post_block = [t for t in non_block_tensors if 'output' in t.name]
    # Register tensor infos in final file order and build the write queue.
    total_bytes = 0
    block_write_order = []
    for tensor in pre_block:
        writer.add_tensor_info(tensor.name, tensor.data.shape, tensor.data.dtype,
                               tensor.data.nbytes, tensor.tensor_type)
        total_bytes += tensor.n_bytes
    for new_idx in range(new_block_count):
        orig_idx = layer_map[new_idx]
        if orig_idx not in block_tensors:
            raise ValueError(f"No tensors found for original layer {orig_idx}")
        for suffix, tensor in block_tensors[orig_idx]:
            # Each path position gets the source layer's tensors under a new index.
            new_name = f"blk.{new_idx}.{suffix}"
            writer.add_tensor_info(new_name, tensor.data.shape, tensor.data.dtype,
                                   tensor.data.nbytes, tensor.tensor_type)
            total_bytes += tensor.n_bytes
            block_write_order.append(tensor)
    for tensor in post_block:
        writer.add_tensor_info(tensor.name, tensor.data.shape, tensor.data.dtype,
                               tensor.data.nbytes, tensor.tensor_type)
        total_bytes += tensor.n_bytes
    # Write the file: header, KV metadata, tensor infos, then tensor data
    # in the same order the infos were registered above.
    writer.write_header_to_file()
    writer.write_kv_data_to_file()
    writer.write_ti_data_to_file()
    bar = tqdm(desc="Writing GGUF", total=total_bytes, unit="B", unit_scale=True)
    for tensor in pre_block:
        writer.write_tensor_data(tensor.data)
        bar.update(tensor.n_bytes)
    for tensor in block_write_order:
        writer.write_tensor_data(tensor.data)
        bar.update(tensor.n_bytes)
    for tensor in post_block:
        writer.write_tensor_data(tensor.data)
        bar.update(tensor.n_bytes)
    bar.close()
    writer.close()
    if verbose:
        out_size = Path(output_path).stat().st_size / (1024**3)
        print(f"Done. Output: {out_size:.2f} GiB")
def main():
    """CLI entry point: parse the layer path and build the new GGUF."""
    cli = argparse.ArgumentParser(
        description="Build GGUF with explicit layer execution path",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
# Duplicate layers 13-16 once (RYS style)
%(prog)s model.gguf out.gguf -p "0..16,13,14,15,16,17..39"
# Triple-pass layers 13-16
%(prog)s model.gguf out.gguf -p "0..16,13,14,15,16,13,14,15,16,17..39"
# Repeat just layer 13 four times
%(prog)s model.gguf out.gguf -p "0..12,13,13,13,13,14..39"
# Skip layer 5 entirely
%(prog)s model.gguf out.gguf -p "0..4,6..39"
""",
    )
    cli.add_argument("input", help="Input GGUF file")
    cli.add_argument("output", help="Output GGUF file")
    cli.add_argument("-p", "--path", required=True,
                     help="Layer execution path (e.g. '0..16,13,14,15,16,17..39')")
    cli.add_argument("-v", "--verbose", action="store_true")
    opts = cli.parse_args()
    resolved = parse_layer_path(opts.path)
    print(f"Model: {opts.input}")
    print(f"Output: {opts.output}")
    print(f"Layer path ({len(resolved)} layers): {resolved}")
    build_gguf_from_path(opts.input, opts.output, resolved, opts.verbose)


if __name__ == "__main__":
    main()

155
math_probe.py Normal file
View file

@ -0,0 +1,155 @@
#!/usr/bin/env python3
"""
Math Probe for RYS Layer Duplication Sweep
Hard arithmetic questions where the model must guess the answer
without chain-of-thought. Scored with Ng's partial-credit function.
"""
import json
import math
import random
def calculate_score(actual, estimate):
    """
    Ng's partial-credit scoring function for numeric guesses.

    The shorter number is right-padded with zeros to the longer one's
    length, the relative difference is penalized, and a correction factor
    discounts answers with the wrong number of digits.

    Args:
        actual: the true answer (anything int() accepts).
        estimate: the model's guessed value.

    Returns:
        A score in [0, 1]: 1.0 for an exact match, 0.0 for unparseable
        or maximally wrong guesses. Negative inputs are not meaningfully
        handled (the probe only generates positive answers).
    """
    try:
        actual_str = str(int(actual))
        estimate_str = str(int(estimate))
    except (ValueError, TypeError, OverflowError):
        # TypeError added: e.g. estimate=None previously crashed here.
        return 0.0
    max_length = max(len(actual_str), len(estimate_str))
    actual_padded = actual_str.ljust(max_length, "0")
    estimate_padded = estimate_str.ljust(max_length, "0")
    padding_size = max_length - min(len(actual_str), len(estimate_str))
    actual_int = int(actual_padded)
    estimate_int = int(estimate_padded)
    if max(actual_int, estimate_int) == 0:
        # Both values are zero: that is an exact match, not a failure.
        # (Previously this returned 0.0, scoring a correct guess of 0
        # as a total miss; it also guards the division below.)
        return 1.0
    relative_diff = abs(actual_int - estimate_int) / max(actual_int, estimate_int)
    correction_factor = 1 - (padding_size / max_length)
    score = (1 - relative_diff) * correction_factor
    return max(0.0, min(score, 1.0))
def generate_math_questions(seed=42):
"""
Generate hard arithmetic questions with known answers.
Mix of operations to test different numeric intuition.
Returns list of (question_text, correct_answer) tuples.
"""
rng = random.Random(seed)
questions = []
# Cube roots of large numbers (compute perfect cubes, ask for root)
for _ in range(4):
root = rng.randint(20000, 50000)
cube = root ** 3
questions.append((
f"What is the cube root of {cube}? "
f"Answer with just the number, no explanation.",
root
))
# Large multiplications
for _ in range(4):
a = rng.randint(100000, 999999)
b = rng.randint(100000, 999999)
product = a * b
questions.append((
f"What is {a} multiplied by {b}? "
f"Answer with just the number, no explanation.",
product
))
# Square roots of large numbers (perfect squares)
for _ in range(4):
root = rng.randint(50000, 200000)
square = root ** 2
questions.append((
f"What is the square root of {square}? "
f"Answer with just the number, no explanation.",
root
))
# Mixed: cube root multiplied by a number
for _ in range(4):
root = rng.randint(100, 999)
cube = root ** 3
multiplier = rng.randint(10, 99)
answer = root * multiplier
questions.append((
f"What is the cube root of {cube}, multiplied by {multiplier}? "
f"Answer with just the number, no explanation.",
answer
))
return questions
def parse_number_from_response(response: str) -> int | None:
"""
Extract the first integer from a model response.
Handles common LLM quirks: commas in numbers, trailing text, etc.
"""
import re
# Clean up common formatting
text = response.strip()
# Try to find a number (possibly with commas)
# Match negative or positive integers, possibly with commas
patterns = [
r'[-+]?[\d,]+', # numbers with optional commas
]
for pattern in patterns:
matches = re.findall(pattern, text)
if matches:
# Take the first/longest match
num_str = max(matches, key=len)
num_str = num_str.replace(',', '')
try:
return int(num_str)
except ValueError:
continue
return None
def score_math_response(question_answer: int, response: str) -> float:
    """Parse the model's reply and score it against the known answer."""
    guess = parse_number_from_response(response)
    # An unparseable reply scores zero outright.
    return 0.0 if guess is None else calculate_score(question_answer, guess)
# Pre-generated questions (fixed seed) so every sweep config sees the
# identical question set.
MATH_QUESTIONS = generate_math_questions(seed=42)
if __name__ == "__main__":
    # Self-check: dump the generated questions, then sanity-check the
    # scorer on hand-picked (truth, guess) pairs.
    print("Math Probe Questions:")
    print("=" * 60)
    for idx, (question, answer) in enumerate(MATH_QUESTIONS):
        print(f"\n[{idx+1}] {question}")
        print(f" Answer: {answer}")
    print("\n\nScoring tests:")
    checks = [
        ("Exact match", 4302459, 4302459),
        ("Missing digit", 4302459, 430245),
        ("One digit off", 123456789, 123356789),
        ("Way off", 4302459, 9999999),
        ("Zero vs nonzero", 4302459, 0),
    ]
    for label, truth, guess in checks:
        print(f" {label}: {calculate_score(truth, guess):.4f}")

198
multi_repeat.py Normal file
View file

@ -0,0 +1,198 @@
#!/usr/bin/env python3
"""
Multi-repeat GGUF surgery.
Duplicates a block of layers N times total (default 3 = original + 2 copies).
For (i=13, j=17, repeats=3) on a 40-layer model:
0..16, 13..16, 13..16, 17..39 = 48 layers total
The block 13-16 executes 3 times.
Usage:
python multi_repeat.py \
/path/to/model.gguf \
/dev/shm/rys/triple_13_17.gguf \
-i 13 -j 17 -n 3 -v
"""
import argparse
import re
import sys
from pathlib import Path
import numpy as np
from tqdm import tqdm
import gguf
from gguf import GGUFReader, GGUFWriter, GGUFValueType
# Matches per-layer tensor names like "blk.13.attn_q.weight",
# capturing (layer index, remaining suffix).
BLK_PATTERN = re.compile(r'^blk\.(\d+)\.(.+)$')
def get_field_value(reader, key):
    """Return the decoded value of a GGUF metadata field.

    Returns None when the key is absent from the file's metadata.
    """
    field = reader.get_field(key)
    return None if field is None else field.contents()
def multi_repeat_layers(input_path, output_path, dup_start, dup_end, n_repeats, verbose=False):
    """Rewrite a GGUF so the layer block [dup_start, dup_end) executes
    n_repeats times in sequence.

    The same source tensors are written once per scheduled position under
    new "blk.<idx>." names, and the architecture's block_count metadata is
    raised to match, so llama.cpp runs the block repeatedly without any
    weight modification.

    Args:
        input_path: Source GGUF file path.
        output_path: Destination GGUF file path (overwritten).
        dup_start: First layer of the repeated block (inclusive).
        dup_end: End of the repeated block (exclusive).
        n_repeats: Total times the block executes (1 = plain copy).
        verbose: Print layout and size details.
    """
    reader = GGUFReader(input_path, 'r')
    arch = get_field_value(reader, gguf.Keys.General.ARCHITECTURE)
    block_count_key = f'{arch}.block_count'
    orig_block_count = get_field_value(reader, block_count_key)
    n_block = dup_end - dup_start
    extra_copies = n_repeats - 1 # original pass counts as 1
    new_block_count = orig_block_count + (n_block * extra_copies)
    if verbose:
        print(f"Architecture: {arch}")
        print(f"Original layers: {orig_block_count}")
        print(f"Block: layers {dup_start}..{dup_end - 1} ({n_block} layers)")
        print(f"Repeats: {n_repeats}x (original + {extra_copies} copies)")
        print(f"New layer count: {new_block_count}")
    # Build layer map: new layer index -> original layer index to copy from.
    # Phase 1: original 0..dup_end-1
    # Phase 2..N: copies of dup_start..dup_end-1
    # Phase last: original dup_end..orig-1 (shifted)
    layer_map = {}
    # Phase 1: original layers up to dup_end
    for idx in range(dup_end):
        layer_map[idx] = idx
    # Phase 2+: extra copies
    offset = dup_end
    for copy in range(extra_copies):
        for k in range(n_block):
            layer_map[offset + k] = dup_start + k
        offset += n_block
    # Phase last: remaining original layers shifted
    for orig_idx in range(dup_end, orig_block_count):
        layer_map[orig_idx + (n_block * extra_copies)] = orig_idx
    assert len(layer_map) == new_block_count
    if verbose:
        # Print the execution path compressed into contiguous runs,
        # e.g. [0..16] [13..16] [13..16] [17..39].
        path = [layer_map[i] for i in range(new_block_count)]
        print(f"Execution path ({len(path)} layers):")
        i = 0
        while i < len(path):
            run_start = path[i]
            run_end = run_start
            j = i + 1
            while j < len(path) and path[j] == run_end + 1:
                run_end = path[j]
                j += 1
            if run_start == run_end:
                print(f" [{run_start}]")
            else:
                print(f" [{run_start}..{run_end}]")
            i = j
    # Create writer
    writer = GGUFWriter(output_path, arch=arch, endianess=reader.endianess)
    alignment = get_field_value(reader, gguf.Keys.General.ALIGNMENT)
    if alignment is not None:
        writer.data_alignment = alignment
    # Copy metadata. Architecture is skipped because GGUFWriter already
    # wrote it via the arch= constructor argument; 'GGUF.' keys are the
    # reader's internal bookkeeping fields, not real metadata.
    for field in reader.fields.values():
        if field.name == gguf.Keys.General.ARCHITECTURE or field.name.startswith('GGUF.'):
            continue
        val_type = field.types[0]
        sub_type = field.types[-1] if val_type == GGUFValueType.ARRAY else None
        if field.name == block_count_key:
            # The one metadata value we actually change.
            writer.add_key_value(field.name, new_block_count, val_type)
        else:
            val = field.contents()
            if val is not None:
                writer.add_key_value(field.name, val, val_type, sub_type=sub_type)
    # Organize tensors: per-layer ("blk.N.*") vs everything else
    # (embeddings, output head, norms).
    non_block_tensors = []
    block_tensors = {}
    for tensor in reader.tensors:
        match = BLK_PATTERN.match(tensor.name)
        if match:
            layer_idx = int(match.group(1))
            suffix = match.group(2)
            if layer_idx not in block_tensors:
                block_tensors[layer_idx] = []
            block_tensors[layer_idx].append((suffix, tensor))
        else:
            non_block_tensors.append(tensor)
    # Keep rough file order: embedding-ish tensors first, output-ish last.
    pre_block = [t for t in non_block_tensors if 'output' not in t.name]
    post_block = [t for t in non_block_tensors if 'output' in t.name]
    total_bytes = 0
    block_write_order = []
    # NOTE(review): tensor.data.shape is the shape as stored by GGUFReader;
    # passing it straight to add_tensor_info assumes the writer expects the
    # same dimension order — confirm against the installed gguf-py version.
    for tensor in pre_block:
        writer.add_tensor_info(tensor.name, tensor.data.shape, tensor.data.dtype,
                               tensor.data.nbytes, tensor.tensor_type)
        total_bytes += tensor.n_bytes
    for new_idx in range(new_block_count):
        orig_idx = layer_map[new_idx]
        for suffix, tensor in block_tensors[orig_idx]:
            new_name = f"blk.{new_idx}.{suffix}"
            writer.add_tensor_info(new_name, tensor.data.shape, tensor.data.dtype,
                                   tensor.data.nbytes, tensor.tensor_type)
            total_bytes += tensor.n_bytes
            block_write_order.append(tensor)
    for tensor in post_block:
        writer.add_tensor_info(tensor.name, tensor.data.shape, tensor.data.dtype,
                               tensor.data.nbytes, tensor.tensor_type)
        total_bytes += tensor.n_bytes
    # Write header/metadata/tensor-info first, then tensor data in the
    # exact order the infos were registered above.
    writer.write_header_to_file()
    writer.write_kv_data_to_file()
    writer.write_ti_data_to_file()
    bar = tqdm(desc="Writing GGUF", total=total_bytes, unit="B", unit_scale=True)
    for tensor in pre_block:
        writer.write_tensor_data(tensor.data)
        bar.update(tensor.n_bytes)
    for tensor in block_write_order:
        # Repeated layers write the same source tensor data multiple times.
        writer.write_tensor_data(tensor.data)
        bar.update(tensor.n_bytes)
    for tensor in post_block:
        writer.write_tensor_data(tensor.data)
        bar.update(tensor.n_bytes)
    bar.close()
    writer.close()
    if verbose:
        size = Path(output_path).stat().st_size / (1024**3)
        print(f"Written: {output_path} ({size:.2f} GiB)")
def main():
    """CLI entry point: parse arguments and run the multi-repeat surgery."""
    ap = argparse.ArgumentParser(description="Multi-repeat layer duplication")
    ap.add_argument("input", help="Input GGUF")
    ap.add_argument("output", help="Output GGUF")
    ap.add_argument("-i", "--dup-start", type=int, required=True)
    ap.add_argument("-j", "--dup-end", type=int, required=True)
    ap.add_argument("-n", "--repeats", type=int, default=3,
                    help="Total times the block executes (default: 3)")
    ap.add_argument("-v", "--verbose", action="store_true")
    opts = ap.parse_args()
    multi_repeat_layers(opts.input, opts.output,
                        dup_start=opts.dup_start, dup_end=opts.dup_end,
                        n_repeats=opts.repeats, verbose=opts.verbose)
if __name__ == "__main__":
    main()

282
reasoning_probe.py Normal file
View file

@ -0,0 +1,282 @@
#!/usr/bin/env python3
"""
Reasoning Probe for RYS Layer Duplication Sweep
BBH-style questions: causal judgement, date understanding, logical
deduction, navigation, plus basic math word problems.
All answers are short (Yes/No, a letter, or a number).
"""
import re
# ─── Causal Judgement (Yes/No) ─────────────────────────────────────
# Each entry is {"prompt": str, "answer": str}. Answers are stored in
# lowercase; scoring normalizes case before comparison.
CAUSAL_QUESTIONS = [
    {
        "prompt": (
            "How would a typical person answer each of the following questions about causation?\n"
            "Frank T., had an ongoing dispute with his neighbor over a stretch of land and one day "
            "decided to shoot his neighbor in the body. Frank T. had no experience with guns, his "
            "hand slipped on the barrel of the gun, and the shot went wild. Nonetheless, the bullet "
            "bounced off a large boulder several feet away and hit the neighbor's body, causing "
            "significant injury. Did Frank T. intentionally shoot his neighbor in the body?\n"
            "Options:\n- Yes\n- No\n\n"
            "Think step by step, then answer Yes or No."
        ),
        "answer": "no",
    },
    {
        "prompt": (
            "How would a typical person answer each of the following questions about causation?\n"
            "Suzy and Billy are working on a project that is very important for our nation's security. "
            "The boss tells them both: \"Be sure that you are here at exactly 9 am. It is absolutely "
            "essential that you arrive at that time.\" Both Billy and Suzy arrive at 9 am. As it happens, "
            "there was a motion detector installed in the room where they arrived. The motion detector "
            "was set up to be triggered if at least one person appeared in the room at the same time. "
            "So the motion detector went off. Did Billy cause the motion detector to go off?\n"
            "Options:\n- Yes\n- No\n\n"
            "Think step by step, then answer Yes or No."
        ),
        "answer": "yes",
    },
    {
        "prompt": (
            "How would a typical person answer each of the following questions about causation?\n"
            "George and his sister Lena reunite at their parents' house for Thanksgiving. Whereas "
            "George just got into medical school, Lena is unhappy in her marriage and recently lost "
            "her job. Over the course of the day, George and Lena get into a number of heated arguments. "
            "Later in the afternoon they play a game of darts. They split the first two games, and the "
            "third game is close until the end. Who will win comes down to George's last shot. If he "
            "hits a high point region, he wins; if he hits a low point region, Lena wins. George thinks "
            "of the difficult time Lena is having, and he really wants to let her win. He aims the dart "
            "at the low point region. He sets up his shot and the dart lands in the low point region. "
            "After his shot, Lena wins the game and is very happy. Did George hit the low point region "
            "intentionally?\n"
            "Options:\n- Yes\n- No\n\n"
            "Think step by step, then answer Yes or No."
        ),
        "answer": "yes",
    },
]
# ─── Date Understanding (multiple choice letter) ──────────────────
# Answers are lowercase option letters matching one of the listed choices.
DATE_QUESTIONS = [
    {
        "prompt": (
            "Today is Christmas Eve of 1937. What is the date 10 days ago in MM/DD/YYYY?\n"
            "Options:\n"
            "(A) 12/14/2026\n(B) 12/14/1950\n(C) 12/14/2007\n"
            "(D) 12/14/1937\n(E) 07/14/1938\n(F) 12/14/1988\n\n"
            "Think step by step, then give your answer as a single letter."
        ),
        "answer": "d",
    },
    {
        "prompt": (
            "Tomorrow is 11/12/2019. What is the date one year ago from today in MM/DD/YYYY?\n"
            "Options:\n"
            "(A) 09/04/2018\n(B) 11/11/2018\n(C) 08/25/2018\n"
            "(D) 11/02/2018\n(E) 11/04/2018\n\n"
            "Think step by step, then give your answer as a single letter."
        ),
        "answer": "b",
    },
    {
        "prompt": (
            "Jane and John married on Jan 2, 1958. It is their 5-year anniversary today. "
            "What is the date tomorrow in MM/DD/YYYY?\n"
            "Options:\n"
            "(A) 01/11/1961\n(B) 01/03/1963\n(C) 01/18/1961\n"
            "(D) 10/14/1960\n(E) 01/03/1982\n(F) 12/03/1960\n\n"
            "Think step by step, then give your answer as a single letter."
        ),
        "answer": "b",
    },
]
# ─── Logical Deduction (multiple choice letter) ───────────────────
# Three-object ordering puzzles; answers are lowercase option letters.
LOGIC_QUESTIONS = [
    {
        "prompt": (
            "The following paragraphs each describe a set of three objects arranged in a fixed order. "
            "The statements are logically consistent within each paragraph.\n"
            "In a golf tournament, there were three golfers: Amy, Eli, and Eve. "
            "Eve finished above Amy. Eli finished below Amy.\n"
            "Options:\n"
            "(A) Amy finished last\n(B) Eli finished last\n(C) Eve finished last\n\n"
            "Think step by step, then give your answer as a single letter."
        ),
        "answer": "b",
    },
    {
        "prompt": (
            "The following paragraphs each describe a set of three objects arranged in a fixed order. "
            "The statements are logically consistent within each paragraph.\n"
            "On a shelf, there are three books: a white book, a green book, and an orange book. "
            "The green book is to the right of the white book. The orange book is the rightmost.\n"
            "Options:\n"
            "(A) The white book is the leftmost\n(B) The green book is the leftmost\n"
            "(C) The orange book is the leftmost\n\n"
            "Think step by step, then give your answer as a single letter."
        ),
        "answer": "a",
    },
    {
        "prompt": (
            "The following paragraphs each describe a set of three objects arranged in a fixed order. "
            "The statements are logically consistent within each paragraph.\n"
            "On a shelf, there are three books: a red book, a gray book, and a white book. "
            "The white book is to the left of the gray book. The red book is the second from the left.\n"
            "Options:\n"
            "(A) The red book is the leftmost\n(B) The gray book is the leftmost\n"
            "(C) The white book is the leftmost\n\n"
            "Think step by step, then give your answer as a single letter."
        ),
        "answer": "c",
    },
]
# ─── Navigation (Yes/No) ──────────────────────────────────────────
# "Return to start?" path-following puzzles; answers stored lowercase.
NAV_QUESTIONS = [
    {
        "prompt": (
            "If you follow these instructions, do you return to the starting point? "
            "Turn left. Turn around. Turn left. Take 7 steps. Take 2 steps. Take 4 steps. Take 8 steps.\n"
            "Options:\n- Yes\n- No\n\n"
            "Think step by step, then answer Yes or No."
        ),
        "answer": "no",
    },
    {
        "prompt": (
            "If you follow these instructions, do you return to the starting point? "
            "Turn around. Take 1 step. Take 6 steps. Turn around. Take 6 steps. Take 9 steps. Take 1 step.\n"
            "Options:\n- Yes\n- No\n\n"
            "Think step by step, then answer Yes or No."
        ),
        "answer": "no",
    },
    {
        "prompt": (
            "If you follow these instructions, do you return to the starting point? "
            "Always face forward. Take 2 steps right. Take 9 steps left. Take 7 steps right.\n"
            "Options:\n- Yes\n- No\n\n"
            "Think step by step, then answer Yes or No."
        ),
        "answer": "yes",
    },
]
# ─── GSM8K Word Problems (number answer) ──────────────────────────
# Answers are digit strings, matched exactly after normalization.
GSM_QUESTIONS = [
    {
        "prompt": (
            "There are 15 trees in the grove. Grove workers will plant trees in the grove today. "
            "After they are done, there will be 21 trees. How many trees did the grove workers plant today?\n\n"
            "Solve step by step. End with 'The answer is [NUMBER]'."
        ),
        "answer": "6",
    },
    {
        "prompt": (
            "If there are 3 cars in the parking lot and 2 more cars arrive, how many cars are in the parking lot?\n\n"
            "Solve step by step. End with 'The answer is [NUMBER]'."
        ),
        "answer": "5",
    },
    {
        "prompt": (
            "Leah had 32 chocolates and her sister had 42. If they ate 35, how many pieces do they have left in total?\n\n"
            "Solve step by step. End with 'The answer is [NUMBER]'."
        ),
        "answer": "39",
    },
    {
        "prompt": (
            "Jason had 20 lollipops. He gave Denny some lollipops. Now Jason has 12 lollipops. "
            "How many lollipops did Jason give to Denny?\n\n"
            "Solve step by step. End with 'The answer is [NUMBER]'."
        ),
        "answer": "8",
    },
    {
        "prompt": (
            "Shawn has five toys. For Christmas, he got two toys each from his mom and dad. "
            "How many toys does he have now?\n\n"
            "Solve step by step. End with 'The answer is [NUMBER]'."
        ),
        "answer": "9",
    },
]
# ─── All questions with types ─────────────────────────────────────
# Flatten every category bank into one list, tagging each question dict
# with its category name under the "type" key (order preserved).
ALL_REASONING = [
    {**question, "type": category}
    for category, bank in [
        ("causal", CAUSAL_QUESTIONS),
        ("date", DATE_QUESTIONS),
        ("logic", LOGIC_QUESTIONS),
        ("nav", NAV_QUESTIONS),
        ("gsm", GSM_QUESTIONS),
    ]
    for question in bank
]
# Alias for sweep.py import
REASONING_QUESTIONS = ALL_REASONING
def extract_final_answer(response: str) -> str:
    """Extract the final answer token from a chain-of-thought response.

    Tries, in order:
      1. An explicit "the answer is X" phrase, where X runs up to
         punctuation, a newline, or end-of-string. (The end-of-string
         alternative matters: prompts ask models to end with exactly
         "The answer is [NUMBER]", which has no trailing punctuation.)
      2. A standalone Yes/No word on the last line.
      3. A standalone option letter A-F on the last line. Word boundaries
         prevent the capital A in words like "Answer:" from being
         mistaken for option A.
    Falls back to the stripped last line of the response.
    """
    match = re.search(r'the answer is\s+(.+?)(?:[\.\!\n\r]|$)', response, re.IGNORECASE)
    if match:
        return match.group(1).strip()
    last_line = response.split('\n')[-1]
    match = re.search(r'\b(yes|no)\b', last_line, re.IGNORECASE)
    if match:
        return match.group(1).strip()
    match = re.search(r'\(?\b([A-F])\b\)?', last_line)
    if match:
        return match.group(1).strip()
    return response.strip().split('\n')[-1].strip()
def score_reasoning_response(question: dict, response: str) -> float:
    """Score a reasoning question. Returns 0 or 1.

    The extracted answer and the reference are normalized (lowercased,
    non-alphanumerics stripped) before exact comparison. For Yes/No
    questions there is a lenient fallback on the raw last line: credit is
    given when the expected word appears and the line is not ambiguous
    about yes vs. no.
    """
    if response is None:
        return 0.0
    expected = re.sub(r'[^a-z0-9]', '', question["answer"].lower().strip())
    got = re.sub(r'[^a-z0-9]', '', extract_final_answer(response).lower().strip())
    if got == expected:
        return 1.0
    if expected in ("yes", "no"):
        tail = response.strip().split('\n')[-1].lower()
        if expected in tail and (expected == "yes") != ("no" in tail):
            return 1.0
    return 0.0
if __name__ == "__main__":
    # Manual smoke test: show per-category question counts and a couple of
    # truncated sample prompts with their reference answers.
    print(f"Reasoning Probe: {len(ALL_REASONING)} questions")
    print(f" Causal: {len(CAUSAL_QUESTIONS)}")
    print(f" Date: {len(DATE_QUESTIONS)}")
    print(f" Logic: {len(LOGIC_QUESTIONS)}")
    print(f" Nav: {len(NAV_QUESTIONS)}")
    print(f" GSM: {len(GSM_QUESTIONS)}")
    print("\nSample questions:")
    for q in ALL_REASONING[:2]:
        print(f"\n [{q['type']}] answer={q['answer']}")
        print(f" {q['prompt'][:80]}...")

447
sweep.py Normal file
View file

@ -0,0 +1,447 @@
#!/usr/bin/env python3
"""
RYS Layer Duplication Sweep
Orchestrates the search for optimal layer duplication configuration:
1. Generate modified GGUF with duplicated layers
2. Start llama-server with the modified model
3. Run math + EQ probes
4. Score and record results
5. Print live results table
6. Kill server, repeat
Usage:
python sweep.py \
--model /path/to/model.gguf \
--llama-server /path/to/llama-server \
--tmpdir /dev/shm/rys \
--results results.jsonl
The sweep strategy:
Pass 1: 8-layer blocks at stride 4 across the middle
Pass 2: Refine within the hot zone with smaller blocks
"""
import argparse
import json
import os
import signal
import subprocess
import sys
import time
from datetime import datetime
from pathlib import Path
import requests
from gguf_surgery import duplicate_layers
from math_probe import MATH_QUESTIONS, score_math_response
from eq_probe import EQ_SCENARIOS, build_eq_prompt, parse_eq_response, score_eq_response
from reasoning_probe import REASONING_QUESTIONS, score_reasoning_response
# Server config
DEFAULT_PORT = 8099  # local llama-server port used for all probe requests
SERVER_STARTUP_TIMEOUT = 120 # seconds to wait for /health to report ready
REQUEST_TIMEOUT = 60 # seconds per completion request
def wait_for_server(port: int, timeout: int = SERVER_STARTUP_TIMEOUT) -> bool:
    """Poll llama-server's /health endpoint until it reports ready.

    Args:
        port: Local port the server listens on.
        timeout: Maximum seconds to keep polling.

    Returns:
        True once /health answers HTTP 200 with {"status": "ok"},
        False if the timeout elapses first.
    """
    url = f"http://127.0.0.1:{port}/health"
    start = time.time()
    while time.time() - start < timeout:
        try:
            r = requests.get(url, timeout=2)
            if r.status_code == 200:
                # A starting server may answer with a non-JSON or partial
                # body; r.json() raising is treated as "not ready yet"
                # (ValueError covers JSON decode errors) instead of
                # crashing the sweep.
                data = r.json()
                if data.get("status") == "ok":
                    return True
        except (requests.RequestException, ValueError):
            # RequestException is the base class covering ConnectionError,
            # Timeout, and other transport failures.
            pass
        time.sleep(1)
    return False
def start_server(llama_server_path: str, model_path: str, port: int,
                 extra_args: list[str] | None = None) -> subprocess.Popen:
    """Start llama-server and return the process handle.

    Child stdout/stderr are redirected to /tmp/rys_server_<port>.log so
    long runs never block on a full pipe. The open log handle and its path
    are stashed on the Popen object as private attributes, read later by
    dump_server_log() and closed by stop_server().

    Args:
        llama_server_path: Path to the llama-server binary.
        model_path: GGUF model to load.
        port: Port for the server to listen on.
        extra_args: Additional CLI args appended to the command line.

    Returns:
        The running server process (still starting up; see wait_for_server).
    """
    cmd = [
        llama_server_path,
        "-m", model_path,
        "--port", str(port),
        "-c", "4096", # small context for probe eval
        "-ngl", "99", # offload all layers to GPU
        "--flash-attn", "on",
        "--cache-type-k", "q8_0",
        "--cache-type-v", "q8_0",
        "--no-warmup",
        "-np", "1", # single slot
    ]
    if extra_args:
        cmd.extend(extra_args)
    print(f" [CMD] {' '.join(cmd)}", flush=True)
    # Let server output go to a log file so we can debug without pipe deadlocks
    log_file = open(log_path := Path(f"/tmp/rys_server_{port}.log"), "w") if False else None
    log_path = Path(f"/tmp/rys_server_{port}.log")
    log_file = open(log_path, "w")
    proc = subprocess.Popen(cmd, stdout=log_file, stderr=subprocess.STDOUT)
    proc._log_file = log_file # keep reference so it doesn't get GC'd
    proc._log_path = log_path
    print(f" [PID] Server started as PID {proc.pid}, log: {log_path}", flush=True)
    return proc
def stop_server(proc: subprocess.Popen):
    """Terminate the server, escalating to SIGKILL if SIGTERM is ignored
    for 10 seconds, then close the log file handle attached by
    start_server (if any)."""
    if proc.poll() is None:
        proc.terminate()
        try:
            proc.wait(timeout=10)
        except subprocess.TimeoutExpired:
            proc.kill()
            proc.wait()
    # Close the log file (absent when the caller built the Popen directly)
    log_handle = getattr(proc, '_log_file', None)
    if log_handle is not None:
        log_handle.close()
def dump_server_log(proc: subprocess.Popen, tail_lines: int = 30):
    """Print the last *tail_lines* lines of the server's log file to
    stderr for debugging. Silently does nothing when the process has no
    recorded log path or the file is gone."""
    log_path = getattr(proc, '_log_path', None)
    if log_path is None or not log_path.exists():
        return
    lines = log_path.read_text().splitlines()
    print(f" --- Server log (last {tail_lines} lines) ---", file=sys.stderr)
    for line in lines[-tail_lines:]:
        print(f" | {line}", file=sys.stderr)
    print(f" --- End server log ---", file=sys.stderr)
def query_model(prompt: str, port: int, max_tokens: int = 64) -> str | None:
    """Send one chat completion request to llama-server.

    Args:
        prompt: User message content.
        port: llama-server port.
        max_tokens: Completion length cap (temperature fixed at 0 for
            reproducible probing).

    Returns:
        The assistant message content, or None on any transport or
        protocol failure (a warning is printed to stderr either way).
    """
    url = f"http://127.0.0.1:{port}/v1/chat/completions"
    payload = {
        "model": "test",
        "messages": [
            {"role": "user", "content": prompt}
        ],
        "max_tokens": max_tokens,
        "temperature": 0.0,
    }
    try:
        r = requests.post(url, json=payload, timeout=REQUEST_TIMEOUT)
        if r.status_code == 200:
            data = r.json()
            return data["choices"][0]["message"]["content"]
        else:
            print(f" [WARN] Server returned {r.status_code}", file=sys.stderr)
            return None
    except (requests.RequestException, ValueError, KeyError, IndexError) as e:
        # RequestException: base class for ConnectionError/Timeout and other
        # transport errors. ValueError: malformed JSON body. KeyError/
        # IndexError: unexpected response schema. Previously any of the
        # latter three would crash the whole sweep mid-run.
        print(f" [WARN] Request failed: {e}", file=sys.stderr)
        return None
def run_math_probe(port: int) -> float:
    """Run every math question and return the mean score (0-1).

    A failed request (None reply) counts as a zero score rather than
    being dropped from the average.
    """
    scores = []
    for question, answer in MATH_QUESTIONS:
        reply = query_model(question, port, max_tokens=48)
        scores.append(score_math_response(answer, reply) if reply is not None else 0.0)
    return sum(scores) / len(scores) if scores else 0.0
def run_eq_probe(port: int) -> float:
    """Run every EQ scenario and return the mean score (0-100).

    A failed request (None reply) counts as zero instead of being dropped.
    """
    totals = []
    for scenario in EQ_SCENARIOS:
        reply = query_model(build_eq_prompt(scenario), port, max_tokens=48)
        if reply is None:
            totals.append(0.0)
            continue
        predicted = parse_eq_response(reply, len(scenario["emotions"]))
        totals.append(score_eq_response(scenario["reference"], predicted))
    return sum(totals) / len(totals) if totals else 0.0
def run_reasoning_probe(port: int) -> dict:
    """Run every reasoning question.

    Returns:
        {"categories": {category: mean score}, "overall": mean over all
        questions}, with scores in 0-1. Failed requests score 0 via
        score_reasoning_response's None handling.
    """
    by_category: dict[str, list[float]] = {}
    for q in REASONING_QUESTIONS:
        reply = query_model(q["prompt"], port, max_tokens=512)
        by_category.setdefault(q["type"], []).append(
            score_reasoning_response(q, reply))
    cat_scores = {cat: (sum(vals) / len(vals) if vals else 0.0)
                  for cat, vals in by_category.items()}
    all_scores = [s for vals in by_category.values() for s in vals]
    overall = sum(all_scores) / len(all_scores) if all_scores else 0.0
    return {"categories": cat_scores, "overall": overall}
def run_evaluation(port: int) -> dict:
    """Run the math, EQ, and reasoning probes (in that order) against the
    server on *port* and bundle all scores into a single flat dict."""
    math_result = run_math_probe(port)
    eq_result = run_eq_probe(port)
    reasoning_result = run_reasoning_probe(port)
    return {
        "math_score": math_result,
        "eq_score": eq_result,
        "reasoning_score": reasoning_result["overall"],
        "reasoning_cats": reasoning_result["categories"],
    }
def print_results_table(results: list[dict], baseline: dict | None = None):
    """Print a live-updating results table.

    Deltas are relative to the baseline row. The "Combined Δ" column is
    eq_delta + reasoning_delta*100 (reasoning is a 0-1 fraction, scaled to
    match EQ's 0-100 range); the math delta is shown but deliberately not
    part of the combined score.
    """
    print("\n" + "=" * 105)
    print(f"{'Config':>12} {'Layers':>8} {'Math':>8} {'EQ':>8} {'Reason':>8} "
          f"{'Math Δ':>8} {'EQ Δ':>8} {'Reas Δ':>8} {'Combined Δ':>11}")
    print("-" * 105)
    if baseline:
        # .get with default 0 tolerates older result rows recorded before
        # the reasoning probe existed.
        brs = baseline.get('reasoning_score', 0)
        print(f"{'BASELINE':>12} {'0':>8} "
              f"{baseline['math_score']:>8.4f} {baseline['eq_score']:>8.2f} {brs:>8.2%} "
              f"{'---':>8} {'---':>8} {'---':>8} {'---':>11}")
        print("-" * 105)
    for r in results:
        config = f"({r['dup_start']},{r['dup_end']})"
        n_dup = r['dup_end'] - r['dup_start']
        rs = r.get('reasoning_score', 0)
        if baseline:
            math_delta = r['math_score'] - baseline['math_score']
            eq_delta = r['eq_score'] - baseline['eq_score']
            reas_delta = rs - baseline.get('reasoning_score', 0)
            # Combined: weight EQ and reasoning more than math
            combined = eq_delta + (reas_delta * 100)
            math_d = f"{math_delta:>+8.4f}"
            eq_d = f"{eq_delta:>+8.2f}"
            reas_d = f"{reas_delta:>+8.2%}"
            comb_d = f"{combined:>+11.2f}"
        else:
            # Without a baseline there is nothing to diff against.
            math_d = eq_d = reas_d = comb_d = "---"
        print(f"{config:>12} {n_dup:>8} "
              f"{r['math_score']:>8.4f} {r['eq_score']:>8.2f} {rs:>8.2%} "
              f"{math_d} {eq_d} {reas_d} {comb_d}")
    print("=" * 105)
    sys.stdout.flush()
def generate_sweep_configs(n_layers: int, block_sizes: list[int],
start_min: int = 4, start_max: int = None,
stride: int = 4) -> list[tuple[int, int]]:
"""
Generate (dup_start, dup_end) configs for the sweep.
Args:
n_layers: Total layers in the model
block_sizes: List of block sizes to try (e.g., [8])
start_min: Earliest layer to start duplication
start_max: Latest layer to start (default: n_layers - max(block_sizes) - 4)
stride: Step between start positions
"""
if start_max is None:
start_max = n_layers - max(block_sizes) - 4
configs = []
for bs in block_sizes:
for start in range(start_min, start_max + 1, stride):
end = start + bs
if end <= n_layers:
configs.append((start, end))
return configs
def main():
    """Drive the sweep: run a baseline eval on the unmodified model, then
    for each (dup_start, dup_end) config generate a modified GGUF, serve
    it, probe it, record the scores, and clean up. Results append to a
    JSONL file so an interrupted sweep resumes where it left off."""
    parser = argparse.ArgumentParser(description="RYS Layer Duplication Sweep")
    parser.add_argument("--model", required=True, help="Path to input GGUF model")
    parser.add_argument("--llama-server", required=True, help="Path to llama-server binary")
    parser.add_argument("--tmpdir", default="/dev/shm/rys",
                        help="Temp directory for modified GGUFs (use tmpfs/RAM)")
    parser.add_argument("--results", default="rys_results.jsonl",
                        help="Output results file (JSONL)")
    parser.add_argument("--port", type=int, default=DEFAULT_PORT)
    parser.add_argument("--block-sizes", type=int, nargs="+", default=[8],
                        help="Block sizes to sweep (default: 8)")
    parser.add_argument("--stride", type=int, default=4,
                        help="Stride between start positions (default: 4)")
    parser.add_argument("--start-min", type=int, default=4,
                        help="Earliest layer to start duplication")
    parser.add_argument("--start-max", type=int, default=None,
                        help="Latest layer to start duplication")
    parser.add_argument("--skip-baseline", action="store_true",
                        help="Skip baseline run (use if already in results)")
    # NOTE(review): REMAINDER swallows everything after it, so --server-args
    # really must be the last flag on the command line.
    parser.add_argument("--server-args", nargs=argparse.REMAINDER, default=[],
                        help="Extra args to pass to llama-server (must be last)")
    args = parser.parse_args()
    model_path = Path(args.model).resolve()
    tmpdir = Path(args.tmpdir)
    tmpdir.mkdir(parents=True, exist_ok=True)
    results_path = Path(args.results)
    results = []
    baseline = None
    # Load existing results if resuming
    if results_path.exists():
        with open(results_path) as f:
            for line in f:
                line = line.strip()
                if line:
                    entry = json.loads(line)
                    if entry.get("is_baseline"):
                        baseline = entry
                    else:
                        results.append(entry)
        print(f"Loaded {len(results)} existing results + baseline={baseline is not None}")
    # Run baseline (unmodified model)
    if not args.skip_baseline and baseline is None:
        print("\n>>> Running BASELINE evaluation...")
        proc = start_server(args.llama_server, str(model_path), args.port, args.server_args)
        try:
            if not wait_for_server(args.port):
                # Without a baseline every later delta is meaningless, so
                # a failed baseline server aborts the whole sweep.
                print("ERROR: Server failed to start for baseline", file=sys.stderr)
                dump_server_log(proc)
                stop_server(proc)
                sys.exit(1)
            print(" Server ready. Running probes...")
            eval_result = run_evaluation(args.port)
            # dup_start/dup_end of -1 mark the baseline row in the JSONL.
            baseline = {
                "is_baseline": True,
                "dup_start": -1,
                "dup_end": -1,
                "math_score": eval_result["math_score"],
                "eq_score": eval_result["eq_score"],
                "reasoning_score": eval_result["reasoning_score"],
                "reasoning_cats": eval_result.get("reasoning_cats", {}),
                "timestamp": datetime.now().isoformat(),
            }
            with open(results_path, "a") as f:
                f.write(json.dumps(baseline) + "\n")
            brs = baseline['reasoning_score']
            print(f" Baseline: math={baseline['math_score']:.4f} eq={baseline['eq_score']:.2f} reasoning={brs:.2%}")
        finally:
            stop_server(proc)
    # Get model layer count from the GGUF metadata
    from gguf import GGUFReader
    reader = GGUFReader(str(model_path), 'r')
    arch_field = reader.get_field('general.architecture')
    arch = arch_field.contents()
    block_count_field = reader.get_field(f'{arch}.block_count')
    n_layers = block_count_field.contents()
    print(f"\nModel: {model_path.name}")
    print(f"Architecture: {arch}, Layers: {n_layers}")
    # Generate sweep configurations
    configs = generate_sweep_configs(
        n_layers=n_layers,
        block_sizes=args.block_sizes,
        start_min=args.start_min,
        start_max=args.start_max,
        stride=args.stride,
    )
    # Filter out already-completed configs
    done = {(r["dup_start"], r["dup_end"]) for r in results}
    configs = [(s, e) for s, e in configs if (s, e) not in done]
    print(f"Configs to test: {len(configs)}")
    if configs:
        print(f" Range: ({configs[0][0]},{configs[0][1]}) to ({configs[-1][0]},{configs[-1][1]})")
    print_results_table(results, baseline)
    for idx, (dup_start, dup_end) in enumerate(configs):
        n_dup = dup_end - dup_start
        config_str = f"({dup_start},{dup_end})"
        print(f"\n>>> [{idx+1}/{len(configs)}] Testing config {config_str} "
              f"(+{n_dup} layers)...")
        # Generate modified GGUF
        modified_path = tmpdir / f"rys_{dup_start}_{dup_end}.gguf"
        print(f" Generating modified GGUF...")
        try:
            duplicate_layers(
                str(model_path), str(modified_path),
                dup_start, dup_end, verbose=False
            )
        except Exception as e:
            # A failed surgery only skips this config; the sweep goes on.
            print(f" ERROR generating GGUF: {e}", file=sys.stderr)
            continue
        # Start server with modified model
        print(f" Starting server...")
        proc = start_server(
            args.llama_server, str(modified_path), args.port, args.server_args
        )
        try:
            if not wait_for_server(args.port):
                # Some duplication configs may simply not load; log and
                # move on to the next config.
                print(f" ERROR: Server failed to start for {config_str}", file=sys.stderr)
                dump_server_log(proc)
                print(f" Check server log above for details.", file=sys.stderr)
                continue
            print(f" Server ready. Running probes...")
            eval_result = run_evaluation(args.port)
            entry = {
                "dup_start": dup_start,
                "dup_end": dup_end,
                "n_dup_layers": n_dup,
                "math_score": eval_result["math_score"],
                "eq_score": eval_result["eq_score"],
                "reasoning_score": eval_result["reasoning_score"],
                "reasoning_cats": eval_result.get("reasoning_cats", {}),
                "timestamp": datetime.now().isoformat(),
            }
            results.append(entry)
            # Append to results file
            with open(results_path, "a") as f:
                f.write(json.dumps(entry) + "\n")
            print_results_table(results, baseline)
        finally:
            stop_server(proc)
            # Clean up modified GGUF to free tmpfs space
            if modified_path.exists():
                modified_path.unlink()
                print(f" Cleaned up {modified_path.name}")
    print("\n\nSweep complete!")
    print_results_table(results, baseline)
if __name__ == "__main__":
    main()

114
visualize.py Normal file
View file

@ -0,0 +1,114 @@
#!/usr/bin/env python3
"""
Visualize RYS sweep results.
Reads the JSONL output from sweep.py, prints ranked table and bar chart.
"""
import json
import sys
def load_results(path: str):
    """Parse a sweep results JSONL file.

    Returns:
        (baseline, results): the entry flagged "is_baseline" (None if
        absent; the last one wins if several exist) and the list of every
        other entry in file order. Blank lines are skipped.
    """
    with open(path) as fh:
        records = [json.loads(text) for text in map(str.strip, fh) if text]
    baseline = None
    results = []
    for record in records:
        if record.get("is_baseline"):
            baseline = record
        else:
            results.append(record)
    return baseline, results
def print_ranked(baseline, results):
    """Print results ranked by combined delta (best first).

    NOTE: mutates each entry in *results*, attaching "math_delta",
    "eq_delta" and "combined" keys. Here combined = math_delta*100 +
    eq_delta — a different weighting than sweep.py's live table, which
    combines EQ with the reasoning delta instead.
    """
    if not baseline:
        print("No baseline found in results!")
        return
    for r in results:
        math_delta = r["math_score"] - baseline["math_score"]
        eq_delta = r["eq_score"] - baseline["eq_score"]
        r["math_delta"] = math_delta
        r["eq_delta"] = eq_delta
        r["combined"] = (math_delta * 100) + eq_delta
    ranked = sorted(results, key=lambda x: x["combined"], reverse=True)
    print(f"\nBaseline: math={baseline['math_score']:.4f} eq={baseline['eq_score']:.2f}")
    print()
    print(f"{'Rank':>4} {'Config':>12} {'Dup':>4} "
          f"{'Math':>8} {'EQ':>8} "
          f"{'Math Δ':>9} {'EQ Δ':>8} {'Combined':>10}")
    print("-" * 80)
    for i, r in enumerate(ranked):
        config = f"({r['dup_start']},{r['dup_end']})"
        n_dup = r['dup_end'] - r['dup_start']
        # Row marker: "+" improved over baseline, "!" clearly degraded
        # (more than 5 combined points lost), " " roughly neutral.
        if r["combined"] > 0:
            marker = "+"
        elif r["combined"] < -5:
            marker = "!"
        else:
            marker = " "
        print(f"{i+1:>4} {config:>12} {n_dup:>4} "
              f"{r['math_score']:>8.4f} {r['eq_score']:>8.2f} "
              f"{r['math_delta']:>+9.4f} {r['eq_delta']:>+8.2f} "
              f"{r['combined']:>+10.2f} {marker}")
    if ranked:
        best = ranked[0]
        worst = ranked[-1]
        print()
        print(f"Best: ({best['dup_start']},{best['dup_end']}) combined={best['combined']:+.2f}")
        print(f"Worst: ({worst['dup_start']},{worst['dup_end']}) combined={worst['combined']:+.2f}")
def print_bar_chart(baseline, results):
    """Print a horizontal bar chart of combined deltas, sorted by start
    position.

    Recomputes and attaches "combined" (math_delta*100 + eq_delta) on each
    entry. Bars scale so the largest |combined| spans `half` columns:
    '#' to the right of the baseline axis '|' for positive deltas, '='
    to its left for negative ones.
    """
    if not baseline or not results:
        return
    for r in results:
        math_delta = r["math_score"] - baseline["math_score"]
        eq_delta = r["eq_score"] - baseline["eq_score"]
        r["combined"] = (math_delta * 100) + eq_delta
    ordered = sorted(results, key=lambda x: x["dup_start"])
    max_val = max(abs(r["combined"]) for r in ordered)
    if max_val == 0:
        # All deltas zero: avoid division by zero; bars all render empty.
        max_val = 1
    half = 20  # columns on each side of the baseline axis
    print(f"\nCombined delta (baseline = |):")
    print(f"{'Config':>12} {'negative':<{half}}|{'positive':<{half}}")
    for r in ordered:
        config = f"({r['dup_start']},{r['dup_end']})"
        val = r["combined"]
        bar_len = int(abs(val) / max_val * half)
        if val >= 0:
            bar = " " * half + "|" + "#" * bar_len
        else:
            # Right-align the negative bar so it ends at the axis.
            pad = half - bar_len
            bar = " " * pad + "=" * bar_len + "|"
        print(f"{config:>12} {bar} {val:+.2f}")
if __name__ == "__main__":
    # CLI: python visualize.py <results.jsonl>
    if len(sys.argv) < 2:
        print(f"Usage: {sys.argv[0]} <results.jsonl>")
        sys.exit(1)
    baseline, results = load_results(sys.argv[1])
    print_ranked(baseline, results)
    print_bar_chart(baseline, results)