tinyforge-zero/experiments/mbpp_seeded_cross_arch.py
Rana Usman 826f934d2e Ship every paper-referenced experiment script
Reorganizes the repo so every section of the paper has a corresponding
script. Previously only the core recipe + control + evals were here.

New subdirs:
- tts/             — test-time sampling (§2.2, §3.3): scaling sweep, HE, MATH-500,
                     AIME, 14B-recipe + TTS, 8B-raw-TTS control.
- experiments/     — every §3 finding as a runnable script:
                     · self_consistency (§3.4)
                     · recipe_x_tts_synergy (§3.5, novel)
                     · mbpp_seeded_cross_arch (§3.9)
                     · cross_domain_code_to_math (§3.10)
                     · self_correction_math_{naive,fixed} (§3.10, the
                       catastrophic-then-recovered case)
                     · math500_seeded_mining (§3.10 distribution mismatch)
                     · bcb_hard_eval (§3.10 distribution mismatch)
                     · recursive_bootstrap (§3.10 plateau)
                     · diversity_cued_mining (§3.10 low yield)
                     · aime_scaling (TTS curve)
                     · star_baseline_gsm8k (related-work baseline)
- evals/           — moved out of recipe/ (eval_raw, eval_plus, confirm)

Also adds: bootstrap_14b_4bit_harvest, curriculum_code, math_bootstrap to
recipe/ for completeness.

REPRODUCE.md now maps each paper section / table / figure to its exact
script and expected output.
2026-05-13 21:09:54 +05:00

241 lines
9.9 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""Self-bootstrap with MBPP-train as problem seeds + vLLM on H100.
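- Use the MBPP train split as PROBLEM seeds (no human solutions used). The full
  train split has 374 problems; this script loads the "sanitized" config, so the
  actual seed count may differ and is reported in the startup log.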
- For each: greedy attempt. If fails, sample N attempts at temp=0.8.
- Mine at-edge pairs (broken, fixed).
- Train LoRA. Eval on HumanEval + MBPP-test.
"""
import os, json, time, re, subprocess, tempfile, argparse, gc, random
# Hugging Face cache location and transfer flag; setdefault keeps any value
# that is already present in the environment.
# NOTE(review): these are assigned ahead of the torch/datasets imports,
# presumably so the libraries see them at import time — confirm before reordering.
os.environ.setdefault("HF_HOME", "/workspace/hf")
os.environ.setdefault("HF_HUB_ENABLE_HF_TRANSFER", "1")
# Unconditional override (not setdefault): any inherited value is replaced.
os.environ["TRANSFORMERS_VERBOSITY"] = "error"
import torch
from datasets import load_dataset
# Wall-clock reference taken at import time; every log line is relative to it.
T0 = time.time()


def log(m):
    """Print ``m`` prefixed with the seconds elapsed since ``T0`` and flush stdout."""
    elapsed = time.time() - T0
    print(f"[{elapsed:7.1f}s] {m}", flush=True)
def run_python(code, timeout=8):
    """Run ``code`` as a standalone script and report whether it exited cleanly.

    The source is written to a temporary ``.py`` file, executed in a subprocess
    with ``/tmp`` as the working directory, and the file is removed afterwards.

    Args:
        code: Python source text to execute.
        timeout: Seconds to wait before the run is abandoned.

    Returns:
        A ``(ok, err)`` tuple. ``ok`` is True only when the process exits with
        status 0. ``err`` is the first 200 characters of the child's stderr, or
        the literal string ``"timeout"`` when the time limit was hit.
    """
    import sys

    # Python reads source files as UTF-8, so write the file as UTF-8 rather
    # than in whatever the locale default happens to be.
    tmp = tempfile.NamedTemporaryFile("w", suffix=".py", delete=False, encoding="utf-8")
    path = tmp.name
    try:
        with tmp:
            tmp.write(code)
        # Use the interpreter running this script so candidates see the same
        # Python; fall back to PATH lookup if it is unknown.
        # errors="replace" keeps undecodable child output from raising here.
        r = subprocess.run(
            [sys.executable or "python3", path],
            capture_output=True,
            timeout=timeout,
            text=True,
            encoding="utf-8",
            errors="replace",
            cwd="/tmp",
        )
        return r.returncode == 0, (r.stderr or "")[:200]
    except subprocess.TimeoutExpired:
        return False, "timeout"
    finally:
        # Best-effort cleanup: a file that is already gone is not an error,
        # but unrelated exceptions (e.g. KeyboardInterrupt) must propagate.
        try:
            os.unlink(path)
        except OSError:
            pass
def vllm_gen(llm, prompts, max_new=400, temperature=0.0, n=1, stops=None):
    """Generate completions for ``prompts`` with a vLLM engine.

    Args:
        llm: Object exposing ``generate(prompts, sampling_params, use_tqdm=...)``.
        prompts: Prompt strings to complete.
        max_new: Value passed as ``max_tokens``.
        temperature: Sampling temperature; ``top_p`` is 0.95 when it is positive
            and 1.0 otherwise.
        n: Number of completions requested per prompt.
        stops: Stop strings; a falsy value selects the built-in default list.

    Returns:
        When ``n == 1``, a flat list with one completion string per prompt.
        Otherwise a list of lists, one inner list of completion strings per
        prompt.
    """
    from vllm import SamplingParams

    default_stops = ["\nclass ", "\nif __name__", "\n\nprint", "\n\ndef "]
    nucleus = 0.95 if temperature > 0 else 1.0
    params = SamplingParams(
        temperature=temperature,
        top_p=nucleus,
        max_tokens=max_new,
        n=n,
        stop=stops or default_stops,
    )
    results = llm.generate(prompts, params, use_tqdm=False)
    # The return shape depends on n: nested lists only when n != 1.
    if n != 1:
        return [[cand.text for cand in res.outputs] for res in results]
    return [res.outputs[0].text for res in results]
def he_prompt(p):
    """Return the record's ``prompt`` field unchanged for use as the model prompt."""
    prompt_text = p["prompt"]
    return prompt_text
def mbpp_prompt(p):
    """Build a comment-only prompt from a record's ``prompt`` and ``test_list``.

    The task text goes on a ``# Task:`` line, each test is listed under
    ``# Tests:`` with a ``# `` prefix, and the prompt ends with a blank line.
    """
    commented_tests = "\n# ".join(p["test_list"])
    task_line = f"# Task: {p['prompt']}\n"
    return f"{task_line}# Tests:\n# {commented_tests}\n\n"
def _mbpp_program(problem, completion):
    """Return ``completion`` followed by the problem's ``test_list`` lines."""
    return completion + "\n\n" + "\n".join(problem["test_list"])


def _humaneval_pass_count(problems, completions):
    """Count HumanEval completions whose program exits cleanly.

    Each program is the record's prompt plus the raw completion, the record's
    ``test`` code, and a ``check(<entry_point>)`` call, run under a 10 s timeout.
    """
    passed = 0
    for prob, completion in zip(problems, completions):
        program = (prob["prompt"] + completion + "\n\n" + prob["test"]
                   + f"\n\ncheck({prob['entry_point']})")
        ok, _ = run_python(program, timeout=10)
        if ok:
            passed += 1
    return passed


def _mbpp_pass_count(problems, completions):
    """Count MBPP completions that pass their ``test_list`` under a 10 s timeout."""
    passed = 0
    for prob, completion in zip(problems, completions):
        ok, _ = run_python(_mbpp_program(prob, completion), timeout=10)
        if ok:
            passed += 1
    return passed


def _first_passing_attempt(problem, attempts):
    """Return the first attempt that passes the problem's tests, else ``None``.

    Stops at the first success: only that attempt is ever used, so running the
    remaining ones would just cost subprocess time.
    """
    for attempt in attempts:
        ok, _ = run_python(_mbpp_program(problem, attempt), timeout=8)
        if ok:
            return attempt
    return None


def _make_example(tok, pair, max_len=1024):
    """Tokenize one mined pair into a fixed-length causal-LM training example.

    The prompt shows the task, its tests, the broken attempt and the first 120
    characters of its error; the target is the fixed attempt. Prompt positions
    and padding carry label ``-100`` so only target tokens are supervised.
    The sequence is truncated, then padded, to ``max_len`` tokens.
    """
    tests_block = "\n# ".join(pair["tests"])
    error_snippet = pair.get("error", "")[:120]
    user = (
        f"# Task: {pair['problem']}\n"
        f"# Tests:\n# {tests_block}\n"
        f"# My broken attempt:\n{pair['broken']}\n"
        f"# Error: {error_snippet}\n"
        f"# Corrected:\n"
    )
    full_ids = tok(user + pair["fixed"], add_special_tokens=False)["input_ids"][:max_len]
    user_len = len(tok(user, add_special_tokens=False)["input_ids"])
    # The prompt is tokenized separately to find how many leading positions to
    # mask; clamp in case truncation cut into the prompt itself.
    n_masked = min(user_len, len(full_ids))
    labels = [-100] * n_masked + list(full_ids[n_masked:])
    pad = max_len - len(full_ids)
    return {
        "input_ids": full_ids + [tok.pad_token_id] * pad,
        "attention_mask": [1] * len(full_ids) + [0] * pad,
        "labels": labels + [-100] * pad,
    }


def _train_lora_adapter(model_name, tok, pairs, out_dir):
    """Fine-tune a LoRA adapter on ``pairs`` and return the adapter directory.

    Loads the base model in bfloat16 on ``cuda:0``, attaches rank-16 LoRA to
    the attention projections, trains for two epochs, saves the adapter under
    ``{out_dir}/adapter`` and then drops the model and clears the CUDA cache.
    """
    from transformers import AutoModelForCausalLM, TrainingArguments, Trainer
    from datasets import Dataset as HFDataset
    from peft import LoraConfig, get_peft_model

    model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.bfloat16, device_map="cuda:0")
    lora_cfg = LoraConfig(r=16, lora_alpha=32, lora_dropout=0.05, bias="none",
                          target_modules=["q_proj", "k_proj", "v_proj", "o_proj"], task_type="CAUSAL_LM")
    model = get_peft_model(model, lora_cfg)
    ds = HFDataset.from_list([_make_example(tok, r) for r in pairs])
    targs = TrainingArguments(
        output_dir=f"{out_dir}/ckpt", num_train_epochs=2,
        per_device_train_batch_size=2, gradient_accumulation_steps=4,
        learning_rate=1e-4, bf16=True, logging_steps=20,
        save_strategy="no", report_to="none", remove_unused_columns=False, warmup_ratio=0.05,
    )
    # NOTE(review): newer transformers releases appear to rename `tokenizer=`
    # to `processing_class=` — confirm against the pinned version.
    Trainer(model=model, args=targs, train_dataset=ds, tokenizer=tok).train()
    log("training done")
    adapter_dir = f"{out_dir}/adapter"
    model.save_pretrained(adapter_dir)
    del model
    gc.collect()
    torch.cuda.empty_cache()
    return adapter_dir


def main():
    """Run the MBPP-seeded self-bootstrap experiment end to end.

    Steps, in order:
      1. Parse ``--model``, ``--attempts_per``, ``--max_pairs`` and ``--tag``.
      2. Score the base model greedily on HumanEval and the first 200
         MBPP-sanitized test problems.
      3. Run a greedy attempt on every MBPP-sanitized train problem; for each
         failure, sample ``--attempts_per`` attempts at temperature 0.8 and
         keep (greedy failure, first passing sample) as a pair, up to
         ``--max_pairs`` pairs. Pairs are written to ``pairs.jsonl``.
      4. Train a LoRA adapter on the pairs.
      5. Re-score both benchmarks with the adapter, write ``result.json`` and
         print a summary.

    Returns early, without training, when no train problem fails greedily or
    when fewer than five pairs are mined.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--model", required=True)
    ap.add_argument("--attempts_per", type=int, default=8)
    ap.add_argument("--max_pairs", type=int, default=200)
    ap.add_argument("--tag", required=True)
    args = ap.parse_args()

    out_dir = f"/workspace/selfmine_mbpp/{args.tag}"
    os.makedirs(out_dir, exist_ok=True)

    # Import every vLLM name up front so a missing module fails before the
    # long generation and training stages instead of after them.
    from vllm import LLM, SamplingParams
    from vllm.lora.request import LoRARequest
    from transformers import AutoTokenizer

    log(f"loading {args.model} into vLLM")
    tok = AutoTokenizer.from_pretrained(args.model)
    if tok.pad_token is None:
        tok.pad_token = tok.eos_token
    llm = LLM(model=args.model, dtype="bfloat16", gpu_memory_utilization=0.85, max_model_len=2048)
    log(" loaded")

    # --- Load benchmarks
    he = list(load_dataset("openai_humaneval", split="test"))
    mbpp_test = list(load_dataset("mbpp", "sanitized", split="test"))[:200]
    mbpp_train = list(load_dataset("mbpp", "sanitized", split="train"))
    log(f" HE: {len(he)}, MBPP-test: {len(mbpp_test)}, MBPP-train: {len(mbpp_train)}")

    he_prompts = [he_prompt(p) for p in he]
    mbpp_test_prompts = [mbpp_prompt(p) for p in mbpp_test]

    # --- BASE eval
    log("=== BASE evals ===")
    t0 = time.time()
    he_outs = vllm_gen(llm, he_prompts, max_new=400)
    log(f" HE base gen done in {time.time()-t0:.1f}s")
    base_he = _humaneval_pass_count(he, he_outs)
    t1 = time.time()
    mbpp_outs = vllm_gen(llm, mbpp_test_prompts, max_new=400)
    log(f" MBPP-test base gen done in {time.time()-t1:.1f}s")
    base_mbpp = _mbpp_pass_count(mbpp_test, mbpp_outs)
    log(f" BASE: HE={base_he}/{len(he)} MBPP={base_mbpp}/{len(mbpp_test)}")

    # --- Mine pairs from MBPP-train
    log(f"=== mining from {len(mbpp_train)} MBPP-train problems ===")
    train_prompts = [mbpp_prompt(p) for p in mbpp_train]
    # greedy attempt
    t0 = time.time()
    greedy_outs = vllm_gen(llm, train_prompts, max_new=400)
    log(f" greedy gen in {time.time()-t0:.1f}s")
    # A problem is "hard" when its greedy completion fails the tests.
    hard = []
    for p, raw in zip(mbpp_train, greedy_outs):
        ok, err = run_python(_mbpp_program(p, raw), timeout=8)
        if not ok:
            hard.append((p, raw, err))
    log(f" {len(mbpp_train) - len(hard)} greedy-correct, {len(hard)} hard")
    if not hard:
        log("nothing to mine — base too strong")
        return

    # sample N attempts per hard problem
    log(f" sampling {args.attempts_per} attempts × {len(hard)} hard problems...")
    hard_prompts = [mbpp_prompt(p) for p, _raw, _err in hard]
    t1 = time.time()
    sample_outs = vllm_gen(llm, hard_prompts, max_new=400, temperature=0.8, n=args.attempts_per)
    log(f" sample gen in {time.time()-t1:.1f}s")
    if args.attempts_per == 1:
        # vllm_gen returns a flat list of strings when n == 1; wrap each one so
        # the loop below iterates attempts rather than characters.
        sample_outs = [[text] for text in sample_outs]

    t2 = time.time()
    pairs = []
    for (p, greedy_raw, err), attempts in zip(hard, sample_outs):
        fixed = _first_passing_attempt(p, attempts)
        if fixed is None:
            continue
        pairs.append({
            "problem": p["prompt"],
            "tests": p["test_list"],
            "broken": greedy_raw.strip(),
            "fixed": fixed.strip(),
            "error": err,
        })
        if len(pairs) >= args.max_pairs:
            break
    log(f" verification in {time.time()-t2:.1f}s — mined {len(pairs)} pairs")
    with open(f"{out_dir}/pairs.jsonl", "w") as fh:
        for r in pairs:
            fh.write(json.dumps(r) + "\n")
    if len(pairs) < 5:
        log("too few pairs — exiting")
        return

    # --- Train LoRA
    log("=== TRAINING ===")
    # Drop the vLLM engine before loading the training copy of the model.
    # NOTE(review): whether this releases all GPU memory held by vLLM is not
    # visible from this file — confirm on the target vLLM version.
    del llm
    gc.collect()
    torch.cuda.empty_cache()
    adapter_dir = _train_lora_adapter(args.model, tok, pairs, out_dir)

    # --- TRAINED eval
    llm = LLM(model=args.model, dtype="bfloat16", gpu_memory_utilization=0.85, max_model_len=2048,
              enable_lora=True, max_lora_rank=16)
    lora_req = LoRARequest("tf_adapter", 1, adapter_dir)
    sp = SamplingParams(temperature=0, max_tokens=400, stop=["\nclass ", "\nif __name__", "\n\nprint", "\n\ndef "])

    def adapter_gen(prompts):
        # Greedy generation routed through the freshly trained adapter.
        outs = llm.generate(prompts, sp, lora_request=lora_req, use_tqdm=False)
        return [o.outputs[0].text for o in outs]

    log("=== TRAINED evals ===")
    t0 = time.time()
    he_outs = adapter_gen(he_prompts)
    log(f" HE trained gen in {time.time()-t0:.1f}s")
    tr_he = _humaneval_pass_count(he, he_outs)
    t1 = time.time()
    mbpp_outs = adapter_gen(mbpp_test_prompts)
    log(f" MBPP-test trained gen in {time.time()-t1:.1f}s")
    tr_mbpp = _mbpp_pass_count(mbpp_test, mbpp_outs)

    result = {
        "model": args.model, "n_pairs": len(pairs),
        "humaneval": {"base": base_he, "trained": tr_he, "delta": tr_he-base_he, "n": len(he)},
        "mbpp": {"base": base_mbpp, "trained": tr_mbpp, "delta": tr_mbpp-base_mbpp, "n": len(mbpp_test)},
        "elapsed_s": time.time() - T0,
    }
    with open(f"{out_dir}/result.json", "w") as fh:
        json.dump(result, fh, indent=2)
    print()
    print("=" * 70)
    print(f" {args.model} — MBPP-train SEEDED ({len(pairs)} pairs)")
    print(f" HumanEval: base={base_he}/{len(he)} trained={tr_he}/{len(he)} Δ={tr_he-base_he:+d}")
    print(f" MBPP: base={base_mbpp}/{len(mbpp_test)} trained={tr_mbpp}/{len(mbpp_test)} Δ={tr_mbpp-base_mbpp:+d}")
    print(f" Time: {time.time()-T0:.0f}s")
    print("=" * 70)
# Run the experiment only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()