diff --git a/README.md b/README.md index 52bce2b..6aa3b1b 100644 --- a/README.md +++ b/README.md @@ -37,29 +37,59 @@ A control experiment — replacing the mined pairs with **identically-formatted ``` tinyforge-zero/ -├── recipe/ -│ ├── train_on_pairs.py # Fast-path: train LoRA on a released pairs.jsonl -│ ├── bootstrap.py # Full-path: self-bootstrap pipeline (mining + train, 7B / 3B) -│ ├── multi_pair_14b.py # Full-path: aggressive multi-pair variant → 80.5% on 14B -│ ├── curriculum_math.py # Full-path: auto-difficulty curriculum for GSM8K -│ ├── eval_raw.py # HumanEval / MBPP / GSM8K eval (vLLM, raw-completion) -│ ├── eval_plus.py # HumanEval+ contamination-resistant eval -│ └── confirm.py # Confirmation re-eval against base -├── data/ -│ ├── pairs_7b_40.jsonl # 40 self-mined pairs (Qwen2.5-7B-Base run) -│ ├── pairs_14b_multi_new60.jsonl # 60 aggressive-mined pairs for 14B (+ warmup 40 → 100 total) -│ └── pairs_math_13.jsonl # 13 curriculum-mined math pairs (Qwen2.5-3B-Base → GSM8K 32→66) +├── recipe/ # Training pipelines +│ ├── train_on_pairs.py # Fast-path: train LoRA on a released pairs.jsonl +│ ├── bootstrap.py # Self-bootstrap pipeline (mining + train, 7B / 3B) +│ ├── bootstrap_14b_4bit_harvest.py # 4-bit harvest variant (when full-precision OOMs) +│ ├── multi_pair_14b.py # Aggressive multi-pair variant → 80.5% on 14B +│ ├── curriculum_math.py # Auto-difficulty curriculum for GSM8K (§2.3, §3.8) +│ ├── curriculum_code.py # Auto-difficulty curriculum for code +│ └── math_bootstrap.py # Vanilla math bootstrap (regressed; see §3.8) +├── evals/ # Evaluation harnesses +│ ├── eval_raw.py # HumanEval / MBPP / GSM8K (vLLM, raw-completion) +│ ├── eval_plus.py # HumanEval+ contamination-resistant eval +│ └── confirm.py # Confirmation re-eval against base +├── tts/ # Test-time sampling (§2.2, §3.3) +│ ├── tts_scaling.py # Pass@N scaling sweep (HE, HE+, MATH-500) +│ ├── tts_humaneval.py # Best-of-N pass@1 on HE/HE+ +│ ├── tts_math500.py # Best-of-N pass@1 on MATH-500 +│ 
├── tts_aime.py # Pass@k curve on AIME (k=1..64) +│ ├── tts_qwen14b_recipe.py # TTS on top of the 14B multi-pair adapter +│ └── tts_qwen3_8b_raw_control.py # Control: TTS on raw Qwen3-8B (recipe vs sampling) +├── experiments/ # Every paper experiment, one script each +│ ├── self_consistency.py # §3.4 — deployable TTS via majority vote (no oracle) +│ ├── recipe_x_tts_synergy.py # §3.5 — recipe × TTS synergy threshold (novel finding) +│ ├── cross_domain_code_to_math.py # §3.10 — code-trained recipe on math (+2, marginal) +│ ├── mbpp_seeded_cross_arch.py # §3.9 — Llama/Coder cross-architecture self-mining +│ ├── diversity_cued_mining.py # §3.10 — diversity-cued mining (low yield) +│ ├── recursive_bootstrap.py # §3.10 — recursive iter1→iter2→iter3 (plateau) +│ ├── self_correction_code.py # §3.10 — code self-correction recipe +│ ├── self_correction_math_naive.py # §3.10 — naive (wrong→fix only): catastrophic regress +│ ├── self_correction_math_fixed.py # §3.10 — fixed (mixed positives): recovered +│ ├── math500_seeded_mining.py # §3.10 — distribution-mismatch demo (catastrophic) +│ ├── aime_scaling.py # AIME pass@k = 1..64 sweep +│ ├── bcb_hard_eval.py # §3.10 — BigCodeBench-Hard distribution mismatch +│ └── star_baseline_gsm8k.py # Related-work baseline (STaR / rejection sampling FT) ├── controls/ -│ └── mbpp_corrupt_control.py # The +0 negative-control experiment +│ └── mbpp_corrupt_control.py # §3.6 — the +0 negative-control experiment +├── data/ # Released mined pairs (drove paper numbers) +│ ├── pairs_7b_40.jsonl # 40 pairs for Qwen2.5-7B-Base +│ ├── pairs_14b_multi_new60.jsonl # 60 aggressive-mined pairs for 14B (+ warmup 40 = 100) +│ └── pairs_math_13.jsonl # 13 curriculum-mined math pairs (3B GSM8K) ├── docs/ -│ ├── scaling_chart.png # Recipe lift vs base capability (paper Fig 1) -│ ├── fig1_headline.png # Headline result chart -│ └── fig6_boundary.png # Boundary conditions across 9 models -├── REPRODUCE.md # Paper figure/table → exact command mapping +│ ├── 
recipe_diagram.png # The 5-stage recipe diagram (rendered above) +│ ├── scaling_chart.png # Recipe lift vs base capability (paper Fig 1) +│ ├── fig1_headline.png # Headline result chart +│ └── fig6_boundary.png # Boundary conditions across 9 models +├── scripts/ +│ └── make_recipe_diagram.py # Source for the rendered recipe diagram +├── REPRODUCE.md # Paper claim → exact command mapping (all sections) ├── requirements.txt └── LICENSE ``` +A note on these scripts: `recipe/`, `evals/`, and `controls/` are the clean replication paths — these have argparse CLIs and produce the headline numbers. The scripts under `experiments/` and `tts/` are the **original research scripts** used to produce each figure / table in the paper. They work, but they're closer to "research code" than "production tooling" — argument names vary, some have hard-coded paths to `/workspace/`, and they were each run on RunPod with a specific GPU. Read the top-of-file docstring of any experiment script for what it does and how to invoke it. + --- ## Quickstart @@ -73,7 +103,7 @@ cd tinyforge-zero pip install -r requirements.txt # 3. Baseline the model (so you know the lift is real) -python recipe/eval_raw.py \ +python evals/eval_raw.py \ --model Qwen/Qwen2.5-7B \ --bench humaneval @@ -85,7 +115,7 @@ python recipe/train_on_pairs.py \ --out adapter_7b --seed 13 # 5. Evaluate the trained adapter -python recipe/eval_raw.py \ +python evals/eval_raw.py \ --model Qwen/Qwen2.5-7B \ --adapter adapter_7b \ --bench humaneval diff --git a/REPRODUCE.md b/REPRODUCE.md index 0d95e28..aa8a431 100644 --- a/REPRODUCE.md +++ b/REPRODUCE.md @@ -1,154 +1,151 @@ # Reproduction Guide -Maps every paper claim → exact command. There are **two replication paths**: +Maps every paper claim → the script that produced it. Two replication paths: -- **Fast path** — use `recipe/train_on_pairs.py` with the released `data/*.jsonl`. Skips the mining stage. Gets you the trained adapter and the headline number in ~30 min on an H100. 
-- **Full path** — re-run the original research scripts (`bootstrap.py`, `multi_pair_14b.py`, `curriculum_math.py`) end-to-end including the self-mining step. This reproduces the recipe from scratch and verifies the mining is deterministic-ish (modulo sampling). +- **Fast path** — use `recipe/train_on_pairs.py` with `data/*.jsonl`. Reproduces the trained adapter and headline number in ~30 min on H100. Recommended for paper verification. +- **Full path** — re-run the original research scripts end-to-end including the self-mining stage. Use this if applying the recipe to a *new* base model. -The fast path is what you want for paper verification. The full path is what you want if you're trying the recipe on a *new* base model. +A note on script conventions: scripts under `recipe/`, `evals/`, and `controls/` are clean replication paths (argparse CLIs, no hardcoded paths). Scripts under `experiments/` and `tts/` are the original research code used to produce each finding — they work but use `--tag`-style outputs and sometimes assume `/workspace/` paths (set via `HF_HOME` env var). Read the top-of-file docstring of each to see exact invocation. --- ## Environment Tested on: -- **H100 80GB** (recommended for 14B runs) — Debian 12, CUDA 12.4, driver 570+ -- **RTX 6000 Ada 48GB** — sufficient for 7B and 3B runs +- **H100 80GB** — Debian 12, CUDA 12.4, driver 570+ (required for vLLM 0.8.5) +- **RTX 6000 Ada 48GB** — sufficient for ≤7B models ```bash pip install -r requirements.txt ``` -Exact stack used in the paper: `torch==2.6.0`, `transformers==4.51.3`, `vllm==0.8.5`, `peft==0.13.0`. +Pinned stack: `torch==2.6.0`, `transformers==4.51.3`, `vllm==0.8.5`, `peft==0.13.0`. 
--- -## FAST PATH — reproduce headline numbers from released pairs +# Mapping: paper claim → script -### Qwen2.5-7B-Base → 25 → 95–112/164 (3-seed range) +## §2 Method + +| Paper § | Method | Script | Notes | +|---|---|---|---| +| §2.1 | Self-bootstrap pipeline (code) | `recipe/bootstrap.py` | Generation → solving → mining → train, end-to-end | +| §2.1 | 4-bit harvest for large models | `recipe/bootstrap_14b_4bit_harvest.py` | NF4 quantization, harvest-only (no in-loop training) | +| §2.1 | Aggressive multi-pair mining | `recipe/multi_pair_14b.py` | The 14B 80.5% pipeline | +| §2.2 | Test-time sampling (oracle) | `tts/tts_scaling.py` | Pass@N for HE / HE+ / MATH-500 | +| §2.3 | Auto-difficulty curriculum (math) | `recipe/curriculum_math.py` | The GSM8K 32→66 pipeline | +| §2.3 | Auto-difficulty curriculum (code) | `recipe/curriculum_code.py` | Code variant | + +--- + +## §3 Experiments + +### §3.2 Recipe alone — HumanEval and HumanEval+ + +| Claim (paper Table 1) | Script + command | +|---|---| +| Qwen2.5-7B-Base: 25 → 112 (+87 best seed) | Fast path: `python recipe/train_on_pairs.py --model Qwen/Qwen2.5-7B --pairs data/pairs_7b_40.jsonl --seed 13 --lora-rank 16 --out adapter_7b_seed13` then `python evals/eval_raw.py --model Qwen/Qwen2.5-7B --adapter adapter_7b_seed13 --bench humaneval` | +| Qwen2.5-14B-Base: 44 → 131 / 80% on HE, 122/164 on HE+ | `cat data/pairs_7b_40.jsonl data/pairs_14b_multi_new60.jsonl > /tmp/14b.jsonl; python recipe/train_on_pairs.py --model Qwen/Qwen2.5-14B --pairs /tmp/14b.jsonl --lora-rank 32 --out adapter_14b_multi; python evals/eval_plus.py --model Qwen/Qwen2.5-14B --adapter adapter_14b_multi` | +| Multi-pair full path (re-mine + train) | `python recipe/multi_pair_14b.py --model Qwen/Qwen2.5-14B --warmup_pairs_path data/pairs_7b_40.jsonl --n_problems 200 --n_attempts 8 --max_pairs_per_problem 4 --lora_rank 32 --tag multi_rerun` | +| Boundary table for all 9 models | `python evals/eval_raw.py --model ` for baseline; recipe + re-eval per 
model. Cost: ~3 hr H100. | + +### §3.3 Test-time sampling (TTS) alone + +| Claim | Script | Expected | +|---|---|---| +| Qwen3-4B best-of-8 HE oracle = 92.7% | `python tts/tts_humaneval.py --model Qwen/Qwen3-4B-Base --n 8 --temperature 0.7` | 152/164 | +| Qwen3-8B best-of-8 HE oracle = 92.1% | `python tts/tts_humaneval.py --model Qwen/Qwen3-8B-Base --n 8 --temperature 0.7` | 151/164 | +| Qwen3-4B best-of-8 MATH-500 = 79.4% | `python tts/tts_math500.py --model Qwen/Qwen3-4B-Base --n 8` | 397/500 | +| Qwen3-8B best-of-8 MATH-500 = 81.0% | `python tts/tts_math500.py --model Qwen/Qwen3-8B-Base --n 8` | 405/500 | +| AIME pass@k curve (k=1..64) | `python tts/tts_aime.py --model Qwen/Qwen3-8B-Base --n 32` | 25.6 / 38.9% best-of-32 | +| Full TTS scaling sweep (Table 2) | `python tts/tts_scaling.py --model Qwen/Qwen3-4B-Base` | | + +### §3.4 Self-consistency (deployable TTS, no oracle) ```bash -# 1. Baseline (raw-completion eval) -python recipe/eval_raw.py --model Qwen/Qwen2.5-7B --bench humaneval -# Expected: 25/164 - -# 2. Train on the released 40 pairs (try multiple seeds — small-data variance) -for SEED in 7 13 42; do - python recipe/train_on_pairs.py \ - --model Qwen/Qwen2.5-7B \ - --pairs data/pairs_7b_40.jsonl \ - --out adapter_7b_seed${SEED} \ - --seed ${SEED} --lora-rank 16 --epochs 2 --lr 1e-4 - python recipe/eval_raw.py \ - --model Qwen/Qwen2.5-7B \ - --adapter adapter_7b_seed${SEED} \ - --bench humaneval -done -# Expected: seed 7 → 104/164, seed 13 → 112/164, seed 42 → 95/164 +python experiments/self_consistency.py \ + --model Qwen/Qwen3-4B-Base \ + --bench gsm8k --n 8 ``` +Tests if majority-vote selection without oracle access matches oracle pass@N. See paper Table 3. -### Qwen2.5-14B-Base → 132/164 (80.5%) and HumanEval+ 122/164 (74.4%) - -The 14B run uses 100 pairs total: the 40 warmup pairs + 60 new aggressive-mined pairs. Concatenate first, then train. 
+### §3.5 Recipe × TTS synergy threshold (novel finding) ```bash -cat data/pairs_7b_40.jsonl data/pairs_14b_multi_new60.jsonl > /tmp/pairs_14b_100.jsonl - -python recipe/train_on_pairs.py \ - --model Qwen/Qwen2.5-14B \ - --pairs /tmp/pairs_14b_100.jsonl \ - --out adapter_14b_multi \ - --lora-rank 32 --epochs 2 --lr 1e-4 - -python recipe/eval_raw.py \ - --model Qwen/Qwen2.5-14B \ +python experiments/recipe_x_tts_synergy.py \ + --base-model Qwen/Qwen2.5-14B \ --adapter adapter_14b_multi \ - --bench humaneval -# Expected: 132/164 (80.5%) in the multi-pair eval format - -python recipe/eval_plus.py \ - --model Qwen/Qwen2.5-14B \ - --adapter adapter_14b_multi -# Expected: HumanEval+ 122/164 (74.4%) + --n 8 ``` +Compares: raw base | raw base + TTS | recipe-trained | recipe-trained + TTS. The novel finding: at sufficient mined-pair counts, recipe-trained + TTS > raw + TTS (+12.8pp). At too-few pairs, recipe-trained + TTS < raw + TTS (-4.9pp on Qwen2.5-3B with 36 pairs). -### Qwen2.5-3B-Base → GSM8K 32 → 66 +### §3.6 Control: format alone does not explain the lift ```bash -python recipe/train_on_pairs.py \ - --model Qwen/Qwen2.5-3B \ - --pairs data/pairs_math_13.jsonl \ - --out adapter_3b_math \ - --lora-rank 16 --epochs 2 --lr 1e-4 - -# GSM8K eval — uses sympy as the verifier (no oracle math model needed). -# eval_raw.py auto-detects GSM8K format and runs the right verifier. -python recipe/eval_raw.py \ - --model Qwen/Qwen2.5-3B \ - --adapter adapter_3b_math \ - --bench gsm8k -# Expected: 66/100 -``` - ---- - -## FULL PATH — re-mine from scratch - -These reproduce the *mining* step too. Each script does generation → solving → mining → training → eval as one pipeline. They write a `pairs.jsonl` and a `result.json` under `--tag`. 
- -### Self-bootstrap from scratch on Qwen2.5-7B - -```bash -python recipe/bootstrap.py \ +python controls/mbpp_corrupt_control.py \ --model Qwen/Qwen2.5-7B \ - --iterations 20 \ - --problems_per_iter 16 \ - --train_every 10 \ - --eval_every 10 \ - --tag bs_7b_rerun -# Writes: results/bs_7b_rerun/{pairs.jsonl,ckpt_iter*,eval_log.json,result.json} -# Expected final eval: 25 → 95–112 (seed-dependent) + --tag mbpp_corrupt_control ``` +Expected: HumanEval stays at 25/164 (Δ = 0). Confirms the signal is in self-mined content, not pair-formatted training data. -### Aggressive multi-pair mining on Qwen2.5-14B (the 80.5% headline) +### §3.7 Multi-pair mining at 14B (the 80.5% headline) ```bash python recipe/multi_pair_14b.py \ --model Qwen/Qwen2.5-14B \ --warmup_pairs_path data/pairs_7b_40.jsonl \ - --n_warmup_pairs 40 \ - --n_problems 200 \ - --n_attempts 8 \ - --max_pairs_per_problem 4 \ - --lora_rank 32 --epochs 2 --lr 1e-4 \ + --n_problems 200 --n_attempts 8 \ + --max_pairs_per_problem 4 --lora_rank 32 \ --tag multi_rerun -# Writes: results/multi_pair/multi_rerun/{pairs_new.jsonl,adapter/,result.json} -# Expected: trained 130–134/164 (~80%) ``` +Expected: base 67/164 → trained 132/164 (multi-pair eval format) / 131/164 chat-template / 122/164 HE+. -### GSM8K auto-difficulty curriculum on Qwen2.5-3B +### §3.8 Math: auto-difficulty curriculum ```bash python recipe/curriculum_math.py \ --model Qwen/Qwen2.5-3B \ --iterations 16 \ --tag curr_3b_rerun -# Mines 10–15 curriculum-difficulty pairs, trains, evals. -# Expected: GSM8K 32 → 60–70 (some seed variance) ``` +Expected: GSM8K 32/100 → 66/100. Compare to `recipe/math_bootstrap.py` (vanilla, no curriculum) which regresses. 
+ +### §3.9 Cross-architecture and cross-generation + +| Model | Script | Expected | +|---|---|---| +| Llama-3.2-3B (own-mined 32) | `python experiments/mbpp_seeded_cross_arch.py --model meta-llama/Llama-3.2-3B` | HE 39→43 (+4) | +| Qwen2.5-Coder-7B-Base | `python experiments/mbpp_seeded_cross_arch.py --model Qwen/Qwen2.5-Coder-7B` | HE 83→87 (+4), MBPP 122→124 (+2) | +| Qwen3-4B-Base | Same script, Qwen3-4B-Base | HE 79→106 (+27), MBPP 135→148 (+13) | + +### §3.10 Failure modes and negative results + +Each negative finding has its own script. Run any of these to verify the documented failure. + +| Failure mode | Script | Expected | +|---|---|---| +| Saturation (Qwen3-8B/14B HE) | `python recipe/bootstrap.py --model Qwen/Qwen3-8B-Base --tag sat_check` | 132 → 118–133, no clean lift | +| BCB-Hard distribution mismatch | `python experiments/bcb_hard_eval.py --model Qwen/Qwen3-8B-Base --adapter adapter_7b_seed13` | No transfer; HE-style pairs don't generalize to library code | +| MATH-500 mining distribution mismatch | `python experiments/math500_seeded_mining.py --model Qwen/Qwen3-8B-Base` | 279/500 → 239/500 (−40, catastrophic) | +| Self-correction over-correction (naive) | `python experiments/self_correction_math_naive.py --model Qwen/Qwen3-4B-Base` | 299/500 → 69/500 (Δ=−230!) 
| +| Self-correction recovery (fixed) | `python experiments/self_correction_math_fixed.py --model Qwen/Qwen3-4B-Base` | Recovers to baseline + small lift via mixed positives | +| Recursive bootstrap plateau | `python experiments/recursive_bootstrap.py --model Qwen/Qwen2.5-7B --iters 3` | iter1 gives most lift, iter2/3 plateau | +| Cross-domain transfer (code→math) | `python experiments/cross_domain_code_to_math.py --code-adapter adapter_7b_seed13` | +2 marginal lift on GSM8K | +| Diversity-cued mining low yield | `python experiments/diversity_cued_mining.py --model Qwen/Qwen2.5-7B` | Fewer well-formed pairs than vanilla mining | --- -## Control experiment (Figure 2) +## §3.11 Boundary conditions summary (Figure 6) -Verifies the signal is in the *content* of self-mined pairs, not the format. Replaces the mined pairs with mechanically-corrupted external pairs (MBPP-style) that look identical structurally. +The 9-model boundary chart is the synthesis of per-model recipe runs. To regenerate: ```bash -python controls/mbpp_corrupt_control.py \ - --model Qwen/Qwen2.5-7B \ - --tag mbpp_corrupt_control -# Expected: HumanEval stays at 25/164 (Δ ≈ 0, ± seed noise) +for MODEL in Qwen/Qwen2.5-{3B,7B,14B,72B} Qwen/Qwen3-{1.7B,4B,8B,14B}-Base meta-llama/Llama-3.2-3B Qwen/Qwen2.5-Coder-7B allenai/OLMo-2-1124-7B; do + python evals/eval_raw.py --model "$MODEL" --bench humaneval # baseline + python recipe/bootstrap.py --model "$MODEL" --tag "boundary_$(echo $MODEL | tr '/' '_')" +done ``` +Run time: ~3 hours on a single H100, ~$8 cost. --- @@ -161,42 +158,40 @@ for N in 10 21 40; do --model Qwen/Qwen2.5-7B \ --pairs /tmp/pairs_$N.jsonl \ --out adapter_n$N --epochs 2 - python recipe/eval_raw.py \ + python evals/eval_raw.py \ --model Qwen/Qwen2.5-7B --adapter adapter_n$N --bench humaneval done -# Expected: n=10 → ~51, n=21 → 86–95, n=40 → 95–112 (seed-dependent for small N) ``` +Expected: n=10 → ~51, n=21 → mean ~91, n=40 → mean ~105 (seed-dependent for small N). 
--- -## Boundary conditions to verify (paper §3) +## Related-work baseline -| Claim | Hint | Expected | -|-------|------|----------| -| Qwen3-8B saturated on HE | Run multi_pair_14b.py with `--model Qwen/Qwen3-8B-Base` | Base 132, adapter ≈ 118–133 — no clean lift | -| Qwen2.5-72B saturated | Same on 72B with 10 pairs | Base 83 → trained 73 (−10) | -| MATH-500 distribution mismatch | Mining on simple problems + MATH-500 eval | Base 279/500 → trained 239/500 (−40) | -| Self-correction over-correction | Train on wrong→fix triples only, no right→stays-right | Base 299/500 → trained 69/500 (−230) | -| BCB-Hard distribution mismatch | Apply 7B 40-pair adapter, eval on BCB-Hard | No transfer | +| Method | Script | Use | +|---|---|---| +| STaR / rejection-sampling FT on GSM8K | `experiments/star_baseline_gsm8k.py` | Comparison point for the curriculum result | --- -## Notes on stochasticity +## Notes on stochasticity and reproducibility -- **vLLM sampling** is deterministic given a fixed seed, but vLLM 0.8.x occasionally changes pad/EOS handling between point releases. Pin to 0.8.5. -- **LoRA training is seed-sensitive at small N.** The 7B 40-pair run spans 95–112/164 across seeds 7/13/42. The 14B 100-pair run is much tighter (130–134/164). -- **Stop tokens matter.** Use `--stop "\nclass " --stop "\nif __name__"` for raw-completion eval. Wrong stop tokens cut output prematurely and produce artifactually low baselines. We saw this earlier in the project — see paper §2. +- **vLLM sampling** is deterministic given a fixed seed, but vLLM 0.8.x can change pad/EOS handling between point releases. Pin to 0.8.5. +- **LoRA training is seed-sensitive at small N.** 7B 40-pair: 95–112/164 across seeds 7/13/42. 14B 100-pair: 130–134/164 (tighter). +- **Stop tokens matter.** Use `--stop "\nclass " --stop "\nif __name__"` for raw-completion eval. Wrong stop tokens cut output and produce artifactually low baselines. We hit this earlier in the project; the paper §2 documents the fix. 
--- ## Cost reference (May 2026, RunPod) | Workflow | Hardware | Wall time | Cost | -|----------|----------|-----------|------| +|---|---|---|---| | 7B headline (fast path) | RTX 6000 Ada 48GB | ~30 min | ~$0.50 | | 14B 80.5% (fast path) | H100 80GB | ~30 min | ~$1.50 | -| 14B 80.5% full path (mining + train) | H100 80GB | ~95 min | ~$3.50 | -| GSM8K 32→66 | RTX 6000 Ada | ~30 min | ~$0.50 | -| Full eval matrix (9 models) | H100 80GB | ~3 hrs | ~$8 | +| 14B 80.5% full path | H100 80GB | ~95 min | ~$3.50 | +| GSM8K 32→66 curriculum | RTX 6000 Ada | ~30 min | ~$0.50 | +| TTS scaling sweep (one model) | H100 80GB | ~30 min | ~$1.50 | +| Full 9-model boundary chart | H100 80GB | ~3 hrs | ~$8 | +| Every negative result | mixed | ~5 hrs total | ~$15 | -Total cost to verify all numbers in the paper via the fast path: **under $10**. +Verify all paper numbers via fast path: **under $10**. Full reproduction from scratch (including all negative results and the full TTS sweep): **~$50**, matching the paper's reported total spend. 
def extract_int(text):
    """Extract the integer answer from a model completion.

    Lookup order:
      1. The first ``\\boxed{<digits>}`` occurrence in ``text``.
      2. Otherwise, the last standalone run of digits on the final line of
         the stripped completion.

    Args:
        text: Raw completion string.

    Returns:
        The parsed ``int``, or ``None`` when no candidate is found or the
        digit run cannot be converted.
    """
    boxed = re.search(r"\\boxed\{(\d+)\}", text)
    if boxed:
        try:
            return int(boxed.group(1))
        except ValueError:
            # int() can reject extremely long digit runs (CPython's
            # int<->str digit limit); treat that as "no answer".
            return None

    # str.split always yields at least one element, so [-1] is safe even
    # for an empty completion.
    last_line = text.strip().split("\n")[-1]
    nums = re.findall(r"\b(\d+)\b", last_line)
    if nums:
        try:
            return int(nums[-1])
        except ValueError:
            return None
    return None
End with \\boxed{{N}}.\n\nProblem: {p}\n\nSolution:" + prompts = [] + for p in ds: + try: + msgs = [{"role": "system", "content": "AIME solver. End with \\boxed{integer}."}, + {"role": "user", "content": UTMPL.format(p=p["problem"])}] + prompts.append(tok.apply_chat_template(msgs, tokenize=False, add_generation_prompt=True)) + except Exception: + prompts.append(UTMPL.format(p=p["problem"])) + + MAX_N = 64 + sp = SamplingParams(temperature=0.8, top_p=0.95, max_tokens=1500, n=MAX_N) + log(f"generating {MAX_N} samples per problem...") + t0 = time.time() + outs = llm.generate(prompts, sp, use_tqdm=False) + log(f" gen in {time.time()-t0:.1f}s") + + # Per-task per-sample correctness + per_task_results = [] + for p, outset in zip(ds, outs): + gold = int(p["answer"]) + per_sample = [] + for o in outset.outputs: + pred = extract_int(o.text) + per_sample.append(pred == gold) + per_task_results.append(per_sample) + + NS = [1, 2, 4, 8, 16, 32, 64] + scaling = {} + for k in NS: + scaling[k] = sum(1 for r in per_task_results if any(r[:k])) + + result = {"model": args.model, "tag": args.tag, "MAX_N": MAX_N, + "n_total": len(ds), "pass_at_k": scaling, "elapsed_s": time.time() - T0} + with open(f"{args.out_dir}/result.json", "w") as fh: json.dump(result, fh, indent=2) + + print() + print("=" * 70) + print(f" {args.model} — AIME TTS SCALING") + for k in NS: + print(f" pass@{k:<3}: {scaling[k]:>3}/{len(ds)} ({100*scaling[k]/len(ds):.1f}%)") + print(f" Time: {time.time()-T0:.0f}s") + print("=" * 70) + + +if __name__ == "__main__": + main() diff --git a/experiments/bcb_hard_eval.py b/experiments/bcb_hard_eval.py new file mode 100644 index 0000000..f661073 --- /dev/null +++ b/experiments/bcb_hard_eval.py @@ -0,0 +1,190 @@ +"""Train Qwen3-8B-Base with 40-pair recipe, eval on BigCodeBench-Hard. + +BigCodeBench is harder than HumanEval (real-world Python tasks, library use). +Qwen3-8B-Base likely has headroom there (~30-45% baseline). 
Tests if recipe +generalizes to newer model AND harder benchmark. +""" +import os, json, time, re, subprocess, tempfile, argparse +os.environ.setdefault("HF_HOME", "/workspace/hf") +os.environ.setdefault("HF_HUB_ENABLE_HF_TRANSFER", "1") +os.environ["TRANSFORMERS_VERBOSITY"] = "error" + +import torch +from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, Trainer +from datasets import load_dataset, Dataset as HFDataset +from peft import LoraConfig, get_peft_model + +T0 = time.time() +def log(m): print(f"[{time.time()-T0:7.1f}s] {m}", flush=True) + + +def extract_code(text): + if "```python" in text: text = text.split("```python", 1)[1] + elif "```" in text: text = text.split("```", 1)[1] + if "```" in text: text = text.split("```", 1)[0] + return text.strip() + + +def verify_bcb(code, test_code): + runner = "\n\nif __name__ == '__main__':\n import unittest; unittest.main(argv=['x'], exit=False, verbosity=0)\n" + body = code + "\n\n" + test_code + runner + with tempfile.NamedTemporaryFile("w", suffix=".py", delete=False) as f: + f.write(body); path = f.name + try: + r = subprocess.run(["python3", path], capture_output=True, timeout=20, text=True, cwd="/tmp") + out = (r.stdout or "") + "\n" + (r.stderr or "") + if "OK" in out and "FAILED" not in out and "Error" not in out and r.returncode == 0: + return True + return False + except subprocess.TimeoutExpired: + return False + finally: + try: os.unlink(path) + except: pass + + +def gen_batch(model, tok, prompts, max_new=600, temperature=0.0, batch=4): + outs = [] + for i in range(0, len(prompts), batch): + chunk = prompts[i:i+batch] + texts = [] + for p in chunk: + msgs = [{"role": "system", "content": "You are an expert Python coder. 
def eval_humaneval(model, tok, label):
    """Score ``model`` on the HumanEval test split by executing each completion.

    Each prompt is the task prompt plus a "complete the function" comment.
    Generation goes through ``gen_batch`` with its default temperature. A task
    counts as correct when the assembled script (solution + official test +
    ``check(entry_point)``) exits with status 0 within 10 seconds.

    Args:
        model: Causal LM passed through to ``gen_batch``.
        tok: Tokenizer passed through to ``gen_batch``.
        label: Short tag used in progress log lines.

    Returns:
        ``(correct, total)`` as integers.
    """
    import sys  # local import: only needed to locate the running interpreter

    he = list(load_dataset("openai_humaneval", split="test"))
    log(f" HumanEval [{label}] ({len(he)})")
    prompts = [p["prompt"] + "\n# Complete the function above." for p in he]
    outs = gen_batch(model, tok, prompts, max_new=400, batch=4)

    # Run candidates with the interpreter executing this script rather than
    # whatever "python3" happens to resolve to on PATH.
    python = sys.executable or "python3"
    correct = 0
    for i, (p, raw) in enumerate(zip(he, outs), start=1):
        code = extract_code(raw) if "```" in raw else raw
        # A completion containing its own "def" is run as-is; otherwise it is
        # treated as the body that continues the prompt.
        full = code if "def " in code else p["prompt"] + "\n" + code
        test_code = full + "\n\n" + p["test"] + f"\n\ncheck({p['entry_point']})"
        # Write as UTF-8 (Python's source default) independent of the locale.
        with tempfile.NamedTemporaryFile("w", suffix=".py", delete=False, encoding="utf-8") as f:
            f.write(test_code)
            path = f.name
        try:
            r = subprocess.run(
                [python, path],
                capture_output=True, timeout=10, text=True,
                encoding="utf-8", errors="replace",  # never crash on odd output bytes
                cwd="/tmp",
            )
            if r.returncode == 0:
                correct += 1
        except subprocess.TimeoutExpired:
            pass  # a hung solution counts as a failure
        finally:
            try:
                os.unlink(path)
            except OSError:
                pass  # best-effort cleanup of the temp script
        if i % 40 == 0:
            log(f" {label} HE {i}/{len(he)}: {correct}")
    return correct, len(he)
def main():
    """Train a LoRA adapter on mined pairs and compare base vs. trained scores.

    Pipeline: parse CLI args, load the pairs file, load the model, run the
    HumanEval and BCB-Hard evals on the base model, train a rank-16 LoRA on
    the pairs for 2 epochs, re-run both evals, then write ``result.json``
    under ``/workspace/bcb_eval/<tag>`` and print a summary.

    CLI flags:
        --model    model id or path (required)
        --pairs    JSONL file of training pairs
        --n_pairs  maximum number of pairs to use (default 40)
        --tag      output sub-directory name (required)
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--model", required=True)
    ap.add_argument("--pairs", default="/workspace/saved_pairs/pairs_40.jsonl")
    ap.add_argument("--n_pairs", type=int, default=40)
    ap.add_argument("--tag", required=True)
    args = ap.parse_args()

    # Read the pairs up front so a bad --pairs path fails before the model
    # load and the base evals. The handle is closed and blank lines skipped.
    with open(args.pairs, encoding="utf-8") as fh:
        pairs = [json.loads(line) for line in fh if line.strip()][:args.n_pairs]

    out_dir = f"/workspace/bcb_eval/{args.tag}"
    os.makedirs(out_dir, exist_ok=True)

    log(f"loading {args.model}")
    tok = AutoTokenizer.from_pretrained(args.model)
    if tok.pad_token is None:
        tok.pad_token = tok.eos_token
    tok.padding_side = "left"  # left-pad for batched generation
    model = AutoModelForCausalLM.from_pretrained(args.model, torch_dtype=torch.bfloat16, device_map="cuda:0")
    log(f" loaded mem={torch.cuda.memory_allocated('cuda:0')/1e9:.1f}GB")

    model.eval()
    log("=== BASE evals ===")
    # Keep the totals the eval helpers report instead of hard-coding them.
    base_he, n_he = eval_humaneval(model, tok, "BASE")
    base_bcb, n_bcb = eval_bcb_hard(model, tok, "BASE")
    log(f" BASE: HumanEval={base_he}/{n_he} BCB-Hard={base_bcb}/{n_bcb}")

    log(f"=== TRAINING — {len(pairs)} pairs ===")
    lora_cfg = LoraConfig(r=16, lora_alpha=32, lora_dropout=0.05, bias="none",
                          target_modules=["q_proj", "k_proj", "v_proj", "o_proj"], task_type="CAUSAL_LM")
    model = get_peft_model(model, lora_cfg)
    tok.padding_side = "right"  # right-pad while building training examples
    ds = HFDataset.from_list([make_example(r, tok) for r in pairs])
    targs = TrainingArguments(
        output_dir=f"{out_dir}/ckpt", num_train_epochs=2,
        per_device_train_batch_size=1, gradient_accumulation_steps=4,
        learning_rate=1e-4, bf16=True, logging_steps=10,
        save_strategy="no", report_to="none", remove_unused_columns=False, warmup_ratio=0.05,
    )
    Trainer(model=model, args=targs, train_dataset=ds, processing_class=tok).train()
    log(" training done")
    tok.padding_side = "left"  # back to left-padding for the trained evals

    model.eval()
    log("=== TRAINED evals ===")
    tr_he, _ = eval_humaneval(model, tok, "TRAINED")
    tr_bcb, _ = eval_bcb_hard(model, tok, "TRAINED")

    result = {
        # Report the number of pairs actually used, not a fixed "40".
        "model": args.model, "method": f"warmup {len(pairs)} pairs",
        "humaneval": {"base": base_he, "trained": tr_he, "delta": tr_he-base_he, "n": n_he},
        "bcb_hard": {"base": base_bcb, "trained": tr_bcb, "delta": tr_bcb-base_bcb, "n": n_bcb},
        "elapsed_s": time.time() - T0,
    }
    with open(f"{out_dir}/result.json", "w") as fh:
        json.dump(result, fh, indent=2)

    print()
    print("=" * 70)
    print(f" {args.model}")
    print(f" HumanEval: base={base_he}/{n_he} trained={tr_he}/{n_he} Δ={tr_he-base_he:+d}")
    print(f" BCB-Hard: base={base_bcb}/{n_bcb} trained={tr_bcb}/{n_bcb} Δ={tr_bcb-base_bcb:+d}")
    print(f" Time: {time.time()-T0:.0f}s")
    print("=" * 70)
def run_python(code, timeout=10):
    """Execute *code* in a fresh ``python3`` subprocess and report success.

    The source is written to a temporary ``.py`` file, run with ``/tmp`` as
    the working directory, and the file is removed afterwards.

    Args:
        code: Python source text to execute.
        timeout: Seconds to wait before the run counts as a failure.

    Returns:
        True when the process exits with return code 0; False on a non-zero
        exit or when the timeout expires.
    """
    # Write as UTF-8: the interpreter reads source files as UTF-8, so relying
    # on the locale encoding can fail on non-ASCII generated code.
    with tempfile.NamedTemporaryFile("w", suffix=".py", delete=False, encoding="utf-8") as f:
        f.write(code)
        path = f.name
    try:
        # errors="replace" keeps undecodable child output from raising here.
        r = subprocess.run(["python3", path], capture_output=True, timeout=timeout,
                           text=True, errors="replace", cwd="/tmp")
        return r.returncode == 0
    except subprocess.TimeoutExpired:
        return False
    finally:
        # Best-effort cleanup; only filesystem errors are ignored so that
        # KeyboardInterrupt / SystemExit still propagate.
        try:
            os.unlink(path)
        except OSError:
            pass
[] + for p in math500: + try: + msgs = [{"role": "system", "content": "Math solver. End with \\boxed{answer}."}, + {"role": "user", "content": f"Solve. Problem: {p['problem']}\n\nSolution:"}] + math_prompts.append(tok.apply_chat_template(msgs, tokenize=False, add_generation_prompt=True)) + except Exception: + math_prompts.append(f"Solve. Problem: {p['problem']}\n\nSolution:") + + import sympy + from sympy.parsing.latex import parse_latex + def sympy_eq(a, b): + if a is None or b is None: return False + if a.strip() == b.strip(): return True + try: + if sympy.simplify(parse_latex(a) - parse_latex(b)) == 0: return True + except Exception: pass + try: + if abs(float(a) - float(b)) < 1e-6: return True + except Exception: pass + return False + + def eval_he(llm, lora_req=None): + sp = SamplingParams(temperature=0, max_tokens=400, stop=["\nclass ", "\nif __name__", "\n\nprint"]) + outs = llm.generate(he_prompts, sp, lora_request=lora_req, use_tqdm=False) if lora_req else \ + llm.generate(he_prompts, sp, use_tqdm=False) + outs = [o.outputs[0].text for o in outs] + c = 0 + for p, raw in zip(he, outs): + full = p["prompt"] + "\n" + raw + test_code = full + "\n\n" + p["test"] + f"\n\ncheck({p['entry_point']})" + if run_python(test_code, 10): c += 1 + return c, len(he) + + def eval_math(llm, lora_req=None): + sp = SamplingParams(temperature=0, max_tokens=800) + outs = llm.generate(math_prompts, sp, lora_request=lora_req, use_tqdm=False) if lora_req else \ + llm.generate(math_prompts, sp, use_tqdm=False) + outs = [o.outputs[0].text for o in outs] + c = 0 + for p, raw in zip(math500, outs): + if sympy_eq(extract_boxed(raw), p["answer"]): c += 1 + return c, len(math500) + + log("=== BASE evals ===") + base_he = eval_he(llm) + base_math = eval_math(llm) + log(f" base HE: {base_he[0]}/{base_he[1]} MATH: {base_math[0]}/{base_math[1]}") + + # Mine code pairs + log("mining code pairs...") + mbpp_full = list(load_dataset("mbpp", split="train")) + random.shuffle(mbpp_full) + seeds = [] 
+ for p in mbpp_full[:200]: + prompt_text = p.get("prompt") or p.get("text", "") + if prompt_text and p.get("test_list"): + seeds.append({"prompt": prompt_text, "test_list": p["test_list"]}) + + def mbpp_prompt(p): return f"# Task: {p['prompt']}\n# Tests:\n# " + "\n# ".join(p["test_list"]) + "\n\n" + + sp = SamplingParams(temperature=0, max_tokens=400, stop=["\nclass Test", "\nif __name__"]) + g_outs = [o.outputs[0].text for o in llm.generate([mbpp_prompt(p) for p in seeds], sp, use_tqdm=False)] + hard_idx = [] + for i, (p, raw) in enumerate(zip(seeds, g_outs)): + if not run_python(raw + "\n\n" + "\n".join(p["test_list"]), 8): + hard_idx.append(i) + log(f" greedy: {len(seeds)-len(hard_idx)} pass, {len(hard_idx)} hard") + pairs = [] + if hard_idx: + sp2 = SamplingParams(temperature=0.8, top_p=0.95, max_tokens=400, n=8, + stop=["\nclass Test", "\nif __name__"]) + hard_prompts = [mbpp_prompt(seeds[i]) for i in hard_idx] + sample_outs = llm.generate(hard_prompts, sp2, use_tqdm=False) + for j, i in enumerate(hard_idx): + attempts = [o.text for o in sample_outs[j].outputs] + for a in attempts: + if run_python(a + "\n\n" + "\n".join(seeds[i]["test_list"]), 8): + pairs.append({"problem": seeds[i]["prompt"], "tests": seeds[i]["test_list"], + "broken": g_outs[i].strip(), "fixed": a.strip()}) + break + log(f" mined {len(pairs)} code pairs") + + if len(pairs) < 5: + log("too few pairs, skipping train") + result = {"model": args.model, "n_pairs": len(pairs), + "base_he": base_he[0], "base_math": base_math[0]} + with open(f"{args.out_dir}/result.json", "w") as fh: json.dump(result, fh, indent=2) + return + + # Tear down vLLM, train LoRA + del llm; gc.collect(); torch.cuda.empty_cache() + from transformers import AutoModelForCausalLM, TrainingArguments, Trainer + from datasets import Dataset as HFDataset + from peft import LoraConfig, get_peft_model + + def mk_ex(r): + user = (f"# Task: {r['problem']}\n# Tests:\n# " + "\n# ".join(r['tests']) + "\n" + f"# My broken 
attempt:\n{r['broken']}\n# Corrected:\n") + full = user + r["fixed"] + full_ids = tok(full, add_special_tokens=False)["input_ids"] + user_ids = tok(user, add_special_tokens=False)["input_ids"] + MAX = 1024 + full_ids = full_ids[:MAX] + labels = list(full_ids); n_user = min(len(user_ids), len(labels)) + for i in range(n_user): labels[i] = -100 + pad = MAX - len(full_ids) + return {"input_ids": full_ids + [tok.pad_token_id]*pad, + "attention_mask": [1]*len(full_ids) + [0]*pad, + "labels": labels + [-100]*pad} + + log("training LoRA on code pairs...") + model = AutoModelForCausalLM.from_pretrained(args.model, torch_dtype=torch.bfloat16, device_map="cuda:0") + lora_cfg = LoraConfig(r=16, lora_alpha=32, lora_dropout=0.05, bias="none", + target_modules=["q_proj", "k_proj", "v_proj", "o_proj"], task_type="CAUSAL_LM") + model = get_peft_model(model, lora_cfg) + ds_train = HFDataset.from_list([mk_ex(r) for r in pairs]) + targs = TrainingArguments( + output_dir=f"{args.out_dir}/ckpt", num_train_epochs=2, + per_device_train_batch_size=1, gradient_accumulation_steps=4, + learning_rate=1e-4, bf16=True, logging_steps=20, + save_strategy="no", report_to="none", remove_unused_columns=False, warmup_ratio=0.05, + ) + Trainer(model=model, args=targs, train_dataset=ds_train, tokenizer=tok).train() + adapter_dir = f"{args.out_dir}/adapter" + model.save_pretrained(adapter_dir) + del model; gc.collect(); torch.cuda.empty_cache() + log("training done") + + # Re-eval with adapter + log("=== TRAINED evals ===") + from vllm import LLM as LLM2 + from vllm.lora.request import LoRARequest + llm = LLM2(model=args.model, dtype="bfloat16", gpu_memory_utilization=0.85, max_model_len=2048, + enable_lora=True, max_lora_rank=16) + lora_req = LoRARequest("trained", 1, adapter_dir) + tr_he = eval_he(llm, lora_req) + tr_math = eval_math(llm, lora_req) + log(f" trained HE: {tr_he[0]}/{tr_he[1]} MATH: {tr_math[0]}/{tr_math[1]}") + + result = { + "model": args.model, "train_domain": args.train_domain, + 
"n_pairs": len(pairs), + "humaneval": {"base": base_he[0], "trained": tr_he[0], "delta": tr_he[0]-base_he[0], "n": base_he[1]}, + "math500": {"base": base_math[0], "trained": tr_math[0], "delta": tr_math[0]-base_math[0], "n": base_math[1]}, + "elapsed_s": time.time() - T0, + } + with open(f"{args.out_dir}/result.json", "w") as fh: json.dump(result, fh, indent=2) + + print() + print("=" * 70) + print(f" {args.model} — CROSS-DOMAIN ({args.train_domain} train, eval HE+MATH)") + print(f" HE: base={base_he[0]}/{base_he[1]} trained={tr_he[0]}/{tr_he[1]} Δ={tr_he[0]-base_he[0]:+d}") + print(f" MATH: base={base_math[0]}/{base_math[1]} trained={tr_math[0]}/{tr_math[1]} Δ={tr_math[0]-base_math[0]:+d}") + print(f" Time: {time.time()-T0:.0f}s") + print("=" * 70) + + +if __name__ == "__main__": + main() diff --git a/experiments/diversity_cued_mining.py b/experiments/diversity_cued_mining.py new file mode 100644 index 0000000..c615729 --- /dev/null +++ b/experiments/diversity_cued_mining.py @@ -0,0 +1,180 @@ +"""Diversity-aware mining: prompt model with multiple cognitive lenses, mine pairs WITHOUT including failed code. 
def mbpp_prompt(p):
    """Render an MBPP problem as a comment-only prompt header.

    Args:
        p: Mapping with a ``"prompt"`` entry (task text) and a ``"test_list"``
            entry (iterable of test-statement strings).

    Returns:
        A string with a ``# Task:`` line, a ``# Tests:`` line, every test on
        its own ``# ``-prefixed line, and a trailing blank line.
    """
    task_line = "# Task: {}".format(p["prompt"])
    commented_tests = "# " + "\n# ".join(p["test_list"])
    return "\n".join([task_line, "# Tests:", commented_tests]) + "\n\n"
random.shuffle(mbpp_full) + seeds = [] + for p in mbpp_full[:args.n_mining]: + prompt_text = p.get("prompt") or p.get("text", "") + if prompt_text and p.get("test_list"): + seeds.append({"prompt": prompt_text, "test_list": p["test_list"]}) + log(f" HE: {len(he)}, MBPP-test: {len(mbpp_test)}, mining: {len(seeds)}") + + # Base eval + sp_g = SamplingParams(temperature=0, max_tokens=400, stop=["\nclass ", "\nif __name__", "\n\nprint"]) + he_outs = [o.outputs[0].text for o in llm.generate([he_prompt(p) for p in he], sp_g, use_tqdm=False)] + base_he = sum(1 for p, raw in zip(he, he_outs) + if run_python(p["prompt"] + "\n" + raw + "\n\n" + p["test"] + f"\n\ncheck({p['entry_point']})", 10)) + mbpp_outs = [o.outputs[0].text for o in llm.generate([mbpp_prompt(p) for p in mbpp_test], sp_g, use_tqdm=False)] + base_mbpp = sum(1 for p, raw in zip(mbpp_test, mbpp_outs) + if run_python(raw + "\n\n" + "\n".join(p["test_list"]), 10)) + log(f"BASE: HE={base_he}/{len(he)} MBPP={base_mbpp}/{len(mbpp_test)}") + + # Mine: for each problem, generate 4 lens-cued attempts, keep one that works + log("mining with cued diversity...") + pairs = [] + for lens_name, lens_hint in LENS_PROMPTS: + log(f" lens: {lens_name}") + # Prefill prompts with lens hint + prefilled = [] + for s in seeds: + base = mbpp_prompt(s) + f"# Approach: {lens_name}.\n{lens_hint}\ndef solution" + prefilled.append(base) + sp = SamplingParams(temperature=0.7, top_p=0.95, max_tokens=300, + stop=["\nclass Test", "\nif __name__", "\n\nprint", "\n# Task"]) + outs = [o.outputs[0].text for o in llm.generate(prefilled, sp, use_tqdm=False)] + # Verify each + for s, raw in zip(seeds, outs): + code = "def solution" + raw + if run_python(code + "\n\n" + "\n".join(s["test_list"]), 8): + # Greedy attempt to use as broken + greedy = [o.outputs[0].text for o in llm.generate([mbpp_prompt(s)], sp_g, use_tqdm=False)][0] + if not run_python(greedy + "\n\n" + "\n".join(s["test_list"]), 8): + pairs.append({"problem": s["prompt"], "tests": 
s["test_list"], + "broken": greedy.strip(), "fixed": code.strip(), + "lens": lens_name}) + log(f"mined {len(pairs)} pairs across lenses") + + with open(f"{args.out_dir}/pairs.jsonl", "w") as fh: + for r in pairs: fh.write(json.dumps(r) + "\n") + + if len(pairs) < 5: + result = {"model": args.model, "n_pairs": len(pairs), "base_he": base_he, "base_mbpp": base_mbpp} + with open(f"{args.out_dir}/result.json", "w") as fh: json.dump(result, fh, indent=2) + return + + # Train flat + del llm; gc.collect(); torch.cuda.empty_cache() + from transformers import AutoModelForCausalLM, TrainingArguments, Trainer + from datasets import Dataset as HFDataset + from peft import LoraConfig, get_peft_model + + def mk_ex(r): + user = (f"# Task: {r['problem']}\n# Tests:\n# " + "\n# ".join(r['tests']) + "\n" + f"# My broken attempt:\n{r['broken']}\n# Corrected:\n") + full = user + r["fixed"] + full_ids = tok(full, add_special_tokens=False)["input_ids"] + user_ids = tok(user, add_special_tokens=False)["input_ids"] + MAX = 1024 + full_ids = full_ids[:MAX] + labels = list(full_ids); n_user = min(len(user_ids), len(labels)) + for i in range(n_user): labels[i] = -100 + pad = MAX - len(full_ids) + return {"input_ids": full_ids + [tok.pad_token_id]*pad, + "attention_mask": [1]*len(full_ids) + [0]*pad, + "labels": labels + [-100]*pad} + + log("training...") + model = AutoModelForCausalLM.from_pretrained(args.model, torch_dtype=torch.bfloat16, device_map="cuda:0") + lora_cfg = LoraConfig(r=16, lora_alpha=32, lora_dropout=0.05, bias="none", + target_modules=["q_proj", "k_proj", "v_proj", "o_proj"], task_type="CAUSAL_LM") + model = get_peft_model(model, lora_cfg) + ds_train = HFDataset.from_list([mk_ex(r) for r in pairs]) + targs = TrainingArguments( + output_dir=f"{args.out_dir}/ckpt", num_train_epochs=2, + per_device_train_batch_size=1, gradient_accumulation_steps=4, + learning_rate=1e-4, bf16=True, logging_steps=20, + save_strategy="no", report_to="none", remove_unused_columns=False, 
warmup_ratio=0.05, + ) + Trainer(model=model, args=targs, train_dataset=ds_train, tokenizer=tok).train() + adapter_dir = f"{args.out_dir}/adapter" + model.save_pretrained(adapter_dir) + del model; gc.collect(); torch.cuda.empty_cache() + + # Trained eval + from vllm import LLM as LLM2 + from vllm.lora.request import LoRARequest + llm = LLM2(model=args.model, dtype="bfloat16", gpu_memory_utilization=0.85, max_model_len=2048, + enable_lora=True, max_lora_rank=16) + lora_req = LoRARequest("trained", 1, adapter_dir) + he_outs = [o.outputs[0].text for o in llm.generate([he_prompt(p) for p in he], sp_g, lora_request=lora_req, use_tqdm=False)] + tr_he = sum(1 for p, raw in zip(he, he_outs) + if run_python(p["prompt"] + "\n" + raw + "\n\n" + p["test"] + f"\n\ncheck({p['entry_point']})", 10)) + mbpp_outs = [o.outputs[0].text for o in llm.generate([mbpp_prompt(p) for p in mbpp_test], sp_g, lora_request=lora_req, use_tqdm=False)] + tr_mbpp = sum(1 for p, raw in zip(mbpp_test, mbpp_outs) + if run_python(raw + "\n\n" + "\n".join(p["test_list"]), 10)) + + result = { + "model": args.model, "n_pairs": len(pairs), + "humaneval": {"base": base_he, "trained": tr_he, "delta": tr_he-base_he, "n": len(he)}, + "mbpp": {"base": base_mbpp, "trained": tr_mbpp, "delta": tr_mbpp-base_mbpp, "n": len(mbpp_test)}, + "elapsed_s": time.time() - T0, + } + with open(f"{args.out_dir}/result.json", "w") as fh: json.dump(result, fh, indent=2) + + print() + print("=" * 70) + print(f" {args.model} — DIVERSITY-CUED MINING ({len(pairs)} pairs)") + print(f" HE: base={base_he}/{len(he)} trained={tr_he}/{len(he)} Δ={tr_he-base_he:+d}") + print(f" MBPP: base={base_mbpp}/{len(mbpp_test)} trained={tr_mbpp}/{len(mbpp_test)} Δ={tr_mbpp-base_mbpp:+d}") + print(f" Time: {time.time()-T0:.0f}s") + print("=" * 70) + + +if __name__ == "__main__": + main() diff --git a/experiments/math500_seeded_mining.py b/experiments/math500_seeded_mining.py new file mode 100644 index 0000000..db6986e --- /dev/null +++ 
b/experiments/math500_seeded_mining.py @@ -0,0 +1,276 @@ +"""TinyForge-Zero math with MATH-train-split as problem seeds. + +Recipe: + 1. Sample N problems from MATH train split (NOT test). + 2. Greedy solve each. Verify with sympy against gold answer. + 3. If greedy correct → save (problem, greedy_solution) as positive. + 4. If greedy wrong, sample 4 attempts at temp=0.8. + Some pass → mine pair: (problem, sampled_correct_solution). + 5. Repeat until max_pairs. + 6. Train LoRA on pairs. + 7. Eval on MATH-500 (test). + +Uses MATH train as problem source — model still self-generates ALL solutions. +No human solutions used. +""" +import os, json, time, re, argparse, random +os.environ.setdefault("HF_HOME", "/workspace/hf") +os.environ.setdefault("HF_HUB_ENABLE_HF_TRANSFER", "1") +os.environ["TRANSFORMERS_VERBOSITY"] = "error" + +import torch +from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, Trainer +from datasets import load_dataset, Dataset as HFDataset +from peft import LoraConfig, get_peft_model +import sympy +from sympy.parsing.latex import parse_latex + +T0 = time.time() +def log(m): print(f"[{time.time()-T0:7.1f}s] {m}", flush=True) + + +SOLVE_PROMPT = """Solve this competition math problem. Show your reasoning, then put the final answer in \\boxed{{...}}. 
def normalize(s):
    """Canonicalize a LaTeX answer string before comparison.

    Strips surrounding whitespace and a leading/trailing ``$``, unwraps
    ``\\text{...}`` and ``\\mbox{...}``, removes thousands-separator commas,
    and drops ``\\left`` / ``\\right`` sizing commands and degree markers.

    Args:
        s: Answer string, or None.

    Returns:
        The normalized string, or None when *s* is None.
    """
    if s is None:
        return None
    s = s.strip()
    s = re.sub(r"^\$|\$$", "", s).strip()
    s = re.sub(r"\\text\{([^}]*)\}", r"\1", s)
    s = re.sub(r"\\mbox\{([^}]*)\}", r"\1", s)
    # Only drop commas that act as thousands separators (followed by exactly
    # three digits). Removing every digit-comma-digit comma would turn
    # "(1,2)" into "(12)" and "1,2,3" into "123".
    s = re.sub(r"(?<=\d),(?=\d{3}(?!\d))", "", s)
    # Strip \left / \right only when they are the whole command, so that
    # \leftarrow, \rightarrow, etc. are left intact.
    s = re.sub(r"\\(?:left|right)(?![A-Za-z])", "", s)
    s = s.replace("^\\circ", "").replace("^{\\circ}", "")
    return s.strip()
def make_train_example(problem, solution, tok):
    """Build one fixed-length supervised example from a (problem, solution) pair.

    The chat template is rendered twice — once up to the generation prompt and
    once including the assistant solution — and both renderings are tokenized
    without extra special tokens. The full sequence is cut to 1280 tokens and
    padded with ``tok.pad_token_id`` to exactly that length.

    Args:
        problem: Problem statement substituted into ``SOLVE_PROMPT``.
        solution: Assistant text used as the training target.
        tok: Tokenizer providing ``apply_chat_template``, ``__call__`` and
            ``pad_token_id``.

    Returns:
        Dict with ``input_ids``, ``attention_mask`` and ``labels``; labels are
        -100 on the prompt prefix and on the padding.
    """
    max_len = 1280
    ignore_index = -100

    conversation = [
        {"role": "system", "content": "You are a careful math problem solver."},
        {"role": "user", "content": SOLVE_PROMPT.format(problem=problem)},
    ]
    prompt_text = tok.apply_chat_template(conversation, tokenize=False, add_generation_prompt=True)
    full_text = tok.apply_chat_template(
        conversation + [{"role": "assistant", "content": solution}], tokenize=False)

    prompt_ids = tok(prompt_text, add_special_tokens=False)["input_ids"]
    token_ids = tok(full_text, add_special_tokens=False)["input_ids"][:max_len]

    # Mask as many leading positions as the prompt occupies (capped at the
    # truncated sequence length) so only the solution tokens carry labels.
    n_masked = min(len(prompt_ids), len(token_ids))
    n_pad = max_len - len(token_ids)

    return {
        "input_ids": token_ids + [tok.pad_token_id] * n_pad,
        "attention_mask": [1] * len(token_ids) + [0] * n_pad,
        "labels": [ignore_index] * n_masked + list(token_ids[n_masked:]) + [ignore_index] * n_pad,
    }
def main() -> None:
    """Mine self-generated MATH solutions, train a LoRA on them, and re-evaluate.

    Flow: parse CLI flags, seed the RNGs, load the model, run a baseline
    MATH-500 eval, then for up to ``--iterations`` rounds take
    ``--problems_per_iter`` train problems, keep greedy solutions whose boxed
    answer matches the gold answer, and for the rest keep the first of four
    temperature-0.8 samples that matches. The collected pairs are written to
    ``pairs.jsonl``, used to train via ``train_on_pairs``, and the model is
    evaluated again; results go to ``result.json`` under
    ``/workspace/math500_seeded/<tag>``.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--model", required=True)
    ap.add_argument("--iterations", type=int, default=6)
    ap.add_argument("--problems_per_iter", type=int, default=32)
    ap.add_argument("--n_eval", type=int, default=500)
    ap.add_argument("--max_pairs", type=int, default=120)
    ap.add_argument("--seed", type=int, default=42)
    ap.add_argument("--tag", required=True)
    args = ap.parse_args()

    out_dir = f"/workspace/math500_seeded/{args.tag}"
    os.makedirs(out_dir, exist_ok=True)
    # Seed both Python's and torch's RNGs before shuffling and sampling.
    random.seed(args.seed); torch.manual_seed(args.seed)

    log(f"loading {args.model}")
    tok = AutoTokenizer.from_pretrained(args.model)
    if tok.pad_token is None: tok.pad_token = tok.eos_token
    tok.padding_side = "left"
    model = AutoModelForCausalLM.from_pretrained(args.model, torch_dtype=torch.bfloat16, device_map="cuda:0")
    log(f" loaded mem={torch.cuda.memory_allocated('cuda:0')/1e9:.1f}GB")

    log("loading MATH train split")
    train_ds = []
    # Each subject config is loaded independently; a failing config is logged
    # and skipped rather than aborting the run.
    for cfg in ["algebra","counting_and_probability","geometry","intermediate_algebra","number_theory","prealgebra","precalculus"]:
        try:
            sub = list(load_dataset("EleutherAI/hendrycks_math", cfg, split="train"))
            train_ds.extend(sub)
        except Exception as e:
            log(f" warn: failed to load {cfg}: {e}")
    log(f" {len(train_ds)} train problems")
    random.shuffle(train_ds)

    model.eval()
    log("INITIAL eval on MATH-500")
    base_c, base_n = math500_eval(model, tok, n=args.n_eval)
    # NOTE(review): this division raises ZeroDivisionError if base_n is 0
    # (e.g. --n_eval 0) — confirm whether that input needs guarding.
    log(f" MATH-500 base: {base_c}/{base_n} ({100*base_c/base_n:.1f}%)")

    pairs = []
    # Index of the next unread problem in the shuffled train list; it only
    # moves forward, so no problem is drawn twice.
    cursor = 0

    def gold_of(p):
        # Gold answer = whatever extract_boxed finds in the reference
        # "solution" field; None when nothing can be extracted.
        ans = p.get("solution", "")
        b = extract_boxed(ans)
        return b

    for it in range(1, args.iterations + 1):
        log(f"--- iter {it} ---")
        batch_size = args.problems_per_iter
        # Collect the next batch, skipping problems without an extractable gold answer.
        batch_problems = []
        while len(batch_problems) < batch_size and cursor < len(train_ds):
            p = train_ds[cursor]; cursor += 1
            gold = gold_of(p)
            if gold is not None:
                batch_problems.append({"problem": p["problem"], "gold": gold})
        if not batch_problems:
            log(" exhausted train problems"); break

        # Greedy pass: a correct greedy solution is kept directly as a pair.
        prompts = [SOLVE_PROMPT.format(problem=p["problem"]) for p in batch_problems]
        greedy_outs = gen_batch(model, tok, prompts, max_new=600, temperature=0.0, batch=16)
        greedy_correct, hard_idx = 0, []
        for i, (p, raw) in enumerate(zip(batch_problems, greedy_outs)):
            pred = extract_boxed(raw)
            if sympy_equal(pred, p["gold"]):
                pairs.append({"problem": p["problem"], "solution": raw.strip(), "source": "greedy"})
                greedy_correct += 1
            else:
                hard_idx.append(i)
        log(f" iter {it}: {greedy_correct} greedy-correct, {len(hard_idx)} hard")

        # Sampling pass: four temperature-0.8 attempts per greedy failure.
        if hard_idx:
            hard_problems = [batch_problems[i] for i in hard_idx]
            sample_prompts = []
            for p in hard_problems:
                # Each prompt is repeated 4 times so attempts for problem i
                # occupy the slice [i*4, (i+1)*4) of sample_outs.
                sample_prompts.extend([SOLVE_PROMPT.format(problem=p["problem"])] * 4)
            sample_outs = gen_batch(model, tok, sample_prompts, max_new=600, temperature=0.8, batch=16)
            sampled_correct = 0
            for i, p in enumerate(hard_problems):
                attempts = sample_outs[i*4:(i+1)*4]
                preds = [extract_boxed(a) for a in attempts]
                correct_idx = [j for j, pr in enumerate(preds) if sympy_equal(pr, p["gold"])]
                if correct_idx:
                    # Keep only the first matching attempt for this problem.
                    pairs.append({"problem": p["problem"], "solution": attempts[correct_idx[0]].strip(), "source": "sampled"})
                    sampled_correct += 1
            log(f" iter {it}: {sampled_correct} sampled-correct (from {len(hard_idx)} hard)")

        log(f" iter {it}: pairs total = {len(pairs)}")
        # NOTE(review): the cap is checked once per iteration and pairs is
        # never truncated, so the final count can exceed --max_pairs by up to
        # one batch — confirm that overshoot is intended.
        if len(pairs) >= args.max_pairs:
            log(f" reached max_pairs={args.max_pairs}, stopping")
            break

    log(f"=== mined {len(pairs)} total pairs ===")
    with open(f"{out_dir}/pairs.jsonl", "w") as fh:
        for p in pairs: fh.write(json.dumps(p) + "\n")

    if not pairs:
        # Nothing to train on; no result.json is written on this path.
        log("no pairs — exiting"); return

    model = train_on_pairs(model, tok, pairs, out_dir)
    log("training done")

    model.eval()
    log("FINAL eval on MATH-500")
    tr_c, tr_n = math500_eval(model, tok, n=args.n_eval)
    log(f" MATH-500 trained: {tr_c}/{tr_n} ({100*tr_c/tr_n:.1f}%)")

    result = {
        "model": args.model, "n_pairs": len(pairs),
        "base": base_c, "trained": tr_c, "n": tr_n,
        "delta": tr_c - base_c, "elapsed_s": time.time() - T0,
    }
    with open(f"{out_dir}/result.json", "w") as fh: json.dump(result, fh, indent=2)

    print()
    print("=" * 70)
    print(f" {args.model}")
    print(f" MATH-500: base={base_c}/{tr_n} trained={tr_c}/{tr_n} Δ={tr_c-base_c:+d}")
    print(f" Pairs mined: {len(pairs)}")
    print(f" Time: {time.time()-T0:.0f}s")
    print("=" * 70)
def run_python(code, timeout=8):
    """Execute *code* in a fresh ``python3`` subprocess.

    The source is written to a temporary ``.py`` file, run with ``/tmp`` as
    the working directory, and the file is removed afterwards.

    Args:
        code: Python source text to execute.
        timeout: Seconds to wait before the run counts as a failure.

    Returns:
        ``(ok, err)`` where ``ok`` is True when the process exits with return
        code 0 and ``err`` is the first 200 characters of its stderr; on
        timeout the result is ``(False, "timeout")``.
    """
    # Write as UTF-8: the interpreter reads source files as UTF-8, so relying
    # on the locale encoding can fail on non-ASCII generated code.
    with tempfile.NamedTemporaryFile("w", suffix=".py", delete=False, encoding="utf-8") as f:
        f.write(code)
        path = f.name
    try:
        # errors="replace" keeps undecodable child output from raising here.
        r = subprocess.run(["python3", path], capture_output=True, timeout=timeout,
                           text=True, errors="replace", cwd="/tmp")
        # NOTE(review): this keeps the HEAD of stderr, which for a traceback
        # is mostly the "Traceback ..." preamble rather than the final
        # exception line — confirm that is what downstream prompts want.
        return r.returncode == 0, (r.stderr or "")[:200]
    except subprocess.TimeoutExpired:
        return False, "timeout"
    finally:
        # Best-effort cleanup; only filesystem errors are ignored so that
        # KeyboardInterrupt / SystemExit still propagate.
        try:
            os.unlink(path)
        except OSError:
            pass
tok.eos_token + llm = LLM(model=args.model, dtype="bfloat16", gpu_memory_utilization=0.85, max_model_len=2048) + log(f" loaded") + + # --- Load benchmarks + he = list(load_dataset("openai_humaneval", split="test")) + mbpp_test = list(load_dataset("mbpp", "sanitized", split="test"))[:200] + mbpp_train = list(load_dataset("mbpp", "sanitized", split="train")) + log(f" HE: {len(he)}, MBPP-test: {len(mbpp_test)}, MBPP-train: {len(mbpp_train)}") + + # --- BASE eval + log("=== BASE evals ===") + t0 = time.time() + he_outs = vllm_gen(llm, [he_prompt(p) for p in he], max_new=400) + log(f" HE base gen done in {time.time()-t0:.1f}s") + base_he = 0 + for p, raw in zip(he, he_outs): + full = p["prompt"] + raw + test_code = full + "\n\n" + p["test"] + f"\n\ncheck({p['entry_point']})" + ok, _ = run_python(test_code, timeout=10) + if ok: base_he += 1 + + t1 = time.time() + mbpp_outs = vllm_gen(llm, [mbpp_prompt(p) for p in mbpp_test], max_new=400) + log(f" MBPP-test base gen done in {time.time()-t1:.1f}s") + base_mbpp = 0 + for p, raw in zip(mbpp_test, mbpp_outs): + test_code = raw + "\n\n" + "\n".join(p["test_list"]) + ok, _ = run_python(test_code, timeout=10) + if ok: base_mbpp += 1 + log(f" BASE: HE={base_he}/{len(he)} MBPP={base_mbpp}/{len(mbpp_test)}") + + # --- Mine pairs from MBPP-train + log(f"=== mining from {len(mbpp_train)} MBPP-train problems ===") + train_prompts = [mbpp_prompt(p) for p in mbpp_train] + # greedy attempt + t0 = time.time() + greedy_outs = vllm_gen(llm, train_prompts, max_new=400) + log(f" greedy gen in {time.time()-t0:.1f}s") + pairs = [] + hard_indices = [] + for i, (p, raw) in enumerate(zip(mbpp_train, greedy_outs)): + test_code = raw + "\n\n" + "\n".join(p["test_list"]) + ok, err = run_python(test_code, timeout=8) + if not ok: + hard_indices.append((i, p, raw, err)) + log(f" {len(mbpp_train) - len(hard_indices)} greedy-correct, {len(hard_indices)} hard") + + if not hard_indices: + log("nothing to mine — base too strong"); return + + # sample N 
attempts per hard problem + log(f" sampling {args.attempts_per} attempts × {len(hard_indices)} hard problems...") + hard_prompts = [] + for _i, p, _r, _e in hard_indices: + hard_prompts.append(mbpp_prompt(p)) + t1 = time.time() + sample_outs = vllm_gen(llm, hard_prompts, max_new=400, temperature=0.8, n=args.attempts_per) + log(f" sample gen in {time.time()-t1:.1f}s") + + t2 = time.time() + for (idx, p, greedy_raw, err), attempts in zip(hard_indices, sample_outs): + # check each attempt + passes = [] + for a in attempts: + test_code = a + "\n\n" + "\n".join(p["test_list"]) + ok, _ = run_python(test_code, timeout=8) + if ok: passes.append(a) + if passes: + pairs.append({ + "problem": p["prompt"], + "tests": p["test_list"], + "broken": greedy_raw.strip(), + "fixed": passes[0].strip(), + "error": err, + }) + if len(pairs) >= args.max_pairs: break + log(f" verification in {time.time()-t2:.1f}s — mined {len(pairs)} pairs") + + with open(f"{out_dir}/pairs.jsonl", "w") as fh: + for r in pairs: fh.write(json.dumps(r) + "\n") + + if len(pairs) < 5: + log("too few pairs — exiting"); return + + # --- Train LoRA + log("=== TRAINING ===") + del llm; gc.collect(); torch.cuda.empty_cache() + from transformers import AutoModelForCausalLM, TrainingArguments, Trainer + from datasets import Dataset as HFDataset + from peft import LoraConfig, get_peft_model + + def make_ex(r): + user = (f"# Task: {r['problem']}\n" + f"# Tests:\n# " + "\n# ".join(r['tests']) + "\n" + f"# My broken attempt:\n{r['broken']}\n" + f"# Error: {r.get('error','')[:120]}\n" + f"# Corrected:\n") + target = r["fixed"] + full = user + target + full_ids = tok(full, add_special_tokens=False)["input_ids"] + user_ids = tok(user, add_special_tokens=False)["input_ids"] + MAX = 1024 + full_ids = full_ids[:MAX] + labels = list(full_ids) + n_user = min(len(user_ids), len(labels)) + for i in range(n_user): labels[i] = -100 + pad = MAX - len(full_ids) + return {"input_ids": full_ids + [tok.pad_token_id]*pad, + 
"attention_mask": [1]*len(full_ids) + [0]*pad, + "labels": labels + [-100]*pad} + + model = AutoModelForCausalLM.from_pretrained(args.model, torch_dtype=torch.bfloat16, device_map="cuda:0") + lora_cfg = LoraConfig(r=16, lora_alpha=32, lora_dropout=0.05, bias="none", + target_modules=["q_proj", "k_proj", "v_proj", "o_proj"], task_type="CAUSAL_LM") + model = get_peft_model(model, lora_cfg) + ds = HFDataset.from_list([make_ex(r) for r in pairs]) + targs = TrainingArguments( + output_dir=f"{out_dir}/ckpt", num_train_epochs=2, + per_device_train_batch_size=2, gradient_accumulation_steps=4, + learning_rate=1e-4, bf16=True, logging_steps=20, + save_strategy="no", report_to="none", remove_unused_columns=False, warmup_ratio=0.05, + ) + Trainer(model=model, args=targs, train_dataset=ds, tokenizer=tok).train() + log("training done") + adapter_dir = f"{out_dir}/adapter" + model.save_pretrained(adapter_dir) + del model; gc.collect(); torch.cuda.empty_cache() + + # --- TRAINED eval + from vllm import LLM + from vllm.lora.request import LoRARequest + llm = LLM(model=args.model, dtype="bfloat16", gpu_memory_utilization=0.85, max_model_len=2048, + enable_lora=True, max_lora_rank=16) + lora_req = LoRARequest("tf_adapter", 1, adapter_dir) + from vllm import SamplingParams + sp = SamplingParams(temperature=0, max_tokens=400, stop=["\nclass ", "\nif __name__", "\n\nprint", "\n\ndef "]) + + log("=== TRAINED evals ===") + t0 = time.time() + he_outs = [o.outputs[0].text for o in llm.generate([he_prompt(p) for p in he], sp, lora_request=lora_req, use_tqdm=False)] + log(f" HE trained gen in {time.time()-t0:.1f}s") + tr_he = 0 + for p, raw in zip(he, he_outs): + full = p["prompt"] + raw + test_code = full + "\n\n" + p["test"] + f"\n\ncheck({p['entry_point']})" + ok, _ = run_python(test_code, timeout=10) + if ok: tr_he += 1 + + t1 = time.time() + mbpp_outs = [o.outputs[0].text for o in llm.generate([mbpp_prompt(p) for p in mbpp_test], sp, lora_request=lora_req, use_tqdm=False)] + log(f" 
MBPP-test trained gen in {time.time()-t1:.1f}s") + tr_mbpp = 0 + for p, raw in zip(mbpp_test, mbpp_outs): + test_code = raw + "\n\n" + "\n".join(p["test_list"]) + ok, _ = run_python(test_code, timeout=10) + if ok: tr_mbpp += 1 + + result = { + "model": args.model, "n_pairs": len(pairs), + "humaneval": {"base": base_he, "trained": tr_he, "delta": tr_he-base_he, "n": len(he)}, + "mbpp": {"base": base_mbpp, "trained": tr_mbpp, "delta": tr_mbpp-base_mbpp, "n": len(mbpp_test)}, + "elapsed_s": time.time() - T0, + } + with open(f"{out_dir}/result.json", "w") as fh: json.dump(result, fh, indent=2) + + print() + print("=" * 70) + print(f" {args.model} — MBPP-train SEEDED ({len(pairs)} pairs)") + print(f" HumanEval: base={base_he}/{len(he)} trained={tr_he}/{len(he)} Δ={tr_he-base_he:+d}") + print(f" MBPP: base={base_mbpp}/{len(mbpp_test)} trained={tr_mbpp}/{len(mbpp_test)} Δ={tr_mbpp-base_mbpp:+d}") + print(f" Time: {time.time()-T0:.0f}s") + print("=" * 70) + + +if __name__ == "__main__": + main() diff --git a/experiments/recipe_x_tts_synergy.py b/experiments/recipe_x_tts_synergy.py new file mode 100644 index 0000000..d14460b --- /dev/null +++ b/experiments/recipe_x_tts_synergy.py @@ -0,0 +1,210 @@ +"""Compound recipe + TTS: train recipe, then measure best-of-N on TOP of recipe-trained model. 
T0 = time.time()


def log(m):
    """Print *m* prefixed with the seconds elapsed since ``T0`` (module import time)."""
    print(f"[{time.time()-T0:7.1f}s] {m}", flush=True)


def run_python(code, timeout=10):
    """Run *code* as a standalone script and report whether it exited cleanly.

    The source is written to a temporary ``.py`` file and executed with
    ``python3`` from ``/tmp``.

    Args:
        code: Python source text to execute.
        timeout: Seconds to wait before the child is killed.

    Returns:
        True if the child exits with status 0 within *timeout*; False on a
        non-zero exit status or on timeout.
    """
    fd, path = tempfile.mkstemp(suffix=".py")
    try:
        # Python reads source files as UTF-8 by default, so write UTF-8
        # explicitly instead of whatever the locale encoding happens to be.
        with os.fdopen(fd, "w", encoding="utf-8") as f:
            f.write(code)
        # Only the exit status is used.  Discarding the child's output avoids
        # buffering it and avoids a UnicodeDecodeError in the harness when the
        # candidate program prints undecodable bytes; an empty stdin keeps a
        # stray input() from blocking until the timeout.
        r = subprocess.run(["python3", path], stdin=subprocess.DEVNULL,
                           stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL,
                           timeout=timeout, cwd="/tmp")
        return r.returncode == 0
    except subprocess.TimeoutExpired:
        return False
    finally:
        # Best-effort cleanup; the file is removed even if the write failed.
        try:
            os.unlink(path)
        except OSError:
            pass


def mbpp_prompt(p):
    """Build the MBPP completion prompt: task text plus the tests as ``#`` comments."""
    return f"# Task: {p['prompt']}\n# Tests:\n# " + "\n# ".join(p["test_list"]) + "\n\n"


def he_prompt(p):
    """Return the HumanEval prompt of problem *p* unchanged."""
    return p["prompt"]


def he_score_outputs(he, outs):
    """Count how many completions in *outs* pass their HumanEval tests.

    Args:
        he: Problems, each a mapping with ``prompt``, ``test`` and
            ``entry_point`` keys.
        outs: One raw completion string per problem, in the same order.

    Returns:
        Number of problems whose ``prompt + completion + test + check(entry_point)``
        script exits with status 0 within 10 seconds.
    """
    c = 0
    for p, raw in zip(he, outs):
        code = raw
        # Keep only the body of the first ```python fenced block, if any.
        # NOTE(review): the nesting of the closing-fence strip was reconstructed
        # from a whitespace-collapsed source - confirm against the real file.
        if "```python" in code:
            code = code.split("```python", 1)[1]
            if "```" in code:
                code = code.split("```", 1)[0]
        full = p["prompt"] + "\n" + code
        test_code = full + "\n\n" + p["test"] + f"\n\ncheck({p['entry_point']})"
        if run_python(test_code, 10):
            c += 1
    return c
def main():
    """Measure recipe x test-time-sampling interaction on HumanEval.

    Pipeline, in order: score the raw model greedily (A) and with best-of-8
    sampling (B); mine (broken, fixed) pairs from MBPP-train; train a LoRA
    adapter on those pairs; score the adapted model greedily (C) and with
    best-of-8 (D).  Counts are written to ``<out_dir>/result.json`` and
    printed as a summary.

    NOTE(review): block nesting was reconstructed from a whitespace-collapsed
    diff; confirm indentation against the real file.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--model", required=True)
    ap.add_argument("--tag", required=True)
    ap.add_argument("--out_dir", required=True)
    args = ap.parse_args()

    os.makedirs(args.out_dir, exist_ok=True)
    random.seed(42)

    from vllm import LLM, SamplingParams
    from transformers import AutoTokenizer
    log(f"loading {args.model}")
    tok = AutoTokenizer.from_pretrained(args.model)
    # The training examples below are padded with pad_token_id, so one must exist.
    if tok.pad_token is None: tok.pad_token = tok.eos_token
    llm = LLM(model=args.model, dtype="bfloat16", gpu_memory_utilization=0.85, max_model_len=2048)
    log("loaded")

    he = list(load_dataset("openai_humaneval", split="test"))

    # 4 metrics:
    # A) raw greedy
    # B) raw + best-of-8
    # C) recipe greedy
    # D) recipe + best-of-8

    # sp_g: greedy decoding; sp_s: 8 samples per prompt at temperature 0.6.
    sp_g = SamplingParams(temperature=0, max_tokens=400, stop=["\nclass ", "\nif __name__", "\n\nprint"])
    sp_s = SamplingParams(temperature=0.6, top_p=0.95, max_tokens=400, n=8,
                          stop=["\nclass ", "\nif __name__", "\n\nprint"])

    log("A) raw greedy")
    he_outs = [o.outputs[0].text for o in llm.generate([he_prompt(p) for p in he], sp_g, use_tqdm=False)]
    A_raw_greedy = he_score_outputs(he, he_outs)
    log(f" raw greedy: {A_raw_greedy}/{len(he)}")

    log("B) raw best-of-8")
    he_samples = llm.generate([he_prompt(p) for p in he], sp_s, use_tqdm=False)
    B_raw_bo8 = 0
    # A problem counts as solved as soon as any one of its samples passes the
    # tests (oracle selection); the inner loop stops at the first pass.
    for p, outset in zip(he, he_samples):
        for o in outset.outputs:
            code = o.text
            # Same fenced-block stripping as he_score_outputs.
            if "```python" in code:
                code = code.split("```python",1)[1]
                if "```" in code: code = code.split("```",1)[0]
            full = p["prompt"] + "\n" + code
            test_code = full + "\n\n" + p["test"] + f"\n\ncheck({p['entry_point']})"
            if run_python(test_code, 10):
                B_raw_bo8 += 1; break
    log(f" raw best-of-8: {B_raw_bo8}/{len(he)}")

    # Mine pairs
    log("mining pairs from MBPP-train...")
    mbpp_full = list(load_dataset("mbpp", split="train"))
    random.shuffle(mbpp_full)
    seeds = []
    # Take the first 200 shuffled problems that have both a task text
    # ("prompt" or "text" key) and a non-empty test list.
    for p in mbpp_full[:200]:
        prompt_text = p.get("prompt") or p.get("text", "")
        if prompt_text and p.get("test_list"):
            seeds.append({"prompt": prompt_text, "test_list": p["test_list"]})

    sp_mine = SamplingParams(temperature=0, max_tokens=400, stop=["\nclass Test", "\nif __name__"])
    g_outs = [o.outputs[0].text for o in llm.generate([mbpp_prompt(p) for p in seeds], sp_mine, use_tqdm=False)]
    # "Hard" seeds are those whose greedy completion fails the problem's tests.
    hard_idx = [i for i, (p, raw) in enumerate(zip(seeds, g_outs))
                if not run_python(raw + "\n\n" + "\n".join(p["test_list"]), 8)]
    log(f" hard: {len(hard_idx)}")
    pairs = []
    if hard_idx:
        sp_m2 = SamplingParams(temperature=0.8, top_p=0.95, max_tokens=400, n=8,
                               stop=["\nclass Test", "\nif __name__"])
        hard_prompts = [mbpp_prompt(seeds[i]) for i in hard_idx]
        sample_outs = llm.generate(hard_prompts, sp_m2, use_tqdm=False)
        # j indexes sample_outs (hard problems only); i indexes seeds / g_outs.
        for j, i in enumerate(hard_idx):
            for o in sample_outs[j].outputs:
                # First passing sample becomes "fixed"; the failed greedy
                # completion becomes "broken".  At most one pair per problem.
                if run_python(o.text + "\n\n" + "\n".join(seeds[i]["test_list"]), 8):
                    pairs.append({"problem": seeds[i]["prompt"], "tests": seeds[i]["test_list"],
                                  "broken": g_outs[i].strip(), "fixed": o.text.strip()}); break
    log(f" mined {len(pairs)} pairs")

    # Train LoRA
    # Drop the vLLM engine before the HF model is loaded onto the same GPU.
    del llm; gc.collect(); torch.cuda.empty_cache()
    if len(pairs) < 5:
        # NOTE(review): this early exit writes no result.json, so the A/B
        # numbers measured above survive only in the log output.
        log("too few pairs, exit"); return

    from transformers import AutoModelForCausalLM, TrainingArguments, Trainer
    from datasets import Dataset as HFDataset
    from peft import LoraConfig, get_peft_model

    def mk_ex(r):
        # Build one fixed-length (MAX tokens) training example from a mined pair.
        user = (f"# Task: {r['problem']}\n# Tests:\n# " + "\n# ".join(r['tests']) + "\n"
                f"# My broken attempt:\n{r['broken']}\n# Corrected:\n")
        full = user + r["fixed"]
        full_ids = tok(full, add_special_tokens=False)["input_ids"]
        user_ids = tok(user, add_special_tokens=False)["input_ids"]
        MAX = 1024
        full_ids = full_ids[:MAX]
        # Labels of -100 on the prompt prefix and on padding: only the
        # "fixed" continuation carries real labels.
        labels = list(full_ids); n_user = min(len(user_ids), len(labels))
        for i in range(n_user): labels[i] = -100
        pad = MAX - len(full_ids)
        return {"input_ids": full_ids + [tok.pad_token_id]*pad,
                "attention_mask": [1]*len(full_ids) + [0]*pad,
                "labels": labels + [-100]*pad}

    log("training...")
    model = AutoModelForCausalLM.from_pretrained(args.model, torch_dtype=torch.bfloat16, device_map="cuda:0")
    lora_cfg = LoraConfig(r=16, lora_alpha=32, lora_dropout=0.05, bias="none",
                          target_modules=["q_proj", "k_proj", "v_proj", "o_proj"], task_type="CAUSAL_LM")
    model = get_peft_model(model, lora_cfg)
    ds_train = HFDataset.from_list([mk_ex(r) for r in pairs])
    targs = TrainingArguments(
        output_dir=f"{args.out_dir}/ckpt", num_train_epochs=2,
        per_device_train_batch_size=1, gradient_accumulation_steps=4,
        learning_rate=1e-4, bf16=True, logging_steps=20,
        save_strategy="no", report_to="none", remove_unused_columns=False, warmup_ratio=0.05,
    )
    Trainer(model=model, args=targs, train_dataset=ds_train, tokenizer=tok).train()
    adapter_dir = f"{args.out_dir}/adapter"
    model.save_pretrained(adapter_dir)
    # Free the HF model before vLLM is started again for the adapted evals.
    del model; gc.collect(); torch.cuda.empty_cache()

    # C, D
    from vllm import LLM as LLM2
    from vllm.lora.request import LoRARequest
    # max_lora_rank matches the r=16 used in LoraConfig above.
    llm = LLM2(model=args.model, dtype="bfloat16", gpu_memory_utilization=0.85, max_model_len=2048,
               enable_lora=True, max_lora_rank=16)
    lora_req = LoRARequest("trained", 1, adapter_dir)

    log("C) recipe greedy")
    he_outs = [o.outputs[0].text for o in llm.generate([he_prompt(p) for p in he], sp_g, lora_request=lora_req, use_tqdm=False)]
    C_rec_greedy = he_score_outputs(he, he_outs)
    log(f" recipe greedy: {C_rec_greedy}/{len(he)}")

    log("D) recipe best-of-8")
    he_samples = llm.generate([he_prompt(p) for p in he], sp_s, lora_request=lora_req, use_tqdm=False)
    D_rec_bo8 = 0
    # Same any-sample-passes scoring as metric B, now with the adapter applied.
    for p, outset in zip(he, he_samples):
        for o in outset.outputs:
            code = o.text
            if "```python" in code:
                code = code.split("```python",1)[1]
                if "```" in code: code = code.split("```",1)[0]
            full = p["prompt"] + "\n" + code
            test_code = full + "\n\n" + p["test"] + f"\n\ncheck({p['entry_point']})"
            if run_python(test_code, 10):
                D_rec_bo8 += 1; break
    log(f" recipe best-of-8: {D_rec_bo8}/{len(he)}")

    result = {
        "model": args.model, "n_pairs": len(pairs),
        "raw_greedy": A_raw_greedy, "raw_bo8": B_raw_bo8,
        "recipe_greedy": C_rec_greedy, "recipe_bo8": D_rec_bo8,
        "n": len(he), "elapsed_s": time.time() - T0,
    }
    with open(f"{args.out_dir}/result.json", "w") as fh: json.dump(result, fh, indent=2)

    print()
    print("=" * 70)
    print(f" {args.model} — RECIPE × TTS COMPOUND (HumanEval, n={len(he)}, {len(pairs)} pairs)")
    print(f" A) Raw greedy: {A_raw_greedy:>3}/{len(he)} ({100*A_raw_greedy/len(he):.1f}%)")
    print(f" B) Raw best-of-8: {B_raw_bo8:>3}/{len(he)} ({100*B_raw_bo8/len(he):.1f}%)")
    print(f" C) Recipe greedy: {C_rec_greedy:>3}/{len(he)} ({100*C_rec_greedy/len(he):.1f}%)")
    print(f" D) Recipe best-of-8: {D_rec_bo8:>3}/{len(he)} ({100*D_rec_bo8/len(he):.1f}%)")
    # Synergy = how far D exceeds the better of the two single interventions.
    print(f" Synergy: D - max(B,C) = {D_rec_bo8 - max(B_raw_bo8, C_rec_greedy):+d} (>0 = real synergy)")
    print(f" Time: {time.time()-T0:.0f}s")
    print("=" * 70)
T0 = time.time()


def log(m):
    """Print *m* prefixed with the seconds elapsed since ``T0`` (module import time)."""
    print(f"[{time.time()-T0:7.1f}s] {m}", flush=True)


def run_python(code, timeout=10):
    """Run *code* as a standalone script and report whether it exited cleanly.

    The source is written to a temporary ``.py`` file and executed with
    ``python3`` from ``/tmp``.

    Args:
        code: Python source text to execute.
        timeout: Seconds to wait before the child is killed.

    Returns:
        True if the child exits with status 0 within *timeout*; False on a
        non-zero exit status or on timeout.
    """
    fd, path = tempfile.mkstemp(suffix=".py")
    try:
        # Python reads source files as UTF-8 by default, so write UTF-8
        # explicitly instead of whatever the locale encoding happens to be.
        with os.fdopen(fd, "w", encoding="utf-8") as f:
            f.write(code)
        # Only the exit status is used.  Discarding the child's output avoids
        # a UnicodeDecodeError in the harness when the candidate prints
        # undecodable bytes; an empty stdin keeps input() from blocking.
        r = subprocess.run(["python3", path], stdin=subprocess.DEVNULL,
                           stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL,
                           timeout=timeout, cwd="/tmp")
        return r.returncode == 0
    except subprocess.TimeoutExpired:
        return False
    finally:
        # Best-effort cleanup; the file is removed even if the write failed.
        try:
            os.unlink(path)
        except OSError:
            pass


def mbpp_prompt(p):
    """Build the MBPP completion prompt: task text plus the tests as ``#`` comments."""
    return f"# Task: {p['prompt']}\n# Tests:\n# " + "\n# ".join(p["test_list"]) + "\n\n"


def he_prompt(p):
    """Return the HumanEval prompt of problem *p* unchanged."""
    return p["prompt"]


def vllm_gen(llm, prompts, max_new=400, temperature=0.0, n=1, lora_req=None, stops=None):
    """Generate completions for *prompts* with a vLLM engine.

    Args:
        llm: Object exposing ``generate(prompts, sampling_params, ...)``.
        prompts: Prompt strings.
        max_new: Maximum number of new tokens per completion.
        temperature: Sampling temperature; ``top_p`` is 0.95 when it is
            positive and 1.0 otherwise.
        n: Completions requested per prompt.
        lora_req: Optional LoRA request; forwarded only when truthy.
        stops: Stop strings; a default list is used when falsy.

    Returns:
        When ``n == 1``, a list with one completion string per prompt;
        otherwise a list of lists of completion strings.  Callers that pass
        a variable ``n`` must handle both shapes.
    """
    from vllm import SamplingParams
    sp = SamplingParams(temperature=temperature, top_p=0.95 if temperature > 0 else 1.0,
                        max_tokens=max_new, n=n,
                        stop=stops or ["\nclass Test", "\nif __name__", "\n\nprint", "\nassert "])
    gen_kwargs = {"use_tqdm": False}
    if lora_req:
        gen_kwargs["lora_request"] = lora_req
    out = llm.generate(prompts, sp, **gen_kwargs)
    if n == 1:
        return [o.outputs[0].text for o in out]
    return [[c.text for c in o.outputs] for o in out]
def main():
    """Run the recursive self-bootstrap experiment (iter 1 -> ... -> n_iters).

    Each iteration loads the model in vLLM (with the previous iteration's
    adapter, if any), mines (broken, fixed) pairs from a fresh slice of
    MBPP-train seeds, scores HumanEval before training, trains a fresh LoRA
    adapter from the base model on all pairs accumulated so far, and scores
    HumanEval again with the new adapter.  Pairs and per-iteration results
    are written to ``<out_dir>/pairs.jsonl`` and ``<out_dir>/result.json``.

    NOTE(review): block nesting was reconstructed from a whitespace-collapsed
    diff; confirm indentation against the real file.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--model", required=True)
    ap.add_argument("--tag", required=True)
    ap.add_argument("--out_dir", required=True)
    ap.add_argument("--n_iters", type=int, default=3)
    ap.add_argument("--n_mining", type=int, default=200)
    ap.add_argument("--attempts_per", type=int, default=8)
    args = ap.parse_args()

    os.makedirs(args.out_dir, exist_ok=True)

    from vllm import LLM
    from vllm.lora.request import LoRARequest
    from transformers import AutoTokenizer
    log(f"loading {args.model}")
    tok = AutoTokenizer.from_pretrained(args.model)
    # Training examples are padded with pad_token_id, so one must exist.
    if tok.pad_token is None:
        tok.pad_token = tok.eos_token

    he = list(load_dataset("openai_humaneval", split="test"))
    mbpp_full = list(load_dataset("mbpp", split="train"))
    random.seed(42)
    random.shuffle(mbpp_full)
    seeds_pool = []
    # Keep problems that have both a task text and a non-empty test list.
    for p in mbpp_full[:args.n_mining * args.n_iters]:
        prompt_text = p.get("prompt") or p.get("text", "")
        if prompt_text and p.get("test_list"):
            seeds_pool.append({"prompt": prompt_text, "test_list": p["test_list"]})
    log(f"seeds pool: {len(seeds_pool)}")

    def score_he(llm, lora_req):
        # Greedy HumanEval pass count for the given engine / adapter.
        he_outs = vllm_gen(llm, [he_prompt(p) for p in he], max_new=400, lora_req=lora_req,
                           stops=["\nclass ", "\nif __name__", "\n\nprint"])
        he_correct = 0
        for p, raw in zip(he, he_outs):
            full = p["prompt"] + "\n" + raw
            test_code = full + "\n\n" + p["test"] + f"\n\ncheck({p['entry_point']})"
            if run_python(test_code, 10):
                he_correct += 1
        return he_correct

    def mk_ex(r):
        # Build one fixed-length (MAX tokens) training example from a mined pair.
        user = (f"# Task: {r['problem']}\n# Tests:\n# " + "\n# ".join(r['tests']) + "\n"
                f"# My broken attempt:\n{r['broken']}\n# Corrected:\n")
        target = r["fixed"]
        full = user + target
        full_ids = tok(full, add_special_tokens=False)["input_ids"]
        user_ids = tok(user, add_special_tokens=False)["input_ids"]
        MAX = 1024
        full_ids = full_ids[:MAX]
        # Labels of -100 on the prompt prefix and on padding: only the
        # "fixed" continuation carries real labels.
        labels = list(full_ids)
        n_user = min(len(user_ids), len(labels))
        for i in range(n_user):
            labels[i] = -100
        pad = MAX - len(full_ids)
        return {"input_ids": full_ids + [tok.pad_token_id]*pad,
                "attention_mask": [1]*len(full_ids) + [0]*pad,
                "labels": labels + [-100]*pad}

    iter_results = []
    accumulated_pairs = []
    current_adapter = None  # path of the most recently trained adapter

    for it in range(1, args.n_iters + 1):
        log(f"\n========== ITER {it} ==========")
        # Load model (with current adapter if exists)
        llm = LLM(model=args.model, dtype="bfloat16", gpu_memory_utilization=0.85,
                  max_model_len=2048,
                  enable_lora=(current_adapter is not None), max_lora_rank=16)
        lora_req = LoRARequest("cur", 1, current_adapter) if current_adapter else None
        log(f" loaded {'(with adapter)' if current_adapter else '(base)'}")

        # Mine pairs using current model; each iteration gets its own slice.
        seeds = seeds_pool[(it-1)*args.n_mining:it*args.n_mining]
        log(f" mining from {len(seeds)} new seeds")
        prompts = [mbpp_prompt(p) for p in seeds]
        greedy_outs = vllm_gen(llm, prompts, max_new=400, lora_req=lora_req)
        hard_idx = []
        for i, (p, raw) in enumerate(zip(seeds, greedy_outs)):
            test_code = raw + "\n\n" + "\n".join(p["test_list"])
            if not run_python(test_code, 8):
                hard_idx.append(i)
        log(f" greedy: {len(seeds)-len(hard_idx)} pass, {len(hard_idx)} hard")

        if hard_idx:
            hard_prompts = [mbpp_prompt(seeds[i]) for i in hard_idx]
            sample_outs = vllm_gen(llm, hard_prompts, max_new=400, temperature=0.8,
                                   n=args.attempts_per, lora_req=lora_req)
            new_pairs = []
            # j indexes sample_outs (hard problems only); i indexes seeds.
            for j, i in enumerate(hard_idx):
                attempts = sample_outs[j]
                # vllm_gen returns a bare string per prompt when n == 1; wrap it
                # so --attempts_per 1 does not iterate over single characters.
                if isinstance(attempts, str):
                    attempts = [attempts]
                passes = []
                for a in attempts:
                    if run_python(a + "\n\n" + "\n".join(seeds[i]["test_list"]), 8):
                        passes.append(a)
                        break
                if passes:
                    new_pairs.append({"problem": seeds[i]["prompt"], "tests": seeds[i]["test_list"],
                                      "broken": greedy_outs[i].strip(), "fixed": passes[0].strip(),
                                      "iter": it})
            accumulated_pairs.extend(new_pairs)
            log(f" mined {len(new_pairs)} new pairs (cumulative: {len(accumulated_pairs)})")

        # Eval current model on HE
        log(f" eval HE...")
        he_correct = score_he(llm, lora_req)
        log(f" HE iter{it} (pre-train): {he_correct}/{len(he)}")
        iter_results.append({"iter": it, "he_pretrain": he_correct, "cumulative_pairs": len(accumulated_pairs)})

        # Tear down vLLM, train new adapter on accumulated pairs
        del llm
        gc.collect()
        torch.cuda.empty_cache()

        if len(accumulated_pairs) < 5:
            log(f" too few pairs to train, skipping iter {it} training")
            continue

        from transformers import AutoModelForCausalLM, TrainingArguments, Trainer
        from datasets import Dataset as HFDataset
        from peft import LoraConfig, get_peft_model

        log(f" training fresh adapter on {len(accumulated_pairs)} pairs...")
        # The adapter is always trained from the base weights, not from the
        # previous iteration's adapter.
        model = AutoModelForCausalLM.from_pretrained(args.model, torch_dtype=torch.bfloat16, device_map="cuda:0")
        lora_cfg = LoraConfig(r=16, lora_alpha=32, lora_dropout=0.05, bias="none",
                              target_modules=["q_proj", "k_proj", "v_proj", "o_proj"], task_type="CAUSAL_LM")
        model = get_peft_model(model, lora_cfg)
        ds_train = HFDataset.from_list([mk_ex(r) for r in accumulated_pairs])
        targs = TrainingArguments(
            output_dir=f"{args.out_dir}/iter{it}_ckpt", num_train_epochs=2,
            per_device_train_batch_size=1, gradient_accumulation_steps=4,
            learning_rate=1e-4, bf16=True, logging_steps=20,
            save_strategy="no", report_to="none", remove_unused_columns=False, warmup_ratio=0.05,
        )
        Trainer(model=model, args=targs, train_dataset=ds_train, tokenizer=tok).train()
        adapter_dir = f"{args.out_dir}/iter{it}_adapter"
        model.save_pretrained(adapter_dir)
        del model
        gc.collect()
        torch.cuda.empty_cache()
        current_adapter = adapter_dir

        # Re-eval with new adapter to get post-train HE
        log(f" eval post-train HE...")
        llm = LLM(model=args.model, dtype="bfloat16", gpu_memory_utilization=0.85, max_model_len=2048,
                  enable_lora=True, max_lora_rank=16)
        lora_req = LoRARequest(f"iter{it}", it, current_adapter)
        he_correct = score_he(llm, lora_req)
        log(f" HE iter{it} (post-train): {he_correct}/{len(he)}")
        iter_results[-1]["he_posttrain"] = he_correct

        del llm
        gc.collect()
        torch.cuda.empty_cache()

    # Save pairs and results
    with open(f"{args.out_dir}/pairs.jsonl", "w") as fh:
        for r in accumulated_pairs:
            fh.write(json.dumps(r) + "\n")
    result = {"model": args.model, "tag": args.tag, "n_iters": args.n_iters,
              "iter_results": iter_results, "total_pairs": len(accumulated_pairs),
              "elapsed_s": time.time() - T0}
    with open(f"{args.out_dir}/result.json", "w") as fh:
        json.dump(result, fh, indent=2)

    print()
    print("=" * 70)
    print(f" {args.model} — RECURSIVE BOOTSTRAP")
    for r in iter_results:
        # "-" marks an iteration whose training step was skipped.
        pre = r.get("he_pretrain", "-")
        post = r.get("he_posttrain", "-")
        print(f" iter {r['iter']}: cum_pairs={r['cumulative_pairs']} HE_pre={pre} HE_post={post}")
    print(f" Time: {time.time()-T0:.0f}s")
    print("=" * 70)
T0 = time.time()


def log(m):
    """Print *m* prefixed with the seconds elapsed since ``T0`` (module import time)."""
    elapsed = time.time()-T0
    print(f"[{elapsed:7.1f}s] {m}", flush=True)


def extract_boxed(text):
    r"""Return the stripped contents of the last ``\boxed{...}`` in *text*.

    Braces inside the box are matched by depth, so nested groups such as
    ``\boxed{\frac{1}{2}}`` are returned whole.  Returns None when there is
    no ``\boxed{`` or its closing brace is missing.
    """
    marker = "\\boxed{"
    pos = text.rfind(marker)
    if pos < 0:
        return None
    begin = pos + len(marker)
    level = 1
    for offset, ch in enumerate(text[begin:]):
        if ch == "{":
            level += 1
        elif ch == "}":
            level -= 1
            if level == 0:
                # offset points at the brace that closes the box itself.
                return text[begin:begin + offset].strip()
    return None


def normalize(s):
    """Canonicalize an answer string for voting; None passes through.

    The string is stripped and lower-cased, then every comma, dollar sign
    and whitespace character is removed.
    """
    if s is None:
        return None
    lowered = s.strip().lower()
    return re.sub(r"[,$\s]", "", lowered)
def main():
    """Compare first-sample, majority-vote and oracle accuracy on MATH-500.

    Samples ``--n_samples`` solutions per problem for the first 200 problems
    of the MATH-500 test split, extracts each ``\\boxed{}`` answer, and counts
    (1) problems whose first sample is correct, (2) problems where any sample
    is correct, and (3) problems where the most frequent normalized answer is
    correct.  Counts go to ``<out_dir>/result.json`` and a printed summary.

    NOTE(review): block nesting was reconstructed from a whitespace-collapsed
    diff; confirm indentation against the real file.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--model", required=True)
    ap.add_argument("--n_samples", type=int, default=16)
    ap.add_argument("--tag", required=True)
    ap.add_argument("--out_dir", required=True)
    args = ap.parse_args()

    os.makedirs(args.out_dir, exist_ok=True)

    from vllm import LLM, SamplingParams
    from transformers import AutoTokenizer
    log(f"loading {args.model}")
    tok = AutoTokenizer.from_pretrained(args.model)
    if tok.pad_token is None: tok.pad_token = tok.eos_token
    llm = LLM(model=args.model, dtype="bfloat16", gpu_memory_utilization=0.85, max_model_len=2048)
    log("loaded")

    math500 = list(load_dataset("HuggingFaceH4/MATH-500", split="test"))[:200]
    prompts = []
    for p in math500:
        # Prefer the tokenizer's chat template; fall back to a plain prompt
        # when applying it raises (presumably tokenizers without a template).
        try:
            msgs = [{"role": "system", "content": "Math solver. End with \\boxed{answer}."},
                    {"role": "user", "content": f"Solve. Problem: {p['problem']}\n\nSolution:"}]
            prompts.append(tok.apply_chat_template(msgs, tokenize=False, add_generation_prompt=True))
        except Exception:
            prompts.append(f"Solve. Problem: {p['problem']}\n\nSolution:")

    log(f"generating {args.n_samples} samples per problem...")
    sp = SamplingParams(temperature=0.7, top_p=0.95, max_tokens=800, n=args.n_samples)
    t0 = time.time()
    outs = llm.generate(prompts, sp, use_tqdm=False)
    log(f" gen in {time.time()-t0:.1f}s")

    # NOTE(review): sympy is imported only after generation has finished, so a
    # missing install is discovered after the GPU work is already done.
    import sympy
    from sympy.parsing.latex import parse_latex
    def sympy_eq(a, b):
        # Answer equivalence, tried in order: exact string match, symbolic
        # difference simplifying to 0, numeric closeness within 1e-6.
        # Parse/convert failures fall through to the next check.
        if a is None or b is None: return False
        if a == b: return True
        try:
            if sympy.simplify(parse_latex(a) - parse_latex(b)) == 0: return True
        except Exception: pass
        try:
            if abs(float(a) - float(b)) < 1e-6: return True
        except Exception: pass
        return False

    # Three metrics:
    # 1. Greedy: take first sample
    # 2. Oracle pass@N: any correct
    # 3. Self-consistency: majority vote on extracted boxed answer (normalize numbers/text)
    greedy_correct = 0
    oracle_correct = 0
    sc_correct = 0

    for p, outset in zip(math500, outs):
        attempts = [o.text for o in outset.outputs]
        preds = [extract_boxed(a) for a in attempts]
        # Greedy: first sample
        # (drawn at temperature 0.7 like every other sample, not decoded greedily)
        if sympy_eq(preds[0], p["answer"]): greedy_correct += 1
        # Oracle: any pass
        if any(sympy_eq(pr, p["answer"]) for pr in preds): oracle_correct += 1
        # Self-consistency: majority vote on normalized answer
        # NOTE(review): an empty "\boxed{}" extraction ("") still votes here but
        # is skipped by the truthiness test below, so if "" wins the vote the
        # problem is scored as wrong - confirm this is intended.
        normalized = [normalize(pr) for pr in preds if pr is not None]
        if normalized:
            most_common, _ = Counter(normalized).most_common(1)[0]
            # Find an original pred with this normalized form
            for pr in preds:
                if pr and normalize(pr) == most_common:
                    if sympy_eq(pr, p["answer"]): sc_correct += 1
                    break

    result = {
        "model": args.model, "n_samples": args.n_samples,
        "greedy_first": greedy_correct,
        "oracle_pass_at_N": oracle_correct,
        "self_consistency": sc_correct,
        "n": len(math500),
        "elapsed_s": time.time() - T0,
    }
    with open(f"{args.out_dir}/result.json", "w") as fh: json.dump(result, fh, indent=2)

    print()
    print("=" * 70)
    print(f" {args.model} — SELF-CONSISTENCY vs ORACLE on MATH-500 (n={args.n_samples})")
    print(f" First sample (greedy-like): {greedy_correct}/{len(math500)} ({100*greedy_correct/len(math500):.1f}%)")
    print(f" Self-consistency (vote): {sc_correct}/{len(math500)} ({100*sc_correct/len(math500):.1f}%)")
    print(f" Oracle (any-pass): {oracle_correct}/{len(math500)} ({100*oracle_correct/len(math500):.1f}%)")
    # Share of the first-sample -> oracle gap closed by voting; 0 when there is no gap.
    sc_recovery = 100*(sc_correct - greedy_correct)/(oracle_correct - greedy_correct) if oracle_correct > greedy_correct else 0
    print(f" SC recovers {sc_recovery:.0f}% of oracle-greedy gap")
    print(f" Time: {time.time()-T0:.0f}s")
    print("=" * 70)
# Text inserted between a failed attempt and the model's second try.
RECONSIDER_TAG = "\n\n# Wait — that doesn't look right. Let me reconsider:\n\n"


def run_python(code, timeout=8):
    """Run *code* as a standalone script and report whether it exited cleanly.

    The source is written to a temporary ``.py`` file and executed with
    ``python3`` from ``/tmp``.

    Args:
        code: Python source text to execute.
        timeout: Seconds to wait before the child is killed.

    Returns:
        True if the child exits with status 0 within *timeout*; False on a
        non-zero exit status or on timeout.
    """
    fd, path = tempfile.mkstemp(suffix=".py")
    try:
        # Python reads source files as UTF-8 by default, so write UTF-8
        # explicitly instead of whatever the locale encoding happens to be.
        with os.fdopen(fd, "w", encoding="utf-8") as f:
            f.write(code)
        # Only the exit status is used.  Discarding the child's output avoids
        # a UnicodeDecodeError in the harness when the candidate prints
        # undecodable bytes; an empty stdin keeps input() from blocking.
        r = subprocess.run(["python3", path], stdin=subprocess.DEVNULL,
                           stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL,
                           timeout=timeout, cwd="/tmp")
        return r.returncode == 0
    except subprocess.TimeoutExpired:
        return False
    finally:
        # Best-effort cleanup; the file is removed even if the write failed.
        try:
            os.unlink(path)
        except OSError:
            pass


def vllm_gen(llm, prompts, max_new=400, temperature=0.0, n=1, prefill_texts=None):
    """Generate completions for *prompts* with a vLLM engine.

    Args:
        llm: Object exposing ``generate(prompts, sampling_params, ...)``.
        prompts: Prompt strings.
        max_new: Maximum number of new tokens per completion.
        temperature: Sampling temperature; ``top_p`` is 0.95 when it is
            positive and 1.0 otherwise.
        n: Completions requested per prompt.
        prefill_texts: Optional strings appended pairwise to *prompts*
            before generation.

    Returns:
        When ``n == 1``, a list with one completion string per prompt;
        otherwise a list of lists of completion strings.
    """
    from vllm import SamplingParams
    sp = SamplingParams(temperature=temperature, top_p=0.95 if temperature > 0 else 1.0,
                        max_tokens=max_new, n=n,
                        stop=["\nclass Test", "\nif __name__", "\n\nprint", "\nassert "])
    if prefill_texts is not None:
        # Each prompt is concatenated with its prefill text.
        prompts = [p + pre for p, pre in zip(prompts, prefill_texts)]
    out = llm.generate(prompts, sp, use_tqdm=False)
    if n == 1:
        return [o.outputs[0].text for o in out]
    return [[c.text for c in o.outputs] for o in out]


def he_prompt(p):
    """Return the HumanEval prompt of problem *p* unchanged."""
    return p["prompt"]


def mbpp_prompt(p):
    """Build the MBPP completion prompt: task text plus the tests as ``#`` comments."""
    return f"# Task: {p['prompt']}\n# Tests:\n# " + "\n# ".join(p["test_list"]) + "\n\n"
def main():
    """Run the code self-correction recipe and report HumanEval / MBPP deltas.

    Pipeline, in order: score the base model on HumanEval and on the first
    100 sanitized MBPP test problems; take greedy attempts on MBPP-train
    seeds; keep passing attempts as "right stays right" positives; for
    failing attempts, prefill the wrong answer plus ``RECONSIDER_TAG`` and
    sample 4 continuations, keeping the first that passes as a
    self-correction trace; train a LoRA adapter on the mixed set; re-score
    both benchmarks with the adapter.  Outputs go under
    ``/workspace/code_sc/<tag>``.

    NOTE(review): block nesting was reconstructed from a whitespace-collapsed
    diff; confirm indentation against the real file.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--model", required=True)
    ap.add_argument("--n_mining", type=int, default=300)
    ap.add_argument("--max_self_corrections", type=int, default=80)
    ap.add_argument("--max_positives", type=int, default=80)
    ap.add_argument("--tag", required=True)
    args = ap.parse_args()

    out_dir = f"/workspace/code_sc/{args.tag}"
    os.makedirs(out_dir, exist_ok=True)
    random.seed(42)

    from vllm import LLM
    from transformers import AutoTokenizer
    log(f"loading {args.model} into vLLM")
    tok = AutoTokenizer.from_pretrained(args.model)
    # Training examples are padded with pad_token_id, so one must exist.
    if tok.pad_token is None: tok.pad_token = tok.eos_token
    llm = LLM(model=args.model, dtype="bfloat16", gpu_memory_utilization=0.85, max_model_len=2048)
    log(f" loaded")

    he = list(load_dataset("openai_humaneval", split="test"))
    mbpp_test = list(load_dataset("mbpp", "sanitized", split="test"))[:100]
    mbpp_full = list(load_dataset("mbpp", split="train"))
    random.shuffle(mbpp_full)
    seeds = []
    # Keep problems that have both a task text ("prompt" or "text" key) and
    # a non-empty test list.
    for p in mbpp_full[:args.n_mining]:
        prompt_text = p.get("prompt") or p.get("text", "")
        if prompt_text and p.get("test_list"):
            seeds.append({"prompt": prompt_text, "test_list": p["test_list"]})
    log(f" HE: {len(he)}, MBPP-test: {len(mbpp_test)}, mining seeds: {len(seeds)}")

    # --- BASE eval
    log("=== BASE eval ===")
    he_outs = vllm_gen(llm, [he_prompt(p) for p in he], max_new=400)
    base_he = sum(1 for p, raw in zip(he, he_outs)
                  if run_python(p["prompt"] + raw + "\n\n" + p["test"] + f"\n\ncheck({p['entry_point']})", 10))
    log(f" HE base: {base_he}/{len(he)}")
    mbpp_outs = vllm_gen(llm, [mbpp_prompt(p) for p in mbpp_test], max_new=400)
    base_mbpp = sum(1 for p, raw in zip(mbpp_test, mbpp_outs)
                    if run_python(raw + "\n\n" + "\n".join(p["test_list"]), 10))
    log(f" MBPP base: {base_mbpp}/{len(mbpp_test)}")

    # --- Mine: greedy on all seeds
    log(f"=== mining: greedy attempt on {len(seeds)} seeds ===")
    t0 = time.time()
    greedy_outs = vllm_gen(llm, [mbpp_prompt(p) for p in seeds], max_new=400)
    log(f" greedy gen in {time.time()-t0:.1f}s")
    t1 = time.time()
    right = []  # greedy correct (positives)
    wrong = []  # greedy wrong (candidates for self-correction)
    for p, raw in zip(seeds, greedy_outs):
        test_code = raw + "\n\n" + "\n".join(p["test_list"])
        if run_python(test_code, timeout=8):
            right.append({"problem": p["prompt"], "tests": p["test_list"], "solution": raw.strip()})
        else:
            wrong.append({"problem": p["prompt"], "tests": p["test_list"], "wrong": raw.strip()})
    log(f" verify: {len(right)} greedy-correct, {len(wrong)} hard")

    # --- For wrong: prefill wrong + reconsider tag, sample 4 attempts
    log(f"=== self-correction sampling on {len(wrong)} hard problems ===")
    sc_pairs = []
    if wrong:
        base_prompts = [mbpp_prompt({"prompt": w["problem"], "test_list": w["tests"]}) for w in wrong]
        prefills = [w["wrong"] + RECONSIDER_TAG for w in wrong]
        # Generate 4 attempts each via temperature
        t0 = time.time()
        sc_outs = vllm_gen(llm, base_prompts, max_new=400, temperature=0.8, n=4, prefill_texts=prefills)
        log(f" sc gen in {time.time()-t0:.1f}s")
        t1 = time.time()
        for w, attempts in zip(wrong, sc_outs):
            for a in attempts:
                # Only the continuation after the tag is tested; the stored
                # trace is wrong attempt + tag + passing continuation.
                test_code = a + "\n\n" + "\n".join(w["tests"])
                if run_python(test_code, timeout=8):
                    full_trace = w["wrong"] + RECONSIDER_TAG + a.strip()
                    sc_pairs.append({"problem": w["problem"], "tests": w["tests"],
                                     "full_trace": full_trace})
                    break  # one per problem
        log(f" sc verify in {time.time()-t1:.1f}s — {len(sc_pairs)} self-correction traces")

    # Cap and sample
    random.shuffle(right); random.shuffle(sc_pairs)
    right = right[:args.max_positives]
    sc_pairs = sc_pairs[:args.max_self_corrections]
    log(f"=== final: {len(sc_pairs)} self-correction + {len(right)} right→stays-right = {len(sc_pairs)+len(right)} examples ===")

    if len(sc_pairs) + len(right) < 10:
        log("too few examples — exiting"); return

    with open(f"{out_dir}/sc_pairs.jsonl", "w") as fh:
        for r in sc_pairs: fh.write(json.dumps(r) + "\n")
    with open(f"{out_dir}/positives.jsonl", "w") as fh:
        for r in right: fh.write(json.dumps(r) + "\n")

    # --- Train LoRA on MIXED dataset
    log("=== TRAINING ===")
    # Drop the vLLM engine before the HF model is loaded onto the same GPU.
    del llm; gc.collect(); torch.cuda.empty_cache()
    from transformers import AutoModelForCausalLM, TrainingArguments, Trainer
    from datasets import Dataset as HFDataset
    from peft import LoraConfig, get_peft_model

    train_examples = []
    for r in sc_pairs:
        train_examples.append({"problem": r["problem"], "tests": r["tests"], "target": r["full_trace"]})
    for r in right:
        train_examples.append({"problem": r["problem"], "tests": r["tests"], "target": r["solution"]})
    random.shuffle(train_examples)

    def mk_ex(r):
        # Build one fixed-length (MAX tokens) example; the prompt has the same
        # layout that mbpp_prompt produces at inference time.
        user = f"# Task: {r['problem']}\n# Tests:\n# " + "\n# ".join(r['tests']) + "\n\n"
        target = r["target"]
        full = user + target
        full_ids = tok(full, add_special_tokens=False)["input_ids"]
        user_ids = tok(user, add_special_tokens=False)["input_ids"]
        MAX = 1280
        full_ids = full_ids[:MAX]
        # Labels of -100 on the prompt prefix and on padding: only the target
        # continuation carries real labels.
        labels = list(full_ids)
        n_user = min(len(user_ids), len(labels))
        for i in range(n_user): labels[i] = -100
        pad = MAX - len(full_ids)
        return {"input_ids": full_ids + [tok.pad_token_id]*pad,
                "attention_mask": [1]*len(full_ids) + [0]*pad,
                "labels": labels + [-100]*pad}

    model = AutoModelForCausalLM.from_pretrained(args.model, torch_dtype=torch.bfloat16, device_map="cuda:0")
    lora_cfg = LoraConfig(r=16, lora_alpha=32, lora_dropout=0.05, bias="none",
                          target_modules=["q_proj", "k_proj", "v_proj", "o_proj"], task_type="CAUSAL_LM")
    model = get_peft_model(model, lora_cfg)
    ds_train = HFDataset.from_list([mk_ex(r) for r in train_examples])
    targs = TrainingArguments(
        output_dir=f"{out_dir}/ckpt", num_train_epochs=2,
        per_device_train_batch_size=1, gradient_accumulation_steps=4,
        learning_rate=1e-4, bf16=True, logging_steps=20,
        save_strategy="no", report_to="none", remove_unused_columns=False, warmup_ratio=0.05,
    )
    Trainer(model=model, args=targs, train_dataset=ds_train, tokenizer=tok).train()
    log("training done")
    adapter_dir = f"{out_dir}/adapter"
    model.save_pretrained(adapter_dir)
    # Free the HF model before vLLM is started again for the trained evals.
    del model; gc.collect(); torch.cuda.empty_cache()

    # --- TRAINED eval
    from vllm import LLM
    from vllm.lora.request import LoRARequest
    llm = LLM(model=args.model, dtype="bfloat16", gpu_memory_utilization=0.85, max_model_len=2048,
              enable_lora=True, max_lora_rank=16)
    lora_req = LoRARequest("tf_adapter", 1, adapter_dir)
    from vllm import SamplingParams
    # NOTE(review): the trained eval uses max_tokens=500 and a shorter stop
    # list than the base eval (vllm_gen: 400 tokens, four stop strings), so
    # base and trained scores are not decoded under identical settings -
    # confirm this is intended.
    sp = SamplingParams(temperature=0, max_tokens=500, stop=["\nclass Test", "\nif __name__"])

    log("=== TRAINED eval ===")
    he_outs = [o.outputs[0].text for o in llm.generate([he_prompt(p) for p in he], sp, lora_request=lora_req, use_tqdm=False)]
    tr_he = sum(1 for p, raw in zip(he, he_outs)
                if run_python(p["prompt"] + raw + "\n\n" + p["test"] + f"\n\ncheck({p['entry_point']})", 10))
    mbpp_outs = [o.outputs[0].text for o in llm.generate([mbpp_prompt(p) for p in mbpp_test], sp, lora_request=lora_req, use_tqdm=False)]
    tr_mbpp = sum(1 for p, raw in zip(mbpp_test, mbpp_outs)
                  if run_python(raw + "\n\n" + "\n".join(p["test_list"]), 10))

    result = {
        "model": args.model,
        "n_sc": len(sc_pairs), "n_positives": len(right), "n_total": len(train_examples),
        "humaneval": {"base": base_he, "trained": tr_he, "delta": tr_he-base_he, "n": len(he)},
        "mbpp": {"base": base_mbpp, "trained": tr_mbpp, "delta": tr_mbpp-base_mbpp, "n": len(mbpp_test)},
        "elapsed_s": time.time()-T0,
    }
    with open(f"{out_dir}/result.json", "w") as fh: json.dump(result, fh, indent=2)

    print()
    print("=" * 70)
    print(f" {args.model} — CODE SELF-CORRECTION ({len(sc_pairs)} sc + {len(right)} positives)")
    print(f" HumanEval: base={base_he}/{len(he)} trained={tr_he}/{len(he)} Δ={tr_he-base_he:+d}")
    print(f" MBPP: base={base_mbpp}/{len(mbpp_test)} trained={tr_mbpp}/{len(mbpp_test)} Δ={tr_mbpp-base_mbpp:+d}")
    print(f" Time: {time.time()-T0:.0f}s")
    print("=" * 70)
+""" +import os, json, time, re, argparse, gc, random +os.environ.setdefault("HF_HOME", "/workspace/hf") +os.environ.setdefault("HF_HUB_ENABLE_HF_TRANSFER", "1") +os.environ["TRANSFORMERS_VERBOSITY"] = "error" + +import torch +from datasets import load_dataset +import sympy +from sympy.parsing.latex import parse_latex + +T0 = time.time() +def log(m): print(f"[{time.time()-T0:7.1f}s] {m}", flush=True) + + +SOLVE_PROMPT = """Solve this competition math problem. Show your reasoning, then put the final answer in \\boxed{{...}}. + +Problem: {problem} + +Solution:""" + + +RECONSIDER_TAG = "\n\nWait, let me reconsider — I think there's an error above.\n\n" + + +def extract_boxed(text): + idx = text.rfind("\\boxed{") + if idx < 0: return None + start = idx + len("\\boxed{") + depth = 1; i = start + while i < len(text) and depth > 0: + if text[i] == "{": depth += 1 + elif text[i] == "}": depth -= 1 + i += 1 + if depth != 0: return None + return text[start:i-1].strip() + + +def normalize(s): + if s is None: return None + s = s.strip() + s = re.sub(r"^\$|\$$", "", s).strip() + s = re.sub(r"\\text\{([^}]*)\}", r"\1", s) + s = re.sub(r"\\mbox\{([^}]*)\}", r"\1", s) + s = re.sub(r"(?<=\d),(?=\d)", "", s) + s = s.replace("\\left", "").replace("\\right", "").replace("^\\circ", "").replace("^{\\circ}", "") + return s.strip() + + +def sympy_equal(a, b): + if a is None or b is None: return False + a, b = normalize(a), normalize(b) + if a == b: return True + try: + ea = parse_latex(a); eb = parse_latex(b) + if sympy.simplify(ea - eb) == 0: return True + except Exception: pass + try: + fa = float(a); fb = float(b) + if abs(fa - fb) < 1e-6: return True + except Exception: pass + return False + + +def vllm_gen(llm, prompts, max_new=600, temperature=0.0, n=1): + from vllm import SamplingParams + sp = SamplingParams(temperature=temperature, top_p=0.95 if temperature > 0 else 1.0, + max_tokens=max_new, n=n) + out = llm.generate(prompts, sp, use_tqdm=False) + if n == 1: return 
[o.outputs[0].text for o in out] + return [[c.text for c in o.outputs] for o in out] + + +def math500_eval(gen_func, label): + ds = list(load_dataset("HuggingFaceH4/MATH-500", split="test")) + log(f" eval MATH-500 [{label}] ({len(ds)})") + prompts = [SOLVE_PROMPT.format(problem=p["problem"]) for p in ds] + t0 = time.time() + outs = gen_func(prompts, max_new=800) + log(f" gen done in {time.time()-t0:.1f}s") + correct = 0 + for p, raw in zip(ds, outs): + if sympy_equal(extract_boxed(raw), p["answer"]): correct += 1 + return correct, len(ds) + + +def make_train_example(problem, solution, tok): + user = SOLVE_PROMPT.format(problem=problem) + full = user + " " + solution + full_ids = tok(full, add_special_tokens=False)["input_ids"] + user_ids = tok(user + " ", add_special_tokens=False)["input_ids"] + MAX = 1536 + full_ids = full_ids[:MAX] + labels = list(full_ids) + n_user = min(len(user_ids), len(labels)) + for i in range(n_user): labels[i] = -100 + pad = MAX - len(full_ids) + return {"input_ids": full_ids + [tok.pad_token_id]*pad, + "attention_mask": [1]*len(full_ids) + [0]*pad, + "labels": labels + [-100]*pad} + + +def main(): + ap = argparse.ArgumentParser() + ap.add_argument("--model", required=True) + ap.add_argument("--wrong_fix_pairs", required=True, help="Existing wrong→fix triples jsonl from prior run") + ap.add_argument("--n_positives", type=int, default=100, help="Number of right→stays-right examples to mine") + ap.add_argument("--tag", required=True) + args = ap.parse_args() + + out_dir = f"/workspace/math500_sc_v2/{args.tag}" + os.makedirs(out_dir, exist_ok=True) + + from vllm import LLM + from transformers import AutoTokenizer + log(f"loading {args.model} into vLLM") + tok = AutoTokenizer.from_pretrained(args.model) + if tok.pad_token is None: tok.pad_token = tok.eos_token + llm = LLM(model=args.model, dtype="bfloat16", gpu_memory_utilization=0.85, max_model_len=2048) + log(f" loaded") + + # --- BASE eval + log("=== BASE eval ===") + base_c, base_n = 
math500_eval(lambda P, max_new=800: vllm_gen(llm, P, max_new=max_new), "BASE") + log(f" BASE: {base_c}/{base_n} ({100*base_c/base_n:.1f}%)") + + # --- Load existing wrong→fix triples + wrong_fix = [json.loads(l) for l in open(args.wrong_fix_pairs)] + log(f" loaded {len(wrong_fix)} wrong→fix triples") + + # --- Mine right→stays-right positives from MATH-train + log(f"=== mining {args.n_positives} right→stays-right positives ===") + train_ds = [] + for cfg in ["algebra","counting_and_probability","geometry","intermediate_algebra","number_theory","prealgebra","precalculus"]: + try: + sub = list(load_dataset("EleutherAI/hendrycks_math", cfg, split="train")) + train_ds.extend(sub) + except Exception: pass + random.seed(42); random.shuffle(train_ds) + log(f" {len(train_ds)} train problems available") + + def gold_of(p): + return extract_boxed(p.get("solution", "")) + + positives = [] + cursor = 0 + while len(positives) < args.n_positives and cursor < len(train_ds): + batch = [] + while len(batch) < 64 and cursor < len(train_ds): + p = train_ds[cursor]; cursor += 1 + g = gold_of(p) + if g is not None: batch.append({"problem": p["problem"], "gold": g}) + if not batch: break + + prompts = [SOLVE_PROMPT.format(problem=p["problem"]) for p in batch] + outs = vllm_gen(llm, prompts, max_new=600, temperature=0.0) + for p, raw in zip(batch, outs): + if sympy_equal(extract_boxed(raw), p["gold"]): + # right→stays-right: model wrote a clean correct solution + positives.append({"problem": p["problem"], "solution": raw.strip()}) + if len(positives) >= args.n_positives: break + log(f" positives: {len(positives)} / {args.n_positives}") + + log(f"=== final dataset: {len(wrong_fix)} wrong→fix + {len(positives)} right→stays-right = {len(wrong_fix)+len(positives)} examples ===") + + with open(f"{out_dir}/positives.jsonl", "w") as fh: + for p in positives: fh.write(json.dumps(p) + "\n") + + # --- Build training data + train_examples = [] + # wrong→fix as full self-correction traces + for r in 
wrong_fix: + train_examples.append({ + "problem": r["problem"], + "solution": r["full_solution"], # already includes wrong + RECONSIDER_TAG + correct + }) + # right→stays-right as plain solutions (no "wait" — model commits) + for r in positives: + train_examples.append({ + "problem": r["problem"], + "solution": r["solution"], + }) + random.shuffle(train_examples) + + # --- Train LoRA + log("=== TRAINING ===") + del llm; gc.collect(); torch.cuda.empty_cache() + from transformers import AutoModelForCausalLM, TrainingArguments, Trainer + from datasets import Dataset as HFDataset + from peft import LoraConfig, get_peft_model + + model = AutoModelForCausalLM.from_pretrained(args.model, torch_dtype=torch.bfloat16, device_map="cuda:0") + lora_cfg = LoraConfig(r=16, lora_alpha=32, lora_dropout=0.05, bias="none", + target_modules=["q_proj", "k_proj", "v_proj", "o_proj"], task_type="CAUSAL_LM") + model = get_peft_model(model, lora_cfg) + ds_train = HFDataset.from_list([make_train_example(r["problem"], r["solution"], tok) for r in train_examples]) + targs = TrainingArguments( + output_dir=f"{out_dir}/ckpt", num_train_epochs=2, + per_device_train_batch_size=1, gradient_accumulation_steps=4, + learning_rate=1e-4, bf16=True, logging_steps=20, + save_strategy="no", report_to="none", remove_unused_columns=False, warmup_ratio=0.05, + ) + Trainer(model=model, args=targs, train_dataset=ds_train, tokenizer=tok).train() + log("training done") + adapter_dir = f"{out_dir}/adapter" + model.save_pretrained(adapter_dir) + del model; gc.collect(); torch.cuda.empty_cache() + + # --- TRAINED eval + from vllm import LLM + from vllm.lora.request import LoRARequest + llm = LLM(model=args.model, dtype="bfloat16", gpu_memory_utilization=0.85, max_model_len=2048, + enable_lora=True, max_lora_rank=16) + lora_req = LoRARequest("tf_adapter", 1, adapter_dir) + from vllm import SamplingParams + def gen_trained(prompts, max_new=800): + sp = SamplingParams(temperature=0, max_tokens=max_new) + return 
[o.outputs[0].text for o in llm.generate(prompts, sp, lora_request=lora_req, use_tqdm=False)] + + log("=== TRAINED eval ===") + tr_c, tr_n = math500_eval(gen_trained, "TRAINED") + log(f" TRAINED: {tr_c}/{tr_n} ({100*tr_c/tr_n:.1f}%)") + + result = { + "model": args.model, + "n_wrong_fix": len(wrong_fix), + "n_positives": len(positives), + "n_total": len(train_examples), + "base": base_c, "trained": tr_c, "n": tr_n, + "delta": tr_c - base_c, + "elapsed_s": time.time() - T0, + } + with open(f"{out_dir}/result.json", "w") as fh: json.dump(result, fh, indent=2) + + print() + print("=" * 70) + print(f" {args.model} — SELF-CORRECTION V2 (mixed: {len(wrong_fix)} wrong→fix + {len(positives)} right→stays)") + print(f" MATH-500: base={base_c}/{tr_n} ({100*base_c/tr_n:.1f}%) trained={tr_c}/{tr_n} ({100*tr_c/tr_n:.1f}%) Δ={tr_c-base_c:+d}") + print(f" Time: {time.time()-T0:.0f}s") + print("=" * 70) + + +if __name__ == "__main__": + main() diff --git a/experiments/self_correction_math_naive.py b/experiments/self_correction_math_naive.py new file mode 100644 index 0000000..896c8d8 --- /dev/null +++ b/experiments/self_correction_math_naive.py @@ -0,0 +1,286 @@ +"""TinyForge-Zero self-correction for MATH-500. + +Recipe: + 1. Sample real MATH-train problem (no human solutions used). + 2. Model greedy-attempt → wrong. Capture as wrong_attempt. + 3. Re-prompt model: {problem} + wrong_attempt + "Wait, let me reconsider:" + Sample 4 completions at temp=0.8. + 4. If any completion gets correct boxed answer (verified via sympy against gold), + MINE a triple: (problem, wrong_attempt, reflection+correct). + 5. Train LoRA on full traces — model learns to catch + fix own errors. + 6. Eval on MATH-500 (test). Model naturally produces self-correction. + +Key difference from rejection-sampling: training data teaches the FIX, +not just the answer. Same broken→fixed structure that worked for code. 
+""" +import os, json, time, re, argparse, random +os.environ.setdefault("HF_HOME", "/workspace/hf") +os.environ.setdefault("HF_HUB_ENABLE_HF_TRANSFER", "1") +os.environ["TRANSFORMERS_VERBOSITY"] = "error" + +import torch +from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, Trainer +from datasets import load_dataset, Dataset as HFDataset +from peft import LoraConfig, get_peft_model +import sympy +from sympy.parsing.latex import parse_latex + +T0 = time.time() +def log(m): print(f"[{time.time()-T0:7.1f}s] {m}", flush=True) + + +SOLVE_PROMPT = """Solve this competition math problem. Show your reasoning, then put the final answer in \\boxed{{...}}. + +Problem: {problem} + +Solution:""" + + +RECONSIDER_TAG = "\n\nWait, let me reconsider — I think there's an error above.\n\n" + + +def extract_boxed(text): + idx = text.rfind("\\boxed{") + if idx < 0: return None + start = idx + len("\\boxed{") + depth = 1; i = start + while i < len(text) and depth > 0: + if text[i] == "{": depth += 1 + elif text[i] == "}": depth -= 1 + i += 1 + if depth != 0: return None + return text[start:i-1].strip() + + +def normalize(s): + if s is None: return None + s = s.strip() + s = re.sub(r"^\$|\$$", "", s).strip() + s = re.sub(r"\\text\{([^}]*)\}", r"\1", s) + s = re.sub(r"\\mbox\{([^}]*)\}", r"\1", s) + s = re.sub(r"(?<=\d),(?=\d)", "", s) + s = s.replace("\\left", "").replace("\\right", "").replace("^\\circ", "").replace("^{\\circ}", "") + return s.strip() + + +def sympy_equal(a, b): + if a is None or b is None: return False + a, b = normalize(a), normalize(b) + if a == b: return True + try: + ea = parse_latex(a); eb = parse_latex(b) + if sympy.simplify(ea - eb) == 0: return True + except Exception: pass + try: + fa = float(a); fb = float(b) + if abs(fa - fb) < 1e-6: return True + except Exception: pass + return False + + +def chat_messages(user_content): + return [{"role": "system", "content": "You are a careful math problem solver. 
If you make a mistake, catch it and correct yourself."}, + {"role": "user", "content": user_content}] + + +def gen_batch(model, tok, prompts, max_new=600, temperature=0.0, batch=16, prefill_texts=None): + """If prefill_texts provided, append each to its chat-templated prompt (forcing the model to continue from there).""" + outs = [] + for i in range(0, len(prompts), batch): + chunk = prompts[i:i+batch] + pref_chunk = prefill_texts[i:i+batch] if prefill_texts else [""] * len(chunk) + texts = [] + for p, pre in zip(chunk, pref_chunk): + msgs = chat_messages(p) + try: + base = tok.apply_chat_template(msgs, tokenize=False, add_generation_prompt=True) + except Exception: + base = p + texts.append(base + pre) + inp = tok(texts, return_tensors="pt", padding=True, truncation=True, max_length=2000).to(model.device) + with torch.no_grad(): + out = model.generate(**inp, max_new_tokens=max_new, do_sample=temperature > 0, + temperature=temperature if temperature > 0 else 1.0, top_p=0.95, + pad_token_id=tok.eos_token_id) + for j in range(out.size(0)): + outs.append(tok.decode(out[j][inp.input_ids.shape[1]:], skip_special_tokens=True)) + return outs + + +def math500_eval(model, tok, n=500, batch=16): + ds = list(load_dataset("HuggingFaceH4/MATH-500", split="test"))[:n] + log(f" eval on MATH-500 ({len(ds)} problems)") + prompts = [SOLVE_PROMPT.format(problem=p["problem"]) for p in ds] + outs = gen_batch(model, tok, prompts, max_new=800, temperature=0.0, batch=batch) + correct = 0 + for p, raw in zip(ds, outs): + pred = extract_boxed(raw) + if sympy_equal(pred, p["answer"]): correct += 1 + return correct, len(ds) + + +def make_train_example(problem, full_solution, tok): + """Train on the full self-correction trace.""" + user = SOLVE_PROMPT.format(problem=problem) + msgs_pre = chat_messages(user) + msgs_full = msgs_pre + [{"role": "assistant", "content": full_solution}] + pre = tok.apply_chat_template(msgs_pre, tokenize=False, add_generation_prompt=True) + full = 
tok.apply_chat_template(msgs_full, tokenize=False) + pre_ids = tok(pre, add_special_tokens=False)["input_ids"] + full_ids = tok(full, add_special_tokens=False)["input_ids"] + MAX = 1536 + full_ids = full_ids[:MAX] + labels = list(full_ids) + n_pre = min(len(pre_ids), len(labels)) + for i in range(n_pre): labels[i] = -100 + pad = MAX - len(full_ids) + return {"input_ids": full_ids + [tok.pad_token_id]*pad, + "attention_mask": [1]*len(full_ids) + [0]*pad, + "labels": labels + [-100]*pad} + + +def train_on_pairs(model, tok, pairs, out_dir, lr=1e-4, epochs=2, rank=16): + log(f" training on {len(pairs)} traces (lr={lr}, e={epochs}, r={rank})") + lora_cfg = LoraConfig(r=rank, lora_alpha=rank*2, lora_dropout=0.05, bias="none", + target_modules=["q_proj", "k_proj", "v_proj", "o_proj"], task_type="CAUSAL_LM") + model = get_peft_model(model, lora_cfg) + tok.padding_side = "right" + ds = HFDataset.from_list([make_train_example(p["problem"], p["full_solution"], tok) for p in pairs]) + targs = TrainingArguments( + output_dir=f"{out_dir}/ckpt", num_train_epochs=epochs, + per_device_train_batch_size=1, gradient_accumulation_steps=4, + learning_rate=lr, bf16=True, logging_steps=20, + save_strategy="no", report_to="none", remove_unused_columns=False, warmup_ratio=0.05, + ) + Trainer(model=model, args=targs, train_dataset=ds, processing_class=tok).train() + tok.padding_side = "left" + return model + + +def main(): + ap = argparse.ArgumentParser() + ap.add_argument("--model", required=True) + ap.add_argument("--iterations", type=int, default=8) + ap.add_argument("--problems_per_iter", type=int, default=48) + ap.add_argument("--n_eval", type=int, default=500) + ap.add_argument("--max_pairs", type=int, default=100) + ap.add_argument("--seed", type=int, default=42) + ap.add_argument("--tag", required=True) + args = ap.parse_args() + + out_dir = f"/workspace/math500_sc/{args.tag}" + os.makedirs(out_dir, exist_ok=True) + random.seed(args.seed); torch.manual_seed(args.seed) + + 
log(f"loading {args.model}") + tok = AutoTokenizer.from_pretrained(args.model) + if tok.pad_token is None: tok.pad_token = tok.eos_token + tok.padding_side = "left" + model = AutoModelForCausalLM.from_pretrained(args.model, torch_dtype=torch.bfloat16, device_map="cuda:0") + log(f" loaded mem={torch.cuda.memory_allocated('cuda:0')/1e9:.1f}GB") + + log("loading MATH train split") + train_ds = [] + for cfg in ["algebra","counting_and_probability","geometry","intermediate_algebra","number_theory","prealgebra","precalculus"]: + try: + sub = list(load_dataset("EleutherAI/hendrycks_math", cfg, split="train")) + train_ds.extend(sub) + except Exception as e: + log(f" warn: failed to load {cfg}: {e}") + log(f" {len(train_ds)} train problems") + random.shuffle(train_ds) + + def gold_of(p): + return extract_boxed(p.get("solution", "")) + + model.eval() + log("INITIAL eval on MATH-500") + base_c, base_n = math500_eval(model, tok, n=args.n_eval) + log(f" MATH-500 base: {base_c}/{base_n} ({100*base_c/base_n:.1f}%)") + + pairs = [] + cursor = 0 + + for it in range(1, args.iterations + 1): + log(f"--- iter {it} ---") + # Sample problems from MATH-train + batch_problems = [] + while len(batch_problems) < args.problems_per_iter and cursor < len(train_ds): + p = train_ds[cursor]; cursor += 1 + g = gold_of(p) + if g is not None: batch_problems.append({"problem": p["problem"], "gold": g}) + if not batch_problems: + log(" exhausted train problems"); break + + # Step 1: Greedy attempt + prompts = [SOLVE_PROMPT.format(problem=p["problem"]) for p in batch_problems] + greedy_outs = gen_batch(model, tok, prompts, max_new=600, temperature=0.0, batch=16) + wrong_attempts = [] + for i, (p, raw) in enumerate(zip(batch_problems, greedy_outs)): + pred = extract_boxed(raw) + if not sympy_equal(pred, p["gold"]): + wrong_attempts.append({"idx": i, "problem": p["problem"], "gold": p["gold"], "wrong": raw.strip()}) + log(f" iter {it}: {len(wrong_attempts)}/{len(batch_problems)} wrong on greedy (mining 
candidates)") + if not wrong_attempts: + continue + + # Step 2: Self-correct prompt (prefill wrong attempt + reconsider tag, sample 4) + sc_problems = [] + prefills = [] + for w in wrong_attempts: + for _ in range(4): + sc_problems.append(w["problem"]) + prefills.append(w["wrong"] + RECONSIDER_TAG) + sc_prompts = [SOLVE_PROMPT.format(problem=p) for p in sc_problems] + sc_outs = gen_batch(model, tok, sc_prompts, max_new=600, temperature=0.8, batch=16, prefill_texts=prefills) + + mined_this_iter = 0 + for j, w in enumerate(wrong_attempts): + attempts = sc_outs[j*4:(j+1)*4] + preds = [extract_boxed(a) for a in attempts] + correct_idx = [k for k, pr in enumerate(preds) if sympy_equal(pr, w["gold"])] + if correct_idx: + # construct full trace + fix = attempts[correct_idx[0]].strip() + full = w["wrong"] + RECONSIDER_TAG + fix + pairs.append({"problem": w["problem"], "wrong_attempt": w["wrong"], + "correction": fix, "full_solution": full}) + mined_this_iter += 1 + log(f" iter {it}: MINED {mined_this_iter} self-correction triples — total={len(pairs)}") + + if len(pairs) >= args.max_pairs: + log(f" reached max_pairs={args.max_pairs}, stopping"); break + + log(f"=== mined {len(pairs)} total self-correction triples ===") + with open(f"{out_dir}/pairs.jsonl", "w") as fh: + for p in pairs: fh.write(json.dumps(p) + "\n") + + if not pairs: + log("no triples — exiting"); return + + model = train_on_pairs(model, tok, pairs, out_dir) + log("training done") + + model.eval() + log("FINAL eval on MATH-500") + tr_c, tr_n = math500_eval(model, tok, n=args.n_eval) + log(f" MATH-500 trained: {tr_c}/{tr_n} ({100*tr_c/tr_n:.1f}%)") + + result = { + "model": args.model, "n_pairs": len(pairs), + "base": base_c, "trained": tr_c, "n": tr_n, + "delta": tr_c - base_c, "elapsed_s": time.time() - T0, + } + with open(f"{out_dir}/result.json", "w") as fh: json.dump(result, fh, indent=2) + + print() + print("=" * 70) + print(f" {args.model} — SELF-CORRECTION recipe") + print(f" MATH-500: 
base={base_c}/{tr_n} trained={tr_c}/{tr_n} Δ={tr_c-base_c:+d}") + print(f" Triples mined: {len(pairs)}") + print(f" Time: {time.time()-T0:.0f}s") + print("=" * 70) + + +if __name__ == "__main__": + main() diff --git a/experiments/star_baseline_gsm8k.py b/experiments/star_baseline_gsm8k.py new file mode 100644 index 0000000..729e264 --- /dev/null +++ b/experiments/star_baseline_gsm8k.py @@ -0,0 +1,204 @@ +"""STaR / Rejection Sampling Fine-Tuning on GSM8K. + +For each GSM8K-train problem: + - sample N reasoning chains at temp=0.8 + - keep chains that produce correct final answer + - train on (problem, correct chain) pairs +Then eval on GSM8K-test. +""" +import os, sys, json, time, re, gc, argparse, random +os.environ.setdefault("HF_HOME", "/workspace/hf") +os.environ.setdefault("CUDA_VISIBLE_DEVICES", "1") +os.environ["TRANSFORMERS_VERBOSITY"] = "error" +os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1" + +import torch +from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, Trainer +from datasets import load_dataset, Dataset as HFDataset +from peft import LoraConfig, get_peft_model + +T0 = time.time() +def log(m): print(f"[{time.time()-T0:7.1f}s] {m}", flush=True) + + +def extract_answer(text: str): + m = re.search(r"####\s*(-?\d+(?:\.\d+)?)", text) + if m: return float(m.group(1)) + m = re.search(r"\\boxed\{(-?\d+(?:\.\d+)?)\}", text) + if m: return float(m.group(1)) + matches = re.findall(r"-?\d+(?:\.\d+)?", text) + if matches: + try: return float(matches[-1]) + except: return None + return None + + +def gen_batch(model, tok, prompts, max_new=400, temperature=0.0, batch=8): + outs = [] + for i in range(0, len(prompts), batch): + chunk = prompts[i:i+batch] + texts = [] + for p in chunk: + msgs = [{"role": "system", "content": "You are a careful math tutor."}, + {"role": "user", "content": p}] + texts.append(tok.apply_chat_template(msgs, tokenize=False, add_generation_prompt=True)) + inp = tok(texts, return_tensors="pt", padding=True, 
truncation=True, max_length=1500).to(model.device) + with torch.no_grad(): + out = model.generate(**inp, max_new_tokens=max_new, do_sample=temperature > 0, + temperature=temperature if temperature > 0 else 1.0, top_p=0.95, + pad_token_id=tok.eos_token_id) + for j in range(out.size(0)): + outs.append(tok.decode(out[j][inp.input_ids.shape[1]:], skip_special_tokens=True)) + return outs + + +SOLVE_PROMPT = "Solve this math problem step by step. End with the answer on a new line as: #### \n\nProblem: {problem}" + + +def parse_gold(answer_field: str): + m = re.search(r"####\s*(-?\d+(?:,\d+)*(?:\.\d+)?)", answer_field) + return float(m.group(1).replace(",", "")) if m else None + + +def gsm8k_eval(model, tok, n=200): + ds = list(load_dataset("openai/gsm8k", "main", split="test"))[:n] + log(f" eval on GSM8K-test ({len(ds)} problems)") + prompts = [SOLVE_PROMPT.format(problem=p["question"]) for p in ds] + outs = gen_batch(model, tok, prompts, max_new=400, temperature=0.0, batch=8) + correct = 0 + for p, raw in zip(ds, outs): + gold = parse_gold(p["answer"]) + if gold is None: continue + pred = extract_answer(raw) + if pred is not None and abs(pred - gold) < 0.01: correct += 1 + return correct, len(ds) + + +def make_train_example(problem: str, solution: str, tok): + user = SOLVE_PROMPT.format(problem=problem) + msgs_pre = [{"role": "system", "content": "You are a careful math tutor."}, + {"role": "user", "content": user}] + msgs_full = msgs_pre + [{"role": "assistant", "content": solution}] + pre = tok.apply_chat_template(msgs_pre, tokenize=False, add_generation_prompt=True) + full = tok.apply_chat_template(msgs_full, tokenize=False) + pre_ids = tok(pre, add_special_tokens=False)["input_ids"] + full_ids = tok(full, add_special_tokens=False)["input_ids"] + MAX = 1024 + full_ids = full_ids[:MAX] + labels = list(full_ids) + n_pre = min(len(pre_ids), len(labels)) + for i in range(n_pre): labels[i] = -100 + pad = MAX - len(full_ids) + return {"input_ids": full_ids + 
[tok.pad_token_id]*pad, + "attention_mask": [1]*len(full_ids) + [0]*pad, + "labels": labels + [-100]*pad} + + +def main(): + ap = argparse.ArgumentParser() + ap.add_argument("--model", default="Qwen/Qwen2.5-3B") + ap.add_argument("--n_train_problems", type=int, default=300) + ap.add_argument("--n_chains", type=int, default=8) + ap.add_argument("--n_eval", type=int, default=200) + ap.add_argument("--epochs", type=int, default=2) + ap.add_argument("--seed", type=int, default=42) + ap.add_argument("--tag", required=True) + args = ap.parse_args() + + random.seed(args.seed); torch.manual_seed(args.seed) + out_dir = f"/workspace/star/{args.tag}" + os.makedirs(out_dir, exist_ok=True) + + log(f"loading {args.model}") + tok = AutoTokenizer.from_pretrained(args.model) + if tok.pad_token is None: tok.pad_token = tok.eos_token + tok.padding_side = "left" + model = AutoModelForCausalLM.from_pretrained(args.model, dtype=torch.bfloat16, device_map="cuda:0") + log(f" loaded mem={torch.cuda.memory_allocated('cuda:0')/1e9:.1f}GB") + + # Initial eval on GSM8K-test + model.eval() + log("INITIAL eval on GSM8K-test") + base_correct, base_total = gsm8k_eval(model, tok, n=args.n_eval) + log(f" GSM8K-test base: {base_correct}/{base_total}") + + # Mine reasoning chains from GSM8K-train + log(f"mining reasoning chains from GSM8K-train ({args.n_train_problems} problems × {args.n_chains} chains)") + train_set = list(load_dataset("openai/gsm8k", "main", split="train"))[:args.n_train_problems] + pairs = [] + BATCH_PROBLEMS = 8 # batch problems together + for batch_start in range(0, len(train_set), BATCH_PROBLEMS): + batch_end = min(batch_start + BATCH_PROBLEMS, len(train_set)) + batch_problems = train_set[batch_start:batch_end] + # For each problem, generate N chains. 
So total = batch_size * N + prompts = [] + for p in batch_problems: + for _ in range(args.n_chains): + prompts.append(SOLVE_PROMPT.format(problem=p["question"])) + outs = gen_batch(model, tok, prompts, max_new=400, temperature=0.8, batch=8) + # Outs are in problem-major × chain-major order + for i, p in enumerate(batch_problems): + gold = parse_gold(p["answer"]) + if gold is None: continue + chain_outs = outs[i*args.n_chains : (i+1)*args.n_chains] + for raw in chain_outs: + pred = extract_answer(raw) + if pred is not None and abs(pred - gold) < 0.01: + pairs.append({"problem": p["question"], "solution": raw.strip()}) + break # take first correct chain per problem + log(f" mined {len(pairs)} pairs from {batch_end} problems") + + if not pairs: + log("FATAL: no pairs mined") + return + with open(f"{out_dir}/pairs.jsonl", "w") as fh: + for p in pairs: fh.write(json.dumps(p) + "\n") + log(f"total pairs mined: {len(pairs)} from {len(train_set)} problems " + f"(coverage: {len(pairs)/len(train_set)*100:.1f}%)") + + # Train + log(f"TRAINING on {len(pairs)} pairs, {args.epochs} epochs") + lora_cfg = LoraConfig(r=16, lora_alpha=32, lora_dropout=0.05, bias="none", + target_modules=["q_proj", "k_proj", "v_proj", "o_proj"], task_type="CAUSAL_LM") + model = get_peft_model(model, lora_cfg) + tok.padding_side = "right" + ds = HFDataset.from_list([make_train_example(p["problem"], p["solution"], tok) for p in pairs]) + targs = TrainingArguments( + output_dir=f"{out_dir}/ckpt", num_train_epochs=args.epochs, + per_device_train_batch_size=1, gradient_accumulation_steps=4, + learning_rate=1e-4, bf16=True, logging_steps=20, + save_strategy="no", report_to="none", remove_unused_columns=False, warmup_ratio=0.05, + ) + Trainer(model=model, args=targs, train_dataset=ds, processing_class=tok).train() + log("training done") + tok.padding_side = "left" + + # Final eval + model.eval() + log("FINAL eval on GSM8K-test") + trained_correct, trained_total = gsm8k_eval(model, tok, n=args.n_eval) + 
log(f" GSM8K-test trained: {trained_correct}/{trained_total}") + + result = { + "model": args.model, "n_train_problems": args.n_train_problems, + "n_chains": args.n_chains, "n_pairs_mined": len(pairs), + "epochs": args.epochs, "seed": args.seed, + "base": [base_correct, base_total], + "trained": [trained_correct, trained_total], + "delta": trained_correct - base_correct, + "elapsed_s": time.time() - T0, + } + with open(f"{out_dir}/result.json", "w") as fh: + json.dump(result, fh, indent=2) + + print() + print("=" * 70) + print(f" STaR / RFT on GSM8K — {args.model}") + print(f" Mined {len(pairs)} pairs from {len(train_set)} GSM8K-train problems ({len(pairs)/len(train_set)*100:.1f}% coverage)") + print(f" GSM8K-test: base={base_correct}/{base_total} trained={trained_correct}/{trained_total} Δ={trained_correct-base_correct:+d}") + print(f" Time: {time.time()-T0:.0f}s") + print("=" * 70) + + +if __name__ == "__main__": + main() diff --git a/recipe/bootstrap_14b_4bit_harvest.py b/recipe/bootstrap_14b_4bit_harvest.py new file mode 100644 index 0000000..b2943b3 --- /dev/null +++ b/recipe/bootstrap_14b_4bit_harvest.py @@ -0,0 +1,191 @@ +"""Bootstrap loop adapted for large models — uses 4-bit NF4 quantization and batch=1. +Just the harvest loop (no training during loop). Saves pairs. 
+""" +import os, sys, json, time, re, gc, subprocess, tempfile, argparse, random +os.environ.setdefault("HF_HOME", "/workspace/hf") +os.environ.setdefault("CUDA_VISIBLE_DEVICES", "1") +os.environ["TRANSFORMERS_VERBOSITY"] = "error" +os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1" + +import torch +from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig +from datasets import load_dataset + +T0 = time.time() +def log(m): print(f"[{time.time()-T0:7.1f}s] {m}", flush=True) + + +def extract_code(text): + if "```python" in text: text = text.split("```python", 1)[1] + elif "```" in text: text = text.split("```", 1)[1] + if "```" in text: text = text.split("```", 1)[0] + return text.strip() + + +def run_python(code, timeout=8): + with tempfile.NamedTemporaryFile("w", suffix=".py", delete=False) as f: + f.write(code); path = f.name + try: + r = subprocess.run(["python3", path], capture_output=True, timeout=timeout, text=True, cwd="/tmp") + if r.returncode == 0: return True, "" + err = (r.stderr or r.stdout).strip().splitlines() + return False, "\n".join(err[-3:])[:300] + except subprocess.TimeoutExpired: return False, "timeout" + finally: + try: os.unlink(path) + except: pass + + +def gen_one(model, tok, prompt, max_new=400, temperature=0.0): + msgs = [{"role": "system", "content": "You are a Python coder."}, + {"role": "user", "content": prompt}] + text = tok.apply_chat_template(msgs, tokenize=False, add_generation_prompt=True) + inp = tok(text, return_tensors="pt", truncation=True, max_length=1500).to(model.device) + with torch.no_grad(): + out = model.generate(**inp, max_new_tokens=max_new, do_sample=temperature > 0, + temperature=temperature if temperature > 0 else 1.0, top_p=0.95, + pad_token_id=tok.eos_token_id) + return tok.decode(out[0][inp.input_ids.shape[1]:], skip_special_tokens=True) + + +PROBLEM_GEN_PROMPT = """Generate ONE simple Python coding problem with a clear function spec and 3 test assertions. 
+ +Output format (exactly one ```python block): + +```python +def {function_name}({args}): + \"\"\"{one-line description of what the function does}\"\"\" + {implementation} + +# tests +assert {function_name}(...) == ... +assert {function_name}(...) == ... +assert {function_name}(...) == ... +``` + +Make the function specific and concrete. Output ONLY the code block.""" + + +def parse_problem(raw_code): + code = raw_code.strip() + if "def " not in code: return None + lines = code.split("\n") + func_start = next((i for i, l in enumerate(lines) if l.startswith("def ")), None) + if func_start is None: return None + tests = [] + def_end = None + for i in range(func_start, len(lines)): + l = lines[i] + if l.startswith("def ") and i > func_start: break + if l.startswith("assert "): + tests.append(l) + if def_end is None: def_end = i + if len(tests) < 2: return None + if def_end is None: def_end = len(lines) + full_solution = "\n".join(lines[func_start:def_end]).strip() + if len(full_solution) < 30: return None + m = re.match(r"def\s+(\w+)\s*\(", lines[func_start]) + if not m: return None + sig_lines = [] + for i in range(func_start, def_end): + sig_lines.append(lines[i]) + if i == func_start and not any('"""' in lines[j] for j in range(i, min(i+5, def_end))): + sig_lines.append(" pass"); break + if i > func_start and '"""' in lines[i] and ('"""' in lines[i-1] or lines[i].count('"""') >= 2): + break + return {"fn_name": m.group(1), "signature": "\n".join(sig_lines), "tests": tests, "canonical": full_solution} + + +def humaneval_full(model, tok): + he = list(load_dataset("openai_humaneval", split="test")) + log(f" full HumanEval: {len(he)} problems") + correct = 0 + for i, p in enumerate(he): + prompt = p["prompt"] + "\n# Complete the function above." 
+ raw = gen_one(model, tok, prompt, max_new=400, temperature=0.0) + code = extract_code(raw) if "```" in raw else raw + full = p["prompt"] + "\n" + code if "def " not in code else code + test_code = full + "\n\n" + p["test"] + f"\n\ncheck({p['entry_point']})" + ok, _ = run_python(test_code, timeout=10) + if ok: correct += 1 + if (i+1) % 20 == 0: log(f" eval {i+1}/{len(he)}: {correct} correct") + return correct, len(he) + + +def main(): + ap = argparse.ArgumentParser() + ap.add_argument("--model", default="Qwen/Qwen2.5-14B") + ap.add_argument("--iterations", type=int, default=20) + ap.add_argument("--problems_per_iter", type=int, default=8) + ap.add_argument("--n_attempts", type=int, default=4) + ap.add_argument("--tag", required=True) + args = ap.parse_args() + + out_dir = f"/workspace/bootstrap14b/{args.tag}" + os.makedirs(out_dir, exist_ok=True) + + log(f"loading {args.model} in 4-bit NF4") + tok = AutoTokenizer.from_pretrained(args.model) + if tok.pad_token is None: tok.pad_token = tok.eos_token + tok.padding_side = "left" + bnb_cfg = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_quant_type="nf4", + bnb_4bit_compute_dtype=torch.bfloat16, + bnb_4bit_use_double_quant=True) + model = AutoModelForCausalLM.from_pretrained(args.model, quantization_config=bnb_cfg, + device_map="cuda:0") + model.eval() + log(f" loaded mem={torch.cuda.memory_allocated('cuda:0')/1e9:.1f}GB") + + log("INITIAL eval on full HumanEval") + base_correct, base_total = humaneval_full(model, tok) + log(f" base: {base_correct}/{base_total}") + + accumulated = [] + for it in range(1, args.iterations + 1): + it_t = time.time() + valid_problems = [] + for _ in range(args.problems_per_iter): + raw = gen_one(model, tok, PROBLEM_GEN_PROMPT, max_new=400, temperature=0.9) + code = extract_code(raw) if "```" in raw else raw + parsed = parse_problem(code) + if not parsed: continue + full = parsed["canonical"] + "\n\n" + "\n".join(parsed["tests"]) + ok, _ = run_python(full) + if ok: 
valid_problems.append(parsed) + + new_pairs = 0 + for p in valid_problems: + attempts = [] + solve_prompt = f"Implement: {p['signature']}\n\nTests:\n{chr(10).join(p['tests'])}\n\nOutput only the function implementation in one ```python block." + for _ in range(args.n_attempts): + raw = gen_one(model, tok, solve_prompt, max_new=400, temperature=0.8) + attempts.append(raw) + broken = None; fixed = None + for raw in attempts: + code = extract_code(raw) if "```" in raw else raw + full = code + "\n\n" + "\n".join(p["tests"]) + ok, err = run_python(full) + if ok and fixed is None: fixed = code + elif not ok and broken is None: broken = code; broken_err = err + if broken and fixed: break + if broken and fixed: + accumulated.append({"signature": p["signature"], "tests": p["tests"], + "broken": broken, "error": broken_err if 'broken_err' in dir() else "", + "fixed": fixed}) + new_pairs += 1 + + log(f"iter {it}: {len(valid_problems)} valid, {new_pairs} pairs (total: {len(accumulated)}) [{time.time()-it_t:.0f}s]") + with open(f"{out_dir}/pairs.jsonl", "w") as fh: + for r in accumulated: fh.write(json.dumps(r) + "\n") + + log(f"DONE — accumulated {len(accumulated)} pairs from {args.iterations} iters") + print() + print("=" * 70) + print(f" 14B BASELINE: {base_correct}/{base_total} on HumanEval") + print(f" Accumulated pairs: {len(accumulated)}") + print(f" Time: {time.time()-T0:.0f}s") + print("=" * 70) + + +if __name__ == "__main__": + main() diff --git a/recipe/curriculum_code.py b/recipe/curriculum_code.py new file mode 100644 index 0000000..6d87214 --- /dev/null +++ b/recipe/curriculum_code.py @@ -0,0 +1,322 @@ +"""TinyForge-Zero on CODE with self-difficulty curriculum. + +Loop: + 1. Generate problem (seeded fresh or amplified/simplified from pool) + 2. Greedy solve. Verify against tests. + - If correct → easy → amplify + - If wrong → try 4 sampled attempts + - If at-edge (some pass, some fail) → MINE pair + - If all fail → too hard → simplify + 3. Train periodically. 
Eval on HumanEval. +""" +import os, sys, json, time, re, gc, subprocess, tempfile, argparse, random +os.environ.setdefault("HF_HOME", "/workspace/hf") +os.environ.setdefault("CUDA_VISIBLE_DEVICES", "0") +os.environ["TRANSFORMERS_VERBOSITY"] = "error" +os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1" + +import torch +from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, Trainer +from datasets import load_dataset, Dataset as HFDataset +from peft import LoraConfig, get_peft_model + +T0 = time.time() +def log(m): print(f"[{time.time()-T0:7.1f}s] {m}", flush=True) + + +def extract_code(text): + if "```python" in text: text = text.split("```python", 1)[1] + elif "```" in text: text = text.split("```", 1)[1] + if "```" in text: text = text.split("```", 1)[0] + return text.strip() + + +def run_python(code, timeout=8): + with tempfile.NamedTemporaryFile("w", suffix=".py", delete=False) as f: + f.write(code); path = f.name + try: + r = subprocess.run(["python3", path], capture_output=True, timeout=timeout, text=True, cwd="/tmp") + if r.returncode == 0: return True, "" + err = (r.stderr or r.stdout).strip().splitlines() + return False, "\n".join(err[-3:])[:300] + except subprocess.TimeoutExpired: return False, "timeout" + finally: + try: os.unlink(path) + except: pass + + +def gen_batch(model, tok, prompts, max_new=400, temperature=0.0, batch=4): + outs = [] + for i in range(0, len(prompts), batch): + chunk = prompts[i:i+batch] + texts = [] + for p in chunk: + msgs = [{"role": "system", "content": "You are a Python coder."}, + {"role": "user", "content": p}] + texts.append(tok.apply_chat_template(msgs, tokenize=False, add_generation_prompt=True)) + inp = tok(texts, return_tensors="pt", padding=True, truncation=True, max_length=1500).to(model.device) + with torch.no_grad(): + out = model.generate(**inp, max_new_tokens=max_new, do_sample=temperature > 0, + temperature=temperature if temperature > 0 else 1.0, top_p=0.95, + pad_token_id=tok.eos_token_id) + 
for j in range(out.size(0)): + outs.append(tok.decode(out[j][inp.input_ids.shape[1]:], skip_special_tokens=True)) + return outs + + +SEED_GEN_PROMPT = """Generate ONE simple Python coding problem with a clear function spec and 3 test assertions. + +Output exactly: + +```python +def {function_name}({args}): + \"\"\"{description}\"\"\" + {implementation} + +# tests +assert {function_name}(...) == ... +assert {function_name}(...) == ... +assert {function_name}(...) == ... +``` + +Output ONLY the code block.""" + + +AMPLIFY_PROMPT = """Take this Python coding problem and make it HARDER (add an edge case, additional constraint, or trickier logic). Keep the format with function + 3 assert tests. + +Original: +```python +{original} +``` + +Output the harder version (function + tests) in one ```python block.""" + + +SIMPLIFY_PROMPT = """Take this Python coding problem and make it EASIER (remove an edge case, simplify the logic). Keep the format with function + 3 assert tests. + +Original: +```python +{original} +``` + +Output the easier version (function + tests) in one ```python block.""" + + +def parse_problem(text): + code = extract_code(text) if "```" in text else text.strip() + if "def " not in code: return None + lines = code.split("\n") + func_start = next((i for i, l in enumerate(lines) if l.startswith("def ")), None) + if func_start is None: return None + tests = [] + def_end = None + for i in range(func_start, len(lines)): + l = lines[i] + if l.startswith("def ") and i > func_start: break + if l.startswith("assert "): + tests.append(l) + if def_end is None: def_end = i + if len(tests) < 2: return None + if def_end is None: def_end = len(lines) + full_solution = "\n".join(lines[func_start:def_end]).strip() + if len(full_solution) < 30: return None + m = re.match(r"def\s+(\w+)\s*\(", lines[func_start]) + if not m: return None + fn_name = m.group(1) + sig_lines = [] + for i in range(func_start, def_end): + sig_lines.append(lines[i]) + if i == func_start and not 
any('"""' in lines[j] for j in range(i, min(i+5, def_end))): + sig_lines.append(" pass"); break + if i > func_start and '"""' in lines[i] and (i > func_start+1 and '"""' in lines[i-1] or lines[i].count('"""') >= 2): + break + return {"fn_name": fn_name, "signature": "\n".join(sig_lines), "tests": tests, + "canonical": full_solution, "raw": code} + + +def humaneval_full(model, tok, n=164): + he = list(load_dataset("openai_humaneval", split="test"))[:n] + log(f" HumanEval ({len(he)} problems)") + prompts = [p["prompt"] + "\n# Complete the function above." for p in he] + outs = gen_batch(model, tok, prompts, max_new=400, temperature=0.0, batch=4) + correct = 0 + for p, raw in zip(he, outs): + code = extract_code(raw) if "```" in raw else raw + full = p["prompt"] + "\n" + code if "def " not in code else code + test_code = full + "\n\n" + p["test"] + f"\n\ncheck({p['entry_point']})" + ok, _ = run_python(test_code, timeout=10) + if ok: correct += 1 + return correct, len(he) + + +def make_train_example(r, tok): + user = f"Implement: {r['signature']}\n\nTests:\n{chr(10).join(r['tests'])}\n\nMy attempt:\n```python\n{r['broken']}\n```\n\nError:\n{r['error']}\n\nFix and output the corrected code only." 
+ assistant = f"```python\n{r['fixed']}\n```" + msgs_pre = [{"role": "system", "content": "You are a Python coder."}, + {"role": "user", "content": user}] + msgs_full = msgs_pre + [{"role": "assistant", "content": assistant}] + pre = tok.apply_chat_template(msgs_pre, tokenize=False, add_generation_prompt=True) + full = tok.apply_chat_template(msgs_full, tokenize=False) + pre_ids = tok(pre, add_special_tokens=False)["input_ids"] + full_ids = tok(full, add_special_tokens=False)["input_ids"] + MAX = 1024 + full_ids = full_ids[:MAX] + labels = list(full_ids) + n_pre = min(len(pre_ids), len(labels)) + for i in range(n_pre): labels[i] = -100 + pad = MAX - len(full_ids) + return {"input_ids": full_ids + [tok.pad_token_id]*pad, + "attention_mask": [1]*len(full_ids) + [0]*pad, + "labels": labels + [-100]*pad} + + +def main(): + ap = argparse.ArgumentParser() + ap.add_argument("--model", default="Qwen/Qwen2.5-7B") + ap.add_argument("--iterations", type=int, default=16) + ap.add_argument("--problems_per_iter", type=int, default=8) + ap.add_argument("--train_every", type=int, default=4) + ap.add_argument("--seed", type=int, default=42) + ap.add_argument("--tag", required=True) + args = ap.parse_args() + + random.seed(args.seed); torch.manual_seed(args.seed) + out_dir = f"/workspace/curriculum_code/{args.tag}" + os.makedirs(out_dir, exist_ok=True) + + log(f"loading {args.model}") + tok = AutoTokenizer.from_pretrained(args.model) + if tok.pad_token is None: tok.pad_token = tok.eos_token + tok.padding_side = "left" + model = AutoModelForCausalLM.from_pretrained(args.model, dtype=torch.bfloat16, device_map="cuda:0") + log(f" loaded mem={torch.cuda.memory_allocated('cuda:0')/1e9:.1f}GB") + + model.eval() + log("INITIAL eval on HumanEval") + base_correct, base_total = humaneval_full(model, tok) + log(f" base: {base_correct}/{base_total}") + + lora_cfg = LoraConfig(r=16, lora_alpha=32, lora_dropout=0.05, bias="none", + target_modules=["q_proj", "k_proj", "v_proj", "o_proj"], 
task_type="CAUSAL_LM") + model = get_peft_model(model, lora_cfg) + + accumulated = [] + problem_pool = [] + + for it in range(1, args.iterations + 1): + it_t = time.time() + + if not problem_pool: + gen_prompts = [SEED_GEN_PROMPT for _ in range(args.problems_per_iter)] + raw = gen_batch(model, tok, gen_prompts, max_new=400, temperature=0.9) + seeded = [] + for r in raw: + parsed = parse_problem(r) + if not parsed: continue + full = parsed["canonical"] + "\n\n" + "\n".join(parsed["tests"]) + ok, _ = run_python(full) + if ok: seeded.append(parsed) + problem_pool.extend(seeded) + log(f"iter {it}: seeded {len(seeded)} fresh (pool={len(problem_pool)})") + + random.shuffle(problem_pool) + attempt_problems = problem_pool[:args.problems_per_iter] + problem_pool = problem_pool[args.problems_per_iter:] + + if not attempt_problems: + log(f"iter {it}: empty pool"); continue + + # Greedy solve + greedy_prompts = [f"Implement: {p['signature']}\n\nTests:\n{chr(10).join(p['tests'])}\n\nOutput only the function in one ```python block." for p in attempt_problems] + greedy_outs = gen_batch(model, tok, greedy_prompts, max_new=300, temperature=0.0) + new_pairs = 0 + amp_targets = []; sim_targets = [] + for p, raw in zip(attempt_problems, greedy_outs): + code = extract_code(raw) if "```" in raw else raw + ok, _ = run_python(code + "\n\n" + "\n".join(p["tests"])) + if ok: + amp_targets.append(p) + else: + # at-edge check via sampling + solve_prompt = f"Implement: {p['signature']}\n\nTests:\n{chr(10).join(p['tests'])}\n\nOutput only the function in one ```python block." 
+ atts = gen_batch(model, tok, [solve_prompt]*4, max_new=300, temperature=0.7) + broken = None; broken_err = None; fixed = None + for ra in atts: + c = extract_code(ra) if "```" in ra else ra + ok2, err = run_python(c + "\n\n" + "\n".join(p["tests"])) + if ok2 and fixed is None: fixed = c + elif not ok2 and broken is None: broken = c; broken_err = err + if broken and fixed: break + if broken and fixed: + accumulated.append({"signature": p["signature"], "tests": p["tests"], + "broken": broken, "error": broken_err, "fixed": fixed}) + new_pairs += 1 + else: + sim_targets.append(p) + + log(f"iter {it}: {len(attempt_problems)} attempted, +{new_pairs} pairs (total: {len(accumulated)}). amp={len(amp_targets)}, sim={len(sim_targets)} [{time.time()-it_t:.0f}s]") + + # Generate amplified / simplified for next iter + if amp_targets: + amp_prompts = [AMPLIFY_PROMPT.format(original=p["raw"]) for p in amp_targets[:args.problems_per_iter]] + amp_outs = gen_batch(model, tok, amp_prompts, max_new=400, temperature=0.7) + for r in amp_outs: + parsed = parse_problem(r) + if not parsed: continue + full = parsed["canonical"] + "\n\n" + "\n".join(parsed["tests"]) + ok, _ = run_python(full) + if ok: problem_pool.append(parsed) + if sim_targets: + sim_prompts = [SIMPLIFY_PROMPT.format(original=p["raw"]) for p in sim_targets[:args.problems_per_iter//2]] + sim_outs = gen_batch(model, tok, sim_prompts, max_new=400, temperature=0.7) + for r in sim_outs: + parsed = parse_problem(r) + if not parsed: continue + full = parsed["canonical"] + "\n\n" + "\n".join(parsed["tests"]) + ok, _ = run_python(full) + if ok: problem_pool.append(parsed) + + with open(f"{out_dir}/pairs.jsonl", "w") as fh: + for r in accumulated: fh.write(json.dumps(r) + "\n") + + if it % args.train_every == 0 and len(accumulated) >= 10: + log(f" TRAINING on {len(accumulated)} pairs") + tok.padding_side = "right" + ds = HFDataset.from_list([make_train_example(r, tok) for r in accumulated]) + targs = TrainingArguments( + 
output_dir=f"{out_dir}/ckpt", num_train_epochs=2, + per_device_train_batch_size=1, gradient_accumulation_steps=4, + learning_rate=1e-4, bf16=True, logging_steps=10, + save_strategy="no", report_to="none", remove_unused_columns=False, warmup_ratio=0.05, + ) + Trainer(model=model, args=targs, train_dataset=ds, processing_class=tok).train() + tok.padding_side = "left" + model.eval() + corr, tot = humaneval_full(model, tok) + log(f" HumanEval @ iter {it}: {corr}/{tot} Δ={corr-base_correct:+d}") + model.train() + + model.eval() + final_correct, final_total = humaneval_full(model, tok) + + result = { + "model": args.model, "iterations": args.iterations, + "n_pairs": len(accumulated), + "base": [base_correct, base_total], + "trained": [final_correct, final_total], + "delta": final_correct - base_correct, + "elapsed_s": time.time() - T0, + } + with open(f"{out_dir}/result.json", "w") as fh: json.dump(result, fh, indent=2) + + print() + print("=" * 70) + print(f" CURRICULUM TINYFORGE-ZERO-CODE — {args.model}") + print(f" HumanEval: base={base_correct}/{base_total} trained={final_correct}/{final_total} Δ={final_correct-base_correct:+d}") + print(f" Self-mined pairs: {len(accumulated)}") + print(f" Time: {time.time()-T0:.0f}s") + print("=" * 70) + + +if __name__ == "__main__": + main() diff --git a/recipe/math_bootstrap.py b/recipe/math_bootstrap.py new file mode 100644 index 0000000..7e897bf --- /dev/null +++ b/recipe/math_bootstrap.py @@ -0,0 +1,283 @@ +"""TinyForge-Zero on math word problems. 
+ +Same recipe as code bootstrap, different verifier: + - Model generates (word_problem, python_expression_for_answer) pairs + - Python eval gives the canonical numerical answer + - Solver gets word problem only, must produce a number + - Compare solver's number to canonical → broken/fixed pairs + - Train on accumulated pairs + - Eval on GSM8K (held-out) +""" +import os, sys, json, time, re, gc, subprocess, tempfile, argparse, random +os.environ.setdefault("HF_HOME", "/workspace/hf") +os.environ["CUDA_VISIBLE_DEVICES"] = "1" +os.environ["TRANSFORMERS_VERBOSITY"] = "error" +os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1" + +import torch +from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, Trainer +from datasets import load_dataset, Dataset as HFDataset +from peft import LoraConfig, get_peft_model + +T0 = time.time() +def log(m): print(f"[{time.time()-T0:7.1f}s] {m}", flush=True) + + +def extract_code(text): + if "```python" in text: text = text.split("```python", 1)[1] + elif "```" in text: text = text.split("```", 1)[1] + if "```" in text: text = text.split("```", 1)[0] + return text.strip() + + +def safe_eval(expr: str): + """Eval a numeric Python expression. Returns float or None.""" + try: + # Restrict to math operations + allowed = "0123456789+-*/.()% " + if not all(c in allowed or c.isspace() for c in expr): return None + return float(eval(expr, {"__builtins__": {}}, {})) + except Exception: + return None + + +def extract_answer(text: str): + """Pull a numeric answer from model output. 
Looks for last number or boxed.""" + # GSM8K style: "#### 42" + m = re.search(r"####\s*(-?\d+(?:\.\d+)?)", text) + if m: return float(m.group(1)) + # \boxed{42} + m = re.search(r"\\boxed\{(-?\d+(?:\.\d+)?)\}", text) + if m: return float(m.group(1)) + # "answer is 42" or "= 42" + matches = re.findall(r"-?\d+(?:\.\d+)?", text) + if matches: + try: return float(matches[-1]) + except: return None + return None + + +def gen_batch(model, tok, prompts, max_new=400, temperature=0.0, batch=4): + outs = [] + for i in range(0, len(prompts), batch): + chunk = prompts[i:i+batch] + texts = [] + for p in chunk: + msgs = [{"role": "system", "content": "You are a careful math tutor."}, + {"role": "user", "content": p}] + texts.append(tok.apply_chat_template(msgs, tokenize=False, add_generation_prompt=True)) + inp = tok(texts, return_tensors="pt", padding=True, truncation=True, max_length=1500).to(model.device) + with torch.no_grad(): + out = model.generate(**inp, max_new_tokens=max_new, do_sample=temperature > 0, + temperature=temperature if temperature > 0 else 1.0, top_p=0.95, + pad_token_id=tok.eos_token_id) + for j in range(out.size(0)): + outs.append(tok.decode(out[j][inp.input_ids.shape[1]:], skip_special_tokens=True)) + return outs + + +PROBLEM_GEN_PROMPT = """Generate ONE math word problem with a numerical answer. Output exactly this format: + +PROBLEM: +EXPRESSION: +ANSWER: + +Make the problem grade-school to middle-school level. 
The expression must evaluate to the answer.""" + + +def parse_generated_problem(text: str): + """Extract (problem, expression, answer) from model output.""" + p_m = re.search(r"PROBLEM:\s*(.+?)(?:\n|EXPRESSION:)", text, re.DOTALL) + e_m = re.search(r"EXPRESSION:\s*(.+?)(?:\n|ANSWER:)", text, re.DOTALL) + a_m = re.search(r"ANSWER:\s*(-?\d+(?:\.\d+)?)", text) + if not (p_m and e_m and a_m): return None + problem = p_m.group(1).strip() + expression = e_m.group(1).strip() + try: + claimed = float(a_m.group(1)) + except: return None + if len(problem) < 10 or len(expression) < 1: return None + # Verify: expression evaluates to claimed answer + actual = safe_eval(expression) + if actual is None: return None + if abs(actual - claimed) > 0.01: return None + return {"problem": problem, "expression": expression, "answer": claimed} + + +SOLVE_PROMPT_TEMPLATE = """Solve this math problem step by step. End with the answer on a new line as: #### + +Problem: {problem}""" + + +def solve_and_check(model, tok, problem_text: str, gold_answer: float, n_attempts: int = 4, temperature: float = 0.7): + """Sample N attempts, return list of (text, predicted_num, ok).""" + prompt = SOLVE_PROMPT_TEMPLATE.format(problem=problem_text) + outs = gen_batch(model, tok, [prompt] * n_attempts, max_new=400, temperature=temperature) + results = [] + for raw in outs: + pred = extract_answer(raw) + ok = pred is not None and abs(pred - gold_answer) < 0.01 + results.append({"text": raw, "pred": pred, "ok": ok}) + return results + + +def gsm8k_eval(model, tok, n=200): + ds = list(load_dataset("openai/gsm8k", "main", split="test")) + ds = ds[:n] + log(f" eval on GSM8K ({len(ds)} problems)") + prompts = [SOLVE_PROMPT_TEMPLATE.format(problem=p["question"]) for p in ds] + outs = gen_batch(model, tok, prompts, max_new=400, temperature=0.0, batch=4) + correct = 0 + for p, raw in zip(ds, outs): + # GSM8K's answer field has format "step-by-step\n#### 42" + gold_m = re.search(r"####\s*(-?\d+(?:,\d+)*(?:\.\d+)?)", 
p["answer"]) + if not gold_m: continue + gold = float(gold_m.group(1).replace(",", "")) + pred = extract_answer(raw) + if pred is not None and abs(pred - gold) < 0.01: correct += 1 + return correct, len(ds) + + +def make_train_example(r, tok): + user = SOLVE_PROMPT_TEMPLATE.format(problem=r["problem"]) + f"\n\nMy attempt:\n{r['broken']}\n\nThis is wrong. Solve it correctly and end with #### ." + assistant = r["fixed"] + msgs_pre = [{"role": "system", "content": "You are a careful math tutor."}, + {"role": "user", "content": user}] + msgs_full = msgs_pre + [{"role": "assistant", "content": assistant}] + pre = tok.apply_chat_template(msgs_pre, tokenize=False, add_generation_prompt=True) + full = tok.apply_chat_template(msgs_full, tokenize=False) + pre_ids = tok(pre, add_special_tokens=False)["input_ids"] + full_ids = tok(full, add_special_tokens=False)["input_ids"] + MAX = 1024 + full_ids = full_ids[:MAX] + labels = list(full_ids) + n_pre = min(len(pre_ids), len(labels)) + for i in range(n_pre): labels[i] = -100 + pad = MAX - len(full_ids) + return {"input_ids": full_ids + [tok.pad_token_id]*pad, + "attention_mask": [1]*len(full_ids) + [0]*pad, + "labels": labels + [-100]*pad} + + +def main(): + ap = argparse.ArgumentParser() + ap.add_argument("--model", default="Qwen/Qwen2.5-7B") + ap.add_argument("--iterations", type=int, default=20) + ap.add_argument("--problems_per_iter", type=int, default=16) + ap.add_argument("--train_every", type=int, default=8) + ap.add_argument("--eval_every", type=int, default=8) + ap.add_argument("--n_eval", type=int, default=200) + ap.add_argument("--seed", type=int, default=42) + ap.add_argument("--tag", required=True) + args = ap.parse_args() + + random.seed(args.seed); torch.manual_seed(args.seed) + out_dir = f"/workspace/math/{args.tag}" + os.makedirs(out_dir, exist_ok=True) + + log(f"loading {args.model}") + tok = AutoTokenizer.from_pretrained(args.model) + if tok.pad_token is None: tok.pad_token = tok.eos_token + tok.padding_side = 
"left" + device = "cuda:0" # CUDA_VISIBLE_DEVICES=1 makes physical GPU 1 appear as cuda:0 + model = AutoModelForCausalLM.from_pretrained(args.model, dtype=torch.bfloat16, device_map=device) + log(f" loaded mem={torch.cuda.memory_allocated(device)/1e9:.1f}GB") + + # Initial eval + model.eval() + log("INITIAL eval on GSM8K") + init_correct, init_total = gsm8k_eval(model, tok, n=args.n_eval) + log(f" GSM8K base: {init_correct}/{init_total}") + + # LoRA + lora_cfg = LoraConfig(r=16, lora_alpha=32, lora_dropout=0.05, bias="none", + target_modules=["q_proj", "k_proj", "v_proj", "o_proj"], task_type="CAUSAL_LM") + model = get_peft_model(model, lora_cfg) + log(f" LoRA applied, trainable={sum(p.numel() for p in model.parameters() if p.requires_grad)/1e6:.1f}M") + + accumulated_pairs = [] + eval_log = [{"iter": 0, "correct": init_correct, "total": init_total}] + iter_stats = [] + + for it in range(1, args.iterations + 1): + it_t = time.time() + # 1. Generate problems + gen_prompts = [PROBLEM_GEN_PROMPT for _ in range(args.problems_per_iter)] + raw_problems = gen_batch(model, tok, gen_prompts, max_new=300, temperature=0.9) + + # 2. Parse & verify (Python eval of expression) + valid = [] + for raw in raw_problems: + parsed = parse_generated_problem(raw) + if parsed: valid.append(parsed) + + if not valid: + log(f"iter {it}: 0 valid problems") + iter_stats.append({"iter": it, "valid": 0, "pairs": 0}) + continue + + # 3. 
Mine pairs from sampled solver outputs + new_pairs = 0 + for p in valid: + attempts = solve_and_check(model, tok, p["problem"], p["answer"], n_attempts=4, temperature=0.7) + ok_atts = [a for a in attempts if a["ok"]] + bad_atts = [a for a in attempts if not a["ok"]] + if ok_atts and bad_atts: + accumulated_pairs.append({ + "problem": p["problem"], + "answer": p["answer"], + "broken": bad_atts[0]["text"], + "fixed": ok_atts[0]["text"], + }) + new_pairs += 1 + + log(f"iter {it}: {len(valid)} valid problems, {new_pairs} pairs harvested (total: {len(accumulated_pairs)}) [{time.time()-it_t:.0f}s]") + iter_stats.append({"iter": it, "valid": len(valid), "pairs": new_pairs, "elapsed": time.time()-it_t}) + + # Save incrementally + with open(f"{out_dir}/pairs.jsonl", "w") as fh: + for r in accumulated_pairs: fh.write(json.dumps(r) + "\n") + + # 4. Train every N + if it % args.train_every == 0 and len(accumulated_pairs) >= 10: + log(f" TRAINING on {len(accumulated_pairs)} pairs") + tok.padding_side = "right" + ds = HFDataset.from_list([make_train_example(r, tok) for r in accumulated_pairs]) + targs = TrainingArguments( + output_dir=f"{out_dir}/ckpt", num_train_epochs=2, + per_device_train_batch_size=1, gradient_accumulation_steps=4, + learning_rate=1e-4, bf16=True, logging_steps=10, + save_strategy="no", report_to="none", remove_unused_columns=False, warmup_ratio=0.05, + ) + Trainer(model=model, args=targs, train_dataset=ds, processing_class=tok).train() + tok.padding_side = "left" + + # 5. 
Eval every N + if it % args.eval_every == 0: + model.eval() + corr, tot = gsm8k_eval(model, tok, n=args.n_eval) + log(f" GSM8K @ iter {it}: {corr}/{tot}") + eval_log.append({"iter": it, "correct": corr, "total": tot}) + model.train() + + # Final eval + model.eval() + final_correct, final_total = gsm8k_eval(model, tok, n=args.n_eval) + eval_log.append({"iter": args.iterations, "correct": final_correct, "total": final_total, "final": True}) + + with open(f"{out_dir}/iter_stats.jsonl", "w") as fh: + for r in iter_stats: fh.write(json.dumps(r) + "\n") + with open(f"{out_dir}/eval_log.json", "w") as fh: + json.dump(eval_log, fh, indent=2) + + print() + print("=" * 70) + print(f" TINYFORGE-ZERO ON MATH ({args.model})") + print(f" GSM8K-mini ({final_total}): base={init_correct} final={final_correct} Δ={final_correct-init_correct:+d}") + print(f" Total pairs mined: {len(accumulated_pairs)}") + print(f" Time: {time.time()-T0:.0f}s") + print("=" * 70) + + +if __name__ == "__main__": + main() diff --git a/tts/tts_aime.py b/tts/tts_aime.py new file mode 100644 index 0000000..a1cb9fc --- /dev/null +++ b/tts/tts_aime.py @@ -0,0 +1,103 @@ +"""TTS on AIME (Olympiad math). 90 problems, integer answers 0-999. +If 8B+best-of-N hits 30%+, that's matching frontier reasoning models.""" +import os, json, time, re, argparse +os.environ.setdefault("HF_HOME", "/workspace/hf") +os.environ["TRANSFORMERS_VERBOSITY"] = "error" + +import torch +from datasets import load_dataset + +T0 = time.time() +def log(m): print(f"[{time.time()-T0:7.1f}s] {m}", flush=True) + + +def extract_int(text): + """AIME answers are integers 0-999. 
Try \boxed first, fall back to last integer.""" + m = re.search(r"\\boxed\{(\d+)\}", text) + if m: + try: return int(m.group(1)) + except: return None + # Last integer in last few lines + lines = text.strip().split("\n") + for line in reversed(lines[-5:]): + nums = re.findall(r"\b(\d+)\b", line) + if nums: + try: return int(nums[-1]) + except: pass + return None + + +def main(): + ap = argparse.ArgumentParser() + ap.add_argument("--model", required=True) + ap.add_argument("--n_samples", type=int, default=8) + ap.add_argument("--temperature", type=float, default=0.7) + ap.add_argument("--tag", required=True) + args = ap.parse_args() + + out_dir = f"/workspace/tts_aime/{args.tag}" + os.makedirs(out_dir, exist_ok=True) + + from vllm import LLM, SamplingParams + from transformers import AutoTokenizer + log(f"loading {args.model}") + tok = AutoTokenizer.from_pretrained(args.model) + if tok.pad_token is None: tok.pad_token = tok.eos_token + llm = LLM(model=args.model, dtype="bfloat16", gpu_memory_utilization=0.90, max_model_len=3072) + log(f" loaded") + + ds = list(load_dataset("AI-MO/aimo-validation-aime", split="train")) + log(f" AIME: {len(ds)} problems") + + SYS = "You are a careful math problem solver. AIME answers are integers between 0 and 999. End with \\boxed{integer}." + UTMPL = "Solve this AIME problem. 
Show your reasoning, then put the final integer answer in \\boxed{{...}}.\n\nProblem: {problem}\n\nSolution:" + prompts = [] + for p in ds: + msgs = [{"role": "system", "content": SYS}, + {"role": "user", "content": UTMPL.format(problem=p["problem"])}] + try: + prompts.append(tok.apply_chat_template(msgs, tokenize=False, add_generation_prompt=True)) + except Exception: + prompts.append(UTMPL.format(problem=p["problem"])) + + log("=== GREEDY ===") + sp_g = SamplingParams(temperature=0, max_tokens=2000) + t0 = time.time() + g_outs = [o.outputs[0].text for o in llm.generate(prompts, sp_g, use_tqdm=False)] + log(f" gen in {time.time()-t0:.1f}s") + g_correct = 0 + for p, raw in zip(ds, g_outs): + pred = extract_int(raw) + gold = int(p["answer"]) + if pred == gold: g_correct += 1 + log(f" GREEDY: {g_correct}/{len(ds)} ({100*g_correct/len(ds):.1f}%)") + + log(f"=== BEST-OF-{args.n_samples} (temp={args.temperature}) ===") + sp_s = SamplingParams(temperature=args.temperature, top_p=0.95, max_tokens=2000, n=args.n_samples) + t0 = time.time() + s_outs = llm.generate(prompts, sp_s, use_tqdm=False) + log(f" gen in {time.time()-t0:.1f}s") + bN_correct = 0 + for p, outset in zip(ds, s_outs): + gold = int(p["answer"]) + for o in outset.outputs: + pred = extract_int(o.text) + if pred == gold: + bN_correct += 1; break + + result = {"model": args.model, "n_samples": args.n_samples, "temperature": args.temperature, + "greedy": g_correct, "best_of_N": bN_correct, "n": len(ds), "elapsed_s": time.time()-T0} + with open(f"{out_dir}/result.json", "w") as fh: json.dump(result, fh, indent=2) + + print() + print("=" * 70) + print(f" {args.model} — AIME ({len(ds)} problems)") + print(f" Greedy: {g_correct}/{len(ds)} ({100*g_correct/len(ds):.1f}%)") + print(f" Best-of-{args.n_samples}: {bN_correct}/{len(ds)} ({100*bN_correct/len(ds):.1f}%)") + print(f" TTS Lift: +{bN_correct - g_correct} ({100*(bN_correct-g_correct)/len(ds):.1f}pp)") + print(f" Time: {time.time()-T0:.0f}s") + print("=" * 70) + 
+ +if __name__ == "__main__": + main() diff --git a/tts/tts_humaneval.py b/tts/tts_humaneval.py new file mode 100644 index 0000000..acfd711 --- /dev/null +++ b/tts/tts_humaneval.py @@ -0,0 +1,126 @@ +"""TTS on HumanEval+ (contamination-resistant) to verify the 92% isn't memorization.""" +import os, json, time, subprocess, tempfile, argparse +os.environ.setdefault("HF_HOME", "/workspace/hf") +os.environ["TRANSFORMERS_VERBOSITY"] = "error" + +import torch +from datasets import load_dataset + +T0 = time.time() +def log(m): print(f"[{time.time()-T0:7.1f}s] {m}", flush=True) + + +def extract_code(text): + if "```python" in text: text = text.split("```python", 1)[1] + elif "```" in text: text = text.split("```", 1)[1] + if "```" in text: text = text.split("```", 1)[0] + return text.strip() + + +def run_python(code, timeout=15): + with tempfile.NamedTemporaryFile("w", suffix=".py", delete=False) as f: + f.write(code); path = f.name + try: + r = subprocess.run(["python3", path], capture_output=True, timeout=timeout, text=True, cwd="/tmp") + return r.returncode == 0 + except subprocess.TimeoutExpired: return False + finally: + try: os.unlink(path) + except: pass + + +def main(): + ap = argparse.ArgumentParser() + ap.add_argument("--model", required=True) + ap.add_argument("--n_samples", type=int, default=8) + ap.add_argument("--temperature", type=float, default=0.6) + ap.add_argument("--tag", required=True) + args = ap.parse_args() + + out_dir = f"/workspace/tts_hep/{args.tag}" + os.makedirs(out_dir, exist_ok=True) + + from vllm import LLM, SamplingParams + from transformers import AutoTokenizer + log(f"loading {args.model}") + tok = AutoTokenizer.from_pretrained(args.model) + if tok.pad_token is None: tok.pad_token = tok.eos_token + llm = LLM(model=args.model, dtype="bfloat16", gpu_memory_utilization=0.90, max_model_len=2048) + log(f" loaded") + + hep = list(load_dataset("evalplus/humanevalplus", split="test")) + log(f" HE+: {len(hep)} problems") + + prompts = [] + for p 
in hep: + try: + msgs = [{"role": "system", "content": "You are a Python coder. Output one ```python block only."}, + {"role": "user", "content": p["prompt"] + "\n# Complete the function above."}] + prompts.append(tok.apply_chat_template(msgs, tokenize=False, add_generation_prompt=True)) + except Exception: + prompts.append(p["prompt"]) + + log("=== GREEDY ===") + sp_g = SamplingParams(temperature=0, max_tokens=400) + g_outs = [o.outputs[0].text for o in llm.generate(prompts, sp_g, use_tqdm=False)] + base_pass, plus_pass = 0, 0 + for p, raw in zip(hep, g_outs): + code = extract_code(raw) if "```" in raw else raw + full = p["prompt"] + "\n" + code if "def " not in code else code + # base test + b_test = full + "\n\n" + p["test"] + f"\n\ncheck({p['entry_point']})" + b_ok = run_python(b_test, 15) + if b_ok: base_pass += 1 + # plus test (harder, hidden cases) + if "plus_test" in p: + p_test = full + "\n\n" + p["plus_test"] + f"\n\ncheck({p['entry_point']})" + if run_python(p_test, 15): plus_pass += 1 + else: + if b_ok: plus_pass += 1 + log(f" GREEDY base: {base_pass}/{len(hep)} plus(hidden): {plus_pass}/{len(hep)}") + + log(f"=== BEST-OF-{args.n_samples} (temp={args.temperature}) ===") + sp_s = SamplingParams(temperature=args.temperature, top_p=0.95, max_tokens=400, n=args.n_samples) + s_outs = llm.generate(prompts, sp_s, use_tqdm=False) + bN_base, bN_plus = 0, 0 + for p, outset in zip(hep, s_outs): + attempts = [o.text for o in outset.outputs] + base_ok_any = False + plus_ok_any = False + for a in attempts: + code = extract_code(a) if "```" in a else a + full = p["prompt"] + "\n" + code if "def " not in code else code + b_test = full + "\n\n" + p["test"] + f"\n\ncheck({p['entry_point']})" + b_ok = run_python(b_test, 15) + if b_ok and not base_ok_any: + base_ok_any = True + if "plus_test" in p: + p_test = full + "\n\n" + p["plus_test"] + f"\n\ncheck({p['entry_point']})" + p_ok = run_python(p_test, 15) + if p_ok and not plus_ok_any: + plus_ok_any = True + elif b_ok and 
not plus_ok_any: + plus_ok_any = True + if base_ok_any and plus_ok_any: break + if base_ok_any: bN_base += 1 + if plus_ok_any: bN_plus += 1 + + result = {"model": args.model, "n_samples": args.n_samples, "temperature": args.temperature, + "greedy_base": base_pass, "greedy_plus": plus_pass, + "best_of_N_base": bN_base, "best_of_N_plus": bN_plus, + "n": len(hep), "elapsed_s": time.time()-T0} + with open(f"{out_dir}/result.json", "w") as fh: json.dump(result, fh, indent=2) + + print() + print("=" * 70) + print(f" {args.model} — HumanEval+ ({len(hep)} problems)") + print(f" Greedy base: {base_pass}/{len(hep)} ({100*base_pass/len(hep):.1f}%)") + print(f" Greedy plus (hard): {plus_pass}/{len(hep)} ({100*plus_pass/len(hep):.1f}%)") + print(f" Best-of-{args.n_samples} base: {bN_base}/{len(hep)} ({100*bN_base/len(hep):.1f}%)") + print(f" Best-of-{args.n_samples} plus: {bN_plus}/{len(hep)} ({100*bN_plus/len(hep):.1f}%)") + print(f" Time: {time.time()-T0:.0f}s") + print("=" * 70) + + +if __name__ == "__main__": + main() diff --git a/tts/tts_math500.py b/tts/tts_math500.py new file mode 100644 index 0000000..f0be95e --- /dev/null +++ b/tts/tts_math500.py @@ -0,0 +1,125 @@ +"""TTS on MATH-500: greedy + best-of-N pass@1. + +If TTS works on math like it does on code, we should see major lift. 
+""" +import os, json, time, re, argparse +os.environ.setdefault("HF_HOME", "/workspace/hf") +os.environ["TRANSFORMERS_VERBOSITY"] = "error" + +import torch +from datasets import load_dataset +import sympy +from sympy.parsing.latex import parse_latex + +T0 = time.time() +def log(m): print(f"[{time.time()-T0:7.1f}s] {m}", flush=True) + + +def extract_boxed(text): + idx = text.rfind("\\boxed{") + if idx < 0: return None + start = idx + len("\\boxed{") + depth = 1; i = start + while i < len(text) and depth > 0: + if text[i] == "{": depth += 1 + elif text[i] == "}": depth -= 1 + i += 1 + if depth != 0: return None + return text[start:i-1].strip() + + +def normalize(s): + if s is None: return None + s = s.strip() + s = re.sub(r"^\$|\$$", "", s).strip() + s = re.sub(r"\\text\{([^}]*)\}", r"\1", s) + s = re.sub(r"\\mbox\{([^}]*)\}", r"\1", s) + s = re.sub(r"(?<=\d),(?=\d)", "", s) + s = s.replace("\\left", "").replace("\\right", "").replace("^\\circ", "").replace("^{\\circ}", "") + return s.strip() + + +def sympy_equal(a, b): + if a is None or b is None: return False + a, b = normalize(a), normalize(b) + if a == b: return True + try: + ea = parse_latex(a); eb = parse_latex(b) + if sympy.simplify(ea - eb) == 0: return True + except Exception: pass + try: + if abs(float(a) - float(b)) < 1e-6: return True + except Exception: pass + return False + + +def main(): + ap = argparse.ArgumentParser() + ap.add_argument("--model", required=True) + ap.add_argument("--n_samples", type=int, default=8) + ap.add_argument("--temperature", type=float, default=0.7) + ap.add_argument("--tag", required=True) + args = ap.parse_args() + + out_dir = f"/workspace/tts_math/{args.tag}" + os.makedirs(out_dir, exist_ok=True) + + from vllm import LLM, SamplingParams + from transformers import AutoTokenizer + log(f"loading {args.model}") + tok = AutoTokenizer.from_pretrained(args.model) + if tok.pad_token is None: tok.pad_token = tok.eos_token + llm = LLM(model=args.model, dtype="bfloat16", 
gpu_memory_utilization=0.90, max_model_len=2048) + log(f" loaded") + + ds = list(load_dataset("HuggingFaceH4/MATH-500", split="test")) + log(f" MATH-500: {len(ds)} problems") + + SYS = "You are a careful math problem solver. End with \\boxed{answer}." + USER_TEMPLATE = "Solve this competition math problem. Show your reasoning, then put the final answer in \\boxed{{...}}.\n\nProblem: {problem}\n\nSolution:" + prompts = [] + for p in ds: + msgs = [{"role": "system", "content": SYS}, + {"role": "user", "content": USER_TEMPLATE.format(problem=p["problem"])}] + try: + prompts.append(tok.apply_chat_template(msgs, tokenize=False, add_generation_prompt=True)) + except Exception: + prompts.append(USER_TEMPLATE.format(problem=p["problem"])) + + # Greedy + log("=== GREEDY ===") + sp_g = SamplingParams(temperature=0, max_tokens=800) + t0 = time.time() + g_outs = [o.outputs[0].text for o in llm.generate(prompts, sp_g, use_tqdm=False)] + log(f" gen in {time.time()-t0:.1f}s") + g_correct = sum(1 for p, raw in zip(ds, g_outs) if sympy_equal(extract_boxed(raw), p["answer"])) + log(f" GREEDY: {g_correct}/{len(ds)} ({100*g_correct/len(ds):.1f}%)") + + # Best-of-N (any correct) + log(f"=== BEST-OF-{args.n_samples} (temp={args.temperature}) ===") + sp_s = SamplingParams(temperature=args.temperature, top_p=0.95, max_tokens=800, n=args.n_samples) + t0 = time.time() + s_outs = llm.generate(prompts, sp_s, use_tqdm=False) + log(f" gen in {time.time()-t0:.1f}s") + bN_correct = 0 + for p, outset in zip(ds, s_outs): + for o in outset.outputs: + if sympy_equal(extract_boxed(o.text), p["answer"]): + bN_correct += 1; break + + result = {"model": args.model, "n_samples": args.n_samples, "temperature": args.temperature, + "greedy": g_correct, "best_of_N": bN_correct, "n": len(ds), "elapsed_s": time.time()-T0} + with open(f"{out_dir}/result.json", "w") as fh: json.dump(result, fh, indent=2) + + print() + print("=" * 70) + print(f" {args.model} — MATH-500 ({len(ds)} problems)") + print(f" Greedy: 
{g_correct}/{len(ds)} ({100*g_correct/len(ds):.1f}%)") + print(f" Best-of-{args.n_samples}: {bN_correct}/{len(ds)} ({100*bN_correct/len(ds):.1f}%)") + print(f" TTS Lift: +{bN_correct - g_correct} ({100*(bN_correct-g_correct)/len(ds):.1f}pp)") + print(f" Time: {time.time()-T0:.0f}s") + print("=" * 70) + + +if __name__ == "__main__": + main() diff --git a/tts/tts_qwen14b_recipe.py b/tts/tts_qwen14b_recipe.py new file mode 100644 index 0000000..f40ec2c --- /dev/null +++ b/tts/tts_qwen14b_recipe.py @@ -0,0 +1,135 @@ +"""Test-time scaling on Qwen2.5-14B-Base + multi_v1 adapter. + +For each HumanEval problem: + 1. Sample 8 attempts at temp=0.6 from the trained model. + 2. Run each attempt against the tests. + 3. Accept the first that passes → pass@1 with best-of-N selection. + +Compared to greedy pass@1 (which gave 80.5%), this should push higher. +""" +import os, json, time, re, subprocess, tempfile, argparse +os.environ.setdefault("HF_HOME", "/workspace/hf") +os.environ["TRANSFORMERS_VERBOSITY"] = "error" + +import torch +from datasets import load_dataset + +T0 = time.time() +def log(m): print(f"[{time.time()-T0:7.1f}s] {m}", flush=True) + + +def extract_code(text): + if "```python" in text: text = text.split("```python", 1)[1] + elif "```" in text: text = text.split("```", 1)[1] + if "```" in text: text = text.split("```", 1)[0] + return text.strip() + + +def run_python(code, timeout=15): + with tempfile.NamedTemporaryFile("w", suffix=".py", delete=False) as f: + f.write(code); path = f.name + try: + r = subprocess.run(["python3", path], capture_output=True, timeout=timeout, text=True, cwd="/tmp") + return r.returncode == 0 + except subprocess.TimeoutExpired: return False + finally: + try: os.unlink(path) + except: pass + + +def main(): + ap = argparse.ArgumentParser() + ap.add_argument("--model", default="Qwen/Qwen2.5-14B") + ap.add_argument("--adapter", default="/workspace/multi_v1_adapter") + ap.add_argument("--n_samples", type=int, default=8) + 
ap.add_argument("--temperature", type=float, default=0.6) + ap.add_argument("--tag", required=True) + args = ap.parse_args() + + out_dir = f"/workspace/tts/{args.tag}" + os.makedirs(out_dir, exist_ok=True) + + from vllm import LLM + from vllm.lora.request import LoRARequest + from transformers import AutoTokenizer + log(f"loading {args.model} with adapter {args.adapter}") + tok = AutoTokenizer.from_pretrained(args.model) + if tok.pad_token is None: tok.pad_token = tok.eos_token + llm = LLM(model=args.model, dtype="bfloat16", gpu_memory_utilization=0.90, max_model_len=2048, + enable_lora=True, max_lora_rank=32) + lora_req = LoRARequest("multi_v1", 1, args.adapter) + log(f" loaded") + + he = list(load_dataset("openai_humaneval", split="test")) + log(f" HE: {len(he)} problems") + + # --- Greedy baseline (with adapter) + log("=== GREEDY pass@1 (with adapter) ===") + from vllm import SamplingParams + sp_greedy = SamplingParams(temperature=0, max_tokens=400) + # Use chat template for Qwen2.5 (it has one) + prompts = [] + for p in he: + msgs = [{"role": "system", "content": "You are a Python coder. 
Output one ```python block only."}, + {"role": "user", "content": p["prompt"] + "\n# Complete the function above."}] + prompts.append(tok.apply_chat_template(msgs, tokenize=False, add_generation_prompt=True)) + t0 = time.time() + greedy_outs = [o.outputs[0].text for o in llm.generate(prompts, sp_greedy, lora_request=lora_req, use_tqdm=False)] + log(f" greedy gen in {time.time()-t0:.1f}s") + greedy_correct = 0 + for p, raw in zip(he, greedy_outs): + code = extract_code(raw) if "```" in raw else raw + full = p["prompt"] + "\n" + code if "def " not in code else code + test_code = full + "\n\n" + p["test"] + f"\n\ncheck({p['entry_point']})" + if run_python(test_code, 15): greedy_correct += 1 + log(f" GREEDY pass@1: {greedy_correct}/{len(he)} ({100*greedy_correct/len(he):.1f}%)") + + # --- Test-time scaling: sample N, take first that passes (best-of-N pass@1) + log(f"=== TEST-TIME SCALING: N={args.n_samples}, temp={args.temperature} ===") + sp_sample = SamplingParams(temperature=args.temperature, top_p=0.95, + max_tokens=400, n=args.n_samples) + t0 = time.time() + sample_outs = llm.generate(prompts, sp_sample, lora_request=lora_req, use_tqdm=False) + log(f" sampling gen in {time.time()-t0:.1f}s") + + t1 = time.time() + bestN_correct = 0 + per_problem = [] + for p, outset in zip(he, sample_outs): + attempts = [o.text for o in outset.outputs] + any_pass = False + for a in attempts: + code = extract_code(a) if "```" in a else a + full = p["prompt"] + "\n" + code if "def " not in code else code + test_code = full + "\n\n" + p["test"] + f"\n\ncheck({p['entry_point']})" + if run_python(test_code, 15): + any_pass = True + break + if any_pass: bestN_correct += 1 + per_problem.append({"task_id": p["task_id"], "best_of_N_pass": any_pass}) + log(f" verify done in {time.time()-t1:.1f}s") + + result = { + "model": args.model, "adapter": args.adapter, + "n_samples": args.n_samples, "temperature": args.temperature, + "greedy_passN": greedy_correct, + "best_of_N_passN": bestN_correct, 
+ "n_total": len(he), + "elapsed_s": time.time()-T0, + } + with open(f"{out_dir}/result.json", "w") as fh: json.dump(result, fh, indent=2) + with open(f"{out_dir}/per_problem.json", "w") as fh: json.dump(per_problem, fh, indent=2) + + print() + print("=" * 70) + print(f" {args.model} + adapter {args.adapter}") + print(f" HumanEval:") + print(f" Greedy pass@1: {greedy_correct}/{len(he)} ({100*greedy_correct/len(he):.1f}%)") + print(f" Best-of-{args.n_samples} pass@1: {bestN_correct}/{len(he)} ({100*bestN_correct/len(he):.1f}%)") + print(f" Lift: +{bestN_correct - greedy_correct} ({100*(bestN_correct-greedy_correct)/len(he):.1f}pp)") + print(f" Time: {time.time()-T0:.0f}s") + print("=" * 70) + + +if __name__ == "__main__": + main() diff --git a/tts/tts_qwen3_8b_raw_control.py b/tts/tts_qwen3_8b_raw_control.py new file mode 100644 index 0000000..d706885 --- /dev/null +++ b/tts/tts_qwen3_8b_raw_control.py @@ -0,0 +1,118 @@ +"""Control: Qwen3-8B-Base RAW (no recipe) + best-of-8 on HumanEval. + +Tells us if the 89.6% headline on 14B+recipe is driven by recipe or by test-time scaling. 
+""" +import os, json, time, re, subprocess, tempfile, argparse +os.environ.setdefault("HF_HOME", "/workspace/hf") +os.environ["TRANSFORMERS_VERBOSITY"] = "error" + +import torch +from datasets import load_dataset + +T0 = time.time() +def log(m): print(f"[{time.time()-T0:7.1f}s] {m}", flush=True) + + +def extract_code(text): + if "```python" in text: text = text.split("```python", 1)[1] + elif "```" in text: text = text.split("```", 1)[1] + if "```" in text: text = text.split("```", 1)[0] + return text.strip() + + +def run_python(code, timeout=15): + with tempfile.NamedTemporaryFile("w", suffix=".py", delete=False) as f: + f.write(code); path = f.name + try: + r = subprocess.run(["python3", path], capture_output=True, timeout=timeout, text=True, cwd="/tmp") + return r.returncode == 0 + except subprocess.TimeoutExpired: return False + finally: + try: os.unlink(path) + except: pass + + +def main(): + ap = argparse.ArgumentParser() + ap.add_argument("--model", required=True) + ap.add_argument("--n_samples", type=int, default=8) + ap.add_argument("--temperature", type=float, default=0.6) + ap.add_argument("--tag", required=True) + args = ap.parse_args() + + out_dir = f"/workspace/tts_raw/{args.tag}" + os.makedirs(out_dir, exist_ok=True) + + from vllm import LLM, SamplingParams + from transformers import AutoTokenizer + log(f"loading {args.model} (no adapter)") + tok = AutoTokenizer.from_pretrained(args.model) + if tok.pad_token is None: tok.pad_token = tok.eos_token + llm = LLM(model=args.model, dtype="bfloat16", gpu_memory_utilization=0.90, max_model_len=2048) + log(f" loaded") + + he = list(load_dataset("openai_humaneval", split="test")) + log(f" HE: {len(he)} problems") + + # Try chat-template style if available, else raw + prompts = [] + for p in he: + try: + msgs = [{"role": "system", "content": "You are a Python coder. 
Output one ```python block only."}, + {"role": "user", "content": p["prompt"] + "\n# Complete the function above."}] + prompts.append(tok.apply_chat_template(msgs, tokenize=False, add_generation_prompt=True)) + except Exception: + prompts.append(p["prompt"]) + + # --- Greedy + log("=== GREEDY pass@1 ===") + sp_g = SamplingParams(temperature=0, max_tokens=400) + t0 = time.time() + g_outs = [o.outputs[0].text for o in llm.generate(prompts, sp_g, use_tqdm=False)] + log(f" greedy gen in {time.time()-t0:.1f}s") + g_correct = 0 + for p, raw in zip(he, g_outs): + code = extract_code(raw) if "```" in raw else raw + full = p["prompt"] + "\n" + code if "def " not in code else code + test_code = full + "\n\n" + p["test"] + f"\n\ncheck({p['entry_point']})" + if run_python(test_code, 15): g_correct += 1 + log(f" GREEDY pass@1: {g_correct}/{len(he)} ({100*g_correct/len(he):.1f}%)") + + # --- Best-of-N + log(f"=== BEST-OF-{args.n_samples} (temp={args.temperature}) ===") + sp_s = SamplingParams(temperature=args.temperature, top_p=0.95, max_tokens=400, n=args.n_samples) + t0 = time.time() + s_outs = llm.generate(prompts, sp_s, use_tqdm=False) + log(f" sampling gen in {time.time()-t0:.1f}s") + t1 = time.time() + bN_correct = 0 + for p, outset in zip(he, s_outs): + attempts = [o.text for o in outset.outputs] + for a in attempts: + code = extract_code(a) if "```" in a else a + full = p["prompt"] + "\n" + code if "def " not in code else code + test_code = full + "\n\n" + p["test"] + f"\n\ncheck({p['entry_point']})" + if run_python(test_code, 15): + bN_correct += 1 + break + log(f" verify in {time.time()-t1:.1f}s") + + result = { + "model": args.model, "n_samples": args.n_samples, "temperature": args.temperature, + "greedy_passN": g_correct, "best_of_N_passN": bN_correct, "n_total": len(he), + "elapsed_s": time.time()-T0, + } + with open(f"{out_dir}/result.json", "w") as fh: json.dump(result, fh, indent=2) + + print() + print("=" * 70) + print(f" {args.model} (NO ADAPTER) — HumanEval") 
+ print(f" Greedy pass@1: {g_correct}/{len(he)} ({100*g_correct/len(he):.1f}%)") + print(f" Best-of-{args.n_samples} pass@1: {bN_correct}/{len(he)} ({100*bN_correct/len(he):.1f}%)") + print(f" Lift from TTS: +{bN_correct - g_correct} ({100*(bN_correct-g_correct)/len(he):.1f}pp)") + print(f" Time: {time.time()-T0:.0f}s") + print("=" * 70) + + +if __name__ == "__main__": + main() diff --git a/tts/tts_scaling.py b/tts/tts_scaling.py new file mode 100644 index 0000000..d16e625 --- /dev/null +++ b/tts/tts_scaling.py @@ -0,0 +1,165 @@ +"""TTS scaling sweep: pass@1 across N samples for HE + HE+ + MATH-500.""" +import os, json, time, re, subprocess, tempfile, argparse +os.environ.setdefault("HF_HOME", "/workspace/hf") +os.environ["TRANSFORMERS_VERBOSITY"] = "error" + +import torch +from datasets import load_dataset + +T0 = time.time() +def log(m): print(f"[{time.time()-T0:7.1f}s] {m}", flush=True) + + +def extract_code(t): + if "```python" in t: t = t.split("```python", 1)[1] + elif "```" in t: t = t.split("```", 1)[1] + if "```" in t: t = t.split("```", 1)[0] + return t.strip() + + +def run_python(code, timeout=10): + with tempfile.NamedTemporaryFile("w", suffix=".py", delete=False) as f: + f.write(code); path = f.name + try: + r = subprocess.run(["python3", path], capture_output=True, timeout=timeout, text=True, cwd="/tmp") + return r.returncode == 0 + except subprocess.TimeoutExpired: return False + finally: + try: os.unlink(path) + except: pass + + +def extract_boxed(text): + idx = text.rfind("\\boxed{") + if idx < 0: return None + start = idx + len("\\boxed{"); depth = 1; i = start + while i < len(text) and depth > 0: + if text[i] == "{": depth += 1 + elif text[i] == "}": depth -= 1 + i += 1 + if depth != 0: return None + return text[start:i-1].strip() + + +def main(): + ap = argparse.ArgumentParser() + ap.add_argument("--model", required=True) + ap.add_argument("--tag", required=True) + ap.add_argument("--out_dir", required=True) + args = ap.parse_args() + + 
os.makedirs(args.out_dir, exist_ok=True) + + from vllm import LLM, SamplingParams + from transformers import AutoTokenizer + log(f"loading {args.model}") + tok = AutoTokenizer.from_pretrained(args.model) + if tok.pad_token is None: tok.pad_token = tok.eos_token + llm = LLM(model=args.model, dtype="bfloat16", gpu_memory_utilization=0.85, max_model_len=2048) + log("loaded") + + he = list(load_dataset("openai_humaneval", split="test")) + math500 = list(load_dataset("HuggingFaceH4/MATH-500", split="test"))[:200] + + # Build prompts + he_prompts = [] + for p in he: + try: + msgs = [{"role": "system", "content": "You are a Python coder. Output one ```python block only."}, + {"role": "user", "content": p["prompt"] + "\n# Complete the function above."}] + he_prompts.append(tok.apply_chat_template(msgs, tokenize=False, add_generation_prompt=True)) + except Exception: + he_prompts.append(p["prompt"]) + + math_prompts = [] + UTMPL = "Solve this competition math problem. End with \\boxed{{...}}.\n\nProblem: {p}\n\nSolution:" + for p in math500: + try: + msgs = [{"role": "system", "content": "Math solver. 
End with \\boxed{answer}."}, + {"role": "user", "content": UTMPL.format(p=p["problem"])}] + math_prompts.append(tok.apply_chat_template(msgs, tokenize=False, add_generation_prompt=True)) + except Exception: + math_prompts.append(UTMPL.format(p=p["problem"])) + + # Generate max-N samples ONCE per task (N=32), then compute pass@k for k ∈ {1, 2, 4, 8, 16, 32} + MAX_N = 32 + sp = SamplingParams(temperature=0.6, top_p=0.95, max_tokens=600, n=MAX_N) + log(f"generating MAX_N={MAX_N} samples per task") + t0 = time.time() + he_outs = llm.generate(he_prompts, sp, use_tqdm=False) + log(f" HE gen in {time.time()-t0:.1f}s") + t0 = time.time() + math_outs = llm.generate(math_prompts, sp, use_tqdm=False) + log(f" MATH gen in {time.time()-t0:.1f}s") + + # Compute correctness for each sample + def he_correct(p, raw): + code = extract_code(raw) if "```" in raw else raw + full = p["prompt"] + "\n" + code if "def " not in code else code + test_code = full + "\n\n" + p["test"] + f"\n\ncheck({p['entry_point']})" + return run_python(test_code, 10) + + log("verifying HE samples...") + he_results = [] # per task: list of bool + for p, outset in zip(he, he_outs): + per_task = [] + for o in outset.outputs: + per_task.append(he_correct(p, o.text)) + he_results.append(per_task) + log(f" HE verify done") + + import sympy + from sympy.parsing.latex import parse_latex + def sympy_eq(a, b): + if a is None or b is None: return False + a, b = a.strip(), b.strip() + if a == b: return True + try: + if sympy.simplify(parse_latex(a) - parse_latex(b)) == 0: return True + except Exception: pass + try: + if abs(float(a) - float(b)) < 1e-6: return True + except Exception: pass + return False + + log("verifying MATH samples...") + math_results = [] + for p, outset in zip(math500, math_outs): + per_task = [] + for o in outset.outputs: + pred = extract_boxed(o.text) + per_task.append(sympy_eq(pred, p["answer"])) + math_results.append(per_task) + log(f" MATH verify done") + + # Compute pass@k for each k + NS = 
[1, 2, 4, 8, 16, 32] + def best_of_k(results, k): + return sum(1 for r in results if any(r[:k])) + + he_scaling = {k: best_of_k(he_results, k) for k in NS} + math_scaling = {k: best_of_k(math_results, k) for k in NS} + + result = { + "model": args.model, "tag": args.tag, "MAX_N": MAX_N, + "humaneval_total": len(he), + "math500_total": len(math500), + "he_pass_at_k": he_scaling, + "math500_pass_at_k": math_scaling, + "elapsed_s": time.time() - T0, + } + with open(f"{args.out_dir}/result.json", "w") as fh: json.dump(result, fh, indent=2) + + print() + print("=" * 70) + print(f" {args.model} — TTS SCALING SWEEP") + print(f" N HE MATH-500") + for k in NS: + print(f" {k:>3} {he_scaling[k]:>3}/{len(he)} ({100*he_scaling[k]/len(he):.1f}%) " + f"{math_scaling[k]:>3}/{len(math500)} ({100*math_scaling[k]/len(math500):.1f}%)") + print(f" Time: {time.time()-T0:.0f}s") + print("=" * 70) + + +if __name__ == "__main__": + main()