Ship every paper-referenced experiment script

Reorganizes the repo so every section of the paper has a corresponding
script. Previously only the core recipe + control + evals were here.

New subdirs:
- tts/             — test-time sampling (§2.2, §3.3): scaling sweep, HE, MATH-500,
                     AIME, 14B-recipe + TTS, 8B-raw-TTS control.
- experiments/     — every §3 finding as a runnable script:
                     · self_consistency (§3.4)
                     · recipe_x_tts_synergy (§3.5, novel)
                     · mbpp_seeded_cross_arch (§3.9)
                     · cross_domain_code_to_math (§3.10)
                     · self_correction_math_{naive,fixed} (§3.10, the
                       catastrophic-then-recovered case)
                     · math500_seeded_mining (§3.10 distribution mismatch)
                     · bcb_hard_eval (§3.10 distribution mismatch)
                     · recursive_bootstrap (§3.10 plateau)
                     · diversity_cued_mining (§3.10 low yield)
                     · aime_scaling (TTS curve)
                     · star_baseline_gsm8k (related-work baseline)
- evals/           — moved out of recipe/ (eval_raw, eval_plus, confirm)

Also adds: bootstrap_14b_4bit_harvest, curriculum_code, math_bootstrap to
recipe/ for completeness.

REPRODUCE.md now maps each paper section / table / figure to its exact
script and expected output.
This commit is contained in:
Rana Usman 2026-05-13 21:09:54 +05:00
parent c867697f7c
commit 826f934d2e
27 changed files with 4467 additions and 134 deletions

View file

@ -37,29 +37,59 @@ A control experiment — replacing the mined pairs with **identically-formatted
```
tinyforge-zero/
├── recipe/
│ ├── train_on_pairs.py # Fast-path: train LoRA on a released pairs.jsonl
│ ├── bootstrap.py # Full-path: self-bootstrap pipeline (mining + train, 7B / 3B)
│ ├── multi_pair_14b.py # Full-path: aggressive multi-pair variant → 80.5% on 14B
│ ├── curriculum_math.py # Full-path: auto-difficulty curriculum for GSM8K
│ ├── eval_raw.py # HumanEval / MBPP / GSM8K eval (vLLM, raw-completion)
│ ├── eval_plus.py # HumanEval+ contamination-resistant eval
│ └── confirm.py # Confirmation re-eval against base
├── data/
│ ├── pairs_7b_40.jsonl # 40 self-mined pairs (Qwen2.5-7B-Base run)
│ ├── pairs_14b_multi_new60.jsonl # 60 aggressive-mined pairs for 14B (+ warmup 40 → 100 total)
│ └── pairs_math_13.jsonl # 13 curriculum-mined math pairs (Qwen2.5-3B-Base → GSM8K 32→66)
├── recipe/ # Training pipelines
│ ├── train_on_pairs.py # Fast-path: train LoRA on a released pairs.jsonl
│ ├── bootstrap.py # Self-bootstrap pipeline (mining + train, 7B / 3B)
│ ├── bootstrap_14b_4bit_harvest.py # 4-bit harvest variant (when full-precision OOMs)
│ ├── multi_pair_14b.py # Aggressive multi-pair variant → 80.5% on 14B
│ ├── curriculum_math.py # Auto-difficulty curriculum for GSM8K (§2.3, §3.8)
│ ├── curriculum_code.py # Auto-difficulty curriculum for code
│ └── math_bootstrap.py # Vanilla math bootstrap (regressed; see §3.8)
├── evals/ # Evaluation harnesses
│ ├── eval_raw.py # HumanEval / MBPP / GSM8K (vLLM, raw-completion)
│ ├── eval_plus.py # HumanEval+ contamination-resistant eval
│ └── confirm.py # Confirmation re-eval against base
├── tts/ # Test-time sampling (§2.2, §3.3)
│ ├── tts_scaling.py # Pass@N scaling sweep (HE, HE+, MATH-500)
│ ├── tts_humaneval.py # Best-of-N pass@1 on HE/HE+
│ ├── tts_math500.py # Best-of-N pass@1 on MATH-500
│ ├── tts_aime.py # Pass@k curve on AIME (k=1..64)
│ ├── tts_qwen14b_recipe.py # TTS on top of the 14B multi-pair adapter
│ └── tts_qwen3_8b_raw_control.py # Control: TTS on raw Qwen3-8B (recipe vs sampling)
├── experiments/ # Every paper experiment, one script each
│ ├── self_consistency.py # §3.4 — deployable TTS via majority vote (no oracle)
│ ├── recipe_x_tts_synergy.py # §3.5 — recipe × TTS synergy threshold (novel finding)
│ ├── cross_domain_code_to_math.py # §3.10 — code-trained recipe on math (+2, marginal)
│ ├── mbpp_seeded_cross_arch.py # §3.9 — Llama/Coder cross-architecture self-mining
│ ├── diversity_cued_mining.py # §3.10 — diversity-cued mining (low yield)
│ ├── recursive_bootstrap.py # §3.10 — recursive iter1→iter2→iter3 (plateau)
│ ├── self_correction_code.py # §3.10 — code self-correction recipe
│ ├── self_correction_math_naive.py # §3.10 — naive (wrong→fix only): catastrophic regress
│ ├── self_correction_math_fixed.py # §3.10 — fixed (mixed positives): recovered
│ ├── math500_seeded_mining.py # §3.10 — distribution-mismatch demo (catastrophic)
│ ├── aime_scaling.py # AIME pass@k = 1..64 sweep
│ ├── bcb_hard_eval.py # §3.10 — BigCodeBench-Hard distribution mismatch
│ └── star_baseline_gsm8k.py # Related-work baseline (STaR / rejection sampling FT)
├── controls/
│ └── mbpp_corrupt_control.py # The +0 negative-control experiment
│ └── mbpp_corrupt_control.py # §3.6 — the +0 negative-control experiment
├── data/ # Released mined pairs (drove paper numbers)
│ ├── pairs_7b_40.jsonl # 40 pairs for Qwen2.5-7B-Base
│ ├── pairs_14b_multi_new60.jsonl # 60 aggressive-mined pairs for 14B (+ warmup 40 = 100)
│ └── pairs_math_13.jsonl # 13 curriculum-mined math pairs (3B GSM8K)
├── docs/
│ ├── scaling_chart.png # Recipe lift vs base capability (paper Fig 1)
│ ├── fig1_headline.png # Headline result chart
│ └── fig6_boundary.png # Boundary conditions across 9 models
├── REPRODUCE.md # Paper figure/table → exact command mapping
│ ├── recipe_diagram.png # The 5-stage recipe diagram (rendered above)
│ ├── scaling_chart.png # Recipe lift vs base capability (paper Fig 1)
│ ├── fig1_headline.png # Headline result chart
│ └── fig6_boundary.png # Boundary conditions across 9 models
├── scripts/
│ └── make_recipe_diagram.py # Source for the rendered recipe diagram
├── REPRODUCE.md # Paper claim → exact command mapping (all sections)
├── requirements.txt
└── LICENSE
```
A note on these scripts: `recipe/`, `evals/`, and `controls/` are the clean replication paths — these have argparse CLIs and produce the headline numbers. The scripts under `experiments/` and `tts/` are the **original research scripts** used to produce each figure / table in the paper. They work, but they're closer to "research code" than "production tooling" — argument names vary, some have hard-coded paths to `/workspace/`, and they were each run on RunPod with a specific GPU. Read the top-of-file docstring of any experiment script for what it does and how to invoke it.
---
## Quickstart
@ -73,7 +103,7 @@ cd tinyforge-zero
pip install -r requirements.txt
# 3. Baseline the model (so you know the lift is real)
python recipe/eval_raw.py \
python evals/eval_raw.py \
--model Qwen/Qwen2.5-7B \
--bench humaneval
@ -85,7 +115,7 @@ python recipe/train_on_pairs.py \
--out adapter_7b --seed 13
# 5. Evaluate the trained adapter
python recipe/eval_raw.py \
python evals/eval_raw.py \
--model Qwen/Qwen2.5-7B \
--adapter adapter_7b \
--bench humaneval

View file

@ -1,154 +1,151 @@
# Reproduction Guide
Maps every paper claim → exact command. There are **two replication paths**:
Maps every paper claim → the script that produced it. Two replication paths:
- **Fast path** — use `recipe/train_on_pairs.py` with the released `data/*.jsonl`. Skips the mining stage. Gets you the trained adapter and the headline number in ~30 min on an H100.
- **Full path** — re-run the original research scripts (`bootstrap.py`, `multi_pair_14b.py`, `curriculum_math.py`) end-to-end including the self-mining step. This reproduces the recipe from scratch and verifies the mining is deterministic-ish (modulo sampling).
- **Fast path** — use `recipe/train_on_pairs.py` with `data/*.jsonl`. Reproduces the trained adapter and headline number in ~30 min on H100. Recommended for paper verification.
- **Full path** — re-run the original research scripts end-to-end including the self-mining stage. Use this if applying the recipe to a *new* base model.
The fast path is what you want for paper verification. The full path is what you want if you're trying the recipe on a *new* base model.
A note on script conventions: scripts under `recipe/`, `evals/`, and `controls/` are clean replication paths (argparse CLIs, no hardcoded paths). Scripts under `experiments/` and `tts/` are the original research code used to produce each finding — they work but use `--tag`-style outputs and sometimes assume `/workspace/` paths: the Hugging Face cache defaults to `/workspace/hf` and can be overridden via the `HF_HOME` env var, while some scripts also hard-code `/workspace/` input defaults or output directories. Read the top-of-file docstring of each to see exact invocation.
---
## Environment
Tested on:
- **H100 80GB** (recommended for 14B runs) — Debian 12, CUDA 12.4, driver 570+
- **RTX 6000 Ada 48GB** — sufficient for 7B and 3B runs
- **H100 80GB** — Debian 12, CUDA 12.4, driver 570+ (required for vLLM 0.8.5)
- **RTX 6000 Ada 48GB** — sufficient for ≤7B models
```bash
pip install -r requirements.txt
```
Exact stack used in the paper: `torch==2.6.0`, `transformers==4.51.3`, `vllm==0.8.5`, `peft==0.13.0`.
Pinned stack: `torch==2.6.0`, `transformers==4.51.3`, `vllm==0.8.5`, `peft==0.13.0`.
---
## FAST PATH — reproduce headline numbers from released pairs
# Mapping: paper claim → script
### Qwen2.5-7B-Base → 25 → 95–112/164 (3-seed range)
## §2 Method
| Paper § | Method | Script | Notes |
|---|---|---|---|
| §2.1 | Self-bootstrap pipeline (code) | `recipe/bootstrap.py` | Generation → solving → mining → train, end-to-end |
| §2.1 | 4-bit harvest for large models | `recipe/bootstrap_14b_4bit_harvest.py` | NF4 quantization, harvest-only (no in-loop training) |
| §2.1 | Aggressive multi-pair mining | `recipe/multi_pair_14b.py` | The 14B 80.5% pipeline |
| §2.2 | Test-time sampling (oracle) | `tts/tts_scaling.py` | Pass@N for HE / HE+ / MATH-500 |
| §2.3 | Auto-difficulty curriculum (math) | `recipe/curriculum_math.py` | The GSM8K 32→66 pipeline |
| §2.3 | Auto-difficulty curriculum (code) | `recipe/curriculum_code.py` | Code variant |
---
## §3 Experiments
### §3.2 Recipe alone — HumanEval and HumanEval+
| Claim (paper Table 1) | Script + command |
|---|---|
| Qwen2.5-7B-Base: 25 → 112 (+87 best seed) | Fast path: `python recipe/train_on_pairs.py --model Qwen/Qwen2.5-7B --pairs data/pairs_7b_40.jsonl --seed 13 --lora-rank 16 --out adapter_7b_seed13` then `python evals/eval_raw.py --model Qwen/Qwen2.5-7B --adapter adapter_7b_seed13 --bench humaneval` |
| Qwen2.5-14B-Base: 44 → 131 / 80% on HE, 122/164 on HE+ | `cat data/pairs_7b_40.jsonl data/pairs_14b_multi_new60.jsonl > /tmp/14b.jsonl; python recipe/train_on_pairs.py --model Qwen/Qwen2.5-14B --pairs /tmp/14b.jsonl --lora-rank 32 --out adapter_14b_multi; python evals/eval_plus.py --model Qwen/Qwen2.5-14B --adapter adapter_14b_multi` |
| Multi-pair full path (re-mine + train) | `python recipe/multi_pair_14b.py --model Qwen/Qwen2.5-14B --warmup_pairs_path data/pairs_7b_40.jsonl --n_problems 200 --n_attempts 8 --max_pairs_per_problem 4 --lora_rank 32 --tag multi_rerun` |
| Boundary table for all 9 models | `python evals/eval_raw.py --model <each>` for baseline; recipe + re-eval per model. Cost: ~3 hr H100. |
### §3.3 Test-time sampling (TTS) alone
| Claim | Script | Expected |
|---|---|---|
| Qwen3-4B best-of-8 HE oracle = 92.7% | `python tts/tts_humaneval.py --model Qwen/Qwen3-4B-Base --n 8 --temperature 0.7` | 152/164 |
| Qwen3-8B best-of-8 HE oracle = 92.1% | `python tts/tts_humaneval.py --model Qwen/Qwen3-8B-Base --n 8 --temperature 0.7` | 151/164 |
| Qwen3-4B best-of-8 MATH-500 = 79.4% | `python tts/tts_math500.py --model Qwen/Qwen3-4B-Base --n 8` | 397/500 |
| Qwen3-8B best-of-8 MATH-500 = 81.0% | `python tts/tts_math500.py --model Qwen/Qwen3-8B-Base --n 8` | 405/500 |
| AIME pass@k curve (k=1..64) | `python tts/tts_aime.py --model Qwen/Qwen3-8B-Base --n 32` | 25.6 / 38.9% best-of-32 |
| Full TTS scaling sweep (Table 2) | `python tts/tts_scaling.py --model Qwen/Qwen3-4B-Base` | |
### §3.4 Self-consistency (deployable TTS, no oracle)
```bash
# 1. Baseline (raw-completion eval)
python recipe/eval_raw.py --model Qwen/Qwen2.5-7B --bench humaneval
# Expected: 25/164
# 2. Train on the released 40 pairs (try multiple seeds — small-data variance)
for SEED in 7 13 42; do
python recipe/train_on_pairs.py \
--model Qwen/Qwen2.5-7B \
--pairs data/pairs_7b_40.jsonl \
--out adapter_7b_seed${SEED} \
--seed ${SEED} --lora-rank 16 --epochs 2 --lr 1e-4
python recipe/eval_raw.py \
--model Qwen/Qwen2.5-7B \
--adapter adapter_7b_seed${SEED} \
--bench humaneval
done
# Expected: seed 7 → 104/164, seed 13 → 112/164, seed 42 → 95/164
python experiments/self_consistency.py \
--model Qwen/Qwen3-4B-Base \
--bench gsm8k --n 8
```
Tests if majority-vote selection without oracle access matches oracle pass@N. See paper Table 3.
### Qwen2.5-14B-Base → 132/164 (80.5%) and HumanEval+ 122/164 (74.4%)
The 14B run uses 100 pairs total: the 40 warmup pairs + 60 new aggressive-mined pairs. Concatenate first, then train.
### §3.5 Recipe × TTS synergy threshold (novel finding)
```bash
cat data/pairs_7b_40.jsonl data/pairs_14b_multi_new60.jsonl > /tmp/pairs_14b_100.jsonl
python recipe/train_on_pairs.py \
--model Qwen/Qwen2.5-14B \
--pairs /tmp/pairs_14b_100.jsonl \
--out adapter_14b_multi \
--lora-rank 32 --epochs 2 --lr 1e-4
python recipe/eval_raw.py \
--model Qwen/Qwen2.5-14B \
python experiments/recipe_x_tts_synergy.py \
--base-model Qwen/Qwen2.5-14B \
--adapter adapter_14b_multi \
--bench humaneval
# Expected: 132/164 (80.5%) in the multi-pair eval format
python recipe/eval_plus.py \
--model Qwen/Qwen2.5-14B \
--adapter adapter_14b_multi
# Expected: HumanEval+ 122/164 (74.4%)
--n 8
```
Compares: raw base | raw base + TTS | recipe-trained | recipe-trained + TTS. The novel finding: at sufficient mined-pair counts, recipe-trained + TTS > raw + TTS (+12.8pp). At too-few pairs, recipe-trained + TTS < raw + TTS (-4.9pp on Qwen2.5-3B with 36 pairs).
### Qwen2.5-3B-Base → GSM8K 32 → 66
### §3.6 Control: format alone does not explain the lift
```bash
python recipe/train_on_pairs.py \
--model Qwen/Qwen2.5-3B \
--pairs data/pairs_math_13.jsonl \
--out adapter_3b_math \
--lora-rank 16 --epochs 2 --lr 1e-4
# GSM8K eval — uses sympy as the verifier (no oracle math model needed).
# eval_raw.py auto-detects GSM8K format and runs the right verifier.
python recipe/eval_raw.py \
--model Qwen/Qwen2.5-3B \
--adapter adapter_3b_math \
--bench gsm8k
# Expected: 66/100
```
---
## FULL PATH — re-mine from scratch
These reproduce the *mining* step too. Each script does generation → solving → mining → training → eval as one pipeline. They write a `pairs.jsonl` and a `result.json` under `--tag`.
### Self-bootstrap from scratch on Qwen2.5-7B
```bash
python recipe/bootstrap.py \
python controls/mbpp_corrupt_control.py \
--model Qwen/Qwen2.5-7B \
--iterations 20 \
--problems_per_iter 16 \
--train_every 10 \
--eval_every 10 \
--tag bs_7b_rerun
# Writes: results/bs_7b_rerun/{pairs.jsonl,ckpt_iter*,eval_log.json,result.json}
# Expected final eval: 25 → 95–112 (seed-dependent)
--tag mbpp_corrupt_control
```
Expected: HumanEval stays at 25/164 (Δ = 0). Confirms the signal is in self-mined content, not pair-formatted training data.
### Aggressive multi-pair mining on Qwen2.5-14B (the 80.5% headline)
### §3.7 Multi-pair mining at 14B (the 80.5% headline)
```bash
python recipe/multi_pair_14b.py \
--model Qwen/Qwen2.5-14B \
--warmup_pairs_path data/pairs_7b_40.jsonl \
--n_warmup_pairs 40 \
--n_problems 200 \
--n_attempts 8 \
--max_pairs_per_problem 4 \
--lora_rank 32 --epochs 2 --lr 1e-4 \
--n_problems 200 --n_attempts 8 \
--max_pairs_per_problem 4 --lora_rank 32 \
--tag multi_rerun
# Writes: results/multi_pair/multi_rerun/{pairs_new.jsonl,adapter/,result.json}
# Expected: trained 130–134/164 (~80%)
```
Expected: base 67/164 → trained 132/164 (multi-pair eval format) / 131/164 chat-template / 122/164 HE+.
### GSM8K auto-difficulty curriculum on Qwen2.5-3B
### §3.8 Math: auto-difficulty curriculum
```bash
python recipe/curriculum_math.py \
--model Qwen/Qwen2.5-3B \
--iterations 16 \
--tag curr_3b_rerun
# Mines 10–15 curriculum-difficulty pairs, trains, evals.
# Expected: GSM8K 32 → 60–70 (some seed variance)
```
Expected: GSM8K 32/100 → 66/100. Compare to `recipe/math_bootstrap.py` (vanilla, no curriculum) which regresses.
### §3.9 Cross-architecture and cross-generation
| Model | Script | Expected |
|---|---|---|
| Llama-3.2-3B (own-mined 32) | `python experiments/mbpp_seeded_cross_arch.py --model meta-llama/Llama-3.2-3B` | HE 39→43 (+4) |
| Qwen2.5-Coder-7B-Base | `python experiments/mbpp_seeded_cross_arch.py --model Qwen/Qwen2.5-Coder-7B` | HE 83→87 (+4), MBPP 122→124 (+2) |
| Qwen3-4B-Base | Same script, Qwen3-4B-Base | HE 79→106 (+27), MBPP 135→148 (+13) |
### §3.10 Failure modes and negative results
Each negative finding has its own script. Run any of these to verify the documented failure.
| Failure mode | Script | Expected |
|---|---|---|
| Saturation (Qwen3-8B/14B HE) | `python recipe/bootstrap.py --model Qwen/Qwen3-8B-Base --tag sat_check` | 132 → 118–133, no clean lift |
| BCB-Hard distribution mismatch | `python experiments/bcb_hard_eval.py --model Qwen/Qwen3-8B-Base --pairs data/pairs_7b_40.jsonl --tag bcb_hard` (trains on the 40 pairs in-script; writes to `/workspace/bcb_eval/bcb_hard/`) | No transfer; HE-style pairs don't generalize to library code |
| MATH-500 mining distribution mismatch | `python experiments/math500_seeded_mining.py --model Qwen/Qwen3-8B-Base` | 279/500 → 239/500 (−40, catastrophic) |
| Self-correction over-correction (naive) | `python experiments/self_correction_math_naive.py --model Qwen/Qwen3-4B-Base` | 299/500 → 69/500 (Δ=−230!) |
| Self-correction recovery (fixed) | `python experiments/self_correction_math_fixed.py --model Qwen/Qwen3-4B-Base` | Recovers to baseline + small lift via mixed positives |
| Recursive bootstrap plateau | `python experiments/recursive_bootstrap.py --model Qwen/Qwen2.5-7B --iters 3` | iter1 gives most lift, iter2/3 plateau |
| Cross-domain transfer (code→math) | `python experiments/cross_domain_code_to_math.py --model Qwen/Qwen2.5-7B --train_domain code --tag cross_domain --out_dir results/cross_domain` | +2 marginal lift on GSM8K |
| Diversity-cued mining low yield | `python experiments/diversity_cued_mining.py --model Qwen/Qwen2.5-7B` | Fewer well-formed pairs than vanilla mining |
---
## Control experiment (Figure 2)
## §3.11 Boundary conditions summary (Figure 6)
Verifies the signal is in the *content* of self-mined pairs, not the format. Replaces the mined pairs with mechanically-corrupted external pairs (MBPP-style) that look identical structurally.
The 9-model boundary chart is the synthesis of per-model recipe runs. To regenerate:
```bash
python controls/mbpp_corrupt_control.py \
--model Qwen/Qwen2.5-7B \
--tag mbpp_corrupt_control
# Expected: HumanEval stays at 25/164 (Δ ≈ 0, ± seed noise)
for MODEL in Qwen/Qwen2.5-{3B,7B,14B,72B} Qwen/Qwen3-{1.7B,4B,8B,14B}-Base meta-llama/Llama-3.2-3B Qwen/Qwen2.5-Coder-7B allenai/OLMo-2-1124-7B; do
python evals/eval_raw.py --model "$MODEL" --bench humaneval # baseline
python recipe/bootstrap.py --model "$MODEL" --tag "boundary_$(echo $MODEL | tr '/' '_')"
done
```
Run time: ~3 hours on a single H100, ~$8 cost.
---
@ -161,42 +158,40 @@ for N in 10 21 40; do
--model Qwen/Qwen2.5-7B \
--pairs /tmp/pairs_$N.jsonl \
--out adapter_n$N --epochs 2
python recipe/eval_raw.py \
python evals/eval_raw.py \
--model Qwen/Qwen2.5-7B --adapter adapter_n$N --bench humaneval
done
# Expected: n=10 → ~51, n=21 → 86–95, n=40 → 95–112 (seed-dependent for small N)
```
Expected: n=10 → ~51, n=21 → mean ~91, n=40 → mean ~105 (seed-dependent for small N).
---
## Boundary conditions to verify (paper §3)
## Related-work baseline
| Claim | Hint | Expected |
|-------|------|----------|
| Qwen3-8B saturated on HE | Run multi_pair_14b.py with `--model Qwen/Qwen3-8B-Base` | Base 132, adapter ≈ 118–133 — no clean lift |
| Qwen2.5-72B saturated | Same on 72B with 10 pairs | Base 83 → trained 73 (−10) |
| MATH-500 distribution mismatch | Mining on simple problems + MATH-500 eval | Base 279/500 → trained 239/500 (−40) |
| Self-correction over-correction | Train on wrong→fix triples only, no right→stays-right | Base 299/500 → trained 69/500 (−230) |
| BCB-Hard distribution mismatch | Apply 7B 40-pair adapter, eval on BCB-Hard | No transfer |
| Method | Script | Use |
|---|---|---|
| STaR / rejection-sampling FT on GSM8K | `experiments/star_baseline_gsm8k.py` | Comparison point for the curriculum result |
---
## Notes on stochasticity
## Notes on stochasticity and reproducibility
- **vLLM sampling** is deterministic given a fixed seed, but vLLM 0.8.x occasionally changes pad/EOS handling between point releases. Pin to 0.8.5.
- **LoRA training is seed-sensitive at small N.** The 7B 40-pair run spans 95–112/164 across seeds 7/13/42. The 14B 100-pair run is much tighter (130–134/164).
- **Stop tokens matter.** Use `--stop "\nclass " --stop "\nif __name__"` for raw-completion eval. Wrong stop tokens cut output prematurely and produce artifactually low baselines. We saw this earlier in the project — see paper §2.
- **vLLM sampling** is deterministic given a fixed seed, but vLLM 0.8.x can change pad/EOS handling between point releases. Pin to 0.8.5.
- **LoRA training is seed-sensitive at small N.** 7B 40-pair: 95–112/164 across seeds 7/13/42. 14B 100-pair: 130–134/164 (tighter).
- **Stop tokens matter.** Use `--stop "\nclass " --stop "\nif __name__"` for raw-completion eval. Wrong stop tokens cut output and produce artifactually low baselines. We hit this earlier in the project; the paper §2 documents the fix.
---
## Cost reference (May 2026, RunPod)
| Workflow | Hardware | Wall time | Cost |
|----------|----------|-----------|------|
|---|---|---|---|
| 7B headline (fast path) | RTX 6000 Ada 48GB | ~30 min | ~$0.50 |
| 14B 80.5% (fast path) | H100 80GB | ~30 min | ~$1.50 |
| 14B 80.5% full path (mining + train) | H100 80GB | ~95 min | ~$3.50 |
| GSM8K 32→66 | RTX 6000 Ada | ~30 min | ~$0.50 |
| Full eval matrix (9 models) | H100 80GB | ~3 hrs | ~$8 |
| 14B 80.5% full path | H100 80GB | ~95 min | ~$3.50 |
| GSM8K 32→66 curriculum | RTX 6000 Ada | ~30 min | ~$0.50 |
| TTS scaling sweep (one model) | H100 80GB | ~30 min | ~$1.50 |
| Full 9-model boundary chart | H100 80GB | ~3 hrs | ~$8 |
| Every negative result | mixed | ~5 hrs total | ~$15 |
Total cost to verify all numbers in the paper via the fast path: **under $10**.
Verify all paper numbers via fast path: **under $10**. Full reproduction from scratch (including all negative results and the full TTS sweep): **~$50**, matching the paper's reported total spend.

View file

@ -0,0 +1,91 @@
"""TTS scaling on AIME — pass@k curve from k=1 to k=64."""
import os, json, time, re, argparse
os.environ.setdefault("HF_HOME", "/workspace/hf")
os.environ["TRANSFORMERS_VERBOSITY"] = "error"
import torch
from datasets import load_dataset
# Wall-clock reference taken at import time; log lines are stamped relative to it.
T0 = time.time()


def log(m):
    """Print ``m`` prefixed with the seconds elapsed since ``T0`` and flush stdout."""
    elapsed = time.time() - T0
    print(f"[{elapsed:7.1f}s] {m}", flush=True)
def extract_int(text):
    r"""Extract an integer answer from a model completion.

    The first ``\boxed{<digits>}`` occurrence in ``text`` wins. When no such
    match exists, the last standalone run of digits on the final line of the
    stripped text is used instead.

    Args:
        text: The raw completion string.

    Returns:
        The parsed ``int``, or ``None`` when no usable number is found.
    """
    boxed = re.search(r"\\boxed\{(\d+)\}", text)
    if boxed:
        try:
            return int(boxed.group(1))
        except ValueError:
            # A boxed match that cannot be parsed does not fall through to
            # the last-line heuristic.
            return None
    # str.split always yields at least one element, so the last line exists
    # even for empty input.
    last_line = text.strip().split("\n")[-1]
    nums = re.findall(r"\b(\d+)\b", last_line)
    if nums:
        try:
            return int(nums[-1])
        except ValueError:
            pass
    return None
def main():
    """Sample 64 completions per AIME problem with vLLM and report pass@k.

    CLI arguments: ``--model`` (model name or path), ``--tag`` (run label
    stored in the result) and ``--out_dir`` (where ``result.json`` is written).

    pass@k is counted as the number of problems where any of the first ``k``
    samples matches the gold integer answer, for k in 1, 2, 4, ..., 64.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--model", required=True)
    parser.add_argument("--tag", required=True)
    parser.add_argument("--out_dir", required=True)
    args = parser.parse_args()
    os.makedirs(args.out_dir, exist_ok=True)

    # Heavy dependencies are imported only after the arguments have parsed.
    from vllm import LLM, SamplingParams
    from transformers import AutoTokenizer

    log(f"loading {args.model}")
    tok = AutoTokenizer.from_pretrained(args.model)
    if tok.pad_token is None:
        tok.pad_token = tok.eos_token
    llm = LLM(model=args.model, dtype="bfloat16", gpu_memory_utilization=0.85, max_model_len=3072)
    log("loaded")

    problems = list(load_dataset("AI-MO/aimo-validation-aime", split="train"))
    log(f" AIME: {len(problems)} problems")

    UTMPL = "Solve this AIME problem. Answer is integer 0-999. End with \\boxed{{N}}.\n\nProblem: {p}\n\nSolution:"
    prompts = []
    for item in problems:
        try:
            chat = [
                {"role": "system", "content": "AIME solver. End with \\boxed{integer}."},
                {"role": "user", "content": UTMPL.format(p=item["problem"])},
            ]
            rendered = tok.apply_chat_template(chat, tokenize=False, add_generation_prompt=True)
        except Exception:
            # Fall back to the bare template when the chat template cannot be applied.
            rendered = UTMPL.format(p=item["problem"])
        prompts.append(rendered)

    MAX_N = 64
    sampling = SamplingParams(temperature=0.8, top_p=0.95, max_tokens=1500, n=MAX_N)
    log(f"generating {MAX_N} samples per problem...")
    gen_start = time.time()
    outs = llm.generate(prompts, sampling, use_tqdm=False)
    log(f" gen in {time.time()-gen_start:.1f}s")

    # One list of per-sample correctness flags for every problem.
    per_task_results = []
    for item, request in zip(problems, outs):
        gold = int(item["answer"])
        per_task_results.append([extract_int(sample.text) == gold for sample in request.outputs])

    NS = [1, 2, 4, 8, 16, 32, 64]
    # A problem counts for pass@k when any of its first k samples is correct.
    scaling = {k: sum(1 for flags in per_task_results if any(flags[:k])) for k in NS}

    total = len(problems)
    result = {
        "model": args.model,
        "tag": args.tag,
        "MAX_N": MAX_N,
        "n_total": total,
        "pass_at_k": scaling,
        "elapsed_s": time.time() - T0,
    }
    with open(f"{args.out_dir}/result.json", "w") as fh:
        json.dump(result, fh, indent=2)

    banner = "=" * 70
    print()
    print(banner)
    print(f" {args.model} — AIME TTS SCALING")
    for k in NS:
        print(f" pass@{k:<3}: {scaling[k]:>3}/{total} ({100*scaling[k]/total:.1f}%)")
    print(f" Time: {time.time()-T0:.0f}s")
    print(banner)
if __name__ == "__main__":
main()

View file

@ -0,0 +1,190 @@
"""Train Qwen3-8B-Base with 40-pair recipe, eval on BigCodeBench-Hard.
BigCodeBench is harder than HumanEval (real-world Python tasks, library use).
Qwen3-8B-Base likely has headroom there (~30-45% baseline). Tests if recipe
generalizes to newer model AND harder benchmark.
"""
import os, json, time, re, subprocess, tempfile, argparse
os.environ.setdefault("HF_HOME", "/workspace/hf")
os.environ.setdefault("HF_HUB_ENABLE_HF_TRANSFER", "1")
os.environ["TRANSFORMERS_VERBOSITY"] = "error"
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, Trainer
from datasets import load_dataset, Dataset as HFDataset
from peft import LoraConfig, get_peft_model
# Wall-clock reference taken at import time; log lines are stamped relative to it.
T0 = time.time()


def log(m):
    """Print ``m`` prefixed with the seconds elapsed since ``T0`` and flush stdout."""
    elapsed = time.time() - T0
    print(f"[{elapsed:7.1f}s] {m}", flush=True)
def extract_code(text):
    """Return the contents of the first fenced code block in ``text``.

    Everything up to and including the first ```python fence (or, failing
    that, the first bare ``` fence) is dropped, then everything from the next
    ``` onwards is dropped. Text without any fence is returned as-is. The
    result is stripped of surrounding whitespace.
    """
    for fence in ("```python", "```"):
        _, found, tail = text.partition(fence)
        if found:
            text = tail
            break
    body, _, _ = text.partition("```")
    return body.strip()
def verify_bcb(code, test_code):
runner = "\n\nif __name__ == '__main__':\n import unittest; unittest.main(argv=['x'], exit=False, verbosity=0)\n"
body = code + "\n\n" + test_code + runner
with tempfile.NamedTemporaryFile("w", suffix=".py", delete=False) as f:
f.write(body); path = f.name
try:
r = subprocess.run(["python3", path], capture_output=True, timeout=20, text=True, cwd="/tmp")
out = (r.stdout or "") + "\n" + (r.stderr or "")
if "OK" in out and "FAILED" not in out and "Error" not in out and r.returncode == 0:
return True
return False
except subprocess.TimeoutExpired:
return False
finally:
try: os.unlink(path)
except: pass
def gen_batch(model, tok, prompts, max_new=600, temperature=0.0, batch=4):
    """Generate one completion per prompt, ``batch`` prompts at a time.

    Each prompt becomes the user turn of a chat with a fixed coding system
    message, rendered through ``tok.apply_chat_template``. Decoding is greedy
    when ``temperature`` is 0 and sampled (top-p 0.95) otherwise.

    Returns:
        Decoded completions in the same order as ``prompts``, with the prompt
        tokens sliced off and special tokens skipped.
    """
    system_text = "You are an expert Python coder. Output one ```python block with the complete solution."
    sampling = temperature > 0
    completions = []
    for start in range(0, len(prompts), batch):
        rendered = [
            tok.apply_chat_template(
                [{"role": "system", "content": system_text},
                 {"role": "user", "content": prompt}],
                tokenize=False, add_generation_prompt=True)
            for prompt in prompts[start:start + batch]
        ]
        enc = tok(rendered, return_tensors="pt", padding=True, truncation=True, max_length=2000).to(model.device)
        with torch.no_grad():
            generated = model.generate(
                **enc,
                max_new_tokens=max_new,
                do_sample=sampling,
                temperature=temperature if sampling else 1.0,
                top_p=0.95,
                pad_token_id=tok.eos_token_id,
            )
        # Everything before this column is the (padded) prompt.
        prompt_width = enc.input_ids.shape[1]
        for row in range(generated.size(0)):
            completions.append(tok.decode(generated[row][prompt_width:], skip_special_tokens=True))
    return completions
def eval_bcb_hard(model, tok, label, max_n=148):
    """Evaluate ``model`` on the first ``max_n`` BigCodeBench-Hard tasks.

    Completions come from ``gen_batch`` on each task's ``instruct_prompt``;
    a task passes when ``verify_bcb`` accepts the extracted code against the
    task's ``test`` field. Progress is logged every 20 tasks under ``label``.

    Returns:
        ``(number_passed, number_of_tasks)``.
    """
    tasks = list(load_dataset("bigcode/bigcodebench-hard", split="v0.1.4"))[:max_n]
    total = len(tasks)
    log(f" BCB-Hard [{label}] ({total})")
    completions = gen_batch(model, tok, [task["instruct_prompt"] for task in tasks], max_new=700, batch=4)
    passed = 0
    for done, (task, raw) in enumerate(zip(tasks, completions), start=1):
        # Only strip fences when the completion actually contains one.
        candidate = extract_code(raw) if "```" in raw else raw
        if verify_bcb(candidate, task["test"]):
            passed += 1
        if done % 20 == 0:
            log(f" {label} BCB {done}/{total}: {passed}")
    return passed, total
def eval_humaneval(model, tok, label):
    """Evaluate ``model`` on the HumanEval test split.

    Each task prompt (plus a "complete the function" hint) is sent through
    ``gen_batch``. The completion is combined with the task's ``test`` code
    and a ``check(<entry_point>)`` call, written to a temp file and run with
    ``python3`` (10 second timeout). A task passes when the process exits
    with status 0. Progress is logged every 40 tasks under ``label``.

    Returns:
        ``(number_passed, number_of_tasks)``.
    """
    he = list(load_dataset("openai_humaneval", split="test"))
    log(f" HumanEval [{label}] ({len(he)})")
    prompts = [p["prompt"] + "\n# Complete the function above." for p in he]
    outs = gen_batch(model, tok, prompts, max_new=400, batch=4)
    correct = 0
    for i, (p, raw) in enumerate(zip(he, outs), start=1):
        code = extract_code(raw) if "```" in raw else raw
        # A completion without any "def " is presumably only a function body,
        # so the original prompt (signature + docstring) is prepended.
        program = code if "def " in code else p["prompt"] + "\n" + code
        test_code = program + "\n\n" + p["test"] + f"\n\ncheck({p['entry_point']})"
        with tempfile.NamedTemporaryFile("w", suffix=".py", delete=False) as f:
            f.write(test_code)
            path = f.name
        try:
            r = subprocess.run(["python3", path], capture_output=True, timeout=10, text=True, cwd="/tmp")
            if r.returncode == 0:
                correct += 1
        except subprocess.TimeoutExpired:
            # A timed-out task simply counts as a failure.
            pass
        finally:
            # Best-effort cleanup: only filesystem errors are ignored.
            try:
                os.unlink(path)
            except OSError:
                pass
        if i % 40 == 0:
            log(f" {label} HE {i}/{len(he)}: {correct}")
    return correct, len(he)
def make_example(r, tok):
    """Turn one mined pair ``r`` into a fixed-length supervised training example.

    ``r`` must provide ``signature``, ``tests`` (a list of strings),
    ``broken`` and ``fixed``; ``error`` is optional. The user turn shows the
    broken attempt and its error, the assistant turn is the fixed code in a
    ```python block.

    Returns:
        A dict with ``input_ids``, ``attention_mask`` and ``labels``, each of
        length 1024. Prompt and padding positions carry label -100, so only
        the assistant tokens have real labels.
    """
    tests_text = "\n".join(r["tests"])
    user = (
        f"Implement: {r['signature']}\n\n"
        f"Tests:\n{tests_text}\n\n"
        f"My attempt:\n```python\n{r['broken']}\n```\n\n"
        f"Error:\n{r.get('error','')}\n\n"
        "Fix and output the corrected code only."
    )
    prompt_msgs = [
        {"role": "system", "content": "You are an expert Python coder. Output one ```python block with the complete solution."},
        {"role": "user", "content": user},
    ]
    answer_msg = {"role": "assistant", "content": f"```python\n{r['fixed']}\n```"}

    prompt_text = tok.apply_chat_template(prompt_msgs, tokenize=False, add_generation_prompt=True)
    full_text = tok.apply_chat_template(prompt_msgs + [answer_msg], tokenize=False)
    prompt_ids = tok(prompt_text, add_special_tokens=False)["input_ids"]

    MAX = 1024
    token_ids = tok(full_text, add_special_tokens=False)["input_ids"][:MAX]

    # Mask the prompt prefix (capped at the possibly truncated sequence length).
    masked = min(len(prompt_ids), len(token_ids))
    labels = [-100] * masked + list(token_ids[masked:])

    n_pad = MAX - len(token_ids)
    return {
        "input_ids": token_ids + [tok.pad_token_id] * n_pad,
        "attention_mask": [1] * len(token_ids) + [0] * n_pad,
        "labels": labels + [-100] * n_pad,
    }
def main():
    """Evaluate a base model, LoRA-train it on mined pairs, then re-evaluate.

    CLI arguments: ``--model`` (required), ``--pairs`` (JSONL file of mined
    pairs), ``--n_pairs`` (how many pairs to use, default 40) and ``--tag``
    (required; results go to ``/workspace/bcb_eval/<tag>/``).

    Steps: load the model in bfloat16 on ``cuda:0``; run HumanEval and
    BigCodeBench-Hard on the base model; train a rank-16 LoRA on the
    attention projections for 2 epochs; rerun both evals; write
    ``result.json`` and print a summary.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--model", required=True)
    ap.add_argument("--pairs", default="/workspace/saved_pairs/pairs_40.jsonl")
    ap.add_argument("--n_pairs", type=int, default=40)
    ap.add_argument("--tag", required=True)
    args = ap.parse_args()
    out_dir = f"/workspace/bcb_eval/{args.tag}"
    os.makedirs(out_dir, exist_ok=True)

    log(f"loading {args.model}")
    tok = AutoTokenizer.from_pretrained(args.model)
    if tok.pad_token is None:
        tok.pad_token = tok.eos_token
    # Left padding for generation: gen_batch slices completions off at the
    # padded prompt width.
    tok.padding_side = "left"
    model = AutoModelForCausalLM.from_pretrained(args.model, torch_dtype=torch.bfloat16, device_map="cuda:0")
    log(f" loaded mem={torch.cuda.memory_allocated('cuda:0')/1e9:.1f}GB")
    model.eval()

    log("=== BASE evals ===")
    # Keep the task counts the eval functions actually report instead of
    # assuming 164 / 148, so the denominators stay right if a dataset or
    # max_n changes.
    base_he, he_total = eval_humaneval(model, tok, "BASE")
    base_bcb, bcb_total = eval_bcb_hard(model, tok, "BASE")
    log(f" BASE: HumanEval={base_he}/{he_total} BCB-Hard={base_bcb}/{bcb_total}")

    # Read the pairs inside a context manager so the file handle is closed;
    # blank lines are skipped rather than fed to json.loads.
    with open(args.pairs, encoding="utf-8") as pairs_file:
        pairs = [json.loads(line) for line in pairs_file if line.strip()][:args.n_pairs]
    log(f"=== TRAINING — {len(pairs)} pairs ===")
    lora_cfg = LoraConfig(r=16, lora_alpha=32, lora_dropout=0.05, bias="none",
                          target_modules=["q_proj", "k_proj", "v_proj", "o_proj"], task_type="CAUSAL_LM")
    model = get_peft_model(model, lora_cfg)
    # Training examples are padded on the right by make_example.
    tok.padding_side = "right"
    ds = HFDataset.from_list([make_example(r, tok) for r in pairs])
    targs = TrainingArguments(
        output_dir=f"{out_dir}/ckpt", num_train_epochs=2,
        per_device_train_batch_size=1, gradient_accumulation_steps=4,
        learning_rate=1e-4, bf16=True, logging_steps=10,
        save_strategy="no", report_to="none", remove_unused_columns=False, warmup_ratio=0.05,
    )
    Trainer(model=model, args=targs, train_dataset=ds, processing_class=tok).train()
    log(" training done")

    # Back to generation settings for the post-training evals.
    tok.padding_side = "left"
    model.eval()
    log("=== TRAINED evals ===")
    tr_he, _ = eval_humaneval(model, tok, "TRAINED")
    tr_bcb, _ = eval_bcb_hard(model, tok, "TRAINED")

    result = {
        "model": args.model, "method": "warmup 40 pairs",
        "humaneval": {"base": base_he, "trained": tr_he, "delta": tr_he-base_he, "n": he_total},
        "bcb_hard": {"base": base_bcb, "trained": tr_bcb, "delta": tr_bcb-base_bcb, "n": bcb_total},
        "elapsed_s": time.time() - T0,
    }
    with open(f"{out_dir}/result.json", "w") as fh:
        json.dump(result, fh, indent=2)

    print()
    print("=" * 70)
    print(f" {args.model}")
    print(f" HumanEval: base={base_he}/{he_total} trained={tr_he}/{he_total} Δ={tr_he-base_he:+d}")
    print(f" BCB-Hard: base={base_bcb}/{bcb_total} trained={tr_bcb}/{bcb_total} Δ={tr_bcb-base_bcb:+d}")
    print(f" Time: {time.time()-T0:.0f}s")
    print("=" * 70)
if __name__ == "__main__":
main()

View file

@ -0,0 +1,222 @@
"""Cross-domain transfer: train recipe on CODE, eval on MATH (no math training).
Tests if self-bootstrap teaches generic reasoning vs domain-specific patterns."""
import os, json, time, re, subprocess, tempfile, argparse, gc, random
os.environ.setdefault("HF_HOME", "/workspace/hf")
os.environ["TRANSFORMERS_VERBOSITY"] = "error"
import torch
from datasets import load_dataset
T0 = time.time()


def log(m):
    """Print ``m`` to stdout, prefixed with the seconds elapsed since ``T0``."""
    elapsed = time.time() - T0
    print(f"[{elapsed:7.1f}s] {m}", flush=True)
def run_python(code, timeout=10):
    """Execute ``code`` in a fresh ``python3`` subprocess and report success.

    Args:
        code: Python source text to run.
        timeout: Seconds to wait before giving up on the subprocess.

    Returns:
        True if the interpreter exited with status 0, False if it exited
        non-zero or the timeout expired.
    """
    # Python source files default to UTF-8, so write the script as UTF-8
    # regardless of the harness locale.
    with tempfile.NamedTemporaryFile("w", suffix=".py", delete=False, encoding="utf-8") as f:
        f.write(code)
        path = f.name
    try:
        # errors="replace": arbitrary bytes printed by the generated program must
        # not raise UnicodeDecodeError in the harness; only the exit code matters.
        r = subprocess.run(["python3", path], capture_output=True, timeout=timeout,
                           text=True, errors="replace", cwd="/tmp")
        return r.returncode == 0
    except subprocess.TimeoutExpired:
        return False
    finally:
        # Best-effort cleanup: ignore filesystem errors only, so that
        # KeyboardInterrupt / SystemExit are no longer swallowed here.
        try:
            os.unlink(path)
        except OSError:
            pass
def extract_boxed(text):
    """Return the stripped contents of the last ``\\boxed{...}`` in ``text``.

    Nested braces inside the box are balanced. Returns None when there is no
    ``\\boxed{`` or when its closing brace is never found.
    """
    marker = "\\boxed{"
    pos = text.rfind(marker)
    if pos < 0:
        return None
    begin = pos + len(marker)
    open_braces = 1
    for offset, ch in enumerate(text[begin:]):
        if ch == "{":
            open_braces += 1
        elif ch == "}":
            open_braces -= 1
            if open_braces == 0:
                # begin + offset is the index of the matching closing brace.
                return text[begin:begin + offset].strip()
    return None
def main():
    """Run the cross-domain experiment: base eval, mine code pairs, train, re-eval.

    Steps, in order:
      1. Parse CLI args and load the model into vLLM.
      2. Score the base model (greedy) on the first 80 HumanEval problems and
         the first 100 MATH-500 problems.
      3. Mine (broken, fixed) code pairs from a shuffled slice of MBPP train:
         a failing greedy attempt is paired with the first of 8 sampled
         attempts that passes the problem's tests.
      4. Drop the vLLM engine, train a LoRA adapter on the pairs, save it.
      5. Re-create vLLM with LoRA enabled and re-score both benchmarks.
      6. Write ``result.json`` under ``--out_dir`` and print a summary banner.

    If fewer than 5 pairs are mined, only the base scores are written and the
    function returns before training.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--model", required=True)
    # NOTE(review): only the code-mining path exists below; this value is just
    # echoed into result.json and the summary banner.
    ap.add_argument("--train_domain", choices=["code", "math"], default="code")
    # Required, but not read anywhere else in this function.
    ap.add_argument("--tag", required=True)
    ap.add_argument("--out_dir", required=True)
    args = ap.parse_args()
    os.makedirs(args.out_dir, exist_ok=True)
    random.seed(42)
    # Imported here rather than at module top.
    from vllm import LLM, SamplingParams
    from transformers import AutoTokenizer
    log(f"loading {args.model}")
    tok = AutoTokenizer.from_pretrained(args.model)
    # The pad token is needed later by mk_ex() when padding training examples.
    if tok.pad_token is None: tok.pad_token = tok.eos_token
    llm = LLM(model=args.model, dtype="bfloat16", gpu_memory_utilization=0.85, max_model_len=2048)
    log("loaded")
    # Eval sets
    he = list(load_dataset("openai_humaneval", split="test"))[:80]
    math500 = list(load_dataset("HuggingFaceH4/MATH-500", split="test"))[:100]
    # Build prompts
    he_prompts = [p["prompt"] for p in he]
    math_prompts = []
    for p in math500:
        try:
            msgs = [{"role": "system", "content": "Math solver. End with \\boxed{answer}."},
                    {"role": "user", "content": f"Solve. Problem: {p['problem']}\n\nSolution:"}]
            math_prompts.append(tok.apply_chat_template(msgs, tokenize=False, add_generation_prompt=True))
        except Exception:
            # Fall back to the bare user prompt if the chat template cannot be applied.
            math_prompts.append(f"Solve. Problem: {p['problem']}\n\nSolution:")
    import sympy
    from sympy.parsing.latex import parse_latex
    def sympy_eq(a, b):
        """Return True if answers ``a`` and ``b`` match as text, symbolically, or as floats."""
        if a is None or b is None: return False
        if a.strip() == b.strip(): return True
        # Symbolic comparison; any parse/simplify failure falls through.
        try:
            if sympy.simplify(parse_latex(a) - parse_latex(b)) == 0: return True
        except Exception: pass
        # Plain-float comparison with an absolute tolerance of 1e-6.
        try:
            if abs(float(a) - float(b)) < 1e-6: return True
        except Exception: pass
        return False
    def eval_he(llm, lora_req=None):
        """Greedy-decode the HumanEval subset and return ``(n_passed, n_total)``."""
        sp = SamplingParams(temperature=0, max_tokens=400, stop=["\nclass ", "\nif __name__", "\n\nprint"])
        outs = llm.generate(he_prompts, sp, lora_request=lora_req, use_tqdm=False) if lora_req else \
            llm.generate(he_prompts, sp, use_tqdm=False)
        outs = [o.outputs[0].text for o in outs]
        c = 0
        for p, raw in zip(he, outs):
            # Prompt and completion are joined with an extra newline, then the
            # dataset's test code and a check(<entry_point>) call are appended.
            full = p["prompt"] + "\n" + raw
            test_code = full + "\n\n" + p["test"] + f"\n\ncheck({p['entry_point']})"
            if run_python(test_code, 10): c += 1
        return c, len(he)
    def eval_math(llm, lora_req=None):
        """Greedy-decode the MATH-500 subset and return ``(n_correct, n_total)``."""
        sp = SamplingParams(temperature=0, max_tokens=800)
        outs = llm.generate(math_prompts, sp, lora_request=lora_req, use_tqdm=False) if lora_req else \
            llm.generate(math_prompts, sp, use_tqdm=False)
        outs = [o.outputs[0].text for o in outs]
        c = 0
        for p, raw in zip(math500, outs):
            if sympy_eq(extract_boxed(raw), p["answer"]): c += 1
        return c, len(math500)
    log("=== BASE evals ===")
    base_he = eval_he(llm)
    base_math = eval_math(llm)
    log(f" base HE: {base_he[0]}/{base_he[1]} MATH: {base_math[0]}/{base_math[1]}")
    # Mine code pairs
    log("mining code pairs...")
    mbpp_full = list(load_dataset("mbpp", split="train"))
    random.shuffle(mbpp_full)
    seeds = []
    for p in mbpp_full[:200]:
        # Accept either a "prompt" or a "text" field as the task description.
        prompt_text = p.get("prompt") or p.get("text", "")
        if prompt_text and p.get("test_list"):
            seeds.append({"prompt": prompt_text, "test_list": p["test_list"]})
    def mbpp_prompt(p): return f"# Task: {p['prompt']}\n# Tests:\n# " + "\n# ".join(p["test_list"]) + "\n\n"
    sp = SamplingParams(temperature=0, max_tokens=400, stop=["\nclass Test", "\nif __name__"])
    g_outs = [o.outputs[0].text for o in llm.generate([mbpp_prompt(p) for p in seeds], sp, use_tqdm=False)]
    # "Hard" seeds are those whose greedy completion fails the problem's asserts.
    hard_idx = []
    for i, (p, raw) in enumerate(zip(seeds, g_outs)):
        if not run_python(raw + "\n\n" + "\n".join(p["test_list"]), 8):
            hard_idx.append(i)
    log(f" greedy: {len(seeds)-len(hard_idx)} pass, {len(hard_idx)} hard")
    pairs = []
    if hard_idx:
        sp2 = SamplingParams(temperature=0.8, top_p=0.95, max_tokens=400, n=8,
                             stop=["\nclass Test", "\nif __name__"])
        hard_prompts = [mbpp_prompt(seeds[i]) for i in hard_idx]
        sample_outs = llm.generate(hard_prompts, sp2, use_tqdm=False)
        for j, i in enumerate(hard_idx):
            attempts = [o.text for o in sample_outs[j].outputs]
            for a in attempts:
                if run_python(a + "\n\n" + "\n".join(seeds[i]["test_list"]), 8):
                    # Keep only the first passing sample for each hard seed.
                    pairs.append({"problem": seeds[i]["prompt"], "tests": seeds[i]["test_list"],
                                  "broken": g_outs[i].strip(), "fixed": a.strip()})
                    break
    log(f" mined {len(pairs)} code pairs")
    if len(pairs) < 5:
        log("too few pairs, skipping train")
        # Early-exit result carries only the base scores, in a flatter schema
        # than the full result written at the end.
        result = {"model": args.model, "n_pairs": len(pairs),
                  "base_he": base_he[0], "base_math": base_math[0]}
        with open(f"{args.out_dir}/result.json", "w") as fh: json.dump(result, fh, indent=2)
        return
    # Tear down vLLM, train LoRA
    del llm; gc.collect(); torch.cuda.empty_cache()
    from transformers import AutoModelForCausalLM, TrainingArguments, Trainer
    from datasets import Dataset as HFDataset
    from peft import LoraConfig, get_peft_model
    def mk_ex(r):
        """Tokenize one pair into fixed-length input_ids / attention_mask / labels."""
        user = (f"# Task: {r['problem']}\n# Tests:\n# " + "\n# ".join(r['tests']) + "\n"
                f"# My broken attempt:\n{r['broken']}\n# Corrected:\n")
        full = user + r["fixed"]
        full_ids = tok(full, add_special_tokens=False)["input_ids"]
        user_ids = tok(user, add_special_tokens=False)["input_ids"]
        MAX = 1024
        full_ids = full_ids[:MAX]
        # Mask the prompt portion with -100 so only the "fixed" tokens carry labels.
        labels = list(full_ids); n_user = min(len(user_ids), len(labels))
        for i in range(n_user): labels[i] = -100
        # Right-pad every example to exactly MAX tokens; padding is also label-masked.
        pad = MAX - len(full_ids)
        return {"input_ids": full_ids + [tok.pad_token_id]*pad,
                "attention_mask": [1]*len(full_ids) + [0]*pad,
                "labels": labels + [-100]*pad}
    log("training LoRA on code pairs...")
    model = AutoModelForCausalLM.from_pretrained(args.model, torch_dtype=torch.bfloat16, device_map="cuda:0")
    lora_cfg = LoraConfig(r=16, lora_alpha=32, lora_dropout=0.05, bias="none",
                          target_modules=["q_proj", "k_proj", "v_proj", "o_proj"], task_type="CAUSAL_LM")
    model = get_peft_model(model, lora_cfg)
    ds_train = HFDataset.from_list([mk_ex(r) for r in pairs])
    targs = TrainingArguments(
        output_dir=f"{args.out_dir}/ckpt", num_train_epochs=2,
        per_device_train_batch_size=1, gradient_accumulation_steps=4,
        learning_rate=1e-4, bf16=True, logging_steps=20,
        save_strategy="no", report_to="none", remove_unused_columns=False, warmup_ratio=0.05,
    )
    # NOTE(review): passes the tokenizer as `tokenizer=`; confirm this keyword is
    # accepted by the installed transformers version.
    Trainer(model=model, args=targs, train_dataset=ds_train, tokenizer=tok).train()
    adapter_dir = f"{args.out_dir}/adapter"
    model.save_pretrained(adapter_dir)
    del model; gc.collect(); torch.cuda.empty_cache()
    log("training done")
    # Re-eval with adapter
    log("=== TRAINED evals ===")
    from vllm import LLM as LLM2
    from vllm.lora.request import LoRARequest
    # max_lora_rank matches the r=16 used in LoraConfig above.
    llm = LLM2(model=args.model, dtype="bfloat16", gpu_memory_utilization=0.85, max_model_len=2048,
               enable_lora=True, max_lora_rank=16)
    lora_req = LoRARequest("trained", 1, adapter_dir)
    tr_he = eval_he(llm, lora_req)
    tr_math = eval_math(llm, lora_req)
    log(f" trained HE: {tr_he[0]}/{tr_he[1]} MATH: {tr_math[0]}/{tr_math[1]}")
    result = {
        "model": args.model, "train_domain": args.train_domain,
        "n_pairs": len(pairs),
        "humaneval": {"base": base_he[0], "trained": tr_he[0], "delta": tr_he[0]-base_he[0], "n": base_he[1]},
        "math500": {"base": base_math[0], "trained": tr_math[0], "delta": tr_math[0]-base_math[0], "n": base_math[1]},
        "elapsed_s": time.time() - T0,
    }
    with open(f"{args.out_dir}/result.json", "w") as fh: json.dump(result, fh, indent=2)
    print()
    print("=" * 70)
    print(f" {args.model} — CROSS-DOMAIN ({args.train_domain} train, eval HE+MATH)")
    print(f" HE: base={base_he[0]}/{base_he[1]} trained={tr_he[0]}/{tr_he[1]} Δ={tr_he[0]-base_he[0]:+d}")
    print(f" MATH: base={base_math[0]}/{base_math[1]} trained={tr_math[0]}/{tr_math[1]} Δ={tr_math[0]-base_math[0]:+d}")
    print(f" Time: {time.time()-T0:.0f}s")
    print("=" * 70)
if __name__ == "__main__":
main()

View file

@ -0,0 +1,180 @@
"""Diversity-aware mining: prompt model with multiple cognitive lenses, mine pairs WITHOUT including failed code.
Train on (problem, best_approach_summary, working_code) minimal traces."""
import os, json, time, re, subprocess, tempfile, argparse, gc, random
os.environ.setdefault("HF_HOME", "/workspace/hf")
os.environ["TRANSFORMERS_VERBOSITY"] = "error"
import torch
from datasets import load_dataset
T0 = time.time()


def log(m):
    """Emit ``m`` on stdout with an elapsed-seconds prefix measured from ``T0``."""
    stamp = time.time() - T0
    print(f"[{stamp:7.1f}s] {m}", flush=True)
def run_python(code, timeout=10):
    """Run ``code`` as a script under ``python3`` and return whether it exited 0.

    Args:
        code: Python source text to execute.
        timeout: Maximum seconds to wait for the subprocess.

    Returns:
        True on exit status 0; False on non-zero exit or timeout.
    """
    # Write as UTF-8 (the default source encoding) independent of the locale.
    with tempfile.NamedTemporaryFile("w", suffix=".py", delete=False, encoding="utf-8") as f:
        f.write(code)
        path = f.name
    try:
        # errors="replace" keeps undecodable child output from raising
        # UnicodeDecodeError here; the captured text itself is not used.
        r = subprocess.run(["python3", path], capture_output=True, timeout=timeout,
                           text=True, errors="replace", cwd="/tmp")
        return r.returncode == 0
    except subprocess.TimeoutExpired:
        return False
    finally:
        # Cleanup is best-effort, but only filesystem errors are ignored so
        # that KeyboardInterrupt / SystemExit still propagate.
        try:
            os.unlink(path)
        except OSError:
            pass
# (lens name, hint comment) pairs. main() runs one mining pass per entry and
# inserts both strings into the prompt prefix ahead of the generated code.
LENS_PROMPTS = [
    ("brute force iteration", "# Loop and check each case."),
    ("math formula", "# Use a closed-form formula."),
    ("hash map/set", "# Use a hashmap/set for O(1) lookup."),
    ("recursion", "# Solve recursively."),
]
def mbpp_prompt(p):
    """Render an MBPP-style record as a comment header listing the task and its tests."""
    header = f"# Task: {p['prompt']}\n# Tests:\n# "
    commented_tests = "\n# ".join(p["test_list"])
    return header + commented_tests + "\n\n"
def he_prompt(p):
    """Return the ``"prompt"`` field of a HumanEval record unchanged."""
    prompt_text = p["prompt"]
    return prompt_text
def main():
    """Run diversity-cued mining end to end: base eval, lens mining, train, re-eval.

    Steps, in order:
      1. Parse CLI args and load the model into vLLM.
      2. Score the base model (greedy) on HumanEval and the first 100
         MBPP-sanitized test problems.
      3. For each entry in LENS_PROMPTS, sample one lens-cued completion per
         seed problem; when it passes the tests and a greedy attempt fails,
         record a (broken, fixed) pair tagged with the lens.
      4. Write ``pairs.jsonl``; stop with a base-only ``result.json`` if fewer
         than 5 pairs were mined.
      5. Drop vLLM, train a LoRA adapter, save it, reload vLLM with the adapter
         and re-score both benchmarks.
      6. Write ``result.json`` under ``--out_dir`` and print a summary banner.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--model", required=True)
    ap.add_argument("--n_mining", type=int, default=150)
    # Required, but not read anywhere else in this function.
    ap.add_argument("--tag", required=True)
    ap.add_argument("--out_dir", required=True)
    args = ap.parse_args()
    os.makedirs(args.out_dir, exist_ok=True)
    random.seed(42)
    from vllm import LLM, SamplingParams
    from transformers import AutoTokenizer
    log(f"loading {args.model}")
    tok = AutoTokenizer.from_pretrained(args.model)
    # The pad token is needed later by mk_ex() when padding training examples.
    if tok.pad_token is None: tok.pad_token = tok.eos_token
    llm = LLM(model=args.model, dtype="bfloat16", gpu_memory_utilization=0.85, max_model_len=2048)
    log("loaded")
    he = list(load_dataset("openai_humaneval", split="test"))
    mbpp_test = list(load_dataset("mbpp", "sanitized", split="test"))[:100]
    # Mining seeds come from the default-config train split, not the sanitized one.
    mbpp_full = list(load_dataset("mbpp", split="train"))
    random.shuffle(mbpp_full)
    seeds = []
    for p in mbpp_full[:args.n_mining]:
        # Accept either a "prompt" or a "text" field as the task description.
        prompt_text = p.get("prompt") or p.get("text", "")
        if prompt_text and p.get("test_list"):
            seeds.append({"prompt": prompt_text, "test_list": p["test_list"]})
    log(f" HE: {len(he)}, MBPP-test: {len(mbpp_test)}, mining: {len(seeds)}")
    # Base eval
    sp_g = SamplingParams(temperature=0, max_tokens=400, stop=["\nclass ", "\nif __name__", "\n\nprint"])
    he_outs = [o.outputs[0].text for o in llm.generate([he_prompt(p) for p in he], sp_g, use_tqdm=False)]
    base_he = sum(1 for p, raw in zip(he, he_outs)
                  if run_python(p["prompt"] + "\n" + raw + "\n\n" + p["test"] + f"\n\ncheck({p['entry_point']})", 10))
    mbpp_outs = [o.outputs[0].text for o in llm.generate([mbpp_prompt(p) for p in mbpp_test], sp_g, use_tqdm=False)]
    base_mbpp = sum(1 for p, raw in zip(mbpp_test, mbpp_outs)
                    if run_python(raw + "\n\n" + "\n".join(p["test_list"]), 10))
    log(f"BASE: HE={base_he}/{len(he)} MBPP={base_mbpp}/{len(mbpp_test)}")
    # Mine: for each problem, generate 4 lens-cued attempts, keep one that works
    # NOTE(review): as written, every lens that yields a passing attempt appends
    # its own pair, so one seed can contribute up to len(LENS_PROMPTS) pairs.
    log("mining with cued diversity...")
    pairs = []
    for lens_name, lens_hint in LENS_PROMPTS:
        log(f" lens: {lens_name}")
        # Prefill prompts with lens hint
        prefilled = []
        for s in seeds:
            # NOTE(review): the completion is forced to start with `def solution`;
            # whether the MBPP asserts call a function of that name is not visible
            # here — verify, since it directly affects yield.
            base = mbpp_prompt(s) + f"# Approach: {lens_name}.\n{lens_hint}\ndef solution"
            prefilled.append(base)
        sp = SamplingParams(temperature=0.7, top_p=0.95, max_tokens=300,
                            stop=["\nclass Test", "\nif __name__", "\n\nprint", "\n# Task"])
        outs = [o.outputs[0].text for o in llm.generate(prefilled, sp, use_tqdm=False)]
        # Verify each
        for s, raw in zip(seeds, outs):
            # Re-attach the prefilled `def solution` so the script is complete.
            code = "def solution" + raw
            if run_python(code + "\n\n" + "\n".join(s["test_list"]), 8):
                # Greedy attempt to use as broken
                # (generated one prompt at a time, again for each passing lens attempt)
                greedy = [o.outputs[0].text for o in llm.generate([mbpp_prompt(s)], sp_g, use_tqdm=False)][0]
                # Only keep the pair when the plain greedy attempt actually fails.
                if not run_python(greedy + "\n\n" + "\n".join(s["test_list"]), 8):
                    pairs.append({"problem": s["prompt"], "tests": s["test_list"],
                                  "broken": greedy.strip(), "fixed": code.strip(),
                                  "lens": lens_name})
    log(f"mined {len(pairs)} pairs across lenses")
    with open(f"{args.out_dir}/pairs.jsonl", "w") as fh:
        for r in pairs: fh.write(json.dumps(r) + "\n")
    if len(pairs) < 5:
        # Too few pairs to train: record base scores only (flat schema) and stop.
        result = {"model": args.model, "n_pairs": len(pairs), "base_he": base_he, "base_mbpp": base_mbpp}
        with open(f"{args.out_dir}/result.json", "w") as fh: json.dump(result, fh, indent=2)
        return
    # Train flat
    del llm; gc.collect(); torch.cuda.empty_cache()
    from transformers import AutoModelForCausalLM, TrainingArguments, Trainer
    from datasets import Dataset as HFDataset
    from peft import LoraConfig, get_peft_model
    def mk_ex(r):
        """Tokenize one pair into fixed-length input_ids / attention_mask / labels."""
        # NOTE(review): the module docstring says failed code is not included,
        # but the broken attempt is part of this prompt — confirm which is intended.
        user = (f"# Task: {r['problem']}\n# Tests:\n# " + "\n# ".join(r['tests']) + "\n"
                f"# My broken attempt:\n{r['broken']}\n# Corrected:\n")
        full = user + r["fixed"]
        full_ids = tok(full, add_special_tokens=False)["input_ids"]
        user_ids = tok(user, add_special_tokens=False)["input_ids"]
        MAX = 1024
        full_ids = full_ids[:MAX]
        # Mask the prompt portion with -100 so only the "fixed" tokens carry labels.
        labels = list(full_ids); n_user = min(len(user_ids), len(labels))
        for i in range(n_user): labels[i] = -100
        # Right-pad every example to exactly MAX tokens; padding is also label-masked.
        pad = MAX - len(full_ids)
        return {"input_ids": full_ids + [tok.pad_token_id]*pad,
                "attention_mask": [1]*len(full_ids) + [0]*pad,
                "labels": labels + [-100]*pad}
    log("training...")
    model = AutoModelForCausalLM.from_pretrained(args.model, torch_dtype=torch.bfloat16, device_map="cuda:0")
    lora_cfg = LoraConfig(r=16, lora_alpha=32, lora_dropout=0.05, bias="none",
                          target_modules=["q_proj", "k_proj", "v_proj", "o_proj"], task_type="CAUSAL_LM")
    model = get_peft_model(model, lora_cfg)
    ds_train = HFDataset.from_list([mk_ex(r) for r in pairs])
    targs = TrainingArguments(
        output_dir=f"{args.out_dir}/ckpt", num_train_epochs=2,
        per_device_train_batch_size=1, gradient_accumulation_steps=4,
        learning_rate=1e-4, bf16=True, logging_steps=20,
        save_strategy="no", report_to="none", remove_unused_columns=False, warmup_ratio=0.05,
    )
    # NOTE(review): passes the tokenizer as `tokenizer=`; confirm this keyword is
    # accepted by the installed transformers version.
    Trainer(model=model, args=targs, train_dataset=ds_train, tokenizer=tok).train()
    adapter_dir = f"{args.out_dir}/adapter"
    model.save_pretrained(adapter_dir)
    del model; gc.collect(); torch.cuda.empty_cache()
    # Trained eval
    from vllm import LLM as LLM2
    from vllm.lora.request import LoRARequest
    # max_lora_rank matches the r=16 used in LoraConfig above.
    llm = LLM2(model=args.model, dtype="bfloat16", gpu_memory_utilization=0.85, max_model_len=2048,
               enable_lora=True, max_lora_rank=16)
    lora_req = LoRARequest("trained", 1, adapter_dir)
    # Same prompts, sampling params and scoring as the base eval, with the adapter applied.
    he_outs = [o.outputs[0].text for o in llm.generate([he_prompt(p) for p in he], sp_g, lora_request=lora_req, use_tqdm=False)]
    tr_he = sum(1 for p, raw in zip(he, he_outs)
                if run_python(p["prompt"] + "\n" + raw + "\n\n" + p["test"] + f"\n\ncheck({p['entry_point']})", 10))
    mbpp_outs = [o.outputs[0].text for o in llm.generate([mbpp_prompt(p) for p in mbpp_test], sp_g, lora_request=lora_req, use_tqdm=False)]
    tr_mbpp = sum(1 for p, raw in zip(mbpp_test, mbpp_outs)
                  if run_python(raw + "\n\n" + "\n".join(p["test_list"]), 10))
    result = {
        "model": args.model, "n_pairs": len(pairs),
        "humaneval": {"base": base_he, "trained": tr_he, "delta": tr_he-base_he, "n": len(he)},
        "mbpp": {"base": base_mbpp, "trained": tr_mbpp, "delta": tr_mbpp-base_mbpp, "n": len(mbpp_test)},
        "elapsed_s": time.time() - T0,
    }
    with open(f"{args.out_dir}/result.json", "w") as fh: json.dump(result, fh, indent=2)
    print()
    print("=" * 70)
    print(f" {args.model} — DIVERSITY-CUED MINING ({len(pairs)} pairs)")
    print(f" HE: base={base_he}/{len(he)} trained={tr_he}/{len(he)} Δ={tr_he-base_he:+d}")
    print(f" MBPP: base={base_mbpp}/{len(mbpp_test)} trained={tr_mbpp}/{len(mbpp_test)} Δ={tr_mbpp-base_mbpp:+d}")
    print(f" Time: {time.time()-T0:.0f}s")
    print("=" * 70)
if __name__ == "__main__":
main()

View file

@ -0,0 +1,276 @@
"""TinyForge-Zero math with MATH-train-split as problem seeds.
Recipe:
1. Sample N problems from MATH train split (NOT test).
2. Greedy solve each. Verify with sympy against gold answer.
3. If greedy is correct, save (problem, greedy_solution) as a positive.
4. If greedy is wrong, sample 4 attempts at temp=0.8.
If any pass, mine a pair: (problem, sampled_correct_solution).
5. Repeat until max_pairs.
6. Train LoRA on pairs.
7. Eval on MATH-500 (test).
Uses MATH train as the problem source only; the model still self-generates ALL solutions.
No human solutions used.
"""
import os, json, time, re, argparse, random
os.environ.setdefault("HF_HOME", "/workspace/hf")
os.environ.setdefault("HF_HUB_ENABLE_HF_TRANSFER", "1")
os.environ["TRANSFORMERS_VERBOSITY"] = "error"
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, Trainer
from datasets import load_dataset, Dataset as HFDataset
from peft import LoraConfig, get_peft_model
import sympy
from sympy.parsing.latex import parse_latex
T0 = time.time()


def log(m):
    """Write ``m`` to stdout behind a ``[ seconds-since-T0 ]`` prefix, flushing immediately."""
    since_start = time.time() - T0
    print(f"[{since_start:7.1f}s] {m}", flush=True)
# Prompt template for one problem; `{problem}` is filled in with str.format(),
# and the doubled braces render as a literal `\boxed{...}`.
SOLVE_PROMPT = """Solve this competition math problem. Show your reasoning, then put the final answer in \\boxed{{...}}.
Problem: {problem}
Solution:"""
def extract_boxed(text):
    """Return the stripped contents of the final ``\\boxed{...}`` in ``text``.

    Braces nested inside the box are matched. Returns None if ``\\boxed{`` does
    not occur or if its closing brace is missing.
    """
    opener = "\\boxed{"
    found_at = text.rfind(opener)
    if found_at < 0:
        return None
    body_start = found_at + len(opener)
    nesting = 1
    for k in range(body_start, len(text)):
        ch = text[k]
        if ch == "{":
            nesting += 1
        elif ch == "}":
            nesting -= 1
        if nesting == 0:
            # k is the index of the brace that closes the box.
            return text[body_start:k].strip()
    return None
def normalize(s):
    """Canonicalize an answer string before comparison; None passes through.

    Removes one leading and one trailing ``$``, unwraps ``\\text{}`` and
    ``\\mbox{}``, deletes commas that sit between two digits, and drops
    ``\\left``, ``\\right`` and degree markers.
    """
    if s is None:
        return None
    cleaned = re.sub(r"^\$|\$$", "", s.strip()).strip()
    for wrapper in (r"\\text\{([^}]*)\}", r"\\mbox\{([^}]*)\}"):
        cleaned = re.sub(wrapper, r"\1", cleaned)
    cleaned = re.sub(r"(?<=\d),(?=\d)", "", cleaned)
    for token in ("\\left", "\\right", "^\\circ", "^{\\circ}"):
        cleaned = cleaned.replace(token, "")
    return cleaned.strip()
def sympy_equal(a, b):
    """Return True if answers ``a`` and ``b`` agree after normalization.

    Tries exact string equality first, then a symbolic difference via
    ``parse_latex`` + ``sympy.simplify``, then a float comparison with an
    absolute tolerance of 1e-6. A check that raises counts as "no match".
    """
    if a is None or b is None:
        return False
    lhs, rhs = normalize(a), normalize(b)
    if lhs == rhs:
        return True

    def _symbolic():
        left_expr = parse_latex(lhs)
        right_expr = parse_latex(rhs)
        return sympy.simplify(left_expr - right_expr) == 0

    def _numeric():
        left_val = float(lhs)
        right_val = float(rhs)
        return abs(left_val - right_val) < 1e-6

    for check in (_symbolic, _numeric):
        try:
            if check():
                return True
        except Exception:
            # Unparseable input for this check: move on to the next one.
            continue
    return False
def gen_batch(model, tok, prompts, max_new=600, temperature=0.0, batch=16):
    """Generate one completion per prompt, ``batch`` prompts at a time.

    Each prompt is wrapped in a system+user chat template (falling back to the
    raw prompt if templating raises). Sampling is used only when
    ``temperature > 0``. Returns the decoded text that follows the prompt
    tokens, in prompt order.
    """
    sampling = temperature > 0
    completions = []
    for lo in range(0, len(prompts), batch):
        rendered = []
        for prompt in prompts[lo:lo + batch]:
            conversation = [{"role": "system", "content": "You are a careful math problem solver."},
                            {"role": "user", "content": prompt}]
            try:
                rendered.append(tok.apply_chat_template(conversation, tokenize=False, add_generation_prompt=True))
            except Exception:
                rendered.append(prompt)
        enc = tok(rendered, return_tensors="pt", padding=True, truncation=True, max_length=1500).to(model.device)
        # Every row has the same padded prompt width; generated tokens follow it.
        prompt_width = enc.input_ids.shape[1]
        with torch.no_grad():
            generated = model.generate(**enc, max_new_tokens=max_new, do_sample=sampling,
                                       temperature=temperature if sampling else 1.0, top_p=0.95,
                                       pad_token_id=tok.eos_token_id)
        for row in range(generated.size(0)):
            completions.append(tok.decode(generated[row][prompt_width:], skip_special_tokens=True))
    return completions
def math500_eval(model, tok, n=500, batch=16):
    """Greedy-evaluate ``model`` on the first ``n`` MATH-500 test problems.

    Returns ``(n_correct, n_problems)``, where a problem counts as correct when
    the last boxed answer in the completion matches the dataset's ``answer``.
    """
    problems = list(load_dataset("HuggingFaceH4/MATH-500", split="test"))[:n]
    log(f" eval on MATH-500 ({len(problems)} problems)")
    rendered = [SOLVE_PROMPT.format(problem=item["problem"]) for item in problems]
    completions = gen_batch(model, tok, rendered, max_new=600, temperature=0.0, batch=batch)
    n_correct = sum(
        1 for item, text in zip(problems, completions)
        if sympy_equal(extract_boxed(text), item["answer"])
    )
    return n_correct, len(problems)
def make_train_example(problem, solution, tok):
    """Build one fixed-length training example from a (problem, solution) pair.

    The chat-templated prompt and the full conversation are tokenized
    separately; the conversation is truncated to 1280 tokens, the prompt
    positions are label-masked with -100, and the example is right-padded to
    exactly 1280 tokens with masked labels.
    """
    MAX = 1280
    prompt_msgs = [{"role": "system", "content": "You are a careful math problem solver."},
                   {"role": "user", "content": SOLVE_PROMPT.format(problem=problem)}]
    conversation = prompt_msgs + [{"role": "assistant", "content": solution}]
    prompt_text = tok.apply_chat_template(prompt_msgs, tokenize=False, add_generation_prompt=True)
    whole_text = tok.apply_chat_template(conversation, tokenize=False)
    prompt_ids = tok(prompt_text, add_special_tokens=False)["input_ids"]
    token_ids = tok(whole_text, add_special_tokens=False)["input_ids"][:MAX]
    # Only tokens after the prompt contribute to the loss.
    n_masked = min(len(prompt_ids), len(token_ids))
    labels = [-100] * n_masked + list(token_ids[n_masked:])
    n_pad = MAX - len(token_ids)
    return {
        "input_ids": token_ids + [tok.pad_token_id] * n_pad,
        "attention_mask": [1] * len(token_ids) + [0] * n_pad,
        "labels": labels + [-100] * n_pad,
    }
def train_on_pairs(model, tok, pairs, out_dir, lr=1e-4, epochs=2, rank=16):
    """Wrap ``model`` in a LoRA adapter, fine-tune it on ``pairs``, and return it.

    ``tok.padding_side`` is set to "right" for training and to "left" before
    returning. No checkpoints are saved (``save_strategy="no"``).
    """
    log(f" training on {len(pairs)} pairs (lr={lr}, e={epochs}, r={rank})")
    adapter_cfg = LoraConfig(
        r=rank, lora_alpha=rank*2, lora_dropout=0.05, bias="none",
        target_modules=["q_proj", "k_proj", "v_proj", "o_proj"], task_type="CAUSAL_LM",
    )
    model = get_peft_model(model, adapter_cfg)
    tok.padding_side = "right"
    examples = [make_train_example(pair["problem"], pair["solution"], tok) for pair in pairs]
    train_set = HFDataset.from_list(examples)
    train_args = TrainingArguments(
        output_dir=f"{out_dir}/ckpt", num_train_epochs=epochs,
        per_device_train_batch_size=1, gradient_accumulation_steps=4,
        learning_rate=lr, bf16=True, logging_steps=20,
        save_strategy="no", report_to="none", remove_unused_columns=False, warmup_ratio=0.05,
    )
    trainer = Trainer(model=model, args=train_args, train_dataset=train_set, processing_class=tok)
    trainer.train()
    tok.padding_side = "left"
    return model
def main():
    """Run MATH-train-seeded self-mining: base eval, mine, train LoRA, re-eval.

    Steps, in order:
      1. Parse CLI args, seed the RNGs, load tokenizer and model on cuda:0.
      2. Load and shuffle the MATH train problems across seven subject configs.
      3. Score the base model on MATH-500.
      4. For up to ``--iterations`` rounds, take ``--problems_per_iter``
         problems with an extractable gold answer; keep correct greedy
         solutions, and for the rest keep the first correct of 4 sampled
         solutions. Stop once ``--max_pairs`` is reached.
      5. Write ``pairs.jsonl``, train LoRA on the pairs, re-score MATH-500.
      6. Write ``result.json`` and print a summary banner.

    Output goes to ``/workspace/math500_seeded/<tag>``.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--model", required=True)
    ap.add_argument("--iterations", type=int, default=6)
    ap.add_argument("--problems_per_iter", type=int, default=32)
    ap.add_argument("--n_eval", type=int, default=500)
    ap.add_argument("--max_pairs", type=int, default=120)
    ap.add_argument("--seed", type=int, default=42)
    ap.add_argument("--tag", required=True)
    args = ap.parse_args()
    out_dir = f"/workspace/math500_seeded/{args.tag}"
    os.makedirs(out_dir, exist_ok=True)
    random.seed(args.seed); torch.manual_seed(args.seed)
    log(f"loading {args.model}")
    tok = AutoTokenizer.from_pretrained(args.model)
    if tok.pad_token is None: tok.pad_token = tok.eos_token
    # Left padding for generation: gen_batch() slices completions off after the
    # padded prompt width.
    tok.padding_side = "left"
    model = AutoModelForCausalLM.from_pretrained(args.model, torch_dtype=torch.bfloat16, device_map="cuda:0")
    log(f" loaded mem={torch.cuda.memory_allocated('cuda:0')/1e9:.1f}GB")
    log("loading MATH train split")
    train_ds = []
    for cfg in ["algebra","counting_and_probability","geometry","intermediate_algebra","number_theory","prealgebra","precalculus"]:
        try:
            sub = list(load_dataset("EleutherAI/hendrycks_math", cfg, split="train"))
            train_ds.extend(sub)
        except Exception as e:
            # A subject that fails to load is logged and skipped, not fatal.
            log(f" warn: failed to load {cfg}: {e}")
    log(f" {len(train_ds)} train problems")
    random.shuffle(train_ds)
    model.eval()
    log("INITIAL eval on MATH-500")
    base_c, base_n = math500_eval(model, tok, n=args.n_eval)
    log(f" MATH-500 base: {base_c}/{base_n} ({100*base_c/base_n:.1f}%)")
    pairs = []
    # Index of the next unused train problem; persists across iterations so no
    # problem is attempted twice.
    cursor = 0
    def gold_of(p):
        """Return the boxed answer from the record's reference solution, or None."""
        # The reference solution is read only to extract the final answer.
        ans = p.get("solution", "")
        b = extract_boxed(ans)
        return b
    for it in range(1, args.iterations + 1):
        log(f"--- iter {it} ---")
        batch_size = args.problems_per_iter
        # Sample with gold extractable
        batch_problems = []
        while len(batch_problems) < batch_size and cursor < len(train_ds):
            p = train_ds[cursor]; cursor += 1
            gold = gold_of(p)
            if gold is not None:
                batch_problems.append({"problem": p["problem"], "gold": gold})
        if not batch_problems:
            log(" exhausted train problems"); break
        # Greedy
        prompts = [SOLVE_PROMPT.format(problem=p["problem"]) for p in batch_problems]
        greedy_outs = gen_batch(model, tok, prompts, max_new=600, temperature=0.0, batch=16)
        greedy_correct, hard_idx = 0, []
        for i, (p, raw) in enumerate(zip(batch_problems, greedy_outs)):
            pred = extract_boxed(raw)
            if sympy_equal(pred, p["gold"]):
                # Correct greedy solutions are kept as training positives.
                pairs.append({"problem": p["problem"], "solution": raw.strip(), "source": "greedy"})
                greedy_correct += 1
            else:
                hard_idx.append(i)
        log(f" iter {it}: {greedy_correct} greedy-correct, {len(hard_idx)} hard")
        # Sampled for hard
        if hard_idx:
            hard_problems = [batch_problems[i] for i in hard_idx]
            sample_prompts = []
            for p in hard_problems:
                # Four consecutive copies per problem; sliced back apart below.
                sample_prompts.extend([SOLVE_PROMPT.format(problem=p["problem"])] * 4)
            sample_outs = gen_batch(model, tok, sample_prompts, max_new=600, temperature=0.8, batch=16)
            sampled_correct = 0
            for i, p in enumerate(hard_problems):
                attempts = sample_outs[i*4:(i+1)*4]
                preds = [extract_boxed(a) for a in attempts]
                correct_idx = [j for j, pr in enumerate(preds) if sympy_equal(pr, p["gold"])]
                if correct_idx:
                    # Keep only the first correct sample for this problem.
                    pairs.append({"problem": p["problem"], "solution": attempts[correct_idx[0]].strip(), "source": "sampled"})
                    sampled_correct += 1
            log(f" iter {it}: {sampled_correct} sampled-correct (from {len(hard_idx)} hard)")
        log(f" iter {it}: pairs total = {len(pairs)}")
        # Checked once per iteration, so the final count can exceed max_pairs.
        if len(pairs) >= args.max_pairs:
            log(f" reached max_pairs={args.max_pairs}, stopping")
            break
    log(f"=== mined {len(pairs)} total pairs ===")
    with open(f"{out_dir}/pairs.jsonl", "w") as fh:
        for p in pairs: fh.write(json.dumps(p) + "\n")
    if not pairs:
        # No result.json is written on this path.
        log("no pairs — exiting"); return
    model = train_on_pairs(model, tok, pairs, out_dir)
    log("training done")
    model.eval()
    log("FINAL eval on MATH-500")
    tr_c, tr_n = math500_eval(model, tok, n=args.n_eval)
    log(f" MATH-500 trained: {tr_c}/{tr_n} ({100*tr_c/tr_n:.1f}%)")
    result = {
        "model": args.model, "n_pairs": len(pairs),
        "base": base_c, "trained": tr_c, "n": tr_n,
        "delta": tr_c - base_c, "elapsed_s": time.time() - T0,
    }
    with open(f"{out_dir}/result.json", "w") as fh: json.dump(result, fh, indent=2)
    print()
    print("=" * 70)
    print(f" {args.model}")
    # The base score is printed over tr_n; both evals use the same n_eval.
    print(f" MATH-500: base={base_c}/{tr_n} trained={tr_c}/{tr_n} Δ={tr_c-base_c:+d}")
    print(f" Pairs mined: {len(pairs)}")
    print(f" Time: {time.time()-T0:.0f}s")
    print("=" * 70)
if __name__ == "__main__":
main()

View file

@ -0,0 +1,241 @@
"""Self-bootstrap with MBPP-train as problem seeds + vLLM on H100.
- Use MBPP train (374 problems) as PROBLEM seeds (no human solutions used).
- For each: greedy attempt. If fails, sample N attempts at temp=0.8.
- Mine at-edge pairs (broken, fixed).
- Train LoRA. Eval on HumanEval + MBPP-test.
"""
import os, json, time, re, subprocess, tempfile, argparse, gc, random
os.environ.setdefault("HF_HOME", "/workspace/hf")
os.environ.setdefault("HF_HUB_ENABLE_HF_TRANSFER", "1")
os.environ["TRANSFORMERS_VERBOSITY"] = "error"
import torch
from datasets import load_dataset
T0 = time.time()


def log(m):
    """Print ``m`` with a leading ``[elapsed s]`` stamp relative to ``T0`` and flush."""
    seconds = time.time() - T0
    print(f"[{seconds:7.1f}s] {m}", flush=True)
def run_python(code, timeout=8):
    """Run ``code`` under ``python3`` and return ``(ok, error_text)``.

    Args:
        code: Python source text to execute.
        timeout: Maximum seconds to wait for the subprocess.

    Returns:
        ``(True, stderr[:200])`` on exit status 0, ``(False, stderr[:200])`` on
        a non-zero exit, and ``(False, "timeout")`` if the timeout expired.
    """
    # Write as UTF-8 (the default source encoding) independent of the locale.
    with tempfile.NamedTemporaryFile("w", suffix=".py", delete=False, encoding="utf-8") as f:
        f.write(code)
        path = f.name
    try:
        # errors="replace" keeps undecodable child output from raising
        # UnicodeDecodeError in the harness.
        r = subprocess.run(["python3", path], capture_output=True, timeout=timeout,
                           text=True, errors="replace", cwd="/tmp")
        return r.returncode == 0, (r.stderr or "")[:200]
    except subprocess.TimeoutExpired:
        return False, "timeout"
    finally:
        # Cleanup is best-effort, but only filesystem errors are ignored so
        # that KeyboardInterrupt / SystemExit still propagate.
        try:
            os.unlink(path)
        except OSError:
            pass
def vllm_gen(llm, prompts, max_new=400, temperature=0.0, n=1, stops=None):
    """Generate completions for ``prompts`` with a vLLM engine.

    Returns a list of strings when ``n == 1``, otherwise a list of ``n``-item
    lists (one inner list per prompt). ``top_p`` is 0.95 when sampling and 1.0
    for greedy decoding; ``stops`` falls back to a default stop list.
    """
    from vllm import SamplingParams
    stop_strings = stops or ["\nclass ", "\nif __name__", "\n\nprint", "\n\ndef "]
    nucleus = 0.95 if temperature > 0 else 1.0
    params = SamplingParams(temperature=temperature, top_p=nucleus,
                            max_tokens=max_new, n=n, stop=stop_strings)
    results = llm.generate(prompts, params, use_tqdm=False)
    if n != 1:
        # One inner list of candidate texts per prompt.
        return [[cand.text for cand in req.outputs] for req in results]
    return [req.outputs[0].text for req in results]
def he_prompt(p):
    """Return the HumanEval record's ``"prompt"`` text as-is."""
    text = p["prompt"]
    return text
def mbpp_prompt(p):
    """Format an MBPP record as a comment block: the task line, then its tests."""
    task_line = f"# Task: {p['prompt']}\n"
    tests_block = f"# Tests:\n# " + "\n# ".join(p["test_list"]) + "\n\n"
    return task_line + tests_block
def main():
    """Run MBPP-train-seeded self-mining: base eval, mine pairs, train, re-eval.

    Steps, in order:
      1. Parse CLI args and load the model into vLLM.
      2. Score the base model (greedy) on HumanEval and up to 200
         MBPP-sanitized test problems.
      3. Greedy-solve every MBPP-sanitized train problem; for each failure,
         sample ``--attempts_per`` attempts and pair the failed greedy output
         with the first passing sample, up to ``--max_pairs`` pairs.
      4. Write ``pairs.jsonl``; exit if fewer than 5 pairs were mined.
      5. Drop vLLM, train a LoRA adapter, save it, reload vLLM with the adapter
         and re-score both benchmarks.
      6. Write ``result.json`` and print a summary banner.

    Output goes to ``/workspace/selfmine_mbpp/<tag>``. The two early exits
    (nothing to mine, too few pairs) return without writing ``result.json``.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--model", required=True)
    ap.add_argument("--attempts_per", type=int, default=8)
    ap.add_argument("--max_pairs", type=int, default=200)
    ap.add_argument("--tag", required=True)
    args = ap.parse_args()
    out_dir = f"/workspace/selfmine_mbpp/{args.tag}"
    os.makedirs(out_dir, exist_ok=True)
    from vllm import LLM
    from transformers import AutoTokenizer
    log(f"loading {args.model} into vLLM")
    tok = AutoTokenizer.from_pretrained(args.model)
    # The pad token is needed later by make_ex() when padding training examples.
    if tok.pad_token is None: tok.pad_token = tok.eos_token
    llm = LLM(model=args.model, dtype="bfloat16", gpu_memory_utilization=0.85, max_model_len=2048)
    log(f" loaded")
    # --- Load benchmarks
    he = list(load_dataset("openai_humaneval", split="test"))
    mbpp_test = list(load_dataset("mbpp", "sanitized", split="test"))[:200]
    # NOTE(review): the module docstring says "374 problems", but this loads the
    # "sanitized" config — confirm the count against the length logged below.
    mbpp_train = list(load_dataset("mbpp", "sanitized", split="train"))
    log(f" HE: {len(he)}, MBPP-test: {len(mbpp_test)}, MBPP-train: {len(mbpp_train)}")
    # --- BASE eval
    log("=== BASE evals ===")
    t0 = time.time()
    he_outs = vllm_gen(llm, [he_prompt(p) for p in he], max_new=400)
    log(f" HE base gen done in {time.time()-t0:.1f}s")
    base_he = 0
    for p, raw in zip(he, he_outs):
        # Completion is appended directly to the prompt, then the dataset's test
        # code and a check(<entry_point>) call are added.
        full = p["prompt"] + raw
        test_code = full + "\n\n" + p["test"] + f"\n\ncheck({p['entry_point']})"
        ok, _ = run_python(test_code, timeout=10)
        if ok: base_he += 1
    t1 = time.time()
    mbpp_outs = vllm_gen(llm, [mbpp_prompt(p) for p in mbpp_test], max_new=400)
    log(f" MBPP-test base gen done in {time.time()-t1:.1f}s")
    base_mbpp = 0
    for p, raw in zip(mbpp_test, mbpp_outs):
        test_code = raw + "\n\n" + "\n".join(p["test_list"])
        ok, _ = run_python(test_code, timeout=10)
        if ok: base_mbpp += 1
    log(f" BASE: HE={base_he}/{len(he)} MBPP={base_mbpp}/{len(mbpp_test)}")
    # --- Mine pairs from MBPP-train
    log(f"=== mining from {len(mbpp_train)} MBPP-train problems ===")
    train_prompts = [mbpp_prompt(p) for p in mbpp_train]
    # greedy attempt
    t0 = time.time()
    greedy_outs = vllm_gen(llm, train_prompts, max_new=400)
    log(f" greedy gen in {time.time()-t0:.1f}s")
    pairs = []
    # Entries are (index, problem, greedy_output, stderr_snippet) for each
    # problem whose greedy completion fails its tests.
    hard_indices = []
    for i, (p, raw) in enumerate(zip(mbpp_train, greedy_outs)):
        test_code = raw + "\n\n" + "\n".join(p["test_list"])
        ok, err = run_python(test_code, timeout=8)
        if not ok:
            hard_indices.append((i, p, raw, err))
    log(f" {len(mbpp_train) - len(hard_indices)} greedy-correct, {len(hard_indices)} hard")
    if not hard_indices:
        log("nothing to mine — base too strong"); return
    # sample N attempts per hard problem
    log(f" sampling {args.attempts_per} attempts × {len(hard_indices)} hard problems...")
    hard_prompts = []
    for _i, p, _r, _e in hard_indices:
        hard_prompts.append(mbpp_prompt(p))
    t1 = time.time()
    # NOTE(review): vllm_gen returns flat strings, not lists, when n == 1, so
    # --attempts_per 1 would make the loop below iterate over characters.
    sample_outs = vllm_gen(llm, hard_prompts, max_new=400, temperature=0.8, n=args.attempts_per)
    log(f" sample gen in {time.time()-t1:.1f}s")
    t2 = time.time()
    for (idx, p, greedy_raw, err), attempts in zip(hard_indices, sample_outs):
        # check each attempt
        passes = []
        for a in attempts:
            test_code = a + "\n\n" + "\n".join(p["test_list"])
            ok, _ = run_python(test_code, timeout=8)
            if ok: passes.append(a)
        if passes:
            # Pair the failed greedy output with the first passing sample; the
            # greedy run's stderr snippet is kept for the training prompt.
            pairs.append({
                "problem": p["prompt"],
                "tests": p["test_list"],
                "broken": greedy_raw.strip(),
                "fixed": passes[0].strip(),
                "error": err,
            })
        if len(pairs) >= args.max_pairs: break
    log(f" verification in {time.time()-t2:.1f}s — mined {len(pairs)} pairs")
    with open(f"{out_dir}/pairs.jsonl", "w") as fh:
        for r in pairs: fh.write(json.dumps(r) + "\n")
    if len(pairs) < 5:
        log("too few pairs — exiting"); return
    # --- Train LoRA
    log("=== TRAINING ===")
    del llm; gc.collect(); torch.cuda.empty_cache()
    from transformers import AutoModelForCausalLM, TrainingArguments, Trainer
    from datasets import Dataset as HFDataset
    from peft import LoraConfig, get_peft_model
    def make_ex(r):
        """Tokenize one pair into fixed-length input_ids / attention_mask / labels."""
        # The prompt includes the first 120 characters of the recorded error.
        user = (f"# Task: {r['problem']}\n"
                f"# Tests:\n# " + "\n# ".join(r['tests']) + "\n"
                f"# My broken attempt:\n{r['broken']}\n"
                f"# Error: {r.get('error','')[:120]}\n"
                f"# Corrected:\n")
        target = r["fixed"]
        full = user + target
        full_ids = tok(full, add_special_tokens=False)["input_ids"]
        user_ids = tok(user, add_special_tokens=False)["input_ids"]
        MAX = 1024
        full_ids = full_ids[:MAX]
        # Mask the prompt portion with -100 so only the "fixed" tokens carry labels.
        labels = list(full_ids)
        n_user = min(len(user_ids), len(labels))
        for i in range(n_user): labels[i] = -100
        # Right-pad every example to exactly MAX tokens; padding is also label-masked.
        pad = MAX - len(full_ids)
        return {"input_ids": full_ids + [tok.pad_token_id]*pad,
                "attention_mask": [1]*len(full_ids) + [0]*pad,
                "labels": labels + [-100]*pad}
    model = AutoModelForCausalLM.from_pretrained(args.model, torch_dtype=torch.bfloat16, device_map="cuda:0")
    lora_cfg = LoraConfig(r=16, lora_alpha=32, lora_dropout=0.05, bias="none",
                          target_modules=["q_proj", "k_proj", "v_proj", "o_proj"], task_type="CAUSAL_LM")
    model = get_peft_model(model, lora_cfg)
    ds = HFDataset.from_list([make_ex(r) for r in pairs])
    targs = TrainingArguments(
        output_dir=f"{out_dir}/ckpt", num_train_epochs=2,
        per_device_train_batch_size=2, gradient_accumulation_steps=4,
        learning_rate=1e-4, bf16=True, logging_steps=20,
        save_strategy="no", report_to="none", remove_unused_columns=False, warmup_ratio=0.05,
    )
    # NOTE(review): passes the tokenizer as `tokenizer=`; confirm this keyword is
    # accepted by the installed transformers version.
    Trainer(model=model, args=targs, train_dataset=ds, tokenizer=tok).train()
    log("training done")
    adapter_dir = f"{out_dir}/adapter"
    model.save_pretrained(adapter_dir)
    del model; gc.collect(); torch.cuda.empty_cache()
    # --- TRAINED eval
    from vllm import LLM
    from vllm.lora.request import LoRARequest
    # max_lora_rank matches the r=16 used in LoraConfig above.
    llm = LLM(model=args.model, dtype="bfloat16", gpu_memory_utilization=0.85, max_model_len=2048,
              enable_lora=True, max_lora_rank=16)
    lora_req = LoRARequest("tf_adapter", 1, adapter_dir)
    from vllm import SamplingParams
    # Same stop list and token budget that vllm_gen used for the base eval.
    sp = SamplingParams(temperature=0, max_tokens=400, stop=["\nclass ", "\nif __name__", "\n\nprint", "\n\ndef "])
    log("=== TRAINED evals ===")
    t0 = time.time()
    he_outs = [o.outputs[0].text for o in llm.generate([he_prompt(p) for p in he], sp, lora_request=lora_req, use_tqdm=False)]
    log(f" HE trained gen in {time.time()-t0:.1f}s")
    tr_he = 0
    for p, raw in zip(he, he_outs):
        full = p["prompt"] + raw
        test_code = full + "\n\n" + p["test"] + f"\n\ncheck({p['entry_point']})"
        ok, _ = run_python(test_code, timeout=10)
        if ok: tr_he += 1
    t1 = time.time()
    mbpp_outs = [o.outputs[0].text for o in llm.generate([mbpp_prompt(p) for p in mbpp_test], sp, lora_request=lora_req, use_tqdm=False)]
    log(f" MBPP-test trained gen in {time.time()-t1:.1f}s")
    tr_mbpp = 0
    for p, raw in zip(mbpp_test, mbpp_outs):
        test_code = raw + "\n\n" + "\n".join(p["test_list"])
        ok, _ = run_python(test_code, timeout=10)
        if ok: tr_mbpp += 1
    result = {
        "model": args.model, "n_pairs": len(pairs),
        "humaneval": {"base": base_he, "trained": tr_he, "delta": tr_he-base_he, "n": len(he)},
        "mbpp": {"base": base_mbpp, "trained": tr_mbpp, "delta": tr_mbpp-base_mbpp, "n": len(mbpp_test)},
        "elapsed_s": time.time() - T0,
    }
    with open(f"{out_dir}/result.json", "w") as fh: json.dump(result, fh, indent=2)
    print()
    print("=" * 70)
    print(f" {args.model} — MBPP-train SEEDED ({len(pairs)} pairs)")
    print(f" HumanEval: base={base_he}/{len(he)} trained={tr_he}/{len(he)} Δ={tr_he-base_he:+d}")
    print(f" MBPP: base={base_mbpp}/{len(mbpp_test)} trained={tr_mbpp}/{len(mbpp_test)} Δ={tr_mbpp-base_mbpp:+d}")
    print(f" Time: {time.time()-T0:.0f}s")
    print("=" * 70)
if __name__ == "__main__":
main()

View file

@ -0,0 +1,210 @@
"""Compound recipe + TTS: train recipe, then measure best-of-N on TOP of recipe-trained model.
Tests if recipe-trained model has BETTER sample diversity / quality at inference."""
import os, json, time, re, subprocess, tempfile, argparse, gc, random
os.environ.setdefault("HF_HOME", "/workspace/hf")
os.environ["TRANSFORMERS_VERBOSITY"] = "error"
import torch
from datasets import load_dataset
T0 = time.time()
def log(m):
    """Print *m* prefixed with the seconds elapsed since ``T0``, flushing immediately."""
    elapsed = time.time() - T0
    print(f"[{elapsed:7.1f}s] {m}", flush=True)
def run_python(code, timeout=10):
    """Execute *code* in a fresh ``python3`` subprocess and report success.

    The source is written to a temporary ``.py`` file, run with ``/tmp`` as
    the working directory, and the file is removed afterwards.

    Args:
        code: Python source text to execute.
        timeout: Seconds to wait before the run is treated as a failure.

    Returns:
        True if the process exits with status 0; False on a non-zero exit
        status or when the timeout expires.
    """
    # Python reads source files as UTF-8, so write UTF-8 explicitly instead of
    # relying on the locale's default encoding.
    tmp = tempfile.NamedTemporaryFile("w", suffix=".py", delete=False, encoding="utf-8")
    path = tmp.name
    try:
        with tmp:
            tmp.write(code)
        # Only the exit status is used. Discarding the child's output avoids
        # buffering it in memory and avoids decoding it (arbitrary bytes on
        # stdout would otherwise raise UnicodeDecodeError in the parent).
        r = subprocess.run(
            ["python3", path],
            stdin=subprocess.DEVNULL,
            stdout=subprocess.DEVNULL,
            stderr=subprocess.DEVNULL,
            timeout=timeout,
            cwd="/tmp",
        )
        return r.returncode == 0
    except subprocess.TimeoutExpired:
        return False
    finally:
        # Best-effort cleanup: a failed unlink must not mask the result.
        try:
            os.unlink(path)
        except OSError:
            pass
def mbpp_prompt(p):
    """Render an MBPP problem as a comment header: the task, then each test as a ``# `` line."""
    commented_tests = "\n# ".join(p["test_list"])
    return f"# Task: {p['prompt']}\n# Tests:\n# {commented_tests}\n\n"
def he_prompt(p):
    """Return the HumanEval problem's ``prompt`` field unchanged."""
    prompt_text = p["prompt"]
    return prompt_text
def he_score_outputs(he, outs):
    """Count the HumanEval problems whose completion passes the problem's own tests.

    ``he`` and ``outs`` are walked in lockstep. For each completion, the
    contents of a ```` ```python ```` fence are kept if one is present and
    anything from the next ```` ``` ```` onward is dropped. The result is
    appended to the problem prompt and executed together with the problem's
    ``test`` code and a ``check(entry_point)`` call.
    """
    passed = 0
    for problem, completion in zip(he, outs):
        _, fence, after_fence = completion.partition("```python")
        body = after_fence if fence else completion
        # partition() returns the whole string when no closing fence exists
        body = body.partition("```")[0]
        program = (
            problem["prompt"] + "\n" + body
            + "\n\n" + problem["test"]
            + f"\n\ncheck({problem['entry_point']})"
        )
        if run_python(program, 10):
            passed += 1
    return passed
def main():
    """Run the recipe x test-time-sampling (TTS) compound experiment on HumanEval.

    Stages, all on the model given by ``--model``:
      A) raw model, greedy decoding
      B) raw model, best-of-8 sampling
      -  mine (broken, fixed) pairs from MBPP-train, then train a LoRA adapter
      C) adapter, greedy decoding
      D) adapter, best-of-8 sampling

    Writes the adapter and ``result.json`` under ``--out_dir`` and prints a
    summary table. Returns early, before training and without writing
    ``result.json``, when fewer than 5 pairs were mined. ``--tag`` is accepted
    but not read by this function.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--model", required=True)
    ap.add_argument("--tag", required=True)
    ap.add_argument("--out_dir", required=True)
    args = ap.parse_args()
    os.makedirs(args.out_dir, exist_ok=True)
    random.seed(42)

    from vllm import LLM, SamplingParams
    from transformers import AutoTokenizer

    log(f"loading {args.model}")
    tok = AutoTokenizer.from_pretrained(args.model)
    if tok.pad_token is None:
        tok.pad_token = tok.eos_token
    llm = LLM(model=args.model, dtype="bfloat16", gpu_memory_utilization=0.85, max_model_len=2048)
    log("loaded")

    he = list(load_dataset("openai_humaneval", split="test"))
    he_prompts = [he_prompt(p) for p in he]

    def best_of_n(samples):
        # A problem counts as solved when any of its samples passes. Scoring a
        # single (problem, text) pair through he_score_outputs yields 0 or 1,
        # and any() stops at the first passing sample.
        return sum(
            1
            for p, outset in zip(he, samples)
            if any(he_score_outputs([p], [o.text]) for o in outset.outputs)
        )

    # 4 metrics:
    # A) raw greedy
    # B) raw + best-of-8
    # C) recipe greedy
    # D) recipe + best-of-8
    sp_g = SamplingParams(temperature=0, max_tokens=400, stop=["\nclass ", "\nif __name__", "\n\nprint"])
    sp_s = SamplingParams(temperature=0.6, top_p=0.95, max_tokens=400, n=8,
                          stop=["\nclass ", "\nif __name__", "\n\nprint"])

    log("A) raw greedy")
    he_outs = [o.outputs[0].text for o in llm.generate(he_prompts, sp_g, use_tqdm=False)]
    A_raw_greedy = he_score_outputs(he, he_outs)
    log(f" raw greedy: {A_raw_greedy}/{len(he)}")

    log("B) raw best-of-8")
    B_raw_bo8 = best_of_n(llm.generate(he_prompts, sp_s, use_tqdm=False))
    log(f" raw best-of-8: {B_raw_bo8}/{len(he)}")

    # Mine pairs: seeds the greedy pass fails are "hard"; the first sampled
    # attempt that passes becomes the fix for the greedy (broken) attempt.
    log("mining pairs from MBPP-train...")
    mbpp_full = list(load_dataset("mbpp", split="train"))
    random.shuffle(mbpp_full)
    seeds = []
    for p in mbpp_full[:200]:
        prompt_text = p.get("prompt") or p.get("text", "")
        if prompt_text and p.get("test_list"):
            seeds.append({"prompt": prompt_text, "test_list": p["test_list"]})
    sp_mine = SamplingParams(temperature=0, max_tokens=400, stop=["\nclass Test", "\nif __name__"])
    g_outs = [o.outputs[0].text for o in llm.generate([mbpp_prompt(p) for p in seeds], sp_mine, use_tqdm=False)]
    hard_idx = [i for i, (p, raw) in enumerate(zip(seeds, g_outs))
                if not run_python(raw + "\n\n" + "\n".join(p["test_list"]), 8)]
    log(f" hard: {len(hard_idx)}")
    pairs = []
    if hard_idx:
        sp_m2 = SamplingParams(temperature=0.8, top_p=0.95, max_tokens=400, n=8,
                               stop=["\nclass Test", "\nif __name__"])
        hard_prompts = [mbpp_prompt(seeds[i]) for i in hard_idx]
        sample_outs = llm.generate(hard_prompts, sp_m2, use_tqdm=False)
        for j, i in enumerate(hard_idx):
            tests = "\n".join(seeds[i]["test_list"])
            for o in sample_outs[j].outputs:
                if run_python(o.text + "\n\n" + tests, 8):
                    pairs.append({"problem": seeds[i]["prompt"], "tests": seeds[i]["test_list"],
                                  "broken": g_outs[i].strip(), "fixed": o.text.strip()})
                    break  # one pair per problem
    log(f" mined {len(pairs)} pairs")

    # Train LoRA. The vLLM engine is dropped first so the GPU is free.
    del llm
    gc.collect()
    torch.cuda.empty_cache()
    if len(pairs) < 5:
        log("too few pairs, exit")
        return

    from transformers import AutoModelForCausalLM, TrainingArguments, Trainer
    from datasets import Dataset as HFDataset
    from peft import LoraConfig, get_peft_model

    def mk_ex(r):
        # Build one fixed-length (1024 token) example. Loss is computed only
        # on the corrected solution: prompt and padding labels are -100.
        user = (f"# Task: {r['problem']}\n# Tests:\n# " + "\n# ".join(r['tests']) + "\n"
                f"# My broken attempt:\n{r['broken']}\n# Corrected:\n")
        full = user + r["fixed"]
        full_ids = tok(full, add_special_tokens=False)["input_ids"]
        user_ids = tok(user, add_special_tokens=False)["input_ids"]
        MAX = 1024
        full_ids = full_ids[:MAX]
        labels = list(full_ids)
        n_user = min(len(user_ids), len(labels))
        labels[:n_user] = [-100] * n_user
        pad = MAX - len(full_ids)
        return {"input_ids": full_ids + [tok.pad_token_id]*pad,
                "attention_mask": [1]*len(full_ids) + [0]*pad,
                "labels": labels + [-100]*pad}

    log("training...")
    model = AutoModelForCausalLM.from_pretrained(args.model, torch_dtype=torch.bfloat16, device_map="cuda:0")
    lora_cfg = LoraConfig(r=16, lora_alpha=32, lora_dropout=0.05, bias="none",
                          target_modules=["q_proj", "k_proj", "v_proj", "o_proj"], task_type="CAUSAL_LM")
    model = get_peft_model(model, lora_cfg)
    ds_train = HFDataset.from_list([mk_ex(r) for r in pairs])
    targs = TrainingArguments(
        output_dir=f"{args.out_dir}/ckpt", num_train_epochs=2,
        per_device_train_batch_size=1, gradient_accumulation_steps=4,
        learning_rate=1e-4, bf16=True, logging_steps=20,
        save_strategy="no", report_to="none", remove_unused_columns=False, warmup_ratio=0.05,
    )
    Trainer(model=model, args=targs, train_dataset=ds_train, tokenizer=tok).train()
    adapter_dir = f"{args.out_dir}/adapter"
    model.save_pretrained(adapter_dir)
    del model
    gc.collect()
    torch.cuda.empty_cache()

    # C, D: reload the base model in vLLM with LoRA enabled and attach the adapter.
    from vllm.lora.request import LoRARequest
    llm = LLM(model=args.model, dtype="bfloat16", gpu_memory_utilization=0.85, max_model_len=2048,
              enable_lora=True, max_lora_rank=16)
    lora_req = LoRARequest("trained", 1, adapter_dir)

    log("C) recipe greedy")
    he_outs = [o.outputs[0].text
               for o in llm.generate(he_prompts, sp_g, lora_request=lora_req, use_tqdm=False)]
    C_rec_greedy = he_score_outputs(he, he_outs)
    log(f" recipe greedy: {C_rec_greedy}/{len(he)}")

    log("D) recipe best-of-8")
    D_rec_bo8 = best_of_n(llm.generate(he_prompts, sp_s, lora_request=lora_req, use_tqdm=False))
    log(f" recipe best-of-8: {D_rec_bo8}/{len(he)}")

    result = {
        "model": args.model, "n_pairs": len(pairs),
        "raw_greedy": A_raw_greedy, "raw_bo8": B_raw_bo8,
        "recipe_greedy": C_rec_greedy, "recipe_bo8": D_rec_bo8,
        "n": len(he), "elapsed_s": time.time() - T0,
    }
    with open(f"{args.out_dir}/result.json", "w") as fh:
        json.dump(result, fh, indent=2)
    print()
    print("=" * 70)
    print(f" {args.model} — RECIPE × TTS COMPOUND (HumanEval, n={len(he)}, {len(pairs)} pairs)")
    print(f" A) Raw greedy: {A_raw_greedy:>3}/{len(he)} ({100*A_raw_greedy/len(he):.1f}%)")
    print(f" B) Raw best-of-8: {B_raw_bo8:>3}/{len(he)} ({100*B_raw_bo8/len(he):.1f}%)")
    print(f" C) Recipe greedy: {C_rec_greedy:>3}/{len(he)} ({100*C_rec_greedy/len(he):.1f}%)")
    print(f" D) Recipe best-of-8: {D_rec_bo8:>3}/{len(he)} ({100*D_rec_bo8/len(he):.1f}%)")
    print(f" Synergy: D - max(B,C) = {D_rec_bo8 - max(B_raw_bo8, C_rec_greedy):+d} (>0 = real synergy)")
    print(f" Time: {time.time()-T0:.0f}s")
    print("=" * 70)
if __name__ == "__main__":
main()

View file

@ -0,0 +1,219 @@
"""Recursive self-bootstrap: iter1->iter2->iter3.
Iter k:
- Use model from previous iter (or base for iter 1)
- Mine pairs on MBPP-train
- Train fresh LoRA from BASE on accumulated pairs
- Eval on HE
"""
import os, json, time, re, subprocess, tempfile, argparse, gc, random
os.environ.setdefault("HF_HOME", "/workspace/hf")
os.environ["TRANSFORMERS_VERBOSITY"] = "error"
import torch
from datasets import load_dataset
T0 = time.time()
def log(m):
    """Emit *m* on stdout with an ``[elapsed-seconds]`` prefix measured from ``T0``."""
    stamp = f"[{time.time()-T0:7.1f}s]"
    print(f"{stamp} {m}", flush=True)
def run_python(code, timeout=10):
    """Run *code* as a script under ``python3`` and return whether it succeeded.

    Args:
        code: Python source text; it is written to a temporary ``.py`` file.
        timeout: Seconds allowed before the run counts as failed.

    Returns:
        True when the subprocess exits with status 0, otherwise False
        (non-zero exit status or timeout).
    """
    # Write the script as UTF-8 (the encoding Python assumes for source
    # files) rather than whatever the locale default happens to be.
    script = tempfile.NamedTemporaryFile("w", suffix=".py", delete=False, encoding="utf-8")
    path = script.name
    try:
        with script:
            script.write(code)
        # The child's output is never inspected, so it is discarded instead of
        # captured and decoded: undecodable bytes can no longer raise in the
        # parent, and large outputs are not held in memory.
        proc = subprocess.run(
            ["python3", path],
            stdin=subprocess.DEVNULL,
            stdout=subprocess.DEVNULL,
            stderr=subprocess.DEVNULL,
            timeout=timeout,
            cwd="/tmp",
        )
        return proc.returncode == 0
    except subprocess.TimeoutExpired:
        return False
    finally:
        # Cleanup is best-effort; only filesystem errors are ignored.
        try:
            os.unlink(path)
        except OSError:
            pass
def mbpp_prompt(p):
    """Format an MBPP problem as a commented task description followed by its tests."""
    header = f"# Task: {p['prompt']}\n# Tests:\n# "
    tests_block = "\n# ".join(p["test_list"])
    return header + tests_block + "\n\n"
def he_prompt(p):
    """Return the ``prompt`` field of a HumanEval record as the generation prompt."""
    return p["prompt"]
def vllm_gen(llm, prompts, max_new=400, temperature=0.0, n=1, lora_req=None, stops=None):
    """Generate completions for *prompts* with vLLM.

    ``top_p`` is 0.95 when sampling (``temperature > 0``) and 1.0 otherwise.
    A falsy ``stops`` selects the default stop strings. ``lora_req`` is
    forwarded as ``lora_request`` only when truthy.

    Returns:
        For ``n == 1`` a list with one string per prompt; otherwise a list
        of lists holding the ``n`` sampled strings for each prompt.
    """
    from vllm import SamplingParams

    if stops:
        stop_strings = stops
    else:
        stop_strings = ["\nclass Test", "\nif __name__", "\n\nprint", "\nassert "]
    nucleus = 0.95 if temperature > 0 else 1.0
    params = SamplingParams(temperature=temperature, top_p=nucleus,
                            max_tokens=max_new, n=n, stop=stop_strings)
    lora_kwargs = {"lora_request": lora_req} if lora_req else {}
    results = llm.generate(prompts, params, use_tqdm=False, **lora_kwargs)
    if n == 1:
        return [res.outputs[0].text for res in results]
    return [[cand.text for cand in res.outputs] for res in results]
def main():
    """Run the recursive self-bootstrap experiment for ``--n_iters`` iterations.

    Each iteration:
      1. loads the model in vLLM, with the most recent adapter if one exists;
      2. mines (broken, fixed) pairs on a fresh slice of ``--n_mining``
         MBPP-train seeds, sampling ``--attempts_per`` fixes per failed seed;
      3. evaluates HumanEval with the current model ("pre-train");
      4. trains a fresh LoRA adapter from the base model on all pairs
         accumulated so far and re-evaluates HumanEval ("post-train").

    Training is skipped for an iteration while fewer than 5 pairs have been
    accumulated. After the last iteration, ``pairs.jsonl`` and
    ``result.json`` are written to ``--out_dir`` and a summary is printed.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--model", required=True)
    ap.add_argument("--tag", required=True)
    ap.add_argument("--out_dir", required=True)
    ap.add_argument("--n_iters", type=int, default=3)
    ap.add_argument("--n_mining", type=int, default=200)
    ap.add_argument("--attempts_per", type=int, default=8)
    args = ap.parse_args()
    os.makedirs(args.out_dir, exist_ok=True)

    from vllm import LLM
    from vllm.lora.request import LoRARequest
    from transformers import AutoTokenizer

    log(f"loading {args.model}")
    tok = AutoTokenizer.from_pretrained(args.model)
    if tok.pad_token is None:
        tok.pad_token = tok.eos_token
    he = list(load_dataset("openai_humaneval", split="test"))
    mbpp_full = list(load_dataset("mbpp", split="train"))
    random.seed(42)
    random.shuffle(mbpp_full)
    seeds_pool = []
    for p in mbpp_full[:args.n_mining * args.n_iters]:
        prompt_text = p.get("prompt") or p.get("text", "")
        if prompt_text and p.get("test_list"):
            seeds_pool.append({"prompt": prompt_text, "test_list": p["test_list"]})
    log(f"seeds pool: {len(seeds_pool)}")

    def he_pass_count(engine, lora_req):
        # Greedy HumanEval pass count; shared by the pre- and post-train evals.
        he_outs = vllm_gen(engine, [he_prompt(p) for p in he], max_new=400, lora_req=lora_req,
                           stops=["\nclass ", "\nif __name__", "\n\nprint"])
        return sum(
            1
            for p, raw in zip(he, he_outs)
            if run_python(p["prompt"] + "\n" + raw + "\n\n" + p["test"]
                          + f"\n\ncheck({p['entry_point']})", 10)
        )

    def mk_ex(r):
        # Build one fixed-length (1024 token) example. Loss is computed only
        # on the corrected solution: prompt and padding labels are -100.
        user = (f"# Task: {r['problem']}\n# Tests:\n# " + "\n# ".join(r['tests']) + "\n"
                f"# My broken attempt:\n{r['broken']}\n# Corrected:\n")
        full = user + r["fixed"]
        full_ids = tok(full, add_special_tokens=False)["input_ids"]
        user_ids = tok(user, add_special_tokens=False)["input_ids"]
        MAX = 1024
        full_ids = full_ids[:MAX]
        labels = list(full_ids)
        n_user = min(len(user_ids), len(labels))
        labels[:n_user] = [-100] * n_user
        pad = MAX - len(full_ids)
        return {"input_ids": full_ids + [tok.pad_token_id]*pad,
                "attention_mask": [1]*len(full_ids) + [0]*pad,
                "labels": labels + [-100]*pad}

    iter_results = []
    accumulated_pairs = []
    current_adapter = None  # path
    for it in range(1, args.n_iters + 1):
        log(f"\n========== ITER {it} ==========")
        # Load model (with current adapter if exists)
        llm = LLM(model=args.model, dtype="bfloat16", gpu_memory_utilization=0.85,
                  max_model_len=2048,
                  enable_lora=(current_adapter is not None), max_lora_rank=16)
        lora_req = LoRARequest("cur", 1, current_adapter) if current_adapter else None
        log(f" loaded {'(with adapter)' if current_adapter else '(base)'}")

        # Mine pairs using current model
        seeds = seeds_pool[(it-1)*args.n_mining:it*args.n_mining]
        log(f" mining from {len(seeds)} new seeds")
        prompts = [mbpp_prompt(p) for p in seeds]
        greedy_outs = vllm_gen(llm, prompts, max_new=400, lora_req=lora_req)
        hard_idx = [
            i for i, (p, raw) in enumerate(zip(seeds, greedy_outs))
            if not run_python(raw + "\n\n" + "\n".join(p["test_list"]), 8)
        ]
        log(f" greedy: {len(seeds)-len(hard_idx)} pass, {len(hard_idx)} hard")

        # Always start from an empty list so an iteration without hard seeds
        # contributes nothing, rather than failing or re-adding earlier pairs.
        new_pairs = []
        if hard_idx:
            hard_prompts = [mbpp_prompt(seeds[i]) for i in hard_idx]
            sample_outs = vllm_gen(llm, hard_prompts, max_new=400, temperature=0.8,
                                   n=args.attempts_per, lora_req=lora_req)
            for j, i in enumerate(hard_idx):
                attempts = sample_outs[j]
                # vllm_gen returns a bare string per prompt when n == 1;
                # wrap it so the loop below does not iterate its characters.
                if isinstance(attempts, str):
                    attempts = [attempts]
                tests = "\n".join(seeds[i]["test_list"])
                fixed = next((a for a in attempts if run_python(a + "\n\n" + tests, 8)), None)
                if fixed is not None:
                    new_pairs.append({"problem": seeds[i]["prompt"], "tests": seeds[i]["test_list"],
                                      "broken": greedy_outs[i].strip(), "fixed": fixed.strip(),
                                      "iter": it})
        accumulated_pairs.extend(new_pairs)
        log(f" mined {len(new_pairs)} new pairs (cumulative: {len(accumulated_pairs)})")

        # Eval current model on HE
        log(f" eval HE...")
        he_correct = he_pass_count(llm, lora_req)
        log(f" HE iter{it} (pre-train): {he_correct}/{len(he)}")
        iter_results.append({"iter": it, "he_pretrain": he_correct, "cumulative_pairs": len(accumulated_pairs)})

        # Tear down vLLM, train new adapter on accumulated pairs
        del llm
        gc.collect()
        torch.cuda.empty_cache()
        if len(accumulated_pairs) < 5:
            log(f" too few pairs to train, skipping iter {it} training")
            continue

        from transformers import AutoModelForCausalLM, TrainingArguments, Trainer
        from datasets import Dataset as HFDataset
        from peft import LoraConfig, get_peft_model

        log(f" training fresh adapter on {len(accumulated_pairs)} pairs...")
        model = AutoModelForCausalLM.from_pretrained(args.model, torch_dtype=torch.bfloat16, device_map="cuda:0")
        lora_cfg = LoraConfig(r=16, lora_alpha=32, lora_dropout=0.05, bias="none",
                              target_modules=["q_proj", "k_proj", "v_proj", "o_proj"], task_type="CAUSAL_LM")
        model = get_peft_model(model, lora_cfg)
        ds_train = HFDataset.from_list([mk_ex(r) for r in accumulated_pairs])
        targs = TrainingArguments(
            output_dir=f"{args.out_dir}/iter{it}_ckpt", num_train_epochs=2,
            per_device_train_batch_size=1, gradient_accumulation_steps=4,
            learning_rate=1e-4, bf16=True, logging_steps=20,
            save_strategy="no", report_to="none", remove_unused_columns=False, warmup_ratio=0.05,
        )
        Trainer(model=model, args=targs, train_dataset=ds_train, tokenizer=tok).train()
        adapter_dir = f"{args.out_dir}/iter{it}_adapter"
        model.save_pretrained(adapter_dir)
        del model
        gc.collect()
        torch.cuda.empty_cache()
        current_adapter = adapter_dir

        # Re-eval with new adapter to get post-train HE
        log(f" eval post-train HE...")
        llm = LLM(model=args.model, dtype="bfloat16", gpu_memory_utilization=0.85, max_model_len=2048,
                  enable_lora=True, max_lora_rank=16)
        lora_req = LoRARequest(f"iter{it}", it, current_adapter)
        he_correct = he_pass_count(llm, lora_req)
        log(f" HE iter{it} (post-train): {he_correct}/{len(he)}")
        iter_results[-1]["he_posttrain"] = he_correct
        del llm
        gc.collect()
        torch.cuda.empty_cache()

    # Save pairs and results
    with open(f"{args.out_dir}/pairs.jsonl", "w") as fh:
        for r in accumulated_pairs:
            fh.write(json.dumps(r) + "\n")
    result = {"model": args.model, "tag": args.tag, "n_iters": args.n_iters,
              "iter_results": iter_results, "total_pairs": len(accumulated_pairs),
              "elapsed_s": time.time() - T0}
    with open(f"{args.out_dir}/result.json", "w") as fh:
        json.dump(result, fh, indent=2)
    print()
    print("=" * 70)
    print(f" {args.model} — RECURSIVE BOOTSTRAP")
    for r in iter_results:
        pre = r.get("he_pretrain", "-")
        post = r.get("he_posttrain", "-")
        print(f" iter {r['iter']}: cum_pairs={r['cumulative_pairs']} HE_pre={pre} HE_post={post}")
    print(f" Time: {time.time()-T0:.0f}s")
    print("=" * 70)
if __name__ == "__main__":
main()

View file

@ -0,0 +1,129 @@
"""Self-consistency selection: majority vote on N samples WITHOUT oracle access.
Tests if model's self-agreement is a good selector (deployable TTS without test cases)."""
import os, json, time, re, argparse
os.environ.setdefault("HF_HOME", "/workspace/hf")
os.environ["TRANSFORMERS_VERBOSITY"] = "error"
import torch
from datasets import load_dataset
from collections import Counter
T0 = time.time()
def log(m):
    """Print *m* with a prefix showing seconds elapsed since ``T0``; stdout is flushed."""
    seconds = time.time() - T0
    print(f"[{seconds:7.1f}s] {m}", flush=True)
def extract_boxed(text):
    r"""Return the stripped contents of the last ``\boxed{...}`` in *text*.

    Nested braces are balanced while scanning. Returns None when there is no
    ``\boxed{`` or when its closing brace is never reached.
    """
    marker = "\\boxed{"
    pos = text.rfind(marker)
    if pos < 0:
        return None
    begin = pos + len(marker)
    level = 1
    for offset, ch in enumerate(text[begin:]):
        if ch == "{":
            level += 1
        elif ch == "}":
            level -= 1
            if level == 0:
                # offset points at the matching closing brace
                return text[begin:begin + offset].strip()
    return None
def normalize(s):
    """Canonicalise an answer string for voting.

    Lower-cases the text and removes every comma, dollar sign and whitespace
    character. ``None`` is passed through unchanged.
    """
    if s is None:
        return None
    lowered = s.strip().lower()
    return re.sub(r"[,$\s]", "", lowered)
def main():
    """Compare first-sample, majority-vote and oracle accuracy on MATH-500.

    Samples ``--n_samples`` solutions for each of the first 200 MATH-500 test
    problems and scores three selectors against the reference answer: the
    first sample, the most frequent normalised boxed answer
    (self-consistency), and "any sample correct" (oracle). Writes
    ``result.json`` to ``--out_dir`` and prints a summary.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--model", required=True)
    parser.add_argument("--n_samples", type=int, default=16)
    parser.add_argument("--tag", required=True)
    parser.add_argument("--out_dir", required=True)
    args = parser.parse_args()
    os.makedirs(args.out_dir, exist_ok=True)

    from vllm import LLM, SamplingParams
    from transformers import AutoTokenizer

    log(f"loading {args.model}")
    tok = AutoTokenizer.from_pretrained(args.model)
    if tok.pad_token is None:
        tok.pad_token = tok.eos_token
    llm = LLM(model=args.model, dtype="bfloat16", gpu_memory_utilization=0.85, max_model_len=2048)
    log("loaded")

    math500 = list(load_dataset("HuggingFaceH4/MATH-500", split="test"))[:200]

    def build_prompt(item):
        # Use the tokenizer's chat template when it works; otherwise fall
        # back to the plain-text question.
        question = f"Solve. Problem: {item['problem']}\n\nSolution:"
        chat = [{"role": "system", "content": "Math solver. End with \\boxed{answer}."},
                {"role": "user", "content": question}]
        try:
            return tok.apply_chat_template(chat, tokenize=False, add_generation_prompt=True)
        except Exception:
            return question

    prompts = [build_prompt(item) for item in math500]

    log(f"generating {args.n_samples} samples per problem...")
    sampling = SamplingParams(temperature=0.7, top_p=0.95, max_tokens=800, n=args.n_samples)
    gen_start = time.time()
    generations = llm.generate(prompts, sampling, use_tqdm=False)
    log(f" gen in {time.time()-gen_start:.1f}s")

    import sympy
    from sympy.parsing.latex import parse_latex

    def sympy_eq(a, b):
        # Equal if the strings match, if their LaTeX difference simplifies
        # to zero, or if both parse as floats within 1e-6 of each other.
        if a is None or b is None:
            return False
        if a == b:
            return True
        try:
            if sympy.simplify(parse_latex(a) - parse_latex(b)) == 0:
                return True
        except Exception:
            pass
        try:
            if abs(float(a) - float(b)) < 1e-6:
                return True
        except Exception:
            pass
        return False

    # Three metrics:
    # 1. Greedy: take first sample
    # 2. Oracle pass@N: any correct
    # 3. Self-consistency: majority vote on extracted boxed answer (normalize numbers/text)
    greedy_correct = 0
    oracle_correct = 0
    sc_correct = 0
    for item, generation in zip(math500, generations):
        gold = item["answer"]
        preds = [extract_boxed(sample.text) for sample in generation.outputs]
        # Greedy: first sample
        greedy_correct += int(sympy_eq(preds[0], gold))
        # Oracle: any pass
        oracle_correct += int(any(sympy_eq(pred, gold) for pred in preds))
        # Self-consistency: majority vote on normalized answer
        votes = Counter(normalize(pred) for pred in preds if pred is not None)
        if not votes:
            continue
        winner = votes.most_common(1)[0][0]
        # Score the first original prediction that has the winning normalized form
        representative = next((pred for pred in preds if pred and normalize(pred) == winner), None)
        if representative is not None and sympy_eq(representative, gold):
            sc_correct += 1

    total = len(math500)
    result = {
        "model": args.model, "n_samples": args.n_samples,
        "greedy_first": greedy_correct,
        "oracle_pass_at_N": oracle_correct,
        "self_consistency": sc_correct,
        "n": total,
        "elapsed_s": time.time() - T0,
    }
    with open(f"{args.out_dir}/result.json", "w") as fh:
        json.dump(result, fh, indent=2)
    print()
    print("=" * 70)
    print(f" {args.model} — SELF-CONSISTENCY vs ORACLE on MATH-500 (n={args.n_samples})")
    print(f" First sample (greedy-like): {greedy_correct}/{len(math500)} ({100*greedy_correct/len(math500):.1f}%)")
    print(f" Self-consistency (vote): {sc_correct}/{len(math500)} ({100*sc_correct/len(math500):.1f}%)")
    print(f" Oracle (any-pass): {oracle_correct}/{len(math500)} ({100*oracle_correct/len(math500):.1f}%)")
    if oracle_correct > greedy_correct:
        sc_recovery = 100*(sc_correct - greedy_correct)/(oracle_correct - greedy_correct)
    else:
        sc_recovery = 0
    print(f" SC recovers {sc_recovery:.0f}% of oracle-greedy gap")
    print(f" Time: {time.time()-T0:.0f}s")
    print("=" * 70)
if __name__ == "__main__":
main()

View file

@ -0,0 +1,236 @@
"""Self-correction recipe for CODE. Same pattern as math sc_v2 (which gave +5 recovery).
Pipeline:
1. MBPP-train problems (374 sanitized + extended).
2. Greedy attempt. If it passes → save as a right→stays-right positive.
3. If it fails → prompt with "Wait, let me reconsider" + sample 4 at temp=0.8.
   If any pass → mine a (problem, wrong, reflection, correct) self-correction trace.
4. Train on mixed dataset.
5. Eval HE + MBPP.
Mix teaches model: commit to right answers, fix wrong ones.
"""
import os, json, time, re, subprocess, tempfile, argparse, gc, random
os.environ.setdefault("HF_HOME", "/workspace/hf")
os.environ["TRANSFORMERS_VERBOSITY"] = "error"
import torch
from datasets import load_dataset
T0 = time.time()
def log(m):
    """Write *m* to stdout, prefixed with the time elapsed since ``T0`` in seconds."""
    prefix = f"[{time.time()-T0:7.1f}s]"
    print(f"{prefix} {m}", flush=True)
RECONSIDER_TAG = "\n\n# Wait — that doesn't look right. Let me reconsider:\n\n"
def run_python(code, timeout=8):
    """Execute *code* with ``python3`` in a subprocess and report whether it passed.

    Args:
        code: Python source text; written to a temporary ``.py`` file that is
            deleted after the run.
        timeout: Seconds to wait before giving up on the process.

    Returns:
        True if the process exits with status 0; False if it exits non-zero
        or does not finish within *timeout*.
    """
    # Source files are read by Python as UTF-8, so write them as UTF-8
    # regardless of the locale's default encoding.
    handle = tempfile.NamedTemporaryFile("w", suffix=".py", delete=False, encoding="utf-8")
    path = handle.name
    try:
        with handle:
            handle.write(code)
        # Output is discarded, not captured: only the exit status matters,
        # and decoding arbitrary child output could raise in the parent.
        completed = subprocess.run(
            ["python3", path],
            stdin=subprocess.DEVNULL,
            stdout=subprocess.DEVNULL,
            stderr=subprocess.DEVNULL,
            timeout=timeout,
            cwd="/tmp",
        )
        return completed.returncode == 0
    except subprocess.TimeoutExpired:
        return False
    finally:
        # Removing the temp file is best-effort; ignore filesystem errors only.
        try:
            os.unlink(path)
        except OSError:
            pass
def vllm_gen(llm, prompts, max_new=400, temperature=0.0, n=1, prefill_texts=None):
    """Generate completions for *prompts* with vLLM, optionally after a prefill.

    When ``prefill_texts`` is given, each prompt is concatenated with its
    prefill text before generation (paired with ``zip``). ``top_p`` is 0.95
    when ``temperature > 0`` and 1.0 otherwise.

    Returns:
        For ``n == 1`` a list with one string per prompt; otherwise a list
        of lists holding the ``n`` sampled strings for each prompt.
    """
    from vllm import SamplingParams

    nucleus = 0.95 if temperature > 0 else 1.0
    params = SamplingParams(
        temperature=temperature,
        top_p=nucleus,
        max_tokens=max_new,
        n=n,
        stop=["\nclass Test", "\nif __name__", "\n\nprint", "\nassert "],
    )
    if prefill_texts is not None:
        # Each prompt is concatenated with prefill text
        inputs = [prompt + prefill for prompt, prefill in zip(prompts, prefill_texts)]
    else:
        inputs = prompts
    results = llm.generate(inputs, params, use_tqdm=False)
    if n == 1:
        return [res.outputs[0].text for res in results]
    return [[cand.text for cand in res.outputs] for res in results]
def he_prompt(p):
    """Return the text to complete for a HumanEval record: its ``prompt`` field."""
    text = p["prompt"]
    return text
def mbpp_prompt(p):
    """Build the MBPP generation prompt: a ``# Task`` line and one ``# `` line per test."""
    parts = [f"# Task: {p['prompt']}", "# Tests:", "# " + "\n# ".join(p["test_list"])]
    return "\n".join(parts) + "\n\n"
def main():
    # Self-correction recipe for code, end to end:
    #   base eval (HumanEval + MBPP-test) -> greedy mining on MBPP-train ->
    #   self-correction sampling on the failures -> LoRA training on the
    #   mixed dataset -> trained eval -> result.json + printed summary.
    # All outputs go to /workspace/code_sc/<tag>.
    ap = argparse.ArgumentParser()
    ap.add_argument("--model", required=True)
    ap.add_argument("--n_mining", type=int, default=300)
    ap.add_argument("--max_self_corrections", type=int, default=80)
    ap.add_argument("--max_positives", type=int, default=80)
    ap.add_argument("--tag", required=True)
    args = ap.parse_args()
    out_dir = f"/workspace/code_sc/{args.tag}"
    os.makedirs(out_dir, exist_ok=True)
    # Seeds the shuffles below (seed selection, example capping, training order).
    random.seed(42)
    from vllm import LLM
    from transformers import AutoTokenizer
    log(f"loading {args.model} into vLLM")
    tok = AutoTokenizer.from_pretrained(args.model)
    # Padding in mk_ex needs a pad token id; fall back to EOS when none is set.
    if tok.pad_token is None: tok.pad_token = tok.eos_token
    llm = LLM(model=args.model, dtype="bfloat16", gpu_memory_utilization=0.85, max_model_len=2048)
    log(f" loaded")
    he = list(load_dataset("openai_humaneval", split="test"))
    # Evaluation uses the first 100 problems of the sanitized MBPP test split.
    mbpp_test = list(load_dataset("mbpp", "sanitized", split="test"))[:100]
    mbpp_full = list(load_dataset("mbpp", split="train"))
    random.shuffle(mbpp_full)
    seeds = []
    for p in mbpp_full[:args.n_mining]:
        # The problem statement is read from "prompt", falling back to "text".
        prompt_text = p.get("prompt") or p.get("text", "")
        if prompt_text and p.get("test_list"):
            seeds.append({"prompt": prompt_text, "test_list": p["test_list"]})
    log(f" HE: {len(he)}, MBPP-test: {len(mbpp_test)}, mining seeds: {len(seeds)}")
    # --- BASE eval
    log("=== BASE eval ===")
    he_outs = vllm_gen(llm, [he_prompt(p) for p in he], max_new=400)
    # HumanEval: prompt + completion, then the problem's tests and check(entry_point).
    base_he = sum(1 for p, raw in zip(he, he_outs)
                  if run_python(p["prompt"] + raw + "\n\n" + p["test"] + f"\n\ncheck({p['entry_point']})", 10))
    log(f" HE base: {base_he}/{len(he)}")
    mbpp_outs = vllm_gen(llm, [mbpp_prompt(p) for p in mbpp_test], max_new=400)
    # MBPP: the completion alone followed by the problem's assert statements.
    base_mbpp = sum(1 for p, raw in zip(mbpp_test, mbpp_outs)
                    if run_python(raw + "\n\n" + "\n".join(p["test_list"]), 10))
    log(f" MBPP base: {base_mbpp}/{len(mbpp_test)}")
    # --- Mine: greedy on all seeds
    log(f"=== mining: greedy attempt on {len(seeds)} seeds ===")
    t0 = time.time()
    greedy_outs = vllm_gen(llm, [mbpp_prompt(p) for p in seeds], max_new=400)
    log(f" greedy gen in {time.time()-t0:.1f}s")
    t1 = time.time()
    right = [] # greedy correct (positives)
    wrong = [] # greedy wrong (candidates for self-correction)
    for p, raw in zip(seeds, greedy_outs):
        test_code = raw + "\n\n" + "\n".join(p["test_list"])
        if run_python(test_code, timeout=8):
            right.append({"problem": p["prompt"], "tests": p["test_list"], "solution": raw.strip()})
        else:
            wrong.append({"problem": p["prompt"], "tests": p["test_list"], "wrong": raw.strip()})
    log(f" verify: {len(right)} greedy-correct, {len(wrong)} hard")
    # --- For wrong: prefill wrong + reconsider tag, sample 4 attempts
    log(f"=== self-correction sampling on {len(wrong)} hard problems ===")
    sc_pairs = []
    if wrong:
        base_prompts = [mbpp_prompt({"prompt": w["problem"], "test_list": w["tests"]}) for w in wrong]
        prefills = [w["wrong"] + RECONSIDER_TAG for w in wrong]
        # Generate 4 attempts each via temperature
        t0 = time.time()
        sc_outs = vllm_gen(llm, base_prompts, max_new=400, temperature=0.8, n=4, prefill_texts=prefills)
        log(f" sc gen in {time.time()-t0:.1f}s")
        t1 = time.time()
        for w, attempts in zip(wrong, sc_outs):
            for a in attempts:
                # Only the text generated after the reconsider tag is tested.
                test_code = a + "\n\n" + "\n".join(w["tests"])
                if run_python(test_code, timeout=8):
                    # The training target is the whole trace: wrong attempt, tag, then the fix.
                    full_trace = w["wrong"] + RECONSIDER_TAG + a.strip()
                    sc_pairs.append({"problem": w["problem"], "tests": w["tests"],
                                     "full_trace": full_trace})
                    break # one per problem
    log(f" sc verify in {time.time()-t1:.1f}s — {len(sc_pairs)} self-correction traces")
    # Cap and sample
    random.shuffle(right); random.shuffle(sc_pairs)
    right = right[:args.max_positives]
    sc_pairs = sc_pairs[:args.max_self_corrections]
    log(f"=== final: {len(sc_pairs)} self-correction + {len(right)} right→stays-right = {len(sc_pairs)+len(right)} examples ===")
    # Returning here means no jsonl files and no result.json are written.
    if len(sc_pairs) + len(right) < 10:
        log("too few examples — exiting"); return
    with open(f"{out_dir}/sc_pairs.jsonl", "w") as fh:
        for r in sc_pairs: fh.write(json.dumps(r) + "\n")
    with open(f"{out_dir}/positives.jsonl", "w") as fh:
        for r in right: fh.write(json.dumps(r) + "\n")
    # --- Train LoRA on MIXED dataset
    log("=== TRAINING ===")
    # Drop the vLLM engine before the training model is loaded on the same GPU.
    del llm; gc.collect(); torch.cuda.empty_cache()
    from transformers import AutoModelForCausalLM, TrainingArguments, Trainer
    from datasets import Dataset as HFDataset
    from peft import LoraConfig, get_peft_model
    train_examples = []
    for r in sc_pairs:
        train_examples.append({"problem": r["problem"], "tests": r["tests"], "target": r["full_trace"]})
    for r in right:
        train_examples.append({"problem": r["problem"], "tests": r["tests"], "target": r["solution"]})
    random.shuffle(train_examples)
    def mk_ex(r):
        # Build one fixed-length training example. The prompt text matches
        # the format produced by mbpp_prompt.
        user = f"# Task: {r['problem']}\n# Tests:\n# " + "\n# ".join(r['tests']) + "\n\n"
        target = r["target"]
        full = user + target
        full_ids = tok(full, add_special_tokens=False)["input_ids"]
        user_ids = tok(user, add_special_tokens=False)["input_ids"]
        MAX = 1280
        # Truncate to MAX tokens; shorter sequences are right-padded below.
        full_ids = full_ids[:MAX]
        labels = list(full_ids)
        # Mask the prompt tokens so loss is computed on the target only.
        n_user = min(len(user_ids), len(labels))
        for i in range(n_user): labels[i] = -100
        pad = MAX - len(full_ids)
        return {"input_ids": full_ids + [tok.pad_token_id]*pad,
                "attention_mask": [1]*len(full_ids) + [0]*pad,
                "labels": labels + [-100]*pad}
    model = AutoModelForCausalLM.from_pretrained(args.model, torch_dtype=torch.bfloat16, device_map="cuda:0")
    lora_cfg = LoraConfig(r=16, lora_alpha=32, lora_dropout=0.05, bias="none",
                          target_modules=["q_proj", "k_proj", "v_proj", "o_proj"], task_type="CAUSAL_LM")
    model = get_peft_model(model, lora_cfg)
    ds_train = HFDataset.from_list([mk_ex(r) for r in train_examples])
    targs = TrainingArguments(
        output_dir=f"{out_dir}/ckpt", num_train_epochs=2,
        per_device_train_batch_size=1, gradient_accumulation_steps=4,
        learning_rate=1e-4, bf16=True, logging_steps=20,
        save_strategy="no", report_to="none", remove_unused_columns=False, warmup_ratio=0.05,
    )
    Trainer(model=model, args=targs, train_dataset=ds_train, tokenizer=tok).train()
    log("training done")
    adapter_dir = f"{out_dir}/adapter"
    model.save_pretrained(adapter_dir)
    del model; gc.collect(); torch.cuda.empty_cache()
    # --- TRAINED eval
    from vllm import LLM
    from vllm.lora.request import LoRARequest
    llm = LLM(model=args.model, dtype="bfloat16", gpu_memory_utilization=0.85, max_model_len=2048,
              enable_lora=True, max_lora_rank=16)
    lora_req = LoRARequest("tf_adapter", 1, adapter_dir)
    from vllm import SamplingParams
    # NOTE(review): the base eval above goes through vllm_gen (max_new=400 and
    # four stop strings), while this eval uses max_tokens=500 and two stop
    # strings, so base and trained numbers come from different decoding
    # settings. Confirm this is intended.
    sp = SamplingParams(temperature=0, max_tokens=500, stop=["\nclass Test", "\nif __name__"])
    log("=== TRAINED eval ===")
    he_outs = [o.outputs[0].text for o in llm.generate([he_prompt(p) for p in he], sp, lora_request=lora_req, use_tqdm=False)]
    tr_he = sum(1 for p, raw in zip(he, he_outs)
                if run_python(p["prompt"] + raw + "\n\n" + p["test"] + f"\n\ncheck({p['entry_point']})", 10))
    mbpp_outs = [o.outputs[0].text for o in llm.generate([mbpp_prompt(p) for p in mbpp_test], sp, lora_request=lora_req, use_tqdm=False)]
    tr_mbpp = sum(1 for p, raw in zip(mbpp_test, mbpp_outs)
                  if run_python(raw + "\n\n" + "\n".join(p["test_list"]), 10))
    result = {
        "model": args.model,
        "n_sc": len(sc_pairs), "n_positives": len(right), "n_total": len(train_examples),
        "humaneval": {"base": base_he, "trained": tr_he, "delta": tr_he-base_he, "n": len(he)},
        "mbpp": {"base": base_mbpp, "trained": tr_mbpp, "delta": tr_mbpp-base_mbpp, "n": len(mbpp_test)},
        "elapsed_s": time.time()-T0,
    }
    with open(f"{out_dir}/result.json", "w") as fh: json.dump(result, fh, indent=2)
    print()
    print("=" * 70)
    print(f" {args.model} — CODE SELF-CORRECTION ({len(sc_pairs)} sc + {len(right)} positives)")
    print(f" HumanEval: base={base_he}/{len(he)} trained={tr_he}/{len(he)} Δ={tr_he-base_he:+d}")
    print(f" MBPP: base={base_mbpp}/{len(mbpp_test)} trained={tr_mbpp}/{len(mbpp_test)} Δ={tr_mbpp-base_mbpp:+d}")
    print(f" Time: {time.time()-T0:.0f}s")
    print("=" * 70)
if __name__ == "__main__":
main()

View file

@ -0,0 +1,256 @@
"""Self-correction recipe FIXED: mix wrong→fix triples WITH right→stays-right.
Previous failure: training only on wrong→fix taught model to over-doubt itself,
causing -230 regression on Qwen3-4B-Base.
Fix:
1. Use existing wrong→fix triples (mined yesterday).
2. Add an equal/greater number of right→stays-right examples (greedy was correct).
3. Train on the mixed dataset → model learns WHEN to self-correct.
4. Eval on MATH-500.
Uses vLLM on H100 for fast generation.
"""
import os, json, time, re, argparse, gc, random
os.environ.setdefault("HF_HOME", "/workspace/hf")
os.environ.setdefault("HF_HUB_ENABLE_HF_TRANSFER", "1")
os.environ["TRANSFORMERS_VERBOSITY"] = "error"
import torch
from datasets import load_dataset
import sympy
from sympy.parsing.latex import parse_latex
T0 = time.time()


def log(m):
    """Print ``m`` prefixed with the seconds elapsed since ``T0``, flushing stdout."""
    elapsed = time.time() - T0
    print(f"[{elapsed:7.1f}s] {m}", flush=True)
# Prompt template for one MATH problem: ``{problem}`` is substituted via str.format,
# and the doubled braces render as a literal ``\boxed{...}`` in the final prompt.
SOLVE_PROMPT = """Solve this competition math problem. Show your reasoning, then put the final answer in \\boxed{{...}}.
Problem: {problem}
Solution:"""
# Marker text that separates a wrong attempt from its correction in a self-correction trace.
RECONSIDER_TAG = "\n\nWait, let me reconsider — I think there's an error above.\n\n"
def extract_boxed(text):
    """Return the stripped contents of the LAST ``\\boxed{...}`` in ``text``.

    Nested braces are balanced. Returns ``None`` when there is no ``\\boxed{``
    or when its closing brace is never found.
    """
    marker = "\\boxed{"
    pos = text.rfind(marker)
    if pos < 0:
        return None
    begin = pos + len(marker)
    open_braces = 1
    for offset, ch in enumerate(text[begin:]):
        if ch == "{":
            open_braces += 1
        elif ch == "}":
            open_braces -= 1
            if open_braces == 0:
                # ``offset`` points at the matching closing brace.
                return text[begin:begin + offset].strip()
    return None
def normalize(s):
    """Canonicalize an answer string for comparison (``None`` passes through).

    Strips whitespace and a leading/trailing ``$``, unwraps ``\\text{}`` and
    ``\\mbox{}``, removes commas between digits, and drops ``\\left``,
    ``\\right`` and degree markers.
    """
    if s is None:
        return None
    out = re.sub(r"^\$|\$$", "", s.strip()).strip()
    for wrapper in (r"\\text\{([^}]*)\}", r"\\mbox\{([^}]*)\}"):
        out = re.sub(wrapper, r"\1", out)
    out = re.sub(r"(?<=\d),(?=\d)", "", out)
    for noise in ("\\left", "\\right", "^\\circ", "^{\\circ}"):
        out = out.replace(noise, "")
    return out.strip()
def sympy_equal(a, b):
    """Return True when answers ``a`` and ``b`` match after normalization.

    Tries, in order: exact string equality, symbolic equality of the parsed
    LaTeX (difference simplifies to 0), and float equality within 1e-6.
    Parsing/conversion failures at any stage are ignored and fall through to
    the next stage; ``None`` on either side is never equal.
    """
    if a is None or b is None:
        return False
    lhs, rhs = normalize(a), normalize(b)
    if lhs == rhs:
        return True
    try:
        expr_l = parse_latex(lhs)
        expr_r = parse_latex(rhs)
        if sympy.simplify(expr_l - expr_r) == 0:
            return True
    except Exception:
        pass  # best effort: unparsable LaTeX just means "not proven equal"
    try:
        if abs(float(lhs) - float(rhs)) < 1e-6:
            return True
    except Exception:
        pass  # not plain numbers
    return False
def vllm_gen(llm, prompts, max_new=600, temperature=0.0, n=1):
    """Generate completions for ``prompts`` with ``llm.generate``.

    Returns a flat list of texts when ``n == 1``, otherwise one list of ``n``
    texts per prompt. ``top_p`` is 0.95 when sampling (``temperature > 0``)
    and 1.0 otherwise.
    """
    from vllm import SamplingParams
    nucleus = 0.95 if temperature > 0 else 1.0
    params = SamplingParams(temperature=temperature, top_p=nucleus,
                            max_tokens=max_new, n=n)
    results = llm.generate(prompts, params, use_tqdm=False)
    if n != 1:
        return [[cand.text for cand in res.outputs] for res in results]
    return [res.outputs[0].text for res in results]
def math500_eval(gen_func, label):
    """Score ``gen_func`` on the MATH-500 test split.

    ``gen_func(prompts, max_new=...)`` must return one completion per prompt;
    ``label`` only appears in the log line. Returns ``(n_correct, n_problems)``.
    """
    problems = list(load_dataset("HuggingFaceH4/MATH-500", split="test"))
    log(f" eval MATH-500 [{label}] ({len(problems)})")
    prompts = [SOLVE_PROMPT.format(problem=item["problem"]) for item in problems]
    started = time.time()
    completions = gen_func(prompts, max_new=800)
    log(f" gen done in {time.time()-started:.1f}s")
    n_right = sum(
        1
        for item, text in zip(problems, completions)
        if sympy_equal(extract_boxed(text), item["answer"])
    )
    return n_right, len(problems)
def make_train_example(problem, solution, tok):
    """Build one fixed-length (1536-token) causal-LM training row.

    The text is ``SOLVE_PROMPT`` + " " + ``solution`` tokenized without special
    tokens and truncated to 1536 ids. Labels are -100 over the prompt prefix
    and over padding, so only solution tokens contribute to the loss.
    """
    MAX = 1536
    prompt = SOLVE_PROMPT.format(problem=problem)
    # NOTE(review): the masked length is taken from tokenizing prompt + " " on its
    # own; this assumes that tokenization is a prefix of the full one — confirm.
    prompt_ids = tok(prompt + " ", add_special_tokens=False)["input_ids"]
    seq = tok(prompt + " " + solution, add_special_tokens=False)["input_ids"][:MAX]
    n_masked = min(len(prompt_ids), len(seq))
    labels = [-100] * n_masked + list(seq[n_masked:])
    n_pad = MAX - len(seq)
    return {
        "input_ids": seq + [tok.pad_token_id] * n_pad,
        "attention_mask": [1] * len(seq) + [0] * n_pad,
        "labels": labels + [-100] * n_pad,
    }
def main():
    """Run the mixed self-correction experiment end to end.

    Steps: greedy MATH-500 baseline with vLLM, load previously mined wrong→fix
    triples from ``--wrong_fix_pairs``, mine up to ``--n_positives``
    right→stays-right examples from the MATH train split, LoRA fine-tune on the
    shuffled mixture, then re-evaluate with the adapter attached. Writes
    ``positives.jsonl``, the adapter and ``result.json`` under
    ``/workspace/math500_sc_v2/<tag>``.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--model", required=True)
    ap.add_argument("--wrong_fix_pairs", required=True, help="Existing wrong→fix triples jsonl from prior run")
    ap.add_argument("--n_positives", type=int, default=100, help="Number of right→stays-right examples to mine")
    ap.add_argument("--tag", required=True)
    args = ap.parse_args()
    out_dir = f"/workspace/math500_sc_v2/{args.tag}"
    os.makedirs(out_dir, exist_ok=True)
    from vllm import LLM
    from transformers import AutoTokenizer
    log(f"loading {args.model} into vLLM")
    tok = AutoTokenizer.from_pretrained(args.model)
    if tok.pad_token is None:
        tok.pad_token = tok.eos_token
    llm = LLM(model=args.model, dtype="bfloat16", gpu_memory_utilization=0.85, max_model_len=2048)
    log(f" loaded")
    # --- BASE eval
    log("=== BASE eval ===")
    base_c, base_n = math500_eval(lambda P, max_new=800: vllm_gen(llm, P, max_new=max_new), "BASE")
    log(f" BASE: {base_c}/{base_n} ({100*base_c/base_n:.1f}%)")
    # --- Load existing wrong→fix triples
    # Context manager closes the handle; blank lines (e.g. a trailing newline
    # added by an editor) are skipped instead of crashing json.loads.
    with open(args.wrong_fix_pairs, encoding="utf-8") as fh:
        wrong_fix = [json.loads(line) for line in fh if line.strip()]
    log(f" loaded {len(wrong_fix)} wrong→fix triples")
    # --- Mine right→stays-right positives from MATH-train
    log(f"=== mining {args.n_positives} right→stays-right positives ===")
    train_ds = []
    for cfg in ["algebra","counting_and_probability","geometry","intermediate_algebra","number_theory","prealgebra","precalculus"]:
        try:
            sub = list(load_dataset("EleutherAI/hendrycks_math", cfg, split="train"))
            train_ds.extend(sub)
        except Exception as e:
            # Still best-effort, but report it: a missing subject silently
            # changes the pool the positives are drawn from.
            log(f" warn: failed to load {cfg}: {e}")
    random.seed(42); random.shuffle(train_ds)
    log(f" {len(train_ds)} train problems available")

    def gold_of(p):
        # The gold answer is the boxed expression of the reference solution;
        # it is only used for verification, never as training text.
        return extract_boxed(p.get("solution", ""))

    positives = []
    cursor = 0
    while len(positives) < args.n_positives and cursor < len(train_ds):
        # Fill a batch of up to 64 problems that have a parseable gold answer.
        batch = []
        while len(batch) < 64 and cursor < len(train_ds):
            p = train_ds[cursor]; cursor += 1
            g = gold_of(p)
            if g is not None:
                batch.append({"problem": p["problem"], "gold": g})
        if not batch:
            break
        prompts = [SOLVE_PROMPT.format(problem=p["problem"]) for p in batch]
        outs = vllm_gen(llm, prompts, max_new=600, temperature=0.0)
        for p, raw in zip(batch, outs):
            if sympy_equal(extract_boxed(raw), p["gold"]):
                # right→stays-right: model wrote a clean correct solution
                positives.append({"problem": p["problem"], "solution": raw.strip()})
                if len(positives) >= args.n_positives:
                    break
        log(f" positives: {len(positives)} / {args.n_positives}")
    log(f"=== final dataset: {len(wrong_fix)} wrong→fix + {len(positives)} right→stays-right = {len(wrong_fix)+len(positives)} examples ===")
    with open(f"{out_dir}/positives.jsonl", "w") as fh:
        for p in positives:
            fh.write(json.dumps(p) + "\n")
    # --- Build training data
    train_examples = []
    # wrong→fix as full self-correction traces
    for r in wrong_fix:
        train_examples.append({
            "problem": r["problem"],
            "solution": r["full_solution"],  # already includes wrong + RECONSIDER_TAG + correct
        })
    # right→stays-right as plain solutions (no "wait" — model commits)
    for r in positives:
        train_examples.append({
            "problem": r["problem"],
            "solution": r["solution"],
        })
    random.shuffle(train_examples)
    # --- Train LoRA
    log("=== TRAINING ===")
    # Drop the vLLM engine before loading the HF model for training.
    del llm; gc.collect(); torch.cuda.empty_cache()
    from transformers import AutoModelForCausalLM, TrainingArguments, Trainer
    from datasets import Dataset as HFDataset
    from peft import LoraConfig, get_peft_model
    model = AutoModelForCausalLM.from_pretrained(args.model, torch_dtype=torch.bfloat16, device_map="cuda:0")
    lora_cfg = LoraConfig(r=16, lora_alpha=32, lora_dropout=0.05, bias="none",
                          target_modules=["q_proj", "k_proj", "v_proj", "o_proj"], task_type="CAUSAL_LM")
    model = get_peft_model(model, lora_cfg)
    ds_train = HFDataset.from_list([make_train_example(r["problem"], r["solution"], tok) for r in train_examples])
    targs = TrainingArguments(
        output_dir=f"{out_dir}/ckpt", num_train_epochs=2,
        per_device_train_batch_size=1, gradient_accumulation_steps=4,
        learning_rate=1e-4, bf16=True, logging_steps=20,
        save_strategy="no", report_to="none", remove_unused_columns=False, warmup_ratio=0.05,
    )
    Trainer(model=model, args=targs, train_dataset=ds_train, tokenizer=tok).train()
    log("training done")
    adapter_dir = f"{out_dir}/adapter"
    model.save_pretrained(adapter_dir)
    del model; gc.collect(); torch.cuda.empty_cache()
    # --- TRAINED eval
    from vllm import LLM
    from vllm.lora.request import LoRARequest
    llm = LLM(model=args.model, dtype="bfloat16", gpu_memory_utilization=0.85, max_model_len=2048,
              enable_lora=True, max_lora_rank=16)
    lora_req = LoRARequest("tf_adapter", 1, adapter_dir)
    from vllm import SamplingParams

    def gen_trained(prompts, max_new=800):
        # Greedy decoding with the freshly trained adapter attached.
        sp = SamplingParams(temperature=0, max_tokens=max_new)
        return [o.outputs[0].text for o in llm.generate(prompts, sp, lora_request=lora_req, use_tqdm=False)]

    log("=== TRAINED eval ===")
    tr_c, tr_n = math500_eval(gen_trained, "TRAINED")
    log(f" TRAINED: {tr_c}/{tr_n} ({100*tr_c/tr_n:.1f}%)")
    result = {
        "model": args.model,
        "n_wrong_fix": len(wrong_fix),
        "n_positives": len(positives),
        "n_total": len(train_examples),
        "base": base_c, "trained": tr_c, "n": tr_n,
        "delta": tr_c - base_c,
        "elapsed_s": time.time() - T0,
    }
    with open(f"{out_dir}/result.json", "w") as fh:
        json.dump(result, fh, indent=2)
    print()
    print("=" * 70)
    print(f" {args.model} — SELF-CORRECTION V2 (mixed: {len(wrong_fix)} wrong→fix + {len(positives)} right→stays)")
    print(f" MATH-500: base={base_c}/{tr_n} ({100*base_c/tr_n:.1f}%) trained={tr_c}/{tr_n} ({100*tr_c/tr_n:.1f}%) Δ={tr_c-base_c:+d}")
    print(f" Time: {time.time()-T0:.0f}s")
    print("=" * 70)
# Script entry point: run the experiment only when this file is executed directly.
if __name__ == "__main__":
    main()

View file

@ -0,0 +1,286 @@
"""TinyForge-Zero self-correction for MATH-500.
Recipe:
1. Sample real MATH-train problem (no human solutions used).
2. Model greedy-attempt → wrong. Capture as wrong_attempt.
3. Re-prompt model: {problem} + wrong_attempt + "Wait, let me reconsider:"
Sample 4 completions at temp=0.8.
4. If any completion gets correct boxed answer (verified via sympy against gold),
MINE a triple: (problem, wrong_attempt, reflection+correct).
5. Train LoRA on full traces → model learns to catch + fix own errors.
6. Eval on MATH-500 (test). Model naturally produces self-correction.
Key difference from rejection-sampling: training data teaches the FIX,
not just the answer. Same broken→fixed structure that worked for code.
"""
import os, json, time, re, argparse, random
os.environ.setdefault("HF_HOME", "/workspace/hf")
os.environ.setdefault("HF_HUB_ENABLE_HF_TRANSFER", "1")
os.environ["TRANSFORMERS_VERBOSITY"] = "error"
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, Trainer
from datasets import load_dataset, Dataset as HFDataset
from peft import LoraConfig, get_peft_model
import sympy
from sympy.parsing.latex import parse_latex
T0 = time.time()


def log(m):
    """Print ``m`` with an elapsed-seconds prefix (relative to ``T0``), flushed immediately."""
    since_start = time.time() - T0
    print(f"[{since_start:7.1f}s] {m}", flush=True)
# Prompt template for one MATH problem: ``{problem}`` is substituted via str.format,
# and the doubled braces render as a literal ``\boxed{...}`` in the final prompt.
SOLVE_PROMPT = """Solve this competition math problem. Show your reasoning, then put the final answer in \\boxed{{...}}.
Problem: {problem}
Solution:"""
# Text appended after a wrong attempt to make the model continue with a correction;
# it also joins the wrong attempt and the fix in the mined training traces.
RECONSIDER_TAG = "\n\nWait, let me reconsider — I think there's an error above.\n\n"
def extract_boxed(text):
    """Return the stripped contents of the LAST ``\\boxed{...}`` in ``text``.

    Braces inside the box are balanced. Returns ``None`` if there is no
    ``\\boxed{`` or the box is never closed.
    """
    opener = "\\boxed{"
    found_at = text.rfind(opener)
    if found_at < 0:
        return None
    content_start = found_at + len(opener)
    level = 1
    for k in range(content_start, len(text)):
        ch = text[k]
        if ch == "{":
            level += 1
        elif ch == "}":
            level -= 1
        if level == 0:
            # ``k`` is the index of the matching closing brace.
            return text[content_start:k].strip()
    return None
def normalize(s):
    """Canonicalize an answer string for comparison (``None`` passes through).

    Strips whitespace and a leading/trailing ``$``, unwraps ``\\text{}`` and
    ``\\mbox{}``, removes commas between digits, and drops ``\\left``,
    ``\\right`` and degree markers.
    """
    if s is None:
        return None
    cleaned = s.strip()
    cleaned = re.sub(r"^\$|\$$", "", cleaned).strip()
    # (pattern, replacement) pairs applied in this exact order.
    rewrites = (
        (r"\\text\{([^}]*)\}", r"\1"),
        (r"\\mbox\{([^}]*)\}", r"\1"),
        (r"(?<=\d),(?=\d)", ""),
    )
    for pattern, replacement in rewrites:
        cleaned = re.sub(pattern, replacement, cleaned)
    for token in ("\\left", "\\right", "^\\circ", "^{\\circ}"):
        cleaned = cleaned.replace(token, "")
    return cleaned.strip()
def sympy_equal(a, b):
    """Return True when answers ``a`` and ``b`` match after normalization.

    Tries, in order: exact string equality, symbolic equality of the parsed
    LaTeX (difference simplifies to 0), and float equality within 1e-6.
    Failures in a stage are ignored and the next stage is tried; ``None`` on
    either side is never equal.
    """
    if a is None or b is None:
        return False
    lhs, rhs = normalize(a), normalize(b)
    if lhs == rhs:
        return True
    try:
        expr_l = parse_latex(lhs)
        expr_r = parse_latex(rhs)
        if sympy.simplify(expr_l - expr_r) == 0:
            return True
    except Exception:
        pass  # best effort: unparsable LaTeX just means "not proven equal"
    try:
        if abs(float(lhs) - float(rhs)) < 1e-6:
            return True
    except Exception:
        pass  # not plain numbers
    return False
def chat_messages(user_content):
    """Return the two-message chat (fixed system prompt + ``user_content``) used for every request."""
    system_turn = {
        "role": "system",
        "content": "You are a careful math problem solver. If you make a mistake, catch it and correct yourself.",
    }
    user_turn = {"role": "user", "content": user_content}
    return [system_turn, user_turn]
def gen_batch(model, tok, prompts, max_new=600, temperature=0.0, batch=16, prefill_texts=None):
    """Generate one completion per prompt, processing ``batch`` prompts at a time.

    Each prompt is wrapped with the chat template (falling back to the raw
    prompt if templating raises). If ``prefill_texts`` is given, each entry is
    appended to its templated prompt so the model continues from that text.
    Sampling is used only when ``temperature > 0``. Returns the decoded new
    tokens (prompt excluded) in input order.
    """
    sampling = temperature > 0
    decoded = []
    for lo in range(0, len(prompts), batch):
        group = prompts[lo:lo+batch]
        if prefill_texts:
            group_prefills = prefill_texts[lo:lo+batch]
        else:
            group_prefills = [""] * len(group)
        texts = []
        for prompt, prefill in zip(group, group_prefills):
            msgs = chat_messages(prompt)
            try:
                rendered = tok.apply_chat_template(msgs, tokenize=False, add_generation_prompt=True)
            except Exception:
                rendered = prompt
            texts.append(rendered + prefill)
        enc = tok(texts, return_tensors="pt", padding=True, truncation=True, max_length=2000).to(model.device)
        with torch.no_grad():
            generated = model.generate(
                **enc,
                max_new_tokens=max_new,
                do_sample=sampling,
                temperature=temperature if sampling else 1.0,
                top_p=0.95,
                pad_token_id=tok.eos_token_id,
            )
        # Everything past the (padded) prompt length is newly generated text.
        prompt_len = enc.input_ids.shape[1]
        decoded.extend(
            tok.decode(generated[row][prompt_len:], skip_special_tokens=True)
            for row in range(generated.size(0))
        )
    return decoded
def math500_eval(model, tok, n=500, batch=16):
    """Greedy-decode the first ``n`` MATH-500 test problems and return ``(n_correct, n_problems)``."""
    problems = list(load_dataset("HuggingFaceH4/MATH-500", split="test"))[:n]
    log(f" eval on MATH-500 ({len(problems)} problems)")
    prompts = [SOLVE_PROMPT.format(problem=item["problem"]) for item in problems]
    completions = gen_batch(model, tok, prompts, max_new=800, temperature=0.0, batch=batch)
    n_right = 0
    for item, text in zip(problems, completions):
        # A completion counts when its last boxed answer matches the reference.
        if sympy_equal(extract_boxed(text), item["answer"]):
            n_right += 1
    return n_right, len(problems)
def make_train_example(problem, full_solution, tok):
    """Build one fixed-length (1536-token) training row from a self-correction trace.

    The chat-templated prompt plus ``full_solution`` as the assistant turn is
    tokenized and truncated to 1536 ids. Labels are -100 over the prompt
    prefix and over padding, so only assistant tokens contribute to the loss.
    """
    MAX = 1536
    prompt_msgs = chat_messages(SOLVE_PROMPT.format(problem=problem))
    convo = prompt_msgs + [{"role": "assistant", "content": full_solution}]
    prompt_text = tok.apply_chat_template(prompt_msgs, tokenize=False, add_generation_prompt=True)
    convo_text = tok.apply_chat_template(convo, tokenize=False)
    prompt_ids = tok(prompt_text, add_special_tokens=False)["input_ids"]
    seq = tok(convo_text, add_special_tokens=False)["input_ids"][:MAX]
    n_masked = min(len(prompt_ids), len(seq))
    labels = [-100] * n_masked + list(seq[n_masked:])
    n_pad = MAX - len(seq)
    return {
        "input_ids": seq + [tok.pad_token_id] * n_pad,
        "attention_mask": [1] * len(seq) + [0] * n_pad,
        "labels": labels + [-100] * n_pad,
    }
def train_on_pairs(model, tok, pairs, out_dir, lr=1e-4, epochs=2, rank=16):
    """Wrap ``model`` in a LoRA adapter, train it on ``pairs`` and return the wrapped model.

    Each pair must provide ``problem`` and ``full_solution``. The tokenizer's
    padding side is set to "right" for training and to "left" afterwards.
    No checkpoints are saved (``save_strategy="no"``).
    """
    log(f" training on {len(pairs)} traces (lr={lr}, e={epochs}, r={rank})")
    adapter_cfg = LoraConfig(
        r=rank, lora_alpha=rank*2, lora_dropout=0.05, bias="none",
        target_modules=["q_proj", "k_proj", "v_proj", "o_proj"], task_type="CAUSAL_LM",
    )
    peft_model = get_peft_model(model, adapter_cfg)
    tok.padding_side = "right"
    rows = [make_train_example(p["problem"], p["full_solution"], tok) for p in pairs]
    train_set = HFDataset.from_list(rows)
    hf_args = TrainingArguments(
        output_dir=f"{out_dir}/ckpt", num_train_epochs=epochs,
        per_device_train_batch_size=1, gradient_accumulation_steps=4,
        learning_rate=lr, bf16=True, logging_steps=20,
        save_strategy="no", report_to="none", remove_unused_columns=False, warmup_ratio=0.05,
    )
    trainer = Trainer(model=peft_model, args=hf_args, train_dataset=train_set, processing_class=tok)
    trainer.train()
    # Generation in this script uses left padding, so switch back.
    tok.padding_side = "left"
    return peft_model
def main():
    """Mine self-correction triples, LoRA-train on them, and compare MATH-500 accuracy.

    Flow: baseline MATH-500 eval; for up to ``--iterations`` rounds, take
    ``--problems_per_iter`` MATH-train problems, keep those the model gets
    wrong greedily, sample 4 continuations after the wrong attempt plus
    RECONSIDER_TAG, and keep the first continuation whose boxed answer matches
    the gold answer. Mined triples are written to ``pairs.jsonl``; if any
    exist, the model is trained on them and re-evaluated, and ``result.json``
    is written under ``/workspace/math500_sc/<tag>``.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--model", required=True)
    ap.add_argument("--iterations", type=int, default=8)
    ap.add_argument("--problems_per_iter", type=int, default=48)
    ap.add_argument("--n_eval", type=int, default=500)
    ap.add_argument("--max_pairs", type=int, default=100)
    ap.add_argument("--seed", type=int, default=42)
    ap.add_argument("--tag", required=True)
    args = ap.parse_args()
    out_dir = f"/workspace/math500_sc/{args.tag}"
    os.makedirs(out_dir, exist_ok=True)
    random.seed(args.seed); torch.manual_seed(args.seed)
    log(f"loading {args.model}")
    tok = AutoTokenizer.from_pretrained(args.model)
    if tok.pad_token is None: tok.pad_token = tok.eos_token
    # Left padding for batched generation; train_on_pairs flips it temporarily.
    tok.padding_side = "left"
    model = AutoModelForCausalLM.from_pretrained(args.model, torch_dtype=torch.bfloat16, device_map="cuda:0")
    log(f" loaded mem={torch.cuda.memory_allocated('cuda:0')/1e9:.1f}GB")
    log("loading MATH train split")
    train_ds = []
    for cfg in ["algebra","counting_and_probability","geometry","intermediate_algebra","number_theory","prealgebra","precalculus"]:
        try:
            sub = list(load_dataset("EleutherAI/hendrycks_math", cfg, split="train"))
            train_ds.extend(sub)
        except Exception as e:
            # Best effort: a subject that fails to load is reported and skipped.
            log(f" warn: failed to load {cfg}: {e}")
    log(f" {len(train_ds)} train problems")
    random.shuffle(train_ds)
    def gold_of(p):
        # Gold answer = boxed expression of the reference solution (None if absent).
        return extract_boxed(p.get("solution", ""))
    model.eval()
    log("INITIAL eval on MATH-500")
    base_c, base_n = math500_eval(model, tok, n=args.n_eval)
    log(f" MATH-500 base: {base_c}/{base_n} ({100*base_c/base_n:.1f}%)")
    pairs = []
    # ``cursor`` walks the shuffled train set once; problems are never revisited.
    cursor = 0
    for it in range(1, args.iterations + 1):
        log(f"--- iter {it} ---")
        # Sample problems from MATH-train
        batch_problems = []
        while len(batch_problems) < args.problems_per_iter and cursor < len(train_ds):
            p = train_ds[cursor]; cursor += 1
            g = gold_of(p)
            if g is not None: batch_problems.append({"problem": p["problem"], "gold": g})
        if not batch_problems:
            log(" exhausted train problems"); break
        # Step 1: Greedy attempt
        prompts = [SOLVE_PROMPT.format(problem=p["problem"]) for p in batch_problems]
        greedy_outs = gen_batch(model, tok, prompts, max_new=600, temperature=0.0, batch=16)
        wrong_attempts = []
        for i, (p, raw) in enumerate(zip(batch_problems, greedy_outs)):
            pred = extract_boxed(raw)
            if not sympy_equal(pred, p["gold"]):
                wrong_attempts.append({"idx": i, "problem": p["problem"], "gold": p["gold"], "wrong": raw.strip()})
        log(f" iter {it}: {len(wrong_attempts)}/{len(batch_problems)} wrong on greedy (mining candidates)")
        if not wrong_attempts:
            continue
        # Step 2: Self-correct prompt (prefill wrong attempt + reconsider tag, sample 4)
        sc_problems = []
        prefills = []
        for w in wrong_attempts:
            for _ in range(4):
                sc_problems.append(w["problem"])
                prefills.append(w["wrong"] + RECONSIDER_TAG)
        sc_prompts = [SOLVE_PROMPT.format(problem=p) for p in sc_problems]
        sc_outs = gen_batch(model, tok, sc_prompts, max_new=600, temperature=0.8, batch=16, prefill_texts=prefills)
        mined_this_iter = 0
        for j, w in enumerate(wrong_attempts):
            # Each wrong attempt was expanded into 4 consecutive prompts above,
            # so its samples occupy sc_outs[j*4:(j+1)*4].
            attempts = sc_outs[j*4:(j+1)*4]
            preds = [extract_boxed(a) for a in attempts]
            correct_idx = [k for k, pr in enumerate(preds) if sympy_equal(pr, w["gold"])]
            if correct_idx:
                # construct full trace
                fix = attempts[correct_idx[0]].strip()
                full = w["wrong"] + RECONSIDER_TAG + fix
                pairs.append({"problem": w["problem"], "wrong_attempt": w["wrong"],
                              "correction": fix, "full_solution": full})
                mined_this_iter += 1
        log(f" iter {it}: MINED {mined_this_iter} self-correction triples — total={len(pairs)}")
        # The cap is checked once per iteration, so the total may exceed max_pairs.
        if len(pairs) >= args.max_pairs:
            log(f" reached max_pairs={args.max_pairs}, stopping"); break
    log(f"=== mined {len(pairs)} total self-correction triples ===")
    # Persist the mined triples before training.
    with open(f"{out_dir}/pairs.jsonl", "w") as fh:
        for p in pairs: fh.write(json.dumps(p) + "\n")
    if not pairs:
        log("no triples — exiting"); return
    model = train_on_pairs(model, tok, pairs, out_dir)
    log("training done")
    model.eval()
    log("FINAL eval on MATH-500")
    tr_c, tr_n = math500_eval(model, tok, n=args.n_eval)
    log(f" MATH-500 trained: {tr_c}/{tr_n} ({100*tr_c/tr_n:.1f}%)")
    result = {
        "model": args.model, "n_pairs": len(pairs),
        "base": base_c, "trained": tr_c, "n": tr_n,
        "delta": tr_c - base_c, "elapsed_s": time.time() - T0,
    }
    with open(f"{out_dir}/result.json", "w") as fh: json.dump(result, fh, indent=2)
    print()
    print("=" * 70)
    print(f" {args.model} — SELF-CORRECTION recipe")
    print(f" MATH-500: base={base_c}/{tr_n} trained={tr_c}/{tr_n} Δ={tr_c-base_c:+d}")
    print(f" Triples mined: {len(pairs)}")
    print(f" Time: {time.time()-T0:.0f}s")
    print("=" * 70)
# Script entry point: run the experiment only when this file is executed directly.
if __name__ == "__main__":
    main()

View file

@ -0,0 +1,204 @@
"""STaR / Rejection Sampling Fine-Tuning on GSM8K.
For each GSM8K-train problem:
- sample N reasoning chains at temp=0.8
- keep chains that produce correct final answer
- train on (problem, correct chain) pairs
Then eval on GSM8K-test.
"""
import os, sys, json, time, re, gc, argparse, random
os.environ.setdefault("HF_HOME", "/workspace/hf")
os.environ.setdefault("CUDA_VISIBLE_DEVICES", "1")
os.environ["TRANSFORMERS_VERBOSITY"] = "error"
os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, Trainer
from datasets import load_dataset, Dataset as HFDataset
from peft import LoraConfig, get_peft_model
T0 = time.time()


def log(m):
    """Print ``m`` prefixed with seconds elapsed since ``T0``; stdout is flushed."""
    stamp = time.time() - T0
    print(f"[{stamp:7.1f}s] {m}", flush=True)
def extract_answer(text: str):
    """Extract the final numeric answer from a model completion.

    Looks, in order, for ``#### <number>``, ``\\boxed{<number>}``, and finally
    the last number anywhere in ``text``. Thousands separators ("1,234") are
    accepted and removed, matching how ``parse_gold`` reads the reference
    answer; previously "#### 1,234" was read as 1.0 and the last-number
    fallback returned 234.0.

    Returns:
        The answer as a float, or ``None`` when ``text`` contains no number.
    """
    # A comma only counts as a separator when followed by exactly three digits,
    # so lists such as "1, 2" or "1,2" are still read as separate numbers.
    number = r"-?\d+(?:,\d{3}(?!\d))*(?:\.\d+)?"

    def to_float(token):
        return float(token.replace(",", ""))

    m = re.search(r"####\s*(" + number + r")", text)
    if m:
        return to_float(m.group(1))
    m = re.search(r"\\boxed\{(" + number + r")\}", text)
    if m:
        return to_float(m.group(1))
    matches = re.findall(number, text)
    if matches:
        return to_float(matches[-1])
    return None
def gen_batch(model, tok, prompts, max_new=400, temperature=0.0, batch=8):
    """Generate one completion per prompt, ``batch`` prompts at a time.

    Every prompt is wrapped in the chat template with a fixed system message.
    Sampling is enabled only when ``temperature > 0``. Returns the decoded new
    tokens (prompt excluded) in input order.
    """
    sampling = temperature > 0
    decoded = []
    for lo in range(0, len(prompts), batch):
        texts = [
            tok.apply_chat_template(
                [{"role": "system", "content": "You are a careful math tutor."},
                 {"role": "user", "content": prompt}],
                tokenize=False, add_generation_prompt=True,
            )
            for prompt in prompts[lo:lo+batch]
        ]
        enc = tok(texts, return_tensors="pt", padding=True, truncation=True, max_length=1500).to(model.device)
        with torch.no_grad():
            generated = model.generate(
                **enc,
                max_new_tokens=max_new,
                do_sample=sampling,
                temperature=temperature if sampling else 1.0,
                top_p=0.95,
                pad_token_id=tok.eos_token_id,
            )
        # Everything past the (padded) prompt length is newly generated text.
        prompt_len = enc.input_ids.shape[1]
        for row in range(generated.size(0)):
            decoded.append(tok.decode(generated[row][prompt_len:], skip_special_tokens=True))
    return decoded
# User-turn template for a GSM8K problem; asks for a final "#### <number>" line,
# which is the first format extract_answer looks for.
SOLVE_PROMPT = "Solve this math problem step by step. End with the answer on a new line as: #### <number>\n\nProblem: {problem}"
def parse_gold(answer_field: str):
    """Return the number after ``####`` in a GSM8K answer field as a float (commas removed), or ``None``."""
    found = re.search(r"####\s*(-?\d+(?:,\d+)*(?:\.\d+)?)", answer_field)
    if found is None:
        return None
    digits = found.group(1).replace(",", "")
    return float(digits)
def gsm8k_eval(model, tok, n=200):
    """Greedy-decode the first ``n`` GSM8K test problems and return ``(n_correct, n_problems)``.

    A prediction counts when it is within 0.01 of the gold number. Problems
    whose gold answer cannot be parsed are never counted correct but remain
    in the denominator.
    """
    problems = list(load_dataset("openai/gsm8k", "main", split="test"))[:n]
    log(f" eval on GSM8K-test ({len(problems)} problems)")
    prompts = [SOLVE_PROMPT.format(problem=item["question"]) for item in problems]
    completions = gen_batch(model, tok, prompts, max_new=400, temperature=0.0, batch=8)

    def is_hit(item, text):
        gold = parse_gold(item["answer"])
        if gold is None:
            return False
        pred = extract_answer(text)
        return pred is not None and abs(pred - gold) < 0.01

    hits = sum(1 for item, text in zip(problems, completions) if is_hit(item, text))
    return hits, len(problems)
def make_train_example(problem: str, solution: str, tok):
    """Build one fixed-length (1024-token) training row for a (problem, chain) pair.

    The chat-templated prompt plus ``solution`` as the assistant turn is
    tokenized and truncated to 1024 ids. Labels are -100 over the prompt
    prefix and over padding, so only assistant tokens contribute to the loss.
    """
    MAX = 1024
    prompt_msgs = [
        {"role": "system", "content": "You are a careful math tutor."},
        {"role": "user", "content": SOLVE_PROMPT.format(problem=problem)},
    ]
    convo = prompt_msgs + [{"role": "assistant", "content": solution}]
    prompt_text = tok.apply_chat_template(prompt_msgs, tokenize=False, add_generation_prompt=True)
    convo_text = tok.apply_chat_template(convo, tokenize=False)
    prompt_ids = tok(prompt_text, add_special_tokens=False)["input_ids"]
    seq = tok(convo_text, add_special_tokens=False)["input_ids"][:MAX]
    n_masked = min(len(prompt_ids), len(seq))
    labels = [-100] * n_masked + list(seq[n_masked:])
    n_pad = MAX - len(seq)
    return {
        "input_ids": seq + [tok.pad_token_id] * n_pad,
        "attention_mask": [1] * len(seq) + [0] * n_pad,
        "labels": labels + [-100] * n_pad,
    }
def main():
    """Run the STaR / rejection-sampling baseline on GSM8K.

    Flow: baseline eval on GSM8K-test; for each of the first
    ``--n_train_problems`` train problems sample ``--n_chains`` chains at
    temperature 0.8 and keep the first one whose answer matches the gold
    number; LoRA-train on the kept pairs; re-evaluate. Writes ``pairs.jsonl``
    and ``result.json`` under ``/workspace/star/<tag>``.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--model", default="Qwen/Qwen2.5-3B")
    ap.add_argument("--n_train_problems", type=int, default=300)
    ap.add_argument("--n_chains", type=int, default=8)
    ap.add_argument("--n_eval", type=int, default=200)
    ap.add_argument("--epochs", type=int, default=2)
    ap.add_argument("--seed", type=int, default=42)
    ap.add_argument("--tag", required=True)
    args = ap.parse_args()
    random.seed(args.seed); torch.manual_seed(args.seed)
    out_dir = f"/workspace/star/{args.tag}"
    os.makedirs(out_dir, exist_ok=True)
    log(f"loading {args.model}")
    tok = AutoTokenizer.from_pretrained(args.model)
    if tok.pad_token is None: tok.pad_token = tok.eos_token
    # Left padding for batched generation; switched to right only while training.
    tok.padding_side = "left"
    model = AutoModelForCausalLM.from_pretrained(args.model, dtype=torch.bfloat16, device_map="cuda:0")
    log(f" loaded mem={torch.cuda.memory_allocated('cuda:0')/1e9:.1f}GB")
    # Initial eval on GSM8K-test
    model.eval()
    log("INITIAL eval on GSM8K-test")
    base_correct, base_total = gsm8k_eval(model, tok, n=args.n_eval)
    log(f" GSM8K-test base: {base_correct}/{base_total}")
    # Mine reasoning chains from GSM8K-train
    log(f"mining reasoning chains from GSM8K-train ({args.n_train_problems} problems × {args.n_chains} chains)")
    train_set = list(load_dataset("openai/gsm8k", "main", split="train"))[:args.n_train_problems]
    pairs = []
    BATCH_PROBLEMS = 8 # batch problems together
    for batch_start in range(0, len(train_set), BATCH_PROBLEMS):
        batch_end = min(batch_start + BATCH_PROBLEMS, len(train_set))
        batch_problems = train_set[batch_start:batch_end]
        # For each problem, generate N chains. So total = batch_size * N
        prompts = []
        for p in batch_problems:
            for _ in range(args.n_chains):
                prompts.append(SOLVE_PROMPT.format(problem=p["question"]))
        outs = gen_batch(model, tok, prompts, max_new=400, temperature=0.8, batch=8)
        # Outs are in problem-major × chain-major order
        for i, p in enumerate(batch_problems):
            gold = parse_gold(p["answer"])
            if gold is None: continue
            # The n_chains samples for problem i are contiguous in ``outs``.
            chain_outs = outs[i*args.n_chains : (i+1)*args.n_chains]
            for raw in chain_outs:
                pred = extract_answer(raw)
                if pred is not None and abs(pred - gold) < 0.01:
                    pairs.append({"problem": p["question"], "solution": raw.strip()})
                    break # take first correct chain per problem
        log(f" mined {len(pairs)} pairs from {batch_end} problems")
    if not pairs:
        # Nothing to train on: stop before writing pairs.jsonl / result.json.
        log("FATAL: no pairs mined")
        return
    with open(f"{out_dir}/pairs.jsonl", "w") as fh:
        for p in pairs: fh.write(json.dumps(p) + "\n")
    log(f"total pairs mined: {len(pairs)} from {len(train_set)} problems "
        f"(coverage: {len(pairs)/len(train_set)*100:.1f}%)")
    # Train
    log(f"TRAINING on {len(pairs)} pairs, {args.epochs} epochs")
    lora_cfg = LoraConfig(r=16, lora_alpha=32, lora_dropout=0.05, bias="none",
                          target_modules=["q_proj", "k_proj", "v_proj", "o_proj"], task_type="CAUSAL_LM")
    model = get_peft_model(model, lora_cfg)
    tok.padding_side = "right"
    ds = HFDataset.from_list([make_train_example(p["problem"], p["solution"], tok) for p in pairs])
    targs = TrainingArguments(
        output_dir=f"{out_dir}/ckpt", num_train_epochs=args.epochs,
        per_device_train_batch_size=1, gradient_accumulation_steps=4,
        learning_rate=1e-4, bf16=True, logging_steps=20,
        save_strategy="no", report_to="none", remove_unused_columns=False, warmup_ratio=0.05,
    )
    Trainer(model=model, args=targs, train_dataset=ds, processing_class=tok).train()
    log("training done")
    # Back to left padding for the final generation pass.
    tok.padding_side = "left"
    # Final eval
    model.eval()
    log("FINAL eval on GSM8K-test")
    trained_correct, trained_total = gsm8k_eval(model, tok, n=args.n_eval)
    log(f" GSM8K-test trained: {trained_correct}/{trained_total}")
    result = {
        "model": args.model, "n_train_problems": args.n_train_problems,
        "n_chains": args.n_chains, "n_pairs_mined": len(pairs),
        "epochs": args.epochs, "seed": args.seed,
        "base": [base_correct, base_total],
        "trained": [trained_correct, trained_total],
        "delta": trained_correct - base_correct,
        "elapsed_s": time.time() - T0,
    }
    with open(f"{out_dir}/result.json", "w") as fh:
        json.dump(result, fh, indent=2)
    print()
    print("=" * 70)
    print(f" STaR / RFT on GSM8K — {args.model}")
    print(f" Mined {len(pairs)} pairs from {len(train_set)} GSM8K-train problems ({len(pairs)/len(train_set)*100:.1f}% coverage)")
    print(f" GSM8K-test: base={base_correct}/{base_total} trained={trained_correct}/{trained_total} Δ={trained_correct-base_correct:+d}")
    print(f" Time: {time.time()-T0:.0f}s")
    print("=" * 70)
# Script entry point: run the experiment only when this file is executed directly.
if __name__ == "__main__":
    main()

View file

@ -0,0 +1,191 @@
"""Bootstrap loop adapted for large models — uses 4-bit NF4 quantization and batch=1.
Just the harvest loop (no training during loop). Saves pairs.
"""
import os, sys, json, time, re, gc, subprocess, tempfile, argparse, random
os.environ.setdefault("HF_HOME", "/workspace/hf")
os.environ.setdefault("CUDA_VISIBLE_DEVICES", "1")
os.environ["TRANSFORMERS_VERBOSITY"] = "error"
os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from datasets import load_dataset
T0 = time.time()


def log(m):
    """Print ``m`` tagged with the seconds elapsed since ``T0`` and flush stdout."""
    seconds = time.time() - T0
    print(f"[{seconds:7.1f}s] {m}", flush=True)
def extract_code(text):
    """Return the contents of the first fenced code block in ``text``, stripped.

    Prefers a ```python fence, falls back to a bare ``` fence, and cuts at the
    next ``` after the opening one. Text without any fence is returned stripped.
    """
    _, fence, after = text.partition("```python")
    if not fence:
        _, fence, after = text.partition("```")
    if fence:
        text = after
    # partition()[0] is the whole string when no closing fence is present.
    return text.partition("```")[0].strip()
def run_python(code, timeout=8):
    """Run ``code`` as a script in a separate Python process.

    Args:
        code: Python source text to execute.
        timeout: Seconds to wait before the child is killed.

    Returns:
        ``(True, "")`` on exit status 0; otherwise ``(False, msg)`` where
        ``msg`` is the last three lines of stderr (or stdout if stderr is
        empty) truncated to 300 characters, or ``"timeout"``.
    """
    # NOTE: this executes arbitrary (model-generated) code with no isolation
    # beyond a separate process and the timeout.
    # Python source is UTF-8 by default, so write it as UTF-8 regardless of the
    # locale; otherwise non-ASCII generated code can fail to encode here.
    with tempfile.NamedTemporaryFile("w", suffix=".py", delete=False, encoding="utf-8") as f:
        f.write(code)
        path = f.name
    try:
        # Use the interpreter running this script (falling back to "python3") so
        # the child sees the same environment. errors="replace" keeps undecodable
        # child output from raising in the parent.
        r = subprocess.run([sys.executable or "python3", path], capture_output=True,
                           timeout=timeout, text=True, errors="replace", cwd="/tmp")
        if r.returncode == 0:
            return True, ""
        err = (r.stderr or r.stdout).strip().splitlines()
        return False, "\n".join(err[-3:])[:300]
    except subprocess.TimeoutExpired:
        return False, "timeout"
    finally:
        # Best-effort cleanup of the temp script.
        try:
            os.unlink(path)
        except OSError:
            pass
def gen_one(model, tok, prompt, max_new=400, temperature=0.0):
    """Generate a single completion for ``prompt`` and return only the new text.

    The prompt is wrapped in the chat template with a fixed system message.
    Sampling is used only when ``temperature > 0``.
    """
    conversation = [
        {"role": "system", "content": "You are a Python coder."},
        {"role": "user", "content": prompt},
    ]
    rendered = tok.apply_chat_template(conversation, tokenize=False, add_generation_prompt=True)
    enc = tok(rendered, return_tensors="pt", truncation=True, max_length=1500).to(model.device)
    sampling = temperature > 0
    with torch.no_grad():
        generated = model.generate(
            **enc,
            max_new_tokens=max_new,
            do_sample=sampling,
            temperature=temperature if sampling else 1.0,
            top_p=0.95,
            pad_token_id=tok.eos_token_id,
        )
    # Slice off the prompt tokens so only the continuation is decoded.
    new_tokens = generated[0][enc.input_ids.shape[1]:]
    return tok.decode(new_tokens, skip_special_tokens=True)
# Prompt asking the model to invent one problem (function + asserts) in a single
# fenced block. In the visible code it is passed to gen_one as-is, so the
# {placeholders} reach the model literally rather than being str.format-ed.
PROBLEM_GEN_PROMPT = """Generate ONE simple Python coding problem with a clear function spec and 3 test assertions.
Output format (exactly one ```python block):
```python
def {function_name}({args}):
\"\"\"{one-line description of what the function does}\"\"\"
{implementation}
# tests
assert {function_name}(...) == ...
assert {function_name}(...) == ...
assert {function_name}(...) == ...
```
Make the function specific and concrete. Output ONLY the code block."""
def parse_problem(raw_code):
    """Split a generated "function + asserts" snippet into its parts.

    Args:
        raw_code: Source text expected to hold a top-level ``def`` followed by
            top-level ``assert`` lines.

    Returns:
        ``None`` if there is no top-level ``def``, fewer than two ``assert``
        lines before the next ``def``, or a function text shorter than 30
        characters. Otherwise a dict with:
          - ``fn_name``: the function's name,
          - ``signature``: the ``def`` line plus its docstring (or a ``pass``
            stub when no triple-double-quoted docstring is found within the
            first five lines),
          - ``tests``: the ``assert`` lines,
          - ``canonical``: everything from ``def`` up to the first ``assert``.
    """
    code = raw_code.strip()
    if "def " not in code:
        return None
    lines = code.split("\n")
    func_start = next((i for i, l in enumerate(lines) if l.startswith("def ")), None)
    if func_start is None:
        return None
    # Collect asserts up to the next top-level def; the first assert marks
    # where the function text ends.
    tests = []
    def_end = None
    for i in range(func_start + 1, len(lines)):
        line = lines[i]
        if line.startswith("def "):
            break
        if line.startswith("assert "):
            tests.append(line)
            if def_end is None:
                def_end = i
    if len(tests) < 2:
        return None
    body = lines[func_start:def_end]
    full_solution = "\n".join(body).strip()
    if len(full_solution) < 30:
        return None
    m = re.match(r"def\s+(\w+)\s*\(", lines[func_start])
    if not m:
        return None
    sig_lines = [body[0]]
    if not any('"""' in l for l in body[:5]):
        # No docstring nearby: expose only the def line with a stub body.
        sig_lines.append(" pass")
    else:
        # Copy lines until the docstring is closed, i.e. until two triple-quote
        # markers have been seen. The previous check only recognised one- and
        # two-line docstrings, so longer ones let the implementation leak into
        # the signature.
        quotes_seen = body[0].count('"""')
        for line in body[1:]:
            if quotes_seen >= 2:
                break
            sig_lines.append(line)
            quotes_seen += line.count('"""')
    return {"fn_name": m.group(1), "signature": "\n".join(sig_lines), "tests": tests, "canonical": full_solution}
def humaneval_full(model, tok):
    """Greedy-decode every HumanEval problem, run its tests, and return ``(n_passed, n_problems)``."""
    problems = list(load_dataset("openai_humaneval", split="test"))
    total = len(problems)
    log(f" full HumanEval: {total} problems")
    passed = 0
    for done, task in enumerate(problems, start=1):
        completion = gen_one(model, tok, task["prompt"] + "\n# Complete the function above.",
                             max_new=400, temperature=0.0)
        if "```" in completion:
            candidate = extract_code(completion)
        else:
            candidate = completion
        # A completion that defines a function is run as-is; otherwise it is
        # treated as a body and appended to the original prompt.
        if "def " in candidate:
            program = candidate
        else:
            program = task["prompt"] + "\n" + candidate
        harness = program + "\n\n" + task["test"] + f"\n\ncheck({task['entry_point']})"
        ok, _ = run_python(harness, timeout=10)
        if ok:
            passed += 1
        if done % 20 == 0:
            log(f" eval {done}/{total}: {passed} correct")
    return passed, total
def main():
    """Harvest (broken, fixed) solution pairs from a 4-bit quantised model.

    Steps:
      1. Load the model in 4-bit NF4 and record its HumanEval score.
      2. Each iteration, sample ``--problems_per_iter`` problems and keep those
         whose own reference solution passes its own asserts.
      3. For every kept problem, sample ``--n_attempts`` solutions. Pair the
         first failing attempt with the first passing one.
      4. Write all pairs to ``pairs.jsonl`` under
         ``/workspace/bootstrap14b/<tag>`` after every iteration.

    No training happens in this script.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--model", default="Qwen/Qwen2.5-14B")
    ap.add_argument("--iterations", type=int, default=20)
    ap.add_argument("--problems_per_iter", type=int, default=8)
    ap.add_argument("--n_attempts", type=int, default=4)
    ap.add_argument("--tag", required=True)
    args = ap.parse_args()
    out_dir = f"/workspace/bootstrap14b/{args.tag}"
    os.makedirs(out_dir, exist_ok=True)

    log(f"loading {args.model} in 4-bit NF4")
    tok = AutoTokenizer.from_pretrained(args.model)
    if tok.pad_token is None:
        tok.pad_token = tok.eos_token
    tok.padding_side = "left"
    bnb_cfg = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_quant_type="nf4",
                                 bnb_4bit_compute_dtype=torch.bfloat16,
                                 bnb_4bit_use_double_quant=True)
    model = AutoModelForCausalLM.from_pretrained(args.model, quantization_config=bnb_cfg,
                                                 device_map="cuda:0")
    model.eval()
    log(f" loaded mem={torch.cuda.memory_allocated('cuda:0')/1e9:.1f}GB")

    log("INITIAL eval on full HumanEval")
    base_correct, base_total = humaneval_full(model, tok)
    log(f" base: {base_correct}/{base_total}")

    accumulated = []
    for it in range(1, args.iterations + 1):
        it_t = time.time()

        # Keep only generated problems whose reference solution passes its own tests.
        valid_problems = []
        for _ in range(args.problems_per_iter):
            raw = gen_one(model, tok, PROBLEM_GEN_PROMPT, max_new=400, temperature=0.9)
            code = extract_code(raw) if "```" in raw else raw
            parsed = parse_problem(code)
            if not parsed:
                continue
            full = parsed["canonical"] + "\n\n" + "\n".join(parsed["tests"])
            ok, _ = run_python(full)
            if ok:
                valid_problems.append(parsed)

        new_pairs = 0
        for p in valid_problems:
            solve_prompt = f"Implement: {p['signature']}\n\nTests:\n{chr(10).join(p['tests'])}\n\nOutput only the function implementation in one ```python block."
            attempts = [gen_one(model, tok, solve_prompt, max_new=400, temperature=0.8)
                        for _ in range(args.n_attempts)]

            broken = None
            broken_err = ""
            fixed = None
            for raw in attempts:
                code = extract_code(raw) if "```" in raw else raw
                full = code + "\n\n" + "\n".join(p["tests"])
                ok, err = run_python(full)
                if ok and fixed is None:
                    fixed = code
                elif not ok and broken is None and code.strip():
                    # Skip empty attempts: an empty string would occupy the
                    # "broken" slot yet never satisfy the truthiness check below,
                    # blocking later usable failures.
                    broken = code
                    broken_err = err
                if broken and fixed:
                    break
            # Only problems with both a failure and a success yield a pair.
            if broken and fixed:
                accumulated.append({"signature": p["signature"], "tests": p["tests"],
                                    "broken": broken, "error": broken_err,
                                    "fixed": fixed})
                new_pairs += 1

        log(f"iter {it}: {len(valid_problems)} valid, {new_pairs} pairs (total: {len(accumulated)}) [{time.time()-it_t:.0f}s]")
        # Rewrite the file every iteration so an interrupted run keeps its pairs.
        with open(f"{out_dir}/pairs.jsonl", "w") as fh:
            for r in accumulated:
                fh.write(json.dumps(r) + "\n")

    log(f"DONE — accumulated {len(accumulated)} pairs from {args.iterations} iters")
    print()
    print("=" * 70)
    print(f" 14B BASELINE: {base_correct}/{base_total} on HumanEval")
    print(f" Accumulated pairs: {len(accumulated)}")
    print(f" Time: {time.time()-T0:.0f}s")
    print("=" * 70)


if __name__ == "__main__":
    main()

322
recipe/curriculum_code.py Normal file
View file

@ -0,0 +1,322 @@
"""TinyForge-Zero on CODE with self-difficulty curriculum.
Loop:
1. Generate problem (seeded fresh or amplified/simplified from pool)
2. Greedy solve. Verify against tests.
   - If correct → the problem is too easy → amplify it (make it harder)
   - If wrong → try 4 sampled attempts
     - If at-edge (some pass, some fail) → MINE a (broken, fixed) pair
     - If all fail → too hard → simplify it
3. Train periodically. Eval on HumanEval.
"""
import os, sys, json, time, re, gc, subprocess, tempfile, argparse, random
# Environment is configured before torch/transformers are imported below.
# HF_HOME and CUDA_VISIBLE_DEVICES use setdefault, so values already exported
# by the caller win; the other two are always overwritten.
os.environ.setdefault("HF_HOME", "/workspace/hf")
os.environ.setdefault("CUDA_VISIBLE_DEVICES", "0")
os.environ["TRANSFORMERS_VERBOSITY"] = "error"
# NOTE(review): presumably requires the hf_transfer package to be installed — confirm.
os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, Trainer
from datasets import load_dataset, Dataset as HFDataset
from peft import LoraConfig, get_peft_model
T0 = time.time()  # script start; every log line is stamped relative to this


def log(m):
    """Print ``m`` prefixed with the seconds elapsed since the script started."""
    elapsed = time.time() - T0
    print(f"[{elapsed:7.1f}s] {m}", flush=True)
def extract_code(text):
    """Return the contents of the first fenced code block in ``text``.

    A ```` ```python ```` fence is preferred over a bare ```` ``` ```` fence.
    Text without any fence is returned unchanged apart from stripping, and an
    unterminated fence yields everything after the opener.
    """
    _, fence, after = text.partition("```python")
    if not fence:
        _, fence, after = text.partition("```")
    body = after if fence else text
    # Drop everything from the closing fence onwards, if there is one.
    return body.partition("```")[0].strip()
def run_python(code, timeout=8):
    """Run ``code`` in a separate Python process and report whether it succeeded.

    Args:
        code: Python source to execute.
        timeout: Seconds to wait before giving up on the process.

    Returns:
        ``(True, "")`` when the process exits with status 0. Otherwise
        ``(False, message)``, where ``message`` is ``"timeout"`` or the last
        three lines of stderr (stdout when stderr is empty), capped at 300
        characters.
    """
    # Write UTF-8 explicitly: generated code may contain non-ASCII characters and
    # the platform default encoding is not guaranteed to handle them.
    with tempfile.NamedTemporaryFile("w", suffix=".py", delete=False, encoding="utf-8") as f:
        f.write(code)
        path = f.name
    try:
        # Use the interpreter running this script so the candidate sees the same
        # environment. errors="replace" keeps undecodable child output from
        # raising in the harness.
        r = subprocess.run([sys.executable or "python3", path], capture_output=True,
                           timeout=timeout, text=True, errors="replace", cwd="/tmp")
        if r.returncode == 0:
            return True, ""
        err = (r.stderr or r.stdout).strip().splitlines()
        return False, "\n".join(err[-3:])[:300]
    except subprocess.TimeoutExpired:
        return False, "timeout"
    finally:
        # Best-effort cleanup of the temp file.
        try:
            os.unlink(path)
        except OSError:
            pass
def gen_batch(model, tok, prompts, max_new=400, temperature=0.0, batch=4):
    """Generate one completion per prompt, ``batch`` prompts at a time.

    Every prompt is wrapped in a chat template with a fixed system message.
    Sampling is used when ``temperature`` is positive, greedy decoding otherwise.
    The prompt tokens are sliced off by position, so this assumes ``tok`` pads
    on the left (``main`` sets ``tok.padding_side = "left"``).

    Returns:
        A list of decoded completions in the same order as ``prompts``.
    """
    sampling = temperature > 0
    completions = []
    for start in range(0, len(prompts), batch):
        rendered = [
            tok.apply_chat_template(
                [{"role": "system", "content": "You are a Python coder."},
                 {"role": "user", "content": user_prompt}],
                tokenize=False, add_generation_prompt=True)
            for user_prompt in prompts[start:start + batch]
        ]
        enc = tok(rendered, return_tensors="pt", padding=True, truncation=True, max_length=1500).to(model.device)
        with torch.no_grad():
            generated = model.generate(**enc, max_new_tokens=max_new, do_sample=sampling,
                                       temperature=temperature if sampling else 1.0, top_p=0.95,
                                       pad_token_id=tok.eos_token_id)
        prompt_len = enc.input_ids.shape[1]
        for row in range(generated.size(0)):
            completions.append(tok.decode(generated[row][prompt_len:], skip_special_tokens=True))
    return completions
# Prompt asking the model for a brand-new problem: one function plus three asserts
# in a single code block. main() sends this string as-is (no .format() call), so
# the {placeholders} reach the model literally.
SEED_GEN_PROMPT = """Generate ONE simple Python coding problem with a clear function spec and 3 test assertions.
Output exactly:
```python
def {function_name}({args}):
\"\"\"{description}\"\"\"
{implementation}
# tests
assert {function_name}(...) == ...
assert {function_name}(...) == ...
assert {function_name}(...) == ...
```
Output ONLY the code block."""
# Prompt used on problems the greedy solve already passes; {original} is filled
# with the problem's raw code via .format(original=...).
AMPLIFY_PROMPT = """Take this Python coding problem and make it HARDER (add an edge case, additional constraint, or trickier logic). Keep the format with function + 3 assert tests.
Original:
```python
{original}
```
Output the harder version (function + tests) in one ```python block."""
# Prompt used on problems that produced no (broken, fixed) pair; {original} is
# filled the same way as in AMPLIFY_PROMPT.
SIMPLIFY_PROMPT = """Take this Python coding problem and make it EASIER (remove an edge case, simplify the logic). Keep the format with function + 3 assert tests.
Original:
```python
{original}
```
Output the easier version (function + tests) in one ```python block."""
def parse_problem(text):
    """Parse model output into a problem record (signature, tests, reference solution).

    Fenced output is unwrapped with ``extract_code`` first. The code is expected
    to contain one top-level ``def`` followed by top-level ``assert`` lines.

    Args:
        text: Raw model output, with or without a code fence.

    Returns:
        A dict with keys ``fn_name``, ``signature``, ``tests``, ``canonical``
        and ``raw`` (the unwrapped code), or ``None`` when the text has no
        top-level ``def``, fewer than two top-level asserts, a reference
        solution shorter than 30 characters, or an unparsable ``def`` line.

        ``signature`` is the ``def`` line plus its docstring. When no ``\"\"\"``
        appears within the first five lines of the function it is the ``def``
        line plus a ``pass`` stub.
    """
    code = extract_code(text) if "```" in text else text.strip()
    if "def " not in code:
        return None
    lines = code.split("\n")
    func_start = next((i for i, l in enumerate(lines) if l.startswith("def ")), None)
    if func_start is None:
        return None

    tests = []
    def_end = None
    for i in range(func_start, len(lines)):
        l = lines[i]
        # A second top-level def belongs to a different function: stop scanning.
        if l.startswith("def ") and i > func_start:
            break
        if l.startswith("assert "):
            tests.append(l)
            if def_end is None:
                def_end = i  # the reference solution ends where the asserts begin
    if len(tests) < 2:
        return None

    full_solution = "\n".join(lines[func_start:def_end]).strip()
    if len(full_solution) < 30:
        return None
    m = re.match(r"def\s+(\w+)\s*\(", lines[func_start])
    if not m:
        return None
    fn_name = m.group(1)

    sig_lines = [lines[func_start]]
    head = lines[func_start:min(func_start + 5, def_end)]
    if not any('"""' in ln for ln in head):
        sig_lines.append(" pass")
    else:
        # Count triple quotes cumulatively so a docstring of any length is closed
        # correctly. Looking only at the current and previous line misses
        # docstrings of three or more lines and leaks the implementation into
        # the signature shown to the solver.
        seen = lines[func_start].count('"""')
        for ln in lines[func_start + 1:def_end]:
            if seen >= 2:
                break
            sig_lines.append(ln)
            seen += ln.count('"""')
    return {"fn_name": fn_name, "signature": "\n".join(sig_lines), "tests": tests,
            "canonical": full_solution, "raw": code}
def humaneval_full(model, tok, n=164):
    """Greedy-decode the first ``n`` HumanEval problems and count the passes.

    Each completion is executed together with the dataset's ``test`` code and a
    ``check(entry_point)`` call. A completion that contains no ``def`` is
    appended to the original prompt; otherwise it is run as-is.

    Returns:
        ``(number_passed, number_of_problems_evaluated)``.
    """
    tasks = list(load_dataset("openai_humaneval", split="test"))[:n]
    log(f" HumanEval ({len(tasks)} problems)")
    requests = [task["prompt"] + "\n# Complete the function above." for task in tasks]
    completions = gen_batch(model, tok, requests, max_new=400, temperature=0.0, batch=4)
    passed = 0
    for task, raw in zip(tasks, completions):
        code = extract_code(raw) if "```" in raw else raw
        if "def " in code:
            program = code
        else:
            program = task["prompt"] + "\n" + code
        harness = program + "\n\n" + task["test"] + f"\n\ncheck({task['entry_point']})"
        ok, _ = run_python(harness, timeout=10)
        if ok:
            passed += 1
    return passed, len(tasks)
def make_train_example(r, tok):
    """Turn one mined pair into a fixed-length supervised example.

    The user turn shows the task, the broken attempt and its error; the
    assistant turn is the fixed code. Only the assistant tokens are supervised:
    prompt positions and padding carry the label ``-100``.

    Args:
        r: Pair record with ``signature``, ``tests``, ``broken``, ``error`` and ``fixed``.
        tok: Tokenizer providing ``apply_chat_template``, ``__call__`` and ``pad_token_id``.

    Returns:
        Dict with ``input_ids``, ``attention_mask`` and ``labels``, each a list
        of exactly 1024 entries (longer sequences are truncated).
    """
    user = f"Implement: {r['signature']}\n\nTests:\n{chr(10).join(r['tests'])}\n\nMy attempt:\n```python\n{r['broken']}\n```\n\nError:\n{r['error']}\n\nFix and output the corrected code only."
    assistant = f"```python\n{r['fixed']}\n```"
    prompt_msgs = [{"role": "system", "content": "You are a Python coder."},
                   {"role": "user", "content": user}]
    prompt_text = tok.apply_chat_template(prompt_msgs, tokenize=False, add_generation_prompt=True)
    full_text = tok.apply_chat_template(
        prompt_msgs + [{"role": "assistant", "content": assistant}], tokenize=False)
    prompt_len = len(tok(prompt_text, add_special_tokens=False)["input_ids"])
    MAX = 1024
    ids = tok(full_text, add_special_tokens=False)["input_ids"][:MAX]
    # Mask the prompt prefix so the loss covers the assistant reply only.
    n_masked = min(prompt_len, len(ids))
    labels = [-100] * n_masked + list(ids[n_masked:])
    n_pad = MAX - len(ids)
    return {"input_ids": ids + [tok.pad_token_id] * n_pad,
            "attention_mask": [1] * len(ids) + [0] * n_pad,
            "labels": labels + [-100] * n_pad}
def main():
    """Run the self-difficulty curriculum: mine pairs, train LoRA, evaluate.

    Each iteration:
      1. If the problem pool is empty, seed it with freshly generated problems
         whose reference solution passes its own asserts.
      2. Greedy-solve up to ``--problems_per_iter`` problems from the pool.
         Solved problems are queued for amplification. Unsolved ones get 4
         sampled attempts; a failing plus a passing attempt becomes a
         (broken, fixed) pair, otherwise the problem is queued for
         simplification.
      3. Verified amplified/simplified variants go back into the pool.
      4. Every ``--train_every`` iterations, once at least 10 pairs exist,
         train the LoRA adapter on all pairs and evaluate on HumanEval.

    Outputs ``pairs.jsonl`` and ``result.json`` under
    ``/workspace/curriculum_code/<tag>``.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--model", default="Qwen/Qwen2.5-7B")
    ap.add_argument("--iterations", type=int, default=16)
    ap.add_argument("--problems_per_iter", type=int, default=8)
    ap.add_argument("--train_every", type=int, default=4)
    ap.add_argument("--seed", type=int, default=42)
    ap.add_argument("--tag", required=True)
    args = ap.parse_args()
    random.seed(args.seed)
    torch.manual_seed(args.seed)
    out_dir = f"/workspace/curriculum_code/{args.tag}"
    os.makedirs(out_dir, exist_ok=True)

    log(f"loading {args.model}")
    tok = AutoTokenizer.from_pretrained(args.model)
    if tok.pad_token is None:
        tok.pad_token = tok.eos_token
    tok.padding_side = "left"
    model = AutoModelForCausalLM.from_pretrained(args.model, dtype=torch.bfloat16, device_map="cuda:0")
    log(f" loaded mem={torch.cuda.memory_allocated('cuda:0')/1e9:.1f}GB")
    model.eval()

    log("INITIAL eval on HumanEval")
    base_correct, base_total = humaneval_full(model, tok)
    log(f" base: {base_correct}/{base_total}")

    lora_cfg = LoraConfig(r=16, lora_alpha=32, lora_dropout=0.05, bias="none",
                          target_modules=["q_proj", "k_proj", "v_proj", "o_proj"], task_type="CAUSAL_LM")
    model = get_peft_model(model, lora_cfg)
    # Generation must run in eval mode so LoRA dropout does not perturb mining.
    model.eval()

    accumulated = []
    problem_pool = []
    for it in range(1, args.iterations + 1):
        it_t = time.time()

        # Seed the pool with fresh problems when it has run dry.
        if not problem_pool:
            gen_prompts = [SEED_GEN_PROMPT for _ in range(args.problems_per_iter)]
            raw = gen_batch(model, tok, gen_prompts, max_new=400, temperature=0.9)
            seeded = []
            for r in raw:
                parsed = parse_problem(r)
                if not parsed:
                    continue
                full = parsed["canonical"] + "\n\n" + "\n".join(parsed["tests"])
                ok, _ = run_python(full)
                if ok:
                    seeded.append(parsed)
            problem_pool.extend(seeded)
            log(f"iter {it}: seeded {len(seeded)} fresh (pool={len(problem_pool)})")

        random.shuffle(problem_pool)
        attempt_problems = problem_pool[:args.problems_per_iter]
        problem_pool = problem_pool[args.problems_per_iter:]
        if not attempt_problems:
            log(f"iter {it}: empty pool")
            continue

        # Greedy solve
        greedy_prompts = [f"Implement: {p['signature']}\n\nTests:\n{chr(10).join(p['tests'])}\n\nOutput only the function in one ```python block." for p in attempt_problems]
        greedy_outs = gen_batch(model, tok, greedy_prompts, max_new=300, temperature=0.0)
        new_pairs = 0
        amp_targets = []
        sim_targets = []
        for p, solve_prompt, raw in zip(attempt_problems, greedy_prompts, greedy_outs):
            code = extract_code(raw) if "```" in raw else raw
            ok, _ = run_python(code + "\n\n" + "\n".join(p["tests"]))
            if ok:
                amp_targets.append(p)
                continue
            # at-edge check via sampling (same prompt as the greedy solve)
            atts = gen_batch(model, tok, [solve_prompt] * 4, max_new=300, temperature=0.7)
            broken = None
            broken_err = None
            fixed = None
            for ra in atts:
                c = extract_code(ra) if "```" in ra else ra
                ok2, err = run_python(c + "\n\n" + "\n".join(p["tests"]))
                if ok2 and fixed is None:
                    fixed = c
                elif not ok2 and broken is None and c.strip():
                    # Skip empty attempts: an empty string would occupy the
                    # "broken" slot yet never satisfy the truthiness check below.
                    broken = c
                    broken_err = err
                if broken and fixed:
                    break
            if broken and fixed:
                accumulated.append({"signature": p["signature"], "tests": p["tests"],
                                    "broken": broken, "error": broken_err, "fixed": fixed})
                new_pairs += 1
            else:
                sim_targets.append(p)
        log(f"iter {it}: {len(attempt_problems)} attempted, +{new_pairs} pairs (total: {len(accumulated)}). amp={len(amp_targets)}, sim={len(sim_targets)} [{time.time()-it_t:.0f}s]")

        # Generate amplified / simplified for next iter
        if amp_targets:
            amp_prompts = [AMPLIFY_PROMPT.format(original=p["raw"]) for p in amp_targets[:args.problems_per_iter]]
            amp_outs = gen_batch(model, tok, amp_prompts, max_new=400, temperature=0.7)
            for r in amp_outs:
                parsed = parse_problem(r)
                if not parsed:
                    continue
                full = parsed["canonical"] + "\n\n" + "\n".join(parsed["tests"])
                ok, _ = run_python(full)
                if ok:
                    problem_pool.append(parsed)
        if sim_targets:
            sim_prompts = [SIMPLIFY_PROMPT.format(original=p["raw"]) for p in sim_targets[:args.problems_per_iter//2]]
            sim_outs = gen_batch(model, tok, sim_prompts, max_new=400, temperature=0.7)
            for r in sim_outs:
                parsed = parse_problem(r)
                if not parsed:
                    continue
                full = parsed["canonical"] + "\n\n" + "\n".join(parsed["tests"])
                ok, _ = run_python(full)
                if ok:
                    problem_pool.append(parsed)

        # Rewritten every iteration so an interrupted run keeps its pairs.
        with open(f"{out_dir}/pairs.jsonl", "w") as fh:
            for r in accumulated:
                fh.write(json.dumps(r) + "\n")

        if it % args.train_every == 0 and len(accumulated) >= 10:
            log(f" TRAINING on {len(accumulated)} pairs")
            tok.padding_side = "right"
            ds = HFDataset.from_list([make_train_example(r, tok) for r in accumulated])
            targs = TrainingArguments(
                output_dir=f"{out_dir}/ckpt", num_train_epochs=2,
                per_device_train_batch_size=1, gradient_accumulation_steps=4,
                learning_rate=1e-4, bf16=True, logging_steps=10,
                save_strategy="no", report_to="none", remove_unused_columns=False, warmup_ratio=0.05,
            )
            Trainer(model=model, args=targs, train_dataset=ds, processing_class=tok).train()
            tok.padding_side = "left"
            # Trainer switches the model to train mode. Return to eval mode here
            # and stay there, so this evaluation and all later mining rounds
            # generate without dropout. The next Trainer run re-enables train
            # mode by itself.
            model.eval()
            corr, tot = humaneval_full(model, tok)
            log(f" HumanEval @ iter {it}: {corr}/{tot} Δ={corr-base_correct:+d}")

    model.eval()
    final_correct, final_total = humaneval_full(model, tok)
    result = {
        "model": args.model, "iterations": args.iterations,
        "n_pairs": len(accumulated),
        "base": [base_correct, base_total],
        "trained": [final_correct, final_total],
        "delta": final_correct - base_correct,
        "elapsed_s": time.time() - T0,
    }
    with open(f"{out_dir}/result.json", "w") as fh:
        json.dump(result, fh, indent=2)
    print()
    print("=" * 70)
    print(f" CURRICULUM TINYFORGE-ZERO-CODE — {args.model}")
    print(f" HumanEval: base={base_correct}/{base_total} trained={final_correct}/{final_total} Δ={final_correct-base_correct:+d}")
    print(f" Self-mined pairs: {len(accumulated)}")
    print(f" Time: {time.time()-T0:.0f}s")
    print("=" * 70)


if __name__ == "__main__":
    main()

283
recipe/math_bootstrap.py Normal file
View file

@ -0,0 +1,283 @@
"""TinyForge-Zero on math word problems.
Same recipe as code bootstrap, different verifier:
- Model generates (word_problem, python_expression_for_answer) pairs
- Python eval gives the canonical numerical answer
- Solver gets word problem only, must produce a number
- Compare solver's number to canonical → broken/fixed pairs
- Train on accumulated pairs
- Eval on GSM8K (held-out)
"""
import os, sys, json, time, re, gc, subprocess, tempfile, argparse, random
# Environment is configured before torch/transformers are imported below.
os.environ.setdefault("HF_HOME", "/workspace/hf")
# Default to physical GPU 1, but respect a value exported by the caller so the
# script also runs on single-GPU machines (e.g. CUDA_VISIBLE_DEVICES=0).
os.environ.setdefault("CUDA_VISIBLE_DEVICES", "1")
os.environ["TRANSFORMERS_VERBOSITY"] = "error"
os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, Trainer
from datasets import load_dataset, Dataset as HFDataset
from peft import LoraConfig, get_peft_model
T0 = time.time()  # script start; every log line is stamped relative to this


def log(m):
    """Print ``m`` prefixed with the seconds elapsed since the script started."""
    elapsed = time.time() - T0
    print(f"[{elapsed:7.1f}s] {m}", flush=True)
def extract_code(text):
    """Return the contents of the first fenced code block in ``text``.

    A ```` ```python ```` fence is preferred over a bare ```` ``` ```` fence.
    Text without any fence is returned unchanged apart from stripping, and an
    unterminated fence yields everything after the opener.
    """
    _, fence, after = text.partition("```python")
    if not fence:
        _, fence, after = text.partition("```")
    body = after if fence else text
    # Drop everything from the closing fence onwards, if there is one.
    return body.partition("```")[0].strip()
def safe_eval(expr: str):
    """Evaluate a purely arithmetic expression and return the result as a float.

    Only digits, ``+ - * / . ( ) %`` and whitespace are accepted. The expression
    is evaluated by walking its AST rather than with ``eval``: the text comes
    from model output, and the character whitelist still admits exponent towers
    such as ``9**9**9**9`` that would hang or exhaust memory under ``eval``.
    Supported operators are ``+ - * / // % **`` and unary ``+``/``-``.

    A power whose result would be far outside the float range is rejected up
    front; its float conversion would overflow anyway.

    Args:
        expr: Candidate arithmetic expression.

    Returns:
        The value as a ``float``, or ``None`` when the expression contains other
        characters, does not parse, uses an unsupported construct, or fails to
        evaluate (division by zero, overflow, complex result, ...).
    """
    import ast
    import math
    import operator

    # Restrict to math operations
    allowed = "0123456789+-*/.()% "
    if not all(c in allowed or c.isspace() for c in expr):
        return None

    binary_ops = {ast.Add: operator.add, ast.Sub: operator.sub, ast.Mult: operator.mul,
                  ast.Div: operator.truediv, ast.FloorDiv: operator.floordiv,
                  ast.Mod: operator.mod}
    unary_ops = {ast.UAdd: operator.pos, ast.USub: operator.neg}
    max_result_bits = 1100  # just above the float exponent range (~2**1024)

    def evaluate(node):
        if isinstance(node, ast.Constant) and type(node.value) in (int, float):
            return node.value
        if isinstance(node, ast.UnaryOp) and type(node.op) in unary_ops:
            return unary_ops[type(node.op)](evaluate(node.operand))
        if isinstance(node, ast.BinOp):
            left = evaluate(node.left)
            right = evaluate(node.right)
            if isinstance(node.op, ast.Pow):
                # Estimate the size of the result before computing it.
                if abs(left) > 1 and abs(right) * math.log2(abs(left)) > max_result_bits:
                    raise OverflowError("power too large")
                return left ** right
            if type(node.op) in binary_ops:
                return binary_ops[type(node.op)](left, right)
        raise ValueError("unsupported expression")

    try:
        tree = ast.parse(expr.strip(), mode="eval")
        return float(evaluate(tree.body))
    except (SyntaxError, ValueError, TypeError, ArithmeticError, RecursionError, MemoryError):
        return None
def extract_answer(text: str):
    """Pull the final numeric answer out of a model response.

    Sources are tried in order:
      1. the GSM8K marker ``#### <number>``,
      2. ``\\boxed{<number>}``,
      3. the last number anywhere in the text.

    Thousands separators are accepted (``1,234`` → ``1234.0``) so predictions
    are parsed the same way as the comma-aware gold answers in ``gsm8k_eval``.
    A comma counts as a separator only when followed by exactly three digits.

    Args:
        text: Raw model output.

    Returns:
        The answer as a ``float``, or ``None`` when the text contains no number.
    """
    number = r"-?(?:\d{1,3}(?:,\d{3})+(?!\d)|\d+)(?:\.\d+)?"

    def to_float(token):
        return float(token.replace(",", ""))

    # GSM8K style: "#### 42"
    m = re.search(r"####\s*(" + number + r")", text)
    if m:
        return to_float(m.group(1))
    # \boxed{42}
    m = re.search(r"\\boxed\{(" + number + r")\}", text)
    if m:
        return to_float(m.group(1))
    # Fallback: the last number that appears anywhere in the text.
    matches = re.findall(number, text)
    if matches:
        return to_float(matches[-1])
    return None
def gen_batch(model, tok, prompts, max_new=400, temperature=0.0, batch=4):
    """Generate one completion per prompt, ``batch`` prompts at a time.

    Every prompt is wrapped in a chat template with a fixed system message.
    Sampling is used when ``temperature`` is positive, greedy decoding otherwise.
    The prompt tokens are sliced off by position, so this assumes ``tok`` pads
    on the left (``main`` sets ``tok.padding_side = "left"``).

    Returns:
        A list of decoded completions in the same order as ``prompts``.
    """
    sampling = temperature > 0
    completions = []
    for start in range(0, len(prompts), batch):
        rendered = [
            tok.apply_chat_template(
                [{"role": "system", "content": "You are a careful math tutor."},
                 {"role": "user", "content": user_prompt}],
                tokenize=False, add_generation_prompt=True)
            for user_prompt in prompts[start:start + batch]
        ]
        enc = tok(rendered, return_tensors="pt", padding=True, truncation=True, max_length=1500).to(model.device)
        with torch.no_grad():
            generated = model.generate(**enc, max_new_tokens=max_new, do_sample=sampling,
                                       temperature=temperature if sampling else 1.0, top_p=0.95,
                                       pad_token_id=tok.eos_token_id)
        prompt_len = enc.input_ids.shape[1]
        for row in range(generated.size(0)):
            completions.append(tok.decode(generated[row][prompt_len:], skip_special_tokens=True))
    return completions
# Prompt asking the model for one word problem in a fixed three-field layout.
# parse_generated_problem() relies on the literal PROBLEM: / EXPRESSION: /
# ANSWER: labels, so keep them in sync with its regexes.
PROBLEM_GEN_PROMPT = """Generate ONE math word problem with a numerical answer. Output exactly this format:
PROBLEM: <a clear word problem with concrete numbers>
EXPRESSION: <a single Python arithmetic expression that evaluates to the answer, e.g. (5*3)+12>
ANSWER: <the numerical answer>
Make the problem grade-school to middle-school level. The expression must evaluate to the answer."""
def parse_generated_problem(text: str):
    """Parse and self-verify a generated ``PROBLEM / EXPRESSION / ANSWER`` block.

    The problem and expression are each read up to the first newline (or the
    next label). A record is accepted only when the expression evaluates, via
    ``safe_eval``, to within 0.01 of the claimed answer.

    Returns:
        ``{"problem", "expression", "answer"}``, or ``None`` when a field is
        missing, the problem is shorter than 10 characters, the expression is
        empty or cannot be evaluated, or the check fails.
    """
    field_patterns = (
        ("problem", r"PROBLEM:\s*(.+?)(?:\n|EXPRESSION:)", re.DOTALL),
        ("expression", r"EXPRESSION:\s*(.+?)(?:\n|ANSWER:)", re.DOTALL),
        ("answer", r"ANSWER:\s*(-?\d+(?:\.\d+)?)", 0),
    )
    captured = {}
    for key, pattern, flags in field_patterns:
        found = re.search(pattern, text, flags)
        if found is None:
            return None
        captured[key] = found.group(1)

    problem = captured["problem"].strip()
    expression = captured["expression"].strip()
    try:
        claimed = float(captured["answer"])
    except ValueError:
        return None
    if len(problem) < 10 or len(expression) < 1:
        return None

    # Verify: expression evaluates to claimed answer
    actual = safe_eval(expression)
    if actual is None or abs(actual - claimed) > 0.01:
        return None
    return {"problem": problem, "expression": expression, "answer": claimed}
# Solver prompt shared by mining, GSM8K evaluation and training examples.
# {problem} is filled with .format(problem=...). The "#### <number>" ending is
# the first pattern extract_answer() looks for.
SOLVE_PROMPT_TEMPLATE = """Solve this math problem step by step. End with the answer on a new line as: #### <number>
Problem: {problem}"""
def solve_and_check(model, tok, problem_text: str, gold_answer: float, n_attempts: int = 4, temperature: float = 0.7):
    """Sample ``n_attempts`` solutions to one problem and grade each against the gold answer.

    Returns:
        A list of dicts, one per attempt, with ``text`` (raw output), ``pred``
        (extracted number or ``None``) and ``ok`` (``True`` when ``pred`` is
        within 0.01 of ``gold_answer``).
    """
    prompt = SOLVE_PROMPT_TEMPLATE.format(problem=problem_text)
    samples = gen_batch(model, tok, [prompt] * n_attempts, max_new=400, temperature=temperature)

    def grade(raw):
        pred = extract_answer(raw)
        matches_gold = pred is not None and abs(pred - gold_answer) < 0.01
        return {"text": raw, "pred": pred, "ok": matches_gold}

    return [grade(raw) for raw in samples]
def gsm8k_eval(model, tok, n=200):
    """Greedy-decode the first ``n`` GSM8K test problems and count correct answers.

    A prediction is correct when it is within 0.01 of the gold number that
    follows ``####`` in the dataset answer. Items whose gold answer cannot be
    parsed are skipped but still counted in the denominator.

    Returns:
        ``(number_correct, number_of_problems)``.
    """
    subset = list(load_dataset("openai/gsm8k", "main", split="test"))[:n]
    log(f" eval on GSM8K ({len(subset)} problems)")
    requests = [SOLVE_PROMPT_TEMPLATE.format(problem=item["question"]) for item in subset]
    completions = gen_batch(model, tok, requests, max_new=400, temperature=0.0, batch=4)
    hits = 0
    for item, raw in zip(subset, completions):
        # GSM8K's answer field has format "step-by-step\n#### 42"
        gold_m = re.search(r"####\s*(-?\d+(?:,\d+)*(?:\.\d+)?)", item["answer"])
        if gold_m is None:
            continue
        gold = float(gold_m.group(1).replace(",", ""))
        pred = extract_answer(raw)
        if pred is None:
            continue
        if abs(pred - gold) < 0.01:
            hits += 1
    return hits, len(subset)
def make_train_example(r, tok):
    """Turn one mined pair into a fixed-length supervised example.

    The user turn shows the problem and the wrong attempt; the assistant turn
    is the correct solution text. Only the assistant tokens are supervised:
    prompt positions and padding carry the label ``-100``.

    Args:
        r: Pair record with ``problem``, ``broken`` and ``fixed``.
        tok: Tokenizer providing ``apply_chat_template``, ``__call__`` and ``pad_token_id``.

    Returns:
        Dict with ``input_ids``, ``attention_mask`` and ``labels``, each a list
        of exactly 1024 entries (longer sequences are truncated).
    """
    user = SOLVE_PROMPT_TEMPLATE.format(problem=r["problem"]) + f"\n\nMy attempt:\n{r['broken']}\n\nThis is wrong. Solve it correctly and end with #### <number>."
    assistant = r["fixed"]
    prompt_msgs = [{"role": "system", "content": "You are a careful math tutor."},
                   {"role": "user", "content": user}]
    prompt_text = tok.apply_chat_template(prompt_msgs, tokenize=False, add_generation_prompt=True)
    full_text = tok.apply_chat_template(
        prompt_msgs + [{"role": "assistant", "content": assistant}], tokenize=False)
    prompt_len = len(tok(prompt_text, add_special_tokens=False)["input_ids"])
    MAX = 1024
    ids = tok(full_text, add_special_tokens=False)["input_ids"][:MAX]
    # Mask the prompt prefix so the loss covers the assistant reply only.
    n_masked = min(prompt_len, len(ids))
    labels = [-100] * n_masked + list(ids[n_masked:])
    n_pad = MAX - len(ids)
    return {"input_ids": ids + [tok.pad_token_id] * n_pad,
            "attention_mask": [1] * len(ids) + [0] * n_pad,
            "labels": labels + [-100] * n_pad}
def main():
    """Self-bootstrap on math word problems: mine pairs, train LoRA, evaluate on GSM8K.

    Each iteration:
      1. Sample ``--problems_per_iter`` problems and keep those whose
         expression evaluates to the claimed answer.
      2. Sample 4 solutions per problem. When at least one is right and one is
         wrong, store the first of each as a (broken, fixed) pair.
      3. Every ``--train_every`` iterations, once at least 10 pairs exist,
         train the LoRA adapter on all pairs.
      4. Every ``--eval_every`` iterations, evaluate on the first ``--n_eval``
         GSM8K test problems.

    Outputs ``pairs.jsonl``, ``iter_stats.jsonl`` and ``eval_log.json`` under
    ``/workspace/math/<tag>``.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--model", default="Qwen/Qwen2.5-7B")
    ap.add_argument("--iterations", type=int, default=20)
    ap.add_argument("--problems_per_iter", type=int, default=16)
    ap.add_argument("--train_every", type=int, default=8)
    ap.add_argument("--eval_every", type=int, default=8)
    ap.add_argument("--n_eval", type=int, default=200)
    ap.add_argument("--seed", type=int, default=42)
    ap.add_argument("--tag", required=True)
    args = ap.parse_args()
    random.seed(args.seed)
    torch.manual_seed(args.seed)
    out_dir = f"/workspace/math/{args.tag}"
    os.makedirs(out_dir, exist_ok=True)

    log(f"loading {args.model}")
    tok = AutoTokenizer.from_pretrained(args.model)
    if tok.pad_token is None:
        tok.pad_token = tok.eos_token
    tok.padding_side = "left"
    # CUDA_VISIBLE_DEVICES exposes a single GPU, which therefore appears as cuda:0.
    device = "cuda:0"
    model = AutoModelForCausalLM.from_pretrained(args.model, dtype=torch.bfloat16, device_map=device)
    log(f" loaded mem={torch.cuda.memory_allocated(device)/1e9:.1f}GB")

    # Initial eval
    model.eval()
    log("INITIAL eval on GSM8K")
    init_correct, init_total = gsm8k_eval(model, tok, n=args.n_eval)
    log(f" GSM8K base: {init_correct}/{init_total}")

    # LoRA
    lora_cfg = LoraConfig(r=16, lora_alpha=32, lora_dropout=0.05, bias="none",
                          target_modules=["q_proj", "k_proj", "v_proj", "o_proj"], task_type="CAUSAL_LM")
    model = get_peft_model(model, lora_cfg)
    log(f" LoRA applied, trainable={sum(p.numel() for p in model.parameters() if p.requires_grad)/1e6:.1f}M")
    # Generation must run in eval mode so LoRA dropout does not perturb mining.
    model.eval()

    accumulated_pairs = []
    eval_log = [{"iter": 0, "correct": init_correct, "total": init_total}]
    iter_stats = []
    for it in range(1, args.iterations + 1):
        it_t = time.time()

        # 1. Generate problems
        gen_prompts = [PROBLEM_GEN_PROMPT for _ in range(args.problems_per_iter)]
        raw_problems = gen_batch(model, tok, gen_prompts, max_new=300, temperature=0.9)

        # 2. Parse & verify (expression must evaluate to the claimed answer)
        valid = []
        for raw in raw_problems:
            parsed = parse_generated_problem(raw)
            if parsed:
                valid.append(parsed)
        if not valid:
            log(f"iter {it}: 0 valid problems")
            iter_stats.append({"iter": it, "valid": 0, "pairs": 0})
            continue

        # 3. Mine pairs from sampled solver outputs
        new_pairs = 0
        for p in valid:
            attempts = solve_and_check(model, tok, p["problem"], p["answer"], n_attempts=4, temperature=0.7)
            ok_atts = [a for a in attempts if a["ok"]]
            bad_atts = [a for a in attempts if not a["ok"]]
            if ok_atts and bad_atts:
                accumulated_pairs.append({
                    "problem": p["problem"],
                    "answer": p["answer"],
                    "broken": bad_atts[0]["text"],
                    "fixed": ok_atts[0]["text"],
                })
                new_pairs += 1
        log(f"iter {it}: {len(valid)} valid problems, {new_pairs} pairs harvested (total: {len(accumulated_pairs)}) [{time.time()-it_t:.0f}s]")
        iter_stats.append({"iter": it, "valid": len(valid), "pairs": new_pairs, "elapsed": time.time()-it_t})

        # Save incrementally
        with open(f"{out_dir}/pairs.jsonl", "w") as fh:
            for r in accumulated_pairs:
                fh.write(json.dumps(r) + "\n")

        # 4. Train every N
        if it % args.train_every == 0 and len(accumulated_pairs) >= 10:
            log(f" TRAINING on {len(accumulated_pairs)} pairs")
            tok.padding_side = "right"
            ds = HFDataset.from_list([make_train_example(r, tok) for r in accumulated_pairs])
            targs = TrainingArguments(
                output_dir=f"{out_dir}/ckpt", num_train_epochs=2,
                per_device_train_batch_size=1, gradient_accumulation_steps=4,
                learning_rate=1e-4, bf16=True, logging_steps=10,
                save_strategy="no", report_to="none", remove_unused_columns=False, warmup_ratio=0.05,
            )
            Trainer(model=model, args=targs, train_dataset=ds, processing_class=tok).train()
            tok.padding_side = "left"
            # Trainer switches the model to train mode. Return to eval mode so
            # later mining rounds and evaluations generate without dropout. The
            # next Trainer run re-enables train mode by itself.
            model.eval()

        # 5. Eval every N
        if it % args.eval_every == 0:
            model.eval()
            corr, tot = gsm8k_eval(model, tok, n=args.n_eval)
            log(f" GSM8K @ iter {it}: {corr}/{tot}")
            eval_log.append({"iter": it, "correct": corr, "total": tot})

    # Final eval
    model.eval()
    final_correct, final_total = gsm8k_eval(model, tok, n=args.n_eval)
    eval_log.append({"iter": args.iterations, "correct": final_correct, "total": final_total, "final": True})
    with open(f"{out_dir}/iter_stats.jsonl", "w") as fh:
        for r in iter_stats:
            fh.write(json.dumps(r) + "\n")
    with open(f"{out_dir}/eval_log.json", "w") as fh:
        json.dump(eval_log, fh, indent=2)
    print()
    print("=" * 70)
    print(f" TINYFORGE-ZERO ON MATH ({args.model})")
    print(f" GSM8K-mini ({final_total}): base={init_correct} final={final_correct} Δ={final_correct-init_correct:+d}")
    print(f" Total pairs mined: {len(accumulated_pairs)}")
    print(f" Time: {time.time()-T0:.0f}s")
    print("=" * 70)


if __name__ == "__main__":
    main()

103
tts/tts_aime.py Normal file
View file

@ -0,0 +1,103 @@
"""TTS on AIME (Olympiad math). 90 problems, integer answers 0-999.
If 8B+best-of-N hits 30%+, that's matching frontier reasoning models."""
import os, json, time, re, argparse
# Environment is configured before torch/datasets are imported below.
# HF_HOME uses setdefault, so a value already exported by the caller wins.
os.environ.setdefault("HF_HOME", "/workspace/hf")
os.environ["TRANSFORMERS_VERBOSITY"] = "error"
import torch
from datasets import load_dataset
T0 = time.time()  # script start; every log line is stamped relative to this


def log(m):
    """Print ``m`` prefixed with the seconds elapsed since the script started."""
    elapsed = time.time() - T0
    print(f"[{elapsed:7.1f}s] {m}", flush=True)
def extract_int(text):
    r"""Extract the final integer answer from a model response.

    The last ``\boxed{<digits>}`` in the text is used when present: earlier
    boxes are usually intermediate results, and the final answer is boxed at
    the end. Otherwise the last integer found in the final five lines is
    returned, scanning from the bottom line upwards.

    Args:
        text: Raw model output.

    Returns:
        The answer as an ``int``, or ``None`` when nothing usable is found.
    """
    boxed = re.findall(r"\\boxed\{(\d+)\}", text)
    if boxed:
        try:
            return int(boxed[-1])
        except ValueError:
            # int() rejects extremely long digit strings on recent Python versions.
            return None
    # Last integer in last few lines
    tail = text.strip().split("\n")[-5:]
    for line in reversed(tail):
        nums = re.findall(r"\b(\d+)\b", line)
        if nums:
            try:
                return int(nums[-1])
            except ValueError:
                pass
    return None
def main():
    """Compare greedy decoding with N-sample decoding on AIME problems.

    Loads the model with vLLM, builds one chat prompt per problem (falling back
    to the bare user template when the tokenizer has no chat template), then:
      * GREEDY: one deterministic completion per problem.
      * BEST-OF-N: ``--n_samples`` sampled completions per problem. A problem
        counts as solved when ANY sample matches the gold answer, so the gold
        answer is used to pick the winner.

    Writes ``result.json`` under ``/workspace/tts_aime/<tag>`` and prints a summary.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--model", required=True)
    ap.add_argument("--n_samples", type=int, default=8)
    ap.add_argument("--temperature", type=float, default=0.7)
    ap.add_argument("--tag", required=True)
    args = ap.parse_args()
    out_dir = f"/workspace/tts_aime/{args.tag}"
    os.makedirs(out_dir, exist_ok=True)

    # Imported here rather than at module level, as in the original script.
    from vllm import LLM, SamplingParams
    from transformers import AutoTokenizer
    log(f"loading {args.model}")
    tok = AutoTokenizer.from_pretrained(args.model)
    if tok.pad_token is None:
        tok.pad_token = tok.eos_token
    llm = LLM(model=args.model, dtype="bfloat16", gpu_memory_utilization=0.90, max_model_len=3072)
    log(f" loaded")

    ds = list(load_dataset("AI-MO/aimo-validation-aime", split="train"))
    log(f" AIME: {len(ds)} problems")
    SYS = "You are a careful math problem solver. AIME answers are integers between 0 and 999. End with \\boxed{integer}."
    UTMPL = "Solve this AIME problem. Show your reasoning, then put the final integer answer in \\boxed{{...}}.\n\nProblem: {problem}\n\nSolution:"
    prompts = []
    for p in ds:
        msgs = [{"role": "system", "content": SYS},
                {"role": "user", "content": UTMPL.format(problem=p["problem"])}]
        try:
            prompts.append(tok.apply_chat_template(msgs, tokenize=False, add_generation_prompt=True))
        except Exception:
            # Fall back to the plain user prompt when the chat template cannot be applied.
            prompts.append(UTMPL.format(problem=p["problem"]))

    log("=== GREEDY ===")
    sp_g = SamplingParams(temperature=0, max_tokens=2000)
    t0 = time.time()
    g_outs = [o.outputs[0].text for o in llm.generate(prompts, sp_g, use_tqdm=False)]
    log(f" gen in {time.time()-t0:.1f}s")
    g_correct = 0
    for p, raw in zip(ds, g_outs):
        pred = extract_int(raw)
        gold = int(p["answer"])
        if pred == gold:
            g_correct += 1
    log(f" GREEDY: {g_correct}/{len(ds)} ({100*g_correct/len(ds):.1f}%)")

    log(f"=== BEST-OF-{args.n_samples} (temp={args.temperature}) ===")
    sp_s = SamplingParams(temperature=args.temperature, top_p=0.95, max_tokens=2000, n=args.n_samples)
    t0 = time.time()
    s_outs = llm.generate(prompts, sp_s, use_tqdm=False)
    log(f" gen in {time.time()-t0:.1f}s")
    bN_correct = 0
    for p, outset in zip(ds, s_outs):
        gold = int(p["answer"])
        # Solved if any of the N samples matches the gold answer.
        if any(extract_int(o.text) == gold for o in outset.outputs):
            bN_correct += 1

    result = {"model": args.model, "n_samples": args.n_samples, "temperature": args.temperature,
              "greedy": g_correct, "best_of_N": bN_correct, "n": len(ds), "elapsed_s": time.time()-T0}
    with open(f"{out_dir}/result.json", "w") as fh:
        json.dump(result, fh, indent=2)
    print()
    print("=" * 70)
    print(f" {args.model} — AIME ({len(ds)} problems)")
    print(f" Greedy: {g_correct}/{len(ds)} ({100*g_correct/len(ds):.1f}%)")
    print(f" Best-of-{args.n_samples}: {bN_correct}/{len(ds)} ({100*bN_correct/len(ds):.1f}%)")
    # ":+d" gives a correct sign either way; a hard-coded "+" printed "+-3"
    # whenever sampling did worse than greedy.
    print(f" TTS Lift: {bN_correct - g_correct:+d} ({100*(bN_correct-g_correct)/len(ds):.1f}pp)")
    print(f" Time: {time.time()-T0:.0f}s")
    print("=" * 70)


if __name__ == "__main__":
    main()

126
tts/tts_humaneval.py Normal file
View file

@ -0,0 +1,126 @@
"""TTS on HumanEval+ (contamination-resistant) to verify the 92% isn't memorization."""
import os, json, time, subprocess, tempfile, argparse
# Environment is configured before torch/datasets are imported below.
# HF_HOME uses setdefault, so a value already exported by the caller wins.
os.environ.setdefault("HF_HOME", "/workspace/hf")
os.environ["TRANSFORMERS_VERBOSITY"] = "error"
import torch
from datasets import load_dataset
T0 = time.time()  # script start; every log line is stamped relative to this


def log(m):
    """Print ``m`` prefixed with the seconds elapsed since the script started."""
    elapsed = time.time() - T0
    print(f"[{elapsed:7.1f}s] {m}", flush=True)
def extract_code(text):
    """Return the contents of the first fenced code block in ``text``.

    A ```` ```python ```` fence is preferred over a bare ```` ``` ```` fence.
    Text without any fence is returned unchanged apart from stripping, and an
    unterminated fence yields everything after the opener.
    """
    _, fence, after = text.partition("```python")
    if not fence:
        _, fence, after = text.partition("```")
    body = after if fence else text
    # Drop everything from the closing fence onwards, if there is one.
    return body.partition("```")[0].strip()
def run_python(code, timeout=15):
    """Run ``code`` in a separate Python process and report whether it succeeded.

    Args:
        code: Python source to execute.
        timeout: Seconds to wait before giving up on the process.

    Returns:
        ``True`` when the process exits with status 0. ``False`` on a non-zero
        exit status or a timeout.
    """
    import sys  # local import: this module does not import sys at file level

    # Write UTF-8 explicitly: generated code may contain non-ASCII characters and
    # the platform default encoding is not guaranteed to handle them.
    with tempfile.NamedTemporaryFile("w", suffix=".py", delete=False, encoding="utf-8") as f:
        f.write(code)
        path = f.name
    try:
        # Use the interpreter running this script so the candidate sees the same
        # environment. errors="replace" keeps undecodable child output from
        # raising in the harness.
        r = subprocess.run([sys.executable or "python3", path], capture_output=True,
                           timeout=timeout, text=True, errors="replace", cwd="/tmp")
        return r.returncode == 0
    except subprocess.TimeoutExpired:
        return False
    finally:
        # Best-effort cleanup of the temp file.
        try:
            os.unlink(path)
        except OSError:
            pass
def main():
    """Score greedy and best-of-N sampling on HumanEval+ and save the counts.

    Command-line flags: ``--model`` and ``--tag`` (required), ``--n_samples``
    (default 8) and ``--temperature`` (default 0.6). Completions are generated
    with vLLM and executed against each problem's tests via ``run_python``.
    Counts are written to ``/workspace/tts_hep/<tag>/result.json`` and a
    summary is printed to stdout.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--model", required=True)
    ap.add_argument("--n_samples", type=int, default=8)
    ap.add_argument("--temperature", type=float, default=0.6)
    ap.add_argument("--tag", required=True)
    args = ap.parse_args()
    out_dir = f"/workspace/tts_hep/{args.tag}"
    os.makedirs(out_dir, exist_ok=True)
    # vLLM / transformers are imported only after the arguments have parsed.
    from vllm import LLM, SamplingParams
    from transformers import AutoTokenizer
    log(f"loading {args.model}")
    tok = AutoTokenizer.from_pretrained(args.model)
    if tok.pad_token is None: tok.pad_token = tok.eos_token
    llm = LLM(model=args.model, dtype="bfloat16", gpu_memory_utilization=0.90, max_model_len=2048)
    log(f" loaded")
    hep = list(load_dataset("evalplus/humanevalplus", split="test"))
    log(f" HE+: {len(hep)} problems")
    # One prompt per problem: chat-formatted when the tokenizer can apply a chat
    # template, otherwise the bare problem prompt.
    prompts = []
    for p in hep:
        try:
            msgs = [{"role": "system", "content": "You are a Python coder. Output one ```python block only."},
                    {"role": "user", "content": p["prompt"] + "\n# Complete the function above."}]
            prompts.append(tok.apply_chat_template(msgs, tokenize=False, add_generation_prompt=True))
        except Exception:
            prompts.append(p["prompt"])
    log("=== GREEDY ===")
    # temperature=0 -> a single deterministic completion per prompt.
    sp_g = SamplingParams(temperature=0, max_tokens=400)
    g_outs = [o.outputs[0].text for o in llm.generate(prompts, sp_g, use_tqdm=False)]
    base_pass, plus_pass = 0, 0
    for p, raw in zip(hep, g_outs):
        # Fences are stripped only when present; unfenced output is used verbatim.
        code = extract_code(raw) if "```" in raw else raw
        # A completion without "def " is treated as a continuation of the prompt;
        # one that contains "def " is run on its own, without the prompt's header.
        full = p["prompt"] + "\n" + code if "def " not in code else code
        # base test
        b_test = full + "\n\n" + p["test"] + f"\n\ncheck({p['entry_point']})"
        b_ok = run_python(b_test, 15)
        if b_ok: base_pass += 1
        # plus test (harder, hidden cases)
        # NOTE(review): when a row has no "plus_test" key the plus score simply
        # mirrors the base result — confirm the dataset actually ships that field.
        if "plus_test" in p:
            p_test = full + "\n\n" + p["plus_test"] + f"\n\ncheck({p['entry_point']})"
            if run_python(p_test, 15): plus_pass += 1
        else:
            if b_ok: plus_pass += 1
    log(f" GREEDY base: {base_pass}/{len(hep)} plus(hidden): {plus_pass}/{len(hep)}")
    log(f"=== BEST-OF-{args.n_samples} (temp={args.temperature}) ===")
    sp_s = SamplingParams(temperature=args.temperature, top_p=0.95, max_tokens=400, n=args.n_samples)
    s_outs = llm.generate(prompts, sp_s, use_tqdm=False)
    bN_base, bN_plus = 0, 0
    for p, outset in zip(hep, s_outs):
        attempts = [o.text for o in outset.outputs]
        # A problem counts as solved if ANY sampled attempt passes the tests.
        base_ok_any = False
        plus_ok_any = False
        for a in attempts:
            code = extract_code(a) if "```" in a else a
            full = p["prompt"] + "\n" + code if "def " not in code else code
            b_test = full + "\n\n" + p["test"] + f"\n\ncheck({p['entry_point']})"
            b_ok = run_python(b_test, 15)
            if b_ok and not base_ok_any:
                base_ok_any = True
            if "plus_test" in p:
                p_test = full + "\n\n" + p["plus_test"] + f"\n\ncheck({p['entry_point']})"
                p_ok = run_python(p_test, 15)
                if p_ok and not plus_ok_any:
                    plus_ok_any = True
            elif b_ok and not plus_ok_any:
                plus_ok_any = True
            # Stop executing further samples once both scores are settled.
            if base_ok_any and plus_ok_any: break
        if base_ok_any: bN_base += 1
        if plus_ok_any: bN_plus += 1
    result = {"model": args.model, "n_samples": args.n_samples, "temperature": args.temperature,
              "greedy_base": base_pass, "greedy_plus": plus_pass,
              "best_of_N_base": bN_base, "best_of_N_plus": bN_plus,
              "n": len(hep), "elapsed_s": time.time()-T0}
    with open(f"{out_dir}/result.json", "w") as fh: json.dump(result, fh, indent=2)
    print()
    print("=" * 70)
    print(f" {args.model} — HumanEval+ ({len(hep)} problems)")
    print(f" Greedy base: {base_pass}/{len(hep)} ({100*base_pass/len(hep):.1f}%)")
    print(f" Greedy plus (hard): {plus_pass}/{len(hep)} ({100*plus_pass/len(hep):.1f}%)")
    print(f" Best-of-{args.n_samples} base: {bN_base}/{len(hep)} ({100*bN_base/len(hep):.1f}%)")
    print(f" Best-of-{args.n_samples} plus: {bN_plus}/{len(hep)} ({100*bN_plus/len(hep):.1f}%)")
    print(f" Time: {time.time()-T0:.0f}s")
    print("=" * 70)
# Script entry point: run the evaluation only when executed directly, not on import.
if __name__ == "__main__":
    main()

125
tts/tts_math500.py Normal file
View file

@ -0,0 +1,125 @@
"""TTS on MATH-500: greedy + best-of-N pass@1.
If TTS works on math like it does on code, we should see major lift.
"""
import os, json, time, re, argparse
# Default HF_HOME to /workspace/hf unless the caller already exported it.
os.environ.setdefault("HF_HOME", "/workspace/hf")
# Unconditionally set TRANSFORMERS_VERBOSITY to "error" (overrides any existing value).
os.environ["TRANSFORMERS_VERBOSITY"] = "error"
import torch
from datasets import load_dataset
import sympy
from sympy.parsing.latex import parse_latex
# Wall-clock reference taken at import time; log() and the final report measure from it.
T0 = time.time()
def log(m):
    """Print ``m`` prefixed with the seconds elapsed since ``T0``, flushing at once."""
    stamped = f"[{time.time()-T0:7.1f}s] {m}"
    print(stamped, flush=True)
def extract_boxed(text):
    r"""Return the stripped contents of the last ``\boxed{...}`` in ``text``.

    Nested braces inside the box are balanced, so ``\boxed{\frac{1}{2}}``
    yields ``\frac{1}{2}``. Returns None when there is no ``\boxed{`` or when
    its braces never close.
    """
    marker = "\\boxed{"
    open_at = text.rfind(marker)
    if open_at < 0:
        return None
    start = open_at + len(marker)
    # Walk forward tracking brace depth; the box ends where depth returns to 0.
    depth = 1
    for pos in range(start, len(text)):
        ch = text[pos]
        if ch == "{":
            depth += 1
        elif ch == "}":
            depth -= 1
            if depth == 0:
                return text[start:pos].strip()
    return None
def normalize(s):
    r"""Canonicalize a LaTeX answer string before comparison.

    Strips surrounding whitespace and a leading/trailing ``$``, unwraps
    ``\text{...}`` and ``\mbox{...}``, removes thousands separators, drops the
    ``\left`` / ``\right`` sizing commands and degree markers.

    Args:
        s: Answer string, or None.

    Returns:
        The normalized string, or None when ``s`` is None.
    """
    if s is None:
        return None
    s = s.strip()
    s = re.sub(r"^\$|\$$", "", s).strip()
    s = re.sub(r"\\text\{([^}]*)\}", r"\1", s)
    s = re.sub(r"\\mbox\{([^}]*)\}", r"\1", s)
    # Remove a comma only when it is a thousands separator (exactly three digits
    # follow). Commas in lists, tuples and intervals such as "1,2" or "(1,2)"
    # must survive, otherwise distinct answers collapse to the same string.
    s = re.sub(r"(?<=\d),(?=\d{3}(?!\d))", "", s)
    # Drop the delimiter-sizing commands without damaging macros that merely
    # start with the same letters (\leftarrow, \rightarrow, ...).
    s = re.sub(r"\\(?:left|right)(?![A-Za-z])", "", s)
    s = s.replace("^\\circ", "").replace("^{\\circ}", "")
    return s.strip()
def sympy_equal(a, b):
    """Return True when answers ``a`` and ``b`` are considered equal.

    Both strings are normalized first. They match if the normalized strings
    are identical, if their parsed LaTeX expressions differ by something that
    simplifies to zero, or if both parse as floats within 1e-6 of each other.
    A None on either side never matches. Parsing failures are treated as
    "no match by this method" rather than errors.
    """
    if a is None or b is None:
        return False
    lhs = normalize(a)
    rhs = normalize(b)
    if lhs == rhs:
        return True
    # Symbolic comparison; any parser/simplifier failure falls through.
    try:
        difference = parse_latex(lhs) - parse_latex(rhs)
        if sympy.simplify(difference) == 0:
            return True
    except Exception:
        pass
    # Last resort: plain numeric comparison.
    try:
        return abs(float(lhs) - float(rhs)) < 1e-6
    except Exception:
        return False
def main():
    """Score greedy and best-of-N sampling on MATH-500 and save the counts.

    Command-line flags: ``--model`` and ``--tag`` (required), ``--n_samples``
    (default 8) and ``--temperature`` (default 0.7). Each completion's last
    ``\\boxed{...}`` is compared with the reference answer via ``sympy_equal``.
    Counts are written to ``/workspace/tts_math/<tag>/result.json`` and a
    summary is printed to stdout.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--model", required=True)
    ap.add_argument("--n_samples", type=int, default=8)
    ap.add_argument("--temperature", type=float, default=0.7)
    ap.add_argument("--tag", required=True)
    args = ap.parse_args()
    out_dir = f"/workspace/tts_math/{args.tag}"
    os.makedirs(out_dir, exist_ok=True)
    # vLLM / transformers are imported only after the arguments have parsed.
    from vllm import LLM, SamplingParams
    from transformers import AutoTokenizer
    log(f"loading {args.model}")
    tok = AutoTokenizer.from_pretrained(args.model)
    if tok.pad_token is None: tok.pad_token = tok.eos_token
    llm = LLM(model=args.model, dtype="bfloat16", gpu_memory_utilization=0.90, max_model_len=2048)
    log(f" loaded")
    ds = list(load_dataset("HuggingFaceH4/MATH-500", split="test"))
    log(f" MATH-500: {len(ds)} problems")
    SYS = "You are a careful math problem solver. End with \\boxed{answer}."
    # Doubled braces survive str.format as literal "{...}" in the prompt.
    USER_TEMPLATE = "Solve this competition math problem. Show your reasoning, then put the final answer in \\boxed{{...}}.\n\nProblem: {problem}\n\nSolution:"
    # One prompt per problem: chat-formatted when the tokenizer can apply a chat
    # template, otherwise the bare user text.
    prompts = []
    for p in ds:
        msgs = [{"role": "system", "content": SYS},
                {"role": "user", "content": USER_TEMPLATE.format(problem=p["problem"])}]
        try:
            prompts.append(tok.apply_chat_template(msgs, tokenize=False, add_generation_prompt=True))
        except Exception:
            prompts.append(USER_TEMPLATE.format(problem=p["problem"]))
    # Greedy
    log("=== GREEDY ===")
    # temperature=0 -> a single deterministic completion per prompt.
    sp_g = SamplingParams(temperature=0, max_tokens=800)
    t0 = time.time()
    g_outs = [o.outputs[0].text for o in llm.generate(prompts, sp_g, use_tqdm=False)]
    log(f" gen in {time.time()-t0:.1f}s")
    g_correct = sum(1 for p, raw in zip(ds, g_outs) if sympy_equal(extract_boxed(raw), p["answer"]))
    log(f" GREEDY: {g_correct}/{len(ds)} ({100*g_correct/len(ds):.1f}%)")
    # Best-of-N (any correct)
    log(f"=== BEST-OF-{args.n_samples} (temp={args.temperature}) ===")
    sp_s = SamplingParams(temperature=args.temperature, top_p=0.95, max_tokens=800, n=args.n_samples)
    t0 = time.time()
    s_outs = llm.generate(prompts, sp_s, use_tqdm=False)
    log(f" gen in {time.time()-t0:.1f}s")
    bN_correct = 0
    for p, outset in zip(ds, s_outs):
        # Selection uses the reference answer: a problem is counted once as soon
        # as any one of its samples matches.
        for o in outset.outputs:
            if sympy_equal(extract_boxed(o.text), p["answer"]):
                bN_correct += 1; break
    result = {"model": args.model, "n_samples": args.n_samples, "temperature": args.temperature,
              "greedy": g_correct, "best_of_N": bN_correct, "n": len(ds), "elapsed_s": time.time()-T0}
    with open(f"{out_dir}/result.json", "w") as fh: json.dump(result, fh, indent=2)
    print()
    print("=" * 70)
    print(f" {args.model} — MATH-500 ({len(ds)} problems)")
    print(f" Greedy: {g_correct}/{len(ds)} ({100*g_correct/len(ds):.1f}%)")
    print(f" Best-of-{args.n_samples}: {bN_correct}/{len(ds)} ({100*bN_correct/len(ds):.1f}%)")
    print(f" TTS Lift: +{bN_correct - g_correct} ({100*(bN_correct-g_correct)/len(ds):.1f}pp)")
    print(f" Time: {time.time()-T0:.0f}s")
    print("=" * 70)
# Script entry point: run the evaluation only when executed directly, not on import.
if __name__ == "__main__":
    main()

135
tts/tts_qwen14b_recipe.py Normal file
View file

@ -0,0 +1,135 @@
"""Test-time scaling on Qwen2.5-14B-Base + multi_v1 adapter.
For each HumanEval problem:
1. Sample 8 attempts at temp=0.6 from the trained model.
2. Run each attempt against the tests.
3. Accept the first attempt that passes, i.e. pass@1 under best-of-N selection.
Compared to greedy pass@1 (which gave 80.5%), this should push higher.
"""
import os, json, time, re, subprocess, tempfile, argparse
# Default HF_HOME to /workspace/hf unless the caller already exported it.
os.environ.setdefault("HF_HOME", "/workspace/hf")
# Unconditionally set TRANSFORMERS_VERBOSITY to "error" (overrides any existing value).
os.environ["TRANSFORMERS_VERBOSITY"] = "error"
import torch
from datasets import load_dataset
# Wall-clock reference taken at import time; log() and the final report measure from it.
T0 = time.time()
def log(m):
    """Print ``m`` prefixed with the seconds elapsed since ``T0``, flushing at once."""
    stamped = f"[{time.time()-T0:7.1f}s] {m}"
    print(stamped, flush=True)
def extract_code(text):
    """Return the contents of the first fenced code block in ``text``.

    Everything up to and including the first ```` ```python ```` marker is
    dropped; if there is none, the first bare ```` ``` ```` marker is used
    instead. The result is then cut at the next ```` ``` ```` (if any) and
    stripped of surrounding whitespace. Text without any fence is returned
    stripped but otherwise unchanged.
    """
    # Prefer the explicit python fence, then fall back to a bare fence.
    for fence in ("```python", "```"):
        _, found, tail = text.partition(fence)
        if found:
            text = tail
            break
    # Keep only what precedes the closing fence (or everything if there is none).
    body, _, _ = text.partition("```")
    return body.strip()
def run_python(code, timeout=15):
    """Run ``code`` as a standalone script and report whether it exited cleanly.

    The source is written to a temporary ``.py`` file and executed with
    ``python3`` using ``/tmp`` as the working directory. The temporary file is
    removed afterwards.

    Args:
        code: Python source text to execute.
        timeout: Seconds to wait before the child process is killed.

    Returns:
        True if the interpreter exited with status 0; False on a non-zero exit
        status or when the timeout expired.
    """
    # Python source files default to UTF-8, so write the script that way
    # regardless of the locale's preferred encoding.
    with tempfile.NamedTemporaryFile("w", suffix=".py", delete=False, encoding="utf-8") as f:
        f.write(code)
        path = f.name
    try:
        # Only the exit status is used, so discard the child's output instead of
        # capturing and decoding it: undecodable bytes would otherwise raise
        # UnicodeDecodeError here, and chatty programs would be buffered in memory.
        r = subprocess.run(["python3", path], stdout=subprocess.DEVNULL,
                           stderr=subprocess.DEVNULL, timeout=timeout, cwd="/tmp")
        return r.returncode == 0
    except subprocess.TimeoutExpired:
        return False
    finally:
        # Best-effort cleanup; only filesystem errors are ignored so that
        # KeyboardInterrupt / SystemExit still propagate.
        try:
            os.unlink(path)
        except OSError:
            pass
def main():
    """Score greedy and best-of-N sampling on HumanEval with a LoRA adapter applied.

    Command-line flags: ``--model`` (default ``Qwen/Qwen2.5-14B``),
    ``--adapter`` (default ``/workspace/multi_v1_adapter``), ``--n_samples``
    (default 8), ``--temperature`` (default 0.6) and ``--tag`` (required).
    Completions are generated with vLLM using the adapter and executed against
    each problem's tests via ``run_python``. Writes ``result.json`` and
    ``per_problem.json`` under ``/workspace/tts/<tag>`` and prints a summary.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--model", default="Qwen/Qwen2.5-14B")
    ap.add_argument("--adapter", default="/workspace/multi_v1_adapter")
    ap.add_argument("--n_samples", type=int, default=8)
    ap.add_argument("--temperature", type=float, default=0.6)
    ap.add_argument("--tag", required=True)
    args = ap.parse_args()
    out_dir = f"/workspace/tts/{args.tag}"
    os.makedirs(out_dir, exist_ok=True)
    # vLLM / transformers are imported only after the arguments have parsed.
    from vllm import LLM
    from vllm.lora.request import LoRARequest
    from transformers import AutoTokenizer
    log(f"loading {args.model} with adapter {args.adapter}")
    tok = AutoTokenizer.from_pretrained(args.model)
    if tok.pad_token is None: tok.pad_token = tok.eos_token
    llm = LLM(model=args.model, dtype="bfloat16", gpu_memory_utilization=0.90, max_model_len=2048,
              enable_lora=True, max_lora_rank=32)
    # The same adapter request is passed to every generate() call below.
    lora_req = LoRARequest("multi_v1", 1, args.adapter)
    log(f" loaded")
    he = list(load_dataset("openai_humaneval", split="test"))
    log(f" HE: {len(he)} problems")
    # --- Greedy baseline (with adapter)
    log("=== GREEDY pass@1 (with adapter) ===")
    from vllm import SamplingParams
    # temperature=0 -> a single deterministic completion per prompt.
    sp_greedy = SamplingParams(temperature=0, max_tokens=400)
    # Use chat template for Qwen2.5 (it has one)
    # Unlike the sibling scripts there is no fallback here: a tokenizer without
    # a chat template would make apply_chat_template raise.
    prompts = []
    for p in he:
        msgs = [{"role": "system", "content": "You are a Python coder. Output one ```python block only."},
                {"role": "user", "content": p["prompt"] + "\n# Complete the function above."}]
        prompts.append(tok.apply_chat_template(msgs, tokenize=False, add_generation_prompt=True))
    t0 = time.time()
    greedy_outs = [o.outputs[0].text for o in llm.generate(prompts, sp_greedy, lora_request=lora_req, use_tqdm=False)]
    log(f" greedy gen in {time.time()-t0:.1f}s")
    greedy_correct = 0
    for p, raw in zip(he, greedy_outs):
        # Fences are stripped only when present; unfenced output is used verbatim.
        code = extract_code(raw) if "```" in raw else raw
        # A completion without "def " is treated as a continuation of the prompt;
        # one that contains "def " is run on its own, without the prompt's header.
        full = p["prompt"] + "\n" + code if "def " not in code else code
        test_code = full + "\n\n" + p["test"] + f"\n\ncheck({p['entry_point']})"
        if run_python(test_code, 15): greedy_correct += 1
    log(f" GREEDY pass@1: {greedy_correct}/{len(he)} ({100*greedy_correct/len(he):.1f}%)")
    # --- Test-time scaling: sample N, take first that passes (best-of-N pass@1)
    log(f"=== TEST-TIME SCALING: N={args.n_samples}, temp={args.temperature} ===")
    sp_sample = SamplingParams(temperature=args.temperature, top_p=0.95,
                               max_tokens=400, n=args.n_samples)
    t0 = time.time()
    sample_outs = llm.generate(prompts, sp_sample, lora_request=lora_req, use_tqdm=False)
    log(f" sampling gen in {time.time()-t0:.1f}s")
    t1 = time.time()
    bestN_correct = 0
    per_problem = []
    for p, outset in zip(he, sample_outs):
        attempts = [o.text for o in outset.outputs]
        # Selection uses the problem's own tests: the problem is solved as soon
        # as any one sample passes, and later samples are not executed.
        any_pass = False
        for a in attempts:
            code = extract_code(a) if "```" in a else a
            full = p["prompt"] + "\n" + code if "def " not in code else code
            test_code = full + "\n\n" + p["test"] + f"\n\ncheck({p['entry_point']})"
            if run_python(test_code, 15):
                any_pass = True
                break
        if any_pass: bestN_correct += 1
        per_problem.append({"task_id": p["task_id"], "best_of_N_pass": any_pass})
    log(f" verify done in {time.time()-t1:.1f}s")
    # Despite the "passN" suffix, these two fields hold raw counts of solved problems.
    result = {
        "model": args.model, "adapter": args.adapter,
        "n_samples": args.n_samples, "temperature": args.temperature,
        "greedy_passN": greedy_correct,
        "best_of_N_passN": bestN_correct,
        "n_total": len(he),
        "elapsed_s": time.time()-T0,
    }
    with open(f"{out_dir}/result.json", "w") as fh: json.dump(result, fh, indent=2)
    with open(f"{out_dir}/per_problem.json", "w") as fh: json.dump(per_problem, fh, indent=2)
    print()
    print("=" * 70)
    print(f" {args.model} + adapter {args.adapter}")
    print(f" HumanEval:")
    print(f" Greedy pass@1: {greedy_correct}/{len(he)} ({100*greedy_correct/len(he):.1f}%)")
    print(f" Best-of-{args.n_samples} pass@1: {bestN_correct}/{len(he)} ({100*bestN_correct/len(he):.1f}%)")
    print(f" Lift: +{bestN_correct - greedy_correct} ({100*(bestN_correct-greedy_correct)/len(he):.1f}pp)")
    print(f" Time: {time.time()-T0:.0f}s")
    print("=" * 70)
# Script entry point: run the evaluation only when executed directly, not on import.
if __name__ == "__main__":
    main()

View file

@ -0,0 +1,118 @@
"""Control: Qwen3-8B-Base RAW (no recipe) + best-of-8 on HumanEval.
Tells us if the 89.6% headline on 14B+recipe is driven by recipe or by test-time scaling.
"""
import os, json, time, re, subprocess, tempfile, argparse
# Default HF_HOME to /workspace/hf unless the caller already exported it.
os.environ.setdefault("HF_HOME", "/workspace/hf")
# Unconditionally set TRANSFORMERS_VERBOSITY to "error" (overrides any existing value).
os.environ["TRANSFORMERS_VERBOSITY"] = "error"
import torch
from datasets import load_dataset
# Wall-clock reference taken at import time; log() and the final report measure from it.
T0 = time.time()
def log(m):
    """Print ``m`` prefixed with the seconds elapsed since ``T0``, flushing at once."""
    stamped = f"[{time.time()-T0:7.1f}s] {m}"
    print(stamped, flush=True)
def extract_code(text):
    """Return the contents of the first fenced code block in ``text``.

    Everything up to and including the first ```` ```python ```` marker is
    dropped; if there is none, the first bare ```` ``` ```` marker is used
    instead. The result is then cut at the next ```` ``` ```` (if any) and
    stripped of surrounding whitespace. Text without any fence is returned
    stripped but otherwise unchanged.
    """
    # Prefer the explicit python fence, then fall back to a bare fence.
    for fence in ("```python", "```"):
        _, found, tail = text.partition(fence)
        if found:
            text = tail
            break
    # Keep only what precedes the closing fence (or everything if there is none).
    body, _, _ = text.partition("```")
    return body.strip()
def run_python(code, timeout=15):
    """Run ``code`` as a standalone script and report whether it exited cleanly.

    The source is written to a temporary ``.py`` file and executed with
    ``python3`` using ``/tmp`` as the working directory. The temporary file is
    removed afterwards.

    Args:
        code: Python source text to execute.
        timeout: Seconds to wait before the child process is killed.

    Returns:
        True if the interpreter exited with status 0; False on a non-zero exit
        status or when the timeout expired.
    """
    # Python source files default to UTF-8, so write the script that way
    # regardless of the locale's preferred encoding.
    with tempfile.NamedTemporaryFile("w", suffix=".py", delete=False, encoding="utf-8") as f:
        f.write(code)
        path = f.name
    try:
        # Only the exit status is used, so discard the child's output instead of
        # capturing and decoding it: undecodable bytes would otherwise raise
        # UnicodeDecodeError here, and chatty programs would be buffered in memory.
        r = subprocess.run(["python3", path], stdout=subprocess.DEVNULL,
                           stderr=subprocess.DEVNULL, timeout=timeout, cwd="/tmp")
        return r.returncode == 0
    except subprocess.TimeoutExpired:
        return False
    finally:
        # Best-effort cleanup; only filesystem errors are ignored so that
        # KeyboardInterrupt / SystemExit still propagate.
        try:
            os.unlink(path)
        except OSError:
            pass
def main():
    """Score greedy and best-of-N sampling on HumanEval for a model with no adapter.

    Command-line flags: ``--model`` and ``--tag`` (required), ``--n_samples``
    (default 8) and ``--temperature`` (default 0.6). Completions are generated
    with vLLM and executed against each problem's tests via ``run_python``.
    Counts are written to ``/workspace/tts_raw/<tag>/result.json`` and a
    summary is printed to stdout.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--model", required=True)
    ap.add_argument("--n_samples", type=int, default=8)
    ap.add_argument("--temperature", type=float, default=0.6)
    ap.add_argument("--tag", required=True)
    args = ap.parse_args()
    out_dir = f"/workspace/tts_raw/{args.tag}"
    os.makedirs(out_dir, exist_ok=True)
    # vLLM / transformers are imported only after the arguments have parsed.
    from vllm import LLM, SamplingParams
    from transformers import AutoTokenizer
    log(f"loading {args.model} (no adapter)")
    tok = AutoTokenizer.from_pretrained(args.model)
    if tok.pad_token is None: tok.pad_token = tok.eos_token
    llm = LLM(model=args.model, dtype="bfloat16", gpu_memory_utilization=0.90, max_model_len=2048)
    log(f" loaded")
    he = list(load_dataset("openai_humaneval", split="test"))
    log(f" HE: {len(he)} problems")
    # Try chat-template style if available, else raw
    prompts = []
    for p in he:
        try:
            msgs = [{"role": "system", "content": "You are a Python coder. Output one ```python block only."},
                    {"role": "user", "content": p["prompt"] + "\n# Complete the function above."}]
            prompts.append(tok.apply_chat_template(msgs, tokenize=False, add_generation_prompt=True))
        except Exception:
            prompts.append(p["prompt"])
    # --- Greedy
    log("=== GREEDY pass@1 ===")
    # temperature=0 -> a single deterministic completion per prompt.
    sp_g = SamplingParams(temperature=0, max_tokens=400)
    t0 = time.time()
    g_outs = [o.outputs[0].text for o in llm.generate(prompts, sp_g, use_tqdm=False)]
    log(f" greedy gen in {time.time()-t0:.1f}s")
    g_correct = 0
    for p, raw in zip(he, g_outs):
        # Fences are stripped only when present; unfenced output is used verbatim.
        code = extract_code(raw) if "```" in raw else raw
        # A completion without "def " is treated as a continuation of the prompt;
        # one that contains "def " is run on its own, without the prompt's header.
        full = p["prompt"] + "\n" + code if "def " not in code else code
        test_code = full + "\n\n" + p["test"] + f"\n\ncheck({p['entry_point']})"
        if run_python(test_code, 15): g_correct += 1
    log(f" GREEDY pass@1: {g_correct}/{len(he)} ({100*g_correct/len(he):.1f}%)")
    # --- Best-of-N
    log(f"=== BEST-OF-{args.n_samples} (temp={args.temperature}) ===")
    sp_s = SamplingParams(temperature=args.temperature, top_p=0.95, max_tokens=400, n=args.n_samples)
    t0 = time.time()
    s_outs = llm.generate(prompts, sp_s, use_tqdm=False)
    log(f" sampling gen in {time.time()-t0:.1f}s")
    t1 = time.time()
    bN_correct = 0
    for p, outset in zip(he, s_outs):
        attempts = [o.text for o in outset.outputs]
        # Selection uses the problem's own tests: the problem is counted once as
        # soon as any one sample passes, and later samples are not executed.
        for a in attempts:
            code = extract_code(a) if "```" in a else a
            full = p["prompt"] + "\n" + code if "def " not in code else code
            test_code = full + "\n\n" + p["test"] + f"\n\ncheck({p['entry_point']})"
            if run_python(test_code, 15):
                bN_correct += 1
                break
    log(f" verify in {time.time()-t1:.1f}s")
    # Despite the "passN" suffix, these two fields hold raw counts of solved problems.
    result = {
        "model": args.model, "n_samples": args.n_samples, "temperature": args.temperature,
        "greedy_passN": g_correct, "best_of_N_passN": bN_correct, "n_total": len(he),
        "elapsed_s": time.time()-T0,
    }
    with open(f"{out_dir}/result.json", "w") as fh: json.dump(result, fh, indent=2)
    print()
    print("=" * 70)
    print(f" {args.model} (NO ADAPTER) — HumanEval")
    print(f" Greedy pass@1: {g_correct}/{len(he)} ({100*g_correct/len(he):.1f}%)")
    print(f" Best-of-{args.n_samples} pass@1: {bN_correct}/{len(he)} ({100*bN_correct/len(he):.1f}%)")
    print(f" Lift from TTS: +{bN_correct - g_correct} ({100*(bN_correct-g_correct)/len(he):.1f}pp)")
    print(f" Time: {time.time()-T0:.0f}s")
    print("=" * 70)
# Script entry point: run the evaluation only when executed directly, not on import.
if __name__ == "__main__":
    main()

165
tts/tts_scaling.py Normal file
View file

@ -0,0 +1,165 @@
"""TTS scaling sweep: pass@1 across N samples for HE + HE+ + MATH-500."""
import os, json, time, re, subprocess, tempfile, argparse
# Default HF_HOME to /workspace/hf unless the caller already exported it.
os.environ.setdefault("HF_HOME", "/workspace/hf")
# Unconditionally set TRANSFORMERS_VERBOSITY to "error" (overrides any existing value).
os.environ["TRANSFORMERS_VERBOSITY"] = "error"
import torch
from datasets import load_dataset
# Wall-clock reference taken at import time; log() and the final report measure from it.
T0 = time.time()
def log(m):
    """Print ``m`` prefixed with the seconds elapsed since ``T0``, flushing at once."""
    stamped = f"[{time.time()-T0:7.1f}s] {m}"
    print(stamped, flush=True)
def extract_code(t):
    """Return the contents of the first fenced code block in ``t``.

    Everything up to and including the first ```` ```python ```` marker is
    dropped; if there is none, the first bare ```` ``` ```` marker is used
    instead. The result is then cut at the next ```` ``` ```` (if any) and
    stripped of surrounding whitespace. Text without any fence is returned
    stripped but otherwise unchanged.
    """
    # Prefer the explicit python fence, then fall back to a bare fence.
    for fence in ("```python", "```"):
        _, found, tail = t.partition(fence)
        if found:
            t = tail
            break
    # Keep only what precedes the closing fence (or everything if there is none).
    body, _, _ = t.partition("```")
    return body.strip()
def run_python(code, timeout=10):
    """Run ``code`` as a standalone script and report whether it exited cleanly.

    The source is written to a temporary ``.py`` file and executed with
    ``python3`` using ``/tmp`` as the working directory. The temporary file is
    removed afterwards.

    Args:
        code: Python source text to execute.
        timeout: Seconds to wait before the child process is killed.

    Returns:
        True if the interpreter exited with status 0; False on a non-zero exit
        status or when the timeout expired.
    """
    # Python source files default to UTF-8, so write the script that way
    # regardless of the locale's preferred encoding.
    with tempfile.NamedTemporaryFile("w", suffix=".py", delete=False, encoding="utf-8") as f:
        f.write(code)
        path = f.name
    try:
        # Only the exit status is used, so discard the child's output instead of
        # capturing and decoding it: undecodable bytes would otherwise raise
        # UnicodeDecodeError here, and chatty programs would be buffered in memory.
        r = subprocess.run(["python3", path], stdout=subprocess.DEVNULL,
                           stderr=subprocess.DEVNULL, timeout=timeout, cwd="/tmp")
        return r.returncode == 0
    except subprocess.TimeoutExpired:
        return False
    finally:
        # Best-effort cleanup; only filesystem errors are ignored so that
        # KeyboardInterrupt / SystemExit still propagate.
        try:
            os.unlink(path)
        except OSError:
            pass
def extract_boxed(text):
    r"""Return the stripped contents of the last ``\boxed{...}`` in ``text``.

    Nested braces inside the box are balanced, so ``\boxed{\frac{1}{2}}``
    yields ``\frac{1}{2}``. Returns None when there is no ``\boxed{`` or when
    its braces never close.
    """
    marker = "\\boxed{"
    open_at = text.rfind(marker)
    if open_at < 0:
        return None
    start = open_at + len(marker)
    # Walk forward tracking brace depth; the box ends where depth returns to 0.
    depth = 1
    for pos in range(start, len(text)):
        ch = text[pos]
        if ch == "{":
            depth += 1
        elif ch == "}":
            depth -= 1
            if depth == 0:
                return text[start:pos].strip()
    return None
def main():
    """Sweep best-of-k for k in {1, 2, 4, 8, 16, 32} on HumanEval and MATH-500.

    Command-line flags (all required): ``--model``, ``--tag`` and
    ``--out_dir``. 32 samples are drawn once per task at temperature 0.6,
    each sample is verified, and for every k the number of tasks with a
    passing sample among the first k is reported. Writes
    ``<out_dir>/result.json`` and prints a table to stdout.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--model", required=True)
    ap.add_argument("--tag", required=True)
    ap.add_argument("--out_dir", required=True)
    args = ap.parse_args()
    os.makedirs(args.out_dir, exist_ok=True)
    # vLLM / transformers are imported only after the arguments have parsed.
    from vllm import LLM, SamplingParams
    from transformers import AutoTokenizer
    log(f"loading {args.model}")
    tok = AutoTokenizer.from_pretrained(args.model)
    if tok.pad_token is None: tok.pad_token = tok.eos_token
    llm = LLM(model=args.model, dtype="bfloat16", gpu_memory_utilization=0.85, max_model_len=2048)
    log("loaded")
    he = list(load_dataset("openai_humaneval", split="test"))
    # Only the first 200 rows of the MATH-500 test split are evaluated here.
    math500 = list(load_dataset("HuggingFaceH4/MATH-500", split="test"))[:200]
    # Build prompts
    he_prompts = []
    for p in he:
        try:
            msgs = [{"role": "system", "content": "You are a Python coder. Output one ```python block only."},
                    {"role": "user", "content": p["prompt"] + "\n# Complete the function above."}]
            he_prompts.append(tok.apply_chat_template(msgs, tokenize=False, add_generation_prompt=True))
        except Exception:
            he_prompts.append(p["prompt"])
    math_prompts = []
    # Doubled braces survive str.format as literal "{...}" in the prompt.
    UTMPL = "Solve this competition math problem. End with \\boxed{{...}}.\n\nProblem: {p}\n\nSolution:"
    for p in math500:
        try:
            msgs = [{"role": "system", "content": "Math solver. End with \\boxed{answer}."},
                    {"role": "user", "content": UTMPL.format(p=p["problem"])}]
            math_prompts.append(tok.apply_chat_template(msgs, tokenize=False, add_generation_prompt=True))
        except Exception:
            math_prompts.append(UTMPL.format(p=p["problem"]))
    # Generate max-N samples ONCE per task (N=32), then compute pass@k for k ∈ {1, 2, 4, 8, 16, 32}
    MAX_N = 32
    sp = SamplingParams(temperature=0.6, top_p=0.95, max_tokens=600, n=MAX_N)
    log(f"generating MAX_N={MAX_N} samples per task")
    t0 = time.time()
    he_outs = llm.generate(he_prompts, sp, use_tqdm=False)
    log(f" HE gen in {time.time()-t0:.1f}s")
    t0 = time.time()
    math_outs = llm.generate(math_prompts, sp, use_tqdm=False)
    log(f" MATH gen in {time.time()-t0:.1f}s")
    # Compute correctness for each sample
    def he_correct(p, raw):
        """Return True if completion ``raw`` passes problem ``p``'s HumanEval tests."""
        code = extract_code(raw) if "```" in raw else raw
        # A completion without "def " is treated as a continuation of the prompt;
        # one that contains "def " is run on its own, without the prompt's header.
        full = p["prompt"] + "\n" + code if "def " not in code else code
        test_code = full + "\n\n" + p["test"] + f"\n\ncheck({p['entry_point']})"
        return run_python(test_code, 10)
    log("verifying HE samples...")
    he_results = [] # per task: list of bool
    for p, outset in zip(he, he_outs):
        per_task = []
        for o in outset.outputs:
            per_task.append(he_correct(p, o.text))
        he_results.append(per_task)
    log(f" HE verify done")
    import sympy
    from sympy.parsing.latex import parse_latex
    def sympy_eq(a, b):
        """Return True if answers ``a`` and ``b`` match as strings, symbolically, or as floats."""
        # NOTE(review): only whitespace is stripped here; nothing in this function
        # removes "$", \text{...} or thousands separators before comparing.
        if a is None or b is None: return False
        a, b = a.strip(), b.strip()
        if a == b: return True
        try:
            if sympy.simplify(parse_latex(a) - parse_latex(b)) == 0: return True
        except Exception: pass
        try:
            if abs(float(a) - float(b)) < 1e-6: return True
        except Exception: pass
        return False
    log("verifying MATH samples...")
    math_results = []
    for p, outset in zip(math500, math_outs):
        per_task = []
        for o in outset.outputs:
            pred = extract_boxed(o.text)
            per_task.append(sympy_eq(pred, p["answer"]))
        math_results.append(per_task)
    log(f" MATH verify done")
    # Compute pass@k for each k
    NS = [1, 2, 4, 8, 16, 32]
    def best_of_k(results, k):
        """Count tasks whose first ``k`` samples contain at least one pass."""
        # NOTE(review): this is "any of the first k samples", computed on a single
        # ordering of the samples — it is not an averaged/unbiased pass@k estimate.
        return sum(1 for r in results if any(r[:k]))
    he_scaling = {k: best_of_k(he_results, k) for k in NS}
    math_scaling = {k: best_of_k(math_results, k) for k in NS}
    # The integer k keys of the two scaling dicts become strings once serialized to JSON.
    result = {
        "model": args.model, "tag": args.tag, "MAX_N": MAX_N,
        "humaneval_total": len(he),
        "math500_total": len(math500),
        "he_pass_at_k": he_scaling,
        "math500_pass_at_k": math_scaling,
        "elapsed_s": time.time() - T0,
    }
    with open(f"{args.out_dir}/result.json", "w") as fh: json.dump(result, fh, indent=2)
    print()
    print("=" * 70)
    print(f" {args.model} — TTS SCALING SWEEP")
    print(f" N HE MATH-500")
    for k in NS:
        print(f" {k:>3} {he_scaling[k]:>3}/{len(he)} ({100*he_scaling[k]/len(he):.1f}%) "
              f"{math_scaling[k]:>3}/{len(math500)} ({100*math_scaling[k]/len(math500):.1f}%)")
    print(f" Time: {time.time()-T0:.0f}s")
    print("=" * 70)
# Script entry point: run the evaluation only when executed directly, not on import.
if __name__ == "__main__":
    main()