Doc-to-LoRA release

This commit is contained in:
51616 2026-02-27 03:47:04 +00:00
commit 1abe8ae16d
92 changed files with 22131 additions and 0 deletions

View file

@ -0,0 +1,42 @@
import gc
from datasets import Dataset, load_dataset
from tqdm import tqdm
if __name__ == "__main__":
    # Re-pack ucinlp/drop: one record per unique passage, carrying all of its
    # question/answer pairs, saved as parquet per split.
    ds_name = "ucinlp/drop"
    for split in ["train", "validation"]:
        ctx_qa_dict = {}
        ds = load_dataset(ds_name, split=split)
        print(f"Original size: {len(ds)}")
        # Iterate samples directly (index was unused); give tqdm a total.
        for sample in tqdm(ds, total=len(ds)):
            ctx = sample["passage"]
            # First time we see this passage, start empty QA lists for it.
            if ctx not in ctx_qa_dict:
                ctx_qa_dict[ctx] = {"prompts": [], "responses": []}
            question = sample["question"]
            # DROP stores answers as span lists; keep the first span only.
            answer = sample["answers_spans"]["spans"][0]
            ctx_qa_dict[ctx]["prompts"].append(question)
            ctx_qa_dict[ctx]["responses"].append(answer)
        print(f"Unique contexts: {len(ctx_qa_dict)}")
        # convert ctx_qa_dict to a list of dictionaries
        samples = [
            {
                "context": ctx,
                "prompts": qa["prompts"],
                "responses": qa["responses"],
            }
            for ctx, qa in ctx_qa_dict.items()
        ]
        print(f"Sampled data: {samples[0]}")
        # save to a new dataset
        ds = Dataset.from_list(samples)
        save_path = f"./data/raw_datasets/drop_compact/{split}/ds.parquet"
        print(f"Saving dataset to {save_path}")
        ds.to_parquet(save_path)
        print("=" * 80)
        # Free the per-split objects before the next iteration.
        del ds, samples, ctx_qa_dict
        gc.collect()

43
data/build_pwc_compact.py Normal file
View file

@ -0,0 +1,43 @@
import gc
from datasets import Dataset, load_dataset
from tqdm import tqdm
if __name__ == "__main__":
    # Re-pack sggetao/PwC: one record per unique input context, carrying all of
    # its prompt/answer pairs, saved as parquet per split.
    ds_name = "sggetao/PwC"
    for split in ["train", "test"]:
        ctx_qa_dict = {}
        ds = load_dataset(ds_name, split=split)
        print(f"Original size: {len(ds)}")
        # Iterate samples directly (index was unused); give tqdm a total.
        for sample in tqdm(ds, total=len(ds)):
            ctx = sample["input"]
            # First time we see this context, start empty QA lists for it.
            if ctx not in ctx_qa_dict:
                ctx_qa_dict[ctx] = {"prompts": [], "responses": []}
            ctx_qa_dict[ctx]["prompts"].append(sample["prompt"])
            ctx_qa_dict[ctx]["responses"].append(sample["answer"])
        print(f"Unique contexts: {len(ctx_qa_dict)}")
        # convert ctx_qa_dict to a list of dictionaries
        samples = [
            {
                "context": ctx,
                "prompts": qa["prompts"],
                "responses": qa["responses"],
            }
            for ctx, qa in ctx_qa_dict.items()
        ]
        print(f"Sampled data: {samples[0]}")
        # save to a new dataset
        ds = Dataset.from_list(samples)
        save_path = f"./data/raw_datasets/pwc_compact/{split}/ds.parquet"
        print(f"Saving dataset to {save_path}")
        ds.to_parquet(save_path)
        print("=" * 80)
        # Free the per-split objects before the next iteration.
        del ds, samples, ctx_qa_dict
        gc.collect()

View file

@ -0,0 +1,45 @@
import gc
from datasets import Dataset, load_dataset
from tqdm import tqdm
if __name__ == "__main__":
    # Re-pack allenai/ropes: the context is background + situation; one record
    # per unique context with all of its QA pairs, saved as parquet per split.
    ds_name = "allenai/ropes"
    for split in ["train", "validation"]:
        ctx_qa_dict = {}
        ds = load_dataset(ds_name, split=split)
        print(f"Original size: {len(ds)}")
        # Hoisted out of the loop: the template never changes per sample.
        ctx_template = "{background}\n{situation}"
        # Iterate samples directly (index was unused); give tqdm a total.
        for sample in tqdm(ds, total=len(ds)):
            # ROPES answers are lists of spans; keep the first answer text.
            response = sample["answers"]["text"][0]
            ctx = ctx_template.format(
                background=sample["background"], situation=sample["situation"]
            )
            q = sample["question"]
            # First time we see this context, start empty QA lists for it.
            if ctx not in ctx_qa_dict:
                ctx_qa_dict[ctx] = {"prompts": [], "responses": []}
            ctx_qa_dict[ctx]["prompts"].append(q)
            ctx_qa_dict[ctx]["responses"].append(response)
        print(f"Unique contexts: {len(ctx_qa_dict)}")
        # convert ctx_qa_dict to a list of dictionaries
        samples = [
            {
                "context": ctx,
                "prompts": qa["prompts"],
                "responses": qa["responses"],
            }
            for ctx, qa in ctx_qa_dict.items()
        ]
        print(f"Sampled data: {samples[0]}")
        # save to a new dataset
        ds = Dataset.from_list(samples)
        save_path = f"./data/raw_datasets/ropes_compact/{split}/ds.parquet"
        print(f"Saving dataset to {save_path}")
        ds.to_parquet(save_path)
        print("=" * 80)
        # Free the per-split objects before the next iteration.
        del ds, samples, ctx_qa_dict
        gc.collect()

View file

@ -0,0 +1,42 @@
import gc
from datasets import Dataset, load_dataset
from tqdm import tqdm
if __name__ == "__main__":
    # Re-pack the local SQuAD copy: one record per unique context with all of
    # its question/answer pairs, saved as parquet per split.
    ds_name = "data/raw_datasets/squad"
    for split in ["train", "validation"]:
        ctx_qa_dict = {}
        ds = load_dataset(ds_name, split=split)
        print(f"Original size: {len(ds)}")
        # Iterate samples directly (index was unused); give tqdm a total.
        for sample in tqdm(ds, total=len(ds)):
            ctx = sample["context"]
            # First time we see this context, start empty QA lists for it.
            if ctx not in ctx_qa_dict:
                ctx_qa_dict[ctx] = {"prompts": [], "responses": []}
            question = sample["question"]
            # SQuAD answers are lists of spans; keep the first answer text.
            answer = sample["answers"]["text"][0]
            ctx_qa_dict[ctx]["prompts"].append(question)
            ctx_qa_dict[ctx]["responses"].append(answer)
        print(f"Unique contexts: {len(ctx_qa_dict)}")
        # convert ctx_qa_dict to a list of dictionaries
        samples = [
            {
                "context": ctx,
                "prompts": qa["prompts"],
                "responses": qa["responses"],
            }
            for ctx, qa in ctx_qa_dict.items()
        ]
        print(f"Sampled data: {samples[0]}")
        # save to a new dataset
        ds = Dataset.from_list(samples)
        save_path = f"./data/raw_datasets/squad_compact/{split}/ds.parquet"
        print(f"Saving dataset to {save_path}")
        ds.to_parquet(save_path)
        print("=" * 80)
        # Free the per-split objects before the next iteration.
        del ds, samples, ctx_qa_dict
        gc.collect()

View file

@ -0,0 +1,10 @@
from huggingface_hub import snapshot_download
if __name__ == "__main__":
    # Fetch only the 100BT sample shard of FineWeb-Edu into the local
    # raw-datasets directory; everything else in the repo is skipped.
    fw_dir = "./data/raw_datasets/fineweb_edu/"
    snapshot_download(
        repo_id="HuggingFaceFW/fineweb-edu",
        repo_type="dataset",
        local_dir=fw_dir,
        allow_patterns="sample/100BT/*",
    )

View file

@ -0,0 +1,288 @@
import argparse
import json
import math
import os
import random
# -----------------------------
# Config knobs (edit or use CLI)
# -----------------------------
TOKENS_PER_BLOCK = 40  # rough heuristic tokens per noise block
BASE_SAMPLES_PER_BIN = (
    320_000  # training samples budget scaler only (val/test fixed at 1000 each)
)
RNG_SEED = 42  # default for --seed
# Filler sentence block, repeated to pad contexts to a target length.
NOISE_BLOCK = "The grass is green. The sky is blue. The sun is yellow. Here we go. There and back again."
# The "needle" sentence; {magic_number} is filled with a 4-digit string.
SPECIAL_TPL = "The special magic number is {magic_number}."
SEP = "\n"  # between blocks
def save_jsonl(data: list[dict], filepath: str) -> None:
    """Write *data* to *filepath* as JSON Lines, creating parent dirs as needed.

    Args:
        data: List of JSON-serializable dicts, one per output line.
        filepath: Destination path; its parent directory is created if missing.
    """
    parent_dir = os.path.dirname(filepath)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)
    # Explicit UTF-8 so the output does not depend on the platform's locale.
    with open(filepath, "w", encoding="utf-8") as f:
        for entry in data:
            json.dump(entry, f)
            f.write("\n")
essential_digits4 = lambda: f"{random.randint(0, 9_999):04d}"
def _choose_position(total_blocks: int, depth_bin: int) -> int:
"""Choose an insertion index for the special sentence within [0, total_blocks-1]
such that its relative depth falls within the depth bin [i/10, (i+1)/10).
"""
if total_blocks <= 0:
return 0
# Use floor for start and ceil for end to cover boundaries evenly
start = math.floor(total_blocks * (depth_bin / 10))
end = math.ceil(total_blocks * ((depth_bin + 1) / 10)) - 1
# clamp
start = max(0, min(start, total_blocks - 1))
end = max(start, min(end, total_blocks - 1))
return random.randint(start, end)
def _build_example(total_blocks: int, depth_bin: int) -> dict:
    """Build a single sample whose context hides one magic-number sentence.

    total_blocks: number of blocks in the final context (special line included)
    depth_bin: integer in [0, 9] selecting the insertion-depth decile
    """
    total_blocks = max(1, total_blocks)
    # Draw the needle first, then its slot — this keeps the RNG call order
    # identical for every example.
    magic = essential_digits4()
    needle = SPECIAL_TPL.format(magic_number=magic)
    insert_at = _choose_position(total_blocks, depth_bin)
    # Pad with identical noise blocks and drop the needle in at the chosen
    # slot; with total_blocks == 1 this degenerates to just the needle.
    block_seq = [NOISE_BLOCK] * (total_blocks - 1)
    block_seq.insert(insert_at, needle)
    return {
        "context": SEP.join(block_seq),
        "prompt": "What is the special magic number? Reply with only the number.",
        "response": magic,
    }
def generate_examples(n: int, k: int) -> list[dict]:
    """Generate n examples (all with block length k) spread evenly over the 10
    depth bins, then shuffle them.
    """
    if n <= 0:
        return []
    # The first (n % 10) bins absorb the remainder so counts sum to n exactly.
    quota, extra = divmod(n, 10)
    out: list[dict] = []
    for depth_bin in range(10):
        count = quota + (1 if depth_bin < extra else 0)
        out.extend(
            _build_example(total_blocks=k, depth_bin=depth_bin)
            for _ in range(count)
        )
    random.shuffle(out)
    return out
def main():
    """CLI entry point: build the magic-number needle-in-a-haystack datasets,
    one output directory per token-length bin, as train/val/test JSONL files.
    """
    parser = argparse.ArgumentParser(
        description="Generate noise-wrapped special magic number dataset (similar structure to generate_ctx_kv.py)",
    )
    parser.add_argument("--seed", type=int, default=RNG_SEED, help="Random seed")
    parser.add_argument(
        "--tokenizer-name",
        type=str,
        default="google/gemma-2-2b-it",
        help=("Tokenizer name"),
    )
    parser.add_argument(
        "--base-samples-per-bin",
        type=int,
        default=BASE_SAMPLES_PER_BIN,
        help="Baseline number of TRAINING samples per token bin (scaled by bin width). Validation & test are always 1000 each.",
    )
    parser.add_argument(
        "--out-prefix",
        type=str,
        default="data/raw_datasets/ctx_magic_number",
        help="Output directory prefix (bin range will be appended)",
    )
    parser.add_argument(
        "--tokens-per-block",
        "--tokens-per-pair",
        dest="tokens_per_block",
        type=int,
        default=TOKENS_PER_BLOCK,
        help="Heuristic tokens per noise block for bucketing",
    )
    parser.add_argument(
        "--only-first-n-bins",
        type=int,
        default=None,
        help="For quick tests: only generate the first N token bins",
    )
    parser.add_argument(
        "--dry-run",
        action="store_true",
        help="Print a small sample and exit without writing files",
    )
    args = parser.parse_args()
    random.seed(args.seed)
    # ----------------------------------------------------
    # Optional: report tokenizer-based token length stats
    # ----------------------------------------------------
    if args.tokenizer_name:
        # transformers is imported lazily so the script can run without it
        # when no tokenizer is requested.
        try:
            from transformers import AutoTokenizer  # type: ignore
        except Exception as e:  # pragma: no cover
            raise RuntimeError(
                "Failed to import transformers. Install it or omit --tokenizer-name."
            ) from e
        tokenizer = AutoTokenizer.from_pretrained(args.tokenizer_name, use_fast=True)
        noise_token_count = len(tokenizer(NOISE_BLOCK).input_ids)
        special_example = SPECIAL_TPL.format(magic_number="0000")
        special_token_count = len(tokenizer(special_example).input_ids)
        print(
            f"[Tokenizer: {args.tokenizer_name}] Noise block tokens: {noise_token_count} | Special line tokens: {special_token_count}"
        )
    # Token-count bins: a few small/overlapping warm-up bins, then 1K-wide
    # bins up to 16K, then progressively wider bins up to ~128K.
    tok_bins = [(32, 128), (128, 256), (256, 512), (512, 1024), (32, 1024)] + [
        (1024 * i, 1024 * (i + 1)) for i in range(1, 16)
    ]
    tok_bins += [(2**14 + 2**12 * (i), 2**14 + 2**12 * (i + 1)) for i in range(4)]
    tok_bins += [(2**15 + 2**13 * (i), 2**15 + 2**13 * (i + 1)) for i in range(12)]
    if args.only_first_n_bins is not None:
        tok_bins = tok_bins[: args.only_first_n_bins]
    if args.tokenizer_name:
        # Map token bins to block-count bins by measuring actual tokenized
        # context lengths for increasing block counts.
        max_hi = max(hi for _, hi in tok_bins)

        def measure_len(k: int) -> int:
            # Token length of a context with k blocks (k-1 noise + 1 needle).
            if k == 1:
                ctx = SPECIAL_TPL.format(magic_number="0000")
            else:
                blocks = [NOISE_BLOCK] * (k - 1) + [
                    SPECIAL_TPL.format(magic_number="0000")
                ]
                ctx = SEP.join(blocks)
            return len(tokenizer(ctx).input_ids)

        # lengths[k] = token count of a k-block context; index 0 is padding
        # so block counts index directly.
        lengths: list[int] = [0]
        k = 1
        while True:
            L = measure_len(k)
            lengths.append(L)
            if L >= max_hi:
                break
            k += 1
        len_bins = []
        for lo, hi in tok_bins:
            # Smallest block count whose token length reaches the bin floor.
            k_lo = None
            for kk in range(1, len(lengths)):
                if lengths[kk] >= lo:
                    k_lo = kk
                    break
            if k_lo is None or lengths[k_lo] >= hi:
                # No block count lands inside this bin; mark it empty.
                len_bins.append((0, 0))
                continue
            # First block count that overshoots the bin ceiling (exclusive).
            k_hi = len(lengths)
            for kk in range(k_lo, len(lengths)):
                if lengths[kk] >= hi:
                    k_hi = kk
                    break
            len_bins.append((k_lo, k_hi))
        base_tokens = lengths[1]
        delta = (lengths[2] - lengths[1]) if len(lengths) > 2 else 0
        print(
            f"Using tokenizer-measured block ranges. base_tokens={base_tokens} approx_delta={delta}"
        )
    else:
        # No tokenizer: fall back to the per-block token heuristic.
        len_bins = [
            (lo // args.tokens_per_block, hi // args.tokens_per_block)
            for (lo, hi) in tok_bins
        ]
    if args.dry_run:
        # Print one sample from the first non-empty bin and exit.
        for lb in len_bins:
            if lb[1] > lb[0]:
                k = max(1, lb[0])
                sample = generate_examples(10, k)
                print("Sample entry:")
                print(json.dumps(sample[0], indent=2))
                break
        return
    # -----------------------------------------------
    # Main generation per token bin
    # -----------------------------------------------
    TARGET_VAL = 1000
    TARGET_TEST = 1000
    for len_bin, tok_bin in zip(len_bins, tok_bins):
        if len_bin[1] <= len_bin[0]:
            print(f"Skipping token bin {tok_bin} (no valid block counts)")
            continue
        k_start = max(1, len_bin[0])
        k_end = max(1, len_bin[1])
        k_values = list(range(k_start, k_end))
        bin_size = len(k_values)
        save_dir = f"{args.out_prefix}_{tok_bin[0]}_{tok_bin[1]}"
        # Only bins up to 1024 tokens get a training split.
        training_enabled = tok_bin[1] <= 1024  # unchanged policy
        if training_enabled:
            train_data: list[dict] = []
            # Distribute training budget across k values.
            # Scale: per_k = base_samples_per_bin / bin_size
            per_k_train = max(1, args.base_samples_per_bin // max(1, bin_size))
            for k in k_values:
                train_data += generate_examples(per_k_train, k)
        val_data: list[dict] = []
        test_data: list[dict] = []
        # Split the fixed 1000-sample val/test budgets across block counts;
        # the first `rem_*` counts take one extra sample each.
        base_val = TARGET_VAL // bin_size
        rem_val = TARGET_VAL % bin_size
        base_test = TARGET_TEST // bin_size
        rem_test = TARGET_TEST % bin_size
        for idx, k in enumerate(k_values):
            n_val_k = base_val + (1 if idx < rem_val else 0)
            n_test_k = base_test + (1 if idx < rem_test else 0)
            if n_val_k:
                val_data += generate_examples(n_val_k, k)
            if n_test_k:
                test_data += generate_examples(n_test_k, k)
        random.shuffle(val_data)
        random.shuffle(test_data)
        os.makedirs(save_dir, exist_ok=True)
        if training_enabled:
            save_jsonl(train_data, f"{save_dir}/train.jsonl")
        save_jsonl(val_data, f"{save_dir}/val.jsonl")
        save_jsonl(test_data, f"{save_dir}/test.jsonl")
        if training_enabled:
            print(
                f"Dataset generated at {save_dir} (train={len(train_data)} val={len(val_data)} test={len(test_data)})"
            )
        else:
            print(
                f"Dataset (val/test only) generated at {save_dir} (val={len(val_data)} test={len(test_data)})"
            )


if __name__ == "__main__":
    main()

View file

@ -0,0 +1,269 @@
import argparse
import os
import re
from glob import glob
import pandas as pd
from datasets import Dataset, load_dataset
from vllm import LLM, SamplingParams
# Per-model end-of-response markers: a completion must end with one of these
# strings to count as finished (see check_should_skip).
STOP_STRINGS = {
    "google/gemma-3-12b-it": ["<eos>", "<end_of_turn>"],
}
# System message framing the model as a reading-comprehension QA writer.
SYSTEM_TEMPLATE = (
    "You are a creative and helpful assistant.\n"
    "You are tasked with generating questions for reading comprehension tests.\n"
    "You will be given a context and you need to generate questions and corresponding answers from the given context.\n"
    "The questions should be highly specific to the information provided in the context, not general questions that suit any context.\n"
    "**DO NOT** hallucinate or make up information."
)
# based on Make Your LLM Fully Utilize the Context (https://arxiv.org/pdf/2404.16811)
# User prompt: {context} and {n_qa_pairs} are filled by get_prompt; the doubled
# braces ({{question_1}}, ...) survive .format() as literal placeholders.
PROMPT_TEMPLATE = (
    "### Instructions ###\n"
    "Generate questions and corresponding answers from the given context. The questions should be highly specific to the "
    "information provided in the context, not general questions that suit any context.\n\n"
    "### Context ###\n"
    "{context}\n\n\n"
    "### Rules ###\n"
    "Rules to follow when generating the questions:\n"
    "1. The questions must be specific to the given context and fully answerable from information present in the given context.\n"
    "2. Ask questions that are fact-seeking based on the information provided.\n"
    "3. Make sure the questions are clear and unambiguous.\n"
    "4. Phrases like 'based on the provided context', 'according to the context', 'in the context', etc., are **NOT ALLOWED** to appear in "
    "the questions.\n"
    "5. The questions should not overlap. They should be diverse, covering many aspects of the context.\n"
    "6. Do not give away too much information in the questions. For example, ask 'Who is X?' instead of 'Who is X that did Y?' when Y is clear from the context.\n"
    "7. Ignore the text formatting of the context, e.g., bold, italic, underline, etc.\n"
    "8. Ignore typos, spacing, and grammatical errors in the context.\n\n"
    "Rules to follow when generating the answers:\n"
    "1. The answers must use the (implied) information provided in the context.\n"
    "2. Phrases like 'based on the provided context', 'according to the context', 'in the context', etc., are **NOT ALLOWED** to appear in "
    "the answers.\n"
    "3. Do not just copy words from the context. Answer the question in your own words.\n"
    "4. The answers should be detailed and comprehensive. Please include additional specific details from the context.\n\n"
    "Respond with {n_qa_pairs} question-answer pairs.\n"
    "Always use proper grammar and punctuation.\n"
    "Try to use different question forms and styles.\n"
    "Use simple words and make sure that the answers are clear and comprehensive.\n\n"
    "The question-answer pairs should be in the following format:\n"
    "Question 1: {{question_1}}\n"
    "Answer 1: {{answer_1}}\n"
    "Question 2: {{question_2}}\n"
    "Answer 2: {{answer_2}}\n"
    "..."
)
def get_prompt(context, n_qa_pairs):
    """Fill the QA-generation prompt template with a context and pair count."""
    return PROMPT_TEMPLATE.format(context=context, n_qa_pairs=n_qa_pairs)
def check_should_skip(txt: str, vllm_model: str) -> tuple[str, bool]:
    """Strip a trailing stop string from *txt* and report truncation.

    Args:
        txt: A raw model completion.
        vllm_model: Key into STOP_STRINGS for this model's stop markers.
    Returns:
        (cleaned_text, skip): ``skip`` is True when no known stop string
        terminates the text, i.e. the generation was likely cut off.

    Fix: the original annotated the return type as ``bool`` although a tuple
    is always returned; also replaced the tail-slice membership test with the
    equivalent, clearer ``str.endswith``.
    """
    for stop in STOP_STRINGS[vllm_model]:
        if txt.endswith(stop):
            return (txt.split(stop)[0], False)  # Found a valid stop string
    return (txt, True)  # No valid stop string found, skip this response
def postprocess_qa_pairs(res_txt: str) -> tuple[list[str], list[str]]:
    """
    Postprocesses the QA pairs from the response text.
    Parses "Question N: ... Answer N: ..." formatted text into parallel lists,
    dropping empty pairs and a truncated final pair.
    Args:
        res_txt: The response text.
    Returns:
        A tuple of two lists, the first containing the questions and the second containing the answers.
    """
    # capture everything after each "Question {number}:" until "Answer"
    res_txt = remove_think(res_txt)
    q_pattern = r"Question \d+:(.*?)(?=Answer|$)"  # thanks chatgpt
    questions = re.findall(q_pattern, res_txt, flags=re.S)
    a_pattern = r"Answer \d+:(.*?)(?=Question|$)"  # thanks chatgpt
    answers = re.findall(a_pattern, res_txt, flags=re.S)
    if len(questions) != len(answers):
        print(f"Warning---number of questions and answers do not match")
        print(f"Number of questions: {len(questions)}")
        print(f"Number of answers: {len(answers)}")
    out_q = []
    out_a = []
    n_skips = 0
    if (len(questions) > 0) and (len(answers) > 0):
        # Pair up only as many as both lists provide.
        n_gen_pairs = min(len(questions), len(answers))
        has_left_over = n_gen_pairs < len(questions) or n_gen_pairs < len(answers)
        for i in range(n_gen_pairs):
            response = answers[i].strip()
            question = questions[i].strip()
            if not response or not question:
                print(f"Skipping empty question or answer at index {i}")
                continue
            # Only the very last pair can be truncated by the token budget;
            # verify it ended with a proper stop string before keeping it.
            if (not has_left_over) and (i == n_gen_pairs - 1):
                # NOTE(review): reads the module-level global `vllm_model`,
                # which is only bound in the __main__ block — this function
                # would fail if imported standalone; confirm intended.
                response, skip = check_should_skip(response, vllm_model)
                if skip:
                    print(f"Skipping due to missing stop string")
                    n_skips += 1
                    continue
            out_q.append(question.strip())
            out_a.append(response.strip())
    print(f"Skipped {n_skips} responses due to missing stop strings")
    return out_q, out_a
def length_filter(sample, min_len, max_len):
    """Keep samples whose "text" character length lies in [min_len, max_len]."""
    n_chars = len(sample["text"])
    return min_len <= n_chars and n_chars <= max_len
def remove_think(txt):
    """Return the text after the last "</think>" tag (unchanged if absent)."""
    _, _, tail = txt.rpartition("</think>")
    return tail if tail or "</think>" in txt else txt
if __name__ == "__main__":
    # Level-0 pipeline: for each FineWeb-Edu parquet shard matching the
    # pattern, batch-generate QA pairs with vLLM and save train/val shards.
    parser = argparse.ArgumentParser(
        description="Generate QA pairs from FineWeb Edu dataset"
    )
    parser.add_argument(
        "--vllm_model",
        type=str,
        default=os.environ.get("vllm_model", "google/gemma-2-27b-it"),
        help="VLLM model to use for generation",
    )
    parser.add_argument(
        "--shard_pattern",
        type=str,
        required=True,
        help="Pattern to match shard files (e.g., '000_0000*')",
    )
    parser.add_argument(
        "--n_qa_pairs",
        type=int,
        required=True,
        help="Number of question-answer pairs to generate per context",
    )
    parser.add_argument(
        "--min_length",
        type=int,
        default=0,
        help="Minimum length of the context to consider for generation",
    )
    parser.add_argument(
        "--max_length",
        type=int,
        default=2000,
        help="Maximum length of the context to consider for generation",
    )
    parser.add_argument(
        "--max_model_length",
        type=int,
        default=2**14,
        help="Maximum length of the model input (context + prompt + response) in tokens",
    )
    parser.add_argument(
        "--debug",
        action="store_true",
        help="Debug mode - process only first 100 samples",
    )
    args = parser.parse_args()
    vllm_model = args.vllm_model
    print(f"Using model: {vllm_model}")
    llm_kwargs = dict(
        model=vllm_model,
        dtype="bfloat16",
        enable_prefix_caching=True,
        enable_chunked_prefill=True,
        max_model_len=args.max_model_length,
        limit_mm_per_prompt={"image": 0},
    )
    llm = LLM(**llm_kwargs)
    # NOTE(review): `tokenizer` is never used below — dead assignment?
    tokenizer = llm.get_tokenizer()
    shard_pattern = args.shard_pattern
    n_qa_pairs = args.n_qa_pairs
    paths = glob(
        f"./data/raw_datasets/fineweb_edu/sample/100BT/{shard_pattern}.parquet"
    )
    split = "train[:100]" if args.debug else "train"
    for path in paths:
        ds = load_dataset(
            "parquet",
            data_files=path,
            split=split,
        )
        # Keep only contexts within the requested character-length window.
        ds = ds.filter(
            length_filter,
            fn_kwargs={"min_len": args.min_length, "max_len": args.max_length},
            num_proc=8,
        )
        ctxs = [sample["text"] for sample in iter(ds)]
        # One chat per context: shared system message + per-context user prompt.
        messages = [
            [
                {"role": "system", "content": SYSTEM_TEMPLATE},
                {"role": "user", "content": get_prompt(ctx, n_qa_pairs)},
            ]
            for ctx in ctxs
        ]
        print(f"Generating from {len(messages)} contexts")
        completions = llm.chat(
            messages,
            sampling_params=SamplingParams(
                max_tokens=2048,
                temperature=0.0,
                # needed for checking if stop tokens are present
                skip_special_tokens=False,
                include_stop_str_in_output=True,
            ),
        )
        samples = []
        for ctx, completion in zip(ctxs, completions):
            questions, answers = postprocess_qa_pairs(completion.outputs[0].text)
            samples.append(
                {
                    "context": ctx,
                    "prompts_level_0": questions,
                    "responses_level_0": answers,
                }
            )
            if args.debug:
                print(f"{ctx=}")
                print(f"{completion.outputs[0].text=}")
                for q, a in zip(questions, answers):
                    print(f"{q=}")
                    print(f"{a=}")
                    print()
                print("=" * 80)
        print(f"Generated {len(samples)} samples")
        df = pd.DataFrame(samples)
        ds = Dataset.from_pandas(df)
        # Hold out the first 10 rows as a validation shard.
        val_ds = ds.take(10)
        ds = ds.skip(10)
        shard_name = path.split("/")[-1].split(".")[0]
        shard_name += "_level_0"
        if args.debug:
            shard_name += "_debug"
        ds.to_parquet(
            f"data/raw_datasets/fw_qa_v2/min_{args.min_length}_to_{args.max_length}/{shard_name}.parquet"
        )
        val_ds.to_parquet(
            f"data/raw_datasets/fw_qa_v2/min_{args.min_length}_to_{args.max_length}/{shard_name}_val.parquet"
        )
        print(
            f"Saved to data/raw_datasets/fw_qa_v2/min_{args.min_length}_to_{args.max_length}/{shard_name}.parquet"
        )
        print(
            f"Saved to data/raw_datasets/fw_qa_v2/min_{args.min_length}_to_{args.max_length}/{shard_name}_val.parquet"
        )

View file

@ -0,0 +1,296 @@
import argparse
import gc
import os
import re
from glob import glob
from datasets import load_dataset
from vllm import LLM, SamplingParams
# Per-model end-of-response markers: a completion must end with one of these
# strings to count as finished (see check_should_skip).
STOP_STRINGS = {
    "google/gemma-3-12b-it": ["<eos>", "<end_of_turn>"],
}
# System message framing the model as a reading-comprehension QA writer.
SYSTEM_TEMPLATE = (
    "You are a creative and helpful assistant.\n"
    "You are tasked with generating questions for reading comprehension tests.\n"
    "You will be given a context and you need to generate questions and corresponding answers from the given context.\n"
    "The questions should be highly specific to the information provided in the context, not general questions that suit any context.\n"
    "**DO NOT** hallucinate or make up information."
)
# based on Make Your LLM Fully Utilize the Context (https://arxiv.org/pdf/2404.16811)
# Level-up variant: also shows {qa_pairs} from the previous level so the model
# avoids redundancy; doubled braces survive .format() as literal placeholders.
PROMPT_TEMPLATE = (
    "### Instructions ###\n"
    "Generate questions and corresponding answers from the given context. The questions should be highly specific to the "
    "information provided in the context, not general questions that suit any context.\n\n"
    "### Context ###\n"
    "{context}\n\n\n"
    "### Example Question-Answer Pairs ###\n"
    "{qa_pairs}\n\n\n"
    "### Rules ###\n"
    "Rules to follow when generating the questions:\n"
    "1. The questions must be specific to the given context and fully answerable from information present in *or* implied from the given context.\n"
    "2. The questions must *not* be redundant with the example questions-answer pairs provided.\n"
    "3. You should prioritize fact-seeking questions. Consider reversal questions, e.g., asking 'What causes X to happen?' is valid when 'Y causes X' is presented in the context.\n"
    "4. If all the facts in the context are already covered by the provided examples, you must generate *more complicated* questions that require reasoning beyond simple information retrieval.\nThis includes asking about information that can be inferred, requiring synthesizing information from multiple parts of the text, or understanding relationships between concepts, events, or individuals mentioned in the context. For example, if the context says 'The Eiffel Tower was completed in 1889 after 2 years of construction', you can ask 'When did the construction of the Eiffel Tower begin?'. Here's another example: if the context says 'Alice is Bob's mother. Bob is Charlie's Dad', you can ask 'Who is Charlie's grandmother?'.\n"
    "5. Phrases like 'based on the provided context', 'according to the context', 'in the context', etc., are **NOT ALLOWED** to appear in "
    "the questions.\n"
    "6. The questions should not overlap. They should be diverse, covering many aspects of the context.\n"
    "7. Do not give away too much information in the questions. For example, ask 'Who is X?' instead of 'Who is X that did Y?' when Y is clear from the context.\n"
    "8. Ignore the text formatting of the context, e.g., bold, italic, underline, etc.\n"
    "9. Ignore typos, spacing, and grammatical errors in the context.\n\n"
    "Rules to follow when generating the answers:\n"
    "1. The answers must use the (implied) information provided in the context.\n"
    "2. Phrases like 'based on the provided context', 'according to the context', 'in the context', etc., are **NOT ALLOWED** to appear in "
    "the answers.\n"
    "3. Do not just copy words from the context. Answer the question in your own words.\n"
    "4. The answers should be detailed and comprehensive. Please include additional specific details from the context.\n\n"
    "Respond with {n_qa_pairs} question-answer pairs.\n"
    "Always use proper grammar and punctuation.\n"
    "Try to use different question forms and styles.\n"
    "Use simple words and make sure that the answers are clear and comprehensive.\n\n"
    "The question-answer pairs should be in the following format:\n"
    "Question 1: {{question_1}}\n"
    "Answer 1: {{answer_1}}\n"
    "Question 2: {{question_2}}\n"
    "Answer 2: {{answer_2}}\n"
    "..."
)
def get_prompt(context, example_qa_pairs, n_qa_pairs):
    """Build the next-level QA prompt from a context plus prior example pairs."""
    return PROMPT_TEMPLATE.format(
        context=context,
        qa_pairs=example_qa_pairs,
        n_qa_pairs=n_qa_pairs,
    )
def check_should_skip(txt: str, vllm_model: str) -> tuple[str, bool]:
    """Strip a trailing stop string from *txt* and report truncation.

    Args:
        txt: A raw model completion.
        vllm_model: Key into STOP_STRINGS for this model's stop markers.
    Returns:
        (cleaned_text, skip): ``skip`` is True when no known stop string
        terminates the text, i.e. the generation was likely cut off.

    Fix: the original annotated the return type as ``bool`` although a tuple
    is always returned; also replaced the tail-slice membership test with the
    equivalent, clearer ``str.endswith``.
    """
    for stop in STOP_STRINGS[vllm_model]:
        if txt.endswith(stop):
            return (txt.split(stop)[0], False)  # Found a valid stop string
    return (txt, True)  # No valid stop string found, skip this response
def postprocess_qa_pairs(res_txt: str) -> tuple[list[str], list[str]]:
    """
    Postprocesses the QA pairs from the response text.
    Parses "Question N: ... Answer N: ..." formatted text into parallel lists,
    dropping empty pairs and a truncated final pair.
    Args:
        res_txt: The response text.
    Returns:
        A tuple of two lists, the first containing the questions and the second containing the answers.
    """
    # capture everything after each "Question {number}:" until "Answer"
    res_txt = remove_think(res_txt)
    q_pattern = r"Question \d+:(.*?)(?=Answer|$)"  # thanks chatgpt
    questions = re.findall(q_pattern, res_txt, flags=re.S)
    a_pattern = r"Answer \d+:(.*?)(?=Question|$)"  # thanks chatgpt
    answers = re.findall(a_pattern, res_txt, flags=re.S)
    if len(questions) != len(answers):
        print(f"Warning---number of questions and answers do not match")
        print(f"Number of questions: {len(questions)}")
        print(f"Number of answers: {len(answers)}")
    out_q = []
    out_a = []
    n_skips = 0
    if (len(questions) > 0) and (len(answers) > 0):
        # Pair up only as many as both lists provide.
        n_gen_pairs = min(len(questions), len(answers))
        has_left_over = n_gen_pairs < len(questions) or n_gen_pairs < len(answers)
        for i in range(n_gen_pairs):
            response = answers[i].strip()
            question = questions[i].strip()
            if not response or not question:
                print(f"Skipping empty question or answer at index {i}")
                continue
            # Only the very last pair can be truncated by the token budget;
            # verify it ended with a proper stop string before keeping it.
            if (not has_left_over) and (i == n_gen_pairs - 1):
                # NOTE(review): reads the module-level global `vllm_model`,
                # which is only bound in the __main__ block — this function
                # would fail if imported standalone; confirm intended.
                response, skip = check_should_skip(response, vllm_model)
                if skip:
                    print(f"Skipping due to missing stop string")
                    n_skips += 1
                    continue
            out_q.append(question.strip())
            out_a.append(response.strip())
    print(f"Skipped {n_skips} responses due to missing stop strings")
    return out_q, out_a
def flatten_list(l):
    """Concatenate a list of lists into a single flat list."""
    return [item for sub in l for item in sub]
def remove_think(txt):
    """Return the text after the last "</think>" tag (unchanged if absent)."""
    _, _, tail = txt.rpartition("</think>")
    return tail if tail or "</think>" in txt else txt
if __name__ == "__main__":
    # Level-N+1 pipeline: for each already-generated "..._level_N" shard,
    # prompt the model with the existing QA pairs as examples, generate a new
    # batch of non-redundant pairs, and save the shard as "..._level_N+1".
    parser = argparse.ArgumentParser(
        description="Generate QA pairs from FineWeb Edu dataset"
    )
    parser.add_argument(
        "--vllm_model",
        type=str,
        default=os.environ.get("vllm_model", "google/gemma-2-27b-it"),
        help="VLLM model to use for generation",
    )
    parser.add_argument(
        "--shard_pattern",
        type=str,
        required=True,
        help="Pattern to match shard files (e.g., '000_0000*')",
    )
    parser.add_argument(
        "--n_qa_pairs",
        type=int,
        required=True,
        help="Number of question-answer pairs to generate per context",
    )
    parser.add_argument(
        "--max_model_length",
        type=int,
        default=2**12,
        help="Maximum length of the model input (context + prompt + response) in tokens",
    )
    parser.add_argument(
        "--debug",
        action="store_true",
        help="Debug mode - process only first 100 samples",
    )
    args = parser.parse_args()
    vllm_model = args.vllm_model
    print(f"Using model: {vllm_model}")
    llm_kwargs = dict(
        model=vllm_model,
        dtype="bfloat16",
        enable_prefix_caching=True,
        enable_chunked_prefill=True,
        # NOTE(review): hard-coded 2**14 here ignores --max_model_length
        # (default 2**12), which is otherwise unused — confirm intended.
        max_model_len=2**14,
        limit_mm_per_prompt={"image": 0},
    )
    llm = LLM(**llm_kwargs)
    # NOTE(review): `tokenizer` is never used below — dead assignment?
    tokenizer = llm.get_tokenizer()
    shard_pattern = args.shard_pattern
    n_qa_pairs = args.n_qa_pairs
    paths = glob(f"./data/raw_datasets/fw_qa_v2/{shard_pattern}.parquet")
    split = "train[:100]" if args.debug else "train"
    for path in paths:
        assert "_level" in path, (
            "Path must contain '_level' to indicate the dataset level"
        )
        # Derive the current level from the shard name (strip any _debug tag).
        shard_name = path.split("/")[-1].split(".")[0].split("_debug")[0]
        if "/" in shard_pattern:
            # Preserve any subdirectory prefix from the pattern in the output.
            shard_name = "/".join(shard_pattern.split("/")[:-1]) + "/" + shard_name
        cur_level = int(shard_name.split("_level_")[-1])
        next_level = cur_level + 1
        ds = load_dataset(
            "parquet",
            data_files=path,
            split=split,
        )
        # All prior-level QA columns feed into the example block.
        prompt_cols = [col for col in ds.column_names if col.startswith("prompts")]
        response_cols = [col for col in ds.column_names if col.startswith("responses")]
        assert len(prompt_cols) > 0, "No prompt columns found in the dataset"
        if len(prompt_cols) != len(response_cols):
            raise ValueError(
                "Number of prompt columns does not match number of response columns"
            )
        samples_data = []
        for sample in iter(ds):
            # Format existing QA pairs as examples
            example_qa_pairs = ""
            questions = flatten_list([sample[col] for col in prompt_cols])
            answers = flatten_list([sample[col] for col in response_cols])
            for i, (q, a) in enumerate(zip(questions, answers), 1):
                example_qa_pairs += f"Question {i}: {q}\nAnswer {i}: {a}\n"
            samples_data.append(
                {"context": sample["context"], "example_qa_pairs": example_qa_pairs}
            )
        # Drop the dataset handle before the big generation pass.
        del ds
        gc.collect()
        messages = [
            [
                {"role": "system", "content": SYSTEM_TEMPLATE},
                {
                    "role": "user",
                    "content": get_prompt(
                        sample["context"], sample["example_qa_pairs"], n_qa_pairs
                    ),
                },
            ]
            for sample in samples_data
        ]
        print(f"Generating from {len(messages)} contexts")
        completions = llm.chat(
            messages,
            sampling_params=SamplingParams(
                temperature=0.0,
                # needed for checking if stop tokens are present
                skip_special_tokens=False,
                include_stop_str_in_output=True,
            ),
        )
        samples = []
        for sample_data, completion in zip(samples_data, completions):
            questions, answers = postprocess_qa_pairs(completion.outputs[0].text)
            samples.append(
                {
                    "context": sample_data["context"],
                    f"prompts_level_{next_level}": questions,
                    f"responses_level_{next_level}": answers,
                }
            )
            if args.debug:
                print(f"context={sample_data['context']}")
                print(f"example_qa_pairs={sample_data['example_qa_pairs']}")
                print(f"{completion.outputs[0].text=}")
                for q, a in zip(questions, answers):
                    print(f"{q=}")
                    print(f"{a=}")
                    print()
                print("=" * 80)
        del samples_data
        gc.collect()
        print(f"Generated {len(samples)} samples")
        # Reload the source shard and append the new-level columns to it.
        ds = load_dataset(
            "parquet",
            data_files=path,
            split=split,
        )
        ds = ds.add_column(
            f"prompts_level_{next_level}",
            [sample[f"prompts_level_{next_level}"] for sample in samples],
        )
        ds = ds.add_column(
            f"responses_level_{next_level}",
            [sample[f"responses_level_{next_level}"] for sample in samples],
        )
        shard_name_base = shard_name.split("_level_")[0]
        shard_name = f"{shard_name_base}_level_{next_level}"
        if args.debug:
            shard_name += "_debug"
        ds.to_parquet(f"data/raw_datasets/fw_qa_v2/{shard_name}.parquet")
        print(f"Saved to data/raw_datasets/fw_qa_v2/{shard_name}.parquet")

387
data/gutenburg_sample.txt Normal file
View file

@ -0,0 +1,387 @@
The Project Gutenberg eBook, Addison, by William John Courthope
This eBook is for the use of anyone anywhere at no cost and with
almost no restrictions whatsoever. You may copy it, give it away or
re-use it under the terms of the Project Gutenberg License included
with this eBook or online at www.gutenberg.org
Title: Addison
Author: William John Courthope
Release Date: November 27, 2012 [eBook #41496]
Language: English
Character set encoding: ISO-8859-1
***START OF THE PROJECT GUTENBERG EBOOK ADDISON***
E-text prepared by the Online Distributed Proofreading Team
(http://www.pgdp.net) from page images generously made available by
Internet Archive (http://archive.org)
Note: Images of the original pages are available through
Internet Archive. See
http://archive.org/details/addison_00cour
Transcriber's note:
Text enclosed by underscores is in italics (_italics_).
Text enclosed by curly brackets is superscripted
(example: y{e}).
English Men of Letters
Edited by John Morley
ADDISON
by
W. J. COURTHOPE
Harper & Brothers Publishers
New York and London
1902
* * * * *
ENGLISH MEN OF LETTERS.
EDITED BY JOHN MORLEY.
JOHNSON Leslie Stephen.
GIBBON J. C. Morison.
SCOTT R. H. Hutton.
SHELLEY J. A. Symonds.
HUME T. H. Huxley.
GOLDSMITH William Black.
DEFOE William Minto.
BURNS J. C. Shairp.
SPENSER R. W. Church.
THACKERAY Anthony Trollope.
BURKE John Morley.
MILTON Mark Pattison.
HAWTHORNE Henry James, Jr.
SOUTHEY E. Dowden.
CHAUCER A. W. Ward.
BUNYAN J. A. Froude.
COWPER Goldwin Smith.
POPE Leslie Stephen.
BYRON John Nichol.
LOCKE Thomas Fowler.
WORDSWORTH F. Myers.
DRYDEN G. Saintsbury.
LANDOR Sidney Colvin.
DE QUINCEY David Masson.
LAMB Alfred Ainger.
BENTLEY R. C. Jebb.
DICKENS A. W. Ward.
GRAY E. W. Gosse.
SWIFT Leslie Stephen.
STERNE H. D. Traill.
MACAULAY J. Cotter Morison.
FIELDING Austin Dobson.
SHERIDAN Mrs. Oliphant.
ADDISON W. J. Courthope.
BACON R. W. Church.
COLERIDGE H. D. Traill.
SIR PHILIP SIDNEY J. A. Symonds.
KEATS Sidney Colvin.
CARLYLE John Nichol.
12mo, Cloth, 75 cents per volume.
_Other volumes in preparation._
PUBLISHED BY HARPER & BROTHERS, NEW YORK.
_Any of the above works will be sent by mail, postage prepaid, to any part
of the United States, Canada, or Mexico, on receipt of the price._
* * * * *
CONTENTS.
PAGE
CHAPTER I.
THE STATE OF ENGLISH SOCIETY AND LETTERS
AFTER THE RESTORATION 1
CHAPTER II.
ADDISON'S FAMILY AND EDUCATION 21
CHAPTER III.
ADDISON ON HIS TRAVELS 38
CHAPTER IV.
HIS EMPLOYMENT IN AFFAIRS OF STATE 53
CHAPTER V.
THE "TATLER" AND "SPECTATOR" 78
CHAPTER VI.
"CATO" 110
CHAPTER VII.
ADDISON'S QUARREL WITH POPE 125
CHAPTER VIII.
THE LAST YEARS OF HIS LIFE 139
CHAPTER IX.
THE GENIUS OF ADDISON 153
ADDISON.
CHAPTER I.
THE STATE OF ENGLISH SOCIETY AND LETTERS AFTER THE RESTORATION.
Of the four English men of letters whose writings most fully embody the
spirit of the eighteenth century, the one who provides the biographer with
the scantiest materials is Addison. In his _Journal to Stella_, his social
verses, and his letters to his friends, we have a vivid picture of those
relations with women and that protracted suffering which invest with such
tragic interest the history of Swift. Pope, by the publication of his own
correspondence, has enabled us, in a way that he never intended, to
understand the strange moral twist which distorted a nature by no means
devoid of noble instincts. Johnson was fortunate in the companionship of
perhaps the best biographer who ever lived. But of the real life and
character of Addison scarcely any contemporary record remains. The formal
narrative prefixed to his works by Tickell is, by that writer's own
admission, little more than a bibliography. Steele, who might have told us
more than any man about his boyhood and his manner of life in London, had
become estranged from his old friend before his death. No writer has
taken the trouble to preserve any account of the wit and wisdom that
enlivened the "little senate" at Button's. His own letters are, as a rule,
compositions as finished as his papers in the _Spectator_. Those features
in his character which excite the greatest interest have been delineated
by the hand of an enemy--an enemy who possessed an unrivalled power of
satirical portrait-painting, and was restrained by no regard for truth
from creating in the public mind such impressions about others as might
serve to heighten the favourable opinion of himself.
This absence of dramatic incident in Addison's life would lead us
naturally to conclude that he was deficient in the energy and passion
which cause a powerful nature to leave a mark upon its age. Yet such a
judgment would certainly be erroneous. Shy and reserved as he was, the
unanimous verdict of his most illustrious contemporaries is decisive as to
the respect and admiration which he excited among them. The man who could
exert so potent an influence over the mercurial Steele, who could
fascinate the haughty and cynical intellect of Swift, whose conversation,
by the admission of his satirist Pope, had in it something more charming
than that of any other man; of whom it was said that he might have been
chosen king if he wished it; such a man, though to the coarse perception
of Mandeville he might have seemed no more than "a parson in a tye-wig,"
can hardly have been deficient in force of character.
Nor would it have been possible for a writer distinguished by mere
elegance and refinement to leave a lasting impress on the literature and
society of his country. In one generation after another, men representing
opposing elements of rank, class, interest, and taste, have agreed in
acknowledging Addison's extraordinary merits. "Whoever wishes," says
Johnson--at the end of a biography strongly coloured with the
prepossessions of a semi-Jacobite Tory--"whoever wishes to attain an
English style, familiar but not coarse, and elegant but not ostentatious,
must give his days and nights to the volumes of Addison." "Such a mark of
national respect," says Macaulay, the best representative of middle-class
opinion in the present century, speaking of the statue erected to Addison
in Westminster Abbey, "was due to the unsullied statesman, to the
accomplished scholar, to the master of pure English eloquence, to the
consummate painter of life and manners. It was due, above all, to the
great satirist who alone knew how to use ridicule without abusing it; who,
without inflicting a wound, effected a great social reform, and who
reconciled wit and virtue after a long and disastrous separation, during
which wit had been led astray by profligacy, and virtue by fanaticism."
This verdict of a great critic is accepted by an age to which the grounds
of it are, perhaps, not very apparent. The author of any ideal creation--a
poem, a drama, or a novel--has an imprescriptible property in the fame of
his work. But to harmonise conflicting social elements, to bring order out
of chaos in the sphere of criticism, to form right ways of thinking about
questions of morals, taste, and breeding, are operations of which the
credit, though it is certainly to be ascribed to particular individuals,
is generally absorbed by society itself. Macaulay's eulogy is as just as
it is eloquent, but the pages of the _Spectator_ alone will hardly show
the reader why Addison should be so highly praised for having reconciled
wit with virtue. Nor, looking at him as a critic, will it appear a great
achievement to have pointed out to English society the beauties of
_Paradise Lost_, unless it be remembered that the taste of the preceding
generation still influenced Addison's contemporaries, and that in that
generation Cowley was accounted a greater poet than Milton.
To estimate Addison at his real value we must regard him as the chief
architect of Public Opinion in the eighteenth century. But here again we
are met by an initial difficulty, because it has become almost a
commonplace of contemporary criticism to represent the eighteenth century
as a period of sheer destruction. It is tacitly assumed by a school of
distinguished philosophical writers that we have arrived at a stage in the
world's history in which it is possible to take a positive and scientific
view of human affairs. As it is of course necessary that from such a
system all belief in the supernatural shall be jealously excluded, it has
not seemed impossible to write the history of Thought itself in the
eighteenth century. And in tracing the course of this supposed continuous
stream it is natural that all the great English writers of the period
should be described as in one way or another helping to pull down, or
vainly to strengthen, the theological barriers erected by centuries of
bigotry against the irresistible tide of enlightened progress.
It would be of course entirely out of place to discuss here the merits of
this new school of history. Those who consider that, whatever glimpses we
may obtain of the law and order of the universe, man is, as he always has
been and always will be, a mystery to himself, will hardly allow that the
operations of the human spirit can be traced in the dissecting-room. But
it is, in any case, obvious that to treat the great _imaginative_ writers
of any age as if they were only mechanical agents in an evolution of
thought is to do them grave injustice. Such writers are, above all things,
creative. Their first aim is to "show the very age and body of the time
his form and pressure." No work of the eighteenth century, composed in a
consciously destructive spirit, has taken its place among the acknowledged
classics of the language. Even the _Tale of a Tub_ is to be regarded as a
satire upon the aberrations of theologians from right reason, not upon the
principles of Christianity itself. The _Essay on Man_ has, no doubt,
logically a tendency towards Deism, but nobody ever read the poem for the
sake of its philosophy; and it is well known that Pope was much alarmed
when it was pointed out to him that his conclusions might be represented
as incompatible with the doctrines of revealed religion.
The truth indeed seems to be the exact converse of what is alleged by the
scientific historians. So far from the eighteenth century in England being
an age of destructive analysis, its energies were chiefly devoted to
political, social, and literary reconstruction. Whatever revolution in
faith and manners the English nation had undergone had been the work of
the two preceding centuries, and though the historic foundations of
society remained untouched, the whole form of the superstructure had been
profoundly modified.
"So tenacious are we," said Burke, towards the close of the last
century, "of our old ecclesiastical modes and fashions of institution
that very little change has been made in them since the fourteenth or
fifteenth centuries, adhering in this particular as in all else to our
old settled maxim never entirely nor at once to depart from antiquity.
We found these institutions on the whole favourable to morality and
discipline, and we thought they were susceptible of amendment without
altering the ground. We thought they were capable of receiving and
meliorating, and, above all, of preserving the accessories of science
and literature as the order of Providence should successively produce
them. And after all, with this Gothic and monkish education (for such
it is the groundwork), we may put in our claim to as ample and early
a share in all the improvements in science, in arts, and in literature
which have illuminated the modern world as any other nation in Europe.
We think one main cause of this improvement was our not despising the
patrimony of knowledge which was left us by our forefathers."
All this is, in substance, true of our political as well as our
ecclesiastical institutions. And yet, when Burke wrote, the great feudal
and mediæval structure of England had been so transformed by the Wars of
the Roses, the Reformation, the Rebellion, and the Revolution, that its
ancient outlines were barely visible. In so far, therefore, as his words
seem to imply that the social evolution he describes was produced by an
imperceptible and almost mechanical process of national instinct, the
impression they tend to create is entirely erroneous.
If we have been hitherto saved from such corruption as undermined the
republics of Italy, from the religious wars that so long enfeebled and
divided Germany, and from the Revolution that has severed modern France
from her ancient history, thanks for this are due partly, no doubt, to
favouring conditions of nature and society, but quite as much to the
genius of great individuals who prepared the mind of the nation for the
gradual assimilation of new ideas. Thus Langland and Wycliffe and their
numerous followers, long before the Reformation, had so familiarised the
minds of the people with their ideas of the Christian religion that the
Sovereign was able to assume the Headship of the Church without the shock
of a social convulsion. Fresh feelings and instincts grew up in the hearts
of whole classes of the nation without at first producing any change in
outward habits of life, and even without arousing a sense of their logical
incongruity. These mixed ideas were constantly brought before the
imagination in the works of the poets. Shakespeare abounds with passages
in which, side by side with the old feudal, monarchical, catholic, and
patriotic instincts of Englishmen, we find the sentiments of the Italian
Renaissance. Spenser conveys Puritan doctrines sometimes by the mouth of
shepherds, whose originals he had found in Theocritus and Virgil;
sometimes under allegorical forms derived from books of chivalry and the
ceremonial of the Catholic Church. Milton, the most rigidly Calvinistic of
all the English poets in his opinions, is also the most severely classical
in his style.
It was the task of Addison to carry on the reconciling traditions of our
literature. It is his praise to have accomplished his task under
conditions far more difficult than any that his predecessors had
experienced. What they had done was to give instinctive and characteristic
expression to the floating ideas of the society about them; what Addison
and his contemporaries did was to found a public opinion by a conscious
effort of reason and persuasion. Before the Civil Wars there had been at
least no visible breach in the principle of Authority in Church and State.
At the beginning of the eighteenth century constituted authority had been
recently overthrown; one king had been beheaded, another had been
expelled; the Episcopalian form of Church Government had been violently
displaced in favour of the Presbyterian, and had been with almost equal
violence restored. Whole classes of the population had been drawn into
opposing camps during the Civil War, and still stood confronting each
other with all the harsh antagonism of sentiment inherited from that
conflict. Such a bare summary alone is sufficient to indicate the nature
of the difficulties Addison had to encounter in his efforts to harmonise
public opinion; but a more detailed examination of the state of society
after the Restoration is required to place in its full light the
extraordinary merits of the success that he achieved.
There was, to begin with, a vehement opposition between town and country.
In the country the old ideas of Feudalism, modified by circumstances, but
vigorous and deep-rooted, still prevailed. True, the military system of
land-tenure had disappeared with the Restoration, but it was not so with
the relations of life, and the habits of thought and feeling which the
system had created. The features of surviving Feudalism have been
inimitably preserved for us in the character of Sir Roger de Coverley.
Living in the patriarchal fashion, in the midst of tenants and retainers,
who looked up to him as their chief, and for whose welfare and protection
he considered himself responsible, the country gentleman valued above all
things the principle of Loyalty. To the moneyed classes in the towns he
was instinctively opposed; he regarded their interests, both social and
commercial, as contrary to his own; he looked with dislike and suspicion
on the economical principles of government and conduct on which these
classes naturally rely. Even the younger sons of county families had in
Addison's day abandoned the custom, common enough in the feudal times, of
seeking their fortune in trade. Many a Will Wimble now spent his whole
life in the country, training dogs for his neighbours, fishing their
streams, making whips for their young heirs, and even garters for their
wives and daughters.[1]

8
data/sakana_wiki.txt Normal file
View file

@ -0,0 +1,8 @@
Sakana AI Co., Ltd. is a Japanese artificial intelligence company based in Tokyo.
Overview
Sakana AI's main research fields are evolution and collective intelligence of AI. The company's name is derived from the Japanese word さかな (sakana), which means fish. This represents the idea of a school of fish coming together and forming a coherent entity from simple rules, which is an analogy for collective intelligence.[2]
The company was founded by David Ha, Llion Jones and Ren Ito. Llion Jones co-authored the famous paper "Attention Is All You Need" when he was working for Google in 2017. The company raised $30M in its seed funding round from Lux Capital and Khosla Ventures.[3] The company raised approximately $200M from companies such as Mitsubishi UFJ, SMBC, Mizuho, Itochu, KDDI, Nomura and Nvidia in its series A funding round in 2024.[4]
In January 2024, Sakana AI developed a method to build new AI models by 'breeding' multiple existing models, which it sees as a means to democratise AI development, as this process does not require large computational resources.[5] Sakana AI is also developing a model called the AI Scientist, which automates the entire process of scientific research.[6] The Nikkei estimated the company's value at 19 billion yen in 2024.[7]

620
data/self_generate_qa.py Normal file
View file

@ -0,0 +1,620 @@
import argparse
import os
import random
import re
from glob import glob
import numpy as np
import yaml
from datasets import Dataset, load_dataset
from vllm import LLM, SamplingParams
from ctx_to_lora.data.definitions import (
CLOSED_QA_INTX_TEMPLATES,
RAW_DATA_DIR,
SELF_GEN_DATA_DIR,
)
from ctx_to_lora.data.processing import (
filter_none,
get_preprocessing_fn,
load_and_process_dataset,
tokenize_ctx_text,
)
from ctx_to_lora.data.self_gen_template import (
PRE_CTX,
PROMPT_TEMPLATE,
QA_PROMPT_TEMPLATE,
SELF_GEN_SYSTEM_MSG,
SELF_QA_INTX,
)
from ctx_to_lora.model_loading import get_tokenizer
from ctx_to_lora.utils import clear_gpu
# Per-model stop strings used to detect end-of-generation markers.
# Only models listed here need explicit stop strings; other models rely on
# their tokenizer's default EOS handling.
STOP_STRINGS = {
    "google/gemma-2-2b-it": ["<eos>", "<end_of_turn>"],
}
# Maximum context length (in tokens) used to configure vLLM, per model.
MODEL_CTX_LEN = {
    "google/gemma-2-27b-it": 8192,
    "google/gemma-2-2b-it": 8192,
    "google/gemma-2-9b-it": 8192,
    # qwen 4b has 256k ctx length but using lower max lengths is faster
    "Qwen/Qwen3-4B-Instruct-2507": 2**13 + 2**12,
}
def truncate_middle_if_too_long(
    input_ids: list[int],
    max_length: int,
    max_new_tokens: int = 256,
) -> list[int]:
    """Drop the middle of a token sequence so it fits within ``max_length``.

    Keeps the head and tail of the sequence and discards the middle, while
    reserving room for ``max_new_tokens`` generated tokens.

    Args:
        input_ids: Token IDs of the full prompt.
        max_length: Maximum total length (prompt plus generation budget).
        max_new_tokens: Number of tokens reserved for generation.

    Returns:
        ``input_ids`` unchanged if it already fits within ``max_length``;
        otherwise the concatenation of the first and last
        ``max_length // 2 - max_new_tokens // 2`` tokens.
    """
    # Reserve half of the generation budget on each side of the cut so the
    # kept prompt totals max_length - max_new_tokens (up to rounding).
    half = max_length // 2 - max_new_tokens // 2
    if len(input_ids) > max_length:
        return input_ids[:half] + input_ids[-half:]
    return input_ids
def get_prompt(context: str, q: str, remove_qa_template: bool) -> str:
    """Format *context* and question *q* with the configured prompt template.

    Uses the plain ``PROMPT_TEMPLATE`` when ``remove_qa_template`` is set,
    otherwise the QA-specific ``QA_PROMPT_TEMPLATE``.
    """
    template = PROMPT_TEMPLATE if remove_qa_template else QA_PROMPT_TEMPLATE
    return template.format(context=context, question=q)
def add_closed_qa_prompt(q: str, closed_qa_prob: float = 0.1) -> str:
    """With probability *closed_qa_prob*, wrap *q* in a randomly chosen
    closed-QA instruction template; otherwise return it unchanged."""
    if random.random() > closed_qa_prob:
        return q
    template = random.choice(CLOSED_QA_INTX_TEMPLATES)
    return template.format(input=q)
def load_config(config_path: str) -> dict:
    """Parse the YAML file at *config_path* and return its contents."""
    with open(config_path) as fh:
        return yaml.safe_load(fh)
def get_dataset_configs(
ds_names: list[str] | None,
config: dict | None,
split: str | None,
) -> list[tuple[str, str]]:
assert not (ds_names and config), "Cannot provide both ds_names and config"
if ds_names:
assert split, "When using ds_names, --split must be provided"
# Validate ds_names format
for ds_name in ds_names:
if not isinstance(ds_name, str):
raise ValueError(f"Invalid dataset name: {ds_name}")
return [(ds_name, split) for ds_name in ds_names]
if config:
dataset_configs = []
# Process train datasets
train_ds_names = config.get("train_ds_names", [])
# self_gen_train_ds_names = [
# (ds_name.split("/")[-1], "train")
# for ds_name in train_ds_names
# if ds_name.startswith("self_gen/")
# ]
self_gen_train_ds_names = [
(ds_name, "train")
for ds_name in train_ds_names
if ds_name.startswith("self_gen/")
]
if not self_gen_train_ds_names:
print("No self_gen datasets found in train_ds_names")
dataset_configs.extend(self_gen_train_ds_names)
# Process validation datasets
val_ds_names = config.get("val_ds_names", [])
self_gen_val_ds_names = [
(ds_name, "validation")
for ds_name in val_ds_names
if ds_name.startswith("self_gen/")
]
if not self_gen_val_ds_names:
print("No self_gen datasets found in val_ds_names")
dataset_configs.extend(self_gen_val_ds_names)
return dataset_configs
def create_messages(
    ctxs: list[str],
    questions: list[list[str]],
    vllm_model: str,
    system_template: str,
    remove_qa_template: bool,
) -> list[list[dict]]:
    """Build single-turn chat messages, one per (context, question) pair.

    The system instructions are folded into the user message because some
    models (e.g. gemma) do not support a separate system role.
    """
    messages = []
    for ctx, q_list in zip(ctxs, questions):
        for q in q_list:
            content = (
                system_template + "\n\n\n" + get_prompt(ctx, q, remove_qa_template)
            )
            messages.append([{"role": "user", "content": content.strip()}])
    return messages
def self_generate(
    ds_name: str,
    split: str,
    args: argparse.Namespace,
    llm: LLM,
    system_template: str,
    parquet_file: str | None = None,
    do_truncate: bool = False,
) -> None:
    """Process a single dataset and generate QA pairs.

    Loads the dataset (a named raw dataset or a single parquet shard),
    tokenizes contexts, builds chat messages for every (context, question)
    pair, and hands generation + saving off to ``execute_qa_generation``
    in chunks of 1,000 contexts.

    Args:
        ds_name: Dataset name; may embed ``_temp_<x>`` /
            ``_closed_qa_prob_<x>`` overrides in the name itself.
        split: Dataset split to load.
        args: Parsed CLI arguments.
        llm: vLLM engine used for generation.
        system_template: System instructions prepended to each prompt.
        parquet_file: Optional explicit parquet shard; overrides
            ``ds_name``/``split`` when given.
        do_truncate: Middle-truncate prompts to the model context length
            (intended for evaluation data only).
    """
    shard_name = ""
    # Conflict checks for ds_name-derived overrides
    if ds_name is not None:
        # temperature & closed_qa already handled later; add new ones
        if "_temp_" in ds_name and args.temp != 0.0:
            raise ValueError(
                f"Multiple sources of truth for temperature: CLI arg --temp={args.temp} and dataset name contains temp specification."
            )
        if "_closed_qa_prob_" in ds_name and args.closed_qa_prob != 0.0:
            raise ValueError(
                f"Multiple sources of truth for closed_qa_prob: CLI arg --closed_qa_prob={args.closed_qa_prob} and dataset name contains closed_qa_prob specification."
            )
    # Base values from args
    temp = args.temp
    closed_qa_prob = args.closed_qa_prob
    # Overrides from ds_name pattern if present
    if ds_name is not None:
        if "_temp_" in ds_name:
            m = re.search(r"_temp_([\d.]+)", ds_name)
            if m:
                temp = float(m.group(1))
        if "_closed_qa_prob_" in ds_name:
            m = re.search(r"_closed_qa_prob_([\d.]+)", ds_name)
            if m:
                closed_qa_prob = float(m.group(1))
    print(f"Processing dataset: {ds_name}, split: {split}")
    print(f"Using temperature: {temp}")
    print(f"Using closed QA prompt probability: {closed_qa_prob}")
    if parquet_file:
        print(f"Loading dataset from parquet file: {parquet_file}")
        split = "train"
        # Recover the dataset name from the shard's path under RAW_DATA_DIR.
        ds_name = "/".join(parquet_file.split(RAW_DATA_DIR)[-1].split("/")[:-1])
        shard_name = "_" + os.path.basename(parquet_file).replace(".parquet", "")
        ds = load_dataset(path="parquet", data_files=[parquet_file], split="train")
        processing_fn = get_preprocessing_fn(ds_name, is_eval=False)
        ds = ds.map(processing_fn, num_proc=8)
    else:
        ds_name = ds_name.split("/")[-1]  # Extract just the dataset name
        print(f"Loading dataset: {ds_name} with split: {split}")
        kwargs = dict(ds_name=ds_name, split=split)
        ds = load_and_process_dataset(**kwargs, num_proc=8, remove_cols=False)
        print(f"Loaded dataset: {ds_name} with split: {split}")
    if args.debug:
        ds = ds.take(10)
    ds = ds.filter(filter_none, batched=False, num_proc=8)
    tk = get_tokenizer(args.vllm_model, train=True)
    # Token-ID marker sequences used later to locate spans inside the
    # tokenized prompts.
    self_qa_intx_tokens = tk(SELF_QA_INTX, add_special_tokens=False)["input_ids"][1:]
    if args.remove_qa_template:
        self_qa_intx_tokens = tk("\n\n", add_special_tokens=False)["input_ids"]
    n_self_qa_intx_tokens = len(self_qa_intx_tokens)
    # NOTE(review): pre_ctx_tokens / n_pre_ctx_tokens appear unused in this
    # function — confirm whether they are needed.
    pre_ctx_tokens = tk(PRE_CTX, add_special_tokens=False)["input_ids"]
    n_pre_ctx_tokens = len(pre_ctx_tokens)
    # First line of the system template (minus its final token) is used to
    # find where the system message starts in the tokenized prompt.
    sys_tokens = tk(system_template.split("\n")[0], add_special_tokens=False)[
        "input_ids"
    ][:-1]
    n_sys_tokens = len(sys_tokens)
    os.environ["TOKENIZERS_PARALLELISM"] = "true"
    ds = ds.map(
        tokenize_ctx_text,
        fn_kwargs={"tokenizer": tk},
        batched=True,
        batch_size=50_000,
        keep_in_memory=True,
    )
    ctxs = [sample["context"] for sample in ds]
    questions = [
        [add_closed_qa_prompt(q, closed_qa_prob) for q in sample["prompts"] if q]
        for sample in ds
    ]
    # NOTE(review): the line below overwrites the closed-QA-augmented
    # `questions` built just above (discarding add_closed_qa_prompt) and,
    # by dropping empty prompt lists, can misalign `questions` with `ctxs`
    # in the zip inside create_messages — looks like a bug; confirm intent.
    questions = [q_list for q_list in ds["prompts"] if len(q_list) > 0]
    print(f"Loaded {len(ctxs)} contexts and {len(questions)} questions")
    # Number of top logprobs to record per generated token.
    k = 16
    fpath = f"{SELF_GEN_DATA_DIR}/{args.vllm_model}_temp_{temp}_closed_qa_prob_{closed_qa_prob}/{ds_name}/{split}/ds{shard_name}"
    chunk_size = 1_000
    for chunk_idx, start in enumerate(range(0, len(ctxs), chunk_size)):
        print(f"Processing chunk {chunk_idx}")
        chunk_ctxs = ctxs[start : start + chunk_size]
        chunk_questions = questions[start : start + chunk_size]
        chunk_messages = create_messages(
            chunk_ctxs,
            chunk_questions,
            args.vllm_model,
            SELF_GEN_SYSTEM_MSG,
            args.remove_qa_template,
        )
        if do_truncate:
            # we should only do this for evaluation data
            tokenized_contents = tk(
                [m[0]["content"] for m in chunk_messages],
                add_special_tokens=False,
                return_attention_mask=False,
            )
            tokenized_contents["input_ids"] = [
                truncate_middle_if_too_long(
                    ids,
                    max_length=MODEL_CTX_LEN[args.vllm_model],
                    max_new_tokens=args.max_new_tokens,
                )
                for ids in tokenized_contents["input_ids"]
            ]
            contents = tk.batch_decode(
                tokenized_contents["input_ids"], skip_special_tokens=True
            )
            for c, m in zip(contents, chunk_messages):
                m[0]["content"] = c
        print(f"Generating from {len(chunk_messages)} contexts")
        # Clear GPU memory before processing the next chunk
        clear_gpu()
        execute_qa_generation(
            fpath + f"_{chunk_idx:04d}",
            args,
            llm,
            temp,
            tk,
            self_qa_intx_tokens,
            n_self_qa_intx_tokens,
            sys_tokens,
            n_sys_tokens,
            chunk_ctxs,
            ds[start : start + chunk_size]["ctx_ids"],
            chunk_questions,
            chunk_messages,
            k,
        )
def execute_qa_generation(
    fpath,
    args,
    llm,
    temp,
    tk,
    self_qa_intx_tokens,
    n_self_qa_intx_tokens,
    sys_tokens,
    n_sys_tokens,
    ctxs,
    ctx_ids,
    questions,
    messages,
    k,
):
    """Generate answers for one chunk of messages and save them as parquet.

    For each completion, assembles the input token ids (system prefix +
    question + response), records the response's [start, end) span and the
    top-``k`` per-token logprobs, grouped by context.  Completions whose
    generation did not end with finish_reason == "stop" are skipped.

    Args:
        fpath: Output path prefix (".parquet" is appended; "_debug" when
            args.debug is set).
        args: Parsed CLI arguments (max_new_tokens, debug).
        llm: vLLM engine used for chat generation.
        temp: Sampling temperature.
        tk: Tokenizer, used only for debug decoding.
        self_qa_intx_tokens / n_self_qa_intx_tokens: Marker token ids (and
            their count) locating the start of the user question.
        sys_tokens / n_sys_tokens: Marker token ids (and their count)
            locating the start of the system message.
        ctxs: Contexts for this chunk, aligned with ``questions``.
        ctx_ids: Tokenized context ids, aligned with ``ctxs``.
        questions: Per-context question lists; generation order of
            ``messages`` must match this nesting, flattened.
        messages: Chat messages to generate from.
        k: Number of top logprobs recorded per generated token.
    """
    completions = llm.chat(
        messages,
        sampling_params=SamplingParams(
            max_tokens=args.max_new_tokens,
            logprobs=k,
            temperature=temp,
            seed=42,
            spaces_between_special_tokens=False,
            skip_special_tokens=False,
            include_stop_str_in_output=True,
        ),
    )
    # Per-context accumulator; one entry per unique context in this chunk.
    self_gen_data = {
        ctx: {
            "ctx_ids": ctx_ids,
            "input_ids": [],
            "response_start_end": [],
            "logprobs_vals": [],
            "logprobs_indices": [],
        }
        for ctx, ctx_ids in zip(ctxs, ctx_ids)
    }
    # c is the flat completion index of the current context's first question.
    c = 0
    n_skips = 0
    # Located once and reused — assumes the system prefix sits at the same
    # offset in every prompt.
    sys_start = None
    for ctx, q_list in zip(ctxs, questions):
        # self_gen_data[ctx]["ctx_ids"] = ctx_ids
        for i, _ in enumerate(q_list):
            # response = completions[c + i].outputs[0].text
            reason = completions[c + i].outputs[0].finish_reason
            if reason != "stop":
                # print(f"idx: {c + i}")
                print(f"finish_reason: {completions[c + i].outputs[0].finish_reason}")
                print(f"Skipping due to finish_reason={reason} != 'stop'")
                n_skips += 1
                continue
            # includes the logprob before the first response token
            # but excludes the logprob from eos token
            logp = completions[c + i].outputs[0].logprobs
            # len = num response tokens
            n_response_tokens = len(completions[c + i].outputs[0].token_ids)
            logp_indices = np.empty((n_response_tokens, k), dtype=np.int32)
            # float-16 is better for this range
            logp_vals = np.empty((n_response_tokens, k), dtype=np.float16)
            assert len(logp) == n_response_tokens, (
                f"Expected {n_response_tokens} logp entries, got {len(logp)}"
            )
            # Densify vLLM's per-token {token_id: Logprob} dicts into (n, k)
            # index/value arrays.
            for li, info_d in enumerate(logp):
                for j, (idx, tok_info) in enumerate(info_d.items()):
                    logp_indices[li, j] = idx
                    logp_vals[li, j] = tok_info.logprob
            prompt_ids = completions[c + i].prompt_token_ids  # 1d list
            # token_ids only includes generated tokens, not the prompt
            response_token_ids = completions[c + i].outputs[0].token_ids  # 1d list
            all_ids = prompt_ids + response_token_ids
            res_start = len(prompt_ids)
            res_end = res_start + n_response_tokens
            if sys_start is None:
                for ii in range(len(prompt_ids) - n_sys_tokens):
                    if prompt_ids[ii : ii + n_sys_tokens] == sys_tokens:
                        # found the start of the system message
                        sys_start = ii
                        break
            q_start = None
            # Scan backwards so the LAST occurrence of the question marker
            # wins (the context itself might contain the marker text).
            for ii in range(
                len(prompt_ids) - n_self_qa_intx_tokens,
                -1,
                -1,
            ):
                if prompt_ids[ii : ii + n_self_qa_intx_tokens] == self_qa_intx_tokens:
                    # found the start of the user input
                    q_start = ii + n_self_qa_intx_tokens
                    break
            # bos + question + eos + start model turn + response + eos
            input_ids = all_ids[:sys_start] + all_ids[q_start:res_end]
            # relative to the input_ids
            res_start = res_start - q_start + sys_start
            res_end = res_start + n_response_tokens
            # arrays will be saved as nested lists of numbers
            self_gen_data[ctx]["input_ids"].append(input_ids)
            # assume single-turn chat
            self_gen_data[ctx]["response_start_end"].append((res_start, res_end))
            self_gen_data[ctx]["logprobs_vals"].append(logp_vals)
            self_gen_data[ctx]["logprobs_indices"].append(logp_indices)
        # NOTE(review): if q_list is empty, `i` still holds its value from
        # the previous context's loop (or is undefined on the very first
        # iteration), so this advance looks wrong for empty question lists
        # — confirm questions never contains an empty list here.
        c += i + 1
    # NOTE(review): message says "missing stop strings", but skips are
    # counted for any finish_reason != "stop" (e.g. length truncation).
    print(f"Skipped {n_skips} responses due to missing stop strings")
    samples = [
        {
            # "context": ctx,
            # "prompts": q_list,
            # "responses": self_gen_data[ctx]["responses"],
            "ctx_ids": self_gen_data[ctx]["ctx_ids"],
            "input_ids": self_gen_data[ctx]["input_ids"],
            "response_start_end": self_gen_data[ctx]["response_start_end"],
            # "prompt_start_end": self_gen_data[ctx]["prompt_start_end"],
            "logprobs_vals": self_gen_data[ctx]["logprobs_vals"],
            "logprobs_indices": self_gen_data[ctx]["logprobs_indices"],
        }
        for ctx, q_list in zip(ctxs, questions)
    ]
    if args.debug:
        for sample in samples:
            # print(f"context={tk.decode(sample['ctx_ids'])}")
            print(f"QA={[tk.decode(ids) for ids in sample['input_ids']]}")
            for input_ids, (start, end) in zip(
                sample["input_ids"], sample["response_start_end"]
            ):
                print(f"start={start}, end={end}")
                print(f"response={tk.decode(input_ids[start:end])}")
            print(f"logprobs_vals={[x.shape for x in sample['logprobs_vals']]}")
            print(f"logprobs_indices={[x.shape for x in sample['logprobs_indices']]}")
            for indices in sample["logprobs_indices"]:
                print(f"logprobs_indices={indices[-1]}")
            print("=" * 80)
    print(f"Generated {len(samples)} samples")
    # random.shuffle(samples)
    # Save results
    # df = pd.DataFrame(samples)
    # ds_out = Dataset.from_pandas(df)
    ds_out = Dataset.from_list(samples)
    # fpath = f"{SELF_GEN_DATA_DIR}/{args.vllm_model}_temp_{temp}_closed_qa_prob_{closed_qa_prob}/{ds_name}/{split}/ds{shard_name}"
    if args.debug:
        fpath += "_debug"
    os.makedirs(os.path.dirname(fpath), exist_ok=True)
    fpath = f"{fpath}.parquet"
    ds_out.to_parquet(fpath)
    print(f"Saved to {fpath}")
    # Cleanup
    del samples, ds_out, completions, messages, ctxs, questions
    clear_gpu()
def parse_args(argv: "list[str] | None" = None) -> argparse.Namespace:
    """Parse CLI arguments for VLLM-based QA-pair generation.

    Exactly one input source must be given: ``--config``, ``--ds_names``,
    or ``--glob_pattern`` (enforced via a mutually exclusive group).

    Args:
        argv: Optional argument list (for testing). Defaults to
            ``sys.argv[1:]``, preserving the original call-site behavior.

    Returns:
        The parsed ``argparse.Namespace``.

    Raises:
        SystemExit: On invalid arguments, including ``--ds_names``
            without ``--split`` (argparse usage error).
    """
    parser = argparse.ArgumentParser(description="Generate QA pairs using VLLM")
    parser.add_argument(
        "--vllm_model",
        type=str,
        required=True,
        help="VLLM model name (e.g., google/gemma-2-2b-it)",
    )
    parser.add_argument(
        "--debug",
        action="store_true",
        help="Enable debug mode (process only 10 samples)",
    )
    # Either config file OR ds_names OR glob pattern — exactly one.
    group = parser.add_mutually_exclusive_group(required=True)
    group.add_argument(
        "--config",
        type=str,
        help="Path to YAML config file with train_ds_names/val_ds_names",
    )
    group.add_argument(
        "--ds_names",
        type=str,
        nargs="+",
        help="List of dataset names/shard patterns",
    )
    group.add_argument(
        "--glob_pattern",
        type=str,
        help="Glob pattern to match dataset names (e.g., 'data/raw_datasets/fw_qa_3/*')",
    )
    parser.add_argument(
        "--split",
        type=str,
        help="Dataset split to use when using --ds_names (required with --ds_names)",
    )
    parser.add_argument(
        "--temp",
        type=float,
        default=0.0,
        help="Temperature for sampling (default: 0.0)",
    )
    parser.add_argument(
        "--closed_qa_prob",
        type=float,
        default=0.0,
        help="Probability of using closed QA prompt template (default: 0.0)",
    )
    parser.add_argument(
        "--do_truncate",
        action="store_true",
        help="Truncate contexts to fit model context length",
    )
    parser.add_argument(
        "--remove_qa_template",
        action="store_true",
        help="Remove QA template formatting from prompts",
    )
    parser.add_argument(
        "--max_new_tokens",
        type=int,
        default=256,
        help="Maximum number of new tokens to generate (default: 256)",
    )
    args = parser.parse_args(argv)
    # Enforce the dependency documented in --split's help text here, so the
    # CLI reports a proper usage error instead of a late ValueError downstream.
    if args.ds_names and not args.split:
        parser.error("--split is required when using --ds_names")
    return args
if __name__ == "__main__":
    args = parse_args()

    # --split must accompany --ds_names; fail fast before loading the model.
    if args.ds_names and not args.split:
        raise ValueError("--split is required when using --ds_names")

    vllm_model = args.vllm_model
    print(f"Using model: {vllm_model}")

    # Engine configuration. Batch limits are kept conservative because
    # logprob extraction is memory-hungry.
    llm_kwargs = {
        "model": vllm_model,
        "dtype": "bfloat16",
        "enable_prefix_caching": True,
        "enable_chunked_prefill": True,
        "max_model_len": MODEL_CTX_LEN.get(vllm_model),
        "max_num_batched_tokens": 16384,
        "max_num_seqs": 32,  # avoid oom when getting logprobs
    }
    print(f"{llm_kwargs=}")
    llm = LLM(**llm_kwargs)

    # Resolve dataset configs from the YAML config when one was given.
    if args.config:
        config = load_config(args.config)
    else:
        config = None

    if args.ds_names or args.config:
        # Named datasets (from CLI or config): process each (name, split) pair.
        for ds_name, split in get_dataset_configs(
            ds_names=args.ds_names,
            config=config,
            split=args.split,
        ):
            print(f"Processing dataset: {ds_name}, split: {split}")
            self_generate(
                ds_name, split, args, llm, SELF_GEN_SYSTEM_MSG, None, args.do_truncate
            )
    else:
        # Fallback: expand a glob pattern into parquet files to process.
        assert args.glob_pattern, (
            "glob_pattern must be provided if no ds_names or config"
        )
        for file in glob(args.glob_pattern):
            print(f"Processing file: {file}")
            self_generate(
                ds_name=None,
                parquet_file=file,
                split=args.split,
                args=args,
                llm=llm,
                system_template=SELF_GEN_SYSTEM_MSG,
                do_truncate=args.do_truncate,
            )