Doc-to-LoRA release

This commit is contained in:
51616 2026-02-27 03:47:04 +00:00
commit 1abe8ae16d
92 changed files with 22131 additions and 0 deletions

View file

@ -0,0 +1,42 @@
import gc
from datasets import Dataset, load_dataset
from tqdm import tqdm
if __name__ == "__main__":
    # Re-pack ucinlp/drop: one record per unique passage, carrying all of its
    # question/answer pairs, saved as parquet per split.
    ds_name = "ucinlp/drop"
    for split in ["train", "validation"]:
        ctx_qa_dict = {}
        ds = load_dataset(ds_name, split=split)
        print(f"Original size: {len(ds)}")
        # Iterate samples directly (index was unused); give tqdm a total.
        for sample in tqdm(ds, total=len(ds)):
            ctx = sample["passage"]
            # First time we see this passage, start empty QA lists for it.
            if ctx not in ctx_qa_dict:
                ctx_qa_dict[ctx] = {"prompts": [], "responses": []}
            question = sample["question"]
            # DROP stores answers as span lists; keep the first span only.
            answer = sample["answers_spans"]["spans"][0]
            ctx_qa_dict[ctx]["prompts"].append(question)
            ctx_qa_dict[ctx]["responses"].append(answer)
        print(f"Unique contexts: {len(ctx_qa_dict)}")
        # convert ctx_qa_dict to a list of dictionaries
        samples = [
            {
                "context": ctx,
                "prompts": qa["prompts"],
                "responses": qa["responses"],
            }
            for ctx, qa in ctx_qa_dict.items()
        ]
        print(f"Sampled data: {samples[0]}")
        # save to a new dataset
        ds = Dataset.from_list(samples)
        save_path = f"./data/raw_datasets/drop_compact/{split}/ds.parquet"
        print(f"Saving dataset to {save_path}")
        ds.to_parquet(save_path)
        print("=" * 80)
        # Free the per-split objects before the next iteration.
        del ds, samples, ctx_qa_dict
        gc.collect()

43
data/build_pwc_compact.py Normal file
View file

@ -0,0 +1,43 @@
import gc
from datasets import Dataset, load_dataset
from tqdm import tqdm
if __name__ == "__main__":
    # Re-pack sggetao/PwC: one record per unique input context, carrying all of
    # its prompt/answer pairs, saved as parquet per split.
    ds_name = "sggetao/PwC"
    for split in ["train", "test"]:
        ctx_qa_dict = {}
        ds = load_dataset(ds_name, split=split)
        print(f"Original size: {len(ds)}")
        # Iterate samples directly (index was unused); give tqdm a total.
        for sample in tqdm(ds, total=len(ds)):
            ctx = sample["input"]
            # First time we see this context, start empty QA lists for it.
            if ctx not in ctx_qa_dict:
                ctx_qa_dict[ctx] = {"prompts": [], "responses": []}
            ctx_qa_dict[ctx]["prompts"].append(sample["prompt"])
            ctx_qa_dict[ctx]["responses"].append(sample["answer"])
        print(f"Unique contexts: {len(ctx_qa_dict)}")
        # convert ctx_qa_dict to a list of dictionaries
        samples = [
            {
                "context": ctx,
                "prompts": qa["prompts"],
                "responses": qa["responses"],
            }
            for ctx, qa in ctx_qa_dict.items()
        ]
        print(f"Sampled data: {samples[0]}")
        # save to a new dataset
        ds = Dataset.from_list(samples)
        save_path = f"./data/raw_datasets/pwc_compact/{split}/ds.parquet"
        print(f"Saving dataset to {save_path}")
        ds.to_parquet(save_path)
        print("=" * 80)
        # Free the per-split objects before the next iteration.
        del ds, samples, ctx_qa_dict
        gc.collect()

View file

@ -0,0 +1,45 @@
import gc
from datasets import Dataset, load_dataset
from tqdm import tqdm
if __name__ == "__main__":
    # Re-pack allenai/ropes: the context is background + situation; one record
    # per unique context with all of its QA pairs, saved as parquet per split.
    ds_name = "allenai/ropes"
    for split in ["train", "validation"]:
        ctx_qa_dict = {}
        ds = load_dataset(ds_name, split=split)
        print(f"Original size: {len(ds)}")
        # Hoisted out of the loop: the template never changes per sample.
        ctx_template = "{background}\n{situation}"
        # Iterate samples directly (index was unused); give tqdm a total.
        for sample in tqdm(ds, total=len(ds)):
            # ROPES answers are lists of spans; keep the first answer text.
            response = sample["answers"]["text"][0]
            ctx = ctx_template.format(
                background=sample["background"], situation=sample["situation"]
            )
            q = sample["question"]
            # First time we see this context, start empty QA lists for it.
            if ctx not in ctx_qa_dict:
                ctx_qa_dict[ctx] = {"prompts": [], "responses": []}
            ctx_qa_dict[ctx]["prompts"].append(q)
            ctx_qa_dict[ctx]["responses"].append(response)
        print(f"Unique contexts: {len(ctx_qa_dict)}")
        # convert ctx_qa_dict to a list of dictionaries
        samples = [
            {
                "context": ctx,
                "prompts": qa["prompts"],
                "responses": qa["responses"],
            }
            for ctx, qa in ctx_qa_dict.items()
        ]
        print(f"Sampled data: {samples[0]}")
        # save to a new dataset
        ds = Dataset.from_list(samples)
        save_path = f"./data/raw_datasets/ropes_compact/{split}/ds.parquet"
        print(f"Saving dataset to {save_path}")
        ds.to_parquet(save_path)
        print("=" * 80)
        # Free the per-split objects before the next iteration.
        del ds, samples, ctx_qa_dict
        gc.collect()

View file

@ -0,0 +1,42 @@
import gc
from datasets import Dataset, load_dataset
from tqdm import tqdm
if __name__ == "__main__":
    # Re-pack the local SQuAD copy: one record per unique context with all of
    # its question/answer pairs, saved as parquet per split.
    ds_name = "data/raw_datasets/squad"
    for split in ["train", "validation"]:
        ctx_qa_dict = {}
        ds = load_dataset(ds_name, split=split)
        print(f"Original size: {len(ds)}")
        # Iterate samples directly (index was unused); give tqdm a total.
        for sample in tqdm(ds, total=len(ds)):
            ctx = sample["context"]
            # First time we see this context, start empty QA lists for it.
            if ctx not in ctx_qa_dict:
                ctx_qa_dict[ctx] = {"prompts": [], "responses": []}
            question = sample["question"]
            # SQuAD answers are lists of spans; keep the first answer text.
            answer = sample["answers"]["text"][0]
            ctx_qa_dict[ctx]["prompts"].append(question)
            ctx_qa_dict[ctx]["responses"].append(answer)
        print(f"Unique contexts: {len(ctx_qa_dict)}")
        # convert ctx_qa_dict to a list of dictionaries
        samples = [
            {
                "context": ctx,
                "prompts": qa["prompts"],
                "responses": qa["responses"],
            }
            for ctx, qa in ctx_qa_dict.items()
        ]
        print(f"Sampled data: {samples[0]}")
        # save to a new dataset
        ds = Dataset.from_list(samples)
        save_path = f"./data/raw_datasets/squad_compact/{split}/ds.parquet"
        print(f"Saving dataset to {save_path}")
        ds.to_parquet(save_path)
        print("=" * 80)
        # Free the per-split objects before the next iteration.
        del ds, samples, ctx_qa_dict
        gc.collect()

View file

@ -0,0 +1,10 @@
from huggingface_hub import snapshot_download
if __name__ == "__main__":
    # Fetch only the 100BT sample shard of FineWeb-Edu into the local
    # raw-datasets directory; everything else in the repo is skipped.
    fw_dir = "./data/raw_datasets/fineweb_edu/"
    snapshot_download(
        repo_id="HuggingFaceFW/fineweb-edu",
        repo_type="dataset",
        local_dir=fw_dir,
        allow_patterns="sample/100BT/*",
    )

View file

@ -0,0 +1,288 @@
import argparse
import json
import math
import os
import random
# -----------------------------
# Config knobs (edit or use CLI)
# -----------------------------
TOKENS_PER_BLOCK = 40  # rough heuristic tokens per noise block
BASE_SAMPLES_PER_BIN = (
    320_000  # training samples budget scaler only (val/test fixed at 1000 each)
)
RNG_SEED = 42  # default for --seed
# Filler sentence block, repeated to pad contexts to a target length.
NOISE_BLOCK = "The grass is green. The sky is blue. The sun is yellow. Here we go. There and back again."
# The "needle" sentence; {magic_number} is filled with a 4-digit string.
SPECIAL_TPL = "The special magic number is {magic_number}."
SEP = "\n"  # between blocks
def save_jsonl(data: list[dict], filepath: str) -> None:
    """Write *data* to *filepath* as JSON Lines, creating parent dirs as needed.

    Args:
        data: List of JSON-serializable dicts, one per output line.
        filepath: Destination path; its parent directory is created if missing.
    """
    parent_dir = os.path.dirname(filepath)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)
    # Explicit UTF-8 so the output does not depend on the platform's locale.
    with open(filepath, "w", encoding="utf-8") as f:
        for entry in data:
            json.dump(entry, f)
            f.write("\n")
essential_digits4 = lambda: f"{random.randint(0, 9_999):04d}"
def _choose_position(total_blocks: int, depth_bin: int) -> int:
"""Choose an insertion index for the special sentence within [0, total_blocks-1]
such that its relative depth falls within the depth bin [i/10, (i+1)/10).
"""
if total_blocks <= 0:
return 0
# Use floor for start and ceil for end to cover boundaries evenly
start = math.floor(total_blocks * (depth_bin / 10))
end = math.ceil(total_blocks * ((depth_bin + 1) / 10)) - 1
# clamp
start = max(0, min(start, total_blocks - 1))
end = max(start, min(end, total_blocks - 1))
return random.randint(start, end)
def _build_example(total_blocks: int, depth_bin: int) -> dict:
    """Build a single sample whose context hides one magic-number sentence.

    total_blocks: number of blocks in the final context (special line included)
    depth_bin: integer in [0, 9] selecting the insertion-depth decile
    """
    total_blocks = max(1, total_blocks)
    # Draw the needle first, then its slot — this keeps the RNG call order
    # identical for every example.
    magic = essential_digits4()
    needle = SPECIAL_TPL.format(magic_number=magic)
    insert_at = _choose_position(total_blocks, depth_bin)
    # Pad with identical noise blocks and drop the needle in at the chosen
    # slot; with total_blocks == 1 this degenerates to just the needle.
    block_seq = [NOISE_BLOCK] * (total_blocks - 1)
    block_seq.insert(insert_at, needle)
    return {
        "context": SEP.join(block_seq),
        "prompt": "What is the special magic number? Reply with only the number.",
        "response": magic,
    }
def generate_examples(n: int, k: int) -> list[dict]:
    """Generate n examples (all with block length k) spread evenly over the 10
    depth bins, then shuffle them.
    """
    if n <= 0:
        return []
    # The first (n % 10) bins absorb the remainder so counts sum to n exactly.
    quota, extra = divmod(n, 10)
    out: list[dict] = []
    for depth_bin in range(10):
        count = quota + (1 if depth_bin < extra else 0)
        out.extend(
            _build_example(total_blocks=k, depth_bin=depth_bin)
            for _ in range(count)
        )
    random.shuffle(out)
    return out
def main():
    """CLI entry point: build the magic-number needle-in-a-haystack datasets,
    one output directory per token-length bin, as train/val/test JSONL files.
    """
    parser = argparse.ArgumentParser(
        description="Generate noise-wrapped special magic number dataset (similar structure to generate_ctx_kv.py)",
    )
    parser.add_argument("--seed", type=int, default=RNG_SEED, help="Random seed")
    parser.add_argument(
        "--tokenizer-name",
        type=str,
        default="google/gemma-2-2b-it",
        help=("Tokenizer name"),
    )
    parser.add_argument(
        "--base-samples-per-bin",
        type=int,
        default=BASE_SAMPLES_PER_BIN,
        help="Baseline number of TRAINING samples per token bin (scaled by bin width). Validation & test are always 1000 each.",
    )
    parser.add_argument(
        "--out-prefix",
        type=str,
        default="data/raw_datasets/ctx_magic_number",
        help="Output directory prefix (bin range will be appended)",
    )
    parser.add_argument(
        "--tokens-per-block",
        "--tokens-per-pair",
        dest="tokens_per_block",
        type=int,
        default=TOKENS_PER_BLOCK,
        help="Heuristic tokens per noise block for bucketing",
    )
    parser.add_argument(
        "--only-first-n-bins",
        type=int,
        default=None,
        help="For quick tests: only generate the first N token bins",
    )
    parser.add_argument(
        "--dry-run",
        action="store_true",
        help="Print a small sample and exit without writing files",
    )
    args = parser.parse_args()
    random.seed(args.seed)
    # ----------------------------------------------------
    # Optional: report tokenizer-based token length stats
    # ----------------------------------------------------
    if args.tokenizer_name:
        # transformers is imported lazily so the script can run without it
        # when no tokenizer is requested.
        try:
            from transformers import AutoTokenizer  # type: ignore
        except Exception as e:  # pragma: no cover
            raise RuntimeError(
                "Failed to import transformers. Install it or omit --tokenizer-name."
            ) from e
        tokenizer = AutoTokenizer.from_pretrained(args.tokenizer_name, use_fast=True)
        noise_token_count = len(tokenizer(NOISE_BLOCK).input_ids)
        special_example = SPECIAL_TPL.format(magic_number="0000")
        special_token_count = len(tokenizer(special_example).input_ids)
        print(
            f"[Tokenizer: {args.tokenizer_name}] Noise block tokens: {noise_token_count} | Special line tokens: {special_token_count}"
        )
    # Token-count bins: a few small/overlapping warm-up bins, then 1K-wide
    # bins up to 16K, then progressively wider bins up to ~128K.
    tok_bins = [(32, 128), (128, 256), (256, 512), (512, 1024), (32, 1024)] + [
        (1024 * i, 1024 * (i + 1)) for i in range(1, 16)
    ]
    tok_bins += [(2**14 + 2**12 * (i), 2**14 + 2**12 * (i + 1)) for i in range(4)]
    tok_bins += [(2**15 + 2**13 * (i), 2**15 + 2**13 * (i + 1)) for i in range(12)]
    if args.only_first_n_bins is not None:
        tok_bins = tok_bins[: args.only_first_n_bins]
    if args.tokenizer_name:
        # Map token bins to block-count bins by measuring actual tokenized
        # context lengths for increasing block counts.
        max_hi = max(hi for _, hi in tok_bins)

        def measure_len(k: int) -> int:
            # Token length of a context with k blocks (k-1 noise + 1 needle).
            if k == 1:
                ctx = SPECIAL_TPL.format(magic_number="0000")
            else:
                blocks = [NOISE_BLOCK] * (k - 1) + [
                    SPECIAL_TPL.format(magic_number="0000")
                ]
                ctx = SEP.join(blocks)
            return len(tokenizer(ctx).input_ids)

        # lengths[k] = token count of a k-block context; index 0 is padding
        # so block counts index directly.
        lengths: list[int] = [0]
        k = 1
        while True:
            L = measure_len(k)
            lengths.append(L)
            if L >= max_hi:
                break
            k += 1
        len_bins = []
        for lo, hi in tok_bins:
            # Smallest block count whose token length reaches the bin floor.
            k_lo = None
            for kk in range(1, len(lengths)):
                if lengths[kk] >= lo:
                    k_lo = kk
                    break
            if k_lo is None or lengths[k_lo] >= hi:
                # No block count lands inside this bin; mark it empty.
                len_bins.append((0, 0))
                continue
            # First block count that overshoots the bin ceiling (exclusive).
            k_hi = len(lengths)
            for kk in range(k_lo, len(lengths)):
                if lengths[kk] >= hi:
                    k_hi = kk
                    break
            len_bins.append((k_lo, k_hi))
        base_tokens = lengths[1]
        delta = (lengths[2] - lengths[1]) if len(lengths) > 2 else 0
        print(
            f"Using tokenizer-measured block ranges. base_tokens={base_tokens} approx_delta={delta}"
        )
    else:
        # No tokenizer: fall back to the per-block token heuristic.
        len_bins = [
            (lo // args.tokens_per_block, hi // args.tokens_per_block)
            for (lo, hi) in tok_bins
        ]
    if args.dry_run:
        # Print one sample from the first non-empty bin and exit.
        for lb in len_bins:
            if lb[1] > lb[0]:
                k = max(1, lb[0])
                sample = generate_examples(10, k)
                print("Sample entry:")
                print(json.dumps(sample[0], indent=2))
                break
        return
    # -----------------------------------------------
    # Main generation per token bin
    # -----------------------------------------------
    TARGET_VAL = 1000
    TARGET_TEST = 1000
    for len_bin, tok_bin in zip(len_bins, tok_bins):
        if len_bin[1] <= len_bin[0]:
            print(f"Skipping token bin {tok_bin} (no valid block counts)")
            continue
        k_start = max(1, len_bin[0])
        k_end = max(1, len_bin[1])
        k_values = list(range(k_start, k_end))
        bin_size = len(k_values)
        save_dir = f"{args.out_prefix}_{tok_bin[0]}_{tok_bin[1]}"
        # Only bins up to 1024 tokens get a training split.
        training_enabled = tok_bin[1] <= 1024  # unchanged policy
        if training_enabled:
            train_data: list[dict] = []
            # Distribute training budget across k values.
            # Scale: per_k = base_samples_per_bin / bin_size
            per_k_train = max(1, args.base_samples_per_bin // max(1, bin_size))
            for k in k_values:
                train_data += generate_examples(per_k_train, k)
        val_data: list[dict] = []
        test_data: list[dict] = []
        # Split the fixed 1000-sample val/test budgets across block counts;
        # the first `rem_*` counts take one extra sample each.
        base_val = TARGET_VAL // bin_size
        rem_val = TARGET_VAL % bin_size
        base_test = TARGET_TEST // bin_size
        rem_test = TARGET_TEST % bin_size
        for idx, k in enumerate(k_values):
            n_val_k = base_val + (1 if idx < rem_val else 0)
            n_test_k = base_test + (1 if idx < rem_test else 0)
            if n_val_k:
                val_data += generate_examples(n_val_k, k)
            if n_test_k:
                test_data += generate_examples(n_test_k, k)
        random.shuffle(val_data)
        random.shuffle(test_data)
        os.makedirs(save_dir, exist_ok=True)
        if training_enabled:
            save_jsonl(train_data, f"{save_dir}/train.jsonl")
        save_jsonl(val_data, f"{save_dir}/val.jsonl")
        save_jsonl(test_data, f"{save_dir}/test.jsonl")
        if training_enabled:
            print(
                f"Dataset generated at {save_dir} (train={len(train_data)} val={len(val_data)} test={len(test_data)})"
            )
        else:
            print(
                f"Dataset (val/test only) generated at {save_dir} (val={len(val_data)} test={len(test_data)})"
            )


if __name__ == "__main__":
    main()

View file

@ -0,0 +1,269 @@
import argparse
import os
import re
from glob import glob
import pandas as pd
from datasets import Dataset, load_dataset
from vllm import LLM, SamplingParams
# Per-model end-of-response markers: a completion must end with one of these
# strings to count as finished (see check_should_skip).
STOP_STRINGS = {
    "google/gemma-3-12b-it": ["<eos>", "<end_of_turn>"],
}
# System message framing the model as a reading-comprehension QA writer.
SYSTEM_TEMPLATE = (
    "You are a creative and helpful assistant.\n"
    "You are tasked with generating questions for reading comprehension tests.\n"
    "You will be given a context and you need to generate questions and corresponding answers from the given context.\n"
    "The questions should be highly specific to the information provided in the context, not general questions that suit any context.\n"
    "**DO NOT** hallucinate or make up information."
)
# based on Make Your LLM Fully Utilize the Context (https://arxiv.org/pdf/2404.16811)
# User prompt: {context} and {n_qa_pairs} are filled by get_prompt; the doubled
# braces ({{question_1}}, ...) survive .format() as literal placeholders.
PROMPT_TEMPLATE = (
    "### Instructions ###\n"
    "Generate questions and corresponding answers from the given context. The questions should be highly specific to the "
    "information provided in the context, not general questions that suit any context.\n\n"
    "### Context ###\n"
    "{context}\n\n\n"
    "### Rules ###\n"
    "Rules to follow when generating the questions:\n"
    "1. The questions must be specific to the given context and fully answerable from information present in the given context.\n"
    "2. Ask questions that are fact-seeking based on the information provided.\n"
    "3. Make sure the questions are clear and unambiguous.\n"
    "4. Phrases like 'based on the provided context', 'according to the context', 'in the context', etc., are **NOT ALLOWED** to appear in "
    "the questions.\n"
    "5. The questions should not overlap. They should be diverse, covering many aspects of the context.\n"
    "6. Do not give away too much information in the questions. For example, ask 'Who is X?' instead of 'Who is X that did Y?' when Y is clear from the context.\n"
    "7. Ignore the text formatting of the context, e.g., bold, italic, underline, etc.\n"
    "8. Ignore typos, spacing, and grammatical errors in the context.\n\n"
    "Rules to follow when generating the answers:\n"
    "1. The answers must use the (implied) information provided in the context.\n"
    "2. Phrases like 'based on the provided context', 'according to the context', 'in the context', etc., are **NOT ALLOWED** to appear in "
    "the answers.\n"
    "3. Do not just copy words from the context. Answer the question in your own words.\n"
    "4. The answers should be detailed and comprehensive. Please include additional specific details from the context.\n\n"
    "Respond with {n_qa_pairs} question-answer pairs.\n"
    "Always use proper grammar and punctuation.\n"
    "Try to use different question forms and styles.\n"
    "Use simple words and make sure that the answers are clear and comprehensive.\n\n"
    "The question-answer pairs should be in the following format:\n"
    "Question 1: {{question_1}}\n"
    "Answer 1: {{answer_1}}\n"
    "Question 2: {{question_2}}\n"
    "Answer 2: {{answer_2}}\n"
    "..."
)
def get_prompt(context, n_qa_pairs):
    """Fill the QA-generation prompt template with a context and pair count."""
    return PROMPT_TEMPLATE.format(context=context, n_qa_pairs=n_qa_pairs)
def check_should_skip(txt: str, vllm_model: str) -> tuple[str, bool]:
    """Strip a trailing stop string from *txt* and report truncation.

    Args:
        txt: A raw model completion.
        vllm_model: Key into STOP_STRINGS for this model's stop markers.
    Returns:
        (cleaned_text, skip): ``skip`` is True when no known stop string
        terminates the text, i.e. the generation was likely cut off.

    Fix: the original annotated the return type as ``bool`` although a tuple
    is always returned; also replaced the tail-slice membership test with the
    equivalent, clearer ``str.endswith``.
    """
    for stop in STOP_STRINGS[vllm_model]:
        if txt.endswith(stop):
            return (txt.split(stop)[0], False)  # Found a valid stop string
    return (txt, True)  # No valid stop string found, skip this response
def postprocess_qa_pairs(res_txt: str) -> tuple[list[str], list[str]]:
    """
    Postprocesses the QA pairs from the response text.
    Parses "Question N: ... Answer N: ..." formatted text into parallel lists,
    dropping empty pairs and a truncated final pair.
    Args:
        res_txt: The response text.
    Returns:
        A tuple of two lists, the first containing the questions and the second containing the answers.
    """
    # capture everything after each "Question {number}:" until "Answer"
    res_txt = remove_think(res_txt)
    q_pattern = r"Question \d+:(.*?)(?=Answer|$)"  # thanks chatgpt
    questions = re.findall(q_pattern, res_txt, flags=re.S)
    a_pattern = r"Answer \d+:(.*?)(?=Question|$)"  # thanks chatgpt
    answers = re.findall(a_pattern, res_txt, flags=re.S)
    if len(questions) != len(answers):
        print(f"Warning---number of questions and answers do not match")
        print(f"Number of questions: {len(questions)}")
        print(f"Number of answers: {len(answers)}")
    out_q = []
    out_a = []
    n_skips = 0
    if (len(questions) > 0) and (len(answers) > 0):
        # Pair up only as many as both lists provide.
        n_gen_pairs = min(len(questions), len(answers))
        has_left_over = n_gen_pairs < len(questions) or n_gen_pairs < len(answers)
        for i in range(n_gen_pairs):
            response = answers[i].strip()
            question = questions[i].strip()
            if not response or not question:
                print(f"Skipping empty question or answer at index {i}")
                continue
            # Only the very last pair can be truncated by the token budget;
            # verify it ended with a proper stop string before keeping it.
            if (not has_left_over) and (i == n_gen_pairs - 1):
                # NOTE(review): reads the module-level global `vllm_model`,
                # which is only bound in the __main__ block — this function
                # would fail if imported standalone; confirm intended.
                response, skip = check_should_skip(response, vllm_model)
                if skip:
                    print(f"Skipping due to missing stop string")
                    n_skips += 1
                    continue
            out_q.append(question.strip())
            out_a.append(response.strip())
    print(f"Skipped {n_skips} responses due to missing stop strings")
    return out_q, out_a
def length_filter(sample, min_len, max_len):
    """Keep samples whose "text" character length lies in [min_len, max_len]."""
    n_chars = len(sample["text"])
    return min_len <= n_chars and n_chars <= max_len
def remove_think(txt):
    """Return the text after the last "</think>" tag (unchanged if absent)."""
    _, _, tail = txt.rpartition("</think>")
    return tail if tail or "</think>" in txt else txt
if __name__ == "__main__":
    # Level-0 pipeline: for each FineWeb-Edu parquet shard matching the
    # pattern, batch-generate QA pairs with vLLM and save train/val shards.
    parser = argparse.ArgumentParser(
        description="Generate QA pairs from FineWeb Edu dataset"
    )
    parser.add_argument(
        "--vllm_model",
        type=str,
        default=os.environ.get("vllm_model", "google/gemma-2-27b-it"),
        help="VLLM model to use for generation",
    )
    parser.add_argument(
        "--shard_pattern",
        type=str,
        required=True,
        help="Pattern to match shard files (e.g., '000_0000*')",
    )
    parser.add_argument(
        "--n_qa_pairs",
        type=int,
        required=True,
        help="Number of question-answer pairs to generate per context",
    )
    parser.add_argument(
        "--min_length",
        type=int,
        default=0,
        help="Minimum length of the context to consider for generation",
    )
    parser.add_argument(
        "--max_length",
        type=int,
        default=2000,
        help="Maximum length of the context to consider for generation",
    )
    parser.add_argument(
        "--max_model_length",
        type=int,
        default=2**14,
        help="Maximum length of the model input (context + prompt + response) in tokens",
    )
    parser.add_argument(
        "--debug",
        action="store_true",
        help="Debug mode - process only first 100 samples",
    )
    args = parser.parse_args()
    vllm_model = args.vllm_model
    print(f"Using model: {vllm_model}")
    llm_kwargs = dict(
        model=vllm_model,
        dtype="bfloat16",
        enable_prefix_caching=True,
        enable_chunked_prefill=True,
        max_model_len=args.max_model_length,
        limit_mm_per_prompt={"image": 0},
    )
    llm = LLM(**llm_kwargs)
    # NOTE(review): `tokenizer` is never used below — dead assignment?
    tokenizer = llm.get_tokenizer()
    shard_pattern = args.shard_pattern
    n_qa_pairs = args.n_qa_pairs
    paths = glob(
        f"./data/raw_datasets/fineweb_edu/sample/100BT/{shard_pattern}.parquet"
    )
    split = "train[:100]" if args.debug else "train"
    for path in paths:
        ds = load_dataset(
            "parquet",
            data_files=path,
            split=split,
        )
        # Keep only contexts within the requested character-length window.
        ds = ds.filter(
            length_filter,
            fn_kwargs={"min_len": args.min_length, "max_len": args.max_length},
            num_proc=8,
        )
        ctxs = [sample["text"] for sample in iter(ds)]
        # One chat per context: shared system message + per-context user prompt.
        messages = [
            [
                {"role": "system", "content": SYSTEM_TEMPLATE},
                {"role": "user", "content": get_prompt(ctx, n_qa_pairs)},
            ]
            for ctx in ctxs
        ]
        print(f"Generating from {len(messages)} contexts")
        completions = llm.chat(
            messages,
            sampling_params=SamplingParams(
                max_tokens=2048,
                temperature=0.0,
                # needed for checking if stop tokens are present
                skip_special_tokens=False,
                include_stop_str_in_output=True,
            ),
        )
        samples = []
        for ctx, completion in zip(ctxs, completions):
            questions, answers = postprocess_qa_pairs(completion.outputs[0].text)
            samples.append(
                {
                    "context": ctx,
                    "prompts_level_0": questions,
                    "responses_level_0": answers,
                }
            )
            if args.debug:
                print(f"{ctx=}")
                print(f"{completion.outputs[0].text=}")
                for q, a in zip(questions, answers):
                    print(f"{q=}")
                    print(f"{a=}")
                    print()
                print("=" * 80)
        print(f"Generated {len(samples)} samples")
        df = pd.DataFrame(samples)
        ds = Dataset.from_pandas(df)
        # Hold out the first 10 rows as a validation shard.
        val_ds = ds.take(10)
        ds = ds.skip(10)
        shard_name = path.split("/")[-1].split(".")[0]
        shard_name += "_level_0"
        if args.debug:
            shard_name += "_debug"
        ds.to_parquet(
            f"data/raw_datasets/fw_qa_v2/min_{args.min_length}_to_{args.max_length}/{shard_name}.parquet"
        )
        val_ds.to_parquet(
            f"data/raw_datasets/fw_qa_v2/min_{args.min_length}_to_{args.max_length}/{shard_name}_val.parquet"
        )
        print(
            f"Saved to data/raw_datasets/fw_qa_v2/min_{args.min_length}_to_{args.max_length}/{shard_name}.parquet"
        )
        print(
            f"Saved to data/raw_datasets/fw_qa_v2/min_{args.min_length}_to_{args.max_length}/{shard_name}_val.parquet"
        )

View file

@ -0,0 +1,296 @@
import argparse
import gc
import os
import re
from glob import glob
from datasets import load_dataset
from vllm import LLM, SamplingParams
# Per-model end-of-response markers: a completion must end with one of these
# strings to count as finished (see check_should_skip).
STOP_STRINGS = {
    "google/gemma-3-12b-it": ["<eos>", "<end_of_turn>"],
}
# System message framing the model as a reading-comprehension QA writer.
SYSTEM_TEMPLATE = (
    "You are a creative and helpful assistant.\n"
    "You are tasked with generating questions for reading comprehension tests.\n"
    "You will be given a context and you need to generate questions and corresponding answers from the given context.\n"
    "The questions should be highly specific to the information provided in the context, not general questions that suit any context.\n"
    "**DO NOT** hallucinate or make up information."
)
# based on Make Your LLM Fully Utilize the Context (https://arxiv.org/pdf/2404.16811)
# Level-up variant: also shows {qa_pairs} from the previous level so the model
# avoids redundancy; doubled braces survive .format() as literal placeholders.
PROMPT_TEMPLATE = (
    "### Instructions ###\n"
    "Generate questions and corresponding answers from the given context. The questions should be highly specific to the "
    "information provided in the context, not general questions that suit any context.\n\n"
    "### Context ###\n"
    "{context}\n\n\n"
    "### Example Question-Answer Pairs ###\n"
    "{qa_pairs}\n\n\n"
    "### Rules ###\n"
    "Rules to follow when generating the questions:\n"
    "1. The questions must be specific to the given context and fully answerable from information present in *or* implied from the given context.\n"
    "2. The questions must *not* be redundant with the example questions-answer pairs provided.\n"
    "3. You should prioritize fact-seeking questions. Consider reversal questions, e.g., asking 'What causes X to happen?' is valid when 'Y causes X' is presented in the context.\n"
    "4. If all the facts in the context are already covered by the provided examples, you must generate *more complicated* questions that require reasoning beyond simple information retrieval.\nThis includes asking about information that can be inferred, requiring synthesizing information from multiple parts of the text, or understanding relationships between concepts, events, or individuals mentioned in the context. For example, if the context says 'The Eiffel Tower was completed in 1889 after 2 years of construction', you can ask 'When did the construction of the Eiffel Tower begin?'. Here's another example: if the context says 'Alice is Bob's mother. Bob is Charlie's Dad', you can ask 'Who is Charlie's grandmother?'.\n"
    "5. Phrases like 'based on the provided context', 'according to the context', 'in the context', etc., are **NOT ALLOWED** to appear in "
    "the questions.\n"
    "6. The questions should not overlap. They should be diverse, covering many aspects of the context.\n"
    "7. Do not give away too much information in the questions. For example, ask 'Who is X?' instead of 'Who is X that did Y?' when Y is clear from the context.\n"
    "8. Ignore the text formatting of the context, e.g., bold, italic, underline, etc.\n"
    "9. Ignore typos, spacing, and grammatical errors in the context.\n\n"
    "Rules to follow when generating the answers:\n"
    "1. The answers must use the (implied) information provided in the context.\n"
    "2. Phrases like 'based on the provided context', 'according to the context', 'in the context', etc., are **NOT ALLOWED** to appear in "
    "the answers.\n"
    "3. Do not just copy words from the context. Answer the question in your own words.\n"
    "4. The answers should be detailed and comprehensive. Please include additional specific details from the context.\n\n"
    "Respond with {n_qa_pairs} question-answer pairs.\n"
    "Always use proper grammar and punctuation.\n"
    "Try to use different question forms and styles.\n"
    "Use simple words and make sure that the answers are clear and comprehensive.\n\n"
    "The question-answer pairs should be in the following format:\n"
    "Question 1: {{question_1}}\n"
    "Answer 1: {{answer_1}}\n"
    "Question 2: {{question_2}}\n"
    "Answer 2: {{answer_2}}\n"
    "..."
)
def get_prompt(context, example_qa_pairs, n_qa_pairs):
    """Build the next-level QA prompt from a context plus prior example pairs."""
    return PROMPT_TEMPLATE.format(
        context=context,
        qa_pairs=example_qa_pairs,
        n_qa_pairs=n_qa_pairs,
    )
def check_should_skip(txt: str, vllm_model: str) -> tuple[str, bool]:
    """Strip a trailing stop string from *txt* and report truncation.

    Args:
        txt: A raw model completion.
        vllm_model: Key into STOP_STRINGS for this model's stop markers.
    Returns:
        (cleaned_text, skip): ``skip`` is True when no known stop string
        terminates the text, i.e. the generation was likely cut off.

    Fix: the original annotated the return type as ``bool`` although a tuple
    is always returned; also replaced the tail-slice membership test with the
    equivalent, clearer ``str.endswith``.
    """
    for stop in STOP_STRINGS[vllm_model]:
        if txt.endswith(stop):
            return (txt.split(stop)[0], False)  # Found a valid stop string
    return (txt, True)  # No valid stop string found, skip this response
def postprocess_qa_pairs(res_txt: str) -> tuple[list[str], list[str]]:
    """
    Postprocesses the QA pairs from the response text.
    Parses "Question N: ... Answer N: ..." formatted text into parallel lists,
    dropping empty pairs and a truncated final pair.
    Args:
        res_txt: The response text.
    Returns:
        A tuple of two lists, the first containing the questions and the second containing the answers.
    """
    # capture everything after each "Question {number}:" until "Answer"
    res_txt = remove_think(res_txt)
    q_pattern = r"Question \d+:(.*?)(?=Answer|$)"  # thanks chatgpt
    questions = re.findall(q_pattern, res_txt, flags=re.S)
    a_pattern = r"Answer \d+:(.*?)(?=Question|$)"  # thanks chatgpt
    answers = re.findall(a_pattern, res_txt, flags=re.S)
    if len(questions) != len(answers):
        print(f"Warning---number of questions and answers do not match")
        print(f"Number of questions: {len(questions)}")
        print(f"Number of answers: {len(answers)}")
    out_q = []
    out_a = []
    n_skips = 0
    if (len(questions) > 0) and (len(answers) > 0):
        # Pair up only as many as both lists provide.
        n_gen_pairs = min(len(questions), len(answers))
        has_left_over = n_gen_pairs < len(questions) or n_gen_pairs < len(answers)
        for i in range(n_gen_pairs):
            response = answers[i].strip()
            question = questions[i].strip()
            if not response or not question:
                print(f"Skipping empty question or answer at index {i}")
                continue
            # Only the very last pair can be truncated by the token budget;
            # verify it ended with a proper stop string before keeping it.
            if (not has_left_over) and (i == n_gen_pairs - 1):
                # NOTE(review): reads the module-level global `vllm_model`,
                # which is only bound in the __main__ block — this function
                # would fail if imported standalone; confirm intended.
                response, skip = check_should_skip(response, vllm_model)
                if skip:
                    print(f"Skipping due to missing stop string")
                    n_skips += 1
                    continue
            out_q.append(question.strip())
            out_a.append(response.strip())
    print(f"Skipped {n_skips} responses due to missing stop strings")
    return out_q, out_a
def flatten_list(l):
    """Concatenate a list of lists into a single flat list."""
    return [item for sub in l for item in sub]
def remove_think(txt):
    """Return the text after the last "</think>" tag (unchanged if absent)."""
    _, _, tail = txt.rpartition("</think>")
    return tail if tail or "</think>" in txt else txt
if __name__ == "__main__":
    # Level-N+1 pipeline: for each already-generated "..._level_N" shard,
    # prompt the model with the existing QA pairs as examples, generate a new
    # batch of non-redundant pairs, and save the shard as "..._level_N+1".
    parser = argparse.ArgumentParser(
        description="Generate QA pairs from FineWeb Edu dataset"
    )
    parser.add_argument(
        "--vllm_model",
        type=str,
        default=os.environ.get("vllm_model", "google/gemma-2-27b-it"),
        help="VLLM model to use for generation",
    )
    parser.add_argument(
        "--shard_pattern",
        type=str,
        required=True,
        help="Pattern to match shard files (e.g., '000_0000*')",
    )
    parser.add_argument(
        "--n_qa_pairs",
        type=int,
        required=True,
        help="Number of question-answer pairs to generate per context",
    )
    parser.add_argument(
        "--max_model_length",
        type=int,
        default=2**12,
        help="Maximum length of the model input (context + prompt + response) in tokens",
    )
    parser.add_argument(
        "--debug",
        action="store_true",
        help="Debug mode - process only first 100 samples",
    )
    args = parser.parse_args()
    vllm_model = args.vllm_model
    print(f"Using model: {vllm_model}")
    llm_kwargs = dict(
        model=vllm_model,
        dtype="bfloat16",
        enable_prefix_caching=True,
        enable_chunked_prefill=True,
        # NOTE(review): hard-coded 2**14 here ignores --max_model_length
        # (default 2**12), which is otherwise unused — confirm intended.
        max_model_len=2**14,
        limit_mm_per_prompt={"image": 0},
    )
    llm = LLM(**llm_kwargs)
    # NOTE(review): `tokenizer` is never used below — dead assignment?
    tokenizer = llm.get_tokenizer()
    shard_pattern = args.shard_pattern
    n_qa_pairs = args.n_qa_pairs
    paths = glob(f"./data/raw_datasets/fw_qa_v2/{shard_pattern}.parquet")
    split = "train[:100]" if args.debug else "train"
    for path in paths:
        assert "_level" in path, (
            "Path must contain '_level' to indicate the dataset level"
        )
        # Derive the current level from the shard name (strip any _debug tag).
        shard_name = path.split("/")[-1].split(".")[0].split("_debug")[0]
        if "/" in shard_pattern:
            # Preserve any subdirectory prefix from the pattern in the output.
            shard_name = "/".join(shard_pattern.split("/")[:-1]) + "/" + shard_name
        cur_level = int(shard_name.split("_level_")[-1])
        next_level = cur_level + 1
        ds = load_dataset(
            "parquet",
            data_files=path,
            split=split,
        )
        # All prior-level QA columns feed into the example block.
        prompt_cols = [col for col in ds.column_names if col.startswith("prompts")]
        response_cols = [col for col in ds.column_names if col.startswith("responses")]
        assert len(prompt_cols) > 0, "No prompt columns found in the dataset"
        if len(prompt_cols) != len(response_cols):
            raise ValueError(
                "Number of prompt columns does not match number of response columns"
            )
        samples_data = []
        for sample in iter(ds):
            # Format existing QA pairs as examples
            example_qa_pairs = ""
            questions = flatten_list([sample[col] for col in prompt_cols])
            answers = flatten_list([sample[col] for col in response_cols])
            for i, (q, a) in enumerate(zip(questions, answers), 1):
                example_qa_pairs += f"Question {i}: {q}\nAnswer {i}: {a}\n"
            samples_data.append(
                {"context": sample["context"], "example_qa_pairs": example_qa_pairs}
            )
        # Drop the dataset handle before the big generation pass.
        del ds
        gc.collect()
        messages = [
            [
                {"role": "system", "content": SYSTEM_TEMPLATE},
                {
                    "role": "user",
                    "content": get_prompt(
                        sample["context"], sample["example_qa_pairs"], n_qa_pairs
                    ),
                },
            ]
            for sample in samples_data
        ]
        print(f"Generating from {len(messages)} contexts")
        completions = llm.chat(
            messages,
            sampling_params=SamplingParams(
                temperature=0.0,
                # needed for checking if stop tokens are present
                skip_special_tokens=False,
                include_stop_str_in_output=True,
            ),
        )
        samples = []
        for sample_data, completion in zip(samples_data, completions):
            questions, answers = postprocess_qa_pairs(completion.outputs[0].text)
            samples.append(
                {
                    "context": sample_data["context"],
                    f"prompts_level_{next_level}": questions,
                    f"responses_level_{next_level}": answers,
                }
            )
            if args.debug:
                print(f"context={sample_data['context']}")
                print(f"example_qa_pairs={sample_data['example_qa_pairs']}")
                print(f"{completion.outputs[0].text=}")
                for q, a in zip(questions, answers):
                    print(f"{q=}")
                    print(f"{a=}")
                    print()
                print("=" * 80)
        del samples_data
        gc.collect()
        print(f"Generated {len(samples)} samples")
        # Reload the source shard and append the new-level columns to it.
        ds = load_dataset(
            "parquet",
            data_files=path,
            split=split,
        )
        ds = ds.add_column(
            f"prompts_level_{next_level}",
            [sample[f"prompts_level_{next_level}"] for sample in samples],
        )
        ds = ds.add_column(
            f"responses_level_{next_level}",
            [sample[f"responses_level_{next_level}"] for sample in samples],
        )
        shard_name_base = shard_name.split("_level_")[0]
        shard_name = f"{shard_name_base}_level_{next_level}"
        if args.debug:
            shard_name += "_debug"
        ds.to_parquet(f"data/raw_datasets/fw_qa_v2/{shard_name}.parquet")
        print(f"Saved to data/raw_datasets/fw_qa_v2/{shard_name}.parquet")

387
data/gutenburg_sample.txt Normal file
View file

@ -0,0 +1,387 @@
The Project Gutenberg eBook, Addison, by William John Courthope
This eBook is for the use of anyone anywhere at no cost and with
almost no restrictions whatsoever. You may copy it, give it away or
re-use it under the terms of the Project Gutenberg License included
with this eBook or online at www.gutenberg.org
Title: Addison
Author: William John Courthope
Release Date: November 27, 2012 [eBook #41496]
Language: English
Character set encoding: ISO-8859-1
***START OF THE PROJECT GUTENBERG EBOOK ADDISON***
E-text prepared by the Online Distributed Proofreading Team
(http://www.pgdp.net) from page images generously made available by
Internet Archive (http://archive.org)
Note: Images of the original pages are available through
Internet Archive. See
http://archive.org/details/addison_00cour
Transcriber's note:
Text enclosed by underscores is in italics (_italics_).
Text enclosed by curly brackets is superscripted
(example: y{e}).
English Men of Letters
Edited by John Morley
ADDISON
by
W. J. COURTHOPE
Harper & Brothers Publishers
New York and London
1902
* * * * *
ENGLISH MEN OF LETTERS.
EDITED BY JOHN MORLEY.
JOHNSON Leslie Stephen.
GIBBON J. C. Morison.
SCOTT R. H. Hutton.
SHELLEY J. A. Symonds.
HUME T. H. Huxley.
GOLDSMITH William Black.
DEFOE William Minto.
BURNS J. C. Shairp.
SPENSER R. W. Church.
THACKERAY Anthony Trollope.
BURKE John Morley.
MILTON Mark Pattison.
HAWTHORNE Henry James, Jr.
SOUTHEY E. Dowden.
CHAUCER A. W. Ward.
BUNYAN J. A. Froude.
COWPER Goldwin Smith.
POPE Leslie Stephen.
BYRON John Nichol.
LOCKE Thomas Fowler.
WORDSWORTH F. Myers.
DRYDEN G. Saintsbury.
LANDOR Sidney Colvin.
DE QUINCEY David Masson.
LAMB Alfred Ainger.
BENTLEY R. C. Jebb.
DICKENS A. W. Ward.
GRAY E. W. Gosse.
SWIFT Leslie Stephen.
STERNE H. D. Traill.
MACAULAY J. Cotter Morison.
FIELDING Austin Dobson.
SHERIDAN Mrs. Oliphant.
ADDISON W. J. Courthope.
BACON R. W. Church.
COLERIDGE H. D. Traill.
SIR PHILIP SIDNEY J. A. Symonds.
KEATS Sidney Colvin.
CARLYLE John Nichol.
12mo, Cloth, 75 cents per volume.
_Other volumes in preparation._
PUBLISHED BY HARPER & BROTHERS, NEW YORK.
_Any of the above works will be sent by mail, postage prepaid, to any part
of the United States, Canada, or Mexico, on receipt of the price._
* * * * *
CONTENTS.
PAGE
CHAPTER I.
THE STATE OF ENGLISH SOCIETY AND LETTERS
AFTER THE RESTORATION 1
CHAPTER II.
ADDISON'S FAMILY AND EDUCATION 21
CHAPTER III.
ADDISON ON HIS TRAVELS 38
CHAPTER IV.
HIS EMPLOYMENT IN AFFAIRS OF STATE 53
CHAPTER V.
THE "TATLER" AND "SPECTATOR" 78
CHAPTER VI.
"CATO" 110
CHAPTER VII.
ADDISON'S QUARREL WITH POPE 125
CHAPTER VIII.
THE LAST YEARS OF HIS LIFE 139
CHAPTER IX.
THE GENIUS OF ADDISON 153
ADDISON.
CHAPTER I.
THE STATE OF ENGLISH SOCIETY AND LETTERS AFTER THE RESTORATION.
Of the four English men of letters whose writings most fully embody the
spirit of the eighteenth century, the one who provides the biographer with
the scantiest materials is Addison. In his _Journal to Stella_, his social
verses, and his letters to his friends, we have a vivid picture of those
relations with women and that protracted suffering which invest with such
tragic interest the history of Swift. Pope, by the publication of his own
correspondence, has enabled us, in a way that he never intended, to
understand the strange moral twist which distorted a nature by no means
devoid of noble instincts. Johnson was fortunate in the companionship of
perhaps the best biographer who ever lived. But of the real life and
character of Addison scarcely any contemporary record remains. The formal
narrative prefixed to his works by Tickell is, by that writer's own
admission, little more than a bibliography. Steele, who might have told us
more than any man about his boyhood and his manner of life in London, had
become estranged from his old friend before his death. No writer has
taken the trouble to preserve any account of the wit and wisdom that
enlivened the "little senate" at Button's. His own letters are, as a rule,
compositions as finished as his papers in the _Spectator_. Those features
in his character which excite the greatest interest have been delineated
by the hand of an enemy--an enemy who possessed an unrivalled power of
satirical portrait-painting, and was restrained by no regard for truth
from creating in the public mind such impressions about others as might
serve to heighten the favourable opinion of himself.
This absence of dramatic incident in Addison's life would lead us
naturally to conclude that he was deficient in the energy and passion
which cause a powerful nature to leave a mark upon its age. Yet such a
judgment would certainly be erroneous. Shy and reserved as he was, the
unanimous verdict of his most illustrious contemporaries is decisive as to
the respect and admiration which he excited among them. The man who could
exert so potent an influence over the mercurial Steele, who could
fascinate the haughty and cynical intellect of Swift, whose conversation,
by the admission of his satirist Pope, had in it something more charming
than that of any other man; of whom it was said that he might have been
chosen king if he wished it; such a man, though to the coarse perception
of Mandeville he might have seemed no more than "a parson in a tye-wig,"
can hardly have been deficient in force of character.
Nor would it have been possible for a writer distinguished by mere
elegance and refinement to leave a lasting impress on the literature and
society of his country. In one generation after another, men representing
opposing elements of rank, class, interest, and taste, have agreed in
acknowledging Addison's extraordinary merits. "Whoever wishes," says
Johnson--at the end of a biography strongly coloured with the
prepossessions of a semi-Jacobite Tory--"whoever wishes to attain an
English style, familiar but not coarse, and elegant but not ostentatious,
must give his days and nights to the volumes of Addison." "Such a mark of
national respect," says Macaulay, the best representative of middle-class
opinion in the present century, speaking of the statue erected to Addison
in Westminster Abbey, "was due to the unsullied statesman, to the
accomplished scholar, to the master of pure English eloquence, to the
consummate painter of life and manners. It was due, above all, to the
great satirist who alone knew how to use ridicule without abusing it; who,
without inflicting a wound, effected a great social reform, and who
reconciled wit and virtue after a long and disastrous separation, during
which wit had been led astray by profligacy, and virtue by fanaticism."
This verdict of a great critic is accepted by an age to which the grounds
of it are, perhaps, not very apparent. The author of any ideal creation--a
poem, a drama, or a novel--has an imprescriptible property in the fame of
his work. But to harmonise conflicting social elements, to bring order out
of chaos in the sphere of criticism, to form right ways of thinking about
questions of morals, taste, and breeding, are operations of which the
credit, though it is certainly to be ascribed to particular individuals,
is generally absorbed by society itself. Macaulay's eulogy is as just as
it is eloquent, but the pages of the _Spectator_ alone will hardly show
the reader why Addison should be so highly praised for having reconciled
wit with virtue. Nor, looking at him as a critic, will it appear a great
achievement to have pointed out to English society the beauties of
_Paradise Lost_, unless it be remembered that the taste of the preceding
generation still influenced Addison's contemporaries, and that in that
generation Cowley was accounted a greater poet than Milton.
To estimate Addison at his real value we must regard him as the chief
architect of Public Opinion in the eighteenth century. But here again we
are met by an initial difficulty, because it has become almost a
commonplace of contemporary criticism to represent the eighteenth century
as a period of sheer destruction. It is tacitly assumed by a school of
distinguished philosophical writers that we have arrived at a stage in the
world's history in which it is possible to take a positive and scientific
view of human affairs. As it is of course necessary that from such a
system all belief in the supernatural shall be jealously excluded, it has
not seemed impossible to write the history of Thought itself in the
eighteenth century. And in tracing the course of this supposed continuous
stream it is natural that all the great English writers of the period
should be described as in one way or another helping to pull down, or
vainly to strengthen, the theological barriers erected by centuries of
bigotry against the irresistible tide of enlightened progress.
It would be of course entirely out of place to discuss here the merits of
this new school of history. Those who consider that, whatever glimpses we
may obtain of the law and order of the universe, man is, as he always has
been and always will be, a mystery to himself, will hardly allow that the
operations of the human spirit can be traced in the dissecting-room. But
it is, in any case, obvious that to treat the great _imaginative_ writers
of any age as if they were only mechanical agents in an evolution of
thought is to do them grave injustice. Such writers are, above all things,
creative. Their first aim is to "show the very age and body of the time
his form and pressure." No work of the eighteenth century, composed in a
consciously destructive spirit, has taken its place among the acknowledged
classics of the language. Even the _Tale of a Tub_ is to be regarded as a
satire upon the aberrations of theologians from right reason, not upon the
principles of Christianity itself. The _Essay on Man_ has, no doubt,
logically a tendency towards Deism, but nobody ever read the poem for the
sake of its philosophy; and it is well known that Pope was much alarmed
when it was pointed out to him that his conclusions might be represented
as incompatible with the doctrines of revealed religion.
The truth indeed seems to be the exact converse of what is alleged by the
scientific historians. So far from the eighteenth century in England being
an age of destructive analysis, its energies were chiefly devoted to
political, social, and literary reconstruction. Whatever revolution in
faith and manners the English nation had undergone had been the work of
the two preceding centuries, and though the historic foundations of
society remained untouched, the whole form of the superstructure had been
profoundly modified.
"So tenacious are we," said Burke, towards the close of the last
century, "of our old ecclesiastical modes and fashions of institution
that very little change has been made in them since the fourteenth or
fifteenth centuries, adhering in this particular as in all else to our
old settled maxim never entirely nor at once to depart from antiquity.
We found these institutions on the whole favourable to morality and
discipline, and we thought they were susceptible of amendment without
altering the ground. We thought they were capable of receiving and
meliorating, and, above all, of preserving the accessories of science
and literature as the order of Providence should successively produce
them. And after all, with this Gothic and monkish education (for such
it is the groundwork), we may put in our claim to as ample and early
a share in all the improvements in science, in arts, and in literature
which have illuminated the modern world as any other nation in Europe.
We think one main cause of this improvement was our not despising the
patrimony of knowledge which was left us by our forefathers."
All this is, in substance, true of our political as well as our
ecclesiastical institutions. And yet, when Burke wrote, the great feudal
and mediæval structure of England had been so transformed by the Wars of
the Roses, the Reformation, the Rebellion, and the Revolution, that its
ancient outlines were barely visible. In so far, therefore, as his words
seem to imply that the social evolution he describes was produced by an
imperceptible and almost mechanical process of national instinct, the
impression they tend to create is entirely erroneous.
If we have been hitherto saved from such corruption as undermined the
republics of Italy, from the religious wars that so long enfeebled and
divided Germany, and from the Revolution that has severed modern France
from her ancient history, thanks for this are due partly, no doubt, to
favouring conditions of nature and society, but quite as much to the
genius of great individuals who prepared the mind of the nation for the
gradual assimilation of new ideas. Thus Langland and Wycliffe and their
numerous followers, long before the Reformation, had so familiarised the
minds of the people with their ideas of the Christian religion that the
Sovereign was able to assume the Headship of the Church without the shock
of a social convulsion. Fresh feelings and instincts grew up in the hearts
of whole classes of the nation without at first producing any change in
outward habits of life, and even without arousing a sense of their logical
incongruity. These mixed ideas were constantly brought before the
imagination in the works of the poets. Shakespeare abounds with passages
in which, side by side with the old feudal, monarchical, catholic, and
patriotic instincts of Englishmen, we find the sentiments of the Italian
Renaissance. Spenser conveys Puritan doctrines sometimes by the mouth of
shepherds, whose originals he had found in Theocritus and Virgil;
sometimes under allegorical forms derived from books of chivalry and the
ceremonial of the Catholic Church. Milton, the most rigidly Calvinistic of
all the English poets in his opinions, is also the most severely classical
in his style.
It was the task of Addison to carry on the reconciling traditions of our
literature. It is his praise to have accomplished his task under
conditions far more difficult than any that his predecessors had
experienced. What they had done was to give instinctive and characteristic
expression to the floating ideas of the society about them; what Addison
and his contemporaries did was to found a public opinion by a conscious
effort of reason and persuasion. Before the Civil Wars there had been at
least no visible breach in the principle of Authority in Church and State.
At the beginning of the eighteenth century constituted authority had been
recently overthrown; one king had been beheaded, another had been
expelled; the Episcopalian form of Church Government had been violently
displaced in favour of the Presbyterian, and had been with almost equal
violence restored. Whole classes of the population had been drawn into
opposing camps during the Civil War, and still stood confronting each
other with all the harsh antagonism of sentiment inherited from that
conflict. Such a bare summary alone is sufficient to indicate the nature
of the difficulties Addison had to encounter in his efforts to harmonise
public opinion; but a more detailed examination of the state of society
after the Restoration is required to place in its full light the
extraordinary merits of the success that he achieved.
There was, to begin with, a vehement opposition between town and country.
In the country the old ideas of Feudalism, modified by circumstances, but
vigorous and deep-rooted, still prevailed. True, the military system of
land-tenure had disappeared with the Restoration, but it was not so with
the relations of life, and the habits of thought and feeling which the
system had created. The features of surviving Feudalism have been
inimitably preserved for us in the character of Sir Roger de Coverley.
Living in the patriarchal fashion, in the midst of tenants and retainers,
who looked up to him as their chief, and for whose welfare and protection
he considered himself responsible, the country gentleman valued above all
things the principle of Loyalty. To the moneyed classes in the towns he
was instinctively opposed; he regarded their interests, both social and
commercial, as contrary to his own; he looked with dislike and suspicion
on the economical principles of government and conduct on which these
classes naturally rely. Even the younger sons of county families had in
Addison's day abandoned the custom, common enough in the feudal times, of
seeking their fortune in trade. Many a Will Wimble now spent his whole
life in the country, training dogs for his neighbours, fishing their
streams, making whips for their young heirs, and even garters for their
wives and daughters.[1]

8
data/sakana_wiki.txt Normal file
View file

@ -0,0 +1,8 @@
Sakana AI Co., Ltd. is a Japanese artificial intelligence company based in Tokyo.
Overview
Sakana AI's main research fields are evolution and collective intelligence of AI. The company's name is derived from the Japanese word さかな (sakana), which means fish. This represents the idea of a school of fish coming together and forming a coherent entity from simple rules, which is an analogy for collective intelligence.[2]
The company was founded by David Ha, Llion Jones and Ren Ito. Llion Jones co-authored the famous paper "Attention Is All You Need" when he was working for Google in 2017. The company raised $30M in its seed funding round from Lux Capital and Khosla Ventures.[3] The company raised approximately $200M from companies such as Mitsubishi UFJ, SMBC, Mizuho, Itochu, KDDI, Nomura and Nvidia in its series A funding round in 2024.[4]
In January 2024, Sakana AI developed a method to build new AI models by 'breeding' multiple existing models, which it sees as a means to democratise AI development, as this process does not require large computational resources.[5] Sakana AI is also developing a model called the AI Scientist, which automates the entire process of scientific research.[6] The Nikkei estimated the company's value at 19 billion yen in 2024.[7]

620
data/self_generate_qa.py Normal file
View file

@ -0,0 +1,620 @@
import argparse
import os
import random
import re
from glob import glob
import numpy as np
import yaml
from datasets import Dataset, load_dataset
from vllm import LLM, SamplingParams
from ctx_to_lora.data.definitions import (
CLOSED_QA_INTX_TEMPLATES,
RAW_DATA_DIR,
SELF_GEN_DATA_DIR,
)
from ctx_to_lora.data.processing import (
filter_none,
get_preprocessing_fn,
load_and_process_dataset,
tokenize_ctx_text,
)
from ctx_to_lora.data.self_gen_template import (
PRE_CTX,
PROMPT_TEMPLATE,
QA_PROMPT_TEMPLATE,
SELF_GEN_SYSTEM_MSG,
SELF_QA_INTX,
)
from ctx_to_lora.model_loading import get_tokenizer
from ctx_to_lora.utils import clear_gpu
# Per-model stop strings used to detect end-of-generation markers.
# Only models listed here need explicit stop strings; other models rely on
# their tokenizer's default EOS handling.
STOP_STRINGS = {
    "google/gemma-2-2b-it": ["<eos>", "<end_of_turn>"],
}
# Maximum context length (in tokens) used to configure vLLM, per model.
MODEL_CTX_LEN = {
    "google/gemma-2-27b-it": 8192,
    "google/gemma-2-2b-it": 8192,
    "google/gemma-2-9b-it": 8192,
    # qwen 4b has 256k ctx length but using lower max lengths is faster
    "Qwen/Qwen3-4B-Instruct-2507": 2**13 + 2**12,
}
def truncate_middle_if_too_long(
    input_ids: list[int],
    max_length: int,
    max_new_tokens: int = 256,
) -> list[int]:
    """Drop the middle of a token sequence so it fits within ``max_length``.

    Keeps the head and tail of the sequence and discards the middle, while
    reserving room for ``max_new_tokens`` generated tokens.

    Args:
        input_ids: Token IDs of the full prompt.
        max_length: Maximum total length (prompt plus generation budget).
        max_new_tokens: Number of tokens reserved for generation.

    Returns:
        ``input_ids`` unchanged if it already fits within ``max_length``;
        otherwise the concatenation of the first and last
        ``max_length // 2 - max_new_tokens // 2`` tokens.
    """
    # Reserve half of the generation budget on each side of the cut so the
    # kept prompt totals max_length - max_new_tokens (up to rounding).
    half = max_length // 2 - max_new_tokens // 2
    if len(input_ids) > max_length:
        return input_ids[:half] + input_ids[-half:]
    return input_ids
def get_prompt(context: str, q: str, remove_qa_template: bool) -> str:
    """Format *context* and question *q* with the configured prompt template.

    Uses the plain ``PROMPT_TEMPLATE`` when ``remove_qa_template`` is set,
    otherwise the QA-specific ``QA_PROMPT_TEMPLATE``.
    """
    template = PROMPT_TEMPLATE if remove_qa_template else QA_PROMPT_TEMPLATE
    return template.format(context=context, question=q)
def add_closed_qa_prompt(q: str, closed_qa_prob: float = 0.1) -> str:
    """With probability *closed_qa_prob*, wrap *q* in a randomly chosen
    closed-QA instruction template; otherwise return it unchanged."""
    if random.random() > closed_qa_prob:
        return q
    template = random.choice(CLOSED_QA_INTX_TEMPLATES)
    return template.format(input=q)
def load_config(config_path: str) -> dict:
    """Parse the YAML file at *config_path* and return its contents."""
    with open(config_path) as fh:
        return yaml.safe_load(fh)
def get_dataset_configs(
ds_names: list[str] | None,
config: dict | None,
split: str | None,
) -> list[tuple[str, str]]:
assert not (ds_names and config), "Cannot provide both ds_names and config"
if ds_names:
assert split, "When using ds_names, --split must be provided"
# Validate ds_names format
for ds_name in ds_names:
if not isinstance(ds_name, str):
raise ValueError(f"Invalid dataset name: {ds_name}")
return [(ds_name, split) for ds_name in ds_names]
if config:
dataset_configs = []
# Process train datasets
train_ds_names = config.get("train_ds_names", [])
# self_gen_train_ds_names = [
# (ds_name.split("/")[-1], "train")
# for ds_name in train_ds_names
# if ds_name.startswith("self_gen/")
# ]
self_gen_train_ds_names = [
(ds_name, "train")
for ds_name in train_ds_names
if ds_name.startswith("self_gen/")
]
if not self_gen_train_ds_names:
print("No self_gen datasets found in train_ds_names")
dataset_configs.extend(self_gen_train_ds_names)
# Process validation datasets
val_ds_names = config.get("val_ds_names", [])
self_gen_val_ds_names = [
(ds_name, "validation")
for ds_name in val_ds_names
if ds_name.startswith("self_gen/")
]
if not self_gen_val_ds_names:
print("No self_gen datasets found in val_ds_names")
dataset_configs.extend(self_gen_val_ds_names)
return dataset_configs
def create_messages(
    ctxs: list[str],
    questions: list[list[str]],
    vllm_model: str,
    system_template: str,
    remove_qa_template: bool,
) -> list[list[dict]]:
    """Build single-turn chat messages, one per (context, question) pair.

    The system instructions are folded into the user message because some
    models (e.g. gemma) do not support a separate system role.
    """
    messages = []
    for ctx, q_list in zip(ctxs, questions):
        for q in q_list:
            content = (
                system_template + "\n\n\n" + get_prompt(ctx, q, remove_qa_template)
            )
            messages.append([{"role": "user", "content": content.strip()}])
    return messages
def self_generate(
    ds_name: str,
    split: str,
    args: argparse.Namespace,
    llm: LLM,
    system_template: str,
    parquet_file: str | None = None,
    do_truncate: bool = False,
) -> None:
    """Process a single dataset and generate QA pairs.

    Loads the dataset (a named raw dataset or a single parquet shard),
    tokenizes contexts, builds chat messages for every (context, question)
    pair, and hands generation + saving off to ``execute_qa_generation``
    in chunks of 1,000 contexts.

    Args:
        ds_name: Dataset name; may embed ``_temp_<x>`` /
            ``_closed_qa_prob_<x>`` overrides in the name itself.
        split: Dataset split to load.
        args: Parsed CLI arguments.
        llm: vLLM engine used for generation.
        system_template: System instructions prepended to each prompt.
        parquet_file: Optional explicit parquet shard; overrides
            ``ds_name``/``split`` when given.
        do_truncate: Middle-truncate prompts to the model context length
            (intended for evaluation data only).
    """
    shard_name = ""
    # Conflict checks for ds_name-derived overrides
    if ds_name is not None:
        # temperature & closed_qa already handled later; add new ones
        if "_temp_" in ds_name and args.temp != 0.0:
            raise ValueError(
                f"Multiple sources of truth for temperature: CLI arg --temp={args.temp} and dataset name contains temp specification."
            )
        if "_closed_qa_prob_" in ds_name and args.closed_qa_prob != 0.0:
            raise ValueError(
                f"Multiple sources of truth for closed_qa_prob: CLI arg --closed_qa_prob={args.closed_qa_prob} and dataset name contains closed_qa_prob specification."
            )
    # Base values from args
    temp = args.temp
    closed_qa_prob = args.closed_qa_prob
    # Overrides from ds_name pattern if present
    if ds_name is not None:
        if "_temp_" in ds_name:
            m = re.search(r"_temp_([\d.]+)", ds_name)
            if m:
                temp = float(m.group(1))
        if "_closed_qa_prob_" in ds_name:
            m = re.search(r"_closed_qa_prob_([\d.]+)", ds_name)
            if m:
                closed_qa_prob = float(m.group(1))
    print(f"Processing dataset: {ds_name}, split: {split}")
    print(f"Using temperature: {temp}")
    print(f"Using closed QA prompt probability: {closed_qa_prob}")
    if parquet_file:
        print(f"Loading dataset from parquet file: {parquet_file}")
        split = "train"
        # Recover the dataset name from the shard's path under RAW_DATA_DIR.
        ds_name = "/".join(parquet_file.split(RAW_DATA_DIR)[-1].split("/")[:-1])
        shard_name = "_" + os.path.basename(parquet_file).replace(".parquet", "")
        ds = load_dataset(path="parquet", data_files=[parquet_file], split="train")
        processing_fn = get_preprocessing_fn(ds_name, is_eval=False)
        ds = ds.map(processing_fn, num_proc=8)
    else:
        ds_name = ds_name.split("/")[-1]  # Extract just the dataset name
        print(f"Loading dataset: {ds_name} with split: {split}")
        kwargs = dict(ds_name=ds_name, split=split)
        ds = load_and_process_dataset(**kwargs, num_proc=8, remove_cols=False)
        print(f"Loaded dataset: {ds_name} with split: {split}")
    if args.debug:
        ds = ds.take(10)
    ds = ds.filter(filter_none, batched=False, num_proc=8)
    tk = get_tokenizer(args.vllm_model, train=True)
    # Token-ID marker sequences used later to locate spans inside the
    # tokenized prompts.
    self_qa_intx_tokens = tk(SELF_QA_INTX, add_special_tokens=False)["input_ids"][1:]
    if args.remove_qa_template:
        self_qa_intx_tokens = tk("\n\n", add_special_tokens=False)["input_ids"]
    n_self_qa_intx_tokens = len(self_qa_intx_tokens)
    # NOTE(review): pre_ctx_tokens / n_pre_ctx_tokens appear unused in this
    # function — confirm whether they are needed.
    pre_ctx_tokens = tk(PRE_CTX, add_special_tokens=False)["input_ids"]
    n_pre_ctx_tokens = len(pre_ctx_tokens)
    # First line of the system template (minus its final token) is used to
    # find where the system message starts in the tokenized prompt.
    sys_tokens = tk(system_template.split("\n")[0], add_special_tokens=False)[
        "input_ids"
    ][:-1]
    n_sys_tokens = len(sys_tokens)
    os.environ["TOKENIZERS_PARALLELISM"] = "true"
    ds = ds.map(
        tokenize_ctx_text,
        fn_kwargs={"tokenizer": tk},
        batched=True,
        batch_size=50_000,
        keep_in_memory=True,
    )
    ctxs = [sample["context"] for sample in ds]
    questions = [
        [add_closed_qa_prompt(q, closed_qa_prob) for q in sample["prompts"] if q]
        for sample in ds
    ]
    # NOTE(review): the line below overwrites the closed-QA-augmented
    # `questions` built just above (discarding add_closed_qa_prompt) and,
    # by dropping empty prompt lists, can misalign `questions` with `ctxs`
    # in the zip inside create_messages — looks like a bug; confirm intent.
    questions = [q_list for q_list in ds["prompts"] if len(q_list) > 0]
    print(f"Loaded {len(ctxs)} contexts and {len(questions)} questions")
    # Number of top logprobs to record per generated token.
    k = 16
    fpath = f"{SELF_GEN_DATA_DIR}/{args.vllm_model}_temp_{temp}_closed_qa_prob_{closed_qa_prob}/{ds_name}/{split}/ds{shard_name}"
    chunk_size = 1_000
    for chunk_idx, start in enumerate(range(0, len(ctxs), chunk_size)):
        print(f"Processing chunk {chunk_idx}")
        chunk_ctxs = ctxs[start : start + chunk_size]
        chunk_questions = questions[start : start + chunk_size]
        chunk_messages = create_messages(
            chunk_ctxs,
            chunk_questions,
            args.vllm_model,
            SELF_GEN_SYSTEM_MSG,
            args.remove_qa_template,
        )
        if do_truncate:
            # we should only do this for evaluation data
            tokenized_contents = tk(
                [m[0]["content"] for m in chunk_messages],
                add_special_tokens=False,
                return_attention_mask=False,
            )
            tokenized_contents["input_ids"] = [
                truncate_middle_if_too_long(
                    ids,
                    max_length=MODEL_CTX_LEN[args.vllm_model],
                    max_new_tokens=args.max_new_tokens,
                )
                for ids in tokenized_contents["input_ids"]
            ]
            contents = tk.batch_decode(
                tokenized_contents["input_ids"], skip_special_tokens=True
            )
            for c, m in zip(contents, chunk_messages):
                m[0]["content"] = c
        print(f"Generating from {len(chunk_messages)} contexts")
        # Clear GPU memory before processing the next chunk
        clear_gpu()
        execute_qa_generation(
            fpath + f"_{chunk_idx:04d}",
            args,
            llm,
            temp,
            tk,
            self_qa_intx_tokens,
            n_self_qa_intx_tokens,
            sys_tokens,
            n_sys_tokens,
            chunk_ctxs,
            ds[start : start + chunk_size]["ctx_ids"],
            chunk_questions,
            chunk_messages,
            k,
        )
def execute_qa_generation(
    fpath,
    args,
    llm,
    temp,
    tk,
    self_qa_intx_tokens,
    n_self_qa_intx_tokens,
    sys_tokens,
    n_sys_tokens,
    ctxs,
    ctx_ids,
    questions,
    messages,
    k,
):
    """Generate answers for one chunk of messages and save them as parquet.

    For each completion, assembles the input token ids (system prefix +
    question + response), records the response's [start, end) span and the
    top-``k`` per-token logprobs, grouped by context.  Completions whose
    generation did not end with finish_reason == "stop" are skipped.

    Args:
        fpath: Output path prefix (".parquet" is appended; "_debug" when
            args.debug is set).
        args: Parsed CLI arguments (max_new_tokens, debug).
        llm: vLLM engine used for chat generation.
        temp: Sampling temperature.
        tk: Tokenizer, used only for debug decoding.
        self_qa_intx_tokens / n_self_qa_intx_tokens: Marker token ids (and
            their count) locating the start of the user question.
        sys_tokens / n_sys_tokens: Marker token ids (and their count)
            locating the start of the system message.
        ctxs: Contexts for this chunk, aligned with ``questions``.
        ctx_ids: Tokenized context ids, aligned with ``ctxs``.
        questions: Per-context question lists; generation order of
            ``messages`` must match this nesting, flattened.
        messages: Chat messages to generate from.
        k: Number of top logprobs recorded per generated token.
    """
    completions = llm.chat(
        messages,
        sampling_params=SamplingParams(
            max_tokens=args.max_new_tokens,
            logprobs=k,
            temperature=temp,
            seed=42,
            spaces_between_special_tokens=False,
            skip_special_tokens=False,
            include_stop_str_in_output=True,
        ),
    )
    # Per-context accumulator; one entry per unique context in this chunk.
    self_gen_data = {
        ctx: {
            "ctx_ids": ctx_ids,
            "input_ids": [],
            "response_start_end": [],
            "logprobs_vals": [],
            "logprobs_indices": [],
        }
        for ctx, ctx_ids in zip(ctxs, ctx_ids)
    }
    # c is the flat completion index of the current context's first question.
    c = 0
    n_skips = 0
    # Located once and reused — assumes the system prefix sits at the same
    # offset in every prompt.
    sys_start = None
    for ctx, q_list in zip(ctxs, questions):
        # self_gen_data[ctx]["ctx_ids"] = ctx_ids
        for i, _ in enumerate(q_list):
            # response = completions[c + i].outputs[0].text
            reason = completions[c + i].outputs[0].finish_reason
            if reason != "stop":
                # print(f"idx: {c + i}")
                print(f"finish_reason: {completions[c + i].outputs[0].finish_reason}")
                print(f"Skipping due to finish_reason={reason} != 'stop'")
                n_skips += 1
                continue
            # includes the logprob before the first response token
            # but excludes the logprob from eos token
            logp = completions[c + i].outputs[0].logprobs
            # len = num response tokens
            n_response_tokens = len(completions[c + i].outputs[0].token_ids)
            logp_indices = np.empty((n_response_tokens, k), dtype=np.int32)
            # float-16 is better for this range
            logp_vals = np.empty((n_response_tokens, k), dtype=np.float16)
            assert len(logp) == n_response_tokens, (
                f"Expected {n_response_tokens} logp entries, got {len(logp)}"
            )
            # Densify vLLM's per-token {token_id: Logprob} dicts into (n, k)
            # index/value arrays.
            for li, info_d in enumerate(logp):
                for j, (idx, tok_info) in enumerate(info_d.items()):
                    logp_indices[li, j] = idx
                    logp_vals[li, j] = tok_info.logprob
            prompt_ids = completions[c + i].prompt_token_ids  # 1d list
            # token_ids only includes generated tokens, not the prompt
            response_token_ids = completions[c + i].outputs[0].token_ids  # 1d list
            all_ids = prompt_ids + response_token_ids
            res_start = len(prompt_ids)
            res_end = res_start + n_response_tokens
            if sys_start is None:
                for ii in range(len(prompt_ids) - n_sys_tokens):
                    if prompt_ids[ii : ii + n_sys_tokens] == sys_tokens:
                        # found the start of the system message
                        sys_start = ii
                        break
            q_start = None
            # Scan backwards so the LAST occurrence of the question marker
            # wins (the context itself might contain the marker text).
            for ii in range(
                len(prompt_ids) - n_self_qa_intx_tokens,
                -1,
                -1,
            ):
                if prompt_ids[ii : ii + n_self_qa_intx_tokens] == self_qa_intx_tokens:
                    # found the start of the user input
                    q_start = ii + n_self_qa_intx_tokens
                    break
            # bos + question + eos + start model turn + response + eos
            input_ids = all_ids[:sys_start] + all_ids[q_start:res_end]
            # relative to the input_ids
            res_start = res_start - q_start + sys_start
            res_end = res_start + n_response_tokens
            # arrays will be saved as nested lists of numbers
            self_gen_data[ctx]["input_ids"].append(input_ids)
            # assume single-turn chat
            self_gen_data[ctx]["response_start_end"].append((res_start, res_end))
            self_gen_data[ctx]["logprobs_vals"].append(logp_vals)
            self_gen_data[ctx]["logprobs_indices"].append(logp_indices)
        # NOTE(review): if q_list is empty, `i` still holds its value from
        # the previous context's loop (or is undefined on the very first
        # iteration), so this advance looks wrong for empty question lists
        # — confirm questions never contains an empty list here.
        c += i + 1
    # NOTE(review): message says "missing stop strings", but skips are
    # counted for any finish_reason != "stop" (e.g. length truncation).
    print(f"Skipped {n_skips} responses due to missing stop strings")
    samples = [
        {
            # "context": ctx,
            # "prompts": q_list,
            # "responses": self_gen_data[ctx]["responses"],
            "ctx_ids": self_gen_data[ctx]["ctx_ids"],
            "input_ids": self_gen_data[ctx]["input_ids"],
            "response_start_end": self_gen_data[ctx]["response_start_end"],
            # "prompt_start_end": self_gen_data[ctx]["prompt_start_end"],
            "logprobs_vals": self_gen_data[ctx]["logprobs_vals"],
            "logprobs_indices": self_gen_data[ctx]["logprobs_indices"],
        }
        for ctx, q_list in zip(ctxs, questions)
    ]
    if args.debug:
        for sample in samples:
            # print(f"context={tk.decode(sample['ctx_ids'])}")
            print(f"QA={[tk.decode(ids) for ids in sample['input_ids']]}")
            for input_ids, (start, end) in zip(
                sample["input_ids"], sample["response_start_end"]
            ):
                print(f"start={start}, end={end}")
                print(f"response={tk.decode(input_ids[start:end])}")
            print(f"logprobs_vals={[x.shape for x in sample['logprobs_vals']]}")
            print(f"logprobs_indices={[x.shape for x in sample['logprobs_indices']]}")
            for indices in sample["logprobs_indices"]:
                print(f"logprobs_indices={indices[-1]}")
            print("=" * 80)
    print(f"Generated {len(samples)} samples")
    # random.shuffle(samples)
    # Save results
    # df = pd.DataFrame(samples)
    # ds_out = Dataset.from_pandas(df)
    ds_out = Dataset.from_list(samples)
    # fpath = f"{SELF_GEN_DATA_DIR}/{args.vllm_model}_temp_{temp}_closed_qa_prob_{closed_qa_prob}/{ds_name}/{split}/ds{shard_name}"
    if args.debug:
        fpath += "_debug"
    os.makedirs(os.path.dirname(fpath), exist_ok=True)
    fpath = f"{fpath}.parquet"
    ds_out.to_parquet(fpath)
    print(f"Saved to {fpath}")
    # Cleanup
    del samples, ds_out, completions, messages, ctxs, questions
    clear_gpu()
def parse_args(argv: "list[str] | None" = None) -> argparse.Namespace:
    """Parse CLI arguments for VLLM-based QA-pair generation.

    Exactly one input source must be given: ``--config``, ``--ds_names``,
    or ``--glob_pattern`` (enforced via a mutually exclusive group).

    Args:
        argv: Optional argument list (for testing). Defaults to
            ``sys.argv[1:]``, preserving the original call-site behavior.

    Returns:
        The parsed ``argparse.Namespace``.

    Raises:
        SystemExit: On invalid arguments, including ``--ds_names``
            without ``--split`` (argparse usage error).
    """
    parser = argparse.ArgumentParser(description="Generate QA pairs using VLLM")
    parser.add_argument(
        "--vllm_model",
        type=str,
        required=True,
        help="VLLM model name (e.g., google/gemma-2-2b-it)",
    )
    parser.add_argument(
        "--debug",
        action="store_true",
        help="Enable debug mode (process only 10 samples)",
    )
    # Either config file OR ds_names OR glob pattern — exactly one.
    group = parser.add_mutually_exclusive_group(required=True)
    group.add_argument(
        "--config",
        type=str,
        help="Path to YAML config file with train_ds_names/val_ds_names",
    )
    group.add_argument(
        "--ds_names",
        type=str,
        nargs="+",
        help="List of dataset names/shard patterns",
    )
    group.add_argument(
        "--glob_pattern",
        type=str,
        help="Glob pattern to match dataset names (e.g., 'data/raw_datasets/fw_qa_3/*')",
    )
    parser.add_argument(
        "--split",
        type=str,
        help="Dataset split to use when using --ds_names (required with --ds_names)",
    )
    parser.add_argument(
        "--temp",
        type=float,
        default=0.0,
        help="Temperature for sampling (default: 0.0)",
    )
    parser.add_argument(
        "--closed_qa_prob",
        type=float,
        default=0.0,
        help="Probability of using closed QA prompt template (default: 0.0)",
    )
    parser.add_argument(
        "--do_truncate",
        action="store_true",
        help="Truncate contexts to fit model context length",
    )
    parser.add_argument(
        "--remove_qa_template",
        action="store_true",
        help="Remove QA template formatting from prompts",
    )
    parser.add_argument(
        "--max_new_tokens",
        type=int,
        default=256,
        help="Maximum number of new tokens to generate (default: 256)",
    )
    args = parser.parse_args(argv)
    # Enforce the dependency documented in --split's help text here, so the
    # CLI reports a proper usage error instead of a late ValueError downstream.
    if args.ds_names and not args.split:
        parser.error("--split is required when using --ds_names")
    return args
if __name__ == "__main__":
    args = parse_args()

    # --split must accompany --ds_names; fail fast before loading the model.
    if args.ds_names and not args.split:
        raise ValueError("--split is required when using --ds_names")

    vllm_model = args.vllm_model
    print(f"Using model: {vllm_model}")

    # Engine configuration. Batch limits are kept conservative because
    # logprob extraction is memory-hungry.
    llm_kwargs = {
        "model": vllm_model,
        "dtype": "bfloat16",
        "enable_prefix_caching": True,
        "enable_chunked_prefill": True,
        "max_model_len": MODEL_CTX_LEN.get(vllm_model),
        "max_num_batched_tokens": 16384,
        "max_num_seqs": 32,  # avoid oom when getting logprobs
    }
    print(f"{llm_kwargs=}")
    llm = LLM(**llm_kwargs)

    # Resolve dataset configs from the YAML config when one was given.
    if args.config:
        config = load_config(args.config)
    else:
        config = None

    if args.ds_names or args.config:
        # Named datasets (from CLI or config): process each (name, split) pair.
        for ds_name, split in get_dataset_configs(
            ds_names=args.ds_names,
            config=config,
            split=args.split,
        ):
            print(f"Processing dataset: {ds_name}, split: {split}")
            self_generate(
                ds_name, split, args, llm, SELF_GEN_SYSTEM_MSG, None, args.do_truncate
            )
    else:
        # Fallback: expand a glob pattern into parquet files to process.
        assert args.glob_pattern, (
            "glob_pattern must be provided if no ds_names or config"
        )
        for file in glob(args.glob_pattern):
            print(f"Processing file: {file}")
            self_generate(
                ds_name=None,
                parquet_file=file,
                split=args.split,
                args=args,
                llm=llm,
                system_template=SELF_GEN_SYSTEM_MSG,
                do_truncate=args.do_truncate,
            )