doc-to-lora/scripts/main_exp/0-download_data.py
2026-02-27 03:47:04 +00:00

17 lines
785 B
Python

from huggingface_hub import snapshot_download
if __name__ == "__main__":
    # Pull the self-generated QA dataset from the Hugging Face Hub into the
    # local raw-datasets folder.
    #
    # No `allow_patterns` is passed below, so the download is not restricted
    # to one model's data. To fetch the data for a single model only, pass an
    # `allow_patterns` glob. According to the repository listing at
    # https://huggingface.co/datasets/SakanaAI/self_gen_qa_d2l/tree/main
    # the usable prefixes are:
    #   - `Qwen`      -> data for `Qwen/Qwen3-4B-Instruct-2507`
    #   - `google`    -> data for `google/gemma-2-2b-it`
    #   - `mistralai` -> data for `mistralai/Mistral-7B-Instruct-v0.2`
    target_dir = "./data/raw_datasets/self_gen/"
    snapshot_download(
        repo_id="SakanaAI/self_gen_qa_d2l",
        local_dir=target_dir,
        repo_type="dataset",
        # Example (disabled): only the `google/gemma-2-2b-it` data.
        # allow_patterns="google/*",
    )