147 lines
No EOL
4 KiB
Python
147 lines
No EOL
4 KiB
Python
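# CRITICAL: Import unsloth before the other third-party ML packages (transformers, trl, ...) so all optimizations are applied (Unsloth recommendation). Standard-library imports such as os and warnings may safely come first.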
|
|
import os
|
|
import warnings
|
|
from unsloth import FastLanguageModel, is_bfloat16_supported
|
|
import pandas as pd
|
|
import torch
|
|
from datasets import Dataset
|
|
from transformers import TrainingArguments
|
|
from trl.trainer.sft_trainer import SFTTrainer
|
|
|
|
|
|
# Silence every Python warning for the remainder of the run.
warnings.filterwarnings("ignore")

# ==========================================
# 1. CONFIGURATION
# ==========================================

# Input and output locations -- edit these before running.
DATA_PATH = "YOUR_PARQUET_FILE_PATH"
OUTPUT_DIR = "./model"

# Training hyperparameters; tune them to the available hardware.
BATCH_SIZE = 2
GRADIENT_ACCUMULATION_STEPS = 8
LEARNING_RATE = 2e-4
MAX_LENGTH = 4096
TRAIN_EPOCHS = 1

# Report whether PyTorch can see a CUDA device.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"Using device: {device}")
# ==========================================
# 2. LOAD DATA AND FILTER
# ==========================================

print("Loading data...")
df = pd.read_parquet(DATA_PATH)

# Fail early if the columns used to build the prompt are absent.
required_cols = ["question", "answer"]
missing_cols = [c for c in required_cols if c not in df.columns]
if missing_cols:
    raise ValueError(f"Missing columns in Parquet file: {missing_cols}")

# Filter out rows with a null question or answer: they would otherwise be
# interpolated into the training text as the literal strings "None"/"nan".
# The index is reset so the frame keeps a plain RangeIndex after filtering.
rows_before = len(df)
df = df.dropna(subset=required_cols).reset_index(drop=True)
rows_dropped = rows_before - len(df)
if rows_dropped:
    print(f"Dropped {rows_dropped} rows with a missing question or answer.")
if df.empty:
    raise ValueError("No usable rows left after filtering out missing questions/answers.")

print(f"Loaded {len(df)} samples.")
# ==========================================
# 3. PREPARE DATASETS
# ==========================================


def format_example(example, *, system_prompt="You are a helpful assistant."):
    """Render one question/answer record as a ChatML-style training string.

    Args:
        example: Mapping with "question" and "answer" entries; both are
            interpolated into the prompt via f-string formatting.
        system_prompt: Text placed in the system turn. The default matches
            the prompt this script has always used.

    Returns:
        A dict with a single "text" key holding the system, user and
        assistant turns, each wrapped in <|im_start|> / <|im_end|> markers.

    Raises:
        KeyError: If ``example`` lacks "question" or "answer".
    """
    text = (
        f"<|im_start|>system\n{system_prompt}<|im_end|>\n"
        f"<|im_start|>user\n{example['question']}<|im_end|>\n"
        f"<|im_start|>assistant\n{example['answer']}<|im_end|>"
    )
    return {"text": text}
# Wrap the DataFrame in a HuggingFace Dataset and render every row through
# format_example, discarding the raw question/answer columns afterwards.
dataset = Dataset.from_pandas(df).map(
    format_example,
    remove_columns=["question", "answer"],
)

print("Dataset prepared.")
# ==========================================
# 4. LOAD MODEL
# ==========================================

print("Loading Model...")
base_model_options = {
    "model_name": "example/example",  # <-- Your model goes here
    "max_seq_length": MAX_LENGTH,
    "dtype": None,
    # Set to True if your card doesn't have enough VRAM for training in FP16/BF16
    "load_in_4bit": False,
}
model, tokenizer = FastLanguageModel.from_pretrained(**base_model_options)

# Wrap the base model with LoRA adapters (Unsloth default settings).
print("Applying LoRA...")
lora_options = {
    "r": 16,
    "lora_alpha": 16,
    "lora_dropout": 0,
    "bias": "none",
    "use_gradient_checkpointing": "unsloth",
    "random_state": 3407,
}
model = FastLanguageModel.get_peft_model(model, **lora_options)

print("Model loaded successfully.")
# ==========================================
# 5. TRAINING SETUP
# ==========================================

print("Setting up Trainer...")

# Configure Tokenizer
# Only fall back to EOS as the pad token when the tokenizer has none. An
# existing pad token is kept: if pad == EOS, a collator that masks pad
# positions in the labels also masks the real end-of-turn token, so the model
# may not learn when to stop generating.
if tokenizer.pad_token_id is None:
    tokenizer.pad_token_id = tokenizer.eos_token_id
tokenizer.padding_side = "right"

# Probe mixed-precision support once; exactly one of fp16/bf16 is enabled.
use_bf16 = is_bfloat16_supported()

# NOTE(review): `tokenizer`, `dataset_text_field` and `max_seq_length` are
# passed directly to SFTTrainer as in the original script; confirm the
# installed TRL/Unsloth versions still accept them there.
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=dataset,
    dataset_text_field="text",
    max_seq_length=MAX_LENGTH,
    args=TrainingArguments(
        per_device_train_batch_size=BATCH_SIZE,
        gradient_accumulation_steps=GRADIENT_ACCUMULATION_STEPS,
        warmup_steps=5,
        max_steps=-1,  # no step cap; run length is set by num_train_epochs
        num_train_epochs=TRAIN_EPOCHS,
        learning_rate=LEARNING_RATE,
        fp16=not use_bf16,
        bf16=use_bf16,
        logging_steps=10,
        output_dir=OUTPUT_DIR,
        save_strategy="no",  # no intermediate checkpoints; the final save happens after training
        optim="adamw_8bit",
        weight_decay=0.001,
        report_to="none",
        lr_scheduler_type="linear",
        save_only_model=True,
        load_best_model_at_end=False,
    ),
)
# ==========================================
# 6. TRAIN
# ==========================================

print("Starting Training...")
trainer.train()

# ==========================================
# 7. SAVE FINETUNED MODEL
# ==========================================

print("Saving Finetuned Model...")
# Persist the model first, then the tokenizer, into the same directory.
# NOTE(review): `model` is the LoRA-wrapped model, so this presumably writes
# adapter weights rather than a merged checkpoint -- confirm.
for artifact in (model, tokenizer):
    artifact.save_pretrained(OUTPUT_DIR)

print(f"Training complete! Finetuned model saved to {OUTPUT_DIR}")