From da2c8e636c4dcbf14a522bc019302331d44fef45 Mon Sep 17 00:00:00 2001 From: Oracle Date: Tue, 2 Jun 2026 15:45:59 +0200 Subject: [PATCH] Initial commit --- README.md | 176 ++++++++++++++++++++++++++++++ finetune.py | 147 +++++++++++++++++++++++++ merge.py | 89 +++++++++++++++ requirements.txt | 7 ++ run-pipeline.sh | 23 ++++ scripts/finetune.sh | 3 + scripts/generate-data.sh | 3 + scripts/merge-and-convert.sh | 6 + scripts/run-model.sh | 2 + setup.sh | 94 ++++++++++++++++ synthetic-data.py | 205 +++++++++++++++++++++++++++++++++++ 11 files changed, 755 insertions(+) create mode 100644 README.md create mode 100644 finetune.py create mode 100644 merge.py create mode 100644 requirements.txt create mode 100755 run-pipeline.sh create mode 100755 scripts/finetune.sh create mode 100755 scripts/generate-data.sh create mode 100755 scripts/merge-and-convert.sh create mode 100755 scripts/run-model.sh create mode 100755 setup.sh create mode 100644 synthetic-data.py diff --git a/README.md b/README.md new file mode 100644 index 0000000..8c5109c --- /dev/null +++ b/README.md @@ -0,0 +1,176 @@ +# Unsloth Fine-Tune Template + +> **Linux only** — This template is designed for Linux systems with NVIDIA GPU (CUDA), AMD GPU (ROCm), or Vulkan support. + +A template for fine-tuning LLMs using [Unsloth](https://github.com/unslothai/unsloth) and converting to GGUF format with [llama.cpp](https://github.com/ggerganov/llama.cpp). + +## Prerequisites + +- Linux OS +- Python 3.10+ +- NVIDIA GPU (CUDA) or AMD GPU (ROCm) or Vulkan-compatible GPU +- [cmake](https://cmake.org/) +- [git](https://git-scm.com/) + +## Quick Start + +```bash +# 1. Setup (clones llama.cpp, builds it, installs dependencies) +bash setup.sh + +# 2. Configure scripts (see variables below) + +# 3. 
Run full pipeline +bash run-pipeline.sh +``` + +## Workflow + +``` +scripts/generate-data.sh → Generate synthetic training data (optional) +scripts/finetune.sh → Fine-tune model with LoRA adapters +scripts/merge-and-convert.sh → Merge LoRA into base model and convert to GGUF +scripts/run-model.sh → Run the converted GGUF model +run-pipeline.sh → Run finetune → merge/convert → run in sequence +``` + +## Setup + +`setup.sh` will: +1. Create a Python virtual environment and install Python dependencies +2. Clone [llama.cpp](https://github.com/ggml-org/llama.cpp) +3. Build llama.cpp with your selected GPU backend +4. Install llama-cpp-python bindings with matching backend flags + +### Backend Selection + +| Choice | Backend | Requirements | +|---|---|---| +| 1 | CUDA (NVIDIA) | NVIDIA drivers, CUDA toolkit | +| 2 | ROCm (AMD) | AMD drivers, HIP toolkit | +| 3 | Vulkan | Vulkan drivers | +| 4 | CPU only | None | + +## Scripts + +### 1. scripts/generate-data.sh + +Generates synthetic training data using a GGUF model via llama.cpp. Run this if you need to create or extend a training dataset. + +**Edit `synthetic-data.py`:** + +| Variable | Description | Example | +|---|---|---| +| `GGUF_MODEL_PATH` | Path to the GGUF model used for generation | `./path/to/model.gguf` | +| `INPUT_PARQUET_PATH` | Path to existing training data to extend | `./data/train.parquet` | +| `OUTPUT_PARQUET_PATH` | Path to save the combined dataset | `./data/output.parquet` | +| `NEW_ROWS_COUNT` | Number of synthetic records to generate | `100` | + +```bash +bash scripts/generate-data.sh +``` + +### 2. scripts/finetune.sh + +Fine-tunes a model using Unsloth with LoRA adapters. Saves LoRA weights to `./model/`. 
+ +**Edit `finetune.py`:** + +| Variable | Description | Example | +|---|---|---| +| `DATA_PATH` | Path to training Parquet file | `./data/output.parquet` | +| `OUTPUT_DIR` | Directory to save LoRA adapters | `./model` | +| `BATCH_SIZE` | Per-device batch size | `2` | +| `GRADIENT_ACCUMULATION_STEPS` | Gradient accumulation steps | `8` | +| `LEARNING_RATE` | Training learning rate | `2e-4` | +| `MAX_LENGTH` | Maximum sequence length | `4096` | +| `TRAIN_EPOCHS` | Number of training epochs | `1` | +| `model_name` (line 74) | Base model to fine-tune | `"unsloth/Llama-3.2-3B-Instruct"` | + +```bash +bash scripts/finetune.sh +``` + +### 3. scripts/merge-and-convert.sh + +Merges LoRA adapters into the base model, saves the merged model, then converts to GGUF format using llama.cpp. + +**Edit `merge.py`:** + +| Variable | Description | Example | +|---|---|---| +| `BASE_MODEL_PATH` | Path to the base model | `""` (empty to load from HuggingFace) | +| `LORA_DIR` | Path to LoRA adapters | `./model` | +| `MERGED_MODEL_PATH` | Output directory for merged model | `./merged_model` | + +```bash +bash scripts/merge-and-convert.sh +``` + +### 4. scripts/run-model.sh + +Runs the converted GGUF model using llama.cpp's CLI interface for inference. 
+ +**Edit `scripts/run-model.sh`:** + +| Variable | Description | Example | +|---|---|---| +| Model path | Path to the GGUF file | `./merged_model/Merged_Model.gguf` | + +```bash +bash scripts/run-model.sh +``` + +## Output Structure + +``` +./model/ ← LoRA adapters (from finetune.sh) +./merged_model/ ← Merged HF model + GGUF file (from merge-and-convert.sh) +llama.cpp/ ← llama.cpp repository (created by setup.sh) +scripts/ ← Individual pipeline step scripts +setup.sh ← Setup script (venv + llama.cpp build) +run-pipeline.sh ← Run full pipeline (finetune → merge/convert → run) +``` + +## Troubleshooting + +### llama.cpp build fails + +See the official build guide: +https://github.com/ggml-org/llama.cpp/blob/master/docs/build.md + +Common issues: +- **CUDA**: Ensure NVIDIA drivers and CUDA toolkit are installed +- **ROCm**: Ensure AMD drivers and HIP toolkit are installed +- **Vulkan**: Ensure Vulkan drivers and SDK are installed +- **cmake**: Install via `sudo apt install cmake` (Debian/Ubuntu) + +### Out of memory during training + +- Reduce `BATCH_SIZE` in `finetune.py` +- Increase `GRADIENT_ACCUMULATION_STEPS` to compensate +- Reduce `MAX_LENGTH` to fit shorter sequences +- Set `load_in_4bit=True` in `finetune.py` (line 77) + +### llama-cpp-python install fails + +- Ensure llama.cpp is built successfully first +- Try CPU-only install first to verify: `pip install llama-cpp-python` +- Check [llama-cpp-python docs](https://llama-cpp-python.readthedocs.io/en/latest/) for other backends + +## Project Structure + +``` +├── finetune.py ← Training script +├── merge.py ← Merge LoRA into base model +├── synthetic-data.py ← Generate synthetic training data +├── requirements.txt ← Python dependencies +├── setup.sh ← One-time setup +├── run-pipeline.sh ← Run full pipeline +├── scripts/ +│ ├── generate-data.sh +│ ├── finetune.sh +│ ├── merge-and-convert.sh +│ └── run-model.sh +└── README.md +``` diff --git a/finetune.py b/finetune.py new file mode 100644 index 0000000..2f4753c --- /dev/null 
+++ b/finetune.py
@@ -0,0 +1,147 @@
+# CRITICAL: Import unsloth BEFORE any other packages
+import os
+import warnings
+from unsloth import FastLanguageModel, is_bfloat16_supported
+import pandas as pd
+import torch
+from datasets import Dataset
+from transformers import TrainingArguments
+from trl.trainer.sft_trainer import SFTTrainer
+
+
+warnings.filterwarnings("ignore")
+
+# ==========================================
+# 1. CONFIGURATION
+# ==========================================
+
+# Update these paths (DATA_PATH is a placeholder and must point to a real Parquet file)
+DATA_PATH = "YOUR_PARQUET_FILE_PATH"
+OUTPUT_DIR = "./model"
+# Training params, change these to fit your hardware
+BATCH_SIZE = 2
+GRADIENT_ACCUMULATION_STEPS = 8
+LEARNING_RATE = 2e-4
+MAX_LENGTH = 4096
+TRAIN_EPOCHS = 1
+
+# Check device
+device = "cuda" if torch.cuda.is_available() else "cpu"
+print(f"Using device: {device}")
+
+# ==========================================
+# 2. LOAD DATA AND FILTER
+# ==========================================
+
+print("Loading data...")
+df = pd.read_parquet(DATA_PATH)
+
+# Fail early, with the missing names, if the file lacks a required column
+required_cols = ["question", "answer", "label"]
+missing_cols = [c for c in required_cols if c not in df.columns]
+if missing_cols:
+    raise ValueError(f"Missing columns in Parquet file: {missing_cols}")
+
+
+print(f"Loaded {len(df)} samples.")
+
+# ==========================================
+# 3. PREPARE DATASETS
+# ==========================================
+
+
+def format_example(example):
+    """
+    Build one ChatML-style training prompt from a row's "question" and
+    "answer" fields and return it as {"text": prompt}.
+    """
+    text = f"<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n{example['question']}<|im_end|>\n<|im_start|>assistant\n{example['answer']}<|im_end|>"
+    return {"text": text}
+
+
+# Convert pandas to HuggingFace Dataset
+dataset = Dataset.from_pandas(df)
+dataset = dataset.map(format_example, remove_columns=["question", "answer"])
+
+print("Dataset prepared.")
+
+# ==========================================
+# 4. 
LOAD MODEL
+# ==========================================
+
+print("Loading Model...")
+model, tokenizer = FastLanguageModel.from_pretrained(
+    model_name="example/example",  # <-- Your model goes here
+    max_seq_length=MAX_LENGTH,
+    dtype=None,
+    load_in_4bit=False,  # Set to True if your card doesn't have enough VRAM for training in FP16/BF16
+)
+
+# Apply LoRA Config (Unsloth default)
+print("Applying LoRA...")
+model = FastLanguageModel.get_peft_model(
+    model,
+    r=16,
+    lora_alpha=16,
+    lora_dropout=0,
+    bias="none",
+    use_gradient_checkpointing="unsloth",
+    random_state=3407,
+)
+
+print("Model loaded successfully.")
+
+# ==========================================
+# 5. TRAINING SETUP
+# ==========================================
+
+print("Setting up Trainer...")
+
+# Configure Tokenizer (keep an existing pad token; EOS is only a fallback)
+if tokenizer.pad_token_id is None:
+    tokenizer.pad_token_id = tokenizer.eos_token_id
+tokenizer.padding_side = "right"
+
+trainer = SFTTrainer(
+    model=model,
+    tokenizer=tokenizer,
+    train_dataset=dataset,
+    dataset_text_field="text",
+    max_seq_length=MAX_LENGTH,
+    args=TrainingArguments(
+        per_device_train_batch_size=BATCH_SIZE,
+        gradient_accumulation_steps=GRADIENT_ACCUMULATION_STEPS,
+        warmup_steps=5,
+        max_steps=-1,
+        num_train_epochs=TRAIN_EPOCHS,
+        learning_rate=LEARNING_RATE,
+        fp16=not is_bfloat16_supported(),
+        bf16=is_bfloat16_supported(),
+        logging_steps=10,
+        output_dir=OUTPUT_DIR,
+        save_strategy="no",
+        optim="adamw_8bit",
+        weight_decay=0.001,
+        report_to="none",
+        lr_scheduler_type="linear",
+        save_only_model=True,
+        load_best_model_at_end=False,
+    ),
+)
+
+# ==========================================
+# 6. TRAIN
+# ==========================================
+
+print("Starting Training...")
+trainer.train()
+
+# ==========================================
+# 7. SAVE FINETUNED MODEL
+# ==========================================
+
+# Save the LoRA adapters and the tokenizer side by side in OUTPUT_DIR
+print("Saving Finetuned Model...")
+model.save_pretrained(OUTPUT_DIR)
+tokenizer.save_pretrained(OUTPUT_DIR)
+
+print(f"Training complete! 
Finetuned model saved to {OUTPUT_DIR}")
\ No newline at end of file
diff --git a/merge.py b/merge.py
new file mode 100644
index 0000000..288eb9d
--- /dev/null
+++ b/merge.py
@@ -0,0 +1,99 @@
+import os
+import torch
+import unsloth
+from unsloth import FastLanguageModel
+from unsloth.chat_templates import get_chat_template
+from peft import AutoPeftModelForCausalLM, PeftModel
+from transformers import AutoModelForCausalLM, AutoTokenizer
+
+print("="*60)
+print("UNSLOTH LORA")
+print("="*60)
+
+# ==========================================
+# 1. CONFIGURATION
+# ==========================================
+
+# Leave BASE_MODEL_PATH empty to let PEFT resolve the base model recorded in
+# the adapter config (e.g. a Hugging Face model id).
+BASE_MODEL_PATH = ""
+LORA_DIR = "./model"
+MERGED_MODEL_PATH = "./merged_model"
+
+device = "cuda" if torch.cuda.is_available() else "cpu"
+print(f"Using device: {device}")
+
+# ==========================================
+# 2. LOAD BASE MODEL (only when BASE_MODEL_PATH is set)
+# ==========================================
+
+base_model = None
+if BASE_MODEL_PATH:
+    print("Loading Base Model from local path...")
+    base_model = AutoModelForCausalLM.from_pretrained(
+        BASE_MODEL_PATH,
+        device_map="auto",
+        torch_dtype=torch.float16,  # Adjust if you only did 4bit finetuning
+        trust_remote_code=True,
+    )
+    print("✓ Base model loaded successfully")
+
+# ==========================================
+# 3. LOAD LORA ADAPTERS USING PEFT
+# ==========================================
+
+print("Loading LoRA adapters using PEFT...")
+if base_model is not None:
+    # Attach the adapters to the base model loaded above instead of loading
+    # a second copy of the weights.
+    lora_model = PeftModel.from_pretrained(base_model, LORA_DIR)
+else:
+    # No explicit base path: PEFT loads the base model named in the adapter
+    # config together with the adapters.
+    lora_model = AutoPeftModelForCausalLM.from_pretrained(
+        LORA_DIR,
+        torch_dtype=torch.float16,  # Adjust if you only did 4bit finetuning
+        device_map="auto",
+    )
+
+print("✓ LoRA adapters loaded successfully")
+
+# ==========================================
+# 4. MERGE LORA INTO BASE MODEL
+# ==========================================
+
+print("Merging LoRA adapters into base model...")
+merged_model = lora_model.merge_and_unload()
+
+print("✓ Adapters merged")
+
+# ==========================================
+# 5. 
CONFIGURE TOKENIZER
+# ==========================================
+
+print("Configuring tokenizer")
+# With no base path set, use the tokenizer finetune.py saved alongside the adapters.
+tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_PATH or LORA_DIR, trust_remote_code=True)
+if tokenizer.pad_token is None:  # keep an existing pad token; EOS is only a fallback
+    tokenizer.pad_token = tokenizer.eos_token
+tokenizer.padding_side = "right"
+
+print("✓ Tokenizer configured")
+
+# ==========================================
+# 6. SAVE MERGED MODEL (input for the GGUF conversion)
+# ==========================================
+
+print("Saving merged model...")
+merged_model.save_pretrained(MERGED_MODEL_PATH, safe_serialization=True)
+tokenizer.save_pretrained(MERGED_MODEL_PATH)
+
+print(f"✓ Merged model saved to {MERGED_MODEL_PATH}")
+
+# ==========================================
+# 7. VERIFY OUTPUT
+# ==========================================
+
+model_files = os.listdir(MERGED_MODEL_PATH)
+print(f"\n✓ Model files created: {len(model_files)} files")
+print(f" Files: {model_files}")
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..054066e
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,7 @@
+unsloth
+peft
+torch
+transformers
+datasets
+trl
+pandas
diff --git a/run-pipeline.sh b/run-pipeline.sh
new file mode 100755
index 0000000..b898e2e
--- /dev/null
+++ b/run-pipeline.sh
@@ -0,0 +1,23 @@
+#!/bin/bash
+set -e
+
+echo "============================================"
+echo " Unsloth Fine-Tune Pipeline"
+echo "============================================"
+
+echo ""
+echo "Step 1/3: Fine-tuning model..."
+bash scripts/finetune.sh
+
+echo ""
+echo "Step 2/3: Merging and converting to GGUF..."
+bash scripts/merge-and-convert.sh
+
+echo ""
+echo "Step 3/3: Running model..."
+bash scripts/run-model.sh
+
+echo ""
+echo "============================================"
+echo " Pipeline complete!" 
+echo "============================================"
diff --git a/scripts/finetune.sh b/scripts/finetune.sh
new file mode 100755
index 0000000..fa8706c
--- /dev/null
+++ b/scripts/finetune.sh
@@ -0,0 +1,3 @@
+#!/bin/bash
+source venv/bin/activate
+python finetune.py
diff --git a/scripts/generate-data.sh b/scripts/generate-data.sh
new file mode 100755
index 0000000..0f73d0c
--- /dev/null
+++ b/scripts/generate-data.sh
@@ -0,0 +1,3 @@
+#!/bin/bash
+source venv/bin/activate
+python synthetic-data.py
diff --git a/scripts/merge-and-convert.sh b/scripts/merge-and-convert.sh
new file mode 100755
index 0000000..6cc1df6
--- /dev/null
+++ b/scripts/merge-and-convert.sh
@@ -0,0 +1,19 @@
+#!/bin/bash
+# Merge the LoRA adapters into the base model, then convert the result to GGUF.
+set -e
+
+source venv/bin/activate
+python merge.py
+
+cd llama.cpp
+# setup.sh does not create this venv, so build it (with the converter's
+# requirements) on first use.
+if [ ! -d "convertgguf_venv" ]; then
+    python -m venv convertgguf_venv
+    source convertgguf_venv/bin/activate
+    pip install -r requirements/requirements-convert_hf_to_gguf.txt
+else
+    source convertgguf_venv/bin/activate
+fi
+# Write the GGUF to the exact path scripts/run-model.sh loads.
+python convert_hf_to_gguf.py ../merged_model/ --outfile ../merged_model/Merged_Model.gguf
diff --git a/scripts/run-model.sh b/scripts/run-model.sh
new file mode 100755
index 0000000..31aa1fe
--- /dev/null
+++ b/scripts/run-model.sh
@@ -0,0 +1,2 @@
+#!/bin/bash
+./llama.cpp/build/bin/llama-cli -m ./merged_model/Merged_Model.gguf
diff --git a/setup.sh b/setup.sh
new file mode 100755
index 0000000..499657f
--- /dev/null
+++ b/setup.sh
@@ -0,0 +1,94 @@
+#!/bin/bash
+
+echo "============================================"
+echo " Unsloth Fine-Tune Setup"
+echo "============================================"
+
+# Create main virtual environment
+echo ""
+echo "Creating main virtual environment..."
+python -m venv venv
+source venv/bin/activate
+pip install -r requirements.txt
+
+# Prompt for backend
+echo ""
+echo "Select llama.cpp backend:"
+echo " 1) CUDA (NVIDIA GPU)"
+echo " 2) ROCm (AMD GPU)"
+echo " 3) Vulkan (Cross-vendor GPU)"
+echo " 4) CPU only"
+echo ""
+read -p "Enter choice (1-4): " BACKEND
+
+# Clone llama.cpp
+echo ""
+echo "Cloning llama.cpp..."
+if [ ! -d "llama.cpp" ]; then
+    git clone https://github.com/ggml-org/llama.cpp.git
+else
+    echo "llama.cpp already exists, skipping clone." 
+fi
+
+# Build llama.cpp with correct flags (stop if the checkout is missing)
+echo ""
+echo "Building llama.cpp..."
+cd llama.cpp || exit 1
+
+BUILD_FAILED=0
+
+case "$BACKEND" in
+    1)
+        echo "Building with CUDA support..."
+        cmake -B build -DGGML_CUDA=ON || BUILD_FAILED=1
+        [ $BUILD_FAILED -eq 0 ] && cmake --build build --config Release -j$(nproc) || BUILD_FAILED=1
+        ;;
+    2)  # set GPU_TARGETS in the environment to build for a GPU other than gfx1030
+        echo "Building with ROCm support..."
+        HIPCXX="$(hipconfig -l)/clang" HIP_PATH="$(hipconfig -R)" \
+            cmake -S . -B build -DGGML_HIP=ON -DGPU_TARGETS="${GPU_TARGETS:-gfx1030}" -DCMAKE_BUILD_TYPE=Release || BUILD_FAILED=1
+        [ $BUILD_FAILED -eq 0 ] && cmake --build build --config Release -j$(nproc) || BUILD_FAILED=1
+        ;;
+    3)
+        echo "Building with Vulkan support..."
+        cmake -B build -DGGML_VULKAN=1 || BUILD_FAILED=1
+        [ $BUILD_FAILED -eq 0 ] && cmake --build build --config Release -j$(nproc) || BUILD_FAILED=1
+        ;;
+    4)
+        echo "Building CPU-only..."
+        cmake -B build || BUILD_FAILED=1
+        [ $BUILD_FAILED -eq 0 ] && cmake --build build --config Release -j$(nproc) || BUILD_FAILED=1
+        ;;
+    *)
+        echo "Invalid choice. Building CPU-only."
+        cmake -B build || BUILD_FAILED=1
+        [ $BUILD_FAILED -eq 0 ] && cmake --build build --config Release -j$(nproc) || BUILD_FAILED=1
+        ;;
+esac
+
+cd ..
+
+# Install llama-cpp-python in main venv, built with the same backend flag
+echo ""
+echo "Installing llama-cpp-python..."
+
+source venv/bin/activate
+
+case "$BACKEND" in
+    1) CMAKE_ARGS="-DGGML_CUDA=on" ;;
+    2) CMAKE_ARGS="-DGGML_HIP=on" ;;
+    3) CMAKE_ARGS="-DGGML_VULKAN=on" ;;
+    *) CMAKE_ARGS="" ;;
+esac
+
+CMAKE_ARGS="$CMAKE_ARGS" pip install llama-cpp-python
+
+if [ $BUILD_FAILED -eq 0 ]; then
+    echo ""
+    echo "Setup complete! Configure the scripts and run:"
+    echo " bash run-pipeline.sh"
+else
+    echo ""
+    echo "Build failed. 
See the build guide for help:"
+    echo " https://github.com/ggml-org/llama.cpp/blob/master/docs/build.md"
+fi
diff --git a/synthetic-data.py b/synthetic-data.py
new file mode 100644
index 0000000..692fb1c
--- /dev/null
+++ b/synthetic-data.py
@@ -0,0 +1,205 @@
+import os
+import re
+
+from datasets import Dataset, concatenate_datasets, load_dataset
+from llama_cpp import Llama
+
+# 1. CONFIGURATION
+GGUF_MODEL_PATH = "./path/to/model.gguf"
+INPUT_PARQUET_PATH = "./path/to/input.parquet"
+OUTPUT_PARQUET_PATH = "./path/to/output.parquet"
+NEW_ROWS_COUNT = 100
+
+# Check if files exist (exit with a non-zero status so callers see the failure)
+if not os.path.exists(GGUF_MODEL_PATH):
+    print(f"❌ Error: GGUF model file not found at {GGUF_MODEL_PATH}")
+    raise SystemExit(1)
+if not os.path.exists(INPUT_PARQUET_PATH):
+    print(f"❌ Error: Input Parquet file not found at {INPUT_PARQUET_PATH}")
+    raise SystemExit(1)
+
+# 2. LOAD GGUF MODEL - offload all layers to the GPU backend llama-cpp-python was built with
+print("Loading llama.cpp model...")
+try:
+    model = Llama(
+        model_path=GGUF_MODEL_PATH,
+        n_ctx=8192,
+        n_gpu_layers=-1,  # ALL layers to GPU
+        verbose=False,  # No logging
+        n_batch=512,
+        logits_all=False,
+        use_mmap=True,
+        use_mlock=False,
+    )
+    print("✅ llama.cpp model loaded.")
+except Exception as e:
+    print(f"❌ Error loading model: {e}")
+    raise SystemExit(1)
+
+# 3. LOAD EXISTING DATASET
+print("Loading existing dataset from INPUT...")
+try:
+    original_ds = load_dataset(
+        "parquet", data_files=[INPUT_PARQUET_PATH], split="train"
+    )
+    print(f"Original Columns: {original_ds.column_names}")
+    print(f"Original Dataset Shape: {original_ds.shape}")
+except Exception as e:
+    print(f"❌ Error loading dataset: {e}")
+    raise SystemExit(1)
+
+existing_labels = set(original_ds["label"])  # only used for membership tests
+
+# 4. GENERATE SYNTHETIC DATA - STRUCTURED OUTPUT
+print(f"Generating {NEW_ROWS_COUNT} synthetic records...")
+synthetic_data = []
+
+for i in range(NEW_ROWS_COUNT):
+    try:
+        # Use chat format for Qwen
+        messages = [
+            {
+                "role": "system",
+                "content": "You are a data generator. 
Output ONLY the format below, nothing else.",
+            },
+            {
+                "role": "user",
+                "content": """You are generating a gender bias avoidance dataset.
+                You must output ONLY raw text in the following format.
+                DO NOT use JSON. DO NOT use markdown code blocks (no ```).
+                DO NOT repeat the instructions.
+                Questions should be around 1-2 sentences long. Answers should be essays around 5 paragraphs in length.
+                Answers should be thorough and detailed.
+                Questions can vary from simple to complex systemic societal issues.
+                Physiological differences are still real and should be accounted for when encountering a question related to it.
+                Questions should be equally distributed across all categories, like job/work, societal, relationships, personal, financial etc...
+
+                Format:
+                Question: [Ask a question which is stereotypically answered with gender bias]
+                Answer: [Provide an answer which is COMPLETELY unbiased]
+                Label: unbiased
+
+                DO NOT repeat the format without actually filling it out and DO NOT create empty placeholder questions.
+                ----
+                Make sure that the content and Question: or Answer: are on the same line. Like this:
+                Question: Here goes the question. It can continue in new lines but needs to start here.
+                and not like this:
+                Question:
+                It doesn't go here without having a previous sentence after the Question: tag.
+                -----
+                Now generate one record strictly adhering to the format, filling out both question and answer. 
+                Question:
+                Answer:
+                Label: unbiased""",
+            },
+        ]
+
+        # Generate with sampling parameters
+        response = model.create_chat_completion(
+            messages=messages,
+            max_tokens=2048,  # room for the multi-paragraph answers the prompt asks for
+            temperature=1.0,
+            top_p=0.95,
+            top_k=20,
+            min_p=0.0,
+        )
+
+        # Get response text
+        generated_text = response["choices"][0]["message"]["content"].strip()
+
+        # Skip generations cut off by the token limit: their answer is incomplete.
+        if response["choices"][0].get("finish_reason") == "length":
+            print(f"⚠️ Row {i + 1}: Output truncated at max_tokens. Skipping.")
+            continue
+
+        # PARSE TO STRUCTURED FORMAT
+        # Each field starts on its "Tag:" line and runs until the next tag, so a
+        # multi-paragraph answer is kept whole instead of being cut off at its
+        # first line break.
+        parsed_row = {}
+        lines = generated_text.split("\n")
+
+        # Tolerate markdown decoration around the tags, e.g. "**Question:** ...".
+        tag_pattern = re.compile(
+            r"[\s*#]*(Question|Answer|Label)\s*:[\s*]*(.*)", re.IGNORECASE
+        )
+        fields = {"question": [], "answer": [], "label": []}
+        current_field = None
+
+        for line in lines:
+            match = tag_pattern.match(line)
+            if match:
+                current_field = match.group(1).lower()
+                # A repeated tag restarts its field, so the last occurrence wins.
+                fields[current_field] = [match.group(2)]
+                if current_field == "label":
+                    # The label is a single line; ignore anything after it.
+                    current_field = None
+            elif current_field is not None:
+                # Continuation line of the field opened by the previous tag.
+                fields[current_field].append(line)
+
+        # Empty fields become None so the validation below rejects the record.
+        question = "\n".join(fields["question"]).strip() or None
+        answer = "\n".join(fields["answer"]).strip() or None
+        label = "\n".join(fields["label"]).strip() or None
+
+        # VALIDATION
+        if not all([question, answer]):
+            print(f"⚠️ Row {i + 1}: Incomplete output. Skipping.")
+            for line in lines:
+                print(line)
+            continue
+
+        if not label:
+            label = "unbiased"
+        else:
+            # Normalize label
+            label = (
+                label.lower().strip('"').strip("'").replace("[", "").replace("]", "")
+            )
+
+        if label not in existing_labels:
+            print(f"⚠️ Row {i + 1}: Invalid label '{label}'. 
Skipping.")
+            continue
+
+        # Clean up: drop fenced code blocks, including ones that span several lines
+        question = re.sub(r"```.*?```", "", question, flags=re.DOTALL).strip()
+        answer = re.sub(r"```.*?```", "", answer, flags=re.DOTALL).strip()
+
+        parsed_row = {"question": question, "answer": answer, "label": label}
+
+        # PRINT PARSED DATA IN TERMINAL
+        print(f"✅ ROW {i + 1} PARSED:")
+        print(f" Question: {question}")
+        print(f" Answer: {answer}")
+        print(f" Label: {label}")
+        print()
+
+        synthetic_data.append(parsed_row)
+
+    except Exception as e:
+        print(f"❌ Row {i + 1}: Error: {e}")
+        continue
+
+# 5. SAVE TO PARQUET
+if synthetic_data:
+    print(f"Adding {len(synthetic_data)} synthetic records...")
+    synthetic_ds = Dataset.from_list(synthetic_data)
+
+    # Reuse the dataset loaded in step 3 instead of re-reading the same
+    # Parquet file from disk.
+    base_ds = original_ds
+    print(f"Existing: {len(base_ds)} rows")
+
+    combined_ds = concatenate_datasets([base_ds, synthetic_ds])
+    print(f"Combined: {len(combined_ds)} rows")
+
+    # Create the output folder first so a long generation run is not lost to
+    # a missing directory.
+    os.makedirs(os.path.dirname(OUTPUT_PARQUET_PATH) or ".", exist_ok=True)
+    combined_ds.to_parquet(OUTPUT_PARQUET_PATH)
+    print(f"✅ Saved to {OUTPUT_PARQUET_PATH}")
+else:
+    print("❌ No valid records generated.")
+    raise SystemExit(1)