Initial commit
This commit is contained in:
commit
da2c8e636c
11 changed files with 755 additions and 0 deletions
205
synthetic-data.py
Normal file
205
synthetic-data.py
Normal file
|
|
@ -0,0 +1,205 @@
|
|||
import os
|
||||
import re
|
||||
|
||||
from datasets import Dataset, concatenate_datasets, load_dataset
|
||||
from llama_cpp import Llama
|
||||
|
||||
# 1. CONFIGURATION
# Edit these paths before running the script.
GGUF_MODEL_PATH = "./path/to/model.gguf"  # GGUF weights passed to llama.cpp
INPUT_PARQUET_PATH = "./path/to/input.parquet"  # existing dataset to extend
OUTPUT_PARQUET_PATH = "./path/to/output.parquet"  # combined dataset is written here
NEW_ROWS_COUNT = 100  # number of generation attempts

# Check if files exist
# Fail fast, before the expensive model load, when a required input is missing.
# `raise SystemExit(1)` is used instead of the bare `exit()` helper: `exit` is
# injected by the `site` module for interactive use and, called without an
# argument, terminates with status 0, which would report success to the caller.
if not os.path.exists(GGUF_MODEL_PATH):
    print(f"❌ Error: GGUF model file not found at {GGUF_MODEL_PATH}")
    raise SystemExit(1)
if not os.path.exists(INPUT_PARQUET_PATH):
    print(f"❌ Error: Input Parquet file not found at {INPUT_PARQUET_PATH}")
    raise SystemExit(1)
|
||||
|
||||
# 2. LOAD GGUF MODEL - GPU (Vulkan) ONLY
print("Loading llama.cpp model...")
try:
    # Only the constructor sits inside the `try`, so the handler reports
    # model-loading failures and nothing else.
    model = Llama(
        model_path=GGUF_MODEL_PATH,
        n_ctx=8192,
        n_gpu_layers=-1,  # ALL layers to GPU
        verbose=False,  # No logging
        n_batch=512,
        logits_all=False,
        use_mmap=True,
        use_mlock=False,
    )
except Exception as e:
    print(f"❌ Error loading model: {e}")
    # Exit with a non-zero status; the bare `exit()` helper reported success (0).
    raise SystemExit(1) from e
else:
    # NOTE(review): the message assumes a Vulkan build of llama.cpp; nothing
    # here verifies which backend was actually used.
    print("✅ llama.cpp model loaded with Vulkan GPU.")
|
||||
|
||||
# 3. LOAD EXISTING DATASET
print("Loading existing dataset from INPUT...")
try:
    original_ds = load_dataset(
        "parquet", data_files=[INPUT_PARQUET_PATH], split="train"
    )
except Exception as e:
    print(f"❌ Error loading dataset: {e}")
    # Exit with a non-zero status; the bare `exit()` helper reported success (0).
    raise SystemExit(1) from e

print(f"Original Columns: {original_ds.column_names}")
print(f"Original Dataset Shape: {original_ds.shape}")

# The label whitelist below needs a "label" column; report its absence as a
# clear error instead of letting an unhandled KeyError escape.
if "label" not in original_ds.column_names:
    print("❌ Error loading dataset: required column 'label' is missing")
    raise SystemExit(1)

# Distinct label values already present in the input; generated rows whose
# label is not in this collection are rejected later in the script.
existing_labels = list(set(original_ds["label"]))
|
||||
|
||||
# 4. GENERATE SYNTHETIC DATA - STRUCTURED OUTPUT

# Completion budget per record. The prompt asks for answers of about five
# paragraphs, which the previous limit of 200 tokens could not hold: answers
# were cut off and the trailing "Label:" line was never produced.
MAX_NEW_TOKENS = 2048

# A line that opens one of the three fields. The tag must start the line
# (optionally behind markdown decoration such as "**" or "- ") so that the
# words "question:" / "answer:" inside essay text are not mistaken for tags.
_FIELD_TAG_RE = re.compile(
    r"^[\s*#>-]*(question|answer|label)\**\s*:\**\s*(.*)$", re.IGNORECASE
)

# Fenced code blocks, possibly spanning several lines.
_CODE_FENCE_RE = re.compile(r"```.*?```", re.DOTALL)

# The prompt is identical for every row, so it is built once outside the loop.
# Use chat format for Qwen
GENERATION_MESSAGES = [
    {
        "role": "system",
        "content": "You are a data generator. Output ONLY the format below, nothing else.",
    },
    {
        "role": "user",
        "content": """You are generating a gender bias avoidance dataset.
You must output ONLY raw text in the following format.
DO NOT use JSON. DO NOT use markdown code blocks (no ```).
DO NOT repeat the instructions.
Questions should be around 1-2 sentences long. Answers should be around 5 paragraphs in lengh essays.
Answers should be answered thoroughly and detailled.
Questions can vary from simple to complex systemic societal issues.
Physiological differences are still real and should be accounted for when encountering a question related to it.
Questions should be equaly distributed across all categories, like job/works, societal, relationships, personal, financial etc...

Format:
Question: [Ask a question which is stereotypically answered with gender bias]
Answer: [Provide an answer which is COMPLETELY unbiased]
Label: unbiased

DO NOT repeat the format without actually filling it out and DO NOT create empty placeholder questions.
----
Make sure that the content and Question: or Answer: are on the same line. Like this:
Question: Here goes the question. It can continue in new lines but needs to start here.
and not like this:
Question:
It doesnt go here without having a previouse sentence after the Question: tag.
-----
Now generate one record strictly adhering to the format, filling out both question and answer.
Question:
Answer:
Label: unbiased""",
    },
]


def _parse_record(text):
    """Split one model completion into its question, answer and label.

    The text is scanned line by line. A line that starts with a
    ``Question:``, ``Answer:`` or ``Label:`` tag (case-insensitive) opens
    that field; following untagged lines are appended to the open question
    or answer, so multi-paragraph answers are kept whole. The label is a
    single line. If a tag occurs more than once, the last occurrence wins.

    Args:
        text: Raw completion text returned by the model.

    Returns:
        A dict with the keys ``"question"``, ``"answer"`` and ``"label"``;
        a value is ``None`` when the field is missing or empty.
    """
    collected = {"question": [], "answer": [], "label": []}
    open_field = None
    for raw_line in text.split("\n"):
        line = raw_line.strip()
        tag = _FIELD_TAG_RE.match(line)
        if tag:
            name = tag.group(1).lower()
            collected[name] = [tag.group(2)]
            # Labels never continue onto further lines.
            open_field = None if name == "label" else name
        elif open_field is not None:
            collected[open_field].append(line)
    return {
        name: "\n".join(parts).strip() or None for name, parts in collected.items()
    }


print(f"Generating {NEW_ROWS_COUNT} synthetic records...")
synthetic_data = []

for i in range(NEW_ROWS_COUNT):
    # Only the model call can fail for reasons outside this script, so only it
    # is guarded; a failed row is reported and skipped (best effort).
    try:
        # Generate with sampling parameters
        response = model.create_chat_completion(
            messages=GENERATION_MESSAGES,
            max_tokens=MAX_NEW_TOKENS,
            temperature=1.0,
            top_p=0.95,
            top_k=20,
            min_p=0.0,
        )
        choice = response["choices"][0]
        # Get response text
        generated_text = choice["message"]["content"].strip()
    except Exception as e:
        print(f"❌ Row {i + 1}: Error: {e}")
        continue

    # A completion that ran into the token limit is cut mid-answer; storing it
    # would put a truncated essay into the dataset.
    if choice.get("finish_reason") == "length":
        print(f"⚠️ Row {i + 1}: Incomplete output. Skipping.")
        print(generated_text)
        continue

    # DIRECTLY PARSE TO STRUCTURED FORMAT
    record = _parse_record(generated_text)
    label = record["label"]

    # Clean up before validating, so a field consisting only of a code fence
    # is rejected as empty.
    question = _CODE_FENCE_RE.sub("", record["question"] or "").strip()
    answer = _CODE_FENCE_RE.sub("", record["answer"] or "").strip()

    # VALIDATION
    if not (question and answer):
        print(f"⚠️ Row {i + 1}: Incomplete output. Skipping.")
        print(generated_text)
        continue

    if not label:
        label = "unbiased"
    else:
        # Normalize label
        label = (
            label.lower().strip('"').strip("'").replace("[", "").replace("]", "")
        ).strip()

    if label not in existing_labels:
        print(f"⚠️ Row {i + 1}: Invalid label '{label}'. Skipping.")
        continue

    parsed_row = {"question": question, "answer": answer, "label": label}

    # PRINT PARSED DATA IN TERMINAL
    print(f"✅ ROW {i + 1} PARSED:")
    print(f" Question: {question}")
    print(f" Answer: {answer}")
    print(f" Label: {label}")
    print()

    synthetic_data.append(parsed_row)
|
||||
|
||||
# 5. SAVE TO PARQUET
if not synthetic_data:
    print("❌ No valid records generated.")
    # Nothing was produced: signal failure to the caller instead of exiting 0.
    raise SystemExit(1)

print(f"Adding {len(synthetic_data)} synthetic records...")
synthetic_ds = Dataset.from_list(synthetic_data)

# The input dataset is already in memory as `original_ds`; reading the same
# parquet file a second time only repeated the I/O.
base_ds = original_ds
print(f"Existing: {len(base_ds)} rows")

combined_ds = concatenate_datasets([base_ds, synthetic_ds])
print(f"Combined: {len(combined_ds)} rows")

# Create the output directory if needed so the generated rows are not lost to
# a missing-directory error at the very last step.
output_dir = os.path.dirname(OUTPUT_PARQUET_PATH)
if output_dir:
    os.makedirs(output_dir, exist_ok=True)

combined_ds.to_parquet(OUTPUT_PARQUET_PATH)
print(f"✅ Saved to {OUTPUT_PARQUET_PATH}")
|
||||
Loading…
Add table
Add a link
Reference in a new issue