Initial commit

2026-06-02 15:45:59 +02:00 · 2026-06-02 15:45:59 +02:00 · da2c8e636c
commit da2c8e636c
11 changed files with 755 additions and 0 deletions
--- a/synthetic-data.py
+++ b/synthetic-data.py
@ -0,0 +1,205 @@
+import os
+import re
+
+from datasets import Dataset, concatenate_datasets, load_dataset
+from llama_cpp import Llama
+
+# 1. CONFIGURATION
+# Placeholder paths: point these at the real model and dataset before running.
+GGUF_MODEL_PATH = "./path/to/model.gguf"
+INPUT_PARQUET_PATH = "./path/to/input.parquet"
+OUTPUT_PARQUET_PATH = "./path/to/output.parquet"
+# Number of generation attempts made by the loop below; attempts whose output
+# fails validation are skipped, so fewer rows than this may be produced.
+NEW_ROWS_COUNT = 100
+
+# Fail fast, before the expensive model load, if a required input is missing.
+# SystemExit(1) is raised instead of calling exit(): exit() is a convenience
+# helper installed by the `site` module (absent under `python -S`), and when
+# called without an argument it reports success (status 0) despite the error.
+# isfile() is used so that a directory at the path is rejected as well.
+if not os.path.isfile(GGUF_MODEL_PATH):
+    print(f"❌ Error: GGUF model file not found at {GGUF_MODEL_PATH}")
+    raise SystemExit(1)
+if not os.path.isfile(INPUT_PARQUET_PATH):
+    print(f"❌ Error: Input Parquet file not found at {INPUT_PARQUET_PATH}")
+    raise SystemExit(1)
+
+# 2. LOAD GGUF MODEL - GPU (Vulkan) ONLY
+# NOTE(review): n_gpu_layers=-1 only *requests* full GPU offload; whether the
+# Vulkan backend is actually used depends on how llama-cpp-python was built.
+# The success message below is not verified by this code - confirm.
+print("Loading llama.cpp model...")
+try:
+    model = Llama(
+        model_path=GGUF_MODEL_PATH,
+        n_ctx=8192,  # context window in tokens (prompt + generated output)
+        n_gpu_layers=-1,  # ALL layers to GPU
+        verbose=False,  # No logging
+        n_batch=512,
+        logits_all=False,
+        use_mmap=True,
+        use_mlock=False,
+    )
+    print("✅ llama.cpp model loaded with Vulkan GPU.")
+except Exception as e:
+    # Top-level boundary: report the failure and stop with a non-zero status
+    # so shells and pipelines can detect it (bare exit() would return 0).
+    print(f"❌ Error loading model: {e}")
+    raise SystemExit(1) from e
+
+# 3. LOAD EXISTING DATASET
+print("Loading existing dataset from INPUT...")
+try:
+    original_ds = load_dataset(
+        "parquet", data_files=[INPUT_PARQUET_PATH], split="train"
+    )
+    print(f"Original Columns: {original_ds.column_names}")
+    print(f"Original Dataset Shape: {original_ds.shape}")
+except Exception as e:
+    # Top-level boundary: report the failure and stop with a non-zero status
+    # so shells and pipelines can detect it (bare exit() would return 0).
+    print(f"❌ Error loading dataset: {e}")
+    raise SystemExit(1) from e
+
+# The generation step validates every produced label against the labels that
+# already occur in the input, so the column must exist. Checking explicitly
+# gives a readable error instead of an unhandled traceback.
+if "label" not in original_ds.column_names:
+    print(
+        "❌ Error: Input dataset has no 'label' column "
+        f"(found {original_ds.column_names})"
+    )
+    raise SystemExit(1)
+
+# Distinct label values present in the input dataset.
+existing_labels = list(set(original_ds["label"]))
+
+# 4. GENERATE SYNTHETIC DATA - STRUCTURED OUTPUT
+
+# Upper bound on generated tokens per record. The prompt asks for answers of
+# about five paragraphs, which does not fit in the previous limit of 200
+# tokens; the model's context window (n_ctx=8192) leaves ample room for this.
+MAX_NEW_TOKENS = 2048
+
+# The prompt is identical for every record, so it is built once, outside the
+# loop. Chat-style messages are used (the script was written for a Qwen model).
+# The prompt text is runtime input to the model and is kept verbatim.
+GENERATION_MESSAGES = [
+    {
+        "role": "system",
+        "content": "You are a data generator. Output ONLY the format below, nothing else.",
+    },
+    {
+        "role": "user",
+        "content": """You are generating a gender bias avoidance dataset.
+                                You must output ONLY raw text in the following format.
+                                DO NOT use JSON. DO NOT use markdown code blocks (no ```).
+                                DO NOT repeat the instructions.
+                                Questions should be around 1-2 sentences long. Answers should be around 5 paragraphs in lengh essays.
+                                Answers should be answered thoroughly and detailled.
+                                Questions can vary from simple to complex systemic societal issues.
+                                Physiological differences are still real and should be accounted for when encountering a question related to it.
+                                Questions should be equaly distributed across all categories, like job/works, societal, relationships, personal, financial etc...
+
+                                Format:
+                                Question: [Ask a question which is stereotypically answered with gender bias]
+                                Answer: [Provide an answer which is COMPLETELY unbiased]
+                                Label: unbiased
+
+                                DO NOT repeat the format without actually filling it out and DO NOT create empty placeholder questions.
+                                ----
+                                Make sure that the content and Question: or Answer: are on the same line. Like this:
+                                Question: Here goes the question. It can continue in new lines but needs to start here.
+                                and not like this:
+                                Question:
+                                It doesnt go here without having a previouse sentence after the Question: tag.
+                                -----
+                                Now generate one record strictly adhering to the format, filling out both question and answer.
+                                Question:
+                                Answer:
+                                Label: unbiased""",
+    },
+]
+
+# A "Question:", "Answer:" or "Label:" tag at the start of a line, matched
+# case-insensitively and tolerating markdown emphasis/heading characters
+# around it (e.g. "**Question:**").
+FIELD_TAG_RE = re.compile(
+    r"^[ \t*#]*(question|answer|label)\**:\**[ \t]*",
+    re.IGNORECASE | re.MULTILINE,
+)
+# Fenced code blocks, which the prompt forbids; DOTALL lets one span lines.
+CODE_FENCE_RE = re.compile(r"```.*?```", re.DOTALL)
+
+
+def _parse_record(text):
+    """Split raw model output into its tagged fields.
+
+    Each field runs from its tag to the next tag (or to the end of the text),
+    so questions and answers spanning several lines or paragraphs are kept
+    whole. Fenced code blocks are removed from every field.
+
+    Args:
+        text: The raw text returned by the model.
+
+    Returns:
+        A dict with any of the keys "question", "answer" and "label" mapped
+        to a non-empty, stripped string. A key is absent when its tag is
+        missing or has no content.
+    """
+    fields = {}
+    tags = list(FIELD_TAG_RE.finditer(text))
+    for index, tag in enumerate(tags):
+        has_next = index + 1 < len(tags)
+        end = tags[index + 1].start() if has_next else len(text)
+        value = CODE_FENCE_RE.sub("", text[tag.end():end]).strip()
+        key = tag.group(1).lower()
+        # The first non-empty occurrence wins, so an echoed empty template
+        # ("Question:" with nothing after it) does not mask the real content.
+        if value and key not in fields:
+            fields[key] = value
+    return fields
+
+
+# Map each normalised (stripped, lowercased) label to the exact spelling used
+# in the input dataset, so generated rows reuse the existing label values.
+# Non-string labels can never equal a parsed text label and are left out.
+label_lookup = {
+    known.strip().lower(): known
+    for known in existing_labels
+    if isinstance(known, str)
+}
+
+print(f"Generating {NEW_ROWS_COUNT} synthetic records...")
+synthetic_data = []
+
+for i in range(NEW_ROWS_COUNT):
+    try:
+        # Generate with sampling parameters
+        response = model.create_chat_completion(
+            messages=GENERATION_MESSAGES,
+            max_tokens=MAX_NEW_TOKENS,
+            temperature=1.0,
+            top_p=0.95,
+            top_k=20,
+            min_p=0.0,
+        )
+        choice = response["choices"][0]
+        generated_text = choice["message"]["content"].strip()
+
+        # An answer cut off by the token limit is not usable training data.
+        if choice.get("finish_reason") == "length":
+            print(
+                f"⚠️ Row {i + 1}: Output truncated at {MAX_NEW_TOKENS} tokens. Skipping."
+            )
+            continue
+
+        fields = _parse_record(generated_text)
+        question = fields.get("question")
+        answer = fields.get("answer")
+
+        # VALIDATION (runs after code-fence cleanup, so a field that was
+        # nothing but a fenced block is rejected rather than stored empty)
+        if not (question and answer):
+            print(f"⚠️ Row {i + 1}: Incomplete output. Skipping.")
+            print(generated_text)
+            continue
+
+        raw_label = fields.get("label")
+        if raw_label is None:
+            # The prompt fixes the label, so a missing tag defaults to it.
+            normalized_label = "unbiased"
+        else:
+            # Only the first line is the label; anything after it is chatter.
+            normalized_label = (
+                raw_label.splitlines()[0]
+                .lower()
+                .strip()
+                .strip('"')
+                .strip("'")
+                .replace("[", "")
+                .replace("]", "")
+                .strip()
+            )
+
+        if normalized_label not in label_lookup:
+            print(f"⚠️ Row {i + 1}: Invalid label '{normalized_label}'. Skipping.")
+            continue
+        label = label_lookup[normalized_label]
+
+        parsed_row = {"question": question, "answer": answer, "label": label}
+
+        # PRINT PARSED DATA IN TERMINAL
+        print(f"✅ ROW {i + 1} PARSED:")
+        print(f"   Question: {question}")
+        print(f"   Answer: {answer}")
+        print(f"   Label: {label}")
+        print()
+
+        synthetic_data.append(parsed_row)
+
+    except Exception as e:
+        # Best effort: one failed generation must not abort the whole run.
+        print(f"❌ Row {i + 1}: Error: {e}")
+        continue
+
+# 5. SAVE TO PARQUET
+if synthetic_data:
+    print(f"Adding {len(synthetic_data)} synthetic records...")
+    synthetic_ds = Dataset.from_list(synthetic_data)
+
+    # The input was already loaded into original_ds at startup; reuse it
+    # rather than reading the same Parquet file a second time.
+    print(f"Existing: {len(original_ds)} rows")
+
+    combined_ds = concatenate_datasets([original_ds, synthetic_ds])
+    print(f"Combined: {len(combined_ds)} rows")
+
+    # Create the target directory first so a long generation run is not lost
+    # to a missing folder at the very last step.
+    output_dir = os.path.dirname(OUTPUT_PARQUET_PATH)
+    if output_dir:
+        os.makedirs(output_dir, exist_ok=True)
+
+    combined_ds.to_parquet(OUTPUT_PARQUET_PATH)
+    print(f"✅ Saved to {OUTPUT_PARQUET_PATH}")
+else:
+    print("❌ No valid records generated.")
+    # Nothing was written: signal the failure to the caller via the exit code.
+    raise SystemExit(1)