From 96e6e141da7222cb03d5990c2a6c21b9ff191fc8 Mon Sep 17 00:00:00 2001
From: Oracle <otis.schmedt@gmx.de>
Date: Tue, 2 Jun 2026 16:50:03 +0200
Subject: [PATCH] Replace false prompt with placeholder

---
 README.md         |  2 ++
 synthetic-data.py | 29 +----------------------------
 2 files changed, 3 insertions(+), 28 deletions(-)

diff --git a/README.md b/README.md
index f51cfd3..2c199be 100644
--- a/README.md
+++ b/README.md
@@ -81,6 +81,8 @@ Generates synthetic training data using a GGUF model via llama.cpp. Run this if
 | `INPUT_PARQUET_PATH` | Path to existing training data to extend | `./data/train.parquet` |
 | `OUTPUT_PARQUET_PATH` | Path to save the combined dataset | `./data/output.parquet` |
 | `NEW_ROWS_COUNT` | Number of synthetic records to generate | `100` |
+| User prompt (line 66) | Replace `"YOUR PROMPT GOES HERE"` with generation instructions | `Generate questions about machine learning...` |
+| System message (line 62) | Controls the model's role | `"You are a data generator. Output ONLY the format below..."` |
 
 ```bash
 bash scripts/generate-data.sh
diff --git a/synthetic-data.py b/synthetic-data.py
index 692fb1c..93e094b 100644
--- a/synthetic-data.py
+++ b/synthetic-data.py
@@ -56,7 +56,6 @@ synthetic_data = []
 
 for i in range(NEW_ROWS_COUNT):
     try:
-        # Use chat format for Qwen
         messages = [
             {
                 "role": "system",
@@ -64,33 +63,7 @@ for i in range(NEW_ROWS_COUNT):
             },
             {
                 "role": "user",
-                "content": """You are generating a gender bias avoidance dataset.
-                                You must output ONLY raw text in the following format.
-                                DO NOT use JSON. DO NOT use markdown code blocks (no ```).
-                                DO NOT repeat the instructions.
-                                Questions should be around 1-2 sentences long. Answers should be around 5 paragraphs in lengh essays.
-                                Answers should be answered thoroughly and detailled.
-                                Questions can vary from simple to complex systemic societal issues.
-                                Physiological differences are still real and should be accounted for when encountering a question related to it.
-                                Questions should be equaly distributed across all categories, like job/works, societal, relationships, personal, financial etc...
-
-                                Format:
-                                Question: [Ask a question which is stereotypically answered with gender bias]
-                                Answer: [Provide an answer which is COMPLETELY unbiased]
-                                Label: unbiased
-
-                                DO NOT repeat the format without actually filling it out and DO NOT create empty placeholder questions.
-                                ----
-                                Make sure that the content and Question: or Answer: are on the same line. Like this:
-                                Question: Here goes the question. It can continue in new lines but needs to start here.
-                                and not like this:
-                                Question:
-                                It doesnt go here without having a previouse sentence after the Question: tag.
-                                -----
-                                Now generate one record strictly adhering to the format, filling out both question and answer.
-                                Question:
-                                Answer:
-                                Label: unbiased""",
+                "content": """YOUR PROMPT GOES HERE""",
             },
         ]