From 96e6e141da7222cb03d5990c2a6c21b9ff191fc8 Mon Sep 17 00:00:00 2001 From: Oracle Date: Tue, 2 Jun 2026 16:50:03 +0200 Subject: [PATCH] Replace false prompt with placeholder --- README.md | 2 ++ synthetic-data.py | 29 +---------------------------- 2 files changed, 3 insertions(+), 28 deletions(-) diff --git a/README.md b/README.md index f51cfd3..2c199be 100644 --- a/README.md +++ b/README.md @@ -81,6 +81,8 @@ Generates synthetic training data using a GGUF model via llama.cpp. Run this if | `INPUT_PARQUET_PATH` | Path to existing training data to extend | `./data/train.parquet` | | `OUTPUT_PARQUET_PATH` | Path to save the combined dataset | `./data/output.parquet` | | `NEW_ROWS_COUNT` | Number of synthetic records to generate | `100` | +| User prompt (line 66) | Replace `"YOUR PROMPT GOES HERE"` with generation instructions | `Generate questions about machine learning...` | +| System message (line 62) | Controls the model's role | `"You are a data generator. Output ONLY the format below..."` | ```bash bash scripts/generate-data.sh diff --git a/synthetic-data.py b/synthetic-data.py index 692fb1c..93e094b 100644 --- a/synthetic-data.py +++ b/synthetic-data.py @@ -56,7 +56,6 @@ synthetic_data = [] for i in range(NEW_ROWS_COUNT): try: - # Use chat format for Qwen messages = [ { "role": "system", @@ -64,33 +63,7 @@ for i in range(NEW_ROWS_COUNT): }, { "role": "user", - "content": """You are generating a gender bias avoidance dataset. - You must output ONLY raw text in the following format. - DO NOT use JSON. DO NOT use markdown code blocks (no ```). - DO NOT repeat the instructions. - Questions should be around 1-2 sentences long. Answers should be around 5 paragraphs in lengh essays. - Answers should be answered thoroughly and detailled. - Questions can vary from simple to complex systemic societal issues. - Physiological differences are still real and should be accounted for when encountering a question related to it. 
- Questions should be equaly distributed across all categories, like job/works, societal, relationships, personal, financial etc... - - Format: - Question: [Ask a question which is stereotypically answered with gender bias] - Answer: [Provide an answer which is COMPLETELY unbiased] - Label: unbiased - - DO NOT repeat the format without actually filling it out and DO NOT create empty placeholder questions. - ---- - Make sure that the content and Question: or Answer: are on the same line. Like this: - Question: Here goes the question. It can continue in new lines but needs to start here. - and not like this: - Question: - It doesnt go here without having a previouse sentence after the Question: tag. - ----- - Now generate one record strictly adhering to the format, filling out both question and answer. - Question: - Answer: - Label: unbiased""", + "content": """YOUR PROMPT GOES HERE""", }, ]