From 815c9aadec67548b286ef3a395778251d12107d0 Mon Sep 17 00:00:00 2001
From: Oracle <otis.schmedt@gmx.de>
Date: Tue, 2 Jun 2026 17:08:39 +0200
Subject: [PATCH] Use existing llama.cpp build for python bindings if possible

---
 README.md            | 37 ++++++++++++++++++++----
 finetune.py          |  4 +--
 scripts/run-model.sh |  2 +-
 setup.sh             | 69 +++++++++++++++++++++++++++-----------------
 synthetic-data.py    | 58 ++++++++++---------------------------
 5 files changed, 92 insertions(+), 78 deletions(-)
diff --git a/README.md b/README.md
index 2c199be..cac3fbf 100644
--- a/README.md
+++ b/README.md
@@ -38,14 +38,16 @@ run-pipeline.sh            → Run finetune → merge/convert → run in sequenc
 
 `setup.sh` will:
 1. Create a Python virtual environment and install Python dependencies
-2. Clone [llama.cpp](https://github.com/ggml-org/llama.cpp) or symlink an existing build
-3. Build llama.cpp with your selected GPU backend (skip if using existing)
-4. Install llama-cpp-python bindings with matching backend flags
+2. Clone [llama.cpp](https://github.com/ggml-org/llama.cpp) (fresh build) or symlink an existing build
+3. Build llama.cpp with shared libraries (`-DBUILD_SHARED_LIBS=ON`)
+4. Install llama-cpp-python bindings linked against the shared library (`-DLLAMA_BUILD=OFF`)
 
-**Using an existing llama.cpp build:** Choose option 2 and provide the absolute path to your existing build. Setup will create a symlink at `./llama.cpp`.
+**Using an existing llama.cpp build:** Choose option 2 and provide the absolute path. The build must have been created with `-DBUILD_SHARED_LIBS=ON` and contain `libllama.so`. Setup will create a symlink at `./llama.cpp`.
 
 ### Backend Selection
 
+The backend prompt only appears when building llama.cpp from scratch (option 1). Choose based on your GPU:
+
 | Choice | Backend | Requirements |
 |---|---|---|
 | 1 | CUDA (NVIDIA) | Systemwide CUDA installation (NVIDIA drivers + CUDA toolkit) |
@@ -67,6 +69,17 @@ vulkaninfo
 
 Should run without errors.
 
+### Existing llama.cpp Build
+
+If using option 2 (existing build), ensure it was compiled with shared libraries:
+
+```bash
+cmake -B build -DBUILD_SHARED_LIBS=ON -DGGML_CUDA=ON  # or -DGGML_HIP=ON / -DGGML_VULKAN=1
+cmake --build build --config Release -j$(nproc)
+```
+
+The build must contain `libllama.so` (typically at `build/libllama.so`).
+
 ## Scripts
 
 ### 1. scripts/generate-data.sh
@@ -81,8 +94,20 @@ Generates synthetic training data using a GGUF model via llama.cpp. Run this if
 | `INPUT_PARQUET_PATH` | Path to existing training data to extend | `./data/train.parquet` |
 | `OUTPUT_PARQUET_PATH` | Path to save the combined dataset | `./data/output.parquet` |
 | `NEW_ROWS_COUNT` | Number of synthetic records to generate | `100` |
-| User prompt (line 67) | Replace `"YOUR PROMPT GOES HERE"` with generation instructions | `Generate questions about machine learning...` |
-| System message (line 63) | Controls the model's role | `"You are a data generator. Output ONLY the format below..."` |
+| `User prompt` (line 66) | Replace `"YOUR PROMPT GOES HERE"` with generation instructions | `Generate questions about machine learning...` |
+| `System message` (line 62) | Controls the model's role | `"You are a data generator. Output ONLY the format below..."` |
+| `MAX_TOKENS` | Maximum number of tokens generated per response | `200` |
+| `TEMPERATURE` | Sampling temperature (higher values give more varied output) | `0.7` |
+| `TOP_P` | Nucleus sampling threshold | `0.95` |
+| `TOP_K` | Top-k sampling threshold | `50` |
+| `MIN_P` | Minimum probability threshold | `0.05` |
+
+The script expects the model's output in the following format:
+
+```
+Question: <generated question>
+Answer: <generated answer>
+```
 
 ```bash
 bash scripts/generate-data.sh
diff --git a/finetune.py b/finetune.py
index 2f4753c..90e7ada 100644
--- a/finetune.py
+++ b/finetune.py
@@ -16,7 +16,7 @@ warnings.filterwarnings("ignore")
 # ==========================================
 
 # Update these paths
-DATA_PATH = "YOUR_PAQUET_FILE_PATH"
+DATA_PATH = "YOUR_PARQUET_FILE_PATH"
 OUTPUT_DIR = "./model"
 # Training params, change these to fit your hardware
 BATCH_SIZE = 2
@@ -37,7 +37,7 @@ print("Loading data...")
 df = pd.read_parquet(DATA_PATH)
 
 # Check required columns
-required_cols = ["question", "answer", "label"]
+required_cols = ["question", "answer"]
 missing_cols = [c for c in required_cols if c not in df.columns]
 if missing_cols:
     raise ValueError(f"Missing columns in Parquet file: {missing_cols}")
diff --git a/scripts/run-model.sh b/scripts/run-model.sh
index 31aa1fe..6ea73d1 100755
--- a/scripts/run-model.sh
+++ b/scripts/run-model.sh
@@ -1,2 +1,2 @@
 #!/bin/bash
-./llama.cpp/build/bin/llama-cli -m ./merged_model/Merged_Model.gguf
+./llama.cpp/build/bin/llama-cli -m ./merged_model/model.gguf
diff --git a/setup.sh b/setup.sh
index a2b3007..e94741d 100755
--- a/setup.sh
+++ b/setup.sh
@@ -11,25 +11,26 @@ python -m venv venv
 source venv/bin/activate
 pip install -r requirements.txt
 
-# Select backend for llama-cpp-python binding
-echo ""
-echo "Select llama.cpp backend:"
-echo "  1) CUDA (NVIDIA GPU)"
-echo "  2) ROCm (AMD GPU)"
-echo "  3) Vulkan (Cross-vendor GPU)"
-echo "  4) CPU only"
-echo ""
-read -p "Enter choice (1-4): " BACKEND
-
 # Ask if fresh build or existing
 echo ""
 echo "Would you like to:"
 echo "  1) Clone and build a fresh copy of llama.cpp"
-echo "  2) Use an existing llama.cpp build (symlink)"
+echo "  2) Use an existing llama.cpp build"
 echo ""
 read -p "Enter choice (1-2): " BUILD_CHOICE
 
+LLAMA_CPP_PATH=""
+
 if [ "$BUILD_CHOICE" = "1" ]; then
+    echo ""
+    echo "Select llama.cpp backend:"
+    echo "  1) CUDA (NVIDIA GPU)"
+    echo "  2) ROCm (AMD GPU)"
+    echo "  3) Vulkan (Cross-vendor GPU)"
+    echo "  4) CPU only"
+    echo ""
+    read -p "Enter choice (1-4): " BACKEND
+
     echo ""
     echo "Cloning llama.cpp..."
     git clone https://github.com/ggml-org/llama.cpp.git
@@ -44,32 +45,35 @@ if [ "$BUILD_CHOICE" = "1" ]; then
     case $BACKEND in
         1)
             echo "Building with CUDA support..."
-            cmake -B build -DGGML_CUDA=ON || BUILD_FAILED=1
+            cmake -B build -DBUILD_SHARED_LIBS=ON -DGGML_CUDA=ON || BUILD_FAILED=1
             [ $BUILD_FAILED -eq 0 ] && cmake --build build --config Release -j$(nproc) || BUILD_FAILED=1
             ;;
         2)
             echo "Building with ROCm support..."
+            read -p "Enter GPU target (e.g., gfx1030, gfx942, gfx1100): " ROCM_TARGET
+            [ -z "$ROCM_TARGET" ] && ROCM_TARGET="gfx1030"
             HIPCXX="$(hipconfig -l)/clang" HIP_PATH="$(hipconfig -R)" \
-                cmake -S . -B build -DGGML_HIP=ON -DGPU_TARGETS=gfx1030 -DCMAKE_BUILD_TYPE=Release || BUILD_FAILED=1
+                cmake -S . -B build -DBUILD_SHARED_LIBS=ON -DGGML_HIP=ON -DGPU_TARGETS="$ROCM_TARGET" -DCMAKE_BUILD_TYPE=Release || BUILD_FAILED=1
             [ $BUILD_FAILED -eq 0 ] && cmake --build build --config Release -j$(nproc) || BUILD_FAILED=1
             ;;
         3)
             echo "Building with Vulkan support..."
-            cmake -B build -DGGML_VULKAN=1 || BUILD_FAILED=1
+            cmake -B build -DBUILD_SHARED_LIBS=ON -DGGML_VULKAN=1 || BUILD_FAILED=1
             [ $BUILD_FAILED -eq 0 ] && cmake --build build --config Release -j$(nproc) || BUILD_FAILED=1
             ;;
         4)
             echo "Building CPU-only..."
-            cmake -B build || BUILD_FAILED=1
+            cmake -B build -DBUILD_SHARED_LIBS=ON || BUILD_FAILED=1
             [ $BUILD_FAILED -eq 0 ] && cmake --build build --config Release -j$(nproc) || BUILD_FAILED=1
             ;;
         *)
             echo "Invalid choice. Building CPU-only."
-            cmake -B build || BUILD_FAILED=1
+            cmake -B build -DBUILD_SHARED_LIBS=ON || BUILD_FAILED=1
             [ $BUILD_FAILED -eq 0 ] && cmake --build build --config Release -j$(nproc) || BUILD_FAILED=1
             ;;
     esac
 
+    LLAMA_CPP_PATH=$(pwd)
     cd ..
 else
     read -p "Enter absolute path to existing llama.cpp build: " LLAMA_CPP_PATH
@@ -83,24 +87,37 @@ else
     # Resolve to absolute path
     LLAMA_CPP_PATH=$(realpath "$LLAMA_CPP_PATH")
 
+    # Check for shared library
+    if [ ! -f "$LLAMA_CPP_PATH/build/libllama.so" ] && [ ! -f "$LLAMA_CPP_PATH/libllama.so" ]; then
+        echo "Error: Could not find libllama.so in $LLAMA_CPP_PATH"
+        echo "Make sure llama.cpp was built with -DBUILD_SHARED_LIBS=ON"
+        exit 1
+    fi
+
     echo ""
-    echo "Creating symlink: ./llama.cpp -> $LLAMA_CPP_PATH"
+    echo "Using existing build: $LLAMA_CPP_PATH"
     ln -sfn "$LLAMA_CPP_PATH" llama.cpp
 fi
 
-# Install llama-cpp-python in main venv
+# Find the shared library
+if [ -f "$LLAMA_CPP_PATH/build/libllama.so" ]; then
+    LLAMA_CPP_LIB="$LLAMA_CPP_PATH/build/libllama.so"
+elif [ -f "$LLAMA_CPP_PATH/libllama.so" ]; then
+    LLAMA_CPP_LIB="$LLAMA_CPP_PATH/libllama.so"
+else
+    echo "Error: Could not locate libllama.so"
+    exit 1
+fi
+
+echo ""
+echo "Using shared library: $LLAMA_CPP_LIB"
+
+# Install llama-cpp-python against the existing llama.cpp shared library
 echo ""
 echo "Installing llama-cpp-python..."
 
-case $BACKEND in
-    1) CMAKE_ARGS="-DGGML_CUDA=on" ;;
-    2) CMAKE_ARGS="-DGGML_HIP=on" ;;
-    3) CMAKE_ARGS="-DGGML_VULKAN=on" ;;
-    *) CMAKE_ARGS="" ;;
-esac
-
 source venv/bin/activate
-eval "CMAKE_ARGS=\"$CMAKE_ARGS\" pip install llama-cpp-python"
+LLAMA_CPP_LIB="$LLAMA_CPP_LIB" LLAMA_CPP_LIB_PATH="$LLAMA_CPP_LIB" CMAKE_ARGS="-DLLAMA_BUILD=OFF" pip install llama-cpp-python
 
 # Create convertgguf_venv for llama.cpp Python tools
 echo ""
diff --git a/synthetic-data.py b/synthetic-data.py
index 93e094b..b9d325f 100644
--- a/synthetic-data.py
+++ b/synthetic-data.py
@@ -9,6 +9,11 @@ GGUF_MODEL_PATH = "./path/to/model.gguf"
 INPUT_PARQUET_PATH = "./path/to/input.parquet"
 OUTPUT_PARQUET_PATH = "./path/to/output.parquet"
 NEW_ROWS_COUNT = 100
+MAX_TOKENS = 200
+TEMPERATURE = 0.7
+TOP_P = 0.95
+TOP_K = 50
+MIN_P = 0.05
 
 # Check if files exist
 if not os.path.exists(GGUF_MODEL_PATH):
@@ -18,7 +23,7 @@ if not os.path.exists(INPUT_PARQUET_PATH):
     print(f"❌ Error: Input Parquet file not found at {INPUT_PARQUET_PATH}")
     exit()
 
-# 2. LOAD GGUF MODEL - GPU (Vulkan) ONLY
+# 2. LOAD GGUF MODEL - GPU
 print("Loading llama.cpp model...")
 try:
     model = Llama(
@@ -31,7 +36,7 @@ try:
         use_mmap=True,
         use_mlock=False,
     )
-    print("✅ llama.cpp model loaded with Vulkan GPU.")
+    print("✅ llama.cpp model loaded.")
 except Exception as e:
     print(f"❌ Error loading model: {e}")
     exit()
@@ -48,8 +53,6 @@ except Exception as e:
     print(f"❌ Error loading dataset: {e}")
     exit()
 
-existing_labels = list(set(original_ds["label"]))
-
 # 4. GENERATE SYNTHETIC DATA - STRUCTURED OUTPUT
 print(f"Generating {NEW_ROWS_COUNT} synthetic records...")
 synthetic_data = []
@@ -70,11 +73,11 @@ for i in range(NEW_ROWS_COUNT):
         # Generate with sampling parameters
         response = model.create_chat_completion(
             messages=messages,
-            max_tokens=200,
-            temperature=1.0,
-            top_p=0.95,
-            top_k=20,
-            min_p=0.0,
+            max_tokens=MAX_TOKENS,
+            temperature=TEMPERATURE,
+            top_p=TOP_P,
+            top_k=TOP_K,
+            min_p=MIN_P,
         )
 
         # Get response text
@@ -86,67 +89,36 @@ for i in range(NEW_ROWS_COUNT):
 
         question = None
         answer = None
-        label = None
-        found_question = False
-        found_answer = False
-        found_label = False
 
         for line in lines:
             line = line.strip()
 
-            # Extract Question
             if "Question:" in line and "Answer:" not in line:
                 match = re.search(
-                    r"Question:\s*(.+?)(?:\nAnswer|\nLabel|$)", line, re.IGNORECASE
+                    r"Question:\s*(.+?)(?:\nAnswer|$)", line, re.IGNORECASE
                 )
                 if match:
                     question = match.group(1).strip()
-                    found_question = True
 
-            # Extract Answer
             elif "Answer:" in line:
-                match = re.search(r"Answer:\s*(.+?)(?:\nLabel|$)", line, re.IGNORECASE)
+                match = re.search(r"Answer:\s*(.+)", line, re.IGNORECASE)
                 if match:
                     answer = match.group(1).strip()
-                    found_answer = True
 
-            # Extract Label
-            elif "Label:" in line:
-                match = re.search(r"Label:\s*(.+)", line, re.IGNORECASE)
-                if match:
-                    label = match.group(1).strip()
-                    found_label = True
-
-        # VALIDATION
         if not all([question, answer]):
             print(f"⚠️ Row {i + 1}: Incomplete output. Skipping.")
             for line in lines:
                 print(line)
             continue
 
-        if not label:
-            label = "unbiased"
-        else:
-            # Normalize label
-            label = (
-                label.lower().strip('"').strip("'").replace("[", "").replace("]", "")
-            )
-
-        if label not in existing_labels:
-            print(f"⚠️ Row {i + 1}: Invalid label '{label}'. Skipping.")
-            continue
-
-        # Clean up
         question = re.sub(r"```.*?```", "", question).strip()
         answer = re.sub(r"```.*?```", "", answer).strip()
 
-        parsed_row = {"question": question, "answer": answer, "label": label}
+        parsed_row = {"question": question, "answer": answer}
 
-        # PRINT PARSED DATA IN TERMINAL
         print(f"✅ ROW {i + 1} PARSED:")
         print(f"   Question: {question}")
         print(f"   Answer: {answer}")
-        print(f"   Label: {label}")
         print()
 
         synthetic_data.append(parsed_row)