From cb7ecd4604495f42f7c2bdf0875f33cfabcd574c Mon Sep 17 00:00:00 2001
From: alainnothere <164234422+alainnothere@users.noreply.github.com>
Date: Fri, 20 Mar 2026 01:50:01 +0000
Subject: [PATCH] Add files via upload

---
 vastai_rys_eval.sh | 382 +++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 382 insertions(+)
 create mode 100644 vastai_rys_eval.sh

diff --git a/vastai_rys_eval.sh b/vastai_rys_eval.sh
new file mode 100644
index 0000000..30374f8
--- /dev/null
+++ b/vastai_rys_eval.sh
@@ -0,0 +1,382 @@
+#!/usr/bin/env bash
+# ============================================================================
+# RYS Layer Surgery Evaluation on Vast.ai
+# ============================================================================
+#
+# This script runs on a Vast.ai instance (NVIDIA CUDA template, H200 GPU).
+# It downloads the base model, performs layer surgery, then runs lm_eval
+# on both models and compares results.
+#
+# ---- BEFORE RUNNING THIS SCRIPT ----
+#
+# 1. From YOUR machine, find an H200 offer:
+#
+#      vastai search offers 'gpu_name=H200 num_gpus=1 disk_space>=80 verified=true rentable=true' -o 'dph+'
+#
+#    If no H200 is available, fall back to an H100 SXM:
+#
+#      vastai search offers 'gpu_name=H100_SXM num_gpus=1 disk_space>=80 verified=true rentable=true' -o 'dph+'
+#
+# 2. Create the instance using the NVIDIA CUDA devel template:
+#
+#      vastai create instance <OFFER_ID> \
+#        --image vastai/base-image:cuda-12.8.1-cudnn-devel-ubuntu22.04 \
+#        --disk 80 \
+#        --direct \
+#        --ssh
+#
+# 3. Wait for it to boot (~2-3 min), then get SSH info:
+#
+#      vastai show instances
+#
+# 4. SCP this script plus the surgery and comparison helpers to the instance:
+#
+#      scp -P <SSH_PORT> vastai_rys_eval.sh gguf_surgery.py compare_eval.py root@<INSTANCE_IP>:/workspace/
+#
+# 5. SSH in and run:
+#
+#      ssh -p <SSH_PORT> root@<INSTANCE_IP>
+#      cd /workspace && chmod +x vastai_rys_eval.sh && ./vastai_rys_eval.sh
+#
+# 6. When done, grab results from YOUR machine:
+#
+#      scp -P <SSH_PORT> -r 'root@<INSTANCE_IP>:/workspace/eval_*' 'root@<INSTANCE_IP>:/workspace/comparison_*' ~/Downloads/claudeOutput/ggufSurgery/
+#
+# 7. Destroy the instance:
+#
+#      vastai destroy instance <INSTANCE_ID>
+#
+# ============================================================================
+
+set -euo pipefail
+
+WORKDIR=/workspace
+MODEL_DIR="${WORKDIR}/models"
+BASE_GGUF="Devstral-Small-2-24B-Instruct-2512-UD-Q4_K_XL.gguf"
+SURGERY_GGUF="devstral_rys_12_15.gguf"
+HF_REPO="unsloth/Devstral-Small-2-24B-Instruct-2512-GGUF"
+LLAMA_PORT=8080
+
+# Surgery parameters: duplicate layers 12, 13, 14 (half-open range [DUP_START, DUP_END))
+DUP_START=12
+DUP_END=15
+
+EVAL_TASKS="gsm8k_cot,ifeval,bbh_cot_fewshot_causal_judgement,bbh_cot_fewshot_date_understanding,bbh_cot_fewshot_logical_deduction_five_objects,bbh_cot_fewshot_navigate,mbpp"
+
+# Detect python3 / pip3
+PY=$(command -v python3 || command -v python)
+PIP=$(command -v pip3 || command -v pip)
+echo "Using Python: ${PY}"
+echo "Using pip: ${PIP}"
+
+# ============================================================================
+echo "============================================================"
+echo " STEP 1: Verify GPU"
+echo "============================================================"
+nvidia-smi || { echo "ERROR: No GPU found. Did you pick a GPU instance?"; exit 1; }
+echo ""
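+# Optional: record which GPU/driver this run used next to the results. This is
+# a small convenience sketch, not something the rest of the script depends on;
+# the gpu_info.txt filename is an assumption, and the query fields are standard
+# nvidia-smi ones (compute_cap is queried the same way in Step 3 below).
+nvidia-smi --query-gpu=name,memory.total,driver_version,compute_cap \
+    --format=csv,noheader | tee "${WORKDIR}/gpu_info.txt" || true
+echo ""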
+# ============================================================================
+echo "============================================================"
+echo " STEP 2: Install Python packages"
+echo "============================================================"
+${PIP} install --upgrade pip
+${PIP} install gguf numpy tqdm huggingface-hub 'lm-eval[api]' transformers langdetect immutabledict
+
+# Ensure pip-installed scripts are in PATH
+export PATH="$PATH:/usr/local/bin:$HOME/.local/bin"
+# Ensure a 'python' command exists (some containers only ship python3)
+if ! command -v python &>/dev/null && command -v python3 &>/dev/null; then
+    ln -sf "$(command -v python3)" /usr/local/bin/python
+fi
+# Allow code execution for the mbpp benchmark
+export HF_ALLOW_CODE_EVAL=1
+echo ""
+
+# ============================================================================
+echo "============================================================"
+echo " STEP 3: Build llama.cpp with CUDA"
+echo "============================================================"
+apt-get update && apt-get install -y libcurl4-openssl-dev cmake build-essential git
+
+if [ ! -f "${WORKDIR}/llama.cpp/build/bin/llama-server" ]; then
+    cd "${WORKDIR}"
+
+    # Clone only if not already cloned
+    if [ ! -d "${WORKDIR}/llama.cpp/.git" ]; then
+        git clone --depth 1 https://github.com/ggerganov/llama.cpp
+    else
+        echo "llama.cpp repo already cloned, skipping clone."
+    fi
+
+    # Auto-detect GPU compute capability — only build for THIS GPU.
+    # Cuts compile time from ~30 min to ~5 min.
+    CUDA_ARCH=$(nvidia-smi --query-gpu=compute_cap --format=csv,noheader | head -1 | tr -d '.')
+    if [ -z "$CUDA_ARCH" ]; then
+        echo "WARNING: Could not detect GPU arch, falling back to CMake's native detection"
+        CUDA_ARCH="native"
+    else
+        echo "Detected GPU compute capability: ${CUDA_ARCH} — building only for this arch"
+    fi
+
+    cmake llama.cpp -B llama.cpp/build \
+        -DBUILD_SHARED_LIBS=OFF \
+        -DGGML_CUDA=ON \
+        -DLLAMA_CURL=ON \
+        -DCMAKE_CUDA_ARCHITECTURES="${CUDA_ARCH}"
+    cmake --build llama.cpp/build --config Release -j "$(nproc)" \
+        --target llama-server llama-cli llama-bench
+    echo "llama.cpp built successfully."
+else
+    echo "llama.cpp already built, skipping."
+fi
+
+LLAMA_SERVER="${WORKDIR}/llama.cpp/build/bin/llama-server"
+ls -la "$LLAMA_SERVER"
+echo ""
+
+# ============================================================================
+echo "============================================================"
+echo " STEP 4: Download base model from HuggingFace"
+echo "============================================================"
+mkdir -p "${MODEL_DIR}"
+
+if [ ! -f "${MODEL_DIR}/${BASE_GGUF}" ]; then
+    ${PY} -c "
+from huggingface_hub import hf_hub_download
+print('Downloading ${BASE_GGUF} from ${HF_REPO}...')
+hf_hub_download('${HF_REPO}', '${BASE_GGUF}', local_dir='${MODEL_DIR}')
+print('Download complete.')
+"
+else
+    echo "Base model already present, skipping download."
+fi
+
+ls -lh "${MODEL_DIR}/${BASE_GGUF}"
+echo ""
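+# Optional sanity check: count the transformer blocks in the GGUF we just
+# downloaded, using the 'gguf' package installed in Step 2. This is a minimal
+# sketch; it assumes the usual 'blk.<N>.*' tensor naming of llama.cpp GGUFs,
+# and it only prints information (the run continues even if it fails).
+${PY} - "${MODEL_DIR}/${BASE_GGUF}" <<'PYEOF' || echo "(block-count check skipped)"
+import sys
+from gguf import GGUFReader
+
+reader = GGUFReader(sys.argv[1])
+# Tensor names look like 'blk.<N>.attn_q.weight'; highest N + 1 = block count.
+blocks = {int(t.name.split(".")[1]) for t in reader.tensors if t.name.startswith("blk.")}
+print(f"{sys.argv[1]}: {max(blocks) + 1} transformer blocks")
+PYEOF
+echo ""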
-f "${MODEL_DIR}/${SURGERY_GGUF}" ]; then + ${PY} "${WORKDIR}/gguf_surgery.py" \ + "${MODEL_DIR}/${BASE_GGUF}" \ + "${MODEL_DIR}/${SURGERY_GGUF}" \ + -i "${DUP_START}" -j "${DUP_END}" -v + echo "Surgery complete." +else + echo "Surgery model already present, skipping." +fi + +ls -lh "${MODEL_DIR}/${SURGERY_GGUF}" +echo "" + +# ============================================================================ +# Helper: start llama-server, wait for it to be ready, return PID +# ============================================================================ +start_server() { + local model_path="$1" + echo "Starting llama-server with: $(basename "$model_path")" + + "$LLAMA_SERVER" \ + -m "$model_path" \ + --host 0.0.0.0 \ + --port "${LLAMA_PORT}" \ + -ngl 999 \ + --flash-attn on \ + --ctx-size 32768 \ + > /tmp/llama_server.log 2>&1 & + + local server_pid=$! + echo "Server PID: ${server_pid}" + + # Wait for server to become ready (up to 120 seconds) + echo "Waiting for server to load model..." + local attempts=0 + local max_attempts=60 + while [ $attempts -lt $max_attempts ]; do + if curl -s "http://127.0.0.1:${LLAMA_PORT}/health" | grep -q "ok"; then + echo "Server ready after ~$(( attempts * 2 ))s" + return 0 + fi + # Check if process died + if ! kill -0 "$server_pid" 2>/dev/null; then + echo "ERROR: llama-server died. Last 20 lines of log:" + tail -20 /tmp/llama_server.log + return 1 + fi + sleep 2 + attempts=$(( attempts + 1 )) + done + + echo "ERROR: Server did not become ready in time. Last 20 lines of log:" + tail -20 /tmp/llama_server.log + return 1 +} + +stop_server() { + echo "Stopping llama-server..." + pkill -f llama-server || true + sleep 3 + # Make sure it's dead + pkill -9 -f llama-server 2>/dev/null || true + sleep 1 + echo "Server stopped." +} + +# ============================================================================ +# Common lm_eval args +# ============================================================================ +LM_EVAL_MODEL_ARGS="model=mistralai/Devstral-Small-2-24B-Instruct-2512,base_url=http://127.0.0.1:${LLAMA_PORT}/v1/completions,num_concurrent=3,tokenized_requests=False" + +# Split tasks into array +IFS=',' read -ra TASKS <<< "${EVAL_TASKS}" + +# ============================================================================ +echo "============================================================" +echo " STEP 6: Smoke test (--limit 1 per task, base model)" +echo "============================================================" +start_server "${MODEL_DIR}/${BASE_GGUF}" + +echo "Running 1 sample per task to verify everything works..." +lm_eval --model local-completions \ + --model_args "${LM_EVAL_MODEL_ARGS}" \ + --tasks "${EVAL_TASKS}" \ + --confirm_run_unsafe_code \ + --limit 1 \ + --output_path "${WORKDIR}/eval_smoke" \ + --log_samples + +stop_server + +echo "" +echo "Smoke test PASSED. Proceeding with evaluation." +echo "" + +# ============================================================================ +# run_eval_pass: run all tasks interleaved (base then surgery) with comparison +# $1 = pass name (e.g. "quick" or "full") +# $2 = limit flag (e.g. 
"--limit 200" or "") +# $3 = base output dir +# $4 = surgery output dir +# ============================================================================ +run_eval_pass() { + local pass_name="$1" + local limit_flag="$2" + local base_out="$3" + local surgery_out="$4" + + echo "" + echo "============================================================" + echo " PASS: ${pass_name} ${limit_flag}" + echo " Running each task on BOTH models before moving to the next." + echo "============================================================" + echo "" + + for i in "${!TASKS[@]}"; do + TASK="${TASKS[$i]}" + TASK_NUM=$(( i + 1 )) + TASK_TOTAL=${#TASKS[@]} + + echo "" + echo "------------------------------------------------------------" + echo " [${pass_name}] Task ${TASK_NUM}/${TASK_TOTAL}: ${TASK}" + echo "------------------------------------------------------------" + + # --- Base model --- + echo "" + echo " >>> BASE model: ${TASK}" + start_server "${MODEL_DIR}/${BASE_GGUF}" + + lm_eval --model local-completions \ + --model_args "${LM_EVAL_MODEL_ARGS}" \ + --tasks "${TASK}" \ + --confirm_run_unsafe_code \ + ${limit_flag} \ + --output_path "${base_out}" \ + --log_samples + + stop_server + + # --- Surgery model --- + echo "" + echo " >>> SURGERY model: ${TASK}" + start_server "${MODEL_DIR}/${SURGERY_GGUF}" + + lm_eval --model local-completions \ + --model_args "${LM_EVAL_MODEL_ARGS}" \ + --tasks "${TASK}" \ + --confirm_run_unsafe_code \ + ${limit_flag} \ + --output_path "${surgery_out}" \ + --log_samples + + stop_server + + # --- Incremental comparison --- + echo "" + echo " --- [${pass_name}] Results so far (${TASK_NUM}/${TASK_TOTAL} tasks) ---" + ${PY} "${WORKDIR}/compare_eval.py" \ + "${base_out}" "${surgery_out}" \ + --names base "rys_${DUP_START}_${DUP_END}" \ + 2>/dev/null || echo " (comparison will work after both models have results)" + echo "" + done + + echo "" + echo "============================================================" + echo " ${pass_name} pass complete" + echo "============================================================" + ${PY} "${WORKDIR}/compare_eval.py" \ + "${base_out}" "${surgery_out}" \ + --names base "rys_${DUP_START}_${DUP_END}" \ + | tee "${WORKDIR}/comparison_${pass_name}.txt" + echo "" +} + +# ============================================================================ +echo "============================================================" +echo " STEP 7: Quick pass (--limit 200)" +echo "============================================================" +run_eval_pass "quick" "--limit 200" \ + "${WORKDIR}/eval_base_quick" "${WORKDIR}/eval_surgery_quick" + +# ============================================================================ +echo "============================================================" +echo " STEP 8: Full pass (no limit)" +echo "============================================================" +run_eval_pass "full" "" \ + "${WORKDIR}/eval_base_full" "${WORKDIR}/eval_surgery_full" + +# ============================================================================ +echo "============================================================" +echo " STEP 9: Final summary" +echo "============================================================" +echo "" +echo "=== QUICK (--limit 200) ===" +cat "${WORKDIR}/comparison_quick.txt" +echo "" +echo "=== FULL ===" +cat "${WORKDIR}/comparison_full.txt" + +echo "" +echo "============================================================" +echo " DONE" +echo "============================================================" +echo "" +echo "Results saved 
in:" +echo " ${WORKDIR}/eval_base_quick/ (--limit 200)" +echo " ${WORKDIR}/eval_surgery_quick/ (--limit 200)" +echo " ${WORKDIR}/comparison_quick.txt" +echo " ${WORKDIR}/eval_base_full/" +echo " ${WORKDIR}/eval_surgery_full/" +echo " ${WORKDIR}/comparison_full.txt" +echo "" +echo "From your local machine, grab results with:" +echo ' scp -P -r root@:/workspace/eval_* root@:/workspace/comparison_* ~/Downloads/claudeOutput/ggufSurgery/' +echo "" +echo "Then destroy the instance:" +echo ' vastai destroy instance '