From cb7ecd4604495f42f7c2bdf0875f33cfabcd574c Mon Sep 17 00:00:00 2001
From: alainnothere <164234422+alainnothere@users.noreply.github.com>
Date: Fri, 20 Mar 2026 01:50:01 +0000
Subject: [PATCH] Add files via upload

---
 vastai_rys_eval.sh | 382 +++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 382 insertions(+)
 create mode 100644 vastai_rys_eval.sh

diff --git a/vastai_rys_eval.sh b/vastai_rys_eval.sh
new file mode 100644
index 0000000..30374f8
--- /dev/null
+++ b/vastai_rys_eval.sh
@@ -0,0 +1,382 @@
+#!/usr/bin/env bash
+# ============================================================================
+# RYS Layer Surgery Evaluation on Vast.ai
+# ============================================================================
+#
+# This script runs on a Vast.ai instance (NVIDIA CUDA template, H200 GPU).
+# It downloads the base model, performs layer surgery, then runs lm_eval
+# on both models and compares results.
+#
+# ---- BEFORE RUNNING THIS SCRIPT ----
+#
+# 1. From YOUR machine, find an H200 offer:
+#
+#      vastai search offers 'gpu_name=H200 num_gpus=1 disk_space>=80 verified=true rentable=true' -o 'dph+'
+#
+#    If no H200 is available, fall back to an H100 SXM:
+#
+#      vastai search offers 'gpu_name=H100_SXM num_gpus=1 disk_space>=80 verified=true rentable=true' -o 'dph+'
+#
+# 2. Create the instance using the NVIDIA CUDA devel template:
+#
+#      vastai create instance <OFFER_ID> \
+#        --image vastai/base-image:cuda-12.8.1-cudnn-devel-ubuntu22.04 \
+#        --disk 80 \
+#        --direct \
+#        --ssh
+#
+# 3. Wait for it to boot (~2-3 min), then get SSH info:
+#
+#      vastai show instances
+#
+# 4. SCP this script plus the surgery and comparison helpers to the instance:
+#
+#      scp -P <SSH_PORT> vastai_rys_eval.sh gguf_surgery.py compare_eval.py root@<INSTANCE_IP>:/workspace/
+#
+# 5. SSH in and run:
+#
+#      ssh -p <SSH_PORT> root@<INSTANCE_IP>
+#      cd /workspace && chmod +x vastai_rys_eval.sh && ./vastai_rys_eval.sh
+#
+# 6. When done, grab results from YOUR machine:
+#
+#      scp -P <SSH_PORT> -r 'root@<INSTANCE_IP>:/workspace/eval_*' 'root@<INSTANCE_IP>:/workspace/comparison_*' ~/Downloads/claudeOutput/ggufSurgery/
+#
+# 7. Destroy the instance:
+#
+#      vastai destroy instance <INSTANCE_ID>
+#
+# ============================================================================
+
+set -euo pipefail
+
+WORKDIR=/workspace
+MODEL_DIR="${WORKDIR}/models"
+BASE_GGUF="Devstral-Small-2-24B-Instruct-2512-UD-Q4_K_XL.gguf"
+SURGERY_GGUF="devstral_rys_12_15.gguf"
+HF_REPO="unsloth/Devstral-Small-2-24B-Instruct-2512-GGUF"
+LLAMA_PORT=8080
+
+# Surgery parameters: duplicate layers 12, 13, 14 (half-open range [DUP_START, DUP_END))
+DUP_START=12
+DUP_END=15
+
+EVAL_TASKS="gsm8k_cot,ifeval,bbh_cot_fewshot_causal_judgement,bbh_cot_fewshot_date_understanding,bbh_cot_fewshot_logical_deduction_five_objects,bbh_cot_fewshot_navigate,mbpp"
+
+# Detect python3 / pip3
+PY=$(command -v python3 || command -v python)
+PIP=$(command -v pip3 || command -v pip)
+echo "Using Python: ${PY}"
+echo "Using pip: ${PIP}"
+
+# ============================================================================
+echo "============================================================"
+echo " STEP 1: Verify GPU"
+echo "============================================================"
+nvidia-smi || { echo "ERROR: No GPU found. Did you pick a GPU instance?"; exit 1; }
+echo ""
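+# Optional: record which GPU/driver this run used next to the results. This is
+# a small convenience sketch, not something the rest of the script depends on;
+# the gpu_info.txt filename is an assumption, and the query fields are standard
+# nvidia-smi ones (compute_cap is queried the same way in Step 3 below).
+nvidia-smi --query-gpu=name,memory.total,driver_version,compute_cap \
+    --format=csv,noheader | tee "${WORKDIR}/gpu_info.txt" || true
+echo ""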
+# ============================================================================
+echo "============================================================"
+echo " STEP 2: Install Python packages"
+echo "============================================================"
+${PIP} install --upgrade pip
+${PIP} install gguf numpy tqdm huggingface-hub 'lm-eval[api]' transformers langdetect immutabledict
+
+# Ensure pip-installed scripts are in PATH
+export PATH="$PATH:/usr/local/bin:$HOME/.local/bin"
+# Ensure a 'python' command exists (some containers only ship python3)
+if ! command -v python &>/dev/null && command -v python3 &>/dev/null; then
+    ln -sf "$(command -v python3)" /usr/local/bin/python
+fi
+# Allow code execution for the mbpp benchmark
+export HF_ALLOW_CODE_EVAL=1
+echo ""
+
+# ============================================================================
+echo "============================================================"
+echo " STEP 3: Build llama.cpp with CUDA"
+echo "============================================================"
+apt-get update && apt-get install -y libcurl4-openssl-dev cmake build-essential git
+
+if [ ! -f "${WORKDIR}/llama.cpp/build/bin/llama-server" ]; then
+    cd "${WORKDIR}"
+
+    # Clone only if not already cloned
+    if [ ! -d "${WORKDIR}/llama.cpp/.git" ]; then
+        git clone --depth 1 https://github.com/ggerganov/llama.cpp
+    else
+        echo "llama.cpp repo already cloned, skipping clone."
+    fi
+
+    # Auto-detect GPU compute capability — only build for THIS GPU.
+    # Cuts compile time from ~30 min to ~5 min.
+    CUDA_ARCH=$(nvidia-smi --query-gpu=compute_cap --format=csv,noheader | head -1 | tr -d '.')
+    if [ -z "$CUDA_ARCH" ]; then
+        echo "WARNING: Could not detect GPU arch, falling back to CMake's native detection"
+        CUDA_ARCH="native"
+    else
+        echo "Detected GPU compute capability: ${CUDA_ARCH} — building only for this arch"
+    fi
+
+    cmake llama.cpp -B llama.cpp/build \
+        -DBUILD_SHARED_LIBS=OFF \
+        -DGGML_CUDA=ON \
+        -DLLAMA_CURL=ON \
+        -DCMAKE_CUDA_ARCHITECTURES="${CUDA_ARCH}"
+    cmake --build llama.cpp/build --config Release -j "$(nproc)" \
+        --target llama-server llama-cli llama-bench
+    echo "llama.cpp built successfully."
+else
+    echo "llama.cpp already built, skipping."
+fi
+
+LLAMA_SERVER="${WORKDIR}/llama.cpp/build/bin/llama-server"
+ls -la "$LLAMA_SERVER"
+echo ""
+
+# ============================================================================
+echo "============================================================"
+echo " STEP 4: Download base model from HuggingFace"
+echo "============================================================"
+mkdir -p "${MODEL_DIR}"
+
+if [ ! -f "${MODEL_DIR}/${BASE_GGUF}" ]; then
+    ${PY} -c "
+from huggingface_hub import hf_hub_download
+print('Downloading ${BASE_GGUF} from ${HF_REPO}...')
+hf_hub_download('${HF_REPO}', '${BASE_GGUF}', local_dir='${MODEL_DIR}')
+print('Download complete.')
+"
+else
+    echo "Base model already present, skipping download."
+fi
+
+ls -lh "${MODEL_DIR}/${BASE_GGUF}"
+echo ""
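+# Optional sanity check: count the transformer blocks in the GGUF we just
+# downloaded, using the 'gguf' package installed in Step 2. This is a minimal
+# sketch; it assumes the usual 'blk.<N>.*' tensor naming of llama.cpp GGUFs,
+# and it only prints information (the run continues even if it fails).
+${PY} - "${MODEL_DIR}/${BASE_GGUF}" <<'PYEOF' || echo "(block-count check skipped)"
+import sys
+from gguf import GGUFReader
+
+reader = GGUFReader(sys.argv[1])
+# Tensor names look like 'blk.<N>.attn_q.weight'; highest N + 1 = block count.
+blocks = {int(t.name.split(".")[1]) for t in reader.tensors if t.name.startswith("blk.")}
+print(f"{sys.argv[1]}: {max(blocks) + 1} transformer blocks")
+PYEOF
+echo ""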
-f "${MODEL_DIR}/${SURGERY_GGUF}" ]; then + ${PY} "${WORKDIR}/gguf_surgery.py" \ + "${MODEL_DIR}/${BASE_GGUF}" \ + "${MODEL_DIR}/${SURGERY_GGUF}" \ + -i "${DUP_START}" -j "${DUP_END}" -v + echo "Surgery complete." +else + echo "Surgery model already present, skipping." +fi + +ls -lh "${MODEL_DIR}/${SURGERY_GGUF}" +echo "" + +# ============================================================================ +# Helper: start llama-server, wait for it to be ready, return PID +# ============================================================================ +start_server() { + local model_path="$1" + echo "Starting llama-server with: $(basename "$model_path")" + + "$LLAMA_SERVER" \ + -m "$model_path" \ + --host 0.0.0.0 \ + --port "${LLAMA_PORT}" \ + -ngl 999 \ + --flash-attn on \ + --ctx-size 32768 \ + > /tmp/llama_server.log 2>&1 & + + local server_pid=$! + echo "Server PID: ${server_pid}" + + # Wait for server to become ready (up to 120 seconds) + echo "Waiting for server to load model..." + local attempts=0 + local max_attempts=60 + while [ $attempts -lt $max_attempts ]; do + if curl -s "http://127.0.0.1:${LLAMA_PORT}/health" | grep -q "ok"; then + echo "Server ready after ~$(( attempts * 2 ))s" + return 0 + fi + # Check if process died + if ! kill -0 "$server_pid" 2>/dev/null; then + echo "ERROR: llama-server died. Last 20 lines of log:" + tail -20 /tmp/llama_server.log + return 1 + fi + sleep 2 + attempts=$(( attempts + 1 )) + done + + echo "ERROR: Server did not become ready in time. Last 20 lines of log:" + tail -20 /tmp/llama_server.log + return 1 +} + +stop_server() { + echo "Stopping llama-server..." + pkill -f llama-server || true + sleep 3 + # Make sure it's dead + pkill -9 -f llama-server 2>/dev/null || true + sleep 1 + echo "Server stopped." +} + +# ============================================================================ +# Common lm_eval args +# ============================================================================ +LM_EVAL_MODEL_ARGS="model=mistralai/Devstral-Small-2-24B-Instruct-2512,base_url=http://127.0.0.1:${LLAMA_PORT}/v1/completions,num_concurrent=3,tokenized_requests=False" + +# Split tasks into array +IFS=',' read -ra TASKS <<< "${EVAL_TASKS}" + +# ============================================================================ +echo "============================================================" +echo " STEP 6: Smoke test (--limit 1 per task, base model)" +echo "============================================================" +start_server "${MODEL_DIR}/${BASE_GGUF}" + +echo "Running 1 sample per task to verify everything works..." +lm_eval --model local-completions \ + --model_args "${LM_EVAL_MODEL_ARGS}" \ + --tasks "${EVAL_TASKS}" \ + --confirm_run_unsafe_code \ + --limit 1 \ + --output_path "${WORKDIR}/eval_smoke" \ + --log_samples + +stop_server + +echo "" +echo "Smoke test PASSED. Proceeding with evaluation." +echo "" + +# ============================================================================ +# run_eval_pass: run all tasks interleaved (base then surgery) with comparison +# $1 = pass name (e.g. "quick" or "full") +# $2 = limit flag (e.g. 
"--limit 200" or "") +# $3 = base output dir +# $4 = surgery output dir +# ============================================================================ +run_eval_pass() { + local pass_name="$1" + local limit_flag="$2" + local base_out="$3" + local surgery_out="$4" + + echo "" + echo "============================================================" + echo " PASS: ${pass_name} ${limit_flag}" + echo " Running each task on BOTH models before moving to the next." + echo "============================================================" + echo "" + + for i in "${!TASKS[@]}"; do + TASK="${TASKS[$i]}" + TASK_NUM=$(( i + 1 )) + TASK_TOTAL=${#TASKS[@]} + + echo "" + echo "------------------------------------------------------------" + echo " [${pass_name}] Task ${TASK_NUM}/${TASK_TOTAL}: ${TASK}" + echo "------------------------------------------------------------" + + # --- Base model --- + echo "" + echo " >>> BASE model: ${TASK}" + start_server "${MODEL_DIR}/${BASE_GGUF}" + + lm_eval --model local-completions \ + --model_args "${LM_EVAL_MODEL_ARGS}" \ + --tasks "${TASK}" \ + --confirm_run_unsafe_code \ + ${limit_flag} \ + --output_path "${base_out}" \ + --log_samples + + stop_server + + # --- Surgery model --- + echo "" + echo " >>> SURGERY model: ${TASK}" + start_server "${MODEL_DIR}/${SURGERY_GGUF}" + + lm_eval --model local-completions \ + --model_args "${LM_EVAL_MODEL_ARGS}" \ + --tasks "${TASK}" \ + --confirm_run_unsafe_code \ + ${limit_flag} \ + --output_path "${surgery_out}" \ + --log_samples + + stop_server + + # --- Incremental comparison --- + echo "" + echo " --- [${pass_name}] Results so far (${TASK_NUM}/${TASK_TOTAL} tasks) ---" + ${PY} "${WORKDIR}/compare_eval.py" \ + "${base_out}" "${surgery_out}" \ + --names base "rys_${DUP_START}_${DUP_END}" \ + 2>/dev/null || echo " (comparison will work after both models have results)" + echo "" + done + + echo "" + echo "============================================================" + echo " ${pass_name} pass complete" + echo "============================================================" + ${PY} "${WORKDIR}/compare_eval.py" \ + "${base_out}" "${surgery_out}" \ + --names base "rys_${DUP_START}_${DUP_END}" \ + | tee "${WORKDIR}/comparison_${pass_name}.txt" + echo "" +} + +# ============================================================================ +echo "============================================================" +echo " STEP 7: Quick pass (--limit 200)" +echo "============================================================" +run_eval_pass "quick" "--limit 200" \ + "${WORKDIR}/eval_base_quick" "${WORKDIR}/eval_surgery_quick" + +# ============================================================================ +echo "============================================================" +echo " STEP 8: Full pass (no limit)" +echo "============================================================" +run_eval_pass "full" "" \ + "${WORKDIR}/eval_base_full" "${WORKDIR}/eval_surgery_full" + +# ============================================================================ +echo "============================================================" +echo " STEP 9: Final summary" +echo "============================================================" +echo "" +echo "=== QUICK (--limit 200) ===" +cat "${WORKDIR}/comparison_quick.txt" +echo "" +echo "=== FULL ===" +cat "${WORKDIR}/comparison_full.txt" + +echo "" +echo "============================================================" +echo " DONE" +echo "============================================================" +echo "" +echo "Results saved 
in:" +echo " ${WORKDIR}/eval_base_quick/ (--limit 200)" +echo " ${WORKDIR}/eval_surgery_quick/ (--limit 200)" +echo " ${WORKDIR}/comparison_quick.txt" +echo " ${WORKDIR}/eval_base_full/" +echo " ${WORKDIR}/eval_surgery_full/" +echo " ${WORKDIR}/comparison_full.txt" +echo "" +echo "From your local machine, grab results with:" +echo ' scp -P -r root@:/workspace/eval_* root@:/workspace/comparison_* ~/Downloads/claudeOutput/ggufSurgery/' +echo "" +echo "Then destroy the instance:" +echo ' vastai destroy instance '