cuGenOpt/benchmark/experiments/e0_diagnosis/run_diagnosis.sh
2026-03-20 00:33:45 +08:00

93 lines
3.3 KiB
Bash
Executable file
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/bin/bash
# GenSolver 性能诊断 - 一键启动脚本
#
# 用法:
# ./run_diagnosis.sh [host] # 运行完整诊断 (all 模式)
# ./run_diagnosis.sh [host] profile # 仅 nvprof profiling
#
# host: tc_new (T4) | tch (V100), 默认 tc_new
# Stop at the first command that exits non-zero.
set -e

# Directory layout, derived from where this script lives:
#   DIAG_DIR  - directory containing this script
#   BENCH_DIR - parent of DIAG_DIR
#   ROOT_DIR  - parent of BENCH_DIR
DIAG_DIR=$(cd "$(dirname "$0")" && pwd)
BENCH_DIR=$(dirname "$DIAG_DIR")
ROOT_DIR=$(dirname "$BENCH_DIR")
RESULTS_DIR="${DIAG_DIR}/results"

# Positional arguments: $1 = remote host (default tc_new), $2 = mode (default all).
REMOTE_HOST=${1:-tc_new}
MODE=${2:-all}

# The tilde is kept literal on purpose: this path is only embedded in ssh
# command strings and scp targets, never used as a local path.
REMOTE_DIR="~/gensolver"

echo ">>> 使用服务器: $REMOTE_HOST"

# Compile target: sm_70 for host "tch", sm_75 for every other host.
case "$REMOTE_HOST" in
  tch) ARCH="sm_70" ;;
  *) ARCH="sm_75" ;;
esac

# Include paths are relative to the remote e0_diagnosis directory, which is
# where the compile step changes into before invoking this command.
NVCC_CMD="nvcc -arch=${ARCH} -O2 -std=c++17 --extended-lambda -I ../../prototype/core -I ../../prototype/problems"

mkdir -p "$RESULTS_DIR"

# Start-of-run banner.
printf '%s\n' \
  "==========================================" \
  " GenSolver 性能诊断" \
  " 时间: $(date)" \
  " 服务器: $REMOTE_HOST (arch=$ARCH)" \
  "=========================================="
#######################################
# Copy the sources needed for the diagnosis build to the remote host.
# Creates the remote directory tree first, then uploads every *.cuh file from
# prototype/core and prototype/problems plus bench_diagnosis.cu.
# Globals:   REMOTE_HOST, REMOTE_DIR, ROOT_DIR, DIAG_DIR (read)
# Arguments: none
# Outputs:   progress messages on stdout
# Returns:   0 on success; the status of the first failing ssh/scp otherwise
#######################################
sync_code() {
  echo ">>> 同步代码到 $REMOTE_HOST ..."

  local remote_core="$REMOTE_DIR/prototype/core"
  local remote_problems="$REMOTE_DIR/prototype/problems"
  local remote_diag="$REMOTE_DIR/benchmark/experiments/e0_diagnosis"

  # The host and the scp targets are quoted so they always arrive as a single
  # argument. The directory list stays unquoted inside the remote command
  # string because the remote shell has to split it (and expand a leading ~).
  ssh "$REMOTE_HOST" "mkdir -p $remote_core $remote_problems $remote_diag" || return

  # Each step is checked explicitly so that a failed transfer is never
  # followed by " done.", even when the caller has errexit disabled.
  scp "$ROOT_DIR"/prototype/core/*.cuh "$REMOTE_HOST:$remote_core/" || return
  scp "$ROOT_DIR"/prototype/problems/*.cuh "$REMOTE_HOST:$remote_problems/" || return
  scp "$DIAG_DIR"/bench_diagnosis.cu "$REMOTE_HOST:$remote_diag/" || return

  echo " done."
}
#######################################
# Build bench_diagnosis on the remote host with the configured nvcc command.
# Globals:   REMOTE_HOST, REMOTE_DIR, ARCH, NVCC_CMD (read)
# Arguments: none
# Outputs:   progress messages and the remote compiler output on stdout
#            (the remote command merges its stderr into stdout)
# Returns:   0 on success; the ssh/compiler exit status on failure
#######################################
compile() {
  echo ">>> 编译 bench_diagnosis (arch=$ARCH) ..."

  # \$PATH is escaped so it is expanded by the remote shell, not locally.
  local remote_cmd="export PATH=/usr/local/cuda/bin:\$PATH && cd $REMOTE_DIR/benchmark/experiments/e0_diagnosis && $NVCC_CMD -o bench_diagnosis bench_diagnosis.cu 2>&1"

  # Checked explicitly: a failed build must not be followed by " done.",
  # regardless of whether the caller runs with errexit.
  ssh "$REMOTE_HOST" "$remote_cmd" || return

  echo " done."
}
#######################################
# Run the full diagnosis ("all" mode) remotely and store its CSV output in
# RESULTS_DIR as bench_<gpu-name>_<timestamp>.csv.
# Globals:   REMOTE_HOST, REMOTE_DIR, RESULTS_DIR (read)
# Arguments: none
# Outputs:   progress messages on stdout; the benchmark's stderr is passed
#            through to local stderr
# Returns:   0 on success; the ssh/benchmark exit status on failure
#######################################
run_all() {
  echo ">>> 运行完整诊断 ..."

  # Declaration and assignment are separate so `local` does not mask the
  # substitution's status. The GPU name is only used for the file name, so a
  # failed or empty query falls back to "unknown" rather than aborting.
  local gpu_name
  gpu_name=$(ssh "$REMOTE_HOST" "nvidia-smi --query-gpu=name --format=csv,noheader 2>/dev/null | head -1" | tr ' ' '_') || gpu_name=""
  gpu_name=${gpu_name:-unknown}

  local outfile
  outfile="bench_${gpu_name}_$(date +%Y%m%d_%H%M%S).csv"

  # Only the benchmark's stdout is captured. Its stderr is deliberately NOT
  # merged into stdout, so diagnostics reach the terminal instead of being
  # written into the CSV. Streaming stdout directly also avoids staging the
  # data in a fixed-name file under /tmp on the remote host.
  ssh "$REMOTE_HOST" "export PATH=/usr/local/cuda/bin:\$PATH && cd $REMOTE_DIR/benchmark/experiments/e0_diagnosis && ./bench_diagnosis all" \
    > "$RESULTS_DIR/$outfile" || return

  echo " 结果: $RESULTS_DIR/$outfile"

  # Data rows = total lines minus one header line, never below zero.
  local lines
  lines=$(wc -l < "$RESULTS_DIR/$outfile") || lines=0
  echo " 数据行: $((lines > 0 ? lines - 1 : 0))"
}
#######################################
# Profile one bench_diagnosis mode remotely with nvprof, showing the output
# and saving a copy to RESULTS_DIR/nvprof_<mode>_<host>.txt.
# Globals:   REMOTE_HOST, REMOTE_DIR, RESULTS_DIR (read)
# Arguments: $1 - bench_diagnosis mode to run
# Outputs:   the remote profiler output (stdout and stderr merged) on stdout
# Returns:   0 on success; the ssh status if the remote command failed,
#            otherwise the tee status
#######################################
_run_profile_mode() {
  local mode=$1
  local -a stage_status

  ssh "$REMOTE_HOST" "export PATH=/usr/local/cuda/bin:\$PATH && cd $REMOTE_DIR/benchmark/experiments/e0_diagnosis && nvprof --print-gpu-summary ./bench_diagnosis $mode 2>&1" \
    | tee "$RESULTS_DIR/nvprof_${mode}_${REMOTE_HOST}.txt"
  # A pipeline reports only its last stage, so tee would hide a failed
  # ssh/nvprof. Capture every stage's status right away and check each.
  stage_status=("${PIPESTATUS[@]}")

  if ((stage_status[0] != 0)); then
    return "${stage_status[0]}"
  fi
  return "${stage_status[1]}"
}

#######################################
# Run nvprof for the "baseline" and "default" bench_diagnosis modes.
# Globals:   REMOTE_HOST, REMOTE_DIR, RESULTS_DIR (read, via helper)
# Arguments: none
# Outputs:   section headers and profiler output on stdout
# Returns:   0 on success; the status of the first failing profiling run
#######################################
run_profile() {
  echo ">>> 运行 nvprof profiling ..."

  echo "--- baseline (batch=2000, AOS=off) ---"
  _run_profile_mode baseline || return

  echo ""
  echo "--- default (batch=50, AOS=on) ---"
  _run_profile_mode default || return
}
# Reject an unknown mode before touching the remote host, so a typo does not
# cost a full sync and compile before the usage error appears. The usage
# error is a diagnostic and therefore goes to stderr.
case "$MODE" in
  all | profile) ;;
  *)
    echo "未知模式: $MODE" >&2
    echo "用法: ./run_diagnosis.sh [host] [all|profile]" >&2
    exit 1
    ;;
esac

# Upload the sources and build the benchmark binary on the remote host.
sync_code
compile

# Run the requested diagnosis; MODE is known to be valid at this point.
case "$MODE" in
  all) run_all ;;
  profile) run_profile ;;
esac

# Closing summary.
echo ""
echo "=========================================="
echo " 诊断完成"
echo " 服务器: $REMOTE_HOST"
echo " 结果目录: $RESULTS_DIR"
echo "=========================================="
# Best-effort listing: a failure here must not fail an otherwise successful run.
ls -lh "$RESULTS_DIR"/ 2>/dev/null || true