fix: harden CUDA safety checks and translate comments to English

Safety fixes (4 critical, 4 warning) from code review:

- qap.cuh: fix clone_to_device cross-device D2H by retaining host matrices
- types.cuh: add CUDA_CHECK to InjectBuffer, track owner_gpu for safe destroy
- types.cuh: add bounds check on lexicographic priority index
- solver.cuh: cap migrate_kernel islands to MAX_ISLANDS=64 to prevent stack overflow
- multi_gpu_solver.cuh: guard against 0 GPUs, propagate stop_reason from best GPU
- types.cuh: warn on SeqRegistry overflow
- solver.cuh: warn when constraint_directed/phased_search disabled without AOS

Translate all Chinese comments to English across 25+ source files
(core/*.cuh, problems/*.cuh, Makefile, multi-GPU tests).

Verified on V100S×2 (sm_70, CUDA 12.8): e5 (12 problem types, all optimal),
e13 (multi-objective + multi-GPU, 9 configs, all passed).
This commit is contained in:
L-yang-yang 2026-03-25 11:52:50 +08:00
parent ab278d0e82
commit a848730459
25 changed files with 1147 additions and 1167 deletions

View file

@ -6,7 +6,7 @@
[![CUDA](https://img.shields.io/badge/CUDA-11.0%2B-green.svg)](https://developer.nvidia.com/cuda-toolkit)
[![Python](https://img.shields.io/badge/Python-3.8%2B-blue.svg)](https://www.python.org/)
**Paper**: [cuGenOpt: A GPU-Accelerated General-Purpose Metaheuristic Framework for Combinatorial Optimization](http://arxiv.org/abs/2603.19163)
**Paper**: [cuGenOpt: A GPU-Accelerated General-Purpose Metaheuristic Framework for Combinatorial Optimization](https://arxiv.org/abs/2603.19163)
---
@ -114,28 +114,7 @@ Define your own problem by inheriting `ProblemBase` and implementing `compute_ob
└─────────────────────────────────────────────────────────┘
```
---
## Project Structure
```
generic_solver/
├── prototype/ # Core framework (header-only .cuh files)
│ ├── core/ # Solver, operators, population, types
│ └── problems/ # 12+ problem implementations
├── python/ # Python wrapper (pip install cugenopt)
│ ├── cugenopt/ # Python package (built-ins + JIT compiler)
│ └── tests/ # Test suite
├── benchmark/ # Experiments and benchmarks
│ ├── experiments/ # E0-E13: 14 experiment groups
│ ├── data/ # Standard instances (TSPLIB, Solomon, QAPLIB)
│ └── results/ # Experimental reports
├── paper_v3_en/ # Paper source (LaTeX)
├── STATUS.md # Project status and roadmap
└── README.md # This file
```
---
## Performance Highlights
@ -186,8 +165,7 @@ generic_solver/
## Installation
### Python Package
coming soon
coming soon
```bash
pip install cugenopt
```
@ -207,18 +185,7 @@ cd prototype
make all
```
---
## Documentation
| Document | Description |
|----------|-------------|
| [STATUS.md](STATUS.md) | Project status, roadmap, and design decisions |
| [Python API Guide](python/README.md) | Detailed Python API documentation |
| [Benchmark Design](benchmark/DESIGN.md) | Experimental methodology |
| [Paper](paper_v3_en/) | Full technical details and evaluation |
---
## Citation

View file

@ -1,10 +1,10 @@
# GenSolver Makefile
#
# 用法:
# make e1 e2 e3 e4 e5 e6 → 编译单个实验
# make diag → 编译诊断程序
# make all → 编译全部
# make clean → 清理
# Usage:
# make e1 e2 e3 e4 e5 e6 → Build individual experiments
# make diag → Build diagnostic program
# make all → Build all
# make clean → Clean
NVCC = nvcc
ARCH ?= -arch=sm_75
@ -40,10 +40,10 @@ $(EXP_DIR)/%/gpu: $(EXP_DIR)/%/gpu.cu $(ALL_HEADERS) problems/tsplib_data.h
$(EXP_DIR)/e0_diagnosis/bench_diagnosis: $(EXP_DIR)/e0_diagnosis/bench_diagnosis.cu $(ALL_HEADERS)
$(NVCC) $(ARCH) $(CFLAGS) $(INCLUDES) -o $@ $<
test_multi_gpu: test_multi_gpu.cu $(ALL_HEADERS)
test_multi_gpu: $(EXP_DIR)/e9_multi_gpu_b3/test_multi_gpu.cu $(ALL_HEADERS)
$(NVCC) $(ARCH) $(CFLAGS) $(INCLUDES) -o $@ $<
test_multi_gpu_b3: test_multi_gpu_b3.cu $(ALL_HEADERS)
test_multi_gpu_b3: $(EXP_DIR)/e9_multi_gpu_b3/test_multi_gpu_b3.cu $(ALL_HEADERS)
$(NVCC) $(ARCH) $(CFLAGS) $(INCLUDES) -o $@ $<
clean:

View file

@ -1,8 +1,8 @@
/**
* cuda_utils.cuh - CUDA 工具集
* cuda_utils.cuh - CUDA utilities
*
* 职责:错误检查、设备信息、随机数工具
* 规则:所有 CUDA API 调用都必须用 CUDA_CHECK 包裹
* Responsibilities: error checking, device info, random number utilities
* Rule: every CUDA API call must be wrapped with CUDA_CHECK
*/
#pragma once
@ -11,7 +11,7 @@
#include <curand_kernel.h>
// ============================================================
// 错误检查
// Error checking
// ============================================================
#define CUDA_CHECK(call) do { \
@ -23,7 +23,7 @@
} \
} while(0)
// kernel launch 后检查(捕获异步错误)
// Check after kernel launch (catches async errors)
#define CUDA_CHECK_LAST() do { \
cudaError_t err = cudaGetLastError(); \
if (err != cudaSuccess) { \
@ -34,7 +34,7 @@
} while(0)
// ============================================================
// 设备信息
// Device info
// ============================================================
inline void print_device_info() {
@ -52,10 +52,10 @@ inline void print_device_info() {
}
// ============================================================
// 随机数工具 (Device 端)
// Random number utilities (device-side)
// ============================================================
// 初始化 curand 状态,每个线程一个
// Initialize curand state: one per thread
__global__ void init_curand_kernel(curandState* states, unsigned long long seed, int n) {
int tid = blockIdx.x * blockDim.x + threadIdx.x;
if (tid < n) {
@ -63,12 +63,12 @@ __global__ void init_curand_kernel(curandState* states, unsigned long long seed,
}
}
// Device-side: random integer in [0, bound)
// NOTE(review): plain modulo introduces slight bias when bound does not divide
// 2^32, and bound must be > 0 — confirm acceptable for existing callers.
__device__ inline int rand_int(curandState* state, int bound) {
    unsigned int raw = curand(state);
    return (int)(raw % (unsigned int)bound);
}
// DeviceFisher-Yates shuffle对 arr[0..n-1] 做随机排列
// Device-side: Fisher-Yates shuffle of arr[0..n-1]
__device__ inline void shuffle(int* arr, int n, curandState* state) {
for (int i = n - 1; i > 0; i--) {
int j = rand_int(state, i + 1);
@ -79,12 +79,12 @@ __device__ inline void shuffle(int* arr, int n, curandState* state) {
}
// ============================================================
// Kernel 启动参数计算
// Kernel launch grid sizing
// ============================================================
inline int div_ceil(int a, int b) { return (a + b - 1) / b; }
// Compute the number of thread blocks needed to cover n work items
// with the given block size (ceiling division).
inline int calc_grid_size(int n, int block_size = 256) {
    return (n + block_size - 1) / block_size;
}

View file

@ -1,20 +1,20 @@
/**
* gpu_cache.cuh - GPU 全局内存哈希表(通用缓存组件)
* gpu_cache.cuh - GPU global-memory hash table (generic cache component)
*
* 设计:
* - 开放寻址固定容量power of 2线性探测
* - key = uint64_t(由 Problem 自行计算 hash
* - value = float(单个指标值)
* - 无锁:允许 race condition缓存语义偶尔脏读可接受
* - 自带命中/未命中原子计数器
* Design:
* - Open addressing, fixed capacity (power of 2), linear probing
* - key = uint64_t (hash computed by Problem)
* - value = float (single metric value)
* - Lock-free: race conditions allowed (cache semantics; occasional dirty reads OK)
* - Built-in atomic hit/miss counters
*
* 用法:
* Usage:
* GpuCache cache = GpuCache::allocate(65536); // host
* // ... pass cache as Problem member to kernels ...
* cache.print_stats(); // host
* cache.destroy(); // host
*
* 参考scute 项目 LRUCachekey = metric_type + content_hash
* Reference: scute project LRUCache (key = metric_type + content_hash)
*/
#pragma once
@ -22,25 +22,25 @@
#include <cstdint>
// ============================================================
// 常量
// Constants
// ============================================================
static constexpr uint64_t CACHE_EMPTY_KEY = 0xFFFFFFFFFFFFFFFFULL;
static constexpr int CACHE_MAX_PROBE = 8; // 最大线性探测步数
static constexpr int CACHE_MAX_PROBE = 8; // Max linear probing steps
// ============================================================
// GpuCache 结构体POD可安全拷贝到 kernel
// GpuCache struct (POD, safe to copy to kernel)
// ============================================================
struct GpuCache {
uint64_t* keys; // GPU 全局内存
float* values; // GPU 全局内存
unsigned int* d_hits; // 原子计数器GPU
unsigned int* d_misses; // 原子计数器GPU
int capacity; // 必须是 2 的幂
uint64_t* keys; // GPU global memory
float* values; // GPU global memory
unsigned int* d_hits; // Atomic counters (GPU)
unsigned int* d_misses; // Atomic counters (GPU)
int capacity; // Must be a power of 2
int mask; // = capacity - 1
// ---- Host 操作 ----
// ---- Host operations ----
static GpuCache allocate(int cap = 65536) {
GpuCache c;
@ -94,20 +94,20 @@ struct GpuCache {
};
// ============================================================
// Device 函数:哈希 / 查找 / 插入
// Device functions: hash / lookup / insert
// ============================================================
/// FNV-1a hash over an ordered int sequence (e.g. customer IDs on a route).
/// The result is guaranteed to differ from the empty-slot sentinel key.
__device__ inline uint64_t route_hash(const int* data, int len) {
    constexpr uint64_t kOffsetBasis = 14695981039346656037ULL;  // FNV offset basis
    constexpr uint64_t kPrime = 1099511628211ULL;               // FNV prime
    uint64_t h = kOffsetBasis;
    for (int pos = 0; pos < len; pos++) {
        h ^= (uint64_t)(unsigned int)data[pos];
        h *= kPrime;
    }
    // Remap the (astronomically unlikely) collision with the sentinel value
    return (h != CACHE_EMPTY_KEY) ? h : h - 1;
}
/// 查找:命中返回 true + 写入 out
/// Lookup: on hit returns true and writes out
__device__ inline bool cache_lookup(const GpuCache& c, uint64_t key, float& out) {
int slot = (int)(key & (uint64_t)c.mask);
for (int p = 0; p < CACHE_MAX_PROBE; p++) {
@ -117,12 +117,12 @@ __device__ inline bool cache_lookup(const GpuCache& c, uint64_t key, float& out)
out = c.values[idx];
return true;
}
if (k == CACHE_EMPTY_KEY) return false; // 空槽 → 一定不存在
if (k == CACHE_EMPTY_KEY) return false; // Empty slot -> key not present
}
return false; // 探测用尽
return false; // Probing exhausted
}
/// 插入:写入 key-value同 key 覆盖,探测满则驱逐首槽
/// Insert: write key-value; same key overwrites; if probe full, evict first slot
__device__ inline void cache_insert(const GpuCache& c, uint64_t key, float value) {
int slot = (int)(key & (uint64_t)c.mask);
for (int p = 0; p < CACHE_MAX_PROBE; p++) {
@ -134,7 +134,7 @@ __device__ inline void cache_insert(const GpuCache& c, uint64_t key, float value
return;
}
}
// 探测满:驱逐首槽
// Probe full: evict first slot
int idx = slot & c.mask;
c.keys[idx] = key;
c.values[idx] = value;

View file

@ -6,7 +6,7 @@
namespace heuristic_init {
// 单行排列:所有行填相同排列
// Single-row layout: same permutation in every row
template<typename Sol>
static void build_sorted_permutation(Sol& sol, const std::vector<int>& order,
int dim1, int dim2) {
@ -19,7 +19,7 @@ static void build_sorted_permutation(Sol& sol, const std::vector<int>& order,
for (int i = 0; i < MAX_OBJ; i++) sol.objectives[i] = 0.0f;
}
// Partition 模式:排列均匀切分到 dim1 行,元素不重复
// Partition mode: split permutation evenly across dim1 rows, no duplicate elements
template<typename Sol>
static void build_partition_from_order(Sol& sol, const std::vector<int>& order,
int dim1, int total_elements) {
@ -66,8 +66,8 @@ std::vector<Sol> build_from_matrices(const HeuristicMatrix* matrices, int num_ma
col_sum[j] += mat[i * N + j];
}
// 对于 Partition (VRPTW),距离矩阵含 depot (index 0)
// 排序只针对客户 (index 1..N-1),输出值为 0-based 客户编号
// For Partition (VRPTW), the distance matrix includes depot (index 0);
// sorting is only over customers (indices 1..N-1); output values are 0-based customer ids
std::vector<int> idx;
if (partition_mode && N > elem_count) {
for (int i = 1; i <= elem_count; i++) idx.push_back(i);

View file

@ -1,15 +1,15 @@
/**
* init_selection.cuh - 初始解采样择优 + NSGA-II 选择
* init_selection.cuh - Initial-solution sampling and NSGA-II selection
*
* Host 端逻辑,在 solver 初始化阶段调用一次。
* 从 K × pop_size 个候选解中选出 pop_size 个作为初始种群。
* Host-side logic; called once during solver initialization.
* Selects pop_size individuals from K × pop_size candidates as the initial population.
*
* 选择策略:
* 1. 核心目标预留名额(按 importance 分配)
* 2. NSGA-II 选择(非支配排序 + 加权拥挤度)
* 3. 纯随机保底(多样性)
* Selection strategy:
* 1. Reserve slots for core objectives (by importance)
* 2. NSGA-II selection (non-dominated sort + weighted crowding)
* 3. Pure random fallback (diversity)
*
* 单目标时自动退化为 top-N 排序,无需分支。
* Single-objective case automatically reduces to top-N sorting; no extra branching.
*/
#pragma once
@ -22,36 +22,36 @@
namespace init_sel {
// ============================================================
// 候选解的目标信息(从 GPU 下载后在 host 端使用)
// Per-candidate objective info (used on host after download from GPU)
// ============================================================
struct CandidateInfo {
int idx; // 在候选数组中的原始索引
float objs[MAX_OBJ]; // 归一化后的目标值(越小越好)
int idx; // Original index in the candidate array
float objs[MAX_OBJ]; // Normalized objectives (lower is better)
float penalty;
int rank; // 非支配排序层级0 = Pareto 前沿)
float crowding; // 拥挤度距离
bool selected; // 是否已被选中
int rank; // Non-dominated sort front (0 = Pareto front)
float crowding; // Crowding distance
bool selected; // Whether already selected
};
// ============================================================
// 非支配排序Fast Non-dominated Sort
// Non-dominated sort (Fast Non-dominated Sort)
// ============================================================
// 复杂度O(M × N²)M = 目标数N = 候选数
// 对初始化场景N ≤ 几千M ≤ 4完全可接受
// Complexity: O(M × N²), M = number of objectives, N = number of candidates
// Acceptable for initialization (N up to a few thousand, M ≤ 4)
inline void fast_nondominated_sort(std::vector<CandidateInfo>& cands,
int num_obj,
std::vector<std::vector<int>>& fronts) {
int n = (int)cands.size();
std::vector<int> dom_count(n, 0); // 被多少个解支配
std::vector<std::vector<int>> dom_set(n); // 支配了哪些解
std::vector<int> dom_count(n, 0); // How many solutions dominate this one
std::vector<std::vector<int>> dom_set(n); // Which solutions this one dominates
// 判断 a 是否支配 ba 在所有目标上 ≤ b且至少一个 <
// 先处理 penalty可行解支配不可行解
// Whether a dominates b: a ≤ b on all objectives, and strictly < on at least one
// Handle penalty first: feasible dominates infeasible
auto dominates = [&](int a, int b) -> bool {
const auto& ca = cands[a];
const auto& cb = cands[b];
// penalty 处理
// Penalty handling
if (ca.penalty <= 0.0f && cb.penalty > 0.0f) return true;
if (ca.penalty > 0.0f && cb.penalty <= 0.0f) return false;
if (ca.penalty > 0.0f && cb.penalty > 0.0f) return ca.penalty < cb.penalty;
@ -65,7 +65,7 @@ inline void fast_nondominated_sort(std::vector<CandidateInfo>& cands,
return all_leq && any_lt;
};
// 计算支配关系
// Compute dominance relations
for (int i = 0; i < n; i++) {
for (int j = i + 1; j < n; j++) {
if (dominates(i, j)) {
@ -78,7 +78,7 @@ inline void fast_nondominated_sort(std::vector<CandidateInfo>& cands,
}
}
// 提取各层前沿
// Extract each front layer
fronts.clear();
std::vector<int> current_front;
for (int i = 0; i < n; i++) {
@ -107,9 +107,9 @@ inline void fast_nondominated_sort(std::vector<CandidateInfo>& cands,
}
// ============================================================
// 加权拥挤度距离
// Weighted crowding distance
// ============================================================
// 标准拥挤度 + importance 加权:核心目标维度上的间距贡献更大
// Standard crowding + importance weighting: larger gap contribution on core objectives
inline void weighted_crowding_distance(std::vector<CandidateInfo>& cands,
const std::vector<int>& front,
@ -117,7 +117,7 @@ inline void weighted_crowding_distance(std::vector<CandidateInfo>& cands,
const float* importance) {
int n = (int)front.size();
if (n <= 2) {
for (int i : front) cands[i].crowding = 1e18f; // 边界解无穷大
for (int i : front) cands[i].crowding = 1e18f; // Boundary solutions: infinite
return;
}
@ -126,18 +126,18 @@ inline void weighted_crowding_distance(std::vector<CandidateInfo>& cands,
std::vector<int> sorted_idx(front.begin(), front.end());
for (int m = 0; m < num_obj; m++) {
// 按目标 m 排序
// Sort by objective m
std::sort(sorted_idx.begin(), sorted_idx.end(),
[&](int a, int b) { return cands[a].objs[m] < cands[b].objs[m]; });
float range = cands[sorted_idx[n-1]].objs[m] - cands[sorted_idx[0]].objs[m];
if (range < 1e-12f) continue; // 该目标无区分度
if (range < 1e-12f) continue; // No spread on this objective
// 边界解设为无穷大
// Boundary solutions: infinite crowding
cands[sorted_idx[0]].crowding += 1e18f;
cands[sorted_idx[n-1]].crowding += 1e18f;
// 中间解:相邻间距 × importance 权重
// Interior: neighbor gap × importance weight
float w = importance[m];
for (int i = 1; i < n - 1; i++) {
float gap = cands[sorted_idx[i+1]].objs[m] - cands[sorted_idx[i-1]].objs[m];
@ -147,29 +147,29 @@ inline void weighted_crowding_distance(std::vector<CandidateInfo>& cands,
}
// ============================================================
// 主选择函数:从 N 个候选中选出 target 个
// Main selection: pick target candidates from N
// ============================================================
// 返回被选中的候选索引
// Returns indices of selected candidates
inline std::vector<int> nsga2_select(std::vector<CandidateInfo>& cands,
int num_obj,
const float* importance,
int target,
int num_reserved_random) {
// --- 1. 核心目标预留名额 ---
// --- 1. Reserve slots for core objectives ---
int num_reserve_total = target - num_reserved_random;
// 预留比例importance[i] × 30% 的名额(剩余 70% 给 NSGA-II
// Reserve ratio: importance[i] × 30% of slots (remaining 70% for NSGA-II)
float reserve_ratio = 0.3f;
std::vector<int> selected;
selected.reserve(target);
// 对每个目标,按该目标排序取 top
// For each objective, sort by that objective and take top
for (int m = 0; m < num_obj; m++) {
int quota = (int)(num_reserve_total * importance[m] * reserve_ratio);
if (quota < 1 && num_obj > 1) quota = 1; // 每个目标至少 1 个
if (quota < 1 && num_obj > 1) quota = 1; // At least one per objective
// 按目标 m 排序(越小越好)
// Sort by objective m (lower is better)
std::vector<int> by_obj(cands.size());
for (int i = 0; i < (int)cands.size(); i++) by_obj[i] = i;
std::sort(by_obj.begin(), by_obj.end(),
@ -186,32 +186,32 @@ inline std::vector<int> nsga2_select(std::vector<CandidateInfo>& cands,
}
}
// --- 2. NSGA-II 选择填充剩余名额 ---
// --- 2. NSGA-II fills remaining slots ---
int remaining = target - num_reserved_random - (int)selected.size();
if (remaining > 0) {
// 非支配排序
// Non-dominated sort
std::vector<std::vector<int>> fronts;
fast_nondominated_sort(cands, num_obj, fronts);
for (auto& front : fronts) {
if (remaining <= 0) break;
// 过滤已选中的
// Filter out already selected
std::vector<int> available;
for (int i : front) {
if (!cands[i].selected) available.push_back(i);
}
if ((int)available.size() <= remaining) {
// 整层都选
// Take the whole front
for (int i : available) {
cands[i].selected = true;
selected.push_back(i);
remaining--;
}
} else {
// 该层需要截断:按加权拥挤度选
// Truncate this front: pick by weighted crowding
weighted_crowding_distance(cands, available, num_obj, importance);
std::sort(available.begin(), available.end(),
[&](int a, int b) { return cands[a].crowding > cands[b].crowding; });
@ -228,14 +228,14 @@ inline std::vector<int> nsga2_select(std::vector<CandidateInfo>& cands,
}
// ============================================================
// 单目标快速路径:直接按标量排序取 top
// Single-objective fast path: scalar sort and take top
// ============================================================
inline std::vector<int> top_n_select(std::vector<CandidateInfo>& cands,
int target,
int num_reserved_random) {
int to_select = target - num_reserved_random;
// 按 penalty 优先,然后按 objs[0](已归一化为越小越好)
// Prefer lower penalty, then objs[0] (normalized, lower is better)
std::vector<int> indices(cands.size());
for (int i = 0; i < (int)cands.size(); i++) indices[i] = i;
std::sort(indices.begin(), indices.end(), [&](int a, int b) {

View file

@ -1,12 +1,12 @@
/**
* multi_gpu_solver.cuh - 多 GPU 协同求解
* multi_gpu_solver.cuh - Multi-GPU cooperative solving
*
* v5.0 方案 B3: 被动注入 + GPU 无感知
* - 每块 GPU 独立运行 solve(),各自用不同 seed
* - 每个 GPU 有一个 InjectBuffer设备端
* - CPU 协调线程定期(每 N 秒)收集各 GPU 的 best异步写入其他 GPU 的 InjectBuffer
* - GPU 在 migrate_kernel 后检查 InjectBuffer如果有新解则注入
* - 完全解耦GPU 无需暂停CPU 异步写入,通过 CUDA Stream 同步保证安全
* v5.0 plan B3: passive injection + GPU-agnostic design
* - Each GPU runs solve() independently with its own seed
* - Each GPU has an InjectBuffer (device memory)
* - A CPU coordinator thread periodically (every N seconds) collects each GPU's best and asynchronously writes to other GPUs' InjectBuffers
* - After migrate_kernel, each GPU checks InjectBuffer and injects if a new solution is present
* - Fully decoupled: GPUs need not pause; CPU writes asynchronously; CUDA stream sync ensures safety
*/
#pragma once
@ -18,25 +18,26 @@
#include <chrono>
// ============================================================
// MultiGpuContext — 每个 GPU 的上下文
// MultiGpuContext — per-GPU context
// ============================================================
template<typename Problem>
struct MultiGpuContext {
using Sol = typename Problem::Sol;
int gpu_id; // GPU 设备 ID
Problem* problem; // Problem 实例(设备指针指向该 GPU
SolverConfig config; // 求解器配置(独立 seed
int gpu_id; // GPU device ID
Problem* problem; // Problem instance (device pointer for this GPU)
SolverConfig config; // Solver config (independent seed)
Sol best_solution; // 当前最优解host 端)
std::mutex best_mutex; // 保护 best_solution 的互斥锁
Sol best_solution; // Current best solution (host)
SolveResult<Sol> solve_result; // Full result from solve()
std::mutex best_mutex; // Mutex protecting best_solution
InjectBuffer<Sol>* d_inject_buf; // Device 端注入缓冲区(在该 GPU 上分配)
Sol* d_global_best; // Device 端全局最优解指针(由 solve() 导出)
InjectBuffer<Sol>* d_inject_buf; // Device-side inject buffer (allocated on this GPU)
Sol* d_global_best; // Device pointer to global best (exported by solve())
std::atomic<bool> stop_flag; // 停止标志
std::atomic<bool> running; // 运行状态标志(用于协调线程判断)
std::atomic<bool> stop_flag; // Stop flag
std::atomic<bool> running; // Running flag (for coordinator thread)
MultiGpuContext(int id) : gpu_id(id), problem(nullptr), d_inject_buf(nullptr),
d_global_best(nullptr), stop_flag(false), running(false) {
@ -47,45 +48,46 @@ struct MultiGpuContext {
};
// ============================================================
// GPU Worker 线程函数(方案 B3
// GPU worker thread (plan B3)
// ============================================================
template<typename Problem>
void gpu_worker(MultiGpuContext<Problem>* ctx) {
    using Sol = typename Problem::Sol;

    // Bind this host thread to its assigned GPU before any CUDA call.
    CUDA_CHECK(cudaSetDevice(ctx->gpu_id));

    // Mark as running so the coordinator thread starts polling this GPU.
    ctx->running.store(true);

    // Run the full solve on this GPU; pass the inject buffer (so solutions
    // migrated from other GPUs can be picked up) and receive the exported
    // device pointer to this GPU's running global best in d_global_best.
    SolveResult<Sol> result = solve(*ctx->problem, ctx->config,
        nullptr, 0, nullptr, ctx->d_inject_buf, &ctx->d_global_best);

    // Clear the running flag first so the coordinator stops reading
    // d_global_best and stops injecting into this GPU.
    ctx->running.store(false);

    // Publish the best solution and the full result under the mutex
    // (the coordinator may read best_solution concurrently).
    {
        std::lock_guard<std::mutex> lock(ctx->best_mutex);
        ctx->best_solution = result.best_solution;
        ctx->solve_result = result;
    }

    // Signal completion to the main thread.
    ctx->stop_flag.store(true);
}
// ============================================================
// 协调线程函数(方案 B3
// Coordinator thread (plan B3)
// ============================================================
// 定期从各 GPU 的 d_global_best 读取当前 best计算 global_best注入到其他 GPU
// Periodically read each GPU's current best from d_global_best, compute global_best, inject to other GPUs
//
// 关键设计:
// 1. 直接从各 GPU 的 d_global_best 读取(由 solve() 导出)
// 2. 要求启用 SA否则无 d_global_best
// 3. 轻量侵入solve() 只需导出一个指针,对单 GPU 无影响
// Key design:
// 1. Read directly from each GPU's d_global_best (exported by solve())
// 2. Requires SA enabled (otherwise no d_global_best)
// 3. Light touch: solve() only exports a pointer; single-GPU path unchanged
template<typename Problem>
void coordinator_thread(std::vector<MultiGpuContext<Problem>*>& contexts,
@ -96,7 +98,7 @@ void coordinator_thread(std::vector<MultiGpuContext<Problem>*>& contexts,
auto interval_ms = std::chrono::milliseconds(static_cast<int>(interval_sec * 1000));
int round = 0;
// 等待所有 GPU 的 d_global_best 就绪
// Wait until all GPUs' d_global_best are ready
bool all_ready = false;
while (!all_ready) {
std::this_thread::sleep_for(std::chrono::milliseconds(100));
@ -110,10 +112,10 @@ void coordinator_thread(std::vector<MultiGpuContext<Problem>*>& contexts,
}
while (true) {
// 等待指定时间间隔
// Wait for the configured interval
std::this_thread::sleep_for(interval_ms);
// 检查是否所有 GPU 都已停止
// Check whether all GPUs have stopped
bool all_stopped = true;
for (auto* ctx : contexts) {
if (ctx->running.load()) {
@ -125,17 +127,17 @@ void coordinator_thread(std::vector<MultiGpuContext<Problem>*>& contexts,
round++;
// 收集各 GPU 的当前最优解(从 d_global_best 读取)
// Collect each GPU's current best (from d_global_best)
Sol global_best;
global_best.penalty = 1e30f;
global_best.objectives[0] = 1e30f;
int best_gpu = -1;
for (int i = 0; i < (int)contexts.size(); i++) {
if (!contexts[i]->running.load()) continue; // 已停止的 GPU 跳过
if (contexts[i]->d_global_best == nullptr) continue; // 未就绪跳过
if (!contexts[i]->running.load()) continue; // skip stopped GPUs
if (contexts[i]->d_global_best == nullptr) continue; // skip not ready
// 从该 GPU 的 d_global_best 读取
// Read from this GPU's d_global_best
Sol gpu_best;
cudaSetDevice(contexts[i]->gpu_id);
cudaMemcpy(&gpu_best, contexts[i]->d_global_best, sizeof(Sol), cudaMemcpyDeviceToHost);
@ -146,23 +148,23 @@ void coordinator_thread(std::vector<MultiGpuContext<Problem>*>& contexts,
}
}
if (best_gpu == -1) continue; // 所有 GPU 都已停止或未就绪
if (best_gpu == -1) continue; // all GPUs stopped or not ready
if (verbose) {
printf(" [Coordinator Round %d] Global best from GPU %d: obj=%.2f, penalty=%.2f\n",
round, best_gpu, global_best.objectives[0], global_best.penalty);
}
// 将 global_best 注入到其他 GPU除了 best_gpu 自己)
// Inject global_best into other GPUs (except best_gpu)
for (int i = 0; i < (int)contexts.size(); i++) {
if (i == best_gpu) continue; // 不注入到自己
if (!contexts[i]->running.load()) continue; // 已停止的 GPU 不注入
if (i == best_gpu) continue; // do not inject to self
if (!contexts[i]->running.load()) continue; // do not inject to stopped GPUs
// 读取 InjectBuffer 结构(从 device 到 host
// Read InjectBuffer struct (device to host)
InjectBuffer<Sol> buf;
cudaMemcpy(&buf, contexts[i]->d_inject_buf, sizeof(InjectBuffer<Sol>), cudaMemcpyDeviceToHost);
// 同步写入(会自动切换设备)
// Synchronous write (switches device as needed)
buf.write_sync(global_best, contexts[i]->gpu_id);
}
}
@ -173,7 +175,7 @@ void coordinator_thread(std::vector<MultiGpuContext<Problem>*>& contexts,
}
// ============================================================
// 多 GPU 协同求解主函数(方案 B3
// Multi-GPU cooperative solve entry (plan B3)
// ============================================================
template<typename Problem>
@ -181,13 +183,17 @@ SolveResult<typename Problem::Sol> solve_multi_gpu(Problem& prob, const SolverCo
using Sol = typename Problem::Sol;
if (cfg.num_gpus <= 1) {
// 单 GPU 模式,直接调用普通 solve
// Single-GPU mode: call plain solve
return solve(prob, cfg);
}
// 检查可用 GPU 数量
int device_count;
// Check available GPU count
int device_count = 0;
CUDA_CHECK(cudaGetDeviceCount(&device_count));
if (device_count <= 0) {
fprintf(stderr, "Error: No CUDA devices available\n");
return SolveResult<Sol>{};
}
int actual_gpus = std::min(cfg.num_gpus, device_count);
if (cfg.verbose) {
@ -199,15 +205,15 @@ SolveResult<typename Problem::Sol> solve_multi_gpu(Problem& prob, const SolverCo
cfg.multi_gpu_inject_mode == MultiGpuInjectMode::HalfIslands ? "HalfIslands" : "AllIslands");
}
// 创建各 GPU 的上下文
// Create per-GPU contexts
std::vector<MultiGpuContext<Problem>*> contexts;
for (int i = 0; i < actual_gpus; i++) {
auto* ctx = new MultiGpuContext<Problem>(i);
ctx->config = cfg;
ctx->config.seed = cfg.seed + i * 1000; // 每个 GPU 用不同 seed
ctx->config.num_gpus = 1; // 单 GPU 模式运行
ctx->config.seed = cfg.seed + i * 1000; // distinct seed per GPU
ctx->config.num_gpus = 1; // run as single-GPU per device
// 克隆 Problem 到该 GPU
// Clone Problem onto this GPU
ctx->problem = prob.clone_to_device(i);
if (ctx->problem == nullptr) {
fprintf(stderr, "Error: Failed to clone problem to GPU %d\n", i);
@ -218,10 +224,10 @@ SolveResult<typename Problem::Sol> solve_multi_gpu(Problem& prob, const SolverCo
return SolveResult<Sol>{};
}
// 分配 InjectBuffer在该 GPU 上)
// Allocate InjectBuffer on this GPU
InjectBuffer<Sol> buf = InjectBuffer<Sol>::allocate(i);
// 将 InjectBuffer 拷贝到 device 端(传给 kernel
// Copy InjectBuffer to device (for kernels)
InjectBuffer<Sol>* d_buf;
CUDA_CHECK(cudaSetDevice(i));
CUDA_CHECK(cudaMalloc(&d_buf, sizeof(InjectBuffer<Sol>)));
@ -231,34 +237,36 @@ SolveResult<typename Problem::Sol> solve_multi_gpu(Problem& prob, const SolverCo
contexts.push_back(ctx);
}
// 启动 worker 线程
// Start worker threads
std::vector<std::thread> workers;
for (auto* ctx : contexts) {
workers.emplace_back(gpu_worker<Problem>, ctx);
}
// 启动协调线程(定期注入 global_best
// Start coordinator thread (periodic global_best injection)
std::thread coordinator(coordinator_thread<Problem>, std::ref(contexts),
cfg.multi_gpu_interval_sec, cfg.verbose);
// 等待所有 worker 完成
// Wait for all workers to finish
for (auto& w : workers) w.join();
// 等待协调线程完成
// Wait for coordinator to finish
coordinator.join();
// 收集最终结果
// Collect final result from best GPU
Sol final_best = contexts[0]->best_solution;
int best_ctx = 0;
ObjConfig oc = prob.obj_config();
for (int i = 1; i < (int)contexts.size(); i++) {
if (is_better(contexts[i]->best_solution, final_best, oc)) {
final_best = contexts[i]->best_solution;
best_ctx = i;
}
}
// 清理
// Cleanup
for (auto* ctx : contexts) {
// 读取 InjectBuffer 的内容(用于释放)
// Read InjectBuffer content (for teardown)
InjectBuffer<Sol> buf;
CUDA_CHECK(cudaSetDevice(ctx->gpu_id));
CUDA_CHECK(cudaMemcpy(&buf, ctx->d_inject_buf, sizeof(InjectBuffer<Sol>), cudaMemcpyDeviceToHost));
@ -269,10 +277,9 @@ SolveResult<typename Problem::Sol> solve_multi_gpu(Problem& prob, const SolverCo
delete ctx;
}
// 构造返回结果
SolveResult<Sol> result;
// Build return value from best GPU's result
SolveResult<Sol> result = contexts[best_ctx]->solve_result;
result.best_solution = final_best;
result.stop_reason = StopReason::MaxGen;
return result;
}

View file

@ -1,40 +1,40 @@
/**
* operators.cuh - 四层搜索算子体系Device 端)
* operators.cuh - Four-layer search operator hierarchy (device side)
*
* v1.0: 二维通用编码的完整算子层次
* v1.0: Full operator hierarchy for 2D universal encoding
*
* 层次结构(所有算子只看 data[D1][D2] + dim2_sizes不感知问题语义
* Hierarchy (all operators only see data[D1][D2] + dim2_sizes, no problem semantics):
*
* 第 1 层 - 元素级Element: 操作单个元素
* 行内: swap, reverse(2-opt), insert, flip
* 跨行: cross_relocate单元素移行, cross_swap单元素换行
* Layer 1 - Element: operate on single elements
* Within row: swap, reverse(2-opt), insert, flip
* Cross-row: cross_relocate (move one element across rows), cross_swap (swap one element per row)
*
* 第 2 层 - 片段级Segment: 操作连续片段
* 行内: or_opt移动连续 k 个元素到行内新位置)
* 跨行: seg_relocate片段从一行移到另一行
* seg_swap(两行各取一段互换,即 2-opt*
* Layer 2 - Segment: operate on contiguous segments
* Within row: or_opt (move contiguous k elements to a new position in the row)
* Cross-row: seg_relocate (move a segment from one row to another)
* seg_swap (swap two segments from two rows each, i.e. 2-opt*)
*
* 第 3 层 - 行级Row: 操作整行
* row_swap(交换两行全部内容和长度)
* row_reverse(反转行的排列顺序)
* row_split(一行拆成两行)
* row_merge(两行合并为一行)
* Layer 3 - Row: operate on whole rows
* row_swap (swap full contents and lengths of two rows)
* row_reverse (reverse row order)
* row_split (split one row into two)
* row_merge (merge two rows into one)
*
* 第 4 层 - 交叉Crossover: 组合两个解
* row_crossover(从父代 A/B 各取若干行组成子代)
* uniform_crossover(逐元素从两个父代中选)
* Layer 4 - Crossover: combine two solutions
* row_crossover (child takes some rows from parent A and B)
* uniform_crossover (pick per element from two parents)
*
* Move 描述符:
* row, row2: 行索引row2=-1 表示行内)
* op: 操作码
* pos1, pos2: 位置参数
* seg_len: 片段长度(第 2 层使用)
* Move descriptor:
* row, row2: row indices (row2=-1 means within-row)
* op: operation code
* pos1, pos2: position parameters
* seg_len: segment length (used by layer 2)
*
* 设计原则:
* - 所有算子对问题类型无感知,只操作二维数组
* - 每个算子都有对应的 undo 操作
* - 空行安全:自动降级为 no-op
* - 编码类型决定可用算子集
* Design principles:
* - All operators are problem-agnostic; they only manipulate a 2D array
* - Each operator has a corresponding undo
* - Empty-row safe: automatically degrades to no-op
* - Encoding type determines the available operator set
*/
#pragma once
@ -44,61 +44,61 @@
namespace ops {
// ============================================================
// Op code constants — numbered by layer to avoid collisions
// ============================================================

// General
constexpr int OP_NOOP = -1;

// --- Layer 1: element ---
// Permutation, within row
constexpr int PERM_SWAP = 0;            // swap two positions
constexpr int PERM_REVERSE = 1;         // reverse an interval (2-opt)
constexpr int PERM_INSERT = 2;          // move one element to a new position
// Permutation, cross-row
constexpr int PERM_CROSS_RELOCATE = 3;  // move one element from one row to another
constexpr int PERM_CROSS_SWAP = 4;      // swap one element per row between two rows
// Binary, within row
constexpr int BIN_FLIP = 0;             // flip one bit
constexpr int BIN_SWAP = 1;             // swap two bits
// Binary, cross-row
constexpr int BIN_CROSS_SWAP = 2;       // swap one bit per row between two rows

// --- Layer 1 (cont.): permutation within row ---
constexpr int PERM_3OPT = 5;            // 3-opt: break 3 edges and reconnect

// --- Layer 2: segment ---
constexpr int PERM_OR_OPT = 10;         // within row: move contiguous k elements
constexpr int PERM_SEG_RELOCATE = 11;   // cross-row: move a segment from one row to another
constexpr int PERM_SEG_SWAP = 12;       // cross-row: swap one segment from each of two rows (2-opt*)
constexpr int PERM_CROSS_EXCHANGE = 15; // cross-row: exchange segments, preserving each segment's internal order
constexpr int BIN_SEG_FLIP = 13;        // within row: flip contiguous k bits
constexpr int BIN_SEG_CROSS_SWAP = 14;  // cross-row: swap one segment from each of two rows
constexpr int BIN_K_FLIP = 16;          // within row: flip k random bits at once

// --- Layer 3: row ---
constexpr int ROW_SWAP = 20;            // swap the full contents of two rows
constexpr int ROW_REVERSE = 21;         // reverse row order (row index permutation)
constexpr int ROW_SPLIT = 22;           // split one row into two
constexpr int ROW_MERGE = 23;           // merge two rows into one

// --- Special: perturbation (multi-step moves, no undo; used to escape local optima) ---
constexpr int PERTURBATION = 40;

// --- Layer 4: crossover ---
constexpr int CROSS_ROW = 30;           // row crossover: take some rows from each parent
constexpr int CROSS_UNIFORM = 31;       // uniform crossover: pick per element from two parents

// ============================================================
// Move descriptor — encoding-level change description
// ============================================================
struct Move {
    int row;        // source row (or first row)
    int row2;       // target row (-1 = within-row move)
    int op;         // operation code (one of the constants above)
    int pos1, pos2; // position parameters
    int seg_len;    // segment length (layer 2; 0 for other layers)
};

} // namespace ops
@ -106,10 +106,10 @@ struct Move {
namespace ops {
// ============================================================
// Layer 1: element-level primitives
// ============================================================
// --- Permutation within row ---
__device__ inline void perm_swap(int* row, int i, int j) {
int tmp = row[i]; row[i] = row[j]; row[j] = tmp;
@ -126,9 +126,9 @@ __device__ inline void perm_insert(int* row, int from, int to, int size) {
row[to] = val;
}
// --- Permutation cross-row ---
/// cross_relocate: take the element at src_row[src_pos] and insert it at dst_row[dst_pos]
__device__ inline void perm_cross_relocate(int* src_row, int& src_size,
int* dst_row, int& dst_size,
int src_pos, int dst_pos) {
@ -142,24 +142,24 @@ __device__ inline void perm_cross_relocate(int* src_row, int& src_size,
dst_size++;
}
/// cross_swap: swap rowA[posA] and rowB[posB]
__device__ inline void cross_swap_elem(int* rowA, int posA, int* rowB, int posB) {
    const int held = rowB[posB];
    rowB[posB] = rowA[posA];
    rowA[posA] = held;
}
// --- Permutation within row: 3-opt ---
// Break 3 edges and pick a reconnection (8 combinations; choose one random non-identity variant)
// Args: three breakpoints i < j < k split the route into seg0=[0,i] seg1=[i+1,j] seg2=[j+1,k] seg3=[k+1,end]
// Impl: one reconnection (reverse seg1, reverse seg2, or both)
// pos1=i, pos2=j, seg_len encodes k
__device__ inline void perm_3opt(int* row, int size, int i, int j, int k) {
// 3-opt 有多种重连方式,这里实现最常用的 3 种非恒等变换:
// type 1: reverse [i+1, j] — 等价于 2-opt(i+1, j)
// type 2: reverse [j+1, k] — 等价于 2-opt(j+1, k)
// type 3: reverse [i+1, j] + reverse [j+1, k] — 真正的 3-opt move
// type 4: 将 seg1 和 seg2 互换位置(不反转) — or-opt 的泛化
// 我们随机选 type 3 或 type 4type 1/2 已被 2-opt 覆盖)
// 这里固定做 type 3双反转因为它是 2-opt 无法达到的唯一新邻域
// 3-opt has several reconnections; here we use the most common non-identity variants:
// type 1: reverse [i+1, j] — same as 2-opt(i+1, j)
// type 2: reverse [j+1, k] — same as 2-opt(j+1, k)
// type 3: reverse [i+1, j] + reverse [j+1, k] — true 3-opt move
// type 4: swap seg1 and seg2 (no reverse) — generalization of or-opt
// We would randomize type 3 or 4 (types 1/2 are covered by 2-opt)
// Here we fix type 3 (double reverse) as the only new neighborhood 2-opt cannot reach
// reverse [i+1, j]
int lo = i + 1, hi = j;
while (lo < hi) { int t = row[lo]; row[lo] = row[hi]; row[hi] = t; lo++; hi--; }
@ -168,12 +168,12 @@ __device__ inline void perm_3opt(int* row, int size, int i, int j, int k) {
while (lo < hi) { int t = row[lo]; row[lo] = row[hi]; row[hi] = t; lo++; hi--; }
}
// 3-opt undo: repeat the same move to restore (the double reverse is self-inverse).
// NOTE(fix): the body must call perm_3opt exactly once — a duplicated call would
// apply the self-inverse move twice, turning the undo into a no-op.
__device__ inline void perm_3opt_undo(int* row, int size, int i, int j, int k) {
    perm_3opt(row, size, i, j, k); // self-inverse
}
// --- Binary within row ---

/// Flip the bit at index i (values are 0/1).
__device__ inline void bin_flip(int* row, int i) {
    const int flipped = 1 - row[i];
    row[i] = flipped;
}
@ -182,51 +182,51 @@ __device__ inline void bin_swap(int* row, int i, int j) {
}
// ============================================================
// Layer 2: segment-level primitives
// ============================================================
/// or_opt: within a row, move the contiguous seg_len elements starting at `from`
/// to position `to`.
/// Equivalent to: remove [from, from+seg_len), then insert before `to`.
/// Constraints: from + seg_len <= size, and `to` is not in [from, from+seg_len).
__device__ inline void perm_or_opt(int* row, int size, int from, int to, int seg_len) {
    // Temp buffer (segment length bounded to keep register use small;
    // in practice seg_len is usually <= 4)
    int buf[8]; // enough for typical seg_len
    int actual_len = (seg_len > 8) ? 8 : seg_len;
    // Save the segment
    for (int i = 0; i < actual_len; i++) buf[i] = row[from + i];
    // Remove the segment (shift left to close the gap)
    int new_size = size - actual_len;
    for (int k = from; k < new_size; k++) row[k] = row[k + actual_len];
    // Compute the insertion index in the post-removal coordinate system
    int ins = (to > from) ? to - actual_len : to;
    if (ins < 0) ins = 0;
    if (ins > new_size) ins = new_size;
    // Insert the segment (shift right to make room)
    for (int k = new_size - 1; k >= ins; k--) row[k + actual_len] = row[k];
    for (int i = 0; i < actual_len; i++) row[ins + i] = buf[i];
}
/// seg_relocate: take contiguous seg_len elements from src_row and insert them at dst_pos in dst_row
/// src_size -= seg_len, dst_size += seg_len
__device__ inline void perm_seg_relocate(int* src_row, int& src_size,
int* dst_row, int& dst_size,
int src_pos, int dst_pos, int seg_len) {
int buf[8];
int actual_len = (seg_len > 8) ? 8 : seg_len;
// 保存片段
// Save segment
for (int i = 0; i < actual_len; i++) buf[i] = src_row[src_pos + i];
// 源行:移除(左移)
// Source row: remove (shift left)
for (int k = src_pos; k < src_size - actual_len; k++)
src_row[k] = src_row[k + actual_len];
src_size -= actual_len;
// 目标行:插入(右移)
// Destination row: insert (shift right)
for (int k = dst_size - 1; k >= dst_pos; k--)
dst_row[k + actual_len] = dst_row[k];
for (int i = 0; i < actual_len; i++)
@ -234,29 +234,29 @@ __device__ inline void perm_seg_relocate(int* src_row, int& src_size,
dst_size += actual_len;
}
/// seg_swap: swap one segment from each of two rows (general form of 2-opt*)
/// rowA[posA..posA+lenA) <-> rowB[posB..posB+lenB)
/// Row lengths change: sizeA += (lenB - lenA), sizeB += (lenA - lenB)
__device__ inline void perm_seg_swap(int* rowA, int& sizeA, int posA, int lenA,
int* rowB, int& sizeB, int posB, int lenB) {
int bufA[8], bufB[8];
int aLen = (lenA > 8) ? 8 : lenA;
int bLen = (lenB > 8) ? 8 : lenB;
// 保存两段
// Save both segments
for (int i = 0; i < aLen; i++) bufA[i] = rowA[posA + i];
for (int i = 0; i < bLen; i++) bufB[i] = rowB[posB + i];
// 从 rowA 移除 segA腾出空间插入 segB
// 先移除
// Remove segA from rowA to make room for segB
// Remove first
int newSizeA = sizeA - aLen;
for (int k = posA; k < newSizeA; k++) rowA[k] = rowA[k + aLen];
// 再插入 segB
// Then insert segB
for (int k = newSizeA - 1; k >= posA; k--) rowA[k + bLen] = rowA[k];
for (int i = 0; i < bLen; i++) rowA[posA + i] = bufB[i];
sizeA = newSizeA + bLen;
// 从 rowB 移除 segB腾出空间插入 segA
// Remove segB from rowB to make room for segA
int newSizeB = sizeB - bLen;
for (int k = posB; k < newSizeB; k++) rowB[k] = rowB[k + bLen];
for (int k = newSizeB - 1; k >= posB; k--) rowB[k + aLen] = rowB[k];
@ -264,10 +264,10 @@ __device__ inline void perm_seg_swap(int* rowA, int& sizeA, int posA, int lenA,
sizeB = newSizeB + aLen;
}
/// cross_exchange: exchange one segment from each of two rows, preserving the
/// internal order of each segment (segment lengths may differ)
/// Closely related to seg_swap; both exchange segments between two rows
/// rowA[posA..posA+lenA) <-> rowB[posB..posB+lenB)
/// Row lengths change: sizeA += (lenB - lenA), sizeB += (lenA - lenB)
__device__ inline void perm_cross_exchange(int* rowA, int& sizeA, int posA, int lenA,
int* rowB, int& sizeB, int posB, int lenB) {
int bufA[8], bufB[8];
@ -277,14 +277,14 @@ __device__ inline void perm_cross_exchange(int* rowA, int& sizeA, int posA, int
for (int i = 0; i < aLen; i++) bufA[i] = rowA[posA + i];
for (int i = 0; i < bLen; i++) bufB[i] = rowB[posB + i];
// rowA: 移除 segA插入 segB
// rowA: remove segA, insert segB
int newSizeA = sizeA - aLen;
for (int k = posA; k < newSizeA; k++) rowA[k] = rowA[k + aLen];
for (int k = newSizeA - 1; k >= posA; k--) rowA[k + bLen] = rowA[k];
for (int i = 0; i < bLen; i++) rowA[posA + i] = bufB[i];
sizeA = newSizeA + bLen;
// rowB: 移除 segB插入 segA
// rowB: remove segB, insert segA
int newSizeB = sizeB - bLen;
for (int k = posB; k < newSizeB; k++) rowB[k] = rowB[k + bLen];
for (int k = newSizeB - 1; k >= posB; k--) rowB[k + aLen] = rowB[k];
@ -292,8 +292,8 @@ __device__ inline void perm_cross_exchange(int* rowA, int& sizeA, int posA, int
sizeB = newSizeB + aLen;
}
/// k-bit flip: flip k random bits at once (Binary encoding)
/// Positions are drawn uniformly at random; k = number of flip operations performed
__device__ inline void bin_k_flip(int* row, int size, int k, curandState* rng) {
for (int i = 0; i < k; i++) {
int pos = rand_int(rng, size);
@ -301,12 +301,12 @@ __device__ inline void bin_k_flip(int* row, int size, int k, curandState* rng) {
}
}
/// seg_flip: flip a run of seg_len contiguous bits within a row (Binary encoding)
__device__ inline void bin_seg_flip(int* row, int pos, int seg_len) {
    for (int off = 0; off < seg_len; off++) {
        row[pos + off] = 1 - row[pos + off];
    }
}
/// seg_cross_swap: swap an equal-length segment between two rows (Binary encoding)
__device__ inline void bin_seg_cross_swap(int* rowA, int posA,
int* rowB, int posB, int seg_len) {
for (int i = 0; i < seg_len; i++) {
@ -317,23 +317,23 @@ __device__ inline void bin_seg_cross_swap(int* rowA, int posA,
}
// ============================================================
// Integer encoding primitives
// ============================================================
/// int_clamp: clamp value v to the closed range [lb, ub]
__device__ inline int int_clamp(int v, int lb, int ub) {
    return (v < lb) ? lb : ((v > ub) ? ub : v);
}
/// int_random_reset: reset one position to a uniform random value in [lb, ub]
__device__ inline void int_random_reset(int* row, int pos, int lb, int ub,
                                        curandState* rng) {
    const int span = ub - lb + 1;
    row[pos] = lb + (int)(curand(rng) % span);
}
/// int_delta: at one random position, add ±k (clamped to [lb, ub])
__device__ inline void int_delta(int* row, int pos, int lb, int ub,
curandState* rng) {
int range = ub - lb + 1;
@ -343,7 +343,7 @@ __device__ inline void int_delta(int* row, int pos, int lb, int ub,
row[pos] = int_clamp(row[pos] + step, lb, ub);
}
/// int_seg_reset: reset seg_len contiguous positions to uniform random values in [lb, ub]
__device__ inline void int_seg_reset(int* row, int pos, int seg_len,
int lb, int ub, curandState* rng) {
int range = ub - lb + 1;
@ -351,7 +351,7 @@ __device__ inline void int_seg_reset(int* row, int pos, int seg_len,
row[pos + i] = lb + (curand(rng) % range);
}
/// int_k_delta: k random positions, each adjusted by ±1
__device__ inline void int_k_delta(int* row, int size, int k,
int lb, int ub, curandState* rng) {
for (int i = 0; i < k; i++) {
@ -362,21 +362,21 @@ __device__ inline void int_k_delta(int* row, int size, int k,
}
// ============================================================
// Layer 3: row-level primitives
// ============================================================
/// row_swap: swap the full contents and lengths of two rows
template<typename Sol>
__device__ inline void row_swap(Sol& sol, int r1, int r2) {
// 交换长度
// Swap lengths
int tmp_size = sol.dim2_sizes[r1];
sol.dim2_sizes[r1] = sol.dim2_sizes[r2];
sol.dim2_sizes[r2] = tmp_size;
// 交换数据(取两行中较长的长度)
// Swap data (use the longer of the two row lengths)
int max_len = (sol.dim2_sizes[r1] > sol.dim2_sizes[r2])
? sol.dim2_sizes[r1] : sol.dim2_sizes[r2];
// 交换后 r1 的长度是原 r2 的r2 的长度是原 r1 的
// 所以需要交换 max(原r1长度, 原r2长度) 个元素
// After swap, r1 has old r2 length and r2 has old r1 length
// So swap max(old r1 len, old r2 len) elements
max_len = (tmp_size > max_len) ? tmp_size : max_len;
for (int c = 0; c < max_len; c++) {
int tmp = sol.data[r1][c];
@ -385,8 +385,8 @@ __device__ inline void row_swap(Sol& sol, int r1, int r2) {
}
}
/// row_reverse: reverse the row order within [r1, r2]
/// e.g. row_reverse(sol, 1, 4) turns rows 1,2,3,4 into 4,3,2,1
template<typename Sol>
__device__ inline void row_reverse_range(Sol& sol, int r1, int r2) {
while (r1 < r2) {
@ -395,23 +395,23 @@ __device__ inline void row_reverse_range(Sol& sol, int r1, int r2) {
}
}
/// row_split: split `row` at split_pos into two rows.
/// `row` keeps [0, split_pos); `empty_row` receives [split_pos, size).
/// Precondition: empty_row is currently empty or has enough space.
template<typename Sol>
__device__ inline void row_split(Sol& sol, int row, int empty_row, int split_pos) {
    const int tail_len = sol.dim2_sizes[row] - split_pos;
    // Copy the tail into empty_row
    for (int i = 0; i < tail_len; i++) {
        sol.data[empty_row][i] = sol.data[row][split_pos + i];
    }
    sol.dim2_sizes[empty_row] = tail_len;
    sol.dim2_sizes[row] = split_pos;
}
/// row_merge: append the full contents of src_row to the end of dst_row
/// src_row is cleared; dst_row's length grows
/// Requires: dst_size + src_size <= DIM2
template<typename Sol>
__device__ inline void row_merge(Sol& sol, int dst_row, int src_row) {
int dst_size = sol.dim2_sizes[dst_row];
@ -423,33 +423,33 @@ __device__ inline void row_merge(Sol& sol, int dst_row, int src_row) {
}
// ============================================================
// Layer 4: crossover primitives
// ============================================================
//
// Permutation encoding: OX family (unified framework)
// Core: mark a set of "kept" positions from A; fill the gaps in B's global order
// The three variants differ only in how the keep set is chosen; the fill logic is shared
// Uniqueness is guaranteed: elements not in the keep set are taken from B in order, with no duplicates
// Row lengths unchanged (= A's row lengths), row boundaries unchanged
//
// Binary encoding: uniform_crossover (random per-element pick)
//
// ============================================================
// ---- OX core fill logic ----
// keep[r][c] = true means child[r][c] keeps A's value; false = gap to fill
// Gaps are filled in order of appearance of elements in B (row-major scan)
// Requires: child already copied from A, dim2_sizes set to A's row lengths
//
// total_elements: total element count in partitioned mode; in non-partitioned mode = single row length
// Used to bound the scan range in B
template<typename Sol>
__device__ inline void ox_fill_from_b(Sol& child, const Sol& parentB,
const bool* keep_flat,
int dim1, int total_elements) {
// 统计 A 中保留位置的每个值的出现次数(支持多重集排列)
// keep_flat 是按行展平的:keep_flat[r * DIM2 + c]
// Count occurrences of each value at kept positions in A (multiset permutations)
// keep_flat is row-major flat: keep_flat[r * DIM2 + c]
int keep_count[512];
for (int i = 0; i < total_elements; i++) keep_count[i] = 0;
@ -460,21 +460,21 @@ __device__ inline void ox_fill_from_b(Sol& child, const Sol& parentB,
if (v >= 0 && v < total_elements) keep_count[v]++;
}
// 从 B 中按行扫描顺序收集:每个值只取"需要填充"的份数
// 标准排列:每个值最多 1 份,多重集:每个值最多 repeat_count 份
// Collect from B in row scan order: take only as many of each value as needed to fill
// Standard permutation: at most 1 of each value; multiset: up to repeat_count each
int fill_buf[512];
int fill_count = 0;
for (int r = 0; r < dim1; r++)
for (int c = 0; c < parentB.dim2_sizes[r]; c++) {
int val = parentB.data[r][c];
if (val >= 0 && val < total_elements && keep_count[val] > 0) {
keep_count[val]--; // 消耗一个保留名额
keep_count[val]--; // consume one kept slot
} else if (val >= 0 && val < total_elements) {
fill_buf[fill_count++] = val;
}
}
// 按空位顺序(逐行从左到右)填入
// Fill gaps in order (row by row, left to right)
int fi = 0;
for (int r = 0; r < dim1; r++)
for (int c = 0; c < child.dim2_sizes[r]; c++)
@ -482,26 +482,26 @@ __device__ inline void ox_fill_from_b(Sol& child, const Sol& parentB,
child.data[r][c] = fill_buf[fi++];
}
// ---- Variant 1: OX-interval ----
// Keep one random contiguous interval per row; preserves adjacency
template<typename Sol>
__device__ inline void ox_interval(Sol& child, const Sol& parentA, const Sol& parentB,
int dim1, int total_elements, curandState* rng) {
bool keep[Sol::DIM1 * Sol::DIM2];
for (int i = 0; i < Sol::DIM1 * Sol::DIM2; i++) keep[i] = false;
// child = A,同时标记每行的保留区间
// child = A, mark each row's kept interval
for (int r = 0; r < dim1; r++) {
int sz = parentA.dim2_sizes[r];
child.dim2_sizes[r] = sz;
for (int c = 0; c < sz; c++) child.data[r][c] = parentA.data[r][c];
if (sz < 2) {
// 长度 0 或 1全部保留
// length 0 or 1: keep all
for (int c = 0; c < sz; c++) keep[r * Sol::DIM2 + c] = true;
continue;
}
// 随机选区间 [lo, hi]
// Random interval [lo, hi]
int lo = rand_int(rng, sz);
int hi = rand_int(rng, sz);
if (lo > hi) { int tmp = lo; lo = hi; hi = tmp; }
@ -511,8 +511,8 @@ __device__ inline void ox_interval(Sol& child, const Sol& parentA, const Sol& pa
ox_fill_from_b(child, parentB, keep, dim1, total_elements);
}
// ---- Variant 2: OX-subset ----
// Randomly keep ~50% of positions at their values from A; the most general variant
template<typename Sol>
__device__ inline void ox_subset(Sol& child, const Sol& parentA, const Sol& parentB,
int dim1, int total_elements, curandState* rng) {
@ -526,7 +526,7 @@ __device__ inline void ox_subset(Sol& child, const Sol& parentA, const Sol& pare
child.data[r][c] = parentA.data[r][c];
}
// 每个位置 50% 概率保留
// 50% keep per position
for (int r = 0; r < dim1; r++)
for (int c = 0; c < child.dim2_sizes[r]; c++)
keep[r * Sol::DIM2 + c] = (curand_uniform(rng) < 0.5f);
@ -534,9 +534,9 @@ __device__ inline void ox_subset(Sol& child, const Sol& parentA, const Sol& pare
ox_fill_from_b(child, parentB, keep, dim1, total_elements);
}
// ---- Variant 3: OX-row ----
// Randomly keep whole rows; refill the remaining rows' elements in B's order
// Preserves entire route structures; beneficial for VRP
template<typename Sol>
__device__ inline void ox_row(Sol& child, const Sol& parentA, const Sol& parentB,
int dim1, int total_elements, curandState* rng) {
@ -550,7 +550,7 @@ __device__ inline void ox_row(Sol& child, const Sol& parentA, const Sol& parentB
child.data[r][c] = parentA.data[r][c];
}
// 每行 50% 概率整行保留
// 50% chance to keep whole row
int kept = 0;
for (int r = 0; r < dim1; r++) {
if (curand_uniform(rng) < 0.5f) {
@ -559,14 +559,14 @@ __device__ inline void ox_row(Sol& child, const Sol& parentA, const Sol& parentB
kept++;
}
}
// 确保不是全保留或全不保留
// Ensure not all-kept or all-unkept
if (kept == 0) {
int r = rand_int(rng, dim1);
// 不标记任何 keep → 全部重填(至少有一行不保留)
// 实际上 kept==0 意味着全部重填这是合法的child = B 的顺序填入 A 的结构)
// No keep marks → full refill (at least one row not kept)
// kept==0 means full refill; valid (child gets B's order into A's structure)
}
if (kept == dim1 && dim1 > 1) {
// 全保留 → 随机取消一行
// All kept → randomly un-keep one row
int r = rand_int(rng, dim1);
for (int c = 0; c < child.dim2_sizes[r]; c++)
keep[r * Sol::DIM2 + c] = false;
@ -575,14 +575,14 @@ __device__ inline void ox_row(Sol& child, const Sol& parentA, const Sol& parentB
ox_fill_from_b(child, parentB, keep, dim1, total_elements);
}
// ---- OX unified entry ----
// Pick one variant at random
// When dim1 == 1, use only interval and subset (the row variant is meaningless)
template<typename Sol>
__device__ inline void perm_ox_crossover(Sol& child, const Sol& parentA, const Sol& parentB,
int dim1, int total_elements, curandState* rng) {
int n_variants = (dim1 > 1) ? 3 : 2;
int variant = rand_int(rng, n_variants); // 0: 区间, 1: 子集, [2: 行]
int variant = rand_int(rng, n_variants); // 0: interval, 1: subset, [2: row]
switch (variant) {
case 0: ox_interval(child, parentA, parentB, dim1, total_elements, rng); break;
case 1: ox_subset(child, parentA, parentB, dim1, total_elements, rng); break;
@ -590,8 +590,8 @@ __device__ inline void perm_ox_crossover(Sol& child, const Sol& parentA, const S
}
}
/// uniform_crossover: choose each element at random from one of the two parents
/// Suitable for Binary encoding (does not break permutation constraints)
template<typename Sol>
__device__ inline void uniform_crossover(Sol& child, const Sol& parentA, const Sol& parentB,
int dim1, curandState* rng) {
@ -607,15 +607,15 @@ __device__ inline void uniform_crossover(Sol& child, const Sol& parentA, const S
}
}
// [removed] generate_move_for_seq / sample_and_generate / apply_move / undo_move
// After the P0 refactor the main path uses execute_sequence; the old Move generate/apply/undo path is no longer needed
// ============================================================
// execute_sequence — unified API: generate parameters and execute directly (no Move returned)
// ============================================================
// Returns true if sol was modified, false on NOOP
// d_G, d_O, rel_N: optional relation-matrix pointers (used by SEQ_LNS_GUIDED_REBUILD)
// val_lb, val_ub: value range for Integer encoding (ignored by other encodings)
template<typename Sol>
__device__ inline bool execute_sequence(int seq_id, Sol& sol, int dim1,
@ -627,7 +627,7 @@ __device__ inline bool execute_sequence(int seq_id, Sol& sol, int dim1,
int val_ub = 1,
const void* prob_data = nullptr) {
// ============================================================
// Permutation 序列
// Permutation sequences
// ============================================================
if (encoding == EncodingType::Permutation) {
switch (seq_id) {
@ -841,15 +841,15 @@ __device__ inline bool execute_sequence(int seq_id, Sol& sol, int dim1,
return true;
}
case seq::SEQ_LNS_GUIDED_REBUILD: {
// 关系矩阵引导重建:
// 1. 随机选种子元素 seed
// 2. 查 G[seed] 找分组倾向最强的 K 个元素
// 3. 在解中找到这些元素的位置
// 4. 按 O 矩阵引导的顺序重排这些位置的元素
// Relation-matrix guided rebuild:
// 1. Pick random seed element seed
// 2. Look up G[seed] for K elements with strongest grouping affinity
// 3. Find positions of these elements in the solution
// 4. Reorder these positions by order guided by O matrix
//
// 如果没有关系矩阵(冷启动),退化为 scatter_shuffle
// Without relation matrices (cold start), fall back to scatter_shuffle
if (!d_G || !d_O || rel_N <= 0) {
// 退化:随机 scatter shuffle
// Fallback: random scatter shuffle
int row = (dim1 > 1) ? rand_int(rng, dim1) : 0;
int sz = sol.dim2_sizes[row];
if (sz < 4) return false;
@ -872,21 +872,21 @@ __device__ inline bool execute_sequence(int seq_id, Sol& sol, int dim1,
return true;
}
// --- 有关系矩阵:引导重建 ---
// 通用策略(不感知问题类型):
// G 矩阵 → 选哪些元素(分组倾向弱的 = 可能放错位置的)
// O 矩阵 → 怎么排(排序倾向引导重排顺序)
// 两者协同G 选人O 排序
// --- With relation matrices: guided rebuild ---
// Generic strategy (problem-agnostic):
// G matrix → which elements (weak grouping with seed = likely misplaced)
// O matrix → how to order (ordering affinity guides reorder)
// Together: G picks, O orders
int row = (dim1 > 1) ? rand_int(rng, dim1) : 0;
int sz = sol.dim2_sizes[row];
if (sz < 4) return false;
// 选种子元素
// Pick seed element
int seed_pos = rand_int(rng, sz);
int seed_val = sol.data[row][seed_pos];
if (seed_val < 0 || seed_val >= rel_N) return false;
// 检查矩阵是否有足够信息G 和 O 任一有信号即可)
// Check matrices have enough signal (either G or O)
float max_signal = 0.0f;
for (int c = 0; c < sz; c++) {
int v = sol.data[row][c];
@ -897,11 +897,11 @@ __device__ inline bool execute_sequence(int seq_id, Sol& sol, int dim1,
if (o > max_signal) max_signal = o;
}
}
if (max_signal < 0.05f) return false; // 信息不足,跳过
if (max_signal < 0.05f) return false; // insufficient signal, skip
// 破坏:锦标赛选择 G 值低的元素t=2
// G 值低 = 与 seed 分组倾向弱 = 可能放错位置
// 锦标赛:随机抽 2 个,取 G 值更低的那个,重复 count 次
// Destroy: tournament pick low-G elements (t=2)
// Low G = weak grouping with seed = likely misplaced
// Tournament: draw 2 at random, take lower G, repeat count times
constexpr int MAX_REBUILD = 10;
constexpr int TOUR_SIZE = 2;
int count = sz / 5; // ~20%
@ -911,12 +911,12 @@ __device__ inline bool execute_sequence(int seq_id, Sol& sol, int dim1,
int sel_pos[MAX_REBUILD];
int sel_val[MAX_REBUILD];
bool used[128] = {}; // 标记已选位置,防止重复
bool used[128] = {}; // mark chosen positions to avoid duplicates
int picked = 0;
int max_attempts = count * 4; // 防止死循环
int max_attempts = count * 4; // avoid infinite loop
for (int attempt = 0; attempt < max_attempts && picked < count; attempt++) {
// 锦标赛:随机抽 TOUR_SIZE 个候选,取 G 值最低的
// Tournament: draw TOUR_SIZE candidates at random, take lowest G
int best_c = -1;
float best_g = 1e30f;
for (int t = 0; t < TOUR_SIZE; t++) {
@ -936,15 +936,15 @@ __device__ inline bool execute_sequence(int seq_id, Sol& sol, int dim1,
if (picked < 2) return false;
count = picked;
// 修复锦标赛排序O 矩阵引导 + 随机扰动)
// 插入排序比较时加噪声实现概率性O 值高的大概率排前面,但不绝对
// Repair: tournament sort (O-guided + random noise)
// Insertion sort with noisy comparison: high O tends to go first, not guaranteed
for (int i = 1; i < count; i++) {
int key = sel_val[i];
int j = i - 1;
while (j >= 0) {
float o_key_before = d_O[key * rel_N + sel_val[j]];
float o_j_before = d_O[sel_val[j] * rel_N + key];
// 噪声幅度 0.05O 值差距 >0.05 时基本确定,<0.05 时随机
// Noise scale 0.05: if O gap >0.05 mostly deterministic, else random
float noise = (curand_uniform(rng) - 0.5f) * 0.1f;
if (o_key_before + noise > o_j_before) {
sel_val[j + 1] = sel_val[j];
@ -956,7 +956,7 @@ __device__ inline bool execute_sequence(int seq_id, Sol& sol, int dim1,
sel_val[j + 1] = key;
}
// 对 sel_pos 排序(升序),使写回位置有序
// Sort sel_pos ascending so write-back order is stable
for (int i = 1; i < count; i++) {
int key = sel_pos[i];
int j = i - 1;
@ -967,7 +967,7 @@ __device__ inline bool execute_sequence(int seq_id, Sol& sol, int dim1,
sel_pos[j + 1] = key;
}
// 检查是否真的改变了排列
// Check whether permutation actually changed
bool any_change = false;
for (int i = 0; i < count; i++) {
if (sol.data[row][sel_pos[i]] != sel_val[i]) {
@ -977,7 +977,7 @@ __device__ inline bool execute_sequence(int seq_id, Sol& sol, int dim1,
}
if (!any_change) return false;
// 写回
// Write back
for (int i = 0; i < count; i++) {
sol.data[row][sel_pos[i]] = sel_val[i];
}
@ -989,7 +989,7 @@ __device__ inline bool execute_sequence(int seq_id, Sol& sol, int dim1,
}
// ============================================================
// Binary 序列
// Binary sequences
// ============================================================
if (encoding == EncodingType::Binary) {
switch (seq_id) {
@ -1063,7 +1063,7 @@ __device__ inline bool execute_sequence(int seq_id, Sol& sol, int dim1,
}
// ============================================================
// Integer 序列
// Integer sequences
// ============================================================
if (encoding == EncodingType::Integer) {
switch (seq_id) {
@ -1131,7 +1131,7 @@ __device__ inline bool execute_sequence(int seq_id, Sol& sol, int dim1,
}
// ============================================================
// 共享:行级序列(编码无关)
// Shared: row-level sequences (encoding-agnostic)
// ============================================================
switch (seq_id) {
case seq::SEQ_ROW_SWAP: {
@ -1194,11 +1194,11 @@ __device__ inline bool execute_sequence(int seq_id, Sol& sol, int dim1,
}
// ============================================================
// sample_and_execute — 从 SeqRegistry 按权重采样 + 直接执行
// sample_and_execute — sample from SeqRegistry by weight and execute directly
// ============================================================
// 返回 true 若 sol 被修改false 若 NOOP
// 输出参数 out_seq_idx采样到的序列在 registry 中的索引
// d_G, d_O, rel_N: 可选的关系矩阵(传递给 execute_sequence
// Returns true if sol modified, false if NOOP
// out_seq_idx: index of sampled sequence in registry
// d_G, d_O, rel_N: optional relation matrices (passed to execute_sequence)
template<typename Sol>
__device__ inline bool sample_and_execute(const SeqRegistry& reg,
@ -1212,7 +1212,7 @@ __device__ inline bool sample_and_execute(const SeqRegistry& reg,
int val_lb = 0,
int val_ub = 1,
const void* prob_data = nullptr) {
// 延迟归一化:使用缓存的 weights_sum
// Lazy normalization: use cached weights_sum
float r = curand_uniform(rng) * reg.weights_sum; // r ∈ [0, weights_sum)
float cumsum = 0.0f;
out_seq_idx = reg.count - 1;

View file

@ -1,10 +1,10 @@
/**
* population.cuh - 种群管理
* population.cuh - Population management
*
* v2.0: Block 级架构
* - RNG 数组大小 = pop_size * block_size每个 block 内每个线程独立 RNG
* - 初始化 kernel 保持 1-thread-per-solution初始化只做一次不需要并行
* - find_best_kernel 保持单线程(种群规模不大)
* v2.0: Block-level architecture
* - RNG array size = pop_size * block_size (one independent RNG per thread within each block)
* - Init kernel stays 1-thread-per-solution (initialization runs once; parallelism not needed)
* - find_best_kernel remains single-threaded (population size is modest)
*/
#pragma once
@ -12,7 +12,7 @@
#include "cuda_utils.cuh"
// ============================================================
// Device 端 Kernel模板化
// Device-side kernels (templated)
// ============================================================
template<typename Sol>
@ -65,9 +65,9 @@ __global__ void init_integer_kernel(Sol* pop, int pop_size,
}
// ============================================================
// 多重集排列初始化 — 每个值 [0, N) 重复 R 次,总长度 N*R
// Multiset permutation init — each value in [0, N) repeated R times, total length N*R
// ============================================================
// 用于 JSP 工序排列编码N=num_jobs, R=num_ops值 j 出现 R 次表示工件 j
// For JSP operation-sequence encoding: N=num_jobs, R=num_ops; value j appearing R times means job j
template<typename Sol>
__global__ void init_multiset_perm_kernel(Sol* pop, int pop_size,
@ -90,7 +90,7 @@ __global__ void init_multiset_perm_kernel(Sol* pop, int pop_size,
}
// ============================================================
// 分区初始化 — 元素 {0..total_elements-1} 不重复分配到 dim1 行
// Partition init — elements {0..total_elements-1} assigned without duplication across dim1 rows
// ============================================================
template<typename Sol>
@ -131,21 +131,21 @@ __global__ void find_best_kernel(const Sol* pop, int pop_size,
}
// ============================================================
// Host 端 RAII 类(模板化)
// Host-side RAII class (templated)
// ============================================================
template<typename Sol>
class Population {
public:
Sol* d_solutions = nullptr;
curandState* d_rng_states = nullptr; // 大小 = pop_size * block_size
curandState* d_rng_states = nullptr; // size = pop_size * block_size
int size = 0;
int rng_count = 0; // RNG 状态总数
int rng_count = 0; // total RNG states
Population() = default;
// block_size: Block 级架构下每个 block 的线程数
// RNG 数组大小 = pop_size * block_size每个 block 内每个线程独立 RNG
// block_size: threads per block under block-level architecture
// RNG array size = pop_size * block_size (one independent RNG per thread within each block)
void allocate(int pop_size, int block_size = 128) {
size = pop_size;
rng_count = pop_size * block_size;

View file

@ -1,20 +1,20 @@
/**
* relation_matrix.cuh - G/O 关系矩阵管理
* relation_matrix.cuh - G/O relation matrix management
*
* G[i][j]: 分组倾向(元素 i 和 j 应在同一行的倾向,对称)
* O[i][j]: 排序倾向(元素 i 应排在 j 前面的倾向,不对称)
* G[i][j]: grouping affinity (tendency for elements i and j to be on the same row; symmetric)
* O[i][j]: ordering affinity (tendency for element i to appear before j; asymmetric)
*
* 更新来源:历史最优解统计
* 每当 host 端获取到当前 best 解,扫描所有元素对关系:
* - 同行 → G[i][j] 增强
* - i 在 j 前 → O[i][j] 增强
* 使用 EMA 衰减:M[i][j] = α * M[i][j] + (1-α) * signal
* Update source: statistics from historical best solutions
* Whenever the host obtains the current best solution, scan all element-pair relations:
* - Same row → strengthen G[i][j]
* - i before j → strengthen O[i][j]
* EMA decay: M[i][j] = α * M[i][j] + (1-α) * signal
*
* 生命周期:
* 1. relation_matrix_create(N) — 分配 host/device 内存,初始化为 0
* 2. relation_matrix_update(rm, sol, dim1) — 从一个解更新 G/Ohost 端)
* 3. relation_matrix_upload(rm) — 上传 h_G/h_O 到 d_G/d_O
* 4. relation_matrix_destroy(rm) — 释放内存
* Lifecycle:
* 1. relation_matrix_create(N) — allocate host/device memory, initialize to 0
* 2. relation_matrix_update(rm, sol, dim1) — update G/O from one solution (host)
* 3. relation_matrix_upload(rm) — upload h_G/h_O to d_G/d_O
* 4. relation_matrix_destroy(rm) — free memory
*/
#pragma once
@ -23,7 +23,7 @@
#include <cstring>
// ============================================================
// 创建 / 销毁
// Create / destroy
// ============================================================
inline RelationMatrix relation_matrix_create(int N, float decay = 0.95f) {
@ -58,19 +58,19 @@ inline void relation_matrix_destroy(RelationMatrix& rm) {
}
// ============================================================
// 从一个解更新 G/Ohost 端)
// Update G/O from one solution (host)
// ============================================================
// sol: 当前最优解(已下载到 host
// dim1: 实际使用的行数
// sol: current best solution (already copied to host)
// dim1: number of rows in use
//
// 逻辑:
// 对 sol 中每对元素 (val_a, val_b)
// 如果在同一行 → G[val_a][val_b] 增强
// 如果 val_a 在 val_b 前面 → O[val_a][val_b] 增强
// Logic:
// For each pair (val_a, val_b) in sol:
// If on the same row → strengthen G[val_a][val_b]
// If val_a appears before val_b → strengthen O[val_a][val_b]
//
// 注意:元素值 val 必须在 [0, N) 范围内才有意义
// 对于 partition 编码VRP元素值就是客户编号
// 对于单行排列TSP元素值就是城市编号
// Note: element values val are meaningful only in [0, N)
// For partition encoding (VRP), values are customer IDs
// For single-row permutation (TSP), values are city IDs
template<typename Sol>
void relation_matrix_update(RelationMatrix& rm, const Sol& sol, int dim1) {
@ -78,13 +78,13 @@ void relation_matrix_update(RelationMatrix& rm, const Sol& sol, int dim1) {
float alpha = rm.decay;
float signal_strength = 1.0f;
// 衰减所有现有值
// Decay all existing values
for (int i = 0; i < N * N; i++) {
rm.h_G[i] *= alpha;
rm.h_O[i] *= alpha;
}
// 扫描解中的元素对关系
// Scan element-pair relations in the solution
for (int r = 0; r < dim1; r++) {
int sz = sol.dim2_sizes[r];
for (int c1 = 0; c1 < sz; c1++) {
@ -95,17 +95,17 @@ void relation_matrix_update(RelationMatrix& rm, const Sol& sol, int dim1) {
int val_b = sol.data[r][c2];
if (val_b < 0 || val_b >= N) continue;
// 同行 → G 增强(对称)
// Same row → strengthen G (symmetric)
rm.h_G[val_a * N + val_b] += (1.0f - alpha) * signal_strength;
rm.h_G[val_b * N + val_a] += (1.0f - alpha) * signal_strength;
// val_a 在 val_b 前 → O[val_a][val_b] 增强
// val_a before val_b → strengthen O[val_a][val_b]
rm.h_O[val_a * N + val_b] += (1.0f - alpha) * signal_strength;
}
}
}
// 裁剪到 [0, 1]
// Clamp to [0, 1]
for (int i = 0; i < N * N; i++) {
if (rm.h_G[i] > 1.0f) rm.h_G[i] = 1.0f;
if (rm.h_O[i] > 1.0f) rm.h_O[i] = 1.0f;
@ -115,7 +115,7 @@ void relation_matrix_update(RelationMatrix& rm, const Sol& sol, int dim1) {
}
// ============================================================
// 上传到 GPU
// Upload to GPU
// ============================================================
inline void relation_matrix_upload(const RelationMatrix& rm) {

File diff suppressed because it is too large Load diff

View file

@ -1,38 +1,39 @@
/**
* types.cuh - 核心类型定义
* types.cuh - Core type definitions
*
* 包含编码类型、Solution 模板、ProblemConfig/SolverConfig、
* SeqRegistryAOS 序列级权重、KStepConfig多步执行
* RelationMatrixG/O 关系矩阵、ProblemBaseCRTP 基类)
* Contains: encoding types, Solution template, ProblemConfig/SolverConfig,
* SeqRegistry (AOS sequence-level weights), KStepConfig (multi-step execution),
* RelationMatrix (G/O relation matrix), ProblemBase (CRTP base class)
*/
#pragma once
#include <cstdio>
#include "cuda_utils.cuh"
// ============================================================
// 编译时常量
// Compile-time constants
// ============================================================
constexpr int MAX_OBJ = 4; // 最多 4 个目标16字节不值得模板化
constexpr int MAX_SEQ = 32; // 最大序列数(内置 ~16 + 自定义算子 ≤8留余量
constexpr int MAX_K = 3; // 多步执行的最大步数K=1,2,3
// AOS 权重上下限(归一化后)
constexpr float AOS_WEIGHT_FLOOR = 0.05f; // 最低权重保底(确保充分探索)
constexpr float AOS_WEIGHT_CAP = 0.35f; // 最高权重上限(防止赢者通吃)
constexpr int MAX_OBJ = 4; // Max 4 objectives (16 bytes, not worth templatizing)
constexpr int MAX_SEQ = 32; // Max sequences (built-in ~16 + custom ops ≤8, with margin)
constexpr int MAX_K = 3; // Max steps for multi-step execution (K=1,2,3)
// AOS weight bounds
constexpr float AOS_WEIGHT_FLOOR = 0.05f; // Minimum weight floor (ensures sufficient exploration)
constexpr float AOS_WEIGHT_CAP = 0.35f; // Maximum weight cap (prevents winner-take-all)
// ============================================================
// 枚举类型
// Enum types
// ============================================================
enum class EncodingType {
Permutation, // 排列:元素不重复
Binary, // 0-1flip 是主要算子
Integer // 有界整数
Permutation, // Permutation: elements are unique
Binary, // 0-1: flip is the main operator
Integer // Bounded integers
};
enum class RowMode {
Single, // dim1=1单行TSP/QAP/Knapsack 等大部分问题)
Fixed, // dim1>1行等长不可变JSP-Int/Schedule禁止 SPLIT/MERGE
Partition // dim1>1元素分区到各行行长可变CVRP/VRPTW
Single, // dim1=1, single row (most problems: TSP/QAP/Knapsack, etc.)
Fixed, // dim1>1, equal row lengths fixed (JSP-Int/Schedule; SPLIT/MERGE disallowed)
Partition // dim1>1, elements partitioned across rows, variable row lengths (CVRP/VRPTW)
};
enum class ObjDir {
@ -40,241 +41,235 @@ enum class ObjDir {
Maximize
};
// 多目标比较模式
// Multi-objective comparison mode
enum class CompareMode {
Weighted, // 加权求和sum(weight[i] * obj[i]),越小越好
Lexicographic // 字典法:按优先级逐目标比较,前面的目标优先
Weighted, // Weighted sum: sum(weight[i] * obj[i]), lower is better
Lexicographic // Lexicographic: compare objectives by priority order
};
enum class MigrateStrategy {
Ring, // 环形:各岛最优→邻岛最差(慢传播,高多样性)
TopN, // 全局 Top-N 轮转分发(快传播,强收敛)
Hybrid // 两者兼顾Top-N 替换最差 + Ring 替换次差
Ring, // Ring: each island's best → neighbor's worst (slow spread, high diversity)
TopN, // Global Top-N round-robin (fast spread, strong convergence)
Hybrid // Hybrid: Top-N replaces worst + Ring replaces second-worst
};
// v5.0: 多 GPU 协同 — 解注入模式
// v5.0: multi-GPU coordination — solution injection mode
enum class MultiGpuInjectMode {
OneIsland, // 注入到 1 个岛的 worst保守保持多样性
HalfIslands, // 注入到 num_islands/2 个岛的 worst平衡
AllIslands // 注入到所有岛的 worst激进快速传播
OneIsland, // Inject into worst of 1 island (conservative, preserves diversity)
HalfIslands, // Inject into worst on num_islands/2 islands (balanced)
AllIslands // Inject into worst on all islands (aggressive, fast spread)
};
// v5.0 方案 B3: InjectBuffer — 被动注入缓冲区
// GPU 无感知CPU 同步写入GPU 在 migrate_kernel 中检查并应用
// 设计要点:
// 1. 使用同步 cudaMemcpy 避免与 solve() 的 stream/Graph 冲突
// 2. 写入顺序:先 solution 后 flagGPU 端原子读 flag 确保一致性
// 3. 完全解耦:不依赖 solve() 的任何内部状态
// v5.0 option B3: InjectBuffer — passive injection buffer
// GPU has no awareness; CPU writes synchronously; GPU checks and applies in migrate_kernel
// Design notes:
// 1. Use synchronous cudaMemcpy to avoid conflicts with solve() stream/Graph
// 2. Write order: solution first, then flag; GPU atomic flag read ensures consistency
// 3. Fully decoupled: does not depend on any internal state of solve()
template<typename Sol>
struct InjectBuffer {
Sol* d_solution; // Device 端解缓冲区(单个解)
int* d_flag; // Device 端标志位0=空1=有新解
Sol* d_solution = nullptr; // Device solution buffer (single solution)
int* d_flag = nullptr; // Device flag: 0=empty, 1=new solution
int owner_gpu = 0; // GPU that owns the allocation
// 分配 InjectBuffer在指定 GPU 上)
// Allocate InjectBuffer (on given GPU)
static InjectBuffer<Sol> allocate(int gpu_id) {
InjectBuffer<Sol> buf;
buf.owner_gpu = gpu_id;
// 保存原设备,切换到目标 GPU
int orig_device;
cudaGetDevice(&orig_device);
cudaSetDevice(gpu_id);
CUDA_CHECK(cudaGetDevice(&orig_device));
CUDA_CHECK(cudaSetDevice(gpu_id));
// 分配设备内存
cudaMalloc(&buf.d_solution, sizeof(Sol));
cudaMalloc(&buf.d_flag, sizeof(int));
CUDA_CHECK(cudaMalloc(&buf.d_solution, sizeof(Sol)));
CUDA_CHECK(cudaMalloc(&buf.d_flag, sizeof(int)));
// 初始化 flag 为 0
int zero = 0;
cudaMemcpy(buf.d_flag, &zero, sizeof(int), cudaMemcpyHostToDevice);
CUDA_CHECK(cudaMemcpy(buf.d_flag, &zero, sizeof(int), cudaMemcpyHostToDevice));
// 恢复原设备
cudaSetDevice(orig_device);
CUDA_CHECK(cudaSetDevice(orig_device));
return buf;
}
// 释放 InjectBuffer
// Free InjectBuffer (switches to owner GPU before freeing)
void destroy() {
if (d_solution) {
cudaFree(d_solution);
d_solution = nullptr;
}
if (d_flag) {
cudaFree(d_flag);
d_flag = nullptr;
if (d_solution || d_flag) {
int orig_device;
cudaGetDevice(&orig_device);
cudaSetDevice(owner_gpu);
if (d_solution) { cudaFree(d_solution); d_solution = nullptr; }
if (d_flag) { cudaFree(d_flag); d_flag = nullptr; }
cudaSetDevice(orig_device);
}
}
// CPU 端写入新解
// 注意:使用同步 cudaMemcpy 避免与 solve() 的 stream 冲突
// 顺序:先写 solution再写 flagGPU 端原子读 flag 确保不会读到半写状态)
// CPU-side write of new solution
// Note: synchronous cudaMemcpy avoids stream conflicts with solve()
// Order: write solution first, then flag (GPU atomic flag read avoids half-written reads)
void write_sync(const Sol& sol, int target_gpu) {
// 保存原设备,切换到目标 GPU
int orig_device;
cudaGetDevice(&orig_device);
cudaSetDevice(target_gpu);
CUDA_CHECK(cudaGetDevice(&orig_device));
CUDA_CHECK(cudaSetDevice(target_gpu));
// 先写解数据
cudaMemcpy(d_solution, &sol, sizeof(Sol), cudaMemcpyHostToDevice);
// 再写标志位(确保解数据已写完)
CUDA_CHECK(cudaMemcpy(d_solution, &sol, sizeof(Sol), cudaMemcpyHostToDevice));
int flag = 1;
cudaMemcpy(d_flag, &flag, sizeof(int), cudaMemcpyHostToDevice);
CUDA_CHECK(cudaMemcpy(d_flag, &flag, sizeof(int), cudaMemcpyHostToDevice));
// 恢复原设备
cudaSetDevice(orig_device);
CUDA_CHECK(cudaSetDevice(orig_device));
}
};
// ============================================================
// SeqID — 统一的 OperationSequence 编号
// SeqID — unified OperationSequence IDs
// ============================================================
// 每个 SeqID 对应一种具体的搜索操作(原子或多步)
// AOS 权重跟踪粒度 = SeqID每个序列独立权重
// Each SeqID maps to one concrete search operation (atomic or multi-step)
// AOS weight granularity = SeqID (independent weight per sequence)
//
// 命名规则SEQ_{编码}_{操作名}
// 跨编码共享的行级操作统一编号
// Naming: SEQ_{encoding}_{operation}
// Row-level ops shared across encodings use unified numbering
namespace seq {
// --- Permutation 行内(元素级)---
constexpr int SEQ_PERM_SWAP = 0; // swap 两个位置
constexpr int SEQ_PERM_REVERSE = 1; // 2-opt(反转区间)
constexpr int SEQ_PERM_INSERT = 2; // insert(移动到新位置)
constexpr int SEQ_PERM_3OPT = 3; // 3-opt(断 3 边重连)
// --- Permutation in-row (element-level) ---
constexpr int SEQ_PERM_SWAP = 0; // swap two positions
constexpr int SEQ_PERM_REVERSE = 1; // 2-opt (reverse segment)
constexpr int SEQ_PERM_INSERT = 2; // insert (move to new position)
constexpr int SEQ_PERM_3OPT = 3; // 3-opt (reconnect after 3 edges)
// --- Permutation 行内(片段级)---
constexpr int SEQ_PERM_OR_OPT = 4; // or-opt(移动连续 k 个元素)
// --- Permutation in-row (segment-level) ---
constexpr int SEQ_PERM_OR_OPT = 4; // or-opt (move k consecutive elements)
// --- Permutation 行内(组合级)---
constexpr int SEQ_PERM_DOUBLE_SWAP = 30; // 连续两次 swap同行
constexpr int SEQ_PERM_TRIPLE_SWAP = 31; // 连续三次 swap同行
// --- Permutation in-row (combo-level) ---
constexpr int SEQ_PERM_DOUBLE_SWAP = 30; // two consecutive swaps (same row)
constexpr int SEQ_PERM_TRIPLE_SWAP = 31; // three consecutive swaps (same row)
// --- Permutation 跨行(元素级)---
constexpr int SEQ_PERM_CROSS_RELOCATE = 5; // 单元素移行
constexpr int SEQ_PERM_CROSS_SWAP = 6; // 单元素换行
// --- Permutation cross-row (element-level) ---
constexpr int SEQ_PERM_CROSS_RELOCATE = 5; // single element moves row
constexpr int SEQ_PERM_CROSS_SWAP = 6; // single element swaps rows
// --- Permutation 跨行(片段级)---
constexpr int SEQ_PERM_SEG_RELOCATE = 7; // 片段移行
constexpr int SEQ_PERM_SEG_SWAP = 8; // 片段换行2-opt*
constexpr int SEQ_PERM_CROSS_EXCHANGE = 9; // 片段互换(保序)
// --- Permutation cross-row (segment-level) ---
constexpr int SEQ_PERM_SEG_RELOCATE = 7; // segment moves row
constexpr int SEQ_PERM_SEG_SWAP = 8; // segment swaps rows (2-opt*)
constexpr int SEQ_PERM_CROSS_EXCHANGE = 9; // segment exchange (order preserved)
// --- Binary 行内(元素级)---
constexpr int SEQ_BIN_FLIP = 0; // 翻转一个位
constexpr int SEQ_BIN_SWAP = 1; // 交换两个位
// --- Binary in-row (element-level) ---
constexpr int SEQ_BIN_FLIP = 0; // flip one bit
constexpr int SEQ_BIN_SWAP = 1; // swap two bits
// --- Binary 行内(片段级)---
constexpr int SEQ_BIN_SEG_FLIP = 2; // 翻转连续 k 个位
constexpr int SEQ_BIN_K_FLIP = 3; // 同时翻转 k 个随机位
// --- Binary in-row (segment-level) ---
constexpr int SEQ_BIN_SEG_FLIP = 2; // flip k consecutive bits
constexpr int SEQ_BIN_K_FLIP = 3; // flip k random bits at once
// --- Binary 跨行 ---
constexpr int SEQ_BIN_CROSS_SWAP = 4; // 两行各一个位互换
constexpr int SEQ_BIN_SEG_CROSS_SWAP = 5; // 两行各取一段互换
// --- Binary cross-row ---
constexpr int SEQ_BIN_CROSS_SWAP = 4; // swap one bit per row across two rows
constexpr int SEQ_BIN_SEG_CROSS_SWAP = 5; // swap a segment from each row
// --- 共享:行级(编码无关)---
constexpr int SEQ_ROW_SWAP = 10; // 交换两行
constexpr int SEQ_ROW_REVERSE = 11; // 反转行排列
constexpr int SEQ_ROW_SPLIT = 12; // 一行拆两行
constexpr int SEQ_ROW_MERGE = 13; // 两行合并
// --- Shared: row-level (encoding-agnostic) ---
constexpr int SEQ_ROW_SWAP = 10; // swap two rows
constexpr int SEQ_ROW_REVERSE = 11; // reverse row order
constexpr int SEQ_ROW_SPLIT = 12; // split one row into two
constexpr int SEQ_ROW_MERGE = 13; // merge two rows
// --- 特殊 ---
constexpr int SEQ_PERTURBATION = 14; // 扰动(多步不可逆)
// --- Special ---
constexpr int SEQ_PERTURBATION = 14; // perturbation (multi-step, irreversible)
// --- Integer 行内(元素级)---
constexpr int SEQ_INT_RANDOM_RESET = 0; // 随机一个位置重置为 [lb, ub] 内随机值
constexpr int SEQ_INT_DELTA = 1; // 随机一个位置 ±kclamp 到 [lb, ub]
constexpr int SEQ_INT_SWAP = 2; // 交换两个位置的值
// --- Integer in-row (element-level) ---
constexpr int SEQ_INT_RANDOM_RESET = 0; // reset one position to random in [lb, ub]
constexpr int SEQ_INT_DELTA = 1; // one position ±k (clamped to [lb, ub])
constexpr int SEQ_INT_SWAP = 2; // swap values at two positions
// --- Integer 行内(片段级)---
constexpr int SEQ_INT_SEG_RESET = 3; // 连续 k 个位置全部重置
constexpr int SEQ_INT_K_DELTA = 4; // 随机 k 个位置各自 ±1
// --- Integer in-row (segment-level) ---
constexpr int SEQ_INT_SEG_RESET = 3; // reset k consecutive positions
constexpr int SEQ_INT_K_DELTA = 4; // k positions each ±1 at random
// --- Integer 跨行 ---
constexpr int SEQ_INT_CROSS_SWAP = 5; // 两行各一个位置互换
// --- Integer cross-row ---
constexpr int SEQ_INT_CROSS_SWAP = 5; // swap one position per row across two rows
// --- LNS(大邻域搜索)---
constexpr int SEQ_LNS_SEGMENT_SHUFFLE = 20; // 打乱连续片段
constexpr int SEQ_LNS_SCATTER_SHUFFLE = 21; // 打乱随机分散位置
constexpr int SEQ_LNS_GUIDED_REBUILD = 22; // 关系矩阵引导重建
// --- LNS (large neighborhood search) ---
constexpr int SEQ_LNS_SEGMENT_SHUFFLE = 20; // shuffle a contiguous segment
constexpr int SEQ_LNS_SCATTER_SHUFFLE = 21; // shuffle a scattered set of positions
constexpr int SEQ_LNS_GUIDED_REBUILD = 22; // guided rebuild from relation matrix
} // namespace seq
// ============================================================
// RelationMatrix — G/O 关系矩阵GPU global memory
// RelationMatrix — G/O relation matrix (GPU global memory)
// ============================================================
// G[i][j]: 元素 i 和 j 的分组倾向(对称,越大越倾向同组)
// O[i][j]: 元素 i 排在 j 前面的倾向(不对称)
// 存储为一维数组 [N * N],行优先
// 小规模 N<200 直接 DenseP2 再做稀疏化
// G[i][j]: grouping tendency of elements i and j (symmetric; higher → more same-group)
// O[i][j]: tendency for element i to precede j (asymmetric)
// Stored as a 1D row-major array [N * N]
// For small N<200 use dense directly; P2 may add sparsification
//
// 更新时机host 端,每个 batch 间隙
// 使用时机kernel 中 SEQ_LNS_GUIDED_REBUILD 读取
// Updated on: host, between batches
// Read in: kernel for SEQ_LNS_GUIDED_REBUILD
struct RelationMatrix {
float* d_G; // GPU 上的 G 矩阵 [N * N]
float* d_O; // GPU 上的 O 矩阵 [N * N]
float* h_G; // Host 上的 G 矩阵 [N * N](用于更新后上传)
float* h_O; // Host 上的 O 矩阵 [N * N]
int N; // 元素总数
float decay; // 衰减系数 α(默认 0.95
int update_count; // 已更新次数(用于冷启动判断)
float* d_G; // G matrix on GPU [N * N]
float* d_O; // O matrix on GPU [N * N]
float* h_G; // G matrix on host [N * N] (for upload after update)
float* h_O; // O matrix on host [N * N]
int N; // total number of elements
float decay; // decay factor α (default 0.95)
int update_count; // number of updates so far (for cold-start logic)
};
// ============================================================
// SeqRegistry — 运行时可用序列注册表
// SeqRegistry — runtime-available sequence registry
// ============================================================
// 根据 EncodingType 和 dim1 自动确定哪些序列可用
// 传到 GPU 供 sample_sequence() 使用
// Which sequences are available is determined from EncodingType and dim1
// Passed to GPU for sample_sequence()
enum class SeqCategory : int {
InRow = 0, // 行内算子swap, reverse, insert, ...
CrossRow = 1, // 跨行算子cross_relocate, cross_swap, seg_relocate, ...
RowLevel = 2, // 行级算子row_swap, row_reverse, split, merge
LNS = 3, // 大邻域搜索
InRow = 0, // within-row operators (swap, reverse, insert, ...)
CrossRow = 1, // cross-row operators (cross_relocate, cross_swap, seg_relocate, ...)
RowLevel = 2, // row-level operators (row_swap, row_reverse, split, merge)
LNS = 3, // large neighborhood search
};
struct SeqRegistry {
int ids[MAX_SEQ]; // 可用序列的 SeqID 列表
int count; // 可用序列数量
float weights[MAX_SEQ]; // 每个序列的当前权重(未归一化,延迟归一化)
float weights_sum; // 权重和(缓存,用于延迟归一化)
float max_w[MAX_SEQ]; // 每个序列的权重上限0 = 不限,用全局 cap
SeqCategory categories[MAX_SEQ]; // 每个序列的分类(约束导向用)
int ids[MAX_SEQ]; // SeqID list of available sequences
int count; // number of available sequences
float weights[MAX_SEQ]; // current weight per sequence (unnormalized; lazy normalization)
float weights_sum; // sum of weights (cached for lazy normalization)
float max_w[MAX_SEQ]; // per-sequence weight cap (0 = unlimited, use global cap)
SeqCategory categories[MAX_SEQ]; // category per sequence (for constraint-directed mode)
};
// ============================================================
// KStepConfig — 多步执行的步数选择配置
// KStepConfig — step-count selection for multi-step execution
// ============================================================
// K=1: 单步当前行为K=2/3: 连续执行多个序列后再评估
// 两层权重体系的第一层
// K=1: single step (current behavior); K=2/3: run several sequences then evaluate
// First layer of the two-level weight system
//
// 自适应策略:
// - 初始 K=1 权重很大保守K>1 权重小
// - K>1 带来改进 → 增大该 K 的权重
// - 长时间无改进 → 重置/增大 K>1 权重(跳出局部最优)
// Adaptive policy:
// - Initially K=1 has large weight (conservative), K>1 small
// - If K>1 yields improvement → increase that K's weight
// - Long stagnation → reset / boost K>1 weights (escape local optima)
struct KStepConfig {
float weights[MAX_K]; // K=1,2,3 的采样权重(归一化)
int stagnation_count; // 连续无改进的 batch 数(用于触发重置)
int stagnation_limit; // 触发重置的阈值(默认 5 个 batch
float weights[MAX_K]; // sampling weights for K=1,2,3 (normalized)
int stagnation_count; // consecutive batches without improvement (triggers reset)
int stagnation_limit; // threshold to trigger reset (default 5 batches)
};
// 构建默认 K 步配置
// Build default K-step configuration
inline KStepConfig build_kstep_config() {
KStepConfig kc;
kc.weights[0] = 0.80f; // K=1: 初始主导
kc.weights[1] = 0.15f; // K=2: 少量探索
kc.weights[2] = 0.05f; // K=3: 极少探索
kc.weights[0] = 0.80f; // K=1: dominates initially
kc.weights[1] = 0.15f; // K=2: little exploration
kc.weights[2] = 0.05f; // K=3: minimal exploration
kc.stagnation_count = 0;
kc.stagnation_limit = 5;
return kc;
};
// ============================================================
// ProblemProfile — 基于结构特征推断的问题画像
// ProblemProfile — problem profile inferred from structural features
// ============================================================
// 第一层:纯结构推断(不感知语义),用于驱动算子注册和初始权重
// 未来第二层:可扩展更细粒度的画像(如多属性、高约束等)
// Layer 1: structure-only inference (no semantics), drives operator registration and initial weights
// Future layer 2: finer profiles (e.g. multi-attribute, high constraint)
enum class ScaleClass { Small, Medium, Large };
enum class StructClass { SingleSeq, MultiFixed, MultiPartition };
@ -286,10 +281,10 @@ struct ProblemProfile {
float cross_row_prob;
};
// classify_problem() 定义在 ProblemConfig 之后
// classify_problem() is defined after ProblemConfig
// ============================================================
// 权重预设 — 由 ScaleClass 驱动
// Weight presets — driven by ScaleClass
// ============================================================
struct WeightPreset {
@ -308,100 +303,100 @@ inline WeightPreset get_weight_preset(ScaleClass scale) {
return { 0.50f, 0.80f, 0.006f, 0.01f };
}
// classify_problem() 和 build_seq_registry() 定义在 ProblemConfig 之后
// classify_problem() and build_seq_registry() are defined after ProblemConfig
// ============================================================
// Solution<D1, D2> — 解的模板化表示
// Solution<D1, D2> — templated solution representation
// ============================================================
// D1: 行数上限 (TSP=1, VRP≤16, Schedule≤8)
// D2: 每行列数上限 (TSP≤64, 背包≤32)
// 每个 Problem 选择最小够用的 D1/D2编译器生成紧凑的结构
// D1: max number of rows (TSP=1, VRP≤16, Schedule≤8)
// D2: max columns per row (TSP≤64, knapsack≤32)
// Each Problem picks the smallest sufficient D1/D2; compiler emits a compact layout
template<int D1, int D2>
struct Solution {
static constexpr int DIM1 = D1; // 编译时行数上限
static constexpr int DIM2 = D2; // 编译时列数上限
int data[D1][D2]; // D1×D2×4 字节
int dim2_sizes[D1]; // D1×4 字节
float objectives[MAX_OBJ]; // 16 字节(固定)
float penalty; // 4 字节
static constexpr int DIM1 = D1; // compile-time max rows
static constexpr int DIM2 = D2; // compile-time max columns per row
int data[D1][D2]; // D1×D2×4 bytes
int dim2_sizes[D1]; // D1×4 bytes
float objectives[MAX_OBJ]; // 16 bytes (fixed)
float penalty; // 4 bytes
};
// ============================================================
// ProblemConfig — 问题的运行时元信息
// ProblemConfig — runtime metadata for a problem
// ============================================================
struct ProblemConfig {
EncodingType encoding;
int dim1; // 实际使用的行数 (≤ D1)
int dim2_default; // 实际使用的列数 (≤ D2)
int dim1; // actual number of rows used (≤ D1)
int dim2_default; // actual number of columns used (≤ D2)
int num_objectives;
ObjDir obj_dirs[MAX_OBJ];
float obj_weights[MAX_OBJ]; // Weighted 模式下的权重
// 多目标比较
float obj_weights[MAX_OBJ]; // weights in Weighted mode
// Multi-objective comparison
CompareMode compare_mode = CompareMode::Weighted;
int obj_priority[MAX_OBJ] = {0, 1, 2, 3}; // Lexicographic 模式下的比较顺序(索引)
float obj_tolerance[MAX_OBJ] = {0.0f, 0.0f, 0.0f, 0.0f}; // 字典法容差:差值 <= tol 视为相等
int obj_priority[MAX_OBJ] = {0, 1, 2, 3}; // comparison order in Lexicographic mode (indices)
float obj_tolerance[MAX_OBJ] = {0.0f, 0.0f, 0.0f, 0.0f}; // lexicographic tolerance: |diff| ≤ tol ⇒ tie
int value_lower_bound;
int value_upper_bound;
// v3.4: 统一行模式
RowMode row_mode = RowMode::Single; // 行模式Single/Fixed/Partition
float cross_row_prob = 0.0f; // 跨行 move 概率0=纯行内操作)
int total_elements = 0; // Partition 模式下的总元素数
int perm_repeat_count = 1; // 排列中每个值的重复次数1=标准排列,>1=多重集排列)
// v3.4: unified row mode
RowMode row_mode = RowMode::Single; // row mode (Single/Fixed/Partition)
float cross_row_prob = 0.0f; // probability of cross-row moves (0 = within-row only)
int total_elements = 0; // total elements in Partition mode
int perm_repeat_count = 1; // repeats per value in permutation (1 = standard; >1 = multiset)
};
// ============================================================
// SolverConfig — 求解器参数
// SolverConfig — solver parameters
// ============================================================
struct SolverConfig {
int pop_size = 0; // 种群大小0 = 自动匹配 GPU 最大并行度)
int pop_size = 0; // population size (0 = auto to max GPU parallelism)
int max_gen = 1000;
float mutation_rate = 0.1f;
unsigned seed = 42;
bool verbose = true;
int print_every = 100;
// 岛屿模型参数
int num_islands = 1; // 0 = 自适应1 = 纯爬山(无岛屿),>1 = 岛屿模型
int migrate_interval = 100; // 每隔多少代执行一次迁移
// Island model
int num_islands = 1; // 0 = adaptive, 1 = pure hill climbing (no islands), >1 = island model
int migrate_interval = 100; // migrate every this many generations
MigrateStrategy migrate_strategy = MigrateStrategy::Hybrid;
// 模拟退火参数
float sa_temp_init = 0.0f; // 初始温度0 = 禁用 SA纯爬山
float sa_alpha = 0.998f; // 冷却率(每代乘以 alpha
// v1.0: 交叉参数
float crossover_rate = 0.1f; // 每代中执行交叉的概率vs 变异)
// v2.0: 自适应算子选择
bool use_aos = false; // 启用 AOSbatch 间更新算子权重)
float aos_weight_floor = AOS_WEIGHT_FLOOR; // 运行时可覆盖的 floor
float aos_weight_cap = AOS_WEIGHT_CAP; // 运行时可覆盖的 cap
// v2.1: 初始解策略
int init_oversample = 4; // 采样倍数1 = 不做采样择优,即纯随机)
float init_random_ratio = 0.3f; // 纯随机解占比(多样性保底)
// v3.0: 工程可用性
float time_limit_sec = 0.0f; // 时间限制0 = 不限制,按 max_gen 跑完)
int stagnation_limit = 0; // 收敛检测:连续多少个 batch 无改进后 reheat0 = 禁用)
float reheat_ratio = 0.5f; // reheat 时温度恢复到初始温度的比例
// Simulated annealing
float sa_temp_init = 0.0f; // initial temperature (0 = disable SA, hill climb only)
float sa_alpha = 0.998f; // cooling rate (multiply by alpha each generation)
// v1.0: crossover
float crossover_rate = 0.1f; // probability of crossover per generation (vs mutation)
// v2.0: adaptive operator selection
bool use_aos = false; // enable AOS (update operator weights between batches)
float aos_weight_floor = AOS_WEIGHT_FLOOR; // runtime-overridable floor
float aos_weight_cap = AOS_WEIGHT_CAP; // runtime-overridable cap
// v2.1: initial solution strategy
int init_oversample = 4; // oversampling factor (1 = no sampling selection, pure random)
float init_random_ratio = 0.3f; // fraction of purely random solutions (diversity floor)
// v3.0: engineering usability
float time_limit_sec = 0.0f; // time limit in seconds (0 = none, run to max_gen)
int stagnation_limit = 0; // convergence: reheat after this many batches without improvement (0 = off)
float reheat_ratio = 0.5f; // on reheat, fraction of initial temperature to restore
// v3.5: CUDA Graph
bool use_cuda_graph = false; // 启用 CUDA Graph减少 kernel launch 开销)
// v3.6: AOS 更新频率控制
int aos_update_interval = 10; // 每隔多少个 batch 更新一次 AOS 权重(降低 cudaMemcpy 同步频率)
// v4.0: 约束导向 + 分层搜索
bool use_constraint_directed = false; // 启用约束导向(根据 penalty 比例动态调整跨行算子权重)
bool use_phased_search = false; // 启用分层搜索(按进度调整全局 floor/cap
// 分层搜索参数:三期阈值
float phase_explore_end = 0.30f; // 探索期结束(进度比例)
float phase_refine_start = 0.70f; // 精细期开始(进度比例)
// 约束导向参数
float constraint_boost_max = 2.5f; // 高约束时跨行算子 cap 提升倍率上限
// v5.0: 多 GPU 协同
int num_gpus = 1; // 使用的 GPU 数量1 = 单 GPU>1 = 多 GPU 协同)
float multi_gpu_interval_sec = 10.0f; // GPU 间交换最优解的时间间隔(秒)
MultiGpuInjectMode multi_gpu_inject_mode = MultiGpuInjectMode::HalfIslands; // 注入模式
bool use_cuda_graph = false; // enable CUDA Graph (fewer kernel launch overheads)
// v3.6: AOS update frequency
int aos_update_interval = 10; // update AOS weights every this many batches (lower cudaMemcpy sync rate)
// v4.0: constraint-directed + phased search
bool use_constraint_directed = false; // constraint-directed mode (scale cross-row weights by penalty ratio)
bool use_phased_search = false; // phased search (adjust global floor/cap by progress)
// Phased search: three-phase thresholds
float phase_explore_end = 0.30f; // end of exploration phase (progress fraction)
float phase_refine_start = 0.70f; // start of refinement phase (progress fraction)
// Constraint-directed parameters
float constraint_boost_max = 2.5f; // max multiplier boost for cross-row cap under high constraint
// v5.0: multi-GPU cooperation
int num_gpus = 1; // number of GPUs (1 = single GPU, >1 = multi-GPU)
float multi_gpu_interval_sec = 10.0f; // interval in seconds to exchange best solutions across GPUs
MultiGpuInjectMode multi_gpu_inject_mode = MultiGpuInjectMode::HalfIslands; // injection mode
};
// ============================================================
// classify_problem — 从 ProblemConfig 推断问题画像
// classify_problem — infer problem profile from ProblemConfig
// ============================================================
inline ProblemProfile classify_problem(const ProblemConfig& pcfg) {
@ -424,7 +419,7 @@ inline ProblemProfile classify_problem(const ProblemConfig& pcfg) {
}
// ============================================================
// build_seq_registry — 由 ProblemProfile 驱动的算子注册
// build_seq_registry — operator registration driven by ProblemProfile
// ============================================================
inline SeqRegistry build_seq_registry(const ProblemProfile& prof) {
@ -436,7 +431,10 @@ inline SeqRegistry build_seq_registry(const ProblemProfile& prof) {
}
auto add = [&](int id, float w, SeqCategory cat, float cap = 0.0f) {
if (reg.count >= MAX_SEQ) return;
if (reg.count >= MAX_SEQ) {
printf("[WARN] SeqRegistry full (MAX_SEQ=%d), ignoring SeqID %d\n", MAX_SEQ, id);
return;
}
reg.ids[reg.count] = id;
reg.weights[reg.count] = w;
reg.max_w[reg.count] = cap;
@ -514,7 +512,7 @@ inline SeqRegistry build_seq_registry(const ProblemProfile& prof) {
}
}
// 延迟归一化:只计算权重和,不归一化
// Lazy normalization: only sum weights; do not normalize here
reg.weights_sum = 0.0f;
for (int i = 0; i < reg.count; i++) {
reg.weights_sum += reg.weights[i];
@ -523,19 +521,19 @@ inline SeqRegistry build_seq_registry(const ProblemProfile& prof) {
}
// ============================================================
// ObjConfig — 传到 GPU 的目标比较配置(紧凑结构)
// ObjConfig — compact objective comparison config for GPU
// ============================================================
struct ObjConfig {
int num_obj;
CompareMode mode;
ObjDir dirs[MAX_OBJ]; // 每个目标的方向
float weights[MAX_OBJ]; // Weighted 模式下的权重
int priority[MAX_OBJ]; // Lexicographic 模式下的比较顺序
float tolerance[MAX_OBJ]; // Lexicographic 模式下的容差
ObjDir dirs[MAX_OBJ]; // direction per objective
float weights[MAX_OBJ]; // weights in Weighted mode
int priority[MAX_OBJ]; // comparison order in Lexicographic mode
float tolerance[MAX_OBJ]; // tolerance in Lexicographic mode
};
// 从 ProblemConfig 构造 ObjConfigCPU 端)
// Build ObjConfig from ProblemConfig (CPU side)
inline ObjConfig make_obj_config(const ProblemConfig& pcfg) {
ObjConfig oc;
oc.num_obj = pcfg.num_objectives;
@ -550,7 +548,7 @@ inline ObjConfig make_obj_config(const ProblemConfig& pcfg) {
}
// ============================================================
// SolveResult — solve() 的返回值
// SolveResult — return value of solve()
// ============================================================
enum class StopReason { MaxGen, TimeLimit, Stagnation };
@ -564,12 +562,12 @@ struct SolveResult {
};
// ============================================================
// 目标重要性映射 — 统一 Weighted / Lexicographic 的重要性度量
// Objective importance mapping — unified importance for Weighted / Lexicographic
// ============================================================
// 用于初始化选种NSGA-II 加权拥挤度 + 核心目标预留名额)
// Used for initial selection (NSGA-II weighted crowding + reserved slots for core objectives)
// Weighted: importance[i] = weight[i] / Σweight
// Lexicographic: importance[i] = 0.5^rank[i] / Σ(0.5^rank)
// → 第一优先级 ~57%,第二 ~29%,第三 ~14%
// → first priority ~57%, second ~29%, third ~14%
inline void compute_importance(const ObjConfig& oc, float* importance) {
float sum = 0.0f;
@ -590,26 +588,26 @@ inline void compute_importance(const ObjConfig& oc, float* importance) {
}
// ============================================================
// 比较工具 — 支持 Weighted / Lexicographic
// Comparison utilities — Weighted / Lexicographic
// ============================================================
// 将目标值统一为"越小越好"Maximize 目标取负
// Normalize objectives to "smaller is better": negate Maximize objectives
__device__ __host__ inline float normalize_obj(float val, ObjDir dir) {
return (dir == ObjDir::Maximize) ? -val : val;
}
// 核心比较a 是否优于 b
// v5.0: 添加 __host__ 支持多 GPU 在 CPU 端比较解
// Core comparison: whether a is better than b
// v5.0: add __host__ so multi-GPU can compare solutions on CPU
template<typename Sol>
__device__ __host__ inline bool is_better(const Sol& a, const Sol& b,
const ObjConfig& oc) {
// penalty 优先:可行解一定优于不可行解
// Penalty first: feasible beats infeasible
if (a.penalty <= 0.0f && b.penalty > 0.0f) return true;
if (a.penalty > 0.0f && b.penalty <= 0.0f) return false;
if (a.penalty > 0.0f && b.penalty > 0.0f) return a.penalty < b.penalty;
if (oc.mode == CompareMode::Weighted) {
// 加权求和权重已包含方向信息Maximize 目标用负权重,或由 normalize_obj 处理)
// Weighted sum: direction is folded in by normalize_obj (Maximize objectives negated)
float sum_a = 0.0f, sum_b = 0.0f;
for (int i = 0; i < oc.num_obj; i++) {
float na = normalize_obj(a.objectives[i], oc.dirs[i]);
@ -619,21 +617,22 @@ __device__ __host__ inline bool is_better(const Sol& a, const Sol& b,
}
return sum_a < sum_b;
} else {
// 字典法:按 priority 顺序逐目标比较
// Lexicographic: compare objectives in priority order
for (int p = 0; p < oc.num_obj; p++) {
int idx = oc.priority[p];
if (idx < 0 || idx >= oc.num_obj) continue;
float va = normalize_obj(a.objectives[idx], oc.dirs[idx]);
float vb = normalize_obj(b.objectives[idx], oc.dirs[idx]);
float diff = va - vb;
if (diff < -oc.tolerance[idx]) return true; // a 明显更好
if (diff > oc.tolerance[idx]) return false; // b 明显更好
// 在容差内视为相等 → 继续比较下一个目标
if (diff < -oc.tolerance[idx]) return true; // a clearly better
if (diff > oc.tolerance[idx]) return false; // b clearly better
// Within tolerance → tie, continue to next objective
}
return false; // 所有目标都在容差内相等
return false; // all objectives tied within tolerance
}
}
// 标量化SA 接受概率用):返回越小越好的标量
// Scalarization (for SA acceptance): smaller is better
template<typename Sol>
__device__ __host__ inline float scalar_objective(const Sol& sol,
const ObjConfig& oc) {
@ -643,13 +642,14 @@ __device__ __host__ inline float scalar_objective(const Sol& sol,
sum += oc.weights[i] * normalize_obj(sol.objectives[i], oc.dirs[i]);
return sum;
} else {
// 字典法下 SA 用第一优先级目标作为标量
// Lexicographic mode: SA uses the first-priority objective as the scalar
int idx = oc.priority[0];
if (idx < 0 || idx >= oc.num_obj) idx = 0;
return normalize_obj(sol.objectives[idx], oc.dirs[idx]);
}
}
// 轻量比较:直接操作 float[] 目标数组(避免复制整个 Sol
// Lightweight comparison: operate on float[] objectives (avoid copying full Sol)
__device__ inline bool obj_is_better(const float* new_objs, const float* old_objs,
const ObjConfig& oc) {
if (oc.mode == CompareMode::Weighted) {
@ -662,6 +662,7 @@ __device__ inline bool obj_is_better(const float* new_objs, const float* old_obj
} else {
for (int p = 0; p < oc.num_obj; p++) {
int idx = oc.priority[p];
if (idx < 0 || idx >= oc.num_obj) continue;
float va = normalize_obj(new_objs[idx], oc.dirs[idx]);
float vb = normalize_obj(old_objs[idx], oc.dirs[idx]);
float diff = va - vb;
@ -672,7 +673,7 @@ __device__ inline bool obj_is_better(const float* new_objs, const float* old_obj
}
}
// 轻量标量化:直接操作 float[] 目标数组
// Lightweight scalarization: operate on float[] objectives
__device__ __host__ inline float obj_scalar(const float* objs, const ObjConfig& oc) {
if (oc.mode == CompareMode::Weighted) {
float sum = 0.0f;
@ -681,60 +682,61 @@ __device__ __host__ inline float obj_scalar(const float* objs, const ObjConfig&
return sum;
} else {
int idx = oc.priority[0];
if (idx < 0 || idx >= oc.num_obj) idx = 0;
return normalize_obj(objs[idx], oc.dirs[idx]);
}
}
// ============================================================
// AOSStats — 自适应算子选择统计(每个 block 一份)
// AOSStats — adaptive operator selection stats (one per block)
// ============================================================
// v3.0: 粒度从 3 层 → MAX_SEQ 个序列
// 记录每个序列的使用次数和改进次数
// batch 结束后由 host 聚合,更新 SeqRegistry 权重
// v3.0: granularity from 3 layers → MAX_SEQ sequences
// Records per-sequence usage and improvement counts
// Host aggregates after each batch and updates SeqRegistry weights
struct AOSStats {
// 算子层统计(第二层)
int usage[MAX_SEQ]; // 各序列使用次数
int improvement[MAX_SEQ]; // 各序列改进次数delta < 0 且被接受)
// K 步数层统计(第一层)
int k_usage[MAX_K]; // K=1,2,3 各自使用次数
int k_improvement[MAX_K]; // K=1,2,3 各自改进次数
// Operator-level stats (second layer)
int usage[MAX_SEQ]; // per-sequence usage counts
int improvement[MAX_SEQ]; // per-sequence improvements (delta < 0 and accepted)
// K-step layer stats (first layer)
int k_usage[MAX_K]; // usage counts for K=1,2,3
int k_improvement[MAX_K]; // improvement counts for K=1,2,3
};
// ============================================================
// ObjDef — 单个目标的定义(编译期常量)
// ObjDef — single-objective definition (compile-time constant)
// ============================================================
struct ObjDef {
ObjDir dir; // 优化方向
float weight; // Weighted 模式下的权重
float tolerance; // Lexicographic 模式下的容差
ObjDir dir; // optimization direction
float weight; // weight in Weighted mode
float tolerance; // tolerance in Lexicographic mode
};
// ============================================================
// HeuristicMatrix — 启发式初始解构造用的数据矩阵描述
// HeuristicMatrix — data matrix descriptor for heuristic initial solutions
// ============================================================
struct HeuristicMatrix {
const float* data; // host 端 N*N 矩阵
int N; // 维度
const float* data; // N×N matrix on host
int N; // dimension
};
// ============================================================
// ProblemBase<Derived, D1, D2> — CRTP 基类
// ProblemBase<Derived, D1, D2> — CRTP base class
//
// 用户继承此基类,提供:
// static constexpr ObjDef OBJ_DEFS[] = {...}; — 目标元信息
// __device__ float compute_obj(int idx, ...) const; — 目标分发
// Users inherit this base and provide:
// static constexpr ObjDef OBJ_DEFS[] = {...}; — objective metadata
// __device__ float compute_obj(int idx, ...) const; — objective dispatch
// __device__ float compute_penalty(...) const;
//
// 约定OBJ_DEFS 和 compute_obj 紧挨着写case N 对应 OBJ_DEFS[N]
// NUM_OBJ 由 sizeof(OBJ_DEFS) 自动推导,无需手动维护
// Convention: OBJ_DEFS and compute_obj stay aligned; case N maps to OBJ_DEFS[N]
// NUM_OBJ is derived from sizeof(OBJ_DEFS); no manual count
//
// 基类自动提供:
// evaluate(sol) — 遍历目标列表调用 compute_obj
// fill_obj_config(cfg) — 从 OBJ_DEFS 自动填充 ProblemConfig
// obj_config() — 直接生成 ObjConfig
// Base class provides:
// evaluate(sol) — loop objectives and call compute_obj
// fill_obj_config(cfg) — fill ProblemConfig from OBJ_DEFS
// obj_config() — build ObjConfig directly
// ============================================================
template<typename Derived, int D1_, int D2_>
@ -743,10 +745,10 @@ struct ProblemBase {
static constexpr int D2 = D2_;
using Sol = Solution<D1, D2>;
// NUM_OBJ 从 OBJ_DEFS 数组自动推导
// NUM_OBJ derived from OBJ_DEFS array size
static constexpr int NUM_OBJ = sizeof(Derived::OBJ_DEFS) / sizeof(ObjDef);
// 自动评估:遍历目标列表
// Automatic evaluation: iterate objectives
__device__ void evaluate(Sol& sol) const {
const auto& self = static_cast<const Derived&>(*this);
constexpr int n = sizeof(Derived::OBJ_DEFS) / sizeof(ObjDef);
@ -755,7 +757,7 @@ struct ProblemBase {
sol.penalty = self.compute_penalty(sol);
}
// 从 OBJ_DEFS 自动填充 ProblemConfig 的目标部分
// Fill objective fields of ProblemConfig from OBJ_DEFS
void fill_obj_config(ProblemConfig& cfg) const {
constexpr int n = sizeof(Derived::OBJ_DEFS) / sizeof(ObjDef);
cfg.num_objectives = n;
@ -763,59 +765,59 @@ struct ProblemBase {
cfg.obj_dirs[i] = Derived::OBJ_DEFS[i].dir;
cfg.obj_weights[i] = Derived::OBJ_DEFS[i].weight;
cfg.obj_tolerance[i] = Derived::OBJ_DEFS[i].tolerance;
cfg.obj_priority[i] = i; // 列表顺序即优先级
cfg.obj_priority[i] = i; // list order is priority order
}
}
// 直接生成 ObjConfig供 solver 使用)
// Build ObjConfig directly (for solver)
ObjConfig obj_config() const {
ProblemConfig pcfg;
fill_obj_config(pcfg);
return make_obj_config(pcfg);
}
// 可选:返回 shared memory 需求(字节)
// 默认返回 0不使用 shared memory
// 子类覆盖:如果问题数据可以放入 shared memory返回实际大小
// Optional: shared memory requirement (bytes)
// Default 0 (no shared memory)
// Override if problem data fits in shared memory; return actual size
size_t shared_mem_bytes() const {
return 0;
}
// 可选:加载问题数据到 shared memory
// 默认空实现(不使用 shared memory
// 子类覆盖:如果 shared_mem_bytes() > 0实现数据加载逻辑
// Optional: load problem data into shared memory
// Default no-op (no shared memory)
// Override if shared_mem_bytes() > 0 to implement loading
__device__ void load_shared(char* smem, int tid, int bsz) {
(void)smem; (void)tid; (void)bsz; // 默认:不做任何事
(void)smem; (void)tid; (void)bsz; // default: no-op
}
// 每个 block 在 global memory 中的热数据工作集大小(字节)
// 用于 auto pop_size 估算 L2 cache 压力
// 默认 = shared_mem_bytes()(数据在 smem 时gmem 工作集为 0 不影响)
// 子类覆盖:当 shared_mem_bytes() 返回 0数据放不进 smem
// 返回实际数据大小(如距离矩阵 n*n*sizeof(float)
// Hot working-set size in global memory per block (bytes)
// Used for auto pop_size L2 cache pressure estimate
// Default = shared_mem_bytes() (when data is in smem, gmem working set is 0)
// Override when shared_mem_bytes() is 0 (data does not fit in smem):
// return actual data size (e.g. distance matrix n*n*sizeof(float))
size_t working_set_bytes() const {
return static_cast<const Derived&>(*this).shared_mem_bytes();
}
// 可选:初始化 G/O 关系矩阵(为 GUIDED_REBUILD 提供先验知识)
// G[i*N+j]: 元素 i 和 j 的分组倾向(对称,[0,1],越大越倾向同组)
// O[i*N+j]: 元素 i 排在 j 前面的倾向(不对称,[0,1]
// 默认不提供(全零),搜索过程中通过 EMA 从历史好解积累
// 用户覆盖示例:距离近 → G 和 O 都高
// Optional: initialize G/O relation matrix (prior for GUIDED_REBUILD)
// G[i*N+j]: grouping tendency of i and j (symmetric, [0,1]; higher → same group)
// O[i*N+j]: tendency for i before j (asymmetric, [0,1])
// Default none (zeros); EMA accumulates from good solutions during search
// Example override: close distance → high G and O
void init_relation_matrix(float* h_G, float* h_O, int N) const {
(void)h_G; (void)h_O; (void)N; // 默认:不做任何事(保持全零)
(void)h_G; (void)h_O; (void)N; // default: no-op (keep zeros)
}
// 可选:返回 host 端数据矩阵供启发式初始解构造
// 默认返回 0不提供子类 override 后填充 out 数组并返回实际数量
// Optional: host-side data matrices for heuristic initial solutions
// Default 0 (none); override to fill out[] and return count
int heuristic_matrices(HeuristicMatrix* out, int max_count) const {
(void)out; (void)max_count;
return 0;
}
// v5.0: 多 GPU 协同 — 克隆 Problem 到指定 GPU
// 子类需实现cudaSetDevice(gpu_id) + 分配设备内存 + 拷贝数据
// 返回新的 Problem 实例指针(在 host 端,但其内部设备指针指向 gpu_id
// v5.0: multi-GPU — clone Problem to a given GPU
// Subclasses implement: cudaSetDevice(gpu_id) + device alloc + copy
// Returns new Problem* on host; internal device pointers target gpu_id
virtual Derived* clone_to_device(int gpu_id) const {
(void)gpu_id;
fprintf(stderr, "Error: clone_to_device() not implemented for this Problem type\n");

View file

@ -1,7 +1,7 @@
/**
* assignment.cuh - 指派问题
*
* 继承 ProblemBase使用 ObjDef 目标注册机制
* assignment.cuh - assignment problem
*
* Extends ProblemBase with ObjDef objective registration.
*/
#pragma once
@ -11,10 +11,10 @@
struct AssignmentProblem : ProblemBase<AssignmentProblem, 1, 16> {
const float* d_cost;
const float* h_cost; // host 端成本矩阵(用于 init_relation_matrix
const float* h_cost; // host cost matrix (for init_relation_matrix)
int n;
// ---- 目标计算 ----
// ---- objective evaluation ----
__device__ float calc_total_cost(const Sol& sol) const {
float total = 0.0f;
const int* assign = sol.data[0];
@ -24,7 +24,7 @@ struct AssignmentProblem : ProblemBase<AssignmentProblem, 1, 16> {
return total;
}
// ---- 目标定义OBJ_DEFS 与 compute_obj 必须一一对应)----
// ---- objective defs (OBJ_DEFS must match compute_obj one-to-one) ----
static constexpr ObjDef OBJ_DEFS[] = {
{ObjDir::Minimize, 1.0f, 0.0f}, // case 0: calc_total_cost
};
@ -47,7 +47,7 @@ struct AssignmentProblem : ProblemBase<AssignmentProblem, 1, 16> {
return cfg;
}
// ---- shared memory 接口 ----
// ---- shared memory interface ----
static constexpr size_t SMEM_LIMIT = 48 * 1024;
size_t shared_mem_bytes() const {
@ -66,12 +66,12 @@ struct AssignmentProblem : ProblemBase<AssignmentProblem, 1, 16> {
d_cost = sc;
}
// 成本先验task j 和 task k 如果被相似 agent 偏好G 值高
// O 矩阵task j 在位置 i 成本低 → O[j][k] 略高j 倾向排在 k 前面的位置)
// Cost prior: if tasks j and k are similarly preferred by agents, G is high
// O matrix: low cost for task j at slot i → slightly higher O[j][k] (j tends to precede k)
void init_relation_matrix(float* G, float* O, int N) const {
if (!h_cost || N != n) return;
// 对每个 task构建成本向量task 间余弦相似度 → G
// 简化:成本列向量的相关性
// Per task, build cost vectors; cosine similarity between tasks → G
// Simplified: correlation of cost columns
float max_c = 0.0f;
for (int i = 0; i < N * N; i++)
if (h_cost[i] > max_c) max_c = h_cost[i];
@ -80,7 +80,7 @@ struct AssignmentProblem : ProblemBase<AssignmentProblem, 1, 16> {
for (int j = 0; j < N; j++)
for (int k = 0; k < N; k++) {
if (j == k) continue;
// G: 两个 task 的成本向量越相似 → 越可能互换
// G: more similar cost columns → more likely to swap tasks
float dot = 0.0f, nj = 0.0f, nk = 0.0f;
for (int i = 0; i < N; i++) {
float cj = h_cost[i * N + j] / max_c;

View file

@ -1,13 +1,13 @@
/**
* bin_packing.cuh - 一维装箱问题Integer 编码 + 约束)
*
* N 个物品,每个重量 w[i],装入最多 B 个箱子,每个箱子容量 C。
* 决策变量data[0][i] ∈ [0, B-1],表示物品 i 放入的箱子编号。
* 目标:最小化使用的箱子数。
* 约束:每个箱子总重不超过 C超出部分作为 penalty。
*
* 验证实例8 物品 weights=[7,5,3,4,6,2,8,1], C=10, 最优=4 箱
* 箱0={7,3}=10, 箱1={5,4,1}=10, 箱2={6,2}=8, 箱3={8}=8
* bin_packing.cuh - one-dimensional bin packing (Integer encoding + constraints)
*
* N items with weights w[i], at most B bins, capacity C per bin.
* Decision: data[0][i] in [0, B-1] = bin index for item i.
* Objective: minimize number of bins used.
* Constraint: bin load ≤ C; overflow contributes to penalty.
*
* Validation instance: 8 items weights=[7,5,3,4,6,2,8,1], C=10, optimum=4 bins
* bin0={7,3}=10, bin1={5,4,1}=10, bin2={6,2}=8, bin3={8}=8
*/
#pragma once
@ -16,9 +16,9 @@
struct BinPackingProblem : ProblemBase<BinPackingProblem, 1, 64> {
const float* d_weights;
int n; // 物品数
int max_bins; // 最大箱子数 B
float capacity; // 箱子容量 C
int n; // number of items
int max_bins; // max bins B
float capacity; // bin capacity C
__device__ float calc_bins_used(const Sol& sol) const {
bool used[32] = {};

View file

@ -1,11 +1,11 @@
/**
* graph_color.cuh - 图着色问题Integer 编码)
*
* N 个节点的图,用 k 种颜色着色。
* 决策变量data[0][i] ∈ [0, k-1],表示节点 i 的颜色。
* 目标:最小化冲突边数(相邻节点同色的边数)。
*
* 验证实例Petersen 图10 节点 15 边,色数=3最优冲突=0
* graph_color.cuh - graph coloring (Integer encoding)
*
* Graph on N nodes, k colors.
* Decision: data[0][i] in [0, k-1] = color of node i.
* Objective: minimize number of conflicting edges (adjacent same color).
*
* Validation instance: Petersen graph (10 nodes, 15 edges, chromatic number 3, optimal conflicts=0)
*/
#pragma once
@ -13,9 +13,9 @@
#include "cuda_utils.cuh"
struct GraphColorProblem : ProblemBase<GraphColorProblem, 1, 64> {
const int* d_adj; // 邻接矩阵 [N*N]1=相邻, 0=不相邻)
int n; // 节点数
int k; // 颜色数
const int* d_adj; // adjacency [N*N] (1=edge, 0=no edge)
int n; // number of nodes
int k; // number of colors
__device__ float calc_conflicts(const Sol& sol) const {
int conflicts = 0;

View file

@ -1,26 +1,26 @@
/**
* jsp.cuh - 车间调度问题 (Job Shop Scheduling Problem)
*
* J 个工件,每个工件有 O 道工序,每道工序指定机器和耗时。
*
* === 编码方案 AInteger 多行(时间表编码)===
* JSPProblem: data[j][i] = 工件 j 的第 i 道工序的开始时间
* jsp.cuh - Job Shop Scheduling Problem (JSSP)
*
* J jobs, each with O operations; each op specifies machine and duration.
*
* === Encoding A: multi-row Integer (time-table encoding) ===
* JSPProblem: data[j][i] = start time of job j's i-th operation
* dim1 = num_jobs, dim2_default = num_ops
* row_mode = Fixed(禁止 ROW_SPLIT/ROW_MERGE
* 每行代表一个工件的固定工序序列,行长度不可变
*
* === 编码方案 BPermutation 多重集(工序排列编码)===
* JSPPermProblem: data[0][k] = 工件编号0..J-1长度 J*O
* 值 j 出现 O 次。从左到右扫描,第 t 次遇到值 j 表示工件 j 的第 t 道工序。
* row_mode = Fixed (no ROW_SPLIT/ROW_MERGE)
* Each row is a fixed op sequence for one job; row length is fixed.
*
* === Encoding B: Permutation multiset (operation sequence encoding) ===
* JSPPermProblem: data[0][k] = job id (0..J-1), length J*O
* Value j appears O times. Left-to-right scan: t-th occurrence of j is job j's t-th op.
* dim1 = 1, dim2_default = J*O, perm_repeat_count = O
* 标准 Permutation 算子swap/reverse/insert天然保持多重集结构
*
* 目标Minimize makespan所有工件完成时间的最大值
* 约束:
* (a) 工序顺序:同一工件的工序必须按序执行
* (b) 机器冲突:同一机器同一时刻只能处理一个工序
*
* 验证实例:自定义 3 工件 3 机器 (3x3),最优 makespan = 12
* Standard permutation ops (swap/reverse/insert) preserve multiset structure.
*
* Objective: minimize makespan (max completion time over jobs).
* Constraints:
* (a) Precedence: ops of the same job must run in order.
* (b) Machine conflict: one op per machine at a time.
*
* Validation instance: custom 3 jobs × 3 machines (3x3), optimal makespan = 12
*/
#pragma once
@ -28,16 +28,16 @@
#include "cuda_utils.cuh"
// ============================================================
// 编码方案 AInteger 多行(时间表编码)
// Encoding A: multi-row Integer (time-table encoding)
// ============================================================
struct JSPProblem : ProblemBase<JSPProblem, 8, 16> {
const int* d_machine; // 工序所需机器 [J*O]
const float* d_duration; // 工序耗时 [J*O]
int num_jobs; // 工件数 J
int num_ops; // 每工件工序数 O
int num_machines; // 机器数 M
int time_horizon; // 时间上界
const int* d_machine; // machine per op [J*O]
const float* d_duration; // op duration [J*O]
int num_jobs; // number of jobs J
int num_ops; // ops per job O
int num_machines; // number of machines M
int time_horizon; // time horizon upper bound
__device__ float calc_makespan(const Sol& sol) const {
float makespan = 0.0f;
@ -62,7 +62,7 @@ struct JSPProblem : ProblemBase<JSPProblem, 8, 16> {
__device__ float compute_penalty(const Sol& sol) const {
float penalty = 0.0f;
// (a) 工序顺序约束
// (a) Precedence constraints
for (int j = 0; j < num_jobs; j++) {
for (int i = 1; i < num_ops; i++) {
float prev_end = (float)sol.data[j][i-1] + d_duration[j * num_ops + (i-1)];
@ -72,7 +72,7 @@ struct JSPProblem : ProblemBase<JSPProblem, 8, 16> {
}
}
// (b) 机器冲突约束
// (b) Machine conflict constraints
int total = num_jobs * num_ops;
for (int a = 0; a < total; a++) {
int ja = a / num_ops, ia = a % num_ops;
@ -151,28 +151,28 @@ struct JSPProblem : ProblemBase<JSPProblem, 8, 16> {
};
// ============================================================
// 编码方案 BPermutation 多重集(工序排列编码)
// Encoding B: Permutation multiset (operation sequence encoding)
// ============================================================
// data[0] 是长度 J*O 的排列,值域 [0, J),每个值出现 O 次
// 从左到右扫描:第 t 次遇到值 j → 安排工件 j 的第 t 道工序
// 贪心解码:每道工序安排在"最早可行时间"(满足工序顺序 + 机器空闲)
// data[0] is a length-J*O sequence with values in [0, J), each appearing O times.
// Left-to-right: t-th occurrence of j schedules job j's t-th operation.
// Greedy decode: each op at earliest feasible time (precedence + machine free).
struct JSPPermProblem : ProblemBase<JSPPermProblem, 1, 64> {
const int* d_machine; // 工序所需机器 [J*O]
const float* d_duration; // 工序耗时 [J*O]
const int* d_machine; // machine per op [J*O]
const float* d_duration; // op duration [J*O]
int num_jobs;
int num_ops;
int num_machines;
// 贪心解码:从排列生成调度方案,返回 makespan
// Greedy decode: build schedule from permutation, return makespan
__device__ float decode_and_makespan(const Sol& sol) const {
int total = num_jobs * num_ops;
int size = sol.dim2_sizes[0];
if (size < total) return 1e9f;
float job_avail[8]; // 每个工件的下一道工序最早开始时间
float mach_avail[8]; // 每台机器的最早空闲时间
int job_next_op[8]; // 每个工件的下一道待安排工序编号
float job_avail[8]; // earliest start for next op of each job
float mach_avail[8]; // earliest machine free time
int job_next_op[8]; // next op index to schedule per job
for (int j = 0; j < num_jobs; j++) { job_avail[j] = 0.0f; job_next_op[j] = 0; }
for (int m = 0; m < num_machines; m++) mach_avail[m] = 0.0f;
@ -182,13 +182,13 @@ struct JSPPermProblem : ProblemBase<JSPPermProblem, 1, 64> {
int j = sol.data[0][k];
if (j < 0 || j >= num_jobs) return 1e9f;
int op = job_next_op[j];
if (op >= num_ops) continue; // 该工件已安排完
if (op >= num_ops) continue; // job already fully scheduled
int flat = j * num_ops + op;
int m = d_machine[flat];
float dur = d_duration[flat];
// 最早开始时间 = max(工件前序完成, 机器空闲)
// Earliest start = max(job predecessor done, machine free)
float start = fmaxf(job_avail[j], mach_avail[m]);
float end = start + dur;
@ -212,7 +212,7 @@ struct JSPPermProblem : ProblemBase<JSPPermProblem, 1, 64> {
}
}
// 贪心解码天然满足约束penalty 始终为 0
// Greedy decode satisfies constraints; penalty is always 0
__device__ float compute_penalty(const Sol& sol) const {
return 0.0f;
}

View file

@ -1,7 +1,7 @@
/**
* knapsack.cuh - 0-1 背包问题
*
* 继承 ProblemBase使用 ObjDef 目标注册机制
* knapsack.cuh - 0-1 knapsack
*
* Extends ProblemBase with ObjDef objective registration.
*/
#pragma once
@ -10,13 +10,13 @@
#include "operators.cuh"
struct KnapsackProblem : ProblemBase<KnapsackProblem, 1, 32> {
// 问题数据d_weights 是物品重量,非目标权重)
// problem data (d_weights are item weights, not objective weights)
const float* d_weights;
const float* d_values;
float capacity;
int n;
// ---- 目标计算 ----
// ---- objective evaluation ----
__device__ float calc_total_value(const Sol& sol) const {
float tv = 0.0f;
const int* sel = sol.data[0];
@ -26,7 +26,7 @@ struct KnapsackProblem : ProblemBase<KnapsackProblem, 1, 32> {
return tv;
}
// ---- 目标定义OBJ_DEFS 与 compute_obj 必须一一对应)----
// ---- objective defs (OBJ_DEFS must match compute_obj one-to-one) ----
static constexpr ObjDef OBJ_DEFS[] = {
{ObjDir::Maximize, 1.0f, 0.0f}, // case 0: calc_total_value
};
@ -55,7 +55,7 @@ struct KnapsackProblem : ProblemBase<KnapsackProblem, 1, 32> {
return cfg;
}
// ---- shared memory 接口 ----
// ---- shared memory interface ----
size_t shared_mem_bytes() const {
return 2 * (size_t)n * sizeof(float);
}

View file

@ -1,12 +1,12 @@
/**
* load_balance.cuh - 离散负载均衡问题Integer 编码验证)
*
* N 个任务分配到 M 台机器,每个任务有一个处理时间 p[i]。
* 决策变量data[0][i] ∈ [0, M-1],表示任务 i 分配到哪台机器。
* 目标:最小化 makespan最大机器负载
*
* 已知 NP-hard等价于 multiprocessor scheduling / load balancing
* LPT(最长处理时间优先)贪心可得 4/3 近似。
* load_balance.cuh - discrete load balancing (Integer encoding sanity check)
*
* N tasks on M machines, processing time p[i] per task.
* Decision: data[0][i] in [0, M-1] = machine for task i.
* Objective: minimize makespan (max machine load).
*
* NP-hard (same as multiprocessor scheduling / load balancing).
* LPT (longest processing time first) greedy achieves 4/3 approximation.
*/
#pragma once
@ -14,12 +14,12 @@
#include "cuda_utils.cuh"
struct LoadBalanceProblem : ProblemBase<LoadBalanceProblem, 1, 64> {
const float* d_proc_time; // 任务处理时间 [N]
int n; // 任务数
int m; // 机器数
const float* d_proc_time; // task processing times [N]
int n; // number of tasks
int m; // number of machines
__device__ float calc_makespan(const Sol& sol) const {
float load[32] = {}; // 最多 32 台机器
float load[32] = {}; // at most 32 machines
int size = sol.dim2_sizes[0];
for (int i = 0; i < size; i++) {
int machine = sol.data[0][i];
@ -43,7 +43,7 @@ struct LoadBalanceProblem : ProblemBase<LoadBalanceProblem, 1, 64> {
}
__device__ float compute_penalty(const Sol& sol) const {
return 0.0f; // 无约束(任何分配都合法)
return 0.0f; // no side constraints (any assignment is feasible)
}
ProblemConfig config() const {

View file

@ -1,14 +1,14 @@
/**
* qap.cuh - 二次分配问题 (Quadratic Assignment Problem)
*
* N 个设施分配到 N 个位置(排列编码)。
* 决策变量data[0][i] = 设施 i 分配到的位置。
* 目标:Minimize sum(flow[i][j] * dist[perm[i]][perm[j]])
*
* 验证实例:自定义 5x5
* flow: 设施间的物流量
* dist: 位置间的距离
* 已知最优 = 58
* qap.cuh - Quadratic Assignment Problem (QAP)
*
* Assign N facilities to N locations (permutation encoding).
* Decision: data[0][i] = location assigned to facility i.
* Objective: Minimize sum(flow[i][j] * dist[perm[i]][perm[j]])
*
* Validation instance: custom 5x5
* flow: inter-facility flow
* dist: inter-location distances
* known optimum = 58
*/
#pragma once
@ -16,8 +16,10 @@
#include "cuda_utils.cuh"
struct QAPProblem : ProblemBase<QAPProblem, 1, 32> {
const float* d_flow; // 物流量矩阵 [N*N]
const float* d_dist; // 距离矩阵 [N*N]
const float* d_flow; // flow matrix [N*N] (device)
const float* d_dist; // distance matrix [N*N] (device)
const float* h_flow; // flow matrix [N*N] (host, for clone_to_device)
const float* h_dist; // distance matrix [N*N] (host, for clone_to_device)
int n;
__device__ float calc_cost(const Sol& sol) const {
@ -64,14 +66,16 @@ struct QAPProblem : ProblemBase<QAPProblem, 1, 32> {
d_dist = sd;
}
static QAPProblem create(const float* h_flow, const float* h_dist, int n) {
static QAPProblem create(const float* h_flow_in, const float* h_dist_in, int n) {
QAPProblem prob;
prob.n = n;
prob.h_flow = h_flow_in;
prob.h_dist = h_dist_in;
float *df, *dd;
CUDA_CHECK(cudaMalloc(&df, sizeof(float) * n * n));
CUDA_CHECK(cudaMalloc(&dd, sizeof(float) * n * n));
CUDA_CHECK(cudaMemcpy(df, h_flow, sizeof(float) * n * n, cudaMemcpyHostToDevice));
CUDA_CHECK(cudaMemcpy(dd, h_dist, sizeof(float) * n * n, cudaMemcpyHostToDevice));
CUDA_CHECK(cudaMemcpy(df, h_flow_in, sizeof(float) * n * n, cudaMemcpyHostToDevice));
CUDA_CHECK(cudaMemcpy(dd, h_dist_in, sizeof(float) * n * n, cudaMemcpyHostToDevice));
prob.d_flow = df; prob.d_dist = dd;
return prob;
}
@ -82,18 +86,12 @@ struct QAPProblem : ProblemBase<QAPProblem, 1, 32> {
d_flow = nullptr; d_dist = nullptr;
}
// v5.0: 多 GPU 协同 — 克隆到指定 GPU
// v5.0: multi-GPU — clone onto a given device
QAPProblem* clone_to_device(int gpu_id) const override {
int orig_device;
CUDA_CHECK(cudaGetDevice(&orig_device));
// 先下载数据到 host从当前设备
float* h_flow = new float[n * n];
float* h_dist = new float[n * n];
CUDA_CHECK(cudaMemcpy(h_flow, d_flow, sizeof(float) * n * n, cudaMemcpyDeviceToHost));
CUDA_CHECK(cudaMemcpy(h_dist, d_dist, sizeof(float) * n * n, cudaMemcpyDeviceToHost));
// 切换到目标 GPU 并上传
// Use host-side matrices directly (no D2H needed)
CUDA_CHECK(cudaSetDevice(gpu_id));
float *df, *dd;
CUDA_CHECK(cudaMalloc(&df, sizeof(float) * n * n));
@ -101,15 +99,12 @@ struct QAPProblem : ProblemBase<QAPProblem, 1, 32> {
CUDA_CHECK(cudaMemcpy(df, h_flow, sizeof(float) * n * n, cudaMemcpyHostToDevice));
CUDA_CHECK(cudaMemcpy(dd, h_dist, sizeof(float) * n * n, cudaMemcpyHostToDevice));
delete[] h_flow;
delete[] h_dist;
// 恢复原设备
CUDA_CHECK(cudaSetDevice(orig_device));
// 创建新实例
QAPProblem* new_prob = new QAPProblem();
new_prob->n = n;
new_prob->h_flow = h_flow;
new_prob->h_dist = h_dist;
new_prob->d_flow = df;
new_prob->d_dist = dd;

View file

@ -1,8 +1,8 @@
/**
* schedule.cuh - 排班问题
*
* 继承 ProblemBase使用 ObjDef 目标注册机制
* 2 个目标总成本min+ 不公平度min权重更高
* schedule.cuh - staff scheduling
*
* Extends ProblemBase with ObjDef objective registration.
* Two objectives: total cost (min) + unfairness (min, higher weight).
*/
#pragma once
@ -14,7 +14,7 @@ struct ScheduleProblem : ProblemBase<ScheduleProblem, 8, 16> {
const float* d_cost;
int days, emps, required;
// ---- 目标计算 ----
// ---- objective evaluation ----
__device__ float calc_total_cost(const Sol& sol) const {
float total = 0.0f;
for (int d = 0; d < days; d++)
@ -37,7 +37,7 @@ struct ScheduleProblem : ProblemBase<ScheduleProblem, 8, 16> {
return (float)(max_w - min_w);
}
// ---- 目标定义OBJ_DEFS 与 compute_obj 必须一一对应)----
// ---- objective defs (OBJ_DEFS must match compute_obj one-to-one) ----
static constexpr ObjDef OBJ_DEFS[] = {
{ObjDir::Minimize, 1.0f, 0.0f}, // case 0: calc_total_cost
{ObjDir::Minimize, 5.0f, 0.0f}, // case 1: calc_unfairness
@ -71,9 +71,9 @@ struct ScheduleProblem : ProblemBase<ScheduleProblem, 8, 16> {
return cfg;
}
// 默认回退全量(基类行为)— 不需要覆盖 evaluate_move
// Default full re-eval (base behavior) — no need to override evaluate_move
// ---- shared memory 接口 ----
// ---- shared memory interface ----
size_t shared_mem_bytes() const {
return (size_t)days * emps * sizeof(float);
}

View file

@ -1,7 +1,7 @@
/**
* tsp.cuh - TSP 问题定义
*
* 继承 ProblemBase使用 ObjDef 目标注册机制
* tsp.cuh - Traveling Salesman Problem (TSP) definition
*
* Extends ProblemBase with ObjDef objective registration.
*/
#pragma once
@ -10,12 +10,12 @@
#include "operators.cuh"
struct TSPProblem : ProblemBase<TSPProblem, 1, 64> {
// 问题数据
// problem data
const float* d_dist;
const float* h_dist; // host 端距离矩阵(用于 init_relation_matrix
const float* h_dist; // host distance matrix (for init_relation_matrix)
int n;
// ---- 目标计算 ----
// ---- objective evaluation ----
__device__ float calc_total_distance(const Sol& sol) const {
float total = 0.0f;
const int* route = sol.data[0];
@ -25,7 +25,7 @@ struct TSPProblem : ProblemBase<TSPProblem, 1, 64> {
return total;
}
// ---- 目标定义OBJ_DEFS 与 compute_obj 必须一一对应)----
// ---- objective defs (OBJ_DEFS must match compute_obj one-to-one) ----
static constexpr ObjDef OBJ_DEFS[] = {
{ObjDir::Minimize, 1.0f, 0.0f}, // case 0: calc_total_distance
};
@ -37,10 +37,10 @@ struct TSPProblem : ProblemBase<TSPProblem, 1, 64> {
}
__device__ float compute_penalty(const Sol& sol) const {
return 0.0f; // TSP 无约束
return 0.0f; // TSP has no side constraints
}
// ---- config(编码/维度部分,目标由基类自动填充)----
// ---- config (encoding/dims; objectives filled by base class) ----
ProblemConfig config() const {
ProblemConfig cfg;
cfg.encoding = EncodingType::Permutation;
@ -49,7 +49,7 @@ struct TSPProblem : ProblemBase<TSPProblem, 1, 64> {
return cfg;
}
// ---- shared memory 接口 ----
// ---- shared memory interface ----
static constexpr size_t SMEM_LIMIT = 48 * 1024;
size_t shared_mem_bytes() const {
@ -69,7 +69,7 @@ struct TSPProblem : ProblemBase<TSPProblem, 1, 64> {
d_dist = sd;
}
// 距离先验:距离近 → G/O 分数高
// Distance prior: closer cities → higher G/O scores
void init_relation_matrix(float* G, float* O, int N) const {
if (!h_dist || N != n) return;
float max_d = 0.0f;
@ -108,21 +108,21 @@ struct TSPProblem : ProblemBase<TSPProblem, 1, 64> {
h_dist = nullptr;
}
// v5.0: 多 GPU 协同 — 克隆到指定 GPU
// v5.0: multi-GPU — clone onto a given device
TSPProblem* clone_to_device(int gpu_id) const override {
int orig_device;
CUDA_CHECK(cudaGetDevice(&orig_device));
CUDA_CHECK(cudaSetDevice(gpu_id));
// 分配设备内存并拷贝距离矩阵
// Allocate device memory and copy distance matrix
float* dd;
CUDA_CHECK(cudaMalloc(&dd, sizeof(float) * n * n));
CUDA_CHECK(cudaMemcpy(dd, h_dist, sizeof(float) * n * n, cudaMemcpyHostToDevice));
// 恢复原设备
// Restore original device
CUDA_CHECK(cudaSetDevice(orig_device));
// 创建新的 Problem 实例(在 host 端)
// Create new Problem instance (on host)
TSPProblem* new_prob = new TSPProblem();
new_prob->n = n;
new_prob->h_dist = h_dist;

View file

@ -1,7 +1,7 @@
/**
* tsp_large.cuh - 大规模 TSP 问题定义 (最多 256 城市)
*
* 继承 ProblemBase逻辑与 tsp.cuh 一致,仅 D2 上限不同
* tsp_large.cuh - large-scale TSP definition (up to 256 cities)
*
* Same logic as tsp.cuh under ProblemBase; only D2 cap differs.
*/
#pragma once
@ -14,7 +14,7 @@ struct TSPLargeProblem : ProblemBase<TSPLargeProblem, 1, 256> {
const float* h_dist;
int n;
// ---- 目标计算 ----
// ---- objective evaluation ----
__device__ float calc_total_distance(const Sol& sol) const {
float total = 0.0f;
const int* route = sol.data[0];
@ -24,7 +24,7 @@ struct TSPLargeProblem : ProblemBase<TSPLargeProblem, 1, 256> {
return total;
}
// ---- 目标定义OBJ_DEFS 与 compute_obj 必须一一对应)----
// ---- objective defs (OBJ_DEFS must match compute_obj one-to-one) ----
static constexpr ObjDef OBJ_DEFS[] = {
{ObjDir::Minimize, 1.0f, 0.0f}, // case 0: calc_total_distance
};
@ -54,7 +54,7 @@ struct TSPLargeProblem : ProblemBase<TSPLargeProblem, 1, 256> {
return need <= SMEM_LIMIT ? need : 0;
}
// 距离矩阵的实际大小(不管是否放进 smem
// Actual distance matrix size (whether or not placed in smem)
size_t working_set_bytes() const {
return (size_t)n * n * sizeof(float);
}

View file

@ -1,9 +1,9 @@
/**
* tsp_xlarge.cuh - 超大规模 TSP 问题定义 (最多 512 城市)
*
* 继承 ProblemBase逻辑与 tsp_large.cuh 一致D2=512
* 注意:距离矩阵 512×512×4B = 1MB远超 48KB shared memory
* 因此 shared_mem_bytes() 返回 0距离矩阵留在 global memory
* tsp_xlarge.cuh - very large TSP definition (up to 512 cities)
*
* Same as tsp_large.cuh under ProblemBase, with D2=512.
* Note: distance matrix 512×512×4B = 1MB, far above 48KB shared memory,
* so shared_mem_bytes() returns 0 and the matrix stays in global memory.
*/
#pragma once
@ -13,7 +13,7 @@
struct TSPXLargeProblem : ProblemBase<TSPXLargeProblem, 1, 512> {
const float* d_dist;
const float* h_dist; // host 端距离矩阵(用于 init_relation_matrix
const float* h_dist; // host distance matrix (for init_relation_matrix)
int n;
__device__ float calc_total_distance(const Sol& sol) const {
@ -45,7 +45,7 @@ struct TSPXLargeProblem : ProblemBase<TSPXLargeProblem, 1, 512> {
return cfg;
}
// 距离矩阵太大,不放 shared memory
// Distance matrix too large for shared memory
size_t shared_mem_bytes() const { return 0; }
__device__ void load_shared(char*, int, int) {}
@ -53,10 +53,10 @@ struct TSPXLargeProblem : ProblemBase<TSPXLargeProblem, 1, 512> {
return (size_t)n * n * sizeof(float);
}
// 用距离矩阵初始化 G/O 先验:距离近 → 分数高
// Initialize G/O priors from distances: closer → higher score
void init_relation_matrix(float* G, float* O, int N) const {
if (!h_dist || N != n) return;
// 找最大距离用于归一化
// Max distance for normalization
float max_d = 0.0f;
for (int i = 0; i < N; i++)
for (int j = 0; j < N; j++)
@ -66,10 +66,10 @@ struct TSPXLargeProblem : ProblemBase<TSPXLargeProblem, 1, 512> {
for (int i = 0; i < N; i++) {
for (int j = 0; j < N; j++) {
if (i == j) continue;
// 距离近 → G 高(分组倾向强)
// Closer → higher G (stronger grouping signal)
float proximity = 1.0f - h_dist[i * N + j] / max_d;
G[i * N + j] = proximity * 0.3f; // 初始信号不要太强,留空间给 EMA
// 距离近 → O 也给一点信号(对称的,不偏向任何方向)
G[i * N + j] = proximity * 0.3f; // keep initial signal moderate for EMA headroom
// Closer → small O signal too (symmetric, no directional bias)
O[i * N + j] = proximity * 0.1f;
}
}
@ -84,7 +84,7 @@ struct TSPXLargeProblem : ProblemBase<TSPXLargeProblem, 1, 512> {
static TSPXLargeProblem create(const float* h_dist_ptr, int n) {
TSPXLargeProblem prob;
prob.n = n;
prob.h_dist = h_dist_ptr; // 保留 host 指针
prob.h_dist = h_dist_ptr; // keep host pointer
float* dd;
CUDA_CHECK(cudaMalloc(&dd, sizeof(float) * n * n));
CUDA_CHECK(cudaMemcpy(dd, h_dist_ptr, sizeof(float) * n * n, cudaMemcpyHostToDevice));

View file

@ -1,8 +1,8 @@
/**
* vrp.cuh - 容量约束车辆路径问题 (CVRP)
*
* 继承 ProblemBase使用 ObjDef 目标注册机制
* 多行编码D1=K 条路线,分区初始化 + 跨行算子)
* vrp.cuh - Capacitated Vehicle Routing Problem (CVRP)
*
* Extends ProblemBase with ObjDef objective registration.
* Multi-row encoding (D1 = K routes, partition init + cross-row operators).
*/
#pragma once
@ -12,11 +12,11 @@
#include "gpu_cache.cuh"
struct VRPProblem : ProblemBase<VRPProblem, 8, 64> {
// GPU 数据
// GPU data
const float* d_dist;
const float* d_demand;
const float* h_dist; // host 端距离矩阵(含 depot用于 init_relation_matrix
const float* h_demand; // host 端需求数组(用于 clone_to_device
const float* h_dist; // host distance matrix (includes depot; for init_relation_matrix)
const float* h_demand; // host demand array (for clone_to_device)
int n;
int stride;
float capacity;
@ -24,7 +24,7 @@ struct VRPProblem : ProblemBase<VRPProblem, 8, 64> {
int max_vehicles;
GpuCache cache;
// ---- 目标计算 ----
// ---- objective evaluation ----
__device__ float compute_route_dist(const int* route, int size) const {
if (size == 0) return 0.0f;
float dist = 0.0f;
@ -61,7 +61,7 @@ struct VRPProblem : ProblemBase<VRPProblem, 8, 64> {
return total;
}
// ---- 目标定义OBJ_DEFS 与 compute_obj 必须一一对应)----
// ---- objective defs (OBJ_DEFS must match compute_obj one-to-one) ----
static constexpr ObjDef OBJ_DEFS[] = {
{ObjDir::Minimize, 1.0f, 0.0f}, // case 0: calc_total_distance
};
@ -102,7 +102,7 @@ struct VRPProblem : ProblemBase<VRPProblem, 8, 64> {
return cfg;
}
// ---- shared memory 接口 ----
// ---- shared memory interface ----
static constexpr size_t SMEM_LIMIT = 48 * 1024;
size_t shared_mem_bytes() const {
@ -129,14 +129,14 @@ struct VRPProblem : ProblemBase<VRPProblem, 8, 64> {
void enable_cache(int cap = 65536) { cache = GpuCache::allocate(cap); }
void print_cache_stats() const { cache.print_stats(); }
// 距离先验:客户间距离近 → G/O 分数高
// 注意h_dist 含 depotstride×stride元素编号 0..n-1 对应 node 1..n
// Distance prior: closer customers → higher G/O scores
// Note: h_dist includes depot (stride×stride); indices 0..n-1 map to nodes 1..n
void init_relation_matrix(float* G, float* O, int N) const {
if (!h_dist || N != n) return;
float max_d = 0.0f;
for (int i = 0; i < N; i++)
for (int j = 0; j < N; j++) {
float d = h_dist[(i + 1) * stride + (j + 1)]; // 跳过 depot
float d = h_dist[(i + 1) * stride + (j + 1)]; // skip depot
if (d > max_d) max_d = d;
}
if (max_d <= 0.0f) return;
@ -161,7 +161,7 @@ struct VRPProblem : ProblemBase<VRPProblem, 8, 64> {
prob.max_vehicles = max_vehicles;
prob.cache = GpuCache::disabled();
prob.h_dist = h_dist_ptr;
prob.h_demand = h_demand_ptr; // 保存 host 端指针
prob.h_demand = h_demand_ptr; // keep host pointer
int n_nodes = n + 1;
float* dd;
@ -185,13 +185,13 @@ struct VRPProblem : ProblemBase<VRPProblem, 8, 64> {
cache.destroy();
}
// v5.0: 多 GPU 协同 — 克隆到指定 GPU
// v5.0: multi-GPU — clone onto a given device
VRPProblem* clone_to_device(int gpu_id) const override {
int orig_device;
CUDA_CHECK(cudaGetDevice(&orig_device));
CUDA_CHECK(cudaSetDevice(gpu_id));
// 从 host 端数据直接拷贝到目标 GPU避免跨设备 D2H 拷贝)
// Copy from host straight to target GPU (avoid cross-device D2H staging)
int n_nodes = n + 1;
float* dd;
CUDA_CHECK(cudaMalloc(&dd, sizeof(float) * n_nodes * n_nodes));

View file

@ -1,12 +1,12 @@
/**
* vrptw.cuh - 带时间窗的车辆路径问题 (VRPTW)
*
* 在 CVRP 基础上增加时间窗约束。
* 编码Perm 多行分区(同 CVRPdata[r][j] = 路线 r 的第 j 个客户。
* 目标Minimize 总距离。
* 约束:(a) 容量约束, (b) 时间窗约束(到达时间必须 ≤ latest早到需等待
*
* 验证实例8 客户 3 车, 手工设计坐标+时间窗, 确保有已知可行解。
* vrptw.cuh - Vehicle Routing Problem with Time Windows (VRPTW)
*
* CVRP plus time window constraints.
* Encoding: multi-row perm partition (same as CVRP); data[r][j] = j-th customer on route r.
* Objective: minimize total distance.
* Constraints: (a) capacity, (b) time windows (arrival ≤ latest; early arrival waits).
*
* Validation instance: 8 customers, 3 vehicles; hand-crafted coords + windows with known feasible solution.
*/
#pragma once
@ -14,12 +14,12 @@
#include "cuda_utils.cuh"
struct VRPTWProblem : ProblemBase<VRPTWProblem, 8, 64> {
const float* d_dist; // 距离矩阵 [(n+1)*(n+1)](含 depot
const float* d_demand; // 需求 [n]
const float* d_earliest; // 最早服务时间 [n+1](含 depot
const float* d_latest; // 最晚服务时间 [n+1](含 depot
const float* d_service; // 服务耗时 [n+1](含 depot
int n; // 客户数(不含 depot
const float* d_dist; // distance matrix [(n+1)*(n+1)] (includes depot)
const float* d_demand; // demand [n]
const float* d_earliest; // earliest service time [n+1] (includes depot)
const float* d_latest; // latest service time [n+1] (includes depot)
const float* d_service; // service time [n+1] (includes depot)
int n; // number of customers (excludes depot)
int stride; // n+1
float capacity;
int num_vehicles;
@ -63,30 +63,30 @@ struct VRPTWProblem : ProblemBase<VRPTWProblem, 8, 64> {
if (size == 0) continue;
active++;
// 容量约束
// Capacity constraint
float load = 0.0f;
for (int j = 0; j < size; j++)
load += d_demand[sol.data[r][j]];
if (load > capacity)
penalty += (load - capacity) * 100.0f;
// 时间窗约束:模拟路线行驶
// Time windows: simulate route travel
float time = 0.0f;
int prev = 0;
for (int j = 0; j < size; j++) {
int node = sol.data[r][j] + 1;
float travel = d_dist[prev * stride + node];
time += travel;
// 早到需等待
// Wait if early
if (time < d_earliest[node])
time = d_earliest[node];
// 迟到产生惩罚
// Penalize lateness
if (time > d_latest[node])
penalty += (time - d_latest[node]) * 50.0f;
time += d_service[node];
prev = node;
}
// 返回 depot 的时间窗
// Time window returning to depot
float return_time = time + d_dist[prev * stride + 0];
if (return_time > d_latest[0])
penalty += (return_time - d_latest[0]) * 50.0f;