From a848730459da9b8a84efdf105b184f212e384ed2 Mon Sep 17 00:00:00 2001
From: L-yang-yang <15251858055@163.com>
Date: Wed, 25 Mar 2026 11:52:50 +0800
Subject: [PATCH] fix: harden CUDA safety checks and translate comments to
 English
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Safety fixes (4 critical, 4 warning) from code review:

- qap.cuh: fix clone_to_device cross-device D2H by retaining host matrices
- types.cuh: add CUDA_CHECK to InjectBuffer, track owner_gpu for safe destroy
- types.cuh: add bounds check on lexicographic priority index
- solver.cuh: cap migrate_kernel islands to MAX_ISLANDS=64 to prevent stack overflow
- multi_gpu_solver.cuh: guard against 0 GPUs, propagate stop_reason from best GPU
- types.cuh: warn on SeqRegistry overflow
- solver.cuh: warn when constraint_directed/phased_search disabled without AOS

Translate all Chinese comments to English across 25+ source files
(core/*.cuh, problems/*.cuh, Makefile, multi-GPU tests).

Verified on V100S×2 (sm_70, CUDA 12.8): e5 (12 problem types, all optimal),
e13 (multi-objective + multi-GPU, 9 configs, all passed).
---
 README.md                           |  37 +-
 prototype/Makefile                  |  14 +-
 prototype/core/cuda_utils.cuh       |  24 +-
 prototype/core/gpu_cache.cuh        |  52 +--
 prototype/core/init_heuristic.cuh   |   8 +-
 prototype/core/init_selection.cuh   |  90 ++--
 prototype/core/multi_gpu_solver.cuh | 133 +++---
 prototype/core/operators.cuh        | 446 ++++++++++----------
 prototype/core/population.cuh       |  28 +-
 prototype/core/relation_matrix.cuh  |  60 +--
 prototype/core/solver.cuh           | 409 +++++++++---------
 prototype/core/types.cuh            | 616 ++++++++++++++--------------
 prototype/problems/assignment.cuh   |  24 +-
 prototype/problems/bin_packing.cuh  |  24 +-
 prototype/problems/graph_color.cuh  |  20 +-
 prototype/problems/jsp.cuh          |  84 ++--
 prototype/problems/knapsack.cuh     |  14 +-
 prototype/problems/load_balance.cuh |  26 +-
 prototype/problems/qap.cuh          |  51 ++-
 prototype/problems/schedule.cuh     |  16 +-
 prototype/problems/tsp.cuh          |  30 +-
 prototype/problems/tsp_large.cuh    |  12 +-
 prototype/problems/tsp_xlarge.cuh   |  26 +-
 prototype/problems/vrp.cuh          |  32 +-
 prototype/problems/vrptw.cuh        |  38 +-
 25 files changed, 1147 insertions(+), 1167 deletions(-)

diff --git a/README.md b/README.md
index 1da5968..ae58554 100644
--- a/README.md
+++ b/README.md
@@ -6,7 +6,7 @@
 [![CUDA](https://img.shields.io/badge/CUDA-11.0%2B-green.svg)](https://developer.nvidia.com/cuda-toolkit)
 [![Python](https://img.shields.io/badge/Python-3.8%2B-blue.svg)](https://www.python.org/)
 
-**Paper**: [cuGenOpt: A GPU-Accelerated General-Purpose Metaheuristic Framework for Combinatorial Optimization](http://arxiv.org/abs/2603.19163) 
+**Paper**: [cuGenOpt: A GPU-Accelerated General-Purpose Metaheuristic Framework for Combinatorial Optimization](https://arxiv.org/abs/2603.19163) 
 
 ---
 
@@ -114,28 +114,7 @@ Define your own problem by inheriting `ProblemBase` and implementing `compute_ob
 └─────────────────────────────────────────────────────────┘
 ```
 
----
 
-## Project Structure
-
-```
-generic_solver/
-├── prototype/              # Core framework (header-only .cuh files)
-│   ├── core/              #   Solver, operators, population, types
-│   └── problems/          #   12+ problem implementations
-├── python/                 # Python wrapper (pip install cugenopt)
-│   ├── cugenopt/          #   Python package (built-ins + JIT compiler)
-│   └── tests/             #   Test suite
-├── benchmark/              # Experiments and benchmarks
-│   ├── experiments/       #   E0-E13: 14 experiment groups
-│   ├── data/              #   Standard instances (TSPLIB, Solomon, QAPLIB)
-│   └── results/           #   Experimental reports
-├── paper_v3_en/            # Paper source (LaTeX)
-├── STATUS.md               # Project status and roadmap
-└── README.md               # This file
-```
-
----
 
 ## Performance Highlights
 
@@ -186,8 +165,7 @@ generic_solver/
 ## Installation
 
 ### Python Package
-
-coming soon～
+Coming soon:
 ```bash
 pip install cugenopt
 ```
@@ -207,18 +185,7 @@ cd prototype
 make all
 ```
 
----
 
-## Documentation
-
-| Document | Description |
-|----------|-------------|
-| [STATUS.md](STATUS.md) | Project status, roadmap, and design decisions |
-| [Python API Guide](python/README.md) | Detailed Python API documentation |
-| [Benchmark Design](benchmark/DESIGN.md) | Experimental methodology |
-| [Paper](paper_v3_en/) | Full technical details and evaluation |
-
----
 
 ## Citation
 
diff --git a/prototype/Makefile b/prototype/Makefile
index 32ebcdf..c72945a 100644
--- a/prototype/Makefile
+++ b/prototype/Makefile
@@ -1,10 +1,10 @@
 # GenSolver Makefile
 #
-# 用法:
-#   make e1 e2 e3 e4 e5 e6   → 编译单个实验
-#   make diag                  → 编译诊断程序
-#   make all                   → 编译全部
-#   make clean                 → 清理
+# Usage:
+#   make e1 e2 e3 e4 e5 e6   → Build individual experiments
+#   make diag                  → Build diagnostic program
+#   make all                   → Build all
+#   make clean                 → Clean
 
 NVCC     = nvcc
 ARCH     ?= -arch=sm_75
@@ -40,10 +40,10 @@ $(EXP_DIR)/%/gpu: $(EXP_DIR)/%/gpu.cu $(ALL_HEADERS) problems/tsplib_data.h
 $(EXP_DIR)/e0_diagnosis/bench_diagnosis: $(EXP_DIR)/e0_diagnosis/bench_diagnosis.cu $(ALL_HEADERS)
 	$(NVCC) $(ARCH) $(CFLAGS) $(INCLUDES) -o $@ $<
 
-test_multi_gpu: test_multi_gpu.cu $(ALL_HEADERS)
+test_multi_gpu: $(EXP_DIR)/e9_multi_gpu_b3/test_multi_gpu.cu $(ALL_HEADERS)
 	$(NVCC) $(ARCH) $(CFLAGS) $(INCLUDES) -o $@ $<
 
-test_multi_gpu_b3: test_multi_gpu_b3.cu $(ALL_HEADERS)
+test_multi_gpu_b3: $(EXP_DIR)/e9_multi_gpu_b3/test_multi_gpu_b3.cu $(ALL_HEADERS)
 	$(NVCC) $(ARCH) $(CFLAGS) $(INCLUDES) -o $@ $<
 
 clean:
diff --git a/prototype/core/cuda_utils.cuh b/prototype/core/cuda_utils.cuh
index 72ea103..205f15a 100644
--- a/prototype/core/cuda_utils.cuh
+++ b/prototype/core/cuda_utils.cuh
@@ -1,8 +1,8 @@
 /**
- * cuda_utils.cuh - CUDA 工具集
+ * cuda_utils.cuh - CUDA utilities
  * 
- * 职责：错误检查、设备信息、随机数工具
- * 规则：所有 CUDA API 调用都必须用 CUDA_CHECK 包裹
+ * Responsibilities: error checking, device info, random number utilities
+ * Rule: every CUDA API call must be wrapped with CUDA_CHECK
  */
 
 #pragma once
@@ -11,7 +11,7 @@
 #include <curand_kernel.h>
 
 // ============================================================
-// 错误检查
+// Error checking
 // ============================================================
 
 #define CUDA_CHECK(call) do {                                       \
@@ -23,7 +23,7 @@
     }                                                               \
 } while(0)
 
-// kernel launch 后检查（捕获异步错误）
+// Check after kernel launch (reports launch errors; execution errors surface at the next synchronizing call)
 #define CUDA_CHECK_LAST() do {                                      \
     cudaError_t err = cudaGetLastError();                            \
     if (err != cudaSuccess) {                                       \
@@ -34,7 +34,7 @@
 } while(0)
 
 // ============================================================
-// 设备信息
+// Device info
 // ============================================================
 
 inline void print_device_info() {
@@ -52,10 +52,10 @@ inline void print_device_info() {
 }
 
 // ============================================================
-// 随机数工具 (Device 端)
+// Random number utilities (device-side)
 // ============================================================
 
-// 初始化 curand 状态，每个线程一个
+// Initialize curand state: one per thread
 __global__ void init_curand_kernel(curandState* states, unsigned long long seed, int n) {
     int tid = blockIdx.x * blockDim.x + threadIdx.x;
     if (tid < n) {
@@ -63,12 +63,12 @@ __global__ void init_curand_kernel(curandState* states, unsigned long long seed,
     }
 }
 
-// Device 端：生成 [0, bound) 的随机整数
+// Device-side: random integer in [0, bound)
 __device__ inline int rand_int(curandState* state, int bound) {
     return curand(state) % bound;
 }
 
-// Device 端：Fisher-Yates shuffle，对 arr[0..n-1] 做随机排列
+// Device-side: Fisher-Yates shuffle of arr[0..n-1]
 __device__ inline void shuffle(int* arr, int n, curandState* state) {
     for (int i = n - 1; i > 0; i--) {
         int j = rand_int(state, i + 1);
@@ -79,12 +79,12 @@ __device__ inline void shuffle(int* arr, int n, curandState* state) {
 }
 
 // ============================================================
-// Kernel 启动参数计算
+// Kernel launch grid sizing
 // ============================================================
 
 inline int div_ceil(int a, int b) { return (a + b - 1) / b; }
 
-// 计算合适的 block 数量
+// Compute suitable number of blocks
 inline int calc_grid_size(int n, int block_size = 256) {
     return div_ceil(n, block_size);
 }
diff --git a/prototype/core/gpu_cache.cuh b/prototype/core/gpu_cache.cuh
index f7c2e06..ae69413 100644
--- a/prototype/core/gpu_cache.cuh
+++ b/prototype/core/gpu_cache.cuh
@@ -1,20 +1,20 @@
 /**
- * gpu_cache.cuh - GPU 全局内存哈希表（通用缓存组件）
+ * gpu_cache.cuh - GPU global-memory hash table (generic cache component)
  * 
- * 设计：
- *   - 开放寻址，固定容量（power of 2），线性探测
- *   - key = uint64_t（由 Problem 自行计算 hash）
- *   - value = float（单个指标值）
- *   - 无锁：允许 race condition（缓存语义，偶尔脏读可接受）
- *   - 自带命中/未命中原子计数器
+ * Design:
+ *   - Open addressing, fixed capacity (power of 2), linear probing
+ *   - key = uint64_t (hash computed by Problem)
+ *   - value = float (single metric value)
+ *   - Lock-free: race conditions allowed (cache semantics; occasional dirty reads OK)
+ *   - Built-in atomic hit/miss counters
  * 
- * 用法：
+ * Usage:
  *   GpuCache cache = GpuCache::allocate(65536);   // host
  *   // ... pass cache as Problem member to kernels ...
  *   cache.print_stats();                           // host
  *   cache.destroy();                               // host
  * 
- * 参考：scute 项目 LRUCache（key = metric_type + content_hash）
+ * Reference: scute project LRUCache (key = metric_type + content_hash)
  */
 
 #pragma once
@@ -22,25 +22,25 @@
 #include <cstdint>
 
 // ============================================================
-// 常量
+// Constants
 // ============================================================
 
 static constexpr uint64_t CACHE_EMPTY_KEY = 0xFFFFFFFFFFFFFFFFULL;
-static constexpr int CACHE_MAX_PROBE = 8;   // 最大线性探测步数
+static constexpr int CACHE_MAX_PROBE = 8;   // Max linear probing steps
 
 // ============================================================
-// GpuCache 结构体（POD，可安全拷贝到 kernel）
+// GpuCache struct (POD, safe to copy to kernel)
 // ============================================================
 
 struct GpuCache {
-    uint64_t* keys;             // GPU 全局内存
-    float*    values;           // GPU 全局内存
-    unsigned int* d_hits;       // 原子计数器（GPU）
-    unsigned int* d_misses;     // 原子计数器（GPU）
-    int capacity;               // 必须是 2 的幂
+    uint64_t* keys;             // GPU global memory
+    float*    values;           // GPU global memory
+    unsigned int* d_hits;       // Atomic counters (GPU)
+    unsigned int* d_misses;     // Atomic counters (GPU)
+    int capacity;               // Must be a power of 2
     int mask;                   // = capacity - 1
     
-    // ---- Host 操作 ----
+    // ---- Host operations ----
     
     static GpuCache allocate(int cap = 65536) {
         GpuCache c;
@@ -94,20 +94,20 @@ struct GpuCache {
 };
 
 // ============================================================
-// Device 函数：哈希 / 查找 / 插入
+// Device functions: hash / lookup / insert
 // ============================================================
 
-/// FNV-1a 哈希：对一段有序 int 序列（如路线中的客户 ID）
+/// FNV-1a hash over an ordered int sequence (e.g. customer IDs on a route)
 __device__ inline uint64_t route_hash(const int* data, int len) {
     uint64_t h = 14695981039346656037ULL;   // FNV offset basis
     for (int i = 0; i < len; i++) {
         h ^= (uint64_t)(unsigned int)data[i];
         h *= 1099511628211ULL;               // FNV prime
     }
-    return (h == CACHE_EMPTY_KEY) ? h - 1 : h;  // 避免与哨兵值碰撞
+    return (h == CACHE_EMPTY_KEY) ? h - 1 : h;  // Avoid collision with sentinel value
 }
 
-/// 查找：命中返回 true + 写入 out
+/// Lookup: on hit returns true and writes out
 __device__ inline bool cache_lookup(const GpuCache& c, uint64_t key, float& out) {
     int slot = (int)(key & (uint64_t)c.mask);
     for (int p = 0; p < CACHE_MAX_PROBE; p++) {
@@ -117,12 +117,12 @@ __device__ inline bool cache_lookup(const GpuCache& c, uint64_t key, float& out)
             out = c.values[idx];
             return true;
         }
-        if (k == CACHE_EMPTY_KEY) return false;  // 空槽 → 一定不存在
+        if (k == CACHE_EMPTY_KEY) return false;  // Empty slot -> key not present
     }
-    return false;   // 探测用尽
+    return false;   // Probing exhausted
 }
 
-/// 插入：写入 key-value，同 key 覆盖，探测满则驱逐首槽
+/// Insert: write key-value; same key overwrites; if probe full, evict first slot
 __device__ inline void cache_insert(const GpuCache& c, uint64_t key, float value) {
     int slot = (int)(key & (uint64_t)c.mask);
     for (int p = 0; p < CACHE_MAX_PROBE; p++) {
@@ -134,7 +134,7 @@ __device__ inline void cache_insert(const GpuCache& c, uint64_t key, float value
             return;
         }
     }
-    // 探测满：驱逐首槽
+    // Probe full: evict first slot
     int idx = slot & c.mask;
     c.keys[idx]   = key;
     c.values[idx] = value;
diff --git a/prototype/core/init_heuristic.cuh b/prototype/core/init_heuristic.cuh
index 716284a..0a8cb90 100644
--- a/prototype/core/init_heuristic.cuh
+++ b/prototype/core/init_heuristic.cuh
@@ -6,7 +6,7 @@
 
 namespace heuristic_init {
 
-// 单行排列：所有行填相同排列
+// Single-row layout: same permutation in every row
 template<typename Sol>
 static void build_sorted_permutation(Sol& sol, const std::vector<int>& order,
                                      int dim1, int dim2) {
@@ -19,7 +19,7 @@ static void build_sorted_permutation(Sol& sol, const std::vector<int>& order,
     for (int i = 0; i < MAX_OBJ; i++) sol.objectives[i] = 0.0f;
 }
 
-// Partition 模式：排列均匀切分到 dim1 行，元素不重复
+// Partition mode: split permutation evenly across dim1 rows, no duplicate elements
 template<typename Sol>
 static void build_partition_from_order(Sol& sol, const std::vector<int>& order,
                                        int dim1, int total_elements) {
@@ -66,8 +66,8 @@ std::vector<Sol> build_from_matrices(const HeuristicMatrix* matrices, int num_ma
                 col_sum[j] += mat[i * N + j];
             }
 
-        // 对于 Partition (VRPTW)，距离矩阵含 depot (index 0)，
-        // 排序只针对客户 (index 1..N-1)，输出值为 0-based 客户编号
+        // For Partition (VRPTW), the distance matrix includes depot (index 0);
+        // sorting is only over customers (indices 1..N-1); output values are 0-based customer ids
         std::vector<int> idx;
         if (partition_mode && N > elem_count) {
             for (int i = 1; i <= elem_count; i++) idx.push_back(i);
diff --git a/prototype/core/init_selection.cuh b/prototype/core/init_selection.cuh
index 17f37e4..f8d8a86 100644
--- a/prototype/core/init_selection.cuh
+++ b/prototype/core/init_selection.cuh
@@ -1,15 +1,15 @@
 /**
- * init_selection.cuh - 初始解采样择优 + NSGA-II 选择
+ * init_selection.cuh - Initial-solution sampling and NSGA-II selection
  *
- * Host 端逻辑，在 solver 初始化阶段调用一次。
- * 从 K × pop_size 个候选解中选出 pop_size 个作为初始种群。
+ * Host-side logic; called once during solver initialization.
+ * Selects pop_size individuals from K × pop_size candidates as the initial population.
  *
- * 选择策略：
- *   1. 核心目标预留名额（按 importance 分配）
- *   2. NSGA-II 选择（非支配排序 + 加权拥挤度）
- *   3. 纯随机保底（多样性）
+ * Selection strategy:
+ *   1. Reserve slots for core objectives (by importance)
+ *   2. NSGA-II selection (non-dominated sort + weighted crowding)
+ *   3. Pure random fallback (diversity)
  *
- * 单目标时自动退化为 top-N 排序，无需分支。
+ * Single-objective case automatically reduces to top-N sorting; no extra branching.
  */
 
 #pragma once
@@ -22,36 +22,36 @@
 namespace init_sel {
 
 // ============================================================
-// 候选解的目标信息（从 GPU 下载后在 host 端使用）
+// Per-candidate objective info (used on host after download from GPU)
 // ============================================================
 struct CandidateInfo {
-    int   idx;           // 在候选数组中的原始索引
-    float objs[MAX_OBJ]; // 归一化后的目标值（越小越好）
+    int   idx;           // Original index in the candidate array
+    float objs[MAX_OBJ]; // Normalized objectives (lower is better)
     float penalty;
-    int   rank;          // 非支配排序层级（0 = Pareto 前沿）
-    float crowding;      // 拥挤度距离
-    bool  selected;      // 是否已被选中
+    int   rank;          // Non-dominated sort front (0 = Pareto front)
+    float crowding;      // Crowding distance
+    bool  selected;      // Whether already selected
 };
 
 // ============================================================
-// 非支配排序（Fast Non-dominated Sort）
+// Non-dominated sort (Fast Non-dominated Sort)
 // ============================================================
-// 复杂度：O(M × N²)，M = 目标数，N = 候选数
-// 对初始化场景（N ≤ 几千，M ≤ 4）完全可接受
+// Complexity: O(M × N²), M = number of objectives, N = number of candidates
+// Acceptable for initialization (N up to a few thousand, M ≤ 4)
 
 inline void fast_nondominated_sort(std::vector<CandidateInfo>& cands,
                                     int num_obj,
                                     std::vector<std::vector<int>>& fronts) {
     int n = (int)cands.size();
-    std::vector<int> dom_count(n, 0);        // 被多少个解支配
-    std::vector<std::vector<int>> dom_set(n); // 支配了哪些解
+    std::vector<int> dom_count(n, 0);        // How many solutions dominate this one
+    std::vector<std::vector<int>> dom_set(n); // Which solutions this one dominates
     
-    // 判断 a 是否支配 b：a 在所有目标上 ≤ b，且至少一个 <
-    // 先处理 penalty：可行解支配不可行解
+    // Whether a dominates b: a ≤ b on all objectives, and strictly < on at least one
+    // Handle penalty first: feasible dominates infeasible
     auto dominates = [&](int a, int b) -> bool {
         const auto& ca = cands[a];
         const auto& cb = cands[b];
-        // penalty 处理
+        // Penalty handling
         if (ca.penalty <= 0.0f && cb.penalty > 0.0f) return true;
         if (ca.penalty > 0.0f && cb.penalty <= 0.0f) return false;
         if (ca.penalty > 0.0f && cb.penalty > 0.0f) return ca.penalty < cb.penalty;
@@ -65,7 +65,7 @@ inline void fast_nondominated_sort(std::vector<CandidateInfo>& cands,
         return all_leq && any_lt;
     };
     
-    // 计算支配关系
+    // Compute dominance relations
     for (int i = 0; i < n; i++) {
         for (int j = i + 1; j < n; j++) {
             if (dominates(i, j)) {
@@ -78,7 +78,7 @@ inline void fast_nondominated_sort(std::vector<CandidateInfo>& cands,
         }
     }
     
-    // 提取各层前沿
+    // Extract each front layer
     fronts.clear();
     std::vector<int> current_front;
     for (int i = 0; i < n; i++) {
@@ -107,9 +107,9 @@ inline void fast_nondominated_sort(std::vector<CandidateInfo>& cands,
 }
 
 // ============================================================
-// 加权拥挤度距离
+// Weighted crowding distance
 // ============================================================
-// 标准拥挤度 + importance 加权：核心目标维度上的间距贡献更大
+// Standard crowding + importance weighting: larger gap contribution on core objectives
 
 inline void weighted_crowding_distance(std::vector<CandidateInfo>& cands,
                                         const std::vector<int>& front,
@@ -117,7 +117,7 @@ inline void weighted_crowding_distance(std::vector<CandidateInfo>& cands,
                                         const float* importance) {
     int n = (int)front.size();
     if (n <= 2) {
-        for (int i : front) cands[i].crowding = 1e18f;  // 边界解无穷大
+        for (int i : front) cands[i].crowding = 1e18f;  // Boundary solutions: infinite
         return;
     }
     
@@ -126,18 +126,18 @@ inline void weighted_crowding_distance(std::vector<CandidateInfo>& cands,
     std::vector<int> sorted_idx(front.begin(), front.end());
     
     for (int m = 0; m < num_obj; m++) {
-        // 按目标 m 排序
+        // Sort by objective m
         std::sort(sorted_idx.begin(), sorted_idx.end(),
                   [&](int a, int b) { return cands[a].objs[m] < cands[b].objs[m]; });
         
         float range = cands[sorted_idx[n-1]].objs[m] - cands[sorted_idx[0]].objs[m];
-        if (range < 1e-12f) continue;  // 该目标无区分度
+        if (range < 1e-12f) continue;  // No spread on this objective
         
-        // 边界解设为无穷大
+        // Boundary solutions: infinite crowding
         cands[sorted_idx[0]].crowding += 1e18f;
         cands[sorted_idx[n-1]].crowding += 1e18f;
         
-        // 中间解：相邻间距 × importance 权重
+        // Interior: neighbor gap × importance weight
         float w = importance[m];
         for (int i = 1; i < n - 1; i++) {
             float gap = cands[sorted_idx[i+1]].objs[m] - cands[sorted_idx[i-1]].objs[m];
@@ -147,29 +147,29 @@ inline void weighted_crowding_distance(std::vector<CandidateInfo>& cands,
 }
 
 // ============================================================
-// 主选择函数：从 N 个候选中选出 target 个
+// Main selection: pick target candidates from N
 // ============================================================
-// 返回被选中的候选索引
+// Returns indices of selected candidates
 
 inline std::vector<int> nsga2_select(std::vector<CandidateInfo>& cands,
                                       int num_obj,
                                       const float* importance,
                                       int target,
                                       int num_reserved_random) {
-    // --- 1. 核心目标预留名额 ---
+    // --- 1. Reserve slots for core objectives ---
     int num_reserve_total = target - num_reserved_random;
-    // 预留比例：importance[i] × 30% 的名额（剩余 70% 给 NSGA-II）
+    // Reserve ratio: importance[i] × 30% of slots (remaining 70% for NSGA-II)
     float reserve_ratio = 0.3f;
     
     std::vector<int> selected;
     selected.reserve(target);
     
-    // 对每个目标，按该目标排序取 top
+    // For each objective, sort by that objective and take top
     for (int m = 0; m < num_obj; m++) {
         int quota = (int)(num_reserve_total * importance[m] * reserve_ratio);
-        if (quota < 1 && num_obj > 1) quota = 1;  // 每个目标至少 1 个
+        if (quota < 1 && num_obj > 1) quota = 1;  // At least one per objective
         
-        // 按目标 m 排序（越小越好）
+        // Sort by objective m (lower is better)
         std::vector<int> by_obj(cands.size());
         for (int i = 0; i < (int)cands.size(); i++) by_obj[i] = i;
         std::sort(by_obj.begin(), by_obj.end(),
@@ -186,32 +186,32 @@ inline std::vector<int> nsga2_select(std::vector<CandidateInfo>& cands,
         }
     }
     
-    // --- 2. NSGA-II 选择填充剩余名额 ---
+    // --- 2. NSGA-II fills remaining slots ---
     int remaining = target - num_reserved_random - (int)selected.size();
     
     if (remaining > 0) {
-        // 非支配排序
+        // Non-dominated sort
         std::vector<std::vector<int>> fronts;
         fast_nondominated_sort(cands, num_obj, fronts);
         
         for (auto& front : fronts) {
             if (remaining <= 0) break;
             
-            // 过滤已选中的
+            // Filter out already selected
             std::vector<int> available;
             for (int i : front) {
                 if (!cands[i].selected) available.push_back(i);
             }
             
             if ((int)available.size() <= remaining) {
-                // 整层都选
+                // Take the whole front
                 for (int i : available) {
                     cands[i].selected = true;
                     selected.push_back(i);
                     remaining--;
                 }
             } else {
-                // 该层需要截断：按加权拥挤度选
+                // Truncate this front: pick by weighted crowding
                 weighted_crowding_distance(cands, available, num_obj, importance);
                 std::sort(available.begin(), available.end(),
                           [&](int a, int b) { return cands[a].crowding > cands[b].crowding; });
@@ -228,14 +228,14 @@ inline std::vector<int> nsga2_select(std::vector<CandidateInfo>& cands,
 }
 
 // ============================================================
-// 单目标快速路径：直接按标量排序取 top
+// Single-objective fast path: scalar sort and take top
 // ============================================================
 inline std::vector<int> top_n_select(std::vector<CandidateInfo>& cands,
                                       int target,
                                       int num_reserved_random) {
     int to_select = target - num_reserved_random;
     
-    // 按 penalty 优先，然后按 objs[0]（已归一化为越小越好）
+    // Prefer lower penalty, then objs[0] (normalized, lower is better)
     std::vector<int> indices(cands.size());
     for (int i = 0; i < (int)cands.size(); i++) indices[i] = i;
     std::sort(indices.begin(), indices.end(), [&](int a, int b) {
diff --git a/prototype/core/multi_gpu_solver.cuh b/prototype/core/multi_gpu_solver.cuh
index 1169685..84dc78f 100644
--- a/prototype/core/multi_gpu_solver.cuh
+++ b/prototype/core/multi_gpu_solver.cuh
@@ -1,12 +1,12 @@
 /**
- * multi_gpu_solver.cuh - 多 GPU 协同求解
+ * multi_gpu_solver.cuh - Multi-GPU cooperative solving
  * 
- * v5.0 方案 B3: 被动注入 + GPU 无感知
- *   - 每块 GPU 独立运行 solve()，各自用不同 seed
- *   - 每个 GPU 有一个 InjectBuffer（设备端）
- *   - CPU 协调线程定期（每 N 秒）收集各 GPU 的 best，异步写入其他 GPU 的 InjectBuffer
- *   - GPU 在 migrate_kernel 后检查 InjectBuffer，如果有新解则注入
- *   - 完全解耦：GPU 无需暂停，CPU 异步写入，通过 CUDA Stream 同步保证安全
+ * v5.0 plan B3: passive injection + GPU-agnostic design
+ *   - Each GPU runs solve() independently with its own seed
+ *   - Each GPU has an InjectBuffer (device memory)
+ *   - A CPU coordinator thread periodically (every N seconds) collects each GPU's best and asynchronously writes to other GPUs' InjectBuffers
+ *   - After migrate_kernel, each GPU checks InjectBuffer and injects if a new solution is present
+ *   - Fully decoupled: GPUs need not pause; CPU writes asynchronously; CUDA stream sync ensures safety
  */
 
 #pragma once
@@ -18,25 +18,26 @@
 #include <chrono>
 
 // ============================================================
-// MultiGpuContext — 每个 GPU 的上下文
+// MultiGpuContext — per-GPU context
 // ============================================================
 
 template<typename Problem>
 struct MultiGpuContext {
     using Sol = typename Problem::Sol;
     
-    int gpu_id;                      // GPU 设备 ID
-    Problem* problem;                // Problem 实例（设备指针指向该 GPU）
-    SolverConfig config;             // 求解器配置（独立 seed）
+    int gpu_id;                      // GPU device ID
+    Problem* problem;                // Problem instance (device pointer for this GPU)
+    SolverConfig config;             // Solver config (independent seed)
     
-    Sol best_solution;               // 当前最优解（host 端）
-    std::mutex best_mutex;           // 保护 best_solution 的互斥锁
+    Sol best_solution;               // Current best solution (host)
+    SolveResult<Sol> solve_result;   // Full result from solve()
+    std::mutex best_mutex;           // Mutex protecting best_solution
     
-    InjectBuffer<Sol>* d_inject_buf; // Device 端注入缓冲区（在该 GPU 上分配）
-    Sol* d_global_best;              // Device 端全局最优解指针（由 solve() 导出）
+    InjectBuffer<Sol>* d_inject_buf; // Device-side inject buffer (allocated on this GPU)
+    Sol* d_global_best;              // Device pointer to global best (exported by solve())
     
-    std::atomic<bool> stop_flag;     // 停止标志
-    std::atomic<bool> running;       // 运行状态标志（用于协调线程判断）
+    std::atomic<bool> stop_flag;     // Stop flag
+    std::atomic<bool> running;       // Running flag (for coordinator thread)
     
     MultiGpuContext(int id) : gpu_id(id), problem(nullptr), d_inject_buf(nullptr), 
                                d_global_best(nullptr), stop_flag(false), running(false) {
@@ -47,45 +48,46 @@ struct MultiGpuContext {
 };
 
 // ============================================================
-// GPU Worker 线程函数（方案 B3）
+// GPU worker thread (plan B3)
 // ============================================================
 
 template<typename Problem>
 void gpu_worker(MultiGpuContext<Problem>* ctx) {
     using Sol = typename Problem::Sol;
     
-    // 设置当前线程使用的 GPU
+    // Set GPU for this thread
     CUDA_CHECK(cudaSetDevice(ctx->gpu_id));
     
-    // 标记开始运行
+    // Mark as running
     ctx->running.store(true);
     
-    // 运行 solve（传入 inject_buf 和 d_global_best_out）
+    // Run solve (pass inject_buf and d_global_best_out)
     SolveResult<Sol> result = solve(*ctx->problem, ctx->config, 
                                      nullptr, 0, nullptr, ctx->d_inject_buf, &ctx->d_global_best);
     
-    // 标记运行结束
+    // Mark as finished running
     ctx->running.store(false);
     
-    // 更新最优解
+    // Update best solution and full result
     {
         std::lock_guard<std::mutex> lock(ctx->best_mutex);
         ctx->best_solution = result.best_solution;
+        ctx->solve_result = result;
     }
     
-    // 标记完成
+    // Mark complete
     ctx->stop_flag.store(true);
 }
 
 // ============================================================
-// 协调线程函数（方案 B3）
+// Coordinator thread (plan B3)
 // ============================================================
-// 定期从各 GPU 的 d_global_best 读取当前 best，计算 global_best，注入到其他 GPU
+// Periodically read each GPU's current best from d_global_best, compute global_best, inject to other GPUs
 //
-// 关键设计：
-// 1. 直接从各 GPU 的 d_global_best 读取（由 solve() 导出）
-// 2. 要求启用 SA（否则无 d_global_best）
-// 3. 轻量侵入：solve() 只需导出一个指针，对单 GPU 无影响
+// Key design:
+// 1. Read directly from each GPU's d_global_best (exported by solve())
+// 2. Requires SA enabled (otherwise no d_global_best)
+// 3. Light touch: solve() only exports a pointer; single-GPU path unchanged
 
 template<typename Problem>
 void coordinator_thread(std::vector<MultiGpuContext<Problem>*>& contexts,
@@ -96,7 +98,7 @@ void coordinator_thread(std::vector<MultiGpuContext<Problem>*>& contexts,
     auto interval_ms = std::chrono::milliseconds(static_cast<int>(interval_sec * 1000));
     int round = 0;
     
-    // 等待所有 GPU 的 d_global_best 就绪
+    // Wait until all GPUs' d_global_best are ready
     bool all_ready = false;
     while (!all_ready) {
         std::this_thread::sleep_for(std::chrono::milliseconds(100));
@@ -110,10 +112,10 @@ void coordinator_thread(std::vector<MultiGpuContext<Problem>*>& contexts,
     }
     
     while (true) {
-        // 等待指定时间间隔
+        // Wait for the configured interval
         std::this_thread::sleep_for(interval_ms);
         
-        // 检查是否所有 GPU 都已停止
+        // Check whether all GPUs have stopped
         bool all_stopped = true;
         for (auto* ctx : contexts) {
             if (ctx->running.load()) {
@@ -125,17 +127,17 @@ void coordinator_thread(std::vector<MultiGpuContext<Problem>*>& contexts,
         
         round++;
         
-        // 收集各 GPU 的当前最优解（从 d_global_best 读取）
+        // Collect each GPU's current best (from d_global_best)
         Sol global_best;
         global_best.penalty = 1e30f;
         global_best.objectives[0] = 1e30f;
         int best_gpu = -1;
         
         for (int i = 0; i < (int)contexts.size(); i++) {
-            if (!contexts[i]->running.load()) continue;  // 已停止的 GPU 跳过
-            if (contexts[i]->d_global_best == nullptr) continue;  // 未就绪跳过
+            if (!contexts[i]->running.load()) continue;  // skip stopped GPUs
+            if (contexts[i]->d_global_best == nullptr) continue;  // skip not ready
             
-            // 从该 GPU 的 d_global_best 读取
+            // Read from this GPU's d_global_best
             Sol gpu_best;
             cudaSetDevice(contexts[i]->gpu_id);
             cudaMemcpy(&gpu_best, contexts[i]->d_global_best, sizeof(Sol), cudaMemcpyDeviceToHost);
@@ -146,23 +148,23 @@ void coordinator_thread(std::vector<MultiGpuContext<Problem>*>& contexts,
             }
         }
         
-        if (best_gpu == -1) continue;  // 所有 GPU 都已停止或未就绪
+        if (best_gpu == -1) continue;  // all GPUs stopped or not ready
         
         if (verbose) {
             printf("  [Coordinator Round %d] Global best from GPU %d: obj=%.2f, penalty=%.2f\n",
                    round, best_gpu, global_best.objectives[0], global_best.penalty);
         }
         
-        // 将 global_best 注入到其他 GPU（除了 best_gpu 自己）
+        // Inject global_best into other GPUs (except best_gpu)
         for (int i = 0; i < (int)contexts.size(); i++) {
-            if (i == best_gpu) continue;  // 不注入到自己
-            if (!contexts[i]->running.load()) continue;  // 已停止的 GPU 不注入
+            if (i == best_gpu) continue;  // do not inject to self
+            if (!contexts[i]->running.load()) continue;  // do not inject to stopped GPUs
             
-            // 读取 InjectBuffer 结构（从 device 到 host）
+            // Read InjectBuffer struct (device to host)
             InjectBuffer<Sol> buf;
             cudaMemcpy(&buf, contexts[i]->d_inject_buf, sizeof(InjectBuffer<Sol>), cudaMemcpyDeviceToHost);
             
-            // 同步写入（会自动切换设备）
+            // Synchronous write (switches device as needed)
             buf.write_sync(global_best, contexts[i]->gpu_id);
         }
     }
@@ -173,7 +175,7 @@ void coordinator_thread(std::vector<MultiGpuContext<Problem>*>& contexts,
 }
 
 // ============================================================
-// 多 GPU 协同求解主函数（方案 B3）
+// Multi-GPU cooperative solve entry (plan B3)
 // ============================================================
 
 template<typename Problem>
@@ -181,13 +183,17 @@ SolveResult<typename Problem::Sol> solve_multi_gpu(Problem& prob, const SolverCo
     using Sol = typename Problem::Sol;
     
     if (cfg.num_gpus <= 1) {
-        // 单 GPU 模式，直接调用普通 solve
+        // Single-GPU mode: call plain solve
         return solve(prob, cfg);
     }
     
-    // 检查可用 GPU 数量
-    int device_count;
+    // Check available GPU count
+    int device_count = 0;
     CUDA_CHECK(cudaGetDeviceCount(&device_count));
+    if (device_count <= 0) {
+        fprintf(stderr, "Error: No CUDA devices available\n");
+        return SolveResult<Sol>{};
+    }
     int actual_gpus = std::min(cfg.num_gpus, device_count);
     
     if (cfg.verbose) {
@@ -199,15 +205,15 @@ SolveResult<typename Problem::Sol> solve_multi_gpu(Problem& prob, const SolverCo
                cfg.multi_gpu_inject_mode == MultiGpuInjectMode::HalfIslands ? "HalfIslands" : "AllIslands");
     }
     
-    // 创建各 GPU 的上下文
+    // Create per-GPU contexts
     std::vector<MultiGpuContext<Problem>*> contexts;
     for (int i = 0; i < actual_gpus; i++) {
         auto* ctx = new MultiGpuContext<Problem>(i);
         ctx->config = cfg;
-        ctx->config.seed = cfg.seed + i * 1000;  // 每个 GPU 用不同 seed
-        ctx->config.num_gpus = 1;  // 单 GPU 模式运行
+        ctx->config.seed = cfg.seed + i * 1000;  // distinct seed per GPU
+        ctx->config.num_gpus = 1;  // run as single-GPU per device
         
-        // 克隆 Problem 到该 GPU
+        // Clone Problem onto this GPU
         ctx->problem = prob.clone_to_device(i);
         if (ctx->problem == nullptr) {
             fprintf(stderr, "Error: Failed to clone problem to GPU %d\n", i);
@@ -218,10 +224,10 @@ SolveResult<typename Problem::Sol> solve_multi_gpu(Problem& prob, const SolverCo
             return SolveResult<Sol>{};
         }
         
-        // 分配 InjectBuffer（在该 GPU 上）
+        // Allocate InjectBuffer on this GPU
         InjectBuffer<Sol> buf = InjectBuffer<Sol>::allocate(i);
         
-        // 将 InjectBuffer 拷贝到 device 端（传给 kernel）
+        // Copy InjectBuffer to device (for kernels)
         InjectBuffer<Sol>* d_buf;
         CUDA_CHECK(cudaSetDevice(i));
         CUDA_CHECK(cudaMalloc(&d_buf, sizeof(InjectBuffer<Sol>)));
@@ -231,34 +237,36 @@ SolveResult<typename Problem::Sol> solve_multi_gpu(Problem& prob, const SolverCo
         contexts.push_back(ctx);
     }
     
-    // 启动 worker 线程
+    // Start worker threads
     std::vector<std::thread> workers;
     for (auto* ctx : contexts) {
         workers.emplace_back(gpu_worker<Problem>, ctx);
     }
     
-    // 启动协调线程（定期注入 global_best）
+    // Start coordinator thread (periodic global_best injection)
     std::thread coordinator(coordinator_thread<Problem>, std::ref(contexts),
                             cfg.multi_gpu_interval_sec, cfg.verbose);
     
-    // 等待所有 worker 完成
+    // Wait for all workers to finish
     for (auto& w : workers) w.join();
     
-    // 等待协调线程完成
+    // Wait for coordinator to finish
     coordinator.join();
     
-    // 收集最终结果
+    // Collect final result from best GPU
     Sol final_best = contexts[0]->best_solution;
+    int best_ctx = 0;
     ObjConfig oc = prob.obj_config();
     for (int i = 1; i < (int)contexts.size(); i++) {
         if (is_better(contexts[i]->best_solution, final_best, oc)) {
             final_best = contexts[i]->best_solution;
+            best_ctx = i;
         }
     }
     
-    // 清理
+    // Cleanup
     for (auto* ctx : contexts) {
-        // 读取 InjectBuffer 的内容（用于释放）
+        // Read InjectBuffer content (for teardown)
         InjectBuffer<Sol> buf;
         CUDA_CHECK(cudaSetDevice(ctx->gpu_id));
         CUDA_CHECK(cudaMemcpy(&buf, ctx->d_inject_buf, sizeof(InjectBuffer<Sol>), cudaMemcpyDeviceToHost));
@@ -269,10 +277,12 @@ SolveResult<typename Problem::Sol> solve_multi_gpu(Problem& prob, const SolverCo
-        delete ctx;
     }
     
-    // 构造返回结果
-    SolveResult<Sol> result;
+    // Build return value from best GPU's result. Copy it out while the
+    // contexts are still alive; they are freed only after this point.
+    SolveResult<Sol> result = contexts[best_ctx]->solve_result;
     result.best_solution = final_best;
-    result.stop_reason = StopReason::MaxGen;
+    
+    // Free the per-GPU contexts now that nothing dereferences them anymore
+    for (auto* ctx : contexts) delete ctx;
     
     return result;
 }
diff --git a/prototype/core/operators.cuh b/prototype/core/operators.cuh
index f1db6e6..179ee63 100644
--- a/prototype/core/operators.cuh
+++ b/prototype/core/operators.cuh
@@ -1,40 +1,40 @@
 /**
- * operators.cuh - 四层搜索算子体系（Device 端）
+ * operators.cuh - Four-layer search operator hierarchy (device side)
  *
- * v1.0: 二维通用编码的完整算子层次
+ * v1.0: Full operator hierarchy for 2D universal encoding
  *
- * 层次结构（所有算子只看 data[D1][D2] + dim2_sizes，不感知问题语义）：
+ * Hierarchy (all operators only see data[D1][D2] + dim2_sizes, no problem semantics):
  *
- *   第 1 层 - 元素级（Element）: 操作单个元素
- *     行内: swap, reverse(2-opt), insert, flip
- *     跨行: cross_relocate（单元素移行）, cross_swap（单元素换行）
+ *   Layer 1 - Element: operate on single elements
+ *     Within row: swap, reverse(2-opt), insert, flip
+ *     Cross-row: cross_relocate (move one element across rows), cross_swap (swap one element per row)
  *
- *   第 2 层 - 片段级（Segment）: 操作连续片段
- *     行内: or_opt（移动连续 k 个元素到行内新位置）
- *     跨行: seg_relocate（片段从一行移到另一行）
- *            seg_swap（两行各取一段互换，即 2-opt*）
+ *   Layer 2 - Segment: operate on contiguous segments
+ *     Within row: or_opt (move contiguous k elements to a new position in the row)
+ *     Cross-row: seg_relocate (move a segment from one row to another)
+ *            seg_swap (swap two segments from two rows each, i.e. 2-opt*)
  *
- *   第 3 层 - 行级（Row）: 操作整行
- *     row_swap（交换两行全部内容和长度）
- *     row_reverse（反转行的排列顺序）
- *     row_split（一行拆成两行）
- *     row_merge（两行合并为一行）
+ *   Layer 3 - Row: operate on whole rows
+ *     row_swap (swap full contents and lengths of two rows)
+ *     row_reverse (reverse row order)
+ *     row_split (split one row into two)
+ *     row_merge (merge two rows into one)
  *
- *   第 4 层 - 交叉（Crossover）: 组合两个解
- *     row_crossover（从父代 A/B 各取若干行组成子代）
- *     uniform_crossover（逐元素从两个父代中选）
+ *   Layer 4 - Crossover: combine two solutions
+ *     row_crossover (child takes some rows from parent A and B)
+ *     uniform_crossover (pick per element from two parents)
  *
- * Move 描述符：
- *   row, row2: 行索引（row2=-1 表示行内）
- *   op:        操作码
- *   pos1, pos2: 位置参数
- *   seg_len:   片段长度（第 2 层使用）
+ * Move descriptor:
+ *   row, row2: row indices (row2=-1 means within-row)
+ *   op:        operation code
+ *   pos1, pos2: position parameters
+ *   seg_len:   segment length (used by layer 2)
  *
- * 设计原则：
- *   - 所有算子对问题类型无感知，只操作二维数组
- *   - 每个算子都有对应的 undo 操作
- *   - 空行安全：自动降级为 no-op
- *   - 编码类型决定可用算子集
+ * Design principles:
+ *   - All operators are problem-agnostic; they only manipulate a 2D array
+ *   - Each operator has a corresponding undo
+ *   - Empty-row safe: automatically degrades to no-op
+ *   - Encoding type determines the available operator set
  */
 
 #pragma once
@@ -44,61 +44,61 @@
 namespace ops {
 
 // ============================================================
-// Op 码常量 — 按层次编号，避免冲突
+// Op code constants — numbered by layer to avoid collisions
 // ============================================================
 
-// 通用
+// General
 constexpr int OP_NOOP             = -1;
 
-// --- 第 1 层：元素级 ---
-// Permutation 行内
-constexpr int PERM_SWAP           = 0;   // 交换两个位置
-constexpr int PERM_REVERSE        = 1;   // 反转区间（2-opt）
-constexpr int PERM_INSERT         = 2;   // 移动单个元素到新位置
-// Permutation 跨行
-constexpr int PERM_CROSS_RELOCATE = 3;   // 单元素从一行移到另一行
-constexpr int PERM_CROSS_SWAP     = 4;   // 两行各一个元素互换
-// Binary 行内
-constexpr int BIN_FLIP            = 0;   // 翻转一个位
-constexpr int BIN_SWAP            = 1;   // 交换两个位
-// Binary 跨行
-constexpr int BIN_CROSS_SWAP      = 2;   // 两行各一个位互换
+// --- Layer 1: element ---
+// Permutation within row
+constexpr int PERM_SWAP           = 0;   // swap two positions
+constexpr int PERM_REVERSE        = 1;   // reverse interval (2-opt)
+constexpr int PERM_INSERT         = 2;   // move one element to a new position
+// Permutation cross-row
+constexpr int PERM_CROSS_RELOCATE = 3;   // move one element from one row to another
+constexpr int PERM_CROSS_SWAP     = 4;   // swap one element per row between two rows
+// Binary within row
+constexpr int BIN_FLIP            = 0;   // flip one bit
+constexpr int BIN_SWAP            = 1;   // swap two bits
+// Binary cross-row
+constexpr int BIN_CROSS_SWAP      = 2;   // swap one bit per row between two rows
 
-// --- 第 1 层（续）：排列行内 ---
-constexpr int PERM_3OPT           = 5;   // 3-opt：断 3 条边重连
+// --- Layer 1 (cont.): permutation within row ---
+constexpr int PERM_3OPT           = 5;   // 3-opt: break 3 edges and reconnect
 
-// --- 第 2 层：片段级 ---
-constexpr int PERM_OR_OPT         = 10;  // 行内：移动连续 k 个元素
-constexpr int PERM_SEG_RELOCATE   = 11;  // 跨行：片段从一行移到另一行
-constexpr int PERM_SEG_SWAP       = 12;  // 跨行：两行各取一段互换（2-opt*）
-constexpr int PERM_CROSS_EXCHANGE = 15;  // 跨行：两行各取一段互换（保持各自内部顺序）
-constexpr int BIN_SEG_FLIP        = 13;  // 行内：翻转连续 k 个位
-constexpr int BIN_SEG_CROSS_SWAP  = 14;  // 跨行：两行各取一段互换
-constexpr int BIN_K_FLIP          = 16;  // 行内：同时翻转 k 个随机位
+// --- Layer 2: segment ---
+constexpr int PERM_OR_OPT         = 10;  // within row: move contiguous k elements
+constexpr int PERM_SEG_RELOCATE   = 11;  // cross-row: move segment from one row to another
+constexpr int PERM_SEG_SWAP       = 12;  // cross-row: swap two segments from two rows each (2-opt*)
+constexpr int PERM_CROSS_EXCHANGE = 15;  // cross-row: swap two segments (preserve internal order each)
+constexpr int BIN_SEG_FLIP        = 13;  // within row: flip contiguous k bits
+constexpr int BIN_SEG_CROSS_SWAP  = 14;  // cross-row: swap two segments from two rows each
+constexpr int BIN_K_FLIP          = 16;  // within row: flip k random bits at once
 
-// --- 第 3 层：行级 ---
-constexpr int ROW_SWAP            = 20;  // 交换两行全部内容
-constexpr int ROW_REVERSE         = 21;  // 反转行的排列顺序（行号重排）
-constexpr int ROW_SPLIT           = 22;  // 一行拆成两行
-constexpr int ROW_MERGE           = 23;  // 两行合并为一行
+// --- Layer 3: row ---
+constexpr int ROW_SWAP            = 20;  // swap full contents of two rows
+constexpr int ROW_REVERSE         = 21;  // reverse row order (row index permutation)
+constexpr int ROW_SPLIT           = 22;  // split one row into two
+constexpr int ROW_MERGE           = 23;  // merge two rows into one
 
-// --- 特殊：扰动（连续多步 move，不可 undo，用于跳出局部最优）---
+// --- Special: perturbation (multi-step moves, no undo, escape local optima) ---
 constexpr int PERTURBATION        = 40;
 
-// --- 第 4 层：交叉 ---
-constexpr int CROSS_ROW           = 30;  // 行级交叉：从两个父代各取若干行
-constexpr int CROSS_UNIFORM       = 31;  // 均匀交叉：逐元素从两个父代选
+// --- Layer 4: crossover ---
+constexpr int CROSS_ROW           = 30;  // row crossover: take some rows from each parent
+constexpr int CROSS_UNIFORM       = 31;  // uniform crossover: pick per element from two parents
 
 // ============================================================
-// Move 描述符 — 编码级别的变动描述
+// Move descriptor — encoding-level change description
 // ============================================================
 
 struct Move {
-    int row;            // 源行（或第一行）
-    int row2;           // 目标行（-1 = 行内）
-    int op;             // 操作码
-    int pos1, pos2;     // 位置参数
-    int seg_len;        // 片段长度（第 2 层使用，其他层 = 0）
+    int row;            // source row (or first row)
+    int row2;           // target row (-1 = within-row)
+    int op;             // operation code
+    int pos1, pos2;     // position parameters
+    int seg_len;        // segment length (layer 2; 0 for other layers)
 };
 
 }  // namespace ops
@@ -106,10 +106,10 @@ struct Move {
 namespace ops {
 
 // ============================================================
-// 第 1 层：元素级底层操作
+// Layer 1: element-level primitives
 // ============================================================
 
-// --- Permutation 行内 ---
+// --- Permutation within row ---
 
 __device__ inline void perm_swap(int* row, int i, int j) {
     int tmp = row[i]; row[i] = row[j]; row[j] = tmp;
@@ -126,9 +126,9 @@ __device__ inline void perm_insert(int* row, int from, int to, int size) {
     row[to] = val;
 }
 
-// --- Permutation 跨行 ---
+// --- Permutation cross-row ---
 
-/// cross_relocate: 从 src_row[src_pos] 取出元素，插入 dst_row[dst_pos]
+/// cross_relocate: take element from src_row[src_pos], insert at dst_row[dst_pos]
 __device__ inline void perm_cross_relocate(int* src_row, int& src_size,
                                             int* dst_row, int& dst_size,
                                             int src_pos, int dst_pos) {
@@ -142,24 +142,24 @@ __device__ inline void perm_cross_relocate(int* src_row, int& src_size,
     dst_size++;
 }
 
-/// cross_swap: 交换 rowA[posA] 和 rowB[posB]
+/// cross_swap: swap rowA[posA] and rowB[posB]
 __device__ inline void cross_swap_elem(int* rowA, int posA, int* rowB, int posB) {
     int tmp = rowA[posA]; rowA[posA] = rowB[posB]; rowB[posB] = tmp;
 }
 
-// --- Permutation 行内：3-opt ---
-// 断开 3 条边，选择最佳重连方式（共 8 种组合，取随机一种非恒等变换）
-// 参数：3 个断点 i < j < k，将路线分为 seg0=[0,i] seg1=[i+1,j] seg2=[j+1,k] seg3=[k+1,end]
-// 实现：随机选一种重连（reverse seg1, reverse seg2, 或两者都反转）
-// pos1=i, pos2=j, seg_len 编码 k
+// --- Permutation within row: 3-opt ---
+// Break 3 edges and reconnect (of the 8 possible reconnections, one fixed non-identity variant is used)
+// Args: three breakpoints i < j < k, route splits seg0=[0,i] seg1=[i+1,j] seg2=[j+1,k] seg3=[k+1,end]
+// Impl: always reverses both seg1 and seg2 (no random choice is made inside perm_3opt)
+// pos1=i, pos2=j, seg_len encodes k
 __device__ inline void perm_3opt(int* row, int size, int i, int j, int k) {
-    // 3-opt 有多种重连方式，这里实现最常用的 3 种非恒等变换：
-    //   type 1: reverse [i+1, j]                    — 等价于 2-opt(i+1, j)
-    //   type 2: reverse [j+1, k]                    — 等价于 2-opt(j+1, k)
-    //   type 3: reverse [i+1, j] + reverse [j+1, k] — 真正的 3-opt move
-    //   type 4: 将 seg1 和 seg2 互换位置（不反转）  — or-opt 的泛化
-    // 我们随机选 type 3 或 type 4（type 1/2 已被 2-opt 覆盖）
-    // 这里固定做 type 3（双反转），因为它是 2-opt 无法达到的唯一新邻域
+    // 3-opt has several reconnections; here we use the most common non-identity variants:
+    //   type 1: reverse [i+1, j]                    — same as 2-opt(i+1, j)
+    //   type 2: reverse [j+1, k]                    — same as 2-opt(j+1, k)
+    //   type 3: reverse [i+1, j] + reverse [j+1, k] — true 3-opt move
+    //   type 4: swap seg1 and seg2 (no reverse)     — generalization of or-opt
+    // We would randomize type 3 or 4 (types 1/2 are covered by 2-opt)
+    // Here we fix type 3 (double reverse) as the only new neighborhood 2-opt cannot reach
     // reverse [i+1, j]
     int lo = i + 1, hi = j;
     while (lo < hi) { int t = row[lo]; row[lo] = row[hi]; row[hi] = t; lo++; hi--; }
@@ -168,12 +168,12 @@ __device__ inline void perm_3opt(int* row, int size, int i, int j, int k) {
     while (lo < hi) { int t = row[lo]; row[lo] = row[hi]; row[hi] = t; lo++; hi--; }
 }
 
-// 3-opt undo: 再做一次相同操作即可恢复（双反转是自反的）
+// 3-opt undo: repeat the same move to restore (double reverse is self-inverse)
 __device__ inline void perm_3opt_undo(int* row, int size, int i, int j, int k) {
-    perm_3opt(row, size, i, j, k);  // 自反
+    perm_3opt(row, size, i, j, k);  // self-inverse
 }
 
-// --- Binary 行内 ---
+// --- Binary within row ---
 
 __device__ inline void bin_flip(int* row, int i) { row[i] = 1 - row[i]; }
 
@@ -182,51 +182,51 @@ __device__ inline void bin_swap(int* row, int i, int j) {
 }
 
 // ============================================================
-// 第 2 层：片段级底层操作
+// Layer 2: segment-level primitives
 // ============================================================
 
-/// or_opt: 行内移动连续 seg_len 个元素（从 from 开始）到 to 位置
-/// 等价于：取出 [from, from+seg_len)，插入到 to 之前
-/// 约束：from + seg_len <= size, to 不在 [from, from+seg_len) 内
+/// or_opt: within row, move contiguous seg_len elements (starting at from) to position to
+/// Same as: take [from, from+seg_len), insert before to
+/// Constraints: from + seg_len <= size, to not in [from, from+seg_len)
 __device__ inline void perm_or_opt(int* row, int size, int from, int to, int seg_len) {
-    // 临时缓冲（片段最大长度受限于寄存器，实际 seg_len 通常 <= 4）
-    int buf[8];  // 足够覆盖常见 seg_len
+    // Temp buffer (max segment length limited by registers; seg_len usually <= 4)
+    int buf[8];  // enough for typical seg_len
     int actual_len = (seg_len > 8) ? 8 : seg_len;
     
-    // 保存片段
+    // Save segment
     for (int i = 0; i < actual_len; i++) buf[i] = row[from + i];
     
-    // 移除片段（左移填补空洞）
+    // Remove segment (shift left to close gap)
     int new_size = size - actual_len;
     for (int k = from; k < new_size; k++) row[k] = row[k + actual_len];
     
-    // 计算插入位置（移除后的坐标系）
+    // Insert position after removal (coords after removal)
     int ins = (to > from) ? to - actual_len : to;
     if (ins < 0) ins = 0;
     if (ins > new_size) ins = new_size;
     
-    // 插入片段（右移腾位）
+    // Insert segment (shift right to make room)
     for (int k = new_size - 1; k >= ins; k--) row[k + actual_len] = row[k];
     for (int i = 0; i < actual_len; i++) row[ins + i] = buf[i];
 }
 
-/// seg_relocate: 从 src_row 取出连续 seg_len 个元素，插入 dst_row 的 dst_pos
-/// src_size 减 seg_len，dst_size 加 seg_len
+/// seg_relocate: take contiguous seg_len elements from src_row, insert at dst_pos in dst_row
+/// src_size -= seg_len, dst_size += seg_len
 __device__ inline void perm_seg_relocate(int* src_row, int& src_size,
                                           int* dst_row, int& dst_size,
                                           int src_pos, int dst_pos, int seg_len) {
     int buf[8];
     int actual_len = (seg_len > 8) ? 8 : seg_len;
     
-    // 保存片段
+    // Save segment
     for (int i = 0; i < actual_len; i++) buf[i] = src_row[src_pos + i];
     
-    // 源行：移除（左移）
+    // Source row: remove (shift left)
     for (int k = src_pos; k < src_size - actual_len; k++)
         src_row[k] = src_row[k + actual_len];
     src_size -= actual_len;
     
-    // 目标行：插入（右移）
+    // Destination row: insert (shift right)
     for (int k = dst_size - 1; k >= dst_pos; k--)
         dst_row[k + actual_len] = dst_row[k];
     for (int i = 0; i < actual_len; i++)
@@ -234,29 +234,29 @@ __device__ inline void perm_seg_relocate(int* src_row, int& src_size,
     dst_size += actual_len;
 }
 
-/// seg_swap: 两行各取一段互换（2-opt* 的通用形式）
+/// seg_swap: swap one segment from each row (general 2-opt*)
 /// rowA[posA..posA+lenA) <-> rowB[posB..posB+lenB)
-/// 行长变化：sizeA += (lenB - lenA), sizeB += (lenA - lenB)
+/// Row lengths: sizeA += (lenB - lenA), sizeB += (lenA - lenB)
 __device__ inline void perm_seg_swap(int* rowA, int& sizeA, int posA, int lenA,
                                       int* rowB, int& sizeB, int posB, int lenB) {
     int bufA[8], bufB[8];
     int aLen = (lenA > 8) ? 8 : lenA;
     int bLen = (lenB > 8) ? 8 : lenB;
     
-    // 保存两段
+    // Save both segments
     for (int i = 0; i < aLen; i++) bufA[i] = rowA[posA + i];
     for (int i = 0; i < bLen; i++) bufB[i] = rowB[posB + i];
     
-    // 从 rowA 移除 segA，腾出空间插入 segB
-    // 先移除
+    // Remove segA from rowA to make room for segB
+    // Remove first
     int newSizeA = sizeA - aLen;
     for (int k = posA; k < newSizeA; k++) rowA[k] = rowA[k + aLen];
-    // 再插入 segB
+    // Then insert segB
     for (int k = newSizeA - 1; k >= posA; k--) rowA[k + bLen] = rowA[k];
     for (int i = 0; i < bLen; i++) rowA[posA + i] = bufB[i];
     sizeA = newSizeA + bLen;
     
-    // 从 rowB 移除 segB，腾出空间插入 segA
+    // Remove segB from rowB to make room for segA
     int newSizeB = sizeB - bLen;
     for (int k = posB; k < newSizeB; k++) rowB[k] = rowB[k + bLen];
     for (int k = newSizeB - 1; k >= posB; k--) rowB[k + aLen] = rowB[k];
@@ -264,10 +264,10 @@ __device__ inline void perm_seg_swap(int* rowA, int& sizeA, int posA, int lenA,
     sizeB = newSizeB + aLen;
 }
 
-/// cross_exchange: 两行各取一段互换，保持各自内部顺序
-/// 与 seg_swap 的区别：seg_swap 是等长互换，cross_exchange 允许不等长
+/// cross_exchange: swap one segment from each row, preserving internal order each
+/// Intended difference from seg_swap: unequal lengths allowed (NOTE(review): perm_seg_swap also takes separate lenA/lenB — confirm the distinction)
 /// rowA[posA..posA+lenA) <-> rowB[posB..posB+lenB)
-/// 行长变化：sizeA += (lenB - lenA), sizeB += (lenA - lenB)
+/// Row lengths: sizeA += (lenB - lenA), sizeB += (lenA - lenB)
 __device__ inline void perm_cross_exchange(int* rowA, int& sizeA, int posA, int lenA,
                                             int* rowB, int& sizeB, int posB, int lenB) {
     int bufA[8], bufB[8];
@@ -277,14 +277,14 @@ __device__ inline void perm_cross_exchange(int* rowA, int& sizeA, int posA, int
     for (int i = 0; i < aLen; i++) bufA[i] = rowA[posA + i];
     for (int i = 0; i < bLen; i++) bufB[i] = rowB[posB + i];
     
-    // rowA: 移除 segA，插入 segB
+    // rowA: remove segA, insert segB
     int newSizeA = sizeA - aLen;
     for (int k = posA; k < newSizeA; k++) rowA[k] = rowA[k + aLen];
     for (int k = newSizeA - 1; k >= posA; k--) rowA[k + bLen] = rowA[k];
     for (int i = 0; i < bLen; i++) rowA[posA + i] = bufB[i];
     sizeA = newSizeA + bLen;
     
-    // rowB: 移除 segB，插入 segA
+    // rowB: remove segB, insert segA
     int newSizeB = sizeB - bLen;
     for (int k = posB; k < newSizeB; k++) rowB[k] = rowB[k + bLen];
     for (int k = newSizeB - 1; k >= posB; k--) rowB[k + aLen] = rowB[k];
@@ -292,8 +292,8 @@ __device__ inline void perm_cross_exchange(int* rowA, int& sizeA, int posA, int
     sizeB = newSizeB + aLen;
 }
 
-/// k-bit flip: 同时翻转 k 个随机位（Binary 编码）
-/// positions 数组存储要翻转的位置，k = 实际翻转数
+/// k-bit flip: flip k random bits at once (Binary encoding)
+/// Each of the k positions is drawn via rand_int(rng, size) (no positions array is passed in); k = number of flips
 __device__ inline void bin_k_flip(int* row, int size, int k, curandState* rng) {
     for (int i = 0; i < k; i++) {
         int pos = rand_int(rng, size);
@@ -301,12 +301,12 @@ __device__ inline void bin_k_flip(int* row, int size, int k, curandState* rng) {
     }
 }
 
-/// seg_flip: 翻转行内连续 seg_len 个位（Binary 编码）
+/// seg_flip: flip contiguous seg_len bits within row (Binary encoding)
 __device__ inline void bin_seg_flip(int* row, int pos, int seg_len) {
     for (int i = 0; i < seg_len; i++) row[pos + i] = 1 - row[pos + i];
 }
 
-/// seg_cross_swap: 两行各取一段互换（Binary 编码，等长）
+/// seg_cross_swap: swap one segment from each row (Binary encoding, equal length)
 __device__ inline void bin_seg_cross_swap(int* rowA, int posA,
                                            int* rowB, int posB, int seg_len) {
     for (int i = 0; i < seg_len; i++) {
@@ -317,23 +317,23 @@ __device__ inline void bin_seg_cross_swap(int* rowA, int posA,
 }
 
 // ============================================================
-// Integer 编码底层操作
+// Integer encoding primitives
 // ============================================================
 
-/// int_clamp: 将值限制在 [lb, ub] 范围内
+/// int_clamp: clamp value to [lb, ub]
 __device__ inline int int_clamp(int v, int lb, int ub) {
     if (v < lb) return lb;
     if (v > ub) return ub;
     return v;
 }
 
-/// int_random_reset: 随机一个位置重置为 [lb, ub] 内随机值
+/// int_random_reset: reset row[pos] (position chosen by the caller) to a random value in [lb, ub]
 __device__ inline void int_random_reset(int* row, int pos, int lb, int ub,
                                          curandState* rng) {
     row[pos] = lb + (curand(rng) % (ub - lb + 1));
 }
 
-/// int_delta: 随机一个位置 ±k（clamp 到 [lb, ub]）
+/// int_delta: add ±k to row[pos] (position chosen by the caller), clamped to [lb, ub]
 __device__ inline void int_delta(int* row, int pos, int lb, int ub,
                                   curandState* rng) {
     int range = ub - lb + 1;
@@ -343,7 +343,7 @@ __device__ inline void int_delta(int* row, int pos, int lb, int ub,
     row[pos] = int_clamp(row[pos] + step, lb, ub);
 }
 
-/// int_seg_reset: 连续 k 个位置全部重置为 [lb, ub] 内随机值
+/// int_seg_reset: reset k contiguous positions to uniform random in [lb, ub]
 __device__ inline void int_seg_reset(int* row, int pos, int seg_len,
                                       int lb, int ub, curandState* rng) {
     int range = ub - lb + 1;
@@ -351,7 +351,7 @@ __device__ inline void int_seg_reset(int* row, int pos, int seg_len,
         row[pos + i] = lb + (curand(rng) % range);
 }
 
-/// int_k_delta: 随机 k 个位置各自 ±1
+/// int_k_delta: k random positions, each ±1
 __device__ inline void int_k_delta(int* row, int size, int k,
                                     int lb, int ub, curandState* rng) {
     for (int i = 0; i < k; i++) {
@@ -362,21 +362,21 @@ __device__ inline void int_k_delta(int* row, int size, int k,
 }
 
 // ============================================================
-// 第 3 层：行级底层操作
+// Layer 3: row-level primitives
 // ============================================================
 
-/// row_swap: 交换两行的全部内容和长度
+/// row_swap: swap full contents and lengths of two rows
 template<typename Sol>
 __device__ inline void row_swap(Sol& sol, int r1, int r2) {
-    // 交换长度
+    // Swap lengths
     int tmp_size = sol.dim2_sizes[r1];
     sol.dim2_sizes[r1] = sol.dim2_sizes[r2];
     sol.dim2_sizes[r2] = tmp_size;
-    // 交换数据（取两行中较长的长度）
+    // Swap data (use the longer of the two row lengths)
     int max_len = (sol.dim2_sizes[r1] > sol.dim2_sizes[r2]) 
                   ? sol.dim2_sizes[r1] : sol.dim2_sizes[r2];
-    // 交换后 r1 的长度是原 r2 的，r2 的长度是原 r1 的
-    // 所以需要交换 max(原r1长度, 原r2长度) 个元素
+    // After swap, r1 has old r2 length and r2 has old r1 length
+    // So swap max(old r1 len, old r2 len) elements
     max_len = (tmp_size > max_len) ? tmp_size : max_len;
     for (int c = 0; c < max_len; c++) {
         int tmp = sol.data[r1][c];
@@ -385,8 +385,8 @@ __device__ inline void row_swap(Sol& sol, int r1, int r2) {
     }
 }
 
-/// row_reverse: 反转 [r1, r2] 范围内的行排列顺序
-/// 例如 row_reverse(sol, 1, 4) 把行 1,2,3,4 变成 4,3,2,1
+/// row_reverse_range: reverse row order in [r1, r2]
+/// e.g. row_reverse_range(sol, 1, 4) turns rows 1,2,3,4 into 4,3,2,1
 template<typename Sol>
 __device__ inline void row_reverse_range(Sol& sol, int r1, int r2) {
     while (r1 < r2) {
@@ -395,23 +395,23 @@ __device__ inline void row_reverse_range(Sol& sol, int r1, int r2) {
     }
 }
 
-/// row_split: 将 row 从 split_pos 处拆成两行
-/// row 保留 [0, split_pos)，empty_row 接收 [split_pos, size)
-/// 要求 empty_row 当前为空或有足够空间
+/// row_split: split row at split_pos into two rows
+/// row keeps [0, split_pos), empty_row gets [split_pos, size)
+/// requires empty_row to be empty: it is overwritten from index 0 and its length is set to the moved count
 template<typename Sol>
 __device__ inline void row_split(Sol& sol, int row, int empty_row, int split_pos) {
     int orig_size = sol.dim2_sizes[row];
     int move_count = orig_size - split_pos;
-    // 复制后半段到 empty_row
+    // Copy tail to empty_row
     for (int i = 0; i < move_count; i++)
         sol.data[empty_row][i] = sol.data[row][split_pos + i];
     sol.dim2_sizes[empty_row] = move_count;
     sol.dim2_sizes[row] = split_pos;
 }
 
-/// row_merge: 将 src_row 的全部内容追加到 dst_row 末尾
-/// src_row 清空，dst_row 长度增加
-/// 要求 dst_size + src_size <= DIM2
+/// row_merge: append full contents of src_row to end of dst_row
+/// src_row cleared, dst_row length increased
+/// requires dst_size + src_size <= DIM2
 template<typename Sol>
 __device__ inline void row_merge(Sol& sol, int dst_row, int src_row) {
     int dst_size = sol.dim2_sizes[dst_row];
@@ -423,33 +423,33 @@ __device__ inline void row_merge(Sol& sol, int dst_row, int src_row) {
 }
 
 // ============================================================
-// 第 4 层：交叉底层操作
+// Layer 4: crossover primitives
 // ============================================================
 //
-// 排列编码：OX 家族（统一框架）
-//   核心逻辑：A 中标记一组"保留位置"不动，空位按 B 的全局顺序填充
-//   三个变体只是"怎么选保留集"不同，填充逻辑完全共享
-//   天然保证唯一性：从 B 中按序取不在保留集中的元素，不会重复
-//   行长度不变（= A 的行长度），行边界不变
+// Permutation encoding: OX family (unified framework)
+//   Core: mark "kept" positions from A; fill gaps in B's global order
+//   Three variants differ only in how the keep set is chosen; fill logic is shared
+//   Uniqueness: elements are taken from B in order, skipping those in the keep set, so no duplicates arise
+//   Row lengths unchanged (= A's row lengths), row boundaries unchanged
 //
-// Binary 编码：uniform_crossover（逐元素随机选）
+// Binary encoding: uniform_crossover (random pick per element)
 //
 // ============================================================
 
-// ---- OX 核心填充逻辑 ----
-// keep[r][c] = true 表示 child[r][c] 保留 A 的值，false 表示空位
-// 空位按 B 中元素的出现顺序（逐行扫描）填充
-// 要求：child 已拷贝自 A，dim2_sizes 已设为 A 的行长度
+// ---- OX core fill logic ----
+// keep[r][c] = true means child[r][c] keeps A's value; false = gap to fill
+// Gaps filled in order of appearance of elements in B (row-major scan)
+// Requires: child copied from A, dim2_sizes set to A's row lengths
 //
-// 参数 total_elements: 分区模式下的总元素数，非分区模式下 = 单行长度
-//   用于确定 B 中扫描的元素范围
+// total_elements: total elements in partitioned mode; in non-partitioned = single row length
+//   Used to bound the scan range in B
 
 template<typename Sol>
 __device__ inline void ox_fill_from_b(Sol& child, const Sol& parentB,
                                        const bool* keep_flat,
                                        int dim1, int total_elements) {
-    // 统计 A 中保留位置的每个值的出现次数（支持多重集排列）
-    // keep_flat 是按行展平的：keep_flat[r * DIM2 + c]
+    // Count occurrences of each value at kept positions in A (multiset permutations)
+    // keep_flat is row-major flat: keep_flat[r * DIM2 + c]
     int keep_count[512];
     for (int i = 0; i < total_elements; i++) keep_count[i] = 0;
     
@@ -460,21 +460,21 @@ __device__ inline void ox_fill_from_b(Sol& child, const Sol& parentB,
                 if (v >= 0 && v < total_elements) keep_count[v]++;
             }
     
-    // 从 B 中按行扫描顺序收集：每个值只取"需要填充"的份数
-    // 标准排列：每个值最多 1 份，多重集：每个值最多 repeat_count 份
+    // Collect from B in row scan order: take only as many of each value as needed to fill
+    // Standard permutation: at most 1 of each value; multiset: up to repeat_count each
     int fill_buf[512];
     int fill_count = 0;
     for (int r = 0; r < dim1; r++)
         for (int c = 0; c < parentB.dim2_sizes[r]; c++) {
             int val = parentB.data[r][c];
             if (val >= 0 && val < total_elements && keep_count[val] > 0) {
-                keep_count[val]--;  // 消耗一个保留名额
+                keep_count[val]--;  // consume one kept slot
             } else if (val >= 0 && val < total_elements) {
                 fill_buf[fill_count++] = val;
             }
         }
     
-    // 按空位顺序（逐行从左到右）填入
+    // Fill gaps in order (row by row, left to right)
     int fi = 0;
     for (int r = 0; r < dim1; r++)
         for (int c = 0; c < child.dim2_sizes[r]; c++)
@@ -482,26 +482,26 @@ __device__ inline void ox_fill_from_b(Sol& child, const Sol& parentB,
                 child.data[r][c] = fill_buf[fi++];
 }
 
-// ---- 变体 1: OX-区间 ----
-// 每行随机选一个连续区间保留，保留邻接关系
+// ---- Variant 1: OX-interval ----
+// Per row, random contiguous interval kept; preserves adjacency
 template<typename Sol>
 __device__ inline void ox_interval(Sol& child, const Sol& parentA, const Sol& parentB,
                                     int dim1, int total_elements, curandState* rng) {
     bool keep[Sol::DIM1 * Sol::DIM2];
     for (int i = 0; i < Sol::DIM1 * Sol::DIM2; i++) keep[i] = false;
     
-    // child = A，同时标记每行的保留区间
+    // child = A, mark each row's kept interval
     for (int r = 0; r < dim1; r++) {
         int sz = parentA.dim2_sizes[r];
         child.dim2_sizes[r] = sz;
         for (int c = 0; c < sz; c++) child.data[r][c] = parentA.data[r][c];
         
         if (sz < 2) {
-            // 长度 0 或 1：全部保留
+            // length 0 or 1: keep all
             for (int c = 0; c < sz; c++) keep[r * Sol::DIM2 + c] = true;
             continue;
         }
-        // 随机选区间 [lo, hi]
+        // Random interval [lo, hi]
         int lo = rand_int(rng, sz);
         int hi = rand_int(rng, sz);
         if (lo > hi) { int tmp = lo; lo = hi; hi = tmp; }
@@ -511,8 +511,8 @@ __device__ inline void ox_interval(Sol& child, const Sol& parentA, const Sol& pa
     ox_fill_from_b(child, parentB, keep, dim1, total_elements);
 }
 
-// ---- 变体 2: OX-子集 ----
-// 随机选约 50% 的元素值保留其在 A 中的位置，通用性最强
+// ---- Variant 2: OX-subset ----
+// Randomly keep ~50% of positions at their A values; most general
 template<typename Sol>
 __device__ inline void ox_subset(Sol& child, const Sol& parentA, const Sol& parentB,
                                   int dim1, int total_elements, curandState* rng) {
@@ -526,7 +526,7 @@ __device__ inline void ox_subset(Sol& child, const Sol& parentA, const Sol& pare
             child.data[r][c] = parentA.data[r][c];
     }
     
-    // 每个位置 50% 概率保留
+    // 50% keep per position
     for (int r = 0; r < dim1; r++)
         for (int c = 0; c < child.dim2_sizes[r]; c++)
             keep[r * Sol::DIM2 + c] = (curand_uniform(rng) < 0.5f);
@@ -534,9 +534,9 @@ __device__ inline void ox_subset(Sol& child, const Sol& parentA, const Sol& pare
     ox_fill_from_b(child, parentB, keep, dim1, total_elements);
 }
 
-// ---- 变体 3: OX-行 ----
-// 随机选若干整行保留，其余行的元素全部按 B 的顺序重填
-// 保留整条路线结构，VRP 受益
+// ---- Variant 3: OX-row ----
+// Randomly keep whole rows; refill non-kept rows from B's order
+// Preserves full route structure; good for VRP
 template<typename Sol>
 __device__ inline void ox_row(Sol& child, const Sol& parentA, const Sol& parentB,
                                int dim1, int total_elements, curandState* rng) {
@@ -550,7 +550,7 @@ __device__ inline void ox_row(Sol& child, const Sol& parentA, const Sol& parentB
             child.data[r][c] = parentA.data[r][c];
     }
     
-    // 每行 50% 概率整行保留
+    // 50% chance to keep whole row
     int kept = 0;
     for (int r = 0; r < dim1; r++) {
         if (curand_uniform(rng) < 0.5f) {
@@ -559,14 +559,14 @@ __device__ inline void ox_row(Sol& child, const Sol& parentA, const Sol& parentB
             kept++;
         }
     }
-    // 确保不是全保留或全不保留
+    // Ensure not all-kept or all-unkept
     if (kept == 0) {
         int r = rand_int(rng, dim1);
-        // 不标记任何 keep → 全部重填（至少有一行不保留）
-        // 实际上 kept==0 意味着全部重填，这是合法的（child = B 的顺序填入 A 的结构）
+        // No keep marks → full refill (at least one row not kept)
+        // kept==0 means full refill; valid (child gets B's order into A's structure)
     }
     if (kept == dim1 && dim1 > 1) {
-        // 全保留 → 随机取消一行
+        // All kept → randomly un-keep one row
         int r = rand_int(rng, dim1);
         for (int c = 0; c < child.dim2_sizes[r]; c++)
             keep[r * Sol::DIM2 + c] = false;
@@ -575,14 +575,14 @@ __device__ inline void ox_row(Sol& child, const Sol& parentA, const Sol& parentB
     ox_fill_from_b(child, parentB, keep, dim1, total_elements);
 }
 
-// ---- OX 统一入口 ----
-// 随机选一个变体执行
-// dim1==1 时只用区间和子集（行变体无意义）
+// ---- OX unified entry ----
+// Pick one variant at random
+// When dim1==1 use only interval and subset (row variant useless)
 template<typename Sol>
 __device__ inline void perm_ox_crossover(Sol& child, const Sol& parentA, const Sol& parentB,
                                           int dim1, int total_elements, curandState* rng) {
     int n_variants = (dim1 > 1) ? 3 : 2;
-    int variant = rand_int(rng, n_variants);  // 0: 区间, 1: 子集, [2: 行]
+    int variant = rand_int(rng, n_variants);  // 0: interval, 1: subset, [2: row]
     switch (variant) {
         case 0: ox_interval(child, parentA, parentB, dim1, total_elements, rng); break;
         case 1: ox_subset(child, parentA, parentB, dim1, total_elements, rng); break;
@@ -590,8 +590,8 @@ __device__ inline void perm_ox_crossover(Sol& child, const Sol& parentA, const S
     }
 }
 
-/// uniform_crossover: 逐元素从两个父代中随机选择
-/// 适用于 Binary 编码（不破坏排列约束）
+/// uniform_crossover: random parent choice per element
+/// Suitable for Binary encoding (does not break permutation constraints)
 template<typename Sol>
 __device__ inline void uniform_crossover(Sol& child, const Sol& parentA, const Sol& parentB,
                                           int dim1, curandState* rng) {
@@ -607,15 +607,15 @@ __device__ inline void uniform_crossover(Sol& child, const Sol& parentA, const S
     }
 }
 
-// [已删除] generate_move_for_seq / sample_and_generate / apply_move / undo_move
-// P0 重构后主路径统一使用 execute_sequence，旧的 Move 生成+应用+撤销路径不再需要
+// [removed] generate_move_for_seq / sample_and_generate / apply_move / undo_move
+// After P0 refactor the main path uses execute_sequence; old Move gen/apply/undo path removed
 
 // ============================================================
-// execute_sequence — 统一接口：生成参数并直接执行（不返回 Move）
+// execute_sequence — unified API: generate params and execute directly (no Move returned)
 // ============================================================
-// 返回 true 若 sol 被修改，false 若 NOOP
-// d_G, d_O, rel_N: 可选的关系矩阵指针（SEQ_LNS_GUIDED_REBUILD 使用）
-// val_lb, val_ub: Integer 编码的值域范围（其他编码忽略）
+// Returns true if sol modified, false if NOOP
+// d_G, d_O, rel_N: optional relation matrices (for SEQ_LNS_GUIDED_REBUILD)
+// val_lb, val_ub: Integer encoding value range (ignored for other encodings)
 
 template<typename Sol>
 __device__ inline bool execute_sequence(int seq_id, Sol& sol, int dim1,
@@ -627,7 +627,7 @@ __device__ inline bool execute_sequence(int seq_id, Sol& sol, int dim1,
                                          int val_ub = 1,
                                          const void* prob_data = nullptr) {
     // ============================================================
-    // Permutation 序列
+    // Permutation sequences
     // ============================================================
     if (encoding == EncodingType::Permutation) {
         switch (seq_id) {
@@ -841,15 +841,15 @@ __device__ inline bool execute_sequence(int seq_id, Sol& sol, int dim1,
             return true;
         }
         case seq::SEQ_LNS_GUIDED_REBUILD: {
-            // 关系矩阵引导重建：
-            //   1. 随机选种子元素 seed
-            //   2. 查 G[seed] 找分组倾向最强的 K 个元素
-            //   3. 在解中找到这些元素的位置
-            //   4. 按 O 矩阵引导的顺序重排这些位置的元素
+            // Relation-matrix guided rebuild:
+            //   1. Pick a random seed element
+            //   2. Use G[seed] to pick the elements with the weakest grouping affinity to the seed
+            //   3. Find positions of these elements in the solution
+            //   4. Reorder these positions by order guided by O matrix
             //
-            // 如果没有关系矩阵（冷启动），退化为 scatter_shuffle
+            // Without relation matrices (cold start), fall back to scatter_shuffle
             if (!d_G || !d_O || rel_N <= 0) {
-                // 退化：随机 scatter shuffle
+                // Fallback: random scatter shuffle
                 int row = (dim1 > 1) ? rand_int(rng, dim1) : 0;
                 int sz = sol.dim2_sizes[row];
                 if (sz < 4) return false;
@@ -872,21 +872,21 @@ __device__ inline bool execute_sequence(int seq_id, Sol& sol, int dim1,
                 return true;
             }
             
-            // --- 有关系矩阵：引导重建 ---
-            // 通用策略（不感知问题类型）：
-            //   G 矩阵 → 选哪些元素（分组倾向弱的 = 可能放错位置的）
-            //   O 矩阵 → 怎么排（排序倾向引导重排顺序）
-            //   两者协同：G 选人，O 排序
+            // --- With relation matrices: guided rebuild ---
+            // Generic strategy (problem-agnostic):
+            //   G matrix → which elements (weak grouping with seed = likely misplaced)
+            //   O matrix → how to order (ordering affinity guides reorder)
+            //   Together: G picks, O orders
             int row = (dim1 > 1) ? rand_int(rng, dim1) : 0;
             int sz = sol.dim2_sizes[row];
             if (sz < 4) return false;
             
-            // 选种子元素
+            // Pick seed element
             int seed_pos = rand_int(rng, sz);
             int seed_val = sol.data[row][seed_pos];
             if (seed_val < 0 || seed_val >= rel_N) return false;
             
-            // 检查矩阵是否有足够信息（G 和 O 任一有信号即可）
+            // Check matrices have enough signal (either G or O)
             float max_signal = 0.0f;
             for (int c = 0; c < sz; c++) {
                 int v = sol.data[row][c];
@@ -897,11 +897,11 @@ __device__ inline bool execute_sequence(int seq_id, Sol& sol, int dim1,
                     if (o > max_signal) max_signal = o;
                 }
             }
-            if (max_signal < 0.05f) return false;  // 信息不足，跳过
+            if (max_signal < 0.05f) return false;  // insufficient signal, skip
             
-            // 破坏：锦标赛选择 G 值低的元素（t=2）
-            // G 值低 = 与 seed 分组倾向弱 = 可能放错位置
-            // 锦标赛：随机抽 2 个，取 G 值更低的那个，重复 count 次
+            // Destroy: tournament pick low-G elements (t=2)
+            // Low G = weak grouping with seed = likely misplaced
+            // Tournament: draw 2 at random, take lower G, repeat count times
             constexpr int MAX_REBUILD = 10;
             constexpr int TOUR_SIZE = 2;
             int count = sz / 5;  // ~20%
@@ -911,12 +911,12 @@ __device__ inline bool execute_sequence(int seq_id, Sol& sol, int dim1,
             
             int sel_pos[MAX_REBUILD];
             int sel_val[MAX_REBUILD];
-            bool used[128] = {};  // 标记已选位置，防止重复
+            bool used[128] = {};  // mark chosen positions to avoid duplicates
             int picked = 0;
-            int max_attempts = count * 4;  // 防止死循环
+            int max_attempts = count * 4;  // avoid infinite loop
             
             for (int attempt = 0; attempt < max_attempts && picked < count; attempt++) {
-                // 锦标赛：随机抽 TOUR_SIZE 个候选，取 G 值最低的
+                // Tournament: draw TOUR_SIZE candidates at random, take lowest G
                 int best_c = -1;
                 float best_g = 1e30f;
                 for (int t = 0; t < TOUR_SIZE; t++) {
@@ -936,15 +936,15 @@ __device__ inline bool execute_sequence(int seq_id, Sol& sol, int dim1,
             if (picked < 2) return false;
             count = picked;
             
-            // 修复：锦标赛排序（O 矩阵引导 + 随机扰动）
-            // 插入排序，比较时加噪声实现概率性：O 值高的大概率排前面，但不绝对
+            // Repair: tournament sort (O-guided + random noise)
+            // Insertion sort with noisy comparison: high O tends to go first, not guaranteed
             for (int i = 1; i < count; i++) {
                 int key = sel_val[i];
                 int j = i - 1;
                 while (j >= 0) {
                     float o_key_before = d_O[key * rel_N + sel_val[j]];
                     float o_j_before   = d_O[sel_val[j] * rel_N + key];
-                    // 噪声幅度 0.05：O 值差距 >0.05 时基本确定，<0.05 时随机
+                    // Noise scale 0.05: if O gap >0.05 mostly deterministic, else random
                     float noise = (curand_uniform(rng) - 0.5f) * 0.1f;
                     if (o_key_before + noise > o_j_before) {
                         sel_val[j + 1] = sel_val[j];
@@ -956,7 +956,7 @@ __device__ inline bool execute_sequence(int seq_id, Sol& sol, int dim1,
                 sel_val[j + 1] = key;
             }
             
-            // 对 sel_pos 排序（升序），使写回位置有序
+            // Sort sel_pos ascending so values are written back in position order
             for (int i = 1; i < count; i++) {
                 int key = sel_pos[i];
                 int j = i - 1;
@@ -967,7 +967,7 @@ __device__ inline bool execute_sequence(int seq_id, Sol& sol, int dim1,
                 sel_pos[j + 1] = key;
             }
             
-            // 检查是否真的改变了排列
+            // Check whether permutation actually changed
             bool any_change = false;
             for (int i = 0; i < count; i++) {
                 if (sol.data[row][sel_pos[i]] != sel_val[i]) {
@@ -977,7 +977,7 @@ __device__ inline bool execute_sequence(int seq_id, Sol& sol, int dim1,
             }
             if (!any_change) return false;
             
-            // 写回
+            // Write back
             for (int i = 0; i < count; i++) {
                 sol.data[row][sel_pos[i]] = sel_val[i];
             }
@@ -989,7 +989,7 @@ __device__ inline bool execute_sequence(int seq_id, Sol& sol, int dim1,
     }
 
     // ============================================================
-    // Binary 序列
+    // Binary sequences
     // ============================================================
     if (encoding == EncodingType::Binary) {
         switch (seq_id) {
@@ -1063,7 +1063,7 @@ __device__ inline bool execute_sequence(int seq_id, Sol& sol, int dim1,
     }
 
     // ============================================================
-    // Integer 序列
+    // Integer sequences
     // ============================================================
     if (encoding == EncodingType::Integer) {
         switch (seq_id) {
@@ -1131,7 +1131,7 @@ __device__ inline bool execute_sequence(int seq_id, Sol& sol, int dim1,
     }
 
     // ============================================================
-    // 共享：行级序列（编码无关）
+    // Shared: row-level sequences (encoding-agnostic)
     // ============================================================
     switch (seq_id) {
     case seq::SEQ_ROW_SWAP: {
@@ -1194,11 +1194,11 @@ __device__ inline bool execute_sequence(int seq_id, Sol& sol, int dim1,
 }
 
 // ============================================================
-// sample_and_execute — 从 SeqRegistry 按权重采样 + 直接执行
+// sample_and_execute — sample from SeqRegistry by weight and execute directly
 // ============================================================
-// 返回 true 若 sol 被修改，false 若 NOOP
-// 输出参数 out_seq_idx：采样到的序列在 registry 中的索引
-// d_G, d_O, rel_N: 可选的关系矩阵（传递给 execute_sequence）
+// Returns true if sol modified, false if NOOP
+// out_seq_idx: index of sampled sequence in registry
+// d_G, d_O, rel_N: optional relation matrices (passed to execute_sequence)
 
 template<typename Sol>
 __device__ inline bool sample_and_execute(const SeqRegistry& reg,
@@ -1212,7 +1212,7 @@ __device__ inline bool sample_and_execute(const SeqRegistry& reg,
                                           int val_lb = 0,
                                           int val_ub = 1,
                                           const void* prob_data = nullptr) {
-    // 延迟归一化：使用缓存的 weights_sum
+    // Lazy normalization: use cached weights_sum
     float r = curand_uniform(rng) * reg.weights_sum;  // r ∈ [0, weights_sum)
     float cumsum = 0.0f;
     out_seq_idx = reg.count - 1;
diff --git a/prototype/core/population.cuh b/prototype/core/population.cuh
index 4418ea8..338e548 100644
--- a/prototype/core/population.cuh
+++ b/prototype/core/population.cuh
@@ -1,10 +1,10 @@
 /**
- * population.cuh - 种群管理
+ * population.cuh - Population management
  * 
- * v2.0: Block 级架构
- *   - RNG 数组大小 = pop_size * block_size（每个 block 内每个线程独立 RNG）
- *   - 初始化 kernel 保持 1-thread-per-solution（初始化只做一次，不需要并行）
- *   - find_best_kernel 保持单线程（种群规模不大）
+ * v2.0: Block-level architecture
+ *   - RNG array size = pop_size * block_size (one independent RNG per thread within each block)
+ *   - Init kernel stays 1-thread-per-solution (initialization runs once; parallelism not needed)
+ *   - find_best_kernel remains single-threaded (population size is modest)
  */
 
 #pragma once
@@ -12,7 +12,7 @@
 #include "cuda_utils.cuh"
 
 // ============================================================
-// Device 端 Kernel（模板化）
+// Device-side kernels (templated)
 // ============================================================
 
 template<typename Sol>
@@ -65,9 +65,9 @@ __global__ void init_integer_kernel(Sol* pop, int pop_size,
 }
 
 // ============================================================
-// 多重集排列初始化 — 每个值 [0, N) 重复 R 次，总长度 N*R
+// Multiset permutation init — each value in [0, N) repeated R times, total length N*R
 // ============================================================
-// 用于 JSP 工序排列编码：N=num_jobs, R=num_ops，值 j 出现 R 次表示工件 j
+// For JSP operation-sequence encoding: N=num_jobs, R=num_ops; value j appearing R times means job j
 
 template<typename Sol>
 __global__ void init_multiset_perm_kernel(Sol* pop, int pop_size,
@@ -90,7 +90,7 @@ __global__ void init_multiset_perm_kernel(Sol* pop, int pop_size,
 }
 
 // ============================================================
-// 分区初始化 — 元素 {0..total_elements-1} 不重复分配到 dim1 行
+// Partition init — elements {0..total_elements-1} assigned without duplication across dim1 rows
 // ============================================================
 
 template<typename Sol>
@@ -131,21 +131,21 @@ __global__ void find_best_kernel(const Sol* pop, int pop_size,
 }
 
 // ============================================================
-// Host 端 RAII 类（模板化）
+// Host-side RAII class (templated)
 // ============================================================
 
 template<typename Sol>
 class Population {
 public:
     Sol*         d_solutions  = nullptr;
-    curandState* d_rng_states = nullptr;  // 大小 = pop_size * block_size
+    curandState* d_rng_states = nullptr;  // size = pop_size * block_size
     int          size         = 0;
-    int          rng_count    = 0;        // RNG 状态总数
+    int          rng_count    = 0;        // total RNG states
 
     Population() = default;
     
-    // block_size: Block 级架构下每个 block 的线程数
-    // RNG 数组大小 = pop_size * block_size（每个 block 内每个线程独立 RNG）
+    // block_size: threads per block under block-level architecture
+    // RNG array size = pop_size * block_size (one independent RNG per thread within each block)
     void allocate(int pop_size, int block_size = 128) {
         size = pop_size;
         rng_count = pop_size * block_size;
diff --git a/prototype/core/relation_matrix.cuh b/prototype/core/relation_matrix.cuh
index 89fb2ea..0fc0548 100644
--- a/prototype/core/relation_matrix.cuh
+++ b/prototype/core/relation_matrix.cuh
@@ -1,20 +1,20 @@
 /**
- * relation_matrix.cuh - G/O 关系矩阵管理
+ * relation_matrix.cuh - G/O relation matrix management
  *
- * G[i][j]: 分组倾向（元素 i 和 j 应在同一行的倾向，对称）
- * O[i][j]: 排序倾向（元素 i 应排在 j 前面的倾向，不对称）
+ * G[i][j]: grouping affinity (tendency for elements i and j to be on the same row; symmetric)
+ * O[i][j]: ordering affinity (tendency for element i to appear before j; asymmetric)
  *
- * 更新来源：历史最优解统计
- *   每当 host 端获取到当前 best 解，扫描所有元素对关系：
- *     - 同行 → G[i][j] 增强
- *     - i 在 j 前 → O[i][j] 增强
- *   使用 EMA 衰减：M[i][j] = α * M[i][j] + (1-α) * signal
+ * Update source: statistics from historical best solutions
+ *   Whenever the host obtains the current best solution, scan all element-pair relations:
+ *     - Same row → strengthen G[i][j]
+ *     - i before j → strengthen O[i][j]
+ *   EMA decay: M[i][j] = α * M[i][j] + (1-α) * signal
  *
- * 生命周期：
- *   1. relation_matrix_create(N)  — 分配 host/device 内存，初始化为 0
- *   2. relation_matrix_update(rm, sol, dim1) — 从一个解更新 G/O（host 端）
- *   3. relation_matrix_upload(rm) — 上传 h_G/h_O 到 d_G/d_O
- *   4. relation_matrix_destroy(rm) — 释放内存
+ * Lifecycle:
+ *   1. relation_matrix_create(N)  — allocate host/device memory, initialize to 0
+ *   2. relation_matrix_update(rm, sol, dim1) — update G/O from one solution (host)
+ *   3. relation_matrix_upload(rm) — upload h_G/h_O to d_G/d_O
+ *   4. relation_matrix_destroy(rm) — free memory
  */
 
 #pragma once
@@ -23,7 +23,7 @@
 #include <cstring>
 
 // ============================================================
-// 创建 / 销毁
+// Create / destroy
 // ============================================================
 
 inline RelationMatrix relation_matrix_create(int N, float decay = 0.95f) {
@@ -58,19 +58,19 @@ inline void relation_matrix_destroy(RelationMatrix& rm) {
 }
 
 // ============================================================
-// 从一个解更新 G/O（host 端）
+// Update G/O from one solution (host)
 // ============================================================
-// sol: 当前最优解（已下载到 host）
-// dim1: 实际使用的行数
+// sol: current best solution (already copied to host)
+// dim1: number of rows in use
 //
-// 逻辑：
-//   对 sol 中每对元素 (val_a, val_b)：
-//     如果在同一行 → G[val_a][val_b] 增强
-//     如果 val_a 在 val_b 前面 → O[val_a][val_b] 增强
+// Logic:
+//   For each pair (val_a, val_b) in sol:
+//     If on the same row → strengthen G[val_a][val_b]
+//     If val_a appears before val_b → strengthen O[val_a][val_b]
 //
-// 注意：元素值 val 必须在 [0, N) 范围内才有意义
-//       对于 partition 编码（VRP），元素值就是客户编号
-//       对于单行排列（TSP），元素值就是城市编号
+// Note: element values val are meaningful only in [0, N)
+//       For partition encoding (VRP), values are customer IDs
+//       For single-row permutation (TSP), values are city IDs
 
 template<typename Sol>
 void relation_matrix_update(RelationMatrix& rm, const Sol& sol, int dim1) {
@@ -78,13 +78,13 @@ void relation_matrix_update(RelationMatrix& rm, const Sol& sol, int dim1) {
     float alpha = rm.decay;
     float signal_strength = 1.0f;
     
-    // 衰减所有现有值
+    // Decay all existing values
     for (int i = 0; i < N * N; i++) {
         rm.h_G[i] *= alpha;
         rm.h_O[i] *= alpha;
     }
     
-    // 扫描解中的元素对关系
+    // Scan element-pair relations in the solution
     for (int r = 0; r < dim1; r++) {
         int sz = sol.dim2_sizes[r];
         for (int c1 = 0; c1 < sz; c1++) {
@@ -95,17 +95,17 @@ void relation_matrix_update(RelationMatrix& rm, const Sol& sol, int dim1) {
                 int val_b = sol.data[r][c2];
                 if (val_b < 0 || val_b >= N) continue;
                 
-                // 同行 → G 增强（对称）
+                // Same row → strengthen G (symmetric)
                 rm.h_G[val_a * N + val_b] += (1.0f - alpha) * signal_strength;
                 rm.h_G[val_b * N + val_a] += (1.0f - alpha) * signal_strength;
                 
-                // val_a 在 val_b 前 → O[val_a][val_b] 增强
+                // val_a before val_b → strengthen O[val_a][val_b]
                 rm.h_O[val_a * N + val_b] += (1.0f - alpha) * signal_strength;
             }
         }
     }
     
-    // 裁剪到 [0, 1]
+    // Clamp to [0, 1]
     for (int i = 0; i < N * N; i++) {
         if (rm.h_G[i] > 1.0f) rm.h_G[i] = 1.0f;
         if (rm.h_O[i] > 1.0f) rm.h_O[i] = 1.0f;
@@ -115,7 +115,7 @@ void relation_matrix_update(RelationMatrix& rm, const Sol& sol, int dim1) {
 }
 
 // ============================================================
-// 上传到 GPU
+// Upload to GPU
 // ============================================================
 
 inline void relation_matrix_upload(const RelationMatrix& rm) {
diff --git a/prototype/core/solver.cuh b/prototype/core/solver.cuh
index e27a38a..161bd4d 100644
--- a/prototype/core/solver.cuh
+++ b/prototype/core/solver.cuh
@@ -1,14 +1,14 @@
 /**
- * solver.cuh - 主求解循环
+ * solver.cuh - Main solve loop
  * 
- * v2.0: Block 级架构重构
- *   - 1 block = 1 solution（邻域并行）
- *   - Solution 存放在 shared memory
- *   - 每代：K 个线程各自生成候选 move + 评估 delta → 归约选最优 → thread 0 执行
- *   - 交叉暂用简化版（thread 0 执行，其余线程等待）
- *   - 迁移/精英注入保持单线程 kernel（操作全局内存）
+ * v2.0: Block-level architecture refactor
+ *   - 1 block = 1 solution (neighborhood parallelism)
+ *   - Solution lives in shared memory
+ *   - Each generation: K threads each propose a candidate move + evaluate delta -> reduce to best -> thread 0 applies
+ *   - Crossover uses a simplified path for now (thread 0 runs crossover, others wait)
+ *   - Migration / elite injection remain single-thread kernels (global memory)
  *
- * 要求 Problem 接口：
+ * Required Problem interface:
  *   size_t shared_mem_bytes() const;
  *   __device__ void load_shared(char* smem, int tid, int bsz);
  *   __device__ void evaluate(Sol& sol) const;
@@ -25,16 +25,16 @@
 #include <cmath>
 
 // ============================================================
-// 编译时常量
+// Compile-time constants
 // ============================================================
-constexpr int BLOCK_LEVEL_THREADS = 128;  // Block 级架构的默认线程数/block
+constexpr int BLOCK_LEVEL_THREADS = 128;  // Default threads per block for block-level architecture
 
 // ============================================================
-// EvolveParams — CUDA Graph 可变参数（device memory）
+// EvolveParams — CUDA Graph mutable parameters (device memory)
 // ============================================================
-// 将每个 batch 会变化的参数集中到一个 struct 中，
-// evolve_block_kernel 通过指针读取，CUDA Graph 录制时绑定指针。
-// 每次 replay 前只需 cudaMemcpy 更新这块 device memory。
+// Parameters that change from batch to batch are packed into one struct.
+// evolve_block_kernel reads them through a pointer that is bound at CUDA Graph capture time.
+// Before each replay, only this block of device memory needs to be refreshed via cudaMemcpy.
 
 struct EvolveParams {
     float       temp_start;
@@ -46,13 +46,13 @@ struct EvolveParams {
 };
 
 // ============================================================
-// 工具：协作加载/存储 Solution（shared memory ↔ global memory）
+// Helpers: cooperative load/store Solution (shared memory ↔ global memory)
 // ============================================================
 
 template<typename Sol>
 __device__ inline void cooperative_load_sol(Sol& dst, const Sol& src,
                                              int tid, int num_threads) {
-    // 按 int 粒度协作拷贝整个 Solution 结构体
+    // Cooperative copy of entire Solution struct in int-sized chunks
     const int* src_ptr = reinterpret_cast<const int*>(&src);
     int* dst_ptr = reinterpret_cast<int*>(&dst);
     constexpr int n_ints = (sizeof(Sol) + sizeof(int) - 1) / sizeof(int);
@@ -63,11 +63,11 @@ __device__ inline void cooperative_load_sol(Sol& dst, const Sol& src,
 template<typename Sol>
 __device__ inline void cooperative_store_sol(Sol& dst, const Sol& src,
                                               int tid, int num_threads) {
-    cooperative_load_sol(dst, src, tid, num_threads);  // 同样的拷贝逻辑
+    cooperative_load_sol(dst, src, tid, num_threads);  // Same copy logic
 }
 
 // ============================================================
-// Kernel 1: 初始评估（只调用一次，1 block = 1 solution）
+// Kernel 1: Initial evaluation (once; 1 block = 1 solution)
 // ============================================================
 
 template<typename Problem, typename Sol>
@@ -77,27 +77,27 @@ __global__ void evaluate_kernel(Problem prob, Sol* pop, int pop_size,
     Problem lp = prob;
     if (smem_size > 0) { lp.load_shared(smem, threadIdx.x, blockDim.x); __syncthreads(); }
     
-    // 1-thread-per-solution 初始评估（保持简单，只调用一次）
+    // One-thread-per-solution initial evaluation (simple; called once)
     int tid = blockIdx.x * blockDim.x + threadIdx.x;
     if (tid < pop_size) lp.evaluate(pop[tid]);
 }
 
 // ============================================================
-// Kernel 2: Block 级批量进化（邻域并行）
+// Kernel 2: Block-level batched evolution (neighborhood parallelism)
 // ============================================================
 //
-// 每代流程：
-//   1. K 个线程各自生成一个候选 move
-//   2. K 个线程各自评估 move 的 delta（不修改 shared memory 中的 sol）
-//   3. Block 内归约：选 delta 最小的 move
-//   4. Thread 0 决定是否接受（SA / HC）
-//   5. Thread 0 执行最优 move 并更新 sol
-//   6. __syncthreads() 让所有线程看到更新后的 sol
+// Per-generation flow:
+//   1. Each of K threads generates one candidate move
+//   2. Each thread evaluates delta for its move (does not modify sol in shared memory)
+//   3. Block reduction: pick move with smallest delta
+//   4. Thread 0 accepts or rejects (SA / HC)
+//   5. Thread 0 applies best move and updates sol
+//   6. __syncthreads() so all threads see updated sol
 //
-// Solution 在 shared memory 中，Problem 数据也在 shared memory 中
+// Solution and Problem data live in shared memory
 
 // ============================================================
-// MultiStepCandidate — 多步执行结果（用于归约）
+// MultiStepCandidate — multi-step result (for reduction)
 // ============================================================
 struct MultiStepCandidate {
     float delta;
@@ -135,23 +135,23 @@ __global__ void evolve_block_kernel(Problem prob, Sol* pop, int pop_size,
     const float temp_start = d_params->temp_start;
     const ObjConfig oc = d_params->oc;
     
-    // --- shared memory 布局 ---
+    // --- shared memory layout ---
     // [0 .. sizeof(Sol)-1]                              : Solution
-    // [sizeof(Sol) .. sizeof(Sol)+prob_smem-1]          : Problem 数据
-    // [之后 .. ]                                        : MultiStepCandidate[num_threads] 归约工作区
-    // [之后 .. ]                                        : AOSStats (如果启用)
+    // [sizeof(Sol) .. sizeof(Sol)+prob_smem-1]          : Problem data
+    // [after .. ]                                       : MultiStepCandidate[num_threads] reduction workspace
+    // [after .. ]                                       : AOSStats (if enabled)
     
     Sol* s_sol = reinterpret_cast<Sol*>(smem);
     char* prob_smem_ptr = smem + sizeof(Sol);
     MultiStepCandidate* s_cands = reinterpret_cast<MultiStepCandidate*>(
         smem + sizeof(Sol) + prob_smem_size);
     
-    // AOS 统计（在 MultiStepCandidate 数组之后）
+    // AOS stats (after MultiStepCandidate array)
     AOSStats* s_aos = nullptr;
     if (d_aos_stats) {
         s_aos = reinterpret_cast<AOSStats*>(
             smem + sizeof(Sol) + prob_smem_size + sizeof(MultiStepCandidate) * num_threads);
-        // Thread 0 初始化 AOS 计数器
+        // Thread 0 initializes AOS counters
         if (tid == 0) {
             for (int i = 0; i < MAX_SEQ; i++) {
                 s_aos->usage[i] = 0;
@@ -164,13 +164,13 @@ __global__ void evolve_block_kernel(Problem prob, Sol* pop, int pop_size,
         }
     }
     
-    // 加载 Problem 数据到 shared memory
+    // Load Problem data into shared memory
     Problem lp = prob;
     if (prob_smem_size > 0) {
         lp.load_shared(prob_smem_ptr, tid, num_threads);
     }
     
-    // 协作加载 Solution 到 shared memory
+    // Cooperatively load Solution into shared memory
     cooperative_load_sol(*s_sol, pop[bid], tid, num_threads);
     __syncthreads();
     
@@ -181,12 +181,12 @@ __global__ void evolve_block_kernel(Problem prob, Sol* pop, int pop_size,
     
     for (int g = 0; g < gens_per_batch; g++) {
         // ============================================================
-        // Step 1: 每个线程独立采样 K 步数 + K 个序列，在 local copy 上执行
+        // Step 1: Each thread independently samples K steps + K sequences on local copy
         // ============================================================
         
-        // 采样 K（步数）：按 kstep.weights 权重
+        // Sample K (step count): weighted by kstep.weights
         float kr = curand_uniform(&rng);
-        int my_k = 1;  // 默认 K=1
+        int my_k = 1;  // default K=1
         {
             float cum = 0.0f;
             for (int i = 0; i < MAX_K; i++) {
@@ -195,7 +195,7 @@ __global__ void evolve_block_kernel(Problem prob, Sol* pop, int pop_size,
             }
         }
         
-        // 在 local memory 拷贝 sol，执行 K 步 move
+        // Copy sol into local memory, then apply the K moves to that copy
         Sol local_sol = *s_sol;
         MultiStepCandidate my_cand;
         my_cand.k_steps = my_k;
@@ -215,7 +215,7 @@ __global__ void evolve_block_kernel(Problem prob, Sol* pop, int pop_size,
             if (changed) all_noop = false;
         }
         
-        // Step 2: 评估最终 delta（K 步之后 vs 原始 sol）
+        // Step 2: Evaluate final delta (after K steps vs original sol)
         if (all_noop) {
             my_cand.delta = 1e30f;
             my_cand.new_penalty = s_sol->penalty;
@@ -242,7 +242,7 @@ __global__ void evolve_block_kernel(Problem prob, Sol* pop, int pop_size,
         s_cands[tid] = my_cand;
         __syncthreads();
         
-        // Step 3: Block 内并行归约，找 delta 最小的 candidate
+        // Step 3: Parallel reduction in block to find candidate with smallest delta
         for (int stride = num_threads / 2; stride > 0; stride >>= 1) {
             if (tid < stride) {
                 if (s_cands[tid + stride].delta < s_cands[tid].delta)
@@ -251,7 +251,7 @@ __global__ void evolve_block_kernel(Problem prob, Sol* pop, int pop_size,
             __syncthreads();
         }
         
-        // Step 4: Thread 0 决定是否接受
+        // Step 4: Thread 0 decides accept/reject
         if (tid == 0) {
             MultiStepCandidate& best = s_cands[0];
             bool has_valid = (best.delta < 1e29f);
@@ -269,7 +269,7 @@ __global__ void evolve_block_kernel(Problem prob, Sol* pop, int pop_size,
                 }
                 
                 if (accept) {
-                    // AOS 统计：K 层 + 算子层
+                    // AOS stats: K layer + operator layer
                     if (s_aos) {
                         int ki = best.k_steps - 1;
                         if (ki >= 0 && ki < MAX_K) {
@@ -304,23 +304,23 @@ __global__ void evolve_block_kernel(Problem prob, Sol* pop, int pop_size,
         __syncthreads();
     }
     
-    // 写回 Solution 到全局内存
+    // Write Solution back to global memory
     cooperative_store_sol(pop[bid], *s_sol, tid, num_threads);
     
-    // AOS 统计写回全局内存
+    // Write AOS stats back to global memory
     if (d_aos_stats && tid == 0) {
         d_aos_stats[bid] = *s_aos;
     }
     
-    // 保存 RNG 状态
+    // Save RNG state
     rng_states[rng_idx] = rng;
 }
 
 // ============================================================
-// Kernel 2b: Block 级交叉操作
+// Kernel 2b: Block-level crossover
 // ============================================================
-// 简化版：thread 0 执行交叉逻辑，其余线程协作加载/存储
-// 后续 Phase 3 会实现多线程协作交叉
+// Simplified: thread 0 runs the crossover logic; the other threads help with the cooperative load/store
+// Phase 3 may add multi-thread cooperative crossover
 
 template<typename Problem, typename Sol>
 __global__ void crossover_block_kernel(Problem prob, Sol* pop, int pop_size,
@@ -338,7 +338,7 @@ __global__ void crossover_block_kernel(Problem prob, Sol* pop, int pop_size,
     
     if (bid >= pop_size) return;
     
-    // shared memory 布局：Sol + Problem data
+    // Shared memory layout: Sol + Problem data
     Sol* s_sol = reinterpret_cast<Sol*>(smem);
     char* prob_smem_ptr = smem + sizeof(Sol);
     
@@ -350,7 +350,7 @@ __global__ void crossover_block_kernel(Problem prob, Sol* pop, int pop_size,
     cooperative_load_sol(*s_sol, pop[bid], tid, K);
     __syncthreads();
     
-    // Thread 0 执行交叉逻辑
+    // Thread 0 runs crossover
     if (tid == 0) {
         int rng_idx = bid * K;
         curandState rng = rng_states[rng_idx];
@@ -389,12 +389,12 @@ __global__ void crossover_block_kernel(Problem prob, Sol* pop, int pop_size,
     }
     __syncthreads();
     
-    // 写回（可能被交叉更新了）
+    // Write back (possibly updated by crossover)
     cooperative_store_sol(pop[bid], *s_sol, tid, K);
 }
 
 // ============================================================
-// Kernel 3: 岛屿间迁移（保持不变，单线程 kernel）
+// Kernel 3: Inter-island migration (unchanged; single-thread kernel)
 // ============================================================
 
 template<typename Sol>
@@ -406,6 +406,8 @@ __device__ inline int find_worst_in_island(const Sol* pop, int base, int island_
     return worst;
 }
 
+constexpr int MAX_ISLANDS = 64;
+
 template<typename Sol>
 __global__ void migrate_kernel(Sol* pop, int pop_size, int island_size,
                                 ObjConfig oc,
@@ -414,8 +416,10 @@ __global__ void migrate_kernel(Sol* pop, int pop_size, int island_size,
     if (threadIdx.x != 0 || blockIdx.x != 0) return;
     int round = d_params->migrate_round;
     int num_islands = pop_size / island_size;
+    if (num_islands > MAX_ISLANDS) num_islands = MAX_ISLANDS;
+    if (num_islands <= 1) return;
     
-    int candidates[64];
+    int candidates[MAX_ISLANDS];
     for (int isle = 0; isle < num_islands; isle++) {
         int base = isle * island_size;
         int best = base;
@@ -424,9 +428,9 @@ __global__ void migrate_kernel(Sol* pop, int pop_size, int island_size,
         candidates[isle] = best;
     }
     
-    int topn[64];
+    int topn[MAX_ISLANDS];
     if (strategy == MigrateStrategy::TopN || strategy == MigrateStrategy::Hybrid) {
-        bool selected[64] = {};
+        bool selected[MAX_ISLANDS] = {};
         for (int t = 0; t < num_islands; t++) {
             int best_c = -1;
             for (int c = 0; c < num_islands; c++) {
@@ -459,7 +463,7 @@ __global__ void migrate_kernel(Sol* pop, int pop_size, int island_size,
 }
 
 // ============================================================
-// Kernel 4: 精英注入（保持不变）
+// Kernel 4: Elite injection (unchanged)
 // ============================================================
 
 template<typename Sol>
@@ -483,7 +487,7 @@ __global__ void elite_inject_kernel(Sol* pop, int pop_size,
 }
 
 // ============================================================
-// v5.0: 多 GPU 协同 — 注入外部解到岛屿
+// v5.0: Multi-GPU coordination — inject external solutions into islands
 // ============================================================
 
 template<typename Sol>
@@ -496,7 +500,7 @@ __global__ void inject_to_islands_kernel(Sol* pop, int pop_size, int island_size
     int num_islands = pop_size / island_size;
     if (num_islands == 0) return;
     
-    // 根据注入模式确定注入的岛屿数量
+    // Number of islands to inject into depends on mode
     int islands_to_inject = 0;
     if (mode == MultiGpuInjectMode::OneIsland) {
         islands_to_inject = 1;
@@ -506,15 +510,15 @@ __global__ void inject_to_islands_kernel(Sol* pop, int pop_size, int island_size
         islands_to_inject = num_islands;
     }
     
-    // 将注入解分配到各个岛屿的 worst 位置
+    // Place each injected solution at worst slot of an island
     for (int i = 0; i < islands_to_inject && i < num_inject; i++) {
         int target_isle = i % num_islands;
         int base = target_isle * island_size;
         
-        // 找到该岛的 worst 解
+        // Find worst solution on this island
         int worst = find_worst_in_island(pop, base, island_size, oc);
         
-        // 如果注入解更优，则替换
+        // Replace if injection is better
         if (is_better(inject_solutions[i], pop[worst], oc)) {
             pop[worst] = inject_solutions[i];
         }
@@ -522,49 +526,49 @@ __global__ void inject_to_islands_kernel(Sol* pop, int pop_size, int island_size
 }
 
 // ============================================================
-// v5.0 方案 B3: inject_check_kernel — 被动注入检查
+// v5.0 plan B3: inject_check_kernel — passive injection check
 // ============================================================
-// GPU 在 migrate 时检查 InjectBuffer，如果有新解则注入到第一个岛的 worst
-// 使用 atomicExch 原子读取并清除 flag，确保线程安全
+// During migrate, the GPU checks InjectBuffer; if a new solution is present, it replaces the worst slot of the first island when it is better
+// atomicExch reads and clears flag atomically for thread safety
 //
-// 设计要点：
-// 1. 单线程执行（thread 0 of block 0），避免竞争
-// 2. atomicExch 原子读取 flag 并清零，确保每个解只被处理一次
-// 3. 只注入到第一个岛（OneIsland 策略），保持多样性
-// 4. 完全可选：如果 inject_buf 为 nullptr，直接跳过（不影响单 GPU）
+// Design notes:
+// 1. Single thread (thread 0 of block 0) to avoid races
+// 2. atomicExch reads flag and clears it so each solution is handled once
+// 3. Inject only into first island (OneIsland strategy) to preserve diversity
+// 4. Optional: if inject_buf is nullptr, skip (single-GPU unaffected)
 
 template<typename Sol>
 __global__ void inject_check_kernel(Sol* pop, int pop_size, int island_size,
                                      InjectBuffer<Sol>* inject_buf, ObjConfig oc) {
-    // 单线程执行
+    // Single-thread execution
     if (threadIdx.x != 0 || blockIdx.x != 0) return;
     
-    // 如果没有注入缓冲区，直接返回（单 GPU 场景）
+    // No injection buffer — return (single-GPU case)
     if (inject_buf == nullptr) return;
     
-    // 原子读取并清除 flag（确保每个解只被处理一次）
+    // Atomically read and clear flag (each solution processed once)
     int flag = atomicExch(inject_buf->d_flag, 0);
     
-    // 如果没有新解，直接返回
+    // No new solution — return
     if (flag != 1) return;
     
-    // 读取注入的解
+    // Read injected solution
     Sol inject_sol = *(inject_buf->d_solution);
     
-    // 找到第一个岛的 worst 位置
+    // Find worst slot on first island
     int num_islands = pop_size / island_size;
     if (num_islands == 0) return;
     
     int worst = find_worst_in_island(pop, 0, island_size, oc);
     
-    // 如果注入解更优，则替换
+    // Replace if injection is better
     if (is_better(inject_sol, pop[worst], oc)) {
         pop[worst] = inject_sol;
     }
 }
 
 // ============================================================
-// solve<Problem>: 主循环（Block 级架构）
+// solve<Problem>: main loop (block-level architecture)
 // ============================================================
 
 using RegistryCallback = void(*)(SeqRegistry&);
@@ -586,22 +590,22 @@ SolveResult<typename Problem::Sol> solve(Problem& prob, const SolverConfig& cfg,
     bool use_time_limit = cfg.time_limit_sec > 0.0f;
     bool use_stagnation = cfg.stagnation_limit > 0;
     
-    // Block 级参数
-    const int block_threads = BLOCK_LEVEL_THREADS;  // 128 线程/block
+    // Block-level parameters
+    const int block_threads = BLOCK_LEVEL_THREADS;  // 128 threads/block
     
-    // --- 0. Shared memory 计算（需要在 pop_size 确定之前完成，用于 occupancy 查询）---
+    // --- 0. Shared memory sizing (before pop_size; used for occupancy query) ---
     size_t prob_smem = prob.shared_mem_bytes();
-    // v3.1: 归约工作区为 MultiStepCandidate（含 K 步 moves + seq_indices）
+    // v3.1: reduction workspace is MultiStepCandidate (K-step moves + seq_indices)
     size_t total_smem = sizeof(Sol) + prob_smem + sizeof(MultiStepCandidate) * block_threads;
     if (use_aos) total_smem += sizeof(AOSStats);
     
-    // 查询 GPU 硬件属性
+    // Query GPU device properties
     cudaDeviceProp prop;
     int device;
     CUDA_CHECK(cudaGetDevice(&device));
     CUDA_CHECK(cudaGetDeviceProperties(&prop, device));
     
-    // 尝试扩展 shared memory 上限（V100: 96KB, A100: 164KB 等）
+    // Try to raise shared memory cap (V100: 96KB, A100: 164KB, etc.)
     size_t max_smem = (size_t)prop.sharedMemPerBlock;
     if (total_smem > 48 * 1024) {
         cudaError_t err1 = cudaFuncSetAttribute(
@@ -617,7 +621,7 @@ SolveResult<typename Problem::Sol> solve(Problem& prob, const SolverConfig& cfg,
         }
     }
     
-    // 检查 shared memory 上限
+    // Check shared memory limit
     bool smem_overflow = false;
     if (total_smem > max_smem) {
         smem_overflow = (prob_smem > 0);
@@ -626,12 +630,12 @@ SolveResult<typename Problem::Sol> solve(Problem& prob, const SolverConfig& cfg,
         if (use_aos) total_smem += sizeof(AOSStats);
     }
     
-    // --- 0b. 确定 pop_size（自动或用户指定）---
+    // --- 0b. Determine pop_size (auto or user) ---
     int pop_size = cfg.pop_size;
     bool auto_pop = (pop_size <= 0);
     
     if (auto_pop) {
-        // 查询 occupancy：每个 SM 能同时运行多少个 block
+        // Query occupancy: how many blocks per SM
         int max_blocks_per_sm = 0;
         cudaOccupancyMaxActiveBlocksPerMultiprocessor(
             &max_blocks_per_sm,
@@ -642,17 +646,17 @@ SolveResult<typename Problem::Sol> solve(Problem& prob, const SolverConfig& cfg,
         int full_capacity = max_blocks_per_sm * prop.multiProcessorCount;
         
         if (prob_smem > 0) {
-            // 问题数据在 shared memory → 无 L2 cache 压力，打满 SM
+            // Problem data in shared memory → no L2 pressure; fill SMs
             pop_size = full_capacity;
         } else {
-            // 问题数据在 global memory → 根据 L2 cache 容量估算合理并发度
+            // Problem data in global memory → estimate concurrency from L2 size
             //
-            // 模型：pop = L2_size / working_set_bytes
-            //   所有 block 访问同一份只读数据，L2/ws 反映 cache 能支撑的并发度
+            // Model: pop = L2_size / working_set_bytes
+            //   All blocks read same read-only data; L2/ws approximates cache-supported concurrency
             //
-            // SM 下限策略：L2/ws >= sm_min/2 时拉升到 sm_min（允许一定 cache 压力换取种群多样性）
-            //   ch150: L2/ws=70, sm_min=128 → 70 >= 64 → 拉升到 128 ✓（多样性优先）
-            //   pcb442: L2/ws=8, sm_min=128 → 8 < 64 → 不拉升 ✓（避免 thrashing）
+            // SM floor policy: if L2/ws >= sm_min/2, raise to sm_min (trade some cache pressure for diversity)
+            //   ch150: L2/ws=70, sm_min=128 -> 70 >= 64 -> raise to 128 (diversity first)
+            //   pcb442: L2/ws=8, sm_min=128 -> 8 < 64 -> do not raise (avoid thrashing)
             
             size_t ws = prob.working_set_bytes();
             if (ws > 0) {
@@ -671,26 +675,26 @@ SolveResult<typename Problem::Sol> solve(Problem& prob, const SolverConfig& cfg,
             }
         }
         
-        // 向下取整到 2 的幂（warp 对齐、归约友好、islands 整除）
+        // Round down to power of 2 (warp alignment, reduction-friendly, island divisibility)
         {
             int p = 1;
             while (p * 2 <= pop_size) p *= 2;
             pop_size = p;
         }
         
-        // 绝对下限：32（保证至少 1 岛 × 32 解的最小可用规模）
+        // Absolute floor: 32 (at least 1 island x 32 individuals)
         if (pop_size < 32) pop_size = 32;
     }
     
-    // 自适应岛屿数量（num_islands=0 时启用）
+    // Adaptive island count (when num_islands=0)
     int num_islands = cfg.num_islands;
     if (num_islands == 0) {
-        // 策略：每岛至少 32 个个体，最多 8 岛
-        // pop < 64   → 1 岛（纯 HC）
-        // 64-127     → 2 岛
-        // 128-255    → 4 岛
-        // 256-511    → 8 岛
-        // >= 512     → 8 岛
+        // Policy: at least 32 individuals per island, at most 8 islands
+        // pop < 64   -> 1 island (pure HC)
+        // 64-127     -> 2 islands
+        // 128-255    -> 4 islands
+        // 256-511    -> 8 islands
+        // >= 512     -> 8 islands
         if (pop_size < 64) {
             num_islands = 1;
         } else if (pop_size < 128) {
@@ -747,8 +751,8 @@ SolveResult<typename Problem::Sol> solve(Problem& prob, const SolverConfig& cfg,
         printf(" seed=%u\n", cfg.seed);
     }
     
-    // --- 1. 分配 ---
-    // crossover 栈需求（thread 0 在 local memory 中构造 child）
+    // --- 1. Allocation ---
+    // Crossover stack needs (thread 0 builds child in local memory)
     if (use_crossover) {
         size_t ox_arrays = Sol::DIM1 * Sol::DIM2 * sizeof(bool)
                          + 512 * sizeof(bool)
@@ -759,7 +763,7 @@ SolveResult<typename Problem::Sol> solve(Problem& prob, const SolverConfig& cfg,
     
     ObjConfig oc = make_obj_config(pcfg);
     
-    // --- 1b. 采样择优初始化 ---
+    // --- 1b. Sample-and-select initialization ---
     int oversample = cfg.init_oversample;
     if (oversample < 1) oversample = 1;
     int candidate_size = pop_size * oversample;
@@ -768,13 +772,13 @@ SolveResult<typename Problem::Sol> solve(Problem& prob, const SolverConfig& cfg,
     Population<Sol> pop;
     
     if (do_oversample) {
-        // 生成 K × pop_size 个候选解
+        // Generate K x pop_size candidate solutions
         Population<Sol> candidates;
         candidates.allocate(candidate_size, block_threads);
         candidates.init_rng(cfg.seed, 256);
         candidates.init_population(pcfg, 256);
         
-        // 启发式初始解注入（替换候选池尾部）
+        // Inject heuristic initial solutions (replace tail of candidate pool)
         if (pcfg.encoding == EncodingType::Permutation) {
             HeuristicMatrix heur_mats[8];
             int num_mats = prob.heuristic_matrices(heur_mats, 8);
@@ -797,7 +801,7 @@ SolveResult<typename Problem::Sol> solve(Problem& prob, const SolverConfig& cfg,
             }
         }
         
-        // GPU 上评估所有候选
+        // Evaluate all candidates on GPU
         {
             size_t eval_smem = prob.shared_mem_bytes();
             if (eval_smem > 48 * 1024) {
@@ -810,12 +814,12 @@ SolveResult<typename Problem::Sol> solve(Problem& prob, const SolverConfig& cfg,
             CUDA_CHECK(cudaDeviceSynchronize());
         }
         
-        // 下载所有候选解到 host
+        // Download all candidates to host
         Sol* h_candidates = new Sol[candidate_size];
         CUDA_CHECK(cudaMemcpy(h_candidates, candidates.d_solutions,
                               sizeof(Sol) * candidate_size, cudaMemcpyDeviceToHost));
         
-        // 构建候选信息
+        // Build candidate metadata
         std::vector<init_sel::CandidateInfo> cand_info(candidate_size);
         for (int i = 0; i < candidate_size; i++) {
             cand_info[i].idx = i;
@@ -829,16 +833,16 @@ SolveResult<typename Problem::Sol> solve(Problem& prob, const SolverConfig& cfg,
             }
         }
         
-        // 计算目标重要性
+        // Compute objective importance
         float importance[MAX_OBJ];
         compute_importance(oc, importance);
         
-        // 纯随机保底名额
+        // Pure-random quota (floor)
         int num_random = (int)(pop_size * cfg.init_random_ratio);
         if (num_random < 1) num_random = 1;
         if (num_random > pop_size / 2) num_random = pop_size / 2;
         
-        // 选择
+        // Selection
         std::vector<int> selected;
         if (oc.num_obj == 1) {
             selected = init_sel::top_n_select(cand_info, pop_size, num_random);
@@ -847,13 +851,13 @@ SolveResult<typename Problem::Sol> solve(Problem& prob, const SolverConfig& cfg,
                                                pop_size, num_random);
         }
         
-        // 分配最终种群
+        // Allocate final population
         pop.allocate(pop_size, block_threads);
-        // 复用候选的 RNG 状态（取前 pop_size 份）
-        // 重新初始化 RNG 更安全（候选的 RNG 状态已被使用过）
+        // Could reuse candidate RNG state (first pop_size entries)
+        // Re-init RNG is safer (candidate RNGs were already used)
         pop.init_rng(cfg.seed + 1, 256);
         
-        // 上传选中的解到种群前部
+        // Upload selected solutions to front of population
         int num_selected = (int)selected.size();
         for (int i = 0; i < num_selected; i++) {
             CUDA_CHECK(cudaMemcpy(pop.d_solutions + i,
@@ -861,8 +865,8 @@ SolveResult<typename Problem::Sol> solve(Problem& prob, const SolverConfig& cfg,
                                   sizeof(Sol), cudaMemcpyDeviceToDevice));
         }
         
-        // 剩余位置（纯随机保底）：从候选中随机选未被选中的
-        // 简单做法：直接用候选中排在后面的未选中解
+        // Remaining slots (pure-random floor): fill from unselected candidates
+        // Simple approach: use later candidates that were not selected
         if (num_selected < pop_size) {
             int fill_idx = num_selected;
             for (int i = 0; i < candidate_size && fill_idx < pop_size; i++) {
@@ -876,7 +880,7 @@ SolveResult<typename Problem::Sol> solve(Problem& prob, const SolverConfig& cfg,
         }
         
         if (cfg.verbose) {
-            // 统计选中解的平均质量 vs 全部候选的平均质量
+            // Compare mean quality of selected vs all candidates
             float sel_avg = 0.0f, all_avg = 0.0f;
             for (int i = 0; i < candidate_size; i++) all_avg += cand_info[i].objs[0];
             all_avg /= candidate_size;
@@ -893,20 +897,20 @@ SolveResult<typename Problem::Sol> solve(Problem& prob, const SolverConfig& cfg,
         }
         
         delete[] h_candidates;
-        // candidates 析构自动释放 GPU 内存
+        // candidates dtor frees GPU memory
     } else {
-        // oversample=1：纯随机，和之前一样
+        // oversample=1: pure random, same as before
         pop.allocate(pop_size, block_threads);
         pop.init_rng(cfg.seed, 256);
         pop.init_population(pcfg, 256);
     }
     
-    // --- 1c. 注入用户提供的初始解 ---
-    // 策略：校验合法性 → 合法解替换种群尾部（保留 oversample 选出的好解在前部）
+    // --- 1c. Inject user-provided initial solutions ---
+    // Policy: validate -> valid solutions replace population tail (keep oversample winners at front)
     if (init_solutions && num_init_solutions > 0) {
-        int max_inject = pop_size / 16;  // 最多占种群 ~6%（保留多样性）
+        int max_inject = pop_size / 16;  // at most ~6% of population (diversity)
         if (max_inject < 1) max_inject = 1;
-        if (max_inject > 16) max_inject = 16;  // 绝对上限
+        if (max_inject > 16) max_inject = 16;  // hard cap
         int want = num_init_solutions;
         if (want > max_inject) want = max_inject;
         
@@ -915,17 +919,17 @@ SolveResult<typename Problem::Sol> solve(Problem& prob, const SolverConfig& cfg,
             const Sol& s = init_solutions[i];
             bool valid = true;
             
-            // 基本维度检查
+            // Basic dimension checks
             for (int r = 0; r < pcfg.dim1 && valid; r++) {
                 if (s.dim2_sizes[r] < 0 || s.dim2_sizes[r] > Sol::DIM2) {
                     valid = false; break;
                 }
             }
             
-            // 编码特定检查
+            // Encoding-specific checks
             if (valid && pcfg.encoding == EncodingType::Permutation) {
                 if (pcfg.row_mode == RowMode::Partition) {
-                    // 分区模式：跨行元素不重复，总数 = total_elements
+                    // Partition mode: no duplicate elements across rows; total = total_elements
                     bool seen[512] = {};
                     int total = 0;
                     for (int r = 0; r < pcfg.dim1 && valid; r++) {
@@ -939,7 +943,7 @@ SolveResult<typename Problem::Sol> solve(Problem& prob, const SolverConfig& cfg,
                     }
                     if (valid && total != pcfg.total_elements) valid = false;
                 } else if (pcfg.perm_repeat_count > 1) {
-                    // 多重集排列：每行中每个值 [0, N) 恰好出现 repeat_count 次
+                    // Multiset permutation: each value in [0, N) appears exactly repeat_count times per row
                     int R = pcfg.perm_repeat_count;
                     int N = pcfg.dim2_default / R;
                     for (int r = 0; r < pcfg.dim1 && valid; r++) {
@@ -956,7 +960,7 @@ SolveResult<typename Problem::Sol> solve(Problem& prob, const SolverConfig& cfg,
                         }
                     }
                 } else {
-                    // 标准排列：每行元素 [0, dim2_default) 不重复
+                    // Standard permutation: each row is a permutation of [0, dim2_default)
                     for (int r = 0; r < pcfg.dim1 && valid; r++) {
                         if (s.dim2_sizes[r] != pcfg.dim2_default) { valid = false; break; }
                         bool seen[512] = {};
@@ -977,7 +981,7 @@ SolveResult<typename Problem::Sol> solve(Problem& prob, const SolverConfig& cfg,
             }
             
             if (valid) {
-                // 注入到种群尾部（从后往前填，保留前部的 oversample 好解）
+                // Inject at population tail (fill from end; keep oversample winners at front)
                 int target_idx = pop_size - 1 - injected;
                 CUDA_CHECK(cudaMemcpy(pop.d_solutions + target_idx, &s,
                                       sizeof(Sol), cudaMemcpyHostToDevice));
@@ -992,7 +996,7 @@ SolveResult<typename Problem::Sol> solve(Problem& prob, const SolverConfig& cfg,
         }
     }
     
-    // v3.0: 构建序列注册表（替代旧的 d_op_weights）
+    // v3.0: Build sequence registry (replaces old d_op_weights)
     ProblemProfile profile = classify_problem(pcfg);
     SeqRegistry seq_reg = build_seq_registry(profile);
 
@@ -1000,7 +1004,7 @@ SolveResult<typename Problem::Sol> solve(Problem& prob, const SolverConfig& cfg,
         custom_registry_fn(seq_reg);
     }
     
-    // v3.1: K 步配置（多步执行）
+    // v3.1: K-step config (multi-step execution)
     KStepConfig kstep = build_kstep_config();
     
     if (cfg.verbose) {
@@ -1022,13 +1026,13 @@ SolveResult<typename Problem::Sol> solve(Problem& prob, const SolverConfig& cfg,
     Sol* d_global_best = nullptr;
     if (use_sa) {
         CUDA_CHECK(cudaMalloc(&d_global_best, sizeof(Sol)));
-        // v5.0 方案 B3: 导出 d_global_best 指针供外部读取（可选）
+        // v5.0 plan B3: expose d_global_best pointer for external read (optional)
         if (d_global_best_out != nullptr) {
             *d_global_best_out = d_global_best;
         }
     }
     
-    // AOS: 分配全局内存统计缓冲区（序列级粒度）
+    // AOS: allocate global stats buffer (per-sequence granularity)
     AOSStats* d_aos_stats = nullptr;
     AOSStats* h_aos_stats = nullptr;
     
@@ -1037,8 +1041,8 @@ SolveResult<typename Problem::Sol> solve(Problem& prob, const SolverConfig& cfg,
         h_aos_stats = new AOSStats[pop_size];
     }
     
-    // --- 关系矩阵（G/O）：用于 SEQ_LNS_GUIDED_REBUILD ---
-    // 仅 Permutation 编码 + 有 GUIDED_REBUILD 序列时启用
+    // --- Relation matrices (G/O) for SEQ_LNS_GUIDED_REBUILD ---
+    // Enabled only for Permutation encoding when GUIDED_REBUILD is in registry
     bool use_relation_matrix = false;
     RelationMatrix rel_mat = {};
     int rel_N = 0;
@@ -1051,11 +1055,11 @@ SolveResult<typename Problem::Sol> solve(Problem& prob, const SolverConfig& cfg,
         }
     }
     if (use_relation_matrix) {
-        // N = dim2_default（排列中的元素数）
+        // N = dim2_default (number of elements in permutation)
         rel_N = pcfg.dim2_default;
         if (rel_N > 0) {
             rel_mat = relation_matrix_create(rel_N, 0.95f);
-            // 让用户提供先验知识初始化 G/O（可选，默认不做任何事）
+            // Optional prior init of G/O via user hook (default: no-op)
             prob.init_relation_matrix(rel_mat.h_G, rel_mat.h_O, rel_N);
             relation_matrix_upload(rel_mat);
         } else {
@@ -1063,11 +1067,11 @@ SolveResult<typename Problem::Sol> solve(Problem& prob, const SolverConfig& cfg,
         }
     }
     
-    // grid = pop_size（每个 block 处理一个解）
+    // grid = pop_size (one block per solution)
     int grid = pop_size;
     
-    // --- 2. 初始评估 ---
-    // 采样择优路径中已经评估过候选，但最终种群可能包含随机解，需要重新评估
+    // --- 2. Initial evaluation ---
+    // Sample-select path already evaluated candidates; final pop may still have randoms — re-evaluate
     {
         size_t eval_smem = prob.shared_mem_bytes();
         if (eval_smem > 48 * 1024) {
@@ -1086,9 +1090,9 @@ SolveResult<typename Problem::Sol> solve(Problem& prob, const SolverConfig& cfg,
         CUDA_CHECK(cudaMemcpy(d_global_best, pop.d_solutions + idx, sizeof(Sol), cudaMemcpyDeviceToDevice));
     }
     
-    // --- 3. 主循环 ---
-    // batch 大小决定了 AOS/关系矩阵/收敛检测的更新频率
-    // 需要平衡：太小 → 同步开销大，太大 → 反应迟钝
+    // --- 3. Main loop ---
+    // Batch size sets update cadence for AOS / relation matrix / convergence checks
+    // Balance: too small -> high sync overhead; too large -> slow to react
     int batch;
     if (use_islands)
         batch = cfg.migrate_interval;
@@ -1097,7 +1101,7 @@ SolveResult<typename Problem::Sol> solve(Problem& prob, const SolverConfig& cfg,
     else
         batch = cfg.max_gen;
     
-    // 需要定期更新的功能：强制 batch ≤ 200
+    // Features needing periodic updates: force batch <= 200
     if (use_relation_matrix || use_aos || use_time_limit || use_stagnation) {
         if (batch > 200) batch = 200;
     }
@@ -1106,11 +1110,11 @@ SolveResult<typename Problem::Sol> solve(Problem& prob, const SolverConfig& cfg,
     int migrate_round = 0;
     StopReason stop_reason = StopReason::MaxGen;
     
-    // 收敛检测状态
+    // Convergence-check state
     float prev_best_scalar = 1e30f;
     int stagnation_count = 0;
     
-    // --- EvolveParams: 可变参数（device memory）---
+    // --- EvolveParams: mutable fields (device memory) ---
     EvolveParams h_params;
     h_params.temp_start = 0.0f;
     h_params.gens_per_batch = batch;
@@ -1133,7 +1137,7 @@ SolveResult<typename Problem::Sol> solve(Problem& prob, const SolverConfig& cfg,
         CUDA_CHECK(cudaStreamCreate(&stream));
     }
     
-    // lambda: 在 stream 上发射一个 batch 的 GPU kernel 序列
+    // Lambda: launch one batch of GPU kernels on stream
     auto launch_batch_kernels = [&](cudaStream_t s) {
         evolve_block_kernel<<<grid, block_threads, total_smem, s>>>(
             prob, pop.d_solutions, pop_size,
@@ -1168,7 +1172,7 @@ SolveResult<typename Problem::Sol> solve(Problem& prob, const SolverConfig& cfg,
         }
     };
     
-    // 捕获 CUDA Graph（首次）
+    // Capture CUDA Graph (first time)
     if (use_graph) {
         CUDA_CHECK(cudaStreamBeginCapture(stream, cudaStreamCaptureModeThreadLocal));
         launch_batch_kernels(stream);
@@ -1187,7 +1191,7 @@ SolveResult<typename Problem::Sol> solve(Problem& prob, const SolverConfig& cfg,
     CUDA_CHECK(cudaEventCreate(&t_stop));
     CUDA_CHECK(cudaEventRecord(t_start));
     
-    // 时间感知 AOS：窗口累积器
+    // Time-aware AOS: window accumulators
     int win_seq_usage[MAX_SEQ] = {};
     int win_seq_improve[MAX_SEQ] = {};
     int win_k_usage[MAX_K] = {};
@@ -1195,9 +1199,15 @@ SolveResult<typename Problem::Sol> solve(Problem& prob, const SolverConfig& cfg,
     int batch_count = 0;
     const int aos_interval = (cfg.aos_update_interval > 0) ? cfg.aos_update_interval : 1;
     
-    // v4.0: 约束导向 + 分层搜索
+    // v4.0: constraint-directed + phased search (require AOS enabled)
     const bool use_constraint_directed = cfg.use_constraint_directed && use_aos;
     const bool use_phased_search = cfg.use_phased_search && use_aos;
+    if (cfg.verbose) {
+        if (cfg.use_constraint_directed && !use_aos)
+            printf("  [WARN] constraint_directed requires AOS, disabled\n");
+        if (cfg.use_phased_search && !use_aos)
+            printf("  [WARN] phased_search requires AOS, disabled\n");
+    }
     float base_max_w[MAX_SEQ];
     for (int i = 0; i < seq_reg.count; i++) base_max_w[i] = seq_reg.max_w[i];
     
@@ -1217,7 +1227,7 @@ SolveResult<typename Problem::Sol> solve(Problem& prob, const SolverConfig& cfg,
         
         float temp = use_sa ? cfg.sa_temp_init * powf(cfg.sa_alpha, (float)gen_done) : 0.0f;
         
-        // 更新 device 端可变参数
+        // Update mutable device parameters
         h_params.temp_start = temp;
         h_params.gens_per_batch = gens;
         h_params.seq_reg = seq_reg;
@@ -1225,7 +1235,7 @@ SolveResult<typename Problem::Sol> solve(Problem& prob, const SolverConfig& cfg,
         h_params.migrate_round = migrate_round;
         CUDA_CHECK(cudaMemcpy(d_params, &h_params, sizeof(EvolveParams), cudaMemcpyHostToDevice));
         
-        // 发射 GPU kernel 序列
+        // Launch GPU kernel sequence
         if (use_graph) {
             CUDA_CHECK(cudaGraphLaunch(graph_exec, stream));
             CUDA_CHECK(cudaStreamSynchronize(stream));
@@ -1233,8 +1243,8 @@ SolveResult<typename Problem::Sol> solve(Problem& prob, const SolverConfig& cfg,
             launch_batch_kernels(nullptr);
         }
         
-        // v5.0 方案 B3: 被动注入检查（在 Graph 之外单独调用）
-        // 注意：必须在 Graph 之外，因为 inject_buf 内容是动态变化的
+        // v5.0 plan B3: passive injection check (outside Graph)
+        // Must be outside Graph: inject_buf content changes dynamically
         if (inject_buf != nullptr && use_islands) {
             inject_check_kernel<<<1, 1>>>(pop.d_solutions, pop_size,
                                            island_size, inject_buf, oc);
@@ -1245,14 +1255,14 @@ SolveResult<typename Problem::Sol> solve(Problem& prob, const SolverConfig& cfg,
         if (use_islands) migrate_round++;
         batch_count++;
         
-        // AOS: 两层权重更新（EMA）+ 停滞检测
+        // AOS: two-level weight update (EMA) + stagnation detection
         if (use_aos && (batch_count % aos_interval == 0)) {
             CUDA_CHECK(cudaDeviceSynchronize());
             CUDA_CHECK(cudaMemcpy(h_aos_stats, d_aos_stats,
                                   sizeof(AOSStats) * pop_size,
                                   cudaMemcpyDeviceToHost));
             
-            // --- 聚合当前 batch 的统计到窗口累积器 ---
+            // --- Fold current batch stats into window accumulators ---
             for (int b = 0; b < pop_size; b++) {
                 for (int i = 0; i < seq_reg.count; i++) {
                     win_seq_usage[i] += h_aos_stats[b].usage[i];
@@ -1266,7 +1276,7 @@ SolveResult<typename Problem::Sol> solve(Problem& prob, const SolverConfig& cfg,
             
             constexpr float AOS_ALPHA = 0.6f;
             
-            // --- v4.0: 约束导向 — 计算种群约束违反率 ---
+            // --- v4.0: constraint-directed — population infeasibility ratio ---
             float penalty_ratio = 0.0f;
             if (use_constraint_directed) {
                 Sol* h_pop_snap = new Sol[pop_size];
@@ -1280,7 +1290,7 @@ SolveResult<typename Problem::Sol> solve(Problem& prob, const SolverConfig& cfg,
                 delete[] h_pop_snap;
             }
             
-            // --- v4.0: 分层搜索 — 计算当前阶段的 floor/cap 调整 ---
+            // --- v4.0: phased search — phase floor/cap multipliers ---
             float phase_floor_mult = 1.0f;
             float phase_cap_mult   = 1.0f;
             if (use_phased_search) {
@@ -1296,18 +1306,18 @@ SolveResult<typename Problem::Sol> solve(Problem& prob, const SolverConfig& cfg,
                     progress = (float)gen_done / (float)cfg.max_gen;
                 }
                 if (progress < cfg.phase_explore_end) {
-                    phase_floor_mult = 1.5f;   // 探索期：抬高 floor → 更均匀
-                    phase_cap_mult   = 0.7f;   // 探索期：压低 cap → 防止过早集中
+                    phase_floor_mult = 1.5f;   // explore: raise floor -> more uniform
+                    phase_cap_mult   = 0.7f;   // explore: lower cap -> avoid early concentration
                 } else if (progress >= cfg.phase_refine_start) {
-                    phase_floor_mult = 0.5f;   // 精细期：降低 floor → 允许弱算子退出
-                    phase_cap_mult   = 1.5f;   // 精细期：抬高 cap → 集中利用强算子
+                    phase_floor_mult = 0.5f;   // refine: lower floor -> weak ops can fade
+                    phase_cap_mult   = 1.5f;   // refine: raise cap -> exploit strong ops
                 }
             }
             
-            // --- 第二层：算子权重更新（EMA） ---
+            // --- Layer 2: operator weights (EMA) ---
             {
                 float new_w[MAX_SEQ];
-                // 延迟归一化：EMA 更新 + 边界约束（不归一化）
+                // Deferred normalization: EMA + bounds (no renormalize to sum 1)
                 for (int i = 0; i < seq_reg.count; i++) {
                     float signal = (win_seq_usage[i] > 0)
                         ? (float)win_seq_improve[i] / (float)win_seq_usage[i]
@@ -1322,7 +1332,7 @@ SolveResult<typename Problem::Sol> solve(Problem& prob, const SolverConfig& cfg,
                 float floor_val = base_floor * phase_floor_mult;
                 float global_cap = cfg.aos_weight_cap * phase_cap_mult;
                 
-                // --- v4.0: 约束导向 — boost 跨行/行级算子权重 + 放宽 cap ---
+                // --- v4.0: constraint-directed — boost cross-row/row-level weights + relax cap ---
                 if (use_constraint_directed && penalty_ratio > 0.1f) {
                     float boost = 1.0f + (penalty_ratio - 0.1f) / 0.9f
                                   * (cfg.constraint_boost_max - 1.0f);
@@ -1339,7 +1349,7 @@ SolveResult<typename Problem::Sol> solve(Problem& prob, const SolverConfig& cfg,
                         seq_reg.max_w[i] = base_max_w[i];
                 }
                 
-                // 应用边界约束（不归一化）
+                // Apply bounds (no renormalize to sum 1)
                 float sum = 0.0f;
                 for (int i = 0; i < seq_reg.count; i++) {
                     float cap_val = (seq_reg.max_w[i] > 0.0f) ? seq_reg.max_w[i] : global_cap;
@@ -1347,11 +1357,11 @@ SolveResult<typename Problem::Sol> solve(Problem& prob, const SolverConfig& cfg,
                     sum += seq_reg.weights[i];
                 }
                 
-                // 更新缓存的权重和
+                // Update cached weight sum
                 seq_reg.weights_sum = sum;
             }
             
-            // --- 第一层：K 步数权重更新（EMA + 延迟归一化） ---
+            // --- Layer 1: K-step weights (EMA + deferred normalize) ---
             {
                 float new_w[MAX_K];
                 for (int i = 0; i < MAX_K; i++) {
@@ -1362,14 +1372,14 @@ SolveResult<typename Problem::Sol> solve(Problem& prob, const SolverConfig& cfg,
                              + (1.0f - AOS_ALPHA) * (rate + AOS_WEIGHT_FLOOR);
                 }
                 
-                // 应用边界约束（不归一化）
+                // Apply bounds (no renormalize to sum 1)
                 float floor_val = cfg.aos_weight_floor;
                 float cap_val = 0.95f;
                 for (int i = 0; i < MAX_K; i++) {
                     kstep.weights[i] = fmaxf(floor_val, fminf(cap_val, new_w[i]));
                 }
                 
-                // K 步权重归一化（保持原有行为，因为 K 步选择不使用轮盘赌）
+                // Renormalize K-step weights (legacy behavior; K choice is not roulette)
                 float sum = 0.0f;
                 for (int i = 0; i < MAX_K; i++) sum += kstep.weights[i];
                 if (sum > 0.0f) {
@@ -1378,7 +1388,7 @@ SolveResult<typename Problem::Sol> solve(Problem& prob, const SolverConfig& cfg,
                 }
             }
             
-            // --- Debug: 前 5 个 batch 打印统计 ---
+            // --- Debug: print stats for first 5 batches ---
             if (cfg.verbose && gen_done <= batch * 5) {
                 fprintf(stderr, "  [AOS batch g=%d] usage:", gen_done);
                 for (int i = 0; i < seq_reg.count; i++) fprintf(stderr, " %d", win_seq_usage[i]);
@@ -1397,7 +1407,7 @@ SolveResult<typename Problem::Sol> solve(Problem& prob, const SolverConfig& cfg,
             }
             
             
-            // --- 停滞检测 ---
+            // --- Stagnation detection ---
             {
                 int total_improve_all = 0;
                 for (int i = 0; i < seq_reg.count; i++)
@@ -1417,25 +1427,25 @@ SolveResult<typename Problem::Sol> solve(Problem& prob, const SolverConfig& cfg,
                 }
             }
             
-            // --- 清零窗口累积器 ---
+            // --- Clear window accumulators ---
             memset(win_seq_usage, 0, sizeof(win_seq_usage));
             memset(win_seq_improve, 0, sizeof(win_seq_improve));
             memset(win_k_usage, 0, sizeof(win_k_usage));
             memset(win_k_improve, 0, sizeof(win_k_improve));
         }
         
-        // --- 关系矩阵更新（每个 batch 间隙，从种群 top-K 解统计）---
-        // 多个好解贡献 G/O 信号，加速矩阵信息积累
+        // --- Relation matrix update (between batches, from population top-K) ---
+        // Several good solutions contribute G/O signal to build the matrix faster
         if (use_relation_matrix) {
             if (!use_aos) {
                 CUDA_CHECK(cudaDeviceSynchronize());
             }
             
-            // 下载整个种群的目标值，找 top-K
+            // Download population objectives and find top-K
             constexpr int REL_TOP_K = 4;
             int top_indices[REL_TOP_K];
             {
-                // 简单方法：下载所有解的 scalar 目标，host 端排序取 top-K
+                // Simple approach: scalar objectives on host, pick top-K minima
                 float* h_scores = new float[pop_size];
                 Sol* h_pop_ptr = new Sol[pop_size];
                 CUDA_CHECK(cudaMemcpy(h_pop_ptr, pop.d_solutions,
@@ -1444,16 +1454,16 @@ SolveResult<typename Problem::Sol> solve(Problem& prob, const SolverConfig& cfg,
                     h_scores[b] = scalar_objective(h_pop_ptr[b], oc);
                     if (h_pop_ptr[b].penalty > 0.0f) h_scores[b] = 1e30f;
                 }
-                // 找 top-K 最小值
+                // Find top-K smallest scores
                 for (int k = 0; k < REL_TOP_K && k < pop_size; k++) {
                     int mi = 0;
                     for (int b = 1; b < pop_size; b++) {
                         if (h_scores[b] < h_scores[mi]) mi = b;
                     }
                     top_indices[k] = mi;
-                    h_scores[mi] = 1e30f;  // 标记已选
+                    h_scores[mi] = 1e30f;  // mark as taken
                 }
-                // 从 top-K 解更新 G/O
+                // Update G/O from top-K solutions
                 int actual_k = (pop_size < REL_TOP_K) ? pop_size : REL_TOP_K;
                 for (int k = 0; k < actual_k; k++) {
                     relation_matrix_update(rel_mat, h_pop_ptr[top_indices[k]], pcfg.dim1);
@@ -1465,9 +1475,9 @@ SolveResult<typename Problem::Sol> solve(Problem& prob, const SolverConfig& cfg,
             relation_matrix_upload(rel_mat);
         }
         
-        // 交叉 / 迁移 / 精英注入 已在 launch_batch_kernels 中统一发射
+        // Crossover / migrate / elite inject already launched in launch_batch_kernels
         
-        // --- 时间限制检查 ---
+        // --- Time limit check ---
         if (use_time_limit) {
             CUDA_CHECK(cudaEventRecord(t_stop));
             CUDA_CHECK(cudaEventSynchronize(t_stop));
@@ -1481,7 +1491,7 @@ SolveResult<typename Problem::Sol> solve(Problem& prob, const SolverConfig& cfg,
             }
         }
         
-        // --- 收敛检测 + reheat ---
+        // --- Convergence check + reheat ---
         if (use_stagnation) {
             find_best_kernel<<<1, 1>>>(pop.d_solutions, pop_size, oc, d_best_idx);
             CUDA_CHECK(cudaDeviceSynchronize());
@@ -1500,26 +1510,25 @@ SolveResult<typename Problem::Sol> solve(Problem& prob, const SolverConfig& cfg,
             
             if (stagnation_count >= cfg.stagnation_limit) {
                 if (use_sa && cfg.reheat_ratio > 0.0f) {
-                    // reheat：将温度恢复到初始温度的 reheat_ratio 倍
-                    // 通过回退 gen_done 实现（温度 = init * alpha^gen_done）
+                    // Reheat: restore temperature to reheat_ratio * initial
+                    // Implemented by rolling back gen_done (temp = init * alpha^gen_done)
                     float target_temp = cfg.sa_temp_init * cfg.reheat_ratio;
                     int reheat_gen = (int)(logf(target_temp / cfg.sa_temp_init) / logf(cfg.sa_alpha));
                     if (reheat_gen < 0) reheat_gen = 0;
-                    // 不真正回退 gen_done（会影响终止条件），而是记录一个 temp_offset
-                    // 简化做法：直接在下一轮 batch 中 temp 会自然从 reheat 后的值开始
-                    // 这里通过修改 gen_done 的等效温度来实现
+                    // Simplified: instead of tracking a separate temp_offset, gen_done itself is reassigned below,
+                    // so the next batch's temperature follows from the adjusted gen_done
                     if (cfg.verbose) {
                         float cur_temp = cfg.sa_temp_init * powf(cfg.sa_alpha, (float)gen_done);
                         printf("  [REHEAT] stagnation=%d at gen %d, temp %.4f → %.4f\n",
                                cfg.stagnation_limit, gen_done, cur_temp, target_temp);
                     }
-                    // 将 gen_done 回退到对应 target_temp 的位置（但不超过已完成代数的一半）
+                    // Roll gen_done back to match target_temp (but not below half of completed gens)
                     int min_gen = gen_done / 2;
                     if (reheat_gen < min_gen) reheat_gen = min_gen;
                     gen_done = reheat_gen;
                     stagnation_count = 0;
                 } else {
-                    // 无 SA 时，收敛检测触发 → 提前终止
+                    // No SA: stagnation triggers early stop
                     stop_reason = StopReason::Stagnation;
                     if (cfg.verbose) printf("  [STOP] stagnation=%d at gen %d, no SA to reheat\n",
                                              cfg.stagnation_limit, gen_done);
@@ -1528,7 +1537,7 @@ SolveResult<typename Problem::Sol> solve(Problem& prob, const SolverConfig& cfg,
             }
         }
         
-        // 打印进度
+        // Progress printout
         if (cfg.verbose && gen_done % cfg.print_every == 0) {
             if (!use_stagnation) {
                 find_best_kernel<<<1, 1>>>(pop.d_solutions, pop_size, oc, d_best_idx);
@@ -1549,7 +1558,7 @@ SolveResult<typename Problem::Sol> solve(Problem& prob, const SolverConfig& cfg,
     float elapsed_ms = 0;
     CUDA_CHECK(cudaEventElapsedTime(&elapsed_ms, t_start, t_stop));
     
-    // --- 4. 最终结果 ---
+    // --- 4. Final result ---
     Sol best;
     if (use_sa) {
         CUDA_CHECK(cudaDeviceSynchronize());
@@ -1582,7 +1591,7 @@ SolveResult<typename Problem::Sol> solve(Problem& prob, const SolverConfig& cfg,
         }
     }
     
-    // AOS: 打印最终两层权重
+    // AOS: print final two-level weights
     if (use_aos && cfg.verbose) {
         printf("  AOS K-step weights: K1=%.3f K2=%.3f K3=%.3f\n",
                kstep.weights[0], kstep.weights[1], kstep.weights[2]);
@@ -1592,7 +1601,7 @@ SolveResult<typename Problem::Sol> solve(Problem& prob, const SolverConfig& cfg,
         printf("\n");
     }
     
-    // 填充返回值
+    // Fill return struct
     result.best_solution = best;
     result.elapsed_ms = elapsed_ms;
     result.generations = gen_done;
diff --git a/prototype/core/types.cuh b/prototype/core/types.cuh
index a29934d..5547dff 100644
--- a/prototype/core/types.cuh
+++ b/prototype/core/types.cuh
@@ -1,38 +1,39 @@
 /**
- * types.cuh - 核心类型定义
+ * types.cuh - Core type definitions
  * 
- * 包含：编码类型、Solution 模板、ProblemConfig/SolverConfig、
- *       SeqRegistry（AOS 序列级权重）、KStepConfig（多步执行）、
- *       RelationMatrix（G/O 关系矩阵）、ProblemBase（CRTP 基类）
+ * Contains: encoding types, Solution template, ProblemConfig/SolverConfig,
+ *           SeqRegistry (AOS sequence-level weights), KStepConfig (multi-step execution),
+ *           RelationMatrix (G/O relation matrix), ProblemBase (CRTP base class)
  */
 
 #pragma once
 #include <cstdio>
+#include "cuda_utils.cuh"
 
 // ============================================================
-// 编译时常量
+// Compile-time constants
 // ============================================================
-constexpr int MAX_OBJ = 4;    // 最多 4 个目标（16字节，不值得模板化）
-constexpr int MAX_SEQ = 32;   // 最大序列数（内置 ~16 + 自定义算子 ≤8，留余量）
-constexpr int MAX_K   = 3;    // 多步执行的最大步数（K=1,2,3）
-// AOS 权重上下限（归一化后）
-constexpr float AOS_WEIGHT_FLOOR = 0.05f;  // 最低权重保底（确保充分探索）
-constexpr float AOS_WEIGHT_CAP   = 0.35f;  // 最高权重上限（防止赢者通吃）
+constexpr int MAX_OBJ = 4;    // Max 4 objectives (16 bytes, not worth templatizing)
+constexpr int MAX_SEQ = 32;   // Max sequences (built-in ~16 + custom ops ≤8, with margin)
+constexpr int MAX_K   = 3;    // Max steps for multi-step execution (K=1,2,3)
+// AOS weight bounds
+constexpr float AOS_WEIGHT_FLOOR = 0.05f;  // Minimum weight floor (ensures sufficient exploration)
+constexpr float AOS_WEIGHT_CAP   = 0.35f;  // Maximum weight cap (prevents winner-take-all)
 
 // ============================================================
-// 枚举类型
+// Enum types
 // ============================================================
 
 enum class EncodingType {
-    Permutation,    // 排列：元素不重复
-    Binary,         // 0-1：flip 是主要算子
-    Integer         // 有界整数
+    Permutation,    // Permutation: elements are unique
+    Binary,         // 0-1: flip is the main operator
+    Integer         // Bounded integers
 };
 
 enum class RowMode {
-    Single,     // dim1=1，单行（TSP/QAP/Knapsack 等大部分问题）
-    Fixed,      // dim1>1，行等长不可变（JSP-Int/Schedule，禁止 SPLIT/MERGE）
-    Partition   // dim1>1，元素分区到各行，行长可变（CVRP/VRPTW）
+    Single,     // dim1=1, single row (most problems: TSP/QAP/Knapsack, etc.)
+    Fixed,      // dim1>1, equal row lengths fixed (JSP-Int/Schedule; SPLIT/MERGE disallowed)
+    Partition   // dim1>1, elements partitioned across rows, variable row lengths (CVRP/VRPTW)
 };
 
 enum class ObjDir {
@@ -40,241 +41,235 @@ enum class ObjDir {
     Maximize
 };
 
-// 多目标比较模式
+// Multi-objective comparison mode
 enum class CompareMode {
-    Weighted,       // 加权求和：sum(weight[i] * obj[i])，越小越好
-    Lexicographic   // 字典法：按优先级逐目标比较，前面的目标优先
+    Weighted,       // Weighted sum: sum(weight[i] * obj[i]), lower is better
+    Lexicographic   // Lexicographic: compare objectives by priority order
 };
 
 enum class MigrateStrategy {
-    Ring,       // 环形：各岛最优→邻岛最差（慢传播，高多样性）
-    TopN,       // 全局 Top-N 轮转分发（快传播，强收敛）
-    Hybrid      // 两者兼顾：Top-N 替换最差 + Ring 替换次差
+    Ring,       // Ring: each island's best → neighbor's worst (slow spread, high diversity)
+    TopN,       // Global Top-N round-robin (fast spread, strong convergence)
+    Hybrid      // Hybrid: Top-N replaces worst + Ring replaces second-worst
 };
 
-// v5.0: 多 GPU 协同 — 解注入模式
+// v5.0: multi-GPU coordination — solution injection mode
 enum class MultiGpuInjectMode {
-    OneIsland,   // 注入到 1 个岛的 worst（保守，保持多样性）
-    HalfIslands, // 注入到 num_islands/2 个岛的 worst（平衡）
-    AllIslands   // 注入到所有岛的 worst（激进，快速传播）
+    OneIsland,   // Inject into worst of 1 island (conservative, preserves diversity)
+    HalfIslands, // Inject into worst on num_islands/2 islands (balanced)
+    AllIslands   // Inject into worst on all islands (aggressive, fast spread)
 };
 
-// v5.0 方案 B3: InjectBuffer — 被动注入缓冲区
-// GPU 无感知，CPU 同步写入，GPU 在 migrate_kernel 中检查并应用
-// 设计要点：
-// 1. 使用同步 cudaMemcpy 避免与 solve() 的 stream/Graph 冲突
-// 2. 写入顺序：先 solution 后 flag，GPU 端原子读 flag 确保一致性
-// 3. 完全解耦：不依赖 solve() 的任何内部状态
+// v5.0 option B3: InjectBuffer — passive injection buffer
+// GPU has no awareness; CPU writes synchronously; a GPU-side kernel checks and applies it (NOTE(review): solve() launches inject_check_kernel; "migrate_kernel" may be stale — confirm)
+// Design notes:
+// 1. Use synchronous cudaMemcpy to avoid conflicts with solve() stream/Graph
+// 2. Write order: solution first, then flag; GPU atomic flag read ensures consistency
+// 3. Fully decoupled: does not depend on any internal state of solve()
 template<typename Sol>
 struct InjectBuffer {
-    Sol*  d_solution;    // Device 端解缓冲区（单个解）
-    int*  d_flag;        // Device 端标志位：0=空，1=有新解
+    Sol*  d_solution = nullptr;  // Device solution buffer (single solution)
+    int*  d_flag     = nullptr;  // Device flag: 0=empty, 1=new solution
+    int   owner_gpu  = 0;       // GPU that owns the allocation
     
-    // 分配 InjectBuffer（在指定 GPU 上）
+    // Allocate InjectBuffer (on given GPU)
     static InjectBuffer<Sol> allocate(int gpu_id) {
         InjectBuffer<Sol> buf;
+        buf.owner_gpu = gpu_id;
         
-        // 保存原设备，切换到目标 GPU
         int orig_device;
-        cudaGetDevice(&orig_device);
-        cudaSetDevice(gpu_id);
+        CUDA_CHECK(cudaGetDevice(&orig_device));
+        CUDA_CHECK(cudaSetDevice(gpu_id));
         
-        // 分配设备内存
-        cudaMalloc(&buf.d_solution, sizeof(Sol));
-        cudaMalloc(&buf.d_flag, sizeof(int));
+        CUDA_CHECK(cudaMalloc(&buf.d_solution, sizeof(Sol)));
+        CUDA_CHECK(cudaMalloc(&buf.d_flag, sizeof(int)));
         
-        // 初始化 flag 为 0
         int zero = 0;
-        cudaMemcpy(buf.d_flag, &zero, sizeof(int), cudaMemcpyHostToDevice);
+        CUDA_CHECK(cudaMemcpy(buf.d_flag, &zero, sizeof(int), cudaMemcpyHostToDevice));
         
-        // 恢复原设备
-        cudaSetDevice(orig_device);
+        CUDA_CHECK(cudaSetDevice(orig_device));
         
         return buf;
     }
     
-    // 释放 InjectBuffer
+    // Free InjectBuffer (switches to owner GPU before freeing)
     void destroy() {
-        if (d_solution) {
-            cudaFree(d_solution);
-            d_solution = nullptr;
-        }
-        if (d_flag) {
-            cudaFree(d_flag);
-            d_flag = nullptr;
+        if (d_solution || d_flag) {
+            int orig_device;
+            cudaGetDevice(&orig_device);
+            cudaSetDevice(owner_gpu);
+            if (d_solution) { cudaFree(d_solution); d_solution = nullptr; }
+            if (d_flag)     { cudaFree(d_flag);     d_flag = nullptr;     }
+            cudaSetDevice(orig_device);
         }
     }
     
-    // CPU 端写入新解
-    // 注意：使用同步 cudaMemcpy 避免与 solve() 的 stream 冲突
-    // 顺序：先写 solution，再写 flag（GPU 端原子读 flag 确保不会读到半写状态）
+    // CPU-side write of new solution
+    // Note: synchronous cudaMemcpy avoids stream conflicts with solve()
+    // Order: write solution first, then flag (GPU atomic flag read avoids half-written reads)
     void write_sync(const Sol& sol, int target_gpu) {
-        // 保存原设备，切换到目标 GPU
         int orig_device;
-        cudaGetDevice(&orig_device);
-        cudaSetDevice(target_gpu);
+        CUDA_CHECK(cudaGetDevice(&orig_device));
+        CUDA_CHECK(cudaSetDevice(target_gpu));
         
-        // 先写解数据
-        cudaMemcpy(d_solution, &sol, sizeof(Sol), cudaMemcpyHostToDevice);
-        // 再写标志位（确保解数据已写完）
+        CUDA_CHECK(cudaMemcpy(d_solution, &sol, sizeof(Sol), cudaMemcpyHostToDevice));
         int flag = 1;
-        cudaMemcpy(d_flag, &flag, sizeof(int), cudaMemcpyHostToDevice);
+        CUDA_CHECK(cudaMemcpy(d_flag, &flag, sizeof(int), cudaMemcpyHostToDevice));
         
-        // 恢复原设备
-        cudaSetDevice(orig_device);
+        CUDA_CHECK(cudaSetDevice(orig_device));
     }
 };
 
 
 // ============================================================
-// SeqID — 统一的 OperationSequence 编号
+// SeqID — unified OperationSequence IDs
 // ============================================================
-// 每个 SeqID 对应一种具体的搜索操作（原子或多步）
-// AOS 权重跟踪粒度 = SeqID（每个序列独立权重）
+// Each SeqID maps to one concrete search operation (atomic or multi-step)
+// AOS weight granularity = SeqID (independent weight per sequence)
 //
-// 命名规则：SEQ_{编码}_{操作名}
-// 跨编码共享的行级操作统一编号
+// Naming: SEQ_{encoding}_{operation}
+// Row-level ops shared across encodings use unified numbering
 
 namespace seq {
 
-// --- Permutation 行内（元素级）---
-constexpr int SEQ_PERM_SWAP           = 0;   // swap 两个位置
-constexpr int SEQ_PERM_REVERSE        = 1;   // 2-opt（反转区间）
-constexpr int SEQ_PERM_INSERT         = 2;   // insert（移动到新位置）
-constexpr int SEQ_PERM_3OPT           = 3;   // 3-opt（断 3 边重连）
+// --- Permutation in-row (element-level) ---
+constexpr int SEQ_PERM_SWAP           = 0;   // swap two positions
+constexpr int SEQ_PERM_REVERSE        = 1;   // 2-opt (reverse segment)
+constexpr int SEQ_PERM_INSERT         = 2;   // insert (move to new position)
+constexpr int SEQ_PERM_3OPT           = 3;   // 3-opt (break 3 edges and reconnect)
 
-// --- Permutation 行内（片段级）---
-constexpr int SEQ_PERM_OR_OPT         = 4;   // or-opt（移动连续 k 个元素）
+// --- Permutation in-row (segment-level) ---
+constexpr int SEQ_PERM_OR_OPT         = 4;   // or-opt (move k consecutive elements)
 
-// --- Permutation 行内（组合级）---
-constexpr int SEQ_PERM_DOUBLE_SWAP    = 30;  // 连续两次 swap（同行）
-constexpr int SEQ_PERM_TRIPLE_SWAP    = 31;  // 连续三次 swap（同行）
+// --- Permutation in-row (combo-level) ---
+constexpr int SEQ_PERM_DOUBLE_SWAP    = 30;  // two consecutive swaps (same row)
+constexpr int SEQ_PERM_TRIPLE_SWAP    = 31;  // three consecutive swaps (same row)
 
-// --- Permutation 跨行（元素级）---
-constexpr int SEQ_PERM_CROSS_RELOCATE = 5;   // 单元素移行
-constexpr int SEQ_PERM_CROSS_SWAP     = 6;   // 单元素换行
+// --- Permutation cross-row (element-level) ---
+constexpr int SEQ_PERM_CROSS_RELOCATE = 5;   // move a single element to another row
+constexpr int SEQ_PERM_CROSS_SWAP     = 6;   // swap single elements between two rows
 
-// --- Permutation 跨行（片段级）---
-constexpr int SEQ_PERM_SEG_RELOCATE   = 7;   // 片段移行
-constexpr int SEQ_PERM_SEG_SWAP       = 8;   // 片段换行（2-opt*）
-constexpr int SEQ_PERM_CROSS_EXCHANGE = 9;   // 片段互换（保序）
+// --- Permutation cross-row (segment-level) ---
+constexpr int SEQ_PERM_SEG_RELOCATE   = 7;   // move a segment to another row
+constexpr int SEQ_PERM_SEG_SWAP       = 8;   // swap segments between two rows (2-opt*)
+constexpr int SEQ_PERM_CROSS_EXCHANGE = 9;   // segment exchange (order preserved)
 
-// --- Binary 行内（元素级）---
-constexpr int SEQ_BIN_FLIP            = 0;   // 翻转一个位
-constexpr int SEQ_BIN_SWAP            = 1;   // 交换两个位
+// --- Binary in-row (element-level) ---
+constexpr int SEQ_BIN_FLIP            = 0;   // flip one bit
+constexpr int SEQ_BIN_SWAP            = 1;   // swap two bits
 
-// --- Binary 行内（片段级）---
-constexpr int SEQ_BIN_SEG_FLIP        = 2;   // 翻转连续 k 个位
-constexpr int SEQ_BIN_K_FLIP          = 3;   // 同时翻转 k 个随机位
+// --- Binary in-row (segment-level) ---
+constexpr int SEQ_BIN_SEG_FLIP        = 2;   // flip k consecutive bits
+constexpr int SEQ_BIN_K_FLIP          = 3;   // flip k random bits at once
 
-// --- Binary 跨行 ---
-constexpr int SEQ_BIN_CROSS_SWAP      = 4;   // 两行各一个位互换
-constexpr int SEQ_BIN_SEG_CROSS_SWAP  = 5;   // 两行各取一段互换
+// --- Binary cross-row ---
+constexpr int SEQ_BIN_CROSS_SWAP      = 4;   // swap one bit per row across two rows
+constexpr int SEQ_BIN_SEG_CROSS_SWAP  = 5;   // swap a segment from each row
 
-// --- 共享：行级（编码无关）---
-constexpr int SEQ_ROW_SWAP            = 10;  // 交换两行
-constexpr int SEQ_ROW_REVERSE         = 11;  // 反转行排列
-constexpr int SEQ_ROW_SPLIT           = 12;  // 一行拆两行
-constexpr int SEQ_ROW_MERGE           = 13;  // 两行合并
+// --- Shared: row-level (encoding-agnostic) ---
+constexpr int SEQ_ROW_SWAP            = 10;  // swap two rows
+constexpr int SEQ_ROW_REVERSE         = 11;  // reverse row order
+constexpr int SEQ_ROW_SPLIT           = 12;  // split one row into two
+constexpr int SEQ_ROW_MERGE           = 13;  // merge two rows
 
-// --- 特殊 ---
-constexpr int SEQ_PERTURBATION        = 14;  // 扰动（多步不可逆）
+// --- Special ---
+constexpr int SEQ_PERTURBATION        = 14;  // perturbation (multi-step, irreversible)
 
-// --- Integer 行内（元素级）---
-constexpr int SEQ_INT_RANDOM_RESET    = 0;   // 随机一个位置重置为 [lb, ub] 内随机值
-constexpr int SEQ_INT_DELTA           = 1;   // 随机一个位置 ±k（clamp 到 [lb, ub]）
-constexpr int SEQ_INT_SWAP            = 2;   // 交换两个位置的值
+// --- Integer in-row (element-level) ---
+constexpr int SEQ_INT_RANDOM_RESET    = 0;   // reset one position to random in [lb, ub]
+constexpr int SEQ_INT_DELTA           = 1;   // one position ±k (clamped to [lb, ub])
+constexpr int SEQ_INT_SWAP            = 2;   // swap values at two positions
 
-// --- Integer 行内（片段级）---
-constexpr int SEQ_INT_SEG_RESET       = 3;   // 连续 k 个位置全部重置
-constexpr int SEQ_INT_K_DELTA         = 4;   // 随机 k 个位置各自 ±1
+// --- Integer in-row (segment-level) ---
+constexpr int SEQ_INT_SEG_RESET       = 3;   // reset k consecutive positions
+constexpr int SEQ_INT_K_DELTA         = 4;   // k random positions, each ±1
 
-// --- Integer 跨行 ---
-constexpr int SEQ_INT_CROSS_SWAP      = 5;   // 两行各一个位置互换
+// --- Integer cross-row ---
+constexpr int SEQ_INT_CROSS_SWAP      = 5;   // swap one position per row across two rows
 
-// --- LNS（大邻域搜索）---
-constexpr int SEQ_LNS_SEGMENT_SHUFFLE = 20;  // 打乱连续片段
-constexpr int SEQ_LNS_SCATTER_SHUFFLE = 21;  // 打乱随机分散位置
-constexpr int SEQ_LNS_GUIDED_REBUILD  = 22;  // 关系矩阵引导重建
+// --- LNS (large neighborhood search) ---
+constexpr int SEQ_LNS_SEGMENT_SHUFFLE = 20;  // shuffle a contiguous segment
+constexpr int SEQ_LNS_SCATTER_SHUFFLE = 21;  // shuffle a scattered set of positions
+constexpr int SEQ_LNS_GUIDED_REBUILD  = 22;  // guided rebuild from relation matrix
 
 }  // namespace seq
 
 // ============================================================
-// RelationMatrix — G/O 关系矩阵（GPU global memory）
+// RelationMatrix — G/O relation matrix (GPU global memory)
 // ============================================================
-// G[i][j]: 元素 i 和 j 的分组倾向（对称，越大越倾向同组）
-// O[i][j]: 元素 i 排在 j 前面的倾向（不对称）
-// 存储为一维数组 [N * N]，行优先
-// 小规模 N<200 直接 Dense，P2 再做稀疏化
+// G[i][j]: grouping tendency of elements i and j (symmetric; higher → more same-group)
+// O[i][j]: tendency for element i to precede j (asymmetric)
+// Stored as a 1D row-major array [N * N]
+// For small N<200 use dense directly; P2 may add sparsification
 //
-// 更新时机：host 端，每个 batch 间隙
-// 使用时机：kernel 中 SEQ_LNS_GUIDED_REBUILD 读取
+// Updated: on the host, between batches
+// Used: read by SEQ_LNS_GUIDED_REBUILD inside the kernel
 
 struct RelationMatrix {
-    float* d_G;           // GPU 上的 G 矩阵 [N * N]
-    float* d_O;           // GPU 上的 O 矩阵 [N * N]
-    float* h_G;           // Host 上的 G 矩阵 [N * N]（用于更新后上传）
-    float* h_O;           // Host 上的 O 矩阵 [N * N]
-    int    N;             // 元素总数
-    float  decay;         // 衰减系数 α（默认 0.95）
-    int    update_count;  // 已更新次数（用于冷启动判断）
+    float* d_G;           // G matrix on GPU [N * N]
+    float* d_O;           // O matrix on GPU [N * N]
+    float* h_G;           // G matrix on host [N * N] (for upload after update)
+    float* h_O;           // O matrix on host [N * N]
+    int    N;             // total number of elements
+    float  decay;         // decay factor α (default 0.95)
+    int    update_count;  // number of updates so far (for cold-start logic)
 };
 
 // ============================================================
-// SeqRegistry — 运行时可用序列注册表
+// SeqRegistry — runtime-available sequence registry
 // ============================================================
-// 根据 EncodingType 和 dim1 自动确定哪些序列可用
-// 传到 GPU 供 sample_sequence() 使用
+// Which sequences are available is determined from EncodingType and dim1
+// Passed to GPU for sample_sequence()
 
 enum class SeqCategory : int {
-    InRow    = 0,   // 行内算子（swap, reverse, insert, ...）
-    CrossRow = 1,   // 跨行算子（cross_relocate, cross_swap, seg_relocate, ...）
-    RowLevel = 2,   // 行级算子（row_swap, row_reverse, split, merge）
-    LNS      = 3,   // 大邻域搜索
+    InRow    = 0,   // within-row operators (swap, reverse, insert, ...)
+    CrossRow = 1,   // cross-row operators (cross_relocate, cross_swap, seg_relocate, ...)
+    RowLevel = 2,   // row-level operators (row_swap, row_reverse, split, merge)
+    LNS      = 3,   // large neighborhood search
 };
 
 struct SeqRegistry {
-    int   ids[MAX_SEQ];       // 可用序列的 SeqID 列表
-    int   count;              // 可用序列数量
-    float weights[MAX_SEQ];   // 每个序列的当前权重（未归一化，延迟归一化）
-    float weights_sum;        // 权重和（缓存，用于延迟归一化）
-    float max_w[MAX_SEQ];     // 每个序列的权重上限（0 = 不限，用全局 cap）
-    SeqCategory categories[MAX_SEQ];  // 每个序列的分类（约束导向用）
+    int   ids[MAX_SEQ];       // SeqID list of available sequences
+    int   count;              // number of available sequences
+    float weights[MAX_SEQ];   // current weight per sequence (unnormalized; lazy normalization)
+    float weights_sum;        // sum of weights (cached for lazy normalization)
+    float max_w[MAX_SEQ];     // per-sequence weight cap (0 = unlimited, use global cap)
+    SeqCategory categories[MAX_SEQ];  // category per sequence (for constraint-directed mode)
 };
 
 // ============================================================
-// KStepConfig — 多步执行的步数选择配置
+// KStepConfig — step-count selection for multi-step execution
 // ============================================================
-// K=1: 单步（当前行为），K=2/3: 连续执行多个序列后再评估
-// 两层权重体系的第一层
+// K=1: single step (current behavior); K=2/3: run several sequences then evaluate
+// First layer of the two-level weight system
 //
-// 自适应策略：
-//   - 初始 K=1 权重很大（保守），K>1 权重小
-//   - K>1 带来改进 → 增大该 K 的权重
-//   - 长时间无改进 → 重置/增大 K>1 权重（跳出局部最优）
+// Adaptive policy:
+//   - Initially K=1 has large weight (conservative), K>1 small
+//   - If K>1 yields improvement → increase that K's weight
+//   - Long stagnation → reset / boost K>1 weights (escape local optima)
 
 struct KStepConfig {
-    float weights[MAX_K];     // K=1,2,3 的采样权重（归一化）
-    int   stagnation_count;   // 连续无改进的 batch 数（用于触发重置）
-    int   stagnation_limit;   // 触发重置的阈值（默认 5 个 batch）
+    float weights[MAX_K];     // sampling weights for K=1,2,3 (normalized)
+    int   stagnation_count;   // consecutive batches without improvement (triggers reset)
+    int   stagnation_limit;   // threshold to trigger reset (default 5 batches)
 };
 
-// 构建默认 K 步配置
+// Build default K-step configuration
 inline KStepConfig build_kstep_config() {
     KStepConfig kc;
-    kc.weights[0] = 0.80f;   // K=1: 初始主导
-    kc.weights[1] = 0.15f;   // K=2: 少量探索
-    kc.weights[2] = 0.05f;   // K=3: 极少探索
+    kc.weights[0] = 0.80f;   // K=1: dominates initially
+    kc.weights[1] = 0.15f;   // K=2: little exploration
+    kc.weights[2] = 0.05f;   // K=3: minimal exploration
     kc.stagnation_count = 0;
     kc.stagnation_limit = 5;
     return kc;
 };
 
 // ============================================================
-// ProblemProfile — 基于结构特征推断的问题画像
+// ProblemProfile — problem profile inferred from structural features
 // ============================================================
-// 第一层：纯结构推断（不感知语义），用于驱动算子注册和初始权重
-// 未来第二层：可扩展更细粒度的画像（如多属性、高约束等）
+// Layer 1: structure-only inference (no semantics), drives operator registration and initial weights
+// Future layer 2: finer profiles (e.g. multi-attribute, high constraint)
 
 enum class ScaleClass  { Small, Medium, Large };
 enum class StructClass { SingleSeq, MultiFixed, MultiPartition };
@@ -286,10 +281,10 @@ struct ProblemProfile {
     float         cross_row_prob;
 };
 
-// classify_problem() 定义在 ProblemConfig 之后
+// classify_problem() is defined after ProblemConfig
 
 // ============================================================
-// 权重预设 — 由 ScaleClass 驱动
+// Weight presets — driven by ScaleClass
 // ============================================================
 
 struct WeightPreset {
@@ -308,100 +303,100 @@ inline WeightPreset get_weight_preset(ScaleClass scale) {
     return { 0.50f, 0.80f, 0.006f, 0.01f };
 }
 
-// classify_problem() 和 build_seq_registry() 定义在 ProblemConfig 之后
+// classify_problem() and build_seq_registry() are defined after ProblemConfig
 
 // ============================================================
-// Solution<D1, D2> — 解的模板化表示
+// Solution<D1, D2> — templated solution representation
 // ============================================================
-// D1: 行数上限 (TSP=1, VRP≤16, Schedule≤8)
-// D2: 每行列数上限 (TSP≤64, 背包≤32)
-// 每个 Problem 选择最小够用的 D1/D2，编译器生成紧凑的结构
+// D1: max number of rows (TSP=1, VRP≤16, Schedule≤8)
+// D2: max columns per row (TSP≤64, knapsack≤32)
+// Each Problem picks the smallest sufficient D1/D2; compiler emits a compact layout
 
 template<int D1, int D2>
 struct Solution {
-    static constexpr int DIM1 = D1;   // 编译时行数上限
-    static constexpr int DIM2 = D2;   // 编译时列数上限
-    int   data[D1][D2];               // D1×D2×4 字节
-    int   dim2_sizes[D1];             // D1×4 字节
-    float objectives[MAX_OBJ];        // 16 字节（固定）
-    float penalty;                    // 4 字节
+    static constexpr int DIM1 = D1;   // compile-time max rows
+    static constexpr int DIM2 = D2;   // compile-time max columns per row
+    int   data[D1][D2];               // D1×D2×4 bytes
+    int   dim2_sizes[D1];             // D1×4 bytes
+    float objectives[MAX_OBJ];        // 16 bytes (fixed)
+    float penalty;                    // 4 bytes
 };
 
 // ============================================================
-// ProblemConfig — 问题的运行时元信息
+// ProblemConfig — runtime metadata for a problem
 // ============================================================
 
 struct ProblemConfig {
     EncodingType encoding;
-    int   dim1;                       // 实际使用的行数 (≤ D1)
-    int   dim2_default;               // 实际使用的列数 (≤ D2)
+    int   dim1;                       // actual number of rows used (≤ D1)
+    int   dim2_default;               // actual number of columns used (≤ D2)
     int   num_objectives;
     ObjDir obj_dirs[MAX_OBJ];
-    float obj_weights[MAX_OBJ];       // Weighted 模式下的权重
-    // 多目标比较
+    float obj_weights[MAX_OBJ];       // weights in Weighted mode
+    // Multi-objective comparison
     CompareMode compare_mode = CompareMode::Weighted;
-    int   obj_priority[MAX_OBJ] = {0, 1, 2, 3};  // Lexicographic 模式下的比较顺序（索引）
-    float obj_tolerance[MAX_OBJ] = {0.0f, 0.0f, 0.0f, 0.0f};  // 字典法容差：差值 <= tol 视为相等
+    int   obj_priority[MAX_OBJ] = {0, 1, 2, 3};  // comparison order in Lexicographic mode (indices)
+    float obj_tolerance[MAX_OBJ] = {0.0f, 0.0f, 0.0f, 0.0f};  // lexicographic tolerance: |diff| ≤ tol ⇒ tie
     int   value_lower_bound;
     int   value_upper_bound;
-    // v3.4: 统一行模式
-    RowMode row_mode      = RowMode::Single;  // 行模式（Single/Fixed/Partition）
-    float cross_row_prob  = 0.0f;     // 跨行 move 概率（0=纯行内操作）
-    int   total_elements  = 0;        // Partition 模式下的总元素数
-    int   perm_repeat_count = 1;      // 排列中每个值的重复次数（1=标准排列，>1=多重集排列）
+    // v3.4: unified row mode
+    RowMode row_mode      = RowMode::Single;  // row mode (Single/Fixed/Partition)
+    float cross_row_prob  = 0.0f;     // probability of cross-row moves (0 = within-row only)
+    int   total_elements  = 0;        // total elements in Partition mode
+    int   perm_repeat_count = 1;      // repeats per value in permutation (1 = standard; >1 = multiset)
 };
 
 // ============================================================
-// SolverConfig — 求解器参数
+// SolverConfig — solver parameters
 // ============================================================
 
 struct SolverConfig {
-    int   pop_size         = 0;       // 种群大小（0 = 自动匹配 GPU 最大并行度）
+    int   pop_size         = 0;       // population size (0 = auto to max GPU parallelism)
     int   max_gen          = 1000;
     float mutation_rate    = 0.1f;
     unsigned seed          = 42;
     bool  verbose          = true;
     int   print_every      = 100;
-    // 岛屿模型参数
-    int   num_islands      = 1;       // 0 = 自适应，1 = 纯爬山（无岛屿），>1 = 岛屿模型
-    int   migrate_interval = 100;     // 每隔多少代执行一次迁移
+    // Island model
+    int   num_islands      = 1;       // 0 = adaptive, 1 = pure hill climbing (no islands), >1 = island model
+    int   migrate_interval = 100;     // migrate every this many generations
     MigrateStrategy migrate_strategy = MigrateStrategy::Hybrid;
-    // 模拟退火参数
-    float sa_temp_init     = 0.0f;    // 初始温度（0 = 禁用 SA，纯爬山）
-    float sa_alpha         = 0.998f;  // 冷却率（每代乘以 alpha）
-    // v1.0: 交叉参数
-    float crossover_rate   = 0.1f;    // 每代中执行交叉的概率（vs 变异）
-    // v2.0: 自适应算子选择
-    bool  use_aos          = false;   // 启用 AOS（batch 间更新算子权重）
-    float aos_weight_floor = AOS_WEIGHT_FLOOR;  // 运行时可覆盖的 floor
-    float aos_weight_cap   = AOS_WEIGHT_CAP;    // 运行时可覆盖的 cap
-    // v2.1: 初始解策略
-    int   init_oversample  = 4;       // 采样倍数（1 = 不做采样择优，即纯随机）
-    float init_random_ratio = 0.3f;   // 纯随机解占比（多样性保底）
-    // v3.0: 工程可用性
-    float time_limit_sec   = 0.0f;   // 时间限制（秒，0 = 不限制，按 max_gen 跑完）
-    int   stagnation_limit = 0;      // 收敛检测：连续多少个 batch 无改进后 reheat（0 = 禁用）
-    float reheat_ratio     = 0.5f;   // reheat 时温度恢复到初始温度的比例
+    // Simulated annealing
+    float sa_temp_init     = 0.0f;    // initial temperature (0 = disable SA, hill climb only)
+    float sa_alpha         = 0.998f;  // cooling rate (multiply by alpha each generation)
+    // v1.0: crossover
+    float crossover_rate   = 0.1f;    // probability of crossover per generation (vs mutation)
+    // v2.0: adaptive operator selection
+    bool  use_aos          = false;   // enable AOS (update operator weights between batches)
+    float aos_weight_floor = AOS_WEIGHT_FLOOR;  // runtime-overridable floor
+    float aos_weight_cap   = AOS_WEIGHT_CAP;    // runtime-overridable cap
+    // v2.1: initial solution strategy
+    int   init_oversample  = 4;       // oversampling factor (1 = no sampling selection, pure random)
+    float init_random_ratio = 0.3f;   // fraction of purely random solutions (diversity floor)
+    // v3.0: engineering usability
+    float time_limit_sec   = 0.0f;   // time limit in seconds (0 = none, run to max_gen)
+    int   stagnation_limit = 0;      // convergence: reheat after this many batches without improvement (0 = off)
+    float reheat_ratio     = 0.5f;   // on reheat, fraction of initial temperature to restore
     // v3.5: CUDA Graph
-    bool  use_cuda_graph   = false;  // 启用 CUDA Graph（减少 kernel launch 开销）
-    // v3.6: AOS 更新频率控制
-    int   aos_update_interval = 10;  // 每隔多少个 batch 更新一次 AOS 权重（降低 cudaMemcpy 同步频率）
-    // v4.0: 约束导向 + 分层搜索
-    bool  use_constraint_directed = false;  // 启用约束导向（根据 penalty 比例动态调整跨行算子权重）
-    bool  use_phased_search       = false;  // 启用分层搜索（按进度调整全局 floor/cap）
-    // 分层搜索参数：三期阈值
-    float phase_explore_end  = 0.30f;  // 探索期结束（进度比例）
-    float phase_refine_start = 0.70f;  // 精细期开始（进度比例）
-    // 约束导向参数
-    float constraint_boost_max = 2.5f; // 高约束时跨行算子 cap 提升倍率上限
-    // v5.0: 多 GPU 协同
-    int   num_gpus             = 1;    // 使用的 GPU 数量（1 = 单 GPU，>1 = 多 GPU 协同）
-    float multi_gpu_interval_sec = 10.0f;  // GPU 间交换最优解的时间间隔（秒）
-    MultiGpuInjectMode multi_gpu_inject_mode = MultiGpuInjectMode::HalfIslands;  // 注入模式
+    bool  use_cuda_graph   = false;  // enable CUDA Graph (reduces kernel launch overhead)
+    // v3.6: AOS update frequency
+    int   aos_update_interval = 10;  // update AOS weights every this many batches (lower cudaMemcpy sync rate)
+    // v4.0: constraint-directed + phased search
+    bool  use_constraint_directed = false;  // constraint-directed mode (scale cross-row weights by penalty ratio)
+    bool  use_phased_search       = false;  // phased search (adjust global floor/cap by progress)
+    // Phased search: three-phase thresholds
+    float phase_explore_end  = 0.30f;  // end of exploration phase (progress fraction)
+    float phase_refine_start = 0.70f;  // start of refinement phase (progress fraction)
+    // Constraint-directed parameters
+    float constraint_boost_max = 2.5f; // max multiplier boost for cross-row cap under high constraint
+    // v5.0: multi-GPU cooperation
+    int   num_gpus             = 1;    // number of GPUs (1 = single GPU, >1 = multi-GPU)
+    float multi_gpu_interval_sec = 10.0f;  // interval in seconds to exchange best solutions across GPUs
+    MultiGpuInjectMode multi_gpu_inject_mode = MultiGpuInjectMode::HalfIslands;  // injection mode
 };
 
 // ============================================================
-// classify_problem — 从 ProblemConfig 推断问题画像
+// classify_problem — infer problem profile from ProblemConfig
 // ============================================================
 
 inline ProblemProfile classify_problem(const ProblemConfig& pcfg) {
@@ -424,7 +419,7 @@ inline ProblemProfile classify_problem(const ProblemConfig& pcfg) {
 }
 
 // ============================================================
-// build_seq_registry — 由 ProblemProfile 驱动的算子注册
+// build_seq_registry — operator registration driven by ProblemProfile
 // ============================================================
 
 inline SeqRegistry build_seq_registry(const ProblemProfile& prof) {
@@ -436,7 +431,10 @@ inline SeqRegistry build_seq_registry(const ProblemProfile& prof) {
     }
 
     auto add = [&](int id, float w, SeqCategory cat, float cap = 0.0f) {
-        if (reg.count >= MAX_SEQ) return;
+        if (reg.count >= MAX_SEQ) {
+            printf("[WARN] SeqRegistry full (MAX_SEQ=%d), ignoring SeqID %d\n", MAX_SEQ, id);
+            return;
+        }
         reg.ids[reg.count] = id;
         reg.weights[reg.count] = w;
         reg.max_w[reg.count] = cap;
@@ -514,7 +512,7 @@ inline SeqRegistry build_seq_registry(const ProblemProfile& prof) {
         }
     }
 
-    // 延迟归一化：只计算权重和，不归一化
+    // Lazy normalization: only sum weights; do not normalize here
     reg.weights_sum = 0.0f;
     for (int i = 0; i < reg.count; i++) {
         reg.weights_sum += reg.weights[i];
@@ -523,19 +521,19 @@ inline SeqRegistry build_seq_registry(const ProblemProfile& prof) {
 }
 
 // ============================================================
-// ObjConfig — 传到 GPU 的目标比较配置（紧凑结构）
+// ObjConfig — compact objective comparison config for GPU
 // ============================================================
 
 struct ObjConfig {
     int         num_obj;
     CompareMode mode;
-    ObjDir      dirs[MAX_OBJ];       // 每个目标的方向
-    float       weights[MAX_OBJ];    // Weighted 模式下的权重
-    int         priority[MAX_OBJ];   // Lexicographic 模式下的比较顺序
-    float       tolerance[MAX_OBJ];  // Lexicographic 模式下的容差
+    ObjDir      dirs[MAX_OBJ];       // direction per objective
+    float       weights[MAX_OBJ];    // weights in Weighted mode
+    int         priority[MAX_OBJ];   // comparison order in Lexicographic mode
+    float       tolerance[MAX_OBJ];  // tolerance in Lexicographic mode
 };
 
-// 从 ProblemConfig 构造 ObjConfig（CPU 端）
+// Build ObjConfig from ProblemConfig (CPU side)
 inline ObjConfig make_obj_config(const ProblemConfig& pcfg) {
     ObjConfig oc;
     oc.num_obj = pcfg.num_objectives;
@@ -550,7 +548,7 @@ inline ObjConfig make_obj_config(const ProblemConfig& pcfg) {
 }
 
 // ============================================================
-// SolveResult — solve() 的返回值
+// SolveResult — return value of solve()
 // ============================================================
 
 enum class StopReason { MaxGen, TimeLimit, Stagnation };
@@ -564,12 +562,12 @@ struct SolveResult {
 };
 
 // ============================================================
-// 目标重要性映射 — 统一 Weighted / Lexicographic 的重要性度量
+// Objective importance mapping — unified importance for Weighted / Lexicographic
 // ============================================================
-// 用于初始化选种（NSGA-II 加权拥挤度 + 核心目标预留名额）
+// Used for initial selection (NSGA-II weighted crowding + reserved slots for core objectives)
 // Weighted:      importance[i] = weight[i] / Σweight
 // Lexicographic: importance[i] = 0.5^rank[i] / Σ(0.5^rank)
-//   → 第一优先级 ~57%，第二 ~29%，第三 ~14%
+//   → first priority ~57%, second ~29%, third ~14%
 
 inline void compute_importance(const ObjConfig& oc, float* importance) {
     float sum = 0.0f;
@@ -590,26 +588,26 @@ inline void compute_importance(const ObjConfig& oc, float* importance) {
 }
 
 // ============================================================
-// 比较工具 — 支持 Weighted / Lexicographic
+// Comparison utilities — Weighted / Lexicographic
 // ============================================================
 
-// 将目标值统一为"越小越好"：Maximize 目标取负
+// Normalize objectives to "smaller is better": negate Maximize objectives
 __device__ __host__ inline float normalize_obj(float val, ObjDir dir) {
     return (dir == ObjDir::Maximize) ? -val : val;
 }
 
-// 核心比较：a 是否优于 b
-// v5.0: 添加 __host__ 支持多 GPU 在 CPU 端比较解
+// Core comparison: whether a is better than b
+// v5.0: add __host__ so multi-GPU can compare solutions on CPU
 template<typename Sol>
 __device__ __host__ inline bool is_better(const Sol& a, const Sol& b,
                                   const ObjConfig& oc) {
-    // penalty 优先：可行解一定优于不可行解
+    // Penalty first: feasible beats infeasible
     if (a.penalty <= 0.0f && b.penalty > 0.0f) return true;
     if (a.penalty > 0.0f && b.penalty <= 0.0f) return false;
     if (a.penalty > 0.0f && b.penalty > 0.0f) return a.penalty < b.penalty;
     
     if (oc.mode == CompareMode::Weighted) {
-        // 加权求和（权重已包含方向信息：Maximize 目标用负权重，或由 normalize_obj 处理）
+        // Weighted sum (weights may encode direction: negative for Maximize, or use normalize_obj)
         float sum_a = 0.0f, sum_b = 0.0f;
         for (int i = 0; i < oc.num_obj; i++) {
             float na = normalize_obj(a.objectives[i], oc.dirs[i]);
@@ -619,21 +617,22 @@ __device__ __host__ inline bool is_better(const Sol& a, const Sol& b,
         }
         return sum_a < sum_b;
     } else {
-        // 字典法：按 priority 顺序逐目标比较
+        // Lexicographic: compare objectives in priority order
         for (int p = 0; p < oc.num_obj; p++) {
             int idx = oc.priority[p];
+            if (idx < 0 || idx >= oc.num_obj) continue;
             float va = normalize_obj(a.objectives[idx], oc.dirs[idx]);
             float vb = normalize_obj(b.objectives[idx], oc.dirs[idx]);
             float diff = va - vb;
-            if (diff < -oc.tolerance[idx]) return true;   // a 明显更好
-            if (diff >  oc.tolerance[idx]) return false;  // b 明显更好
-            // 在容差内视为相等 → 继续比较下一个目标
+            if (diff < -oc.tolerance[idx]) return true;   // a clearly better
+            if (diff >  oc.tolerance[idx]) return false;  // b clearly better
+            // Within tolerance → tie, continue to next objective
         }
-        return false;  // 所有目标都在容差内相等
+        return false;  // all objectives tied within tolerance
     }
 }
 
-// 标量化（SA 接受概率用）：返回越小越好的标量
+// Scalarization (for SA acceptance): smaller is better
 template<typename Sol>
 __device__ __host__ inline float scalar_objective(const Sol& sol,
                                                     const ObjConfig& oc) {
@@ -643,13 +642,14 @@ __device__ __host__ inline float scalar_objective(const Sol& sol,
             sum += oc.weights[i] * normalize_obj(sol.objectives[i], oc.dirs[i]);
         return sum;
     } else {
-        // 字典法下 SA 用第一优先级目标作为标量
+        // In Lexicographic mode, SA uses the first-priority objective as the scalar
         int idx = oc.priority[0];
+        if (idx < 0 || idx >= oc.num_obj) idx = 0;
         return normalize_obj(sol.objectives[idx], oc.dirs[idx]);
     }
 }
 
-// 轻量比较：直接操作 float[] 目标数组（避免复制整个 Sol）
+// Lightweight comparison: operate on float[] objectives (avoid copying full Sol)
 __device__ inline bool obj_is_better(const float* new_objs, const float* old_objs,
                                       const ObjConfig& oc) {
     if (oc.mode == CompareMode::Weighted) {
@@ -662,6 +662,7 @@ __device__ inline bool obj_is_better(const float* new_objs, const float* old_obj
     } else {
         for (int p = 0; p < oc.num_obj; p++) {
             int idx = oc.priority[p];
+            if (idx < 0 || idx >= oc.num_obj) continue;
             float va = normalize_obj(new_objs[idx], oc.dirs[idx]);
             float vb = normalize_obj(old_objs[idx], oc.dirs[idx]);
             float diff = va - vb;
@@ -672,7 +673,7 @@ __device__ inline bool obj_is_better(const float* new_objs, const float* old_obj
     }
 }
 
-// 轻量标量化：直接操作 float[] 目标数组
+// Lightweight scalarization: operate on float[] objectives
 __device__ __host__ inline float obj_scalar(const float* objs, const ObjConfig& oc) {
     if (oc.mode == CompareMode::Weighted) {
         float sum = 0.0f;
@@ -681,60 +682,61 @@ __device__ __host__ inline float obj_scalar(const float* objs, const ObjConfig&
         return sum;
     } else {
         int idx = oc.priority[0];
+        if (idx < 0 || idx >= oc.num_obj) idx = 0;
         return normalize_obj(objs[idx], oc.dirs[idx]);
     }
 }
 
 // ============================================================
-// AOSStats — 自适应算子选择统计（每个 block 一份）
+// AOSStats — adaptive operator selection stats (one per block)
 // ============================================================
-// v3.0: 粒度从 3 层 → MAX_SEQ 个序列
-// 记录每个序列的使用次数和改进次数
-// batch 结束后由 host 聚合，更新 SeqRegistry 权重
+// v3.0: granularity from 3 layers → MAX_SEQ sequences
+// Records per-sequence usage and improvement counts
+// Host aggregates after each batch and updates SeqRegistry weights
 
 struct AOSStats {
-    // 算子层统计（第二层）
-    int usage[MAX_SEQ];       // 各序列使用次数
-    int improvement[MAX_SEQ]; // 各序列改进次数（delta < 0 且被接受）
-    // K 步数层统计（第一层）
-    int k_usage[MAX_K];       // K=1,2,3 各自使用次数
-    int k_improvement[MAX_K]; // K=1,2,3 各自改进次数
+    // Operator-level stats (second layer)
+    int usage[MAX_SEQ];       // per-sequence usage counts
+    int improvement[MAX_SEQ]; // per-sequence improvements (delta < 0 and accepted)
+    // K-step layer stats (first layer)
+    int k_usage[MAX_K];       // usage counts for K=1,2,3
+    int k_improvement[MAX_K]; // improvement counts for K=1,2,3
 };
 
 // ============================================================
-// ObjDef — 单个目标的定义（编译期常量）
+// ObjDef — definition of one objective (compile-time constant)
 // ============================================================
 
 struct ObjDef {
-    ObjDir dir;           // 优化方向
-    float  weight;        // Weighted 模式下的权重
-    float  tolerance;     // Lexicographic 模式下的容差
+    ObjDir dir;           // optimization direction
+    float  weight;        // weight in Weighted mode
+    float  tolerance;     // tolerance in Lexicographic mode
 };
 
 // ============================================================
-// HeuristicMatrix — 启发式初始解构造用的数据矩阵描述
+// HeuristicMatrix — data matrix descriptor for heuristic initial solutions
 // ============================================================
 
 struct HeuristicMatrix {
-    const float* data;   // host 端 N*N 矩阵
-    int N;               // 维度
+    const float* data;   // N×N matrix on host
+    int N;               // dimension
 };
 
 // ============================================================
-// ProblemBase<Derived, D1, D2> — CRTP 基类
+// ProblemBase<Derived, D1, D2> — CRTP base class
 //
-// 用户继承此基类，提供：
-//   static constexpr ObjDef OBJ_DEFS[] = {...};   — 目标元信息
-//   __device__ float compute_obj(int idx, ...) const;  — 目标分发
+// Users inherit this base and provide:
+//   static constexpr ObjDef OBJ_DEFS[] = {...};   — objective metadata
+//   __device__ float compute_obj(int idx, ...) const;  — objective dispatch
 //   __device__ float compute_penalty(...) const;
 //
-// 约定：OBJ_DEFS 和 compute_obj 紧挨着写，case N 对应 OBJ_DEFS[N]
-// NUM_OBJ 由 sizeof(OBJ_DEFS) 自动推导，无需手动维护
+// Convention: OBJ_DEFS and compute_obj stay aligned; case N maps to OBJ_DEFS[N]
+// NUM_OBJ is derived from sizeof(OBJ_DEFS); no manual count
 //
-// 基类自动提供：
-//   evaluate(sol)           — 遍历目标列表调用 compute_obj
-//   fill_obj_config(cfg)    — 从 OBJ_DEFS 自动填充 ProblemConfig
-//   obj_config()            — 直接生成 ObjConfig
+// Base class provides:
+//   evaluate(sol)           — loop objectives and call compute_obj
+//   fill_obj_config(cfg)    — fill ProblemConfig from OBJ_DEFS
+//   obj_config()            — build ObjConfig directly
 // ============================================================
 
 template<typename Derived, int D1_, int D2_>
@@ -743,10 +745,10 @@ struct ProblemBase {
     static constexpr int D2 = D2_;
     using Sol = Solution<D1, D2>;
     
-    // NUM_OBJ 从 OBJ_DEFS 数组自动推导
+    // NUM_OBJ derived from OBJ_DEFS array size
     static constexpr int NUM_OBJ = sizeof(Derived::OBJ_DEFS) / sizeof(ObjDef);
     
-    // 自动评估：遍历目标列表
+    // Automatic evaluation: iterate objectives
     __device__ void evaluate(Sol& sol) const {
         const auto& self = static_cast<const Derived&>(*this);
         constexpr int n = sizeof(Derived::OBJ_DEFS) / sizeof(ObjDef);
@@ -755,7 +757,7 @@ struct ProblemBase {
         sol.penalty = self.compute_penalty(sol);
     }
     
-    // 从 OBJ_DEFS 自动填充 ProblemConfig 的目标部分
+    // Fill objective fields of ProblemConfig from OBJ_DEFS
     void fill_obj_config(ProblemConfig& cfg) const {
         constexpr int n = sizeof(Derived::OBJ_DEFS) / sizeof(ObjDef);
         cfg.num_objectives = n;
@@ -763,59 +765,59 @@ struct ProblemBase {
             cfg.obj_dirs[i]      = Derived::OBJ_DEFS[i].dir;
             cfg.obj_weights[i]   = Derived::OBJ_DEFS[i].weight;
             cfg.obj_tolerance[i] = Derived::OBJ_DEFS[i].tolerance;
-            cfg.obj_priority[i]  = i;  // 列表顺序即优先级
+            cfg.obj_priority[i]  = i;  // list order is priority order
         }
     }
     
-    // 直接生成 ObjConfig（供 solver 使用）
+    // Build ObjConfig directly (for solver)
     ObjConfig obj_config() const {
         ProblemConfig pcfg;
         fill_obj_config(pcfg);
         return make_obj_config(pcfg);
     }
     
-    // 可选：返回 shared memory 需求（字节）
-    // 默认返回 0（不使用 shared memory）
-    // 子类覆盖：如果问题数据可以放入 shared memory，返回实际大小
+    // Optional: shared memory requirement (bytes)
+    // Default 0 (no shared memory)
+    // Override if problem data fits in shared memory; return actual size
     size_t shared_mem_bytes() const {
         return 0;
     }
     
-    // 可选：加载问题数据到 shared memory
-    // 默认空实现（不使用 shared memory）
-    // 子类覆盖：如果 shared_mem_bytes() > 0，实现数据加载逻辑
+    // Optional: load problem data into shared memory
+    // Default no-op (no shared memory)
+    // Override if shared_mem_bytes() > 0 to implement loading
     __device__ void load_shared(char* smem, int tid, int bsz) {
-        (void)smem; (void)tid; (void)bsz;  // 默认：不做任何事
+        (void)smem; (void)tid; (void)bsz;  // default: no-op
     }
     
-    // 每个 block 在 global memory 中的热数据工作集大小（字节）
-    // 用于 auto pop_size 估算 L2 cache 压力
-    // 默认 = shared_mem_bytes()（数据在 smem 时，gmem 工作集为 0 不影响）
-    // 子类覆盖：当 shared_mem_bytes() 返回 0（数据放不进 smem）时，
-    //           返回实际数据大小（如距离矩阵 n*n*sizeof(float)）
+    // Hot working-set size in global memory per block (bytes)
+    // Used for auto pop_size L2 cache pressure estimate
+    // Default = shared_mem_bytes() (harmless: when data is in smem, the gmem working set is effectively 0)
+    // Override when shared_mem_bytes() is 0 (data does not fit in smem):
+    //           return actual data size (e.g. distance matrix n*n*sizeof(float))
     size_t working_set_bytes() const {
         return static_cast<const Derived&>(*this).shared_mem_bytes();
     }
     
-    // 可选：初始化 G/O 关系矩阵（为 GUIDED_REBUILD 提供先验知识）
-    // G[i*N+j]: 元素 i 和 j 的分组倾向（对称，[0,1]，越大越倾向同组）
-    // O[i*N+j]: 元素 i 排在 j 前面的倾向（不对称，[0,1]）
-    // 默认不提供（全零），搜索过程中通过 EMA 从历史好解积累
-    // 用户覆盖示例：距离近 → G 和 O 都高
+    // Optional: initialize G/O relation matrix (prior for GUIDED_REBUILD)
+    // G[i*N+j]: grouping tendency of i and j (symmetric, [0,1]; higher → same group)
+    // O[i*N+j]: tendency for i before j (asymmetric, [0,1])
+    // By default no prior is provided (all zeros); it is accumulated via EMA from good solutions during search
+    // Example override: close distance → high G and O
     void init_relation_matrix(float* h_G, float* h_O, int N) const {
-        (void)h_G; (void)h_O; (void)N;  // 默认：不做任何事（保持全零）
+        (void)h_G; (void)h_O; (void)N;  // default: no-op (keep zeros)
     }
     
-    // 可选：返回 host 端数据矩阵供启发式初始解构造
-    // 默认返回 0（不提供），子类 override 后填充 out 数组并返回实际数量
+    // Optional: host-side data matrices for heuristic initial solutions
+    // Default 0 (none); override to fill out[] and return count
     int heuristic_matrices(HeuristicMatrix* out, int max_count) const {
         (void)out; (void)max_count;
         return 0;
     }
     
-    // v5.0: 多 GPU 协同 — 克隆 Problem 到指定 GPU
-    // 子类需实现：cudaSetDevice(gpu_id) + 分配设备内存 + 拷贝数据
-    // 返回新的 Problem 实例指针（在 host 端，但其内部设备指针指向 gpu_id）
+    // v5.0: multi-GPU — clone Problem to a given GPU
+    // Subclasses implement: cudaSetDevice(gpu_id) + device alloc + copy
+    // Returns new Problem* on host; internal device pointers target gpu_id
     virtual Derived* clone_to_device(int gpu_id) const {
         (void)gpu_id;
         fprintf(stderr, "Error: clone_to_device() not implemented for this Problem type\n");
diff --git a/prototype/problems/assignment.cuh b/prototype/problems/assignment.cuh
index 6b4cdfb..7f8f975 100644
--- a/prototype/problems/assignment.cuh
+++ b/prototype/problems/assignment.cuh
@@ -1,7 +1,7 @@
 /**
- * assignment.cuh - 指派问题
- * 
- * 继承 ProblemBase，使用 ObjDef 目标注册机制
+ * assignment.cuh - assignment problem
+ *
+ * Extends ProblemBase with ObjDef objective registration.
  */
 
 #pragma once
@@ -11,10 +11,10 @@
 
 struct AssignmentProblem : ProblemBase<AssignmentProblem, 1, 16> {
     const float* d_cost;
-    const float* h_cost;  // host 端成本矩阵（用于 init_relation_matrix）
+    const float* h_cost;  // host cost matrix (for init_relation_matrix)
     int n;
     
-    // ---- 目标计算 ----
+    // ---- objective evaluation ----
     __device__ float calc_total_cost(const Sol& sol) const {
         float total = 0.0f;
         const int* assign = sol.data[0];
@@ -24,7 +24,7 @@ struct AssignmentProblem : ProblemBase<AssignmentProblem, 1, 16> {
         return total;
     }
     
-    // ---- 目标定义（OBJ_DEFS 与 compute_obj 必须一一对应）----
+    // ---- objective defs (OBJ_DEFS must match compute_obj one-to-one) ----
     static constexpr ObjDef OBJ_DEFS[] = {
         {ObjDir::Minimize, 1.0f, 0.0f},   // case 0: calc_total_cost
     };
@@ -47,7 +47,7 @@ struct AssignmentProblem : ProblemBase<AssignmentProblem, 1, 16> {
         return cfg;
     }
     
-    // ---- shared memory 接口 ----
+    // ---- shared memory interface ----
     static constexpr size_t SMEM_LIMIT = 48 * 1024;
     
     size_t shared_mem_bytes() const {
@@ -66,12 +66,12 @@ struct AssignmentProblem : ProblemBase<AssignmentProblem, 1, 16> {
         d_cost = sc;
     }
     
-    // 成本先验：task j 和 task k 如果被相似 agent 偏好，G 值高
-    // O 矩阵：task j 在位置 i 成本低 → O[j][k] 略高（j 倾向排在 k 前面的位置）
+    // Cost prior: if tasks j and k are similarly preferred by agents, G is high
+    // O matrix: low cost for task j at slot i → slightly higher O[j][k] (j tends before k)
     void init_relation_matrix(float* G, float* O, int N) const {
         if (!h_cost || N != n) return;
-        // 对每个 task，构建成本向量，task 间余弦相似度 → G
-        // 简化：成本列向量的相关性
+        // Per task, build cost vectors; cosine similarity between tasks → G
+        // Simplified: correlation of cost columns
         float max_c = 0.0f;
         for (int i = 0; i < N * N; i++)
             if (h_cost[i] > max_c) max_c = h_cost[i];
@@ -80,7 +80,7 @@ struct AssignmentProblem : ProblemBase<AssignmentProblem, 1, 16> {
         for (int j = 0; j < N; j++)
             for (int k = 0; k < N; k++) {
                 if (j == k) continue;
-                // G: 两个 task 的成本向量越相似 → 越可能互换
+                // G: more similar cost columns → more likely to swap tasks
                 float dot = 0.0f, nj = 0.0f, nk = 0.0f;
                 for (int i = 0; i < N; i++) {
                     float cj = h_cost[i * N + j] / max_c;
diff --git a/prototype/problems/bin_packing.cuh b/prototype/problems/bin_packing.cuh
index f230d4a..9616f95 100644
--- a/prototype/problems/bin_packing.cuh
+++ b/prototype/problems/bin_packing.cuh
@@ -1,13 +1,13 @@
 /**
- * bin_packing.cuh - 一维装箱问题（Integer 编码 + 约束）
- * 
- * N 个物品，每个重量 w[i]，装入最多 B 个箱子，每个箱子容量 C。
- * 决策变量：data[0][i] ∈ [0, B-1]，表示物品 i 放入的箱子编号。
- * 目标：最小化使用的箱子数。
- * 约束：每个箱子总重不超过 C，超出部分作为 penalty。
- * 
- * 验证实例：8 物品 weights=[7,5,3,4,6,2,8,1], C=10, 最优=4 箱
- *   箱0={7,3}=10, 箱1={5,4,1}=10, 箱2={6,2}=8, 箱3={8}=8
+ * bin_packing.cuh - one-dimensional bin packing (Integer encoding + constraints)
+ *
+ * N items with weights w[i], at most B bins, capacity C per bin.
+ * Decision: data[0][i] in [0, B-1] = bin index for item i.
+ * Objective: minimize number of bins used.
+ * Constraint: bin load ≤ C; overflow contributes to penalty.
+ *
+ * Validation instance: 8 items weights=[7,5,3,4,6,2,8,1], C=10, optimum=4 bins
+ *   bin0={7,3}=10, bin1={5,4,1}=10, bin2={6,2}=8, bin3={8}=8
  */
 
 #pragma once
@@ -16,9 +16,9 @@
 
 struct BinPackingProblem : ProblemBase<BinPackingProblem, 1, 64> {
     const float* d_weights;
-    int n;              // 物品数
-    int max_bins;       // 最大箱子数 B
-    float capacity;     // 箱子容量 C
+    int n;              // number of items
+    int max_bins;       // max bins B
+    float capacity;     // bin capacity C
     
     __device__ float calc_bins_used(const Sol& sol) const {
         bool used[32] = {};
diff --git a/prototype/problems/graph_color.cuh b/prototype/problems/graph_color.cuh
index fada0ec..1df1101 100644
--- a/prototype/problems/graph_color.cuh
+++ b/prototype/problems/graph_color.cuh
@@ -1,11 +1,11 @@
 /**
- * graph_color.cuh - 图着色问题（Integer 编码）
- * 
- * N 个节点的图，用 k 种颜色着色。
- * 决策变量：data[0][i] ∈ [0, k-1]，表示节点 i 的颜色。
- * 目标：最小化冲突边数（相邻节点同色的边数）。
- * 
- * 验证实例：Petersen 图（10 节点 15 边，色数=3，最优冲突=0）
+ * graph_color.cuh - graph coloring (Integer encoding)
+ *
+ * Graph on N nodes, k colors.
+ * Decision: data[0][i] in [0, k-1] = color of node i.
+ * Objective: minimize number of conflicting edges (adjacent same color).
+ *
+ * Validation instance: Petersen graph (10 nodes, 15 edges, chromatic number 3, optimal conflicts=0)
  */
 
 #pragma once
@@ -13,9 +13,9 @@
 #include "cuda_utils.cuh"
 
 struct GraphColorProblem : ProblemBase<GraphColorProblem, 1, 64> {
-    const int* d_adj;   // 邻接矩阵 [N*N]（1=相邻, 0=不相邻）
-    int n;              // 节点数
-    int k;              // 颜色数
+    const int* d_adj;   // adjacency [N*N] (1=edge, 0=no edge)
+    int n;              // number of nodes
+    int k;              // number of colors
     
     __device__ float calc_conflicts(const Sol& sol) const {
         int conflicts = 0;
diff --git a/prototype/problems/jsp.cuh b/prototype/problems/jsp.cuh
index 2297380..24c45d9 100644
--- a/prototype/problems/jsp.cuh
+++ b/prototype/problems/jsp.cuh
@@ -1,26 +1,26 @@
 /**
- * jsp.cuh - 车间调度问题 (Job Shop Scheduling Problem)
- * 
- * J 个工件，每个工件有 O 道工序，每道工序指定机器和耗时。
- * 
- * === 编码方案 A：Integer 多行（时间表编码）===
- * JSPProblem: data[j][i] = 工件 j 的第 i 道工序的开始时间
+ * jsp.cuh - Job Shop Scheduling Problem (JSSP)
+ *
+ * J jobs, each with O operations; each op specifies machine and duration.
+ *
+ * === Encoding A: multi-row Integer (time-table encoding) ===
+ * JSPProblem: data[j][i] = start time of job j's i-th operation
  *   dim1 = num_jobs, dim2_default = num_ops
- *   row_mode = Fixed（禁止 ROW_SPLIT/ROW_MERGE）
- *   每行代表一个工件的固定工序序列，行长度不可变
- * 
- * === 编码方案 B：Permutation 多重集（工序排列编码）===
- * JSPPermProblem: data[0][k] = 工件编号（0..J-1），长度 J*O
- *   值 j 出现 O 次。从左到右扫描，第 t 次遇到值 j 表示工件 j 的第 t 道工序。
+ *   row_mode = Fixed (no ROW_SPLIT/ROW_MERGE)
+ *   Each row is a fixed op sequence for one job; row length is fixed.
+ *
+ * === Encoding B: Permutation multiset (operation sequence encoding) ===
+ * JSPPermProblem: data[0][k] = job id (0..J-1), length J*O
+ *   Value j appears O times. Left-to-right scan: t-th occurrence of j is job j's t-th op.
  *   dim1 = 1, dim2_default = J*O, perm_repeat_count = O
- *   标准 Permutation 算子（swap/reverse/insert）天然保持多重集结构
- * 
- * 目标：Minimize makespan（所有工件完成时间的最大值）。
- * 约束：
- *   (a) 工序顺序：同一工件的工序必须按序执行
- *   (b) 机器冲突：同一机器同一时刻只能处理一个工序
- * 
- * 验证实例：自定义 3 工件 3 机器 (3x3)，最优 makespan = 12
+ *   Standard permutation ops (swap/reverse/insert) preserve multiset structure.
+ *
+ * Objective: minimize makespan (max completion time over jobs).
+ * Constraints:
+ *   (a) Precedence: ops of the same job must run in order.
+ *   (b) Machine conflict: one op per machine at a time.
+ *
+ * Validation instance: custom 3 jobs × 3 machines (3x3), optimal makespan = 12
  */
 
 #pragma once
@@ -28,16 +28,16 @@
 #include "cuda_utils.cuh"
 
 // ============================================================
-// 编码方案 A：Integer 多行（时间表编码）
+// Encoding A: multi-row Integer (time-table encoding)
 // ============================================================
 
 struct JSPProblem : ProblemBase<JSPProblem, 8, 16> {
-    const int*   d_machine;     // 工序所需机器 [J*O]
-    const float* d_duration;    // 工序耗时 [J*O]
-    int num_jobs;               // 工件数 J
-    int num_ops;                // 每工件工序数 O
-    int num_machines;           // 机器数 M
-    int time_horizon;           // 时间上界
+    const int*   d_machine;     // machine per op [J*O]
+    const float* d_duration;    // op duration [J*O]
+    int num_jobs;               // number of jobs J
+    int num_ops;                // ops per job O
+    int num_machines;           // number of machines M
+    int time_horizon;           // time horizon upper bound
     
     __device__ float calc_makespan(const Sol& sol) const {
         float makespan = 0.0f;
@@ -62,7 +62,7 @@ struct JSPProblem : ProblemBase<JSPProblem, 8, 16> {
     __device__ float compute_penalty(const Sol& sol) const {
         float penalty = 0.0f;
         
-        // (a) 工序顺序约束
+        // (a) Precedence constraints
         for (int j = 0; j < num_jobs; j++) {
             for (int i = 1; i < num_ops; i++) {
                 float prev_end = (float)sol.data[j][i-1] + d_duration[j * num_ops + (i-1)];
@@ -72,7 +72,7 @@ struct JSPProblem : ProblemBase<JSPProblem, 8, 16> {
             }
         }
         
-        // (b) 机器冲突约束
+        // (b) Machine conflict constraints
         int total = num_jobs * num_ops;
         for (int a = 0; a < total; a++) {
             int ja = a / num_ops, ia = a % num_ops;
@@ -151,28 +151,28 @@ struct JSPProblem : ProblemBase<JSPProblem, 8, 16> {
 };
 
 // ============================================================
-// 编码方案 B：Permutation 多重集（工序排列编码）
+// Encoding B: Permutation multiset (operation sequence encoding)
 // ============================================================
-// data[0] 是长度 J*O 的排列，值域 [0, J)，每个值出现 O 次
-// 从左到右扫描：第 t 次遇到值 j → 安排工件 j 的第 t 道工序
-// 贪心解码：每道工序安排在"最早可行时间"（满足工序顺序 + 机器空闲）
+// data[0] is a length-J*O sequence with values in [0, J), each appearing O times.
+// Left-to-right: t-th occurrence of j schedules job j's t-th operation.
+// Greedy decode: each op at earliest feasible time (precedence + machine free).
 
 struct JSPPermProblem : ProblemBase<JSPPermProblem, 1, 64> {
-    const int*   d_machine;     // 工序所需机器 [J*O]
-    const float* d_duration;    // 工序耗时 [J*O]
+    const int*   d_machine;     // machine per op [J*O]
+    const float* d_duration;    // op duration [J*O]
     int num_jobs;
     int num_ops;
     int num_machines;
     
-    // 贪心解码：从排列生成调度方案，返回 makespan
+    // Greedy decode: build schedule from permutation, return makespan
     __device__ float decode_and_makespan(const Sol& sol) const {
         int total = num_jobs * num_ops;
         int size = sol.dim2_sizes[0];
         if (size < total) return 1e9f;
         
-        float job_avail[8];     // 每个工件的下一道工序最早开始时间
-        float mach_avail[8];    // 每台机器的最早空闲时间
-        int   job_next_op[8];   // 每个工件的下一道待安排工序编号
+        float job_avail[8];     // earliest start for next op of each job
+        float mach_avail[8];    // earliest machine free time
+        int   job_next_op[8];   // next op index to schedule per job
         
         for (int j = 0; j < num_jobs; j++) { job_avail[j] = 0.0f; job_next_op[j] = 0; }
         for (int m = 0; m < num_machines; m++) mach_avail[m] = 0.0f;
@@ -182,13 +182,13 @@ struct JSPPermProblem : ProblemBase<JSPPermProblem, 1, 64> {
             int j = sol.data[0][k];
             if (j < 0 || j >= num_jobs) return 1e9f;
             int op = job_next_op[j];
-            if (op >= num_ops) continue;  // 该工件已安排完
+            if (op >= num_ops) continue;  // job already fully scheduled
             
             int flat = j * num_ops + op;
             int m = d_machine[flat];
             float dur = d_duration[flat];
             
-            // 最早开始时间 = max(工件前序完成, 机器空闲)
+            // Earliest start = max(job predecessor done, machine free)
             float start = fmaxf(job_avail[j], mach_avail[m]);
             float end = start + dur;
             
@@ -212,7 +212,7 @@ struct JSPPermProblem : ProblemBase<JSPPermProblem, 1, 64> {
         }
     }
     
-    // 贪心解码天然满足约束，penalty 始终为 0
+    // Greedy decode satisfies constraints; penalty is always 0
     __device__ float compute_penalty(const Sol& sol) const {
         return 0.0f;
     }
diff --git a/prototype/problems/knapsack.cuh b/prototype/problems/knapsack.cuh
index 82f47e8..0bf4a8e 100644
--- a/prototype/problems/knapsack.cuh
+++ b/prototype/problems/knapsack.cuh
@@ -1,7 +1,7 @@
 /**
- * knapsack.cuh - 0-1 背包问题
- * 
- * 继承 ProblemBase，使用 ObjDef 目标注册机制
+ * knapsack.cuh - 0-1 knapsack
+ *
+ * Extends ProblemBase with ObjDef objective registration.
  */
 
 #pragma once
@@ -10,13 +10,13 @@
 #include "operators.cuh"
 
 struct KnapsackProblem : ProblemBase<KnapsackProblem, 1, 32> {
-    // 问题数据（d_weights 是物品重量，非目标权重）
+    // problem data (d_weights are item weights, not objective weights)
     const float* d_weights;
     const float* d_values;
     float capacity;
     int n;
     
-    // ---- 目标计算 ----
+    // ---- objective evaluation ----
     __device__ float calc_total_value(const Sol& sol) const {
         float tv = 0.0f;
         const int* sel = sol.data[0];
@@ -26,7 +26,7 @@ struct KnapsackProblem : ProblemBase<KnapsackProblem, 1, 32> {
         return tv;
     }
     
-    // ---- 目标定义（OBJ_DEFS 与 compute_obj 必须一一对应）----
+    // ---- objective defs (OBJ_DEFS must match compute_obj one-to-one) ----
     static constexpr ObjDef OBJ_DEFS[] = {
         {ObjDir::Maximize, 1.0f, 0.0f},   // case 0: calc_total_value
     };
@@ -55,7 +55,7 @@ struct KnapsackProblem : ProblemBase<KnapsackProblem, 1, 32> {
         return cfg;
     }
     
-    // ---- shared memory 接口 ----
+    // ---- shared memory interface ----
     size_t shared_mem_bytes() const {
         return 2 * (size_t)n * sizeof(float);
     }
diff --git a/prototype/problems/load_balance.cuh b/prototype/problems/load_balance.cuh
index b462c9f..ee92017 100644
--- a/prototype/problems/load_balance.cuh
+++ b/prototype/problems/load_balance.cuh
@@ -1,12 +1,12 @@
 /**
- * load_balance.cuh - 离散负载均衡问题（Integer 编码验证）
- * 
- * N 个任务分配到 M 台机器，每个任务有一个处理时间 p[i]。
- * 决策变量：data[0][i] ∈ [0, M-1]，表示任务 i 分配到哪台机器。
- * 目标：最小化 makespan（最大机器负载）。
- * 
- * 已知 NP-hard（等价于 multiprocessor scheduling / load balancing）。
- * LPT（最长处理时间优先）贪心可得 4/3 近似。
+ * load_balance.cuh - discrete load balancing (Integer encoding sanity check)
+ *
+ * N tasks on M machines, processing time p[i] per task.
+ * Decision: data[0][i] in [0, M-1] = machine for task i.
+ * Objective: minimize makespan (max machine load).
+ *
+ * Known to be NP-hard (equivalent to multiprocessor scheduling / load balancing).
+ * The LPT (longest processing time first) greedy heuristic achieves a 4/3-approximation.
  */
 
 #pragma once
@@ -14,12 +14,12 @@
 #include "cuda_utils.cuh"
 
 struct LoadBalanceProblem : ProblemBase<LoadBalanceProblem, 1, 64> {
-    const float* d_proc_time;   // 任务处理时间 [N]
-    int n;                      // 任务数
-    int m;                      // 机器数
+    const float* d_proc_time;   // task processing times [N]
+    int n;                      // number of tasks
+    int m;                      // number of machines
     
     __device__ float calc_makespan(const Sol& sol) const {
-        float load[32] = {};    // 最多 32 台机器
+        float load[32] = {};    // at most 32 machines
         int size = sol.dim2_sizes[0];
         for (int i = 0; i < size; i++) {
             int machine = sol.data[0][i];
@@ -43,7 +43,7 @@ struct LoadBalanceProblem : ProblemBase<LoadBalanceProblem, 1, 64> {
     }
     
     __device__ float compute_penalty(const Sol& sol) const {
-        return 0.0f;   // 无约束（任何分配都合法）
+        return 0.0f;   // no side constraints (any assignment is feasible)
     }
     
     ProblemConfig config() const {
diff --git a/prototype/problems/qap.cuh b/prototype/problems/qap.cuh
index 69343e2..352e3ca 100644
--- a/prototype/problems/qap.cuh
+++ b/prototype/problems/qap.cuh
@@ -1,14 +1,14 @@
 /**
- * qap.cuh - 二次分配问题 (Quadratic Assignment Problem)
- * 
- * N 个设施分配到 N 个位置（排列编码）。
- * 决策变量：data[0][i] = 设施 i 分配到的位置。
- * 目标：Minimize sum(flow[i][j] * dist[perm[i]][perm[j]])
- * 
- * 验证实例：自定义 5x5
- *   flow: 设施间的物流量
- *   dist: 位置间的距离
- *   已知最优 = 58
+ * qap.cuh - Quadratic Assignment Problem (QAP)
+ *
+ * Assign N facilities to N locations (permutation encoding).
+ * Decision: data[0][i] = location assigned to facility i.
+ * Objective: Minimize sum(flow[i][j] * dist[perm[i]][perm[j]])
+ *
+ * Validation instance: custom 5x5
+ *   flow: inter-facility flow
+ *   dist: inter-location distances
+ *   known optimum = 58
  */
 
 #pragma once
@@ -16,8 +16,10 @@
 #include "cuda_utils.cuh"
 
 struct QAPProblem : ProblemBase<QAPProblem, 1, 32> {
-    const float* d_flow;    // 物流量矩阵 [N*N]
-    const float* d_dist;    // 距离矩阵 [N*N]
+    const float* d_flow;    // flow matrix [N*N] (device)
+    const float* d_dist;    // distance matrix [N*N] (device)
+    const float* h_flow;    // flow matrix [N*N] (host, for clone_to_device)
+    const float* h_dist;    // distance matrix [N*N] (host, for clone_to_device)
     int n;
     
     __device__ float calc_cost(const Sol& sol) const {
@@ -64,14 +66,16 @@ struct QAPProblem : ProblemBase<QAPProblem, 1, 32> {
         d_dist = sd;
     }
     
-    static QAPProblem create(const float* h_flow, const float* h_dist, int n) {
+    static QAPProblem create(const float* h_flow_in, const float* h_dist_in, int n) {
         QAPProblem prob;
         prob.n = n;
+        prob.h_flow = h_flow_in;
+        prob.h_dist = h_dist_in;
         float *df, *dd;
         CUDA_CHECK(cudaMalloc(&df, sizeof(float) * n * n));
         CUDA_CHECK(cudaMalloc(&dd, sizeof(float) * n * n));
-        CUDA_CHECK(cudaMemcpy(df, h_flow, sizeof(float) * n * n, cudaMemcpyHostToDevice));
-        CUDA_CHECK(cudaMemcpy(dd, h_dist, sizeof(float) * n * n, cudaMemcpyHostToDevice));
+        CUDA_CHECK(cudaMemcpy(df, h_flow_in, sizeof(float) * n * n, cudaMemcpyHostToDevice));
+        CUDA_CHECK(cudaMemcpy(dd, h_dist_in, sizeof(float) * n * n, cudaMemcpyHostToDevice));
         prob.d_flow = df; prob.d_dist = dd;
         return prob;
     }
@@ -82,18 +86,12 @@ struct QAPProblem : ProblemBase<QAPProblem, 1, 32> {
         d_flow = nullptr; d_dist = nullptr;
     }
     
-    // v5.0: 多 GPU 协同 — 克隆到指定 GPU
+    // v5.0: multi-GPU — clone onto a given device
     QAPProblem* clone_to_device(int gpu_id) const override {
         int orig_device;
         CUDA_CHECK(cudaGetDevice(&orig_device));
         
-        // 先下载数据到 host（从当前设备）
-        float* h_flow = new float[n * n];
-        float* h_dist = new float[n * n];
-        CUDA_CHECK(cudaMemcpy(h_flow, d_flow, sizeof(float) * n * n, cudaMemcpyDeviceToHost));
-        CUDA_CHECK(cudaMemcpy(h_dist, d_dist, sizeof(float) * n * n, cudaMemcpyDeviceToHost));
-        
-        // 切换到目标 GPU 并上传
+        // Upload from the retained host matrices; h_flow/h_dist must still be valid (no D2H needed)
         CUDA_CHECK(cudaSetDevice(gpu_id));
         float *df, *dd;
         CUDA_CHECK(cudaMalloc(&df, sizeof(float) * n * n));
@@ -101,15 +99,12 @@ struct QAPProblem : ProblemBase<QAPProblem, 1, 32> {
         CUDA_CHECK(cudaMemcpy(df, h_flow, sizeof(float) * n * n, cudaMemcpyHostToDevice));
         CUDA_CHECK(cudaMemcpy(dd, h_dist, sizeof(float) * n * n, cudaMemcpyHostToDevice));
         
-        delete[] h_flow;
-        delete[] h_dist;
-        
-        // 恢复原设备
         CUDA_CHECK(cudaSetDevice(orig_device));
         
-        // 创建新实例
         QAPProblem* new_prob = new QAPProblem();
         new_prob->n = n;
+        new_prob->h_flow = h_flow;
+        new_prob->h_dist = h_dist;
         new_prob->d_flow = df;
         new_prob->d_dist = dd;
         
diff --git a/prototype/problems/schedule.cuh b/prototype/problems/schedule.cuh
index 12409e1..0862fb3 100644
--- a/prototype/problems/schedule.cuh
+++ b/prototype/problems/schedule.cuh
@@ -1,8 +1,8 @@
 /**
- * schedule.cuh - 排班问题
- * 
- * 继承 ProblemBase，使用 ObjDef 目标注册机制
- * 2 个目标：总成本（min）+ 不公平度（min，权重更高）
+ * schedule.cuh - staff scheduling
+ *
+ * Extends ProblemBase with ObjDef objective registration.
+ * Two objectives: total cost (min) + unfairness (min, higher weight).
  */
 
 #pragma once
@@ -14,7 +14,7 @@ struct ScheduleProblem : ProblemBase<ScheduleProblem, 8, 16> {
     const float* d_cost;
     int days, emps, required;
     
-    // ---- 目标计算 ----
+    // ---- objective evaluation ----
     __device__ float calc_total_cost(const Sol& sol) const {
         float total = 0.0f;
         for (int d = 0; d < days; d++)
@@ -37,7 +37,7 @@ struct ScheduleProblem : ProblemBase<ScheduleProblem, 8, 16> {
         return (float)(max_w - min_w);
     }
     
-    // ---- 目标定义（OBJ_DEFS 与 compute_obj 必须一一对应）----
+    // ---- objective defs (OBJ_DEFS must match compute_obj one-to-one) ----
     static constexpr ObjDef OBJ_DEFS[] = {
         {ObjDir::Minimize, 1.0f, 0.0f},   // case 0: calc_total_cost
         {ObjDir::Minimize, 5.0f, 0.0f},   // case 1: calc_unfairness
@@ -71,9 +71,9 @@ struct ScheduleProblem : ProblemBase<ScheduleProblem, 8, 16> {
         return cfg;
     }
     
-    // 默认回退全量（基类行为）— 不需要覆盖 evaluate_move
+    // Falls back to full re-evaluation by default (base-class behavior), so evaluate_move is not overridden
     
-    // ---- shared memory 接口 ----
+    // ---- shared memory interface ----
     size_t shared_mem_bytes() const {
         return (size_t)days * emps * sizeof(float);
     }
diff --git a/prototype/problems/tsp.cuh b/prototype/problems/tsp.cuh
index 8085ab2..4657e9a 100644
--- a/prototype/problems/tsp.cuh
+++ b/prototype/problems/tsp.cuh
@@ -1,7 +1,7 @@
 /**
- * tsp.cuh - TSP 问题定义
- * 
- * 继承 ProblemBase，使用 ObjDef 目标注册机制
+ * tsp.cuh - Traveling Salesman Problem (TSP) definition
+ *
+ * Extends ProblemBase with ObjDef objective registration.
  */
 
 #pragma once
@@ -10,12 +10,12 @@
 #include "operators.cuh"
 
 struct TSPProblem : ProblemBase<TSPProblem, 1, 64> {
-    // 问题数据
+    // problem data
     const float* d_dist;
-    const float* h_dist;  // host 端距离矩阵（用于 init_relation_matrix）
+    const float* h_dist;  // host distance matrix (for init_relation_matrix)
     int n;
     
-    // ---- 目标计算 ----
+    // ---- objective evaluation ----
     __device__ float calc_total_distance(const Sol& sol) const {
         float total = 0.0f;
         const int* route = sol.data[0];
@@ -25,7 +25,7 @@ struct TSPProblem : ProblemBase<TSPProblem, 1, 64> {
         return total;
     }
     
-    // ---- 目标定义（OBJ_DEFS 与 compute_obj 必须一一对应）----
+    // ---- objective defs (OBJ_DEFS must match compute_obj one-to-one) ----
     static constexpr ObjDef OBJ_DEFS[] = {
         {ObjDir::Minimize, 1.0f, 0.0f},   // case 0: calc_total_distance
     };
@@ -37,10 +37,10 @@ struct TSPProblem : ProblemBase<TSPProblem, 1, 64> {
     }
     
     __device__ float compute_penalty(const Sol& sol) const {
-        return 0.0f;  // TSP 无约束
+        return 0.0f;  // TSP has no side constraints
     }
     
-    // ---- config（编码/维度部分，目标由基类自动填充）----
+    // ---- config (encoding/dims; objectives filled by base class) ----
     ProblemConfig config() const {
         ProblemConfig cfg;
         cfg.encoding = EncodingType::Permutation;
@@ -49,7 +49,7 @@ struct TSPProblem : ProblemBase<TSPProblem, 1, 64> {
         return cfg;
     }
     
-    // ---- shared memory 接口 ----
+    // ---- shared memory interface ----
     static constexpr size_t SMEM_LIMIT = 48 * 1024;
     
     size_t shared_mem_bytes() const {
@@ -69,7 +69,7 @@ struct TSPProblem : ProblemBase<TSPProblem, 1, 64> {
         d_dist = sd;
     }
     
-    // 距离先验：距离近 → G/O 分数高
+    // Distance prior: closer cities → higher G/O scores
     void init_relation_matrix(float* G, float* O, int N) const {
         if (!h_dist || N != n) return;
         float max_d = 0.0f;
@@ -108,21 +108,21 @@ struct TSPProblem : ProblemBase<TSPProblem, 1, 64> {
         h_dist = nullptr;
     }
     
-    // v5.0: 多 GPU 协同 — 克隆到指定 GPU
+    // v5.0: multi-GPU — clone onto a given device
     TSPProblem* clone_to_device(int gpu_id) const override {
         int orig_device;
         CUDA_CHECK(cudaGetDevice(&orig_device));
         CUDA_CHECK(cudaSetDevice(gpu_id));
         
-        // 分配设备内存并拷贝距离矩阵
+        // Allocate device memory and copy distance matrix
         float* dd;
         CUDA_CHECK(cudaMalloc(&dd, sizeof(float) * n * n));
         CUDA_CHECK(cudaMemcpy(dd, h_dist, sizeof(float) * n * n, cudaMemcpyHostToDevice));
         
-        // 恢复原设备
+        // Restore original device
         CUDA_CHECK(cudaSetDevice(orig_device));
         
-        // 创建新的 Problem 实例（在 host 端）
+        // Create new Problem instance (on host)
         TSPProblem* new_prob = new TSPProblem();
         new_prob->n = n;
         new_prob->h_dist = h_dist;
diff --git a/prototype/problems/tsp_large.cuh b/prototype/problems/tsp_large.cuh
index 363b09b..fc411fd 100644
--- a/prototype/problems/tsp_large.cuh
+++ b/prototype/problems/tsp_large.cuh
@@ -1,7 +1,7 @@
 /**
- * tsp_large.cuh - 大规模 TSP 问题定义 (最多 256 城市)
- * 
- * 继承 ProblemBase，逻辑与 tsp.cuh 一致，仅 D2 上限不同
+ * tsp_large.cuh - large-scale TSP definition (up to 256 cities)
+ *
+ * Extends ProblemBase; same logic as tsp.cuh, only the D2 cap differs.
  */
 
 #pragma once
@@ -14,7 +14,7 @@ struct TSPLargeProblem : ProblemBase<TSPLargeProblem, 1, 256> {
     const float* h_dist;
     int n;
     
-    // ---- 目标计算 ----
+    // ---- objective evaluation ----
     __device__ float calc_total_distance(const Sol& sol) const {
         float total = 0.0f;
         const int* route = sol.data[0];
@@ -24,7 +24,7 @@ struct TSPLargeProblem : ProblemBase<TSPLargeProblem, 1, 256> {
         return total;
     }
     
-    // ---- 目标定义（OBJ_DEFS 与 compute_obj 必须一一对应）----
+    // ---- objective defs (OBJ_DEFS must match compute_obj one-to-one) ----
     static constexpr ObjDef OBJ_DEFS[] = {
         {ObjDir::Minimize, 1.0f, 0.0f},   // case 0: calc_total_distance
     };
@@ -54,7 +54,7 @@ struct TSPLargeProblem : ProblemBase<TSPLargeProblem, 1, 256> {
         return need <= SMEM_LIMIT ? need : 0;
     }
     
-    // 距离矩阵的实际大小（不管是否放进 smem）
+    // Actual size of the distance matrix, regardless of whether it is placed in shared memory
     size_t working_set_bytes() const {
         return (size_t)n * n * sizeof(float);
     }
diff --git a/prototype/problems/tsp_xlarge.cuh b/prototype/problems/tsp_xlarge.cuh
index fa6afef..f2052d2 100644
--- a/prototype/problems/tsp_xlarge.cuh
+++ b/prototype/problems/tsp_xlarge.cuh
@@ -1,9 +1,9 @@
 /**
- * tsp_xlarge.cuh - 超大规模 TSP 问题定义 (最多 512 城市)
- * 
- * 继承 ProblemBase，逻辑与 tsp_large.cuh 一致，D2=512
- * 注意：距离矩阵 512×512×4B = 1MB，远超 48KB shared memory
- *       因此 shared_mem_bytes() 返回 0，距离矩阵留在 global memory
+ * tsp_xlarge.cuh - very large TSP definition (up to 512 cities)
+ *
+ * Extends ProblemBase; same logic as tsp_large.cuh, with D2=512.
+ * Note: distance matrix 512×512×4B = 1MB, far above 48KB shared memory,
+ *       so shared_mem_bytes() returns 0 and the matrix stays in global memory.
  */
 
 #pragma once
@@ -13,7 +13,7 @@
 
 struct TSPXLargeProblem : ProblemBase<TSPXLargeProblem, 1, 512> {
     const float* d_dist;
-    const float* h_dist;  // host 端距离矩阵（用于 init_relation_matrix）
+    const float* h_dist;  // host distance matrix (for init_relation_matrix)
     int n;
     
     __device__ float calc_total_distance(const Sol& sol) const {
@@ -45,7 +45,7 @@ struct TSPXLargeProblem : ProblemBase<TSPXLargeProblem, 1, 512> {
         return cfg;
     }
     
-    // 距离矩阵太大，不放 shared memory
+    // Distance matrix too large for shared memory
     size_t shared_mem_bytes() const { return 0; }
     __device__ void load_shared(char*, int, int) {}
     
@@ -53,10 +53,10 @@ struct TSPXLargeProblem : ProblemBase<TSPXLargeProblem, 1, 512> {
         return (size_t)n * n * sizeof(float);
     }
     
-    // 用距离矩阵初始化 G/O 先验：距离近 → 分数高
+    // Initialize G/O priors from distances: closer → higher score
     void init_relation_matrix(float* G, float* O, int N) const {
         if (!h_dist || N != n) return;
-        // 找最大距离用于归一化
+        // Max distance for normalization
         float max_d = 0.0f;
         for (int i = 0; i < N; i++)
             for (int j = 0; j < N; j++)
@@ -66,10 +66,10 @@ struct TSPXLargeProblem : ProblemBase<TSPXLargeProblem, 1, 512> {
         for (int i = 0; i < N; i++) {
             for (int j = 0; j < N; j++) {
                 if (i == j) continue;
-                // 距离近 → G 高（分组倾向强）
+                // Closer → higher G (stronger grouping signal)
                 float proximity = 1.0f - h_dist[i * N + j] / max_d;
-                G[i * N + j] = proximity * 0.3f;  // 初始信号不要太强，留空间给 EMA
-                // 距离近 → O 也给一点信号（对称的，不偏向任何方向）
+                G[i * N + j] = proximity * 0.3f;  // keep initial signal moderate for EMA headroom
+                // Closer → small O signal too (symmetric, no directional bias)
                 O[i * N + j] = proximity * 0.1f;
             }
         }
@@ -84,7 +84,7 @@ struct TSPXLargeProblem : ProblemBase<TSPXLargeProblem, 1, 512> {
     static TSPXLargeProblem create(const float* h_dist_ptr, int n) {
         TSPXLargeProblem prob;
         prob.n = n;
-        prob.h_dist = h_dist_ptr;  // 保留 host 指针
+        prob.h_dist = h_dist_ptr;  // keep host pointer
         float* dd;
         CUDA_CHECK(cudaMalloc(&dd, sizeof(float) * n * n));
         CUDA_CHECK(cudaMemcpy(dd, h_dist_ptr, sizeof(float) * n * n, cudaMemcpyHostToDevice));
diff --git a/prototype/problems/vrp.cuh b/prototype/problems/vrp.cuh
index 81c05d5..e7a0626 100644
--- a/prototype/problems/vrp.cuh
+++ b/prototype/problems/vrp.cuh
@@ -1,8 +1,8 @@
 /**
- * vrp.cuh - 容量约束车辆路径问题 (CVRP)
- * 
- * 继承 ProblemBase，使用 ObjDef 目标注册机制
- * 多行编码（D1=K 条路线，分区初始化 + 跨行算子）
+ * vrp.cuh - Capacitated Vehicle Routing Problem (CVRP)
+ *
+ * Extends ProblemBase with ObjDef objective registration.
+ * Multi-row encoding (D1 = K routes, partition init + cross-row operators).
  */
 
 #pragma once
@@ -12,11 +12,11 @@
 #include "gpu_cache.cuh"
 
 struct VRPProblem : ProblemBase<VRPProblem, 8, 64> {
-    // GPU 数据
+    // GPU data
     const float* d_dist;
     const float* d_demand;
-    const float* h_dist;    // host 端距离矩阵（含 depot，用于 init_relation_matrix）
-    const float* h_demand;  // host 端需求数组（用于 clone_to_device）
+    const float* h_dist;    // host distance matrix (includes depot; for init_relation_matrix)
+    const float* h_demand;  // host demand array (for clone_to_device)
     int n;
     int stride;
     float capacity;
@@ -24,7 +24,7 @@ struct VRPProblem : ProblemBase<VRPProblem, 8, 64> {
     int max_vehicles;
     GpuCache cache;
     
-    // ---- 目标计算 ----
+    // ---- objective evaluation ----
     __device__ float compute_route_dist(const int* route, int size) const {
         if (size == 0) return 0.0f;
         float dist = 0.0f;
@@ -61,7 +61,7 @@ struct VRPProblem : ProblemBase<VRPProblem, 8, 64> {
         return total;
     }
     
-    // ---- 目标定义（OBJ_DEFS 与 compute_obj 必须一一对应）----
+    // ---- objective defs (OBJ_DEFS must match compute_obj one-to-one) ----
     static constexpr ObjDef OBJ_DEFS[] = {
         {ObjDir::Minimize, 1.0f, 0.0f},   // case 0: calc_total_distance
     };
@@ -102,7 +102,7 @@ struct VRPProblem : ProblemBase<VRPProblem, 8, 64> {
         return cfg;
     }
     
-    // ---- shared memory 接口 ----
+    // ---- shared memory interface ----
     static constexpr size_t SMEM_LIMIT = 48 * 1024;
     
     size_t shared_mem_bytes() const {
@@ -129,14 +129,14 @@ struct VRPProblem : ProblemBase<VRPProblem, 8, 64> {
     void enable_cache(int cap = 65536) { cache = GpuCache::allocate(cap); }
     void print_cache_stats() const { cache.print_stats(); }
     
-    // 距离先验：客户间距离近 → G/O 分数高
-    // 注意：h_dist 含 depot（stride×stride），元素编号 0..n-1 对应 node 1..n
+    // Distance prior: closer customers → higher G/O scores
+    // Note: h_dist includes depot (stride×stride); indices 0..n-1 map to nodes 1..n
     void init_relation_matrix(float* G, float* O, int N) const {
         if (!h_dist || N != n) return;
         float max_d = 0.0f;
         for (int i = 0; i < N; i++)
             for (int j = 0; j < N; j++) {
-                float d = h_dist[(i + 1) * stride + (j + 1)];  // 跳过 depot
+                float d = h_dist[(i + 1) * stride + (j + 1)];  // skip depot
                 if (d > max_d) max_d = d;
             }
         if (max_d <= 0.0f) return;
@@ -161,7 +161,7 @@ struct VRPProblem : ProblemBase<VRPProblem, 8, 64> {
         prob.max_vehicles = max_vehicles;
         prob.cache = GpuCache::disabled();
         prob.h_dist = h_dist_ptr;
-        prob.h_demand = h_demand_ptr;  // 保存 host 端指针
+        prob.h_demand = h_demand_ptr;  // keep host pointer
         
         int n_nodes = n + 1;
         float* dd;
@@ -185,13 +185,13 @@ struct VRPProblem : ProblemBase<VRPProblem, 8, 64> {
         cache.destroy();
     }
     
-    // v5.0: 多 GPU 协同 — 克隆到指定 GPU
+    // v5.0: multi-GPU — clone onto a given device
     VRPProblem* clone_to_device(int gpu_id) const override {
         int orig_device;
         CUDA_CHECK(cudaGetDevice(&orig_device));
         CUDA_CHECK(cudaSetDevice(gpu_id));
         
-        // 从 host 端数据直接拷贝到目标 GPU（避免跨设备 D2H 拷贝）
+        // Copy from host straight to target GPU (avoid cross-device D2H staging)
         int n_nodes = n + 1;
         float* dd;
         CUDA_CHECK(cudaMalloc(&dd, sizeof(float) * n_nodes * n_nodes));
diff --git a/prototype/problems/vrptw.cuh b/prototype/problems/vrptw.cuh
index 484d20f..7fc2e45 100644
--- a/prototype/problems/vrptw.cuh
+++ b/prototype/problems/vrptw.cuh
@@ -1,12 +1,12 @@
 /**
- * vrptw.cuh - 带时间窗的车辆路径问题 (VRPTW)
- * 
- * 在 CVRP 基础上增加时间窗约束。
- * 编码：Perm 多行分区（同 CVRP），data[r][j] = 路线 r 的第 j 个客户。
- * 目标：Minimize 总距离。
- * 约束：(a) 容量约束, (b) 时间窗约束（到达时间必须 ≤ latest，早到需等待）。
- * 
- * 验证实例：8 客户 3 车, 手工设计坐标+时间窗, 确保有已知可行解。
+ * vrptw.cuh - Vehicle Routing Problem with Time Windows (VRPTW)
+ *
+ * CVRP plus time window constraints.
+ * Encoding: multi-row perm partition (same as CVRP); data[r][j] = j-th customer on route r.
+ * Objective: minimize total distance.
+ * Constraints: (a) capacity, (b) time windows (arrival ≤ latest; early arrival waits).
+ *
+ * Validation instance: 8 customers, 3 vehicles; hand-crafted coords + windows with known feasible solution.
  */
 
 #pragma once
@@ -14,12 +14,12 @@
 #include "cuda_utils.cuh"
 
 struct VRPTWProblem : ProblemBase<VRPTWProblem, 8, 64> {
-    const float* d_dist;        // 距离矩阵 [(n+1)*(n+1)]（含 depot）
-    const float* d_demand;      // 需求 [n]
-    const float* d_earliest;    // 最早服务时间 [n+1]（含 depot）
-    const float* d_latest;      // 最晚服务时间 [n+1]（含 depot）
-    const float* d_service;     // 服务耗时 [n+1]（含 depot）
-    int n;                      // 客户数（不含 depot）
+    const float* d_dist;        // distance matrix [(n+1)*(n+1)] (includes depot)
+    const float* d_demand;      // demand [n]
+    const float* d_earliest;    // earliest service time [n+1] (includes depot)
+    const float* d_latest;      // latest service time [n+1] (includes depot)
+    const float* d_service;     // service time [n+1] (includes depot)
+    int n;                      // number of customers (excludes depot)
     int stride;                 // n+1
     float capacity;
     int num_vehicles;
@@ -63,30 +63,30 @@ struct VRPTWProblem : ProblemBase<VRPTWProblem, 8, 64> {
             if (size == 0) continue;
             active++;
             
-            // 容量约束
+            // Capacity constraint
             float load = 0.0f;
             for (int j = 0; j < size; j++)
                 load += d_demand[sol.data[r][j]];
             if (load > capacity)
                 penalty += (load - capacity) * 100.0f;
             
-            // 时间窗约束：模拟路线行驶
+            // Time windows: simulate route travel
             float time = 0.0f;
             int prev = 0;
             for (int j = 0; j < size; j++) {
                 int node = sol.data[r][j] + 1;
                 float travel = d_dist[prev * stride + node];
                 time += travel;
-                // 早到需等待
+                // Wait if early
                 if (time < d_earliest[node])
                     time = d_earliest[node];
-                // 迟到产生惩罚
+                // Penalize lateness
                 if (time > d_latest[node])
                     penalty += (time - d_latest[node]) * 50.0f;
                 time += d_service[node];
                 prev = node;
             }
-            // 返回 depot 的时间窗
+            // Penalize arriving back at the depot after its latest time
             float return_time = time + d_dist[prev * stride + 0];
             if (return_time > d_latest[0])
                 penalty += (return_time - d_latest[0]) * 50.0f;