fix: harden CUDA safety checks and translate comments to English

Safety fixes (4 critical, 4 warning) from code review:
- qap.cuh: fix clone_to_device cross-device D2H by retaining host matrices
- types.cuh: add CUDA_CHECK to InjectBuffer, track owner_gpu for safe destroy
- types.cuh: add bounds check on lexicographic priority index
- solver.cuh: cap migrate_kernel islands to MAX_ISLANDS=64 to prevent stack overflow
- multi_gpu_solver.cuh: guard against 0 GPUs, propagate stop_reason from best GPU
- types.cuh: warn on SeqRegistry overflow
- solver.cuh: warn when constraint_directed/phased_search disabled without AOS

Translate all Chinese comments to English across 25+ source files (core/*.cuh, problems/*.cuh, Makefile, multi-GPU tests).

Verified on V100S×2 (sm_70, CUDA 12.8): e5 (12 problem types, all optimal), e13 (multi-objective + multi-GPU, 9 configs, all passed).
2026-04-24 12:06:22 +02:00 · 2026-03-25 11:52:50 +08:00 · 2026-03-25 11:52:50 +08:00 · a848730459
commit a848730459
parent ab278d0e82
25 changed files with 1147 additions and 1167 deletions
--- a/prototype/problems/assignment.cuh
+++ b/prototype/problems/assignment.cuh
@ -1,7 +1,7 @@
 /**
- * assignment.cuh - 指派问题
- * 
- * 继承 ProblemBase，使用 ObjDef 目标注册机制
+ * assignment.cuh - assignment problem
+ *
+ * Extends ProblemBase with ObjDef objective registration.
 */

 #pragma once
@ -11,10 +11,10 @@

 struct AssignmentProblem : ProblemBase<AssignmentProblem, 1, 16> {
    const float* d_cost;
-    const float* h_cost;  // host 端成本矩阵（用于 init_relation_matrix）
+    const float* h_cost;  // host cost matrix (for init_relation_matrix)
    int n;
    
-    // ---- 目标计算 ----
+    // ---- objective evaluation ----
    __device__ float calc_total_cost(const Sol& sol) const {
        float total = 0.0f;
        const int* assign = sol.data[0];
@ -24,7 +24,7 @@ struct AssignmentProblem : ProblemBase<AssignmentProblem, 1, 16> {
        return total;
    }
    
-    // ---- 目标定义（OBJ_DEFS 与 compute_obj 必须一一对应）----
+    // ---- objective defs (OBJ_DEFS must match compute_obj one-to-one) ----
    static constexpr ObjDef OBJ_DEFS[] = {
        {ObjDir::Minimize, 1.0f, 0.0f},   // case 0: calc_total_cost
    };
@ -47,7 +47,7 @@ struct AssignmentProblem : ProblemBase<AssignmentProblem, 1, 16> {
        return cfg;
    }
    
-    // ---- shared memory 接口 ----
+    // ---- shared memory interface ----
    static constexpr size_t SMEM_LIMIT = 48 * 1024;
    
    size_t shared_mem_bytes() const {
@ -66,12 +66,12 @@ struct AssignmentProblem : ProblemBase<AssignmentProblem, 1, 16> {
        d_cost = sc;
    }
    
-    // 成本先验：task j 和 task k 如果被相似 agent 偏好，G 值高
-    // O 矩阵：task j 在位置 i 成本低 → O[j][k] 略高（j 倾向排在 k 前面的位置）
+    // Cost prior: if tasks j and k are similarly preferred by agents, G is high
+    // O matrix: low cost for task j at slot i → slightly higher O[j][k] (j tends to be placed before k)
    void init_relation_matrix(float* G, float* O, int N) const {
        if (!h_cost || N != n) return;
-        // 对每个 task，构建成本向量，task 间余弦相似度 → G
-        // 简化：成本列向量的相关性
+        // Per task, build cost vectors; cosine similarity between tasks → G
+        // Simplified: correlation of cost columns
        float max_c = 0.0f;
        for (int i = 0; i < N * N; i++)
            if (h_cost[i] > max_c) max_c = h_cost[i];
@ -80,7 +80,7 @@ struct AssignmentProblem : ProblemBase<AssignmentProblem, 1, 16> {
        for (int j = 0; j < N; j++)
            for (int k = 0; k < N; k++) {
                if (j == k) continue;
-                // G: 两个 task 的成本向量越相似 → 越可能互换
+                // G: more similar cost columns → more likely to swap tasks
                float dot = 0.0f, nj = 0.0f, nk = 0.0f;
                for (int i = 0; i < N; i++) {
                    float cj = h_cost[i * N + j] / max_c;
--- a/prototype/problems/bin_packing.cuh
+++ b/prototype/problems/bin_packing.cuh
@ -1,13 +1,13 @@
 /**
- * bin_packing.cuh - 一维装箱问题（Integer 编码 + 约束）
- * 
- * N 个物品，每个重量 w[i]，装入最多 B 个箱子，每个箱子容量 C。
- * 决策变量：data[0][i] ∈ [0, B-1]，表示物品 i 放入的箱子编号。
- * 目标：最小化使用的箱子数。
- * 约束：每个箱子总重不超过 C，超出部分作为 penalty。
- * 
- * 验证实例：8 物品 weights=[7,5,3,4,6,2,8,1], C=10, 最优=4 箱
- *   箱0={7,3}=10, 箱1={5,4,1}=10, 箱2={6,2}=8, 箱3={8}=8
+ * bin_packing.cuh - one-dimensional bin packing (Integer encoding + constraints)
+ *
+ * N items with weights w[i], at most B bins, capacity C per bin.
+ * Decision: data[0][i] in [0, B-1] = bin index for item i.
+ * Objective: minimize number of bins used.
+ * Constraint: bin load ≤ C; overflow contributes to penalty.
+ *
+ * Validation instance: 8 items weights=[7,5,3,4,6,2,8,1], C=10, optimum=4 bins
+ *   bin0={7,3}=10, bin1={5,4,1}=10, bin2={6,2}=8, bin3={8}=8
 */

 #pragma once
@ -16,9 +16,9 @@

 struct BinPackingProblem : ProblemBase<BinPackingProblem, 1, 64> {
    const float* d_weights;
-    int n;              // 物品数
-    int max_bins;       // 最大箱子数 B
-    float capacity;     // 箱子容量 C
+    int n;              // number of items
+    int max_bins;       // max bins B
+    float capacity;     // bin capacity C
    
    __device__ float calc_bins_used(const Sol& sol) const {
        bool used[32] = {};
--- a/prototype/problems/graph_color.cuh
+++ b/prototype/problems/graph_color.cuh
@ -1,11 +1,11 @@
 /**
- * graph_color.cuh - 图着色问题（Integer 编码）
- * 
- * N 个节点的图，用 k 种颜色着色。
- * 决策变量：data[0][i] ∈ [0, k-1]，表示节点 i 的颜色。
- * 目标：最小化冲突边数（相邻节点同色的边数）。
- * 
- * 验证实例：Petersen 图（10 节点 15 边，色数=3，最优冲突=0）
+ * graph_color.cuh - graph coloring (Integer encoding)
+ *
+ * Graph on N nodes, k colors.
+ * Decision: data[0][i] in [0, k-1] = color of node i.
+ * Objective: minimize number of conflicting edges (adjacent same color).
+ *
+ * Validation instance: Petersen graph (10 nodes, 15 edges, chromatic number 3, optimal conflicts=0)
 */

 #pragma once
@ -13,9 +13,9 @@
 #include "cuda_utils.cuh"

 struct GraphColorProblem : ProblemBase<GraphColorProblem, 1, 64> {
-    const int* d_adj;   // 邻接矩阵 [N*N]（1=相邻, 0=不相邻）
-    int n;              // 节点数
-    int k;              // 颜色数
+    const int* d_adj;   // adjacency [N*N] (1=edge, 0=no edge)
+    int n;              // number of nodes
+    int k;              // number of colors
    
    __device__ float calc_conflicts(const Sol& sol) const {
        int conflicts = 0;
--- a/prototype/problems/jsp.cuh
+++ b/prototype/problems/jsp.cuh
@ -1,26 +1,26 @@
 /**
- * jsp.cuh - 车间调度问题 (Job Shop Scheduling Problem)
- * 
- * J 个工件，每个工件有 O 道工序，每道工序指定机器和耗时。
- * 
- * === 编码方案 A：Integer 多行（时间表编码）===
- * JSPProblem: data[j][i] = 工件 j 的第 i 道工序的开始时间
+ * jsp.cuh - Job Shop Scheduling Problem (JSP)
+ *
+ * J jobs, each with O operations; each op specifies machine and duration.
+ *
+ * === Encoding A: multi-row Integer (time-table encoding) ===
+ * JSPProblem: data[j][i] = start time of job j's i-th operation
 *   dim1 = num_jobs, dim2_default = num_ops
- *   row_mode = Fixed（禁止 ROW_SPLIT/ROW_MERGE）
- *   每行代表一个工件的固定工序序列，行长度不可变
- * 
- * === 编码方案 B：Permutation 多重集（工序排列编码）===
- * JSPPermProblem: data[0][k] = 工件编号（0..J-1），长度 J*O
- *   值 j 出现 O 次。从左到右扫描，第 t 次遇到值 j 表示工件 j 的第 t 道工序。
+ *   row_mode = Fixed (no ROW_SPLIT/ROW_MERGE)
+ *   Each row is a fixed op sequence for one job; row length is fixed.
+ *
+ * === Encoding B: Permutation multiset (operation sequence encoding) ===
+ * JSPPermProblem: data[0][k] = job id (0..J-1), length J*O
+ *   Value j appears O times. Left-to-right scan: t-th occurrence of j is job j's t-th op.
 *   dim1 = 1, dim2_default = J*O, perm_repeat_count = O
- *   标准 Permutation 算子（swap/reverse/insert）天然保持多重集结构
- * 
- * 目标：Minimize makespan（所有工件完成时间的最大值）。
- * 约束：
- *   (a) 工序顺序：同一工件的工序必须按序执行
- *   (b) 机器冲突：同一机器同一时刻只能处理一个工序
- * 
- * 验证实例：自定义 3 工件 3 机器 (3x3)，最优 makespan = 12
+ *   Standard permutation ops (swap/reverse/insert) preserve multiset structure.
+ *
+ * Objective: minimize makespan (max completion time over jobs).
+ * Constraints:
+ *   (a) Precedence: ops of the same job must run in order.
+ *   (b) Machine conflict: one op per machine at a time.
+ *
+ * Validation instance: custom 3 jobs × 3 machines (3x3), optimal makespan = 12
 */

 #pragma once
@ -28,16 +28,16 @@
 #include "cuda_utils.cuh"

 // ============================================================
-// 编码方案 A：Integer 多行（时间表编码）
+// Encoding A: multi-row Integer (time-table encoding)
 // ============================================================

 struct JSPProblem : ProblemBase<JSPProblem, 8, 16> {
-    const int*   d_machine;     // 工序所需机器 [J*O]
-    const float* d_duration;    // 工序耗时 [J*O]
-    int num_jobs;               // 工件数 J
-    int num_ops;                // 每工件工序数 O
-    int num_machines;           // 机器数 M
-    int time_horizon;           // 时间上界
+    const int*   d_machine;     // machine per op [J*O]
+    const float* d_duration;    // op duration [J*O]
+    int num_jobs;               // number of jobs J
+    int num_ops;                // ops per job O
+    int num_machines;           // number of machines M
+    int time_horizon;           // time horizon upper bound
    
    __device__ float calc_makespan(const Sol& sol) const {
        float makespan = 0.0f;
@ -62,7 +62,7 @@ struct JSPProblem : ProblemBase<JSPProblem, 8, 16> {
    __device__ float compute_penalty(const Sol& sol) const {
        float penalty = 0.0f;
        
-        // (a) 工序顺序约束
+        // (a) Precedence constraints
        for (int j = 0; j < num_jobs; j++) {
            for (int i = 1; i < num_ops; i++) {
                float prev_end = (float)sol.data[j][i-1] + d_duration[j * num_ops + (i-1)];
@ -72,7 +72,7 @@ struct JSPProblem : ProblemBase<JSPProblem, 8, 16> {
            }
        }
        
-        // (b) 机器冲突约束
+        // (b) Machine conflict constraints
        int total = num_jobs * num_ops;
        for (int a = 0; a < total; a++) {
            int ja = a / num_ops, ia = a % num_ops;
@ -151,28 +151,28 @@ struct JSPProblem : ProblemBase<JSPProblem, 8, 16> {
 };

 // ============================================================
-// 编码方案 B：Permutation 多重集（工序排列编码）
+// Encoding B: Permutation multiset (operation sequence encoding)
 // ============================================================
-// data[0] 是长度 J*O 的排列，值域 [0, J)，每个值出现 O 次
-// 从左到右扫描：第 t 次遇到值 j → 安排工件 j 的第 t 道工序
-// 贪心解码：每道工序安排在"最早可行时间"（满足工序顺序 + 机器空闲）
+// data[0] is a length-J*O sequence with values in [0, J), each appearing O times.
+// Left-to-right: t-th occurrence of j schedules job j's t-th operation.
+// Greedy decode: each op at earliest feasible time (precedence + machine free).

 struct JSPPermProblem : ProblemBase<JSPPermProblem, 1, 64> {
-    const int*   d_machine;     // 工序所需机器 [J*O]
-    const float* d_duration;    // 工序耗时 [J*O]
+    const int*   d_machine;     // machine per op [J*O]
+    const float* d_duration;    // op duration [J*O]
    int num_jobs;
    int num_ops;
    int num_machines;
    
-    // 贪心解码：从排列生成调度方案，返回 makespan
+    // Greedy decode: build schedule from permutation, return makespan
    __device__ float decode_and_makespan(const Sol& sol) const {
        int total = num_jobs * num_ops;
        int size = sol.dim2_sizes[0];
        if (size < total) return 1e9f;
        
-        float job_avail[8];     // 每个工件的下一道工序最早开始时间
-        float mach_avail[8];    // 每台机器的最早空闲时间
-        int   job_next_op[8];   // 每个工件的下一道待安排工序编号
+        float job_avail[8];     // earliest start for next op of each job
+        float mach_avail[8];    // earliest machine free time
+        int   job_next_op[8];   // next op index to schedule per job
        
        for (int j = 0; j < num_jobs; j++) { job_avail[j] = 0.0f; job_next_op[j] = 0; }
        for (int m = 0; m < num_machines; m++) mach_avail[m] = 0.0f;
@ -182,13 +182,13 @@ struct JSPPermProblem : ProblemBase<JSPPermProblem, 1, 64> {
            int j = sol.data[0][k];
            if (j < 0 || j >= num_jobs) return 1e9f;
            int op = job_next_op[j];
-            if (op >= num_ops) continue;  // 该工件已安排完
+            if (op >= num_ops) continue;  // job already fully scheduled
            
            int flat = j * num_ops + op;
            int m = d_machine[flat];
            float dur = d_duration[flat];
            
-            // 最早开始时间 = max(工件前序完成, 机器空闲)
+            // Earliest start = max(job predecessor done, machine free)
            float start = fmaxf(job_avail[j], mach_avail[m]);
            float end = start + dur;
            
@ -212,7 +212,7 @@ struct JSPPermProblem : ProblemBase<JSPPermProblem, 1, 64> {
        }
    }
    
-    // 贪心解码天然满足约束，penalty 始终为 0
+    // Greedy decode satisfies constraints; penalty is always 0
    __device__ float compute_penalty(const Sol& sol) const {
        return 0.0f;
    }
--- a/prototype/problems/knapsack.cuh
+++ b/prototype/problems/knapsack.cuh
@ -1,7 +1,7 @@
 /**
- * knapsack.cuh - 0-1 背包问题
- * 
- * 继承 ProblemBase，使用 ObjDef 目标注册机制
+ * knapsack.cuh - 0-1 knapsack
+ *
+ * Extends ProblemBase with ObjDef objective registration.
 */

 #pragma once
@ -10,13 +10,13 @@
 #include "operators.cuh"

 struct KnapsackProblem : ProblemBase<KnapsackProblem, 1, 32> {
-    // 问题数据（d_weights 是物品重量，非目标权重）
+    // problem data (d_weights are item weights, not objective weights)
    const float* d_weights;
    const float* d_values;
    float capacity;
    int n;
    
-    // ---- 目标计算 ----
+    // ---- objective evaluation ----
    __device__ float calc_total_value(const Sol& sol) const {
        float tv = 0.0f;
        const int* sel = sol.data[0];
@ -26,7 +26,7 @@ struct KnapsackProblem : ProblemBase<KnapsackProblem, 1, 32> {
        return tv;
    }
    
-    // ---- 目标定义（OBJ_DEFS 与 compute_obj 必须一一对应）----
+    // ---- objective defs (OBJ_DEFS must match compute_obj one-to-one) ----
    static constexpr ObjDef OBJ_DEFS[] = {
        {ObjDir::Maximize, 1.0f, 0.0f},   // case 0: calc_total_value
    };
@ -55,7 +55,7 @@ struct KnapsackProblem : ProblemBase<KnapsackProblem, 1, 32> {
        return cfg;
    }
    
-    // ---- shared memory 接口 ----
+    // ---- shared memory interface ----
    size_t shared_mem_bytes() const {
        return 2 * (size_t)n * sizeof(float);
    }
--- a/prototype/problems/load_balance.cuh
+++ b/prototype/problems/load_balance.cuh
@ -1,12 +1,12 @@
 /**
- * load_balance.cuh - 离散负载均衡问题（Integer 编码验证）
- * 
- * N 个任务分配到 M 台机器，每个任务有一个处理时间 p[i]。
- * 决策变量：data[0][i] ∈ [0, M-1]，表示任务 i 分配到哪台机器。
- * 目标：最小化 makespan（最大机器负载）。
- * 
- * 已知 NP-hard（等价于 multiprocessor scheduling / load balancing）。
- * LPT（最长处理时间优先）贪心可得 4/3 近似。
+ * load_balance.cuh - discrete load balancing (Integer encoding sanity check)
+ *
+ * N tasks on M machines, processing time p[i] per task.
+ * Decision: data[0][i] in [0, M-1] = machine for task i.
+ * Objective: minimize makespan (max machine load).
+ *
+ * NP-hard (same as multiprocessor scheduling / load balancing).
+ * LPT (longest processing time first) greedy achieves 4/3 approximation.
 */

 #pragma once
@ -14,12 +14,12 @@
 #include "cuda_utils.cuh"

 struct LoadBalanceProblem : ProblemBase<LoadBalanceProblem, 1, 64> {
-    const float* d_proc_time;   // 任务处理时间 [N]
-    int n;                      // 任务数
-    int m;                      // 机器数
+    const float* d_proc_time;   // task processing times [N]
+    int n;                      // number of tasks
+    int m;                      // number of machines
    
    __device__ float calc_makespan(const Sol& sol) const {
-        float load[32] = {};    // 最多 32 台机器
+        float load[32] = {};    // at most 32 machines
        int size = sol.dim2_sizes[0];
        for (int i = 0; i < size; i++) {
            int machine = sol.data[0][i];
@ -43,7 +43,7 @@ struct LoadBalanceProblem : ProblemBase<LoadBalanceProblem, 1, 64> {
    }
    
    __device__ float compute_penalty(const Sol& sol) const {
-        return 0.0f;   // 无约束（任何分配都合法）
+        return 0.0f;   // no side constraints (any assignment is feasible)
    }
    
    ProblemConfig config() const {
--- a/prototype/problems/qap.cuh
+++ b/prototype/problems/qap.cuh
@ -1,14 +1,14 @@
 /**
- * qap.cuh - 二次分配问题 (Quadratic Assignment Problem)
- * 
- * N 个设施分配到 N 个位置（排列编码）。
- * 决策变量：data[0][i] = 设施 i 分配到的位置。
- * 目标：Minimize sum(flow[i][j] * dist[perm[i]][perm[j]])
- * 
- * 验证实例：自定义 5x5
- *   flow: 设施间的物流量
- *   dist: 位置间的距离
- *   已知最优 = 58
+ * qap.cuh - Quadratic Assignment Problem (QAP)
+ *
+ * Assign N facilities to N locations (permutation encoding).
+ * Decision: data[0][i] = location assigned to facility i.
+ * Objective: Minimize sum(flow[i][j] * dist[perm[i]][perm[j]])
+ *
+ * Validation instance: custom 5x5
+ *   flow: inter-facility flow
+ *   dist: inter-location distances
+ *   known optimum = 58
 */

 #pragma once
@ -16,8 +16,10 @@
 #include "cuda_utils.cuh"

 struct QAPProblem : ProblemBase<QAPProblem, 1, 32> {
-    const float* d_flow;    // 物流量矩阵 [N*N]
-    const float* d_dist;    // 距离矩阵 [N*N]
+    const float* d_flow;    // flow matrix [N*N] (device)
+    const float* d_dist;    // distance matrix [N*N] (device)
+    const float* h_flow;    // flow matrix [N*N] (host, for clone_to_device)
+    const float* h_dist;    // distance matrix [N*N] (host, for clone_to_device)
    int n;
    
    __device__ float calc_cost(const Sol& sol) const {
@ -64,14 +66,16 @@ struct QAPProblem : ProblemBase<QAPProblem, 1, 32> {
        d_dist = sd;
    }
    
-    static QAPProblem create(const float* h_flow, const float* h_dist, int n) {
+    static QAPProblem create(const float* h_flow_in, const float* h_dist_in, int n) {
        QAPProblem prob;
        prob.n = n;
+        prob.h_flow = h_flow_in;
+        prob.h_dist = h_dist_in;
        float *df, *dd;
        CUDA_CHECK(cudaMalloc(&df, sizeof(float) * n * n));
        CUDA_CHECK(cudaMalloc(&dd, sizeof(float) * n * n));
-        CUDA_CHECK(cudaMemcpy(df, h_flow, sizeof(float) * n * n, cudaMemcpyHostToDevice));
-        CUDA_CHECK(cudaMemcpy(dd, h_dist, sizeof(float) * n * n, cudaMemcpyHostToDevice));
+        CUDA_CHECK(cudaMemcpy(df, h_flow_in, sizeof(float) * n * n, cudaMemcpyHostToDevice));
+        CUDA_CHECK(cudaMemcpy(dd, h_dist_in, sizeof(float) * n * n, cudaMemcpyHostToDevice));
        prob.d_flow = df; prob.d_dist = dd;
        return prob;
    }
@ -82,18 +86,12 @@ struct QAPProblem : ProblemBase<QAPProblem, 1, 32> {
        d_flow = nullptr; d_dist = nullptr;
    }
    
-    // v5.0: 多 GPU 协同 — 克隆到指定 GPU
+    // v5.0: multi-GPU — clone onto a given device
    QAPProblem* clone_to_device(int gpu_id) const override {
        int orig_device;
        CUDA_CHECK(cudaGetDevice(&orig_device));
        
-        // 先下载数据到 host（从当前设备）
-        float* h_flow = new float[n * n];
-        float* h_dist = new float[n * n];
-        CUDA_CHECK(cudaMemcpy(h_flow, d_flow, sizeof(float) * n * n, cudaMemcpyDeviceToHost));
-        CUDA_CHECK(cudaMemcpy(h_dist, d_dist, sizeof(float) * n * n, cudaMemcpyDeviceToHost));
-        
-        // 切换到目标 GPU 并上传
+        // Use host-side matrices directly (no D2H needed)
        CUDA_CHECK(cudaSetDevice(gpu_id));
        float *df, *dd;
        CUDA_CHECK(cudaMalloc(&df, sizeof(float) * n * n));
@ -101,15 +99,12 @@ struct QAPProblem : ProblemBase<QAPProblem, 1, 32> {
        CUDA_CHECK(cudaMemcpy(df, h_flow, sizeof(float) * n * n, cudaMemcpyHostToDevice));
        CUDA_CHECK(cudaMemcpy(dd, h_dist, sizeof(float) * n * n, cudaMemcpyHostToDevice));
        
-        delete[] h_flow;
-        delete[] h_dist;
-        
-        // 恢复原设备
        CUDA_CHECK(cudaSetDevice(orig_device));
        
-        // 创建新实例
        QAPProblem* new_prob = new QAPProblem();
        new_prob->n = n;
+        new_prob->h_flow = h_flow;
+        new_prob->h_dist = h_dist;
        new_prob->d_flow = df;
        new_prob->d_dist = dd;
        
--- a/prototype/problems/schedule.cuh
+++ b/prototype/problems/schedule.cuh
@ -1,8 +1,8 @@
 /**
- * schedule.cuh - 排班问题
- * 
- * 继承 ProblemBase，使用 ObjDef 目标注册机制
- * 2 个目标：总成本（min）+ 不公平度（min，权重更高）
+ * schedule.cuh - staff scheduling
+ *
+ * Extends ProblemBase with ObjDef objective registration.
+ * Two objectives: total cost (min) + unfairness (min, higher weight).
 */

 #pragma once
@ -14,7 +14,7 @@ struct ScheduleProblem : ProblemBase<ScheduleProblem, 8, 16> {
    const float* d_cost;
    int days, emps, required;
    
-    // ---- 目标计算 ----
+    // ---- objective evaluation ----
    __device__ float calc_total_cost(const Sol& sol) const {
        float total = 0.0f;
        for (int d = 0; d < days; d++)
@ -37,7 +37,7 @@ struct ScheduleProblem : ProblemBase<ScheduleProblem, 8, 16> {
        return (float)(max_w - min_w);
    }
    
-    // ---- 目标定义（OBJ_DEFS 与 compute_obj 必须一一对应）----
+    // ---- objective defs (OBJ_DEFS must match compute_obj one-to-one) ----
    static constexpr ObjDef OBJ_DEFS[] = {
        {ObjDir::Minimize, 1.0f, 0.0f},   // case 0: calc_total_cost
        {ObjDir::Minimize, 5.0f, 0.0f},   // case 1: calc_unfairness
@ -71,9 +71,9 @@ struct ScheduleProblem : ProblemBase<ScheduleProblem, 8, 16> {
        return cfg;
    }
    
-    // 默认回退全量（基类行为）— 不需要覆盖 evaluate_move
+    // Default full re-eval (base behavior) — no need to override evaluate_move
    
-    // ---- shared memory 接口 ----
+    // ---- shared memory interface ----
    size_t shared_mem_bytes() const {
        return (size_t)days * emps * sizeof(float);
    }
--- a/prototype/problems/tsp.cuh
+++ b/prototype/problems/tsp.cuh
@ -1,7 +1,7 @@
 /**
- * tsp.cuh - TSP 问题定义
- * 
- * 继承 ProblemBase，使用 ObjDef 目标注册机制
+ * tsp.cuh - Traveling Salesman Problem (TSP) definition
+ *
+ * Extends ProblemBase with ObjDef objective registration.
 */

 #pragma once
@ -10,12 +10,12 @@
 #include "operators.cuh"

 struct TSPProblem : ProblemBase<TSPProblem, 1, 64> {
-    // 问题数据
+    // problem data
    const float* d_dist;
-    const float* h_dist;  // host 端距离矩阵（用于 init_relation_matrix）
+    const float* h_dist;  // host distance matrix (for init_relation_matrix)
    int n;
    
-    // ---- 目标计算 ----
+    // ---- objective evaluation ----
    __device__ float calc_total_distance(const Sol& sol) const {
        float total = 0.0f;
        const int* route = sol.data[0];
@ -25,7 +25,7 @@ struct TSPProblem : ProblemBase<TSPProblem, 1, 64> {
        return total;
    }
    
-    // ---- 目标定义（OBJ_DEFS 与 compute_obj 必须一一对应）----
+    // ---- objective defs (OBJ_DEFS must match compute_obj one-to-one) ----
    static constexpr ObjDef OBJ_DEFS[] = {
        {ObjDir::Minimize, 1.0f, 0.0f},   // case 0: calc_total_distance
    };
@ -37,10 +37,10 @@ struct TSPProblem : ProblemBase<TSPProblem, 1, 64> {
    }
    
    __device__ float compute_penalty(const Sol& sol) const {
-        return 0.0f;  // TSP 无约束
+        return 0.0f;  // TSP has no side constraints
    }
    
-    // ---- config（编码/维度部分，目标由基类自动填充）----
+    // ---- config (encoding/dims; objectives filled by base class) ----
    ProblemConfig config() const {
        ProblemConfig cfg;
        cfg.encoding = EncodingType::Permutation;
@ -49,7 +49,7 @@ struct TSPProblem : ProblemBase<TSPProblem, 1, 64> {
        return cfg;
    }
    
-    // ---- shared memory 接口 ----
+    // ---- shared memory interface ----
    static constexpr size_t SMEM_LIMIT = 48 * 1024;
    
    size_t shared_mem_bytes() const {
@ -69,7 +69,7 @@ struct TSPProblem : ProblemBase<TSPProblem, 1, 64> {
        d_dist = sd;
    }
    
-    // 距离先验：距离近 → G/O 分数高
+    // Distance prior: closer cities → higher G/O scores
    void init_relation_matrix(float* G, float* O, int N) const {
        if (!h_dist || N != n) return;
        float max_d = 0.0f;
@ -108,21 +108,21 @@ struct TSPProblem : ProblemBase<TSPProblem, 1, 64> {
        h_dist = nullptr;
    }
    
-    // v5.0: 多 GPU 协同 — 克隆到指定 GPU
+    // v5.0: multi-GPU — clone onto a given device
    TSPProblem* clone_to_device(int gpu_id) const override {
        int orig_device;
        CUDA_CHECK(cudaGetDevice(&orig_device));
        CUDA_CHECK(cudaSetDevice(gpu_id));
        
-        // 分配设备内存并拷贝距离矩阵
+        // Allocate device memory and copy distance matrix
        float* dd;
        CUDA_CHECK(cudaMalloc(&dd, sizeof(float) * n * n));
        CUDA_CHECK(cudaMemcpy(dd, h_dist, sizeof(float) * n * n, cudaMemcpyHostToDevice));
        
-        // 恢复原设备
+        // Restore original device
        CUDA_CHECK(cudaSetDevice(orig_device));
        
-        // 创建新的 Problem 实例（在 host 端）
+        // Create new Problem instance (on host)
        TSPProblem* new_prob = new TSPProblem();
        new_prob->n = n;
        new_prob->h_dist = h_dist;
--- a/prototype/problems/tsp_large.cuh
+++ b/prototype/problems/tsp_large.cuh
@ -1,7 +1,7 @@
 /**
- * tsp_large.cuh - 大规模 TSP 问题定义 (最多 256 城市)
- * 
- * 继承 ProblemBase，逻辑与 tsp.cuh 一致，仅 D2 上限不同
+ * tsp_large.cuh - large-scale TSP definition (up to 256 cities)
+ *
+ * Extends ProblemBase; same logic as tsp.cuh, only the D2 upper bound differs.
 */

 #pragma once
@ -14,7 +14,7 @@ struct TSPLargeProblem : ProblemBase<TSPLargeProblem, 1, 256> {
    const float* h_dist;
    int n;
    
-    // ---- 目标计算 ----
+    // ---- objective evaluation ----
    __device__ float calc_total_distance(const Sol& sol) const {
        float total = 0.0f;
        const int* route = sol.data[0];
@ -24,7 +24,7 @@ struct TSPLargeProblem : ProblemBase<TSPLargeProblem, 1, 256> {
        return total;
    }
    
-    // ---- 目标定义（OBJ_DEFS 与 compute_obj 必须一一对应）----
+    // ---- objective defs (OBJ_DEFS must match compute_obj one-to-one) ----
    static constexpr ObjDef OBJ_DEFS[] = {
        {ObjDir::Minimize, 1.0f, 0.0f},   // case 0: calc_total_distance
    };
@ -54,7 +54,7 @@ struct TSPLargeProblem : ProblemBase<TSPLargeProblem, 1, 256> {
        return need <= SMEM_LIMIT ? need : 0;
    }
    
-    // 距离矩阵的实际大小（不管是否放进 smem）
+    // Actual distance matrix size (whether or not placed in smem)
    size_t working_set_bytes() const {
        return (size_t)n * n * sizeof(float);
    }
--- a/prototype/problems/tsp_xlarge.cuh
+++ b/prototype/problems/tsp_xlarge.cuh
@ -1,9 +1,9 @@
 /**
- * tsp_xlarge.cuh - 超大规模 TSP 问题定义 (最多 512 城市)
- * 
- * 继承 ProblemBase，逻辑与 tsp_large.cuh 一致，D2=512
- * 注意：距离矩阵 512×512×4B = 1MB，远超 48KB shared memory
- *       因此 shared_mem_bytes() 返回 0，距离矩阵留在 global memory
+ * tsp_xlarge.cuh - very large TSP definition (up to 512 cities)
+ *
+ * Extends ProblemBase; same logic as tsp_large.cuh, with D2=512.
+ * Note: distance matrix 512×512×4B = 1MB, far above 48KB shared memory,
+ *       so shared_mem_bytes() returns 0 and the matrix stays in global memory.
 */

 #pragma once
@ -13,7 +13,7 @@

 struct TSPXLargeProblem : ProblemBase<TSPXLargeProblem, 1, 512> {
    const float* d_dist;
-    const float* h_dist;  // host 端距离矩阵（用于 init_relation_matrix）
+    const float* h_dist;  // host distance matrix (for init_relation_matrix)
    int n;
    
    __device__ float calc_total_distance(const Sol& sol) const {
@ -45,7 +45,7 @@ struct TSPXLargeProblem : ProblemBase<TSPXLargeProblem, 1, 512> {
        return cfg;
    }
    
-    // 距离矩阵太大，不放 shared memory
+    // Distance matrix too large for shared memory
    size_t shared_mem_bytes() const { return 0; }
    __device__ void load_shared(char*, int, int) {}
    
@ -53,10 +53,10 @@ struct TSPXLargeProblem : ProblemBase<TSPXLargeProblem, 1, 512> {
        return (size_t)n * n * sizeof(float);
    }
    
-    // 用距离矩阵初始化 G/O 先验：距离近 → 分数高
+    // Initialize G/O priors from distances: closer → higher score
    void init_relation_matrix(float* G, float* O, int N) const {
        if (!h_dist || N != n) return;
-        // 找最大距离用于归一化
+        // Max distance for normalization
        float max_d = 0.0f;
        for (int i = 0; i < N; i++)
            for (int j = 0; j < N; j++)
@ -66,10 +66,10 @@ struct TSPXLargeProblem : ProblemBase<TSPXLargeProblem, 1, 512> {
        for (int i = 0; i < N; i++) {
            for (int j = 0; j < N; j++) {
                if (i == j) continue;
-                // 距离近 → G 高（分组倾向强）
+                // Closer → higher G (stronger grouping signal)
                float proximity = 1.0f - h_dist[i * N + j] / max_d;
-                G[i * N + j] = proximity * 0.3f;  // 初始信号不要太强，留空间给 EMA
-                // 距离近 → O 也给一点信号（对称的，不偏向任何方向）
+                G[i * N + j] = proximity * 0.3f;  // keep initial signal moderate for EMA headroom
+                // Closer → small O signal too (symmetric, no directional bias)
                O[i * N + j] = proximity * 0.1f;
            }
        }
@ -84,7 +84,7 @@ struct TSPXLargeProblem : ProblemBase<TSPXLargeProblem, 1, 512> {
    static TSPXLargeProblem create(const float* h_dist_ptr, int n) {
        TSPXLargeProblem prob;
        prob.n = n;
-        prob.h_dist = h_dist_ptr;  // 保留 host 指针
+        prob.h_dist = h_dist_ptr;  // keep host pointer
        float* dd;
        CUDA_CHECK(cudaMalloc(&dd, sizeof(float) * n * n));
        CUDA_CHECK(cudaMemcpy(dd, h_dist_ptr, sizeof(float) * n * n, cudaMemcpyHostToDevice));
--- a/prototype/problems/vrp.cuh
+++ b/prototype/problems/vrp.cuh
@ -1,8 +1,8 @@
 /**
- * vrp.cuh - 容量约束车辆路径问题 (CVRP)
- * 
- * 继承 ProblemBase，使用 ObjDef 目标注册机制
- * 多行编码（D1=K 条路线，分区初始化 + 跨行算子）
+ * vrp.cuh - Capacitated Vehicle Routing Problem (CVRP)
+ *
+ * Extends ProblemBase with ObjDef objective registration.
+ * Multi-row encoding (D1 = K routes, partition init + cross-row operators).
 */

 #pragma once
@ -12,11 +12,11 @@
 #include "gpu_cache.cuh"

 struct VRPProblem : ProblemBase<VRPProblem, 8, 64> {
-    // GPU 数据
+    // GPU data
    const float* d_dist;
    const float* d_demand;
-    const float* h_dist;    // host 端距离矩阵（含 depot，用于 init_relation_matrix）
-    const float* h_demand;  // host 端需求数组（用于 clone_to_device）
+    const float* h_dist;    // host distance matrix (includes depot; for init_relation_matrix)
+    const float* h_demand;  // host demand array (for clone_to_device)
    int n;
    int stride;
    float capacity;
@ -24,7 +24,7 @@ struct VRPProblem : ProblemBase<VRPProblem, 8, 64> {
    int max_vehicles;
    GpuCache cache;
    
-    // ---- 目标计算 ----
+    // ---- objective evaluation ----
    __device__ float compute_route_dist(const int* route, int size) const {
        if (size == 0) return 0.0f;
        float dist = 0.0f;
@ -61,7 +61,7 @@ struct VRPProblem : ProblemBase<VRPProblem, 8, 64> {
        return total;
    }
    
-    // ---- 目标定义（OBJ_DEFS 与 compute_obj 必须一一对应）----
+    // ---- objective defs (OBJ_DEFS must match compute_obj one-to-one) ----
    static constexpr ObjDef OBJ_DEFS[] = {
        {ObjDir::Minimize, 1.0f, 0.0f},   // case 0: calc_total_distance
    };
@ -102,7 +102,7 @@ struct VRPProblem : ProblemBase<VRPProblem, 8, 64> {
        return cfg;
    }
    
-    // ---- shared memory 接口 ----
+    // ---- shared memory interface ----
    static constexpr size_t SMEM_LIMIT = 48 * 1024;
    
    size_t shared_mem_bytes() const {
@ -129,14 +129,14 @@ struct VRPProblem : ProblemBase<VRPProblem, 8, 64> {
    void enable_cache(int cap = 65536) { cache = GpuCache::allocate(cap); }
    void print_cache_stats() const { cache.print_stats(); }
    
-    // 距离先验：客户间距离近 → G/O 分数高
-    // 注意：h_dist 含 depot（stride×stride），元素编号 0..n-1 对应 node 1..n
+    // Distance prior: closer customers → higher G/O scores
+    // Note: h_dist includes depot (stride×stride); indices 0..n-1 map to nodes 1..n
    void init_relation_matrix(float* G, float* O, int N) const {
        if (!h_dist || N != n) return;
        float max_d = 0.0f;
        for (int i = 0; i < N; i++)
            for (int j = 0; j < N; j++) {
-                float d = h_dist[(i + 1) * stride + (j + 1)];  // 跳过 depot
+                float d = h_dist[(i + 1) * stride + (j + 1)];  // skip depot
                if (d > max_d) max_d = d;
            }
        if (max_d <= 0.0f) return;
@ -161,7 +161,7 @@ struct VRPProblem : ProblemBase<VRPProblem, 8, 64> {
        prob.max_vehicles = max_vehicles;
        prob.cache = GpuCache::disabled();
        prob.h_dist = h_dist_ptr;
-        prob.h_demand = h_demand_ptr;  // 保存 host 端指针
+        prob.h_demand = h_demand_ptr;  // keep host pointer
        
        int n_nodes = n + 1;
        float* dd;
@ -185,13 +185,13 @@ struct VRPProblem : ProblemBase<VRPProblem, 8, 64> {
        cache.destroy();
    }
    
-    // v5.0: 多 GPU 协同 — 克隆到指定 GPU
+    // v5.0: multi-GPU — clone onto a given device
    VRPProblem* clone_to_device(int gpu_id) const override {
        int orig_device;
        CUDA_CHECK(cudaGetDevice(&orig_device));
        CUDA_CHECK(cudaSetDevice(gpu_id));
        
-        // 从 host 端数据直接拷贝到目标 GPU（避免跨设备 D2H 拷贝）
+        // Copy from host straight to target GPU (avoid cross-device D2H staging)
        int n_nodes = n + 1;
        float* dd;
        CUDA_CHECK(cudaMalloc(&dd, sizeof(float) * n_nodes * n_nodes));
--- a/prototype/problems/vrptw.cuh
+++ b/prototype/problems/vrptw.cuh
@ -1,12 +1,12 @@
 /**
- * vrptw.cuh - 带时间窗的车辆路径问题 (VRPTW)
- * 
- * 在 CVRP 基础上增加时间窗约束。
- * 编码：Perm 多行分区（同 CVRP），data[r][j] = 路线 r 的第 j 个客户。
- * 目标：Minimize 总距离。
- * 约束：(a) 容量约束, (b) 时间窗约束（到达时间必须 ≤ latest，早到需等待）。
- * 
- * 验证实例：8 客户 3 车, 手工设计坐标+时间窗, 确保有已知可行解。
+ * vrptw.cuh - Vehicle Routing Problem with Time Windows (VRPTW)
+ *
+ * CVRP plus time window constraints.
+ * Encoding: multi-row perm partition (same as CVRP); data[r][j] = j-th customer on route r.
+ * Objective: minimize total distance.
+ * Constraints: (a) capacity, (b) time windows (arrival ≤ latest; early arrival waits).
+ *
+ * Validation instance: 8 customers, 3 vehicles; hand-crafted coords + windows with known feasible solution.
 */

 #pragma once
@ -14,12 +14,12 @@
 #include "cuda_utils.cuh"

 struct VRPTWProblem : ProblemBase<VRPTWProblem, 8, 64> {
-    const float* d_dist;        // 距离矩阵 [(n+1)*(n+1)]（含 depot）
-    const float* d_demand;      // 需求 [n]
-    const float* d_earliest;    // 最早服务时间 [n+1]（含 depot）
-    const float* d_latest;      // 最晚服务时间 [n+1]（含 depot）
-    const float* d_service;     // 服务耗时 [n+1]（含 depot）
-    int n;                      // 客户数（不含 depot）
+    const float* d_dist;        // distance matrix [(n+1)*(n+1)] (includes depot)
+    const float* d_demand;      // demand [n]
+    const float* d_earliest;    // earliest service time [n+1] (includes depot)
+    const float* d_latest;      // latest service time [n+1] (includes depot)
+    const float* d_service;     // service time [n+1] (includes depot)
+    int n;                      // number of customers (excludes depot)
    int stride;                 // n+1
    float capacity;
    int num_vehicles;
@ -63,30 +63,30 @@ struct VRPTWProblem : ProblemBase<VRPTWProblem, 8, 64> {
            if (size == 0) continue;
            active++;
            
-            // 容量约束
+            // Capacity constraint
            float load = 0.0f;
            for (int j = 0; j < size; j++)
                load += d_demand[sol.data[r][j]];
            if (load > capacity)
                penalty += (load - capacity) * 100.0f;
            
-            // 时间窗约束：模拟路线行驶
+            // Time windows: simulate route travel
            float time = 0.0f;
            int prev = 0;
            for (int j = 0; j < size; j++) {
                int node = sol.data[r][j] + 1;
                float travel = d_dist[prev * stride + node];
                time += travel;
-                // 早到需等待
+                // Wait if early
                if (time < d_earliest[node])
                    time = d_earliest[node];
-                // 迟到产生惩罚
+                // Penalize lateness
                if (time > d_latest[node])
                    penalty += (time - d_latest[node]) * 50.0f;
                time += d_service[node];
                prev = node;
            }
-            // 返回 depot 的时间窗
+            // Time window returning to depot
            float return_time = time + d_dist[prev * stride + 0];
            if (return_time > d_latest[0])
                penalty += (return_time - d_latest[0]) * 50.0f;