From a848730459da9b8a84efdf105b184f212e384ed2 Mon Sep 17 00:00:00 2001 From: L-yang-yang <15251858055@163.com> Date: Wed, 25 Mar 2026 11:52:50 +0800 Subject: [PATCH] fix: harden CUDA safety checks and translate comments to English MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Safety fixes (4 critical, 4 warning) from code review: - qap.cuh: fix clone_to_device cross-device D2H by retaining host matrices - types.cuh: add CUDA_CHECK to InjectBuffer, track owner_gpu for safe destroy - types.cuh: add bounds check on lexicographic priority index - solver.cuh: cap migrate_kernel islands to MAX_ISLANDS=64 to prevent stack overflow - multi_gpu_solver.cuh: guard against 0 GPUs, propagate stop_reason from best GPU - types.cuh: warn on SeqRegistry overflow - solver.cuh: warn when constraint_directed/phased_search disabled without AOS Translate all Chinese comments to English across 25+ source files (core/*.cuh, problems/*.cuh, Makefile, multi-GPU tests). Verified on V100S×2 (sm_70, CUDA 12.8): e5 (12 problem types, all optimal), e13 (multi-objective + multi-GPU, 9 configs, all passed). 
--- README.md | 37 +- prototype/Makefile | 14 +- prototype/core/cuda_utils.cuh | 24 +- prototype/core/gpu_cache.cuh | 52 +-- prototype/core/init_heuristic.cuh | 8 +- prototype/core/init_selection.cuh | 90 ++-- prototype/core/multi_gpu_solver.cuh | 133 +++--- prototype/core/operators.cuh | 446 ++++++++++---------- prototype/core/population.cuh | 28 +- prototype/core/relation_matrix.cuh | 60 +-- prototype/core/solver.cuh | 409 +++++++++--------- prototype/core/types.cuh | 616 ++++++++++++++-------------- prototype/problems/assignment.cuh | 24 +- prototype/problems/bin_packing.cuh | 24 +- prototype/problems/graph_color.cuh | 20 +- prototype/problems/jsp.cuh | 84 ++-- prototype/problems/knapsack.cuh | 14 +- prototype/problems/load_balance.cuh | 26 +- prototype/problems/qap.cuh | 51 ++- prototype/problems/schedule.cuh | 16 +- prototype/problems/tsp.cuh | 30 +- prototype/problems/tsp_large.cuh | 12 +- prototype/problems/tsp_xlarge.cuh | 26 +- prototype/problems/vrp.cuh | 32 +- prototype/problems/vrptw.cuh | 38 +- 25 files changed, 1147 insertions(+), 1167 deletions(-) diff --git a/README.md b/README.md index 1da5968..ae58554 100644 --- a/README.md +++ b/README.md @@ -6,7 +6,7 @@ [![CUDA](https://img.shields.io/badge/CUDA-11.0%2B-green.svg)](https://developer.nvidia.com/cuda-toolkit) [![Python](https://img.shields.io/badge/Python-3.8%2B-blue.svg)](https://www.python.org/) -**Paper**: [cuGenOpt: A GPU-Accelerated General-Purpose Metaheuristic Framework for Combinatorial Optimization](http://arxiv.org/abs/2603.19163) +**Paper**: [cuGenOpt: A GPU-Accelerated General-Purpose Metaheuristic Framework for Combinatorial Optimization](https://arxiv.org/abs/2603.19163) --- @@ -114,28 +114,7 @@ Define your own problem by inheriting `ProblemBase` and implementing `compute_ob └─────────────────────────────────────────────────────────┘ ``` ---- -## Project Structure - -``` -generic_solver/ -├── prototype/ # Core framework (header-only .cuh files) -│ ├── core/ # Solver, operators, 
population, types -│ └── problems/ # 12+ problem implementations -├── python/ # Python wrapper (pip install cugenopt) -│ ├── cugenopt/ # Python package (built-ins + JIT compiler) -│ └── tests/ # Test suite -├── benchmark/ # Experiments and benchmarks -│ ├── experiments/ # E0-E13: 14 experiment groups -│ ├── data/ # Standard instances (TSPLIB, Solomon, QAPLIB) -│ └── results/ # Experimental reports -├── paper_v3_en/ # Paper source (LaTeX) -├── STATUS.md # Project status and roadmap -└── README.md # This file -``` - ---- ## Performance Highlights @@ -186,8 +165,7 @@ generic_solver/ ## Installation ### Python Package - -coming soon~ +Coming soon: ```bash pip install cugenopt ``` @@ -207,18 +185,7 @@ cd prototype make all ``` ---- -## Documentation - -| Document | Description | -|----------|-------------| -| [STATUS.md](STATUS.md) | Project status, roadmap, and design decisions | -| [Python API Guide](python/README.md) | Detailed Python API documentation | -| [Benchmark Design](benchmark/DESIGN.md) | Experimental methodology | -| [Paper](paper_v3_en/) | Full technical details and evaluation | - ---- ## Citation diff --git a/prototype/Makefile b/prototype/Makefile index 32ebcdf..c72945a 100644 --- a/prototype/Makefile +++ b/prototype/Makefile @@ -1,10 +1,10 @@ # GenSolver Makefile # -# 用法: -# make e1 e2 e3 e4 e5 e6 → 编译单个实验 -# make diag → 编译诊断程序 -# make all → 编译全部 -# make clean → 清理 +# Usage: +# make e1 e2 e3 e4 e5 e6 → Build individual experiments +# make diag → Build diagnostic program +# make all → Build all +# make clean → Clean NVCC = nvcc ARCH ?= -arch=sm_75 @@ -40,10 +40,10 @@ $(EXP_DIR)/%/gpu: $(EXP_DIR)/%/gpu.cu $(ALL_HEADERS) problems/tsplib_data.h $(EXP_DIR)/e0_diagnosis/bench_diagnosis: $(EXP_DIR)/e0_diagnosis/bench_diagnosis.cu $(ALL_HEADERS) $(NVCC) $(ARCH) $(CFLAGS) $(INCLUDES) -o $@ $< -test_multi_gpu: test_multi_gpu.cu $(ALL_HEADERS) +test_multi_gpu: $(EXP_DIR)/e9_multi_gpu_b3/test_multi_gpu.cu $(ALL_HEADERS) $(NVCC) $(ARCH) $(CFLAGS) $(INCLUDES) -o $@ $< 
-test_multi_gpu_b3: test_multi_gpu_b3.cu $(ALL_HEADERS) +test_multi_gpu_b3: $(EXP_DIR)/e9_multi_gpu_b3/test_multi_gpu_b3.cu $(ALL_HEADERS) $(NVCC) $(ARCH) $(CFLAGS) $(INCLUDES) -o $@ $< clean: diff --git a/prototype/core/cuda_utils.cuh b/prototype/core/cuda_utils.cuh index 72ea103..205f15a 100644 --- a/prototype/core/cuda_utils.cuh +++ b/prototype/core/cuda_utils.cuh @@ -1,8 +1,8 @@ /** - * cuda_utils.cuh - CUDA 工具集 + * cuda_utils.cuh - CUDA utilities * - * 职责:错误检查、设备信息、随机数工具 - * 规则:所有 CUDA API 调用都必须用 CUDA_CHECK 包裹 + * Responsibilities: error checking, device info, random number utilities + * Rule: every CUDA API call must be wrapped with CUDA_CHECK */ #pragma once @@ -11,7 +11,7 @@ #include // ============================================================ -// 错误检查 +// Error checking // ============================================================ #define CUDA_CHECK(call) do { \ @@ -23,7 +23,7 @@ } \ } while(0) -// kernel launch 后检查(捕获异步错误) +// Check after kernel launch (catches async errors) #define CUDA_CHECK_LAST() do { \ cudaError_t err = cudaGetLastError(); \ if (err != cudaSuccess) { \ @@ -34,7 +34,7 @@ } while(0) // ============================================================ -// 设备信息 +// Device info // ============================================================ inline void print_device_info() { @@ -52,10 +52,10 @@ inline void print_device_info() { } // ============================================================ -// 随机数工具 (Device 端) +// Random number utilities (device-side) // ============================================================ -// 初始化 curand 状态,每个线程一个 +// Initialize curand state: one per thread __global__ void init_curand_kernel(curandState* states, unsigned long long seed, int n) { int tid = blockIdx.x * blockDim.x + threadIdx.x; if (tid < n) { @@ -63,12 +63,12 @@ __global__ void init_curand_kernel(curandState* states, unsigned long long seed, } } -// Device 端:生成 [0, bound) 的随机整数 +// Device-side: random integer in [0, bound) __device__ inline int 
rand_int(curandState* state, int bound) { return curand(state) % bound; } -// Device 端:Fisher-Yates shuffle,对 arr[0..n-1] 做随机排列 +// Device-side: Fisher-Yates shuffle of arr[0..n-1] __device__ inline void shuffle(int* arr, int n, curandState* state) { for (int i = n - 1; i > 0; i--) { int j = rand_int(state, i + 1); @@ -79,12 +79,12 @@ __device__ inline void shuffle(int* arr, int n, curandState* state) { } // ============================================================ -// Kernel 启动参数计算 +// Kernel launch grid sizing // ============================================================ inline int div_ceil(int a, int b) { return (a + b - 1) / b; } -// 计算合适的 block 数量 +// Compute suitable number of blocks inline int calc_grid_size(int n, int block_size = 256) { return div_ceil(n, block_size); } diff --git a/prototype/core/gpu_cache.cuh b/prototype/core/gpu_cache.cuh index f7c2e06..ae69413 100644 --- a/prototype/core/gpu_cache.cuh +++ b/prototype/core/gpu_cache.cuh @@ -1,20 +1,20 @@ /** - * gpu_cache.cuh - GPU 全局内存哈希表(通用缓存组件) + * gpu_cache.cuh - GPU global-memory hash table (generic cache component) * - * 设计: - * - 开放寻址,固定容量(power of 2),线性探测 - * - key = uint64_t(由 Problem 自行计算 hash) - * - value = float(单个指标值) - * - 无锁:允许 race condition(缓存语义,偶尔脏读可接受) - * - 自带命中/未命中原子计数器 + * Design: + * - Open addressing, fixed capacity (power of 2), linear probing + * - key = uint64_t (hash computed by Problem) + * - value = float (single metric value) + * - Lock-free: race conditions allowed (cache semantics; occasional dirty reads OK) + * - Built-in atomic hit/miss counters * - * 用法: + * Usage: * GpuCache cache = GpuCache::allocate(65536); // host * // ... pass cache as Problem member to kernels ... 
* cache.print_stats(); // host * cache.destroy(); // host * - * 参考:scute 项目 LRUCache(key = metric_type + content_hash) + * Reference: scute project LRUCache (key = metric_type + content_hash) */ #pragma once @@ -22,25 +22,25 @@ #include // ============================================================ -// 常量 +// Constants // ============================================================ static constexpr uint64_t CACHE_EMPTY_KEY = 0xFFFFFFFFFFFFFFFFULL; -static constexpr int CACHE_MAX_PROBE = 8; // 最大线性探测步数 +static constexpr int CACHE_MAX_PROBE = 8; // Max linear probing steps // ============================================================ -// GpuCache 结构体(POD,可安全拷贝到 kernel) +// GpuCache struct (POD, safe to copy to kernel) // ============================================================ struct GpuCache { - uint64_t* keys; // GPU 全局内存 - float* values; // GPU 全局内存 - unsigned int* d_hits; // 原子计数器(GPU) - unsigned int* d_misses; // 原子计数器(GPU) - int capacity; // 必须是 2 的幂 + uint64_t* keys; // GPU global memory + float* values; // GPU global memory + unsigned int* d_hits; // Atomic counters (GPU) + unsigned int* d_misses; // Atomic counters (GPU) + int capacity; // Must be a power of 2 int mask; // = capacity - 1 - // ---- Host 操作 ---- + // ---- Host operations ---- static GpuCache allocate(int cap = 65536) { GpuCache c; @@ -94,20 +94,20 @@ struct GpuCache { }; // ============================================================ -// Device 函数:哈希 / 查找 / 插入 +// Device functions: hash / lookup / insert // ============================================================ -/// FNV-1a 哈希:对一段有序 int 序列(如路线中的客户 ID) +/// FNV-1a hash over an ordered int sequence (e.g. customer IDs on a route) __device__ inline uint64_t route_hash(const int* data, int len) { uint64_t h = 14695981039346656037ULL; // FNV offset basis for (int i = 0; i < len; i++) { h ^= (uint64_t)(unsigned int)data[i]; h *= 1099511628211ULL; // FNV prime } - return (h == CACHE_EMPTY_KEY) ? 
h - 1 : h; // 避免与哨兵值碰撞 + return (h == CACHE_EMPTY_KEY) ? h - 1 : h; // Avoid collision with sentinel value } -/// 查找:命中返回 true + 写入 out +/// Lookup: on hit returns true and writes out __device__ inline bool cache_lookup(const GpuCache& c, uint64_t key, float& out) { int slot = (int)(key & (uint64_t)c.mask); for (int p = 0; p < CACHE_MAX_PROBE; p++) { @@ -117,12 +117,12 @@ __device__ inline bool cache_lookup(const GpuCache& c, uint64_t key, float& out) out = c.values[idx]; return true; } - if (k == CACHE_EMPTY_KEY) return false; // 空槽 → 一定不存在 + if (k == CACHE_EMPTY_KEY) return false; // Empty slot -> key not present } - return false; // 探测用尽 + return false; // Probing exhausted } -/// 插入:写入 key-value,同 key 覆盖,探测满则驱逐首槽 +/// Insert: write key-value; same key overwrites; if probe full, evict first slot __device__ inline void cache_insert(const GpuCache& c, uint64_t key, float value) { int slot = (int)(key & (uint64_t)c.mask); for (int p = 0; p < CACHE_MAX_PROBE; p++) { @@ -134,7 +134,7 @@ __device__ inline void cache_insert(const GpuCache& c, uint64_t key, float value return; } } - // 探测满:驱逐首槽 + // Probe full: evict first slot int idx = slot & c.mask; c.keys[idx] = key; c.values[idx] = value; diff --git a/prototype/core/init_heuristic.cuh b/prototype/core/init_heuristic.cuh index 716284a..0a8cb90 100644 --- a/prototype/core/init_heuristic.cuh +++ b/prototype/core/init_heuristic.cuh @@ -6,7 +6,7 @@ namespace heuristic_init { -// 单行排列:所有行填相同排列 +// Single-row layout: same permutation in every row template static void build_sorted_permutation(Sol& sol, const std::vector& order, int dim1, int dim2) { @@ -19,7 +19,7 @@ static void build_sorted_permutation(Sol& sol, const std::vector& order, for (int i = 0; i < MAX_OBJ; i++) sol.objectives[i] = 0.0f; } -// Partition 模式:排列均匀切分到 dim1 行,元素不重复 +// Partition mode: split permutation evenly across dim1 rows, no duplicate elements template static void build_partition_from_order(Sol& sol, const std::vector& order, int dim1, int 
total_elements) { @@ -66,8 +66,8 @@ std::vector build_from_matrices(const HeuristicMatrix* matrices, int num_ma col_sum[j] += mat[i * N + j]; } - // 对于 Partition (VRPTW),距离矩阵含 depot (index 0), - // 排序只针对客户 (index 1..N-1),输出值为 0-based 客户编号 + // For Partition (VRPTW), the distance matrix includes depot (index 0); + // sorting is only over customers (indices 1..N-1); output values are 0-based customer ids std::vector idx; if (partition_mode && N > elem_count) { for (int i = 1; i <= elem_count; i++) idx.push_back(i); diff --git a/prototype/core/init_selection.cuh b/prototype/core/init_selection.cuh index 17f37e4..f8d8a86 100644 --- a/prototype/core/init_selection.cuh +++ b/prototype/core/init_selection.cuh @@ -1,15 +1,15 @@ /** - * init_selection.cuh - 初始解采样择优 + NSGA-II 选择 + * init_selection.cuh - Initial-solution sampling and NSGA-II selection * - * Host 端逻辑,在 solver 初始化阶段调用一次。 - * 从 K × pop_size 个候选解中选出 pop_size 个作为初始种群。 + * Host-side logic; called once during solver initialization. + * Selects pop_size individuals from K × pop_size candidates as the initial population. * - * 选择策略: - * 1. 核心目标预留名额(按 importance 分配) - * 2. NSGA-II 选择(非支配排序 + 加权拥挤度) - * 3. 纯随机保底(多样性) + * Selection strategy: + * 1. Reserve slots for core objectives (by importance) + * 2. NSGA-II selection (non-dominated sort + weighted crowding) + * 3. Pure random fallback (diversity) * - * 单目标时自动退化为 top-N 排序,无需分支。 + * Single-objective case automatically reduces to top-N sorting; no extra branching. 
*/ #pragma once @@ -22,36 +22,36 @@ namespace init_sel { // ============================================================ -// 候选解的目标信息(从 GPU 下载后在 host 端使用) +// Per-candidate objective info (used on host after download from GPU) // ============================================================ struct CandidateInfo { - int idx; // 在候选数组中的原始索引 - float objs[MAX_OBJ]; // 归一化后的目标值(越小越好) + int idx; // Original index in the candidate array + float objs[MAX_OBJ]; // Normalized objectives (lower is better) float penalty; - int rank; // 非支配排序层级(0 = Pareto 前沿) - float crowding; // 拥挤度距离 - bool selected; // 是否已被选中 + int rank; // Non-dominated sort front (0 = Pareto front) + float crowding; // Crowding distance + bool selected; // Whether already selected }; // ============================================================ -// 非支配排序(Fast Non-dominated Sort) +// Non-dominated sort (Fast Non-dominated Sort) // ============================================================ -// 复杂度:O(M × N²),M = 目标数,N = 候选数 -// 对初始化场景(N ≤ 几千,M ≤ 4)完全可接受 +// Complexity: O(M × N²), M = number of objectives, N = number of candidates +// Acceptable for initialization (N up to a few thousand, M ≤ 4) inline void fast_nondominated_sort(std::vector& cands, int num_obj, std::vector>& fronts) { int n = (int)cands.size(); - std::vector dom_count(n, 0); // 被多少个解支配 - std::vector> dom_set(n); // 支配了哪些解 + std::vector dom_count(n, 0); // How many solutions dominate this one + std::vector> dom_set(n); // Which solutions this one dominates - // 判断 a 是否支配 b:a 在所有目标上 ≤ b,且至少一个 < - // 先处理 penalty:可行解支配不可行解 + // Whether a dominates b: a ≤ b on all objectives, and strictly < on at least one + // Handle penalty first: feasible dominates infeasible auto dominates = [&](int a, int b) -> bool { const auto& ca = cands[a]; const auto& cb = cands[b]; - // penalty 处理 + // Penalty handling if (ca.penalty <= 0.0f && cb.penalty > 0.0f) return true; if (ca.penalty > 0.0f && cb.penalty <= 0.0f) return false; if (ca.penalty > 0.0f && 
cb.penalty > 0.0f) return ca.penalty < cb.penalty; @@ -65,7 +65,7 @@ inline void fast_nondominated_sort(std::vector& cands, return all_leq && any_lt; }; - // 计算支配关系 + // Compute dominance relations for (int i = 0; i < n; i++) { for (int j = i + 1; j < n; j++) { if (dominates(i, j)) { @@ -78,7 +78,7 @@ inline void fast_nondominated_sort(std::vector& cands, } } - // 提取各层前沿 + // Extract each front layer fronts.clear(); std::vector current_front; for (int i = 0; i < n; i++) { @@ -107,9 +107,9 @@ inline void fast_nondominated_sort(std::vector& cands, } // ============================================================ -// 加权拥挤度距离 +// Weighted crowding distance // ============================================================ -// 标准拥挤度 + importance 加权:核心目标维度上的间距贡献更大 +// Standard crowding + importance weighting: larger gap contribution on core objectives inline void weighted_crowding_distance(std::vector& cands, const std::vector& front, @@ -117,7 +117,7 @@ inline void weighted_crowding_distance(std::vector& cands, const float* importance) { int n = (int)front.size(); if (n <= 2) { - for (int i : front) cands[i].crowding = 1e18f; // 边界解无穷大 + for (int i : front) cands[i].crowding = 1e18f; // Boundary solutions: infinite return; } @@ -126,18 +126,18 @@ inline void weighted_crowding_distance(std::vector& cands, std::vector sorted_idx(front.begin(), front.end()); for (int m = 0; m < num_obj; m++) { - // 按目标 m 排序 + // Sort by objective m std::sort(sorted_idx.begin(), sorted_idx.end(), [&](int a, int b) { return cands[a].objs[m] < cands[b].objs[m]; }); float range = cands[sorted_idx[n-1]].objs[m] - cands[sorted_idx[0]].objs[m]; - if (range < 1e-12f) continue; // 该目标无区分度 + if (range < 1e-12f) continue; // No spread on this objective - // 边界解设为无穷大 + // Boundary solutions: infinite crowding cands[sorted_idx[0]].crowding += 1e18f; cands[sorted_idx[n-1]].crowding += 1e18f; - // 中间解:相邻间距 × importance 权重 + // Interior: neighbor gap × importance weight float w = importance[m]; for (int i = 
1; i < n - 1; i++) { float gap = cands[sorted_idx[i+1]].objs[m] - cands[sorted_idx[i-1]].objs[m]; @@ -147,29 +147,29 @@ inline void weighted_crowding_distance(std::vector& cands, } // ============================================================ -// 主选择函数:从 N 个候选中选出 target 个 +// Main selection: pick target candidates from N // ============================================================ -// 返回被选中的候选索引 +// Returns indices of selected candidates inline std::vector nsga2_select(std::vector& cands, int num_obj, const float* importance, int target, int num_reserved_random) { - // --- 1. 核心目标预留名额 --- + // --- 1. Reserve slots for core objectives --- int num_reserve_total = target - num_reserved_random; - // 预留比例:importance[i] × 30% 的名额(剩余 70% 给 NSGA-II) + // Reserve ratio: importance[i] × 30% of slots (remaining 70% for NSGA-II) float reserve_ratio = 0.3f; std::vector selected; selected.reserve(target); - // 对每个目标,按该目标排序取 top + // For each objective, sort by that objective and take top for (int m = 0; m < num_obj; m++) { int quota = (int)(num_reserve_total * importance[m] * reserve_ratio); - if (quota < 1 && num_obj > 1) quota = 1; // 每个目标至少 1 个 + if (quota < 1 && num_obj > 1) quota = 1; // At least one per objective - // 按目标 m 排序(越小越好) + // Sort by objective m (lower is better) std::vector by_obj(cands.size()); for (int i = 0; i < (int)cands.size(); i++) by_obj[i] = i; std::sort(by_obj.begin(), by_obj.end(), @@ -186,32 +186,32 @@ inline std::vector nsga2_select(std::vector& cands, } } - // --- 2. NSGA-II 选择填充剩余名额 --- + // --- 2. 
NSGA-II fills remaining slots --- int remaining = target - num_reserved_random - (int)selected.size(); if (remaining > 0) { - // 非支配排序 + // Non-dominated sort std::vector> fronts; fast_nondominated_sort(cands, num_obj, fronts); for (auto& front : fronts) { if (remaining <= 0) break; - // 过滤已选中的 + // Filter out already selected std::vector available; for (int i : front) { if (!cands[i].selected) available.push_back(i); } if ((int)available.size() <= remaining) { - // 整层都选 + // Take the whole front for (int i : available) { cands[i].selected = true; selected.push_back(i); remaining--; } } else { - // 该层需要截断:按加权拥挤度选 + // Truncate this front: pick by weighted crowding weighted_crowding_distance(cands, available, num_obj, importance); std::sort(available.begin(), available.end(), [&](int a, int b) { return cands[a].crowding > cands[b].crowding; }); @@ -228,14 +228,14 @@ inline std::vector nsga2_select(std::vector& cands, } // ============================================================ -// 单目标快速路径:直接按标量排序取 top +// Single-objective fast path: scalar sort and take top // ============================================================ inline std::vector top_n_select(std::vector& cands, int target, int num_reserved_random) { int to_select = target - num_reserved_random; - // 按 penalty 优先,然后按 objs[0](已归一化为越小越好) + // Prefer lower penalty, then objs[0] (normalized, lower is better) std::vector indices(cands.size()); for (int i = 0; i < (int)cands.size(); i++) indices[i] = i; std::sort(indices.begin(), indices.end(), [&](int a, int b) { diff --git a/prototype/core/multi_gpu_solver.cuh b/prototype/core/multi_gpu_solver.cuh index 1169685..84dc78f 100644 --- a/prototype/core/multi_gpu_solver.cuh +++ b/prototype/core/multi_gpu_solver.cuh @@ -1,12 +1,12 @@ /** - * multi_gpu_solver.cuh - 多 GPU 协同求解 + * multi_gpu_solver.cuh - Multi-GPU cooperative solving * - * v5.0 方案 B3: 被动注入 + GPU 无感知 - * - 每块 GPU 独立运行 solve(),各自用不同 seed - * - 每个 GPU 有一个 InjectBuffer(设备端) - * - CPU 协调线程定期(每 N 秒)收集各 
GPU 的 best,异步写入其他 GPU 的 InjectBuffer - * - GPU 在 migrate_kernel 后检查 InjectBuffer,如果有新解则注入 - * - 完全解耦:GPU 无需暂停,CPU 异步写入,通过 CUDA Stream 同步保证安全 + * v5.0 plan B3: passive injection + GPU-agnostic design + * - Each GPU runs solve() independently with its own seed + * - Each GPU has an InjectBuffer (device memory) + * - A CPU coordinator thread periodically (every N seconds) collects each GPU's best and asynchronously writes to other GPUs' InjectBuffers + * - After migrate_kernel, each GPU checks InjectBuffer and injects if a new solution is present + * - Fully decoupled: GPUs need not pause; CPU writes asynchronously; CUDA stream sync ensures safety */ #pragma once @@ -18,25 +18,26 @@ #include // ============================================================ -// MultiGpuContext — 每个 GPU 的上下文 +// MultiGpuContext — per-GPU context // ============================================================ template struct MultiGpuContext { using Sol = typename Problem::Sol; - int gpu_id; // GPU 设备 ID - Problem* problem; // Problem 实例(设备指针指向该 GPU) - SolverConfig config; // 求解器配置(独立 seed) + int gpu_id; // GPU device ID + Problem* problem; // Problem instance (device pointer for this GPU) + SolverConfig config; // Solver config (independent seed) - Sol best_solution; // 当前最优解(host 端) - std::mutex best_mutex; // 保护 best_solution 的互斥锁 + Sol best_solution; // Current best solution (host) + SolveResult solve_result; // Full result from solve() + std::mutex best_mutex; // Mutex protecting best_solution - InjectBuffer* d_inject_buf; // Device 端注入缓冲区(在该 GPU 上分配) - Sol* d_global_best; // Device 端全局最优解指针(由 solve() 导出) + InjectBuffer* d_inject_buf; // Device-side inject buffer (allocated on this GPU) + Sol* d_global_best; // Device pointer to global best (exported by solve()) - std::atomic stop_flag; // 停止标志 - std::atomic running; // 运行状态标志(用于协调线程判断) + std::atomic stop_flag; // Stop flag + std::atomic running; // Running flag (for coordinator thread) MultiGpuContext(int id) : gpu_id(id), 
problem(nullptr), d_inject_buf(nullptr), d_global_best(nullptr), stop_flag(false), running(false) { @@ -47,45 +48,46 @@ struct MultiGpuContext { }; // ============================================================ -// GPU Worker 线程函数(方案 B3) +// GPU worker thread (plan B3) // ============================================================ template void gpu_worker(MultiGpuContext* ctx) { using Sol = typename Problem::Sol; - // 设置当前线程使用的 GPU + // Set GPU for this thread CUDA_CHECK(cudaSetDevice(ctx->gpu_id)); - // 标记开始运行 + // Mark as running ctx->running.store(true); - // 运行 solve(传入 inject_buf 和 d_global_best_out) + // Run solve (pass inject_buf and d_global_best_out) SolveResult result = solve(*ctx->problem, ctx->config, nullptr, 0, nullptr, ctx->d_inject_buf, &ctx->d_global_best); - // 标记运行结束 + // Mark as finished running ctx->running.store(false); - // 更新最优解 + // Update best solution and full result { std::lock_guard lock(ctx->best_mutex); ctx->best_solution = result.best_solution; + ctx->solve_result = result; } - // 标记完成 + // Mark complete ctx->stop_flag.store(true); } // ============================================================ -// 协调线程函数(方案 B3) +// Coordinator thread (plan B3) // ============================================================ -// 定期从各 GPU 的 d_global_best 读取当前 best,计算 global_best,注入到其他 GPU +// Periodically read each GPU's current best from d_global_best, compute global_best, inject to other GPUs // -// 关键设计: -// 1. 直接从各 GPU 的 d_global_best 读取(由 solve() 导出) -// 2. 要求启用 SA(否则无 d_global_best) -// 3. 轻量侵入:solve() 只需导出一个指针,对单 GPU 无影响 +// Key design: +// 1. Read directly from each GPU's d_global_best (exported by solve()) +// 2. Requires SA enabled (otherwise no d_global_best) +// 3. 
Light touch: solve() only exports a pointer; single-GPU path unchanged template void coordinator_thread(std::vector*>& contexts, @@ -96,7 +98,7 @@ void coordinator_thread(std::vector*>& contexts, auto interval_ms = std::chrono::milliseconds(static_cast(interval_sec * 1000)); int round = 0; - // 等待所有 GPU 的 d_global_best 就绪 + // Wait until all GPUs' d_global_best are ready bool all_ready = false; while (!all_ready) { std::this_thread::sleep_for(std::chrono::milliseconds(100)); @@ -110,10 +112,10 @@ void coordinator_thread(std::vector*>& contexts, } while (true) { - // 等待指定时间间隔 + // Wait for the configured interval std::this_thread::sleep_for(interval_ms); - // 检查是否所有 GPU 都已停止 + // Check whether all GPUs have stopped bool all_stopped = true; for (auto* ctx : contexts) { if (ctx->running.load()) { @@ -125,17 +127,17 @@ void coordinator_thread(std::vector*>& contexts, round++; - // 收集各 GPU 的当前最优解(从 d_global_best 读取) + // Collect each GPU's current best (from d_global_best) Sol global_best; global_best.penalty = 1e30f; global_best.objectives[0] = 1e30f; int best_gpu = -1; for (int i = 0; i < (int)contexts.size(); i++) { - if (!contexts[i]->running.load()) continue; // 已停止的 GPU 跳过 - if (contexts[i]->d_global_best == nullptr) continue; // 未就绪跳过 + if (!contexts[i]->running.load()) continue; // skip stopped GPUs + if (contexts[i]->d_global_best == nullptr) continue; // skip not ready - // 从该 GPU 的 d_global_best 读取 + // Read from this GPU's d_global_best Sol gpu_best; cudaSetDevice(contexts[i]->gpu_id); cudaMemcpy(&gpu_best, contexts[i]->d_global_best, sizeof(Sol), cudaMemcpyDeviceToHost); @@ -146,23 +148,23 @@ void coordinator_thread(std::vector*>& contexts, } } - if (best_gpu == -1) continue; // 所有 GPU 都已停止或未就绪 + if (best_gpu == -1) continue; // all GPUs stopped or not ready if (verbose) { printf(" [Coordinator Round %d] Global best from GPU %d: obj=%.2f, penalty=%.2f\n", round, best_gpu, global_best.objectives[0], global_best.penalty); } - // 将 global_best 注入到其他 GPU(除了 
best_gpu 自己) + // Inject global_best into other GPUs (except best_gpu) for (int i = 0; i < (int)contexts.size(); i++) { - if (i == best_gpu) continue; // 不注入到自己 - if (!contexts[i]->running.load()) continue; // 已停止的 GPU 不注入 + if (i == best_gpu) continue; // do not inject to self + if (!contexts[i]->running.load()) continue; // do not inject to stopped GPUs - // 读取 InjectBuffer 结构(从 device 到 host) + // Read InjectBuffer struct (device to host) InjectBuffer buf; cudaMemcpy(&buf, contexts[i]->d_inject_buf, sizeof(InjectBuffer), cudaMemcpyDeviceToHost); - // 同步写入(会自动切换设备) + // Synchronous write (switches device as needed) buf.write_sync(global_best, contexts[i]->gpu_id); } } @@ -173,7 +175,7 @@ void coordinator_thread(std::vector*>& contexts, } // ============================================================ -// 多 GPU 协同求解主函数(方案 B3) +// Multi-GPU cooperative solve entry (plan B3) // ============================================================ template @@ -181,13 +183,17 @@ SolveResult solve_multi_gpu(Problem& prob, const SolverCo using Sol = typename Problem::Sol; if (cfg.num_gpus <= 1) { - // 单 GPU 模式,直接调用普通 solve + // Single-GPU mode: call plain solve return solve(prob, cfg); } - // 检查可用 GPU 数量 - int device_count; + // Check available GPU count + int device_count = 0; CUDA_CHECK(cudaGetDeviceCount(&device_count)); + if (device_count <= 0) { + fprintf(stderr, "Error: No CUDA devices available\n"); + return SolveResult{}; + } int actual_gpus = std::min(cfg.num_gpus, device_count); if (cfg.verbose) { @@ -199,15 +205,15 @@ SolveResult solve_multi_gpu(Problem& prob, const SolverCo cfg.multi_gpu_inject_mode == MultiGpuInjectMode::HalfIslands ? 
"HalfIslands" : "AllIslands"); } - // 创建各 GPU 的上下文 + // Create per-GPU contexts std::vector*> contexts; for (int i = 0; i < actual_gpus; i++) { auto* ctx = new MultiGpuContext(i); ctx->config = cfg; - ctx->config.seed = cfg.seed + i * 1000; // 每个 GPU 用不同 seed - ctx->config.num_gpus = 1; // 单 GPU 模式运行 + ctx->config.seed = cfg.seed + i * 1000; // distinct seed per GPU + ctx->config.num_gpus = 1; // run as single-GPU per device - // 克隆 Problem 到该 GPU + // Clone Problem onto this GPU ctx->problem = prob.clone_to_device(i); if (ctx->problem == nullptr) { fprintf(stderr, "Error: Failed to clone problem to GPU %d\n", i); @@ -218,10 +224,10 @@ SolveResult solve_multi_gpu(Problem& prob, const SolverCo return SolveResult{}; } - // 分配 InjectBuffer(在该 GPU 上) + // Allocate InjectBuffer on this GPU InjectBuffer buf = InjectBuffer::allocate(i); - // 将 InjectBuffer 拷贝到 device 端(传给 kernel) + // Copy InjectBuffer to device (for kernels) InjectBuffer* d_buf; CUDA_CHECK(cudaSetDevice(i)); CUDA_CHECK(cudaMalloc(&d_buf, sizeof(InjectBuffer))); @@ -231,34 +237,36 @@ SolveResult solve_multi_gpu(Problem& prob, const SolverCo contexts.push_back(ctx); } - // 启动 worker 线程 + // Start worker threads std::vector workers; for (auto* ctx : contexts) { workers.emplace_back(gpu_worker, ctx); } - // 启动协调线程(定期注入 global_best) + // Start coordinator thread (periodic global_best injection) std::thread coordinator(coordinator_thread, std::ref(contexts), cfg.multi_gpu_interval_sec, cfg.verbose); - // 等待所有 worker 完成 + // Wait for all workers to finish for (auto& w : workers) w.join(); - // 等待协调线程完成 + // Wait for coordinator to finish coordinator.join(); - // 收集最终结果 + // Collect final result from best GPU Sol final_best = contexts[0]->best_solution; + int best_ctx = 0; ObjConfig oc = prob.obj_config(); for (int i = 1; i < (int)contexts.size(); i++) { if (is_better(contexts[i]->best_solution, final_best, oc)) { final_best = contexts[i]->best_solution; + best_ctx = i; } } - // 清理 + auto result = 
contexts[best_ctx]->solve_result; // copy out before cleanup deletes the contexts for (auto* ctx : 
contexts) { - // 读取 InjectBuffer 的内容(用于释放) + // Read InjectBuffer content (for teardown) InjectBuffer buf; CUDA_CHECK(cudaSetDevice(ctx->gpu_id)); CUDA_CHECK(cudaMemcpy(&buf, ctx->d_inject_buf, sizeof(InjectBuffer), cudaMemcpyDeviceToHost)); @@ -269,10 +277,9 @@ SolveResult solve_multi_gpu(Problem& prob, const SolverCo delete ctx; } - // 构造返回结果 - SolveResult result; + // Build return value from best GPU's result + // (result was copied out before cleanup; contexts are deleted by now) result.best_solution = final_best; - result.stop_reason = StopReason::MaxGen; return result; } diff --git a/prototype/core/operators.cuh b/prototype/core/operators.cuh index f1db6e6..179ee63 100644 --- a/prototype/core/operators.cuh +++ b/prototype/core/operators.cuh @@ -1,40 +1,40 @@ /** - * operators.cuh - 四层搜索算子体系(Device 端) + * operators.cuh - Four-layer search operator hierarchy (device side) * - * v1.0: 二维通用编码的完整算子层次 + * v1.0: Full operator hierarchy for 2D universal encoding * - * 层次结构(所有算子只看 data[D1][D2] + dim2_sizes,不感知问题语义): + * Hierarchy (all operators only see data[D1][D2] + dim2_sizes, no problem semantics): * - * 第 1 层 - 元素级(Element): 操作单个元素 - * 行内: swap, reverse(2-opt), insert, flip - * 跨行: cross_relocate(单元素移行), cross_swap(单元素换行) + * Layer 1 - Element: operate on single elements + * Within row: swap, reverse(2-opt), insert, flip + * Cross-row: cross_relocate (move one element across rows), cross_swap (swap one element per row) * - * 第 2 层 - 片段级(Segment): 操作连续片段 - * 行内: or_opt(移动连续 k 个元素到行内新位置) - * 跨行: seg_relocate(片段从一行移到另一行) - * seg_swap(两行各取一段互换,即 2-opt*) + * Layer 2 - Segment: operate on contiguous segments + * Within row: or_opt (move contiguous k elements to a new position in the row) + * Cross-row: seg_relocate (move a segment from one row to another) + * seg_swap (swap two segments from two rows each, i.e. 
2-opt*) * - * 第 3 层 - 行级(Row): 操作整行 - * row_swap(交换两行全部内容和长度) - * row_reverse(反转行的排列顺序) - * row_split(一行拆成两行) - * row_merge(两行合并为一行) + * Layer 3 - Row: operate on whole rows + * row_swap (swap full contents and lengths of two rows) + * row_reverse (reverse row order) + * row_split (split one row into two) + * row_merge (merge two rows into one) * - * 第 4 层 - 交叉(Crossover): 组合两个解 - * row_crossover(从父代 A/B 各取若干行组成子代) - * uniform_crossover(逐元素从两个父代中选) + * Layer 4 - Crossover: combine two solutions + * row_crossover (child takes some rows from parent A and B) + * uniform_crossover (pick per element from two parents) * - * Move 描述符: - * row, row2: 行索引(row2=-1 表示行内) - * op: 操作码 - * pos1, pos2: 位置参数 - * seg_len: 片段长度(第 2 层使用) + * Move descriptor: + * row, row2: row indices (row2=-1 means within-row) + * op: operation code + * pos1, pos2: position parameters + * seg_len: segment length (used by layer 2) * - * 设计原则: - * - 所有算子对问题类型无感知,只操作二维数组 - * - 每个算子都有对应的 undo 操作 - * - 空行安全:自动降级为 no-op - * - 编码类型决定可用算子集 + * Design principles: + * - All operators are problem-agnostic; they only manipulate a 2D array + * - Each operator has a corresponding undo + * - Empty-row safe: automatically degrades to no-op + * - Encoding type determines the available operator set */ #pragma once @@ -44,61 +44,61 @@ namespace ops { // ============================================================ -// Op 码常量 — 按层次编号,避免冲突 +// Op code constants — numbered by layer to avoid collisions // ============================================================ -// 通用 +// General constexpr int OP_NOOP = -1; -// --- 第 1 层:元素级 --- -// Permutation 行内 -constexpr int PERM_SWAP = 0; // 交换两个位置 -constexpr int PERM_REVERSE = 1; // 反转区间(2-opt) -constexpr int PERM_INSERT = 2; // 移动单个元素到新位置 -// Permutation 跨行 -constexpr int PERM_CROSS_RELOCATE = 3; // 单元素从一行移到另一行 -constexpr int PERM_CROSS_SWAP = 4; // 两行各一个元素互换 -// Binary 行内 -constexpr int BIN_FLIP = 0; // 翻转一个位 -constexpr int BIN_SWAP = 1; // 交换两个位 -// Binary 跨行 -constexpr int 
BIN_CROSS_SWAP = 2; // 两行各一个位互换 +// --- Layer 1: element --- +// Permutation within row +constexpr int PERM_SWAP = 0; // swap two positions +constexpr int PERM_REVERSE = 1; // reverse interval (2-opt) +constexpr int PERM_INSERT = 2; // move one element to a new position +// Permutation cross-row +constexpr int PERM_CROSS_RELOCATE = 3; // move one element from one row to another +constexpr int PERM_CROSS_SWAP = 4; // swap one element per row between two rows +// Binary within row +constexpr int BIN_FLIP = 0; // flip one bit +constexpr int BIN_SWAP = 1; // swap two bits +// Binary cross-row +constexpr int BIN_CROSS_SWAP = 2; // swap one bit per row between two rows -// --- 第 1 层(续):排列行内 --- -constexpr int PERM_3OPT = 5; // 3-opt:断 3 条边重连 +// --- Layer 1 (cont.): permutation within row --- +constexpr int PERM_3OPT = 5; // 3-opt: break 3 edges and reconnect -// --- 第 2 层:片段级 --- -constexpr int PERM_OR_OPT = 10; // 行内:移动连续 k 个元素 -constexpr int PERM_SEG_RELOCATE = 11; // 跨行:片段从一行移到另一行 -constexpr int PERM_SEG_SWAP = 12; // 跨行:两行各取一段互换(2-opt*) -constexpr int PERM_CROSS_EXCHANGE = 15; // 跨行:两行各取一段互换(保持各自内部顺序) -constexpr int BIN_SEG_FLIP = 13; // 行内:翻转连续 k 个位 -constexpr int BIN_SEG_CROSS_SWAP = 14; // 跨行:两行各取一段互换 -constexpr int BIN_K_FLIP = 16; // 行内:同时翻转 k 个随机位 +// --- Layer 2: segment --- +constexpr int PERM_OR_OPT = 10; // within row: move contiguous k elements +constexpr int PERM_SEG_RELOCATE = 11; // cross-row: move segment from one row to another +constexpr int PERM_SEG_SWAP = 12; // cross-row: swap two segments from two rows each (2-opt*) +constexpr int PERM_CROSS_EXCHANGE = 15; // cross-row: swap two segments (preserve internal order each) +constexpr int BIN_SEG_FLIP = 13; // within row: flip contiguous k bits +constexpr int BIN_SEG_CROSS_SWAP = 14; // cross-row: swap two segments from two rows each +constexpr int BIN_K_FLIP = 16; // within row: flip k random bits at once -// --- 第 3 层:行级 --- -constexpr int ROW_SWAP = 20; // 交换两行全部内容 -constexpr int ROW_REVERSE = 21; 
// 反转行的排列顺序(行号重排) -constexpr int ROW_SPLIT = 22; // 一行拆成两行 -constexpr int ROW_MERGE = 23; // 两行合并为一行 +// --- Layer 3: row --- +constexpr int ROW_SWAP = 20; // swap full contents of two rows +constexpr int ROW_REVERSE = 21; // reverse row order (row index permutation) +constexpr int ROW_SPLIT = 22; // split one row into two +constexpr int ROW_MERGE = 23; // merge two rows into one -// --- 特殊:扰动(连续多步 move,不可 undo,用于跳出局部最优)--- +// --- Special: perturbation (multi-step moves, no undo, escape local optima) --- constexpr int PERTURBATION = 40; -// --- 第 4 层:交叉 --- -constexpr int CROSS_ROW = 30; // 行级交叉:从两个父代各取若干行 -constexpr int CROSS_UNIFORM = 31; // 均匀交叉:逐元素从两个父代选 +// --- Layer 4: crossover --- +constexpr int CROSS_ROW = 30; // row crossover: take some rows from each parent +constexpr int CROSS_UNIFORM = 31; // uniform crossover: pick per element from two parents // ============================================================ -// Move 描述符 — 编码级别的变动描述 +// Move descriptor — encoding-level change description // ============================================================ struct Move { - int row; // 源行(或第一行) - int row2; // 目标行(-1 = 行内) - int op; // 操作码 - int pos1, pos2; // 位置参数 - int seg_len; // 片段长度(第 2 层使用,其他层 = 0) + int row; // source row (or first row) + int row2; // target row (-1 = within-row) + int op; // operation code + int pos1, pos2; // position parameters + int seg_len; // segment length (layer 2; 0 for other layers) }; } // namespace ops @@ -106,10 +106,10 @@ struct Move { namespace ops { // ============================================================ -// 第 1 层:元素级底层操作 +// Layer 1: element-level primitives // ============================================================ -// --- Permutation 行内 --- +// --- Permutation within row --- __device__ inline void perm_swap(int* row, int i, int j) { int tmp = row[i]; row[i] = row[j]; row[j] = tmp; @@ -126,9 +126,9 @@ __device__ inline void perm_insert(int* row, int from, int to, int size) { row[to] = val; } -// --- 
Permutation 跨行 --- +// --- Permutation cross-row --- -/// cross_relocate: 从 src_row[src_pos] 取出元素,插入 dst_row[dst_pos] +/// cross_relocate: take element from src_row[src_pos], insert at dst_row[dst_pos] __device__ inline void perm_cross_relocate(int* src_row, int& src_size, int* dst_row, int& dst_size, int src_pos, int dst_pos) { @@ -142,24 +142,24 @@ __device__ inline void perm_cross_relocate(int* src_row, int& src_size, dst_size++; } -/// cross_swap: 交换 rowA[posA] 和 rowB[posB] +/// cross_swap: swap rowA[posA] and rowB[posB] __device__ inline void cross_swap_elem(int* rowA, int posA, int* rowB, int posB) { int tmp = rowA[posA]; rowA[posA] = rowB[posB]; rowB[posB] = tmp; } -// --- Permutation 行内:3-opt --- -// 断开 3 条边,选择最佳重连方式(共 8 种组合,取随机一种非恒等变换) -// 参数:3 个断点 i < j < k,将路线分为 seg0=[0,i] seg1=[i+1,j] seg2=[j+1,k] seg3=[k+1,end] -// 实现:随机选一种重连(reverse seg1, reverse seg2, 或两者都反转) -// pos1=i, pos2=j, seg_len 编码 k +// --- Permutation within row: 3-opt --- +// Break 3 edges and pick a reconnection (8 combinations; pick one random non-identity) +// Args: three breakpoints i < j < k, route splits seg0=[0,i] seg1=[i+1,j] seg2=[j+1,k] seg3=[k+1,end] +// Impl: random reconnection (reverse seg1, reverse seg2, or both) +// pos1=i, pos2=j, seg_len encodes k __device__ inline void perm_3opt(int* row, int size, int i, int j, int k) { - // 3-opt 有多种重连方式,这里实现最常用的 3 种非恒等变换: - // type 1: reverse [i+1, j] — 等价于 2-opt(i+1, j) - // type 2: reverse [j+1, k] — 等价于 2-opt(j+1, k) - // type 3: reverse [i+1, j] + reverse [j+1, k] — 真正的 3-opt move - // type 4: 将 seg1 和 seg2 互换位置(不反转) — or-opt 的泛化 - // 我们随机选 type 3 或 type 4(type 1/2 已被 2-opt 覆盖) - // 这里固定做 type 3(双反转),因为它是 2-opt 无法达到的唯一新邻域 + // 3-opt has several reconnections; here we use the most common non-identity variants: + // type 1: reverse [i+1, j] — same as 2-opt(i+1, j) + // type 2: reverse [j+1, k] — same as 2-opt(j+1, k) + // type 3: reverse [i+1, j] + reverse [j+1, k] — true 3-opt move + // type 4: swap seg1 and seg2 (no reverse) — 
generalization of or-opt + // We would randomize type 3 or 4 (types 1/2 are covered by 2-opt) + // Here we fix type 3 (double reverse) as the only new neighborhood 2-opt cannot reach // reverse [i+1, j] int lo = i + 1, hi = j; while (lo < hi) { int t = row[lo]; row[lo] = row[hi]; row[hi] = t; lo++; hi--; } @@ -168,12 +168,12 @@ __device__ inline void perm_3opt(int* row, int size, int i, int j, int k) { while (lo < hi) { int t = row[lo]; row[lo] = row[hi]; row[hi] = t; lo++; hi--; } } -// 3-opt undo: 再做一次相同操作即可恢复(双反转是自反的) +// 3-opt undo: repeat the same move to restore (double reverse is self-inverse) __device__ inline void perm_3opt_undo(int* row, int size, int i, int j, int k) { - perm_3opt(row, size, i, j, k); // 自反 + perm_3opt(row, size, i, j, k); // self-inverse } -// --- Binary 行内 --- +// --- Binary within row --- __device__ inline void bin_flip(int* row, int i) { row[i] = 1 - row[i]; } @@ -182,51 +182,51 @@ __device__ inline void bin_swap(int* row, int i, int j) { } // ============================================================ -// 第 2 层:片段级底层操作 +// Layer 2: segment-level primitives // ============================================================ -/// or_opt: 行内移动连续 seg_len 个元素(从 from 开始)到 to 位置 -/// 等价于:取出 [from, from+seg_len),插入到 to 之前 -/// 约束:from + seg_len <= size, to 不在 [from, from+seg_len) 内 +/// or_opt: within row, move contiguous seg_len elements (starting at from) to position to +/// Same as: take [from, from+seg_len), insert before to +/// Constraints: from + seg_len <= size, to not in [from, from+seg_len) __device__ inline void perm_or_opt(int* row, int size, int from, int to, int seg_len) { - // 临时缓冲(片段最大长度受限于寄存器,实际 seg_len 通常 <= 4) - int buf[8]; // 足够覆盖常见 seg_len + // Temp buffer (max segment length limited by registers; seg_len usually <= 4) + int buf[8]; // enough for typical seg_len int actual_len = (seg_len > 8) ? 
8 : seg_len; - // 保存片段 + // Save segment for (int i = 0; i < actual_len; i++) buf[i] = row[from + i]; - // 移除片段(左移填补空洞) + // Remove segment (shift left to close gap) int new_size = size - actual_len; for (int k = from; k < new_size; k++) row[k] = row[k + actual_len]; - // 计算插入位置(移除后的坐标系) + // Insert position after removal (coords after removal) int ins = (to > from) ? to - actual_len : to; if (ins < 0) ins = 0; if (ins > new_size) ins = new_size; - // 插入片段(右移腾位) + // Insert segment (shift right to make room) for (int k = new_size - 1; k >= ins; k--) row[k + actual_len] = row[k]; for (int i = 0; i < actual_len; i++) row[ins + i] = buf[i]; } -/// seg_relocate: 从 src_row 取出连续 seg_len 个元素,插入 dst_row 的 dst_pos -/// src_size 减 seg_len,dst_size 加 seg_len +/// seg_relocate: take contiguous seg_len elements from src_row, insert at dst_pos in dst_row +/// src_size -= seg_len, dst_size += seg_len __device__ inline void perm_seg_relocate(int* src_row, int& src_size, int* dst_row, int& dst_size, int src_pos, int dst_pos, int seg_len) { int buf[8]; int actual_len = (seg_len > 8) ? 
8 : seg_len; - // 保存片段 + // Save segment for (int i = 0; i < actual_len; i++) buf[i] = src_row[src_pos + i]; - // 源行:移除(左移) + // Source row: remove (shift left) for (int k = src_pos; k < src_size - actual_len; k++) src_row[k] = src_row[k + actual_len]; src_size -= actual_len; - // 目标行:插入(右移) + // Destination row: insert (shift right) for (int k = dst_size - 1; k >= dst_pos; k--) dst_row[k + actual_len] = dst_row[k]; for (int i = 0; i < actual_len; i++) @@ -234,29 +234,29 @@ __device__ inline void perm_seg_relocate(int* src_row, int& src_size, dst_size += actual_len; } -/// seg_swap: 两行各取一段互换(2-opt* 的通用形式) +/// seg_swap: swap one segment from each row (general 2-opt*) /// rowA[posA..posA+lenA) <-> rowB[posB..posB+lenB) -/// 行长变化:sizeA += (lenB - lenA), sizeB += (lenA - lenB) +/// Row lengths: sizeA += (lenB - lenA), sizeB += (lenA - lenB) __device__ inline void perm_seg_swap(int* rowA, int& sizeA, int posA, int lenA, int* rowB, int& sizeB, int posB, int lenB) { int bufA[8], bufB[8]; int aLen = (lenA > 8) ? 8 : lenA; int bLen = (lenB > 8) ? 
8 : lenB; - // 保存两段 + // Save both segments for (int i = 0; i < aLen; i++) bufA[i] = rowA[posA + i]; for (int i = 0; i < bLen; i++) bufB[i] = rowB[posB + i]; - // 从 rowA 移除 segA,腾出空间插入 segB - // 先移除 + // Remove segA from rowA to make room for segB + // Remove first int newSizeA = sizeA - aLen; for (int k = posA; k < newSizeA; k++) rowA[k] = rowA[k + aLen]; - // 再插入 segB + // Then insert segB for (int k = newSizeA - 1; k >= posA; k--) rowA[k + bLen] = rowA[k]; for (int i = 0; i < bLen; i++) rowA[posA + i] = bufB[i]; sizeA = newSizeA + bLen; - // 从 rowB 移除 segB,腾出空间插入 segA + // Remove segB from rowB to make room for segA int newSizeB = sizeB - bLen; for (int k = posB; k < newSizeB; k++) rowB[k] = rowB[k + bLen]; for (int k = newSizeB - 1; k >= posB; k--) rowB[k + aLen] = rowB[k]; @@ -264,10 +264,10 @@ __device__ inline void perm_seg_swap(int* rowA, int& sizeA, int posA, int lenA, sizeB = newSizeB + aLen; } -/// cross_exchange: 两行各取一段互换,保持各自内部顺序 -/// 与 seg_swap 的区别:seg_swap 是等长互换,cross_exchange 允许不等长 +/// cross_exchange: swap one segment from each row, preserving internal order each +/// Unlike seg_swap: seg_swap is equal-length swap; cross_exchange allows unequal lengths /// rowA[posA..posA+lenA) <-> rowB[posB..posB+lenB) -/// 行长变化:sizeA += (lenB - lenA), sizeB += (lenA - lenB) +/// Row lengths: sizeA += (lenB - lenA), sizeB += (lenA - lenB) __device__ inline void perm_cross_exchange(int* rowA, int& sizeA, int posA, int lenA, int* rowB, int& sizeB, int posB, int lenB) { int bufA[8], bufB[8]; @@ -277,14 +277,14 @@ __device__ inline void perm_cross_exchange(int* rowA, int& sizeA, int posA, int for (int i = 0; i < aLen; i++) bufA[i] = rowA[posA + i]; for (int i = 0; i < bLen; i++) bufB[i] = rowB[posB + i]; - // rowA: 移除 segA,插入 segB + // rowA: remove segA, insert segB int newSizeA = sizeA - aLen; for (int k = posA; k < newSizeA; k++) rowA[k] = rowA[k + aLen]; for (int k = newSizeA - 1; k >= posA; k--) rowA[k + bLen] = rowA[k]; for (int i = 0; i < bLen; i++) rowA[posA + 
i] = bufB[i]; sizeA = newSizeA + bLen; - // rowB: 移除 segB,插入 segA + // rowB: remove segB, insert segA int newSizeB = sizeB - bLen; for (int k = posB; k < newSizeB; k++) rowB[k] = rowB[k + bLen]; for (int k = newSizeB - 1; k >= posB; k--) rowB[k + aLen] = rowB[k]; @@ -292,8 +292,8 @@ __device__ inline void perm_cross_exchange(int* rowA, int& sizeA, int posA, int sizeB = newSizeB + aLen; } -/// k-bit flip: 同时翻转 k 个随机位(Binary 编码) -/// positions 数组存储要翻转的位置,k = 实际翻转数 +/// k-bit flip: flip k random bits at once (Binary encoding) +/// positions array holds indices to flip; k = number of flips __device__ inline void bin_k_flip(int* row, int size, int k, curandState* rng) { for (int i = 0; i < k; i++) { int pos = rand_int(rng, size); @@ -301,12 +301,12 @@ __device__ inline void bin_k_flip(int* row, int size, int k, curandState* rng) { } } -/// seg_flip: 翻转行内连续 seg_len 个位(Binary 编码) +/// seg_flip: flip contiguous seg_len bits within row (Binary encoding) __device__ inline void bin_seg_flip(int* row, int pos, int seg_len) { for (int i = 0; i < seg_len; i++) row[pos + i] = 1 - row[pos + i]; } -/// seg_cross_swap: 两行各取一段互换(Binary 编码,等长) +/// seg_cross_swap: swap one segment from each row (Binary encoding, equal length) __device__ inline void bin_seg_cross_swap(int* rowA, int posA, int* rowB, int posB, int seg_len) { for (int i = 0; i < seg_len; i++) { @@ -317,23 +317,23 @@ __device__ inline void bin_seg_cross_swap(int* rowA, int posA, } // ============================================================ -// Integer 编码底层操作 +// Integer encoding primitives // ============================================================ -/// int_clamp: 将值限制在 [lb, ub] 范围内 +/// int_clamp: clamp value to [lb, ub] __device__ inline int int_clamp(int v, int lb, int ub) { if (v < lb) return lb; if (v > ub) return ub; return v; } -/// int_random_reset: 随机一个位置重置为 [lb, ub] 内随机值 +/// int_random_reset: reset one random position to uniform random in [lb, ub] __device__ inline void int_random_reset(int* row, int 
pos, int lb, int ub, curandState* rng) { row[pos] = lb + (curand(rng) % (ub - lb + 1)); } -/// int_delta: 随机一个位置 ±k(clamp 到 [lb, ub]) +/// int_delta: random position, add ±k (clamped to [lb, ub]) __device__ inline void int_delta(int* row, int pos, int lb, int ub, curandState* rng) { int range = ub - lb + 1; @@ -343,7 +343,7 @@ __device__ inline void int_delta(int* row, int pos, int lb, int ub, row[pos] = int_clamp(row[pos] + step, lb, ub); } -/// int_seg_reset: 连续 k 个位置全部重置为 [lb, ub] 内随机值 +/// int_seg_reset: reset k contiguous positions to uniform random in [lb, ub] __device__ inline void int_seg_reset(int* row, int pos, int seg_len, int lb, int ub, curandState* rng) { int range = ub - lb + 1; @@ -351,7 +351,7 @@ __device__ inline void int_seg_reset(int* row, int pos, int seg_len, row[pos + i] = lb + (curand(rng) % range); } -/// int_k_delta: 随机 k 个位置各自 ±1 +/// int_k_delta: k random positions, each ±1 __device__ inline void int_k_delta(int* row, int size, int k, int lb, int ub, curandState* rng) { for (int i = 0; i < k; i++) { @@ -362,21 +362,21 @@ __device__ inline void int_k_delta(int* row, int size, int k, } // ============================================================ -// 第 3 层:行级底层操作 +// Layer 3: row-level primitives // ============================================================ -/// row_swap: 交换两行的全部内容和长度 +/// row_swap: swap full contents and lengths of two rows template __device__ inline void row_swap(Sol& sol, int r1, int r2) { - // 交换长度 + // Swap lengths int tmp_size = sol.dim2_sizes[r1]; sol.dim2_sizes[r1] = sol.dim2_sizes[r2]; sol.dim2_sizes[r2] = tmp_size; - // 交换数据(取两行中较长的长度) + // Swap data (use the longer of the two row lengths) int max_len = (sol.dim2_sizes[r1] > sol.dim2_sizes[r2]) ? sol.dim2_sizes[r1] : sol.dim2_sizes[r2]; - // 交换后 r1 的长度是原 r2 的,r2 的长度是原 r1 的 - // 所以需要交换 max(原r1长度, 原r2长度) 个元素 + // After swap, r1 has old r2 length and r2 has old r1 length + // So swap max(old r1 len, old r2 len) elements max_len = (tmp_size > max_len) ? 
tmp_size : max_len; for (int c = 0; c < max_len; c++) { int tmp = sol.data[r1][c]; @@ -385,8 +385,8 @@ __device__ inline void row_swap(Sol& sol, int r1, int r2) { } } -/// row_reverse: 反转 [r1, r2] 范围内的行排列顺序 -/// 例如 row_reverse(sol, 1, 4) 把行 1,2,3,4 变成 4,3,2,1 +/// row_reverse: reverse row order in [r1, r2] +/// e.g. row_reverse(sol, 1, 4) turns rows 1,2,3,4 into 4,3,2,1 template __device__ inline void row_reverse_range(Sol& sol, int r1, int r2) { while (r1 < r2) { @@ -395,23 +395,23 @@ __device__ inline void row_reverse_range(Sol& sol, int r1, int r2) { } } -/// row_split: 将 row 从 split_pos 处拆成两行 -/// row 保留 [0, split_pos),empty_row 接收 [split_pos, size) -/// 要求 empty_row 当前为空或有足够空间 +/// row_split: split row at split_pos into two rows +/// row keeps [0, split_pos), empty_row gets [split_pos, size) +/// requires empty_row empty or with enough space template __device__ inline void row_split(Sol& sol, int row, int empty_row, int split_pos) { int orig_size = sol.dim2_sizes[row]; int move_count = orig_size - split_pos; - // 复制后半段到 empty_row + // Copy tail to empty_row for (int i = 0; i < move_count; i++) sol.data[empty_row][i] = sol.data[row][split_pos + i]; sol.dim2_sizes[empty_row] = move_count; sol.dim2_sizes[row] = split_pos; } -/// row_merge: 将 src_row 的全部内容追加到 dst_row 末尾 -/// src_row 清空,dst_row 长度增加 -/// 要求 dst_size + src_size <= DIM2 +/// row_merge: append full contents of src_row to end of dst_row +/// src_row cleared, dst_row length increased +/// requires dst_size + src_size <= DIM2 template __device__ inline void row_merge(Sol& sol, int dst_row, int src_row) { int dst_size = sol.dim2_sizes[dst_row]; @@ -423,33 +423,33 @@ __device__ inline void row_merge(Sol& sol, int dst_row, int src_row) { } // ============================================================ -// 第 4 层:交叉底层操作 +// Layer 4: crossover primitives // ============================================================ // -// 排列编码:OX 家族(统一框架) -// 核心逻辑:A 中标记一组"保留位置"不动,空位按 B 的全局顺序填充 -// 三个变体只是"怎么选保留集"不同,填充逻辑完全共享 
-// 天然保证唯一性:从 B 中按序取不在保留集中的元素,不会重复 -// 行长度不变(= A 的行长度),行边界不变 +// Permutation encoding: OX family (unified framework) +// Core: mark "kept" positions from A; fill gaps in B's global order +// Three variants differ only in how the keep set is chosen; fill logic is shared +// Uniqueness: take from B in order elements not in keep set, no duplicates +// Row lengths unchanged (= A's row lengths), row boundaries unchanged // -// Binary 编码:uniform_crossover(逐元素随机选) +// Binary encoding: uniform_crossover (random pick per element) // // ============================================================ -// ---- OX 核心填充逻辑 ---- -// keep[r][c] = true 表示 child[r][c] 保留 A 的值,false 表示空位 -// 空位按 B 中元素的出现顺序(逐行扫描)填充 -// 要求:child 已拷贝自 A,dim2_sizes 已设为 A 的行长度 +// ---- OX core fill logic ---- +// keep[r][c] = true means child[r][c] keeps A's value; false = gap to fill +// Gaps filled in order of appearance of elements in B (row-major scan) +// Requires: child copied from A, dim2_sizes set to A's row lengths // -// 参数 total_elements: 分区模式下的总元素数,非分区模式下 = 单行长度 -// 用于确定 B 中扫描的元素范围 +// total_elements: total elements in partitioned mode; in non-partitioned = single row length +// Used to bound the scan range in B template __device__ inline void ox_fill_from_b(Sol& child, const Sol& parentB, const bool* keep_flat, int dim1, int total_elements) { - // 统计 A 中保留位置的每个值的出现次数(支持多重集排列) - // keep_flat 是按行展平的:keep_flat[r * DIM2 + c] + // Count occurrences of each value at kept positions in A (multiset permutations) + // keep_flat is row-major flat: keep_flat[r * DIM2 + c] int keep_count[512]; for (int i = 0; i < total_elements; i++) keep_count[i] = 0; @@ -460,21 +460,21 @@ __device__ inline void ox_fill_from_b(Sol& child, const Sol& parentB, if (v >= 0 && v < total_elements) keep_count[v]++; } - // 从 B 中按行扫描顺序收集:每个值只取"需要填充"的份数 - // 标准排列:每个值最多 1 份,多重集:每个值最多 repeat_count 份 + // Collect from B in row scan order: take only as many of each value as needed to fill + // Standard permutation: at most 1 of each 
value; multiset: up to repeat_count each int fill_buf[512]; int fill_count = 0; for (int r = 0; r < dim1; r++) for (int c = 0; c < parentB.dim2_sizes[r]; c++) { int val = parentB.data[r][c]; if (val >= 0 && val < total_elements && keep_count[val] > 0) { - keep_count[val]--; // 消耗一个保留名额 + keep_count[val]--; // consume one kept slot } else if (val >= 0 && val < total_elements) { fill_buf[fill_count++] = val; } } - // 按空位顺序(逐行从左到右)填入 + // Fill gaps in order (row by row, left to right) int fi = 0; for (int r = 0; r < dim1; r++) for (int c = 0; c < child.dim2_sizes[r]; c++) @@ -482,26 +482,26 @@ __device__ inline void ox_fill_from_b(Sol& child, const Sol& parentB, child.data[r][c] = fill_buf[fi++]; } -// ---- 变体 1: OX-区间 ---- -// 每行随机选一个连续区间保留,保留邻接关系 +// ---- Variant 1: OX-interval ---- +// Per row, random contiguous interval kept; preserves adjacency template __device__ inline void ox_interval(Sol& child, const Sol& parentA, const Sol& parentB, int dim1, int total_elements, curandState* rng) { bool keep[Sol::DIM1 * Sol::DIM2]; for (int i = 0; i < Sol::DIM1 * Sol::DIM2; i++) keep[i] = false; - // child = A,同时标记每行的保留区间 + // child = A, mark each row's kept interval for (int r = 0; r < dim1; r++) { int sz = parentA.dim2_sizes[r]; child.dim2_sizes[r] = sz; for (int c = 0; c < sz; c++) child.data[r][c] = parentA.data[r][c]; if (sz < 2) { - // 长度 0 或 1:全部保留 + // length 0 or 1: keep all for (int c = 0; c < sz; c++) keep[r * Sol::DIM2 + c] = true; continue; } - // 随机选区间 [lo, hi] + // Random interval [lo, hi] int lo = rand_int(rng, sz); int hi = rand_int(rng, sz); if (lo > hi) { int tmp = lo; lo = hi; hi = tmp; } @@ -511,8 +511,8 @@ __device__ inline void ox_interval(Sol& child, const Sol& parentA, const Sol& pa ox_fill_from_b(child, parentB, keep, dim1, total_elements); } -// ---- 变体 2: OX-子集 ---- -// 随机选约 50% 的元素值保留其在 A 中的位置,通用性最强 +// ---- Variant 2: OX-subset ---- +// Randomly keep ~50% of positions at their A values; most general template __device__ inline void 
ox_subset(Sol& child, const Sol& parentA, const Sol& parentB, int dim1, int total_elements, curandState* rng) { @@ -526,7 +526,7 @@ __device__ inline void ox_subset(Sol& child, const Sol& parentA, const Sol& pare child.data[r][c] = parentA.data[r][c]; } - // 每个位置 50% 概率保留 + // 50% keep per position for (int r = 0; r < dim1; r++) for (int c = 0; c < child.dim2_sizes[r]; c++) keep[r * Sol::DIM2 + c] = (curand_uniform(rng) < 0.5f); @@ -534,9 +534,9 @@ __device__ inline void ox_subset(Sol& child, const Sol& parentA, const Sol& pare ox_fill_from_b(child, parentB, keep, dim1, total_elements); } -// ---- 变体 3: OX-行 ---- -// 随机选若干整行保留,其余行的元素全部按 B 的顺序重填 -// 保留整条路线结构,VRP 受益 +// ---- Variant 3: OX-row ---- +// Randomly keep whole rows; refill non-kept rows from B's order +// Preserves full route structure; good for VRP template __device__ inline void ox_row(Sol& child, const Sol& parentA, const Sol& parentB, int dim1, int total_elements, curandState* rng) { @@ -550,7 +550,7 @@ __device__ inline void ox_row(Sol& child, const Sol& parentA, const Sol& parentB child.data[r][c] = parentA.data[r][c]; } - // 每行 50% 概率整行保留 + // 50% chance to keep whole row int kept = 0; for (int r = 0; r < dim1; r++) { if (curand_uniform(rng) < 0.5f) { @@ -559,14 +559,14 @@ __device__ inline void ox_row(Sol& child, const Sol& parentA, const Sol& parentB kept++; } } - // 确保不是全保留或全不保留 + // Ensure not all-kept or all-unkept if (kept == 0) { int r = rand_int(rng, dim1); - // 不标记任何 keep → 全部重填(至少有一行不保留) - // 实际上 kept==0 意味着全部重填,这是合法的(child = B 的顺序填入 A 的结构) + // No keep marks → full refill (at least one row not kept) + // kept==0 means full refill; valid (child gets B's order into A's structure) } if (kept == dim1 && dim1 > 1) { - // 全保留 → 随机取消一行 + // All kept → randomly un-keep one row int r = rand_int(rng, dim1); for (int c = 0; c < child.dim2_sizes[r]; c++) keep[r * Sol::DIM2 + c] = false; @@ -575,14 +575,14 @@ __device__ inline void ox_row(Sol& child, const Sol& parentA, const Sol& parentB 
ox_fill_from_b(child, parentB, keep, dim1, total_elements); } -// ---- OX 统一入口 ---- -// 随机选一个变体执行 -// dim1==1 时只用区间和子集(行变体无意义) +// ---- OX unified entry ---- +// Pick one variant at random +// When dim1==1 use only interval and subset (row variant useless) template __device__ inline void perm_ox_crossover(Sol& child, const Sol& parentA, const Sol& parentB, int dim1, int total_elements, curandState* rng) { int n_variants = (dim1 > 1) ? 3 : 2; - int variant = rand_int(rng, n_variants); // 0: 区间, 1: 子集, [2: 行] + int variant = rand_int(rng, n_variants); // 0: interval, 1: subset, [2: row] switch (variant) { case 0: ox_interval(child, parentA, parentB, dim1, total_elements, rng); break; case 1: ox_subset(child, parentA, parentB, dim1, total_elements, rng); break; @@ -590,8 +590,8 @@ __device__ inline void perm_ox_crossover(Sol& child, const Sol& parentA, const S } } -/// uniform_crossover: 逐元素从两个父代中随机选择 -/// 适用于 Binary 编码(不破坏排列约束) +/// uniform_crossover: random parent choice per element +/// Suitable for Binary encoding (does not break permutation constraints) template __device__ inline void uniform_crossover(Sol& child, const Sol& parentA, const Sol& parentB, int dim1, curandState* rng) { @@ -607,15 +607,15 @@ __device__ inline void uniform_crossover(Sol& child, const Sol& parentA, const S } } -// [已删除] generate_move_for_seq / sample_and_generate / apply_move / undo_move -// P0 重构后主路径统一使用 execute_sequence,旧的 Move 生成+应用+撤销路径不再需要 +// [removed] generate_move_for_seq / sample_and_generate / apply_move / undo_move +// After P0 refactor the main path uses execute_sequence; old Move gen/apply/undo path removed // ============================================================ -// execute_sequence — 统一接口:生成参数并直接执行(不返回 Move) +// execute_sequence — unified API: generate params and execute directly (no Move returned) // ============================================================ -// 返回 true 若 sol 被修改,false 若 NOOP -// d_G, d_O, rel_N: 可选的关系矩阵指针(SEQ_LNS_GUIDED_REBUILD 使用) -// val_lb, 
val_ub: Integer 编码的值域范围(其他编码忽略) +// Returns true if sol modified, false if NOOP +// d_G, d_O, rel_N: optional relation matrices (for SEQ_LNS_GUIDED_REBUILD) +// val_lb, val_ub: Integer encoding value range (ignored for other encodings) template __device__ inline bool execute_sequence(int seq_id, Sol& sol, int dim1, @@ -627,7 +627,7 @@ __device__ inline bool execute_sequence(int seq_id, Sol& sol, int dim1, int val_ub = 1, const void* prob_data = nullptr) { // ============================================================ - // Permutation 序列 + // Permutation sequences // ============================================================ if (encoding == EncodingType::Permutation) { switch (seq_id) { @@ -841,15 +841,15 @@ __device__ inline bool execute_sequence(int seq_id, Sol& sol, int dim1, return true; } case seq::SEQ_LNS_GUIDED_REBUILD: { - // 关系矩阵引导重建: - // 1. 随机选种子元素 seed - // 2. 查 G[seed] 找分组倾向最强的 K 个元素 - // 3. 在解中找到这些元素的位置 - // 4. 按 O 矩阵引导的顺序重排这些位置的元素 + // Relation-matrix guided rebuild: + // 1. Pick random seed element seed + // 2. Look up G[seed] for K elements with strongest grouping affinity + // 3. Find positions of these elements in the solution + // 4. Reorder these positions by order guided by O matrix // - // 如果没有关系矩阵(冷启动),退化为 scatter_shuffle + // Without relation matrices (cold start), fall back to scatter_shuffle if (!d_G || !d_O || rel_N <= 0) { - // 退化:随机 scatter shuffle + // Fallback: random scatter shuffle int row = (dim1 > 1) ? 
rand_int(rng, dim1) : 0; int sz = sol.dim2_sizes[row]; if (sz < 4) return false; @@ -872,21 +872,21 @@ __device__ inline bool execute_sequence(int seq_id, Sol& sol, int dim1, return true; } - // --- 有关系矩阵:引导重建 --- - // 通用策略(不感知问题类型): - // G 矩阵 → 选哪些元素(分组倾向弱的 = 可能放错位置的) - // O 矩阵 → 怎么排(排序倾向引导重排顺序) - // 两者协同:G 选人,O 排序 + // --- With relation matrices: guided rebuild --- + // Generic strategy (problem-agnostic): + // G matrix → which elements (weak grouping with seed = likely misplaced) + // O matrix → how to order (ordering affinity guides reorder) + // Together: G picks, O orders int row = (dim1 > 1) ? rand_int(rng, dim1) : 0; int sz = sol.dim2_sizes[row]; if (sz < 4) return false; - // 选种子元素 + // Pick seed element int seed_pos = rand_int(rng, sz); int seed_val = sol.data[row][seed_pos]; if (seed_val < 0 || seed_val >= rel_N) return false; - // 检查矩阵是否有足够信息(G 和 O 任一有信号即可) + // Check matrices have enough signal (either G or O) float max_signal = 0.0f; for (int c = 0; c < sz; c++) { int v = sol.data[row][c]; @@ -897,11 +897,11 @@ __device__ inline bool execute_sequence(int seq_id, Sol& sol, int dim1, if (o > max_signal) max_signal = o; } } - if (max_signal < 0.05f) return false; // 信息不足,跳过 + if (max_signal < 0.05f) return false; // insufficient signal, skip - // 破坏:锦标赛选择 G 值低的元素(t=2) - // G 值低 = 与 seed 分组倾向弱 = 可能放错位置 - // 锦标赛:随机抽 2 个,取 G 值更低的那个,重复 count 次 + // Destroy: tournament pick low-G elements (t=2) + // Low G = weak grouping with seed = likely misplaced + // Tournament: draw 2 at random, take lower G, repeat count times constexpr int MAX_REBUILD = 10; constexpr int TOUR_SIZE = 2; int count = sz / 5; // ~20% @@ -911,12 +911,12 @@ __device__ inline bool execute_sequence(int seq_id, Sol& sol, int dim1, int sel_pos[MAX_REBUILD]; int sel_val[MAX_REBUILD]; - bool used[128] = {}; // 标记已选位置,防止重复 + bool used[128] = {}; // mark chosen positions to avoid duplicates int picked = 0; - int max_attempts = count * 4; // 防止死循环 + int max_attempts = count * 4; // avoid infinite 
loop for (int attempt = 0; attempt < max_attempts && picked < count; attempt++) { - // 锦标赛:随机抽 TOUR_SIZE 个候选,取 G 值最低的 + // Tournament: draw TOUR_SIZE candidates at random, take lowest G int best_c = -1; float best_g = 1e30f; for (int t = 0; t < TOUR_SIZE; t++) { @@ -936,15 +936,15 @@ __device__ inline bool execute_sequence(int seq_id, Sol& sol, int dim1, if (picked < 2) return false; count = picked; - // 修复:锦标赛排序(O 矩阵引导 + 随机扰动) - // 插入排序,比较时加噪声实现概率性:O 值高的大概率排前面,但不绝对 + // Repair: tournament sort (O-guided + random noise) + // Insertion sort with noisy comparison: high O tends to go first, not guaranteed for (int i = 1; i < count; i++) { int key = sel_val[i]; int j = i - 1; while (j >= 0) { float o_key_before = d_O[key * rel_N + sel_val[j]]; float o_j_before = d_O[sel_val[j] * rel_N + key]; - // 噪声幅度 0.05:O 值差距 >0.05 时基本确定,<0.05 时随机 + // Noise scale 0.05: if O gap >0.05 mostly deterministic, else random float noise = (curand_uniform(rng) - 0.5f) * 0.1f; if (o_key_before + noise > o_j_before) { sel_val[j + 1] = sel_val[j]; @@ -956,7 +956,7 @@ __device__ inline bool execute_sequence(int seq_id, Sol& sol, int dim1, sel_val[j + 1] = key; } - // 对 sel_pos 排序(升序),使写回位置有序 + // Sort sel_pos ascending so write-back order is stable for (int i = 1; i < count; i++) { int key = sel_pos[i]; int j = i - 1; @@ -967,7 +967,7 @@ __device__ inline bool execute_sequence(int seq_id, Sol& sol, int dim1, sel_pos[j + 1] = key; } - // 检查是否真的改变了排列 + // Check whether permutation actually changed bool any_change = false; for (int i = 0; i < count; i++) { if (sol.data[row][sel_pos[i]] != sel_val[i]) { @@ -977,7 +977,7 @@ __device__ inline bool execute_sequence(int seq_id, Sol& sol, int dim1, } if (!any_change) return false; - // 写回 + // Write back for (int i = 0; i < count; i++) { sol.data[row][sel_pos[i]] = sel_val[i]; } @@ -989,7 +989,7 @@ __device__ inline bool execute_sequence(int seq_id, Sol& sol, int dim1, } // ============================================================ - // Binary 序列 + 
// Binary sequences // ============================================================ if (encoding == EncodingType::Binary) { switch (seq_id) { @@ -1063,7 +1063,7 @@ __device__ inline bool execute_sequence(int seq_id, Sol& sol, int dim1, } // ============================================================ - // Integer 序列 + // Integer sequences // ============================================================ if (encoding == EncodingType::Integer) { switch (seq_id) { @@ -1131,7 +1131,7 @@ __device__ inline bool execute_sequence(int seq_id, Sol& sol, int dim1, } // ============================================================ - // 共享:行级序列(编码无关) + // Shared: row-level sequences (encoding-agnostic) // ============================================================ switch (seq_id) { case seq::SEQ_ROW_SWAP: { @@ -1194,11 +1194,11 @@ __device__ inline bool execute_sequence(int seq_id, Sol& sol, int dim1, } // ============================================================ -// sample_and_execute — 从 SeqRegistry 按权重采样 + 直接执行 +// sample_and_execute — sample from SeqRegistry by weight and execute directly // ============================================================ -// 返回 true 若 sol 被修改,false 若 NOOP -// 输出参数 out_seq_idx:采样到的序列在 registry 中的索引 -// d_G, d_O, rel_N: 可选的关系矩阵(传递给 execute_sequence) +// Returns true if sol modified, false if NOOP +// out_seq_idx: index of sampled sequence in registry +// d_G, d_O, rel_N: optional relation matrices (passed to execute_sequence) template __device__ inline bool sample_and_execute(const SeqRegistry& reg, @@ -1212,7 +1212,7 @@ __device__ inline bool sample_and_execute(const SeqRegistry& reg, int val_lb = 0, int val_ub = 1, const void* prob_data = nullptr) { - // 延迟归一化:使用缓存的 weights_sum + // Lazy normalization: use cached weights_sum float r = curand_uniform(rng) * reg.weights_sum; // r ∈ [0, weights_sum) float cumsum = 0.0f; out_seq_idx = reg.count - 1; diff --git a/prototype/core/population.cuh b/prototype/core/population.cuh index 4418ea8..338e548 
100644 --- a/prototype/core/population.cuh +++ b/prototype/core/population.cuh @@ -1,10 +1,10 @@ /** - * population.cuh - 种群管理 + * population.cuh - Population management * - * v2.0: Block 级架构 - * - RNG 数组大小 = pop_size * block_size(每个 block 内每个线程独立 RNG) - * - 初始化 kernel 保持 1-thread-per-solution(初始化只做一次,不需要并行) - * - find_best_kernel 保持单线程(种群规模不大) + * v2.0: Block-level architecture + * - RNG array size = pop_size * block_size (one independent RNG per thread within each block) + * - Init kernel stays 1-thread-per-solution (initialization runs once; parallelism not needed) + * - find_best_kernel remains single-threaded (population size is modest) */ #pragma once @@ -12,7 +12,7 @@ #include "cuda_utils.cuh" // ============================================================ -// Device 端 Kernel(模板化) +// Device-side kernels (templated) // ============================================================ template @@ -65,9 +65,9 @@ __global__ void init_integer_kernel(Sol* pop, int pop_size, } // ============================================================ -// 多重集排列初始化 — 每个值 [0, N) 重复 R 次,总长度 N*R +// Multiset permutation init — each value in [0, N) repeated R times, total length N*R // ============================================================ -// 用于 JSP 工序排列编码:N=num_jobs, R=num_ops,值 j 出现 R 次表示工件 j +// For JSP operation-sequence encoding: N=num_jobs, R=num_ops; value j appearing R times means job j template __global__ void init_multiset_perm_kernel(Sol* pop, int pop_size, @@ -90,7 +90,7 @@ __global__ void init_multiset_perm_kernel(Sol* pop, int pop_size, } // ============================================================ -// 分区初始化 — 元素 {0..total_elements-1} 不重复分配到 dim1 行 +// Partition init — elements {0..total_elements-1} assigned without duplication across dim1 rows // ============================================================ template @@ -131,21 +131,21 @@ __global__ void find_best_kernel(const Sol* pop, int pop_size, } // 
============================================================ -// Host 端 RAII 类(模板化) +// Host-side RAII class (templated) // ============================================================ template class Population { public: Sol* d_solutions = nullptr; - curandState* d_rng_states = nullptr; // 大小 = pop_size * block_size + curandState* d_rng_states = nullptr; // size = pop_size * block_size int size = 0; - int rng_count = 0; // RNG 状态总数 + int rng_count = 0; // total RNG states Population() = default; - // block_size: Block 级架构下每个 block 的线程数 - // RNG 数组大小 = pop_size * block_size(每个 block 内每个线程独立 RNG) + // block_size: threads per block under block-level architecture + // RNG array size = pop_size * block_size (one independent RNG per thread within each block) void allocate(int pop_size, int block_size = 128) { size = pop_size; rng_count = pop_size * block_size; diff --git a/prototype/core/relation_matrix.cuh b/prototype/core/relation_matrix.cuh index 89fb2ea..0fc0548 100644 --- a/prototype/core/relation_matrix.cuh +++ b/prototype/core/relation_matrix.cuh @@ -1,20 +1,20 @@ /** - * relation_matrix.cuh - G/O 关系矩阵管理 + * relation_matrix.cuh - G/O relation matrix management * - * G[i][j]: 分组倾向(元素 i 和 j 应在同一行的倾向,对称) - * O[i][j]: 排序倾向(元素 i 应排在 j 前面的倾向,不对称) + * G[i][j]: grouping affinity (tendency for elements i and j to be on the same row; symmetric) + * O[i][j]: ordering affinity (tendency for element i to appear before j; asymmetric) * - * 更新来源:历史最优解统计 - * 每当 host 端获取到当前 best 解,扫描所有元素对关系: - * - 同行 → G[i][j] 增强 - * - i 在 j 前 → O[i][j] 增强 - * 使用 EMA 衰减:M[i][j] = α * M[i][j] + (1-α) * signal + * Update source: statistics from historical best solutions + * Whenever the host obtains the current best solution, scan all element-pair relations: + * - Same row → strengthen G[i][j] + * - i before j → strengthen O[i][j] + * EMA decay: M[i][j] = α * M[i][j] + (1-α) * signal * - * 生命周期: - * 1. relation_matrix_create(N) — 分配 host/device 内存,初始化为 0 - * 2. 
relation_matrix_update(rm, sol, dim1) — 从一个解更新 G/O(host 端) - * 3. relation_matrix_upload(rm) — 上传 h_G/h_O 到 d_G/d_O - * 4. relation_matrix_destroy(rm) — 释放内存 + * Lifecycle: + * 1. relation_matrix_create(N) — allocate host/device memory, initialize to 0 + * 2. relation_matrix_update(rm, sol, dim1) — update G/O from one solution (host) + * 3. relation_matrix_upload(rm) — upload h_G/h_O to d_G/d_O + * 4. relation_matrix_destroy(rm) — free memory */ #pragma once @@ -23,7 +23,7 @@ #include // ============================================================ -// 创建 / 销毁 +// Create / destroy // ============================================================ inline RelationMatrix relation_matrix_create(int N, float decay = 0.95f) { @@ -58,19 +58,19 @@ inline void relation_matrix_destroy(RelationMatrix& rm) { } // ============================================================ -// 从一个解更新 G/O(host 端) +// Update G/O from one solution (host) // ============================================================ -// sol: 当前最优解(已下载到 host) -// dim1: 实际使用的行数 +// sol: current best solution (already copied to host) +// dim1: number of rows in use // -// 逻辑: -// 对 sol 中每对元素 (val_a, val_b): -// 如果在同一行 → G[val_a][val_b] 增强 -// 如果 val_a 在 val_b 前面 → O[val_a][val_b] 增强 +// Logic: +// For each pair (val_a, val_b) in sol: +// If on the same row → strengthen G[val_a][val_b] +// If val_a appears before val_b → strengthen O[val_a][val_b] // -// 注意:元素值 val 必须在 [0, N) 范围内才有意义 -// 对于 partition 编码(VRP),元素值就是客户编号 -// 对于单行排列(TSP),元素值就是城市编号 +// Note: element values val are meaningful only in [0, N) +// For partition encoding (VRP), values are customer IDs +// For single-row permutation (TSP), values are city IDs template void relation_matrix_update(RelationMatrix& rm, const Sol& sol, int dim1) { @@ -78,13 +78,13 @@ void relation_matrix_update(RelationMatrix& rm, const Sol& sol, int dim1) { float alpha = rm.decay; float signal_strength = 1.0f; - // 衰减所有现有值 + // Decay all existing values for (int i = 0; i < N * N; i++) 
{ rm.h_G[i] *= alpha; rm.h_O[i] *= alpha; } - // 扫描解中的元素对关系 + // Scan element-pair relations in the solution for (int r = 0; r < dim1; r++) { int sz = sol.dim2_sizes[r]; for (int c1 = 0; c1 < sz; c1++) { @@ -95,17 +95,17 @@ void relation_matrix_update(RelationMatrix& rm, const Sol& sol, int dim1) { int val_b = sol.data[r][c2]; if (val_b < 0 || val_b >= N) continue; - // 同行 → G 增强(对称) + // Same row → strengthen G (symmetric) rm.h_G[val_a * N + val_b] += (1.0f - alpha) * signal_strength; rm.h_G[val_b * N + val_a] += (1.0f - alpha) * signal_strength; - // val_a 在 val_b 前 → O[val_a][val_b] 增强 + // val_a before val_b → strengthen O[val_a][val_b] rm.h_O[val_a * N + val_b] += (1.0f - alpha) * signal_strength; } } } - // 裁剪到 [0, 1] + // Clamp to [0, 1] for (int i = 0; i < N * N; i++) { if (rm.h_G[i] > 1.0f) rm.h_G[i] = 1.0f; if (rm.h_O[i] > 1.0f) rm.h_O[i] = 1.0f; @@ -115,7 +115,7 @@ void relation_matrix_update(RelationMatrix& rm, const Sol& sol, int dim1) { } // ============================================================ -// 上传到 GPU +// Upload to GPU // ============================================================ inline void relation_matrix_upload(const RelationMatrix& rm) { diff --git a/prototype/core/solver.cuh b/prototype/core/solver.cuh index e27a38a..161bd4d 100644 --- a/prototype/core/solver.cuh +++ b/prototype/core/solver.cuh @@ -1,14 +1,14 @@ /** - * solver.cuh - 主求解循环 + * solver.cuh - Main solve loop * - * v2.0: Block 级架构重构 - * - 1 block = 1 solution(邻域并行) - * - Solution 存放在 shared memory - * - 每代:K 个线程各自生成候选 move + 评估 delta → 归约选最优 → thread 0 执行 - * - 交叉暂用简化版(thread 0 执行,其余线程等待) - * - 迁移/精英注入保持单线程 kernel(操作全局内存) + * v2.0: Block-level architecture refactor + * - 1 block = 1 solution (neighborhood parallelism) + * - Solution lives in shared memory + * - Each generation: K threads each propose a candidate move + evaluate delta -> reduce to best -> thread 0 applies + * - Crossover uses a simplified path for now (thread 0 runs crossover, others wait) + * - Migration 
/ elite injection remain single-thread kernels (global memory) * - * 要求 Problem 接口: + * Required Problem interface: * size_t shared_mem_bytes() const; * __device__ void load_shared(char* smem, int tid, int bsz); * __device__ void evaluate(Sol& sol) const; @@ -25,16 +25,16 @@ #include // ============================================================ -// 编译时常量 +// Compile-time constants // ============================================================ -constexpr int BLOCK_LEVEL_THREADS = 128; // Block 级架构的默认线程数/block +constexpr int BLOCK_LEVEL_THREADS = 128; // Default threads per block for block-level architecture // ============================================================ -// EvolveParams — CUDA Graph 可变参数(device memory) +// EvolveParams — CUDA Graph mutable parameters (device memory) // ============================================================ -// 将每个 batch 会变化的参数集中到一个 struct 中, -// evolve_block_kernel 通过指针读取,CUDA Graph 录制时绑定指针。 -// 每次 replay 前只需 cudaMemcpy 更新这块 device memory。 +// Per-batch parameters are packed into one struct; +// evolve_block_kernel reads via pointer; CUDA Graph capture binds the pointer. +// Before each replay, only cudaMemcpy this device memory block. 
struct EvolveParams { float temp_start; @@ -46,13 +46,13 @@ struct EvolveParams { }; // ============================================================ -// 工具:协作加载/存储 Solution(shared memory ↔ global memory) +// Helpers: cooperative load/store Solution (shared memory ↔ global memory) // ============================================================ template __device__ inline void cooperative_load_sol(Sol& dst, const Sol& src, int tid, int num_threads) { - // 按 int 粒度协作拷贝整个 Solution 结构体 + // Cooperative copy of entire Solution struct in int-sized chunks const int* src_ptr = reinterpret_cast(&src); int* dst_ptr = reinterpret_cast(&dst); constexpr int n_ints = (sizeof(Sol) + sizeof(int) - 1) / sizeof(int); @@ -63,11 +63,11 @@ __device__ inline void cooperative_load_sol(Sol& dst, const Sol& src, template __device__ inline void cooperative_store_sol(Sol& dst, const Sol& src, int tid, int num_threads) { - cooperative_load_sol(dst, src, tid, num_threads); // 同样的拷贝逻辑 + cooperative_load_sol(dst, src, tid, num_threads); // Same copy logic } // ============================================================ -// Kernel 1: 初始评估(只调用一次,1 block = 1 solution) +// Kernel 1: Initial evaluation (once; 1 block = 1 solution) // ============================================================ template @@ -77,27 +77,27 @@ __global__ void evaluate_kernel(Problem prob, Sol* pop, int pop_size, Problem lp = prob; if (smem_size > 0) { lp.load_shared(smem, threadIdx.x, blockDim.x); __syncthreads(); } - // 1-thread-per-solution 初始评估(保持简单,只调用一次) + // One-thread-per-solution initial evaluation (simple; called once) int tid = blockIdx.x * blockDim.x + threadIdx.x; if (tid < pop_size) lp.evaluate(pop[tid]); } // ============================================================ -// Kernel 2: Block 级批量进化(邻域并行) +// Kernel 2: Block-level batched evolution (neighborhood parallelism) // ============================================================ // -// 每代流程: -// 1. K 个线程各自生成一个候选 move -// 2. 
K 个线程各自评估 move 的 delta(不修改 shared memory 中的 sol) -// 3. Block 内归约:选 delta 最小的 move -// 4. Thread 0 决定是否接受(SA / HC) -// 5. Thread 0 执行最优 move 并更新 sol -// 6. __syncthreads() 让所有线程看到更新后的 sol +// Per-generation flow: +// 1. Each of K threads generates one candidate move +// 2. Each thread evaluates delta for its move (does not modify sol in shared memory) +// 3. Block reduction: pick move with smallest delta +// 4. Thread 0 accepts or rejects (SA / HC) +// 5. Thread 0 applies best move and updates sol +// 6. __syncthreads() so all threads see updated sol // -// Solution 在 shared memory 中,Problem 数据也在 shared memory 中 +// Solution and Problem data live in shared memory // ============================================================ -// MultiStepCandidate — 多步执行结果(用于归约) +// MultiStepCandidate — multi-step result (for reduction) // ============================================================ struct MultiStepCandidate { float delta; @@ -135,23 +135,23 @@ __global__ void evolve_block_kernel(Problem prob, Sol* pop, int pop_size, const float temp_start = d_params->temp_start; const ObjConfig oc = d_params->oc; - // --- shared memory 布局 --- + // --- shared memory layout --- // [0 .. sizeof(Sol)-1] : Solution - // [sizeof(Sol) .. sizeof(Sol)+prob_smem-1] : Problem 数据 - // [之后 .. ] : MultiStepCandidate[num_threads] 归约工作区 - // [之后 .. ] : AOSStats (如果启用) + // [sizeof(Sol) .. sizeof(Sol)+prob_smem-1] : Problem data + // [after .. ] : MultiStepCandidate[num_threads] reduction workspace + // [after .. 
] : AOSStats (if enabled) Sol* s_sol = reinterpret_cast(smem); char* prob_smem_ptr = smem + sizeof(Sol); MultiStepCandidate* s_cands = reinterpret_cast( smem + sizeof(Sol) + prob_smem_size); - // AOS 统计(在 MultiStepCandidate 数组之后) + // AOS stats (after MultiStepCandidate array) AOSStats* s_aos = nullptr; if (d_aos_stats) { s_aos = reinterpret_cast( smem + sizeof(Sol) + prob_smem_size + sizeof(MultiStepCandidate) * num_threads); - // Thread 0 初始化 AOS 计数器 + // Thread 0 initializes AOS counters if (tid == 0) { for (int i = 0; i < MAX_SEQ; i++) { s_aos->usage[i] = 0; @@ -164,13 +164,13 @@ __global__ void evolve_block_kernel(Problem prob, Sol* pop, int pop_size, } } - // 加载 Problem 数据到 shared memory + // Load Problem data into shared memory Problem lp = prob; if (prob_smem_size > 0) { lp.load_shared(prob_smem_ptr, tid, num_threads); } - // 协作加载 Solution 到 shared memory + // Cooperatively load Solution into shared memory cooperative_load_sol(*s_sol, pop[bid], tid, num_threads); __syncthreads(); @@ -181,12 +181,12 @@ __global__ void evolve_block_kernel(Problem prob, Sol* pop, int pop_size, for (int g = 0; g < gens_per_batch; g++) { // ============================================================ - // Step 1: 每个线程独立采样 K 步数 + K 个序列,在 local copy 上执行 + // Step 1: Each thread independently samples K steps + K sequences on local copy // ============================================================ - // 采样 K(步数):按 kstep.weights 权重 + // Sample K (step count): weighted by kstep.weights float kr = curand_uniform(&rng); - int my_k = 1; // 默认 K=1 + int my_k = 1; // default K=1 { float cum = 0.0f; for (int i = 0; i < MAX_K; i++) { @@ -195,7 +195,7 @@ __global__ void evolve_block_kernel(Problem prob, Sol* pop, int pop_size, } } - // 在 local memory 拷贝 sol,执行 K 步 move + // Copy sol in local memory, apply K moves Sol local_sol = *s_sol; MultiStepCandidate my_cand; my_cand.k_steps = my_k; @@ -215,7 +215,7 @@ __global__ void evolve_block_kernel(Problem prob, Sol* pop, int pop_size, if 
(changed) all_noop = false; } - // Step 2: 评估最终 delta(K 步之后 vs 原始 sol) + // Step 2: Evaluate final delta (after K steps vs original sol) if (all_noop) { my_cand.delta = 1e30f; my_cand.new_penalty = s_sol->penalty; @@ -242,7 +242,7 @@ __global__ void evolve_block_kernel(Problem prob, Sol* pop, int pop_size, s_cands[tid] = my_cand; __syncthreads(); - // Step 3: Block 内并行归约,找 delta 最小的 candidate + // Step 3: Parallel reduction in block to find candidate with smallest delta for (int stride = num_threads / 2; stride > 0; stride >>= 1) { if (tid < stride) { if (s_cands[tid + stride].delta < s_cands[tid].delta) @@ -251,7 +251,7 @@ __global__ void evolve_block_kernel(Problem prob, Sol* pop, int pop_size, __syncthreads(); } - // Step 4: Thread 0 决定是否接受 + // Step 4: Thread 0 decides accept/reject if (tid == 0) { MultiStepCandidate& best = s_cands[0]; bool has_valid = (best.delta < 1e29f); @@ -269,7 +269,7 @@ __global__ void evolve_block_kernel(Problem prob, Sol* pop, int pop_size, } if (accept) { - // AOS 统计:K 层 + 算子层 + // AOS stats: K layer + operator layer if (s_aos) { int ki = best.k_steps - 1; if (ki >= 0 && ki < MAX_K) { @@ -304,23 +304,23 @@ __global__ void evolve_block_kernel(Problem prob, Sol* pop, int pop_size, __syncthreads(); } - // 写回 Solution 到全局内存 + // Write Solution back to global memory cooperative_store_sol(pop[bid], *s_sol, tid, num_threads); - // AOS 统计写回全局内存 + // Write AOS stats back to global memory if (d_aos_stats && tid == 0) { d_aos_stats[bid] = *s_aos; } - // 保存 RNG 状态 + // Save RNG state rng_states[rng_idx] = rng; } // ============================================================ -// Kernel 2b: Block 级交叉操作 +// Kernel 2b: Block-level crossover // ============================================================ -// 简化版:thread 0 执行交叉逻辑,其余线程协作加载/存储 -// 后续 Phase 3 会实现多线程协作交叉 +// Simplified: thread 0 runs crossover; others cooperative load/store +// Phase 3 may add multi-thread cooperative crossover template __global__ void crossover_block_kernel(Problem prob, 
Sol* pop, int pop_size, @@ -338,7 +338,7 @@ __global__ void crossover_block_kernel(Problem prob, Sol* pop, int pop_size, if (bid >= pop_size) return; - // shared memory 布局:Sol + Problem data + // Shared memory layout: Sol + Problem data Sol* s_sol = reinterpret_cast(smem); char* prob_smem_ptr = smem + sizeof(Sol); @@ -350,7 +350,7 @@ __global__ void crossover_block_kernel(Problem prob, Sol* pop, int pop_size, cooperative_load_sol(*s_sol, pop[bid], tid, K); __syncthreads(); - // Thread 0 执行交叉逻辑 + // Thread 0 runs crossover if (tid == 0) { int rng_idx = bid * K; curandState rng = rng_states[rng_idx]; @@ -389,12 +389,12 @@ __global__ void crossover_block_kernel(Problem prob, Sol* pop, int pop_size, } __syncthreads(); - // 写回(可能被交叉更新了) + // Write back (possibly updated by crossover) cooperative_store_sol(pop[bid], *s_sol, tid, K); } // ============================================================ -// Kernel 3: 岛屿间迁移(保持不变,单线程 kernel) +// Kernel 3: Inter-island migration (unchanged; single-thread kernel) // ============================================================ template @@ -406,6 +406,8 @@ __device__ inline int find_worst_in_island(const Sol* pop, int base, int island_ return worst; } +constexpr int MAX_ISLANDS = 64; + template __global__ void migrate_kernel(Sol* pop, int pop_size, int island_size, ObjConfig oc, @@ -414,8 +416,10 @@ __global__ void migrate_kernel(Sol* pop, int pop_size, int island_size, if (threadIdx.x != 0 || blockIdx.x != 0) return; int round = d_params->migrate_round; int num_islands = pop_size / island_size; + if (num_islands > MAX_ISLANDS) num_islands = MAX_ISLANDS; + if (num_islands <= 1) return; - int candidates[64]; + int candidates[MAX_ISLANDS]; for (int isle = 0; isle < num_islands; isle++) { int base = isle * island_size; int best = base; @@ -424,9 +428,9 @@ __global__ void migrate_kernel(Sol* pop, int pop_size, int island_size, candidates[isle] = best; } - int topn[64]; + int topn[MAX_ISLANDS]; if (strategy == MigrateStrategy::TopN || 
strategy == MigrateStrategy::Hybrid) { - bool selected[64] = {}; + bool selected[MAX_ISLANDS] = {}; for (int t = 0; t < num_islands; t++) { int best_c = -1; for (int c = 0; c < num_islands; c++) { @@ -459,7 +463,7 @@ __global__ void migrate_kernel(Sol* pop, int pop_size, int island_size, } // ============================================================ -// Kernel 4: 精英注入(保持不变) +// Kernel 4: Elite injection (unchanged) // ============================================================ template @@ -483,7 +487,7 @@ __global__ void elite_inject_kernel(Sol* pop, int pop_size, } // ============================================================ -// v5.0: 多 GPU 协同 — 注入外部解到岛屿 +// v5.0: Multi-GPU coordination — inject external solutions into islands // ============================================================ template @@ -496,7 +500,7 @@ __global__ void inject_to_islands_kernel(Sol* pop, int pop_size, int island_size int num_islands = pop_size / island_size; if (num_islands == 0) return; - // 根据注入模式确定注入的岛屿数量 + // Number of islands to inject into depends on mode int islands_to_inject = 0; if (mode == MultiGpuInjectMode::OneIsland) { islands_to_inject = 1; @@ -506,15 +510,15 @@ __global__ void inject_to_islands_kernel(Sol* pop, int pop_size, int island_size islands_to_inject = num_islands; } - // 将注入解分配到各个岛屿的 worst 位置 + // Place each injected solution at worst slot of an island for (int i = 0; i < islands_to_inject && i < num_inject; i++) { int target_isle = i % num_islands; int base = target_isle * island_size; - // 找到该岛的 worst 解 + // Find worst solution on this island int worst = find_worst_in_island(pop, base, island_size, oc); - // 如果注入解更优,则替换 + // Replace if injection is better if (is_better(inject_solutions[i], pop[worst], oc)) { pop[worst] = inject_solutions[i]; } @@ -522,49 +526,49 @@ __global__ void inject_to_islands_kernel(Sol* pop, int pop_size, int island_size } // ============================================================ -// v5.0 方案 B3: inject_check_kernel — 
被动注入检查 +// v5.0 plan B3: inject_check_kernel — passive injection check // ============================================================ -// GPU 在 migrate 时检查 InjectBuffer,如果有新解则注入到第一个岛的 worst -// 使用 atomicExch 原子读取并清除 flag,确保线程安全 +// During migrate, GPU checks InjectBuffer; if new solution exists, inject at worst of first island +// atomicExch reads and clears flag atomically for thread safety // -// 设计要点: -// 1. 单线程执行(thread 0 of block 0),避免竞争 -// 2. atomicExch 原子读取 flag 并清零,确保每个解只被处理一次 -// 3. 只注入到第一个岛(OneIsland 策略),保持多样性 -// 4. 完全可选:如果 inject_buf 为 nullptr,直接跳过(不影响单 GPU) +// Design notes: +// 1. Single thread (thread 0 of block 0) to avoid races +// 2. atomicExch reads flag and clears it so each solution is handled once +// 3. Inject only into first island (OneIsland strategy) to preserve diversity +// 4. Optional: if inject_buf is nullptr, skip (single-GPU unaffected) template __global__ void inject_check_kernel(Sol* pop, int pop_size, int island_size, InjectBuffer* inject_buf, ObjConfig oc) { - // 单线程执行 + // Single-thread execution if (threadIdx.x != 0 || blockIdx.x != 0) return; - // 如果没有注入缓冲区,直接返回(单 GPU 场景) + // No injection buffer — return (single-GPU case) if (inject_buf == nullptr) return; - // 原子读取并清除 flag(确保每个解只被处理一次) + // Atomically read and clear flag (each solution processed once) int flag = atomicExch(inject_buf->d_flag, 0); - // 如果没有新解,直接返回 + // No new solution — return if (flag != 1) return; - // 读取注入的解 + // Read injected solution Sol inject_sol = *(inject_buf->d_solution); - // 找到第一个岛的 worst 位置 + // Find worst slot on first island int num_islands = pop_size / island_size; if (num_islands == 0) return; int worst = find_worst_in_island(pop, 0, island_size, oc); - // 如果注入解更优,则替换 + // Replace if injection is better if (is_better(inject_sol, pop[worst], oc)) { pop[worst] = inject_sol; } } // ============================================================ -// solve: 主循环(Block 级架构) +// solve: main loop (block-level architecture) // 
============================================================ using RegistryCallback = void(*)(SeqRegistry&); @@ -586,22 +590,22 @@ SolveResult solve(Problem& prob, const SolverConfig& cfg, bool use_time_limit = cfg.time_limit_sec > 0.0f; bool use_stagnation = cfg.stagnation_limit > 0; - // Block 级参数 - const int block_threads = BLOCK_LEVEL_THREADS; // 128 线程/block + // Block-level parameters + const int block_threads = BLOCK_LEVEL_THREADS; // 128 threads/block - // --- 0. Shared memory 计算(需要在 pop_size 确定之前完成,用于 occupancy 查询)--- + // --- 0. Shared memory sizing (before pop_size; used for occupancy query) --- size_t prob_smem = prob.shared_mem_bytes(); - // v3.1: 归约工作区为 MultiStepCandidate(含 K 步 moves + seq_indices) + // v3.1: reduction workspace is MultiStepCandidate (K-step moves + seq_indices) size_t total_smem = sizeof(Sol) + prob_smem + sizeof(MultiStepCandidate) * block_threads; if (use_aos) total_smem += sizeof(AOSStats); - // 查询 GPU 硬件属性 + // Query GPU device properties cudaDeviceProp prop; int device; CUDA_CHECK(cudaGetDevice(&device)); CUDA_CHECK(cudaGetDeviceProperties(&prop, device)); - // 尝试扩展 shared memory 上限(V100: 96KB, A100: 164KB 等) + // Try to raise shared memory cap (V100: 96KB, A100: 164KB, etc.) size_t max_smem = (size_t)prop.sharedMemPerBlock; if (total_smem > 48 * 1024) { cudaError_t err1 = cudaFuncSetAttribute( @@ -617,7 +621,7 @@ SolveResult solve(Problem& prob, const SolverConfig& cfg, } } - // 检查 shared memory 上限 + // Check shared memory limit bool smem_overflow = false; if (total_smem > max_smem) { smem_overflow = (prob_smem > 0); @@ -626,12 +630,12 @@ SolveResult solve(Problem& prob, const SolverConfig& cfg, if (use_aos) total_smem += sizeof(AOSStats); } - // --- 0b. 确定 pop_size(自动或用户指定)--- + // --- 0b. 
Determine pop_size (auto or user) --- int pop_size = cfg.pop_size; bool auto_pop = (pop_size <= 0); if (auto_pop) { - // 查询 occupancy:每个 SM 能同时运行多少个 block + // Query occupancy: how many blocks per SM int max_blocks_per_sm = 0; cudaOccupancyMaxActiveBlocksPerMultiprocessor( &max_blocks_per_sm, @@ -642,17 +646,17 @@ SolveResult solve(Problem& prob, const SolverConfig& cfg, int full_capacity = max_blocks_per_sm * prop.multiProcessorCount; if (prob_smem > 0) { - // 问题数据在 shared memory → 无 L2 cache 压力,打满 SM + // Problem data in shared memory → no L2 pressure; fill SMs pop_size = full_capacity; } else { - // 问题数据在 global memory → 根据 L2 cache 容量估算合理并发度 + // Problem data in global memory → estimate concurrency from L2 size // - // 模型:pop = L2_size / working_set_bytes - // 所有 block 访问同一份只读数据,L2/ws 反映 cache 能支撑的并发度 + // Model: pop = L2_size / working_set_bytes + // All blocks read same read-only data; L2/ws approximates cache-supported concurrency // - // SM 下限策略:L2/ws >= sm_min/2 时拉升到 sm_min(允许一定 cache 压力换取种群多样性) - // ch150: L2/ws=70, sm_min=128 → 70 >= 64 → 拉升到 128 ✓(多样性优先) - // pcb442: L2/ws=8, sm_min=128 → 8 < 64 → 不拉升 ✓(避免 thrashing) + // SM floor policy: if L2/ws >= sm_min/2, raise to sm_min (trade some cache pressure for diversity) + // ch150: L2/ws=70, sm_min=128 -> 70 >= 64 -> raise to 128 (diversity first) + // pcb442: L2/ws=8, sm_min=128 -> 8 < 64 -> do not raise (avoid thrashing) size_t ws = prob.working_set_bytes(); if (ws > 0) { @@ -671,26 +675,26 @@ SolveResult solve(Problem& prob, const SolverConfig& cfg, } } - // 向下取整到 2 的幂(warp 对齐、归约友好、islands 整除) + // Round down to power of 2 (warp alignment, reduction-friendly, island divisibility) { int p = 1; while (p * 2 <= pop_size) p *= 2; pop_size = p; } - // 绝对下限:32(保证至少 1 岛 × 32 解的最小可用规模) + // Absolute floor: 32 (at least 1 island x 32 individuals) if (pop_size < 32) pop_size = 32; } - // 自适应岛屿数量(num_islands=0 时启用) + // Adaptive island count (when num_islands=0) int num_islands = cfg.num_islands; if (num_islands 
== 0) { - // 策略:每岛至少 32 个个体,最多 8 岛 - // pop < 64 → 1 岛(纯 HC) - // 64-127 → 2 岛 - // 128-255 → 4 岛 - // 256-511 → 8 岛 - // >= 512 → 8 岛 + // Policy: at least 32 individuals per island, at most 8 islands + // pop < 64 -> 1 island (pure HC) + // 64-127 -> 2 islands + // 128-255 -> 4 islands + // 256-511 -> 8 islands + // >= 512 -> 8 islands if (pop_size < 64) { num_islands = 1; } else if (pop_size < 128) { @@ -747,8 +751,8 @@ SolveResult solve(Problem& prob, const SolverConfig& cfg, printf(" seed=%u\n", cfg.seed); } - // --- 1. 分配 --- - // crossover 栈需求(thread 0 在 local memory 中构造 child) + // --- 1. Allocation --- + // Crossover stack needs (thread 0 builds child in local memory) if (use_crossover) { size_t ox_arrays = Sol::DIM1 * Sol::DIM2 * sizeof(bool) + 512 * sizeof(bool) @@ -759,7 +763,7 @@ SolveResult solve(Problem& prob, const SolverConfig& cfg, ObjConfig oc = make_obj_config(pcfg); - // --- 1b. 采样择优初始化 --- + // --- 1b. Sample-and-select initialization --- int oversample = cfg.init_oversample; if (oversample < 1) oversample = 1; int candidate_size = pop_size * oversample; @@ -768,13 +772,13 @@ SolveResult solve(Problem& prob, const SolverConfig& cfg, Population pop; if (do_oversample) { - // 生成 K × pop_size 个候选解 + // Generate K x pop_size candidate solutions Population candidates; candidates.allocate(candidate_size, block_threads); candidates.init_rng(cfg.seed, 256); candidates.init_population(pcfg, 256); - // 启发式初始解注入(替换候选池尾部) + // Inject heuristic initial solutions (replace tail of candidate pool) if (pcfg.encoding == EncodingType::Permutation) { HeuristicMatrix heur_mats[8]; int num_mats = prob.heuristic_matrices(heur_mats, 8); @@ -797,7 +801,7 @@ SolveResult solve(Problem& prob, const SolverConfig& cfg, } } - // GPU 上评估所有候选 + // Evaluate all candidates on GPU { size_t eval_smem = prob.shared_mem_bytes(); if (eval_smem > 48 * 1024) { @@ -810,12 +814,12 @@ SolveResult solve(Problem& prob, const SolverConfig& cfg, CUDA_CHECK(cudaDeviceSynchronize()); } - // 
下载所有候选解到 host + // Download all candidates to host Sol* h_candidates = new Sol[candidate_size]; CUDA_CHECK(cudaMemcpy(h_candidates, candidates.d_solutions, sizeof(Sol) * candidate_size, cudaMemcpyDeviceToHost)); - // 构建候选信息 + // Build candidate metadata std::vector cand_info(candidate_size); for (int i = 0; i < candidate_size; i++) { cand_info[i].idx = i; @@ -829,16 +833,16 @@ SolveResult solve(Problem& prob, const SolverConfig& cfg, } } - // 计算目标重要性 + // Compute objective importance float importance[MAX_OBJ]; compute_importance(oc, importance); - // 纯随机保底名额 + // Pure-random quota (floor) int num_random = (int)(pop_size * cfg.init_random_ratio); if (num_random < 1) num_random = 1; if (num_random > pop_size / 2) num_random = pop_size / 2; - // 选择 + // Selection std::vector selected; if (oc.num_obj == 1) { selected = init_sel::top_n_select(cand_info, pop_size, num_random); @@ -847,13 +851,13 @@ SolveResult solve(Problem& prob, const SolverConfig& cfg, pop_size, num_random); } - // 分配最终种群 + // Allocate final population pop.allocate(pop_size, block_threads); - // 复用候选的 RNG 状态(取前 pop_size 份) - // 重新初始化 RNG 更安全(候选的 RNG 状态已被使用过) + // Could reuse candidate RNG state (first pop_size entries) + // Re-init RNG is safer (candidate RNGs were already used) pop.init_rng(cfg.seed + 1, 256); - // 上传选中的解到种群前部 + // Upload selected solutions to front of population int num_selected = (int)selected.size(); for (int i = 0; i < num_selected; i++) { CUDA_CHECK(cudaMemcpy(pop.d_solutions + i, @@ -861,8 +865,8 @@ SolveResult solve(Problem& prob, const SolverConfig& cfg, sizeof(Sol), cudaMemcpyDeviceToDevice)); } - // 剩余位置(纯随机保底):从候选中随机选未被选中的 - // 简单做法:直接用候选中排在后面的未选中解 + // Remaining slots (pure-random floor): fill from unselected candidates + // Simple approach: use later candidates that were not selected if (num_selected < pop_size) { int fill_idx = num_selected; for (int i = 0; i < candidate_size && fill_idx < pop_size; i++) { @@ -876,7 +880,7 @@ SolveResult solve(Problem& prob, const 
SolverConfig& cfg, } if (cfg.verbose) { - // 统计选中解的平均质量 vs 全部候选的平均质量 + // Compare mean quality of selected vs all candidates float sel_avg = 0.0f, all_avg = 0.0f; for (int i = 0; i < candidate_size; i++) all_avg += cand_info[i].objs[0]; all_avg /= candidate_size; @@ -893,20 +897,20 @@ SolveResult solve(Problem& prob, const SolverConfig& cfg, } delete[] h_candidates; - // candidates 析构自动释放 GPU 内存 + // candidates dtor frees GPU memory } else { - // oversample=1:纯随机,和之前一样 + // oversample=1: pure random, same as before pop.allocate(pop_size, block_threads); pop.init_rng(cfg.seed, 256); pop.init_population(pcfg, 256); } - // --- 1c. 注入用户提供的初始解 --- - // 策略:校验合法性 → 合法解替换种群尾部(保留 oversample 选出的好解在前部) + // --- 1c. Inject user-provided initial solutions --- + // Policy: validate -> valid solutions replace population tail (keep oversample winners at front) if (init_solutions && num_init_solutions > 0) { - int max_inject = pop_size / 16; // 最多占种群 ~6%(保留多样性) + int max_inject = pop_size / 16; // at most ~6% of population (diversity) if (max_inject < 1) max_inject = 1; - if (max_inject > 16) max_inject = 16; // 绝对上限 + if (max_inject > 16) max_inject = 16; // hard cap int want = num_init_solutions; if (want > max_inject) want = max_inject; @@ -915,17 +919,17 @@ SolveResult solve(Problem& prob, const SolverConfig& cfg, const Sol& s = init_solutions[i]; bool valid = true; - // 基本维度检查 + // Basic dimension checks for (int r = 0; r < pcfg.dim1 && valid; r++) { if (s.dim2_sizes[r] < 0 || s.dim2_sizes[r] > Sol::DIM2) { valid = false; break; } } - // 编码特定检查 + // Encoding-specific checks if (valid && pcfg.encoding == EncodingType::Permutation) { if (pcfg.row_mode == RowMode::Partition) { - // 分区模式:跨行元素不重复,总数 = total_elements + // Partition mode: no duplicate elements across rows; total = total_elements bool seen[512] = {}; int total = 0; for (int r = 0; r < pcfg.dim1 && valid; r++) { @@ -939,7 +943,7 @@ SolveResult solve(Problem& prob, const SolverConfig& cfg, } if (valid && total != 
pcfg.total_elements) valid = false; } else if (pcfg.perm_repeat_count > 1) { - // 多重集排列:每行中每个值 [0, N) 恰好出现 repeat_count 次 + // Multiset permutation: each value in [0, N) appears repeat_count times per row int R = pcfg.perm_repeat_count; int N = pcfg.dim2_default / R; for (int r = 0; r < pcfg.dim1 && valid; r++) { @@ -956,7 +960,7 @@ SolveResult solve(Problem& prob, const SolverConfig& cfg, } } } else { - // 标准排列:每行元素 [0, dim2_default) 不重复 + // Standard permutation: each row is a permutation of [0, dim2_default) for (int r = 0; r < pcfg.dim1 && valid; r++) { if (s.dim2_sizes[r] != pcfg.dim2_default) { valid = false; break; } bool seen[512] = {}; @@ -977,7 +981,7 @@ SolveResult solve(Problem& prob, const SolverConfig& cfg, } if (valid) { - // 注入到种群尾部(从后往前填,保留前部的 oversample 好解) + // Inject at population tail (fill from end; keep oversample winners at front) int target_idx = pop_size - 1 - injected; CUDA_CHECK(cudaMemcpy(pop.d_solutions + target_idx, &s, sizeof(Sol), cudaMemcpyHostToDevice)); @@ -992,7 +996,7 @@ SolveResult solve(Problem& prob, const SolverConfig& cfg, } } - // v3.0: 构建序列注册表(替代旧的 d_op_weights) + // v3.0: Build sequence registry (replaces old d_op_weights) ProblemProfile profile = classify_problem(pcfg); SeqRegistry seq_reg = build_seq_registry(profile); @@ -1000,7 +1004,7 @@ SolveResult solve(Problem& prob, const SolverConfig& cfg, custom_registry_fn(seq_reg); } - // v3.1: K 步配置(多步执行) + // v3.1: K-step config (multi-step execution) KStepConfig kstep = build_kstep_config(); if (cfg.verbose) { @@ -1022,13 +1026,13 @@ SolveResult solve(Problem& prob, const SolverConfig& cfg, Sol* d_global_best = nullptr; if (use_sa) { CUDA_CHECK(cudaMalloc(&d_global_best, sizeof(Sol))); - // v5.0 方案 B3: 导出 d_global_best 指针供外部读取(可选) + // v5.0 plan B3: expose d_global_best pointer for external read (optional) if (d_global_best_out != nullptr) { *d_global_best_out = d_global_best; } } - // AOS: 分配全局内存统计缓冲区(序列级粒度) + // AOS: allocate global stats buffer (per-sequence 
granularity) AOSStats* d_aos_stats = nullptr; AOSStats* h_aos_stats = nullptr; @@ -1037,8 +1041,8 @@ SolveResult solve(Problem& prob, const SolverConfig& cfg, h_aos_stats = new AOSStats[pop_size]; } - // --- 关系矩阵(G/O):用于 SEQ_LNS_GUIDED_REBUILD --- - // 仅 Permutation 编码 + 有 GUIDED_REBUILD 序列时启用 + // --- Relation matrices (G/O) for SEQ_LNS_GUIDED_REBUILD --- + // Enabled only for Permutation encoding when GUIDED_REBUILD is in registry bool use_relation_matrix = false; RelationMatrix rel_mat = {}; int rel_N = 0; @@ -1051,11 +1055,11 @@ SolveResult solve(Problem& prob, const SolverConfig& cfg, } } if (use_relation_matrix) { - // N = dim2_default(排列中的元素数) + // N = dim2_default (number of elements in permutation) rel_N = pcfg.dim2_default; if (rel_N > 0) { rel_mat = relation_matrix_create(rel_N, 0.95f); - // 让用户提供先验知识初始化 G/O(可选,默认不做任何事) + // Optional prior init of G/O via user hook (default: no-op) prob.init_relation_matrix(rel_mat.h_G, rel_mat.h_O, rel_N); relation_matrix_upload(rel_mat); } else { @@ -1063,11 +1067,11 @@ SolveResult solve(Problem& prob, const SolverConfig& cfg, } } - // grid = pop_size(每个 block 处理一个解) + // grid = pop_size (one block per solution) int grid = pop_size; - // --- 2. 初始评估 --- - // 采样择优路径中已经评估过候选,但最终种群可能包含随机解,需要重新评估 + // --- 2. Initial evaluation --- + // Sample-select path already evaluated candidates; final pop may still have randoms — re-evaluate { size_t eval_smem = prob.shared_mem_bytes(); if (eval_smem > 48 * 1024) { @@ -1086,9 +1090,9 @@ SolveResult solve(Problem& prob, const SolverConfig& cfg, CUDA_CHECK(cudaMemcpy(d_global_best, pop.d_solutions + idx, sizeof(Sol), cudaMemcpyDeviceToDevice)); } - // --- 3. 主循环 --- - // batch 大小决定了 AOS/关系矩阵/收敛检测的更新频率 - // 需要平衡:太小 → 同步开销大,太大 → 反应迟钝 + // --- 3. 
Main loop --- + // Batch size sets update cadence for AOS / relation matrix / convergence checks + // Balance: too small -> sync overhead; too slow to react if too large int batch; if (use_islands) batch = cfg.migrate_interval; @@ -1097,7 +1101,7 @@ SolveResult solve(Problem& prob, const SolverConfig& cfg, else batch = cfg.max_gen; - // 需要定期更新的功能:强制 batch ≤ 200 + // Features needing periodic updates: force batch <= 200 if (use_relation_matrix || use_aos || use_time_limit || use_stagnation) { if (batch > 200) batch = 200; } @@ -1106,11 +1110,11 @@ SolveResult solve(Problem& prob, const SolverConfig& cfg, int migrate_round = 0; StopReason stop_reason = StopReason::MaxGen; - // 收敛检测状态 + // Convergence-check state float prev_best_scalar = 1e30f; int stagnation_count = 0; - // --- EvolveParams: 可变参数(device memory)--- + // --- EvolveParams: mutable fields (device memory) --- EvolveParams h_params; h_params.temp_start = 0.0f; h_params.gens_per_batch = batch; @@ -1133,7 +1137,7 @@ SolveResult solve(Problem& prob, const SolverConfig& cfg, CUDA_CHECK(cudaStreamCreate(&stream)); } - // lambda: 在 stream 上发射一个 batch 的 GPU kernel 序列 + // Lambda: launch one batch of GPU kernels on stream auto launch_batch_kernels = [&](cudaStream_t s) { evolve_block_kernel<<>>( prob, pop.d_solutions, pop_size, @@ -1168,7 +1172,7 @@ SolveResult solve(Problem& prob, const SolverConfig& cfg, } }; - // 捕获 CUDA Graph(首次) + // Capture CUDA Graph (first time) if (use_graph) { CUDA_CHECK(cudaStreamBeginCapture(stream, cudaStreamCaptureModeThreadLocal)); launch_batch_kernels(stream); @@ -1187,7 +1191,7 @@ SolveResult solve(Problem& prob, const SolverConfig& cfg, CUDA_CHECK(cudaEventCreate(&t_stop)); CUDA_CHECK(cudaEventRecord(t_start)); - // 时间感知 AOS:窗口累积器 + // Time-aware AOS: window accumulators int win_seq_usage[MAX_SEQ] = {}; int win_seq_improve[MAX_SEQ] = {}; int win_k_usage[MAX_K] = {}; @@ -1195,9 +1199,15 @@ SolveResult solve(Problem& prob, const SolverConfig& cfg, int batch_count = 0; const int 
aos_interval = (cfg.aos_update_interval > 0) ? cfg.aos_update_interval : 1; - // v4.0: 约束导向 + 分层搜索 + // v4.0: constraint-directed + phased search (require AOS enabled) const bool use_constraint_directed = cfg.use_constraint_directed && use_aos; const bool use_phased_search = cfg.use_phased_search && use_aos; + if (cfg.verbose) { + if (cfg.use_constraint_directed && !use_aos) + printf(" [WARN] constraint_directed requires AOS, disabled\n"); + if (cfg.use_phased_search && !use_aos) + printf(" [WARN] phased_search requires AOS, disabled\n"); + } float base_max_w[MAX_SEQ]; for (int i = 0; i < seq_reg.count; i++) base_max_w[i] = seq_reg.max_w[i]; @@ -1217,7 +1227,7 @@ SolveResult solve(Problem& prob, const SolverConfig& cfg, float temp = use_sa ? cfg.sa_temp_init * powf(cfg.sa_alpha, (float)gen_done) : 0.0f; - // 更新 device 端可变参数 + // Update mutable device parameters h_params.temp_start = temp; h_params.gens_per_batch = gens; h_params.seq_reg = seq_reg; @@ -1225,7 +1235,7 @@ SolveResult solve(Problem& prob, const SolverConfig& cfg, h_params.migrate_round = migrate_round; CUDA_CHECK(cudaMemcpy(d_params, &h_params, sizeof(EvolveParams), cudaMemcpyHostToDevice)); - // 发射 GPU kernel 序列 + // Launch GPU kernel sequence if (use_graph) { CUDA_CHECK(cudaGraphLaunch(graph_exec, stream)); CUDA_CHECK(cudaStreamSynchronize(stream)); @@ -1233,8 +1243,8 @@ SolveResult solve(Problem& prob, const SolverConfig& cfg, launch_batch_kernels(nullptr); } - // v5.0 方案 B3: 被动注入检查(在 Graph 之外单独调用) - // 注意:必须在 Graph 之外,因为 inject_buf 内容是动态变化的 + // v5.0 plan B3: passive injection check (outside Graph) + // Must be outside Graph: inject_buf content changes dynamically if (inject_buf != nullptr && use_islands) { inject_check_kernel<<<1, 1>>>(pop.d_solutions, pop_size, island_size, inject_buf, oc); @@ -1245,14 +1255,14 @@ SolveResult solve(Problem& prob, const SolverConfig& cfg, if (use_islands) migrate_round++; batch_count++; - // AOS: 两层权重更新(EMA)+ 停滞检测 + // AOS: two-level weight update (EMA) + 
stagnation detection if (use_aos && (batch_count % aos_interval == 0)) { CUDA_CHECK(cudaDeviceSynchronize()); CUDA_CHECK(cudaMemcpy(h_aos_stats, d_aos_stats, sizeof(AOSStats) * pop_size, cudaMemcpyDeviceToHost)); - // --- 聚合当前 batch 的统计到窗口累积器 --- + // --- Fold current batch stats into window accumulators --- for (int b = 0; b < pop_size; b++) { for (int i = 0; i < seq_reg.count; i++) { win_seq_usage[i] += h_aos_stats[b].usage[i]; @@ -1266,7 +1276,7 @@ SolveResult solve(Problem& prob, const SolverConfig& cfg, constexpr float AOS_ALPHA = 0.6f; - // --- v4.0: 约束导向 — 计算种群约束违反率 --- + // --- v4.0: constraint-directed — population infeasibility ratio --- float penalty_ratio = 0.0f; if (use_constraint_directed) { Sol* h_pop_snap = new Sol[pop_size]; @@ -1280,7 +1290,7 @@ SolveResult solve(Problem& prob, const SolverConfig& cfg, delete[] h_pop_snap; } - // --- v4.0: 分层搜索 — 计算当前阶段的 floor/cap 调整 --- + // --- v4.0: phased search — phase floor/cap multipliers --- float phase_floor_mult = 1.0f; float phase_cap_mult = 1.0f; if (use_phased_search) { @@ -1296,18 +1306,18 @@ SolveResult solve(Problem& prob, const SolverConfig& cfg, progress = (float)gen_done / (float)cfg.max_gen; } if (progress < cfg.phase_explore_end) { - phase_floor_mult = 1.5f; // 探索期:抬高 floor → 更均匀 - phase_cap_mult = 0.7f; // 探索期:压低 cap → 防止过早集中 + phase_floor_mult = 1.5f; // explore: raise floor -> more uniform + phase_cap_mult = 0.7f; // explore: lower cap -> avoid early concentration } else if (progress >= cfg.phase_refine_start) { - phase_floor_mult = 0.5f; // 精细期:降低 floor → 允许弱算子退出 - phase_cap_mult = 1.5f; // 精细期:抬高 cap → 集中利用强算子 + phase_floor_mult = 0.5f; // refine: lower floor -> weak ops can fade + phase_cap_mult = 1.5f; // refine: raise cap -> exploit strong ops } } - // --- 第二层:算子权重更新(EMA) --- + // --- Layer 2: operator weights (EMA) --- { float new_w[MAX_SEQ]; - // 延迟归一化:EMA 更新 + 边界约束(不归一化) + // Deferred normalization: EMA + bounds (no renormalize to sum 1) for (int i = 0; i < seq_reg.count; i++) { 
float signal = (win_seq_usage[i] > 0) ? (float)win_seq_improve[i] / (float)win_seq_usage[i] @@ -1322,7 +1332,7 @@ SolveResult solve(Problem& prob, const SolverConfig& cfg, float floor_val = base_floor * phase_floor_mult; float global_cap = cfg.aos_weight_cap * phase_cap_mult; - // --- v4.0: 约束导向 — boost 跨行/行级算子权重 + 放宽 cap --- + // --- v4.0: constraint-directed — boost cross-row/row-level weights + relax cap --- if (use_constraint_directed && penalty_ratio > 0.1f) { float boost = 1.0f + (penalty_ratio - 0.1f) / 0.9f * (cfg.constraint_boost_max - 1.0f); @@ -1339,7 +1349,7 @@ SolveResult solve(Problem& prob, const SolverConfig& cfg, seq_reg.max_w[i] = base_max_w[i]; } - // 应用边界约束(不归一化) + // Apply bounds (no renormalize to sum 1) float sum = 0.0f; for (int i = 0; i < seq_reg.count; i++) { float cap_val = (seq_reg.max_w[i] > 0.0f) ? seq_reg.max_w[i] : global_cap; @@ -1347,11 +1357,11 @@ SolveResult solve(Problem& prob, const SolverConfig& cfg, sum += seq_reg.weights[i]; } - // 更新缓存的权重和 + // Update cached weight sum seq_reg.weights_sum = sum; } - // --- 第一层:K 步数权重更新(EMA + 延迟归一化) --- + // --- Layer 1: K-step weights (EMA + deferred normalize) --- { float new_w[MAX_K]; for (int i = 0; i < MAX_K; i++) { @@ -1362,14 +1372,14 @@ SolveResult solve(Problem& prob, const SolverConfig& cfg, + (1.0f - AOS_ALPHA) * (rate + AOS_WEIGHT_FLOOR); } - // 应用边界约束(不归一化) + // Apply bounds (no renormalize to sum 1) float floor_val = cfg.aos_weight_floor; float cap_val = 0.95f; for (int i = 0; i < MAX_K; i++) { kstep.weights[i] = fmaxf(floor_val, fminf(cap_val, new_w[i])); } - // K 步权重归一化(保持原有行为,因为 K 步选择不使用轮盘赌) + // Renormalize K-step weights (legacy behavior; K choice is not roulette) float sum = 0.0f; for (int i = 0; i < MAX_K; i++) sum += kstep.weights[i]; if (sum > 0.0f) { @@ -1378,7 +1388,7 @@ SolveResult solve(Problem& prob, const SolverConfig& cfg, } } - // --- Debug: 前 5 个 batch 打印统计 --- + // --- Debug: print stats for first 5 batches --- if (cfg.verbose && gen_done <= batch * 5) { 
fprintf(stderr, " [AOS batch g=%d] usage:", gen_done); for (int i = 0; i < seq_reg.count; i++) fprintf(stderr, " %d", win_seq_usage[i]); @@ -1397,7 +1407,7 @@ SolveResult solve(Problem& prob, const SolverConfig& cfg, } - // --- 停滞检测 --- + // --- Stagnation detection --- { int total_improve_all = 0; for (int i = 0; i < seq_reg.count; i++) @@ -1417,25 +1427,25 @@ SolveResult solve(Problem& prob, const SolverConfig& cfg, } } - // --- 清零窗口累积器 --- + // --- Clear window accumulators --- memset(win_seq_usage, 0, sizeof(win_seq_usage)); memset(win_seq_improve, 0, sizeof(win_seq_improve)); memset(win_k_usage, 0, sizeof(win_k_usage)); memset(win_k_improve, 0, sizeof(win_k_improve)); } - // --- 关系矩阵更新(每个 batch 间隙,从种群 top-K 解统计)--- - // 多个好解贡献 G/O 信号,加速矩阵信息积累 + // --- Relation matrix update (between batches, from population top-K) --- + // Several good solutions contribute G/O signal to build the matrix faster if (use_relation_matrix) { if (!use_aos) { CUDA_CHECK(cudaDeviceSynchronize()); } - // 下载整个种群的目标值,找 top-K + // Download population objectives and find top-K constexpr int REL_TOP_K = 4; int top_indices[REL_TOP_K]; { - // 简单方法:下载所有解的 scalar 目标,host 端排序取 top-K + // Simple approach: scalar objectives on host, pick top-K minima float* h_scores = new float[pop_size]; Sol* h_pop_ptr = new Sol[pop_size]; CUDA_CHECK(cudaMemcpy(h_pop_ptr, pop.d_solutions, @@ -1444,16 +1454,16 @@ SolveResult solve(Problem& prob, const SolverConfig& cfg, h_scores[b] = scalar_objective(h_pop_ptr[b], oc); if (h_pop_ptr[b].penalty > 0.0f) h_scores[b] = 1e30f; } - // 找 top-K 最小值 + // Find top-K smallest scores for (int k = 0; k < REL_TOP_K && k < pop_size; k++) { int mi = 0; for (int b = 1; b < pop_size; b++) { if (h_scores[b] < h_scores[mi]) mi = b; } top_indices[k] = mi; - h_scores[mi] = 1e30f; // 标记已选 + h_scores[mi] = 1e30f; // mark as taken } - // 从 top-K 解更新 G/O + // Update G/O from top-K solutions int actual_k = (pop_size < REL_TOP_K) ? 
pop_size : REL_TOP_K; for (int k = 0; k < actual_k; k++) { relation_matrix_update(rel_mat, h_pop_ptr[top_indices[k]], pcfg.dim1); @@ -1465,9 +1475,9 @@ SolveResult solve(Problem& prob, const SolverConfig& cfg, relation_matrix_upload(rel_mat); } - // 交叉 / 迁移 / 精英注入 已在 launch_batch_kernels 中统一发射 + // Crossover / migrate / elite inject already launched in launch_batch_kernels - // --- 时间限制检查 --- + // --- Time limit check --- if (use_time_limit) { CUDA_CHECK(cudaEventRecord(t_stop)); CUDA_CHECK(cudaEventSynchronize(t_stop)); @@ -1481,7 +1491,7 @@ SolveResult solve(Problem& prob, const SolverConfig& cfg, } } - // --- 收敛检测 + reheat --- + // --- Convergence check + reheat --- if (use_stagnation) { find_best_kernel<<<1, 1>>>(pop.d_solutions, pop_size, oc, d_best_idx); CUDA_CHECK(cudaDeviceSynchronize()); @@ -1500,26 +1510,25 @@ SolveResult solve(Problem& prob, const SolverConfig& cfg, if (stagnation_count >= cfg.stagnation_limit) { if (use_sa && cfg.reheat_ratio > 0.0f) { - // reheat:将温度恢复到初始温度的 reheat_ratio 倍 - // 通过回退 gen_done 实现(温度 = init * alpha^gen_done) + // Reheat: restore temperature to reheat_ratio * initial + // Implemented by rolling back gen_done (temp = init * alpha^gen_done) float target_temp = cfg.sa_temp_init * cfg.reheat_ratio; int reheat_gen = (int)(logf(target_temp / cfg.sa_temp_init) / logf(cfg.sa_alpha)); if (reheat_gen < 0) reheat_gen = 0; - // 不真正回退 gen_done(会影响终止条件),而是记录一个 temp_offset - // 简化做法:直接在下一轮 batch 中 temp 会自然从 reheat 后的值开始 - // 这里通过修改 gen_done 的等效温度来实现 + // Not a true gen_done rollback for termination; conceptually temp_offset + // Simplified: next batch temp follows from adjusted gen_done if (cfg.verbose) { float cur_temp = cfg.sa_temp_init * powf(cfg.sa_alpha, (float)gen_done); printf(" [REHEAT] stagnation=%d at gen %d, temp %.4f → %.4f\n", cfg.stagnation_limit, gen_done, cur_temp, target_temp); } - // 将 gen_done 回退到对应 target_temp 的位置(但不超过已完成代数的一半) + // Roll gen_done back to match target_temp (but not below half of completed gens) int 
min_gen = gen_done / 2; if (reheat_gen < min_gen) reheat_gen = min_gen; gen_done = reheat_gen; stagnation_count = 0; } else { - // 无 SA 时,收敛检测触发 → 提前终止 + // No SA: stagnation triggers early stop stop_reason = StopReason::Stagnation; if (cfg.verbose) printf(" [STOP] stagnation=%d at gen %d, no SA to reheat\n", cfg.stagnation_limit, gen_done); @@ -1528,7 +1537,7 @@ SolveResult solve(Problem& prob, const SolverConfig& cfg, } } - // 打印进度 + // Progress printout if (cfg.verbose && gen_done % cfg.print_every == 0) { if (!use_stagnation) { find_best_kernel<<<1, 1>>>(pop.d_solutions, pop_size, oc, d_best_idx); @@ -1549,7 +1558,7 @@ SolveResult solve(Problem& prob, const SolverConfig& cfg, float elapsed_ms = 0; CUDA_CHECK(cudaEventElapsedTime(&elapsed_ms, t_start, t_stop)); - // --- 4. 最终结果 --- + // --- 4. Final result --- Sol best; if (use_sa) { CUDA_CHECK(cudaDeviceSynchronize()); @@ -1582,7 +1591,7 @@ SolveResult solve(Problem& prob, const SolverConfig& cfg, } } - // AOS: 打印最终两层权重 + // AOS: print final two-level weights if (use_aos && cfg.verbose) { printf(" AOS K-step weights: K1=%.3f K2=%.3f K3=%.3f\n", kstep.weights[0], kstep.weights[1], kstep.weights[2]); @@ -1592,7 +1601,7 @@ SolveResult solve(Problem& prob, const SolverConfig& cfg, printf("\n"); } - // 填充返回值 + // Fill return struct result.best_solution = best; result.elapsed_ms = elapsed_ms; result.generations = gen_done; diff --git a/prototype/core/types.cuh b/prototype/core/types.cuh index a29934d..5547dff 100644 --- a/prototype/core/types.cuh +++ b/prototype/core/types.cuh @@ -1,38 +1,39 @@ /** - * types.cuh - 核心类型定义 + * types.cuh - Core type definitions * - * 包含:编码类型、Solution 模板、ProblemConfig/SolverConfig、 - * SeqRegistry(AOS 序列级权重)、KStepConfig(多步执行)、 - * RelationMatrix(G/O 关系矩阵)、ProblemBase(CRTP 基类) + * Contains: encoding types, Solution template, ProblemConfig/SolverConfig, + * SeqRegistry (AOS sequence-level weights), KStepConfig (multi-step execution), + * RelationMatrix (G/O relation matrix), ProblemBase 
(CRTP base class) */ #pragma once #include +#include "cuda_utils.cuh" // ============================================================ -// 编译时常量 +// Compile-time constants // ============================================================ -constexpr int MAX_OBJ = 4; // 最多 4 个目标(16字节,不值得模板化) -constexpr int MAX_SEQ = 32; // 最大序列数(内置 ~16 + 自定义算子 ≤8,留余量) -constexpr int MAX_K = 3; // 多步执行的最大步数(K=1,2,3) -// AOS 权重上下限(归一化后) -constexpr float AOS_WEIGHT_FLOOR = 0.05f; // 最低权重保底(确保充分探索) -constexpr float AOS_WEIGHT_CAP = 0.35f; // 最高权重上限(防止赢者通吃) +constexpr int MAX_OBJ = 4; // Max 4 objectives (16 bytes, not worth templatizing) +constexpr int MAX_SEQ = 32; // Max sequences (built-in ~16 + custom ops ≤8, with margin) +constexpr int MAX_K = 3; // Max steps for multi-step execution (K=1,2,3) +// AOS weight bounds +constexpr float AOS_WEIGHT_FLOOR = 0.05f; // Minimum weight floor (ensures sufficient exploration) +constexpr float AOS_WEIGHT_CAP = 0.35f; // Maximum weight cap (prevents winner-take-all) // ============================================================ -// 枚举类型 +// Enum types // ============================================================ enum class EncodingType { - Permutation, // 排列:元素不重复 - Binary, // 0-1:flip 是主要算子 - Integer // 有界整数 + Permutation, // Permutation: elements are unique + Binary, // 0-1: flip is the main operator + Integer // Bounded integers }; enum class RowMode { - Single, // dim1=1,单行(TSP/QAP/Knapsack 等大部分问题) - Fixed, // dim1>1,行等长不可变(JSP-Int/Schedule,禁止 SPLIT/MERGE) - Partition // dim1>1,元素分区到各行,行长可变(CVRP/VRPTW) + Single, // dim1=1, single row (most problems: TSP/QAP/Knapsack, etc.) 
+ Fixed, // dim1>1, equal row lengths fixed (JSP-Int/Schedule; SPLIT/MERGE disallowed) + Partition // dim1>1, elements partitioned across rows, variable row lengths (CVRP/VRPTW) }; enum class ObjDir { @@ -40,241 +41,235 @@ enum class ObjDir { Maximize }; -// 多目标比较模式 +// Multi-objective comparison mode enum class CompareMode { - Weighted, // 加权求和:sum(weight[i] * obj[i]),越小越好 - Lexicographic // 字典法:按优先级逐目标比较,前面的目标优先 + Weighted, // Weighted sum: sum(weight[i] * obj[i]), lower is better + Lexicographic // Lexicographic: compare objectives by priority order }; enum class MigrateStrategy { - Ring, // 环形:各岛最优→邻岛最差(慢传播,高多样性) - TopN, // 全局 Top-N 轮转分发(快传播,强收敛) - Hybrid // 两者兼顾:Top-N 替换最差 + Ring 替换次差 + Ring, // Ring: each island's best → neighbor's worst (slow spread, high diversity) + TopN, // Global Top-N round-robin (fast spread, strong convergence) + Hybrid // Hybrid: Top-N replaces worst + Ring replaces second-worst }; -// v5.0: 多 GPU 协同 — 解注入模式 +// v5.0: multi-GPU coordination — solution injection mode enum class MultiGpuInjectMode { - OneIsland, // 注入到 1 个岛的 worst(保守,保持多样性) - HalfIslands, // 注入到 num_islands/2 个岛的 worst(平衡) - AllIslands // 注入到所有岛的 worst(激进,快速传播) + OneIsland, // Inject into worst of 1 island (conservative, preserves diversity) + HalfIslands, // Inject into worst on num_islands/2 islands (balanced) + AllIslands // Inject into worst on all islands (aggressive, fast spread) }; -// v5.0 方案 B3: InjectBuffer — 被动注入缓冲区 -// GPU 无感知,CPU 同步写入,GPU 在 migrate_kernel 中检查并应用 -// 设计要点: -// 1. 使用同步 cudaMemcpy 避免与 solve() 的 stream/Graph 冲突 -// 2. 写入顺序:先 solution 后 flag,GPU 端原子读 flag 确保一致性 -// 3. 完全解耦:不依赖 solve() 的任何内部状态 +// v5.0 option B3: InjectBuffer — passive injection buffer +// GPU has no awareness; CPU writes synchronously; GPU checks and applies in migrate_kernel +// Design notes: +// 1. Use synchronous cudaMemcpy to avoid conflicts with solve() stream/Graph +// 2. Write order: solution first, then flag; GPU atomic flag read ensures consistency +// 3. 
Fully decoupled: does not depend on any internal state of solve() template struct InjectBuffer { - Sol* d_solution; // Device 端解缓冲区(单个解) - int* d_flag; // Device 端标志位:0=空,1=有新解 + Sol* d_solution = nullptr; // Device solution buffer (single solution) + int* d_flag = nullptr; // Device flag: 0=empty, 1=new solution + int owner_gpu = 0; // GPU that owns the allocation - // 分配 InjectBuffer(在指定 GPU 上) + // Allocate InjectBuffer (on given GPU) static InjectBuffer allocate(int gpu_id) { InjectBuffer buf; + buf.owner_gpu = gpu_id; - // 保存原设备,切换到目标 GPU int orig_device; - cudaGetDevice(&orig_device); - cudaSetDevice(gpu_id); + CUDA_CHECK(cudaGetDevice(&orig_device)); + CUDA_CHECK(cudaSetDevice(gpu_id)); - // 分配设备内存 - cudaMalloc(&buf.d_solution, sizeof(Sol)); - cudaMalloc(&buf.d_flag, sizeof(int)); + CUDA_CHECK(cudaMalloc(&buf.d_solution, sizeof(Sol))); + CUDA_CHECK(cudaMalloc(&buf.d_flag, sizeof(int))); - // 初始化 flag 为 0 int zero = 0; - cudaMemcpy(buf.d_flag, &zero, sizeof(int), cudaMemcpyHostToDevice); + CUDA_CHECK(cudaMemcpy(buf.d_flag, &zero, sizeof(int), cudaMemcpyHostToDevice)); - // 恢复原设备 - cudaSetDevice(orig_device); + CUDA_CHECK(cudaSetDevice(orig_device)); return buf; } - // 释放 InjectBuffer + // Free InjectBuffer (switches to owner GPU before freeing) void destroy() { - if (d_solution) { - cudaFree(d_solution); - d_solution = nullptr; - } - if (d_flag) { - cudaFree(d_flag); - d_flag = nullptr; + if (d_solution || d_flag) { + int orig_device; + cudaGetDevice(&orig_device); + cudaSetDevice(owner_gpu); + if (d_solution) { cudaFree(d_solution); d_solution = nullptr; } + if (d_flag) { cudaFree(d_flag); d_flag = nullptr; } + cudaSetDevice(orig_device); } } - // CPU 端写入新解 - // 注意:使用同步 cudaMemcpy 避免与 solve() 的 stream 冲突 - // 顺序:先写 solution,再写 flag(GPU 端原子读 flag 确保不会读到半写状态) + // CPU-side write of new solution + // Note: synchronous cudaMemcpy avoids stream conflicts with solve() + // Order: write solution first, then flag (GPU atomic flag read avoids half-written reads) void 
write_sync(const Sol& sol, int target_gpu) { - // 保存原设备,切换到目标 GPU int orig_device; - cudaGetDevice(&orig_device); - cudaSetDevice(target_gpu); + CUDA_CHECK(cudaGetDevice(&orig_device)); + CUDA_CHECK(cudaSetDevice(target_gpu)); - // 先写解数据 - cudaMemcpy(d_solution, &sol, sizeof(Sol), cudaMemcpyHostToDevice); - // 再写标志位(确保解数据已写完) + CUDA_CHECK(cudaMemcpy(d_solution, &sol, sizeof(Sol), cudaMemcpyHostToDevice)); int flag = 1; - cudaMemcpy(d_flag, &flag, sizeof(int), cudaMemcpyHostToDevice); + CUDA_CHECK(cudaMemcpy(d_flag, &flag, sizeof(int), cudaMemcpyHostToDevice)); - // 恢复原设备 - cudaSetDevice(orig_device); + CUDA_CHECK(cudaSetDevice(orig_device)); } }; // ============================================================ -// SeqID — 统一的 OperationSequence 编号 +// SeqID — unified OperationSequence IDs // ============================================================ -// 每个 SeqID 对应一种具体的搜索操作(原子或多步) -// AOS 权重跟踪粒度 = SeqID(每个序列独立权重) +// Each SeqID maps to one concrete search operation (atomic or multi-step) +// AOS weight granularity = SeqID (independent weight per sequence) // -// 命名规则:SEQ_{编码}_{操作名} -// 跨编码共享的行级操作统一编号 +// Naming: SEQ_{encoding}_{operation} +// Row-level ops shared across encodings use unified numbering namespace seq { -// --- Permutation 行内(元素级)--- -constexpr int SEQ_PERM_SWAP = 0; // swap 两个位置 -constexpr int SEQ_PERM_REVERSE = 1; // 2-opt(反转区间) -constexpr int SEQ_PERM_INSERT = 2; // insert(移动到新位置) -constexpr int SEQ_PERM_3OPT = 3; // 3-opt(断 3 边重连) +// --- Permutation in-row (element-level) --- +constexpr int SEQ_PERM_SWAP = 0; // swap two positions +constexpr int SEQ_PERM_REVERSE = 1; // 2-opt (reverse segment) +constexpr int SEQ_PERM_INSERT = 2; // insert (move to new position) +constexpr int SEQ_PERM_3OPT = 3; // 3-opt (reconnect after 3 edges) -// --- Permutation 行内(片段级)--- -constexpr int SEQ_PERM_OR_OPT = 4; // or-opt(移动连续 k 个元素) +// --- Permutation in-row (segment-level) --- +constexpr int SEQ_PERM_OR_OPT = 4; // or-opt (move k consecutive elements) -// --- 
Permutation 行内(组合级)--- -constexpr int SEQ_PERM_DOUBLE_SWAP = 30; // 连续两次 swap(同行) -constexpr int SEQ_PERM_TRIPLE_SWAP = 31; // 连续三次 swap(同行) +// --- Permutation in-row (combo-level) --- +constexpr int SEQ_PERM_DOUBLE_SWAP = 30; // two consecutive swaps (same row) +constexpr int SEQ_PERM_TRIPLE_SWAP = 31; // three consecutive swaps (same row) -// --- Permutation 跨行(元素级)--- -constexpr int SEQ_PERM_CROSS_RELOCATE = 5; // 单元素移行 -constexpr int SEQ_PERM_CROSS_SWAP = 6; // 单元素换行 +// --- Permutation cross-row (element-level) --- +constexpr int SEQ_PERM_CROSS_RELOCATE = 5; // single element moves row +constexpr int SEQ_PERM_CROSS_SWAP = 6; // single element swaps rows -// --- Permutation 跨行(片段级)--- -constexpr int SEQ_PERM_SEG_RELOCATE = 7; // 片段移行 -constexpr int SEQ_PERM_SEG_SWAP = 8; // 片段换行(2-opt*) -constexpr int SEQ_PERM_CROSS_EXCHANGE = 9; // 片段互换(保序) +// --- Permutation cross-row (segment-level) --- +constexpr int SEQ_PERM_SEG_RELOCATE = 7; // segment moves row +constexpr int SEQ_PERM_SEG_SWAP = 8; // segment swaps rows (2-opt*) +constexpr int SEQ_PERM_CROSS_EXCHANGE = 9; // segment exchange (order preserved) -// --- Binary 行内(元素级)--- -constexpr int SEQ_BIN_FLIP = 0; // 翻转一个位 -constexpr int SEQ_BIN_SWAP = 1; // 交换两个位 +// --- Binary in-row (element-level) --- +constexpr int SEQ_BIN_FLIP = 0; // flip one bit +constexpr int SEQ_BIN_SWAP = 1; // swap two bits -// --- Binary 行内(片段级)--- -constexpr int SEQ_BIN_SEG_FLIP = 2; // 翻转连续 k 个位 -constexpr int SEQ_BIN_K_FLIP = 3; // 同时翻转 k 个随机位 +// --- Binary in-row (segment-level) --- +constexpr int SEQ_BIN_SEG_FLIP = 2; // flip k consecutive bits +constexpr int SEQ_BIN_K_FLIP = 3; // flip k random bits at once -// --- Binary 跨行 --- -constexpr int SEQ_BIN_CROSS_SWAP = 4; // 两行各一个位互换 -constexpr int SEQ_BIN_SEG_CROSS_SWAP = 5; // 两行各取一段互换 +// --- Binary cross-row --- +constexpr int SEQ_BIN_CROSS_SWAP = 4; // swap one bit per row across two rows +constexpr int SEQ_BIN_SEG_CROSS_SWAP = 5; // swap a segment from each row -// --- 
共享:行级(编码无关)--- -constexpr int SEQ_ROW_SWAP = 10; // 交换两行 -constexpr int SEQ_ROW_REVERSE = 11; // 反转行排列 -constexpr int SEQ_ROW_SPLIT = 12; // 一行拆两行 -constexpr int SEQ_ROW_MERGE = 13; // 两行合并 +// --- Shared: row-level (encoding-agnostic) --- +constexpr int SEQ_ROW_SWAP = 10; // swap two rows +constexpr int SEQ_ROW_REVERSE = 11; // reverse row order +constexpr int SEQ_ROW_SPLIT = 12; // split one row into two +constexpr int SEQ_ROW_MERGE = 13; // merge two rows -// --- 特殊 --- -constexpr int SEQ_PERTURBATION = 14; // 扰动(多步不可逆) +// --- Special --- +constexpr int SEQ_PERTURBATION = 14; // perturbation (multi-step, irreversible) -// --- Integer 行内(元素级)--- -constexpr int SEQ_INT_RANDOM_RESET = 0; // 随机一个位置重置为 [lb, ub] 内随机值 -constexpr int SEQ_INT_DELTA = 1; // 随机一个位置 ±k(clamp 到 [lb, ub]) -constexpr int SEQ_INT_SWAP = 2; // 交换两个位置的值 +// --- Integer in-row (element-level) --- +constexpr int SEQ_INT_RANDOM_RESET = 0; // reset one position to random in [lb, ub] +constexpr int SEQ_INT_DELTA = 1; // one position ±k (clamped to [lb, ub]) +constexpr int SEQ_INT_SWAP = 2; // swap values at two positions -// --- Integer 行内(片段级)--- -constexpr int SEQ_INT_SEG_RESET = 3; // 连续 k 个位置全部重置 -constexpr int SEQ_INT_K_DELTA = 4; // 随机 k 个位置各自 ±1 +// --- Integer in-row (segment-level) --- +constexpr int SEQ_INT_SEG_RESET = 3; // reset k consecutive positions +constexpr int SEQ_INT_K_DELTA = 4; // k positions each ±1 at random -// --- Integer 跨行 --- -constexpr int SEQ_INT_CROSS_SWAP = 5; // 两行各一个位置互换 +// --- Integer cross-row --- +constexpr int SEQ_INT_CROSS_SWAP = 5; // swap one position per row across two rows -// --- LNS(大邻域搜索)--- -constexpr int SEQ_LNS_SEGMENT_SHUFFLE = 20; // 打乱连续片段 -constexpr int SEQ_LNS_SCATTER_SHUFFLE = 21; // 打乱随机分散位置 -constexpr int SEQ_LNS_GUIDED_REBUILD = 22; // 关系矩阵引导重建 +// --- LNS (large neighborhood search) --- +constexpr int SEQ_LNS_SEGMENT_SHUFFLE = 20; // shuffle a contiguous segment +constexpr int SEQ_LNS_SCATTER_SHUFFLE = 21; // shuffle a scattered set of 
positions +constexpr int SEQ_LNS_GUIDED_REBUILD = 22; // guided rebuild from relation matrix } // namespace seq // ============================================================ -// RelationMatrix — G/O 关系矩阵(GPU global memory) +// RelationMatrix — G/O relation matrix (GPU global memory) // ============================================================ -// G[i][j]: 元素 i 和 j 的分组倾向(对称,越大越倾向同组) -// O[i][j]: 元素 i 排在 j 前面的倾向(不对称) -// 存储为一维数组 [N * N],行优先 -// 小规模 N<200 直接 Dense,P2 再做稀疏化 +// G[i][j]: grouping tendency of elements i and j (symmetric; higher → more same-group) +// O[i][j]: tendency for element i to precede j (asymmetric) +// Stored as a 1D row-major array [N * N] +// For small N<200 use dense directly; P2 may add sparsification // -// 更新时机:host 端,每个 batch 间隙 -// 使用时机:kernel 中 SEQ_LNS_GUIDED_REBUILD 读取 +// Updated on: host, between batches +// Read in: kernel for SEQ_LNS_GUIDED_REBUILD struct RelationMatrix { - float* d_G; // GPU 上的 G 矩阵 [N * N] - float* d_O; // GPU 上的 O 矩阵 [N * N] - float* h_G; // Host 上的 G 矩阵 [N * N](用于更新后上传) - float* h_O; // Host 上的 O 矩阵 [N * N] - int N; // 元素总数 - float decay; // 衰减系数 α(默认 0.95) - int update_count; // 已更新次数(用于冷启动判断) + float* d_G; // G matrix on GPU [N * N] + float* d_O; // O matrix on GPU [N * N] + float* h_G; // G matrix on host [N * N] (for upload after update) + float* h_O; // O matrix on host [N * N] + int N; // total number of elements + float decay; // decay factor α (default 0.95) + int update_count; // number of updates so far (for cold-start logic) }; // ============================================================ -// SeqRegistry — 运行时可用序列注册表 +// SeqRegistry — runtime-available sequence registry // ============================================================ -// 根据 EncodingType 和 dim1 自动确定哪些序列可用 -// 传到 GPU 供 sample_sequence() 使用 +// Which sequences are available is determined from EncodingType and dim1 +// Passed to GPU for sample_sequence() enum class SeqCategory : int { - InRow = 0, // 行内算子(swap, reverse, insert, 
...) - CrossRow = 1, // 跨行算子(cross_relocate, cross_swap, seg_relocate, ...) - RowLevel = 2, // 行级算子(row_swap, row_reverse, split, merge) - LNS = 3, // 大邻域搜索 + InRow = 0, // within-row operators (swap, reverse, insert, ...) + CrossRow = 1, // cross-row operators (cross_relocate, cross_swap, seg_relocate, ...) + RowLevel = 2, // row-level operators (row_swap, row_reverse, split, merge) + LNS = 3, // large neighborhood search }; struct SeqRegistry { - int ids[MAX_SEQ]; // 可用序列的 SeqID 列表 - int count; // 可用序列数量 - float weights[MAX_SEQ]; // 每个序列的当前权重(未归一化,延迟归一化) - float weights_sum; // 权重和(缓存,用于延迟归一化) - float max_w[MAX_SEQ]; // 每个序列的权重上限(0 = 不限,用全局 cap) - SeqCategory categories[MAX_SEQ]; // 每个序列的分类(约束导向用) + int ids[MAX_SEQ]; // SeqID list of available sequences + int count; // number of available sequences + float weights[MAX_SEQ]; // current weight per sequence (unnormalized; lazy normalization) + float weights_sum; // sum of weights (cached for lazy normalization) + float max_w[MAX_SEQ]; // per-sequence weight cap (0 = unlimited, use global cap) + SeqCategory categories[MAX_SEQ]; // category per sequence (for constraint-directed mode) }; // ============================================================ -// KStepConfig — 多步执行的步数选择配置 +// KStepConfig — step-count selection for multi-step execution // ============================================================ -// K=1: 单步(当前行为),K=2/3: 连续执行多个序列后再评估 -// 两层权重体系的第一层 +// K=1: single step (current behavior); K=2/3: run several sequences then evaluate +// First layer of the two-level weight system // -// 自适应策略: -// - 初始 K=1 权重很大(保守),K>1 权重小 -// - K>1 带来改进 → 增大该 K 的权重 -// - 长时间无改进 → 重置/增大 K>1 权重(跳出局部最优) +// Adaptive policy: +// - Initially K=1 has large weight (conservative), K>1 small +// - If K>1 yields improvement → increase that K's weight +// - Long stagnation → reset / boost K>1 weights (escape local optima) struct KStepConfig { - float weights[MAX_K]; // K=1,2,3 的采样权重(归一化) - int stagnation_count; // 连续无改进的 batch 数(用于触发重置) - 
int stagnation_limit; // 触发重置的阈值(默认 5 个 batch) + float weights[MAX_K]; // sampling weights for K=1,2,3 (normalized) + int stagnation_count; // consecutive batches without improvement (triggers reset) + int stagnation_limit; // threshold to trigger reset (default 5 batches) }; -// 构建默认 K 步配置 +// Build default K-step configuration inline KStepConfig build_kstep_config() { KStepConfig kc; - kc.weights[0] = 0.80f; // K=1: 初始主导 - kc.weights[1] = 0.15f; // K=2: 少量探索 - kc.weights[2] = 0.05f; // K=3: 极少探索 + kc.weights[0] = 0.80f; // K=1: dominates initially + kc.weights[1] = 0.15f; // K=2: little exploration + kc.weights[2] = 0.05f; // K=3: minimal exploration kc.stagnation_count = 0; kc.stagnation_limit = 5; return kc; }; // ============================================================ -// ProblemProfile — 基于结构特征推断的问题画像 +// ProblemProfile — problem profile inferred from structural features // ============================================================ -// 第一层:纯结构推断(不感知语义),用于驱动算子注册和初始权重 -// 未来第二层:可扩展更细粒度的画像(如多属性、高约束等) +// Layer 1: structure-only inference (no semantics), drives operator registration and initial weights +// Future layer 2: finer profiles (e.g. 
multi-attribute, high constraint) enum class ScaleClass { Small, Medium, Large }; enum class StructClass { SingleSeq, MultiFixed, MultiPartition }; @@ -286,10 +281,10 @@ struct ProblemProfile { float cross_row_prob; }; -// classify_problem() 定义在 ProblemConfig 之后 +// classify_problem() is defined after ProblemConfig // ============================================================ -// 权重预设 — 由 ScaleClass 驱动 +// Weight presets — driven by ScaleClass // ============================================================ struct WeightPreset { @@ -308,100 +303,100 @@ inline WeightPreset get_weight_preset(ScaleClass scale) { return { 0.50f, 0.80f, 0.006f, 0.01f }; } -// classify_problem() 和 build_seq_registry() 定义在 ProblemConfig 之后 +// classify_problem() and build_seq_registry() are defined after ProblemConfig // ============================================================ -// Solution — 解的模板化表示 +// Solution — templated solution representation // ============================================================ -// D1: 行数上限 (TSP=1, VRP≤16, Schedule≤8) -// D2: 每行列数上限 (TSP≤64, 背包≤32) -// 每个 Problem 选择最小够用的 D1/D2,编译器生成紧凑的结构 +// D1: max number of rows (TSP=1, VRP≤16, Schedule≤8) +// D2: max columns per row (TSP≤64, knapsack≤32) +// Each Problem picks the smallest sufficient D1/D2; compiler emits a compact layout template struct Solution { - static constexpr int DIM1 = D1; // 编译时行数上限 - static constexpr int DIM2 = D2; // 编译时列数上限 - int data[D1][D2]; // D1×D2×4 字节 - int dim2_sizes[D1]; // D1×4 字节 - float objectives[MAX_OBJ]; // 16 字节(固定) - float penalty; // 4 字节 + static constexpr int DIM1 = D1; // compile-time max rows + static constexpr int DIM2 = D2; // compile-time max columns per row + int data[D1][D2]; // D1×D2×4 bytes + int dim2_sizes[D1]; // D1×4 bytes + float objectives[MAX_OBJ]; // 16 bytes (fixed) + float penalty; // 4 bytes }; // ============================================================ -// ProblemConfig — 问题的运行时元信息 +// ProblemConfig — runtime metadata for a problem // 
============================================================ struct ProblemConfig { EncodingType encoding; - int dim1; // 实际使用的行数 (≤ D1) - int dim2_default; // 实际使用的列数 (≤ D2) + int dim1; // actual number of rows used (≤ D1) + int dim2_default; // actual number of columns used (≤ D2) int num_objectives; ObjDir obj_dirs[MAX_OBJ]; - float obj_weights[MAX_OBJ]; // Weighted 模式下的权重 - // 多目标比较 + float obj_weights[MAX_OBJ]; // weights in Weighted mode + // Multi-objective comparison CompareMode compare_mode = CompareMode::Weighted; - int obj_priority[MAX_OBJ] = {0, 1, 2, 3}; // Lexicographic 模式下的比较顺序(索引) - float obj_tolerance[MAX_OBJ] = {0.0f, 0.0f, 0.0f, 0.0f}; // 字典法容差:差值 <= tol 视为相等 + int obj_priority[MAX_OBJ] = {0, 1, 2, 3}; // comparison order in Lexicographic mode (indices) + float obj_tolerance[MAX_OBJ] = {0.0f, 0.0f, 0.0f, 0.0f}; // lexicographic tolerance: |diff| ≤ tol ⇒ tie int value_lower_bound; int value_upper_bound; - // v3.4: 统一行模式 - RowMode row_mode = RowMode::Single; // 行模式(Single/Fixed/Partition) - float cross_row_prob = 0.0f; // 跨行 move 概率(0=纯行内操作) - int total_elements = 0; // Partition 模式下的总元素数 - int perm_repeat_count = 1; // 排列中每个值的重复次数(1=标准排列,>1=多重集排列) + // v3.4: unified row mode + RowMode row_mode = RowMode::Single; // row mode (Single/Fixed/Partition) + float cross_row_prob = 0.0f; // probability of cross-row moves (0 = within-row only) + int total_elements = 0; // total elements in Partition mode + int perm_repeat_count = 1; // repeats per value in permutation (1 = standard; >1 = multiset) }; // ============================================================ -// SolverConfig — 求解器参数 +// SolverConfig — solver parameters // ============================================================ struct SolverConfig { - int pop_size = 0; // 种群大小(0 = 自动匹配 GPU 最大并行度) + int pop_size = 0; // population size (0 = auto to max GPU parallelism) int max_gen = 1000; float mutation_rate = 0.1f; unsigned seed = 42; bool verbose = true; int print_every = 100; - // 岛屿模型参数 - int 
num_islands = 1; // 0 = 自适应,1 = 纯爬山(无岛屿),>1 = 岛屿模型 - int migrate_interval = 100; // 每隔多少代执行一次迁移 + // Island model + int num_islands = 1; // 0 = adaptive, 1 = pure hill climbing (no islands), >1 = island model + int migrate_interval = 100; // migrate every this many generations MigrateStrategy migrate_strategy = MigrateStrategy::Hybrid; - // 模拟退火参数 - float sa_temp_init = 0.0f; // 初始温度(0 = 禁用 SA,纯爬山) - float sa_alpha = 0.998f; // 冷却率(每代乘以 alpha) - // v1.0: 交叉参数 - float crossover_rate = 0.1f; // 每代中执行交叉的概率(vs 变异) - // v2.0: 自适应算子选择 - bool use_aos = false; // 启用 AOS(batch 间更新算子权重) - float aos_weight_floor = AOS_WEIGHT_FLOOR; // 运行时可覆盖的 floor - float aos_weight_cap = AOS_WEIGHT_CAP; // 运行时可覆盖的 cap - // v2.1: 初始解策略 - int init_oversample = 4; // 采样倍数(1 = 不做采样择优,即纯随机) - float init_random_ratio = 0.3f; // 纯随机解占比(多样性保底) - // v3.0: 工程可用性 - float time_limit_sec = 0.0f; // 时间限制(秒,0 = 不限制,按 max_gen 跑完) - int stagnation_limit = 0; // 收敛检测:连续多少个 batch 无改进后 reheat(0 = 禁用) - float reheat_ratio = 0.5f; // reheat 时温度恢复到初始温度的比例 + // Simulated annealing + float sa_temp_init = 0.0f; // initial temperature (0 = disable SA, hill climb only) + float sa_alpha = 0.998f; // cooling rate (multiply by alpha each generation) + // v1.0: crossover + float crossover_rate = 0.1f; // probability of crossover per generation (vs mutation) + // v2.0: adaptive operator selection + bool use_aos = false; // enable AOS (update operator weights between batches) + float aos_weight_floor = AOS_WEIGHT_FLOOR; // runtime-overridable floor + float aos_weight_cap = AOS_WEIGHT_CAP; // runtime-overridable cap + // v2.1: initial solution strategy + int init_oversample = 4; // oversampling factor (1 = no sampling selection, pure random) + float init_random_ratio = 0.3f; // fraction of purely random solutions (diversity floor) + // v3.0: engineering usability + float time_limit_sec = 0.0f; // time limit in seconds (0 = none, run to max_gen) + int stagnation_limit = 0; // convergence: reheat after this many batches 
without improvement (0 = off) + float reheat_ratio = 0.5f; // on reheat, fraction of initial temperature to restore // v3.5: CUDA Graph - bool use_cuda_graph = false; // 启用 CUDA Graph(减少 kernel launch 开销) - // v3.6: AOS 更新频率控制 - int aos_update_interval = 10; // 每隔多少个 batch 更新一次 AOS 权重(降低 cudaMemcpy 同步频率) - // v4.0: 约束导向 + 分层搜索 - bool use_constraint_directed = false; // 启用约束导向(根据 penalty 比例动态调整跨行算子权重) - bool use_phased_search = false; // 启用分层搜索(按进度调整全局 floor/cap) - // 分层搜索参数:三期阈值 - float phase_explore_end = 0.30f; // 探索期结束(进度比例) - float phase_refine_start = 0.70f; // 精细期开始(进度比例) - // 约束导向参数 - float constraint_boost_max = 2.5f; // 高约束时跨行算子 cap 提升倍率上限 - // v5.0: 多 GPU 协同 - int num_gpus = 1; // 使用的 GPU 数量(1 = 单 GPU,>1 = 多 GPU 协同) - float multi_gpu_interval_sec = 10.0f; // GPU 间交换最优解的时间间隔(秒) - MultiGpuInjectMode multi_gpu_inject_mode = MultiGpuInjectMode::HalfIslands; // 注入模式 + bool use_cuda_graph = false; // enable CUDA Graph (fewer kernel launch overheads) + // v3.6: AOS update frequency + int aos_update_interval = 10; // update AOS weights every this many batches (lower cudaMemcpy sync rate) + // v4.0: constraint-directed + phased search + bool use_constraint_directed = false; // constraint-directed mode (scale cross-row weights by penalty ratio) + bool use_phased_search = false; // phased search (adjust global floor/cap by progress) + // Phased search: three-phase thresholds + float phase_explore_end = 0.30f; // end of exploration phase (progress fraction) + float phase_refine_start = 0.70f; // start of refinement phase (progress fraction) + // Constraint-directed parameters + float constraint_boost_max = 2.5f; // max multiplier boost for cross-row cap under high constraint + // v5.0: multi-GPU cooperation + int num_gpus = 1; // number of GPUs (1 = single GPU, >1 = multi-GPU) + float multi_gpu_interval_sec = 10.0f; // interval in seconds to exchange best solutions across GPUs + MultiGpuInjectMode multi_gpu_inject_mode = MultiGpuInjectMode::HalfIslands; // injection 
mode }; // ============================================================ -// classify_problem — 从 ProblemConfig 推断问题画像 +// classify_problem — infer problem profile from ProblemConfig // ============================================================ inline ProblemProfile classify_problem(const ProblemConfig& pcfg) { @@ -424,7 +419,7 @@ inline ProblemProfile classify_problem(const ProblemConfig& pcfg) { } // ============================================================ -// build_seq_registry — 由 ProblemProfile 驱动的算子注册 +// build_seq_registry — operator registration driven by ProblemProfile // ============================================================ inline SeqRegistry build_seq_registry(const ProblemProfile& prof) { @@ -436,7 +431,10 @@ inline SeqRegistry build_seq_registry(const ProblemProfile& prof) { } auto add = [&](int id, float w, SeqCategory cat, float cap = 0.0f) { - if (reg.count >= MAX_SEQ) return; + if (reg.count >= MAX_SEQ) { + printf("[WARN] SeqRegistry full (MAX_SEQ=%d), ignoring SeqID %d\n", MAX_SEQ, id); + return; + } reg.ids[reg.count] = id; reg.weights[reg.count] = w; reg.max_w[reg.count] = cap; @@ -514,7 +512,7 @@ inline SeqRegistry build_seq_registry(const ProblemProfile& prof) { } } - // 延迟归一化:只计算权重和,不归一化 + // Lazy normalization: only sum weights; do not normalize here reg.weights_sum = 0.0f; for (int i = 0; i < reg.count; i++) { reg.weights_sum += reg.weights[i]; @@ -523,19 +521,19 @@ inline SeqRegistry build_seq_registry(const ProblemProfile& prof) { } // ============================================================ -// ObjConfig — 传到 GPU 的目标比较配置(紧凑结构) +// ObjConfig — compact objective comparison config for GPU // ============================================================ struct ObjConfig { int num_obj; CompareMode mode; - ObjDir dirs[MAX_OBJ]; // 每个目标的方向 - float weights[MAX_OBJ]; // Weighted 模式下的权重 - int priority[MAX_OBJ]; // Lexicographic 模式下的比较顺序 - float tolerance[MAX_OBJ]; // Lexicographic 模式下的容差 + ObjDir dirs[MAX_OBJ]; // direction per 
objective + float weights[MAX_OBJ]; // weights in Weighted mode + int priority[MAX_OBJ]; // comparison order in Lexicographic mode + float tolerance[MAX_OBJ]; // tolerance in Lexicographic mode }; -// 从 ProblemConfig 构造 ObjConfig(CPU 端) +// Build ObjConfig from ProblemConfig (CPU side) inline ObjConfig make_obj_config(const ProblemConfig& pcfg) { ObjConfig oc; oc.num_obj = pcfg.num_objectives; @@ -550,7 +548,7 @@ inline ObjConfig make_obj_config(const ProblemConfig& pcfg) { } // ============================================================ -// SolveResult — solve() 的返回值 +// SolveResult — return value of solve() // ============================================================ enum class StopReason { MaxGen, TimeLimit, Stagnation }; @@ -564,12 +562,12 @@ struct SolveResult { }; // ============================================================ -// 目标重要性映射 — 统一 Weighted / Lexicographic 的重要性度量 +// Objective importance mapping — unified importance for Weighted / Lexicographic // ============================================================ -// 用于初始化选种(NSGA-II 加权拥挤度 + 核心目标预留名额) +// Used for initial selection (NSGA-II weighted crowding + core-object slots) // Weighted: importance[i] = weight[i] / Σweight // Lexicographic: importance[i] = 0.5^rank[i] / Σ(0.5^rank) -// → 第一优先级 ~57%,第二 ~29%,第三 ~14% +// → first priority ~57%, second ~29%, third ~14% inline void compute_importance(const ObjConfig& oc, float* importance) { float sum = 0.0f; @@ -590,26 +588,26 @@ inline void compute_importance(const ObjConfig& oc, float* importance) { } // ============================================================ -// 比较工具 — 支持 Weighted / Lexicographic +// Comparison utilities — Weighted / Lexicographic // ============================================================ -// 将目标值统一为"越小越好":Maximize 目标取负 +// Normalize objectives to "smaller is better": negate Maximize objectives __device__ __host__ inline float normalize_obj(float val, ObjDir dir) { return (dir == ObjDir::Maximize) ? 
-val : val; } -// 核心比较:a 是否优于 b -// v5.0: 添加 __host__ 支持多 GPU 在 CPU 端比较解 +// Core comparison: whether a is better than b +// v5.0: add __host__ so multi-GPU can compare solutions on CPU template __device__ __host__ inline bool is_better(const Sol& a, const Sol& b, const ObjConfig& oc) { - // penalty 优先:可行解一定优于不可行解 + // Penalty first: feasible beats infeasible if (a.penalty <= 0.0f && b.penalty > 0.0f) return true; if (a.penalty > 0.0f && b.penalty <= 0.0f) return false; if (a.penalty > 0.0f && b.penalty > 0.0f) return a.penalty < b.penalty; if (oc.mode == CompareMode::Weighted) { - // 加权求和(权重已包含方向信息:Maximize 目标用负权重,或由 normalize_obj 处理) + // Weighted sum (weights may encode direction: negative for Maximize, or use normalize_obj) float sum_a = 0.0f, sum_b = 0.0f; for (int i = 0; i < oc.num_obj; i++) { float na = normalize_obj(a.objectives[i], oc.dirs[i]); @@ -619,21 +617,22 @@ __device__ __host__ inline bool is_better(const Sol& a, const Sol& b, } return sum_a < sum_b; } else { - // 字典法:按 priority 顺序逐目标比较 + // Lexicographic: compare objectives in priority order for (int p = 0; p < oc.num_obj; p++) { int idx = oc.priority[p]; + if (idx < 0 || idx >= oc.num_obj) continue; float va = normalize_obj(a.objectives[idx], oc.dirs[idx]); float vb = normalize_obj(b.objectives[idx], oc.dirs[idx]); float diff = va - vb; - if (diff < -oc.tolerance[idx]) return true; // a 明显更好 - if (diff > oc.tolerance[idx]) return false; // b 明显更好 - // 在容差内视为相等 → 继续比较下一个目标 + if (diff < -oc.tolerance[idx]) return true; // a clearly better + if (diff > oc.tolerance[idx]) return false; // b clearly better + // Within tolerance → tie, continue to next objective } - return false; // 所有目标都在容差内相等 + return false; // all objectives tied within tolerance } } -// 标量化(SA 接受概率用):返回越小越好的标量 +// Scalarization (for SA acceptance): smaller is better template __device__ __host__ inline float scalar_objective(const Sol& sol, const ObjConfig& oc) { @@ -643,13 +642,14 @@ __device__ __host__ inline float 
scalar_objective(const Sol& sol, sum += oc.weights[i] * normalize_obj(sol.objectives[i], oc.dirs[i]); return sum; } else { - // 字典法下 SA 用第一优先级目标作为标量 + // Under lexicographic SA, use first-priority objective as scalar int idx = oc.priority[0]; + if (idx < 0 || idx >= oc.num_obj) idx = 0; return normalize_obj(sol.objectives[idx], oc.dirs[idx]); } } -// 轻量比较:直接操作 float[] 目标数组(避免复制整个 Sol) +// Lightweight comparison: operate on float[] objectives (avoid copying full Sol) __device__ inline bool obj_is_better(const float* new_objs, const float* old_objs, const ObjConfig& oc) { if (oc.mode == CompareMode::Weighted) { @@ -662,6 +662,7 @@ __device__ inline bool obj_is_better(const float* new_objs, const float* old_obj } else { for (int p = 0; p < oc.num_obj; p++) { int idx = oc.priority[p]; + if (idx < 0 || idx >= oc.num_obj) continue; float va = normalize_obj(new_objs[idx], oc.dirs[idx]); float vb = normalize_obj(old_objs[idx], oc.dirs[idx]); float diff = va - vb; @@ -672,7 +673,7 @@ __device__ inline bool obj_is_better(const float* new_objs, const float* old_obj } } -// 轻量标量化:直接操作 float[] 目标数组 +// Lightweight scalarization: operate on float[] objectives __device__ __host__ inline float obj_scalar(const float* objs, const ObjConfig& oc) { if (oc.mode == CompareMode::Weighted) { float sum = 0.0f; @@ -681,60 +682,61 @@ __device__ __host__ inline float obj_scalar(const float* objs, const ObjConfig& return sum; } else { int idx = oc.priority[0]; + if (idx < 0 || idx >= oc.num_obj) idx = 0; return normalize_obj(objs[idx], oc.dirs[idx]); } } // ============================================================ -// AOSStats — 自适应算子选择统计(每个 block 一份) +// AOSStats — adaptive operator selection stats (one per block) // ============================================================ -// v3.0: 粒度从 3 层 → MAX_SEQ 个序列 -// 记录每个序列的使用次数和改进次数 -// batch 结束后由 host 聚合,更新 SeqRegistry 权重 +// v3.0: granularity from 3 layers → MAX_SEQ sequences +// Records per-sequence usage and improvement counts +// Host 
aggregates after each batch and updates SeqRegistry weights struct AOSStats { - // 算子层统计(第二层) - int usage[MAX_SEQ]; // 各序列使用次数 - int improvement[MAX_SEQ]; // 各序列改进次数(delta < 0 且被接受) - // K 步数层统计(第一层) - int k_usage[MAX_K]; // K=1,2,3 各自使用次数 - int k_improvement[MAX_K]; // K=1,2,3 各自改进次数 + // Operator-level stats (second layer) + int usage[MAX_SEQ]; // per-sequence usage counts + int improvement[MAX_SEQ]; // per-sequence improvements (delta < 0 and accepted) + // K-step layer stats (first layer) + int k_usage[MAX_K]; // usage counts for K=1,2,3 + int k_improvement[MAX_K]; // improvement counts for K=1,2,3 }; // ============================================================ -// ObjDef — 单个目标的定义(编译期常量) +// ObjDef — single-objective definition (compile-time constant) // ============================================================ struct ObjDef { - ObjDir dir; // 优化方向 - float weight; // Weighted 模式下的权重 - float tolerance; // Lexicographic 模式下的容差 + ObjDir dir; // optimization direction + float weight; // weight in Weighted mode + float tolerance; // tolerance in Lexicographic mode }; // ============================================================ -// HeuristicMatrix — 启发式初始解构造用的数据矩阵描述 +// HeuristicMatrix — data matrix descriptor for heuristic initial solutions // ============================================================ struct HeuristicMatrix { - const float* data; // host 端 N*N 矩阵 - int N; // 维度 + const float* data; // N×N matrix on host + int N; // dimension }; // ============================================================ -// ProblemBase — CRTP 基类 +// ProblemBase — CRTP base class // -// 用户继承此基类,提供: -// static constexpr ObjDef OBJ_DEFS[] = {...}; — 目标元信息 -// __device__ float compute_obj(int idx, ...) const; — 目标分发 +// Users inherit this base and provide: +// static constexpr ObjDef OBJ_DEFS[] = {...}; — objective metadata +// __device__ float compute_obj(int idx, ...) const; — objective dispatch // __device__ float compute_penalty(...) 
const; // -// 约定:OBJ_DEFS 和 compute_obj 紧挨着写,case N 对应 OBJ_DEFS[N] -// NUM_OBJ 由 sizeof(OBJ_DEFS) 自动推导,无需手动维护 +// Convention: OBJ_DEFS and compute_obj stay aligned; case N maps to OBJ_DEFS[N] +// NUM_OBJ is derived from sizeof(OBJ_DEFS); no manual count // -// 基类自动提供: -// evaluate(sol) — 遍历目标列表调用 compute_obj -// fill_obj_config(cfg) — 从 OBJ_DEFS 自动填充 ProblemConfig -// obj_config() — 直接生成 ObjConfig +// Base class provides: +// evaluate(sol) — loop objectives and call compute_obj +// fill_obj_config(cfg) — fill ProblemConfig from OBJ_DEFS +// obj_config() — build ObjConfig directly // ============================================================ template @@ -743,10 +745,10 @@ struct ProblemBase { static constexpr int D2 = D2_; using Sol = Solution; - // NUM_OBJ 从 OBJ_DEFS 数组自动推导 + // NUM_OBJ derived from OBJ_DEFS array size static constexpr int NUM_OBJ = sizeof(Derived::OBJ_DEFS) / sizeof(ObjDef); - // 自动评估:遍历目标列表 + // Automatic evaluation: iterate objectives __device__ void evaluate(Sol& sol) const { const auto& self = static_cast(*this); constexpr int n = sizeof(Derived::OBJ_DEFS) / sizeof(ObjDef); @@ -755,7 +757,7 @@ struct ProblemBase { sol.penalty = self.compute_penalty(sol); } - // 从 OBJ_DEFS 自动填充 ProblemConfig 的目标部分 + // Fill objective fields of ProblemConfig from OBJ_DEFS void fill_obj_config(ProblemConfig& cfg) const { constexpr int n = sizeof(Derived::OBJ_DEFS) / sizeof(ObjDef); cfg.num_objectives = n; @@ -763,59 +765,59 @@ struct ProblemBase { cfg.obj_dirs[i] = Derived::OBJ_DEFS[i].dir; cfg.obj_weights[i] = Derived::OBJ_DEFS[i].weight; cfg.obj_tolerance[i] = Derived::OBJ_DEFS[i].tolerance; - cfg.obj_priority[i] = i; // 列表顺序即优先级 + cfg.obj_priority[i] = i; // list order is priority order } } - // 直接生成 ObjConfig(供 solver 使用) + // Build ObjConfig directly (for solver) ObjConfig obj_config() const { ProblemConfig pcfg; fill_obj_config(pcfg); return make_obj_config(pcfg); } - // 可选:返回 shared memory 需求(字节) - // 默认返回 0(不使用 shared memory) - // 子类覆盖:如果问题数据可以放入 shared 
memory,返回实际大小 + // Optional: shared memory requirement (bytes) + // Default 0 (no shared memory) + // Override if problem data fits in shared memory; return actual size size_t shared_mem_bytes() const { return 0; } - // 可选:加载问题数据到 shared memory - // 默认空实现(不使用 shared memory) - // 子类覆盖:如果 shared_mem_bytes() > 0,实现数据加载逻辑 + // Optional: load problem data into shared memory + // Default no-op (no shared memory) + // Override if shared_mem_bytes() > 0 to implement loading __device__ void load_shared(char* smem, int tid, int bsz) { - (void)smem; (void)tid; (void)bsz; // 默认:不做任何事 + (void)smem; (void)tid; (void)bsz; // default: no-op } - // 每个 block 在 global memory 中的热数据工作集大小(字节) - // 用于 auto pop_size 估算 L2 cache 压力 - // 默认 = shared_mem_bytes()(数据在 smem 时,gmem 工作集为 0 不影响) - // 子类覆盖:当 shared_mem_bytes() 返回 0(数据放不进 smem)时, - // 返回实际数据大小(如距离矩阵 n*n*sizeof(float)) + // Hot working-set size in global memory per block (bytes) + // Used for auto pop_size L2 cache pressure estimate + // Default = shared_mem_bytes() (when data is in smem, gmem working set is 0) + // Override when shared_mem_bytes() is 0 (data does not fit in smem): + // return actual data size (e.g. 
distance matrix n*n*sizeof(float)) size_t working_set_bytes() const { return static_cast(*this).shared_mem_bytes(); } - // 可选:初始化 G/O 关系矩阵(为 GUIDED_REBUILD 提供先验知识) - // G[i*N+j]: 元素 i 和 j 的分组倾向(对称,[0,1],越大越倾向同组) - // O[i*N+j]: 元素 i 排在 j 前面的倾向(不对称,[0,1]) - // 默认不提供(全零),搜索过程中通过 EMA 从历史好解积累 - // 用户覆盖示例:距离近 → G 和 O 都高 + // Optional: initialize G/O relation matrix (prior for GUIDED_REBUILD) + // G[i*N+j]: grouping tendency of i and j (symmetric, [0,1]; higher → same group) + // O[i*N+j]: tendency for i before j (asymmetric, [0,1]) + // Default none (zeros); EMA accumulates from good solutions during search + // Example override: close distance → high G and O void init_relation_matrix(float* h_G, float* h_O, int N) const { - (void)h_G; (void)h_O; (void)N; // 默认:不做任何事(保持全零) + (void)h_G; (void)h_O; (void)N; // default: no-op (keep zeros) } - // 可选:返回 host 端数据矩阵供启发式初始解构造 - // 默认返回 0(不提供),子类 override 后填充 out 数组并返回实际数量 + // Optional: host-side data matrices for heuristic initial solutions + // Default 0 (none); override to fill out[] and return count int heuristic_matrices(HeuristicMatrix* out, int max_count) const { (void)out; (void)max_count; return 0; } - // v5.0: 多 GPU 协同 — 克隆 Problem 到指定 GPU - // 子类需实现:cudaSetDevice(gpu_id) + 分配设备内存 + 拷贝数据 - // 返回新的 Problem 实例指针(在 host 端,但其内部设备指针指向 gpu_id) + // v5.0: multi-GPU — clone Problem to a given GPU + // Subclasses implement: cudaSetDevice(gpu_id) + device alloc + copy + // Returns new Problem* on host; internal device pointers target gpu_id virtual Derived* clone_to_device(int gpu_id) const { (void)gpu_id; fprintf(stderr, "Error: clone_to_device() not implemented for this Problem type\n"); diff --git a/prototype/problems/assignment.cuh b/prototype/problems/assignment.cuh index 6b4cdfb..7f8f975 100644 --- a/prototype/problems/assignment.cuh +++ b/prototype/problems/assignment.cuh @@ -1,7 +1,7 @@ /** - * assignment.cuh - 指派问题 - * - * 继承 ProblemBase,使用 ObjDef 目标注册机制 + * assignment.cuh - assignment problem + * + * Extends 
ProblemBase with ObjDef objective registration. */ #pragma once @@ -11,10 +11,10 @@ struct AssignmentProblem : ProblemBase { const float* d_cost; - const float* h_cost; // host 端成本矩阵(用于 init_relation_matrix) + const float* h_cost; // host cost matrix (for init_relation_matrix) int n; - // ---- 目标计算 ---- + // ---- objective evaluation ---- __device__ float calc_total_cost(const Sol& sol) const { float total = 0.0f; const int* assign = sol.data[0]; @@ -24,7 +24,7 @@ struct AssignmentProblem : ProblemBase { return total; } - // ---- 目标定义(OBJ_DEFS 与 compute_obj 必须一一对应)---- + // ---- objective defs (OBJ_DEFS must match compute_obj one-to-one) ---- static constexpr ObjDef OBJ_DEFS[] = { {ObjDir::Minimize, 1.0f, 0.0f}, // case 0: calc_total_cost }; @@ -47,7 +47,7 @@ struct AssignmentProblem : ProblemBase { return cfg; } - // ---- shared memory 接口 ---- + // ---- shared memory interface ---- static constexpr size_t SMEM_LIMIT = 48 * 1024; size_t shared_mem_bytes() const { @@ -66,12 +66,12 @@ struct AssignmentProblem : ProblemBase { d_cost = sc; } - // 成本先验:task j 和 task k 如果被相似 agent 偏好,G 值高 - // O 矩阵:task j 在位置 i 成本低 → O[j][k] 略高(j 倾向排在 k 前面的位置) + // Cost prior: if tasks j and k are similarly preferred by agents, G is high + // O matrix: low cost for task j at slot i → slightly higher O[j][k] (j tends before k) void init_relation_matrix(float* G, float* O, int N) const { if (!h_cost || N != n) return; - // 对每个 task,构建成本向量,task 间余弦相似度 → G - // 简化:成本列向量的相关性 + // Per task, build cost vectors; cosine similarity between tasks → G + // Simplified: correlation of cost columns float max_c = 0.0f; for (int i = 0; i < N * N; i++) if (h_cost[i] > max_c) max_c = h_cost[i]; @@ -80,7 +80,7 @@ struct AssignmentProblem : ProblemBase { for (int j = 0; j < N; j++) for (int k = 0; k < N; k++) { if (j == k) continue; - // G: 两个 task 的成本向量越相似 → 越可能互换 + // G: more similar cost columns → more likely to swap tasks float dot = 0.0f, nj = 0.0f, nk = 0.0f; for (int i = 0; i < N; i++) { float cj = 
h_cost[i * N + j] / max_c; diff --git a/prototype/problems/bin_packing.cuh b/prototype/problems/bin_packing.cuh index f230d4a..9616f95 100644 --- a/prototype/problems/bin_packing.cuh +++ b/prototype/problems/bin_packing.cuh @@ -1,13 +1,13 @@ /** - * bin_packing.cuh - 一维装箱问题(Integer 编码 + 约束) - * - * N 个物品,每个重量 w[i],装入最多 B 个箱子,每个箱子容量 C。 - * 决策变量:data[0][i] ∈ [0, B-1],表示物品 i 放入的箱子编号。 - * 目标:最小化使用的箱子数。 - * 约束:每个箱子总重不超过 C,超出部分作为 penalty。 - * - * 验证实例:8 物品 weights=[7,5,3,4,6,2,8,1], C=10, 最优=4 箱 - * 箱0={7,3}=10, 箱1={5,4,1}=10, 箱2={6,2}=8, 箱3={8}=8 + * bin_packing.cuh - one-dimensional bin packing (Integer encoding + constraints) + * + * N items with weights w[i], at most B bins, capacity C per bin. + * Decision: data[0][i] in [0, B-1] = bin index for item i. + * Objective: minimize number of bins used. + * Constraint: bin load ≤ C; overflow contributes to penalty. + * + * Validation instance: 8 items weights=[7,5,3,4,6,2,8,1], C=10, optimum=4 bins + * bin0={7,3}=10, bin1={5,4,1}=10, bin2={6,2}=8, bin3={8}=8 */ #pragma once @@ -16,9 +16,9 @@ struct BinPackingProblem : ProblemBase { const float* d_weights; - int n; // 物品数 - int max_bins; // 最大箱子数 B - float capacity; // 箱子容量 C + int n; // number of items + int max_bins; // max bins B + float capacity; // bin capacity C __device__ float calc_bins_used(const Sol& sol) const { bool used[32] = {}; diff --git a/prototype/problems/graph_color.cuh b/prototype/problems/graph_color.cuh index fada0ec..1df1101 100644 --- a/prototype/problems/graph_color.cuh +++ b/prototype/problems/graph_color.cuh @@ -1,11 +1,11 @@ /** - * graph_color.cuh - 图着色问题(Integer 编码) - * - * N 个节点的图,用 k 种颜色着色。 - * 决策变量:data[0][i] ∈ [0, k-1],表示节点 i 的颜色。 - * 目标:最小化冲突边数(相邻节点同色的边数)。 - * - * 验证实例:Petersen 图(10 节点 15 边,色数=3,最优冲突=0) + * graph_color.cuh - graph coloring (Integer encoding) + * + * Graph on N nodes, k colors. + * Decision: data[0][i] in [0, k-1] = color of node i. + * Objective: minimize number of conflicting edges (adjacent same color). 
+ * + * Validation instance: Petersen graph (10 nodes, 15 edges, chromatic number 3, optimal conflicts=0) */ #pragma once @@ -13,9 +13,9 @@ #include "cuda_utils.cuh" struct GraphColorProblem : ProblemBase { - const int* d_adj; // 邻接矩阵 [N*N](1=相邻, 0=不相邻) - int n; // 节点数 - int k; // 颜色数 + const int* d_adj; // adjacency [N*N] (1=edge, 0=no edge) + int n; // number of nodes + int k; // number of colors __device__ float calc_conflicts(const Sol& sol) const { int conflicts = 0; diff --git a/prototype/problems/jsp.cuh b/prototype/problems/jsp.cuh index 2297380..24c45d9 100644 --- a/prototype/problems/jsp.cuh +++ b/prototype/problems/jsp.cuh @@ -1,26 +1,26 @@ /** - * jsp.cuh - 车间调度问题 (Job Shop Scheduling Problem) - * - * J 个工件,每个工件有 O 道工序,每道工序指定机器和耗时。 - * - * === 编码方案 A:Integer 多行(时间表编码)=== - * JSPProblem: data[j][i] = 工件 j 的第 i 道工序的开始时间 + * jsp.cuh - Job Shop Scheduling Problem (JSSP) + * + * J jobs, each with O operations; each op specifies machine and duration. + * + * === Encoding A: multi-row Integer (time-table encoding) === + * JSPProblem: data[j][i] = start time of job j's i-th operation * dim1 = num_jobs, dim2_default = num_ops - * row_mode = Fixed(禁止 ROW_SPLIT/ROW_MERGE) - * 每行代表一个工件的固定工序序列,行长度不可变 - * - * === 编码方案 B:Permutation 多重集(工序排列编码)=== - * JSPPermProblem: data[0][k] = 工件编号(0..J-1),长度 J*O - * 值 j 出现 O 次。从左到右扫描,第 t 次遇到值 j 表示工件 j 的第 t 道工序。 + * row_mode = Fixed (no ROW_SPLIT/ROW_MERGE) + * Each row is a fixed op sequence for one job; row length is fixed. + * + * === Encoding B: Permutation multiset (operation sequence encoding) === + * JSPPermProblem: data[0][k] = job id (0..J-1), length J*O + * Value j appears O times. Left-to-right scan: t-th occurrence of j is job j's t-th op. 
* dim1 = 1, dim2_default = J*O, perm_repeat_count = O - * 标准 Permutation 算子(swap/reverse/insert)天然保持多重集结构 - * - * 目标:Minimize makespan(所有工件完成时间的最大值)。 - * 约束: - * (a) 工序顺序:同一工件的工序必须按序执行 - * (b) 机器冲突:同一机器同一时刻只能处理一个工序 - * - * 验证实例:自定义 3 工件 3 机器 (3x3),最优 makespan = 12 + * Standard permutation ops (swap/reverse/insert) preserve multiset structure. + * + * Objective: minimize makespan (max completion time over jobs). + * Constraints: + * (a) Precedence: ops of the same job must run in order. + * (b) Machine conflict: one op per machine at a time. + * + * Validation instance: custom 3 jobs × 3 machines (3x3), optimal makespan = 12 */ #pragma once @@ -28,16 +28,16 @@ #include "cuda_utils.cuh" // ============================================================ -// 编码方案 A:Integer 多行(时间表编码) +// Encoding A: multi-row Integer (time-table encoding) // ============================================================ struct JSPProblem : ProblemBase { - const int* d_machine; // 工序所需机器 [J*O] - const float* d_duration; // 工序耗时 [J*O] - int num_jobs; // 工件数 J - int num_ops; // 每工件工序数 O - int num_machines; // 机器数 M - int time_horizon; // 时间上界 + const int* d_machine; // machine per op [J*O] + const float* d_duration; // op duration [J*O] + int num_jobs; // number of jobs J + int num_ops; // ops per job O + int num_machines; // number of machines M + int time_horizon; // time horizon upper bound __device__ float calc_makespan(const Sol& sol) const { float makespan = 0.0f; @@ -62,7 +62,7 @@ struct JSPProblem : ProblemBase { __device__ float compute_penalty(const Sol& sol) const { float penalty = 0.0f; - // (a) 工序顺序约束 + // (a) Precedence constraints for (int j = 0; j < num_jobs; j++) { for (int i = 1; i < num_ops; i++) { float prev_end = (float)sol.data[j][i-1] + d_duration[j * num_ops + (i-1)]; @@ -72,7 +72,7 @@ struct JSPProblem : ProblemBase { } } - // (b) 机器冲突约束 + // (b) Machine conflict constraints int total = num_jobs * num_ops; for (int a = 0; a < total; a++) { int ja = a / num_ops, ia = a % 
num_ops; @@ -151,28 +151,28 @@ struct JSPProblem : ProblemBase { }; // ============================================================ -// 编码方案 B:Permutation 多重集(工序排列编码) +// Encoding B: Permutation multiset (operation sequence encoding) // ============================================================ -// data[0] 是长度 J*O 的排列,值域 [0, J),每个值出现 O 次 -// 从左到右扫描:第 t 次遇到值 j → 安排工件 j 的第 t 道工序 -// 贪心解码:每道工序安排在"最早可行时间"(满足工序顺序 + 机器空闲) +// data[0] is a length-J*O sequence with values in [0, J), each appearing O times. +// Left-to-right: t-th occurrence of j schedules job j's t-th operation. +// Greedy decode: each op at earliest feasible time (precedence + machine free). struct JSPPermProblem : ProblemBase { - const int* d_machine; // 工序所需机器 [J*O] - const float* d_duration; // 工序耗时 [J*O] + const int* d_machine; // machine per op [J*O] + const float* d_duration; // op duration [J*O] int num_jobs; int num_ops; int num_machines; - // 贪心解码:从排列生成调度方案,返回 makespan + // Greedy decode: build schedule from permutation, return makespan __device__ float decode_and_makespan(const Sol& sol) const { int total = num_jobs * num_ops; int size = sol.dim2_sizes[0]; if (size < total) return 1e9f; - float job_avail[8]; // 每个工件的下一道工序最早开始时间 - float mach_avail[8]; // 每台机器的最早空闲时间 - int job_next_op[8]; // 每个工件的下一道待安排工序编号 + float job_avail[8]; // earliest start for next op of each job + float mach_avail[8]; // earliest machine free time + int job_next_op[8]; // next op index to schedule per job for (int j = 0; j < num_jobs; j++) { job_avail[j] = 0.0f; job_next_op[j] = 0; } for (int m = 0; m < num_machines; m++) mach_avail[m] = 0.0f; @@ -182,13 +182,13 @@ struct JSPPermProblem : ProblemBase { int j = sol.data[0][k]; if (j < 0 || j >= num_jobs) return 1e9f; int op = job_next_op[j]; - if (op >= num_ops) continue; // 该工件已安排完 + if (op >= num_ops) continue; // job already fully scheduled int flat = j * num_ops + op; int m = d_machine[flat]; float dur = d_duration[flat]; - // 最早开始时间 = max(工件前序完成, 机器空闲) + // Earliest 
start = max(job predecessor done, machine free) float start = fmaxf(job_avail[j], mach_avail[m]); float end = start + dur; @@ -212,7 +212,7 @@ struct JSPPermProblem : ProblemBase { } } - // 贪心解码天然满足约束,penalty 始终为 0 + // Greedy decode satisfies constraints; penalty is always 0 __device__ float compute_penalty(const Sol& sol) const { return 0.0f; } diff --git a/prototype/problems/knapsack.cuh b/prototype/problems/knapsack.cuh index 82f47e8..0bf4a8e 100644 --- a/prototype/problems/knapsack.cuh +++ b/prototype/problems/knapsack.cuh @@ -1,7 +1,7 @@ /** - * knapsack.cuh - 0-1 背包问题 - * - * 继承 ProblemBase,使用 ObjDef 目标注册机制 + * knapsack.cuh - 0-1 knapsack + * + * Extends ProblemBase with ObjDef objective registration. */ #pragma once @@ -10,13 +10,13 @@ #include "operators.cuh" struct KnapsackProblem : ProblemBase { - // 问题数据(d_weights 是物品重量,非目标权重) + // problem data (d_weights are item weights, not objective weights) const float* d_weights; const float* d_values; float capacity; int n; - // ---- 目标计算 ---- + // ---- objective evaluation ---- __device__ float calc_total_value(const Sol& sol) const { float tv = 0.0f; const int* sel = sol.data[0]; @@ -26,7 +26,7 @@ struct KnapsackProblem : ProblemBase { return tv; } - // ---- 目标定义(OBJ_DEFS 与 compute_obj 必须一一对应)---- + // ---- objective defs (OBJ_DEFS must match compute_obj one-to-one) ---- static constexpr ObjDef OBJ_DEFS[] = { {ObjDir::Maximize, 1.0f, 0.0f}, // case 0: calc_total_value }; @@ -55,7 +55,7 @@ struct KnapsackProblem : ProblemBase { return cfg; } - // ---- shared memory 接口 ---- + // ---- shared memory interface ---- size_t shared_mem_bytes() const { return 2 * (size_t)n * sizeof(float); } diff --git a/prototype/problems/load_balance.cuh b/prototype/problems/load_balance.cuh index b462c9f..ee92017 100644 --- a/prototype/problems/load_balance.cuh +++ b/prototype/problems/load_balance.cuh @@ -1,12 +1,12 @@ /** - * load_balance.cuh - 离散负载均衡问题(Integer 编码验证) - * - * N 个任务分配到 M 台机器,每个任务有一个处理时间 p[i]。 - * 决策变量:data[0][i] ∈ 
[0, M-1],表示任务 i 分配到哪台机器。 - * 目标:最小化 makespan(最大机器负载)。 - * - * 已知 NP-hard(等价于 multiprocessor scheduling / load balancing)。 - * LPT(最长处理时间优先)贪心可得 4/3 近似。 + * load_balance.cuh - discrete load balancing (Integer encoding sanity check) + * + * N tasks on M machines, processing time p[i] per task. + * Decision: data[0][i] in [0, M-1] = machine for task i. + * Objective: minimize makespan (max machine load). + * + * NP-hard (same as multiprocessor scheduling / load balancing). + * LPT (longest processing time first) greedy achieves 4/3 approximation. */ #pragma once @@ -14,12 +14,12 @@ #include "cuda_utils.cuh" struct LoadBalanceProblem : ProblemBase { - const float* d_proc_time; // 任务处理时间 [N] - int n; // 任务数 - int m; // 机器数 + const float* d_proc_time; // task processing times [N] + int n; // number of tasks + int m; // number of machines __device__ float calc_makespan(const Sol& sol) const { - float load[32] = {}; // 最多 32 台机器 + float load[32] = {}; // at most 32 machines int size = sol.dim2_sizes[0]; for (int i = 0; i < size; i++) { int machine = sol.data[0][i]; @@ -43,7 +43,7 @@ struct LoadBalanceProblem : ProblemBase { } __device__ float compute_penalty(const Sol& sol) const { - return 0.0f; // 无约束(任何分配都合法) + return 0.0f; // no side constraints (any assignment is feasible) } ProblemConfig config() const { diff --git a/prototype/problems/qap.cuh b/prototype/problems/qap.cuh index 69343e2..352e3ca 100644 --- a/prototype/problems/qap.cuh +++ b/prototype/problems/qap.cuh @@ -1,14 +1,14 @@ /** - * qap.cuh - 二次分配问题 (Quadratic Assignment Problem) - * - * N 个设施分配到 N 个位置(排列编码)。 - * 决策变量:data[0][i] = 设施 i 分配到的位置。 - * 目标:Minimize sum(flow[i][j] * dist[perm[i]][perm[j]]) - * - * 验证实例:自定义 5x5 - * flow: 设施间的物流量 - * dist: 位置间的距离 - * 已知最优 = 58 + * qap.cuh - Quadratic Assignment Problem (QAP) + * + * Assign N facilities to N locations (permutation encoding). + * Decision: data[0][i] = location assigned to facility i. 
+ * Objective: Minimize sum(flow[i][j] * dist[perm[i]][perm[j]]) + * + * Validation instance: custom 5x5 + * flow: inter-facility flow + * dist: inter-location distances + * known optimum = 58 */ #pragma once @@ -16,8 +16,10 @@ #include "cuda_utils.cuh" struct QAPProblem : ProblemBase { - const float* d_flow; // 物流量矩阵 [N*N] - const float* d_dist; // 距离矩阵 [N*N] + const float* d_flow; // flow matrix [N*N] (device) + const float* d_dist; // distance matrix [N*N] (device) + const float* h_flow; // flow matrix [N*N] (host, for clone_to_device) + const float* h_dist; // distance matrix [N*N] (host, for clone_to_device) int n; __device__ float calc_cost(const Sol& sol) const { @@ -64,14 +66,16 @@ struct QAPProblem : ProblemBase { d_dist = sd; } - static QAPProblem create(const float* h_flow, const float* h_dist, int n) { + static QAPProblem create(const float* h_flow_in, const float* h_dist_in, int n) { QAPProblem prob; prob.n = n; + prob.h_flow = h_flow_in; + prob.h_dist = h_dist_in; float *df, *dd; CUDA_CHECK(cudaMalloc(&df, sizeof(float) * n * n)); CUDA_CHECK(cudaMalloc(&dd, sizeof(float) * n * n)); - CUDA_CHECK(cudaMemcpy(df, h_flow, sizeof(float) * n * n, cudaMemcpyHostToDevice)); - CUDA_CHECK(cudaMemcpy(dd, h_dist, sizeof(float) * n * n, cudaMemcpyHostToDevice)); + CUDA_CHECK(cudaMemcpy(df, h_flow_in, sizeof(float) * n * n, cudaMemcpyHostToDevice)); + CUDA_CHECK(cudaMemcpy(dd, h_dist_in, sizeof(float) * n * n, cudaMemcpyHostToDevice)); prob.d_flow = df; prob.d_dist = dd; return prob; } @@ -82,18 +86,12 @@ struct QAPProblem : ProblemBase { d_flow = nullptr; d_dist = nullptr; } - // v5.0: 多 GPU 协同 — 克隆到指定 GPU + // v5.0: multi-GPU — clone onto a given device QAPProblem* clone_to_device(int gpu_id) const override { int orig_device; CUDA_CHECK(cudaGetDevice(&orig_device)); - // 先下载数据到 host(从当前设备) - float* h_flow = new float[n * n]; - float* h_dist = new float[n * n]; - CUDA_CHECK(cudaMemcpy(h_flow, d_flow, sizeof(float) * n * n, cudaMemcpyDeviceToHost)); - 
CUDA_CHECK(cudaMemcpy(h_dist, d_dist, sizeof(float) * n * n, cudaMemcpyDeviceToHost)); - - // 切换到目标 GPU 并上传 + // Use host-side matrices directly (no D2H needed) CUDA_CHECK(cudaSetDevice(gpu_id)); float *df, *dd; CUDA_CHECK(cudaMalloc(&df, sizeof(float) * n * n)); @@ -101,15 +99,12 @@ struct QAPProblem : ProblemBase { CUDA_CHECK(cudaMemcpy(df, h_flow, sizeof(float) * n * n, cudaMemcpyHostToDevice)); CUDA_CHECK(cudaMemcpy(dd, h_dist, sizeof(float) * n * n, cudaMemcpyHostToDevice)); - delete[] h_flow; - delete[] h_dist; - - // 恢复原设备 CUDA_CHECK(cudaSetDevice(orig_device)); - // 创建新实例 QAPProblem* new_prob = new QAPProblem(); new_prob->n = n; + new_prob->h_flow = h_flow; + new_prob->h_dist = h_dist; new_prob->d_flow = df; new_prob->d_dist = dd; diff --git a/prototype/problems/schedule.cuh b/prototype/problems/schedule.cuh index 12409e1..0862fb3 100644 --- a/prototype/problems/schedule.cuh +++ b/prototype/problems/schedule.cuh @@ -1,8 +1,8 @@ /** - * schedule.cuh - 排班问题 - * - * 继承 ProblemBase,使用 ObjDef 目标注册机制 - * 2 个目标:总成本(min)+ 不公平度(min,权重更高) + * schedule.cuh - staff scheduling + * + * Extends ProblemBase with ObjDef objective registration. + * Two objectives: total cost (min) + unfairness (min, higher weight). 
*/ #pragma once @@ -14,7 +14,7 @@ struct ScheduleProblem : ProblemBase { const float* d_cost; int days, emps, required; - // ---- 目标计算 ---- + // ---- objective evaluation ---- __device__ float calc_total_cost(const Sol& sol) const { float total = 0.0f; for (int d = 0; d < days; d++) @@ -37,7 +37,7 @@ struct ScheduleProblem : ProblemBase { return (float)(max_w - min_w); } - // ---- 目标定义(OBJ_DEFS 与 compute_obj 必须一一对应)---- + // ---- objective defs (OBJ_DEFS must match compute_obj one-to-one) ---- static constexpr ObjDef OBJ_DEFS[] = { {ObjDir::Minimize, 1.0f, 0.0f}, // case 0: calc_total_cost {ObjDir::Minimize, 5.0f, 0.0f}, // case 1: calc_unfairness @@ -71,9 +71,9 @@ struct ScheduleProblem : ProblemBase { return cfg; } - // 默认回退全量(基类行为)— 不需要覆盖 evaluate_move + // Default full re-eval (base behavior) — no need to override evaluate_move - // ---- shared memory 接口 ---- + // ---- shared memory interface ---- size_t shared_mem_bytes() const { return (size_t)days * emps * sizeof(float); } diff --git a/prototype/problems/tsp.cuh b/prototype/problems/tsp.cuh index 8085ab2..4657e9a 100644 --- a/prototype/problems/tsp.cuh +++ b/prototype/problems/tsp.cuh @@ -1,7 +1,7 @@ /** - * tsp.cuh - TSP 问题定义 - * - * 继承 ProblemBase,使用 ObjDef 目标注册机制 + * tsp.cuh - Traveling Salesman Problem (TSP) definition + * + * Extends ProblemBase with ObjDef objective registration. 
*/ #pragma once @@ -10,12 +10,12 @@ #include "operators.cuh" struct TSPProblem : ProblemBase { - // 问题数据 + // problem data const float* d_dist; - const float* h_dist; // host 端距离矩阵(用于 init_relation_matrix) + const float* h_dist; // host distance matrix (for init_relation_matrix) int n; - // ---- 目标计算 ---- + // ---- objective evaluation ---- __device__ float calc_total_distance(const Sol& sol) const { float total = 0.0f; const int* route = sol.data[0]; @@ -25,7 +25,7 @@ struct TSPProblem : ProblemBase { return total; } - // ---- 目标定义(OBJ_DEFS 与 compute_obj 必须一一对应)---- + // ---- objective defs (OBJ_DEFS must match compute_obj one-to-one) ---- static constexpr ObjDef OBJ_DEFS[] = { {ObjDir::Minimize, 1.0f, 0.0f}, // case 0: calc_total_distance }; @@ -37,10 +37,10 @@ struct TSPProblem : ProblemBase { } __device__ float compute_penalty(const Sol& sol) const { - return 0.0f; // TSP 无约束 + return 0.0f; // TSP has no side constraints } - // ---- config(编码/维度部分,目标由基类自动填充)---- + // ---- config (encoding/dims; objectives filled by base class) ---- ProblemConfig config() const { ProblemConfig cfg; cfg.encoding = EncodingType::Permutation; @@ -49,7 +49,7 @@ struct TSPProblem : ProblemBase { return cfg; } - // ---- shared memory 接口 ---- + // ---- shared memory interface ---- static constexpr size_t SMEM_LIMIT = 48 * 1024; size_t shared_mem_bytes() const { @@ -69,7 +69,7 @@ struct TSPProblem : ProblemBase { d_dist = sd; } - // 距离先验:距离近 → G/O 分数高 + // Distance prior: closer cities → higher G/O scores void init_relation_matrix(float* G, float* O, int N) const { if (!h_dist || N != n) return; float max_d = 0.0f; @@ -108,21 +108,21 @@ struct TSPProblem : ProblemBase { h_dist = nullptr; } - // v5.0: 多 GPU 协同 — 克隆到指定 GPU + // v5.0: multi-GPU — clone onto a given device TSPProblem* clone_to_device(int gpu_id) const override { int orig_device; CUDA_CHECK(cudaGetDevice(&orig_device)); CUDA_CHECK(cudaSetDevice(gpu_id)); - // 分配设备内存并拷贝距离矩阵 + // Allocate device memory and copy distance matrix 
float* dd; CUDA_CHECK(cudaMalloc(&dd, sizeof(float) * n * n)); CUDA_CHECK(cudaMemcpy(dd, h_dist, sizeof(float) * n * n, cudaMemcpyHostToDevice)); - // 恢复原设备 + // Restore original device CUDA_CHECK(cudaSetDevice(orig_device)); - // 创建新的 Problem 实例(在 host 端) + // Create new Problem instance (on host) TSPProblem* new_prob = new TSPProblem(); new_prob->n = n; new_prob->h_dist = h_dist; diff --git a/prototype/problems/tsp_large.cuh b/prototype/problems/tsp_large.cuh index 363b09b..fc411fd 100644 --- a/prototype/problems/tsp_large.cuh +++ b/prototype/problems/tsp_large.cuh @@ -1,7 +1,7 @@ /** - * tsp_large.cuh - 大规模 TSP 问题定义 (最多 256 城市) - * - * 继承 ProblemBase,逻辑与 tsp.cuh 一致,仅 D2 上限不同 + * tsp_large.cuh - large-scale TSP definition (up to 256 cities) + * + * Same logic as tsp.cuh under ProblemBase; only D2 cap differs. */ #pragma once @@ -14,7 +14,7 @@ struct TSPLargeProblem : ProblemBase { const float* h_dist; int n; - // ---- 目标计算 ---- + // ---- objective evaluation ---- __device__ float calc_total_distance(const Sol& sol) const { float total = 0.0f; const int* route = sol.data[0]; @@ -24,7 +24,7 @@ struct TSPLargeProblem : ProblemBase { return total; } - // ---- 目标定义(OBJ_DEFS 与 compute_obj 必须一一对应)---- + // ---- objective defs (OBJ_DEFS must match compute_obj one-to-one) ---- static constexpr ObjDef OBJ_DEFS[] = { {ObjDir::Minimize, 1.0f, 0.0f}, // case 0: calc_total_distance }; @@ -54,7 +54,7 @@ struct TSPLargeProblem : ProblemBase { return need <= SMEM_LIMIT ? 
need : 0; } - // 距离矩阵的实际大小(不管是否放进 smem) + // Actual distance matrix size (whether or not placed in smem) size_t working_set_bytes() const { return (size_t)n * n * sizeof(float); } diff --git a/prototype/problems/tsp_xlarge.cuh b/prototype/problems/tsp_xlarge.cuh index fa6afef..f2052d2 100644 --- a/prototype/problems/tsp_xlarge.cuh +++ b/prototype/problems/tsp_xlarge.cuh @@ -1,9 +1,9 @@ /** - * tsp_xlarge.cuh - 超大规模 TSP 问题定义 (最多 512 城市) - * - * 继承 ProblemBase,逻辑与 tsp_large.cuh 一致,D2=512 - * 注意:距离矩阵 512×512×4B = 1MB,远超 48KB shared memory - * 因此 shared_mem_bytes() 返回 0,距离矩阵留在 global memory + * tsp_xlarge.cuh - very large TSP definition (up to 512 cities) + * + * Same as tsp_large.cuh under ProblemBase, with D2=512. + * Note: distance matrix 512×512×4B = 1MB, far above 48KB shared memory, + * so shared_mem_bytes() returns 0 and the matrix stays in global memory. */ #pragma once @@ -13,7 +13,7 @@ struct TSPXLargeProblem : ProblemBase { const float* d_dist; - const float* h_dist; // host 端距离矩阵(用于 init_relation_matrix) + const float* h_dist; // host distance matrix (for init_relation_matrix) int n; __device__ float calc_total_distance(const Sol& sol) const { @@ -45,7 +45,7 @@ struct TSPXLargeProblem : ProblemBase { return cfg; } - // 距离矩阵太大,不放 shared memory + // Distance matrix too large for shared memory size_t shared_mem_bytes() const { return 0; } __device__ void load_shared(char*, int, int) {} @@ -53,10 +53,10 @@ struct TSPXLargeProblem : ProblemBase { return (size_t)n * n * sizeof(float); } - // 用距离矩阵初始化 G/O 先验:距离近 → 分数高 + // Initialize G/O priors from distances: closer → higher score void init_relation_matrix(float* G, float* O, int N) const { if (!h_dist || N != n) return; - // 找最大距离用于归一化 + // Max distance for normalization float max_d = 0.0f; for (int i = 0; i < N; i++) for (int j = 0; j < N; j++) @@ -66,10 +66,10 @@ struct TSPXLargeProblem : ProblemBase { for (int i = 0; i < N; i++) { for (int j = 0; j < N; j++) { if (i == j) continue; - // 距离近 → G 高(分组倾向强) + // 
Closer → higher G (stronger grouping signal) float proximity = 1.0f - h_dist[i * N + j] / max_d; - G[i * N + j] = proximity * 0.3f; // 初始信号不要太强,留空间给 EMA - // 距离近 → O 也给一点信号(对称的,不偏向任何方向) + G[i * N + j] = proximity * 0.3f; // keep initial signal moderate for EMA headroom + // Closer → small O signal too (symmetric, no directional bias) O[i * N + j] = proximity * 0.1f; } } @@ -84,7 +84,7 @@ struct TSPXLargeProblem : ProblemBase { static TSPXLargeProblem create(const float* h_dist_ptr, int n) { TSPXLargeProblem prob; prob.n = n; - prob.h_dist = h_dist_ptr; // 保留 host 指针 + prob.h_dist = h_dist_ptr; // keep host pointer float* dd; CUDA_CHECK(cudaMalloc(&dd, sizeof(float) * n * n)); CUDA_CHECK(cudaMemcpy(dd, h_dist_ptr, sizeof(float) * n * n, cudaMemcpyHostToDevice)); diff --git a/prototype/problems/vrp.cuh b/prototype/problems/vrp.cuh index 81c05d5..e7a0626 100644 --- a/prototype/problems/vrp.cuh +++ b/prototype/problems/vrp.cuh @@ -1,8 +1,8 @@ /** - * vrp.cuh - 容量约束车辆路径问题 (CVRP) - * - * 继承 ProblemBase,使用 ObjDef 目标注册机制 - * 多行编码(D1=K 条路线,分区初始化 + 跨行算子) + * vrp.cuh - Capacitated Vehicle Routing Problem (CVRP) + * + * Extends ProblemBase with ObjDef objective registration. + * Multi-row encoding (D1 = K routes, partition init + cross-row operators). 
*/ #pragma once @@ -12,11 +12,11 @@ #include "gpu_cache.cuh" struct VRPProblem : ProblemBase { - // GPU 数据 + // GPU data const float* d_dist; const float* d_demand; - const float* h_dist; // host 端距离矩阵(含 depot,用于 init_relation_matrix) - const float* h_demand; // host 端需求数组(用于 clone_to_device) + const float* h_dist; // host distance matrix (includes depot; for init_relation_matrix) + const float* h_demand; // host demand array (for clone_to_device) int n; int stride; float capacity; @@ -24,7 +24,7 @@ struct VRPProblem : ProblemBase { int max_vehicles; GpuCache cache; - // ---- 目标计算 ---- + // ---- objective evaluation ---- __device__ float compute_route_dist(const int* route, int size) const { if (size == 0) return 0.0f; float dist = 0.0f; @@ -61,7 +61,7 @@ struct VRPProblem : ProblemBase { return total; } - // ---- 目标定义(OBJ_DEFS 与 compute_obj 必须一一对应)---- + // ---- objective defs (OBJ_DEFS must match compute_obj one-to-one) ---- static constexpr ObjDef OBJ_DEFS[] = { {ObjDir::Minimize, 1.0f, 0.0f}, // case 0: calc_total_distance }; @@ -102,7 +102,7 @@ struct VRPProblem : ProblemBase { return cfg; } - // ---- shared memory 接口 ---- + // ---- shared memory interface ---- static constexpr size_t SMEM_LIMIT = 48 * 1024; size_t shared_mem_bytes() const { @@ -129,14 +129,14 @@ struct VRPProblem : ProblemBase { void enable_cache(int cap = 65536) { cache = GpuCache::allocate(cap); } void print_cache_stats() const { cache.print_stats(); } - // 距离先验:客户间距离近 → G/O 分数高 - // 注意:h_dist 含 depot(stride×stride),元素编号 0..n-1 对应 node 1..n + // Distance prior: closer customers → higher G/O scores + // Note: h_dist includes depot (stride×stride); indices 0..n-1 map to nodes 1..n void init_relation_matrix(float* G, float* O, int N) const { if (!h_dist || N != n) return; float max_d = 0.0f; for (int i = 0; i < N; i++) for (int j = 0; j < N; j++) { - float d = h_dist[(i + 1) * stride + (j + 1)]; // 跳过 depot + float d = h_dist[(i + 1) * stride + (j + 1)]; // skip depot if (d > max_d) max_d = d; 
} if (max_d <= 0.0f) return; @@ -161,7 +161,7 @@ struct VRPProblem : ProblemBase { prob.max_vehicles = max_vehicles; prob.cache = GpuCache::disabled(); prob.h_dist = h_dist_ptr; - prob.h_demand = h_demand_ptr; // 保存 host 端指针 + prob.h_demand = h_demand_ptr; // keep host pointer int n_nodes = n + 1; float* dd; @@ -185,13 +185,13 @@ struct VRPProblem : ProblemBase { cache.destroy(); } - // v5.0: 多 GPU 协同 — 克隆到指定 GPU + // v5.0: multi-GPU — clone onto a given device VRPProblem* clone_to_device(int gpu_id) const override { int orig_device; CUDA_CHECK(cudaGetDevice(&orig_device)); CUDA_CHECK(cudaSetDevice(gpu_id)); - // 从 host 端数据直接拷贝到目标 GPU(避免跨设备 D2H 拷贝) + // Copy from host straight to target GPU (avoid cross-device D2H staging) int n_nodes = n + 1; float* dd; CUDA_CHECK(cudaMalloc(&dd, sizeof(float) * n_nodes * n_nodes)); diff --git a/prototype/problems/vrptw.cuh b/prototype/problems/vrptw.cuh index 484d20f..7fc2e45 100644 --- a/prototype/problems/vrptw.cuh +++ b/prototype/problems/vrptw.cuh @@ -1,12 +1,12 @@ /** - * vrptw.cuh - 带时间窗的车辆路径问题 (VRPTW) - * - * 在 CVRP 基础上增加时间窗约束。 - * 编码:Perm 多行分区(同 CVRP),data[r][j] = 路线 r 的第 j 个客户。 - * 目标:Minimize 总距离。 - * 约束:(a) 容量约束, (b) 时间窗约束(到达时间必须 ≤ latest,早到需等待)。 - * - * 验证实例:8 客户 3 车, 手工设计坐标+时间窗, 确保有已知可行解。 + * vrptw.cuh - Vehicle Routing Problem with Time Windows (VRPTW) + * + * CVRP plus time window constraints. + * Encoding: multi-row perm partition (same as CVRP); data[r][j] = j-th customer on route r. + * Objective: minimize total distance. + * Constraints: (a) capacity, (b) time windows (arrival ≤ latest; early arrival waits). + * + * Validation instance: 8 customers, 3 vehicles; hand-crafted coords + windows with known feasible solution. 
*/ #pragma once @@ -14,12 +14,12 @@ #include "cuda_utils.cuh" struct VRPTWProblem : ProblemBase { - const float* d_dist; // 距离矩阵 [(n+1)*(n+1)](含 depot) - const float* d_demand; // 需求 [n] - const float* d_earliest; // 最早服务时间 [n+1](含 depot) - const float* d_latest; // 最晚服务时间 [n+1](含 depot) - const float* d_service; // 服务耗时 [n+1](含 depot) - int n; // 客户数(不含 depot) + const float* d_dist; // distance matrix [(n+1)*(n+1)] (includes depot) + const float* d_demand; // demand [n] + const float* d_earliest; // earliest service time [n+1] (includes depot) + const float* d_latest; // latest service time [n+1] (includes depot) + const float* d_service; // service time [n+1] (includes depot) + int n; // number of customers (excludes depot) int stride; // n+1 float capacity; int num_vehicles; @@ -63,30 +63,30 @@ struct VRPTWProblem : ProblemBase { if (size == 0) continue; active++; - // 容量约束 + // Capacity constraint float load = 0.0f; for (int j = 0; j < size; j++) load += d_demand[sol.data[r][j]]; if (load > capacity) penalty += (load - capacity) * 100.0f; - // 时间窗约束:模拟路线行驶 + // Time windows: simulate route travel float time = 0.0f; int prev = 0; for (int j = 0; j < size; j++) { int node = sol.data[r][j] + 1; float travel = d_dist[prev * stride + node]; time += travel; - // 早到需等待 + // Wait if early if (time < d_earliest[node]) time = d_earliest[node]; - // 迟到产生惩罚 + // Penalize lateness if (time > d_latest[node]) penalty += (time - d_latest[node]) * 50.0f; time += d_service[node]; prev = node; } - // 返回 depot 的时间窗 + // Time window returning to depot float return_time = time + d_dist[prev * stride + 0]; if (return_time > d_latest[0]) penalty += (return_time - d_latest[0]) * 50.0f;