fix: harden CUDA safety checks and translate comments to English

Safety fixes (4 critical, 4 warning) from code review:

- qap.cuh: fix clone_to_device cross-device D2H by retaining host matrices
- types.cuh: add CUDA_CHECK to InjectBuffer, track owner_gpu for safe destroy
- types.cuh: add bounds check on lexicographic priority index
- solver.cuh: cap migrate_kernel islands to MAX_ISLANDS=64 to prevent stack overflow
- multi_gpu_solver.cuh: guard against 0 GPUs, propagate stop_reason from best GPU
- types.cuh: warn on SeqRegistry overflow
- solver.cuh: warn when constraint_directed/phased_search disabled without AOS

Translate all Chinese comments to English across 25+ source files
(core/*.cuh, problems/*.cuh, Makefile, multi-GPU tests).

Verified on V100S×2 (sm_70, CUDA 12.8): e5 (12 problem types, all optimal),
e13 (multi-objective + multi-GPU, 9 configs, all passed).
This commit is contained in:
L-yang-yang 2026-03-25 11:52:50 +08:00
parent ab278d0e82
commit a848730459
25 changed files with 1147 additions and 1167 deletions

View file

@ -6,7 +6,7 @@
[![CUDA](https://img.shields.io/badge/CUDA-11.0%2B-green.svg)](https://developer.nvidia.com/cuda-toolkit)
[![Python](https://img.shields.io/badge/Python-3.8%2B-blue.svg)](https://www.python.org/)
**Paper**: [cuGenOpt: A GPU-Accelerated General-Purpose Metaheuristic Framework for Combinatorial Optimization](http://arxiv.org/abs/2603.19163)
**Paper**: [cuGenOpt: A GPU-Accelerated General-Purpose Metaheuristic Framework for Combinatorial Optimization](https://arxiv.org/abs/2603.19163)
---
@ -114,28 +114,7 @@ Define your own problem by inheriting `ProblemBase` and implementing `compute_ob
└─────────────────────────────────────────────────────────┘
```
---
## Project Structure
```
generic_solver/
├── prototype/ # Core framework (header-only .cuh files)
│ ├── core/ # Solver, operators, population, types
│ └── problems/ # 12+ problem implementations
├── python/ # Python wrapper (pip install cugenopt)
│ ├── cugenopt/ # Python package (built-ins + JIT compiler)
│ └── tests/ # Test suite
├── benchmark/ # Experiments and benchmarks
│ ├── experiments/ # E0-E13: 14 experiment groups
│ ├── data/ # Standard instances (TSPLIB, Solomon, QAPLIB)
│ └── results/ # Experimental reports
├── paper_v3_en/ # Paper source (LaTeX)
├── STATUS.md # Project status and roadmap
└── README.md # This file
```
---
## Performance Highlights
@ -186,8 +165,7 @@ generic_solver/
## Installation
### Python Package
coming soon
coming soon
```bash
pip install cugenopt
```
@ -207,18 +185,7 @@ cd prototype
make all
```
---
## Documentation
| Document | Description |
|----------|-------------|
| [STATUS.md](STATUS.md) | Project status, roadmap, and design decisions |
| [Python API Guide](python/README.md) | Detailed Python API documentation |
| [Benchmark Design](benchmark/DESIGN.md) | Experimental methodology |
| [Paper](paper_v3_en/) | Full technical details and evaluation |
---
## Citation

View file

@ -1,10 +1,10 @@
# GenSolver Makefile
#
# 用法:
# make e1 e2 e3 e4 e5 e6 → 编译单个实验
# make diag → 编译诊断程序
# make all → 编译全部
# make clean → 清理
# Usage:
# make e1 e2 e3 e4 e5 e6 → Build individual experiments
# make diag → Build diagnostic program
# make all → Build all
# make clean → Clean
NVCC = nvcc
ARCH ?= -arch=sm_75
@ -40,10 +40,10 @@ $(EXP_DIR)/%/gpu: $(EXP_DIR)/%/gpu.cu $(ALL_HEADERS) problems/tsplib_data.h
$(EXP_DIR)/e0_diagnosis/bench_diagnosis: $(EXP_DIR)/e0_diagnosis/bench_diagnosis.cu $(ALL_HEADERS)
$(NVCC) $(ARCH) $(CFLAGS) $(INCLUDES) -o $@ $<
test_multi_gpu: test_multi_gpu.cu $(ALL_HEADERS)
test_multi_gpu: $(EXP_DIR)/e9_multi_gpu_b3/test_multi_gpu.cu $(ALL_HEADERS)
$(NVCC) $(ARCH) $(CFLAGS) $(INCLUDES) -o $@ $<
test_multi_gpu_b3: test_multi_gpu_b3.cu $(ALL_HEADERS)
test_multi_gpu_b3: $(EXP_DIR)/e9_multi_gpu_b3/test_multi_gpu_b3.cu $(ALL_HEADERS)
$(NVCC) $(ARCH) $(CFLAGS) $(INCLUDES) -o $@ $<
clean:

View file

@ -1,8 +1,8 @@
/**
* cuda_utils.cuh - CUDA 工具集
* cuda_utils.cuh - CUDA utilities
*
* 职责:错误检查、设备信息、随机数工具
* 规则:所有 CUDA API 调用都必须用 CUDA_CHECK 包裹
* Responsibilities: error checking, device info, random number utilities
* Rule: every CUDA API call must be wrapped with CUDA_CHECK
*/
#pragma once
@ -11,7 +11,7 @@
#include <curand_kernel.h>
// ============================================================
// 错误检查
// Error checking
// ============================================================
#define CUDA_CHECK(call) do { \
@ -23,7 +23,7 @@
} \
} while(0)
// kernel launch 后检查(捕获异步错误)
// Check after kernel launch (catches async errors)
#define CUDA_CHECK_LAST() do { \
cudaError_t err = cudaGetLastError(); \
if (err != cudaSuccess) { \
@ -34,7 +34,7 @@
} while(0)
// ============================================================
// 设备信息
// Device info
// ============================================================
inline void print_device_info() {
@ -52,10 +52,10 @@ inline void print_device_info() {
}
// ============================================================
// 随机数工具 (Device 端)
// Random number utilities (device-side)
// ============================================================
// 初始化 curand 状态,每个线程一个
// Initialize curand state: one per thread
__global__ void init_curand_kernel(curandState* states, unsigned long long seed, int n) {
int tid = blockIdx.x * blockDim.x + threadIdx.x;
if (tid < n) {
@ -63,12 +63,12 @@ __global__ void init_curand_kernel(curandState* states, unsigned long long seed,
}
}
// Device-side: random integer in [0, bound)
// NOTE(review): plain modulo introduces slight bias when bound does not divide
// 2^32, and bound must be > 0 — confirm acceptable for existing callers.
__device__ inline int rand_int(curandState* state, int bound) {
    unsigned int raw = curand(state);
    return (int)(raw % (unsigned int)bound);
}
// DeviceFisher-Yates shuffle对 arr[0..n-1] 做随机排列
// Device-side: Fisher-Yates shuffle of arr[0..n-1]
__device__ inline void shuffle(int* arr, int n, curandState* state) {
for (int i = n - 1; i > 0; i--) {
int j = rand_int(state, i + 1);
@ -79,12 +79,12 @@ __device__ inline void shuffle(int* arr, int n, curandState* state) {
}
// ============================================================
// Kernel 启动参数计算
// Kernel launch grid sizing
// ============================================================
inline int div_ceil(int a, int b) { return (a + b - 1) / b; }
// Compute the number of thread blocks needed to cover n work items
// with the given block size (ceiling division).
inline int calc_grid_size(int n, int block_size = 256) {
    return (n + block_size - 1) / block_size;
}

View file

@ -1,20 +1,20 @@
/**
* gpu_cache.cuh - GPU 全局内存哈希表(通用缓存组件)
* gpu_cache.cuh - GPU global-memory hash table (generic cache component)
*
* 设计:
* - 开放寻址固定容量power of 2线性探测
* - key = uint64_t(由 Problem 自行计算 hash
* - value = float(单个指标值)
* - 无锁:允许 race condition缓存语义偶尔脏读可接受
* - 自带命中/未命中原子计数器
* Design:
* - Open addressing, fixed capacity (power of 2), linear probing
* - key = uint64_t (hash computed by Problem)
* - value = float (single metric value)
* - Lock-free: race conditions allowed (cache semantics; occasional dirty reads OK)
* - Built-in atomic hit/miss counters
*
* 用法:
* Usage:
* GpuCache cache = GpuCache::allocate(65536); // host
* // ... pass cache as Problem member to kernels ...
* cache.print_stats(); // host
* cache.destroy(); // host
*
* 参考scute 项目 LRUCachekey = metric_type + content_hash
* Reference: scute project LRUCache (key = metric_type + content_hash)
*/
#pragma once
@ -22,25 +22,25 @@
#include <cstdint>
// ============================================================
// 常量
// Constants
// ============================================================
static constexpr uint64_t CACHE_EMPTY_KEY = 0xFFFFFFFFFFFFFFFFULL;
static constexpr int CACHE_MAX_PROBE = 8; // 最大线性探测步数
static constexpr int CACHE_MAX_PROBE = 8; // Max linear probing steps
// ============================================================
// GpuCache 结构体POD可安全拷贝到 kernel
// GpuCache struct (POD, safe to copy to kernel)
// ============================================================
struct GpuCache {
uint64_t* keys; // GPU 全局内存
float* values; // GPU 全局内存
unsigned int* d_hits; // 原子计数器GPU
unsigned int* d_misses; // 原子计数器GPU
int capacity; // 必须是 2 的幂
uint64_t* keys; // GPU global memory
float* values; // GPU global memory
unsigned int* d_hits; // Atomic counters (GPU)
unsigned int* d_misses; // Atomic counters (GPU)
int capacity; // Must be a power of 2
int mask; // = capacity - 1
// ---- Host 操作 ----
// ---- Host operations ----
static GpuCache allocate(int cap = 65536) {
GpuCache c;
@ -94,20 +94,20 @@ struct GpuCache {
};
// ============================================================
// Device 函数:哈希 / 查找 / 插入
// Device functions: hash / lookup / insert
// ============================================================
/// FNV-1a hash over an ordered int sequence (e.g. customer IDs on a route).
/// The result is guaranteed to differ from the empty-slot sentinel key.
__device__ inline uint64_t route_hash(const int* data, int len) {
    constexpr uint64_t kOffsetBasis = 14695981039346656037ULL;  // FNV offset basis
    constexpr uint64_t kPrime = 1099511628211ULL;               // FNV prime
    uint64_t h = kOffsetBasis;
    for (int pos = 0; pos < len; pos++) {
        h ^= (uint64_t)(unsigned int)data[pos];
        h *= kPrime;
    }
    // Remap the (astronomically unlikely) collision with the sentinel value
    return (h != CACHE_EMPTY_KEY) ? h : h - 1;
}
/// 查找:命中返回 true + 写入 out
/// Lookup: on hit returns true and writes out
__device__ inline bool cache_lookup(const GpuCache& c, uint64_t key, float& out) {
int slot = (int)(key & (uint64_t)c.mask);
for (int p = 0; p < CACHE_MAX_PROBE; p++) {
@ -117,12 +117,12 @@ __device__ inline bool cache_lookup(const GpuCache& c, uint64_t key, float& out)
out = c.values[idx];
return true;
}
if (k == CACHE_EMPTY_KEY) return false; // 空槽 → 一定不存在
if (k == CACHE_EMPTY_KEY) return false; // Empty slot -> key not present
}
return false; // 探测用尽
return false; // Probing exhausted
}
/// 插入:写入 key-value同 key 覆盖,探测满则驱逐首槽
/// Insert: write key-value; same key overwrites; if probe full, evict first slot
__device__ inline void cache_insert(const GpuCache& c, uint64_t key, float value) {
int slot = (int)(key & (uint64_t)c.mask);
for (int p = 0; p < CACHE_MAX_PROBE; p++) {
@ -134,7 +134,7 @@ __device__ inline void cache_insert(const GpuCache& c, uint64_t key, float value
return;
}
}
// 探测满:驱逐首槽
// Probe full: evict first slot
int idx = slot & c.mask;
c.keys[idx] = key;
c.values[idx] = value;

View file

@ -6,7 +6,7 @@
namespace heuristic_init {
// 单行排列:所有行填相同排列
// Single-row layout: same permutation in every row
template<typename Sol>
static void build_sorted_permutation(Sol& sol, const std::vector<int>& order,
int dim1, int dim2) {
@ -19,7 +19,7 @@ static void build_sorted_permutation(Sol& sol, const std::vector<int>& order,
for (int i = 0; i < MAX_OBJ; i++) sol.objectives[i] = 0.0f;
}
// Partition 模式:排列均匀切分到 dim1 行,元素不重复
// Partition mode: split permutation evenly across dim1 rows, no duplicate elements
template<typename Sol>
static void build_partition_from_order(Sol& sol, const std::vector<int>& order,
int dim1, int total_elements) {
@ -66,8 +66,8 @@ std::vector<Sol> build_from_matrices(const HeuristicMatrix* matrices, int num_ma
col_sum[j] += mat[i * N + j];
}
// 对于 Partition (VRPTW),距离矩阵含 depot (index 0)
// 排序只针对客户 (index 1..N-1),输出值为 0-based 客户编号
// For Partition (VRPTW), the distance matrix includes depot (index 0);
// sorting is only over customers (indices 1..N-1); output values are 0-based customer ids
std::vector<int> idx;
if (partition_mode && N > elem_count) {
for (int i = 1; i <= elem_count; i++) idx.push_back(i);

View file

@ -1,15 +1,15 @@
/**
* init_selection.cuh - 初始解采样择优 + NSGA-II 选择
* init_selection.cuh - Initial-solution sampling and NSGA-II selection
*
* Host 端逻辑,在 solver 初始化阶段调用一次。
* 从 K × pop_size 个候选解中选出 pop_size 个作为初始种群。
* Host-side logic; called once during solver initialization.
* Selects pop_size individuals from K × pop_size candidates as the initial population.
*
* 选择策略:
* 1. 核心目标预留名额(按 importance 分配)
* 2. NSGA-II 选择(非支配排序 + 加权拥挤度)
* 3. 纯随机保底(多样性)
* Selection strategy:
* 1. Reserve slots for core objectives (by importance)
* 2. NSGA-II selection (non-dominated sort + weighted crowding)
* 3. Pure random fallback (diversity)
*
* 单目标时自动退化为 top-N 排序,无需分支。
* Single-objective case automatically reduces to top-N sorting; no extra branching.
*/
#pragma once
@ -22,36 +22,36 @@
namespace init_sel {
// ============================================================
// 候选解的目标信息(从 GPU 下载后在 host 端使用)
// Per-candidate objective info (used on host after download from GPU)
// ============================================================
struct CandidateInfo {
int idx; // 在候选数组中的原始索引
float objs[MAX_OBJ]; // 归一化后的目标值(越小越好)
int idx; // Original index in the candidate array
float objs[MAX_OBJ]; // Normalized objectives (lower is better)
float penalty;
int rank; // 非支配排序层级0 = Pareto 前沿)
float crowding; // 拥挤度距离
bool selected; // 是否已被选中
int rank; // Non-dominated sort front (0 = Pareto front)
float crowding; // Crowding distance
bool selected; // Whether already selected
};
// ============================================================
// 非支配排序Fast Non-dominated Sort
// Non-dominated sort (Fast Non-dominated Sort)
// ============================================================
// 复杂度O(M × N²)M = 目标数N = 候选数
// 对初始化场景N ≤ 几千M ≤ 4完全可接受
// Complexity: O(M × N²), M = number of objectives, N = number of candidates
// Acceptable for initialization (N up to a few thousand, M ≤ 4)
inline void fast_nondominated_sort(std::vector<CandidateInfo>& cands,
int num_obj,
std::vector<std::vector<int>>& fronts) {
int n = (int)cands.size();
std::vector<int> dom_count(n, 0); // 被多少个解支配
std::vector<std::vector<int>> dom_set(n); // 支配了哪些解
std::vector<int> dom_count(n, 0); // How many solutions dominate this one
std::vector<std::vector<int>> dom_set(n); // Which solutions this one dominates
// 判断 a 是否支配 ba 在所有目标上 ≤ b且至少一个 <
// 先处理 penalty可行解支配不可行解
// Whether a dominates b: a ≤ b on all objectives, and strictly < on at least one
// Handle penalty first: feasible dominates infeasible
auto dominates = [&](int a, int b) -> bool {
const auto& ca = cands[a];
const auto& cb = cands[b];
// penalty 处理
// Penalty handling
if (ca.penalty <= 0.0f && cb.penalty > 0.0f) return true;
if (ca.penalty > 0.0f && cb.penalty <= 0.0f) return false;
if (ca.penalty > 0.0f && cb.penalty > 0.0f) return ca.penalty < cb.penalty;
@ -65,7 +65,7 @@ inline void fast_nondominated_sort(std::vector<CandidateInfo>& cands,
return all_leq && any_lt;
};
// 计算支配关系
// Compute dominance relations
for (int i = 0; i < n; i++) {
for (int j = i + 1; j < n; j++) {
if (dominates(i, j)) {
@ -78,7 +78,7 @@ inline void fast_nondominated_sort(std::vector<CandidateInfo>& cands,
}
}
// 提取各层前沿
// Extract each front layer
fronts.clear();
std::vector<int> current_front;
for (int i = 0; i < n; i++) {
@ -107,9 +107,9 @@ inline void fast_nondominated_sort(std::vector<CandidateInfo>& cands,
}
// ============================================================
// 加权拥挤度距离
// Weighted crowding distance
// ============================================================
// 标准拥挤度 + importance 加权:核心目标维度上的间距贡献更大
// Standard crowding + importance weighting: larger gap contribution on core objectives
inline void weighted_crowding_distance(std::vector<CandidateInfo>& cands,
const std::vector<int>& front,
@ -117,7 +117,7 @@ inline void weighted_crowding_distance(std::vector<CandidateInfo>& cands,
const float* importance) {
int n = (int)front.size();
if (n <= 2) {
for (int i : front) cands[i].crowding = 1e18f; // 边界解无穷大
for (int i : front) cands[i].crowding = 1e18f; // Boundary solutions: infinite
return;
}
@ -126,18 +126,18 @@ inline void weighted_crowding_distance(std::vector<CandidateInfo>& cands,
std::vector<int> sorted_idx(front.begin(), front.end());
for (int m = 0; m < num_obj; m++) {
// 按目标 m 排序
// Sort by objective m
std::sort(sorted_idx.begin(), sorted_idx.end(),
[&](int a, int b) { return cands[a].objs[m] < cands[b].objs[m]; });
float range = cands[sorted_idx[n-1]].objs[m] - cands[sorted_idx[0]].objs[m];
if (range < 1e-12f) continue; // 该目标无区分度
if (range < 1e-12f) continue; // No spread on this objective
// 边界解设为无穷大
// Boundary solutions: infinite crowding
cands[sorted_idx[0]].crowding += 1e18f;
cands[sorted_idx[n-1]].crowding += 1e18f;
// 中间解:相邻间距 × importance 权重
// Interior: neighbor gap × importance weight
float w = importance[m];
for (int i = 1; i < n - 1; i++) {
float gap = cands[sorted_idx[i+1]].objs[m] - cands[sorted_idx[i-1]].objs[m];
@ -147,29 +147,29 @@ inline void weighted_crowding_distance(std::vector<CandidateInfo>& cands,
}
// ============================================================
// 主选择函数:从 N 个候选中选出 target 个
// Main selection: pick target candidates from N
// ============================================================
// 返回被选中的候选索引
// Returns indices of selected candidates
inline std::vector<int> nsga2_select(std::vector<CandidateInfo>& cands,
int num_obj,
const float* importance,
int target,
int num_reserved_random) {
// --- 1. 核心目标预留名额 ---
// --- 1. Reserve slots for core objectives ---
int num_reserve_total = target - num_reserved_random;
// 预留比例importance[i] × 30% 的名额(剩余 70% 给 NSGA-II
// Reserve ratio: importance[i] × 30% of slots (remaining 70% for NSGA-II)
float reserve_ratio = 0.3f;
std::vector<int> selected;
selected.reserve(target);
// 对每个目标,按该目标排序取 top
// For each objective, sort by that objective and take top
for (int m = 0; m < num_obj; m++) {
int quota = (int)(num_reserve_total * importance[m] * reserve_ratio);
if (quota < 1 && num_obj > 1) quota = 1; // 每个目标至少 1 个
if (quota < 1 && num_obj > 1) quota = 1; // At least one per objective
// 按目标 m 排序(越小越好)
// Sort by objective m (lower is better)
std::vector<int> by_obj(cands.size());
for (int i = 0; i < (int)cands.size(); i++) by_obj[i] = i;
std::sort(by_obj.begin(), by_obj.end(),
@ -186,32 +186,32 @@ inline std::vector<int> nsga2_select(std::vector<CandidateInfo>& cands,
}
}
// --- 2. NSGA-II 选择填充剩余名额 ---
// --- 2. NSGA-II fills remaining slots ---
int remaining = target - num_reserved_random - (int)selected.size();
if (remaining > 0) {
// 非支配排序
// Non-dominated sort
std::vector<std::vector<int>> fronts;
fast_nondominated_sort(cands, num_obj, fronts);
for (auto& front : fronts) {
if (remaining <= 0) break;
// 过滤已选中的
// Filter out already selected
std::vector<int> available;
for (int i : front) {
if (!cands[i].selected) available.push_back(i);
}
if ((int)available.size() <= remaining) {
// 整层都选
// Take the whole front
for (int i : available) {
cands[i].selected = true;
selected.push_back(i);
remaining--;
}
} else {
// 该层需要截断:按加权拥挤度选
// Truncate this front: pick by weighted crowding
weighted_crowding_distance(cands, available, num_obj, importance);
std::sort(available.begin(), available.end(),
[&](int a, int b) { return cands[a].crowding > cands[b].crowding; });
@ -228,14 +228,14 @@ inline std::vector<int> nsga2_select(std::vector<CandidateInfo>& cands,
}
// ============================================================
// 单目标快速路径:直接按标量排序取 top
// Single-objective fast path: scalar sort and take top
// ============================================================
inline std::vector<int> top_n_select(std::vector<CandidateInfo>& cands,
int target,
int num_reserved_random) {
int to_select = target - num_reserved_random;
// 按 penalty 优先,然后按 objs[0](已归一化为越小越好)
// Prefer lower penalty, then objs[0] (normalized, lower is better)
std::vector<int> indices(cands.size());
for (int i = 0; i < (int)cands.size(); i++) indices[i] = i;
std::sort(indices.begin(), indices.end(), [&](int a, int b) {

View file

@ -1,12 +1,12 @@
/**
* multi_gpu_solver.cuh - 多 GPU 协同求解
* multi_gpu_solver.cuh - Multi-GPU cooperative solving
*
* v5.0 方案 B3: 被动注入 + GPU 无感知
* - 每块 GPU 独立运行 solve(),各自用不同 seed
* - 每个 GPU 有一个 InjectBuffer设备端
* - CPU 协调线程定期(每 N 秒)收集各 GPU 的 best异步写入其他 GPU 的 InjectBuffer
* - GPU 在 migrate_kernel 后检查 InjectBuffer如果有新解则注入
* - 完全解耦GPU 无需暂停CPU 异步写入,通过 CUDA Stream 同步保证安全
* v5.0 plan B3: passive injection + GPU-agnostic design
* - Each GPU runs solve() independently with its own seed
* - Each GPU has an InjectBuffer (device memory)
* - A CPU coordinator thread periodically (every N seconds) collects each GPU's best and asynchronously writes to other GPUs' InjectBuffers
* - After migrate_kernel, each GPU checks InjectBuffer and injects if a new solution is present
* - Fully decoupled: GPUs need not pause; CPU writes asynchronously; CUDA stream sync ensures safety
*/
#pragma once
@ -18,25 +18,26 @@
#include <chrono>
// ============================================================
// MultiGpuContext — 每个 GPU 的上下文
// MultiGpuContext — per-GPU context
// ============================================================
template<typename Problem>
struct MultiGpuContext {
using Sol = typename Problem::Sol;
int gpu_id; // GPU 设备 ID
Problem* problem; // Problem 实例(设备指针指向该 GPU
SolverConfig config; // 求解器配置(独立 seed
int gpu_id; // GPU device ID
Problem* problem; // Problem instance (device pointer for this GPU)
SolverConfig config; // Solver config (independent seed)
Sol best_solution; // 当前最优解host 端)
std::mutex best_mutex; // 保护 best_solution 的互斥锁
Sol best_solution; // Current best solution (host)
SolveResult<Sol> solve_result; // Full result from solve()
std::mutex best_mutex; // Mutex protecting best_solution
InjectBuffer<Sol>* d_inject_buf; // Device 端注入缓冲区(在该 GPU 上分配)
Sol* d_global_best; // Device 端全局最优解指针(由 solve() 导出)
InjectBuffer<Sol>* d_inject_buf; // Device-side inject buffer (allocated on this GPU)
Sol* d_global_best; // Device pointer to global best (exported by solve())
std::atomic<bool> stop_flag; // 停止标志
std::atomic<bool> running; // 运行状态标志(用于协调线程判断)
std::atomic<bool> stop_flag; // Stop flag
std::atomic<bool> running; // Running flag (for coordinator thread)
MultiGpuContext(int id) : gpu_id(id), problem(nullptr), d_inject_buf(nullptr),
d_global_best(nullptr), stop_flag(false), running(false) {
@ -47,45 +48,46 @@ struct MultiGpuContext {
};
// ============================================================
// GPU Worker 线程函数(方案 B3
// GPU worker thread (plan B3)
// ============================================================
template<typename Problem>
void gpu_worker(MultiGpuContext<Problem>* ctx) {
    using Sol = typename Problem::Sol;

    // Bind this host thread to its assigned GPU before any CUDA call.
    CUDA_CHECK(cudaSetDevice(ctx->gpu_id));

    // Mark as running so the coordinator thread starts polling this GPU.
    ctx->running.store(true);

    // Run the full solve on this GPU; pass the inject buffer (so solutions
    // migrated from other GPUs can be picked up) and receive the exported
    // device pointer to this GPU's running global best in d_global_best.
    SolveResult<Sol> result = solve(*ctx->problem, ctx->config,
        nullptr, 0, nullptr, ctx->d_inject_buf, &ctx->d_global_best);

    // Clear the running flag first so the coordinator stops reading
    // d_global_best and stops injecting into this GPU.
    ctx->running.store(false);

    // Publish the best solution and the full result under the mutex
    // (the coordinator may read best_solution concurrently).
    {
        std::lock_guard<std::mutex> lock(ctx->best_mutex);
        ctx->best_solution = result.best_solution;
        ctx->solve_result = result;
    }

    // Signal completion to the main thread.
    ctx->stop_flag.store(true);
}
// ============================================================
// 协调线程函数(方案 B3
// Coordinator thread (plan B3)
// ============================================================
// 定期从各 GPU 的 d_global_best 读取当前 best计算 global_best注入到其他 GPU
// Periodically read each GPU's current best from d_global_best, compute global_best, inject to other GPUs
//
// 关键设计:
// 1. 直接从各 GPU 的 d_global_best 读取(由 solve() 导出)
// 2. 要求启用 SA否则无 d_global_best
// 3. 轻量侵入solve() 只需导出一个指针,对单 GPU 无影响
// Key design:
// 1. Read directly from each GPU's d_global_best (exported by solve())
// 2. Requires SA enabled (otherwise no d_global_best)
// 3. Light touch: solve() only exports a pointer; single-GPU path unchanged
template<typename Problem>
void coordinator_thread(std::vector<MultiGpuContext<Problem>*>& contexts,
@ -96,7 +98,7 @@ void coordinator_thread(std::vector<MultiGpuContext<Problem>*>& contexts,
auto interval_ms = std::chrono::milliseconds(static_cast<int>(interval_sec * 1000));
int round = 0;
// 等待所有 GPU 的 d_global_best 就绪
// Wait until all GPUs' d_global_best are ready
bool all_ready = false;
while (!all_ready) {
std::this_thread::sleep_for(std::chrono::milliseconds(100));
@ -110,10 +112,10 @@ void coordinator_thread(std::vector<MultiGpuContext<Problem>*>& contexts,
}
while (true) {
// 等待指定时间间隔
// Wait for the configured interval
std::this_thread::sleep_for(interval_ms);
// 检查是否所有 GPU 都已停止
// Check whether all GPUs have stopped
bool all_stopped = true;
for (auto* ctx : contexts) {
if (ctx->running.load()) {
@ -125,17 +127,17 @@ void coordinator_thread(std::vector<MultiGpuContext<Problem>*>& contexts,
round++;
// 收集各 GPU 的当前最优解(从 d_global_best 读取)
// Collect each GPU's current best (from d_global_best)
Sol global_best;
global_best.penalty = 1e30f;
global_best.objectives[0] = 1e30f;
int best_gpu = -1;
for (int i = 0; i < (int)contexts.size(); i++) {
if (!contexts[i]->running.load()) continue; // 已停止的 GPU 跳过
if (contexts[i]->d_global_best == nullptr) continue; // 未就绪跳过
if (!contexts[i]->running.load()) continue; // skip stopped GPUs
if (contexts[i]->d_global_best == nullptr) continue; // skip not ready
// 从该 GPU 的 d_global_best 读取
// Read from this GPU's d_global_best
Sol gpu_best;
cudaSetDevice(contexts[i]->gpu_id);
cudaMemcpy(&gpu_best, contexts[i]->d_global_best, sizeof(Sol), cudaMemcpyDeviceToHost);
@ -146,23 +148,23 @@ void coordinator_thread(std::vector<MultiGpuContext<Problem>*>& contexts,
}
}
if (best_gpu == -1) continue; // 所有 GPU 都已停止或未就绪
if (best_gpu == -1) continue; // all GPUs stopped or not ready
if (verbose) {
printf(" [Coordinator Round %d] Global best from GPU %d: obj=%.2f, penalty=%.2f\n",
round, best_gpu, global_best.objectives[0], global_best.penalty);
}
// 将 global_best 注入到其他 GPU除了 best_gpu 自己)
// Inject global_best into other GPUs (except best_gpu)
for (int i = 0; i < (int)contexts.size(); i++) {
if (i == best_gpu) continue; // 不注入到自己
if (!contexts[i]->running.load()) continue; // 已停止的 GPU 不注入
if (i == best_gpu) continue; // do not inject to self
if (!contexts[i]->running.load()) continue; // do not inject to stopped GPUs
// 读取 InjectBuffer 结构(从 device 到 host
// Read InjectBuffer struct (device to host)
InjectBuffer<Sol> buf;
cudaMemcpy(&buf, contexts[i]->d_inject_buf, sizeof(InjectBuffer<Sol>), cudaMemcpyDeviceToHost);
// 同步写入(会自动切换设备)
// Synchronous write (switches device as needed)
buf.write_sync(global_best, contexts[i]->gpu_id);
}
}
@ -173,7 +175,7 @@ void coordinator_thread(std::vector<MultiGpuContext<Problem>*>& contexts,
}
// ============================================================
// 多 GPU 协同求解主函数(方案 B3
// Multi-GPU cooperative solve entry (plan B3)
// ============================================================
template<typename Problem>
@ -181,13 +183,17 @@ SolveResult<typename Problem::Sol> solve_multi_gpu(Problem& prob, const SolverCo
using Sol = typename Problem::Sol;
if (cfg.num_gpus <= 1) {
// 单 GPU 模式,直接调用普通 solve
// Single-GPU mode: call plain solve
return solve(prob, cfg);
}
// 检查可用 GPU 数量
int device_count;
// Check available GPU count
int device_count = 0;
CUDA_CHECK(cudaGetDeviceCount(&device_count));
if (device_count <= 0) {
fprintf(stderr, "Error: No CUDA devices available\n");
return SolveResult<Sol>{};
}
int actual_gpus = std::min(cfg.num_gpus, device_count);
if (cfg.verbose) {
@ -199,15 +205,15 @@ SolveResult<typename Problem::Sol> solve_multi_gpu(Problem& prob, const SolverCo
cfg.multi_gpu_inject_mode == MultiGpuInjectMode::HalfIslands ? "HalfIslands" : "AllIslands");
}
// 创建各 GPU 的上下文
// Create per-GPU contexts
std::vector<MultiGpuContext<Problem>*> contexts;
for (int i = 0; i < actual_gpus; i++) {
auto* ctx = new MultiGpuContext<Problem>(i);
ctx->config = cfg;
ctx->config.seed = cfg.seed + i * 1000; // 每个 GPU 用不同 seed
ctx->config.num_gpus = 1; // 单 GPU 模式运行
ctx->config.seed = cfg.seed + i * 1000; // distinct seed per GPU
ctx->config.num_gpus = 1; // run as single-GPU per device
// 克隆 Problem 到该 GPU
// Clone Problem onto this GPU
ctx->problem = prob.clone_to_device(i);
if (ctx->problem == nullptr) {
fprintf(stderr, "Error: Failed to clone problem to GPU %d\n", i);
@ -218,10 +224,10 @@ SolveResult<typename Problem::Sol> solve_multi_gpu(Problem& prob, const SolverCo
return SolveResult<Sol>{};
}
// 分配 InjectBuffer在该 GPU 上)
// Allocate InjectBuffer on this GPU
InjectBuffer<Sol> buf = InjectBuffer<Sol>::allocate(i);
// 将 InjectBuffer 拷贝到 device 端(传给 kernel
// Copy InjectBuffer to device (for kernels)
InjectBuffer<Sol>* d_buf;
CUDA_CHECK(cudaSetDevice(i));
CUDA_CHECK(cudaMalloc(&d_buf, sizeof(InjectBuffer<Sol>)));
@ -231,34 +237,36 @@ SolveResult<typename Problem::Sol> solve_multi_gpu(Problem& prob, const SolverCo
contexts.push_back(ctx);
}
// 启动 worker 线程
// Start worker threads
std::vector<std::thread> workers;
for (auto* ctx : contexts) {
workers.emplace_back(gpu_worker<Problem>, ctx);
}
// 启动协调线程(定期注入 global_best
// Start coordinator thread (periodic global_best injection)
std::thread coordinator(coordinator_thread<Problem>, std::ref(contexts),
cfg.multi_gpu_interval_sec, cfg.verbose);
// 等待所有 worker 完成
// Wait for all workers to finish
for (auto& w : workers) w.join();
// 等待协调线程完成
// Wait for coordinator to finish
coordinator.join();
// 收集最终结果
// Collect final result from best GPU
Sol final_best = contexts[0]->best_solution;
int best_ctx = 0;
ObjConfig oc = prob.obj_config();
for (int i = 1; i < (int)contexts.size(); i++) {
if (is_better(contexts[i]->best_solution, final_best, oc)) {
final_best = contexts[i]->best_solution;
best_ctx = i;
}
}
// 清理
// Cleanup
for (auto* ctx : contexts) {
// 读取 InjectBuffer 的内容(用于释放)
// Read InjectBuffer content (for teardown)
InjectBuffer<Sol> buf;
CUDA_CHECK(cudaSetDevice(ctx->gpu_id));
CUDA_CHECK(cudaMemcpy(&buf, ctx->d_inject_buf, sizeof(InjectBuffer<Sol>), cudaMemcpyDeviceToHost));
@ -269,10 +277,9 @@ SolveResult<typename Problem::Sol> solve_multi_gpu(Problem& prob, const SolverCo
delete ctx;
}
// 构造返回结果
SolveResult<Sol> result;
// Build return value from best GPU's result
SolveResult<Sol> result = contexts[best_ctx]->solve_result;
result.best_solution = final_best;
result.stop_reason = StopReason::MaxGen;
return result;
}

View file

@ -1,40 +1,40 @@
/**
* operators.cuh - 四层搜索算子体系Device 端)
* operators.cuh - Four-layer search operator hierarchy (device side)
*
* v1.0: 二维通用编码的完整算子层次
* v1.0: Full operator hierarchy for 2D universal encoding
*
* 层次结构(所有算子只看 data[D1][D2] + dim2_sizes不感知问题语义
* Hierarchy (all operators only see data[D1][D2] + dim2_sizes, no problem semantics):
*
* 第 1 层 - 元素级Element: 操作单个元素
* 行内: swap, reverse(2-opt), insert, flip
* 跨行: cross_relocate单元素移行, cross_swap单元素换行
* Layer 1 - Element: operate on single elements
* Within row: swap, reverse(2-opt), insert, flip
* Cross-row: cross_relocate (move one element across rows), cross_swap (swap one element per row)
*
* 第 2 层 - 片段级Segment: 操作连续片段
* 行内: or_opt移动连续 k 个元素到行内新位置)
* 跨行: seg_relocate片段从一行移到另一行
* seg_swap(两行各取一段互换,即 2-opt*
* Layer 2 - Segment: operate on contiguous segments
* Within row: or_opt (move contiguous k elements to a new position in the row)
* Cross-row: seg_relocate (move a segment from one row to another)
* seg_swap (swap two segments from two rows each, i.e. 2-opt*)
*
* 第 3 层 - 行级Row: 操作整行
* row_swap(交换两行全部内容和长度)
* row_reverse(反转行的排列顺序)
* row_split(一行拆成两行)
* row_merge(两行合并为一行)
* Layer 3 - Row: operate on whole rows
* row_swap (swap full contents and lengths of two rows)
* row_reverse (reverse row order)
* row_split (split one row into two)
* row_merge (merge two rows into one)
*
* 第 4 层 - 交叉Crossover: 组合两个解
* row_crossover(从父代 A/B 各取若干行组成子代)
* uniform_crossover(逐元素从两个父代中选)
* Layer 4 - Crossover: combine two solutions
* row_crossover (child takes some rows from parent A and B)
* uniform_crossover (pick per element from two parents)
*
* Move 描述符:
* row, row2: 行索引row2=-1 表示行内)
* op: 操作码
* pos1, pos2: 位置参数
* seg_len: 片段长度(第 2 层使用)
* Move descriptor:
* row, row2: row indices (row2=-1 means within-row)
* op: operation code
* pos1, pos2: position parameters
* seg_len: segment length (used by layer 2)
*
* 设计原则:
* - 所有算子对问题类型无感知,只操作二维数组
* - 每个算子都有对应的 undo 操作
* - 空行安全:自动降级为 no-op
* - 编码类型决定可用算子集
* Design principles:
* - All operators are problem-agnostic; they only manipulate a 2D array
* - Each operator has a corresponding undo
* - Empty-row safe: automatically degrades to no-op
* - Encoding type determines the available operator set
*/
#pragma once
@ -44,61 +44,61 @@
namespace ops {
// ============================================================
// Op code constants — numbered by layer to avoid collisions
// ============================================================

// General
constexpr int OP_NOOP = -1;

// --- Layer 1: element ---
// Permutation, within row
constexpr int PERM_SWAP = 0;            // swap two positions
constexpr int PERM_REVERSE = 1;         // reverse an interval (2-opt)
constexpr int PERM_INSERT = 2;          // move one element to a new position
// Permutation, cross-row
constexpr int PERM_CROSS_RELOCATE = 3;  // move one element from one row to another
constexpr int PERM_CROSS_SWAP = 4;      // swap one element per row between two rows
// Binary, within row
constexpr int BIN_FLIP = 0;             // flip one bit
constexpr int BIN_SWAP = 1;             // swap two bits
// Binary, cross-row
constexpr int BIN_CROSS_SWAP = 2;       // swap one bit per row between two rows

// --- Layer 1 (cont.): permutation within row ---
constexpr int PERM_3OPT = 5;            // 3-opt: break 3 edges and reconnect

// --- Layer 2: segment ---
constexpr int PERM_OR_OPT = 10;         // within row: move contiguous k elements
constexpr int PERM_SEG_RELOCATE = 11;   // cross-row: move a segment from one row to another
constexpr int PERM_SEG_SWAP = 12;       // cross-row: swap one segment from each of two rows (2-opt*)
constexpr int PERM_CROSS_EXCHANGE = 15; // cross-row: exchange segments, preserving each segment's internal order
constexpr int BIN_SEG_FLIP = 13;        // within row: flip contiguous k bits
constexpr int BIN_SEG_CROSS_SWAP = 14;  // cross-row: swap one segment from each of two rows
constexpr int BIN_K_FLIP = 16;          // within row: flip k random bits at once

// --- Layer 3: row ---
constexpr int ROW_SWAP = 20;            // swap the full contents of two rows
constexpr int ROW_REVERSE = 21;         // reverse row order (row index permutation)
constexpr int ROW_SPLIT = 22;           // split one row into two
constexpr int ROW_MERGE = 23;           // merge two rows into one

// --- Special: perturbation (multi-step moves, no undo; used to escape local optima) ---
constexpr int PERTURBATION = 40;

// --- Layer 4: crossover ---
constexpr int CROSS_ROW = 30;           // row crossover: take some rows from each parent
constexpr int CROSS_UNIFORM = 31;       // uniform crossover: pick per element from two parents

// ============================================================
// Move descriptor — encoding-level change description
// ============================================================
struct Move {
    int row;        // source row (or first row)
    int row2;       // target row (-1 = within-row move)
    int op;         // operation code (one of the constants above)
    int pos1, pos2; // position parameters
    int seg_len;    // segment length (layer 2; 0 for other layers)
};

} // namespace ops
@ -106,10 +106,10 @@ struct Move {
namespace ops {
// ============================================================
// Layer 1: element-level primitives
// ============================================================
// --- Permutation within row ---
__device__ inline void perm_swap(int* row, int i, int j) {
int tmp = row[i]; row[i] = row[j]; row[j] = tmp;
@ -126,9 +126,9 @@ __device__ inline void perm_insert(int* row, int from, int to, int size) {
row[to] = val;
}
// --- Permutation cross-row ---
/// cross_relocate: take the element at src_row[src_pos] and insert it at dst_row[dst_pos]
__device__ inline void perm_cross_relocate(int* src_row, int& src_size,
int* dst_row, int& dst_size,
int src_pos, int dst_pos) {
@ -142,24 +142,24 @@ __device__ inline void perm_cross_relocate(int* src_row, int& src_size,
dst_size++;
}
/// cross_swap: swap rowA[posA] and rowB[posB]
__device__ inline void cross_swap_elem(int* rowA, int posA, int* rowB, int posB) {
    const int held = rowB[posB];
    rowB[posB] = rowA[posA];
    rowA[posA] = held;
}
// --- Permutation within row: 3-opt ---
// Break 3 edges and pick a reconnection (8 combinations; choose one random non-identity variant)
// Args: three breakpoints i < j < k split the route into seg0=[0,i] seg1=[i+1,j] seg2=[j+1,k] seg3=[k+1,end]
// Impl: one reconnection (reverse seg1, reverse seg2, or both)
// pos1=i, pos2=j, seg_len encodes k
__device__ inline void perm_3opt(int* row, int size, int i, int j, int k) {
// 3-opt 有多种重连方式,这里实现最常用的 3 种非恒等变换:
// type 1: reverse [i+1, j] — 等价于 2-opt(i+1, j)
// type 2: reverse [j+1, k] — 等价于 2-opt(j+1, k)
// type 3: reverse [i+1, j] + reverse [j+1, k] — 真正的 3-opt move
// type 4: 将 seg1 和 seg2 互换位置(不反转) — or-opt 的泛化
// 我们随机选 type 3 或 type 4type 1/2 已被 2-opt 覆盖)
// 这里固定做 type 3双反转因为它是 2-opt 无法达到的唯一新邻域
// 3-opt has several reconnections; here we use the most common non-identity variants:
// type 1: reverse [i+1, j] — same as 2-opt(i+1, j)
// type 2: reverse [j+1, k] — same as 2-opt(j+1, k)
// type 3: reverse [i+1, j] + reverse [j+1, k] — true 3-opt move
// type 4: swap seg1 and seg2 (no reverse) — generalization of or-opt
// We would randomize type 3 or 4 (types 1/2 are covered by 2-opt)
// Here we fix type 3 (double reverse) as the only new neighborhood 2-opt cannot reach
// reverse [i+1, j]
int lo = i + 1, hi = j;
while (lo < hi) { int t = row[lo]; row[lo] = row[hi]; row[hi] = t; lo++; hi--; }
@ -168,12 +168,12 @@ __device__ inline void perm_3opt(int* row, int size, int i, int j, int k) {
while (lo < hi) { int t = row[lo]; row[lo] = row[hi]; row[hi] = t; lo++; hi--; }
}
// 3-opt undo: repeat the same move to restore (the double reverse is self-inverse).
// NOTE(fix): the body must call perm_3opt exactly once — a duplicated call would
// apply the self-inverse move twice, turning the undo into a no-op.
__device__ inline void perm_3opt_undo(int* row, int size, int i, int j, int k) {
    perm_3opt(row, size, i, j, k); // self-inverse
}
// --- Binary within row ---

/// Flip the bit at index i (values are 0/1).
__device__ inline void bin_flip(int* row, int i) {
    const int flipped = 1 - row[i];
    row[i] = flipped;
}
@ -182,51 +182,51 @@ __device__ inline void bin_swap(int* row, int i, int j) {
}
// ============================================================
// Layer 2: segment-level primitives
// ============================================================
/// or_opt: within a row, move the contiguous seg_len elements starting at `from`
/// to position `to`.
/// Equivalent to: remove [from, from+seg_len), then insert before `to`.
/// Constraints: from + seg_len <= size, and `to` is not in [from, from+seg_len).
__device__ inline void perm_or_opt(int* row, int size, int from, int to, int seg_len) {
    // Temp buffer (segment length bounded to keep register use small;
    // in practice seg_len is usually <= 4)
    int buf[8]; // enough for typical seg_len
    int actual_len = (seg_len > 8) ? 8 : seg_len;
    // Save the segment
    for (int i = 0; i < actual_len; i++) buf[i] = row[from + i];
    // Remove the segment (shift left to close the gap)
    int new_size = size - actual_len;
    for (int k = from; k < new_size; k++) row[k] = row[k + actual_len];
    // Compute the insertion index in the post-removal coordinate system
    int ins = (to > from) ? to - actual_len : to;
    if (ins < 0) ins = 0;
    if (ins > new_size) ins = new_size;
    // Insert the segment (shift right to make room)
    for (int k = new_size - 1; k >= ins; k--) row[k + actual_len] = row[k];
    for (int i = 0; i < actual_len; i++) row[ins + i] = buf[i];
}
/// seg_relocate: take contiguous seg_len elements from src_row and insert them at dst_pos in dst_row
/// src_size -= seg_len, dst_size += seg_len
__device__ inline void perm_seg_relocate(int* src_row, int& src_size,
int* dst_row, int& dst_size,
int src_pos, int dst_pos, int seg_len) {
int buf[8];
int actual_len = (seg_len > 8) ? 8 : seg_len;
// 保存片段
// Save segment
for (int i = 0; i < actual_len; i++) buf[i] = src_row[src_pos + i];
// 源行:移除(左移)
// Source row: remove (shift left)
for (int k = src_pos; k < src_size - actual_len; k++)
src_row[k] = src_row[k + actual_len];
src_size -= actual_len;
// 目标行:插入(右移)
// Destination row: insert (shift right)
for (int k = dst_size - 1; k >= dst_pos; k--)
dst_row[k + actual_len] = dst_row[k];
for (int i = 0; i < actual_len; i++)
@ -234,29 +234,29 @@ __device__ inline void perm_seg_relocate(int* src_row, int& src_size,
dst_size += actual_len;
}
/// seg_swap: swap one segment from each of two rows (general form of 2-opt*)
/// rowA[posA..posA+lenA) <-> rowB[posB..posB+lenB)
/// Row lengths change: sizeA += (lenB - lenA), sizeB += (lenA - lenB)
__device__ inline void perm_seg_swap(int* rowA, int& sizeA, int posA, int lenA,
int* rowB, int& sizeB, int posB, int lenB) {
int bufA[8], bufB[8];
int aLen = (lenA > 8) ? 8 : lenA;
int bLen = (lenB > 8) ? 8 : lenB;
// 保存两段
// Save both segments
for (int i = 0; i < aLen; i++) bufA[i] = rowA[posA + i];
for (int i = 0; i < bLen; i++) bufB[i] = rowB[posB + i];
// 从 rowA 移除 segA腾出空间插入 segB
// 先移除
// Remove segA from rowA to make room for segB
// Remove first
int newSizeA = sizeA - aLen;
for (int k = posA; k < newSizeA; k++) rowA[k] = rowA[k + aLen];
// 再插入 segB
// Then insert segB
for (int k = newSizeA - 1; k >= posA; k--) rowA[k + bLen] = rowA[k];
for (int i = 0; i < bLen; i++) rowA[posA + i] = bufB[i];
sizeA = newSizeA + bLen;
// 从 rowB 移除 segB腾出空间插入 segA
// Remove segB from rowB to make room for segA
int newSizeB = sizeB - bLen;
for (int k = posB; k < newSizeB; k++) rowB[k] = rowB[k + bLen];
for (int k = newSizeB - 1; k >= posB; k--) rowB[k + aLen] = rowB[k];
@ -264,10 +264,10 @@ __device__ inline void perm_seg_swap(int* rowA, int& sizeA, int posA, int lenA,
sizeB = newSizeB + aLen;
}
/// cross_exchange: exchange one segment from each of two rows, preserving the
/// internal order of each segment (segment lengths may differ)
/// Closely related to seg_swap; both exchange segments between two rows
/// rowA[posA..posA+lenA) <-> rowB[posB..posB+lenB)
/// Row lengths change: sizeA += (lenB - lenA), sizeB += (lenA - lenB)
__device__ inline void perm_cross_exchange(int* rowA, int& sizeA, int posA, int lenA,
int* rowB, int& sizeB, int posB, int lenB) {
int bufA[8], bufB[8];
@ -277,14 +277,14 @@ __device__ inline void perm_cross_exchange(int* rowA, int& sizeA, int posA, int
for (int i = 0; i < aLen; i++) bufA[i] = rowA[posA + i];
for (int i = 0; i < bLen; i++) bufB[i] = rowB[posB + i];
// rowA: 移除 segA插入 segB
// rowA: remove segA, insert segB
int newSizeA = sizeA - aLen;
for (int k = posA; k < newSizeA; k++) rowA[k] = rowA[k + aLen];
for (int k = newSizeA - 1; k >= posA; k--) rowA[k + bLen] = rowA[k];
for (int i = 0; i < bLen; i++) rowA[posA + i] = bufB[i];
sizeA = newSizeA + bLen;
// rowB: 移除 segB插入 segA
// rowB: remove segB, insert segA
int newSizeB = sizeB - bLen;
for (int k = posB; k < newSizeB; k++) rowB[k] = rowB[k + bLen];
for (int k = newSizeB - 1; k >= posB; k--) rowB[k + aLen] = rowB[k];
@ -292,8 +292,8 @@ __device__ inline void perm_cross_exchange(int* rowA, int& sizeA, int posA, int
sizeB = newSizeB + aLen;
}
/// k-bit flip: flip k random bits at once (Binary encoding)
/// Positions are drawn uniformly at random; k = number of flip operations performed
__device__ inline void bin_k_flip(int* row, int size, int k, curandState* rng) {
for (int i = 0; i < k; i++) {
int pos = rand_int(rng, size);
@ -301,12 +301,12 @@ __device__ inline void bin_k_flip(int* row, int size, int k, curandState* rng) {
}
}
/// seg_flip: flip a run of seg_len contiguous bits within a row (Binary encoding)
__device__ inline void bin_seg_flip(int* row, int pos, int seg_len) {
    for (int off = 0; off < seg_len; off++) {
        row[pos + off] = 1 - row[pos + off];
    }
}
/// seg_cross_swap: swap an equal-length segment between two rows (Binary encoding)
__device__ inline void bin_seg_cross_swap(int* rowA, int posA,
int* rowB, int posB, int seg_len) {
for (int i = 0; i < seg_len; i++) {
@ -317,23 +317,23 @@ __device__ inline void bin_seg_cross_swap(int* rowA, int posA,
}
// ============================================================
// Integer encoding primitives
// ============================================================
/// int_clamp: clamp value v to the closed range [lb, ub]
__device__ inline int int_clamp(int v, int lb, int ub) {
    return (v < lb) ? lb : ((v > ub) ? ub : v);
}
/// int_random_reset: reset one position to a uniform random value in [lb, ub]
__device__ inline void int_random_reset(int* row, int pos, int lb, int ub,
                                        curandState* rng) {
    const int span = ub - lb + 1;
    row[pos] = lb + (int)(curand(rng) % span);
}
/// int_delta: at one random position, add ±k (clamped to [lb, ub])
__device__ inline void int_delta(int* row, int pos, int lb, int ub,
curandState* rng) {
int range = ub - lb + 1;
@ -343,7 +343,7 @@ __device__ inline void int_delta(int* row, int pos, int lb, int ub,
row[pos] = int_clamp(row[pos] + step, lb, ub);
}
/// int_seg_reset: reset seg_len contiguous positions to uniform random values in [lb, ub]
__device__ inline void int_seg_reset(int* row, int pos, int seg_len,
int lb, int ub, curandState* rng) {
int range = ub - lb + 1;
@ -351,7 +351,7 @@ __device__ inline void int_seg_reset(int* row, int pos, int seg_len,
row[pos + i] = lb + (curand(rng) % range);
}
/// int_k_delta: k random positions, each adjusted by ±1
__device__ inline void int_k_delta(int* row, int size, int k,
int lb, int ub, curandState* rng) {
for (int i = 0; i < k; i++) {
@ -362,21 +362,21 @@ __device__ inline void int_k_delta(int* row, int size, int k,
}
// ============================================================
// Layer 3: row-level primitives
// ============================================================
/// row_swap: swap the full contents and lengths of two rows
template<typename Sol>
__device__ inline void row_swap(Sol& sol, int r1, int r2) {
// 交换长度
// Swap lengths
int tmp_size = sol.dim2_sizes[r1];
sol.dim2_sizes[r1] = sol.dim2_sizes[r2];
sol.dim2_sizes[r2] = tmp_size;
// 交换数据(取两行中较长的长度)
// Swap data (use the longer of the two row lengths)
int max_len = (sol.dim2_sizes[r1] > sol.dim2_sizes[r2])
? sol.dim2_sizes[r1] : sol.dim2_sizes[r2];
// 交换后 r1 的长度是原 r2 的r2 的长度是原 r1 的
// 所以需要交换 max(原r1长度, 原r2长度) 个元素
// After swap, r1 has old r2 length and r2 has old r1 length
// So swap max(old r1 len, old r2 len) elements
max_len = (tmp_size > max_len) ? tmp_size : max_len;
for (int c = 0; c < max_len; c++) {
int tmp = sol.data[r1][c];
@ -385,8 +385,8 @@ __device__ inline void row_swap(Sol& sol, int r1, int r2) {
}
}
/// row_reverse: reverse the row order within [r1, r2]
/// e.g. row_reverse(sol, 1, 4) turns rows 1,2,3,4 into 4,3,2,1
template<typename Sol>
__device__ inline void row_reverse_range(Sol& sol, int r1, int r2) {
while (r1 < r2) {
@ -395,23 +395,23 @@ __device__ inline void row_reverse_range(Sol& sol, int r1, int r2) {
}
}
/// row_split: split `row` at split_pos into two rows.
/// `row` keeps [0, split_pos); `empty_row` receives [split_pos, size).
/// Precondition: empty_row is currently empty or has enough space.
template<typename Sol>
__device__ inline void row_split(Sol& sol, int row, int empty_row, int split_pos) {
    const int tail_len = sol.dim2_sizes[row] - split_pos;
    // Copy the tail into empty_row
    for (int i = 0; i < tail_len; i++) {
        sol.data[empty_row][i] = sol.data[row][split_pos + i];
    }
    sol.dim2_sizes[empty_row] = tail_len;
    sol.dim2_sizes[row] = split_pos;
}
/// row_merge: append the full contents of src_row to the end of dst_row
/// src_row is cleared; dst_row's length grows
/// Requires: dst_size + src_size <= DIM2
template<typename Sol>
__device__ inline void row_merge(Sol& sol, int dst_row, int src_row) {
int dst_size = sol.dim2_sizes[dst_row];
@ -423,33 +423,33 @@ __device__ inline void row_merge(Sol& sol, int dst_row, int src_row) {
}
// ============================================================
// Layer 4: crossover primitives
// ============================================================
//
// Permutation encoding: OX family (unified framework)
// Core: mark a set of "kept" positions from A; fill the gaps in B's global order
// The three variants differ only in how the keep set is chosen; the fill logic is shared
// Uniqueness is guaranteed: elements not in the keep set are taken from B in order, with no duplicates
// Row lengths unchanged (= A's row lengths), row boundaries unchanged
//
// Binary encoding: uniform_crossover (random per-element pick)
//
// ============================================================
// ---- OX core fill logic ----
// keep[r][c] = true means child[r][c] keeps A's value; false = gap to fill
// Gaps are filled in order of appearance of elements in B (row-major scan)
// Requires: child already copied from A, dim2_sizes set to A's row lengths
//
// total_elements: total element count in partitioned mode; in non-partitioned mode = single row length
// Used to bound the scan range in B
template<typename Sol>
__device__ inline void ox_fill_from_b(Sol& child, const Sol& parentB,
const bool* keep_flat,
int dim1, int total_elements) {
// 统计 A 中保留位置的每个值的出现次数(支持多重集排列)
// keep_flat 是按行展平的:keep_flat[r * DIM2 + c]
// Count occurrences of each value at kept positions in A (multiset permutations)
// keep_flat is row-major flat: keep_flat[r * DIM2 + c]
int keep_count[512];
for (int i = 0; i < total_elements; i++) keep_count[i] = 0;
@ -460,21 +460,21 @@ __device__ inline void ox_fill_from_b(Sol& child, const Sol& parentB,
if (v >= 0 && v < total_elements) keep_count[v]++;
}
// 从 B 中按行扫描顺序收集:每个值只取"需要填充"的份数
// 标准排列:每个值最多 1 份,多重集:每个值最多 repeat_count 份
// Collect from B in row scan order: take only as many of each value as needed to fill
// Standard permutation: at most 1 of each value; multiset: up to repeat_count each
int fill_buf[512];
int fill_count = 0;
for (int r = 0; r < dim1; r++)
for (int c = 0; c < parentB.dim2_sizes[r]; c++) {
int val = parentB.data[r][c];
if (val >= 0 && val < total_elements && keep_count[val] > 0) {
keep_count[val]--; // 消耗一个保留名额
keep_count[val]--; // consume one kept slot
} else if (val >= 0 && val < total_elements) {
fill_buf[fill_count++] = val;
}
}
// 按空位顺序(逐行从左到右)填入
// Fill gaps in order (row by row, left to right)
int fi = 0;
for (int r = 0; r < dim1; r++)
for (int c = 0; c < child.dim2_sizes[r]; c++)
@ -482,26 +482,26 @@ __device__ inline void ox_fill_from_b(Sol& child, const Sol& parentB,
child.data[r][c] = fill_buf[fi++];
}
// ---- Variant 1: OX-interval ----
// Keep one random contiguous interval per row; preserves adjacency
template<typename Sol>
__device__ inline void ox_interval(Sol& child, const Sol& parentA, const Sol& parentB,
int dim1, int total_elements, curandState* rng) {
bool keep[Sol::DIM1 * Sol::DIM2];
for (int i = 0; i < Sol::DIM1 * Sol::DIM2; i++) keep[i] = false;
// child = A,同时标记每行的保留区间
// child = A, mark each row's kept interval
for (int r = 0; r < dim1; r++) {
int sz = parentA.dim2_sizes[r];
child.dim2_sizes[r] = sz;
for (int c = 0; c < sz; c++) child.data[r][c] = parentA.data[r][c];
if (sz < 2) {
// 长度 0 或 1全部保留
// length 0 or 1: keep all
for (int c = 0; c < sz; c++) keep[r * Sol::DIM2 + c] = true;
continue;
}
// 随机选区间 [lo, hi]
// Random interval [lo, hi]
int lo = rand_int(rng, sz);
int hi = rand_int(rng, sz);
if (lo > hi) { int tmp = lo; lo = hi; hi = tmp; }
@ -511,8 +511,8 @@ __device__ inline void ox_interval(Sol& child, const Sol& parentA, const Sol& pa
ox_fill_from_b(child, parentB, keep, dim1, total_elements);
}
// ---- Variant 2: OX-subset ----
// Randomly keep ~50% of positions at their values from A; the most general variant
template<typename Sol>
__device__ inline void ox_subset(Sol& child, const Sol& parentA, const Sol& parentB,
int dim1, int total_elements, curandState* rng) {
@ -526,7 +526,7 @@ __device__ inline void ox_subset(Sol& child, const Sol& parentA, const Sol& pare
child.data[r][c] = parentA.data[r][c];
}
// 每个位置 50% 概率保留
// 50% keep per position
for (int r = 0; r < dim1; r++)
for (int c = 0; c < child.dim2_sizes[r]; c++)
keep[r * Sol::DIM2 + c] = (curand_uniform(rng) < 0.5f);
@ -534,9 +534,9 @@ __device__ inline void ox_subset(Sol& child, const Sol& parentA, const Sol& pare
ox_fill_from_b(child, parentB, keep, dim1, total_elements);
}
// ---- Variant 3: OX-row ----
// Randomly keep whole rows; refill the remaining rows' elements in B's order
// Preserves entire route structures; beneficial for VRP
template<typename Sol>
__device__ inline void ox_row(Sol& child, const Sol& parentA, const Sol& parentB,
int dim1, int total_elements, curandState* rng) {
@ -550,7 +550,7 @@ __device__ inline void ox_row(Sol& child, const Sol& parentA, const Sol& parentB
child.data[r][c] = parentA.data[r][c];
}
// 每行 50% 概率整行保留
// 50% chance to keep whole row
int kept = 0;
for (int r = 0; r < dim1; r++) {
if (curand_uniform(rng) < 0.5f) {
@ -559,14 +559,14 @@ __device__ inline void ox_row(Sol& child, const Sol& parentA, const Sol& parentB
kept++;
}
}
// 确保不是全保留或全不保留
// Ensure not all-kept or all-unkept
if (kept == 0) {
int r = rand_int(rng, dim1);
// 不标记任何 keep → 全部重填(至少有一行不保留)
// 实际上 kept==0 意味着全部重填这是合法的child = B 的顺序填入 A 的结构)
// No keep marks → full refill (at least one row not kept)
// kept==0 means full refill; valid (child gets B's order into A's structure)
}
if (kept == dim1 && dim1 > 1) {
// 全保留 → 随机取消一行
// All kept → randomly un-keep one row
int r = rand_int(rng, dim1);
for (int c = 0; c < child.dim2_sizes[r]; c++)
keep[r * Sol::DIM2 + c] = false;
@ -575,14 +575,14 @@ __device__ inline void ox_row(Sol& child, const Sol& parentA, const Sol& parentB
ox_fill_from_b(child, parentB, keep, dim1, total_elements);
}
// ---- OX unified entry ----
// Pick one variant at random
// When dim1 == 1, use only interval and subset (the row variant is meaningless)
template<typename Sol>
__device__ inline void perm_ox_crossover(Sol& child, const Sol& parentA, const Sol& parentB,
int dim1, int total_elements, curandState* rng) {
int n_variants = (dim1 > 1) ? 3 : 2;
int variant = rand_int(rng, n_variants); // 0: 区间, 1: 子集, [2: 行]
int variant = rand_int(rng, n_variants); // 0: interval, 1: subset, [2: row]
switch (variant) {
case 0: ox_interval(child, parentA, parentB, dim1, total_elements, rng); break;
case 1: ox_subset(child, parentA, parentB, dim1, total_elements, rng); break;
@ -590,8 +590,8 @@ __device__ inline void perm_ox_crossover(Sol& child, const Sol& parentA, const S
}
}
/// uniform_crossover: choose each element at random from one of the two parents
/// Suitable for Binary encoding (does not break permutation constraints)
template<typename Sol>
__device__ inline void uniform_crossover(Sol& child, const Sol& parentA, const Sol& parentB,
int dim1, curandState* rng) {
@ -607,15 +607,15 @@ __device__ inline void uniform_crossover(Sol& child, const Sol& parentA, const S
}
}
// [removed] generate_move_for_seq / sample_and_generate / apply_move / undo_move
// After the P0 refactor the main path uses execute_sequence; the old Move generate/apply/undo path is no longer needed
// ============================================================
// execute_sequence — unified API: generate parameters and execute directly (no Move returned)
// ============================================================
// Returns true if sol was modified, false on NOOP
// d_G, d_O, rel_N: optional relation-matrix pointers (used by SEQ_LNS_GUIDED_REBUILD)
// val_lb, val_ub: value range for Integer encoding (ignored by other encodings)
template<typename Sol>
__device__ inline bool execute_sequence(int seq_id, Sol& sol, int dim1,
@ -627,7 +627,7 @@ __device__ inline bool execute_sequence(int seq_id, Sol& sol, int dim1,
int val_ub = 1,
const void* prob_data = nullptr) {
// ============================================================
// Permutation 序列
// Permutation sequences
// ============================================================
if (encoding == EncodingType::Permutation) {
switch (seq_id) {
@ -841,15 +841,15 @@ __device__ inline bool execute_sequence(int seq_id, Sol& sol, int dim1,
return true;
}
case seq::SEQ_LNS_GUIDED_REBUILD: {
// 关系矩阵引导重建:
// 1. 随机选种子元素 seed
// 2. 查 G[seed] 找分组倾向最强的 K 个元素
// 3. 在解中找到这些元素的位置
// 4. 按 O 矩阵引导的顺序重排这些位置的元素
// Relation-matrix guided rebuild:
// 1. Pick random seed element seed
// 2. Look up G[seed] for K elements with strongest grouping affinity
// 3. Find positions of these elements in the solution
// 4. Reorder these positions by order guided by O matrix
//
// 如果没有关系矩阵(冷启动),退化为 scatter_shuffle
// Without relation matrices (cold start), fall back to scatter_shuffle
if (!d_G || !d_O || rel_N <= 0) {
// 退化:随机 scatter shuffle
// Fallback: random scatter shuffle
int row = (dim1 > 1) ? rand_int(rng, dim1) : 0;
int sz = sol.dim2_sizes[row];
if (sz < 4) return false;
@ -872,21 +872,21 @@ __device__ inline bool execute_sequence(int seq_id, Sol& sol, int dim1,
return true;
}
// --- 有关系矩阵:引导重建 ---
// 通用策略(不感知问题类型):
// G 矩阵 → 选哪些元素(分组倾向弱的 = 可能放错位置的)
// O 矩阵 → 怎么排(排序倾向引导重排顺序)
// 两者协同G 选人O 排序
// --- With relation matrices: guided rebuild ---
// Generic strategy (problem-agnostic):
// G matrix → which elements (weak grouping with seed = likely misplaced)
// O matrix → how to order (ordering affinity guides reorder)
// Together: G picks, O orders
int row = (dim1 > 1) ? rand_int(rng, dim1) : 0;
int sz = sol.dim2_sizes[row];
if (sz < 4) return false;
// 选种子元素
// Pick seed element
int seed_pos = rand_int(rng, sz);
int seed_val = sol.data[row][seed_pos];
if (seed_val < 0 || seed_val >= rel_N) return false;
// 检查矩阵是否有足够信息G 和 O 任一有信号即可)
// Check matrices have enough signal (either G or O)
float max_signal = 0.0f;
for (int c = 0; c < sz; c++) {
int v = sol.data[row][c];
@ -897,11 +897,11 @@ __device__ inline bool execute_sequence(int seq_id, Sol& sol, int dim1,
if (o > max_signal) max_signal = o;
}
}
if (max_signal < 0.05f) return false; // 信息不足,跳过
if (max_signal < 0.05f) return false; // insufficient signal, skip
// 破坏:锦标赛选择 G 值低的元素t=2
// G 值低 = 与 seed 分组倾向弱 = 可能放错位置
// 锦标赛:随机抽 2 个,取 G 值更低的那个,重复 count 次
// Destroy: tournament pick low-G elements (t=2)
// Low G = weak grouping with seed = likely misplaced
// Tournament: draw 2 at random, take lower G, repeat count times
constexpr int MAX_REBUILD = 10;
constexpr int TOUR_SIZE = 2;
int count = sz / 5; // ~20%
@ -911,12 +911,12 @@ __device__ inline bool execute_sequence(int seq_id, Sol& sol, int dim1,
int sel_pos[MAX_REBUILD];
int sel_val[MAX_REBUILD];
bool used[128] = {}; // 标记已选位置,防止重复
bool used[128] = {}; // mark chosen positions to avoid duplicates
int picked = 0;
int max_attempts = count * 4; // 防止死循环
int max_attempts = count * 4; // avoid infinite loop
for (int attempt = 0; attempt < max_attempts && picked < count; attempt++) {
// 锦标赛:随机抽 TOUR_SIZE 个候选,取 G 值最低的
// Tournament: draw TOUR_SIZE candidates at random, take lowest G
int best_c = -1;
float best_g = 1e30f;
for (int t = 0; t < TOUR_SIZE; t++) {
@ -936,15 +936,15 @@ __device__ inline bool execute_sequence(int seq_id, Sol& sol, int dim1,
if (picked < 2) return false;
count = picked;
// 修复锦标赛排序O 矩阵引导 + 随机扰动)
// 插入排序比较时加噪声实现概率性O 值高的大概率排前面,但不绝对
// Repair: tournament sort (O-guided + random noise)
// Insertion sort with noisy comparison: high O tends to go first, not guaranteed
for (int i = 1; i < count; i++) {
int key = sel_val[i];
int j = i - 1;
while (j >= 0) {
float o_key_before = d_O[key * rel_N + sel_val[j]];
float o_j_before = d_O[sel_val[j] * rel_N + key];
// 噪声幅度 0.05O 值差距 >0.05 时基本确定,<0.05 时随机
// Noise scale 0.05: if O gap >0.05 mostly deterministic, else random
float noise = (curand_uniform(rng) - 0.5f) * 0.1f;
if (o_key_before + noise > o_j_before) {
sel_val[j + 1] = sel_val[j];
@ -956,7 +956,7 @@ __device__ inline bool execute_sequence(int seq_id, Sol& sol, int dim1,
sel_val[j + 1] = key;
}
// 对 sel_pos 排序(升序),使写回位置有序
// Sort sel_pos ascending so write-back order is stable
for (int i = 1; i < count; i++) {
int key = sel_pos[i];
int j = i - 1;
@ -967,7 +967,7 @@ __device__ inline bool execute_sequence(int seq_id, Sol& sol, int dim1,
sel_pos[j + 1] = key;
}
// 检查是否真的改变了排列
// Check whether permutation actually changed
bool any_change = false;
for (int i = 0; i < count; i++) {
if (sol.data[row][sel_pos[i]] != sel_val[i]) {
@ -977,7 +977,7 @@ __device__ inline bool execute_sequence(int seq_id, Sol& sol, int dim1,
}
if (!any_change) return false;
// 写回
// Write back
for (int i = 0; i < count; i++) {
sol.data[row][sel_pos[i]] = sel_val[i];
}
@ -989,7 +989,7 @@ __device__ inline bool execute_sequence(int seq_id, Sol& sol, int dim1,
}
// ============================================================
// Binary 序列
// Binary sequences
// ============================================================
if (encoding == EncodingType::Binary) {
switch (seq_id) {
@ -1063,7 +1063,7 @@ __device__ inline bool execute_sequence(int seq_id, Sol& sol, int dim1,
}
// ============================================================
// Integer 序列
// Integer sequences
// ============================================================
if (encoding == EncodingType::Integer) {
switch (seq_id) {
@ -1131,7 +1131,7 @@ __device__ inline bool execute_sequence(int seq_id, Sol& sol, int dim1,
}
// ============================================================
// 共享:行级序列(编码无关)
// Shared: row-level sequences (encoding-agnostic)
// ============================================================
switch (seq_id) {
case seq::SEQ_ROW_SWAP: {
@ -1194,11 +1194,11 @@ __device__ inline bool execute_sequence(int seq_id, Sol& sol, int dim1,
}
// ============================================================
// sample_and_execute — 从 SeqRegistry 按权重采样 + 直接执行
// sample_and_execute — sample from SeqRegistry by weight and execute directly
// ============================================================
// 返回 true 若 sol 被修改false 若 NOOP
// 输出参数 out_seq_idx采样到的序列在 registry 中的索引
// d_G, d_O, rel_N: 可选的关系矩阵(传递给 execute_sequence
// Returns true if sol modified, false if NOOP
// out_seq_idx: index of sampled sequence in registry
// d_G, d_O, rel_N: optional relation matrices (passed to execute_sequence)
template<typename Sol>
__device__ inline bool sample_and_execute(const SeqRegistry& reg,
@ -1212,7 +1212,7 @@ __device__ inline bool sample_and_execute(const SeqRegistry& reg,
int val_lb = 0,
int val_ub = 1,
const void* prob_data = nullptr) {
// 延迟归一化:使用缓存的 weights_sum
// Lazy normalization: use cached weights_sum
float r = curand_uniform(rng) * reg.weights_sum; // r ∈ [0, weights_sum)
float cumsum = 0.0f;
out_seq_idx = reg.count - 1;

View file

@ -1,10 +1,10 @@
/**
* population.cuh - 种群管理
* population.cuh - Population management
*
* v2.0: Block 级架构
* - RNG 数组大小 = pop_size * block_size每个 block 内每个线程独立 RNG
* - 初始化 kernel 保持 1-thread-per-solution初始化只做一次不需要并行
* - find_best_kernel 保持单线程(种群规模不大)
* v2.0: Block-level architecture
* - RNG array size = pop_size * block_size (one independent RNG per thread within each block)
* - Init kernel stays 1-thread-per-solution (initialization runs once; parallelism not needed)
* - find_best_kernel remains single-threaded (population size is modest)
*/
#pragma once
@ -12,7 +12,7 @@
#include "cuda_utils.cuh"
// ============================================================
// Device 端 Kernel模板化
// Device-side kernels (templated)
// ============================================================
template<typename Sol>
@ -65,9 +65,9 @@ __global__ void init_integer_kernel(Sol* pop, int pop_size,
}
// ============================================================
// 多重集排列初始化 — 每个值 [0, N) 重复 R 次,总长度 N*R
// Multiset permutation init — each value in [0, N) repeated R times, total length N*R
// ============================================================
// 用于 JSP 工序排列编码N=num_jobs, R=num_ops值 j 出现 R 次表示工件 j
// For JSP operation-sequence encoding: N=num_jobs, R=num_ops; value j appearing R times means job j
template<typename Sol>
__global__ void init_multiset_perm_kernel(Sol* pop, int pop_size,
@ -90,7 +90,7 @@ __global__ void init_multiset_perm_kernel(Sol* pop, int pop_size,
}
// ============================================================
// 分区初始化 — 元素 {0..total_elements-1} 不重复分配到 dim1 行
// Partition init — elements {0..total_elements-1} assigned without duplication across dim1 rows
// ============================================================
template<typename Sol>
@ -131,21 +131,21 @@ __global__ void find_best_kernel(const Sol* pop, int pop_size,
}
// ============================================================
// Host 端 RAII 类(模板化)
// Host-side RAII class (templated)
// ============================================================
template<typename Sol>
class Population {
public:
Sol* d_solutions = nullptr;
curandState* d_rng_states = nullptr; // 大小 = pop_size * block_size
curandState* d_rng_states = nullptr; // size = pop_size * block_size
int size = 0;
int rng_count = 0; // RNG 状态总数
int rng_count = 0; // total RNG states
Population() = default;
// block_size: Block 级架构下每个 block 的线程数
// RNG 数组大小 = pop_size * block_size每个 block 内每个线程独立 RNG
// block_size: threads per block under block-level architecture
// RNG array size = pop_size * block_size (one independent RNG per thread within each block)
void allocate(int pop_size, int block_size = 128) {
size = pop_size;
rng_count = pop_size * block_size;

View file

@ -1,20 +1,20 @@
/**
* relation_matrix.cuh - G/O 关系矩阵管理
* relation_matrix.cuh - G/O relation matrix management
*
* G[i][j]: 分组倾向(元素 i 和 j 应在同一行的倾向,对称)
* O[i][j]: 排序倾向(元素 i 应排在 j 前面的倾向,不对称)
* G[i][j]: grouping affinity (tendency for elements i and j to be on the same row; symmetric)
* O[i][j]: ordering affinity (tendency for element i to appear before j; asymmetric)
*
* 更新来源:历史最优解统计
* 每当 host 端获取到当前 best 解,扫描所有元素对关系:
* - 同行 → G[i][j] 增强
* - i 在 j 前 → O[i][j] 增强
* 使用 EMA 衰减:M[i][j] = α * M[i][j] + (1-α) * signal
* Update source: statistics from historical best solutions
* Whenever the host obtains the current best solution, scan all element-pair relations:
* - Same row → strengthen G[i][j]
* - i before j → strengthen O[i][j]
* EMA decay: M[i][j] = α * M[i][j] + (1-α) * signal
*
* 生命周期:
* 1. relation_matrix_create(N) — 分配 host/device 内存,初始化为 0
* 2. relation_matrix_update(rm, sol, dim1) — 从一个解更新 G/Ohost 端)
* 3. relation_matrix_upload(rm) — 上传 h_G/h_O 到 d_G/d_O
* 4. relation_matrix_destroy(rm) — 释放内存
* Lifecycle:
* 1. relation_matrix_create(N) — allocate host/device memory, initialize to 0
* 2. relation_matrix_update(rm, sol, dim1) — update G/O from one solution (host)
* 3. relation_matrix_upload(rm) — upload h_G/h_O to d_G/d_O
* 4. relation_matrix_destroy(rm) — free memory
*/
#pragma once
@ -23,7 +23,7 @@
#include <cstring>
// ============================================================
// 创建 / 销毁
// Create / destroy
// ============================================================
inline RelationMatrix relation_matrix_create(int N, float decay = 0.95f) {
@ -58,19 +58,19 @@ inline void relation_matrix_destroy(RelationMatrix& rm) {
}
// ============================================================
// 从一个解更新 G/Ohost 端)
// Update G/O from one solution (host)
// ============================================================
// sol: 当前最优解(已下载到 host
// dim1: 实际使用的行数
// sol: current best solution (already copied to host)
// dim1: number of rows in use
//
// 逻辑:
// 对 sol 中每对元素 (val_a, val_b)
// 如果在同一行 → G[val_a][val_b] 增强
// 如果 val_a 在 val_b 前面 → O[val_a][val_b] 增强
// Logic:
// For each pair (val_a, val_b) in sol:
// If on the same row → strengthen G[val_a][val_b]
// If val_a appears before val_b → strengthen O[val_a][val_b]
//
// 注意:元素值 val 必须在 [0, N) 范围内才有意义
// 对于 partition 编码VRP元素值就是客户编号
// 对于单行排列TSP元素值就是城市编号
// Note: element values val are meaningful only in [0, N)
// For partition encoding (VRP), values are customer IDs
// For single-row permutation (TSP), values are city IDs
template<typename Sol>
void relation_matrix_update(RelationMatrix& rm, const Sol& sol, int dim1) {
@ -78,13 +78,13 @@ void relation_matrix_update(RelationMatrix& rm, const Sol& sol, int dim1) {
float alpha = rm.decay;
float signal_strength = 1.0f;
// 衰减所有现有值
// Decay all existing values
for (int i = 0; i < N * N; i++) {
rm.h_G[i] *= alpha;
rm.h_O[i] *= alpha;
}
// 扫描解中的元素对关系
// Scan element-pair relations in the solution
for (int r = 0; r < dim1; r++) {
int sz = sol.dim2_sizes[r];
for (int c1 = 0; c1 < sz; c1++) {
@ -95,17 +95,17 @@ void relation_matrix_update(RelationMatrix& rm, const Sol& sol, int dim1) {
int val_b = sol.data[r][c2];
if (val_b < 0 || val_b >= N) continue;
// 同行 → G 增强(对称)
// Same row → strengthen G (symmetric)
rm.h_G[val_a * N + val_b] += (1.0f - alpha) * signal_strength;
rm.h_G[val_b * N + val_a] += (1.0f - alpha) * signal_strength;
// val_a 在 val_b 前 → O[val_a][val_b] 增强
// val_a before val_b → strengthen O[val_a][val_b]
rm.h_O[val_a * N + val_b] += (1.0f - alpha) * signal_strength;
}
}
}
// 裁剪到 [0, 1]
// Clamp to [0, 1]
for (int i = 0; i < N * N; i++) {
if (rm.h_G[i] > 1.0f) rm.h_G[i] = 1.0f;
if (rm.h_O[i] > 1.0f) rm.h_O[i] = 1.0f;
@ -115,7 +115,7 @@ void relation_matrix_update(RelationMatrix& rm, const Sol& sol, int dim1) {
}
// ============================================================
// 上传到 GPU
// Upload to GPU
// ============================================================
inline void relation_matrix_upload(const RelationMatrix& rm) {

File diff suppressed because it is too large Load diff

View file

@ -1,38 +1,39 @@
/**
* types.cuh - 核心类型定义
* types.cuh - Core type definitions
*
* 包含编码类型、Solution 模板、ProblemConfig/SolverConfig、
* SeqRegistryAOS 序列级权重、KStepConfig多步执行
* RelationMatrixG/O 关系矩阵、ProblemBaseCRTP 基类)
* Contains: encoding types, Solution template, ProblemConfig/SolverConfig,
* SeqRegistry (AOS sequence-level weights), KStepConfig (multi-step execution),
* RelationMatrix (G/O relation matrix), ProblemBase (CRTP base class)
*/
#pragma once
#include <cstdio>
#include "cuda_utils.cuh"
// ============================================================
// 编译时常量
// Compile-time constants
// ============================================================
constexpr int MAX_OBJ = 4; // 最多 4 个目标16字节不值得模板化
constexpr int MAX_SEQ = 32; // 最大序列数(内置 ~16 + 自定义算子 ≤8留余量
constexpr int MAX_K = 3; // 多步执行的最大步数K=1,2,3
// AOS 权重上下限(归一化后)
constexpr float AOS_WEIGHT_FLOOR = 0.05f; // 最低权重保底(确保充分探索)
constexpr float AOS_WEIGHT_CAP = 0.35f; // 最高权重上限(防止赢者通吃)
constexpr int MAX_OBJ = 4; // Max 4 objectives (16 bytes, not worth templatizing)
constexpr int MAX_SEQ = 32; // Max sequences (built-in ~16 + custom ops ≤8, with margin)
constexpr int MAX_K = 3; // Max steps for multi-step execution (K=1,2,3)
// AOS weight bounds
constexpr float AOS_WEIGHT_FLOOR = 0.05f; // Minimum weight floor (ensures sufficient exploration)
constexpr float AOS_WEIGHT_CAP = 0.35f; // Maximum weight cap (prevents winner-take-all)
// ============================================================
// 枚举类型
// Enum types
// ============================================================
enum class EncodingType {
Permutation, // 排列:元素不重复
Binary, // 0-1flip 是主要算子
Integer // 有界整数
Permutation, // Permutation: elements are unique
Binary, // 0-1: flip is the main operator
Integer // Bounded integers
};
enum class RowMode {
Single, // dim1=1单行TSP/QAP/Knapsack 等大部分问题)
Fixed, // dim1>1行等长不可变JSP-Int/Schedule禁止 SPLIT/MERGE
Partition // dim1>1元素分区到各行行长可变CVRP/VRPTW
Single, // dim1=1, single row (most problems: TSP/QAP/Knapsack, etc.)
Fixed, // dim1>1, equal row lengths fixed (JSP-Int/Schedule; SPLIT/MERGE disallowed)
Partition // dim1>1, elements partitioned across rows, variable row lengths (CVRP/VRPTW)
};
enum class ObjDir {
@ -40,241 +41,235 @@ enum class ObjDir {
Maximize
};
// 多目标比较模式
// Multi-objective comparison mode
enum class CompareMode {
Weighted, // 加权求和sum(weight[i] * obj[i]),越小越好
Lexicographic // 字典法:按优先级逐目标比较,前面的目标优先
Weighted, // Weighted sum: sum(weight[i] * obj[i]), lower is better
Lexicographic // Lexicographic: compare objectives by priority order
};
enum class MigrateStrategy {
Ring, // 环形:各岛最优→邻岛最差(慢传播,高多样性)
TopN, // 全局 Top-N 轮转分发(快传播,强收敛)
Hybrid // 两者兼顾Top-N 替换最差 + Ring 替换次差
Ring, // Ring: each island's best → neighbor's worst (slow spread, high diversity)
TopN, // Global Top-N round-robin (fast spread, strong convergence)
Hybrid // Hybrid: Top-N replaces worst + Ring replaces second-worst
};
// v5.0: 多 GPU 协同 — 解注入模式
// v5.0: multi-GPU coordination — solution injection mode
enum class MultiGpuInjectMode {
OneIsland, // 注入到 1 个岛的 worst保守保持多样性
HalfIslands, // 注入到 num_islands/2 个岛的 worst平衡
AllIslands // 注入到所有岛的 worst激进快速传播
OneIsland, // Inject into worst of 1 island (conservative, preserves diversity)
HalfIslands, // Inject into worst on num_islands/2 islands (balanced)
AllIslands // Inject into worst on all islands (aggressive, fast spread)
};
// v5.0 方案 B3: InjectBuffer — 被动注入缓冲区
// GPU 无感知CPU 同步写入GPU 在 migrate_kernel 中检查并应用
// 设计要点:
// 1. 使用同步 cudaMemcpy 避免与 solve() 的 stream/Graph 冲突
// 2. 写入顺序:先 solution 后 flagGPU 端原子读 flag 确保一致性
// 3. 完全解耦:不依赖 solve() 的任何内部状态
// v5.0 option B3: InjectBuffer — passive injection buffer
// GPU has no awareness; CPU writes synchronously; GPU checks and applies in migrate_kernel
// Design notes:
// 1. Use synchronous cudaMemcpy to avoid conflicts with solve() stream/Graph
// 2. Write order: solution first, then flag; GPU atomic flag read ensures consistency
// 3. Fully decoupled: does not depend on any internal state of solve()
template<typename Sol>
struct InjectBuffer {
Sol* d_solution; // Device 端解缓冲区(单个解)
int* d_flag; // Device 端标志位0=空1=有新解
Sol* d_solution = nullptr; // Device solution buffer (single solution)
int* d_flag = nullptr; // Device flag: 0=empty, 1=new solution
int owner_gpu = 0; // GPU that owns the allocation
// 分配 InjectBuffer在指定 GPU 上)
// Allocate InjectBuffer (on given GPU)
static InjectBuffer<Sol> allocate(int gpu_id) {
InjectBuffer<Sol> buf;
buf.owner_gpu = gpu_id;
// 保存原设备,切换到目标 GPU
int orig_device;
cudaGetDevice(&orig_device);
cudaSetDevice(gpu_id);
CUDA_CHECK(cudaGetDevice(&orig_device));
CUDA_CHECK(cudaSetDevice(gpu_id));
// 分配设备内存
cudaMalloc(&buf.d_solution, sizeof(Sol));
cudaMalloc(&buf.d_flag, sizeof(int));
CUDA_CHECK(cudaMalloc(&buf.d_solution, sizeof(Sol)));
CUDA_CHECK(cudaMalloc(&buf.d_flag, sizeof(int)));
// 初始化 flag 为 0
int zero = 0;
cudaMemcpy(buf.d_flag, &zero, sizeof(int), cudaMemcpyHostToDevice);
CUDA_CHECK(cudaMemcpy(buf.d_flag, &zero, sizeof(int), cudaMemcpyHostToDevice));
// 恢复原设备
cudaSetDevice(orig_device);
CUDA_CHECK(cudaSetDevice(orig_device));
return buf;
}
// 释放 InjectBuffer
// Free InjectBuffer (switches to owner GPU before freeing)
void destroy() {
if (d_solution) {
cudaFree(d_solution);
d_solution = nullptr;
}
if (d_flag) {
cudaFree(d_flag);
d_flag = nullptr;
if (d_solution || d_flag) {
int orig_device;
cudaGetDevice(&orig_device);
cudaSetDevice(owner_gpu);
if (d_solution) { cudaFree(d_solution); d_solution = nullptr; }
if (d_flag) { cudaFree(d_flag); d_flag = nullptr; }
cudaSetDevice(orig_device);
}
}
// CPU 端写入新解
// 注意:使用同步 cudaMemcpy 避免与 solve() 的 stream 冲突
// 顺序:先写 solution再写 flagGPU 端原子读 flag 确保不会读到半写状态)
// CPU-side write of new solution
// Note: synchronous cudaMemcpy avoids stream conflicts with solve()
// Order: write solution first, then flag (GPU atomic flag read avoids half-written reads)
void write_sync(const Sol& sol, int target_gpu) {
// 保存原设备,切换到目标 GPU
int orig_device;
cudaGetDevice(&orig_device);
cudaSetDevice(target_gpu);
CUDA_CHECK(cudaGetDevice(&orig_device));
CUDA_CHECK(cudaSetDevice(target_gpu));
// 先写解数据
cudaMemcpy(d_solution, &sol, sizeof(Sol), cudaMemcpyHostToDevice);
// 再写标志位(确保解数据已写完)
CUDA_CHECK(cudaMemcpy(d_solution, &sol, sizeof(Sol), cudaMemcpyHostToDevice));
int flag = 1;
cudaMemcpy(d_flag, &flag, sizeof(int), cudaMemcpyHostToDevice);
CUDA_CHECK(cudaMemcpy(d_flag, &flag, sizeof(int), cudaMemcpyHostToDevice));
// 恢复原设备
cudaSetDevice(orig_device);
CUDA_CHECK(cudaSetDevice(orig_device));
}
};
// ============================================================
// SeqID — 统一的 OperationSequence 编号
// SeqID — unified OperationSequence IDs
// ============================================================
// 每个 SeqID 对应一种具体的搜索操作(原子或多步)
// AOS 权重跟踪粒度 = SeqID每个序列独立权重
// Each SeqID maps to one concrete search operation (atomic or multi-step)
// AOS weight granularity = SeqID (independent weight per sequence)
//
// 命名规则SEQ_{编码}_{操作名}
// 跨编码共享的行级操作统一编号
// Naming: SEQ_{encoding}_{operation}
// Row-level ops shared across encodings use unified numbering
namespace seq {
// --- Permutation 行内(元素级)---
constexpr int SEQ_PERM_SWAP = 0; // swap 两个位置
constexpr int SEQ_PERM_REVERSE = 1; // 2-opt(反转区间)
constexpr int SEQ_PERM_INSERT = 2; // insert(移动到新位置)
constexpr int SEQ_PERM_3OPT = 3; // 3-opt(断 3 边重连)
// --- Permutation in-row (element-level) ---
constexpr int SEQ_PERM_SWAP = 0; // swap two positions
constexpr int SEQ_PERM_REVERSE = 1; // 2-opt (reverse segment)
constexpr int SEQ_PERM_INSERT = 2; // insert (move to new position)
constexpr int SEQ_PERM_3OPT = 3; // 3-opt (reconnect after 3 edges)
// --- Permutation 行内(片段级)---
constexpr int SEQ_PERM_OR_OPT = 4; // or-opt(移动连续 k 个元素)
// --- Permutation in-row (segment-level) ---
constexpr int SEQ_PERM_OR_OPT = 4; // or-opt (move k consecutive elements)
// --- Permutation 行内(组合级)---
constexpr int SEQ_PERM_DOUBLE_SWAP = 30; // 连续两次 swap同行
constexpr int SEQ_PERM_TRIPLE_SWAP = 31; // 连续三次 swap同行
// --- Permutation in-row (combo-level) ---
constexpr int SEQ_PERM_DOUBLE_SWAP = 30; // two consecutive swaps (same row)
constexpr int SEQ_PERM_TRIPLE_SWAP = 31; // three consecutive swaps (same row)
// --- Permutation 跨行(元素级)---
constexpr int SEQ_PERM_CROSS_RELOCATE = 5; // 单元素移行
constexpr int SEQ_PERM_CROSS_SWAP = 6; // 单元素换行
// --- Permutation cross-row (element-level) ---
constexpr int SEQ_PERM_CROSS_RELOCATE = 5; // single element moves row
constexpr int SEQ_PERM_CROSS_SWAP = 6; // single element swaps rows
// --- Permutation 跨行(片段级)---
constexpr int SEQ_PERM_SEG_RELOCATE = 7; // 片段移行
constexpr int SEQ_PERM_SEG_SWAP = 8; // 片段换行2-opt*
constexpr int SEQ_PERM_CROSS_EXCHANGE = 9; // 片段互换(保序)
// --- Permutation cross-row (segment-level) ---
constexpr int SEQ_PERM_SEG_RELOCATE = 7; // segment moves row
constexpr int SEQ_PERM_SEG_SWAP = 8; // segment swaps rows (2-opt*)
constexpr int SEQ_PERM_CROSS_EXCHANGE = 9; // segment exchange (order preserved)
// --- Binary 行内(元素级)---
constexpr int SEQ_BIN_FLIP = 0; // 翻转一个位
constexpr int SEQ_BIN_SWAP = 1; // 交换两个位
// --- Binary in-row (element-level) ---
constexpr int SEQ_BIN_FLIP = 0; // flip one bit
constexpr int SEQ_BIN_SWAP = 1; // swap two bits
// --- Binary 行内(片段级)---
constexpr int SEQ_BIN_SEG_FLIP = 2; // 翻转连续 k 个位
constexpr int SEQ_BIN_K_FLIP = 3; // 同时翻转 k 个随机位
// --- Binary in-row (segment-level) ---
constexpr int SEQ_BIN_SEG_FLIP = 2; // flip k consecutive bits
constexpr int SEQ_BIN_K_FLIP = 3; // flip k random bits at once
// --- Binary 跨行 ---
constexpr int SEQ_BIN_CROSS_SWAP = 4; // 两行各一个位互换
constexpr int SEQ_BIN_SEG_CROSS_SWAP = 5; // 两行各取一段互换
// --- Binary cross-row ---
constexpr int SEQ_BIN_CROSS_SWAP = 4; // swap one bit per row across two rows
constexpr int SEQ_BIN_SEG_CROSS_SWAP = 5; // swap a segment from each row
// --- 共享:行级(编码无关)---
constexpr int SEQ_ROW_SWAP = 10; // 交换两行
constexpr int SEQ_ROW_REVERSE = 11; // 反转行排列
constexpr int SEQ_ROW_SPLIT = 12; // 一行拆两行
constexpr int SEQ_ROW_MERGE = 13; // 两行合并
// --- Shared: row-level (encoding-agnostic) ---
constexpr int SEQ_ROW_SWAP = 10; // swap two rows
constexpr int SEQ_ROW_REVERSE = 11; // reverse row order
constexpr int SEQ_ROW_SPLIT = 12; // split one row into two
constexpr int SEQ_ROW_MERGE = 13; // merge two rows
// --- 特殊 ---
constexpr int SEQ_PERTURBATION = 14; // 扰动(多步不可逆)
// --- Special ---
constexpr int SEQ_PERTURBATION = 14; // perturbation (multi-step, irreversible)
// --- Integer 行内(元素级)---
constexpr int SEQ_INT_RANDOM_RESET = 0; // 随机一个位置重置为 [lb, ub] 内随机值
constexpr int SEQ_INT_DELTA = 1; // 随机一个位置 ±kclamp 到 [lb, ub]
constexpr int SEQ_INT_SWAP = 2; // 交换两个位置的值
// --- Integer in-row (element-level) ---
constexpr int SEQ_INT_RANDOM_RESET = 0; // reset one position to random in [lb, ub]
constexpr int SEQ_INT_DELTA = 1; // one position ±k (clamped to [lb, ub])
constexpr int SEQ_INT_SWAP = 2; // swap values at two positions
// --- Integer 行内(片段级)---
constexpr int SEQ_INT_SEG_RESET = 3; // 连续 k 个位置全部重置
constexpr int SEQ_INT_K_DELTA = 4; // 随机 k 个位置各自 ±1
// --- Integer in-row (segment-level) ---
constexpr int SEQ_INT_SEG_RESET = 3; // reset k consecutive positions
constexpr int SEQ_INT_K_DELTA = 4; // k positions each ±1 at random
// --- Integer 跨行 ---
constexpr int SEQ_INT_CROSS_SWAP = 5; // 两行各一个位置互换
// --- Integer cross-row ---
constexpr int SEQ_INT_CROSS_SWAP = 5; // swap one position per row across two rows
// --- LNS(大邻域搜索)---
constexpr int SEQ_LNS_SEGMENT_SHUFFLE = 20; // 打乱连续片段
constexpr int SEQ_LNS_SCATTER_SHUFFLE = 21; // 打乱随机分散位置
constexpr int SEQ_LNS_GUIDED_REBUILD = 22; // 关系矩阵引导重建
// --- LNS (large neighborhood search) ---
constexpr int SEQ_LNS_SEGMENT_SHUFFLE = 20; // shuffle a contiguous segment
constexpr int SEQ_LNS_SCATTER_SHUFFLE = 21; // shuffle a scattered set of positions
constexpr int SEQ_LNS_GUIDED_REBUILD = 22; // guided rebuild from relation matrix
} // namespace seq
// ============================================================
// RelationMatrix — G/O 关系矩阵GPU global memory
// RelationMatrix — G/O relation matrix (GPU global memory)
// ============================================================
// G[i][j]: 元素 i 和 j 的分组倾向(对称,越大越倾向同组)
// O[i][j]: 元素 i 排在 j 前面的倾向(不对称)
// 存储为一维数组 [N * N],行优先
// 小规模 N<200 直接 DenseP2 再做稀疏化
// G[i][j]: grouping tendency of elements i and j (symmetric; higher → more same-group)
// O[i][j]: tendency for element i to precede j (asymmetric)
// Stored as a 1D row-major array [N * N]
// For small N<200 use dense directly; P2 may add sparsification
//
// 更新时机host 端,每个 batch 间隙
// 使用时机kernel 中 SEQ_LNS_GUIDED_REBUILD 读取
// Updated on: host, between batches
// Read in: kernel for SEQ_LNS_GUIDED_REBUILD
struct RelationMatrix {
float* d_G; // GPU 上的 G 矩阵 [N * N]
float* d_O; // GPU 上的 O 矩阵 [N * N]
float* h_G; // Host 上的 G 矩阵 [N * N](用于更新后上传)
float* h_O; // Host 上的 O 矩阵 [N * N]
int N; // 元素总数
float decay; // 衰减系数 α(默认 0.95
int update_count; // 已更新次数(用于冷启动判断)
float* d_G; // G matrix on GPU [N * N]
float* d_O; // O matrix on GPU [N * N]
float* h_G; // G matrix on host [N * N] (for upload after update)
float* h_O; // O matrix on host [N * N]
int N; // total number of elements
float decay; // decay factor α (default 0.95)
int update_count; // number of updates so far (for cold-start logic)
};
// ============================================================
// SeqRegistry — 运行时可用序列注册表
// SeqRegistry — runtime-available sequence registry
// ============================================================
// 根据 EncodingType 和 dim1 自动确定哪些序列可用
// 传到 GPU 供 sample_sequence() 使用
// Which sequences are available is determined from EncodingType and dim1
// Passed to GPU for sample_sequence()
enum class SeqCategory : int {
InRow = 0, // 行内算子swap, reverse, insert, ...
CrossRow = 1, // 跨行算子cross_relocate, cross_swap, seg_relocate, ...
RowLevel = 2, // 行级算子row_swap, row_reverse, split, merge
LNS = 3, // 大邻域搜索
InRow = 0, // within-row operators (swap, reverse, insert, ...)
CrossRow = 1, // cross-row operators (cross_relocate, cross_swap, seg_relocate, ...)
RowLevel = 2, // row-level operators (row_swap, row_reverse, split, merge)
LNS = 3, // large neighborhood search
};
struct SeqRegistry {
int ids[MAX_SEQ]; // 可用序列的 SeqID 列表
int count; // 可用序列数量
float weights[MAX_SEQ]; // 每个序列的当前权重(未归一化,延迟归一化)
float weights_sum; // 权重和(缓存,用于延迟归一化)
float max_w[MAX_SEQ]; // 每个序列的权重上限0 = 不限,用全局 cap
SeqCategory categories[MAX_SEQ]; // 每个序列的分类(约束导向用)
int ids[MAX_SEQ]; // SeqID list of available sequences
int count; // number of available sequences
float weights[MAX_SEQ]; // current weight per sequence (unnormalized; lazy normalization)
float weights_sum; // sum of weights (cached for lazy normalization)
float max_w[MAX_SEQ]; // per-sequence weight cap (0 = unlimited, use global cap)
SeqCategory categories[MAX_SEQ]; // category per sequence (for constraint-directed mode)
};
// ============================================================
// KStepConfig — 多步执行的步数选择配置
// KStepConfig — step-count selection for multi-step execution
// ============================================================
// K=1: 单步当前行为K=2/3: 连续执行多个序列后再评估
// 两层权重体系的第一层
// K=1: single step (current behavior); K=2/3: run several sequences then evaluate
// First layer of the two-level weight system
//
// 自适应策略:
// - 初始 K=1 权重很大保守K>1 权重小
// - K>1 带来改进 → 增大该 K 的权重
// - 长时间无改进 → 重置/增大 K>1 权重(跳出局部最优)
// Adaptive policy:
// - Initially K=1 has large weight (conservative), K>1 small
// - If K>1 yields improvement → increase that K's weight
// - Long stagnation → reset / boost K>1 weights (escape local optima)
struct KStepConfig {
float weights[MAX_K]; // K=1,2,3 的采样权重(归一化)
int stagnation_count; // 连续无改进的 batch 数(用于触发重置)
int stagnation_limit; // 触发重置的阈值(默认 5 个 batch
float weights[MAX_K]; // sampling weights for K=1,2,3 (normalized)
int stagnation_count; // consecutive batches without improvement (triggers reset)
int stagnation_limit; // threshold to trigger reset (default 5 batches)
};
// 构建默认 K 步配置
// Build default K-step configuration
inline KStepConfig build_kstep_config() {
KStepConfig kc;
kc.weights[0] = 0.80f; // K=1: 初始主导
kc.weights[1] = 0.15f; // K=2: 少量探索
kc.weights[2] = 0.05f; // K=3: 极少探索
kc.weights[0] = 0.80f; // K=1: dominates initially
kc.weights[1] = 0.15f; // K=2: little exploration
kc.weights[2] = 0.05f; // K=3: minimal exploration
kc.stagnation_count = 0;
kc.stagnation_limit = 5;
return kc;
};
// ============================================================
// ProblemProfile — 基于结构特征推断的问题画像
// ProblemProfile — problem profile inferred from structural features
// ============================================================
// 第一层:纯结构推断(不感知语义),用于驱动算子注册和初始权重
// 未来第二层:可扩展更细粒度的画像(如多属性、高约束等)
// Layer 1: structure-only inference (no semantics), drives operator registration and initial weights
// Future layer 2: finer profiles (e.g. multi-attribute, high constraint)
enum class ScaleClass { Small, Medium, Large };
enum class StructClass { SingleSeq, MultiFixed, MultiPartition };
@ -286,10 +281,10 @@ struct ProblemProfile {
float cross_row_prob;
};
// classify_problem() 定义在 ProblemConfig 之后
// classify_problem() is defined after ProblemConfig
// ============================================================
// 权重预设 — 由 ScaleClass 驱动
// Weight presets — driven by ScaleClass
// ============================================================
struct WeightPreset {
@ -308,100 +303,100 @@ inline WeightPreset get_weight_preset(ScaleClass scale) {
return { 0.50f, 0.80f, 0.006f, 0.01f };
}
// classify_problem() 和 build_seq_registry() 定义在 ProblemConfig 之后
// classify_problem() and build_seq_registry() are defined after ProblemConfig
// ============================================================
// Solution<D1, D2> — 解的模板化表示
// Solution<D1, D2> — templated solution representation
// ============================================================
// D1: 行数上限 (TSP=1, VRP≤16, Schedule≤8)
// D2: 每行列数上限 (TSP≤64, 背包≤32)
// 每个 Problem 选择最小够用的 D1/D2编译器生成紧凑的结构
// D1: max number of rows (TSP=1, VRP≤16, Schedule≤8)
// D2: max columns per row (TSP≤64, knapsack≤32)
// Each Problem picks the smallest sufficient D1/D2; compiler emits a compact layout
template<int D1, int D2>
struct Solution {
static constexpr int DIM1 = D1; // 编译时行数上限
static constexpr int DIM2 = D2; // 编译时列数上限
int data[D1][D2]; // D1×D2×4 字节
int dim2_sizes[D1]; // D1×4 字节
float objectives[MAX_OBJ]; // 16 字节(固定)
float penalty; // 4 字节
static constexpr int DIM1 = D1; // compile-time max rows
static constexpr int DIM2 = D2; // compile-time max columns per row
int data[D1][D2]; // D1×D2×4 bytes
int dim2_sizes[D1]; // D1×4 bytes
float objectives[MAX_OBJ]; // 16 bytes (fixed)
float penalty; // 4 bytes
};
// ============================================================
// ProblemConfig — 问题的运行时元信息
// ProblemConfig — runtime metadata for a problem
// ============================================================
struct ProblemConfig {
EncodingType encoding;
int dim1; // 实际使用的行数 (≤ D1)
int dim2_default; // 实际使用的列数 (≤ D2)
int dim1; // actual number of rows used (≤ D1)
int dim2_default; // actual number of columns used (≤ D2)
int num_objectives;
ObjDir obj_dirs[MAX_OBJ];
float obj_weights[MAX_OBJ]; // Weighted 模式下的权重
// 多目标比较
float obj_weights[MAX_OBJ]; // weights in Weighted mode
// Multi-objective comparison
CompareMode compare_mode = CompareMode::Weighted;
int obj_priority[MAX_OBJ] = {0, 1, 2, 3}; // Lexicographic 模式下的比较顺序(索引)
float obj_tolerance[MAX_OBJ] = {0.0f, 0.0f, 0.0f, 0.0f}; // 字典法容差:差值 <= tol 视为相等
int obj_priority[MAX_OBJ] = {0, 1, 2, 3}; // comparison order in Lexicographic mode (indices)
float obj_tolerance[MAX_OBJ] = {0.0f, 0.0f, 0.0f, 0.0f}; // lexicographic tolerance: |diff| ≤ tol ⇒ tie
int value_lower_bound;
int value_upper_bound;
// v3.4: 统一行模式
RowMode row_mode = RowMode::Single; // 行模式Single/Fixed/Partition
float cross_row_prob = 0.0f; // 跨行 move 概率0=纯行内操作)
int total_elements = 0; // Partition 模式下的总元素数
int perm_repeat_count = 1; // 排列中每个值的重复次数1=标准排列,>1=多重集排列)
// v3.4: unified row mode
RowMode row_mode = RowMode::Single; // row mode (Single/Fixed/Partition)
float cross_row_prob = 0.0f; // probability of cross-row moves (0 = within-row only)
int total_elements = 0; // total elements in Partition mode
int perm_repeat_count = 1; // repeats per value in permutation (1 = standard; >1 = multiset)
};
// ============================================================
// SolverConfig — 求解器参数
// SolverConfig — solver parameters
// ============================================================
struct SolverConfig {
int pop_size = 0; // 种群大小0 = 自动匹配 GPU 最大并行度)
int pop_size = 0; // population size (0 = auto to max GPU parallelism)
int max_gen = 1000;
float mutation_rate = 0.1f;
unsigned seed = 42;
bool verbose = true;
int print_every = 100;
// 岛屿模型参数
int num_islands = 1; // 0 = 自适应1 = 纯爬山(无岛屿),>1 = 岛屿模型
int migrate_interval = 100; // 每隔多少代执行一次迁移
// Island model
int num_islands = 1; // 0 = adaptive, 1 = pure hill climbing (no islands), >1 = island model
int migrate_interval = 100; // migrate every this many generations
MigrateStrategy migrate_strategy = MigrateStrategy::Hybrid;
// 模拟退火参数
float sa_temp_init = 0.0f; // 初始温度0 = 禁用 SA纯爬山
float sa_alpha = 0.998f; // 冷却率(每代乘以 alpha
// v1.0: 交叉参数
float crossover_rate = 0.1f; // 每代中执行交叉的概率vs 变异)
// v2.0: 自适应算子选择
bool use_aos = false; // 启用 AOSbatch 间更新算子权重)
float aos_weight_floor = AOS_WEIGHT_FLOOR; // 运行时可覆盖的 floor
float aos_weight_cap = AOS_WEIGHT_CAP; // 运行时可覆盖的 cap
// v2.1: 初始解策略
int init_oversample = 4; // 采样倍数1 = 不做采样择优,即纯随机)
float init_random_ratio = 0.3f; // 纯随机解占比(多样性保底)
// v3.0: 工程可用性
float time_limit_sec = 0.0f; // 时间限制0 = 不限制,按 max_gen 跑完)
int stagnation_limit = 0; // 收敛检测:连续多少个 batch 无改进后 reheat0 = 禁用)
float reheat_ratio = 0.5f; // reheat 时温度恢复到初始温度的比例
// Simulated annealing
float sa_temp_init = 0.0f; // initial temperature (0 = disable SA, hill climb only)
float sa_alpha = 0.998f; // cooling rate (multiply by alpha each generation)
// v1.0: crossover
float crossover_rate = 0.1f; // probability of crossover per generation (vs mutation)
// v2.0: adaptive operator selection
bool use_aos = false; // enable AOS (update operator weights between batches)
float aos_weight_floor = AOS_WEIGHT_FLOOR; // runtime-overridable floor
float aos_weight_cap = AOS_WEIGHT_CAP; // runtime-overridable cap
// v2.1: initial solution strategy
int init_oversample = 4; // oversampling factor (1 = no sampling selection, pure random)
float init_random_ratio = 0.3f; // fraction of purely random solutions (diversity floor)
// v3.0: engineering usability
float time_limit_sec = 0.0f; // time limit in seconds (0 = none, run to max_gen)
int stagnation_limit = 0; // convergence: reheat after this many batches without improvement (0 = off)
float reheat_ratio = 0.5f; // on reheat, fraction of initial temperature to restore
// v3.5: CUDA Graph
bool use_cuda_graph = false; // 启用 CUDA Graph减少 kernel launch 开销)
// v3.6: AOS 更新频率控制
int aos_update_interval = 10; // 每隔多少个 batch 更新一次 AOS 权重(降低 cudaMemcpy 同步频率)
// v4.0: 约束导向 + 分层搜索
bool use_constraint_directed = false; // 启用约束导向(根据 penalty 比例动态调整跨行算子权重)
bool use_phased_search = false; // 启用分层搜索(按进度调整全局 floor/cap
// 分层搜索参数:三期阈值
float phase_explore_end = 0.30f; // 探索期结束(进度比例)
float phase_refine_start = 0.70f; // 精细期开始(进度比例)
// 约束导向参数
float constraint_boost_max = 2.5f; // 高约束时跨行算子 cap 提升倍率上限
// v5.0: 多 GPU 协同
int num_gpus = 1; // 使用的 GPU 数量1 = 单 GPU>1 = 多 GPU 协同)
float multi_gpu_interval_sec = 10.0f; // GPU 间交换最优解的时间间隔(秒)
MultiGpuInjectMode multi_gpu_inject_mode = MultiGpuInjectMode::HalfIslands; // 注入模式
bool use_cuda_graph = false; // enable CUDA Graph (fewer kernel launch overheads)
// v3.6: AOS update frequency
int aos_update_interval = 10; // update AOS weights every this many batches (lower cudaMemcpy sync rate)
// v4.0: constraint-directed + phased search
bool use_constraint_directed = false; // constraint-directed mode (scale cross-row weights by penalty ratio)
bool use_phased_search = false; // phased search (adjust global floor/cap by progress)
// Phased search: three-phase thresholds
float phase_explore_end = 0.30f; // end of exploration phase (progress fraction)
float phase_refine_start = 0.70f; // start of refinement phase (progress fraction)
// Constraint-directed parameters
float constraint_boost_max = 2.5f; // max multiplier boost for cross-row cap under high constraint
// v5.0: multi-GPU cooperation
int num_gpus = 1; // number of GPUs (1 = single GPU, >1 = multi-GPU)
float multi_gpu_interval_sec = 10.0f; // interval in seconds to exchange best solutions across GPUs
MultiGpuInjectMode multi_gpu_inject_mode = MultiGpuInjectMode::HalfIslands; // injection mode
};
// ============================================================
// classify_problem — 从 ProblemConfig 推断问题画像
// classify_problem — infer problem profile from ProblemConfig
// ============================================================
inline ProblemProfile classify_problem(const ProblemConfig& pcfg) {
@ -424,7 +419,7 @@ inline ProblemProfile classify_problem(const ProblemConfig& pcfg) {
}
// ============================================================
// build_seq_registry — 由 ProblemProfile 驱动的算子注册
// build_seq_registry — operator registration driven by ProblemProfile
// ============================================================
inline SeqRegistry build_seq_registry(const ProblemProfile& prof) {
@ -436,7 +431,10 @@ inline SeqRegistry build_seq_registry(const ProblemProfile& prof) {
}
auto add = [&](int id, float w, SeqCategory cat, float cap = 0.0f) {
if (reg.count >= MAX_SEQ) return;
if (reg.count >= MAX_SEQ) {
printf("[WARN] SeqRegistry full (MAX_SEQ=%d), ignoring SeqID %d\n", MAX_SEQ, id);
return;
}
reg.ids[reg.count] = id;
reg.weights[reg.count] = w;
reg.max_w[reg.count] = cap;
@ -514,7 +512,7 @@ inline SeqRegistry build_seq_registry(const ProblemProfile& prof) {
}
}
// 延迟归一化:只计算权重和,不归一化
// Lazy normalization: only sum weights; do not normalize here
reg.weights_sum = 0.0f;
for (int i = 0; i < reg.count; i++) {
reg.weights_sum += reg.weights[i];
@ -523,19 +521,19 @@ inline SeqRegistry build_seq_registry(const ProblemProfile& prof) {
}
// ============================================================
// ObjConfig — 传到 GPU 的目标比较配置(紧凑结构)
// ObjConfig — compact objective comparison config for GPU
// ============================================================
struct ObjConfig {
int num_obj;
CompareMode mode;
ObjDir dirs[MAX_OBJ]; // 每个目标的方向
float weights[MAX_OBJ]; // Weighted 模式下的权重
int priority[MAX_OBJ]; // Lexicographic 模式下的比较顺序
float tolerance[MAX_OBJ]; // Lexicographic 模式下的容差
ObjDir dirs[MAX_OBJ]; // direction per objective
float weights[MAX_OBJ]; // weights in Weighted mode
int priority[MAX_OBJ]; // comparison order in Lexicographic mode
float tolerance[MAX_OBJ]; // tolerance in Lexicographic mode
};
// 从 ProblemConfig 构造 ObjConfigCPU 端)
// Build ObjConfig from ProblemConfig (CPU side)
inline ObjConfig make_obj_config(const ProblemConfig& pcfg) {
ObjConfig oc;
oc.num_obj = pcfg.num_objectives;
@ -550,7 +548,7 @@ inline ObjConfig make_obj_config(const ProblemConfig& pcfg) {
}
// ============================================================
// SolveResult — solve() 的返回值
// SolveResult — return value of solve()
// ============================================================
enum class StopReason { MaxGen, TimeLimit, Stagnation };
@ -564,12 +562,12 @@ struct SolveResult {
};
// ============================================================
// 目标重要性映射 — 统一 Weighted / Lexicographic 的重要性度量
// Objective importance mapping — unified importance for Weighted / Lexicographic
// ============================================================
// 用于初始化选种NSGA-II 加权拥挤度 + 核心目标预留名额)
// Used for initial selection (NSGA-II weighted crowding + reserved slots for core objectives)
// Weighted: importance[i] = weight[i] / Σweight
// Lexicographic: importance[i] = 0.5^rank[i] / Σ(0.5^rank)
// → 第一优先级 ~57%,第二 ~29%,第三 ~14%
// → first priority ~57%, second ~29%, third ~14%
inline void compute_importance(const ObjConfig& oc, float* importance) {
float sum = 0.0f;
@ -590,26 +588,26 @@ inline void compute_importance(const ObjConfig& oc, float* importance) {
}
// ============================================================
// 比较工具 — 支持 Weighted / Lexicographic
// Comparison utilities — Weighted / Lexicographic
// ============================================================
// 将目标值统一为"越小越好"Maximize 目标取负
// Normalize objectives to "smaller is better": negate Maximize objectives
__device__ __host__ inline float normalize_obj(float val, ObjDir dir) {
return (dir == ObjDir::Maximize) ? -val : val;
}
// 核心比较a 是否优于 b
// v5.0: 添加 __host__ 支持多 GPU 在 CPU 端比较解
// Core comparison: whether a is better than b
// v5.0: add __host__ so multi-GPU can compare solutions on CPU
template<typename Sol>
__device__ __host__ inline bool is_better(const Sol& a, const Sol& b,
const ObjConfig& oc) {
// penalty 优先:可行解一定优于不可行解
// Penalty first: feasible beats infeasible
if (a.penalty <= 0.0f && b.penalty > 0.0f) return true;
if (a.penalty > 0.0f && b.penalty <= 0.0f) return false;
if (a.penalty > 0.0f && b.penalty > 0.0f) return a.penalty < b.penalty;
if (oc.mode == CompareMode::Weighted) {
// 加权求和权重已包含方向信息Maximize 目标用负权重,或由 normalize_obj 处理)
// Weighted sum: direction is folded in by normalize_obj (Maximize objectives negated)
float sum_a = 0.0f, sum_b = 0.0f;
for (int i = 0; i < oc.num_obj; i++) {
float na = normalize_obj(a.objectives[i], oc.dirs[i]);
@ -619,21 +617,22 @@ __device__ __host__ inline bool is_better(const Sol& a, const Sol& b,
}
return sum_a < sum_b;
} else {
// 字典法:按 priority 顺序逐目标比较
// Lexicographic: compare objectives in priority order
for (int p = 0; p < oc.num_obj; p++) {
int idx = oc.priority[p];
if (idx < 0 || idx >= oc.num_obj) continue;
float va = normalize_obj(a.objectives[idx], oc.dirs[idx]);
float vb = normalize_obj(b.objectives[idx], oc.dirs[idx]);
float diff = va - vb;
if (diff < -oc.tolerance[idx]) return true; // a 明显更好
if (diff > oc.tolerance[idx]) return false; // b 明显更好
// 在容差内视为相等 → 继续比较下一个目标
if (diff < -oc.tolerance[idx]) return true; // a clearly better
if (diff > oc.tolerance[idx]) return false; // b clearly better
// Within tolerance → tie, continue to next objective
}
return false; // 所有目标都在容差内相等
return false; // all objectives tied within tolerance
}
}
// 标量化SA 接受概率用):返回越小越好的标量
// Scalarization (for SA acceptance): smaller is better
template<typename Sol>
__device__ __host__ inline float scalar_objective(const Sol& sol,
const ObjConfig& oc) {
@ -643,13 +642,14 @@ __device__ __host__ inline float scalar_objective(const Sol& sol,
sum += oc.weights[i] * normalize_obj(sol.objectives[i], oc.dirs[i]);
return sum;
} else {
// 字典法下 SA 用第一优先级目标作为标量
// Lexicographic mode: SA uses the first-priority objective as the scalar
int idx = oc.priority[0];
if (idx < 0 || idx >= oc.num_obj) idx = 0;
return normalize_obj(sol.objectives[idx], oc.dirs[idx]);
}
}
// 轻量比较:直接操作 float[] 目标数组(避免复制整个 Sol
// Lightweight comparison: operate on float[] objectives (avoid copying full Sol)
__device__ inline bool obj_is_better(const float* new_objs, const float* old_objs,
const ObjConfig& oc) {
if (oc.mode == CompareMode::Weighted) {
@ -662,6 +662,7 @@ __device__ inline bool obj_is_better(const float* new_objs, const float* old_obj
} else {
for (int p = 0; p < oc.num_obj; p++) {
int idx = oc.priority[p];
if (idx < 0 || idx >= oc.num_obj) continue;
float va = normalize_obj(new_objs[idx], oc.dirs[idx]);
float vb = normalize_obj(old_objs[idx], oc.dirs[idx]);
float diff = va - vb;
@ -672,7 +673,7 @@ __device__ inline bool obj_is_better(const float* new_objs, const float* old_obj
}
}
// 轻量标量化:直接操作 float[] 目标数组
// Lightweight scalarization: operate on float[] objectives
__device__ __host__ inline float obj_scalar(const float* objs, const ObjConfig& oc) {
if (oc.mode == CompareMode::Weighted) {
float sum = 0.0f;
@ -681,60 +682,61 @@ __device__ __host__ inline float obj_scalar(const float* objs, const ObjConfig&
return sum;
} else {
int idx = oc.priority[0];
if (idx < 0 || idx >= oc.num_obj) idx = 0;
return normalize_obj(objs[idx], oc.dirs[idx]);
}
}
// ============================================================
// AOSStats — 自适应算子选择统计(每个 block 一份)
// AOSStats — adaptive operator selection stats (one per block)
// ============================================================
// v3.0: 粒度从 3 层 → MAX_SEQ 个序列
// 记录每个序列的使用次数和改进次数
// batch 结束后由 host 聚合,更新 SeqRegistry 权重
// v3.0: granularity from 3 layers → MAX_SEQ sequences
// Records per-sequence usage and improvement counts
// Host aggregates after each batch and updates SeqRegistry weights
struct AOSStats {
// 算子层统计(第二层)
int usage[MAX_SEQ]; // 各序列使用次数
int improvement[MAX_SEQ]; // 各序列改进次数delta < 0 且被接受)
// K 步数层统计(第一层)
int k_usage[MAX_K]; // K=1,2,3 各自使用次数
int k_improvement[MAX_K]; // K=1,2,3 各自改进次数
// Operator-level stats (second layer)
int usage[MAX_SEQ]; // per-sequence usage counts
int improvement[MAX_SEQ]; // per-sequence improvements (delta < 0 and accepted)
// K-step layer stats (first layer)
int k_usage[MAX_K]; // usage counts for K=1,2,3
int k_improvement[MAX_K]; // improvement counts for K=1,2,3
};
// ============================================================
// ObjDef — 单个目标的定义(编译期常量)
// ObjDef — single-objective definition (compile-time constant)
// ============================================================
struct ObjDef {
ObjDir dir; // 优化方向
float weight; // Weighted 模式下的权重
float tolerance; // Lexicographic 模式下的容差
ObjDir dir; // optimization direction
float weight; // weight in Weighted mode
float tolerance; // tolerance in Lexicographic mode
};
// ============================================================
// HeuristicMatrix — 启发式初始解构造用的数据矩阵描述
// HeuristicMatrix — data matrix descriptor for heuristic initial solutions
// ============================================================
struct HeuristicMatrix {
const float* data; // host 端 N*N 矩阵
int N; // 维度
const float* data; // N×N matrix on host
int N; // dimension
};
// ============================================================
// ProblemBase<Derived, D1, D2> — CRTP 基类
// ProblemBase<Derived, D1, D2> — CRTP base class
//
// 用户继承此基类,提供:
// static constexpr ObjDef OBJ_DEFS[] = {...}; — 目标元信息
// __device__ float compute_obj(int idx, ...) const; — 目标分发
// Users inherit this base and provide:
// static constexpr ObjDef OBJ_DEFS[] = {...}; — objective metadata
// __device__ float compute_obj(int idx, ...) const; — objective dispatch
// __device__ float compute_penalty(...) const;
//
// 约定OBJ_DEFS 和 compute_obj 紧挨着写case N 对应 OBJ_DEFS[N]
// NUM_OBJ 由 sizeof(OBJ_DEFS) 自动推导,无需手动维护
// Convention: OBJ_DEFS and compute_obj stay aligned; case N maps to OBJ_DEFS[N]
// NUM_OBJ is derived from sizeof(OBJ_DEFS); no manual count
//
// 基类自动提供:
// evaluate(sol) — 遍历目标列表调用 compute_obj
// fill_obj_config(cfg) — 从 OBJ_DEFS 自动填充 ProblemConfig
// obj_config() — 直接生成 ObjConfig
// Base class provides:
// evaluate(sol) — loop objectives and call compute_obj
// fill_obj_config(cfg) — fill ProblemConfig from OBJ_DEFS
// obj_config() — build ObjConfig directly
// ============================================================
template<typename Derived, int D1_, int D2_>
@ -743,10 +745,10 @@ struct ProblemBase {
static constexpr int D2 = D2_;
using Sol = Solution<D1, D2>;
// NUM_OBJ 从 OBJ_DEFS 数组自动推导
// NUM_OBJ derived from OBJ_DEFS array size
static constexpr int NUM_OBJ = sizeof(Derived::OBJ_DEFS) / sizeof(ObjDef);
// 自动评估:遍历目标列表
// Automatic evaluation: iterate objectives
__device__ void evaluate(Sol& sol) const {
const auto& self = static_cast<const Derived&>(*this);
constexpr int n = sizeof(Derived::OBJ_DEFS) / sizeof(ObjDef);
@ -755,7 +757,7 @@ struct ProblemBase {
sol.penalty = self.compute_penalty(sol);
}
// 从 OBJ_DEFS 自动填充 ProblemConfig 的目标部分
// Fill objective fields of ProblemConfig from OBJ_DEFS
void fill_obj_config(ProblemConfig& cfg) const {
constexpr int n = sizeof(Derived::OBJ_DEFS) / sizeof(ObjDef);
cfg.num_objectives = n;
@ -763,59 +765,59 @@ struct ProblemBase {
cfg.obj_dirs[i] = Derived::OBJ_DEFS[i].dir;
cfg.obj_weights[i] = Derived::OBJ_DEFS[i].weight;
cfg.obj_tolerance[i] = Derived::OBJ_DEFS[i].tolerance;
cfg.obj_priority[i] = i; // 列表顺序即优先级
cfg.obj_priority[i] = i; // list order is priority order
}
}
// 直接生成 ObjConfig供 solver 使用)
// Build ObjConfig directly (for solver)
ObjConfig obj_config() const {
ProblemConfig pcfg;
fill_obj_config(pcfg);
return make_obj_config(pcfg);
}
// 可选:返回 shared memory 需求(字节)
// 默认返回 0不使用 shared memory
// 子类覆盖:如果问题数据可以放入 shared memory返回实际大小
// Optional: shared memory requirement (bytes)
// Default 0 (no shared memory)
// Override if problem data fits in shared memory; return actual size
size_t shared_mem_bytes() const {
return 0;
}
// 可选:加载问题数据到 shared memory
// 默认空实现(不使用 shared memory
// 子类覆盖:如果 shared_mem_bytes() > 0实现数据加载逻辑
// Optional: load problem data into shared memory
// Default no-op (no shared memory)
// Override if shared_mem_bytes() > 0 to implement loading
__device__ void load_shared(char* smem, int tid, int bsz) {
(void)smem; (void)tid; (void)bsz; // 默认:不做任何事
(void)smem; (void)tid; (void)bsz; // default: no-op
}
// 每个 block 在 global memory 中的热数据工作集大小(字节)
// 用于 auto pop_size 估算 L2 cache 压力
// 默认 = shared_mem_bytes()(数据在 smem 时gmem 工作集为 0 不影响)
// 子类覆盖:当 shared_mem_bytes() 返回 0数据放不进 smem
// 返回实际数据大小(如距离矩阵 n*n*sizeof(float)
// Hot working-set size in global memory per block (bytes)
// Used for auto pop_size L2 cache pressure estimate
// Default = shared_mem_bytes() (when data is in smem, gmem working set is 0)
// Override when shared_mem_bytes() is 0 (data does not fit in smem):
// return actual data size (e.g. distance matrix n*n*sizeof(float))
size_t working_set_bytes() const {
return static_cast<const Derived&>(*this).shared_mem_bytes();
}
// 可选:初始化 G/O 关系矩阵(为 GUIDED_REBUILD 提供先验知识)
// G[i*N+j]: 元素 i 和 j 的分组倾向(对称,[0,1],越大越倾向同组)
// O[i*N+j]: 元素 i 排在 j 前面的倾向(不对称,[0,1]
// 默认不提供(全零),搜索过程中通过 EMA 从历史好解积累
// 用户覆盖示例:距离近 → G 和 O 都高
// Optional: initialize G/O relation matrix (prior for GUIDED_REBUILD)
// G[i*N+j]: grouping tendency of i and j (symmetric, [0,1]; higher → same group)
// O[i*N+j]: tendency for i before j (asymmetric, [0,1])
// Default none (zeros); EMA accumulates from good solutions during search
// Example override: close distance → high G and O
void init_relation_matrix(float* h_G, float* h_O, int N) const {
(void)h_G; (void)h_O; (void)N; // 默认:不做任何事(保持全零)
(void)h_G; (void)h_O; (void)N; // default: no-op (keep zeros)
}
// 可选:返回 host 端数据矩阵供启发式初始解构造
// 默认返回 0不提供子类 override 后填充 out 数组并返回实际数量
// Optional: host-side data matrices for heuristic initial solutions
// Default 0 (none); override to fill out[] and return count
int heuristic_matrices(HeuristicMatrix* out, int max_count) const {
(void)out; (void)max_count;
return 0;
}
// v5.0: 多 GPU 协同 — 克隆 Problem 到指定 GPU
// 子类需实现cudaSetDevice(gpu_id) + 分配设备内存 + 拷贝数据
// 返回新的 Problem 实例指针(在 host 端,但其内部设备指针指向 gpu_id
// v5.0: multi-GPU — clone Problem to a given GPU
// Subclasses implement: cudaSetDevice(gpu_id) + device alloc + copy
// Returns new Problem* on host; internal device pointers target gpu_id
virtual Derived* clone_to_device(int gpu_id) const {
(void)gpu_id;
fprintf(stderr, "Error: clone_to_device() not implemented for this Problem type\n");

View file

@ -1,7 +1,7 @@
/**
* assignment.cuh - 指派问题
*
* 继承 ProblemBase使用 ObjDef 目标注册机制
* assignment.cuh - assignment problem
*
* Extends ProblemBase with ObjDef objective registration.
*/
#pragma once
@ -11,10 +11,10 @@
struct AssignmentProblem : ProblemBase<AssignmentProblem, 1, 16> {
const float* d_cost;
const float* h_cost; // host 端成本矩阵(用于 init_relation_matrix
const float* h_cost; // host cost matrix (for init_relation_matrix)
int n;
// ---- 目标计算 ----
// ---- objective evaluation ----
__device__ float calc_total_cost(const Sol& sol) const {
float total = 0.0f;
const int* assign = sol.data[0];
@ -24,7 +24,7 @@ struct AssignmentProblem : ProblemBase<AssignmentProblem, 1, 16> {
return total;
}
// ---- 目标定义OBJ_DEFS 与 compute_obj 必须一一对应)----
// ---- objective defs (OBJ_DEFS must match compute_obj one-to-one) ----
static constexpr ObjDef OBJ_DEFS[] = {
{ObjDir::Minimize, 1.0f, 0.0f}, // case 0: calc_total_cost
};
@ -47,7 +47,7 @@ struct AssignmentProblem : ProblemBase<AssignmentProblem, 1, 16> {
return cfg;
}
// ---- shared memory 接口 ----
// ---- shared memory interface ----
static constexpr size_t SMEM_LIMIT = 48 * 1024;
size_t shared_mem_bytes() const {
@ -66,12 +66,12 @@ struct AssignmentProblem : ProblemBase<AssignmentProblem, 1, 16> {
d_cost = sc;
}
// 成本先验task j 和 task k 如果被相似 agent 偏好G 值高
// O 矩阵task j 在位置 i 成本低 → O[j][k] 略高j 倾向排在 k 前面的位置)
// Cost prior: if tasks j and k are similarly preferred by agents, G is high
// O matrix: low cost for task j at slot i → slightly higher O[j][k] (j tends to precede k)
void init_relation_matrix(float* G, float* O, int N) const {
if (!h_cost || N != n) return;
// 对每个 task构建成本向量task 间余弦相似度 → G
// 简化:成本列向量的相关性
// Per task, build cost vectors; cosine similarity between tasks → G
// Simplified: correlation of cost columns
float max_c = 0.0f;
for (int i = 0; i < N * N; i++)
if (h_cost[i] > max_c) max_c = h_cost[i];
@ -80,7 +80,7 @@ struct AssignmentProblem : ProblemBase<AssignmentProblem, 1, 16> {
for (int j = 0; j < N; j++)
for (int k = 0; k < N; k++) {
if (j == k) continue;
// G: 两个 task 的成本向量越相似 → 越可能互换
// G: more similar cost columns → more likely to swap tasks
float dot = 0.0f, nj = 0.0f, nk = 0.0f;
for (int i = 0; i < N; i++) {
float cj = h_cost[i * N + j] / max_c;

View file

@ -1,13 +1,13 @@
/**
* bin_packing.cuh - 一维装箱问题Integer 编码 + 约束)
*
* N 个物品,每个重量 w[i],装入最多 B 个箱子,每个箱子容量 C。
* 决策变量data[0][i] ∈ [0, B-1],表示物品 i 放入的箱子编号。
* 目标:最小化使用的箱子数。
* 约束:每个箱子总重不超过 C超出部分作为 penalty。
*
* 验证实例8 物品 weights=[7,5,3,4,6,2,8,1], C=10, 最优=4 箱
* 箱0={7,3}=10, 箱1={5,4,1}=10, 箱2={6,2}=8, 箱3={8}=8
* bin_packing.cuh - one-dimensional bin packing (Integer encoding + constraints)
*
* N items with weights w[i], at most B bins, capacity C per bin.
* Decision: data[0][i] in [0, B-1] = bin index for item i.
* Objective: minimize number of bins used.
* Constraint: bin load ≤ C; overflow contributes to penalty.
*
* Validation instance: 8 items weights=[7,5,3,4,6,2,8,1], C=10, optimum=4 bins
* bin0={7,3}=10, bin1={5,4,1}=10, bin2={6,2}=8, bin3={8}=8
*/
#pragma once
@ -16,9 +16,9 @@
struct BinPackingProblem : ProblemBase<BinPackingProblem, 1, 64> {
const float* d_weights;
int n; // 物品数
int max_bins; // 最大箱子数 B
float capacity; // 箱子容量 C
int n; // number of items
int max_bins; // max bins B
float capacity; // bin capacity C
__device__ float calc_bins_used(const Sol& sol) const {
bool used[32] = {};

View file

@ -1,11 +1,11 @@
/**
* graph_color.cuh - 图着色问题Integer 编码)
*
* N 个节点的图,用 k 种颜色着色。
* 决策变量data[0][i] ∈ [0, k-1],表示节点 i 的颜色。
* 目标:最小化冲突边数(相邻节点同色的边数)。
*
* 验证实例Petersen 图10 节点 15 边,色数=3最优冲突=0
* graph_color.cuh - graph coloring (Integer encoding)
*
* Graph on N nodes, k colors.
* Decision: data[0][i] in [0, k-1] = color of node i.
* Objective: minimize number of conflicting edges (adjacent same color).
*
* Validation instance: Petersen graph (10 nodes, 15 edges, chromatic number 3, optimal conflicts=0)
*/
#pragma once
@ -13,9 +13,9 @@
#include "cuda_utils.cuh"
struct GraphColorProblem : ProblemBase<GraphColorProblem, 1, 64> {
const int* d_adj; // 邻接矩阵 [N*N]1=相邻, 0=不相邻)
int n; // 节点数
int k; // 颜色数
const int* d_adj; // adjacency [N*N] (1=edge, 0=no edge)
int n; // number of nodes
int k; // number of colors
__device__ float calc_conflicts(const Sol& sol) const {
int conflicts = 0;

View file

@ -1,26 +1,26 @@
/**
* jsp.cuh - 车间调度问题 (Job Shop Scheduling Problem)
*
* J 个工件,每个工件有 O 道工序,每道工序指定机器和耗时。
*
* === 编码方案 AInteger 多行(时间表编码)===
* JSPProblem: data[j][i] = 工件 j 的第 i 道工序的开始时间
* jsp.cuh - Job Shop Scheduling Problem (JSSP)
*
* J jobs, each with O operations; each op specifies machine and duration.
*
* === Encoding A: multi-row Integer (time-table encoding) ===
* JSPProblem: data[j][i] = start time of job j's i-th operation
* dim1 = num_jobs, dim2_default = num_ops
* row_mode = Fixed(禁止 ROW_SPLIT/ROW_MERGE
* 每行代表一个工件的固定工序序列,行长度不可变
*
* === 编码方案 BPermutation 多重集(工序排列编码)===
* JSPPermProblem: data[0][k] = 工件编号0..J-1长度 J*O
* 值 j 出现 O 次。从左到右扫描,第 t 次遇到值 j 表示工件 j 的第 t 道工序。
* row_mode = Fixed (no ROW_SPLIT/ROW_MERGE)
* Each row is a fixed op sequence for one job; row length is fixed.
*
* === Encoding B: Permutation multiset (operation sequence encoding) ===
* JSPPermProblem: data[0][k] = job id (0..J-1), length J*O
* Value j appears O times. Left-to-right scan: t-th occurrence of j is job j's t-th op.
* dim1 = 1, dim2_default = J*O, perm_repeat_count = O
* 标准 Permutation 算子swap/reverse/insert天然保持多重集结构
*
* 目标Minimize makespan所有工件完成时间的最大值
* 约束:
* (a) 工序顺序:同一工件的工序必须按序执行
* (b) 机器冲突:同一机器同一时刻只能处理一个工序
*
* 验证实例:自定义 3 工件 3 机器 (3x3),最优 makespan = 12
* Standard permutation ops (swap/reverse/insert) preserve multiset structure.
*
* Objective: minimize makespan (max completion time over jobs).
* Constraints:
* (a) Precedence: ops of the same job must run in order.
* (b) Machine conflict: one op per machine at a time.
*
* Validation instance: custom 3 jobs × 3 machines (3x3), optimal makespan = 12
*/
#pragma once
@ -28,16 +28,16 @@
#include "cuda_utils.cuh"
// ============================================================
// 编码方案 AInteger 多行(时间表编码)
// Encoding A: multi-row Integer (time-table encoding)
// ============================================================
struct JSPProblem : ProblemBase<JSPProblem, 8, 16> {
const int* d_machine; // 工序所需机器 [J*O]
const float* d_duration; // 工序耗时 [J*O]
int num_jobs; // 工件数 J
int num_ops; // 每工件工序数 O
int num_machines; // 机器数 M
int time_horizon; // 时间上界
const int* d_machine; // machine per op [J*O]
const float* d_duration; // op duration [J*O]
int num_jobs; // number of jobs J
int num_ops; // ops per job O
int num_machines; // number of machines M
int time_horizon; // time horizon upper bound
__device__ float calc_makespan(const Sol& sol) const {
float makespan = 0.0f;
@ -62,7 +62,7 @@ struct JSPProblem : ProblemBase<JSPProblem, 8, 16> {
__device__ float compute_penalty(const Sol& sol) const {
float penalty = 0.0f;
// (a) 工序顺序约束
// (a) Precedence constraints
for (int j = 0; j < num_jobs; j++) {
for (int i = 1; i < num_ops; i++) {
float prev_end = (float)sol.data[j][i-1] + d_duration[j * num_ops + (i-1)];
@ -72,7 +72,7 @@ struct JSPProblem : ProblemBase<JSPProblem, 8, 16> {
}
}
// (b) 机器冲突约束
// (b) Machine conflict constraints
int total = num_jobs * num_ops;
for (int a = 0; a < total; a++) {
int ja = a / num_ops, ia = a % num_ops;
@ -151,28 +151,28 @@ struct JSPProblem : ProblemBase<JSPProblem, 8, 16> {
};
// ============================================================
// 编码方案 BPermutation 多重集(工序排列编码)
// Encoding B: Permutation multiset (operation sequence encoding)
// ============================================================
// data[0] 是长度 J*O 的排列,值域 [0, J),每个值出现 O 次
// 从左到右扫描:第 t 次遇到值 j → 安排工件 j 的第 t 道工序
// 贪心解码:每道工序安排在"最早可行时间"(满足工序顺序 + 机器空闲)
// data[0] is a length-J*O sequence with values in [0, J), each appearing O times.
// Left-to-right: t-th occurrence of j schedules job j's t-th operation.
// Greedy decode: each op at earliest feasible time (precedence + machine free).
struct JSPPermProblem : ProblemBase<JSPPermProblem, 1, 64> {
const int* d_machine; // 工序所需机器 [J*O]
const float* d_duration; // 工序耗时 [J*O]
const int* d_machine; // machine per op [J*O]
const float* d_duration; // op duration [J*O]
int num_jobs;
int num_ops;
int num_machines;
// 贪心解码:从排列生成调度方案,返回 makespan
// Greedy decode: build schedule from permutation, return makespan
__device__ float decode_and_makespan(const Sol& sol) const {
int total = num_jobs * num_ops;
int size = sol.dim2_sizes[0];
if (size < total) return 1e9f;
float job_avail[8]; // 每个工件的下一道工序最早开始时间
float mach_avail[8]; // 每台机器的最早空闲时间
int job_next_op[8]; // 每个工件的下一道待安排工序编号
float job_avail[8]; // earliest start for next op of each job
float mach_avail[8]; // earliest machine free time
int job_next_op[8]; // next op index to schedule per job
for (int j = 0; j < num_jobs; j++) { job_avail[j] = 0.0f; job_next_op[j] = 0; }
for (int m = 0; m < num_machines; m++) mach_avail[m] = 0.0f;
@ -182,13 +182,13 @@ struct JSPPermProblem : ProblemBase<JSPPermProblem, 1, 64> {
int j = sol.data[0][k];
if (j < 0 || j >= num_jobs) return 1e9f;
int op = job_next_op[j];
if (op >= num_ops) continue; // 该工件已安排完
if (op >= num_ops) continue; // job already fully scheduled
int flat = j * num_ops + op;
int m = d_machine[flat];
float dur = d_duration[flat];
// 最早开始时间 = max(工件前序完成, 机器空闲)
// Earliest start = max(job predecessor done, machine free)
float start = fmaxf(job_avail[j], mach_avail[m]);
float end = start + dur;
@ -212,7 +212,7 @@ struct JSPPermProblem : ProblemBase<JSPPermProblem, 1, 64> {
}
}
// 贪心解码天然满足约束penalty 始终为 0
// Greedy decode satisfies constraints; penalty is always 0
__device__ float compute_penalty(const Sol& sol) const {
return 0.0f;
}

View file

@ -1,7 +1,7 @@
/**
* knapsack.cuh - 0-1 背包问题
*
* 继承 ProblemBase使用 ObjDef 目标注册机制
* knapsack.cuh - 0-1 knapsack
*
* Extends ProblemBase with ObjDef objective registration.
*/
#pragma once
@ -10,13 +10,13 @@
#include "operators.cuh"
struct KnapsackProblem : ProblemBase<KnapsackProblem, 1, 32> {
// 问题数据d_weights 是物品重量,非目标权重)
// problem data (d_weights are item weights, not objective weights)
const float* d_weights;
const float* d_values;
float capacity;
int n;
// ---- 目标计算 ----
// ---- objective evaluation ----
__device__ float calc_total_value(const Sol& sol) const {
float tv = 0.0f;
const int* sel = sol.data[0];
@ -26,7 +26,7 @@ struct KnapsackProblem : ProblemBase<KnapsackProblem, 1, 32> {
return tv;
}
// ---- 目标定义OBJ_DEFS 与 compute_obj 必须一一对应)----
// ---- objective defs (OBJ_DEFS must match compute_obj one-to-one) ----
static constexpr ObjDef OBJ_DEFS[] = {
{ObjDir::Maximize, 1.0f, 0.0f}, // case 0: calc_total_value
};
@ -55,7 +55,7 @@ struct KnapsackProblem : ProblemBase<KnapsackProblem, 1, 32> {
return cfg;
}
// ---- shared memory 接口 ----
// ---- shared memory interface ----
size_t shared_mem_bytes() const {
return 2 * (size_t)n * sizeof(float);
}

View file

@ -1,12 +1,12 @@
/**
* load_balance.cuh - 离散负载均衡问题Integer 编码验证)
*
* N 个任务分配到 M 台机器,每个任务有一个处理时间 p[i]。
* 决策变量data[0][i] ∈ [0, M-1],表示任务 i 分配到哪台机器。
* 目标:最小化 makespan最大机器负载
*
* 已知 NP-hard等价于 multiprocessor scheduling / load balancing
* LPT(最长处理时间优先)贪心可得 4/3 近似。
* load_balance.cuh - discrete load balancing (Integer encoding sanity check)
*
* N tasks on M machines, processing time p[i] per task.
* Decision: data[0][i] in [0, M-1] = machine for task i.
* Objective: minimize makespan (max machine load).
*
* NP-hard (same as multiprocessor scheduling / load balancing).
* LPT (longest processing time first) greedy achieves 4/3 approximation.
*/
#pragma once
@ -14,12 +14,12 @@
#include "cuda_utils.cuh"
struct LoadBalanceProblem : ProblemBase<LoadBalanceProblem, 1, 64> {
const float* d_proc_time; // 任务处理时间 [N]
int n; // 任务数
int m; // 机器数
const float* d_proc_time; // task processing times [N]
int n; // number of tasks
int m; // number of machines
__device__ float calc_makespan(const Sol& sol) const {
float load[32] = {}; // 最多 32 台机器
float load[32] = {}; // at most 32 machines
int size = sol.dim2_sizes[0];
for (int i = 0; i < size; i++) {
int machine = sol.data[0][i];
@ -43,7 +43,7 @@ struct LoadBalanceProblem : ProblemBase<LoadBalanceProblem, 1, 64> {
}
__device__ float compute_penalty(const Sol& sol) const {
return 0.0f; // 无约束(任何分配都合法)
return 0.0f; // no side constraints (any assignment is feasible)
}
ProblemConfig config() const {

View file

@ -1,14 +1,14 @@
/**
* qap.cuh - 二次分配问题 (Quadratic Assignment Problem)
*
* N 个设施分配到 N 个位置(排列编码)。
* 决策变量data[0][i] = 设施 i 分配到的位置。
* 目标:Minimize sum(flow[i][j] * dist[perm[i]][perm[j]])
*
* 验证实例:自定义 5x5
* flow: 设施间的物流量
* dist: 位置间的距离
* 已知最优 = 58
* qap.cuh - Quadratic Assignment Problem (QAP)
*
* Assign N facilities to N locations (permutation encoding).
* Decision: data[0][i] = location assigned to facility i.
* Objective: Minimize sum(flow[i][j] * dist[perm[i]][perm[j]])
*
* Validation instance: custom 5x5
* flow: inter-facility flow
* dist: inter-location distances
* known optimum = 58
*/
#pragma once
@ -16,8 +16,10 @@
#include "cuda_utils.cuh"
struct QAPProblem : ProblemBase<QAPProblem, 1, 32> {
const float* d_flow; // 物流量矩阵 [N*N]
const float* d_dist; // 距离矩阵 [N*N]
const float* d_flow; // flow matrix [N*N] (device)
const float* d_dist; // distance matrix [N*N] (device)
const float* h_flow; // flow matrix [N*N] (host, for clone_to_device)
const float* h_dist; // distance matrix [N*N] (host, for clone_to_device)
int n;
__device__ float calc_cost(const Sol& sol) const {
@ -64,14 +66,16 @@ struct QAPProblem : ProblemBase<QAPProblem, 1, 32> {
d_dist = sd;
}
static QAPProblem create(const float* h_flow, const float* h_dist, int n) {
static QAPProblem create(const float* h_flow_in, const float* h_dist_in, int n) {
QAPProblem prob;
prob.n = n;
prob.h_flow = h_flow_in;
prob.h_dist = h_dist_in;
float *df, *dd;
CUDA_CHECK(cudaMalloc(&df, sizeof(float) * n * n));
CUDA_CHECK(cudaMalloc(&dd, sizeof(float) * n * n));
CUDA_CHECK(cudaMemcpy(df, h_flow, sizeof(float) * n * n, cudaMemcpyHostToDevice));
CUDA_CHECK(cudaMemcpy(dd, h_dist, sizeof(float) * n * n, cudaMemcpyHostToDevice));
CUDA_CHECK(cudaMemcpy(df, h_flow_in, sizeof(float) * n * n, cudaMemcpyHostToDevice));
CUDA_CHECK(cudaMemcpy(dd, h_dist_in, sizeof(float) * n * n, cudaMemcpyHostToDevice));
prob.d_flow = df; prob.d_dist = dd;
return prob;
}
@ -82,18 +86,12 @@ struct QAPProblem : ProblemBase<QAPProblem, 1, 32> {
d_flow = nullptr; d_dist = nullptr;
}
// v5.0: 多 GPU 协同 — 克隆到指定 GPU
// v5.0: multi-GPU — clone onto a given device
QAPProblem* clone_to_device(int gpu_id) const override {
int orig_device;
CUDA_CHECK(cudaGetDevice(&orig_device));
// 先下载数据到 host从当前设备
float* h_flow = new float[n * n];
float* h_dist = new float[n * n];
CUDA_CHECK(cudaMemcpy(h_flow, d_flow, sizeof(float) * n * n, cudaMemcpyDeviceToHost));
CUDA_CHECK(cudaMemcpy(h_dist, d_dist, sizeof(float) * n * n, cudaMemcpyDeviceToHost));
// 切换到目标 GPU 并上传
// Use host-side matrices directly (no D2H needed)
CUDA_CHECK(cudaSetDevice(gpu_id));
float *df, *dd;
CUDA_CHECK(cudaMalloc(&df, sizeof(float) * n * n));
@ -101,15 +99,12 @@ struct QAPProblem : ProblemBase<QAPProblem, 1, 32> {
CUDA_CHECK(cudaMemcpy(df, h_flow, sizeof(float) * n * n, cudaMemcpyHostToDevice));
CUDA_CHECK(cudaMemcpy(dd, h_dist, sizeof(float) * n * n, cudaMemcpyHostToDevice));
delete[] h_flow;
delete[] h_dist;
// 恢复原设备
CUDA_CHECK(cudaSetDevice(orig_device));
// 创建新实例
QAPProblem* new_prob = new QAPProblem();
new_prob->n = n;
new_prob->h_flow = h_flow;
new_prob->h_dist = h_dist;
new_prob->d_flow = df;
new_prob->d_dist = dd;

View file

@ -1,8 +1,8 @@
/**
* schedule.cuh - 排班问题
*
* 继承 ProblemBase使用 ObjDef 目标注册机制
* 2 个目标总成本min+ 不公平度min权重更高
* schedule.cuh - staff scheduling
*
* Extends ProblemBase with ObjDef objective registration.
* Two objectives: total cost (min) + unfairness (min, higher weight).
*/
#pragma once
@ -14,7 +14,7 @@ struct ScheduleProblem : ProblemBase<ScheduleProblem, 8, 16> {
const float* d_cost;
int days, emps, required;
// ---- 目标计算 ----
// ---- objective evaluation ----
__device__ float calc_total_cost(const Sol& sol) const {
float total = 0.0f;
for (int d = 0; d < days; d++)
@ -37,7 +37,7 @@ struct ScheduleProblem : ProblemBase<ScheduleProblem, 8, 16> {
return (float)(max_w - min_w);
}
// ---- 目标定义OBJ_DEFS 与 compute_obj 必须一一对应)----
// ---- objective defs (OBJ_DEFS must match compute_obj one-to-one) ----
static constexpr ObjDef OBJ_DEFS[] = {
{ObjDir::Minimize, 1.0f, 0.0f}, // case 0: calc_total_cost
{ObjDir::Minimize, 5.0f, 0.0f}, // case 1: calc_unfairness
@ -71,9 +71,9 @@ struct ScheduleProblem : ProblemBase<ScheduleProblem, 8, 16> {
return cfg;
}
// 默认回退全量(基类行为)— 不需要覆盖 evaluate_move
// Default full re-eval (base behavior) — no need to override evaluate_move
// ---- shared memory 接口 ----
// ---- shared memory interface ----
size_t shared_mem_bytes() const {
return (size_t)days * emps * sizeof(float);
}

View file

@ -1,7 +1,7 @@
/**
* tsp.cuh - TSP 问题定义
*
* 继承 ProblemBase使用 ObjDef 目标注册机制
* tsp.cuh - Traveling Salesman Problem (TSP) definition
*
* Extends ProblemBase with ObjDef objective registration.
*/
#pragma once
@ -10,12 +10,12 @@
#include "operators.cuh"
struct TSPProblem : ProblemBase<TSPProblem, 1, 64> {
// 问题数据
// problem data
const float* d_dist;
const float* h_dist; // host 端距离矩阵(用于 init_relation_matrix
const float* h_dist; // host distance matrix (for init_relation_matrix)
int n;
// ---- 目标计算 ----
// ---- objective evaluation ----
__device__ float calc_total_distance(const Sol& sol) const {
float total = 0.0f;
const int* route = sol.data[0];
@ -25,7 +25,7 @@ struct TSPProblem : ProblemBase<TSPProblem, 1, 64> {
return total;
}
// ---- 目标定义OBJ_DEFS 与 compute_obj 必须一一对应)----
// ---- objective defs (OBJ_DEFS must match compute_obj one-to-one) ----
static constexpr ObjDef OBJ_DEFS[] = {
{ObjDir::Minimize, 1.0f, 0.0f}, // case 0: calc_total_distance
};
@ -37,10 +37,10 @@ struct TSPProblem : ProblemBase<TSPProblem, 1, 64> {
}
__device__ float compute_penalty(const Sol& sol) const {
return 0.0f; // TSP 无约束
return 0.0f; // TSP has no side constraints
}
// ---- config(编码/维度部分,目标由基类自动填充)----
// ---- config (encoding/dims; objectives filled by base class) ----
ProblemConfig config() const {
ProblemConfig cfg;
cfg.encoding = EncodingType::Permutation;
@ -49,7 +49,7 @@ struct TSPProblem : ProblemBase<TSPProblem, 1, 64> {
return cfg;
}
// ---- shared memory 接口 ----
// ---- shared memory interface ----
static constexpr size_t SMEM_LIMIT = 48 * 1024;
size_t shared_mem_bytes() const {
@ -69,7 +69,7 @@ struct TSPProblem : ProblemBase<TSPProblem, 1, 64> {
d_dist = sd;
}
// 距离先验:距离近 → G/O 分数高
// Distance prior: closer cities → higher G/O scores
void init_relation_matrix(float* G, float* O, int N) const {
if (!h_dist || N != n) return;
float max_d = 0.0f;
@ -108,21 +108,21 @@ struct TSPProblem : ProblemBase<TSPProblem, 1, 64> {
h_dist = nullptr;
}
// v5.0: 多 GPU 协同 — 克隆到指定 GPU
// v5.0: multi-GPU — clone onto a given device
TSPProblem* clone_to_device(int gpu_id) const override {
int orig_device;
CUDA_CHECK(cudaGetDevice(&orig_device));
CUDA_CHECK(cudaSetDevice(gpu_id));
// 分配设备内存并拷贝距离矩阵
// Allocate device memory and copy distance matrix
float* dd;
CUDA_CHECK(cudaMalloc(&dd, sizeof(float) * n * n));
CUDA_CHECK(cudaMemcpy(dd, h_dist, sizeof(float) * n * n, cudaMemcpyHostToDevice));
// 恢复原设备
// Restore original device
CUDA_CHECK(cudaSetDevice(orig_device));
// 创建新的 Problem 实例(在 host 端)
// Create new Problem instance (on host)
TSPProblem* new_prob = new TSPProblem();
new_prob->n = n;
new_prob->h_dist = h_dist;

View file

@ -1,7 +1,7 @@
/**
* tsp_large.cuh - 大规模 TSP 问题定义 (最多 256 城市)
*
* 继承 ProblemBase逻辑与 tsp.cuh 一致,仅 D2 上限不同
* tsp_large.cuh - large-scale TSP definition (up to 256 cities)
*
* Same logic as tsp.cuh under ProblemBase; only D2 cap differs.
*/
#pragma once
@ -14,7 +14,7 @@ struct TSPLargeProblem : ProblemBase<TSPLargeProblem, 1, 256> {
const float* h_dist;
int n;
// ---- 目标计算 ----
// ---- objective evaluation ----
__device__ float calc_total_distance(const Sol& sol) const {
float total = 0.0f;
const int* route = sol.data[0];
@ -24,7 +24,7 @@ struct TSPLargeProblem : ProblemBase<TSPLargeProblem, 1, 256> {
return total;
}
// ---- 目标定义OBJ_DEFS 与 compute_obj 必须一一对应)----
// ---- objective defs (OBJ_DEFS must match compute_obj one-to-one) ----
static constexpr ObjDef OBJ_DEFS[] = {
{ObjDir::Minimize, 1.0f, 0.0f}, // case 0: calc_total_distance
};
@ -54,7 +54,7 @@ struct TSPLargeProblem : ProblemBase<TSPLargeProblem, 1, 256> {
return need <= SMEM_LIMIT ? need : 0;
}
// 距离矩阵的实际大小(不管是否放进 smem
// Actual distance matrix size (whether or not placed in smem)
size_t working_set_bytes() const {
return (size_t)n * n * sizeof(float);
}

View file

@ -1,9 +1,9 @@
/**
* tsp_xlarge.cuh - 超大规模 TSP 问题定义 (最多 512 城市)
*
* 继承 ProblemBase逻辑与 tsp_large.cuh 一致D2=512
* 注意:距离矩阵 512×512×4B = 1MB远超 48KB shared memory
* 因此 shared_mem_bytes() 返回 0距离矩阵留在 global memory
* tsp_xlarge.cuh - very large TSP definition (up to 512 cities)
*
* Same as tsp_large.cuh under ProblemBase, with D2=512.
* Note: distance matrix 512×512×4B = 1MB, far above 48KB shared memory,
* so shared_mem_bytes() returns 0 and the matrix stays in global memory.
*/
#pragma once
@ -13,7 +13,7 @@
struct TSPXLargeProblem : ProblemBase<TSPXLargeProblem, 1, 512> {
const float* d_dist;
const float* h_dist; // host 端距离矩阵(用于 init_relation_matrix
const float* h_dist; // host distance matrix (for init_relation_matrix)
int n;
__device__ float calc_total_distance(const Sol& sol) const {
@ -45,7 +45,7 @@ struct TSPXLargeProblem : ProblemBase<TSPXLargeProblem, 1, 512> {
return cfg;
}
// 距离矩阵太大,不放 shared memory
// Distance matrix too large for shared memory
size_t shared_mem_bytes() const { return 0; }
__device__ void load_shared(char*, int, int) {}
@ -53,10 +53,10 @@ struct TSPXLargeProblem : ProblemBase<TSPXLargeProblem, 1, 512> {
return (size_t)n * n * sizeof(float);
}
// 用距离矩阵初始化 G/O 先验:距离近 → 分数高
// Initialize G/O priors from distances: closer → higher score
void init_relation_matrix(float* G, float* O, int N) const {
if (!h_dist || N != n) return;
// 找最大距离用于归一化
// Max distance for normalization
float max_d = 0.0f;
for (int i = 0; i < N; i++)
for (int j = 0; j < N; j++)
@ -66,10 +66,10 @@ struct TSPXLargeProblem : ProblemBase<TSPXLargeProblem, 1, 512> {
for (int i = 0; i < N; i++) {
for (int j = 0; j < N; j++) {
if (i == j) continue;
// 距离近 → G 高(分组倾向强)
// Closer → higher G (stronger grouping signal)
float proximity = 1.0f - h_dist[i * N + j] / max_d;
G[i * N + j] = proximity * 0.3f; // 初始信号不要太强,留空间给 EMA
// 距离近 → O 也给一点信号(对称的,不偏向任何方向)
G[i * N + j] = proximity * 0.3f; // keep initial signal moderate for EMA headroom
// Closer → small O signal too (symmetric, no directional bias)
O[i * N + j] = proximity * 0.1f;
}
}
@ -84,7 +84,7 @@ struct TSPXLargeProblem : ProblemBase<TSPXLargeProblem, 1, 512> {
static TSPXLargeProblem create(const float* h_dist_ptr, int n) {
TSPXLargeProblem prob;
prob.n = n;
prob.h_dist = h_dist_ptr; // 保留 host 指针
prob.h_dist = h_dist_ptr; // keep host pointer
float* dd;
CUDA_CHECK(cudaMalloc(&dd, sizeof(float) * n * n));
CUDA_CHECK(cudaMemcpy(dd, h_dist_ptr, sizeof(float) * n * n, cudaMemcpyHostToDevice));

View file

@ -1,8 +1,8 @@
/**
* vrp.cuh - 容量约束车辆路径问题 (CVRP)
*
* 继承 ProblemBase使用 ObjDef 目标注册机制
* 多行编码D1=K 条路线,分区初始化 + 跨行算子)
* vrp.cuh - Capacitated Vehicle Routing Problem (CVRP)
*
* Extends ProblemBase with ObjDef objective registration.
* Multi-row encoding (D1 = K routes, partition init + cross-row operators).
*/
#pragma once
@ -12,11 +12,11 @@
#include "gpu_cache.cuh"
struct VRPProblem : ProblemBase<VRPProblem, 8, 64> {
// GPU 数据
// GPU data
const float* d_dist;
const float* d_demand;
const float* h_dist; // host 端距离矩阵(含 depot用于 init_relation_matrix
const float* h_demand; // host 端需求数组(用于 clone_to_device
const float* h_dist; // host distance matrix (includes depot; for init_relation_matrix)
const float* h_demand; // host demand array (for clone_to_device)
int n;
int stride;
float capacity;
@ -24,7 +24,7 @@ struct VRPProblem : ProblemBase<VRPProblem, 8, 64> {
int max_vehicles;
GpuCache cache;
// ---- 目标计算 ----
// ---- objective evaluation ----
__device__ float compute_route_dist(const int* route, int size) const {
if (size == 0) return 0.0f;
float dist = 0.0f;
@ -61,7 +61,7 @@ struct VRPProblem : ProblemBase<VRPProblem, 8, 64> {
return total;
}
// ---- 目标定义OBJ_DEFS 与 compute_obj 必须一一对应)----
// ---- objective defs (OBJ_DEFS must match compute_obj one-to-one) ----
static constexpr ObjDef OBJ_DEFS[] = {
{ObjDir::Minimize, 1.0f, 0.0f}, // case 0: calc_total_distance
};
@ -102,7 +102,7 @@ struct VRPProblem : ProblemBase<VRPProblem, 8, 64> {
return cfg;
}
// ---- shared memory 接口 ----
// ---- shared memory interface ----
static constexpr size_t SMEM_LIMIT = 48 * 1024;
size_t shared_mem_bytes() const {
@ -129,14 +129,14 @@ struct VRPProblem : ProblemBase<VRPProblem, 8, 64> {
void enable_cache(int cap = 65536) { cache = GpuCache::allocate(cap); }
void print_cache_stats() const { cache.print_stats(); }
// 距离先验:客户间距离近 → G/O 分数高
// 注意h_dist 含 depotstride×stride元素编号 0..n-1 对应 node 1..n
// Distance prior: closer customers → higher G/O scores
// Note: h_dist includes depot (stride×stride); indices 0..n-1 map to nodes 1..n
void init_relation_matrix(float* G, float* O, int N) const {
if (!h_dist || N != n) return;
float max_d = 0.0f;
for (int i = 0; i < N; i++)
for (int j = 0; j < N; j++) {
float d = h_dist[(i + 1) * stride + (j + 1)]; // 跳过 depot
float d = h_dist[(i + 1) * stride + (j + 1)]; // skip depot
if (d > max_d) max_d = d;
}
if (max_d <= 0.0f) return;
@ -161,7 +161,7 @@ struct VRPProblem : ProblemBase<VRPProblem, 8, 64> {
prob.max_vehicles = max_vehicles;
prob.cache = GpuCache::disabled();
prob.h_dist = h_dist_ptr;
prob.h_demand = h_demand_ptr; // 保存 host 端指针
prob.h_demand = h_demand_ptr; // keep host pointer
int n_nodes = n + 1;
float* dd;
@ -185,13 +185,13 @@ struct VRPProblem : ProblemBase<VRPProblem, 8, 64> {
cache.destroy();
}
// v5.0: 多 GPU 协同 — 克隆到指定 GPU
// v5.0: multi-GPU — clone onto a given device
VRPProblem* clone_to_device(int gpu_id) const override {
int orig_device;
CUDA_CHECK(cudaGetDevice(&orig_device));
CUDA_CHECK(cudaSetDevice(gpu_id));
// 从 host 端数据直接拷贝到目标 GPU避免跨设备 D2H 拷贝)
// Copy from host straight to target GPU (avoid cross-device D2H staging)
int n_nodes = n + 1;
float* dd;
CUDA_CHECK(cudaMalloc(&dd, sizeof(float) * n_nodes * n_nodes));

View file

@ -1,12 +1,12 @@
/**
* vrptw.cuh - 带时间窗的车辆路径问题 (VRPTW)
*
* 在 CVRP 基础上增加时间窗约束。
* 编码Perm 多行分区(同 CVRPdata[r][j] = 路线 r 的第 j 个客户。
* 目标Minimize 总距离。
* 约束:(a) 容量约束, (b) 时间窗约束(到达时间必须 ≤ latest早到需等待
*
* 验证实例8 客户 3 车, 手工设计坐标+时间窗, 确保有已知可行解。
* vrptw.cuh - Vehicle Routing Problem with Time Windows (VRPTW)
*
* CVRP plus time window constraints.
* Encoding: multi-row perm partition (same as CVRP); data[r][j] = j-th customer on route r.
* Objective: minimize total distance.
* Constraints: (a) capacity, (b) time windows (arrival ≤ latest; early arrival waits).
*
* Validation instance: 8 customers, 3 vehicles; hand-crafted coords + windows with known feasible solution.
*/
#pragma once
@ -14,12 +14,12 @@
#include "cuda_utils.cuh"
struct VRPTWProblem : ProblemBase<VRPTWProblem, 8, 64> {
const float* d_dist; // 距离矩阵 [(n+1)*(n+1)](含 depot
const float* d_demand; // 需求 [n]
const float* d_earliest; // 最早服务时间 [n+1](含 depot
const float* d_latest; // 最晚服务时间 [n+1](含 depot
const float* d_service; // 服务耗时 [n+1](含 depot
int n; // 客户数(不含 depot
const float* d_dist; // distance matrix [(n+1)*(n+1)] (includes depot)
const float* d_demand; // demand [n]
const float* d_earliest; // earliest service time [n+1] (includes depot)
const float* d_latest; // latest service time [n+1] (includes depot)
const float* d_service; // service time [n+1] (includes depot)
int n; // number of customers (excludes depot)
int stride; // n+1
float capacity;
int num_vehicles;
@ -63,30 +63,30 @@ struct VRPTWProblem : ProblemBase<VRPTWProblem, 8, 64> {
if (size == 0) continue;
active++;
// 容量约束
// Capacity constraint
float load = 0.0f;
for (int j = 0; j < size; j++)
load += d_demand[sol.data[r][j]];
if (load > capacity)
penalty += (load - capacity) * 100.0f;
// 时间窗约束:模拟路线行驶
// Time windows: simulate route travel
float time = 0.0f;
int prev = 0;
for (int j = 0; j < size; j++) {
int node = sol.data[r][j] + 1;
float travel = d_dist[prev * stride + node];
time += travel;
// 早到需等待
// Wait if early
if (time < d_earliest[node])
time = d_earliest[node];
// 迟到产生惩罚
// Penalize lateness
if (time > d_latest[node])
penalty += (time - d_latest[node]) * 50.0f;
time += d_service[node];
prev = node;
}
// 返回 depot 的时间窗
// Time window returning to depot
float return_time = time + d_dist[prev * stride + 0];
if (return_time > d_latest[0])
penalty += (return_time - d_latest[0]) * 50.0f;