fix: harden CUDA safety checks and translate comments to English

Safety fixes (4 critical, 4 warning) from code review:

- qap.cuh: fix clone_to_device cross-device D2H by retaining host matrices
- types.cuh: add CUDA_CHECK to InjectBuffer, track owner_gpu for safe destroy
- types.cuh: add bounds check on lexicographic priority index
- solver.cuh: cap migrate_kernel islands to MAX_ISLANDS=64 to prevent stack overflow
- multi_gpu_solver.cuh: guard against 0 GPUs, propagate stop_reason from best GPU
- types.cuh: warn on SeqRegistry overflow
- solver.cuh: warn when constraint_directed/phased_search disabled without AOS

Translate all Chinese comments to English across 25+ source files
(core/*.cuh, problems/*.cuh, Makefile, multi-GPU tests).

Verified on V100S×2 (sm_70, CUDA 12.8): e5 (12 problem types, all optimal),
e13 (multi-objective + multi-GPU, 9 configs, all passed).
This commit is contained in:
L-yang-yang 2026-03-25 11:52:50 +08:00
parent ab278d0e82
commit a848730459
25 changed files with 1147 additions and 1167 deletions

View file

@ -1,7 +1,7 @@
/**
* assignment.cuh - 指派问题
*
* 继承 ProblemBase使用 ObjDef 目标注册机制
* assignment.cuh - assignment problem
*
* Extends ProblemBase with ObjDef objective registration.
*/
#pragma once
@ -11,10 +11,10 @@
struct AssignmentProblem : ProblemBase<AssignmentProblem, 1, 16> {
const float* d_cost;
const float* h_cost; // host 端成本矩阵(用于 init_relation_matrix
const float* h_cost; // host cost matrix (for init_relation_matrix)
int n;
// ---- 目标计算 ----
// ---- objective evaluation ----
__device__ float calc_total_cost(const Sol& sol) const {
float total = 0.0f;
const int* assign = sol.data[0];
@ -24,7 +24,7 @@ struct AssignmentProblem : ProblemBase<AssignmentProblem, 1, 16> {
return total;
}
// ---- 目标定义OBJ_DEFS 与 compute_obj 必须一一对应)----
// ---- objective defs (OBJ_DEFS must match compute_obj one-to-one) ----
static constexpr ObjDef OBJ_DEFS[] = {
{ObjDir::Minimize, 1.0f, 0.0f}, // case 0: calc_total_cost
};
@ -47,7 +47,7 @@ struct AssignmentProblem : ProblemBase<AssignmentProblem, 1, 16> {
return cfg;
}
// ---- shared memory 接口 ----
// ---- shared memory interface ----
static constexpr size_t SMEM_LIMIT = 48 * 1024;
size_t shared_mem_bytes() const {
@ -66,12 +66,12 @@ struct AssignmentProblem : ProblemBase<AssignmentProblem, 1, 16> {
d_cost = sc;
}
// 成本先验task j 和 task k 如果被相似 agent 偏好G 值高
// O 矩阵task j 在位置 i 成本低 → O[j][k] 略高j 倾向排在 k 前面的位置)
// Cost prior: if tasks j and k are similarly preferred by agents, G is high
// O matrix: low cost for task j at slot i → slightly higher O[j][k] (j tends to be placed before k)
void init_relation_matrix(float* G, float* O, int N) const {
if (!h_cost || N != n) return;
// 对每个 task构建成本向量task 间余弦相似度 → G
// 简化:成本列向量的相关性
// Per task, build cost vectors; cosine similarity between tasks → G
// Simplified: correlation of cost columns
float max_c = 0.0f;
for (int i = 0; i < N * N; i++)
if (h_cost[i] > max_c) max_c = h_cost[i];
@ -80,7 +80,7 @@ struct AssignmentProblem : ProblemBase<AssignmentProblem, 1, 16> {
for (int j = 0; j < N; j++)
for (int k = 0; k < N; k++) {
if (j == k) continue;
// G: 两个 task 的成本向量越相似 → 越可能互换
// G: more similar cost columns → more likely to swap tasks
float dot = 0.0f, nj = 0.0f, nk = 0.0f;
for (int i = 0; i < N; i++) {
float cj = h_cost[i * N + j] / max_c;

View file

@ -1,13 +1,13 @@
/**
* bin_packing.cuh - 一维装箱问题Integer 编码 + 约束)
*
* N 个物品,每个重量 w[i],装入最多 B 个箱子,每个箱子容量 C。
* 决策变量data[0][i] ∈ [0, B-1],表示物品 i 放入的箱子编号。
* 目标:最小化使用的箱子数。
* 约束:每个箱子总重不超过 C超出部分作为 penalty。
*
* 验证实例8 物品 weights=[7,5,3,4,6,2,8,1], C=10, 最优=4 箱
* 箱0={7,3}=10, 箱1={5,4,1}=10, 箱2={6,2}=8, 箱3={8}=8
* bin_packing.cuh - one-dimensional bin packing (Integer encoding + constraints)
*
* N items with weights w[i], at most B bins, capacity C per bin.
* Decision: data[0][i] in [0, B-1] = bin index for item i.
* Objective: minimize number of bins used.
* Constraint: bin load ≤ C; overflow contributes to penalty.
*
* Validation instance: 8 items weights=[7,5,3,4,6,2,8,1], C=10, optimum=4 bins
* bin0={7,3}=10, bin1={5,4,1}=10, bin2={6,2}=8, bin3={8}=8
*/
#pragma once
@ -16,9 +16,9 @@
struct BinPackingProblem : ProblemBase<BinPackingProblem, 1, 64> {
const float* d_weights;
int n; // 物品数
int max_bins; // 最大箱子数 B
float capacity; // 箱子容量 C
int n; // number of items
int max_bins; // max bins B
float capacity; // bin capacity C
__device__ float calc_bins_used(const Sol& sol) const {
bool used[32] = {};

View file

@ -1,11 +1,11 @@
/**
* graph_color.cuh - 图着色问题Integer 编码)
*
* N 个节点的图,用 k 种颜色着色。
* 决策变量data[0][i] ∈ [0, k-1],表示节点 i 的颜色。
* 目标:最小化冲突边数(相邻节点同色的边数)。
*
* 验证实例Petersen 图10 节点 15 边,色数=3最优冲突=0
* graph_color.cuh - graph coloring (Integer encoding)
*
* Graph on N nodes, k colors.
* Decision: data[0][i] in [0, k-1] = color of node i.
* Objective: minimize number of conflicting edges (adjacent same color).
*
* Validation instance: Petersen graph (10 nodes, 15 edges, chromatic number 3, optimal conflicts=0)
*/
#pragma once
@ -13,9 +13,9 @@
#include "cuda_utils.cuh"
struct GraphColorProblem : ProblemBase<GraphColorProblem, 1, 64> {
const int* d_adj; // 邻接矩阵 [N*N]1=相邻, 0=不相邻)
int n; // 节点数
int k; // 颜色数
const int* d_adj; // adjacency [N*N] (1=edge, 0=no edge)
int n; // number of nodes
int k; // number of colors
__device__ float calc_conflicts(const Sol& sol) const {
int conflicts = 0;

View file

@ -1,26 +1,26 @@
/**
* jsp.cuh - 车间调度问题 (Job Shop Scheduling Problem)
*
* J 个工件,每个工件有 O 道工序,每道工序指定机器和耗时。
*
* === 编码方案 AInteger 多行(时间表编码)===
* JSPProblem: data[j][i] = 工件 j 的第 i 道工序的开始时间
* jsp.cuh - Job Shop Scheduling Problem (JSSP)
*
* J jobs, each with O operations; each op specifies machine and duration.
*
* === Encoding A: multi-row Integer (time-table encoding) ===
* JSPProblem: data[j][i] = start time of job j's i-th operation
* dim1 = num_jobs, dim2_default = num_ops
* row_mode = Fixed(禁止 ROW_SPLIT/ROW_MERGE
* 每行代表一个工件的固定工序序列,行长度不可变
*
* === 编码方案 BPermutation 多重集(工序排列编码)===
* JSPPermProblem: data[0][k] = 工件编号0..J-1长度 J*O
* 值 j 出现 O 次。从左到右扫描,第 t 次遇到值 j 表示工件 j 的第 t 道工序。
* row_mode = Fixed (no ROW_SPLIT/ROW_MERGE)
* Each row is a fixed op sequence for one job; row length is fixed.
*
* === Encoding B: Permutation multiset (operation sequence encoding) ===
* JSPPermProblem: data[0][k] = job id (0..J-1), length J*O
* Value j appears O times. Left-to-right scan: t-th occurrence of j is job j's t-th op.
* dim1 = 1, dim2_default = J*O, perm_repeat_count = O
* 标准 Permutation 算子swap/reverse/insert天然保持多重集结构
*
* 目标Minimize makespan所有工件完成时间的最大值
* 约束:
* (a) 工序顺序:同一工件的工序必须按序执行
* (b) 机器冲突:同一机器同一时刻只能处理一个工序
*
* 验证实例:自定义 3 工件 3 机器 (3x3),最优 makespan = 12
* Standard permutation ops (swap/reverse/insert) preserve multiset structure.
*
* Objective: minimize makespan (max completion time over jobs).
* Constraints:
* (a) Precedence: ops of the same job must run in order.
* (b) Machine conflict: one op per machine at a time.
*
* Validation instance: custom 3 jobs × 3 machines (3x3), optimal makespan = 12
*/
#pragma once
@ -28,16 +28,16 @@
#include "cuda_utils.cuh"
// ============================================================
// 编码方案 AInteger 多行(时间表编码)
// Encoding A: multi-row Integer (time-table encoding)
// ============================================================
struct JSPProblem : ProblemBase<JSPProblem, 8, 16> {
const int* d_machine; // 工序所需机器 [J*O]
const float* d_duration; // 工序耗时 [J*O]
int num_jobs; // 工件数 J
int num_ops; // 每工件工序数 O
int num_machines; // 机器数 M
int time_horizon; // 时间上界
const int* d_machine; // machine per op [J*O]
const float* d_duration; // op duration [J*O]
int num_jobs; // number of jobs J
int num_ops; // ops per job O
int num_machines; // number of machines M
int time_horizon; // time horizon upper bound
__device__ float calc_makespan(const Sol& sol) const {
float makespan = 0.0f;
@ -62,7 +62,7 @@ struct JSPProblem : ProblemBase<JSPProblem, 8, 16> {
__device__ float compute_penalty(const Sol& sol) const {
float penalty = 0.0f;
// (a) 工序顺序约束
// (a) Precedence constraints
for (int j = 0; j < num_jobs; j++) {
for (int i = 1; i < num_ops; i++) {
float prev_end = (float)sol.data[j][i-1] + d_duration[j * num_ops + (i-1)];
@ -72,7 +72,7 @@ struct JSPProblem : ProblemBase<JSPProblem, 8, 16> {
}
}
// (b) 机器冲突约束
// (b) Machine conflict constraints
int total = num_jobs * num_ops;
for (int a = 0; a < total; a++) {
int ja = a / num_ops, ia = a % num_ops;
@ -151,28 +151,28 @@ struct JSPProblem : ProblemBase<JSPProblem, 8, 16> {
};
// ============================================================
// 编码方案 BPermutation 多重集(工序排列编码)
// Encoding B: Permutation multiset (operation sequence encoding)
// ============================================================
// data[0] 是长度 J*O 的排列,值域 [0, J),每个值出现 O 次
// 从左到右扫描:第 t 次遇到值 j → 安排工件 j 的第 t 道工序
// 贪心解码:每道工序安排在"最早可行时间"(满足工序顺序 + 机器空闲)
// data[0] is a length-J*O sequence with values in [0, J), each appearing O times.
// Left-to-right: t-th occurrence of j schedules job j's t-th operation.
// Greedy decode: each op at earliest feasible time (precedence + machine free).
struct JSPPermProblem : ProblemBase<JSPPermProblem, 1, 64> {
const int* d_machine; // 工序所需机器 [J*O]
const float* d_duration; // 工序耗时 [J*O]
const int* d_machine; // machine per op [J*O]
const float* d_duration; // op duration [J*O]
int num_jobs;
int num_ops;
int num_machines;
// 贪心解码:从排列生成调度方案,返回 makespan
// Greedy decode: build schedule from permutation, return makespan
__device__ float decode_and_makespan(const Sol& sol) const {
int total = num_jobs * num_ops;
int size = sol.dim2_sizes[0];
if (size < total) return 1e9f;
float job_avail[8]; // 每个工件的下一道工序最早开始时间
float mach_avail[8]; // 每台机器的最早空闲时间
int job_next_op[8]; // 每个工件的下一道待安排工序编号
float job_avail[8]; // earliest start for next op of each job
float mach_avail[8]; // earliest machine free time
int job_next_op[8]; // next op index to schedule per job
for (int j = 0; j < num_jobs; j++) { job_avail[j] = 0.0f; job_next_op[j] = 0; }
for (int m = 0; m < num_machines; m++) mach_avail[m] = 0.0f;
@ -182,13 +182,13 @@ struct JSPPermProblem : ProblemBase<JSPPermProblem, 1, 64> {
int j = sol.data[0][k];
if (j < 0 || j >= num_jobs) return 1e9f;
int op = job_next_op[j];
if (op >= num_ops) continue; // 该工件已安排完
if (op >= num_ops) continue; // job already fully scheduled
int flat = j * num_ops + op;
int m = d_machine[flat];
float dur = d_duration[flat];
// 最早开始时间 = max(工件前序完成, 机器空闲)
// Earliest start = max(job predecessor done, machine free)
float start = fmaxf(job_avail[j], mach_avail[m]);
float end = start + dur;
@ -212,7 +212,7 @@ struct JSPPermProblem : ProblemBase<JSPPermProblem, 1, 64> {
}
}
// 贪心解码天然满足约束penalty 始终为 0
// Greedy decode satisfies constraints; penalty is always 0
__device__ float compute_penalty(const Sol& sol) const {
    // The permutation is decoded greedily into a schedule, so no
    // constraint violation is ever reported for this encoding.
    (void)sol;
    const float no_penalty = 0.0f;
    return no_penalty;
}

View file

@ -1,7 +1,7 @@
/**
* knapsack.cuh - 0-1 背包问题
*
* 继承 ProblemBase使用 ObjDef 目标注册机制
* knapsack.cuh - 0-1 knapsack
*
* Extends ProblemBase with ObjDef objective registration.
*/
#pragma once
@ -10,13 +10,13 @@
#include "operators.cuh"
struct KnapsackProblem : ProblemBase<KnapsackProblem, 1, 32> {
// 问题数据d_weights 是物品重量,非目标权重)
// problem data (d_weights are item weights, not objective weights)
const float* d_weights;
const float* d_values;
float capacity;
int n;
// ---- 目标计算 ----
// ---- objective evaluation ----
__device__ float calc_total_value(const Sol& sol) const {
float tv = 0.0f;
const int* sel = sol.data[0];
@ -26,7 +26,7 @@ struct KnapsackProblem : ProblemBase<KnapsackProblem, 1, 32> {
return tv;
}
// ---- 目标定义OBJ_DEFS 与 compute_obj 必须一一对应)----
// ---- objective defs (OBJ_DEFS must match compute_obj one-to-one) ----
static constexpr ObjDef OBJ_DEFS[] = {
{ObjDir::Maximize, 1.0f, 0.0f}, // case 0: calc_total_value
};
@ -55,7 +55,7 @@ struct KnapsackProblem : ProblemBase<KnapsackProblem, 1, 32> {
return cfg;
}
// ---- shared memory 接口 ----
// ---- shared memory interface ----
size_t shared_mem_bytes() const {
    // Bytes requested for shared memory: two float arrays of n entries each
    // (presumably item weights and item values — confirm against load_shared).
    const size_t floats_needed = 2 * (size_t)n;
    return floats_needed * sizeof(float);
}

View file

@ -1,12 +1,12 @@
/**
* load_balance.cuh - 离散负载均衡问题Integer 编码验证)
*
* N 个任务分配到 M 台机器,每个任务有一个处理时间 p[i]。
* 决策变量data[0][i] ∈ [0, M-1],表示任务 i 分配到哪台机器。
* 目标:最小化 makespan最大机器负载
*
* 已知 NP-hard等价于 multiprocessor scheduling / load balancing
* LPT(最长处理时间优先)贪心可得 4/3 近似。
* load_balance.cuh - discrete load balancing (Integer encoding sanity check)
*
* N tasks on M machines, processing time p[i] per task.
* Decision: data[0][i] in [0, M-1] = machine for task i.
* Objective: minimize makespan (max machine load).
*
* NP-hard (same as multiprocessor scheduling / load balancing).
* LPT (longest processing time first) greedy achieves a 4/3 approximation.
*/
#pragma once
@ -14,12 +14,12 @@
#include "cuda_utils.cuh"
struct LoadBalanceProblem : ProblemBase<LoadBalanceProblem, 1, 64> {
const float* d_proc_time; // 任务处理时间 [N]
int n; // 任务数
int m; // 机器数
const float* d_proc_time; // task processing times [N]
int n; // number of tasks
int m; // number of machines
__device__ float calc_makespan(const Sol& sol) const {
float load[32] = {}; // 最多 32 台机器
float load[32] = {}; // at most 32 machines
int size = sol.dim2_sizes[0];
for (int i = 0; i < size; i++) {
int machine = sol.data[0][i];
@ -43,7 +43,7 @@ struct LoadBalanceProblem : ProblemBase<LoadBalanceProblem, 1, 64> {
}
__device__ float compute_penalty(const Sol& sol) const {
return 0.0f; // 无约束(任何分配都合法)
return 0.0f; // no side constraints (any assignment is feasible)
}
ProblemConfig config() const {

View file

@ -1,14 +1,14 @@
/**
* qap.cuh - 二次分配问题 (Quadratic Assignment Problem)
*
* N 个设施分配到 N 个位置(排列编码)。
* 决策变量data[0][i] = 设施 i 分配到的位置。
* 目标:Minimize sum(flow[i][j] * dist[perm[i]][perm[j]])
*
* 验证实例:自定义 5x5
* flow: 设施间的物流量
* dist: 位置间的距离
* 已知最优 = 58
* qap.cuh - Quadratic Assignment Problem (QAP)
*
* Assign N facilities to N locations (permutation encoding).
* Decision: data[0][i] = location assigned to facility i.
* Objective: Minimize sum(flow[i][j] * dist[perm[i]][perm[j]])
*
* Validation instance: custom 5x5
* flow: inter-facility flow
* dist: inter-location distances
* known optimum = 58
*/
#pragma once
@ -16,8 +16,10 @@
#include "cuda_utils.cuh"
struct QAPProblem : ProblemBase<QAPProblem, 1, 32> {
const float* d_flow; // 物流量矩阵 [N*N]
const float* d_dist; // 距离矩阵 [N*N]
const float* d_flow; // flow matrix [N*N] (device)
const float* d_dist; // distance matrix [N*N] (device)
const float* h_flow; // flow matrix [N*N] (host, for clone_to_device)
const float* h_dist; // distance matrix [N*N] (host, for clone_to_device)
int n;
__device__ float calc_cost(const Sol& sol) const {
@ -64,14 +66,16 @@ struct QAPProblem : ProblemBase<QAPProblem, 1, 32> {
d_dist = sd;
}
static QAPProblem create(const float* h_flow, const float* h_dist, int n) {
static QAPProblem create(const float* h_flow_in, const float* h_dist_in, int n) {
QAPProblem prob;
prob.n = n;
prob.h_flow = h_flow_in;
prob.h_dist = h_dist_in;
float *df, *dd;
CUDA_CHECK(cudaMalloc(&df, sizeof(float) * n * n));
CUDA_CHECK(cudaMalloc(&dd, sizeof(float) * n * n));
CUDA_CHECK(cudaMemcpy(df, h_flow, sizeof(float) * n * n, cudaMemcpyHostToDevice));
CUDA_CHECK(cudaMemcpy(dd, h_dist, sizeof(float) * n * n, cudaMemcpyHostToDevice));
CUDA_CHECK(cudaMemcpy(df, h_flow_in, sizeof(float) * n * n, cudaMemcpyHostToDevice));
CUDA_CHECK(cudaMemcpy(dd, h_dist_in, sizeof(float) * n * n, cudaMemcpyHostToDevice));
prob.d_flow = df; prob.d_dist = dd;
return prob;
}
@ -82,18 +86,12 @@ struct QAPProblem : ProblemBase<QAPProblem, 1, 32> {
d_flow = nullptr; d_dist = nullptr;
}
// v5.0: 多 GPU 协同 — 克隆到指定 GPU
// v5.0: multi-GPU — clone onto a given device
QAPProblem* clone_to_device(int gpu_id) const override {
int orig_device;
CUDA_CHECK(cudaGetDevice(&orig_device));
// 先下载数据到 host从当前设备
float* h_flow = new float[n * n];
float* h_dist = new float[n * n];
CUDA_CHECK(cudaMemcpy(h_flow, d_flow, sizeof(float) * n * n, cudaMemcpyDeviceToHost));
CUDA_CHECK(cudaMemcpy(h_dist, d_dist, sizeof(float) * n * n, cudaMemcpyDeviceToHost));
// 切换到目标 GPU 并上传
// Use host-side matrices directly (no D2H needed)
CUDA_CHECK(cudaSetDevice(gpu_id));
float *df, *dd;
CUDA_CHECK(cudaMalloc(&df, sizeof(float) * n * n));
@ -101,15 +99,12 @@ struct QAPProblem : ProblemBase<QAPProblem, 1, 32> {
CUDA_CHECK(cudaMemcpy(df, h_flow, sizeof(float) * n * n, cudaMemcpyHostToDevice));
CUDA_CHECK(cudaMemcpy(dd, h_dist, sizeof(float) * n * n, cudaMemcpyHostToDevice));
delete[] h_flow;
delete[] h_dist;
// 恢复原设备
CUDA_CHECK(cudaSetDevice(orig_device));
// 创建新实例
QAPProblem* new_prob = new QAPProblem();
new_prob->n = n;
new_prob->h_flow = h_flow;
new_prob->h_dist = h_dist;
new_prob->d_flow = df;
new_prob->d_dist = dd;

View file

@ -1,8 +1,8 @@
/**
* schedule.cuh - 排班问题
*
* 继承 ProblemBase使用 ObjDef 目标注册机制
* 2 个目标总成本min+ 不公平度min权重更高
* schedule.cuh - staff scheduling
*
* Extends ProblemBase with ObjDef objective registration.
* Two objectives: total cost (min) + unfairness (min, higher weight).
*/
#pragma once
@ -14,7 +14,7 @@ struct ScheduleProblem : ProblemBase<ScheduleProblem, 8, 16> {
const float* d_cost;
int days, emps, required;
// ---- 目标计算 ----
// ---- objective evaluation ----
__device__ float calc_total_cost(const Sol& sol) const {
float total = 0.0f;
for (int d = 0; d < days; d++)
@ -37,7 +37,7 @@ struct ScheduleProblem : ProblemBase<ScheduleProblem, 8, 16> {
return (float)(max_w - min_w);
}
// ---- 目标定义OBJ_DEFS 与 compute_obj 必须一一对应)----
// ---- objective defs (OBJ_DEFS must match compute_obj one-to-one) ----
static constexpr ObjDef OBJ_DEFS[] = {
{ObjDir::Minimize, 1.0f, 0.0f}, // case 0: calc_total_cost
{ObjDir::Minimize, 5.0f, 0.0f}, // case 1: calc_unfairness
@ -71,9 +71,9 @@ struct ScheduleProblem : ProblemBase<ScheduleProblem, 8, 16> {
return cfg;
}
// 默认回退全量(基类行为)— 不需要覆盖 evaluate_move
// Default full re-eval (base behavior) — no need to override evaluate_move
// ---- shared memory 接口 ----
// ---- shared memory interface ----
size_t shared_mem_bytes() const {
    // Bytes requested for shared memory: one float per (day, employee) cell.
    const size_t cell_count = (size_t)days * emps;
    return cell_count * sizeof(float);
}

View file

@ -1,7 +1,7 @@
/**
* tsp.cuh - TSP 问题定义
*
* 继承 ProblemBase使用 ObjDef 目标注册机制
* tsp.cuh - Traveling Salesman Problem (TSP) definition
*
* Extends ProblemBase with ObjDef objective registration.
*/
#pragma once
@ -10,12 +10,12 @@
#include "operators.cuh"
struct TSPProblem : ProblemBase<TSPProblem, 1, 64> {
// 问题数据
// problem data
const float* d_dist;
const float* h_dist; // host 端距离矩阵(用于 init_relation_matrix
const float* h_dist; // host distance matrix (for init_relation_matrix)
int n;
// ---- 目标计算 ----
// ---- objective evaluation ----
__device__ float calc_total_distance(const Sol& sol) const {
float total = 0.0f;
const int* route = sol.data[0];
@ -25,7 +25,7 @@ struct TSPProblem : ProblemBase<TSPProblem, 1, 64> {
return total;
}
// ---- 目标定义OBJ_DEFS 与 compute_obj 必须一一对应)----
// ---- objective defs (OBJ_DEFS must match compute_obj one-to-one) ----
static constexpr ObjDef OBJ_DEFS[] = {
{ObjDir::Minimize, 1.0f, 0.0f}, // case 0: calc_total_distance
};
@ -37,10 +37,10 @@ struct TSPProblem : ProblemBase<TSPProblem, 1, 64> {
}
__device__ float compute_penalty(const Sol& sol) const {
return 0.0f; // TSP 无约束
return 0.0f; // TSP has no side constraints
}
// ---- config(编码/维度部分,目标由基类自动填充)----
// ---- config (encoding/dims; objectives filled by base class) ----
ProblemConfig config() const {
ProblemConfig cfg;
cfg.encoding = EncodingType::Permutation;
@ -49,7 +49,7 @@ struct TSPProblem : ProblemBase<TSPProblem, 1, 64> {
return cfg;
}
// ---- shared memory 接口 ----
// ---- shared memory interface ----
static constexpr size_t SMEM_LIMIT = 48 * 1024;
size_t shared_mem_bytes() const {
@ -69,7 +69,7 @@ struct TSPProblem : ProblemBase<TSPProblem, 1, 64> {
d_dist = sd;
}
// 距离先验:距离近 → G/O 分数高
// Distance prior: closer cities → higher G/O scores
void init_relation_matrix(float* G, float* O, int N) const {
if (!h_dist || N != n) return;
float max_d = 0.0f;
@ -108,21 +108,21 @@ struct TSPProblem : ProblemBase<TSPProblem, 1, 64> {
h_dist = nullptr;
}
// v5.0: 多 GPU 协同 — 克隆到指定 GPU
// v5.0: multi-GPU — clone onto a given device
TSPProblem* clone_to_device(int gpu_id) const override {
int orig_device;
CUDA_CHECK(cudaGetDevice(&orig_device));
CUDA_CHECK(cudaSetDevice(gpu_id));
// 分配设备内存并拷贝距离矩阵
// Allocate device memory and copy distance matrix
float* dd;
CUDA_CHECK(cudaMalloc(&dd, sizeof(float) * n * n));
CUDA_CHECK(cudaMemcpy(dd, h_dist, sizeof(float) * n * n, cudaMemcpyHostToDevice));
// 恢复原设备
// Restore original device
CUDA_CHECK(cudaSetDevice(orig_device));
// 创建新的 Problem 实例(在 host 端)
// Create new Problem instance (on host)
TSPProblem* new_prob = new TSPProblem();
new_prob->n = n;
new_prob->h_dist = h_dist;

View file

@ -1,7 +1,7 @@
/**
* tsp_large.cuh - 大规模 TSP 问题定义 (最多 256 城市)
*
* 继承 ProblemBase逻辑与 tsp.cuh 一致,仅 D2 上限不同
* tsp_large.cuh - large-scale TSP definition (up to 256 cities)
*
* Extends ProblemBase; same logic as tsp.cuh, only the D2 cap differs.
*/
#pragma once
@ -14,7 +14,7 @@ struct TSPLargeProblem : ProblemBase<TSPLargeProblem, 1, 256> {
const float* h_dist;
int n;
// ---- 目标计算 ----
// ---- objective evaluation ----
__device__ float calc_total_distance(const Sol& sol) const {
float total = 0.0f;
const int* route = sol.data[0];
@ -24,7 +24,7 @@ struct TSPLargeProblem : ProblemBase<TSPLargeProblem, 1, 256> {
return total;
}
// ---- 目标定义OBJ_DEFS 与 compute_obj 必须一一对应)----
// ---- objective defs (OBJ_DEFS must match compute_obj one-to-one) ----
static constexpr ObjDef OBJ_DEFS[] = {
{ObjDir::Minimize, 1.0f, 0.0f}, // case 0: calc_total_distance
};
@ -54,7 +54,7 @@ struct TSPLargeProblem : ProblemBase<TSPLargeProblem, 1, 256> {
return need <= SMEM_LIMIT ? need : 0;
}
// 距离矩阵的实际大小(不管是否放进 smem
// Actual distance matrix size, regardless of whether it is placed in shared memory
size_t working_set_bytes() const {
    // Full n-by-n float matrix footprint, computed in size_t.
    const size_t entries = (size_t)n * n;
    return entries * sizeof(float);
}

View file

@ -1,9 +1,9 @@
/**
* tsp_xlarge.cuh - 超大规模 TSP 问题定义 (最多 512 城市)
*
* 继承 ProblemBase逻辑与 tsp_large.cuh 一致D2=512
* 注意:距离矩阵 512×512×4B = 1MB远超 48KB shared memory
* 因此 shared_mem_bytes() 返回 0距离矩阵留在 global memory
* tsp_xlarge.cuh - very large TSP definition (up to 512 cities)
*
* Extends ProblemBase; same logic as tsp_large.cuh, with D2=512.
* Note: distance matrix 512×512×4B = 1MB, far above 48KB shared memory,
* so shared_mem_bytes() returns 0 and the matrix stays in global memory.
*/
#pragma once
@ -13,7 +13,7 @@
struct TSPXLargeProblem : ProblemBase<TSPXLargeProblem, 1, 512> {
const float* d_dist;
const float* h_dist; // host 端距离矩阵(用于 init_relation_matrix
const float* h_dist; // host distance matrix (for init_relation_matrix)
int n;
__device__ float calc_total_distance(const Sol& sol) const {
@ -45,7 +45,7 @@ struct TSPXLargeProblem : ProblemBase<TSPXLargeProblem, 1, 512> {
return cfg;
}
// 距离矩阵太大,不放 shared memory
// Distance matrix too large for shared memory
// Requests zero bytes of shared memory for this problem.
size_t shared_mem_bytes() const { return 0; }
// No-op: the body is empty, consistent with shared_mem_bytes() returning 0.
__device__ void load_shared(char*, int, int) {}
@ -53,10 +53,10 @@ struct TSPXLargeProblem : ProblemBase<TSPXLargeProblem, 1, 512> {
return (size_t)n * n * sizeof(float);
}
// 用距离矩阵初始化 G/O 先验:距离近 → 分数高
// Initialize G/O priors from distances: closer → higher score
void init_relation_matrix(float* G, float* O, int N) const {
if (!h_dist || N != n) return;
// 找最大距离用于归一化
// Max distance for normalization
float max_d = 0.0f;
for (int i = 0; i < N; i++)
for (int j = 0; j < N; j++)
@ -66,10 +66,10 @@ struct TSPXLargeProblem : ProblemBase<TSPXLargeProblem, 1, 512> {
for (int i = 0; i < N; i++) {
for (int j = 0; j < N; j++) {
if (i == j) continue;
// 距离近 → G 高(分组倾向强)
// Closer → higher G (stronger grouping signal)
float proximity = 1.0f - h_dist[i * N + j] / max_d;
G[i * N + j] = proximity * 0.3f; // 初始信号不要太强,留空间给 EMA
// 距离近 → O 也给一点信号(对称的,不偏向任何方向)
G[i * N + j] = proximity * 0.3f; // keep initial signal moderate for EMA headroom
// Closer → small O signal too (symmetric, no directional bias)
O[i * N + j] = proximity * 0.1f;
}
}
@ -84,7 +84,7 @@ struct TSPXLargeProblem : ProblemBase<TSPXLargeProblem, 1, 512> {
static TSPXLargeProblem create(const float* h_dist_ptr, int n) {
TSPXLargeProblem prob;
prob.n = n;
prob.h_dist = h_dist_ptr; // 保留 host 指针
prob.h_dist = h_dist_ptr; // keep host pointer
float* dd;
CUDA_CHECK(cudaMalloc(&dd, sizeof(float) * n * n));
CUDA_CHECK(cudaMemcpy(dd, h_dist_ptr, sizeof(float) * n * n, cudaMemcpyHostToDevice));

View file

@ -1,8 +1,8 @@
/**
* vrp.cuh - 容量约束车辆路径问题 (CVRP)
*
* 继承 ProblemBase使用 ObjDef 目标注册机制
* 多行编码D1=K 条路线,分区初始化 + 跨行算子)
* vrp.cuh - Capacitated Vehicle Routing Problem (CVRP)
*
* Extends ProblemBase with ObjDef objective registration.
* Multi-row encoding (D1 = K routes, partition init + cross-row operators).
*/
#pragma once
@ -12,11 +12,11 @@
#include "gpu_cache.cuh"
struct VRPProblem : ProblemBase<VRPProblem, 8, 64> {
// GPU 数据
// GPU data
const float* d_dist;
const float* d_demand;
const float* h_dist; // host 端距离矩阵(含 depot用于 init_relation_matrix
const float* h_demand; // host 端需求数组(用于 clone_to_device
const float* h_dist; // host distance matrix (includes depot; for init_relation_matrix)
const float* h_demand; // host demand array (for clone_to_device)
int n;
int stride;
float capacity;
@ -24,7 +24,7 @@ struct VRPProblem : ProblemBase<VRPProblem, 8, 64> {
int max_vehicles;
GpuCache cache;
// ---- 目标计算 ----
// ---- objective evaluation ----
__device__ float compute_route_dist(const int* route, int size) const {
if (size == 0) return 0.0f;
float dist = 0.0f;
@ -61,7 +61,7 @@ struct VRPProblem : ProblemBase<VRPProblem, 8, 64> {
return total;
}
// ---- 目标定义OBJ_DEFS 与 compute_obj 必须一一对应)----
// ---- objective defs (OBJ_DEFS must match compute_obj one-to-one) ----
static constexpr ObjDef OBJ_DEFS[] = {
{ObjDir::Minimize, 1.0f, 0.0f}, // case 0: calc_total_distance
};
@ -102,7 +102,7 @@ struct VRPProblem : ProblemBase<VRPProblem, 8, 64> {
return cfg;
}
// ---- shared memory 接口 ----
// ---- shared memory interface ----
static constexpr size_t SMEM_LIMIT = 48 * 1024;
size_t shared_mem_bytes() const {
@ -129,14 +129,14 @@ struct VRPProblem : ProblemBase<VRPProblem, 8, 64> {
void enable_cache(int cap = 65536) { cache = GpuCache::allocate(cap); }
void print_cache_stats() const { cache.print_stats(); }
// 距离先验:客户间距离近 → G/O 分数高
// 注意h_dist 含 depotstride×stride元素编号 0..n-1 对应 node 1..n
// Distance prior: closer customers → higher G/O scores
// Note: h_dist includes depot (stride×stride); indices 0..n-1 map to nodes 1..n
void init_relation_matrix(float* G, float* O, int N) const {
if (!h_dist || N != n) return;
float max_d = 0.0f;
for (int i = 0; i < N; i++)
for (int j = 0; j < N; j++) {
float d = h_dist[(i + 1) * stride + (j + 1)]; // 跳过 depot
float d = h_dist[(i + 1) * stride + (j + 1)]; // skip depot
if (d > max_d) max_d = d;
}
if (max_d <= 0.0f) return;
@ -161,7 +161,7 @@ struct VRPProblem : ProblemBase<VRPProblem, 8, 64> {
prob.max_vehicles = max_vehicles;
prob.cache = GpuCache::disabled();
prob.h_dist = h_dist_ptr;
prob.h_demand = h_demand_ptr; // 保存 host 端指针
prob.h_demand = h_demand_ptr; // keep host pointer
int n_nodes = n + 1;
float* dd;
@ -185,13 +185,13 @@ struct VRPProblem : ProblemBase<VRPProblem, 8, 64> {
cache.destroy();
}
// v5.0: 多 GPU 协同 — 克隆到指定 GPU
// v5.0: multi-GPU — clone onto a given device
VRPProblem* clone_to_device(int gpu_id) const override {
int orig_device;
CUDA_CHECK(cudaGetDevice(&orig_device));
CUDA_CHECK(cudaSetDevice(gpu_id));
// 从 host 端数据直接拷贝到目标 GPU避免跨设备 D2H 拷贝)
// Copy from host straight to target GPU (avoid cross-device D2H staging)
int n_nodes = n + 1;
float* dd;
CUDA_CHECK(cudaMalloc(&dd, sizeof(float) * n_nodes * n_nodes));

View file

@ -1,12 +1,12 @@
/**
* vrptw.cuh - 带时间窗的车辆路径问题 (VRPTW)
*
* 在 CVRP 基础上增加时间窗约束。
* 编码Perm 多行分区(同 CVRPdata[r][j] = 路线 r 的第 j 个客户。
* 目标Minimize 总距离。
* 约束:(a) 容量约束, (b) 时间窗约束(到达时间必须 ≤ latest早到需等待
*
* 验证实例8 客户 3 车, 手工设计坐标+时间窗, 确保有已知可行解。
* vrptw.cuh - Vehicle Routing Problem with Time Windows (VRPTW)
*
* CVRP plus time window constraints.
* Encoding: multi-row perm partition (same as CVRP); data[r][j] = j-th customer on route r.
* Objective: minimize total distance.
* Constraints: (a) capacity, (b) time windows (arrival ≤ latest; early arrival waits).
*
* Validation instance: 8 customers, 3 vehicles; hand-crafted coords + windows with known feasible solution.
*/
#pragma once
@ -14,12 +14,12 @@
#include "cuda_utils.cuh"
struct VRPTWProblem : ProblemBase<VRPTWProblem, 8, 64> {
const float* d_dist; // 距离矩阵 [(n+1)*(n+1)](含 depot
const float* d_demand; // 需求 [n]
const float* d_earliest; // 最早服务时间 [n+1](含 depot
const float* d_latest; // 最晚服务时间 [n+1](含 depot
const float* d_service; // 服务耗时 [n+1](含 depot
int n; // 客户数(不含 depot
const float* d_dist; // distance matrix [(n+1)*(n+1)] (includes depot)
const float* d_demand; // demand [n]
const float* d_earliest; // earliest service time [n+1] (includes depot)
const float* d_latest; // latest service time [n+1] (includes depot)
const float* d_service; // service time [n+1] (includes depot)
int n; // number of customers (excludes depot)
int stride; // n+1
float capacity;
int num_vehicles;
@ -63,30 +63,30 @@ struct VRPTWProblem : ProblemBase<VRPTWProblem, 8, 64> {
if (size == 0) continue;
active++;
// 容量约束
// Capacity constraint
float load = 0.0f;
for (int j = 0; j < size; j++)
load += d_demand[sol.data[r][j]];
if (load > capacity)
penalty += (load - capacity) * 100.0f;
// 时间窗约束:模拟路线行驶
// Time windows: simulate route travel
float time = 0.0f;
int prev = 0;
for (int j = 0; j < size; j++) {
int node = sol.data[r][j] + 1;
float travel = d_dist[prev * stride + node];
time += travel;
// 早到需等待
// Wait if early
if (time < d_earliest[node])
time = d_earliest[node];
// 迟到产生惩罚
// Penalize lateness
if (time > d_latest[node])
penalty += (time - d_latest[node]) * 50.0f;
time += d_service[node];
prev = node;
}
// 返回 depot 的时间窗
// Time window returning to depot
float return_time = time + d_dist[prev * stride + 0];
if (return_time > d_latest[0])
penalty += (return_time - d_latest[0]) * 50.0f;